diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,57623 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 8226, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00024313153415998054, + "grad_norm": 38.5, + "learning_rate": 2.5000000000000002e-08, + "loss": 1.4348, + "step": 1 + }, + { + "epoch": 0.0004862630683199611, + "grad_norm": 20.5, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.5402, + "step": 2 + }, + { + "epoch": 0.0007293946024799417, + "grad_norm": 37.0, + "learning_rate": 7.500000000000001e-08, + "loss": 0.9649, + "step": 3 + }, + { + "epoch": 0.0009725261366399222, + "grad_norm": 31.125, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.6755, + "step": 4 + }, + { + "epoch": 0.0012156576707999028, + "grad_norm": 35.75, + "learning_rate": 1.2500000000000002e-07, + "loss": 1.5522, + "step": 5 + }, + { + "epoch": 0.0014587892049598833, + "grad_norm": 23.625, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4726, + "step": 6 + }, + { + "epoch": 0.0017019207391198638, + "grad_norm": 26.25, + "learning_rate": 1.7500000000000002e-07, + "loss": 1.3913, + "step": 7 + }, + { + "epoch": 0.0019450522732798443, + "grad_norm": 31.125, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.578, + "step": 8 + }, + { + "epoch": 0.002188183807439825, + "grad_norm": 30.0, + "learning_rate": 2.2500000000000002e-07, + "loss": 1.1628, + "step": 9 + }, + { + "epoch": 0.0024313153415998056, + "grad_norm": 27.0, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.237, + "step": 10 + }, + { + "epoch": 0.002674446875759786, + "grad_norm": 32.25, + "learning_rate": 2.75e-07, + "loss": 1.5415, + "step": 11 + }, + { + "epoch": 0.0029175784099197666, + "grad_norm": 41.25, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.7788, + "step": 12 + }, + { + "epoch": 0.003160709944079747, + "grad_norm": 33.0, + "learning_rate": 3.25e-07, + "loss": 1.4964, + "step": 13 + }, + { + "epoch": 0.0034038414782397277, + "grad_norm": 21.25, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.2345, + "step": 14 + }, + { + "epoch": 0.0036469730123997084, + "grad_norm": 33.5, + "learning_rate": 3.75e-07, + "loss": 1.8139, + "step": 15 + }, + { + "epoch": 0.0038901045465596887, + "grad_norm": 28.0, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.3461, + "step": 16 + }, + { + "epoch": 0.004133236080719669, + "grad_norm": 29.125, + "learning_rate": 4.2500000000000006e-07, + "loss": 1.45, + "step": 17 + }, + { + "epoch": 0.00437636761487965, + "grad_norm": 43.75, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.8464, + "step": 18 + }, + { + "epoch": 0.0046194991490396305, + "grad_norm": 26.0, + "learning_rate": 4.7500000000000006e-07, + "loss": 1.2513, + "step": 19 + }, + { + "epoch": 0.004862630683199611, + "grad_norm": 31.375, + "learning_rate": 5.000000000000001e-07, + "loss": 1.8003, + "step": 20 + }, + { + "epoch": 0.005105762217359592, + "grad_norm": 27.625, + "learning_rate": 5.250000000000001e-07, + "loss": 1.2935, + "step": 21 + }, + { + "epoch": 0.005348893751519572, + "grad_norm": 24.125, + "learning_rate": 5.5e-07, + "loss": 1.2846, + "step": 22 + }, + { + "epoch": 0.0055920252856795525, + "grad_norm": 39.25, + "learning_rate": 5.750000000000001e-07, + "loss": 1.5984, + "step": 23 + }, + { + "epoch": 0.005835156819839533, + "grad_norm": 22.25, + "learning_rate": 6.000000000000001e-07, + "loss": 1.1517, + "step": 24 + }, + { + "epoch": 0.006078288353999514, + "grad_norm": 49.25, + "learning_rate": 6.25e-07, + "loss": 1.7331, + "step": 25 + }, + { + "epoch": 0.006321419888159494, + "grad_norm": 51.5, + "learning_rate": 6.5e-07, + "loss": 2.2625, + "step": 26 + }, + { + "epoch": 0.006564551422319475, + "grad_norm": 33.5, + "learning_rate": 6.750000000000001e-07, + "loss": 1.2859, + "step": 27 + }, + { + "epoch": 0.006807682956479455, + "grad_norm": 26.625, + "learning_rate": 7.000000000000001e-07, + "loss": 1.4909, + "step": 28 + }, + { + "epoch": 0.007050814490639436, + "grad_norm": 35.5, + "learning_rate": 7.25e-07, + "loss": 0.9688, + "step": 29 + }, + { + "epoch": 0.007293946024799417, + "grad_norm": 29.125, + "learning_rate": 7.5e-07, + "loss": 1.3298, + "step": 30 + }, + { + "epoch": 0.007537077558959397, + "grad_norm": 30.5, + "learning_rate": 7.750000000000001e-07, + "loss": 1.5472, + "step": 31 + }, + { + "epoch": 0.007780209093119377, + "grad_norm": 32.0, + "learning_rate": 8.000000000000001e-07, + "loss": 1.7422, + "step": 32 + }, + { + "epoch": 0.008023340627279359, + "grad_norm": 41.5, + "learning_rate": 8.250000000000001e-07, + "loss": 1.267, + "step": 33 + }, + { + "epoch": 0.008266472161439338, + "grad_norm": 24.125, + "learning_rate": 8.500000000000001e-07, + "loss": 0.8925, + "step": 34 + }, + { + "epoch": 0.008509603695599319, + "grad_norm": 22.0, + "learning_rate": 8.75e-07, + "loss": 1.4644, + "step": 35 + }, + { + "epoch": 0.0087527352297593, + "grad_norm": 26.5, + "learning_rate": 9.000000000000001e-07, + "loss": 1.6057, + "step": 36 + }, + { + "epoch": 0.00899586676391928, + "grad_norm": 23.25, + "learning_rate": 9.25e-07, + "loss": 1.0055, + "step": 37 + }, + { + "epoch": 0.009238998298079261, + "grad_norm": 28.375, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5588, + "step": 38 + }, + { + "epoch": 0.009482129832239242, + "grad_norm": 21.125, + "learning_rate": 9.750000000000002e-07, + "loss": 1.6439, + "step": 39 + }, + { + "epoch": 0.009725261366399222, + "grad_norm": 26.375, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.2073, + "step": 40 + }, + { + "epoch": 0.009968392900559203, + "grad_norm": 25.0, + "learning_rate": 1.025e-06, + "loss": 1.1954, + "step": 41 + }, + { + "epoch": 0.010211524434719184, + "grad_norm": 33.25, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4811, + "step": 42 + }, + { + "epoch": 0.010454655968879163, + "grad_norm": 29.5, + "learning_rate": 1.075e-06, + "loss": 1.5775, + "step": 43 + }, + { + "epoch": 0.010697787503039144, + "grad_norm": 22.0, + "learning_rate": 1.1e-06, + "loss": 1.1789, + "step": 44 + }, + { + "epoch": 0.010940919037199124, + "grad_norm": 53.75, + "learning_rate": 1.125e-06, + "loss": 1.3597, + "step": 45 + }, + { + "epoch": 0.011184050571359105, + "grad_norm": 28.875, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3105, + "step": 46 + }, + { + "epoch": 0.011427182105519086, + "grad_norm": 21.0, + "learning_rate": 1.175e-06, + "loss": 1.0907, + "step": 47 + }, + { + "epoch": 0.011670313639679067, + "grad_norm": 28.125, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.7235, + "step": 48 + }, + { + "epoch": 0.011913445173839047, + "grad_norm": 24.875, + "learning_rate": 1.2250000000000001e-06, + "loss": 1.2318, + "step": 49 + }, + { + "epoch": 0.012156576707999028, + "grad_norm": 26.125, + "learning_rate": 1.25e-06, + "loss": 1.3413, + "step": 50 + }, + { + "epoch": 0.012399708242159009, + "grad_norm": 24.5, + "learning_rate": 1.275e-06, + "loss": 1.2784, + "step": 51 + }, + { + "epoch": 0.012642839776318988, + "grad_norm": 29.625, + "learning_rate": 1.3e-06, + "loss": 1.5526, + "step": 52 + }, + { + "epoch": 0.012885971310478968, + "grad_norm": 22.625, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.7708, + "step": 53 + }, + { + "epoch": 0.01312910284463895, + "grad_norm": 19.75, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.1115, + "step": 54 + }, + { + "epoch": 0.01337223437879893, + "grad_norm": 22.375, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.8853, + "step": 55 + }, + { + "epoch": 0.01361536591295891, + "grad_norm": 31.0, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.9406, + "step": 56 + }, + { + "epoch": 0.013858497447118891, + "grad_norm": 21.75, + "learning_rate": 1.425e-06, + "loss": 1.1941, + "step": 57 + }, + { + "epoch": 0.014101628981278872, + "grad_norm": 48.5, + "learning_rate": 1.45e-06, + "loss": 1.3161, + "step": 58 + }, + { + "epoch": 0.014344760515438853, + "grad_norm": 23.125, + "learning_rate": 1.475e-06, + "loss": 1.2449, + "step": 59 + }, + { + "epoch": 0.014587892049598834, + "grad_norm": 24.75, + "learning_rate": 1.5e-06, + "loss": 1.2071, + "step": 60 + }, + { + "epoch": 0.014831023583758814, + "grad_norm": 25.125, + "learning_rate": 1.525e-06, + "loss": 1.5179, + "step": 61 + }, + { + "epoch": 0.015074155117918793, + "grad_norm": 27.0, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3866, + "step": 62 + }, + { + "epoch": 0.015317286652078774, + "grad_norm": 19.25, + "learning_rate": 1.5750000000000002e-06, + "loss": 1.2359, + "step": 63 + }, + { + "epoch": 0.015560418186238755, + "grad_norm": 27.375, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.0784, + "step": 64 + }, + { + "epoch": 0.015803549720398737, + "grad_norm": 21.75, + "learning_rate": 1.6250000000000001e-06, + "loss": 1.4198, + "step": 65 + }, + { + "epoch": 0.016046681254558718, + "grad_norm": 22.125, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.6945, + "step": 66 + }, + { + "epoch": 0.016289812788718695, + "grad_norm": 15.9375, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.8836, + "step": 67 + }, + { + "epoch": 0.016532944322878676, + "grad_norm": 24.25, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.1728, + "step": 68 + }, + { + "epoch": 0.016776075857038657, + "grad_norm": 26.875, + "learning_rate": 1.725e-06, + "loss": 1.2629, + "step": 69 + }, + { + "epoch": 0.017019207391198637, + "grad_norm": 18.125, + "learning_rate": 1.75e-06, + "loss": 1.0292, + "step": 70 + }, + { + "epoch": 0.017262338925358618, + "grad_norm": 27.875, + "learning_rate": 1.7750000000000002e-06, + "loss": 1.4961, + "step": 71 + }, + { + "epoch": 0.0175054704595186, + "grad_norm": 19.375, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.4118, + "step": 72 + }, + { + "epoch": 0.01774860199367858, + "grad_norm": 23.125, + "learning_rate": 1.825e-06, + "loss": 1.0297, + "step": 73 + }, + { + "epoch": 0.01799173352783856, + "grad_norm": 19.125, + "learning_rate": 1.85e-06, + "loss": 1.0484, + "step": 74 + }, + { + "epoch": 0.01823486506199854, + "grad_norm": 16.75, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.9174, + "step": 75 + }, + { + "epoch": 0.018477996596158522, + "grad_norm": 31.0, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.2225, + "step": 76 + }, + { + "epoch": 0.018721128130318503, + "grad_norm": 22.375, + "learning_rate": 1.925e-06, + "loss": 1.0815, + "step": 77 + }, + { + "epoch": 0.018964259664478483, + "grad_norm": 38.5, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.2646, + "step": 78 + }, + { + "epoch": 0.019207391198638464, + "grad_norm": 23.375, + "learning_rate": 1.975e-06, + "loss": 1.1941, + "step": 79 + }, + { + "epoch": 0.019450522732798445, + "grad_norm": 23.375, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.0209, + "step": 80 + }, + { + "epoch": 0.019693654266958426, + "grad_norm": 22.0, + "learning_rate": 2.025e-06, + "loss": 1.1193, + "step": 81 + }, + { + "epoch": 0.019936785801118406, + "grad_norm": 17.875, + "learning_rate": 2.05e-06, + "loss": 0.7613, + "step": 82 + }, + { + "epoch": 0.020179917335278387, + "grad_norm": 20.25, + "learning_rate": 2.075e-06, + "loss": 1.0781, + "step": 83 + }, + { + "epoch": 0.020423048869438368, + "grad_norm": 21.75, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.1135, + "step": 84 + }, + { + "epoch": 0.020666180403598345, + "grad_norm": 24.125, + "learning_rate": 2.125e-06, + "loss": 1.4294, + "step": 85 + }, + { + "epoch": 0.020909311937758326, + "grad_norm": 25.625, + "learning_rate": 2.15e-06, + "loss": 1.3782, + "step": 86 + }, + { + "epoch": 0.021152443471918306, + "grad_norm": 23.875, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.9395, + "step": 87 + }, + { + "epoch": 0.021395575006078287, + "grad_norm": 23.125, + "learning_rate": 2.2e-06, + "loss": 1.1842, + "step": 88 + }, + { + "epoch": 0.021638706540238268, + "grad_norm": 24.75, + "learning_rate": 2.2250000000000003e-06, + "loss": 1.1139, + "step": 89 + }, + { + "epoch": 0.02188183807439825, + "grad_norm": 26.25, + "learning_rate": 2.25e-06, + "loss": 1.0698, + "step": 90 + }, + { + "epoch": 0.02212496960855823, + "grad_norm": 24.125, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.6632, + "step": 91 + }, + { + "epoch": 0.02236810114271821, + "grad_norm": 49.0, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.2604, + "step": 92 + }, + { + "epoch": 0.02261123267687819, + "grad_norm": 19.0, + "learning_rate": 2.325e-06, + "loss": 0.6398, + "step": 93 + }, + { + "epoch": 0.02285436421103817, + "grad_norm": 21.0, + "learning_rate": 2.35e-06, + "loss": 1.2111, + "step": 94 + }, + { + "epoch": 0.023097495745198152, + "grad_norm": 24.625, + "learning_rate": 2.375e-06, + "loss": 1.0793, + "step": 95 + }, + { + "epoch": 0.023340627279358133, + "grad_norm": 18.625, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.6156, + "step": 96 + }, + { + "epoch": 0.023583758813518114, + "grad_norm": 20.875, + "learning_rate": 2.425e-06, + "loss": 0.8457, + "step": 97 + }, + { + "epoch": 0.023826890347678095, + "grad_norm": 28.875, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.2779, + "step": 98 + }, + { + "epoch": 0.024070021881838075, + "grad_norm": 22.625, + "learning_rate": 2.475e-06, + "loss": 1.4615, + "step": 99 + }, + { + "epoch": 0.024313153415998056, + "grad_norm": 23.125, + "learning_rate": 2.5e-06, + "loss": 1.1989, + "step": 100 + }, + { + "epoch": 0.024556284950158037, + "grad_norm": 22.375, + "learning_rate": 2.499999906582956e-06, + "loss": 1.3843, + "step": 101 + }, + { + "epoch": 0.024799416484318017, + "grad_norm": 19.5, + "learning_rate": 2.4999996263318378e-06, + "loss": 0.7177, + "step": 102 + }, + { + "epoch": 0.025042548018477998, + "grad_norm": 21.875, + "learning_rate": 2.4999991592466867e-06, + "loss": 1.0586, + "step": 103 + }, + { + "epoch": 0.025285679552637975, + "grad_norm": 22.375, + "learning_rate": 2.4999985053275737e-06, + "loss": 1.253, + "step": 104 + }, + { + "epoch": 0.025528811086797956, + "grad_norm": 25.5, + "learning_rate": 2.499997664574595e-06, + "loss": 1.1569, + "step": 105 + }, + { + "epoch": 0.025771942620957937, + "grad_norm": 23.0, + "learning_rate": 2.499996636987878e-06, + "loss": 1.1424, + "step": 106 + }, + { + "epoch": 0.026015074155117918, + "grad_norm": 19.25, + "learning_rate": 2.499995422567575e-06, + "loss": 1.1592, + "step": 107 + }, + { + "epoch": 0.0262582056892779, + "grad_norm": 20.0, + "learning_rate": 2.499994021313868e-06, + "loss": 0.9183, + "step": 108 + }, + { + "epoch": 0.02650133722343788, + "grad_norm": 15.75, + "learning_rate": 2.499992433226966e-06, + "loss": 0.815, + "step": 109 + }, + { + "epoch": 0.02674446875759786, + "grad_norm": 23.875, + "learning_rate": 2.499990658307107e-06, + "loss": 0.9233, + "step": 110 + }, + { + "epoch": 0.02698760029175784, + "grad_norm": 19.625, + "learning_rate": 2.499988696554556e-06, + "loss": 1.0855, + "step": 111 + }, + { + "epoch": 0.02723073182591782, + "grad_norm": 23.375, + "learning_rate": 2.499986547969607e-06, + "loss": 1.3179, + "step": 112 + }, + { + "epoch": 0.027473863360077802, + "grad_norm": 18.75, + "learning_rate": 2.499984212552579e-06, + "loss": 1.1547, + "step": 113 + }, + { + "epoch": 0.027716994894237783, + "grad_norm": 16.625, + "learning_rate": 2.4999816903038236e-06, + "loss": 0.8664, + "step": 114 + }, + { + "epoch": 0.027960126428397764, + "grad_norm": 23.75, + "learning_rate": 2.499978981223716e-06, + "loss": 1.032, + "step": 115 + }, + { + "epoch": 0.028203257962557744, + "grad_norm": 20.25, + "learning_rate": 2.499976085312662e-06, + "loss": 0.8514, + "step": 116 + }, + { + "epoch": 0.028446389496717725, + "grad_norm": 19.0, + "learning_rate": 2.4999730025710945e-06, + "loss": 1.1174, + "step": 117 + }, + { + "epoch": 0.028689521030877706, + "grad_norm": 14.8125, + "learning_rate": 2.4999697329994736e-06, + "loss": 1.0144, + "step": 118 + }, + { + "epoch": 0.028932652565037686, + "grad_norm": 22.125, + "learning_rate": 2.4999662765982884e-06, + "loss": 1.0733, + "step": 119 + }, + { + "epoch": 0.029175784099197667, + "grad_norm": 17.875, + "learning_rate": 2.4999626333680554e-06, + "loss": 0.8249, + "step": 120 + }, + { + "epoch": 0.029418915633357648, + "grad_norm": 14.4375, + "learning_rate": 2.49995880330932e-06, + "loss": 0.7594, + "step": 121 + }, + { + "epoch": 0.02966204716751763, + "grad_norm": 18.0, + "learning_rate": 2.4999547864226532e-06, + "loss": 1.0326, + "step": 122 + }, + { + "epoch": 0.029905178701677606, + "grad_norm": 26.875, + "learning_rate": 2.499950582708656e-06, + "loss": 0.809, + "step": 123 + }, + { + "epoch": 0.030148310235837587, + "grad_norm": 17.875, + "learning_rate": 2.4999461921679567e-06, + "loss": 0.9553, + "step": 124 + }, + { + "epoch": 0.030391441769997567, + "grad_norm": 22.75, + "learning_rate": 2.4999416148012122e-06, + "loss": 1.0211, + "step": 125 + }, + { + "epoch": 0.030634573304157548, + "grad_norm": 25.0, + "learning_rate": 2.499936850609106e-06, + "loss": 1.1137, + "step": 126 + }, + { + "epoch": 0.03087770483831753, + "grad_norm": 21.5, + "learning_rate": 2.4999318995923507e-06, + "loss": 0.7679, + "step": 127 + }, + { + "epoch": 0.03112083637247751, + "grad_norm": 44.25, + "learning_rate": 2.499926761751685e-06, + "loss": 1.2556, + "step": 128 + }, + { + "epoch": 0.031363967906637494, + "grad_norm": 21.5, + "learning_rate": 2.499921437087878e-06, + "loss": 0.8554, + "step": 129 + }, + { + "epoch": 0.031607099440797475, + "grad_norm": 19.25, + "learning_rate": 2.499915925601726e-06, + "loss": 0.9156, + "step": 130 + }, + { + "epoch": 0.031850230974957455, + "grad_norm": 21.125, + "learning_rate": 2.4999102272940516e-06, + "loss": 1.0176, + "step": 131 + }, + { + "epoch": 0.032093362509117436, + "grad_norm": 21.375, + "learning_rate": 2.4999043421657075e-06, + "loss": 0.9295, + "step": 132 + }, + { + "epoch": 0.03233649404327741, + "grad_norm": 20.125, + "learning_rate": 2.499898270217572e-06, + "loss": 0.8986, + "step": 133 + }, + { + "epoch": 0.03257962557743739, + "grad_norm": 26.0, + "learning_rate": 2.499892011450554e-06, + "loss": 1.1765, + "step": 134 + }, + { + "epoch": 0.03282275711159737, + "grad_norm": 20.875, + "learning_rate": 2.499885565865589e-06, + "loss": 1.0191, + "step": 135 + }, + { + "epoch": 0.03306588864575735, + "grad_norm": 15.9375, + "learning_rate": 2.4998789334636393e-06, + "loss": 0.9031, + "step": 136 + }, + { + "epoch": 0.03330902017991733, + "grad_norm": 19.0, + "learning_rate": 2.499872114245697e-06, + "loss": 1.0915, + "step": 137 + }, + { + "epoch": 0.03355215171407731, + "grad_norm": 18.5, + "learning_rate": 2.4998651082127815e-06, + "loss": 0.7411, + "step": 138 + }, + { + "epoch": 0.033795283248237294, + "grad_norm": 22.75, + "learning_rate": 2.4998579153659393e-06, + "loss": 1.328, + "step": 139 + }, + { + "epoch": 0.034038414782397275, + "grad_norm": 16.25, + "learning_rate": 2.4998505357062457e-06, + "loss": 0.7537, + "step": 140 + }, + { + "epoch": 0.034281546316557256, + "grad_norm": 20.125, + "learning_rate": 2.499842969234804e-06, + "loss": 1.1528, + "step": 141 + }, + { + "epoch": 0.034524677850717236, + "grad_norm": 21.375, + "learning_rate": 2.4998352159527458e-06, + "loss": 1.0924, + "step": 142 + }, + { + "epoch": 0.03476780938487722, + "grad_norm": 19.25, + "learning_rate": 2.499827275861228e-06, + "loss": 1.127, + "step": 143 + }, + { + "epoch": 0.0350109409190372, + "grad_norm": 19.0, + "learning_rate": 2.4998191489614393e-06, + "loss": 1.0547, + "step": 144 + }, + { + "epoch": 0.03525407245319718, + "grad_norm": 29.25, + "learning_rate": 2.4998108352545933e-06, + "loss": 1.1, + "step": 145 + }, + { + "epoch": 0.03549720398735716, + "grad_norm": 20.625, + "learning_rate": 2.499802334741933e-06, + "loss": 0.8825, + "step": 146 + }, + { + "epoch": 0.03574033552151714, + "grad_norm": 18.75, + "learning_rate": 2.499793647424729e-06, + "loss": 0.9249, + "step": 147 + }, + { + "epoch": 0.03598346705567712, + "grad_norm": 20.25, + "learning_rate": 2.49978477330428e-06, + "loss": 1.0968, + "step": 148 + }, + { + "epoch": 0.0362265985898371, + "grad_norm": 26.375, + "learning_rate": 2.4997757123819117e-06, + "loss": 0.6389, + "step": 149 + }, + { + "epoch": 0.03646973012399708, + "grad_norm": 18.875, + "learning_rate": 2.499766464658979e-06, + "loss": 1.0685, + "step": 150 + }, + { + "epoch": 0.03671286165815706, + "grad_norm": 17.375, + "learning_rate": 2.499757030136864e-06, + "loss": 1.1539, + "step": 151 + }, + { + "epoch": 0.036955993192317044, + "grad_norm": 35.0, + "learning_rate": 2.4997474088169764e-06, + "loss": 1.4932, + "step": 152 + }, + { + "epoch": 0.037199124726477024, + "grad_norm": 19.625, + "learning_rate": 2.499737600700755e-06, + "loss": 0.8824, + "step": 153 + }, + { + "epoch": 0.037442256260637005, + "grad_norm": 26.25, + "learning_rate": 2.4997276057896656e-06, + "loss": 0.9399, + "step": 154 + }, + { + "epoch": 0.037685387794796986, + "grad_norm": 28.125, + "learning_rate": 2.499717424085202e-06, + "loss": 1.3844, + "step": 155 + }, + { + "epoch": 0.03792851932895697, + "grad_norm": 22.625, + "learning_rate": 2.4997070555888855e-06, + "loss": 1.4123, + "step": 156 + }, + { + "epoch": 0.03817165086311695, + "grad_norm": 35.0, + "learning_rate": 2.4996965003022667e-06, + "loss": 0.8678, + "step": 157 + }, + { + "epoch": 0.03841478239727693, + "grad_norm": 16.0, + "learning_rate": 2.499685758226923e-06, + "loss": 0.7204, + "step": 158 + }, + { + "epoch": 0.03865791393143691, + "grad_norm": 19.0, + "learning_rate": 2.4996748293644597e-06, + "loss": 1.0738, + "step": 159 + }, + { + "epoch": 0.03890104546559689, + "grad_norm": 23.25, + "learning_rate": 2.4996637137165106e-06, + "loss": 0.9177, + "step": 160 + }, + { + "epoch": 0.03914417699975687, + "grad_norm": 21.375, + "learning_rate": 2.499652411284737e-06, + "loss": 1.3205, + "step": 161 + }, + { + "epoch": 0.03938730853391685, + "grad_norm": 17.625, + "learning_rate": 2.4996409220708282e-06, + "loss": 0.6068, + "step": 162 + }, + { + "epoch": 0.03963044006807683, + "grad_norm": 21.5, + "learning_rate": 2.499629246076502e-06, + "loss": 1.1155, + "step": 163 + }, + { + "epoch": 0.03987357160223681, + "grad_norm": 19.75, + "learning_rate": 2.4996173833035027e-06, + "loss": 1.0647, + "step": 164 + }, + { + "epoch": 0.04011670313639679, + "grad_norm": 25.125, + "learning_rate": 2.4996053337536043e-06, + "loss": 1.0785, + "step": 165 + }, + { + "epoch": 0.040359834670556774, + "grad_norm": 21.75, + "learning_rate": 2.4995930974286067e-06, + "loss": 1.1023, + "step": 166 + }, + { + "epoch": 0.040602966204716755, + "grad_norm": 21.25, + "learning_rate": 2.4995806743303396e-06, + "loss": 1.0521, + "step": 167 + }, + { + "epoch": 0.040846097738876735, + "grad_norm": 28.625, + "learning_rate": 2.49956806446066e-06, + "loss": 1.0485, + "step": 168 + }, + { + "epoch": 0.041089229273036716, + "grad_norm": 20.125, + "learning_rate": 2.4995552678214523e-06, + "loss": 1.2728, + "step": 169 + }, + { + "epoch": 0.04133236080719669, + "grad_norm": 18.25, + "learning_rate": 2.499542284414629e-06, + "loss": 0.8282, + "step": 170 + }, + { + "epoch": 0.04157549234135667, + "grad_norm": 22.5, + "learning_rate": 2.4995291142421315e-06, + "loss": 1.1808, + "step": 171 + }, + { + "epoch": 0.04181862387551665, + "grad_norm": 16.125, + "learning_rate": 2.4995157573059274e-06, + "loss": 0.6348, + "step": 172 + }, + { + "epoch": 0.04206175540967663, + "grad_norm": 25.125, + "learning_rate": 2.499502213608013e-06, + "loss": 1.0446, + "step": 173 + }, + { + "epoch": 0.04230488694383661, + "grad_norm": 15.875, + "learning_rate": 2.499488483150414e-06, + "loss": 0.6286, + "step": 174 + }, + { + "epoch": 0.042548018477996594, + "grad_norm": 22.625, + "learning_rate": 2.4994745659351815e-06, + "loss": 1.149, + "step": 175 + }, + { + "epoch": 0.042791150012156574, + "grad_norm": 21.75, + "learning_rate": 2.4994604619643957e-06, + "loss": 1.2704, + "step": 176 + }, + { + "epoch": 0.043034281546316555, + "grad_norm": 22.125, + "learning_rate": 2.4994461712401652e-06, + "loss": 1.3536, + "step": 177 + }, + { + "epoch": 0.043277413080476536, + "grad_norm": 16.875, + "learning_rate": 2.4994316937646258e-06, + "loss": 0.7183, + "step": 178 + }, + { + "epoch": 0.04352054461463652, + "grad_norm": 23.375, + "learning_rate": 2.499417029539941e-06, + "loss": 0.9442, + "step": 179 + }, + { + "epoch": 0.0437636761487965, + "grad_norm": 16.0, + "learning_rate": 2.499402178568303e-06, + "loss": 0.968, + "step": 180 + }, + { + "epoch": 0.04400680768295648, + "grad_norm": 19.875, + "learning_rate": 2.499387140851932e-06, + "loss": 1.123, + "step": 181 + }, + { + "epoch": 0.04424993921711646, + "grad_norm": 18.25, + "learning_rate": 2.4993719163930745e-06, + "loss": 0.7866, + "step": 182 + }, + { + "epoch": 0.04449307075127644, + "grad_norm": 26.5, + "learning_rate": 2.4993565051940072e-06, + "loss": 1.1286, + "step": 183 + }, + { + "epoch": 0.04473620228543642, + "grad_norm": 16.5, + "learning_rate": 2.4993409072570328e-06, + "loss": 0.7304, + "step": 184 + }, + { + "epoch": 0.0449793338195964, + "grad_norm": 24.5, + "learning_rate": 2.4993251225844826e-06, + "loss": 1.2202, + "step": 185 + }, + { + "epoch": 0.04522246535375638, + "grad_norm": 23.125, + "learning_rate": 2.499309151178717e-06, + "loss": 1.2155, + "step": 186 + }, + { + "epoch": 0.04546559688791636, + "grad_norm": 16.25, + "learning_rate": 2.4992929930421215e-06, + "loss": 0.9387, + "step": 187 + }, + { + "epoch": 0.04570872842207634, + "grad_norm": 16.375, + "learning_rate": 2.499276648177113e-06, + "loss": 0.6564, + "step": 188 + }, + { + "epoch": 0.045951859956236324, + "grad_norm": 20.0, + "learning_rate": 2.4992601165861334e-06, + "loss": 0.8013, + "step": 189 + }, + { + "epoch": 0.046194991490396305, + "grad_norm": 20.75, + "learning_rate": 2.499243398271654e-06, + "loss": 1.3058, + "step": 190 + }, + { + "epoch": 0.046438123024556285, + "grad_norm": 25.0, + "learning_rate": 2.499226493236173e-06, + "loss": 1.3281, + "step": 191 + }, + { + "epoch": 0.046681254558716266, + "grad_norm": 17.25, + "learning_rate": 2.4992094014822182e-06, + "loss": 0.7938, + "step": 192 + }, + { + "epoch": 0.04692438609287625, + "grad_norm": 19.625, + "learning_rate": 2.499192123012344e-06, + "loss": 1.0102, + "step": 193 + }, + { + "epoch": 0.04716751762703623, + "grad_norm": 28.75, + "learning_rate": 2.499174657829132e-06, + "loss": 1.401, + "step": 194 + }, + { + "epoch": 0.04741064916119621, + "grad_norm": 24.0, + "learning_rate": 2.499157005935194e-06, + "loss": 0.6036, + "step": 195 + }, + { + "epoch": 0.04765378069535619, + "grad_norm": 21.375, + "learning_rate": 2.499139167333168e-06, + "loss": 1.1887, + "step": 196 + }, + { + "epoch": 0.04789691222951617, + "grad_norm": 23.0, + "learning_rate": 2.4991211420257195e-06, + "loss": 1.0193, + "step": 197 + }, + { + "epoch": 0.04814004376367615, + "grad_norm": 15.3125, + "learning_rate": 2.4991029300155432e-06, + "loss": 0.7501, + "step": 198 + }, + { + "epoch": 0.04838317529783613, + "grad_norm": 18.125, + "learning_rate": 2.499084531305361e-06, + "loss": 1.0581, + "step": 199 + }, + { + "epoch": 0.04862630683199611, + "grad_norm": 19.375, + "learning_rate": 2.499065945897924e-06, + "loss": 0.9109, + "step": 200 + }, + { + "epoch": 0.04886943836615609, + "grad_norm": 18.125, + "learning_rate": 2.4990471737960086e-06, + "loss": 0.59, + "step": 201 + }, + { + "epoch": 0.04911256990031607, + "grad_norm": 22.75, + "learning_rate": 2.499028215002422e-06, + "loss": 1.4277, + "step": 202 + }, + { + "epoch": 0.049355701434476054, + "grad_norm": 18.25, + "learning_rate": 2.4990090695199964e-06, + "loss": 0.7634, + "step": 203 + }, + { + "epoch": 0.049598832968636035, + "grad_norm": 18.25, + "learning_rate": 2.498989737351595e-06, + "loss": 1.0516, + "step": 204 + }, + { + "epoch": 0.049841964502796016, + "grad_norm": 20.0, + "learning_rate": 2.498970218500106e-06, + "loss": 0.9042, + "step": 205 + }, + { + "epoch": 0.050085096036955996, + "grad_norm": 16.375, + "learning_rate": 2.4989505129684473e-06, + "loss": 0.8591, + "step": 206 + }, + { + "epoch": 0.05032822757111598, + "grad_norm": 19.625, + "learning_rate": 2.498930620759565e-06, + "loss": 0.9474, + "step": 207 + }, + { + "epoch": 0.05057135910527595, + "grad_norm": 19.0, + "learning_rate": 2.498910541876431e-06, + "loss": 0.8818, + "step": 208 + }, + { + "epoch": 0.05081449063943593, + "grad_norm": 21.375, + "learning_rate": 2.4988902763220472e-06, + "loss": 0.8594, + "step": 209 + }, + { + "epoch": 0.05105762217359591, + "grad_norm": 27.25, + "learning_rate": 2.4988698240994427e-06, + "loss": 1.1321, + "step": 210 + }, + { + "epoch": 0.05130075370775589, + "grad_norm": 25.375, + "learning_rate": 2.498849185211674e-06, + "loss": 1.2577, + "step": 211 + }, + { + "epoch": 0.051543885241915874, + "grad_norm": 15.375, + "learning_rate": 2.4988283596618264e-06, + "loss": 0.5451, + "step": 212 + }, + { + "epoch": 0.051787016776075855, + "grad_norm": 20.25, + "learning_rate": 2.498807347453012e-06, + "loss": 0.8714, + "step": 213 + }, + { + "epoch": 0.052030148310235835, + "grad_norm": 30.375, + "learning_rate": 2.4987861485883726e-06, + "loss": 1.3122, + "step": 214 + }, + { + "epoch": 0.052273279844395816, + "grad_norm": 20.375, + "learning_rate": 2.4987647630710757e-06, + "loss": 0.9451, + "step": 215 + }, + { + "epoch": 0.0525164113785558, + "grad_norm": 21.25, + "learning_rate": 2.498743190904318e-06, + "loss": 0.9435, + "step": 216 + }, + { + "epoch": 0.05275954291271578, + "grad_norm": 23.875, + "learning_rate": 2.498721432091324e-06, + "loss": 1.2594, + "step": 217 + }, + { + "epoch": 0.05300267444687576, + "grad_norm": 21.375, + "learning_rate": 2.498699486635346e-06, + "loss": 1.0688, + "step": 218 + }, + { + "epoch": 0.05324580598103574, + "grad_norm": 20.0, + "learning_rate": 2.498677354539663e-06, + "loss": 0.9037, + "step": 219 + }, + { + "epoch": 0.05348893751519572, + "grad_norm": 20.125, + "learning_rate": 2.498655035807585e-06, + "loss": 0.8801, + "step": 220 + }, + { + "epoch": 0.0537320690493557, + "grad_norm": 17.875, + "learning_rate": 2.4986325304424465e-06, + "loss": 1.0117, + "step": 221 + }, + { + "epoch": 0.05397520058351568, + "grad_norm": 24.375, + "learning_rate": 2.4986098384476116e-06, + "loss": 0.8751, + "step": 222 + }, + { + "epoch": 0.05421833211767566, + "grad_norm": 16.25, + "learning_rate": 2.4985869598264724e-06, + "loss": 0.8845, + "step": 223 + }, + { + "epoch": 0.05446146365183564, + "grad_norm": 16.75, + "learning_rate": 2.498563894582448e-06, + "loss": 0.7487, + "step": 224 + }, + { + "epoch": 0.05470459518599562, + "grad_norm": 20.125, + "learning_rate": 2.498540642718986e-06, + "loss": 0.7439, + "step": 225 + }, + { + "epoch": 0.054947726720155604, + "grad_norm": 25.0, + "learning_rate": 2.4985172042395617e-06, + "loss": 1.1016, + "step": 226 + }, + { + "epoch": 0.055190858254315585, + "grad_norm": 15.625, + "learning_rate": 2.498493579147679e-06, + "loss": 0.9374, + "step": 227 + }, + { + "epoch": 0.055433989788475566, + "grad_norm": 28.0, + "learning_rate": 2.4984697674468688e-06, + "loss": 1.1315, + "step": 228 + }, + { + "epoch": 0.055677121322635546, + "grad_norm": 24.625, + "learning_rate": 2.4984457691406896e-06, + "loss": 1.1034, + "step": 229 + }, + { + "epoch": 0.05592025285679553, + "grad_norm": 19.125, + "learning_rate": 2.498421584232729e-06, + "loss": 0.92, + "step": 230 + }, + { + "epoch": 0.05616338439095551, + "grad_norm": 16.875, + "learning_rate": 2.4983972127266015e-06, + "loss": 0.7087, + "step": 231 + }, + { + "epoch": 0.05640651592511549, + "grad_norm": 20.75, + "learning_rate": 2.49837265462595e-06, + "loss": 1.2363, + "step": 232 + }, + { + "epoch": 0.05664964745927547, + "grad_norm": 21.75, + "learning_rate": 2.4983479099344454e-06, + "loss": 0.8479, + "step": 233 + }, + { + "epoch": 0.05689277899343545, + "grad_norm": 20.375, + "learning_rate": 2.498322978655786e-06, + "loss": 1.3034, + "step": 234 + }, + { + "epoch": 0.05713591052759543, + "grad_norm": 17.0, + "learning_rate": 2.498297860793698e-06, + "loss": 0.9552, + "step": 235 + }, + { + "epoch": 0.05737904206175541, + "grad_norm": 15.6875, + "learning_rate": 2.4982725563519357e-06, + "loss": 0.5749, + "step": 236 + }, + { + "epoch": 0.05762217359591539, + "grad_norm": 20.875, + "learning_rate": 2.4982470653342816e-06, + "loss": 1.4229, + "step": 237 + }, + { + "epoch": 0.05786530513007537, + "grad_norm": 19.75, + "learning_rate": 2.4982213877445456e-06, + "loss": 0.966, + "step": 238 + }, + { + "epoch": 0.058108436664235354, + "grad_norm": 19.625, + "learning_rate": 2.4981955235865657e-06, + "loss": 0.9087, + "step": 239 + }, + { + "epoch": 0.058351568198395334, + "grad_norm": 19.75, + "learning_rate": 2.4981694728642077e-06, + "loss": 1.1955, + "step": 240 + }, + { + "epoch": 0.058594699732555315, + "grad_norm": 19.5, + "learning_rate": 2.498143235581365e-06, + "loss": 0.8822, + "step": 241 + }, + { + "epoch": 0.058837831266715296, + "grad_norm": 20.25, + "learning_rate": 2.49811681174196e-06, + "loss": 1.0489, + "step": 242 + }, + { + "epoch": 0.05908096280087528, + "grad_norm": 22.5, + "learning_rate": 2.4980902013499417e-06, + "loss": 1.1603, + "step": 243 + }, + { + "epoch": 0.05932409433503526, + "grad_norm": 20.125, + "learning_rate": 2.4980634044092877e-06, + "loss": 0.8474, + "step": 244 + }, + { + "epoch": 0.05956722586919524, + "grad_norm": 21.625, + "learning_rate": 2.498036420924003e-06, + "loss": 0.9012, + "step": 245 + }, + { + "epoch": 0.05981035740335521, + "grad_norm": 18.625, + "learning_rate": 2.4980092508981204e-06, + "loss": 1.0037, + "step": 246 + }, + { + "epoch": 0.06005348893751519, + "grad_norm": 18.125, + "learning_rate": 2.497981894335702e-06, + "loss": 0.7192, + "step": 247 + }, + { + "epoch": 0.06029662047167517, + "grad_norm": 17.375, + "learning_rate": 2.4979543512408353e-06, + "loss": 1.0596, + "step": 248 + }, + { + "epoch": 0.060539752005835154, + "grad_norm": 21.75, + "learning_rate": 2.497926621617639e-06, + "loss": 0.8915, + "step": 249 + }, + { + "epoch": 0.060782883539995135, + "grad_norm": 28.375, + "learning_rate": 2.497898705470256e-06, + "loss": 1.4206, + "step": 250 + }, + { + "epoch": 0.061026015074155116, + "grad_norm": 23.125, + "learning_rate": 2.4978706028028595e-06, + "loss": 1.0814, + "step": 251 + }, + { + "epoch": 0.061269146608315096, + "grad_norm": 22.125, + "learning_rate": 2.49784231361965e-06, + "loss": 0.898, + "step": 252 + }, + { + "epoch": 0.06151227814247508, + "grad_norm": 17.5, + "learning_rate": 2.4978138379248555e-06, + "loss": 1.0512, + "step": 253 + }, + { + "epoch": 0.06175540967663506, + "grad_norm": 26.625, + "learning_rate": 2.4977851757227327e-06, + "loss": 1.4677, + "step": 254 + }, + { + "epoch": 0.06199854121079504, + "grad_norm": 16.125, + "learning_rate": 2.497756327017566e-06, + "loss": 0.4791, + "step": 255 + }, + { + "epoch": 0.06224167274495502, + "grad_norm": 24.625, + "learning_rate": 2.4977272918136656e-06, + "loss": 1.4746, + "step": 256 + }, + { + "epoch": 0.062484804279115, + "grad_norm": 20.5, + "learning_rate": 2.497698070115373e-06, + "loss": 0.7946, + "step": 257 + }, + { + "epoch": 0.06272793581327499, + "grad_norm": 23.875, + "learning_rate": 2.4976686619270555e-06, + "loss": 1.0674, + "step": 258 + }, + { + "epoch": 0.06297106734743496, + "grad_norm": 20.5, + "learning_rate": 2.4976390672531082e-06, + "loss": 1.1499, + "step": 259 + }, + { + "epoch": 0.06321419888159495, + "grad_norm": 26.5, + "learning_rate": 2.497609286097955e-06, + "loss": 1.1468, + "step": 260 + }, + { + "epoch": 0.06345733041575492, + "grad_norm": 17.75, + "learning_rate": 2.497579318466047e-06, + "loss": 0.8431, + "step": 261 + }, + { + "epoch": 0.06370046194991491, + "grad_norm": 15.125, + "learning_rate": 2.4975491643618633e-06, + "loss": 0.6385, + "step": 262 + }, + { + "epoch": 0.06394359348407488, + "grad_norm": 20.0, + "learning_rate": 2.497518823789911e-06, + "loss": 1.1233, + "step": 263 + }, + { + "epoch": 0.06418672501823487, + "grad_norm": 19.0, + "learning_rate": 2.4974882967547255e-06, + "loss": 1.2095, + "step": 264 + }, + { + "epoch": 0.06442985655239485, + "grad_norm": 14.125, + "learning_rate": 2.497457583260869e-06, + "loss": 0.4845, + "step": 265 + }, + { + "epoch": 0.06467298808655482, + "grad_norm": 16.125, + "learning_rate": 2.497426683312932e-06, + "loss": 0.7712, + "step": 266 + }, + { + "epoch": 0.06491611962071481, + "grad_norm": 21.5, + "learning_rate": 2.497395596915534e-06, + "loss": 1.0351, + "step": 267 + }, + { + "epoch": 0.06515925115487478, + "grad_norm": 27.625, + "learning_rate": 2.49736432407332e-06, + "loss": 0.9552, + "step": 268 + }, + { + "epoch": 0.06540238268903477, + "grad_norm": 32.5, + "learning_rate": 2.4973328647909657e-06, + "loss": 1.0366, + "step": 269 + }, + { + "epoch": 0.06564551422319474, + "grad_norm": 34.25, + "learning_rate": 2.4973012190731723e-06, + "loss": 1.2298, + "step": 270 + }, + { + "epoch": 0.06588864575735473, + "grad_norm": 14.4375, + "learning_rate": 2.49726938692467e-06, + "loss": 0.7068, + "step": 271 + }, + { + "epoch": 0.0661317772915147, + "grad_norm": 14.8125, + "learning_rate": 2.497237368350217e-06, + "loss": 0.4992, + "step": 272 + }, + { + "epoch": 0.06637490882567469, + "grad_norm": 18.375, + "learning_rate": 2.4972051633545987e-06, + "loss": 0.6491, + "step": 273 + }, + { + "epoch": 0.06661804035983467, + "grad_norm": 18.375, + "learning_rate": 2.4971727719426282e-06, + "loss": 1.1035, + "step": 274 + }, + { + "epoch": 0.06686117189399465, + "grad_norm": 14.3125, + "learning_rate": 2.497140194119148e-06, + "loss": 0.481, + "step": 275 + }, + { + "epoch": 0.06710430342815463, + "grad_norm": 19.125, + "learning_rate": 2.497107429889027e-06, + "loss": 1.1583, + "step": 276 + }, + { + "epoch": 0.06734743496231461, + "grad_norm": 15.3125, + "learning_rate": 2.4970744792571622e-06, + "loss": 0.6236, + "step": 277 + }, + { + "epoch": 0.06759056649647459, + "grad_norm": 25.375, + "learning_rate": 2.497041342228479e-06, + "loss": 1.2272, + "step": 278 + }, + { + "epoch": 0.06783369803063458, + "grad_norm": 17.75, + "learning_rate": 2.4970080188079297e-06, + "loss": 1.0533, + "step": 279 + }, + { + "epoch": 0.06807682956479455, + "grad_norm": 18.875, + "learning_rate": 2.4969745090004952e-06, + "loss": 0.9864, + "step": 280 + }, + { + "epoch": 0.06831996109895454, + "grad_norm": 37.5, + "learning_rate": 2.496940812811185e-06, + "loss": 1.1633, + "step": 281 + }, + { + "epoch": 0.06856309263311451, + "grad_norm": 27.875, + "learning_rate": 2.4969069302450345e-06, + "loss": 1.5523, + "step": 282 + }, + { + "epoch": 0.0688062241672745, + "grad_norm": 23.375, + "learning_rate": 2.4968728613071086e-06, + "loss": 1.1659, + "step": 283 + }, + { + "epoch": 0.06904935570143447, + "grad_norm": 17.875, + "learning_rate": 2.496838606002499e-06, + "loss": 0.8856, + "step": 284 + }, + { + "epoch": 0.06929248723559446, + "grad_norm": 16.375, + "learning_rate": 2.4968041643363265e-06, + "loss": 0.5519, + "step": 285 + }, + { + "epoch": 0.06953561876975443, + "grad_norm": 16.75, + "learning_rate": 2.4967695363137385e-06, + "loss": 0.8209, + "step": 286 + }, + { + "epoch": 0.06977875030391442, + "grad_norm": 18.625, + "learning_rate": 2.4967347219399108e-06, + "loss": 1.0672, + "step": 287 + }, + { + "epoch": 0.0700218818380744, + "grad_norm": 18.125, + "learning_rate": 2.496699721220047e-06, + "loss": 0.9386, + "step": 288 + }, + { + "epoch": 0.07026501337223438, + "grad_norm": 18.625, + "learning_rate": 2.4966645341593786e-06, + "loss": 1.0656, + "step": 289 + }, + { + "epoch": 0.07050814490639436, + "grad_norm": 17.625, + "learning_rate": 2.496629160763165e-06, + "loss": 0.8377, + "step": 290 + }, + { + "epoch": 0.07075127644055434, + "grad_norm": 16.625, + "learning_rate": 2.4965936010366934e-06, + "loss": 0.7864, + "step": 291 + }, + { + "epoch": 0.07099440797471432, + "grad_norm": 18.5, + "learning_rate": 2.4965578549852786e-06, + "loss": 0.8277, + "step": 292 + }, + { + "epoch": 0.0712375395088743, + "grad_norm": 20.875, + "learning_rate": 2.4965219226142635e-06, + "loss": 1.3112, + "step": 293 + }, + { + "epoch": 0.07148067104303428, + "grad_norm": 21.875, + "learning_rate": 2.496485803929019e-06, + "loss": 1.0034, + "step": 294 + }, + { + "epoch": 0.07172380257719427, + "grad_norm": 20.75, + "learning_rate": 2.4964494989349437e-06, + "loss": 0.9285, + "step": 295 + }, + { + "epoch": 0.07196693411135424, + "grad_norm": 21.0, + "learning_rate": 2.4964130076374632e-06, + "loss": 0.8082, + "step": 296 + }, + { + "epoch": 0.07221006564551423, + "grad_norm": 20.125, + "learning_rate": 2.496376330042033e-06, + "loss": 1.1866, + "step": 297 + }, + { + "epoch": 0.0724531971796742, + "grad_norm": 22.875, + "learning_rate": 2.4963394661541345e-06, + "loss": 1.0004, + "step": 298 + }, + { + "epoch": 0.07269632871383419, + "grad_norm": 17.0, + "learning_rate": 2.4963024159792778e-06, + "loss": 1.0495, + "step": 299 + }, + { + "epoch": 0.07293946024799416, + "grad_norm": 15.875, + "learning_rate": 2.496265179523e-06, + "loss": 0.7691, + "step": 300 + }, + { + "epoch": 0.07318259178215415, + "grad_norm": 25.375, + "learning_rate": 2.496227756790868e-06, + "loss": 0.7636, + "step": 301 + }, + { + "epoch": 0.07342572331631413, + "grad_norm": 18.0, + "learning_rate": 2.496190147788475e-06, + "loss": 0.7876, + "step": 302 + }, + { + "epoch": 0.0736688548504741, + "grad_norm": 14.625, + "learning_rate": 2.4961523525214414e-06, + "loss": 0.8633, + "step": 303 + }, + { + "epoch": 0.07391198638463409, + "grad_norm": 20.375, + "learning_rate": 2.4961143709954174e-06, + "loss": 1.1787, + "step": 304 + }, + { + "epoch": 0.07415511791879406, + "grad_norm": 20.375, + "learning_rate": 2.496076203216079e-06, + "loss": 0.9555, + "step": 305 + }, + { + "epoch": 0.07439824945295405, + "grad_norm": 18.5, + "learning_rate": 2.4960378491891317e-06, + "loss": 0.9444, + "step": 306 + }, + { + "epoch": 0.07464138098711402, + "grad_norm": 16.875, + "learning_rate": 2.4959993089203084e-06, + "loss": 0.6795, + "step": 307 + }, + { + "epoch": 0.07488451252127401, + "grad_norm": 26.625, + "learning_rate": 2.4959605824153687e-06, + "loss": 0.9794, + "step": 308 + }, + { + "epoch": 0.07512764405543398, + "grad_norm": 24.375, + "learning_rate": 2.4959216696801012e-06, + "loss": 0.8844, + "step": 309 + }, + { + "epoch": 0.07537077558959397, + "grad_norm": 15.8125, + "learning_rate": 2.4958825707203234e-06, + "loss": 0.6024, + "step": 310 + }, + { + "epoch": 0.07561390712375395, + "grad_norm": 18.0, + "learning_rate": 2.4958432855418776e-06, + "loss": 0.7961, + "step": 311 + }, + { + "epoch": 0.07585703865791393, + "grad_norm": 19.0, + "learning_rate": 2.4958038141506363e-06, + "loss": 1.1005, + "step": 312 + }, + { + "epoch": 0.07610017019207391, + "grad_norm": 15.8125, + "learning_rate": 2.495764156552499e-06, + "loss": 0.8304, + "step": 313 + }, + { + "epoch": 0.0763433017262339, + "grad_norm": 25.5, + "learning_rate": 2.495724312753394e-06, + "loss": 1.1424, + "step": 314 + }, + { + "epoch": 0.07658643326039387, + "grad_norm": 21.375, + "learning_rate": 2.4956842827592757e-06, + "loss": 1.0434, + "step": 315 + }, + { + "epoch": 0.07682956479455386, + "grad_norm": 23.0, + "learning_rate": 2.495644066576128e-06, + "loss": 1.0894, + "step": 316 + }, + { + "epoch": 0.07707269632871383, + "grad_norm": 20.875, + "learning_rate": 2.4956036642099613e-06, + "loss": 1.2238, + "step": 317 + }, + { + "epoch": 0.07731582786287382, + "grad_norm": 29.125, + "learning_rate": 2.4955630756668143e-06, + "loss": 1.6621, + "step": 318 + }, + { + "epoch": 0.07755895939703379, + "grad_norm": 26.0, + "learning_rate": 2.495522300952754e-06, + "loss": 1.5402, + "step": 319 + }, + { + "epoch": 0.07780209093119378, + "grad_norm": 16.875, + "learning_rate": 2.4954813400738754e-06, + "loss": 0.5566, + "step": 320 + }, + { + "epoch": 0.07804522246535375, + "grad_norm": 25.625, + "learning_rate": 2.4954401930363003e-06, + "loss": 0.8753, + "step": 321 + }, + { + "epoch": 0.07828835399951374, + "grad_norm": 20.125, + "learning_rate": 2.4953988598461788e-06, + "loss": 1.0414, + "step": 322 + }, + { + "epoch": 0.07853148553367371, + "grad_norm": 20.375, + "learning_rate": 2.4953573405096886e-06, + "loss": 0.88, + "step": 323 + }, + { + "epoch": 0.0787746170678337, + "grad_norm": 15.625, + "learning_rate": 2.495315635033036e-06, + "loss": 0.79, + "step": 324 + }, + { + "epoch": 0.07901774860199368, + "grad_norm": 20.75, + "learning_rate": 2.4952737434224545e-06, + "loss": 0.9045, + "step": 325 + }, + { + "epoch": 0.07926088013615366, + "grad_norm": 22.25, + "learning_rate": 2.4952316656842055e-06, + "loss": 1.2891, + "step": 326 + }, + { + "epoch": 0.07950401167031364, + "grad_norm": 35.25, + "learning_rate": 2.495189401824578e-06, + "loss": 1.3177, + "step": 327 + }, + { + "epoch": 0.07974714320447363, + "grad_norm": 52.5, + "learning_rate": 2.495146951849889e-06, + "loss": 1.2862, + "step": 328 + }, + { + "epoch": 0.0799902747386336, + "grad_norm": 16.625, + "learning_rate": 2.495104315766484e-06, + "loss": 0.6278, + "step": 329 + }, + { + "epoch": 0.08023340627279359, + "grad_norm": 18.0, + "learning_rate": 2.495061493580735e-06, + "loss": 0.8916, + "step": 330 + }, + { + "epoch": 0.08047653780695356, + "grad_norm": 14.625, + "learning_rate": 2.495018485299043e-06, + "loss": 0.7179, + "step": 331 + }, + { + "epoch": 0.08071966934111355, + "grad_norm": 17.0, + "learning_rate": 2.4949752909278363e-06, + "loss": 1.0449, + "step": 332 + }, + { + "epoch": 0.08096280087527352, + "grad_norm": 16.875, + "learning_rate": 2.494931910473571e-06, + "loss": 0.7221, + "step": 333 + }, + { + "epoch": 0.08120593240943351, + "grad_norm": 14.5625, + "learning_rate": 2.4948883439427305e-06, + "loss": 0.7525, + "step": 334 + }, + { + "epoch": 0.08144906394359348, + "grad_norm": 18.375, + "learning_rate": 2.4948445913418272e-06, + "loss": 0.8925, + "step": 335 + }, + { + "epoch": 0.08169219547775347, + "grad_norm": 26.5, + "learning_rate": 2.4948006526774003e-06, + "loss": 1.4271, + "step": 336 + }, + { + "epoch": 0.08193532701191344, + "grad_norm": 24.0, + "learning_rate": 2.4947565279560183e-06, + "loss": 1.5347, + "step": 337 + }, + { + "epoch": 0.08217845854607343, + "grad_norm": 21.75, + "learning_rate": 2.4947122171842747e-06, + "loss": 0.8424, + "step": 338 + }, + { + "epoch": 0.0824215900802334, + "grad_norm": 22.125, + "learning_rate": 2.4946677203687933e-06, + "loss": 1.273, + "step": 339 + }, + { + "epoch": 0.08266472161439338, + "grad_norm": 14.6875, + "learning_rate": 2.494623037516225e-06, + "loss": 0.5317, + "step": 340 + }, + { + "epoch": 0.08290785314855337, + "grad_norm": 16.875, + "learning_rate": 2.494578168633249e-06, + "loss": 0.8341, + "step": 341 + }, + { + "epoch": 0.08315098468271334, + "grad_norm": 24.5, + "learning_rate": 2.49453311372657e-06, + "loss": 0.9029, + "step": 342 + }, + { + "epoch": 0.08339411621687333, + "grad_norm": 16.125, + "learning_rate": 2.494487872802924e-06, + "loss": 0.815, + "step": 343 + }, + { + "epoch": 0.0836372477510333, + "grad_norm": 19.375, + "learning_rate": 2.4944424458690727e-06, + "loss": 0.8027, + "step": 344 + }, + { + "epoch": 0.08388037928519329, + "grad_norm": 21.25, + "learning_rate": 2.4943968329318046e-06, + "loss": 1.3753, + "step": 345 + }, + { + "epoch": 0.08412351081935326, + "grad_norm": 17.625, + "learning_rate": 2.4943510339979394e-06, + "loss": 0.7825, + "step": 346 + }, + { + "epoch": 0.08436664235351325, + "grad_norm": 19.125, + "learning_rate": 2.4943050490743208e-06, + "loss": 1.1817, + "step": 347 + }, + { + "epoch": 0.08460977388767323, + "grad_norm": 21.0, + "learning_rate": 2.4942588781678227e-06, + "loss": 0.9534, + "step": 348 + }, + { + "epoch": 0.08485290542183321, + "grad_norm": 20.375, + "learning_rate": 2.4942125212853465e-06, + "loss": 1.1751, + "step": 349 + }, + { + "epoch": 0.08509603695599319, + "grad_norm": 27.25, + "learning_rate": 2.494165978433821e-06, + "loss": 1.0186, + "step": 350 + }, + { + "epoch": 0.08533916849015317, + "grad_norm": 23.375, + "learning_rate": 2.4941192496202016e-06, + "loss": 1.0297, + "step": 351 + }, + { + "epoch": 0.08558230002431315, + "grad_norm": 15.9375, + "learning_rate": 2.4940723348514746e-06, + "loss": 0.7368, + "step": 352 + }, + { + "epoch": 0.08582543155847314, + "grad_norm": 20.75, + "learning_rate": 2.4940252341346503e-06, + "loss": 1.2031, + "step": 353 + }, + { + "epoch": 0.08606856309263311, + "grad_norm": 31.125, + "learning_rate": 2.4939779474767706e-06, + "loss": 1.6584, + "step": 354 + }, + { + "epoch": 0.0863116946267931, + "grad_norm": 33.5, + "learning_rate": 2.493930474884902e-06, + "loss": 1.4518, + "step": 355 + }, + { + "epoch": 0.08655482616095307, + "grad_norm": 19.75, + "learning_rate": 2.4938828163661405e-06, + "loss": 1.035, + "step": 356 + }, + { + "epoch": 0.08679795769511306, + "grad_norm": 56.75, + "learning_rate": 2.4938349719276096e-06, + "loss": 1.2927, + "step": 357 + }, + { + "epoch": 0.08704108922927303, + "grad_norm": 20.625, + "learning_rate": 2.4937869415764602e-06, + "loss": 0.8661, + "step": 358 + }, + { + "epoch": 0.08728422076343302, + "grad_norm": 17.875, + "learning_rate": 2.4937387253198715e-06, + "loss": 0.5965, + "step": 359 + }, + { + "epoch": 0.087527352297593, + "grad_norm": 22.875, + "learning_rate": 2.4936903231650504e-06, + "loss": 1.2066, + "step": 360 + }, + { + "epoch": 0.08777048383175298, + "grad_norm": 19.125, + "learning_rate": 2.493641735119231e-06, + "loss": 0.9761, + "step": 361 + }, + { + "epoch": 0.08801361536591296, + "grad_norm": 16.375, + "learning_rate": 2.493592961189676e-06, + "loss": 0.4858, + "step": 362 + }, + { + "epoch": 0.08825674690007294, + "grad_norm": 27.125, + "learning_rate": 2.493544001383675e-06, + "loss": 0.8465, + "step": 363 + }, + { + "epoch": 0.08849987843423292, + "grad_norm": 21.75, + "learning_rate": 2.493494855708547e-06, + "loss": 0.8336, + "step": 364 + }, + { + "epoch": 0.0887430099683929, + "grad_norm": 21.375, + "learning_rate": 2.4934455241716365e-06, + "loss": 1.1203, + "step": 365 + }, + { + "epoch": 0.08898614150255288, + "grad_norm": 19.75, + "learning_rate": 2.493396006780317e-06, + "loss": 1.0841, + "step": 366 + }, + { + "epoch": 0.08922927303671287, + "grad_norm": 35.5, + "learning_rate": 2.493346303541991e-06, + "loss": 1.5637, + "step": 367 + }, + { + "epoch": 0.08947240457087284, + "grad_norm": 37.25, + "learning_rate": 2.4932964144640858e-06, + "loss": 1.0716, + "step": 368 + }, + { + "epoch": 0.08971553610503283, + "grad_norm": 20.0, + "learning_rate": 2.493246339554059e-06, + "loss": 1.3607, + "step": 369 + }, + { + "epoch": 0.0899586676391928, + "grad_norm": 15.625, + "learning_rate": 2.4931960788193956e-06, + "loss": 0.6747, + "step": 370 + }, + { + "epoch": 0.09020179917335279, + "grad_norm": 16.875, + "learning_rate": 2.493145632267607e-06, + "loss": 0.7467, + "step": 371 + }, + { + "epoch": 0.09044493070751276, + "grad_norm": 21.75, + "learning_rate": 2.493094999906234e-06, + "loss": 1.1146, + "step": 372 + }, + { + "epoch": 0.09068806224167275, + "grad_norm": 20.5, + "learning_rate": 2.4930441817428443e-06, + "loss": 1.0361, + "step": 373 + }, + { + "epoch": 0.09093119377583272, + "grad_norm": 18.25, + "learning_rate": 2.4929931777850338e-06, + "loss": 0.9021, + "step": 374 + }, + { + "epoch": 0.09117432530999271, + "grad_norm": 20.625, + "learning_rate": 2.4929419880404252e-06, + "loss": 0.8993, + "step": 375 + }, + { + "epoch": 0.09141745684415269, + "grad_norm": 23.375, + "learning_rate": 2.4928906125166703e-06, + "loss": 1.1707, + "step": 376 + }, + { + "epoch": 0.09166058837831267, + "grad_norm": 17.0, + "learning_rate": 2.492839051221448e-06, + "loss": 0.9641, + "step": 377 + }, + { + "epoch": 0.09190371991247265, + "grad_norm": 21.25, + "learning_rate": 2.4927873041624645e-06, + "loss": 0.966, + "step": 378 + }, + { + "epoch": 0.09214685144663262, + "grad_norm": 25.25, + "learning_rate": 2.492735371347455e-06, + "loss": 1.3207, + "step": 379 + }, + { + "epoch": 0.09238998298079261, + "grad_norm": 20.125, + "learning_rate": 2.4926832527841815e-06, + "loss": 1.0189, + "step": 380 + }, + { + "epoch": 0.09263311451495258, + "grad_norm": 14.5, + "learning_rate": 2.492630948480434e-06, + "loss": 0.5323, + "step": 381 + }, + { + "epoch": 0.09287624604911257, + "grad_norm": 29.5, + "learning_rate": 2.4925784584440304e-06, + "loss": 1.523, + "step": 382 + }, + { + "epoch": 0.09311937758327254, + "grad_norm": 25.625, + "learning_rate": 2.4925257826828157e-06, + "loss": 1.3599, + "step": 383 + }, + { + "epoch": 0.09336250911743253, + "grad_norm": 16.625, + "learning_rate": 2.492472921204664e-06, + "loss": 0.6617, + "step": 384 + }, + { + "epoch": 0.0936056406515925, + "grad_norm": 17.125, + "learning_rate": 2.492419874017476e-06, + "loss": 0.6534, + "step": 385 + }, + { + "epoch": 0.0938487721857525, + "grad_norm": 19.125, + "learning_rate": 2.4923666411291802e-06, + "loss": 1.0552, + "step": 386 + }, + { + "epoch": 0.09409190371991247, + "grad_norm": 17.0, + "learning_rate": 2.4923132225477336e-06, + "loss": 1.0621, + "step": 387 + }, + { + "epoch": 0.09433503525407246, + "grad_norm": 20.25, + "learning_rate": 2.4922596182811206e-06, + "loss": 1.003, + "step": 388 + }, + { + "epoch": 0.09457816678823243, + "grad_norm": 22.75, + "learning_rate": 2.492205828337353e-06, + "loss": 1.0473, + "step": 389 + }, + { + "epoch": 0.09482129832239242, + "grad_norm": 33.5, + "learning_rate": 2.4921518527244705e-06, + "loss": 1.5681, + "step": 390 + }, + { + "epoch": 0.09506442985655239, + "grad_norm": 16.25, + "learning_rate": 2.492097691450541e-06, + "loss": 0.9737, + "step": 391 + }, + { + "epoch": 0.09530756139071238, + "grad_norm": 19.375, + "learning_rate": 2.4920433445236596e-06, + "loss": 0.7637, + "step": 392 + }, + { + "epoch": 0.09555069292487235, + "grad_norm": 18.875, + "learning_rate": 2.4919888119519496e-06, + "loss": 1.172, + "step": 393 + }, + { + "epoch": 0.09579382445903234, + "grad_norm": 16.0, + "learning_rate": 2.491934093743562e-06, + "loss": 0.7444, + "step": 394 + }, + { + "epoch": 0.09603695599319231, + "grad_norm": 24.25, + "learning_rate": 2.491879189906675e-06, + "loss": 0.971, + "step": 395 + }, + { + "epoch": 0.0962800875273523, + "grad_norm": 16.375, + "learning_rate": 2.491824100449495e-06, + "loss": 0.8551, + "step": 396 + }, + { + "epoch": 0.09652321906151227, + "grad_norm": 18.875, + "learning_rate": 2.4917688253802563e-06, + "loss": 0.8836, + "step": 397 + }, + { + "epoch": 0.09676635059567226, + "grad_norm": 26.625, + "learning_rate": 2.4917133647072204e-06, + "loss": 1.2094, + "step": 398 + }, + { + "epoch": 0.09700948212983224, + "grad_norm": 13.4375, + "learning_rate": 2.4916577184386775e-06, + "loss": 0.5406, + "step": 399 + }, + { + "epoch": 0.09725261366399222, + "grad_norm": 21.375, + "learning_rate": 2.491601886582944e-06, + "loss": 1.1714, + "step": 400 + }, + { + "epoch": 0.0974957451981522, + "grad_norm": 18.25, + "learning_rate": 2.491545869148365e-06, + "loss": 0.9329, + "step": 401 + }, + { + "epoch": 0.09773887673231219, + "grad_norm": 20.5, + "learning_rate": 2.491489666143314e-06, + "loss": 0.8319, + "step": 402 + }, + { + "epoch": 0.09798200826647216, + "grad_norm": 14.4375, + "learning_rate": 2.491433277576191e-06, + "loss": 0.6509, + "step": 403 + }, + { + "epoch": 0.09822513980063215, + "grad_norm": 23.875, + "learning_rate": 2.491376703455425e-06, + "loss": 1.1198, + "step": 404 + }, + { + "epoch": 0.09846827133479212, + "grad_norm": 20.0, + "learning_rate": 2.4913199437894705e-06, + "loss": 0.7157, + "step": 405 + }, + { + "epoch": 0.09871140286895211, + "grad_norm": 16.5, + "learning_rate": 2.4912629985868126e-06, + "loss": 0.7598, + "step": 406 + }, + { + "epoch": 0.09895453440311208, + "grad_norm": 24.125, + "learning_rate": 2.491205867855962e-06, + "loss": 1.2696, + "step": 407 + }, + { + "epoch": 0.09919766593727207, + "grad_norm": 20.125, + "learning_rate": 2.4911485516054577e-06, + "loss": 0.8278, + "step": 408 + }, + { + "epoch": 0.09944079747143204, + "grad_norm": 15.25, + "learning_rate": 2.4910910498438674e-06, + "loss": 0.4924, + "step": 409 + }, + { + "epoch": 0.09968392900559203, + "grad_norm": 20.875, + "learning_rate": 2.4910333625797856e-06, + "loss": 1.1013, + "step": 410 + }, + { + "epoch": 0.099927060539752, + "grad_norm": 17.25, + "learning_rate": 2.490975489821834e-06, + "loss": 0.7482, + "step": 411 + }, + { + "epoch": 0.10017019207391199, + "grad_norm": 17.5, + "learning_rate": 2.490917431578663e-06, + "loss": 0.6875, + "step": 412 + }, + { + "epoch": 0.10041332360807197, + "grad_norm": 17.625, + "learning_rate": 2.4908591878589507e-06, + "loss": 1.0509, + "step": 413 + }, + { + "epoch": 0.10065645514223195, + "grad_norm": 23.25, + "learning_rate": 2.4908007586714023e-06, + "loss": 1.0468, + "step": 414 + }, + { + "epoch": 0.10089958667639193, + "grad_norm": 22.125, + "learning_rate": 2.490742144024751e-06, + "loss": 0.7809, + "step": 415 + }, + { + "epoch": 0.1011427182105519, + "grad_norm": 28.375, + "learning_rate": 2.4906833439277577e-06, + "loss": 0.9365, + "step": 416 + }, + { + "epoch": 0.10138584974471189, + "grad_norm": 15.75, + "learning_rate": 2.490624358389212e-06, + "loss": 0.6247, + "step": 417 + }, + { + "epoch": 0.10162898127887186, + "grad_norm": 16.0, + "learning_rate": 2.4905651874179294e-06, + "loss": 0.7028, + "step": 418 + }, + { + "epoch": 0.10187211281303185, + "grad_norm": 17.375, + "learning_rate": 2.4905058310227536e-06, + "loss": 0.9292, + "step": 419 + }, + { + "epoch": 0.10211524434719182, + "grad_norm": 18.0, + "learning_rate": 2.490446289212558e-06, + "loss": 0.9055, + "step": 420 + }, + { + "epoch": 0.10235837588135181, + "grad_norm": 18.25, + "learning_rate": 2.4903865619962405e-06, + "loss": 1.0531, + "step": 421 + }, + { + "epoch": 0.10260150741551179, + "grad_norm": 21.25, + "learning_rate": 2.4903266493827294e-06, + "loss": 1.2379, + "step": 422 + }, + { + "epoch": 0.10284463894967177, + "grad_norm": 21.375, + "learning_rate": 2.4902665513809793e-06, + "loss": 1.206, + "step": 423 + }, + { + "epoch": 0.10308777048383175, + "grad_norm": 18.875, + "learning_rate": 2.490206267999973e-06, + "loss": 0.9579, + "step": 424 + }, + { + "epoch": 0.10333090201799174, + "grad_norm": 24.125, + "learning_rate": 2.490145799248721e-06, + "loss": 1.1271, + "step": 425 + }, + { + "epoch": 0.10357403355215171, + "grad_norm": 19.75, + "learning_rate": 2.4900851451362612e-06, + "loss": 1.0493, + "step": 426 + }, + { + "epoch": 0.1038171650863117, + "grad_norm": 20.5, + "learning_rate": 2.4900243056716593e-06, + "loss": 1.2056, + "step": 427 + }, + { + "epoch": 0.10406029662047167, + "grad_norm": 15.8125, + "learning_rate": 2.489963280864009e-06, + "loss": 0.7268, + "step": 428 + }, + { + "epoch": 0.10430342815463166, + "grad_norm": 20.125, + "learning_rate": 2.489902070722431e-06, + "loss": 1.0395, + "step": 429 + }, + { + "epoch": 0.10454655968879163, + "grad_norm": 15.0625, + "learning_rate": 2.4898406752560756e-06, + "loss": 0.7524, + "step": 430 + }, + { + "epoch": 0.10478969122295162, + "grad_norm": 21.5, + "learning_rate": 2.489779094474118e-06, + "loss": 1.1313, + "step": 431 + }, + { + "epoch": 0.1050328227571116, + "grad_norm": 21.0, + "learning_rate": 2.4897173283857628e-06, + "loss": 1.0249, + "step": 432 + }, + { + "epoch": 0.10527595429127158, + "grad_norm": 19.5, + "learning_rate": 2.4896553770002425e-06, + "loss": 1.0617, + "step": 433 + }, + { + "epoch": 0.10551908582543156, + "grad_norm": 23.0, + "learning_rate": 2.4895932403268165e-06, + "loss": 1.1214, + "step": 434 + }, + { + "epoch": 0.10576221735959154, + "grad_norm": 18.875, + "learning_rate": 2.4895309183747725e-06, + "loss": 0.982, + "step": 435 + }, + { + "epoch": 0.10600534889375152, + "grad_norm": 19.375, + "learning_rate": 2.4894684111534247e-06, + "loss": 1.0424, + "step": 436 + }, + { + "epoch": 0.1062484804279115, + "grad_norm": 28.5, + "learning_rate": 2.489405718672117e-06, + "loss": 1.1515, + "step": 437 + }, + { + "epoch": 0.10649161196207148, + "grad_norm": 23.125, + "learning_rate": 2.489342840940219e-06, + "loss": 1.0771, + "step": 438 + }, + { + "epoch": 0.10673474349623147, + "grad_norm": 23.75, + "learning_rate": 2.4892797779671298e-06, + "loss": 1.128, + "step": 439 + }, + { + "epoch": 0.10697787503039144, + "grad_norm": 20.125, + "learning_rate": 2.4892165297622737e-06, + "loss": 0.7404, + "step": 440 + }, + { + "epoch": 0.10722100656455143, + "grad_norm": 16.5, + "learning_rate": 2.489153096335106e-06, + "loss": 0.825, + "step": 441 + }, + { + "epoch": 0.1074641380987114, + "grad_norm": 20.5, + "learning_rate": 2.489089477695107e-06, + "loss": 0.9519, + "step": 442 + }, + { + "epoch": 0.10770726963287139, + "grad_norm": 18.875, + "learning_rate": 2.4890256738517853e-06, + "loss": 1.0615, + "step": 443 + }, + { + "epoch": 0.10795040116703136, + "grad_norm": 22.25, + "learning_rate": 2.4889616848146786e-06, + "loss": 1.0737, + "step": 444 + }, + { + "epoch": 0.10819353270119135, + "grad_norm": 19.875, + "learning_rate": 2.48889751059335e-06, + "loss": 0.9922, + "step": 445 + }, + { + "epoch": 0.10843666423535132, + "grad_norm": 24.75, + "learning_rate": 2.4888331511973924e-06, + "loss": 1.0417, + "step": 446 + }, + { + "epoch": 0.10867979576951131, + "grad_norm": 25.375, + "learning_rate": 2.4887686066364246e-06, + "loss": 1.2682, + "step": 447 + }, + { + "epoch": 0.10892292730367129, + "grad_norm": 22.5, + "learning_rate": 2.4887038769200943e-06, + "loss": 0.9191, + "step": 448 + }, + { + "epoch": 0.10916605883783127, + "grad_norm": 24.25, + "learning_rate": 2.4886389620580763e-06, + "loss": 1.0753, + "step": 449 + }, + { + "epoch": 0.10940919037199125, + "grad_norm": 19.125, + "learning_rate": 2.4885738620600737e-06, + "loss": 0.8045, + "step": 450 + }, + { + "epoch": 0.10965232190615123, + "grad_norm": 17.375, + "learning_rate": 2.4885085769358166e-06, + "loss": 0.7605, + "step": 451 + }, + { + "epoch": 0.10989545344031121, + "grad_norm": 19.0, + "learning_rate": 2.4884431066950626e-06, + "loss": 1.004, + "step": 452 + }, + { + "epoch": 0.11013858497447118, + "grad_norm": 20.0, + "learning_rate": 2.488377451347598e-06, + "loss": 0.8924, + "step": 453 + }, + { + "epoch": 0.11038171650863117, + "grad_norm": 19.0, + "learning_rate": 2.4883116109032352e-06, + "loss": 0.7186, + "step": 454 + }, + { + "epoch": 0.11062484804279114, + "grad_norm": 16.5, + "learning_rate": 2.4882455853718164e-06, + "loss": 0.7337, + "step": 455 + }, + { + "epoch": 0.11086797957695113, + "grad_norm": 28.75, + "learning_rate": 2.488179374763209e-06, + "loss": 1.0111, + "step": 456 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 23.5, + "learning_rate": 2.4881129790873106e-06, + "loss": 1.238, + "step": 457 + }, + { + "epoch": 0.11135424264527109, + "grad_norm": 24.375, + "learning_rate": 2.488046398354044e-06, + "loss": 1.1166, + "step": 458 + }, + { + "epoch": 0.11159737417943107, + "grad_norm": 20.0, + "learning_rate": 2.4879796325733612e-06, + "loss": 0.9251, + "step": 459 + }, + { + "epoch": 0.11184050571359105, + "grad_norm": 18.625, + "learning_rate": 2.487912681755242e-06, + "loss": 0.9987, + "step": 460 + }, + { + "epoch": 0.11208363724775103, + "grad_norm": 16.5, + "learning_rate": 2.4878455459096936e-06, + "loss": 0.8793, + "step": 461 + }, + { + "epoch": 0.11232676878191102, + "grad_norm": 16.625, + "learning_rate": 2.4877782250467493e-06, + "loss": 0.7242, + "step": 462 + }, + { + "epoch": 0.11256990031607099, + "grad_norm": 23.375, + "learning_rate": 2.487710719176472e-06, + "loss": 1.1936, + "step": 463 + }, + { + "epoch": 0.11281303185023098, + "grad_norm": 19.125, + "learning_rate": 2.487643028308952e-06, + "loss": 0.8931, + "step": 464 + }, + { + "epoch": 0.11305616338439095, + "grad_norm": 23.75, + "learning_rate": 2.4875751524543067e-06, + "loss": 1.3118, + "step": 465 + }, + { + "epoch": 0.11329929491855094, + "grad_norm": 23.75, + "learning_rate": 2.487507091622681e-06, + "loss": 1.5804, + "step": 466 + }, + { + "epoch": 0.11354242645271091, + "grad_norm": 20.875, + "learning_rate": 2.487438845824248e-06, + "loss": 0.9065, + "step": 467 + }, + { + "epoch": 0.1137855579868709, + "grad_norm": 13.375, + "learning_rate": 2.487370415069208e-06, + "loss": 0.5431, + "step": 468 + }, + { + "epoch": 0.11402868952103087, + "grad_norm": 20.125, + "learning_rate": 2.48730179936779e-06, + "loss": 1.0669, + "step": 469 + }, + { + "epoch": 0.11427182105519086, + "grad_norm": 17.625, + "learning_rate": 2.4872329987302484e-06, + "loss": 0.8652, + "step": 470 + }, + { + "epoch": 0.11451495258935084, + "grad_norm": 20.75, + "learning_rate": 2.487164013166868e-06, + "loss": 1.491, + "step": 471 + }, + { + "epoch": 0.11475808412351082, + "grad_norm": 25.625, + "learning_rate": 2.487094842687959e-06, + "loss": 1.1311, + "step": 472 + }, + { + "epoch": 0.1150012156576708, + "grad_norm": 22.375, + "learning_rate": 2.4870254873038602e-06, + "loss": 1.1097, + "step": 473 + }, + { + "epoch": 0.11524434719183078, + "grad_norm": 15.5625, + "learning_rate": 2.4869559470249384e-06, + "loss": 0.6992, + "step": 474 + }, + { + "epoch": 0.11548747872599076, + "grad_norm": 19.875, + "learning_rate": 2.486886221861587e-06, + "loss": 0.8973, + "step": 475 + }, + { + "epoch": 0.11573061026015075, + "grad_norm": 20.75, + "learning_rate": 2.4868163118242283e-06, + "loss": 1.2272, + "step": 476 + }, + { + "epoch": 0.11597374179431072, + "grad_norm": 17.625, + "learning_rate": 2.486746216923311e-06, + "loss": 1.1184, + "step": 477 + }, + { + "epoch": 0.11621687332847071, + "grad_norm": 16.75, + "learning_rate": 2.4866759371693127e-06, + "loss": 0.5163, + "step": 478 + }, + { + "epoch": 0.11646000486263068, + "grad_norm": 23.875, + "learning_rate": 2.486605472572737e-06, + "loss": 1.2058, + "step": 479 + }, + { + "epoch": 0.11670313639679067, + "grad_norm": 16.375, + "learning_rate": 2.4865348231441168e-06, + "loss": 0.7734, + "step": 480 + }, + { + "epoch": 0.11694626793095064, + "grad_norm": 20.5, + "learning_rate": 2.486463988894011e-06, + "loss": 0.7897, + "step": 481 + }, + { + "epoch": 0.11718939946511063, + "grad_norm": 29.125, + "learning_rate": 2.4863929698330085e-06, + "loss": 1.7795, + "step": 482 + }, + { + "epoch": 0.1174325309992706, + "grad_norm": 23.5, + "learning_rate": 2.486321765971723e-06, + "loss": 1.096, + "step": 483 + }, + { + "epoch": 0.11767566253343059, + "grad_norm": 24.125, + "learning_rate": 2.4862503773207973e-06, + "loss": 1.3117, + "step": 484 + }, + { + "epoch": 0.11791879406759057, + "grad_norm": 21.25, + "learning_rate": 2.486178803890902e-06, + "loss": 1.0737, + "step": 485 + }, + { + "epoch": 0.11816192560175055, + "grad_norm": 20.625, + "learning_rate": 2.486107045692735e-06, + "loss": 1.2179, + "step": 486 + }, + { + "epoch": 0.11840505713591053, + "grad_norm": 21.5, + "learning_rate": 2.486035102737022e-06, + "loss": 0.756, + "step": 487 + }, + { + "epoch": 0.11864818867007051, + "grad_norm": 17.875, + "learning_rate": 2.4859629750345154e-06, + "loss": 0.7913, + "step": 488 + }, + { + "epoch": 0.11889132020423049, + "grad_norm": 28.0, + "learning_rate": 2.4858906625959965e-06, + "loss": 1.277, + "step": 489 + }, + { + "epoch": 0.11913445173839048, + "grad_norm": 21.25, + "learning_rate": 2.4858181654322737e-06, + "loss": 1.3129, + "step": 490 + }, + { + "epoch": 0.11937758327255045, + "grad_norm": 19.75, + "learning_rate": 2.485745483554183e-06, + "loss": 0.7163, + "step": 491 + }, + { + "epoch": 0.11962071480671042, + "grad_norm": 18.5, + "learning_rate": 2.485672616972587e-06, + "loss": 0.9383, + "step": 492 + }, + { + "epoch": 0.11986384634087041, + "grad_norm": 14.625, + "learning_rate": 2.4855995656983782e-06, + "loss": 0.4747, + "step": 493 + }, + { + "epoch": 0.12010697787503039, + "grad_norm": 19.75, + "learning_rate": 2.4855263297424742e-06, + "loss": 0.8701, + "step": 494 + }, + { + "epoch": 0.12035010940919037, + "grad_norm": 23.125, + "learning_rate": 2.4854529091158224e-06, + "loss": 0.8861, + "step": 495 + }, + { + "epoch": 0.12059324094335035, + "grad_norm": 23.5, + "learning_rate": 2.485379303829396e-06, + "loss": 1.0566, + "step": 496 + }, + { + "epoch": 0.12083637247751033, + "grad_norm": 17.25, + "learning_rate": 2.485305513894197e-06, + "loss": 0.4624, + "step": 497 + }, + { + "epoch": 0.12107950401167031, + "grad_norm": 19.625, + "learning_rate": 2.4852315393212547e-06, + "loss": 0.8714, + "step": 498 + }, + { + "epoch": 0.1213226355458303, + "grad_norm": 20.375, + "learning_rate": 2.4851573801216254e-06, + "loss": 0.9453, + "step": 499 + }, + { + "epoch": 0.12156576707999027, + "grad_norm": 19.375, + "learning_rate": 2.485083036306394e-06, + "loss": 0.953, + "step": 500 + }, + { + "epoch": 0.12180889861415026, + "grad_norm": 17.625, + "learning_rate": 2.485008507886672e-06, + "loss": 0.9461, + "step": 501 + }, + { + "epoch": 0.12205203014831023, + "grad_norm": 25.625, + "learning_rate": 2.484933794873599e-06, + "loss": 0.9967, + "step": 502 + }, + { + "epoch": 0.12229516168247022, + "grad_norm": 27.25, + "learning_rate": 2.4848588972783426e-06, + "loss": 1.2476, + "step": 503 + }, + { + "epoch": 0.12253829321663019, + "grad_norm": 14.25, + "learning_rate": 2.4847838151120974e-06, + "loss": 0.6132, + "step": 504 + }, + { + "epoch": 0.12278142475079018, + "grad_norm": 13.75, + "learning_rate": 2.4847085483860854e-06, + "loss": 0.5193, + "step": 505 + }, + { + "epoch": 0.12302455628495015, + "grad_norm": 21.0, + "learning_rate": 2.4846330971115563e-06, + "loss": 1.1056, + "step": 506 + }, + { + "epoch": 0.12326768781911014, + "grad_norm": 22.0, + "learning_rate": 2.4845574612997887e-06, + "loss": 1.1884, + "step": 507 + }, + { + "epoch": 0.12351081935327012, + "grad_norm": 16.125, + "learning_rate": 2.4844816409620863e-06, + "loss": 0.7524, + "step": 508 + }, + { + "epoch": 0.1237539508874301, + "grad_norm": 19.375, + "learning_rate": 2.484405636109783e-06, + "loss": 0.8291, + "step": 509 + }, + { + "epoch": 0.12399708242159008, + "grad_norm": 19.875, + "learning_rate": 2.484329446754238e-06, + "loss": 0.8617, + "step": 510 + }, + { + "epoch": 0.12424021395575006, + "grad_norm": 21.75, + "learning_rate": 2.48425307290684e-06, + "loss": 1.0044, + "step": 511 + }, + { + "epoch": 0.12448334548991004, + "grad_norm": 22.875, + "learning_rate": 2.4841765145790034e-06, + "loss": 1.0024, + "step": 512 + }, + { + "epoch": 0.12472647702407003, + "grad_norm": 23.5, + "learning_rate": 2.484099771782172e-06, + "loss": 1.1801, + "step": 513 + }, + { + "epoch": 0.12496960855823, + "grad_norm": 15.625, + "learning_rate": 2.484022844527816e-06, + "loss": 0.5301, + "step": 514 + }, + { + "epoch": 0.12521274009238997, + "grad_norm": 28.625, + "learning_rate": 2.483945732827434e-06, + "loss": 1.3416, + "step": 515 + }, + { + "epoch": 0.12545587162654998, + "grad_norm": 23.875, + "learning_rate": 2.483868436692551e-06, + "loss": 1.2757, + "step": 516 + }, + { + "epoch": 0.12569900316070995, + "grad_norm": 21.5, + "learning_rate": 2.4837909561347202e-06, + "loss": 1.1423, + "step": 517 + }, + { + "epoch": 0.12594213469486992, + "grad_norm": 17.0, + "learning_rate": 2.483713291165523e-06, + "loss": 0.8155, + "step": 518 + }, + { + "epoch": 0.1261852662290299, + "grad_norm": 19.0, + "learning_rate": 2.4836354417965675e-06, + "loss": 0.9583, + "step": 519 + }, + { + "epoch": 0.1264283977631899, + "grad_norm": 14.4375, + "learning_rate": 2.483557408039489e-06, + "loss": 0.4277, + "step": 520 + }, + { + "epoch": 0.12667152929734987, + "grad_norm": 23.625, + "learning_rate": 2.4834791899059524e-06, + "loss": 0.829, + "step": 521 + }, + { + "epoch": 0.12691466083150985, + "grad_norm": 17.25, + "learning_rate": 2.4834007874076475e-06, + "loss": 0.8476, + "step": 522 + }, + { + "epoch": 0.12715779236566982, + "grad_norm": 14.1875, + "learning_rate": 2.4833222005562936e-06, + "loss": 0.5097, + "step": 523 + }, + { + "epoch": 0.12740092389982982, + "grad_norm": 17.875, + "learning_rate": 2.4832434293636364e-06, + "loss": 1.2364, + "step": 524 + }, + { + "epoch": 0.1276440554339898, + "grad_norm": 17.875, + "learning_rate": 2.48316447384145e-06, + "loss": 0.8209, + "step": 525 + }, + { + "epoch": 0.12788718696814977, + "grad_norm": 20.25, + "learning_rate": 2.483085334001535e-06, + "loss": 0.8799, + "step": 526 + }, + { + "epoch": 0.12813031850230974, + "grad_norm": 22.625, + "learning_rate": 2.4830060098557217e-06, + "loss": 0.9055, + "step": 527 + }, + { + "epoch": 0.12837345003646974, + "grad_norm": 29.375, + "learning_rate": 2.482926501415865e-06, + "loss": 1.4084, + "step": 528 + }, + { + "epoch": 0.12861658157062972, + "grad_norm": 18.375, + "learning_rate": 2.482846808693849e-06, + "loss": 0.9058, + "step": 529 + }, + { + "epoch": 0.1288597131047897, + "grad_norm": 29.0, + "learning_rate": 2.4827669317015857e-06, + "loss": 1.0377, + "step": 530 + }, + { + "epoch": 0.12910284463894967, + "grad_norm": 20.0, + "learning_rate": 2.4826868704510137e-06, + "loss": 1.447, + "step": 531 + }, + { + "epoch": 0.12934597617310964, + "grad_norm": 38.0, + "learning_rate": 2.4826066249540997e-06, + "loss": 1.3393, + "step": 532 + }, + { + "epoch": 0.12958910770726964, + "grad_norm": 36.5, + "learning_rate": 2.482526195222838e-06, + "loss": 1.5646, + "step": 533 + }, + { + "epoch": 0.12983223924142961, + "grad_norm": 20.5, + "learning_rate": 2.4824455812692495e-06, + "loss": 0.9242, + "step": 534 + }, + { + "epoch": 0.1300753707755896, + "grad_norm": 16.125, + "learning_rate": 2.4823647831053844e-06, + "loss": 0.8967, + "step": 535 + }, + { + "epoch": 0.13031850230974956, + "grad_norm": 18.25, + "learning_rate": 2.482283800743318e-06, + "loss": 1.033, + "step": 536 + }, + { + "epoch": 0.13056163384390956, + "grad_norm": 30.375, + "learning_rate": 2.482202634195156e-06, + "loss": 1.1283, + "step": 537 + }, + { + "epoch": 0.13080476537806954, + "grad_norm": 21.5, + "learning_rate": 2.482121283473029e-06, + "loss": 1.1388, + "step": 538 + }, + { + "epoch": 0.1310478969122295, + "grad_norm": 23.5, + "learning_rate": 2.482039748589097e-06, + "loss": 1.4592, + "step": 539 + }, + { + "epoch": 0.13129102844638948, + "grad_norm": 22.375, + "learning_rate": 2.4819580295555467e-06, + "loss": 1.0908, + "step": 540 + }, + { + "epoch": 0.1315341599805495, + "grad_norm": 20.875, + "learning_rate": 2.481876126384592e-06, + "loss": 1.2061, + "step": 541 + }, + { + "epoch": 0.13177729151470946, + "grad_norm": 19.625, + "learning_rate": 2.481794039088475e-06, + "loss": 1.0404, + "step": 542 + }, + { + "epoch": 0.13202042304886943, + "grad_norm": 22.5, + "learning_rate": 2.4817117676794647e-06, + "loss": 1.2118, + "step": 543 + }, + { + "epoch": 0.1322635545830294, + "grad_norm": 15.5625, + "learning_rate": 2.4816293121698586e-06, + "loss": 0.967, + "step": 544 + }, + { + "epoch": 0.1325066861171894, + "grad_norm": 21.375, + "learning_rate": 2.481546672571981e-06, + "loss": 1.1325, + "step": 545 + }, + { + "epoch": 0.13274981765134938, + "grad_norm": 16.625, + "learning_rate": 2.481463848898183e-06, + "loss": 0.7889, + "step": 546 + }, + { + "epoch": 0.13299294918550936, + "grad_norm": 22.125, + "learning_rate": 2.481380841160845e-06, + "loss": 0.7796, + "step": 547 + }, + { + "epoch": 0.13323608071966933, + "grad_norm": 19.875, + "learning_rate": 2.481297649372374e-06, + "loss": 0.9637, + "step": 548 + }, + { + "epoch": 0.13347921225382933, + "grad_norm": 17.875, + "learning_rate": 2.481214273545204e-06, + "loss": 0.7144, + "step": 549 + }, + { + "epoch": 0.1337223437879893, + "grad_norm": 20.375, + "learning_rate": 2.4811307136917966e-06, + "loss": 1.0322, + "step": 550 + }, + { + "epoch": 0.13396547532214928, + "grad_norm": 29.75, + "learning_rate": 2.481046969824642e-06, + "loss": 1.5122, + "step": 551 + }, + { + "epoch": 0.13420860685630925, + "grad_norm": 18.625, + "learning_rate": 2.4809630419562567e-06, + "loss": 1.0251, + "step": 552 + }, + { + "epoch": 0.13445173839046926, + "grad_norm": 20.75, + "learning_rate": 2.4808789300991853e-06, + "loss": 1.6416, + "step": 553 + }, + { + "epoch": 0.13469486992462923, + "grad_norm": 23.5, + "learning_rate": 2.4807946342659995e-06, + "loss": 1.1206, + "step": 554 + }, + { + "epoch": 0.1349380014587892, + "grad_norm": 18.125, + "learning_rate": 2.4807101544692995e-06, + "loss": 0.9281, + "step": 555 + }, + { + "epoch": 0.13518113299294918, + "grad_norm": 17.75, + "learning_rate": 2.480625490721712e-06, + "loss": 1.0279, + "step": 556 + }, + { + "epoch": 0.13542426452710918, + "grad_norm": 27.375, + "learning_rate": 2.480540643035891e-06, + "loss": 1.4237, + "step": 557 + }, + { + "epoch": 0.13566739606126915, + "grad_norm": 24.5, + "learning_rate": 2.4804556114245183e-06, + "loss": 1.2811, + "step": 558 + }, + { + "epoch": 0.13591052759542913, + "grad_norm": 17.375, + "learning_rate": 2.4803703959003044e-06, + "loss": 0.62, + "step": 559 + }, + { + "epoch": 0.1361536591295891, + "grad_norm": 17.625, + "learning_rate": 2.480284996475985e-06, + "loss": 0.8716, + "step": 560 + }, + { + "epoch": 0.1363967906637491, + "grad_norm": 17.25, + "learning_rate": 2.4801994131643255e-06, + "loss": 0.6587, + "step": 561 + }, + { + "epoch": 0.13663992219790908, + "grad_norm": 31.0, + "learning_rate": 2.4801136459781177e-06, + "loss": 1.0227, + "step": 562 + }, + { + "epoch": 0.13688305373206905, + "grad_norm": 19.5, + "learning_rate": 2.48002769493018e-06, + "loss": 1.1458, + "step": 563 + }, + { + "epoch": 0.13712618526622902, + "grad_norm": 18.5, + "learning_rate": 2.4799415600333606e-06, + "loss": 0.9259, + "step": 564 + }, + { + "epoch": 0.13736931680038902, + "grad_norm": 18.125, + "learning_rate": 2.4798552413005327e-06, + "loss": 0.5576, + "step": 565 + }, + { + "epoch": 0.137612448334549, + "grad_norm": 25.125, + "learning_rate": 2.479768738744599e-06, + "loss": 1.3333, + "step": 566 + }, + { + "epoch": 0.13785557986870897, + "grad_norm": 17.625, + "learning_rate": 2.479682052378489e-06, + "loss": 1.0037, + "step": 567 + }, + { + "epoch": 0.13809871140286895, + "grad_norm": 21.5, + "learning_rate": 2.479595182215158e-06, + "loss": 1.0878, + "step": 568 + }, + { + "epoch": 0.13834184293702892, + "grad_norm": 20.25, + "learning_rate": 2.4795081282675917e-06, + "loss": 0.7082, + "step": 569 + }, + { + "epoch": 0.13858497447118892, + "grad_norm": 20.75, + "learning_rate": 2.479420890548801e-06, + "loss": 0.8939, + "step": 570 + }, + { + "epoch": 0.1388281060053489, + "grad_norm": 22.125, + "learning_rate": 2.4793334690718253e-06, + "loss": 0.9109, + "step": 571 + }, + { + "epoch": 0.13907123753950887, + "grad_norm": 21.0, + "learning_rate": 2.479245863849731e-06, + "loss": 0.8302, + "step": 572 + }, + { + "epoch": 0.13931436907366884, + "grad_norm": 22.0, + "learning_rate": 2.4791580748956133e-06, + "loss": 0.8713, + "step": 573 + }, + { + "epoch": 0.13955750060782884, + "grad_norm": 22.25, + "learning_rate": 2.479070102222593e-06, + "loss": 0.9753, + "step": 574 + }, + { + "epoch": 0.13980063214198882, + "grad_norm": 17.75, + "learning_rate": 2.478981945843819e-06, + "loss": 0.6024, + "step": 575 + }, + { + "epoch": 0.1400437636761488, + "grad_norm": 20.0, + "learning_rate": 2.478893605772468e-06, + "loss": 1.0587, + "step": 576 + }, + { + "epoch": 0.14028689521030877, + "grad_norm": 19.0, + "learning_rate": 2.4788050820217437e-06, + "loss": 0.9547, + "step": 577 + }, + { + "epoch": 0.14053002674446877, + "grad_norm": 27.375, + "learning_rate": 2.4787163746048776e-06, + "loss": 1.2551, + "step": 578 + }, + { + "epoch": 0.14077315827862874, + "grad_norm": 19.125, + "learning_rate": 2.478627483535129e-06, + "loss": 0.9121, + "step": 579 + }, + { + "epoch": 0.14101628981278871, + "grad_norm": 21.75, + "learning_rate": 2.4785384088257835e-06, + "loss": 0.6421, + "step": 580 + }, + { + "epoch": 0.1412594213469487, + "grad_norm": 16.375, + "learning_rate": 2.478449150490155e-06, + "loss": 0.7209, + "step": 581 + }, + { + "epoch": 0.1415025528811087, + "grad_norm": 23.375, + "learning_rate": 2.4783597085415855e-06, + "loss": 1.075, + "step": 582 + }, + { + "epoch": 0.14174568441526866, + "grad_norm": 19.625, + "learning_rate": 2.4782700829934423e-06, + "loss": 1.0413, + "step": 583 + }, + { + "epoch": 0.14198881594942864, + "grad_norm": 17.75, + "learning_rate": 2.4781802738591232e-06, + "loss": 0.7188, + "step": 584 + }, + { + "epoch": 0.1422319474835886, + "grad_norm": 18.25, + "learning_rate": 2.4780902811520503e-06, + "loss": 0.8088, + "step": 585 + }, + { + "epoch": 0.1424750790177486, + "grad_norm": 20.625, + "learning_rate": 2.478000104885675e-06, + "loss": 1.2832, + "step": 586 + }, + { + "epoch": 0.1427182105519086, + "grad_norm": 15.0625, + "learning_rate": 2.4779097450734756e-06, + "loss": 0.6821, + "step": 587 + }, + { + "epoch": 0.14296134208606856, + "grad_norm": 21.875, + "learning_rate": 2.477819201728958e-06, + "loss": 0.8267, + "step": 588 + }, + { + "epoch": 0.14320447362022853, + "grad_norm": 25.125, + "learning_rate": 2.477728474865656e-06, + "loss": 1.3358, + "step": 589 + }, + { + "epoch": 0.14344760515438854, + "grad_norm": 15.6875, + "learning_rate": 2.4776375644971297e-06, + "loss": 0.6596, + "step": 590 + }, + { + "epoch": 0.1436907366885485, + "grad_norm": 19.25, + "learning_rate": 2.477546470636967e-06, + "loss": 0.9614, + "step": 591 + }, + { + "epoch": 0.14393386822270848, + "grad_norm": 16.75, + "learning_rate": 2.477455193298784e-06, + "loss": 0.8964, + "step": 592 + }, + { + "epoch": 0.14417699975686846, + "grad_norm": 15.375, + "learning_rate": 2.4773637324962236e-06, + "loss": 0.8571, + "step": 593 + }, + { + "epoch": 0.14442013129102846, + "grad_norm": 17.0, + "learning_rate": 2.4772720882429557e-06, + "loss": 0.7895, + "step": 594 + }, + { + "epoch": 0.14466326282518843, + "grad_norm": 18.375, + "learning_rate": 2.477180260552679e-06, + "loss": 0.8672, + "step": 595 + }, + { + "epoch": 0.1449063943593484, + "grad_norm": 21.5, + "learning_rate": 2.477088249439118e-06, + "loss": 1.0772, + "step": 596 + }, + { + "epoch": 0.14514952589350838, + "grad_norm": 18.625, + "learning_rate": 2.4769960549160255e-06, + "loss": 0.9394, + "step": 597 + }, + { + "epoch": 0.14539265742766838, + "grad_norm": 32.0, + "learning_rate": 2.4769036769971816e-06, + "loss": 1.3212, + "step": 598 + }, + { + "epoch": 0.14563578896182836, + "grad_norm": 18.125, + "learning_rate": 2.4768111156963944e-06, + "loss": 1.0702, + "step": 599 + }, + { + "epoch": 0.14587892049598833, + "grad_norm": 21.5, + "learning_rate": 2.4767183710274974e-06, + "loss": 1.245, + "step": 600 + }, + { + "epoch": 0.1461220520301483, + "grad_norm": 27.0, + "learning_rate": 2.476625443004354e-06, + "loss": 1.0605, + "step": 601 + }, + { + "epoch": 0.1463651835643083, + "grad_norm": 21.5, + "learning_rate": 2.4765323316408537e-06, + "loss": 0.4655, + "step": 602 + }, + { + "epoch": 0.14660831509846828, + "grad_norm": 21.625, + "learning_rate": 2.4764390369509133e-06, + "loss": 1.1133, + "step": 603 + }, + { + "epoch": 0.14685144663262825, + "grad_norm": 24.0, + "learning_rate": 2.4763455589484776e-06, + "loss": 1.1572, + "step": 604 + }, + { + "epoch": 0.14709457816678823, + "grad_norm": 18.875, + "learning_rate": 2.4762518976475184e-06, + "loss": 0.7207, + "step": 605 + }, + { + "epoch": 0.1473377097009482, + "grad_norm": 22.75, + "learning_rate": 2.476158053062035e-06, + "loss": 0.9064, + "step": 606 + }, + { + "epoch": 0.1475808412351082, + "grad_norm": 26.875, + "learning_rate": 2.476064025206054e-06, + "loss": 1.184, + "step": 607 + }, + { + "epoch": 0.14782397276926817, + "grad_norm": 17.875, + "learning_rate": 2.4759698140936294e-06, + "loss": 0.8613, + "step": 608 + }, + { + "epoch": 0.14806710430342815, + "grad_norm": 18.25, + "learning_rate": 2.4758754197388433e-06, + "loss": 0.6474, + "step": 609 + }, + { + "epoch": 0.14831023583758812, + "grad_norm": 14.75, + "learning_rate": 2.475780842155804e-06, + "loss": 0.6431, + "step": 610 + }, + { + "epoch": 0.14855336737174812, + "grad_norm": 17.5, + "learning_rate": 2.4756860813586474e-06, + "loss": 0.8249, + "step": 611 + }, + { + "epoch": 0.1487964989059081, + "grad_norm": 22.5, + "learning_rate": 2.4755911373615382e-06, + "loss": 1.7489, + "step": 612 + }, + { + "epoch": 0.14903963044006807, + "grad_norm": 19.75, + "learning_rate": 2.4754960101786663e-06, + "loss": 0.9744, + "step": 613 + }, + { + "epoch": 0.14928276197422805, + "grad_norm": 16.875, + "learning_rate": 2.4754006998242513e-06, + "loss": 0.7147, + "step": 614 + }, + { + "epoch": 0.14952589350838805, + "grad_norm": 18.75, + "learning_rate": 2.4753052063125377e-06, + "loss": 0.8841, + "step": 615 + }, + { + "epoch": 0.14976902504254802, + "grad_norm": 17.0, + "learning_rate": 2.4752095296577996e-06, + "loss": 0.8324, + "step": 616 + }, + { + "epoch": 0.150012156576708, + "grad_norm": 17.25, + "learning_rate": 2.4751136698743372e-06, + "loss": 0.9666, + "step": 617 + }, + { + "epoch": 0.15025528811086797, + "grad_norm": 19.875, + "learning_rate": 2.475017626976478e-06, + "loss": 0.9407, + "step": 618 + }, + { + "epoch": 0.15049841964502797, + "grad_norm": 21.875, + "learning_rate": 2.4749214009785784e-06, + "loss": 0.9423, + "step": 619 + }, + { + "epoch": 0.15074155117918794, + "grad_norm": 22.625, + "learning_rate": 2.4748249918950196e-06, + "loss": 1.0306, + "step": 620 + }, + { + "epoch": 0.15098468271334792, + "grad_norm": 18.875, + "learning_rate": 2.4747283997402128e-06, + "loss": 1.0252, + "step": 621 + }, + { + "epoch": 0.1512278142475079, + "grad_norm": 17.625, + "learning_rate": 2.4746316245285947e-06, + "loss": 0.955, + "step": 622 + }, + { + "epoch": 0.1514709457816679, + "grad_norm": 22.625, + "learning_rate": 2.47453466627463e-06, + "loss": 0.7389, + "step": 623 + }, + { + "epoch": 0.15171407731582787, + "grad_norm": 21.125, + "learning_rate": 2.474437524992811e-06, + "loss": 1.063, + "step": 624 + }, + { + "epoch": 0.15195720884998784, + "grad_norm": 18.125, + "learning_rate": 2.4743402006976573e-06, + "loss": 0.7157, + "step": 625 + }, + { + "epoch": 0.15220034038414781, + "grad_norm": 16.75, + "learning_rate": 2.4742426934037155e-06, + "loss": 0.8126, + "step": 626 + }, + { + "epoch": 0.15244347191830782, + "grad_norm": 22.5, + "learning_rate": 2.4741450031255595e-06, + "loss": 0.977, + "step": 627 + }, + { + "epoch": 0.1526866034524678, + "grad_norm": 24.5, + "learning_rate": 2.4740471298777914e-06, + "loss": 1.0679, + "step": 628 + }, + { + "epoch": 0.15292973498662776, + "grad_norm": 20.375, + "learning_rate": 2.4739490736750393e-06, + "loss": 0.6969, + "step": 629 + }, + { + "epoch": 0.15317286652078774, + "grad_norm": 23.625, + "learning_rate": 2.47385083453196e-06, + "loss": 0.8285, + "step": 630 + }, + { + "epoch": 0.15341599805494774, + "grad_norm": 21.0, + "learning_rate": 2.4737524124632373e-06, + "loss": 0.9942, + "step": 631 + }, + { + "epoch": 0.1536591295891077, + "grad_norm": 20.875, + "learning_rate": 2.4736538074835812e-06, + "loss": 0.9077, + "step": 632 + }, + { + "epoch": 0.1539022611232677, + "grad_norm": 15.125, + "learning_rate": 2.4735550196077304e-06, + "loss": 0.6105, + "step": 633 + }, + { + "epoch": 0.15414539265742766, + "grad_norm": 20.625, + "learning_rate": 2.4734560488504507e-06, + "loss": 1.1664, + "step": 634 + }, + { + "epoch": 0.15438852419158766, + "grad_norm": 18.75, + "learning_rate": 2.4733568952265342e-06, + "loss": 1.0338, + "step": 635 + }, + { + "epoch": 0.15463165572574764, + "grad_norm": 27.375, + "learning_rate": 2.4732575587508016e-06, + "loss": 0.6824, + "step": 636 + }, + { + "epoch": 0.1548747872599076, + "grad_norm": 28.5, + "learning_rate": 2.4731580394381005e-06, + "loss": 1.1426, + "step": 637 + }, + { + "epoch": 0.15511791879406758, + "grad_norm": 19.125, + "learning_rate": 2.473058337303306e-06, + "loss": 0.7891, + "step": 638 + }, + { + "epoch": 0.15536105032822758, + "grad_norm": 23.0, + "learning_rate": 2.4729584523613196e-06, + "loss": 0.9048, + "step": 639 + }, + { + "epoch": 0.15560418186238756, + "grad_norm": 17.125, + "learning_rate": 2.472858384627072e-06, + "loss": 1.0675, + "step": 640 + }, + { + "epoch": 0.15584731339654753, + "grad_norm": 18.75, + "learning_rate": 2.4727581341155186e-06, + "loss": 0.7432, + "step": 641 + }, + { + "epoch": 0.1560904449307075, + "grad_norm": 16.125, + "learning_rate": 2.472657700841645e-06, + "loss": 0.8413, + "step": 642 + }, + { + "epoch": 0.15633357646486748, + "grad_norm": 17.375, + "learning_rate": 2.4725570848204615e-06, + "loss": 0.5183, + "step": 643 + }, + { + "epoch": 0.15657670799902748, + "grad_norm": 20.625, + "learning_rate": 2.472456286067007e-06, + "loss": 0.778, + "step": 644 + }, + { + "epoch": 0.15681983953318746, + "grad_norm": 14.375, + "learning_rate": 2.4723553045963488e-06, + "loss": 0.6765, + "step": 645 + }, + { + "epoch": 0.15706297106734743, + "grad_norm": 18.875, + "learning_rate": 2.4722541404235793e-06, + "loss": 0.9651, + "step": 646 + }, + { + "epoch": 0.1573061026015074, + "grad_norm": 20.5, + "learning_rate": 2.4721527935638194e-06, + "loss": 1.0763, + "step": 647 + }, + { + "epoch": 0.1575492341356674, + "grad_norm": 20.0, + "learning_rate": 2.472051264032217e-06, + "loss": 1.0098, + "step": 648 + }, + { + "epoch": 0.15779236566982738, + "grad_norm": 21.625, + "learning_rate": 2.471949551843948e-06, + "loss": 1.3047, + "step": 649 + }, + { + "epoch": 0.15803549720398735, + "grad_norm": 20.25, + "learning_rate": 2.4718476570142142e-06, + "loss": 1.3861, + "step": 650 + }, + { + "epoch": 0.15827862873814733, + "grad_norm": 19.25, + "learning_rate": 2.4717455795582462e-06, + "loss": 0.919, + "step": 651 + }, + { + "epoch": 0.15852176027230733, + "grad_norm": 16.125, + "learning_rate": 2.471643319491301e-06, + "loss": 0.5179, + "step": 652 + }, + { + "epoch": 0.1587648918064673, + "grad_norm": 22.375, + "learning_rate": 2.4715408768286638e-06, + "loss": 1.1422, + "step": 653 + }, + { + "epoch": 0.15900802334062727, + "grad_norm": 15.875, + "learning_rate": 2.471438251585645e-06, + "loss": 0.6059, + "step": 654 + }, + { + "epoch": 0.15925115487478725, + "grad_norm": 24.0, + "learning_rate": 2.471335443777585e-06, + "loss": 0.9181, + "step": 655 + }, + { + "epoch": 0.15949428640894725, + "grad_norm": 17.625, + "learning_rate": 2.4712324534198497e-06, + "loss": 0.952, + "step": 656 + }, + { + "epoch": 0.15973741794310722, + "grad_norm": 15.625, + "learning_rate": 2.4711292805278327e-06, + "loss": 0.9161, + "step": 657 + }, + { + "epoch": 0.1599805494772672, + "grad_norm": 25.375, + "learning_rate": 2.471025925116955e-06, + "loss": 0.9753, + "step": 658 + }, + { + "epoch": 0.16022368101142717, + "grad_norm": 20.125, + "learning_rate": 2.470922387202665e-06, + "loss": 0.4339, + "step": 659 + }, + { + "epoch": 0.16046681254558717, + "grad_norm": 20.625, + "learning_rate": 2.470818666800438e-06, + "loss": 1.052, + "step": 660 + }, + { + "epoch": 0.16070994407974715, + "grad_norm": 23.375, + "learning_rate": 2.470714763925777e-06, + "loss": 0.8525, + "step": 661 + }, + { + "epoch": 0.16095307561390712, + "grad_norm": 15.625, + "learning_rate": 2.4706106785942123e-06, + "loss": 0.7993, + "step": 662 + }, + { + "epoch": 0.1611962071480671, + "grad_norm": 22.125, + "learning_rate": 2.470506410821301e-06, + "loss": 0.9482, + "step": 663 + }, + { + "epoch": 0.1614393386822271, + "grad_norm": 21.5, + "learning_rate": 2.470401960622628e-06, + "loss": 1.0482, + "step": 664 + }, + { + "epoch": 0.16168247021638707, + "grad_norm": 29.0, + "learning_rate": 2.4702973280138044e-06, + "loss": 1.6028, + "step": 665 + }, + { + "epoch": 0.16192560175054704, + "grad_norm": 17.375, + "learning_rate": 2.4701925130104705e-06, + "loss": 1.0979, + "step": 666 + }, + { + "epoch": 0.16216873328470702, + "grad_norm": 19.875, + "learning_rate": 2.4700875156282918e-06, + "loss": 0.8368, + "step": 667 + }, + { + "epoch": 0.16241186481886702, + "grad_norm": 18.125, + "learning_rate": 2.4699823358829616e-06, + "loss": 1.1269, + "step": 668 + }, + { + "epoch": 0.162654996353027, + "grad_norm": 21.5, + "learning_rate": 2.469876973790202e-06, + "loss": 1.0932, + "step": 669 + }, + { + "epoch": 0.16289812788718697, + "grad_norm": 15.25, + "learning_rate": 2.4697714293657608e-06, + "loss": 0.5482, + "step": 670 + }, + { + "epoch": 0.16314125942134694, + "grad_norm": 14.875, + "learning_rate": 2.4696657026254133e-06, + "loss": 0.4568, + "step": 671 + }, + { + "epoch": 0.16338439095550694, + "grad_norm": 21.0, + "learning_rate": 2.469559793584962e-06, + "loss": 1.2617, + "step": 672 + }, + { + "epoch": 0.16362752248966692, + "grad_norm": 16.375, + "learning_rate": 2.4694537022602367e-06, + "loss": 0.6635, + "step": 673 + }, + { + "epoch": 0.1638706540238269, + "grad_norm": 20.125, + "learning_rate": 2.4693474286670955e-06, + "loss": 0.8394, + "step": 674 + }, + { + "epoch": 0.16411378555798686, + "grad_norm": 17.25, + "learning_rate": 2.4692409728214216e-06, + "loss": 1.0343, + "step": 675 + }, + { + "epoch": 0.16435691709214686, + "grad_norm": 22.375, + "learning_rate": 2.469134334739128e-06, + "loss": 0.9508, + "step": 676 + }, + { + "epoch": 0.16460004862630684, + "grad_norm": 14.8125, + "learning_rate": 2.4690275144361526e-06, + "loss": 0.6007, + "step": 677 + }, + { + "epoch": 0.1648431801604668, + "grad_norm": 19.125, + "learning_rate": 2.4689205119284618e-06, + "loss": 0.629, + "step": 678 + }, + { + "epoch": 0.16508631169462679, + "grad_norm": 34.75, + "learning_rate": 2.468813327232049e-06, + "loss": 1.4319, + "step": 679 + }, + { + "epoch": 0.16532944322878676, + "grad_norm": 17.875, + "learning_rate": 2.4687059603629348e-06, + "loss": 0.8659, + "step": 680 + }, + { + "epoch": 0.16557257476294676, + "grad_norm": 16.5, + "learning_rate": 2.4685984113371668e-06, + "loss": 0.6219, + "step": 681 + }, + { + "epoch": 0.16581570629710674, + "grad_norm": 15.375, + "learning_rate": 2.468490680170821e-06, + "loss": 0.7307, + "step": 682 + }, + { + "epoch": 0.1660588378312667, + "grad_norm": 17.625, + "learning_rate": 2.4683827668799985e-06, + "loss": 0.7113, + "step": 683 + }, + { + "epoch": 0.16630196936542668, + "grad_norm": 23.25, + "learning_rate": 2.468274671480829e-06, + "loss": 1.0579, + "step": 684 + }, + { + "epoch": 0.16654510089958668, + "grad_norm": 18.25, + "learning_rate": 2.4681663939894703e-06, + "loss": 1.0621, + "step": 685 + }, + { + "epoch": 0.16678823243374666, + "grad_norm": 14.25, + "learning_rate": 2.468057934422105e-06, + "loss": 0.5793, + "step": 686 + }, + { + "epoch": 0.16703136396790663, + "grad_norm": 27.625, + "learning_rate": 2.467949292794945e-06, + "loss": 1.1904, + "step": 687 + }, + { + "epoch": 0.1672744955020666, + "grad_norm": 15.125, + "learning_rate": 2.4678404691242285e-06, + "loss": 0.5465, + "step": 688 + }, + { + "epoch": 0.1675176270362266, + "grad_norm": 27.5, + "learning_rate": 2.4677314634262206e-06, + "loss": 1.4834, + "step": 689 + }, + { + "epoch": 0.16776075857038658, + "grad_norm": 17.875, + "learning_rate": 2.467622275717215e-06, + "loss": 0.9697, + "step": 690 + }, + { + "epoch": 0.16800389010454655, + "grad_norm": 20.125, + "learning_rate": 2.467512906013531e-06, + "loss": 1.0066, + "step": 691 + }, + { + "epoch": 0.16824702163870653, + "grad_norm": 22.375, + "learning_rate": 2.4674033543315164e-06, + "loss": 1.0141, + "step": 692 + }, + { + "epoch": 0.16849015317286653, + "grad_norm": 23.125, + "learning_rate": 2.467293620687545e-06, + "loss": 1.1554, + "step": 693 + }, + { + "epoch": 0.1687332847070265, + "grad_norm": 18.5, + "learning_rate": 2.4671837050980186e-06, + "loss": 0.8953, + "step": 694 + }, + { + "epoch": 0.16897641624118648, + "grad_norm": 14.875, + "learning_rate": 2.467073607579366e-06, + "loss": 0.7851, + "step": 695 + }, + { + "epoch": 0.16921954777534645, + "grad_norm": 12.5, + "learning_rate": 2.466963328148043e-06, + "loss": 0.3995, + "step": 696 + }, + { + "epoch": 0.16946267930950645, + "grad_norm": 18.5, + "learning_rate": 2.466852866820533e-06, + "loss": 1.0252, + "step": 697 + }, + { + "epoch": 0.16970581084366643, + "grad_norm": 22.25, + "learning_rate": 2.4667422236133463e-06, + "loss": 0.9539, + "step": 698 + }, + { + "epoch": 0.1699489423778264, + "grad_norm": 23.75, + "learning_rate": 2.4666313985430205e-06, + "loss": 1.281, + "step": 699 + }, + { + "epoch": 0.17019207391198637, + "grad_norm": 17.875, + "learning_rate": 2.46652039162612e-06, + "loss": 0.8046, + "step": 700 + }, + { + "epoch": 0.17043520544614638, + "grad_norm": 14.0, + "learning_rate": 2.466409202879237e-06, + "loss": 0.4778, + "step": 701 + }, + { + "epoch": 0.17067833698030635, + "grad_norm": 19.875, + "learning_rate": 2.4662978323189907e-06, + "loss": 1.0883, + "step": 702 + }, + { + "epoch": 0.17092146851446632, + "grad_norm": 21.625, + "learning_rate": 2.4661862799620275e-06, + "loss": 0.9669, + "step": 703 + }, + { + "epoch": 0.1711646000486263, + "grad_norm": 19.375, + "learning_rate": 2.4660745458250197e-06, + "loss": 0.9314, + "step": 704 + }, + { + "epoch": 0.1714077315827863, + "grad_norm": 23.75, + "learning_rate": 2.465962629924669e-06, + "loss": 1.1983, + "step": 705 + }, + { + "epoch": 0.17165086311694627, + "grad_norm": 21.875, + "learning_rate": 2.4658505322777032e-06, + "loss": 0.9296, + "step": 706 + }, + { + "epoch": 0.17189399465110625, + "grad_norm": 20.25, + "learning_rate": 2.4657382529008765e-06, + "loss": 1.1447, + "step": 707 + }, + { + "epoch": 0.17213712618526622, + "grad_norm": 19.375, + "learning_rate": 2.4656257918109716e-06, + "loss": 1.5269, + "step": 708 + }, + { + "epoch": 0.17238025771942622, + "grad_norm": 19.375, + "learning_rate": 2.4655131490247974e-06, + "loss": 0.5523, + "step": 709 + }, + { + "epoch": 0.1726233892535862, + "grad_norm": 38.5, + "learning_rate": 2.4654003245591905e-06, + "loss": 1.1902, + "step": 710 + }, + { + "epoch": 0.17286652078774617, + "grad_norm": 18.5, + "learning_rate": 2.4652873184310143e-06, + "loss": 0.7753, + "step": 711 + }, + { + "epoch": 0.17310965232190614, + "grad_norm": 16.25, + "learning_rate": 2.4651741306571596e-06, + "loss": 0.6987, + "step": 712 + }, + { + "epoch": 0.17335278385606614, + "grad_norm": 20.875, + "learning_rate": 2.465060761254544e-06, + "loss": 0.8424, + "step": 713 + }, + { + "epoch": 0.17359591539022612, + "grad_norm": 23.0, + "learning_rate": 2.4649472102401134e-06, + "loss": 0.976, + "step": 714 + }, + { + "epoch": 0.1738390469243861, + "grad_norm": 19.625, + "learning_rate": 2.4648334776308395e-06, + "loss": 1.0214, + "step": 715 + }, + { + "epoch": 0.17408217845854607, + "grad_norm": 18.0, + "learning_rate": 2.464719563443721e-06, + "loss": 0.5665, + "step": 716 + }, + { + "epoch": 0.17432530999270604, + "grad_norm": 17.625, + "learning_rate": 2.4646054676957847e-06, + "loss": 0.7222, + "step": 717 + }, + { + "epoch": 0.17456844152686604, + "grad_norm": 23.5, + "learning_rate": 2.4644911904040846e-06, + "loss": 1.3754, + "step": 718 + }, + { + "epoch": 0.17481157306102602, + "grad_norm": 17.0, + "learning_rate": 2.4643767315857013e-06, + "loss": 0.8122, + "step": 719 + }, + { + "epoch": 0.175054704595186, + "grad_norm": 17.625, + "learning_rate": 2.464262091257742e-06, + "loss": 0.5982, + "step": 720 + }, + { + "epoch": 0.17529783612934596, + "grad_norm": 25.125, + "learning_rate": 2.4641472694373427e-06, + "loss": 1.4758, + "step": 721 + }, + { + "epoch": 0.17554096766350596, + "grad_norm": 23.125, + "learning_rate": 2.4640322661416645e-06, + "loss": 1.1522, + "step": 722 + }, + { + "epoch": 0.17578409919766594, + "grad_norm": 16.875, + "learning_rate": 2.463917081387897e-06, + "loss": 0.8721, + "step": 723 + }, + { + "epoch": 0.1760272307318259, + "grad_norm": 16.25, + "learning_rate": 2.4638017151932565e-06, + "loss": 0.7402, + "step": 724 + }, + { + "epoch": 0.17627036226598589, + "grad_norm": 17.375, + "learning_rate": 2.463686167574987e-06, + "loss": 0.9078, + "step": 725 + }, + { + "epoch": 0.1765134938001459, + "grad_norm": 23.875, + "learning_rate": 2.4635704385503585e-06, + "loss": 1.1702, + "step": 726 + }, + { + "epoch": 0.17675662533430586, + "grad_norm": 16.125, + "learning_rate": 2.4634545281366688e-06, + "loss": 0.4668, + "step": 727 + }, + { + "epoch": 0.17699975686846584, + "grad_norm": 44.25, + "learning_rate": 2.4633384363512424e-06, + "loss": 0.9433, + "step": 728 + }, + { + "epoch": 0.1772428884026258, + "grad_norm": 15.5625, + "learning_rate": 2.463222163211432e-06, + "loss": 0.6542, + "step": 729 + }, + { + "epoch": 0.1774860199367858, + "grad_norm": 23.875, + "learning_rate": 2.4631057087346166e-06, + "loss": 1.027, + "step": 730 + }, + { + "epoch": 0.17772915147094578, + "grad_norm": 14.4375, + "learning_rate": 2.4629890729382018e-06, + "loss": 0.5723, + "step": 731 + }, + { + "epoch": 0.17797228300510576, + "grad_norm": 16.5, + "learning_rate": 2.4628722558396206e-06, + "loss": 0.8273, + "step": 732 + }, + { + "epoch": 0.17821541453926573, + "grad_norm": 21.0, + "learning_rate": 2.462755257456334e-06, + "loss": 1.07, + "step": 733 + }, + { + "epoch": 0.17845854607342573, + "grad_norm": 28.875, + "learning_rate": 2.4626380778058293e-06, + "loss": 0.8394, + "step": 734 + }, + { + "epoch": 0.1787016776075857, + "grad_norm": 17.125, + "learning_rate": 2.4625207169056204e-06, + "loss": 1.3333, + "step": 735 + }, + { + "epoch": 0.17894480914174568, + "grad_norm": 25.875, + "learning_rate": 2.46240317477325e-06, + "loss": 1.1257, + "step": 736 + }, + { + "epoch": 0.17918794067590565, + "grad_norm": 19.5, + "learning_rate": 2.462285451426286e-06, + "loss": 0.9058, + "step": 737 + }, + { + "epoch": 0.17943107221006566, + "grad_norm": 22.5, + "learning_rate": 2.4621675468823243e-06, + "loss": 1.0879, + "step": 738 + }, + { + "epoch": 0.17967420374422563, + "grad_norm": 36.5, + "learning_rate": 2.4620494611589877e-06, + "loss": 0.7984, + "step": 739 + }, + { + "epoch": 0.1799173352783856, + "grad_norm": 21.0, + "learning_rate": 2.4619311942739266e-06, + "loss": 1.0539, + "step": 740 + }, + { + "epoch": 0.18016046681254558, + "grad_norm": 14.75, + "learning_rate": 2.4618127462448177e-06, + "loss": 0.6653, + "step": 741 + }, + { + "epoch": 0.18040359834670558, + "grad_norm": 23.75, + "learning_rate": 2.4616941170893647e-06, + "loss": 1.0389, + "step": 742 + }, + { + "epoch": 0.18064672988086555, + "grad_norm": 34.25, + "learning_rate": 2.4615753068253e-06, + "loss": 0.9567, + "step": 743 + }, + { + "epoch": 0.18088986141502553, + "grad_norm": 21.5, + "learning_rate": 2.4614563154703808e-06, + "loss": 0.8491, + "step": 744 + }, + { + "epoch": 0.1811329929491855, + "grad_norm": 18.0, + "learning_rate": 2.4613371430423925e-06, + "loss": 0.7903, + "step": 745 + }, + { + "epoch": 0.1813761244833455, + "grad_norm": 15.8125, + "learning_rate": 2.4612177895591475e-06, + "loss": 0.7727, + "step": 746 + }, + { + "epoch": 0.18161925601750548, + "grad_norm": 19.5, + "learning_rate": 2.4610982550384855e-06, + "loss": 0.908, + "step": 747 + }, + { + "epoch": 0.18186238755166545, + "grad_norm": 24.0, + "learning_rate": 2.460978539498273e-06, + "loss": 1.0352, + "step": 748 + }, + { + "epoch": 0.18210551908582542, + "grad_norm": 21.375, + "learning_rate": 2.4608586429564037e-06, + "loss": 0.5442, + "step": 749 + }, + { + "epoch": 0.18234865061998543, + "grad_norm": 25.125, + "learning_rate": 2.4607385654307976e-06, + "loss": 1.1106, + "step": 750 + }, + { + "epoch": 0.1825917821541454, + "grad_norm": 50.5, + "learning_rate": 2.460618306939403e-06, + "loss": 1.0396, + "step": 751 + }, + { + "epoch": 0.18283491368830537, + "grad_norm": 17.875, + "learning_rate": 2.460497867500194e-06, + "loss": 0.9884, + "step": 752 + }, + { + "epoch": 0.18307804522246535, + "grad_norm": 24.625, + "learning_rate": 2.4603772471311727e-06, + "loss": 1.0661, + "step": 753 + }, + { + "epoch": 0.18332117675662535, + "grad_norm": 18.375, + "learning_rate": 2.460256445850368e-06, + "loss": 0.7287, + "step": 754 + }, + { + "epoch": 0.18356430829078532, + "grad_norm": 19.75, + "learning_rate": 2.4601354636758357e-06, + "loss": 1.3232, + "step": 755 + }, + { + "epoch": 0.1838074398249453, + "grad_norm": 17.25, + "learning_rate": 2.4600143006256587e-06, + "loss": 0.7209, + "step": 756 + }, + { + "epoch": 0.18405057135910527, + "grad_norm": 18.125, + "learning_rate": 2.459892956717946e-06, + "loss": 0.7975, + "step": 757 + }, + { + "epoch": 0.18429370289326524, + "grad_norm": 36.5, + "learning_rate": 2.4597714319708365e-06, + "loss": 0.6597, + "step": 758 + }, + { + "epoch": 0.18453683442742524, + "grad_norm": 18.125, + "learning_rate": 2.4596497264024926e-06, + "loss": 0.9782, + "step": 759 + }, + { + "epoch": 0.18477996596158522, + "grad_norm": 21.375, + "learning_rate": 2.4595278400311053e-06, + "loss": 1.129, + "step": 760 + }, + { + "epoch": 0.1850230974957452, + "grad_norm": 15.25, + "learning_rate": 2.4594057728748934e-06, + "loss": 0.5534, + "step": 761 + }, + { + "epoch": 0.18526622902990517, + "grad_norm": 17.0, + "learning_rate": 2.4592835249521013e-06, + "loss": 0.6545, + "step": 762 + }, + { + "epoch": 0.18550936056406517, + "grad_norm": 17.125, + "learning_rate": 2.4591610962810015e-06, + "loss": 0.7302, + "step": 763 + }, + { + "epoch": 0.18575249209822514, + "grad_norm": 19.875, + "learning_rate": 2.4590384868798933e-06, + "loss": 0.8268, + "step": 764 + }, + { + "epoch": 0.18599562363238512, + "grad_norm": 15.125, + "learning_rate": 2.458915696767102e-06, + "loss": 0.4773, + "step": 765 + }, + { + "epoch": 0.1862387551665451, + "grad_norm": 21.75, + "learning_rate": 2.458792725960981e-06, + "loss": 0.7745, + "step": 766 + }, + { + "epoch": 0.1864818867007051, + "grad_norm": 19.875, + "learning_rate": 2.458669574479911e-06, + "loss": 1.0215, + "step": 767 + }, + { + "epoch": 0.18672501823486506, + "grad_norm": 17.375, + "learning_rate": 2.4585462423422984e-06, + "loss": 0.7035, + "step": 768 + }, + { + "epoch": 0.18696814976902504, + "grad_norm": 19.75, + "learning_rate": 2.4584227295665776e-06, + "loss": 0.9188, + "step": 769 + }, + { + "epoch": 0.187211281303185, + "grad_norm": 22.25, + "learning_rate": 2.4582990361712096e-06, + "loss": 0.9235, + "step": 770 + }, + { + "epoch": 0.187454412837345, + "grad_norm": 33.75, + "learning_rate": 2.4581751621746827e-06, + "loss": 1.1085, + "step": 771 + }, + { + "epoch": 0.187697544371505, + "grad_norm": 31.125, + "learning_rate": 2.458051107595512e-06, + "loss": 1.2226, + "step": 772 + }, + { + "epoch": 0.18794067590566496, + "grad_norm": 18.875, + "learning_rate": 2.4579268724522392e-06, + "loss": 0.9582, + "step": 773 + }, + { + "epoch": 0.18818380743982493, + "grad_norm": 21.125, + "learning_rate": 2.457802456763434e-06, + "loss": 1.123, + "step": 774 + }, + { + "epoch": 0.18842693897398494, + "grad_norm": 17.0, + "learning_rate": 2.457677860547692e-06, + "loss": 0.7536, + "step": 775 + }, + { + "epoch": 0.1886700705081449, + "grad_norm": 20.25, + "learning_rate": 2.4575530838236364e-06, + "loss": 1.163, + "step": 776 + }, + { + "epoch": 0.18891320204230488, + "grad_norm": 23.875, + "learning_rate": 2.4574281266099172e-06, + "loss": 0.929, + "step": 777 + }, + { + "epoch": 0.18915633357646486, + "grad_norm": 22.875, + "learning_rate": 2.4573029889252115e-06, + "loss": 1.1094, + "step": 778 + }, + { + "epoch": 0.18939946511062486, + "grad_norm": 18.75, + "learning_rate": 2.4571776707882235e-06, + "loss": 1.2334, + "step": 779 + }, + { + "epoch": 0.18964259664478483, + "grad_norm": 23.75, + "learning_rate": 2.457052172217684e-06, + "loss": 0.958, + "step": 780 + }, + { + "epoch": 0.1898857281789448, + "grad_norm": 18.5, + "learning_rate": 2.4569264932323505e-06, + "loss": 1.0465, + "step": 781 + }, + { + "epoch": 0.19012885971310478, + "grad_norm": 20.875, + "learning_rate": 2.456800633851008e-06, + "loss": 1.2705, + "step": 782 + }, + { + "epoch": 0.19037199124726478, + "grad_norm": 21.75, + "learning_rate": 2.456674594092469e-06, + "loss": 0.8074, + "step": 783 + }, + { + "epoch": 0.19061512278142476, + "grad_norm": 18.75, + "learning_rate": 2.456548373975572e-06, + "loss": 1.0174, + "step": 784 + }, + { + "epoch": 0.19085825431558473, + "grad_norm": 23.375, + "learning_rate": 2.4564219735191824e-06, + "loss": 1.3718, + "step": 785 + }, + { + "epoch": 0.1911013858497447, + "grad_norm": 20.375, + "learning_rate": 2.4562953927421935e-06, + "loss": 1.0486, + "step": 786 + }, + { + "epoch": 0.1913445173839047, + "grad_norm": 19.625, + "learning_rate": 2.4561686316635246e-06, + "loss": 0.9703, + "step": 787 + }, + { + "epoch": 0.19158764891806468, + "grad_norm": 20.25, + "learning_rate": 2.4560416903021224e-06, + "loss": 0.9202, + "step": 788 + }, + { + "epoch": 0.19183078045222465, + "grad_norm": 17.25, + "learning_rate": 2.455914568676961e-06, + "loss": 0.6329, + "step": 789 + }, + { + "epoch": 0.19207391198638463, + "grad_norm": 18.5, + "learning_rate": 2.45578726680704e-06, + "loss": 0.811, + "step": 790 + }, + { + "epoch": 0.19231704352054463, + "grad_norm": 20.75, + "learning_rate": 2.4556597847113873e-06, + "loss": 1.3838, + "step": 791 + }, + { + "epoch": 0.1925601750547046, + "grad_norm": 31.875, + "learning_rate": 2.455532122409057e-06, + "loss": 1.3928, + "step": 792 + }, + { + "epoch": 0.19280330658886458, + "grad_norm": 20.375, + "learning_rate": 2.4554042799191313e-06, + "loss": 0.9493, + "step": 793 + }, + { + "epoch": 0.19304643812302455, + "grad_norm": 17.625, + "learning_rate": 2.4552762572607174e-06, + "loss": 0.7572, + "step": 794 + }, + { + "epoch": 0.19328956965718452, + "grad_norm": 10.875, + "learning_rate": 2.4551480544529518e-06, + "loss": 0.3469, + "step": 795 + }, + { + "epoch": 0.19353270119134452, + "grad_norm": 13.5625, + "learning_rate": 2.4550196715149953e-06, + "loss": 0.4945, + "step": 796 + }, + { + "epoch": 0.1937758327255045, + "grad_norm": 18.75, + "learning_rate": 2.4548911084660375e-06, + "loss": 0.8778, + "step": 797 + }, + { + "epoch": 0.19401896425966447, + "grad_norm": 23.625, + "learning_rate": 2.4547623653252945e-06, + "loss": 1.0358, + "step": 798 + }, + { + "epoch": 0.19426209579382445, + "grad_norm": 20.875, + "learning_rate": 2.454633442112009e-06, + "loss": 0.9956, + "step": 799 + }, + { + "epoch": 0.19450522732798445, + "grad_norm": 21.875, + "learning_rate": 2.4545043388454505e-06, + "loss": 0.6224, + "step": 800 + }, + { + "epoch": 0.19474835886214442, + "grad_norm": 31.25, + "learning_rate": 2.454375055544916e-06, + "loss": 1.0289, + "step": 801 + }, + { + "epoch": 0.1949914903963044, + "grad_norm": 23.25, + "learning_rate": 2.4542455922297297e-06, + "loss": 1.1676, + "step": 802 + }, + { + "epoch": 0.19523462193046437, + "grad_norm": 21.125, + "learning_rate": 2.4541159489192414e-06, + "loss": 0.8504, + "step": 803 + }, + { + "epoch": 0.19547775346462437, + "grad_norm": 19.5, + "learning_rate": 2.4539861256328286e-06, + "loss": 0.9696, + "step": 804 + }, + { + "epoch": 0.19572088499878434, + "grad_norm": 19.125, + "learning_rate": 2.453856122389896e-06, + "loss": 1.1432, + "step": 805 + }, + { + "epoch": 0.19596401653294432, + "grad_norm": 16.5, + "learning_rate": 2.4537259392098745e-06, + "loss": 0.5648, + "step": 806 + }, + { + "epoch": 0.1962071480671043, + "grad_norm": 26.875, + "learning_rate": 2.4535955761122223e-06, + "loss": 1.1229, + "step": 807 + }, + { + "epoch": 0.1964502796012643, + "grad_norm": 15.6875, + "learning_rate": 2.4534650331164247e-06, + "loss": 1.0883, + "step": 808 + }, + { + "epoch": 0.19669341113542427, + "grad_norm": 19.875, + "learning_rate": 2.4533343102419927e-06, + "loss": 0.8562, + "step": 809 + }, + { + "epoch": 0.19693654266958424, + "grad_norm": 19.125, + "learning_rate": 2.453203407508466e-06, + "loss": 1.0313, + "step": 810 + }, + { + "epoch": 0.19717967420374422, + "grad_norm": 23.0, + "learning_rate": 2.4530723249354105e-06, + "loss": 0.605, + "step": 811 + }, + { + "epoch": 0.19742280573790422, + "grad_norm": 21.625, + "learning_rate": 2.452941062542418e-06, + "loss": 1.0167, + "step": 812 + }, + { + "epoch": 0.1976659372720642, + "grad_norm": 24.5, + "learning_rate": 2.4528096203491074e-06, + "loss": 1.066, + "step": 813 + }, + { + "epoch": 0.19790906880622416, + "grad_norm": 22.875, + "learning_rate": 2.4526779983751266e-06, + "loss": 0.9114, + "step": 814 + }, + { + "epoch": 0.19815220034038414, + "grad_norm": 20.75, + "learning_rate": 2.4525461966401482e-06, + "loss": 0.8261, + "step": 815 + }, + { + "epoch": 0.19839533187454414, + "grad_norm": 21.625, + "learning_rate": 2.4524142151638712e-06, + "loss": 0.6334, + "step": 816 + }, + { + "epoch": 0.1986384634087041, + "grad_norm": 24.0, + "learning_rate": 2.452282053966024e-06, + "loss": 1.1767, + "step": 817 + }, + { + "epoch": 0.1988815949428641, + "grad_norm": 17.5, + "learning_rate": 2.4521497130663595e-06, + "loss": 0.7888, + "step": 818 + }, + { + "epoch": 0.19912472647702406, + "grad_norm": 23.25, + "learning_rate": 2.452017192484659e-06, + "loss": 1.0081, + "step": 819 + }, + { + "epoch": 0.19936785801118406, + "grad_norm": 20.75, + "learning_rate": 2.4518844922407287e-06, + "loss": 1.1283, + "step": 820 + }, + { + "epoch": 0.19961098954534404, + "grad_norm": 23.125, + "learning_rate": 2.451751612354404e-06, + "loss": 0.7548, + "step": 821 + }, + { + "epoch": 0.199854121079504, + "grad_norm": 18.375, + "learning_rate": 2.451618552845546e-06, + "loss": 1.0112, + "step": 822 + }, + { + "epoch": 0.20009725261366398, + "grad_norm": 14.3125, + "learning_rate": 2.4514853137340427e-06, + "loss": 0.5844, + "step": 823 + }, + { + "epoch": 0.20034038414782399, + "grad_norm": 22.875, + "learning_rate": 2.4513518950398085e-06, + "loss": 1.0063, + "step": 824 + }, + { + "epoch": 0.20058351568198396, + "grad_norm": 21.375, + "learning_rate": 2.451218296782786e-06, + "loss": 0.7965, + "step": 825 + }, + { + "epoch": 0.20082664721614393, + "grad_norm": 24.125, + "learning_rate": 2.451084518982943e-06, + "loss": 1.2595, + "step": 826 + }, + { + "epoch": 0.2010697787503039, + "grad_norm": 20.125, + "learning_rate": 2.4509505616602753e-06, + "loss": 0.7854, + "step": 827 + }, + { + "epoch": 0.2013129102844639, + "grad_norm": 19.875, + "learning_rate": 2.450816424834805e-06, + "loss": 0.7807, + "step": 828 + }, + { + "epoch": 0.20155604181862388, + "grad_norm": 22.375, + "learning_rate": 2.4506821085265813e-06, + "loss": 1.1725, + "step": 829 + }, + { + "epoch": 0.20179917335278386, + "grad_norm": 16.625, + "learning_rate": 2.45054761275568e-06, + "loss": 0.8272, + "step": 830 + }, + { + "epoch": 0.20204230488694383, + "grad_norm": 16.0, + "learning_rate": 2.4504129375422037e-06, + "loss": 0.7311, + "step": 831 + }, + { + "epoch": 0.2022854364211038, + "grad_norm": 17.875, + "learning_rate": 2.450278082906282e-06, + "loss": 0.9647, + "step": 832 + }, + { + "epoch": 0.2025285679552638, + "grad_norm": 21.75, + "learning_rate": 2.450143048868071e-06, + "loss": 0.802, + "step": 833 + }, + { + "epoch": 0.20277169948942378, + "grad_norm": 18.625, + "learning_rate": 2.4500078354477547e-06, + "loss": 0.8647, + "step": 834 + }, + { + "epoch": 0.20301483102358375, + "grad_norm": 18.75, + "learning_rate": 2.4498724426655424e-06, + "loss": 0.7957, + "step": 835 + }, + { + "epoch": 0.20325796255774373, + "grad_norm": 13.0625, + "learning_rate": 2.449736870541671e-06, + "loss": 0.4855, + "step": 836 + }, + { + "epoch": 0.20350109409190373, + "grad_norm": 13.6875, + "learning_rate": 2.4496011190964044e-06, + "loss": 0.4245, + "step": 837 + }, + { + "epoch": 0.2037442256260637, + "grad_norm": 24.375, + "learning_rate": 2.449465188350032e-06, + "loss": 1.2063, + "step": 838 + }, + { + "epoch": 0.20398735716022368, + "grad_norm": 14.4375, + "learning_rate": 2.4493290783228723e-06, + "loss": 0.5422, + "step": 839 + }, + { + "epoch": 0.20423048869438365, + "grad_norm": 19.75, + "learning_rate": 2.4491927890352685e-06, + "loss": 0.9839, + "step": 840 + }, + { + "epoch": 0.20447362022854365, + "grad_norm": 17.5, + "learning_rate": 2.4490563205075916e-06, + "loss": 0.7722, + "step": 841 + }, + { + "epoch": 0.20471675176270362, + "grad_norm": 25.25, + "learning_rate": 2.448919672760239e-06, + "loss": 0.6736, + "step": 842 + }, + { + "epoch": 0.2049598832968636, + "grad_norm": 17.5, + "learning_rate": 2.4487828458136354e-06, + "loss": 0.6458, + "step": 843 + }, + { + "epoch": 0.20520301483102357, + "grad_norm": 20.25, + "learning_rate": 2.4486458396882317e-06, + "loss": 0.89, + "step": 844 + }, + { + "epoch": 0.20544614636518357, + "grad_norm": 25.25, + "learning_rate": 2.4485086544045063e-06, + "loss": 1.1589, + "step": 845 + }, + { + "epoch": 0.20568927789934355, + "grad_norm": 18.375, + "learning_rate": 2.4483712899829636e-06, + "loss": 0.9451, + "step": 846 + }, + { + "epoch": 0.20593240943350352, + "grad_norm": 19.125, + "learning_rate": 2.448233746444135e-06, + "loss": 1.145, + "step": 847 + }, + { + "epoch": 0.2061755409676635, + "grad_norm": 22.125, + "learning_rate": 2.448096023808578e-06, + "loss": 0.7977, + "step": 848 + }, + { + "epoch": 0.2064186725018235, + "grad_norm": 19.375, + "learning_rate": 2.447958122096879e-06, + "loss": 0.8558, + "step": 849 + }, + { + "epoch": 0.20666180403598347, + "grad_norm": 20.875, + "learning_rate": 2.4478200413296494e-06, + "loss": 1.0523, + "step": 850 + }, + { + "epoch": 0.20690493557014344, + "grad_norm": 22.0, + "learning_rate": 2.447681781527527e-06, + "loss": 0.955, + "step": 851 + }, + { + "epoch": 0.20714806710430342, + "grad_norm": 14.8125, + "learning_rate": 2.447543342711178e-06, + "loss": 0.5719, + "step": 852 + }, + { + "epoch": 0.20739119863846342, + "grad_norm": 23.0, + "learning_rate": 2.447404724901294e-06, + "loss": 0.7843, + "step": 853 + }, + { + "epoch": 0.2076343301726234, + "grad_norm": 21.625, + "learning_rate": 2.447265928118594e-06, + "loss": 1.3464, + "step": 854 + }, + { + "epoch": 0.20787746170678337, + "grad_norm": 18.5, + "learning_rate": 2.447126952383824e-06, + "loss": 0.7989, + "step": 855 + }, + { + "epoch": 0.20812059324094334, + "grad_norm": 28.875, + "learning_rate": 2.446987797717755e-06, + "loss": 0.5581, + "step": 856 + }, + { + "epoch": 0.20836372477510334, + "grad_norm": 15.1875, + "learning_rate": 2.4468484641411877e-06, + "loss": 1.1626, + "step": 857 + }, + { + "epoch": 0.20860685630926332, + "grad_norm": 17.75, + "learning_rate": 2.446708951674947e-06, + "loss": 1.2429, + "step": 858 + }, + { + "epoch": 0.2088499878434233, + "grad_norm": 20.125, + "learning_rate": 2.4465692603398854e-06, + "loss": 1.0368, + "step": 859 + }, + { + "epoch": 0.20909311937758326, + "grad_norm": 23.25, + "learning_rate": 2.4464293901568824e-06, + "loss": 1.3265, + "step": 860 + }, + { + "epoch": 0.20933625091174327, + "grad_norm": 22.25, + "learning_rate": 2.446289341146844e-06, + "loss": 0.842, + "step": 861 + }, + { + "epoch": 0.20957938244590324, + "grad_norm": 22.75, + "learning_rate": 2.446149113330703e-06, + "loss": 1.1281, + "step": 862 + }, + { + "epoch": 0.2098225139800632, + "grad_norm": 20.875, + "learning_rate": 2.4460087067294186e-06, + "loss": 1.2158, + "step": 863 + }, + { + "epoch": 0.2100656455142232, + "grad_norm": 15.875, + "learning_rate": 2.4458681213639773e-06, + "loss": 0.6732, + "step": 864 + }, + { + "epoch": 0.2103087770483832, + "grad_norm": 24.25, + "learning_rate": 2.445727357255392e-06, + "loss": 1.3589, + "step": 865 + }, + { + "epoch": 0.21055190858254316, + "grad_norm": 18.625, + "learning_rate": 2.4455864144247023e-06, + "loss": 1.2187, + "step": 866 + }, + { + "epoch": 0.21079504011670314, + "grad_norm": 26.625, + "learning_rate": 2.4454452928929746e-06, + "loss": 1.1716, + "step": 867 + }, + { + "epoch": 0.2110381716508631, + "grad_norm": 22.125, + "learning_rate": 2.4453039926813014e-06, + "loss": 0.9565, + "step": 868 + }, + { + "epoch": 0.21128130318502308, + "grad_norm": 22.0, + "learning_rate": 2.445162513810803e-06, + "loss": 0.9566, + "step": 869 + }, + { + "epoch": 0.21152443471918309, + "grad_norm": 22.375, + "learning_rate": 2.445020856302626e-06, + "loss": 1.2194, + "step": 870 + }, + { + "epoch": 0.21176756625334306, + "grad_norm": 22.625, + "learning_rate": 2.4448790201779428e-06, + "loss": 1.1004, + "step": 871 + }, + { + "epoch": 0.21201069778750303, + "grad_norm": 32.5, + "learning_rate": 2.4447370054579542e-06, + "loss": 1.3613, + "step": 872 + }, + { + "epoch": 0.212253829321663, + "grad_norm": 16.875, + "learning_rate": 2.444594812163886e-06, + "loss": 0.9028, + "step": 873 + }, + { + "epoch": 0.212496960855823, + "grad_norm": 19.5, + "learning_rate": 2.4444524403169922e-06, + "loss": 1.2903, + "step": 874 + }, + { + "epoch": 0.21274009238998298, + "grad_norm": 18.875, + "learning_rate": 2.444309889938552e-06, + "loss": 1.2591, + "step": 875 + }, + { + "epoch": 0.21298322392414296, + "grad_norm": 15.5625, + "learning_rate": 2.4441671610498725e-06, + "loss": 0.7273, + "step": 876 + }, + { + "epoch": 0.21322635545830293, + "grad_norm": 22.75, + "learning_rate": 2.4440242536722863e-06, + "loss": 1.3297, + "step": 877 + }, + { + "epoch": 0.21346948699246293, + "grad_norm": 21.75, + "learning_rate": 2.4438811678271543e-06, + "loss": 0.8035, + "step": 878 + }, + { + "epoch": 0.2137126185266229, + "grad_norm": 34.0, + "learning_rate": 2.4437379035358626e-06, + "loss": 1.1964, + "step": 879 + }, + { + "epoch": 0.21395575006078288, + "grad_norm": 29.75, + "learning_rate": 2.4435944608198246e-06, + "loss": 0.9425, + "step": 880 + }, + { + "epoch": 0.21419888159494285, + "grad_norm": 21.0, + "learning_rate": 2.4434508397004806e-06, + "loss": 1.0503, + "step": 881 + }, + { + "epoch": 0.21444201312910285, + "grad_norm": 26.75, + "learning_rate": 2.4433070401992968e-06, + "loss": 1.116, + "step": 882 + }, + { + "epoch": 0.21468514466326283, + "grad_norm": 23.625, + "learning_rate": 2.4431630623377665e-06, + "loss": 1.0084, + "step": 883 + }, + { + "epoch": 0.2149282761974228, + "grad_norm": 18.0, + "learning_rate": 2.4430189061374103e-06, + "loss": 1.1068, + "step": 884 + }, + { + "epoch": 0.21517140773158278, + "grad_norm": 20.25, + "learning_rate": 2.4428745716197746e-06, + "loss": 0.8787, + "step": 885 + }, + { + "epoch": 0.21541453926574278, + "grad_norm": 19.875, + "learning_rate": 2.4427300588064316e-06, + "loss": 0.9147, + "step": 886 + }, + { + "epoch": 0.21565767079990275, + "grad_norm": 14.6875, + "learning_rate": 2.4425853677189833e-06, + "loss": 0.8199, + "step": 887 + }, + { + "epoch": 0.21590080233406272, + "grad_norm": 18.5, + "learning_rate": 2.4424404983790547e-06, + "loss": 0.9097, + "step": 888 + }, + { + "epoch": 0.2161439338682227, + "grad_norm": 22.375, + "learning_rate": 2.442295450808299e-06, + "loss": 0.9711, + "step": 889 + }, + { + "epoch": 0.2163870654023827, + "grad_norm": 19.375, + "learning_rate": 2.4421502250283966e-06, + "loss": 0.7945, + "step": 890 + }, + { + "epoch": 0.21663019693654267, + "grad_norm": 20.75, + "learning_rate": 2.4420048210610542e-06, + "loss": 0.9914, + "step": 891 + }, + { + "epoch": 0.21687332847070265, + "grad_norm": 19.0, + "learning_rate": 2.441859238928005e-06, + "loss": 0.9432, + "step": 892 + }, + { + "epoch": 0.21711646000486262, + "grad_norm": 21.0, + "learning_rate": 2.4417134786510077e-06, + "loss": 0.8256, + "step": 893 + }, + { + "epoch": 0.21735959153902262, + "grad_norm": 19.75, + "learning_rate": 2.44156754025185e-06, + "loss": 0.6091, + "step": 894 + }, + { + "epoch": 0.2176027230731826, + "grad_norm": 27.375, + "learning_rate": 2.441421423752344e-06, + "loss": 1.0419, + "step": 895 + }, + { + "epoch": 0.21784585460734257, + "grad_norm": 20.875, + "learning_rate": 2.4412751291743297e-06, + "loss": 1.296, + "step": 896 + }, + { + "epoch": 0.21808898614150254, + "grad_norm": 23.625, + "learning_rate": 2.4411286565396735e-06, + "loss": 0.8414, + "step": 897 + }, + { + "epoch": 0.21833211767566255, + "grad_norm": 17.375, + "learning_rate": 2.4409820058702678e-06, + "loss": 0.9116, + "step": 898 + }, + { + "epoch": 0.21857524920982252, + "grad_norm": 18.875, + "learning_rate": 2.4408351771880324e-06, + "loss": 0.8075, + "step": 899 + }, + { + "epoch": 0.2188183807439825, + "grad_norm": 16.75, + "learning_rate": 2.4406881705149133e-06, + "loss": 0.9483, + "step": 900 + }, + { + "epoch": 0.21906151227814247, + "grad_norm": 28.75, + "learning_rate": 2.4405409858728836e-06, + "loss": 1.284, + "step": 901 + }, + { + "epoch": 0.21930464381230247, + "grad_norm": 22.875, + "learning_rate": 2.4403936232839418e-06, + "loss": 0.869, + "step": 902 + }, + { + "epoch": 0.21954777534646244, + "grad_norm": 18.375, + "learning_rate": 2.440246082770114e-06, + "loss": 1.0692, + "step": 903 + }, + { + "epoch": 0.21979090688062242, + "grad_norm": 25.125, + "learning_rate": 2.440098364353454e-06, + "loss": 0.92, + "step": 904 + }, + { + "epoch": 0.2200340384147824, + "grad_norm": 19.125, + "learning_rate": 2.4399504680560387e-06, + "loss": 1.0571, + "step": 905 + }, + { + "epoch": 0.22027716994894236, + "grad_norm": 23.875, + "learning_rate": 2.439802393899975e-06, + "loss": 1.2083, + "step": 906 + }, + { + "epoch": 0.22052030148310237, + "grad_norm": 23.5, + "learning_rate": 2.4396541419073947e-06, + "loss": 1.2092, + "step": 907 + }, + { + "epoch": 0.22076343301726234, + "grad_norm": 20.125, + "learning_rate": 2.4395057121004573e-06, + "loss": 1.0874, + "step": 908 + }, + { + "epoch": 0.2210065645514223, + "grad_norm": 18.125, + "learning_rate": 2.4393571045013475e-06, + "loss": 0.6744, + "step": 909 + }, + { + "epoch": 0.2212496960855823, + "grad_norm": 22.375, + "learning_rate": 2.4392083191322774e-06, + "loss": 0.9102, + "step": 910 + }, + { + "epoch": 0.2214928276197423, + "grad_norm": 16.875, + "learning_rate": 2.439059356015486e-06, + "loss": 0.7533, + "step": 911 + }, + { + "epoch": 0.22173595915390226, + "grad_norm": 17.5, + "learning_rate": 2.438910215173238e-06, + "loss": 0.7811, + "step": 912 + }, + { + "epoch": 0.22197909068806224, + "grad_norm": 19.5, + "learning_rate": 2.438760896627825e-06, + "loss": 1.1445, + "step": 913 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 18.5, + "learning_rate": 2.4386114004015653e-06, + "loss": 1.0945, + "step": 914 + }, + { + "epoch": 0.2224653537563822, + "grad_norm": 23.625, + "learning_rate": 2.4384617265168043e-06, + "loss": 1.0269, + "step": 915 + }, + { + "epoch": 0.22270848529054219, + "grad_norm": 18.5, + "learning_rate": 2.4383118749959122e-06, + "loss": 0.8623, + "step": 916 + }, + { + "epoch": 0.22295161682470216, + "grad_norm": 20.25, + "learning_rate": 2.438161845861288e-06, + "loss": 1.0068, + "step": 917 + }, + { + "epoch": 0.22319474835886213, + "grad_norm": 23.375, + "learning_rate": 2.438011639135355e-06, + "loss": 1.1542, + "step": 918 + }, + { + "epoch": 0.22343787989302213, + "grad_norm": 21.125, + "learning_rate": 2.4378612548405657e-06, + "loss": 0.9252, + "step": 919 + }, + { + "epoch": 0.2236810114271821, + "grad_norm": 17.25, + "learning_rate": 2.4377106929993966e-06, + "loss": 0.5471, + "step": 920 + }, + { + "epoch": 0.22392414296134208, + "grad_norm": 16.625, + "learning_rate": 2.4375599536343515e-06, + "loss": 1.4131, + "step": 921 + }, + { + "epoch": 0.22416727449550206, + "grad_norm": 19.625, + "learning_rate": 2.437409036767962e-06, + "loss": 0.686, + "step": 922 + }, + { + "epoch": 0.22441040602966206, + "grad_norm": 21.375, + "learning_rate": 2.4372579424227843e-06, + "loss": 1.0637, + "step": 923 + }, + { + "epoch": 0.22465353756382203, + "grad_norm": 24.375, + "learning_rate": 2.4371066706214026e-06, + "loss": 1.1009, + "step": 924 + }, + { + "epoch": 0.224896669097982, + "grad_norm": 23.25, + "learning_rate": 2.436955221386427e-06, + "loss": 1.106, + "step": 925 + }, + { + "epoch": 0.22513980063214198, + "grad_norm": 20.375, + "learning_rate": 2.436803594740494e-06, + "loss": 1.0571, + "step": 926 + }, + { + "epoch": 0.22538293216630198, + "grad_norm": 35.0, + "learning_rate": 2.436651790706267e-06, + "loss": 0.4582, + "step": 927 + }, + { + "epoch": 0.22562606370046195, + "grad_norm": 32.5, + "learning_rate": 2.4364998093064357e-06, + "loss": 0.9672, + "step": 928 + }, + { + "epoch": 0.22586919523462193, + "grad_norm": 14.8125, + "learning_rate": 2.4363476505637162e-06, + "loss": 0.7489, + "step": 929 + }, + { + "epoch": 0.2261123267687819, + "grad_norm": 20.625, + "learning_rate": 2.4361953145008517e-06, + "loss": 0.8131, + "step": 930 + }, + { + "epoch": 0.2263554583029419, + "grad_norm": 15.4375, + "learning_rate": 2.436042801140611e-06, + "loss": 0.4668, + "step": 931 + }, + { + "epoch": 0.22659858983710188, + "grad_norm": 22.625, + "learning_rate": 2.4358901105057902e-06, + "loss": 0.7394, + "step": 932 + }, + { + "epoch": 0.22684172137126185, + "grad_norm": 26.75, + "learning_rate": 2.435737242619211e-06, + "loss": 0.9973, + "step": 933 + }, + { + "epoch": 0.22708485290542182, + "grad_norm": 17.875, + "learning_rate": 2.4355841975037226e-06, + "loss": 0.9461, + "step": 934 + }, + { + "epoch": 0.22732798443958183, + "grad_norm": 17.25, + "learning_rate": 2.4354309751822004e-06, + "loss": 0.6809, + "step": 935 + }, + { + "epoch": 0.2275711159737418, + "grad_norm": 20.0, + "learning_rate": 2.4352775756775453e-06, + "loss": 0.8722, + "step": 936 + }, + { + "epoch": 0.22781424750790177, + "grad_norm": 26.375, + "learning_rate": 2.435123999012687e-06, + "loss": 0.9036, + "step": 937 + }, + { + "epoch": 0.22805737904206175, + "grad_norm": 17.5, + "learning_rate": 2.4349702452105783e-06, + "loss": 0.7738, + "step": 938 + }, + { + "epoch": 0.22830051057622175, + "grad_norm": 18.25, + "learning_rate": 2.4348163142942017e-06, + "loss": 1.0418, + "step": 939 + }, + { + "epoch": 0.22854364211038172, + "grad_norm": 18.125, + "learning_rate": 2.4346622062865645e-06, + "loss": 0.8831, + "step": 940 + }, + { + "epoch": 0.2287867736445417, + "grad_norm": 18.5, + "learning_rate": 2.4345079212107003e-06, + "loss": 0.9789, + "step": 941 + }, + { + "epoch": 0.22902990517870167, + "grad_norm": 18.25, + "learning_rate": 2.4343534590896705e-06, + "loss": 0.9599, + "step": 942 + }, + { + "epoch": 0.22927303671286167, + "grad_norm": 21.25, + "learning_rate": 2.434198819946562e-06, + "loss": 1.1084, + "step": 943 + }, + { + "epoch": 0.22951616824702165, + "grad_norm": 18.625, + "learning_rate": 2.4340440038044877e-06, + "loss": 0.8981, + "step": 944 + }, + { + "epoch": 0.22975929978118162, + "grad_norm": 19.875, + "learning_rate": 2.433889010686588e-06, + "loss": 0.842, + "step": 945 + }, + { + "epoch": 0.2300024313153416, + "grad_norm": 22.625, + "learning_rate": 2.433733840616029e-06, + "loss": 0.9982, + "step": 946 + }, + { + "epoch": 0.23024556284950157, + "grad_norm": 21.25, + "learning_rate": 2.433578493616004e-06, + "loss": 0.9833, + "step": 947 + }, + { + "epoch": 0.23048869438366157, + "grad_norm": 26.25, + "learning_rate": 2.4334229697097315e-06, + "loss": 1.354, + "step": 948 + }, + { + "epoch": 0.23073182591782154, + "grad_norm": 18.625, + "learning_rate": 2.4332672689204583e-06, + "loss": 0.9825, + "step": 949 + }, + { + "epoch": 0.23097495745198152, + "grad_norm": 19.875, + "learning_rate": 2.433111391271456e-06, + "loss": 0.9362, + "step": 950 + }, + { + "epoch": 0.2312180889861415, + "grad_norm": 16.875, + "learning_rate": 2.432955336786023e-06, + "loss": 0.7623, + "step": 951 + }, + { + "epoch": 0.2314612205203015, + "grad_norm": 19.75, + "learning_rate": 2.4327991054874843e-06, + "loss": 0.8751, + "step": 952 + }, + { + "epoch": 0.23170435205446147, + "grad_norm": 14.8125, + "learning_rate": 2.4326426973991922e-06, + "loss": 0.4809, + "step": 953 + }, + { + "epoch": 0.23194748358862144, + "grad_norm": 15.8125, + "learning_rate": 2.4324861125445236e-06, + "loss": 0.7759, + "step": 954 + }, + { + "epoch": 0.2321906151227814, + "grad_norm": 19.625, + "learning_rate": 2.4323293509468837e-06, + "loss": 0.7713, + "step": 955 + }, + { + "epoch": 0.23243374665694141, + "grad_norm": 18.375, + "learning_rate": 2.4321724126297026e-06, + "loss": 1.0325, + "step": 956 + }, + { + "epoch": 0.2326768781911014, + "grad_norm": 20.5, + "learning_rate": 2.432015297616437e-06, + "loss": 0.8978, + "step": 957 + }, + { + "epoch": 0.23292000972526136, + "grad_norm": 23.75, + "learning_rate": 2.431858005930572e-06, + "loss": 1.1904, + "step": 958 + }, + { + "epoch": 0.23316314125942134, + "grad_norm": 23.75, + "learning_rate": 2.4317005375956163e-06, + "loss": 1.2113, + "step": 959 + }, + { + "epoch": 0.23340627279358134, + "grad_norm": 20.25, + "learning_rate": 2.4315428926351067e-06, + "loss": 0.8656, + "step": 960 + }, + { + "epoch": 0.2336494043277413, + "grad_norm": 19.5, + "learning_rate": 2.4313850710726054e-06, + "loss": 1.0967, + "step": 961 + }, + { + "epoch": 0.23389253586190129, + "grad_norm": 22.5, + "learning_rate": 2.4312270729317024e-06, + "loss": 0.7397, + "step": 962 + }, + { + "epoch": 0.23413566739606126, + "grad_norm": 29.25, + "learning_rate": 2.4310688982360125e-06, + "loss": 1.0042, + "step": 963 + }, + { + "epoch": 0.23437879893022126, + "grad_norm": 24.875, + "learning_rate": 2.430910547009178e-06, + "loss": 1.1196, + "step": 964 + }, + { + "epoch": 0.23462193046438123, + "grad_norm": 22.125, + "learning_rate": 2.4307520192748675e-06, + "loss": 0.9531, + "step": 965 + }, + { + "epoch": 0.2348650619985412, + "grad_norm": 20.125, + "learning_rate": 2.430593315056776e-06, + "loss": 0.7087, + "step": 966 + }, + { + "epoch": 0.23510819353270118, + "grad_norm": 17.0, + "learning_rate": 2.430434434378623e-06, + "loss": 0.7385, + "step": 967 + }, + { + "epoch": 0.23535132506686118, + "grad_norm": 23.5, + "learning_rate": 2.430275377264157e-06, + "loss": 1.1282, + "step": 968 + }, + { + "epoch": 0.23559445660102116, + "grad_norm": 17.0, + "learning_rate": 2.4301161437371525e-06, + "loss": 0.7602, + "step": 969 + }, + { + "epoch": 0.23583758813518113, + "grad_norm": 22.75, + "learning_rate": 2.4299567338214086e-06, + "loss": 1.0104, + "step": 970 + }, + { + "epoch": 0.2360807196693411, + "grad_norm": 17.375, + "learning_rate": 2.429797147540752e-06, + "loss": 0.7066, + "step": 971 + }, + { + "epoch": 0.2363238512035011, + "grad_norm": 35.25, + "learning_rate": 2.4296373849190363e-06, + "loss": 0.8827, + "step": 972 + }, + { + "epoch": 0.23656698273766108, + "grad_norm": 21.5, + "learning_rate": 2.42947744598014e-06, + "loss": 0.9952, + "step": 973 + }, + { + "epoch": 0.23681011427182105, + "grad_norm": 19.75, + "learning_rate": 2.4293173307479696e-06, + "loss": 0.9081, + "step": 974 + }, + { + "epoch": 0.23705324580598103, + "grad_norm": 22.125, + "learning_rate": 2.4291570392464566e-06, + "loss": 1.469, + "step": 975 + }, + { + "epoch": 0.23729637734014103, + "grad_norm": 23.875, + "learning_rate": 2.4289965714995588e-06, + "loss": 1.0484, + "step": 976 + }, + { + "epoch": 0.237539508874301, + "grad_norm": 19.0, + "learning_rate": 2.428835927531262e-06, + "loss": 1.0291, + "step": 977 + }, + { + "epoch": 0.23778264040846098, + "grad_norm": 19.25, + "learning_rate": 2.428675107365576e-06, + "loss": 1.0113, + "step": 978 + }, + { + "epoch": 0.23802577194262095, + "grad_norm": 23.0, + "learning_rate": 2.42851411102654e-06, + "loss": 1.4051, + "step": 979 + }, + { + "epoch": 0.23826890347678095, + "grad_norm": 21.125, + "learning_rate": 2.4283529385382154e-06, + "loss": 1.0861, + "step": 980 + }, + { + "epoch": 0.23851203501094093, + "grad_norm": 12.4375, + "learning_rate": 2.4281915899246934e-06, + "loss": 0.6562, + "step": 981 + }, + { + "epoch": 0.2387551665451009, + "grad_norm": 20.5, + "learning_rate": 2.4280300652100904e-06, + "loss": 1.0857, + "step": 982 + }, + { + "epoch": 0.23899829807926087, + "grad_norm": 19.25, + "learning_rate": 2.4278683644185487e-06, + "loss": 0.8665, + "step": 983 + }, + { + "epoch": 0.23924142961342085, + "grad_norm": 17.25, + "learning_rate": 2.4277064875742375e-06, + "loss": 0.7677, + "step": 984 + }, + { + "epoch": 0.23948456114758085, + "grad_norm": 17.875, + "learning_rate": 2.4275444347013523e-06, + "loss": 0.9632, + "step": 985 + }, + { + "epoch": 0.23972769268174082, + "grad_norm": 21.0, + "learning_rate": 2.427382205824114e-06, + "loss": 0.9758, + "step": 986 + }, + { + "epoch": 0.2399708242159008, + "grad_norm": 20.25, + "learning_rate": 2.427219800966771e-06, + "loss": 0.8743, + "step": 987 + }, + { + "epoch": 0.24021395575006077, + "grad_norm": 23.0, + "learning_rate": 2.427057220153598e-06, + "loss": 1.2395, + "step": 988 + }, + { + "epoch": 0.24045708728422077, + "grad_norm": 21.125, + "learning_rate": 2.426894463408894e-06, + "loss": 1.0091, + "step": 989 + }, + { + "epoch": 0.24070021881838075, + "grad_norm": 17.125, + "learning_rate": 2.4267315307569876e-06, + "loss": 0.8063, + "step": 990 + }, + { + "epoch": 0.24094335035254072, + "grad_norm": 14.0, + "learning_rate": 2.4265684222222307e-06, + "loss": 0.4877, + "step": 991 + }, + { + "epoch": 0.2411864818867007, + "grad_norm": 34.75, + "learning_rate": 2.426405137829003e-06, + "loss": 1.0394, + "step": 992 + }, + { + "epoch": 0.2414296134208607, + "grad_norm": 18.5, + "learning_rate": 2.4262416776017107e-06, + "loss": 1.3841, + "step": 993 + }, + { + "epoch": 0.24167274495502067, + "grad_norm": 17.25, + "learning_rate": 2.426078041564785e-06, + "loss": 0.8635, + "step": 994 + }, + { + "epoch": 0.24191587648918064, + "grad_norm": 19.25, + "learning_rate": 2.4259142297426846e-06, + "loss": 0.9947, + "step": 995 + }, + { + "epoch": 0.24215900802334062, + "grad_norm": 17.25, + "learning_rate": 2.4257502421598934e-06, + "loss": 0.9596, + "step": 996 + }, + { + "epoch": 0.24240213955750062, + "grad_norm": 23.0, + "learning_rate": 2.425586078840923e-06, + "loss": 1.1512, + "step": 997 + }, + { + "epoch": 0.2426452710916606, + "grad_norm": 17.875, + "learning_rate": 2.42542173981031e-06, + "loss": 0.9097, + "step": 998 + }, + { + "epoch": 0.24288840262582057, + "grad_norm": 16.0, + "learning_rate": 2.4252572250926176e-06, + "loss": 0.7223, + "step": 999 + }, + { + "epoch": 0.24313153415998054, + "grad_norm": 14.375, + "learning_rate": 2.4250925347124353e-06, + "loss": 0.5323, + "step": 1000 + }, + { + "epoch": 0.24337466569414054, + "grad_norm": 24.875, + "learning_rate": 2.4249276686943797e-06, + "loss": 1.383, + "step": 1001 + }, + { + "epoch": 0.24361779722830051, + "grad_norm": 31.25, + "learning_rate": 2.424762627063092e-06, + "loss": 0.877, + "step": 1002 + }, + { + "epoch": 0.2438609287624605, + "grad_norm": 18.5, + "learning_rate": 2.4245974098432406e-06, + "loss": 1.054, + "step": 1003 + }, + { + "epoch": 0.24410406029662046, + "grad_norm": 24.375, + "learning_rate": 2.4244320170595206e-06, + "loss": 1.2724, + "step": 1004 + }, + { + "epoch": 0.24434719183078046, + "grad_norm": 18.75, + "learning_rate": 2.4242664487366523e-06, + "loss": 1.0731, + "step": 1005 + }, + { + "epoch": 0.24459032336494044, + "grad_norm": 20.25, + "learning_rate": 2.424100704899383e-06, + "loss": 1.1288, + "step": 1006 + }, + { + "epoch": 0.2448334548991004, + "grad_norm": 19.25, + "learning_rate": 2.4239347855724863e-06, + "loss": 1.3863, + "step": 1007 + }, + { + "epoch": 0.24507658643326038, + "grad_norm": 19.0, + "learning_rate": 2.4237686907807612e-06, + "loss": 0.8535, + "step": 1008 + }, + { + "epoch": 0.2453197179674204, + "grad_norm": 17.875, + "learning_rate": 2.4236024205490335e-06, + "loss": 1.1126, + "step": 1009 + }, + { + "epoch": 0.24556284950158036, + "grad_norm": 16.75, + "learning_rate": 2.423435974902155e-06, + "loss": 0.9586, + "step": 1010 + }, + { + "epoch": 0.24580598103574033, + "grad_norm": 17.875, + "learning_rate": 2.4232693538650043e-06, + "loss": 0.7974, + "step": 1011 + }, + { + "epoch": 0.2460491125699003, + "grad_norm": 17.875, + "learning_rate": 2.4231025574624855e-06, + "loss": 1.0039, + "step": 1012 + }, + { + "epoch": 0.2462922441040603, + "grad_norm": 14.625, + "learning_rate": 2.42293558571953e-06, + "loss": 0.5908, + "step": 1013 + }, + { + "epoch": 0.24653537563822028, + "grad_norm": 25.75, + "learning_rate": 2.4227684386610927e-06, + "loss": 1.2829, + "step": 1014 + }, + { + "epoch": 0.24677850717238026, + "grad_norm": 14.875, + "learning_rate": 2.422601116312159e-06, + "loss": 0.5478, + "step": 1015 + }, + { + "epoch": 0.24702163870654023, + "grad_norm": 22.0, + "learning_rate": 2.422433618697736e-06, + "loss": 1.4427, + "step": 1016 + }, + { + "epoch": 0.24726477024070023, + "grad_norm": 21.125, + "learning_rate": 2.4222659458428606e-06, + "loss": 0.9482, + "step": 1017 + }, + { + "epoch": 0.2475079017748602, + "grad_norm": 15.3125, + "learning_rate": 2.4220980977725934e-06, + "loss": 0.6992, + "step": 1018 + }, + { + "epoch": 0.24775103330902018, + "grad_norm": 26.25, + "learning_rate": 2.421930074512023e-06, + "loss": 1.6438, + "step": 1019 + }, + { + "epoch": 0.24799416484318015, + "grad_norm": 23.5, + "learning_rate": 2.421761876086263e-06, + "loss": 1.2188, + "step": 1020 + }, + { + "epoch": 0.24823729637734013, + "grad_norm": 14.0625, + "learning_rate": 2.4215935025204536e-06, + "loss": 0.4709, + "step": 1021 + }, + { + "epoch": 0.24848042791150013, + "grad_norm": 19.125, + "learning_rate": 2.421424953839761e-06, + "loss": 1.0291, + "step": 1022 + }, + { + "epoch": 0.2487235594456601, + "grad_norm": 18.25, + "learning_rate": 2.421256230069378e-06, + "loss": 1.1222, + "step": 1023 + }, + { + "epoch": 0.24896669097982008, + "grad_norm": 18.375, + "learning_rate": 2.421087331234523e-06, + "loss": 1.1241, + "step": 1024 + }, + { + "epoch": 0.24920982251398005, + "grad_norm": 20.625, + "learning_rate": 2.4209182573604414e-06, + "loss": 1.2151, + "step": 1025 + }, + { + "epoch": 0.24945295404814005, + "grad_norm": 19.75, + "learning_rate": 2.4207490084724033e-06, + "loss": 1.3846, + "step": 1026 + }, + { + "epoch": 0.24969608558230003, + "grad_norm": 14.125, + "learning_rate": 2.4205795845957062e-06, + "loss": 0.4152, + "step": 1027 + }, + { + "epoch": 0.24993921711646, + "grad_norm": 16.25, + "learning_rate": 2.420409985755674e-06, + "loss": 0.7009, + "step": 1028 + }, + { + "epoch": 0.25018234865062, + "grad_norm": 31.0, + "learning_rate": 2.4202402119776556e-06, + "loss": 0.9549, + "step": 1029 + }, + { + "epoch": 0.25042548018477995, + "grad_norm": 23.375, + "learning_rate": 2.420070263287027e-06, + "loss": 0.9725, + "step": 1030 + }, + { + "epoch": 0.25066861171893995, + "grad_norm": 23.625, + "learning_rate": 2.4199001397091894e-06, + "loss": 1.3791, + "step": 1031 + }, + { + "epoch": 0.25091174325309995, + "grad_norm": 19.25, + "learning_rate": 2.4197298412695712e-06, + "loss": 0.9608, + "step": 1032 + }, + { + "epoch": 0.2511548747872599, + "grad_norm": 18.875, + "learning_rate": 2.419559367993626e-06, + "loss": 0.7395, + "step": 1033 + }, + { + "epoch": 0.2513980063214199, + "grad_norm": 22.5, + "learning_rate": 2.4193887199068342e-06, + "loss": 1.0357, + "step": 1034 + }, + { + "epoch": 0.25164113785557984, + "grad_norm": 18.875, + "learning_rate": 2.419217897034703e-06, + "loss": 0.6159, + "step": 1035 + }, + { + "epoch": 0.25188426938973985, + "grad_norm": 19.375, + "learning_rate": 2.4190468994027633e-06, + "loss": 0.8969, + "step": 1036 + }, + { + "epoch": 0.25212740092389985, + "grad_norm": 13.8125, + "learning_rate": 2.4188757270365744e-06, + "loss": 0.5303, + "step": 1037 + }, + { + "epoch": 0.2523705324580598, + "grad_norm": 14.6875, + "learning_rate": 2.418704379961721e-06, + "loss": 0.464, + "step": 1038 + }, + { + "epoch": 0.2526136639922198, + "grad_norm": 22.625, + "learning_rate": 2.418532858203814e-06, + "loss": 1.5141, + "step": 1039 + }, + { + "epoch": 0.2528567955263798, + "grad_norm": 17.25, + "learning_rate": 2.41836116178849e-06, + "loss": 0.8802, + "step": 1040 + }, + { + "epoch": 0.25309992706053974, + "grad_norm": 26.0, + "learning_rate": 2.4181892907414116e-06, + "loss": 1.1084, + "step": 1041 + }, + { + "epoch": 0.25334305859469974, + "grad_norm": 23.375, + "learning_rate": 2.418017245088269e-06, + "loss": 1.1972, + "step": 1042 + }, + { + "epoch": 0.2535861901288597, + "grad_norm": 30.875, + "learning_rate": 2.4178450248547763e-06, + "loss": 0.9698, + "step": 1043 + }, + { + "epoch": 0.2538293216630197, + "grad_norm": 33.5, + "learning_rate": 2.4176726300666757e-06, + "loss": 0.8843, + "step": 1044 + }, + { + "epoch": 0.2540724531971797, + "grad_norm": 18.375, + "learning_rate": 2.417500060749734e-06, + "loss": 0.6135, + "step": 1045 + }, + { + "epoch": 0.25431558473133964, + "grad_norm": 19.0, + "learning_rate": 2.4173273169297446e-06, + "loss": 0.8931, + "step": 1046 + }, + { + "epoch": 0.25455871626549964, + "grad_norm": 16.0, + "learning_rate": 2.4171543986325272e-06, + "loss": 0.7161, + "step": 1047 + }, + { + "epoch": 0.25480184779965964, + "grad_norm": 16.375, + "learning_rate": 2.4169813058839277e-06, + "loss": 0.7568, + "step": 1048 + }, + { + "epoch": 0.2550449793338196, + "grad_norm": 19.0, + "learning_rate": 2.4168080387098175e-06, + "loss": 1.3455, + "step": 1049 + }, + { + "epoch": 0.2552881108679796, + "grad_norm": 18.875, + "learning_rate": 2.4166345971360944e-06, + "loss": 0.7672, + "step": 1050 + }, + { + "epoch": 0.25553124240213954, + "grad_norm": 24.25, + "learning_rate": 2.416460981188682e-06, + "loss": 1.2802, + "step": 1051 + }, + { + "epoch": 0.25577437393629954, + "grad_norm": 27.0, + "learning_rate": 2.4162871908935308e-06, + "loss": 1.5866, + "step": 1052 + }, + { + "epoch": 0.25601750547045954, + "grad_norm": 21.875, + "learning_rate": 2.4161132262766163e-06, + "loss": 0.988, + "step": 1053 + }, + { + "epoch": 0.2562606370046195, + "grad_norm": 59.75, + "learning_rate": 2.415939087363941e-06, + "loss": 1.196, + "step": 1054 + }, + { + "epoch": 0.2565037685387795, + "grad_norm": 17.5, + "learning_rate": 2.415764774181533e-06, + "loss": 0.967, + "step": 1055 + }, + { + "epoch": 0.2567469000729395, + "grad_norm": 17.75, + "learning_rate": 2.415590286755445e-06, + "loss": 0.9417, + "step": 1056 + }, + { + "epoch": 0.25699003160709943, + "grad_norm": 17.25, + "learning_rate": 2.4154156251117584e-06, + "loss": 0.8826, + "step": 1057 + }, + { + "epoch": 0.25723316314125944, + "grad_norm": 20.375, + "learning_rate": 2.4152407892765798e-06, + "loss": 1.0421, + "step": 1058 + }, + { + "epoch": 0.2574762946754194, + "grad_norm": 17.375, + "learning_rate": 2.4150657792760404e-06, + "loss": 0.8625, + "step": 1059 + }, + { + "epoch": 0.2577194262095794, + "grad_norm": 15.625, + "learning_rate": 2.414890595136299e-06, + "loss": 0.755, + "step": 1060 + }, + { + "epoch": 0.2579625577437394, + "grad_norm": 18.875, + "learning_rate": 2.41471523688354e-06, + "loss": 0.953, + "step": 1061 + }, + { + "epoch": 0.25820568927789933, + "grad_norm": 16.5, + "learning_rate": 2.4145397045439734e-06, + "loss": 0.8457, + "step": 1062 + }, + { + "epoch": 0.25844882081205933, + "grad_norm": 16.25, + "learning_rate": 2.4143639981438357e-06, + "loss": 0.7202, + "step": 1063 + }, + { + "epoch": 0.2586919523462193, + "grad_norm": 16.0, + "learning_rate": 2.414188117709389e-06, + "loss": 0.9083, + "step": 1064 + }, + { + "epoch": 0.2589350838803793, + "grad_norm": 25.375, + "learning_rate": 2.4140120632669216e-06, + "loss": 0.9105, + "step": 1065 + }, + { + "epoch": 0.2591782154145393, + "grad_norm": 20.5, + "learning_rate": 2.413835834842749e-06, + "loss": 0.9964, + "step": 1066 + }, + { + "epoch": 0.2594213469486992, + "grad_norm": 17.75, + "learning_rate": 2.4136594324632102e-06, + "loss": 0.9518, + "step": 1067 + }, + { + "epoch": 0.25966447848285923, + "grad_norm": 18.25, + "learning_rate": 2.413482856154672e-06, + "loss": 0.8343, + "step": 1068 + }, + { + "epoch": 0.25990761001701923, + "grad_norm": 17.375, + "learning_rate": 2.413306105943527e-06, + "loss": 0.5186, + "step": 1069 + }, + { + "epoch": 0.2601507415511792, + "grad_norm": 16.25, + "learning_rate": 2.4131291818561937e-06, + "loss": 1.4428, + "step": 1070 + }, + { + "epoch": 0.2603938730853392, + "grad_norm": 16.75, + "learning_rate": 2.4129520839191162e-06, + "loss": 0.5751, + "step": 1071 + }, + { + "epoch": 0.2606370046194991, + "grad_norm": 18.875, + "learning_rate": 2.4127748121587646e-06, + "loss": 0.971, + "step": 1072 + }, + { + "epoch": 0.2608801361536591, + "grad_norm": 29.375, + "learning_rate": 2.412597366601636e-06, + "loss": 1.0751, + "step": 1073 + }, + { + "epoch": 0.2611232676878191, + "grad_norm": 30.375, + "learning_rate": 2.4124197472742516e-06, + "loss": 1.3877, + "step": 1074 + }, + { + "epoch": 0.2613663992219791, + "grad_norm": 22.875, + "learning_rate": 2.4122419542031607e-06, + "loss": 1.0804, + "step": 1075 + }, + { + "epoch": 0.2616095307561391, + "grad_norm": 27.125, + "learning_rate": 2.412063987414937e-06, + "loss": 0.8204, + "step": 1076 + }, + { + "epoch": 0.2618526622902991, + "grad_norm": 19.375, + "learning_rate": 2.4118858469361813e-06, + "loss": 1.0674, + "step": 1077 + }, + { + "epoch": 0.262095793824459, + "grad_norm": 19.75, + "learning_rate": 2.4117075327935186e-06, + "loss": 0.8349, + "step": 1078 + }, + { + "epoch": 0.262338925358619, + "grad_norm": 18.375, + "learning_rate": 2.411529045013602e-06, + "loss": 0.6545, + "step": 1079 + }, + { + "epoch": 0.26258205689277897, + "grad_norm": 17.125, + "learning_rate": 2.4113503836231096e-06, + "loss": 0.9066, + "step": 1080 + }, + { + "epoch": 0.26282518842693897, + "grad_norm": 20.125, + "learning_rate": 2.4111715486487447e-06, + "loss": 0.7835, + "step": 1081 + }, + { + "epoch": 0.263068319961099, + "grad_norm": 16.0, + "learning_rate": 2.4109925401172377e-06, + "loss": 0.6715, + "step": 1082 + }, + { + "epoch": 0.2633114514952589, + "grad_norm": 17.375, + "learning_rate": 2.410813358055345e-06, + "loss": 0.8965, + "step": 1083 + }, + { + "epoch": 0.2635545830294189, + "grad_norm": 38.75, + "learning_rate": 2.4106340024898478e-06, + "loss": 1.6617, + "step": 1084 + }, + { + "epoch": 0.2637977145635789, + "grad_norm": 27.375, + "learning_rate": 2.4104544734475544e-06, + "loss": 1.2628, + "step": 1085 + }, + { + "epoch": 0.26404084609773887, + "grad_norm": 17.5, + "learning_rate": 2.4102747709552975e-06, + "loss": 0.632, + "step": 1086 + }, + { + "epoch": 0.26428397763189887, + "grad_norm": 26.5, + "learning_rate": 2.410094895039938e-06, + "loss": 1.0735, + "step": 1087 + }, + { + "epoch": 0.2645271091660588, + "grad_norm": 23.75, + "learning_rate": 2.4099148457283606e-06, + "loss": 1.2874, + "step": 1088 + }, + { + "epoch": 0.2647702407002188, + "grad_norm": 17.0, + "learning_rate": 2.4097346230474774e-06, + "loss": 0.5789, + "step": 1089 + }, + { + "epoch": 0.2650133722343788, + "grad_norm": 16.125, + "learning_rate": 2.409554227024225e-06, + "loss": 0.9835, + "step": 1090 + }, + { + "epoch": 0.26525650376853876, + "grad_norm": 20.625, + "learning_rate": 2.4093736576855675e-06, + "loss": 0.8613, + "step": 1091 + }, + { + "epoch": 0.26549963530269877, + "grad_norm": 15.625, + "learning_rate": 2.4091929150584935e-06, + "loss": 0.7219, + "step": 1092 + }, + { + "epoch": 0.26574276683685877, + "grad_norm": 22.25, + "learning_rate": 2.4090119991700187e-06, + "loss": 1.1011, + "step": 1093 + }, + { + "epoch": 0.2659858983710187, + "grad_norm": 19.625, + "learning_rate": 2.408830910047184e-06, + "loss": 0.727, + "step": 1094 + }, + { + "epoch": 0.2662290299051787, + "grad_norm": 18.5, + "learning_rate": 2.4086496477170556e-06, + "loss": 0.9894, + "step": 1095 + }, + { + "epoch": 0.26647216143933866, + "grad_norm": 17.125, + "learning_rate": 2.408468212206727e-06, + "loss": 0.593, + "step": 1096 + }, + { + "epoch": 0.26671529297349866, + "grad_norm": 15.5625, + "learning_rate": 2.4082866035433167e-06, + "loss": 0.6468, + "step": 1097 + }, + { + "epoch": 0.26695842450765866, + "grad_norm": 20.875, + "learning_rate": 2.4081048217539693e-06, + "loss": 0.8832, + "step": 1098 + }, + { + "epoch": 0.2672015560418186, + "grad_norm": 18.5, + "learning_rate": 2.407922866865855e-06, + "loss": 1.0391, + "step": 1099 + }, + { + "epoch": 0.2674446875759786, + "grad_norm": 18.875, + "learning_rate": 2.4077407389061703e-06, + "loss": 0.9531, + "step": 1100 + }, + { + "epoch": 0.26768781911013856, + "grad_norm": 19.625, + "learning_rate": 2.407558437902137e-06, + "loss": 1.1267, + "step": 1101 + }, + { + "epoch": 0.26793095064429856, + "grad_norm": 15.6875, + "learning_rate": 2.4073759638810034e-06, + "loss": 0.4987, + "step": 1102 + }, + { + "epoch": 0.26817408217845856, + "grad_norm": 18.375, + "learning_rate": 2.407193316870044e-06, + "loss": 0.6768, + "step": 1103 + }, + { + "epoch": 0.2684172137126185, + "grad_norm": 21.875, + "learning_rate": 2.4070104968965572e-06, + "loss": 0.9036, + "step": 1104 + }, + { + "epoch": 0.2686603452467785, + "grad_norm": 20.625, + "learning_rate": 2.40682750398787e-06, + "loss": 1.1075, + "step": 1105 + }, + { + "epoch": 0.2689034767809385, + "grad_norm": 19.5, + "learning_rate": 2.4066443381713332e-06, + "loss": 0.6668, + "step": 1106 + }, + { + "epoch": 0.26914660831509846, + "grad_norm": 19.625, + "learning_rate": 2.406460999474324e-06, + "loss": 1.027, + "step": 1107 + }, + { + "epoch": 0.26938973984925846, + "grad_norm": 28.125, + "learning_rate": 2.4062774879242454e-06, + "loss": 1.557, + "step": 1108 + }, + { + "epoch": 0.2696328713834184, + "grad_norm": 25.25, + "learning_rate": 2.406093803548527e-06, + "loss": 1.5739, + "step": 1109 + }, + { + "epoch": 0.2698760029175784, + "grad_norm": 15.0, + "learning_rate": 2.4059099463746228e-06, + "loss": 0.4134, + "step": 1110 + }, + { + "epoch": 0.2701191344517384, + "grad_norm": 16.75, + "learning_rate": 2.405725916430014e-06, + "loss": 0.5757, + "step": 1111 + }, + { + "epoch": 0.27036226598589835, + "grad_norm": 22.125, + "learning_rate": 2.4055417137422072e-06, + "loss": 1.015, + "step": 1112 + }, + { + "epoch": 0.27060539752005836, + "grad_norm": 22.625, + "learning_rate": 2.405357338338734e-06, + "loss": 0.7689, + "step": 1113 + }, + { + "epoch": 0.27084852905421836, + "grad_norm": 19.375, + "learning_rate": 2.4051727902471532e-06, + "loss": 1.0271, + "step": 1114 + }, + { + "epoch": 0.2710916605883783, + "grad_norm": 17.625, + "learning_rate": 2.4049880694950485e-06, + "loss": 0.9973, + "step": 1115 + }, + { + "epoch": 0.2713347921225383, + "grad_norm": 17.75, + "learning_rate": 2.4048031761100286e-06, + "loss": 0.9478, + "step": 1116 + }, + { + "epoch": 0.27157792365669825, + "grad_norm": 17.0, + "learning_rate": 2.4046181101197307e-06, + "loss": 0.7035, + "step": 1117 + }, + { + "epoch": 0.27182105519085825, + "grad_norm": 17.625, + "learning_rate": 2.4044328715518154e-06, + "loss": 0.999, + "step": 1118 + }, + { + "epoch": 0.27206418672501825, + "grad_norm": 20.75, + "learning_rate": 2.4042474604339693e-06, + "loss": 0.8228, + "step": 1119 + }, + { + "epoch": 0.2723073182591782, + "grad_norm": 17.875, + "learning_rate": 2.404061876793906e-06, + "loss": 0.8087, + "step": 1120 + }, + { + "epoch": 0.2725504497933382, + "grad_norm": 15.9375, + "learning_rate": 2.4038761206593636e-06, + "loss": 0.4776, + "step": 1121 + }, + { + "epoch": 0.2727935813274982, + "grad_norm": 12.5625, + "learning_rate": 2.403690192058107e-06, + "loss": 0.4574, + "step": 1122 + }, + { + "epoch": 0.27303671286165815, + "grad_norm": 24.375, + "learning_rate": 2.4035040910179262e-06, + "loss": 1.0786, + "step": 1123 + }, + { + "epoch": 0.27327984439581815, + "grad_norm": 32.75, + "learning_rate": 2.403317817566637e-06, + "loss": 1.121, + "step": 1124 + }, + { + "epoch": 0.2735229759299781, + "grad_norm": 19.0, + "learning_rate": 2.403131371732082e-06, + "loss": 1.0104, + "step": 1125 + }, + { + "epoch": 0.2737661074641381, + "grad_norm": 17.0, + "learning_rate": 2.402944753542128e-06, + "loss": 0.5847, + "step": 1126 + }, + { + "epoch": 0.2740092389982981, + "grad_norm": 17.125, + "learning_rate": 2.4027579630246683e-06, + "loss": 0.819, + "step": 1127 + }, + { + "epoch": 0.27425237053245805, + "grad_norm": 21.875, + "learning_rate": 2.4025710002076225e-06, + "loss": 1.0926, + "step": 1128 + }, + { + "epoch": 0.27449550206661805, + "grad_norm": 19.0, + "learning_rate": 2.402383865118935e-06, + "loss": 1.0038, + "step": 1129 + }, + { + "epoch": 0.27473863360077805, + "grad_norm": 15.3125, + "learning_rate": 2.402196557786577e-06, + "loss": 1.035, + "step": 1130 + }, + { + "epoch": 0.274981765134938, + "grad_norm": 17.125, + "learning_rate": 2.4020090782385437e-06, + "loss": 0.8398, + "step": 1131 + }, + { + "epoch": 0.275224896669098, + "grad_norm": 16.75, + "learning_rate": 2.4018214265028577e-06, + "loss": 0.6444, + "step": 1132 + }, + { + "epoch": 0.27546802820325794, + "grad_norm": 18.75, + "learning_rate": 2.401633602607567e-06, + "loss": 0.7797, + "step": 1133 + }, + { + "epoch": 0.27571115973741794, + "grad_norm": 22.125, + "learning_rate": 2.4014456065807457e-06, + "loss": 1.0577, + "step": 1134 + }, + { + "epoch": 0.27595429127157795, + "grad_norm": 27.0, + "learning_rate": 2.4012574384504917e-06, + "loss": 1.0948, + "step": 1135 + }, + { + "epoch": 0.2761974228057379, + "grad_norm": 30.875, + "learning_rate": 2.4010690982449307e-06, + "loss": 0.9635, + "step": 1136 + }, + { + "epoch": 0.2764405543398979, + "grad_norm": 20.375, + "learning_rate": 2.400880585992213e-06, + "loss": 1.0681, + "step": 1137 + }, + { + "epoch": 0.27668368587405784, + "grad_norm": 20.5, + "learning_rate": 2.4006919017205158e-06, + "loss": 0.85, + "step": 1138 + }, + { + "epoch": 0.27692681740821784, + "grad_norm": 23.5, + "learning_rate": 2.4005030454580403e-06, + "loss": 1.1541, + "step": 1139 + }, + { + "epoch": 0.27716994894237784, + "grad_norm": 26.625, + "learning_rate": 2.4003140172330154e-06, + "loss": 0.8538, + "step": 1140 + }, + { + "epoch": 0.2774130804765378, + "grad_norm": 17.875, + "learning_rate": 2.4001248170736934e-06, + "loss": 1.057, + "step": 1141 + }, + { + "epoch": 0.2776562120106978, + "grad_norm": 19.0, + "learning_rate": 2.3999354450083545e-06, + "loss": 1.1297, + "step": 1142 + }, + { + "epoch": 0.2778993435448578, + "grad_norm": 19.0, + "learning_rate": 2.3997459010653033e-06, + "loss": 1.4319, + "step": 1143 + }, + { + "epoch": 0.27814247507901774, + "grad_norm": 13.0625, + "learning_rate": 2.39955618527287e-06, + "loss": 0.5, + "step": 1144 + }, + { + "epoch": 0.27838560661317774, + "grad_norm": 20.75, + "learning_rate": 2.3993662976594116e-06, + "loss": 0.9077, + "step": 1145 + }, + { + "epoch": 0.2786287381473377, + "grad_norm": 20.0, + "learning_rate": 2.3991762382533097e-06, + "loss": 1.0673, + "step": 1146 + }, + { + "epoch": 0.2788718696814977, + "grad_norm": 23.0, + "learning_rate": 2.3989860070829724e-06, + "loss": 1.1778, + "step": 1147 + }, + { + "epoch": 0.2791150012156577, + "grad_norm": 26.25, + "learning_rate": 2.3987956041768325e-06, + "loss": 0.9862, + "step": 1148 + }, + { + "epoch": 0.27935813274981763, + "grad_norm": 20.875, + "learning_rate": 2.3986050295633486e-06, + "loss": 0.8852, + "step": 1149 + }, + { + "epoch": 0.27960126428397764, + "grad_norm": 14.375, + "learning_rate": 2.3984142832710065e-06, + "loss": 0.3197, + "step": 1150 + }, + { + "epoch": 0.27984439581813764, + "grad_norm": 18.0, + "learning_rate": 2.3982233653283156e-06, + "loss": 1.1707, + "step": 1151 + }, + { + "epoch": 0.2800875273522976, + "grad_norm": 18.125, + "learning_rate": 2.3980322757638124e-06, + "loss": 1.2708, + "step": 1152 + }, + { + "epoch": 0.2803306588864576, + "grad_norm": 16.125, + "learning_rate": 2.397841014606059e-06, + "loss": 1.1025, + "step": 1153 + }, + { + "epoch": 0.28057379042061753, + "grad_norm": 17.875, + "learning_rate": 2.3976495818836408e-06, + "loss": 1.0117, + "step": 1154 + }, + { + "epoch": 0.28081692195477753, + "grad_norm": 15.625, + "learning_rate": 2.397457977625173e-06, + "loss": 0.7081, + "step": 1155 + }, + { + "epoch": 0.28106005348893753, + "grad_norm": 18.75, + "learning_rate": 2.397266201859293e-06, + "loss": 0.9149, + "step": 1156 + }, + { + "epoch": 0.2813031850230975, + "grad_norm": 15.25, + "learning_rate": 2.3970742546146646e-06, + "loss": 0.5956, + "step": 1157 + }, + { + "epoch": 0.2815463165572575, + "grad_norm": 22.75, + "learning_rate": 2.396882135919979e-06, + "loss": 1.4236, + "step": 1158 + }, + { + "epoch": 0.2817894480914175, + "grad_norm": 19.875, + "learning_rate": 2.3966898458039505e-06, + "loss": 0.8948, + "step": 1159 + }, + { + "epoch": 0.28203257962557743, + "grad_norm": 20.375, + "learning_rate": 2.3964973842953202e-06, + "loss": 1.0794, + "step": 1160 + }, + { + "epoch": 0.28227571115973743, + "grad_norm": 23.5, + "learning_rate": 2.3963047514228556e-06, + "loss": 1.3466, + "step": 1161 + }, + { + "epoch": 0.2825188426938974, + "grad_norm": 18.375, + "learning_rate": 2.3961119472153484e-06, + "loss": 0.7326, + "step": 1162 + }, + { + "epoch": 0.2827619742280574, + "grad_norm": 17.125, + "learning_rate": 2.395918971701616e-06, + "loss": 0.9474, + "step": 1163 + }, + { + "epoch": 0.2830051057622174, + "grad_norm": 20.375, + "learning_rate": 2.3957258249105035e-06, + "loss": 0.859, + "step": 1164 + }, + { + "epoch": 0.2832482372963773, + "grad_norm": 17.625, + "learning_rate": 2.3955325068708788e-06, + "loss": 0.7619, + "step": 1165 + }, + { + "epoch": 0.2834913688305373, + "grad_norm": 38.25, + "learning_rate": 2.395339017611637e-06, + "loss": 0.9685, + "step": 1166 + }, + { + "epoch": 0.28373450036469733, + "grad_norm": 21.0, + "learning_rate": 2.395145357161698e-06, + "loss": 0.9813, + "step": 1167 + }, + { + "epoch": 0.2839776318988573, + "grad_norm": 24.75, + "learning_rate": 2.3949515255500083e-06, + "loss": 1.2902, + "step": 1168 + }, + { + "epoch": 0.2842207634330173, + "grad_norm": 18.0, + "learning_rate": 2.394757522805539e-06, + "loss": 1.0547, + "step": 1169 + }, + { + "epoch": 0.2844638949671772, + "grad_norm": 20.125, + "learning_rate": 2.3945633489572874e-06, + "loss": 1.0345, + "step": 1170 + }, + { + "epoch": 0.2847070265013372, + "grad_norm": 19.0, + "learning_rate": 2.394369004034276e-06, + "loss": 1.0358, + "step": 1171 + }, + { + "epoch": 0.2849501580354972, + "grad_norm": 24.375, + "learning_rate": 2.394174488065553e-06, + "loss": 1.0853, + "step": 1172 + }, + { + "epoch": 0.28519328956965717, + "grad_norm": 16.25, + "learning_rate": 2.3939798010801918e-06, + "loss": 0.5782, + "step": 1173 + }, + { + "epoch": 0.2854364211038172, + "grad_norm": 20.25, + "learning_rate": 2.3937849431072924e-06, + "loss": 1.1041, + "step": 1174 + }, + { + "epoch": 0.2856795526379771, + "grad_norm": 20.0, + "learning_rate": 2.3935899141759794e-06, + "loss": 0.7543, + "step": 1175 + }, + { + "epoch": 0.2859226841721371, + "grad_norm": 17.125, + "learning_rate": 2.3933947143154036e-06, + "loss": 0.7341, + "step": 1176 + }, + { + "epoch": 0.2861658157062971, + "grad_norm": 15.9375, + "learning_rate": 2.39319934355474e-06, + "loss": 0.7032, + "step": 1177 + }, + { + "epoch": 0.28640894724045707, + "grad_norm": 28.75, + "learning_rate": 2.393003801923191e-06, + "loss": 1.172, + "step": 1178 + }, + { + "epoch": 0.28665207877461707, + "grad_norm": 18.25, + "learning_rate": 2.3928080894499835e-06, + "loss": 1.2408, + "step": 1179 + }, + { + "epoch": 0.28689521030877707, + "grad_norm": 23.0, + "learning_rate": 2.3926122061643703e-06, + "loss": 0.8347, + "step": 1180 + }, + { + "epoch": 0.287138341842937, + "grad_norm": 24.25, + "learning_rate": 2.392416152095629e-06, + "loss": 1.0152, + "step": 1181 + }, + { + "epoch": 0.287381473377097, + "grad_norm": 24.0, + "learning_rate": 2.3922199272730632e-06, + "loss": 1.8937, + "step": 1182 + }, + { + "epoch": 0.28762460491125696, + "grad_norm": 15.5, + "learning_rate": 2.392023531726003e-06, + "loss": 0.7869, + "step": 1183 + }, + { + "epoch": 0.28786773644541697, + "grad_norm": 16.125, + "learning_rate": 2.3918269654838028e-06, + "loss": 0.7737, + "step": 1184 + }, + { + "epoch": 0.28811086797957697, + "grad_norm": 20.75, + "learning_rate": 2.391630228575842e-06, + "loss": 0.7496, + "step": 1185 + }, + { + "epoch": 0.2883539995137369, + "grad_norm": 20.125, + "learning_rate": 2.391433321031527e-06, + "loss": 0.8947, + "step": 1186 + }, + { + "epoch": 0.2885971310478969, + "grad_norm": 16.75, + "learning_rate": 2.391236242880289e-06, + "loss": 0.7558, + "step": 1187 + }, + { + "epoch": 0.2888402625820569, + "grad_norm": 17.875, + "learning_rate": 2.391038994151585e-06, + "loss": 0.773, + "step": 1188 + }, + { + "epoch": 0.28908339411621686, + "grad_norm": 27.875, + "learning_rate": 2.3908415748748964e-06, + "loss": 1.1688, + "step": 1189 + }, + { + "epoch": 0.28932652565037686, + "grad_norm": 19.375, + "learning_rate": 2.390643985079732e-06, + "loss": 0.9692, + "step": 1190 + }, + { + "epoch": 0.2895696571845368, + "grad_norm": 19.25, + "learning_rate": 2.390446224795624e-06, + "loss": 0.8757, + "step": 1191 + }, + { + "epoch": 0.2898127887186968, + "grad_norm": 24.25, + "learning_rate": 2.3902482940521316e-06, + "loss": 0.9955, + "step": 1192 + }, + { + "epoch": 0.2900559202528568, + "grad_norm": 22.625, + "learning_rate": 2.3900501928788386e-06, + "loss": 0.9887, + "step": 1193 + }, + { + "epoch": 0.29029905178701676, + "grad_norm": 20.0, + "learning_rate": 2.389851921305355e-06, + "loss": 0.9916, + "step": 1194 + }, + { + "epoch": 0.29054218332117676, + "grad_norm": 22.5, + "learning_rate": 2.3896534793613164e-06, + "loss": 1.0105, + "step": 1195 + }, + { + "epoch": 0.29078531485533676, + "grad_norm": 20.625, + "learning_rate": 2.3894548670763825e-06, + "loss": 0.8992, + "step": 1196 + }, + { + "epoch": 0.2910284463894967, + "grad_norm": 20.25, + "learning_rate": 2.3892560844802394e-06, + "loss": 1.1296, + "step": 1197 + }, + { + "epoch": 0.2912715779236567, + "grad_norm": 15.9375, + "learning_rate": 2.389057131602599e-06, + "loss": 0.8563, + "step": 1198 + }, + { + "epoch": 0.29151470945781666, + "grad_norm": 22.125, + "learning_rate": 2.388858008473198e-06, + "loss": 0.8952, + "step": 1199 + }, + { + "epoch": 0.29175784099197666, + "grad_norm": 21.0, + "learning_rate": 2.3886587151217986e-06, + "loss": 0.8936, + "step": 1200 + }, + { + "epoch": 0.29200097252613666, + "grad_norm": 16.125, + "learning_rate": 2.3884592515781895e-06, + "loss": 0.6503, + "step": 1201 + }, + { + "epoch": 0.2922441040602966, + "grad_norm": 21.5, + "learning_rate": 2.3882596178721835e-06, + "loss": 1.1833, + "step": 1202 + }, + { + "epoch": 0.2924872355944566, + "grad_norm": 17.625, + "learning_rate": 2.3880598140336185e-06, + "loss": 0.9162, + "step": 1203 + }, + { + "epoch": 0.2927303671286166, + "grad_norm": 17.0, + "learning_rate": 2.3878598400923597e-06, + "loss": 0.8729, + "step": 1204 + }, + { + "epoch": 0.29297349866277655, + "grad_norm": 21.125, + "learning_rate": 2.3876596960782967e-06, + "loss": 1.2932, + "step": 1205 + }, + { + "epoch": 0.29321663019693656, + "grad_norm": 23.375, + "learning_rate": 2.3874593820213434e-06, + "loss": 1.1353, + "step": 1206 + }, + { + "epoch": 0.2934597617310965, + "grad_norm": 20.75, + "learning_rate": 2.387258897951441e-06, + "loss": 0.6816, + "step": 1207 + }, + { + "epoch": 0.2937028932652565, + "grad_norm": 16.875, + "learning_rate": 2.3870582438985552e-06, + "loss": 0.662, + "step": 1208 + }, + { + "epoch": 0.2939460247994165, + "grad_norm": 16.25, + "learning_rate": 2.386857419892677e-06, + "loss": 0.8001, + "step": 1209 + }, + { + "epoch": 0.29418915633357645, + "grad_norm": 23.5, + "learning_rate": 2.3866564259638237e-06, + "loss": 1.026, + "step": 1210 + }, + { + "epoch": 0.29443228786773645, + "grad_norm": 20.625, + "learning_rate": 2.3864552621420365e-06, + "loss": 0.8811, + "step": 1211 + }, + { + "epoch": 0.2946754194018964, + "grad_norm": 15.0625, + "learning_rate": 2.386253928457383e-06, + "loss": 0.7637, + "step": 1212 + }, + { + "epoch": 0.2949185509360564, + "grad_norm": 18.125, + "learning_rate": 2.3860524249399564e-06, + "loss": 0.7777, + "step": 1213 + }, + { + "epoch": 0.2951616824702164, + "grad_norm": 26.375, + "learning_rate": 2.385850751619874e-06, + "loss": 1.4308, + "step": 1214 + }, + { + "epoch": 0.29540481400437635, + "grad_norm": 16.0, + "learning_rate": 2.3856489085272806e-06, + "loss": 0.67, + "step": 1215 + }, + { + "epoch": 0.29564794553853635, + "grad_norm": 16.875, + "learning_rate": 2.3854468956923444e-06, + "loss": 0.6661, + "step": 1216 + }, + { + "epoch": 0.29589107707269635, + "grad_norm": 19.375, + "learning_rate": 2.3852447131452593e-06, + "loss": 0.992, + "step": 1217 + }, + { + "epoch": 0.2961342086068563, + "grad_norm": 15.5, + "learning_rate": 2.385042360916246e-06, + "loss": 0.6511, + "step": 1218 + }, + { + "epoch": 0.2963773401410163, + "grad_norm": 23.125, + "learning_rate": 2.384839839035549e-06, + "loss": 0.8863, + "step": 1219 + }, + { + "epoch": 0.29662047167517624, + "grad_norm": 26.25, + "learning_rate": 2.3846371475334382e-06, + "loss": 1.0645, + "step": 1220 + }, + { + "epoch": 0.29686360320933625, + "grad_norm": 21.0, + "learning_rate": 2.3844342864402103e-06, + "loss": 1.3082, + "step": 1221 + }, + { + "epoch": 0.29710673474349625, + "grad_norm": 17.125, + "learning_rate": 2.3842312557861854e-06, + "loss": 1.1422, + "step": 1222 + }, + { + "epoch": 0.2973498662776562, + "grad_norm": 16.5, + "learning_rate": 2.384028055601711e-06, + "loss": 0.6125, + "step": 1223 + }, + { + "epoch": 0.2975929978118162, + "grad_norm": 23.125, + "learning_rate": 2.3838246859171584e-06, + "loss": 1.024, + "step": 1224 + }, + { + "epoch": 0.2978361293459762, + "grad_norm": 17.625, + "learning_rate": 2.383621146762924e-06, + "loss": 0.8368, + "step": 1225 + }, + { + "epoch": 0.29807926088013614, + "grad_norm": 17.375, + "learning_rate": 2.3834174381694314e-06, + "loss": 0.8363, + "step": 1226 + }, + { + "epoch": 0.29832239241429614, + "grad_norm": 20.25, + "learning_rate": 2.383213560167128e-06, + "loss": 1.2673, + "step": 1227 + }, + { + "epoch": 0.2985655239484561, + "grad_norm": 29.25, + "learning_rate": 2.3830095127864867e-06, + "loss": 1.0421, + "step": 1228 + }, + { + "epoch": 0.2988086554826161, + "grad_norm": 24.125, + "learning_rate": 2.3828052960580057e-06, + "loss": 0.8843, + "step": 1229 + }, + { + "epoch": 0.2990517870167761, + "grad_norm": 23.625, + "learning_rate": 2.3826009100122087e-06, + "loss": 0.9703, + "step": 1230 + }, + { + "epoch": 0.29929491855093604, + "grad_norm": 21.0, + "learning_rate": 2.3823963546796456e-06, + "loss": 1.1192, + "step": 1231 + }, + { + "epoch": 0.29953805008509604, + "grad_norm": 22.125, + "learning_rate": 2.38219163009089e-06, + "loss": 0.6999, + "step": 1232 + }, + { + "epoch": 0.29978118161925604, + "grad_norm": 25.25, + "learning_rate": 2.381986736276542e-06, + "loss": 1.0703, + "step": 1233 + }, + { + "epoch": 0.300024313153416, + "grad_norm": 22.0, + "learning_rate": 2.3817816732672255e-06, + "loss": 0.9507, + "step": 1234 + }, + { + "epoch": 0.300267444687576, + "grad_norm": 18.5, + "learning_rate": 2.3815764410935914e-06, + "loss": 0.681, + "step": 1235 + }, + { + "epoch": 0.30051057622173594, + "grad_norm": 18.25, + "learning_rate": 2.3813710397863158e-06, + "loss": 0.7719, + "step": 1236 + }, + { + "epoch": 0.30075370775589594, + "grad_norm": 21.75, + "learning_rate": 2.3811654693760985e-06, + "loss": 0.9225, + "step": 1237 + }, + { + "epoch": 0.30099683929005594, + "grad_norm": 19.25, + "learning_rate": 2.3809597298936656e-06, + "loss": 0.8943, + "step": 1238 + }, + { + "epoch": 0.3012399708242159, + "grad_norm": 16.375, + "learning_rate": 2.380753821369769e-06, + "loss": 0.624, + "step": 1239 + }, + { + "epoch": 0.3014831023583759, + "grad_norm": 16.375, + "learning_rate": 2.380547743835185e-06, + "loss": 0.4559, + "step": 1240 + }, + { + "epoch": 0.3017262338925359, + "grad_norm": 16.125, + "learning_rate": 2.3803414973207154e-06, + "loss": 0.5842, + "step": 1241 + }, + { + "epoch": 0.30196936542669583, + "grad_norm": 17.375, + "learning_rate": 2.3801350818571876e-06, + "loss": 0.9134, + "step": 1242 + }, + { + "epoch": 0.30221249696085584, + "grad_norm": 20.0, + "learning_rate": 2.3799284974754534e-06, + "loss": 1.0343, + "step": 1243 + }, + { + "epoch": 0.3024556284950158, + "grad_norm": 20.5, + "learning_rate": 2.379721744206391e-06, + "loss": 0.8462, + "step": 1244 + }, + { + "epoch": 0.3026987600291758, + "grad_norm": 16.625, + "learning_rate": 2.3795148220809027e-06, + "loss": 0.6042, + "step": 1245 + }, + { + "epoch": 0.3029418915633358, + "grad_norm": 16.625, + "learning_rate": 2.379307731129917e-06, + "loss": 0.683, + "step": 1246 + }, + { + "epoch": 0.30318502309749573, + "grad_norm": 17.625, + "learning_rate": 2.379100471384387e-06, + "loss": 0.5934, + "step": 1247 + }, + { + "epoch": 0.30342815463165573, + "grad_norm": 17.25, + "learning_rate": 2.3788930428752914e-06, + "loss": 0.9201, + "step": 1248 + }, + { + "epoch": 0.3036712861658157, + "grad_norm": 23.625, + "learning_rate": 2.378685445633634e-06, + "loss": 1.0846, + "step": 1249 + }, + { + "epoch": 0.3039144176999757, + "grad_norm": 20.875, + "learning_rate": 2.378477679690443e-06, + "loss": 0.904, + "step": 1250 + }, + { + "epoch": 0.3041575492341357, + "grad_norm": 22.875, + "learning_rate": 2.378269745076774e-06, + "loss": 1.0509, + "step": 1251 + }, + { + "epoch": 0.30440068076829563, + "grad_norm": 21.625, + "learning_rate": 2.378061641823705e-06, + "loss": 0.9499, + "step": 1252 + }, + { + "epoch": 0.30464381230245563, + "grad_norm": 19.125, + "learning_rate": 2.377853369962342e-06, + "loss": 1.0333, + "step": 1253 + }, + { + "epoch": 0.30488694383661563, + "grad_norm": 23.5, + "learning_rate": 2.3776449295238142e-06, + "loss": 0.8748, + "step": 1254 + }, + { + "epoch": 0.3051300753707756, + "grad_norm": 26.125, + "learning_rate": 2.377436320539276e-06, + "loss": 0.8633, + "step": 1255 + }, + { + "epoch": 0.3053732069049356, + "grad_norm": 25.75, + "learning_rate": 2.3772275430399087e-06, + "loss": 1.3877, + "step": 1256 + }, + { + "epoch": 0.3056163384390955, + "grad_norm": 26.0, + "learning_rate": 2.377018597056917e-06, + "loss": 0.8743, + "step": 1257 + }, + { + "epoch": 0.3058594699732555, + "grad_norm": 25.5, + "learning_rate": 2.3768094826215317e-06, + "loss": 0.9658, + "step": 1258 + }, + { + "epoch": 0.30610260150741553, + "grad_norm": 30.875, + "learning_rate": 2.3766001997650086e-06, + "loss": 0.8682, + "step": 1259 + }, + { + "epoch": 0.3063457330415755, + "grad_norm": 22.25, + "learning_rate": 2.3763907485186287e-06, + "loss": 0.8051, + "step": 1260 + }, + { + "epoch": 0.3065888645757355, + "grad_norm": 24.0, + "learning_rate": 2.3761811289136978e-06, + "loss": 1.1045, + "step": 1261 + }, + { + "epoch": 0.3068319961098955, + "grad_norm": 13.5625, + "learning_rate": 2.3759713409815473e-06, + "loss": 0.7073, + "step": 1262 + }, + { + "epoch": 0.3070751276440554, + "grad_norm": 21.125, + "learning_rate": 2.375761384753534e-06, + "loss": 1.251, + "step": 1263 + }, + { + "epoch": 0.3073182591782154, + "grad_norm": 17.625, + "learning_rate": 2.3755512602610386e-06, + "loss": 0.9009, + "step": 1264 + }, + { + "epoch": 0.30756139071237537, + "grad_norm": 25.5, + "learning_rate": 2.375340967535469e-06, + "loss": 0.9195, + "step": 1265 + }, + { + "epoch": 0.3078045222465354, + "grad_norm": 18.25, + "learning_rate": 2.3751305066082563e-06, + "loss": 1.0106, + "step": 1266 + }, + { + "epoch": 0.3080476537806954, + "grad_norm": 22.25, + "learning_rate": 2.3749198775108578e-06, + "loss": 1.1506, + "step": 1267 + }, + { + "epoch": 0.3082907853148553, + "grad_norm": 17.375, + "learning_rate": 2.3747090802747553e-06, + "loss": 1.0442, + "step": 1268 + }, + { + "epoch": 0.3085339168490153, + "grad_norm": 17.0, + "learning_rate": 2.3744981149314567e-06, + "loss": 0.6376, + "step": 1269 + }, + { + "epoch": 0.3087770483831753, + "grad_norm": 17.625, + "learning_rate": 2.3742869815124934e-06, + "loss": 1.0133, + "step": 1270 + }, + { + "epoch": 0.30902017991733527, + "grad_norm": 20.5, + "learning_rate": 2.374075680049424e-06, + "loss": 1.0628, + "step": 1271 + }, + { + "epoch": 0.30926331145149527, + "grad_norm": 17.125, + "learning_rate": 2.373864210573831e-06, + "loss": 0.9332, + "step": 1272 + }, + { + "epoch": 0.3095064429856552, + "grad_norm": 19.25, + "learning_rate": 2.3736525731173217e-06, + "loss": 0.9674, + "step": 1273 + }, + { + "epoch": 0.3097495745198152, + "grad_norm": 17.75, + "learning_rate": 2.3734407677115295e-06, + "loss": 0.8424, + "step": 1274 + }, + { + "epoch": 0.3099927060539752, + "grad_norm": 19.625, + "learning_rate": 2.3732287943881114e-06, + "loss": 0.695, + "step": 1275 + }, + { + "epoch": 0.31023583758813517, + "grad_norm": 19.0, + "learning_rate": 2.373016653178752e-06, + "loss": 1.032, + "step": 1276 + }, + { + "epoch": 0.31047896912229517, + "grad_norm": 14.9375, + "learning_rate": 2.3728043441151584e-06, + "loss": 0.6565, + "step": 1277 + }, + { + "epoch": 0.31072210065645517, + "grad_norm": 18.5, + "learning_rate": 2.3725918672290637e-06, + "loss": 0.8705, + "step": 1278 + }, + { + "epoch": 0.3109652321906151, + "grad_norm": 14.8125, + "learning_rate": 2.3723792225522267e-06, + "loss": 0.4942, + "step": 1279 + }, + { + "epoch": 0.3112083637247751, + "grad_norm": 25.125, + "learning_rate": 2.372166410116431e-06, + "loss": 1.1596, + "step": 1280 + }, + { + "epoch": 0.31145149525893506, + "grad_norm": 24.625, + "learning_rate": 2.3719534299534845e-06, + "loss": 0.9487, + "step": 1281 + }, + { + "epoch": 0.31169462679309506, + "grad_norm": 19.125, + "learning_rate": 2.3717402820952212e-06, + "loss": 1.0082, + "step": 1282 + }, + { + "epoch": 0.31193775832725507, + "grad_norm": 18.875, + "learning_rate": 2.3715269665734996e-06, + "loss": 1.0386, + "step": 1283 + }, + { + "epoch": 0.312180889861415, + "grad_norm": 18.375, + "learning_rate": 2.3713134834202033e-06, + "loss": 1.0792, + "step": 1284 + }, + { + "epoch": 0.312424021395575, + "grad_norm": 21.0, + "learning_rate": 2.371099832667241e-06, + "loss": 0.809, + "step": 1285 + }, + { + "epoch": 0.31266715292973496, + "grad_norm": 20.875, + "learning_rate": 2.3708860143465473e-06, + "loss": 0.7824, + "step": 1286 + }, + { + "epoch": 0.31291028446389496, + "grad_norm": 27.625, + "learning_rate": 2.37067202849008e-06, + "loss": 1.1836, + "step": 1287 + }, + { + "epoch": 0.31315341599805496, + "grad_norm": 18.0, + "learning_rate": 2.3704578751298237e-06, + "loss": 0.7504, + "step": 1288 + }, + { + "epoch": 0.3133965475322149, + "grad_norm": 17.875, + "learning_rate": 2.3702435542977863e-06, + "loss": 0.7071, + "step": 1289 + }, + { + "epoch": 0.3136396790663749, + "grad_norm": 18.75, + "learning_rate": 2.3700290660260026e-06, + "loss": 0.8358, + "step": 1290 + }, + { + "epoch": 0.3138828106005349, + "grad_norm": 20.875, + "learning_rate": 2.369814410346532e-06, + "loss": 1.0616, + "step": 1291 + }, + { + "epoch": 0.31412594213469486, + "grad_norm": 21.0, + "learning_rate": 2.3695995872914573e-06, + "loss": 1.0078, + "step": 1292 + }, + { + "epoch": 0.31436907366885486, + "grad_norm": 22.375, + "learning_rate": 2.3693845968928885e-06, + "loss": 1.2884, + "step": 1293 + }, + { + "epoch": 0.3146122052030148, + "grad_norm": 25.5, + "learning_rate": 2.369169439182959e-06, + "loss": 1.3012, + "step": 1294 + }, + { + "epoch": 0.3148553367371748, + "grad_norm": 20.25, + "learning_rate": 2.368954114193828e-06, + "loss": 1.1192, + "step": 1295 + }, + { + "epoch": 0.3150984682713348, + "grad_norm": 19.25, + "learning_rate": 2.36873862195768e-06, + "loss": 1.0548, + "step": 1296 + }, + { + "epoch": 0.31534159980549475, + "grad_norm": 21.25, + "learning_rate": 2.3685229625067234e-06, + "loss": 1.0982, + "step": 1297 + }, + { + "epoch": 0.31558473133965476, + "grad_norm": 19.75, + "learning_rate": 2.3683071358731923e-06, + "loss": 0.912, + "step": 1298 + }, + { + "epoch": 0.31582786287381476, + "grad_norm": 22.375, + "learning_rate": 2.3680911420893464e-06, + "loss": 0.9697, + "step": 1299 + }, + { + "epoch": 0.3160709944079747, + "grad_norm": 17.625, + "learning_rate": 2.367874981187469e-06, + "loss": 0.654, + "step": 1300 + }, + { + "epoch": 0.3163141259421347, + "grad_norm": 17.5, + "learning_rate": 2.367658653199869e-06, + "loss": 0.854, + "step": 1301 + }, + { + "epoch": 0.31655725747629465, + "grad_norm": 15.5, + "learning_rate": 2.367442158158881e-06, + "loss": 0.5738, + "step": 1302 + }, + { + "epoch": 0.31680038901045465, + "grad_norm": 17.0, + "learning_rate": 2.367225496096864e-06, + "loss": 0.6767, + "step": 1303 + }, + { + "epoch": 0.31704352054461465, + "grad_norm": 20.0, + "learning_rate": 2.3670086670462007e-06, + "loss": 0.8495, + "step": 1304 + }, + { + "epoch": 0.3172866520787746, + "grad_norm": 19.75, + "learning_rate": 2.366791671039301e-06, + "loss": 0.852, + "step": 1305 + }, + { + "epoch": 0.3175297836129346, + "grad_norm": 15.625, + "learning_rate": 2.3665745081085983e-06, + "loss": 0.5346, + "step": 1306 + }, + { + "epoch": 0.3177729151470946, + "grad_norm": 15.9375, + "learning_rate": 2.3663571782865515e-06, + "loss": 0.5753, + "step": 1307 + }, + { + "epoch": 0.31801604668125455, + "grad_norm": 16.5, + "learning_rate": 2.366139681605644e-06, + "loss": 0.652, + "step": 1308 + }, + { + "epoch": 0.31825917821541455, + "grad_norm": 18.375, + "learning_rate": 2.365922018098385e-06, + "loss": 0.7502, + "step": 1309 + }, + { + "epoch": 0.3185023097495745, + "grad_norm": 19.25, + "learning_rate": 2.365704187797308e-06, + "loss": 0.9262, + "step": 1310 + }, + { + "epoch": 0.3187454412837345, + "grad_norm": 14.9375, + "learning_rate": 2.3654861907349706e-06, + "loss": 0.6902, + "step": 1311 + }, + { + "epoch": 0.3189885728178945, + "grad_norm": 17.125, + "learning_rate": 2.3652680269439575e-06, + "loss": 0.9122, + "step": 1312 + }, + { + "epoch": 0.31923170435205445, + "grad_norm": 18.375, + "learning_rate": 2.3650496964568765e-06, + "loss": 0.8371, + "step": 1313 + }, + { + "epoch": 0.31947483588621445, + "grad_norm": 22.125, + "learning_rate": 2.36483119930636e-06, + "loss": 1.1271, + "step": 1314 + }, + { + "epoch": 0.31971796742037445, + "grad_norm": 18.625, + "learning_rate": 2.3646125355250677e-06, + "loss": 0.8004, + "step": 1315 + }, + { + "epoch": 0.3199610989545344, + "grad_norm": 14.0, + "learning_rate": 2.3643937051456817e-06, + "loss": 0.5843, + "step": 1316 + }, + { + "epoch": 0.3202042304886944, + "grad_norm": 21.875, + "learning_rate": 2.36417470820091e-06, + "loss": 0.9962, + "step": 1317 + }, + { + "epoch": 0.32044736202285434, + "grad_norm": 26.5, + "learning_rate": 2.363955544723486e-06, + "loss": 0.9226, + "step": 1318 + }, + { + "epoch": 0.32069049355701434, + "grad_norm": 17.875, + "learning_rate": 2.363736214746167e-06, + "loss": 0.9539, + "step": 1319 + }, + { + "epoch": 0.32093362509117435, + "grad_norm": 17.125, + "learning_rate": 2.363516718301736e-06, + "loss": 0.6266, + "step": 1320 + }, + { + "epoch": 0.3211767566253343, + "grad_norm": 20.125, + "learning_rate": 2.363297055423e-06, + "loss": 0.8119, + "step": 1321 + }, + { + "epoch": 0.3214198881594943, + "grad_norm": 15.375, + "learning_rate": 2.363077226142792e-06, + "loss": 0.5649, + "step": 1322 + }, + { + "epoch": 0.32166301969365424, + "grad_norm": 27.5, + "learning_rate": 2.362857230493969e-06, + "loss": 0.9691, + "step": 1323 + }, + { + "epoch": 0.32190615122781424, + "grad_norm": 21.0, + "learning_rate": 2.362637068509413e-06, + "loss": 0.8346, + "step": 1324 + }, + { + "epoch": 0.32214928276197424, + "grad_norm": 19.625, + "learning_rate": 2.3624167402220317e-06, + "loss": 1.1271, + "step": 1325 + }, + { + "epoch": 0.3223924142961342, + "grad_norm": 22.875, + "learning_rate": 2.3621962456647564e-06, + "loss": 1.166, + "step": 1326 + }, + { + "epoch": 0.3226355458302942, + "grad_norm": 20.25, + "learning_rate": 2.361975584870543e-06, + "loss": 0.7447, + "step": 1327 + }, + { + "epoch": 0.3228786773644542, + "grad_norm": 19.625, + "learning_rate": 2.361754757872375e-06, + "loss": 0.7225, + "step": 1328 + }, + { + "epoch": 0.32312180889861414, + "grad_norm": 19.0, + "learning_rate": 2.361533764703258e-06, + "loss": 0.9487, + "step": 1329 + }, + { + "epoch": 0.32336494043277414, + "grad_norm": 17.625, + "learning_rate": 2.361312605396222e-06, + "loss": 1.0187, + "step": 1330 + }, + { + "epoch": 0.3236080719669341, + "grad_norm": 16.0, + "learning_rate": 2.3610912799843242e-06, + "loss": 0.5292, + "step": 1331 + }, + { + "epoch": 0.3238512035010941, + "grad_norm": 20.375, + "learning_rate": 2.360869788500646e-06, + "loss": 1.4659, + "step": 1332 + }, + { + "epoch": 0.3240943350352541, + "grad_norm": 21.0, + "learning_rate": 2.360648130978292e-06, + "loss": 0.8654, + "step": 1333 + }, + { + "epoch": 0.32433746656941403, + "grad_norm": 27.5, + "learning_rate": 2.3604263074503934e-06, + "loss": 0.697, + "step": 1334 + }, + { + "epoch": 0.32458059810357404, + "grad_norm": 22.375, + "learning_rate": 2.3602043179501056e-06, + "loss": 0.8634, + "step": 1335 + }, + { + "epoch": 0.32482372963773404, + "grad_norm": 18.5, + "learning_rate": 2.3599821625106086e-06, + "loss": 0.909, + "step": 1336 + }, + { + "epoch": 0.325066861171894, + "grad_norm": 16.625, + "learning_rate": 2.3597598411651072e-06, + "loss": 0.781, + "step": 1337 + }, + { + "epoch": 0.325309992706054, + "grad_norm": 17.0, + "learning_rate": 2.3595373539468315e-06, + "loss": 0.6706, + "step": 1338 + }, + { + "epoch": 0.32555312424021393, + "grad_norm": 22.25, + "learning_rate": 2.3593147008890356e-06, + "loss": 1.2848, + "step": 1339 + }, + { + "epoch": 0.32579625577437393, + "grad_norm": 20.375, + "learning_rate": 2.3590918820249993e-06, + "loss": 1.0801, + "step": 1340 + }, + { + "epoch": 0.32603938730853393, + "grad_norm": 18.375, + "learning_rate": 2.3588688973880268e-06, + "loss": 1.015, + "step": 1341 + }, + { + "epoch": 0.3262825188426939, + "grad_norm": 17.375, + "learning_rate": 2.3586457470114466e-06, + "loss": 0.5862, + "step": 1342 + }, + { + "epoch": 0.3265256503768539, + "grad_norm": 22.375, + "learning_rate": 2.3584224309286124e-06, + "loss": 1.3698, + "step": 1343 + }, + { + "epoch": 0.3267687819110139, + "grad_norm": 13.5625, + "learning_rate": 2.3581989491729028e-06, + "loss": 0.4445, + "step": 1344 + }, + { + "epoch": 0.32701191344517383, + "grad_norm": 16.25, + "learning_rate": 2.357975301777721e-06, + "loss": 0.8973, + "step": 1345 + }, + { + "epoch": 0.32725504497933383, + "grad_norm": 25.5, + "learning_rate": 2.357751488776495e-06, + "loss": 0.9841, + "step": 1346 + }, + { + "epoch": 0.3274981765134938, + "grad_norm": 17.25, + "learning_rate": 2.3575275102026775e-06, + "loss": 0.7709, + "step": 1347 + }, + { + "epoch": 0.3277413080476538, + "grad_norm": 17.25, + "learning_rate": 2.3573033660897457e-06, + "loss": 0.9584, + "step": 1348 + }, + { + "epoch": 0.3279844395818138, + "grad_norm": 19.5, + "learning_rate": 2.357079056471202e-06, + "loss": 0.7564, + "step": 1349 + }, + { + "epoch": 0.3282275711159737, + "grad_norm": 20.625, + "learning_rate": 2.3568545813805737e-06, + "loss": 0.839, + "step": 1350 + }, + { + "epoch": 0.32847070265013373, + "grad_norm": 18.375, + "learning_rate": 2.356629940851412e-06, + "loss": 0.8169, + "step": 1351 + }, + { + "epoch": 0.32871383418429373, + "grad_norm": 17.625, + "learning_rate": 2.3564051349172937e-06, + "loss": 0.9295, + "step": 1352 + }, + { + "epoch": 0.3289569657184537, + "grad_norm": 19.0, + "learning_rate": 2.3561801636118197e-06, + "loss": 0.7949, + "step": 1353 + }, + { + "epoch": 0.3292000972526137, + "grad_norm": 16.625, + "learning_rate": 2.3559550269686153e-06, + "loss": 0.8806, + "step": 1354 + }, + { + "epoch": 0.3294432287867736, + "grad_norm": 13.5, + "learning_rate": 2.3557297250213324e-06, + "loss": 0.6512, + "step": 1355 + }, + { + "epoch": 0.3296863603209336, + "grad_norm": 17.375, + "learning_rate": 2.355504257803645e-06, + "loss": 0.9964, + "step": 1356 + }, + { + "epoch": 0.3299294918550936, + "grad_norm": 16.0, + "learning_rate": 2.3552786253492537e-06, + "loss": 0.5643, + "step": 1357 + }, + { + "epoch": 0.33017262338925357, + "grad_norm": 21.625, + "learning_rate": 2.3550528276918834e-06, + "loss": 1.1216, + "step": 1358 + }, + { + "epoch": 0.3304157549234136, + "grad_norm": 18.25, + "learning_rate": 2.354826864865282e-06, + "loss": 1.0007, + "step": 1359 + }, + { + "epoch": 0.3306588864575735, + "grad_norm": 19.0, + "learning_rate": 2.3546007369032255e-06, + "loss": 0.8458, + "step": 1360 + }, + { + "epoch": 0.3309020179917335, + "grad_norm": 20.625, + "learning_rate": 2.3543744438395117e-06, + "loss": 0.8335, + "step": 1361 + }, + { + "epoch": 0.3311451495258935, + "grad_norm": 25.125, + "learning_rate": 2.3541479857079636e-06, + "loss": 1.193, + "step": 1362 + }, + { + "epoch": 0.33138828106005347, + "grad_norm": 20.25, + "learning_rate": 2.3539213625424304e-06, + "loss": 1.2014, + "step": 1363 + }, + { + "epoch": 0.33163141259421347, + "grad_norm": 19.75, + "learning_rate": 2.3536945743767836e-06, + "loss": 1.1478, + "step": 1364 + }, + { + "epoch": 0.33187454412837347, + "grad_norm": 21.5, + "learning_rate": 2.3534676212449214e-06, + "loss": 0.9309, + "step": 1365 + }, + { + "epoch": 0.3321176756625334, + "grad_norm": 28.5, + "learning_rate": 2.353240503180766e-06, + "loss": 1.0866, + "step": 1366 + }, + { + "epoch": 0.3323608071966934, + "grad_norm": 24.125, + "learning_rate": 2.3530132202182633e-06, + "loss": 0.7259, + "step": 1367 + }, + { + "epoch": 0.33260393873085337, + "grad_norm": 18.0, + "learning_rate": 2.3527857723913853e-06, + "loss": 0.9255, + "step": 1368 + }, + { + "epoch": 0.33284707026501337, + "grad_norm": 18.625, + "learning_rate": 2.3525581597341278e-06, + "loss": 0.8591, + "step": 1369 + }, + { + "epoch": 0.33309020179917337, + "grad_norm": 21.625, + "learning_rate": 2.3523303822805117e-06, + "loss": 1.11, + "step": 1370 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 15.5, + "learning_rate": 2.352102440064582e-06, + "loss": 0.5432, + "step": 1371 + }, + { + "epoch": 0.3335764648674933, + "grad_norm": 16.25, + "learning_rate": 2.3518743331204085e-06, + "loss": 0.8395, + "step": 1372 + }, + { + "epoch": 0.3338195964016533, + "grad_norm": 18.125, + "learning_rate": 2.351646061482086e-06, + "loss": 0.794, + "step": 1373 + }, + { + "epoch": 0.33406272793581326, + "grad_norm": 66.5, + "learning_rate": 2.3514176251837332e-06, + "loss": 1.0758, + "step": 1374 + }, + { + "epoch": 0.33430585946997327, + "grad_norm": 23.75, + "learning_rate": 2.351189024259495e-06, + "loss": 0.9747, + "step": 1375 + }, + { + "epoch": 0.3345489910041332, + "grad_norm": 16.375, + "learning_rate": 2.3509602587435383e-06, + "loss": 0.8169, + "step": 1376 + }, + { + "epoch": 0.3347921225382932, + "grad_norm": 22.875, + "learning_rate": 2.3507313286700568e-06, + "loss": 0.8971, + "step": 1377 + }, + { + "epoch": 0.3350352540724532, + "grad_norm": 21.375, + "learning_rate": 2.350502234073268e-06, + "loss": 0.9816, + "step": 1378 + }, + { + "epoch": 0.33527838560661316, + "grad_norm": 24.75, + "learning_rate": 2.3502729749874136e-06, + "loss": 1.3641, + "step": 1379 + }, + { + "epoch": 0.33552151714077316, + "grad_norm": 18.0, + "learning_rate": 2.3500435514467612e-06, + "loss": 0.8322, + "step": 1380 + }, + { + "epoch": 0.33576464867493316, + "grad_norm": 15.1875, + "learning_rate": 2.3498139634856017e-06, + "loss": 0.4193, + "step": 1381 + }, + { + "epoch": 0.3360077802090931, + "grad_norm": 21.625, + "learning_rate": 2.3495842111382505e-06, + "loss": 1.4416, + "step": 1382 + }, + { + "epoch": 0.3362509117432531, + "grad_norm": 18.625, + "learning_rate": 2.3493542944390484e-06, + "loss": 1.1323, + "step": 1383 + }, + { + "epoch": 0.33649404327741306, + "grad_norm": 17.875, + "learning_rate": 2.349124213422361e-06, + "loss": 0.854, + "step": 1384 + }, + { + "epoch": 0.33673717481157306, + "grad_norm": 32.75, + "learning_rate": 2.348893968122577e-06, + "loss": 0.9347, + "step": 1385 + }, + { + "epoch": 0.33698030634573306, + "grad_norm": 21.875, + "learning_rate": 2.348663558574111e-06, + "loss": 1.0152, + "step": 1386 + }, + { + "epoch": 0.337223437879893, + "grad_norm": 24.125, + "learning_rate": 2.3484329848114017e-06, + "loss": 0.9265, + "step": 1387 + }, + { + "epoch": 0.337466569414053, + "grad_norm": 20.875, + "learning_rate": 2.348202246868912e-06, + "loss": 0.9097, + "step": 1388 + }, + { + "epoch": 0.337709700948213, + "grad_norm": 25.25, + "learning_rate": 2.34797134478113e-06, + "loss": 1.3648, + "step": 1389 + }, + { + "epoch": 0.33795283248237296, + "grad_norm": 17.75, + "learning_rate": 2.3477402785825683e-06, + "loss": 0.6084, + "step": 1390 + }, + { + "epoch": 0.33819596401653296, + "grad_norm": 16.875, + "learning_rate": 2.347509048307763e-06, + "loss": 0.6965, + "step": 1391 + }, + { + "epoch": 0.3384390955506929, + "grad_norm": 15.375, + "learning_rate": 2.3472776539912756e-06, + "loss": 0.5375, + "step": 1392 + }, + { + "epoch": 0.3386822270848529, + "grad_norm": 20.625, + "learning_rate": 2.347046095667692e-06, + "loss": 0.8974, + "step": 1393 + }, + { + "epoch": 0.3389253586190129, + "grad_norm": 28.625, + "learning_rate": 2.346814373371623e-06, + "loss": 1.0636, + "step": 1394 + }, + { + "epoch": 0.33916849015317285, + "grad_norm": 34.0, + "learning_rate": 2.3465824871377036e-06, + "loss": 1.1303, + "step": 1395 + }, + { + "epoch": 0.33941162168733285, + "grad_norm": 20.625, + "learning_rate": 2.3463504370005927e-06, + "loss": 1.0416, + "step": 1396 + }, + { + "epoch": 0.3396547532214928, + "grad_norm": 19.375, + "learning_rate": 2.346118222994974e-06, + "loss": 0.81, + "step": 1397 + }, + { + "epoch": 0.3398978847556528, + "grad_norm": 17.5, + "learning_rate": 2.345885845155557e-06, + "loss": 0.9071, + "step": 1398 + }, + { + "epoch": 0.3401410162898128, + "grad_norm": 18.375, + "learning_rate": 2.345653303517073e-06, + "loss": 0.7898, + "step": 1399 + }, + { + "epoch": 0.34038414782397275, + "grad_norm": 18.25, + "learning_rate": 2.345420598114281e-06, + "loss": 0.8184, + "step": 1400 + }, + { + "epoch": 0.34062727935813275, + "grad_norm": 21.25, + "learning_rate": 2.3451877289819614e-06, + "loss": 1.1307, + "step": 1401 + }, + { + "epoch": 0.34087041089229275, + "grad_norm": 21.25, + "learning_rate": 2.3449546961549215e-06, + "loss": 1.1001, + "step": 1402 + }, + { + "epoch": 0.3411135424264527, + "grad_norm": 19.125, + "learning_rate": 2.3447214996679913e-06, + "loss": 1.2553, + "step": 1403 + }, + { + "epoch": 0.3413566739606127, + "grad_norm": 21.25, + "learning_rate": 2.344488139556027e-06, + "loss": 1.0843, + "step": 1404 + }, + { + "epoch": 0.34159980549477265, + "grad_norm": 22.375, + "learning_rate": 2.3442546158539074e-06, + "loss": 0.8769, + "step": 1405 + }, + { + "epoch": 0.34184293702893265, + "grad_norm": 15.3125, + "learning_rate": 2.3440209285965374e-06, + "loss": 0.6061, + "step": 1406 + }, + { + "epoch": 0.34208606856309265, + "grad_norm": 24.875, + "learning_rate": 2.343787077818845e-06, + "loss": 0.6801, + "step": 1407 + }, + { + "epoch": 0.3423292000972526, + "grad_norm": 33.5, + "learning_rate": 2.3435530635557836e-06, + "loss": 1.4359, + "step": 1408 + }, + { + "epoch": 0.3425723316314126, + "grad_norm": 22.5, + "learning_rate": 2.34331888584233e-06, + "loss": 1.2318, + "step": 1409 + }, + { + "epoch": 0.3428154631655726, + "grad_norm": 19.25, + "learning_rate": 2.343084544713487e-06, + "loss": 0.9751, + "step": 1410 + }, + { + "epoch": 0.34305859469973254, + "grad_norm": 24.875, + "learning_rate": 2.3428500402042807e-06, + "loss": 0.8773, + "step": 1411 + }, + { + "epoch": 0.34330172623389255, + "grad_norm": 25.125, + "learning_rate": 2.342615372349762e-06, + "loss": 0.8469, + "step": 1412 + }, + { + "epoch": 0.3435448577680525, + "grad_norm": 36.0, + "learning_rate": 2.3423805411850054e-06, + "loss": 1.3903, + "step": 1413 + }, + { + "epoch": 0.3437879893022125, + "grad_norm": 21.125, + "learning_rate": 2.342145546745111e-06, + "loss": 0.9616, + "step": 1414 + }, + { + "epoch": 0.3440311208363725, + "grad_norm": 16.75, + "learning_rate": 2.3419103890652023e-06, + "loss": 0.4577, + "step": 1415 + }, + { + "epoch": 0.34427425237053244, + "grad_norm": 19.75, + "learning_rate": 2.3416750681804283e-06, + "loss": 0.7604, + "step": 1416 + }, + { + "epoch": 0.34451738390469244, + "grad_norm": 15.25, + "learning_rate": 2.341439584125961e-06, + "loss": 0.9324, + "step": 1417 + }, + { + "epoch": 0.34476051543885244, + "grad_norm": 17.5, + "learning_rate": 2.341203936936999e-06, + "loss": 0.7031, + "step": 1418 + }, + { + "epoch": 0.3450036469730124, + "grad_norm": 17.375, + "learning_rate": 2.3409681266487623e-06, + "loss": 1.0507, + "step": 1419 + }, + { + "epoch": 0.3452467785071724, + "grad_norm": 14.875, + "learning_rate": 2.340732153296497e-06, + "loss": 0.4448, + "step": 1420 + }, + { + "epoch": 0.34548991004133234, + "grad_norm": 21.625, + "learning_rate": 2.3404960169154743e-06, + "loss": 0.8389, + "step": 1421 + }, + { + "epoch": 0.34573304157549234, + "grad_norm": 27.625, + "learning_rate": 2.3402597175409887e-06, + "loss": 0.8419, + "step": 1422 + }, + { + "epoch": 0.34597617310965234, + "grad_norm": 20.625, + "learning_rate": 2.3400232552083585e-06, + "loss": 0.9551, + "step": 1423 + }, + { + "epoch": 0.3462193046438123, + "grad_norm": 15.8125, + "learning_rate": 2.339786629952927e-06, + "loss": 0.4091, + "step": 1424 + }, + { + "epoch": 0.3464624361779723, + "grad_norm": 25.375, + "learning_rate": 2.339549841810063e-06, + "loss": 1.3345, + "step": 1425 + }, + { + "epoch": 0.3467055677121323, + "grad_norm": 19.75, + "learning_rate": 2.3393128908151576e-06, + "loss": 0.9239, + "step": 1426 + }, + { + "epoch": 0.34694869924629224, + "grad_norm": 16.25, + "learning_rate": 2.3390757770036277e-06, + "loss": 0.7221, + "step": 1427 + }, + { + "epoch": 0.34719183078045224, + "grad_norm": 20.0, + "learning_rate": 2.338838500410914e-06, + "loss": 0.7285, + "step": 1428 + }, + { + "epoch": 0.3474349623146122, + "grad_norm": 16.625, + "learning_rate": 2.3386010610724817e-06, + "loss": 0.6575, + "step": 1429 + }, + { + "epoch": 0.3476780938487722, + "grad_norm": 21.875, + "learning_rate": 2.33836345902382e-06, + "loss": 0.9959, + "step": 1430 + }, + { + "epoch": 0.3479212253829322, + "grad_norm": 25.125, + "learning_rate": 2.3381256943004425e-06, + "loss": 1.1109, + "step": 1431 + }, + { + "epoch": 0.34816435691709213, + "grad_norm": 19.375, + "learning_rate": 2.3378877669378874e-06, + "loss": 1.3305, + "step": 1432 + }, + { + "epoch": 0.34840748845125213, + "grad_norm": 21.25, + "learning_rate": 2.337649676971717e-06, + "loss": 1.4021, + "step": 1433 + }, + { + "epoch": 0.3486506199854121, + "grad_norm": 17.5, + "learning_rate": 2.3374114244375177e-06, + "loss": 0.8123, + "step": 1434 + }, + { + "epoch": 0.3488937515195721, + "grad_norm": 16.875, + "learning_rate": 2.337173009370902e-06, + "loss": 0.6366, + "step": 1435 + }, + { + "epoch": 0.3491368830537321, + "grad_norm": 18.625, + "learning_rate": 2.336934431807503e-06, + "loss": 0.964, + "step": 1436 + }, + { + "epoch": 0.34938001458789203, + "grad_norm": 22.5, + "learning_rate": 2.336695691782981e-06, + "loss": 0.9122, + "step": 1437 + }, + { + "epoch": 0.34962314612205203, + "grad_norm": 17.75, + "learning_rate": 2.33645678933302e-06, + "loss": 0.8408, + "step": 1438 + }, + { + "epoch": 0.34986627765621203, + "grad_norm": 21.25, + "learning_rate": 2.336217724493328e-06, + "loss": 1.0306, + "step": 1439 + }, + { + "epoch": 0.350109409190372, + "grad_norm": 13.5625, + "learning_rate": 2.335978497299638e-06, + "loss": 0.8109, + "step": 1440 + }, + { + "epoch": 0.350352540724532, + "grad_norm": 22.375, + "learning_rate": 2.335739107787706e-06, + "loss": 0.939, + "step": 1441 + }, + { + "epoch": 0.3505956722586919, + "grad_norm": 19.5, + "learning_rate": 2.3354995559933127e-06, + "loss": 1.1162, + "step": 1442 + }, + { + "epoch": 0.3508388037928519, + "grad_norm": 18.75, + "learning_rate": 2.335259841952264e-06, + "loss": 0.7332, + "step": 1443 + }, + { + "epoch": 0.35108193532701193, + "grad_norm": 22.5, + "learning_rate": 2.3350199657003882e-06, + "loss": 1.2186, + "step": 1444 + }, + { + "epoch": 0.3513250668611719, + "grad_norm": 21.875, + "learning_rate": 2.3347799272735398e-06, + "loss": 0.7035, + "step": 1445 + }, + { + "epoch": 0.3515681983953319, + "grad_norm": 24.0, + "learning_rate": 2.3345397267075962e-06, + "loss": 0.9537, + "step": 1446 + }, + { + "epoch": 0.3518113299294919, + "grad_norm": 18.5, + "learning_rate": 2.3342993640384604e-06, + "loss": 0.9893, + "step": 1447 + }, + { + "epoch": 0.3520544614636518, + "grad_norm": 20.375, + "learning_rate": 2.334058839302058e-06, + "loss": 0.7534, + "step": 1448 + }, + { + "epoch": 0.3522975929978118, + "grad_norm": 22.125, + "learning_rate": 2.3338181525343395e-06, + "loss": 1.2002, + "step": 1449 + }, + { + "epoch": 0.35254072453197177, + "grad_norm": 21.5, + "learning_rate": 2.33357730377128e-06, + "loss": 1.2812, + "step": 1450 + }, + { + "epoch": 0.3527838560661318, + "grad_norm": 15.875, + "learning_rate": 2.3333362930488785e-06, + "loss": 0.7726, + "step": 1451 + }, + { + "epoch": 0.3530269876002918, + "grad_norm": 21.375, + "learning_rate": 2.333095120403158e-06, + "loss": 0.9819, + "step": 1452 + }, + { + "epoch": 0.3532701191344517, + "grad_norm": 21.125, + "learning_rate": 2.332853785870166e-06, + "loss": 0.8736, + "step": 1453 + }, + { + "epoch": 0.3535132506686117, + "grad_norm": 23.25, + "learning_rate": 2.3326122894859745e-06, + "loss": 1.0426, + "step": 1454 + }, + { + "epoch": 0.3537563822027717, + "grad_norm": 22.75, + "learning_rate": 2.3323706312866785e-06, + "loss": 1.1238, + "step": 1455 + }, + { + "epoch": 0.35399951373693167, + "grad_norm": 18.75, + "learning_rate": 2.332128811308399e-06, + "loss": 1.0771, + "step": 1456 + }, + { + "epoch": 0.35424264527109167, + "grad_norm": 20.875, + "learning_rate": 2.3318868295872793e-06, + "loss": 1.0684, + "step": 1457 + }, + { + "epoch": 0.3544857768052516, + "grad_norm": 15.625, + "learning_rate": 2.3316446861594878e-06, + "loss": 0.5876, + "step": 1458 + }, + { + "epoch": 0.3547289083394116, + "grad_norm": 18.125, + "learning_rate": 2.331402381061218e-06, + "loss": 0.859, + "step": 1459 + }, + { + "epoch": 0.3549720398735716, + "grad_norm": 22.875, + "learning_rate": 2.3311599143286855e-06, + "loss": 0.8267, + "step": 1460 + }, + { + "epoch": 0.35521517140773157, + "grad_norm": 21.375, + "learning_rate": 2.3309172859981317e-06, + "loss": 1.0423, + "step": 1461 + }, + { + "epoch": 0.35545830294189157, + "grad_norm": 20.875, + "learning_rate": 2.330674496105821e-06, + "loss": 1.1045, + "step": 1462 + }, + { + "epoch": 0.35570143447605157, + "grad_norm": 22.125, + "learning_rate": 2.3304315446880434e-06, + "loss": 0.9494, + "step": 1463 + }, + { + "epoch": 0.3559445660102115, + "grad_norm": 19.5, + "learning_rate": 2.330188431781111e-06, + "loss": 0.8851, + "step": 1464 + }, + { + "epoch": 0.3561876975443715, + "grad_norm": 20.625, + "learning_rate": 2.329945157421363e-06, + "loss": 1.2262, + "step": 1465 + }, + { + "epoch": 0.35643082907853146, + "grad_norm": 20.75, + "learning_rate": 2.3297017216451597e-06, + "loss": 0.6847, + "step": 1466 + }, + { + "epoch": 0.35667396061269147, + "grad_norm": 27.25, + "learning_rate": 2.3294581244888867e-06, + "loss": 0.8174, + "step": 1467 + }, + { + "epoch": 0.35691709214685147, + "grad_norm": 19.5, + "learning_rate": 2.329214365988954e-06, + "loss": 1.0658, + "step": 1468 + }, + { + "epoch": 0.3571602236810114, + "grad_norm": 19.625, + "learning_rate": 2.328970446181796e-06, + "loss": 0.9411, + "step": 1469 + }, + { + "epoch": 0.3574033552151714, + "grad_norm": 17.625, + "learning_rate": 2.32872636510387e-06, + "loss": 0.9679, + "step": 1470 + }, + { + "epoch": 0.3576464867493314, + "grad_norm": 21.0, + "learning_rate": 2.3284821227916586e-06, + "loss": 0.9815, + "step": 1471 + }, + { + "epoch": 0.35788961828349136, + "grad_norm": 17.5, + "learning_rate": 2.3282377192816682e-06, + "loss": 1.068, + "step": 1472 + }, + { + "epoch": 0.35813274981765136, + "grad_norm": 24.625, + "learning_rate": 2.3279931546104286e-06, + "loss": 1.2518, + "step": 1473 + }, + { + "epoch": 0.3583758813518113, + "grad_norm": 14.9375, + "learning_rate": 2.3277484288144947e-06, + "loss": 0.5412, + "step": 1474 + }, + { + "epoch": 0.3586190128859713, + "grad_norm": 21.375, + "learning_rate": 2.3275035419304443e-06, + "loss": 0.9545, + "step": 1475 + }, + { + "epoch": 0.3588621444201313, + "grad_norm": 19.875, + "learning_rate": 2.3272584939948807e-06, + "loss": 0.9992, + "step": 1476 + }, + { + "epoch": 0.35910527595429126, + "grad_norm": 17.875, + "learning_rate": 2.3270132850444304e-06, + "loss": 0.4695, + "step": 1477 + }, + { + "epoch": 0.35934840748845126, + "grad_norm": 25.875, + "learning_rate": 2.3267679151157437e-06, + "loss": 0.7442, + "step": 1478 + }, + { + "epoch": 0.3595915390226112, + "grad_norm": 14.5625, + "learning_rate": 2.326522384245496e-06, + "loss": 0.5161, + "step": 1479 + }, + { + "epoch": 0.3598346705567712, + "grad_norm": 20.75, + "learning_rate": 2.3262766924703856e-06, + "loss": 0.6954, + "step": 1480 + }, + { + "epoch": 0.3600778020909312, + "grad_norm": 19.625, + "learning_rate": 2.3260308398271353e-06, + "loss": 1.154, + "step": 1481 + }, + { + "epoch": 0.36032093362509116, + "grad_norm": 25.875, + "learning_rate": 2.325784826352493e-06, + "loss": 1.123, + "step": 1482 + }, + { + "epoch": 0.36056406515925116, + "grad_norm": 12.9375, + "learning_rate": 2.3255386520832282e-06, + "loss": 0.4059, + "step": 1483 + }, + { + "epoch": 0.36080719669341116, + "grad_norm": 18.125, + "learning_rate": 2.325292317056137e-06, + "loss": 0.8285, + "step": 1484 + }, + { + "epoch": 0.3610503282275711, + "grad_norm": 26.0, + "learning_rate": 2.3250458213080378e-06, + "loss": 1.1255, + "step": 1485 + }, + { + "epoch": 0.3612934597617311, + "grad_norm": 24.75, + "learning_rate": 2.324799164875774e-06, + "loss": 0.9591, + "step": 1486 + }, + { + "epoch": 0.36153659129589105, + "grad_norm": 21.5, + "learning_rate": 2.3245523477962133e-06, + "loss": 1.1673, + "step": 1487 + }, + { + "epoch": 0.36177972283005105, + "grad_norm": 23.5, + "learning_rate": 2.324305370106245e-06, + "loss": 1.0086, + "step": 1488 + }, + { + "epoch": 0.36202285436421106, + "grad_norm": 24.125, + "learning_rate": 2.324058231842786e-06, + "loss": 0.9124, + "step": 1489 + }, + { + "epoch": 0.362265985898371, + "grad_norm": 24.125, + "learning_rate": 2.3238109330427746e-06, + "loss": 0.9154, + "step": 1490 + }, + { + "epoch": 0.362509117432531, + "grad_norm": 19.125, + "learning_rate": 2.323563473743173e-06, + "loss": 1.0681, + "step": 1491 + }, + { + "epoch": 0.362752248966691, + "grad_norm": 20.75, + "learning_rate": 2.32331585398097e-06, + "loss": 1.0651, + "step": 1492 + }, + { + "epoch": 0.36299538050085095, + "grad_norm": 16.875, + "learning_rate": 2.323068073793176e-06, + "loss": 0.9262, + "step": 1493 + }, + { + "epoch": 0.36323851203501095, + "grad_norm": 22.5, + "learning_rate": 2.3228201332168253e-06, + "loss": 0.8907, + "step": 1494 + }, + { + "epoch": 0.3634816435691709, + "grad_norm": 23.375, + "learning_rate": 2.3225720322889778e-06, + "loss": 1.1956, + "step": 1495 + }, + { + "epoch": 0.3637247751033309, + "grad_norm": 16.375, + "learning_rate": 2.3223237710467157e-06, + "loss": 0.6987, + "step": 1496 + }, + { + "epoch": 0.3639679066374909, + "grad_norm": 25.25, + "learning_rate": 2.322075349527147e-06, + "loss": 0.9879, + "step": 1497 + }, + { + "epoch": 0.36421103817165085, + "grad_norm": 29.25, + "learning_rate": 2.321826767767401e-06, + "loss": 1.1628, + "step": 1498 + }, + { + "epoch": 0.36445416970581085, + "grad_norm": 34.25, + "learning_rate": 2.3215780258046344e-06, + "loss": 1.0867, + "step": 1499 + }, + { + "epoch": 0.36469730123997085, + "grad_norm": 20.5, + "learning_rate": 2.321329123676024e-06, + "loss": 1.0927, + "step": 1500 + }, + { + "epoch": 0.3649404327741308, + "grad_norm": 16.875, + "learning_rate": 2.321080061418775e-06, + "loss": 0.7475, + "step": 1501 + }, + { + "epoch": 0.3651835643082908, + "grad_norm": 16.625, + "learning_rate": 2.320830839070112e-06, + "loss": 0.5888, + "step": 1502 + }, + { + "epoch": 0.36542669584245074, + "grad_norm": 18.25, + "learning_rate": 2.3205814566672857e-06, + "loss": 0.7968, + "step": 1503 + }, + { + "epoch": 0.36566982737661075, + "grad_norm": 14.4375, + "learning_rate": 2.320331914247571e-06, + "loss": 0.7453, + "step": 1504 + }, + { + "epoch": 0.36591295891077075, + "grad_norm": 17.5, + "learning_rate": 2.3200822118482675e-06, + "loss": 0.7019, + "step": 1505 + }, + { + "epoch": 0.3661560904449307, + "grad_norm": 19.5, + "learning_rate": 2.3198323495066957e-06, + "loss": 0.916, + "step": 1506 + }, + { + "epoch": 0.3663992219790907, + "grad_norm": 14.9375, + "learning_rate": 2.319582327260203e-06, + "loss": 0.7411, + "step": 1507 + }, + { + "epoch": 0.3666423535132507, + "grad_norm": 18.0, + "learning_rate": 2.319332145146159e-06, + "loss": 1.0345, + "step": 1508 + }, + { + "epoch": 0.36688548504741064, + "grad_norm": 24.5, + "learning_rate": 2.3190818032019578e-06, + "loss": 0.7271, + "step": 1509 + }, + { + "epoch": 0.36712861658157064, + "grad_norm": 27.0, + "learning_rate": 2.3188313014650178e-06, + "loss": 0.8754, + "step": 1510 + }, + { + "epoch": 0.3673717481157306, + "grad_norm": 18.0, + "learning_rate": 2.31858063997278e-06, + "loss": 0.7795, + "step": 1511 + }, + { + "epoch": 0.3676148796498906, + "grad_norm": 27.5, + "learning_rate": 2.3183298187627107e-06, + "loss": 0.9894, + "step": 1512 + }, + { + "epoch": 0.3678580111840506, + "grad_norm": 18.0, + "learning_rate": 2.318078837872299e-06, + "loss": 1.0625, + "step": 1513 + }, + { + "epoch": 0.36810114271821054, + "grad_norm": 17.375, + "learning_rate": 2.317827697339059e-06, + "loss": 0.7488, + "step": 1514 + }, + { + "epoch": 0.36834427425237054, + "grad_norm": 29.75, + "learning_rate": 2.3175763972005277e-06, + "loss": 1.2496, + "step": 1515 + }, + { + "epoch": 0.3685874057865305, + "grad_norm": 13.625, + "learning_rate": 2.3173249374942657e-06, + "loss": 0.8552, + "step": 1516 + }, + { + "epoch": 0.3688305373206905, + "grad_norm": 14.9375, + "learning_rate": 2.3170733182578586e-06, + "loss": 0.4992, + "step": 1517 + }, + { + "epoch": 0.3690736688548505, + "grad_norm": 20.25, + "learning_rate": 2.3168215395289156e-06, + "loss": 1.0485, + "step": 1518 + }, + { + "epoch": 0.36931680038901044, + "grad_norm": 18.25, + "learning_rate": 2.3165696013450682e-06, + "loss": 0.7444, + "step": 1519 + }, + { + "epoch": 0.36955993192317044, + "grad_norm": 24.125, + "learning_rate": 2.316317503743974e-06, + "loss": 1.4502, + "step": 1520 + }, + { + "epoch": 0.36980306345733044, + "grad_norm": 21.125, + "learning_rate": 2.3160652467633127e-06, + "loss": 0.9617, + "step": 1521 + }, + { + "epoch": 0.3700461949914904, + "grad_norm": 15.75, + "learning_rate": 2.315812830440789e-06, + "loss": 0.7754, + "step": 1522 + }, + { + "epoch": 0.3702893265256504, + "grad_norm": 16.75, + "learning_rate": 2.3155602548141303e-06, + "loss": 0.7547, + "step": 1523 + }, + { + "epoch": 0.37053245805981033, + "grad_norm": 18.25, + "learning_rate": 2.3153075199210886e-06, + "loss": 0.8524, + "step": 1524 + }, + { + "epoch": 0.37077558959397033, + "grad_norm": 20.125, + "learning_rate": 2.3150546257994396e-06, + "loss": 0.9676, + "step": 1525 + }, + { + "epoch": 0.37101872112813034, + "grad_norm": 16.5, + "learning_rate": 2.314801572486983e-06, + "loss": 0.8736, + "step": 1526 + }, + { + "epoch": 0.3712618526622903, + "grad_norm": 12.6875, + "learning_rate": 2.3145483600215414e-06, + "loss": 0.4003, + "step": 1527 + }, + { + "epoch": 0.3715049841964503, + "grad_norm": 16.75, + "learning_rate": 2.3142949884409616e-06, + "loss": 0.6586, + "step": 1528 + }, + { + "epoch": 0.3717481157306103, + "grad_norm": 14.0625, + "learning_rate": 2.314041457783115e-06, + "loss": 0.4269, + "step": 1529 + }, + { + "epoch": 0.37199124726477023, + "grad_norm": 22.0, + "learning_rate": 2.313787768085896e-06, + "loss": 1.127, + "step": 1530 + }, + { + "epoch": 0.37223437879893023, + "grad_norm": 20.75, + "learning_rate": 2.313533919387223e-06, + "loss": 0.9467, + "step": 1531 + }, + { + "epoch": 0.3724775103330902, + "grad_norm": 22.375, + "learning_rate": 2.3132799117250378e-06, + "loss": 0.9898, + "step": 1532 + }, + { + "epoch": 0.3727206418672502, + "grad_norm": 23.875, + "learning_rate": 2.313025745137306e-06, + "loss": 1.0176, + "step": 1533 + }, + { + "epoch": 0.3729637734014102, + "grad_norm": 17.75, + "learning_rate": 2.312771419662018e-06, + "loss": 0.4831, + "step": 1534 + }, + { + "epoch": 0.3732069049355701, + "grad_norm": 24.75, + "learning_rate": 2.312516935337186e-06, + "loss": 0.9142, + "step": 1535 + }, + { + "epoch": 0.37345003646973013, + "grad_norm": 25.75, + "learning_rate": 2.312262292200848e-06, + "loss": 1.4025, + "step": 1536 + }, + { + "epoch": 0.37369316800389013, + "grad_norm": 22.25, + "learning_rate": 2.312007490291065e-06, + "loss": 1.1622, + "step": 1537 + }, + { + "epoch": 0.3739362995380501, + "grad_norm": 24.25, + "learning_rate": 2.3117525296459203e-06, + "loss": 1.3053, + "step": 1538 + }, + { + "epoch": 0.3741794310722101, + "grad_norm": 18.125, + "learning_rate": 2.3114974103035236e-06, + "loss": 0.9604, + "step": 1539 + }, + { + "epoch": 0.37442256260637, + "grad_norm": 16.125, + "learning_rate": 2.311242132302006e-06, + "loss": 0.8601, + "step": 1540 + }, + { + "epoch": 0.37466569414053, + "grad_norm": 18.5, + "learning_rate": 2.3109866956795234e-06, + "loss": 1.0024, + "step": 1541 + }, + { + "epoch": 0.37490882567469, + "grad_norm": 23.25, + "learning_rate": 2.310731100474255e-06, + "loss": 1.064, + "step": 1542 + }, + { + "epoch": 0.37515195720885, + "grad_norm": 22.25, + "learning_rate": 2.3104753467244045e-06, + "loss": 0.8009, + "step": 1543 + }, + { + "epoch": 0.37539508874301, + "grad_norm": 21.375, + "learning_rate": 2.310219434468198e-06, + "loss": 0.6426, + "step": 1544 + }, + { + "epoch": 0.37563822027717, + "grad_norm": 19.0, + "learning_rate": 2.309963363743887e-06, + "loss": 1.0863, + "step": 1545 + }, + { + "epoch": 0.3758813518113299, + "grad_norm": 21.125, + "learning_rate": 2.309707134589745e-06, + "loss": 1.032, + "step": 1546 + }, + { + "epoch": 0.3761244833454899, + "grad_norm": 19.125, + "learning_rate": 2.3094507470440697e-06, + "loss": 1.0391, + "step": 1547 + }, + { + "epoch": 0.37636761487964987, + "grad_norm": 21.75, + "learning_rate": 2.309194201145183e-06, + "loss": 0.7259, + "step": 1548 + }, + { + "epoch": 0.37661074641380987, + "grad_norm": 15.5, + "learning_rate": 2.3089374969314297e-06, + "loss": 0.5964, + "step": 1549 + }, + { + "epoch": 0.3768538779479699, + "grad_norm": 21.5, + "learning_rate": 2.3086806344411795e-06, + "loss": 0.8881, + "step": 1550 + }, + { + "epoch": 0.3770970094821298, + "grad_norm": 18.875, + "learning_rate": 2.308423613712824e-06, + "loss": 0.8602, + "step": 1551 + }, + { + "epoch": 0.3773401410162898, + "grad_norm": 13.4375, + "learning_rate": 2.30816643478478e-06, + "loss": 0.6663, + "step": 1552 + }, + { + "epoch": 0.37758327255044977, + "grad_norm": 19.5, + "learning_rate": 2.307909097695487e-06, + "loss": 1.0368, + "step": 1553 + }, + { + "epoch": 0.37782640408460977, + "grad_norm": 17.375, + "learning_rate": 2.307651602483409e-06, + "loss": 0.955, + "step": 1554 + }, + { + "epoch": 0.37806953561876977, + "grad_norm": 23.125, + "learning_rate": 2.3073939491870326e-06, + "loss": 1.3472, + "step": 1555 + }, + { + "epoch": 0.3783126671529297, + "grad_norm": 16.375, + "learning_rate": 2.307136137844869e-06, + "loss": 0.5445, + "step": 1556 + }, + { + "epoch": 0.3785557986870897, + "grad_norm": 18.625, + "learning_rate": 2.3068781684954515e-06, + "loss": 1.0573, + "step": 1557 + }, + { + "epoch": 0.3787989302212497, + "grad_norm": 18.375, + "learning_rate": 2.306620041177339e-06, + "loss": 1.0862, + "step": 1558 + }, + { + "epoch": 0.37904206175540966, + "grad_norm": 17.25, + "learning_rate": 2.306361755929113e-06, + "loss": 0.8455, + "step": 1559 + }, + { + "epoch": 0.37928519328956967, + "grad_norm": 16.625, + "learning_rate": 2.3061033127893788e-06, + "loss": 0.8344, + "step": 1560 + }, + { + "epoch": 0.3795283248237296, + "grad_norm": 26.75, + "learning_rate": 2.3058447117967646e-06, + "loss": 1.2789, + "step": 1561 + }, + { + "epoch": 0.3797714563578896, + "grad_norm": 22.375, + "learning_rate": 2.3055859529899235e-06, + "loss": 1.0004, + "step": 1562 + }, + { + "epoch": 0.3800145878920496, + "grad_norm": 16.25, + "learning_rate": 2.305327036407531e-06, + "loss": 0.7607, + "step": 1563 + }, + { + "epoch": 0.38025771942620956, + "grad_norm": 22.625, + "learning_rate": 2.3050679620882865e-06, + "loss": 0.8534, + "step": 1564 + }, + { + "epoch": 0.38050085096036956, + "grad_norm": 132.0, + "learning_rate": 2.3048087300709137e-06, + "loss": 0.9035, + "step": 1565 + }, + { + "epoch": 0.38074398249452956, + "grad_norm": 19.625, + "learning_rate": 2.304549340394159e-06, + "loss": 0.7826, + "step": 1566 + }, + { + "epoch": 0.3809871140286895, + "grad_norm": 15.0, + "learning_rate": 2.304289793096793e-06, + "loss": 0.5729, + "step": 1567 + }, + { + "epoch": 0.3812302455628495, + "grad_norm": 18.25, + "learning_rate": 2.3040300882176084e-06, + "loss": 0.5804, + "step": 1568 + }, + { + "epoch": 0.38147337709700946, + "grad_norm": 27.0, + "learning_rate": 2.303770225795424e-06, + "loss": 1.1457, + "step": 1569 + }, + { + "epoch": 0.38171650863116946, + "grad_norm": 35.5, + "learning_rate": 2.30351020586908e-06, + "loss": 1.2839, + "step": 1570 + }, + { + "epoch": 0.38195964016532946, + "grad_norm": 21.5, + "learning_rate": 2.3032500284774407e-06, + "loss": 0.8459, + "step": 1571 + }, + { + "epoch": 0.3822027716994894, + "grad_norm": 18.625, + "learning_rate": 2.302989693659395e-06, + "loss": 0.959, + "step": 1572 + }, + { + "epoch": 0.3824459032336494, + "grad_norm": 17.375, + "learning_rate": 2.3027292014538533e-06, + "loss": 0.767, + "step": 1573 + }, + { + "epoch": 0.3826890347678094, + "grad_norm": 20.5, + "learning_rate": 2.3024685518997514e-06, + "loss": 1.0778, + "step": 1574 + }, + { + "epoch": 0.38293216630196936, + "grad_norm": 28.125, + "learning_rate": 2.3022077450360474e-06, + "loss": 1.1952, + "step": 1575 + }, + { + "epoch": 0.38317529783612936, + "grad_norm": 17.125, + "learning_rate": 2.3019467809017235e-06, + "loss": 1.0121, + "step": 1576 + }, + { + "epoch": 0.3834184293702893, + "grad_norm": 18.125, + "learning_rate": 2.301685659535786e-06, + "loss": 0.7615, + "step": 1577 + }, + { + "epoch": 0.3836615609044493, + "grad_norm": 27.125, + "learning_rate": 2.301424380977263e-06, + "loss": 1.1592, + "step": 1578 + }, + { + "epoch": 0.3839046924386093, + "grad_norm": 18.5, + "learning_rate": 2.301162945265208e-06, + "loss": 0.8433, + "step": 1579 + }, + { + "epoch": 0.38414782397276925, + "grad_norm": 21.125, + "learning_rate": 2.3009013524386963e-06, + "loss": 1.1105, + "step": 1580 + }, + { + "epoch": 0.38439095550692925, + "grad_norm": 16.75, + "learning_rate": 2.300639602536828e-06, + "loss": 0.7393, + "step": 1581 + }, + { + "epoch": 0.38463408704108926, + "grad_norm": 18.25, + "learning_rate": 2.3003776955987258e-06, + "loss": 0.6562, + "step": 1582 + }, + { + "epoch": 0.3848772185752492, + "grad_norm": 18.0, + "learning_rate": 2.3001156316635362e-06, + "loss": 1.1729, + "step": 1583 + }, + { + "epoch": 0.3851203501094092, + "grad_norm": 17.5, + "learning_rate": 2.2998534107704294e-06, + "loss": 0.7324, + "step": 1584 + }, + { + "epoch": 0.38536348164356915, + "grad_norm": 18.75, + "learning_rate": 2.2995910329585987e-06, + "loss": 0.9908, + "step": 1585 + }, + { + "epoch": 0.38560661317772915, + "grad_norm": 20.625, + "learning_rate": 2.2993284982672613e-06, + "loss": 0.6683, + "step": 1586 + }, + { + "epoch": 0.38584974471188915, + "grad_norm": 20.875, + "learning_rate": 2.2990658067356574e-06, + "loss": 0.9222, + "step": 1587 + }, + { + "epoch": 0.3860928762460491, + "grad_norm": 22.375, + "learning_rate": 2.2988029584030503e-06, + "loss": 0.7977, + "step": 1588 + }, + { + "epoch": 0.3863360077802091, + "grad_norm": 17.375, + "learning_rate": 2.2985399533087275e-06, + "loss": 0.919, + "step": 1589 + }, + { + "epoch": 0.38657913931436905, + "grad_norm": 16.75, + "learning_rate": 2.2982767914920002e-06, + "loss": 0.9644, + "step": 1590 + }, + { + "epoch": 0.38682227084852905, + "grad_norm": 15.8125, + "learning_rate": 2.2980134729922017e-06, + "loss": 0.9162, + "step": 1591 + }, + { + "epoch": 0.38706540238268905, + "grad_norm": 16.875, + "learning_rate": 2.29774999784869e-06, + "loss": 0.6784, + "step": 1592 + }, + { + "epoch": 0.387308533916849, + "grad_norm": 16.625, + "learning_rate": 2.2974863661008464e-06, + "loss": 0.9269, + "step": 1593 + }, + { + "epoch": 0.387551665451009, + "grad_norm": 19.375, + "learning_rate": 2.297222577788074e-06, + "loss": 1.1508, + "step": 1594 + }, + { + "epoch": 0.387794796985169, + "grad_norm": 16.5, + "learning_rate": 2.296958632949801e-06, + "loss": 0.7253, + "step": 1595 + }, + { + "epoch": 0.38803792851932895, + "grad_norm": 15.6875, + "learning_rate": 2.296694531625479e-06, + "loss": 0.7891, + "step": 1596 + }, + { + "epoch": 0.38828106005348895, + "grad_norm": 17.625, + "learning_rate": 2.2964302738545823e-06, + "loss": 0.6991, + "step": 1597 + }, + { + "epoch": 0.3885241915876489, + "grad_norm": 15.25, + "learning_rate": 2.2961658596766087e-06, + "loss": 0.9986, + "step": 1598 + }, + { + "epoch": 0.3887673231218089, + "grad_norm": 19.625, + "learning_rate": 2.2959012891310794e-06, + "loss": 0.7878, + "step": 1599 + }, + { + "epoch": 0.3890104546559689, + "grad_norm": 17.875, + "learning_rate": 2.2956365622575395e-06, + "loss": 0.9268, + "step": 1600 + }, + { + "epoch": 0.38925358619012884, + "grad_norm": 15.1875, + "learning_rate": 2.295371679095556e-06, + "loss": 0.7907, + "step": 1601 + }, + { + "epoch": 0.38949671772428884, + "grad_norm": 19.125, + "learning_rate": 2.295106639684721e-06, + "loss": 1.0365, + "step": 1602 + }, + { + "epoch": 0.38973984925844884, + "grad_norm": 20.375, + "learning_rate": 2.294841444064649e-06, + "loss": 0.909, + "step": 1603 + }, + { + "epoch": 0.3899829807926088, + "grad_norm": 20.0, + "learning_rate": 2.2945760922749783e-06, + "loss": 0.8828, + "step": 1604 + }, + { + "epoch": 0.3902261123267688, + "grad_norm": 18.25, + "learning_rate": 2.29431058435537e-06, + "loss": 0.7795, + "step": 1605 + }, + { + "epoch": 0.39046924386092874, + "grad_norm": 16.625, + "learning_rate": 2.2940449203455097e-06, + "loss": 0.5941, + "step": 1606 + }, + { + "epoch": 0.39071237539508874, + "grad_norm": 16.625, + "learning_rate": 2.293779100285104e-06, + "loss": 0.6087, + "step": 1607 + }, + { + "epoch": 0.39095550692924874, + "grad_norm": 23.5, + "learning_rate": 2.2935131242138855e-06, + "loss": 0.9364, + "step": 1608 + }, + { + "epoch": 0.3911986384634087, + "grad_norm": 19.75, + "learning_rate": 2.293246992171608e-06, + "loss": 1.2017, + "step": 1609 + }, + { + "epoch": 0.3914417699975687, + "grad_norm": 22.75, + "learning_rate": 2.2929807041980505e-06, + "loss": 0.9784, + "step": 1610 + }, + { + "epoch": 0.3916849015317287, + "grad_norm": 17.75, + "learning_rate": 2.2927142603330137e-06, + "loss": 0.5636, + "step": 1611 + }, + { + "epoch": 0.39192803306588864, + "grad_norm": 19.75, + "learning_rate": 2.2924476606163223e-06, + "loss": 1.1359, + "step": 1612 + }, + { + "epoch": 0.39217116460004864, + "grad_norm": 12.4375, + "learning_rate": 2.2921809050878245e-06, + "loss": 0.3779, + "step": 1613 + }, + { + "epoch": 0.3924142961342086, + "grad_norm": 16.25, + "learning_rate": 2.2919139937873915e-06, + "loss": 0.8724, + "step": 1614 + }, + { + "epoch": 0.3926574276683686, + "grad_norm": 17.875, + "learning_rate": 2.291646926754917e-06, + "loss": 0.8597, + "step": 1615 + }, + { + "epoch": 0.3929005592025286, + "grad_norm": 19.0, + "learning_rate": 2.29137970403032e-06, + "loss": 0.8451, + "step": 1616 + }, + { + "epoch": 0.39314369073668853, + "grad_norm": 19.5, + "learning_rate": 2.2911123256535407e-06, + "loss": 0.895, + "step": 1617 + }, + { + "epoch": 0.39338682227084854, + "grad_norm": 22.25, + "learning_rate": 2.2908447916645436e-06, + "loss": 1.1503, + "step": 1618 + }, + { + "epoch": 0.39362995380500854, + "grad_norm": 20.75, + "learning_rate": 2.2905771021033167e-06, + "loss": 0.9719, + "step": 1619 + }, + { + "epoch": 0.3938730853391685, + "grad_norm": 26.25, + "learning_rate": 2.290309257009871e-06, + "loss": 0.9519, + "step": 1620 + }, + { + "epoch": 0.3941162168733285, + "grad_norm": 19.0, + "learning_rate": 2.290041256424239e-06, + "loss": 1.1053, + "step": 1621 + }, + { + "epoch": 0.39435934840748843, + "grad_norm": 19.75, + "learning_rate": 2.2897731003864794e-06, + "loss": 0.975, + "step": 1622 + }, + { + "epoch": 0.39460247994164843, + "grad_norm": 18.0, + "learning_rate": 2.289504788936673e-06, + "loss": 0.7324, + "step": 1623 + }, + { + "epoch": 0.39484561147580843, + "grad_norm": 48.25, + "learning_rate": 2.2892363221149223e-06, + "loss": 1.228, + "step": 1624 + }, + { + "epoch": 0.3950887430099684, + "grad_norm": 19.5, + "learning_rate": 2.288967699961355e-06, + "loss": 1.047, + "step": 1625 + }, + { + "epoch": 0.3953318745441284, + "grad_norm": 22.0, + "learning_rate": 2.288698922516122e-06, + "loss": 1.057, + "step": 1626 + }, + { + "epoch": 0.3955750060782883, + "grad_norm": 23.75, + "learning_rate": 2.2884299898193958e-06, + "loss": 0.9644, + "step": 1627 + }, + { + "epoch": 0.39581813761244833, + "grad_norm": 21.75, + "learning_rate": 2.2881609019113735e-06, + "loss": 1.0534, + "step": 1628 + }, + { + "epoch": 0.39606126914660833, + "grad_norm": 18.625, + "learning_rate": 2.2878916588322744e-06, + "loss": 0.875, + "step": 1629 + }, + { + "epoch": 0.3963044006807683, + "grad_norm": 18.625, + "learning_rate": 2.287622260622342e-06, + "loss": 0.9888, + "step": 1630 + }, + { + "epoch": 0.3965475322149283, + "grad_norm": 20.625, + "learning_rate": 2.2873527073218424e-06, + "loss": 0.6882, + "step": 1631 + }, + { + "epoch": 0.3967906637490883, + "grad_norm": 20.875, + "learning_rate": 2.2870829989710653e-06, + "loss": 1.2406, + "step": 1632 + }, + { + "epoch": 0.3970337952832482, + "grad_norm": 19.25, + "learning_rate": 2.2868131356103226e-06, + "loss": 0.9361, + "step": 1633 + }, + { + "epoch": 0.3972769268174082, + "grad_norm": 17.875, + "learning_rate": 2.2865431172799504e-06, + "loss": 0.8386, + "step": 1634 + }, + { + "epoch": 0.3975200583515682, + "grad_norm": 21.5, + "learning_rate": 2.2862729440203078e-06, + "loss": 1.2811, + "step": 1635 + }, + { + "epoch": 0.3977631898857282, + "grad_norm": 15.6875, + "learning_rate": 2.286002615871776e-06, + "loss": 0.7588, + "step": 1636 + }, + { + "epoch": 0.3980063214198882, + "grad_norm": 20.25, + "learning_rate": 2.2857321328747615e-06, + "loss": 0.9633, + "step": 1637 + }, + { + "epoch": 0.3982494529540481, + "grad_norm": 18.875, + "learning_rate": 2.2854614950696923e-06, + "loss": 0.5652, + "step": 1638 + }, + { + "epoch": 0.3984925844882081, + "grad_norm": 19.375, + "learning_rate": 2.2851907024970196e-06, + "loss": 0.9574, + "step": 1639 + }, + { + "epoch": 0.3987357160223681, + "grad_norm": 24.25, + "learning_rate": 2.2849197551972173e-06, + "loss": 1.315, + "step": 1640 + }, + { + "epoch": 0.39897884755652807, + "grad_norm": 16.625, + "learning_rate": 2.284648653210784e-06, + "loss": 1.0215, + "step": 1641 + }, + { + "epoch": 0.3992219790906881, + "grad_norm": 20.0, + "learning_rate": 2.2843773965782413e-06, + "loss": 0.922, + "step": 1642 + }, + { + "epoch": 0.399465110624848, + "grad_norm": 23.75, + "learning_rate": 2.2841059853401315e-06, + "loss": 0.8942, + "step": 1643 + }, + { + "epoch": 0.399708242159008, + "grad_norm": 20.625, + "learning_rate": 2.283834419537023e-06, + "loss": 1.0954, + "step": 1644 + }, + { + "epoch": 0.399951373693168, + "grad_norm": 20.125, + "learning_rate": 2.2835626992095055e-06, + "loss": 0.8404, + "step": 1645 + }, + { + "epoch": 0.40019450522732797, + "grad_norm": 20.75, + "learning_rate": 2.2832908243981923e-06, + "loss": 0.9693, + "step": 1646 + }, + { + "epoch": 0.40043763676148797, + "grad_norm": 22.625, + "learning_rate": 2.28301879514372e-06, + "loss": 0.9124, + "step": 1647 + }, + { + "epoch": 0.40068076829564797, + "grad_norm": 19.5, + "learning_rate": 2.282746611486748e-06, + "loss": 0.8976, + "step": 1648 + }, + { + "epoch": 0.4009238998298079, + "grad_norm": 21.0, + "learning_rate": 2.2824742734679585e-06, + "loss": 1.1095, + "step": 1649 + }, + { + "epoch": 0.4011670313639679, + "grad_norm": 28.625, + "learning_rate": 2.2822017811280573e-06, + "loss": 1.1894, + "step": 1650 + }, + { + "epoch": 0.40141016289812786, + "grad_norm": 17.0, + "learning_rate": 2.281929134507773e-06, + "loss": 0.6871, + "step": 1651 + }, + { + "epoch": 0.40165329443228787, + "grad_norm": 17.5, + "learning_rate": 2.2816563336478582e-06, + "loss": 0.61, + "step": 1652 + }, + { + "epoch": 0.40189642596644787, + "grad_norm": 33.25, + "learning_rate": 2.2813833785890864e-06, + "loss": 1.0288, + "step": 1653 + }, + { + "epoch": 0.4021395575006078, + "grad_norm": 43.5, + "learning_rate": 2.2811102693722565e-06, + "loss": 1.2935, + "step": 1654 + }, + { + "epoch": 0.4023826890347678, + "grad_norm": 22.25, + "learning_rate": 2.280837006038189e-06, + "loss": 1.3191, + "step": 1655 + }, + { + "epoch": 0.4026258205689278, + "grad_norm": 12.9375, + "learning_rate": 2.280563588627727e-06, + "loss": 0.4425, + "step": 1656 + }, + { + "epoch": 0.40286895210308776, + "grad_norm": 16.25, + "learning_rate": 2.280290017181739e-06, + "loss": 0.7694, + "step": 1657 + }, + { + "epoch": 0.40311208363724776, + "grad_norm": 16.375, + "learning_rate": 2.280016291741114e-06, + "loss": 0.953, + "step": 1658 + }, + { + "epoch": 0.4033552151714077, + "grad_norm": 15.9375, + "learning_rate": 2.2797424123467656e-06, + "loss": 0.6801, + "step": 1659 + }, + { + "epoch": 0.4035983467055677, + "grad_norm": 21.5, + "learning_rate": 2.279468379039629e-06, + "loss": 0.6453, + "step": 1660 + }, + { + "epoch": 0.4038414782397277, + "grad_norm": 17.25, + "learning_rate": 2.279194191860663e-06, + "loss": 0.9078, + "step": 1661 + }, + { + "epoch": 0.40408460977388766, + "grad_norm": 18.125, + "learning_rate": 2.278919850850851e-06, + "loss": 0.8203, + "step": 1662 + }, + { + "epoch": 0.40432774130804766, + "grad_norm": 16.5, + "learning_rate": 2.2786453560511975e-06, + "loss": 0.537, + "step": 1663 + }, + { + "epoch": 0.4045708728422076, + "grad_norm": 18.625, + "learning_rate": 2.2783707075027295e-06, + "loss": 0.7241, + "step": 1664 + }, + { + "epoch": 0.4048140043763676, + "grad_norm": 18.25, + "learning_rate": 2.278095905246499e-06, + "loss": 0.9083, + "step": 1665 + }, + { + "epoch": 0.4050571359105276, + "grad_norm": 20.0, + "learning_rate": 2.2778209493235794e-06, + "loss": 0.7556, + "step": 1666 + }, + { + "epoch": 0.40530026744468756, + "grad_norm": 18.375, + "learning_rate": 2.277545839775068e-06, + "loss": 0.8547, + "step": 1667 + }, + { + "epoch": 0.40554339897884756, + "grad_norm": 19.125, + "learning_rate": 2.277270576642084e-06, + "loss": 0.9679, + "step": 1668 + }, + { + "epoch": 0.40578653051300756, + "grad_norm": 16.625, + "learning_rate": 2.276995159965772e-06, + "loss": 0.6753, + "step": 1669 + }, + { + "epoch": 0.4060296620471675, + "grad_norm": 26.5, + "learning_rate": 2.2767195897872955e-06, + "loss": 0.552, + "step": 1670 + }, + { + "epoch": 0.4062727935813275, + "grad_norm": 35.25, + "learning_rate": 2.276443866147845e-06, + "loss": 1.304, + "step": 1671 + }, + { + "epoch": 0.40651592511548745, + "grad_norm": 17.25, + "learning_rate": 2.2761679890886307e-06, + "loss": 0.7139, + "step": 1672 + }, + { + "epoch": 0.40675905664964745, + "grad_norm": 15.8125, + "learning_rate": 2.275891958650888e-06, + "loss": 0.6474, + "step": 1673 + }, + { + "epoch": 0.40700218818380746, + "grad_norm": 17.25, + "learning_rate": 2.2756157748758745e-06, + "loss": 1.0807, + "step": 1674 + }, + { + "epoch": 0.4072453197179674, + "grad_norm": 15.6875, + "learning_rate": 2.2753394378048705e-06, + "loss": 0.5286, + "step": 1675 + }, + { + "epoch": 0.4074884512521274, + "grad_norm": 19.625, + "learning_rate": 2.2750629474791792e-06, + "loss": 0.8319, + "step": 1676 + }, + { + "epoch": 0.4077315827862874, + "grad_norm": 25.5, + "learning_rate": 2.2747863039401267e-06, + "loss": 0.8369, + "step": 1677 + }, + { + "epoch": 0.40797471432044735, + "grad_norm": 19.75, + "learning_rate": 2.274509507229063e-06, + "loss": 0.6813, + "step": 1678 + }, + { + "epoch": 0.40821784585460735, + "grad_norm": 20.0, + "learning_rate": 2.274232557387359e-06, + "loss": 1.2366, + "step": 1679 + }, + { + "epoch": 0.4084609773887673, + "grad_norm": 19.125, + "learning_rate": 2.2739554544564107e-06, + "loss": 0.8795, + "step": 1680 + }, + { + "epoch": 0.4087041089229273, + "grad_norm": 16.0, + "learning_rate": 2.2736781984776354e-06, + "loss": 0.7821, + "step": 1681 + }, + { + "epoch": 0.4089472404570873, + "grad_norm": 18.0, + "learning_rate": 2.273400789492473e-06, + "loss": 0.8354, + "step": 1682 + }, + { + "epoch": 0.40919037199124725, + "grad_norm": 17.375, + "learning_rate": 2.2731232275423886e-06, + "loss": 0.8526, + "step": 1683 + }, + { + "epoch": 0.40943350352540725, + "grad_norm": 27.75, + "learning_rate": 2.272845512668868e-06, + "loss": 1.19, + "step": 1684 + }, + { + "epoch": 0.40967663505956725, + "grad_norm": 18.75, + "learning_rate": 2.27256764491342e-06, + "loss": 1.0011, + "step": 1685 + }, + { + "epoch": 0.4099197665937272, + "grad_norm": 20.875, + "learning_rate": 2.2722896243175767e-06, + "loss": 0.9363, + "step": 1686 + }, + { + "epoch": 0.4101628981278872, + "grad_norm": 21.875, + "learning_rate": 2.272011450922894e-06, + "loss": 0.6616, + "step": 1687 + }, + { + "epoch": 0.41040602966204714, + "grad_norm": 24.625, + "learning_rate": 2.2717331247709496e-06, + "loss": 0.9938, + "step": 1688 + }, + { + "epoch": 0.41064916119620715, + "grad_norm": 19.375, + "learning_rate": 2.271454645903343e-06, + "loss": 0.7729, + "step": 1689 + }, + { + "epoch": 0.41089229273036715, + "grad_norm": 18.375, + "learning_rate": 2.271176014361699e-06, + "loss": 1.2154, + "step": 1690 + }, + { + "epoch": 0.4111354242645271, + "grad_norm": 20.625, + "learning_rate": 2.270897230187663e-06, + "loss": 0.8767, + "step": 1691 + }, + { + "epoch": 0.4113785557986871, + "grad_norm": 17.625, + "learning_rate": 2.270618293422905e-06, + "loss": 0.8918, + "step": 1692 + }, + { + "epoch": 0.4116216873328471, + "grad_norm": 17.25, + "learning_rate": 2.2703392041091156e-06, + "loss": 0.7898, + "step": 1693 + }, + { + "epoch": 0.41186481886700704, + "grad_norm": 21.0, + "learning_rate": 2.2700599622880106e-06, + "loss": 1.0549, + "step": 1694 + }, + { + "epoch": 0.41210795040116704, + "grad_norm": 22.625, + "learning_rate": 2.2697805680013274e-06, + "loss": 0.7342, + "step": 1695 + }, + { + "epoch": 0.412351081935327, + "grad_norm": 20.375, + "learning_rate": 2.2695010212908256e-06, + "loss": 1.0939, + "step": 1696 + }, + { + "epoch": 0.412594213469487, + "grad_norm": 22.5, + "learning_rate": 2.269221322198289e-06, + "loss": 0.7617, + "step": 1697 + }, + { + "epoch": 0.412837345003647, + "grad_norm": 20.125, + "learning_rate": 2.2689414707655233e-06, + "loss": 1.1387, + "step": 1698 + }, + { + "epoch": 0.41308047653780694, + "grad_norm": 18.125, + "learning_rate": 2.268661467034357e-06, + "loss": 0.7587, + "step": 1699 + }, + { + "epoch": 0.41332360807196694, + "grad_norm": 16.375, + "learning_rate": 2.2683813110466417e-06, + "loss": 0.7771, + "step": 1700 + }, + { + "epoch": 0.4135667396061269, + "grad_norm": 20.875, + "learning_rate": 2.2681010028442517e-06, + "loss": 0.9262, + "step": 1701 + }, + { + "epoch": 0.4138098711402869, + "grad_norm": 20.75, + "learning_rate": 2.267820542469083e-06, + "loss": 1.1631, + "step": 1702 + }, + { + "epoch": 0.4140530026744469, + "grad_norm": 16.625, + "learning_rate": 2.2675399299630563e-06, + "loss": 0.45, + "step": 1703 + }, + { + "epoch": 0.41429613420860684, + "grad_norm": 21.125, + "learning_rate": 2.267259165368113e-06, + "loss": 0.96, + "step": 1704 + }, + { + "epoch": 0.41453926574276684, + "grad_norm": 20.375, + "learning_rate": 2.2669782487262193e-06, + "loss": 0.9909, + "step": 1705 + }, + { + "epoch": 0.41478239727692684, + "grad_norm": 17.125, + "learning_rate": 2.2666971800793625e-06, + "loss": 0.535, + "step": 1706 + }, + { + "epoch": 0.4150255288110868, + "grad_norm": 17.375, + "learning_rate": 2.2664159594695527e-06, + "loss": 0.8767, + "step": 1707 + }, + { + "epoch": 0.4152686603452468, + "grad_norm": 18.125, + "learning_rate": 2.266134586938824e-06, + "loss": 0.6714, + "step": 1708 + }, + { + "epoch": 0.41551179187940673, + "grad_norm": 21.625, + "learning_rate": 2.265853062529232e-06, + "loss": 0.9772, + "step": 1709 + }, + { + "epoch": 0.41575492341356673, + "grad_norm": 18.25, + "learning_rate": 2.2655713862828554e-06, + "loss": 0.8306, + "step": 1710 + }, + { + "epoch": 0.41599805494772674, + "grad_norm": 15.375, + "learning_rate": 2.2652895582417955e-06, + "loss": 0.6248, + "step": 1711 + }, + { + "epoch": 0.4162411864818867, + "grad_norm": 20.375, + "learning_rate": 2.2650075784481767e-06, + "loss": 1.3364, + "step": 1712 + }, + { + "epoch": 0.4164843180160467, + "grad_norm": 15.5625, + "learning_rate": 2.2647254469441456e-06, + "loss": 0.7473, + "step": 1713 + }, + { + "epoch": 0.4167274495502067, + "grad_norm": 15.625, + "learning_rate": 2.2644431637718713e-06, + "loss": 0.5194, + "step": 1714 + }, + { + "epoch": 0.41697058108436663, + "grad_norm": 17.25, + "learning_rate": 2.2641607289735455e-06, + "loss": 1.151, + "step": 1715 + }, + { + "epoch": 0.41721371261852663, + "grad_norm": 21.125, + "learning_rate": 2.2638781425913846e-06, + "loss": 0.9437, + "step": 1716 + }, + { + "epoch": 0.4174568441526866, + "grad_norm": 14.9375, + "learning_rate": 2.2635954046676247e-06, + "loss": 0.7822, + "step": 1717 + }, + { + "epoch": 0.4176999756868466, + "grad_norm": 23.0, + "learning_rate": 2.263312515244526e-06, + "loss": 1.0253, + "step": 1718 + }, + { + "epoch": 0.4179431072210066, + "grad_norm": 22.25, + "learning_rate": 2.2630294743643717e-06, + "loss": 1.1386, + "step": 1719 + }, + { + "epoch": 0.41818623875516653, + "grad_norm": 19.5, + "learning_rate": 2.2627462820694664e-06, + "loss": 1.0932, + "step": 1720 + }, + { + "epoch": 0.41842937028932653, + "grad_norm": 17.75, + "learning_rate": 2.2624629384021386e-06, + "loss": 0.7368, + "step": 1721 + }, + { + "epoch": 0.41867250182348653, + "grad_norm": 18.5, + "learning_rate": 2.262179443404739e-06, + "loss": 0.9348, + "step": 1722 + }, + { + "epoch": 0.4189156333576465, + "grad_norm": 18.125, + "learning_rate": 2.2618957971196402e-06, + "loss": 0.8927, + "step": 1723 + }, + { + "epoch": 0.4191587648918065, + "grad_norm": 16.625, + "learning_rate": 2.2616119995892394e-06, + "loss": 0.7519, + "step": 1724 + }, + { + "epoch": 0.4194018964259664, + "grad_norm": 25.375, + "learning_rate": 2.2613280508559536e-06, + "loss": 1.2584, + "step": 1725 + }, + { + "epoch": 0.4196450279601264, + "grad_norm": 22.5, + "learning_rate": 2.261043950962224e-06, + "loss": 0.8007, + "step": 1726 + }, + { + "epoch": 0.41988815949428643, + "grad_norm": 18.625, + "learning_rate": 2.260759699950515e-06, + "loss": 0.7818, + "step": 1727 + }, + { + "epoch": 0.4201312910284464, + "grad_norm": 35.25, + "learning_rate": 2.2604752978633124e-06, + "loss": 0.8461, + "step": 1728 + }, + { + "epoch": 0.4203744225626064, + "grad_norm": 24.0, + "learning_rate": 2.2601907447431247e-06, + "loss": 1.2568, + "step": 1729 + }, + { + "epoch": 0.4206175540967664, + "grad_norm": 68.0, + "learning_rate": 2.2599060406324842e-06, + "loss": 1.2069, + "step": 1730 + }, + { + "epoch": 0.4208606856309263, + "grad_norm": 21.0, + "learning_rate": 2.259621185573944e-06, + "loss": 0.9982, + "step": 1731 + }, + { + "epoch": 0.4211038171650863, + "grad_norm": 29.75, + "learning_rate": 2.2593361796100803e-06, + "loss": 1.2981, + "step": 1732 + }, + { + "epoch": 0.42134694869924627, + "grad_norm": 22.125, + "learning_rate": 2.2590510227834937e-06, + "loss": 1.6931, + "step": 1733 + }, + { + "epoch": 0.4215900802334063, + "grad_norm": 21.375, + "learning_rate": 2.2587657151368044e-06, + "loss": 0.766, + "step": 1734 + }, + { + "epoch": 0.4218332117675663, + "grad_norm": 27.375, + "learning_rate": 2.2584802567126567e-06, + "loss": 0.9785, + "step": 1735 + }, + { + "epoch": 0.4220763433017262, + "grad_norm": 22.5, + "learning_rate": 2.2581946475537177e-06, + "loss": 1.2253, + "step": 1736 + }, + { + "epoch": 0.4223194748358862, + "grad_norm": 21.625, + "learning_rate": 2.2579088877026767e-06, + "loss": 0.7935, + "step": 1737 + }, + { + "epoch": 0.42256260637004617, + "grad_norm": 20.25, + "learning_rate": 2.2576229772022452e-06, + "loss": 0.8855, + "step": 1738 + }, + { + "epoch": 0.42280573790420617, + "grad_norm": 18.125, + "learning_rate": 2.2573369160951574e-06, + "loss": 0.9769, + "step": 1739 + }, + { + "epoch": 0.42304886943836617, + "grad_norm": 22.75, + "learning_rate": 2.2570507044241702e-06, + "loss": 0.9223, + "step": 1740 + }, + { + "epoch": 0.4232920009725261, + "grad_norm": 17.0, + "learning_rate": 2.256764342232063e-06, + "loss": 0.6006, + "step": 1741 + }, + { + "epoch": 0.4235351325066861, + "grad_norm": 25.5, + "learning_rate": 2.2564778295616373e-06, + "loss": 1.0001, + "step": 1742 + }, + { + "epoch": 0.4237782640408461, + "grad_norm": 20.375, + "learning_rate": 2.2561911664557173e-06, + "loss": 0.7565, + "step": 1743 + }, + { + "epoch": 0.42402139557500607, + "grad_norm": 18.625, + "learning_rate": 2.25590435295715e-06, + "loss": 0.4233, + "step": 1744 + }, + { + "epoch": 0.42426452710916607, + "grad_norm": 21.75, + "learning_rate": 2.2556173891088047e-06, + "loss": 0.7461, + "step": 1745 + }, + { + "epoch": 0.424507658643326, + "grad_norm": 21.5, + "learning_rate": 2.2553302749535733e-06, + "loss": 0.7831, + "step": 1746 + }, + { + "epoch": 0.424750790177486, + "grad_norm": 40.25, + "learning_rate": 2.255043010534369e-06, + "loss": 0.9118, + "step": 1747 + }, + { + "epoch": 0.424993921711646, + "grad_norm": 23.5, + "learning_rate": 2.2547555958941296e-06, + "loss": 0.9295, + "step": 1748 + }, + { + "epoch": 0.42523705324580596, + "grad_norm": 20.625, + "learning_rate": 2.2544680310758136e-06, + "loss": 0.8796, + "step": 1749 + }, + { + "epoch": 0.42548018477996596, + "grad_norm": 16.125, + "learning_rate": 2.2541803161224023e-06, + "loss": 0.7127, + "step": 1750 + }, + { + "epoch": 0.42572331631412597, + "grad_norm": 18.5, + "learning_rate": 2.2538924510769004e-06, + "loss": 0.6023, + "step": 1751 + }, + { + "epoch": 0.4259664478482859, + "grad_norm": 17.5, + "learning_rate": 2.2536044359823338e-06, + "loss": 0.9475, + "step": 1752 + }, + { + "epoch": 0.4262095793824459, + "grad_norm": 25.25, + "learning_rate": 2.253316270881751e-06, + "loss": 0.909, + "step": 1753 + }, + { + "epoch": 0.42645271091660586, + "grad_norm": 24.375, + "learning_rate": 2.253027955818224e-06, + "loss": 1.1146, + "step": 1754 + }, + { + "epoch": 0.42669584245076586, + "grad_norm": 18.0, + "learning_rate": 2.252739490834846e-06, + "loss": 0.6651, + "step": 1755 + }, + { + "epoch": 0.42693897398492586, + "grad_norm": 20.5, + "learning_rate": 2.252450875974733e-06, + "loss": 1.4401, + "step": 1756 + }, + { + "epoch": 0.4271821055190858, + "grad_norm": 21.625, + "learning_rate": 2.2521621112810236e-06, + "loss": 0.8544, + "step": 1757 + }, + { + "epoch": 0.4274252370532458, + "grad_norm": 16.75, + "learning_rate": 2.2518731967968794e-06, + "loss": 0.8524, + "step": 1758 + }, + { + "epoch": 0.4276683685874058, + "grad_norm": 19.5, + "learning_rate": 2.2515841325654824e-06, + "loss": 0.7561, + "step": 1759 + }, + { + "epoch": 0.42791150012156576, + "grad_norm": 20.625, + "learning_rate": 2.251294918630039e-06, + "loss": 0.7684, + "step": 1760 + }, + { + "epoch": 0.42815463165572576, + "grad_norm": 35.25, + "learning_rate": 2.251005555033777e-06, + "loss": 1.1054, + "step": 1761 + }, + { + "epoch": 0.4283977631898857, + "grad_norm": 20.0, + "learning_rate": 2.250716041819947e-06, + "loss": 1.2366, + "step": 1762 + }, + { + "epoch": 0.4286408947240457, + "grad_norm": 15.0, + "learning_rate": 2.2504263790318215e-06, + "loss": 0.6197, + "step": 1763 + }, + { + "epoch": 0.4288840262582057, + "grad_norm": 20.5, + "learning_rate": 2.2501365667126954e-06, + "loss": 1.1374, + "step": 1764 + }, + { + "epoch": 0.42912715779236565, + "grad_norm": 24.75, + "learning_rate": 2.2498466049058866e-06, + "loss": 0.9917, + "step": 1765 + }, + { + "epoch": 0.42937028932652566, + "grad_norm": 22.125, + "learning_rate": 2.249556493654735e-06, + "loss": 1.0051, + "step": 1766 + }, + { + "epoch": 0.42961342086068566, + "grad_norm": 27.125, + "learning_rate": 2.249266233002602e-06, + "loss": 0.9072, + "step": 1767 + }, + { + "epoch": 0.4298565523948456, + "grad_norm": 16.625, + "learning_rate": 2.248975822992873e-06, + "loss": 0.8493, + "step": 1768 + }, + { + "epoch": 0.4300996839290056, + "grad_norm": 16.5, + "learning_rate": 2.248685263668954e-06, + "loss": 0.7018, + "step": 1769 + }, + { + "epoch": 0.43034281546316555, + "grad_norm": 44.75, + "learning_rate": 2.248394555074275e-06, + "loss": 0.9015, + "step": 1770 + }, + { + "epoch": 0.43058594699732555, + "grad_norm": 17.25, + "learning_rate": 2.248103697252287e-06, + "loss": 0.9825, + "step": 1771 + }, + { + "epoch": 0.43082907853148555, + "grad_norm": 17.25, + "learning_rate": 2.247812690246463e-06, + "loss": 0.7986, + "step": 1772 + }, + { + "epoch": 0.4310722100656455, + "grad_norm": 21.125, + "learning_rate": 2.2475215341002998e-06, + "loss": 0.8086, + "step": 1773 + }, + { + "epoch": 0.4313153415998055, + "grad_norm": 15.5, + "learning_rate": 2.2472302288573153e-06, + "loss": 0.6696, + "step": 1774 + }, + { + "epoch": 0.43155847313396545, + "grad_norm": 21.75, + "learning_rate": 2.2469387745610504e-06, + "loss": 1.0677, + "step": 1775 + }, + { + "epoch": 0.43180160466812545, + "grad_norm": 22.375, + "learning_rate": 2.2466471712550682e-06, + "loss": 0.8187, + "step": 1776 + }, + { + "epoch": 0.43204473620228545, + "grad_norm": 18.0, + "learning_rate": 2.2463554189829534e-06, + "loss": 0.632, + "step": 1777 + }, + { + "epoch": 0.4322878677364454, + "grad_norm": 24.125, + "learning_rate": 2.2460635177883137e-06, + "loss": 0.8745, + "step": 1778 + }, + { + "epoch": 0.4325309992706054, + "grad_norm": 20.375, + "learning_rate": 2.2457714677147786e-06, + "loss": 0.9126, + "step": 1779 + }, + { + "epoch": 0.4327741308047654, + "grad_norm": 16.125, + "learning_rate": 2.2454792688060002e-06, + "loss": 0.6082, + "step": 1780 + }, + { + "epoch": 0.43301726233892535, + "grad_norm": 17.375, + "learning_rate": 2.245186921105652e-06, + "loss": 0.6234, + "step": 1781 + }, + { + "epoch": 0.43326039387308535, + "grad_norm": 32.25, + "learning_rate": 2.2448944246574314e-06, + "loss": 1.3383, + "step": 1782 + }, + { + "epoch": 0.4335035254072453, + "grad_norm": 22.0, + "learning_rate": 2.2446017795050564e-06, + "loss": 0.7792, + "step": 1783 + }, + { + "epoch": 0.4337466569414053, + "grad_norm": 23.0, + "learning_rate": 2.2443089856922683e-06, + "loss": 1.1139, + "step": 1784 + }, + { + "epoch": 0.4339897884755653, + "grad_norm": 28.0, + "learning_rate": 2.24401604326283e-06, + "loss": 1.2003, + "step": 1785 + }, + { + "epoch": 0.43423292000972524, + "grad_norm": 20.25, + "learning_rate": 2.243722952260527e-06, + "loss": 1.0905, + "step": 1786 + }, + { + "epoch": 0.43447605154388524, + "grad_norm": 16.5, + "learning_rate": 2.243429712729166e-06, + "loss": 0.6541, + "step": 1787 + }, + { + "epoch": 0.43471918307804525, + "grad_norm": 24.0, + "learning_rate": 2.2431363247125777e-06, + "loss": 0.5291, + "step": 1788 + }, + { + "epoch": 0.4349623146122052, + "grad_norm": 30.375, + "learning_rate": 2.2428427882546136e-06, + "loss": 1.2832, + "step": 1789 + }, + { + "epoch": 0.4352054461463652, + "grad_norm": 16.5, + "learning_rate": 2.2425491033991474e-06, + "loss": 0.8466, + "step": 1790 + }, + { + "epoch": 0.43544857768052514, + "grad_norm": 31.375, + "learning_rate": 2.242255270190076e-06, + "loss": 0.9529, + "step": 1791 + }, + { + "epoch": 0.43569170921468514, + "grad_norm": 19.75, + "learning_rate": 2.241961288671318e-06, + "loss": 0.752, + "step": 1792 + }, + { + "epoch": 0.43593484074884514, + "grad_norm": 22.0, + "learning_rate": 2.2416671588868136e-06, + "loss": 0.766, + "step": 1793 + }, + { + "epoch": 0.4361779722830051, + "grad_norm": 16.875, + "learning_rate": 2.2413728808805256e-06, + "loss": 1.022, + "step": 1794 + }, + { + "epoch": 0.4364211038171651, + "grad_norm": 24.125, + "learning_rate": 2.2410784546964385e-06, + "loss": 0.9557, + "step": 1795 + }, + { + "epoch": 0.4366642353513251, + "grad_norm": 16.875, + "learning_rate": 2.2407838803785604e-06, + "loss": 0.8656, + "step": 1796 + }, + { + "epoch": 0.43690736688548504, + "grad_norm": 27.875, + "learning_rate": 2.240489157970919e-06, + "loss": 0.9394, + "step": 1797 + }, + { + "epoch": 0.43715049841964504, + "grad_norm": 23.0, + "learning_rate": 2.2401942875175675e-06, + "loss": 1.0365, + "step": 1798 + }, + { + "epoch": 0.437393629953805, + "grad_norm": 34.25, + "learning_rate": 2.2398992690625785e-06, + "loss": 1.6354, + "step": 1799 + }, + { + "epoch": 0.437636761487965, + "grad_norm": 20.75, + "learning_rate": 2.239604102650047e-06, + "loss": 1.0085, + "step": 1800 + }, + { + "epoch": 0.437879893022125, + "grad_norm": 15.8125, + "learning_rate": 2.2393087883240917e-06, + "loss": 0.7051, + "step": 1801 + }, + { + "epoch": 0.43812302455628493, + "grad_norm": 21.625, + "learning_rate": 2.2390133261288523e-06, + "loss": 0.8884, + "step": 1802 + }, + { + "epoch": 0.43836615609044494, + "grad_norm": 17.5, + "learning_rate": 2.23871771610849e-06, + "loss": 0.5416, + "step": 1803 + }, + { + "epoch": 0.43860928762460494, + "grad_norm": 33.5, + "learning_rate": 2.238421958307189e-06, + "loss": 1.0586, + "step": 1804 + }, + { + "epoch": 0.4388524191587649, + "grad_norm": 26.5, + "learning_rate": 2.238126052769156e-06, + "loss": 0.9796, + "step": 1805 + }, + { + "epoch": 0.4390955506929249, + "grad_norm": 22.0, + "learning_rate": 2.2378299995386194e-06, + "loss": 0.6153, + "step": 1806 + }, + { + "epoch": 0.43933868222708483, + "grad_norm": 20.75, + "learning_rate": 2.2375337986598282e-06, + "loss": 0.9212, + "step": 1807 + }, + { + "epoch": 0.43958181376124483, + "grad_norm": 23.25, + "learning_rate": 2.237237450177056e-06, + "loss": 1.2888, + "step": 1808 + }, + { + "epoch": 0.43982494529540483, + "grad_norm": 15.0625, + "learning_rate": 2.2369409541345967e-06, + "loss": 0.6865, + "step": 1809 + }, + { + "epoch": 0.4400680768295648, + "grad_norm": 16.75, + "learning_rate": 2.2366443105767667e-06, + "loss": 0.8341, + "step": 1810 + }, + { + "epoch": 0.4403112083637248, + "grad_norm": 26.125, + "learning_rate": 2.236347519547904e-06, + "loss": 1.2071, + "step": 1811 + }, + { + "epoch": 0.44055433989788473, + "grad_norm": 15.8125, + "learning_rate": 2.23605058109237e-06, + "loss": 0.6755, + "step": 1812 + }, + { + "epoch": 0.44079747143204473, + "grad_norm": 18.25, + "learning_rate": 2.235753495254547e-06, + "loss": 0.6688, + "step": 1813 + }, + { + "epoch": 0.44104060296620473, + "grad_norm": 17.25, + "learning_rate": 2.23545626207884e-06, + "loss": 0.7241, + "step": 1814 + }, + { + "epoch": 0.4412837345003647, + "grad_norm": 18.875, + "learning_rate": 2.235158881609675e-06, + "loss": 0.7078, + "step": 1815 + }, + { + "epoch": 0.4415268660345247, + "grad_norm": 17.625, + "learning_rate": 2.2348613538915004e-06, + "loss": 0.8524, + "step": 1816 + }, + { + "epoch": 0.4417699975686847, + "grad_norm": 17.625, + "learning_rate": 2.234563678968788e-06, + "loss": 1.007, + "step": 1817 + }, + { + "epoch": 0.4420131291028446, + "grad_norm": 14.8125, + "learning_rate": 2.2342658568860292e-06, + "loss": 0.6466, + "step": 1818 + }, + { + "epoch": 0.44225626063700463, + "grad_norm": 21.875, + "learning_rate": 2.2339678876877393e-06, + "loss": 1.3698, + "step": 1819 + }, + { + "epoch": 0.4424993921711646, + "grad_norm": 21.375, + "learning_rate": 2.233669771418455e-06, + "loss": 0.5664, + "step": 1820 + }, + { + "epoch": 0.4427425237053246, + "grad_norm": 22.875, + "learning_rate": 2.2333715081227347e-06, + "loss": 0.7115, + "step": 1821 + }, + { + "epoch": 0.4429856552394846, + "grad_norm": 18.625, + "learning_rate": 2.2330730978451593e-06, + "loss": 1.0623, + "step": 1822 + }, + { + "epoch": 0.4432287867736445, + "grad_norm": 15.6875, + "learning_rate": 2.2327745406303314e-06, + "loss": 0.7299, + "step": 1823 + }, + { + "epoch": 0.4434719183078045, + "grad_norm": 19.375, + "learning_rate": 2.2324758365228745e-06, + "loss": 0.7466, + "step": 1824 + }, + { + "epoch": 0.4437150498419645, + "grad_norm": 17.125, + "learning_rate": 2.2321769855674365e-06, + "loss": 0.6095, + "step": 1825 + }, + { + "epoch": 0.44395818137612447, + "grad_norm": 21.5, + "learning_rate": 2.2318779878086853e-06, + "loss": 0.847, + "step": 1826 + }, + { + "epoch": 0.4442013129102845, + "grad_norm": 18.5, + "learning_rate": 2.231578843291311e-06, + "loss": 1.123, + "step": 1827 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 15.375, + "learning_rate": 2.231279552060026e-06, + "loss": 0.3888, + "step": 1828 + }, + { + "epoch": 0.4446875759786044, + "grad_norm": 16.875, + "learning_rate": 2.230980114159565e-06, + "loss": 0.6976, + "step": 1829 + }, + { + "epoch": 0.4449307075127644, + "grad_norm": 14.8125, + "learning_rate": 2.2306805296346836e-06, + "loss": 0.5196, + "step": 1830 + }, + { + "epoch": 0.44517383904692437, + "grad_norm": 39.25, + "learning_rate": 2.23038079853016e-06, + "loss": 0.6276, + "step": 1831 + }, + { + "epoch": 0.44541697058108437, + "grad_norm": 18.5, + "learning_rate": 2.2300809208907943e-06, + "loss": 0.743, + "step": 1832 + }, + { + "epoch": 0.44566010211524437, + "grad_norm": 17.375, + "learning_rate": 2.2297808967614085e-06, + "loss": 0.9677, + "step": 1833 + }, + { + "epoch": 0.4459032336494043, + "grad_norm": 22.625, + "learning_rate": 2.2294807261868463e-06, + "loss": 0.7887, + "step": 1834 + }, + { + "epoch": 0.4461463651835643, + "grad_norm": 17.5, + "learning_rate": 2.2291804092119736e-06, + "loss": 0.8214, + "step": 1835 + }, + { + "epoch": 0.44638949671772427, + "grad_norm": 18.875, + "learning_rate": 2.228879945881677e-06, + "loss": 0.6916, + "step": 1836 + }, + { + "epoch": 0.44663262825188427, + "grad_norm": 16.375, + "learning_rate": 2.228579336240867e-06, + "loss": 0.7197, + "step": 1837 + }, + { + "epoch": 0.44687575978604427, + "grad_norm": 19.5, + "learning_rate": 2.228278580334475e-06, + "loss": 1.1411, + "step": 1838 + }, + { + "epoch": 0.4471188913202042, + "grad_norm": 17.375, + "learning_rate": 2.227977678207453e-06, + "loss": 1.0894, + "step": 1839 + }, + { + "epoch": 0.4473620228543642, + "grad_norm": 20.375, + "learning_rate": 2.227676629904777e-06, + "loss": 0.9801, + "step": 1840 + }, + { + "epoch": 0.4476051543885242, + "grad_norm": 22.25, + "learning_rate": 2.2273754354714437e-06, + "loss": 1.063, + "step": 1841 + }, + { + "epoch": 0.44784828592268416, + "grad_norm": 19.0, + "learning_rate": 2.2270740949524717e-06, + "loss": 0.7653, + "step": 1842 + }, + { + "epoch": 0.44809141745684417, + "grad_norm": 22.375, + "learning_rate": 2.2267726083929015e-06, + "loss": 0.9432, + "step": 1843 + }, + { + "epoch": 0.4483345489910041, + "grad_norm": 16.5, + "learning_rate": 2.2264709758377957e-06, + "loss": 0.6867, + "step": 1844 + }, + { + "epoch": 0.4485776805251641, + "grad_norm": 15.75, + "learning_rate": 2.226169197332238e-06, + "loss": 0.4329, + "step": 1845 + }, + { + "epoch": 0.4488208120593241, + "grad_norm": 18.625, + "learning_rate": 2.225867272921335e-06, + "loss": 0.8247, + "step": 1846 + }, + { + "epoch": 0.44906394359348406, + "grad_norm": 17.375, + "learning_rate": 2.2255652026502144e-06, + "loss": 0.8816, + "step": 1847 + }, + { + "epoch": 0.44930707512764406, + "grad_norm": 16.625, + "learning_rate": 2.225262986564025e-06, + "loss": 0.6855, + "step": 1848 + }, + { + "epoch": 0.449550206661804, + "grad_norm": 21.375, + "learning_rate": 2.2249606247079397e-06, + "loss": 0.9725, + "step": 1849 + }, + { + "epoch": 0.449793338195964, + "grad_norm": 19.75, + "learning_rate": 2.2246581171271503e-06, + "loss": 0.8016, + "step": 1850 + }, + { + "epoch": 0.450036469730124, + "grad_norm": 23.0, + "learning_rate": 2.2243554638668727e-06, + "loss": 0.8972, + "step": 1851 + }, + { + "epoch": 0.45027960126428396, + "grad_norm": 22.5, + "learning_rate": 2.2240526649723433e-06, + "loss": 0.6963, + "step": 1852 + }, + { + "epoch": 0.45052273279844396, + "grad_norm": 15.375, + "learning_rate": 2.2237497204888205e-06, + "loss": 0.4336, + "step": 1853 + }, + { + "epoch": 0.45076586433260396, + "grad_norm": 22.25, + "learning_rate": 2.223446630461585e-06, + "loss": 0.962, + "step": 1854 + }, + { + "epoch": 0.4510089958667639, + "grad_norm": 17.875, + "learning_rate": 2.2231433949359384e-06, + "loss": 0.7468, + "step": 1855 + }, + { + "epoch": 0.4512521274009239, + "grad_norm": 19.125, + "learning_rate": 2.2228400139572043e-06, + "loss": 1.029, + "step": 1856 + }, + { + "epoch": 0.45149525893508385, + "grad_norm": 21.25, + "learning_rate": 2.222536487570729e-06, + "loss": 0.9867, + "step": 1857 + }, + { + "epoch": 0.45173839046924386, + "grad_norm": 16.5, + "learning_rate": 2.2222328158218793e-06, + "loss": 0.7912, + "step": 1858 + }, + { + "epoch": 0.45198152200340386, + "grad_norm": 21.375, + "learning_rate": 2.221928998756044e-06, + "loss": 1.0065, + "step": 1859 + }, + { + "epoch": 0.4522246535375638, + "grad_norm": 18.625, + "learning_rate": 2.2216250364186344e-06, + "loss": 0.8939, + "step": 1860 + }, + { + "epoch": 0.4524677850717238, + "grad_norm": 16.25, + "learning_rate": 2.2213209288550826e-06, + "loss": 0.7344, + "step": 1861 + }, + { + "epoch": 0.4527109166058838, + "grad_norm": 18.625, + "learning_rate": 2.2210166761108422e-06, + "loss": 0.9777, + "step": 1862 + }, + { + "epoch": 0.45295404814004375, + "grad_norm": 16.375, + "learning_rate": 2.2207122782313895e-06, + "loss": 0.6445, + "step": 1863 + }, + { + "epoch": 0.45319717967420375, + "grad_norm": 16.625, + "learning_rate": 2.220407735262223e-06, + "loss": 0.6399, + "step": 1864 + }, + { + "epoch": 0.4534403112083637, + "grad_norm": 22.25, + "learning_rate": 2.22010304724886e-06, + "loss": 0.6662, + "step": 1865 + }, + { + "epoch": 0.4536834427425237, + "grad_norm": 17.125, + "learning_rate": 2.2197982142368423e-06, + "loss": 0.8791, + "step": 1866 + }, + { + "epoch": 0.4539265742766837, + "grad_norm": 17.5, + "learning_rate": 2.219493236271733e-06, + "loss": 0.7613, + "step": 1867 + }, + { + "epoch": 0.45416970581084365, + "grad_norm": 22.875, + "learning_rate": 2.2191881133991154e-06, + "loss": 0.8419, + "step": 1868 + }, + { + "epoch": 0.45441283734500365, + "grad_norm": 16.125, + "learning_rate": 2.218882845664596e-06, + "loss": 0.6044, + "step": 1869 + }, + { + "epoch": 0.45465596887916365, + "grad_norm": 18.0, + "learning_rate": 2.2185774331138023e-06, + "loss": 1.0509, + "step": 1870 + }, + { + "epoch": 0.4548991004133236, + "grad_norm": 25.75, + "learning_rate": 2.2182718757923834e-06, + "loss": 1.0906, + "step": 1871 + }, + { + "epoch": 0.4551422319474836, + "grad_norm": 18.875, + "learning_rate": 2.2179661737460096e-06, + "loss": 0.9405, + "step": 1872 + }, + { + "epoch": 0.45538536348164355, + "grad_norm": 20.625, + "learning_rate": 2.217660327020374e-06, + "loss": 1.2509, + "step": 1873 + }, + { + "epoch": 0.45562849501580355, + "grad_norm": 15.9375, + "learning_rate": 2.2173543356611903e-06, + "loss": 0.6884, + "step": 1874 + }, + { + "epoch": 0.45587162654996355, + "grad_norm": 18.5, + "learning_rate": 2.217048199714194e-06, + "loss": 1.1921, + "step": 1875 + }, + { + "epoch": 0.4561147580841235, + "grad_norm": 16.75, + "learning_rate": 2.2167419192251435e-06, + "loss": 0.66, + "step": 1876 + }, + { + "epoch": 0.4563578896182835, + "grad_norm": 19.625, + "learning_rate": 2.216435494239817e-06, + "loss": 0.6735, + "step": 1877 + }, + { + "epoch": 0.4566010211524435, + "grad_norm": 19.75, + "learning_rate": 2.2161289248040144e-06, + "loss": 0.8849, + "step": 1878 + }, + { + "epoch": 0.45684415268660344, + "grad_norm": 20.75, + "learning_rate": 2.2158222109635583e-06, + "loss": 0.8143, + "step": 1879 + }, + { + "epoch": 0.45708728422076345, + "grad_norm": 19.0, + "learning_rate": 2.215515352764293e-06, + "loss": 0.8738, + "step": 1880 + }, + { + "epoch": 0.4573304157549234, + "grad_norm": 33.5, + "learning_rate": 2.215208350252083e-06, + "loss": 1.2343, + "step": 1881 + }, + { + "epoch": 0.4575735472890834, + "grad_norm": 22.5, + "learning_rate": 2.214901203472815e-06, + "loss": 1.3959, + "step": 1882 + }, + { + "epoch": 0.4578166788232434, + "grad_norm": 20.75, + "learning_rate": 2.214593912472398e-06, + "loss": 0.6936, + "step": 1883 + }, + { + "epoch": 0.45805981035740334, + "grad_norm": 20.75, + "learning_rate": 2.214286477296762e-06, + "loss": 0.7576, + "step": 1884 + }, + { + "epoch": 0.45830294189156334, + "grad_norm": 23.5, + "learning_rate": 2.2139788979918577e-06, + "loss": 1.4372, + "step": 1885 + }, + { + "epoch": 0.45854607342572334, + "grad_norm": 14.25, + "learning_rate": 2.2136711746036587e-06, + "loss": 0.5822, + "step": 1886 + }, + { + "epoch": 0.4587892049598833, + "grad_norm": 18.125, + "learning_rate": 2.2133633071781597e-06, + "loss": 0.8024, + "step": 1887 + }, + { + "epoch": 0.4590323364940433, + "grad_norm": 13.3125, + "learning_rate": 2.213055295761376e-06, + "loss": 0.4761, + "step": 1888 + }, + { + "epoch": 0.45927546802820324, + "grad_norm": 31.0, + "learning_rate": 2.2127471403993463e-06, + "loss": 0.9431, + "step": 1889 + }, + { + "epoch": 0.45951859956236324, + "grad_norm": 18.75, + "learning_rate": 2.2124388411381294e-06, + "loss": 1.0756, + "step": 1890 + }, + { + "epoch": 0.45976173109652324, + "grad_norm": 24.375, + "learning_rate": 2.2121303980238053e-06, + "loss": 0.5698, + "step": 1891 + }, + { + "epoch": 0.4600048626306832, + "grad_norm": 16.625, + "learning_rate": 2.2118218111024768e-06, + "loss": 0.8241, + "step": 1892 + }, + { + "epoch": 0.4602479941648432, + "grad_norm": 18.875, + "learning_rate": 2.2115130804202676e-06, + "loss": 0.6232, + "step": 1893 + }, + { + "epoch": 0.46049112569900313, + "grad_norm": 20.5, + "learning_rate": 2.2112042060233225e-06, + "loss": 0.9004, + "step": 1894 + }, + { + "epoch": 0.46073425723316314, + "grad_norm": 15.25, + "learning_rate": 2.2108951879578082e-06, + "loss": 0.6967, + "step": 1895 + }, + { + "epoch": 0.46097738876732314, + "grad_norm": 21.125, + "learning_rate": 2.210586026269913e-06, + "loss": 1.2644, + "step": 1896 + }, + { + "epoch": 0.4612205203014831, + "grad_norm": 26.625, + "learning_rate": 2.210276721005846e-06, + "loss": 1.1362, + "step": 1897 + }, + { + "epoch": 0.4614636518356431, + "grad_norm": 18.0, + "learning_rate": 2.2099672722118386e-06, + "loss": 1.0583, + "step": 1898 + }, + { + "epoch": 0.4617067833698031, + "grad_norm": 19.375, + "learning_rate": 2.2096576799341436e-06, + "loss": 0.6459, + "step": 1899 + }, + { + "epoch": 0.46194991490396303, + "grad_norm": 25.0, + "learning_rate": 2.209347944219034e-06, + "loss": 1.2253, + "step": 1900 + }, + { + "epoch": 0.46219304643812303, + "grad_norm": 22.625, + "learning_rate": 2.2090380651128056e-06, + "loss": 0.904, + "step": 1901 + }, + { + "epoch": 0.462436177972283, + "grad_norm": 22.0, + "learning_rate": 2.2087280426617754e-06, + "loss": 0.6962, + "step": 1902 + }, + { + "epoch": 0.462679309506443, + "grad_norm": 17.875, + "learning_rate": 2.208417876912281e-06, + "loss": 0.4962, + "step": 1903 + }, + { + "epoch": 0.462922441040603, + "grad_norm": 15.8125, + "learning_rate": 2.208107567910683e-06, + "loss": 0.5747, + "step": 1904 + }, + { + "epoch": 0.46316557257476293, + "grad_norm": 21.5, + "learning_rate": 2.207797115703362e-06, + "loss": 0.6997, + "step": 1905 + }, + { + "epoch": 0.46340870410892293, + "grad_norm": 21.375, + "learning_rate": 2.2074865203367196e-06, + "loss": 1.1637, + "step": 1906 + }, + { + "epoch": 0.46365183564308293, + "grad_norm": 16.375, + "learning_rate": 2.2071757818571808e-06, + "loss": 1.3612, + "step": 1907 + }, + { + "epoch": 0.4638949671772429, + "grad_norm": 18.875, + "learning_rate": 2.2068649003111903e-06, + "loss": 0.786, + "step": 1908 + }, + { + "epoch": 0.4641380987114029, + "grad_norm": 14.0, + "learning_rate": 2.2065538757452148e-06, + "loss": 0.5215, + "step": 1909 + }, + { + "epoch": 0.4643812302455628, + "grad_norm": 14.6875, + "learning_rate": 2.2062427082057427e-06, + "loss": 0.5998, + "step": 1910 + }, + { + "epoch": 0.4646243617797228, + "grad_norm": 25.75, + "learning_rate": 2.2059313977392825e-06, + "loss": 0.954, + "step": 1911 + }, + { + "epoch": 0.46486749331388283, + "grad_norm": 17.375, + "learning_rate": 2.2056199443923656e-06, + "loss": 1.0733, + "step": 1912 + }, + { + "epoch": 0.4651106248480428, + "grad_norm": 19.5, + "learning_rate": 2.205308348211544e-06, + "loss": 0.8767, + "step": 1913 + }, + { + "epoch": 0.4653537563822028, + "grad_norm": 19.0, + "learning_rate": 2.2049966092433906e-06, + "loss": 0.9691, + "step": 1914 + }, + { + "epoch": 0.4655968879163628, + "grad_norm": 24.75, + "learning_rate": 2.2046847275345007e-06, + "loss": 1.2806, + "step": 1915 + }, + { + "epoch": 0.4658400194505227, + "grad_norm": 16.75, + "learning_rate": 2.2043727031314906e-06, + "loss": 0.9385, + "step": 1916 + }, + { + "epoch": 0.4660831509846827, + "grad_norm": 19.0, + "learning_rate": 2.2040605360809973e-06, + "loss": 0.5772, + "step": 1917 + }, + { + "epoch": 0.46632628251884267, + "grad_norm": 15.4375, + "learning_rate": 2.20374822642968e-06, + "loss": 0.5349, + "step": 1918 + }, + { + "epoch": 0.4665694140530027, + "grad_norm": 18.625, + "learning_rate": 2.203435774224218e-06, + "loss": 0.8595, + "step": 1919 + }, + { + "epoch": 0.4668125455871627, + "grad_norm": 16.0, + "learning_rate": 2.203123179511313e-06, + "loss": 0.6659, + "step": 1920 + }, + { + "epoch": 0.4670556771213226, + "grad_norm": 16.25, + "learning_rate": 2.202810442337688e-06, + "loss": 0.606, + "step": 1921 + }, + { + "epoch": 0.4672988086554826, + "grad_norm": 21.5, + "learning_rate": 2.202497562750087e-06, + "loss": 0.6258, + "step": 1922 + }, + { + "epoch": 0.4675419401896426, + "grad_norm": 23.125, + "learning_rate": 2.202184540795275e-06, + "loss": 1.0312, + "step": 1923 + }, + { + "epoch": 0.46778507172380257, + "grad_norm": 21.875, + "learning_rate": 2.2018713765200384e-06, + "loss": 1.1694, + "step": 1924 + }, + { + "epoch": 0.46802820325796257, + "grad_norm": 21.0, + "learning_rate": 2.201558069971185e-06, + "loss": 1.1825, + "step": 1925 + }, + { + "epoch": 0.4682713347921225, + "grad_norm": 17.5, + "learning_rate": 2.2012446211955445e-06, + "loss": 1.0307, + "step": 1926 + }, + { + "epoch": 0.4685144663262825, + "grad_norm": 26.375, + "learning_rate": 2.2009310302399666e-06, + "loss": 0.9426, + "step": 1927 + }, + { + "epoch": 0.4687575978604425, + "grad_norm": 17.0, + "learning_rate": 2.2006172971513234e-06, + "loss": 0.9969, + "step": 1928 + }, + { + "epoch": 0.46900072939460247, + "grad_norm": 20.0, + "learning_rate": 2.200303421976507e-06, + "loss": 0.9373, + "step": 1929 + }, + { + "epoch": 0.46924386092876247, + "grad_norm": 21.0, + "learning_rate": 2.199989404762432e-06, + "loss": 0.7108, + "step": 1930 + }, + { + "epoch": 0.4694869924629224, + "grad_norm": 23.5, + "learning_rate": 2.1996752455560337e-06, + "loss": 0.8034, + "step": 1931 + }, + { + "epoch": 0.4697301239970824, + "grad_norm": 13.0, + "learning_rate": 2.1993609444042683e-06, + "loss": 1.0756, + "step": 1932 + }, + { + "epoch": 0.4699732555312424, + "grad_norm": 13.9375, + "learning_rate": 2.199046501354114e-06, + "loss": 0.4195, + "step": 1933 + }, + { + "epoch": 0.47021638706540236, + "grad_norm": 18.625, + "learning_rate": 2.1987319164525692e-06, + "loss": 0.9116, + "step": 1934 + }, + { + "epoch": 0.47045951859956237, + "grad_norm": 22.5, + "learning_rate": 2.1984171897466544e-06, + "loss": 1.0722, + "step": 1935 + }, + { + "epoch": 0.47070265013372237, + "grad_norm": 18.625, + "learning_rate": 2.198102321283411e-06, + "loss": 0.9446, + "step": 1936 + }, + { + "epoch": 0.4709457816678823, + "grad_norm": 17.375, + "learning_rate": 2.1977873111099014e-06, + "loss": 0.8527, + "step": 1937 + }, + { + "epoch": 0.4711889132020423, + "grad_norm": 17.5, + "learning_rate": 2.197472159273209e-06, + "loss": 0.7372, + "step": 1938 + }, + { + "epoch": 0.47143204473620226, + "grad_norm": 17.625, + "learning_rate": 2.197156865820439e-06, + "loss": 0.7782, + "step": 1939 + }, + { + "epoch": 0.47167517627036226, + "grad_norm": 51.75, + "learning_rate": 2.1968414307987178e-06, + "loss": 1.2376, + "step": 1940 + }, + { + "epoch": 0.47191830780452226, + "grad_norm": 19.75, + "learning_rate": 2.196525854255192e-06, + "loss": 0.9464, + "step": 1941 + }, + { + "epoch": 0.4721614393386822, + "grad_norm": 21.75, + "learning_rate": 2.1962101362370305e-06, + "loss": 0.6516, + "step": 1942 + }, + { + "epoch": 0.4724045708728422, + "grad_norm": 21.25, + "learning_rate": 2.1958942767914223e-06, + "loss": 1.1686, + "step": 1943 + }, + { + "epoch": 0.4726477024070022, + "grad_norm": 26.0, + "learning_rate": 2.195578275965578e-06, + "loss": 1.0714, + "step": 1944 + }, + { + "epoch": 0.47289083394116216, + "grad_norm": 17.75, + "learning_rate": 2.19526213380673e-06, + "loss": 0.7905, + "step": 1945 + }, + { + "epoch": 0.47313396547532216, + "grad_norm": 17.625, + "learning_rate": 2.1949458503621308e-06, + "loss": 0.5541, + "step": 1946 + }, + { + "epoch": 0.4733770970094821, + "grad_norm": 30.0, + "learning_rate": 2.194629425679054e-06, + "loss": 1.2114, + "step": 1947 + }, + { + "epoch": 0.4736202285436421, + "grad_norm": 18.875, + "learning_rate": 2.1943128598047957e-06, + "loss": 0.4835, + "step": 1948 + }, + { + "epoch": 0.4738633600778021, + "grad_norm": 19.5, + "learning_rate": 2.1939961527866718e-06, + "loss": 1.0849, + "step": 1949 + }, + { + "epoch": 0.47410649161196206, + "grad_norm": 19.5, + "learning_rate": 2.1936793046720196e-06, + "loss": 0.9007, + "step": 1950 + }, + { + "epoch": 0.47434962314612206, + "grad_norm": 25.5, + "learning_rate": 2.1933623155081967e-06, + "loss": 1.0149, + "step": 1951 + }, + { + "epoch": 0.47459275468028206, + "grad_norm": 19.375, + "learning_rate": 2.1930451853425837e-06, + "loss": 0.912, + "step": 1952 + }, + { + "epoch": 0.474835886214442, + "grad_norm": 34.5, + "learning_rate": 2.1927279142225812e-06, + "loss": 1.1245, + "step": 1953 + }, + { + "epoch": 0.475079017748602, + "grad_norm": 22.625, + "learning_rate": 2.1924105021956097e-06, + "loss": 1.1844, + "step": 1954 + }, + { + "epoch": 0.47532214928276195, + "grad_norm": 23.875, + "learning_rate": 2.192092949309113e-06, + "loss": 1.0106, + "step": 1955 + }, + { + "epoch": 0.47556528081692195, + "grad_norm": 24.75, + "learning_rate": 2.191775255610555e-06, + "loss": 1.0633, + "step": 1956 + }, + { + "epoch": 0.47580841235108196, + "grad_norm": 18.125, + "learning_rate": 2.1914574211474194e-06, + "loss": 0.5309, + "step": 1957 + }, + { + "epoch": 0.4760515438852419, + "grad_norm": 20.25, + "learning_rate": 2.191139445967213e-06, + "loss": 0.7897, + "step": 1958 + }, + { + "epoch": 0.4762946754194019, + "grad_norm": 19.25, + "learning_rate": 2.190821330117462e-06, + "loss": 0.8475, + "step": 1959 + }, + { + "epoch": 0.4765378069535619, + "grad_norm": 16.75, + "learning_rate": 2.190503073645715e-06, + "loss": 0.7451, + "step": 1960 + }, + { + "epoch": 0.47678093848772185, + "grad_norm": 20.5, + "learning_rate": 2.1901846765995406e-06, + "loss": 0.9894, + "step": 1961 + }, + { + "epoch": 0.47702407002188185, + "grad_norm": 18.0, + "learning_rate": 2.1898661390265287e-06, + "loss": 0.9663, + "step": 1962 + }, + { + "epoch": 0.4772672015560418, + "grad_norm": 17.375, + "learning_rate": 2.1895474609742897e-06, + "loss": 0.9645, + "step": 1963 + }, + { + "epoch": 0.4775103330902018, + "grad_norm": 21.25, + "learning_rate": 2.1892286424904567e-06, + "loss": 1.2093, + "step": 1964 + }, + { + "epoch": 0.4777534646243618, + "grad_norm": 14.8125, + "learning_rate": 2.188909683622682e-06, + "loss": 0.4981, + "step": 1965 + }, + { + "epoch": 0.47799659615852175, + "grad_norm": 21.375, + "learning_rate": 2.1885905844186395e-06, + "loss": 1.0424, + "step": 1966 + }, + { + "epoch": 0.47823972769268175, + "grad_norm": 21.0, + "learning_rate": 2.188271344926024e-06, + "loss": 0.9301, + "step": 1967 + }, + { + "epoch": 0.4784828592268417, + "grad_norm": 18.375, + "learning_rate": 2.187951965192552e-06, + "loss": 0.8298, + "step": 1968 + }, + { + "epoch": 0.4787259907610017, + "grad_norm": 18.75, + "learning_rate": 2.1876324452659593e-06, + "loss": 1.1689, + "step": 1969 + }, + { + "epoch": 0.4789691222951617, + "grad_norm": 25.375, + "learning_rate": 2.187312785194004e-06, + "loss": 0.9826, + "step": 1970 + }, + { + "epoch": 0.47921225382932164, + "grad_norm": 20.625, + "learning_rate": 2.1869929850244655e-06, + "loss": 0.732, + "step": 1971 + }, + { + "epoch": 0.47945538536348165, + "grad_norm": 15.125, + "learning_rate": 2.1866730448051427e-06, + "loss": 0.7193, + "step": 1972 + }, + { + "epoch": 0.47969851689764165, + "grad_norm": 18.625, + "learning_rate": 2.1863529645838564e-06, + "loss": 0.9485, + "step": 1973 + }, + { + "epoch": 0.4799416484318016, + "grad_norm": 18.0, + "learning_rate": 2.1860327444084483e-06, + "loss": 0.718, + "step": 1974 + }, + { + "epoch": 0.4801847799659616, + "grad_norm": 16.875, + "learning_rate": 2.1857123843267808e-06, + "loss": 0.7812, + "step": 1975 + }, + { + "epoch": 0.48042791150012154, + "grad_norm": 18.25, + "learning_rate": 2.185391884386737e-06, + "loss": 0.8458, + "step": 1976 + }, + { + "epoch": 0.48067104303428154, + "grad_norm": 21.0, + "learning_rate": 2.185071244636221e-06, + "loss": 0.9139, + "step": 1977 + }, + { + "epoch": 0.48091417456844154, + "grad_norm": 15.4375, + "learning_rate": 2.1847504651231586e-06, + "loss": 0.4537, + "step": 1978 + }, + { + "epoch": 0.4811573061026015, + "grad_norm": 24.25, + "learning_rate": 2.1844295458954956e-06, + "loss": 0.6585, + "step": 1979 + }, + { + "epoch": 0.4814004376367615, + "grad_norm": 22.125, + "learning_rate": 2.184108487001199e-06, + "loss": 0.8459, + "step": 1980 + }, + { + "epoch": 0.4816435691709215, + "grad_norm": 19.75, + "learning_rate": 2.183787288488256e-06, + "loss": 0.7802, + "step": 1981 + }, + { + "epoch": 0.48188670070508144, + "grad_norm": 17.875, + "learning_rate": 2.183465950404676e-06, + "loss": 0.8244, + "step": 1982 + }, + { + "epoch": 0.48212983223924144, + "grad_norm": 16.375, + "learning_rate": 2.1831444727984877e-06, + "loss": 0.8622, + "step": 1983 + }, + { + "epoch": 0.4823729637734014, + "grad_norm": 18.125, + "learning_rate": 2.1828228557177424e-06, + "loss": 0.912, + "step": 1984 + }, + { + "epoch": 0.4826160953075614, + "grad_norm": 21.75, + "learning_rate": 2.1825010992105107e-06, + "loss": 1.3709, + "step": 1985 + }, + { + "epoch": 0.4828592268417214, + "grad_norm": 19.25, + "learning_rate": 2.1821792033248847e-06, + "loss": 0.7946, + "step": 1986 + }, + { + "epoch": 0.48310235837588134, + "grad_norm": 13.1875, + "learning_rate": 2.181857168108978e-06, + "loss": 0.7402, + "step": 1987 + }, + { + "epoch": 0.48334548991004134, + "grad_norm": 17.25, + "learning_rate": 2.1815349936109233e-06, + "loss": 0.713, + "step": 1988 + }, + { + "epoch": 0.48358862144420134, + "grad_norm": 39.5, + "learning_rate": 2.1812126798788758e-06, + "loss": 0.987, + "step": 1989 + }, + { + "epoch": 0.4838317529783613, + "grad_norm": 16.5, + "learning_rate": 2.1808902269610106e-06, + "loss": 0.8076, + "step": 1990 + }, + { + "epoch": 0.4840748845125213, + "grad_norm": 18.125, + "learning_rate": 2.1805676349055244e-06, + "loss": 1.0758, + "step": 1991 + }, + { + "epoch": 0.48431801604668123, + "grad_norm": 20.5, + "learning_rate": 2.1802449037606333e-06, + "loss": 1.0461, + "step": 1992 + }, + { + "epoch": 0.48456114758084123, + "grad_norm": 15.25, + "learning_rate": 2.1799220335745753e-06, + "loss": 0.8085, + "step": 1993 + }, + { + "epoch": 0.48480427911500124, + "grad_norm": 15.25, + "learning_rate": 2.1795990243956094e-06, + "loss": 0.8672, + "step": 1994 + }, + { + "epoch": 0.4850474106491612, + "grad_norm": 16.375, + "learning_rate": 2.1792758762720147e-06, + "loss": 0.7869, + "step": 1995 + }, + { + "epoch": 0.4852905421833212, + "grad_norm": 19.0, + "learning_rate": 2.1789525892520906e-06, + "loss": 1.2812, + "step": 1996 + }, + { + "epoch": 0.4855336737174812, + "grad_norm": 28.375, + "learning_rate": 2.178629163384159e-06, + "loss": 0.9107, + "step": 1997 + }, + { + "epoch": 0.48577680525164113, + "grad_norm": 15.5625, + "learning_rate": 2.1783055987165604e-06, + "loss": 0.7029, + "step": 1998 + }, + { + "epoch": 0.48601993678580113, + "grad_norm": 20.5, + "learning_rate": 2.177981895297658e-06, + "loss": 0.8151, + "step": 1999 + }, + { + "epoch": 0.4862630683199611, + "grad_norm": 23.625, + "learning_rate": 2.1776580531758344e-06, + "loss": 1.1322, + "step": 2000 + }, + { + "epoch": 0.4865061998541211, + "grad_norm": 16.75, + "learning_rate": 2.177334072399494e-06, + "loss": 0.6582, + "step": 2001 + }, + { + "epoch": 0.4867493313882811, + "grad_norm": 14.5625, + "learning_rate": 2.1770099530170606e-06, + "loss": 0.571, + "step": 2002 + }, + { + "epoch": 0.486992462922441, + "grad_norm": 16.25, + "learning_rate": 2.1766856950769798e-06, + "loss": 0.8654, + "step": 2003 + }, + { + "epoch": 0.48723559445660103, + "grad_norm": 19.5, + "learning_rate": 2.176361298627717e-06, + "loss": 0.8847, + "step": 2004 + }, + { + "epoch": 0.487478725990761, + "grad_norm": 14.875, + "learning_rate": 2.1760367637177597e-06, + "loss": 0.6756, + "step": 2005 + }, + { + "epoch": 0.487721857524921, + "grad_norm": 23.125, + "learning_rate": 2.1757120903956146e-06, + "loss": 0.7871, + "step": 2006 + }, + { + "epoch": 0.487964989059081, + "grad_norm": 17.375, + "learning_rate": 2.17538727870981e-06, + "loss": 0.6673, + "step": 2007 + }, + { + "epoch": 0.4882081205932409, + "grad_norm": 23.75, + "learning_rate": 2.1750623287088953e-06, + "loss": 0.9512, + "step": 2008 + }, + { + "epoch": 0.4884512521274009, + "grad_norm": 17.5, + "learning_rate": 2.1747372404414385e-06, + "loss": 1.0196, + "step": 2009 + }, + { + "epoch": 0.4886943836615609, + "grad_norm": 18.125, + "learning_rate": 2.1744120139560306e-06, + "loss": 0.8133, + "step": 2010 + }, + { + "epoch": 0.4889375151957209, + "grad_norm": 14.1875, + "learning_rate": 2.174086649301282e-06, + "loss": 0.4063, + "step": 2011 + }, + { + "epoch": 0.4891806467298809, + "grad_norm": 14.625, + "learning_rate": 2.1737611465258242e-06, + "loss": 0.6986, + "step": 2012 + }, + { + "epoch": 0.4894237782640408, + "grad_norm": 25.875, + "learning_rate": 2.1734355056783092e-06, + "loss": 0.8912, + "step": 2013 + }, + { + "epoch": 0.4896669097982008, + "grad_norm": 21.375, + "learning_rate": 2.173109726807409e-06, + "loss": 1.2023, + "step": 2014 + }, + { + "epoch": 0.4899100413323608, + "grad_norm": 19.0, + "learning_rate": 2.172783809961818e-06, + "loss": 0.6458, + "step": 2015 + }, + { + "epoch": 0.49015317286652077, + "grad_norm": 16.0, + "learning_rate": 2.1724577551902497e-06, + "loss": 0.6342, + "step": 2016 + }, + { + "epoch": 0.49039630440068077, + "grad_norm": 23.625, + "learning_rate": 2.172131562541438e-06, + "loss": 1.0564, + "step": 2017 + }, + { + "epoch": 0.4906394359348408, + "grad_norm": 17.75, + "learning_rate": 2.171805232064139e-06, + "loss": 0.6947, + "step": 2018 + }, + { + "epoch": 0.4908825674690007, + "grad_norm": 13.75, + "learning_rate": 2.1714787638071276e-06, + "loss": 0.8488, + "step": 2019 + }, + { + "epoch": 0.4911256990031607, + "grad_norm": 21.625, + "learning_rate": 2.1711521578192008e-06, + "loss": 1.1226, + "step": 2020 + }, + { + "epoch": 0.49136883053732067, + "grad_norm": 27.125, + "learning_rate": 2.170825414149175e-06, + "loss": 1.0455, + "step": 2021 + }, + { + "epoch": 0.49161196207148067, + "grad_norm": 17.125, + "learning_rate": 2.1704985328458877e-06, + "loss": 0.5323, + "step": 2022 + }, + { + "epoch": 0.49185509360564067, + "grad_norm": 15.9375, + "learning_rate": 2.1701715139581974e-06, + "loss": 0.505, + "step": 2023 + }, + { + "epoch": 0.4920982251398006, + "grad_norm": 19.5, + "learning_rate": 2.1698443575349824e-06, + "loss": 0.7535, + "step": 2024 + }, + { + "epoch": 0.4923413566739606, + "grad_norm": 23.125, + "learning_rate": 2.1695170636251416e-06, + "loss": 1.2536, + "step": 2025 + }, + { + "epoch": 0.4925844882081206, + "grad_norm": 18.0, + "learning_rate": 2.169189632277595e-06, + "loss": 0.8221, + "step": 2026 + }, + { + "epoch": 0.49282761974228056, + "grad_norm": 21.375, + "learning_rate": 2.168862063541283e-06, + "loss": 1.145, + "step": 2027 + }, + { + "epoch": 0.49307075127644057, + "grad_norm": 30.5, + "learning_rate": 2.168534357465167e-06, + "loss": 1.123, + "step": 2028 + }, + { + "epoch": 0.4933138828106005, + "grad_norm": 17.5, + "learning_rate": 2.1682065140982266e-06, + "loss": 0.9509, + "step": 2029 + }, + { + "epoch": 0.4935570143447605, + "grad_norm": 20.625, + "learning_rate": 2.167878533489465e-06, + "loss": 0.9344, + "step": 2030 + }, + { + "epoch": 0.4938001458789205, + "grad_norm": 16.5, + "learning_rate": 2.1675504156879047e-06, + "loss": 0.8253, + "step": 2031 + }, + { + "epoch": 0.49404327741308046, + "grad_norm": 18.375, + "learning_rate": 2.167222160742588e-06, + "loss": 0.8082, + "step": 2032 + }, + { + "epoch": 0.49428640894724046, + "grad_norm": 20.875, + "learning_rate": 2.166893768702578e-06, + "loss": 1.0062, + "step": 2033 + }, + { + "epoch": 0.49452954048140046, + "grad_norm": 28.0, + "learning_rate": 2.1665652396169593e-06, + "loss": 1.1932, + "step": 2034 + }, + { + "epoch": 0.4947726720155604, + "grad_norm": 24.0, + "learning_rate": 2.1662365735348358e-06, + "loss": 1.0181, + "step": 2035 + }, + { + "epoch": 0.4950158035497204, + "grad_norm": 24.375, + "learning_rate": 2.165907770505332e-06, + "loss": 1.3114, + "step": 2036 + }, + { + "epoch": 0.49525893508388036, + "grad_norm": 23.875, + "learning_rate": 2.1655788305775945e-06, + "loss": 1.3005, + "step": 2037 + }, + { + "epoch": 0.49550206661804036, + "grad_norm": 18.375, + "learning_rate": 2.165249753800788e-06, + "loss": 1.0847, + "step": 2038 + }, + { + "epoch": 0.49574519815220036, + "grad_norm": 22.875, + "learning_rate": 2.1649205402240984e-06, + "loss": 1.1625, + "step": 2039 + }, + { + "epoch": 0.4959883296863603, + "grad_norm": 21.0, + "learning_rate": 2.164591189896733e-06, + "loss": 0.849, + "step": 2040 + }, + { + "epoch": 0.4962314612205203, + "grad_norm": 19.125, + "learning_rate": 2.164261702867919e-06, + "loss": 0.6409, + "step": 2041 + }, + { + "epoch": 0.49647459275468026, + "grad_norm": 24.125, + "learning_rate": 2.1639320791869035e-06, + "loss": 1.1276, + "step": 2042 + }, + { + "epoch": 0.49671772428884026, + "grad_norm": 18.625, + "learning_rate": 2.163602318902954e-06, + "loss": 1.0885, + "step": 2043 + }, + { + "epoch": 0.49696085582300026, + "grad_norm": 18.875, + "learning_rate": 2.1632724220653607e-06, + "loss": 0.8804, + "step": 2044 + }, + { + "epoch": 0.4972039873571602, + "grad_norm": 20.625, + "learning_rate": 2.16294238872343e-06, + "loss": 1.1937, + "step": 2045 + }, + { + "epoch": 0.4974471188913202, + "grad_norm": 18.375, + "learning_rate": 2.162612218926493e-06, + "loss": 0.9079, + "step": 2046 + }, + { + "epoch": 0.4976902504254802, + "grad_norm": 20.375, + "learning_rate": 2.162281912723898e-06, + "loss": 1.02, + "step": 2047 + }, + { + "epoch": 0.49793338195964015, + "grad_norm": 21.75, + "learning_rate": 2.161951470165016e-06, + "loss": 0.8726, + "step": 2048 + }, + { + "epoch": 0.49817651349380015, + "grad_norm": 20.375, + "learning_rate": 2.1616208912992363e-06, + "loss": 1.0263, + "step": 2049 + }, + { + "epoch": 0.4984196450279601, + "grad_norm": 19.75, + "learning_rate": 2.161290176175971e-06, + "loss": 0.8888, + "step": 2050 + }, + { + "epoch": 0.4986627765621201, + "grad_norm": 20.25, + "learning_rate": 2.16095932484465e-06, + "loss": 0.9729, + "step": 2051 + }, + { + "epoch": 0.4989059080962801, + "grad_norm": 19.5, + "learning_rate": 2.1606283373547246e-06, + "loss": 0.9825, + "step": 2052 + }, + { + "epoch": 0.49914903963044005, + "grad_norm": 19.625, + "learning_rate": 2.160297213755667e-06, + "loss": 0.6419, + "step": 2053 + }, + { + "epoch": 0.49939217116460005, + "grad_norm": 21.5, + "learning_rate": 2.1599659540969705e-06, + "loss": 1.0555, + "step": 2054 + }, + { + "epoch": 0.49963530269876005, + "grad_norm": 21.75, + "learning_rate": 2.1596345584281453e-06, + "loss": 1.0693, + "step": 2055 + }, + { + "epoch": 0.49987843423292, + "grad_norm": 15.0625, + "learning_rate": 2.1593030267987262e-06, + "loss": 0.8538, + "step": 2056 + }, + { + "epoch": 0.50012156576708, + "grad_norm": 14.6875, + "learning_rate": 2.158971359258265e-06, + "loss": 0.4969, + "step": 2057 + }, + { + "epoch": 0.50036469730124, + "grad_norm": 21.25, + "learning_rate": 2.1586395558563363e-06, + "loss": 1.3187, + "step": 2058 + }, + { + "epoch": 0.5006078288354, + "grad_norm": 18.5, + "learning_rate": 2.1583076166425328e-06, + "loss": 1.2774, + "step": 2059 + }, + { + "epoch": 0.5008509603695599, + "grad_norm": 22.125, + "learning_rate": 2.157975541666469e-06, + "loss": 1.0634, + "step": 2060 + }, + { + "epoch": 0.5010940919037199, + "grad_norm": 18.0, + "learning_rate": 2.1576433309777794e-06, + "loss": 0.8712, + "step": 2061 + }, + { + "epoch": 0.5013372234378799, + "grad_norm": 15.3125, + "learning_rate": 2.157310984626118e-06, + "loss": 0.6967, + "step": 2062 + }, + { + "epoch": 0.5015803549720399, + "grad_norm": 23.375, + "learning_rate": 2.1569785026611605e-06, + "loss": 1.2955, + "step": 2063 + }, + { + "epoch": 0.5018234865061999, + "grad_norm": 21.25, + "learning_rate": 2.1566458851326015e-06, + "loss": 0.8218, + "step": 2064 + }, + { + "epoch": 0.5020666180403598, + "grad_norm": 15.125, + "learning_rate": 2.156313132090157e-06, + "loss": 0.6786, + "step": 2065 + }, + { + "epoch": 0.5023097495745198, + "grad_norm": 18.875, + "learning_rate": 2.1559802435835623e-06, + "loss": 0.669, + "step": 2066 + }, + { + "epoch": 0.5025528811086798, + "grad_norm": 16.375, + "learning_rate": 2.1556472196625733e-06, + "loss": 0.71, + "step": 2067 + }, + { + "epoch": 0.5027960126428398, + "grad_norm": 16.25, + "learning_rate": 2.155314060376966e-06, + "loss": 0.6031, + "step": 2068 + }, + { + "epoch": 0.5030391441769998, + "grad_norm": 19.375, + "learning_rate": 2.1549807657765375e-06, + "loss": 1.1718, + "step": 2069 + }, + { + "epoch": 0.5032822757111597, + "grad_norm": 26.125, + "learning_rate": 2.1546473359111037e-06, + "loss": 0.8826, + "step": 2070 + }, + { + "epoch": 0.5035254072453197, + "grad_norm": 19.625, + "learning_rate": 2.154313770830502e-06, + "loss": 0.8892, + "step": 2071 + }, + { + "epoch": 0.5037685387794797, + "grad_norm": 21.0, + "learning_rate": 2.1539800705845886e-06, + "loss": 1.0622, + "step": 2072 + }, + { + "epoch": 0.5040116703136397, + "grad_norm": 17.375, + "learning_rate": 2.1536462352232416e-06, + "loss": 0.7742, + "step": 2073 + }, + { + "epoch": 0.5042548018477997, + "grad_norm": 16.25, + "learning_rate": 2.153312264796359e-06, + "loss": 1.0432, + "step": 2074 + }, + { + "epoch": 0.5044979333819597, + "grad_norm": 20.75, + "learning_rate": 2.152978159353857e-06, + "loss": 0.6966, + "step": 2075 + }, + { + "epoch": 0.5047410649161196, + "grad_norm": 20.75, + "learning_rate": 2.152643918945674e-06, + "loss": 0.9008, + "step": 2076 + }, + { + "epoch": 0.5049841964502796, + "grad_norm": 15.375, + "learning_rate": 2.1523095436217685e-06, + "loss": 0.6613, + "step": 2077 + }, + { + "epoch": 0.5052273279844396, + "grad_norm": 20.75, + "learning_rate": 2.151975033432118e-06, + "loss": 1.3671, + "step": 2078 + }, + { + "epoch": 0.5054704595185996, + "grad_norm": 19.25, + "learning_rate": 2.151640388426721e-06, + "loss": 1.0413, + "step": 2079 + }, + { + "epoch": 0.5057135910527596, + "grad_norm": 19.125, + "learning_rate": 2.151305608655597e-06, + "loss": 0.8604, + "step": 2080 + }, + { + "epoch": 0.5059567225869195, + "grad_norm": 19.0, + "learning_rate": 2.1509706941687824e-06, + "loss": 0.794, + "step": 2081 + }, + { + "epoch": 0.5061998541210795, + "grad_norm": 17.5, + "learning_rate": 2.150635645016338e-06, + "loss": 1.0629, + "step": 2082 + }, + { + "epoch": 0.5064429856552395, + "grad_norm": 16.625, + "learning_rate": 2.150300461248342e-06, + "loss": 1.1524, + "step": 2083 + }, + { + "epoch": 0.5066861171893995, + "grad_norm": 22.5, + "learning_rate": 2.149965142914893e-06, + "loss": 1.0185, + "step": 2084 + }, + { + "epoch": 0.5069292487235595, + "grad_norm": 18.875, + "learning_rate": 2.1496296900661106e-06, + "loss": 1.0102, + "step": 2085 + }, + { + "epoch": 0.5071723802577194, + "grad_norm": 16.75, + "learning_rate": 2.149294102752134e-06, + "loss": 0.8522, + "step": 2086 + }, + { + "epoch": 0.5074155117918794, + "grad_norm": 17.75, + "learning_rate": 2.1489583810231217e-06, + "loss": 0.6616, + "step": 2087 + }, + { + "epoch": 0.5076586433260394, + "grad_norm": 22.25, + "learning_rate": 2.148622524929255e-06, + "loss": 1.2406, + "step": 2088 + }, + { + "epoch": 0.5079017748601994, + "grad_norm": 25.75, + "learning_rate": 2.148286534520731e-06, + "loss": 0.9268, + "step": 2089 + }, + { + "epoch": 0.5081449063943594, + "grad_norm": 21.0, + "learning_rate": 2.147950409847771e-06, + "loss": 1.0492, + "step": 2090 + }, + { + "epoch": 0.5083880379285193, + "grad_norm": 20.125, + "learning_rate": 2.1476141509606144e-06, + "loss": 1.084, + "step": 2091 + }, + { + "epoch": 0.5086311694626793, + "grad_norm": 18.875, + "learning_rate": 2.1472777579095205e-06, + "loss": 0.9973, + "step": 2092 + }, + { + "epoch": 0.5088743009968393, + "grad_norm": 16.125, + "learning_rate": 2.146941230744769e-06, + "loss": 0.3655, + "step": 2093 + }, + { + "epoch": 0.5091174325309993, + "grad_norm": 12.8125, + "learning_rate": 2.14660456951666e-06, + "loss": 0.7654, + "step": 2094 + }, + { + "epoch": 0.5093605640651593, + "grad_norm": 18.625, + "learning_rate": 2.146267774275513e-06, + "loss": 0.9237, + "step": 2095 + }, + { + "epoch": 0.5096036955993193, + "grad_norm": 17.25, + "learning_rate": 2.145930845071668e-06, + "loss": 0.7004, + "step": 2096 + }, + { + "epoch": 0.5098468271334792, + "grad_norm": 26.5, + "learning_rate": 2.145593781955485e-06, + "loss": 0.9624, + "step": 2097 + }, + { + "epoch": 0.5100899586676392, + "grad_norm": 22.75, + "learning_rate": 2.145256584977344e-06, + "loss": 0.8197, + "step": 2098 + }, + { + "epoch": 0.5103330902017992, + "grad_norm": 22.625, + "learning_rate": 2.1449192541876447e-06, + "loss": 1.3957, + "step": 2099 + }, + { + "epoch": 0.5105762217359592, + "grad_norm": 25.5, + "learning_rate": 2.144581789636807e-06, + "loss": 0.9053, + "step": 2100 + }, + { + "epoch": 0.5108193532701192, + "grad_norm": 17.125, + "learning_rate": 2.144244191375271e-06, + "loss": 0.8517, + "step": 2101 + }, + { + "epoch": 0.5110624848042791, + "grad_norm": 17.125, + "learning_rate": 2.143906459453496e-06, + "loss": 0.8091, + "step": 2102 + }, + { + "epoch": 0.5113056163384391, + "grad_norm": 23.125, + "learning_rate": 2.143568593921963e-06, + "loss": 0.8385, + "step": 2103 + }, + { + "epoch": 0.5115487478725991, + "grad_norm": 20.75, + "learning_rate": 2.143230594831171e-06, + "loss": 0.9638, + "step": 2104 + }, + { + "epoch": 0.5117918794067591, + "grad_norm": 19.25, + "learning_rate": 2.1428924622316396e-06, + "loss": 0.9676, + "step": 2105 + }, + { + "epoch": 0.5120350109409191, + "grad_norm": 19.125, + "learning_rate": 2.1425541961739093e-06, + "loss": 0.8893, + "step": 2106 + }, + { + "epoch": 0.512278142475079, + "grad_norm": 17.375, + "learning_rate": 2.1422157967085394e-06, + "loss": 0.8004, + "step": 2107 + }, + { + "epoch": 0.512521274009239, + "grad_norm": 25.625, + "learning_rate": 2.1418772638861095e-06, + "loss": 0.8912, + "step": 2108 + }, + { + "epoch": 0.512764405543399, + "grad_norm": 18.5, + "learning_rate": 2.141538597757219e-06, + "loss": 0.6878, + "step": 2109 + }, + { + "epoch": 0.513007537077559, + "grad_norm": 20.0, + "learning_rate": 2.141199798372488e-06, + "loss": 0.8896, + "step": 2110 + }, + { + "epoch": 0.513250668611719, + "grad_norm": 25.25, + "learning_rate": 2.140860865782556e-06, + "loss": 1.0707, + "step": 2111 + }, + { + "epoch": 0.513493800145879, + "grad_norm": 31.25, + "learning_rate": 2.1405218000380813e-06, + "loss": 0.9574, + "step": 2112 + }, + { + "epoch": 0.5137369316800389, + "grad_norm": 21.0, + "learning_rate": 2.1401826011897436e-06, + "loss": 0.8259, + "step": 2113 + }, + { + "epoch": 0.5139800632141989, + "grad_norm": 19.5, + "learning_rate": 2.1398432692882423e-06, + "loss": 1.1377, + "step": 2114 + }, + { + "epoch": 0.5142231947483589, + "grad_norm": 17.125, + "learning_rate": 2.1395038043842966e-06, + "loss": 0.5954, + "step": 2115 + }, + { + "epoch": 0.5144663262825189, + "grad_norm": 21.75, + "learning_rate": 2.139164206528645e-06, + "loss": 0.9733, + "step": 2116 + }, + { + "epoch": 0.5147094578166789, + "grad_norm": 16.25, + "learning_rate": 2.138824475772046e-06, + "loss": 0.6823, + "step": 2117 + }, + { + "epoch": 0.5149525893508388, + "grad_norm": 15.375, + "learning_rate": 2.138484612165279e-06, + "loss": 1.3318, + "step": 2118 + }, + { + "epoch": 0.5151957208849988, + "grad_norm": 24.625, + "learning_rate": 2.138144615759142e-06, + "loss": 1.221, + "step": 2119 + }, + { + "epoch": 0.5154388524191588, + "grad_norm": 17.125, + "learning_rate": 2.137804486604453e-06, + "loss": 0.7901, + "step": 2120 + }, + { + "epoch": 0.5156819839533188, + "grad_norm": 16.125, + "learning_rate": 2.1374642247520506e-06, + "loss": 0.819, + "step": 2121 + }, + { + "epoch": 0.5159251154874788, + "grad_norm": 17.375, + "learning_rate": 2.137123830252793e-06, + "loss": 0.6808, + "step": 2122 + }, + { + "epoch": 0.5161682470216387, + "grad_norm": 20.0, + "learning_rate": 2.1367833031575576e-06, + "loss": 0.6986, + "step": 2123 + }, + { + "epoch": 0.5164113785557987, + "grad_norm": 28.625, + "learning_rate": 2.1364426435172426e-06, + "loss": 0.9064, + "step": 2124 + }, + { + "epoch": 0.5166545100899587, + "grad_norm": 26.875, + "learning_rate": 2.136101851382765e-06, + "loss": 0.8917, + "step": 2125 + }, + { + "epoch": 0.5168976416241187, + "grad_norm": 17.375, + "learning_rate": 2.1357609268050623e-06, + "loss": 0.7116, + "step": 2126 + }, + { + "epoch": 0.5171407731582787, + "grad_norm": 23.125, + "learning_rate": 2.135419869835091e-06, + "loss": 1.0916, + "step": 2127 + }, + { + "epoch": 0.5173839046924386, + "grad_norm": 26.25, + "learning_rate": 2.1350786805238287e-06, + "loss": 1.1954, + "step": 2128 + }, + { + "epoch": 0.5176270362265986, + "grad_norm": 28.0, + "learning_rate": 2.1347373589222718e-06, + "loss": 1.0237, + "step": 2129 + }, + { + "epoch": 0.5178701677607586, + "grad_norm": 18.75, + "learning_rate": 2.134395905081437e-06, + "loss": 0.6715, + "step": 2130 + }, + { + "epoch": 0.5181132992949186, + "grad_norm": 18.25, + "learning_rate": 2.13405431905236e-06, + "loss": 1.0831, + "step": 2131 + }, + { + "epoch": 0.5183564308290786, + "grad_norm": 17.25, + "learning_rate": 2.1337126008860964e-06, + "loss": 0.7293, + "step": 2132 + }, + { + "epoch": 0.5185995623632386, + "grad_norm": 20.125, + "learning_rate": 2.1333707506337227e-06, + "loss": 0.7882, + "step": 2133 + }, + { + "epoch": 0.5188426938973985, + "grad_norm": 19.375, + "learning_rate": 2.133028768346334e-06, + "loss": 0.761, + "step": 2134 + }, + { + "epoch": 0.5190858254315585, + "grad_norm": 16.5, + "learning_rate": 2.132686654075045e-06, + "loss": 0.5887, + "step": 2135 + }, + { + "epoch": 0.5193289569657185, + "grad_norm": 22.0, + "learning_rate": 2.132344407870992e-06, + "loss": 0.9873, + "step": 2136 + }, + { + "epoch": 0.5195720884998785, + "grad_norm": 28.25, + "learning_rate": 2.1320020297853276e-06, + "loss": 1.2165, + "step": 2137 + }, + { + "epoch": 0.5198152200340385, + "grad_norm": 18.75, + "learning_rate": 2.1316595198692274e-06, + "loss": 1.0632, + "step": 2138 + }, + { + "epoch": 0.5200583515681984, + "grad_norm": 25.625, + "learning_rate": 2.1313168781738855e-06, + "loss": 1.2686, + "step": 2139 + }, + { + "epoch": 0.5203014831023584, + "grad_norm": 17.125, + "learning_rate": 2.130974104750515e-06, + "loss": 0.75, + "step": 2140 + }, + { + "epoch": 0.5205446146365184, + "grad_norm": 22.0, + "learning_rate": 2.13063119965035e-06, + "loss": 1.0972, + "step": 2141 + }, + { + "epoch": 0.5207877461706784, + "grad_norm": 16.125, + "learning_rate": 2.1302881629246426e-06, + "loss": 0.5599, + "step": 2142 + }, + { + "epoch": 0.5210308777048384, + "grad_norm": 19.875, + "learning_rate": 2.1299449946246666e-06, + "loss": 1.2064, + "step": 2143 + }, + { + "epoch": 0.5212740092389982, + "grad_norm": 21.75, + "learning_rate": 2.129601694801714e-06, + "loss": 1.2053, + "step": 2144 + }, + { + "epoch": 0.5215171407731582, + "grad_norm": 20.375, + "learning_rate": 2.1292582635070966e-06, + "loss": 0.8525, + "step": 2145 + }, + { + "epoch": 0.5217602723073183, + "grad_norm": 14.3125, + "learning_rate": 2.128914700792146e-06, + "loss": 0.859, + "step": 2146 + }, + { + "epoch": 0.5220034038414783, + "grad_norm": 14.8125, + "learning_rate": 2.1285710067082147e-06, + "loss": 0.4744, + "step": 2147 + }, + { + "epoch": 0.5222465353756383, + "grad_norm": 16.125, + "learning_rate": 2.128227181306673e-06, + "loss": 0.8446, + "step": 2148 + }, + { + "epoch": 0.5224896669097983, + "grad_norm": 34.5, + "learning_rate": 2.1278832246389116e-06, + "loss": 0.8184, + "step": 2149 + }, + { + "epoch": 0.5227327984439581, + "grad_norm": 24.375, + "learning_rate": 2.1275391367563403e-06, + "loss": 1.5662, + "step": 2150 + }, + { + "epoch": 0.5229759299781181, + "grad_norm": 17.75, + "learning_rate": 2.1271949177103894e-06, + "loss": 0.6558, + "step": 2151 + }, + { + "epoch": 0.5232190615122781, + "grad_norm": 23.375, + "learning_rate": 2.1268505675525084e-06, + "loss": 0.8729, + "step": 2152 + }, + { + "epoch": 0.5234621930464382, + "grad_norm": 21.0, + "learning_rate": 2.1265060863341665e-06, + "loss": 1.039, + "step": 2153 + }, + { + "epoch": 0.5237053245805982, + "grad_norm": 17.5, + "learning_rate": 2.1261614741068522e-06, + "loss": 0.7471, + "step": 2154 + }, + { + "epoch": 0.523948456114758, + "grad_norm": 15.9375, + "learning_rate": 2.1258167309220737e-06, + "loss": 0.5395, + "step": 2155 + }, + { + "epoch": 0.524191587648918, + "grad_norm": 17.125, + "learning_rate": 2.125471856831359e-06, + "loss": 0.7584, + "step": 2156 + }, + { + "epoch": 0.524434719183078, + "grad_norm": 41.25, + "learning_rate": 2.125126851886255e-06, + "loss": 0.7686, + "step": 2157 + }, + { + "epoch": 0.524677850717238, + "grad_norm": 15.5, + "learning_rate": 2.1247817161383295e-06, + "loss": 0.4855, + "step": 2158 + }, + { + "epoch": 0.524920982251398, + "grad_norm": 16.875, + "learning_rate": 2.1244364496391684e-06, + "loss": 0.7316, + "step": 2159 + }, + { + "epoch": 0.5251641137855579, + "grad_norm": 20.375, + "learning_rate": 2.124091052440378e-06, + "loss": 1.142, + "step": 2160 + }, + { + "epoch": 0.5254072453197179, + "grad_norm": 22.75, + "learning_rate": 2.123745524593583e-06, + "loss": 0.729, + "step": 2161 + }, + { + "epoch": 0.5256503768538779, + "grad_norm": 21.0, + "learning_rate": 2.1233998661504297e-06, + "loss": 0.9577, + "step": 2162 + }, + { + "epoch": 0.525893508388038, + "grad_norm": 15.5625, + "learning_rate": 2.1230540771625823e-06, + "loss": 0.657, + "step": 2163 + }, + { + "epoch": 0.526136639922198, + "grad_norm": 16.75, + "learning_rate": 2.122708157681725e-06, + "loss": 1.1673, + "step": 2164 + }, + { + "epoch": 0.5263797714563578, + "grad_norm": 14.9375, + "learning_rate": 2.122362107759561e-06, + "loss": 0.7474, + "step": 2165 + }, + { + "epoch": 0.5266229029905178, + "grad_norm": 19.625, + "learning_rate": 2.1220159274478145e-06, + "loss": 0.9294, + "step": 2166 + }, + { + "epoch": 0.5268660345246778, + "grad_norm": 17.25, + "learning_rate": 2.121669616798227e-06, + "loss": 0.7146, + "step": 2167 + }, + { + "epoch": 0.5271091660588378, + "grad_norm": 23.25, + "learning_rate": 2.1213231758625606e-06, + "loss": 0.9958, + "step": 2168 + }, + { + "epoch": 0.5273522975929978, + "grad_norm": 18.5, + "learning_rate": 2.1209766046925976e-06, + "loss": 0.7999, + "step": 2169 + }, + { + "epoch": 0.5275954291271578, + "grad_norm": 15.0625, + "learning_rate": 2.120629903340139e-06, + "loss": 0.6183, + "step": 2170 + }, + { + "epoch": 0.5278385606613177, + "grad_norm": 20.75, + "learning_rate": 2.120283071857005e-06, + "loss": 0.8308, + "step": 2171 + }, + { + "epoch": 0.5280816921954777, + "grad_norm": 18.625, + "learning_rate": 2.1199361102950357e-06, + "loss": 0.6661, + "step": 2172 + }, + { + "epoch": 0.5283248237296377, + "grad_norm": 22.375, + "learning_rate": 2.11958901870609e-06, + "loss": 0.8004, + "step": 2173 + }, + { + "epoch": 0.5285679552637977, + "grad_norm": 23.5, + "learning_rate": 2.119241797142047e-06, + "loss": 1.2852, + "step": 2174 + }, + { + "epoch": 0.5288110867979577, + "grad_norm": 27.0, + "learning_rate": 2.1188944456548054e-06, + "loss": 0.9778, + "step": 2175 + }, + { + "epoch": 0.5290542183321176, + "grad_norm": 23.0, + "learning_rate": 2.1185469642962826e-06, + "loss": 0.8277, + "step": 2176 + }, + { + "epoch": 0.5292973498662776, + "grad_norm": 17.75, + "learning_rate": 2.1181993531184156e-06, + "loss": 0.6125, + "step": 2177 + }, + { + "epoch": 0.5295404814004376, + "grad_norm": 20.375, + "learning_rate": 2.117851612173161e-06, + "loss": 1.027, + "step": 2178 + }, + { + "epoch": 0.5297836129345976, + "grad_norm": 17.875, + "learning_rate": 2.1175037415124947e-06, + "loss": 0.8801, + "step": 2179 + }, + { + "epoch": 0.5300267444687576, + "grad_norm": 21.0, + "learning_rate": 2.1171557411884116e-06, + "loss": 1.0453, + "step": 2180 + }, + { + "epoch": 0.5302698760029175, + "grad_norm": 25.875, + "learning_rate": 2.116807611252927e-06, + "loss": 1.467, + "step": 2181 + }, + { + "epoch": 0.5305130075370775, + "grad_norm": 16.875, + "learning_rate": 2.1164593517580746e-06, + "loss": 0.9979, + "step": 2182 + }, + { + "epoch": 0.5307561390712375, + "grad_norm": 15.375, + "learning_rate": 2.116110962755908e-06, + "loss": 0.6979, + "step": 2183 + }, + { + "epoch": 0.5309992706053975, + "grad_norm": 31.0, + "learning_rate": 2.1157624442984993e-06, + "loss": 1.4906, + "step": 2184 + }, + { + "epoch": 0.5312424021395575, + "grad_norm": 19.625, + "learning_rate": 2.115413796437941e-06, + "loss": 0.9826, + "step": 2185 + }, + { + "epoch": 0.5314855336737175, + "grad_norm": 20.625, + "learning_rate": 2.115065019226345e-06, + "loss": 0.7923, + "step": 2186 + }, + { + "epoch": 0.5317286652078774, + "grad_norm": 20.625, + "learning_rate": 2.114716112715842e-06, + "loss": 1.0459, + "step": 2187 + }, + { + "epoch": 0.5319717967420374, + "grad_norm": 17.25, + "learning_rate": 2.114367076958581e-06, + "loss": 1.155, + "step": 2188 + }, + { + "epoch": 0.5322149282761974, + "grad_norm": 18.625, + "learning_rate": 2.1140179120067324e-06, + "loss": 1.0049, + "step": 2189 + }, + { + "epoch": 0.5324580598103574, + "grad_norm": 19.75, + "learning_rate": 2.113668617912485e-06, + "loss": 0.8863, + "step": 2190 + }, + { + "epoch": 0.5327011913445174, + "grad_norm": 15.375, + "learning_rate": 2.1133191947280465e-06, + "loss": 0.7787, + "step": 2191 + }, + { + "epoch": 0.5329443228786773, + "grad_norm": 21.125, + "learning_rate": 2.112969642505644e-06, + "loss": 0.8467, + "step": 2192 + }, + { + "epoch": 0.5331874544128373, + "grad_norm": 22.25, + "learning_rate": 2.112619961297525e-06, + "loss": 0.7615, + "step": 2193 + }, + { + "epoch": 0.5334305859469973, + "grad_norm": 23.625, + "learning_rate": 2.1122701511559548e-06, + "loss": 0.7331, + "step": 2194 + }, + { + "epoch": 0.5336737174811573, + "grad_norm": 17.875, + "learning_rate": 2.1119202121332185e-06, + "loss": 0.751, + "step": 2195 + }, + { + "epoch": 0.5339168490153173, + "grad_norm": 24.625, + "learning_rate": 2.11157014428162e-06, + "loss": 1.0139, + "step": 2196 + }, + { + "epoch": 0.5341599805494772, + "grad_norm": 17.125, + "learning_rate": 2.111219947653484e-06, + "loss": 0.7356, + "step": 2197 + }, + { + "epoch": 0.5344031120836372, + "grad_norm": 23.625, + "learning_rate": 2.1108696223011534e-06, + "loss": 1.0466, + "step": 2198 + }, + { + "epoch": 0.5346462436177972, + "grad_norm": 19.0, + "learning_rate": 2.1105191682769895e-06, + "loss": 0.7842, + "step": 2199 + }, + { + "epoch": 0.5348893751519572, + "grad_norm": 25.125, + "learning_rate": 2.1101685856333744e-06, + "loss": 1.2724, + "step": 2200 + }, + { + "epoch": 0.5351325066861172, + "grad_norm": 20.5, + "learning_rate": 2.1098178744227088e-06, + "loss": 0.8252, + "step": 2201 + }, + { + "epoch": 0.5353756382202771, + "grad_norm": 24.5, + "learning_rate": 2.109467034697412e-06, + "loss": 0.5705, + "step": 2202 + }, + { + "epoch": 0.5356187697544371, + "grad_norm": 22.25, + "learning_rate": 2.1091160665099235e-06, + "loss": 0.9459, + "step": 2203 + }, + { + "epoch": 0.5358619012885971, + "grad_norm": 19.875, + "learning_rate": 2.108764969912701e-06, + "loss": 1.189, + "step": 2204 + }, + { + "epoch": 0.5361050328227571, + "grad_norm": 23.625, + "learning_rate": 2.108413744958223e-06, + "loss": 1.2873, + "step": 2205 + }, + { + "epoch": 0.5363481643569171, + "grad_norm": 17.125, + "learning_rate": 2.108062391698985e-06, + "loss": 0.7731, + "step": 2206 + }, + { + "epoch": 0.5365912958910771, + "grad_norm": 18.625, + "learning_rate": 2.1077109101875036e-06, + "loss": 0.8861, + "step": 2207 + }, + { + "epoch": 0.536834427425237, + "grad_norm": 18.5, + "learning_rate": 2.1073593004763134e-06, + "loss": 1.0385, + "step": 2208 + }, + { + "epoch": 0.537077558959397, + "grad_norm": 24.375, + "learning_rate": 2.1070075626179686e-06, + "loss": 0.8896, + "step": 2209 + }, + { + "epoch": 0.537320690493557, + "grad_norm": 15.5625, + "learning_rate": 2.1066556966650427e-06, + "loss": 0.6847, + "step": 2210 + }, + { + "epoch": 0.537563822027717, + "grad_norm": 18.25, + "learning_rate": 2.1063037026701277e-06, + "loss": 0.8065, + "step": 2211 + }, + { + "epoch": 0.537806953561877, + "grad_norm": 24.75, + "learning_rate": 2.1059515806858357e-06, + "loss": 0.9747, + "step": 2212 + }, + { + "epoch": 0.5380500850960369, + "grad_norm": 22.75, + "learning_rate": 2.105599330764797e-06, + "loss": 1.2146, + "step": 2213 + }, + { + "epoch": 0.5382932166301969, + "grad_norm": 21.0, + "learning_rate": 2.105246952959662e-06, + "loss": 0.8581, + "step": 2214 + }, + { + "epoch": 0.5385363481643569, + "grad_norm": 24.5, + "learning_rate": 2.104894447323099e-06, + "loss": 1.2318, + "step": 2215 + }, + { + "epoch": 0.5387794796985169, + "grad_norm": 22.875, + "learning_rate": 2.104541813907796e-06, + "loss": 1.3717, + "step": 2216 + }, + { + "epoch": 0.5390226112326769, + "grad_norm": 16.375, + "learning_rate": 2.104189052766461e-06, + "loss": 0.9492, + "step": 2217 + }, + { + "epoch": 0.5392657427668368, + "grad_norm": 26.75, + "learning_rate": 2.1038361639518195e-06, + "loss": 0.9797, + "step": 2218 + }, + { + "epoch": 0.5395088743009968, + "grad_norm": 35.5, + "learning_rate": 2.1034831475166166e-06, + "loss": 0.8712, + "step": 2219 + }, + { + "epoch": 0.5397520058351568, + "grad_norm": 25.125, + "learning_rate": 2.103130003513618e-06, + "loss": 1.3308, + "step": 2220 + }, + { + "epoch": 0.5399951373693168, + "grad_norm": 13.1875, + "learning_rate": 2.1027767319956055e-06, + "loss": 0.3165, + "step": 2221 + }, + { + "epoch": 0.5402382689034768, + "grad_norm": 20.375, + "learning_rate": 2.1024233330153828e-06, + "loss": 0.7538, + "step": 2222 + }, + { + "epoch": 0.5404814004376368, + "grad_norm": 20.25, + "learning_rate": 2.1020698066257707e-06, + "loss": 0.9751, + "step": 2223 + }, + { + "epoch": 0.5407245319717967, + "grad_norm": 17.875, + "learning_rate": 2.101716152879611e-06, + "loss": 0.8774, + "step": 2224 + }, + { + "epoch": 0.5409676635059567, + "grad_norm": 17.75, + "learning_rate": 2.1013623718297623e-06, + "loss": 0.8974, + "step": 2225 + }, + { + "epoch": 0.5412107950401167, + "grad_norm": 18.875, + "learning_rate": 2.1010084635291036e-06, + "loss": 0.8043, + "step": 2226 + }, + { + "epoch": 0.5414539265742767, + "grad_norm": 21.75, + "learning_rate": 2.1006544280305325e-06, + "loss": 0.8488, + "step": 2227 + }, + { + "epoch": 0.5416970581084367, + "grad_norm": 22.0, + "learning_rate": 2.1003002653869658e-06, + "loss": 0.9424, + "step": 2228 + }, + { + "epoch": 0.5419401896425966, + "grad_norm": 20.75, + "learning_rate": 2.099945975651339e-06, + "loss": 0.9605, + "step": 2229 + }, + { + "epoch": 0.5421833211767566, + "grad_norm": 24.375, + "learning_rate": 2.0995915588766074e-06, + "loss": 1.0632, + "step": 2230 + }, + { + "epoch": 0.5424264527109166, + "grad_norm": 25.0, + "learning_rate": 2.0992370151157444e-06, + "loss": 1.0454, + "step": 2231 + }, + { + "epoch": 0.5426695842450766, + "grad_norm": 17.25, + "learning_rate": 2.0988823444217426e-06, + "loss": 0.8084, + "step": 2232 + }, + { + "epoch": 0.5429127157792366, + "grad_norm": 17.75, + "learning_rate": 2.0985275468476137e-06, + "loss": 0.9918, + "step": 2233 + }, + { + "epoch": 0.5431558473133965, + "grad_norm": 19.5, + "learning_rate": 2.098172622446388e-06, + "loss": 0.9706, + "step": 2234 + }, + { + "epoch": 0.5433989788475565, + "grad_norm": 23.625, + "learning_rate": 2.097817571271116e-06, + "loss": 0.8768, + "step": 2235 + }, + { + "epoch": 0.5436421103817165, + "grad_norm": 18.75, + "learning_rate": 2.0974623933748655e-06, + "loss": 0.9007, + "step": 2236 + }, + { + "epoch": 0.5438852419158765, + "grad_norm": 14.8125, + "learning_rate": 2.097107088810724e-06, + "loss": 0.5056, + "step": 2237 + }, + { + "epoch": 0.5441283734500365, + "grad_norm": 17.25, + "learning_rate": 2.096751657631798e-06, + "loss": 1.0783, + "step": 2238 + }, + { + "epoch": 0.5443715049841964, + "grad_norm": 22.875, + "learning_rate": 2.0963960998912132e-06, + "loss": 0.9539, + "step": 2239 + }, + { + "epoch": 0.5446146365183564, + "grad_norm": 20.375, + "learning_rate": 2.0960404156421133e-06, + "loss": 0.9082, + "step": 2240 + }, + { + "epoch": 0.5448577680525164, + "grad_norm": 23.375, + "learning_rate": 2.095684604937662e-06, + "loss": 1.0703, + "step": 2241 + }, + { + "epoch": 0.5451008995866764, + "grad_norm": 15.3125, + "learning_rate": 2.09532866783104e-06, + "loss": 0.6463, + "step": 2242 + }, + { + "epoch": 0.5453440311208364, + "grad_norm": 18.5, + "learning_rate": 2.09497260437545e-06, + "loss": 0.8202, + "step": 2243 + }, + { + "epoch": 0.5455871626549964, + "grad_norm": 21.0, + "learning_rate": 2.094616414624111e-06, + "loss": 0.8521, + "step": 2244 + }, + { + "epoch": 0.5458302941891563, + "grad_norm": 33.0, + "learning_rate": 2.094260098630262e-06, + "loss": 1.0215, + "step": 2245 + }, + { + "epoch": 0.5460734257233163, + "grad_norm": 22.375, + "learning_rate": 2.09390365644716e-06, + "loss": 1.2234, + "step": 2246 + }, + { + "epoch": 0.5463165572574763, + "grad_norm": 20.5, + "learning_rate": 2.093547088128082e-06, + "loss": 0.7184, + "step": 2247 + }, + { + "epoch": 0.5465596887916363, + "grad_norm": 34.25, + "learning_rate": 2.093190393726323e-06, + "loss": 1.2451, + "step": 2248 + }, + { + "epoch": 0.5468028203257963, + "grad_norm": 14.375, + "learning_rate": 2.0928335732951976e-06, + "loss": 0.5257, + "step": 2249 + }, + { + "epoch": 0.5470459518599562, + "grad_norm": 21.125, + "learning_rate": 2.0924766268880382e-06, + "loss": 0.7474, + "step": 2250 + }, + { + "epoch": 0.5472890833941162, + "grad_norm": 34.0, + "learning_rate": 2.0921195545581967e-06, + "loss": 0.8276, + "step": 2251 + }, + { + "epoch": 0.5475322149282762, + "grad_norm": 31.625, + "learning_rate": 2.091762356359044e-06, + "loss": 0.9416, + "step": 2252 + }, + { + "epoch": 0.5477753464624362, + "grad_norm": 21.75, + "learning_rate": 2.0914050323439695e-06, + "loss": 1.4075, + "step": 2253 + }, + { + "epoch": 0.5480184779965962, + "grad_norm": 24.5, + "learning_rate": 2.0910475825663813e-06, + "loss": 1.0821, + "step": 2254 + }, + { + "epoch": 0.5482616095307561, + "grad_norm": 15.0, + "learning_rate": 2.0906900070797067e-06, + "loss": 0.6176, + "step": 2255 + }, + { + "epoch": 0.5485047410649161, + "grad_norm": 12.8125, + "learning_rate": 2.090332305937391e-06, + "loss": 0.4484, + "step": 2256 + }, + { + "epoch": 0.5487478725990761, + "grad_norm": 24.375, + "learning_rate": 2.089974479192899e-06, + "loss": 1.1816, + "step": 2257 + }, + { + "epoch": 0.5489910041332361, + "grad_norm": 15.4375, + "learning_rate": 2.0896165268997145e-06, + "loss": 1.0037, + "step": 2258 + }, + { + "epoch": 0.5492341356673961, + "grad_norm": 16.25, + "learning_rate": 2.089258449111339e-06, + "loss": 0.5961, + "step": 2259 + }, + { + "epoch": 0.5494772672015561, + "grad_norm": 27.75, + "learning_rate": 2.088900245881294e-06, + "loss": 0.9547, + "step": 2260 + }, + { + "epoch": 0.549720398735716, + "grad_norm": 18.375, + "learning_rate": 2.0885419172631192e-06, + "loss": 1.0561, + "step": 2261 + }, + { + "epoch": 0.549963530269876, + "grad_norm": 20.0, + "learning_rate": 2.088183463310373e-06, + "loss": 0.9799, + "step": 2262 + }, + { + "epoch": 0.550206661804036, + "grad_norm": 15.625, + "learning_rate": 2.0878248840766317e-06, + "loss": 0.5904, + "step": 2263 + }, + { + "epoch": 0.550449793338196, + "grad_norm": 24.75, + "learning_rate": 2.0874661796154916e-06, + "loss": 0.7311, + "step": 2264 + }, + { + "epoch": 0.550692924872356, + "grad_norm": 18.5, + "learning_rate": 2.087107349980568e-06, + "loss": 0.7602, + "step": 2265 + }, + { + "epoch": 0.5509360564065159, + "grad_norm": 15.9375, + "learning_rate": 2.086748395225493e-06, + "loss": 0.6092, + "step": 2266 + }, + { + "epoch": 0.5511791879406759, + "grad_norm": 18.625, + "learning_rate": 2.08638931540392e-06, + "loss": 0.7429, + "step": 2267 + }, + { + "epoch": 0.5514223194748359, + "grad_norm": 32.5, + "learning_rate": 2.086030110569518e-06, + "loss": 0.9813, + "step": 2268 + }, + { + "epoch": 0.5516654510089959, + "grad_norm": 15.6875, + "learning_rate": 2.0856707807759776e-06, + "loss": 0.6213, + "step": 2269 + }, + { + "epoch": 0.5519085825431559, + "grad_norm": 21.5, + "learning_rate": 2.0853113260770063e-06, + "loss": 1.0692, + "step": 2270 + }, + { + "epoch": 0.5521517140773158, + "grad_norm": 23.0, + "learning_rate": 2.0849517465263315e-06, + "loss": 1.0135, + "step": 2271 + }, + { + "epoch": 0.5523948456114758, + "grad_norm": 18.0, + "learning_rate": 2.084592042177698e-06, + "loss": 1.0545, + "step": 2272 + }, + { + "epoch": 0.5526379771456358, + "grad_norm": 12.3125, + "learning_rate": 2.084232213084869e-06, + "loss": 0.5815, + "step": 2273 + }, + { + "epoch": 0.5528811086797958, + "grad_norm": 21.375, + "learning_rate": 2.0838722593016288e-06, + "loss": 0.8707, + "step": 2274 + }, + { + "epoch": 0.5531242402139558, + "grad_norm": 25.625, + "learning_rate": 2.083512180881778e-06, + "loss": 0.76, + "step": 2275 + }, + { + "epoch": 0.5533673717481157, + "grad_norm": 14.25, + "learning_rate": 2.0831519778791363e-06, + "loss": 0.4806, + "step": 2276 + }, + { + "epoch": 0.5536105032822757, + "grad_norm": 13.75, + "learning_rate": 2.0827916503475425e-06, + "loss": 0.5682, + "step": 2277 + }, + { + "epoch": 0.5538536348164357, + "grad_norm": 26.875, + "learning_rate": 2.082431198340854e-06, + "loss": 0.9897, + "step": 2278 + }, + { + "epoch": 0.5540967663505957, + "grad_norm": 18.75, + "learning_rate": 2.082070621912946e-06, + "loss": 1.3372, + "step": 2279 + }, + { + "epoch": 0.5543398978847557, + "grad_norm": 24.0, + "learning_rate": 2.0817099211177137e-06, + "loss": 1.2054, + "step": 2280 + }, + { + "epoch": 0.5545830294189157, + "grad_norm": 26.5, + "learning_rate": 2.081349096009069e-06, + "loss": 0.9381, + "step": 2281 + }, + { + "epoch": 0.5548261609530756, + "grad_norm": 22.375, + "learning_rate": 2.0809881466409444e-06, + "loss": 0.9875, + "step": 2282 + }, + { + "epoch": 0.5550692924872356, + "grad_norm": 23.375, + "learning_rate": 2.080627073067289e-06, + "loss": 0.9427, + "step": 2283 + }, + { + "epoch": 0.5553124240213956, + "grad_norm": 18.75, + "learning_rate": 2.0802658753420728e-06, + "loss": 0.8702, + "step": 2284 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 17.5, + "learning_rate": 2.0799045535192817e-06, + "loss": 0.5937, + "step": 2285 + }, + { + "epoch": 0.5557986870897156, + "grad_norm": 21.75, + "learning_rate": 2.0795431076529226e-06, + "loss": 1.0784, + "step": 2286 + }, + { + "epoch": 0.5560418186238755, + "grad_norm": 21.25, + "learning_rate": 2.0791815377970197e-06, + "loss": 1.1514, + "step": 2287 + }, + { + "epoch": 0.5562849501580355, + "grad_norm": 16.25, + "learning_rate": 2.078819844005615e-06, + "loss": 0.6522, + "step": 2288 + }, + { + "epoch": 0.5565280816921955, + "grad_norm": 17.5, + "learning_rate": 2.07845802633277e-06, + "loss": 0.9068, + "step": 2289 + }, + { + "epoch": 0.5567712132263555, + "grad_norm": 21.0, + "learning_rate": 2.078096084832566e-06, + "loss": 0.8342, + "step": 2290 + }, + { + "epoch": 0.5570143447605155, + "grad_norm": 21.5, + "learning_rate": 2.0777340195590996e-06, + "loss": 0.6393, + "step": 2291 + }, + { + "epoch": 0.5572574762946754, + "grad_norm": 21.375, + "learning_rate": 2.0773718305664887e-06, + "loss": 0.6652, + "step": 2292 + }, + { + "epoch": 0.5575006078288354, + "grad_norm": 22.75, + "learning_rate": 2.0770095179088688e-06, + "loss": 0.7814, + "step": 2293 + }, + { + "epoch": 0.5577437393629954, + "grad_norm": 18.375, + "learning_rate": 2.0766470816403935e-06, + "loss": 0.9092, + "step": 2294 + }, + { + "epoch": 0.5579868708971554, + "grad_norm": 21.5, + "learning_rate": 2.076284521815235e-06, + "loss": 0.9198, + "step": 2295 + }, + { + "epoch": 0.5582300024313154, + "grad_norm": 21.125, + "learning_rate": 2.075921838487584e-06, + "loss": 0.7984, + "step": 2296 + }, + { + "epoch": 0.5584731339654754, + "grad_norm": 17.75, + "learning_rate": 2.07555903171165e-06, + "loss": 0.8968, + "step": 2297 + }, + { + "epoch": 0.5587162654996353, + "grad_norm": 22.0, + "learning_rate": 2.0751961015416617e-06, + "loss": 1.0454, + "step": 2298 + }, + { + "epoch": 0.5589593970337953, + "grad_norm": 14.9375, + "learning_rate": 2.0748330480318637e-06, + "loss": 0.7281, + "step": 2299 + }, + { + "epoch": 0.5592025285679553, + "grad_norm": 15.1875, + "learning_rate": 2.0744698712365215e-06, + "loss": 0.7593, + "step": 2300 + }, + { + "epoch": 0.5594456601021153, + "grad_norm": 22.25, + "learning_rate": 2.074106571209918e-06, + "loss": 0.8927, + "step": 2301 + }, + { + "epoch": 0.5596887916362753, + "grad_norm": 16.75, + "learning_rate": 2.073743148006354e-06, + "loss": 0.9008, + "step": 2302 + }, + { + "epoch": 0.5599319231704352, + "grad_norm": 30.25, + "learning_rate": 2.073379601680151e-06, + "loss": 0.936, + "step": 2303 + }, + { + "epoch": 0.5601750547045952, + "grad_norm": 19.25, + "learning_rate": 2.0730159322856454e-06, + "loss": 0.7952, + "step": 2304 + }, + { + "epoch": 0.5604181862387552, + "grad_norm": 17.5, + "learning_rate": 2.0726521398771956e-06, + "loss": 0.6307, + "step": 2305 + }, + { + "epoch": 0.5606613177729152, + "grad_norm": 19.125, + "learning_rate": 2.0722882245091753e-06, + "loss": 0.8667, + "step": 2306 + }, + { + "epoch": 0.5609044493070752, + "grad_norm": 14.0625, + "learning_rate": 2.0719241862359786e-06, + "loss": 0.623, + "step": 2307 + }, + { + "epoch": 0.5611475808412351, + "grad_norm": 21.125, + "learning_rate": 2.0715600251120167e-06, + "loss": 1.173, + "step": 2308 + }, + { + "epoch": 0.5613907123753951, + "grad_norm": 16.25, + "learning_rate": 2.0711957411917207e-06, + "loss": 0.8189, + "step": 2309 + }, + { + "epoch": 0.5616338439095551, + "grad_norm": 16.25, + "learning_rate": 2.0708313345295384e-06, + "loss": 0.6672, + "step": 2310 + }, + { + "epoch": 0.5618769754437151, + "grad_norm": 19.125, + "learning_rate": 2.070466805179937e-06, + "loss": 0.7677, + "step": 2311 + }, + { + "epoch": 0.5621201069778751, + "grad_norm": 18.625, + "learning_rate": 2.0701021531974014e-06, + "loss": 0.9986, + "step": 2312 + }, + { + "epoch": 0.562363238512035, + "grad_norm": 26.0, + "learning_rate": 2.0697373786364357e-06, + "loss": 0.9489, + "step": 2313 + }, + { + "epoch": 0.562606370046195, + "grad_norm": 42.75, + "learning_rate": 2.0693724815515612e-06, + "loss": 1.7043, + "step": 2314 + }, + { + "epoch": 0.562849501580355, + "grad_norm": 18.375, + "learning_rate": 2.0690074619973185e-06, + "loss": 0.7949, + "step": 2315 + }, + { + "epoch": 0.563092633114515, + "grad_norm": 19.875, + "learning_rate": 2.0686423200282652e-06, + "loss": 1.1615, + "step": 2316 + }, + { + "epoch": 0.563335764648675, + "grad_norm": 16.875, + "learning_rate": 2.0682770556989797e-06, + "loss": 0.8947, + "step": 2317 + }, + { + "epoch": 0.563578896182835, + "grad_norm": 15.375, + "learning_rate": 2.0679116690640556e-06, + "loss": 0.5107, + "step": 2318 + }, + { + "epoch": 0.5638220277169949, + "grad_norm": 17.75, + "learning_rate": 2.0675461601781067e-06, + "loss": 0.8202, + "step": 2319 + }, + { + "epoch": 0.5640651592511549, + "grad_norm": 20.75, + "learning_rate": 2.0671805290957646e-06, + "loss": 1.0881, + "step": 2320 + }, + { + "epoch": 0.5643082907853149, + "grad_norm": 19.125, + "learning_rate": 2.0668147758716792e-06, + "loss": 0.9051, + "step": 2321 + }, + { + "epoch": 0.5645514223194749, + "grad_norm": 28.375, + "learning_rate": 2.0664489005605187e-06, + "loss": 0.9758, + "step": 2322 + }, + { + "epoch": 0.5647945538536349, + "grad_norm": 18.375, + "learning_rate": 2.0660829032169695e-06, + "loss": 0.8816, + "step": 2323 + }, + { + "epoch": 0.5650376853877948, + "grad_norm": 17.875, + "learning_rate": 2.0657167838957365e-06, + "loss": 0.7318, + "step": 2324 + }, + { + "epoch": 0.5652808169219548, + "grad_norm": 17.5, + "learning_rate": 2.065350542651542e-06, + "loss": 0.6686, + "step": 2325 + }, + { + "epoch": 0.5655239484561148, + "grad_norm": 12.125, + "learning_rate": 2.064984179539127e-06, + "loss": 0.3821, + "step": 2326 + }, + { + "epoch": 0.5657670799902748, + "grad_norm": 22.375, + "learning_rate": 2.064617694613251e-06, + "loss": 1.1379, + "step": 2327 + }, + { + "epoch": 0.5660102115244348, + "grad_norm": 17.75, + "learning_rate": 2.0642510879286924e-06, + "loss": 0.8428, + "step": 2328 + }, + { + "epoch": 0.5662533430585946, + "grad_norm": 20.0, + "learning_rate": 2.0638843595402456e-06, + "loss": 1.07, + "step": 2329 + }, + { + "epoch": 0.5664964745927547, + "grad_norm": 27.625, + "learning_rate": 2.063517509502725e-06, + "loss": 1.1483, + "step": 2330 + }, + { + "epoch": 0.5667396061269147, + "grad_norm": 20.25, + "learning_rate": 2.063150537870963e-06, + "loss": 0.9096, + "step": 2331 + }, + { + "epoch": 0.5669827376610747, + "grad_norm": 19.25, + "learning_rate": 2.062783444699809e-06, + "loss": 0.633, + "step": 2332 + }, + { + "epoch": 0.5672258691952347, + "grad_norm": 30.875, + "learning_rate": 2.0624162300441327e-06, + "loss": 1.0003, + "step": 2333 + }, + { + "epoch": 0.5674690007293947, + "grad_norm": 30.375, + "learning_rate": 2.062048893958819e-06, + "loss": 1.4803, + "step": 2334 + }, + { + "epoch": 0.5677121322635545, + "grad_norm": 30.75, + "learning_rate": 2.0616814364987738e-06, + "loss": 1.3023, + "step": 2335 + }, + { + "epoch": 0.5679552637977145, + "grad_norm": 23.375, + "learning_rate": 2.0613138577189203e-06, + "loss": 0.8766, + "step": 2336 + }, + { + "epoch": 0.5681983953318746, + "grad_norm": 21.0, + "learning_rate": 2.060946157674198e-06, + "loss": 0.7785, + "step": 2337 + }, + { + "epoch": 0.5684415268660346, + "grad_norm": 18.375, + "learning_rate": 2.0605783364195676e-06, + "loss": 0.9711, + "step": 2338 + }, + { + "epoch": 0.5686846584001946, + "grad_norm": 17.625, + "learning_rate": 2.060210394010005e-06, + "loss": 0.9325, + "step": 2339 + }, + { + "epoch": 0.5689277899343544, + "grad_norm": 16.25, + "learning_rate": 2.0598423305005065e-06, + "loss": 0.8241, + "step": 2340 + }, + { + "epoch": 0.5691709214685144, + "grad_norm": 17.875, + "learning_rate": 2.059474145946086e-06, + "loss": 0.7612, + "step": 2341 + }, + { + "epoch": 0.5694140530026744, + "grad_norm": 16.0, + "learning_rate": 2.0591058404017735e-06, + "loss": 0.7962, + "step": 2342 + }, + { + "epoch": 0.5696571845368344, + "grad_norm": 18.75, + "learning_rate": 2.058737413922619e-06, + "loss": 1.0329, + "step": 2343 + }, + { + "epoch": 0.5699003160709945, + "grad_norm": 30.375, + "learning_rate": 2.0583688665636915e-06, + "loss": 1.0044, + "step": 2344 + }, + { + "epoch": 0.5701434476051543, + "grad_norm": 21.75, + "learning_rate": 2.0580001983800752e-06, + "loss": 1.1053, + "step": 2345 + }, + { + "epoch": 0.5703865791393143, + "grad_norm": 18.5, + "learning_rate": 2.0576314094268753e-06, + "loss": 1.1189, + "step": 2346 + }, + { + "epoch": 0.5706297106734743, + "grad_norm": 16.375, + "learning_rate": 2.0572624997592124e-06, + "loss": 0.6924, + "step": 2347 + }, + { + "epoch": 0.5708728422076343, + "grad_norm": 26.125, + "learning_rate": 2.0568934694322274e-06, + "loss": 1.156, + "step": 2348 + }, + { + "epoch": 0.5711159737417943, + "grad_norm": 16.625, + "learning_rate": 2.0565243185010776e-06, + "loss": 0.633, + "step": 2349 + }, + { + "epoch": 0.5713591052759542, + "grad_norm": 26.875, + "learning_rate": 2.056155047020939e-06, + "loss": 0.922, + "step": 2350 + }, + { + "epoch": 0.5716022368101142, + "grad_norm": 19.875, + "learning_rate": 2.055785655047006e-06, + "loss": 0.7393, + "step": 2351 + }, + { + "epoch": 0.5718453683442742, + "grad_norm": 16.5, + "learning_rate": 2.055416142634491e-06, + "loss": 0.7964, + "step": 2352 + }, + { + "epoch": 0.5720884998784342, + "grad_norm": 20.5, + "learning_rate": 2.055046509838623e-06, + "loss": 0.9522, + "step": 2353 + }, + { + "epoch": 0.5723316314125942, + "grad_norm": 14.625, + "learning_rate": 2.05467675671465e-06, + "loss": 0.7326, + "step": 2354 + }, + { + "epoch": 0.5725747629467542, + "grad_norm": 20.0, + "learning_rate": 2.0543068833178394e-06, + "loss": 0.659, + "step": 2355 + }, + { + "epoch": 0.5728178944809141, + "grad_norm": 20.125, + "learning_rate": 2.053936889703474e-06, + "loss": 0.7541, + "step": 2356 + }, + { + "epoch": 0.5730610260150741, + "grad_norm": 16.625, + "learning_rate": 2.0535667759268556e-06, + "loss": 0.9131, + "step": 2357 + }, + { + "epoch": 0.5733041575492341, + "grad_norm": 29.125, + "learning_rate": 2.0531965420433046e-06, + "loss": 1.2867, + "step": 2358 + }, + { + "epoch": 0.5735472890833941, + "grad_norm": 21.25, + "learning_rate": 2.0528261881081587e-06, + "loss": 1.1126, + "step": 2359 + }, + { + "epoch": 0.5737904206175541, + "grad_norm": 17.375, + "learning_rate": 2.052455714176774e-06, + "loss": 0.9647, + "step": 2360 + }, + { + "epoch": 0.574033552151714, + "grad_norm": 25.375, + "learning_rate": 2.0520851203045236e-06, + "loss": 0.9846, + "step": 2361 + }, + { + "epoch": 0.574276683685874, + "grad_norm": 20.5, + "learning_rate": 2.0517144065467993e-06, + "loss": 0.9121, + "step": 2362 + }, + { + "epoch": 0.574519815220034, + "grad_norm": 19.25, + "learning_rate": 2.0513435729590114e-06, + "loss": 0.7062, + "step": 2363 + }, + { + "epoch": 0.574762946754194, + "grad_norm": 20.5, + "learning_rate": 2.0509726195965863e-06, + "loss": 0.8853, + "step": 2364 + }, + { + "epoch": 0.575006078288354, + "grad_norm": 18.375, + "learning_rate": 2.0506015465149704e-06, + "loss": 0.7734, + "step": 2365 + }, + { + "epoch": 0.5752492098225139, + "grad_norm": 17.5, + "learning_rate": 2.050230353769626e-06, + "loss": 0.7197, + "step": 2366 + }, + { + "epoch": 0.5754923413566739, + "grad_norm": 20.125, + "learning_rate": 2.049859041416035e-06, + "loss": 0.8364, + "step": 2367 + }, + { + "epoch": 0.5757354728908339, + "grad_norm": 14.5, + "learning_rate": 2.0494876095096964e-06, + "loss": 0.7339, + "step": 2368 + }, + { + "epoch": 0.5759786044249939, + "grad_norm": 14.9375, + "learning_rate": 2.049116058106127e-06, + "loss": 0.757, + "step": 2369 + }, + { + "epoch": 0.5762217359591539, + "grad_norm": 21.75, + "learning_rate": 2.0487443872608613e-06, + "loss": 1.1455, + "step": 2370 + }, + { + "epoch": 0.5764648674933139, + "grad_norm": 21.5, + "learning_rate": 2.048372597029452e-06, + "loss": 0.7505, + "step": 2371 + }, + { + "epoch": 0.5767079990274738, + "grad_norm": 15.8125, + "learning_rate": 2.04800068746747e-06, + "loss": 0.6198, + "step": 2372 + }, + { + "epoch": 0.5769511305616338, + "grad_norm": 18.0, + "learning_rate": 2.047628658630503e-06, + "loss": 0.9446, + "step": 2373 + }, + { + "epoch": 0.5771942620957938, + "grad_norm": 15.5, + "learning_rate": 2.0472565105741578e-06, + "loss": 1.1734, + "step": 2374 + }, + { + "epoch": 0.5774373936299538, + "grad_norm": 19.625, + "learning_rate": 2.0468842433540576e-06, + "loss": 1.1002, + "step": 2375 + }, + { + "epoch": 0.5776805251641138, + "grad_norm": 16.625, + "learning_rate": 2.046511857025845e-06, + "loss": 1.0567, + "step": 2376 + }, + { + "epoch": 0.5779236566982737, + "grad_norm": 17.875, + "learning_rate": 2.0461393516451785e-06, + "loss": 0.7922, + "step": 2377 + }, + { + "epoch": 0.5781667882324337, + "grad_norm": 24.375, + "learning_rate": 2.0457667272677366e-06, + "loss": 1.0617, + "step": 2378 + }, + { + "epoch": 0.5784099197665937, + "grad_norm": 24.375, + "learning_rate": 2.0453939839492137e-06, + "loss": 0.9944, + "step": 2379 + }, + { + "epoch": 0.5786530513007537, + "grad_norm": 17.25, + "learning_rate": 2.0450211217453235e-06, + "loss": 0.962, + "step": 2380 + }, + { + "epoch": 0.5788961828349137, + "grad_norm": 15.5, + "learning_rate": 2.0446481407117953e-06, + "loss": 0.5756, + "step": 2381 + }, + { + "epoch": 0.5791393143690736, + "grad_norm": 17.75, + "learning_rate": 2.044275040904379e-06, + "loss": 1.2556, + "step": 2382 + }, + { + "epoch": 0.5793824459032336, + "grad_norm": 22.875, + "learning_rate": 2.0439018223788402e-06, + "loss": 0.9041, + "step": 2383 + }, + { + "epoch": 0.5796255774373936, + "grad_norm": 14.5625, + "learning_rate": 2.043528485190963e-06, + "loss": 0.414, + "step": 2384 + }, + { + "epoch": 0.5798687089715536, + "grad_norm": 18.375, + "learning_rate": 2.0431550293965486e-06, + "loss": 0.8813, + "step": 2385 + }, + { + "epoch": 0.5801118405057136, + "grad_norm": 21.75, + "learning_rate": 2.042781455051417e-06, + "loss": 0.7461, + "step": 2386 + }, + { + "epoch": 0.5803549720398735, + "grad_norm": 19.75, + "learning_rate": 2.042407762211405e-06, + "loss": 1.0001, + "step": 2387 + }, + { + "epoch": 0.5805981035740335, + "grad_norm": 19.125, + "learning_rate": 2.042033950932368e-06, + "loss": 1.0402, + "step": 2388 + }, + { + "epoch": 0.5808412351081935, + "grad_norm": 20.125, + "learning_rate": 2.0416600212701777e-06, + "loss": 1.1356, + "step": 2389 + }, + { + "epoch": 0.5810843666423535, + "grad_norm": 21.125, + "learning_rate": 2.041285973280725e-06, + "loss": 0.9779, + "step": 2390 + }, + { + "epoch": 0.5813274981765135, + "grad_norm": 15.5625, + "learning_rate": 2.0409118070199175e-06, + "loss": 0.5775, + "step": 2391 + }, + { + "epoch": 0.5815706297106735, + "grad_norm": 19.375, + "learning_rate": 2.040537522543681e-06, + "loss": 0.9431, + "step": 2392 + }, + { + "epoch": 0.5818137612448334, + "grad_norm": 18.625, + "learning_rate": 2.0401631199079584e-06, + "loss": 0.9297, + "step": 2393 + }, + { + "epoch": 0.5820568927789934, + "grad_norm": 25.0, + "learning_rate": 2.039788599168711e-06, + "loss": 1.3637, + "step": 2394 + }, + { + "epoch": 0.5823000243131534, + "grad_norm": 23.75, + "learning_rate": 2.0394139603819176e-06, + "loss": 1.4877, + "step": 2395 + }, + { + "epoch": 0.5825431558473134, + "grad_norm": 24.75, + "learning_rate": 2.039039203603574e-06, + "loss": 1.1025, + "step": 2396 + }, + { + "epoch": 0.5827862873814734, + "grad_norm": 23.625, + "learning_rate": 2.0386643288896944e-06, + "loss": 1.1648, + "step": 2397 + }, + { + "epoch": 0.5830294189156333, + "grad_norm": 20.375, + "learning_rate": 2.0382893362963102e-06, + "loss": 0.6469, + "step": 2398 + }, + { + "epoch": 0.5832725504497933, + "grad_norm": 20.375, + "learning_rate": 2.0379142258794703e-06, + "loss": 0.8905, + "step": 2399 + }, + { + "epoch": 0.5835156819839533, + "grad_norm": 14.4375, + "learning_rate": 2.0375389976952416e-06, + "loss": 0.4437, + "step": 2400 + }, + { + "epoch": 0.5837588135181133, + "grad_norm": 19.625, + "learning_rate": 2.0371636517997085e-06, + "loss": 1.5071, + "step": 2401 + }, + { + "epoch": 0.5840019450522733, + "grad_norm": 18.5, + "learning_rate": 2.0367881882489727e-06, + "loss": 0.8795, + "step": 2402 + }, + { + "epoch": 0.5842450765864332, + "grad_norm": 21.5, + "learning_rate": 2.0364126070991543e-06, + "loss": 0.9467, + "step": 2403 + }, + { + "epoch": 0.5844882081205932, + "grad_norm": 19.75, + "learning_rate": 2.036036908406389e-06, + "loss": 1.3867, + "step": 2404 + }, + { + "epoch": 0.5847313396547532, + "grad_norm": 20.125, + "learning_rate": 2.0356610922268335e-06, + "loss": 0.7784, + "step": 2405 + }, + { + "epoch": 0.5849744711889132, + "grad_norm": 23.125, + "learning_rate": 2.035285158616658e-06, + "loss": 0.912, + "step": 2406 + }, + { + "epoch": 0.5852176027230732, + "grad_norm": 26.125, + "learning_rate": 2.034909107632054e-06, + "loss": 1.0614, + "step": 2407 + }, + { + "epoch": 0.5854607342572332, + "grad_norm": 17.25, + "learning_rate": 2.0345329393292272e-06, + "loss": 0.643, + "step": 2408 + }, + { + "epoch": 0.5857038657913931, + "grad_norm": 19.625, + "learning_rate": 2.034156653764404e-06, + "loss": 0.7394, + "step": 2409 + }, + { + "epoch": 0.5859469973255531, + "grad_norm": 18.625, + "learning_rate": 2.033780250993826e-06, + "loss": 0.7713, + "step": 2410 + }, + { + "epoch": 0.5861901288597131, + "grad_norm": 21.5, + "learning_rate": 2.033403731073753e-06, + "loss": 1.119, + "step": 2411 + }, + { + "epoch": 0.5864332603938731, + "grad_norm": 19.75, + "learning_rate": 2.033027094060462e-06, + "loss": 1.2968, + "step": 2412 + }, + { + "epoch": 0.5866763919280331, + "grad_norm": 15.5, + "learning_rate": 2.0326503400102494e-06, + "loss": 0.6065, + "step": 2413 + }, + { + "epoch": 0.586919523462193, + "grad_norm": 18.0, + "learning_rate": 2.0322734689794262e-06, + "loss": 0.7435, + "step": 2414 + }, + { + "epoch": 0.587162654996353, + "grad_norm": 14.75, + "learning_rate": 2.0318964810243224e-06, + "loss": 0.4709, + "step": 2415 + }, + { + "epoch": 0.587405786530513, + "grad_norm": 27.125, + "learning_rate": 2.031519376201286e-06, + "loss": 0.9531, + "step": 2416 + }, + { + "epoch": 0.587648918064673, + "grad_norm": 16.25, + "learning_rate": 2.0311421545666817e-06, + "loss": 0.7211, + "step": 2417 + }, + { + "epoch": 0.587892049598833, + "grad_norm": 17.625, + "learning_rate": 2.0307648161768914e-06, + "loss": 0.83, + "step": 2418 + }, + { + "epoch": 0.5881351811329929, + "grad_norm": 17.25, + "learning_rate": 2.030387361088315e-06, + "loss": 0.8202, + "step": 2419 + }, + { + "epoch": 0.5883783126671529, + "grad_norm": 13.625, + "learning_rate": 2.0300097893573694e-06, + "loss": 0.3608, + "step": 2420 + }, + { + "epoch": 0.5886214442013129, + "grad_norm": 20.375, + "learning_rate": 2.02963210104049e-06, + "loss": 0.7299, + "step": 2421 + }, + { + "epoch": 0.5888645757354729, + "grad_norm": 17.125, + "learning_rate": 2.0292542961941285e-06, + "loss": 0.8545, + "step": 2422 + }, + { + "epoch": 0.5891077072696329, + "grad_norm": 22.0, + "learning_rate": 2.028876374874754e-06, + "loss": 1.1379, + "step": 2423 + }, + { + "epoch": 0.5893508388037928, + "grad_norm": 18.75, + "learning_rate": 2.028498337138853e-06, + "loss": 1.0685, + "step": 2424 + }, + { + "epoch": 0.5895939703379528, + "grad_norm": 15.3125, + "learning_rate": 2.0281201830429316e-06, + "loss": 0.9339, + "step": 2425 + }, + { + "epoch": 0.5898371018721128, + "grad_norm": 17.75, + "learning_rate": 2.02774191264351e-06, + "loss": 1.0232, + "step": 2426 + }, + { + "epoch": 0.5900802334062728, + "grad_norm": 18.375, + "learning_rate": 2.0273635259971268e-06, + "loss": 1.0813, + "step": 2427 + }, + { + "epoch": 0.5903233649404328, + "grad_norm": 23.625, + "learning_rate": 2.0269850231603393e-06, + "loss": 1.1019, + "step": 2428 + }, + { + "epoch": 0.5905664964745928, + "grad_norm": 17.25, + "learning_rate": 2.0266064041897216e-06, + "loss": 0.6596, + "step": 2429 + }, + { + "epoch": 0.5908096280087527, + "grad_norm": 22.25, + "learning_rate": 2.026227669141864e-06, + "loss": 1.3176, + "step": 2430 + }, + { + "epoch": 0.5910527595429127, + "grad_norm": 19.875, + "learning_rate": 2.0258488180733755e-06, + "loss": 0.7592, + "step": 2431 + }, + { + "epoch": 0.5912958910770727, + "grad_norm": 17.875, + "learning_rate": 2.0254698510408815e-06, + "loss": 1.0825, + "step": 2432 + }, + { + "epoch": 0.5915390226112327, + "grad_norm": 17.0, + "learning_rate": 2.0250907681010255e-06, + "loss": 1.1475, + "step": 2433 + }, + { + "epoch": 0.5917821541453927, + "grad_norm": 17.125, + "learning_rate": 2.024711569310468e-06, + "loss": 0.8014, + "step": 2434 + }, + { + "epoch": 0.5920252856795526, + "grad_norm": 20.0, + "learning_rate": 2.0243322547258866e-06, + "loss": 1.1512, + "step": 2435 + }, + { + "epoch": 0.5922684172137126, + "grad_norm": 23.0, + "learning_rate": 2.0239528244039767e-06, + "loss": 0.7642, + "step": 2436 + }, + { + "epoch": 0.5925115487478726, + "grad_norm": 23.75, + "learning_rate": 2.0235732784014507e-06, + "loss": 1.2959, + "step": 2437 + }, + { + "epoch": 0.5927546802820326, + "grad_norm": 16.875, + "learning_rate": 2.0231936167750378e-06, + "loss": 0.6246, + "step": 2438 + }, + { + "epoch": 0.5929978118161926, + "grad_norm": 24.125, + "learning_rate": 2.0228138395814854e-06, + "loss": 1.1494, + "step": 2439 + }, + { + "epoch": 0.5932409433503525, + "grad_norm": 13.25, + "learning_rate": 2.022433946877558e-06, + "loss": 0.3006, + "step": 2440 + }, + { + "epoch": 0.5934840748845125, + "grad_norm": 37.5, + "learning_rate": 2.0220539387200365e-06, + "loss": 1.3829, + "step": 2441 + }, + { + "epoch": 0.5937272064186725, + "grad_norm": 15.75, + "learning_rate": 2.0216738151657208e-06, + "loss": 0.7871, + "step": 2442 + }, + { + "epoch": 0.5939703379528325, + "grad_norm": 15.25, + "learning_rate": 2.0212935762714254e-06, + "loss": 0.817, + "step": 2443 + }, + { + "epoch": 0.5942134694869925, + "grad_norm": 15.125, + "learning_rate": 2.0209132220939845e-06, + "loss": 0.5611, + "step": 2444 + }, + { + "epoch": 0.5944566010211525, + "grad_norm": 17.375, + "learning_rate": 2.0205327526902486e-06, + "loss": 0.873, + "step": 2445 + }, + { + "epoch": 0.5946997325553124, + "grad_norm": 17.125, + "learning_rate": 2.020152168117085e-06, + "loss": 1.2222, + "step": 2446 + }, + { + "epoch": 0.5949428640894724, + "grad_norm": 21.875, + "learning_rate": 2.0197714684313786e-06, + "loss": 0.689, + "step": 2447 + }, + { + "epoch": 0.5951859956236324, + "grad_norm": 22.5, + "learning_rate": 2.019390653690033e-06, + "loss": 1.0182, + "step": 2448 + }, + { + "epoch": 0.5954291271577924, + "grad_norm": 20.0, + "learning_rate": 2.019009723949965e-06, + "loss": 1.0079, + "step": 2449 + }, + { + "epoch": 0.5956722586919524, + "grad_norm": 23.5, + "learning_rate": 2.018628679268113e-06, + "loss": 0.9264, + "step": 2450 + }, + { + "epoch": 0.5959153902261123, + "grad_norm": 21.625, + "learning_rate": 2.0182475197014306e-06, + "loss": 0.9782, + "step": 2451 + }, + { + "epoch": 0.5961585217602723, + "grad_norm": 16.25, + "learning_rate": 2.0178662453068877e-06, + "loss": 0.6295, + "step": 2452 + }, + { + "epoch": 0.5964016532944323, + "grad_norm": 19.375, + "learning_rate": 2.0174848561414734e-06, + "loss": 0.6381, + "step": 2453 + }, + { + "epoch": 0.5966447848285923, + "grad_norm": 17.625, + "learning_rate": 2.017103352262192e-06, + "loss": 0.589, + "step": 2454 + }, + { + "epoch": 0.5968879163627523, + "grad_norm": 16.875, + "learning_rate": 2.0167217337260665e-06, + "loss": 0.4894, + "step": 2455 + }, + { + "epoch": 0.5971310478969122, + "grad_norm": 17.875, + "learning_rate": 2.0163400005901362e-06, + "loss": 0.8663, + "step": 2456 + }, + { + "epoch": 0.5973741794310722, + "grad_norm": 13.875, + "learning_rate": 2.015958152911458e-06, + "loss": 0.6678, + "step": 2457 + }, + { + "epoch": 0.5976173109652322, + "grad_norm": 16.625, + "learning_rate": 2.0155761907471043e-06, + "loss": 0.7652, + "step": 2458 + }, + { + "epoch": 0.5978604424993922, + "grad_norm": 23.375, + "learning_rate": 2.015194114154168e-06, + "loss": 0.9417, + "step": 2459 + }, + { + "epoch": 0.5981035740335522, + "grad_norm": 18.125, + "learning_rate": 2.0148119231897556e-06, + "loss": 0.912, + "step": 2460 + }, + { + "epoch": 0.5983467055677121, + "grad_norm": 24.0, + "learning_rate": 2.0144296179109923e-06, + "loss": 0.8892, + "step": 2461 + }, + { + "epoch": 0.5985898371018721, + "grad_norm": 26.25, + "learning_rate": 2.0140471983750205e-06, + "loss": 0.9228, + "step": 2462 + }, + { + "epoch": 0.5988329686360321, + "grad_norm": 18.75, + "learning_rate": 2.0136646646389996e-06, + "loss": 1.0494, + "step": 2463 + }, + { + "epoch": 0.5990761001701921, + "grad_norm": 15.5625, + "learning_rate": 2.013282016760105e-06, + "loss": 0.5649, + "step": 2464 + }, + { + "epoch": 0.5993192317043521, + "grad_norm": 39.75, + "learning_rate": 2.0128992547955315e-06, + "loss": 1.1498, + "step": 2465 + }, + { + "epoch": 0.5995623632385121, + "grad_norm": 22.125, + "learning_rate": 2.012516378802488e-06, + "loss": 1.0061, + "step": 2466 + }, + { + "epoch": 0.599805494772672, + "grad_norm": 23.75, + "learning_rate": 2.0121333888382032e-06, + "loss": 1.245, + "step": 2467 + }, + { + "epoch": 0.600048626306832, + "grad_norm": 16.5, + "learning_rate": 2.0117502849599204e-06, + "loss": 0.6547, + "step": 2468 + }, + { + "epoch": 0.600291757840992, + "grad_norm": 19.0, + "learning_rate": 2.011367067224902e-06, + "loss": 0.6211, + "step": 2469 + }, + { + "epoch": 0.600534889375152, + "grad_norm": 19.5, + "learning_rate": 2.0109837356904257e-06, + "loss": 0.8737, + "step": 2470 + }, + { + "epoch": 0.600778020909312, + "grad_norm": 23.375, + "learning_rate": 2.0106002904137877e-06, + "loss": 0.9821, + "step": 2471 + }, + { + "epoch": 0.6010211524434719, + "grad_norm": 18.125, + "learning_rate": 2.0102167314523004e-06, + "loss": 0.9827, + "step": 2472 + }, + { + "epoch": 0.6012642839776319, + "grad_norm": 16.75, + "learning_rate": 2.009833058863293e-06, + "loss": 0.8096, + "step": 2473 + }, + { + "epoch": 0.6015074155117919, + "grad_norm": 24.25, + "learning_rate": 2.0094492727041124e-06, + "loss": 0.8818, + "step": 2474 + }, + { + "epoch": 0.6017505470459519, + "grad_norm": 14.375, + "learning_rate": 2.009065373032122e-06, + "loss": 0.4699, + "step": 2475 + }, + { + "epoch": 0.6019936785801119, + "grad_norm": 14.125, + "learning_rate": 2.0086813599047012e-06, + "loss": 0.6093, + "step": 2476 + }, + { + "epoch": 0.6022368101142718, + "grad_norm": 17.25, + "learning_rate": 2.0082972333792496e-06, + "loss": 0.6792, + "step": 2477 + }, + { + "epoch": 0.6024799416484318, + "grad_norm": 13.5, + "learning_rate": 2.007912993513179e-06, + "loss": 0.4687, + "step": 2478 + }, + { + "epoch": 0.6027230731825918, + "grad_norm": 23.875, + "learning_rate": 2.0075286403639226e-06, + "loss": 0.792, + "step": 2479 + }, + { + "epoch": 0.6029662047167518, + "grad_norm": 15.9375, + "learning_rate": 2.0071441739889278e-06, + "loss": 0.7047, + "step": 2480 + }, + { + "epoch": 0.6032093362509118, + "grad_norm": 25.0, + "learning_rate": 2.0067595944456598e-06, + "loss": 1.1387, + "step": 2481 + }, + { + "epoch": 0.6034524677850718, + "grad_norm": 37.25, + "learning_rate": 2.006374901791601e-06, + "loss": 1.6589, + "step": 2482 + }, + { + "epoch": 0.6036955993192317, + "grad_norm": 16.75, + "learning_rate": 2.0059900960842493e-06, + "loss": 0.6608, + "step": 2483 + }, + { + "epoch": 0.6039387308533917, + "grad_norm": 20.625, + "learning_rate": 2.005605177381122e-06, + "loss": 0.7116, + "step": 2484 + }, + { + "epoch": 0.6041818623875517, + "grad_norm": 32.5, + "learning_rate": 2.0052201457397507e-06, + "loss": 1.7751, + "step": 2485 + }, + { + "epoch": 0.6044249939217117, + "grad_norm": 25.125, + "learning_rate": 2.004835001217686e-06, + "loss": 1.0185, + "step": 2486 + }, + { + "epoch": 0.6046681254558717, + "grad_norm": 15.75, + "learning_rate": 2.004449743872494e-06, + "loss": 0.5704, + "step": 2487 + }, + { + "epoch": 0.6049112569900316, + "grad_norm": 19.375, + "learning_rate": 2.0040643737617577e-06, + "loss": 0.7855, + "step": 2488 + }, + { + "epoch": 0.6051543885241916, + "grad_norm": 14.5625, + "learning_rate": 2.0036788909430774e-06, + "loss": 0.4763, + "step": 2489 + }, + { + "epoch": 0.6053975200583516, + "grad_norm": 20.75, + "learning_rate": 2.0032932954740707e-06, + "loss": 0.9844, + "step": 2490 + }, + { + "epoch": 0.6056406515925116, + "grad_norm": 17.375, + "learning_rate": 2.002907587412371e-06, + "loss": 0.7559, + "step": 2491 + }, + { + "epoch": 0.6058837831266716, + "grad_norm": 20.25, + "learning_rate": 2.0025217668156295e-06, + "loss": 0.9571, + "step": 2492 + }, + { + "epoch": 0.6061269146608315, + "grad_norm": 16.5, + "learning_rate": 2.002135833741513e-06, + "loss": 0.667, + "step": 2493 + }, + { + "epoch": 0.6063700461949915, + "grad_norm": 23.125, + "learning_rate": 2.0017497882477068e-06, + "loss": 1.0873, + "step": 2494 + }, + { + "epoch": 0.6066131777291515, + "grad_norm": 21.375, + "learning_rate": 2.001363630391911e-06, + "loss": 0.8989, + "step": 2495 + }, + { + "epoch": 0.6068563092633115, + "grad_norm": 28.5, + "learning_rate": 2.0009773602318444e-06, + "loss": 0.8248, + "step": 2496 + }, + { + "epoch": 0.6070994407974715, + "grad_norm": 24.125, + "learning_rate": 2.0005909778252415e-06, + "loss": 0.9701, + "step": 2497 + }, + { + "epoch": 0.6073425723316314, + "grad_norm": 15.5, + "learning_rate": 2.000204483229854e-06, + "loss": 0.7341, + "step": 2498 + }, + { + "epoch": 0.6075857038657914, + "grad_norm": 16.125, + "learning_rate": 1.9998178765034496e-06, + "loss": 0.5449, + "step": 2499 + }, + { + "epoch": 0.6078288353999514, + "grad_norm": 19.625, + "learning_rate": 1.9994311577038146e-06, + "loss": 0.8267, + "step": 2500 + }, + { + "epoch": 0.6080719669341114, + "grad_norm": 22.25, + "learning_rate": 1.999044326888749e-06, + "loss": 0.9458, + "step": 2501 + }, + { + "epoch": 0.6083150984682714, + "grad_norm": 14.8125, + "learning_rate": 1.9986573841160728e-06, + "loss": 0.5654, + "step": 2502 + }, + { + "epoch": 0.6085582300024314, + "grad_norm": 15.9375, + "learning_rate": 1.9982703294436206e-06, + "loss": 0.6877, + "step": 2503 + }, + { + "epoch": 0.6088013615365913, + "grad_norm": 18.375, + "learning_rate": 1.9978831629292444e-06, + "loss": 0.7262, + "step": 2504 + }, + { + "epoch": 0.6090444930707513, + "grad_norm": 22.375, + "learning_rate": 1.9974958846308136e-06, + "loss": 0.7039, + "step": 2505 + }, + { + "epoch": 0.6092876246049113, + "grad_norm": 20.125, + "learning_rate": 1.9971084946062126e-06, + "loss": 0.7864, + "step": 2506 + }, + { + "epoch": 0.6095307561390713, + "grad_norm": 20.375, + "learning_rate": 1.996720992913345e-06, + "loss": 0.7615, + "step": 2507 + }, + { + "epoch": 0.6097738876732313, + "grad_norm": 19.625, + "learning_rate": 1.9963333796101275e-06, + "loss": 0.6926, + "step": 2508 + }, + { + "epoch": 0.6100170192073912, + "grad_norm": 18.375, + "learning_rate": 1.995945654754497e-06, + "loss": 0.777, + "step": 2509 + }, + { + "epoch": 0.6102601507415512, + "grad_norm": 27.875, + "learning_rate": 1.9955578184044062e-06, + "loss": 1.2121, + "step": 2510 + }, + { + "epoch": 0.6105032822757112, + "grad_norm": 18.375, + "learning_rate": 1.995169870617823e-06, + "loss": 0.7759, + "step": 2511 + }, + { + "epoch": 0.6107464138098712, + "grad_norm": 19.75, + "learning_rate": 1.994781811452733e-06, + "loss": 0.672, + "step": 2512 + }, + { + "epoch": 0.6109895453440312, + "grad_norm": 23.25, + "learning_rate": 1.994393640967138e-06, + "loss": 1.1134, + "step": 2513 + }, + { + "epoch": 0.611232676878191, + "grad_norm": 26.5, + "learning_rate": 1.994005359219058e-06, + "loss": 0.9399, + "step": 2514 + }, + { + "epoch": 0.611475808412351, + "grad_norm": 23.125, + "learning_rate": 1.993616966266527e-06, + "loss": 1.2102, + "step": 2515 + }, + { + "epoch": 0.611718939946511, + "grad_norm": 16.75, + "learning_rate": 1.993228462167598e-06, + "loss": 0.6831, + "step": 2516 + }, + { + "epoch": 0.611962071480671, + "grad_norm": 16.875, + "learning_rate": 1.992839846980339e-06, + "loss": 0.5729, + "step": 2517 + }, + { + "epoch": 0.6122052030148311, + "grad_norm": 16.625, + "learning_rate": 1.992451120762836e-06, + "loss": 0.8248, + "step": 2518 + }, + { + "epoch": 0.6124483345489911, + "grad_norm": 22.0, + "learning_rate": 1.99206228357319e-06, + "loss": 1.1109, + "step": 2519 + }, + { + "epoch": 0.612691466083151, + "grad_norm": 18.25, + "learning_rate": 1.9916733354695204e-06, + "loss": 0.9915, + "step": 2520 + }, + { + "epoch": 0.612934597617311, + "grad_norm": 17.125, + "learning_rate": 1.9912842765099617e-06, + "loss": 0.7568, + "step": 2521 + }, + { + "epoch": 0.613177729151471, + "grad_norm": 25.875, + "learning_rate": 1.990895106752665e-06, + "loss": 1.5433, + "step": 2522 + }, + { + "epoch": 0.613420860685631, + "grad_norm": 25.125, + "learning_rate": 1.9905058262557993e-06, + "loss": 0.8675, + "step": 2523 + }, + { + "epoch": 0.613663992219791, + "grad_norm": 18.75, + "learning_rate": 1.9901164350775482e-06, + "loss": 0.8637, + "step": 2524 + }, + { + "epoch": 0.6139071237539508, + "grad_norm": 22.625, + "learning_rate": 1.9897269332761145e-06, + "loss": 0.7131, + "step": 2525 + }, + { + "epoch": 0.6141502552881108, + "grad_norm": 17.375, + "learning_rate": 1.9893373209097142e-06, + "loss": 0.7901, + "step": 2526 + }, + { + "epoch": 0.6143933868222708, + "grad_norm": 17.5, + "learning_rate": 1.988947598036583e-06, + "loss": 0.7823, + "step": 2527 + }, + { + "epoch": 0.6146365183564308, + "grad_norm": 17.875, + "learning_rate": 1.988557764714971e-06, + "loss": 0.8806, + "step": 2528 + }, + { + "epoch": 0.6148796498905909, + "grad_norm": 20.5, + "learning_rate": 1.9881678210031462e-06, + "loss": 0.7563, + "step": 2529 + }, + { + "epoch": 0.6151227814247507, + "grad_norm": 21.0, + "learning_rate": 1.9877777669593917e-06, + "loss": 0.9966, + "step": 2530 + }, + { + "epoch": 0.6153659129589107, + "grad_norm": 19.625, + "learning_rate": 1.987387602642008e-06, + "loss": 0.6233, + "step": 2531 + }, + { + "epoch": 0.6156090444930707, + "grad_norm": 18.25, + "learning_rate": 1.986997328109312e-06, + "loss": 1.0035, + "step": 2532 + }, + { + "epoch": 0.6158521760272307, + "grad_norm": 14.0, + "learning_rate": 1.9866069434196367e-06, + "loss": 0.8368, + "step": 2533 + }, + { + "epoch": 0.6160953075613907, + "grad_norm": 17.625, + "learning_rate": 1.9862164486313323e-06, + "loss": 0.6484, + "step": 2534 + }, + { + "epoch": 0.6163384390955506, + "grad_norm": 17.0, + "learning_rate": 1.985825843802765e-06, + "loss": 0.6463, + "step": 2535 + }, + { + "epoch": 0.6165815706297106, + "grad_norm": 19.875, + "learning_rate": 1.985435128992317e-06, + "loss": 0.7574, + "step": 2536 + }, + { + "epoch": 0.6168247021638706, + "grad_norm": 18.5, + "learning_rate": 1.9850443042583872e-06, + "loss": 0.7423, + "step": 2537 + }, + { + "epoch": 0.6170678336980306, + "grad_norm": 15.125, + "learning_rate": 1.984653369659392e-06, + "loss": 0.6347, + "step": 2538 + }, + { + "epoch": 0.6173109652321906, + "grad_norm": 27.125, + "learning_rate": 1.9842623252537624e-06, + "loss": 1.0904, + "step": 2539 + }, + { + "epoch": 0.6175540967663506, + "grad_norm": 21.375, + "learning_rate": 1.983871171099947e-06, + "loss": 0.8306, + "step": 2540 + }, + { + "epoch": 0.6177972283005105, + "grad_norm": 18.75, + "learning_rate": 1.983479907256411e-06, + "loss": 1.1409, + "step": 2541 + }, + { + "epoch": 0.6180403598346705, + "grad_norm": 14.3125, + "learning_rate": 1.983088533781635e-06, + "loss": 0.5206, + "step": 2542 + }, + { + "epoch": 0.6182834913688305, + "grad_norm": 21.25, + "learning_rate": 1.9826970507341173e-06, + "loss": 0.9233, + "step": 2543 + }, + { + "epoch": 0.6185266229029905, + "grad_norm": 27.125, + "learning_rate": 1.982305458172371e-06, + "loss": 0.7664, + "step": 2544 + }, + { + "epoch": 0.6187697544371505, + "grad_norm": 21.25, + "learning_rate": 1.9819137561549265e-06, + "loss": 1.3842, + "step": 2545 + }, + { + "epoch": 0.6190128859713104, + "grad_norm": 18.375, + "learning_rate": 1.9815219447403305e-06, + "loss": 0.8527, + "step": 2546 + }, + { + "epoch": 0.6192560175054704, + "grad_norm": 19.0, + "learning_rate": 1.9811300239871463e-06, + "loss": 1.0483, + "step": 2547 + }, + { + "epoch": 0.6194991490396304, + "grad_norm": 16.625, + "learning_rate": 1.9807379939539527e-06, + "loss": 0.7207, + "step": 2548 + }, + { + "epoch": 0.6197422805737904, + "grad_norm": 16.0, + "learning_rate": 1.9803458546993456e-06, + "loss": 0.5849, + "step": 2549 + }, + { + "epoch": 0.6199854121079504, + "grad_norm": 19.625, + "learning_rate": 1.9799536062819376e-06, + "loss": 0.6721, + "step": 2550 + }, + { + "epoch": 0.6202285436421103, + "grad_norm": 21.25, + "learning_rate": 1.9795612487603553e-06, + "loss": 0.7343, + "step": 2551 + }, + { + "epoch": 0.6204716751762703, + "grad_norm": 18.875, + "learning_rate": 1.9791687821932456e-06, + "loss": 0.6469, + "step": 2552 + }, + { + "epoch": 0.6207148067104303, + "grad_norm": 13.375, + "learning_rate": 1.9787762066392675e-06, + "loss": 0.4419, + "step": 2553 + }, + { + "epoch": 0.6209579382445903, + "grad_norm": 17.25, + "learning_rate": 1.978383522157099e-06, + "loss": 0.6656, + "step": 2554 + }, + { + "epoch": 0.6212010697787503, + "grad_norm": 17.875, + "learning_rate": 1.9779907288054332e-06, + "loss": 0.5274, + "step": 2555 + }, + { + "epoch": 0.6214442013129103, + "grad_norm": 24.625, + "learning_rate": 1.977597826642981e-06, + "loss": 1.051, + "step": 2556 + }, + { + "epoch": 0.6216873328470702, + "grad_norm": 20.0, + "learning_rate": 1.9772048157284666e-06, + "loss": 0.9632, + "step": 2557 + }, + { + "epoch": 0.6219304643812302, + "grad_norm": 19.0, + "learning_rate": 1.976811696120634e-06, + "loss": 1.0646, + "step": 2558 + }, + { + "epoch": 0.6221735959153902, + "grad_norm": 17.0, + "learning_rate": 1.9764184678782406e-06, + "loss": 0.711, + "step": 2559 + }, + { + "epoch": 0.6224167274495502, + "grad_norm": 21.75, + "learning_rate": 1.9760251310600614e-06, + "loss": 1.0041, + "step": 2560 + }, + { + "epoch": 0.6226598589837102, + "grad_norm": 17.75, + "learning_rate": 1.9756316857248877e-06, + "loss": 0.7743, + "step": 2561 + }, + { + "epoch": 0.6229029905178701, + "grad_norm": 18.625, + "learning_rate": 1.9752381319315267e-06, + "loss": 0.9294, + "step": 2562 + }, + { + "epoch": 0.6231461220520301, + "grad_norm": 16.125, + "learning_rate": 1.9748444697388008e-06, + "loss": 0.631, + "step": 2563 + }, + { + "epoch": 0.6233892535861901, + "grad_norm": 17.875, + "learning_rate": 1.974450699205551e-06, + "loss": 0.8134, + "step": 2564 + }, + { + "epoch": 0.6236323851203501, + "grad_norm": 16.375, + "learning_rate": 1.9740568203906325e-06, + "loss": 0.4797, + "step": 2565 + }, + { + "epoch": 0.6238755166545101, + "grad_norm": 26.0, + "learning_rate": 1.973662833352917e-06, + "loss": 1.2025, + "step": 2566 + }, + { + "epoch": 0.62411864818867, + "grad_norm": 17.25, + "learning_rate": 1.9732687381512933e-06, + "loss": 0.8941, + "step": 2567 + }, + { + "epoch": 0.62436177972283, + "grad_norm": 18.5, + "learning_rate": 1.9728745348446654e-06, + "loss": 1.0143, + "step": 2568 + }, + { + "epoch": 0.62460491125699, + "grad_norm": 20.0, + "learning_rate": 1.9724802234919535e-06, + "loss": 0.9828, + "step": 2569 + }, + { + "epoch": 0.62484804279115, + "grad_norm": 17.125, + "learning_rate": 1.9720858041520944e-06, + "loss": 0.7606, + "step": 2570 + }, + { + "epoch": 0.62509117432531, + "grad_norm": 18.375, + "learning_rate": 1.9716912768840417e-06, + "loss": 0.7916, + "step": 2571 + }, + { + "epoch": 0.6253343058594699, + "grad_norm": 19.25, + "learning_rate": 1.9712966417467634e-06, + "loss": 0.9992, + "step": 2572 + }, + { + "epoch": 0.6255774373936299, + "grad_norm": 13.75, + "learning_rate": 1.970901898799244e-06, + "loss": 0.3656, + "step": 2573 + }, + { + "epoch": 0.6258205689277899, + "grad_norm": 19.0, + "learning_rate": 1.9705070481004862e-06, + "loss": 0.8741, + "step": 2574 + }, + { + "epoch": 0.6260637004619499, + "grad_norm": 20.0, + "learning_rate": 1.9701120897095063e-06, + "loss": 0.6176, + "step": 2575 + }, + { + "epoch": 0.6263068319961099, + "grad_norm": 16.25, + "learning_rate": 1.969717023685338e-06, + "loss": 0.7102, + "step": 2576 + }, + { + "epoch": 0.6265499635302699, + "grad_norm": 25.625, + "learning_rate": 1.9693218500870303e-06, + "loss": 1.0332, + "step": 2577 + }, + { + "epoch": 0.6267930950644298, + "grad_norm": 17.375, + "learning_rate": 1.968926568973649e-06, + "loss": 0.8218, + "step": 2578 + }, + { + "epoch": 0.6270362265985898, + "grad_norm": 18.25, + "learning_rate": 1.9685311804042756e-06, + "loss": 0.8964, + "step": 2579 + }, + { + "epoch": 0.6272793581327498, + "grad_norm": 17.75, + "learning_rate": 1.968135684438008e-06, + "loss": 1.0285, + "step": 2580 + }, + { + "epoch": 0.6275224896669098, + "grad_norm": 25.375, + "learning_rate": 1.96774008113396e-06, + "loss": 1.0416, + "step": 2581 + }, + { + "epoch": 0.6277656212010698, + "grad_norm": 19.75, + "learning_rate": 1.9673443705512605e-06, + "loss": 0.8864, + "step": 2582 + }, + { + "epoch": 0.6280087527352297, + "grad_norm": 22.125, + "learning_rate": 1.9669485527490563e-06, + "loss": 1.0568, + "step": 2583 + }, + { + "epoch": 0.6282518842693897, + "grad_norm": 17.375, + "learning_rate": 1.9665526277865084e-06, + "loss": 0.6917, + "step": 2584 + }, + { + "epoch": 0.6284950158035497, + "grad_norm": 21.5, + "learning_rate": 1.9661565957227954e-06, + "loss": 1.0235, + "step": 2585 + }, + { + "epoch": 0.6287381473377097, + "grad_norm": 20.375, + "learning_rate": 1.96576045661711e-06, + "loss": 0.9047, + "step": 2586 + }, + { + "epoch": 0.6289812788718697, + "grad_norm": 21.375, + "learning_rate": 1.9653642105286636e-06, + "loss": 1.0859, + "step": 2587 + }, + { + "epoch": 0.6292244104060296, + "grad_norm": 17.375, + "learning_rate": 1.9649678575166808e-06, + "loss": 0.7796, + "step": 2588 + }, + { + "epoch": 0.6294675419401896, + "grad_norm": 17.875, + "learning_rate": 1.9645713976404036e-06, + "loss": 0.9683, + "step": 2589 + }, + { + "epoch": 0.6297106734743496, + "grad_norm": 17.0, + "learning_rate": 1.96417483095909e-06, + "loss": 0.3621, + "step": 2590 + }, + { + "epoch": 0.6299538050085096, + "grad_norm": 15.5625, + "learning_rate": 1.9637781575320138e-06, + "loss": 0.7793, + "step": 2591 + }, + { + "epoch": 0.6301969365426696, + "grad_norm": 20.25, + "learning_rate": 1.9633813774184646e-06, + "loss": 0.94, + "step": 2592 + }, + { + "epoch": 0.6304400680768296, + "grad_norm": 19.875, + "learning_rate": 1.9629844906777483e-06, + "loss": 0.7717, + "step": 2593 + }, + { + "epoch": 0.6306831996109895, + "grad_norm": 17.125, + "learning_rate": 1.9625874973691856e-06, + "loss": 0.6891, + "step": 2594 + }, + { + "epoch": 0.6309263311451495, + "grad_norm": 19.375, + "learning_rate": 1.962190397552115e-06, + "loss": 0.9662, + "step": 2595 + }, + { + "epoch": 0.6311694626793095, + "grad_norm": 21.75, + "learning_rate": 1.9617931912858897e-06, + "loss": 0.7517, + "step": 2596 + }, + { + "epoch": 0.6314125942134695, + "grad_norm": 15.8125, + "learning_rate": 1.9613958786298783e-06, + "loss": 0.551, + "step": 2597 + }, + { + "epoch": 0.6316557257476295, + "grad_norm": 25.25, + "learning_rate": 1.960998459643467e-06, + "loss": 0.9711, + "step": 2598 + }, + { + "epoch": 0.6318988572817894, + "grad_norm": 18.5, + "learning_rate": 1.9606009343860566e-06, + "loss": 1.0415, + "step": 2599 + }, + { + "epoch": 0.6321419888159494, + "grad_norm": 18.125, + "learning_rate": 1.9602033029170637e-06, + "loss": 1.009, + "step": 2600 + }, + { + "epoch": 0.6323851203501094, + "grad_norm": 19.75, + "learning_rate": 1.959805565295922e-06, + "loss": 0.7581, + "step": 2601 + }, + { + "epoch": 0.6326282518842694, + "grad_norm": 16.625, + "learning_rate": 1.9594077215820795e-06, + "loss": 0.6777, + "step": 2602 + }, + { + "epoch": 0.6328713834184294, + "grad_norm": 20.75, + "learning_rate": 1.959009771835001e-06, + "loss": 0.7276, + "step": 2603 + }, + { + "epoch": 0.6331145149525893, + "grad_norm": 17.875, + "learning_rate": 1.9586117161141672e-06, + "loss": 1.1352, + "step": 2604 + }, + { + "epoch": 0.6333576464867493, + "grad_norm": 45.0, + "learning_rate": 1.958213554479074e-06, + "loss": 0.9656, + "step": 2605 + }, + { + "epoch": 0.6336007780209093, + "grad_norm": 16.75, + "learning_rate": 1.957815286989235e-06, + "loss": 0.7441, + "step": 2606 + }, + { + "epoch": 0.6338439095550693, + "grad_norm": 15.5, + "learning_rate": 1.957416913704176e-06, + "loss": 0.5765, + "step": 2607 + }, + { + "epoch": 0.6340870410892293, + "grad_norm": 24.75, + "learning_rate": 1.9570184346834415e-06, + "loss": 1.3351, + "step": 2608 + }, + { + "epoch": 0.6343301726233892, + "grad_norm": 24.0, + "learning_rate": 1.9566198499865917e-06, + "loss": 0.9138, + "step": 2609 + }, + { + "epoch": 0.6345733041575492, + "grad_norm": 16.375, + "learning_rate": 1.9562211596732012e-06, + "loss": 0.5859, + "step": 2610 + }, + { + "epoch": 0.6348164356917092, + "grad_norm": 17.5, + "learning_rate": 1.955822363802862e-06, + "loss": 0.6955, + "step": 2611 + }, + { + "epoch": 0.6350595672258692, + "grad_norm": 22.375, + "learning_rate": 1.9554234624351807e-06, + "loss": 0.9278, + "step": 2612 + }, + { + "epoch": 0.6353026987600292, + "grad_norm": 18.375, + "learning_rate": 1.9550244556297794e-06, + "loss": 0.5837, + "step": 2613 + }, + { + "epoch": 0.6355458302941892, + "grad_norm": 17.875, + "learning_rate": 1.954625343446297e-06, + "loss": 0.9824, + "step": 2614 + }, + { + "epoch": 0.6357889618283491, + "grad_norm": 23.625, + "learning_rate": 1.954226125944388e-06, + "loss": 1.0017, + "step": 2615 + }, + { + "epoch": 0.6360320933625091, + "grad_norm": 21.125, + "learning_rate": 1.953826803183722e-06, + "loss": 1.2419, + "step": 2616 + }, + { + "epoch": 0.6362752248966691, + "grad_norm": 17.5, + "learning_rate": 1.9534273752239844e-06, + "loss": 0.44, + "step": 2617 + }, + { + "epoch": 0.6365183564308291, + "grad_norm": 18.75, + "learning_rate": 1.953027842124878e-06, + "loss": 0.7213, + "step": 2618 + }, + { + "epoch": 0.6367614879649891, + "grad_norm": 17.25, + "learning_rate": 1.9526282039461177e-06, + "loss": 0.7985, + "step": 2619 + }, + { + "epoch": 0.637004619499149, + "grad_norm": 20.0, + "learning_rate": 1.952228460747438e-06, + "loss": 0.8595, + "step": 2620 + }, + { + "epoch": 0.637247751033309, + "grad_norm": 16.625, + "learning_rate": 1.9518286125885872e-06, + "loss": 0.7828, + "step": 2621 + }, + { + "epoch": 0.637490882567469, + "grad_norm": 20.125, + "learning_rate": 1.9514286595293286e-06, + "loss": 0.9267, + "step": 2622 + }, + { + "epoch": 0.637734014101629, + "grad_norm": 12.125, + "learning_rate": 1.9510286016294432e-06, + "loss": 0.544, + "step": 2623 + }, + { + "epoch": 0.637977145635789, + "grad_norm": 22.0, + "learning_rate": 1.9506284389487256e-06, + "loss": 0.8828, + "step": 2624 + }, + { + "epoch": 0.6382202771699489, + "grad_norm": 18.5, + "learning_rate": 1.9502281715469883e-06, + "loss": 0.9487, + "step": 2625 + }, + { + "epoch": 0.6384634087041089, + "grad_norm": 21.75, + "learning_rate": 1.949827799484057e-06, + "loss": 0.8824, + "step": 2626 + }, + { + "epoch": 0.6387065402382689, + "grad_norm": 18.5, + "learning_rate": 1.9494273228197747e-06, + "loss": 1.0268, + "step": 2627 + }, + { + "epoch": 0.6389496717724289, + "grad_norm": 20.125, + "learning_rate": 1.949026741613999e-06, + "loss": 0.6952, + "step": 2628 + }, + { + "epoch": 0.6391928033065889, + "grad_norm": 14.875, + "learning_rate": 1.948626055926605e-06, + "loss": 0.6886, + "step": 2629 + }, + { + "epoch": 0.6394359348407489, + "grad_norm": 19.875, + "learning_rate": 1.948225265817481e-06, + "loss": 0.7787, + "step": 2630 + }, + { + "epoch": 0.6396790663749088, + "grad_norm": 24.625, + "learning_rate": 1.947824371346532e-06, + "loss": 1.0134, + "step": 2631 + }, + { + "epoch": 0.6399221979090688, + "grad_norm": 16.0, + "learning_rate": 1.9474233725736787e-06, + "loss": 0.8336, + "step": 2632 + }, + { + "epoch": 0.6401653294432288, + "grad_norm": 18.125, + "learning_rate": 1.947022269558858e-06, + "loss": 0.9251, + "step": 2633 + }, + { + "epoch": 0.6404084609773888, + "grad_norm": 21.0, + "learning_rate": 1.9466210623620207e-06, + "loss": 1.2374, + "step": 2634 + }, + { + "epoch": 0.6406515925115488, + "grad_norm": 16.75, + "learning_rate": 1.9462197510431346e-06, + "loss": 0.5718, + "step": 2635 + }, + { + "epoch": 0.6408947240457087, + "grad_norm": 18.5, + "learning_rate": 1.9458183356621826e-06, + "loss": 0.7142, + "step": 2636 + }, + { + "epoch": 0.6411378555798687, + "grad_norm": 21.625, + "learning_rate": 1.9454168162791635e-06, + "loss": 0.8723, + "step": 2637 + }, + { + "epoch": 0.6413809871140287, + "grad_norm": 17.0, + "learning_rate": 1.9450151929540908e-06, + "loss": 0.6637, + "step": 2638 + }, + { + "epoch": 0.6416241186481887, + "grad_norm": 16.375, + "learning_rate": 1.944613465746994e-06, + "loss": 0.9261, + "step": 2639 + }, + { + "epoch": 0.6418672501823487, + "grad_norm": 26.25, + "learning_rate": 1.944211634717918e-06, + "loss": 0.985, + "step": 2640 + }, + { + "epoch": 0.6421103817165086, + "grad_norm": 17.625, + "learning_rate": 1.9438096999269243e-06, + "loss": 0.9207, + "step": 2641 + }, + { + "epoch": 0.6423535132506686, + "grad_norm": 22.875, + "learning_rate": 1.9434076614340883e-06, + "loss": 0.7383, + "step": 2642 + }, + { + "epoch": 0.6425966447848286, + "grad_norm": 15.875, + "learning_rate": 1.9430055192995016e-06, + "loss": 0.8852, + "step": 2643 + }, + { + "epoch": 0.6428397763189886, + "grad_norm": 16.5, + "learning_rate": 1.9426032735832717e-06, + "loss": 0.6596, + "step": 2644 + }, + { + "epoch": 0.6430829078531486, + "grad_norm": 23.25, + "learning_rate": 1.94220092434552e-06, + "loss": 1.1477, + "step": 2645 + }, + { + "epoch": 0.6433260393873085, + "grad_norm": 17.625, + "learning_rate": 1.9417984716463868e-06, + "loss": 0.555, + "step": 2646 + }, + { + "epoch": 0.6435691709214685, + "grad_norm": 19.0, + "learning_rate": 1.941395915546024e-06, + "loss": 0.7148, + "step": 2647 + }, + { + "epoch": 0.6438123024556285, + "grad_norm": 26.125, + "learning_rate": 1.9409932561045995e-06, + "loss": 1.0506, + "step": 2648 + }, + { + "epoch": 0.6440554339897885, + "grad_norm": 16.125, + "learning_rate": 1.9405904933823e-06, + "loss": 0.7608, + "step": 2649 + }, + { + "epoch": 0.6442985655239485, + "grad_norm": 21.25, + "learning_rate": 1.940187627439325e-06, + "loss": 1.1644, + "step": 2650 + }, + { + "epoch": 0.6445416970581085, + "grad_norm": 21.5, + "learning_rate": 1.939784658335888e-06, + "loss": 0.9409, + "step": 2651 + }, + { + "epoch": 0.6447848285922684, + "grad_norm": 17.375, + "learning_rate": 1.939381586132221e-06, + "loss": 0.9638, + "step": 2652 + }, + { + "epoch": 0.6450279601264284, + "grad_norm": 19.375, + "learning_rate": 1.93897841088857e-06, + "loss": 1.0011, + "step": 2653 + }, + { + "epoch": 0.6452710916605884, + "grad_norm": 19.375, + "learning_rate": 1.938575132665197e-06, + "loss": 0.9863, + "step": 2654 + }, + { + "epoch": 0.6455142231947484, + "grad_norm": 17.625, + "learning_rate": 1.9381717515223775e-06, + "loss": 0.7573, + "step": 2655 + }, + { + "epoch": 0.6457573547289084, + "grad_norm": 17.375, + "learning_rate": 1.9377682675204053e-06, + "loss": 0.6723, + "step": 2656 + }, + { + "epoch": 0.6460004862630683, + "grad_norm": 20.25, + "learning_rate": 1.9373646807195867e-06, + "loss": 1.2054, + "step": 2657 + }, + { + "epoch": 0.6462436177972283, + "grad_norm": 22.875, + "learning_rate": 1.9369609911802455e-06, + "loss": 1.0758, + "step": 2658 + }, + { + "epoch": 0.6464867493313883, + "grad_norm": 17.375, + "learning_rate": 1.93655719896272e-06, + "loss": 0.9359, + "step": 2659 + }, + { + "epoch": 0.6467298808655483, + "grad_norm": 17.125, + "learning_rate": 1.9361533041273643e-06, + "loss": 0.8533, + "step": 2660 + }, + { + "epoch": 0.6469730123997083, + "grad_norm": 17.875, + "learning_rate": 1.935749306734547e-06, + "loss": 0.9048, + "step": 2661 + }, + { + "epoch": 0.6472161439338682, + "grad_norm": 16.875, + "learning_rate": 1.935345206844652e-06, + "loss": 0.7516, + "step": 2662 + }, + { + "epoch": 0.6474592754680282, + "grad_norm": 15.125, + "learning_rate": 1.9349410045180796e-06, + "loss": 0.6485, + "step": 2663 + }, + { + "epoch": 0.6477024070021882, + "grad_norm": 15.6875, + "learning_rate": 1.9345366998152448e-06, + "loss": 0.7834, + "step": 2664 + }, + { + "epoch": 0.6479455385363482, + "grad_norm": 16.625, + "learning_rate": 1.9341322927965782e-06, + "loss": 1.2497, + "step": 2665 + }, + { + "epoch": 0.6481886700705082, + "grad_norm": 17.25, + "learning_rate": 1.9337277835225248e-06, + "loss": 0.7493, + "step": 2666 + }, + { + "epoch": 0.6484318016046682, + "grad_norm": 19.125, + "learning_rate": 1.9333231720535456e-06, + "loss": 1.232, + "step": 2667 + }, + { + "epoch": 0.6486749331388281, + "grad_norm": 21.625, + "learning_rate": 1.932918458450117e-06, + "loss": 1.0351, + "step": 2668 + }, + { + "epoch": 0.6489180646729881, + "grad_norm": 22.5, + "learning_rate": 1.9325136427727302e-06, + "loss": 0.9951, + "step": 2669 + }, + { + "epoch": 0.6491611962071481, + "grad_norm": 17.5, + "learning_rate": 1.9321087250818927e-06, + "loss": 0.8068, + "step": 2670 + }, + { + "epoch": 0.6494043277413081, + "grad_norm": 18.75, + "learning_rate": 1.9317037054381255e-06, + "loss": 0.5792, + "step": 2671 + }, + { + "epoch": 0.6496474592754681, + "grad_norm": 13.75, + "learning_rate": 1.931298583901966e-06, + "loss": 0.6272, + "step": 2672 + }, + { + "epoch": 0.649890590809628, + "grad_norm": 18.375, + "learning_rate": 1.9308933605339667e-06, + "loss": 0.647, + "step": 2673 + }, + { + "epoch": 0.650133722343788, + "grad_norm": 16.5, + "learning_rate": 1.9304880353946952e-06, + "loss": 0.6139, + "step": 2674 + }, + { + "epoch": 0.650376853877948, + "grad_norm": 14.5625, + "learning_rate": 1.9300826085447345e-06, + "loss": 0.671, + "step": 2675 + }, + { + "epoch": 0.650619985412108, + "grad_norm": 21.625, + "learning_rate": 1.9296770800446825e-06, + "loss": 1.1881, + "step": 2676 + }, + { + "epoch": 0.650863116946268, + "grad_norm": 29.75, + "learning_rate": 1.9292714499551524e-06, + "loss": 0.8084, + "step": 2677 + }, + { + "epoch": 0.6511062484804279, + "grad_norm": 22.0, + "learning_rate": 1.9288657183367725e-06, + "loss": 0.8615, + "step": 2678 + }, + { + "epoch": 0.6513493800145879, + "grad_norm": 21.625, + "learning_rate": 1.9284598852501867e-06, + "loss": 1.3256, + "step": 2679 + }, + { + "epoch": 0.6515925115487479, + "grad_norm": 16.375, + "learning_rate": 1.928053950756054e-06, + "loss": 0.7895, + "step": 2680 + }, + { + "epoch": 0.6518356430829079, + "grad_norm": 19.375, + "learning_rate": 1.9276479149150475e-06, + "loss": 0.5394, + "step": 2681 + }, + { + "epoch": 0.6520787746170679, + "grad_norm": 24.25, + "learning_rate": 1.9272417777878573e-06, + "loss": 0.9726, + "step": 2682 + }, + { + "epoch": 0.6523219061512278, + "grad_norm": 25.25, + "learning_rate": 1.9268355394351862e-06, + "loss": 1.1387, + "step": 2683 + }, + { + "epoch": 0.6525650376853878, + "grad_norm": 22.875, + "learning_rate": 1.9264291999177547e-06, + "loss": 1.2903, + "step": 2684 + }, + { + "epoch": 0.6528081692195478, + "grad_norm": 18.25, + "learning_rate": 1.9260227592962976e-06, + "loss": 0.8315, + "step": 2685 + }, + { + "epoch": 0.6530513007537078, + "grad_norm": 20.75, + "learning_rate": 1.925616217631563e-06, + "loss": 1.2539, + "step": 2686 + }, + { + "epoch": 0.6532944322878678, + "grad_norm": 16.625, + "learning_rate": 1.9252095749843162e-06, + "loss": 0.6728, + "step": 2687 + }, + { + "epoch": 0.6535375638220278, + "grad_norm": 21.5, + "learning_rate": 1.9248028314153383e-06, + "loss": 0.796, + "step": 2688 + }, + { + "epoch": 0.6537806953561877, + "grad_norm": 15.625, + "learning_rate": 1.9243959869854222e-06, + "loss": 0.6722, + "step": 2689 + }, + { + "epoch": 0.6540238268903477, + "grad_norm": 15.75, + "learning_rate": 1.9239890417553786e-06, + "loss": 0.8586, + "step": 2690 + }, + { + "epoch": 0.6542669584245077, + "grad_norm": 19.25, + "learning_rate": 1.9235819957860323e-06, + "loss": 0.9895, + "step": 2691 + }, + { + "epoch": 0.6545100899586677, + "grad_norm": 20.875, + "learning_rate": 1.923174849138224e-06, + "loss": 0.9195, + "step": 2692 + }, + { + "epoch": 0.6547532214928277, + "grad_norm": 18.0, + "learning_rate": 1.9227676018728087e-06, + "loss": 1.1034, + "step": 2693 + }, + { + "epoch": 0.6549963530269876, + "grad_norm": 20.75, + "learning_rate": 1.922360254050655e-06, + "loss": 0.9611, + "step": 2694 + }, + { + "epoch": 0.6552394845611476, + "grad_norm": 18.25, + "learning_rate": 1.9219528057326507e-06, + "loss": 0.5477, + "step": 2695 + }, + { + "epoch": 0.6554826160953076, + "grad_norm": 17.625, + "learning_rate": 1.921545256979694e-06, + "loss": 0.7053, + "step": 2696 + }, + { + "epoch": 0.6557257476294676, + "grad_norm": 26.75, + "learning_rate": 1.9211376078527003e-06, + "loss": 0.9475, + "step": 2697 + }, + { + "epoch": 0.6559688791636276, + "grad_norm": 23.75, + "learning_rate": 1.9207298584126005e-06, + "loss": 0.6847, + "step": 2698 + }, + { + "epoch": 0.6562120106977875, + "grad_norm": 16.875, + "learning_rate": 1.920322008720339e-06, + "loss": 0.5618, + "step": 2699 + }, + { + "epoch": 0.6564551422319475, + "grad_norm": 16.625, + "learning_rate": 1.919914058836877e-06, + "loss": 0.8941, + "step": 2700 + }, + { + "epoch": 0.6566982737661075, + "grad_norm": 22.25, + "learning_rate": 1.919506008823189e-06, + "loss": 0.6823, + "step": 2701 + }, + { + "epoch": 0.6569414053002675, + "grad_norm": 19.375, + "learning_rate": 1.919097858740265e-06, + "loss": 0.9736, + "step": 2702 + }, + { + "epoch": 0.6571845368344275, + "grad_norm": 16.75, + "learning_rate": 1.91868960864911e-06, + "loss": 0.8121, + "step": 2703 + }, + { + "epoch": 0.6574276683685875, + "grad_norm": 16.5, + "learning_rate": 1.9182812586107454e-06, + "loss": 0.6385, + "step": 2704 + }, + { + "epoch": 0.6576707999027473, + "grad_norm": 18.625, + "learning_rate": 1.917872808686204e-06, + "loss": 1.0717, + "step": 2705 + }, + { + "epoch": 0.6579139314369074, + "grad_norm": 17.0, + "learning_rate": 1.9174642589365372e-06, + "loss": 0.6511, + "step": 2706 + }, + { + "epoch": 0.6581570629710674, + "grad_norm": 25.0, + "learning_rate": 1.9170556094228092e-06, + "loss": 0.8808, + "step": 2707 + }, + { + "epoch": 0.6584001945052274, + "grad_norm": 14.9375, + "learning_rate": 1.9166468602061e-06, + "loss": 0.5074, + "step": 2708 + }, + { + "epoch": 0.6586433260393874, + "grad_norm": 18.375, + "learning_rate": 1.9162380113475045e-06, + "loss": 0.6399, + "step": 2709 + }, + { + "epoch": 0.6588864575735472, + "grad_norm": 21.875, + "learning_rate": 1.9158290629081317e-06, + "loss": 1.036, + "step": 2710 + }, + { + "epoch": 0.6591295891077072, + "grad_norm": 46.75, + "learning_rate": 1.915420014949106e-06, + "loss": 1.1031, + "step": 2711 + }, + { + "epoch": 0.6593727206418672, + "grad_norm": 18.5, + "learning_rate": 1.915010867531567e-06, + "loss": 0.5675, + "step": 2712 + }, + { + "epoch": 0.6596158521760273, + "grad_norm": 26.25, + "learning_rate": 1.9146016207166684e-06, + "loss": 0.987, + "step": 2713 + }, + { + "epoch": 0.6598589837101873, + "grad_norm": 17.5, + "learning_rate": 1.91419227456558e-06, + "loss": 0.6809, + "step": 2714 + }, + { + "epoch": 0.6601021152443471, + "grad_norm": 19.125, + "learning_rate": 1.913782829139485e-06, + "loss": 0.8617, + "step": 2715 + }, + { + "epoch": 0.6603452467785071, + "grad_norm": 15.3125, + "learning_rate": 1.9133732844995824e-06, + "loss": 0.5928, + "step": 2716 + }, + { + "epoch": 0.6605883783126671, + "grad_norm": 20.0, + "learning_rate": 1.912963640707085e-06, + "loss": 1.1929, + "step": 2717 + }, + { + "epoch": 0.6608315098468271, + "grad_norm": 12.5625, + "learning_rate": 1.912553897823222e-06, + "loss": 0.5935, + "step": 2718 + }, + { + "epoch": 0.6610746413809871, + "grad_norm": 23.25, + "learning_rate": 1.912144055909237e-06, + "loss": 1.0319, + "step": 2719 + }, + { + "epoch": 0.661317772915147, + "grad_norm": 25.875, + "learning_rate": 1.9117341150263864e-06, + "loss": 1.0592, + "step": 2720 + }, + { + "epoch": 0.661560904449307, + "grad_norm": 19.875, + "learning_rate": 1.911324075235944e-06, + "loss": 0.9669, + "step": 2721 + }, + { + "epoch": 0.661804035983467, + "grad_norm": 23.375, + "learning_rate": 1.910913936599197e-06, + "loss": 0.8921, + "step": 2722 + }, + { + "epoch": 0.662047167517627, + "grad_norm": 18.875, + "learning_rate": 1.9105036991774476e-06, + "loss": 0.8377, + "step": 2723 + }, + { + "epoch": 0.662290299051787, + "grad_norm": 25.5, + "learning_rate": 1.9100933630320135e-06, + "loss": 0.8749, + "step": 2724 + }, + { + "epoch": 0.662533430585947, + "grad_norm": 19.125, + "learning_rate": 1.9096829282242257e-06, + "loss": 0.7983, + "step": 2725 + }, + { + "epoch": 0.6627765621201069, + "grad_norm": 18.75, + "learning_rate": 1.909272394815432e-06, + "loss": 0.935, + "step": 2726 + }, + { + "epoch": 0.6630196936542669, + "grad_norm": 17.5, + "learning_rate": 1.908861762866992e-06, + "loss": 0.8205, + "step": 2727 + }, + { + "epoch": 0.6632628251884269, + "grad_norm": 15.5625, + "learning_rate": 1.908451032440283e-06, + "loss": 0.5174, + "step": 2728 + }, + { + "epoch": 0.6635059567225869, + "grad_norm": 17.125, + "learning_rate": 1.908040203596695e-06, + "loss": 0.6204, + "step": 2729 + }, + { + "epoch": 0.6637490882567469, + "grad_norm": 22.75, + "learning_rate": 1.9076292763976338e-06, + "loss": 0.6751, + "step": 2730 + }, + { + "epoch": 0.6639922197909068, + "grad_norm": 23.875, + "learning_rate": 1.90721825090452e-06, + "loss": 0.939, + "step": 2731 + }, + { + "epoch": 0.6642353513250668, + "grad_norm": 42.0, + "learning_rate": 1.906807127178788e-06, + "loss": 1.2166, + "step": 2732 + }, + { + "epoch": 0.6644784828592268, + "grad_norm": 17.875, + "learning_rate": 1.906395905281887e-06, + "loss": 0.433, + "step": 2733 + }, + { + "epoch": 0.6647216143933868, + "grad_norm": 30.5, + "learning_rate": 1.905984585275282e-06, + "loss": 1.1259, + "step": 2734 + }, + { + "epoch": 0.6649647459275468, + "grad_norm": 18.875, + "learning_rate": 1.9055731672204513e-06, + "loss": 1.1893, + "step": 2735 + }, + { + "epoch": 0.6652078774617067, + "grad_norm": 15.5625, + "learning_rate": 1.9051616511788886e-06, + "loss": 0.7084, + "step": 2736 + }, + { + "epoch": 0.6654510089958667, + "grad_norm": 16.875, + "learning_rate": 1.9047500372121022e-06, + "loss": 0.8062, + "step": 2737 + }, + { + "epoch": 0.6656941405300267, + "grad_norm": 19.625, + "learning_rate": 1.904338325381615e-06, + "loss": 0.9368, + "step": 2738 + }, + { + "epoch": 0.6659372720641867, + "grad_norm": 21.375, + "learning_rate": 1.903926515748964e-06, + "loss": 0.8167, + "step": 2739 + }, + { + "epoch": 0.6661804035983467, + "grad_norm": 23.75, + "learning_rate": 1.9035146083757012e-06, + "loss": 1.1495, + "step": 2740 + }, + { + "epoch": 0.6664235351325067, + "grad_norm": 20.875, + "learning_rate": 1.903102603323394e-06, + "loss": 0.652, + "step": 2741 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 16.875, + "learning_rate": 1.9026905006536234e-06, + "loss": 0.693, + "step": 2742 + }, + { + "epoch": 0.6669097982008266, + "grad_norm": 17.0, + "learning_rate": 1.9022783004279852e-06, + "loss": 0.8309, + "step": 2743 + }, + { + "epoch": 0.6671529297349866, + "grad_norm": 16.75, + "learning_rate": 1.9018660027080893e-06, + "loss": 0.5793, + "step": 2744 + }, + { + "epoch": 0.6673960612691466, + "grad_norm": 16.875, + "learning_rate": 1.9014536075555612e-06, + "loss": 0.7686, + "step": 2745 + }, + { + "epoch": 0.6676391928033066, + "grad_norm": 14.75, + "learning_rate": 1.9010411150320408e-06, + "loss": 0.7207, + "step": 2746 + }, + { + "epoch": 0.6678823243374665, + "grad_norm": 17.5, + "learning_rate": 1.9006285251991818e-06, + "loss": 0.6781, + "step": 2747 + }, + { + "epoch": 0.6681254558716265, + "grad_norm": 17.5, + "learning_rate": 1.9002158381186527e-06, + "loss": 0.5717, + "step": 2748 + }, + { + "epoch": 0.6683685874057865, + "grad_norm": 18.875, + "learning_rate": 1.8998030538521373e-06, + "loss": 0.7242, + "step": 2749 + }, + { + "epoch": 0.6686117189399465, + "grad_norm": 22.125, + "learning_rate": 1.8993901724613328e-06, + "loss": 0.6692, + "step": 2750 + }, + { + "epoch": 0.6688548504741065, + "grad_norm": 20.625, + "learning_rate": 1.8989771940079517e-06, + "loss": 0.7458, + "step": 2751 + }, + { + "epoch": 0.6690979820082664, + "grad_norm": 18.75, + "learning_rate": 1.8985641185537207e-06, + "loss": 1.3859, + "step": 2752 + }, + { + "epoch": 0.6693411135424264, + "grad_norm": 16.75, + "learning_rate": 1.8981509461603815e-06, + "loss": 0.6103, + "step": 2753 + }, + { + "epoch": 0.6695842450765864, + "grad_norm": 17.5, + "learning_rate": 1.8977376768896888e-06, + "loss": 0.8004, + "step": 2754 + }, + { + "epoch": 0.6698273766107464, + "grad_norm": 17.375, + "learning_rate": 1.897324310803414e-06, + "loss": 0.7593, + "step": 2755 + }, + { + "epoch": 0.6700705081449064, + "grad_norm": 20.5, + "learning_rate": 1.8969108479633408e-06, + "loss": 0.9617, + "step": 2756 + }, + { + "epoch": 0.6703136396790663, + "grad_norm": 17.75, + "learning_rate": 1.8964972884312694e-06, + "loss": 0.6147, + "step": 2757 + }, + { + "epoch": 0.6705567712132263, + "grad_norm": 19.375, + "learning_rate": 1.8960836322690124e-06, + "loss": 1.0357, + "step": 2758 + }, + { + "epoch": 0.6707999027473863, + "grad_norm": 24.875, + "learning_rate": 1.8956698795383985e-06, + "loss": 1.1743, + "step": 2759 + }, + { + "epoch": 0.6710430342815463, + "grad_norm": 16.875, + "learning_rate": 1.8952560303012702e-06, + "loss": 0.5745, + "step": 2760 + }, + { + "epoch": 0.6712861658157063, + "grad_norm": 26.375, + "learning_rate": 1.8948420846194837e-06, + "loss": 1.2181, + "step": 2761 + }, + { + "epoch": 0.6715292973498663, + "grad_norm": 17.875, + "learning_rate": 1.894428042554911e-06, + "loss": 1.0507, + "step": 2762 + }, + { + "epoch": 0.6717724288840262, + "grad_norm": 17.0, + "learning_rate": 1.8940139041694377e-06, + "loss": 0.9185, + "step": 2763 + }, + { + "epoch": 0.6720155604181862, + "grad_norm": 25.25, + "learning_rate": 1.8935996695249643e-06, + "loss": 0.8981, + "step": 2764 + }, + { + "epoch": 0.6722586919523462, + "grad_norm": 21.0, + "learning_rate": 1.8931853386834047e-06, + "loss": 0.8092, + "step": 2765 + }, + { + "epoch": 0.6725018234865062, + "grad_norm": 21.375, + "learning_rate": 1.8927709117066878e-06, + "loss": 0.9859, + "step": 2766 + }, + { + "epoch": 0.6727449550206662, + "grad_norm": 19.25, + "learning_rate": 1.8923563886567574e-06, + "loss": 1.413, + "step": 2767 + }, + { + "epoch": 0.6729880865548261, + "grad_norm": 15.625, + "learning_rate": 1.8919417695955705e-06, + "loss": 0.6552, + "step": 2768 + }, + { + "epoch": 0.6732312180889861, + "grad_norm": 15.8125, + "learning_rate": 1.8915270545850998e-06, + "loss": 0.9794, + "step": 2769 + }, + { + "epoch": 0.6734743496231461, + "grad_norm": 17.375, + "learning_rate": 1.8911122436873313e-06, + "loss": 0.726, + "step": 2770 + }, + { + "epoch": 0.6737174811573061, + "grad_norm": 13.5, + "learning_rate": 1.890697336964265e-06, + "loss": 0.4577, + "step": 2771 + }, + { + "epoch": 0.6739606126914661, + "grad_norm": 13.9375, + "learning_rate": 1.890282334477917e-06, + "loss": 0.637, + "step": 2772 + }, + { + "epoch": 0.674203744225626, + "grad_norm": 17.5, + "learning_rate": 1.889867236290316e-06, + "loss": 0.8888, + "step": 2773 + }, + { + "epoch": 0.674446875759786, + "grad_norm": 22.875, + "learning_rate": 1.8894520424635055e-06, + "loss": 0.9943, + "step": 2774 + }, + { + "epoch": 0.674690007293946, + "grad_norm": 19.75, + "learning_rate": 1.8890367530595435e-06, + "loss": 1.2011, + "step": 2775 + }, + { + "epoch": 0.674933138828106, + "grad_norm": 16.625, + "learning_rate": 1.8886213681405022e-06, + "loss": 0.8127, + "step": 2776 + }, + { + "epoch": 0.675176270362266, + "grad_norm": 17.0, + "learning_rate": 1.8882058877684684e-06, + "loss": 0.8076, + "step": 2777 + }, + { + "epoch": 0.675419401896426, + "grad_norm": 18.5, + "learning_rate": 1.887790312005542e-06, + "loss": 1.0948, + "step": 2778 + }, + { + "epoch": 0.6756625334305859, + "grad_norm": 24.75, + "learning_rate": 1.887374640913839e-06, + "loss": 0.8811, + "step": 2779 + }, + { + "epoch": 0.6759056649647459, + "grad_norm": 16.75, + "learning_rate": 1.8869588745554874e-06, + "loss": 0.9598, + "step": 2780 + }, + { + "epoch": 0.6761487964989059, + "grad_norm": 18.5, + "learning_rate": 1.8865430129926316e-06, + "loss": 0.8612, + "step": 2781 + }, + { + "epoch": 0.6763919280330659, + "grad_norm": 15.6875, + "learning_rate": 1.8861270562874295e-06, + "loss": 0.6872, + "step": 2782 + }, + { + "epoch": 0.6766350595672259, + "grad_norm": 17.875, + "learning_rate": 1.8857110045020518e-06, + "loss": 0.836, + "step": 2783 + }, + { + "epoch": 0.6768781911013858, + "grad_norm": 22.125, + "learning_rate": 1.885294857698686e-06, + "loss": 0.9759, + "step": 2784 + }, + { + "epoch": 0.6771213226355458, + "grad_norm": 20.375, + "learning_rate": 1.8848786159395317e-06, + "loss": 1.1223, + "step": 2785 + }, + { + "epoch": 0.6773644541697058, + "grad_norm": 20.375, + "learning_rate": 1.884462279286803e-06, + "loss": 0.7113, + "step": 2786 + }, + { + "epoch": 0.6776075857038658, + "grad_norm": 18.625, + "learning_rate": 1.8840458478027296e-06, + "loss": 0.818, + "step": 2787 + }, + { + "epoch": 0.6778507172380258, + "grad_norm": 19.375, + "learning_rate": 1.8836293215495535e-06, + "loss": 1.1104, + "step": 2788 + }, + { + "epoch": 0.6780938487721857, + "grad_norm": 16.625, + "learning_rate": 1.8832127005895325e-06, + "loss": 0.6935, + "step": 2789 + }, + { + "epoch": 0.6783369803063457, + "grad_norm": 20.0, + "learning_rate": 1.882795984984937e-06, + "loss": 0.8294, + "step": 2790 + }, + { + "epoch": 0.6785801118405057, + "grad_norm": 16.875, + "learning_rate": 1.8823791747980535e-06, + "loss": 0.7348, + "step": 2791 + }, + { + "epoch": 0.6788232433746657, + "grad_norm": 17.875, + "learning_rate": 1.8819622700911804e-06, + "loss": 0.95, + "step": 2792 + }, + { + "epoch": 0.6790663749088257, + "grad_norm": 20.625, + "learning_rate": 1.8815452709266314e-06, + "loss": 0.8172, + "step": 2793 + }, + { + "epoch": 0.6793095064429856, + "grad_norm": 18.75, + "learning_rate": 1.8811281773667347e-06, + "loss": 0.8765, + "step": 2794 + }, + { + "epoch": 0.6795526379771456, + "grad_norm": 20.125, + "learning_rate": 1.8807109894738317e-06, + "loss": 0.7575, + "step": 2795 + }, + { + "epoch": 0.6797957695113056, + "grad_norm": 24.125, + "learning_rate": 1.8802937073102796e-06, + "loss": 0.8822, + "step": 2796 + }, + { + "epoch": 0.6800389010454656, + "grad_norm": 14.875, + "learning_rate": 1.8798763309384463e-06, + "loss": 0.6037, + "step": 2797 + }, + { + "epoch": 0.6802820325796256, + "grad_norm": 22.875, + "learning_rate": 1.8794588604207173e-06, + "loss": 0.8779, + "step": 2798 + }, + { + "epoch": 0.6805251641137856, + "grad_norm": 24.125, + "learning_rate": 1.8790412958194903e-06, + "loss": 0.6963, + "step": 2799 + }, + { + "epoch": 0.6807682956479455, + "grad_norm": 17.375, + "learning_rate": 1.878623637197178e-06, + "loss": 0.6414, + "step": 2800 + }, + { + "epoch": 0.6810114271821055, + "grad_norm": 15.1875, + "learning_rate": 1.8782058846162065e-06, + "loss": 0.7041, + "step": 2801 + }, + { + "epoch": 0.6812545587162655, + "grad_norm": 16.375, + "learning_rate": 1.8777880381390157e-06, + "loss": 0.4329, + "step": 2802 + }, + { + "epoch": 0.6814976902504255, + "grad_norm": 22.75, + "learning_rate": 1.8773700978280607e-06, + "loss": 1.1243, + "step": 2803 + }, + { + "epoch": 0.6817408217845855, + "grad_norm": 16.75, + "learning_rate": 1.8769520637458094e-06, + "loss": 0.8327, + "step": 2804 + }, + { + "epoch": 0.6819839533187454, + "grad_norm": 17.125, + "learning_rate": 1.8765339359547441e-06, + "loss": 0.6192, + "step": 2805 + }, + { + "epoch": 0.6822270848529054, + "grad_norm": 20.875, + "learning_rate": 1.8761157145173613e-06, + "loss": 1.2173, + "step": 2806 + }, + { + "epoch": 0.6824702163870654, + "grad_norm": 18.125, + "learning_rate": 1.875697399496172e-06, + "loss": 0.9912, + "step": 2807 + }, + { + "epoch": 0.6827133479212254, + "grad_norm": 19.375, + "learning_rate": 1.8752789909537005e-06, + "loss": 0.7923, + "step": 2808 + }, + { + "epoch": 0.6829564794553854, + "grad_norm": 31.75, + "learning_rate": 1.8748604889524844e-06, + "loss": 1.0251, + "step": 2809 + }, + { + "epoch": 0.6831996109895453, + "grad_norm": 16.625, + "learning_rate": 1.8744418935550764e-06, + "loss": 0.625, + "step": 2810 + }, + { + "epoch": 0.6834427425237053, + "grad_norm": 17.125, + "learning_rate": 1.874023204824043e-06, + "loss": 1.0231, + "step": 2811 + }, + { + "epoch": 0.6836858740578653, + "grad_norm": 18.0, + "learning_rate": 1.8736044228219647e-06, + "loss": 0.8958, + "step": 2812 + }, + { + "epoch": 0.6839290055920253, + "grad_norm": 22.5, + "learning_rate": 1.8731855476114353e-06, + "loss": 1.2198, + "step": 2813 + }, + { + "epoch": 0.6841721371261853, + "grad_norm": 14.9375, + "learning_rate": 1.8727665792550625e-06, + "loss": 0.4517, + "step": 2814 + }, + { + "epoch": 0.6844152686603453, + "grad_norm": 20.875, + "learning_rate": 1.8723475178154693e-06, + "loss": 0.9555, + "step": 2815 + }, + { + "epoch": 0.6846584001945052, + "grad_norm": 17.875, + "learning_rate": 1.8719283633552913e-06, + "loss": 0.8075, + "step": 2816 + }, + { + "epoch": 0.6849015317286652, + "grad_norm": 16.625, + "learning_rate": 1.8715091159371781e-06, + "loss": 0.7464, + "step": 2817 + }, + { + "epoch": 0.6851446632628252, + "grad_norm": 19.5, + "learning_rate": 1.8710897756237939e-06, + "loss": 0.9057, + "step": 2818 + }, + { + "epoch": 0.6853877947969852, + "grad_norm": 19.0, + "learning_rate": 1.8706703424778159e-06, + "loss": 0.8518, + "step": 2819 + }, + { + "epoch": 0.6856309263311452, + "grad_norm": 18.375, + "learning_rate": 1.8702508165619363e-06, + "loss": 0.9205, + "step": 2820 + }, + { + "epoch": 0.6858740578653051, + "grad_norm": 17.375, + "learning_rate": 1.8698311979388594e-06, + "loss": 0.6844, + "step": 2821 + }, + { + "epoch": 0.6861171893994651, + "grad_norm": 23.5, + "learning_rate": 1.8694114866713056e-06, + "loss": 1.3278, + "step": 2822 + }, + { + "epoch": 0.6863603209336251, + "grad_norm": 30.875, + "learning_rate": 1.8689916828220075e-06, + "loss": 1.3844, + "step": 2823 + }, + { + "epoch": 0.6866034524677851, + "grad_norm": 17.875, + "learning_rate": 1.8685717864537116e-06, + "loss": 0.4919, + "step": 2824 + }, + { + "epoch": 0.6868465840019451, + "grad_norm": 19.75, + "learning_rate": 1.8681517976291796e-06, + "loss": 0.8494, + "step": 2825 + }, + { + "epoch": 0.687089715536105, + "grad_norm": 20.5, + "learning_rate": 1.8677317164111856e-06, + "loss": 1.1265, + "step": 2826 + }, + { + "epoch": 0.687332847070265, + "grad_norm": 18.75, + "learning_rate": 1.867311542862518e-06, + "loss": 0.5187, + "step": 2827 + }, + { + "epoch": 0.687575978604425, + "grad_norm": 17.375, + "learning_rate": 1.8668912770459787e-06, + "loss": 0.8619, + "step": 2828 + }, + { + "epoch": 0.687819110138585, + "grad_norm": 18.625, + "learning_rate": 1.866470919024384e-06, + "loss": 0.9234, + "step": 2829 + }, + { + "epoch": 0.688062241672745, + "grad_norm": 20.0, + "learning_rate": 1.8660504688605638e-06, + "loss": 0.9266, + "step": 2830 + }, + { + "epoch": 0.6883053732069049, + "grad_norm": 18.625, + "learning_rate": 1.8656299266173613e-06, + "loss": 0.9105, + "step": 2831 + }, + { + "epoch": 0.6885485047410649, + "grad_norm": 20.625, + "learning_rate": 1.8652092923576342e-06, + "loss": 0.8332, + "step": 2832 + }, + { + "epoch": 0.6887916362752249, + "grad_norm": 19.875, + "learning_rate": 1.864788566144253e-06, + "loss": 1.1016, + "step": 2833 + }, + { + "epoch": 0.6890347678093849, + "grad_norm": 18.75, + "learning_rate": 1.8643677480401032e-06, + "loss": 0.8181, + "step": 2834 + }, + { + "epoch": 0.6892778993435449, + "grad_norm": 17.875, + "learning_rate": 1.8639468381080828e-06, + "loss": 0.6619, + "step": 2835 + }, + { + "epoch": 0.6895210308777049, + "grad_norm": 15.75, + "learning_rate": 1.8635258364111042e-06, + "loss": 0.7536, + "step": 2836 + }, + { + "epoch": 0.6897641624118648, + "grad_norm": 19.5, + "learning_rate": 1.863104743012093e-06, + "loss": 0.9637, + "step": 2837 + }, + { + "epoch": 0.6900072939460248, + "grad_norm": 20.75, + "learning_rate": 1.86268355797399e-06, + "loss": 1.1947, + "step": 2838 + }, + { + "epoch": 0.6902504254801848, + "grad_norm": 16.125, + "learning_rate": 1.8622622813597474e-06, + "loss": 0.8352, + "step": 2839 + }, + { + "epoch": 0.6904935570143448, + "grad_norm": 22.125, + "learning_rate": 1.8618409132323329e-06, + "loss": 1.2988, + "step": 2840 + }, + { + "epoch": 0.6907366885485048, + "grad_norm": 21.375, + "learning_rate": 1.861419453654727e-06, + "loss": 0.8299, + "step": 2841 + }, + { + "epoch": 0.6909798200826647, + "grad_norm": 20.5, + "learning_rate": 1.8609979026899239e-06, + "loss": 0.5336, + "step": 2842 + }, + { + "epoch": 0.6912229516168247, + "grad_norm": 16.75, + "learning_rate": 1.8605762604009323e-06, + "loss": 0.6185, + "step": 2843 + }, + { + "epoch": 0.6914660831509847, + "grad_norm": 18.5, + "learning_rate": 1.8601545268507734e-06, + "loss": 0.6208, + "step": 2844 + }, + { + "epoch": 0.6917092146851447, + "grad_norm": 23.625, + "learning_rate": 1.8597327021024825e-06, + "loss": 0.8914, + "step": 2845 + }, + { + "epoch": 0.6919523462193047, + "grad_norm": 15.1875, + "learning_rate": 1.8593107862191095e-06, + "loss": 0.6565, + "step": 2846 + }, + { + "epoch": 0.6921954777534646, + "grad_norm": 20.0, + "learning_rate": 1.8588887792637158e-06, + "loss": 0.9686, + "step": 2847 + }, + { + "epoch": 0.6924386092876246, + "grad_norm": 18.75, + "learning_rate": 1.858466681299378e-06, + "loss": 1.0165, + "step": 2848 + }, + { + "epoch": 0.6926817408217846, + "grad_norm": 15.25, + "learning_rate": 1.8580444923891865e-06, + "loss": 0.6777, + "step": 2849 + }, + { + "epoch": 0.6929248723559446, + "grad_norm": 20.75, + "learning_rate": 1.8576222125962442e-06, + "loss": 1.0999, + "step": 2850 + }, + { + "epoch": 0.6931680038901046, + "grad_norm": 28.75, + "learning_rate": 1.8571998419836684e-06, + "loss": 1.1889, + "step": 2851 + }, + { + "epoch": 0.6934111354242646, + "grad_norm": 15.875, + "learning_rate": 1.8567773806145892e-06, + "loss": 0.5628, + "step": 2852 + }, + { + "epoch": 0.6936542669584245, + "grad_norm": 21.375, + "learning_rate": 1.8563548285521515e-06, + "loss": 1.0427, + "step": 2853 + }, + { + "epoch": 0.6938973984925845, + "grad_norm": 22.875, + "learning_rate": 1.8559321858595121e-06, + "loss": 1.1127, + "step": 2854 + }, + { + "epoch": 0.6941405300267445, + "grad_norm": 18.625, + "learning_rate": 1.855509452599843e-06, + "loss": 1.1072, + "step": 2855 + }, + { + "epoch": 0.6943836615609045, + "grad_norm": 14.5, + "learning_rate": 1.8550866288363284e-06, + "loss": 0.471, + "step": 2856 + }, + { + "epoch": 0.6946267930950645, + "grad_norm": 21.875, + "learning_rate": 1.8546637146321672e-06, + "loss": 0.9184, + "step": 2857 + }, + { + "epoch": 0.6948699246292244, + "grad_norm": 122.5, + "learning_rate": 1.854240710050571e-06, + "loss": 0.9554, + "step": 2858 + }, + { + "epoch": 0.6951130561633844, + "grad_norm": 21.125, + "learning_rate": 1.853817615154765e-06, + "loss": 0.8874, + "step": 2859 + }, + { + "epoch": 0.6953561876975444, + "grad_norm": 22.375, + "learning_rate": 1.8533944300079876e-06, + "loss": 0.8626, + "step": 2860 + }, + { + "epoch": 0.6955993192317044, + "grad_norm": 15.25, + "learning_rate": 1.8529711546734925e-06, + "loss": 0.4943, + "step": 2861 + }, + { + "epoch": 0.6958424507658644, + "grad_norm": 18.0, + "learning_rate": 1.852547789214544e-06, + "loss": 1.1291, + "step": 2862 + }, + { + "epoch": 0.6960855823000243, + "grad_norm": 16.625, + "learning_rate": 1.8521243336944227e-06, + "loss": 0.6409, + "step": 2863 + }, + { + "epoch": 0.6963287138341843, + "grad_norm": 20.625, + "learning_rate": 1.85170078817642e-06, + "loss": 0.7132, + "step": 2864 + }, + { + "epoch": 0.6965718453683443, + "grad_norm": 26.75, + "learning_rate": 1.8512771527238433e-06, + "loss": 0.9868, + "step": 2865 + }, + { + "epoch": 0.6968149769025043, + "grad_norm": 12.625, + "learning_rate": 1.8508534274000114e-06, + "loss": 0.4037, + "step": 2866 + }, + { + "epoch": 0.6970581084366643, + "grad_norm": 18.5, + "learning_rate": 1.8504296122682578e-06, + "loss": 0.9511, + "step": 2867 + }, + { + "epoch": 0.6973012399708242, + "grad_norm": 18.75, + "learning_rate": 1.8500057073919286e-06, + "loss": 1.1812, + "step": 2868 + }, + { + "epoch": 0.6975443715049842, + "grad_norm": 20.125, + "learning_rate": 1.8495817128343844e-06, + "loss": 0.7531, + "step": 2869 + }, + { + "epoch": 0.6977875030391442, + "grad_norm": 13.625, + "learning_rate": 1.849157628658998e-06, + "loss": 0.4609, + "step": 2870 + }, + { + "epoch": 0.6980306345733042, + "grad_norm": 16.375, + "learning_rate": 1.8487334549291562e-06, + "loss": 0.7991, + "step": 2871 + }, + { + "epoch": 0.6982737661074642, + "grad_norm": 14.125, + "learning_rate": 1.8483091917082586e-06, + "loss": 0.2674, + "step": 2872 + }, + { + "epoch": 0.6985168976416242, + "grad_norm": 25.875, + "learning_rate": 1.8478848390597195e-06, + "loss": 1.0291, + "step": 2873 + }, + { + "epoch": 0.6987600291757841, + "grad_norm": 19.5, + "learning_rate": 1.8474603970469653e-06, + "loss": 0.8569, + "step": 2874 + }, + { + "epoch": 0.6990031607099441, + "grad_norm": 17.75, + "learning_rate": 1.8470358657334363e-06, + "loss": 1.0268, + "step": 2875 + }, + { + "epoch": 0.6992462922441041, + "grad_norm": 12.9375, + "learning_rate": 1.846611245182586e-06, + "loss": 0.2849, + "step": 2876 + }, + { + "epoch": 0.6994894237782641, + "grad_norm": 19.75, + "learning_rate": 1.8461865354578814e-06, + "loss": 0.7604, + "step": 2877 + }, + { + "epoch": 0.6997325553124241, + "grad_norm": 16.625, + "learning_rate": 1.8457617366228027e-06, + "loss": 0.8744, + "step": 2878 + }, + { + "epoch": 0.699975686846584, + "grad_norm": 24.125, + "learning_rate": 1.8453368487408427e-06, + "loss": 1.1708, + "step": 2879 + }, + { + "epoch": 0.700218818380744, + "grad_norm": 20.625, + "learning_rate": 1.8449118718755094e-06, + "loss": 0.6322, + "step": 2880 + }, + { + "epoch": 0.700461949914904, + "grad_norm": 20.125, + "learning_rate": 1.844486806090322e-06, + "loss": 0.7541, + "step": 2881 + }, + { + "epoch": 0.700705081449064, + "grad_norm": 19.0, + "learning_rate": 1.8440616514488146e-06, + "loss": 0.7688, + "step": 2882 + }, + { + "epoch": 0.700948212983224, + "grad_norm": 28.875, + "learning_rate": 1.8436364080145333e-06, + "loss": 1.0153, + "step": 2883 + }, + { + "epoch": 0.7011913445173839, + "grad_norm": 32.5, + "learning_rate": 1.8432110758510386e-06, + "loss": 1.027, + "step": 2884 + }, + { + "epoch": 0.7014344760515439, + "grad_norm": 24.25, + "learning_rate": 1.8427856550219038e-06, + "loss": 0.7096, + "step": 2885 + }, + { + "epoch": 0.7016776075857039, + "grad_norm": 21.375, + "learning_rate": 1.8423601455907145e-06, + "loss": 0.7179, + "step": 2886 + }, + { + "epoch": 0.7019207391198639, + "grad_norm": 17.625, + "learning_rate": 1.8419345476210712e-06, + "loss": 0.9237, + "step": 2887 + }, + { + "epoch": 0.7021638706540239, + "grad_norm": 17.125, + "learning_rate": 1.8415088611765866e-06, + "loss": 0.7091, + "step": 2888 + }, + { + "epoch": 0.7024070021881839, + "grad_norm": 14.0625, + "learning_rate": 1.8410830863208873e-06, + "loss": 0.4741, + "step": 2889 + }, + { + "epoch": 0.7026501337223438, + "grad_norm": 19.5, + "learning_rate": 1.8406572231176124e-06, + "loss": 0.789, + "step": 2890 + }, + { + "epoch": 0.7028932652565038, + "grad_norm": 16.625, + "learning_rate": 1.8402312716304138e-06, + "loss": 0.7747, + "step": 2891 + }, + { + "epoch": 0.7031363967906638, + "grad_norm": 29.125, + "learning_rate": 1.8398052319229586e-06, + "loss": 0.9976, + "step": 2892 + }, + { + "epoch": 0.7033795283248238, + "grad_norm": 23.375, + "learning_rate": 1.8393791040589255e-06, + "loss": 1.1398, + "step": 2893 + }, + { + "epoch": 0.7036226598589838, + "grad_norm": 19.125, + "learning_rate": 1.8389528881020061e-06, + "loss": 0.8569, + "step": 2894 + }, + { + "epoch": 0.7038657913931436, + "grad_norm": 18.0, + "learning_rate": 1.8385265841159056e-06, + "loss": 0.8613, + "step": 2895 + }, + { + "epoch": 0.7041089229273036, + "grad_norm": 20.375, + "learning_rate": 1.8381001921643431e-06, + "loss": 0.7865, + "step": 2896 + }, + { + "epoch": 0.7043520544614637, + "grad_norm": 16.125, + "learning_rate": 1.8376737123110503e-06, + "loss": 0.6729, + "step": 2897 + }, + { + "epoch": 0.7045951859956237, + "grad_norm": 19.25, + "learning_rate": 1.8372471446197716e-06, + "loss": 0.7436, + "step": 2898 + }, + { + "epoch": 0.7048383175297837, + "grad_norm": 17.75, + "learning_rate": 1.8368204891542648e-06, + "loss": 0.6284, + "step": 2899 + }, + { + "epoch": 0.7050814490639435, + "grad_norm": 23.625, + "learning_rate": 1.8363937459783016e-06, + "loss": 1.0442, + "step": 2900 + }, + { + "epoch": 0.7053245805981035, + "grad_norm": 19.25, + "learning_rate": 1.8359669151556652e-06, + "loss": 0.8138, + "step": 2901 + }, + { + "epoch": 0.7055677121322635, + "grad_norm": 19.0, + "learning_rate": 1.8355399967501538e-06, + "loss": 1.0141, + "step": 2902 + }, + { + "epoch": 0.7058108436664235, + "grad_norm": 20.75, + "learning_rate": 1.8351129908255767e-06, + "loss": 1.1416, + "step": 2903 + }, + { + "epoch": 0.7060539752005836, + "grad_norm": 18.0, + "learning_rate": 1.8346858974457585e-06, + "loss": 0.8712, + "step": 2904 + }, + { + "epoch": 0.7062971067347436, + "grad_norm": 20.0, + "learning_rate": 1.8342587166745346e-06, + "loss": 0.9932, + "step": 2905 + }, + { + "epoch": 0.7065402382689034, + "grad_norm": 22.375, + "learning_rate": 1.8338314485757553e-06, + "loss": 1.1834, + "step": 2906 + }, + { + "epoch": 0.7067833698030634, + "grad_norm": 22.5, + "learning_rate": 1.8334040932132825e-06, + "loss": 1.0299, + "step": 2907 + }, + { + "epoch": 0.7070265013372234, + "grad_norm": 34.0, + "learning_rate": 1.8329766506509925e-06, + "loss": 1.161, + "step": 2908 + }, + { + "epoch": 0.7072696328713834, + "grad_norm": 21.25, + "learning_rate": 1.8325491209527737e-06, + "loss": 1.3779, + "step": 2909 + }, + { + "epoch": 0.7075127644055434, + "grad_norm": 15.875, + "learning_rate": 1.8321215041825276e-06, + "loss": 0.6067, + "step": 2910 + }, + { + "epoch": 0.7077558959397033, + "grad_norm": 26.0, + "learning_rate": 1.8316938004041695e-06, + "loss": 1.2639, + "step": 2911 + }, + { + "epoch": 0.7079990274738633, + "grad_norm": 15.8125, + "learning_rate": 1.8312660096816265e-06, + "loss": 0.8692, + "step": 2912 + }, + { + "epoch": 0.7082421590080233, + "grad_norm": 24.5, + "learning_rate": 1.8308381320788397e-06, + "loss": 0.9224, + "step": 2913 + }, + { + "epoch": 0.7084852905421833, + "grad_norm": 29.5, + "learning_rate": 1.8304101676597624e-06, + "loss": 1.2138, + "step": 2914 + }, + { + "epoch": 0.7087284220763433, + "grad_norm": 28.5, + "learning_rate": 1.8299821164883613e-06, + "loss": 0.9979, + "step": 2915 + }, + { + "epoch": 0.7089715536105032, + "grad_norm": 21.25, + "learning_rate": 1.829553978628617e-06, + "loss": 1.2982, + "step": 2916 + }, + { + "epoch": 0.7092146851446632, + "grad_norm": 17.125, + "learning_rate": 1.8291257541445206e-06, + "loss": 0.807, + "step": 2917 + }, + { + "epoch": 0.7094578166788232, + "grad_norm": 23.75, + "learning_rate": 1.828697443100079e-06, + "loss": 1.2644, + "step": 2918 + }, + { + "epoch": 0.7097009482129832, + "grad_norm": 19.875, + "learning_rate": 1.8282690455593096e-06, + "loss": 0.8658, + "step": 2919 + }, + { + "epoch": 0.7099440797471432, + "grad_norm": 18.125, + "learning_rate": 1.8278405615862444e-06, + "loss": 0.876, + "step": 2920 + }, + { + "epoch": 0.7101872112813031, + "grad_norm": 20.125, + "learning_rate": 1.8274119912449279e-06, + "loss": 1.1041, + "step": 2921 + }, + { + "epoch": 0.7104303428154631, + "grad_norm": 16.875, + "learning_rate": 1.8269833345994168e-06, + "loss": 0.5185, + "step": 2922 + }, + { + "epoch": 0.7106734743496231, + "grad_norm": 16.875, + "learning_rate": 1.8265545917137817e-06, + "loss": 0.878, + "step": 2923 + }, + { + "epoch": 0.7109166058837831, + "grad_norm": 17.375, + "learning_rate": 1.826125762652105e-06, + "loss": 0.8941, + "step": 2924 + }, + { + "epoch": 0.7111597374179431, + "grad_norm": 19.5, + "learning_rate": 1.8256968474784835e-06, + "loss": 0.6803, + "step": 2925 + }, + { + "epoch": 0.7114028689521031, + "grad_norm": 18.875, + "learning_rate": 1.8252678462570253e-06, + "loss": 1.1147, + "step": 2926 + }, + { + "epoch": 0.711646000486263, + "grad_norm": 18.375, + "learning_rate": 1.8248387590518522e-06, + "loss": 1.1708, + "step": 2927 + }, + { + "epoch": 0.711889132020423, + "grad_norm": 19.875, + "learning_rate": 1.8244095859270992e-06, + "loss": 0.8755, + "step": 2928 + }, + { + "epoch": 0.712132263554583, + "grad_norm": 22.0, + "learning_rate": 1.8239803269469126e-06, + "loss": 0.9856, + "step": 2929 + }, + { + "epoch": 0.712375395088743, + "grad_norm": 19.375, + "learning_rate": 1.8235509821754532e-06, + "loss": 0.7377, + "step": 2930 + }, + { + "epoch": 0.712618526622903, + "grad_norm": 20.625, + "learning_rate": 1.823121551676894e-06, + "loss": 0.9506, + "step": 2931 + }, + { + "epoch": 0.7128616581570629, + "grad_norm": 16.375, + "learning_rate": 1.822692035515421e-06, + "loss": 0.7385, + "step": 2932 + }, + { + "epoch": 0.7131047896912229, + "grad_norm": 20.25, + "learning_rate": 1.8222624337552325e-06, + "loss": 0.9211, + "step": 2933 + }, + { + "epoch": 0.7133479212253829, + "grad_norm": 23.375, + "learning_rate": 1.8218327464605397e-06, + "loss": 1.2839, + "step": 2934 + }, + { + "epoch": 0.7135910527595429, + "grad_norm": 22.375, + "learning_rate": 1.8214029736955675e-06, + "loss": 1.0768, + "step": 2935 + }, + { + "epoch": 0.7138341842937029, + "grad_norm": 22.75, + "learning_rate": 1.8209731155245523e-06, + "loss": 1.1243, + "step": 2936 + }, + { + "epoch": 0.7140773158278628, + "grad_norm": 23.375, + "learning_rate": 1.8205431720117436e-06, + "loss": 0.8342, + "step": 2937 + }, + { + "epoch": 0.7143204473620228, + "grad_norm": 15.875, + "learning_rate": 1.8201131432214045e-06, + "loss": 0.6138, + "step": 2938 + }, + { + "epoch": 0.7145635788961828, + "grad_norm": 17.625, + "learning_rate": 1.8196830292178097e-06, + "loss": 0.805, + "step": 2939 + }, + { + "epoch": 0.7148067104303428, + "grad_norm": 16.375, + "learning_rate": 1.8192528300652479e-06, + "loss": 0.7749, + "step": 2940 + }, + { + "epoch": 0.7150498419645028, + "grad_norm": 18.625, + "learning_rate": 1.8188225458280187e-06, + "loss": 0.7135, + "step": 2941 + }, + { + "epoch": 0.7152929734986628, + "grad_norm": 16.25, + "learning_rate": 1.8183921765704365e-06, + "loss": 0.5857, + "step": 2942 + }, + { + "epoch": 0.7155361050328227, + "grad_norm": 19.125, + "learning_rate": 1.8179617223568269e-06, + "loss": 0.7907, + "step": 2943 + }, + { + "epoch": 0.7157792365669827, + "grad_norm": 15.25, + "learning_rate": 1.8175311832515289e-06, + "loss": 0.4061, + "step": 2944 + }, + { + "epoch": 0.7160223681011427, + "grad_norm": 17.0, + "learning_rate": 1.8171005593188939e-06, + "loss": 0.865, + "step": 2945 + }, + { + "epoch": 0.7162654996353027, + "grad_norm": 13.75, + "learning_rate": 1.816669850623286e-06, + "loss": 0.5594, + "step": 2946 + }, + { + "epoch": 0.7165086311694627, + "grad_norm": 29.75, + "learning_rate": 1.8162390572290828e-06, + "loss": 1.004, + "step": 2947 + }, + { + "epoch": 0.7167517627036226, + "grad_norm": 28.75, + "learning_rate": 1.8158081792006727e-06, + "loss": 1.5714, + "step": 2948 + }, + { + "epoch": 0.7169948942377826, + "grad_norm": 16.125, + "learning_rate": 1.8153772166024585e-06, + "loss": 0.6644, + "step": 2949 + }, + { + "epoch": 0.7172380257719426, + "grad_norm": 14.0, + "learning_rate": 1.8149461694988548e-06, + "loss": 0.4888, + "step": 2950 + }, + { + "epoch": 0.7174811573061026, + "grad_norm": 18.625, + "learning_rate": 1.814515037954289e-06, + "loss": 0.5384, + "step": 2951 + }, + { + "epoch": 0.7177242888402626, + "grad_norm": 15.0, + "learning_rate": 1.8140838220332019e-06, + "loss": 0.6208, + "step": 2952 + }, + { + "epoch": 0.7179674203744225, + "grad_norm": 17.75, + "learning_rate": 1.8136525218000448e-06, + "loss": 0.6364, + "step": 2953 + }, + { + "epoch": 0.7182105519085825, + "grad_norm": 17.5, + "learning_rate": 1.8132211373192844e-06, + "loss": 0.8892, + "step": 2954 + }, + { + "epoch": 0.7184536834427425, + "grad_norm": 21.125, + "learning_rate": 1.8127896686553973e-06, + "loss": 0.8518, + "step": 2955 + }, + { + "epoch": 0.7186968149769025, + "grad_norm": 19.25, + "learning_rate": 1.8123581158728744e-06, + "loss": 0.7045, + "step": 2956 + }, + { + "epoch": 0.7189399465110625, + "grad_norm": 14.8125, + "learning_rate": 1.811926479036219e-06, + "loss": 0.5171, + "step": 2957 + }, + { + "epoch": 0.7191830780452224, + "grad_norm": 20.125, + "learning_rate": 1.8114947582099466e-06, + "loss": 0.8784, + "step": 2958 + }, + { + "epoch": 0.7194262095793824, + "grad_norm": 18.625, + "learning_rate": 1.8110629534585854e-06, + "loss": 0.797, + "step": 2959 + }, + { + "epoch": 0.7196693411135424, + "grad_norm": 16.375, + "learning_rate": 1.8106310648466754e-06, + "loss": 0.7181, + "step": 2960 + }, + { + "epoch": 0.7199124726477024, + "grad_norm": 19.75, + "learning_rate": 1.8101990924387708e-06, + "loss": 1.164, + "step": 2961 + }, + { + "epoch": 0.7201556041818624, + "grad_norm": 17.25, + "learning_rate": 1.8097670362994368e-06, + "loss": 1.1201, + "step": 2962 + }, + { + "epoch": 0.7203987357160224, + "grad_norm": 20.875, + "learning_rate": 1.8093348964932516e-06, + "loss": 1.043, + "step": 2963 + }, + { + "epoch": 0.7206418672501823, + "grad_norm": 29.875, + "learning_rate": 1.808902673084806e-06, + "loss": 1.2297, + "step": 2964 + }, + { + "epoch": 0.7208849987843423, + "grad_norm": 19.75, + "learning_rate": 1.8084703661387035e-06, + "loss": 0.9002, + "step": 2965 + }, + { + "epoch": 0.7211281303185023, + "grad_norm": 21.5, + "learning_rate": 1.8080379757195597e-06, + "loss": 0.912, + "step": 2966 + }, + { + "epoch": 0.7213712618526623, + "grad_norm": 15.375, + "learning_rate": 1.8076055018920024e-06, + "loss": 0.65, + "step": 2967 + }, + { + "epoch": 0.7216143933868223, + "grad_norm": 20.625, + "learning_rate": 1.8071729447206731e-06, + "loss": 0.8992, + "step": 2968 + }, + { + "epoch": 0.7218575249209822, + "grad_norm": 19.875, + "learning_rate": 1.8067403042702241e-06, + "loss": 1.1088, + "step": 2969 + }, + { + "epoch": 0.7221006564551422, + "grad_norm": 18.875, + "learning_rate": 1.8063075806053219e-06, + "loss": 0.909, + "step": 2970 + }, + { + "epoch": 0.7223437879893022, + "grad_norm": 16.875, + "learning_rate": 1.8058747737906436e-06, + "loss": 1.0959, + "step": 2971 + }, + { + "epoch": 0.7225869195234622, + "grad_norm": 17.875, + "learning_rate": 1.80544188389088e-06, + "loss": 1.126, + "step": 2972 + }, + { + "epoch": 0.7228300510576222, + "grad_norm": 14.875, + "learning_rate": 1.8050089109707345e-06, + "loss": 0.4312, + "step": 2973 + }, + { + "epoch": 0.7230731825917821, + "grad_norm": 19.375, + "learning_rate": 1.8045758550949217e-06, + "loss": 0.9033, + "step": 2974 + }, + { + "epoch": 0.7233163141259421, + "grad_norm": 19.125, + "learning_rate": 1.8041427163281693e-06, + "loss": 0.9799, + "step": 2975 + }, + { + "epoch": 0.7235594456601021, + "grad_norm": 20.375, + "learning_rate": 1.8037094947352177e-06, + "loss": 0.8835, + "step": 2976 + }, + { + "epoch": 0.7238025771942621, + "grad_norm": 21.625, + "learning_rate": 1.8032761903808194e-06, + "loss": 1.0635, + "step": 2977 + }, + { + "epoch": 0.7240457087284221, + "grad_norm": 17.375, + "learning_rate": 1.802842803329739e-06, + "loss": 0.768, + "step": 2978 + }, + { + "epoch": 0.7242888402625821, + "grad_norm": 20.5, + "learning_rate": 1.8024093336467535e-06, + "loss": 1.0363, + "step": 2979 + }, + { + "epoch": 0.724531971796742, + "grad_norm": 23.875, + "learning_rate": 1.8019757813966526e-06, + "loss": 0.9208, + "step": 2980 + }, + { + "epoch": 0.724775103330902, + "grad_norm": 19.75, + "learning_rate": 1.8015421466442385e-06, + "loss": 0.7719, + "step": 2981 + }, + { + "epoch": 0.725018234865062, + "grad_norm": 16.625, + "learning_rate": 1.8011084294543245e-06, + "loss": 0.6558, + "step": 2982 + }, + { + "epoch": 0.725261366399222, + "grad_norm": 20.0, + "learning_rate": 1.8006746298917389e-06, + "loss": 0.8556, + "step": 2983 + }, + { + "epoch": 0.725504497933382, + "grad_norm": 26.125, + "learning_rate": 1.8002407480213183e-06, + "loss": 1.2889, + "step": 2984 + }, + { + "epoch": 0.7257476294675419, + "grad_norm": 18.875, + "learning_rate": 1.7998067839079154e-06, + "loss": 0.9437, + "step": 2985 + }, + { + "epoch": 0.7259907610017019, + "grad_norm": 23.875, + "learning_rate": 1.799372737616393e-06, + "loss": 1.1154, + "step": 2986 + }, + { + "epoch": 0.7262338925358619, + "grad_norm": 22.5, + "learning_rate": 1.798938609211627e-06, + "loss": 0.8806, + "step": 2987 + }, + { + "epoch": 0.7264770240700219, + "grad_norm": 19.875, + "learning_rate": 1.7985043987585054e-06, + "loss": 1.0027, + "step": 2988 + }, + { + "epoch": 0.7267201556041819, + "grad_norm": 22.875, + "learning_rate": 1.7980701063219286e-06, + "loss": 1.3771, + "step": 2989 + }, + { + "epoch": 0.7269632871383418, + "grad_norm": 24.0, + "learning_rate": 1.7976357319668086e-06, + "loss": 1.0942, + "step": 2990 + }, + { + "epoch": 0.7272064186725018, + "grad_norm": 18.75, + "learning_rate": 1.7972012757580703e-06, + "loss": 0.6214, + "step": 2991 + }, + { + "epoch": 0.7274495502066618, + "grad_norm": 21.125, + "learning_rate": 1.7967667377606515e-06, + "loss": 1.0108, + "step": 2992 + }, + { + "epoch": 0.7276926817408218, + "grad_norm": 20.25, + "learning_rate": 1.7963321180395004e-06, + "loss": 0.8376, + "step": 2993 + }, + { + "epoch": 0.7279358132749818, + "grad_norm": 34.0, + "learning_rate": 1.7958974166595788e-06, + "loss": 0.9713, + "step": 2994 + }, + { + "epoch": 0.7281789448091417, + "grad_norm": 21.5, + "learning_rate": 1.7954626336858602e-06, + "loss": 1.6373, + "step": 2995 + }, + { + "epoch": 0.7284220763433017, + "grad_norm": 12.0625, + "learning_rate": 1.7950277691833308e-06, + "loss": 0.3456, + "step": 2996 + }, + { + "epoch": 0.7286652078774617, + "grad_norm": 22.625, + "learning_rate": 1.7945928232169879e-06, + "loss": 0.9632, + "step": 2997 + }, + { + "epoch": 0.7289083394116217, + "grad_norm": 17.75, + "learning_rate": 1.7941577958518424e-06, + "loss": 0.7156, + "step": 2998 + }, + { + "epoch": 0.7291514709457817, + "grad_norm": 14.9375, + "learning_rate": 1.7937226871529162e-06, + "loss": 0.9448, + "step": 2999 + }, + { + "epoch": 0.7293946024799417, + "grad_norm": 22.625, + "learning_rate": 1.7932874971852443e-06, + "loss": 1.0137, + "step": 3000 + }, + { + "epoch": 0.7296377340141016, + "grad_norm": 21.125, + "learning_rate": 1.7928522260138729e-06, + "loss": 0.891, + "step": 3001 + }, + { + "epoch": 0.7298808655482616, + "grad_norm": 17.25, + "learning_rate": 1.7924168737038612e-06, + "loss": 0.8415, + "step": 3002 + }, + { + "epoch": 0.7301239970824216, + "grad_norm": 21.75, + "learning_rate": 1.791981440320279e-06, + "loss": 0.641, + "step": 3003 + }, + { + "epoch": 0.7303671286165816, + "grad_norm": 15.125, + "learning_rate": 1.791545925928211e-06, + "loss": 0.6934, + "step": 3004 + }, + { + "epoch": 0.7306102601507416, + "grad_norm": 18.125, + "learning_rate": 1.7911103305927512e-06, + "loss": 0.8781, + "step": 3005 + }, + { + "epoch": 0.7308533916849015, + "grad_norm": 15.9375, + "learning_rate": 1.7906746543790075e-06, + "loss": 0.711, + "step": 3006 + }, + { + "epoch": 0.7310965232190615, + "grad_norm": 17.0, + "learning_rate": 1.7902388973520987e-06, + "loss": 0.7602, + "step": 3007 + }, + { + "epoch": 0.7313396547532215, + "grad_norm": 20.5, + "learning_rate": 1.7898030595771566e-06, + "loss": 0.6901, + "step": 3008 + }, + { + "epoch": 0.7315827862873815, + "grad_norm": 19.125, + "learning_rate": 1.7893671411193244e-06, + "loss": 0.6929, + "step": 3009 + }, + { + "epoch": 0.7318259178215415, + "grad_norm": 20.25, + "learning_rate": 1.7889311420437578e-06, + "loss": 1.0812, + "step": 3010 + }, + { + "epoch": 0.7320690493557014, + "grad_norm": 28.25, + "learning_rate": 1.7884950624156242e-06, + "loss": 1.1491, + "step": 3011 + }, + { + "epoch": 0.7323121808898614, + "grad_norm": 18.375, + "learning_rate": 1.7880589023001036e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.7325553124240214, + "grad_norm": 27.25, + "learning_rate": 1.7876226617623874e-06, + "loss": 0.9335, + "step": 3013 + }, + { + "epoch": 0.7327984439581814, + "grad_norm": 36.25, + "learning_rate": 1.7871863408676796e-06, + "loss": 1.1777, + "step": 3014 + }, + { + "epoch": 0.7330415754923414, + "grad_norm": 23.0, + "learning_rate": 1.7867499396811949e-06, + "loss": 1.0634, + "step": 3015 + }, + { + "epoch": 0.7332847070265014, + "grad_norm": 20.5, + "learning_rate": 1.786313458268162e-06, + "loss": 0.9238, + "step": 3016 + }, + { + "epoch": 0.7335278385606613, + "grad_norm": 16.75, + "learning_rate": 1.785876896693821e-06, + "loss": 0.9873, + "step": 3017 + }, + { + "epoch": 0.7337709700948213, + "grad_norm": 17.125, + "learning_rate": 1.7854402550234218e-06, + "loss": 0.6296, + "step": 3018 + }, + { + "epoch": 0.7340141016289813, + "grad_norm": 18.125, + "learning_rate": 1.7850035333222298e-06, + "loss": 0.6889, + "step": 3019 + }, + { + "epoch": 0.7342572331631413, + "grad_norm": 18.0, + "learning_rate": 1.7845667316555198e-06, + "loss": 0.4127, + "step": 3020 + }, + { + "epoch": 0.7345003646973013, + "grad_norm": 20.0, + "learning_rate": 1.7841298500885798e-06, + "loss": 0.682, + "step": 3021 + }, + { + "epoch": 0.7347434962314612, + "grad_norm": 23.0, + "learning_rate": 1.7836928886867082e-06, + "loss": 1.0004, + "step": 3022 + }, + { + "epoch": 0.7349866277656212, + "grad_norm": 20.75, + "learning_rate": 1.783255847515218e-06, + "loss": 0.78, + "step": 3023 + }, + { + "epoch": 0.7352297592997812, + "grad_norm": 19.625, + "learning_rate": 1.7828187266394312e-06, + "loss": 0.886, + "step": 3024 + }, + { + "epoch": 0.7354728908339412, + "grad_norm": 14.4375, + "learning_rate": 1.7823815261246839e-06, + "loss": 0.6049, + "step": 3025 + }, + { + "epoch": 0.7357160223681012, + "grad_norm": 14.9375, + "learning_rate": 1.7819442460363225e-06, + "loss": 0.9734, + "step": 3026 + }, + { + "epoch": 0.7359591539022611, + "grad_norm": 20.5, + "learning_rate": 1.781506886439707e-06, + "loss": 0.9641, + "step": 3027 + }, + { + "epoch": 0.7362022854364211, + "grad_norm": 37.75, + "learning_rate": 1.7810694474002076e-06, + "loss": 1.4406, + "step": 3028 + }, + { + "epoch": 0.7364454169705811, + "grad_norm": 22.125, + "learning_rate": 1.7806319289832078e-06, + "loss": 1.0294, + "step": 3029 + }, + { + "epoch": 0.7366885485047411, + "grad_norm": 24.0, + "learning_rate": 1.7801943312541014e-06, + "loss": 0.6694, + "step": 3030 + }, + { + "epoch": 0.7369316800389011, + "grad_norm": 18.625, + "learning_rate": 1.7797566542782956e-06, + "loss": 0.6523, + "step": 3031 + }, + { + "epoch": 0.737174811573061, + "grad_norm": 18.75, + "learning_rate": 1.779318898121209e-06, + "loss": 0.8146, + "step": 3032 + }, + { + "epoch": 0.737417943107221, + "grad_norm": 31.875, + "learning_rate": 1.7788810628482708e-06, + "loss": 0.816, + "step": 3033 + }, + { + "epoch": 0.737661074641381, + "grad_norm": 28.125, + "learning_rate": 1.778443148524924e-06, + "loss": 1.3549, + "step": 3034 + }, + { + "epoch": 0.737904206175541, + "grad_norm": 24.0, + "learning_rate": 1.778005155216622e-06, + "loss": 0.9372, + "step": 3035 + }, + { + "epoch": 0.738147337709701, + "grad_norm": 23.125, + "learning_rate": 1.7775670829888309e-06, + "loss": 0.8605, + "step": 3036 + }, + { + "epoch": 0.738390469243861, + "grad_norm": 20.0, + "learning_rate": 1.7771289319070276e-06, + "loss": 0.9511, + "step": 3037 + }, + { + "epoch": 0.7386336007780209, + "grad_norm": 20.25, + "learning_rate": 1.7766907020367013e-06, + "loss": 0.708, + "step": 3038 + }, + { + "epoch": 0.7388767323121809, + "grad_norm": 21.0, + "learning_rate": 1.7762523934433538e-06, + "loss": 0.8422, + "step": 3039 + }, + { + "epoch": 0.7391198638463409, + "grad_norm": 16.875, + "learning_rate": 1.7758140061924971e-06, + "loss": 0.686, + "step": 3040 + }, + { + "epoch": 0.7393629953805009, + "grad_norm": 17.25, + "learning_rate": 1.7753755403496564e-06, + "loss": 0.73, + "step": 3041 + }, + { + "epoch": 0.7396061269146609, + "grad_norm": 17.0, + "learning_rate": 1.774936995980367e-06, + "loss": 0.6003, + "step": 3042 + }, + { + "epoch": 0.7398492584488208, + "grad_norm": 19.25, + "learning_rate": 1.7744983731501783e-06, + "loss": 0.9744, + "step": 3043 + }, + { + "epoch": 0.7400923899829808, + "grad_norm": 20.625, + "learning_rate": 1.774059671924649e-06, + "loss": 1.174, + "step": 3044 + }, + { + "epoch": 0.7403355215171408, + "grad_norm": 17.625, + "learning_rate": 1.773620892369351e-06, + "loss": 0.8853, + "step": 3045 + }, + { + "epoch": 0.7405786530513008, + "grad_norm": 12.6875, + "learning_rate": 1.7731820345498672e-06, + "loss": 0.5966, + "step": 3046 + }, + { + "epoch": 0.7408217845854608, + "grad_norm": 18.625, + "learning_rate": 1.7727430985317927e-06, + "loss": 1.2801, + "step": 3047 + }, + { + "epoch": 0.7410649161196207, + "grad_norm": 17.125, + "learning_rate": 1.7723040843807343e-06, + "loss": 0.8067, + "step": 3048 + }, + { + "epoch": 0.7413080476537807, + "grad_norm": 16.5, + "learning_rate": 1.7718649921623097e-06, + "loss": 0.635, + "step": 3049 + }, + { + "epoch": 0.7415511791879407, + "grad_norm": 16.125, + "learning_rate": 1.7714258219421493e-06, + "loss": 0.6125, + "step": 3050 + }, + { + "epoch": 0.7417943107221007, + "grad_norm": 18.125, + "learning_rate": 1.7709865737858945e-06, + "loss": 1.0174, + "step": 3051 + }, + { + "epoch": 0.7420374422562607, + "grad_norm": 16.625, + "learning_rate": 1.7705472477591982e-06, + "loss": 0.7565, + "step": 3052 + }, + { + "epoch": 0.7422805737904207, + "grad_norm": 23.5, + "learning_rate": 1.7701078439277255e-06, + "loss": 0.4331, + "step": 3053 + }, + { + "epoch": 0.7425237053245806, + "grad_norm": 22.5, + "learning_rate": 1.7696683623571533e-06, + "loss": 0.9624, + "step": 3054 + }, + { + "epoch": 0.7427668368587406, + "grad_norm": 13.3125, + "learning_rate": 1.7692288031131694e-06, + "loss": 0.4313, + "step": 3055 + }, + { + "epoch": 0.7430099683929006, + "grad_norm": 18.375, + "learning_rate": 1.7687891662614733e-06, + "loss": 0.7108, + "step": 3056 + }, + { + "epoch": 0.7432530999270606, + "grad_norm": 21.875, + "learning_rate": 1.7683494518677766e-06, + "loss": 0.8518, + "step": 3057 + }, + { + "epoch": 0.7434962314612206, + "grad_norm": 15.0625, + "learning_rate": 1.7679096599978019e-06, + "loss": 0.62, + "step": 3058 + }, + { + "epoch": 0.7437393629953805, + "grad_norm": 21.75, + "learning_rate": 1.7674697907172841e-06, + "loss": 1.3389, + "step": 3059 + }, + { + "epoch": 0.7439824945295405, + "grad_norm": 22.5, + "learning_rate": 1.7670298440919692e-06, + "loss": 1.0756, + "step": 3060 + }, + { + "epoch": 0.7442256260637005, + "grad_norm": 19.625, + "learning_rate": 1.766589820187614e-06, + "loss": 0.735, + "step": 3061 + }, + { + "epoch": 0.7444687575978605, + "grad_norm": 24.625, + "learning_rate": 1.7661497190699894e-06, + "loss": 0.9854, + "step": 3062 + }, + { + "epoch": 0.7447118891320205, + "grad_norm": 17.75, + "learning_rate": 1.7657095408048744e-06, + "loss": 0.9337, + "step": 3063 + }, + { + "epoch": 0.7449550206661804, + "grad_norm": 16.25, + "learning_rate": 1.7652692854580622e-06, + "loss": 0.5433, + "step": 3064 + }, + { + "epoch": 0.7451981522003404, + "grad_norm": 18.375, + "learning_rate": 1.7648289530953561e-06, + "loss": 0.8421, + "step": 3065 + }, + { + "epoch": 0.7454412837345004, + "grad_norm": 31.875, + "learning_rate": 1.7643885437825715e-06, + "loss": 1.266, + "step": 3066 + }, + { + "epoch": 0.7456844152686604, + "grad_norm": 18.625, + "learning_rate": 1.7639480575855356e-06, + "loss": 0.7353, + "step": 3067 + }, + { + "epoch": 0.7459275468028204, + "grad_norm": 14.0, + "learning_rate": 1.7635074945700858e-06, + "loss": 0.684, + "step": 3068 + }, + { + "epoch": 0.7461706783369803, + "grad_norm": 20.75, + "learning_rate": 1.7630668548020726e-06, + "loss": 0.6465, + "step": 3069 + }, + { + "epoch": 0.7464138098711403, + "grad_norm": 36.5, + "learning_rate": 1.762626138347357e-06, + "loss": 1.2077, + "step": 3070 + }, + { + "epoch": 0.7466569414053003, + "grad_norm": 20.875, + "learning_rate": 1.7621853452718115e-06, + "loss": 1.0533, + "step": 3071 + }, + { + "epoch": 0.7469000729394603, + "grad_norm": 13.875, + "learning_rate": 1.7617444756413205e-06, + "loss": 0.5045, + "step": 3072 + }, + { + "epoch": 0.7471432044736203, + "grad_norm": 16.5, + "learning_rate": 1.7613035295217795e-06, + "loss": 0.6456, + "step": 3073 + }, + { + "epoch": 0.7473863360077803, + "grad_norm": 17.125, + "learning_rate": 1.7608625069790959e-06, + "loss": 0.8867, + "step": 3074 + }, + { + "epoch": 0.7476294675419402, + "grad_norm": 24.0, + "learning_rate": 1.760421408079187e-06, + "loss": 0.805, + "step": 3075 + }, + { + "epoch": 0.7478725990761002, + "grad_norm": 21.5, + "learning_rate": 1.759980232887984e-06, + "loss": 0.5942, + "step": 3076 + }, + { + "epoch": 0.7481157306102602, + "grad_norm": 12.375, + "learning_rate": 1.759538981471427e-06, + "loss": 0.3936, + "step": 3077 + }, + { + "epoch": 0.7483588621444202, + "grad_norm": 18.375, + "learning_rate": 1.7590976538954696e-06, + "loss": 1.0391, + "step": 3078 + }, + { + "epoch": 0.7486019936785802, + "grad_norm": 13.1875, + "learning_rate": 1.7586562502260753e-06, + "loss": 0.4015, + "step": 3079 + }, + { + "epoch": 0.74884512521274, + "grad_norm": 22.0, + "learning_rate": 1.7582147705292192e-06, + "loss": 0.9547, + "step": 3080 + }, + { + "epoch": 0.7490882567469, + "grad_norm": 18.5, + "learning_rate": 1.757773214870889e-06, + "loss": 0.93, + "step": 3081 + }, + { + "epoch": 0.74933138828106, + "grad_norm": 17.25, + "learning_rate": 1.7573315833170821e-06, + "loss": 0.8952, + "step": 3082 + }, + { + "epoch": 0.74957451981522, + "grad_norm": 19.125, + "learning_rate": 1.7568898759338082e-06, + "loss": 0.8293, + "step": 3083 + }, + { + "epoch": 0.74981765134938, + "grad_norm": 18.5, + "learning_rate": 1.756448092787088e-06, + "loss": 0.8411, + "step": 3084 + }, + { + "epoch": 0.75006078288354, + "grad_norm": 13.375, + "learning_rate": 1.7560062339429533e-06, + "loss": 0.4048, + "step": 3085 + }, + { + "epoch": 0.7503039144177, + "grad_norm": 17.75, + "learning_rate": 1.7555642994674489e-06, + "loss": 1.0634, + "step": 3086 + }, + { + "epoch": 0.75054704595186, + "grad_norm": 18.0, + "learning_rate": 1.7551222894266278e-06, + "loss": 0.7873, + "step": 3087 + }, + { + "epoch": 0.75079017748602, + "grad_norm": 18.375, + "learning_rate": 1.7546802038865568e-06, + "loss": 0.7158, + "step": 3088 + }, + { + "epoch": 0.75103330902018, + "grad_norm": 13.1875, + "learning_rate": 1.7542380429133133e-06, + "loss": 0.3718, + "step": 3089 + }, + { + "epoch": 0.75127644055434, + "grad_norm": 12.375, + "learning_rate": 1.7537958065729857e-06, + "loss": 0.3316, + "step": 3090 + }, + { + "epoch": 0.7515195720884998, + "grad_norm": 26.0, + "learning_rate": 1.7533534949316745e-06, + "loss": 1.5041, + "step": 3091 + }, + { + "epoch": 0.7517627036226598, + "grad_norm": 23.25, + "learning_rate": 1.7529111080554894e-06, + "loss": 0.8524, + "step": 3092 + }, + { + "epoch": 0.7520058351568198, + "grad_norm": 15.0625, + "learning_rate": 1.7524686460105542e-06, + "loss": 0.6641, + "step": 3093 + }, + { + "epoch": 0.7522489666909798, + "grad_norm": 22.875, + "learning_rate": 1.7520261088630016e-06, + "loss": 0.5891, + "step": 3094 + }, + { + "epoch": 0.7524920982251398, + "grad_norm": 17.875, + "learning_rate": 1.751583496678977e-06, + "loss": 0.9094, + "step": 3095 + }, + { + "epoch": 0.7527352297592997, + "grad_norm": 15.0625, + "learning_rate": 1.751140809524636e-06, + "loss": 0.4343, + "step": 3096 + }, + { + "epoch": 0.7529783612934597, + "grad_norm": 22.25, + "learning_rate": 1.7506980474661462e-06, + "loss": 0.7665, + "step": 3097 + }, + { + "epoch": 0.7532214928276197, + "grad_norm": 17.0, + "learning_rate": 1.750255210569686e-06, + "loss": 0.8135, + "step": 3098 + }, + { + "epoch": 0.7534646243617797, + "grad_norm": 20.75, + "learning_rate": 1.7498122989014443e-06, + "loss": 0.927, + "step": 3099 + }, + { + "epoch": 0.7537077558959397, + "grad_norm": 18.5, + "learning_rate": 1.749369312527623e-06, + "loss": 0.7866, + "step": 3100 + }, + { + "epoch": 0.7539508874300996, + "grad_norm": 18.5, + "learning_rate": 1.7489262515144333e-06, + "loss": 0.7273, + "step": 3101 + }, + { + "epoch": 0.7541940189642596, + "grad_norm": 18.5, + "learning_rate": 1.7484831159280986e-06, + "loss": 1.1789, + "step": 3102 + }, + { + "epoch": 0.7544371504984196, + "grad_norm": 16.25, + "learning_rate": 1.7480399058348529e-06, + "loss": 0.8168, + "step": 3103 + }, + { + "epoch": 0.7546802820325796, + "grad_norm": 22.625, + "learning_rate": 1.747596621300942e-06, + "loss": 0.8012, + "step": 3104 + }, + { + "epoch": 0.7549234135667396, + "grad_norm": 26.25, + "learning_rate": 1.7471532623926227e-06, + "loss": 1.0752, + "step": 3105 + }, + { + "epoch": 0.7551665451008995, + "grad_norm": 36.25, + "learning_rate": 1.7467098291761616e-06, + "loss": 0.8673, + "step": 3106 + }, + { + "epoch": 0.7554096766350595, + "grad_norm": 20.375, + "learning_rate": 1.7462663217178382e-06, + "loss": 0.9313, + "step": 3107 + }, + { + "epoch": 0.7556528081692195, + "grad_norm": 18.875, + "learning_rate": 1.7458227400839422e-06, + "loss": 0.8523, + "step": 3108 + }, + { + "epoch": 0.7558959397033795, + "grad_norm": 16.75, + "learning_rate": 1.7453790843407747e-06, + "loss": 0.7026, + "step": 3109 + }, + { + "epoch": 0.7561390712375395, + "grad_norm": 15.9375, + "learning_rate": 1.7449353545546477e-06, + "loss": 0.5233, + "step": 3110 + }, + { + "epoch": 0.7563822027716995, + "grad_norm": 14.125, + "learning_rate": 1.7444915507918835e-06, + "loss": 0.653, + "step": 3111 + }, + { + "epoch": 0.7566253343058594, + "grad_norm": 21.75, + "learning_rate": 1.7440476731188175e-06, + "loss": 1.1768, + "step": 3112 + }, + { + "epoch": 0.7568684658400194, + "grad_norm": 17.75, + "learning_rate": 1.743603721601794e-06, + "loss": 0.5922, + "step": 3113 + }, + { + "epoch": 0.7571115973741794, + "grad_norm": 16.375, + "learning_rate": 1.7431596963071695e-06, + "loss": 0.7568, + "step": 3114 + }, + { + "epoch": 0.7573547289083394, + "grad_norm": 15.5, + "learning_rate": 1.742715597301311e-06, + "loss": 0.6965, + "step": 3115 + }, + { + "epoch": 0.7575978604424994, + "grad_norm": 16.0, + "learning_rate": 1.7422714246505972e-06, + "loss": 0.5604, + "step": 3116 + }, + { + "epoch": 0.7578409919766593, + "grad_norm": 24.375, + "learning_rate": 1.7418271784214174e-06, + "loss": 1.4837, + "step": 3117 + }, + { + "epoch": 0.7580841235108193, + "grad_norm": 18.75, + "learning_rate": 1.7413828586801713e-06, + "loss": 0.9794, + "step": 3118 + }, + { + "epoch": 0.7583272550449793, + "grad_norm": 20.125, + "learning_rate": 1.7409384654932707e-06, + "loss": 1.03, + "step": 3119 + }, + { + "epoch": 0.7585703865791393, + "grad_norm": 18.125, + "learning_rate": 1.7404939989271374e-06, + "loss": 0.923, + "step": 3120 + }, + { + "epoch": 0.7588135181132993, + "grad_norm": 19.5, + "learning_rate": 1.7400494590482049e-06, + "loss": 1.0926, + "step": 3121 + }, + { + "epoch": 0.7590566496474592, + "grad_norm": 21.875, + "learning_rate": 1.7396048459229175e-06, + "loss": 0.6412, + "step": 3122 + }, + { + "epoch": 0.7592997811816192, + "grad_norm": 21.375, + "learning_rate": 1.73916015961773e-06, + "loss": 1.078, + "step": 3123 + }, + { + "epoch": 0.7595429127157792, + "grad_norm": 17.125, + "learning_rate": 1.7387154001991086e-06, + "loss": 0.6388, + "step": 3124 + }, + { + "epoch": 0.7597860442499392, + "grad_norm": 17.125, + "learning_rate": 1.73827056773353e-06, + "loss": 0.6687, + "step": 3125 + }, + { + "epoch": 0.7600291757840992, + "grad_norm": 17.5, + "learning_rate": 1.7378256622874826e-06, + "loss": 0.8569, + "step": 3126 + }, + { + "epoch": 0.7602723073182592, + "grad_norm": 18.375, + "learning_rate": 1.7373806839274647e-06, + "loss": 1.1778, + "step": 3127 + }, + { + "epoch": 0.7605154388524191, + "grad_norm": 19.75, + "learning_rate": 1.7369356327199862e-06, + "loss": 1.0933, + "step": 3128 + }, + { + "epoch": 0.7607585703865791, + "grad_norm": 24.375, + "learning_rate": 1.736490508731568e-06, + "loss": 0.9143, + "step": 3129 + }, + { + "epoch": 0.7610017019207391, + "grad_norm": 19.25, + "learning_rate": 1.736045312028741e-06, + "loss": 0.7533, + "step": 3130 + }, + { + "epoch": 0.7612448334548991, + "grad_norm": 18.375, + "learning_rate": 1.735600042678048e-06, + "loss": 0.9688, + "step": 3131 + }, + { + "epoch": 0.7614879649890591, + "grad_norm": 16.875, + "learning_rate": 1.735154700746042e-06, + "loss": 0.3887, + "step": 3132 + }, + { + "epoch": 0.761731096523219, + "grad_norm": 18.875, + "learning_rate": 1.7347092862992871e-06, + "loss": 0.8986, + "step": 3133 + }, + { + "epoch": 0.761974228057379, + "grad_norm": 22.75, + "learning_rate": 1.7342637994043582e-06, + "loss": 1.1174, + "step": 3134 + }, + { + "epoch": 0.762217359591539, + "grad_norm": 14.9375, + "learning_rate": 1.733818240127841e-06, + "loss": 0.7159, + "step": 3135 + }, + { + "epoch": 0.762460491125699, + "grad_norm": 14.5625, + "learning_rate": 1.7333726085363317e-06, + "loss": 0.5699, + "step": 3136 + }, + { + "epoch": 0.762703622659859, + "grad_norm": 19.0, + "learning_rate": 1.732926904696438e-06, + "loss": 0.7077, + "step": 3137 + }, + { + "epoch": 0.7629467541940189, + "grad_norm": 25.25, + "learning_rate": 1.7324811286747779e-06, + "loss": 0.9547, + "step": 3138 + }, + { + "epoch": 0.7631898857281789, + "grad_norm": 16.375, + "learning_rate": 1.7320352805379807e-06, + "loss": 0.7508, + "step": 3139 + }, + { + "epoch": 0.7634330172623389, + "grad_norm": 18.5, + "learning_rate": 1.7315893603526857e-06, + "loss": 1.1658, + "step": 3140 + }, + { + "epoch": 0.7636761487964989, + "grad_norm": 20.375, + "learning_rate": 1.7311433681855432e-06, + "loss": 1.1308, + "step": 3141 + }, + { + "epoch": 0.7639192803306589, + "grad_norm": 17.75, + "learning_rate": 1.7306973041032145e-06, + "loss": 1.1094, + "step": 3142 + }, + { + "epoch": 0.7641624118648188, + "grad_norm": 18.875, + "learning_rate": 1.7302511681723721e-06, + "loss": 1.0631, + "step": 3143 + }, + { + "epoch": 0.7644055433989788, + "grad_norm": 19.75, + "learning_rate": 1.729804960459699e-06, + "loss": 0.7407, + "step": 3144 + }, + { + "epoch": 0.7646486749331388, + "grad_norm": 20.625, + "learning_rate": 1.7293586810318872e-06, + "loss": 1.0228, + "step": 3145 + }, + { + "epoch": 0.7648918064672988, + "grad_norm": 14.6875, + "learning_rate": 1.7289123299556419e-06, + "loss": 0.5856, + "step": 3146 + }, + { + "epoch": 0.7651349380014588, + "grad_norm": 14.0, + "learning_rate": 1.7284659072976778e-06, + "loss": 0.7226, + "step": 3147 + }, + { + "epoch": 0.7653780695356188, + "grad_norm": 17.875, + "learning_rate": 1.7280194131247208e-06, + "loss": 0.8066, + "step": 3148 + }, + { + "epoch": 0.7656212010697787, + "grad_norm": 17.25, + "learning_rate": 1.7275728475035063e-06, + "loss": 0.6307, + "step": 3149 + }, + { + "epoch": 0.7658643326039387, + "grad_norm": 19.125, + "learning_rate": 1.727126210500782e-06, + "loss": 0.8575, + "step": 3150 + }, + { + "epoch": 0.7661074641380987, + "grad_norm": 23.625, + "learning_rate": 1.7266795021833052e-06, + "loss": 0.8573, + "step": 3151 + }, + { + "epoch": 0.7663505956722587, + "grad_norm": 20.875, + "learning_rate": 1.7262327226178445e-06, + "loss": 1.252, + "step": 3152 + }, + { + "epoch": 0.7665937272064187, + "grad_norm": 17.5, + "learning_rate": 1.7257858718711784e-06, + "loss": 0.9626, + "step": 3153 + }, + { + "epoch": 0.7668368587405786, + "grad_norm": 20.25, + "learning_rate": 1.7253389500100965e-06, + "loss": 0.6853, + "step": 3154 + }, + { + "epoch": 0.7670799902747386, + "grad_norm": 16.125, + "learning_rate": 1.724891957101399e-06, + "loss": 0.7116, + "step": 3155 + }, + { + "epoch": 0.7673231218088986, + "grad_norm": 16.75, + "learning_rate": 1.7244448932118976e-06, + "loss": 0.7574, + "step": 3156 + }, + { + "epoch": 0.7675662533430586, + "grad_norm": 20.25, + "learning_rate": 1.7239977584084122e-06, + "loss": 0.9469, + "step": 3157 + }, + { + "epoch": 0.7678093848772186, + "grad_norm": 21.0, + "learning_rate": 1.723550552757776e-06, + "loss": 1.0499, + "step": 3158 + }, + { + "epoch": 0.7680525164113785, + "grad_norm": 16.75, + "learning_rate": 1.7231032763268314e-06, + "loss": 0.4783, + "step": 3159 + }, + { + "epoch": 0.7682956479455385, + "grad_norm": 18.75, + "learning_rate": 1.722655929182431e-06, + "loss": 1.0957, + "step": 3160 + }, + { + "epoch": 0.7685387794796985, + "grad_norm": 17.375, + "learning_rate": 1.7222085113914388e-06, + "loss": 0.6827, + "step": 3161 + }, + { + "epoch": 0.7687819110138585, + "grad_norm": 16.0, + "learning_rate": 1.7217610230207294e-06, + "loss": 0.682, + "step": 3162 + }, + { + "epoch": 0.7690250425480185, + "grad_norm": 42.5, + "learning_rate": 1.7213134641371876e-06, + "loss": 0.893, + "step": 3163 + }, + { + "epoch": 0.7692681740821785, + "grad_norm": 18.75, + "learning_rate": 1.7208658348077087e-06, + "loss": 0.5515, + "step": 3164 + }, + { + "epoch": 0.7695113056163384, + "grad_norm": 19.25, + "learning_rate": 1.7204181350991987e-06, + "loss": 0.762, + "step": 3165 + }, + { + "epoch": 0.7697544371504984, + "grad_norm": 17.5, + "learning_rate": 1.7199703650785738e-06, + "loss": 0.7527, + "step": 3166 + }, + { + "epoch": 0.7699975686846584, + "grad_norm": 14.625, + "learning_rate": 1.7195225248127611e-06, + "loss": 0.764, + "step": 3167 + }, + { + "epoch": 0.7702407002188184, + "grad_norm": 14.875, + "learning_rate": 1.7190746143686986e-06, + "loss": 0.6568, + "step": 3168 + }, + { + "epoch": 0.7704838317529784, + "grad_norm": 21.75, + "learning_rate": 1.7186266338133334e-06, + "loss": 0.9405, + "step": 3169 + }, + { + "epoch": 0.7707269632871383, + "grad_norm": 16.875, + "learning_rate": 1.7181785832136245e-06, + "loss": 0.9862, + "step": 3170 + }, + { + "epoch": 0.7709700948212983, + "grad_norm": 26.0, + "learning_rate": 1.7177304626365404e-06, + "loss": 1.051, + "step": 3171 + }, + { + "epoch": 0.7712132263554583, + "grad_norm": 23.25, + "learning_rate": 1.717282272149061e-06, + "loss": 1.0199, + "step": 3172 + }, + { + "epoch": 0.7714563578896183, + "grad_norm": 25.25, + "learning_rate": 1.7168340118181754e-06, + "loss": 1.2688, + "step": 3173 + }, + { + "epoch": 0.7716994894237783, + "grad_norm": 18.875, + "learning_rate": 1.7163856817108845e-06, + "loss": 0.8713, + "step": 3174 + }, + { + "epoch": 0.7719426209579382, + "grad_norm": 14.125, + "learning_rate": 1.715937281894199e-06, + "loss": 0.4008, + "step": 3175 + }, + { + "epoch": 0.7721857524920982, + "grad_norm": 16.25, + "learning_rate": 1.715488812435139e-06, + "loss": 0.7364, + "step": 3176 + }, + { + "epoch": 0.7724288840262582, + "grad_norm": 20.375, + "learning_rate": 1.7150402734007372e-06, + "loss": 1.1006, + "step": 3177 + }, + { + "epoch": 0.7726720155604182, + "grad_norm": 15.0, + "learning_rate": 1.7145916648580345e-06, + "loss": 0.5837, + "step": 3178 + }, + { + "epoch": 0.7729151470945782, + "grad_norm": 15.9375, + "learning_rate": 1.7141429868740843e-06, + "loss": 0.6657, + "step": 3179 + }, + { + "epoch": 0.7731582786287381, + "grad_norm": 15.25, + "learning_rate": 1.7136942395159487e-06, + "loss": 0.4304, + "step": 3180 + }, + { + "epoch": 0.7734014101628981, + "grad_norm": 16.125, + "learning_rate": 1.7132454228507002e-06, + "loss": 0.6216, + "step": 3181 + }, + { + "epoch": 0.7736445416970581, + "grad_norm": 19.875, + "learning_rate": 1.7127965369454233e-06, + "loss": 1.0955, + "step": 3182 + }, + { + "epoch": 0.7738876732312181, + "grad_norm": 18.25, + "learning_rate": 1.7123475818672108e-06, + "loss": 0.8218, + "step": 3183 + }, + { + "epoch": 0.7741308047653781, + "grad_norm": 22.375, + "learning_rate": 1.7118985576831673e-06, + "loss": 1.0814, + "step": 3184 + }, + { + "epoch": 0.7743739362995381, + "grad_norm": 20.75, + "learning_rate": 1.7114494644604072e-06, + "loss": 1.1863, + "step": 3185 + }, + { + "epoch": 0.774617067833698, + "grad_norm": 18.125, + "learning_rate": 1.7110003022660548e-06, + "loss": 0.825, + "step": 3186 + }, + { + "epoch": 0.774860199367858, + "grad_norm": 25.375, + "learning_rate": 1.7105510711672456e-06, + "loss": 0.7863, + "step": 3187 + }, + { + "epoch": 0.775103330902018, + "grad_norm": 23.5, + "learning_rate": 1.710101771231125e-06, + "loss": 0.903, + "step": 3188 + }, + { + "epoch": 0.775346462436178, + "grad_norm": 20.75, + "learning_rate": 1.7096524025248483e-06, + "loss": 1.0174, + "step": 3189 + }, + { + "epoch": 0.775589593970338, + "grad_norm": 27.0, + "learning_rate": 1.7092029651155816e-06, + "loss": 1.0477, + "step": 3190 + }, + { + "epoch": 0.7758327255044979, + "grad_norm": 19.375, + "learning_rate": 1.7087534590705012e-06, + "loss": 0.965, + "step": 3191 + }, + { + "epoch": 0.7760758570386579, + "grad_norm": 20.375, + "learning_rate": 1.7083038844567931e-06, + "loss": 1.0624, + "step": 3192 + }, + { + "epoch": 0.7763189885728179, + "grad_norm": 20.25, + "learning_rate": 1.7078542413416547e-06, + "loss": 1.0174, + "step": 3193 + }, + { + "epoch": 0.7765621201069779, + "grad_norm": 12.125, + "learning_rate": 1.7074045297922924e-06, + "loss": 0.6654, + "step": 3194 + }, + { + "epoch": 0.7768052516411379, + "grad_norm": 17.375, + "learning_rate": 1.7069547498759231e-06, + "loss": 0.8047, + "step": 3195 + }, + { + "epoch": 0.7770483831752978, + "grad_norm": 18.125, + "learning_rate": 1.706504901659775e-06, + "loss": 0.7833, + "step": 3196 + }, + { + "epoch": 0.7772915147094578, + "grad_norm": 16.625, + "learning_rate": 1.706054985211085e-06, + "loss": 0.676, + "step": 3197 + }, + { + "epoch": 0.7775346462436178, + "grad_norm": 27.125, + "learning_rate": 1.7056050005971008e-06, + "loss": 1.1959, + "step": 3198 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 17.875, + "learning_rate": 1.7051549478850816e-06, + "loss": 0.9621, + "step": 3199 + }, + { + "epoch": 0.7780209093119378, + "grad_norm": 19.25, + "learning_rate": 1.7047048271422937e-06, + "loss": 1.1471, + "step": 3200 + }, + { + "epoch": 0.7782640408460978, + "grad_norm": 20.375, + "learning_rate": 1.7042546384360171e-06, + "loss": 0.7744, + "step": 3201 + }, + { + "epoch": 0.7785071723802577, + "grad_norm": 15.3125, + "learning_rate": 1.7038043818335389e-06, + "loss": 0.691, + "step": 3202 + }, + { + "epoch": 0.7787503039144177, + "grad_norm": 19.25, + "learning_rate": 1.7033540574021587e-06, + "loss": 0.6704, + "step": 3203 + }, + { + "epoch": 0.7789934354485777, + "grad_norm": 19.0, + "learning_rate": 1.7029036652091846e-06, + "loss": 1.4227, + "step": 3204 + }, + { + "epoch": 0.7792365669827377, + "grad_norm": 16.625, + "learning_rate": 1.7024532053219362e-06, + "loss": 0.6215, + "step": 3205 + }, + { + "epoch": 0.7794796985168977, + "grad_norm": 16.25, + "learning_rate": 1.7020026778077423e-06, + "loss": 0.7057, + "step": 3206 + }, + { + "epoch": 0.7797228300510576, + "grad_norm": 24.75, + "learning_rate": 1.7015520827339415e-06, + "loss": 0.8641, + "step": 3207 + }, + { + "epoch": 0.7799659615852176, + "grad_norm": 16.125, + "learning_rate": 1.7011014201678832e-06, + "loss": 1.3396, + "step": 3208 + }, + { + "epoch": 0.7802090931193776, + "grad_norm": 17.0, + "learning_rate": 1.7006506901769271e-06, + "loss": 0.9634, + "step": 3209 + }, + { + "epoch": 0.7804522246535376, + "grad_norm": 15.1875, + "learning_rate": 1.7001998928284423e-06, + "loss": 0.627, + "step": 3210 + }, + { + "epoch": 0.7806953561876976, + "grad_norm": 22.5, + "learning_rate": 1.6997490281898082e-06, + "loss": 0.8008, + "step": 3211 + }, + { + "epoch": 0.7809384877218575, + "grad_norm": 19.625, + "learning_rate": 1.6992980963284144e-06, + "loss": 0.8307, + "step": 3212 + }, + { + "epoch": 0.7811816192560175, + "grad_norm": 16.375, + "learning_rate": 1.6988470973116611e-06, + "loss": 0.7118, + "step": 3213 + }, + { + "epoch": 0.7814247507901775, + "grad_norm": 20.875, + "learning_rate": 1.6983960312069566e-06, + "loss": 0.92, + "step": 3214 + }, + { + "epoch": 0.7816678823243375, + "grad_norm": 22.0, + "learning_rate": 1.6979448980817212e-06, + "loss": 1.061, + "step": 3215 + }, + { + "epoch": 0.7819110138584975, + "grad_norm": 18.75, + "learning_rate": 1.6974936980033846e-06, + "loss": 0.8675, + "step": 3216 + }, + { + "epoch": 0.7821541453926574, + "grad_norm": 19.625, + "learning_rate": 1.6970424310393863e-06, + "loss": 0.705, + "step": 3217 + }, + { + "epoch": 0.7823972769268174, + "grad_norm": 18.875, + "learning_rate": 1.6965910972571763e-06, + "loss": 0.833, + "step": 3218 + }, + { + "epoch": 0.7826404084609774, + "grad_norm": 23.5, + "learning_rate": 1.6961396967242133e-06, + "loss": 1.1624, + "step": 3219 + }, + { + "epoch": 0.7828835399951374, + "grad_norm": 20.875, + "learning_rate": 1.6956882295079683e-06, + "loss": 1.2196, + "step": 3220 + }, + { + "epoch": 0.7831266715292974, + "grad_norm": 19.625, + "learning_rate": 1.6952366956759195e-06, + "loss": 1.1915, + "step": 3221 + }, + { + "epoch": 0.7833698030634574, + "grad_norm": 16.875, + "learning_rate": 1.6947850952955572e-06, + "loss": 0.7576, + "step": 3222 + }, + { + "epoch": 0.7836129345976173, + "grad_norm": 18.75, + "learning_rate": 1.6943334284343807e-06, + "loss": 0.648, + "step": 3223 + }, + { + "epoch": 0.7838560661317773, + "grad_norm": 23.875, + "learning_rate": 1.6938816951598993e-06, + "loss": 0.8941, + "step": 3224 + }, + { + "epoch": 0.7840991976659373, + "grad_norm": 29.125, + "learning_rate": 1.6934298955396331e-06, + "loss": 1.1015, + "step": 3225 + }, + { + "epoch": 0.7843423292000973, + "grad_norm": 21.125, + "learning_rate": 1.6929780296411099e-06, + "loss": 0.9174, + "step": 3226 + }, + { + "epoch": 0.7845854607342573, + "grad_norm": 23.25, + "learning_rate": 1.69252609753187e-06, + "loss": 0.6624, + "step": 3227 + }, + { + "epoch": 0.7848285922684172, + "grad_norm": 17.625, + "learning_rate": 1.692074099279462e-06, + "loss": 0.7137, + "step": 3228 + }, + { + "epoch": 0.7850717238025772, + "grad_norm": 21.375, + "learning_rate": 1.6916220349514451e-06, + "loss": 1.0116, + "step": 3229 + }, + { + "epoch": 0.7853148553367372, + "grad_norm": 21.5, + "learning_rate": 1.6911699046153884e-06, + "loss": 0.8142, + "step": 3230 + }, + { + "epoch": 0.7855579868708972, + "grad_norm": 20.875, + "learning_rate": 1.6907177083388693e-06, + "loss": 0.8175, + "step": 3231 + }, + { + "epoch": 0.7858011184050572, + "grad_norm": 21.125, + "learning_rate": 1.690265446189478e-06, + "loss": 0.7351, + "step": 3232 + }, + { + "epoch": 0.7860442499392171, + "grad_norm": 26.875, + "learning_rate": 1.6898131182348118e-06, + "loss": 1.2597, + "step": 3233 + }, + { + "epoch": 0.7862873814733771, + "grad_norm": 22.5, + "learning_rate": 1.6893607245424792e-06, + "loss": 0.9505, + "step": 3234 + }, + { + "epoch": 0.7865305130075371, + "grad_norm": 19.375, + "learning_rate": 1.6889082651800983e-06, + "loss": 0.994, + "step": 3235 + }, + { + "epoch": 0.7867736445416971, + "grad_norm": 41.0, + "learning_rate": 1.6884557402152971e-06, + "loss": 0.8545, + "step": 3236 + }, + { + "epoch": 0.7870167760758571, + "grad_norm": 29.25, + "learning_rate": 1.6880031497157133e-06, + "loss": 0.4975, + "step": 3237 + }, + { + "epoch": 0.7872599076100171, + "grad_norm": 22.75, + "learning_rate": 1.687550493748994e-06, + "loss": 1.159, + "step": 3238 + }, + { + "epoch": 0.787503039144177, + "grad_norm": 28.625, + "learning_rate": 1.6870977723827963e-06, + "loss": 1.4826, + "step": 3239 + }, + { + "epoch": 0.787746170678337, + "grad_norm": 28.125, + "learning_rate": 1.686644985684788e-06, + "loss": 1.3861, + "step": 3240 + }, + { + "epoch": 0.787989302212497, + "grad_norm": 21.25, + "learning_rate": 1.6861921337226453e-06, + "loss": 1.2006, + "step": 3241 + }, + { + "epoch": 0.788232433746657, + "grad_norm": 25.5, + "learning_rate": 1.685739216564055e-06, + "loss": 1.059, + "step": 3242 + }, + { + "epoch": 0.788475565280817, + "grad_norm": 14.5625, + "learning_rate": 1.6852862342767132e-06, + "loss": 0.6043, + "step": 3243 + }, + { + "epoch": 0.7887186968149769, + "grad_norm": 16.75, + "learning_rate": 1.6848331869283263e-06, + "loss": 0.6907, + "step": 3244 + }, + { + "epoch": 0.7889618283491369, + "grad_norm": 17.375, + "learning_rate": 1.6843800745866095e-06, + "loss": 0.6402, + "step": 3245 + }, + { + "epoch": 0.7892049598832969, + "grad_norm": 17.125, + "learning_rate": 1.6839268973192888e-06, + "loss": 0.814, + "step": 3246 + }, + { + "epoch": 0.7894480914174569, + "grad_norm": 18.375, + "learning_rate": 1.683473655194099e-06, + "loss": 0.4058, + "step": 3247 + }, + { + "epoch": 0.7896912229516169, + "grad_norm": 30.75, + "learning_rate": 1.683020348278785e-06, + "loss": 1.2858, + "step": 3248 + }, + { + "epoch": 0.7899343544857768, + "grad_norm": 19.75, + "learning_rate": 1.6825669766411015e-06, + "loss": 1.006, + "step": 3249 + }, + { + "epoch": 0.7901774860199368, + "grad_norm": 22.25, + "learning_rate": 1.6821135403488126e-06, + "loss": 0.8786, + "step": 3250 + }, + { + "epoch": 0.7904206175540968, + "grad_norm": 26.125, + "learning_rate": 1.6816600394696924e-06, + "loss": 1.0818, + "step": 3251 + }, + { + "epoch": 0.7906637490882568, + "grad_norm": 27.75, + "learning_rate": 1.681206474071524e-06, + "loss": 1.0326, + "step": 3252 + }, + { + "epoch": 0.7909068806224168, + "grad_norm": 21.75, + "learning_rate": 1.680752844222101e-06, + "loss": 1.0456, + "step": 3253 + }, + { + "epoch": 0.7911500121565767, + "grad_norm": 23.625, + "learning_rate": 1.680299149989226e-06, + "loss": 0.6997, + "step": 3254 + }, + { + "epoch": 0.7913931436907367, + "grad_norm": 29.25, + "learning_rate": 1.6798453914407115e-06, + "loss": 0.9938, + "step": 3255 + }, + { + "epoch": 0.7916362752248967, + "grad_norm": 22.5, + "learning_rate": 1.6793915686443798e-06, + "loss": 0.9355, + "step": 3256 + }, + { + "epoch": 0.7918794067590567, + "grad_norm": 17.5, + "learning_rate": 1.6789376816680622e-06, + "loss": 0.5805, + "step": 3257 + }, + { + "epoch": 0.7921225382932167, + "grad_norm": 24.25, + "learning_rate": 1.6784837305796e-06, + "loss": 1.3705, + "step": 3258 + }, + { + "epoch": 0.7923656698273767, + "grad_norm": 19.5, + "learning_rate": 1.6780297154468442e-06, + "loss": 1.0128, + "step": 3259 + }, + { + "epoch": 0.7926088013615366, + "grad_norm": 13.4375, + "learning_rate": 1.677575636337655e-06, + "loss": 0.3846, + "step": 3260 + }, + { + "epoch": 0.7928519328956966, + "grad_norm": 25.0, + "learning_rate": 1.6771214933199027e-06, + "loss": 1.0039, + "step": 3261 + }, + { + "epoch": 0.7930950644298566, + "grad_norm": 21.0, + "learning_rate": 1.6766672864614658e-06, + "loss": 1.1159, + "step": 3262 + }, + { + "epoch": 0.7933381959640166, + "grad_norm": 20.5, + "learning_rate": 1.676213015830235e-06, + "loss": 1.0886, + "step": 3263 + }, + { + "epoch": 0.7935813274981766, + "grad_norm": 18.75, + "learning_rate": 1.6757586814941078e-06, + "loss": 1.0525, + "step": 3264 + }, + { + "epoch": 0.7938244590323364, + "grad_norm": 19.0, + "learning_rate": 1.6753042835209924e-06, + "loss": 1.0886, + "step": 3265 + }, + { + "epoch": 0.7940675905664965, + "grad_norm": 16.625, + "learning_rate": 1.6748498219788067e-06, + "loss": 0.6404, + "step": 3266 + }, + { + "epoch": 0.7943107221006565, + "grad_norm": 18.875, + "learning_rate": 1.6743952969354777e-06, + "loss": 0.972, + "step": 3267 + }, + { + "epoch": 0.7945538536348165, + "grad_norm": 16.25, + "learning_rate": 1.673940708458942e-06, + "loss": 0.5333, + "step": 3268 + }, + { + "epoch": 0.7947969851689765, + "grad_norm": 22.25, + "learning_rate": 1.6734860566171454e-06, + "loss": 0.8507, + "step": 3269 + }, + { + "epoch": 0.7950401167031363, + "grad_norm": 15.5625, + "learning_rate": 1.6730313414780442e-06, + "loss": 0.6042, + "step": 3270 + }, + { + "epoch": 0.7952832482372963, + "grad_norm": 16.0, + "learning_rate": 1.672576563109603e-06, + "loss": 0.6339, + "step": 3271 + }, + { + "epoch": 0.7955263797714563, + "grad_norm": 14.4375, + "learning_rate": 1.6721217215797961e-06, + "loss": 0.7245, + "step": 3272 + }, + { + "epoch": 0.7957695113056164, + "grad_norm": 18.125, + "learning_rate": 1.6716668169566074e-06, + "loss": 0.7123, + "step": 3273 + }, + { + "epoch": 0.7960126428397764, + "grad_norm": 19.25, + "learning_rate": 1.671211849308031e-06, + "loss": 0.8693, + "step": 3274 + }, + { + "epoch": 0.7962557743739364, + "grad_norm": 23.25, + "learning_rate": 1.670756818702069e-06, + "loss": 0.9725, + "step": 3275 + }, + { + "epoch": 0.7964989059080962, + "grad_norm": 19.25, + "learning_rate": 1.6703017252067336e-06, + "loss": 1.1398, + "step": 3276 + }, + { + "epoch": 0.7967420374422562, + "grad_norm": 23.625, + "learning_rate": 1.6698465688900462e-06, + "loss": 0.9758, + "step": 3277 + }, + { + "epoch": 0.7969851689764162, + "grad_norm": 21.75, + "learning_rate": 1.6693913498200383e-06, + "loss": 0.9211, + "step": 3278 + }, + { + "epoch": 0.7972283005105762, + "grad_norm": 16.5, + "learning_rate": 1.6689360680647504e-06, + "loss": 0.589, + "step": 3279 + }, + { + "epoch": 0.7974714320447363, + "grad_norm": 20.875, + "learning_rate": 1.6684807236922318e-06, + "loss": 0.9694, + "step": 3280 + }, + { + "epoch": 0.7977145635788961, + "grad_norm": 14.0, + "learning_rate": 1.6680253167705409e-06, + "loss": 0.4407, + "step": 3281 + }, + { + "epoch": 0.7979576951130561, + "grad_norm": 16.0, + "learning_rate": 1.6675698473677473e-06, + "loss": 0.7154, + "step": 3282 + }, + { + "epoch": 0.7982008266472161, + "grad_norm": 14.125, + "learning_rate": 1.6671143155519286e-06, + "loss": 0.6744, + "step": 3283 + }, + { + "epoch": 0.7984439581813761, + "grad_norm": 14.125, + "learning_rate": 1.6666587213911714e-06, + "loss": 0.4512, + "step": 3284 + }, + { + "epoch": 0.7986870897155361, + "grad_norm": 21.25, + "learning_rate": 1.6662030649535725e-06, + "loss": 1.0703, + "step": 3285 + }, + { + "epoch": 0.798930221249696, + "grad_norm": 15.3125, + "learning_rate": 1.665747346307237e-06, + "loss": 0.6652, + "step": 3286 + }, + { + "epoch": 0.799173352783856, + "grad_norm": 18.5, + "learning_rate": 1.665291565520281e-06, + "loss": 0.7123, + "step": 3287 + }, + { + "epoch": 0.799416484318016, + "grad_norm": 18.0, + "learning_rate": 1.6648357226608281e-06, + "loss": 0.6371, + "step": 3288 + }, + { + "epoch": 0.799659615852176, + "grad_norm": 20.125, + "learning_rate": 1.6643798177970113e-06, + "loss": 0.9583, + "step": 3289 + }, + { + "epoch": 0.799902747386336, + "grad_norm": 18.5, + "learning_rate": 1.663923850996975e-06, + "loss": 1.0215, + "step": 3290 + }, + { + "epoch": 0.8001458789204959, + "grad_norm": 19.125, + "learning_rate": 1.66346782232887e-06, + "loss": 0.9595, + "step": 3291 + }, + { + "epoch": 0.8003890104546559, + "grad_norm": 17.875, + "learning_rate": 1.6630117318608585e-06, + "loss": 0.9341, + "step": 3292 + }, + { + "epoch": 0.8006321419888159, + "grad_norm": 11.1875, + "learning_rate": 1.6625555796611102e-06, + "loss": 0.2916, + "step": 3293 + }, + { + "epoch": 0.8008752735229759, + "grad_norm": 19.5, + "learning_rate": 1.6620993657978054e-06, + "loss": 0.9743, + "step": 3294 + }, + { + "epoch": 0.8011184050571359, + "grad_norm": 23.375, + "learning_rate": 1.6616430903391337e-06, + "loss": 1.0148, + "step": 3295 + }, + { + "epoch": 0.8013615365912959, + "grad_norm": 22.875, + "learning_rate": 1.6611867533532921e-06, + "loss": 1.2657, + "step": 3296 + }, + { + "epoch": 0.8016046681254558, + "grad_norm": 20.0, + "learning_rate": 1.6607303549084897e-06, + "loss": 1.0109, + "step": 3297 + }, + { + "epoch": 0.8018477996596158, + "grad_norm": 29.625, + "learning_rate": 1.6602738950729417e-06, + "loss": 1.0593, + "step": 3298 + }, + { + "epoch": 0.8020909311937758, + "grad_norm": 17.625, + "learning_rate": 1.6598173739148744e-06, + "loss": 0.6808, + "step": 3299 + }, + { + "epoch": 0.8023340627279358, + "grad_norm": 15.1875, + "learning_rate": 1.6593607915025227e-06, + "loss": 0.5933, + "step": 3300 + }, + { + "epoch": 0.8025771942620958, + "grad_norm": 14.625, + "learning_rate": 1.658904147904131e-06, + "loss": 0.5634, + "step": 3301 + }, + { + "epoch": 0.8028203257962557, + "grad_norm": 14.75, + "learning_rate": 1.6584474431879527e-06, + "loss": 0.6136, + "step": 3302 + }, + { + "epoch": 0.8030634573304157, + "grad_norm": 18.875, + "learning_rate": 1.6579906774222493e-06, + "loss": 0.7261, + "step": 3303 + }, + { + "epoch": 0.8033065888645757, + "grad_norm": 16.875, + "learning_rate": 1.657533850675293e-06, + "loss": 0.5733, + "step": 3304 + }, + { + "epoch": 0.8035497203987357, + "grad_norm": 23.375, + "learning_rate": 1.6570769630153643e-06, + "loss": 0.7423, + "step": 3305 + }, + { + "epoch": 0.8037928519328957, + "grad_norm": 17.375, + "learning_rate": 1.6566200145107534e-06, + "loss": 0.5861, + "step": 3306 + }, + { + "epoch": 0.8040359834670556, + "grad_norm": 20.625, + "learning_rate": 1.6561630052297586e-06, + "loss": 0.9351, + "step": 3307 + }, + { + "epoch": 0.8042791150012156, + "grad_norm": 23.375, + "learning_rate": 1.655705935240688e-06, + "loss": 1.3245, + "step": 3308 + }, + { + "epoch": 0.8045222465353756, + "grad_norm": 16.75, + "learning_rate": 1.6552488046118588e-06, + "loss": 0.7572, + "step": 3309 + }, + { + "epoch": 0.8047653780695356, + "grad_norm": 15.5, + "learning_rate": 1.6547916134115964e-06, + "loss": 0.4335, + "step": 3310 + }, + { + "epoch": 0.8050085096036956, + "grad_norm": 15.5625, + "learning_rate": 1.6543343617082364e-06, + "loss": 0.6441, + "step": 3311 + }, + { + "epoch": 0.8052516411378556, + "grad_norm": 16.75, + "learning_rate": 1.653877049570123e-06, + "loss": 0.798, + "step": 3312 + }, + { + "epoch": 0.8054947726720155, + "grad_norm": 18.375, + "learning_rate": 1.6534196770656097e-06, + "loss": 0.6102, + "step": 3313 + }, + { + "epoch": 0.8057379042061755, + "grad_norm": 20.875, + "learning_rate": 1.6529622442630583e-06, + "loss": 1.1213, + "step": 3314 + }, + { + "epoch": 0.8059810357403355, + "grad_norm": 20.625, + "learning_rate": 1.6525047512308398e-06, + "loss": 0.8487, + "step": 3315 + }, + { + "epoch": 0.8062241672744955, + "grad_norm": 61.5, + "learning_rate": 1.6520471980373348e-06, + "loss": 0.8042, + "step": 3316 + }, + { + "epoch": 0.8064672988086555, + "grad_norm": 17.75, + "learning_rate": 1.6515895847509325e-06, + "loss": 0.9871, + "step": 3317 + }, + { + "epoch": 0.8067104303428154, + "grad_norm": 17.5, + "learning_rate": 1.6511319114400308e-06, + "loss": 0.6412, + "step": 3318 + }, + { + "epoch": 0.8069535618769754, + "grad_norm": 20.125, + "learning_rate": 1.6506741781730379e-06, + "loss": 0.6683, + "step": 3319 + }, + { + "epoch": 0.8071966934111354, + "grad_norm": 18.0, + "learning_rate": 1.6502163850183683e-06, + "loss": 0.94, + "step": 3320 + }, + { + "epoch": 0.8074398249452954, + "grad_norm": 38.5, + "learning_rate": 1.6497585320444487e-06, + "loss": 1.5201, + "step": 3321 + }, + { + "epoch": 0.8076829564794554, + "grad_norm": 21.625, + "learning_rate": 1.649300619319712e-06, + "loss": 0.8041, + "step": 3322 + }, + { + "epoch": 0.8079260880136153, + "grad_norm": 27.875, + "learning_rate": 1.6488426469126017e-06, + "loss": 0.9246, + "step": 3323 + }, + { + "epoch": 0.8081692195477753, + "grad_norm": 17.125, + "learning_rate": 1.6483846148915698e-06, + "loss": 0.7319, + "step": 3324 + }, + { + "epoch": 0.8084123510819353, + "grad_norm": 19.25, + "learning_rate": 1.6479265233250763e-06, + "loss": 1.2679, + "step": 3325 + }, + { + "epoch": 0.8086554826160953, + "grad_norm": 22.125, + "learning_rate": 1.647468372281592e-06, + "loss": 0.7942, + "step": 3326 + }, + { + "epoch": 0.8088986141502553, + "grad_norm": 25.125, + "learning_rate": 1.6470101618295946e-06, + "loss": 1.0763, + "step": 3327 + }, + { + "epoch": 0.8091417456844152, + "grad_norm": 24.125, + "learning_rate": 1.6465518920375723e-06, + "loss": 1.0571, + "step": 3328 + }, + { + "epoch": 0.8093848772185752, + "grad_norm": 17.625, + "learning_rate": 1.6460935629740207e-06, + "loss": 0.8949, + "step": 3329 + }, + { + "epoch": 0.8096280087527352, + "grad_norm": 20.25, + "learning_rate": 1.6456351747074454e-06, + "loss": 1.1852, + "step": 3330 + }, + { + "epoch": 0.8098711402868952, + "grad_norm": 18.625, + "learning_rate": 1.6451767273063605e-06, + "loss": 0.7703, + "step": 3331 + }, + { + "epoch": 0.8101142718210552, + "grad_norm": 19.0, + "learning_rate": 1.6447182208392887e-06, + "loss": 1.0443, + "step": 3332 + }, + { + "epoch": 0.8103574033552152, + "grad_norm": 16.5, + "learning_rate": 1.644259655374762e-06, + "loss": 0.7272, + "step": 3333 + }, + { + "epoch": 0.8106005348893751, + "grad_norm": 25.125, + "learning_rate": 1.6438010309813202e-06, + "loss": 1.2095, + "step": 3334 + }, + { + "epoch": 0.8108436664235351, + "grad_norm": 16.0, + "learning_rate": 1.6433423477275134e-06, + "loss": 0.9695, + "step": 3335 + }, + { + "epoch": 0.8110867979576951, + "grad_norm": 25.375, + "learning_rate": 1.6428836056818995e-06, + "loss": 1.247, + "step": 3336 + }, + { + "epoch": 0.8113299294918551, + "grad_norm": 18.625, + "learning_rate": 1.6424248049130453e-06, + "loss": 1.0522, + "step": 3337 + }, + { + "epoch": 0.8115730610260151, + "grad_norm": 20.0, + "learning_rate": 1.641965945489527e-06, + "loss": 0.7952, + "step": 3338 + }, + { + "epoch": 0.811816192560175, + "grad_norm": 20.625, + "learning_rate": 1.641507027479928e-06, + "loss": 0.8989, + "step": 3339 + }, + { + "epoch": 0.812059324094335, + "grad_norm": 16.5, + "learning_rate": 1.6410480509528427e-06, + "loss": 0.5721, + "step": 3340 + }, + { + "epoch": 0.812302455628495, + "grad_norm": 21.5, + "learning_rate": 1.6405890159768722e-06, + "loss": 1.0475, + "step": 3341 + }, + { + "epoch": 0.812545587162655, + "grad_norm": 19.125, + "learning_rate": 1.640129922620628e-06, + "loss": 1.0032, + "step": 3342 + }, + { + "epoch": 0.812788718696815, + "grad_norm": 12.5, + "learning_rate": 1.6396707709527287e-06, + "loss": 0.3883, + "step": 3343 + }, + { + "epoch": 0.8130318502309749, + "grad_norm": 20.75, + "learning_rate": 1.639211561041803e-06, + "loss": 0.6252, + "step": 3344 + }, + { + "epoch": 0.8132749817651349, + "grad_norm": 17.25, + "learning_rate": 1.6387522929564874e-06, + "loss": 0.5771, + "step": 3345 + }, + { + "epoch": 0.8135181132992949, + "grad_norm": 40.5, + "learning_rate": 1.6382929667654278e-06, + "loss": 1.5006, + "step": 3346 + }, + { + "epoch": 0.8137612448334549, + "grad_norm": 17.75, + "learning_rate": 1.6378335825372786e-06, + "loss": 0.7383, + "step": 3347 + }, + { + "epoch": 0.8140043763676149, + "grad_norm": 20.125, + "learning_rate": 1.6373741403407018e-06, + "loss": 1.3033, + "step": 3348 + }, + { + "epoch": 0.8142475079017749, + "grad_norm": 16.625, + "learning_rate": 1.6369146402443698e-06, + "loss": 0.6044, + "step": 3349 + }, + { + "epoch": 0.8144906394359348, + "grad_norm": 17.25, + "learning_rate": 1.6364550823169625e-06, + "loss": 0.704, + "step": 3350 + }, + { + "epoch": 0.8147337709700948, + "grad_norm": 19.125, + "learning_rate": 1.6359954666271688e-06, + "loss": 1.004, + "step": 3351 + }, + { + "epoch": 0.8149769025042548, + "grad_norm": 13.9375, + "learning_rate": 1.6355357932436863e-06, + "loss": 0.4525, + "step": 3352 + }, + { + "epoch": 0.8152200340384148, + "grad_norm": 22.5, + "learning_rate": 1.635076062235221e-06, + "loss": 1.1442, + "step": 3353 + }, + { + "epoch": 0.8154631655725748, + "grad_norm": 25.625, + "learning_rate": 1.6346162736704878e-06, + "loss": 1.3611, + "step": 3354 + }, + { + "epoch": 0.8157062971067347, + "grad_norm": 14.8125, + "learning_rate": 1.6341564276182097e-06, + "loss": 0.8538, + "step": 3355 + }, + { + "epoch": 0.8159494286408947, + "grad_norm": 18.125, + "learning_rate": 1.6336965241471193e-06, + "loss": 0.6753, + "step": 3356 + }, + { + "epoch": 0.8161925601750547, + "grad_norm": 14.25, + "learning_rate": 1.6332365633259568e-06, + "loss": 0.7703, + "step": 3357 + }, + { + "epoch": 0.8164356917092147, + "grad_norm": 15.0, + "learning_rate": 1.6327765452234706e-06, + "loss": 0.4715, + "step": 3358 + }, + { + "epoch": 0.8166788232433747, + "grad_norm": 16.5, + "learning_rate": 1.6323164699084193e-06, + "loss": 0.6636, + "step": 3359 + }, + { + "epoch": 0.8169219547775346, + "grad_norm": 25.25, + "learning_rate": 1.6318563374495686e-06, + "loss": 1.0865, + "step": 3360 + }, + { + "epoch": 0.8171650863116946, + "grad_norm": 17.125, + "learning_rate": 1.6313961479156935e-06, + "loss": 0.5912, + "step": 3361 + }, + { + "epoch": 0.8174082178458546, + "grad_norm": 14.3125, + "learning_rate": 1.6309359013755772e-06, + "loss": 0.2972, + "step": 3362 + }, + { + "epoch": 0.8176513493800146, + "grad_norm": 18.875, + "learning_rate": 1.630475597898011e-06, + "loss": 0.9035, + "step": 3363 + }, + { + "epoch": 0.8178944809141746, + "grad_norm": 19.75, + "learning_rate": 1.6300152375517964e-06, + "loss": 0.9052, + "step": 3364 + }, + { + "epoch": 0.8181376124483345, + "grad_norm": 13.5625, + "learning_rate": 1.629554820405741e-06, + "loss": 0.411, + "step": 3365 + }, + { + "epoch": 0.8183807439824945, + "grad_norm": 24.25, + "learning_rate": 1.6290943465286623e-06, + "loss": 0.6659, + "step": 3366 + }, + { + "epoch": 0.8186238755166545, + "grad_norm": 22.5, + "learning_rate": 1.6286338159893867e-06, + "loss": 0.7502, + "step": 3367 + }, + { + "epoch": 0.8188670070508145, + "grad_norm": 22.875, + "learning_rate": 1.6281732288567482e-06, + "loss": 0.9068, + "step": 3368 + }, + { + "epoch": 0.8191101385849745, + "grad_norm": 22.125, + "learning_rate": 1.6277125851995892e-06, + "loss": 1.0115, + "step": 3369 + }, + { + "epoch": 0.8193532701191345, + "grad_norm": 23.0, + "learning_rate": 1.6272518850867609e-06, + "loss": 0.9395, + "step": 3370 + }, + { + "epoch": 0.8195964016532944, + "grad_norm": 23.25, + "learning_rate": 1.6267911285871233e-06, + "loss": 0.9708, + "step": 3371 + }, + { + "epoch": 0.8198395331874544, + "grad_norm": 22.0, + "learning_rate": 1.6263303157695438e-06, + "loss": 1.1156, + "step": 3372 + }, + { + "epoch": 0.8200826647216144, + "grad_norm": 18.625, + "learning_rate": 1.625869446702899e-06, + "loss": 0.7308, + "step": 3373 + }, + { + "epoch": 0.8203257962557744, + "grad_norm": 18.375, + "learning_rate": 1.6254085214560743e-06, + "loss": 0.7755, + "step": 3374 + }, + { + "epoch": 0.8205689277899344, + "grad_norm": 17.75, + "learning_rate": 1.6249475400979625e-06, + "loss": 0.8015, + "step": 3375 + }, + { + "epoch": 0.8208120593240943, + "grad_norm": 18.625, + "learning_rate": 1.6244865026974654e-06, + "loss": 0.7887, + "step": 3376 + }, + { + "epoch": 0.8210551908582543, + "grad_norm": 16.125, + "learning_rate": 1.6240254093234925e-06, + "loss": 0.7635, + "step": 3377 + }, + { + "epoch": 0.8212983223924143, + "grad_norm": 14.75, + "learning_rate": 1.6235642600449628e-06, + "loss": 0.5193, + "step": 3378 + }, + { + "epoch": 0.8215414539265743, + "grad_norm": 21.125, + "learning_rate": 1.6231030549308024e-06, + "loss": 0.6491, + "step": 3379 + }, + { + "epoch": 0.8217845854607343, + "grad_norm": 21.0, + "learning_rate": 1.622641794049947e-06, + "loss": 0.8781, + "step": 3380 + }, + { + "epoch": 0.8220277169948942, + "grad_norm": 17.625, + "learning_rate": 1.6221804774713397e-06, + "loss": 1.0416, + "step": 3381 + }, + { + "epoch": 0.8222708485290542, + "grad_norm": 17.625, + "learning_rate": 1.6217191052639323e-06, + "loss": 0.6293, + "step": 3382 + }, + { + "epoch": 0.8225139800632142, + "grad_norm": 16.875, + "learning_rate": 1.6212576774966848e-06, + "loss": 0.6833, + "step": 3383 + }, + { + "epoch": 0.8227571115973742, + "grad_norm": 22.75, + "learning_rate": 1.6207961942385655e-06, + "loss": 1.1186, + "step": 3384 + }, + { + "epoch": 0.8230002431315342, + "grad_norm": 16.25, + "learning_rate": 1.620334655558551e-06, + "loss": 0.8758, + "step": 3385 + }, + { + "epoch": 0.8232433746656942, + "grad_norm": 17.75, + "learning_rate": 1.6198730615256267e-06, + "loss": 0.9285, + "step": 3386 + }, + { + "epoch": 0.8234865061998541, + "grad_norm": 17.75, + "learning_rate": 1.6194114122087852e-06, + "loss": 0.6273, + "step": 3387 + }, + { + "epoch": 0.8237296377340141, + "grad_norm": 20.25, + "learning_rate": 1.6189497076770282e-06, + "loss": 0.8995, + "step": 3388 + }, + { + "epoch": 0.8239727692681741, + "grad_norm": 19.0, + "learning_rate": 1.618487947999365e-06, + "loss": 0.6926, + "step": 3389 + }, + { + "epoch": 0.8242159008023341, + "grad_norm": 16.875, + "learning_rate": 1.6180261332448146e-06, + "loss": 0.6152, + "step": 3390 + }, + { + "epoch": 0.8244590323364941, + "grad_norm": 22.25, + "learning_rate": 1.6175642634824025e-06, + "loss": 1.3543, + "step": 3391 + }, + { + "epoch": 0.824702163870654, + "grad_norm": 21.25, + "learning_rate": 1.6171023387811627e-06, + "loss": 1.1115, + "step": 3392 + }, + { + "epoch": 0.824945295404814, + "grad_norm": 18.75, + "learning_rate": 1.6166403592101384e-06, + "loss": 0.8612, + "step": 3393 + }, + { + "epoch": 0.825188426938974, + "grad_norm": 16.875, + "learning_rate": 1.6161783248383805e-06, + "loss": 0.5647, + "step": 3394 + }, + { + "epoch": 0.825431558473134, + "grad_norm": 18.5, + "learning_rate": 1.6157162357349482e-06, + "loss": 0.7841, + "step": 3395 + }, + { + "epoch": 0.825674690007294, + "grad_norm": 16.625, + "learning_rate": 1.6152540919689077e-06, + "loss": 0.5308, + "step": 3396 + }, + { + "epoch": 0.8259178215414539, + "grad_norm": 30.5, + "learning_rate": 1.6147918936093355e-06, + "loss": 1.1225, + "step": 3397 + }, + { + "epoch": 0.8261609530756139, + "grad_norm": 21.75, + "learning_rate": 1.6143296407253142e-06, + "loss": 0.9388, + "step": 3398 + }, + { + "epoch": 0.8264040846097739, + "grad_norm": 27.5, + "learning_rate": 1.613867333385936e-06, + "loss": 1.1364, + "step": 3399 + }, + { + "epoch": 0.8266472161439339, + "grad_norm": 18.5, + "learning_rate": 1.613404971660301e-06, + "loss": 0.8135, + "step": 3400 + }, + { + "epoch": 0.8268903476780939, + "grad_norm": 14.625, + "learning_rate": 1.612942555617516e-06, + "loss": 0.5177, + "step": 3401 + }, + { + "epoch": 0.8271334792122538, + "grad_norm": 21.75, + "learning_rate": 1.6124800853266986e-06, + "loss": 1.094, + "step": 3402 + }, + { + "epoch": 0.8273766107464138, + "grad_norm": 24.625, + "learning_rate": 1.6120175608569718e-06, + "loss": 0.786, + "step": 3403 + }, + { + "epoch": 0.8276197422805738, + "grad_norm": 20.125, + "learning_rate": 1.6115549822774684e-06, + "loss": 0.6718, + "step": 3404 + }, + { + "epoch": 0.8278628738147338, + "grad_norm": 20.875, + "learning_rate": 1.6110923496573283e-06, + "loss": 0.68, + "step": 3405 + }, + { + "epoch": 0.8281060053488938, + "grad_norm": 21.0, + "learning_rate": 1.6106296630657005e-06, + "loss": 1.1493, + "step": 3406 + }, + { + "epoch": 0.8283491368830538, + "grad_norm": 20.75, + "learning_rate": 1.6101669225717417e-06, + "loss": 0.554, + "step": 3407 + }, + { + "epoch": 0.8285922684172137, + "grad_norm": 23.25, + "learning_rate": 1.6097041282446152e-06, + "loss": 1.284, + "step": 3408 + }, + { + "epoch": 0.8288353999513737, + "grad_norm": 19.875, + "learning_rate": 1.6092412801534949e-06, + "loss": 0.8646, + "step": 3409 + }, + { + "epoch": 0.8290785314855337, + "grad_norm": 16.75, + "learning_rate": 1.6087783783675611e-06, + "loss": 0.7641, + "step": 3410 + }, + { + "epoch": 0.8293216630196937, + "grad_norm": 24.5, + "learning_rate": 1.6083154229560022e-06, + "loss": 1.0882, + "step": 3411 + }, + { + "epoch": 0.8295647945538537, + "grad_norm": 15.1875, + "learning_rate": 1.607852413988015e-06, + "loss": 0.4787, + "step": 3412 + }, + { + "epoch": 0.8298079260880136, + "grad_norm": 17.625, + "learning_rate": 1.607389351532804e-06, + "loss": 0.78, + "step": 3413 + }, + { + "epoch": 0.8300510576221736, + "grad_norm": 24.5, + "learning_rate": 1.6069262356595827e-06, + "loss": 0.9327, + "step": 3414 + }, + { + "epoch": 0.8302941891563336, + "grad_norm": 23.625, + "learning_rate": 1.6064630664375705e-06, + "loss": 1.0338, + "step": 3415 + }, + { + "epoch": 0.8305373206904936, + "grad_norm": 20.875, + "learning_rate": 1.6059998439359967e-06, + "loss": 0.9083, + "step": 3416 + }, + { + "epoch": 0.8307804522246536, + "grad_norm": 20.375, + "learning_rate": 1.6055365682240985e-06, + "loss": 1.0087, + "step": 3417 + }, + { + "epoch": 0.8310235837588135, + "grad_norm": 20.875, + "learning_rate": 1.6050732393711193e-06, + "loss": 0.9165, + "step": 3418 + }, + { + "epoch": 0.8312667152929735, + "grad_norm": 28.0, + "learning_rate": 1.6046098574463126e-06, + "loss": 1.3364, + "step": 3419 + }, + { + "epoch": 0.8315098468271335, + "grad_norm": 13.5625, + "learning_rate": 1.6041464225189376e-06, + "loss": 0.5851, + "step": 3420 + }, + { + "epoch": 0.8317529783612935, + "grad_norm": 16.25, + "learning_rate": 1.603682934658264e-06, + "loss": 0.3832, + "step": 3421 + }, + { + "epoch": 0.8319961098954535, + "grad_norm": 21.5, + "learning_rate": 1.6032193939335676e-06, + "loss": 0.7287, + "step": 3422 + }, + { + "epoch": 0.8322392414296135, + "grad_norm": 21.875, + "learning_rate": 1.6027558004141323e-06, + "loss": 0.9834, + "step": 3423 + }, + { + "epoch": 0.8324823729637734, + "grad_norm": 19.375, + "learning_rate": 1.6022921541692501e-06, + "loss": 1.0242, + "step": 3424 + }, + { + "epoch": 0.8327255044979334, + "grad_norm": 19.375, + "learning_rate": 1.6018284552682215e-06, + "loss": 0.9596, + "step": 3425 + }, + { + "epoch": 0.8329686360320934, + "grad_norm": 21.125, + "learning_rate": 1.6013647037803539e-06, + "loss": 0.8035, + "step": 3426 + }, + { + "epoch": 0.8332117675662534, + "grad_norm": 20.75, + "learning_rate": 1.6009008997749631e-06, + "loss": 1.0939, + "step": 3427 + }, + { + "epoch": 0.8334548991004134, + "grad_norm": 16.625, + "learning_rate": 1.600437043321372e-06, + "loss": 0.4163, + "step": 3428 + }, + { + "epoch": 0.8336980306345733, + "grad_norm": 23.625, + "learning_rate": 1.5999731344889132e-06, + "loss": 1.03, + "step": 3429 + }, + { + "epoch": 0.8339411621687333, + "grad_norm": 15.3125, + "learning_rate": 1.599509173346925e-06, + "loss": 0.7969, + "step": 3430 + }, + { + "epoch": 0.8341842937028933, + "grad_norm": 17.875, + "learning_rate": 1.599045159964755e-06, + "loss": 1.0562, + "step": 3431 + }, + { + "epoch": 0.8344274252370533, + "grad_norm": 32.5, + "learning_rate": 1.598581094411757e-06, + "loss": 0.8758, + "step": 3432 + }, + { + "epoch": 0.8346705567712133, + "grad_norm": 24.0, + "learning_rate": 1.598116976757294e-06, + "loss": 1.3562, + "step": 3433 + }, + { + "epoch": 0.8349136883053732, + "grad_norm": 18.125, + "learning_rate": 1.5976528070707376e-06, + "loss": 0.7141, + "step": 3434 + }, + { + "epoch": 0.8351568198395332, + "grad_norm": 18.75, + "learning_rate": 1.5971885854214642e-06, + "loss": 0.9906, + "step": 3435 + }, + { + "epoch": 0.8353999513736932, + "grad_norm": 18.75, + "learning_rate": 1.596724311878861e-06, + "loss": 0.7147, + "step": 3436 + }, + { + "epoch": 0.8356430829078532, + "grad_norm": 20.25, + "learning_rate": 1.596259986512321e-06, + "loss": 0.7191, + "step": 3437 + }, + { + "epoch": 0.8358862144420132, + "grad_norm": 22.875, + "learning_rate": 1.5957956093912459e-06, + "loss": 1.1478, + "step": 3438 + }, + { + "epoch": 0.836129345976173, + "grad_norm": 31.5, + "learning_rate": 1.5953311805850448e-06, + "loss": 0.8481, + "step": 3439 + }, + { + "epoch": 0.8363724775103331, + "grad_norm": 16.375, + "learning_rate": 1.5948667001631352e-06, + "loss": 0.921, + "step": 3440 + }, + { + "epoch": 0.8366156090444931, + "grad_norm": 22.375, + "learning_rate": 1.594402168194941e-06, + "loss": 0.8647, + "step": 3441 + }, + { + "epoch": 0.8368587405786531, + "grad_norm": 21.5, + "learning_rate": 1.5939375847498944e-06, + "loss": 1.1712, + "step": 3442 + }, + { + "epoch": 0.8371018721128131, + "grad_norm": 21.625, + "learning_rate": 1.5934729498974362e-06, + "loss": 0.8327, + "step": 3443 + }, + { + "epoch": 0.8373450036469731, + "grad_norm": 23.375, + "learning_rate": 1.5930082637070132e-06, + "loss": 1.1919, + "step": 3444 + }, + { + "epoch": 0.837588135181133, + "grad_norm": 20.875, + "learning_rate": 1.5925435262480815e-06, + "loss": 0.7159, + "step": 3445 + }, + { + "epoch": 0.837831266715293, + "grad_norm": 17.0, + "learning_rate": 1.592078737590104e-06, + "loss": 0.8424, + "step": 3446 + }, + { + "epoch": 0.838074398249453, + "grad_norm": 17.625, + "learning_rate": 1.5916138978025509e-06, + "loss": 0.6072, + "step": 3447 + }, + { + "epoch": 0.838317529783613, + "grad_norm": 19.625, + "learning_rate": 1.591149006954901e-06, + "loss": 0.6674, + "step": 3448 + }, + { + "epoch": 0.838560661317773, + "grad_norm": 31.375, + "learning_rate": 1.5906840651166402e-06, + "loss": 1.2384, + "step": 3449 + }, + { + "epoch": 0.8388037928519329, + "grad_norm": 15.9375, + "learning_rate": 1.5902190723572622e-06, + "loss": 0.7587, + "step": 3450 + }, + { + "epoch": 0.8390469243860929, + "grad_norm": 22.75, + "learning_rate": 1.589754028746268e-06, + "loss": 1.1475, + "step": 3451 + }, + { + "epoch": 0.8392900559202529, + "grad_norm": 16.75, + "learning_rate": 1.5892889343531662e-06, + "loss": 0.665, + "step": 3452 + }, + { + "epoch": 0.8395331874544129, + "grad_norm": 29.5, + "learning_rate": 1.588823789247474e-06, + "loss": 1.2101, + "step": 3453 + }, + { + "epoch": 0.8397763189885729, + "grad_norm": 50.75, + "learning_rate": 1.588358593498714e-06, + "loss": 0.9499, + "step": 3454 + }, + { + "epoch": 0.8400194505227327, + "grad_norm": 15.375, + "learning_rate": 1.5878933471764192e-06, + "loss": 0.7118, + "step": 3455 + }, + { + "epoch": 0.8402625820568927, + "grad_norm": 17.5, + "learning_rate": 1.5874280503501278e-06, + "loss": 0.7053, + "step": 3456 + }, + { + "epoch": 0.8405057135910527, + "grad_norm": 19.25, + "learning_rate": 1.5869627030893867e-06, + "loss": 1.09, + "step": 3457 + }, + { + "epoch": 0.8407488451252128, + "grad_norm": 17.625, + "learning_rate": 1.5864973054637504e-06, + "loss": 1.0151, + "step": 3458 + }, + { + "epoch": 0.8409919766593728, + "grad_norm": 20.625, + "learning_rate": 1.5860318575427793e-06, + "loss": 0.9082, + "step": 3459 + }, + { + "epoch": 0.8412351081935328, + "grad_norm": 15.25, + "learning_rate": 1.5855663593960446e-06, + "loss": 0.7381, + "step": 3460 + }, + { + "epoch": 0.8414782397276926, + "grad_norm": 25.5, + "learning_rate": 1.585100811093122e-06, + "loss": 1.1652, + "step": 3461 + }, + { + "epoch": 0.8417213712618526, + "grad_norm": 21.75, + "learning_rate": 1.5846352127035952e-06, + "loss": 1.0499, + "step": 3462 + }, + { + "epoch": 0.8419645027960126, + "grad_norm": 22.625, + "learning_rate": 1.584169564297057e-06, + "loss": 0.8066, + "step": 3463 + }, + { + "epoch": 0.8422076343301726, + "grad_norm": 17.875, + "learning_rate": 1.5837038659431059e-06, + "loss": 0.79, + "step": 3464 + }, + { + "epoch": 0.8424507658643327, + "grad_norm": 17.375, + "learning_rate": 1.583238117711349e-06, + "loss": 0.822, + "step": 3465 + }, + { + "epoch": 0.8426938973984925, + "grad_norm": 17.375, + "learning_rate": 1.5827723196713998e-06, + "loss": 0.814, + "step": 3466 + }, + { + "epoch": 0.8429370289326525, + "grad_norm": 17.625, + "learning_rate": 1.5823064718928807e-06, + "loss": 0.6785, + "step": 3467 + }, + { + "epoch": 0.8431801604668125, + "grad_norm": 18.75, + "learning_rate": 1.58184057444542e-06, + "loss": 0.6204, + "step": 3468 + }, + { + "epoch": 0.8434232920009725, + "grad_norm": 19.75, + "learning_rate": 1.5813746273986541e-06, + "loss": 1.026, + "step": 3469 + }, + { + "epoch": 0.8436664235351325, + "grad_norm": 16.875, + "learning_rate": 1.5809086308222273e-06, + "loss": 0.8074, + "step": 3470 + }, + { + "epoch": 0.8439095550692924, + "grad_norm": 20.75, + "learning_rate": 1.5804425847857908e-06, + "loss": 0.9195, + "step": 3471 + }, + { + "epoch": 0.8441526866034524, + "grad_norm": 21.25, + "learning_rate": 1.5799764893590033e-06, + "loss": 1.2829, + "step": 3472 + }, + { + "epoch": 0.8443958181376124, + "grad_norm": 20.5, + "learning_rate": 1.5795103446115302e-06, + "loss": 0.7941, + "step": 3473 + }, + { + "epoch": 0.8446389496717724, + "grad_norm": 15.9375, + "learning_rate": 1.5790441506130453e-06, + "loss": 0.7508, + "step": 3474 + }, + { + "epoch": 0.8448820812059324, + "grad_norm": 18.75, + "learning_rate": 1.5785779074332292e-06, + "loss": 1.1244, + "step": 3475 + }, + { + "epoch": 0.8451252127400923, + "grad_norm": 19.25, + "learning_rate": 1.5781116151417703e-06, + "loss": 0.7926, + "step": 3476 + }, + { + "epoch": 0.8453683442742523, + "grad_norm": 24.625, + "learning_rate": 1.5776452738083637e-06, + "loss": 0.9534, + "step": 3477 + }, + { + "epoch": 0.8456114758084123, + "grad_norm": 17.625, + "learning_rate": 1.5771788835027122e-06, + "loss": 0.9546, + "step": 3478 + }, + { + "epoch": 0.8458546073425723, + "grad_norm": 16.875, + "learning_rate": 1.5767124442945264e-06, + "loss": 0.7319, + "step": 3479 + }, + { + "epoch": 0.8460977388767323, + "grad_norm": 15.125, + "learning_rate": 1.576245956253523e-06, + "loss": 0.7444, + "step": 3480 + }, + { + "epoch": 0.8463408704108923, + "grad_norm": 20.0, + "learning_rate": 1.575779419449427e-06, + "loss": 0.9531, + "step": 3481 + }, + { + "epoch": 0.8465840019450522, + "grad_norm": 20.875, + "learning_rate": 1.5753128339519702e-06, + "loss": 0.9928, + "step": 3482 + }, + { + "epoch": 0.8468271334792122, + "grad_norm": 20.5, + "learning_rate": 1.574846199830892e-06, + "loss": 0.7358, + "step": 3483 + }, + { + "epoch": 0.8470702650133722, + "grad_norm": 18.0, + "learning_rate": 1.5743795171559392e-06, + "loss": 0.78, + "step": 3484 + }, + { + "epoch": 0.8473133965475322, + "grad_norm": 17.0, + "learning_rate": 1.5739127859968652e-06, + "loss": 0.5908, + "step": 3485 + }, + { + "epoch": 0.8475565280816922, + "grad_norm": 20.875, + "learning_rate": 1.5734460064234314e-06, + "loss": 0.9047, + "step": 3486 + }, + { + "epoch": 0.8477996596158521, + "grad_norm": 15.6875, + "learning_rate": 1.5729791785054056e-06, + "loss": 0.5851, + "step": 3487 + }, + { + "epoch": 0.8480427911500121, + "grad_norm": 14.75, + "learning_rate": 1.5725123023125633e-06, + "loss": 0.9931, + "step": 3488 + }, + { + "epoch": 0.8482859226841721, + "grad_norm": 17.0, + "learning_rate": 1.572045377914688e-06, + "loss": 0.7318, + "step": 3489 + }, + { + "epoch": 0.8485290542183321, + "grad_norm": 17.875, + "learning_rate": 1.5715784053815687e-06, + "loss": 0.7543, + "step": 3490 + }, + { + "epoch": 0.8487721857524921, + "grad_norm": 17.25, + "learning_rate": 1.5711113847830029e-06, + "loss": 0.6945, + "step": 3491 + }, + { + "epoch": 0.849015317286652, + "grad_norm": 21.5, + "learning_rate": 1.5706443161887948e-06, + "loss": 0.6763, + "step": 3492 + }, + { + "epoch": 0.849258448820812, + "grad_norm": 22.625, + "learning_rate": 1.570177199668756e-06, + "loss": 1.0924, + "step": 3493 + }, + { + "epoch": 0.849501580354972, + "grad_norm": 23.125, + "learning_rate": 1.569710035292705e-06, + "loss": 0.9922, + "step": 3494 + }, + { + "epoch": 0.849744711889132, + "grad_norm": 24.125, + "learning_rate": 1.5692428231304676e-06, + "loss": 0.8303, + "step": 3495 + }, + { + "epoch": 0.849987843423292, + "grad_norm": 25.75, + "learning_rate": 1.5687755632518772e-06, + "loss": 0.9958, + "step": 3496 + }, + { + "epoch": 0.850230974957452, + "grad_norm": 14.5625, + "learning_rate": 1.5683082557267728e-06, + "loss": 0.3681, + "step": 3497 + }, + { + "epoch": 0.8504741064916119, + "grad_norm": 25.25, + "learning_rate": 1.567840900625003e-06, + "loss": 0.8429, + "step": 3498 + }, + { + "epoch": 0.8507172380257719, + "grad_norm": 23.75, + "learning_rate": 1.5673734980164204e-06, + "loss": 0.8081, + "step": 3499 + }, + { + "epoch": 0.8509603695599319, + "grad_norm": 23.0, + "learning_rate": 1.5669060479708878e-06, + "loss": 0.9996, + "step": 3500 + }, + { + "epoch": 0.8512035010940919, + "grad_norm": 19.125, + "learning_rate": 1.566438550558273e-06, + "loss": 0.906, + "step": 3501 + }, + { + "epoch": 0.8514466326282519, + "grad_norm": 16.375, + "learning_rate": 1.5659710058484518e-06, + "loss": 0.3945, + "step": 3502 + }, + { + "epoch": 0.8516897641624118, + "grad_norm": 21.5, + "learning_rate": 1.5655034139113072e-06, + "loss": 0.812, + "step": 3503 + }, + { + "epoch": 0.8519328956965718, + "grad_norm": 15.0625, + "learning_rate": 1.5650357748167278e-06, + "loss": 0.8199, + "step": 3504 + }, + { + "epoch": 0.8521760272307318, + "grad_norm": 23.0, + "learning_rate": 1.5645680886346112e-06, + "loss": 1.5766, + "step": 3505 + }, + { + "epoch": 0.8524191587648918, + "grad_norm": 18.0, + "learning_rate": 1.564100355434861e-06, + "loss": 0.7244, + "step": 3506 + }, + { + "epoch": 0.8526622902990518, + "grad_norm": 15.25, + "learning_rate": 1.563632575287388e-06, + "loss": 0.6229, + "step": 3507 + }, + { + "epoch": 0.8529054218332117, + "grad_norm": 19.75, + "learning_rate": 1.56316474826211e-06, + "loss": 0.9417, + "step": 3508 + }, + { + "epoch": 0.8531485533673717, + "grad_norm": 20.75, + "learning_rate": 1.5626968744289516e-06, + "loss": 0.8819, + "step": 3509 + }, + { + "epoch": 0.8533916849015317, + "grad_norm": 20.0, + "learning_rate": 1.5622289538578453e-06, + "loss": 0.9293, + "step": 3510 + }, + { + "epoch": 0.8536348164356917, + "grad_norm": 18.875, + "learning_rate": 1.5617609866187291e-06, + "loss": 0.8597, + "step": 3511 + }, + { + "epoch": 0.8538779479698517, + "grad_norm": 16.875, + "learning_rate": 1.5612929727815494e-06, + "loss": 0.6089, + "step": 3512 + }, + { + "epoch": 0.8541210795040116, + "grad_norm": 17.5, + "learning_rate": 1.5608249124162586e-06, + "loss": 0.7872, + "step": 3513 + }, + { + "epoch": 0.8543642110381716, + "grad_norm": 19.875, + "learning_rate": 1.5603568055928164e-06, + "loss": 0.9658, + "step": 3514 + }, + { + "epoch": 0.8546073425723316, + "grad_norm": 20.375, + "learning_rate": 1.5598886523811898e-06, + "loss": 0.8721, + "step": 3515 + }, + { + "epoch": 0.8548504741064916, + "grad_norm": 18.625, + "learning_rate": 1.559420452851352e-06, + "loss": 0.7256, + "step": 3516 + }, + { + "epoch": 0.8550936056406516, + "grad_norm": 19.75, + "learning_rate": 1.5589522070732838e-06, + "loss": 0.8953, + "step": 3517 + }, + { + "epoch": 0.8553367371748116, + "grad_norm": 16.875, + "learning_rate": 1.558483915116972e-06, + "loss": 0.6124, + "step": 3518 + }, + { + "epoch": 0.8555798687089715, + "grad_norm": 14.5, + "learning_rate": 1.5580155770524119e-06, + "loss": 0.4481, + "step": 3519 + }, + { + "epoch": 0.8558230002431315, + "grad_norm": 17.25, + "learning_rate": 1.557547192949604e-06, + "loss": 0.9589, + "step": 3520 + }, + { + "epoch": 0.8560661317772915, + "grad_norm": 21.375, + "learning_rate": 1.5570787628785563e-06, + "loss": 0.9213, + "step": 3521 + }, + { + "epoch": 0.8563092633114515, + "grad_norm": 20.5, + "learning_rate": 1.5566102869092847e-06, + "loss": 0.661, + "step": 3522 + }, + { + "epoch": 0.8565523948456115, + "grad_norm": 23.0, + "learning_rate": 1.5561417651118098e-06, + "loss": 0.9228, + "step": 3523 + }, + { + "epoch": 0.8567955263797714, + "grad_norm": 16.625, + "learning_rate": 1.5556731975561613e-06, + "loss": 0.3625, + "step": 3524 + }, + { + "epoch": 0.8570386579139314, + "grad_norm": 19.25, + "learning_rate": 1.5552045843123737e-06, + "loss": 1.1483, + "step": 3525 + }, + { + "epoch": 0.8572817894480914, + "grad_norm": 19.625, + "learning_rate": 1.5547359254504903e-06, + "loss": 0.6586, + "step": 3526 + }, + { + "epoch": 0.8575249209822514, + "grad_norm": 23.125, + "learning_rate": 1.5542672210405603e-06, + "loss": 0.9238, + "step": 3527 + }, + { + "epoch": 0.8577680525164114, + "grad_norm": 21.875, + "learning_rate": 1.5537984711526382e-06, + "loss": 0.9914, + "step": 3528 + }, + { + "epoch": 0.8580111840505713, + "grad_norm": 20.5, + "learning_rate": 1.5533296758567884e-06, + "loss": 0.9804, + "step": 3529 + }, + { + "epoch": 0.8582543155847313, + "grad_norm": 24.875, + "learning_rate": 1.5528608352230798e-06, + "loss": 1.1042, + "step": 3530 + }, + { + "epoch": 0.8584974471188913, + "grad_norm": 16.875, + "learning_rate": 1.5523919493215888e-06, + "loss": 0.88, + "step": 3531 + }, + { + "epoch": 0.8587405786530513, + "grad_norm": 21.75, + "learning_rate": 1.5519230182223984e-06, + "loss": 0.8651, + "step": 3532 + }, + { + "epoch": 0.8589837101872113, + "grad_norm": 19.875, + "learning_rate": 1.5514540419955986e-06, + "loss": 0.7751, + "step": 3533 + }, + { + "epoch": 0.8592268417213713, + "grad_norm": 17.5, + "learning_rate": 1.550985020711286e-06, + "loss": 0.6555, + "step": 3534 + }, + { + "epoch": 0.8594699732555312, + "grad_norm": 18.375, + "learning_rate": 1.550515954439564e-06, + "loss": 0.829, + "step": 3535 + }, + { + "epoch": 0.8597131047896912, + "grad_norm": 19.375, + "learning_rate": 1.5500468432505422e-06, + "loss": 0.7958, + "step": 3536 + }, + { + "epoch": 0.8599562363238512, + "grad_norm": 19.375, + "learning_rate": 1.5495776872143379e-06, + "loss": 0.8569, + "step": 3537 + }, + { + "epoch": 0.8601993678580112, + "grad_norm": 18.25, + "learning_rate": 1.5491084864010741e-06, + "loss": 0.6053, + "step": 3538 + }, + { + "epoch": 0.8604424993921712, + "grad_norm": 18.125, + "learning_rate": 1.5486392408808818e-06, + "loss": 0.7797, + "step": 3539 + }, + { + "epoch": 0.8606856309263311, + "grad_norm": 19.375, + "learning_rate": 1.5481699507238965e-06, + "loss": 0.7783, + "step": 3540 + }, + { + "epoch": 0.8609287624604911, + "grad_norm": 18.25, + "learning_rate": 1.5477006160002631e-06, + "loss": 0.6464, + "step": 3541 + }, + { + "epoch": 0.8611718939946511, + "grad_norm": 20.875, + "learning_rate": 1.547231236780131e-06, + "loss": 0.7141, + "step": 3542 + }, + { + "epoch": 0.8614150255288111, + "grad_norm": 21.875, + "learning_rate": 1.546761813133657e-06, + "loss": 0.877, + "step": 3543 + }, + { + "epoch": 0.8616581570629711, + "grad_norm": 24.125, + "learning_rate": 1.5462923451310049e-06, + "loss": 1.0813, + "step": 3544 + }, + { + "epoch": 0.861901288597131, + "grad_norm": 17.75, + "learning_rate": 1.5458228328423447e-06, + "loss": 0.6834, + "step": 3545 + }, + { + "epoch": 0.862144420131291, + "grad_norm": 13.5625, + "learning_rate": 1.545353276337853e-06, + "loss": 0.4675, + "step": 3546 + }, + { + "epoch": 0.862387551665451, + "grad_norm": 18.125, + "learning_rate": 1.5448836756877135e-06, + "loss": 0.8901, + "step": 3547 + }, + { + "epoch": 0.862630683199611, + "grad_norm": 21.625, + "learning_rate": 1.5444140309621153e-06, + "loss": 1.0051, + "step": 3548 + }, + { + "epoch": 0.862873814733771, + "grad_norm": 19.5, + "learning_rate": 1.5439443422312562e-06, + "loss": 0.8683, + "step": 3549 + }, + { + "epoch": 0.8631169462679309, + "grad_norm": 15.8125, + "learning_rate": 1.543474609565338e-06, + "loss": 0.7164, + "step": 3550 + }, + { + "epoch": 0.8633600778020909, + "grad_norm": 17.75, + "learning_rate": 1.5430048330345712e-06, + "loss": 0.5809, + "step": 3551 + }, + { + "epoch": 0.8636032093362509, + "grad_norm": 18.125, + "learning_rate": 1.5425350127091716e-06, + "loss": 0.6418, + "step": 3552 + }, + { + "epoch": 0.8638463408704109, + "grad_norm": 22.0, + "learning_rate": 1.5420651486593624e-06, + "loss": 0.9831, + "step": 3553 + }, + { + "epoch": 0.8640894724045709, + "grad_norm": 25.25, + "learning_rate": 1.5415952409553721e-06, + "loss": 0.9321, + "step": 3554 + }, + { + "epoch": 0.8643326039387309, + "grad_norm": 25.125, + "learning_rate": 1.5411252896674369e-06, + "loss": 0.6299, + "step": 3555 + }, + { + "epoch": 0.8645757354728908, + "grad_norm": 24.25, + "learning_rate": 1.5406552948658e-06, + "loss": 1.4371, + "step": 3556 + }, + { + "epoch": 0.8648188670070508, + "grad_norm": 23.0, + "learning_rate": 1.540185256620709e-06, + "loss": 1.1024, + "step": 3557 + }, + { + "epoch": 0.8650619985412108, + "grad_norm": 19.125, + "learning_rate": 1.53971517500242e-06, + "loss": 0.7464, + "step": 3558 + }, + { + "epoch": 0.8653051300753708, + "grad_norm": 20.625, + "learning_rate": 1.539245050081194e-06, + "loss": 0.9117, + "step": 3559 + }, + { + "epoch": 0.8655482616095308, + "grad_norm": 22.125, + "learning_rate": 1.5387748819273001e-06, + "loss": 0.8153, + "step": 3560 + }, + { + "epoch": 0.8657913931436907, + "grad_norm": 21.25, + "learning_rate": 1.5383046706110133e-06, + "loss": 1.0534, + "step": 3561 + }, + { + "epoch": 0.8660345246778507, + "grad_norm": 14.6875, + "learning_rate": 1.5378344162026137e-06, + "loss": 0.3717, + "step": 3562 + }, + { + "epoch": 0.8662776562120107, + "grad_norm": 16.375, + "learning_rate": 1.5373641187723898e-06, + "loss": 0.666, + "step": 3563 + }, + { + "epoch": 0.8665207877461707, + "grad_norm": 19.25, + "learning_rate": 1.5368937783906352e-06, + "loss": 0.7635, + "step": 3564 + }, + { + "epoch": 0.8667639192803307, + "grad_norm": 20.75, + "learning_rate": 1.5364233951276505e-06, + "loss": 0.6799, + "step": 3565 + }, + { + "epoch": 0.8670070508144906, + "grad_norm": 82.0, + "learning_rate": 1.5359529690537431e-06, + "loss": 1.3364, + "step": 3566 + }, + { + "epoch": 0.8672501823486506, + "grad_norm": 23.875, + "learning_rate": 1.5354825002392254e-06, + "loss": 1.0862, + "step": 3567 + }, + { + "epoch": 0.8674933138828106, + "grad_norm": 21.5, + "learning_rate": 1.535011988754418e-06, + "loss": 1.0549, + "step": 3568 + }, + { + "epoch": 0.8677364454169706, + "grad_norm": 19.25, + "learning_rate": 1.5345414346696463e-06, + "loss": 0.6873, + "step": 3569 + }, + { + "epoch": 0.8679795769511306, + "grad_norm": 17.125, + "learning_rate": 1.5340708380552436e-06, + "loss": 0.7122, + "step": 3570 + }, + { + "epoch": 0.8682227084852906, + "grad_norm": 17.0, + "learning_rate": 1.5336001989815472e-06, + "loss": 0.655, + "step": 3571 + }, + { + "epoch": 0.8684658400194505, + "grad_norm": 19.75, + "learning_rate": 1.5331295175189034e-06, + "loss": 0.7531, + "step": 3572 + }, + { + "epoch": 0.8687089715536105, + "grad_norm": 17.375, + "learning_rate": 1.5326587937376635e-06, + "loss": 0.7692, + "step": 3573 + }, + { + "epoch": 0.8689521030877705, + "grad_norm": 22.5, + "learning_rate": 1.5321880277081852e-06, + "loss": 0.8734, + "step": 3574 + }, + { + "epoch": 0.8691952346219305, + "grad_norm": 18.875, + "learning_rate": 1.5317172195008326e-06, + "loss": 0.583, + "step": 3575 + }, + { + "epoch": 0.8694383661560905, + "grad_norm": 17.25, + "learning_rate": 1.531246369185976e-06, + "loss": 0.7171, + "step": 3576 + }, + { + "epoch": 0.8696814976902504, + "grad_norm": 19.375, + "learning_rate": 1.5307754768339922e-06, + "loss": 0.9413, + "step": 3577 + }, + { + "epoch": 0.8699246292244104, + "grad_norm": 20.625, + "learning_rate": 1.5303045425152643e-06, + "loss": 0.9347, + "step": 3578 + }, + { + "epoch": 0.8701677607585704, + "grad_norm": 15.9375, + "learning_rate": 1.5298335663001814e-06, + "loss": 0.7473, + "step": 3579 + }, + { + "epoch": 0.8704108922927304, + "grad_norm": 23.25, + "learning_rate": 1.5293625482591396e-06, + "loss": 1.1312, + "step": 3580 + }, + { + "epoch": 0.8706540238268904, + "grad_norm": 18.625, + "learning_rate": 1.52889148846254e-06, + "loss": 0.5924, + "step": 3581 + }, + { + "epoch": 0.8708971553610503, + "grad_norm": 21.75, + "learning_rate": 1.5284203869807906e-06, + "loss": 0.9332, + "step": 3582 + }, + { + "epoch": 0.8711402868952103, + "grad_norm": 20.875, + "learning_rate": 1.5279492438843058e-06, + "loss": 0.7122, + "step": 3583 + }, + { + "epoch": 0.8713834184293703, + "grad_norm": 20.5, + "learning_rate": 1.5274780592435064e-06, + "loss": 1.2327, + "step": 3584 + }, + { + "epoch": 0.8716265499635303, + "grad_norm": 40.75, + "learning_rate": 1.527006833128819e-06, + "loss": 0.9345, + "step": 3585 + }, + { + "epoch": 0.8718696814976903, + "grad_norm": 33.75, + "learning_rate": 1.5265355656106757e-06, + "loss": 1.377, + "step": 3586 + }, + { + "epoch": 0.8721128130318502, + "grad_norm": 18.125, + "learning_rate": 1.526064256759517e-06, + "loss": 1.1876, + "step": 3587 + }, + { + "epoch": 0.8723559445660102, + "grad_norm": 16.75, + "learning_rate": 1.5255929066457868e-06, + "loss": 1.1715, + "step": 3588 + }, + { + "epoch": 0.8725990761001702, + "grad_norm": 17.25, + "learning_rate": 1.525121515339937e-06, + "loss": 0.5743, + "step": 3589 + }, + { + "epoch": 0.8728422076343302, + "grad_norm": 18.125, + "learning_rate": 1.5246500829124253e-06, + "loss": 0.9771, + "step": 3590 + }, + { + "epoch": 0.8730853391684902, + "grad_norm": 14.6875, + "learning_rate": 1.5241786094337151e-06, + "loss": 0.6867, + "step": 3591 + }, + { + "epoch": 0.8733284707026502, + "grad_norm": 16.125, + "learning_rate": 1.5237070949742772e-06, + "loss": 0.5928, + "step": 3592 + }, + { + "epoch": 0.8735716022368101, + "grad_norm": 14.9375, + "learning_rate": 1.5232355396045864e-06, + "loss": 0.447, + "step": 3593 + }, + { + "epoch": 0.8738147337709701, + "grad_norm": 28.375, + "learning_rate": 1.5227639433951252e-06, + "loss": 0.8632, + "step": 3594 + }, + { + "epoch": 0.8740578653051301, + "grad_norm": 20.25, + "learning_rate": 1.5222923064163822e-06, + "loss": 0.8428, + "step": 3595 + }, + { + "epoch": 0.8743009968392901, + "grad_norm": 23.0, + "learning_rate": 1.521820628738851e-06, + "loss": 1.3935, + "step": 3596 + }, + { + "epoch": 0.8745441283734501, + "grad_norm": 19.375, + "learning_rate": 1.5213489104330328e-06, + "loss": 0.6194, + "step": 3597 + }, + { + "epoch": 0.87478725990761, + "grad_norm": 18.75, + "learning_rate": 1.5208771515694329e-06, + "loss": 0.7012, + "step": 3598 + }, + { + "epoch": 0.87503039144177, + "grad_norm": 16.75, + "learning_rate": 1.520405352218565e-06, + "loss": 0.681, + "step": 3599 + }, + { + "epoch": 0.87527352297593, + "grad_norm": 19.5, + "learning_rate": 1.519933512450947e-06, + "loss": 0.7969, + "step": 3600 + }, + { + "epoch": 0.87551665451009, + "grad_norm": 16.5, + "learning_rate": 1.5194616323371036e-06, + "loss": 0.6389, + "step": 3601 + }, + { + "epoch": 0.87575978604425, + "grad_norm": 19.75, + "learning_rate": 1.5189897119475654e-06, + "loss": 1.2287, + "step": 3602 + }, + { + "epoch": 0.8760029175784099, + "grad_norm": 23.875, + "learning_rate": 1.5185177513528693e-06, + "loss": 1.3549, + "step": 3603 + }, + { + "epoch": 0.8762460491125699, + "grad_norm": 18.125, + "learning_rate": 1.518045750623558e-06, + "loss": 1.167, + "step": 3604 + }, + { + "epoch": 0.8764891806467299, + "grad_norm": 24.875, + "learning_rate": 1.5175737098301792e-06, + "loss": 1.3455, + "step": 3605 + }, + { + "epoch": 0.8767323121808899, + "grad_norm": 22.625, + "learning_rate": 1.517101629043289e-06, + "loss": 0.9083, + "step": 3606 + }, + { + "epoch": 0.8769754437150499, + "grad_norm": 17.5, + "learning_rate": 1.5166295083334473e-06, + "loss": 0.8156, + "step": 3607 + }, + { + "epoch": 0.8772185752492099, + "grad_norm": 16.375, + "learning_rate": 1.5161573477712205e-06, + "loss": 0.5163, + "step": 3608 + }, + { + "epoch": 0.8774617067833698, + "grad_norm": 22.375, + "learning_rate": 1.5156851474271815e-06, + "loss": 1.2113, + "step": 3609 + }, + { + "epoch": 0.8777048383175298, + "grad_norm": 18.125, + "learning_rate": 1.5152129073719085e-06, + "loss": 0.9579, + "step": 3610 + }, + { + "epoch": 0.8779479698516898, + "grad_norm": 16.875, + "learning_rate": 1.5147406276759865e-06, + "loss": 0.4668, + "step": 3611 + }, + { + "epoch": 0.8781911013858498, + "grad_norm": 19.125, + "learning_rate": 1.514268308410005e-06, + "loss": 0.5061, + "step": 3612 + }, + { + "epoch": 0.8784342329200098, + "grad_norm": 17.125, + "learning_rate": 1.5137959496445612e-06, + "loss": 1.0777, + "step": 3613 + }, + { + "epoch": 0.8786773644541697, + "grad_norm": 17.625, + "learning_rate": 1.5133235514502564e-06, + "loss": 0.7726, + "step": 3614 + }, + { + "epoch": 0.8789204959883297, + "grad_norm": 15.1875, + "learning_rate": 1.5128511138976992e-06, + "loss": 0.7334, + "step": 3615 + }, + { + "epoch": 0.8791636275224897, + "grad_norm": 16.75, + "learning_rate": 1.5123786370575038e-06, + "loss": 0.5958, + "step": 3616 + }, + { + "epoch": 0.8794067590566497, + "grad_norm": 18.25, + "learning_rate": 1.5119061210002892e-06, + "loss": 0.8657, + "step": 3617 + }, + { + "epoch": 0.8796498905908097, + "grad_norm": 19.0, + "learning_rate": 1.5114335657966816e-06, + "loss": 1.0357, + "step": 3618 + }, + { + "epoch": 0.8798930221249696, + "grad_norm": 20.375, + "learning_rate": 1.5109609715173127e-06, + "loss": 0.795, + "step": 3619 + }, + { + "epoch": 0.8801361536591296, + "grad_norm": 22.875, + "learning_rate": 1.5104883382328195e-06, + "loss": 0.9602, + "step": 3620 + }, + { + "epoch": 0.8803792851932896, + "grad_norm": 18.75, + "learning_rate": 1.5100156660138454e-06, + "loss": 0.9176, + "step": 3621 + }, + { + "epoch": 0.8806224167274496, + "grad_norm": 18.125, + "learning_rate": 1.5095429549310392e-06, + "loss": 0.5905, + "step": 3622 + }, + { + "epoch": 0.8808655482616096, + "grad_norm": 18.75, + "learning_rate": 1.5090702050550562e-06, + "loss": 0.6135, + "step": 3623 + }, + { + "epoch": 0.8811086797957695, + "grad_norm": 23.0, + "learning_rate": 1.5085974164565567e-06, + "loss": 1.1195, + "step": 3624 + }, + { + "epoch": 0.8813518113299295, + "grad_norm": 15.5625, + "learning_rate": 1.5081245892062072e-06, + "loss": 0.6659, + "step": 3625 + }, + { + "epoch": 0.8815949428640895, + "grad_norm": 17.25, + "learning_rate": 1.5076517233746796e-06, + "loss": 0.8656, + "step": 3626 + }, + { + "epoch": 0.8818380743982495, + "grad_norm": 20.625, + "learning_rate": 1.5071788190326521e-06, + "loss": 0.7428, + "step": 3627 + }, + { + "epoch": 0.8820812059324095, + "grad_norm": 27.0, + "learning_rate": 1.506705876250809e-06, + "loss": 1.2332, + "step": 3628 + }, + { + "epoch": 0.8823243374665695, + "grad_norm": 32.75, + "learning_rate": 1.5062328950998386e-06, + "loss": 0.8189, + "step": 3629 + }, + { + "epoch": 0.8825674690007294, + "grad_norm": 30.375, + "learning_rate": 1.5057598756504373e-06, + "loss": 1.1142, + "step": 3630 + }, + { + "epoch": 0.8828106005348894, + "grad_norm": 33.5, + "learning_rate": 1.5052868179733054e-06, + "loss": 0.918, + "step": 3631 + }, + { + "epoch": 0.8830537320690494, + "grad_norm": 26.125, + "learning_rate": 1.5048137221391493e-06, + "loss": 1.3675, + "step": 3632 + }, + { + "epoch": 0.8832968636032094, + "grad_norm": 21.625, + "learning_rate": 1.5043405882186819e-06, + "loss": 0.8361, + "step": 3633 + }, + { + "epoch": 0.8835399951373694, + "grad_norm": 18.875, + "learning_rate": 1.5038674162826205e-06, + "loss": 0.5911, + "step": 3634 + }, + { + "epoch": 0.8837831266715293, + "grad_norm": 18.75, + "learning_rate": 1.50339420640169e-06, + "loss": 0.7616, + "step": 3635 + }, + { + "epoch": 0.8840262582056893, + "grad_norm": 22.875, + "learning_rate": 1.5029209586466184e-06, + "loss": 1.0386, + "step": 3636 + }, + { + "epoch": 0.8842693897398493, + "grad_norm": 23.0, + "learning_rate": 1.502447673088142e-06, + "loss": 0.999, + "step": 3637 + }, + { + "epoch": 0.8845125212740093, + "grad_norm": 25.875, + "learning_rate": 1.5019743497970008e-06, + "loss": 0.8509, + "step": 3638 + }, + { + "epoch": 0.8847556528081693, + "grad_norm": 18.25, + "learning_rate": 1.5015009888439408e-06, + "loss": 0.5531, + "step": 3639 + }, + { + "epoch": 0.8849987843423291, + "grad_norm": 23.375, + "learning_rate": 1.5010275902997148e-06, + "loss": 0.7159, + "step": 3640 + }, + { + "epoch": 0.8852419158764891, + "grad_norm": 12.75, + "learning_rate": 1.5005541542350802e-06, + "loss": 0.3923, + "step": 3641 + }, + { + "epoch": 0.8854850474106492, + "grad_norm": 20.125, + "learning_rate": 1.5000806807207999e-06, + "loss": 0.7973, + "step": 3642 + }, + { + "epoch": 0.8857281789448092, + "grad_norm": 17.0, + "learning_rate": 1.499607169827643e-06, + "loss": 1.0972, + "step": 3643 + }, + { + "epoch": 0.8859713104789692, + "grad_norm": 18.75, + "learning_rate": 1.4991336216263833e-06, + "loss": 0.7585, + "step": 3644 + }, + { + "epoch": 0.8862144420131292, + "grad_norm": 19.625, + "learning_rate": 1.4986600361878012e-06, + "loss": 1.1922, + "step": 3645 + }, + { + "epoch": 0.886457573547289, + "grad_norm": 26.375, + "learning_rate": 1.4981864135826823e-06, + "loss": 1.1931, + "step": 3646 + }, + { + "epoch": 0.886700705081449, + "grad_norm": 19.5, + "learning_rate": 1.497712753881818e-06, + "loss": 0.8558, + "step": 3647 + }, + { + "epoch": 0.886943836615609, + "grad_norm": 16.875, + "learning_rate": 1.4972390571560035e-06, + "loss": 0.6759, + "step": 3648 + }, + { + "epoch": 0.887186968149769, + "grad_norm": 20.0, + "learning_rate": 1.496765323476043e-06, + "loss": 1.4714, + "step": 3649 + }, + { + "epoch": 0.887430099683929, + "grad_norm": 18.25, + "learning_rate": 1.4962915529127426e-06, + "loss": 0.779, + "step": 3650 + }, + { + "epoch": 0.8876732312180889, + "grad_norm": 15.5625, + "learning_rate": 1.495817745536916e-06, + "loss": 0.7506, + "step": 3651 + }, + { + "epoch": 0.8879163627522489, + "grad_norm": 17.75, + "learning_rate": 1.495343901419382e-06, + "loss": 0.7888, + "step": 3652 + }, + { + "epoch": 0.888159494286409, + "grad_norm": 19.75, + "learning_rate": 1.4948700206309645e-06, + "loss": 0.9544, + "step": 3653 + }, + { + "epoch": 0.888402625820569, + "grad_norm": 17.375, + "learning_rate": 1.4943961032424939e-06, + "loss": 0.877, + "step": 3654 + }, + { + "epoch": 0.888645757354729, + "grad_norm": 17.5, + "learning_rate": 1.4939221493248043e-06, + "loss": 0.6839, + "step": 3655 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 18.0, + "learning_rate": 1.493448158948737e-06, + "loss": 0.6713, + "step": 3656 + }, + { + "epoch": 0.8891320204230488, + "grad_norm": 18.875, + "learning_rate": 1.4929741321851376e-06, + "loss": 1.162, + "step": 3657 + }, + { + "epoch": 0.8893751519572088, + "grad_norm": 20.75, + "learning_rate": 1.492500069104858e-06, + "loss": 0.7053, + "step": 3658 + }, + { + "epoch": 0.8896182834913688, + "grad_norm": 13.5, + "learning_rate": 1.492025969778755e-06, + "loss": 0.5691, + "step": 3659 + }, + { + "epoch": 0.8898614150255288, + "grad_norm": 20.25, + "learning_rate": 1.4915518342776909e-06, + "loss": 1.1944, + "step": 3660 + }, + { + "epoch": 0.8901045465596887, + "grad_norm": 24.875, + "learning_rate": 1.4910776626725336e-06, + "loss": 1.1712, + "step": 3661 + }, + { + "epoch": 0.8903476780938487, + "grad_norm": 15.1875, + "learning_rate": 1.4906034550341559e-06, + "loss": 0.7042, + "step": 3662 + }, + { + "epoch": 0.8905908096280087, + "grad_norm": 18.875, + "learning_rate": 1.4901292114334362e-06, + "loss": 0.9218, + "step": 3663 + }, + { + "epoch": 0.8908339411621687, + "grad_norm": 23.5, + "learning_rate": 1.489654931941259e-06, + "loss": 0.8784, + "step": 3664 + }, + { + "epoch": 0.8910770726963287, + "grad_norm": 28.625, + "learning_rate": 1.4891806166285131e-06, + "loss": 1.4625, + "step": 3665 + }, + { + "epoch": 0.8913202042304887, + "grad_norm": 17.5, + "learning_rate": 1.4887062655660933e-06, + "loss": 1.1436, + "step": 3666 + }, + { + "epoch": 0.8915633357646486, + "grad_norm": 22.25, + "learning_rate": 1.4882318788248996e-06, + "loss": 0.7973, + "step": 3667 + }, + { + "epoch": 0.8918064672988086, + "grad_norm": 20.125, + "learning_rate": 1.487757456475837e-06, + "loss": 0.7844, + "step": 3668 + }, + { + "epoch": 0.8920495988329686, + "grad_norm": 17.75, + "learning_rate": 1.4872829985898167e-06, + "loss": 0.9402, + "step": 3669 + }, + { + "epoch": 0.8922927303671286, + "grad_norm": 21.0, + "learning_rate": 1.4868085052377538e-06, + "loss": 0.7525, + "step": 3670 + }, + { + "epoch": 0.8925358619012886, + "grad_norm": 25.625, + "learning_rate": 1.4863339764905698e-06, + "loss": 1.0096, + "step": 3671 + }, + { + "epoch": 0.8927789934354485, + "grad_norm": 19.625, + "learning_rate": 1.485859412419192e-06, + "loss": 0.5815, + "step": 3672 + }, + { + "epoch": 0.8930221249696085, + "grad_norm": 23.875, + "learning_rate": 1.4853848130945514e-06, + "loss": 0.9197, + "step": 3673 + }, + { + "epoch": 0.8932652565037685, + "grad_norm": 20.5, + "learning_rate": 1.4849101785875853e-06, + "loss": 0.8862, + "step": 3674 + }, + { + "epoch": 0.8935083880379285, + "grad_norm": 16.625, + "learning_rate": 1.4844355089692352e-06, + "loss": 0.6091, + "step": 3675 + }, + { + "epoch": 0.8937515195720885, + "grad_norm": 20.125, + "learning_rate": 1.4839608043104503e-06, + "loss": 1.1555, + "step": 3676 + }, + { + "epoch": 0.8939946511062484, + "grad_norm": 16.75, + "learning_rate": 1.4834860646821825e-06, + "loss": 0.6236, + "step": 3677 + }, + { + "epoch": 0.8942377826404084, + "grad_norm": 24.25, + "learning_rate": 1.4830112901553898e-06, + "loss": 1.389, + "step": 3678 + }, + { + "epoch": 0.8944809141745684, + "grad_norm": 17.75, + "learning_rate": 1.482536480801035e-06, + "loss": 0.6193, + "step": 3679 + }, + { + "epoch": 0.8947240457087284, + "grad_norm": 21.375, + "learning_rate": 1.4820616366900878e-06, + "loss": 0.6483, + "step": 3680 + }, + { + "epoch": 0.8949671772428884, + "grad_norm": 19.875, + "learning_rate": 1.481586757893521e-06, + "loss": 0.8801, + "step": 3681 + }, + { + "epoch": 0.8952103087770484, + "grad_norm": 21.5, + "learning_rate": 1.4811118444823133e-06, + "loss": 0.6479, + "step": 3682 + }, + { + "epoch": 0.8954534403112083, + "grad_norm": 22.0, + "learning_rate": 1.4806368965274492e-06, + "loss": 1.0339, + "step": 3683 + }, + { + "epoch": 0.8956965718453683, + "grad_norm": 18.25, + "learning_rate": 1.4801619140999176e-06, + "loss": 0.9632, + "step": 3684 + }, + { + "epoch": 0.8959397033795283, + "grad_norm": 15.625, + "learning_rate": 1.4796868972707132e-06, + "loss": 0.695, + "step": 3685 + }, + { + "epoch": 0.8961828349136883, + "grad_norm": 26.5, + "learning_rate": 1.479211846110835e-06, + "loss": 0.8159, + "step": 3686 + }, + { + "epoch": 0.8964259664478483, + "grad_norm": 22.125, + "learning_rate": 1.4787367606912872e-06, + "loss": 1.1078, + "step": 3687 + }, + { + "epoch": 0.8966690979820082, + "grad_norm": 18.375, + "learning_rate": 1.478261641083081e-06, + "loss": 0.6567, + "step": 3688 + }, + { + "epoch": 0.8969122295161682, + "grad_norm": 20.25, + "learning_rate": 1.47778648735723e-06, + "loss": 0.7557, + "step": 3689 + }, + { + "epoch": 0.8971553610503282, + "grad_norm": 16.0, + "learning_rate": 1.4773112995847543e-06, + "loss": 0.7756, + "step": 3690 + }, + { + "epoch": 0.8973984925844882, + "grad_norm": 17.5, + "learning_rate": 1.4768360778366791e-06, + "loss": 0.5595, + "step": 3691 + }, + { + "epoch": 0.8976416241186482, + "grad_norm": 19.375, + "learning_rate": 1.4763608221840346e-06, + "loss": 1.1363, + "step": 3692 + }, + { + "epoch": 0.8978847556528081, + "grad_norm": 18.0, + "learning_rate": 1.475885532697856e-06, + "loss": 0.909, + "step": 3693 + }, + { + "epoch": 0.8981278871869681, + "grad_norm": 21.625, + "learning_rate": 1.4754102094491826e-06, + "loss": 1.0297, + "step": 3694 + }, + { + "epoch": 0.8983710187211281, + "grad_norm": 34.25, + "learning_rate": 1.4749348525090611e-06, + "loss": 0.9239, + "step": 3695 + }, + { + "epoch": 0.8986141502552881, + "grad_norm": 22.0, + "learning_rate": 1.4744594619485412e-06, + "loss": 1.225, + "step": 3696 + }, + { + "epoch": 0.8988572817894481, + "grad_norm": 13.9375, + "learning_rate": 1.4739840378386782e-06, + "loss": 0.3779, + "step": 3697 + }, + { + "epoch": 0.899100413323608, + "grad_norm": 19.875, + "learning_rate": 1.473508580250532e-06, + "loss": 0.8047, + "step": 3698 + }, + { + "epoch": 0.899343544857768, + "grad_norm": 28.625, + "learning_rate": 1.4730330892551684e-06, + "loss": 0.9387, + "step": 3699 + }, + { + "epoch": 0.899586676391928, + "grad_norm": 20.375, + "learning_rate": 1.4725575649236578e-06, + "loss": 0.5998, + "step": 3700 + }, + { + "epoch": 0.899829807926088, + "grad_norm": 16.125, + "learning_rate": 1.4720820073270755e-06, + "loss": 0.6035, + "step": 3701 + }, + { + "epoch": 0.900072939460248, + "grad_norm": 28.0, + "learning_rate": 1.4716064165365018e-06, + "loss": 0.7062, + "step": 3702 + }, + { + "epoch": 0.900316070994408, + "grad_norm": 15.75, + "learning_rate": 1.4711307926230216e-06, + "loss": 0.7638, + "step": 3703 + }, + { + "epoch": 0.9005592025285679, + "grad_norm": 17.625, + "learning_rate": 1.4706551356577253e-06, + "loss": 0.7957, + "step": 3704 + }, + { + "epoch": 0.9008023340627279, + "grad_norm": 17.25, + "learning_rate": 1.4701794457117088e-06, + "loss": 0.6963, + "step": 3705 + }, + { + "epoch": 0.9010454655968879, + "grad_norm": 20.875, + "learning_rate": 1.4697037228560706e-06, + "loss": 0.5563, + "step": 3706 + }, + { + "epoch": 0.9012885971310479, + "grad_norm": 24.375, + "learning_rate": 1.4692279671619173e-06, + "loss": 1.057, + "step": 3707 + }, + { + "epoch": 0.9015317286652079, + "grad_norm": 15.75, + "learning_rate": 1.4687521787003577e-06, + "loss": 0.8086, + "step": 3708 + }, + { + "epoch": 0.9017748601993678, + "grad_norm": 20.625, + "learning_rate": 1.4682763575425078e-06, + "loss": 0.876, + "step": 3709 + }, + { + "epoch": 0.9020179917335278, + "grad_norm": 21.375, + "learning_rate": 1.4678005037594853e-06, + "loss": 0.7953, + "step": 3710 + }, + { + "epoch": 0.9022611232676878, + "grad_norm": 17.75, + "learning_rate": 1.4673246174224166e-06, + "loss": 0.7017, + "step": 3711 + }, + { + "epoch": 0.9025042548018478, + "grad_norm": 17.125, + "learning_rate": 1.4668486986024305e-06, + "loss": 0.7379, + "step": 3712 + }, + { + "epoch": 0.9027473863360078, + "grad_norm": 19.375, + "learning_rate": 1.4663727473706612e-06, + "loss": 0.6913, + "step": 3713 + }, + { + "epoch": 0.9029905178701677, + "grad_norm": 22.0, + "learning_rate": 1.465896763798248e-06, + "loss": 0.8068, + "step": 3714 + }, + { + "epoch": 0.9032336494043277, + "grad_norm": 23.375, + "learning_rate": 1.4654207479563349e-06, + "loss": 1.0214, + "step": 3715 + }, + { + "epoch": 0.9034767809384877, + "grad_norm": 16.125, + "learning_rate": 1.4649446999160701e-06, + "loss": 1.1374, + "step": 3716 + }, + { + "epoch": 0.9037199124726477, + "grad_norm": 17.25, + "learning_rate": 1.464468619748608e-06, + "loss": 0.6149, + "step": 3717 + }, + { + "epoch": 0.9039630440068077, + "grad_norm": 20.625, + "learning_rate": 1.4639925075251065e-06, + "loss": 0.5648, + "step": 3718 + }, + { + "epoch": 0.9042061755409677, + "grad_norm": 16.375, + "learning_rate": 1.4635163633167296e-06, + "loss": 0.8324, + "step": 3719 + }, + { + "epoch": 0.9044493070751276, + "grad_norm": 21.5, + "learning_rate": 1.463040187194644e-06, + "loss": 0.8693, + "step": 3720 + }, + { + "epoch": 0.9046924386092876, + "grad_norm": 23.875, + "learning_rate": 1.4625639792300233e-06, + "loss": 1.0723, + "step": 3721 + }, + { + "epoch": 0.9049355701434476, + "grad_norm": 16.25, + "learning_rate": 1.4620877394940447e-06, + "loss": 0.7044, + "step": 3722 + }, + { + "epoch": 0.9051787016776076, + "grad_norm": 23.75, + "learning_rate": 1.4616114680578905e-06, + "loss": 0.8572, + "step": 3723 + }, + { + "epoch": 0.9054218332117676, + "grad_norm": 16.125, + "learning_rate": 1.4611351649927482e-06, + "loss": 0.5652, + "step": 3724 + }, + { + "epoch": 0.9056649647459275, + "grad_norm": 15.625, + "learning_rate": 1.4606588303698082e-06, + "loss": 0.3811, + "step": 3725 + }, + { + "epoch": 0.9059080962800875, + "grad_norm": 19.625, + "learning_rate": 1.4601824642602688e-06, + "loss": 1.3879, + "step": 3726 + }, + { + "epoch": 0.9061512278142475, + "grad_norm": 21.375, + "learning_rate": 1.4597060667353296e-06, + "loss": 0.9602, + "step": 3727 + }, + { + "epoch": 0.9063943593484075, + "grad_norm": 19.5, + "learning_rate": 1.4592296378661968e-06, + "loss": 1.0221, + "step": 3728 + }, + { + "epoch": 0.9066374908825675, + "grad_norm": 25.125, + "learning_rate": 1.4587531777240814e-06, + "loss": 1.2204, + "step": 3729 + }, + { + "epoch": 0.9068806224167274, + "grad_norm": 18.875, + "learning_rate": 1.4582766863801984e-06, + "loss": 0.971, + "step": 3730 + }, + { + "epoch": 0.9071237539508874, + "grad_norm": 15.75, + "learning_rate": 1.4578001639057676e-06, + "loss": 0.602, + "step": 3731 + }, + { + "epoch": 0.9073668854850474, + "grad_norm": 19.5, + "learning_rate": 1.4573236103720132e-06, + "loss": 0.8516, + "step": 3732 + }, + { + "epoch": 0.9076100170192074, + "grad_norm": 16.5, + "learning_rate": 1.4568470258501646e-06, + "loss": 0.5341, + "step": 3733 + }, + { + "epoch": 0.9078531485533674, + "grad_norm": 18.5, + "learning_rate": 1.4563704104114557e-06, + "loss": 0.7416, + "step": 3734 + }, + { + "epoch": 0.9080962800875274, + "grad_norm": 18.125, + "learning_rate": 1.4558937641271248e-06, + "loss": 0.6536, + "step": 3735 + }, + { + "epoch": 0.9083394116216873, + "grad_norm": 22.75, + "learning_rate": 1.455417087068415e-06, + "loss": 0.8411, + "step": 3736 + }, + { + "epoch": 0.9085825431558473, + "grad_norm": 18.0, + "learning_rate": 1.4549403793065737e-06, + "loss": 0.8491, + "step": 3737 + }, + { + "epoch": 0.9088256746900073, + "grad_norm": 17.875, + "learning_rate": 1.4544636409128538e-06, + "loss": 0.8147, + "step": 3738 + }, + { + "epoch": 0.9090688062241673, + "grad_norm": 18.625, + "learning_rate": 1.453986871958511e-06, + "loss": 0.6442, + "step": 3739 + }, + { + "epoch": 0.9093119377583273, + "grad_norm": 24.625, + "learning_rate": 1.4535100725148072e-06, + "loss": 0.7537, + "step": 3740 + }, + { + "epoch": 0.9095550692924872, + "grad_norm": 19.0, + "learning_rate": 1.4530332426530086e-06, + "loss": 0.6678, + "step": 3741 + }, + { + "epoch": 0.9097982008266472, + "grad_norm": 15.9375, + "learning_rate": 1.452556382444385e-06, + "loss": 1.1911, + "step": 3742 + }, + { + "epoch": 0.9100413323608072, + "grad_norm": 19.75, + "learning_rate": 1.4520794919602125e-06, + "loss": 1.2636, + "step": 3743 + }, + { + "epoch": 0.9102844638949672, + "grad_norm": 24.125, + "learning_rate": 1.4516025712717692e-06, + "loss": 0.9072, + "step": 3744 + }, + { + "epoch": 0.9105275954291272, + "grad_norm": 24.25, + "learning_rate": 1.4511256204503403e-06, + "loss": 0.6314, + "step": 3745 + }, + { + "epoch": 0.9107707269632871, + "grad_norm": 20.625, + "learning_rate": 1.4506486395672134e-06, + "loss": 0.6908, + "step": 3746 + }, + { + "epoch": 0.9110138584974471, + "grad_norm": 22.375, + "learning_rate": 1.4501716286936824e-06, + "loss": 1.0111, + "step": 3747 + }, + { + "epoch": 0.9112569900316071, + "grad_norm": 19.125, + "learning_rate": 1.449694587901044e-06, + "loss": 0.7819, + "step": 3748 + }, + { + "epoch": 0.9115001215657671, + "grad_norm": 21.0, + "learning_rate": 1.4492175172606006e-06, + "loss": 0.5429, + "step": 3749 + }, + { + "epoch": 0.9117432530999271, + "grad_norm": 21.75, + "learning_rate": 1.4487404168436593e-06, + "loss": 1.2505, + "step": 3750 + }, + { + "epoch": 0.911986384634087, + "grad_norm": 22.875, + "learning_rate": 1.44826328672153e-06, + "loss": 0.6566, + "step": 3751 + }, + { + "epoch": 0.912229516168247, + "grad_norm": 20.125, + "learning_rate": 1.447786126965528e-06, + "loss": 1.0915, + "step": 3752 + }, + { + "epoch": 0.912472647702407, + "grad_norm": 21.75, + "learning_rate": 1.4473089376469737e-06, + "loss": 1.2058, + "step": 3753 + }, + { + "epoch": 0.912715779236567, + "grad_norm": 17.375, + "learning_rate": 1.446831718837191e-06, + "loss": 0.8974, + "step": 3754 + }, + { + "epoch": 0.912958910770727, + "grad_norm": 15.4375, + "learning_rate": 1.4463544706075088e-06, + "loss": 0.5681, + "step": 3755 + }, + { + "epoch": 0.913202042304887, + "grad_norm": 23.375, + "learning_rate": 1.4458771930292592e-06, + "loss": 0.84, + "step": 3756 + }, + { + "epoch": 0.9134451738390469, + "grad_norm": 22.875, + "learning_rate": 1.4453998861737808e-06, + "loss": 1.123, + "step": 3757 + }, + { + "epoch": 0.9136883053732069, + "grad_norm": 20.5, + "learning_rate": 1.4449225501124146e-06, + "loss": 1.0275, + "step": 3758 + }, + { + "epoch": 0.9139314369073669, + "grad_norm": 18.75, + "learning_rate": 1.4444451849165067e-06, + "loss": 0.8495, + "step": 3759 + }, + { + "epoch": 0.9141745684415269, + "grad_norm": 18.75, + "learning_rate": 1.4439677906574076e-06, + "loss": 0.754, + "step": 3760 + }, + { + "epoch": 0.9144176999756869, + "grad_norm": 22.375, + "learning_rate": 1.4434903674064726e-06, + "loss": 0.7724, + "step": 3761 + }, + { + "epoch": 0.9146608315098468, + "grad_norm": 17.25, + "learning_rate": 1.4430129152350605e-06, + "loss": 0.496, + "step": 3762 + }, + { + "epoch": 0.9149039630440068, + "grad_norm": 24.0, + "learning_rate": 1.4425354342145346e-06, + "loss": 0.9878, + "step": 3763 + }, + { + "epoch": 0.9151470945781668, + "grad_norm": 15.75, + "learning_rate": 1.4420579244162627e-06, + "loss": 0.8022, + "step": 3764 + }, + { + "epoch": 0.9153902261123268, + "grad_norm": 20.75, + "learning_rate": 1.4415803859116173e-06, + "loss": 0.7064, + "step": 3765 + }, + { + "epoch": 0.9156333576464868, + "grad_norm": 18.5, + "learning_rate": 1.441102818771974e-06, + "loss": 0.8631, + "step": 3766 + }, + { + "epoch": 0.9158764891806467, + "grad_norm": 17.375, + "learning_rate": 1.4406252230687148e-06, + "loss": 1.3142, + "step": 3767 + }, + { + "epoch": 0.9161196207148067, + "grad_norm": 24.125, + "learning_rate": 1.440147598873223e-06, + "loss": 0.9144, + "step": 3768 + }, + { + "epoch": 0.9163627522489667, + "grad_norm": 17.75, + "learning_rate": 1.4396699462568894e-06, + "loss": 0.5567, + "step": 3769 + }, + { + "epoch": 0.9166058837831267, + "grad_norm": 14.5625, + "learning_rate": 1.439192265291106e-06, + "loss": 0.43, + "step": 3770 + }, + { + "epoch": 0.9168490153172867, + "grad_norm": 16.5, + "learning_rate": 1.4387145560472712e-06, + "loss": 1.0951, + "step": 3771 + }, + { + "epoch": 0.9170921468514467, + "grad_norm": 20.75, + "learning_rate": 1.4382368185967868e-06, + "loss": 0.6864, + "step": 3772 + }, + { + "epoch": 0.9173352783856066, + "grad_norm": 21.25, + "learning_rate": 1.4377590530110591e-06, + "loss": 1.1724, + "step": 3773 + }, + { + "epoch": 0.9175784099197666, + "grad_norm": 19.75, + "learning_rate": 1.4372812593614983e-06, + "loss": 1.2391, + "step": 3774 + }, + { + "epoch": 0.9178215414539266, + "grad_norm": 18.625, + "learning_rate": 1.4368034377195183e-06, + "loss": 0.8745, + "step": 3775 + }, + { + "epoch": 0.9180646729880866, + "grad_norm": 20.5, + "learning_rate": 1.4363255881565389e-06, + "loss": 0.8104, + "step": 3776 + }, + { + "epoch": 0.9183078045222466, + "grad_norm": 20.625, + "learning_rate": 1.435847710743982e-06, + "loss": 0.8274, + "step": 3777 + }, + { + "epoch": 0.9185509360564065, + "grad_norm": 14.75, + "learning_rate": 1.435369805553275e-06, + "loss": 0.7391, + "step": 3778 + }, + { + "epoch": 0.9187940675905665, + "grad_norm": 19.625, + "learning_rate": 1.4348918726558495e-06, + "loss": 1.0393, + "step": 3779 + }, + { + "epoch": 0.9190371991247265, + "grad_norm": 21.0, + "learning_rate": 1.4344139121231402e-06, + "loss": 0.9398, + "step": 3780 + }, + { + "epoch": 0.9192803306588865, + "grad_norm": 14.8125, + "learning_rate": 1.433935924026587e-06, + "loss": 0.7216, + "step": 3781 + }, + { + "epoch": 0.9195234621930465, + "grad_norm": 18.0, + "learning_rate": 1.433457908437633e-06, + "loss": 0.9834, + "step": 3782 + }, + { + "epoch": 0.9197665937272064, + "grad_norm": 22.25, + "learning_rate": 1.432979865427726e-06, + "loss": 1.0621, + "step": 3783 + }, + { + "epoch": 0.9200097252613664, + "grad_norm": 18.625, + "learning_rate": 1.4325017950683182e-06, + "loss": 0.7211, + "step": 3784 + }, + { + "epoch": 0.9202528567955264, + "grad_norm": 25.875, + "learning_rate": 1.4320236974308652e-06, + "loss": 1.1432, + "step": 3785 + }, + { + "epoch": 0.9204959883296864, + "grad_norm": 21.125, + "learning_rate": 1.431545572586827e-06, + "loss": 0.7333, + "step": 3786 + }, + { + "epoch": 0.9207391198638464, + "grad_norm": 23.625, + "learning_rate": 1.4310674206076675e-06, + "loss": 1.1414, + "step": 3787 + }, + { + "epoch": 0.9209822513980063, + "grad_norm": 20.5, + "learning_rate": 1.4305892415648549e-06, + "loss": 0.8259, + "step": 3788 + }, + { + "epoch": 0.9212253829321663, + "grad_norm": 19.5, + "learning_rate": 1.4301110355298612e-06, + "loss": 1.0309, + "step": 3789 + }, + { + "epoch": 0.9214685144663263, + "grad_norm": 17.125, + "learning_rate": 1.4296328025741626e-06, + "loss": 0.8037, + "step": 3790 + }, + { + "epoch": 0.9217116460004863, + "grad_norm": 16.5, + "learning_rate": 1.4291545427692394e-06, + "loss": 1.0646, + "step": 3791 + }, + { + "epoch": 0.9219547775346463, + "grad_norm": 20.0, + "learning_rate": 1.4286762561865756e-06, + "loss": 0.8686, + "step": 3792 + }, + { + "epoch": 0.9221979090688063, + "grad_norm": 28.125, + "learning_rate": 1.4281979428976594e-06, + "loss": 1.1802, + "step": 3793 + }, + { + "epoch": 0.9224410406029662, + "grad_norm": 15.8125, + "learning_rate": 1.4277196029739831e-06, + "loss": 0.7507, + "step": 3794 + }, + { + "epoch": 0.9226841721371262, + "grad_norm": 18.5, + "learning_rate": 1.427241236487043e-06, + "loss": 1.3671, + "step": 3795 + }, + { + "epoch": 0.9229273036712862, + "grad_norm": 21.25, + "learning_rate": 1.4267628435083388e-06, + "loss": 0.7674, + "step": 3796 + }, + { + "epoch": 0.9231704352054462, + "grad_norm": 22.375, + "learning_rate": 1.4262844241093749e-06, + "loss": 1.1878, + "step": 3797 + }, + { + "epoch": 0.9234135667396062, + "grad_norm": 14.1875, + "learning_rate": 1.4258059783616596e-06, + "loss": 0.7436, + "step": 3798 + }, + { + "epoch": 0.9236566982737661, + "grad_norm": 20.875, + "learning_rate": 1.4253275063367038e-06, + "loss": 0.6861, + "step": 3799 + }, + { + "epoch": 0.9238998298079261, + "grad_norm": 18.625, + "learning_rate": 1.4248490081060248e-06, + "loss": 1.0555, + "step": 3800 + }, + { + "epoch": 0.9241429613420861, + "grad_norm": 15.25, + "learning_rate": 1.4243704837411418e-06, + "loss": 0.5213, + "step": 3801 + }, + { + "epoch": 0.9243860928762461, + "grad_norm": 19.375, + "learning_rate": 1.4238919333135778e-06, + "loss": 0.7359, + "step": 3802 + }, + { + "epoch": 0.9246292244104061, + "grad_norm": 19.625, + "learning_rate": 1.423413356894862e-06, + "loss": 0.8722, + "step": 3803 + }, + { + "epoch": 0.924872355944566, + "grad_norm": 20.5, + "learning_rate": 1.4229347545565248e-06, + "loss": 0.9464, + "step": 3804 + }, + { + "epoch": 0.925115487478726, + "grad_norm": 18.125, + "learning_rate": 1.422456126370102e-06, + "loss": 0.777, + "step": 3805 + }, + { + "epoch": 0.925358619012886, + "grad_norm": 17.75, + "learning_rate": 1.4219774724071322e-06, + "loss": 0.8119, + "step": 3806 + }, + { + "epoch": 0.925601750547046, + "grad_norm": 16.25, + "learning_rate": 1.4214987927391594e-06, + "loss": 0.8855, + "step": 3807 + }, + { + "epoch": 0.925844882081206, + "grad_norm": 18.625, + "learning_rate": 1.42102008743773e-06, + "loss": 0.804, + "step": 3808 + }, + { + "epoch": 0.926088013615366, + "grad_norm": 16.625, + "learning_rate": 1.420541356574395e-06, + "loss": 0.4214, + "step": 3809 + }, + { + "epoch": 0.9263311451495259, + "grad_norm": 22.25, + "learning_rate": 1.4200626002207089e-06, + "loss": 0.8928, + "step": 3810 + }, + { + "epoch": 0.9265742766836859, + "grad_norm": 36.5, + "learning_rate": 1.41958381844823e-06, + "loss": 1.06, + "step": 3811 + }, + { + "epoch": 0.9268174082178459, + "grad_norm": 23.0, + "learning_rate": 1.419105011328521e-06, + "loss": 1.194, + "step": 3812 + }, + { + "epoch": 0.9270605397520059, + "grad_norm": 19.625, + "learning_rate": 1.4186261789331471e-06, + "loss": 0.8113, + "step": 3813 + }, + { + "epoch": 0.9273036712861659, + "grad_norm": 15.5625, + "learning_rate": 1.4181473213336783e-06, + "loss": 0.7907, + "step": 3814 + }, + { + "epoch": 0.9275468028203258, + "grad_norm": 23.125, + "learning_rate": 1.4176684386016886e-06, + "loss": 0.9357, + "step": 3815 + }, + { + "epoch": 0.9277899343544858, + "grad_norm": 24.25, + "learning_rate": 1.417189530808755e-06, + "loss": 0.8202, + "step": 3816 + }, + { + "epoch": 0.9280330658886458, + "grad_norm": 22.375, + "learning_rate": 1.416710598026459e-06, + "loss": 0.9124, + "step": 3817 + }, + { + "epoch": 0.9282761974228058, + "grad_norm": 19.125, + "learning_rate": 1.416231640326384e-06, + "loss": 0.8411, + "step": 3818 + }, + { + "epoch": 0.9285193289569658, + "grad_norm": 16.5, + "learning_rate": 1.41575265778012e-06, + "loss": 0.5253, + "step": 3819 + }, + { + "epoch": 0.9287624604911257, + "grad_norm": 19.375, + "learning_rate": 1.4152736504592587e-06, + "loss": 0.9743, + "step": 3820 + }, + { + "epoch": 0.9290055920252857, + "grad_norm": 18.875, + "learning_rate": 1.4147946184353958e-06, + "loss": 0.4579, + "step": 3821 + }, + { + "epoch": 0.9292487235594457, + "grad_norm": 23.25, + "learning_rate": 1.414315561780131e-06, + "loss": 0.8227, + "step": 3822 + }, + { + "epoch": 0.9294918550936057, + "grad_norm": 18.875, + "learning_rate": 1.4138364805650679e-06, + "loss": 0.8655, + "step": 3823 + }, + { + "epoch": 0.9297349866277657, + "grad_norm": 22.375, + "learning_rate": 1.4133573748618135e-06, + "loss": 0.9211, + "step": 3824 + }, + { + "epoch": 0.9299781181619255, + "grad_norm": 19.0, + "learning_rate": 1.4128782447419775e-06, + "loss": 0.7798, + "step": 3825 + }, + { + "epoch": 0.9302212496960856, + "grad_norm": 19.625, + "learning_rate": 1.4123990902771747e-06, + "loss": 1.0926, + "step": 3826 + }, + { + "epoch": 0.9304643812302456, + "grad_norm": 13.0625, + "learning_rate": 1.411919911539024e-06, + "loss": 0.392, + "step": 3827 + }, + { + "epoch": 0.9307075127644056, + "grad_norm": 36.0, + "learning_rate": 1.4114407085991456e-06, + "loss": 0.8685, + "step": 3828 + }, + { + "epoch": 0.9309506442985656, + "grad_norm": 24.0, + "learning_rate": 1.4109614815291648e-06, + "loss": 0.9956, + "step": 3829 + }, + { + "epoch": 0.9311937758327256, + "grad_norm": 22.75, + "learning_rate": 1.410482230400711e-06, + "loss": 1.1699, + "step": 3830 + }, + { + "epoch": 0.9314369073668854, + "grad_norm": 15.6875, + "learning_rate": 1.410002955285416e-06, + "loss": 0.5616, + "step": 3831 + }, + { + "epoch": 0.9316800389010454, + "grad_norm": 19.375, + "learning_rate": 1.4095236562549167e-06, + "loss": 0.9048, + "step": 3832 + }, + { + "epoch": 0.9319231704352055, + "grad_norm": 25.0, + "learning_rate": 1.409044333380851e-06, + "loss": 0.8924, + "step": 3833 + }, + { + "epoch": 0.9321663019693655, + "grad_norm": 18.5, + "learning_rate": 1.4085649867348635e-06, + "loss": 0.9992, + "step": 3834 + }, + { + "epoch": 0.9324094335035255, + "grad_norm": 18.5, + "learning_rate": 1.4080856163886001e-06, + "loss": 0.7677, + "step": 3835 + }, + { + "epoch": 0.9326525650376853, + "grad_norm": 21.5, + "learning_rate": 1.407606222413711e-06, + "loss": 0.9993, + "step": 3836 + }, + { + "epoch": 0.9328956965718453, + "grad_norm": 18.125, + "learning_rate": 1.4071268048818499e-06, + "loss": 0.9306, + "step": 3837 + }, + { + "epoch": 0.9331388281060053, + "grad_norm": 22.625, + "learning_rate": 1.406647363864674e-06, + "loss": 1.0934, + "step": 3838 + }, + { + "epoch": 0.9333819596401653, + "grad_norm": 18.5, + "learning_rate": 1.4061678994338449e-06, + "loss": 0.744, + "step": 3839 + }, + { + "epoch": 0.9336250911743253, + "grad_norm": 18.75, + "learning_rate": 1.4056884116610255e-06, + "loss": 1.2328, + "step": 3840 + }, + { + "epoch": 0.9338682227084852, + "grad_norm": 21.0, + "learning_rate": 1.4052089006178843e-06, + "loss": 0.9657, + "step": 3841 + }, + { + "epoch": 0.9341113542426452, + "grad_norm": 20.0, + "learning_rate": 1.4047293663760922e-06, + "loss": 0.7684, + "step": 3842 + }, + { + "epoch": 0.9343544857768052, + "grad_norm": 15.5625, + "learning_rate": 1.4042498090073243e-06, + "loss": 0.5175, + "step": 3843 + }, + { + "epoch": 0.9345976173109652, + "grad_norm": 17.375, + "learning_rate": 1.4037702285832586e-06, + "loss": 0.9267, + "step": 3844 + }, + { + "epoch": 0.9348407488451252, + "grad_norm": 13.1875, + "learning_rate": 1.403290625175576e-06, + "loss": 0.3068, + "step": 3845 + }, + { + "epoch": 0.9350838803792852, + "grad_norm": 21.25, + "learning_rate": 1.4028109988559624e-06, + "loss": 1.2203, + "step": 3846 + }, + { + "epoch": 0.9353270119134451, + "grad_norm": 23.75, + "learning_rate": 1.4023313496961059e-06, + "loss": 1.0322, + "step": 3847 + }, + { + "epoch": 0.9355701434476051, + "grad_norm": 18.875, + "learning_rate": 1.401851677767698e-06, + "loss": 0.6709, + "step": 3848 + }, + { + "epoch": 0.9358132749817651, + "grad_norm": 22.875, + "learning_rate": 1.4013719831424347e-06, + "loss": 0.9684, + "step": 3849 + }, + { + "epoch": 0.9360564065159251, + "grad_norm": 20.625, + "learning_rate": 1.4008922658920138e-06, + "loss": 0.8002, + "step": 3850 + }, + { + "epoch": 0.9362995380500851, + "grad_norm": 20.125, + "learning_rate": 1.4004125260881383e-06, + "loss": 1.0745, + "step": 3851 + }, + { + "epoch": 0.936542669584245, + "grad_norm": 12.8125, + "learning_rate": 1.3999327638025128e-06, + "loss": 0.4899, + "step": 3852 + }, + { + "epoch": 0.936785801118405, + "grad_norm": 17.375, + "learning_rate": 1.3994529791068461e-06, + "loss": 0.5899, + "step": 3853 + }, + { + "epoch": 0.937028932652565, + "grad_norm": 19.125, + "learning_rate": 1.398973172072851e-06, + "loss": 0.694, + "step": 3854 + }, + { + "epoch": 0.937272064186725, + "grad_norm": 24.5, + "learning_rate": 1.3984933427722419e-06, + "loss": 0.9549, + "step": 3855 + }, + { + "epoch": 0.937515195720885, + "grad_norm": 19.0, + "learning_rate": 1.3980134912767385e-06, + "loss": 0.899, + "step": 3856 + }, + { + "epoch": 0.9377583272550449, + "grad_norm": 16.5, + "learning_rate": 1.3975336176580625e-06, + "loss": 0.9902, + "step": 3857 + }, + { + "epoch": 0.9380014587892049, + "grad_norm": 21.25, + "learning_rate": 1.3970537219879395e-06, + "loss": 0.5704, + "step": 3858 + }, + { + "epoch": 0.9382445903233649, + "grad_norm": 17.25, + "learning_rate": 1.396573804338098e-06, + "loss": 0.6479, + "step": 3859 + }, + { + "epoch": 0.9384877218575249, + "grad_norm": 16.625, + "learning_rate": 1.3960938647802699e-06, + "loss": 1.0088, + "step": 3860 + }, + { + "epoch": 0.9387308533916849, + "grad_norm": 16.875, + "learning_rate": 1.3956139033861904e-06, + "loss": 0.8491, + "step": 3861 + }, + { + "epoch": 0.9389739849258448, + "grad_norm": 17.0, + "learning_rate": 1.3951339202275984e-06, + "loss": 0.7275, + "step": 3862 + }, + { + "epoch": 0.9392171164600048, + "grad_norm": 16.625, + "learning_rate": 1.394653915376236e-06, + "loss": 0.8649, + "step": 3863 + }, + { + "epoch": 0.9394602479941648, + "grad_norm": 19.125, + "learning_rate": 1.3941738889038467e-06, + "loss": 0.7899, + "step": 3864 + }, + { + "epoch": 0.9397033795283248, + "grad_norm": 24.875, + "learning_rate": 1.3936938408821804e-06, + "loss": 1.1052, + "step": 3865 + }, + { + "epoch": 0.9399465110624848, + "grad_norm": 16.125, + "learning_rate": 1.3932137713829878e-06, + "loss": 0.974, + "step": 3866 + }, + { + "epoch": 0.9401896425966448, + "grad_norm": 16.25, + "learning_rate": 1.3927336804780235e-06, + "loss": 0.5647, + "step": 3867 + }, + { + "epoch": 0.9404327741308047, + "grad_norm": 15.125, + "learning_rate": 1.3922535682390453e-06, + "loss": 0.678, + "step": 3868 + }, + { + "epoch": 0.9406759056649647, + "grad_norm": 20.625, + "learning_rate": 1.391773434737815e-06, + "loss": 0.7141, + "step": 3869 + }, + { + "epoch": 0.9409190371991247, + "grad_norm": 21.5, + "learning_rate": 1.3912932800460965e-06, + "loss": 0.9183, + "step": 3870 + }, + { + "epoch": 0.9411621687332847, + "grad_norm": 15.625, + "learning_rate": 1.3908131042356568e-06, + "loss": 0.6303, + "step": 3871 + }, + { + "epoch": 0.9414053002674447, + "grad_norm": 18.0, + "learning_rate": 1.3903329073782668e-06, + "loss": 0.7484, + "step": 3872 + }, + { + "epoch": 0.9416484318016046, + "grad_norm": 19.625, + "learning_rate": 1.3898526895457e-06, + "loss": 0.9081, + "step": 3873 + }, + { + "epoch": 0.9418915633357646, + "grad_norm": 20.5, + "learning_rate": 1.3893724508097334e-06, + "loss": 1.2226, + "step": 3874 + }, + { + "epoch": 0.9421346948699246, + "grad_norm": 17.75, + "learning_rate": 1.3888921912421473e-06, + "loss": 0.7871, + "step": 3875 + }, + { + "epoch": 0.9423778264040846, + "grad_norm": 16.875, + "learning_rate": 1.388411910914724e-06, + "loss": 0.8181, + "step": 3876 + }, + { + "epoch": 0.9426209579382446, + "grad_norm": 22.625, + "learning_rate": 1.3879316098992507e-06, + "loss": 0.9265, + "step": 3877 + }, + { + "epoch": 0.9428640894724045, + "grad_norm": 15.875, + "learning_rate": 1.3874512882675156e-06, + "loss": 0.4322, + "step": 3878 + }, + { + "epoch": 0.9431072210065645, + "grad_norm": 17.75, + "learning_rate": 1.386970946091312e-06, + "loss": 0.6555, + "step": 3879 + }, + { + "epoch": 0.9433503525407245, + "grad_norm": 20.125, + "learning_rate": 1.3864905834424348e-06, + "loss": 0.8817, + "step": 3880 + }, + { + "epoch": 0.9435934840748845, + "grad_norm": 19.0, + "learning_rate": 1.3860102003926827e-06, + "loss": 0.8455, + "step": 3881 + }, + { + "epoch": 0.9438366156090445, + "grad_norm": 17.125, + "learning_rate": 1.3855297970138571e-06, + "loss": 0.6743, + "step": 3882 + }, + { + "epoch": 0.9440797471432045, + "grad_norm": 21.5, + "learning_rate": 1.3850493733777622e-06, + "loss": 1.0024, + "step": 3883 + }, + { + "epoch": 0.9443228786773644, + "grad_norm": 20.875, + "learning_rate": 1.384568929556207e-06, + "loss": 0.8406, + "step": 3884 + }, + { + "epoch": 0.9445660102115244, + "grad_norm": 18.5, + "learning_rate": 1.3840884656210007e-06, + "loss": 1.0866, + "step": 3885 + }, + { + "epoch": 0.9448091417456844, + "grad_norm": 17.875, + "learning_rate": 1.3836079816439575e-06, + "loss": 0.8666, + "step": 3886 + }, + { + "epoch": 0.9450522732798444, + "grad_norm": 19.625, + "learning_rate": 1.3831274776968936e-06, + "loss": 0.9899, + "step": 3887 + }, + { + "epoch": 0.9452954048140044, + "grad_norm": 21.125, + "learning_rate": 1.3826469538516292e-06, + "loss": 1.2078, + "step": 3888 + }, + { + "epoch": 0.9455385363481643, + "grad_norm": 17.375, + "learning_rate": 1.382166410179987e-06, + "loss": 0.8049, + "step": 3889 + }, + { + "epoch": 0.9457816678823243, + "grad_norm": 24.125, + "learning_rate": 1.381685846753792e-06, + "loss": 0.9659, + "step": 3890 + }, + { + "epoch": 0.9460247994164843, + "grad_norm": 16.25, + "learning_rate": 1.3812052636448728e-06, + "loss": 0.4699, + "step": 3891 + }, + { + "epoch": 0.9462679309506443, + "grad_norm": 17.625, + "learning_rate": 1.380724660925061e-06, + "loss": 0.905, + "step": 3892 + }, + { + "epoch": 0.9465110624848043, + "grad_norm": 19.0, + "learning_rate": 1.3802440386661908e-06, + "loss": 0.5628, + "step": 3893 + }, + { + "epoch": 0.9467541940189642, + "grad_norm": 16.5, + "learning_rate": 1.3797633969401e-06, + "loss": 0.7007, + "step": 3894 + }, + { + "epoch": 0.9469973255531242, + "grad_norm": 19.375, + "learning_rate": 1.3792827358186277e-06, + "loss": 1.1727, + "step": 3895 + }, + { + "epoch": 0.9472404570872842, + "grad_norm": 16.125, + "learning_rate": 1.3788020553736186e-06, + "loss": 0.7185, + "step": 3896 + }, + { + "epoch": 0.9474835886214442, + "grad_norm": 14.4375, + "learning_rate": 1.3783213556769177e-06, + "loss": 0.4945, + "step": 3897 + }, + { + "epoch": 0.9477267201556042, + "grad_norm": 18.0, + "learning_rate": 1.3778406368003735e-06, + "loss": 0.5671, + "step": 3898 + }, + { + "epoch": 0.9479698516897641, + "grad_norm": 25.25, + "learning_rate": 1.3773598988158386e-06, + "loss": 1.0576, + "step": 3899 + }, + { + "epoch": 0.9482129832239241, + "grad_norm": 22.5, + "learning_rate": 1.3768791417951671e-06, + "loss": 0.6711, + "step": 3900 + }, + { + "epoch": 0.9484561147580841, + "grad_norm": 20.0, + "learning_rate": 1.3763983658102168e-06, + "loss": 0.7188, + "step": 3901 + }, + { + "epoch": 0.9486992462922441, + "grad_norm": 23.375, + "learning_rate": 1.3759175709328476e-06, + "loss": 1.1597, + "step": 3902 + }, + { + "epoch": 0.9489423778264041, + "grad_norm": 18.625, + "learning_rate": 1.3754367572349225e-06, + "loss": 0.7496, + "step": 3903 + }, + { + "epoch": 0.9491855093605641, + "grad_norm": 20.875, + "learning_rate": 1.374955924788308e-06, + "loss": 0.664, + "step": 3904 + }, + { + "epoch": 0.949428640894724, + "grad_norm": 17.25, + "learning_rate": 1.3744750736648724e-06, + "loss": 0.7426, + "step": 3905 + }, + { + "epoch": 0.949671772428884, + "grad_norm": 15.875, + "learning_rate": 1.3739942039364876e-06, + "loss": 0.5051, + "step": 3906 + }, + { + "epoch": 0.949914903963044, + "grad_norm": 22.625, + "learning_rate": 1.3735133156750268e-06, + "loss": 1.3829, + "step": 3907 + }, + { + "epoch": 0.950158035497204, + "grad_norm": 23.125, + "learning_rate": 1.3730324089523683e-06, + "loss": 0.8434, + "step": 3908 + }, + { + "epoch": 0.950401167031364, + "grad_norm": 20.625, + "learning_rate": 1.3725514838403914e-06, + "loss": 0.5742, + "step": 3909 + }, + { + "epoch": 0.9506442985655239, + "grad_norm": 23.0, + "learning_rate": 1.3720705404109787e-06, + "loss": 1.0885, + "step": 3910 + }, + { + "epoch": 0.9508874300996839, + "grad_norm": 25.375, + "learning_rate": 1.3715895787360155e-06, + "loss": 1.2471, + "step": 3911 + }, + { + "epoch": 0.9511305616338439, + "grad_norm": 19.25, + "learning_rate": 1.3711085988873898e-06, + "loss": 0.6364, + "step": 3912 + }, + { + "epoch": 0.9513736931680039, + "grad_norm": 18.5, + "learning_rate": 1.3706276009369925e-06, + "loss": 0.5563, + "step": 3913 + }, + { + "epoch": 0.9516168247021639, + "grad_norm": 17.75, + "learning_rate": 1.3701465849567167e-06, + "loss": 0.7277, + "step": 3914 + }, + { + "epoch": 0.9518599562363238, + "grad_norm": 30.875, + "learning_rate": 1.3696655510184592e-06, + "loss": 1.1632, + "step": 3915 + }, + { + "epoch": 0.9521030877704838, + "grad_norm": 16.375, + "learning_rate": 1.369184499194118e-06, + "loss": 0.7922, + "step": 3916 + }, + { + "epoch": 0.9523462193046438, + "grad_norm": 22.125, + "learning_rate": 1.3687034295555951e-06, + "loss": 1.055, + "step": 3917 + }, + { + "epoch": 0.9525893508388038, + "grad_norm": 21.375, + "learning_rate": 1.3682223421747948e-06, + "loss": 0.8957, + "step": 3918 + }, + { + "epoch": 0.9528324823729638, + "grad_norm": 27.625, + "learning_rate": 1.3677412371236232e-06, + "loss": 0.9606, + "step": 3919 + }, + { + "epoch": 0.9530756139071238, + "grad_norm": 14.9375, + "learning_rate": 1.367260114473991e-06, + "loss": 0.4122, + "step": 3920 + }, + { + "epoch": 0.9533187454412837, + "grad_norm": 17.0, + "learning_rate": 1.3667789742978089e-06, + "loss": 0.7189, + "step": 3921 + }, + { + "epoch": 0.9535618769754437, + "grad_norm": 16.75, + "learning_rate": 1.3662978166669924e-06, + "loss": 0.6071, + "step": 3922 + }, + { + "epoch": 0.9538050085096037, + "grad_norm": 22.0, + "learning_rate": 1.3658166416534588e-06, + "loss": 0.8416, + "step": 3923 + }, + { + "epoch": 0.9540481400437637, + "grad_norm": 20.25, + "learning_rate": 1.3653354493291276e-06, + "loss": 0.9223, + "step": 3924 + }, + { + "epoch": 0.9542912715779237, + "grad_norm": 14.75, + "learning_rate": 1.364854239765922e-06, + "loss": 0.494, + "step": 3925 + }, + { + "epoch": 0.9545344031120836, + "grad_norm": 21.875, + "learning_rate": 1.3643730130357662e-06, + "loss": 0.9353, + "step": 3926 + }, + { + "epoch": 0.9547775346462436, + "grad_norm": 18.5, + "learning_rate": 1.3638917692105888e-06, + "loss": 0.7108, + "step": 3927 + }, + { + "epoch": 0.9550206661804036, + "grad_norm": 22.375, + "learning_rate": 1.3634105083623191e-06, + "loss": 0.9413, + "step": 3928 + }, + { + "epoch": 0.9552637977145636, + "grad_norm": 34.0, + "learning_rate": 1.3629292305628905e-06, + "loss": 0.3229, + "step": 3929 + }, + { + "epoch": 0.9555069292487236, + "grad_norm": 18.5, + "learning_rate": 1.362447935884238e-06, + "loss": 1.3861, + "step": 3930 + }, + { + "epoch": 0.9557500607828835, + "grad_norm": 19.375, + "learning_rate": 1.3619666243982993e-06, + "loss": 0.6782, + "step": 3931 + }, + { + "epoch": 0.9559931923170435, + "grad_norm": 16.875, + "learning_rate": 1.361485296177015e-06, + "loss": 1.1356, + "step": 3932 + }, + { + "epoch": 0.9562363238512035, + "grad_norm": 18.75, + "learning_rate": 1.3610039512923278e-06, + "loss": 0.9059, + "step": 3933 + }, + { + "epoch": 0.9564794553853635, + "grad_norm": 18.25, + "learning_rate": 1.3605225898161828e-06, + "loss": 0.7756, + "step": 3934 + }, + { + "epoch": 0.9567225869195235, + "grad_norm": 17.125, + "learning_rate": 1.360041211820528e-06, + "loss": 0.6717, + "step": 3935 + }, + { + "epoch": 0.9569657184536834, + "grad_norm": 16.0, + "learning_rate": 1.3595598173773137e-06, + "loss": 0.621, + "step": 3936 + }, + { + "epoch": 0.9572088499878434, + "grad_norm": 36.25, + "learning_rate": 1.3590784065584927e-06, + "loss": 1.2804, + "step": 3937 + }, + { + "epoch": 0.9574519815220034, + "grad_norm": 21.5, + "learning_rate": 1.3585969794360197e-06, + "loss": 0.6779, + "step": 3938 + }, + { + "epoch": 0.9576951130561634, + "grad_norm": 19.375, + "learning_rate": 1.3581155360818526e-06, + "loss": 1.0433, + "step": 3939 + }, + { + "epoch": 0.9579382445903234, + "grad_norm": 20.5, + "learning_rate": 1.3576340765679516e-06, + "loss": 1.1591, + "step": 3940 + }, + { + "epoch": 0.9581813761244834, + "grad_norm": 14.9375, + "learning_rate": 1.3571526009662784e-06, + "loss": 0.6295, + "step": 3941 + }, + { + "epoch": 0.9584245076586433, + "grad_norm": 15.625, + "learning_rate": 1.356671109348799e-06, + "loss": 0.7017, + "step": 3942 + }, + { + "epoch": 0.9586676391928033, + "grad_norm": 23.875, + "learning_rate": 1.3561896017874799e-06, + "loss": 0.8728, + "step": 3943 + }, + { + "epoch": 0.9589107707269633, + "grad_norm": 41.75, + "learning_rate": 1.355708078354291e-06, + "loss": 0.951, + "step": 3944 + }, + { + "epoch": 0.9591539022611233, + "grad_norm": 23.125, + "learning_rate": 1.3552265391212038e-06, + "loss": 1.0707, + "step": 3945 + }, + { + "epoch": 0.9593970337952833, + "grad_norm": 19.625, + "learning_rate": 1.3547449841601935e-06, + "loss": 1.3283, + "step": 3946 + }, + { + "epoch": 0.9596401653294432, + "grad_norm": 21.0, + "learning_rate": 1.354263413543236e-06, + "loss": 0.6235, + "step": 3947 + }, + { + "epoch": 0.9598832968636032, + "grad_norm": 21.125, + "learning_rate": 1.3537818273423103e-06, + "loss": 0.6245, + "step": 3948 + }, + { + "epoch": 0.9601264283977632, + "grad_norm": 25.5, + "learning_rate": 1.3533002256293987e-06, + "loss": 0.8943, + "step": 3949 + }, + { + "epoch": 0.9603695599319232, + "grad_norm": 19.75, + "learning_rate": 1.352818608476484e-06, + "loss": 0.9066, + "step": 3950 + }, + { + "epoch": 0.9606126914660832, + "grad_norm": 18.0, + "learning_rate": 1.3523369759555526e-06, + "loss": 0.6345, + "step": 3951 + }, + { + "epoch": 0.9608558230002431, + "grad_norm": 20.625, + "learning_rate": 1.3518553281385929e-06, + "loss": 1.0054, + "step": 3952 + }, + { + "epoch": 0.9610989545344031, + "grad_norm": 18.625, + "learning_rate": 1.3513736650975947e-06, + "loss": 0.3696, + "step": 3953 + }, + { + "epoch": 0.9613420860685631, + "grad_norm": 21.25, + "learning_rate": 1.3508919869045522e-06, + "loss": 0.9926, + "step": 3954 + }, + { + "epoch": 0.9615852176027231, + "grad_norm": 20.625, + "learning_rate": 1.3504102936314594e-06, + "loss": 1.0307, + "step": 3955 + }, + { + "epoch": 0.9618283491368831, + "grad_norm": 25.75, + "learning_rate": 1.3499285853503146e-06, + "loss": 1.0777, + "step": 3956 + }, + { + "epoch": 0.9620714806710431, + "grad_norm": 17.25, + "learning_rate": 1.349446862133116e-06, + "loss": 0.9713, + "step": 3957 + }, + { + "epoch": 0.962314612205203, + "grad_norm": 19.75, + "learning_rate": 1.348965124051867e-06, + "loss": 0.9016, + "step": 3958 + }, + { + "epoch": 0.962557743739363, + "grad_norm": 19.375, + "learning_rate": 1.348483371178571e-06, + "loss": 0.6962, + "step": 3959 + }, + { + "epoch": 0.962800875273523, + "grad_norm": 22.375, + "learning_rate": 1.3480016035852342e-06, + "loss": 1.2241, + "step": 3960 + }, + { + "epoch": 0.963044006807683, + "grad_norm": 18.125, + "learning_rate": 1.3475198213438651e-06, + "loss": 0.7728, + "step": 3961 + }, + { + "epoch": 0.963287138341843, + "grad_norm": 21.375, + "learning_rate": 1.3470380245264744e-06, + "loss": 0.9239, + "step": 3962 + }, + { + "epoch": 0.9635302698760029, + "grad_norm": 26.75, + "learning_rate": 1.3465562132050752e-06, + "loss": 0.9932, + "step": 3963 + }, + { + "epoch": 0.9637734014101629, + "grad_norm": 21.125, + "learning_rate": 1.3460743874516823e-06, + "loss": 0.8634, + "step": 3964 + }, + { + "epoch": 0.9640165329443229, + "grad_norm": 22.375, + "learning_rate": 1.3455925473383128e-06, + "loss": 1.2086, + "step": 3965 + }, + { + "epoch": 0.9642596644784829, + "grad_norm": 18.0, + "learning_rate": 1.3451106929369864e-06, + "loss": 0.6385, + "step": 3966 + }, + { + "epoch": 0.9645027960126429, + "grad_norm": 15.0625, + "learning_rate": 1.3446288243197242e-06, + "loss": 0.7051, + "step": 3967 + }, + { + "epoch": 0.9647459275468028, + "grad_norm": 24.875, + "learning_rate": 1.3441469415585501e-06, + "loss": 1.1945, + "step": 3968 + }, + { + "epoch": 0.9649890590809628, + "grad_norm": 19.5, + "learning_rate": 1.3436650447254892e-06, + "loss": 0.7287, + "step": 3969 + }, + { + "epoch": 0.9652321906151228, + "grad_norm": 16.875, + "learning_rate": 1.3431831338925699e-06, + "loss": 0.6114, + "step": 3970 + }, + { + "epoch": 0.9654753221492828, + "grad_norm": 15.4375, + "learning_rate": 1.3427012091318224e-06, + "loss": 0.8383, + "step": 3971 + }, + { + "epoch": 0.9657184536834428, + "grad_norm": 17.875, + "learning_rate": 1.3422192705152773e-06, + "loss": 0.697, + "step": 3972 + }, + { + "epoch": 0.9659615852176027, + "grad_norm": 26.0, + "learning_rate": 1.3417373181149704e-06, + "loss": 0.8018, + "step": 3973 + }, + { + "epoch": 0.9662047167517627, + "grad_norm": 26.875, + "learning_rate": 1.3412553520029365e-06, + "loss": 1.0745, + "step": 3974 + }, + { + "epoch": 0.9664478482859227, + "grad_norm": 20.625, + "learning_rate": 1.3407733722512144e-06, + "loss": 1.014, + "step": 3975 + }, + { + "epoch": 0.9666909798200827, + "grad_norm": 29.125, + "learning_rate": 1.3402913789318436e-06, + "loss": 1.9196, + "step": 3976 + }, + { + "epoch": 0.9669341113542427, + "grad_norm": 25.875, + "learning_rate": 1.3398093721168672e-06, + "loss": 0.949, + "step": 3977 + }, + { + "epoch": 0.9671772428884027, + "grad_norm": 23.5, + "learning_rate": 1.3393273518783292e-06, + "loss": 1.3465, + "step": 3978 + }, + { + "epoch": 0.9674203744225626, + "grad_norm": 27.5, + "learning_rate": 1.3388453182882757e-06, + "loss": 0.9278, + "step": 3979 + }, + { + "epoch": 0.9676635059567226, + "grad_norm": 40.5, + "learning_rate": 1.3383632714187547e-06, + "loss": 1.2588, + "step": 3980 + }, + { + "epoch": 0.9679066374908826, + "grad_norm": 18.0, + "learning_rate": 1.3378812113418168e-06, + "loss": 0.7095, + "step": 3981 + }, + { + "epoch": 0.9681497690250426, + "grad_norm": 20.125, + "learning_rate": 1.3373991381295142e-06, + "loss": 1.1938, + "step": 3982 + }, + { + "epoch": 0.9683929005592026, + "grad_norm": 19.875, + "learning_rate": 1.3369170518539013e-06, + "loss": 0.9127, + "step": 3983 + }, + { + "epoch": 0.9686360320933625, + "grad_norm": 15.5, + "learning_rate": 1.3364349525870332e-06, + "loss": 0.596, + "step": 3984 + }, + { + "epoch": 0.9688791636275225, + "grad_norm": 18.125, + "learning_rate": 1.3359528404009691e-06, + "loss": 0.7775, + "step": 3985 + }, + { + "epoch": 0.9691222951616825, + "grad_norm": 24.5, + "learning_rate": 1.3354707153677685e-06, + "loss": 0.8741, + "step": 3986 + }, + { + "epoch": 0.9693654266958425, + "grad_norm": 18.25, + "learning_rate": 1.334988577559493e-06, + "loss": 0.9123, + "step": 3987 + }, + { + "epoch": 0.9696085582300025, + "grad_norm": 15.0625, + "learning_rate": 1.3345064270482072e-06, + "loss": 0.5815, + "step": 3988 + }, + { + "epoch": 0.9698516897641624, + "grad_norm": 16.625, + "learning_rate": 1.3340242639059764e-06, + "loss": 0.8815, + "step": 3989 + }, + { + "epoch": 0.9700948212983224, + "grad_norm": 14.625, + "learning_rate": 1.3335420882048683e-06, + "loss": 0.9273, + "step": 3990 + }, + { + "epoch": 0.9703379528324824, + "grad_norm": 17.125, + "learning_rate": 1.3330599000169519e-06, + "loss": 0.6381, + "step": 3991 + }, + { + "epoch": 0.9705810843666424, + "grad_norm": 36.5, + "learning_rate": 1.3325776994142991e-06, + "loss": 1.6005, + "step": 3992 + }, + { + "epoch": 0.9708242159008024, + "grad_norm": 19.75, + "learning_rate": 1.3320954864689831e-06, + "loss": 0.9846, + "step": 3993 + }, + { + "epoch": 0.9710673474349624, + "grad_norm": 19.625, + "learning_rate": 1.3316132612530786e-06, + "loss": 0.9429, + "step": 3994 + }, + { + "epoch": 0.9713104789691223, + "grad_norm": 22.625, + "learning_rate": 1.3311310238386626e-06, + "loss": 0.8868, + "step": 3995 + }, + { + "epoch": 0.9715536105032823, + "grad_norm": 18.875, + "learning_rate": 1.3306487742978142e-06, + "loss": 0.6734, + "step": 3996 + }, + { + "epoch": 0.9717967420374423, + "grad_norm": 21.75, + "learning_rate": 1.3301665127026137e-06, + "loss": 1.0677, + "step": 3997 + }, + { + "epoch": 0.9720398735716023, + "grad_norm": 15.9375, + "learning_rate": 1.329684239125143e-06, + "loss": 0.7513, + "step": 3998 + }, + { + "epoch": 0.9722830051057623, + "grad_norm": 19.0, + "learning_rate": 1.3292019536374866e-06, + "loss": 1.0038, + "step": 3999 + }, + { + "epoch": 0.9725261366399222, + "grad_norm": 16.875, + "learning_rate": 1.3287196563117308e-06, + "loss": 0.9442, + "step": 4000 + }, + { + "epoch": 0.9727692681740822, + "grad_norm": 27.625, + "learning_rate": 1.3282373472199623e-06, + "loss": 0.8517, + "step": 4001 + }, + { + "epoch": 0.9730123997082422, + "grad_norm": 19.75, + "learning_rate": 1.3277550264342714e-06, + "loss": 1.0273, + "step": 4002 + }, + { + "epoch": 0.9732555312424022, + "grad_norm": 18.125, + "learning_rate": 1.3272726940267485e-06, + "loss": 0.5411, + "step": 4003 + }, + { + "epoch": 0.9734986627765622, + "grad_norm": 19.875, + "learning_rate": 1.3267903500694875e-06, + "loss": 0.998, + "step": 4004 + }, + { + "epoch": 0.973741794310722, + "grad_norm": 24.125, + "learning_rate": 1.3263079946345822e-06, + "loss": 0.9674, + "step": 4005 + }, + { + "epoch": 0.973984925844882, + "grad_norm": 21.875, + "learning_rate": 1.3258256277941291e-06, + "loss": 1.068, + "step": 4006 + }, + { + "epoch": 0.9742280573790421, + "grad_norm": 15.875, + "learning_rate": 1.3253432496202267e-06, + "loss": 0.667, + "step": 4007 + }, + { + "epoch": 0.9744711889132021, + "grad_norm": 15.5625, + "learning_rate": 1.3248608601849741e-06, + "loss": 0.3742, + "step": 4008 + }, + { + "epoch": 0.9747143204473621, + "grad_norm": 20.875, + "learning_rate": 1.3243784595604733e-06, + "loss": 0.6048, + "step": 4009 + }, + { + "epoch": 0.974957451981522, + "grad_norm": 22.75, + "learning_rate": 1.323896047818827e-06, + "loss": 1.2635, + "step": 4010 + }, + { + "epoch": 0.975200583515682, + "grad_norm": 17.5, + "learning_rate": 1.3234136250321403e-06, + "loss": 0.6713, + "step": 4011 + }, + { + "epoch": 0.975443715049842, + "grad_norm": 19.375, + "learning_rate": 1.3229311912725193e-06, + "loss": 0.5918, + "step": 4012 + }, + { + "epoch": 0.975686846584002, + "grad_norm": 16.25, + "learning_rate": 1.322448746612072e-06, + "loss": 0.7757, + "step": 4013 + }, + { + "epoch": 0.975929978118162, + "grad_norm": 20.0, + "learning_rate": 1.321966291122909e-06, + "loss": 0.7835, + "step": 4014 + }, + { + "epoch": 0.976173109652322, + "grad_norm": 17.5, + "learning_rate": 1.3214838248771396e-06, + "loss": 1.0936, + "step": 4015 + }, + { + "epoch": 0.9764162411864818, + "grad_norm": 21.0, + "learning_rate": 1.3210013479468791e-06, + "loss": 0.9025, + "step": 4016 + }, + { + "epoch": 0.9766593727206418, + "grad_norm": 23.5, + "learning_rate": 1.3205188604042407e-06, + "loss": 1.0179, + "step": 4017 + }, + { + "epoch": 0.9769025042548019, + "grad_norm": 19.125, + "learning_rate": 1.3200363623213406e-06, + "loss": 0.9663, + "step": 4018 + }, + { + "epoch": 0.9771456357889619, + "grad_norm": 21.5, + "learning_rate": 1.3195538537702965e-06, + "loss": 0.9284, + "step": 4019 + }, + { + "epoch": 0.9773887673231219, + "grad_norm": 16.5, + "learning_rate": 1.319071334823228e-06, + "loss": 0.912, + "step": 4020 + }, + { + "epoch": 0.9776318988572817, + "grad_norm": 19.625, + "learning_rate": 1.3185888055522556e-06, + "loss": 0.7986, + "step": 4021 + }, + { + "epoch": 0.9778750303914417, + "grad_norm": 24.25, + "learning_rate": 1.3181062660295013e-06, + "loss": 1.0828, + "step": 4022 + }, + { + "epoch": 0.9781181619256017, + "grad_norm": 25.0, + "learning_rate": 1.3176237163270893e-06, + "loss": 0.8634, + "step": 4023 + }, + { + "epoch": 0.9783612934597617, + "grad_norm": 14.75, + "learning_rate": 1.3171411565171452e-06, + "loss": 0.7959, + "step": 4024 + }, + { + "epoch": 0.9786044249939218, + "grad_norm": 20.625, + "learning_rate": 1.3166585866717953e-06, + "loss": 0.7384, + "step": 4025 + }, + { + "epoch": 0.9788475565280816, + "grad_norm": 17.125, + "learning_rate": 1.3161760068631691e-06, + "loss": 0.7498, + "step": 4026 + }, + { + "epoch": 0.9790906880622416, + "grad_norm": 22.0, + "learning_rate": 1.315693417163395e-06, + "loss": 0.7234, + "step": 4027 + }, + { + "epoch": 0.9793338195964016, + "grad_norm": 17.0, + "learning_rate": 1.315210817644606e-06, + "loss": 0.9886, + "step": 4028 + }, + { + "epoch": 0.9795769511305616, + "grad_norm": 18.0, + "learning_rate": 1.3147282083789337e-06, + "loss": 0.9663, + "step": 4029 + }, + { + "epoch": 0.9798200826647216, + "grad_norm": 15.5, + "learning_rate": 1.3142455894385125e-06, + "loss": 0.4486, + "step": 4030 + }, + { + "epoch": 0.9800632141988816, + "grad_norm": 15.75, + "learning_rate": 1.3137629608954785e-06, + "loss": 0.8873, + "step": 4031 + }, + { + "epoch": 0.9803063457330415, + "grad_norm": 22.375, + "learning_rate": 1.3132803228219688e-06, + "loss": 0.794, + "step": 4032 + }, + { + "epoch": 0.9805494772672015, + "grad_norm": 13.9375, + "learning_rate": 1.3127976752901222e-06, + "loss": 0.6404, + "step": 4033 + }, + { + "epoch": 0.9807926088013615, + "grad_norm": 17.125, + "learning_rate": 1.312315018372078e-06, + "loss": 0.9471, + "step": 4034 + }, + { + "epoch": 0.9810357403355215, + "grad_norm": 21.0, + "learning_rate": 1.3118323521399787e-06, + "loss": 1.1323, + "step": 4035 + }, + { + "epoch": 0.9812788718696815, + "grad_norm": 12.6875, + "learning_rate": 1.3113496766659661e-06, + "loss": 0.3732, + "step": 4036 + }, + { + "epoch": 0.9815220034038414, + "grad_norm": 21.125, + "learning_rate": 1.3108669920221848e-06, + "loss": 0.6477, + "step": 4037 + }, + { + "epoch": 0.9817651349380014, + "grad_norm": 18.0, + "learning_rate": 1.3103842982807802e-06, + "loss": 0.7884, + "step": 4038 + }, + { + "epoch": 0.9820082664721614, + "grad_norm": 22.25, + "learning_rate": 1.3099015955138997e-06, + "loss": 0.7812, + "step": 4039 + }, + { + "epoch": 0.9822513980063214, + "grad_norm": 17.25, + "learning_rate": 1.3094188837936912e-06, + "loss": 0.9805, + "step": 4040 + }, + { + "epoch": 0.9824945295404814, + "grad_norm": 18.75, + "learning_rate": 1.3089361631923043e-06, + "loss": 0.8195, + "step": 4041 + }, + { + "epoch": 0.9827376610746413, + "grad_norm": 21.75, + "learning_rate": 1.3084534337818896e-06, + "loss": 1.1097, + "step": 4042 + }, + { + "epoch": 0.9829807926088013, + "grad_norm": 28.125, + "learning_rate": 1.3079706956345997e-06, + "loss": 1.3946, + "step": 4043 + }, + { + "epoch": 0.9832239241429613, + "grad_norm": 23.125, + "learning_rate": 1.3074879488225883e-06, + "loss": 0.8835, + "step": 4044 + }, + { + "epoch": 0.9834670556771213, + "grad_norm": 23.125, + "learning_rate": 1.3070051934180106e-06, + "loss": 0.9037, + "step": 4045 + }, + { + "epoch": 0.9837101872112813, + "grad_norm": 17.0, + "learning_rate": 1.3065224294930213e-06, + "loss": 0.6628, + "step": 4046 + }, + { + "epoch": 0.9839533187454412, + "grad_norm": 18.75, + "learning_rate": 1.3060396571197794e-06, + "loss": 0.8511, + "step": 4047 + }, + { + "epoch": 0.9841964502796012, + "grad_norm": 17.75, + "learning_rate": 1.3055568763704425e-06, + "loss": 0.9845, + "step": 4048 + }, + { + "epoch": 0.9844395818137612, + "grad_norm": 17.5, + "learning_rate": 1.3050740873171714e-06, + "loss": 0.873, + "step": 4049 + }, + { + "epoch": 0.9846827133479212, + "grad_norm": 23.375, + "learning_rate": 1.3045912900321264e-06, + "loss": 1.1688, + "step": 4050 + }, + { + "epoch": 0.9849258448820812, + "grad_norm": 18.5, + "learning_rate": 1.3041084845874705e-06, + "loss": 0.71, + "step": 4051 + }, + { + "epoch": 0.9851689764162412, + "grad_norm": 19.5, + "learning_rate": 1.303625671055367e-06, + "loss": 0.6832, + "step": 4052 + }, + { + "epoch": 0.9854121079504011, + "grad_norm": 18.75, + "learning_rate": 1.3031428495079807e-06, + "loss": 0.6501, + "step": 4053 + }, + { + "epoch": 0.9856552394845611, + "grad_norm": 31.5, + "learning_rate": 1.302660020017478e-06, + "loss": 0.9338, + "step": 4054 + }, + { + "epoch": 0.9858983710187211, + "grad_norm": 24.875, + "learning_rate": 1.3021771826560256e-06, + "loss": 0.9302, + "step": 4055 + }, + { + "epoch": 0.9861415025528811, + "grad_norm": 15.9375, + "learning_rate": 1.3016943374957922e-06, + "loss": 1.0072, + "step": 4056 + }, + { + "epoch": 0.9863846340870411, + "grad_norm": 20.875, + "learning_rate": 1.301211484608947e-06, + "loss": 0.7759, + "step": 4057 + }, + { + "epoch": 0.986627765621201, + "grad_norm": 20.5, + "learning_rate": 1.3007286240676614e-06, + "loss": 0.9501, + "step": 4058 + }, + { + "epoch": 0.986870897155361, + "grad_norm": 12.625, + "learning_rate": 1.300245755944107e-06, + "loss": 0.356, + "step": 4059 + }, + { + "epoch": 0.987114028689521, + "grad_norm": 19.75, + "learning_rate": 1.2997628803104563e-06, + "loss": 0.8057, + "step": 4060 + }, + { + "epoch": 0.987357160223681, + "grad_norm": 29.75, + "learning_rate": 1.2992799972388836e-06, + "loss": 0.9597, + "step": 4061 + }, + { + "epoch": 0.987600291757841, + "grad_norm": 18.75, + "learning_rate": 1.2987971068015643e-06, + "loss": 0.9781, + "step": 4062 + }, + { + "epoch": 0.9878434232920009, + "grad_norm": 14.625, + "learning_rate": 1.2983142090706744e-06, + "loss": 0.5611, + "step": 4063 + }, + { + "epoch": 0.9880865548261609, + "grad_norm": 22.125, + "learning_rate": 1.297831304118392e-06, + "loss": 1.1952, + "step": 4064 + }, + { + "epoch": 0.9883296863603209, + "grad_norm": 25.75, + "learning_rate": 1.2973483920168948e-06, + "loss": 1.1019, + "step": 4065 + }, + { + "epoch": 0.9885728178944809, + "grad_norm": 18.375, + "learning_rate": 1.2968654728383629e-06, + "loss": 0.9886, + "step": 4066 + }, + { + "epoch": 0.9888159494286409, + "grad_norm": 16.75, + "learning_rate": 1.2963825466549765e-06, + "loss": 0.7126, + "step": 4067 + }, + { + "epoch": 0.9890590809628009, + "grad_norm": 18.875, + "learning_rate": 1.2958996135389174e-06, + "loss": 0.6975, + "step": 4068 + }, + { + "epoch": 0.9893022124969608, + "grad_norm": 28.125, + "learning_rate": 1.2954166735623682e-06, + "loss": 1.2519, + "step": 4069 + }, + { + "epoch": 0.9895453440311208, + "grad_norm": 16.625, + "learning_rate": 1.294933726797513e-06, + "loss": 0.7895, + "step": 4070 + }, + { + "epoch": 0.9897884755652808, + "grad_norm": 14.5, + "learning_rate": 1.2944507733165367e-06, + "loss": 0.8475, + "step": 4071 + }, + { + "epoch": 0.9900316070994408, + "grad_norm": 63.5, + "learning_rate": 1.293967813191624e-06, + "loss": 1.0635, + "step": 4072 + }, + { + "epoch": 0.9902747386336008, + "grad_norm": 20.875, + "learning_rate": 1.2934848464949625e-06, + "loss": 0.9305, + "step": 4073 + }, + { + "epoch": 0.9905178701677607, + "grad_norm": 20.25, + "learning_rate": 1.29300187329874e-06, + "loss": 0.7422, + "step": 4074 + }, + { + "epoch": 0.9907610017019207, + "grad_norm": 17.5, + "learning_rate": 1.2925188936751443e-06, + "loss": 1.0064, + "step": 4075 + }, + { + "epoch": 0.9910041332360807, + "grad_norm": 17.875, + "learning_rate": 1.2920359076963663e-06, + "loss": 0.786, + "step": 4076 + }, + { + "epoch": 0.9912472647702407, + "grad_norm": 25.125, + "learning_rate": 1.291552915434595e-06, + "loss": 0.7397, + "step": 4077 + }, + { + "epoch": 0.9914903963044007, + "grad_norm": 17.375, + "learning_rate": 1.2910699169620235e-06, + "loss": 0.6564, + "step": 4078 + }, + { + "epoch": 0.9917335278385606, + "grad_norm": 20.375, + "learning_rate": 1.2905869123508435e-06, + "loss": 0.6919, + "step": 4079 + }, + { + "epoch": 0.9919766593727206, + "grad_norm": 14.6875, + "learning_rate": 1.290103901673248e-06, + "loss": 0.3727, + "step": 4080 + }, + { + "epoch": 0.9922197909068806, + "grad_norm": 16.75, + "learning_rate": 1.2896208850014325e-06, + "loss": 0.8732, + "step": 4081 + }, + { + "epoch": 0.9924629224410406, + "grad_norm": 19.625, + "learning_rate": 1.2891378624075912e-06, + "loss": 0.8371, + "step": 4082 + }, + { + "epoch": 0.9927060539752006, + "grad_norm": 17.0, + "learning_rate": 1.2886548339639205e-06, + "loss": 0.646, + "step": 4083 + }, + { + "epoch": 0.9929491855093605, + "grad_norm": 13.8125, + "learning_rate": 1.288171799742617e-06, + "loss": 0.6926, + "step": 4084 + }, + { + "epoch": 0.9931923170435205, + "grad_norm": 18.375, + "learning_rate": 1.287688759815879e-06, + "loss": 0.4883, + "step": 4085 + }, + { + "epoch": 0.9934354485776805, + "grad_norm": 22.5, + "learning_rate": 1.2872057142559049e-06, + "loss": 0.7492, + "step": 4086 + }, + { + "epoch": 0.9936785801118405, + "grad_norm": 18.5, + "learning_rate": 1.2867226631348943e-06, + "loss": 0.9716, + "step": 4087 + }, + { + "epoch": 0.9939217116460005, + "grad_norm": 20.0, + "learning_rate": 1.2862396065250473e-06, + "loss": 1.0586, + "step": 4088 + }, + { + "epoch": 0.9941648431801605, + "grad_norm": 14.5625, + "learning_rate": 1.285756544498565e-06, + "loss": 0.5721, + "step": 4089 + }, + { + "epoch": 0.9944079747143204, + "grad_norm": 22.375, + "learning_rate": 1.2852734771276504e-06, + "loss": 0.9043, + "step": 4090 + }, + { + "epoch": 0.9946511062484804, + "grad_norm": 15.4375, + "learning_rate": 1.284790404484505e-06, + "loss": 0.4759, + "step": 4091 + }, + { + "epoch": 0.9948942377826404, + "grad_norm": 21.0, + "learning_rate": 1.2843073266413323e-06, + "loss": 1.0459, + "step": 4092 + }, + { + "epoch": 0.9951373693168004, + "grad_norm": 20.875, + "learning_rate": 1.2838242436703377e-06, + "loss": 1.2038, + "step": 4093 + }, + { + "epoch": 0.9953805008509604, + "grad_norm": 23.375, + "learning_rate": 1.2833411556437255e-06, + "loss": 0.9746, + "step": 4094 + }, + { + "epoch": 0.9956236323851203, + "grad_norm": 14.9375, + "learning_rate": 1.2828580626337024e-06, + "loss": 0.5975, + "step": 4095 + }, + { + "epoch": 0.9958667639192803, + "grad_norm": 19.875, + "learning_rate": 1.2823749647124733e-06, + "loss": 0.7218, + "step": 4096 + }, + { + "epoch": 0.9961098954534403, + "grad_norm": 16.875, + "learning_rate": 1.2818918619522471e-06, + "loss": 0.5233, + "step": 4097 + }, + { + "epoch": 0.9963530269876003, + "grad_norm": 21.5, + "learning_rate": 1.2814087544252316e-06, + "loss": 0.8869, + "step": 4098 + }, + { + "epoch": 0.9965961585217603, + "grad_norm": 27.875, + "learning_rate": 1.2809256422036351e-06, + "loss": 0.9395, + "step": 4099 + }, + { + "epoch": 0.9968392900559202, + "grad_norm": 20.375, + "learning_rate": 1.2804425253596672e-06, + "loss": 0.9283, + "step": 4100 + }, + { + "epoch": 0.9970824215900802, + "grad_norm": 18.0, + "learning_rate": 1.279959403965538e-06, + "loss": 0.5885, + "step": 4101 + }, + { + "epoch": 0.9973255531242402, + "grad_norm": 21.125, + "learning_rate": 1.2794762780934588e-06, + "loss": 0.929, + "step": 4102 + }, + { + "epoch": 0.9975686846584002, + "grad_norm": 16.75, + "learning_rate": 1.2789931478156406e-06, + "loss": 0.8388, + "step": 4103 + }, + { + "epoch": 0.9978118161925602, + "grad_norm": 16.75, + "learning_rate": 1.2785100132042954e-06, + "loss": 0.6561, + "step": 4104 + }, + { + "epoch": 0.9980549477267202, + "grad_norm": 18.125, + "learning_rate": 1.2780268743316369e-06, + "loss": 0.4373, + "step": 4105 + }, + { + "epoch": 0.9982980792608801, + "grad_norm": 25.625, + "learning_rate": 1.2775437312698776e-06, + "loss": 1.1276, + "step": 4106 + }, + { + "epoch": 0.9985412107950401, + "grad_norm": 23.25, + "learning_rate": 1.277060584091232e-06, + "loss": 1.1631, + "step": 4107 + }, + { + "epoch": 0.9987843423292001, + "grad_norm": 13.0, + "learning_rate": 1.2765774328679147e-06, + "loss": 0.3814, + "step": 4108 + }, + { + "epoch": 0.9990274738633601, + "grad_norm": 21.875, + "learning_rate": 1.2760942776721414e-06, + "loss": 1.1216, + "step": 4109 + }, + { + "epoch": 0.9992706053975201, + "grad_norm": 15.75, + "learning_rate": 1.2756111185761277e-06, + "loss": 0.5756, + "step": 4110 + }, + { + "epoch": 0.99951373693168, + "grad_norm": 17.75, + "learning_rate": 1.2751279556520893e-06, + "loss": 0.7563, + "step": 4111 + }, + { + "epoch": 0.99975686846584, + "grad_norm": 16.375, + "learning_rate": 1.2746447889722446e-06, + "loss": 0.8834, + "step": 4112 + }, + { + "epoch": 1.0, + "grad_norm": 17.875, + "learning_rate": 1.2741616186088103e-06, + "loss": 0.9585, + "step": 4113 + }, + { + "epoch": 1.0, + "eval_loss": 1.1157740354537964, + "eval_runtime": 98.4299, + "eval_samples_per_second": 5.689, + "eval_steps_per_second": 5.689, + "step": 4113 + }, + { + "epoch": 1.00024313153416, + "grad_norm": 16.375, + "learning_rate": 1.273678444634005e-06, + "loss": 0.6687, + "step": 4114 + }, + { + "epoch": 1.00048626306832, + "grad_norm": 20.625, + "learning_rate": 1.273195267120047e-06, + "loss": 0.6081, + "step": 4115 + }, + { + "epoch": 1.00072939460248, + "grad_norm": 15.8125, + "learning_rate": 1.272712086139156e-06, + "loss": 0.9411, + "step": 4116 + }, + { + "epoch": 1.00097252613664, + "grad_norm": 16.875, + "learning_rate": 1.2722289017635515e-06, + "loss": 0.4011, + "step": 4117 + }, + { + "epoch": 1.0012156576708, + "grad_norm": 21.5, + "learning_rate": 1.2717457140654533e-06, + "loss": 1.1048, + "step": 4118 + }, + { + "epoch": 1.0014587892049598, + "grad_norm": 21.625, + "learning_rate": 1.2712625231170827e-06, + "loss": 0.9995, + "step": 4119 + }, + { + "epoch": 1.0017019207391198, + "grad_norm": 20.0, + "learning_rate": 1.2707793289906609e-06, + "loss": 0.7247, + "step": 4120 + }, + { + "epoch": 1.0019450522732798, + "grad_norm": 16.375, + "learning_rate": 1.2702961317584096e-06, + "loss": 0.7889, + "step": 4121 + }, + { + "epoch": 1.0021881838074398, + "grad_norm": 20.0, + "learning_rate": 1.2698129314925508e-06, + "loss": 0.7779, + "step": 4122 + }, + { + "epoch": 1.0024313153415998, + "grad_norm": 17.125, + "learning_rate": 1.269329728265307e-06, + "loss": 0.5287, + "step": 4123 + }, + { + "epoch": 1.0026744468757598, + "grad_norm": 24.625, + "learning_rate": 1.2688465221489018e-06, + "loss": 1.009, + "step": 4124 + }, + { + "epoch": 1.0029175784099198, + "grad_norm": 15.3125, + "learning_rate": 1.2683633132155582e-06, + "loss": 0.6406, + "step": 4125 + }, + { + "epoch": 1.0031607099440798, + "grad_norm": 17.125, + "learning_rate": 1.2678801015375003e-06, + "loss": 0.7235, + "step": 4126 + }, + { + "epoch": 1.0034038414782398, + "grad_norm": 15.75, + "learning_rate": 1.2673968871869524e-06, + "loss": 0.5363, + "step": 4127 + }, + { + "epoch": 1.0036469730123998, + "grad_norm": 14.5625, + "learning_rate": 1.2669136702361396e-06, + "loss": 0.5582, + "step": 4128 + }, + { + "epoch": 1.0038901045465596, + "grad_norm": 21.0, + "learning_rate": 1.2664304507572864e-06, + "loss": 0.722, + "step": 4129 + }, + { + "epoch": 1.0041332360807196, + "grad_norm": 17.25, + "learning_rate": 1.2659472288226185e-06, + "loss": 0.5628, + "step": 4130 + }, + { + "epoch": 1.0043763676148796, + "grad_norm": 15.125, + "learning_rate": 1.265464004504362e-06, + "loss": 0.4223, + "step": 4131 + }, + { + "epoch": 1.0046194991490396, + "grad_norm": 20.25, + "learning_rate": 1.2649807778747428e-06, + "loss": 0.5436, + "step": 4132 + }, + { + "epoch": 1.0048626306831996, + "grad_norm": 22.75, + "learning_rate": 1.2644975490059875e-06, + "loss": 0.5505, + "step": 4133 + }, + { + "epoch": 1.0051057622173596, + "grad_norm": 19.25, + "learning_rate": 1.2640143179703235e-06, + "loss": 0.8603, + "step": 4134 + }, + { + "epoch": 1.0053488937515196, + "grad_norm": 20.75, + "learning_rate": 1.2635310848399773e-06, + "loss": 0.714, + "step": 4135 + }, + { + "epoch": 1.0055920252856796, + "grad_norm": 21.0, + "learning_rate": 1.2630478496871771e-06, + "loss": 0.6515, + "step": 4136 + }, + { + "epoch": 1.0058351568198396, + "grad_norm": 50.0, + "learning_rate": 1.26256461258415e-06, + "loss": 1.0408, + "step": 4137 + }, + { + "epoch": 1.0060782883539996, + "grad_norm": 17.0, + "learning_rate": 1.2620813736031248e-06, + "loss": 0.7504, + "step": 4138 + }, + { + "epoch": 1.0063214198881596, + "grad_norm": 21.625, + "learning_rate": 1.2615981328163296e-06, + "loss": 0.8906, + "step": 4139 + }, + { + "epoch": 1.0065645514223194, + "grad_norm": 18.25, + "learning_rate": 1.2611148902959932e-06, + "loss": 0.8087, + "step": 4140 + }, + { + "epoch": 1.0068076829564794, + "grad_norm": 16.125, + "learning_rate": 1.2606316461143447e-06, + "loss": 0.5794, + "step": 4141 + }, + { + "epoch": 1.0070508144906394, + "grad_norm": 29.0, + "learning_rate": 1.2601484003436127e-06, + "loss": 1.214, + "step": 4142 + }, + { + "epoch": 1.0072939460247994, + "grad_norm": 18.75, + "learning_rate": 1.2596651530560273e-06, + "loss": 0.5319, + "step": 4143 + }, + { + "epoch": 1.0075370775589594, + "grad_norm": 16.5, + "learning_rate": 1.2591819043238177e-06, + "loss": 0.7777, + "step": 4144 + }, + { + "epoch": 1.0077802090931194, + "grad_norm": 19.75, + "learning_rate": 1.258698654219214e-06, + "loss": 0.7306, + "step": 4145 + }, + { + "epoch": 1.0080233406272794, + "grad_norm": 21.375, + "learning_rate": 1.2582154028144457e-06, + "loss": 0.7118, + "step": 4146 + }, + { + "epoch": 1.0082664721614394, + "grad_norm": 19.5, + "learning_rate": 1.257732150181744e-06, + "loss": 0.9558, + "step": 4147 + }, + { + "epoch": 1.0085096036955994, + "grad_norm": 15.8125, + "learning_rate": 1.2572488963933394e-06, + "loss": 0.5373, + "step": 4148 + }, + { + "epoch": 1.0087527352297594, + "grad_norm": 21.125, + "learning_rate": 1.2567656415214614e-06, + "loss": 0.6095, + "step": 4149 + }, + { + "epoch": 1.0089958667639194, + "grad_norm": 18.75, + "learning_rate": 1.2562823856383415e-06, + "loss": 0.7064, + "step": 4150 + }, + { + "epoch": 1.0092389982980792, + "grad_norm": 23.5, + "learning_rate": 1.2557991288162106e-06, + "loss": 0.8255, + "step": 4151 + }, + { + "epoch": 1.0094821298322392, + "grad_norm": 30.625, + "learning_rate": 1.2553158711272997e-06, + "loss": 1.0326, + "step": 4152 + }, + { + "epoch": 1.0097252613663992, + "grad_norm": 17.875, + "learning_rate": 1.2548326126438403e-06, + "loss": 0.6889, + "step": 4153 + }, + { + "epoch": 1.0099683929005592, + "grad_norm": 23.25, + "learning_rate": 1.2543493534380632e-06, + "loss": 0.9978, + "step": 4154 + }, + { + "epoch": 1.0102115244347192, + "grad_norm": 19.0, + "learning_rate": 1.2538660935822004e-06, + "loss": 0.6695, + "step": 4155 + }, + { + "epoch": 1.0104546559688792, + "grad_norm": 16.5, + "learning_rate": 1.253382833148483e-06, + "loss": 0.4427, + "step": 4156 + }, + { + "epoch": 1.0106977875030392, + "grad_norm": 13.75, + "learning_rate": 1.2528995722091424e-06, + "loss": 0.296, + "step": 4157 + }, + { + "epoch": 1.0109409190371992, + "grad_norm": 14.0625, + "learning_rate": 1.2524163108364113e-06, + "loss": 0.385, + "step": 4158 + }, + { + "epoch": 1.0111840505713592, + "grad_norm": 19.5, + "learning_rate": 1.2519330491025204e-06, + "loss": 0.7604, + "step": 4159 + }, + { + "epoch": 1.0114271821055192, + "grad_norm": 15.375, + "learning_rate": 1.251449787079702e-06, + "loss": 0.4404, + "step": 4160 + }, + { + "epoch": 1.011670313639679, + "grad_norm": 22.375, + "learning_rate": 1.2509665248401882e-06, + "loss": 0.5392, + "step": 4161 + }, + { + "epoch": 1.011913445173839, + "grad_norm": 22.5, + "learning_rate": 1.25048326245621e-06, + "loss": 0.8855, + "step": 4162 + }, + { + "epoch": 1.012156576707999, + "grad_norm": 19.625, + "learning_rate": 1.25e-06, + "loss": 0.6698, + "step": 4163 + }, + { + "epoch": 1.012399708242159, + "grad_norm": 17.875, + "learning_rate": 1.2495167375437902e-06, + "loss": 0.6317, + "step": 4164 + }, + { + "epoch": 1.012642839776319, + "grad_norm": 17.875, + "learning_rate": 1.2490334751598127e-06, + "loss": 0.6481, + "step": 4165 + }, + { + "epoch": 1.012885971310479, + "grad_norm": 20.125, + "learning_rate": 1.2485502129202986e-06, + "loss": 1.0355, + "step": 4166 + }, + { + "epoch": 1.013129102844639, + "grad_norm": 15.0625, + "learning_rate": 1.2480669508974798e-06, + "loss": 0.3629, + "step": 4167 + }, + { + "epoch": 1.013372234378799, + "grad_norm": 22.875, + "learning_rate": 1.247583689163589e-06, + "loss": 1.0129, + "step": 4168 + }, + { + "epoch": 1.013615365912959, + "grad_norm": 21.625, + "learning_rate": 1.2471004277908578e-06, + "loss": 0.9057, + "step": 4169 + }, + { + "epoch": 1.013858497447119, + "grad_norm": 16.5, + "learning_rate": 1.2466171668515174e-06, + "loss": 0.5149, + "step": 4170 + }, + { + "epoch": 1.014101628981279, + "grad_norm": 18.75, + "learning_rate": 1.2461339064178e-06, + "loss": 0.6032, + "step": 4171 + }, + { + "epoch": 1.0143447605154388, + "grad_norm": 29.0, + "learning_rate": 1.2456506465619372e-06, + "loss": 0.7118, + "step": 4172 + }, + { + "epoch": 1.0145878920495988, + "grad_norm": 18.875, + "learning_rate": 1.2451673873561603e-06, + "loss": 0.8595, + "step": 4173 + }, + { + "epoch": 1.0148310235837588, + "grad_norm": 20.5, + "learning_rate": 1.2446841288727005e-06, + "loss": 0.7378, + "step": 4174 + }, + { + "epoch": 1.0150741551179188, + "grad_norm": 17.0, + "learning_rate": 1.2442008711837894e-06, + "loss": 0.4911, + "step": 4175 + }, + { + "epoch": 1.0153172866520788, + "grad_norm": 23.25, + "learning_rate": 1.243717614361659e-06, + "loss": 0.7517, + "step": 4176 + }, + { + "epoch": 1.0155604181862388, + "grad_norm": 19.5, + "learning_rate": 1.2432343584785388e-06, + "loss": 1.1732, + "step": 4177 + }, + { + "epoch": 1.0158035497203988, + "grad_norm": 17.625, + "learning_rate": 1.242751103606661e-06, + "loss": 0.7368, + "step": 4178 + }, + { + "epoch": 1.0160466812545588, + "grad_norm": 16.375, + "learning_rate": 1.2422678498182562e-06, + "loss": 0.5969, + "step": 4179 + }, + { + "epoch": 1.0162898127887188, + "grad_norm": 18.25, + "learning_rate": 1.2417845971855545e-06, + "loss": 0.5698, + "step": 4180 + }, + { + "epoch": 1.0165329443228788, + "grad_norm": 23.25, + "learning_rate": 1.2413013457807865e-06, + "loss": 1.1906, + "step": 4181 + }, + { + "epoch": 1.0167760758570386, + "grad_norm": 17.75, + "learning_rate": 1.240818095676183e-06, + "loss": 0.4573, + "step": 4182 + }, + { + "epoch": 1.0170192073911986, + "grad_norm": 20.25, + "learning_rate": 1.2403348469439731e-06, + "loss": 0.7516, + "step": 4183 + }, + { + "epoch": 1.0172623389253586, + "grad_norm": 26.0, + "learning_rate": 1.2398515996563875e-06, + "loss": 0.4984, + "step": 4184 + }, + { + "epoch": 1.0175054704595186, + "grad_norm": 24.875, + "learning_rate": 1.2393683538856556e-06, + "loss": 0.7951, + "step": 4185 + }, + { + "epoch": 1.0177486019936786, + "grad_norm": 19.375, + "learning_rate": 1.238885109704007e-06, + "loss": 0.9981, + "step": 4186 + }, + { + "epoch": 1.0179917335278386, + "grad_norm": 17.625, + "learning_rate": 1.2384018671836706e-06, + "loss": 0.523, + "step": 4187 + }, + { + "epoch": 1.0182348650619986, + "grad_norm": 20.625, + "learning_rate": 1.2379186263968754e-06, + "loss": 0.646, + "step": 4188 + }, + { + "epoch": 1.0184779965961586, + "grad_norm": 17.0, + "learning_rate": 1.2374353874158506e-06, + "loss": 0.4136, + "step": 4189 + }, + { + "epoch": 1.0187211281303186, + "grad_norm": 21.375, + "learning_rate": 1.2369521503128235e-06, + "loss": 1.1995, + "step": 4190 + }, + { + "epoch": 1.0189642596644786, + "grad_norm": 17.75, + "learning_rate": 1.2364689151600229e-06, + "loss": 0.4988, + "step": 4191 + }, + { + "epoch": 1.0192073911986386, + "grad_norm": 16.375, + "learning_rate": 1.2359856820296767e-06, + "loss": 0.3786, + "step": 4192 + }, + { + "epoch": 1.0194505227327983, + "grad_norm": 22.125, + "learning_rate": 1.2355024509940127e-06, + "loss": 0.737, + "step": 4193 + }, + { + "epoch": 1.0196936542669583, + "grad_norm": 12.5, + "learning_rate": 1.2350192221252576e-06, + "loss": 0.2558, + "step": 4194 + }, + { + "epoch": 1.0199367858011184, + "grad_norm": 20.25, + "learning_rate": 1.2345359954956384e-06, + "loss": 0.7269, + "step": 4195 + }, + { + "epoch": 1.0201799173352784, + "grad_norm": 27.625, + "learning_rate": 1.234052771177382e-06, + "loss": 0.7338, + "step": 4196 + }, + { + "epoch": 1.0204230488694384, + "grad_norm": 21.5, + "learning_rate": 1.2335695492427142e-06, + "loss": 0.9474, + "step": 4197 + }, + { + "epoch": 1.0206661804035984, + "grad_norm": 24.375, + "learning_rate": 1.2330863297638606e-06, + "loss": 0.7627, + "step": 4198 + }, + { + "epoch": 1.0209093119377584, + "grad_norm": 15.5, + "learning_rate": 1.2326031128130476e-06, + "loss": 0.6164, + "step": 4199 + }, + { + "epoch": 1.0211524434719184, + "grad_norm": 21.875, + "learning_rate": 1.2321198984624999e-06, + "loss": 0.335, + "step": 4200 + }, + { + "epoch": 1.0213955750060784, + "grad_norm": 20.875, + "learning_rate": 1.231636686784442e-06, + "loss": 0.5493, + "step": 4201 + }, + { + "epoch": 1.0216387065402384, + "grad_norm": 18.25, + "learning_rate": 1.2311534778510986e-06, + "loss": 0.525, + "step": 4202 + }, + { + "epoch": 1.0218818380743981, + "grad_norm": 19.25, + "learning_rate": 1.2306702717346935e-06, + "loss": 0.8582, + "step": 4203 + }, + { + "epoch": 1.0221249696085581, + "grad_norm": 22.75, + "learning_rate": 1.2301870685074498e-06, + "loss": 0.7255, + "step": 4204 + }, + { + "epoch": 1.0223681011427181, + "grad_norm": 15.375, + "learning_rate": 1.2297038682415909e-06, + "loss": 0.4337, + "step": 4205 + }, + { + "epoch": 1.0226112326768781, + "grad_norm": 33.0, + "learning_rate": 1.2292206710093391e-06, + "loss": 0.6941, + "step": 4206 + }, + { + "epoch": 1.0228543642110381, + "grad_norm": 17.375, + "learning_rate": 1.2287374768829173e-06, + "loss": 0.4654, + "step": 4207 + }, + { + "epoch": 1.0230974957451981, + "grad_norm": 16.25, + "learning_rate": 1.228254285934547e-06, + "loss": 0.3755, + "step": 4208 + }, + { + "epoch": 1.0233406272793582, + "grad_norm": 21.75, + "learning_rate": 1.227771098236449e-06, + "loss": 0.7989, + "step": 4209 + }, + { + "epoch": 1.0235837588135182, + "grad_norm": 23.125, + "learning_rate": 1.2272879138608446e-06, + "loss": 0.7926, + "step": 4210 + }, + { + "epoch": 1.0238268903476782, + "grad_norm": 21.125, + "learning_rate": 1.2268047328799534e-06, + "loss": 0.5802, + "step": 4211 + }, + { + "epoch": 1.0240700218818382, + "grad_norm": 22.0, + "learning_rate": 1.2263215553659953e-06, + "loss": 1.2618, + "step": 4212 + }, + { + "epoch": 1.0243131534159982, + "grad_norm": 15.0, + "learning_rate": 1.2258383813911903e-06, + "loss": 0.7276, + "step": 4213 + }, + { + "epoch": 1.024556284950158, + "grad_norm": 19.125, + "learning_rate": 1.2253552110277554e-06, + "loss": 0.487, + "step": 4214 + }, + { + "epoch": 1.024799416484318, + "grad_norm": 18.125, + "learning_rate": 1.2248720443479107e-06, + "loss": 0.8522, + "step": 4215 + }, + { + "epoch": 1.025042548018478, + "grad_norm": 17.5, + "learning_rate": 1.2243888814238727e-06, + "loss": 0.5451, + "step": 4216 + }, + { + "epoch": 1.025285679552638, + "grad_norm": 18.75, + "learning_rate": 1.223905722327859e-06, + "loss": 0.7036, + "step": 4217 + }, + { + "epoch": 1.025528811086798, + "grad_norm": 22.0, + "learning_rate": 1.2234225671320855e-06, + "loss": 0.7887, + "step": 4218 + }, + { + "epoch": 1.025771942620958, + "grad_norm": 18.75, + "learning_rate": 1.2229394159087682e-06, + "loss": 0.5522, + "step": 4219 + }, + { + "epoch": 1.026015074155118, + "grad_norm": 21.625, + "learning_rate": 1.2224562687301228e-06, + "loss": 0.8437, + "step": 4220 + }, + { + "epoch": 1.026258205689278, + "grad_norm": 23.5, + "learning_rate": 1.2219731256683633e-06, + "loss": 0.7484, + "step": 4221 + }, + { + "epoch": 1.026501337223438, + "grad_norm": 19.875, + "learning_rate": 1.2214899867957048e-06, + "loss": 0.6022, + "step": 4222 + }, + { + "epoch": 1.026744468757598, + "grad_norm": 26.5, + "learning_rate": 1.2210068521843598e-06, + "loss": 0.931, + "step": 4223 + }, + { + "epoch": 1.0269876002917577, + "grad_norm": 22.125, + "learning_rate": 1.2205237219065414e-06, + "loss": 0.7175, + "step": 4224 + }, + { + "epoch": 1.0272307318259177, + "grad_norm": 17.875, + "learning_rate": 1.2200405960344622e-06, + "loss": 0.5514, + "step": 4225 + }, + { + "epoch": 1.0274738633600777, + "grad_norm": 16.125, + "learning_rate": 1.2195574746403332e-06, + "loss": 0.7143, + "step": 4226 + }, + { + "epoch": 1.0277169948942377, + "grad_norm": 17.125, + "learning_rate": 1.2190743577963655e-06, + "loss": 0.5525, + "step": 4227 + }, + { + "epoch": 1.0279601264283977, + "grad_norm": 22.5, + "learning_rate": 1.218591245574769e-06, + "loss": 0.9265, + "step": 4228 + }, + { + "epoch": 1.0282032579625577, + "grad_norm": 24.0, + "learning_rate": 1.218108138047753e-06, + "loss": 0.7215, + "step": 4229 + }, + { + "epoch": 1.0284463894967177, + "grad_norm": 21.875, + "learning_rate": 1.2176250352875267e-06, + "loss": 1.129, + "step": 4230 + }, + { + "epoch": 1.0286895210308777, + "grad_norm": 24.25, + "learning_rate": 1.217141937366298e-06, + "loss": 0.6935, + "step": 4231 + }, + { + "epoch": 1.0289326525650377, + "grad_norm": 23.625, + "learning_rate": 1.2166588443562747e-06, + "loss": 0.9392, + "step": 4232 + }, + { + "epoch": 1.0291757840991977, + "grad_norm": 41.5, + "learning_rate": 1.2161757563296625e-06, + "loss": 0.799, + "step": 4233 + }, + { + "epoch": 1.0294189156333577, + "grad_norm": 35.5, + "learning_rate": 1.2156926733586681e-06, + "loss": 1.0792, + "step": 4234 + }, + { + "epoch": 1.0296620471675175, + "grad_norm": 18.0, + "learning_rate": 1.2152095955154958e-06, + "loss": 0.5697, + "step": 4235 + }, + { + "epoch": 1.0299051787016775, + "grad_norm": 18.875, + "learning_rate": 1.2147265228723502e-06, + "loss": 0.4776, + "step": 4236 + }, + { + "epoch": 1.0301483102358375, + "grad_norm": 26.125, + "learning_rate": 1.2142434555014348e-06, + "loss": 0.8266, + "step": 4237 + }, + { + "epoch": 1.0303914417699975, + "grad_norm": 19.25, + "learning_rate": 1.213760393474953e-06, + "loss": 0.6032, + "step": 4238 + }, + { + "epoch": 1.0306345733041575, + "grad_norm": 32.25, + "learning_rate": 1.2132773368651061e-06, + "loss": 0.8117, + "step": 4239 + }, + { + "epoch": 1.0308777048383175, + "grad_norm": 17.375, + "learning_rate": 1.2127942857440953e-06, + "loss": 0.7527, + "step": 4240 + }, + { + "epoch": 1.0311208363724775, + "grad_norm": 19.875, + "learning_rate": 1.2123112401841212e-06, + "loss": 0.8044, + "step": 4241 + }, + { + "epoch": 1.0313639679066375, + "grad_norm": 21.375, + "learning_rate": 1.2118282002573835e-06, + "loss": 0.7041, + "step": 4242 + }, + { + "epoch": 1.0316070994407975, + "grad_norm": 18.625, + "learning_rate": 1.21134516603608e-06, + "loss": 0.5308, + "step": 4243 + }, + { + "epoch": 1.0318502309749575, + "grad_norm": 19.875, + "learning_rate": 1.2108621375924097e-06, + "loss": 0.7222, + "step": 4244 + }, + { + "epoch": 1.0320933625091175, + "grad_norm": 21.625, + "learning_rate": 1.2103791149985677e-06, + "loss": 0.6742, + "step": 4245 + }, + { + "epoch": 1.0323364940432773, + "grad_norm": 12.5625, + "learning_rate": 1.209896098326752e-06, + "loss": 0.2817, + "step": 4246 + }, + { + "epoch": 1.0325796255774373, + "grad_norm": 16.5, + "learning_rate": 1.209413087649157e-06, + "loss": 0.3466, + "step": 4247 + }, + { + "epoch": 1.0328227571115973, + "grad_norm": 25.375, + "learning_rate": 1.2089300830379767e-06, + "loss": 0.6938, + "step": 4248 + }, + { + "epoch": 1.0330658886457573, + "grad_norm": 26.625, + "learning_rate": 1.2084470845654054e-06, + "loss": 0.9129, + "step": 4249 + }, + { + "epoch": 1.0333090201799173, + "grad_norm": 17.625, + "learning_rate": 1.2079640923036345e-06, + "loss": 0.9526, + "step": 4250 + }, + { + "epoch": 1.0335521517140773, + "grad_norm": 23.125, + "learning_rate": 1.2074811063248564e-06, + "loss": 0.5601, + "step": 4251 + }, + { + "epoch": 1.0337952832482373, + "grad_norm": 19.25, + "learning_rate": 1.2069981267012603e-06, + "loss": 0.8824, + "step": 4252 + }, + { + "epoch": 1.0340384147823973, + "grad_norm": 14.9375, + "learning_rate": 1.2065151535050377e-06, + "loss": 0.4185, + "step": 4253 + }, + { + "epoch": 1.0342815463165573, + "grad_norm": 18.625, + "learning_rate": 1.2060321868083761e-06, + "loss": 0.5302, + "step": 4254 + }, + { + "epoch": 1.0345246778507173, + "grad_norm": 20.0, + "learning_rate": 1.2055492266834637e-06, + "loss": 0.3381, + "step": 4255 + }, + { + "epoch": 1.0347678093848771, + "grad_norm": 22.375, + "learning_rate": 1.205066273202487e-06, + "loss": 0.5787, + "step": 4256 + }, + { + "epoch": 1.0350109409190371, + "grad_norm": 18.625, + "learning_rate": 1.204583326437632e-06, + "loss": 0.6395, + "step": 4257 + }, + { + "epoch": 1.0352540724531971, + "grad_norm": 14.125, + "learning_rate": 1.2041003864610832e-06, + "loss": 0.329, + "step": 4258 + }, + { + "epoch": 1.0354972039873571, + "grad_norm": 20.75, + "learning_rate": 1.2036174533450242e-06, + "loss": 0.575, + "step": 4259 + }, + { + "epoch": 1.0357403355215171, + "grad_norm": 21.0, + "learning_rate": 1.2031345271616376e-06, + "loss": 0.6864, + "step": 4260 + }, + { + "epoch": 1.0359834670556771, + "grad_norm": 20.0, + "learning_rate": 1.2026516079831054e-06, + "loss": 0.7953, + "step": 4261 + }, + { + "epoch": 1.0362265985898371, + "grad_norm": 20.125, + "learning_rate": 1.2021686958816084e-06, + "loss": 0.7184, + "step": 4262 + }, + { + "epoch": 1.0364697301239971, + "grad_norm": 23.0, + "learning_rate": 1.2016857909293258e-06, + "loss": 0.9226, + "step": 4263 + }, + { + "epoch": 1.0367128616581571, + "grad_norm": 22.5, + "learning_rate": 1.2012028931984362e-06, + "loss": 1.0413, + "step": 4264 + }, + { + "epoch": 1.0369559931923171, + "grad_norm": 20.0, + "learning_rate": 1.2007200027611168e-06, + "loss": 0.9578, + "step": 4265 + }, + { + "epoch": 1.0371991247264771, + "grad_norm": 23.125, + "learning_rate": 1.2002371196895444e-06, + "loss": 0.9385, + "step": 4266 + }, + { + "epoch": 1.037442256260637, + "grad_norm": 22.75, + "learning_rate": 1.1997542440558936e-06, + "loss": 0.761, + "step": 4267 + }, + { + "epoch": 1.037685387794797, + "grad_norm": 17.625, + "learning_rate": 1.1992713759323386e-06, + "loss": 0.3711, + "step": 4268 + }, + { + "epoch": 1.037928519328957, + "grad_norm": 15.8125, + "learning_rate": 1.1987885153910527e-06, + "loss": 0.3699, + "step": 4269 + }, + { + "epoch": 1.038171650863117, + "grad_norm": 17.625, + "learning_rate": 1.1983056625042082e-06, + "loss": 0.5926, + "step": 4270 + }, + { + "epoch": 1.038414782397277, + "grad_norm": 21.5, + "learning_rate": 1.1978228173439746e-06, + "loss": 0.7479, + "step": 4271 + }, + { + "epoch": 1.038657913931437, + "grad_norm": 17.875, + "learning_rate": 1.1973399799825222e-06, + "loss": 0.8557, + "step": 4272 + }, + { + "epoch": 1.038901045465597, + "grad_norm": 26.375, + "learning_rate": 1.1968571504920198e-06, + "loss": 1.1538, + "step": 4273 + }, + { + "epoch": 1.039144176999757, + "grad_norm": 380.0, + "learning_rate": 1.1963743289446335e-06, + "loss": 0.7731, + "step": 4274 + }, + { + "epoch": 1.039387308533917, + "grad_norm": 24.5, + "learning_rate": 1.1958915154125303e-06, + "loss": 0.669, + "step": 4275 + }, + { + "epoch": 1.039630440068077, + "grad_norm": 15.375, + "learning_rate": 1.1954087099678736e-06, + "loss": 0.3717, + "step": 4276 + }, + { + "epoch": 1.0398735716022367, + "grad_norm": 21.75, + "learning_rate": 1.194925912682829e-06, + "loss": 0.5048, + "step": 4277 + }, + { + "epoch": 1.0401167031363967, + "grad_norm": 25.0, + "learning_rate": 1.1944431236295577e-06, + "loss": 0.9457, + "step": 4278 + }, + { + "epoch": 1.0403598346705567, + "grad_norm": 17.875, + "learning_rate": 1.1939603428802208e-06, + "loss": 0.564, + "step": 4279 + }, + { + "epoch": 1.0406029662047167, + "grad_norm": 24.25, + "learning_rate": 1.193477570506979e-06, + "loss": 0.8875, + "step": 4280 + }, + { + "epoch": 1.0408460977388767, + "grad_norm": 26.125, + "learning_rate": 1.1929948065819903e-06, + "loss": 0.9343, + "step": 4281 + }, + { + "epoch": 1.0410892292730367, + "grad_norm": 16.0, + "learning_rate": 1.192512051177412e-06, + "loss": 0.5083, + "step": 4282 + }, + { + "epoch": 1.0413323608071967, + "grad_norm": 13.875, + "learning_rate": 1.1920293043654002e-06, + "loss": 0.2945, + "step": 4283 + }, + { + "epoch": 1.0415754923413567, + "grad_norm": 15.625, + "learning_rate": 1.1915465662181109e-06, + "loss": 0.499, + "step": 4284 + }, + { + "epoch": 1.0418186238755167, + "grad_norm": 15.625, + "learning_rate": 1.1910638368076963e-06, + "loss": 0.3521, + "step": 4285 + }, + { + "epoch": 1.0420617554096767, + "grad_norm": 17.5, + "learning_rate": 1.1905811162063093e-06, + "loss": 0.6663, + "step": 4286 + }, + { + "epoch": 1.0423048869438367, + "grad_norm": 17.125, + "learning_rate": 1.1900984044861008e-06, + "loss": 0.8063, + "step": 4287 + }, + { + "epoch": 1.0425480184779965, + "grad_norm": 18.0, + "learning_rate": 1.18961570171922e-06, + "loss": 0.6233, + "step": 4288 + }, + { + "epoch": 1.0427911500121565, + "grad_norm": 17.375, + "learning_rate": 1.1891330079778156e-06, + "loss": 0.4607, + "step": 4289 + }, + { + "epoch": 1.0430342815463165, + "grad_norm": 18.375, + "learning_rate": 1.1886503233340347e-06, + "loss": 0.6102, + "step": 4290 + }, + { + "epoch": 1.0432774130804765, + "grad_norm": 14.375, + "learning_rate": 1.1881676478600217e-06, + "loss": 0.5538, + "step": 4291 + }, + { + "epoch": 1.0435205446146365, + "grad_norm": 22.0, + "learning_rate": 1.1876849816279222e-06, + "loss": 0.7654, + "step": 4292 + }, + { + "epoch": 1.0437636761487965, + "grad_norm": 18.875, + "learning_rate": 1.187202324709878e-06, + "loss": 0.8974, + "step": 4293 + }, + { + "epoch": 1.0440068076829565, + "grad_norm": 21.125, + "learning_rate": 1.1867196771780314e-06, + "loss": 0.5014, + "step": 4294 + }, + { + "epoch": 1.0442499392171165, + "grad_norm": 21.875, + "learning_rate": 1.1862370391045217e-06, + "loss": 0.7386, + "step": 4295 + }, + { + "epoch": 1.0444930707512765, + "grad_norm": 18.0, + "learning_rate": 1.185754410561488e-06, + "loss": 0.5077, + "step": 4296 + }, + { + "epoch": 1.0447362022854365, + "grad_norm": 21.375, + "learning_rate": 1.1852717916210672e-06, + "loss": 0.7701, + "step": 4297 + }, + { + "epoch": 1.0449793338195965, + "grad_norm": 17.125, + "learning_rate": 1.1847891823553947e-06, + "loss": 0.4472, + "step": 4298 + }, + { + "epoch": 1.0452224653537563, + "grad_norm": 27.125, + "learning_rate": 1.184306582836605e-06, + "loss": 0.6802, + "step": 4299 + }, + { + "epoch": 1.0454655968879163, + "grad_norm": 19.75, + "learning_rate": 1.183823993136831e-06, + "loss": 0.7162, + "step": 4300 + }, + { + "epoch": 1.0457087284220763, + "grad_norm": 16.25, + "learning_rate": 1.1833414133282049e-06, + "loss": 0.3975, + "step": 4301 + }, + { + "epoch": 1.0459518599562363, + "grad_norm": 18.625, + "learning_rate": 1.1828588434828552e-06, + "loss": 0.7045, + "step": 4302 + }, + { + "epoch": 1.0461949914903963, + "grad_norm": 16.125, + "learning_rate": 1.182376283672911e-06, + "loss": 0.3439, + "step": 4303 + }, + { + "epoch": 1.0464381230245563, + "grad_norm": 25.25, + "learning_rate": 1.1818937339704995e-06, + "loss": 0.6235, + "step": 4304 + }, + { + "epoch": 1.0466812545587163, + "grad_norm": 27.75, + "learning_rate": 1.1814111944477452e-06, + "loss": 0.7153, + "step": 4305 + }, + { + "epoch": 1.0469243860928763, + "grad_norm": 26.625, + "learning_rate": 1.1809286651767723e-06, + "loss": 1.2673, + "step": 4306 + }, + { + "epoch": 1.0471675176270363, + "grad_norm": 22.125, + "learning_rate": 1.1804461462297035e-06, + "loss": 0.7497, + "step": 4307 + }, + { + "epoch": 1.0474106491611963, + "grad_norm": 20.125, + "learning_rate": 1.1799636376786598e-06, + "loss": 0.648, + "step": 4308 + }, + { + "epoch": 1.047653780695356, + "grad_norm": 18.875, + "learning_rate": 1.1794811395957595e-06, + "loss": 0.7847, + "step": 4309 + }, + { + "epoch": 1.047896912229516, + "grad_norm": 17.0, + "learning_rate": 1.178998652053121e-06, + "loss": 0.5397, + "step": 4310 + }, + { + "epoch": 1.048140043763676, + "grad_norm": 17.0, + "learning_rate": 1.1785161751228606e-06, + "loss": 1.2018, + "step": 4311 + }, + { + "epoch": 1.048383175297836, + "grad_norm": 22.125, + "learning_rate": 1.178033708877092e-06, + "loss": 0.9839, + "step": 4312 + }, + { + "epoch": 1.048626306831996, + "grad_norm": 18.5, + "learning_rate": 1.1775512533879282e-06, + "loss": 1.0773, + "step": 4313 + }, + { + "epoch": 1.048869438366156, + "grad_norm": 24.75, + "learning_rate": 1.1770688087274809e-06, + "loss": 0.5905, + "step": 4314 + }, + { + "epoch": 1.049112569900316, + "grad_norm": 19.0, + "learning_rate": 1.17658637496786e-06, + "loss": 0.5783, + "step": 4315 + }, + { + "epoch": 1.049355701434476, + "grad_norm": 20.875, + "learning_rate": 1.1761039521811731e-06, + "loss": 0.5302, + "step": 4316 + }, + { + "epoch": 1.049598832968636, + "grad_norm": 19.75, + "learning_rate": 1.175621540439527e-06, + "loss": 0.7245, + "step": 4317 + }, + { + "epoch": 1.049841964502796, + "grad_norm": 24.375, + "learning_rate": 1.1751391398150263e-06, + "loss": 0.5827, + "step": 4318 + }, + { + "epoch": 1.050085096036956, + "grad_norm": 24.75, + "learning_rate": 1.1746567503797738e-06, + "loss": 0.7684, + "step": 4319 + }, + { + "epoch": 1.0503282275711159, + "grad_norm": 18.0, + "learning_rate": 1.174174372205871e-06, + "loss": 0.8336, + "step": 4320 + }, + { + "epoch": 1.0505713591052759, + "grad_norm": 19.0, + "learning_rate": 1.1736920053654183e-06, + "loss": 0.8152, + "step": 4321 + }, + { + "epoch": 1.0508144906394359, + "grad_norm": 26.625, + "learning_rate": 1.1732096499305127e-06, + "loss": 0.974, + "step": 4322 + }, + { + "epoch": 1.0510576221735959, + "grad_norm": 33.0, + "learning_rate": 1.1727273059732513e-06, + "loss": 0.7634, + "step": 4323 + }, + { + "epoch": 1.0513007537077559, + "grad_norm": 15.375, + "learning_rate": 1.1722449735657288e-06, + "loss": 1.243, + "step": 4324 + }, + { + "epoch": 1.0515438852419159, + "grad_norm": 17.75, + "learning_rate": 1.171762652780038e-06, + "loss": 0.6222, + "step": 4325 + }, + { + "epoch": 1.051787016776076, + "grad_norm": 17.375, + "learning_rate": 1.1712803436882696e-06, + "loss": 0.678, + "step": 4326 + }, + { + "epoch": 1.052030148310236, + "grad_norm": 23.625, + "learning_rate": 1.1707980463625136e-06, + "loss": 0.5647, + "step": 4327 + }, + { + "epoch": 1.052273279844396, + "grad_norm": 17.25, + "learning_rate": 1.1703157608748574e-06, + "loss": 0.7858, + "step": 4328 + }, + { + "epoch": 1.052516411378556, + "grad_norm": 22.875, + "learning_rate": 1.169833487297387e-06, + "loss": 1.1034, + "step": 4329 + }, + { + "epoch": 1.0527595429127157, + "grad_norm": 22.375, + "learning_rate": 1.1693512257021858e-06, + "loss": 0.4154, + "step": 4330 + }, + { + "epoch": 1.0530026744468757, + "grad_norm": 22.25, + "learning_rate": 1.1688689761613374e-06, + "loss": 0.6577, + "step": 4331 + }, + { + "epoch": 1.0532458059810357, + "grad_norm": 23.5, + "learning_rate": 1.168386738746922e-06, + "loss": 0.9481, + "step": 4332 + }, + { + "epoch": 1.0534889375151957, + "grad_norm": 23.0, + "learning_rate": 1.1679045135310175e-06, + "loss": 0.7352, + "step": 4333 + }, + { + "epoch": 1.0537320690493557, + "grad_norm": 23.5, + "learning_rate": 1.1674223005857013e-06, + "loss": 0.6326, + "step": 4334 + }, + { + "epoch": 1.0539752005835157, + "grad_norm": 21.125, + "learning_rate": 1.1669400999830487e-06, + "loss": 0.7645, + "step": 4335 + }, + { + "epoch": 1.0542183321176757, + "grad_norm": 16.25, + "learning_rate": 1.1664579117951326e-06, + "loss": 0.5578, + "step": 4336 + }, + { + "epoch": 1.0544614636518357, + "grad_norm": 14.6875, + "learning_rate": 1.1659757360940238e-06, + "loss": 0.3474, + "step": 4337 + }, + { + "epoch": 1.0547045951859957, + "grad_norm": 19.875, + "learning_rate": 1.1654935729517928e-06, + "loss": 0.9139, + "step": 4338 + }, + { + "epoch": 1.0549477267201557, + "grad_norm": 20.875, + "learning_rate": 1.165011422440507e-06, + "loss": 0.6661, + "step": 4339 + }, + { + "epoch": 1.0551908582543157, + "grad_norm": 21.375, + "learning_rate": 1.164529284632232e-06, + "loss": 0.9341, + "step": 4340 + }, + { + "epoch": 1.0554339897884755, + "grad_norm": 23.5, + "learning_rate": 1.164047159599031e-06, + "loss": 0.8347, + "step": 4341 + }, + { + "epoch": 1.0556771213226355, + "grad_norm": 14.125, + "learning_rate": 1.1635650474129672e-06, + "loss": 0.3434, + "step": 4342 + }, + { + "epoch": 1.0559202528567955, + "grad_norm": 18.125, + "learning_rate": 1.1630829481460993e-06, + "loss": 0.8507, + "step": 4343 + }, + { + "epoch": 1.0561633843909555, + "grad_norm": 21.5, + "learning_rate": 1.1626008618704862e-06, + "loss": 0.6312, + "step": 4344 + }, + { + "epoch": 1.0564065159251155, + "grad_norm": 21.5, + "learning_rate": 1.1621187886581832e-06, + "loss": 0.725, + "step": 4345 + }, + { + "epoch": 1.0566496474592755, + "grad_norm": 23.875, + "learning_rate": 1.1616367285812453e-06, + "loss": 0.8004, + "step": 4346 + }, + { + "epoch": 1.0568927789934355, + "grad_norm": 23.625, + "learning_rate": 1.1611546817117248e-06, + "loss": 0.7786, + "step": 4347 + }, + { + "epoch": 1.0571359105275955, + "grad_norm": 22.625, + "learning_rate": 1.160672648121671e-06, + "loss": 0.6954, + "step": 4348 + }, + { + "epoch": 1.0573790420617555, + "grad_norm": 22.875, + "learning_rate": 1.1601906278831332e-06, + "loss": 0.7504, + "step": 4349 + }, + { + "epoch": 1.0576221735959155, + "grad_norm": 26.125, + "learning_rate": 1.1597086210681566e-06, + "loss": 0.7485, + "step": 4350 + }, + { + "epoch": 1.0578653051300755, + "grad_norm": 20.5, + "learning_rate": 1.1592266277487862e-06, + "loss": 0.8543, + "step": 4351 + }, + { + "epoch": 1.0581084366642353, + "grad_norm": 18.375, + "learning_rate": 1.1587446479970642e-06, + "loss": 0.956, + "step": 4352 + }, + { + "epoch": 1.0583515681983953, + "grad_norm": 27.25, + "learning_rate": 1.1582626818850298e-06, + "loss": 0.7695, + "step": 4353 + }, + { + "epoch": 1.0585946997325553, + "grad_norm": 19.375, + "learning_rate": 1.1577807294847227e-06, + "loss": 0.6911, + "step": 4354 + }, + { + "epoch": 1.0588378312667153, + "grad_norm": 18.75, + "learning_rate": 1.157298790868178e-06, + "loss": 0.8286, + "step": 4355 + }, + { + "epoch": 1.0590809628008753, + "grad_norm": 26.25, + "learning_rate": 1.1568168661074303e-06, + "loss": 0.982, + "step": 4356 + }, + { + "epoch": 1.0593240943350353, + "grad_norm": 28.75, + "learning_rate": 1.156334955274511e-06, + "loss": 0.3835, + "step": 4357 + }, + { + "epoch": 1.0595672258691953, + "grad_norm": 27.5, + "learning_rate": 1.1558530584414505e-06, + "loss": 0.7446, + "step": 4358 + }, + { + "epoch": 1.0598103574033553, + "grad_norm": 16.875, + "learning_rate": 1.1553711756802762e-06, + "loss": 0.614, + "step": 4359 + }, + { + "epoch": 1.0600534889375153, + "grad_norm": 15.125, + "learning_rate": 1.1548893070630135e-06, + "loss": 0.3711, + "step": 4360 + }, + { + "epoch": 1.0602966204716753, + "grad_norm": 26.875, + "learning_rate": 1.1544074526616874e-06, + "loss": 0.9628, + "step": 4361 + }, + { + "epoch": 1.060539752005835, + "grad_norm": 19.0, + "learning_rate": 1.153925612548318e-06, + "loss": 0.4451, + "step": 4362 + }, + { + "epoch": 1.060782883539995, + "grad_norm": 15.25, + "learning_rate": 1.153443786794925e-06, + "loss": 0.492, + "step": 4363 + }, + { + "epoch": 1.061026015074155, + "grad_norm": 20.625, + "learning_rate": 1.152961975473526e-06, + "loss": 0.9892, + "step": 4364 + }, + { + "epoch": 1.061269146608315, + "grad_norm": 17.625, + "learning_rate": 1.1524801786561355e-06, + "loss": 0.6712, + "step": 4365 + }, + { + "epoch": 1.061512278142475, + "grad_norm": 20.125, + "learning_rate": 1.1519983964147667e-06, + "loss": 0.6189, + "step": 4366 + }, + { + "epoch": 1.061755409676635, + "grad_norm": 18.5, + "learning_rate": 1.1515166288214298e-06, + "loss": 0.7311, + "step": 4367 + }, + { + "epoch": 1.061998541210795, + "grad_norm": 18.875, + "learning_rate": 1.1510348759481332e-06, + "loss": 0.6576, + "step": 4368 + }, + { + "epoch": 1.062241672744955, + "grad_norm": 16.5, + "learning_rate": 1.150553137866884e-06, + "loss": 0.6377, + "step": 4369 + }, + { + "epoch": 1.062484804279115, + "grad_norm": 18.375, + "learning_rate": 1.1500714146496858e-06, + "loss": 0.4656, + "step": 4370 + }, + { + "epoch": 1.062727935813275, + "grad_norm": 15.1875, + "learning_rate": 1.149589706368541e-06, + "loss": 0.5009, + "step": 4371 + }, + { + "epoch": 1.0629710673474349, + "grad_norm": 15.3125, + "learning_rate": 1.149108013095448e-06, + "loss": 0.3528, + "step": 4372 + }, + { + "epoch": 1.0632141988815949, + "grad_norm": 23.625, + "learning_rate": 1.1486263349024055e-06, + "loss": 0.7346, + "step": 4373 + }, + { + "epoch": 1.0634573304157549, + "grad_norm": 25.625, + "learning_rate": 1.1481446718614077e-06, + "loss": 0.9986, + "step": 4374 + }, + { + "epoch": 1.0637004619499149, + "grad_norm": 21.625, + "learning_rate": 1.1476630240444478e-06, + "loss": 0.5542, + "step": 4375 + }, + { + "epoch": 1.0639435934840749, + "grad_norm": 16.375, + "learning_rate": 1.147181391523516e-06, + "loss": 0.788, + "step": 4376 + }, + { + "epoch": 1.0641867250182349, + "grad_norm": 26.375, + "learning_rate": 1.1466997743706015e-06, + "loss": 0.7264, + "step": 4377 + }, + { + "epoch": 1.0644298565523949, + "grad_norm": 17.5, + "learning_rate": 1.14621817265769e-06, + "loss": 0.8345, + "step": 4378 + }, + { + "epoch": 1.0646729880865549, + "grad_norm": 17.0, + "learning_rate": 1.1457365864567645e-06, + "loss": 0.9265, + "step": 4379 + }, + { + "epoch": 1.0649161196207149, + "grad_norm": 26.75, + "learning_rate": 1.145255015839807e-06, + "loss": 0.6017, + "step": 4380 + }, + { + "epoch": 1.0651592511548749, + "grad_norm": 20.625, + "learning_rate": 1.1447734608787966e-06, + "loss": 0.6863, + "step": 4381 + }, + { + "epoch": 1.0654023826890349, + "grad_norm": 17.5, + "learning_rate": 1.1442919216457095e-06, + "loss": 0.7533, + "step": 4382 + }, + { + "epoch": 1.0656455142231946, + "grad_norm": 17.125, + "learning_rate": 1.1438103982125206e-06, + "loss": 0.4611, + "step": 4383 + }, + { + "epoch": 1.0658886457573546, + "grad_norm": 23.0, + "learning_rate": 1.143328890651201e-06, + "loss": 0.5866, + "step": 4384 + }, + { + "epoch": 1.0661317772915146, + "grad_norm": 21.375, + "learning_rate": 1.1428473990337216e-06, + "loss": 1.1776, + "step": 4385 + }, + { + "epoch": 1.0663749088256747, + "grad_norm": 20.25, + "learning_rate": 1.1423659234320489e-06, + "loss": 0.8063, + "step": 4386 + }, + { + "epoch": 1.0666180403598347, + "grad_norm": 24.375, + "learning_rate": 1.1418844639181476e-06, + "loss": 0.9249, + "step": 4387 + }, + { + "epoch": 1.0668611718939947, + "grad_norm": 19.375, + "learning_rate": 1.141403020563981e-06, + "loss": 0.5387, + "step": 4388 + }, + { + "epoch": 1.0671043034281547, + "grad_norm": 18.25, + "learning_rate": 1.140921593441508e-06, + "loss": 0.5215, + "step": 4389 + }, + { + "epoch": 1.0673474349623147, + "grad_norm": 23.125, + "learning_rate": 1.140440182622687e-06, + "loss": 0.7367, + "step": 4390 + }, + { + "epoch": 1.0675905664964747, + "grad_norm": 17.875, + "learning_rate": 1.139958788179472e-06, + "loss": 0.7743, + "step": 4391 + }, + { + "epoch": 1.0678336980306347, + "grad_norm": 17.0, + "learning_rate": 1.1394774101838176e-06, + "loss": 0.5295, + "step": 4392 + }, + { + "epoch": 1.0680768295647947, + "grad_norm": 17.625, + "learning_rate": 1.1389960487076726e-06, + "loss": 0.4567, + "step": 4393 + }, + { + "epoch": 1.0683199610989544, + "grad_norm": 17.875, + "learning_rate": 1.1385147038229853e-06, + "loss": 0.7191, + "step": 4394 + }, + { + "epoch": 1.0685630926331144, + "grad_norm": 26.0, + "learning_rate": 1.1380333756017013e-06, + "loss": 0.684, + "step": 4395 + }, + { + "epoch": 1.0688062241672744, + "grad_norm": 17.25, + "learning_rate": 1.1375520641157626e-06, + "loss": 0.4503, + "step": 4396 + }, + { + "epoch": 1.0690493557014344, + "grad_norm": 27.25, + "learning_rate": 1.13707076943711e-06, + "loss": 0.7169, + "step": 4397 + }, + { + "epoch": 1.0692924872355944, + "grad_norm": 17.75, + "learning_rate": 1.1365894916376815e-06, + "loss": 0.6929, + "step": 4398 + }, + { + "epoch": 1.0695356187697544, + "grad_norm": 20.25, + "learning_rate": 1.1361082307894116e-06, + "loss": 0.7774, + "step": 4399 + }, + { + "epoch": 1.0697787503039144, + "grad_norm": 21.75, + "learning_rate": 1.135626986964234e-06, + "loss": 0.6137, + "step": 4400 + }, + { + "epoch": 1.0700218818380745, + "grad_norm": 21.875, + "learning_rate": 1.1351457602340783e-06, + "loss": 0.9395, + "step": 4401 + }, + { + "epoch": 1.0702650133722345, + "grad_norm": 34.25, + "learning_rate": 1.1346645506708726e-06, + "loss": 0.7565, + "step": 4402 + }, + { + "epoch": 1.0705081449063945, + "grad_norm": 49.0, + "learning_rate": 1.1341833583465418e-06, + "loss": 1.0853, + "step": 4403 + }, + { + "epoch": 1.0707512764405545, + "grad_norm": 20.0, + "learning_rate": 1.1337021833330079e-06, + "loss": 0.8362, + "step": 4404 + }, + { + "epoch": 1.0709944079747142, + "grad_norm": 26.875, + "learning_rate": 1.1332210257021917e-06, + "loss": 0.4942, + "step": 4405 + }, + { + "epoch": 1.0712375395088742, + "grad_norm": 19.375, + "learning_rate": 1.1327398855260097e-06, + "loss": 0.7441, + "step": 4406 + }, + { + "epoch": 1.0714806710430342, + "grad_norm": 20.25, + "learning_rate": 1.1322587628763768e-06, + "loss": 0.7362, + "step": 4407 + }, + { + "epoch": 1.0717238025771942, + "grad_norm": 29.0, + "learning_rate": 1.1317776578252054e-06, + "loss": 1.0423, + "step": 4408 + }, + { + "epoch": 1.0719669341113542, + "grad_norm": 30.5, + "learning_rate": 1.1312965704444053e-06, + "loss": 0.6931, + "step": 4409 + }, + { + "epoch": 1.0722100656455142, + "grad_norm": 16.75, + "learning_rate": 1.1308155008058823e-06, + "loss": 0.3339, + "step": 4410 + }, + { + "epoch": 1.0724531971796742, + "grad_norm": 15.8125, + "learning_rate": 1.1303344489815412e-06, + "loss": 1.0337, + "step": 4411 + }, + { + "epoch": 1.0726963287138342, + "grad_norm": 21.25, + "learning_rate": 1.1298534150432835e-06, + "loss": 0.7422, + "step": 4412 + }, + { + "epoch": 1.0729394602479942, + "grad_norm": 29.5, + "learning_rate": 1.129372399063008e-06, + "loss": 1.074, + "step": 4413 + }, + { + "epoch": 1.0731825917821542, + "grad_norm": 22.125, + "learning_rate": 1.1288914011126104e-06, + "loss": 0.5714, + "step": 4414 + }, + { + "epoch": 1.073425723316314, + "grad_norm": 24.5, + "learning_rate": 1.1284104212639847e-06, + "loss": 0.6241, + "step": 4415 + }, + { + "epoch": 1.073668854850474, + "grad_norm": 20.5, + "learning_rate": 1.1279294595890215e-06, + "loss": 1.1062, + "step": 4416 + }, + { + "epoch": 1.073911986384634, + "grad_norm": 20.875, + "learning_rate": 1.1274485161596088e-06, + "loss": 0.7436, + "step": 4417 + }, + { + "epoch": 1.074155117918794, + "grad_norm": 22.125, + "learning_rate": 1.1269675910476319e-06, + "loss": 0.9513, + "step": 4418 + }, + { + "epoch": 1.074398249452954, + "grad_norm": 21.75, + "learning_rate": 1.1264866843249736e-06, + "loss": 1.0243, + "step": 4419 + }, + { + "epoch": 1.074641380987114, + "grad_norm": 24.5, + "learning_rate": 1.1260057960635132e-06, + "loss": 0.909, + "step": 4420 + }, + { + "epoch": 1.074884512521274, + "grad_norm": 35.5, + "learning_rate": 1.125524926335128e-06, + "loss": 0.6748, + "step": 4421 + }, + { + "epoch": 1.075127644055434, + "grad_norm": 32.25, + "learning_rate": 1.125044075211692e-06, + "loss": 0.9951, + "step": 4422 + }, + { + "epoch": 1.075370775589594, + "grad_norm": 26.625, + "learning_rate": 1.1245632427650777e-06, + "loss": 0.9535, + "step": 4423 + }, + { + "epoch": 1.075613907123754, + "grad_norm": 18.75, + "learning_rate": 1.1240824290671528e-06, + "loss": 0.5518, + "step": 4424 + }, + { + "epoch": 1.0758570386579138, + "grad_norm": 26.25, + "learning_rate": 1.1236016341897836e-06, + "loss": 0.6448, + "step": 4425 + }, + { + "epoch": 1.0761001701920738, + "grad_norm": 20.625, + "learning_rate": 1.1231208582048333e-06, + "loss": 0.5226, + "step": 4426 + }, + { + "epoch": 1.0763433017262338, + "grad_norm": 18.25, + "learning_rate": 1.1226401011841618e-06, + "loss": 0.6849, + "step": 4427 + }, + { + "epoch": 1.0765864332603938, + "grad_norm": 15.25, + "learning_rate": 1.122159363199627e-06, + "loss": 0.425, + "step": 4428 + }, + { + "epoch": 1.0768295647945538, + "grad_norm": 27.25, + "learning_rate": 1.1216786443230832e-06, + "loss": 0.8265, + "step": 4429 + }, + { + "epoch": 1.0770726963287138, + "grad_norm": 20.875, + "learning_rate": 1.1211979446263816e-06, + "loss": 0.9099, + "step": 4430 + }, + { + "epoch": 1.0773158278628738, + "grad_norm": 15.0, + "learning_rate": 1.1207172641813723e-06, + "loss": 0.369, + "step": 4431 + }, + { + "epoch": 1.0775589593970338, + "grad_norm": 28.5, + "learning_rate": 1.1202366030599003e-06, + "loss": 0.5714, + "step": 4432 + }, + { + "epoch": 1.0778020909311938, + "grad_norm": 28.75, + "learning_rate": 1.1197559613338094e-06, + "loss": 0.6227, + "step": 4433 + }, + { + "epoch": 1.0780452224653538, + "grad_norm": 20.875, + "learning_rate": 1.1192753390749394e-06, + "loss": 0.9075, + "step": 4434 + }, + { + "epoch": 1.0782883539995138, + "grad_norm": 23.0, + "learning_rate": 1.1187947363551276e-06, + "loss": 0.5589, + "step": 4435 + }, + { + "epoch": 1.0785314855336736, + "grad_norm": 19.25, + "learning_rate": 1.1183141532462085e-06, + "loss": 0.7107, + "step": 4436 + }, + { + "epoch": 1.0787746170678336, + "grad_norm": 23.75, + "learning_rate": 1.1178335898200135e-06, + "loss": 1.2436, + "step": 4437 + }, + { + "epoch": 1.0790177486019936, + "grad_norm": 19.875, + "learning_rate": 1.1173530461483708e-06, + "loss": 0.6621, + "step": 4438 + }, + { + "epoch": 1.0792608801361536, + "grad_norm": 20.875, + "learning_rate": 1.1168725223031062e-06, + "loss": 0.6918, + "step": 4439 + }, + { + "epoch": 1.0795040116703136, + "grad_norm": 21.125, + "learning_rate": 1.116392018356043e-06, + "loss": 0.8615, + "step": 4440 + }, + { + "epoch": 1.0797471432044736, + "grad_norm": 14.75, + "learning_rate": 1.1159115343789997e-06, + "loss": 0.629, + "step": 4441 + }, + { + "epoch": 1.0799902747386336, + "grad_norm": 18.875, + "learning_rate": 1.1154310704437934e-06, + "loss": 0.6937, + "step": 4442 + }, + { + "epoch": 1.0802334062727936, + "grad_norm": 19.875, + "learning_rate": 1.114950626622238e-06, + "loss": 0.4885, + "step": 4443 + }, + { + "epoch": 1.0804765378069536, + "grad_norm": 20.75, + "learning_rate": 1.1144702029861435e-06, + "loss": 0.6823, + "step": 4444 + }, + { + "epoch": 1.0807196693411136, + "grad_norm": 18.125, + "learning_rate": 1.1139897996073175e-06, + "loss": 0.7005, + "step": 4445 + }, + { + "epoch": 1.0809628008752736, + "grad_norm": 22.75, + "learning_rate": 1.1135094165575654e-06, + "loss": 0.9733, + "step": 4446 + }, + { + "epoch": 1.0812059324094334, + "grad_norm": 25.5, + "learning_rate": 1.1130290539086885e-06, + "loss": 0.743, + "step": 4447 + }, + { + "epoch": 1.0814490639435934, + "grad_norm": 13.5625, + "learning_rate": 1.1125487117324846e-06, + "loss": 0.291, + "step": 4448 + }, + { + "epoch": 1.0816921954777534, + "grad_norm": 22.875, + "learning_rate": 1.1120683901007497e-06, + "loss": 1.1769, + "step": 4449 + }, + { + "epoch": 1.0819353270119134, + "grad_norm": 15.25, + "learning_rate": 1.1115880890852763e-06, + "loss": 0.3238, + "step": 4450 + }, + { + "epoch": 1.0821784585460734, + "grad_norm": 17.125, + "learning_rate": 1.1111078087578533e-06, + "loss": 0.5467, + "step": 4451 + }, + { + "epoch": 1.0824215900802334, + "grad_norm": 24.875, + "learning_rate": 1.110627549190267e-06, + "loss": 0.5565, + "step": 4452 + }, + { + "epoch": 1.0826647216143934, + "grad_norm": 21.875, + "learning_rate": 1.1101473104543003e-06, + "loss": 0.8823, + "step": 4453 + }, + { + "epoch": 1.0829078531485534, + "grad_norm": 17.125, + "learning_rate": 1.1096670926217338e-06, + "loss": 0.4465, + "step": 4454 + }, + { + "epoch": 1.0831509846827134, + "grad_norm": 21.375, + "learning_rate": 1.1091868957643436e-06, + "loss": 0.6273, + "step": 4455 + }, + { + "epoch": 1.0833941162168734, + "grad_norm": 18.75, + "learning_rate": 1.108706719953904e-06, + "loss": 0.5618, + "step": 4456 + }, + { + "epoch": 1.0836372477510332, + "grad_norm": 18.375, + "learning_rate": 1.1082265652621854e-06, + "loss": 0.5161, + "step": 4457 + }, + { + "epoch": 1.0838803792851932, + "grad_norm": 17.75, + "learning_rate": 1.1077464317609549e-06, + "loss": 0.4918, + "step": 4458 + }, + { + "epoch": 1.0841235108193532, + "grad_norm": 16.375, + "learning_rate": 1.107266319521977e-06, + "loss": 0.4809, + "step": 4459 + }, + { + "epoch": 1.0843666423535132, + "grad_norm": 19.75, + "learning_rate": 1.106786228617013e-06, + "loss": 0.7428, + "step": 4460 + }, + { + "epoch": 1.0846097738876732, + "grad_norm": 22.875, + "learning_rate": 1.1063061591178198e-06, + "loss": 0.6756, + "step": 4461 + }, + { + "epoch": 1.0848529054218332, + "grad_norm": 21.375, + "learning_rate": 1.1058261110961535e-06, + "loss": 0.9119, + "step": 4462 + }, + { + "epoch": 1.0850960369559932, + "grad_norm": 20.25, + "learning_rate": 1.1053460846237645e-06, + "loss": 0.4709, + "step": 4463 + }, + { + "epoch": 1.0853391684901532, + "grad_norm": 19.375, + "learning_rate": 1.1048660797724018e-06, + "loss": 0.6212, + "step": 4464 + }, + { + "epoch": 1.0855823000243132, + "grad_norm": 19.125, + "learning_rate": 1.1043860966138098e-06, + "loss": 0.7112, + "step": 4465 + }, + { + "epoch": 1.0858254315584732, + "grad_norm": 18.25, + "learning_rate": 1.1039061352197306e-06, + "loss": 0.5805, + "step": 4466 + }, + { + "epoch": 1.086068563092633, + "grad_norm": 25.875, + "learning_rate": 1.1034261956619028e-06, + "loss": 0.514, + "step": 4467 + }, + { + "epoch": 1.086311694626793, + "grad_norm": 21.5, + "learning_rate": 1.1029462780120611e-06, + "loss": 0.584, + "step": 4468 + }, + { + "epoch": 1.086554826160953, + "grad_norm": 16.125, + "learning_rate": 1.1024663823419377e-06, + "loss": 0.6386, + "step": 4469 + }, + { + "epoch": 1.086797957695113, + "grad_norm": 14.8125, + "learning_rate": 1.1019865087232615e-06, + "loss": 0.5411, + "step": 4470 + }, + { + "epoch": 1.087041089229273, + "grad_norm": 15.75, + "learning_rate": 1.1015066572277583e-06, + "loss": 0.471, + "step": 4471 + }, + { + "epoch": 1.087284220763433, + "grad_norm": 16.125, + "learning_rate": 1.1010268279271495e-06, + "loss": 0.483, + "step": 4472 + }, + { + "epoch": 1.087527352297593, + "grad_norm": 16.0, + "learning_rate": 1.100547020893154e-06, + "loss": 0.7322, + "step": 4473 + }, + { + "epoch": 1.087770483831753, + "grad_norm": 18.25, + "learning_rate": 1.1000672361974879e-06, + "loss": 0.7431, + "step": 4474 + }, + { + "epoch": 1.088013615365913, + "grad_norm": 19.375, + "learning_rate": 1.0995874739118623e-06, + "loss": 0.7328, + "step": 4475 + }, + { + "epoch": 1.088256746900073, + "grad_norm": 18.5, + "learning_rate": 1.0991077341079862e-06, + "loss": 0.6849, + "step": 4476 + }, + { + "epoch": 1.088499878434233, + "grad_norm": 20.375, + "learning_rate": 1.0986280168575655e-06, + "loss": 0.9232, + "step": 4477 + }, + { + "epoch": 1.0887430099683928, + "grad_norm": 20.875, + "learning_rate": 1.0981483222323023e-06, + "loss": 1.1457, + "step": 4478 + }, + { + "epoch": 1.0889861415025528, + "grad_norm": 19.375, + "learning_rate": 1.0976686503038945e-06, + "loss": 0.677, + "step": 4479 + }, + { + "epoch": 1.0892292730367128, + "grad_norm": 24.25, + "learning_rate": 1.097189001144038e-06, + "loss": 1.3531, + "step": 4480 + }, + { + "epoch": 1.0894724045708728, + "grad_norm": 19.5, + "learning_rate": 1.0967093748244245e-06, + "loss": 0.733, + "step": 4481 + }, + { + "epoch": 1.0897155361050328, + "grad_norm": 20.125, + "learning_rate": 1.096229771416742e-06, + "loss": 0.6181, + "step": 4482 + }, + { + "epoch": 1.0899586676391928, + "grad_norm": 39.25, + "learning_rate": 1.0957501909926761e-06, + "loss": 0.8568, + "step": 4483 + }, + { + "epoch": 1.0902017991733528, + "grad_norm": 16.875, + "learning_rate": 1.0952706336239077e-06, + "loss": 0.5019, + "step": 4484 + }, + { + "epoch": 1.0904449307075128, + "grad_norm": 17.125, + "learning_rate": 1.0947910993821157e-06, + "loss": 0.4438, + "step": 4485 + }, + { + "epoch": 1.0906880622416728, + "grad_norm": 10.1875, + "learning_rate": 1.0943115883389747e-06, + "loss": 0.3666, + "step": 4486 + }, + { + "epoch": 1.0909311937758328, + "grad_norm": 25.875, + "learning_rate": 1.0938321005661555e-06, + "loss": 0.9279, + "step": 4487 + }, + { + "epoch": 1.0911743253099928, + "grad_norm": 23.125, + "learning_rate": 1.0933526361353261e-06, + "loss": 0.8652, + "step": 4488 + }, + { + "epoch": 1.0914174568441526, + "grad_norm": 15.1875, + "learning_rate": 1.0928731951181505e-06, + "loss": 0.7262, + "step": 4489 + }, + { + "epoch": 1.0916605883783126, + "grad_norm": 19.75, + "learning_rate": 1.0923937775862894e-06, + "loss": 0.65, + "step": 4490 + }, + { + "epoch": 1.0919037199124726, + "grad_norm": 16.5, + "learning_rate": 1.0919143836114005e-06, + "loss": 0.8162, + "step": 4491 + }, + { + "epoch": 1.0921468514466326, + "grad_norm": 17.25, + "learning_rate": 1.0914350132651367e-06, + "loss": 0.6148, + "step": 4492 + }, + { + "epoch": 1.0923899829807926, + "grad_norm": 19.25, + "learning_rate": 1.090955666619149e-06, + "loss": 1.0146, + "step": 4493 + }, + { + "epoch": 1.0926331145149526, + "grad_norm": 13.6875, + "learning_rate": 1.0904763437450839e-06, + "loss": 0.2574, + "step": 4494 + }, + { + "epoch": 1.0928762460491126, + "grad_norm": 22.25, + "learning_rate": 1.0899970447145843e-06, + "loss": 0.8433, + "step": 4495 + }, + { + "epoch": 1.0931193775832726, + "grad_norm": 18.0, + "learning_rate": 1.0895177695992895e-06, + "loss": 0.5903, + "step": 4496 + }, + { + "epoch": 1.0933625091174326, + "grad_norm": 20.875, + "learning_rate": 1.0890385184708354e-06, + "loss": 0.488, + "step": 4497 + }, + { + "epoch": 1.0936056406515926, + "grad_norm": 27.0, + "learning_rate": 1.0885592914008552e-06, + "loss": 0.9404, + "step": 4498 + }, + { + "epoch": 1.0938487721857526, + "grad_norm": 16.875, + "learning_rate": 1.088080088460977e-06, + "loss": 0.5162, + "step": 4499 + }, + { + "epoch": 1.0940919037199124, + "grad_norm": 35.5, + "learning_rate": 1.0876009097228253e-06, + "loss": 1.0972, + "step": 4500 + }, + { + "epoch": 1.0943350352540724, + "grad_norm": 22.25, + "learning_rate": 1.0871217552580227e-06, + "loss": 0.7532, + "step": 4501 + }, + { + "epoch": 1.0945781667882324, + "grad_norm": 18.875, + "learning_rate": 1.0866426251381871e-06, + "loss": 0.8938, + "step": 4502 + }, + { + "epoch": 1.0948212983223924, + "grad_norm": 19.375, + "learning_rate": 1.0861635194349326e-06, + "loss": 0.4472, + "step": 4503 + }, + { + "epoch": 1.0950644298565524, + "grad_norm": 20.625, + "learning_rate": 1.0856844382198691e-06, + "loss": 0.505, + "step": 4504 + }, + { + "epoch": 1.0953075613907124, + "grad_norm": 19.625, + "learning_rate": 1.0852053815646048e-06, + "loss": 0.8325, + "step": 4505 + }, + { + "epoch": 1.0955506929248724, + "grad_norm": 20.75, + "learning_rate": 1.084726349540742e-06, + "loss": 0.7421, + "step": 4506 + }, + { + "epoch": 1.0957938244590324, + "grad_norm": 21.25, + "learning_rate": 1.0842473422198801e-06, + "loss": 1.3701, + "step": 4507 + }, + { + "epoch": 1.0960369559931924, + "grad_norm": 17.5, + "learning_rate": 1.083768359673616e-06, + "loss": 0.4848, + "step": 4508 + }, + { + "epoch": 1.0962800875273524, + "grad_norm": 32.0, + "learning_rate": 1.0832894019735416e-06, + "loss": 0.7744, + "step": 4509 + }, + { + "epoch": 1.0965232190615122, + "grad_norm": 22.75, + "learning_rate": 1.0828104691912452e-06, + "loss": 0.894, + "step": 4510 + }, + { + "epoch": 1.0967663505956722, + "grad_norm": 19.875, + "learning_rate": 1.0823315613983118e-06, + "loss": 0.913, + "step": 4511 + }, + { + "epoch": 1.0970094821298322, + "grad_norm": 18.75, + "learning_rate": 1.0818526786663221e-06, + "loss": 0.4371, + "step": 4512 + }, + { + "epoch": 1.0972526136639922, + "grad_norm": 23.875, + "learning_rate": 1.0813738210668537e-06, + "loss": 0.8429, + "step": 4513 + }, + { + "epoch": 1.0974957451981522, + "grad_norm": 20.0, + "learning_rate": 1.0808949886714798e-06, + "loss": 0.7703, + "step": 4514 + }, + { + "epoch": 1.0977388767323122, + "grad_norm": 25.5, + "learning_rate": 1.0804161815517702e-06, + "loss": 0.9715, + "step": 4515 + }, + { + "epoch": 1.0979820082664722, + "grad_norm": 18.625, + "learning_rate": 1.0799373997792913e-06, + "loss": 0.678, + "step": 4516 + }, + { + "epoch": 1.0982251398006322, + "grad_norm": 16.25, + "learning_rate": 1.0794586434256053e-06, + "loss": 0.6187, + "step": 4517 + }, + { + "epoch": 1.0984682713347922, + "grad_norm": 18.875, + "learning_rate": 1.0789799125622701e-06, + "loss": 0.5908, + "step": 4518 + }, + { + "epoch": 1.0987114028689522, + "grad_norm": 21.625, + "learning_rate": 1.0785012072608408e-06, + "loss": 0.631, + "step": 4519 + }, + { + "epoch": 1.098954534403112, + "grad_norm": 20.75, + "learning_rate": 1.0780225275928682e-06, + "loss": 0.8226, + "step": 4520 + }, + { + "epoch": 1.099197665937272, + "grad_norm": 24.875, + "learning_rate": 1.0775438736298987e-06, + "loss": 0.7697, + "step": 4521 + }, + { + "epoch": 1.099440797471432, + "grad_norm": 16.875, + "learning_rate": 1.0770652454434758e-06, + "loss": 0.4744, + "step": 4522 + }, + { + "epoch": 1.099683929005592, + "grad_norm": 20.25, + "learning_rate": 1.076586643105138e-06, + "loss": 0.4628, + "step": 4523 + }, + { + "epoch": 1.099927060539752, + "grad_norm": 24.75, + "learning_rate": 1.0761080666864222e-06, + "loss": 0.8995, + "step": 4524 + }, + { + "epoch": 1.100170192073912, + "grad_norm": 15.625, + "learning_rate": 1.0756295162588586e-06, + "loss": 0.5974, + "step": 4525 + }, + { + "epoch": 1.100413323608072, + "grad_norm": 24.75, + "learning_rate": 1.0751509918939754e-06, + "loss": 1.1693, + "step": 4526 + }, + { + "epoch": 1.100656455142232, + "grad_norm": 25.375, + "learning_rate": 1.0746724936632966e-06, + "loss": 0.9175, + "step": 4527 + }, + { + "epoch": 1.100899586676392, + "grad_norm": 18.625, + "learning_rate": 1.0741940216383412e-06, + "loss": 0.6916, + "step": 4528 + }, + { + "epoch": 1.101142718210552, + "grad_norm": 17.5, + "learning_rate": 1.0737155758906258e-06, + "loss": 0.5722, + "step": 4529 + }, + { + "epoch": 1.101385849744712, + "grad_norm": 21.25, + "learning_rate": 1.0732371564916614e-06, + "loss": 1.0089, + "step": 4530 + }, + { + "epoch": 1.1016289812788718, + "grad_norm": 18.125, + "learning_rate": 1.0727587635129574e-06, + "loss": 0.6362, + "step": 4531 + }, + { + "epoch": 1.1018721128130318, + "grad_norm": 25.5, + "learning_rate": 1.072280397026017e-06, + "loss": 0.8854, + "step": 4532 + }, + { + "epoch": 1.1021152443471918, + "grad_norm": 19.25, + "learning_rate": 1.0718020571023408e-06, + "loss": 1.0526, + "step": 4533 + }, + { + "epoch": 1.1023583758813518, + "grad_norm": 22.625, + "learning_rate": 1.0713237438134249e-06, + "loss": 0.7364, + "step": 4534 + }, + { + "epoch": 1.1026015074155118, + "grad_norm": 18.25, + "learning_rate": 1.070845457230761e-06, + "loss": 0.8941, + "step": 4535 + }, + { + "epoch": 1.1028446389496718, + "grad_norm": 19.375, + "learning_rate": 1.0703671974258378e-06, + "loss": 0.8003, + "step": 4536 + }, + { + "epoch": 1.1030877704838318, + "grad_norm": 24.0, + "learning_rate": 1.0698889644701394e-06, + "loss": 0.2289, + "step": 4537 + }, + { + "epoch": 1.1033309020179918, + "grad_norm": 19.0, + "learning_rate": 1.0694107584351453e-06, + "loss": 0.7821, + "step": 4538 + }, + { + "epoch": 1.1035740335521518, + "grad_norm": 23.375, + "learning_rate": 1.0689325793923327e-06, + "loss": 0.7209, + "step": 4539 + }, + { + "epoch": 1.1038171650863118, + "grad_norm": 20.625, + "learning_rate": 1.0684544274131731e-06, + "loss": 0.9191, + "step": 4540 + }, + { + "epoch": 1.1040602966204718, + "grad_norm": 18.5, + "learning_rate": 1.0679763025691352e-06, + "loss": 0.7183, + "step": 4541 + }, + { + "epoch": 1.1043034281546316, + "grad_norm": 20.25, + "learning_rate": 1.0674982049316822e-06, + "loss": 0.779, + "step": 4542 + }, + { + "epoch": 1.1045465596887916, + "grad_norm": 19.75, + "learning_rate": 1.0670201345722742e-06, + "loss": 0.7229, + "step": 4543 + }, + { + "epoch": 1.1047896912229516, + "grad_norm": 23.625, + "learning_rate": 1.0665420915623678e-06, + "loss": 0.8702, + "step": 4544 + }, + { + "epoch": 1.1050328227571116, + "grad_norm": 22.0, + "learning_rate": 1.0660640759734137e-06, + "loss": 0.7513, + "step": 4545 + }, + { + "epoch": 1.1052759542912716, + "grad_norm": 23.625, + "learning_rate": 1.06558608787686e-06, + "loss": 0.8785, + "step": 4546 + }, + { + "epoch": 1.1055190858254316, + "grad_norm": 22.625, + "learning_rate": 1.0651081273441507e-06, + "loss": 1.1261, + "step": 4547 + }, + { + "epoch": 1.1057622173595916, + "grad_norm": 25.875, + "learning_rate": 1.0646301944467252e-06, + "loss": 0.692, + "step": 4548 + }, + { + "epoch": 1.1060053488937516, + "grad_norm": 21.125, + "learning_rate": 1.0641522892560184e-06, + "loss": 1.2272, + "step": 4549 + }, + { + "epoch": 1.1062484804279116, + "grad_norm": 22.0, + "learning_rate": 1.0636744118434615e-06, + "loss": 0.6876, + "step": 4550 + }, + { + "epoch": 1.1064916119620716, + "grad_norm": 18.75, + "learning_rate": 1.0631965622804821e-06, + "loss": 0.7962, + "step": 4551 + }, + { + "epoch": 1.1067347434962316, + "grad_norm": 17.125, + "learning_rate": 1.0627187406385023e-06, + "loss": 0.4454, + "step": 4552 + }, + { + "epoch": 1.1069778750303914, + "grad_norm": 22.75, + "learning_rate": 1.0622409469889413e-06, + "loss": 0.8664, + "step": 4553 + }, + { + "epoch": 1.1072210065645514, + "grad_norm": 19.125, + "learning_rate": 1.0617631814032132e-06, + "loss": 0.8129, + "step": 4554 + }, + { + "epoch": 1.1074641380987114, + "grad_norm": 32.5, + "learning_rate": 1.061285443952729e-06, + "loss": 0.8578, + "step": 4555 + }, + { + "epoch": 1.1077072696328714, + "grad_norm": 20.875, + "learning_rate": 1.0608077347088943e-06, + "loss": 0.7495, + "step": 4556 + }, + { + "epoch": 1.1079504011670314, + "grad_norm": 22.375, + "learning_rate": 1.060330053743111e-06, + "loss": 0.8623, + "step": 4557 + }, + { + "epoch": 1.1081935327011914, + "grad_norm": 16.875, + "learning_rate": 1.0598524011267771e-06, + "loss": 0.4749, + "step": 4558 + }, + { + "epoch": 1.1084366642353514, + "grad_norm": 19.5, + "learning_rate": 1.0593747769312858e-06, + "loss": 1.0007, + "step": 4559 + }, + { + "epoch": 1.1086797957695114, + "grad_norm": 18.125, + "learning_rate": 1.0588971812280261e-06, + "loss": 0.6778, + "step": 4560 + }, + { + "epoch": 1.1089229273036714, + "grad_norm": 21.5, + "learning_rate": 1.058419614088383e-06, + "loss": 0.7019, + "step": 4561 + }, + { + "epoch": 1.1091660588378314, + "grad_norm": 15.75, + "learning_rate": 1.0579420755837375e-06, + "loss": 0.5354, + "step": 4562 + }, + { + "epoch": 1.1094091903719911, + "grad_norm": 16.75, + "learning_rate": 1.0574645657854659e-06, + "loss": 0.6889, + "step": 4563 + }, + { + "epoch": 1.1096523219061512, + "grad_norm": 22.25, + "learning_rate": 1.05698708476494e-06, + "loss": 1.0348, + "step": 4564 + }, + { + "epoch": 1.1098954534403112, + "grad_norm": 16.0, + "learning_rate": 1.0565096325935278e-06, + "loss": 0.6408, + "step": 4565 + }, + { + "epoch": 1.1101385849744712, + "grad_norm": 25.0, + "learning_rate": 1.0560322093425926e-06, + "loss": 0.8725, + "step": 4566 + }, + { + "epoch": 1.1103817165086312, + "grad_norm": 23.75, + "learning_rate": 1.0555548150834937e-06, + "loss": 0.6499, + "step": 4567 + }, + { + "epoch": 1.1106248480427912, + "grad_norm": 24.25, + "learning_rate": 1.055077449887586e-06, + "loss": 0.9151, + "step": 4568 + }, + { + "epoch": 1.1108679795769512, + "grad_norm": 26.25, + "learning_rate": 1.0546001138262196e-06, + "loss": 0.9657, + "step": 4569 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 24.75, + "learning_rate": 1.0541228069707408e-06, + "loss": 0.9306, + "step": 4570 + }, + { + "epoch": 1.1113542426452712, + "grad_norm": 21.375, + "learning_rate": 1.0536455293924914e-06, + "loss": 0.6871, + "step": 4571 + }, + { + "epoch": 1.1115973741794312, + "grad_norm": 17.375, + "learning_rate": 1.0531682811628092e-06, + "loss": 0.6849, + "step": 4572 + }, + { + "epoch": 1.111840505713591, + "grad_norm": 24.0, + "learning_rate": 1.0526910623530267e-06, + "loss": 0.7795, + "step": 4573 + }, + { + "epoch": 1.112083637247751, + "grad_norm": 19.75, + "learning_rate": 1.0522138730344722e-06, + "loss": 0.389, + "step": 4574 + }, + { + "epoch": 1.112326768781911, + "grad_norm": 20.625, + "learning_rate": 1.0517367132784707e-06, + "loss": 0.8928, + "step": 4575 + }, + { + "epoch": 1.112569900316071, + "grad_norm": 20.625, + "learning_rate": 1.0512595831563413e-06, + "loss": 0.7636, + "step": 4576 + }, + { + "epoch": 1.112813031850231, + "grad_norm": 18.0, + "learning_rate": 1.0507824827393994e-06, + "loss": 0.7822, + "step": 4577 + }, + { + "epoch": 1.113056163384391, + "grad_norm": 21.25, + "learning_rate": 1.0503054120989562e-06, + "loss": 0.6797, + "step": 4578 + }, + { + "epoch": 1.113299294918551, + "grad_norm": 16.875, + "learning_rate": 1.0498283713063182e-06, + "loss": 0.5428, + "step": 4579 + }, + { + "epoch": 1.113542426452711, + "grad_norm": 14.1875, + "learning_rate": 1.0493513604327868e-06, + "loss": 0.4768, + "step": 4580 + }, + { + "epoch": 1.113785557986871, + "grad_norm": 17.125, + "learning_rate": 1.0488743795496602e-06, + "loss": 0.5572, + "step": 4581 + }, + { + "epoch": 1.114028689521031, + "grad_norm": 24.625, + "learning_rate": 1.0483974287282313e-06, + "loss": 0.7668, + "step": 4582 + }, + { + "epoch": 1.114271821055191, + "grad_norm": 16.0, + "learning_rate": 1.0479205080397881e-06, + "loss": 0.7331, + "step": 4583 + }, + { + "epoch": 1.1145149525893507, + "grad_norm": 19.625, + "learning_rate": 1.047443617555615e-06, + "loss": 0.5156, + "step": 4584 + }, + { + "epoch": 1.1147580841235107, + "grad_norm": 17.125, + "learning_rate": 1.0469667573469916e-06, + "loss": 0.8256, + "step": 4585 + }, + { + "epoch": 1.1150012156576707, + "grad_norm": 23.625, + "learning_rate": 1.046489927485193e-06, + "loss": 0.973, + "step": 4586 + }, + { + "epoch": 1.1152443471918307, + "grad_norm": 17.75, + "learning_rate": 1.0460131280414896e-06, + "loss": 0.6681, + "step": 4587 + }, + { + "epoch": 1.1154874787259907, + "grad_norm": 20.625, + "learning_rate": 1.0455363590871468e-06, + "loss": 0.6001, + "step": 4588 + }, + { + "epoch": 1.1157306102601507, + "grad_norm": 21.5, + "learning_rate": 1.0450596206934267e-06, + "loss": 0.7672, + "step": 4589 + }, + { + "epoch": 1.1159737417943107, + "grad_norm": 34.5, + "learning_rate": 1.0445829129315854e-06, + "loss": 0.6228, + "step": 4590 + }, + { + "epoch": 1.1162168733284707, + "grad_norm": 18.875, + "learning_rate": 1.0441062358728757e-06, + "loss": 0.5794, + "step": 4591 + }, + { + "epoch": 1.1164600048626308, + "grad_norm": 19.0, + "learning_rate": 1.0436295895885445e-06, + "loss": 0.934, + "step": 4592 + }, + { + "epoch": 1.1167031363967908, + "grad_norm": 16.625, + "learning_rate": 1.0431529741498358e-06, + "loss": 0.4576, + "step": 4593 + }, + { + "epoch": 1.1169462679309508, + "grad_norm": 17.25, + "learning_rate": 1.0426763896279873e-06, + "loss": 0.5815, + "step": 4594 + }, + { + "epoch": 1.1171893994651105, + "grad_norm": 20.625, + "learning_rate": 1.0421998360942328e-06, + "loss": 0.6244, + "step": 4595 + }, + { + "epoch": 1.1174325309992705, + "grad_norm": 19.75, + "learning_rate": 1.041723313619802e-06, + "loss": 0.9553, + "step": 4596 + }, + { + "epoch": 1.1176756625334305, + "grad_norm": 19.125, + "learning_rate": 1.041246822275919e-06, + "loss": 0.4917, + "step": 4597 + }, + { + "epoch": 1.1179187940675905, + "grad_norm": 21.5, + "learning_rate": 1.0407703621338034e-06, + "loss": 0.5445, + "step": 4598 + }, + { + "epoch": 1.1181619256017505, + "grad_norm": 18.5, + "learning_rate": 1.040293933264671e-06, + "loss": 0.7912, + "step": 4599 + }, + { + "epoch": 1.1184050571359105, + "grad_norm": 17.5, + "learning_rate": 1.0398175357397314e-06, + "loss": 0.6897, + "step": 4600 + }, + { + "epoch": 1.1186481886700705, + "grad_norm": 15.3125, + "learning_rate": 1.0393411696301918e-06, + "loss": 0.7391, + "step": 4601 + }, + { + "epoch": 1.1188913202042305, + "grad_norm": 19.125, + "learning_rate": 1.0388648350072522e-06, + "loss": 0.6874, + "step": 4602 + }, + { + "epoch": 1.1191344517383905, + "grad_norm": 23.75, + "learning_rate": 1.0383885319421097e-06, + "loss": 0.9047, + "step": 4603 + }, + { + "epoch": 1.1193775832725505, + "grad_norm": 18.875, + "learning_rate": 1.0379122605059557e-06, + "loss": 0.7662, + "step": 4604 + }, + { + "epoch": 1.1196207148067103, + "grad_norm": 16.75, + "learning_rate": 1.0374360207699771e-06, + "loss": 0.4598, + "step": 4605 + }, + { + "epoch": 1.1198638463408703, + "grad_norm": 22.75, + "learning_rate": 1.0369598128053565e-06, + "loss": 0.6787, + "step": 4606 + }, + { + "epoch": 1.1201069778750303, + "grad_norm": 18.125, + "learning_rate": 1.036483636683271e-06, + "loss": 0.619, + "step": 4607 + }, + { + "epoch": 1.1203501094091903, + "grad_norm": 13.9375, + "learning_rate": 1.0360074924748934e-06, + "loss": 0.3686, + "step": 4608 + }, + { + "epoch": 1.1205932409433503, + "grad_norm": 18.5, + "learning_rate": 1.0355313802513922e-06, + "loss": 0.5171, + "step": 4609 + }, + { + "epoch": 1.1208363724775103, + "grad_norm": 19.125, + "learning_rate": 1.03505530008393e-06, + "loss": 0.8641, + "step": 4610 + }, + { + "epoch": 1.1210795040116703, + "grad_norm": 20.5, + "learning_rate": 1.0345792520436657e-06, + "loss": 0.6632, + "step": 4611 + }, + { + "epoch": 1.1213226355458303, + "grad_norm": 17.75, + "learning_rate": 1.0341032362017523e-06, + "loss": 0.8097, + "step": 4612 + }, + { + "epoch": 1.1215657670799903, + "grad_norm": 25.0, + "learning_rate": 1.0336272526293392e-06, + "loss": 0.5701, + "step": 4613 + }, + { + "epoch": 1.1218088986141503, + "grad_norm": 23.875, + "learning_rate": 1.03315130139757e-06, + "loss": 0.8333, + "step": 4614 + }, + { + "epoch": 1.1220520301483101, + "grad_norm": 20.5, + "learning_rate": 1.0326753825775837e-06, + "loss": 0.5231, + "step": 4615 + }, + { + "epoch": 1.1222951616824701, + "grad_norm": 17.25, + "learning_rate": 1.0321994962405147e-06, + "loss": 0.7017, + "step": 4616 + }, + { + "epoch": 1.1225382932166301, + "grad_norm": 19.0, + "learning_rate": 1.0317236424574929e-06, + "loss": 0.7746, + "step": 4617 + }, + { + "epoch": 1.1227814247507901, + "grad_norm": 22.375, + "learning_rate": 1.0312478212996425e-06, + "loss": 0.6846, + "step": 4618 + }, + { + "epoch": 1.1230245562849501, + "grad_norm": 17.125, + "learning_rate": 1.030772032838083e-06, + "loss": 0.5556, + "step": 4619 + }, + { + "epoch": 1.1232676878191101, + "grad_norm": 20.875, + "learning_rate": 1.0302962771439296e-06, + "loss": 0.8408, + "step": 4620 + }, + { + "epoch": 1.1235108193532701, + "grad_norm": 27.5, + "learning_rate": 1.029820554288292e-06, + "loss": 0.6969, + "step": 4621 + }, + { + "epoch": 1.1237539508874301, + "grad_norm": 16.375, + "learning_rate": 1.029344864342275e-06, + "loss": 0.5205, + "step": 4622 + }, + { + "epoch": 1.1239970824215901, + "grad_norm": 16.625, + "learning_rate": 1.0288692073769786e-06, + "loss": 0.4991, + "step": 4623 + }, + { + "epoch": 1.1242402139557501, + "grad_norm": 19.25, + "learning_rate": 1.0283935834634984e-06, + "loss": 0.6053, + "step": 4624 + }, + { + "epoch": 1.1244833454899101, + "grad_norm": 19.75, + "learning_rate": 1.0279179926729249e-06, + "loss": 0.8172, + "step": 4625 + }, + { + "epoch": 1.12472647702407, + "grad_norm": 13.3125, + "learning_rate": 1.0274424350763424e-06, + "loss": 0.2787, + "step": 4626 + }, + { + "epoch": 1.12496960855823, + "grad_norm": 18.875, + "learning_rate": 1.026966910744832e-06, + "loss": 0.6541, + "step": 4627 + }, + { + "epoch": 1.12521274009239, + "grad_norm": 17.125, + "learning_rate": 1.0264914197494685e-06, + "loss": 0.4244, + "step": 4628 + }, + { + "epoch": 1.12545587162655, + "grad_norm": 34.25, + "learning_rate": 1.0260159621613224e-06, + "loss": 0.6205, + "step": 4629 + }, + { + "epoch": 1.12569900316071, + "grad_norm": 19.625, + "learning_rate": 1.0255405380514594e-06, + "loss": 0.7377, + "step": 4630 + }, + { + "epoch": 1.12594213469487, + "grad_norm": 14.0625, + "learning_rate": 1.0250651474909386e-06, + "loss": 0.4619, + "step": 4631 + }, + { + "epoch": 1.12618526622903, + "grad_norm": 26.0, + "learning_rate": 1.0245897905508174e-06, + "loss": 0.8511, + "step": 4632 + }, + { + "epoch": 1.12642839776319, + "grad_norm": 25.5, + "learning_rate": 1.0241144673021444e-06, + "loss": 0.5489, + "step": 4633 + }, + { + "epoch": 1.12667152929735, + "grad_norm": 21.5, + "learning_rate": 1.0236391778159658e-06, + "loss": 0.6719, + "step": 4634 + }, + { + "epoch": 1.12691466083151, + "grad_norm": 19.375, + "learning_rate": 1.0231639221633213e-06, + "loss": 0.459, + "step": 4635 + }, + { + "epoch": 1.12715779236567, + "grad_norm": 16.5, + "learning_rate": 1.022688700415246e-06, + "loss": 0.7492, + "step": 4636 + }, + { + "epoch": 1.1274009238998297, + "grad_norm": 19.875, + "learning_rate": 1.0222135126427708e-06, + "loss": 0.7292, + "step": 4637 + }, + { + "epoch": 1.1276440554339897, + "grad_norm": 22.0, + "learning_rate": 1.0217383589169196e-06, + "loss": 0.6016, + "step": 4638 + }, + { + "epoch": 1.1278871869681497, + "grad_norm": 21.375, + "learning_rate": 1.0212632393087126e-06, + "loss": 0.8727, + "step": 4639 + }, + { + "epoch": 1.1281303185023097, + "grad_norm": 18.75, + "learning_rate": 1.0207881538891654e-06, + "loss": 0.7021, + "step": 4640 + }, + { + "epoch": 1.1283734500364697, + "grad_norm": 18.25, + "learning_rate": 1.020313102729287e-06, + "loss": 0.7066, + "step": 4641 + }, + { + "epoch": 1.1286165815706297, + "grad_norm": 17.5, + "learning_rate": 1.0198380859000828e-06, + "loss": 0.6684, + "step": 4642 + }, + { + "epoch": 1.1288597131047897, + "grad_norm": 25.125, + "learning_rate": 1.019363103472551e-06, + "loss": 0.5839, + "step": 4643 + }, + { + "epoch": 1.1291028446389497, + "grad_norm": 21.0, + "learning_rate": 1.018888155517687e-06, + "loss": 0.7214, + "step": 4644 + }, + { + "epoch": 1.1293459761731097, + "grad_norm": 23.5, + "learning_rate": 1.0184132421064797e-06, + "loss": 0.6538, + "step": 4645 + }, + { + "epoch": 1.1295891077072697, + "grad_norm": 18.375, + "learning_rate": 1.0179383633099124e-06, + "loss": 0.5012, + "step": 4646 + }, + { + "epoch": 1.1298322392414297, + "grad_norm": 19.75, + "learning_rate": 1.017463519198965e-06, + "loss": 0.7287, + "step": 4647 + }, + { + "epoch": 1.1300753707755895, + "grad_norm": 20.5, + "learning_rate": 1.0169887098446106e-06, + "loss": 0.6575, + "step": 4648 + }, + { + "epoch": 1.1303185023097495, + "grad_norm": 19.875, + "learning_rate": 1.016513935317818e-06, + "loss": 0.6859, + "step": 4649 + }, + { + "epoch": 1.1305616338439095, + "grad_norm": 31.75, + "learning_rate": 1.01603919568955e-06, + "loss": 1.0938, + "step": 4650 + }, + { + "epoch": 1.1308047653780695, + "grad_norm": 19.875, + "learning_rate": 1.015564491030765e-06, + "loss": 0.7796, + "step": 4651 + }, + { + "epoch": 1.1310478969122295, + "grad_norm": 19.0, + "learning_rate": 1.0150898214124155e-06, + "loss": 0.6401, + "step": 4652 + }, + { + "epoch": 1.1312910284463895, + "grad_norm": 19.625, + "learning_rate": 1.0146151869054492e-06, + "loss": 0.9332, + "step": 4653 + }, + { + "epoch": 1.1315341599805495, + "grad_norm": 25.375, + "learning_rate": 1.0141405875808083e-06, + "loss": 0.8476, + "step": 4654 + }, + { + "epoch": 1.1317772915147095, + "grad_norm": 20.75, + "learning_rate": 1.01366602350943e-06, + "loss": 1.0623, + "step": 4655 + }, + { + "epoch": 1.1320204230488695, + "grad_norm": 15.25, + "learning_rate": 1.0131914947622466e-06, + "loss": 0.6053, + "step": 4656 + }, + { + "epoch": 1.1322635545830293, + "grad_norm": 17.5, + "learning_rate": 1.012717001410184e-06, + "loss": 0.6007, + "step": 4657 + }, + { + "epoch": 1.1325066861171895, + "grad_norm": 18.75, + "learning_rate": 1.0122425435241633e-06, + "loss": 0.7136, + "step": 4658 + }, + { + "epoch": 1.1327498176513493, + "grad_norm": 20.375, + "learning_rate": 1.011768121175101e-06, + "loss": 0.5298, + "step": 4659 + }, + { + "epoch": 1.1329929491855093, + "grad_norm": 25.25, + "learning_rate": 1.0112937344339071e-06, + "loss": 0.7498, + "step": 4660 + }, + { + "epoch": 1.1332360807196693, + "grad_norm": 18.375, + "learning_rate": 1.0108193833714875e-06, + "loss": 0.5374, + "step": 4661 + }, + { + "epoch": 1.1334792122538293, + "grad_norm": 23.625, + "learning_rate": 1.0103450680587412e-06, + "loss": 0.7976, + "step": 4662 + }, + { + "epoch": 1.1337223437879893, + "grad_norm": 24.75, + "learning_rate": 1.009870788566564e-06, + "loss": 0.7245, + "step": 4663 + }, + { + "epoch": 1.1339654753221493, + "grad_norm": 23.125, + "learning_rate": 1.0093965449658445e-06, + "loss": 0.8302, + "step": 4664 + }, + { + "epoch": 1.1342086068563093, + "grad_norm": 17.5, + "learning_rate": 1.0089223373274668e-06, + "loss": 0.5245, + "step": 4665 + }, + { + "epoch": 1.1344517383904693, + "grad_norm": 16.125, + "learning_rate": 1.0084481657223093e-06, + "loss": 0.2544, + "step": 4666 + }, + { + "epoch": 1.1346948699246293, + "grad_norm": 23.5, + "learning_rate": 1.0079740302212452e-06, + "loss": 0.9773, + "step": 4667 + }, + { + "epoch": 1.134938001458789, + "grad_norm": 17.125, + "learning_rate": 1.0074999308951426e-06, + "loss": 0.3679, + "step": 4668 + }, + { + "epoch": 1.135181132992949, + "grad_norm": 20.375, + "learning_rate": 1.0070258678148624e-06, + "loss": 0.7059, + "step": 4669 + }, + { + "epoch": 1.135424264527109, + "grad_norm": 22.5, + "learning_rate": 1.0065518410512634e-06, + "loss": 0.6372, + "step": 4670 + }, + { + "epoch": 1.135667396061269, + "grad_norm": 29.625, + "learning_rate": 1.006077850675196e-06, + "loss": 0.7484, + "step": 4671 + }, + { + "epoch": 1.135910527595429, + "grad_norm": 25.875, + "learning_rate": 1.0056038967575065e-06, + "loss": 0.9291, + "step": 4672 + }, + { + "epoch": 1.136153659129589, + "grad_norm": 17.875, + "learning_rate": 1.0051299793690359e-06, + "loss": 0.6821, + "step": 4673 + }, + { + "epoch": 1.136396790663749, + "grad_norm": 18.75, + "learning_rate": 1.0046560985806183e-06, + "loss": 0.679, + "step": 4674 + }, + { + "epoch": 1.136639922197909, + "grad_norm": 11.9375, + "learning_rate": 1.0041822544630844e-06, + "loss": 0.4055, + "step": 4675 + }, + { + "epoch": 1.136883053732069, + "grad_norm": 21.25, + "learning_rate": 1.003708447087258e-06, + "loss": 0.3208, + "step": 4676 + }, + { + "epoch": 1.137126185266229, + "grad_norm": 18.25, + "learning_rate": 1.0032346765239574e-06, + "loss": 0.7156, + "step": 4677 + }, + { + "epoch": 1.137369316800389, + "grad_norm": 17.125, + "learning_rate": 1.0027609428439963e-06, + "loss": 0.6838, + "step": 4678 + }, + { + "epoch": 1.1376124483345489, + "grad_norm": 18.25, + "learning_rate": 1.0022872461181823e-06, + "loss": 0.3967, + "step": 4679 + }, + { + "epoch": 1.1378555798687089, + "grad_norm": 17.0, + "learning_rate": 1.001813586417318e-06, + "loss": 0.6606, + "step": 4680 + }, + { + "epoch": 1.138098711402869, + "grad_norm": 20.875, + "learning_rate": 1.001339963812199e-06, + "loss": 1.1108, + "step": 4681 + }, + { + "epoch": 1.138341842937029, + "grad_norm": 27.125, + "learning_rate": 1.0008663783736172e-06, + "loss": 0.7837, + "step": 4682 + }, + { + "epoch": 1.138584974471189, + "grad_norm": 22.125, + "learning_rate": 1.0003928301723579e-06, + "loss": 0.4921, + "step": 4683 + }, + { + "epoch": 1.138828106005349, + "grad_norm": 23.375, + "learning_rate": 9.999193192792005e-07, + "loss": 0.837, + "step": 4684 + }, + { + "epoch": 1.139071237539509, + "grad_norm": 39.0, + "learning_rate": 9.9944584576492e-07, + "loss": 1.2307, + "step": 4685 + }, + { + "epoch": 1.139314369073669, + "grad_norm": 42.75, + "learning_rate": 9.989724097002852e-07, + "loss": 1.0931, + "step": 4686 + }, + { + "epoch": 1.139557500607829, + "grad_norm": 23.5, + "learning_rate": 9.984990111560594e-07, + "loss": 0.8322, + "step": 4687 + }, + { + "epoch": 1.139800632141989, + "grad_norm": 20.875, + "learning_rate": 9.980256502029998e-07, + "loss": 0.9468, + "step": 4688 + }, + { + "epoch": 1.140043763676149, + "grad_norm": 18.375, + "learning_rate": 9.975523269118583e-07, + "loss": 0.623, + "step": 4689 + }, + { + "epoch": 1.1402868952103087, + "grad_norm": 24.0, + "learning_rate": 9.97079041353382e-07, + "loss": 0.8993, + "step": 4690 + }, + { + "epoch": 1.1405300267444687, + "grad_norm": 21.25, + "learning_rate": 9.966057935983107e-07, + "loss": 0.8847, + "step": 4691 + }, + { + "epoch": 1.1407731582786287, + "grad_norm": 22.75, + "learning_rate": 9.961325837173797e-07, + "loss": 0.9124, + "step": 4692 + }, + { + "epoch": 1.1410162898127887, + "grad_norm": 28.5, + "learning_rate": 9.956594117813183e-07, + "loss": 0.5026, + "step": 4693 + }, + { + "epoch": 1.1412594213469487, + "grad_norm": 37.75, + "learning_rate": 9.951862778608511e-07, + "loss": 1.2797, + "step": 4694 + }, + { + "epoch": 1.1415025528811087, + "grad_norm": 15.9375, + "learning_rate": 9.94713182026695e-07, + "loss": 0.7834, + "step": 4695 + }, + { + "epoch": 1.1417456844152687, + "grad_norm": 20.25, + "learning_rate": 9.942401243495629e-07, + "loss": 0.5828, + "step": 4696 + }, + { + "epoch": 1.1419888159494287, + "grad_norm": 23.25, + "learning_rate": 9.937671049001616e-07, + "loss": 0.8871, + "step": 4697 + }, + { + "epoch": 1.1422319474835887, + "grad_norm": 17.375, + "learning_rate": 9.932941237491916e-07, + "loss": 0.617, + "step": 4698 + }, + { + "epoch": 1.1424750790177487, + "grad_norm": 24.375, + "learning_rate": 9.92821180967348e-07, + "loss": 0.4949, + "step": 4699 + }, + { + "epoch": 1.1427182105519087, + "grad_norm": 22.875, + "learning_rate": 9.923482766253204e-07, + "loss": 0.7303, + "step": 4700 + }, + { + "epoch": 1.1429613420860685, + "grad_norm": 16.625, + "learning_rate": 9.918754107937933e-07, + "loss": 0.5431, + "step": 4701 + }, + { + "epoch": 1.1432044736202285, + "grad_norm": 26.25, + "learning_rate": 9.914025835434435e-07, + "loss": 0.9557, + "step": 4702 + }, + { + "epoch": 1.1434476051543885, + "grad_norm": 11.6875, + "learning_rate": 9.90929794944944e-07, + "loss": 0.5551, + "step": 4703 + }, + { + "epoch": 1.1436907366885485, + "grad_norm": 24.375, + "learning_rate": 9.904570450689612e-07, + "loss": 0.9468, + "step": 4704 + }, + { + "epoch": 1.1439338682227085, + "grad_norm": 23.75, + "learning_rate": 9.89984333986155e-07, + "loss": 1.0116, + "step": 4705 + }, + { + "epoch": 1.1441769997568685, + "grad_norm": 17.375, + "learning_rate": 9.89511661767181e-07, + "loss": 0.6854, + "step": 4706 + }, + { + "epoch": 1.1444201312910285, + "grad_norm": 17.375, + "learning_rate": 9.89039028482688e-07, + "loss": 0.4921, + "step": 4707 + }, + { + "epoch": 1.1446632628251885, + "grad_norm": 12.5, + "learning_rate": 9.885664342033186e-07, + "loss": 0.4337, + "step": 4708 + }, + { + "epoch": 1.1449063943593485, + "grad_norm": 18.875, + "learning_rate": 9.88093878999711e-07, + "loss": 0.33, + "step": 4709 + }, + { + "epoch": 1.1451495258935083, + "grad_norm": 19.625, + "learning_rate": 9.876213629424966e-07, + "loss": 0.4763, + "step": 4710 + }, + { + "epoch": 1.1453926574276685, + "grad_norm": 24.0, + "learning_rate": 9.87148886102301e-07, + "loss": 1.1494, + "step": 4711 + }, + { + "epoch": 1.1456357889618283, + "grad_norm": 17.0, + "learning_rate": 9.866764485497438e-07, + "loss": 0.5958, + "step": 4712 + }, + { + "epoch": 1.1458789204959883, + "grad_norm": 17.0, + "learning_rate": 9.862040503554392e-07, + "loss": 0.8135, + "step": 4713 + }, + { + "epoch": 1.1461220520301483, + "grad_norm": 15.9375, + "learning_rate": 9.857316915899953e-07, + "loss": 0.4461, + "step": 4714 + }, + { + "epoch": 1.1463651835643083, + "grad_norm": 20.875, + "learning_rate": 9.852593723240142e-07, + "loss": 0.9367, + "step": 4715 + }, + { + "epoch": 1.1466083150984683, + "grad_norm": 19.875, + "learning_rate": 9.847870926280915e-07, + "loss": 0.7702, + "step": 4716 + }, + { + "epoch": 1.1468514466326283, + "grad_norm": 16.25, + "learning_rate": 9.843148525728187e-07, + "loss": 0.5595, + "step": 4717 + }, + { + "epoch": 1.1470945781667883, + "grad_norm": 21.125, + "learning_rate": 9.838426522287797e-07, + "loss": 0.7565, + "step": 4718 + }, + { + "epoch": 1.1473377097009483, + "grad_norm": 20.125, + "learning_rate": 9.83370491666553e-07, + "loss": 0.7135, + "step": 4719 + }, + { + "epoch": 1.1475808412351083, + "grad_norm": 18.0, + "learning_rate": 9.828983709567112e-07, + "loss": 1.0726, + "step": 4720 + }, + { + "epoch": 1.147823972769268, + "grad_norm": 13.5, + "learning_rate": 9.82426290169821e-07, + "loss": 0.3294, + "step": 4721 + }, + { + "epoch": 1.148067104303428, + "grad_norm": 22.375, + "learning_rate": 9.819542493764427e-07, + "loss": 0.8569, + "step": 4722 + }, + { + "epoch": 1.148310235837588, + "grad_norm": 18.25, + "learning_rate": 9.81482248647131e-07, + "loss": 0.5157, + "step": 4723 + }, + { + "epoch": 1.148553367371748, + "grad_norm": 25.25, + "learning_rate": 9.810102880524348e-07, + "loss": 0.9876, + "step": 4724 + }, + { + "epoch": 1.148796498905908, + "grad_norm": 18.75, + "learning_rate": 9.805383676628968e-07, + "loss": 0.8455, + "step": 4725 + }, + { + "epoch": 1.149039630440068, + "grad_norm": 19.5, + "learning_rate": 9.800664875490533e-07, + "loss": 0.5097, + "step": 4726 + }, + { + "epoch": 1.149282761974228, + "grad_norm": 22.875, + "learning_rate": 9.795946477814352e-07, + "loss": 0.702, + "step": 4727 + }, + { + "epoch": 1.149525893508388, + "grad_norm": 16.625, + "learning_rate": 9.791228484305675e-07, + "loss": 0.4389, + "step": 4728 + }, + { + "epoch": 1.149769025042548, + "grad_norm": 19.5, + "learning_rate": 9.786510895669678e-07, + "loss": 0.6537, + "step": 4729 + }, + { + "epoch": 1.150012156576708, + "grad_norm": 22.25, + "learning_rate": 9.781793712611492e-07, + "loss": 0.6354, + "step": 4730 + }, + { + "epoch": 1.150255288110868, + "grad_norm": 17.0, + "learning_rate": 9.77707693583618e-07, + "loss": 0.7206, + "step": 4731 + }, + { + "epoch": 1.1504984196450279, + "grad_norm": 17.875, + "learning_rate": 9.77236056604875e-07, + "loss": 0.717, + "step": 4732 + }, + { + "epoch": 1.1507415511791879, + "grad_norm": 15.5625, + "learning_rate": 9.767644603954138e-07, + "loss": 0.5995, + "step": 4733 + }, + { + "epoch": 1.1509846827133479, + "grad_norm": 20.25, + "learning_rate": 9.76292905025723e-07, + "loss": 0.617, + "step": 4734 + }, + { + "epoch": 1.1512278142475079, + "grad_norm": 12.625, + "learning_rate": 9.75821390566285e-07, + "loss": 0.4788, + "step": 4735 + }, + { + "epoch": 1.1514709457816679, + "grad_norm": 17.875, + "learning_rate": 9.75349917087575e-07, + "loss": 0.7572, + "step": 4736 + }, + { + "epoch": 1.1517140773158279, + "grad_norm": 21.875, + "learning_rate": 9.748784846600634e-07, + "loss": 1.1137, + "step": 4737 + }, + { + "epoch": 1.1519572088499879, + "grad_norm": 22.5, + "learning_rate": 9.744070933542139e-07, + "loss": 1.0709, + "step": 4738 + }, + { + "epoch": 1.1522003403841479, + "grad_norm": 19.125, + "learning_rate": 9.739357432404833e-07, + "loss": 0.8677, + "step": 4739 + }, + { + "epoch": 1.1524434719183079, + "grad_norm": 17.5, + "learning_rate": 9.734644343893243e-07, + "loss": 0.4708, + "step": 4740 + }, + { + "epoch": 1.1526866034524679, + "grad_norm": 21.375, + "learning_rate": 9.729931668711815e-07, + "loss": 0.461, + "step": 4741 + }, + { + "epoch": 1.1529297349866279, + "grad_norm": 16.25, + "learning_rate": 9.72521940756494e-07, + "loss": 0.5278, + "step": 4742 + }, + { + "epoch": 1.1531728665207877, + "grad_norm": 19.875, + "learning_rate": 9.720507561156944e-07, + "loss": 0.7804, + "step": 4743 + }, + { + "epoch": 1.1534159980549477, + "grad_norm": 18.625, + "learning_rate": 9.715796130192099e-07, + "loss": 0.7236, + "step": 4744 + }, + { + "epoch": 1.1536591295891077, + "grad_norm": 17.875, + "learning_rate": 9.711085115374608e-07, + "loss": 0.5271, + "step": 4745 + }, + { + "epoch": 1.1539022611232677, + "grad_norm": 22.625, + "learning_rate": 9.706374517408608e-07, + "loss": 0.5795, + "step": 4746 + }, + { + "epoch": 1.1541453926574277, + "grad_norm": 24.0, + "learning_rate": 9.701664336998183e-07, + "loss": 0.6542, + "step": 4747 + }, + { + "epoch": 1.1543885241915877, + "grad_norm": 21.125, + "learning_rate": 9.696954574847357e-07, + "loss": 1.0421, + "step": 4748 + }, + { + "epoch": 1.1546316557257477, + "grad_norm": 23.125, + "learning_rate": 9.69224523166008e-07, + "loss": 0.575, + "step": 4749 + }, + { + "epoch": 1.1548747872599077, + "grad_norm": 18.75, + "learning_rate": 9.687536308140244e-07, + "loss": 0.6088, + "step": 4750 + }, + { + "epoch": 1.1551179187940677, + "grad_norm": 24.5, + "learning_rate": 9.682827804991679e-07, + "loss": 0.759, + "step": 4751 + }, + { + "epoch": 1.1553610503282277, + "grad_norm": 17.25, + "learning_rate": 9.678119722918154e-07, + "loss": 0.4644, + "step": 4752 + }, + { + "epoch": 1.1556041818623877, + "grad_norm": 14.1875, + "learning_rate": 9.673412062623371e-07, + "loss": 0.3513, + "step": 4753 + }, + { + "epoch": 1.1558473133965474, + "grad_norm": 20.625, + "learning_rate": 9.668704824810968e-07, + "loss": 0.5674, + "step": 4754 + }, + { + "epoch": 1.1560904449307075, + "grad_norm": 20.125, + "learning_rate": 9.66399801018453e-07, + "loss": 0.5522, + "step": 4755 + }, + { + "epoch": 1.1563335764648675, + "grad_norm": 22.875, + "learning_rate": 9.65929161944757e-07, + "loss": 0.7269, + "step": 4756 + }, + { + "epoch": 1.1565767079990275, + "grad_norm": 21.375, + "learning_rate": 9.65458565330354e-07, + "loss": 0.7541, + "step": 4757 + }, + { + "epoch": 1.1568198395331875, + "grad_norm": 15.625, + "learning_rate": 9.649880112455823e-07, + "loss": 0.4317, + "step": 4758 + }, + { + "epoch": 1.1570629710673475, + "grad_norm": 19.0, + "learning_rate": 9.64517499760775e-07, + "loss": 0.426, + "step": 4759 + }, + { + "epoch": 1.1573061026015075, + "grad_norm": 19.875, + "learning_rate": 9.640470309462575e-07, + "loss": 0.7548, + "step": 4760 + }, + { + "epoch": 1.1575492341356675, + "grad_norm": 22.25, + "learning_rate": 9.635766048723497e-07, + "loss": 0.7269, + "step": 4761 + }, + { + "epoch": 1.1577923656698275, + "grad_norm": 18.875, + "learning_rate": 9.63106221609365e-07, + "loss": 0.8873, + "step": 4762 + }, + { + "epoch": 1.1580354972039872, + "grad_norm": 19.75, + "learning_rate": 9.626358812276104e-07, + "loss": 0.6826, + "step": 4763 + }, + { + "epoch": 1.1582786287381472, + "grad_norm": 26.875, + "learning_rate": 9.621655837973865e-07, + "loss": 0.7798, + "step": 4764 + }, + { + "epoch": 1.1585217602723072, + "grad_norm": 18.25, + "learning_rate": 9.616953293889871e-07, + "loss": 0.6742, + "step": 4765 + }, + { + "epoch": 1.1587648918064672, + "grad_norm": 26.625, + "learning_rate": 9.612251180727e-07, + "loss": 1.0155, + "step": 4766 + }, + { + "epoch": 1.1590080233406272, + "grad_norm": 17.75, + "learning_rate": 9.607549499188062e-07, + "loss": 0.3763, + "step": 4767 + }, + { + "epoch": 1.1592511548747872, + "grad_norm": 24.375, + "learning_rate": 9.602848249975805e-07, + "loss": 0.9412, + "step": 4768 + }, + { + "epoch": 1.1594942864089473, + "grad_norm": 30.25, + "learning_rate": 9.598147433792915e-07, + "loss": 0.855, + "step": 4769 + }, + { + "epoch": 1.1597374179431073, + "grad_norm": 14.75, + "learning_rate": 9.593447051342e-07, + "loss": 0.6047, + "step": 4770 + }, + { + "epoch": 1.1599805494772673, + "grad_norm": 16.75, + "learning_rate": 9.58874710332563e-07, + "loss": 0.4301, + "step": 4771 + }, + { + "epoch": 1.1602236810114273, + "grad_norm": 18.125, + "learning_rate": 9.58404759044628e-07, + "loss": 0.5674, + "step": 4772 + }, + { + "epoch": 1.1604668125455873, + "grad_norm": 21.625, + "learning_rate": 9.57934851340638e-07, + "loss": 0.7534, + "step": 4773 + }, + { + "epoch": 1.160709944079747, + "grad_norm": 22.125, + "learning_rate": 9.574649872908286e-07, + "loss": 0.9484, + "step": 4774 + }, + { + "epoch": 1.160953075613907, + "grad_norm": 21.625, + "learning_rate": 9.56995166965429e-07, + "loss": 0.9059, + "step": 4775 + }, + { + "epoch": 1.161196207148067, + "grad_norm": 31.0, + "learning_rate": 9.565253904346624e-07, + "loss": 0.9156, + "step": 4776 + }, + { + "epoch": 1.161439338682227, + "grad_norm": 23.875, + "learning_rate": 9.560556577687445e-07, + "loss": 0.563, + "step": 4777 + }, + { + "epoch": 1.161682470216387, + "grad_norm": 16.625, + "learning_rate": 9.555859690378846e-07, + "loss": 0.723, + "step": 4778 + }, + { + "epoch": 1.161925601750547, + "grad_norm": 16.375, + "learning_rate": 9.551163243122868e-07, + "loss": 0.7377, + "step": 4779 + }, + { + "epoch": 1.162168733284707, + "grad_norm": 15.9375, + "learning_rate": 9.546467236621472e-07, + "loss": 0.4267, + "step": 4780 + }, + { + "epoch": 1.162411864818867, + "grad_norm": 24.75, + "learning_rate": 9.541771671576557e-07, + "loss": 0.9024, + "step": 4781 + }, + { + "epoch": 1.162654996353027, + "grad_norm": 25.125, + "learning_rate": 9.537076548689953e-07, + "loss": 0.9069, + "step": 4782 + }, + { + "epoch": 1.162898127887187, + "grad_norm": 27.0, + "learning_rate": 9.532381868663436e-07, + "loss": 0.7099, + "step": 4783 + }, + { + "epoch": 1.163141259421347, + "grad_norm": 20.75, + "learning_rate": 9.527687632198697e-07, + "loss": 0.5455, + "step": 4784 + }, + { + "epoch": 1.1633843909555068, + "grad_norm": 25.625, + "learning_rate": 9.522993839997372e-07, + "loss": 0.5441, + "step": 4785 + }, + { + "epoch": 1.1636275224896668, + "grad_norm": 26.875, + "learning_rate": 9.518300492761035e-07, + "loss": 0.8369, + "step": 4786 + }, + { + "epoch": 1.1638706540238268, + "grad_norm": 20.25, + "learning_rate": 9.513607591191186e-07, + "loss": 0.6305, + "step": 4787 + }, + { + "epoch": 1.1641137855579868, + "grad_norm": 23.5, + "learning_rate": 9.508915135989261e-07, + "loss": 0.6392, + "step": 4788 + }, + { + "epoch": 1.1643569170921468, + "grad_norm": 16.625, + "learning_rate": 9.504223127856624e-07, + "loss": 0.5864, + "step": 4789 + }, + { + "epoch": 1.1646000486263068, + "grad_norm": 15.4375, + "learning_rate": 9.49953156749458e-07, + "loss": 0.4625, + "step": 4790 + }, + { + "epoch": 1.1648431801604668, + "grad_norm": 20.5, + "learning_rate": 9.494840455604366e-07, + "loss": 0.6627, + "step": 4791 + }, + { + "epoch": 1.1650863116946268, + "grad_norm": 19.625, + "learning_rate": 9.490149792887143e-07, + "loss": 0.7221, + "step": 4792 + }, + { + "epoch": 1.1653294432287868, + "grad_norm": 21.5, + "learning_rate": 9.485459580044014e-07, + "loss": 0.5299, + "step": 4793 + }, + { + "epoch": 1.1655725747629468, + "grad_norm": 18.5, + "learning_rate": 9.480769817776016e-07, + "loss": 0.4763, + "step": 4794 + }, + { + "epoch": 1.1658157062971068, + "grad_norm": 15.5625, + "learning_rate": 9.476080506784115e-07, + "loss": 0.3602, + "step": 4795 + }, + { + "epoch": 1.1660588378312666, + "grad_norm": 23.625, + "learning_rate": 9.471391647769204e-07, + "loss": 0.7227, + "step": 4796 + }, + { + "epoch": 1.1663019693654266, + "grad_norm": 18.375, + "learning_rate": 9.466703241432118e-07, + "loss": 0.6979, + "step": 4797 + }, + { + "epoch": 1.1665451008995866, + "grad_norm": 21.5, + "learning_rate": 9.462015288473622e-07, + "loss": 0.8033, + "step": 4798 + }, + { + "epoch": 1.1667882324337466, + "grad_norm": 24.75, + "learning_rate": 9.457327789594406e-07, + "loss": 0.5965, + "step": 4799 + }, + { + "epoch": 1.1670313639679066, + "grad_norm": 32.5, + "learning_rate": 9.452640745495104e-07, + "loss": 0.8128, + "step": 4800 + }, + { + "epoch": 1.1672744955020666, + "grad_norm": 24.875, + "learning_rate": 9.447954156876263e-07, + "loss": 0.8793, + "step": 4801 + }, + { + "epoch": 1.1675176270362266, + "grad_norm": 16.25, + "learning_rate": 9.443268024438393e-07, + "loss": 0.6922, + "step": 4802 + }, + { + "epoch": 1.1677607585703866, + "grad_norm": 21.5, + "learning_rate": 9.438582348881906e-07, + "loss": 0.9107, + "step": 4803 + }, + { + "epoch": 1.1680038901045466, + "grad_norm": 28.375, + "learning_rate": 9.433897130907157e-07, + "loss": 0.9011, + "step": 4804 + }, + { + "epoch": 1.1682470216387064, + "grad_norm": 23.125, + "learning_rate": 9.42921237121444e-07, + "loss": 0.5479, + "step": 4805 + }, + { + "epoch": 1.1684901531728666, + "grad_norm": 26.375, + "learning_rate": 9.424528070503967e-07, + "loss": 0.6548, + "step": 4806 + }, + { + "epoch": 1.1687332847070264, + "grad_norm": 21.875, + "learning_rate": 9.419844229475889e-07, + "loss": 0.7603, + "step": 4807 + }, + { + "epoch": 1.1689764162411864, + "grad_norm": 17.5, + "learning_rate": 9.415160848830279e-07, + "loss": 0.4958, + "step": 4808 + }, + { + "epoch": 1.1692195477753464, + "grad_norm": 18.625, + "learning_rate": 9.410477929267167e-07, + "loss": 0.5777, + "step": 4809 + }, + { + "epoch": 1.1694626793095064, + "grad_norm": 21.75, + "learning_rate": 9.405795471486483e-07, + "loss": 0.5548, + "step": 4810 + }, + { + "epoch": 1.1697058108436664, + "grad_norm": 23.875, + "learning_rate": 9.401113476188105e-07, + "loss": 0.6941, + "step": 4811 + }, + { + "epoch": 1.1699489423778264, + "grad_norm": 21.75, + "learning_rate": 9.396431944071839e-07, + "loss": 1.0553, + "step": 4812 + }, + { + "epoch": 1.1701920739119864, + "grad_norm": 18.0, + "learning_rate": 9.391750875837418e-07, + "loss": 0.504, + "step": 4813 + }, + { + "epoch": 1.1704352054461464, + "grad_norm": 22.875, + "learning_rate": 9.387070272184509e-07, + "loss": 0.923, + "step": 4814 + }, + { + "epoch": 1.1706783369803064, + "grad_norm": 24.375, + "learning_rate": 9.382390133812714e-07, + "loss": 0.708, + "step": 4815 + }, + { + "epoch": 1.1709214685144662, + "grad_norm": 19.125, + "learning_rate": 9.37771046142155e-07, + "loss": 0.443, + "step": 4816 + }, + { + "epoch": 1.1711646000486262, + "grad_norm": 22.25, + "learning_rate": 9.373031255710486e-07, + "loss": 0.9353, + "step": 4817 + }, + { + "epoch": 1.1714077315827862, + "grad_norm": 23.75, + "learning_rate": 9.368352517378903e-07, + "loss": 0.7583, + "step": 4818 + }, + { + "epoch": 1.1716508631169462, + "grad_norm": 14.5625, + "learning_rate": 9.363674247126126e-07, + "loss": 0.3643, + "step": 4819 + }, + { + "epoch": 1.1718939946511062, + "grad_norm": 22.875, + "learning_rate": 9.358996445651394e-07, + "loss": 0.537, + "step": 4820 + }, + { + "epoch": 1.1721371261852662, + "grad_norm": 25.25, + "learning_rate": 9.354319113653893e-07, + "loss": 1.0392, + "step": 4821 + }, + { + "epoch": 1.1723802577194262, + "grad_norm": 24.0, + "learning_rate": 9.349642251832729e-07, + "loss": 0.8923, + "step": 4822 + }, + { + "epoch": 1.1726233892535862, + "grad_norm": 18.875, + "learning_rate": 9.344965860886937e-07, + "loss": 0.7016, + "step": 4823 + }, + { + "epoch": 1.1728665207877462, + "grad_norm": 21.0, + "learning_rate": 9.340289941515483e-07, + "loss": 0.5655, + "step": 4824 + }, + { + "epoch": 1.1731096523219062, + "grad_norm": 20.125, + "learning_rate": 9.335614494417271e-07, + "loss": 0.6167, + "step": 4825 + }, + { + "epoch": 1.1733527838560662, + "grad_norm": 25.125, + "learning_rate": 9.330939520291127e-07, + "loss": 0.9493, + "step": 4826 + }, + { + "epoch": 1.173595915390226, + "grad_norm": 26.875, + "learning_rate": 9.326265019835798e-07, + "loss": 0.8987, + "step": 4827 + }, + { + "epoch": 1.173839046924386, + "grad_norm": 23.625, + "learning_rate": 9.321590993749977e-07, + "loss": 1.0924, + "step": 4828 + }, + { + "epoch": 1.174082178458546, + "grad_norm": 19.75, + "learning_rate": 9.316917442732277e-07, + "loss": 0.6558, + "step": 4829 + }, + { + "epoch": 1.174325309992706, + "grad_norm": 17.375, + "learning_rate": 9.312244367481234e-07, + "loss": 0.3978, + "step": 4830 + }, + { + "epoch": 1.174568441526866, + "grad_norm": 26.75, + "learning_rate": 9.307571768695327e-07, + "loss": 0.7809, + "step": 4831 + }, + { + "epoch": 1.174811573061026, + "grad_norm": 21.5, + "learning_rate": 9.302899647072951e-07, + "loss": 1.1568, + "step": 4832 + }, + { + "epoch": 1.175054704595186, + "grad_norm": 19.0, + "learning_rate": 9.298228003312443e-07, + "loss": 0.5374, + "step": 4833 + }, + { + "epoch": 1.175297836129346, + "grad_norm": 17.75, + "learning_rate": 9.293556838112056e-07, + "loss": 0.3968, + "step": 4834 + }, + { + "epoch": 1.175540967663506, + "grad_norm": 21.125, + "learning_rate": 9.288886152169974e-07, + "loss": 0.9098, + "step": 4835 + }, + { + "epoch": 1.175784099197666, + "grad_norm": 21.75, + "learning_rate": 9.284215946184319e-07, + "loss": 0.5139, + "step": 4836 + }, + { + "epoch": 1.176027230731826, + "grad_norm": 23.625, + "learning_rate": 9.279546220853125e-07, + "loss": 0.566, + "step": 4837 + }, + { + "epoch": 1.1762703622659858, + "grad_norm": 18.375, + "learning_rate": 9.27487697687437e-07, + "loss": 0.5967, + "step": 4838 + }, + { + "epoch": 1.1765134938001458, + "grad_norm": 20.5, + "learning_rate": 9.270208214945947e-07, + "loss": 0.5644, + "step": 4839 + }, + { + "epoch": 1.1767566253343058, + "grad_norm": 21.875, + "learning_rate": 9.265539935765691e-07, + "loss": 1.1081, + "step": 4840 + }, + { + "epoch": 1.1769997568684658, + "grad_norm": 21.625, + "learning_rate": 9.26087214003135e-07, + "loss": 0.605, + "step": 4841 + }, + { + "epoch": 1.1772428884026258, + "grad_norm": 22.625, + "learning_rate": 9.25620482844061e-07, + "loss": 0.8402, + "step": 4842 + }, + { + "epoch": 1.1774860199367858, + "grad_norm": 18.25, + "learning_rate": 9.251538001691084e-07, + "loss": 0.7289, + "step": 4843 + }, + { + "epoch": 1.1777291514709458, + "grad_norm": 22.375, + "learning_rate": 9.246871660480303e-07, + "loss": 0.5228, + "step": 4844 + }, + { + "epoch": 1.1779722830051058, + "grad_norm": 15.3125, + "learning_rate": 9.242205805505735e-07, + "loss": 0.49, + "step": 4845 + }, + { + "epoch": 1.1782154145392658, + "grad_norm": 19.25, + "learning_rate": 9.237540437464779e-07, + "loss": 0.759, + "step": 4846 + }, + { + "epoch": 1.1784585460734258, + "grad_norm": 17.875, + "learning_rate": 9.23287555705474e-07, + "loss": 0.6646, + "step": 4847 + }, + { + "epoch": 1.1787016776075858, + "grad_norm": 19.5, + "learning_rate": 9.228211164972879e-07, + "loss": 0.3533, + "step": 4848 + }, + { + "epoch": 1.1789448091417456, + "grad_norm": 21.625, + "learning_rate": 9.223547261916366e-07, + "loss": 0.5101, + "step": 4849 + }, + { + "epoch": 1.1791879406759056, + "grad_norm": 22.0, + "learning_rate": 9.218883848582302e-07, + "loss": 0.6311, + "step": 4850 + }, + { + "epoch": 1.1794310722100656, + "grad_norm": 20.875, + "learning_rate": 9.214220925667712e-07, + "loss": 1.1025, + "step": 4851 + }, + { + "epoch": 1.1796742037442256, + "grad_norm": 18.625, + "learning_rate": 9.209558493869551e-07, + "loss": 0.6219, + "step": 4852 + }, + { + "epoch": 1.1799173352783856, + "grad_norm": 18.375, + "learning_rate": 9.204896553884705e-07, + "loss": 0.5677, + "step": 4853 + }, + { + "epoch": 1.1801604668125456, + "grad_norm": 23.125, + "learning_rate": 9.200235106409974e-07, + "loss": 1.0367, + "step": 4854 + }, + { + "epoch": 1.1804035983467056, + "grad_norm": 16.375, + "learning_rate": 9.195574152142092e-07, + "loss": 0.4637, + "step": 4855 + }, + { + "epoch": 1.1806467298808656, + "grad_norm": 15.125, + "learning_rate": 9.190913691777726e-07, + "loss": 0.3284, + "step": 4856 + }, + { + "epoch": 1.1808898614150256, + "grad_norm": 23.25, + "learning_rate": 9.186253726013461e-07, + "loss": 0.8505, + "step": 4857 + }, + { + "epoch": 1.1811329929491854, + "grad_norm": 25.125, + "learning_rate": 9.181594255545805e-07, + "loss": 0.718, + "step": 4858 + }, + { + "epoch": 1.1813761244833456, + "grad_norm": 22.875, + "learning_rate": 9.176935281071198e-07, + "loss": 0.9073, + "step": 4859 + }, + { + "epoch": 1.1816192560175054, + "grad_norm": 18.875, + "learning_rate": 9.172276803286006e-07, + "loss": 0.8079, + "step": 4860 + }, + { + "epoch": 1.1818623875516654, + "grad_norm": 17.75, + "learning_rate": 9.167618822886516e-07, + "loss": 0.49, + "step": 4861 + }, + { + "epoch": 1.1821055190858254, + "grad_norm": 24.375, + "learning_rate": 9.162961340568944e-07, + "loss": 0.9699, + "step": 4862 + }, + { + "epoch": 1.1823486506199854, + "grad_norm": 23.875, + "learning_rate": 9.158304357029432e-07, + "loss": 0.5796, + "step": 4863 + }, + { + "epoch": 1.1825917821541454, + "grad_norm": 24.75, + "learning_rate": 9.15364787296405e-07, + "loss": 1.1586, + "step": 4864 + }, + { + "epoch": 1.1828349136883054, + "grad_norm": 18.375, + "learning_rate": 9.148991889068785e-07, + "loss": 0.5214, + "step": 4865 + }, + { + "epoch": 1.1830780452224654, + "grad_norm": 22.25, + "learning_rate": 9.144336406039556e-07, + "loss": 0.853, + "step": 4866 + }, + { + "epoch": 1.1833211767566254, + "grad_norm": 14.75, + "learning_rate": 9.139681424572208e-07, + "loss": 0.389, + "step": 4867 + }, + { + "epoch": 1.1835643082907854, + "grad_norm": 18.125, + "learning_rate": 9.135026945362505e-07, + "loss": 0.5399, + "step": 4868 + }, + { + "epoch": 1.1838074398249452, + "grad_norm": 21.25, + "learning_rate": 9.130372969106138e-07, + "loss": 0.6072, + "step": 4869 + }, + { + "epoch": 1.1840505713591052, + "grad_norm": 19.5, + "learning_rate": 9.125719496498723e-07, + "loss": 0.7865, + "step": 4870 + }, + { + "epoch": 1.1842937028932652, + "grad_norm": 24.5, + "learning_rate": 9.121066528235812e-07, + "loss": 0.8741, + "step": 4871 + }, + { + "epoch": 1.1845368344274252, + "grad_norm": 18.125, + "learning_rate": 9.116414065012861e-07, + "loss": 0.6396, + "step": 4872 + }, + { + "epoch": 1.1847799659615852, + "grad_norm": 19.875, + "learning_rate": 9.111762107525266e-07, + "loss": 0.6386, + "step": 4873 + }, + { + "epoch": 1.1850230974957452, + "grad_norm": 16.75, + "learning_rate": 9.107110656468342e-07, + "loss": 0.4619, + "step": 4874 + }, + { + "epoch": 1.1852662290299052, + "grad_norm": 17.625, + "learning_rate": 9.102459712537326e-07, + "loss": 0.4006, + "step": 4875 + }, + { + "epoch": 1.1855093605640652, + "grad_norm": 28.5, + "learning_rate": 9.097809276427382e-07, + "loss": 0.7335, + "step": 4876 + }, + { + "epoch": 1.1857524920982252, + "grad_norm": 18.75, + "learning_rate": 9.093159348833604e-07, + "loss": 0.6676, + "step": 4877 + }, + { + "epoch": 1.1859956236323852, + "grad_norm": 22.0, + "learning_rate": 9.08850993045099e-07, + "loss": 1.0245, + "step": 4878 + }, + { + "epoch": 1.1862387551665452, + "grad_norm": 19.25, + "learning_rate": 9.083861021974494e-07, + "loss": 0.5719, + "step": 4879 + }, + { + "epoch": 1.186481886700705, + "grad_norm": 15.375, + "learning_rate": 9.079212624098966e-07, + "loss": 0.3109, + "step": 4880 + }, + { + "epoch": 1.186725018234865, + "grad_norm": 27.25, + "learning_rate": 9.07456473751919e-07, + "loss": 0.7522, + "step": 4881 + }, + { + "epoch": 1.186968149769025, + "grad_norm": 13.6875, + "learning_rate": 9.069917362929873e-07, + "loss": 0.3014, + "step": 4882 + }, + { + "epoch": 1.187211281303185, + "grad_norm": 19.875, + "learning_rate": 9.065270501025645e-07, + "loss": 0.6737, + "step": 4883 + }, + { + "epoch": 1.187454412837345, + "grad_norm": 25.75, + "learning_rate": 9.060624152501062e-07, + "loss": 0.7117, + "step": 4884 + }, + { + "epoch": 1.187697544371505, + "grad_norm": 20.25, + "learning_rate": 9.055978318050597e-07, + "loss": 0.6932, + "step": 4885 + }, + { + "epoch": 1.187940675905665, + "grad_norm": 18.0, + "learning_rate": 9.051332998368651e-07, + "loss": 0.5434, + "step": 4886 + }, + { + "epoch": 1.188183807439825, + "grad_norm": 17.5, + "learning_rate": 9.046688194149552e-07, + "loss": 1.0233, + "step": 4887 + }, + { + "epoch": 1.188426938973985, + "grad_norm": 16.75, + "learning_rate": 9.042043906087544e-07, + "loss": 0.6163, + "step": 4888 + }, + { + "epoch": 1.188670070508145, + "grad_norm": 21.0, + "learning_rate": 9.037400134876793e-07, + "loss": 0.8709, + "step": 4889 + }, + { + "epoch": 1.188913202042305, + "grad_norm": 22.375, + "learning_rate": 9.032756881211394e-07, + "loss": 0.7121, + "step": 4890 + }, + { + "epoch": 1.1891563335764648, + "grad_norm": 19.5, + "learning_rate": 9.028114145785363e-07, + "loss": 0.7821, + "step": 4891 + }, + { + "epoch": 1.1893994651106248, + "grad_norm": 22.0, + "learning_rate": 9.023471929292632e-07, + "loss": 0.7796, + "step": 4892 + }, + { + "epoch": 1.1896425966447848, + "grad_norm": 20.125, + "learning_rate": 9.018830232427059e-07, + "loss": 0.9188, + "step": 4893 + }, + { + "epoch": 1.1898857281789448, + "grad_norm": 18.625, + "learning_rate": 9.014189055882433e-07, + "loss": 0.4755, + "step": 4894 + }, + { + "epoch": 1.1901288597131048, + "grad_norm": 17.25, + "learning_rate": 9.009548400352455e-07, + "loss": 0.6711, + "step": 4895 + }, + { + "epoch": 1.1903719912472648, + "grad_norm": 20.625, + "learning_rate": 9.004908266530754e-07, + "loss": 0.5938, + "step": 4896 + }, + { + "epoch": 1.1906151227814248, + "grad_norm": 18.625, + "learning_rate": 9.000268655110871e-07, + "loss": 0.5651, + "step": 4897 + }, + { + "epoch": 1.1908582543155848, + "grad_norm": 15.75, + "learning_rate": 8.995629566786282e-07, + "loss": 0.295, + "step": 4898 + }, + { + "epoch": 1.1911013858497448, + "grad_norm": 20.125, + "learning_rate": 8.990991002250376e-07, + "loss": 0.5402, + "step": 4899 + }, + { + "epoch": 1.1913445173839048, + "grad_norm": 22.75, + "learning_rate": 8.986352962196466e-07, + "loss": 1.0619, + "step": 4900 + }, + { + "epoch": 1.1915876489180648, + "grad_norm": 17.625, + "learning_rate": 8.981715447317788e-07, + "loss": 0.4252, + "step": 4901 + }, + { + "epoch": 1.1918307804522246, + "grad_norm": 21.375, + "learning_rate": 8.977078458307499e-07, + "loss": 0.8044, + "step": 4902 + }, + { + "epoch": 1.1920739119863846, + "grad_norm": 23.875, + "learning_rate": 8.972441995858681e-07, + "loss": 0.5866, + "step": 4903 + }, + { + "epoch": 1.1923170435205446, + "grad_norm": 24.0, + "learning_rate": 8.967806060664328e-07, + "loss": 1.1851, + "step": 4904 + }, + { + "epoch": 1.1925601750547046, + "grad_norm": 17.75, + "learning_rate": 8.963170653417364e-07, + "loss": 0.4021, + "step": 4905 + }, + { + "epoch": 1.1928033065888646, + "grad_norm": 22.75, + "learning_rate": 8.958535774810626e-07, + "loss": 0.8427, + "step": 4906 + }, + { + "epoch": 1.1930464381230246, + "grad_norm": 18.875, + "learning_rate": 8.953901425536881e-07, + "loss": 0.6998, + "step": 4907 + }, + { + "epoch": 1.1932895696571846, + "grad_norm": 15.75, + "learning_rate": 8.949267606288814e-07, + "loss": 0.318, + "step": 4908 + }, + { + "epoch": 1.1935327011913446, + "grad_norm": 20.5, + "learning_rate": 8.944634317759019e-07, + "loss": 0.5234, + "step": 4909 + }, + { + "epoch": 1.1937758327255046, + "grad_norm": 22.875, + "learning_rate": 8.940001560640034e-07, + "loss": 0.8072, + "step": 4910 + }, + { + "epoch": 1.1940189642596644, + "grad_norm": 17.625, + "learning_rate": 8.935369335624297e-07, + "loss": 0.4621, + "step": 4911 + }, + { + "epoch": 1.1942620957938244, + "grad_norm": 21.25, + "learning_rate": 8.930737643404178e-07, + "loss": 0.759, + "step": 4912 + }, + { + "epoch": 1.1945052273279844, + "grad_norm": 23.25, + "learning_rate": 8.926106484671962e-07, + "loss": 0.6255, + "step": 4913 + }, + { + "epoch": 1.1947483588621444, + "grad_norm": 19.5, + "learning_rate": 8.921475860119854e-07, + "loss": 0.8516, + "step": 4914 + }, + { + "epoch": 1.1949914903963044, + "grad_norm": 16.5, + "learning_rate": 8.916845770439984e-07, + "loss": 0.4676, + "step": 4915 + }, + { + "epoch": 1.1952346219304644, + "grad_norm": 20.25, + "learning_rate": 8.912216216324395e-07, + "loss": 0.5871, + "step": 4916 + }, + { + "epoch": 1.1954777534646244, + "grad_norm": 20.125, + "learning_rate": 8.907587198465051e-07, + "loss": 0.5728, + "step": 4917 + }, + { + "epoch": 1.1957208849987844, + "grad_norm": 23.75, + "learning_rate": 8.902958717553848e-07, + "loss": 0.6871, + "step": 4918 + }, + { + "epoch": 1.1959640165329444, + "grad_norm": 18.25, + "learning_rate": 8.898330774282588e-07, + "loss": 0.7488, + "step": 4919 + }, + { + "epoch": 1.1962071480671044, + "grad_norm": 23.875, + "learning_rate": 8.893703369342998e-07, + "loss": 0.8874, + "step": 4920 + }, + { + "epoch": 1.1964502796012644, + "grad_norm": 27.25, + "learning_rate": 8.889076503426719e-07, + "loss": 0.9963, + "step": 4921 + }, + { + "epoch": 1.1966934111354242, + "grad_norm": 17.875, + "learning_rate": 8.884450177225323e-07, + "loss": 0.7539, + "step": 4922 + }, + { + "epoch": 1.1969365426695842, + "grad_norm": 21.25, + "learning_rate": 8.879824391430289e-07, + "loss": 0.554, + "step": 4923 + }, + { + "epoch": 1.1971796742037442, + "grad_norm": 29.375, + "learning_rate": 8.875199146733018e-07, + "loss": 0.8313, + "step": 4924 + }, + { + "epoch": 1.1974228057379042, + "grad_norm": 25.0, + "learning_rate": 8.87057444382484e-07, + "loss": 0.8371, + "step": 4925 + }, + { + "epoch": 1.1976659372720642, + "grad_norm": 19.375, + "learning_rate": 8.865950283396995e-07, + "loss": 0.664, + "step": 4926 + }, + { + "epoch": 1.1979090688062242, + "grad_norm": 14.0625, + "learning_rate": 8.861326666140644e-07, + "loss": 0.4368, + "step": 4927 + }, + { + "epoch": 1.1981522003403842, + "grad_norm": 22.125, + "learning_rate": 8.856703592746862e-07, + "loss": 0.8901, + "step": 4928 + }, + { + "epoch": 1.1983953318745442, + "grad_norm": 17.25, + "learning_rate": 8.85208106390665e-07, + "loss": 0.519, + "step": 4929 + }, + { + "epoch": 1.1986384634087042, + "grad_norm": 17.75, + "learning_rate": 8.847459080310927e-07, + "loss": 0.7561, + "step": 4930 + }, + { + "epoch": 1.1988815949428642, + "grad_norm": 21.625, + "learning_rate": 8.842837642650526e-07, + "loss": 0.9093, + "step": 4931 + }, + { + "epoch": 1.1991247264770242, + "grad_norm": 23.875, + "learning_rate": 8.838216751616195e-07, + "loss": 0.6419, + "step": 4932 + }, + { + "epoch": 1.199367858011184, + "grad_norm": 16.25, + "learning_rate": 8.833596407898615e-07, + "loss": 0.4743, + "step": 4933 + }, + { + "epoch": 1.199610989545344, + "grad_norm": 17.5, + "learning_rate": 8.828976612188376e-07, + "loss": 0.6557, + "step": 4934 + }, + { + "epoch": 1.199854121079504, + "grad_norm": 16.0, + "learning_rate": 8.824357365175982e-07, + "loss": 0.538, + "step": 4935 + }, + { + "epoch": 1.200097252613664, + "grad_norm": 22.125, + "learning_rate": 8.819738667551857e-07, + "loss": 0.8751, + "step": 4936 + }, + { + "epoch": 1.200340384147824, + "grad_norm": 17.125, + "learning_rate": 8.815120520006352e-07, + "loss": 0.5195, + "step": 4937 + }, + { + "epoch": 1.200583515681984, + "grad_norm": 19.75, + "learning_rate": 8.810502923229724e-07, + "loss": 0.6029, + "step": 4938 + }, + { + "epoch": 1.200826647216144, + "grad_norm": 19.625, + "learning_rate": 8.805885877912156e-07, + "loss": 1.3229, + "step": 4939 + }, + { + "epoch": 1.201069778750304, + "grad_norm": 18.625, + "learning_rate": 8.801269384743735e-07, + "loss": 0.9567, + "step": 4940 + }, + { + "epoch": 1.201312910284464, + "grad_norm": 18.0, + "learning_rate": 8.796653444414491e-07, + "loss": 0.5203, + "step": 4941 + }, + { + "epoch": 1.201556041818624, + "grad_norm": 25.75, + "learning_rate": 8.792038057614348e-07, + "loss": 0.6189, + "step": 4942 + }, + { + "epoch": 1.201799173352784, + "grad_norm": 14.3125, + "learning_rate": 8.787423225033154e-07, + "loss": 0.6033, + "step": 4943 + }, + { + "epoch": 1.2020423048869437, + "grad_norm": 21.75, + "learning_rate": 8.782808947360682e-07, + "loss": 1.0152, + "step": 4944 + }, + { + "epoch": 1.2022854364211037, + "grad_norm": 24.125, + "learning_rate": 8.778195225286607e-07, + "loss": 0.8958, + "step": 4945 + }, + { + "epoch": 1.2025285679552637, + "grad_norm": 19.0, + "learning_rate": 8.773582059500534e-07, + "loss": 0.7713, + "step": 4946 + }, + { + "epoch": 1.2027716994894238, + "grad_norm": 25.5, + "learning_rate": 8.768969450691982e-07, + "loss": 0.5732, + "step": 4947 + }, + { + "epoch": 1.2030148310235838, + "grad_norm": 14.9375, + "learning_rate": 8.764357399550377e-07, + "loss": 0.3609, + "step": 4948 + }, + { + "epoch": 1.2032579625577438, + "grad_norm": 18.75, + "learning_rate": 8.759745906765079e-07, + "loss": 0.5953, + "step": 4949 + }, + { + "epoch": 1.2035010940919038, + "grad_norm": 31.0, + "learning_rate": 8.75513497302535e-07, + "loss": 1.0023, + "step": 4950 + }, + { + "epoch": 1.2037442256260638, + "grad_norm": 23.5, + "learning_rate": 8.75052459902038e-07, + "loss": 0.9095, + "step": 4951 + }, + { + "epoch": 1.2039873571602238, + "grad_norm": 21.125, + "learning_rate": 8.745914785439261e-07, + "loss": 0.9881, + "step": 4952 + }, + { + "epoch": 1.2042304886943835, + "grad_norm": 19.625, + "learning_rate": 8.741305532971011e-07, + "loss": 0.8702, + "step": 4953 + }, + { + "epoch": 1.2044736202285438, + "grad_norm": 17.5, + "learning_rate": 8.736696842304567e-07, + "loss": 0.671, + "step": 4954 + }, + { + "epoch": 1.2047167517627035, + "grad_norm": 17.125, + "learning_rate": 8.732088714128773e-07, + "loss": 0.6673, + "step": 4955 + }, + { + "epoch": 1.2049598832968635, + "grad_norm": 27.625, + "learning_rate": 8.727481149132394e-07, + "loss": 0.5197, + "step": 4956 + }, + { + "epoch": 1.2052030148310235, + "grad_norm": 20.75, + "learning_rate": 8.722874148004111e-07, + "loss": 1.0014, + "step": 4957 + }, + { + "epoch": 1.2054461463651835, + "grad_norm": 20.125, + "learning_rate": 8.718267711432524e-07, + "loss": 0.5627, + "step": 4958 + }, + { + "epoch": 1.2056892778993435, + "grad_norm": 18.375, + "learning_rate": 8.713661840106136e-07, + "loss": 0.8288, + "step": 4959 + }, + { + "epoch": 1.2059324094335035, + "grad_norm": 19.125, + "learning_rate": 8.709056534713378e-07, + "loss": 0.4407, + "step": 4960 + }, + { + "epoch": 1.2061755409676636, + "grad_norm": 29.625, + "learning_rate": 8.704451795942596e-07, + "loss": 1.067, + "step": 4961 + }, + { + "epoch": 1.2064186725018236, + "grad_norm": 19.75, + "learning_rate": 8.699847624482042e-07, + "loss": 0.7111, + "step": 4962 + }, + { + "epoch": 1.2066618040359836, + "grad_norm": 24.5, + "learning_rate": 8.69524402101989e-07, + "loss": 0.895, + "step": 4963 + }, + { + "epoch": 1.2069049355701433, + "grad_norm": 24.875, + "learning_rate": 8.690640986244231e-07, + "loss": 0.7428, + "step": 4964 + }, + { + "epoch": 1.2071480671043033, + "grad_norm": 22.75, + "learning_rate": 8.686038520843068e-07, + "loss": 0.8462, + "step": 4965 + }, + { + "epoch": 1.2073911986384633, + "grad_norm": 21.25, + "learning_rate": 8.681436625504316e-07, + "loss": 0.7423, + "step": 4966 + }, + { + "epoch": 1.2076343301726233, + "grad_norm": 20.875, + "learning_rate": 8.67683530091581e-07, + "loss": 0.567, + "step": 4967 + }, + { + "epoch": 1.2078774617067833, + "grad_norm": 15.625, + "learning_rate": 8.672234547765298e-07, + "loss": 0.5139, + "step": 4968 + }, + { + "epoch": 1.2081205932409433, + "grad_norm": 17.375, + "learning_rate": 8.667634366740439e-07, + "loss": 0.5036, + "step": 4969 + }, + { + "epoch": 1.2083637247751033, + "grad_norm": 18.875, + "learning_rate": 8.663034758528809e-07, + "loss": 0.4269, + "step": 4970 + }, + { + "epoch": 1.2086068563092633, + "grad_norm": 19.125, + "learning_rate": 8.658435723817902e-07, + "loss": 0.5858, + "step": 4971 + }, + { + "epoch": 1.2088499878434233, + "grad_norm": 18.625, + "learning_rate": 8.653837263295126e-07, + "loss": 0.5424, + "step": 4972 + }, + { + "epoch": 1.2090931193775833, + "grad_norm": 18.875, + "learning_rate": 8.649239377647791e-07, + "loss": 0.6346, + "step": 4973 + }, + { + "epoch": 1.2093362509117433, + "grad_norm": 21.0, + "learning_rate": 8.644642067563138e-07, + "loss": 0.7891, + "step": 4974 + }, + { + "epoch": 1.2095793824459031, + "grad_norm": 14.125, + "learning_rate": 8.640045333728316e-07, + "loss": 0.2975, + "step": 4975 + }, + { + "epoch": 1.2098225139800631, + "grad_norm": 20.125, + "learning_rate": 8.635449176830379e-07, + "loss": 0.7084, + "step": 4976 + }, + { + "epoch": 1.2100656455142231, + "grad_norm": 16.0, + "learning_rate": 8.630853597556308e-07, + "loss": 0.6825, + "step": 4977 + }, + { + "epoch": 1.2103087770483831, + "grad_norm": 32.75, + "learning_rate": 8.626258596592984e-07, + "loss": 0.7569, + "step": 4978 + }, + { + "epoch": 1.2105519085825431, + "grad_norm": 30.875, + "learning_rate": 8.62166417462722e-07, + "loss": 0.6905, + "step": 4979 + }, + { + "epoch": 1.2107950401167031, + "grad_norm": 20.5, + "learning_rate": 8.617070332345723e-07, + "loss": 0.6176, + "step": 4980 + }, + { + "epoch": 1.2110381716508631, + "grad_norm": 18.0, + "learning_rate": 8.612477070435127e-07, + "loss": 0.6297, + "step": 4981 + }, + { + "epoch": 1.2112813031850231, + "grad_norm": 20.5, + "learning_rate": 8.607884389581975e-07, + "loss": 0.8307, + "step": 4982 + }, + { + "epoch": 1.2115244347191831, + "grad_norm": 24.25, + "learning_rate": 8.603292290472717e-07, + "loss": 1.0392, + "step": 4983 + }, + { + "epoch": 1.2117675662533431, + "grad_norm": 16.125, + "learning_rate": 8.598700773793725e-07, + "loss": 0.7302, + "step": 4984 + }, + { + "epoch": 1.2120106977875031, + "grad_norm": 20.125, + "learning_rate": 8.594109840231282e-07, + "loss": 0.6587, + "step": 4985 + }, + { + "epoch": 1.212253829321663, + "grad_norm": 27.875, + "learning_rate": 8.589519490471576e-07, + "loss": 0.7282, + "step": 4986 + }, + { + "epoch": 1.212496960855823, + "grad_norm": 17.875, + "learning_rate": 8.58492972520072e-07, + "loss": 0.4925, + "step": 4987 + }, + { + "epoch": 1.212740092389983, + "grad_norm": 16.5, + "learning_rate": 8.580340545104735e-07, + "loss": 0.4443, + "step": 4988 + }, + { + "epoch": 1.212983223924143, + "grad_norm": 17.625, + "learning_rate": 8.57575195086955e-07, + "loss": 0.4446, + "step": 4989 + }, + { + "epoch": 1.213226355458303, + "grad_norm": 21.625, + "learning_rate": 8.571163943181008e-07, + "loss": 0.9306, + "step": 4990 + }, + { + "epoch": 1.213469486992463, + "grad_norm": 13.875, + "learning_rate": 8.566576522724869e-07, + "loss": 0.3169, + "step": 4991 + }, + { + "epoch": 1.213712618526623, + "grad_norm": 25.375, + "learning_rate": 8.561989690186803e-07, + "loss": 0.4153, + "step": 4992 + }, + { + "epoch": 1.213955750060783, + "grad_norm": 22.625, + "learning_rate": 8.557403446252388e-07, + "loss": 0.5255, + "step": 4993 + }, + { + "epoch": 1.214198881594943, + "grad_norm": 23.875, + "learning_rate": 8.552817791607115e-07, + "loss": 0.7175, + "step": 4994 + }, + { + "epoch": 1.214442013129103, + "grad_norm": 26.625, + "learning_rate": 8.548232726936396e-07, + "loss": 0.7659, + "step": 4995 + }, + { + "epoch": 1.214685144663263, + "grad_norm": 18.75, + "learning_rate": 8.543648252925549e-07, + "loss": 0.3982, + "step": 4996 + }, + { + "epoch": 1.2149282761974227, + "grad_norm": 26.625, + "learning_rate": 8.539064370259796e-07, + "loss": 1.0359, + "step": 4997 + }, + { + "epoch": 1.2151714077315827, + "grad_norm": 30.75, + "learning_rate": 8.53448107962428e-07, + "loss": 0.7103, + "step": 4998 + }, + { + "epoch": 1.2154145392657427, + "grad_norm": 19.375, + "learning_rate": 8.529898381704058e-07, + "loss": 0.4476, + "step": 4999 + }, + { + "epoch": 1.2156576707999027, + "grad_norm": 20.0, + "learning_rate": 8.525316277184084e-07, + "loss": 0.5951, + "step": 5000 + }, + { + "epoch": 1.2159008023340627, + "grad_norm": 21.75, + "learning_rate": 8.520734766749239e-07, + "loss": 0.7579, + "step": 5001 + }, + { + "epoch": 1.2161439338682227, + "grad_norm": 20.0, + "learning_rate": 8.516153851084305e-07, + "loss": 1.0184, + "step": 5002 + }, + { + "epoch": 1.2163870654023827, + "grad_norm": 20.5, + "learning_rate": 8.511573530873985e-07, + "loss": 0.88, + "step": 5003 + }, + { + "epoch": 1.2166301969365427, + "grad_norm": 23.375, + "learning_rate": 8.506993806802882e-07, + "loss": 0.6534, + "step": 5004 + }, + { + "epoch": 1.2168733284707027, + "grad_norm": 25.125, + "learning_rate": 8.502414679555515e-07, + "loss": 0.7731, + "step": 5005 + }, + { + "epoch": 1.2171164600048625, + "grad_norm": 25.25, + "learning_rate": 8.497836149816318e-07, + "loss": 0.6918, + "step": 5006 + }, + { + "epoch": 1.2173595915390227, + "grad_norm": 20.625, + "learning_rate": 8.493258218269627e-07, + "loss": 1.1542, + "step": 5007 + }, + { + "epoch": 1.2176027230731825, + "grad_norm": 20.0, + "learning_rate": 8.488680885599692e-07, + "loss": 0.7988, + "step": 5008 + }, + { + "epoch": 1.2178458546073425, + "grad_norm": 32.5, + "learning_rate": 8.484104152490677e-07, + "loss": 1.2164, + "step": 5009 + }, + { + "epoch": 1.2180889861415025, + "grad_norm": 15.125, + "learning_rate": 8.479528019626654e-07, + "loss": 0.4186, + "step": 5010 + }, + { + "epoch": 1.2183321176756625, + "grad_norm": 18.875, + "learning_rate": 8.474952487691607e-07, + "loss": 0.8304, + "step": 5011 + }, + { + "epoch": 1.2185752492098225, + "grad_norm": 18.875, + "learning_rate": 8.470377557369422e-07, + "loss": 0.536, + "step": 5012 + }, + { + "epoch": 1.2188183807439825, + "grad_norm": 21.0, + "learning_rate": 8.465803229343908e-07, + "loss": 0.6295, + "step": 5013 + }, + { + "epoch": 1.2190615122781425, + "grad_norm": 39.5, + "learning_rate": 8.461229504298772e-07, + "loss": 1.0606, + "step": 5014 + }, + { + "epoch": 1.2193046438123025, + "grad_norm": 22.375, + "learning_rate": 8.456656382917639e-07, + "loss": 0.6199, + "step": 5015 + }, + { + "epoch": 1.2195477753464625, + "grad_norm": 24.875, + "learning_rate": 8.452083865884044e-07, + "loss": 0.977, + "step": 5016 + }, + { + "epoch": 1.2197909068806223, + "grad_norm": 22.375, + "learning_rate": 8.447511953881416e-07, + "loss": 1.1204, + "step": 5017 + }, + { + "epoch": 1.2200340384147823, + "grad_norm": 17.625, + "learning_rate": 8.442940647593123e-07, + "loss": 0.5688, + "step": 5018 + }, + { + "epoch": 1.2202771699489423, + "grad_norm": 17.375, + "learning_rate": 8.438369947702416e-07, + "loss": 0.3245, + "step": 5019 + }, + { + "epoch": 1.2205203014831023, + "grad_norm": 18.625, + "learning_rate": 8.43379985489247e-07, + "loss": 0.5849, + "step": 5020 + }, + { + "epoch": 1.2207634330172623, + "grad_norm": 22.0, + "learning_rate": 8.429230369846358e-07, + "loss": 0.5065, + "step": 5021 + }, + { + "epoch": 1.2210065645514223, + "grad_norm": 27.375, + "learning_rate": 8.424661493247073e-07, + "loss": 0.7494, + "step": 5022 + }, + { + "epoch": 1.2212496960855823, + "grad_norm": 28.5, + "learning_rate": 8.420093225777513e-07, + "loss": 0.8999, + "step": 5023 + }, + { + "epoch": 1.2214928276197423, + "grad_norm": 18.125, + "learning_rate": 8.41552556812048e-07, + "loss": 0.5888, + "step": 5024 + }, + { + "epoch": 1.2217359591539023, + "grad_norm": 21.625, + "learning_rate": 8.410958520958691e-07, + "loss": 0.4691, + "step": 5025 + }, + { + "epoch": 1.2219790906880623, + "grad_norm": 21.25, + "learning_rate": 8.406392084974774e-07, + "loss": 0.5534, + "step": 5026 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 17.375, + "learning_rate": 8.40182626085126e-07, + "loss": 0.635, + "step": 5027 + }, + { + "epoch": 1.222465353756382, + "grad_norm": 17.5, + "learning_rate": 8.397261049270586e-07, + "loss": 0.4704, + "step": 5028 + }, + { + "epoch": 1.222708485290542, + "grad_norm": 18.0, + "learning_rate": 8.392696450915106e-07, + "loss": 0.5269, + "step": 5029 + }, + { + "epoch": 1.222951616824702, + "grad_norm": 22.0, + "learning_rate": 8.388132466467081e-07, + "loss": 0.6745, + "step": 5030 + }, + { + "epoch": 1.223194748358862, + "grad_norm": 24.25, + "learning_rate": 8.383569096608668e-07, + "loss": 0.8552, + "step": 5031 + }, + { + "epoch": 1.223437879893022, + "grad_norm": 19.0, + "learning_rate": 8.379006342021946e-07, + "loss": 0.6174, + "step": 5032 + }, + { + "epoch": 1.223681011427182, + "grad_norm": 20.75, + "learning_rate": 8.374444203388902e-07, + "loss": 0.8532, + "step": 5033 + }, + { + "epoch": 1.223924142961342, + "grad_norm": 26.125, + "learning_rate": 8.36988268139142e-07, + "loss": 0.665, + "step": 5034 + }, + { + "epoch": 1.2241672744955021, + "grad_norm": 25.375, + "learning_rate": 8.365321776711304e-07, + "loss": 0.738, + "step": 5035 + }, + { + "epoch": 1.2244104060296621, + "grad_norm": 16.875, + "learning_rate": 8.360761490030254e-07, + "loss": 0.5678, + "step": 5036 + }, + { + "epoch": 1.2246535375638221, + "grad_norm": 18.75, + "learning_rate": 8.356201822029889e-07, + "loss": 0.6996, + "step": 5037 + }, + { + "epoch": 1.2248966690979821, + "grad_norm": 29.125, + "learning_rate": 8.351642773391727e-07, + "loss": 0.9046, + "step": 5038 + }, + { + "epoch": 1.225139800632142, + "grad_norm": 21.875, + "learning_rate": 8.347084344797196e-07, + "loss": 0.9709, + "step": 5039 + }, + { + "epoch": 1.225382932166302, + "grad_norm": 15.8125, + "learning_rate": 8.342526536927629e-07, + "loss": 0.4506, + "step": 5040 + }, + { + "epoch": 1.225626063700462, + "grad_norm": 21.75, + "learning_rate": 8.337969350464278e-07, + "loss": 0.7138, + "step": 5041 + }, + { + "epoch": 1.225869195234622, + "grad_norm": 14.75, + "learning_rate": 8.333412786088289e-07, + "loss": 0.344, + "step": 5042 + }, + { + "epoch": 1.226112326768782, + "grad_norm": 18.5, + "learning_rate": 8.328856844480718e-07, + "loss": 0.5888, + "step": 5043 + }, + { + "epoch": 1.226355458302942, + "grad_norm": 21.375, + "learning_rate": 8.32430152632253e-07, + "loss": 0.9529, + "step": 5044 + }, + { + "epoch": 1.226598589837102, + "grad_norm": 19.75, + "learning_rate": 8.319746832294595e-07, + "loss": 0.6544, + "step": 5045 + }, + { + "epoch": 1.226841721371262, + "grad_norm": 16.75, + "learning_rate": 8.315192763077689e-07, + "loss": 0.4995, + "step": 5046 + }, + { + "epoch": 1.227084852905422, + "grad_norm": 22.75, + "learning_rate": 8.310639319352504e-07, + "loss": 0.9184, + "step": 5047 + }, + { + "epoch": 1.227327984439582, + "grad_norm": 35.25, + "learning_rate": 8.306086501799616e-07, + "loss": 0.6188, + "step": 5048 + }, + { + "epoch": 1.227571115973742, + "grad_norm": 20.25, + "learning_rate": 8.30153431109954e-07, + "loss": 0.519, + "step": 5049 + }, + { + "epoch": 1.2278142475079017, + "grad_norm": 18.375, + "learning_rate": 8.29698274793267e-07, + "loss": 0.5042, + "step": 5050 + }, + { + "epoch": 1.2280573790420617, + "grad_norm": 20.0, + "learning_rate": 8.292431812979315e-07, + "loss": 0.9208, + "step": 5051 + }, + { + "epoch": 1.2283005105762217, + "grad_norm": 20.75, + "learning_rate": 8.287881506919696e-07, + "loss": 0.5641, + "step": 5052 + }, + { + "epoch": 1.2285436421103817, + "grad_norm": 19.875, + "learning_rate": 8.283331830433928e-07, + "loss": 0.7488, + "step": 5053 + }, + { + "epoch": 1.2287867736445417, + "grad_norm": 16.5, + "learning_rate": 8.278782784202047e-07, + "loss": 0.5215, + "step": 5054 + }, + { + "epoch": 1.2290299051787017, + "grad_norm": 20.25, + "learning_rate": 8.274234368903978e-07, + "loss": 0.7046, + "step": 5055 + }, + { + "epoch": 1.2292730367128617, + "grad_norm": 19.625, + "learning_rate": 8.269686585219561e-07, + "loss": 0.3506, + "step": 5056 + }, + { + "epoch": 1.2295161682470217, + "grad_norm": 24.25, + "learning_rate": 8.265139433828548e-07, + "loss": 0.9019, + "step": 5057 + }, + { + "epoch": 1.2297592997811817, + "grad_norm": 27.125, + "learning_rate": 8.260592915410584e-07, + "loss": 0.6694, + "step": 5058 + }, + { + "epoch": 1.2300024313153415, + "grad_norm": 25.375, + "learning_rate": 8.256047030645228e-07, + "loss": 0.3019, + "step": 5059 + }, + { + "epoch": 1.2302455628495015, + "grad_norm": 34.0, + "learning_rate": 8.251501780211938e-07, + "loss": 0.8181, + "step": 5060 + }, + { + "epoch": 1.2304886943836615, + "grad_norm": 19.0, + "learning_rate": 8.246957164790082e-07, + "loss": 0.8579, + "step": 5061 + }, + { + "epoch": 1.2307318259178215, + "grad_norm": 21.625, + "learning_rate": 8.242413185058928e-07, + "loss": 0.8039, + "step": 5062 + }, + { + "epoch": 1.2309749574519815, + "grad_norm": 19.75, + "learning_rate": 8.237869841697652e-07, + "loss": 0.9045, + "step": 5063 + }, + { + "epoch": 1.2312180889861415, + "grad_norm": 19.25, + "learning_rate": 8.233327135385341e-07, + "loss": 0.5451, + "step": 5064 + }, + { + "epoch": 1.2314612205203015, + "grad_norm": 17.75, + "learning_rate": 8.228785066800977e-07, + "loss": 0.6289, + "step": 5065 + }, + { + "epoch": 1.2317043520544615, + "grad_norm": 24.375, + "learning_rate": 8.224243636623455e-07, + "loss": 0.8035, + "step": 5066 + }, + { + "epoch": 1.2319474835886215, + "grad_norm": 18.5, + "learning_rate": 8.219702845531563e-07, + "loss": 0.5471, + "step": 5067 + }, + { + "epoch": 1.2321906151227815, + "grad_norm": 19.875, + "learning_rate": 8.215162694204003e-07, + "loss": 0.5268, + "step": 5068 + }, + { + "epoch": 1.2324337466569415, + "grad_norm": 17.5, + "learning_rate": 8.210623183319383e-07, + "loss": 0.5326, + "step": 5069 + }, + { + "epoch": 1.2326768781911013, + "grad_norm": 18.875, + "learning_rate": 8.206084313556207e-07, + "loss": 0.4787, + "step": 5070 + }, + { + "epoch": 1.2329200097252613, + "grad_norm": 22.75, + "learning_rate": 8.201546085592884e-07, + "loss": 0.8955, + "step": 5071 + }, + { + "epoch": 1.2331631412594213, + "grad_norm": 14.9375, + "learning_rate": 8.19700850010774e-07, + "loss": 0.3327, + "step": 5072 + }, + { + "epoch": 1.2334062727935813, + "grad_norm": 16.5, + "learning_rate": 8.192471557778995e-07, + "loss": 0.7224, + "step": 5073 + }, + { + "epoch": 1.2336494043277413, + "grad_norm": 23.375, + "learning_rate": 8.187935259284762e-07, + "loss": 0.5969, + "step": 5074 + }, + { + "epoch": 1.2338925358619013, + "grad_norm": 16.75, + "learning_rate": 8.18339960530308e-07, + "loss": 0.5852, + "step": 5075 + }, + { + "epoch": 1.2341356673960613, + "grad_norm": 22.75, + "learning_rate": 8.178864596511879e-07, + "loss": 0.5865, + "step": 5076 + }, + { + "epoch": 1.2343787989302213, + "grad_norm": 20.625, + "learning_rate": 8.174330233588989e-07, + "loss": 0.7485, + "step": 5077 + }, + { + "epoch": 1.2346219304643813, + "grad_norm": 21.75, + "learning_rate": 8.169796517212157e-07, + "loss": 0.6504, + "step": 5078 + }, + { + "epoch": 1.2348650619985413, + "grad_norm": 23.125, + "learning_rate": 8.165263448059013e-07, + "loss": 0.7995, + "step": 5079 + }, + { + "epoch": 1.2351081935327013, + "grad_norm": 39.25, + "learning_rate": 8.160731026807116e-07, + "loss": 0.7201, + "step": 5080 + }, + { + "epoch": 1.235351325066861, + "grad_norm": 19.75, + "learning_rate": 8.156199254133907e-07, + "loss": 0.5835, + "step": 5081 + }, + { + "epoch": 1.235594456601021, + "grad_norm": 20.625, + "learning_rate": 8.151668130716739e-07, + "loss": 0.9944, + "step": 5082 + }, + { + "epoch": 1.235837588135181, + "grad_norm": 23.0, + "learning_rate": 8.14713765723287e-07, + "loss": 0.6267, + "step": 5083 + }, + { + "epoch": 1.236080719669341, + "grad_norm": 27.875, + "learning_rate": 8.142607834359453e-07, + "loss": 0.8266, + "step": 5084 + }, + { + "epoch": 1.236323851203501, + "grad_norm": 26.375, + "learning_rate": 8.138078662773549e-07, + "loss": 0.8328, + "step": 5085 + }, + { + "epoch": 1.236566982737661, + "grad_norm": 20.0, + "learning_rate": 8.133550143152126e-07, + "loss": 0.503, + "step": 5086 + }, + { + "epoch": 1.236810114271821, + "grad_norm": 15.4375, + "learning_rate": 8.129022276172038e-07, + "loss": 0.3991, + "step": 5087 + }, + { + "epoch": 1.237053245805981, + "grad_norm": 17.0, + "learning_rate": 8.124495062510065e-07, + "loss": 0.3644, + "step": 5088 + }, + { + "epoch": 1.237296377340141, + "grad_norm": 18.375, + "learning_rate": 8.11996850284287e-07, + "loss": 0.3558, + "step": 5089 + }, + { + "epoch": 1.237539508874301, + "grad_norm": 20.875, + "learning_rate": 8.115442597847033e-07, + "loss": 0.627, + "step": 5090 + }, + { + "epoch": 1.237782640408461, + "grad_norm": 19.125, + "learning_rate": 8.11091734819902e-07, + "loss": 0.6868, + "step": 5091 + }, + { + "epoch": 1.2380257719426209, + "grad_norm": 21.25, + "learning_rate": 8.106392754575211e-07, + "loss": 0.7741, + "step": 5092 + }, + { + "epoch": 1.2382689034767809, + "grad_norm": 17.5, + "learning_rate": 8.101868817651889e-07, + "loss": 0.4642, + "step": 5093 + }, + { + "epoch": 1.2385120350109409, + "grad_norm": 20.75, + "learning_rate": 8.097345538105223e-07, + "loss": 0.6459, + "step": 5094 + }, + { + "epoch": 1.2387551665451009, + "grad_norm": 22.5, + "learning_rate": 8.092822916611307e-07, + "loss": 0.9879, + "step": 5095 + }, + { + "epoch": 1.2389982980792609, + "grad_norm": 24.0, + "learning_rate": 8.08830095384612e-07, + "loss": 0.7213, + "step": 5096 + }, + { + "epoch": 1.2392414296134209, + "grad_norm": 21.625, + "learning_rate": 8.083779650485552e-07, + "loss": 0.8667, + "step": 5097 + }, + { + "epoch": 1.2394845611475809, + "grad_norm": 23.75, + "learning_rate": 8.079259007205381e-07, + "loss": 0.9636, + "step": 5098 + }, + { + "epoch": 1.2397276926817409, + "grad_norm": 19.75, + "learning_rate": 8.074739024681302e-07, + "loss": 0.6448, + "step": 5099 + }, + { + "epoch": 1.2399708242159009, + "grad_norm": 22.5, + "learning_rate": 8.070219703588905e-07, + "loss": 0.4778, + "step": 5100 + }, + { + "epoch": 1.2402139557500607, + "grad_norm": 20.5, + "learning_rate": 8.065701044603677e-07, + "loss": 0.9189, + "step": 5101 + }, + { + "epoch": 1.2404570872842209, + "grad_norm": 24.25, + "learning_rate": 8.061183048401005e-07, + "loss": 0.7258, + "step": 5102 + }, + { + "epoch": 1.2407002188183807, + "grad_norm": 20.875, + "learning_rate": 8.056665715656193e-07, + "loss": 0.7502, + "step": 5103 + }, + { + "epoch": 1.2409433503525407, + "grad_norm": 18.25, + "learning_rate": 8.05214904704443e-07, + "loss": 0.8119, + "step": 5104 + }, + { + "epoch": 1.2411864818867007, + "grad_norm": 20.25, + "learning_rate": 8.047633043240807e-07, + "loss": 0.6244, + "step": 5105 + }, + { + "epoch": 1.2414296134208607, + "grad_norm": 15.8125, + "learning_rate": 8.043117704920321e-07, + "loss": 0.6666, + "step": 5106 + }, + { + "epoch": 1.2416727449550207, + "grad_norm": 32.5, + "learning_rate": 8.038603032757869e-07, + "loss": 0.5042, + "step": 5107 + }, + { + "epoch": 1.2419158764891807, + "grad_norm": 24.625, + "learning_rate": 8.034089027428244e-07, + "loss": 0.8816, + "step": 5108 + }, + { + "epoch": 1.2421590080233407, + "grad_norm": 16.625, + "learning_rate": 8.029575689606139e-07, + "loss": 0.4124, + "step": 5109 + }, + { + "epoch": 1.2424021395575007, + "grad_norm": 16.375, + "learning_rate": 8.025063019966153e-07, + "loss": 0.4326, + "step": 5110 + }, + { + "epoch": 1.2426452710916607, + "grad_norm": 20.25, + "learning_rate": 8.02055101918279e-07, + "loss": 0.5714, + "step": 5111 + }, + { + "epoch": 1.2428884026258205, + "grad_norm": 20.25, + "learning_rate": 8.016039687930438e-07, + "loss": 0.7711, + "step": 5112 + }, + { + "epoch": 1.2431315341599805, + "grad_norm": 19.625, + "learning_rate": 8.011529026883394e-07, + "loss": 0.9322, + "step": 5113 + }, + { + "epoch": 1.2433746656941405, + "grad_norm": 17.375, + "learning_rate": 8.007019036715858e-07, + "loss": 0.6829, + "step": 5114 + }, + { + "epoch": 1.2436177972283005, + "grad_norm": 14.8125, + "learning_rate": 8.00250971810192e-07, + "loss": 0.5632, + "step": 5115 + }, + { + "epoch": 1.2438609287624605, + "grad_norm": 20.25, + "learning_rate": 7.998001071715581e-07, + "loss": 0.762, + "step": 5116 + }, + { + "epoch": 1.2441040602966205, + "grad_norm": 19.125, + "learning_rate": 7.993493098230729e-07, + "loss": 0.8899, + "step": 5117 + }, + { + "epoch": 1.2443471918307805, + "grad_norm": 15.0625, + "learning_rate": 7.988985798321169e-07, + "loss": 0.3197, + "step": 5118 + }, + { + "epoch": 1.2445903233649405, + "grad_norm": 23.75, + "learning_rate": 7.984479172660589e-07, + "loss": 0.7413, + "step": 5119 + }, + { + "epoch": 1.2448334548991005, + "grad_norm": 39.75, + "learning_rate": 7.979973221922581e-07, + "loss": 1.0104, + "step": 5120 + }, + { + "epoch": 1.2450765864332605, + "grad_norm": 31.375, + "learning_rate": 7.975467946780641e-07, + "loss": 0.7779, + "step": 5121 + }, + { + "epoch": 1.2453197179674205, + "grad_norm": 47.0, + "learning_rate": 7.970963347908156e-07, + "loss": 1.2212, + "step": 5122 + }, + { + "epoch": 1.2455628495015802, + "grad_norm": 21.375, + "learning_rate": 7.966459425978418e-07, + "loss": 0.8814, + "step": 5123 + }, + { + "epoch": 1.2458059810357403, + "grad_norm": 19.375, + "learning_rate": 7.961956181664616e-07, + "loss": 0.3904, + "step": 5124 + }, + { + "epoch": 1.2460491125699003, + "grad_norm": 22.375, + "learning_rate": 7.957453615639835e-07, + "loss": 0.6804, + "step": 5125 + }, + { + "epoch": 1.2462922441040603, + "grad_norm": 20.625, + "learning_rate": 7.952951728577064e-07, + "loss": 0.6559, + "step": 5126 + }, + { + "epoch": 1.2465353756382203, + "grad_norm": 18.0, + "learning_rate": 7.948450521149189e-07, + "loss": 0.4762, + "step": 5127 + }, + { + "epoch": 1.2467785071723803, + "grad_norm": 22.625, + "learning_rate": 7.943949994028993e-07, + "loss": 0.7315, + "step": 5128 + }, + { + "epoch": 1.2470216387065403, + "grad_norm": 17.0, + "learning_rate": 7.939450147889155e-07, + "loss": 0.7564, + "step": 5129 + }, + { + "epoch": 1.2472647702407003, + "grad_norm": 16.625, + "learning_rate": 7.934950983402254e-07, + "loss": 0.8133, + "step": 5130 + }, + { + "epoch": 1.2475079017748603, + "grad_norm": 14.9375, + "learning_rate": 7.930452501240773e-07, + "loss": 0.3438, + "step": 5131 + }, + { + "epoch": 1.2477510333090203, + "grad_norm": 19.875, + "learning_rate": 7.925954702077082e-07, + "loss": 0.7378, + "step": 5132 + }, + { + "epoch": 1.2479941648431803, + "grad_norm": 16.625, + "learning_rate": 7.921457586583456e-07, + "loss": 0.4752, + "step": 5133 + }, + { + "epoch": 1.24823729637734, + "grad_norm": 17.5, + "learning_rate": 7.916961155432069e-07, + "loss": 0.868, + "step": 5134 + }, + { + "epoch": 1.2484804279115, + "grad_norm": 18.375, + "learning_rate": 7.912465409294992e-07, + "loss": 0.5162, + "step": 5135 + }, + { + "epoch": 1.24872355944566, + "grad_norm": 22.0, + "learning_rate": 7.907970348844186e-07, + "loss": 0.5627, + "step": 5136 + }, + { + "epoch": 1.24896669097982, + "grad_norm": 21.0, + "learning_rate": 7.903475974751519e-07, + "loss": 0.5853, + "step": 5137 + }, + { + "epoch": 1.24920982251398, + "grad_norm": 21.0, + "learning_rate": 7.898982287688756e-07, + "loss": 0.9628, + "step": 5138 + }, + { + "epoch": 1.24945295404814, + "grad_norm": 17.75, + "learning_rate": 7.894489288327548e-07, + "loss": 0.3787, + "step": 5139 + }, + { + "epoch": 1.2496960855823, + "grad_norm": 22.875, + "learning_rate": 7.889996977339456e-07, + "loss": 0.7657, + "step": 5140 + }, + { + "epoch": 1.24993921711646, + "grad_norm": 26.5, + "learning_rate": 7.885505355395931e-07, + "loss": 0.6755, + "step": 5141 + }, + { + "epoch": 1.25018234865062, + "grad_norm": 24.0, + "learning_rate": 7.881014423168331e-07, + "loss": 1.288, + "step": 5142 + }, + { + "epoch": 1.2504254801847798, + "grad_norm": 17.0, + "learning_rate": 7.876524181327895e-07, + "loss": 0.4901, + "step": 5143 + }, + { + "epoch": 1.25066861171894, + "grad_norm": 24.625, + "learning_rate": 7.87203463054577e-07, + "loss": 0.9405, + "step": 5144 + }, + { + "epoch": 1.2509117432530998, + "grad_norm": 19.5, + "learning_rate": 7.867545771493002e-07, + "loss": 0.9092, + "step": 5145 + }, + { + "epoch": 1.2511548747872598, + "grad_norm": 19.625, + "learning_rate": 7.863057604840518e-07, + "loss": 0.4593, + "step": 5146 + }, + { + "epoch": 1.2513980063214198, + "grad_norm": 23.125, + "learning_rate": 7.858570131259161e-07, + "loss": 0.8733, + "step": 5147 + }, + { + "epoch": 1.2516411378555798, + "grad_norm": 25.375, + "learning_rate": 7.854083351419652e-07, + "loss": 0.9887, + "step": 5148 + }, + { + "epoch": 1.2518842693897398, + "grad_norm": 21.625, + "learning_rate": 7.849597265992628e-07, + "loss": 0.6092, + "step": 5149 + }, + { + "epoch": 1.2521274009238998, + "grad_norm": 20.875, + "learning_rate": 7.845111875648612e-07, + "loss": 0.793, + "step": 5150 + }, + { + "epoch": 1.2523705324580598, + "grad_norm": 17.75, + "learning_rate": 7.840627181058015e-07, + "loss": 0.606, + "step": 5151 + }, + { + "epoch": 1.2526136639922199, + "grad_norm": 20.875, + "learning_rate": 7.836143182891159e-07, + "loss": 0.689, + "step": 5152 + }, + { + "epoch": 1.2528567955263799, + "grad_norm": 30.125, + "learning_rate": 7.831659881818249e-07, + "loss": 0.9238, + "step": 5153 + }, + { + "epoch": 1.2530999270605396, + "grad_norm": 18.0, + "learning_rate": 7.827177278509394e-07, + "loss": 0.785, + "step": 5154 + }, + { + "epoch": 1.2533430585946999, + "grad_norm": 24.25, + "learning_rate": 7.822695373634601e-07, + "loss": 0.6184, + "step": 5155 + }, + { + "epoch": 1.2535861901288596, + "grad_norm": 24.625, + "learning_rate": 7.818214167863755e-07, + "loss": 0.6755, + "step": 5156 + }, + { + "epoch": 1.2538293216630196, + "grad_norm": 25.75, + "learning_rate": 7.813733661866668e-07, + "loss": 0.764, + "step": 5157 + }, + { + "epoch": 1.2540724531971796, + "grad_norm": 18.625, + "learning_rate": 7.809253856313018e-07, + "loss": 0.7369, + "step": 5158 + }, + { + "epoch": 1.2543155847313396, + "grad_norm": 23.25, + "learning_rate": 7.804774751872391e-07, + "loss": 0.7766, + "step": 5159 + }, + { + "epoch": 1.2545587162654996, + "grad_norm": 14.8125, + "learning_rate": 7.800296349214267e-07, + "loss": 0.36, + "step": 5160 + }, + { + "epoch": 1.2548018477996596, + "grad_norm": 17.0, + "learning_rate": 7.795818649008017e-07, + "loss": 0.5848, + "step": 5161 + }, + { + "epoch": 1.2550449793338196, + "grad_norm": 16.375, + "learning_rate": 7.79134165192292e-07, + "loss": 0.611, + "step": 5162 + }, + { + "epoch": 1.2552881108679796, + "grad_norm": 18.75, + "learning_rate": 7.786865358628129e-07, + "loss": 0.6683, + "step": 5163 + }, + { + "epoch": 1.2555312424021396, + "grad_norm": 16.75, + "learning_rate": 7.782389769792708e-07, + "loss": 0.5806, + "step": 5164 + }, + { + "epoch": 1.2557743739362994, + "grad_norm": 22.875, + "learning_rate": 7.777914886085613e-07, + "loss": 0.5107, + "step": 5165 + }, + { + "epoch": 1.2560175054704596, + "grad_norm": 25.625, + "learning_rate": 7.773440708175694e-07, + "loss": 0.9852, + "step": 5166 + }, + { + "epoch": 1.2562606370046194, + "grad_norm": 20.625, + "learning_rate": 7.768967236731692e-07, + "loss": 0.6061, + "step": 5167 + }, + { + "epoch": 1.2565037685387794, + "grad_norm": 16.25, + "learning_rate": 7.764494472422243e-07, + "loss": 0.54, + "step": 5168 + }, + { + "epoch": 1.2567469000729394, + "grad_norm": 18.75, + "learning_rate": 7.760022415915881e-07, + "loss": 0.4746, + "step": 5169 + }, + { + "epoch": 1.2569900316070994, + "grad_norm": 17.875, + "learning_rate": 7.75555106788103e-07, + "loss": 0.9075, + "step": 5170 + }, + { + "epoch": 1.2572331631412594, + "grad_norm": 27.375, + "learning_rate": 7.75108042898601e-07, + "loss": 0.7413, + "step": 5171 + }, + { + "epoch": 1.2574762946754194, + "grad_norm": 22.125, + "learning_rate": 7.746610499899036e-07, + "loss": 0.7065, + "step": 5172 + }, + { + "epoch": 1.2577194262095794, + "grad_norm": 20.375, + "learning_rate": 7.742141281288218e-07, + "loss": 0.4949, + "step": 5173 + }, + { + "epoch": 1.2579625577437394, + "grad_norm": 21.375, + "learning_rate": 7.73767277382156e-07, + "loss": 0.6434, + "step": 5174 + }, + { + "epoch": 1.2582056892778994, + "grad_norm": 25.0, + "learning_rate": 7.73320497816695e-07, + "loss": 0.7034, + "step": 5175 + }, + { + "epoch": 1.2584488208120592, + "grad_norm": 16.125, + "learning_rate": 7.728737894992186e-07, + "loss": 0.3293, + "step": 5176 + }, + { + "epoch": 1.2586919523462192, + "grad_norm": 18.125, + "learning_rate": 7.724271524964942e-07, + "loss": 0.6795, + "step": 5177 + }, + { + "epoch": 1.2589350838803792, + "grad_norm": 20.0, + "learning_rate": 7.7198058687528e-07, + "loss": 0.588, + "step": 5178 + }, + { + "epoch": 1.2591782154145392, + "grad_norm": 22.25, + "learning_rate": 7.715340927023224e-07, + "loss": 0.7101, + "step": 5179 + }, + { + "epoch": 1.2594213469486992, + "grad_norm": 15.125, + "learning_rate": 7.710876700443581e-07, + "loss": 0.363, + "step": 5180 + }, + { + "epoch": 1.2596644784828592, + "grad_norm": 22.875, + "learning_rate": 7.706413189681132e-07, + "loss": 0.5323, + "step": 5181 + }, + { + "epoch": 1.2599076100170192, + "grad_norm": 15.5, + "learning_rate": 7.701950395403015e-07, + "loss": 0.3437, + "step": 5182 + }, + { + "epoch": 1.2601507415511792, + "grad_norm": 14.0625, + "learning_rate": 7.697488318276281e-07, + "loss": 0.36, + "step": 5183 + }, + { + "epoch": 1.2603938730853392, + "grad_norm": 18.125, + "learning_rate": 7.693026958967856e-07, + "loss": 0.4509, + "step": 5184 + }, + { + "epoch": 1.260637004619499, + "grad_norm": 16.75, + "learning_rate": 7.688566318144572e-07, + "loss": 0.5548, + "step": 5185 + }, + { + "epoch": 1.2608801361536592, + "grad_norm": 24.25, + "learning_rate": 7.684106396473151e-07, + "loss": 0.8648, + "step": 5186 + }, + { + "epoch": 1.261123267687819, + "grad_norm": 18.0, + "learning_rate": 7.679647194620194e-07, + "loss": 0.5511, + "step": 5187 + }, + { + "epoch": 1.261366399221979, + "grad_norm": 25.625, + "learning_rate": 7.675188713252222e-07, + "loss": 0.5117, + "step": 5188 + }, + { + "epoch": 1.261609530756139, + "grad_norm": 17.0, + "learning_rate": 7.670730953035623e-07, + "loss": 0.7025, + "step": 5189 + }, + { + "epoch": 1.261852662290299, + "grad_norm": 18.375, + "learning_rate": 7.666273914636685e-07, + "loss": 0.8594, + "step": 5190 + }, + { + "epoch": 1.262095793824459, + "grad_norm": 22.125, + "learning_rate": 7.661817598721596e-07, + "loss": 0.7871, + "step": 5191 + }, + { + "epoch": 1.262338925358619, + "grad_norm": 17.625, + "learning_rate": 7.657362005956423e-07, + "loss": 0.4572, + "step": 5192 + }, + { + "epoch": 1.262582056892779, + "grad_norm": 26.25, + "learning_rate": 7.652907137007135e-07, + "loss": 0.9232, + "step": 5193 + }, + { + "epoch": 1.262825188426939, + "grad_norm": 16.25, + "learning_rate": 7.648452992539584e-07, + "loss": 0.3624, + "step": 5194 + }, + { + "epoch": 1.263068319961099, + "grad_norm": 16.125, + "learning_rate": 7.643999573219521e-07, + "loss": 0.5707, + "step": 5195 + }, + { + "epoch": 1.2633114514952588, + "grad_norm": 17.125, + "learning_rate": 7.639546879712592e-07, + "loss": 0.4665, + "step": 5196 + }, + { + "epoch": 1.263554583029419, + "grad_norm": 15.875, + "learning_rate": 7.635094912684323e-07, + "loss": 0.5266, + "step": 5197 + }, + { + "epoch": 1.2637977145635788, + "grad_norm": 24.125, + "learning_rate": 7.630643672800142e-07, + "loss": 1.1576, + "step": 5198 + }, + { + "epoch": 1.2640408460977388, + "grad_norm": 13.3125, + "learning_rate": 7.626193160725357e-07, + "loss": 0.2176, + "step": 5199 + }, + { + "epoch": 1.2642839776318988, + "grad_norm": 15.25, + "learning_rate": 7.621743377125182e-07, + "loss": 0.2813, + "step": 5200 + }, + { + "epoch": 1.2645271091660588, + "grad_norm": 30.125, + "learning_rate": 7.617294322664706e-07, + "loss": 0.764, + "step": 5201 + }, + { + "epoch": 1.2647702407002188, + "grad_norm": 22.125, + "learning_rate": 7.612845998008919e-07, + "loss": 0.9649, + "step": 5202 + }, + { + "epoch": 1.2650133722343788, + "grad_norm": 24.5, + "learning_rate": 7.608398403822703e-07, + "loss": 0.8815, + "step": 5203 + }, + { + "epoch": 1.2652565037685388, + "grad_norm": 21.5, + "learning_rate": 7.603951540770828e-07, + "loss": 0.7787, + "step": 5204 + }, + { + "epoch": 1.2654996353026988, + "grad_norm": 20.25, + "learning_rate": 7.599505409517954e-07, + "loss": 0.5999, + "step": 5205 + }, + { + "epoch": 1.2657427668368588, + "grad_norm": 22.125, + "learning_rate": 7.595060010728629e-07, + "loss": 0.5027, + "step": 5206 + }, + { + "epoch": 1.2659858983710186, + "grad_norm": 19.375, + "learning_rate": 7.590615345067298e-07, + "loss": 0.6925, + "step": 5207 + }, + { + "epoch": 1.2662290299051788, + "grad_norm": 16.875, + "learning_rate": 7.586171413198293e-07, + "loss": 0.4931, + "step": 5208 + }, + { + "epoch": 1.2664721614393386, + "grad_norm": 20.125, + "learning_rate": 7.581728215785832e-07, + "loss": 1.0039, + "step": 5209 + }, + { + "epoch": 1.2667152929734986, + "grad_norm": 17.125, + "learning_rate": 7.57728575349403e-07, + "loss": 0.5779, + "step": 5210 + }, + { + "epoch": 1.2669584245076586, + "grad_norm": 22.625, + "learning_rate": 7.572844026986889e-07, + "loss": 0.9309, + "step": 5211 + }, + { + "epoch": 1.2672015560418186, + "grad_norm": 16.375, + "learning_rate": 7.56840303692831e-07, + "loss": 0.4784, + "step": 5212 + }, + { + "epoch": 1.2674446875759786, + "grad_norm": 20.5, + "learning_rate": 7.563962783982064e-07, + "loss": 0.8777, + "step": 5213 + }, + { + "epoch": 1.2676878191101386, + "grad_norm": 24.25, + "learning_rate": 7.559523268811828e-07, + "loss": 0.9291, + "step": 5214 + }, + { + "epoch": 1.2679309506442986, + "grad_norm": 21.75, + "learning_rate": 7.555084492081169e-07, + "loss": 0.6857, + "step": 5215 + }, + { + "epoch": 1.2681740821784586, + "grad_norm": 19.875, + "learning_rate": 7.550646454453529e-07, + "loss": 0.5013, + "step": 5216 + }, + { + "epoch": 1.2684172137126186, + "grad_norm": 20.0, + "learning_rate": 7.546209156592258e-07, + "loss": 0.4317, + "step": 5217 + }, + { + "epoch": 1.2686603452467784, + "grad_norm": 22.625, + "learning_rate": 7.541772599160577e-07, + "loss": 0.4605, + "step": 5218 + }, + { + "epoch": 1.2689034767809386, + "grad_norm": 17.0, + "learning_rate": 7.537336782821621e-07, + "loss": 0.7397, + "step": 5219 + }, + { + "epoch": 1.2691466083150984, + "grad_norm": 25.375, + "learning_rate": 7.532901708238387e-07, + "loss": 0.6636, + "step": 5220 + }, + { + "epoch": 1.2693897398492584, + "grad_norm": 18.75, + "learning_rate": 7.528467376073778e-07, + "loss": 0.717, + "step": 5221 + }, + { + "epoch": 1.2696328713834184, + "grad_norm": 18.75, + "learning_rate": 7.524033786990583e-07, + "loss": 0.6163, + "step": 5222 + }, + { + "epoch": 1.2698760029175784, + "grad_norm": 19.875, + "learning_rate": 7.519600941651472e-07, + "loss": 0.6147, + "step": 5223 + }, + { + "epoch": 1.2701191344517384, + "grad_norm": 21.0, + "learning_rate": 7.515168840719017e-07, + "loss": 0.608, + "step": 5224 + }, + { + "epoch": 1.2703622659858984, + "grad_norm": 19.5, + "learning_rate": 7.510737484855673e-07, + "loss": 0.9512, + "step": 5225 + }, + { + "epoch": 1.2706053975200584, + "grad_norm": 22.5, + "learning_rate": 7.506306874723774e-07, + "loss": 0.5054, + "step": 5226 + }, + { + "epoch": 1.2708485290542184, + "grad_norm": 33.0, + "learning_rate": 7.501877010985559e-07, + "loss": 0.671, + "step": 5227 + }, + { + "epoch": 1.2710916605883784, + "grad_norm": 21.375, + "learning_rate": 7.497447894303145e-07, + "loss": 0.6001, + "step": 5228 + }, + { + "epoch": 1.2713347921225382, + "grad_norm": 17.25, + "learning_rate": 7.493019525338542e-07, + "loss": 0.7246, + "step": 5229 + }, + { + "epoch": 1.2715779236566982, + "grad_norm": 21.875, + "learning_rate": 7.488591904753642e-07, + "loss": 0.5793, + "step": 5230 + }, + { + "epoch": 1.2718210551908582, + "grad_norm": 14.9375, + "learning_rate": 7.484165033210233e-07, + "loss": 0.3067, + "step": 5231 + }, + { + "epoch": 1.2720641867250182, + "grad_norm": 13.9375, + "learning_rate": 7.479738911369988e-07, + "loss": 0.3615, + "step": 5232 + }, + { + "epoch": 1.2723073182591782, + "grad_norm": 22.0, + "learning_rate": 7.47531353989446e-07, + "loss": 0.6091, + "step": 5233 + }, + { + "epoch": 1.2725504497933382, + "grad_norm": 20.125, + "learning_rate": 7.470888919445107e-07, + "loss": 0.7712, + "step": 5234 + }, + { + "epoch": 1.2727935813274982, + "grad_norm": 21.375, + "learning_rate": 7.466465050683261e-07, + "loss": 0.584, + "step": 5235 + }, + { + "epoch": 1.2730367128616582, + "grad_norm": 18.0, + "learning_rate": 7.462041934270145e-07, + "loss": 0.6518, + "step": 5236 + }, + { + "epoch": 1.2732798443958182, + "grad_norm": 31.375, + "learning_rate": 7.457619570866869e-07, + "loss": 0.9055, + "step": 5237 + }, + { + "epoch": 1.273522975929978, + "grad_norm": 18.125, + "learning_rate": 7.453197961134436e-07, + "loss": 0.5466, + "step": 5238 + }, + { + "epoch": 1.2737661074641382, + "grad_norm": 17.125, + "learning_rate": 7.448777105733729e-07, + "loss": 0.7802, + "step": 5239 + }, + { + "epoch": 1.274009238998298, + "grad_norm": 17.375, + "learning_rate": 7.444357005325519e-07, + "loss": 0.7875, + "step": 5240 + }, + { + "epoch": 1.274252370532458, + "grad_norm": 20.125, + "learning_rate": 7.439937660570464e-07, + "loss": 0.7861, + "step": 5241 + }, + { + "epoch": 1.274495502066618, + "grad_norm": 15.9375, + "learning_rate": 7.435519072129121e-07, + "loss": 0.3423, + "step": 5242 + }, + { + "epoch": 1.274738633600778, + "grad_norm": 18.0, + "learning_rate": 7.431101240661923e-07, + "loss": 0.5573, + "step": 5243 + }, + { + "epoch": 1.274981765134938, + "grad_norm": 19.125, + "learning_rate": 7.426684166829182e-07, + "loss": 0.6575, + "step": 5244 + }, + { + "epoch": 1.275224896669098, + "grad_norm": 25.125, + "learning_rate": 7.422267851291113e-07, + "loss": 1.0183, + "step": 5245 + }, + { + "epoch": 1.275468028203258, + "grad_norm": 22.375, + "learning_rate": 7.417852294707812e-07, + "loss": 0.4543, + "step": 5246 + }, + { + "epoch": 1.275711159737418, + "grad_norm": 18.25, + "learning_rate": 7.413437497739254e-07, + "loss": 0.5249, + "step": 5247 + }, + { + "epoch": 1.275954291271578, + "grad_norm": 20.375, + "learning_rate": 7.409023461045309e-07, + "loss": 0.6888, + "step": 5248 + }, + { + "epoch": 1.2761974228057378, + "grad_norm": 16.625, + "learning_rate": 7.404610185285729e-07, + "loss": 0.621, + "step": 5249 + }, + { + "epoch": 1.276440554339898, + "grad_norm": 14.625, + "learning_rate": 7.400197671120165e-07, + "loss": 0.5498, + "step": 5250 + }, + { + "epoch": 1.2766836858740578, + "grad_norm": 17.5, + "learning_rate": 7.395785919208131e-07, + "loss": 0.4632, + "step": 5251 + }, + { + "epoch": 1.2769268174082178, + "grad_norm": 17.25, + "learning_rate": 7.391374930209046e-07, + "loss": 0.6256, + "step": 5252 + }, + { + "epoch": 1.2771699489423778, + "grad_norm": 17.875, + "learning_rate": 7.386964704782208e-07, + "loss": 0.4981, + "step": 5253 + }, + { + "epoch": 1.2774130804765378, + "grad_norm": 19.875, + "learning_rate": 7.382555243586797e-07, + "loss": 0.8559, + "step": 5254 + }, + { + "epoch": 1.2776562120106978, + "grad_norm": 35.5, + "learning_rate": 7.378146547281888e-07, + "loss": 1.1875, + "step": 5255 + }, + { + "epoch": 1.2778993435448578, + "grad_norm": 18.75, + "learning_rate": 7.373738616526431e-07, + "loss": 0.7175, + "step": 5256 + }, + { + "epoch": 1.2781424750790178, + "grad_norm": 23.375, + "learning_rate": 7.369331451979278e-07, + "loss": 0.5727, + "step": 5257 + }, + { + "epoch": 1.2783856066131778, + "grad_norm": 18.0, + "learning_rate": 7.364925054299143e-07, + "loss": 0.654, + "step": 5258 + }, + { + "epoch": 1.2786287381473378, + "grad_norm": 17.375, + "learning_rate": 7.36051942414465e-07, + "loss": 0.3481, + "step": 5259 + }, + { + "epoch": 1.2788718696814976, + "grad_norm": 21.25, + "learning_rate": 7.356114562174287e-07, + "loss": 0.6114, + "step": 5260 + }, + { + "epoch": 1.2791150012156578, + "grad_norm": 32.5, + "learning_rate": 7.351710469046444e-07, + "loss": 0.7523, + "step": 5261 + }, + { + "epoch": 1.2793581327498176, + "grad_norm": 18.375, + "learning_rate": 7.347307145419383e-07, + "loss": 0.5799, + "step": 5262 + }, + { + "epoch": 1.2796012642839776, + "grad_norm": 22.75, + "learning_rate": 7.342904591951261e-07, + "loss": 0.8784, + "step": 5263 + }, + { + "epoch": 1.2798443958181376, + "grad_norm": 16.125, + "learning_rate": 7.338502809300111e-07, + "loss": 0.496, + "step": 5264 + }, + { + "epoch": 1.2800875273522976, + "grad_norm": 20.125, + "learning_rate": 7.334101798123858e-07, + "loss": 0.4859, + "step": 5265 + }, + { + "epoch": 1.2803306588864576, + "grad_norm": 19.125, + "learning_rate": 7.32970155908031e-07, + "loss": 0.6878, + "step": 5266 + }, + { + "epoch": 1.2805737904206176, + "grad_norm": 19.25, + "learning_rate": 7.325302092827162e-07, + "loss": 0.8037, + "step": 5267 + }, + { + "epoch": 1.2808169219547776, + "grad_norm": 15.6875, + "learning_rate": 7.320903400021983e-07, + "loss": 0.5192, + "step": 5268 + }, + { + "epoch": 1.2810600534889376, + "grad_norm": 15.9375, + "learning_rate": 7.316505481322237e-07, + "loss": 0.4567, + "step": 5269 + }, + { + "epoch": 1.2813031850230976, + "grad_norm": 18.125, + "learning_rate": 7.312108337385272e-07, + "loss": 0.4329, + "step": 5270 + }, + { + "epoch": 1.2815463165572574, + "grad_norm": 27.25, + "learning_rate": 7.307711968868311e-07, + "loss": 0.7742, + "step": 5271 + }, + { + "epoch": 1.2817894480914176, + "grad_norm": 21.375, + "learning_rate": 7.303316376428468e-07, + "loss": 0.5598, + "step": 5272 + }, + { + "epoch": 1.2820325796255774, + "grad_norm": 23.375, + "learning_rate": 7.298921560722744e-07, + "loss": 0.7967, + "step": 5273 + }, + { + "epoch": 1.2822757111597374, + "grad_norm": 17.625, + "learning_rate": 7.294527522408019e-07, + "loss": 0.394, + "step": 5274 + }, + { + "epoch": 1.2825188426938974, + "grad_norm": 29.875, + "learning_rate": 7.29013426214106e-07, + "loss": 0.9392, + "step": 5275 + }, + { + "epoch": 1.2827619742280574, + "grad_norm": 24.25, + "learning_rate": 7.285741780578513e-07, + "loss": 0.7566, + "step": 5276 + }, + { + "epoch": 1.2830051057622174, + "grad_norm": 15.625, + "learning_rate": 7.281350078376906e-07, + "loss": 0.3234, + "step": 5277 + }, + { + "epoch": 1.2832482372963774, + "grad_norm": 21.625, + "learning_rate": 7.276959156192664e-07, + "loss": 0.8778, + "step": 5278 + }, + { + "epoch": 1.2834913688305374, + "grad_norm": 20.25, + "learning_rate": 7.272569014682079e-07, + "loss": 0.8403, + "step": 5279 + }, + { + "epoch": 1.2837345003646974, + "grad_norm": 15.8125, + "learning_rate": 7.268179654501332e-07, + "loss": 0.4489, + "step": 5280 + }, + { + "epoch": 1.2839776318988574, + "grad_norm": 26.125, + "learning_rate": 7.263791076306492e-07, + "loss": 0.4786, + "step": 5281 + }, + { + "epoch": 1.2842207634330172, + "grad_norm": 30.625, + "learning_rate": 7.259403280753513e-07, + "loss": 1.072, + "step": 5282 + }, + { + "epoch": 1.2844638949671772, + "grad_norm": 19.125, + "learning_rate": 7.255016268498223e-07, + "loss": 0.6029, + "step": 5283 + }, + { + "epoch": 1.2847070265013372, + "grad_norm": 18.75, + "learning_rate": 7.25063004019633e-07, + "loss": 0.8734, + "step": 5284 + }, + { + "epoch": 1.2849501580354972, + "grad_norm": 16.5, + "learning_rate": 7.246244596503442e-07, + "loss": 0.6562, + "step": 5285 + }, + { + "epoch": 1.2851932895696572, + "grad_norm": 17.125, + "learning_rate": 7.241859938075035e-07, + "loss": 0.614, + "step": 5286 + }, + { + "epoch": 1.2854364211038172, + "grad_norm": 18.75, + "learning_rate": 7.237476065566466e-07, + "loss": 0.8221, + "step": 5287 + }, + { + "epoch": 1.2856795526379772, + "grad_norm": 22.625, + "learning_rate": 7.233092979632986e-07, + "loss": 0.7513, + "step": 5288 + }, + { + "epoch": 1.2859226841721372, + "grad_norm": 20.25, + "learning_rate": 7.22871068092973e-07, + "loss": 0.7276, + "step": 5289 + }, + { + "epoch": 1.2861658157062972, + "grad_norm": 19.5, + "learning_rate": 7.224329170111698e-07, + "loss": 0.4139, + "step": 5290 + }, + { + "epoch": 1.286408947240457, + "grad_norm": 22.25, + "learning_rate": 7.219948447833782e-07, + "loss": 0.5554, + "step": 5291 + }, + { + "epoch": 1.2866520787746172, + "grad_norm": 17.5, + "learning_rate": 7.215568514750765e-07, + "loss": 0.7128, + "step": 5292 + }, + { + "epoch": 1.286895210308777, + "grad_norm": 24.0, + "learning_rate": 7.211189371517297e-07, + "loss": 0.7536, + "step": 5293 + }, + { + "epoch": 1.287138341842937, + "grad_norm": 20.375, + "learning_rate": 7.206811018787915e-07, + "loss": 0.7304, + "step": 5294 + }, + { + "epoch": 1.287381473377097, + "grad_norm": 32.0, + "learning_rate": 7.202433457217043e-07, + "loss": 0.754, + "step": 5295 + }, + { + "epoch": 1.287624604911257, + "grad_norm": 16.75, + "learning_rate": 7.198056687458988e-07, + "loss": 0.5729, + "step": 5296 + }, + { + "epoch": 1.287867736445417, + "grad_norm": 17.25, + "learning_rate": 7.193680710167927e-07, + "loss": 0.3206, + "step": 5297 + }, + { + "epoch": 1.288110867979577, + "grad_norm": 17.25, + "learning_rate": 7.189305525997925e-07, + "loss": 0.734, + "step": 5298 + }, + { + "epoch": 1.288353999513737, + "grad_norm": 16.0, + "learning_rate": 7.184931135602933e-07, + "loss": 0.4035, + "step": 5299 + }, + { + "epoch": 1.288597131047897, + "grad_norm": 18.5, + "learning_rate": 7.180557539636779e-07, + "loss": 0.7211, + "step": 5300 + }, + { + "epoch": 1.288840262582057, + "grad_norm": 19.0, + "learning_rate": 7.176184738753166e-07, + "loss": 0.804, + "step": 5301 + }, + { + "epoch": 1.2890833941162168, + "grad_norm": 19.0, + "learning_rate": 7.171812733605694e-07, + "loss": 0.7541, + "step": 5302 + }, + { + "epoch": 1.289326525650377, + "grad_norm": 22.875, + "learning_rate": 7.167441524847824e-07, + "loss": 1.01, + "step": 5303 + }, + { + "epoch": 1.2895696571845368, + "grad_norm": 19.25, + "learning_rate": 7.163071113132919e-07, + "loss": 0.746, + "step": 5304 + }, + { + "epoch": 1.2898127887186968, + "grad_norm": 26.25, + "learning_rate": 7.158701499114205e-07, + "loss": 0.9137, + "step": 5305 + }, + { + "epoch": 1.2900559202528568, + "grad_norm": 19.375, + "learning_rate": 7.154332683444803e-07, + "loss": 0.8958, + "step": 5306 + }, + { + "epoch": 1.2902990517870168, + "grad_norm": 19.5, + "learning_rate": 7.149964666777706e-07, + "loss": 0.6978, + "step": 5307 + }, + { + "epoch": 1.2905421833211768, + "grad_norm": 20.25, + "learning_rate": 7.145597449765782e-07, + "loss": 0.4383, + "step": 5308 + }, + { + "epoch": 1.2907853148553368, + "grad_norm": 18.375, + "learning_rate": 7.141231033061797e-07, + "loss": 0.6132, + "step": 5309 + }, + { + "epoch": 1.2910284463894968, + "grad_norm": 37.0, + "learning_rate": 7.136865417318384e-07, + "loss": 0.7585, + "step": 5310 + }, + { + "epoch": 1.2912715779236568, + "grad_norm": 33.0, + "learning_rate": 7.132500603188054e-07, + "loss": 0.7202, + "step": 5311 + }, + { + "epoch": 1.2915147094578168, + "grad_norm": 22.75, + "learning_rate": 7.128136591323209e-07, + "loss": 1.0635, + "step": 5312 + }, + { + "epoch": 1.2917578409919765, + "grad_norm": 29.25, + "learning_rate": 7.12377338237613e-07, + "loss": 0.6653, + "step": 5313 + }, + { + "epoch": 1.2920009725261368, + "grad_norm": 20.375, + "learning_rate": 7.11941097699897e-07, + "loss": 0.7537, + "step": 5314 + }, + { + "epoch": 1.2922441040602966, + "grad_norm": 23.0, + "learning_rate": 7.115049375843761e-07, + "loss": 1.0126, + "step": 5315 + }, + { + "epoch": 1.2924872355944566, + "grad_norm": 21.875, + "learning_rate": 7.110688579562428e-07, + "loss": 0.7576, + "step": 5316 + }, + { + "epoch": 1.2927303671286166, + "grad_norm": 17.625, + "learning_rate": 7.106328588806762e-07, + "loss": 0.5833, + "step": 5317 + }, + { + "epoch": 1.2929734986627766, + "grad_norm": 18.25, + "learning_rate": 7.101969404228439e-07, + "loss": 0.7418, + "step": 5318 + }, + { + "epoch": 1.2932166301969366, + "grad_norm": 22.5, + "learning_rate": 7.097611026479015e-07, + "loss": 0.581, + "step": 5319 + }, + { + "epoch": 1.2934597617310966, + "grad_norm": 22.75, + "learning_rate": 7.093253456209928e-07, + "loss": 0.8539, + "step": 5320 + }, + { + "epoch": 1.2937028932652566, + "grad_norm": 23.875, + "learning_rate": 7.088896694072492e-07, + "loss": 0.7946, + "step": 5321 + }, + { + "epoch": 1.2939460247994166, + "grad_norm": 19.75, + "learning_rate": 7.084540740717892e-07, + "loss": 0.9326, + "step": 5322 + }, + { + "epoch": 1.2941891563335766, + "grad_norm": 32.25, + "learning_rate": 7.080185596797212e-07, + "loss": 0.7788, + "step": 5323 + }, + { + "epoch": 1.2944322878677363, + "grad_norm": 34.0, + "learning_rate": 7.075831262961399e-07, + "loss": 1.151, + "step": 5324 + }, + { + "epoch": 1.2946754194018963, + "grad_norm": 17.25, + "learning_rate": 7.071477739861276e-07, + "loss": 0.437, + "step": 5325 + }, + { + "epoch": 1.2949185509360563, + "grad_norm": 16.375, + "learning_rate": 7.067125028147559e-07, + "loss": 0.7148, + "step": 5326 + }, + { + "epoch": 1.2951616824702163, + "grad_norm": 20.5, + "learning_rate": 7.06277312847084e-07, + "loss": 0.7783, + "step": 5327 + }, + { + "epoch": 1.2954048140043763, + "grad_norm": 15.25, + "learning_rate": 7.058422041481581e-07, + "loss": 0.3319, + "step": 5328 + }, + { + "epoch": 1.2956479455385363, + "grad_norm": 25.5, + "learning_rate": 7.054071767830123e-07, + "loss": 0.7161, + "step": 5329 + }, + { + "epoch": 1.2958910770726964, + "grad_norm": 22.875, + "learning_rate": 7.049722308166699e-07, + "loss": 0.6475, + "step": 5330 + }, + { + "epoch": 1.2961342086068564, + "grad_norm": 20.375, + "learning_rate": 7.045373663141406e-07, + "loss": 0.9005, + "step": 5331 + }, + { + "epoch": 1.2963773401410164, + "grad_norm": 21.75, + "learning_rate": 7.041025833404216e-07, + "loss": 0.9873, + "step": 5332 + }, + { + "epoch": 1.2966204716751761, + "grad_norm": 17.25, + "learning_rate": 7.036678819605003e-07, + "loss": 0.6963, + "step": 5333 + }, + { + "epoch": 1.2968636032093364, + "grad_norm": 21.875, + "learning_rate": 7.032332622393488e-07, + "loss": 0.8343, + "step": 5334 + }, + { + "epoch": 1.2971067347434961, + "grad_norm": 26.125, + "learning_rate": 7.027987242419299e-07, + "loss": 1.0523, + "step": 5335 + }, + { + "epoch": 1.2973498662776561, + "grad_norm": 16.75, + "learning_rate": 7.023642680331915e-07, + "loss": 0.5916, + "step": 5336 + }, + { + "epoch": 1.2975929978118161, + "grad_norm": 16.875, + "learning_rate": 7.019298936780719e-07, + "loss": 0.6827, + "step": 5337 + }, + { + "epoch": 1.2978361293459761, + "grad_norm": 23.625, + "learning_rate": 7.01495601241495e-07, + "loss": 0.9354, + "step": 5338 + }, + { + "epoch": 1.2980792608801361, + "grad_norm": 24.75, + "learning_rate": 7.010613907883731e-07, + "loss": 0.7278, + "step": 5339 + }, + { + "epoch": 1.2983223924142961, + "grad_norm": 17.625, + "learning_rate": 7.006272623836075e-07, + "loss": 0.5342, + "step": 5340 + }, + { + "epoch": 1.2985655239484561, + "grad_norm": 16.875, + "learning_rate": 7.001932160920849e-07, + "loss": 0.8626, + "step": 5341 + }, + { + "epoch": 1.2988086554826161, + "grad_norm": 20.25, + "learning_rate": 6.99759251978682e-07, + "loss": 0.6627, + "step": 5342 + }, + { + "epoch": 1.2990517870167761, + "grad_norm": 24.5, + "learning_rate": 6.993253701082617e-07, + "loss": 0.8373, + "step": 5343 + }, + { + "epoch": 1.299294918550936, + "grad_norm": 24.0, + "learning_rate": 6.988915705456756e-07, + "loss": 0.9368, + "step": 5344 + }, + { + "epoch": 1.2995380500850962, + "grad_norm": 20.125, + "learning_rate": 6.984578533557624e-07, + "loss": 0.7427, + "step": 5345 + }, + { + "epoch": 1.299781181619256, + "grad_norm": 16.0, + "learning_rate": 6.980242186033477e-07, + "loss": 0.589, + "step": 5346 + }, + { + "epoch": 1.300024313153416, + "grad_norm": 11.6875, + "learning_rate": 6.975906663532473e-07, + "loss": 0.2786, + "step": 5347 + }, + { + "epoch": 1.300267444687576, + "grad_norm": 18.625, + "learning_rate": 6.97157196670262e-07, + "loss": 0.846, + "step": 5348 + }, + { + "epoch": 1.300510576221736, + "grad_norm": 13.875, + "learning_rate": 6.967238096191806e-07, + "loss": 0.249, + "step": 5349 + }, + { + "epoch": 1.300753707755896, + "grad_norm": 25.5, + "learning_rate": 6.962905052647824e-07, + "loss": 0.8746, + "step": 5350 + }, + { + "epoch": 1.300996839290056, + "grad_norm": 15.125, + "learning_rate": 6.958572836718311e-07, + "loss": 0.5124, + "step": 5351 + }, + { + "epoch": 1.301239970824216, + "grad_norm": 24.75, + "learning_rate": 6.95424144905079e-07, + "loss": 0.4673, + "step": 5352 + }, + { + "epoch": 1.301483102358376, + "grad_norm": 26.125, + "learning_rate": 6.949910890292657e-07, + "loss": 0.6952, + "step": 5353 + }, + { + "epoch": 1.301726233892536, + "grad_norm": 21.25, + "learning_rate": 6.945581161091203e-07, + "loss": 0.7587, + "step": 5354 + }, + { + "epoch": 1.3019693654266957, + "grad_norm": 17.625, + "learning_rate": 6.94125226209357e-07, + "loss": 0.4456, + "step": 5355 + }, + { + "epoch": 1.302212496960856, + "grad_norm": 21.25, + "learning_rate": 6.936924193946787e-07, + "loss": 0.7767, + "step": 5356 + }, + { + "epoch": 1.3024556284950157, + "grad_norm": 22.75, + "learning_rate": 6.932596957297757e-07, + "loss": 0.8849, + "step": 5357 + }, + { + "epoch": 1.3026987600291757, + "grad_norm": 22.0, + "learning_rate": 6.928270552793273e-07, + "loss": 0.7143, + "step": 5358 + }, + { + "epoch": 1.3029418915633357, + "grad_norm": 18.375, + "learning_rate": 6.923944981079981e-07, + "loss": 0.4373, + "step": 5359 + }, + { + "epoch": 1.3031850230974957, + "grad_norm": 23.375, + "learning_rate": 6.919620242804406e-07, + "loss": 0.7099, + "step": 5360 + }, + { + "epoch": 1.3034281546316557, + "grad_norm": 27.5, + "learning_rate": 6.91529633861297e-07, + "loss": 1.33, + "step": 5361 + }, + { + "epoch": 1.3036712861658157, + "grad_norm": 26.25, + "learning_rate": 6.910973269151946e-07, + "loss": 0.7442, + "step": 5362 + }, + { + "epoch": 1.3039144176999757, + "grad_norm": 19.0, + "learning_rate": 6.906651035067488e-07, + "loss": 0.8893, + "step": 5363 + }, + { + "epoch": 1.3041575492341357, + "grad_norm": 17.25, + "learning_rate": 6.902329637005639e-07, + "loss": 0.651, + "step": 5364 + }, + { + "epoch": 1.3044006807682957, + "grad_norm": 20.375, + "learning_rate": 6.898009075612294e-07, + "loss": 0.7097, + "step": 5365 + }, + { + "epoch": 1.3046438123024555, + "grad_norm": 19.25, + "learning_rate": 6.893689351533245e-07, + "loss": 0.688, + "step": 5366 + }, + { + "epoch": 1.3048869438366157, + "grad_norm": 18.625, + "learning_rate": 6.88937046541415e-07, + "loss": 0.9324, + "step": 5367 + }, + { + "epoch": 1.3051300753707755, + "grad_norm": 15.1875, + "learning_rate": 6.885052417900537e-07, + "loss": 0.502, + "step": 5368 + }, + { + "epoch": 1.3053732069049355, + "grad_norm": 18.625, + "learning_rate": 6.880735209637815e-07, + "loss": 0.7486, + "step": 5369 + }, + { + "epoch": 1.3056163384390955, + "grad_norm": 21.875, + "learning_rate": 6.87641884127126e-07, + "loss": 0.6823, + "step": 5370 + }, + { + "epoch": 1.3058594699732555, + "grad_norm": 14.6875, + "learning_rate": 6.872103313446036e-07, + "loss": 0.3315, + "step": 5371 + }, + { + "epoch": 1.3061026015074155, + "grad_norm": 17.5, + "learning_rate": 6.867788626807162e-07, + "loss": 0.7381, + "step": 5372 + }, + { + "epoch": 1.3063457330415755, + "grad_norm": 20.875, + "learning_rate": 6.863474781999552e-07, + "loss": 0.4472, + "step": 5373 + }, + { + "epoch": 1.3065888645757355, + "grad_norm": 24.875, + "learning_rate": 6.859161779667984e-07, + "loss": 1.0121, + "step": 5374 + }, + { + "epoch": 1.3068319961098955, + "grad_norm": 21.125, + "learning_rate": 6.85484962045711e-07, + "loss": 0.8792, + "step": 5375 + }, + { + "epoch": 1.3070751276440555, + "grad_norm": 21.75, + "learning_rate": 6.850538305011457e-07, + "loss": 0.596, + "step": 5376 + }, + { + "epoch": 1.3073182591782153, + "grad_norm": 24.875, + "learning_rate": 6.846227833975417e-07, + "loss": 0.6542, + "step": 5377 + }, + { + "epoch": 1.3075613907123753, + "grad_norm": 18.0, + "learning_rate": 6.841918207993279e-07, + "loss": 0.77, + "step": 5378 + }, + { + "epoch": 1.3078045222465353, + "grad_norm": 22.375, + "learning_rate": 6.837609427709179e-07, + "loss": 0.7198, + "step": 5379 + }, + { + "epoch": 1.3080476537806953, + "grad_norm": 24.75, + "learning_rate": 6.833301493767136e-07, + "loss": 0.5151, + "step": 5380 + }, + { + "epoch": 1.3082907853148553, + "grad_norm": 19.375, + "learning_rate": 6.828994406811062e-07, + "loss": 0.6333, + "step": 5381 + }, + { + "epoch": 1.3085339168490153, + "grad_norm": 19.375, + "learning_rate": 6.824688167484713e-07, + "loss": 0.8866, + "step": 5382 + }, + { + "epoch": 1.3087770483831753, + "grad_norm": 18.75, + "learning_rate": 6.820382776431732e-07, + "loss": 0.5126, + "step": 5383 + }, + { + "epoch": 1.3090201799173353, + "grad_norm": 18.875, + "learning_rate": 6.816078234295638e-07, + "loss": 0.6611, + "step": 5384 + }, + { + "epoch": 1.3092633114514953, + "grad_norm": 24.875, + "learning_rate": 6.811774541719818e-07, + "loss": 0.7533, + "step": 5385 + }, + { + "epoch": 1.309506442985655, + "grad_norm": 21.375, + "learning_rate": 6.807471699347531e-07, + "loss": 0.9855, + "step": 5386 + }, + { + "epoch": 1.3097495745198153, + "grad_norm": 17.375, + "learning_rate": 6.803169707821907e-07, + "loss": 0.6446, + "step": 5387 + }, + { + "epoch": 1.309992706053975, + "grad_norm": 17.625, + "learning_rate": 6.798868567785958e-07, + "loss": 0.4483, + "step": 5388 + }, + { + "epoch": 1.310235837588135, + "grad_norm": 21.125, + "learning_rate": 6.794568279882567e-07, + "loss": 0.4895, + "step": 5389 + }, + { + "epoch": 1.3104789691222951, + "grad_norm": 20.125, + "learning_rate": 6.79026884475448e-07, + "loss": 0.8769, + "step": 5390 + }, + { + "epoch": 1.3107221006564551, + "grad_norm": 20.0, + "learning_rate": 6.785970263044328e-07, + "loss": 0.6661, + "step": 5391 + }, + { + "epoch": 1.3109652321906151, + "grad_norm": 19.375, + "learning_rate": 6.781672535394605e-07, + "loss": 0.5786, + "step": 5392 + }, + { + "epoch": 1.3112083637247751, + "grad_norm": 21.25, + "learning_rate": 6.777375662447682e-07, + "loss": 0.5528, + "step": 5393 + }, + { + "epoch": 1.3114514952589351, + "grad_norm": 21.5, + "learning_rate": 6.773079644845792e-07, + "loss": 0.3921, + "step": 5394 + }, + { + "epoch": 1.3116946267930951, + "grad_norm": 22.25, + "learning_rate": 6.768784483231064e-07, + "loss": 0.9042, + "step": 5395 + }, + { + "epoch": 1.3119377583272551, + "grad_norm": 18.5, + "learning_rate": 6.76449017824547e-07, + "loss": 0.9287, + "step": 5396 + }, + { + "epoch": 1.312180889861415, + "grad_norm": 21.875, + "learning_rate": 6.760196730530874e-07, + "loss": 0.8828, + "step": 5397 + }, + { + "epoch": 1.3124240213955751, + "grad_norm": 17.25, + "learning_rate": 6.755904140729014e-07, + "loss": 0.4136, + "step": 5398 + }, + { + "epoch": 1.312667152929735, + "grad_norm": 19.875, + "learning_rate": 6.751612409481482e-07, + "loss": 0.7068, + "step": 5399 + }, + { + "epoch": 1.312910284463895, + "grad_norm": 25.125, + "learning_rate": 6.747321537429749e-07, + "loss": 0.9837, + "step": 5400 + }, + { + "epoch": 1.313153415998055, + "grad_norm": 21.25, + "learning_rate": 6.743031525215169e-07, + "loss": 0.4716, + "step": 5401 + }, + { + "epoch": 1.313396547532215, + "grad_norm": 20.5, + "learning_rate": 6.738742373478954e-07, + "loss": 0.6532, + "step": 5402 + }, + { + "epoch": 1.313639679066375, + "grad_norm": 29.0, + "learning_rate": 6.734454082862188e-07, + "loss": 1.0469, + "step": 5403 + }, + { + "epoch": 1.313882810600535, + "grad_norm": 34.75, + "learning_rate": 6.730166654005832e-07, + "loss": 0.7992, + "step": 5404 + }, + { + "epoch": 1.314125942134695, + "grad_norm": 34.75, + "learning_rate": 6.725880087550723e-07, + "loss": 1.3177, + "step": 5405 + }, + { + "epoch": 1.314369073668855, + "grad_norm": 17.75, + "learning_rate": 6.721594384137558e-07, + "loss": 0.6191, + "step": 5406 + }, + { + "epoch": 1.314612205203015, + "grad_norm": 17.875, + "learning_rate": 6.717309544406905e-07, + "loss": 0.8107, + "step": 5407 + }, + { + "epoch": 1.3148553367371747, + "grad_norm": 23.25, + "learning_rate": 6.713025568999216e-07, + "loss": 0.5281, + "step": 5408 + }, + { + "epoch": 1.315098468271335, + "grad_norm": 23.875, + "learning_rate": 6.708742458554798e-07, + "loss": 0.5955, + "step": 5409 + }, + { + "epoch": 1.3153415998054947, + "grad_norm": 19.5, + "learning_rate": 6.70446021371384e-07, + "loss": 0.5887, + "step": 5410 + }, + { + "epoch": 1.3155847313396547, + "grad_norm": 17.625, + "learning_rate": 6.700178835116383e-07, + "loss": 0.4819, + "step": 5411 + }, + { + "epoch": 1.3158278628738147, + "grad_norm": 20.0, + "learning_rate": 6.695898323402378e-07, + "loss": 0.6117, + "step": 5412 + }, + { + "epoch": 1.3160709944079747, + "grad_norm": 21.125, + "learning_rate": 6.691618679211609e-07, + "loss": 0.5834, + "step": 5413 + }, + { + "epoch": 1.3163141259421347, + "grad_norm": 16.75, + "learning_rate": 6.687339903183735e-07, + "loss": 0.4409, + "step": 5414 + }, + { + "epoch": 1.3165572574762947, + "grad_norm": 30.25, + "learning_rate": 6.683061995958308e-07, + "loss": 1.1829, + "step": 5415 + }, + { + "epoch": 1.3168003890104547, + "grad_norm": 32.5, + "learning_rate": 6.678784958174728e-07, + "loss": 0.8507, + "step": 5416 + }, + { + "epoch": 1.3170435205446147, + "grad_norm": 18.375, + "learning_rate": 6.674508790472264e-07, + "loss": 1.0468, + "step": 5417 + }, + { + "epoch": 1.3172866520787747, + "grad_norm": 21.375, + "learning_rate": 6.670233493490079e-07, + "loss": 0.6007, + "step": 5418 + }, + { + "epoch": 1.3175297836129345, + "grad_norm": 25.625, + "learning_rate": 6.665959067867175e-07, + "loss": 0.8114, + "step": 5419 + }, + { + "epoch": 1.3177729151470947, + "grad_norm": 22.125, + "learning_rate": 6.661685514242453e-07, + "loss": 0.5541, + "step": 5420 + }, + { + "epoch": 1.3180160466812545, + "grad_norm": 26.125, + "learning_rate": 6.657412833254655e-07, + "loss": 0.879, + "step": 5421 + }, + { + "epoch": 1.3182591782154145, + "grad_norm": 43.0, + "learning_rate": 6.65314102554242e-07, + "loss": 0.5993, + "step": 5422 + }, + { + "epoch": 1.3185023097495745, + "grad_norm": 27.0, + "learning_rate": 6.648870091744236e-07, + "loss": 0.5893, + "step": 5423 + }, + { + "epoch": 1.3187454412837345, + "grad_norm": 16.75, + "learning_rate": 6.644600032498466e-07, + "loss": 0.3996, + "step": 5424 + }, + { + "epoch": 1.3189885728178945, + "grad_norm": 18.25, + "learning_rate": 6.640330848443352e-07, + "loss": 0.6858, + "step": 5425 + }, + { + "epoch": 1.3192317043520545, + "grad_norm": 14.625, + "learning_rate": 6.636062540216987e-07, + "loss": 0.2918, + "step": 5426 + }, + { + "epoch": 1.3194748358862145, + "grad_norm": 25.375, + "learning_rate": 6.631795108457354e-07, + "loss": 0.9733, + "step": 5427 + }, + { + "epoch": 1.3197179674203745, + "grad_norm": 15.0, + "learning_rate": 6.627528553802285e-07, + "loss": 0.4875, + "step": 5428 + }, + { + "epoch": 1.3199610989545345, + "grad_norm": 16.75, + "learning_rate": 6.6232628768895e-07, + "loss": 0.429, + "step": 5429 + }, + { + "epoch": 1.3202042304886943, + "grad_norm": 16.5, + "learning_rate": 6.618998078356573e-07, + "loss": 0.5399, + "step": 5430 + }, + { + "epoch": 1.3204473620228543, + "grad_norm": 23.5, + "learning_rate": 6.614734158840946e-07, + "loss": 0.6783, + "step": 5431 + }, + { + "epoch": 1.3206904935570143, + "grad_norm": 19.0, + "learning_rate": 6.610471118979946e-07, + "loss": 0.7945, + "step": 5432 + }, + { + "epoch": 1.3209336250911743, + "grad_norm": 31.75, + "learning_rate": 6.606208959410754e-07, + "loss": 1.2546, + "step": 5433 + }, + { + "epoch": 1.3211767566253343, + "grad_norm": 24.125, + "learning_rate": 6.601947680770415e-07, + "loss": 0.8433, + "step": 5434 + }, + { + "epoch": 1.3214198881594943, + "grad_norm": 26.875, + "learning_rate": 6.59768728369586e-07, + "loss": 0.6929, + "step": 5435 + }, + { + "epoch": 1.3216630196936543, + "grad_norm": 17.25, + "learning_rate": 6.593427768823881e-07, + "loss": 0.4186, + "step": 5436 + }, + { + "epoch": 1.3219061512278143, + "grad_norm": 14.0625, + "learning_rate": 6.589169136791132e-07, + "loss": 0.5019, + "step": 5437 + }, + { + "epoch": 1.3221492827619743, + "grad_norm": 18.0, + "learning_rate": 6.584911388234135e-07, + "loss": 0.7286, + "step": 5438 + }, + { + "epoch": 1.322392414296134, + "grad_norm": 29.375, + "learning_rate": 6.580654523789292e-07, + "loss": 1.0972, + "step": 5439 + }, + { + "epoch": 1.3226355458302943, + "grad_norm": 20.25, + "learning_rate": 6.576398544092861e-07, + "loss": 0.8966, + "step": 5440 + }, + { + "epoch": 1.322878677364454, + "grad_norm": 29.125, + "learning_rate": 6.572143449780969e-07, + "loss": 0.7435, + "step": 5441 + }, + { + "epoch": 1.323121808898614, + "grad_norm": 20.0, + "learning_rate": 6.567889241489614e-07, + "loss": 0.6646, + "step": 5442 + }, + { + "epoch": 1.323364940432774, + "grad_norm": 22.0, + "learning_rate": 6.563635919854668e-07, + "loss": 0.6163, + "step": 5443 + }, + { + "epoch": 1.323608071966934, + "grad_norm": 20.875, + "learning_rate": 6.559383485511859e-07, + "loss": 0.5634, + "step": 5444 + }, + { + "epoch": 1.323851203501094, + "grad_norm": 17.5, + "learning_rate": 6.55513193909678e-07, + "loss": 0.5734, + "step": 5445 + }, + { + "epoch": 1.324094335035254, + "grad_norm": 17.875, + "learning_rate": 6.55088128124491e-07, + "loss": 0.7802, + "step": 5446 + }, + { + "epoch": 1.324337466569414, + "grad_norm": 16.125, + "learning_rate": 6.546631512591577e-07, + "loss": 0.4235, + "step": 5447 + }, + { + "epoch": 1.324580598103574, + "grad_norm": 24.0, + "learning_rate": 6.542382633771978e-07, + "loss": 0.943, + "step": 5448 + }, + { + "epoch": 1.324823729637734, + "grad_norm": 18.625, + "learning_rate": 6.53813464542119e-07, + "loss": 0.6957, + "step": 5449 + }, + { + "epoch": 1.3250668611718939, + "grad_norm": 17.625, + "learning_rate": 6.53388754817414e-07, + "loss": 0.7104, + "step": 5450 + }, + { + "epoch": 1.325309992706054, + "grad_norm": 25.75, + "learning_rate": 6.529641342665641e-07, + "loss": 0.9805, + "step": 5451 + }, + { + "epoch": 1.3255531242402139, + "grad_norm": 16.375, + "learning_rate": 6.525396029530349e-07, + "loss": 0.5378, + "step": 5452 + }, + { + "epoch": 1.3257962557743739, + "grad_norm": 16.5, + "learning_rate": 6.521151609402808e-07, + "loss": 0.3958, + "step": 5453 + }, + { + "epoch": 1.3260393873085339, + "grad_norm": 17.75, + "learning_rate": 6.51690808291742e-07, + "loss": 0.6653, + "step": 5454 + }, + { + "epoch": 1.3262825188426939, + "grad_norm": 18.125, + "learning_rate": 6.512665450708445e-07, + "loss": 0.4327, + "step": 5455 + }, + { + "epoch": 1.3265256503768539, + "grad_norm": 21.375, + "learning_rate": 6.508423713410027e-07, + "loss": 0.6051, + "step": 5456 + }, + { + "epoch": 1.3267687819110139, + "grad_norm": 20.5, + "learning_rate": 6.504182871656159e-07, + "loss": 0.6215, + "step": 5457 + }, + { + "epoch": 1.3270119134451739, + "grad_norm": 14.125, + "learning_rate": 6.499942926080716e-07, + "loss": 0.2769, + "step": 5458 + }, + { + "epoch": 1.3272550449793339, + "grad_norm": 20.625, + "learning_rate": 6.495703877317425e-07, + "loss": 0.872, + "step": 5459 + }, + { + "epoch": 1.3274981765134939, + "grad_norm": 19.25, + "learning_rate": 6.49146572599989e-07, + "loss": 0.7892, + "step": 5460 + }, + { + "epoch": 1.3277413080476537, + "grad_norm": 22.0, + "learning_rate": 6.487228472761572e-07, + "loss": 0.9695, + "step": 5461 + }, + { + "epoch": 1.327984439581814, + "grad_norm": 21.125, + "learning_rate": 6.482992118235801e-07, + "loss": 0.7597, + "step": 5462 + }, + { + "epoch": 1.3282275711159737, + "grad_norm": 20.375, + "learning_rate": 6.478756663055779e-07, + "loss": 0.805, + "step": 5463 + }, + { + "epoch": 1.3284707026501337, + "grad_norm": 20.25, + "learning_rate": 6.474522107854564e-07, + "loss": 1.0494, + "step": 5464 + }, + { + "epoch": 1.3287138341842937, + "grad_norm": 20.0, + "learning_rate": 6.470288453265079e-07, + "loss": 0.3861, + "step": 5465 + }, + { + "epoch": 1.3289569657184537, + "grad_norm": 33.25, + "learning_rate": 6.466055699920121e-07, + "loss": 0.9758, + "step": 5466 + }, + { + "epoch": 1.3292000972526137, + "grad_norm": 19.125, + "learning_rate": 6.461823848452353e-07, + "loss": 0.8472, + "step": 5467 + }, + { + "epoch": 1.3294432287867737, + "grad_norm": 19.625, + "learning_rate": 6.457592899494295e-07, + "loss": 0.7136, + "step": 5468 + }, + { + "epoch": 1.3296863603209337, + "grad_norm": 24.125, + "learning_rate": 6.45336285367833e-07, + "loss": 0.9375, + "step": 5469 + }, + { + "epoch": 1.3299294918550937, + "grad_norm": 14.75, + "learning_rate": 6.449133711636718e-07, + "loss": 0.2902, + "step": 5470 + }, + { + "epoch": 1.3301726233892537, + "grad_norm": 18.875, + "learning_rate": 6.444905474001576e-07, + "loss": 0.6764, + "step": 5471 + }, + { + "epoch": 1.3304157549234135, + "grad_norm": 25.375, + "learning_rate": 6.440678141404883e-07, + "loss": 0.4396, + "step": 5472 + }, + { + "epoch": 1.3306588864575735, + "grad_norm": 21.25, + "learning_rate": 6.436451714478487e-07, + "loss": 0.6173, + "step": 5473 + }, + { + "epoch": 1.3309020179917335, + "grad_norm": 21.75, + "learning_rate": 6.432226193854109e-07, + "loss": 0.6661, + "step": 5474 + }, + { + "epoch": 1.3311451495258935, + "grad_norm": 20.625, + "learning_rate": 6.42800158016332e-07, + "loss": 0.8773, + "step": 5475 + }, + { + "epoch": 1.3313882810600535, + "grad_norm": 16.5, + "learning_rate": 6.423777874037558e-07, + "loss": 0.3369, + "step": 5476 + }, + { + "epoch": 1.3316314125942135, + "grad_norm": 21.125, + "learning_rate": 6.419555076108137e-07, + "loss": 0.5003, + "step": 5477 + }, + { + "epoch": 1.3318745441283735, + "grad_norm": 25.75, + "learning_rate": 6.415333187006223e-07, + "loss": 0.7549, + "step": 5478 + }, + { + "epoch": 1.3321176756625335, + "grad_norm": 22.0, + "learning_rate": 6.411112207362846e-07, + "loss": 0.777, + "step": 5479 + }, + { + "epoch": 1.3323608071966935, + "grad_norm": 24.625, + "learning_rate": 6.406892137808907e-07, + "loss": 0.8066, + "step": 5480 + }, + { + "epoch": 1.3326039387308533, + "grad_norm": 21.5, + "learning_rate": 6.402672978975175e-07, + "loss": 0.8785, + "step": 5481 + }, + { + "epoch": 1.3328470702650135, + "grad_norm": 33.5, + "learning_rate": 6.398454731492271e-07, + "loss": 0.7769, + "step": 5482 + }, + { + "epoch": 1.3330902017991733, + "grad_norm": 21.125, + "learning_rate": 6.39423739599068e-07, + "loss": 0.7673, + "step": 5483 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 22.5, + "learning_rate": 6.390020973100763e-07, + "loss": 1.0259, + "step": 5484 + }, + { + "epoch": 1.3335764648674933, + "grad_norm": 29.125, + "learning_rate": 6.385805463452738e-07, + "loss": 0.5669, + "step": 5485 + }, + { + "epoch": 1.3338195964016533, + "grad_norm": 22.625, + "learning_rate": 6.381590867676677e-07, + "loss": 0.7433, + "step": 5486 + }, + { + "epoch": 1.3340627279358133, + "grad_norm": 16.125, + "learning_rate": 6.377377186402531e-07, + "loss": 0.5105, + "step": 5487 + }, + { + "epoch": 1.3343058594699733, + "grad_norm": 19.75, + "learning_rate": 6.373164420260103e-07, + "loss": 1.1968, + "step": 5488 + }, + { + "epoch": 1.3345489910041333, + "grad_norm": 22.875, + "learning_rate": 6.368952569879072e-07, + "loss": 0.9597, + "step": 5489 + }, + { + "epoch": 1.3347921225382933, + "grad_norm": 25.0, + "learning_rate": 6.36474163588896e-07, + "loss": 0.3777, + "step": 5490 + }, + { + "epoch": 1.3350352540724533, + "grad_norm": 33.0, + "learning_rate": 6.360531618919177e-07, + "loss": 0.9988, + "step": 5491 + }, + { + "epoch": 1.335278385606613, + "grad_norm": 25.125, + "learning_rate": 6.356322519598974e-07, + "loss": 0.7696, + "step": 5492 + }, + { + "epoch": 1.3355215171407733, + "grad_norm": 19.0, + "learning_rate": 6.352114338557472e-07, + "loss": 0.8448, + "step": 5493 + }, + { + "epoch": 1.335764648674933, + "grad_norm": 24.5, + "learning_rate": 6.347907076423662e-07, + "loss": 0.6717, + "step": 5494 + }, + { + "epoch": 1.336007780209093, + "grad_norm": 21.875, + "learning_rate": 6.343700733826393e-07, + "loss": 0.9157, + "step": 5495 + }, + { + "epoch": 1.336250911743253, + "grad_norm": 17.375, + "learning_rate": 6.339495311394365e-07, + "loss": 0.4617, + "step": 5496 + }, + { + "epoch": 1.336494043277413, + "grad_norm": 28.0, + "learning_rate": 6.33529080975616e-07, + "loss": 0.8627, + "step": 5497 + }, + { + "epoch": 1.336737174811573, + "grad_norm": 15.5, + "learning_rate": 6.331087229540217e-07, + "loss": 0.6037, + "step": 5498 + }, + { + "epoch": 1.336980306345733, + "grad_norm": 16.375, + "learning_rate": 6.326884571374826e-07, + "loss": 0.3355, + "step": 5499 + }, + { + "epoch": 1.337223437879893, + "grad_norm": 20.75, + "learning_rate": 6.322682835888145e-07, + "loss": 0.6281, + "step": 5500 + }, + { + "epoch": 1.337466569414053, + "grad_norm": 17.375, + "learning_rate": 6.318482023708205e-07, + "loss": 0.3225, + "step": 5501 + }, + { + "epoch": 1.337709700948213, + "grad_norm": 18.625, + "learning_rate": 6.314282135462888e-07, + "loss": 0.8917, + "step": 5502 + }, + { + "epoch": 1.3379528324823728, + "grad_norm": 26.375, + "learning_rate": 6.310083171779929e-07, + "loss": 0.7172, + "step": 5503 + }, + { + "epoch": 1.338195964016533, + "grad_norm": 20.125, + "learning_rate": 6.305885133286943e-07, + "loss": 0.5759, + "step": 5504 + }, + { + "epoch": 1.3384390955506928, + "grad_norm": 14.625, + "learning_rate": 6.301688020611408e-07, + "loss": 0.3775, + "step": 5505 + }, + { + "epoch": 1.3386822270848528, + "grad_norm": 17.0, + "learning_rate": 6.297491834380644e-07, + "loss": 0.7441, + "step": 5506 + }, + { + "epoch": 1.3389253586190129, + "grad_norm": 26.5, + "learning_rate": 6.293296575221842e-07, + "loss": 1.0262, + "step": 5507 + }, + { + "epoch": 1.3391684901531729, + "grad_norm": 16.125, + "learning_rate": 6.289102243762066e-07, + "loss": 0.4134, + "step": 5508 + }, + { + "epoch": 1.3394116216873329, + "grad_norm": 30.5, + "learning_rate": 6.284908840628225e-07, + "loss": 1.225, + "step": 5509 + }, + { + "epoch": 1.3396547532214929, + "grad_norm": 18.375, + "learning_rate": 6.280716366447092e-07, + "loss": 0.4572, + "step": 5510 + }, + { + "epoch": 1.3398978847556529, + "grad_norm": 18.5, + "learning_rate": 6.276524821845307e-07, + "loss": 0.6042, + "step": 5511 + }, + { + "epoch": 1.3401410162898129, + "grad_norm": 19.75, + "learning_rate": 6.272334207449376e-07, + "loss": 0.6372, + "step": 5512 + }, + { + "epoch": 1.3403841478239729, + "grad_norm": 19.125, + "learning_rate": 6.268144523885656e-07, + "loss": 0.8907, + "step": 5513 + }, + { + "epoch": 1.3406272793581326, + "grad_norm": 23.25, + "learning_rate": 6.263955771780355e-07, + "loss": 0.9603, + "step": 5514 + }, + { + "epoch": 1.3408704108922929, + "grad_norm": 17.0, + "learning_rate": 6.259767951759573e-07, + "loss": 0.6493, + "step": 5515 + }, + { + "epoch": 1.3411135424264526, + "grad_norm": 21.875, + "learning_rate": 6.255581064449242e-07, + "loss": 1.1151, + "step": 5516 + }, + { + "epoch": 1.3413566739606126, + "grad_norm": 16.5, + "learning_rate": 6.25139511047516e-07, + "loss": 0.5045, + "step": 5517 + }, + { + "epoch": 1.3415998054947726, + "grad_norm": 19.75, + "learning_rate": 6.247210090463002e-07, + "loss": 0.7112, + "step": 5518 + }, + { + "epoch": 1.3418429370289326, + "grad_norm": 30.375, + "learning_rate": 6.24302600503828e-07, + "loss": 0.6472, + "step": 5519 + }, + { + "epoch": 1.3420860685630926, + "grad_norm": 23.0, + "learning_rate": 6.238842854826388e-07, + "loss": 0.5853, + "step": 5520 + }, + { + "epoch": 1.3423292000972527, + "grad_norm": 20.625, + "learning_rate": 6.23466064045256e-07, + "loss": 0.8197, + "step": 5521 + }, + { + "epoch": 1.3425723316314127, + "grad_norm": 24.875, + "learning_rate": 6.23047936254191e-07, + "loss": 0.8313, + "step": 5522 + }, + { + "epoch": 1.3428154631655727, + "grad_norm": 18.75, + "learning_rate": 6.2262990217194e-07, + "loss": 0.4227, + "step": 5523 + }, + { + "epoch": 1.3430585946997327, + "grad_norm": 17.875, + "learning_rate": 6.222119618609844e-07, + "loss": 0.4389, + "step": 5524 + }, + { + "epoch": 1.3433017262338924, + "grad_norm": 18.5, + "learning_rate": 6.217941153837941e-07, + "loss": 0.6153, + "step": 5525 + }, + { + "epoch": 1.3435448577680524, + "grad_norm": 25.875, + "learning_rate": 6.213763628028226e-07, + "loss": 0.9794, + "step": 5526 + }, + { + "epoch": 1.3437879893022124, + "grad_norm": 22.125, + "learning_rate": 6.209587041805099e-07, + "loss": 0.8706, + "step": 5527 + }, + { + "epoch": 1.3440311208363724, + "grad_norm": 17.875, + "learning_rate": 6.205411395792827e-07, + "loss": 0.786, + "step": 5528 + }, + { + "epoch": 1.3442742523705324, + "grad_norm": 17.375, + "learning_rate": 6.201236690615541e-07, + "loss": 0.8461, + "step": 5529 + }, + { + "epoch": 1.3445173839046924, + "grad_norm": 20.0, + "learning_rate": 6.197062926897213e-07, + "loss": 0.9461, + "step": 5530 + }, + { + "epoch": 1.3447605154388524, + "grad_norm": 17.5, + "learning_rate": 6.192890105261682e-07, + "loss": 0.5896, + "step": 5531 + }, + { + "epoch": 1.3450036469730124, + "grad_norm": 15.6875, + "learning_rate": 6.188718226332656e-07, + "loss": 0.4941, + "step": 5532 + }, + { + "epoch": 1.3452467785071724, + "grad_norm": 31.0, + "learning_rate": 6.18454729073369e-07, + "loss": 0.824, + "step": 5533 + }, + { + "epoch": 1.3454899100413322, + "grad_norm": 20.25, + "learning_rate": 6.1803772990882e-07, + "loss": 0.7432, + "step": 5534 + }, + { + "epoch": 1.3457330415754925, + "grad_norm": 17.75, + "learning_rate": 6.176208252019467e-07, + "loss": 0.7477, + "step": 5535 + }, + { + "epoch": 1.3459761731096522, + "grad_norm": 21.5, + "learning_rate": 6.172040150150629e-07, + "loss": 0.7695, + "step": 5536 + }, + { + "epoch": 1.3462193046438122, + "grad_norm": 17.75, + "learning_rate": 6.167872994104679e-07, + "loss": 0.4621, + "step": 5537 + }, + { + "epoch": 1.3464624361779722, + "grad_norm": 17.875, + "learning_rate": 6.163706784504466e-07, + "loss": 1.0213, + "step": 5538 + }, + { + "epoch": 1.3467055677121322, + "grad_norm": 23.0, + "learning_rate": 6.159541521972709e-07, + "loss": 0.4595, + "step": 5539 + }, + { + "epoch": 1.3469486992462922, + "grad_norm": 18.25, + "learning_rate": 6.155377207131975e-07, + "loss": 0.3787, + "step": 5540 + }, + { + "epoch": 1.3471918307804522, + "grad_norm": 18.875, + "learning_rate": 6.151213840604688e-07, + "loss": 0.9116, + "step": 5541 + }, + { + "epoch": 1.3474349623146122, + "grad_norm": 17.75, + "learning_rate": 6.147051423013141e-07, + "loss": 0.363, + "step": 5542 + }, + { + "epoch": 1.3476780938487722, + "grad_norm": 20.375, + "learning_rate": 6.142889954979482e-07, + "loss": 0.9249, + "step": 5543 + }, + { + "epoch": 1.3479212253829322, + "grad_norm": 19.75, + "learning_rate": 6.138729437125711e-07, + "loss": 1.2321, + "step": 5544 + }, + { + "epoch": 1.348164356917092, + "grad_norm": 18.125, + "learning_rate": 6.134569870073684e-07, + "loss": 0.6476, + "step": 5545 + }, + { + "epoch": 1.3484074884512522, + "grad_norm": 18.125, + "learning_rate": 6.130411254445129e-07, + "loss": 0.4712, + "step": 5546 + }, + { + "epoch": 1.348650619985412, + "grad_norm": 18.375, + "learning_rate": 6.126253590861618e-07, + "loss": 0.5074, + "step": 5547 + }, + { + "epoch": 1.348893751519572, + "grad_norm": 18.0, + "learning_rate": 6.122096879944582e-07, + "loss": 0.741, + "step": 5548 + }, + { + "epoch": 1.349136883053732, + "grad_norm": 18.75, + "learning_rate": 6.117941122315323e-07, + "loss": 0.7042, + "step": 5549 + }, + { + "epoch": 1.349380014587892, + "grad_norm": 20.875, + "learning_rate": 6.113786318594979e-07, + "loss": 0.8076, + "step": 5550 + }, + { + "epoch": 1.349623146122052, + "grad_norm": 22.125, + "learning_rate": 6.109632469404568e-07, + "loss": 0.5434, + "step": 5551 + }, + { + "epoch": 1.349866277656212, + "grad_norm": 22.5, + "learning_rate": 6.105479575364946e-07, + "loss": 0.7464, + "step": 5552 + }, + { + "epoch": 1.350109409190372, + "grad_norm": 26.25, + "learning_rate": 6.101327637096844e-07, + "loss": 0.649, + "step": 5553 + }, + { + "epoch": 1.350352540724532, + "grad_norm": 22.0, + "learning_rate": 6.097176655220835e-07, + "loss": 0.6611, + "step": 5554 + }, + { + "epoch": 1.350595672258692, + "grad_norm": 18.125, + "learning_rate": 6.09302663035735e-07, + "loss": 0.4305, + "step": 5555 + }, + { + "epoch": 1.3508388037928518, + "grad_norm": 22.5, + "learning_rate": 6.088877563126694e-07, + "loss": 0.6376, + "step": 5556 + }, + { + "epoch": 1.351081935327012, + "grad_norm": 23.375, + "learning_rate": 6.084729454149008e-07, + "loss": 0.7713, + "step": 5557 + }, + { + "epoch": 1.3513250668611718, + "grad_norm": 21.125, + "learning_rate": 6.080582304044297e-07, + "loss": 0.7473, + "step": 5558 + }, + { + "epoch": 1.3515681983953318, + "grad_norm": 24.875, + "learning_rate": 6.076436113432427e-07, + "loss": 0.5891, + "step": 5559 + }, + { + "epoch": 1.3518113299294918, + "grad_norm": 20.875, + "learning_rate": 6.072290882933124e-07, + "loss": 0.6971, + "step": 5560 + }, + { + "epoch": 1.3520544614636518, + "grad_norm": 22.125, + "learning_rate": 6.068146613165959e-07, + "loss": 0.5151, + "step": 5561 + }, + { + "epoch": 1.3522975929978118, + "grad_norm": 19.875, + "learning_rate": 6.06400330475036e-07, + "loss": 0.3973, + "step": 5562 + }, + { + "epoch": 1.3525407245319718, + "grad_norm": 17.875, + "learning_rate": 6.059860958305625e-07, + "loss": 0.5082, + "step": 5563 + }, + { + "epoch": 1.3527838560661318, + "grad_norm": 19.5, + "learning_rate": 6.055719574450895e-07, + "loss": 0.4535, + "step": 5564 + }, + { + "epoch": 1.3530269876002918, + "grad_norm": 21.625, + "learning_rate": 6.051579153805166e-07, + "loss": 0.9679, + "step": 5565 + }, + { + "epoch": 1.3532701191344518, + "grad_norm": 21.25, + "learning_rate": 6.047439696987302e-07, + "loss": 0.7651, + "step": 5566 + }, + { + "epoch": 1.3535132506686116, + "grad_norm": 15.4375, + "learning_rate": 6.043301204616017e-07, + "loss": 0.2833, + "step": 5567 + }, + { + "epoch": 1.3537563822027718, + "grad_norm": 14.5, + "learning_rate": 6.039163677309881e-07, + "loss": 0.5003, + "step": 5568 + }, + { + "epoch": 1.3539995137369316, + "grad_norm": 17.125, + "learning_rate": 6.035027115687309e-07, + "loss": 0.595, + "step": 5569 + }, + { + "epoch": 1.3542426452710916, + "grad_norm": 21.375, + "learning_rate": 6.030891520366594e-07, + "loss": 0.8139, + "step": 5570 + }, + { + "epoch": 1.3544857768052516, + "grad_norm": 22.625, + "learning_rate": 6.026756891965867e-07, + "loss": 0.5346, + "step": 5571 + }, + { + "epoch": 1.3547289083394116, + "grad_norm": 17.875, + "learning_rate": 6.022623231103114e-07, + "loss": 0.8117, + "step": 5572 + }, + { + "epoch": 1.3549720398735716, + "grad_norm": 17.375, + "learning_rate": 6.018490538396188e-07, + "loss": 0.5895, + "step": 5573 + }, + { + "epoch": 1.3552151714077316, + "grad_norm": 19.875, + "learning_rate": 6.014358814462793e-07, + "loss": 0.4554, + "step": 5574 + }, + { + "epoch": 1.3554583029418916, + "grad_norm": 27.25, + "learning_rate": 6.010228059920487e-07, + "loss": 1.1678, + "step": 5575 + }, + { + "epoch": 1.3557014344760516, + "grad_norm": 24.375, + "learning_rate": 6.006098275386674e-07, + "loss": 0.8656, + "step": 5576 + }, + { + "epoch": 1.3559445660102116, + "grad_norm": 16.125, + "learning_rate": 6.001969461478631e-07, + "loss": 0.7357, + "step": 5577 + }, + { + "epoch": 1.3561876975443714, + "grad_norm": 14.125, + "learning_rate": 5.997841618813478e-07, + "loss": 0.3415, + "step": 5578 + }, + { + "epoch": 1.3564308290785314, + "grad_norm": 24.625, + "learning_rate": 5.993714748008186e-07, + "loss": 0.6959, + "step": 5579 + }, + { + "epoch": 1.3566739606126914, + "grad_norm": 17.125, + "learning_rate": 5.989588849679598e-07, + "loss": 0.3308, + "step": 5580 + }, + { + "epoch": 1.3569170921468514, + "grad_norm": 27.0, + "learning_rate": 5.985463924444388e-07, + "loss": 0.692, + "step": 5581 + }, + { + "epoch": 1.3571602236810114, + "grad_norm": 18.875, + "learning_rate": 5.981339972919112e-07, + "loss": 0.8544, + "step": 5582 + }, + { + "epoch": 1.3574033552151714, + "grad_norm": 15.6875, + "learning_rate": 5.977216995720152e-07, + "loss": 0.4573, + "step": 5583 + }, + { + "epoch": 1.3576464867493314, + "grad_norm": 17.625, + "learning_rate": 5.97309499346377e-07, + "loss": 0.5268, + "step": 5584 + }, + { + "epoch": 1.3578896182834914, + "grad_norm": 19.875, + "learning_rate": 5.968973966766064e-07, + "loss": 0.7208, + "step": 5585 + }, + { + "epoch": 1.3581327498176514, + "grad_norm": 26.25, + "learning_rate": 5.964853916242988e-07, + "loss": 0.7436, + "step": 5586 + }, + { + "epoch": 1.3583758813518112, + "grad_norm": 21.125, + "learning_rate": 5.960734842510366e-07, + "loss": 0.4939, + "step": 5587 + }, + { + "epoch": 1.3586190128859714, + "grad_norm": 15.25, + "learning_rate": 5.956616746183859e-07, + "loss": 0.4749, + "step": 5588 + }, + { + "epoch": 1.3588621444201312, + "grad_norm": 21.25, + "learning_rate": 5.952499627878981e-07, + "loss": 0.5949, + "step": 5589 + }, + { + "epoch": 1.3591052759542912, + "grad_norm": 16.125, + "learning_rate": 5.948383488211114e-07, + "loss": 0.3801, + "step": 5590 + }, + { + "epoch": 1.3593484074884512, + "grad_norm": 15.3125, + "learning_rate": 5.94426832779549e-07, + "loss": 0.6721, + "step": 5591 + }, + { + "epoch": 1.3595915390226112, + "grad_norm": 19.625, + "learning_rate": 5.940154147247186e-07, + "loss": 0.7625, + "step": 5592 + }, + { + "epoch": 1.3598346705567712, + "grad_norm": 16.125, + "learning_rate": 5.936040947181132e-07, + "loss": 0.4876, + "step": 5593 + }, + { + "epoch": 1.3600778020909312, + "grad_norm": 16.125, + "learning_rate": 5.931928728212126e-07, + "loss": 0.2877, + "step": 5594 + }, + { + "epoch": 1.3603209336250912, + "grad_norm": 14.3125, + "learning_rate": 5.927817490954807e-07, + "loss": 0.2737, + "step": 5595 + }, + { + "epoch": 1.3605640651592512, + "grad_norm": 20.875, + "learning_rate": 5.923707236023663e-07, + "loss": 0.7164, + "step": 5596 + }, + { + "epoch": 1.3608071966934112, + "grad_norm": 17.75, + "learning_rate": 5.919597964033051e-07, + "loss": 0.8874, + "step": 5597 + }, + { + "epoch": 1.361050328227571, + "grad_norm": 21.75, + "learning_rate": 5.915489675597176e-07, + "loss": 0.5238, + "step": 5598 + }, + { + "epoch": 1.3612934597617312, + "grad_norm": 20.125, + "learning_rate": 5.911382371330087e-07, + "loss": 0.6454, + "step": 5599 + }, + { + "epoch": 1.361536591295891, + "grad_norm": 20.5, + "learning_rate": 5.907276051845686e-07, + "loss": 0.8754, + "step": 5600 + }, + { + "epoch": 1.361779722830051, + "grad_norm": 16.375, + "learning_rate": 5.903170717757745e-07, + "loss": 0.5905, + "step": 5601 + }, + { + "epoch": 1.362022854364211, + "grad_norm": 16.5, + "learning_rate": 5.89906636967987e-07, + "loss": 0.6097, + "step": 5602 + }, + { + "epoch": 1.362265985898371, + "grad_norm": 18.375, + "learning_rate": 5.894963008225525e-07, + "loss": 0.8333, + "step": 5603 + }, + { + "epoch": 1.362509117432531, + "grad_norm": 25.75, + "learning_rate": 5.89086063400803e-07, + "loss": 0.7765, + "step": 5604 + }, + { + "epoch": 1.362752248966691, + "grad_norm": 32.25, + "learning_rate": 5.886759247640564e-07, + "loss": 0.7475, + "step": 5605 + }, + { + "epoch": 1.362995380500851, + "grad_norm": 18.75, + "learning_rate": 5.88265884973614e-07, + "loss": 0.4395, + "step": 5606 + }, + { + "epoch": 1.363238512035011, + "grad_norm": 18.0, + "learning_rate": 5.878559440907635e-07, + "loss": 0.4385, + "step": 5607 + }, + { + "epoch": 1.363481643569171, + "grad_norm": 20.75, + "learning_rate": 5.87446102176778e-07, + "loss": 0.7954, + "step": 5608 + }, + { + "epoch": 1.3637247751033308, + "grad_norm": 19.875, + "learning_rate": 5.870363592929154e-07, + "loss": 0.4758, + "step": 5609 + }, + { + "epoch": 1.363967906637491, + "grad_norm": 22.375, + "learning_rate": 5.866267155004181e-07, + "loss": 0.9392, + "step": 5610 + }, + { + "epoch": 1.3642110381716508, + "grad_norm": 26.625, + "learning_rate": 5.862171708605155e-07, + "loss": 0.9368, + "step": 5611 + }, + { + "epoch": 1.3644541697058108, + "grad_norm": 29.375, + "learning_rate": 5.858077254344203e-07, + "loss": 0.625, + "step": 5612 + }, + { + "epoch": 1.3646973012399708, + "grad_norm": 22.25, + "learning_rate": 5.853983792833318e-07, + "loss": 0.6548, + "step": 5613 + }, + { + "epoch": 1.3649404327741308, + "grad_norm": 20.0, + "learning_rate": 5.849891324684331e-07, + "loss": 0.6221, + "step": 5614 + }, + { + "epoch": 1.3651835643082908, + "grad_norm": 23.125, + "learning_rate": 5.845799850508943e-07, + "loss": 0.9084, + "step": 5615 + }, + { + "epoch": 1.3654266958424508, + "grad_norm": 20.0, + "learning_rate": 5.84170937091869e-07, + "loss": 1.063, + "step": 5616 + }, + { + "epoch": 1.3656698273766108, + "grad_norm": 20.75, + "learning_rate": 5.837619886524957e-07, + "loss": 0.8154, + "step": 5617 + }, + { + "epoch": 1.3659129589107708, + "grad_norm": 20.25, + "learning_rate": 5.833531397939002e-07, + "loss": 0.5817, + "step": 5618 + }, + { + "epoch": 1.3661560904449308, + "grad_norm": 26.625, + "learning_rate": 5.829443905771913e-07, + "loss": 1.072, + "step": 5619 + }, + { + "epoch": 1.3663992219790906, + "grad_norm": 16.875, + "learning_rate": 5.825357410634628e-07, + "loss": 0.4841, + "step": 5620 + }, + { + "epoch": 1.3666423535132508, + "grad_norm": 16.625, + "learning_rate": 5.82127191313796e-07, + "loss": 0.6542, + "step": 5621 + }, + { + "epoch": 1.3668854850474106, + "grad_norm": 16.875, + "learning_rate": 5.817187413892552e-07, + "loss": 0.6381, + "step": 5622 + }, + { + "epoch": 1.3671286165815706, + "grad_norm": 22.5, + "learning_rate": 5.813103913508902e-07, + "loss": 0.8829, + "step": 5623 + }, + { + "epoch": 1.3673717481157306, + "grad_norm": 22.125, + "learning_rate": 5.809021412597353e-07, + "loss": 1.1265, + "step": 5624 + }, + { + "epoch": 1.3676148796498906, + "grad_norm": 27.0, + "learning_rate": 5.804939911768117e-07, + "loss": 0.8627, + "step": 5625 + }, + { + "epoch": 1.3678580111840506, + "grad_norm": 19.0, + "learning_rate": 5.800859411631236e-07, + "loss": 0.618, + "step": 5626 + }, + { + "epoch": 1.3681011427182106, + "grad_norm": 19.5, + "learning_rate": 5.796779912796608e-07, + "loss": 0.5365, + "step": 5627 + }, + { + "epoch": 1.3683442742523706, + "grad_norm": 21.5, + "learning_rate": 5.792701415873998e-07, + "loss": 0.3092, + "step": 5628 + }, + { + "epoch": 1.3685874057865304, + "grad_norm": 21.625, + "learning_rate": 5.788623921473001e-07, + "loss": 0.7268, + "step": 5629 + }, + { + "epoch": 1.3688305373206906, + "grad_norm": 19.625, + "learning_rate": 5.784547430203068e-07, + "loss": 0.6092, + "step": 5630 + }, + { + "epoch": 1.3690736688548504, + "grad_norm": 16.875, + "learning_rate": 5.780471942673498e-07, + "loss": 0.3973, + "step": 5631 + }, + { + "epoch": 1.3693168003890104, + "grad_norm": 21.875, + "learning_rate": 5.77639745949345e-07, + "loss": 0.5814, + "step": 5632 + }, + { + "epoch": 1.3695599319231704, + "grad_norm": 14.1875, + "learning_rate": 5.772323981271922e-07, + "loss": 0.3311, + "step": 5633 + }, + { + "epoch": 1.3698030634573304, + "grad_norm": 16.875, + "learning_rate": 5.768251508617762e-07, + "loss": 0.7044, + "step": 5634 + }, + { + "epoch": 1.3700461949914904, + "grad_norm": 16.625, + "learning_rate": 5.764180042139675e-07, + "loss": 0.5737, + "step": 5635 + }, + { + "epoch": 1.3702893265256504, + "grad_norm": 21.375, + "learning_rate": 5.760109582446218e-07, + "loss": 0.6558, + "step": 5636 + }, + { + "epoch": 1.3705324580598104, + "grad_norm": 30.375, + "learning_rate": 5.756040130145781e-07, + "loss": 0.6485, + "step": 5637 + }, + { + "epoch": 1.3707755895939704, + "grad_norm": 26.75, + "learning_rate": 5.751971685846622e-07, + "loss": 0.8077, + "step": 5638 + }, + { + "epoch": 1.3710187211281304, + "grad_norm": 16.875, + "learning_rate": 5.747904250156839e-07, + "loss": 0.6828, + "step": 5639 + }, + { + "epoch": 1.3712618526622902, + "grad_norm": 21.875, + "learning_rate": 5.743837823684377e-07, + "loss": 0.8346, + "step": 5640 + }, + { + "epoch": 1.3715049841964504, + "grad_norm": 15.0625, + "learning_rate": 5.739772407037031e-07, + "loss": 0.6601, + "step": 5641 + }, + { + "epoch": 1.3717481157306102, + "grad_norm": 18.625, + "learning_rate": 5.735708000822456e-07, + "loss": 0.3835, + "step": 5642 + }, + { + "epoch": 1.3719912472647702, + "grad_norm": 16.5, + "learning_rate": 5.73164460564814e-07, + "loss": 0.6508, + "step": 5643 + }, + { + "epoch": 1.3722343787989302, + "grad_norm": 15.25, + "learning_rate": 5.72758222212143e-07, + "loss": 0.4461, + "step": 5644 + }, + { + "epoch": 1.3724775103330902, + "grad_norm": 19.625, + "learning_rate": 5.723520850849526e-07, + "loss": 0.5593, + "step": 5645 + }, + { + "epoch": 1.3727206418672502, + "grad_norm": 21.5, + "learning_rate": 5.719460492439465e-07, + "loss": 0.7001, + "step": 5646 + }, + { + "epoch": 1.3729637734014102, + "grad_norm": 16.5, + "learning_rate": 5.715401147498137e-07, + "loss": 0.392, + "step": 5647 + }, + { + "epoch": 1.3732069049355702, + "grad_norm": 19.375, + "learning_rate": 5.711342816632277e-07, + "loss": 0.8772, + "step": 5648 + }, + { + "epoch": 1.3734500364697302, + "grad_norm": 16.875, + "learning_rate": 5.707285500448483e-07, + "loss": 0.2809, + "step": 5649 + }, + { + "epoch": 1.3736931680038902, + "grad_norm": 22.0, + "learning_rate": 5.703229199553178e-07, + "loss": 0.8582, + "step": 5650 + }, + { + "epoch": 1.37393629953805, + "grad_norm": 19.25, + "learning_rate": 5.699173914552655e-07, + "loss": 0.7408, + "step": 5651 + }, + { + "epoch": 1.3741794310722102, + "grad_norm": 17.5, + "learning_rate": 5.69511964605305e-07, + "loss": 0.4454, + "step": 5652 + }, + { + "epoch": 1.37442256260637, + "grad_norm": 14.9375, + "learning_rate": 5.691066394660337e-07, + "loss": 0.4825, + "step": 5653 + }, + { + "epoch": 1.37466569414053, + "grad_norm": 18.5, + "learning_rate": 5.687014160980346e-07, + "loss": 0.7598, + "step": 5654 + }, + { + "epoch": 1.37490882567469, + "grad_norm": 18.875, + "learning_rate": 5.682962945618749e-07, + "loss": 0.66, + "step": 5655 + }, + { + "epoch": 1.37515195720885, + "grad_norm": 28.25, + "learning_rate": 5.678912749181079e-07, + "loss": 1.0435, + "step": 5656 + }, + { + "epoch": 1.37539508874301, + "grad_norm": 18.25, + "learning_rate": 5.674863572272701e-07, + "loss": 0.938, + "step": 5657 + }, + { + "epoch": 1.37563822027717, + "grad_norm": 22.5, + "learning_rate": 5.670815415498829e-07, + "loss": 0.6949, + "step": 5658 + }, + { + "epoch": 1.37588135181133, + "grad_norm": 15.6875, + "learning_rate": 5.666768279464546e-07, + "loss": 0.6275, + "step": 5659 + }, + { + "epoch": 1.37612448334549, + "grad_norm": 20.5, + "learning_rate": 5.662722164774757e-07, + "loss": 0.6347, + "step": 5660 + }, + { + "epoch": 1.37636761487965, + "grad_norm": 17.375, + "learning_rate": 5.65867707203422e-07, + "loss": 0.4161, + "step": 5661 + }, + { + "epoch": 1.3766107464138098, + "grad_norm": 16.75, + "learning_rate": 5.654633001847553e-07, + "loss": 0.643, + "step": 5662 + }, + { + "epoch": 1.37685387794797, + "grad_norm": 20.375, + "learning_rate": 5.650589954819208e-07, + "loss": 1.1652, + "step": 5663 + }, + { + "epoch": 1.3770970094821298, + "grad_norm": 17.875, + "learning_rate": 5.646547931553487e-07, + "loss": 0.3996, + "step": 5664 + }, + { + "epoch": 1.3773401410162898, + "grad_norm": 25.0, + "learning_rate": 5.642506932654536e-07, + "loss": 0.6739, + "step": 5665 + }, + { + "epoch": 1.3775832725504498, + "grad_norm": 21.375, + "learning_rate": 5.638466958726358e-07, + "loss": 0.8701, + "step": 5666 + }, + { + "epoch": 1.3778264040846098, + "grad_norm": 31.125, + "learning_rate": 5.6344280103728e-07, + "loss": 0.8251, + "step": 5667 + }, + { + "epoch": 1.3780695356187698, + "grad_norm": 17.125, + "learning_rate": 5.630390088197543e-07, + "loss": 0.6207, + "step": 5668 + }, + { + "epoch": 1.3783126671529298, + "grad_norm": 20.375, + "learning_rate": 5.626353192804135e-07, + "loss": 0.7564, + "step": 5669 + }, + { + "epoch": 1.3785557986870898, + "grad_norm": 28.25, + "learning_rate": 5.622317324795954e-07, + "loss": 0.7164, + "step": 5670 + }, + { + "epoch": 1.3787989302212498, + "grad_norm": 67.5, + "learning_rate": 5.61828248477623e-07, + "loss": 1.9273, + "step": 5671 + }, + { + "epoch": 1.3790420617554098, + "grad_norm": 17.875, + "learning_rate": 5.614248673348036e-07, + "loss": 0.5701, + "step": 5672 + }, + { + "epoch": 1.3792851932895696, + "grad_norm": 27.125, + "learning_rate": 5.610215891114304e-07, + "loss": 1.1853, + "step": 5673 + }, + { + "epoch": 1.3795283248237296, + "grad_norm": 20.0, + "learning_rate": 5.606184138677792e-07, + "loss": 0.7212, + "step": 5674 + }, + { + "epoch": 1.3797714563578896, + "grad_norm": 26.5, + "learning_rate": 5.602153416641121e-07, + "loss": 0.94, + "step": 5675 + }, + { + "epoch": 1.3800145878920496, + "grad_norm": 19.75, + "learning_rate": 5.598123725606757e-07, + "loss": 0.7837, + "step": 5676 + }, + { + "epoch": 1.3802577194262096, + "grad_norm": 19.875, + "learning_rate": 5.594095066177001e-07, + "loss": 0.6102, + "step": 5677 + }, + { + "epoch": 1.3805008509603696, + "grad_norm": 20.25, + "learning_rate": 5.590067438954003e-07, + "loss": 0.488, + "step": 5678 + }, + { + "epoch": 1.3807439824945296, + "grad_norm": 19.75, + "learning_rate": 5.586040844539769e-07, + "loss": 0.5539, + "step": 5679 + }, + { + "epoch": 1.3809871140286896, + "grad_norm": 18.125, + "learning_rate": 5.582015283536137e-07, + "loss": 0.6042, + "step": 5680 + }, + { + "epoch": 1.3812302455628496, + "grad_norm": 18.375, + "learning_rate": 5.577990756544797e-07, + "loss": 0.7593, + "step": 5681 + }, + { + "epoch": 1.3814733770970093, + "grad_norm": 17.5, + "learning_rate": 5.573967264167285e-07, + "loss": 0.7128, + "step": 5682 + }, + { + "epoch": 1.3817165086311696, + "grad_norm": 16.0, + "learning_rate": 5.569944807004986e-07, + "loss": 0.5725, + "step": 5683 + }, + { + "epoch": 1.3819596401653294, + "grad_norm": 16.375, + "learning_rate": 5.565923385659122e-07, + "loss": 0.3594, + "step": 5684 + }, + { + "epoch": 1.3822027716994894, + "grad_norm": 19.75, + "learning_rate": 5.561903000730759e-07, + "loss": 0.7702, + "step": 5685 + }, + { + "epoch": 1.3824459032336494, + "grad_norm": 21.0, + "learning_rate": 5.557883652820823e-07, + "loss": 0.7749, + "step": 5686 + }, + { + "epoch": 1.3826890347678094, + "grad_norm": 15.6875, + "learning_rate": 5.553865342530068e-07, + "loss": 0.666, + "step": 5687 + }, + { + "epoch": 1.3829321663019694, + "grad_norm": 21.625, + "learning_rate": 5.549848070459102e-07, + "loss": 0.589, + "step": 5688 + }, + { + "epoch": 1.3831752978361294, + "grad_norm": 17.0, + "learning_rate": 5.545831837208366e-07, + "loss": 0.6232, + "step": 5689 + }, + { + "epoch": 1.3834184293702894, + "grad_norm": 19.25, + "learning_rate": 5.541816643378174e-07, + "loss": 0.5763, + "step": 5690 + }, + { + "epoch": 1.3836615609044494, + "grad_norm": 18.125, + "learning_rate": 5.537802489568657e-07, + "loss": 0.56, + "step": 5691 + }, + { + "epoch": 1.3839046924386094, + "grad_norm": 24.5, + "learning_rate": 5.533789376379795e-07, + "loss": 0.7102, + "step": 5692 + }, + { + "epoch": 1.3841478239727691, + "grad_norm": 27.875, + "learning_rate": 5.529777304411424e-07, + "loss": 1.0647, + "step": 5693 + }, + { + "epoch": 1.3843909555069294, + "grad_norm": 27.625, + "learning_rate": 5.525766274263217e-07, + "loss": 0.6592, + "step": 5694 + }, + { + "epoch": 1.3846340870410891, + "grad_norm": 14.0625, + "learning_rate": 5.521756286534682e-07, + "loss": 0.3931, + "step": 5695 + }, + { + "epoch": 1.3848772185752491, + "grad_norm": 18.5, + "learning_rate": 5.517747341825196e-07, + "loss": 0.6468, + "step": 5696 + }, + { + "epoch": 1.3851203501094091, + "grad_norm": 20.5, + "learning_rate": 5.513739440733953e-07, + "loss": 0.492, + "step": 5697 + }, + { + "epoch": 1.3853634816435692, + "grad_norm": 23.625, + "learning_rate": 5.50973258386001e-07, + "loss": 0.6066, + "step": 5698 + }, + { + "epoch": 1.3856066131777292, + "grad_norm": 22.5, + "learning_rate": 5.505726771802254e-07, + "loss": 0.4876, + "step": 5699 + }, + { + "epoch": 1.3858497447118892, + "grad_norm": 23.625, + "learning_rate": 5.501722005159432e-07, + "loss": 0.6459, + "step": 5700 + }, + { + "epoch": 1.3860928762460492, + "grad_norm": 21.5, + "learning_rate": 5.497718284530123e-07, + "loss": 0.8453, + "step": 5701 + }, + { + "epoch": 1.3863360077802092, + "grad_norm": 24.75, + "learning_rate": 5.493715610512742e-07, + "loss": 0.812, + "step": 5702 + }, + { + "epoch": 1.3865791393143692, + "grad_norm": 23.625, + "learning_rate": 5.489713983705572e-07, + "loss": 0.6894, + "step": 5703 + }, + { + "epoch": 1.386822270848529, + "grad_norm": 24.5, + "learning_rate": 5.485713404706719e-07, + "loss": 0.7908, + "step": 5704 + }, + { + "epoch": 1.3870654023826892, + "grad_norm": 18.5, + "learning_rate": 5.481713874114134e-07, + "loss": 0.4933, + "step": 5705 + }, + { + "epoch": 1.387308533916849, + "grad_norm": 26.375, + "learning_rate": 5.47771539252562e-07, + "loss": 0.6707, + "step": 5706 + }, + { + "epoch": 1.387551665451009, + "grad_norm": 25.875, + "learning_rate": 5.473717960538825e-07, + "loss": 0.8116, + "step": 5707 + }, + { + "epoch": 1.387794796985169, + "grad_norm": 16.625, + "learning_rate": 5.469721578751229e-07, + "loss": 0.4498, + "step": 5708 + }, + { + "epoch": 1.388037928519329, + "grad_norm": 16.125, + "learning_rate": 5.465726247760155e-07, + "loss": 0.7928, + "step": 5709 + }, + { + "epoch": 1.388281060053489, + "grad_norm": 23.25, + "learning_rate": 5.461731968162784e-07, + "loss": 0.736, + "step": 5710 + }, + { + "epoch": 1.388524191587649, + "grad_norm": 19.75, + "learning_rate": 5.457738740556125e-07, + "loss": 0.8674, + "step": 5711 + }, + { + "epoch": 1.388767323121809, + "grad_norm": 20.0, + "learning_rate": 5.453746565537031e-07, + "loss": 0.905, + "step": 5712 + }, + { + "epoch": 1.389010454655969, + "grad_norm": 24.5, + "learning_rate": 5.449755443702206e-07, + "loss": 0.7641, + "step": 5713 + }, + { + "epoch": 1.389253586190129, + "grad_norm": 22.5, + "learning_rate": 5.445765375648197e-07, + "loss": 0.8638, + "step": 5714 + }, + { + "epoch": 1.3894967177242887, + "grad_norm": 14.4375, + "learning_rate": 5.441776361971382e-07, + "loss": 0.2904, + "step": 5715 + }, + { + "epoch": 1.389739849258449, + "grad_norm": 24.75, + "learning_rate": 5.437788403267986e-07, + "loss": 0.8309, + "step": 5716 + }, + { + "epoch": 1.3899829807926087, + "grad_norm": 20.25, + "learning_rate": 5.433801500134087e-07, + "loss": 0.7246, + "step": 5717 + }, + { + "epoch": 1.3902261123267687, + "grad_norm": 20.375, + "learning_rate": 5.42981565316559e-07, + "loss": 0.8662, + "step": 5718 + }, + { + "epoch": 1.3904692438609287, + "grad_norm": 17.5, + "learning_rate": 5.425830862958245e-07, + "loss": 0.4607, + "step": 5719 + }, + { + "epoch": 1.3907123753950887, + "grad_norm": 14.375, + "learning_rate": 5.421847130107655e-07, + "loss": 0.2762, + "step": 5720 + }, + { + "epoch": 1.3909555069292487, + "grad_norm": 36.25, + "learning_rate": 5.417864455209257e-07, + "loss": 0.9315, + "step": 5721 + }, + { + "epoch": 1.3911986384634087, + "grad_norm": 22.125, + "learning_rate": 5.413882838858331e-07, + "loss": 0.8537, + "step": 5722 + }, + { + "epoch": 1.3914417699975687, + "grad_norm": 22.0, + "learning_rate": 5.409902281649991e-07, + "loss": 0.7202, + "step": 5723 + }, + { + "epoch": 1.3916849015317287, + "grad_norm": 21.0, + "learning_rate": 5.405922784179209e-07, + "loss": 0.603, + "step": 5724 + }, + { + "epoch": 1.3919280330658887, + "grad_norm": 19.625, + "learning_rate": 5.401944347040787e-07, + "loss": 0.5254, + "step": 5725 + }, + { + "epoch": 1.3921711646000485, + "grad_norm": 28.375, + "learning_rate": 5.397966970829365e-07, + "loss": 1.0057, + "step": 5726 + }, + { + "epoch": 1.3924142961342085, + "grad_norm": 22.0, + "learning_rate": 5.393990656139438e-07, + "loss": 0.6172, + "step": 5727 + }, + { + "epoch": 1.3926574276683685, + "grad_norm": 17.75, + "learning_rate": 5.390015403565331e-07, + "loss": 0.5187, + "step": 5728 + }, + { + "epoch": 1.3929005592025285, + "grad_norm": 19.75, + "learning_rate": 5.38604121370122e-07, + "loss": 0.5958, + "step": 5729 + }, + { + "epoch": 1.3931436907366885, + "grad_norm": 22.25, + "learning_rate": 5.382068087141105e-07, + "loss": 0.7699, + "step": 5730 + }, + { + "epoch": 1.3933868222708485, + "grad_norm": 20.875, + "learning_rate": 5.378096024478852e-07, + "loss": 0.8332, + "step": 5731 + }, + { + "epoch": 1.3936299538050085, + "grad_norm": 18.5, + "learning_rate": 5.374125026308148e-07, + "loss": 0.854, + "step": 5732 + }, + { + "epoch": 1.3938730853391685, + "grad_norm": 23.0, + "learning_rate": 5.37015509322252e-07, + "loss": 0.8707, + "step": 5733 + }, + { + "epoch": 1.3941162168733285, + "grad_norm": 23.5, + "learning_rate": 5.366186225815356e-07, + "loss": 0.7658, + "step": 5734 + }, + { + "epoch": 1.3943593484074883, + "grad_norm": 18.25, + "learning_rate": 5.362218424679862e-07, + "loss": 0.561, + "step": 5735 + }, + { + "epoch": 1.3946024799416485, + "grad_norm": 16.625, + "learning_rate": 5.358251690409103e-07, + "loss": 0.7389, + "step": 5736 + }, + { + "epoch": 1.3948456114758083, + "grad_norm": 17.875, + "learning_rate": 5.354286023595964e-07, + "loss": 0.4713, + "step": 5737 + }, + { + "epoch": 1.3950887430099683, + "grad_norm": 23.0, + "learning_rate": 5.350321424833196e-07, + "loss": 0.7443, + "step": 5738 + }, + { + "epoch": 1.3953318745441283, + "grad_norm": 17.0, + "learning_rate": 5.34635789471337e-07, + "loss": 0.5654, + "step": 5739 + }, + { + "epoch": 1.3955750060782883, + "grad_norm": 27.375, + "learning_rate": 5.342395433828899e-07, + "loss": 1.1186, + "step": 5740 + }, + { + "epoch": 1.3958181376124483, + "grad_norm": 29.875, + "learning_rate": 5.338434042772052e-07, + "loss": 1.1762, + "step": 5741 + }, + { + "epoch": 1.3960612691466083, + "grad_norm": 32.75, + "learning_rate": 5.334473722134923e-07, + "loss": 0.5788, + "step": 5742 + }, + { + "epoch": 1.3963044006807683, + "grad_norm": 22.625, + "learning_rate": 5.330514472509442e-07, + "loss": 0.9732, + "step": 5743 + }, + { + "epoch": 1.3965475322149283, + "grad_norm": 17.375, + "learning_rate": 5.326556294487396e-07, + "loss": 0.4992, + "step": 5744 + }, + { + "epoch": 1.3967906637490883, + "grad_norm": 18.125, + "learning_rate": 5.322599188660406e-07, + "loss": 0.3687, + "step": 5745 + }, + { + "epoch": 1.3970337952832481, + "grad_norm": 17.5, + "learning_rate": 5.318643155619924e-07, + "loss": 0.5838, + "step": 5746 + }, + { + "epoch": 1.3972769268174083, + "grad_norm": 17.875, + "learning_rate": 5.314688195957245e-07, + "loss": 0.6116, + "step": 5747 + }, + { + "epoch": 1.3975200583515681, + "grad_norm": 17.0, + "learning_rate": 5.310734310263515e-07, + "loss": 0.4648, + "step": 5748 + }, + { + "epoch": 1.3977631898857281, + "grad_norm": 23.375, + "learning_rate": 5.306781499129704e-07, + "loss": 0.5402, + "step": 5749 + }, + { + "epoch": 1.3980063214198881, + "grad_norm": 20.375, + "learning_rate": 5.302829763146625e-07, + "loss": 0.4471, + "step": 5750 + }, + { + "epoch": 1.3982494529540481, + "grad_norm": 19.375, + "learning_rate": 5.298879102904937e-07, + "loss": 0.6008, + "step": 5751 + }, + { + "epoch": 1.3984925844882081, + "grad_norm": 23.25, + "learning_rate": 5.29492951899514e-07, + "loss": 0.8676, + "step": 5752 + }, + { + "epoch": 1.3987357160223681, + "grad_norm": 22.5, + "learning_rate": 5.290981012007563e-07, + "loss": 0.5519, + "step": 5753 + }, + { + "epoch": 1.3989788475565281, + "grad_norm": 23.75, + "learning_rate": 5.28703358253237e-07, + "loss": 0.5874, + "step": 5754 + }, + { + "epoch": 1.3992219790906881, + "grad_norm": 21.125, + "learning_rate": 5.283087231159588e-07, + "loss": 0.533, + "step": 5755 + }, + { + "epoch": 1.3994651106248481, + "grad_norm": 24.625, + "learning_rate": 5.279141958479059e-07, + "loss": 1.2137, + "step": 5756 + }, + { + "epoch": 1.399708242159008, + "grad_norm": 12.9375, + "learning_rate": 5.275197765080469e-07, + "loss": 0.2967, + "step": 5757 + }, + { + "epoch": 1.3999513736931681, + "grad_norm": 17.625, + "learning_rate": 5.271254651553352e-07, + "loss": 0.4688, + "step": 5758 + }, + { + "epoch": 1.400194505227328, + "grad_norm": 19.375, + "learning_rate": 5.26731261848707e-07, + "loss": 0.7406, + "step": 5759 + }, + { + "epoch": 1.400437636761488, + "grad_norm": 20.875, + "learning_rate": 5.263371666470833e-07, + "loss": 0.5208, + "step": 5760 + }, + { + "epoch": 1.400680768295648, + "grad_norm": 19.375, + "learning_rate": 5.259431796093676e-07, + "loss": 0.594, + "step": 5761 + }, + { + "epoch": 1.400923899829808, + "grad_norm": 21.5, + "learning_rate": 5.255493007944492e-07, + "loss": 0.5439, + "step": 5762 + }, + { + "epoch": 1.401167031363968, + "grad_norm": 22.0, + "learning_rate": 5.251555302611995e-07, + "loss": 0.6506, + "step": 5763 + }, + { + "epoch": 1.401410162898128, + "grad_norm": 27.75, + "learning_rate": 5.247618680684738e-07, + "loss": 0.8935, + "step": 5764 + }, + { + "epoch": 1.401653294432288, + "grad_norm": 22.0, + "learning_rate": 5.243683142751127e-07, + "loss": 0.5009, + "step": 5765 + }, + { + "epoch": 1.401896425966448, + "grad_norm": 20.25, + "learning_rate": 5.239748689399387e-07, + "loss": 0.7143, + "step": 5766 + }, + { + "epoch": 1.402139557500608, + "grad_norm": 39.0, + "learning_rate": 5.235815321217598e-07, + "loss": 1.0894, + "step": 5767 + }, + { + "epoch": 1.4023826890347677, + "grad_norm": 25.0, + "learning_rate": 5.231883038793662e-07, + "loss": 0.9148, + "step": 5768 + }, + { + "epoch": 1.402625820568928, + "grad_norm": 21.625, + "learning_rate": 5.227951842715335e-07, + "loss": 0.6216, + "step": 5769 + }, + { + "epoch": 1.4028689521030877, + "grad_norm": 15.625, + "learning_rate": 5.224021733570197e-07, + "loss": 0.4572, + "step": 5770 + }, + { + "epoch": 1.4031120836372477, + "grad_norm": 21.5, + "learning_rate": 5.220092711945668e-07, + "loss": 0.9574, + "step": 5771 + }, + { + "epoch": 1.4033552151714077, + "grad_norm": 23.0, + "learning_rate": 5.216164778429016e-07, + "loss": 0.6536, + "step": 5772 + }, + { + "epoch": 1.4035983467055677, + "grad_norm": 15.5, + "learning_rate": 5.212237933607332e-07, + "loss": 0.5542, + "step": 5773 + }, + { + "epoch": 1.4038414782397277, + "grad_norm": 24.5, + "learning_rate": 5.208312178067551e-07, + "loss": 1.0369, + "step": 5774 + }, + { + "epoch": 1.4040846097738877, + "grad_norm": 26.5, + "learning_rate": 5.204387512396446e-07, + "loss": 0.9562, + "step": 5775 + }, + { + "epoch": 1.4043277413080477, + "grad_norm": 21.0, + "learning_rate": 5.20046393718063e-07, + "loss": 0.9378, + "step": 5776 + }, + { + "epoch": 1.4045708728422075, + "grad_norm": 17.0, + "learning_rate": 5.196541453006547e-07, + "loss": 0.6464, + "step": 5777 + }, + { + "epoch": 1.4048140043763677, + "grad_norm": 18.0, + "learning_rate": 5.192620060460475e-07, + "loss": 0.556, + "step": 5778 + }, + { + "epoch": 1.4050571359105275, + "grad_norm": 21.125, + "learning_rate": 5.188699760128542e-07, + "loss": 0.7421, + "step": 5779 + }, + { + "epoch": 1.4053002674446875, + "grad_norm": 19.375, + "learning_rate": 5.184780552596699e-07, + "loss": 0.7795, + "step": 5780 + }, + { + "epoch": 1.4055433989788475, + "grad_norm": 16.0, + "learning_rate": 5.180862438450739e-07, + "loss": 0.3672, + "step": 5781 + }, + { + "epoch": 1.4057865305130075, + "grad_norm": 15.3125, + "learning_rate": 5.176945418276292e-07, + "loss": 0.6121, + "step": 5782 + }, + { + "epoch": 1.4060296620471675, + "grad_norm": 17.25, + "learning_rate": 5.173029492658829e-07, + "loss": 0.5308, + "step": 5783 + }, + { + "epoch": 1.4062727935813275, + "grad_norm": 13.75, + "learning_rate": 5.16911466218365e-07, + "loss": 0.3052, + "step": 5784 + }, + { + "epoch": 1.4065159251154875, + "grad_norm": 25.0, + "learning_rate": 5.16520092743589e-07, + "loss": 0.9342, + "step": 5785 + }, + { + "epoch": 1.4067590566496475, + "grad_norm": 22.0, + "learning_rate": 5.161288289000532e-07, + "loss": 1.0008, + "step": 5786 + }, + { + "epoch": 1.4070021881838075, + "grad_norm": 23.875, + "learning_rate": 5.157376747462382e-07, + "loss": 0.8272, + "step": 5787 + }, + { + "epoch": 1.4072453197179673, + "grad_norm": 19.25, + "learning_rate": 5.153466303406085e-07, + "loss": 0.9004, + "step": 5788 + }, + { + "epoch": 1.4074884512521275, + "grad_norm": 23.125, + "learning_rate": 5.149556957416129e-07, + "loss": 0.9683, + "step": 5789 + }, + { + "epoch": 1.4077315827862873, + "grad_norm": 16.0, + "learning_rate": 5.145648710076835e-07, + "loss": 0.4709, + "step": 5790 + }, + { + "epoch": 1.4079747143204473, + "grad_norm": 18.625, + "learning_rate": 5.141741561972356e-07, + "loss": 0.7457, + "step": 5791 + }, + { + "epoch": 1.4082178458546073, + "grad_norm": 18.375, + "learning_rate": 5.137835513686677e-07, + "loss": 0.4687, + "step": 5792 + }, + { + "epoch": 1.4084609773887673, + "grad_norm": 29.875, + "learning_rate": 5.133930565803635e-07, + "loss": 0.7572, + "step": 5793 + }, + { + "epoch": 1.4087041089229273, + "grad_norm": 25.125, + "learning_rate": 5.130026718906886e-07, + "loss": 0.6181, + "step": 5794 + }, + { + "epoch": 1.4089472404570873, + "grad_norm": 114.0, + "learning_rate": 5.126123973579923e-07, + "loss": 0.6233, + "step": 5795 + }, + { + "epoch": 1.4091903719912473, + "grad_norm": 23.25, + "learning_rate": 5.122222330406089e-07, + "loss": 1.0922, + "step": 5796 + }, + { + "epoch": 1.4094335035254073, + "grad_norm": 29.125, + "learning_rate": 5.118321789968541e-07, + "loss": 1.1315, + "step": 5797 + }, + { + "epoch": 1.4096766350595673, + "grad_norm": 19.875, + "learning_rate": 5.114422352850291e-07, + "loss": 0.8951, + "step": 5798 + }, + { + "epoch": 1.409919766593727, + "grad_norm": 23.75, + "learning_rate": 5.110524019634171e-07, + "loss": 0.9874, + "step": 5799 + }, + { + "epoch": 1.4101628981278873, + "grad_norm": 22.375, + "learning_rate": 5.10662679090286e-07, + "loss": 0.515, + "step": 5800 + }, + { + "epoch": 1.410406029662047, + "grad_norm": 25.75, + "learning_rate": 5.102730667238862e-07, + "loss": 1.2052, + "step": 5801 + }, + { + "epoch": 1.410649161196207, + "grad_norm": 14.125, + "learning_rate": 5.098835649224519e-07, + "loss": 0.2532, + "step": 5802 + }, + { + "epoch": 1.410892292730367, + "grad_norm": 26.875, + "learning_rate": 5.094941737442014e-07, + "loss": 0.8128, + "step": 5803 + }, + { + "epoch": 1.411135424264527, + "grad_norm": 22.5, + "learning_rate": 5.091048932473356e-07, + "loss": 1.0882, + "step": 5804 + }, + { + "epoch": 1.411378555798687, + "grad_norm": 27.5, + "learning_rate": 5.087157234900389e-07, + "loss": 0.8706, + "step": 5805 + }, + { + "epoch": 1.411621687332847, + "grad_norm": 22.125, + "learning_rate": 5.083266645304796e-07, + "loss": 0.9997, + "step": 5806 + }, + { + "epoch": 1.411864818867007, + "grad_norm": 22.25, + "learning_rate": 5.0793771642681e-07, + "loss": 0.4459, + "step": 5807 + }, + { + "epoch": 1.412107950401167, + "grad_norm": 25.25, + "learning_rate": 5.075488792371644e-07, + "loss": 0.7581, + "step": 5808 + }, + { + "epoch": 1.412351081935327, + "grad_norm": 19.0, + "learning_rate": 5.07160153019661e-07, + "loss": 0.8137, + "step": 5809 + }, + { + "epoch": 1.4125942134694869, + "grad_norm": 17.25, + "learning_rate": 5.067715378324026e-07, + "loss": 0.6121, + "step": 5810 + }, + { + "epoch": 1.412837345003647, + "grad_norm": 19.5, + "learning_rate": 5.063830337334737e-07, + "loss": 0.8765, + "step": 5811 + }, + { + "epoch": 1.4130804765378069, + "grad_norm": 37.25, + "learning_rate": 5.059946407809427e-07, + "loss": 1.1682, + "step": 5812 + }, + { + "epoch": 1.4133236080719669, + "grad_norm": 26.5, + "learning_rate": 5.056063590328619e-07, + "loss": 0.6475, + "step": 5813 + }, + { + "epoch": 1.4135667396061269, + "grad_norm": 19.25, + "learning_rate": 5.052181885472676e-07, + "loss": 0.8764, + "step": 5814 + }, + { + "epoch": 1.413809871140287, + "grad_norm": 14.6875, + "learning_rate": 5.048301293821776e-07, + "loss": 0.4941, + "step": 5815 + }, + { + "epoch": 1.414053002674447, + "grad_norm": 21.125, + "learning_rate": 5.04442181595594e-07, + "loss": 0.9546, + "step": 5816 + }, + { + "epoch": 1.414296134208607, + "grad_norm": 22.5, + "learning_rate": 5.040543452455029e-07, + "loss": 0.7142, + "step": 5817 + }, + { + "epoch": 1.414539265742767, + "grad_norm": 18.25, + "learning_rate": 5.036666203898731e-07, + "loss": 0.5793, + "step": 5818 + }, + { + "epoch": 1.414782397276927, + "grad_norm": 24.875, + "learning_rate": 5.032790070866558e-07, + "loss": 0.6766, + "step": 5819 + }, + { + "epoch": 1.415025528811087, + "grad_norm": 24.0, + "learning_rate": 5.028915053937873e-07, + "loss": 0.8037, + "step": 5820 + }, + { + "epoch": 1.4152686603452467, + "grad_norm": 20.875, + "learning_rate": 5.025041153691868e-07, + "loss": 0.8649, + "step": 5821 + }, + { + "epoch": 1.4155117918794067, + "grad_norm": 16.375, + "learning_rate": 5.021168370707559e-07, + "loss": 0.3932, + "step": 5822 + }, + { + "epoch": 1.4157549234135667, + "grad_norm": 18.25, + "learning_rate": 5.017296705563797e-07, + "loss": 0.5906, + "step": 5823 + }, + { + "epoch": 1.4159980549477267, + "grad_norm": 24.5, + "learning_rate": 5.013426158839277e-07, + "loss": 0.7213, + "step": 5824 + }, + { + "epoch": 1.4162411864818867, + "grad_norm": 28.625, + "learning_rate": 5.009556731112515e-07, + "loss": 0.6516, + "step": 5825 + }, + { + "epoch": 1.4164843180160467, + "grad_norm": 21.125, + "learning_rate": 5.00568842296186e-07, + "loss": 0.7865, + "step": 5826 + }, + { + "epoch": 1.4167274495502067, + "grad_norm": 27.5, + "learning_rate": 5.001821234965507e-07, + "loss": 0.832, + "step": 5827 + }, + { + "epoch": 1.4169705810843667, + "grad_norm": 18.375, + "learning_rate": 4.997955167701463e-07, + "loss": 0.6087, + "step": 5828 + }, + { + "epoch": 1.4172137126185267, + "grad_norm": 19.75, + "learning_rate": 4.994090221747587e-07, + "loss": 0.6195, + "step": 5829 + }, + { + "epoch": 1.4174568441526865, + "grad_norm": 24.125, + "learning_rate": 4.990226397681556e-07, + "loss": 0.5386, + "step": 5830 + }, + { + "epoch": 1.4176999756868467, + "grad_norm": 19.875, + "learning_rate": 4.986363696080893e-07, + "loss": 0.8244, + "step": 5831 + }, + { + "epoch": 1.4179431072210065, + "grad_norm": 20.0, + "learning_rate": 4.98250211752294e-07, + "loss": 0.7723, + "step": 5832 + }, + { + "epoch": 1.4181862387551665, + "grad_norm": 23.75, + "learning_rate": 4.978641662584872e-07, + "loss": 0.6594, + "step": 5833 + }, + { + "epoch": 1.4184293702893265, + "grad_norm": 19.875, + "learning_rate": 4.974782331843711e-07, + "loss": 0.8725, + "step": 5834 + }, + { + "epoch": 1.4186725018234865, + "grad_norm": 18.75, + "learning_rate": 4.970924125876296e-07, + "loss": 1.0599, + "step": 5835 + }, + { + "epoch": 1.4189156333576465, + "grad_norm": 25.125, + "learning_rate": 4.967067045259297e-07, + "loss": 0.8882, + "step": 5836 + }, + { + "epoch": 1.4191587648918065, + "grad_norm": 23.625, + "learning_rate": 4.963211090569227e-07, + "loss": 0.6374, + "step": 5837 + }, + { + "epoch": 1.4194018964259665, + "grad_norm": 23.25, + "learning_rate": 4.959356262382428e-07, + "loss": 1.1284, + "step": 5838 + }, + { + "epoch": 1.4196450279601265, + "grad_norm": 16.375, + "learning_rate": 4.955502561275068e-07, + "loss": 0.6845, + "step": 5839 + }, + { + "epoch": 1.4198881594942865, + "grad_norm": 16.875, + "learning_rate": 4.951649987823141e-07, + "loss": 0.6355, + "step": 5840 + }, + { + "epoch": 1.4201312910284463, + "grad_norm": 21.25, + "learning_rate": 4.947798542602496e-07, + "loss": 0.389, + "step": 5841 + }, + { + "epoch": 1.4203744225626065, + "grad_norm": 81.5, + "learning_rate": 4.943948226188787e-07, + "loss": 0.7621, + "step": 5842 + }, + { + "epoch": 1.4206175540967663, + "grad_norm": 22.25, + "learning_rate": 4.94009903915751e-07, + "loss": 0.7873, + "step": 5843 + }, + { + "epoch": 1.4208606856309263, + "grad_norm": 20.125, + "learning_rate": 4.936250982083996e-07, + "loss": 0.8401, + "step": 5844 + }, + { + "epoch": 1.4211038171650863, + "grad_norm": 20.125, + "learning_rate": 4.932404055543406e-07, + "loss": 0.6991, + "step": 5845 + }, + { + "epoch": 1.4213469486992463, + "grad_norm": 24.25, + "learning_rate": 4.928558260110729e-07, + "loss": 0.8347, + "step": 5846 + }, + { + "epoch": 1.4215900802334063, + "grad_norm": 25.75, + "learning_rate": 4.924713596360778e-07, + "loss": 0.6477, + "step": 5847 + }, + { + "epoch": 1.4218332117675663, + "grad_norm": 17.25, + "learning_rate": 4.920870064868214e-07, + "loss": 0.9365, + "step": 5848 + }, + { + "epoch": 1.4220763433017263, + "grad_norm": 21.75, + "learning_rate": 4.917027666207514e-07, + "loss": 0.6038, + "step": 5849 + }, + { + "epoch": 1.4223194748358863, + "grad_norm": 40.0, + "learning_rate": 4.91318640095299e-07, + "loss": 0.9744, + "step": 5850 + }, + { + "epoch": 1.4225626063700463, + "grad_norm": 18.5, + "learning_rate": 4.909346269678785e-07, + "loss": 0.7813, + "step": 5851 + }, + { + "epoch": 1.422805737904206, + "grad_norm": 18.25, + "learning_rate": 4.905507272958879e-07, + "loss": 0.6914, + "step": 5852 + }, + { + "epoch": 1.4230488694383663, + "grad_norm": 23.875, + "learning_rate": 4.901669411367073e-07, + "loss": 0.435, + "step": 5853 + }, + { + "epoch": 1.423292000972526, + "grad_norm": 24.75, + "learning_rate": 4.897832685476997e-07, + "loss": 0.7319, + "step": 5854 + }, + { + "epoch": 1.423535132506686, + "grad_norm": 22.75, + "learning_rate": 4.893997095862126e-07, + "loss": 0.6193, + "step": 5855 + }, + { + "epoch": 1.423778264040846, + "grad_norm": 18.75, + "learning_rate": 4.890162643095747e-07, + "loss": 0.5112, + "step": 5856 + }, + { + "epoch": 1.424021395575006, + "grad_norm": 24.75, + "learning_rate": 4.886329327750984e-07, + "loss": 0.7389, + "step": 5857 + }, + { + "epoch": 1.424264527109166, + "grad_norm": 23.375, + "learning_rate": 4.8824971504008e-07, + "loss": 0.8028, + "step": 5858 + }, + { + "epoch": 1.424507658643326, + "grad_norm": 17.0, + "learning_rate": 4.878666111617972e-07, + "loss": 0.5618, + "step": 5859 + }, + { + "epoch": 1.424750790177486, + "grad_norm": 33.25, + "learning_rate": 4.874836211975122e-07, + "loss": 0.9848, + "step": 5860 + }, + { + "epoch": 1.424993921711646, + "grad_norm": 32.75, + "learning_rate": 4.871007452044686e-07, + "loss": 0.6187, + "step": 5861 + }, + { + "epoch": 1.425237053245806, + "grad_norm": 19.125, + "learning_rate": 4.867179832398949e-07, + "loss": 0.9497, + "step": 5862 + }, + { + "epoch": 1.4254801847799659, + "grad_norm": 20.75, + "learning_rate": 4.863353353610011e-07, + "loss": 0.821, + "step": 5863 + }, + { + "epoch": 1.425723316314126, + "grad_norm": 25.875, + "learning_rate": 4.859528016249796e-07, + "loss": 0.8392, + "step": 5864 + }, + { + "epoch": 1.4259664478482859, + "grad_norm": 23.75, + "learning_rate": 4.855703820890083e-07, + "loss": 0.7004, + "step": 5865 + }, + { + "epoch": 1.4262095793824459, + "grad_norm": 18.75, + "learning_rate": 4.851880768102453e-07, + "loss": 0.5366, + "step": 5866 + }, + { + "epoch": 1.4264527109166059, + "grad_norm": 19.0, + "learning_rate": 4.848058858458326e-07, + "loss": 0.6864, + "step": 5867 + }, + { + "epoch": 1.4266958424507659, + "grad_norm": 19.0, + "learning_rate": 4.844238092528955e-07, + "loss": 0.4801, + "step": 5868 + }, + { + "epoch": 1.4269389739849259, + "grad_norm": 26.375, + "learning_rate": 4.840418470885426e-07, + "loss": 0.6771, + "step": 5869 + }, + { + "epoch": 1.4271821055190859, + "grad_norm": 21.625, + "learning_rate": 4.836599994098643e-07, + "loss": 0.7433, + "step": 5870 + }, + { + "epoch": 1.4274252370532459, + "grad_norm": 16.25, + "learning_rate": 4.832782662739336e-07, + "loss": 0.5384, + "step": 5871 + }, + { + "epoch": 1.4276683685874059, + "grad_norm": 18.625, + "learning_rate": 4.828966477378083e-07, + "loss": 0.5939, + "step": 5872 + }, + { + "epoch": 1.4279115001215659, + "grad_norm": 18.125, + "learning_rate": 4.825151438585272e-07, + "loss": 0.6243, + "step": 5873 + }, + { + "epoch": 1.4281546316557256, + "grad_norm": 19.75, + "learning_rate": 4.821337546931125e-07, + "loss": 0.5652, + "step": 5874 + }, + { + "epoch": 1.4283977631898856, + "grad_norm": 22.125, + "learning_rate": 4.817524802985697e-07, + "loss": 0.7576, + "step": 5875 + }, + { + "epoch": 1.4286408947240457, + "grad_norm": 19.0, + "learning_rate": 4.813713207318871e-07, + "loss": 0.7209, + "step": 5876 + }, + { + "epoch": 1.4288840262582057, + "grad_norm": 15.8125, + "learning_rate": 4.809902760500354e-07, + "loss": 0.9383, + "step": 5877 + }, + { + "epoch": 1.4291271577923657, + "grad_norm": 36.75, + "learning_rate": 4.806093463099677e-07, + "loss": 0.7053, + "step": 5878 + }, + { + "epoch": 1.4293702893265257, + "grad_norm": 20.625, + "learning_rate": 4.802285315686216e-07, + "loss": 0.6179, + "step": 5879 + }, + { + "epoch": 1.4296134208606857, + "grad_norm": 28.125, + "learning_rate": 4.798478318829157e-07, + "loss": 0.986, + "step": 5880 + }, + { + "epoch": 1.4298565523948457, + "grad_norm": 22.875, + "learning_rate": 4.79467247309752e-07, + "loss": 0.7236, + "step": 5881 + }, + { + "epoch": 1.4300996839290057, + "grad_norm": 18.75, + "learning_rate": 4.790867779060155e-07, + "loss": 0.6398, + "step": 5882 + }, + { + "epoch": 1.4303428154631654, + "grad_norm": 17.625, + "learning_rate": 4.787064237285749e-07, + "loss": 0.5652, + "step": 5883 + }, + { + "epoch": 1.4305859469973257, + "grad_norm": 23.5, + "learning_rate": 4.783261848342799e-07, + "loss": 0.6914, + "step": 5884 + }, + { + "epoch": 1.4308290785314854, + "grad_norm": 22.125, + "learning_rate": 4.779460612799635e-07, + "loss": 0.6014, + "step": 5885 + }, + { + "epoch": 1.4310722100656454, + "grad_norm": 33.5, + "learning_rate": 4.775660531224423e-07, + "loss": 0.7086, + "step": 5886 + }, + { + "epoch": 1.4313153415998054, + "grad_norm": 24.125, + "learning_rate": 4.771861604185149e-07, + "loss": 0.7281, + "step": 5887 + }, + { + "epoch": 1.4315584731339654, + "grad_norm": 25.625, + "learning_rate": 4.7680638322496253e-07, + "loss": 1.0503, + "step": 5888 + }, + { + "epoch": 1.4318016046681254, + "grad_norm": 23.75, + "learning_rate": 4.764267215985499e-07, + "loss": 0.7713, + "step": 5889 + }, + { + "epoch": 1.4320447362022855, + "grad_norm": 28.25, + "learning_rate": 4.7604717559602347e-07, + "loss": 0.9048, + "step": 5890 + }, + { + "epoch": 1.4322878677364455, + "grad_norm": 19.5, + "learning_rate": 4.7566774527411373e-07, + "loss": 1.0033, + "step": 5891 + }, + { + "epoch": 1.4325309992706055, + "grad_norm": 15.75, + "learning_rate": 4.7528843068953224e-07, + "loss": 0.3963, + "step": 5892 + }, + { + "epoch": 1.4327741308047655, + "grad_norm": 21.875, + "learning_rate": 4.7490923189897494e-07, + "loss": 0.9483, + "step": 5893 + }, + { + "epoch": 1.4330172623389252, + "grad_norm": 16.75, + "learning_rate": 4.745301489591191e-07, + "loss": 0.7503, + "step": 5894 + }, + { + "epoch": 1.4332603938730855, + "grad_norm": 18.75, + "learning_rate": 4.741511819266249e-07, + "loss": 0.5521, + "step": 5895 + }, + { + "epoch": 1.4335035254072452, + "grad_norm": 15.6875, + "learning_rate": 4.7377233085813656e-07, + "loss": 0.3149, + "step": 5896 + }, + { + "epoch": 1.4337466569414052, + "grad_norm": 34.0, + "learning_rate": 4.7339359581027907e-07, + "loss": 0.53, + "step": 5897 + }, + { + "epoch": 1.4339897884755652, + "grad_norm": 28.5, + "learning_rate": 4.730149768396604e-07, + "loss": 0.7893, + "step": 5898 + }, + { + "epoch": 1.4342329200097252, + "grad_norm": 23.375, + "learning_rate": 4.726364740028733e-07, + "loss": 0.6011, + "step": 5899 + }, + { + "epoch": 1.4344760515438852, + "grad_norm": 24.25, + "learning_rate": 4.7225808735649067e-07, + "loss": 0.4862, + "step": 5900 + }, + { + "epoch": 1.4347191830780452, + "grad_norm": 28.625, + "learning_rate": 4.7187981695706886e-07, + "loss": 0.9825, + "step": 5901 + }, + { + "epoch": 1.4349623146122052, + "grad_norm": 19.5, + "learning_rate": 4.715016628611467e-07, + "loss": 0.601, + "step": 5902 + }, + { + "epoch": 1.4352054461463652, + "grad_norm": 19.0, + "learning_rate": 4.7112362512524655e-07, + "loss": 0.8823, + "step": 5903 + }, + { + "epoch": 1.4354485776805253, + "grad_norm": 17.75, + "learning_rate": 4.7074570380587226e-07, + "loss": 0.4759, + "step": 5904 + }, + { + "epoch": 1.435691709214685, + "grad_norm": 22.75, + "learning_rate": 4.703678989595099e-07, + "loss": 0.8828, + "step": 5905 + }, + { + "epoch": 1.4359348407488453, + "grad_norm": 19.5, + "learning_rate": 4.6999021064263057e-07, + "loss": 0.5674, + "step": 5906 + }, + { + "epoch": 1.436177972283005, + "grad_norm": 19.0, + "learning_rate": 4.696126389116856e-07, + "loss": 0.7699, + "step": 5907 + }, + { + "epoch": 1.436421103817165, + "grad_norm": 19.25, + "learning_rate": 4.6923518382310933e-07, + "loss": 0.6402, + "step": 5908 + }, + { + "epoch": 1.436664235351325, + "grad_norm": 21.25, + "learning_rate": 4.688578454333188e-07, + "loss": 0.7068, + "step": 5909 + }, + { + "epoch": 1.436907366885485, + "grad_norm": 17.5, + "learning_rate": 4.6848062379871454e-07, + "loss": 0.4421, + "step": 5910 + }, + { + "epoch": 1.437150498419645, + "grad_norm": 19.625, + "learning_rate": 4.681035189756783e-07, + "loss": 0.6486, + "step": 5911 + }, + { + "epoch": 1.437393629953805, + "grad_norm": 20.0, + "learning_rate": 4.6772653102057447e-07, + "loss": 0.5045, + "step": 5912 + }, + { + "epoch": 1.437636761487965, + "grad_norm": 19.625, + "learning_rate": 4.6734965998975106e-07, + "loss": 0.8552, + "step": 5913 + }, + { + "epoch": 1.437879893022125, + "grad_norm": 17.75, + "learning_rate": 4.669729059395381e-07, + "loss": 0.6541, + "step": 5914 + }, + { + "epoch": 1.438123024556285, + "grad_norm": 18.875, + "learning_rate": 4.6659626892624723e-07, + "loss": 0.6457, + "step": 5915 + }, + { + "epoch": 1.4383661560904448, + "grad_norm": 24.875, + "learning_rate": 4.6621974900617444e-07, + "loss": 0.9936, + "step": 5916 + }, + { + "epoch": 1.438609287624605, + "grad_norm": 18.5, + "learning_rate": 4.658433462355963e-07, + "loss": 0.6769, + "step": 5917 + }, + { + "epoch": 1.4388524191587648, + "grad_norm": 20.0, + "learning_rate": 4.65467060670773e-07, + "loss": 0.5245, + "step": 5918 + }, + { + "epoch": 1.4390955506929248, + "grad_norm": 21.375, + "learning_rate": 4.6509089236794645e-07, + "loss": 0.5577, + "step": 5919 + }, + { + "epoch": 1.4393386822270848, + "grad_norm": 18.375, + "learning_rate": 4.647148413833423e-07, + "loss": 0.4768, + "step": 5920 + }, + { + "epoch": 1.4395818137612448, + "grad_norm": 21.25, + "learning_rate": 4.643389077731669e-07, + "loss": 0.9441, + "step": 5921 + }, + { + "epoch": 1.4398249452954048, + "grad_norm": 22.875, + "learning_rate": 4.639630915936108e-07, + "loss": 0.7449, + "step": 5922 + }, + { + "epoch": 1.4400680768295648, + "grad_norm": 19.875, + "learning_rate": 4.635873929008462e-07, + "loss": 0.32, + "step": 5923 + }, + { + "epoch": 1.4403112083637248, + "grad_norm": 20.0, + "learning_rate": 4.6321181175102773e-07, + "loss": 0.458, + "step": 5924 + }, + { + "epoch": 1.4405543398978846, + "grad_norm": 16.125, + "learning_rate": 4.628363482002922e-07, + "loss": 0.5633, + "step": 5925 + }, + { + "epoch": 1.4407974714320448, + "grad_norm": 20.875, + "learning_rate": 4.6246100230475876e-07, + "loss": 0.8196, + "step": 5926 + }, + { + "epoch": 1.4410406029662046, + "grad_norm": 17.125, + "learning_rate": 4.620857741205302e-07, + "loss": 0.3961, + "step": 5927 + }, + { + "epoch": 1.4412837345003646, + "grad_norm": 21.375, + "learning_rate": 4.6171066370369056e-07, + "loss": 0.7585, + "step": 5928 + }, + { + "epoch": 1.4415268660345246, + "grad_norm": 22.125, + "learning_rate": 4.613356711103055e-07, + "loss": 0.8838, + "step": 5929 + }, + { + "epoch": 1.4417699975686846, + "grad_norm": 15.5625, + "learning_rate": 4.60960796396426e-07, + "loss": 0.3827, + "step": 5930 + }, + { + "epoch": 1.4420131291028446, + "grad_norm": 24.75, + "learning_rate": 4.605860396180828e-07, + "loss": 0.7036, + "step": 5931 + }, + { + "epoch": 1.4422562606370046, + "grad_norm": 17.25, + "learning_rate": 4.60211400831289e-07, + "loss": 0.6268, + "step": 5932 + }, + { + "epoch": 1.4424993921711646, + "grad_norm": 27.375, + "learning_rate": 4.598368800920419e-07, + "loss": 0.9218, + "step": 5933 + }, + { + "epoch": 1.4427425237053246, + "grad_norm": 24.125, + "learning_rate": 4.5946247745631973e-07, + "loss": 0.6123, + "step": 5934 + }, + { + "epoch": 1.4429856552394846, + "grad_norm": 19.0, + "learning_rate": 4.590881929800833e-07, + "loss": 0.5335, + "step": 5935 + }, + { + "epoch": 1.4432287867736444, + "grad_norm": 21.375, + "learning_rate": 4.5871402671927523e-07, + "loss": 0.727, + "step": 5936 + }, + { + "epoch": 1.4434719183078046, + "grad_norm": 25.25, + "learning_rate": 4.5833997872982263e-07, + "loss": 0.6745, + "step": 5937 + }, + { + "epoch": 1.4437150498419644, + "grad_norm": 19.125, + "learning_rate": 4.5796604906763254e-07, + "loss": 0.5413, + "step": 5938 + }, + { + "epoch": 1.4439581813761244, + "grad_norm": 22.375, + "learning_rate": 4.5759223778859495e-07, + "loss": 0.7198, + "step": 5939 + }, + { + "epoch": 1.4442013129102844, + "grad_norm": 16.0, + "learning_rate": 4.5721854494858333e-07, + "loss": 0.7373, + "step": 5940 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 26.125, + "learning_rate": 4.568449706034518e-07, + "loss": 0.792, + "step": 5941 + }, + { + "epoch": 1.4446875759786044, + "grad_norm": 20.25, + "learning_rate": 4.5647151480903775e-07, + "loss": 1.1198, + "step": 5942 + }, + { + "epoch": 1.4449307075127644, + "grad_norm": 25.75, + "learning_rate": 4.560981776211601e-07, + "loss": 0.4278, + "step": 5943 + }, + { + "epoch": 1.4451738390469244, + "grad_norm": 21.625, + "learning_rate": 4.55724959095621e-07, + "loss": 0.8805, + "step": 5944 + }, + { + "epoch": 1.4454169705810844, + "grad_norm": 21.875, + "learning_rate": 4.5535185928820474e-07, + "loss": 0.8968, + "step": 5945 + }, + { + "epoch": 1.4456601021152444, + "grad_norm": 21.75, + "learning_rate": 4.5497887825467684e-07, + "loss": 1.1248, + "step": 5946 + }, + { + "epoch": 1.4459032336494042, + "grad_norm": 21.0, + "learning_rate": 4.546060160507863e-07, + "loss": 0.5402, + "step": 5947 + }, + { + "epoch": 1.4461463651835644, + "grad_norm": 21.25, + "learning_rate": 4.5423327273226376e-07, + "loss": 0.6048, + "step": 5948 + }, + { + "epoch": 1.4463894967177242, + "grad_norm": 20.25, + "learning_rate": 4.538606483548215e-07, + "loss": 0.7775, + "step": 5949 + }, + { + "epoch": 1.4466326282518842, + "grad_norm": 17.875, + "learning_rate": 4.5348814297415555e-07, + "loss": 0.6131, + "step": 5950 + }, + { + "epoch": 1.4468757597860442, + "grad_norm": 20.875, + "learning_rate": 4.531157566459429e-07, + "loss": 0.4645, + "step": 5951 + }, + { + "epoch": 1.4471188913202042, + "grad_norm": 17.125, + "learning_rate": 4.527434894258427e-07, + "loss": 0.6405, + "step": 5952 + }, + { + "epoch": 1.4473620228543642, + "grad_norm": 19.375, + "learning_rate": 4.52371341369497e-07, + "loss": 0.5227, + "step": 5953 + }, + { + "epoch": 1.4476051543885242, + "grad_norm": 19.0, + "learning_rate": 4.519993125325303e-07, + "loss": 0.6499, + "step": 5954 + }, + { + "epoch": 1.4478482859226842, + "grad_norm": 19.75, + "learning_rate": 4.516274029705484e-07, + "loss": 0.7574, + "step": 5955 + }, + { + "epoch": 1.4480914174568442, + "grad_norm": 20.375, + "learning_rate": 4.512556127391389e-07, + "loss": 0.5497, + "step": 5956 + }, + { + "epoch": 1.4483345489910042, + "grad_norm": 17.5, + "learning_rate": 4.5088394189387347e-07, + "loss": 0.3428, + "step": 5957 + }, + { + "epoch": 1.448577680525164, + "grad_norm": 21.375, + "learning_rate": 4.5051239049030403e-07, + "loss": 0.6076, + "step": 5958 + }, + { + "epoch": 1.4488208120593242, + "grad_norm": 27.0, + "learning_rate": 4.50140958583965e-07, + "loss": 1.128, + "step": 5959 + }, + { + "epoch": 1.449063943593484, + "grad_norm": 20.875, + "learning_rate": 4.497696462303738e-07, + "loss": 0.6823, + "step": 5960 + }, + { + "epoch": 1.449307075127644, + "grad_norm": 20.25, + "learning_rate": 4.4939845348502985e-07, + "loss": 0.8267, + "step": 5961 + }, + { + "epoch": 1.449550206661804, + "grad_norm": 24.75, + "learning_rate": 4.4902738040341396e-07, + "loss": 0.9122, + "step": 5962 + }, + { + "epoch": 1.449793338195964, + "grad_norm": 24.25, + "learning_rate": 4.4865642704098893e-07, + "loss": 0.6708, + "step": 5963 + }, + { + "epoch": 1.450036469730124, + "grad_norm": 17.375, + "learning_rate": 4.4828559345320085e-07, + "loss": 0.4427, + "step": 5964 + }, + { + "epoch": 1.450279601264284, + "grad_norm": 21.625, + "learning_rate": 4.47914879695477e-07, + "loss": 0.8135, + "step": 5965 + }, + { + "epoch": 1.450522732798444, + "grad_norm": 19.125, + "learning_rate": 4.475442858232264e-07, + "loss": 0.5175, + "step": 5966 + }, + { + "epoch": 1.450765864332604, + "grad_norm": 14.9375, + "learning_rate": 4.4717381189184116e-07, + "loss": 0.3037, + "step": 5967 + }, + { + "epoch": 1.451008995866764, + "grad_norm": 18.0, + "learning_rate": 4.468034579566956e-07, + "loss": 0.7232, + "step": 5968 + }, + { + "epoch": 1.4512521274009238, + "grad_norm": 17.875, + "learning_rate": 4.464332240731448e-07, + "loss": 0.4817, + "step": 5969 + }, + { + "epoch": 1.4514952589350838, + "grad_norm": 25.5, + "learning_rate": 4.460631102965263e-07, + "loss": 0.6684, + "step": 5970 + }, + { + "epoch": 1.4517383904692438, + "grad_norm": 23.875, + "learning_rate": 4.4569311668216083e-07, + "loss": 0.5933, + "step": 5971 + }, + { + "epoch": 1.4519815220034038, + "grad_norm": 24.125, + "learning_rate": 4.453232432853501e-07, + "loss": 0.8794, + "step": 5972 + }, + { + "epoch": 1.4522246535375638, + "grad_norm": 23.375, + "learning_rate": 4.449534901613774e-07, + "loss": 0.9315, + "step": 5973 + }, + { + "epoch": 1.4524677850717238, + "grad_norm": 16.125, + "learning_rate": 4.4458385736550964e-07, + "loss": 0.3689, + "step": 5974 + }, + { + "epoch": 1.4527109166058838, + "grad_norm": 35.75, + "learning_rate": 4.44214344952994e-07, + "loss": 1.4904, + "step": 5975 + }, + { + "epoch": 1.4529540481400438, + "grad_norm": 23.75, + "learning_rate": 4.438449529790613e-07, + "loss": 0.9447, + "step": 5976 + }, + { + "epoch": 1.4531971796742038, + "grad_norm": 19.0, + "learning_rate": 4.4347568149892274e-07, + "loss": 0.6315, + "step": 5977 + }, + { + "epoch": 1.4534403112083636, + "grad_norm": 19.75, + "learning_rate": 4.4310653056777313e-07, + "loss": 0.5131, + "step": 5978 + }, + { + "epoch": 1.4536834427425238, + "grad_norm": 18.25, + "learning_rate": 4.427375002407881e-07, + "loss": 0.5167, + "step": 5979 + }, + { + "epoch": 1.4539265742766836, + "grad_norm": 19.625, + "learning_rate": 4.4236859057312523e-07, + "loss": 0.9916, + "step": 5980 + }, + { + "epoch": 1.4541697058108436, + "grad_norm": 20.375, + "learning_rate": 4.419998016199251e-07, + "loss": 0.9532, + "step": 5981 + }, + { + "epoch": 1.4544128373450036, + "grad_norm": 19.25, + "learning_rate": 4.416311334363092e-07, + "loss": 0.7733, + "step": 5982 + }, + { + "epoch": 1.4546559688791636, + "grad_norm": 23.125, + "learning_rate": 4.4126258607738115e-07, + "loss": 0.7843, + "step": 5983 + }, + { + "epoch": 1.4548991004133236, + "grad_norm": 30.5, + "learning_rate": 4.408941595982269e-07, + "loss": 0.7897, + "step": 5984 + }, + { + "epoch": 1.4551422319474836, + "grad_norm": 24.625, + "learning_rate": 4.4052585405391464e-07, + "loss": 0.6895, + "step": 5985 + }, + { + "epoch": 1.4553853634816436, + "grad_norm": 24.875, + "learning_rate": 4.401576694994937e-07, + "loss": 1.0368, + "step": 5986 + }, + { + "epoch": 1.4556284950158036, + "grad_norm": 20.0, + "learning_rate": 4.3978960598999505e-07, + "loss": 0.7658, + "step": 5987 + }, + { + "epoch": 1.4558716265499636, + "grad_norm": 23.25, + "learning_rate": 4.39421663580433e-07, + "loss": 0.469, + "step": 5988 + }, + { + "epoch": 1.4561147580841234, + "grad_norm": 27.25, + "learning_rate": 4.390538423258024e-07, + "loss": 1.2069, + "step": 5989 + }, + { + "epoch": 1.4563578896182836, + "grad_norm": 24.625, + "learning_rate": 4.386861422810802e-07, + "loss": 0.7784, + "step": 5990 + }, + { + "epoch": 1.4566010211524434, + "grad_norm": 22.125, + "learning_rate": 4.3831856350122603e-07, + "loss": 0.6391, + "step": 5991 + }, + { + "epoch": 1.4568441526866034, + "grad_norm": 21.625, + "learning_rate": 4.3795110604118117e-07, + "loss": 0.8035, + "step": 5992 + }, + { + "epoch": 1.4570872842207634, + "grad_norm": 19.875, + "learning_rate": 4.37583769955868e-07, + "loss": 0.75, + "step": 5993 + }, + { + "epoch": 1.4573304157549234, + "grad_norm": 21.75, + "learning_rate": 4.37216555300191e-07, + "loss": 0.8879, + "step": 5994 + }, + { + "epoch": 1.4575735472890834, + "grad_norm": 19.875, + "learning_rate": 4.3684946212903743e-07, + "loss": 0.6677, + "step": 5995 + }, + { + "epoch": 1.4578166788232434, + "grad_norm": 15.125, + "learning_rate": 4.364824904972753e-07, + "loss": 0.5898, + "step": 5996 + }, + { + "epoch": 1.4580598103574034, + "grad_norm": 23.25, + "learning_rate": 4.3611564045975453e-07, + "loss": 0.9381, + "step": 5997 + }, + { + "epoch": 1.4583029418915634, + "grad_norm": 18.5, + "learning_rate": 4.3574891207130767e-07, + "loss": 0.6625, + "step": 5998 + }, + { + "epoch": 1.4585460734257234, + "grad_norm": 17.0, + "learning_rate": 4.353823053867487e-07, + "loss": 0.4623, + "step": 5999 + }, + { + "epoch": 1.4587892049598832, + "grad_norm": 19.375, + "learning_rate": 4.350158204608733e-07, + "loss": 0.4713, + "step": 6000 + }, + { + "epoch": 1.4590323364940434, + "grad_norm": 19.375, + "learning_rate": 4.346494573484583e-07, + "loss": 0.7481, + "step": 6001 + }, + { + "epoch": 1.4592754680282032, + "grad_norm": 18.625, + "learning_rate": 4.34283216104264e-07, + "loss": 0.5707, + "step": 6002 + }, + { + "epoch": 1.4595185995623632, + "grad_norm": 26.125, + "learning_rate": 4.3391709678303083e-07, + "loss": 0.6832, + "step": 6003 + }, + { + "epoch": 1.4597617310965232, + "grad_norm": 28.5, + "learning_rate": 4.335510994394815e-07, + "loss": 0.9882, + "step": 6004 + }, + { + "epoch": 1.4600048626306832, + "grad_norm": 24.25, + "learning_rate": 4.3318522412832117e-07, + "loss": 1.2792, + "step": 6005 + }, + { + "epoch": 1.4602479941648432, + "grad_norm": 13.75, + "learning_rate": 4.3281947090423564e-07, + "loss": 0.2988, + "step": 6006 + }, + { + "epoch": 1.4604911256990032, + "grad_norm": 23.5, + "learning_rate": 4.324538398218939e-07, + "loss": 0.861, + "step": 6007 + }, + { + "epoch": 1.4607342572331632, + "grad_norm": 21.625, + "learning_rate": 4.320883309359447e-07, + "loss": 1.0645, + "step": 6008 + }, + { + "epoch": 1.4609773887673232, + "grad_norm": 12.3125, + "learning_rate": 4.3172294430102083e-07, + "loss": 0.2965, + "step": 6009 + }, + { + "epoch": 1.4612205203014832, + "grad_norm": 17.25, + "learning_rate": 4.3135767997173506e-07, + "loss": 0.5699, + "step": 6010 + }, + { + "epoch": 1.461463651835643, + "grad_norm": 15.6875, + "learning_rate": 4.3099253800268194e-07, + "loss": 0.3311, + "step": 6011 + }, + { + "epoch": 1.4617067833698032, + "grad_norm": 19.125, + "learning_rate": 4.306275184484393e-07, + "loss": 0.8128, + "step": 6012 + }, + { + "epoch": 1.461949914903963, + "grad_norm": 18.375, + "learning_rate": 4.302626213635645e-07, + "loss": 0.6445, + "step": 6013 + }, + { + "epoch": 1.462193046438123, + "grad_norm": 24.625, + "learning_rate": 4.2989784680259895e-07, + "loss": 0.9853, + "step": 6014 + }, + { + "epoch": 1.462436177972283, + "grad_norm": 20.875, + "learning_rate": 4.295331948200633e-07, + "loss": 0.5928, + "step": 6015 + }, + { + "epoch": 1.462679309506443, + "grad_norm": 22.875, + "learning_rate": 4.29168665470462e-07, + "loss": 0.9678, + "step": 6016 + }, + { + "epoch": 1.462922441040603, + "grad_norm": 20.375, + "learning_rate": 4.2880425880828e-07, + "loss": 0.7878, + "step": 6017 + }, + { + "epoch": 1.463165572574763, + "grad_norm": 24.5, + "learning_rate": 4.284399748879836e-07, + "loss": 0.9565, + "step": 6018 + }, + { + "epoch": 1.463408704108923, + "grad_norm": 24.875, + "learning_rate": 4.2807581376402214e-07, + "loss": 1.0191, + "step": 6019 + }, + { + "epoch": 1.463651835643083, + "grad_norm": 43.25, + "learning_rate": 4.2771177549082557e-07, + "loss": 0.7639, + "step": 6020 + }, + { + "epoch": 1.463894967177243, + "grad_norm": 21.25, + "learning_rate": 4.2734786012280495e-07, + "loss": 0.8329, + "step": 6021 + }, + { + "epoch": 1.4641380987114028, + "grad_norm": 15.75, + "learning_rate": 4.269840677143544e-07, + "loss": 0.5673, + "step": 6022 + }, + { + "epoch": 1.4643812302455628, + "grad_norm": 19.875, + "learning_rate": 4.2662039831984944e-07, + "loss": 0.7499, + "step": 6023 + }, + { + "epoch": 1.4646243617797228, + "grad_norm": 29.0, + "learning_rate": 4.2625685199364606e-07, + "loss": 0.7239, + "step": 6024 + }, + { + "epoch": 1.4648674933138828, + "grad_norm": 15.1875, + "learning_rate": 4.258934287900825e-07, + "loss": 0.5465, + "step": 6025 + }, + { + "epoch": 1.4651106248480428, + "grad_norm": 20.25, + "learning_rate": 4.2553012876347904e-07, + "loss": 0.7416, + "step": 6026 + }, + { + "epoch": 1.4653537563822028, + "grad_norm": 24.625, + "learning_rate": 4.251669519681369e-07, + "loss": 0.6116, + "step": 6027 + }, + { + "epoch": 1.4655968879163628, + "grad_norm": 23.25, + "learning_rate": 4.2480389845833884e-07, + "loss": 0.8547, + "step": 6028 + }, + { + "epoch": 1.4658400194505228, + "grad_norm": 20.25, + "learning_rate": 4.244409682883498e-07, + "loss": 0.8097, + "step": 6029 + }, + { + "epoch": 1.4660831509846828, + "grad_norm": 17.875, + "learning_rate": 4.240781615124162e-07, + "loss": 0.8004, + "step": 6030 + }, + { + "epoch": 1.4663262825188426, + "grad_norm": 17.125, + "learning_rate": 4.237154781847656e-07, + "loss": 0.6866, + "step": 6031 + }, + { + "epoch": 1.4665694140530028, + "grad_norm": 21.5, + "learning_rate": 4.233529183596069e-07, + "loss": 0.5433, + "step": 6032 + }, + { + "epoch": 1.4668125455871626, + "grad_norm": 25.0, + "learning_rate": 4.2299048209113154e-07, + "loss": 0.7478, + "step": 6033 + }, + { + "epoch": 1.4670556771213226, + "grad_norm": 21.125, + "learning_rate": 4.226281694335117e-07, + "loss": 0.5145, + "step": 6034 + }, + { + "epoch": 1.4672988086554826, + "grad_norm": 23.0, + "learning_rate": 4.2226598044090057e-07, + "loss": 0.8473, + "step": 6035 + }, + { + "epoch": 1.4675419401896426, + "grad_norm": 20.625, + "learning_rate": 4.2190391516743464e-07, + "loss": 1.1187, + "step": 6036 + }, + { + "epoch": 1.4677850717238026, + "grad_norm": 16.375, + "learning_rate": 4.2154197366722994e-07, + "loss": 0.6487, + "step": 6037 + }, + { + "epoch": 1.4680282032579626, + "grad_norm": 18.25, + "learning_rate": 4.2118015599438555e-07, + "loss": 0.7943, + "step": 6038 + }, + { + "epoch": 1.4682713347921226, + "grad_norm": 17.75, + "learning_rate": 4.2081846220298066e-07, + "loss": 0.6841, + "step": 6039 + }, + { + "epoch": 1.4685144663262826, + "grad_norm": 21.625, + "learning_rate": 4.204568923470775e-07, + "loss": 0.6428, + "step": 6040 + }, + { + "epoch": 1.4687575978604426, + "grad_norm": 19.625, + "learning_rate": 4.2009544648071847e-07, + "loss": 0.5299, + "step": 6041 + }, + { + "epoch": 1.4690007293946024, + "grad_norm": 17.5, + "learning_rate": 4.197341246579276e-07, + "loss": 0.2902, + "step": 6042 + }, + { + "epoch": 1.4692438609287626, + "grad_norm": 25.625, + "learning_rate": 4.1937292693271123e-07, + "loss": 0.7123, + "step": 6043 + }, + { + "epoch": 1.4694869924629224, + "grad_norm": 18.75, + "learning_rate": 4.1901185335905604e-07, + "loss": 0.5246, + "step": 6044 + }, + { + "epoch": 1.4697301239970824, + "grad_norm": 21.375, + "learning_rate": 4.1865090399093146e-07, + "loss": 0.6615, + "step": 6045 + }, + { + "epoch": 1.4699732555312424, + "grad_norm": 15.3125, + "learning_rate": 4.182900788822866e-07, + "loss": 0.5837, + "step": 6046 + }, + { + "epoch": 1.4702163870654024, + "grad_norm": 23.75, + "learning_rate": 4.1792937808705413e-07, + "loss": 0.948, + "step": 6047 + }, + { + "epoch": 1.4704595185995624, + "grad_norm": 20.25, + "learning_rate": 4.175688016591464e-07, + "loss": 0.9462, + "step": 6048 + }, + { + "epoch": 1.4707026501337224, + "grad_norm": 22.5, + "learning_rate": 4.1720834965245747e-07, + "loss": 0.6936, + "step": 6049 + }, + { + "epoch": 1.4709457816678824, + "grad_norm": 34.25, + "learning_rate": 4.16848022120864e-07, + "loss": 0.7489, + "step": 6050 + }, + { + "epoch": 1.4711889132020424, + "grad_norm": 22.25, + "learning_rate": 4.164878191182226e-07, + "loss": 0.8976, + "step": 6051 + }, + { + "epoch": 1.4714320447362024, + "grad_norm": 27.625, + "learning_rate": 4.161277406983713e-07, + "loss": 0.8001, + "step": 6052 + }, + { + "epoch": 1.4716751762703622, + "grad_norm": 14.5625, + "learning_rate": 4.1576778691513084e-07, + "loss": 0.2567, + "step": 6053 + }, + { + "epoch": 1.4719183078045224, + "grad_norm": 18.625, + "learning_rate": 4.1540795782230264e-07, + "loss": 0.88, + "step": 6054 + }, + { + "epoch": 1.4721614393386822, + "grad_norm": 33.5, + "learning_rate": 4.150482534736691e-07, + "loss": 1.0864, + "step": 6055 + }, + { + "epoch": 1.4724045708728422, + "grad_norm": 24.0, + "learning_rate": 4.146886739229937e-07, + "loss": 0.9677, + "step": 6056 + }, + { + "epoch": 1.4726477024070022, + "grad_norm": 20.25, + "learning_rate": 4.143292192240228e-07, + "loss": 0.4961, + "step": 6057 + }, + { + "epoch": 1.4728908339411622, + "grad_norm": 14.5, + "learning_rate": 4.139698894304825e-07, + "loss": 0.3054, + "step": 6058 + }, + { + "epoch": 1.4731339654753222, + "grad_norm": 18.25, + "learning_rate": 4.1361068459608065e-07, + "loss": 0.788, + "step": 6059 + }, + { + "epoch": 1.4733770970094822, + "grad_norm": 25.75, + "learning_rate": 4.1325160477450694e-07, + "loss": 0.8181, + "step": 6060 + }, + { + "epoch": 1.4736202285436422, + "grad_norm": 15.8125, + "learning_rate": 4.1289265001943244e-07, + "loss": 0.6519, + "step": 6061 + }, + { + "epoch": 1.4738633600778022, + "grad_norm": 24.625, + "learning_rate": 4.1253382038450874e-07, + "loss": 0.6285, + "step": 6062 + }, + { + "epoch": 1.4741064916119622, + "grad_norm": 26.0, + "learning_rate": 4.121751159233686e-07, + "loss": 0.6115, + "step": 6063 + }, + { + "epoch": 1.474349623146122, + "grad_norm": 16.625, + "learning_rate": 4.118165366896278e-07, + "loss": 0.5134, + "step": 6064 + }, + { + "epoch": 1.4745927546802822, + "grad_norm": 19.75, + "learning_rate": 4.114580827368812e-07, + "loss": 0.8648, + "step": 6065 + }, + { + "epoch": 1.474835886214442, + "grad_norm": 24.75, + "learning_rate": 4.1109975411870603e-07, + "loss": 1.2879, + "step": 6066 + }, + { + "epoch": 1.475079017748602, + "grad_norm": 20.375, + "learning_rate": 4.107415508886613e-07, + "loss": 0.6188, + "step": 6067 + }, + { + "epoch": 1.475322149282762, + "grad_norm": 17.375, + "learning_rate": 4.1038347310028595e-07, + "loss": 0.5522, + "step": 6068 + }, + { + "epoch": 1.475565280816922, + "grad_norm": 22.25, + "learning_rate": 4.1002552080710146e-07, + "loss": 0.8245, + "step": 6069 + }, + { + "epoch": 1.475808412351082, + "grad_norm": 18.875, + "learning_rate": 4.0966769406260943e-07, + "loss": 0.6155, + "step": 6070 + }, + { + "epoch": 1.476051543885242, + "grad_norm": 27.25, + "learning_rate": 4.09309992920294e-07, + "loss": 0.6586, + "step": 6071 + }, + { + "epoch": 1.476294675419402, + "grad_norm": 21.125, + "learning_rate": 4.089524174336193e-07, + "loss": 0.6301, + "step": 6072 + }, + { + "epoch": 1.476537806953562, + "grad_norm": 14.875, + "learning_rate": 4.085949676560308e-07, + "loss": 0.5514, + "step": 6073 + }, + { + "epoch": 1.476780938487722, + "grad_norm": 33.5, + "learning_rate": 4.0823764364095643e-07, + "loss": 1.1112, + "step": 6074 + }, + { + "epoch": 1.4770240700218817, + "grad_norm": 26.375, + "learning_rate": 4.0788044544180354e-07, + "loss": 0.8851, + "step": 6075 + }, + { + "epoch": 1.4772672015560417, + "grad_norm": 19.375, + "learning_rate": 4.0752337311196224e-07, + "loss": 0.6695, + "step": 6076 + }, + { + "epoch": 1.4775103330902017, + "grad_norm": 20.25, + "learning_rate": 4.071664267048027e-07, + "loss": 1.2643, + "step": 6077 + }, + { + "epoch": 1.4777534646243617, + "grad_norm": 26.0, + "learning_rate": 4.0680960627367717e-07, + "loss": 0.9361, + "step": 6078 + }, + { + "epoch": 1.4779965961585217, + "grad_norm": 19.375, + "learning_rate": 4.0645291187191847e-07, + "loss": 0.6155, + "step": 6079 + }, + { + "epoch": 1.4782397276926817, + "grad_norm": 20.625, + "learning_rate": 4.060963435528402e-07, + "loss": 0.4762, + "step": 6080 + }, + { + "epoch": 1.4784828592268418, + "grad_norm": 20.25, + "learning_rate": 4.0573990136973855e-07, + "loss": 0.6792, + "step": 6081 + }, + { + "epoch": 1.4787259907610018, + "grad_norm": 18.5, + "learning_rate": 4.053835853758896e-07, + "loss": 0.6455, + "step": 6082 + }, + { + "epoch": 1.4789691222951618, + "grad_norm": 20.0, + "learning_rate": 4.0502739562455026e-07, + "loss": 0.764, + "step": 6083 + }, + { + "epoch": 1.4792122538293215, + "grad_norm": 16.375, + "learning_rate": 4.0467133216895994e-07, + "loss": 0.6791, + "step": 6084 + }, + { + "epoch": 1.4794553853634818, + "grad_norm": 16.75, + "learning_rate": 4.043153950623388e-07, + "loss": 0.5095, + "step": 6085 + }, + { + "epoch": 1.4796985168976415, + "grad_norm": 25.0, + "learning_rate": 4.0395958435788725e-07, + "loss": 0.9203, + "step": 6086 + }, + { + "epoch": 1.4799416484318015, + "grad_norm": 13.125, + "learning_rate": 4.0360390010878714e-07, + "loss": 0.3822, + "step": 6087 + }, + { + "epoch": 1.4801847799659615, + "grad_norm": 19.25, + "learning_rate": 4.0324834236820224e-07, + "loss": 0.7481, + "step": 6088 + }, + { + "epoch": 1.4804279115001215, + "grad_norm": 22.375, + "learning_rate": 4.0289291118927655e-07, + "loss": 0.4939, + "step": 6089 + }, + { + "epoch": 1.4806710430342815, + "grad_norm": 26.0, + "learning_rate": 4.025376066251348e-07, + "loss": 0.739, + "step": 6090 + }, + { + "epoch": 1.4809141745684415, + "grad_norm": 24.0, + "learning_rate": 4.0218242872888407e-07, + "loss": 0.6906, + "step": 6091 + }, + { + "epoch": 1.4811573061026015, + "grad_norm": 20.375, + "learning_rate": 4.018273775536119e-07, + "loss": 0.6155, + "step": 6092 + }, + { + "epoch": 1.4814004376367615, + "grad_norm": 18.25, + "learning_rate": 4.0147245315238685e-07, + "loss": 1.0447, + "step": 6093 + }, + { + "epoch": 1.4816435691709215, + "grad_norm": 16.875, + "learning_rate": 4.011176555782577e-07, + "loss": 0.2817, + "step": 6094 + }, + { + "epoch": 1.4818867007050813, + "grad_norm": 31.875, + "learning_rate": 4.0076298488425604e-07, + "loss": 0.7011, + "step": 6095 + }, + { + "epoch": 1.4821298322392416, + "grad_norm": 29.375, + "learning_rate": 4.0040844112339315e-07, + "loss": 0.723, + "step": 6096 + }, + { + "epoch": 1.4823729637734013, + "grad_norm": 19.375, + "learning_rate": 4.000540243486613e-07, + "loss": 0.729, + "step": 6097 + }, + { + "epoch": 1.4826160953075613, + "grad_norm": 19.875, + "learning_rate": 3.996997346130345e-07, + "loss": 0.5732, + "step": 6098 + }, + { + "epoch": 1.4828592268417213, + "grad_norm": 19.375, + "learning_rate": 3.99345571969468e-07, + "loss": 0.5748, + "step": 6099 + }, + { + "epoch": 1.4831023583758813, + "grad_norm": 17.875, + "learning_rate": 3.9899153647089713e-07, + "loss": 0.916, + "step": 6100 + }, + { + "epoch": 1.4833454899100413, + "grad_norm": 20.625, + "learning_rate": 3.986376281702381e-07, + "loss": 0.5299, + "step": 6101 + }, + { + "epoch": 1.4835886214442013, + "grad_norm": 20.125, + "learning_rate": 3.9828384712038956e-07, + "loss": 0.5411, + "step": 6102 + }, + { + "epoch": 1.4838317529783613, + "grad_norm": 18.75, + "learning_rate": 3.979301933742295e-07, + "loss": 0.9038, + "step": 6103 + }, + { + "epoch": 1.4840748845125213, + "grad_norm": 21.25, + "learning_rate": 3.975766669846176e-07, + "loss": 0.7698, + "step": 6104 + }, + { + "epoch": 1.4843180160466813, + "grad_norm": 32.5, + "learning_rate": 3.97223268004395e-07, + "loss": 0.8023, + "step": 6105 + }, + { + "epoch": 1.4845611475808411, + "grad_norm": 20.5, + "learning_rate": 3.968699964863826e-07, + "loss": 0.7003, + "step": 6106 + }, + { + "epoch": 1.4848042791150013, + "grad_norm": 18.75, + "learning_rate": 3.965168524833837e-07, + "loss": 0.8199, + "step": 6107 + }, + { + "epoch": 1.4850474106491611, + "grad_norm": 24.625, + "learning_rate": 3.9616383604818094e-07, + "loss": 1.1406, + "step": 6108 + }, + { + "epoch": 1.4852905421833211, + "grad_norm": 21.875, + "learning_rate": 3.958109472335396e-07, + "loss": 0.6098, + "step": 6109 + }, + { + "epoch": 1.4855336737174811, + "grad_norm": 23.25, + "learning_rate": 3.9545818609220436e-07, + "loss": 0.8669, + "step": 6110 + }, + { + "epoch": 1.4857768052516411, + "grad_norm": 21.125, + "learning_rate": 3.951055526769014e-07, + "loss": 0.525, + "step": 6111 + }, + { + "epoch": 1.4860199367858011, + "grad_norm": 23.625, + "learning_rate": 3.947530470403385e-07, + "loss": 0.6387, + "step": 6112 + }, + { + "epoch": 1.4862630683199611, + "grad_norm": 18.25, + "learning_rate": 3.9440066923520345e-07, + "loss": 0.693, + "step": 6113 + }, + { + "epoch": 1.4865061998541211, + "grad_norm": 21.875, + "learning_rate": 3.9404841931416457e-07, + "loss": 0.5969, + "step": 6114 + }, + { + "epoch": 1.4867493313882811, + "grad_norm": 24.25, + "learning_rate": 3.936962973298723e-07, + "loss": 0.9472, + "step": 6115 + }, + { + "epoch": 1.4869924629224411, + "grad_norm": 16.25, + "learning_rate": 3.9334430333495764e-07, + "loss": 0.8326, + "step": 6116 + }, + { + "epoch": 1.487235594456601, + "grad_norm": 19.375, + "learning_rate": 3.929924373820318e-07, + "loss": 0.7791, + "step": 6117 + }, + { + "epoch": 1.487478725990761, + "grad_norm": 24.0, + "learning_rate": 3.9264069952368674e-07, + "loss": 0.6789, + "step": 6118 + }, + { + "epoch": 1.487721857524921, + "grad_norm": 19.0, + "learning_rate": 3.9228908981249677e-07, + "loss": 0.4578, + "step": 6119 + }, + { + "epoch": 1.487964989059081, + "grad_norm": 17.125, + "learning_rate": 3.919376083010154e-07, + "loss": 0.4755, + "step": 6120 + }, + { + "epoch": 1.488208120593241, + "grad_norm": 19.75, + "learning_rate": 3.9158625504177746e-07, + "loss": 0.6604, + "step": 6121 + }, + { + "epoch": 1.488451252127401, + "grad_norm": 19.375, + "learning_rate": 3.9123503008729884e-07, + "loss": 0.5169, + "step": 6122 + }, + { + "epoch": 1.488694383661561, + "grad_norm": 22.75, + "learning_rate": 3.90883933490077e-07, + "loss": 0.7933, + "step": 6123 + }, + { + "epoch": 1.488937515195721, + "grad_norm": 22.5, + "learning_rate": 3.9053296530258854e-07, + "loss": 0.7724, + "step": 6124 + }, + { + "epoch": 1.489180646729881, + "grad_norm": 17.0, + "learning_rate": 3.9018212557729154e-07, + "loss": 0.574, + "step": 6125 + }, + { + "epoch": 1.4894237782640407, + "grad_norm": 23.125, + "learning_rate": 3.8983141436662597e-07, + "loss": 0.9739, + "step": 6126 + }, + { + "epoch": 1.489666909798201, + "grad_norm": 18.875, + "learning_rate": 3.8948083172301103e-07, + "loss": 0.6615, + "step": 6127 + }, + { + "epoch": 1.4899100413323607, + "grad_norm": 23.125, + "learning_rate": 3.891303776988471e-07, + "loss": 0.7575, + "step": 6128 + }, + { + "epoch": 1.4901531728665207, + "grad_norm": 30.0, + "learning_rate": 3.88780052346516e-07, + "loss": 0.7343, + "step": 6129 + }, + { + "epoch": 1.4903963044006807, + "grad_norm": 14.5625, + "learning_rate": 3.8842985571838023e-07, + "loss": 0.3929, + "step": 6130 + }, + { + "epoch": 1.4906394359348407, + "grad_norm": 16.25, + "learning_rate": 3.880797878667823e-07, + "loss": 0.3027, + "step": 6131 + }, + { + "epoch": 1.4908825674690007, + "grad_norm": 22.25, + "learning_rate": 3.8772984884404564e-07, + "loss": 0.8387, + "step": 6132 + }, + { + "epoch": 1.4911256990031607, + "grad_norm": 18.25, + "learning_rate": 3.873800387024754e-07, + "loss": 0.5991, + "step": 6133 + }, + { + "epoch": 1.4913688305373207, + "grad_norm": 26.125, + "learning_rate": 3.8703035749435624e-07, + "loss": 0.4715, + "step": 6134 + }, + { + "epoch": 1.4916119620714807, + "grad_norm": 17.375, + "learning_rate": 3.866808052719538e-07, + "loss": 0.748, + "step": 6135 + }, + { + "epoch": 1.4918550936056407, + "grad_norm": 17.125, + "learning_rate": 3.863313820875154e-07, + "loss": 0.5773, + "step": 6136 + }, + { + "epoch": 1.4920982251398005, + "grad_norm": 18.25, + "learning_rate": 3.859820879932677e-07, + "loss": 0.6522, + "step": 6137 + }, + { + "epoch": 1.4923413566739607, + "grad_norm": 19.25, + "learning_rate": 3.856329230414195e-07, + "loss": 0.5437, + "step": 6138 + }, + { + "epoch": 1.4925844882081205, + "grad_norm": 20.125, + "learning_rate": 3.852838872841586e-07, + "loss": 0.6105, + "step": 6139 + }, + { + "epoch": 1.4928276197422805, + "grad_norm": 18.25, + "learning_rate": 3.8493498077365534e-07, + "loss": 0.5001, + "step": 6140 + }, + { + "epoch": 1.4930707512764405, + "grad_norm": 18.75, + "learning_rate": 3.8458620356205927e-07, + "loss": 0.8182, + "step": 6141 + }, + { + "epoch": 1.4933138828106005, + "grad_norm": 20.375, + "learning_rate": 3.84237555701501e-07, + "loss": 0.8294, + "step": 6142 + }, + { + "epoch": 1.4935570143447605, + "grad_norm": 22.125, + "learning_rate": 3.8388903724409275e-07, + "loss": 0.552, + "step": 6143 + }, + { + "epoch": 1.4938001458789205, + "grad_norm": 20.125, + "learning_rate": 3.83540648241926e-07, + "loss": 0.7927, + "step": 6144 + }, + { + "epoch": 1.4940432774130805, + "grad_norm": 37.0, + "learning_rate": 3.831923887470733e-07, + "loss": 1.1963, + "step": 6145 + }, + { + "epoch": 1.4942864089472405, + "grad_norm": 18.75, + "learning_rate": 3.828442588115884e-07, + "loss": 0.5773, + "step": 6146 + }, + { + "epoch": 1.4945295404814005, + "grad_norm": 21.625, + "learning_rate": 3.824962584875057e-07, + "loss": 0.9266, + "step": 6147 + }, + { + "epoch": 1.4947726720155603, + "grad_norm": 16.625, + "learning_rate": 3.821483878268395e-07, + "loss": 0.4933, + "step": 6148 + }, + { + "epoch": 1.4950158035497205, + "grad_norm": 24.625, + "learning_rate": 3.8180064688158464e-07, + "loss": 0.9639, + "step": 6149 + }, + { + "epoch": 1.4952589350838803, + "grad_norm": 22.125, + "learning_rate": 3.8145303570371774e-07, + "loss": 0.7591, + "step": 6150 + }, + { + "epoch": 1.4955020666180403, + "grad_norm": 25.875, + "learning_rate": 3.811055543451951e-07, + "loss": 0.6283, + "step": 6151 + }, + { + "epoch": 1.4957451981522003, + "grad_norm": 16.25, + "learning_rate": 3.807582028579532e-07, + "loss": 0.5153, + "step": 6152 + }, + { + "epoch": 1.4959883296863603, + "grad_norm": 32.5, + "learning_rate": 3.8041098129391023e-07, + "loss": 1.0004, + "step": 6153 + }, + { + "epoch": 1.4962314612205203, + "grad_norm": 34.0, + "learning_rate": 3.8006388970496493e-07, + "loss": 0.7645, + "step": 6154 + }, + { + "epoch": 1.4964745927546803, + "grad_norm": 23.625, + "learning_rate": 3.797169281429955e-07, + "loss": 0.9273, + "step": 6155 + }, + { + "epoch": 1.4967177242888403, + "grad_norm": 16.875, + "learning_rate": 3.793700966598611e-07, + "loss": 0.6612, + "step": 6156 + }, + { + "epoch": 1.4969608558230003, + "grad_norm": 25.75, + "learning_rate": 3.7902339530740255e-07, + "loss": 0.8358, + "step": 6157 + }, + { + "epoch": 1.4972039873571603, + "grad_norm": 17.25, + "learning_rate": 3.7867682413743974e-07, + "loss": 0.9172, + "step": 6158 + }, + { + "epoch": 1.49744711889132, + "grad_norm": 20.625, + "learning_rate": 3.7833038320177345e-07, + "loss": 0.3885, + "step": 6159 + }, + { + "epoch": 1.4976902504254803, + "grad_norm": 23.25, + "learning_rate": 3.7798407255218565e-07, + "loss": 0.8555, + "step": 6160 + }, + { + "epoch": 1.49793338195964, + "grad_norm": 20.375, + "learning_rate": 3.776378922404389e-07, + "loss": 0.6206, + "step": 6161 + }, + { + "epoch": 1.4981765134938, + "grad_norm": 19.375, + "learning_rate": 3.772918423182753e-07, + "loss": 1.0621, + "step": 6162 + }, + { + "epoch": 1.49841964502796, + "grad_norm": 18.125, + "learning_rate": 3.7694592283741767e-07, + "loss": 0.6415, + "step": 6163 + }, + { + "epoch": 1.49866277656212, + "grad_norm": 19.75, + "learning_rate": 3.766001338495705e-07, + "loss": 0.7096, + "step": 6164 + }, + { + "epoch": 1.49890590809628, + "grad_norm": 23.625, + "learning_rate": 3.762544754064175e-07, + "loss": 0.6475, + "step": 6165 + }, + { + "epoch": 1.49914903963044, + "grad_norm": 18.0, + "learning_rate": 3.759089475596227e-07, + "loss": 0.7357, + "step": 6166 + }, + { + "epoch": 1.4993921711646, + "grad_norm": 16.375, + "learning_rate": 3.7556355036083225e-07, + "loss": 0.4537, + "step": 6167 + }, + { + "epoch": 1.49963530269876, + "grad_norm": 16.625, + "learning_rate": 3.752182838616708e-07, + "loss": 0.508, + "step": 6168 + }, + { + "epoch": 1.49987843423292, + "grad_norm": 22.375, + "learning_rate": 3.7487314811374494e-07, + "loss": 0.9527, + "step": 6169 + }, + { + "epoch": 1.50012156576708, + "grad_norm": 19.375, + "learning_rate": 3.7452814316864134e-07, + "loss": 0.8152, + "step": 6170 + }, + { + "epoch": 1.5003646973012401, + "grad_norm": 17.625, + "learning_rate": 3.7418326907792664e-07, + "loss": 0.5876, + "step": 6171 + }, + { + "epoch": 1.5006078288354, + "grad_norm": 15.75, + "learning_rate": 3.738385258931483e-07, + "loss": 0.4343, + "step": 6172 + }, + { + "epoch": 1.50085096036956, + "grad_norm": 25.75, + "learning_rate": 3.7349391366583375e-07, + "loss": 0.6182, + "step": 6173 + }, + { + "epoch": 1.50109409190372, + "grad_norm": 15.25, + "learning_rate": 3.73149432447492e-07, + "loss": 0.2298, + "step": 6174 + }, + { + "epoch": 1.50133722343788, + "grad_norm": 17.0, + "learning_rate": 3.728050822896112e-07, + "loss": 0.3318, + "step": 6175 + }, + { + "epoch": 1.50158035497204, + "grad_norm": 16.875, + "learning_rate": 3.7246086324365977e-07, + "loss": 0.4708, + "step": 6176 + }, + { + "epoch": 1.5018234865062, + "grad_norm": 20.5, + "learning_rate": 3.721167753610888e-07, + "loss": 0.3161, + "step": 6177 + }, + { + "epoch": 1.50206661804036, + "grad_norm": 20.875, + "learning_rate": 3.717728186933273e-07, + "loss": 0.762, + "step": 6178 + }, + { + "epoch": 1.5023097495745197, + "grad_norm": 21.875, + "learning_rate": 3.714289932917856e-07, + "loss": 1.0256, + "step": 6179 + }, + { + "epoch": 1.50255288110868, + "grad_norm": 14.6875, + "learning_rate": 3.710852992078538e-07, + "loss": 0.3279, + "step": 6180 + }, + { + "epoch": 1.5027960126428397, + "grad_norm": 21.75, + "learning_rate": 3.707417364929039e-07, + "loss": 0.7004, + "step": 6181 + }, + { + "epoch": 1.503039144177, + "grad_norm": 21.625, + "learning_rate": 3.703983051982868e-07, + "loss": 0.5921, + "step": 6182 + }, + { + "epoch": 1.5032822757111597, + "grad_norm": 19.875, + "learning_rate": 3.700550053753334e-07, + "loss": 0.429, + "step": 6183 + }, + { + "epoch": 1.5035254072453197, + "grad_norm": 23.625, + "learning_rate": 3.6971183707535733e-07, + "loss": 0.8703, + "step": 6184 + }, + { + "epoch": 1.5037685387794797, + "grad_norm": 19.125, + "learning_rate": 3.693688003496504e-07, + "loss": 0.7276, + "step": 6185 + }, + { + "epoch": 1.5040116703136397, + "grad_norm": 24.0, + "learning_rate": 3.690258952494849e-07, + "loss": 0.5193, + "step": 6186 + }, + { + "epoch": 1.5042548018477997, + "grad_norm": 20.75, + "learning_rate": 3.686831218261147e-07, + "loss": 0.562, + "step": 6187 + }, + { + "epoch": 1.5044979333819597, + "grad_norm": 16.5, + "learning_rate": 3.6834048013077275e-07, + "loss": 0.3605, + "step": 6188 + }, + { + "epoch": 1.5047410649161197, + "grad_norm": 19.5, + "learning_rate": 3.6799797021467283e-07, + "loss": 0.6409, + "step": 6189 + }, + { + "epoch": 1.5049841964502795, + "grad_norm": 21.125, + "learning_rate": 3.6765559212900867e-07, + "loss": 0.6381, + "step": 6190 + }, + { + "epoch": 1.5052273279844397, + "grad_norm": 20.875, + "learning_rate": 3.6731334592495477e-07, + "loss": 0.7849, + "step": 6191 + }, + { + "epoch": 1.5054704595185995, + "grad_norm": 18.875, + "learning_rate": 3.6697123165366637e-07, + "loss": 0.4924, + "step": 6192 + }, + { + "epoch": 1.5057135910527597, + "grad_norm": 19.75, + "learning_rate": 3.6662924936627735e-07, + "loss": 0.5789, + "step": 6193 + }, + { + "epoch": 1.5059567225869195, + "grad_norm": 18.0, + "learning_rate": 3.6628739911390377e-07, + "loss": 0.5383, + "step": 6194 + }, + { + "epoch": 1.5061998541210795, + "grad_norm": 20.375, + "learning_rate": 3.659456809476407e-07, + "loss": 0.6224, + "step": 6195 + }, + { + "epoch": 1.5064429856552395, + "grad_norm": 15.9375, + "learning_rate": 3.656040949185637e-07, + "loss": 0.4936, + "step": 6196 + }, + { + "epoch": 1.5066861171893995, + "grad_norm": 17.125, + "learning_rate": 3.652626410777282e-07, + "loss": 0.6152, + "step": 6197 + }, + { + "epoch": 1.5069292487235595, + "grad_norm": 24.0, + "learning_rate": 3.649213194761715e-07, + "loss": 0.9093, + "step": 6198 + }, + { + "epoch": 1.5071723802577193, + "grad_norm": 21.375, + "learning_rate": 3.6458013016490905e-07, + "loss": 0.5359, + "step": 6199 + }, + { + "epoch": 1.5074155117918795, + "grad_norm": 27.875, + "learning_rate": 3.6423907319493787e-07, + "loss": 0.7859, + "step": 6200 + }, + { + "epoch": 1.5076586433260393, + "grad_norm": 21.625, + "learning_rate": 3.638981486172352e-07, + "loss": 0.7023, + "step": 6201 + }, + { + "epoch": 1.5079017748601995, + "grad_norm": 24.25, + "learning_rate": 3.6355735648275785e-07, + "loss": 0.8522, + "step": 6202 + }, + { + "epoch": 1.5081449063943593, + "grad_norm": 14.5, + "learning_rate": 3.6321669684244274e-07, + "loss": 0.2743, + "step": 6203 + }, + { + "epoch": 1.5083880379285193, + "grad_norm": 17.0, + "learning_rate": 3.628761697472073e-07, + "loss": 0.4714, + "step": 6204 + }, + { + "epoch": 1.5086311694626793, + "grad_norm": 16.5, + "learning_rate": 3.625357752479498e-07, + "loss": 0.4457, + "step": 6205 + }, + { + "epoch": 1.5088743009968393, + "grad_norm": 18.5, + "learning_rate": 3.621955133955478e-07, + "loss": 0.8612, + "step": 6206 + }, + { + "epoch": 1.5091174325309993, + "grad_norm": 24.0, + "learning_rate": 3.618553842408584e-07, + "loss": 0.6889, + "step": 6207 + }, + { + "epoch": 1.5093605640651593, + "grad_norm": 19.375, + "learning_rate": 3.6151538783472134e-07, + "loss": 0.5208, + "step": 6208 + }, + { + "epoch": 1.5096036955993193, + "grad_norm": 16.125, + "learning_rate": 3.611755242279544e-07, + "loss": 0.6766, + "step": 6209 + }, + { + "epoch": 1.509846827133479, + "grad_norm": 19.0, + "learning_rate": 3.6083579347135534e-07, + "loss": 0.8017, + "step": 6210 + }, + { + "epoch": 1.5100899586676393, + "grad_norm": 20.875, + "learning_rate": 3.604961956157038e-07, + "loss": 0.7553, + "step": 6211 + }, + { + "epoch": 1.510333090201799, + "grad_norm": 24.5, + "learning_rate": 3.6015673071175804e-07, + "loss": 0.9282, + "step": 6212 + }, + { + "epoch": 1.5105762217359593, + "grad_norm": 19.75, + "learning_rate": 3.5981739881025715e-07, + "loss": 0.518, + "step": 6213 + }, + { + "epoch": 1.510819353270119, + "grad_norm": 19.25, + "learning_rate": 3.59478199961919e-07, + "loss": 0.3966, + "step": 6214 + }, + { + "epoch": 1.511062484804279, + "grad_norm": 14.625, + "learning_rate": 3.5913913421744453e-07, + "loss": 0.649, + "step": 6215 + }, + { + "epoch": 1.511305616338439, + "grad_norm": 18.0, + "learning_rate": 3.588002016275123e-07, + "loss": 0.5706, + "step": 6216 + }, + { + "epoch": 1.511548747872599, + "grad_norm": 17.25, + "learning_rate": 3.58461402242781e-07, + "loss": 0.505, + "step": 6217 + }, + { + "epoch": 1.511791879406759, + "grad_norm": 30.75, + "learning_rate": 3.581227361138909e-07, + "loss": 0.8332, + "step": 6218 + }, + { + "epoch": 1.512035010940919, + "grad_norm": 27.25, + "learning_rate": 3.577842032914612e-07, + "loss": 0.8551, + "step": 6219 + }, + { + "epoch": 1.512278142475079, + "grad_norm": 25.875, + "learning_rate": 3.574458038260914e-07, + "loss": 0.9446, + "step": 6220 + }, + { + "epoch": 1.5125212740092389, + "grad_norm": 23.25, + "learning_rate": 3.571075377683608e-07, + "loss": 1.0182, + "step": 6221 + }, + { + "epoch": 1.512764405543399, + "grad_norm": 17.125, + "learning_rate": 3.5676940516882933e-07, + "loss": 0.6912, + "step": 6222 + }, + { + "epoch": 1.5130075370775589, + "grad_norm": 17.25, + "learning_rate": 3.5643140607803745e-07, + "loss": 0.5258, + "step": 6223 + }, + { + "epoch": 1.513250668611719, + "grad_norm": 14.4375, + "learning_rate": 3.5609354054650395e-07, + "loss": 0.3123, + "step": 6224 + }, + { + "epoch": 1.5134938001458789, + "grad_norm": 25.875, + "learning_rate": 3.557558086247295e-07, + "loss": 1.3475, + "step": 6225 + }, + { + "epoch": 1.5137369316800389, + "grad_norm": 29.5, + "learning_rate": 3.5541821036319355e-07, + "loss": 0.3728, + "step": 6226 + }, + { + "epoch": 1.5139800632141989, + "grad_norm": 26.0, + "learning_rate": 3.550807458123556e-07, + "loss": 0.6712, + "step": 6227 + }, + { + "epoch": 1.5142231947483589, + "grad_norm": 23.0, + "learning_rate": 3.547434150226564e-07, + "loss": 0.7177, + "step": 6228 + }, + { + "epoch": 1.5144663262825189, + "grad_norm": 19.5, + "learning_rate": 3.5440621804451555e-07, + "loss": 0.8626, + "step": 6229 + }, + { + "epoch": 1.5147094578166789, + "grad_norm": 22.5, + "learning_rate": 3.5406915492833233e-07, + "loss": 0.832, + "step": 6230 + }, + { + "epoch": 1.5149525893508389, + "grad_norm": 20.125, + "learning_rate": 3.5373222572448724e-07, + "loss": 0.893, + "step": 6231 + }, + { + "epoch": 1.5151957208849987, + "grad_norm": 19.75, + "learning_rate": 3.5339543048334047e-07, + "loss": 0.42, + "step": 6232 + }, + { + "epoch": 1.5154388524191589, + "grad_norm": 32.75, + "learning_rate": 3.5305876925523146e-07, + "loss": 0.7095, + "step": 6233 + }, + { + "epoch": 1.5156819839533187, + "grad_norm": 21.125, + "learning_rate": 3.527222420904798e-07, + "loss": 0.696, + "step": 6234 + }, + { + "epoch": 1.5159251154874789, + "grad_norm": 23.625, + "learning_rate": 3.5238584903938584e-07, + "loss": 0.8977, + "step": 6235 + }, + { + "epoch": 1.5161682470216387, + "grad_norm": 24.0, + "learning_rate": 3.5204959015222916e-07, + "loss": 0.9621, + "step": 6236 + }, + { + "epoch": 1.5164113785557987, + "grad_norm": 18.875, + "learning_rate": 3.5171346547926907e-07, + "loss": 0.4414, + "step": 6237 + }, + { + "epoch": 1.5166545100899587, + "grad_norm": 19.375, + "learning_rate": 3.5137747507074537e-07, + "loss": 0.4407, + "step": 6238 + }, + { + "epoch": 1.5168976416241187, + "grad_norm": 21.625, + "learning_rate": 3.510416189768782e-07, + "loss": 0.8799, + "step": 6239 + }, + { + "epoch": 1.5171407731582787, + "grad_norm": 22.875, + "learning_rate": 3.5070589724786666e-07, + "loss": 1.233, + "step": 6240 + }, + { + "epoch": 1.5173839046924384, + "grad_norm": 32.5, + "learning_rate": 3.5037030993388965e-07, + "loss": 0.9792, + "step": 6241 + }, + { + "epoch": 1.5176270362265987, + "grad_norm": 20.5, + "learning_rate": 3.500348570851074e-07, + "loss": 0.6742, + "step": 6242 + }, + { + "epoch": 1.5178701677607584, + "grad_norm": 23.375, + "learning_rate": 3.496995387516587e-07, + "loss": 0.7058, + "step": 6243 + }, + { + "epoch": 1.5181132992949187, + "grad_norm": 17.75, + "learning_rate": 3.493643549836623e-07, + "loss": 0.5027, + "step": 6244 + }, + { + "epoch": 1.5183564308290785, + "grad_norm": 20.0, + "learning_rate": 3.490293058312175e-07, + "loss": 0.8238, + "step": 6245 + }, + { + "epoch": 1.5185995623632387, + "grad_norm": 19.875, + "learning_rate": 3.486943913444037e-07, + "loss": 0.4994, + "step": 6246 + }, + { + "epoch": 1.5188426938973985, + "grad_norm": 20.0, + "learning_rate": 3.4835961157327915e-07, + "loss": 0.8607, + "step": 6247 + }, + { + "epoch": 1.5190858254315585, + "grad_norm": 19.125, + "learning_rate": 3.480249665678821e-07, + "loss": 0.7661, + "step": 6248 + }, + { + "epoch": 1.5193289569657185, + "grad_norm": 25.375, + "learning_rate": 3.4769045637823184e-07, + "loss": 0.8371, + "step": 6249 + }, + { + "epoch": 1.5195720884998785, + "grad_norm": 16.25, + "learning_rate": 3.473560810543264e-07, + "loss": 0.5626, + "step": 6250 + }, + { + "epoch": 1.5198152200340385, + "grad_norm": 20.0, + "learning_rate": 3.470218406461434e-07, + "loss": 0.8862, + "step": 6251 + }, + { + "epoch": 1.5200583515681982, + "grad_norm": 22.75, + "learning_rate": 3.4668773520364173e-07, + "loss": 0.5772, + "step": 6252 + }, + { + "epoch": 1.5203014831023585, + "grad_norm": 20.5, + "learning_rate": 3.463537647767583e-07, + "loss": 0.9129, + "step": 6253 + }, + { + "epoch": 1.5205446146365182, + "grad_norm": 16.875, + "learning_rate": 3.4601992941541167e-07, + "loss": 0.4075, + "step": 6254 + }, + { + "epoch": 1.5207877461706785, + "grad_norm": 20.0, + "learning_rate": 3.4568622916949844e-07, + "loss": 1.0263, + "step": 6255 + }, + { + "epoch": 1.5210308777048382, + "grad_norm": 17.25, + "learning_rate": 3.453526640888967e-07, + "loss": 0.6099, + "step": 6256 + }, + { + "epoch": 1.5212740092389982, + "grad_norm": 16.25, + "learning_rate": 3.4501923422346304e-07, + "loss": 0.635, + "step": 6257 + }, + { + "epoch": 1.5215171407731582, + "grad_norm": 22.5, + "learning_rate": 3.446859396230341e-07, + "loss": 0.6274, + "step": 6258 + }, + { + "epoch": 1.5217602723073183, + "grad_norm": 16.375, + "learning_rate": 3.443527803374272e-07, + "loss": 0.5064, + "step": 6259 + }, + { + "epoch": 1.5220034038414783, + "grad_norm": 20.0, + "learning_rate": 3.4401975641643824e-07, + "loss": 0.8685, + "step": 6260 + }, + { + "epoch": 1.5222465353756383, + "grad_norm": 14.5625, + "learning_rate": 3.436868679098432e-07, + "loss": 0.3722, + "step": 6261 + }, + { + "epoch": 1.5224896669097983, + "grad_norm": 16.75, + "learning_rate": 3.433541148673983e-07, + "loss": 0.5688, + "step": 6262 + }, + { + "epoch": 1.522732798443958, + "grad_norm": 21.375, + "learning_rate": 3.430214973388396e-07, + "loss": 0.5267, + "step": 6263 + }, + { + "epoch": 1.5229759299781183, + "grad_norm": 20.5, + "learning_rate": 3.4268901537388223e-07, + "loss": 0.8173, + "step": 6264 + }, + { + "epoch": 1.523219061512278, + "grad_norm": 23.375, + "learning_rate": 3.4235666902222105e-07, + "loss": 0.5716, + "step": 6265 + }, + { + "epoch": 1.5234621930464383, + "grad_norm": 19.25, + "learning_rate": 3.4202445833353136e-07, + "loss": 0.6776, + "step": 6266 + }, + { + "epoch": 1.523705324580598, + "grad_norm": 23.375, + "learning_rate": 3.4169238335746786e-07, + "loss": 0.7741, + "step": 6267 + }, + { + "epoch": 1.523948456114758, + "grad_norm": 32.5, + "learning_rate": 3.4136044414366426e-07, + "loss": 1.1893, + "step": 6268 + }, + { + "epoch": 1.524191587648918, + "grad_norm": 23.875, + "learning_rate": 3.4102864074173486e-07, + "loss": 1.0365, + "step": 6269 + }, + { + "epoch": 1.524434719183078, + "grad_norm": 16.25, + "learning_rate": 3.406969732012741e-07, + "loss": 0.3805, + "step": 6270 + }, + { + "epoch": 1.524677850717238, + "grad_norm": 26.125, + "learning_rate": 3.403654415718549e-07, + "loss": 1.1491, + "step": 6271 + }, + { + "epoch": 1.524920982251398, + "grad_norm": 25.25, + "learning_rate": 3.4003404590303005e-07, + "loss": 1.0536, + "step": 6272 + }, + { + "epoch": 1.525164113785558, + "grad_norm": 17.125, + "learning_rate": 3.3970278624433306e-07, + "loss": 0.5627, + "step": 6273 + }, + { + "epoch": 1.5254072453197178, + "grad_norm": 21.0, + "learning_rate": 3.393716626452759e-07, + "loss": 0.4166, + "step": 6274 + }, + { + "epoch": 1.525650376853878, + "grad_norm": 20.25, + "learning_rate": 3.3904067515535056e-07, + "loss": 0.6059, + "step": 6275 + }, + { + "epoch": 1.5258935083880378, + "grad_norm": 21.875, + "learning_rate": 3.3870982382402927e-07, + "loss": 0.6766, + "step": 6276 + }, + { + "epoch": 1.526136639922198, + "grad_norm": 25.5, + "learning_rate": 3.383791087007636e-07, + "loss": 0.769, + "step": 6277 + }, + { + "epoch": 1.5263797714563578, + "grad_norm": 20.75, + "learning_rate": 3.380485298349843e-07, + "loss": 0.5913, + "step": 6278 + }, + { + "epoch": 1.5266229029905178, + "grad_norm": 24.5, + "learning_rate": 3.3771808727610186e-07, + "loss": 0.716, + "step": 6279 + }, + { + "epoch": 1.5268660345246778, + "grad_norm": 15.875, + "learning_rate": 3.3738778107350724e-07, + "loss": 0.5866, + "step": 6280 + }, + { + "epoch": 1.5271091660588378, + "grad_norm": 18.625, + "learning_rate": 3.370576112765703e-07, + "loss": 0.7001, + "step": 6281 + }, + { + "epoch": 1.5273522975929978, + "grad_norm": 29.5, + "learning_rate": 3.3672757793463974e-07, + "loss": 0.7475, + "step": 6282 + }, + { + "epoch": 1.5275954291271578, + "grad_norm": 24.0, + "learning_rate": 3.3639768109704607e-07, + "loss": 0.882, + "step": 6283 + }, + { + "epoch": 1.5278385606613178, + "grad_norm": 24.5, + "learning_rate": 3.3606792081309693e-07, + "loss": 0.617, + "step": 6284 + }, + { + "epoch": 1.5280816921954776, + "grad_norm": 18.25, + "learning_rate": 3.357382971320815e-07, + "loss": 0.6626, + "step": 6285 + }, + { + "epoch": 1.5283248237296378, + "grad_norm": 20.625, + "learning_rate": 3.354088101032671e-07, + "loss": 0.7945, + "step": 6286 + }, + { + "epoch": 1.5285679552637976, + "grad_norm": 23.625, + "learning_rate": 3.350794597759019e-07, + "loss": 0.9075, + "step": 6287 + }, + { + "epoch": 1.5288110867979579, + "grad_norm": 15.375, + "learning_rate": 3.347502461992126e-07, + "loss": 0.6195, + "step": 6288 + }, + { + "epoch": 1.5290542183321176, + "grad_norm": 19.75, + "learning_rate": 3.3442116942240575e-07, + "loss": 0.6145, + "step": 6289 + }, + { + "epoch": 1.5292973498662776, + "grad_norm": 16.625, + "learning_rate": 3.3409222949466786e-07, + "loss": 0.6245, + "step": 6290 + }, + { + "epoch": 1.5295404814004376, + "grad_norm": 24.875, + "learning_rate": 3.337634264651647e-07, + "loss": 0.5847, + "step": 6291 + }, + { + "epoch": 1.5297836129345976, + "grad_norm": 21.5, + "learning_rate": 3.3343476038304095e-07, + "loss": 0.8813, + "step": 6292 + }, + { + "epoch": 1.5300267444687576, + "grad_norm": 41.75, + "learning_rate": 3.3310623129742205e-07, + "loss": 0.7915, + "step": 6293 + }, + { + "epoch": 1.5302698760029174, + "grad_norm": 23.75, + "learning_rate": 3.327778392574124e-07, + "loss": 0.8187, + "step": 6294 + }, + { + "epoch": 1.5305130075370776, + "grad_norm": 30.625, + "learning_rate": 3.324495843120956e-07, + "loss": 0.7529, + "step": 6295 + }, + { + "epoch": 1.5307561390712374, + "grad_norm": 23.5, + "learning_rate": 3.321214665105349e-07, + "loss": 0.8982, + "step": 6296 + }, + { + "epoch": 1.5309992706053976, + "grad_norm": 17.5, + "learning_rate": 3.3179348590177353e-07, + "loss": 0.4632, + "step": 6297 + }, + { + "epoch": 1.5312424021395574, + "grad_norm": 24.875, + "learning_rate": 3.314656425348338e-07, + "loss": 0.6157, + "step": 6298 + }, + { + "epoch": 1.5314855336737176, + "grad_norm": 21.125, + "learning_rate": 3.3113793645871696e-07, + "loss": 0.847, + "step": 6299 + }, + { + "epoch": 1.5317286652078774, + "grad_norm": 24.0, + "learning_rate": 3.308103677224049e-07, + "loss": 0.7655, + "step": 6300 + }, + { + "epoch": 1.5319717967420374, + "grad_norm": 27.25, + "learning_rate": 3.3048293637485865e-07, + "loss": 0.9015, + "step": 6301 + }, + { + "epoch": 1.5322149282761974, + "grad_norm": 25.375, + "learning_rate": 3.301556424650182e-07, + "loss": 1.109, + "step": 6302 + }, + { + "epoch": 1.5324580598103574, + "grad_norm": 18.75, + "learning_rate": 3.298284860418027e-07, + "loss": 0.7065, + "step": 6303 + }, + { + "epoch": 1.5327011913445174, + "grad_norm": 24.125, + "learning_rate": 3.2950146715411246e-07, + "loss": 1.0324, + "step": 6304 + }, + { + "epoch": 1.5329443228786772, + "grad_norm": 27.0, + "learning_rate": 3.291745858508255e-07, + "loss": 1.2628, + "step": 6305 + }, + { + "epoch": 1.5331874544128374, + "grad_norm": 21.125, + "learning_rate": 3.2884784218079944e-07, + "loss": 0.8716, + "step": 6306 + }, + { + "epoch": 1.5334305859469972, + "grad_norm": 15.75, + "learning_rate": 3.2852123619287217e-07, + "loss": 0.7188, + "step": 6307 + }, + { + "epoch": 1.5336737174811574, + "grad_norm": 19.875, + "learning_rate": 3.2819476793586114e-07, + "loss": 0.6904, + "step": 6308 + }, + { + "epoch": 1.5339168490153172, + "grad_norm": 14.6875, + "learning_rate": 3.2786843745856207e-07, + "loss": 0.3485, + "step": 6309 + }, + { + "epoch": 1.5341599805494772, + "grad_norm": 33.5, + "learning_rate": 3.2754224480975055e-07, + "loss": 0.4056, + "step": 6310 + }, + { + "epoch": 1.5344031120836372, + "grad_norm": 22.875, + "learning_rate": 3.272161900381822e-07, + "loss": 0.8901, + "step": 6311 + }, + { + "epoch": 1.5346462436177972, + "grad_norm": 16.375, + "learning_rate": 3.2689027319259133e-07, + "loss": 0.5373, + "step": 6312 + }, + { + "epoch": 1.5348893751519572, + "grad_norm": 29.25, + "learning_rate": 3.265644943216913e-07, + "loss": 0.679, + "step": 6313 + }, + { + "epoch": 1.5351325066861172, + "grad_norm": 22.125, + "learning_rate": 3.262388534741763e-07, + "loss": 0.8547, + "step": 6314 + }, + { + "epoch": 1.5353756382202772, + "grad_norm": 21.875, + "learning_rate": 3.259133506987182e-07, + "loss": 0.7246, + "step": 6315 + }, + { + "epoch": 1.535618769754437, + "grad_norm": 24.625, + "learning_rate": 3.255879860439698e-07, + "loss": 1.0471, + "step": 6316 + }, + { + "epoch": 1.5358619012885972, + "grad_norm": 22.125, + "learning_rate": 3.252627595585615e-07, + "loss": 1.1357, + "step": 6317 + }, + { + "epoch": 1.536105032822757, + "grad_norm": 29.25, + "learning_rate": 3.2493767129110507e-07, + "loss": 0.9973, + "step": 6318 + }, + { + "epoch": 1.5363481643569172, + "grad_norm": 23.25, + "learning_rate": 3.2461272129019006e-07, + "loss": 0.6529, + "step": 6319 + }, + { + "epoch": 1.536591295891077, + "grad_norm": 26.125, + "learning_rate": 3.2428790960438536e-07, + "loss": 0.8078, + "step": 6320 + }, + { + "epoch": 1.536834427425237, + "grad_norm": 36.5, + "learning_rate": 3.239632362822406e-07, + "loss": 0.5222, + "step": 6321 + }, + { + "epoch": 1.537077558959397, + "grad_norm": 21.125, + "learning_rate": 3.2363870137228305e-07, + "loss": 0.9279, + "step": 6322 + }, + { + "epoch": 1.537320690493557, + "grad_norm": 19.75, + "learning_rate": 3.233143049230207e-07, + "loss": 0.7125, + "step": 6323 + }, + { + "epoch": 1.537563822027717, + "grad_norm": 18.75, + "learning_rate": 3.229900469829396e-07, + "loss": 0.5521, + "step": 6324 + }, + { + "epoch": 1.537806953561877, + "grad_norm": 18.0, + "learning_rate": 3.2266592760050635e-07, + "loss": 0.6292, + "step": 6325 + }, + { + "epoch": 1.538050085096037, + "grad_norm": 23.0, + "learning_rate": 3.223419468241658e-07, + "loss": 0.6819, + "step": 6326 + }, + { + "epoch": 1.5382932166301968, + "grad_norm": 20.875, + "learning_rate": 3.220181047023421e-07, + "loss": 1.0832, + "step": 6327 + }, + { + "epoch": 1.538536348164357, + "grad_norm": 27.5, + "learning_rate": 3.216944012834398e-07, + "loss": 0.6635, + "step": 6328 + }, + { + "epoch": 1.5387794796985168, + "grad_norm": 24.25, + "learning_rate": 3.2137083661584175e-07, + "loss": 0.9284, + "step": 6329 + }, + { + "epoch": 1.539022611232677, + "grad_norm": 22.25, + "learning_rate": 3.210474107479097e-07, + "loss": 0.8478, + "step": 6330 + }, + { + "epoch": 1.5392657427668368, + "grad_norm": 26.625, + "learning_rate": 3.2072412372798565e-07, + "loss": 0.6834, + "step": 6331 + }, + { + "epoch": 1.5395088743009968, + "grad_norm": 35.75, + "learning_rate": 3.204009756043909e-07, + "loss": 1.0705, + "step": 6332 + }, + { + "epoch": 1.5397520058351568, + "grad_norm": 20.125, + "learning_rate": 3.2007796642542513e-07, + "loss": 0.869, + "step": 6333 + }, + { + "epoch": 1.5399951373693168, + "grad_norm": 24.75, + "learning_rate": 3.197550962393671e-07, + "loss": 0.9519, + "step": 6334 + }, + { + "epoch": 1.5402382689034768, + "grad_norm": 19.75, + "learning_rate": 3.1943236509447625e-07, + "loss": 0.7127, + "step": 6335 + }, + { + "epoch": 1.5404814004376368, + "grad_norm": 19.5, + "learning_rate": 3.1910977303898985e-07, + "loss": 0.5472, + "step": 6336 + }, + { + "epoch": 1.5407245319717968, + "grad_norm": 22.25, + "learning_rate": 3.1878732012112447e-07, + "loss": 1.1632, + "step": 6337 + }, + { + "epoch": 1.5409676635059566, + "grad_norm": 22.875, + "learning_rate": 3.184650063890768e-07, + "loss": 0.9107, + "step": 6338 + }, + { + "epoch": 1.5412107950401168, + "grad_norm": 15.625, + "learning_rate": 3.181428318910225e-07, + "loss": 0.3992, + "step": 6339 + }, + { + "epoch": 1.5414539265742766, + "grad_norm": 19.5, + "learning_rate": 3.1782079667511554e-07, + "loss": 0.764, + "step": 6340 + }, + { + "epoch": 1.5416970581084368, + "grad_norm": 21.875, + "learning_rate": 3.174989007894896e-07, + "loss": 0.7726, + "step": 6341 + }, + { + "epoch": 1.5419401896425966, + "grad_norm": 17.5, + "learning_rate": 3.171771442822581e-07, + "loss": 0.3811, + "step": 6342 + }, + { + "epoch": 1.5421833211767566, + "grad_norm": 22.5, + "learning_rate": 3.168555272015128e-07, + "loss": 0.8022, + "step": 6343 + }, + { + "epoch": 1.5424264527109166, + "grad_norm": 23.75, + "learning_rate": 3.165340495953245e-07, + "loss": 1.0406, + "step": 6344 + }, + { + "epoch": 1.5426695842450766, + "grad_norm": 17.625, + "learning_rate": 3.162127115117444e-07, + "loss": 0.6932, + "step": 6345 + }, + { + "epoch": 1.5429127157792366, + "grad_norm": 21.25, + "learning_rate": 3.1589151299880133e-07, + "loss": 0.5317, + "step": 6346 + }, + { + "epoch": 1.5431558473133964, + "grad_norm": 13.25, + "learning_rate": 3.155704541045046e-07, + "loss": 0.2932, + "step": 6347 + }, + { + "epoch": 1.5433989788475566, + "grad_norm": 20.5, + "learning_rate": 3.152495348768413e-07, + "loss": 0.672, + "step": 6348 + }, + { + "epoch": 1.5436421103817164, + "grad_norm": 25.875, + "learning_rate": 3.1492875536377906e-07, + "loss": 0.8262, + "step": 6349 + }, + { + "epoch": 1.5438852419158766, + "grad_norm": 25.75, + "learning_rate": 3.1460811561326353e-07, + "loss": 0.6919, + "step": 6350 + }, + { + "epoch": 1.5441283734500364, + "grad_norm": 23.375, + "learning_rate": 3.1428761567321953e-07, + "loss": 0.6993, + "step": 6351 + }, + { + "epoch": 1.5443715049841964, + "grad_norm": 23.25, + "learning_rate": 3.1396725559155213e-07, + "loss": 0.7133, + "step": 6352 + }, + { + "epoch": 1.5446146365183564, + "grad_norm": 19.875, + "learning_rate": 3.1364703541614374e-07, + "loss": 0.7139, + "step": 6353 + }, + { + "epoch": 1.5448577680525164, + "grad_norm": 17.5, + "learning_rate": 3.133269551948577e-07, + "loss": 0.6425, + "step": 6354 + }, + { + "epoch": 1.5451008995866764, + "grad_norm": 23.75, + "learning_rate": 3.130070149755347e-07, + "loss": 0.8137, + "step": 6355 + }, + { + "epoch": 1.5453440311208364, + "grad_norm": 18.875, + "learning_rate": 3.126872148059962e-07, + "loss": 0.7958, + "step": 6356 + }, + { + "epoch": 1.5455871626549964, + "grad_norm": 20.25, + "learning_rate": 3.123675547340414e-07, + "loss": 0.6986, + "step": 6357 + }, + { + "epoch": 1.5458302941891562, + "grad_norm": 18.875, + "learning_rate": 3.1204803480744867e-07, + "loss": 0.7421, + "step": 6358 + }, + { + "epoch": 1.5460734257233164, + "grad_norm": 25.25, + "learning_rate": 3.1172865507397635e-07, + "loss": 0.8952, + "step": 6359 + }, + { + "epoch": 1.5463165572574762, + "grad_norm": 16.5, + "learning_rate": 3.1140941558136117e-07, + "loss": 0.4089, + "step": 6360 + }, + { + "epoch": 1.5465596887916364, + "grad_norm": 21.625, + "learning_rate": 3.110903163773184e-07, + "loss": 0.6667, + "step": 6361 + }, + { + "epoch": 1.5468028203257962, + "grad_norm": 17.5, + "learning_rate": 3.107713575095435e-07, + "loss": 0.379, + "step": 6362 + }, + { + "epoch": 1.5470459518599562, + "grad_norm": 17.375, + "learning_rate": 3.1045253902571044e-07, + "loss": 0.5482, + "step": 6363 + }, + { + "epoch": 1.5472890833941162, + "grad_norm": 20.375, + "learning_rate": 3.1013386097347205e-07, + "loss": 0.5824, + "step": 6364 + }, + { + "epoch": 1.5475322149282762, + "grad_norm": 19.25, + "learning_rate": 3.0981532340045985e-07, + "loss": 1.0763, + "step": 6365 + }, + { + "epoch": 1.5477753464624362, + "grad_norm": 18.375, + "learning_rate": 3.094969263542855e-07, + "loss": 0.537, + "step": 6366 + }, + { + "epoch": 1.5480184779965962, + "grad_norm": 41.75, + "learning_rate": 3.0917866988253857e-07, + "loss": 0.6596, + "step": 6367 + }, + { + "epoch": 1.5482616095307562, + "grad_norm": 21.25, + "learning_rate": 3.0886055403278756e-07, + "loss": 0.9174, + "step": 6368 + }, + { + "epoch": 1.548504741064916, + "grad_norm": 26.5, + "learning_rate": 3.085425788525807e-07, + "loss": 1.025, + "step": 6369 + }, + { + "epoch": 1.5487478725990762, + "grad_norm": 16.75, + "learning_rate": 3.082247443894455e-07, + "loss": 0.6325, + "step": 6370 + }, + { + "epoch": 1.548991004133236, + "grad_norm": 21.875, + "learning_rate": 3.0790705069088724e-07, + "loss": 0.6059, + "step": 6371 + }, + { + "epoch": 1.5492341356673962, + "grad_norm": 18.875, + "learning_rate": 3.075894978043903e-07, + "loss": 0.6588, + "step": 6372 + }, + { + "epoch": 1.549477267201556, + "grad_norm": 21.375, + "learning_rate": 3.0727208577741946e-07, + "loss": 0.6662, + "step": 6373 + }, + { + "epoch": 1.549720398735716, + "grad_norm": 25.125, + "learning_rate": 3.0695481465741665e-07, + "loss": 1.0003, + "step": 6374 + }, + { + "epoch": 1.549963530269876, + "grad_norm": 18.875, + "learning_rate": 3.0663768449180354e-07, + "loss": 0.6756, + "step": 6375 + }, + { + "epoch": 1.550206661804036, + "grad_norm": 17.25, + "learning_rate": 3.063206953279811e-07, + "loss": 0.3996, + "step": 6376 + }, + { + "epoch": 1.550449793338196, + "grad_norm": 17.75, + "learning_rate": 3.060038472133285e-07, + "loss": 0.4859, + "step": 6377 + }, + { + "epoch": 1.550692924872356, + "grad_norm": 18.875, + "learning_rate": 3.0568714019520455e-07, + "loss": 0.8684, + "step": 6378 + }, + { + "epoch": 1.550936056406516, + "grad_norm": 15.9375, + "learning_rate": 3.0537057432094603e-07, + "loss": 0.4608, + "step": 6379 + }, + { + "epoch": 1.5511791879406758, + "grad_norm": 20.625, + "learning_rate": 3.0505414963786977e-07, + "loss": 0.6788, + "step": 6380 + }, + { + "epoch": 1.551422319474836, + "grad_norm": 22.125, + "learning_rate": 3.047378661932707e-07, + "loss": 0.7022, + "step": 6381 + }, + { + "epoch": 1.5516654510089958, + "grad_norm": 37.25, + "learning_rate": 3.0442172403442244e-07, + "loss": 0.8, + "step": 6382 + }, + { + "epoch": 1.551908582543156, + "grad_norm": 13.375, + "learning_rate": 3.041057232085785e-07, + "loss": 0.4686, + "step": 6383 + }, + { + "epoch": 1.5521517140773158, + "grad_norm": 15.8125, + "learning_rate": 3.037898637629701e-07, + "loss": 0.4005, + "step": 6384 + }, + { + "epoch": 1.5523948456114758, + "grad_norm": 15.0, + "learning_rate": 3.034741457448084e-07, + "loss": 0.2845, + "step": 6385 + }, + { + "epoch": 1.5526379771456358, + "grad_norm": 23.125, + "learning_rate": 3.031585692012826e-07, + "loss": 0.9698, + "step": 6386 + }, + { + "epoch": 1.5528811086797958, + "grad_norm": 17.25, + "learning_rate": 3.0284313417956126e-07, + "loss": 0.6786, + "step": 6387 + }, + { + "epoch": 1.5531242402139558, + "grad_norm": 20.125, + "learning_rate": 3.025278407267915e-07, + "loss": 0.6165, + "step": 6388 + }, + { + "epoch": 1.5533673717481156, + "grad_norm": 21.375, + "learning_rate": 3.0221268889009903e-07, + "loss": 0.4938, + "step": 6389 + }, + { + "epoch": 1.5536105032822758, + "grad_norm": 24.0, + "learning_rate": 3.0189767871658947e-07, + "loss": 0.8756, + "step": 6390 + }, + { + "epoch": 1.5538536348164356, + "grad_norm": 22.75, + "learning_rate": 3.015828102533461e-07, + "loss": 0.4819, + "step": 6391 + }, + { + "epoch": 1.5540967663505958, + "grad_norm": 19.625, + "learning_rate": 3.012680835474312e-07, + "loss": 0.7592, + "step": 6392 + }, + { + "epoch": 1.5543398978847556, + "grad_norm": 28.5, + "learning_rate": 3.0095349864588617e-07, + "loss": 0.6524, + "step": 6393 + }, + { + "epoch": 1.5545830294189158, + "grad_norm": 21.875, + "learning_rate": 3.0063905559573186e-07, + "loss": 0.8198, + "step": 6394 + }, + { + "epoch": 1.5548261609530756, + "grad_norm": 15.1875, + "learning_rate": 3.003247544439666e-07, + "loss": 0.61, + "step": 6395 + }, + { + "epoch": 1.5550692924872356, + "grad_norm": 18.125, + "learning_rate": 3.0001059523756816e-07, + "loss": 0.7242, + "step": 6396 + }, + { + "epoch": 1.5553124240213956, + "grad_norm": 22.75, + "learning_rate": 2.9969657802349324e-07, + "loss": 0.8387, + "step": 6397 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 17.375, + "learning_rate": 2.993827028486772e-07, + "loss": 0.405, + "step": 6398 + }, + { + "epoch": 1.5557986870897156, + "grad_norm": 26.625, + "learning_rate": 2.990689697600335e-07, + "loss": 0.9947, + "step": 6399 + }, + { + "epoch": 1.5560418186238754, + "grad_norm": 15.125, + "learning_rate": 2.987553788044555e-07, + "loss": 0.5964, + "step": 6400 + }, + { + "epoch": 1.5562849501580356, + "grad_norm": 21.625, + "learning_rate": 2.9844193002881493e-07, + "loss": 0.8416, + "step": 6401 + }, + { + "epoch": 1.5565280816921954, + "grad_norm": 22.625, + "learning_rate": 2.9812862347996197e-07, + "loss": 0.8532, + "step": 6402 + }, + { + "epoch": 1.5567712132263556, + "grad_norm": 19.875, + "learning_rate": 2.9781545920472526e-07, + "loss": 0.7714, + "step": 6403 + }, + { + "epoch": 1.5570143447605154, + "grad_norm": 13.625, + "learning_rate": 2.9750243724991327e-07, + "loss": 0.453, + "step": 6404 + }, + { + "epoch": 1.5572574762946754, + "grad_norm": 25.375, + "learning_rate": 2.971895576623124e-07, + "loss": 0.8208, + "step": 6405 + }, + { + "epoch": 1.5575006078288354, + "grad_norm": 20.25, + "learning_rate": 2.9687682048868724e-07, + "loss": 0.9632, + "step": 6406 + }, + { + "epoch": 1.5577437393629954, + "grad_norm": 14.75, + "learning_rate": 2.9656422577578226e-07, + "loss": 0.3296, + "step": 6407 + }, + { + "epoch": 1.5579868708971554, + "grad_norm": 29.875, + "learning_rate": 2.9625177357032046e-07, + "loss": 1.1691, + "step": 6408 + }, + { + "epoch": 1.5582300024313154, + "grad_norm": 22.125, + "learning_rate": 2.95939463919003e-07, + "loss": 1.1166, + "step": 6409 + }, + { + "epoch": 1.5584731339654754, + "grad_norm": 19.125, + "learning_rate": 2.956272968685095e-07, + "loss": 0.8856, + "step": 6410 + }, + { + "epoch": 1.5587162654996352, + "grad_norm": 24.625, + "learning_rate": 2.9531527246549937e-07, + "loss": 1.0867, + "step": 6411 + }, + { + "epoch": 1.5589593970337954, + "grad_norm": 14.625, + "learning_rate": 2.950033907566098e-07, + "loss": 0.5691, + "step": 6412 + }, + { + "epoch": 1.5592025285679552, + "grad_norm": 21.5, + "learning_rate": 2.9469165178845655e-07, + "loss": 0.8177, + "step": 6413 + }, + { + "epoch": 1.5594456601021154, + "grad_norm": 20.5, + "learning_rate": 2.943800556076348e-07, + "loss": 0.5111, + "step": 6414 + }, + { + "epoch": 1.5596887916362752, + "grad_norm": 18.125, + "learning_rate": 2.940686022607178e-07, + "loss": 0.5951, + "step": 6415 + }, + { + "epoch": 1.5599319231704352, + "grad_norm": 16.5, + "learning_rate": 2.9375729179425784e-07, + "loss": 0.4576, + "step": 6416 + }, + { + "epoch": 1.5601750547045952, + "grad_norm": 24.0, + "learning_rate": 2.9344612425478523e-07, + "loss": 0.8859, + "step": 6417 + }, + { + "epoch": 1.5604181862387552, + "grad_norm": 30.5, + "learning_rate": 2.9313509968881005e-07, + "loss": 0.5793, + "step": 6418 + }, + { + "epoch": 1.5606613177729152, + "grad_norm": 17.125, + "learning_rate": 2.9282421814281965e-07, + "loss": 0.3839, + "step": 6419 + }, + { + "epoch": 1.5609044493070752, + "grad_norm": 20.375, + "learning_rate": 2.9251347966328073e-07, + "loss": 0.5984, + "step": 6420 + }, + { + "epoch": 1.5611475808412352, + "grad_norm": 18.5, + "learning_rate": 2.922028842966389e-07, + "loss": 0.4034, + "step": 6421 + }, + { + "epoch": 1.561390712375395, + "grad_norm": 20.125, + "learning_rate": 2.9189243208931765e-07, + "loss": 0.7853, + "step": 6422 + }, + { + "epoch": 1.5616338439095552, + "grad_norm": 18.625, + "learning_rate": 2.9158212308771915e-07, + "loss": 0.2526, + "step": 6423 + }, + { + "epoch": 1.561876975443715, + "grad_norm": 30.5, + "learning_rate": 2.9127195733822485e-07, + "loss": 0.7495, + "step": 6424 + }, + { + "epoch": 1.5621201069778752, + "grad_norm": 21.25, + "learning_rate": 2.909619348871948e-07, + "loss": 0.6682, + "step": 6425 + }, + { + "epoch": 1.562363238512035, + "grad_norm": 19.25, + "learning_rate": 2.9065205578096667e-07, + "loss": 0.514, + "step": 6426 + }, + { + "epoch": 1.562606370046195, + "grad_norm": 25.0, + "learning_rate": 2.9034232006585685e-07, + "loss": 1.1263, + "step": 6427 + }, + { + "epoch": 1.562849501580355, + "grad_norm": 23.625, + "learning_rate": 2.900327277881616e-07, + "loss": 0.7936, + "step": 6428 + }, + { + "epoch": 1.563092633114515, + "grad_norm": 25.625, + "learning_rate": 2.8972327899415437e-07, + "loss": 0.9782, + "step": 6429 + }, + { + "epoch": 1.563335764648675, + "grad_norm": 18.625, + "learning_rate": 2.8941397373008746e-07, + "loss": 0.8169, + "step": 6430 + }, + { + "epoch": 1.563578896182835, + "grad_norm": 21.5, + "learning_rate": 2.891048120421919e-07, + "loss": 0.859, + "step": 6431 + }, + { + "epoch": 1.563822027716995, + "grad_norm": 16.75, + "learning_rate": 2.887957939766778e-07, + "loss": 0.499, + "step": 6432 + }, + { + "epoch": 1.5640651592511547, + "grad_norm": 21.375, + "learning_rate": 2.884869195797328e-07, + "loss": 0.8267, + "step": 6433 + }, + { + "epoch": 1.564308290785315, + "grad_norm": 18.75, + "learning_rate": 2.881781888975232e-07, + "loss": 0.7452, + "step": 6434 + }, + { + "epoch": 1.5645514223194747, + "grad_norm": 23.375, + "learning_rate": 2.878696019761951e-07, + "loss": 0.7286, + "step": 6435 + }, + { + "epoch": 1.564794553853635, + "grad_norm": 18.0, + "learning_rate": 2.8756115886187125e-07, + "loss": 0.4586, + "step": 6436 + }, + { + "epoch": 1.5650376853877948, + "grad_norm": 17.0, + "learning_rate": 2.8725285960065396e-07, + "loss": 0.5311, + "step": 6437 + }, + { + "epoch": 1.5652808169219548, + "grad_norm": 23.375, + "learning_rate": 2.869447042386239e-07, + "loss": 1.0925, + "step": 6438 + }, + { + "epoch": 1.5655239484561148, + "grad_norm": 19.5, + "learning_rate": 2.8663669282184075e-07, + "loss": 0.5943, + "step": 6439 + }, + { + "epoch": 1.5657670799902748, + "grad_norm": 34.25, + "learning_rate": 2.863288253963417e-07, + "loss": 1.2476, + "step": 6440 + }, + { + "epoch": 1.5660102115244348, + "grad_norm": 34.25, + "learning_rate": 2.8602110200814257e-07, + "loss": 0.6716, + "step": 6441 + }, + { + "epoch": 1.5662533430585945, + "grad_norm": 16.625, + "learning_rate": 2.8571352270323857e-07, + "loss": 0.4485, + "step": 6442 + }, + { + "epoch": 1.5664964745927548, + "grad_norm": 17.75, + "learning_rate": 2.8540608752760227e-07, + "loss": 0.7211, + "step": 6443 + }, + { + "epoch": 1.5667396061269145, + "grad_norm": 18.0, + "learning_rate": 2.850987965271851e-07, + "loss": 0.6797, + "step": 6444 + }, + { + "epoch": 1.5669827376610748, + "grad_norm": 19.25, + "learning_rate": 2.847916497479175e-07, + "loss": 0.6479, + "step": 6445 + }, + { + "epoch": 1.5672258691952345, + "grad_norm": 22.0, + "learning_rate": 2.844846472357073e-07, + "loss": 0.9931, + "step": 6446 + }, + { + "epoch": 1.5674690007293948, + "grad_norm": 17.125, + "learning_rate": 2.8417778903644155e-07, + "loss": 0.5199, + "step": 6447 + }, + { + "epoch": 1.5677121322635545, + "grad_norm": 18.625, + "learning_rate": 2.8387107519598584e-07, + "loss": 0.3357, + "step": 6448 + }, + { + "epoch": 1.5679552637977145, + "grad_norm": 21.25, + "learning_rate": 2.8356450576018366e-07, + "loss": 0.6823, + "step": 6449 + }, + { + "epoch": 1.5681983953318746, + "grad_norm": 16.125, + "learning_rate": 2.8325808077485693e-07, + "loss": 0.5947, + "step": 6450 + }, + { + "epoch": 1.5684415268660346, + "grad_norm": 18.0, + "learning_rate": 2.829518002858059e-07, + "loss": 0.7884, + "step": 6451 + }, + { + "epoch": 1.5686846584001946, + "grad_norm": 18.5, + "learning_rate": 2.826456643388102e-07, + "loss": 0.5786, + "step": 6452 + }, + { + "epoch": 1.5689277899343543, + "grad_norm": 22.25, + "learning_rate": 2.823396729796267e-07, + "loss": 1.0034, + "step": 6453 + }, + { + "epoch": 1.5691709214685146, + "grad_norm": 21.25, + "learning_rate": 2.8203382625399056e-07, + "loss": 1.0719, + "step": 6454 + }, + { + "epoch": 1.5694140530026743, + "grad_norm": 19.75, + "learning_rate": 2.817281242076171e-07, + "loss": 0.8782, + "step": 6455 + }, + { + "epoch": 1.5696571845368346, + "grad_norm": 23.5, + "learning_rate": 2.814225668861981e-07, + "loss": 0.7174, + "step": 6456 + }, + { + "epoch": 1.5699003160709943, + "grad_norm": 18.375, + "learning_rate": 2.8111715433540437e-07, + "loss": 0.7646, + "step": 6457 + }, + { + "epoch": 1.5701434476051543, + "grad_norm": 31.875, + "learning_rate": 2.8081188660088464e-07, + "loss": 0.9543, + "step": 6458 + }, + { + "epoch": 1.5703865791393143, + "grad_norm": 20.625, + "learning_rate": 2.805067637282674e-07, + "loss": 0.6195, + "step": 6459 + }, + { + "epoch": 1.5706297106734743, + "grad_norm": 19.75, + "learning_rate": 2.802017857631581e-07, + "loss": 0.779, + "step": 6460 + }, + { + "epoch": 1.5708728422076343, + "grad_norm": 25.875, + "learning_rate": 2.798969527511401e-07, + "loss": 0.9677, + "step": 6461 + }, + { + "epoch": 1.5711159737417943, + "grad_norm": 25.625, + "learning_rate": 2.795922647377776e-07, + "loss": 0.9384, + "step": 6462 + }, + { + "epoch": 1.5713591052759543, + "grad_norm": 19.375, + "learning_rate": 2.792877217686106e-07, + "loss": 0.7889, + "step": 6463 + }, + { + "epoch": 1.5716022368101141, + "grad_norm": 19.75, + "learning_rate": 2.7898332388915787e-07, + "loss": 0.6775, + "step": 6464 + }, + { + "epoch": 1.5718453683442744, + "grad_norm": 16.5, + "learning_rate": 2.786790711449179e-07, + "loss": 0.3814, + "step": 6465 + }, + { + "epoch": 1.5720884998784341, + "grad_norm": 17.125, + "learning_rate": 2.7837496358136607e-07, + "loss": 0.7327, + "step": 6466 + }, + { + "epoch": 1.5723316314125944, + "grad_norm": 19.5, + "learning_rate": 2.780710012439565e-07, + "loss": 0.8297, + "step": 6467 + }, + { + "epoch": 1.5725747629467541, + "grad_norm": 16.875, + "learning_rate": 2.777671841781211e-07, + "loss": 0.5352, + "step": 6468 + }, + { + "epoch": 1.5728178944809141, + "grad_norm": 28.375, + "learning_rate": 2.7746351242927114e-07, + "loss": 0.8466, + "step": 6469 + }, + { + "epoch": 1.5730610260150741, + "grad_norm": 17.0, + "learning_rate": 2.7715998604279595e-07, + "loss": 0.4403, + "step": 6470 + }, + { + "epoch": 1.5733041575492341, + "grad_norm": 17.5, + "learning_rate": 2.76856605064062e-07, + "loss": 0.3576, + "step": 6471 + }, + { + "epoch": 1.5735472890833941, + "grad_norm": 20.125, + "learning_rate": 2.765533695384155e-07, + "loss": 0.7101, + "step": 6472 + }, + { + "epoch": 1.5737904206175541, + "grad_norm": 19.125, + "learning_rate": 2.762502795111799e-07, + "loss": 0.4901, + "step": 6473 + }, + { + "epoch": 1.5740335521517141, + "grad_norm": 25.625, + "learning_rate": 2.759473350276573e-07, + "loss": 1.4545, + "step": 6474 + }, + { + "epoch": 1.574276683685874, + "grad_norm": 30.375, + "learning_rate": 2.756445361331275e-07, + "loss": 1.1691, + "step": 6475 + }, + { + "epoch": 1.5745198152200341, + "grad_norm": 23.25, + "learning_rate": 2.7534188287285006e-07, + "loss": 0.9106, + "step": 6476 + }, + { + "epoch": 1.574762946754194, + "grad_norm": 24.125, + "learning_rate": 2.7503937529206063e-07, + "loss": 0.7725, + "step": 6477 + }, + { + "epoch": 1.5750060782883541, + "grad_norm": 21.5, + "learning_rate": 2.7473701343597474e-07, + "loss": 0.8067, + "step": 6478 + }, + { + "epoch": 1.575249209822514, + "grad_norm": 20.875, + "learning_rate": 2.74434797349786e-07, + "loss": 0.7489, + "step": 6479 + }, + { + "epoch": 1.575492341356674, + "grad_norm": 20.5, + "learning_rate": 2.741327270786652e-07, + "loss": 0.8283, + "step": 6480 + }, + { + "epoch": 1.575735472890834, + "grad_norm": 20.375, + "learning_rate": 2.7383080266776203e-07, + "loss": 0.6953, + "step": 6481 + }, + { + "epoch": 1.575978604424994, + "grad_norm": 20.5, + "learning_rate": 2.7352902416220463e-07, + "loss": 0.6859, + "step": 6482 + }, + { + "epoch": 1.576221735959154, + "grad_norm": 20.25, + "learning_rate": 2.7322739160709877e-07, + "loss": 0.8769, + "step": 6483 + }, + { + "epoch": 1.576464867493314, + "grad_norm": 18.5, + "learning_rate": 2.7292590504752884e-07, + "loss": 0.4509, + "step": 6484 + }, + { + "epoch": 1.576707999027474, + "grad_norm": 24.375, + "learning_rate": 2.726245645285562e-07, + "loss": 0.3359, + "step": 6485 + }, + { + "epoch": 1.5769511305616337, + "grad_norm": 26.5, + "learning_rate": 2.723233700952231e-07, + "loss": 1.3967, + "step": 6486 + }, + { + "epoch": 1.577194262095794, + "grad_norm": 20.375, + "learning_rate": 2.720223217925473e-07, + "loss": 0.9031, + "step": 6487 + }, + { + "epoch": 1.5774373936299537, + "grad_norm": 27.5, + "learning_rate": 2.7172141966552545e-07, + "loss": 0.9432, + "step": 6488 + }, + { + "epoch": 1.577680525164114, + "grad_norm": 19.625, + "learning_rate": 2.714206637591332e-07, + "loss": 0.7459, + "step": 6489 + }, + { + "epoch": 1.5779236566982737, + "grad_norm": 17.5, + "learning_rate": 2.7112005411832345e-07, + "loss": 0.6623, + "step": 6490 + }, + { + "epoch": 1.5781667882324337, + "grad_norm": 21.0, + "learning_rate": 2.7081959078802734e-07, + "loss": 0.8332, + "step": 6491 + }, + { + "epoch": 1.5784099197665937, + "grad_norm": 17.875, + "learning_rate": 2.705192738131539e-07, + "loss": 0.5069, + "step": 6492 + }, + { + "epoch": 1.5786530513007537, + "grad_norm": 20.125, + "learning_rate": 2.7021910323859173e-07, + "loss": 0.7234, + "step": 6493 + }, + { + "epoch": 1.5788961828349137, + "grad_norm": 18.0, + "learning_rate": 2.699190791092061e-07, + "loss": 0.5635, + "step": 6494 + }, + { + "epoch": 1.5791393143690735, + "grad_norm": 16.75, + "learning_rate": 2.6961920146984024e-07, + "loss": 0.4454, + "step": 6495 + }, + { + "epoch": 1.5793824459032337, + "grad_norm": 14.8125, + "learning_rate": 2.6931947036531696e-07, + "loss": 0.3257, + "step": 6496 + }, + { + "epoch": 1.5796255774373935, + "grad_norm": 18.0, + "learning_rate": 2.690198858404357e-07, + "loss": 0.7404, + "step": 6497 + }, + { + "epoch": 1.5798687089715537, + "grad_norm": 18.125, + "learning_rate": 2.6872044793997424e-07, + "loss": 0.7507, + "step": 6498 + }, + { + "epoch": 1.5801118405057135, + "grad_norm": 23.25, + "learning_rate": 2.6842115670868947e-07, + "loss": 1.0221, + "step": 6499 + }, + { + "epoch": 1.5803549720398735, + "grad_norm": 20.25, + "learning_rate": 2.6812201219131503e-07, + "loss": 1.0032, + "step": 6500 + }, + { + "epoch": 1.5805981035740335, + "grad_norm": 22.25, + "learning_rate": 2.6782301443256374e-07, + "loss": 0.9321, + "step": 6501 + }, + { + "epoch": 1.5808412351081935, + "grad_norm": 27.0, + "learning_rate": 2.675241634771253e-07, + "loss": 0.8659, + "step": 6502 + }, + { + "epoch": 1.5810843666423535, + "grad_norm": 21.625, + "learning_rate": 2.672254593696691e-07, + "loss": 0.9163, + "step": 6503 + }, + { + "epoch": 1.5813274981765135, + "grad_norm": 28.0, + "learning_rate": 2.6692690215484096e-07, + "loss": 0.6282, + "step": 6504 + }, + { + "epoch": 1.5815706297106735, + "grad_norm": 17.25, + "learning_rate": 2.6662849187726533e-07, + "loss": 0.4821, + "step": 6505 + }, + { + "epoch": 1.5818137612448333, + "grad_norm": 18.375, + "learning_rate": 2.663302285815453e-07, + "loss": 0.7284, + "step": 6506 + }, + { + "epoch": 1.5820568927789935, + "grad_norm": 22.375, + "learning_rate": 2.660321123122611e-07, + "loss": 1.2149, + "step": 6507 + }, + { + "epoch": 1.5823000243131533, + "grad_norm": 27.625, + "learning_rate": 2.6573414311397107e-07, + "loss": 0.9231, + "step": 6508 + }, + { + "epoch": 1.5825431558473135, + "grad_norm": 20.125, + "learning_rate": 2.6543632103121234e-07, + "loss": 0.6957, + "step": 6509 + }, + { + "epoch": 1.5827862873814733, + "grad_norm": 22.75, + "learning_rate": 2.651386461084997e-07, + "loss": 0.6521, + "step": 6510 + }, + { + "epoch": 1.5830294189156333, + "grad_norm": 18.625, + "learning_rate": 2.648411183903256e-07, + "loss": 0.7038, + "step": 6511 + }, + { + "epoch": 1.5832725504497933, + "grad_norm": 18.25, + "learning_rate": 2.645437379211603e-07, + "loss": 0.8506, + "step": 6512 + }, + { + "epoch": 1.5835156819839533, + "grad_norm": 20.0, + "learning_rate": 2.64246504745453e-07, + "loss": 0.6279, + "step": 6513 + }, + { + "epoch": 1.5837588135181133, + "grad_norm": 17.125, + "learning_rate": 2.6394941890763025e-07, + "loss": 0.6043, + "step": 6514 + }, + { + "epoch": 1.5840019450522733, + "grad_norm": 17.25, + "learning_rate": 2.636524804520961e-07, + "loss": 0.6408, + "step": 6515 + }, + { + "epoch": 1.5842450765864333, + "grad_norm": 21.25, + "learning_rate": 2.6335568942323365e-07, + "loss": 0.7061, + "step": 6516 + }, + { + "epoch": 1.584488208120593, + "grad_norm": 23.5, + "learning_rate": 2.6305904586540367e-07, + "loss": 0.6464, + "step": 6517 + }, + { + "epoch": 1.5847313396547533, + "grad_norm": 21.125, + "learning_rate": 2.6276254982294427e-07, + "loss": 0.7062, + "step": 6518 + }, + { + "epoch": 1.584974471188913, + "grad_norm": 18.125, + "learning_rate": 2.6246620134017174e-07, + "loss": 0.6233, + "step": 6519 + }, + { + "epoch": 1.5852176027230733, + "grad_norm": 21.75, + "learning_rate": 2.62170000461381e-07, + "loss": 0.8793, + "step": 6520 + }, + { + "epoch": 1.585460734257233, + "grad_norm": 18.75, + "learning_rate": 2.6187394723084403e-07, + "loss": 0.6398, + "step": 6521 + }, + { + "epoch": 1.585703865791393, + "grad_norm": 31.125, + "learning_rate": 2.61578041692811e-07, + "loss": 0.837, + "step": 6522 + }, + { + "epoch": 1.585946997325553, + "grad_norm": 22.375, + "learning_rate": 2.612822838915102e-07, + "loss": 0.7556, + "step": 6523 + }, + { + "epoch": 1.586190128859713, + "grad_norm": 21.5, + "learning_rate": 2.609866738711481e-07, + "loss": 0.561, + "step": 6524 + }, + { + "epoch": 1.5864332603938731, + "grad_norm": 23.625, + "learning_rate": 2.6069121167590846e-07, + "loss": 0.8384, + "step": 6525 + }, + { + "epoch": 1.5866763919280331, + "grad_norm": 18.75, + "learning_rate": 2.603958973499529e-07, + "loss": 0.5325, + "step": 6526 + }, + { + "epoch": 1.5869195234621931, + "grad_norm": 17.25, + "learning_rate": 2.601007309374219e-07, + "loss": 0.6433, + "step": 6527 + }, + { + "epoch": 1.587162654996353, + "grad_norm": 18.0, + "learning_rate": 2.598057124824328e-07, + "loss": 0.4489, + "step": 6528 + }, + { + "epoch": 1.5874057865305131, + "grad_norm": 18.625, + "learning_rate": 2.595108420290808e-07, + "loss": 0.6789, + "step": 6529 + }, + { + "epoch": 1.587648918064673, + "grad_norm": 21.625, + "learning_rate": 2.5921611962144014e-07, + "loss": 0.7601, + "step": 6530 + }, + { + "epoch": 1.5878920495988331, + "grad_norm": 20.125, + "learning_rate": 2.5892154530356166e-07, + "loss": 0.5507, + "step": 6531 + }, + { + "epoch": 1.588135181132993, + "grad_norm": 21.875, + "learning_rate": 2.586271191194749e-07, + "loss": 1.0553, + "step": 6532 + }, + { + "epoch": 1.588378312667153, + "grad_norm": 24.0, + "learning_rate": 2.5833284111318647e-07, + "loss": 0.8897, + "step": 6533 + }, + { + "epoch": 1.588621444201313, + "grad_norm": 21.0, + "learning_rate": 2.580387113286821e-07, + "loss": 0.9927, + "step": 6534 + }, + { + "epoch": 1.588864575735473, + "grad_norm": 22.75, + "learning_rate": 2.5774472980992394e-07, + "loss": 0.4649, + "step": 6535 + }, + { + "epoch": 1.589107707269633, + "grad_norm": 15.75, + "learning_rate": 2.574508966008525e-07, + "loss": 0.5479, + "step": 6536 + }, + { + "epoch": 1.5893508388037927, + "grad_norm": 19.625, + "learning_rate": 2.571572117453867e-07, + "loss": 0.7894, + "step": 6537 + }, + { + "epoch": 1.589593970337953, + "grad_norm": 17.625, + "learning_rate": 2.5686367528742277e-07, + "loss": 0.603, + "step": 6538 + }, + { + "epoch": 1.5898371018721127, + "grad_norm": 19.75, + "learning_rate": 2.5657028727083424e-07, + "loss": 1.1031, + "step": 6539 + }, + { + "epoch": 1.590080233406273, + "grad_norm": 27.125, + "learning_rate": 2.5627704773947326e-07, + "loss": 0.8461, + "step": 6540 + }, + { + "epoch": 1.5903233649404327, + "grad_norm": 19.125, + "learning_rate": 2.559839567371701e-07, + "loss": 0.416, + "step": 6541 + }, + { + "epoch": 1.590566496474593, + "grad_norm": 21.125, + "learning_rate": 2.5569101430773195e-07, + "loss": 0.8185, + "step": 6542 + }, + { + "epoch": 1.5908096280087527, + "grad_norm": 17.0, + "learning_rate": 2.553982204949436e-07, + "loss": 0.5952, + "step": 6543 + }, + { + "epoch": 1.5910527595429127, + "grad_norm": 22.5, + "learning_rate": 2.551055753425689e-07, + "loss": 0.6843, + "step": 6544 + }, + { + "epoch": 1.5912958910770727, + "grad_norm": 26.125, + "learning_rate": 2.548130788943484e-07, + "loss": 0.7056, + "step": 6545 + }, + { + "epoch": 1.5915390226112327, + "grad_norm": 24.875, + "learning_rate": 2.545207311940004e-07, + "loss": 0.9516, + "step": 6546 + }, + { + "epoch": 1.5917821541453927, + "grad_norm": 24.125, + "learning_rate": 2.5422853228522155e-07, + "loss": 0.7309, + "step": 6547 + }, + { + "epoch": 1.5920252856795525, + "grad_norm": 30.375, + "learning_rate": 2.539364822116866e-07, + "loss": 0.76, + "step": 6548 + }, + { + "epoch": 1.5922684172137127, + "grad_norm": 21.5, + "learning_rate": 2.5364458101704694e-07, + "loss": 0.7654, + "step": 6549 + }, + { + "epoch": 1.5925115487478725, + "grad_norm": 20.75, + "learning_rate": 2.5335282874493204e-07, + "loss": 0.5495, + "step": 6550 + }, + { + "epoch": 1.5927546802820327, + "grad_norm": 16.75, + "learning_rate": 2.5306122543894975e-07, + "loss": 0.6322, + "step": 6551 + }, + { + "epoch": 1.5929978118161925, + "grad_norm": 25.0, + "learning_rate": 2.527697711426852e-07, + "loss": 0.7557, + "step": 6552 + }, + { + "epoch": 1.5932409433503525, + "grad_norm": 19.5, + "learning_rate": 2.5247846589970065e-07, + "loss": 0.7574, + "step": 6553 + }, + { + "epoch": 1.5934840748845125, + "grad_norm": 19.0, + "learning_rate": 2.5218730975353725e-07, + "loss": 0.6171, + "step": 6554 + }, + { + "epoch": 1.5937272064186725, + "grad_norm": 21.25, + "learning_rate": 2.518963027477135e-07, + "loss": 0.871, + "step": 6555 + }, + { + "epoch": 1.5939703379528325, + "grad_norm": 23.625, + "learning_rate": 2.5160544492572517e-07, + "loss": 0.9971, + "step": 6556 + }, + { + "epoch": 1.5942134694869925, + "grad_norm": 16.25, + "learning_rate": 2.5131473633104575e-07, + "loss": 0.4762, + "step": 6557 + }, + { + "epoch": 1.5944566010211525, + "grad_norm": 19.625, + "learning_rate": 2.5102417700712714e-07, + "loss": 0.8368, + "step": 6558 + }, + { + "epoch": 1.5946997325553123, + "grad_norm": 22.375, + "learning_rate": 2.5073376699739805e-07, + "loss": 0.7926, + "step": 6559 + }, + { + "epoch": 1.5949428640894725, + "grad_norm": 20.25, + "learning_rate": 2.504435063452652e-07, + "loss": 0.5631, + "step": 6560 + }, + { + "epoch": 1.5951859956236323, + "grad_norm": 18.5, + "learning_rate": 2.501533950941136e-07, + "loss": 0.7315, + "step": 6561 + }, + { + "epoch": 1.5954291271577925, + "grad_norm": 19.625, + "learning_rate": 2.498634332873047e-07, + "loss": 0.7402, + "step": 6562 + }, + { + "epoch": 1.5956722586919523, + "grad_norm": 14.1875, + "learning_rate": 2.49573620968179e-07, + "loss": 0.3719, + "step": 6563 + }, + { + "epoch": 1.5959153902261123, + "grad_norm": 21.875, + "learning_rate": 2.4928395818005324e-07, + "loss": 0.8335, + "step": 6564 + }, + { + "epoch": 1.5961585217602723, + "grad_norm": 21.875, + "learning_rate": 2.489944449662232e-07, + "loss": 1.0901, + "step": 6565 + }, + { + "epoch": 1.5964016532944323, + "grad_norm": 11.5, + "learning_rate": 2.487050813699614e-07, + "loss": 0.2278, + "step": 6566 + }, + { + "epoch": 1.5966447848285923, + "grad_norm": 15.9375, + "learning_rate": 2.484158674345179e-07, + "loss": 0.4144, + "step": 6567 + }, + { + "epoch": 1.5968879163627523, + "grad_norm": 22.875, + "learning_rate": 2.481268032031212e-07, + "loss": 0.89, + "step": 6568 + }, + { + "epoch": 1.5971310478969123, + "grad_norm": 24.625, + "learning_rate": 2.4783788871897654e-07, + "loss": 0.6295, + "step": 6569 + }, + { + "epoch": 1.597374179431072, + "grad_norm": 22.375, + "learning_rate": 2.4754912402526727e-07, + "loss": 0.511, + "step": 6570 + }, + { + "epoch": 1.5976173109652323, + "grad_norm": 31.875, + "learning_rate": 2.472605091651542e-07, + "loss": 0.7927, + "step": 6571 + }, + { + "epoch": 1.597860442499392, + "grad_norm": 17.875, + "learning_rate": 2.4697204418177634e-07, + "loss": 0.5545, + "step": 6572 + }, + { + "epoch": 1.5981035740335523, + "grad_norm": 22.875, + "learning_rate": 2.466837291182493e-07, + "loss": 1.0362, + "step": 6573 + }, + { + "epoch": 1.598346705567712, + "grad_norm": 16.625, + "learning_rate": 2.4639556401766655e-07, + "loss": 0.2959, + "step": 6574 + }, + { + "epoch": 1.598589837101872, + "grad_norm": 22.125, + "learning_rate": 2.461075489230999e-07, + "loss": 0.4179, + "step": 6575 + }, + { + "epoch": 1.598832968636032, + "grad_norm": 17.125, + "learning_rate": 2.4581968387759795e-07, + "loss": 0.7986, + "step": 6576 + }, + { + "epoch": 1.599076100170192, + "grad_norm": 14.6875, + "learning_rate": 2.455319689241867e-07, + "loss": 0.3135, + "step": 6577 + }, + { + "epoch": 1.599319231704352, + "grad_norm": 24.25, + "learning_rate": 2.4524440410587047e-07, + "loss": 0.906, + "step": 6578 + }, + { + "epoch": 1.599562363238512, + "grad_norm": 22.0, + "learning_rate": 2.4495698946563103e-07, + "loss": 0.6496, + "step": 6579 + }, + { + "epoch": 1.599805494772672, + "grad_norm": 17.25, + "learning_rate": 2.4466972504642724e-07, + "loss": 0.8979, + "step": 6580 + }, + { + "epoch": 1.6000486263068319, + "grad_norm": 17.875, + "learning_rate": 2.4438261089119535e-07, + "loss": 0.7324, + "step": 6581 + }, + { + "epoch": 1.600291757840992, + "grad_norm": 15.5, + "learning_rate": 2.4409564704285013e-07, + "loss": 0.476, + "step": 6582 + }, + { + "epoch": 1.6005348893751519, + "grad_norm": 14.1875, + "learning_rate": 2.4380883354428316e-07, + "loss": 0.3257, + "step": 6583 + }, + { + "epoch": 1.600778020909312, + "grad_norm": 19.875, + "learning_rate": 2.435221704383632e-07, + "loss": 0.7453, + "step": 6584 + }, + { + "epoch": 1.6010211524434719, + "grad_norm": 15.625, + "learning_rate": 2.4323565776793735e-07, + "loss": 0.353, + "step": 6585 + }, + { + "epoch": 1.6012642839776319, + "grad_norm": 17.375, + "learning_rate": 2.429492955758302e-07, + "loss": 0.5536, + "step": 6586 + }, + { + "epoch": 1.6015074155117919, + "grad_norm": 24.0, + "learning_rate": 2.4266308390484315e-07, + "loss": 0.8279, + "step": 6587 + }, + { + "epoch": 1.6017505470459519, + "grad_norm": 23.375, + "learning_rate": 2.423770227977551e-07, + "loss": 0.6825, + "step": 6588 + }, + { + "epoch": 1.6019936785801119, + "grad_norm": 20.125, + "learning_rate": 2.4209111229732363e-07, + "loss": 0.6046, + "step": 6589 + }, + { + "epoch": 1.6022368101142717, + "grad_norm": 21.375, + "learning_rate": 2.4180535244628265e-07, + "loss": 0.7474, + "step": 6590 + }, + { + "epoch": 1.6024799416484319, + "grad_norm": 22.125, + "learning_rate": 2.415197432873437e-07, + "loss": 0.8632, + "step": 6591 + }, + { + "epoch": 1.6027230731825917, + "grad_norm": 15.8125, + "learning_rate": 2.4123428486319627e-07, + "loss": 0.4441, + "step": 6592 + }, + { + "epoch": 1.6029662047167519, + "grad_norm": 17.5, + "learning_rate": 2.4094897721650675e-07, + "loss": 0.6094, + "step": 6593 + }, + { + "epoch": 1.6032093362509117, + "grad_norm": 23.125, + "learning_rate": 2.4066382038991973e-07, + "loss": 0.7961, + "step": 6594 + }, + { + "epoch": 1.6034524677850719, + "grad_norm": 19.0, + "learning_rate": 2.4037881442605633e-07, + "loss": 0.5949, + "step": 6595 + }, + { + "epoch": 1.6036955993192317, + "grad_norm": 18.625, + "learning_rate": 2.400939593675161e-07, + "loss": 0.7009, + "step": 6596 + }, + { + "epoch": 1.6039387308533917, + "grad_norm": 20.875, + "learning_rate": 2.398092552568755e-07, + "loss": 0.833, + "step": 6597 + }, + { + "epoch": 1.6041818623875517, + "grad_norm": 15.0, + "learning_rate": 2.3952470213668785e-07, + "loss": 0.3738, + "step": 6598 + }, + { + "epoch": 1.6044249939217117, + "grad_norm": 18.625, + "learning_rate": 2.392403000494853e-07, + "loss": 0.8037, + "step": 6599 + }, + { + "epoch": 1.6046681254558717, + "grad_norm": 19.875, + "learning_rate": 2.389560490377764e-07, + "loss": 0.6583, + "step": 6600 + }, + { + "epoch": 1.6049112569900315, + "grad_norm": 21.375, + "learning_rate": 2.38671949144047e-07, + "loss": 0.7524, + "step": 6601 + }, + { + "epoch": 1.6051543885241917, + "grad_norm": 18.375, + "learning_rate": 2.3838800041076096e-07, + "loss": 0.5015, + "step": 6602 + }, + { + "epoch": 1.6053975200583515, + "grad_norm": 17.25, + "learning_rate": 2.3810420288035971e-07, + "loss": 0.4567, + "step": 6603 + }, + { + "epoch": 1.6056406515925117, + "grad_norm": 20.5, + "learning_rate": 2.378205565952614e-07, + "loss": 0.6397, + "step": 6604 + }, + { + "epoch": 1.6058837831266715, + "grad_norm": 21.625, + "learning_rate": 2.3753706159786158e-07, + "loss": 0.7376, + "step": 6605 + }, + { + "epoch": 1.6061269146608315, + "grad_norm": 92.0, + "learning_rate": 2.372537179305341e-07, + "loss": 1.0405, + "step": 6606 + }, + { + "epoch": 1.6063700461949915, + "grad_norm": 20.625, + "learning_rate": 2.3697052563562918e-07, + "loss": 0.8989, + "step": 6607 + }, + { + "epoch": 1.6066131777291515, + "grad_norm": 21.375, + "learning_rate": 2.366874847554744e-07, + "loss": 0.7002, + "step": 6608 + }, + { + "epoch": 1.6068563092633115, + "grad_norm": 23.375, + "learning_rate": 2.3640459533237556e-07, + "loss": 0.8436, + "step": 6609 + }, + { + "epoch": 1.6070994407974715, + "grad_norm": 25.125, + "learning_rate": 2.3612185740861577e-07, + "loss": 0.911, + "step": 6610 + }, + { + "epoch": 1.6073425723316315, + "grad_norm": 32.25, + "learning_rate": 2.3583927102645467e-07, + "loss": 1.0438, + "step": 6611 + }, + { + "epoch": 1.6075857038657912, + "grad_norm": 22.625, + "learning_rate": 2.3555683622812922e-07, + "loss": 0.7154, + "step": 6612 + }, + { + "epoch": 1.6078288353999515, + "grad_norm": 16.125, + "learning_rate": 2.3527455305585506e-07, + "loss": 0.3308, + "step": 6613 + }, + { + "epoch": 1.6080719669341113, + "grad_norm": 19.875, + "learning_rate": 2.3499242155182387e-07, + "loss": 0.7239, + "step": 6614 + }, + { + "epoch": 1.6083150984682715, + "grad_norm": 24.625, + "learning_rate": 2.3471044175820468e-07, + "loss": 0.6991, + "step": 6615 + }, + { + "epoch": 1.6085582300024313, + "grad_norm": 14.875, + "learning_rate": 2.3442861371714477e-07, + "loss": 0.3567, + "step": 6616 + }, + { + "epoch": 1.6088013615365913, + "grad_norm": 35.5, + "learning_rate": 2.341469374707682e-07, + "loss": 0.6461, + "step": 6617 + }, + { + "epoch": 1.6090444930707513, + "grad_norm": 20.25, + "learning_rate": 2.338654130611763e-07, + "loss": 1.3156, + "step": 6618 + }, + { + "epoch": 1.6092876246049113, + "grad_norm": 21.0, + "learning_rate": 2.3358404053044736e-07, + "loss": 0.7001, + "step": 6619 + }, + { + "epoch": 1.6095307561390713, + "grad_norm": 13.0625, + "learning_rate": 2.3330281992063803e-07, + "loss": 0.381, + "step": 6620 + }, + { + "epoch": 1.6097738876732313, + "grad_norm": 22.625, + "learning_rate": 2.330217512737812e-07, + "loss": 1.0427, + "step": 6621 + }, + { + "epoch": 1.6100170192073913, + "grad_norm": 21.75, + "learning_rate": 2.3274083463188712e-07, + "loss": 0.6807, + "step": 6622 + }, + { + "epoch": 1.610260150741551, + "grad_norm": 21.75, + "learning_rate": 2.324600700369442e-07, + "loss": 0.5963, + "step": 6623 + }, + { + "epoch": 1.6105032822757113, + "grad_norm": 20.625, + "learning_rate": 2.321794575309172e-07, + "loss": 0.7102, + "step": 6624 + }, + { + "epoch": 1.610746413809871, + "grad_norm": 31.625, + "learning_rate": 2.3189899715574892e-07, + "loss": 0.6249, + "step": 6625 + }, + { + "epoch": 1.6109895453440313, + "grad_norm": 22.625, + "learning_rate": 2.3161868895335838e-07, + "loss": 0.6783, + "step": 6626 + }, + { + "epoch": 1.611232676878191, + "grad_norm": 24.125, + "learning_rate": 2.3133853296564313e-07, + "loss": 1.0236, + "step": 6627 + }, + { + "epoch": 1.611475808412351, + "grad_norm": 21.75, + "learning_rate": 2.3105852923447692e-07, + "loss": 1.2369, + "step": 6628 + }, + { + "epoch": 1.611718939946511, + "grad_norm": 23.875, + "learning_rate": 2.3077867780171112e-07, + "loss": 0.6579, + "step": 6629 + }, + { + "epoch": 1.611962071480671, + "grad_norm": 22.125, + "learning_rate": 2.3049897870917466e-07, + "loss": 0.6771, + "step": 6630 + }, + { + "epoch": 1.612205203014831, + "grad_norm": 21.25, + "learning_rate": 2.3021943199867295e-07, + "loss": 0.7821, + "step": 6631 + }, + { + "epoch": 1.612448334548991, + "grad_norm": 24.25, + "learning_rate": 2.2994003771198967e-07, + "loss": 0.9348, + "step": 6632 + }, + { + "epoch": 1.612691466083151, + "grad_norm": 25.375, + "learning_rate": 2.2966079589088446e-07, + "loss": 0.7367, + "step": 6633 + }, + { + "epoch": 1.6129345976173108, + "grad_norm": 12.375, + "learning_rate": 2.2938170657709562e-07, + "loss": 0.2971, + "step": 6634 + }, + { + "epoch": 1.613177729151471, + "grad_norm": 20.375, + "learning_rate": 2.2910276981233728e-07, + "loss": 1.1531, + "step": 6635 + }, + { + "epoch": 1.6134208606856308, + "grad_norm": 20.125, + "learning_rate": 2.2882398563830122e-07, + "loss": 0.6736, + "step": 6636 + }, + { + "epoch": 1.613663992219791, + "grad_norm": 21.625, + "learning_rate": 2.2854535409665715e-07, + "loss": 0.5975, + "step": 6637 + }, + { + "epoch": 1.6139071237539508, + "grad_norm": 18.625, + "learning_rate": 2.2826687522905096e-07, + "loss": 0.6806, + "step": 6638 + }, + { + "epoch": 1.6141502552881108, + "grad_norm": 24.125, + "learning_rate": 2.2798854907710596e-07, + "loss": 0.7261, + "step": 6639 + }, + { + "epoch": 1.6143933868222708, + "grad_norm": 19.5, + "learning_rate": 2.277103756824231e-07, + "loss": 0.9476, + "step": 6640 + }, + { + "epoch": 1.6146365183564308, + "grad_norm": 23.5, + "learning_rate": 2.274323550865805e-07, + "loss": 1.0675, + "step": 6641 + }, + { + "epoch": 1.6148796498905909, + "grad_norm": 18.5, + "learning_rate": 2.2715448733113264e-07, + "loss": 0.9203, + "step": 6642 + }, + { + "epoch": 1.6151227814247506, + "grad_norm": 22.375, + "learning_rate": 2.2687677245761153e-07, + "loss": 0.5243, + "step": 6643 + }, + { + "epoch": 1.6153659129589109, + "grad_norm": 21.5, + "learning_rate": 2.2659921050752708e-07, + "loss": 0.8116, + "step": 6644 + }, + { + "epoch": 1.6156090444930706, + "grad_norm": 20.25, + "learning_rate": 2.2632180152236532e-07, + "loss": 0.6487, + "step": 6645 + }, + { + "epoch": 1.6158521760272309, + "grad_norm": 19.375, + "learning_rate": 2.260445455435896e-07, + "loss": 0.6804, + "step": 6646 + }, + { + "epoch": 1.6160953075613906, + "grad_norm": 19.5, + "learning_rate": 2.2576744261264077e-07, + "loss": 0.5433, + "step": 6647 + }, + { + "epoch": 1.6163384390955506, + "grad_norm": 18.25, + "learning_rate": 2.2549049277093712e-07, + "loss": 0.8044, + "step": 6648 + }, + { + "epoch": 1.6165815706297106, + "grad_norm": 16.75, + "learning_rate": 2.2521369605987332e-07, + "loss": 0.2774, + "step": 6649 + }, + { + "epoch": 1.6168247021638706, + "grad_norm": 24.875, + "learning_rate": 2.2493705252082081e-07, + "loss": 0.7089, + "step": 6650 + }, + { + "epoch": 1.6170678336980306, + "grad_norm": 13.5, + "learning_rate": 2.2466056219512976e-07, + "loss": 0.2571, + "step": 6651 + }, + { + "epoch": 1.6173109652321906, + "grad_norm": 16.125, + "learning_rate": 2.2438422512412572e-07, + "loss": 0.6296, + "step": 6652 + }, + { + "epoch": 1.6175540967663506, + "grad_norm": 27.25, + "learning_rate": 2.2410804134911201e-07, + "loss": 0.9578, + "step": 6653 + }, + { + "epoch": 1.6177972283005104, + "grad_norm": 21.75, + "learning_rate": 2.2383201091136965e-07, + "loss": 0.8257, + "step": 6654 + }, + { + "epoch": 1.6180403598346706, + "grad_norm": 17.5, + "learning_rate": 2.2355613385215538e-07, + "loss": 0.6513, + "step": 6655 + }, + { + "epoch": 1.6182834913688304, + "grad_norm": 17.0, + "learning_rate": 2.2328041021270467e-07, + "loss": 0.739, + "step": 6656 + }, + { + "epoch": 1.6185266229029907, + "grad_norm": 30.875, + "learning_rate": 2.230048400342283e-07, + "loss": 1.4384, + "step": 6657 + }, + { + "epoch": 1.6187697544371504, + "grad_norm": 26.875, + "learning_rate": 2.2272942335791566e-07, + "loss": 0.8301, + "step": 6658 + }, + { + "epoch": 1.6190128859713104, + "grad_norm": 21.75, + "learning_rate": 2.2245416022493236e-07, + "loss": 0.6196, + "step": 6659 + }, + { + "epoch": 1.6192560175054704, + "grad_norm": 16.625, + "learning_rate": 2.2217905067642083e-07, + "loss": 0.9399, + "step": 6660 + }, + { + "epoch": 1.6194991490396304, + "grad_norm": 15.8125, + "learning_rate": 2.2190409475350158e-07, + "loss": 0.5162, + "step": 6661 + }, + { + "epoch": 1.6197422805737904, + "grad_norm": 18.25, + "learning_rate": 2.2162929249727087e-07, + "loss": 0.7829, + "step": 6662 + }, + { + "epoch": 1.6199854121079504, + "grad_norm": 20.125, + "learning_rate": 2.213546439488033e-07, + "loss": 0.7781, + "step": 6663 + }, + { + "epoch": 1.6202285436421104, + "grad_norm": 16.5, + "learning_rate": 2.210801491491492e-07, + "loss": 0.3432, + "step": 6664 + }, + { + "epoch": 1.6204716751762702, + "grad_norm": 18.75, + "learning_rate": 2.2080580813933717e-07, + "loss": 0.717, + "step": 6665 + }, + { + "epoch": 1.6207148067104304, + "grad_norm": 18.375, + "learning_rate": 2.205316209603718e-07, + "loss": 0.6753, + "step": 6666 + }, + { + "epoch": 1.6209579382445902, + "grad_norm": 23.25, + "learning_rate": 2.2025758765323506e-07, + "loss": 0.7017, + "step": 6667 + }, + { + "epoch": 1.6212010697787504, + "grad_norm": 20.125, + "learning_rate": 2.199837082588864e-07, + "loss": 0.6525, + "step": 6668 + }, + { + "epoch": 1.6214442013129102, + "grad_norm": 19.375, + "learning_rate": 2.197099828182614e-07, + "loss": 0.3315, + "step": 6669 + }, + { + "epoch": 1.6216873328470702, + "grad_norm": 141.0, + "learning_rate": 2.19436411372273e-07, + "loss": 0.8343, + "step": 6670 + }, + { + "epoch": 1.6219304643812302, + "grad_norm": 22.5, + "learning_rate": 2.1916299396181146e-07, + "loss": 1.3092, + "step": 6671 + }, + { + "epoch": 1.6221735959153902, + "grad_norm": 17.0, + "learning_rate": 2.1888973062774376e-07, + "loss": 0.4956, + "step": 6672 + }, + { + "epoch": 1.6224167274495502, + "grad_norm": 16.375, + "learning_rate": 2.186166214109138e-07, + "loss": 0.4634, + "step": 6673 + }, + { + "epoch": 1.6226598589837102, + "grad_norm": 19.875, + "learning_rate": 2.183436663521421e-07, + "loss": 0.6935, + "step": 6674 + }, + { + "epoch": 1.6229029905178702, + "grad_norm": 19.375, + "learning_rate": 2.18070865492227e-07, + "loss": 0.7186, + "step": 6675 + }, + { + "epoch": 1.62314612205203, + "grad_norm": 24.75, + "learning_rate": 2.1779821887194323e-07, + "loss": 1.0167, + "step": 6676 + }, + { + "epoch": 1.6233892535861902, + "grad_norm": 28.875, + "learning_rate": 2.1752572653204198e-07, + "loss": 1.1666, + "step": 6677 + }, + { + "epoch": 1.62363238512035, + "grad_norm": 20.875, + "learning_rate": 2.172533885132523e-07, + "loss": 0.7182, + "step": 6678 + }, + { + "epoch": 1.6238755166545102, + "grad_norm": 27.625, + "learning_rate": 2.1698120485628027e-07, + "loss": 1.1638, + "step": 6679 + }, + { + "epoch": 1.62411864818867, + "grad_norm": 31.875, + "learning_rate": 2.1670917560180805e-07, + "loss": 0.8286, + "step": 6680 + }, + { + "epoch": 1.62436177972283, + "grad_norm": 30.875, + "learning_rate": 2.1643730079049463e-07, + "loss": 1.0799, + "step": 6681 + }, + { + "epoch": 1.62460491125699, + "grad_norm": 23.0, + "learning_rate": 2.1616558046297724e-07, + "loss": 0.9894, + "step": 6682 + }, + { + "epoch": 1.62484804279115, + "grad_norm": 14.6875, + "learning_rate": 2.1589401465986883e-07, + "loss": 0.1656, + "step": 6683 + }, + { + "epoch": 1.62509117432531, + "grad_norm": 27.875, + "learning_rate": 2.1562260342175913e-07, + "loss": 0.8089, + "step": 6684 + }, + { + "epoch": 1.6253343058594698, + "grad_norm": 22.125, + "learning_rate": 2.1535134678921585e-07, + "loss": 0.9627, + "step": 6685 + }, + { + "epoch": 1.62557743739363, + "grad_norm": 21.5, + "learning_rate": 2.1508024480278292e-07, + "loss": 0.7536, + "step": 6686 + }, + { + "epoch": 1.6258205689277898, + "grad_norm": 21.375, + "learning_rate": 2.1480929750298126e-07, + "loss": 0.5848, + "step": 6687 + }, + { + "epoch": 1.62606370046195, + "grad_norm": 23.0, + "learning_rate": 2.1453850493030795e-07, + "loss": 1.0846, + "step": 6688 + }, + { + "epoch": 1.6263068319961098, + "grad_norm": 16.375, + "learning_rate": 2.142678671252385e-07, + "loss": 0.8118, + "step": 6689 + }, + { + "epoch": 1.62654996353027, + "grad_norm": 21.375, + "learning_rate": 2.1399738412822406e-07, + "loss": 0.5555, + "step": 6690 + }, + { + "epoch": 1.6267930950644298, + "grad_norm": 26.0, + "learning_rate": 2.1372705597969266e-07, + "loss": 1.2868, + "step": 6691 + }, + { + "epoch": 1.6270362265985898, + "grad_norm": 22.625, + "learning_rate": 2.1345688272005005e-07, + "loss": 0.6446, + "step": 6692 + }, + { + "epoch": 1.6272793581327498, + "grad_norm": 21.125, + "learning_rate": 2.131868643896777e-07, + "loss": 0.5819, + "step": 6693 + }, + { + "epoch": 1.6275224896669098, + "grad_norm": 22.125, + "learning_rate": 2.1291700102893526e-07, + "loss": 0.6422, + "step": 6694 + }, + { + "epoch": 1.6277656212010698, + "grad_norm": 20.625, + "learning_rate": 2.1264729267815767e-07, + "loss": 0.6746, + "step": 6695 + }, + { + "epoch": 1.6280087527352296, + "grad_norm": 18.0, + "learning_rate": 2.1237773937765827e-07, + "loss": 0.5748, + "step": 6696 + }, + { + "epoch": 1.6282518842693898, + "grad_norm": 18.375, + "learning_rate": 2.1210834116772605e-07, + "loss": 0.6676, + "step": 6697 + }, + { + "epoch": 1.6284950158035496, + "grad_norm": 19.375, + "learning_rate": 2.1183909808862686e-07, + "loss": 0.6756, + "step": 6698 + }, + { + "epoch": 1.6287381473377098, + "grad_norm": 30.875, + "learning_rate": 2.1157001018060452e-07, + "loss": 0.7719, + "step": 6699 + }, + { + "epoch": 1.6289812788718696, + "grad_norm": 15.75, + "learning_rate": 2.1130107748387849e-07, + "loss": 0.7149, + "step": 6700 + }, + { + "epoch": 1.6292244104060296, + "grad_norm": 18.25, + "learning_rate": 2.1103230003864467e-07, + "loss": 0.7606, + "step": 6701 + }, + { + "epoch": 1.6294675419401896, + "grad_norm": 28.875, + "learning_rate": 2.1076367788507787e-07, + "loss": 0.666, + "step": 6702 + }, + { + "epoch": 1.6297106734743496, + "grad_norm": 38.0, + "learning_rate": 2.1049521106332752e-07, + "loss": 0.7115, + "step": 6703 + }, + { + "epoch": 1.6299538050085096, + "grad_norm": 22.0, + "learning_rate": 2.1022689961352094e-07, + "loss": 1.1008, + "step": 6704 + }, + { + "epoch": 1.6301969365426696, + "grad_norm": 19.625, + "learning_rate": 2.0995874357576136e-07, + "loss": 0.5102, + "step": 6705 + }, + { + "epoch": 1.6304400680768296, + "grad_norm": 31.75, + "learning_rate": 2.0969074299012986e-07, + "loss": 0.5456, + "step": 6706 + }, + { + "epoch": 1.6306831996109894, + "grad_norm": 20.875, + "learning_rate": 2.0942289789668377e-07, + "loss": 0.8614, + "step": 6707 + }, + { + "epoch": 1.6309263311451496, + "grad_norm": 16.875, + "learning_rate": 2.0915520833545658e-07, + "loss": 0.6767, + "step": 6708 + }, + { + "epoch": 1.6311694626793094, + "grad_norm": 22.125, + "learning_rate": 2.088876743464595e-07, + "loss": 0.5842, + "step": 6709 + }, + { + "epoch": 1.6314125942134696, + "grad_norm": 15.375, + "learning_rate": 2.0862029596968036e-07, + "loss": 0.3612, + "step": 6710 + }, + { + "epoch": 1.6316557257476294, + "grad_norm": 27.0, + "learning_rate": 2.0835307324508336e-07, + "loss": 0.5747, + "step": 6711 + }, + { + "epoch": 1.6318988572817894, + "grad_norm": 23.625, + "learning_rate": 2.0808600621260916e-07, + "loss": 1.1054, + "step": 6712 + }, + { + "epoch": 1.6321419888159494, + "grad_norm": 27.5, + "learning_rate": 2.0781909491217596e-07, + "loss": 0.7226, + "step": 6713 + }, + { + "epoch": 1.6323851203501094, + "grad_norm": 18.375, + "learning_rate": 2.0755233938367815e-07, + "loss": 0.3595, + "step": 6714 + }, + { + "epoch": 1.6326282518842694, + "grad_norm": 20.75, + "learning_rate": 2.0728573966698667e-07, + "loss": 0.9376, + "step": 6715 + }, + { + "epoch": 1.6328713834184294, + "grad_norm": 19.375, + "learning_rate": 2.0701929580194972e-07, + "loss": 0.7214, + "step": 6716 + }, + { + "epoch": 1.6331145149525894, + "grad_norm": 23.75, + "learning_rate": 2.0675300782839218e-07, + "loss": 0.9348, + "step": 6717 + }, + { + "epoch": 1.6333576464867492, + "grad_norm": 15.4375, + "learning_rate": 2.064868757861148e-07, + "loss": 0.516, + "step": 6718 + }, + { + "epoch": 1.6336007780209094, + "grad_norm": 23.25, + "learning_rate": 2.0622089971489624e-07, + "loss": 0.9958, + "step": 6719 + }, + { + "epoch": 1.6338439095550692, + "grad_norm": 19.125, + "learning_rate": 2.0595507965449083e-07, + "loss": 0.6534, + "step": 6720 + }, + { + "epoch": 1.6340870410892294, + "grad_norm": 19.375, + "learning_rate": 2.0568941564463014e-07, + "loss": 0.637, + "step": 6721 + }, + { + "epoch": 1.6343301726233892, + "grad_norm": 20.75, + "learning_rate": 2.0542390772502183e-07, + "loss": 0.737, + "step": 6722 + }, + { + "epoch": 1.6345733041575492, + "grad_norm": 19.625, + "learning_rate": 2.0515855593535124e-07, + "loss": 0.5851, + "step": 6723 + }, + { + "epoch": 1.6348164356917092, + "grad_norm": 23.25, + "learning_rate": 2.048933603152793e-07, + "loss": 1.1741, + "step": 6724 + }, + { + "epoch": 1.6350595672258692, + "grad_norm": 19.0, + "learning_rate": 2.0462832090444417e-07, + "loss": 0.5789, + "step": 6725 + }, + { + "epoch": 1.6353026987600292, + "grad_norm": 20.625, + "learning_rate": 2.0436343774246094e-07, + "loss": 0.5312, + "step": 6726 + }, + { + "epoch": 1.6355458302941892, + "grad_norm": 18.875, + "learning_rate": 2.0409871086892087e-07, + "loss": 0.6926, + "step": 6727 + }, + { + "epoch": 1.6357889618283492, + "grad_norm": 19.875, + "learning_rate": 2.0383414032339162e-07, + "loss": 0.6181, + "step": 6728 + }, + { + "epoch": 1.636032093362509, + "grad_norm": 22.625, + "learning_rate": 2.0356972614541782e-07, + "loss": 0.7802, + "step": 6729 + }, + { + "epoch": 1.6362752248966692, + "grad_norm": 29.375, + "learning_rate": 2.033054683745213e-07, + "loss": 0.7724, + "step": 6730 + }, + { + "epoch": 1.636518356430829, + "grad_norm": 22.375, + "learning_rate": 2.0304136705019933e-07, + "loss": 0.9762, + "step": 6731 + }, + { + "epoch": 1.6367614879649892, + "grad_norm": 20.0, + "learning_rate": 2.0277742221192623e-07, + "loss": 0.9337, + "step": 6732 + }, + { + "epoch": 1.637004619499149, + "grad_norm": 18.25, + "learning_rate": 2.0251363389915414e-07, + "loss": 0.831, + "step": 6733 + }, + { + "epoch": 1.637247751033309, + "grad_norm": 17.5, + "learning_rate": 2.022500021513102e-07, + "loss": 0.8069, + "step": 6734 + }, + { + "epoch": 1.637490882567469, + "grad_norm": 27.125, + "learning_rate": 2.0198652700779823e-07, + "loss": 0.8059, + "step": 6735 + }, + { + "epoch": 1.637734014101629, + "grad_norm": 22.125, + "learning_rate": 2.017232085080001e-07, + "loss": 0.4712, + "step": 6736 + }, + { + "epoch": 1.637977145635789, + "grad_norm": 20.5, + "learning_rate": 2.0146004669127279e-07, + "loss": 0.8741, + "step": 6737 + }, + { + "epoch": 1.6382202771699488, + "grad_norm": 20.375, + "learning_rate": 2.0119704159695038e-07, + "loss": 0.4799, + "step": 6738 + }, + { + "epoch": 1.638463408704109, + "grad_norm": 20.0, + "learning_rate": 2.0093419326434326e-07, + "loss": 0.6843, + "step": 6739 + }, + { + "epoch": 1.6387065402382688, + "grad_norm": 15.6875, + "learning_rate": 2.0067150173273888e-07, + "loss": 0.3769, + "step": 6740 + }, + { + "epoch": 1.638949671772429, + "grad_norm": 19.5, + "learning_rate": 2.0040896704140153e-07, + "loss": 0.7515, + "step": 6741 + }, + { + "epoch": 1.6391928033065888, + "grad_norm": 21.875, + "learning_rate": 2.0014658922957077e-07, + "loss": 0.4303, + "step": 6742 + }, + { + "epoch": 1.639435934840749, + "grad_norm": 34.5, + "learning_rate": 1.9988436833646415e-07, + "loss": 1.0822, + "step": 6743 + }, + { + "epoch": 1.6396790663749088, + "grad_norm": 28.5, + "learning_rate": 1.9962230440127474e-07, + "loss": 1.1679, + "step": 6744 + }, + { + "epoch": 1.6399221979090688, + "grad_norm": 26.625, + "learning_rate": 1.9936039746317261e-07, + "loss": 1.0491, + "step": 6745 + }, + { + "epoch": 1.6401653294432288, + "grad_norm": 21.625, + "learning_rate": 1.9909864756130395e-07, + "loss": 0.8999, + "step": 6746 + }, + { + "epoch": 1.6404084609773888, + "grad_norm": 19.25, + "learning_rate": 1.9883705473479217e-07, + "loss": 0.5207, + "step": 6747 + }, + { + "epoch": 1.6406515925115488, + "grad_norm": 16.625, + "learning_rate": 1.985756190227371e-07, + "loss": 0.7254, + "step": 6748 + }, + { + "epoch": 1.6408947240457086, + "grad_norm": 16.625, + "learning_rate": 1.9831434046421405e-07, + "loss": 0.6691, + "step": 6749 + }, + { + "epoch": 1.6411378555798688, + "grad_norm": 18.375, + "learning_rate": 1.9805321909827645e-07, + "loss": 0.6687, + "step": 6750 + }, + { + "epoch": 1.6413809871140286, + "grad_norm": 19.875, + "learning_rate": 1.9779225496395298e-07, + "loss": 0.7149, + "step": 6751 + }, + { + "epoch": 1.6416241186481888, + "grad_norm": 20.0, + "learning_rate": 1.9753144810024937e-07, + "loss": 1.0038, + "step": 6752 + }, + { + "epoch": 1.6418672501823486, + "grad_norm": 20.625, + "learning_rate": 1.9727079854614723e-07, + "loss": 0.7706, + "step": 6753 + }, + { + "epoch": 1.6421103817165086, + "grad_norm": 15.5625, + "learning_rate": 1.9701030634060578e-07, + "loss": 0.4311, + "step": 6754 + }, + { + "epoch": 1.6423535132506686, + "grad_norm": 18.125, + "learning_rate": 1.9674997152255944e-07, + "loss": 0.4856, + "step": 6755 + }, + { + "epoch": 1.6425966447848286, + "grad_norm": 20.125, + "learning_rate": 1.9648979413092017e-07, + "loss": 0.6617, + "step": 6756 + }, + { + "epoch": 1.6428397763189886, + "grad_norm": 18.625, + "learning_rate": 1.9622977420457628e-07, + "loss": 0.5935, + "step": 6757 + }, + { + "epoch": 1.6430829078531486, + "grad_norm": 25.0, + "learning_rate": 1.9596991178239183e-07, + "loss": 0.6934, + "step": 6758 + }, + { + "epoch": 1.6433260393873086, + "grad_norm": 22.125, + "learning_rate": 1.9571020690320754e-07, + "loss": 0.8906, + "step": 6759 + }, + { + "epoch": 1.6435691709214684, + "grad_norm": 20.375, + "learning_rate": 1.9545065960584127e-07, + "loss": 1.0914, + "step": 6760 + }, + { + "epoch": 1.6438123024556286, + "grad_norm": 19.125, + "learning_rate": 1.951912699290867e-07, + "loss": 0.7872, + "step": 6761 + }, + { + "epoch": 1.6440554339897884, + "grad_norm": 20.75, + "learning_rate": 1.9493203791171393e-07, + "loss": 0.5704, + "step": 6762 + }, + { + "epoch": 1.6442985655239486, + "grad_norm": 22.75, + "learning_rate": 1.946729635924692e-07, + "loss": 0.988, + "step": 6763 + }, + { + "epoch": 1.6445416970581084, + "grad_norm": 21.5, + "learning_rate": 1.944140470100768e-07, + "loss": 0.6622, + "step": 6764 + }, + { + "epoch": 1.6447848285922684, + "grad_norm": 19.125, + "learning_rate": 1.9415528820323562e-07, + "loss": 0.744, + "step": 6765 + }, + { + "epoch": 1.6450279601264284, + "grad_norm": 24.25, + "learning_rate": 1.9389668721062142e-07, + "loss": 0.609, + "step": 6766 + }, + { + "epoch": 1.6452710916605884, + "grad_norm": 20.625, + "learning_rate": 1.9363824407088714e-07, + "loss": 0.7015, + "step": 6767 + }, + { + "epoch": 1.6455142231947484, + "grad_norm": 19.875, + "learning_rate": 1.9337995882266133e-07, + "loss": 0.5253, + "step": 6768 + }, + { + "epoch": 1.6457573547289084, + "grad_norm": 21.625, + "learning_rate": 1.931218315045491e-07, + "loss": 0.5222, + "step": 6769 + }, + { + "epoch": 1.6460004862630684, + "grad_norm": 20.125, + "learning_rate": 1.9286386215513139e-07, + "loss": 0.733, + "step": 6770 + }, + { + "epoch": 1.6462436177972282, + "grad_norm": 23.75, + "learning_rate": 1.9260605081296763e-07, + "loss": 0.6605, + "step": 6771 + }, + { + "epoch": 1.6464867493313884, + "grad_norm": 19.875, + "learning_rate": 1.923483975165913e-07, + "loss": 0.8826, + "step": 6772 + }, + { + "epoch": 1.6467298808655482, + "grad_norm": 18.875, + "learning_rate": 1.9209090230451283e-07, + "loss": 0.7168, + "step": 6773 + }, + { + "epoch": 1.6469730123997084, + "grad_norm": 19.25, + "learning_rate": 1.918335652152202e-07, + "loss": 0.823, + "step": 6774 + }, + { + "epoch": 1.6472161439338682, + "grad_norm": 20.75, + "learning_rate": 1.915763862871764e-07, + "loss": 0.7573, + "step": 6775 + }, + { + "epoch": 1.6474592754680282, + "grad_norm": 18.5, + "learning_rate": 1.9131936555882093e-07, + "loss": 0.4875, + "step": 6776 + }, + { + "epoch": 1.6477024070021882, + "grad_norm": 24.375, + "learning_rate": 1.910625030685706e-07, + "loss": 0.6212, + "step": 6777 + }, + { + "epoch": 1.6479455385363482, + "grad_norm": 19.125, + "learning_rate": 1.9080579885481745e-07, + "loss": 0.6861, + "step": 6778 + }, + { + "epoch": 1.6481886700705082, + "grad_norm": 23.5, + "learning_rate": 1.905492529559308e-07, + "loss": 1.1977, + "step": 6779 + }, + { + "epoch": 1.6484318016046682, + "grad_norm": 20.375, + "learning_rate": 1.902928654102554e-07, + "loss": 0.4884, + "step": 6780 + }, + { + "epoch": 1.6486749331388282, + "grad_norm": 20.0, + "learning_rate": 1.900366362561132e-07, + "loss": 0.6032, + "step": 6781 + }, + { + "epoch": 1.648918064672988, + "grad_norm": 19.125, + "learning_rate": 1.8978056553180205e-07, + "loss": 0.4183, + "step": 6782 + }, + { + "epoch": 1.6491611962071482, + "grad_norm": 15.25, + "learning_rate": 1.8952465327559565e-07, + "loss": 0.4614, + "step": 6783 + }, + { + "epoch": 1.649404327741308, + "grad_norm": 22.125, + "learning_rate": 1.8926889952574523e-07, + "loss": 0.8141, + "step": 6784 + }, + { + "epoch": 1.6496474592754682, + "grad_norm": 15.8125, + "learning_rate": 1.8901330432047713e-07, + "loss": 0.3837, + "step": 6785 + }, + { + "epoch": 1.649890590809628, + "grad_norm": 20.125, + "learning_rate": 1.887578676979944e-07, + "loss": 0.6217, + "step": 6786 + }, + { + "epoch": 1.650133722343788, + "grad_norm": 18.625, + "learning_rate": 1.885025896964765e-07, + "loss": 0.6743, + "step": 6787 + }, + { + "epoch": 1.650376853877948, + "grad_norm": 24.0, + "learning_rate": 1.8824747035407972e-07, + "loss": 1.2319, + "step": 6788 + }, + { + "epoch": 1.650619985412108, + "grad_norm": 33.25, + "learning_rate": 1.879925097089355e-07, + "loss": 1.034, + "step": 6789 + }, + { + "epoch": 1.650863116946268, + "grad_norm": 23.875, + "learning_rate": 1.8773770779915185e-07, + "loss": 0.8589, + "step": 6790 + }, + { + "epoch": 1.6511062484804278, + "grad_norm": 24.625, + "learning_rate": 1.8748306466281411e-07, + "loss": 0.8367, + "step": 6791 + }, + { + "epoch": 1.651349380014588, + "grad_norm": 20.125, + "learning_rate": 1.8722858033798253e-07, + "loss": 0.6895, + "step": 6792 + }, + { + "epoch": 1.6515925115487478, + "grad_norm": 20.5, + "learning_rate": 1.8697425486269404e-07, + "loss": 0.7425, + "step": 6793 + }, + { + "epoch": 1.651835643082908, + "grad_norm": 22.375, + "learning_rate": 1.867200882749623e-07, + "loss": 0.8686, + "step": 6794 + }, + { + "epoch": 1.6520787746170678, + "grad_norm": 22.625, + "learning_rate": 1.8646608061277717e-07, + "loss": 0.705, + "step": 6795 + }, + { + "epoch": 1.6523219061512278, + "grad_norm": 20.125, + "learning_rate": 1.862122319141041e-07, + "loss": 0.5989, + "step": 6796 + }, + { + "epoch": 1.6525650376853878, + "grad_norm": 21.125, + "learning_rate": 1.8595854221688497e-07, + "loss": 0.685, + "step": 6797 + }, + { + "epoch": 1.6528081692195478, + "grad_norm": 22.625, + "learning_rate": 1.857050115590385e-07, + "loss": 0.6797, + "step": 6798 + }, + { + "epoch": 1.6530513007537078, + "grad_norm": 20.625, + "learning_rate": 1.8545163997845921e-07, + "loss": 0.7317, + "step": 6799 + }, + { + "epoch": 1.6532944322878678, + "grad_norm": 21.125, + "learning_rate": 1.851984275130174e-07, + "loss": 0.5705, + "step": 6800 + }, + { + "epoch": 1.6535375638220278, + "grad_norm": 15.6875, + "learning_rate": 1.8494537420056038e-07, + "loss": 0.6592, + "step": 6801 + }, + { + "epoch": 1.6537806953561875, + "grad_norm": 16.625, + "learning_rate": 1.8469248007891154e-07, + "loss": 0.5066, + "step": 6802 + }, + { + "epoch": 1.6540238268903478, + "grad_norm": 20.125, + "learning_rate": 1.844397451858701e-07, + "loss": 0.8888, + "step": 6803 + }, + { + "epoch": 1.6542669584245075, + "grad_norm": 17.5, + "learning_rate": 1.8418716955921124e-07, + "loss": 0.7053, + "step": 6804 + }, + { + "epoch": 1.6545100899586678, + "grad_norm": 17.125, + "learning_rate": 1.8393475323668739e-07, + "loss": 0.3783, + "step": 6805 + }, + { + "epoch": 1.6547532214928276, + "grad_norm": 21.875, + "learning_rate": 1.836824962560263e-07, + "loss": 1.0748, + "step": 6806 + }, + { + "epoch": 1.6549963530269876, + "grad_norm": 15.5625, + "learning_rate": 1.8343039865493184e-07, + "loss": 0.6374, + "step": 6807 + }, + { + "epoch": 1.6552394845611476, + "grad_norm": 20.0, + "learning_rate": 1.8317846047108484e-07, + "loss": 0.546, + "step": 6808 + }, + { + "epoch": 1.6554826160953076, + "grad_norm": 19.625, + "learning_rate": 1.829266817421413e-07, + "loss": 0.7105, + "step": 6809 + }, + { + "epoch": 1.6557257476294676, + "grad_norm": 25.875, + "learning_rate": 1.8267506250573441e-07, + "loss": 0.9099, + "step": 6810 + }, + { + "epoch": 1.6559688791636276, + "grad_norm": 18.5, + "learning_rate": 1.824236027994726e-07, + "loss": 0.7669, + "step": 6811 + }, + { + "epoch": 1.6562120106977876, + "grad_norm": 16.625, + "learning_rate": 1.8217230266094122e-07, + "loss": 0.8342, + "step": 6812 + }, + { + "epoch": 1.6564551422319473, + "grad_norm": 17.125, + "learning_rate": 1.8192116212770116e-07, + "loss": 0.4895, + "step": 6813 + }, + { + "epoch": 1.6566982737661076, + "grad_norm": 27.25, + "learning_rate": 1.8167018123728967e-07, + "loss": 0.4567, + "step": 6814 + }, + { + "epoch": 1.6569414053002673, + "grad_norm": 23.625, + "learning_rate": 1.8141936002722044e-07, + "loss": 0.9059, + "step": 6815 + }, + { + "epoch": 1.6571845368344276, + "grad_norm": 17.0, + "learning_rate": 1.8116869853498295e-07, + "loss": 0.5689, + "step": 6816 + }, + { + "epoch": 1.6574276683685873, + "grad_norm": 21.625, + "learning_rate": 1.8091819679804252e-07, + "loss": 0.9049, + "step": 6817 + }, + { + "epoch": 1.6576707999027473, + "grad_norm": 21.125, + "learning_rate": 1.8066785485384125e-07, + "loss": 0.7582, + "step": 6818 + }, + { + "epoch": 1.6579139314369074, + "grad_norm": 23.375, + "learning_rate": 1.804176727397973e-07, + "loss": 1.0467, + "step": 6819 + }, + { + "epoch": 1.6581570629710674, + "grad_norm": 17.5, + "learning_rate": 1.801676504933046e-07, + "loss": 0.659, + "step": 6820 + }, + { + "epoch": 1.6584001945052274, + "grad_norm": 17.75, + "learning_rate": 1.7991778815173278e-07, + "loss": 0.4853, + "step": 6821 + }, + { + "epoch": 1.6586433260393874, + "grad_norm": 19.75, + "learning_rate": 1.7966808575242883e-07, + "loss": 0.5262, + "step": 6822 + }, + { + "epoch": 1.6588864575735474, + "grad_norm": 18.25, + "learning_rate": 1.7941854333271474e-07, + "loss": 0.5657, + "step": 6823 + }, + { + "epoch": 1.6591295891077071, + "grad_norm": 17.0, + "learning_rate": 1.791691609298886e-07, + "loss": 0.4819, + "step": 6824 + }, + { + "epoch": 1.6593727206418674, + "grad_norm": 17.125, + "learning_rate": 1.7891993858122528e-07, + "loss": 0.6681, + "step": 6825 + }, + { + "epoch": 1.6596158521760271, + "grad_norm": 22.125, + "learning_rate": 1.786708763239757e-07, + "loss": 0.4846, + "step": 6826 + }, + { + "epoch": 1.6598589837101874, + "grad_norm": 22.375, + "learning_rate": 1.7842197419536605e-07, + "loss": 0.8214, + "step": 6827 + }, + { + "epoch": 1.6601021152443471, + "grad_norm": 23.125, + "learning_rate": 1.781732322325988e-07, + "loss": 0.7494, + "step": 6828 + }, + { + "epoch": 1.6603452467785071, + "grad_norm": 17.5, + "learning_rate": 1.7792465047285352e-07, + "loss": 0.724, + "step": 6829 + }, + { + "epoch": 1.6605883783126671, + "grad_norm": 23.0, + "learning_rate": 1.7767622895328448e-07, + "loss": 1.0386, + "step": 6830 + }, + { + "epoch": 1.6608315098468271, + "grad_norm": 18.0, + "learning_rate": 1.7742796771102257e-07, + "loss": 0.378, + "step": 6831 + }, + { + "epoch": 1.6610746413809871, + "grad_norm": 23.875, + "learning_rate": 1.7717986678317465e-07, + "loss": 0.6293, + "step": 6832 + }, + { + "epoch": 1.661317772915147, + "grad_norm": 18.25, + "learning_rate": 1.7693192620682433e-07, + "loss": 0.9571, + "step": 6833 + }, + { + "epoch": 1.6615609044493072, + "grad_norm": 22.75, + "learning_rate": 1.7668414601903007e-07, + "loss": 0.8777, + "step": 6834 + }, + { + "epoch": 1.661804035983467, + "grad_norm": 19.125, + "learning_rate": 1.764365262568267e-07, + "loss": 0.5456, + "step": 6835 + }, + { + "epoch": 1.6620471675176272, + "grad_norm": 20.125, + "learning_rate": 1.7618906695722597e-07, + "loss": 1.2321, + "step": 6836 + }, + { + "epoch": 1.662290299051787, + "grad_norm": 27.375, + "learning_rate": 1.759417681572144e-07, + "loss": 0.4281, + "step": 6837 + }, + { + "epoch": 1.6625334305859472, + "grad_norm": 31.125, + "learning_rate": 1.7569462989375493e-07, + "loss": 0.641, + "step": 6838 + }, + { + "epoch": 1.662776562120107, + "grad_norm": 29.25, + "learning_rate": 1.7544765220378734e-07, + "loss": 0.7262, + "step": 6839 + }, + { + "epoch": 1.663019693654267, + "grad_norm": 23.0, + "learning_rate": 1.7520083512422584e-07, + "loss": 1.0293, + "step": 6840 + }, + { + "epoch": 1.663262825188427, + "grad_norm": 17.875, + "learning_rate": 1.749541786919623e-07, + "loss": 0.4684, + "step": 6841 + }, + { + "epoch": 1.663505956722587, + "grad_norm": 23.5, + "learning_rate": 1.747076829438632e-07, + "loss": 0.5234, + "step": 6842 + }, + { + "epoch": 1.663749088256747, + "grad_norm": 24.5, + "learning_rate": 1.7446134791677214e-07, + "loss": 1.2282, + "step": 6843 + }, + { + "epoch": 1.6639922197909067, + "grad_norm": 18.375, + "learning_rate": 1.7421517364750771e-07, + "loss": 0.8967, + "step": 6844 + }, + { + "epoch": 1.664235351325067, + "grad_norm": 20.5, + "learning_rate": 1.7396916017286477e-07, + "loss": 0.7396, + "step": 6845 + }, + { + "epoch": 1.6644784828592267, + "grad_norm": 19.5, + "learning_rate": 1.7372330752961492e-07, + "loss": 0.7407, + "step": 6846 + }, + { + "epoch": 1.664721614393387, + "grad_norm": 16.0, + "learning_rate": 1.7347761575450457e-07, + "loss": 0.5837, + "step": 6847 + }, + { + "epoch": 1.6649647459275467, + "grad_norm": 16.625, + "learning_rate": 1.7323208488425656e-07, + "loss": 0.6942, + "step": 6848 + }, + { + "epoch": 1.6652078774617067, + "grad_norm": 19.5, + "learning_rate": 1.7298671495556974e-07, + "loss": 0.5391, + "step": 6849 + }, + { + "epoch": 1.6654510089958667, + "grad_norm": 22.125, + "learning_rate": 1.727415060051195e-07, + "loss": 0.77, + "step": 6850 + }, + { + "epoch": 1.6656941405300267, + "grad_norm": 37.0, + "learning_rate": 1.7249645806955594e-07, + "loss": 0.8063, + "step": 6851 + }, + { + "epoch": 1.6659372720641867, + "grad_norm": 14.75, + "learning_rate": 1.7225157118550565e-07, + "loss": 0.482, + "step": 6852 + }, + { + "epoch": 1.6661804035983467, + "grad_norm": 16.75, + "learning_rate": 1.7200684538957166e-07, + "loss": 0.7561, + "step": 6853 + }, + { + "epoch": 1.6664235351325067, + "grad_norm": 20.125, + "learning_rate": 1.7176228071833223e-07, + "loss": 0.6574, + "step": 6854 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 37.0, + "learning_rate": 1.7151787720834146e-07, + "loss": 0.814, + "step": 6855 + }, + { + "epoch": 1.6669097982008267, + "grad_norm": 20.375, + "learning_rate": 1.7127363489612997e-07, + "loss": 1.0402, + "step": 6856 + }, + { + "epoch": 1.6671529297349865, + "grad_norm": 24.875, + "learning_rate": 1.7102955381820427e-07, + "loss": 0.5734, + "step": 6857 + }, + { + "epoch": 1.6673960612691467, + "grad_norm": 22.875, + "learning_rate": 1.7078563401104613e-07, + "loss": 0.8925, + "step": 6858 + }, + { + "epoch": 1.6676391928033065, + "grad_norm": 36.5, + "learning_rate": 1.7054187551111351e-07, + "loss": 0.9493, + "step": 6859 + }, + { + "epoch": 1.6678823243374665, + "grad_norm": 19.625, + "learning_rate": 1.702982783548407e-07, + "loss": 0.7106, + "step": 6860 + }, + { + "epoch": 1.6681254558716265, + "grad_norm": 23.375, + "learning_rate": 1.7005484257863733e-07, + "loss": 0.6031, + "step": 6861 + }, + { + "epoch": 1.6683685874057865, + "grad_norm": 45.75, + "learning_rate": 1.6981156821888875e-07, + "loss": 0.7476, + "step": 6862 + }, + { + "epoch": 1.6686117189399465, + "grad_norm": 18.25, + "learning_rate": 1.6956845531195684e-07, + "loss": 0.6627, + "step": 6863 + }, + { + "epoch": 1.6688548504741065, + "grad_norm": 18.875, + "learning_rate": 1.693255038941792e-07, + "loss": 0.6377, + "step": 6864 + }, + { + "epoch": 1.6690979820082665, + "grad_norm": 28.375, + "learning_rate": 1.6908271400186887e-07, + "loss": 0.9894, + "step": 6865 + }, + { + "epoch": 1.6693411135424263, + "grad_norm": 21.375, + "learning_rate": 1.6884008567131473e-07, + "loss": 0.8733, + "step": 6866 + }, + { + "epoch": 1.6695842450765865, + "grad_norm": 15.9375, + "learning_rate": 1.6859761893878237e-07, + "loss": 0.4895, + "step": 6867 + }, + { + "epoch": 1.6698273766107463, + "grad_norm": 22.125, + "learning_rate": 1.6835531384051238e-07, + "loss": 0.7276, + "step": 6868 + }, + { + "epoch": 1.6700705081449065, + "grad_norm": 19.875, + "learning_rate": 1.6811317041272104e-07, + "loss": 0.8208, + "step": 6869 + }, + { + "epoch": 1.6703136396790663, + "grad_norm": 17.0, + "learning_rate": 1.678711886916015e-07, + "loss": 0.6899, + "step": 6870 + }, + { + "epoch": 1.6705567712132263, + "grad_norm": 21.375, + "learning_rate": 1.6762936871332166e-07, + "loss": 0.5199, + "step": 6871 + }, + { + "epoch": 1.6707999027473863, + "grad_norm": 29.875, + "learning_rate": 1.6738771051402604e-07, + "loss": 0.7683, + "step": 6872 + }, + { + "epoch": 1.6710430342815463, + "grad_norm": 20.75, + "learning_rate": 1.6714621412983413e-07, + "loss": 0.6856, + "step": 6873 + }, + { + "epoch": 1.6712861658157063, + "grad_norm": 18.75, + "learning_rate": 1.669048795968424e-07, + "loss": 0.7694, + "step": 6874 + }, + { + "epoch": 1.6715292973498663, + "grad_norm": 22.75, + "learning_rate": 1.6666370695112206e-07, + "loss": 1.1907, + "step": 6875 + }, + { + "epoch": 1.6717724288840263, + "grad_norm": 20.125, + "learning_rate": 1.6642269622872035e-07, + "loss": 0.7955, + "step": 6876 + }, + { + "epoch": 1.672015560418186, + "grad_norm": 20.875, + "learning_rate": 1.6618184746566097e-07, + "loss": 0.7652, + "step": 6877 + }, + { + "epoch": 1.6722586919523463, + "grad_norm": 23.5, + "learning_rate": 1.6594116069794262e-07, + "loss": 0.7562, + "step": 6878 + }, + { + "epoch": 1.6725018234865061, + "grad_norm": 18.0, + "learning_rate": 1.6570063596153998e-07, + "loss": 0.5837, + "step": 6879 + }, + { + "epoch": 1.6727449550206663, + "grad_norm": 22.375, + "learning_rate": 1.6546027329240365e-07, + "loss": 0.5496, + "step": 6880 + }, + { + "epoch": 1.6729880865548261, + "grad_norm": 18.75, + "learning_rate": 1.6522007272646052e-07, + "loss": 0.8426, + "step": 6881 + }, + { + "epoch": 1.6732312180889861, + "grad_norm": 20.875, + "learning_rate": 1.649800342996123e-07, + "loss": 0.7899, + "step": 6882 + }, + { + "epoch": 1.6734743496231461, + "grad_norm": 18.875, + "learning_rate": 1.647401580477366e-07, + "loss": 0.681, + "step": 6883 + }, + { + "epoch": 1.6737174811573061, + "grad_norm": 23.125, + "learning_rate": 1.645004440066876e-07, + "loss": 0.792, + "step": 6884 + }, + { + "epoch": 1.6739606126914661, + "grad_norm": 20.5, + "learning_rate": 1.6426089221229456e-07, + "loss": 0.7977, + "step": 6885 + }, + { + "epoch": 1.674203744225626, + "grad_norm": 19.75, + "learning_rate": 1.640215027003622e-07, + "loss": 0.7682, + "step": 6886 + }, + { + "epoch": 1.6744468757597861, + "grad_norm": 38.25, + "learning_rate": 1.6378227550667176e-07, + "loss": 0.5646, + "step": 6887 + }, + { + "epoch": 1.674690007293946, + "grad_norm": 19.875, + "learning_rate": 1.6354321066698015e-07, + "loss": 0.5912, + "step": 6888 + }, + { + "epoch": 1.6749331388281061, + "grad_norm": 20.875, + "learning_rate": 1.633043082170195e-07, + "loss": 0.7568, + "step": 6889 + }, + { + "epoch": 1.675176270362266, + "grad_norm": 20.75, + "learning_rate": 1.6306556819249767e-07, + "loss": 0.674, + "step": 6890 + }, + { + "epoch": 1.6754194018964261, + "grad_norm": 15.875, + "learning_rate": 1.6282699062909886e-07, + "loss": 0.4707, + "step": 6891 + }, + { + "epoch": 1.675662533430586, + "grad_norm": 24.125, + "learning_rate": 1.6258857556248244e-07, + "loss": 0.8059, + "step": 6892 + }, + { + "epoch": 1.675905664964746, + "grad_norm": 19.125, + "learning_rate": 1.623503230282833e-07, + "loss": 0.7181, + "step": 6893 + }, + { + "epoch": 1.676148796498906, + "grad_norm": 15.625, + "learning_rate": 1.6211223306211275e-07, + "loss": 0.3808, + "step": 6894 + }, + { + "epoch": 1.676391928033066, + "grad_norm": 14.5625, + "learning_rate": 1.6187430569955782e-07, + "loss": 0.4352, + "step": 6895 + }, + { + "epoch": 1.676635059567226, + "grad_norm": 26.125, + "learning_rate": 1.6163654097618054e-07, + "loss": 0.9999, + "step": 6896 + }, + { + "epoch": 1.6768781911013857, + "grad_norm": 23.125, + "learning_rate": 1.6139893892751848e-07, + "loss": 0.6284, + "step": 6897 + }, + { + "epoch": 1.677121322635546, + "grad_norm": 20.5, + "learning_rate": 1.6116149958908604e-07, + "loss": 0.605, + "step": 6898 + }, + { + "epoch": 1.6773644541697057, + "grad_norm": 34.0, + "learning_rate": 1.6092422299637243e-07, + "loss": 0.7257, + "step": 6899 + }, + { + "epoch": 1.677607585703866, + "grad_norm": 15.6875, + "learning_rate": 1.6068710918484248e-07, + "loss": 0.433, + "step": 6900 + }, + { + "epoch": 1.6778507172380257, + "grad_norm": 19.375, + "learning_rate": 1.6045015818993732e-07, + "loss": 0.7528, + "step": 6901 + }, + { + "epoch": 1.6780938487721857, + "grad_norm": 31.375, + "learning_rate": 1.60213370047073e-07, + "loss": 0.757, + "step": 6902 + }, + { + "epoch": 1.6783369803063457, + "grad_norm": 30.125, + "learning_rate": 1.59976744791642e-07, + "loss": 0.7798, + "step": 6903 + }, + { + "epoch": 1.6785801118405057, + "grad_norm": 20.25, + "learning_rate": 1.5974028245901162e-07, + "loss": 1.1059, + "step": 6904 + }, + { + "epoch": 1.6788232433746657, + "grad_norm": 17.25, + "learning_rate": 1.5950398308452575e-07, + "loss": 0.483, + "step": 6905 + }, + { + "epoch": 1.6790663749088257, + "grad_norm": 22.125, + "learning_rate": 1.5926784670350305e-07, + "loss": 0.6642, + "step": 6906 + }, + { + "epoch": 1.6793095064429857, + "grad_norm": 27.125, + "learning_rate": 1.5903187335123805e-07, + "loss": 0.5887, + "step": 6907 + }, + { + "epoch": 1.6795526379771455, + "grad_norm": 21.875, + "learning_rate": 1.5879606306300157e-07, + "loss": 0.6357, + "step": 6908 + }, + { + "epoch": 1.6797957695113057, + "grad_norm": 25.875, + "learning_rate": 1.5856041587403907e-07, + "loss": 0.8933, + "step": 6909 + }, + { + "epoch": 1.6800389010454655, + "grad_norm": 24.375, + "learning_rate": 1.5832493181957203e-07, + "loss": 1.0323, + "step": 6910 + }, + { + "epoch": 1.6802820325796257, + "grad_norm": 18.5, + "learning_rate": 1.5808961093479776e-07, + "loss": 0.6313, + "step": 6911 + }, + { + "epoch": 1.6805251641137855, + "grad_norm": 25.25, + "learning_rate": 1.578544532548895e-07, + "loss": 0.6426, + "step": 6912 + }, + { + "epoch": 1.6807682956479455, + "grad_norm": 15.875, + "learning_rate": 1.5761945881499514e-07, + "loss": 0.6022, + "step": 6913 + }, + { + "epoch": 1.6810114271821055, + "grad_norm": 21.25, + "learning_rate": 1.5738462765023831e-07, + "loss": 0.8832, + "step": 6914 + }, + { + "epoch": 1.6812545587162655, + "grad_norm": 22.625, + "learning_rate": 1.571499597957195e-07, + "loss": 0.5752, + "step": 6915 + }, + { + "epoch": 1.6814976902504255, + "grad_norm": 27.125, + "learning_rate": 1.569154552865132e-07, + "loss": 0.9953, + "step": 6916 + }, + { + "epoch": 1.6817408217845855, + "grad_norm": 28.625, + "learning_rate": 1.5668111415767004e-07, + "loss": 1.1177, + "step": 6917 + }, + { + "epoch": 1.6819839533187455, + "grad_norm": 14.8125, + "learning_rate": 1.5644693644421672e-07, + "loss": 1.0306, + "step": 6918 + }, + { + "epoch": 1.6822270848529053, + "grad_norm": 20.625, + "learning_rate": 1.5621292218115537e-07, + "loss": 0.786, + "step": 6919 + }, + { + "epoch": 1.6824702163870655, + "grad_norm": 18.875, + "learning_rate": 1.5597907140346314e-07, + "loss": 0.44, + "step": 6920 + }, + { + "epoch": 1.6827133479212253, + "grad_norm": 22.5, + "learning_rate": 1.557453841460928e-07, + "loss": 0.7679, + "step": 6921 + }, + { + "epoch": 1.6829564794553855, + "grad_norm": 19.875, + "learning_rate": 1.5551186044397342e-07, + "loss": 0.8135, + "step": 6922 + }, + { + "epoch": 1.6831996109895453, + "grad_norm": 18.875, + "learning_rate": 1.5527850033200897e-07, + "loss": 1.1998, + "step": 6923 + }, + { + "epoch": 1.6834427425237053, + "grad_norm": 18.25, + "learning_rate": 1.55045303845079e-07, + "loss": 0.7589, + "step": 6924 + }, + { + "epoch": 1.6836858740578653, + "grad_norm": 17.875, + "learning_rate": 1.5481227101803886e-07, + "loss": 0.729, + "step": 6925 + }, + { + "epoch": 1.6839290055920253, + "grad_norm": 18.25, + "learning_rate": 1.5457940188571953e-07, + "loss": 0.629, + "step": 6926 + }, + { + "epoch": 1.6841721371261853, + "grad_norm": 25.625, + "learning_rate": 1.5434669648292724e-07, + "loss": 0.7621, + "step": 6927 + }, + { + "epoch": 1.6844152686603453, + "grad_norm": 26.25, + "learning_rate": 1.5411415484444344e-07, + "loss": 0.4665, + "step": 6928 + }, + { + "epoch": 1.6846584001945053, + "grad_norm": 25.625, + "learning_rate": 1.5388177700502604e-07, + "loss": 0.7456, + "step": 6929 + }, + { + "epoch": 1.684901531728665, + "grad_norm": 16.75, + "learning_rate": 1.536495629994078e-07, + "loss": 0.5649, + "step": 6930 + }, + { + "epoch": 1.6851446632628253, + "grad_norm": 17.125, + "learning_rate": 1.5341751286229667e-07, + "loss": 0.6208, + "step": 6931 + }, + { + "epoch": 1.685387794796985, + "grad_norm": 18.5, + "learning_rate": 1.531856266283771e-07, + "loss": 0.4659, + "step": 6932 + }, + { + "epoch": 1.6856309263311453, + "grad_norm": 18.5, + "learning_rate": 1.5295390433230805e-07, + "loss": 0.7431, + "step": 6933 + }, + { + "epoch": 1.685874057865305, + "grad_norm": 20.5, + "learning_rate": 1.5272234600872488e-07, + "loss": 0.7232, + "step": 6934 + }, + { + "epoch": 1.686117189399465, + "grad_norm": 16.375, + "learning_rate": 1.524909516922375e-07, + "loss": 0.4456, + "step": 6935 + }, + { + "epoch": 1.686360320933625, + "grad_norm": 18.75, + "learning_rate": 1.5225972141743235e-07, + "loss": 0.7808, + "step": 6936 + }, + { + "epoch": 1.686603452467785, + "grad_norm": 24.125, + "learning_rate": 1.520286552188703e-07, + "loss": 0.7493, + "step": 6937 + }, + { + "epoch": 1.686846584001945, + "grad_norm": 22.25, + "learning_rate": 1.5179775313108825e-07, + "loss": 0.9569, + "step": 6938 + }, + { + "epoch": 1.6870897155361049, + "grad_norm": 21.625, + "learning_rate": 1.5156701518859882e-07, + "loss": 0.7673, + "step": 6939 + }, + { + "epoch": 1.687332847070265, + "grad_norm": 22.0, + "learning_rate": 1.5133644142588916e-07, + "loss": 0.5399, + "step": 6940 + }, + { + "epoch": 1.6875759786044249, + "grad_norm": 28.75, + "learning_rate": 1.5110603187742336e-07, + "loss": 1.0731, + "step": 6941 + }, + { + "epoch": 1.687819110138585, + "grad_norm": 22.0, + "learning_rate": 1.5087578657763933e-07, + "loss": 1.0513, + "step": 6942 + }, + { + "epoch": 1.6880622416727449, + "grad_norm": 23.125, + "learning_rate": 1.506457055609517e-07, + "loss": 0.962, + "step": 6943 + }, + { + "epoch": 1.6883053732069049, + "grad_norm": 26.5, + "learning_rate": 1.5041578886174995e-07, + "loss": 0.7448, + "step": 6944 + }, + { + "epoch": 1.6885485047410649, + "grad_norm": 23.625, + "learning_rate": 1.501860365143988e-07, + "loss": 0.7738, + "step": 6945 + }, + { + "epoch": 1.6887916362752249, + "grad_norm": 22.625, + "learning_rate": 1.4995644855323926e-07, + "loss": 0.7364, + "step": 6946 + }, + { + "epoch": 1.6890347678093849, + "grad_norm": 19.0, + "learning_rate": 1.497270250125868e-07, + "loss": 0.674, + "step": 6947 + }, + { + "epoch": 1.6892778993435449, + "grad_norm": 24.875, + "learning_rate": 1.4949776592673258e-07, + "loss": 0.8241, + "step": 6948 + }, + { + "epoch": 1.6895210308777049, + "grad_norm": 15.5, + "learning_rate": 1.4926867132994348e-07, + "loss": 0.3412, + "step": 6949 + }, + { + "epoch": 1.6897641624118647, + "grad_norm": 21.625, + "learning_rate": 1.4903974125646205e-07, + "loss": 0.7731, + "step": 6950 + }, + { + "epoch": 1.690007293946025, + "grad_norm": 22.125, + "learning_rate": 1.4881097574050554e-07, + "loss": 0.6524, + "step": 6951 + }, + { + "epoch": 1.6902504254801847, + "grad_norm": 17.125, + "learning_rate": 1.4858237481626668e-07, + "loss": 0.6306, + "step": 6952 + }, + { + "epoch": 1.690493557014345, + "grad_norm": 21.625, + "learning_rate": 1.4835393851791432e-07, + "loss": 0.6385, + "step": 6953 + }, + { + "epoch": 1.6907366885485047, + "grad_norm": 20.125, + "learning_rate": 1.481256668795919e-07, + "loss": 0.7038, + "step": 6954 + }, + { + "epoch": 1.6909798200826647, + "grad_norm": 21.0, + "learning_rate": 1.4789755993541835e-07, + "loss": 0.609, + "step": 6955 + }, + { + "epoch": 1.6912229516168247, + "grad_norm": 18.5, + "learning_rate": 1.4766961771948835e-07, + "loss": 0.5567, + "step": 6956 + }, + { + "epoch": 1.6914660831509847, + "grad_norm": 17.625, + "learning_rate": 1.4744184026587227e-07, + "loss": 0.554, + "step": 6957 + }, + { + "epoch": 1.6917092146851447, + "grad_norm": 19.5, + "learning_rate": 1.4721422760861497e-07, + "loss": 0.6309, + "step": 6958 + }, + { + "epoch": 1.6919523462193047, + "grad_norm": 18.25, + "learning_rate": 1.4698677978173675e-07, + "loss": 0.5148, + "step": 6959 + }, + { + "epoch": 1.6921954777534647, + "grad_norm": 22.375, + "learning_rate": 1.4675949681923443e-07, + "loss": 0.8596, + "step": 6960 + }, + { + "epoch": 1.6924386092876245, + "grad_norm": 21.625, + "learning_rate": 1.4653237875507876e-07, + "loss": 0.6779, + "step": 6961 + }, + { + "epoch": 1.6926817408217847, + "grad_norm": 19.375, + "learning_rate": 1.4630542562321645e-07, + "loss": 0.9988, + "step": 6962 + }, + { + "epoch": 1.6929248723559445, + "grad_norm": 18.75, + "learning_rate": 1.4607863745756998e-07, + "loss": 0.4227, + "step": 6963 + }, + { + "epoch": 1.6931680038901047, + "grad_norm": 24.0, + "learning_rate": 1.458520142920364e-07, + "loss": 0.7502, + "step": 6964 + }, + { + "epoch": 1.6934111354242645, + "grad_norm": 19.875, + "learning_rate": 1.456255561604887e-07, + "loss": 0.7303, + "step": 6965 + }, + { + "epoch": 1.6936542669584245, + "grad_norm": 16.5, + "learning_rate": 1.453992630967746e-07, + "loss": 0.4584, + "step": 6966 + }, + { + "epoch": 1.6938973984925845, + "grad_norm": 18.125, + "learning_rate": 1.4517313513471798e-07, + "loss": 0.3222, + "step": 6967 + }, + { + "epoch": 1.6941405300267445, + "grad_norm": 25.125, + "learning_rate": 1.4494717230811736e-07, + "loss": 0.8891, + "step": 6968 + }, + { + "epoch": 1.6943836615609045, + "grad_norm": 23.25, + "learning_rate": 1.4472137465074656e-07, + "loss": 0.4947, + "step": 6969 + }, + { + "epoch": 1.6946267930950645, + "grad_norm": 19.75, + "learning_rate": 1.4449574219635525e-07, + "loss": 0.5912, + "step": 6970 + }, + { + "epoch": 1.6948699246292245, + "grad_norm": 18.125, + "learning_rate": 1.4427027497866784e-07, + "loss": 0.6814, + "step": 6971 + }, + { + "epoch": 1.6951130561633843, + "grad_norm": 20.0, + "learning_rate": 1.4404497303138477e-07, + "loss": 1.0506, + "step": 6972 + }, + { + "epoch": 1.6953561876975445, + "grad_norm": 19.0, + "learning_rate": 1.4381983638818055e-07, + "loss": 0.6342, + "step": 6973 + }, + { + "epoch": 1.6955993192317043, + "grad_norm": 25.75, + "learning_rate": 1.4359486508270654e-07, + "loss": 0.7947, + "step": 6974 + }, + { + "epoch": 1.6958424507658645, + "grad_norm": 23.375, + "learning_rate": 1.433700591485883e-07, + "loss": 0.7217, + "step": 6975 + }, + { + "epoch": 1.6960855823000243, + "grad_norm": 16.75, + "learning_rate": 1.4314541861942644e-07, + "loss": 0.6003, + "step": 6976 + }, + { + "epoch": 1.6963287138341843, + "grad_norm": 22.125, + "learning_rate": 1.4292094352879816e-07, + "loss": 0.762, + "step": 6977 + }, + { + "epoch": 1.6965718453683443, + "grad_norm": 17.375, + "learning_rate": 1.426966339102548e-07, + "loss": 0.9147, + "step": 6978 + }, + { + "epoch": 1.6968149769025043, + "grad_norm": 25.375, + "learning_rate": 1.4247248979732273e-07, + "loss": 0.6724, + "step": 6979 + }, + { + "epoch": 1.6970581084366643, + "grad_norm": 21.375, + "learning_rate": 1.422485112235053e-07, + "loss": 0.6231, + "step": 6980 + }, + { + "epoch": 1.697301239970824, + "grad_norm": 21.25, + "learning_rate": 1.4202469822227936e-07, + "loss": 0.8129, + "step": 6981 + }, + { + "epoch": 1.6975443715049843, + "grad_norm": 21.375, + "learning_rate": 1.418010508270977e-07, + "loss": 0.798, + "step": 6982 + }, + { + "epoch": 1.697787503039144, + "grad_norm": 23.125, + "learning_rate": 1.4157756907138804e-07, + "loss": 0.9029, + "step": 6983 + }, + { + "epoch": 1.6980306345733043, + "grad_norm": 21.875, + "learning_rate": 1.4135425298855394e-07, + "loss": 0.3993, + "step": 6984 + }, + { + "epoch": 1.698273766107464, + "grad_norm": 19.125, + "learning_rate": 1.4113110261197368e-07, + "loss": 0.6003, + "step": 6985 + }, + { + "epoch": 1.6985168976416243, + "grad_norm": 26.25, + "learning_rate": 1.409081179750009e-07, + "loss": 0.7194, + "step": 6986 + }, + { + "epoch": 1.698760029175784, + "grad_norm": 23.375, + "learning_rate": 1.4068529911096433e-07, + "loss": 0.673, + "step": 6987 + }, + { + "epoch": 1.699003160709944, + "grad_norm": 19.25, + "learning_rate": 1.404626460531687e-07, + "loss": 0.5228, + "step": 6988 + }, + { + "epoch": 1.699246292244104, + "grad_norm": 15.3125, + "learning_rate": 1.4024015883489312e-07, + "loss": 0.3539, + "step": 6989 + }, + { + "epoch": 1.699489423778264, + "grad_norm": 21.0, + "learning_rate": 1.4001783748939152e-07, + "loss": 0.7357, + "step": 6990 + }, + { + "epoch": 1.699732555312424, + "grad_norm": 21.75, + "learning_rate": 1.3979568204989456e-07, + "loss": 0.6048, + "step": 6991 + }, + { + "epoch": 1.6999756868465838, + "grad_norm": 26.125, + "learning_rate": 1.3957369254960675e-07, + "loss": 0.7623, + "step": 6992 + }, + { + "epoch": 1.700218818380744, + "grad_norm": 21.25, + "learning_rate": 1.3935186902170815e-07, + "loss": 0.7856, + "step": 6993 + }, + { + "epoch": 1.7004619499149038, + "grad_norm": 18.625, + "learning_rate": 1.3913021149935425e-07, + "loss": 0.4841, + "step": 6994 + }, + { + "epoch": 1.700705081449064, + "grad_norm": 19.875, + "learning_rate": 1.3890872001567578e-07, + "loss": 0.8518, + "step": 6995 + }, + { + "epoch": 1.7009482129832239, + "grad_norm": 18.25, + "learning_rate": 1.3868739460377824e-07, + "loss": 0.7088, + "step": 6996 + }, + { + "epoch": 1.7011913445173839, + "grad_norm": 19.625, + "learning_rate": 1.384662352967428e-07, + "loss": 0.509, + "step": 6997 + }, + { + "epoch": 1.7014344760515439, + "grad_norm": 22.125, + "learning_rate": 1.3824524212762522e-07, + "loss": 0.5836, + "step": 6998 + }, + { + "epoch": 1.7016776075857039, + "grad_norm": 20.125, + "learning_rate": 1.3802441512945698e-07, + "loss": 0.4863, + "step": 6999 + }, + { + "epoch": 1.7019207391198639, + "grad_norm": 16.875, + "learning_rate": 1.378037543352441e-07, + "loss": 0.4595, + "step": 7000 + }, + { + "epoch": 1.7021638706540239, + "grad_norm": 32.5, + "learning_rate": 1.3758325977796875e-07, + "loss": 0.8221, + "step": 7001 + }, + { + "epoch": 1.7024070021881839, + "grad_norm": 23.625, + "learning_rate": 1.3736293149058695e-07, + "loss": 1.1364, + "step": 7002 + }, + { + "epoch": 1.7026501337223436, + "grad_norm": 22.25, + "learning_rate": 1.371427695060311e-07, + "loss": 0.5761, + "step": 7003 + }, + { + "epoch": 1.7028932652565039, + "grad_norm": 16.25, + "learning_rate": 1.369227738572082e-07, + "loss": 0.7817, + "step": 7004 + }, + { + "epoch": 1.7031363967906636, + "grad_norm": 19.875, + "learning_rate": 1.367029445770003e-07, + "loss": 0.5736, + "step": 7005 + }, + { + "epoch": 1.7033795283248239, + "grad_norm": 18.625, + "learning_rate": 1.364832816982646e-07, + "loss": 0.5242, + "step": 7006 + }, + { + "epoch": 1.7036226598589836, + "grad_norm": 19.375, + "learning_rate": 1.362637852538333e-07, + "loss": 0.7528, + "step": 7007 + }, + { + "epoch": 1.7038657913931436, + "grad_norm": 24.25, + "learning_rate": 1.3604445527651444e-07, + "loss": 0.8263, + "step": 7008 + }, + { + "epoch": 1.7041089229273036, + "grad_norm": 16.75, + "learning_rate": 1.3582529179909044e-07, + "loss": 0.4456, + "step": 7009 + }, + { + "epoch": 1.7043520544614637, + "grad_norm": 19.375, + "learning_rate": 1.3560629485431853e-07, + "loss": 0.7074, + "step": 7010 + }, + { + "epoch": 1.7045951859956237, + "grad_norm": 21.5, + "learning_rate": 1.3538746447493257e-07, + "loss": 1.0045, + "step": 7011 + }, + { + "epoch": 1.7048383175297837, + "grad_norm": 13.9375, + "learning_rate": 1.351688006936401e-07, + "loss": 0.3327, + "step": 7012 + }, + { + "epoch": 1.7050814490639437, + "grad_norm": 17.25, + "learning_rate": 1.3495030354312392e-07, + "loss": 0.9107, + "step": 7013 + }, + { + "epoch": 1.7053245805981034, + "grad_norm": 21.875, + "learning_rate": 1.3473197305604272e-07, + "loss": 0.705, + "step": 7014 + }, + { + "epoch": 1.7055677121322637, + "grad_norm": 24.25, + "learning_rate": 1.3451380926502944e-07, + "loss": 1.0641, + "step": 7015 + }, + { + "epoch": 1.7058108436664234, + "grad_norm": 24.0, + "learning_rate": 1.3429581220269245e-07, + "loss": 0.8427, + "step": 7016 + }, + { + "epoch": 1.7060539752005837, + "grad_norm": 21.875, + "learning_rate": 1.3407798190161512e-07, + "loss": 0.699, + "step": 7017 + }, + { + "epoch": 1.7062971067347434, + "grad_norm": 18.5, + "learning_rate": 1.3386031839435585e-07, + "loss": 0.3591, + "step": 7018 + }, + { + "epoch": 1.7065402382689034, + "grad_norm": 23.0, + "learning_rate": 1.3364282171344874e-07, + "loss": 0.7765, + "step": 7019 + }, + { + "epoch": 1.7067833698030634, + "grad_norm": 20.25, + "learning_rate": 1.3342549189140186e-07, + "loss": 0.5656, + "step": 7020 + }, + { + "epoch": 1.7070265013372234, + "grad_norm": 20.625, + "learning_rate": 1.3320832896069933e-07, + "loss": 1.0029, + "step": 7021 + }, + { + "epoch": 1.7072696328713834, + "grad_norm": 17.375, + "learning_rate": 1.3299133295379961e-07, + "loss": 0.3076, + "step": 7022 + }, + { + "epoch": 1.7075127644055434, + "grad_norm": 20.875, + "learning_rate": 1.3277450390313667e-07, + "loss": 0.8906, + "step": 7023 + }, + { + "epoch": 1.7077558959397034, + "grad_norm": 22.125, + "learning_rate": 1.32557841841119e-07, + "loss": 0.8653, + "step": 7024 + }, + { + "epoch": 1.7079990274738632, + "grad_norm": 20.125, + "learning_rate": 1.3234134680013085e-07, + "loss": 0.4807, + "step": 7025 + }, + { + "epoch": 1.7082421590080235, + "grad_norm": 22.875, + "learning_rate": 1.321250188125313e-07, + "loss": 0.7162, + "step": 7026 + }, + { + "epoch": 1.7084852905421832, + "grad_norm": 26.625, + "learning_rate": 1.3190885791065368e-07, + "loss": 0.847, + "step": 7027 + }, + { + "epoch": 1.7087284220763435, + "grad_norm": 20.5, + "learning_rate": 1.3169286412680768e-07, + "loss": 0.9272, + "step": 7028 + }, + { + "epoch": 1.7089715536105032, + "grad_norm": 27.875, + "learning_rate": 1.3147703749327695e-07, + "loss": 0.6254, + "step": 7029 + }, + { + "epoch": 1.7092146851446632, + "grad_norm": 30.0, + "learning_rate": 1.3126137804232025e-07, + "loss": 0.6389, + "step": 7030 + }, + { + "epoch": 1.7094578166788232, + "grad_norm": 26.25, + "learning_rate": 1.310458858061722e-07, + "loss": 0.9339, + "step": 7031 + }, + { + "epoch": 1.7097009482129832, + "grad_norm": 17.5, + "learning_rate": 1.308305608170414e-07, + "loss": 0.3279, + "step": 7032 + }, + { + "epoch": 1.7099440797471432, + "grad_norm": 21.75, + "learning_rate": 1.306154031071119e-07, + "loss": 1.0276, + "step": 7033 + }, + { + "epoch": 1.710187211281303, + "grad_norm": 19.5, + "learning_rate": 1.3040041270854286e-07, + "loss": 0.8698, + "step": 7034 + }, + { + "epoch": 1.7104303428154632, + "grad_norm": 19.125, + "learning_rate": 1.3018558965346844e-07, + "loss": 0.5881, + "step": 7035 + }, + { + "epoch": 1.710673474349623, + "grad_norm": 23.625, + "learning_rate": 1.2997093397399754e-07, + "loss": 0.5417, + "step": 7036 + }, + { + "epoch": 1.7109166058837832, + "grad_norm": 24.5, + "learning_rate": 1.2975644570221394e-07, + "loss": 1.1227, + "step": 7037 + }, + { + "epoch": 1.711159737417943, + "grad_norm": 17.625, + "learning_rate": 1.2954212487017697e-07, + "loss": 0.4961, + "step": 7038 + }, + { + "epoch": 1.7114028689521033, + "grad_norm": 19.375, + "learning_rate": 1.2932797150992048e-07, + "loss": 1.0014, + "step": 7039 + }, + { + "epoch": 1.711646000486263, + "grad_norm": 13.625, + "learning_rate": 1.2911398565345317e-07, + "loss": 0.3568, + "step": 7040 + }, + { + "epoch": 1.711889132020423, + "grad_norm": 18.5, + "learning_rate": 1.2890016733275877e-07, + "loss": 0.676, + "step": 7041 + }, + { + "epoch": 1.712132263554583, + "grad_norm": 19.5, + "learning_rate": 1.2868651657979682e-07, + "loss": 0.6297, + "step": 7042 + }, + { + "epoch": 1.712375395088743, + "grad_norm": 16.25, + "learning_rate": 1.284730334265008e-07, + "loss": 0.318, + "step": 7043 + }, + { + "epoch": 1.712618526622903, + "grad_norm": 19.75, + "learning_rate": 1.2825971790477912e-07, + "loss": 0.741, + "step": 7044 + }, + { + "epoch": 1.7128616581570628, + "grad_norm": 17.375, + "learning_rate": 1.2804657004651583e-07, + "loss": 0.5294, + "step": 7045 + }, + { + "epoch": 1.713104789691223, + "grad_norm": 17.375, + "learning_rate": 1.278335898835696e-07, + "loss": 0.6027, + "step": 7046 + }, + { + "epoch": 1.7133479212253828, + "grad_norm": 24.25, + "learning_rate": 1.2762077744777363e-07, + "loss": 0.5592, + "step": 7047 + }, + { + "epoch": 1.713591052759543, + "grad_norm": 17.75, + "learning_rate": 1.2740813277093673e-07, + "loss": 0.6728, + "step": 7048 + }, + { + "epoch": 1.7138341842937028, + "grad_norm": 22.0, + "learning_rate": 1.2719565588484208e-07, + "loss": 0.6397, + "step": 7049 + }, + { + "epoch": 1.7140773158278628, + "grad_norm": 22.25, + "learning_rate": 1.2698334682124834e-07, + "loss": 0.8683, + "step": 7050 + }, + { + "epoch": 1.7143204473620228, + "grad_norm": 17.75, + "learning_rate": 1.2677120561188834e-07, + "loss": 0.4273, + "step": 7051 + }, + { + "epoch": 1.7145635788961828, + "grad_norm": 20.75, + "learning_rate": 1.2655923228847082e-07, + "loss": 0.9747, + "step": 7052 + }, + { + "epoch": 1.7148067104303428, + "grad_norm": 21.5, + "learning_rate": 1.263474268826785e-07, + "loss": 0.9047, + "step": 7053 + }, + { + "epoch": 1.7150498419645028, + "grad_norm": 19.5, + "learning_rate": 1.2613578942616902e-07, + "loss": 0.6919, + "step": 7054 + }, + { + "epoch": 1.7152929734986628, + "grad_norm": 20.625, + "learning_rate": 1.2592431995057608e-07, + "loss": 0.4946, + "step": 7055 + }, + { + "epoch": 1.7155361050328226, + "grad_norm": 24.0, + "learning_rate": 1.257130184875066e-07, + "loss": 0.9144, + "step": 7056 + }, + { + "epoch": 1.7157792365669828, + "grad_norm": 18.0, + "learning_rate": 1.2550188506854383e-07, + "loss": 0.5277, + "step": 7057 + }, + { + "epoch": 1.7160223681011426, + "grad_norm": 21.5, + "learning_rate": 1.2529091972524486e-07, + "loss": 0.8893, + "step": 7058 + }, + { + "epoch": 1.7162654996353028, + "grad_norm": 15.8125, + "learning_rate": 1.2508012248914265e-07, + "loss": 0.3351, + "step": 7059 + }, + { + "epoch": 1.7165086311694626, + "grad_norm": 22.125, + "learning_rate": 1.248694933917441e-07, + "loss": 0.5553, + "step": 7060 + }, + { + "epoch": 1.7167517627036226, + "grad_norm": 26.5, + "learning_rate": 1.2465903246453123e-07, + "loss": 1.0793, + "step": 7061 + }, + { + "epoch": 1.7169948942377826, + "grad_norm": 22.625, + "learning_rate": 1.2444873973896153e-07, + "loss": 0.6153, + "step": 7062 + }, + { + "epoch": 1.7172380257719426, + "grad_norm": 21.5, + "learning_rate": 1.242386152464667e-07, + "loss": 0.7403, + "step": 7063 + }, + { + "epoch": 1.7174811573061026, + "grad_norm": 17.875, + "learning_rate": 1.2402865901845294e-07, + "loss": 0.325, + "step": 7064 + }, + { + "epoch": 1.7177242888402626, + "grad_norm": 23.125, + "learning_rate": 1.238188710863024e-07, + "loss": 0.9473, + "step": 7065 + }, + { + "epoch": 1.7179674203744226, + "grad_norm": 36.75, + "learning_rate": 1.2360925148137165e-07, + "loss": 0.9482, + "step": 7066 + }, + { + "epoch": 1.7182105519085824, + "grad_norm": 15.5625, + "learning_rate": 1.2339980023499176e-07, + "loss": 0.4364, + "step": 7067 + }, + { + "epoch": 1.7184536834427426, + "grad_norm": 30.0, + "learning_rate": 1.2319051737846838e-07, + "loss": 1.1723, + "step": 7068 + }, + { + "epoch": 1.7186968149769024, + "grad_norm": 17.125, + "learning_rate": 1.229814029430833e-07, + "loss": 0.6533, + "step": 7069 + }, + { + "epoch": 1.7189399465110626, + "grad_norm": 20.0, + "learning_rate": 1.2277245696009175e-07, + "loss": 0.5493, + "step": 7070 + }, + { + "epoch": 1.7191830780452224, + "grad_norm": 19.0, + "learning_rate": 1.2256367946072405e-07, + "loss": 0.8707, + "step": 7071 + }, + { + "epoch": 1.7194262095793824, + "grad_norm": 23.0, + "learning_rate": 1.2235507047618612e-07, + "loss": 0.6375, + "step": 7072 + }, + { + "epoch": 1.7196693411135424, + "grad_norm": 25.5, + "learning_rate": 1.2214663003765808e-07, + "loss": 1.1586, + "step": 7073 + }, + { + "epoch": 1.7199124726477024, + "grad_norm": 18.375, + "learning_rate": 1.2193835817629492e-07, + "loss": 0.6262, + "step": 7074 + }, + { + "epoch": 1.7201556041818624, + "grad_norm": 12.9375, + "learning_rate": 1.2173025492322633e-07, + "loss": 0.4056, + "step": 7075 + }, + { + "epoch": 1.7203987357160224, + "grad_norm": 18.75, + "learning_rate": 1.215223203095571e-07, + "loss": 0.6889, + "step": 7076 + }, + { + "epoch": 1.7206418672501824, + "grad_norm": 24.0, + "learning_rate": 1.213145543663667e-07, + "loss": 0.9116, + "step": 7077 + }, + { + "epoch": 1.7208849987843422, + "grad_norm": 25.25, + "learning_rate": 1.2110695712470893e-07, + "loss": 0.4975, + "step": 7078 + }, + { + "epoch": 1.7211281303185024, + "grad_norm": 17.0, + "learning_rate": 1.208995286156131e-07, + "loss": 0.6069, + "step": 7079 + }, + { + "epoch": 1.7213712618526622, + "grad_norm": 17.75, + "learning_rate": 1.206922688700833e-07, + "loss": 0.6969, + "step": 7080 + }, + { + "epoch": 1.7216143933868224, + "grad_norm": 14.75, + "learning_rate": 1.2048517791909755e-07, + "loss": 0.333, + "step": 7081 + }, + { + "epoch": 1.7218575249209822, + "grad_norm": 19.25, + "learning_rate": 1.2027825579360927e-07, + "loss": 0.7456, + "step": 7082 + }, + { + "epoch": 1.7221006564551422, + "grad_norm": 21.875, + "learning_rate": 1.2007150252454676e-07, + "loss": 0.8768, + "step": 7083 + }, + { + "epoch": 1.7223437879893022, + "grad_norm": 18.25, + "learning_rate": 1.198649181428127e-07, + "loss": 0.7055, + "step": 7084 + }, + { + "epoch": 1.7225869195234622, + "grad_norm": 17.125, + "learning_rate": 1.1965850267928458e-07, + "loss": 0.334, + "step": 7085 + }, + { + "epoch": 1.7228300510576222, + "grad_norm": 23.75, + "learning_rate": 1.1945225616481523e-07, + "loss": 0.5971, + "step": 7086 + }, + { + "epoch": 1.723073182591782, + "grad_norm": 19.0, + "learning_rate": 1.192461786302311e-07, + "loss": 0.7236, + "step": 7087 + }, + { + "epoch": 1.7233163141259422, + "grad_norm": 22.5, + "learning_rate": 1.190402701063345e-07, + "loss": 0.6064, + "step": 7088 + }, + { + "epoch": 1.723559445660102, + "grad_norm": 18.75, + "learning_rate": 1.1883453062390179e-07, + "loss": 0.9833, + "step": 7089 + }, + { + "epoch": 1.7238025771942622, + "grad_norm": 30.25, + "learning_rate": 1.1862896021368461e-07, + "loss": 1.0316, + "step": 7090 + }, + { + "epoch": 1.724045708728422, + "grad_norm": 23.125, + "learning_rate": 1.1842355890640866e-07, + "loss": 1.0296, + "step": 7091 + }, + { + "epoch": 1.7242888402625822, + "grad_norm": 18.25, + "learning_rate": 1.182183267327748e-07, + "loss": 0.5016, + "step": 7092 + }, + { + "epoch": 1.724531971796742, + "grad_norm": 17.375, + "learning_rate": 1.1801326372345865e-07, + "loss": 0.4573, + "step": 7093 + }, + { + "epoch": 1.724775103330902, + "grad_norm": 17.25, + "learning_rate": 1.1780836990911037e-07, + "loss": 0.6343, + "step": 7094 + }, + { + "epoch": 1.725018234865062, + "grad_norm": 33.75, + "learning_rate": 1.1760364532035454e-07, + "loss": 0.7275, + "step": 7095 + }, + { + "epoch": 1.725261366399222, + "grad_norm": 29.5, + "learning_rate": 1.1739908998779109e-07, + "loss": 0.6388, + "step": 7096 + }, + { + "epoch": 1.725504497933382, + "grad_norm": 20.625, + "learning_rate": 1.1719470394199461e-07, + "loss": 0.6406, + "step": 7097 + }, + { + "epoch": 1.7257476294675418, + "grad_norm": 20.5, + "learning_rate": 1.1699048721351386e-07, + "loss": 0.6727, + "step": 7098 + }, + { + "epoch": 1.725990761001702, + "grad_norm": 16.125, + "learning_rate": 1.1678643983287233e-07, + "loss": 0.5676, + "step": 7099 + }, + { + "epoch": 1.7262338925358618, + "grad_norm": 17.625, + "learning_rate": 1.1658256183056883e-07, + "loss": 0.7208, + "step": 7100 + }, + { + "epoch": 1.726477024070022, + "grad_norm": 16.125, + "learning_rate": 1.1637885323707618e-07, + "loss": 0.3214, + "step": 7101 + }, + { + "epoch": 1.7267201556041818, + "grad_norm": 23.25, + "learning_rate": 1.1617531408284213e-07, + "loss": 0.6523, + "step": 7102 + }, + { + "epoch": 1.7269632871383418, + "grad_norm": 20.0, + "learning_rate": 1.1597194439828916e-07, + "loss": 0.7844, + "step": 7103 + }, + { + "epoch": 1.7272064186725018, + "grad_norm": 19.75, + "learning_rate": 1.1576874421381473e-07, + "loss": 0.5816, + "step": 7104 + }, + { + "epoch": 1.7274495502066618, + "grad_norm": 22.5, + "learning_rate": 1.1556571355979027e-07, + "loss": 0.6491, + "step": 7105 + }, + { + "epoch": 1.7276926817408218, + "grad_norm": 28.25, + "learning_rate": 1.1536285246656203e-07, + "loss": 1.1528, + "step": 7106 + }, + { + "epoch": 1.7279358132749818, + "grad_norm": 20.75, + "learning_rate": 1.1516016096445162e-07, + "loss": 0.8362, + "step": 7107 + }, + { + "epoch": 1.7281789448091418, + "grad_norm": 32.0, + "learning_rate": 1.1495763908375452e-07, + "loss": 0.6713, + "step": 7108 + }, + { + "epoch": 1.7284220763433016, + "grad_norm": 17.5, + "learning_rate": 1.147552868547408e-07, + "loss": 0.7046, + "step": 7109 + }, + { + "epoch": 1.7286652078774618, + "grad_norm": 14.375, + "learning_rate": 1.1455310430765601e-07, + "loss": 0.4023, + "step": 7110 + }, + { + "epoch": 1.7289083394116216, + "grad_norm": 20.125, + "learning_rate": 1.1435109147271972e-07, + "loss": 0.9681, + "step": 7111 + }, + { + "epoch": 1.7291514709457818, + "grad_norm": 17.625, + "learning_rate": 1.1414924838012611e-07, + "loss": 0.3903, + "step": 7112 + }, + { + "epoch": 1.7293946024799416, + "grad_norm": 20.25, + "learning_rate": 1.1394757506004397e-07, + "loss": 0.7485, + "step": 7113 + }, + { + "epoch": 1.7296377340141016, + "grad_norm": 18.625, + "learning_rate": 1.1374607154261724e-07, + "loss": 1.008, + "step": 7114 + }, + { + "epoch": 1.7298808655482616, + "grad_norm": 16.125, + "learning_rate": 1.1354473785796405e-07, + "loss": 0.3276, + "step": 7115 + }, + { + "epoch": 1.7301239970824216, + "grad_norm": 24.5, + "learning_rate": 1.1334357403617671e-07, + "loss": 1.0734, + "step": 7116 + }, + { + "epoch": 1.7303671286165816, + "grad_norm": 16.375, + "learning_rate": 1.1314258010732312e-07, + "loss": 0.4096, + "step": 7117 + }, + { + "epoch": 1.7306102601507416, + "grad_norm": 18.5, + "learning_rate": 1.1294175610144495e-07, + "loss": 0.6694, + "step": 7118 + }, + { + "epoch": 1.7308533916849016, + "grad_norm": 20.875, + "learning_rate": 1.1274110204855929e-07, + "loss": 0.6163, + "step": 7119 + }, + { + "epoch": 1.7310965232190614, + "grad_norm": 23.125, + "learning_rate": 1.1254061797865687e-07, + "loss": 0.5054, + "step": 7120 + }, + { + "epoch": 1.7313396547532216, + "grad_norm": 26.25, + "learning_rate": 1.1234030392170386e-07, + "loss": 0.5857, + "step": 7121 + }, + { + "epoch": 1.7315827862873814, + "grad_norm": 18.0, + "learning_rate": 1.1214015990764049e-07, + "loss": 0.4037, + "step": 7122 + }, + { + "epoch": 1.7318259178215416, + "grad_norm": 15.8125, + "learning_rate": 1.1194018596638156e-07, + "loss": 0.6404, + "step": 7123 + }, + { + "epoch": 1.7320690493557014, + "grad_norm": 20.75, + "learning_rate": 1.1174038212781693e-07, + "loss": 0.7418, + "step": 7124 + }, + { + "epoch": 1.7323121808898614, + "grad_norm": 23.0, + "learning_rate": 1.1154074842181075e-07, + "loss": 0.4778, + "step": 7125 + }, + { + "epoch": 1.7325553124240214, + "grad_norm": 22.875, + "learning_rate": 1.1134128487820126e-07, + "loss": 0.6934, + "step": 7126 + }, + { + "epoch": 1.7327984439581814, + "grad_norm": 16.5, + "learning_rate": 1.1114199152680208e-07, + "loss": 0.4853, + "step": 7127 + }, + { + "epoch": 1.7330415754923414, + "grad_norm": 24.25, + "learning_rate": 1.1094286839740134e-07, + "loss": 0.9262, + "step": 7128 + }, + { + "epoch": 1.7332847070265014, + "grad_norm": 21.25, + "learning_rate": 1.1074391551976108e-07, + "loss": 0.6301, + "step": 7129 + }, + { + "epoch": 1.7335278385606614, + "grad_norm": 28.125, + "learning_rate": 1.1054513292361804e-07, + "loss": 0.9771, + "step": 7130 + }, + { + "epoch": 1.7337709700948212, + "grad_norm": 23.375, + "learning_rate": 1.1034652063868417e-07, + "loss": 0.9385, + "step": 7131 + }, + { + "epoch": 1.7340141016289814, + "grad_norm": 21.375, + "learning_rate": 1.1014807869464516e-07, + "loss": 0.7921, + "step": 7132 + }, + { + "epoch": 1.7342572331631412, + "grad_norm": 18.0, + "learning_rate": 1.099498071211616e-07, + "loss": 0.5227, + "step": 7133 + }, + { + "epoch": 1.7345003646973014, + "grad_norm": 19.875, + "learning_rate": 1.097517059478688e-07, + "loss": 0.4977, + "step": 7134 + }, + { + "epoch": 1.7347434962314612, + "grad_norm": 20.875, + "learning_rate": 1.095537752043764e-07, + "loss": 0.7416, + "step": 7135 + }, + { + "epoch": 1.7349866277656212, + "grad_norm": 16.25, + "learning_rate": 1.0935601492026854e-07, + "loss": 0.5215, + "step": 7136 + }, + { + "epoch": 1.7352297592997812, + "grad_norm": 23.0, + "learning_rate": 1.0915842512510364e-07, + "loss": 0.4592, + "step": 7137 + }, + { + "epoch": 1.7354728908339412, + "grad_norm": 24.375, + "learning_rate": 1.0896100584841543e-07, + "loss": 1.4003, + "step": 7138 + }, + { + "epoch": 1.7357160223681012, + "grad_norm": 20.25, + "learning_rate": 1.0876375711971115e-07, + "loss": 0.6258, + "step": 7139 + }, + { + "epoch": 1.735959153902261, + "grad_norm": 29.125, + "learning_rate": 1.0856667896847306e-07, + "loss": 0.9952, + "step": 7140 + }, + { + "epoch": 1.7362022854364212, + "grad_norm": 20.375, + "learning_rate": 1.08369771424158e-07, + "loss": 0.6542, + "step": 7141 + }, + { + "epoch": 1.736445416970581, + "grad_norm": 31.125, + "learning_rate": 1.0817303451619756e-07, + "loss": 0.728, + "step": 7142 + }, + { + "epoch": 1.7366885485047412, + "grad_norm": 24.25, + "learning_rate": 1.0797646827399714e-07, + "loss": 1.1587, + "step": 7143 + }, + { + "epoch": 1.736931680038901, + "grad_norm": 22.625, + "learning_rate": 1.0778007272693666e-07, + "loss": 0.8443, + "step": 7144 + }, + { + "epoch": 1.737174811573061, + "grad_norm": 22.125, + "learning_rate": 1.0758384790437129e-07, + "loss": 0.5823, + "step": 7145 + }, + { + "epoch": 1.737417943107221, + "grad_norm": 23.375, + "learning_rate": 1.0738779383563019e-07, + "loss": 0.6295, + "step": 7146 + }, + { + "epoch": 1.737661074641381, + "grad_norm": 19.625, + "learning_rate": 1.071919105500166e-07, + "loss": 0.5701, + "step": 7147 + }, + { + "epoch": 1.737904206175541, + "grad_norm": 21.125, + "learning_rate": 1.0699619807680916e-07, + "loss": 0.7625, + "step": 7148 + }, + { + "epoch": 1.738147337709701, + "grad_norm": 18.875, + "learning_rate": 1.068006564452602e-07, + "loss": 0.5946, + "step": 7149 + }, + { + "epoch": 1.738390469243861, + "grad_norm": 14.5, + "learning_rate": 1.06605285684597e-07, + "loss": 0.3013, + "step": 7150 + }, + { + "epoch": 1.7386336007780208, + "grad_norm": 20.0, + "learning_rate": 1.0641008582402065e-07, + "loss": 0.725, + "step": 7151 + }, + { + "epoch": 1.738876732312181, + "grad_norm": 15.875, + "learning_rate": 1.0621505689270783e-07, + "loss": 0.6137, + "step": 7152 + }, + { + "epoch": 1.7391198638463408, + "grad_norm": 19.0, + "learning_rate": 1.0602019891980856e-07, + "loss": 0.7135, + "step": 7153 + }, + { + "epoch": 1.739362995380501, + "grad_norm": 22.375, + "learning_rate": 1.0582551193444743e-07, + "loss": 0.9001, + "step": 7154 + }, + { + "epoch": 1.7396061269146608, + "grad_norm": 17.25, + "learning_rate": 1.0563099596572452e-07, + "loss": 0.537, + "step": 7155 + }, + { + "epoch": 1.7398492584488208, + "grad_norm": 20.375, + "learning_rate": 1.0543665104271309e-07, + "loss": 0.4311, + "step": 7156 + }, + { + "epoch": 1.7400923899829808, + "grad_norm": 22.25, + "learning_rate": 1.0524247719446129e-07, + "loss": 0.5902, + "step": 7157 + }, + { + "epoch": 1.7403355215171408, + "grad_norm": 22.25, + "learning_rate": 1.0504847444999189e-07, + "loss": 0.4986, + "step": 7158 + }, + { + "epoch": 1.7405786530513008, + "grad_norm": 21.0, + "learning_rate": 1.0485464283830224e-07, + "loss": 0.5163, + "step": 7159 + }, + { + "epoch": 1.7408217845854608, + "grad_norm": 21.125, + "learning_rate": 1.0466098238836347e-07, + "loss": 0.482, + "step": 7160 + }, + { + "epoch": 1.7410649161196208, + "grad_norm": 18.75, + "learning_rate": 1.0446749312912147e-07, + "loss": 0.5752, + "step": 7161 + }, + { + "epoch": 1.7413080476537806, + "grad_norm": 20.25, + "learning_rate": 1.0427417508949669e-07, + "loss": 0.6879, + "step": 7162 + }, + { + "epoch": 1.7415511791879408, + "grad_norm": 19.0, + "learning_rate": 1.0408102829838395e-07, + "loss": 0.5716, + "step": 7163 + }, + { + "epoch": 1.7417943107221006, + "grad_norm": 12.25, + "learning_rate": 1.0388805278465211e-07, + "loss": 0.24, + "step": 7164 + }, + { + "epoch": 1.7420374422562608, + "grad_norm": 18.5, + "learning_rate": 1.036952485771446e-07, + "loss": 0.4406, + "step": 7165 + }, + { + "epoch": 1.7422805737904206, + "grad_norm": 19.875, + "learning_rate": 1.0350261570467993e-07, + "loss": 0.6768, + "step": 7166 + }, + { + "epoch": 1.7425237053245806, + "grad_norm": 20.875, + "learning_rate": 1.0331015419605003e-07, + "loss": 0.6819, + "step": 7167 + }, + { + "epoch": 1.7427668368587406, + "grad_norm": 23.125, + "learning_rate": 1.0311786408002137e-07, + "loss": 0.9034, + "step": 7168 + }, + { + "epoch": 1.7430099683929006, + "grad_norm": 23.75, + "learning_rate": 1.0292574538533537e-07, + "loss": 0.7285, + "step": 7169 + }, + { + "epoch": 1.7432530999270606, + "grad_norm": 14.0625, + "learning_rate": 1.0273379814070756e-07, + "loss": 0.242, + "step": 7170 + }, + { + "epoch": 1.7434962314612206, + "grad_norm": 30.625, + "learning_rate": 1.0254202237482733e-07, + "loss": 0.838, + "step": 7171 + }, + { + "epoch": 1.7437393629953806, + "grad_norm": 19.75, + "learning_rate": 1.0235041811635899e-07, + "loss": 0.9775, + "step": 7172 + }, + { + "epoch": 1.7439824945295404, + "grad_norm": 21.5, + "learning_rate": 1.0215898539394156e-07, + "loss": 0.8749, + "step": 7173 + }, + { + "epoch": 1.7442256260637006, + "grad_norm": 23.5, + "learning_rate": 1.0196772423618772e-07, + "loss": 0.8868, + "step": 7174 + }, + { + "epoch": 1.7444687575978604, + "grad_norm": 18.25, + "learning_rate": 1.0177663467168447e-07, + "loss": 0.3057, + "step": 7175 + }, + { + "epoch": 1.7447118891320206, + "grad_norm": 20.5, + "learning_rate": 1.0158571672899381e-07, + "loss": 0.8536, + "step": 7176 + }, + { + "epoch": 1.7449550206661804, + "grad_norm": 23.25, + "learning_rate": 1.0139497043665166e-07, + "loss": 0.5509, + "step": 7177 + }, + { + "epoch": 1.7451981522003404, + "grad_norm": 20.125, + "learning_rate": 1.0120439582316802e-07, + "loss": 0.4302, + "step": 7178 + }, + { + "epoch": 1.7454412837345004, + "grad_norm": 24.25, + "learning_rate": 1.0101399291702813e-07, + "loss": 0.6958, + "step": 7179 + }, + { + "epoch": 1.7456844152686604, + "grad_norm": 31.75, + "learning_rate": 1.0082376174669034e-07, + "loss": 0.7287, + "step": 7180 + }, + { + "epoch": 1.7459275468028204, + "grad_norm": 21.125, + "learning_rate": 1.0063370234058859e-07, + "loss": 0.6602, + "step": 7181 + }, + { + "epoch": 1.7461706783369801, + "grad_norm": 15.5, + "learning_rate": 1.0044381472712999e-07, + "loss": 0.5224, + "step": 7182 + }, + { + "epoch": 1.7464138098711404, + "grad_norm": 17.125, + "learning_rate": 1.0025409893469701e-07, + "loss": 0.6278, + "step": 7183 + }, + { + "epoch": 1.7466569414053001, + "grad_norm": 24.75, + "learning_rate": 1.0006455499164582e-07, + "loss": 0.8102, + "step": 7184 + }, + { + "epoch": 1.7469000729394604, + "grad_norm": 18.75, + "learning_rate": 9.987518292630672e-08, + "loss": 0.6583, + "step": 7185 + }, + { + "epoch": 1.7471432044736201, + "grad_norm": 17.0, + "learning_rate": 9.968598276698508e-08, + "loss": 0.4623, + "step": 7186 + }, + { + "epoch": 1.7473863360077804, + "grad_norm": 13.9375, + "learning_rate": 9.949695454195996e-08, + "loss": 0.2664, + "step": 7187 + }, + { + "epoch": 1.7476294675419402, + "grad_norm": 20.0, + "learning_rate": 9.93080982794846e-08, + "loss": 0.7451, + "step": 7188 + }, + { + "epoch": 1.7478725990761002, + "grad_norm": 21.375, + "learning_rate": 9.91194140077871e-08, + "loss": 0.8819, + "step": 7189 + }, + { + "epoch": 1.7481157306102602, + "grad_norm": 18.5, + "learning_rate": 9.893090175506979e-08, + "loss": 0.7237, + "step": 7190 + }, + { + "epoch": 1.7483588621444202, + "grad_norm": 18.75, + "learning_rate": 9.874256154950885e-08, + "loss": 0.6904, + "step": 7191 + }, + { + "epoch": 1.7486019936785802, + "grad_norm": 18.75, + "learning_rate": 9.855439341925482e-08, + "loss": 0.6456, + "step": 7192 + }, + { + "epoch": 1.74884512521274, + "grad_norm": 18.125, + "learning_rate": 9.8366397392433e-08, + "loss": 0.5506, + "step": 7193 + }, + { + "epoch": 1.7490882567469002, + "grad_norm": 16.75, + "learning_rate": 9.817857349714244e-08, + "loss": 0.4657, + "step": 7194 + }, + { + "epoch": 1.74933138828106, + "grad_norm": 19.0, + "learning_rate": 9.799092176145664e-08, + "loss": 0.5334, + "step": 7195 + }, + { + "epoch": 1.7495745198152202, + "grad_norm": 22.625, + "learning_rate": 9.780344221342344e-08, + "loss": 0.8858, + "step": 7196 + }, + { + "epoch": 1.74981765134938, + "grad_norm": 18.125, + "learning_rate": 9.761613488106503e-08, + "loss": 0.4806, + "step": 7197 + }, + { + "epoch": 1.75006078288354, + "grad_norm": 24.0, + "learning_rate": 9.742899979237774e-08, + "loss": 1.0021, + "step": 7198 + }, + { + "epoch": 1.7503039144177, + "grad_norm": 19.5, + "learning_rate": 9.724203697533172e-08, + "loss": 0.6598, + "step": 7199 + }, + { + "epoch": 1.75054704595186, + "grad_norm": 19.625, + "learning_rate": 9.705524645787237e-08, + "loss": 0.6493, + "step": 7200 + }, + { + "epoch": 1.75079017748602, + "grad_norm": 20.5, + "learning_rate": 9.686862826791849e-08, + "loss": 0.7907, + "step": 7201 + }, + { + "epoch": 1.75103330902018, + "grad_norm": 49.0, + "learning_rate": 9.668218243336317e-08, + "loss": 0.7562, + "step": 7202 + }, + { + "epoch": 1.75127644055434, + "grad_norm": 21.75, + "learning_rate": 9.649590898207412e-08, + "loss": 0.8688, + "step": 7203 + }, + { + "epoch": 1.7515195720884997, + "grad_norm": 15.0, + "learning_rate": 9.630980794189338e-08, + "loss": 0.4958, + "step": 7204 + }, + { + "epoch": 1.75176270362266, + "grad_norm": 24.625, + "learning_rate": 9.612387934063674e-08, + "loss": 0.8671, + "step": 7205 + }, + { + "epoch": 1.7520058351568197, + "grad_norm": 23.125, + "learning_rate": 9.593812320609436e-08, + "loss": 0.7222, + "step": 7206 + }, + { + "epoch": 1.75224896669098, + "grad_norm": 21.625, + "learning_rate": 9.575253956603095e-08, + "loss": 0.8224, + "step": 7207 + }, + { + "epoch": 1.7524920982251397, + "grad_norm": 13.125, + "learning_rate": 9.556712844818502e-08, + "loss": 0.278, + "step": 7208 + }, + { + "epoch": 1.7527352297592997, + "grad_norm": 22.5, + "learning_rate": 9.538188988026928e-08, + "loss": 0.7725, + "step": 7209 + }, + { + "epoch": 1.7529783612934597, + "grad_norm": 22.625, + "learning_rate": 9.519682388997142e-08, + "loss": 0.9668, + "step": 7210 + }, + { + "epoch": 1.7532214928276197, + "grad_norm": 18.5, + "learning_rate": 9.501193050495197e-08, + "loss": 0.6622, + "step": 7211 + }, + { + "epoch": 1.7534646243617797, + "grad_norm": 22.625, + "learning_rate": 9.482720975284715e-08, + "loss": 0.6568, + "step": 7212 + }, + { + "epoch": 1.7537077558959397, + "grad_norm": 27.125, + "learning_rate": 9.464266166126613e-08, + "loss": 1.0969, + "step": 7213 + }, + { + "epoch": 1.7539508874300997, + "grad_norm": 20.875, + "learning_rate": 9.445828625779321e-08, + "loss": 0.7734, + "step": 7214 + }, + { + "epoch": 1.7541940189642595, + "grad_norm": 19.0, + "learning_rate": 9.427408356998624e-08, + "loss": 0.3951, + "step": 7215 + }, + { + "epoch": 1.7544371504984198, + "grad_norm": 17.875, + "learning_rate": 9.409005362537749e-08, + "loss": 0.6622, + "step": 7216 + }, + { + "epoch": 1.7546802820325795, + "grad_norm": 21.25, + "learning_rate": 9.390619645147355e-08, + "loss": 0.5264, + "step": 7217 + }, + { + "epoch": 1.7549234135667398, + "grad_norm": 22.25, + "learning_rate": 9.372251207575483e-08, + "loss": 0.5337, + "step": 7218 + }, + { + "epoch": 1.7551665451008995, + "grad_norm": 21.25, + "learning_rate": 9.353900052567658e-08, + "loss": 0.9108, + "step": 7219 + }, + { + "epoch": 1.7554096766350595, + "grad_norm": 16.375, + "learning_rate": 9.335566182866712e-08, + "loss": 0.6408, + "step": 7220 + }, + { + "epoch": 1.7556528081692195, + "grad_norm": 15.0, + "learning_rate": 9.317249601213025e-08, + "loss": 0.5524, + "step": 7221 + }, + { + "epoch": 1.7558959397033795, + "grad_norm": 19.125, + "learning_rate": 9.298950310344293e-08, + "loss": 0.7348, + "step": 7222 + }, + { + "epoch": 1.7561390712375395, + "grad_norm": 16.75, + "learning_rate": 9.28066831299565e-08, + "loss": 0.344, + "step": 7223 + }, + { + "epoch": 1.7563822027716995, + "grad_norm": 27.875, + "learning_rate": 9.262403611899673e-08, + "loss": 0.7012, + "step": 7224 + }, + { + "epoch": 1.7566253343058595, + "grad_norm": 19.0, + "learning_rate": 9.244156209786345e-08, + "loss": 0.5723, + "step": 7225 + }, + { + "epoch": 1.7568684658400193, + "grad_norm": 23.625, + "learning_rate": 9.225926109383026e-08, + "loss": 0.7488, + "step": 7226 + }, + { + "epoch": 1.7571115973741795, + "grad_norm": 23.25, + "learning_rate": 9.207713313414523e-08, + "loss": 0.7095, + "step": 7227 + }, + { + "epoch": 1.7573547289083393, + "grad_norm": 18.0, + "learning_rate": 9.189517824603103e-08, + "loss": 0.6345, + "step": 7228 + }, + { + "epoch": 1.7575978604424995, + "grad_norm": 21.75, + "learning_rate": 9.171339645668353e-08, + "loss": 0.7663, + "step": 7229 + }, + { + "epoch": 1.7578409919766593, + "grad_norm": 44.75, + "learning_rate": 9.15317877932731e-08, + "loss": 0.6386, + "step": 7230 + }, + { + "epoch": 1.7580841235108193, + "grad_norm": 15.625, + "learning_rate": 9.135035228294453e-08, + "loss": 0.4772, + "step": 7231 + }, + { + "epoch": 1.7583272550449793, + "grad_norm": 23.125, + "learning_rate": 9.116908995281642e-08, + "loss": 0.8511, + "step": 7232 + }, + { + "epoch": 1.7585703865791393, + "grad_norm": 22.125, + "learning_rate": 9.098800082998141e-08, + "loss": 0.7585, + "step": 7233 + }, + { + "epoch": 1.7588135181132993, + "grad_norm": 30.875, + "learning_rate": 9.08070849415063e-08, + "loss": 0.6268, + "step": 7234 + }, + { + "epoch": 1.7590566496474591, + "grad_norm": 24.25, + "learning_rate": 9.062634231443268e-08, + "loss": 0.9315, + "step": 7235 + }, + { + "epoch": 1.7592997811816193, + "grad_norm": 20.875, + "learning_rate": 9.044577297577517e-08, + "loss": 0.7464, + "step": 7236 + }, + { + "epoch": 1.7595429127157791, + "grad_norm": 21.75, + "learning_rate": 9.026537695252302e-08, + "loss": 0.4642, + "step": 7237 + }, + { + "epoch": 1.7597860442499393, + "grad_norm": 19.375, + "learning_rate": 9.008515427163966e-08, + "loss": 0.664, + "step": 7238 + }, + { + "epoch": 1.7600291757840991, + "grad_norm": 19.75, + "learning_rate": 8.990510496006244e-08, + "loss": 0.6459, + "step": 7239 + }, + { + "epoch": 1.7602723073182593, + "grad_norm": 21.0, + "learning_rate": 8.97252290447026e-08, + "loss": 0.9834, + "step": 7240 + }, + { + "epoch": 1.7605154388524191, + "grad_norm": 24.75, + "learning_rate": 8.954552655244627e-08, + "loss": 0.8953, + "step": 7241 + }, + { + "epoch": 1.7607585703865791, + "grad_norm": 20.375, + "learning_rate": 8.93659975101524e-08, + "loss": 0.8663, + "step": 7242 + }, + { + "epoch": 1.7610017019207391, + "grad_norm": 16.625, + "learning_rate": 8.91866419446552e-08, + "loss": 0.7115, + "step": 7243 + }, + { + "epoch": 1.7612448334548991, + "grad_norm": 16.375, + "learning_rate": 8.900745988276227e-08, + "loss": 0.7156, + "step": 7244 + }, + { + "epoch": 1.7614879649890591, + "grad_norm": 24.625, + "learning_rate": 8.882845135125551e-08, + "loss": 1.1081, + "step": 7245 + }, + { + "epoch": 1.761731096523219, + "grad_norm": 22.625, + "learning_rate": 8.864961637689102e-08, + "loss": 1.007, + "step": 7246 + }, + { + "epoch": 1.7619742280573791, + "grad_norm": 26.75, + "learning_rate": 8.847095498639823e-08, + "loss": 0.7365, + "step": 7247 + }, + { + "epoch": 1.762217359591539, + "grad_norm": 21.125, + "learning_rate": 8.829246720648165e-08, + "loss": 0.52, + "step": 7248 + }, + { + "epoch": 1.7624604911256991, + "grad_norm": 19.0, + "learning_rate": 8.811415306381924e-08, + "loss": 0.5325, + "step": 7249 + }, + { + "epoch": 1.762703622659859, + "grad_norm": 18.375, + "learning_rate": 8.793601258506299e-08, + "loss": 0.6283, + "step": 7250 + }, + { + "epoch": 1.762946754194019, + "grad_norm": 15.375, + "learning_rate": 8.775804579683939e-08, + "loss": 0.297, + "step": 7251 + }, + { + "epoch": 1.763189885728179, + "grad_norm": 19.75, + "learning_rate": 8.758025272574854e-08, + "loss": 0.8917, + "step": 7252 + }, + { + "epoch": 1.763433017262339, + "grad_norm": 15.1875, + "learning_rate": 8.740263339836449e-08, + "loss": 0.3076, + "step": 7253 + }, + { + "epoch": 1.763676148796499, + "grad_norm": 24.375, + "learning_rate": 8.722518784123557e-08, + "loss": 0.9268, + "step": 7254 + }, + { + "epoch": 1.763919280330659, + "grad_norm": 16.75, + "learning_rate": 8.704791608088417e-08, + "loss": 0.5323, + "step": 7255 + }, + { + "epoch": 1.764162411864819, + "grad_norm": 19.25, + "learning_rate": 8.687081814380674e-08, + "loss": 0.5292, + "step": 7256 + }, + { + "epoch": 1.7644055433989787, + "grad_norm": 16.875, + "learning_rate": 8.669389405647294e-08, + "loss": 0.3933, + "step": 7257 + }, + { + "epoch": 1.764648674933139, + "grad_norm": 19.5, + "learning_rate": 8.651714384532814e-08, + "loss": 1.0579, + "step": 7258 + }, + { + "epoch": 1.7648918064672987, + "grad_norm": 23.75, + "learning_rate": 8.634056753679024e-08, + "loss": 0.5955, + "step": 7259 + }, + { + "epoch": 1.765134938001459, + "grad_norm": 21.125, + "learning_rate": 8.616416515725174e-08, + "loss": 0.8159, + "step": 7260 + }, + { + "epoch": 1.7653780695356187, + "grad_norm": 17.75, + "learning_rate": 8.598793673307848e-08, + "loss": 0.5359, + "step": 7261 + }, + { + "epoch": 1.7656212010697787, + "grad_norm": 17.25, + "learning_rate": 8.581188229061163e-08, + "loss": 0.5799, + "step": 7262 + }, + { + "epoch": 1.7658643326039387, + "grad_norm": 19.75, + "learning_rate": 8.563600185616513e-08, + "loss": 0.6923, + "step": 7263 + }, + { + "epoch": 1.7661074641380987, + "grad_norm": 19.25, + "learning_rate": 8.546029545602713e-08, + "loss": 0.6216, + "step": 7264 + }, + { + "epoch": 1.7663505956722587, + "grad_norm": 17.875, + "learning_rate": 8.528476311646036e-08, + "loss": 0.8677, + "step": 7265 + }, + { + "epoch": 1.7665937272064187, + "grad_norm": 22.0, + "learning_rate": 8.51094048637012e-08, + "loss": 0.8781, + "step": 7266 + }, + { + "epoch": 1.7668368587405787, + "grad_norm": 18.125, + "learning_rate": 8.493422072395979e-08, + "loss": 0.5803, + "step": 7267 + }, + { + "epoch": 1.7670799902747385, + "grad_norm": 17.875, + "learning_rate": 8.475921072342047e-08, + "loss": 0.6115, + "step": 7268 + }, + { + "epoch": 1.7673231218088987, + "grad_norm": 24.125, + "learning_rate": 8.458437488824162e-08, + "loss": 0.549, + "step": 7269 + }, + { + "epoch": 1.7675662533430585, + "grad_norm": 19.625, + "learning_rate": 8.440971324455538e-08, + "loss": 0.8231, + "step": 7270 + }, + { + "epoch": 1.7678093848772187, + "grad_norm": 22.5, + "learning_rate": 8.423522581846783e-08, + "loss": 0.9291, + "step": 7271 + }, + { + "epoch": 1.7680525164113785, + "grad_norm": 17.75, + "learning_rate": 8.406091263605934e-08, + "loss": 1.018, + "step": 7272 + }, + { + "epoch": 1.7682956479455385, + "grad_norm": 17.375, + "learning_rate": 8.388677372338366e-08, + "loss": 0.6525, + "step": 7273 + }, + { + "epoch": 1.7685387794796985, + "grad_norm": 23.5, + "learning_rate": 8.371280910646914e-08, + "loss": 0.5221, + "step": 7274 + }, + { + "epoch": 1.7687819110138585, + "grad_norm": 19.5, + "learning_rate": 8.353901881131804e-08, + "loss": 0.894, + "step": 7275 + }, + { + "epoch": 1.7690250425480185, + "grad_norm": 15.125, + "learning_rate": 8.336540286390596e-08, + "loss": 0.9805, + "step": 7276 + }, + { + "epoch": 1.7692681740821785, + "grad_norm": 17.125, + "learning_rate": 8.319196129018298e-08, + "loss": 0.4495, + "step": 7277 + }, + { + "epoch": 1.7695113056163385, + "grad_norm": 223.0, + "learning_rate": 8.301869411607266e-08, + "loss": 0.6571, + "step": 7278 + }, + { + "epoch": 1.7697544371504983, + "grad_norm": 18.375, + "learning_rate": 8.284560136747318e-08, + "loss": 0.5611, + "step": 7279 + }, + { + "epoch": 1.7699975686846585, + "grad_norm": 21.625, + "learning_rate": 8.26726830702558e-08, + "loss": 0.9508, + "step": 7280 + }, + { + "epoch": 1.7702407002188183, + "grad_norm": 16.625, + "learning_rate": 8.249993925026636e-08, + "loss": 0.5974, + "step": 7281 + }, + { + "epoch": 1.7704838317529785, + "grad_norm": 18.625, + "learning_rate": 8.232736993332464e-08, + "loss": 0.5968, + "step": 7282 + }, + { + "epoch": 1.7707269632871383, + "grad_norm": 26.375, + "learning_rate": 8.215497514522386e-08, + "loss": 0.9187, + "step": 7283 + }, + { + "epoch": 1.7709700948212983, + "grad_norm": 23.0, + "learning_rate": 8.198275491173121e-08, + "loss": 0.8458, + "step": 7284 + }, + { + "epoch": 1.7712132263554583, + "grad_norm": 15.9375, + "learning_rate": 8.181070925858847e-08, + "loss": 0.2648, + "step": 7285 + }, + { + "epoch": 1.7714563578896183, + "grad_norm": 22.875, + "learning_rate": 8.163883821151047e-08, + "loss": 0.6686, + "step": 7286 + }, + { + "epoch": 1.7716994894237783, + "grad_norm": 18.625, + "learning_rate": 8.146714179618653e-08, + "loss": 0.854, + "step": 7287 + }, + { + "epoch": 1.771942620957938, + "grad_norm": 19.625, + "learning_rate": 8.129562003827903e-08, + "loss": 0.4788, + "step": 7288 + }, + { + "epoch": 1.7721857524920983, + "grad_norm": 26.125, + "learning_rate": 8.112427296342568e-08, + "loss": 1.1571, + "step": 7289 + }, + { + "epoch": 1.772428884026258, + "grad_norm": 18.5, + "learning_rate": 8.095310059723694e-08, + "loss": 0.8155, + "step": 7290 + }, + { + "epoch": 1.7726720155604183, + "grad_norm": 14.875, + "learning_rate": 8.078210296529734e-08, + "loss": 1.2172, + "step": 7291 + }, + { + "epoch": 1.772915147094578, + "grad_norm": 20.25, + "learning_rate": 8.061128009316577e-08, + "loss": 0.7776, + "step": 7292 + }, + { + "epoch": 1.773158278628738, + "grad_norm": 17.125, + "learning_rate": 8.044063200637428e-08, + "loss": 0.4722, + "step": 7293 + }, + { + "epoch": 1.773401410162898, + "grad_norm": 17.5, + "learning_rate": 8.027015873042942e-08, + "loss": 0.4354, + "step": 7294 + }, + { + "epoch": 1.773644541697058, + "grad_norm": 18.125, + "learning_rate": 8.009986029081097e-08, + "loss": 0.4938, + "step": 7295 + }, + { + "epoch": 1.773887673231218, + "grad_norm": 24.375, + "learning_rate": 7.992973671297338e-08, + "loss": 0.838, + "step": 7296 + }, + { + "epoch": 1.774130804765378, + "grad_norm": 23.625, + "learning_rate": 7.975978802234468e-08, + "loss": 0.9385, + "step": 7297 + }, + { + "epoch": 1.774373936299538, + "grad_norm": 17.5, + "learning_rate": 7.959001424432608e-08, + "loss": 0.7159, + "step": 7298 + }, + { + "epoch": 1.7746170678336979, + "grad_norm": 22.5, + "learning_rate": 7.942041540429379e-08, + "loss": 0.6502, + "step": 7299 + }, + { + "epoch": 1.774860199367858, + "grad_norm": 19.625, + "learning_rate": 7.925099152759714e-08, + "loss": 0.4799, + "step": 7300 + }, + { + "epoch": 1.7751033309020179, + "grad_norm": 20.875, + "learning_rate": 7.908174263955917e-08, + "loss": 0.6748, + "step": 7301 + }, + { + "epoch": 1.775346462436178, + "grad_norm": 19.375, + "learning_rate": 7.891266876547717e-08, + "loss": 0.9074, + "step": 7302 + }, + { + "epoch": 1.7755895939703379, + "grad_norm": 24.375, + "learning_rate": 7.874376993062205e-08, + "loss": 0.488, + "step": 7303 + }, + { + "epoch": 1.7758327255044979, + "grad_norm": 21.25, + "learning_rate": 7.857504616023915e-08, + "loss": 0.7793, + "step": 7304 + }, + { + "epoch": 1.776075857038658, + "grad_norm": 21.25, + "learning_rate": 7.840649747954648e-08, + "loss": 0.7053, + "step": 7305 + }, + { + "epoch": 1.776318988572818, + "grad_norm": 16.625, + "learning_rate": 7.823812391373711e-08, + "loss": 0.7816, + "step": 7306 + }, + { + "epoch": 1.776562120106978, + "grad_norm": 19.875, + "learning_rate": 7.806992548797729e-08, + "loss": 0.642, + "step": 7307 + }, + { + "epoch": 1.776805251641138, + "grad_norm": 19.375, + "learning_rate": 7.790190222740662e-08, + "loss": 0.8313, + "step": 7308 + }, + { + "epoch": 1.777048383175298, + "grad_norm": 19.25, + "learning_rate": 7.773405415713988e-08, + "loss": 0.5274, + "step": 7309 + }, + { + "epoch": 1.7772915147094577, + "grad_norm": 21.25, + "learning_rate": 7.756638130226438e-08, + "loss": 0.4838, + "step": 7310 + }, + { + "epoch": 1.777534646243618, + "grad_norm": 19.625, + "learning_rate": 7.739888368784171e-08, + "loss": 0.5142, + "step": 7311 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 35.25, + "learning_rate": 7.723156133890727e-08, + "loss": 0.8411, + "step": 7312 + }, + { + "epoch": 1.778020909311938, + "grad_norm": 14.625, + "learning_rate": 7.706441428047065e-08, + "loss": 0.3793, + "step": 7313 + }, + { + "epoch": 1.7782640408460977, + "grad_norm": 15.0, + "learning_rate": 7.689744253751463e-08, + "loss": 0.6047, + "step": 7314 + }, + { + "epoch": 1.7785071723802577, + "grad_norm": 14.875, + "learning_rate": 7.673064613499578e-08, + "loss": 0.6834, + "step": 7315 + }, + { + "epoch": 1.7787503039144177, + "grad_norm": 36.5, + "learning_rate": 7.656402509784527e-08, + "loss": 0.6551, + "step": 7316 + }, + { + "epoch": 1.7789934354485777, + "grad_norm": 24.625, + "learning_rate": 7.639757945096693e-08, + "loss": 0.5765, + "step": 7317 + }, + { + "epoch": 1.7792365669827377, + "grad_norm": 29.0, + "learning_rate": 7.623130921923932e-08, + "loss": 0.974, + "step": 7318 + }, + { + "epoch": 1.7794796985168977, + "grad_norm": 18.25, + "learning_rate": 7.606521442751383e-08, + "loss": 0.816, + "step": 7319 + }, + { + "epoch": 1.7797228300510577, + "grad_norm": 14.4375, + "learning_rate": 7.589929510061683e-08, + "loss": 0.3443, + "step": 7320 + }, + { + "epoch": 1.7799659615852175, + "grad_norm": 22.125, + "learning_rate": 7.57335512633478e-08, + "loss": 0.9432, + "step": 7321 + }, + { + "epoch": 1.7802090931193777, + "grad_norm": 18.375, + "learning_rate": 7.556798294047943e-08, + "loss": 0.5595, + "step": 7322 + }, + { + "epoch": 1.7804522246535375, + "grad_norm": 17.5, + "learning_rate": 7.540259015675955e-08, + "loss": 0.2119, + "step": 7323 + }, + { + "epoch": 1.7806953561876977, + "grad_norm": 23.125, + "learning_rate": 7.523737293690838e-08, + "loss": 0.6449, + "step": 7324 + }, + { + "epoch": 1.7809384877218575, + "grad_norm": 17.75, + "learning_rate": 7.507233130562064e-08, + "loss": 0.6951, + "step": 7325 + }, + { + "epoch": 1.7811816192560175, + "grad_norm": 18.875, + "learning_rate": 7.490746528756476e-08, + "loss": 0.8004, + "step": 7326 + }, + { + "epoch": 1.7814247507901775, + "grad_norm": 13.0, + "learning_rate": 7.474277490738257e-08, + "loss": 0.4268, + "step": 7327 + }, + { + "epoch": 1.7816678823243375, + "grad_norm": 20.875, + "learning_rate": 7.457826018969037e-08, + "loss": 1.2651, + "step": 7328 + }, + { + "epoch": 1.7819110138584975, + "grad_norm": 26.625, + "learning_rate": 7.441392115907706e-08, + "loss": 0.8745, + "step": 7329 + }, + { + "epoch": 1.7821541453926573, + "grad_norm": 19.375, + "learning_rate": 7.424975784010662e-08, + "loss": 0.4935, + "step": 7330 + }, + { + "epoch": 1.7823972769268175, + "grad_norm": 23.375, + "learning_rate": 7.40857702573157e-08, + "loss": 0.7011, + "step": 7331 + }, + { + "epoch": 1.7826404084609773, + "grad_norm": 15.0, + "learning_rate": 7.392195843521507e-08, + "loss": 0.2392, + "step": 7332 + }, + { + "epoch": 1.7828835399951375, + "grad_norm": 18.25, + "learning_rate": 7.375832239828948e-08, + "loss": 0.7848, + "step": 7333 + }, + { + "epoch": 1.7831266715292973, + "grad_norm": 15.9375, + "learning_rate": 7.35948621709967e-08, + "loss": 0.3441, + "step": 7334 + }, + { + "epoch": 1.7833698030634575, + "grad_norm": 14.6875, + "learning_rate": 7.34315777777693e-08, + "loss": 0.4485, + "step": 7335 + }, + { + "epoch": 1.7836129345976173, + "grad_norm": 20.25, + "learning_rate": 7.326846924301245e-08, + "loss": 0.6855, + "step": 7336 + }, + { + "epoch": 1.7838560661317773, + "grad_norm": 23.625, + "learning_rate": 7.310553659110584e-08, + "loss": 0.9441, + "step": 7337 + }, + { + "epoch": 1.7840991976659373, + "grad_norm": 17.375, + "learning_rate": 7.294277984640261e-08, + "loss": 0.3482, + "step": 7338 + }, + { + "epoch": 1.7843423292000973, + "grad_norm": 18.75, + "learning_rate": 7.2780199033229e-08, + "loss": 0.959, + "step": 7339 + }, + { + "epoch": 1.7845854607342573, + "grad_norm": 18.0, + "learning_rate": 7.261779417588628e-08, + "loss": 0.5354, + "step": 7340 + }, + { + "epoch": 1.784828592268417, + "grad_norm": 20.125, + "learning_rate": 7.245556529864834e-08, + "loss": 0.2532, + "step": 7341 + }, + { + "epoch": 1.7850717238025773, + "grad_norm": 25.25, + "learning_rate": 7.229351242576274e-08, + "loss": 0.5816, + "step": 7342 + }, + { + "epoch": 1.785314855336737, + "grad_norm": 19.0, + "learning_rate": 7.21316355814515e-08, + "loss": 0.9048, + "step": 7343 + }, + { + "epoch": 1.7855579868708973, + "grad_norm": 21.375, + "learning_rate": 7.196993478990999e-08, + "loss": 0.6518, + "step": 7344 + }, + { + "epoch": 1.785801118405057, + "grad_norm": 15.25, + "learning_rate": 7.180841007530693e-08, + "loss": 0.398, + "step": 7345 + }, + { + "epoch": 1.786044249939217, + "grad_norm": 18.875, + "learning_rate": 7.164706146178493e-08, + "loss": 0.6759, + "step": 7346 + }, + { + "epoch": 1.786287381473377, + "grad_norm": 20.5, + "learning_rate": 7.14858889734607e-08, + "loss": 1.0445, + "step": 7347 + }, + { + "epoch": 1.786530513007537, + "grad_norm": 17.125, + "learning_rate": 7.132489263442399e-08, + "loss": 0.9325, + "step": 7348 + }, + { + "epoch": 1.786773644541697, + "grad_norm": 17.25, + "learning_rate": 7.116407246873818e-08, + "loss": 0.4169, + "step": 7349 + }, + { + "epoch": 1.787016776075857, + "grad_norm": 21.0, + "learning_rate": 7.100342850044101e-08, + "loss": 0.8287, + "step": 7350 + }, + { + "epoch": 1.787259907610017, + "grad_norm": 16.0, + "learning_rate": 7.084296075354367e-08, + "loss": 0.3084, + "step": 7351 + }, + { + "epoch": 1.7875030391441769, + "grad_norm": 20.375, + "learning_rate": 7.068266925203057e-08, + "loss": 0.6098, + "step": 7352 + }, + { + "epoch": 1.787746170678337, + "grad_norm": 22.75, + "learning_rate": 7.052255401985994e-08, + "loss": 1.1271, + "step": 7353 + }, + { + "epoch": 1.7879893022124969, + "grad_norm": 14.0625, + "learning_rate": 7.036261508096387e-08, + "loss": 0.2789, + "step": 7354 + }, + { + "epoch": 1.788232433746657, + "grad_norm": 21.125, + "learning_rate": 7.020285245924824e-08, + "loss": 0.8638, + "step": 7355 + }, + { + "epoch": 1.7884755652808169, + "grad_norm": 14.5625, + "learning_rate": 7.004326617859187e-08, + "loss": 0.27, + "step": 7356 + }, + { + "epoch": 1.7887186968149769, + "grad_norm": 25.375, + "learning_rate": 6.988385626284802e-08, + "loss": 0.6371, + "step": 7357 + }, + { + "epoch": 1.7889618283491369, + "grad_norm": 17.625, + "learning_rate": 6.972462273584307e-08, + "loss": 0.6549, + "step": 7358 + }, + { + "epoch": 1.7892049598832969, + "grad_norm": 22.625, + "learning_rate": 6.956556562137743e-08, + "loss": 0.6225, + "step": 7359 + }, + { + "epoch": 1.7894480914174569, + "grad_norm": 19.25, + "learning_rate": 6.94066849432247e-08, + "loss": 0.7344, + "step": 7360 + }, + { + "epoch": 1.7896912229516169, + "grad_norm": 23.75, + "learning_rate": 6.924798072513256e-08, + "loss": 1.1601, + "step": 7361 + }, + { + "epoch": 1.7899343544857769, + "grad_norm": 21.125, + "learning_rate": 6.908945299082203e-08, + "loss": 0.8278, + "step": 7362 + }, + { + "epoch": 1.7901774860199366, + "grad_norm": 18.375, + "learning_rate": 6.893110176398765e-08, + "loss": 0.6753, + "step": 7363 + }, + { + "epoch": 1.7904206175540969, + "grad_norm": 17.375, + "learning_rate": 6.877292706829796e-08, + "loss": 0.5124, + "step": 7364 + }, + { + "epoch": 1.7906637490882567, + "grad_norm": 22.5, + "learning_rate": 6.861492892739477e-08, + "loss": 0.8909, + "step": 7365 + }, + { + "epoch": 1.7909068806224169, + "grad_norm": 26.625, + "learning_rate": 6.845710736489375e-08, + "loss": 0.6656, + "step": 7366 + }, + { + "epoch": 1.7911500121565767, + "grad_norm": 21.5, + "learning_rate": 6.829946240438382e-08, + "loss": 0.67, + "step": 7367 + }, + { + "epoch": 1.7913931436907367, + "grad_norm": 18.375, + "learning_rate": 6.81419940694282e-08, + "loss": 0.783, + "step": 7368 + }, + { + "epoch": 1.7916362752248967, + "grad_norm": 15.5625, + "learning_rate": 6.79847023835628e-08, + "loss": 0.3322, + "step": 7369 + }, + { + "epoch": 1.7918794067590567, + "grad_norm": 23.75, + "learning_rate": 6.782758737029771e-08, + "loss": 0.6024, + "step": 7370 + }, + { + "epoch": 1.7921225382932167, + "grad_norm": 18.5, + "learning_rate": 6.767064905311649e-08, + "loss": 0.672, + "step": 7371 + }, + { + "epoch": 1.7923656698273767, + "grad_norm": 15.875, + "learning_rate": 6.751388745547649e-08, + "loss": 0.4361, + "step": 7372 + }, + { + "epoch": 1.7926088013615367, + "grad_norm": 20.25, + "learning_rate": 6.7357302600808e-08, + "loss": 0.6361, + "step": 7373 + }, + { + "epoch": 1.7928519328956964, + "grad_norm": 18.875, + "learning_rate": 6.720089451251563e-08, + "loss": 0.7835, + "step": 7374 + }, + { + "epoch": 1.7930950644298567, + "grad_norm": 26.375, + "learning_rate": 6.704466321397734e-08, + "loss": 0.8165, + "step": 7375 + }, + { + "epoch": 1.7933381959640164, + "grad_norm": 25.25, + "learning_rate": 6.688860872854446e-08, + "loss": 1.1557, + "step": 7376 + }, + { + "epoch": 1.7935813274981767, + "grad_norm": 23.125, + "learning_rate": 6.673273107954195e-08, + "loss": 0.5129, + "step": 7377 + }, + { + "epoch": 1.7938244590323364, + "grad_norm": 21.375, + "learning_rate": 6.657703029026865e-08, + "loss": 0.832, + "step": 7378 + }, + { + "epoch": 1.7940675905664965, + "grad_norm": 19.375, + "learning_rate": 6.642150638399653e-08, + "loss": 0.4706, + "step": 7379 + }, + { + "epoch": 1.7943107221006565, + "grad_norm": 21.125, + "learning_rate": 6.626615938397127e-08, + "loss": 0.7821, + "step": 7380 + }, + { + "epoch": 1.7945538536348165, + "grad_norm": 22.375, + "learning_rate": 6.611098931341237e-08, + "loss": 0.4564, + "step": 7381 + }, + { + "epoch": 1.7947969851689765, + "grad_norm": 19.0, + "learning_rate": 6.595599619551266e-08, + "loss": 0.6673, + "step": 7382 + }, + { + "epoch": 1.7950401167031362, + "grad_norm": 24.125, + "learning_rate": 6.580118005343847e-08, + "loss": 0.8958, + "step": 7383 + }, + { + "epoch": 1.7952832482372965, + "grad_norm": 17.25, + "learning_rate": 6.564654091032949e-08, + "loss": 0.5238, + "step": 7384 + }, + { + "epoch": 1.7955263797714562, + "grad_norm": 23.75, + "learning_rate": 6.549207878929972e-08, + "loss": 0.7977, + "step": 7385 + }, + { + "epoch": 1.7957695113056165, + "grad_norm": 18.25, + "learning_rate": 6.533779371343599e-08, + "loss": 0.7698, + "step": 7386 + }, + { + "epoch": 1.7960126428397762, + "grad_norm": 18.625, + "learning_rate": 6.518368570579859e-08, + "loss": 0.7963, + "step": 7387 + }, + { + "epoch": 1.7962557743739365, + "grad_norm": 20.0, + "learning_rate": 6.502975478942186e-08, + "loss": 0.8037, + "step": 7388 + }, + { + "epoch": 1.7964989059080962, + "grad_norm": 15.75, + "learning_rate": 6.487600098731353e-08, + "loss": 0.6715, + "step": 7389 + }, + { + "epoch": 1.7967420374422562, + "grad_norm": 18.125, + "learning_rate": 6.47224243224548e-08, + "loss": 0.969, + "step": 7390 + }, + { + "epoch": 1.7969851689764162, + "grad_norm": 22.625, + "learning_rate": 6.456902481779992e-08, + "loss": 0.5991, + "step": 7391 + }, + { + "epoch": 1.7972283005105762, + "grad_norm": 18.625, + "learning_rate": 6.441580249627751e-08, + "loss": 0.4721, + "step": 7392 + }, + { + "epoch": 1.7974714320447363, + "grad_norm": 26.25, + "learning_rate": 6.426275738078928e-08, + "loss": 0.8928, + "step": 7393 + }, + { + "epoch": 1.797714563578896, + "grad_norm": 19.0, + "learning_rate": 6.410988949421007e-08, + "loss": 0.3274, + "step": 7394 + }, + { + "epoch": 1.7979576951130563, + "grad_norm": 21.125, + "learning_rate": 6.395719885938914e-08, + "loss": 0.6072, + "step": 7395 + }, + { + "epoch": 1.798200826647216, + "grad_norm": 25.875, + "learning_rate": 6.380468549914837e-08, + "loss": 0.8625, + "step": 7396 + }, + { + "epoch": 1.7984439581813763, + "grad_norm": 23.125, + "learning_rate": 6.365234943628382e-08, + "loss": 0.6045, + "step": 7397 + }, + { + "epoch": 1.798687089715536, + "grad_norm": 23.5, + "learning_rate": 6.350019069356436e-08, + "loss": 0.9674, + "step": 7398 + }, + { + "epoch": 1.798930221249696, + "grad_norm": 17.125, + "learning_rate": 6.33482092937332e-08, + "loss": 0.8304, + "step": 7399 + }, + { + "epoch": 1.799173352783856, + "grad_norm": 21.75, + "learning_rate": 6.319640525950632e-08, + "loss": 0.7151, + "step": 7400 + }, + { + "epoch": 1.799416484318016, + "grad_norm": 20.875, + "learning_rate": 6.304477861357322e-08, + "loss": 0.6211, + "step": 7401 + }, + { + "epoch": 1.799659615852176, + "grad_norm": 17.25, + "learning_rate": 6.289332937859757e-08, + "loss": 0.45, + "step": 7402 + }, + { + "epoch": 1.799902747386336, + "grad_norm": 20.0, + "learning_rate": 6.274205757721599e-08, + "loss": 0.4845, + "step": 7403 + }, + { + "epoch": 1.800145878920496, + "grad_norm": 18.875, + "learning_rate": 6.259096323203832e-08, + "loss": 0.5576, + "step": 7404 + }, + { + "epoch": 1.8003890104546558, + "grad_norm": 16.75, + "learning_rate": 6.244004636564855e-08, + "loss": 0.3467, + "step": 7405 + }, + { + "epoch": 1.800632141988816, + "grad_norm": 15.6875, + "learning_rate": 6.228930700060379e-08, + "loss": 0.3939, + "step": 7406 + }, + { + "epoch": 1.8008752735229758, + "grad_norm": 21.125, + "learning_rate": 6.21387451594345e-08, + "loss": 1.0078, + "step": 7407 + }, + { + "epoch": 1.801118405057136, + "grad_norm": 22.75, + "learning_rate": 6.198836086464474e-08, + "loss": 0.9298, + "step": 7408 + }, + { + "epoch": 1.8013615365912958, + "grad_norm": 21.5, + "learning_rate": 6.183815413871238e-08, + "loss": 0.651, + "step": 7409 + }, + { + "epoch": 1.8016046681254558, + "grad_norm": 17.0, + "learning_rate": 6.168812500408808e-08, + "loss": 0.8019, + "step": 7410 + }, + { + "epoch": 1.8018477996596158, + "grad_norm": 26.25, + "learning_rate": 6.15382734831961e-08, + "loss": 1.1185, + "step": 7411 + }, + { + "epoch": 1.8020909311937758, + "grad_norm": 19.0, + "learning_rate": 6.138859959843466e-08, + "loss": 0.5034, + "step": 7412 + }, + { + "epoch": 1.8023340627279358, + "grad_norm": 18.625, + "learning_rate": 6.123910337217528e-08, + "loss": 0.691, + "step": 7413 + }, + { + "epoch": 1.8025771942620958, + "grad_norm": 26.375, + "learning_rate": 6.108978482676233e-08, + "loss": 1.2395, + "step": 7414 + }, + { + "epoch": 1.8028203257962558, + "grad_norm": 16.625, + "learning_rate": 6.094064398451421e-08, + "loss": 0.4114, + "step": 7415 + }, + { + "epoch": 1.8030634573304156, + "grad_norm": 22.875, + "learning_rate": 6.07916808677228e-08, + "loss": 0.596, + "step": 7416 + }, + { + "epoch": 1.8033065888645758, + "grad_norm": 21.125, + "learning_rate": 6.064289549865293e-08, + "loss": 0.4784, + "step": 7417 + }, + { + "epoch": 1.8035497203987356, + "grad_norm": 18.0, + "learning_rate": 6.049428789954307e-08, + "loss": 0.7692, + "step": 7418 + }, + { + "epoch": 1.8037928519328958, + "grad_norm": 15.25, + "learning_rate": 6.034585809260543e-08, + "loss": 0.3401, + "step": 7419 + }, + { + "epoch": 1.8040359834670556, + "grad_norm": 19.0, + "learning_rate": 6.019760610002548e-08, + "loss": 0.4493, + "step": 7420 + }, + { + "epoch": 1.8042791150012156, + "grad_norm": 17.875, + "learning_rate": 6.004953194396187e-08, + "loss": 0.6573, + "step": 7421 + }, + { + "epoch": 1.8045222465353756, + "grad_norm": 15.4375, + "learning_rate": 5.990163564654663e-08, + "loss": 0.3954, + "step": 7422 + }, + { + "epoch": 1.8047653780695356, + "grad_norm": 16.875, + "learning_rate": 5.975391722988597e-08, + "loss": 0.4296, + "step": 7423 + }, + { + "epoch": 1.8050085096036956, + "grad_norm": 25.75, + "learning_rate": 5.96063767160586e-08, + "loss": 0.6389, + "step": 7424 + }, + { + "epoch": 1.8052516411378556, + "grad_norm": 18.625, + "learning_rate": 5.9459014127116743e-08, + "loss": 0.538, + "step": 7425 + }, + { + "epoch": 1.8054947726720156, + "grad_norm": 20.875, + "learning_rate": 5.931182948508696e-08, + "loss": 0.7424, + "step": 7426 + }, + { + "epoch": 1.8057379042061754, + "grad_norm": 23.25, + "learning_rate": 5.916482281196775e-08, + "loss": 0.7739, + "step": 7427 + }, + { + "epoch": 1.8059810357403356, + "grad_norm": 15.6875, + "learning_rate": 5.901799412973252e-08, + "loss": 0.3448, + "step": 7428 + }, + { + "epoch": 1.8062241672744954, + "grad_norm": 21.0, + "learning_rate": 5.8871343460326916e-08, + "loss": 0.6944, + "step": 7429 + }, + { + "epoch": 1.8064672988086556, + "grad_norm": 16.375, + "learning_rate": 5.872487082567061e-08, + "loss": 0.5195, + "step": 7430 + }, + { + "epoch": 1.8067104303428154, + "grad_norm": 21.0, + "learning_rate": 5.857857624765637e-08, + "loss": 0.4985, + "step": 7431 + }, + { + "epoch": 1.8069535618769754, + "grad_norm": 18.75, + "learning_rate": 5.8432459748150315e-08, + "loss": 0.7131, + "step": 7432 + }, + { + "epoch": 1.8071966934111354, + "grad_norm": 21.625, + "learning_rate": 5.8286521348992484e-08, + "loss": 0.3244, + "step": 7433 + }, + { + "epoch": 1.8074398249452954, + "grad_norm": 17.875, + "learning_rate": 5.814076107199557e-08, + "loss": 1.1122, + "step": 7434 + }, + { + "epoch": 1.8076829564794554, + "grad_norm": 24.875, + "learning_rate": 5.799517893894588e-08, + "loss": 0.5813, + "step": 7435 + }, + { + "epoch": 1.8079260880136152, + "grad_norm": 22.375, + "learning_rate": 5.7849774971603376e-08, + "loss": 0.7805, + "step": 7436 + }, + { + "epoch": 1.8081692195477754, + "grad_norm": 20.75, + "learning_rate": 5.7704549191701236e-08, + "loss": 0.7114, + "step": 7437 + }, + { + "epoch": 1.8084123510819352, + "grad_norm": 19.375, + "learning_rate": 5.755950162094598e-08, + "loss": 0.837, + "step": 7438 + }, + { + "epoch": 1.8086554826160954, + "grad_norm": 19.0, + "learning_rate": 5.7414632281017206e-08, + "loss": 0.5499, + "step": 7439 + }, + { + "epoch": 1.8088986141502552, + "grad_norm": 19.375, + "learning_rate": 5.72699411935683e-08, + "loss": 0.4997, + "step": 7440 + }, + { + "epoch": 1.8091417456844152, + "grad_norm": 18.875, + "learning_rate": 5.712542838022597e-08, + "loss": 0.5257, + "step": 7441 + }, + { + "epoch": 1.8093848772185752, + "grad_norm": 20.125, + "learning_rate": 5.6981093862589904e-08, + "loss": 0.9424, + "step": 7442 + }, + { + "epoch": 1.8096280087527352, + "grad_norm": 26.125, + "learning_rate": 5.6836937662233385e-08, + "loss": 0.6369, + "step": 7443 + }, + { + "epoch": 1.8098711402868952, + "grad_norm": 22.75, + "learning_rate": 5.6692959800703356e-08, + "loss": 1.0135, + "step": 7444 + }, + { + "epoch": 1.8101142718210552, + "grad_norm": 16.125, + "learning_rate": 5.654916029951968e-08, + "loss": 0.7887, + "step": 7445 + }, + { + "epoch": 1.8103574033552152, + "grad_norm": 22.625, + "learning_rate": 5.640553918017544e-08, + "loss": 0.8612, + "step": 7446 + }, + { + "epoch": 1.810600534889375, + "grad_norm": 19.25, + "learning_rate": 5.6262096464137635e-08, + "loss": 0.6758, + "step": 7447 + }, + { + "epoch": 1.8108436664235352, + "grad_norm": 25.0, + "learning_rate": 5.6118832172845914e-08, + "loss": 0.7105, + "step": 7448 + }, + { + "epoch": 1.811086797957695, + "grad_norm": 19.625, + "learning_rate": 5.5975746327713854e-08, + "loss": 0.6348, + "step": 7449 + }, + { + "epoch": 1.8113299294918552, + "grad_norm": 21.25, + "learning_rate": 5.583283895012781e-08, + "loss": 1.0206, + "step": 7450 + }, + { + "epoch": 1.811573061026015, + "grad_norm": 20.125, + "learning_rate": 5.56901100614482e-08, + "loss": 0.6636, + "step": 7451 + }, + { + "epoch": 1.811816192560175, + "grad_norm": 17.875, + "learning_rate": 5.55475596830081e-08, + "loss": 0.9417, + "step": 7452 + }, + { + "epoch": 1.812059324094335, + "grad_norm": 20.75, + "learning_rate": 5.540518783611393e-08, + "loss": 0.7553, + "step": 7453 + }, + { + "epoch": 1.812302455628495, + "grad_norm": 30.625, + "learning_rate": 5.526299454204603e-08, + "loss": 1.0515, + "step": 7454 + }, + { + "epoch": 1.812545587162655, + "grad_norm": 19.625, + "learning_rate": 5.512097982205741e-08, + "loss": 0.5147, + "step": 7455 + }, + { + "epoch": 1.812788718696815, + "grad_norm": 23.5, + "learning_rate": 5.497914369737442e-08, + "loss": 0.7567, + "step": 7456 + }, + { + "epoch": 1.813031850230975, + "grad_norm": 25.0, + "learning_rate": 5.483748618919732e-08, + "loss": 0.9327, + "step": 7457 + }, + { + "epoch": 1.8132749817651348, + "grad_norm": 25.5, + "learning_rate": 5.4696007318698894e-08, + "loss": 1.2147, + "step": 7458 + }, + { + "epoch": 1.813518113299295, + "grad_norm": 26.875, + "learning_rate": 5.4554707107025846e-08, + "loss": 0.4982, + "step": 7459 + }, + { + "epoch": 1.8137612448334548, + "grad_norm": 20.75, + "learning_rate": 5.44135855752978e-08, + "loss": 0.5704, + "step": 7460 + }, + { + "epoch": 1.814004376367615, + "grad_norm": 13.875, + "learning_rate": 5.4272642744608166e-08, + "loss": 0.4078, + "step": 7461 + }, + { + "epoch": 1.8142475079017748, + "grad_norm": 19.5, + "learning_rate": 5.413187863602287e-08, + "loss": 0.8482, + "step": 7462 + }, + { + "epoch": 1.8144906394359348, + "grad_norm": 19.25, + "learning_rate": 5.399129327058147e-08, + "loss": 0.4841, + "step": 7463 + }, + { + "epoch": 1.8147337709700948, + "grad_norm": 22.875, + "learning_rate": 5.3850886669297304e-08, + "loss": 0.6655, + "step": 7464 + }, + { + "epoch": 1.8149769025042548, + "grad_norm": 18.375, + "learning_rate": 5.371065885315635e-08, + "loss": 0.5621, + "step": 7465 + }, + { + "epoch": 1.8152200340384148, + "grad_norm": 18.75, + "learning_rate": 5.357060984311796e-08, + "loss": 0.3608, + "step": 7466 + }, + { + "epoch": 1.8154631655725748, + "grad_norm": 18.0, + "learning_rate": 5.3430739660114835e-08, + "loss": 0.6578, + "step": 7467 + }, + { + "epoch": 1.8157062971067348, + "grad_norm": 18.125, + "learning_rate": 5.329104832505344e-08, + "loss": 0.725, + "step": 7468 + }, + { + "epoch": 1.8159494286408946, + "grad_norm": 23.25, + "learning_rate": 5.3151535858812634e-08, + "loss": 0.7315, + "step": 7469 + }, + { + "epoch": 1.8161925601750548, + "grad_norm": 17.75, + "learning_rate": 5.301220228224491e-08, + "loss": 0.5221, + "step": 7470 + }, + { + "epoch": 1.8164356917092146, + "grad_norm": 19.625, + "learning_rate": 5.287304761617651e-08, + "loss": 0.7665, + "step": 7471 + }, + { + "epoch": 1.8166788232433748, + "grad_norm": 18.5, + "learning_rate": 5.2734071881406226e-08, + "loss": 0.5429, + "step": 7472 + }, + { + "epoch": 1.8169219547775346, + "grad_norm": 20.625, + "learning_rate": 5.2595275098706186e-08, + "loss": 0.7075, + "step": 7473 + }, + { + "epoch": 1.8171650863116946, + "grad_norm": 19.5, + "learning_rate": 5.245665728882215e-08, + "loss": 0.6894, + "step": 7474 + }, + { + "epoch": 1.8174082178458546, + "grad_norm": 20.875, + "learning_rate": 5.231821847247326e-08, + "loss": 0.6145, + "step": 7475 + }, + { + "epoch": 1.8176513493800146, + "grad_norm": 23.5, + "learning_rate": 5.217995867035114e-08, + "loss": 1.0259, + "step": 7476 + }, + { + "epoch": 1.8178944809141746, + "grad_norm": 18.5, + "learning_rate": 5.204187790312121e-08, + "loss": 0.5431, + "step": 7477 + }, + { + "epoch": 1.8181376124483344, + "grad_norm": 23.0, + "learning_rate": 5.190397619142223e-08, + "loss": 0.6012, + "step": 7478 + }, + { + "epoch": 1.8183807439824946, + "grad_norm": 20.0, + "learning_rate": 5.176625355586579e-08, + "loss": 0.8422, + "step": 7479 + }, + { + "epoch": 1.8186238755166544, + "grad_norm": 23.25, + "learning_rate": 5.162871001703693e-08, + "loss": 0.9246, + "step": 7480 + }, + { + "epoch": 1.8188670070508146, + "grad_norm": 23.75, + "learning_rate": 5.149134559549379e-08, + "loss": 0.79, + "step": 7481 + }, + { + "epoch": 1.8191101385849744, + "grad_norm": 19.25, + "learning_rate": 5.135416031176829e-08, + "loss": 0.6807, + "step": 7482 + }, + { + "epoch": 1.8193532701191346, + "grad_norm": 21.125, + "learning_rate": 5.121715418636472e-08, + "loss": 0.7234, + "step": 7483 + }, + { + "epoch": 1.8195964016532944, + "grad_norm": 20.375, + "learning_rate": 5.108032723976114e-08, + "loss": 0.8167, + "step": 7484 + }, + { + "epoch": 1.8198395331874544, + "grad_norm": 17.75, + "learning_rate": 5.094367949240883e-08, + "loss": 0.586, + "step": 7485 + }, + { + "epoch": 1.8200826647216144, + "grad_norm": 32.5, + "learning_rate": 5.0807210964732014e-08, + "loss": 0.7678, + "step": 7486 + }, + { + "epoch": 1.8203257962557744, + "grad_norm": 28.625, + "learning_rate": 5.067092167712811e-08, + "loss": 1.0422, + "step": 7487 + }, + { + "epoch": 1.8205689277899344, + "grad_norm": 17.25, + "learning_rate": 5.053481164996835e-08, + "loss": 0.6595, + "step": 7488 + }, + { + "epoch": 1.8208120593240942, + "grad_norm": 24.5, + "learning_rate": 5.0398880903596294e-08, + "loss": 0.6553, + "step": 7489 + }, + { + "epoch": 1.8210551908582544, + "grad_norm": 21.75, + "learning_rate": 5.026312945832931e-08, + "loss": 0.805, + "step": 7490 + }, + { + "epoch": 1.8212983223924142, + "grad_norm": 19.75, + "learning_rate": 5.0127557334457846e-08, + "loss": 0.9541, + "step": 7491 + }, + { + "epoch": 1.8215414539265744, + "grad_norm": 21.875, + "learning_rate": 4.999216455224554e-08, + "loss": 1.0441, + "step": 7492 + }, + { + "epoch": 1.8217845854607342, + "grad_norm": 28.875, + "learning_rate": 4.9856951131928983e-08, + "loss": 0.909, + "step": 7493 + }, + { + "epoch": 1.8220277169948942, + "grad_norm": 21.0, + "learning_rate": 4.972191709371826e-08, + "loss": 0.8385, + "step": 7494 + }, + { + "epoch": 1.8222708485290542, + "grad_norm": 21.375, + "learning_rate": 4.9587062457796684e-08, + "loss": 0.9484, + "step": 7495 + }, + { + "epoch": 1.8225139800632142, + "grad_norm": 28.625, + "learning_rate": 4.945238724432047e-08, + "loss": 0.88, + "step": 7496 + }, + { + "epoch": 1.8227571115973742, + "grad_norm": 20.0, + "learning_rate": 4.931789147341895e-08, + "loss": 0.7199, + "step": 7497 + }, + { + "epoch": 1.8230002431315342, + "grad_norm": 18.625, + "learning_rate": 4.918357516519506e-08, + "loss": 0.9908, + "step": 7498 + }, + { + "epoch": 1.8232433746656942, + "grad_norm": 16.625, + "learning_rate": 4.9049438339724836e-08, + "loss": 0.6022, + "step": 7499 + }, + { + "epoch": 1.823486506199854, + "grad_norm": 40.5, + "learning_rate": 4.8915481017057235e-08, + "loss": 1.7718, + "step": 7500 + }, + { + "epoch": 1.8237296377340142, + "grad_norm": 20.75, + "learning_rate": 4.878170321721415e-08, + "loss": 0.9465, + "step": 7501 + }, + { + "epoch": 1.823972769268174, + "grad_norm": 20.125, + "learning_rate": 4.864810496019154e-08, + "loss": 0.6056, + "step": 7502 + }, + { + "epoch": 1.8242159008023342, + "grad_norm": 15.25, + "learning_rate": 4.851468626595773e-08, + "loss": 0.2311, + "step": 7503 + }, + { + "epoch": 1.824459032336494, + "grad_norm": 27.25, + "learning_rate": 4.8381447154454144e-08, + "loss": 0.5883, + "step": 7504 + }, + { + "epoch": 1.824702163870654, + "grad_norm": 30.125, + "learning_rate": 4.824838764559608e-08, + "loss": 0.7411, + "step": 7505 + }, + { + "epoch": 1.824945295404814, + "grad_norm": 22.0, + "learning_rate": 4.811550775927168e-08, + "loss": 0.6528, + "step": 7506 + }, + { + "epoch": 1.825188426938974, + "grad_norm": 21.625, + "learning_rate": 4.798280751534171e-08, + "loss": 0.8038, + "step": 7507 + }, + { + "epoch": 1.825431558473134, + "grad_norm": 28.0, + "learning_rate": 4.7850286933640716e-08, + "loss": 0.9102, + "step": 7508 + }, + { + "epoch": 1.825674690007294, + "grad_norm": 18.5, + "learning_rate": 4.77179460339762e-08, + "loss": 0.7418, + "step": 7509 + }, + { + "epoch": 1.825917821541454, + "grad_norm": 18.75, + "learning_rate": 4.758578483612886e-08, + "loss": 1.2349, + "step": 7510 + }, + { + "epoch": 1.8261609530756138, + "grad_norm": 18.375, + "learning_rate": 4.7453803359852196e-08, + "loss": 0.7239, + "step": 7511 + }, + { + "epoch": 1.826404084609774, + "grad_norm": 26.625, + "learning_rate": 4.732200162487335e-08, + "loss": 1.275, + "step": 7512 + }, + { + "epoch": 1.8266472161439338, + "grad_norm": 20.0, + "learning_rate": 4.719037965089254e-08, + "loss": 0.5573, + "step": 7513 + }, + { + "epoch": 1.826890347678094, + "grad_norm": 21.375, + "learning_rate": 4.705893745758264e-08, + "loss": 0.7901, + "step": 7514 + }, + { + "epoch": 1.8271334792122538, + "grad_norm": 16.625, + "learning_rate": 4.692767506458987e-08, + "loss": 0.3731, + "step": 7515 + }, + { + "epoch": 1.8273766107464138, + "grad_norm": 18.0, + "learning_rate": 4.6796592491534094e-08, + "loss": 0.5301, + "step": 7516 + }, + { + "epoch": 1.8276197422805738, + "grad_norm": 19.25, + "learning_rate": 4.666568975800756e-08, + "loss": 0.4829, + "step": 7517 + }, + { + "epoch": 1.8278628738147338, + "grad_norm": 27.625, + "learning_rate": 4.653496688357586e-08, + "loss": 1.1146, + "step": 7518 + }, + { + "epoch": 1.8281060053488938, + "grad_norm": 23.625, + "learning_rate": 4.640442388777797e-08, + "loss": 1.0188, + "step": 7519 + }, + { + "epoch": 1.8283491368830538, + "grad_norm": 15.125, + "learning_rate": 4.6274060790125754e-08, + "loss": 0.2492, + "step": 7520 + }, + { + "epoch": 1.8285922684172138, + "grad_norm": 20.875, + "learning_rate": 4.614387761010433e-08, + "loss": 0.5477, + "step": 7521 + }, + { + "epoch": 1.8288353999513736, + "grad_norm": 18.125, + "learning_rate": 4.6013874367171475e-08, + "loss": 0.4903, + "step": 7522 + }, + { + "epoch": 1.8290785314855338, + "grad_norm": 21.25, + "learning_rate": 4.5884051080758856e-08, + "loss": 0.7898, + "step": 7523 + }, + { + "epoch": 1.8293216630196936, + "grad_norm": 19.875, + "learning_rate": 4.575440777027054e-08, + "loss": 0.6642, + "step": 7524 + }, + { + "epoch": 1.8295647945538538, + "grad_norm": 18.625, + "learning_rate": 4.5624944455083944e-08, + "loss": 0.6796, + "step": 7525 + }, + { + "epoch": 1.8298079260880136, + "grad_norm": 16.5, + "learning_rate": 4.5495661154549844e-08, + "loss": 0.6323, + "step": 7526 + }, + { + "epoch": 1.8300510576221736, + "grad_norm": 14.5625, + "learning_rate": 4.5366557887991516e-08, + "loss": 0.283, + "step": 7527 + }, + { + "epoch": 1.8302941891563336, + "grad_norm": 16.25, + "learning_rate": 4.523763467470591e-08, + "loss": 0.6075, + "step": 7528 + }, + { + "epoch": 1.8305373206904936, + "grad_norm": 16.625, + "learning_rate": 4.510889153396286e-08, + "loss": 0.4247, + "step": 7529 + }, + { + "epoch": 1.8307804522246536, + "grad_norm": 37.75, + "learning_rate": 4.498032848500505e-08, + "loss": 1.2243, + "step": 7530 + }, + { + "epoch": 1.8310235837588134, + "grad_norm": 20.125, + "learning_rate": 4.4851945547048774e-08, + "loss": 0.8127, + "step": 7531 + }, + { + "epoch": 1.8312667152929736, + "grad_norm": 23.625, + "learning_rate": 4.472374273928257e-08, + "loss": 0.7337, + "step": 7532 + }, + { + "epoch": 1.8315098468271334, + "grad_norm": 20.75, + "learning_rate": 4.4595720080869016e-08, + "loss": 0.692, + "step": 7533 + }, + { + "epoch": 1.8317529783612936, + "grad_norm": 19.5, + "learning_rate": 4.446787759094323e-08, + "loss": 0.3627, + "step": 7534 + }, + { + "epoch": 1.8319961098954534, + "grad_norm": 36.25, + "learning_rate": 4.434021528861299e-08, + "loss": 0.9994, + "step": 7535 + }, + { + "epoch": 1.8322392414296136, + "grad_norm": 19.5, + "learning_rate": 4.4212733192960377e-08, + "loss": 0.7424, + "step": 7536 + }, + { + "epoch": 1.8324823729637734, + "grad_norm": 23.125, + "learning_rate": 4.408543132303947e-08, + "loss": 0.8957, + "step": 7537 + }, + { + "epoch": 1.8327255044979334, + "grad_norm": 19.125, + "learning_rate": 4.3958309697877815e-08, + "loss": 0.5535, + "step": 7538 + }, + { + "epoch": 1.8329686360320934, + "grad_norm": 16.125, + "learning_rate": 4.38313683364755e-08, + "loss": 0.4912, + "step": 7539 + }, + { + "epoch": 1.8332117675662534, + "grad_norm": 26.0, + "learning_rate": 4.3704607257806644e-08, + "loss": 0.7422, + "step": 7540 + }, + { + "epoch": 1.8334548991004134, + "grad_norm": 40.25, + "learning_rate": 4.357802648081777e-08, + "loss": 1.1415, + "step": 7541 + }, + { + "epoch": 1.8336980306345732, + "grad_norm": 25.125, + "learning_rate": 4.3451626024428315e-08, + "loss": 0.6099, + "step": 7542 + }, + { + "epoch": 1.8339411621687334, + "grad_norm": 19.25, + "learning_rate": 4.332540590753109e-08, + "loss": 0.6833, + "step": 7543 + }, + { + "epoch": 1.8341842937028932, + "grad_norm": 18.0, + "learning_rate": 4.3199366148992115e-08, + "loss": 0.6793, + "step": 7544 + }, + { + "epoch": 1.8344274252370534, + "grad_norm": 20.5, + "learning_rate": 4.3073506767649794e-08, + "loss": 0.7335, + "step": 7545 + }, + { + "epoch": 1.8346705567712132, + "grad_norm": 22.25, + "learning_rate": 4.294782778231657e-08, + "loss": 0.8019, + "step": 7546 + }, + { + "epoch": 1.8349136883053732, + "grad_norm": 17.125, + "learning_rate": 4.282232921177687e-08, + "loss": 0.5312, + "step": 7547 + }, + { + "epoch": 1.8351568198395332, + "grad_norm": 24.625, + "learning_rate": 4.269701107478874e-08, + "loss": 0.853, + "step": 7548 + }, + { + "epoch": 1.8353999513736932, + "grad_norm": 20.875, + "learning_rate": 4.257187339008303e-08, + "loss": 0.6265, + "step": 7549 + }, + { + "epoch": 1.8356430829078532, + "grad_norm": 15.4375, + "learning_rate": 4.2446916176363955e-08, + "loss": 0.3864, + "step": 7550 + }, + { + "epoch": 1.8358862144420132, + "grad_norm": 19.125, + "learning_rate": 4.232213945230837e-08, + "loss": 0.4786, + "step": 7551 + }, + { + "epoch": 1.8361293459761732, + "grad_norm": 16.875, + "learning_rate": 4.219754323656636e-08, + "loss": 0.8536, + "step": 7552 + }, + { + "epoch": 1.836372477510333, + "grad_norm": 24.125, + "learning_rate": 4.207312754776094e-08, + "loss": 0.554, + "step": 7553 + }, + { + "epoch": 1.8366156090444932, + "grad_norm": 16.625, + "learning_rate": 4.1948892404488487e-08, + "loss": 0.3813, + "step": 7554 + }, + { + "epoch": 1.836858740578653, + "grad_norm": 18.0, + "learning_rate": 4.1824837825317626e-08, + "loss": 0.6299, + "step": 7555 + }, + { + "epoch": 1.8371018721128132, + "grad_norm": 19.0, + "learning_rate": 4.1700963828790606e-08, + "loss": 0.6437, + "step": 7556 + }, + { + "epoch": 1.837345003646973, + "grad_norm": 18.125, + "learning_rate": 4.1577270433422745e-08, + "loss": 0.6631, + "step": 7557 + }, + { + "epoch": 1.837588135181133, + "grad_norm": 28.25, + "learning_rate": 4.145375765770177e-08, + "loss": 1.2545, + "step": 7558 + }, + { + "epoch": 1.837831266715293, + "grad_norm": 24.25, + "learning_rate": 4.133042552008915e-08, + "loss": 0.8158, + "step": 7559 + }, + { + "epoch": 1.838074398249453, + "grad_norm": 17.5, + "learning_rate": 4.120727403901889e-08, + "loss": 0.6079, + "step": 7560 + }, + { + "epoch": 1.838317529783613, + "grad_norm": 15.875, + "learning_rate": 4.108430323289822e-08, + "loss": 0.3736, + "step": 7561 + }, + { + "epoch": 1.838560661317773, + "grad_norm": 22.5, + "learning_rate": 4.0961513120107015e-08, + "loss": 0.5513, + "step": 7562 + }, + { + "epoch": 1.838803792851933, + "grad_norm": 15.875, + "learning_rate": 4.0838903718998516e-08, + "loss": 0.6372, + "step": 7563 + }, + { + "epoch": 1.8390469243860927, + "grad_norm": 23.875, + "learning_rate": 4.071647504789875e-08, + "loss": 0.8594, + "step": 7564 + }, + { + "epoch": 1.839290055920253, + "grad_norm": 17.875, + "learning_rate": 4.059422712510697e-08, + "loss": 0.8465, + "step": 7565 + }, + { + "epoch": 1.8395331874544127, + "grad_norm": 24.5, + "learning_rate": 4.047215996889481e-08, + "loss": 0.8671, + "step": 7566 + }, + { + "epoch": 1.839776318988573, + "grad_norm": 21.5, + "learning_rate": 4.035027359750782e-08, + "loss": 0.8169, + "step": 7567 + }, + { + "epoch": 1.8400194505227327, + "grad_norm": 16.75, + "learning_rate": 4.022856802916392e-08, + "loss": 0.4717, + "step": 7568 + }, + { + "epoch": 1.8402625820568927, + "grad_norm": 18.5, + "learning_rate": 4.01070432820537e-08, + "loss": 0.5276, + "step": 7569 + }, + { + "epoch": 1.8405057135910527, + "grad_norm": 19.0, + "learning_rate": 3.9985699374341656e-08, + "loss": 0.6881, + "step": 7570 + }, + { + "epoch": 1.8407488451252128, + "grad_norm": 23.625, + "learning_rate": 3.9864536324164536e-08, + "loss": 0.7364, + "step": 7571 + }, + { + "epoch": 1.8409919766593728, + "grad_norm": 18.5, + "learning_rate": 3.97435541496323e-08, + "loss": 0.5181, + "step": 7572 + }, + { + "epoch": 1.8412351081935328, + "grad_norm": 17.75, + "learning_rate": 3.962275286882741e-08, + "loss": 0.4092, + "step": 7573 + }, + { + "epoch": 1.8414782397276928, + "grad_norm": 19.875, + "learning_rate": 3.950213249980614e-08, + "loss": 0.8533, + "step": 7574 + }, + { + "epoch": 1.8417213712618525, + "grad_norm": 26.875, + "learning_rate": 3.9381693060597385e-08, + "loss": 0.9544, + "step": 7575 + }, + { + "epoch": 1.8419645027960128, + "grad_norm": 21.125, + "learning_rate": 3.926143456920259e-08, + "loss": 0.7246, + "step": 7576 + }, + { + "epoch": 1.8422076343301725, + "grad_norm": 17.375, + "learning_rate": 3.914135704359667e-08, + "loss": 0.5233, + "step": 7577 + }, + { + "epoch": 1.8424507658643328, + "grad_norm": 17.125, + "learning_rate": 3.9021460501727086e-08, + "loss": 0.5068, + "step": 7578 + }, + { + "epoch": 1.8426938973984925, + "grad_norm": 22.0, + "learning_rate": 3.89017449615145e-08, + "loss": 0.4583, + "step": 7579 + }, + { + "epoch": 1.8429370289326525, + "grad_norm": 18.625, + "learning_rate": 3.878221044085265e-08, + "loss": 0.4468, + "step": 7580 + }, + { + "epoch": 1.8431801604668125, + "grad_norm": 20.25, + "learning_rate": 3.866285695760794e-08, + "loss": 1.0066, + "step": 7581 + }, + { + "epoch": 1.8434232920009725, + "grad_norm": 20.125, + "learning_rate": 3.854368452961957e-08, + "loss": 0.2508, + "step": 7582 + }, + { + "epoch": 1.8436664235351325, + "grad_norm": 21.75, + "learning_rate": 3.842469317470024e-08, + "loss": 0.5249, + "step": 7583 + }, + { + "epoch": 1.8439095550692923, + "grad_norm": 25.25, + "learning_rate": 3.830588291063517e-08, + "loss": 0.7665, + "step": 7584 + }, + { + "epoch": 1.8441526866034526, + "grad_norm": 27.125, + "learning_rate": 3.818725375518265e-08, + "loss": 0.7356, + "step": 7585 + }, + { + "epoch": 1.8443958181376123, + "grad_norm": 17.875, + "learning_rate": 3.8068805726073634e-08, + "loss": 0.4984, + "step": 7586 + }, + { + "epoch": 1.8446389496717726, + "grad_norm": 15.4375, + "learning_rate": 3.7950538841012434e-08, + "loss": 0.5548, + "step": 7587 + }, + { + "epoch": 1.8448820812059323, + "grad_norm": 17.625, + "learning_rate": 3.7832453117676024e-08, + "loss": 0.4331, + "step": 7588 + }, + { + "epoch": 1.8451252127400923, + "grad_norm": 30.125, + "learning_rate": 3.7714548573714313e-08, + "loss": 0.8156, + "step": 7589 + }, + { + "epoch": 1.8453683442742523, + "grad_norm": 18.625, + "learning_rate": 3.759682522675015e-08, + "loss": 0.6269, + "step": 7590 + }, + { + "epoch": 1.8456114758084123, + "grad_norm": 19.125, + "learning_rate": 3.7479283094379596e-08, + "loss": 0.615, + "step": 7591 + }, + { + "epoch": 1.8458546073425723, + "grad_norm": 22.875, + "learning_rate": 3.736192219417109e-08, + "loss": 0.7458, + "step": 7592 + }, + { + "epoch": 1.8460977388767323, + "grad_norm": 17.0, + "learning_rate": 3.724474254366617e-08, + "loss": 0.7284, + "step": 7593 + }, + { + "epoch": 1.8463408704108923, + "grad_norm": 18.5, + "learning_rate": 3.7127744160379565e-08, + "loss": 0.544, + "step": 7594 + }, + { + "epoch": 1.8465840019450521, + "grad_norm": 17.0, + "learning_rate": 3.7010927061798676e-08, + "loss": 1.2383, + "step": 7595 + }, + { + "epoch": 1.8468271334792123, + "grad_norm": 22.625, + "learning_rate": 3.689429126538372e-08, + "loss": 0.5412, + "step": 7596 + }, + { + "epoch": 1.8470702650133721, + "grad_norm": 26.75, + "learning_rate": 3.67778367885678e-08, + "loss": 0.8867, + "step": 7597 + }, + { + "epoch": 1.8473133965475323, + "grad_norm": 21.375, + "learning_rate": 3.6661563648757586e-08, + "loss": 0.7603, + "step": 7598 + }, + { + "epoch": 1.8475565280816921, + "grad_norm": 19.0, + "learning_rate": 3.6545471863331656e-08, + "loss": 0.7846, + "step": 7599 + }, + { + "epoch": 1.8477996596158521, + "grad_norm": 21.375, + "learning_rate": 3.642956144964183e-08, + "loss": 0.5344, + "step": 7600 + }, + { + "epoch": 1.8480427911500121, + "grad_norm": 21.125, + "learning_rate": 3.631383242501341e-08, + "loss": 1.0846, + "step": 7601 + }, + { + "epoch": 1.8482859226841721, + "grad_norm": 22.5, + "learning_rate": 3.6198284806743814e-08, + "loss": 0.6092, + "step": 7602 + }, + { + "epoch": 1.8485290542183321, + "grad_norm": 20.875, + "learning_rate": 3.608291861210339e-08, + "loss": 0.7028, + "step": 7603 + }, + { + "epoch": 1.8487721857524921, + "grad_norm": 16.5, + "learning_rate": 3.596773385833613e-08, + "loss": 0.399, + "step": 7604 + }, + { + "epoch": 1.8490153172866521, + "grad_norm": 27.5, + "learning_rate": 3.585273056265784e-08, + "loss": 1.1686, + "step": 7605 + }, + { + "epoch": 1.849258448820812, + "grad_norm": 19.5, + "learning_rate": 3.573790874225824e-08, + "loss": 0.6415, + "step": 7606 + }, + { + "epoch": 1.8495015803549721, + "grad_norm": 19.625, + "learning_rate": 3.562326841429903e-08, + "loss": 0.7175, + "step": 7607 + }, + { + "epoch": 1.849744711889132, + "grad_norm": 23.875, + "learning_rate": 3.550880959591552e-08, + "loss": 0.8093, + "step": 7608 + }, + { + "epoch": 1.8499878434232921, + "grad_norm": 22.25, + "learning_rate": 3.5394532304215413e-08, + "loss": 0.6922, + "step": 7609 + }, + { + "epoch": 1.850230974957452, + "grad_norm": 13.125, + "learning_rate": 3.5280436556279216e-08, + "loss": 0.2974, + "step": 7610 + }, + { + "epoch": 1.850474106491612, + "grad_norm": 26.25, + "learning_rate": 3.516652236916093e-08, + "loss": 0.9919, + "step": 7611 + }, + { + "epoch": 1.850717238025772, + "grad_norm": 32.75, + "learning_rate": 3.5052789759886656e-08, + "loss": 0.7669, + "step": 7612 + }, + { + "epoch": 1.850960369559932, + "grad_norm": 20.375, + "learning_rate": 3.493923874545585e-08, + "loss": 0.7524, + "step": 7613 + }, + { + "epoch": 1.851203501094092, + "grad_norm": 23.25, + "learning_rate": 3.482586934284049e-08, + "loss": 0.8704, + "step": 7614 + }, + { + "epoch": 1.851446632628252, + "grad_norm": 15.75, + "learning_rate": 3.471268156898605e-08, + "loss": 0.7977, + "step": 7615 + }, + { + "epoch": 1.851689764162412, + "grad_norm": 16.5, + "learning_rate": 3.459967544080997e-08, + "loss": 0.3564, + "step": 7616 + }, + { + "epoch": 1.8519328956965717, + "grad_norm": 13.6875, + "learning_rate": 3.4486850975202904e-08, + "loss": 0.4405, + "step": 7617 + }, + { + "epoch": 1.852176027230732, + "grad_norm": 20.75, + "learning_rate": 3.437420818902873e-08, + "loss": 0.57, + "step": 7618 + }, + { + "epoch": 1.8524191587648917, + "grad_norm": 20.25, + "learning_rate": 3.4261747099123834e-08, + "loss": 0.7879, + "step": 7619 + }, + { + "epoch": 1.852662290299052, + "grad_norm": 17.125, + "learning_rate": 3.414946772229713e-08, + "loss": 0.6246, + "step": 7620 + }, + { + "epoch": 1.8529054218332117, + "grad_norm": 24.0, + "learning_rate": 3.4037370075331035e-08, + "loss": 0.9031, + "step": 7621 + }, + { + "epoch": 1.8531485533673717, + "grad_norm": 22.875, + "learning_rate": 3.392545417498047e-08, + "loss": 0.6658, + "step": 7622 + }, + { + "epoch": 1.8533916849015317, + "grad_norm": 19.75, + "learning_rate": 3.3813720037973034e-08, + "loss": 0.9077, + "step": 7623 + }, + { + "epoch": 1.8536348164356917, + "grad_norm": 18.875, + "learning_rate": 3.37021676810094e-08, + "loss": 1.0388, + "step": 7624 + }, + { + "epoch": 1.8538779479698517, + "grad_norm": 12.8125, + "learning_rate": 3.359079712076316e-08, + "loss": 0.4981, + "step": 7625 + }, + { + "epoch": 1.8541210795040115, + "grad_norm": 18.875, + "learning_rate": 3.347960837388031e-08, + "loss": 0.6078, + "step": 7626 + }, + { + "epoch": 1.8543642110381717, + "grad_norm": 19.875, + "learning_rate": 3.336860145697979e-08, + "loss": 0.6266, + "step": 7627 + }, + { + "epoch": 1.8546073425723315, + "grad_norm": 17.875, + "learning_rate": 3.3257776386653865e-08, + "loss": 0.6471, + "step": 7628 + }, + { + "epoch": 1.8548504741064917, + "grad_norm": 22.875, + "learning_rate": 3.3147133179467215e-08, + "loss": 0.5607, + "step": 7629 + }, + { + "epoch": 1.8550936056406515, + "grad_norm": 20.75, + "learning_rate": 3.30366718519573e-08, + "loss": 0.5692, + "step": 7630 + }, + { + "epoch": 1.8553367371748117, + "grad_norm": 14.125, + "learning_rate": 3.292639242063439e-08, + "loss": 0.5095, + "step": 7631 + }, + { + "epoch": 1.8555798687089715, + "grad_norm": 23.375, + "learning_rate": 3.281629490198182e-08, + "loss": 1.0919, + "step": 7632 + }, + { + "epoch": 1.8558230002431315, + "grad_norm": 22.0, + "learning_rate": 3.270637931245546e-08, + "loss": 0.77, + "step": 7633 + }, + { + "epoch": 1.8560661317772915, + "grad_norm": 25.625, + "learning_rate": 3.2596645668483974e-08, + "loss": 0.6726, + "step": 7634 + }, + { + "epoch": 1.8563092633114515, + "grad_norm": 19.125, + "learning_rate": 3.24870939864691e-08, + "loss": 0.6936, + "step": 7635 + }, + { + "epoch": 1.8565523948456115, + "grad_norm": 21.25, + "learning_rate": 3.237772428278524e-08, + "loss": 0.8342, + "step": 7636 + }, + { + "epoch": 1.8567955263797713, + "grad_norm": 25.5, + "learning_rate": 3.2268536573779596e-08, + "loss": 0.8295, + "step": 7637 + }, + { + "epoch": 1.8570386579139315, + "grad_norm": 22.25, + "learning_rate": 3.2159530875771896e-08, + "loss": 0.6691, + "step": 7638 + }, + { + "epoch": 1.8572817894480913, + "grad_norm": 19.125, + "learning_rate": 3.205070720505535e-08, + "loss": 0.726, + "step": 7639 + }, + { + "epoch": 1.8575249209822515, + "grad_norm": 15.5625, + "learning_rate": 3.194206557789542e-08, + "loss": 0.6589, + "step": 7640 + }, + { + "epoch": 1.8577680525164113, + "grad_norm": 18.75, + "learning_rate": 3.1833606010530094e-08, + "loss": 0.5514, + "step": 7641 + }, + { + "epoch": 1.8580111840505713, + "grad_norm": 22.125, + "learning_rate": 3.1725328519170984e-08, + "loss": 0.8581, + "step": 7642 + }, + { + "epoch": 1.8582543155847313, + "grad_norm": 21.75, + "learning_rate": 3.1617233120001814e-08, + "loss": 0.9477, + "step": 7643 + }, + { + "epoch": 1.8584974471188913, + "grad_norm": 21.25, + "learning_rate": 3.150931982917937e-08, + "loss": 1.2574, + "step": 7644 + }, + { + "epoch": 1.8587405786530513, + "grad_norm": 16.375, + "learning_rate": 3.140158866283313e-08, + "loss": 0.5894, + "step": 7645 + }, + { + "epoch": 1.8589837101872113, + "grad_norm": 22.625, + "learning_rate": 3.129403963706548e-08, + "loss": 0.5343, + "step": 7646 + }, + { + "epoch": 1.8592268417213713, + "grad_norm": 17.5, + "learning_rate": 3.1186672767951345e-08, + "loss": 0.5812, + "step": 7647 + }, + { + "epoch": 1.859469973255531, + "grad_norm": 18.5, + "learning_rate": 3.107948807153843e-08, + "loss": 0.6572, + "step": 7648 + }, + { + "epoch": 1.8597131047896913, + "grad_norm": 20.625, + "learning_rate": 3.0972485563847684e-08, + "loss": 0.7052, + "step": 7649 + }, + { + "epoch": 1.859956236323851, + "grad_norm": 23.0, + "learning_rate": 3.0865665260872287e-08, + "loss": 0.5064, + "step": 7650 + }, + { + "epoch": 1.8601993678580113, + "grad_norm": 21.0, + "learning_rate": 3.0759027178578345e-08, + "loss": 0.839, + "step": 7651 + }, + { + "epoch": 1.860442499392171, + "grad_norm": 19.875, + "learning_rate": 3.065257133290478e-08, + "loss": 0.63, + "step": 7652 + }, + { + "epoch": 1.860685630926331, + "grad_norm": 14.9375, + "learning_rate": 3.054629773976331e-08, + "loss": 0.3206, + "step": 7653 + }, + { + "epoch": 1.860928762460491, + "grad_norm": 30.625, + "learning_rate": 3.044020641503845e-08, + "loss": 0.7292, + "step": 7654 + }, + { + "epoch": 1.861171893994651, + "grad_norm": 18.25, + "learning_rate": 3.03342973745871e-08, + "loss": 0.7985, + "step": 7655 + }, + { + "epoch": 1.861415025528811, + "grad_norm": 20.5, + "learning_rate": 3.0228570634239505e-08, + "loss": 0.4857, + "step": 7656 + }, + { + "epoch": 1.861658157062971, + "grad_norm": 18.75, + "learning_rate": 3.0123026209798176e-08, + "loss": 0.6108, + "step": 7657 + }, + { + "epoch": 1.861901288597131, + "grad_norm": 20.75, + "learning_rate": 3.001766411703855e-08, + "loss": 0.523, + "step": 7658 + }, + { + "epoch": 1.862144420131291, + "grad_norm": 22.25, + "learning_rate": 2.991248437170871e-08, + "loss": 0.9962, + "step": 7659 + }, + { + "epoch": 1.8623875516654511, + "grad_norm": 19.75, + "learning_rate": 2.980748698952998e-08, + "loss": 0.4358, + "step": 7660 + }, + { + "epoch": 1.862630683199611, + "grad_norm": 17.625, + "learning_rate": 2.9702671986195642e-08, + "loss": 0.4995, + "step": 7661 + }, + { + "epoch": 1.8628738147337711, + "grad_norm": 18.875, + "learning_rate": 2.9598039377372184e-08, + "loss": 0.7364, + "step": 7662 + }, + { + "epoch": 1.863116946267931, + "grad_norm": 17.875, + "learning_rate": 2.9493589178698905e-08, + "loss": 0.4898, + "step": 7663 + }, + { + "epoch": 1.863360077802091, + "grad_norm": 20.5, + "learning_rate": 2.9389321405787623e-08, + "loss": 0.8234, + "step": 7664 + }, + { + "epoch": 1.863603209336251, + "grad_norm": 18.125, + "learning_rate": 2.9285236074222817e-08, + "loss": 0.7785, + "step": 7665 + }, + { + "epoch": 1.863846340870411, + "grad_norm": 21.875, + "learning_rate": 2.918133319956204e-08, + "loss": 0.8563, + "step": 7666 + }, + { + "epoch": 1.864089472404571, + "grad_norm": 17.5, + "learning_rate": 2.907761279733523e-08, + "loss": 0.654, + "step": 7667 + }, + { + "epoch": 1.864332603938731, + "grad_norm": 15.625, + "learning_rate": 2.8974074883045406e-08, + "loss": 0.3309, + "step": 7668 + }, + { + "epoch": 1.864575735472891, + "grad_norm": 20.75, + "learning_rate": 2.8870719472167684e-08, + "loss": 1.0329, + "step": 7669 + }, + { + "epoch": 1.8648188670070507, + "grad_norm": 23.125, + "learning_rate": 2.8767546580150823e-08, + "loss": 0.9319, + "step": 7670 + }, + { + "epoch": 1.865061998541211, + "grad_norm": 20.875, + "learning_rate": 2.8664556222415403e-08, + "loss": 0.6446, + "step": 7671 + }, + { + "epoch": 1.8653051300753707, + "grad_norm": 17.375, + "learning_rate": 2.8561748414355217e-08, + "loss": 0.6958, + "step": 7672 + }, + { + "epoch": 1.865548261609531, + "grad_norm": 25.75, + "learning_rate": 2.8459123171336868e-08, + "loss": 0.8497, + "step": 7673 + }, + { + "epoch": 1.8657913931436907, + "grad_norm": 15.875, + "learning_rate": 2.835668050869905e-08, + "loss": 0.6374, + "step": 7674 + }, + { + "epoch": 1.8660345246778507, + "grad_norm": 25.875, + "learning_rate": 2.825442044175397e-08, + "loss": 0.6558, + "step": 7675 + }, + { + "epoch": 1.8662776562120107, + "grad_norm": 23.375, + "learning_rate": 2.8152342985786062e-08, + "loss": 0.709, + "step": 7676 + }, + { + "epoch": 1.8665207877461707, + "grad_norm": 23.125, + "learning_rate": 2.8050448156052566e-08, + "loss": 1.2235, + "step": 7677 + }, + { + "epoch": 1.8667639192803307, + "grad_norm": 25.375, + "learning_rate": 2.7948735967783247e-08, + "loss": 1.0966, + "step": 7678 + }, + { + "epoch": 1.8670070508144905, + "grad_norm": 18.625, + "learning_rate": 2.784720643618094e-08, + "loss": 0.6256, + "step": 7679 + }, + { + "epoch": 1.8672501823486507, + "grad_norm": 21.875, + "learning_rate": 2.774585957642101e-08, + "loss": 0.8769, + "step": 7680 + }, + { + "epoch": 1.8674933138828105, + "grad_norm": 15.9375, + "learning_rate": 2.7644695403651472e-08, + "loss": 0.5008, + "step": 7681 + }, + { + "epoch": 1.8677364454169707, + "grad_norm": 17.5, + "learning_rate": 2.7543713932992878e-08, + "loss": 0.4466, + "step": 7682 + }, + { + "epoch": 1.8679795769511305, + "grad_norm": 20.25, + "learning_rate": 2.7442915179538843e-08, + "loss": 0.5773, + "step": 7683 + }, + { + "epoch": 1.8682227084852907, + "grad_norm": 19.875, + "learning_rate": 2.7342299158355374e-08, + "loss": 0.4528, + "step": 7684 + }, + { + "epoch": 1.8684658400194505, + "grad_norm": 17.5, + "learning_rate": 2.7241865884481412e-08, + "loss": 0.4812, + "step": 7685 + }, + { + "epoch": 1.8687089715536105, + "grad_norm": 19.125, + "learning_rate": 2.714161537292828e-08, + "loss": 0.9347, + "step": 7686 + }, + { + "epoch": 1.8689521030877705, + "grad_norm": 18.0, + "learning_rate": 2.7041547638680388e-08, + "loss": 0.4699, + "step": 7687 + }, + { + "epoch": 1.8691952346219305, + "grad_norm": 18.5, + "learning_rate": 2.6941662696694236e-08, + "loss": 0.6178, + "step": 7688 + }, + { + "epoch": 1.8694383661560905, + "grad_norm": 14.375, + "learning_rate": 2.6841960561899555e-08, + "loss": 0.2273, + "step": 7689 + }, + { + "epoch": 1.8696814976902503, + "grad_norm": 17.125, + "learning_rate": 2.6742441249198586e-08, + "loss": 0.7081, + "step": 7690 + }, + { + "epoch": 1.8699246292244105, + "grad_norm": 16.125, + "learning_rate": 2.664310477346624e-08, + "loss": 0.5804, + "step": 7691 + }, + { + "epoch": 1.8701677607585703, + "grad_norm": 21.625, + "learning_rate": 2.654395114954994e-08, + "loss": 0.6758, + "step": 7692 + }, + { + "epoch": 1.8704108922927305, + "grad_norm": 42.5, + "learning_rate": 2.6444980392269782e-08, + "loss": 0.7383, + "step": 7693 + }, + { + "epoch": 1.8706540238268903, + "grad_norm": 15.9375, + "learning_rate": 2.6346192516419065e-08, + "loss": 0.7225, + "step": 7694 + }, + { + "epoch": 1.8708971553610503, + "grad_norm": 24.25, + "learning_rate": 2.624758753676307e-08, + "loss": 0.6042, + "step": 7695 + }, + { + "epoch": 1.8711402868952103, + "grad_norm": 19.5, + "learning_rate": 2.614916546804e-08, + "loss": 1.126, + "step": 7696 + }, + { + "epoch": 1.8713834184293703, + "grad_norm": 21.0, + "learning_rate": 2.6050926324960728e-08, + "loss": 0.733, + "step": 7697 + }, + { + "epoch": 1.8716265499635303, + "grad_norm": 19.0, + "learning_rate": 2.595287012220893e-08, + "loss": 0.7603, + "step": 7698 + }, + { + "epoch": 1.8718696814976903, + "grad_norm": 29.5, + "learning_rate": 2.585499687444079e-08, + "loss": 0.7504, + "step": 7699 + }, + { + "epoch": 1.8721128130318503, + "grad_norm": 24.25, + "learning_rate": 2.5757306596284892e-08, + "loss": 0.8763, + "step": 7700 + }, + { + "epoch": 1.87235594456601, + "grad_norm": 18.5, + "learning_rate": 2.5659799302343026e-08, + "loss": 0.9594, + "step": 7701 + }, + { + "epoch": 1.8725990761001703, + "grad_norm": 17.125, + "learning_rate": 2.5562475007189364e-08, + "loss": 0.4957, + "step": 7702 + }, + { + "epoch": 1.87284220763433, + "grad_norm": 20.375, + "learning_rate": 2.546533372537033e-08, + "loss": 0.9014, + "step": 7703 + }, + { + "epoch": 1.8730853391684903, + "grad_norm": 20.125, + "learning_rate": 2.536837547140583e-08, + "loss": 0.4721, + "step": 7704 + }, + { + "epoch": 1.87332847070265, + "grad_norm": 26.75, + "learning_rate": 2.5271600259787617e-08, + "loss": 0.875, + "step": 7705 + }, + { + "epoch": 1.87357160223681, + "grad_norm": 23.5, + "learning_rate": 2.5175008104980636e-08, + "loss": 0.8767, + "step": 7706 + }, + { + "epoch": 1.87381473377097, + "grad_norm": 17.5, + "learning_rate": 2.5078599021421957e-08, + "loss": 0.5381, + "step": 7707 + }, + { + "epoch": 1.87405786530513, + "grad_norm": 17.75, + "learning_rate": 2.4982373023521994e-08, + "loss": 0.4825, + "step": 7708 + }, + { + "epoch": 1.87430099683929, + "grad_norm": 33.5, + "learning_rate": 2.4886330125663133e-08, + "loss": 1.1793, + "step": 7709 + }, + { + "epoch": 1.87454412837345, + "grad_norm": 26.0, + "learning_rate": 2.4790470342200556e-08, + "loss": 0.8028, + "step": 7710 + }, + { + "epoch": 1.87478725990761, + "grad_norm": 19.625, + "learning_rate": 2.4694793687462386e-08, + "loss": 1.033, + "step": 7711 + }, + { + "epoch": 1.8750303914417699, + "grad_norm": 25.125, + "learning_rate": 2.459930017574913e-08, + "loss": 0.9768, + "step": 7712 + }, + { + "epoch": 1.87527352297593, + "grad_norm": 20.25, + "learning_rate": 2.4503989821333675e-08, + "loss": 0.5419, + "step": 7713 + }, + { + "epoch": 1.8755166545100899, + "grad_norm": 20.125, + "learning_rate": 2.440886263846212e-08, + "loss": 0.9091, + "step": 7714 + }, + { + "epoch": 1.87575978604425, + "grad_norm": 30.25, + "learning_rate": 2.4313918641352684e-08, + "loss": 0.5382, + "step": 7715 + }, + { + "epoch": 1.8760029175784099, + "grad_norm": 27.0, + "learning_rate": 2.4219157844196512e-08, + "loss": 0.6541, + "step": 7716 + }, + { + "epoch": 1.8762460491125699, + "grad_norm": 18.125, + "learning_rate": 2.4124580261156994e-08, + "loss": 0.3218, + "step": 7717 + }, + { + "epoch": 1.8764891806467299, + "grad_norm": 23.5, + "learning_rate": 2.403018590637074e-08, + "loss": 0.6784, + "step": 7718 + }, + { + "epoch": 1.8767323121808899, + "grad_norm": 19.25, + "learning_rate": 2.3935974793946466e-08, + "loss": 0.5894, + "step": 7719 + }, + { + "epoch": 1.8769754437150499, + "grad_norm": 16.125, + "learning_rate": 2.3841946937965404e-08, + "loss": 0.7901, + "step": 7720 + }, + { + "epoch": 1.8772185752492099, + "grad_norm": 17.75, + "learning_rate": 2.3748102352481873e-08, + "loss": 0.6835, + "step": 7721 + }, + { + "epoch": 1.8774617067833699, + "grad_norm": 41.25, + "learning_rate": 2.3654441051522704e-08, + "loss": 1.0942, + "step": 7722 + }, + { + "epoch": 1.8777048383175297, + "grad_norm": 22.125, + "learning_rate": 2.3560963049086975e-08, + "loss": 0.7283, + "step": 7723 + }, + { + "epoch": 1.8779479698516899, + "grad_norm": 27.5, + "learning_rate": 2.346766835914671e-08, + "loss": 0.7578, + "step": 7724 + }, + { + "epoch": 1.8781911013858497, + "grad_norm": 17.5, + "learning_rate": 2.3374556995646307e-08, + "loss": 0.3706, + "step": 7725 + }, + { + "epoch": 1.8784342329200099, + "grad_norm": 25.375, + "learning_rate": 2.328162897250297e-08, + "loss": 0.8971, + "step": 7726 + }, + { + "epoch": 1.8786773644541697, + "grad_norm": 24.5, + "learning_rate": 2.3188884303606135e-08, + "loss": 0.7438, + "step": 7727 + }, + { + "epoch": 1.8789204959883297, + "grad_norm": 18.5, + "learning_rate": 2.309632300281847e-08, + "loss": 0.6857, + "step": 7728 + }, + { + "epoch": 1.8791636275224897, + "grad_norm": 20.5, + "learning_rate": 2.3003945083974745e-08, + "loss": 0.7616, + "step": 7729 + }, + { + "epoch": 1.8794067590566497, + "grad_norm": 16.875, + "learning_rate": 2.291175056088224e-08, + "loss": 0.5579, + "step": 7730 + }, + { + "epoch": 1.8796498905908097, + "grad_norm": 28.125, + "learning_rate": 2.2819739447321188e-08, + "loss": 0.8129, + "step": 7731 + }, + { + "epoch": 1.8798930221249694, + "grad_norm": 18.875, + "learning_rate": 2.2727911757044334e-08, + "loss": 0.4981, + "step": 7732 + }, + { + "epoch": 1.8801361536591297, + "grad_norm": 21.5, + "learning_rate": 2.2636267503776817e-08, + "loss": 0.7, + "step": 7733 + }, + { + "epoch": 1.8803792851932895, + "grad_norm": 22.0, + "learning_rate": 2.2544806701216145e-08, + "loss": 0.6185, + "step": 7734 + }, + { + "epoch": 1.8806224167274497, + "grad_norm": 19.75, + "learning_rate": 2.2453529363033328e-08, + "loss": 0.8049, + "step": 7735 + }, + { + "epoch": 1.8808655482616095, + "grad_norm": 16.75, + "learning_rate": 2.236243550287079e-08, + "loss": 0.4919, + "step": 7736 + }, + { + "epoch": 1.8811086797957695, + "grad_norm": 19.625, + "learning_rate": 2.2271525134344302e-08, + "loss": 0.9488, + "step": 7737 + }, + { + "epoch": 1.8813518113299295, + "grad_norm": 19.125, + "learning_rate": 2.2180798271042027e-08, + "loss": 0.8047, + "step": 7738 + }, + { + "epoch": 1.8815949428640895, + "grad_norm": 25.75, + "learning_rate": 2.209025492652464e-08, + "loss": 0.8352, + "step": 7739 + }, + { + "epoch": 1.8818380743982495, + "grad_norm": 20.875, + "learning_rate": 2.1999895114325488e-08, + "loss": 0.8531, + "step": 7740 + }, + { + "epoch": 1.8820812059324095, + "grad_norm": 17.875, + "learning_rate": 2.190971884795015e-08, + "loss": 1.0134, + "step": 7741 + }, + { + "epoch": 1.8823243374665695, + "grad_norm": 18.75, + "learning_rate": 2.1819726140877294e-08, + "loss": 0.5415, + "step": 7742 + }, + { + "epoch": 1.8825674690007292, + "grad_norm": 16.625, + "learning_rate": 2.172991700655769e-08, + "loss": 0.298, + "step": 7743 + }, + { + "epoch": 1.8828106005348895, + "grad_norm": 22.25, + "learning_rate": 2.1640291458415036e-08, + "loss": 0.617, + "step": 7744 + }, + { + "epoch": 1.8830537320690492, + "grad_norm": 19.5, + "learning_rate": 2.1550849509845152e-08, + "loss": 0.7055, + "step": 7745 + }, + { + "epoch": 1.8832968636032095, + "grad_norm": 14.0625, + "learning_rate": 2.146159117421706e-08, + "loss": 0.3976, + "step": 7746 + }, + { + "epoch": 1.8835399951373692, + "grad_norm": 17.75, + "learning_rate": 2.1372516464871623e-08, + "loss": 0.4532, + "step": 7747 + }, + { + "epoch": 1.8837831266715293, + "grad_norm": 17.25, + "learning_rate": 2.128362539512277e-08, + "loss": 0.4402, + "step": 7748 + }, + { + "epoch": 1.8840262582056893, + "grad_norm": 16.5, + "learning_rate": 2.119491797825682e-08, + "loss": 0.3731, + "step": 7749 + }, + { + "epoch": 1.8842693897398493, + "grad_norm": 22.875, + "learning_rate": 2.1106394227532605e-08, + "loss": 0.8768, + "step": 7750 + }, + { + "epoch": 1.8845125212740093, + "grad_norm": 26.0, + "learning_rate": 2.1018054156181355e-08, + "loss": 0.6892, + "step": 7751 + }, + { + "epoch": 1.8847556528081693, + "grad_norm": 25.625, + "learning_rate": 2.0929897777407226e-08, + "loss": 0.8972, + "step": 7752 + }, + { + "epoch": 1.8849987843423293, + "grad_norm": 16.5, + "learning_rate": 2.0841925104386764e-08, + "loss": 0.5226, + "step": 7753 + }, + { + "epoch": 1.885241915876489, + "grad_norm": 18.5, + "learning_rate": 2.075413615026875e-08, + "loss": 0.5157, + "step": 7754 + }, + { + "epoch": 1.8854850474106493, + "grad_norm": 22.375, + "learning_rate": 2.0666530928174917e-08, + "loss": 0.6178, + "step": 7755 + }, + { + "epoch": 1.885728178944809, + "grad_norm": 21.75, + "learning_rate": 2.057910945119937e-08, + "loss": 0.7248, + "step": 7756 + }, + { + "epoch": 1.8859713104789693, + "grad_norm": 19.75, + "learning_rate": 2.049187173240888e-08, + "loss": 0.651, + "step": 7757 + }, + { + "epoch": 1.886214442013129, + "grad_norm": 19.75, + "learning_rate": 2.0404817784842323e-08, + "loss": 0.4942, + "step": 7758 + }, + { + "epoch": 1.886457573547289, + "grad_norm": 20.0, + "learning_rate": 2.0317947621511653e-08, + "loss": 1.0092, + "step": 7759 + }, + { + "epoch": 1.886700705081449, + "grad_norm": 17.5, + "learning_rate": 2.0231261255401065e-08, + "loss": 0.8084, + "step": 7760 + }, + { + "epoch": 1.886943836615609, + "grad_norm": 19.875, + "learning_rate": 2.0144758699467276e-08, + "loss": 0.7384, + "step": 7761 + }, + { + "epoch": 1.887186968149769, + "grad_norm": 38.5, + "learning_rate": 2.005843996663967e-08, + "loss": 1.0055, + "step": 7762 + }, + { + "epoch": 1.887430099683929, + "grad_norm": 21.125, + "learning_rate": 1.997230506982001e-08, + "loss": 0.736, + "step": 7763 + }, + { + "epoch": 1.887673231218089, + "grad_norm": 17.875, + "learning_rate": 1.9886354021882852e-08, + "loss": 0.4741, + "step": 7764 + }, + { + "epoch": 1.8879163627522488, + "grad_norm": 28.125, + "learning_rate": 1.9800586835674596e-08, + "loss": 0.7851, + "step": 7765 + }, + { + "epoch": 1.888159494286409, + "grad_norm": 19.5, + "learning_rate": 1.971500352401512e-08, + "loss": 0.9229, + "step": 7766 + }, + { + "epoch": 1.8884026258205688, + "grad_norm": 24.75, + "learning_rate": 1.9629604099695997e-08, + "loss": 0.946, + "step": 7767 + }, + { + "epoch": 1.888645757354729, + "grad_norm": 37.0, + "learning_rate": 1.954438857548188e-08, + "loss": 1.1072, + "step": 7768 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 20.875, + "learning_rate": 1.9459356964109527e-08, + "loss": 0.4563, + "step": 7769 + }, + { + "epoch": 1.8891320204230488, + "grad_norm": 23.75, + "learning_rate": 1.9374509278288488e-08, + "loss": 0.5726, + "step": 7770 + }, + { + "epoch": 1.8893751519572088, + "grad_norm": 18.125, + "learning_rate": 1.9289845530700707e-08, + "loss": 0.9522, + "step": 7771 + }, + { + "epoch": 1.8896182834913688, + "grad_norm": 27.5, + "learning_rate": 1.9205365734000503e-08, + "loss": 1.043, + "step": 7772 + }, + { + "epoch": 1.8898614150255288, + "grad_norm": 17.625, + "learning_rate": 1.9121069900814998e-08, + "loss": 0.7864, + "step": 7773 + }, + { + "epoch": 1.8901045465596886, + "grad_norm": 19.625, + "learning_rate": 1.9036958043743697e-08, + "loss": 0.5282, + "step": 7774 + }, + { + "epoch": 1.8903476780938488, + "grad_norm": 22.125, + "learning_rate": 1.895303017535835e-08, + "loss": 0.5427, + "step": 7775 + }, + { + "epoch": 1.8905908096280086, + "grad_norm": 22.0, + "learning_rate": 1.8869286308203506e-08, + "loss": 1.0772, + "step": 7776 + }, + { + "epoch": 1.8908339411621689, + "grad_norm": 14.9375, + "learning_rate": 1.878572645479637e-08, + "loss": 0.3036, + "step": 7777 + }, + { + "epoch": 1.8910770726963286, + "grad_norm": 19.875, + "learning_rate": 1.8702350627626125e-08, + "loss": 0.8388, + "step": 7778 + }, + { + "epoch": 1.8913202042304889, + "grad_norm": 22.125, + "learning_rate": 1.8619158839154883e-08, + "loss": 1.0803, + "step": 7779 + }, + { + "epoch": 1.8915633357646486, + "grad_norm": 20.125, + "learning_rate": 1.8536151101817003e-08, + "loss": 1.1311, + "step": 7780 + }, + { + "epoch": 1.8918064672988086, + "grad_norm": 17.25, + "learning_rate": 1.84533274280195e-08, + "loss": 0.5389, + "step": 7781 + }, + { + "epoch": 1.8920495988329686, + "grad_norm": 21.875, + "learning_rate": 1.8370687830141508e-08, + "loss": 1.2689, + "step": 7782 + }, + { + "epoch": 1.8922927303671286, + "grad_norm": 16.625, + "learning_rate": 1.8288232320535504e-08, + "loss": 0.6092, + "step": 7783 + }, + { + "epoch": 1.8925358619012886, + "grad_norm": 19.625, + "learning_rate": 1.8205960911525518e-08, + "loss": 0.9303, + "step": 7784 + }, + { + "epoch": 1.8927789934354484, + "grad_norm": 20.75, + "learning_rate": 1.8123873615408515e-08, + "loss": 0.6299, + "step": 7785 + }, + { + "epoch": 1.8930221249696086, + "grad_norm": 17.625, + "learning_rate": 1.8041970444453716e-08, + "loss": 0.7145, + "step": 7786 + }, + { + "epoch": 1.8932652565037684, + "grad_norm": 15.3125, + "learning_rate": 1.7960251410903128e-08, + "loss": 0.5215, + "step": 7787 + }, + { + "epoch": 1.8935083880379286, + "grad_norm": 24.875, + "learning_rate": 1.7878716526971152e-08, + "loss": 0.7656, + "step": 7788 + }, + { + "epoch": 1.8937515195720884, + "grad_norm": 15.8125, + "learning_rate": 1.7797365804844285e-08, + "loss": 0.6144, + "step": 7789 + }, + { + "epoch": 1.8939946511062484, + "grad_norm": 17.5, + "learning_rate": 1.771619925668197e-08, + "loss": 0.3488, + "step": 7790 + }, + { + "epoch": 1.8942377826404084, + "grad_norm": 26.375, + "learning_rate": 1.763521689461603e-08, + "loss": 0.6864, + "step": 7791 + }, + { + "epoch": 1.8944809141745684, + "grad_norm": 19.125, + "learning_rate": 1.7554418730750673e-08, + "loss": 0.8534, + "step": 7792 + }, + { + "epoch": 1.8947240457087284, + "grad_norm": 17.625, + "learning_rate": 1.7473804777162346e-08, + "loss": 0.7827, + "step": 7793 + }, + { + "epoch": 1.8949671772428884, + "grad_norm": 17.0, + "learning_rate": 1.739337504590044e-08, + "loss": 0.4682, + "step": 7794 + }, + { + "epoch": 1.8952103087770484, + "grad_norm": 19.125, + "learning_rate": 1.731312954898659e-08, + "loss": 0.7828, + "step": 7795 + }, + { + "epoch": 1.8954534403112082, + "grad_norm": 20.5, + "learning_rate": 1.7233068298414536e-08, + "loss": 0.7769, + "step": 7796 + }, + { + "epoch": 1.8956965718453684, + "grad_norm": 22.5, + "learning_rate": 1.715319130615123e-08, + "loss": 0.8071, + "step": 7797 + }, + { + "epoch": 1.8959397033795282, + "grad_norm": 18.75, + "learning_rate": 1.7073498584135457e-08, + "loss": 0.5649, + "step": 7798 + }, + { + "epoch": 1.8961828349136884, + "grad_norm": 22.375, + "learning_rate": 1.699399014427852e-08, + "loss": 0.5957, + "step": 7799 + }, + { + "epoch": 1.8964259664478482, + "grad_norm": 21.625, + "learning_rate": 1.6914665998464808e-08, + "loss": 0.7547, + "step": 7800 + }, + { + "epoch": 1.8966690979820082, + "grad_norm": 25.125, + "learning_rate": 1.6835526158550248e-08, + "loss": 0.9502, + "step": 7801 + }, + { + "epoch": 1.8969122295161682, + "grad_norm": 20.5, + "learning_rate": 1.675657063636385e-08, + "loss": 1.0027, + "step": 7802 + }, + { + "epoch": 1.8971553610503282, + "grad_norm": 21.5, + "learning_rate": 1.6677799443706593e-08, + "loss": 0.6465, + "step": 7803 + }, + { + "epoch": 1.8973984925844882, + "grad_norm": 27.125, + "learning_rate": 1.659921259235267e-08, + "loss": 0.71, + "step": 7804 + }, + { + "epoch": 1.8976416241186482, + "grad_norm": 28.625, + "learning_rate": 1.6520810094047963e-08, + "loss": 0.8231, + "step": 7805 + }, + { + "epoch": 1.8978847556528082, + "grad_norm": 20.125, + "learning_rate": 1.6442591960510873e-08, + "loss": 0.5697, + "step": 7806 + }, + { + "epoch": 1.898127887186968, + "grad_norm": 25.375, + "learning_rate": 1.6364558203432885e-08, + "loss": 0.7279, + "step": 7807 + }, + { + "epoch": 1.8983710187211282, + "grad_norm": 21.5, + "learning_rate": 1.6286708834477304e-08, + "loss": 0.6939, + "step": 7808 + }, + { + "epoch": 1.898614150255288, + "grad_norm": 19.125, + "learning_rate": 1.6209043865280105e-08, + "loss": 0.7817, + "step": 7809 + }, + { + "epoch": 1.8988572817894482, + "grad_norm": 22.125, + "learning_rate": 1.61315633074495e-08, + "loss": 0.4963, + "step": 7810 + }, + { + "epoch": 1.899100413323608, + "grad_norm": 23.0, + "learning_rate": 1.605426717256636e-08, + "loss": 1.0589, + "step": 7811 + }, + { + "epoch": 1.899343544857768, + "grad_norm": 19.625, + "learning_rate": 1.5977155472184093e-08, + "loss": 0.7394, + "step": 7812 + }, + { + "epoch": 1.899586676391928, + "grad_norm": 19.875, + "learning_rate": 1.5900228217828052e-08, + "loss": 0.5944, + "step": 7813 + }, + { + "epoch": 1.899829807926088, + "grad_norm": 21.375, + "learning_rate": 1.582348542099682e-08, + "loss": 0.7664, + "step": 7814 + }, + { + "epoch": 1.900072939460248, + "grad_norm": 15.625, + "learning_rate": 1.574692709316053e-08, + "loss": 0.3386, + "step": 7815 + }, + { + "epoch": 1.900316070994408, + "grad_norm": 20.5, + "learning_rate": 1.567055324576225e-08, + "loss": 0.7334, + "step": 7816 + }, + { + "epoch": 1.900559202528568, + "grad_norm": 17.75, + "learning_rate": 1.559436389021743e-08, + "loss": 0.473, + "step": 7817 + }, + { + "epoch": 1.9008023340627278, + "grad_norm": 24.5, + "learning_rate": 1.5518359037913905e-08, + "loss": 0.7397, + "step": 7818 + }, + { + "epoch": 1.901045465596888, + "grad_norm": 18.25, + "learning_rate": 1.5442538700211756e-08, + "loss": 0.6509, + "step": 7819 + }, + { + "epoch": 1.9012885971310478, + "grad_norm": 13.8125, + "learning_rate": 1.5366902888443723e-08, + "loss": 0.3035, + "step": 7820 + }, + { + "epoch": 1.901531728665208, + "grad_norm": 18.125, + "learning_rate": 1.529145161391493e-08, + "loss": 0.5966, + "step": 7821 + }, + { + "epoch": 1.9017748601993678, + "grad_norm": 19.75, + "learning_rate": 1.5216184887902884e-08, + "loss": 0.4218, + "step": 7822 + }, + { + "epoch": 1.9020179917335278, + "grad_norm": 18.375, + "learning_rate": 1.5141102721657475e-08, + "loss": 0.7222, + "step": 7823 + }, + { + "epoch": 1.9022611232676878, + "grad_norm": 18.125, + "learning_rate": 1.5066205126400973e-08, + "loss": 0.6308, + "step": 7824 + }, + { + "epoch": 1.9025042548018478, + "grad_norm": 23.5, + "learning_rate": 1.4991492113328177e-08, + "loss": 0.792, + "step": 7825 + }, + { + "epoch": 1.9027473863360078, + "grad_norm": 17.75, + "learning_rate": 1.491696369360626e-08, + "loss": 0.7579, + "step": 7826 + }, + { + "epoch": 1.9029905178701676, + "grad_norm": 23.625, + "learning_rate": 1.4842619878374654e-08, + "loss": 0.8057, + "step": 7827 + }, + { + "epoch": 1.9032336494043278, + "grad_norm": 16.0, + "learning_rate": 1.4768460678745573e-08, + "loss": 0.4927, + "step": 7828 + }, + { + "epoch": 1.9034767809384876, + "grad_norm": 19.0, + "learning_rate": 1.469448610580307e-08, + "loss": 0.6272, + "step": 7829 + }, + { + "epoch": 1.9037199124726478, + "grad_norm": 20.625, + "learning_rate": 1.4620696170604132e-08, + "loss": 0.4111, + "step": 7830 + }, + { + "epoch": 1.9039630440068076, + "grad_norm": 25.0, + "learning_rate": 1.4547090884177855e-08, + "loss": 0.9337, + "step": 7831 + }, + { + "epoch": 1.9042061755409678, + "grad_norm": 17.5, + "learning_rate": 1.4473670257525996e-08, + "loss": 0.5651, + "step": 7832 + }, + { + "epoch": 1.9044493070751276, + "grad_norm": 16.625, + "learning_rate": 1.4400434301622135e-08, + "loss": 0.4158, + "step": 7833 + }, + { + "epoch": 1.9046924386092876, + "grad_norm": 18.125, + "learning_rate": 1.432738302741321e-08, + "loss": 0.5588, + "step": 7834 + }, + { + "epoch": 1.9049355701434476, + "grad_norm": 18.0, + "learning_rate": 1.4254516445817573e-08, + "loss": 0.7193, + "step": 7835 + }, + { + "epoch": 1.9051787016776076, + "grad_norm": 23.875, + "learning_rate": 1.4181834567726372e-08, + "loss": 1.0141, + "step": 7836 + }, + { + "epoch": 1.9054218332117676, + "grad_norm": 17.375, + "learning_rate": 1.4109337404003414e-08, + "loss": 0.4704, + "step": 7837 + }, + { + "epoch": 1.9056649647459274, + "grad_norm": 20.0, + "learning_rate": 1.4037024965484618e-08, + "loss": 0.6783, + "step": 7838 + }, + { + "epoch": 1.9059080962800876, + "grad_norm": 16.0, + "learning_rate": 1.3964897262978283e-08, + "loss": 0.5158, + "step": 7839 + }, + { + "epoch": 1.9061512278142474, + "grad_norm": 21.75, + "learning_rate": 1.389295430726495e-08, + "loss": 0.6328, + "step": 7840 + }, + { + "epoch": 1.9063943593484076, + "grad_norm": 26.25, + "learning_rate": 1.38211961090981e-08, + "loss": 0.6498, + "step": 7841 + }, + { + "epoch": 1.9066374908825674, + "grad_norm": 17.875, + "learning_rate": 1.3749622679202907e-08, + "loss": 0.7024, + "step": 7842 + }, + { + "epoch": 1.9068806224167274, + "grad_norm": 20.375, + "learning_rate": 1.3678234028277476e-08, + "loss": 0.5721, + "step": 7843 + }, + { + "epoch": 1.9071237539508874, + "grad_norm": 15.5625, + "learning_rate": 1.3607030166991747e-08, + "loss": 0.5827, + "step": 7844 + }, + { + "epoch": 1.9073668854850474, + "grad_norm": 24.125, + "learning_rate": 1.3536011105988872e-08, + "loss": 1.0471, + "step": 7845 + }, + { + "epoch": 1.9076100170192074, + "grad_norm": 20.0, + "learning_rate": 1.3465176855883555e-08, + "loss": 0.9289, + "step": 7846 + }, + { + "epoch": 1.9078531485533674, + "grad_norm": 19.0, + "learning_rate": 1.339452742726316e-08, + "loss": 0.6694, + "step": 7847 + }, + { + "epoch": 1.9080962800875274, + "grad_norm": 19.5, + "learning_rate": 1.3324062830687712e-08, + "loss": 0.7547, + "step": 7848 + }, + { + "epoch": 1.9083394116216872, + "grad_norm": 17.125, + "learning_rate": 1.3253783076689064e-08, + "loss": 0.5016, + "step": 7849 + }, + { + "epoch": 1.9085825431558474, + "grad_norm": 18.375, + "learning_rate": 1.3183688175772008e-08, + "loss": 0.8364, + "step": 7850 + }, + { + "epoch": 1.9088256746900072, + "grad_norm": 17.0, + "learning_rate": 1.3113778138413164e-08, + "loss": 0.4185, + "step": 7851 + }, + { + "epoch": 1.9090688062241674, + "grad_norm": 18.0, + "learning_rate": 1.304405297506195e-08, + "loss": 0.4495, + "step": 7852 + }, + { + "epoch": 1.9093119377583272, + "grad_norm": 16.625, + "learning_rate": 1.2974512696140167e-08, + "loss": 0.583, + "step": 7853 + }, + { + "epoch": 1.9095550692924872, + "grad_norm": 20.875, + "learning_rate": 1.290515731204145e-08, + "loss": 0.6287, + "step": 7854 + }, + { + "epoch": 1.9097982008266472, + "grad_norm": 18.0, + "learning_rate": 1.2835986833132502e-08, + "loss": 0.5895, + "step": 7855 + }, + { + "epoch": 1.9100413323608072, + "grad_norm": 25.875, + "learning_rate": 1.2767001269751722e-08, + "loss": 0.9779, + "step": 7856 + }, + { + "epoch": 1.9102844638949672, + "grad_norm": 21.0, + "learning_rate": 1.2698200632210445e-08, + "loss": 0.6093, + "step": 7857 + }, + { + "epoch": 1.9105275954291272, + "grad_norm": 19.0, + "learning_rate": 1.2629584930791972e-08, + "loss": 0.6462, + "step": 7858 + }, + { + "epoch": 1.9107707269632872, + "grad_norm": 18.5, + "learning_rate": 1.2561154175752266e-08, + "loss": 0.9409, + "step": 7859 + }, + { + "epoch": 1.911013858497447, + "grad_norm": 26.875, + "learning_rate": 1.2492908377319257e-08, + "loss": 0.6514, + "step": 7860 + }, + { + "epoch": 1.9112569900316072, + "grad_norm": 20.375, + "learning_rate": 1.2424847545693536e-08, + "loss": 0.732, + "step": 7861 + }, + { + "epoch": 1.911500121565767, + "grad_norm": 15.0625, + "learning_rate": 1.2356971691048075e-08, + "loss": 0.4285, + "step": 7862 + }, + { + "epoch": 1.9117432530999272, + "grad_norm": 27.125, + "learning_rate": 1.2289280823528094e-08, + "loss": 0.9337, + "step": 7863 + }, + { + "epoch": 1.911986384634087, + "grad_norm": 23.625, + "learning_rate": 1.2221774953251053e-08, + "loss": 0.8571, + "step": 7864 + }, + { + "epoch": 1.912229516168247, + "grad_norm": 19.5, + "learning_rate": 1.2154454090306939e-08, + "loss": 0.6406, + "step": 7865 + }, + { + "epoch": 1.912472647702407, + "grad_norm": 20.75, + "learning_rate": 1.2087318244757978e-08, + "loss": 0.5717, + "step": 7866 + }, + { + "epoch": 1.912715779236567, + "grad_norm": 21.75, + "learning_rate": 1.2020367426638785e-08, + "loss": 0.7081, + "step": 7867 + }, + { + "epoch": 1.912958910770727, + "grad_norm": 20.0, + "learning_rate": 1.1953601645956214e-08, + "loss": 0.8989, + "step": 7868 + }, + { + "epoch": 1.913202042304887, + "grad_norm": 20.625, + "learning_rate": 1.1887020912689784e-08, + "loss": 0.6977, + "step": 7869 + }, + { + "epoch": 1.913445173839047, + "grad_norm": 22.75, + "learning_rate": 1.1820625236791116e-08, + "loss": 0.6997, + "step": 7870 + }, + { + "epoch": 1.9136883053732068, + "grad_norm": 21.0, + "learning_rate": 1.1754414628183941e-08, + "loss": 0.6479, + "step": 7871 + }, + { + "epoch": 1.913931436907367, + "grad_norm": 20.125, + "learning_rate": 1.1688389096764923e-08, + "loss": 0.8286, + "step": 7872 + }, + { + "epoch": 1.9141745684415268, + "grad_norm": 19.875, + "learning_rate": 1.162254865240242e-08, + "loss": 0.8877, + "step": 7873 + }, + { + "epoch": 1.914417699975687, + "grad_norm": 19.25, + "learning_rate": 1.1556893304937588e-08, + "loss": 0.8329, + "step": 7874 + }, + { + "epoch": 1.9146608315098468, + "grad_norm": 18.875, + "learning_rate": 1.1491423064183549e-08, + "loss": 0.7368, + "step": 7875 + }, + { + "epoch": 1.9149039630440068, + "grad_norm": 25.375, + "learning_rate": 1.1426137939926362e-08, + "loss": 0.8761, + "step": 7876 + }, + { + "epoch": 1.9151470945781668, + "grad_norm": 24.25, + "learning_rate": 1.1361037941923641e-08, + "loss": 1.0683, + "step": 7877 + }, + { + "epoch": 1.9153902261123268, + "grad_norm": 28.5, + "learning_rate": 1.1296123079905796e-08, + "loss": 0.8153, + "step": 7878 + }, + { + "epoch": 1.9156333576464868, + "grad_norm": 23.875, + "learning_rate": 1.123139336357562e-08, + "loss": 0.8908, + "step": 7879 + }, + { + "epoch": 1.9158764891806466, + "grad_norm": 18.25, + "learning_rate": 1.1166848802607877e-08, + "loss": 0.6323, + "step": 7880 + }, + { + "epoch": 1.9161196207148068, + "grad_norm": 18.875, + "learning_rate": 1.1102489406649986e-08, + "loss": 0.6903, + "step": 7881 + }, + { + "epoch": 1.9163627522489666, + "grad_norm": 19.875, + "learning_rate": 1.1038315185321613e-08, + "loss": 0.7712, + "step": 7882 + }, + { + "epoch": 1.9166058837831268, + "grad_norm": 19.875, + "learning_rate": 1.097432614821467e-08, + "loss": 0.5251, + "step": 7883 + }, + { + "epoch": 1.9168490153172866, + "grad_norm": 24.0, + "learning_rate": 1.0910522304893312e-08, + "loss": 0.7394, + "step": 7884 + }, + { + "epoch": 1.9170921468514468, + "grad_norm": 22.875, + "learning_rate": 1.0846903664894215e-08, + "loss": 0.4537, + "step": 7885 + }, + { + "epoch": 1.9173352783856066, + "grad_norm": 20.125, + "learning_rate": 1.07834702377263e-08, + "loss": 0.5776, + "step": 7886 + }, + { + "epoch": 1.9175784099197666, + "grad_norm": 21.125, + "learning_rate": 1.0720222032870874e-08, + "loss": 0.857, + "step": 7887 + }, + { + "epoch": 1.9178215414539266, + "grad_norm": 26.75, + "learning_rate": 1.0657159059781208e-08, + "loss": 0.9864, + "step": 7888 + }, + { + "epoch": 1.9180646729880866, + "grad_norm": 18.0, + "learning_rate": 1.059428132788337e-08, + "loss": 0.6663, + "step": 7889 + }, + { + "epoch": 1.9183078045222466, + "grad_norm": 22.125, + "learning_rate": 1.0531588846575403e-08, + "loss": 0.6014, + "step": 7890 + }, + { + "epoch": 1.9185509360564064, + "grad_norm": 24.0, + "learning_rate": 1.0469081625228e-08, + "loss": 0.5684, + "step": 7891 + }, + { + "epoch": 1.9187940675905666, + "grad_norm": 17.375, + "learning_rate": 1.0406759673183554e-08, + "loss": 0.5461, + "step": 7892 + }, + { + "epoch": 1.9190371991247264, + "grad_norm": 21.125, + "learning_rate": 1.0344622999757525e-08, + "loss": 0.4815, + "step": 7893 + }, + { + "epoch": 1.9192803306588866, + "grad_norm": 14.8125, + "learning_rate": 1.028267161423721e-08, + "loss": 0.3031, + "step": 7894 + }, + { + "epoch": 1.9195234621930464, + "grad_norm": 16.375, + "learning_rate": 1.0220905525882285e-08, + "loss": 0.451, + "step": 7895 + }, + { + "epoch": 1.9197665937272064, + "grad_norm": 18.5, + "learning_rate": 1.0159324743924809e-08, + "loss": 0.3167, + "step": 7896 + }, + { + "epoch": 1.9200097252613664, + "grad_norm": 23.875, + "learning_rate": 1.0097929277568946e-08, + "loss": 0.7654, + "step": 7897 + }, + { + "epoch": 1.9202528567955264, + "grad_norm": 20.25, + "learning_rate": 1.0036719135991385e-08, + "loss": 0.577, + "step": 7898 + }, + { + "epoch": 1.9204959883296864, + "grad_norm": 21.625, + "learning_rate": 9.975694328341057e-09, + "loss": 0.8609, + "step": 7899 + }, + { + "epoch": 1.9207391198638464, + "grad_norm": 18.125, + "learning_rate": 9.914854863739138e-09, + "loss": 0.501, + "step": 7900 + }, + { + "epoch": 1.9209822513980064, + "grad_norm": 18.75, + "learning_rate": 9.854200751279325e-09, + "loss": 0.4953, + "step": 7901 + }, + { + "epoch": 1.9212253829321662, + "grad_norm": 15.0625, + "learning_rate": 9.793732000027145e-09, + "loss": 0.4904, + "step": 7902 + }, + { + "epoch": 1.9214685144663264, + "grad_norm": 15.0625, + "learning_rate": 9.733448619020786e-09, + "loss": 0.612, + "step": 7903 + }, + { + "epoch": 1.9217116460004862, + "grad_norm": 16.125, + "learning_rate": 9.673350617270816e-09, + "loss": 0.619, + "step": 7904 + }, + { + "epoch": 1.9219547775346464, + "grad_norm": 22.25, + "learning_rate": 9.613438003759634e-09, + "loss": 0.8661, + "step": 7905 + }, + { + "epoch": 1.9221979090688062, + "grad_norm": 18.75, + "learning_rate": 9.553710787442438e-09, + "loss": 0.3533, + "step": 7906 + }, + { + "epoch": 1.9224410406029662, + "grad_norm": 19.5, + "learning_rate": 9.494168977246394e-09, + "loss": 0.5105, + "step": 7907 + }, + { + "epoch": 1.9226841721371262, + "grad_norm": 13.5, + "learning_rate": 9.434812582071045e-09, + "loss": 0.3481, + "step": 7908 + }, + { + "epoch": 1.9229273036712862, + "grad_norm": 19.5, + "learning_rate": 9.375641610788327e-09, + "loss": 0.815, + "step": 7909 + }, + { + "epoch": 1.9231704352054462, + "grad_norm": 20.625, + "learning_rate": 9.316656072242275e-09, + "loss": 0.712, + "step": 7910 + }, + { + "epoch": 1.9234135667396062, + "grad_norm": 27.25, + "learning_rate": 9.257855975249308e-09, + "loss": 0.9866, + "step": 7911 + }, + { + "epoch": 1.9236566982737662, + "grad_norm": 17.75, + "learning_rate": 9.199241328598092e-09, + "loss": 1.1286, + "step": 7912 + }, + { + "epoch": 1.923899829807926, + "grad_norm": 20.125, + "learning_rate": 9.14081214104967e-09, + "loss": 0.7686, + "step": 7913 + }, + { + "epoch": 1.9241429613420862, + "grad_norm": 20.25, + "learning_rate": 9.0825684213372e-09, + "loss": 0.8863, + "step": 7914 + }, + { + "epoch": 1.924386092876246, + "grad_norm": 21.125, + "learning_rate": 9.024510178166357e-09, + "loss": 0.8258, + "step": 7915 + }, + { + "epoch": 1.9246292244104062, + "grad_norm": 25.875, + "learning_rate": 8.966637420214779e-09, + "loss": 0.8853, + "step": 7916 + }, + { + "epoch": 1.924872355944566, + "grad_norm": 19.5, + "learning_rate": 8.908950156132635e-09, + "loss": 0.7537, + "step": 7917 + }, + { + "epoch": 1.925115487478726, + "grad_norm": 21.0, + "learning_rate": 8.851448394542328e-09, + "loss": 0.7481, + "step": 7918 + }, + { + "epoch": 1.925358619012886, + "grad_norm": 20.75, + "learning_rate": 8.794132144038376e-09, + "loss": 0.7676, + "step": 7919 + }, + { + "epoch": 1.925601750547046, + "grad_norm": 22.875, + "learning_rate": 8.737001413187813e-09, + "loss": 0.6935, + "step": 7920 + }, + { + "epoch": 1.925844882081206, + "grad_norm": 16.625, + "learning_rate": 8.680056210529642e-09, + "loss": 0.3174, + "step": 7921 + }, + { + "epoch": 1.926088013615366, + "grad_norm": 16.75, + "learning_rate": 8.623296544575526e-09, + "loss": 0.5871, + "step": 7922 + }, + { + "epoch": 1.926331145149526, + "grad_norm": 30.5, + "learning_rate": 8.56672242380896e-09, + "loss": 0.7486, + "step": 7923 + }, + { + "epoch": 1.9265742766836857, + "grad_norm": 26.5, + "learning_rate": 8.510333856686092e-09, + "loss": 0.8505, + "step": 7924 + }, + { + "epoch": 1.926817408217846, + "grad_norm": 24.25, + "learning_rate": 8.454130851635045e-09, + "loss": 0.8154, + "step": 7925 + }, + { + "epoch": 1.9270605397520058, + "grad_norm": 23.375, + "learning_rate": 8.39811341705632e-09, + "loss": 0.8016, + "step": 7926 + }, + { + "epoch": 1.927303671286166, + "grad_norm": 22.75, + "learning_rate": 8.342281561322801e-09, + "loss": 0.7308, + "step": 7927 + }, + { + "epoch": 1.9275468028203258, + "grad_norm": 18.875, + "learning_rate": 8.286635292779621e-09, + "loss": 0.6389, + "step": 7928 + }, + { + "epoch": 1.9277899343544858, + "grad_norm": 30.125, + "learning_rate": 8.231174619743736e-09, + "loss": 0.9354, + "step": 7929 + }, + { + "epoch": 1.9280330658886458, + "grad_norm": 19.375, + "learning_rate": 8.175899550504906e-09, + "loss": 0.7407, + "step": 7930 + }, + { + "epoch": 1.9282761974228058, + "grad_norm": 26.0, + "learning_rate": 8.120810093324993e-09, + "loss": 0.6151, + "step": 7931 + }, + { + "epoch": 1.9285193289569658, + "grad_norm": 17.75, + "learning_rate": 8.065906256438105e-09, + "loss": 0.7461, + "step": 7932 + }, + { + "epoch": 1.9287624604911255, + "grad_norm": 24.25, + "learning_rate": 8.011188048050316e-09, + "loss": 0.7335, + "step": 7933 + }, + { + "epoch": 1.9290055920252858, + "grad_norm": 32.0, + "learning_rate": 7.956655476340503e-09, + "loss": 1.4917, + "step": 7934 + }, + { + "epoch": 1.9292487235594455, + "grad_norm": 20.5, + "learning_rate": 7.902308549459226e-09, + "loss": 0.8437, + "step": 7935 + }, + { + "epoch": 1.9294918550936058, + "grad_norm": 22.375, + "learning_rate": 7.84814727552971e-09, + "loss": 1.0596, + "step": 7936 + }, + { + "epoch": 1.9297349866277655, + "grad_norm": 22.75, + "learning_rate": 7.794171662647287e-09, + "loss": 0.7269, + "step": 7937 + }, + { + "epoch": 1.9299781181619255, + "grad_norm": 19.875, + "learning_rate": 7.740381718879669e-09, + "loss": 1.0005, + "step": 7938 + }, + { + "epoch": 1.9302212496960856, + "grad_norm": 28.375, + "learning_rate": 7.686777452266537e-09, + "loss": 0.7057, + "step": 7939 + }, + { + "epoch": 1.9304643812302456, + "grad_norm": 20.375, + "learning_rate": 7.633358870819812e-09, + "loss": 0.6135, + "step": 7940 + }, + { + "epoch": 1.9307075127644056, + "grad_norm": 17.375, + "learning_rate": 7.580125982524223e-09, + "loss": 0.5086, + "step": 7941 + }, + { + "epoch": 1.9309506442985656, + "grad_norm": 27.25, + "learning_rate": 7.527078795336179e-09, + "loss": 0.5235, + "step": 7942 + }, + { + "epoch": 1.9311937758327256, + "grad_norm": 24.75, + "learning_rate": 7.47421731718434e-09, + "loss": 1.2145, + "step": 7943 + }, + { + "epoch": 1.9314369073668853, + "grad_norm": 20.125, + "learning_rate": 7.421541555969885e-09, + "loss": 0.6006, + "step": 7944 + }, + { + "epoch": 1.9316800389010456, + "grad_norm": 15.75, + "learning_rate": 7.369051519566101e-09, + "loss": 0.3053, + "step": 7945 + }, + { + "epoch": 1.9319231704352053, + "grad_norm": 22.375, + "learning_rate": 7.316747215818654e-09, + "loss": 1.1693, + "step": 7946 + }, + { + "epoch": 1.9321663019693656, + "grad_norm": 16.75, + "learning_rate": 7.264628652545042e-09, + "loss": 0.4853, + "step": 7947 + }, + { + "epoch": 1.9324094335035253, + "grad_norm": 23.5, + "learning_rate": 7.212695837535561e-09, + "loss": 0.7163, + "step": 7948 + }, + { + "epoch": 1.9326525650376853, + "grad_norm": 24.0, + "learning_rate": 7.160948778552474e-09, + "loss": 0.9888, + "step": 7949 + }, + { + "epoch": 1.9328956965718453, + "grad_norm": 19.5, + "learning_rate": 7.10938748333001e-09, + "loss": 0.5207, + "step": 7950 + }, + { + "epoch": 1.9331388281060053, + "grad_norm": 19.75, + "learning_rate": 7.0580119595752e-09, + "loss": 0.3175, + "step": 7951 + }, + { + "epoch": 1.9333819596401653, + "grad_norm": 24.125, + "learning_rate": 7.0068222149666246e-09, + "loss": 0.851, + "step": 7952 + }, + { + "epoch": 1.9336250911743253, + "grad_norm": 19.5, + "learning_rate": 6.955818257155938e-09, + "loss": 0.6074, + "step": 7953 + }, + { + "epoch": 1.9338682227084854, + "grad_norm": 21.125, + "learning_rate": 6.905000093766212e-09, + "loss": 0.5639, + "step": 7954 + }, + { + "epoch": 1.9341113542426451, + "grad_norm": 17.5, + "learning_rate": 6.854367732393175e-09, + "loss": 0.4098, + "step": 7955 + }, + { + "epoch": 1.9343544857768054, + "grad_norm": 17.75, + "learning_rate": 6.8039211806048e-09, + "loss": 0.9482, + "step": 7956 + }, + { + "epoch": 1.9345976173109651, + "grad_norm": 17.125, + "learning_rate": 6.7536604459411684e-09, + "loss": 0.4689, + "step": 7957 + }, + { + "epoch": 1.9348407488451254, + "grad_norm": 20.875, + "learning_rate": 6.703585535914603e-09, + "loss": 0.8622, + "step": 7958 + }, + { + "epoch": 1.9350838803792851, + "grad_norm": 19.875, + "learning_rate": 6.653696458009673e-09, + "loss": 0.6347, + "step": 7959 + }, + { + "epoch": 1.9353270119134451, + "grad_norm": 23.375, + "learning_rate": 6.6039932196830524e-09, + "loss": 0.8108, + "step": 7960 + }, + { + "epoch": 1.9355701434476051, + "grad_norm": 25.0, + "learning_rate": 6.554475828363799e-09, + "loss": 1.2805, + "step": 7961 + }, + { + "epoch": 1.9358132749817651, + "grad_norm": 17.125, + "learning_rate": 6.505144291453352e-09, + "loss": 0.4351, + "step": 7962 + }, + { + "epoch": 1.9360564065159251, + "grad_norm": 14.5, + "learning_rate": 6.455998616324843e-09, + "loss": 0.3876, + "step": 7963 + }, + { + "epoch": 1.9362995380500851, + "grad_norm": 20.5, + "learning_rate": 6.407038810324062e-09, + "loss": 1.4158, + "step": 7964 + }, + { + "epoch": 1.9365426695842451, + "grad_norm": 17.25, + "learning_rate": 6.358264880769044e-09, + "loss": 0.627, + "step": 7965 + }, + { + "epoch": 1.936785801118405, + "grad_norm": 26.0, + "learning_rate": 6.309676834949791e-09, + "loss": 0.9844, + "step": 7966 + }, + { + "epoch": 1.9370289326525651, + "grad_norm": 28.5, + "learning_rate": 6.261274680128549e-09, + "loss": 0.914, + "step": 7967 + }, + { + "epoch": 1.937272064186725, + "grad_norm": 19.25, + "learning_rate": 6.213058423539809e-09, + "loss": 0.5061, + "step": 7968 + }, + { + "epoch": 1.9375151957208852, + "grad_norm": 28.0, + "learning_rate": 6.165028072390583e-09, + "loss": 1.0491, + "step": 7969 + }, + { + "epoch": 1.937758327255045, + "grad_norm": 26.375, + "learning_rate": 6.117183633859714e-09, + "loss": 0.7604, + "step": 7970 + }, + { + "epoch": 1.938001458789205, + "grad_norm": 16.75, + "learning_rate": 6.0695251150982844e-09, + "loss": 0.6431, + "step": 7971 + }, + { + "epoch": 1.938244590323365, + "grad_norm": 15.1875, + "learning_rate": 6.0220525232297655e-09, + "loss": 0.4362, + "step": 7972 + }, + { + "epoch": 1.938487721857525, + "grad_norm": 15.6875, + "learning_rate": 5.97476586534973e-09, + "loss": 0.545, + "step": 7973 + }, + { + "epoch": 1.938730853391685, + "grad_norm": 22.0, + "learning_rate": 5.927665148525858e-09, + "loss": 0.6529, + "step": 7974 + }, + { + "epoch": 1.9389739849258447, + "grad_norm": 19.125, + "learning_rate": 5.880750379798489e-09, + "loss": 0.6962, + "step": 7975 + }, + { + "epoch": 1.939217116460005, + "grad_norm": 16.625, + "learning_rate": 5.834021566179515e-09, + "loss": 0.7686, + "step": 7976 + }, + { + "epoch": 1.9394602479941647, + "grad_norm": 16.5, + "learning_rate": 5.787478714653627e-09, + "loss": 0.454, + "step": 7977 + }, + { + "epoch": 1.939703379528325, + "grad_norm": 24.5, + "learning_rate": 5.741121832177343e-09, + "loss": 0.7541, + "step": 7978 + }, + { + "epoch": 1.9399465110624847, + "grad_norm": 20.625, + "learning_rate": 5.694950925679427e-09, + "loss": 0.5118, + "step": 7979 + }, + { + "epoch": 1.940189642596645, + "grad_norm": 19.5, + "learning_rate": 5.648966002061024e-09, + "loss": 0.6334, + "step": 7980 + }, + { + "epoch": 1.9404327741308047, + "grad_norm": 20.375, + "learning_rate": 5.603167068195386e-09, + "loss": 0.7428, + "step": 7981 + }, + { + "epoch": 1.9406759056649647, + "grad_norm": 16.125, + "learning_rate": 5.557554130927872e-09, + "loss": 0.3143, + "step": 7982 + }, + { + "epoch": 1.9409190371991247, + "grad_norm": 18.125, + "learning_rate": 5.512127197076084e-09, + "loss": 0.5427, + "step": 7983 + }, + { + "epoch": 1.9411621687332847, + "grad_norm": 19.125, + "learning_rate": 5.466886273430005e-09, + "loss": 0.6041, + "step": 7984 + }, + { + "epoch": 1.9414053002674447, + "grad_norm": 21.375, + "learning_rate": 5.421831366751451e-09, + "loss": 0.8419, + "step": 7985 + }, + { + "epoch": 1.9416484318016045, + "grad_norm": 18.125, + "learning_rate": 5.376962483775033e-09, + "loss": 0.8757, + "step": 7986 + }, + { + "epoch": 1.9418915633357647, + "grad_norm": 18.375, + "learning_rate": 5.3322796312069156e-09, + "loss": 0.4442, + "step": 7987 + }, + { + "epoch": 1.9421346948699245, + "grad_norm": 24.0, + "learning_rate": 5.287782815725645e-09, + "loss": 1.2253, + "step": 7988 + }, + { + "epoch": 1.9423778264040847, + "grad_norm": 22.875, + "learning_rate": 5.243472043982289e-09, + "loss": 1.0615, + "step": 7989 + }, + { + "epoch": 1.9426209579382445, + "grad_norm": 22.875, + "learning_rate": 5.199347322599607e-09, + "loss": 0.5945, + "step": 7990 + }, + { + "epoch": 1.9428640894724045, + "grad_norm": 24.875, + "learning_rate": 5.155408658173017e-09, + "loss": 0.7143, + "step": 7991 + }, + { + "epoch": 1.9431072210065645, + "grad_norm": 17.5, + "learning_rate": 5.1116560572696264e-09, + "loss": 0.6492, + "step": 7992 + }, + { + "epoch": 1.9433503525407245, + "grad_norm": 19.5, + "learning_rate": 5.068089526429482e-09, + "loss": 0.5401, + "step": 7993 + }, + { + "epoch": 1.9435934840748845, + "grad_norm": 21.5, + "learning_rate": 5.024709072163903e-09, + "loss": 0.5217, + "step": 7994 + }, + { + "epoch": 1.9438366156090445, + "grad_norm": 21.75, + "learning_rate": 4.981514700957008e-09, + "loss": 0.7356, + "step": 7995 + }, + { + "epoch": 1.9440797471432045, + "grad_norm": 23.25, + "learning_rate": 4.938506419265021e-09, + "loss": 0.6327, + "step": 7996 + }, + { + "epoch": 1.9443228786773643, + "grad_norm": 15.5, + "learning_rate": 4.895684233516274e-09, + "loss": 0.7631, + "step": 7997 + }, + { + "epoch": 1.9445660102115245, + "grad_norm": 19.125, + "learning_rate": 4.8530481501110615e-09, + "loss": 0.596, + "step": 7998 + }, + { + "epoch": 1.9448091417456843, + "grad_norm": 20.25, + "learning_rate": 4.810598175422204e-09, + "loss": 0.7739, + "step": 7999 + }, + { + "epoch": 1.9450522732798445, + "grad_norm": 18.875, + "learning_rate": 4.768334315794765e-09, + "loss": 0.8091, + "step": 8000 + }, + { + "epoch": 1.9452954048140043, + "grad_norm": 19.125, + "learning_rate": 4.726256577545635e-09, + "loss": 0.6666, + "step": 8001 + }, + { + "epoch": 1.9455385363481643, + "grad_norm": 17.875, + "learning_rate": 4.68436496696395e-09, + "loss": 0.6151, + "step": 8002 + }, + { + "epoch": 1.9457816678823243, + "grad_norm": 19.875, + "learning_rate": 4.642659490311508e-09, + "loss": 0.7217, + "step": 8003 + }, + { + "epoch": 1.9460247994164843, + "grad_norm": 18.75, + "learning_rate": 4.601140153821515e-09, + "loss": 0.506, + "step": 8004 + }, + { + "epoch": 1.9462679309506443, + "grad_norm": 25.25, + "learning_rate": 4.559806963699842e-09, + "loss": 0.9511, + "step": 8005 + }, + { + "epoch": 1.9465110624848043, + "grad_norm": 18.25, + "learning_rate": 4.518659926124602e-09, + "loss": 0.584, + "step": 8006 + }, + { + "epoch": 1.9467541940189643, + "grad_norm": 31.125, + "learning_rate": 4.477699047245876e-09, + "loss": 1.1278, + "step": 8007 + }, + { + "epoch": 1.946997325553124, + "grad_norm": 18.5, + "learning_rate": 4.436924333185849e-09, + "loss": 0.4784, + "step": 8008 + }, + { + "epoch": 1.9472404570872843, + "grad_norm": 18.375, + "learning_rate": 4.3963357900390915e-09, + "loss": 0.2685, + "step": 8009 + }, + { + "epoch": 1.947483588621444, + "grad_norm": 36.25, + "learning_rate": 4.355933423872416e-09, + "loss": 0.7514, + "step": 8010 + }, + { + "epoch": 1.9477267201556043, + "grad_norm": 28.0, + "learning_rate": 4.315717240724465e-09, + "loss": 0.5849, + "step": 8011 + }, + { + "epoch": 1.947969851689764, + "grad_norm": 23.625, + "learning_rate": 4.275687246606125e-09, + "loss": 0.7061, + "step": 8012 + }, + { + "epoch": 1.948212983223924, + "grad_norm": 20.25, + "learning_rate": 4.235843447500942e-09, + "loss": 0.5486, + "step": 8013 + }, + { + "epoch": 1.9484561147580841, + "grad_norm": 19.0, + "learning_rate": 4.196185849363876e-09, + "loss": 0.6862, + "step": 8014 + }, + { + "epoch": 1.9486992462922441, + "grad_norm": 20.625, + "learning_rate": 4.156714458122685e-09, + "loss": 0.8226, + "step": 8015 + }, + { + "epoch": 1.9489423778264041, + "grad_norm": 20.875, + "learning_rate": 4.117429279676954e-09, + "loss": 0.6409, + "step": 8016 + }, + { + "epoch": 1.9491855093605641, + "grad_norm": 18.375, + "learning_rate": 4.078330319898655e-09, + "loss": 0.5071, + "step": 8017 + }, + { + "epoch": 1.9494286408947241, + "grad_norm": 21.5, + "learning_rate": 4.039417584631583e-09, + "loss": 0.6227, + "step": 8018 + }, + { + "epoch": 1.949671772428884, + "grad_norm": 25.375, + "learning_rate": 4.00069107969206e-09, + "loss": 1.1477, + "step": 8019 + }, + { + "epoch": 1.9499149039630441, + "grad_norm": 28.25, + "learning_rate": 3.962150810868509e-09, + "loss": 0.9367, + "step": 8020 + }, + { + "epoch": 1.950158035497204, + "grad_norm": 28.5, + "learning_rate": 3.923796783921185e-09, + "loss": 1.0795, + "step": 8021 + }, + { + "epoch": 1.9504011670313641, + "grad_norm": 20.0, + "learning_rate": 3.8856290045830025e-09, + "loss": 0.8159, + "step": 8022 + }, + { + "epoch": 1.950644298565524, + "grad_norm": 17.375, + "learning_rate": 3.847647478558564e-09, + "loss": 0.9348, + "step": 8023 + }, + { + "epoch": 1.950887430099684, + "grad_norm": 23.875, + "learning_rate": 3.809852211525134e-09, + "loss": 1.037, + "step": 8024 + }, + { + "epoch": 1.951130561633844, + "grad_norm": 17.25, + "learning_rate": 3.772243209131804e-09, + "loss": 0.5541, + "step": 8025 + }, + { + "epoch": 1.951373693168004, + "grad_norm": 14.875, + "learning_rate": 3.734820476999773e-09, + "loss": 0.3642, + "step": 8026 + }, + { + "epoch": 1.951616824702164, + "grad_norm": 18.875, + "learning_rate": 3.6975840207224832e-09, + "loss": 0.6307, + "step": 8027 + }, + { + "epoch": 1.9518599562363237, + "grad_norm": 21.75, + "learning_rate": 3.6605338458657604e-09, + "loss": 0.5342, + "step": 8028 + }, + { + "epoch": 1.952103087770484, + "grad_norm": 17.5, + "learning_rate": 3.6236699579672574e-09, + "loss": 0.8729, + "step": 8029 + }, + { + "epoch": 1.9523462193046437, + "grad_norm": 19.625, + "learning_rate": 3.586992362536873e-09, + "loss": 0.9134, + "step": 8030 + }, + { + "epoch": 1.952589350838804, + "grad_norm": 20.375, + "learning_rate": 3.5505010650567493e-09, + "loss": 0.5217, + "step": 8031 + }, + { + "epoch": 1.9528324823729637, + "grad_norm": 17.75, + "learning_rate": 3.514196070981274e-09, + "loss": 0.7421, + "step": 8032 + }, + { + "epoch": 1.953075613907124, + "grad_norm": 17.5, + "learning_rate": 3.4780773857366613e-09, + "loss": 0.468, + "step": 8033 + }, + { + "epoch": 1.9533187454412837, + "grad_norm": 20.125, + "learning_rate": 3.4421450147216483e-09, + "loss": 0.6938, + "step": 8034 + }, + { + "epoch": 1.9535618769754437, + "grad_norm": 20.125, + "learning_rate": 3.4063989633069395e-09, + "loss": 0.7182, + "step": 8035 + }, + { + "epoch": 1.9538050085096037, + "grad_norm": 21.875, + "learning_rate": 3.370839236835066e-09, + "loss": 0.669, + "step": 8036 + }, + { + "epoch": 1.9540481400437637, + "grad_norm": 21.5, + "learning_rate": 3.335465840621499e-09, + "loss": 0.6374, + "step": 8037 + }, + { + "epoch": 1.9542912715779237, + "grad_norm": 21.75, + "learning_rate": 3.30027877995312e-09, + "loss": 0.644, + "step": 8038 + }, + { + "epoch": 1.9545344031120835, + "grad_norm": 20.375, + "learning_rate": 3.2652780600894717e-09, + "loss": 0.7191, + "step": 8039 + }, + { + "epoch": 1.9547775346462437, + "grad_norm": 25.0, + "learning_rate": 3.230463686261648e-09, + "loss": 0.9739, + "step": 8040 + }, + { + "epoch": 1.9550206661804035, + "grad_norm": 14.5, + "learning_rate": 3.19583566367368e-09, + "loss": 0.2626, + "step": 8041 + }, + { + "epoch": 1.9552637977145637, + "grad_norm": 18.25, + "learning_rate": 3.161393997501011e-09, + "loss": 0.4982, + "step": 8042 + }, + { + "epoch": 1.9555069292487235, + "grad_norm": 19.5, + "learning_rate": 3.127138692891607e-09, + "loss": 0.6999, + "step": 8043 + }, + { + "epoch": 1.9557500607828835, + "grad_norm": 18.5, + "learning_rate": 3.0930697549656776e-09, + "loss": 0.7614, + "step": 8044 + }, + { + "epoch": 1.9559931923170435, + "grad_norm": 19.375, + "learning_rate": 3.0591871888152604e-09, + "loss": 0.6758, + "step": 8045 + }, + { + "epoch": 1.9562363238512035, + "grad_norm": 18.25, + "learning_rate": 3.0254909995047764e-09, + "loss": 0.8464, + "step": 8046 + }, + { + "epoch": 1.9564794553853635, + "grad_norm": 19.375, + "learning_rate": 2.991981192070614e-09, + "loss": 0.8545, + "step": 8047 + }, + { + "epoch": 1.9567225869195235, + "grad_norm": 19.5, + "learning_rate": 2.958657771521406e-09, + "loss": 0.4463, + "step": 8048 + }, + { + "epoch": 1.9569657184536835, + "grad_norm": 22.25, + "learning_rate": 2.925520742838028e-09, + "loss": 0.7963, + "step": 8049 + }, + { + "epoch": 1.9572088499878433, + "grad_norm": 29.25, + "learning_rate": 2.892570110973186e-09, + "loss": 0.7685, + "step": 8050 + }, + { + "epoch": 1.9574519815220035, + "grad_norm": 25.25, + "learning_rate": 2.8598058808521066e-09, + "loss": 0.9221, + "step": 8051 + }, + { + "epoch": 1.9576951130561633, + "grad_norm": 17.75, + "learning_rate": 2.8272280573718445e-09, + "loss": 0.7689, + "step": 8052 + }, + { + "epoch": 1.9579382445903235, + "grad_norm": 30.5, + "learning_rate": 2.7948366454016997e-09, + "loss": 0.7574, + "step": 8053 + }, + { + "epoch": 1.9581813761244833, + "grad_norm": 18.25, + "learning_rate": 2.7626316497833537e-09, + "loss": 0.5203, + "step": 8054 + }, + { + "epoch": 1.9584245076586433, + "grad_norm": 23.875, + "learning_rate": 2.7306130753301796e-09, + "loss": 0.3809, + "step": 8055 + }, + { + "epoch": 1.9586676391928033, + "grad_norm": 19.75, + "learning_rate": 2.6987809268280708e-09, + "loss": 0.4265, + "step": 8056 + }, + { + "epoch": 1.9589107707269633, + "grad_norm": 25.625, + "learning_rate": 2.6671352090346102e-09, + "loss": 0.9329, + "step": 8057 + }, + { + "epoch": 1.9591539022611233, + "grad_norm": 21.0, + "learning_rate": 2.6356759266800424e-09, + "loss": 0.6036, + "step": 8058 + }, + { + "epoch": 1.9593970337952833, + "grad_norm": 55.75, + "learning_rate": 2.6044030844664394e-09, + "loss": 0.9563, + "step": 8059 + }, + { + "epoch": 1.9596401653294433, + "grad_norm": 24.0, + "learning_rate": 2.573316687068117e-09, + "loss": 0.7326, + "step": 8060 + }, + { + "epoch": 1.959883296863603, + "grad_norm": 19.875, + "learning_rate": 2.5424167391313594e-09, + "loss": 0.887, + "step": 8061 + }, + { + "epoch": 1.9601264283977633, + "grad_norm": 15.125, + "learning_rate": 2.5117032452748326e-09, + "loss": 0.4569, + "step": 8062 + }, + { + "epoch": 1.960369559931923, + "grad_norm": 27.375, + "learning_rate": 2.4811762100890313e-09, + "loss": 0.7148, + "step": 8063 + }, + { + "epoch": 1.9606126914660833, + "grad_norm": 24.25, + "learning_rate": 2.4508356381368338e-09, + "loss": 0.5914, + "step": 8064 + }, + { + "epoch": 1.960855823000243, + "grad_norm": 27.125, + "learning_rate": 2.420681533953223e-09, + "loss": 0.6904, + "step": 8065 + }, + { + "epoch": 1.961098954534403, + "grad_norm": 18.25, + "learning_rate": 2.3907139020451498e-09, + "loss": 0.7931, + "step": 8066 + }, + { + "epoch": 1.961342086068563, + "grad_norm": 19.75, + "learning_rate": 2.3609327468919475e-09, + "loss": 0.5704, + "step": 8067 + }, + { + "epoch": 1.961585217602723, + "grad_norm": 12.625, + "learning_rate": 2.3313380729446387e-09, + "loss": 0.2747, + "step": 8068 + }, + { + "epoch": 1.961828349136883, + "grad_norm": 22.25, + "learning_rate": 2.3019298846270465e-09, + "loss": 0.6828, + "step": 8069 + }, + { + "epoch": 1.962071480671043, + "grad_norm": 23.5, + "learning_rate": 2.2727081863344046e-09, + "loss": 0.9959, + "step": 8070 + }, + { + "epoch": 1.962314612205203, + "grad_norm": 16.25, + "learning_rate": 2.243672982434608e-09, + "loss": 0.7359, + "step": 8071 + }, + { + "epoch": 1.9625577437393629, + "grad_norm": 16.25, + "learning_rate": 2.2148242772672414e-09, + "loss": 0.6741, + "step": 8072 + }, + { + "epoch": 1.962800875273523, + "grad_norm": 20.625, + "learning_rate": 2.186162075144549e-09, + "loss": 0.6812, + "step": 8073 + }, + { + "epoch": 1.9630440068076829, + "grad_norm": 18.25, + "learning_rate": 2.157686380350188e-09, + "loss": 0.5299, + "step": 8074 + }, + { + "epoch": 1.963287138341843, + "grad_norm": 18.75, + "learning_rate": 2.1293971971407535e-09, + "loss": 0.6591, + "step": 8075 + }, + { + "epoch": 1.9635302698760029, + "grad_norm": 25.25, + "learning_rate": 2.101294529744391e-09, + "loss": 0.5148, + "step": 8076 + }, + { + "epoch": 1.9637734014101629, + "grad_norm": 20.75, + "learning_rate": 2.0733783823614906e-09, + "loss": 0.6525, + "step": 8077 + }, + { + "epoch": 1.9640165329443229, + "grad_norm": 19.75, + "learning_rate": 2.045648759164548e-09, + "loss": 0.4643, + "step": 8078 + }, + { + "epoch": 1.9642596644784829, + "grad_norm": 19.5, + "learning_rate": 2.018105664298442e-09, + "loss": 0.6874, + "step": 8079 + }, + { + "epoch": 1.9645027960126429, + "grad_norm": 17.375, + "learning_rate": 1.9907491018797407e-09, + "loss": 0.2986, + "step": 8080 + }, + { + "epoch": 1.9647459275468027, + "grad_norm": 19.75, + "learning_rate": 1.963579075997535e-09, + "loss": 0.758, + "step": 8081 + }, + { + "epoch": 1.9649890590809629, + "grad_norm": 17.875, + "learning_rate": 1.936595590712742e-09, + "loss": 0.622, + "step": 8082 + }, + { + "epoch": 1.9652321906151227, + "grad_norm": 24.125, + "learning_rate": 1.9097986500585252e-09, + "loss": 0.7895, + "step": 8083 + }, + { + "epoch": 1.9654753221492829, + "grad_norm": 24.625, + "learning_rate": 1.883188258040153e-09, + "loss": 0.9597, + "step": 8084 + }, + { + "epoch": 1.9657184536834427, + "grad_norm": 24.0, + "learning_rate": 1.8567644186349987e-09, + "loss": 1.202, + "step": 8085 + }, + { + "epoch": 1.9659615852176027, + "grad_norm": 25.375, + "learning_rate": 1.8305271357925425e-09, + "loss": 1.1381, + "step": 8086 + }, + { + "epoch": 1.9662047167517627, + "grad_norm": 16.75, + "learning_rate": 1.8044764134346471e-09, + "loss": 1.1715, + "step": 8087 + }, + { + "epoch": 1.9664478482859227, + "grad_norm": 14.5, + "learning_rate": 1.778612255454587e-09, + "loss": 0.3874, + "step": 8088 + }, + { + "epoch": 1.9666909798200827, + "grad_norm": 22.625, + "learning_rate": 1.7529346657185754e-09, + "loss": 0.6248, + "step": 8089 + }, + { + "epoch": 1.9669341113542427, + "grad_norm": 20.75, + "learning_rate": 1.7274436480645141e-09, + "loss": 0.5769, + "step": 8090 + }, + { + "epoch": 1.9671772428884027, + "grad_norm": 16.375, + "learning_rate": 1.7021392063022724e-09, + "loss": 0.6754, + "step": 8091 + }, + { + "epoch": 1.9674203744225625, + "grad_norm": 26.875, + "learning_rate": 1.6770213442142412e-09, + "loss": 0.7949, + "step": 8092 + }, + { + "epoch": 1.9676635059567227, + "grad_norm": 19.875, + "learning_rate": 1.6520900655546401e-09, + "loss": 0.63, + "step": 8093 + }, + { + "epoch": 1.9679066374908825, + "grad_norm": 23.375, + "learning_rate": 1.6273453740499323e-09, + "loss": 0.5896, + "step": 8094 + }, + { + "epoch": 1.9681497690250427, + "grad_norm": 20.0, + "learning_rate": 1.6027872733985483e-09, + "loss": 0.4427, + "step": 8095 + }, + { + "epoch": 1.9683929005592025, + "grad_norm": 19.75, + "learning_rate": 1.578415767271163e-09, + "loss": 0.7467, + "step": 8096 + }, + { + "epoch": 1.9686360320933625, + "grad_norm": 18.0, + "learning_rate": 1.554230859310557e-09, + "loss": 0.7658, + "step": 8097 + }, + { + "epoch": 1.9688791636275225, + "grad_norm": 18.0, + "learning_rate": 1.5302325531316164e-09, + "loss": 0.5146, + "step": 8098 + }, + { + "epoch": 1.9691222951616825, + "grad_norm": 20.75, + "learning_rate": 1.5064208523211942e-09, + "loss": 0.6021, + "step": 8099 + }, + { + "epoch": 1.9693654266958425, + "grad_norm": 16.5, + "learning_rate": 1.482795760438388e-09, + "loss": 0.4044, + "step": 8100 + }, + { + "epoch": 1.9696085582300025, + "grad_norm": 20.125, + "learning_rate": 1.4593572810144008e-09, + "loss": 0.607, + "step": 8101 + }, + { + "epoch": 1.9698516897641625, + "grad_norm": 17.75, + "learning_rate": 1.4361054175524025e-09, + "loss": 0.5903, + "step": 8102 + }, + { + "epoch": 1.9700948212983223, + "grad_norm": 14.3125, + "learning_rate": 1.4130401735280852e-09, + "loss": 0.2701, + "step": 8103 + }, + { + "epoch": 1.9703379528324825, + "grad_norm": 30.75, + "learning_rate": 1.3901615523886914e-09, + "loss": 1.1624, + "step": 8104 + }, + { + "epoch": 1.9705810843666423, + "grad_norm": 27.0, + "learning_rate": 1.3674695575538465e-09, + "loss": 0.8752, + "step": 8105 + }, + { + "epoch": 1.9708242159008025, + "grad_norm": 16.375, + "learning_rate": 1.3449641924152823e-09, + "loss": 0.4863, + "step": 8106 + }, + { + "epoch": 1.9710673474349623, + "grad_norm": 19.75, + "learning_rate": 1.3226454603369744e-09, + "loss": 0.7416, + "step": 8107 + }, + { + "epoch": 1.9713104789691223, + "grad_norm": 16.0, + "learning_rate": 1.3005133646545875e-09, + "loss": 0.4618, + "step": 8108 + }, + { + "epoch": 1.9715536105032823, + "grad_norm": 28.0, + "learning_rate": 1.2785679086763092e-09, + "loss": 0.8066, + "step": 8109 + }, + { + "epoch": 1.9717967420374423, + "grad_norm": 18.25, + "learning_rate": 1.2568090956821543e-09, + "loss": 0.5981, + "step": 8110 + }, + { + "epoch": 1.9720398735716023, + "grad_norm": 17.375, + "learning_rate": 1.2352369289245214e-09, + "loss": 0.5006, + "step": 8111 + }, + { + "epoch": 1.9722830051057623, + "grad_norm": 15.875, + "learning_rate": 1.2138514116276369e-09, + "loss": 0.398, + "step": 8112 + }, + { + "epoch": 1.9725261366399223, + "grad_norm": 15.125, + "learning_rate": 1.1926525469878326e-09, + "loss": 0.5582, + "step": 8113 + }, + { + "epoch": 1.972769268174082, + "grad_norm": 23.625, + "learning_rate": 1.171640338173824e-09, + "loss": 0.8987, + "step": 8114 + }, + { + "epoch": 1.9730123997082423, + "grad_norm": 18.5, + "learning_rate": 1.1508147883261544e-09, + "loss": 0.5504, + "step": 8115 + }, + { + "epoch": 1.973255531242402, + "grad_norm": 17.0, + "learning_rate": 1.1301759005576118e-09, + "loss": 0.6745, + "step": 8116 + }, + { + "epoch": 1.9734986627765623, + "grad_norm": 18.5, + "learning_rate": 1.1097236779530895e-09, + "loss": 0.7148, + "step": 8117 + }, + { + "epoch": 1.973741794310722, + "grad_norm": 23.375, + "learning_rate": 1.0894581235693091e-09, + "loss": 1.4508, + "step": 8118 + }, + { + "epoch": 1.973984925844882, + "grad_norm": 23.25, + "learning_rate": 1.0693792404355141e-09, + "loss": 1.0882, + "step": 8119 + }, + { + "epoch": 1.974228057379042, + "grad_norm": 23.125, + "learning_rate": 1.049487031552776e-09, + "loss": 1.2678, + "step": 8120 + }, + { + "epoch": 1.974471188913202, + "grad_norm": 19.125, + "learning_rate": 1.0297814998942723e-09, + "loss": 0.6853, + "step": 8121 + }, + { + "epoch": 1.974714320447362, + "grad_norm": 16.375, + "learning_rate": 1.0102626484054245e-09, + "loss": 0.7197, + "step": 8122 + }, + { + "epoch": 1.9749574519815218, + "grad_norm": 14.6875, + "learning_rate": 9.909304800036213e-10, + "loss": 0.3409, + "step": 8123 + }, + { + "epoch": 1.975200583515682, + "grad_norm": 21.0, + "learning_rate": 9.717849975783567e-10, + "loss": 0.593, + "step": 8124 + }, + { + "epoch": 1.9754437150498418, + "grad_norm": 18.125, + "learning_rate": 9.528262039912306e-10, + "loss": 0.5253, + "step": 8125 + }, + { + "epoch": 1.975686846584002, + "grad_norm": 22.5, + "learning_rate": 9.340541020762262e-10, + "loss": 0.9469, + "step": 8126 + }, + { + "epoch": 1.9759299781181618, + "grad_norm": 25.875, + "learning_rate": 9.154686946387381e-10, + "loss": 0.716, + "step": 8127 + }, + { + "epoch": 1.976173109652322, + "grad_norm": 16.5, + "learning_rate": 8.970699844569608e-10, + "loss": 0.4917, + "step": 8128 + }, + { + "epoch": 1.9764162411864818, + "grad_norm": 16.25, + "learning_rate": 8.788579742809167e-10, + "loss": 0.4885, + "step": 8129 + }, + { + "epoch": 1.9766593727206418, + "grad_norm": 20.0, + "learning_rate": 8.608326668325951e-10, + "loss": 0.7131, + "step": 8130 + }, + { + "epoch": 1.9769025042548019, + "grad_norm": 18.125, + "learning_rate": 8.429940648060908e-10, + "loss": 0.4055, + "step": 8131 + }, + { + "epoch": 1.9771456357889619, + "grad_norm": 20.5, + "learning_rate": 8.253421708678822e-10, + "loss": 0.533, + "step": 8132 + }, + { + "epoch": 1.9773887673231219, + "grad_norm": 19.25, + "learning_rate": 8.078769876562753e-10, + "loss": 0.5524, + "step": 8133 + }, + { + "epoch": 1.9776318988572816, + "grad_norm": 17.875, + "learning_rate": 7.90598517781821e-10, + "loss": 0.6555, + "step": 8134 + }, + { + "epoch": 1.9778750303914419, + "grad_norm": 16.875, + "learning_rate": 7.735067638268978e-10, + "loss": 0.6905, + "step": 8135 + }, + { + "epoch": 1.9781181619256016, + "grad_norm": 19.375, + "learning_rate": 7.566017283462679e-10, + "loss": 0.9201, + "step": 8136 + }, + { + "epoch": 1.9783612934597619, + "grad_norm": 25.625, + "learning_rate": 7.398834138667987e-10, + "loss": 0.986, + "step": 8137 + }, + { + "epoch": 1.9786044249939216, + "grad_norm": 14.1875, + "learning_rate": 7.233518228871861e-10, + "loss": 0.2434, + "step": 8138 + }, + { + "epoch": 1.9788475565280816, + "grad_norm": 17.5, + "learning_rate": 7.070069578783701e-10, + "loss": 0.5198, + "step": 8139 + }, + { + "epoch": 1.9790906880622416, + "grad_norm": 20.5, + "learning_rate": 6.908488212833963e-10, + "loss": 0.708, + "step": 8140 + }, + { + "epoch": 1.9793338195964016, + "grad_norm": 17.75, + "learning_rate": 6.748774155174165e-10, + "loss": 0.4918, + "step": 8141 + }, + { + "epoch": 1.9795769511305616, + "grad_norm": 16.25, + "learning_rate": 6.590927429675487e-10, + "loss": 0.6461, + "step": 8142 + }, + { + "epoch": 1.9798200826647216, + "grad_norm": 17.875, + "learning_rate": 6.434948059931557e-10, + "loss": 0.5496, + "step": 8143 + }, + { + "epoch": 1.9800632141988816, + "grad_norm": 19.125, + "learning_rate": 6.280836069255669e-10, + "loss": 0.51, + "step": 8144 + }, + { + "epoch": 1.9803063457330414, + "grad_norm": 17.875, + "learning_rate": 6.128591480683566e-10, + "loss": 0.5675, + "step": 8145 + }, + { + "epoch": 1.9805494772672017, + "grad_norm": 23.625, + "learning_rate": 5.978214316969267e-10, + "loss": 0.6228, + "step": 8146 + }, + { + "epoch": 1.9807926088013614, + "grad_norm": 25.875, + "learning_rate": 5.829704600590624e-10, + "loss": 0.7185, + "step": 8147 + }, + { + "epoch": 1.9810357403355217, + "grad_norm": 20.25, + "learning_rate": 5.683062353745161e-10, + "loss": 1.0372, + "step": 8148 + }, + { + "epoch": 1.9812788718696814, + "grad_norm": 24.375, + "learning_rate": 5.53828759834868e-10, + "loss": 0.6557, + "step": 8149 + }, + { + "epoch": 1.9815220034038414, + "grad_norm": 20.25, + "learning_rate": 5.39538035604359e-10, + "loss": 0.9553, + "step": 8150 + }, + { + "epoch": 1.9817651349380014, + "grad_norm": 19.75, + "learning_rate": 5.254340648186418e-10, + "loss": 0.4918, + "step": 8151 + }, + { + "epoch": 1.9820082664721614, + "grad_norm": 17.625, + "learning_rate": 5.115168495861689e-10, + "loss": 0.5586, + "step": 8152 + }, + { + "epoch": 1.9822513980063214, + "grad_norm": 17.0, + "learning_rate": 4.977863919868042e-10, + "loss": 0.6619, + "step": 8153 + }, + { + "epoch": 1.9824945295404814, + "grad_norm": 19.75, + "learning_rate": 4.842426940729339e-10, + "loss": 0.5333, + "step": 8154 + }, + { + "epoch": 1.9827376610746414, + "grad_norm": 37.0, + "learning_rate": 4.70885757868772e-10, + "loss": 1.0077, + "step": 8155 + }, + { + "epoch": 1.9829807926088012, + "grad_norm": 28.5, + "learning_rate": 4.577155853709159e-10, + "loss": 0.7056, + "step": 8156 + }, + { + "epoch": 1.9832239241429614, + "grad_norm": 17.375, + "learning_rate": 4.4473217854792974e-10, + "loss": 0.8223, + "step": 8157 + }, + { + "epoch": 1.9834670556771212, + "grad_norm": 16.375, + "learning_rate": 4.3193553934020584e-10, + "loss": 0.3124, + "step": 8158 + }, + { + "epoch": 1.9837101872112815, + "grad_norm": 21.0, + "learning_rate": 4.193256696605197e-10, + "loss": 0.6194, + "step": 8159 + }, + { + "epoch": 1.9839533187454412, + "grad_norm": 22.75, + "learning_rate": 4.069025713934749e-10, + "loss": 0.6732, + "step": 8160 + }, + { + "epoch": 1.9841964502796012, + "grad_norm": 17.375, + "learning_rate": 3.94666246396197e-10, + "loss": 0.3517, + "step": 8161 + }, + { + "epoch": 1.9844395818137612, + "grad_norm": 19.75, + "learning_rate": 3.826166964975009e-10, + "loss": 0.8733, + "step": 8162 + }, + { + "epoch": 1.9846827133479212, + "grad_norm": 23.125, + "learning_rate": 3.707539234983071e-10, + "loss": 0.7119, + "step": 8163 + }, + { + "epoch": 1.9849258448820812, + "grad_norm": 20.5, + "learning_rate": 3.5907792917178055e-10, + "loss": 0.7217, + "step": 8164 + }, + { + "epoch": 1.9851689764162412, + "grad_norm": 16.0, + "learning_rate": 3.475887152631918e-10, + "loss": 0.6005, + "step": 8165 + }, + { + "epoch": 1.9854121079504012, + "grad_norm": 20.25, + "learning_rate": 3.362862834896397e-10, + "loss": 0.6347, + "step": 8166 + }, + { + "epoch": 1.985655239484561, + "grad_norm": 22.875, + "learning_rate": 3.251706355404671e-10, + "loss": 0.5622, + "step": 8167 + }, + { + "epoch": 1.9858983710187212, + "grad_norm": 17.375, + "learning_rate": 3.1424177307726177e-10, + "loss": 0.4076, + "step": 8168 + }, + { + "epoch": 1.986141502552881, + "grad_norm": 16.875, + "learning_rate": 3.0349969773343924e-10, + "loss": 0.4844, + "step": 8169 + }, + { + "epoch": 1.9863846340870412, + "grad_norm": 22.375, + "learning_rate": 2.9294441111465956e-10, + "loss": 0.5908, + "step": 8170 + }, + { + "epoch": 1.986627765621201, + "grad_norm": 20.5, + "learning_rate": 2.8257591479841083e-10, + "loss": 1.0264, + "step": 8171 + }, + { + "epoch": 1.986870897155361, + "grad_norm": 19.625, + "learning_rate": 2.7239421033470324e-10, + "loss": 0.6554, + "step": 8172 + }, + { + "epoch": 1.987114028689521, + "grad_norm": 24.25, + "learning_rate": 2.623992992450974e-10, + "loss": 0.9349, + "step": 8173 + }, + { + "epoch": 1.987357160223681, + "grad_norm": 25.25, + "learning_rate": 2.52591183023676e-10, + "loss": 0.8927, + "step": 8174 + }, + { + "epoch": 1.987600291757841, + "grad_norm": 22.0, + "learning_rate": 2.4296986313634973e-10, + "loss": 0.7329, + "step": 8175 + }, + { + "epoch": 1.9878434232920008, + "grad_norm": 19.0, + "learning_rate": 2.335353410212737e-10, + "loss": 0.4344, + "step": 8176 + }, + { + "epoch": 1.988086554826161, + "grad_norm": 24.5, + "learning_rate": 2.2428761808857002e-10, + "loss": 1.004, + "step": 8177 + }, + { + "epoch": 1.9883296863603208, + "grad_norm": 16.875, + "learning_rate": 2.1522669572032752e-10, + "loss": 0.6985, + "step": 8178 + }, + { + "epoch": 1.988572817894481, + "grad_norm": 18.625, + "learning_rate": 2.0635257527115705e-10, + "loss": 0.4614, + "step": 8179 + }, + { + "epoch": 1.9888159494286408, + "grad_norm": 23.125, + "learning_rate": 1.9766525806708126e-10, + "loss": 0.9136, + "step": 8180 + }, + { + "epoch": 1.989059080962801, + "grad_norm": 18.125, + "learning_rate": 1.8916474540692231e-10, + "loss": 1.0467, + "step": 8181 + }, + { + "epoch": 1.9893022124969608, + "grad_norm": 19.375, + "learning_rate": 1.808510385610529e-10, + "loss": 0.9221, + "step": 8182 + }, + { + "epoch": 1.9895453440311208, + "grad_norm": 17.875, + "learning_rate": 1.7272413877209015e-10, + "loss": 0.7192, + "step": 8183 + }, + { + "epoch": 1.9897884755652808, + "grad_norm": 26.75, + "learning_rate": 1.6478404725475683e-10, + "loss": 1.3566, + "step": 8184 + }, + { + "epoch": 1.9900316070994408, + "grad_norm": 25.125, + "learning_rate": 1.5703076519588135e-10, + "loss": 0.7287, + "step": 8185 + }, + { + "epoch": 1.9902747386336008, + "grad_norm": 23.625, + "learning_rate": 1.4946429375425896e-10, + "loss": 0.665, + "step": 8186 + }, + { + "epoch": 1.9905178701677606, + "grad_norm": 15.0625, + "learning_rate": 1.4208463406092941e-10, + "loss": 0.3656, + "step": 8187 + }, + { + "epoch": 1.9907610017019208, + "grad_norm": 25.125, + "learning_rate": 1.3489178721876052e-10, + "loss": 0.8067, + "step": 8188 + }, + { + "epoch": 1.9910041332360806, + "grad_norm": 21.125, + "learning_rate": 1.2788575430300321e-10, + "loss": 0.8212, + "step": 8189 + }, + { + "epoch": 1.9912472647702408, + "grad_norm": 20.75, + "learning_rate": 1.210665363607366e-10, + "loss": 0.6031, + "step": 8190 + }, + { + "epoch": 1.9914903963044006, + "grad_norm": 23.125, + "learning_rate": 1.1443413441114548e-10, + "loss": 0.8211, + "step": 8191 + }, + { + "epoch": 1.9917335278385606, + "grad_norm": 20.125, + "learning_rate": 1.0798854944579774e-10, + "loss": 0.855, + "step": 8192 + }, + { + "epoch": 1.9919766593727206, + "grad_norm": 20.25, + "learning_rate": 1.0172978242781184e-10, + "loss": 0.5624, + "step": 8193 + }, + { + "epoch": 1.9922197909068806, + "grad_norm": 24.0, + "learning_rate": 9.565783429282827e-11, + "loss": 1.0241, + "step": 8194 + }, + { + "epoch": 1.9924629224410406, + "grad_norm": 25.25, + "learning_rate": 8.977270594845433e-11, + "loss": 0.5422, + "step": 8195 + }, + { + "epoch": 1.9927060539752006, + "grad_norm": 28.5, + "learning_rate": 8.407439827412545e-11, + "loss": 0.5915, + "step": 8196 + }, + { + "epoch": 1.9929491855093606, + "grad_norm": 15.125, + "learning_rate": 7.856291212179901e-11, + "loss": 0.3425, + "step": 8197 + }, + { + "epoch": 1.9931923170435204, + "grad_norm": 21.25, + "learning_rate": 7.323824831512171e-11, + "loss": 0.506, + "step": 8198 + }, + { + "epoch": 1.9934354485776806, + "grad_norm": 64.0, + "learning_rate": 6.810040764984593e-11, + "loss": 0.915, + "step": 8199 + }, + { + "epoch": 1.9936785801118404, + "grad_norm": 19.625, + "learning_rate": 6.314939089410721e-11, + "loss": 0.445, + "step": 8200 + }, + { + "epoch": 1.9939217116460006, + "grad_norm": 18.25, + "learning_rate": 5.838519878786919e-11, + "loss": 0.4293, + "step": 8201 + }, + { + "epoch": 1.9941648431801604, + "grad_norm": 17.625, + "learning_rate": 5.380783204320117e-11, + "loss": 0.8994, + "step": 8202 + }, + { + "epoch": 1.9944079747143204, + "grad_norm": 24.75, + "learning_rate": 4.9417291344278086e-11, + "loss": 0.6165, + "step": 8203 + }, + { + "epoch": 1.9946511062484804, + "grad_norm": 22.125, + "learning_rate": 4.5213577347241744e-11, + "loss": 0.7128, + "step": 8204 + }, + { + "epoch": 1.9948942377826404, + "grad_norm": 16.625, + "learning_rate": 4.119669068061716e-11, + "loss": 0.3796, + "step": 8205 + }, + { + "epoch": 1.9951373693168004, + "grad_norm": 20.5, + "learning_rate": 3.7366631944618646e-11, + "loss": 0.8241, + "step": 8206 + }, + { + "epoch": 1.9953805008509604, + "grad_norm": 23.25, + "learning_rate": 3.372340171184374e-11, + "loss": 0.7091, + "step": 8207 + }, + { + "epoch": 1.9956236323851204, + "grad_norm": 24.125, + "learning_rate": 3.026700052671805e-11, + "loss": 1.0373, + "step": 8208 + }, + { + "epoch": 1.9958667639192802, + "grad_norm": 18.0, + "learning_rate": 2.6997428905911617e-11, + "loss": 0.6254, + "step": 8209 + }, + { + "epoch": 1.9961098954534404, + "grad_norm": 20.0, + "learning_rate": 2.391468733806135e-11, + "loss": 0.6479, + "step": 8210 + }, + { + "epoch": 1.9963530269876002, + "grad_norm": 18.75, + "learning_rate": 2.101877628418736e-11, + "loss": 0.8676, + "step": 8211 + }, + { + "epoch": 1.9965961585217604, + "grad_norm": 16.25, + "learning_rate": 1.8309696176721514e-11, + "loss": 0.4131, + "step": 8212 + }, + { + "epoch": 1.9968392900559202, + "grad_norm": 21.25, + "learning_rate": 1.578744742089522e-11, + "loss": 0.5246, + "step": 8213 + }, + { + "epoch": 1.9970824215900802, + "grad_norm": 21.0, + "learning_rate": 1.345203039362919e-11, + "loss": 0.7029, + "step": 8214 + }, + { + "epoch": 1.9973255531242402, + "grad_norm": 19.875, + "learning_rate": 1.130344544394979e-11, + "loss": 0.686, + "step": 8215 + }, + { + "epoch": 1.9975686846584002, + "grad_norm": 30.625, + "learning_rate": 9.341692892989029e-12, + "loss": 1.0333, + "step": 8216 + }, + { + "epoch": 1.9978118161925602, + "grad_norm": 17.375, + "learning_rate": 7.566773034123342e-12, + "loss": 0.9167, + "step": 8217 + }, + { + "epoch": 1.9980549477267202, + "grad_norm": 17.875, + "learning_rate": 5.9786861324184765e-12, + "loss": 0.67, + "step": 8218 + }, + { + "epoch": 1.9982980792608802, + "grad_norm": 18.75, + "learning_rate": 4.57743242532338e-12, + "loss": 0.4391, + "step": 8219 + }, + { + "epoch": 1.99854121079504, + "grad_norm": 16.375, + "learning_rate": 3.3630121223926502e-12, + "loss": 0.6415, + "step": 8220 + }, + { + "epoch": 1.9987843423292002, + "grad_norm": 22.375, + "learning_rate": 2.3354254050089733e-12, + "loss": 0.7354, + "step": 8221 + }, + { + "epoch": 1.99902747386336, + "grad_norm": 20.25, + "learning_rate": 1.4946724266606815e-12, + "loss": 0.518, + "step": 8222 + }, + { + "epoch": 1.9992706053975202, + "grad_norm": 16.25, + "learning_rate": 8.407533133580891e-13, + "loss": 0.4478, + "step": 8223 + }, + { + "epoch": 1.99951373693168, + "grad_norm": 34.75, + "learning_rate": 3.7366816252326633e-13, + "loss": 1.1643, + "step": 8224 + }, + { + "epoch": 1.99975686846584, + "grad_norm": 32.75, + "learning_rate": 9.341704410026354e-14, + "loss": 0.9797, + "step": 8225 + }, + { + "epoch": 2.0, + "grad_norm": 22.875, + "learning_rate": 0.0, + "loss": 0.6045, + "step": 8226 + } + ], + "logging_steps": 1, + "max_steps": 8226, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2057, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.629779584710083e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}