diff --git "a/checkpoint-6444/trainer_state.json" "b/checkpoint-6444/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6444/trainer_state.json" @@ -0,0 +1,45141 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 6444, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000931098696461825, + "grad_norm": 4.652266025543213, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.7999, + "step": 1 + }, + { + "epoch": 0.00186219739292365, + "grad_norm": 4.820754528045654, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.7973, + "step": 2 + }, + { + "epoch": 0.002793296089385475, + "grad_norm": 4.462317943572998, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.8345, + "step": 3 + }, + { + "epoch": 0.0037243947858473, + "grad_norm": 4.625609874725342, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.8793, + "step": 4 + }, + { + "epoch": 0.004655493482309125, + "grad_norm": 5.072265625, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.8234, + "step": 5 + }, + { + "epoch": 0.00558659217877095, + "grad_norm": 5.068264484405518, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.7846, + "step": 6 + }, + { + "epoch": 0.006517690875232775, + "grad_norm": 4.836957931518555, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.8265, + "step": 7 + }, + { + "epoch": 0.0074487895716946, + "grad_norm": 4.526134490966797, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.777, + "step": 8 + }, + { + "epoch": 0.008379888268156424, + "grad_norm": 4.688526153564453, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.7964, + "step": 9 + }, + { + "epoch": 0.00931098696461825, + "grad_norm": 4.841691017150879, + "learning_rate": 5.000000000000001e-07, + "loss": 1.8594, + "step": 10 + }, + { + "epoch": 0.010242085661080074, + "grad_norm": 4.664731979370117, + "learning_rate": 5.5e-07, + "loss": 1.8016, + "step": 11 + }, + { + "epoch": 0.0111731843575419, + "grad_norm": 4.7134294509887695, + "learning_rate": 6.000000000000001e-07, + "loss": 1.7898, + "step": 12 + }, + { + "epoch": 0.012104283054003724, + "grad_norm": 4.467348098754883, + "learning_rate": 6.5e-07, + "loss": 1.767, + "step": 13 + }, + { + "epoch": 0.01303538175046555, + "grad_norm": 4.61416482925415, + "learning_rate": 7.000000000000001e-07, + "loss": 1.8564, + "step": 14 + }, + { + "epoch": 0.013966480446927373, + "grad_norm": 4.473025798797607, + "learning_rate": 7.5e-07, + "loss": 1.7543, + "step": 15 + }, + { + "epoch": 0.0148975791433892, + "grad_norm": 4.157939910888672, + "learning_rate": 8.000000000000001e-07, + "loss": 1.6981, + "step": 16 + }, + { + "epoch": 0.015828677839851025, + "grad_norm": 4.375277519226074, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8426, + "step": 17 + }, + { + "epoch": 0.01675977653631285, + "grad_norm": 3.9843714237213135, + "learning_rate": 9.000000000000001e-07, + "loss": 1.8064, + "step": 18 + }, + { + "epoch": 0.017690875232774673, + "grad_norm": 4.007570743560791, + "learning_rate": 9.500000000000001e-07, + "loss": 1.7724, + "step": 19 + }, + { + "epoch": 0.0186219739292365, + "grad_norm": 3.71211838722229, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7475, + "step": 20 + }, + { + "epoch": 0.019553072625698324, + "grad_norm": 3.6564059257507324, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.7303, + "step": 21 + }, + { + "epoch": 0.020484171322160148, + "grad_norm": 3.2663612365722656, + "learning_rate": 1.1e-06, + "loss": 1.68, + "step": 22 + }, + { + "epoch": 0.021415270018621976, + "grad_norm": 3.5662412643432617, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.7961, + "step": 23 + }, + { + "epoch": 0.0223463687150838, + "grad_norm": 3.1484689712524414, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.6784, + "step": 24 + }, + { + "epoch": 0.023277467411545624, + "grad_norm": 2.9891276359558105, + "learning_rate": 1.25e-06, + "loss": 1.6174, + "step": 25 + }, + { + "epoch": 0.024208566108007448, + "grad_norm": 2.7955873012542725, + "learning_rate": 1.3e-06, + "loss": 1.6626, + "step": 26 + }, + { + "epoch": 0.025139664804469275, + "grad_norm": 2.6824326515197754, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.693, + "step": 27 + }, + { + "epoch": 0.0260707635009311, + "grad_norm": 2.4676928520202637, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.6401, + "step": 28 + }, + { + "epoch": 0.027001862197392923, + "grad_norm": 2.366990327835083, + "learning_rate": 1.45e-06, + "loss": 1.595, + "step": 29 + }, + { + "epoch": 0.027932960893854747, + "grad_norm": 2.2949702739715576, + "learning_rate": 1.5e-06, + "loss": 1.595, + "step": 30 + }, + { + "epoch": 0.028864059590316574, + "grad_norm": 2.186319351196289, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.6694, + "step": 31 + }, + { + "epoch": 0.0297951582867784, + "grad_norm": 2.113795280456543, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.5779, + "step": 32 + }, + { + "epoch": 0.030726256983240222, + "grad_norm": 2.0262715816497803, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.5292, + "step": 33 + }, + { + "epoch": 0.03165735567970205, + "grad_norm": 1.9731030464172363, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.5463, + "step": 34 + }, + { + "epoch": 0.032588454376163874, + "grad_norm": 2.100799322128296, + "learning_rate": 1.75e-06, + "loss": 1.4808, + "step": 35 + }, + { + "epoch": 0.0335195530726257, + "grad_norm": 2.134826183319092, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.56, + "step": 36 + }, + { + "epoch": 0.03445065176908752, + "grad_norm": 2.2226216793060303, + "learning_rate": 1.85e-06, + "loss": 1.5058, + "step": 37 + }, + { + "epoch": 0.035381750465549346, + "grad_norm": 2.211540937423706, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.5198, + "step": 38 + }, + { + "epoch": 0.036312849162011177, + "grad_norm": 2.066465139389038, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.5065, + "step": 39 + }, + { + "epoch": 0.037243947858473, + "grad_norm": 2.0119385719299316, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.461, + "step": 40 + }, + { + "epoch": 0.038175046554934824, + "grad_norm": 1.9685887098312378, + "learning_rate": 2.05e-06, + "loss": 1.485, + "step": 41 + }, + { + "epoch": 0.03910614525139665, + "grad_norm": 1.9120490550994873, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.4428, + "step": 42 + }, + { + "epoch": 0.04003724394785847, + "grad_norm": 1.780713677406311, + "learning_rate": 2.15e-06, + "loss": 1.4762, + "step": 43 + }, + { + "epoch": 0.040968342644320296, + "grad_norm": 1.8144075870513916, + "learning_rate": 2.2e-06, + "loss": 1.4752, + "step": 44 + }, + { + "epoch": 0.04189944134078212, + "grad_norm": 1.7422245740890503, + "learning_rate": 2.25e-06, + "loss": 1.4352, + "step": 45 + }, + { + "epoch": 0.04283054003724395, + "grad_norm": 1.7721035480499268, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.413, + "step": 46 + }, + { + "epoch": 0.043761638733705775, + "grad_norm": 1.5673249959945679, + "learning_rate": 2.35e-06, + "loss": 1.4243, + "step": 47 + }, + { + "epoch": 0.0446927374301676, + "grad_norm": 1.660043716430664, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.4073, + "step": 48 + }, + { + "epoch": 0.04562383612662942, + "grad_norm": 1.6705303192138672, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.4532, + "step": 49 + }, + { + "epoch": 0.04655493482309125, + "grad_norm": 1.587410807609558, + "learning_rate": 2.5e-06, + "loss": 1.3665, + "step": 50 + }, + { + "epoch": 0.04748603351955307, + "grad_norm": 1.536221981048584, + "learning_rate": 2.55e-06, + "loss": 1.4453, + "step": 51 + }, + { + "epoch": 0.048417132216014895, + "grad_norm": 1.5890748500823975, + "learning_rate": 2.6e-06, + "loss": 1.4185, + "step": 52 + }, + { + "epoch": 0.049348230912476726, + "grad_norm": 1.5140084028244019, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.3715, + "step": 53 + }, + { + "epoch": 0.05027932960893855, + "grad_norm": 1.5359089374542236, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.4454, + "step": 54 + }, + { + "epoch": 0.051210428305400374, + "grad_norm": 1.6898878812789917, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.42, + "step": 55 + }, + { + "epoch": 0.0521415270018622, + "grad_norm": 1.7680490016937256, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.3853, + "step": 56 + }, + { + "epoch": 0.05307262569832402, + "grad_norm": 1.5281091928482056, + "learning_rate": 2.85e-06, + "loss": 1.3481, + "step": 57 + }, + { + "epoch": 0.054003724394785846, + "grad_norm": 1.6096315383911133, + "learning_rate": 2.9e-06, + "loss": 1.3911, + "step": 58 + }, + { + "epoch": 0.05493482309124767, + "grad_norm": 1.6074330806732178, + "learning_rate": 2.95e-06, + "loss": 1.3572, + "step": 59 + }, + { + "epoch": 0.055865921787709494, + "grad_norm": 1.5256246328353882, + "learning_rate": 3e-06, + "loss": 1.3775, + "step": 60 + }, + { + "epoch": 0.056797020484171325, + "grad_norm": 1.5154070854187012, + "learning_rate": 3.05e-06, + "loss": 1.3726, + "step": 61 + }, + { + "epoch": 0.05772811918063315, + "grad_norm": 1.547666072845459, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.422, + "step": 62 + }, + { + "epoch": 0.05865921787709497, + "grad_norm": 1.7028982639312744, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.3521, + "step": 63 + }, + { + "epoch": 0.0595903165735568, + "grad_norm": 1.5640915632247925, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.3419, + "step": 64 + }, + { + "epoch": 0.06052141527001862, + "grad_norm": 1.460614800453186, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.3397, + "step": 65 + }, + { + "epoch": 0.061452513966480445, + "grad_norm": 1.4689096212387085, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.3239, + "step": 66 + }, + { + "epoch": 0.06238361266294227, + "grad_norm": 1.5040433406829834, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.368, + "step": 67 + }, + { + "epoch": 0.0633147113594041, + "grad_norm": 1.4992578029632568, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.3752, + "step": 68 + }, + { + "epoch": 0.06424581005586592, + "grad_norm": 1.4988967180252075, + "learning_rate": 3.45e-06, + "loss": 1.3404, + "step": 69 + }, + { + "epoch": 0.06517690875232775, + "grad_norm": 1.4750804901123047, + "learning_rate": 3.5e-06, + "loss": 1.3108, + "step": 70 + }, + { + "epoch": 0.06610800744878957, + "grad_norm": 1.5298198461532593, + "learning_rate": 3.5500000000000003e-06, + "loss": 1.3074, + "step": 71 + }, + { + "epoch": 0.0670391061452514, + "grad_norm": 1.4945071935653687, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.3569, + "step": 72 + }, + { + "epoch": 0.06797020484171322, + "grad_norm": 1.4995956420898438, + "learning_rate": 3.65e-06, + "loss": 1.341, + "step": 73 + }, + { + "epoch": 0.06890130353817504, + "grad_norm": 1.4956507682800293, + "learning_rate": 3.7e-06, + "loss": 1.3281, + "step": 74 + }, + { + "epoch": 0.06983240223463687, + "grad_norm": 1.8083794116973877, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.3557, + "step": 75 + }, + { + "epoch": 0.07076350093109869, + "grad_norm": 1.417014241218567, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.32, + "step": 76 + }, + { + "epoch": 0.07169459962756052, + "grad_norm": 1.4146186113357544, + "learning_rate": 3.85e-06, + "loss": 1.3017, + "step": 77 + }, + { + "epoch": 0.07262569832402235, + "grad_norm": 1.394770622253418, + "learning_rate": 3.900000000000001e-06, + "loss": 1.289, + "step": 78 + }, + { + "epoch": 0.07355679702048418, + "grad_norm": 1.4548990726470947, + "learning_rate": 3.95e-06, + "loss": 1.2983, + "step": 79 + }, + { + "epoch": 0.074487895716946, + "grad_norm": 1.4482989311218262, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3115, + "step": 80 + }, + { + "epoch": 0.07541899441340782, + "grad_norm": 1.4169769287109375, + "learning_rate": 4.05e-06, + "loss": 1.2504, + "step": 81 + }, + { + "epoch": 0.07635009310986965, + "grad_norm": 1.5154104232788086, + "learning_rate": 4.1e-06, + "loss": 1.2701, + "step": 82 + }, + { + "epoch": 0.07728119180633147, + "grad_norm": 1.543452262878418, + "learning_rate": 4.15e-06, + "loss": 1.3176, + "step": 83 + }, + { + "epoch": 0.0782122905027933, + "grad_norm": 1.4130207300186157, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.2498, + "step": 84 + }, + { + "epoch": 0.07914338919925512, + "grad_norm": 1.4742012023925781, + "learning_rate": 4.25e-06, + "loss": 1.3082, + "step": 85 + }, + { + "epoch": 0.08007448789571694, + "grad_norm": 1.4841349124908447, + "learning_rate": 4.3e-06, + "loss": 1.3189, + "step": 86 + }, + { + "epoch": 0.08100558659217877, + "grad_norm": 1.4499276876449585, + "learning_rate": 4.350000000000001e-06, + "loss": 1.2709, + "step": 87 + }, + { + "epoch": 0.08193668528864059, + "grad_norm": 1.3884755373001099, + "learning_rate": 4.4e-06, + "loss": 1.2855, + "step": 88 + }, + { + "epoch": 0.08286778398510242, + "grad_norm": 1.4352260828018188, + "learning_rate": 4.450000000000001e-06, + "loss": 1.2763, + "step": 89 + }, + { + "epoch": 0.08379888268156424, + "grad_norm": 1.4299675226211548, + "learning_rate": 4.5e-06, + "loss": 1.2597, + "step": 90 + }, + { + "epoch": 0.08472998137802606, + "grad_norm": 1.4941706657409668, + "learning_rate": 4.5500000000000005e-06, + "loss": 1.2807, + "step": 91 + }, + { + "epoch": 0.0856610800744879, + "grad_norm": 1.5136346817016602, + "learning_rate": 4.600000000000001e-06, + "loss": 1.2434, + "step": 92 + }, + { + "epoch": 0.08659217877094973, + "grad_norm": 1.5864630937576294, + "learning_rate": 4.65e-06, + "loss": 1.2247, + "step": 93 + }, + { + "epoch": 0.08752327746741155, + "grad_norm": 1.6212531328201294, + "learning_rate": 4.7e-06, + "loss": 1.3218, + "step": 94 + }, + { + "epoch": 0.08845437616387337, + "grad_norm": 1.4686894416809082, + "learning_rate": 4.75e-06, + "loss": 1.2693, + "step": 95 + }, + { + "epoch": 0.0893854748603352, + "grad_norm": 1.456432819366455, + "learning_rate": 4.800000000000001e-06, + "loss": 1.2696, + "step": 96 + }, + { + "epoch": 0.09031657355679702, + "grad_norm": 1.449016809463501, + "learning_rate": 4.85e-06, + "loss": 1.2402, + "step": 97 + }, + { + "epoch": 0.09124767225325885, + "grad_norm": 1.454689860343933, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.3209, + "step": 98 + }, + { + "epoch": 0.09217877094972067, + "grad_norm": 1.5578409433364868, + "learning_rate": 4.95e-06, + "loss": 1.2525, + "step": 99 + }, + { + "epoch": 0.0931098696461825, + "grad_norm": 1.3689794540405273, + "learning_rate": 5e-06, + "loss": 1.2663, + "step": 100 + }, + { + "epoch": 0.09404096834264432, + "grad_norm": 1.7641292810440063, + "learning_rate": 4.999999693462649e-06, + "loss": 1.2557, + "step": 101 + }, + { + "epoch": 0.09497206703910614, + "grad_norm": 1.497525930404663, + "learning_rate": 4.999998773850669e-06, + "loss": 1.2893, + "step": 102 + }, + { + "epoch": 0.09590316573556797, + "grad_norm": 1.5183961391448975, + "learning_rate": 4.999997241164287e-06, + "loss": 1.2804, + "step": 103 + }, + { + "epoch": 0.09683426443202979, + "grad_norm": 1.4652289152145386, + "learning_rate": 4.999995095403878e-06, + "loss": 1.2537, + "step": 104 + }, + { + "epoch": 0.09776536312849161, + "grad_norm": 1.6081790924072266, + "learning_rate": 4.999992336569969e-06, + "loss": 1.2438, + "step": 105 + }, + { + "epoch": 0.09869646182495345, + "grad_norm": 1.6800453662872314, + "learning_rate": 4.999988964663236e-06, + "loss": 1.2564, + "step": 106 + }, + { + "epoch": 0.09962756052141528, + "grad_norm": 1.4903538227081299, + "learning_rate": 4.999984979684505e-06, + "loss": 1.2658, + "step": 107 + }, + { + "epoch": 0.1005586592178771, + "grad_norm": 1.5448410511016846, + "learning_rate": 4.999980381634756e-06, + "loss": 1.2739, + "step": 108 + }, + { + "epoch": 0.10148975791433892, + "grad_norm": 1.549705982208252, + "learning_rate": 4.999975170515114e-06, + "loss": 1.2424, + "step": 109 + }, + { + "epoch": 0.10242085661080075, + "grad_norm": 1.7979212999343872, + "learning_rate": 4.999969346326857e-06, + "loss": 1.2482, + "step": 110 + }, + { + "epoch": 0.10335195530726257, + "grad_norm": 1.4740439653396606, + "learning_rate": 4.999962909071414e-06, + "loss": 1.257, + "step": 111 + }, + { + "epoch": 0.1042830540037244, + "grad_norm": 1.5189530849456787, + "learning_rate": 4.999955858750365e-06, + "loss": 1.1822, + "step": 112 + }, + { + "epoch": 0.10521415270018622, + "grad_norm": 1.541293740272522, + "learning_rate": 4.999948195365436e-06, + "loss": 1.2725, + "step": 113 + }, + { + "epoch": 0.10614525139664804, + "grad_norm": 1.4437657594680786, + "learning_rate": 4.9999399189185085e-06, + "loss": 1.2148, + "step": 114 + }, + { + "epoch": 0.10707635009310987, + "grad_norm": 1.7418147325515747, + "learning_rate": 4.999931029411611e-06, + "loss": 1.2545, + "step": 115 + }, + { + "epoch": 0.10800744878957169, + "grad_norm": 1.450005292892456, + "learning_rate": 4.999921526846925e-06, + "loss": 1.2965, + "step": 116 + }, + { + "epoch": 0.10893854748603352, + "grad_norm": 1.5809992551803589, + "learning_rate": 4.999911411226779e-06, + "loss": 1.2881, + "step": 117 + }, + { + "epoch": 0.10986964618249534, + "grad_norm": 1.6253516674041748, + "learning_rate": 4.9999006825536545e-06, + "loss": 1.2578, + "step": 118 + }, + { + "epoch": 0.11080074487895716, + "grad_norm": 1.5688159465789795, + "learning_rate": 4.999889340830183e-06, + "loss": 1.2826, + "step": 119 + }, + { + "epoch": 0.11173184357541899, + "grad_norm": 1.5473788976669312, + "learning_rate": 4.999877386059144e-06, + "loss": 1.1899, + "step": 120 + }, + { + "epoch": 0.11266294227188083, + "grad_norm": 1.547783374786377, + "learning_rate": 4.999864818243471e-06, + "loss": 1.2287, + "step": 121 + }, + { + "epoch": 0.11359404096834265, + "grad_norm": 1.5663732290267944, + "learning_rate": 4.999851637386246e-06, + "loss": 1.2096, + "step": 122 + }, + { + "epoch": 0.11452513966480447, + "grad_norm": 1.7208932638168335, + "learning_rate": 4.9998378434907e-06, + "loss": 1.2281, + "step": 123 + }, + { + "epoch": 0.1154562383612663, + "grad_norm": 1.6368746757507324, + "learning_rate": 4.9998234365602164e-06, + "loss": 1.2892, + "step": 124 + }, + { + "epoch": 0.11638733705772812, + "grad_norm": 1.7141762971878052, + "learning_rate": 4.999808416598329e-06, + "loss": 1.2207, + "step": 125 + }, + { + "epoch": 0.11731843575418995, + "grad_norm": 1.6172239780426025, + "learning_rate": 4.99979278360872e-06, + "loss": 1.2188, + "step": 126 + }, + { + "epoch": 0.11824953445065177, + "grad_norm": 1.4444636106491089, + "learning_rate": 4.999776537595223e-06, + "loss": 1.2105, + "step": 127 + }, + { + "epoch": 0.1191806331471136, + "grad_norm": 1.526252269744873, + "learning_rate": 4.999759678561822e-06, + "loss": 1.2613, + "step": 128 + }, + { + "epoch": 0.12011173184357542, + "grad_norm": 1.5656245946884155, + "learning_rate": 4.999742206512653e-06, + "loss": 1.2439, + "step": 129 + }, + { + "epoch": 0.12104283054003724, + "grad_norm": 1.6116547584533691, + "learning_rate": 4.999724121451998e-06, + "loss": 1.2393, + "step": 130 + }, + { + "epoch": 0.12197392923649907, + "grad_norm": 1.5212883949279785, + "learning_rate": 4.999705423384296e-06, + "loss": 1.2332, + "step": 131 + }, + { + "epoch": 0.12290502793296089, + "grad_norm": 1.5971696376800537, + "learning_rate": 4.9996861123141274e-06, + "loss": 1.2412, + "step": 132 + }, + { + "epoch": 0.12383612662942271, + "grad_norm": 1.5015281438827515, + "learning_rate": 4.999666188246231e-06, + "loss": 1.262, + "step": 133 + }, + { + "epoch": 0.12476722532588454, + "grad_norm": 1.527295470237732, + "learning_rate": 4.999645651185492e-06, + "loss": 1.2187, + "step": 134 + }, + { + "epoch": 0.12569832402234637, + "grad_norm": 1.572506070137024, + "learning_rate": 4.999624501136947e-06, + "loss": 1.2401, + "step": 135 + }, + { + "epoch": 0.1266294227188082, + "grad_norm": 1.5193438529968262, + "learning_rate": 4.9996027381057825e-06, + "loss": 1.2444, + "step": 136 + }, + { + "epoch": 0.12756052141527002, + "grad_norm": 1.600056767463684, + "learning_rate": 4.9995803620973335e-06, + "loss": 1.2193, + "step": 137 + }, + { + "epoch": 0.12849162011173185, + "grad_norm": 1.4413717985153198, + "learning_rate": 4.999557373117091e-06, + "loss": 1.2166, + "step": 138 + }, + { + "epoch": 0.12942271880819367, + "grad_norm": 1.5173741579055786, + "learning_rate": 4.99953377117069e-06, + "loss": 1.2545, + "step": 139 + }, + { + "epoch": 0.1303538175046555, + "grad_norm": 1.6268305778503418, + "learning_rate": 4.999509556263919e-06, + "loss": 1.2699, + "step": 140 + }, + { + "epoch": 0.13128491620111732, + "grad_norm": 1.5302830934524536, + "learning_rate": 4.999484728402716e-06, + "loss": 1.2474, + "step": 141 + }, + { + "epoch": 0.13221601489757914, + "grad_norm": 1.5415630340576172, + "learning_rate": 4.99945928759317e-06, + "loss": 1.2487, + "step": 142 + }, + { + "epoch": 0.13314711359404097, + "grad_norm": 2.400120258331299, + "learning_rate": 4.999433233841519e-06, + "loss": 1.2102, + "step": 143 + }, + { + "epoch": 0.1340782122905028, + "grad_norm": 1.6354401111602783, + "learning_rate": 4.999406567154155e-06, + "loss": 1.2152, + "step": 144 + }, + { + "epoch": 0.13500931098696461, + "grad_norm": 1.504473090171814, + "learning_rate": 4.999379287537613e-06, + "loss": 1.2464, + "step": 145 + }, + { + "epoch": 0.13594040968342644, + "grad_norm": 1.461122751235962, + "learning_rate": 4.999351394998586e-06, + "loss": 1.2347, + "step": 146 + }, + { + "epoch": 0.13687150837988826, + "grad_norm": 1.4422621726989746, + "learning_rate": 4.999322889543913e-06, + "loss": 1.2246, + "step": 147 + }, + { + "epoch": 0.1378026070763501, + "grad_norm": 1.5868957042694092, + "learning_rate": 4.999293771180584e-06, + "loss": 1.2221, + "step": 148 + }, + { + "epoch": 0.1387337057728119, + "grad_norm": 1.6620584726333618, + "learning_rate": 4.999264039915741e-06, + "loss": 1.2739, + "step": 149 + }, + { + "epoch": 0.13966480446927373, + "grad_norm": 1.5058534145355225, + "learning_rate": 4.9992336957566735e-06, + "loss": 1.2088, + "step": 150 + }, + { + "epoch": 0.14059590316573556, + "grad_norm": 1.5899105072021484, + "learning_rate": 4.999202738710824e-06, + "loss": 1.2266, + "step": 151 + }, + { + "epoch": 0.14152700186219738, + "grad_norm": 1.592184066772461, + "learning_rate": 4.999171168785783e-06, + "loss": 1.1896, + "step": 152 + }, + { + "epoch": 0.1424581005586592, + "grad_norm": 1.6183990240097046, + "learning_rate": 4.999138985989293e-06, + "loss": 1.267, + "step": 153 + }, + { + "epoch": 0.14338919925512103, + "grad_norm": 1.4739596843719482, + "learning_rate": 4.999106190329247e-06, + "loss": 1.1763, + "step": 154 + }, + { + "epoch": 0.14432029795158285, + "grad_norm": 1.6200612783432007, + "learning_rate": 4.9990727818136865e-06, + "loss": 1.2519, + "step": 155 + }, + { + "epoch": 0.1452513966480447, + "grad_norm": 1.555408000946045, + "learning_rate": 4.9990387604508035e-06, + "loss": 1.2342, + "step": 156 + }, + { + "epoch": 0.14618249534450653, + "grad_norm": 1.7190477848052979, + "learning_rate": 4.999004126248943e-06, + "loss": 1.2354, + "step": 157 + }, + { + "epoch": 0.14711359404096835, + "grad_norm": 1.5006803274154663, + "learning_rate": 4.998968879216597e-06, + "loss": 1.2088, + "step": 158 + }, + { + "epoch": 0.14804469273743018, + "grad_norm": 1.536983847618103, + "learning_rate": 4.998933019362408e-06, + "loss": 1.2691, + "step": 159 + }, + { + "epoch": 0.148975791433892, + "grad_norm": 1.5591208934783936, + "learning_rate": 4.998896546695173e-06, + "loss": 1.212, + "step": 160 + }, + { + "epoch": 0.14990689013035383, + "grad_norm": 1.4551342725753784, + "learning_rate": 4.998859461223834e-06, + "loss": 1.248, + "step": 161 + }, + { + "epoch": 0.15083798882681565, + "grad_norm": 1.506801724433899, + "learning_rate": 4.9988217629574855e-06, + "loss": 1.2388, + "step": 162 + }, + { + "epoch": 0.15176908752327747, + "grad_norm": 2.3571014404296875, + "learning_rate": 4.998783451905374e-06, + "loss": 1.2038, + "step": 163 + }, + { + "epoch": 0.1527001862197393, + "grad_norm": 1.5549551248550415, + "learning_rate": 4.998744528076892e-06, + "loss": 1.2224, + "step": 164 + }, + { + "epoch": 0.15363128491620112, + "grad_norm": 1.6795048713684082, + "learning_rate": 4.998704991481587e-06, + "loss": 1.2473, + "step": 165 + }, + { + "epoch": 0.15456238361266295, + "grad_norm": 1.58402419090271, + "learning_rate": 4.9986648421291525e-06, + "loss": 1.2011, + "step": 166 + }, + { + "epoch": 0.15549348230912477, + "grad_norm": 1.589712142944336, + "learning_rate": 4.998624080029436e-06, + "loss": 1.1923, + "step": 167 + }, + { + "epoch": 0.1564245810055866, + "grad_norm": 1.5033369064331055, + "learning_rate": 4.998582705192433e-06, + "loss": 1.2498, + "step": 168 + }, + { + "epoch": 0.15735567970204842, + "grad_norm": 1.5704784393310547, + "learning_rate": 4.99854071762829e-06, + "loss": 1.2255, + "step": 169 + }, + { + "epoch": 0.15828677839851024, + "grad_norm": 1.637010097503662, + "learning_rate": 4.9984981173473025e-06, + "loss": 1.1929, + "step": 170 + }, + { + "epoch": 0.15921787709497207, + "grad_norm": 1.563796043395996, + "learning_rate": 4.998454904359919e-06, + "loss": 1.2376, + "step": 171 + }, + { + "epoch": 0.1601489757914339, + "grad_norm": 1.512030839920044, + "learning_rate": 4.998411078676736e-06, + "loss": 1.1806, + "step": 172 + }, + { + "epoch": 0.1610800744878957, + "grad_norm": 1.468132734298706, + "learning_rate": 4.998366640308501e-06, + "loss": 1.2143, + "step": 173 + }, + { + "epoch": 0.16201117318435754, + "grad_norm": 1.5265820026397705, + "learning_rate": 4.998321589266111e-06, + "loss": 1.2083, + "step": 174 + }, + { + "epoch": 0.16294227188081936, + "grad_norm": 1.471686601638794, + "learning_rate": 4.998275925560614e-06, + "loss": 1.2075, + "step": 175 + }, + { + "epoch": 0.16387337057728119, + "grad_norm": 1.5236564874649048, + "learning_rate": 4.9982296492032084e-06, + "loss": 1.1755, + "step": 176 + }, + { + "epoch": 0.164804469273743, + "grad_norm": 1.5038493871688843, + "learning_rate": 4.998182760205243e-06, + "loss": 1.2131, + "step": 177 + }, + { + "epoch": 0.16573556797020483, + "grad_norm": 1.6880675554275513, + "learning_rate": 4.9981352585782154e-06, + "loss": 1.2333, + "step": 178 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 1.4586597681045532, + "learning_rate": 4.9980871443337755e-06, + "loss": 1.1889, + "step": 179 + }, + { + "epoch": 0.16759776536312848, + "grad_norm": 1.5220293998718262, + "learning_rate": 4.998038417483721e-06, + "loss": 1.2134, + "step": 180 + }, + { + "epoch": 0.1685288640595903, + "grad_norm": 1.6228876113891602, + "learning_rate": 4.997989078040003e-06, + "loss": 1.2357, + "step": 181 + }, + { + "epoch": 0.16945996275605213, + "grad_norm": 1.5319768190383911, + "learning_rate": 4.99793912601472e-06, + "loss": 1.1883, + "step": 182 + }, + { + "epoch": 0.17039106145251395, + "grad_norm": 1.460939884185791, + "learning_rate": 4.99788856142012e-06, + "loss": 1.1607, + "step": 183 + }, + { + "epoch": 0.1713221601489758, + "grad_norm": 1.5423537492752075, + "learning_rate": 4.997837384268606e-06, + "loss": 1.2033, + "step": 184 + }, + { + "epoch": 0.17225325884543763, + "grad_norm": 1.5639525651931763, + "learning_rate": 4.997785594572726e-06, + "loss": 1.231, + "step": 185 + }, + { + "epoch": 0.17318435754189945, + "grad_norm": 1.5919618606567383, + "learning_rate": 4.997733192345181e-06, + "loss": 1.211, + "step": 186 + }, + { + "epoch": 0.17411545623836128, + "grad_norm": 1.6297861337661743, + "learning_rate": 4.997680177598823e-06, + "loss": 1.1782, + "step": 187 + }, + { + "epoch": 0.1750465549348231, + "grad_norm": 1.542855978012085, + "learning_rate": 4.99762655034665e-06, + "loss": 1.2185, + "step": 188 + }, + { + "epoch": 0.17597765363128492, + "grad_norm": 1.547610878944397, + "learning_rate": 4.997572310601816e-06, + "loss": 1.209, + "step": 189 + }, + { + "epoch": 0.17690875232774675, + "grad_norm": 1.5525280237197876, + "learning_rate": 4.99751745837762e-06, + "loss": 1.2088, + "step": 190 + }, + { + "epoch": 0.17783985102420857, + "grad_norm": 1.5458629131317139, + "learning_rate": 4.997461993687514e-06, + "loss": 1.2057, + "step": 191 + }, + { + "epoch": 0.1787709497206704, + "grad_norm": 1.614859700202942, + "learning_rate": 4.997405916545101e-06, + "loss": 1.2307, + "step": 192 + }, + { + "epoch": 0.17970204841713222, + "grad_norm": 16.333786010742188, + "learning_rate": 4.99734922696413e-06, + "loss": 1.2222, + "step": 193 + }, + { + "epoch": 0.18063314711359404, + "grad_norm": 1.5468389987945557, + "learning_rate": 4.997291924958506e-06, + "loss": 1.2269, + "step": 194 + }, + { + "epoch": 0.18156424581005587, + "grad_norm": 1.5191587209701538, + "learning_rate": 4.997234010542279e-06, + "loss": 1.2483, + "step": 195 + }, + { + "epoch": 0.1824953445065177, + "grad_norm": 1.5326647758483887, + "learning_rate": 4.9971754837296514e-06, + "loss": 1.1967, + "step": 196 + }, + { + "epoch": 0.18342644320297952, + "grad_norm": 1.5105819702148438, + "learning_rate": 4.9971163445349775e-06, + "loss": 1.2059, + "step": 197 + }, + { + "epoch": 0.18435754189944134, + "grad_norm": 1.5659078359603882, + "learning_rate": 4.997056592972758e-06, + "loss": 1.226, + "step": 198 + }, + { + "epoch": 0.18528864059590316, + "grad_norm": 1.4845701456069946, + "learning_rate": 4.996996229057648e-06, + "loss": 1.1691, + "step": 199 + }, + { + "epoch": 0.186219739292365, + "grad_norm": 1.5166332721710205, + "learning_rate": 4.996935252804448e-06, + "loss": 1.2265, + "step": 200 + }, + { + "epoch": 0.1871508379888268, + "grad_norm": 1.6071895360946655, + "learning_rate": 4.9968736642281125e-06, + "loss": 1.2409, + "step": 201 + }, + { + "epoch": 0.18808193668528864, + "grad_norm": 1.712390661239624, + "learning_rate": 4.9968114633437445e-06, + "loss": 1.1973, + "step": 202 + }, + { + "epoch": 0.18901303538175046, + "grad_norm": 1.5515329837799072, + "learning_rate": 4.996748650166599e-06, + "loss": 1.1616, + "step": 203 + }, + { + "epoch": 0.18994413407821228, + "grad_norm": 1.6127967834472656, + "learning_rate": 4.996685224712077e-06, + "loss": 1.2296, + "step": 204 + }, + { + "epoch": 0.1908752327746741, + "grad_norm": 1.585608720779419, + "learning_rate": 4.996621186995734e-06, + "loss": 1.2399, + "step": 205 + }, + { + "epoch": 0.19180633147113593, + "grad_norm": 1.6636781692504883, + "learning_rate": 4.996556537033274e-06, + "loss": 1.2427, + "step": 206 + }, + { + "epoch": 0.19273743016759776, + "grad_norm": 1.6432000398635864, + "learning_rate": 4.9964912748405504e-06, + "loss": 1.2324, + "step": 207 + }, + { + "epoch": 0.19366852886405958, + "grad_norm": 1.5804307460784912, + "learning_rate": 4.996425400433569e-06, + "loss": 1.1804, + "step": 208 + }, + { + "epoch": 0.1945996275605214, + "grad_norm": 1.5581324100494385, + "learning_rate": 4.996358913828482e-06, + "loss": 1.1843, + "step": 209 + }, + { + "epoch": 0.19553072625698323, + "grad_norm": 1.606521725654602, + "learning_rate": 4.996291815041595e-06, + "loss": 1.2607, + "step": 210 + }, + { + "epoch": 0.19646182495344505, + "grad_norm": 1.5909523963928223, + "learning_rate": 4.996224104089363e-06, + "loss": 1.2228, + "step": 211 + }, + { + "epoch": 0.1973929236499069, + "grad_norm": 1.5124651193618774, + "learning_rate": 4.99615578098839e-06, + "loss": 1.1794, + "step": 212 + }, + { + "epoch": 0.19832402234636873, + "grad_norm": 1.5392098426818848, + "learning_rate": 4.9960868457554305e-06, + "loss": 1.2181, + "step": 213 + }, + { + "epoch": 0.19925512104283055, + "grad_norm": 1.5319880247116089, + "learning_rate": 4.996017298407391e-06, + "loss": 1.1936, + "step": 214 + }, + { + "epoch": 0.20018621973929238, + "grad_norm": 1.5836493968963623, + "learning_rate": 4.995947138961326e-06, + "loss": 1.2581, + "step": 215 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 1.6041829586029053, + "learning_rate": 4.995876367434439e-06, + "loss": 1.2059, + "step": 216 + }, + { + "epoch": 0.20204841713221602, + "grad_norm": 1.6239100694656372, + "learning_rate": 4.995804983844088e-06, + "loss": 1.1831, + "step": 217 + }, + { + "epoch": 0.20297951582867785, + "grad_norm": 1.5361995697021484, + "learning_rate": 4.995732988207777e-06, + "loss": 1.1793, + "step": 218 + }, + { + "epoch": 0.20391061452513967, + "grad_norm": 1.5640617609024048, + "learning_rate": 4.995660380543162e-06, + "loss": 1.2027, + "step": 219 + }, + { + "epoch": 0.2048417132216015, + "grad_norm": 1.5686819553375244, + "learning_rate": 4.995587160868047e-06, + "loss": 1.1653, + "step": 220 + }, + { + "epoch": 0.20577281191806332, + "grad_norm": 1.7007719278335571, + "learning_rate": 4.99551332920039e-06, + "loss": 1.1909, + "step": 221 + }, + { + "epoch": 0.20670391061452514, + "grad_norm": 1.6069990396499634, + "learning_rate": 4.995438885558294e-06, + "loss": 1.1549, + "step": 222 + }, + { + "epoch": 0.20763500931098697, + "grad_norm": 1.6227385997772217, + "learning_rate": 4.9953638299600174e-06, + "loss": 1.2101, + "step": 223 + }, + { + "epoch": 0.2085661080074488, + "grad_norm": 1.5943350791931152, + "learning_rate": 4.995288162423965e-06, + "loss": 1.2195, + "step": 224 + }, + { + "epoch": 0.20949720670391062, + "grad_norm": 1.5146862268447876, + "learning_rate": 4.995211882968692e-06, + "loss": 1.154, + "step": 225 + }, + { + "epoch": 0.21042830540037244, + "grad_norm": 1.5101516246795654, + "learning_rate": 4.995134991612906e-06, + "loss": 1.2102, + "step": 226 + }, + { + "epoch": 0.21135940409683426, + "grad_norm": 1.5723508596420288, + "learning_rate": 4.995057488375462e-06, + "loss": 1.2054, + "step": 227 + }, + { + "epoch": 0.2122905027932961, + "grad_norm": 1.5589677095413208, + "learning_rate": 4.994979373275366e-06, + "loss": 1.1447, + "step": 228 + }, + { + "epoch": 0.2132216014897579, + "grad_norm": 2.5887372493743896, + "learning_rate": 4.9949006463317754e-06, + "loss": 1.1973, + "step": 229 + }, + { + "epoch": 0.21415270018621974, + "grad_norm": 1.6149126291275024, + "learning_rate": 4.994821307563995e-06, + "loss": 1.2131, + "step": 230 + }, + { + "epoch": 0.21508379888268156, + "grad_norm": 1.491944432258606, + "learning_rate": 4.994741356991481e-06, + "loss": 1.2111, + "step": 231 + }, + { + "epoch": 0.21601489757914338, + "grad_norm": 1.6424049139022827, + "learning_rate": 4.99466079463384e-06, + "loss": 1.1892, + "step": 232 + }, + { + "epoch": 0.2169459962756052, + "grad_norm": 1.5669760704040527, + "learning_rate": 4.99457962051083e-06, + "loss": 1.2215, + "step": 233 + }, + { + "epoch": 0.21787709497206703, + "grad_norm": 1.597413420677185, + "learning_rate": 4.994497834642355e-06, + "loss": 1.2191, + "step": 234 + }, + { + "epoch": 0.21880819366852886, + "grad_norm": 1.5258022546768188, + "learning_rate": 4.994415437048471e-06, + "loss": 1.1769, + "step": 235 + }, + { + "epoch": 0.21973929236499068, + "grad_norm": 1.5535242557525635, + "learning_rate": 4.994332427749387e-06, + "loss": 1.213, + "step": 236 + }, + { + "epoch": 0.2206703910614525, + "grad_norm": 1.4210128784179688, + "learning_rate": 4.994248806765457e-06, + "loss": 1.1963, + "step": 237 + }, + { + "epoch": 0.22160148975791433, + "grad_norm": 1.450096607208252, + "learning_rate": 4.994164574117189e-06, + "loss": 1.1394, + "step": 238 + }, + { + "epoch": 0.22253258845437615, + "grad_norm": 1.6149117946624756, + "learning_rate": 4.994079729825238e-06, + "loss": 1.1759, + "step": 239 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 1.5469238758087158, + "learning_rate": 4.993994273910411e-06, + "loss": 1.2085, + "step": 240 + }, + { + "epoch": 0.22439478584729983, + "grad_norm": 1.533961534500122, + "learning_rate": 4.9939082063936646e-06, + "loss": 1.1617, + "step": 241 + }, + { + "epoch": 0.22532588454376165, + "grad_norm": 1.6694387197494507, + "learning_rate": 4.993821527296104e-06, + "loss": 1.173, + "step": 242 + }, + { + "epoch": 0.22625698324022347, + "grad_norm": 1.634507656097412, + "learning_rate": 4.9937342366389875e-06, + "loss": 1.1698, + "step": 243 + }, + { + "epoch": 0.2271880819366853, + "grad_norm": 1.6566752195358276, + "learning_rate": 4.9936463344437196e-06, + "loss": 1.1804, + "step": 244 + }, + { + "epoch": 0.22811918063314712, + "grad_norm": 1.530820369720459, + "learning_rate": 4.993557820731857e-06, + "loss": 1.1491, + "step": 245 + }, + { + "epoch": 0.22905027932960895, + "grad_norm": 1.553288221359253, + "learning_rate": 4.993468695525106e-06, + "loss": 1.1816, + "step": 246 + }, + { + "epoch": 0.22998137802607077, + "grad_norm": 1.5366573333740234, + "learning_rate": 4.993378958845323e-06, + "loss": 1.1069, + "step": 247 + }, + { + "epoch": 0.2309124767225326, + "grad_norm": 1.692987322807312, + "learning_rate": 4.993288610714515e-06, + "loss": 1.2053, + "step": 248 + }, + { + "epoch": 0.23184357541899442, + "grad_norm": 1.6329057216644287, + "learning_rate": 4.993197651154835e-06, + "loss": 1.1967, + "step": 249 + }, + { + "epoch": 0.23277467411545624, + "grad_norm": 1.5369672775268555, + "learning_rate": 4.9931060801885924e-06, + "loss": 1.2103, + "step": 250 + }, + { + "epoch": 0.23370577281191807, + "grad_norm": 1.6424072980880737, + "learning_rate": 4.993013897838242e-06, + "loss": 1.2123, + "step": 251 + }, + { + "epoch": 0.2346368715083799, + "grad_norm": 1.7182517051696777, + "learning_rate": 4.992921104126388e-06, + "loss": 1.1979, + "step": 252 + }, + { + "epoch": 0.23556797020484171, + "grad_norm": 1.5643051862716675, + "learning_rate": 4.992827699075789e-06, + "loss": 1.153, + "step": 253 + }, + { + "epoch": 0.23649906890130354, + "grad_norm": 1.7497291564941406, + "learning_rate": 4.992733682709347e-06, + "loss": 1.1993, + "step": 254 + }, + { + "epoch": 0.23743016759776536, + "grad_norm": 1.7165666818618774, + "learning_rate": 4.9926390550501225e-06, + "loss": 1.1385, + "step": 255 + }, + { + "epoch": 0.2383612662942272, + "grad_norm": 1.5467196702957153, + "learning_rate": 4.992543816121317e-06, + "loss": 1.2077, + "step": 256 + }, + { + "epoch": 0.239292364990689, + "grad_norm": 1.5102182626724243, + "learning_rate": 4.9924479659462875e-06, + "loss": 1.1998, + "step": 257 + }, + { + "epoch": 0.24022346368715083, + "grad_norm": 1.470045804977417, + "learning_rate": 4.9923515045485395e-06, + "loss": 1.216, + "step": 258 + }, + { + "epoch": 0.24115456238361266, + "grad_norm": 1.7479827404022217, + "learning_rate": 4.992254431951729e-06, + "loss": 1.1651, + "step": 259 + }, + { + "epoch": 0.24208566108007448, + "grad_norm": 1.5148561000823975, + "learning_rate": 4.992156748179659e-06, + "loss": 1.1773, + "step": 260 + }, + { + "epoch": 0.2430167597765363, + "grad_norm": 1.5441746711730957, + "learning_rate": 4.992058453256284e-06, + "loss": 1.1554, + "step": 261 + }, + { + "epoch": 0.24394785847299813, + "grad_norm": 1.5650780200958252, + "learning_rate": 4.991959547205713e-06, + "loss": 1.1348, + "step": 262 + }, + { + "epoch": 0.24487895716945995, + "grad_norm": 1.6538890600204468, + "learning_rate": 4.991860030052196e-06, + "loss": 1.1757, + "step": 263 + }, + { + "epoch": 0.24581005586592178, + "grad_norm": 1.5534696578979492, + "learning_rate": 4.991759901820141e-06, + "loss": 1.1876, + "step": 264 + }, + { + "epoch": 0.2467411545623836, + "grad_norm": 1.6564414501190186, + "learning_rate": 4.991659162534101e-06, + "loss": 1.2146, + "step": 265 + }, + { + "epoch": 0.24767225325884543, + "grad_norm": 1.4657020568847656, + "learning_rate": 4.991557812218779e-06, + "loss": 1.1072, + "step": 266 + }, + { + "epoch": 0.24860335195530725, + "grad_norm": 1.6279875040054321, + "learning_rate": 4.991455850899032e-06, + "loss": 1.1923, + "step": 267 + }, + { + "epoch": 0.24953445065176907, + "grad_norm": 1.6531950235366821, + "learning_rate": 4.991353278599862e-06, + "loss": 1.1993, + "step": 268 + }, + { + "epoch": 0.2504655493482309, + "grad_norm": 1.5914767980575562, + "learning_rate": 4.991250095346423e-06, + "loss": 1.1991, + "step": 269 + }, + { + "epoch": 0.25139664804469275, + "grad_norm": 1.557533860206604, + "learning_rate": 4.991146301164019e-06, + "loss": 1.1358, + "step": 270 + }, + { + "epoch": 0.25232774674115455, + "grad_norm": 1.6352646350860596, + "learning_rate": 4.991041896078104e-06, + "loss": 1.1973, + "step": 271 + }, + { + "epoch": 0.2532588454376164, + "grad_norm": 1.5098541975021362, + "learning_rate": 4.990936880114279e-06, + "loss": 1.1683, + "step": 272 + }, + { + "epoch": 0.2541899441340782, + "grad_norm": 1.6232593059539795, + "learning_rate": 4.990831253298299e-06, + "loss": 1.1356, + "step": 273 + }, + { + "epoch": 0.25512104283054005, + "grad_norm": 1.5661216974258423, + "learning_rate": 4.990725015656068e-06, + "loss": 1.2006, + "step": 274 + }, + { + "epoch": 0.25605214152700184, + "grad_norm": 1.507642149925232, + "learning_rate": 4.990618167213636e-06, + "loss": 1.1316, + "step": 275 + }, + { + "epoch": 0.2569832402234637, + "grad_norm": 1.5950425863265991, + "learning_rate": 4.990510707997207e-06, + "loss": 1.1983, + "step": 276 + }, + { + "epoch": 0.2579143389199255, + "grad_norm": 1.6054936647415161, + "learning_rate": 4.990402638033132e-06, + "loss": 1.217, + "step": 277 + }, + { + "epoch": 0.25884543761638734, + "grad_norm": 1.5690265893936157, + "learning_rate": 4.990293957347914e-06, + "loss": 1.1826, + "step": 278 + }, + { + "epoch": 0.25977653631284914, + "grad_norm": 1.579464316368103, + "learning_rate": 4.990184665968204e-06, + "loss": 1.2222, + "step": 279 + }, + { + "epoch": 0.260707635009311, + "grad_norm": 1.4905120134353638, + "learning_rate": 4.990074763920804e-06, + "loss": 1.162, + "step": 280 + }, + { + "epoch": 0.2616387337057728, + "grad_norm": 1.6381080150604248, + "learning_rate": 4.989964251232667e-06, + "loss": 1.1937, + "step": 281 + }, + { + "epoch": 0.26256983240223464, + "grad_norm": 1.4682097434997559, + "learning_rate": 4.98985312793089e-06, + "loss": 1.1548, + "step": 282 + }, + { + "epoch": 0.2635009310986965, + "grad_norm": 1.54086434841156, + "learning_rate": 4.989741394042728e-06, + "loss": 1.1879, + "step": 283 + }, + { + "epoch": 0.2644320297951583, + "grad_norm": 1.6787452697753906, + "learning_rate": 4.989629049595579e-06, + "loss": 1.1718, + "step": 284 + }, + { + "epoch": 0.26536312849162014, + "grad_norm": 1.6381014585494995, + "learning_rate": 4.989516094616993e-06, + "loss": 1.1987, + "step": 285 + }, + { + "epoch": 0.26629422718808193, + "grad_norm": 1.5789836645126343, + "learning_rate": 4.98940252913467e-06, + "loss": 1.1694, + "step": 286 + }, + { + "epoch": 0.2672253258845438, + "grad_norm": 1.561802864074707, + "learning_rate": 4.989288353176463e-06, + "loss": 1.1576, + "step": 287 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 1.568241834640503, + "learning_rate": 4.989173566770366e-06, + "loss": 1.1579, + "step": 288 + }, + { + "epoch": 0.26908752327746743, + "grad_norm": 1.5002732276916504, + "learning_rate": 4.989058169944532e-06, + "loss": 1.18, + "step": 289 + }, + { + "epoch": 0.27001862197392923, + "grad_norm": 1.5480579137802124, + "learning_rate": 4.9889421627272575e-06, + "loss": 1.139, + "step": 290 + }, + { + "epoch": 0.2709497206703911, + "grad_norm": 1.5767931938171387, + "learning_rate": 4.988825545146993e-06, + "loss": 1.186, + "step": 291 + }, + { + "epoch": 0.2718808193668529, + "grad_norm": 1.6289966106414795, + "learning_rate": 4.988708317232335e-06, + "loss": 1.1851, + "step": 292 + }, + { + "epoch": 0.27281191806331473, + "grad_norm": 1.5453596115112305, + "learning_rate": 4.988590479012032e-06, + "loss": 1.1442, + "step": 293 + }, + { + "epoch": 0.2737430167597765, + "grad_norm": 1.6295948028564453, + "learning_rate": 4.988472030514982e-06, + "loss": 1.2072, + "step": 294 + }, + { + "epoch": 0.2746741154562384, + "grad_norm": 1.5502432584762573, + "learning_rate": 4.988352971770229e-06, + "loss": 1.201, + "step": 295 + }, + { + "epoch": 0.2756052141527002, + "grad_norm": 1.5260909795761108, + "learning_rate": 4.988233302806974e-06, + "loss": 1.2135, + "step": 296 + }, + { + "epoch": 0.276536312849162, + "grad_norm": 1.5481847524642944, + "learning_rate": 4.988113023654562e-06, + "loss": 1.1454, + "step": 297 + }, + { + "epoch": 0.2774674115456238, + "grad_norm": 1.5329092741012573, + "learning_rate": 4.987992134342488e-06, + "loss": 1.1294, + "step": 298 + }, + { + "epoch": 0.2783985102420857, + "grad_norm": 1.5602904558181763, + "learning_rate": 4.987870634900398e-06, + "loss": 1.1469, + "step": 299 + }, + { + "epoch": 0.27932960893854747, + "grad_norm": 1.5570727586746216, + "learning_rate": 4.987748525358087e-06, + "loss": 1.2041, + "step": 300 + }, + { + "epoch": 0.2802607076350093, + "grad_norm": 1.5078455209732056, + "learning_rate": 4.9876258057455015e-06, + "loss": 1.1436, + "step": 301 + }, + { + "epoch": 0.2811918063314711, + "grad_norm": 1.5646103620529175, + "learning_rate": 4.987502476092734e-06, + "loss": 1.1338, + "step": 302 + }, + { + "epoch": 0.28212290502793297, + "grad_norm": 1.6741316318511963, + "learning_rate": 4.987378536430031e-06, + "loss": 1.1459, + "step": 303 + }, + { + "epoch": 0.28305400372439476, + "grad_norm": 1.6432318687438965, + "learning_rate": 4.987253986787783e-06, + "loss": 1.2054, + "step": 304 + }, + { + "epoch": 0.2839851024208566, + "grad_norm": 1.5824742317199707, + "learning_rate": 4.987128827196537e-06, + "loss": 1.1358, + "step": 305 + }, + { + "epoch": 0.2849162011173184, + "grad_norm": 1.5743874311447144, + "learning_rate": 4.987003057686983e-06, + "loss": 1.1483, + "step": 306 + }, + { + "epoch": 0.28584729981378026, + "grad_norm": 1.6464556455612183, + "learning_rate": 4.986876678289964e-06, + "loss": 1.1897, + "step": 307 + }, + { + "epoch": 0.28677839851024206, + "grad_norm": 1.6144617795944214, + "learning_rate": 4.9867496890364734e-06, + "loss": 1.2007, + "step": 308 + }, + { + "epoch": 0.2877094972067039, + "grad_norm": 1.5669097900390625, + "learning_rate": 4.986622089957651e-06, + "loss": 1.212, + "step": 309 + }, + { + "epoch": 0.2886405959031657, + "grad_norm": 1.61880362033844, + "learning_rate": 4.986493881084789e-06, + "loss": 1.1408, + "step": 310 + }, + { + "epoch": 0.28957169459962756, + "grad_norm": 1.652443766593933, + "learning_rate": 4.986365062449328e-06, + "loss": 1.1441, + "step": 311 + }, + { + "epoch": 0.2905027932960894, + "grad_norm": 1.6292386054992676, + "learning_rate": 4.986235634082857e-06, + "loss": 1.1859, + "step": 312 + }, + { + "epoch": 0.2914338919925512, + "grad_norm": 1.5104798078536987, + "learning_rate": 4.986105596017118e-06, + "loss": 1.1938, + "step": 313 + }, + { + "epoch": 0.29236499068901306, + "grad_norm": 1.5984587669372559, + "learning_rate": 4.985974948283997e-06, + "loss": 1.1797, + "step": 314 + }, + { + "epoch": 0.29329608938547486, + "grad_norm": 1.568224310874939, + "learning_rate": 4.985843690915536e-06, + "loss": 1.1488, + "step": 315 + }, + { + "epoch": 0.2942271880819367, + "grad_norm": 1.6096382141113281, + "learning_rate": 4.985711823943921e-06, + "loss": 1.1292, + "step": 316 + }, + { + "epoch": 0.2951582867783985, + "grad_norm": 1.5105839967727661, + "learning_rate": 4.985579347401491e-06, + "loss": 1.1253, + "step": 317 + }, + { + "epoch": 0.29608938547486036, + "grad_norm": 1.5043179988861084, + "learning_rate": 4.985446261320732e-06, + "loss": 1.1599, + "step": 318 + }, + { + "epoch": 0.29702048417132215, + "grad_norm": 1.5188047885894775, + "learning_rate": 4.985312565734283e-06, + "loss": 1.1521, + "step": 319 + }, + { + "epoch": 0.297951582867784, + "grad_norm": 1.5714865922927856, + "learning_rate": 4.985178260674927e-06, + "loss": 1.1467, + "step": 320 + }, + { + "epoch": 0.2988826815642458, + "grad_norm": 1.6004184484481812, + "learning_rate": 4.985043346175602e-06, + "loss": 1.1482, + "step": 321 + }, + { + "epoch": 0.29981378026070765, + "grad_norm": 1.5001561641693115, + "learning_rate": 4.984907822269391e-06, + "loss": 1.1448, + "step": 322 + }, + { + "epoch": 0.30074487895716945, + "grad_norm": 1.4722920656204224, + "learning_rate": 4.984771688989532e-06, + "loss": 1.1172, + "step": 323 + }, + { + "epoch": 0.3016759776536313, + "grad_norm": 1.5229582786560059, + "learning_rate": 4.984634946369405e-06, + "loss": 1.1548, + "step": 324 + }, + { + "epoch": 0.3026070763500931, + "grad_norm": 1.5634204149246216, + "learning_rate": 4.984497594442545e-06, + "loss": 1.1541, + "step": 325 + }, + { + "epoch": 0.30353817504655495, + "grad_norm": 1.673488974571228, + "learning_rate": 4.984359633242636e-06, + "loss": 1.1821, + "step": 326 + }, + { + "epoch": 0.30446927374301674, + "grad_norm": 1.5141621828079224, + "learning_rate": 4.984221062803508e-06, + "loss": 1.1469, + "step": 327 + }, + { + "epoch": 0.3054003724394786, + "grad_norm": 1.5276451110839844, + "learning_rate": 4.984081883159144e-06, + "loss": 1.1489, + "step": 328 + }, + { + "epoch": 0.3063314711359404, + "grad_norm": 1.567543387413025, + "learning_rate": 4.983942094343675e-06, + "loss": 1.1288, + "step": 329 + }, + { + "epoch": 0.30726256983240224, + "grad_norm": 1.5250859260559082, + "learning_rate": 4.983801696391381e-06, + "loss": 1.1456, + "step": 330 + }, + { + "epoch": 0.30819366852886404, + "grad_norm": 1.6484228372573853, + "learning_rate": 4.983660689336692e-06, + "loss": 1.1432, + "step": 331 + }, + { + "epoch": 0.3091247672253259, + "grad_norm": 1.469672679901123, + "learning_rate": 4.983519073214186e-06, + "loss": 1.1402, + "step": 332 + }, + { + "epoch": 0.3100558659217877, + "grad_norm": 1.6095240116119385, + "learning_rate": 4.983376848058593e-06, + "loss": 1.1993, + "step": 333 + }, + { + "epoch": 0.31098696461824954, + "grad_norm": 1.5325345993041992, + "learning_rate": 4.983234013904791e-06, + "loss": 1.1637, + "step": 334 + }, + { + "epoch": 0.31191806331471134, + "grad_norm": 1.6400229930877686, + "learning_rate": 4.983090570787806e-06, + "loss": 1.1667, + "step": 335 + }, + { + "epoch": 0.3128491620111732, + "grad_norm": 1.539421558380127, + "learning_rate": 4.982946518742815e-06, + "loss": 1.1727, + "step": 336 + }, + { + "epoch": 0.313780260707635, + "grad_norm": 1.7082252502441406, + "learning_rate": 4.982801857805144e-06, + "loss": 1.1511, + "step": 337 + }, + { + "epoch": 0.31471135940409684, + "grad_norm": 1.596779227256775, + "learning_rate": 4.982656588010269e-06, + "loss": 1.1945, + "step": 338 + }, + { + "epoch": 0.31564245810055863, + "grad_norm": 1.6482007503509521, + "learning_rate": 4.982510709393813e-06, + "loss": 1.1822, + "step": 339 + }, + { + "epoch": 0.3165735567970205, + "grad_norm": 1.7460479736328125, + "learning_rate": 4.98236422199155e-06, + "loss": 1.1871, + "step": 340 + }, + { + "epoch": 0.31750465549348234, + "grad_norm": 1.5625799894332886, + "learning_rate": 4.982217125839403e-06, + "loss": 1.1541, + "step": 341 + }, + { + "epoch": 0.31843575418994413, + "grad_norm": 1.5512086153030396, + "learning_rate": 4.982069420973446e-06, + "loss": 1.1585, + "step": 342 + }, + { + "epoch": 0.319366852886406, + "grad_norm": 1.670642614364624, + "learning_rate": 4.9819211074299e-06, + "loss": 1.1908, + "step": 343 + }, + { + "epoch": 0.3202979515828678, + "grad_norm": 1.5584107637405396, + "learning_rate": 4.981772185245135e-06, + "loss": 1.075, + "step": 344 + }, + { + "epoch": 0.32122905027932963, + "grad_norm": 1.5482008457183838, + "learning_rate": 4.9816226544556725e-06, + "loss": 1.1959, + "step": 345 + }, + { + "epoch": 0.3221601489757914, + "grad_norm": 1.5843377113342285, + "learning_rate": 4.98147251509818e-06, + "loss": 1.188, + "step": 346 + }, + { + "epoch": 0.3230912476722533, + "grad_norm": 1.5460178852081299, + "learning_rate": 4.981321767209477e-06, + "loss": 1.1275, + "step": 347 + }, + { + "epoch": 0.3240223463687151, + "grad_norm": 1.645349383354187, + "learning_rate": 4.981170410826532e-06, + "loss": 1.126, + "step": 348 + }, + { + "epoch": 0.3249534450651769, + "grad_norm": 1.5918056964874268, + "learning_rate": 4.9810184459864615e-06, + "loss": 1.1475, + "step": 349 + }, + { + "epoch": 0.3258845437616387, + "grad_norm": 1.502294659614563, + "learning_rate": 4.980865872726532e-06, + "loss": 1.1257, + "step": 350 + }, + { + "epoch": 0.3268156424581006, + "grad_norm": 1.5638993978500366, + "learning_rate": 4.9807126910841595e-06, + "loss": 1.1593, + "step": 351 + }, + { + "epoch": 0.32774674115456237, + "grad_norm": 1.7481025457382202, + "learning_rate": 4.980558901096908e-06, + "loss": 1.1387, + "step": 352 + }, + { + "epoch": 0.3286778398510242, + "grad_norm": 1.622081995010376, + "learning_rate": 4.980404502802492e-06, + "loss": 1.1767, + "step": 353 + }, + { + "epoch": 0.329608938547486, + "grad_norm": 1.5219521522521973, + "learning_rate": 4.980249496238774e-06, + "loss": 1.1513, + "step": 354 + }, + { + "epoch": 0.33054003724394787, + "grad_norm": 1.5893964767456055, + "learning_rate": 4.980093881443766e-06, + "loss": 1.1612, + "step": 355 + }, + { + "epoch": 0.33147113594040967, + "grad_norm": 1.650078535079956, + "learning_rate": 4.9799376584556304e-06, + "loss": 1.0907, + "step": 356 + }, + { + "epoch": 0.3324022346368715, + "grad_norm": 1.578399419784546, + "learning_rate": 4.9797808273126765e-06, + "loss": 1.1579, + "step": 357 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.5139063596725464, + "learning_rate": 4.979623388053366e-06, + "loss": 1.1327, + "step": 358 + }, + { + "epoch": 0.33426443202979517, + "grad_norm": 1.5960277318954468, + "learning_rate": 4.979465340716306e-06, + "loss": 1.1434, + "step": 359 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 1.5807005167007446, + "learning_rate": 4.9793066853402535e-06, + "loss": 1.1833, + "step": 360 + }, + { + "epoch": 0.3361266294227188, + "grad_norm": 1.5984652042388916, + "learning_rate": 4.979147421964119e-06, + "loss": 1.1528, + "step": 361 + }, + { + "epoch": 0.3370577281191806, + "grad_norm": 1.5312142372131348, + "learning_rate": 4.978987550626955e-06, + "loss": 1.1644, + "step": 362 + }, + { + "epoch": 0.33798882681564246, + "grad_norm": 1.5473692417144775, + "learning_rate": 4.978827071367969e-06, + "loss": 1.1445, + "step": 363 + }, + { + "epoch": 0.33891992551210426, + "grad_norm": 1.6537644863128662, + "learning_rate": 4.978665984226514e-06, + "loss": 1.1617, + "step": 364 + }, + { + "epoch": 0.3398510242085661, + "grad_norm": 1.6679675579071045, + "learning_rate": 4.978504289242094e-06, + "loss": 1.1474, + "step": 365 + }, + { + "epoch": 0.3407821229050279, + "grad_norm": 1.507922649383545, + "learning_rate": 4.978341986454363e-06, + "loss": 1.1524, + "step": 366 + }, + { + "epoch": 0.34171322160148976, + "grad_norm": 1.6119061708450317, + "learning_rate": 4.978179075903119e-06, + "loss": 1.1789, + "step": 367 + }, + { + "epoch": 0.3426443202979516, + "grad_norm": 1.5990979671478271, + "learning_rate": 4.978015557628315e-06, + "loss": 1.1562, + "step": 368 + }, + { + "epoch": 0.3435754189944134, + "grad_norm": 1.6053398847579956, + "learning_rate": 4.9778514316700496e-06, + "loss": 1.1645, + "step": 369 + }, + { + "epoch": 0.34450651769087526, + "grad_norm": 1.5216419696807861, + "learning_rate": 4.977686698068572e-06, + "loss": 1.1137, + "step": 370 + }, + { + "epoch": 0.34543761638733705, + "grad_norm": 1.5150953531265259, + "learning_rate": 4.9775213568642796e-06, + "loss": 1.1092, + "step": 371 + }, + { + "epoch": 0.3463687150837989, + "grad_norm": 1.616699457168579, + "learning_rate": 4.97735540809772e-06, + "loss": 1.1725, + "step": 372 + }, + { + "epoch": 0.3472998137802607, + "grad_norm": 1.5947240591049194, + "learning_rate": 4.9771888518095855e-06, + "loss": 1.1512, + "step": 373 + }, + { + "epoch": 0.34823091247672255, + "grad_norm": 1.6488538980484009, + "learning_rate": 4.977021688040724e-06, + "loss": 1.1565, + "step": 374 + }, + { + "epoch": 0.34916201117318435, + "grad_norm": 1.4983750581741333, + "learning_rate": 4.976853916832129e-06, + "loss": 1.112, + "step": 375 + }, + { + "epoch": 0.3500931098696462, + "grad_norm": 1.5410925149917603, + "learning_rate": 4.9766855382249414e-06, + "loss": 1.1333, + "step": 376 + }, + { + "epoch": 0.351024208566108, + "grad_norm": 1.7927250862121582, + "learning_rate": 4.976516552260453e-06, + "loss": 1.1381, + "step": 377 + }, + { + "epoch": 0.35195530726256985, + "grad_norm": 1.568892478942871, + "learning_rate": 4.976346958980105e-06, + "loss": 1.1423, + "step": 378 + }, + { + "epoch": 0.35288640595903165, + "grad_norm": 1.5994700193405151, + "learning_rate": 4.9761767584254855e-06, + "loss": 1.1756, + "step": 379 + }, + { + "epoch": 0.3538175046554935, + "grad_norm": 1.5248911380767822, + "learning_rate": 4.976005950638334e-06, + "loss": 1.1473, + "step": 380 + }, + { + "epoch": 0.3547486033519553, + "grad_norm": 1.5898964405059814, + "learning_rate": 4.975834535660538e-06, + "loss": 1.1558, + "step": 381 + }, + { + "epoch": 0.35567970204841715, + "grad_norm": 1.5030688047409058, + "learning_rate": 4.975662513534131e-06, + "loss": 1.0974, + "step": 382 + }, + { + "epoch": 0.35661080074487894, + "grad_norm": 1.5825740098953247, + "learning_rate": 4.975489884301301e-06, + "loss": 1.1819, + "step": 383 + }, + { + "epoch": 0.3575418994413408, + "grad_norm": 1.5107347965240479, + "learning_rate": 4.97531664800438e-06, + "loss": 1.1406, + "step": 384 + }, + { + "epoch": 0.3584729981378026, + "grad_norm": 1.5390645265579224, + "learning_rate": 4.975142804685851e-06, + "loss": 1.1333, + "step": 385 + }, + { + "epoch": 0.35940409683426444, + "grad_norm": 1.5294268131256104, + "learning_rate": 4.974968354388346e-06, + "loss": 1.177, + "step": 386 + }, + { + "epoch": 0.36033519553072624, + "grad_norm": 1.518444538116455, + "learning_rate": 4.974793297154645e-06, + "loss": 1.0785, + "step": 387 + }, + { + "epoch": 0.3612662942271881, + "grad_norm": 1.7795239686965942, + "learning_rate": 4.974617633027678e-06, + "loss": 1.1469, + "step": 388 + }, + { + "epoch": 0.3621973929236499, + "grad_norm": 1.5898357629776, + "learning_rate": 4.974441362050523e-06, + "loss": 1.105, + "step": 389 + }, + { + "epoch": 0.36312849162011174, + "grad_norm": 1.6565510034561157, + "learning_rate": 4.9742644842664066e-06, + "loss": 1.1728, + "step": 390 + }, + { + "epoch": 0.36405959031657353, + "grad_norm": 1.6090935468673706, + "learning_rate": 4.9740869997187035e-06, + "loss": 1.1556, + "step": 391 + }, + { + "epoch": 0.3649906890130354, + "grad_norm": 1.5549203157424927, + "learning_rate": 4.97390890845094e-06, + "loss": 1.1392, + "step": 392 + }, + { + "epoch": 0.3659217877094972, + "grad_norm": 1.576033353805542, + "learning_rate": 4.973730210506788e-06, + "loss": 1.1421, + "step": 393 + }, + { + "epoch": 0.36685288640595903, + "grad_norm": 1.5537548065185547, + "learning_rate": 4.97355090593007e-06, + "loss": 1.1855, + "step": 394 + }, + { + "epoch": 0.36778398510242083, + "grad_norm": 1.558398723602295, + "learning_rate": 4.973370994764758e-06, + "loss": 1.1753, + "step": 395 + }, + { + "epoch": 0.3687150837988827, + "grad_norm": 1.5602357387542725, + "learning_rate": 4.97319047705497e-06, + "loss": 1.1199, + "step": 396 + }, + { + "epoch": 0.36964618249534453, + "grad_norm": 1.4839043617248535, + "learning_rate": 4.973009352844974e-06, + "loss": 1.128, + "step": 397 + }, + { + "epoch": 0.37057728119180633, + "grad_norm": 1.548038125038147, + "learning_rate": 4.9728276221791895e-06, + "loss": 1.1709, + "step": 398 + }, + { + "epoch": 0.3715083798882682, + "grad_norm": 1.5458474159240723, + "learning_rate": 4.9726452851021804e-06, + "loss": 1.1752, + "step": 399 + }, + { + "epoch": 0.37243947858473, + "grad_norm": 1.4915302991867065, + "learning_rate": 4.972462341658661e-06, + "loss": 1.1563, + "step": 400 + }, + { + "epoch": 0.37337057728119183, + "grad_norm": 1.5692366361618042, + "learning_rate": 4.972278791893496e-06, + "loss": 1.1479, + "step": 401 + }, + { + "epoch": 0.3743016759776536, + "grad_norm": 1.5486400127410889, + "learning_rate": 4.972094635851695e-06, + "loss": 1.1555, + "step": 402 + }, + { + "epoch": 0.3752327746741155, + "grad_norm": 1.509786605834961, + "learning_rate": 4.971909873578421e-06, + "loss": 1.1315, + "step": 403 + }, + { + "epoch": 0.3761638733705773, + "grad_norm": 1.497616171836853, + "learning_rate": 4.971724505118982e-06, + "loss": 1.1305, + "step": 404 + }, + { + "epoch": 0.3770949720670391, + "grad_norm": 1.6088786125183105, + "learning_rate": 4.971538530518836e-06, + "loss": 1.1403, + "step": 405 + }, + { + "epoch": 0.3780260707635009, + "grad_norm": 1.5144068002700806, + "learning_rate": 4.971351949823589e-06, + "loss": 1.1195, + "step": 406 + }, + { + "epoch": 0.3789571694599628, + "grad_norm": 1.5215985774993896, + "learning_rate": 4.971164763078998e-06, + "loss": 1.1357, + "step": 407 + }, + { + "epoch": 0.37988826815642457, + "grad_norm": 1.4947015047073364, + "learning_rate": 4.970976970330964e-06, + "loss": 1.1286, + "step": 408 + }, + { + "epoch": 0.3808193668528864, + "grad_norm": 1.6067999601364136, + "learning_rate": 4.970788571625542e-06, + "loss": 1.1786, + "step": 409 + }, + { + "epoch": 0.3817504655493482, + "grad_norm": 1.657520055770874, + "learning_rate": 4.970599567008931e-06, + "loss": 1.143, + "step": 410 + }, + { + "epoch": 0.38268156424581007, + "grad_norm": 1.4607744216918945, + "learning_rate": 4.970409956527483e-06, + "loss": 1.1543, + "step": 411 + }, + { + "epoch": 0.38361266294227186, + "grad_norm": 1.51872718334198, + "learning_rate": 4.970219740227693e-06, + "loss": 1.1456, + "step": 412 + }, + { + "epoch": 0.3845437616387337, + "grad_norm": 1.6181915998458862, + "learning_rate": 4.97002891815621e-06, + "loss": 1.1318, + "step": 413 + }, + { + "epoch": 0.3854748603351955, + "grad_norm": 1.5810741186141968, + "learning_rate": 4.969837490359829e-06, + "loss": 1.1315, + "step": 414 + }, + { + "epoch": 0.38640595903165736, + "grad_norm": 1.5425630807876587, + "learning_rate": 4.969645456885493e-06, + "loss": 1.1152, + "step": 415 + }, + { + "epoch": 0.38733705772811916, + "grad_norm": 1.6084840297698975, + "learning_rate": 4.969452817780295e-06, + "loss": 1.1991, + "step": 416 + }, + { + "epoch": 0.388268156424581, + "grad_norm": 1.6024158000946045, + "learning_rate": 4.969259573091476e-06, + "loss": 1.1318, + "step": 417 + }, + { + "epoch": 0.3891992551210428, + "grad_norm": 1.6052086353302002, + "learning_rate": 4.9690657228664244e-06, + "loss": 1.1744, + "step": 418 + }, + { + "epoch": 0.39013035381750466, + "grad_norm": 1.4998224973678589, + "learning_rate": 4.9688712671526786e-06, + "loss": 1.0993, + "step": 419 + }, + { + "epoch": 0.39106145251396646, + "grad_norm": 1.5480080842971802, + "learning_rate": 4.9686762059979255e-06, + "loss": 1.1203, + "step": 420 + }, + { + "epoch": 0.3919925512104283, + "grad_norm": 1.593430519104004, + "learning_rate": 4.968480539449999e-06, + "loss": 1.1335, + "step": 421 + }, + { + "epoch": 0.3929236499068901, + "grad_norm": 1.5293151140213013, + "learning_rate": 4.968284267556883e-06, + "loss": 1.1365, + "step": 422 + }, + { + "epoch": 0.39385474860335196, + "grad_norm": 1.5110721588134766, + "learning_rate": 4.9680873903667095e-06, + "loss": 1.1576, + "step": 423 + }, + { + "epoch": 0.3947858472998138, + "grad_norm": 1.6544134616851807, + "learning_rate": 4.967889907927758e-06, + "loss": 1.1416, + "step": 424 + }, + { + "epoch": 0.3957169459962756, + "grad_norm": 1.583020567893982, + "learning_rate": 4.967691820288457e-06, + "loss": 1.1544, + "step": 425 + }, + { + "epoch": 0.39664804469273746, + "grad_norm": 1.6162288188934326, + "learning_rate": 4.967493127497385e-06, + "loss": 1.1582, + "step": 426 + }, + { + "epoch": 0.39757914338919925, + "grad_norm": 1.5714863538742065, + "learning_rate": 4.9672938296032656e-06, + "loss": 1.1535, + "step": 427 + }, + { + "epoch": 0.3985102420856611, + "grad_norm": 1.587739109992981, + "learning_rate": 4.967093926654973e-06, + "loss": 1.1497, + "step": 428 + }, + { + "epoch": 0.3994413407821229, + "grad_norm": 1.5518815517425537, + "learning_rate": 4.966893418701529e-06, + "loss": 1.1046, + "step": 429 + }, + { + "epoch": 0.40037243947858475, + "grad_norm": 1.57876718044281, + "learning_rate": 4.966692305792106e-06, + "loss": 1.1518, + "step": 430 + }, + { + "epoch": 0.40130353817504655, + "grad_norm": 1.5824034214019775, + "learning_rate": 4.966490587976021e-06, + "loss": 1.1365, + "step": 431 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 1.6374784708023071, + "learning_rate": 4.9662882653027434e-06, + "loss": 1.1732, + "step": 432 + }, + { + "epoch": 0.4031657355679702, + "grad_norm": 1.6593799591064453, + "learning_rate": 4.966085337821886e-06, + "loss": 1.1394, + "step": 433 + }, + { + "epoch": 0.40409683426443205, + "grad_norm": 1.595018982887268, + "learning_rate": 4.965881805583214e-06, + "loss": 1.0903, + "step": 434 + }, + { + "epoch": 0.40502793296089384, + "grad_norm": 1.5136500597000122, + "learning_rate": 4.965677668636639e-06, + "loss": 1.0883, + "step": 435 + }, + { + "epoch": 0.4059590316573557, + "grad_norm": 1.5129704475402832, + "learning_rate": 4.9654729270322236e-06, + "loss": 1.0843, + "step": 436 + }, + { + "epoch": 0.4068901303538175, + "grad_norm": 1.9252679347991943, + "learning_rate": 4.965267580820173e-06, + "loss": 1.1544, + "step": 437 + }, + { + "epoch": 0.40782122905027934, + "grad_norm": 1.6633261442184448, + "learning_rate": 4.965061630050848e-06, + "loss": 1.1798, + "step": 438 + }, + { + "epoch": 0.40875232774674114, + "grad_norm": 1.5632121562957764, + "learning_rate": 4.964855074774751e-06, + "loss": 1.1276, + "step": 439 + }, + { + "epoch": 0.409683426443203, + "grad_norm": 1.54603910446167, + "learning_rate": 4.964647915042537e-06, + "loss": 1.1236, + "step": 440 + }, + { + "epoch": 0.4106145251396648, + "grad_norm": 1.5920768976211548, + "learning_rate": 4.964440150905008e-06, + "loss": 1.1273, + "step": 441 + }, + { + "epoch": 0.41154562383612664, + "grad_norm": 1.5279273986816406, + "learning_rate": 4.9642317824131125e-06, + "loss": 1.1274, + "step": 442 + }, + { + "epoch": 0.41247672253258844, + "grad_norm": 1.5296270847320557, + "learning_rate": 4.96402280961795e-06, + "loss": 1.1374, + "step": 443 + }, + { + "epoch": 0.4134078212290503, + "grad_norm": 1.5581321716308594, + "learning_rate": 4.963813232570767e-06, + "loss": 1.1089, + "step": 444 + }, + { + "epoch": 0.4143389199255121, + "grad_norm": 1.5483899116516113, + "learning_rate": 4.963603051322956e-06, + "loss": 1.1198, + "step": 445 + }, + { + "epoch": 0.41527001862197394, + "grad_norm": 1.6096707582473755, + "learning_rate": 4.963392265926062e-06, + "loss": 1.1416, + "step": 446 + }, + { + "epoch": 0.41620111731843573, + "grad_norm": 1.5583285093307495, + "learning_rate": 4.963180876431775e-06, + "loss": 1.1425, + "step": 447 + }, + { + "epoch": 0.4171322160148976, + "grad_norm": 1.6165952682495117, + "learning_rate": 4.962968882891934e-06, + "loss": 1.1218, + "step": 448 + }, + { + "epoch": 0.4180633147113594, + "grad_norm": 1.6122556924819946, + "learning_rate": 4.962756285358527e-06, + "loss": 1.2283, + "step": 449 + }, + { + "epoch": 0.41899441340782123, + "grad_norm": 1.5779941082000732, + "learning_rate": 4.9625430838836875e-06, + "loss": 1.1163, + "step": 450 + }, + { + "epoch": 0.419925512104283, + "grad_norm": 1.5453237295150757, + "learning_rate": 4.9623292785197e-06, + "loss": 1.1109, + "step": 451 + }, + { + "epoch": 0.4208566108007449, + "grad_norm": 1.4900524616241455, + "learning_rate": 4.962114869318996e-06, + "loss": 1.1068, + "step": 452 + }, + { + "epoch": 0.42178770949720673, + "grad_norm": 1.582170009613037, + "learning_rate": 4.961899856334155e-06, + "loss": 1.1642, + "step": 453 + }, + { + "epoch": 0.4227188081936685, + "grad_norm": 1.568885326385498, + "learning_rate": 4.961684239617904e-06, + "loss": 1.1053, + "step": 454 + }, + { + "epoch": 0.4236499068901304, + "grad_norm": 1.4655330181121826, + "learning_rate": 4.96146801922312e-06, + "loss": 1.107, + "step": 455 + }, + { + "epoch": 0.4245810055865922, + "grad_norm": 1.5923532247543335, + "learning_rate": 4.961251195202825e-06, + "loss": 1.1428, + "step": 456 + }, + { + "epoch": 0.425512104283054, + "grad_norm": 1.5150216817855835, + "learning_rate": 4.961033767610193e-06, + "loss": 1.1066, + "step": 457 + }, + { + "epoch": 0.4264432029795158, + "grad_norm": 1.5623857975006104, + "learning_rate": 4.960815736498541e-06, + "loss": 1.1533, + "step": 458 + }, + { + "epoch": 0.4273743016759777, + "grad_norm": 1.6647369861602783, + "learning_rate": 4.960597101921338e-06, + "loss": 1.1528, + "step": 459 + }, + { + "epoch": 0.42830540037243947, + "grad_norm": 1.5871447324752808, + "learning_rate": 4.960377863932201e-06, + "loss": 1.1001, + "step": 460 + }, + { + "epoch": 0.4292364990689013, + "grad_norm": 1.4906527996063232, + "learning_rate": 4.9601580225848914e-06, + "loss": 1.0991, + "step": 461 + }, + { + "epoch": 0.4301675977653631, + "grad_norm": 1.5552783012390137, + "learning_rate": 4.959937577933323e-06, + "loss": 1.1788, + "step": 462 + }, + { + "epoch": 0.43109869646182497, + "grad_norm": 1.5226411819458008, + "learning_rate": 4.959716530031553e-06, + "loss": 1.1406, + "step": 463 + }, + { + "epoch": 0.43202979515828677, + "grad_norm": 1.5362269878387451, + "learning_rate": 4.959494878933792e-06, + "loss": 1.0898, + "step": 464 + }, + { + "epoch": 0.4329608938547486, + "grad_norm": 1.5201159715652466, + "learning_rate": 4.9592726246943924e-06, + "loss": 1.0861, + "step": 465 + }, + { + "epoch": 0.4338919925512104, + "grad_norm": 1.6300363540649414, + "learning_rate": 4.95904976736786e-06, + "loss": 1.1056, + "step": 466 + }, + { + "epoch": 0.43482309124767227, + "grad_norm": 1.5079962015151978, + "learning_rate": 4.9588263070088435e-06, + "loss": 1.0943, + "step": 467 + }, + { + "epoch": 0.43575418994413406, + "grad_norm": 1.5123474597930908, + "learning_rate": 4.958602243672145e-06, + "loss": 1.0997, + "step": 468 + }, + { + "epoch": 0.4366852886405959, + "grad_norm": 1.614089846611023, + "learning_rate": 4.95837757741271e-06, + "loss": 1.146, + "step": 469 + }, + { + "epoch": 0.4376163873370577, + "grad_norm": 1.557093620300293, + "learning_rate": 4.9581523082856335e-06, + "loss": 1.1745, + "step": 470 + }, + { + "epoch": 0.43854748603351956, + "grad_norm": 1.6429771184921265, + "learning_rate": 4.957926436346158e-06, + "loss": 1.084, + "step": 471 + }, + { + "epoch": 0.43947858472998136, + "grad_norm": 1.5478867292404175, + "learning_rate": 4.9576999616496744e-06, + "loss": 1.1426, + "step": 472 + }, + { + "epoch": 0.4404096834264432, + "grad_norm": 1.6172984838485718, + "learning_rate": 4.957472884251722e-06, + "loss": 1.1599, + "step": 473 + }, + { + "epoch": 0.441340782122905, + "grad_norm": 1.5386159420013428, + "learning_rate": 4.9572452042079845e-06, + "loss": 1.1179, + "step": 474 + }, + { + "epoch": 0.44227188081936686, + "grad_norm": 1.5015331506729126, + "learning_rate": 4.957016921574298e-06, + "loss": 1.1114, + "step": 475 + }, + { + "epoch": 0.44320297951582865, + "grad_norm": 1.5971821546554565, + "learning_rate": 4.9567880364066435e-06, + "loss": 1.0885, + "step": 476 + }, + { + "epoch": 0.4441340782122905, + "grad_norm": 1.5242514610290527, + "learning_rate": 4.95655854876115e-06, + "loss": 1.0912, + "step": 477 + }, + { + "epoch": 0.4450651769087523, + "grad_norm": 1.5206764936447144, + "learning_rate": 4.956328458694096e-06, + "loss": 1.143, + "step": 478 + }, + { + "epoch": 0.44599627560521415, + "grad_norm": 1.595788836479187, + "learning_rate": 4.956097766261905e-06, + "loss": 1.1071, + "step": 479 + }, + { + "epoch": 0.44692737430167595, + "grad_norm": 2.1942999362945557, + "learning_rate": 4.95586647152115e-06, + "loss": 1.1245, + "step": 480 + }, + { + "epoch": 0.4478584729981378, + "grad_norm": 1.5881682634353638, + "learning_rate": 4.955634574528553e-06, + "loss": 1.1411, + "step": 481 + }, + { + "epoch": 0.44878957169459965, + "grad_norm": 1.6564148664474487, + "learning_rate": 4.95540207534098e-06, + "loss": 1.1259, + "step": 482 + }, + { + "epoch": 0.44972067039106145, + "grad_norm": 1.5351526737213135, + "learning_rate": 4.955168974015447e-06, + "loss": 1.1233, + "step": 483 + }, + { + "epoch": 0.4506517690875233, + "grad_norm": 1.5172646045684814, + "learning_rate": 4.954935270609119e-06, + "loss": 1.0793, + "step": 484 + }, + { + "epoch": 0.4515828677839851, + "grad_norm": 1.5817103385925293, + "learning_rate": 4.954700965179306e-06, + "loss": 1.0754, + "step": 485 + }, + { + "epoch": 0.45251396648044695, + "grad_norm": 1.6204304695129395, + "learning_rate": 4.954466057783466e-06, + "loss": 1.1562, + "step": 486 + }, + { + "epoch": 0.45344506517690875, + "grad_norm": 1.5794012546539307, + "learning_rate": 4.9542305484792066e-06, + "loss": 1.122, + "step": 487 + }, + { + "epoch": 0.4543761638733706, + "grad_norm": 1.6076405048370361, + "learning_rate": 4.953994437324281e-06, + "loss": 1.1669, + "step": 488 + }, + { + "epoch": 0.4553072625698324, + "grad_norm": 1.5874429941177368, + "learning_rate": 4.953757724376591e-06, + "loss": 1.1477, + "step": 489 + }, + { + "epoch": 0.45623836126629425, + "grad_norm": 1.6093076467514038, + "learning_rate": 4.953520409694186e-06, + "loss": 1.1499, + "step": 490 + }, + { + "epoch": 0.45716945996275604, + "grad_norm": 1.5191084146499634, + "learning_rate": 4.953282493335261e-06, + "loss": 1.1399, + "step": 491 + }, + { + "epoch": 0.4581005586592179, + "grad_norm": 1.5259243249893188, + "learning_rate": 4.953043975358162e-06, + "loss": 1.1177, + "step": 492 + }, + { + "epoch": 0.4590316573556797, + "grad_norm": 1.5378997325897217, + "learning_rate": 4.95280485582138e-06, + "loss": 1.1194, + "step": 493 + }, + { + "epoch": 0.45996275605214154, + "grad_norm": 1.5503298044204712, + "learning_rate": 4.952565134783554e-06, + "loss": 1.1356, + "step": 494 + }, + { + "epoch": 0.46089385474860334, + "grad_norm": 1.6783236265182495, + "learning_rate": 4.952324812303473e-06, + "loss": 1.1194, + "step": 495 + }, + { + "epoch": 0.4618249534450652, + "grad_norm": 1.6769311428070068, + "learning_rate": 4.952083888440068e-06, + "loss": 1.0884, + "step": 496 + }, + { + "epoch": 0.462756052141527, + "grad_norm": 1.6050509214401245, + "learning_rate": 4.951842363252421e-06, + "loss": 1.1494, + "step": 497 + }, + { + "epoch": 0.46368715083798884, + "grad_norm": 1.5492044687271118, + "learning_rate": 4.951600236799765e-06, + "loss": 1.1018, + "step": 498 + }, + { + "epoch": 0.46461824953445063, + "grad_norm": 1.689003586769104, + "learning_rate": 4.951357509141472e-06, + "loss": 1.15, + "step": 499 + }, + { + "epoch": 0.4655493482309125, + "grad_norm": 1.5559686422348022, + "learning_rate": 4.951114180337068e-06, + "loss": 1.0971, + "step": 500 + }, + { + "epoch": 0.4664804469273743, + "grad_norm": 1.630271315574646, + "learning_rate": 4.950870250446226e-06, + "loss": 1.1491, + "step": 501 + }, + { + "epoch": 0.46741154562383613, + "grad_norm": 1.5715354681015015, + "learning_rate": 4.950625719528762e-06, + "loss": 1.1375, + "step": 502 + }, + { + "epoch": 0.46834264432029793, + "grad_norm": 1.5592912435531616, + "learning_rate": 4.950380587644645e-06, + "loss": 1.1869, + "step": 503 + }, + { + "epoch": 0.4692737430167598, + "grad_norm": 1.5133591890335083, + "learning_rate": 4.950134854853986e-06, + "loss": 1.1025, + "step": 504 + }, + { + "epoch": 0.4702048417132216, + "grad_norm": 1.5977835655212402, + "learning_rate": 4.949888521217049e-06, + "loss": 1.1079, + "step": 505 + }, + { + "epoch": 0.47113594040968343, + "grad_norm": 1.5206948518753052, + "learning_rate": 4.949641586794239e-06, + "loss": 1.1285, + "step": 506 + }, + { + "epoch": 0.4720670391061452, + "grad_norm": 1.591206669807434, + "learning_rate": 4.949394051646115e-06, + "loss": 1.1545, + "step": 507 + }, + { + "epoch": 0.4729981378026071, + "grad_norm": 1.546183466911316, + "learning_rate": 4.9491459158333775e-06, + "loss": 1.1069, + "step": 508 + }, + { + "epoch": 0.47392923649906893, + "grad_norm": 1.6251070499420166, + "learning_rate": 4.9488971794168785e-06, + "loss": 1.1175, + "step": 509 + }, + { + "epoch": 0.4748603351955307, + "grad_norm": 1.5359351634979248, + "learning_rate": 4.948647842457615e-06, + "loss": 1.0687, + "step": 510 + }, + { + "epoch": 0.4757914338919926, + "grad_norm": 1.519960880279541, + "learning_rate": 4.948397905016731e-06, + "loss": 1.115, + "step": 511 + }, + { + "epoch": 0.4767225325884544, + "grad_norm": 1.5621953010559082, + "learning_rate": 4.94814736715552e-06, + "loss": 1.1167, + "step": 512 + }, + { + "epoch": 0.4776536312849162, + "grad_norm": 1.575027346611023, + "learning_rate": 4.947896228935421e-06, + "loss": 1.1495, + "step": 513 + }, + { + "epoch": 0.478584729981378, + "grad_norm": 1.539473295211792, + "learning_rate": 4.94764449041802e-06, + "loss": 1.146, + "step": 514 + }, + { + "epoch": 0.4795158286778399, + "grad_norm": 1.5453004837036133, + "learning_rate": 4.94739215166505e-06, + "loss": 1.1036, + "step": 515 + }, + { + "epoch": 0.48044692737430167, + "grad_norm": 1.6224074363708496, + "learning_rate": 4.947139212738395e-06, + "loss": 1.1356, + "step": 516 + }, + { + "epoch": 0.4813780260707635, + "grad_norm": 1.5118556022644043, + "learning_rate": 4.946885673700081e-06, + "loss": 1.1427, + "step": 517 + }, + { + "epoch": 0.4823091247672253, + "grad_norm": 1.4834543466567993, + "learning_rate": 4.9466315346122825e-06, + "loss": 1.0838, + "step": 518 + }, + { + "epoch": 0.48324022346368717, + "grad_norm": 1.6373393535614014, + "learning_rate": 4.946376795537325e-06, + "loss": 1.1342, + "step": 519 + }, + { + "epoch": 0.48417132216014896, + "grad_norm": 1.6570336818695068, + "learning_rate": 4.946121456537676e-06, + "loss": 1.1947, + "step": 520 + }, + { + "epoch": 0.4851024208566108, + "grad_norm": 1.5891071557998657, + "learning_rate": 4.9458655176759515e-06, + "loss": 1.1161, + "step": 521 + }, + { + "epoch": 0.4860335195530726, + "grad_norm": 1.5352654457092285, + "learning_rate": 4.945608979014917e-06, + "loss": 1.1303, + "step": 522 + }, + { + "epoch": 0.48696461824953446, + "grad_norm": 1.565674901008606, + "learning_rate": 4.9453518406174835e-06, + "loss": 1.146, + "step": 523 + }, + { + "epoch": 0.48789571694599626, + "grad_norm": 1.6120684146881104, + "learning_rate": 4.945094102546708e-06, + "loss": 1.1477, + "step": 524 + }, + { + "epoch": 0.4888268156424581, + "grad_norm": 1.664642572402954, + "learning_rate": 4.944835764865796e-06, + "loss": 1.1291, + "step": 525 + }, + { + "epoch": 0.4897579143389199, + "grad_norm": 1.5092589855194092, + "learning_rate": 4.944576827638099e-06, + "loss": 1.0896, + "step": 526 + }, + { + "epoch": 0.49068901303538176, + "grad_norm": 1.5625349283218384, + "learning_rate": 4.9443172909271174e-06, + "loss": 1.1628, + "step": 527 + }, + { + "epoch": 0.49162011173184356, + "grad_norm": 1.5484577417373657, + "learning_rate": 4.9440571547964964e-06, + "loss": 1.0536, + "step": 528 + }, + { + "epoch": 0.4925512104283054, + "grad_norm": 1.5750197172164917, + "learning_rate": 4.94379641931003e-06, + "loss": 1.1176, + "step": 529 + }, + { + "epoch": 0.4934823091247672, + "grad_norm": 1.5709573030471802, + "learning_rate": 4.943535084531658e-06, + "loss": 1.0891, + "step": 530 + }, + { + "epoch": 0.49441340782122906, + "grad_norm": 1.5684819221496582, + "learning_rate": 4.943273150525467e-06, + "loss": 1.0996, + "step": 531 + }, + { + "epoch": 0.49534450651769085, + "grad_norm": 1.5334537029266357, + "learning_rate": 4.943010617355691e-06, + "loss": 1.1499, + "step": 532 + }, + { + "epoch": 0.4962756052141527, + "grad_norm": 1.5027036666870117, + "learning_rate": 4.942747485086712e-06, + "loss": 1.0837, + "step": 533 + }, + { + "epoch": 0.4972067039106145, + "grad_norm": 1.515656590461731, + "learning_rate": 4.942483753783056e-06, + "loss": 1.0909, + "step": 534 + }, + { + "epoch": 0.49813780260707635, + "grad_norm": 1.5232927799224854, + "learning_rate": 4.9422194235094e-06, + "loss": 1.1155, + "step": 535 + }, + { + "epoch": 0.49906890130353815, + "grad_norm": 1.5004891157150269, + "learning_rate": 4.9419544943305645e-06, + "loss": 1.0875, + "step": 536 + }, + { + "epoch": 0.5, + "grad_norm": 1.5596823692321777, + "learning_rate": 4.941688966311519e-06, + "loss": 1.1505, + "step": 537 + }, + { + "epoch": 0.5009310986964618, + "grad_norm": 1.6406277418136597, + "learning_rate": 4.941422839517377e-06, + "loss": 1.1892, + "step": 538 + }, + { + "epoch": 0.5018621973929237, + "grad_norm": 1.5456867218017578, + "learning_rate": 4.941156114013403e-06, + "loss": 1.1329, + "step": 539 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 1.4837782382965088, + "learning_rate": 4.940888789865004e-06, + "loss": 1.1124, + "step": 540 + }, + { + "epoch": 0.5037243947858473, + "grad_norm": 1.4902780055999756, + "learning_rate": 4.940620867137736e-06, + "loss": 1.1124, + "step": 541 + }, + { + "epoch": 0.5046554934823091, + "grad_norm": 1.4913779497146606, + "learning_rate": 4.940352345897304e-06, + "loss": 1.1245, + "step": 542 + }, + { + "epoch": 0.505586592178771, + "grad_norm": 1.4790959358215332, + "learning_rate": 4.940083226209555e-06, + "loss": 1.0903, + "step": 543 + }, + { + "epoch": 0.5065176908752328, + "grad_norm": 1.5012257099151611, + "learning_rate": 4.939813508140487e-06, + "loss": 1.1274, + "step": 544 + }, + { + "epoch": 0.5074487895716946, + "grad_norm": 1.7029085159301758, + "learning_rate": 4.9395431917562416e-06, + "loss": 1.1124, + "step": 545 + }, + { + "epoch": 0.5083798882681564, + "grad_norm": 1.5168663263320923, + "learning_rate": 4.939272277123109e-06, + "loss": 1.1341, + "step": 546 + }, + { + "epoch": 0.5093109869646183, + "grad_norm": 1.5790756940841675, + "learning_rate": 4.939000764307526e-06, + "loss": 1.0935, + "step": 547 + }, + { + "epoch": 0.5102420856610801, + "grad_norm": 1.5467796325683594, + "learning_rate": 4.9387286533760745e-06, + "loss": 1.0967, + "step": 548 + }, + { + "epoch": 0.5111731843575419, + "grad_norm": 1.5498733520507812, + "learning_rate": 4.938455944395485e-06, + "loss": 1.1503, + "step": 549 + }, + { + "epoch": 0.5121042830540037, + "grad_norm": 1.564219355583191, + "learning_rate": 4.938182637432634e-06, + "loss": 1.1287, + "step": 550 + }, + { + "epoch": 0.5130353817504656, + "grad_norm": 1.519954800605774, + "learning_rate": 4.937908732554544e-06, + "loss": 1.1007, + "step": 551 + }, + { + "epoch": 0.5139664804469274, + "grad_norm": 1.5214513540267944, + "learning_rate": 4.937634229828384e-06, + "loss": 1.0951, + "step": 552 + }, + { + "epoch": 0.5148975791433892, + "grad_norm": 1.4941843748092651, + "learning_rate": 4.9373591293214725e-06, + "loss": 1.1077, + "step": 553 + }, + { + "epoch": 0.515828677839851, + "grad_norm": 1.5349756479263306, + "learning_rate": 4.937083431101271e-06, + "loss": 1.1217, + "step": 554 + }, + { + "epoch": 0.5167597765363129, + "grad_norm": 1.5093350410461426, + "learning_rate": 4.936807135235389e-06, + "loss": 1.1127, + "step": 555 + }, + { + "epoch": 0.5176908752327747, + "grad_norm": 1.491680383682251, + "learning_rate": 4.936530241791582e-06, + "loss": 1.1055, + "step": 556 + }, + { + "epoch": 0.5186219739292365, + "grad_norm": 1.5149484872817993, + "learning_rate": 4.936252750837752e-06, + "loss": 1.0967, + "step": 557 + }, + { + "epoch": 0.5195530726256983, + "grad_norm": 1.5864510536193848, + "learning_rate": 4.935974662441952e-06, + "loss": 1.1255, + "step": 558 + }, + { + "epoch": 0.5204841713221602, + "grad_norm": 1.577508807182312, + "learning_rate": 4.935695976672372e-06, + "loss": 1.1149, + "step": 559 + }, + { + "epoch": 0.521415270018622, + "grad_norm": 1.517242193222046, + "learning_rate": 4.935416693597358e-06, + "loss": 1.0857, + "step": 560 + }, + { + "epoch": 0.5223463687150838, + "grad_norm": 1.5732022523880005, + "learning_rate": 4.935136813285398e-06, + "loss": 1.1494, + "step": 561 + }, + { + "epoch": 0.5232774674115456, + "grad_norm": 1.6665898561477661, + "learning_rate": 4.934856335805125e-06, + "loss": 1.1114, + "step": 562 + }, + { + "epoch": 0.5242085661080075, + "grad_norm": 1.5523788928985596, + "learning_rate": 4.934575261225322e-06, + "loss": 1.1198, + "step": 563 + }, + { + "epoch": 0.5251396648044693, + "grad_norm": 1.5870845317840576, + "learning_rate": 4.934293589614917e-06, + "loss": 1.1332, + "step": 564 + }, + { + "epoch": 0.5260707635009311, + "grad_norm": 1.5657655000686646, + "learning_rate": 4.934011321042984e-06, + "loss": 1.0994, + "step": 565 + }, + { + "epoch": 0.527001862197393, + "grad_norm": 1.4799336194992065, + "learning_rate": 4.933728455578745e-06, + "loss": 1.091, + "step": 566 + }, + { + "epoch": 0.5279329608938548, + "grad_norm": 1.5232096910476685, + "learning_rate": 4.933444993291564e-06, + "loss": 1.1084, + "step": 567 + }, + { + "epoch": 0.5288640595903166, + "grad_norm": 1.7085096836090088, + "learning_rate": 4.933160934250957e-06, + "loss": 1.0918, + "step": 568 + }, + { + "epoch": 0.5297951582867784, + "grad_norm": 1.5313915014266968, + "learning_rate": 4.932876278526583e-06, + "loss": 1.1068, + "step": 569 + }, + { + "epoch": 0.5307262569832403, + "grad_norm": 1.61378014087677, + "learning_rate": 4.932591026188247e-06, + "loss": 1.0907, + "step": 570 + }, + { + "epoch": 0.5316573556797021, + "grad_norm": 1.5584440231323242, + "learning_rate": 4.932305177305903e-06, + "loss": 1.1082, + "step": 571 + }, + { + "epoch": 0.5325884543761639, + "grad_norm": 1.5553605556488037, + "learning_rate": 4.932018731949649e-06, + "loss": 1.0443, + "step": 572 + }, + { + "epoch": 0.5335195530726257, + "grad_norm": 1.559438943862915, + "learning_rate": 4.931731690189731e-06, + "loss": 1.0706, + "step": 573 + }, + { + "epoch": 0.5344506517690876, + "grad_norm": 1.82330322265625, + "learning_rate": 4.931444052096539e-06, + "loss": 1.1079, + "step": 574 + }, + { + "epoch": 0.5353817504655494, + "grad_norm": 1.6149985790252686, + "learning_rate": 4.9311558177406105e-06, + "loss": 1.0747, + "step": 575 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 1.5697458982467651, + "learning_rate": 4.93086698719263e-06, + "loss": 1.1586, + "step": 576 + }, + { + "epoch": 0.537243947858473, + "grad_norm": 1.5794774293899536, + "learning_rate": 4.9305775605234255e-06, + "loss": 1.1616, + "step": 577 + }, + { + "epoch": 0.5381750465549349, + "grad_norm": 1.6230155229568481, + "learning_rate": 4.930287537803975e-06, + "loss": 1.0912, + "step": 578 + }, + { + "epoch": 0.5391061452513967, + "grad_norm": 1.6342494487762451, + "learning_rate": 4.9299969191054e-06, + "loss": 1.1393, + "step": 579 + }, + { + "epoch": 0.5400372439478585, + "grad_norm": 1.4576398134231567, + "learning_rate": 4.929705704498969e-06, + "loss": 1.1042, + "step": 580 + }, + { + "epoch": 0.5409683426443203, + "grad_norm": 1.6272011995315552, + "learning_rate": 4.929413894056098e-06, + "loss": 1.0928, + "step": 581 + }, + { + "epoch": 0.5418994413407822, + "grad_norm": 1.6674431562423706, + "learning_rate": 4.929121487848344e-06, + "loss": 1.1872, + "step": 582 + }, + { + "epoch": 0.542830540037244, + "grad_norm": 1.6330026388168335, + "learning_rate": 4.9288284859474165e-06, + "loss": 1.1248, + "step": 583 + }, + { + "epoch": 0.5437616387337058, + "grad_norm": 1.5673110485076904, + "learning_rate": 4.928534888425168e-06, + "loss": 1.1088, + "step": 584 + }, + { + "epoch": 0.5446927374301676, + "grad_norm": 1.533722162246704, + "learning_rate": 4.928240695353598e-06, + "loss": 1.0675, + "step": 585 + }, + { + "epoch": 0.5456238361266295, + "grad_norm": 1.5629221200942993, + "learning_rate": 4.92794590680485e-06, + "loss": 1.0901, + "step": 586 + }, + { + "epoch": 0.5465549348230913, + "grad_norm": 1.624135136604309, + "learning_rate": 4.927650522851215e-06, + "loss": 1.0508, + "step": 587 + }, + { + "epoch": 0.547486033519553, + "grad_norm": 1.578065037727356, + "learning_rate": 4.927354543565131e-06, + "loss": 1.1512, + "step": 588 + }, + { + "epoch": 0.5484171322160148, + "grad_norm": 1.513796329498291, + "learning_rate": 4.92705796901918e-06, + "loss": 1.0465, + "step": 589 + }, + { + "epoch": 0.5493482309124768, + "grad_norm": 1.6567186117172241, + "learning_rate": 4.926760799286091e-06, + "loss": 1.0986, + "step": 590 + }, + { + "epoch": 0.5502793296089385, + "grad_norm": 1.5196467638015747, + "learning_rate": 4.92646303443874e-06, + "loss": 1.0875, + "step": 591 + }, + { + "epoch": 0.5512104283054003, + "grad_norm": 1.599839687347412, + "learning_rate": 4.926164674550147e-06, + "loss": 1.0962, + "step": 592 + }, + { + "epoch": 0.5521415270018621, + "grad_norm": 1.503763198852539, + "learning_rate": 4.925865719693479e-06, + "loss": 1.1416, + "step": 593 + }, + { + "epoch": 0.553072625698324, + "grad_norm": 1.5292751789093018, + "learning_rate": 4.925566169942048e-06, + "loss": 1.0939, + "step": 594 + }, + { + "epoch": 0.5540037243947858, + "grad_norm": 1.5088088512420654, + "learning_rate": 4.925266025369314e-06, + "loss": 1.132, + "step": 595 + }, + { + "epoch": 0.5549348230912476, + "grad_norm": 1.5630978345870972, + "learning_rate": 4.92496528604888e-06, + "loss": 1.1056, + "step": 596 + }, + { + "epoch": 0.5558659217877095, + "grad_norm": 1.6168630123138428, + "learning_rate": 4.924663952054497e-06, + "loss": 1.1408, + "step": 597 + }, + { + "epoch": 0.5567970204841713, + "grad_norm": 1.5950313806533813, + "learning_rate": 4.924362023460061e-06, + "loss": 1.1514, + "step": 598 + }, + { + "epoch": 0.5577281191806331, + "grad_norm": 1.5354509353637695, + "learning_rate": 4.924059500339613e-06, + "loss": 1.1279, + "step": 599 + }, + { + "epoch": 0.5586592178770949, + "grad_norm": 1.5805270671844482, + "learning_rate": 4.923756382767342e-06, + "loss": 1.1022, + "step": 600 + }, + { + "epoch": 0.5595903165735568, + "grad_norm": 1.48416268825531, + "learning_rate": 4.923452670817581e-06, + "loss": 1.0623, + "step": 601 + }, + { + "epoch": 0.5605214152700186, + "grad_norm": 1.545617699623108, + "learning_rate": 4.923148364564809e-06, + "loss": 1.1428, + "step": 602 + }, + { + "epoch": 0.5614525139664804, + "grad_norm": 1.4851430654525757, + "learning_rate": 4.922843464083651e-06, + "loss": 1.1682, + "step": 603 + }, + { + "epoch": 0.5623836126629422, + "grad_norm": 1.5792040824890137, + "learning_rate": 4.922537969448879e-06, + "loss": 1.1727, + "step": 604 + }, + { + "epoch": 0.5633147113594041, + "grad_norm": 1.5662609338760376, + "learning_rate": 4.922231880735407e-06, + "loss": 1.1186, + "step": 605 + }, + { + "epoch": 0.5642458100558659, + "grad_norm": 1.6631107330322266, + "learning_rate": 4.9219251980183e-06, + "loss": 1.0989, + "step": 606 + }, + { + "epoch": 0.5651769087523277, + "grad_norm": 1.559081792831421, + "learning_rate": 4.921617921372764e-06, + "loss": 1.0703, + "step": 607 + }, + { + "epoch": 0.5661080074487895, + "grad_norm": 1.5378901958465576, + "learning_rate": 4.921310050874151e-06, + "loss": 1.1408, + "step": 608 + }, + { + "epoch": 0.5670391061452514, + "grad_norm": 1.4655958414077759, + "learning_rate": 4.921001586597963e-06, + "loss": 1.1095, + "step": 609 + }, + { + "epoch": 0.5679702048417132, + "grad_norm": 1.6246010065078735, + "learning_rate": 4.920692528619843e-06, + "loss": 1.1076, + "step": 610 + }, + { + "epoch": 0.568901303538175, + "grad_norm": 1.5420719385147095, + "learning_rate": 4.920382877015581e-06, + "loss": 1.0959, + "step": 611 + }, + { + "epoch": 0.5698324022346368, + "grad_norm": 1.6560289859771729, + "learning_rate": 4.920072631861115e-06, + "loss": 1.1583, + "step": 612 + }, + { + "epoch": 0.5707635009310987, + "grad_norm": 1.4817291498184204, + "learning_rate": 4.919761793232524e-06, + "loss": 1.1129, + "step": 613 + }, + { + "epoch": 0.5716945996275605, + "grad_norm": 1.7070050239562988, + "learning_rate": 4.919450361206035e-06, + "loss": 1.1142, + "step": 614 + }, + { + "epoch": 0.5726256983240223, + "grad_norm": 1.7083073854446411, + "learning_rate": 4.919138335858021e-06, + "loss": 1.1474, + "step": 615 + }, + { + "epoch": 0.5735567970204841, + "grad_norm": 1.6288410425186157, + "learning_rate": 4.918825717265001e-06, + "loss": 1.1543, + "step": 616 + }, + { + "epoch": 0.574487895716946, + "grad_norm": 1.6135038137435913, + "learning_rate": 4.918512505503638e-06, + "loss": 1.0989, + "step": 617 + }, + { + "epoch": 0.5754189944134078, + "grad_norm": 1.4804624319076538, + "learning_rate": 4.9181987006507396e-06, + "loss": 1.0831, + "step": 618 + }, + { + "epoch": 0.5763500931098696, + "grad_norm": 1.4810327291488647, + "learning_rate": 4.91788430278326e-06, + "loss": 1.1531, + "step": 619 + }, + { + "epoch": 0.5772811918063314, + "grad_norm": 1.5278757810592651, + "learning_rate": 4.917569311978301e-06, + "loss": 1.1033, + "step": 620 + }, + { + "epoch": 0.5782122905027933, + "grad_norm": 1.474737524986267, + "learning_rate": 4.917253728313107e-06, + "loss": 1.0526, + "step": 621 + }, + { + "epoch": 0.5791433891992551, + "grad_norm": 1.5314648151397705, + "learning_rate": 4.916937551865068e-06, + "loss": 1.0946, + "step": 622 + }, + { + "epoch": 0.5800744878957169, + "grad_norm": 1.654606580734253, + "learning_rate": 4.916620782711719e-06, + "loss": 1.1657, + "step": 623 + }, + { + "epoch": 0.5810055865921788, + "grad_norm": 1.4837652444839478, + "learning_rate": 4.9163034209307435e-06, + "loss": 1.1388, + "step": 624 + }, + { + "epoch": 0.5819366852886406, + "grad_norm": 1.6011992692947388, + "learning_rate": 4.915985466599967e-06, + "loss": 1.0946, + "step": 625 + }, + { + "epoch": 0.5828677839851024, + "grad_norm": 1.623832106590271, + "learning_rate": 4.91566691979736e-06, + "loss": 1.1649, + "step": 626 + }, + { + "epoch": 0.5837988826815642, + "grad_norm": 1.526106357574463, + "learning_rate": 4.915347780601042e-06, + "loss": 1.1469, + "step": 627 + }, + { + "epoch": 0.5847299813780261, + "grad_norm": 1.5569554567337036, + "learning_rate": 4.915028049089275e-06, + "loss": 1.1163, + "step": 628 + }, + { + "epoch": 0.5856610800744879, + "grad_norm": 1.5431631803512573, + "learning_rate": 4.914707725340465e-06, + "loss": 1.0944, + "step": 629 + }, + { + "epoch": 0.5865921787709497, + "grad_norm": 1.4687777757644653, + "learning_rate": 4.914386809433167e-06, + "loss": 1.0564, + "step": 630 + }, + { + "epoch": 0.5875232774674115, + "grad_norm": 1.5362383127212524, + "learning_rate": 4.914065301446078e-06, + "loss": 1.1252, + "step": 631 + }, + { + "epoch": 0.5884543761638734, + "grad_norm": 1.5959590673446655, + "learning_rate": 4.913743201458042e-06, + "loss": 1.133, + "step": 632 + }, + { + "epoch": 0.5893854748603352, + "grad_norm": 1.552869439125061, + "learning_rate": 4.913420509548047e-06, + "loss": 1.1218, + "step": 633 + }, + { + "epoch": 0.590316573556797, + "grad_norm": 1.591748833656311, + "learning_rate": 4.913097225795227e-06, + "loss": 1.1433, + "step": 634 + }, + { + "epoch": 0.5912476722532588, + "grad_norm": 1.5221431255340576, + "learning_rate": 4.912773350278861e-06, + "loss": 1.0595, + "step": 635 + }, + { + "epoch": 0.5921787709497207, + "grad_norm": 1.5379856824874878, + "learning_rate": 4.912448883078373e-06, + "loss": 1.1289, + "step": 636 + }, + { + "epoch": 0.5931098696461825, + "grad_norm": 1.562050223350525, + "learning_rate": 4.912123824273331e-06, + "loss": 1.1394, + "step": 637 + }, + { + "epoch": 0.5940409683426443, + "grad_norm": 1.6056617498397827, + "learning_rate": 4.91179817394345e-06, + "loss": 1.1104, + "step": 638 + }, + { + "epoch": 0.5949720670391061, + "grad_norm": 1.5202099084854126, + "learning_rate": 4.91147193216859e-06, + "loss": 1.0701, + "step": 639 + }, + { + "epoch": 0.595903165735568, + "grad_norm": 1.53488028049469, + "learning_rate": 4.911145099028753e-06, + "loss": 1.0748, + "step": 640 + }, + { + "epoch": 0.5968342644320298, + "grad_norm": 1.6276181936264038, + "learning_rate": 4.91081767460409e-06, + "loss": 1.129, + "step": 641 + }, + { + "epoch": 0.5977653631284916, + "grad_norm": 1.5922019481658936, + "learning_rate": 4.910489658974896e-06, + "loss": 1.1204, + "step": 642 + }, + { + "epoch": 0.5986964618249534, + "grad_norm": 1.5041857957839966, + "learning_rate": 4.910161052221608e-06, + "loss": 1.0824, + "step": 643 + }, + { + "epoch": 0.5996275605214153, + "grad_norm": 1.6397160291671753, + "learning_rate": 4.909831854424812e-06, + "loss": 1.1401, + "step": 644 + }, + { + "epoch": 0.6005586592178771, + "grad_norm": 1.5218939781188965, + "learning_rate": 4.909502065665236e-06, + "loss": 1.0875, + "step": 645 + }, + { + "epoch": 0.6014897579143389, + "grad_norm": 1.5679396390914917, + "learning_rate": 4.9091716860237545e-06, + "loss": 1.1467, + "step": 646 + }, + { + "epoch": 0.6024208566108007, + "grad_norm": 1.606338620185852, + "learning_rate": 4.908840715581386e-06, + "loss": 1.0847, + "step": 647 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 1.495506763458252, + "learning_rate": 4.908509154419296e-06, + "loss": 1.1315, + "step": 648 + }, + { + "epoch": 0.6042830540037244, + "grad_norm": 1.4830591678619385, + "learning_rate": 4.9081770026187915e-06, + "loss": 1.1011, + "step": 649 + }, + { + "epoch": 0.6052141527001862, + "grad_norm": 1.540897250175476, + "learning_rate": 4.9078442602613265e-06, + "loss": 1.1061, + "step": 650 + }, + { + "epoch": 0.6061452513966481, + "grad_norm": 1.5537681579589844, + "learning_rate": 4.907510927428499e-06, + "loss": 1.138, + "step": 651 + }, + { + "epoch": 0.6070763500931099, + "grad_norm": 1.626189947128296, + "learning_rate": 4.907177004202053e-06, + "loss": 1.1607, + "step": 652 + }, + { + "epoch": 0.6080074487895717, + "grad_norm": 1.6425658464431763, + "learning_rate": 4.9068424906638756e-06, + "loss": 1.1357, + "step": 653 + }, + { + "epoch": 0.6089385474860335, + "grad_norm": 1.5693364143371582, + "learning_rate": 4.906507386896e-06, + "loss": 1.1053, + "step": 654 + }, + { + "epoch": 0.6098696461824954, + "grad_norm": 1.557714581489563, + "learning_rate": 4.906171692980603e-06, + "loss": 1.1253, + "step": 655 + }, + { + "epoch": 0.6108007448789572, + "grad_norm": 1.5892163515090942, + "learning_rate": 4.905835409000009e-06, + "loss": 1.1183, + "step": 656 + }, + { + "epoch": 0.611731843575419, + "grad_norm": 1.5864640474319458, + "learning_rate": 4.905498535036683e-06, + "loss": 1.0769, + "step": 657 + }, + { + "epoch": 0.6126629422718808, + "grad_norm": 1.4962928295135498, + "learning_rate": 4.905161071173236e-06, + "loss": 1.0852, + "step": 658 + }, + { + "epoch": 0.6135940409683427, + "grad_norm": 1.4973156452178955, + "learning_rate": 4.904823017492425e-06, + "loss": 1.126, + "step": 659 + }, + { + "epoch": 0.6145251396648045, + "grad_norm": 1.5556844472885132, + "learning_rate": 4.904484374077151e-06, + "loss": 1.1195, + "step": 660 + }, + { + "epoch": 0.6154562383612663, + "grad_norm": 1.661524772644043, + "learning_rate": 4.9041451410104595e-06, + "loss": 1.1442, + "step": 661 + }, + { + "epoch": 0.6163873370577281, + "grad_norm": 1.5103766918182373, + "learning_rate": 4.903805318375541e-06, + "loss": 1.0951, + "step": 662 + }, + { + "epoch": 0.61731843575419, + "grad_norm": 1.5827866792678833, + "learning_rate": 4.9034649062557295e-06, + "loss": 1.1581, + "step": 663 + }, + { + "epoch": 0.6182495344506518, + "grad_norm": 1.5530108213424683, + "learning_rate": 4.903123904734504e-06, + "loss": 1.1187, + "step": 664 + }, + { + "epoch": 0.6191806331471136, + "grad_norm": 1.930545449256897, + "learning_rate": 4.902782313895489e-06, + "loss": 1.0908, + "step": 665 + }, + { + "epoch": 0.6201117318435754, + "grad_norm": 1.5824450254440308, + "learning_rate": 4.902440133822452e-06, + "loss": 1.0865, + "step": 666 + }, + { + "epoch": 0.6210428305400373, + "grad_norm": 1.5734223127365112, + "learning_rate": 4.902097364599306e-06, + "loss": 1.0816, + "step": 667 + }, + { + "epoch": 0.6219739292364991, + "grad_norm": 1.5792101621627808, + "learning_rate": 4.9017540063101085e-06, + "loss": 1.0988, + "step": 668 + }, + { + "epoch": 0.6229050279329609, + "grad_norm": 1.5171549320220947, + "learning_rate": 4.901410059039061e-06, + "loss": 1.0898, + "step": 669 + }, + { + "epoch": 0.6238361266294227, + "grad_norm": 1.5120694637298584, + "learning_rate": 4.901065522870511e-06, + "loss": 1.0758, + "step": 670 + }, + { + "epoch": 0.6247672253258846, + "grad_norm": 1.6109082698822021, + "learning_rate": 4.900720397888947e-06, + "loss": 1.1205, + "step": 671 + }, + { + "epoch": 0.6256983240223464, + "grad_norm": 1.5656803846359253, + "learning_rate": 4.900374684179005e-06, + "loss": 1.0803, + "step": 672 + }, + { + "epoch": 0.6266294227188082, + "grad_norm": 1.5185593366622925, + "learning_rate": 4.900028381825464e-06, + "loss": 1.1101, + "step": 673 + }, + { + "epoch": 0.62756052141527, + "grad_norm": 1.51339852809906, + "learning_rate": 4.8996814909132475e-06, + "loss": 1.0805, + "step": 674 + }, + { + "epoch": 0.6284916201117319, + "grad_norm": 1.5645750761032104, + "learning_rate": 4.899334011527424e-06, + "loss": 1.1125, + "step": 675 + }, + { + "epoch": 0.6294227188081937, + "grad_norm": 1.6254100799560547, + "learning_rate": 4.898985943753207e-06, + "loss": 1.1332, + "step": 676 + }, + { + "epoch": 0.6303538175046555, + "grad_norm": 1.5608704090118408, + "learning_rate": 4.89863728767595e-06, + "loss": 1.1132, + "step": 677 + }, + { + "epoch": 0.6312849162011173, + "grad_norm": 1.5128356218338013, + "learning_rate": 4.898288043381157e-06, + "loss": 1.0267, + "step": 678 + }, + { + "epoch": 0.6322160148975792, + "grad_norm": 1.5966126918792725, + "learning_rate": 4.897938210954472e-06, + "loss": 1.0597, + "step": 679 + }, + { + "epoch": 0.633147113594041, + "grad_norm": 1.5826008319854736, + "learning_rate": 4.8975877904816825e-06, + "loss": 1.0882, + "step": 680 + }, + { + "epoch": 0.6340782122905028, + "grad_norm": 1.5969572067260742, + "learning_rate": 4.897236782048726e-06, + "loss": 1.0752, + "step": 681 + }, + { + "epoch": 0.6350093109869647, + "grad_norm": 1.5381343364715576, + "learning_rate": 4.896885185741676e-06, + "loss": 1.1118, + "step": 682 + }, + { + "epoch": 0.6359404096834265, + "grad_norm": 1.5589604377746582, + "learning_rate": 4.896533001646757e-06, + "loss": 1.1077, + "step": 683 + }, + { + "epoch": 0.6368715083798883, + "grad_norm": 1.4659178256988525, + "learning_rate": 4.8961802298503355e-06, + "loss": 1.078, + "step": 684 + }, + { + "epoch": 0.6378026070763501, + "grad_norm": 1.5804171562194824, + "learning_rate": 4.89582687043892e-06, + "loss": 1.0963, + "step": 685 + }, + { + "epoch": 0.638733705772812, + "grad_norm": 1.4594801664352417, + "learning_rate": 4.895472923499165e-06, + "loss": 1.1026, + "step": 686 + }, + { + "epoch": 0.6396648044692738, + "grad_norm": 1.5364091396331787, + "learning_rate": 4.89511838911787e-06, + "loss": 1.1093, + "step": 687 + }, + { + "epoch": 0.6405959031657356, + "grad_norm": 1.5281437635421753, + "learning_rate": 4.894763267381977e-06, + "loss": 1.1183, + "step": 688 + }, + { + "epoch": 0.6415270018621974, + "grad_norm": 1.5509490966796875, + "learning_rate": 4.894407558378572e-06, + "loss": 1.1291, + "step": 689 + }, + { + "epoch": 0.6424581005586593, + "grad_norm": 1.616011381149292, + "learning_rate": 4.894051262194885e-06, + "loss": 1.0723, + "step": 690 + }, + { + "epoch": 0.6433891992551211, + "grad_norm": 1.6060771942138672, + "learning_rate": 4.893694378918292e-06, + "loss": 1.067, + "step": 691 + }, + { + "epoch": 0.6443202979515829, + "grad_norm": 1.5956634283065796, + "learning_rate": 4.8933369086363105e-06, + "loss": 1.1002, + "step": 692 + }, + { + "epoch": 0.6452513966480447, + "grad_norm": 1.5667318105697632, + "learning_rate": 4.892978851436603e-06, + "loss": 1.1144, + "step": 693 + }, + { + "epoch": 0.6461824953445066, + "grad_norm": 1.488541603088379, + "learning_rate": 4.892620207406975e-06, + "loss": 1.0363, + "step": 694 + }, + { + "epoch": 0.6471135940409684, + "grad_norm": 1.6104015111923218, + "learning_rate": 4.892260976635379e-06, + "loss": 1.0853, + "step": 695 + }, + { + "epoch": 0.6480446927374302, + "grad_norm": 1.565591812133789, + "learning_rate": 4.891901159209907e-06, + "loss": 1.0491, + "step": 696 + }, + { + "epoch": 0.648975791433892, + "grad_norm": 1.5780127048492432, + "learning_rate": 4.891540755218797e-06, + "loss": 1.0958, + "step": 697 + }, + { + "epoch": 0.6499068901303539, + "grad_norm": 1.596530795097351, + "learning_rate": 4.891179764750434e-06, + "loss": 1.1335, + "step": 698 + }, + { + "epoch": 0.6508379888268156, + "grad_norm": 1.5459315776824951, + "learning_rate": 4.890818187893338e-06, + "loss": 1.091, + "step": 699 + }, + { + "epoch": 0.6517690875232774, + "grad_norm": 1.5360651016235352, + "learning_rate": 4.890456024736183e-06, + "loss": 1.1095, + "step": 700 + }, + { + "epoch": 0.6527001862197392, + "grad_norm": 1.5870563983917236, + "learning_rate": 4.890093275367781e-06, + "loss": 1.0876, + "step": 701 + }, + { + "epoch": 0.6536312849162011, + "grad_norm": 1.5088231563568115, + "learning_rate": 4.889729939877089e-06, + "loss": 1.1153, + "step": 702 + }, + { + "epoch": 0.654562383612663, + "grad_norm": 1.4789011478424072, + "learning_rate": 4.889366018353207e-06, + "loss": 1.1195, + "step": 703 + }, + { + "epoch": 0.6554934823091247, + "grad_norm": 1.4999905824661255, + "learning_rate": 4.8890015108853805e-06, + "loss": 1.1156, + "step": 704 + }, + { + "epoch": 0.6564245810055865, + "grad_norm": 1.5061445236206055, + "learning_rate": 4.888636417562996e-06, + "loss": 1.0963, + "step": 705 + }, + { + "epoch": 0.6573556797020484, + "grad_norm": 1.5363857746124268, + "learning_rate": 4.888270738475588e-06, + "loss": 1.1112, + "step": 706 + }, + { + "epoch": 0.6582867783985102, + "grad_norm": 1.4777934551239014, + "learning_rate": 4.887904473712829e-06, + "loss": 1.1139, + "step": 707 + }, + { + "epoch": 0.659217877094972, + "grad_norm": 1.4956679344177246, + "learning_rate": 4.88753762336454e-06, + "loss": 1.0872, + "step": 708 + }, + { + "epoch": 0.660148975791434, + "grad_norm": 1.4835432767868042, + "learning_rate": 4.887170187520684e-06, + "loss": 1.1361, + "step": 709 + }, + { + "epoch": 0.6610800744878957, + "grad_norm": 1.5423203706741333, + "learning_rate": 4.886802166271365e-06, + "loss": 1.0813, + "step": 710 + }, + { + "epoch": 0.6620111731843575, + "grad_norm": 1.559105634689331, + "learning_rate": 4.8864335597068335e-06, + "loss": 1.1319, + "step": 711 + }, + { + "epoch": 0.6629422718808193, + "grad_norm": 1.5082135200500488, + "learning_rate": 4.886064367917485e-06, + "loss": 1.1182, + "step": 712 + }, + { + "epoch": 0.6638733705772812, + "grad_norm": 1.5104544162750244, + "learning_rate": 4.885694590993854e-06, + "loss": 1.0512, + "step": 713 + }, + { + "epoch": 0.664804469273743, + "grad_norm": 1.4970797300338745, + "learning_rate": 4.8853242290266216e-06, + "loss": 1.1005, + "step": 714 + }, + { + "epoch": 0.6657355679702048, + "grad_norm": 1.579653263092041, + "learning_rate": 4.884953282106612e-06, + "loss": 1.0788, + "step": 715 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.603585958480835, + "learning_rate": 4.884581750324792e-06, + "loss": 1.0905, + "step": 716 + }, + { + "epoch": 0.6675977653631285, + "grad_norm": 1.5491455793380737, + "learning_rate": 4.884209633772272e-06, + "loss": 1.0607, + "step": 717 + }, + { + "epoch": 0.6685288640595903, + "grad_norm": 1.4984642267227173, + "learning_rate": 4.883836932540308e-06, + "loss": 1.0939, + "step": 718 + }, + { + "epoch": 0.6694599627560521, + "grad_norm": 1.6042760610580444, + "learning_rate": 4.883463646720295e-06, + "loss": 1.071, + "step": 719 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 1.4980610609054565, + "learning_rate": 4.883089776403775e-06, + "loss": 1.0743, + "step": 720 + }, + { + "epoch": 0.6713221601489758, + "grad_norm": 1.5268374681472778, + "learning_rate": 4.882715321682432e-06, + "loss": 1.0753, + "step": 721 + }, + { + "epoch": 0.6722532588454376, + "grad_norm": 1.61496102809906, + "learning_rate": 4.882340282648094e-06, + "loss": 1.1517, + "step": 722 + }, + { + "epoch": 0.6731843575418994, + "grad_norm": 1.528674602508545, + "learning_rate": 4.881964659392731e-06, + "loss": 1.1136, + "step": 723 + }, + { + "epoch": 0.6741154562383612, + "grad_norm": 1.451182246208191, + "learning_rate": 4.881588452008457e-06, + "loss": 1.0732, + "step": 724 + }, + { + "epoch": 0.6750465549348231, + "grad_norm": 1.580771565437317, + "learning_rate": 4.8812116605875295e-06, + "loss": 1.1069, + "step": 725 + }, + { + "epoch": 0.6759776536312849, + "grad_norm": 1.5257370471954346, + "learning_rate": 4.88083428522235e-06, + "loss": 1.0937, + "step": 726 + }, + { + "epoch": 0.6769087523277467, + "grad_norm": 1.6596299409866333, + "learning_rate": 4.88045632600546e-06, + "loss": 1.0962, + "step": 727 + }, + { + "epoch": 0.6778398510242085, + "grad_norm": 1.533115029335022, + "learning_rate": 4.880077783029549e-06, + "loss": 1.0583, + "step": 728 + }, + { + "epoch": 0.6787709497206704, + "grad_norm": 1.6121939420700073, + "learning_rate": 4.879698656387446e-06, + "loss": 1.0924, + "step": 729 + }, + { + "epoch": 0.6797020484171322, + "grad_norm": 1.555610179901123, + "learning_rate": 4.879318946172124e-06, + "loss": 1.1304, + "step": 730 + }, + { + "epoch": 0.680633147113594, + "grad_norm": 1.5840232372283936, + "learning_rate": 4.878938652476698e-06, + "loss": 1.1388, + "step": 731 + }, + { + "epoch": 0.6815642458100558, + "grad_norm": 1.578559398651123, + "learning_rate": 4.878557775394429e-06, + "loss": 1.1108, + "step": 732 + }, + { + "epoch": 0.6824953445065177, + "grad_norm": 1.5036128759384155, + "learning_rate": 4.87817631501872e-06, + "loss": 1.0703, + "step": 733 + }, + { + "epoch": 0.6834264432029795, + "grad_norm": 1.5150079727172852, + "learning_rate": 4.877794271443116e-06, + "loss": 1.1165, + "step": 734 + }, + { + "epoch": 0.6843575418994413, + "grad_norm": 1.5486232042312622, + "learning_rate": 4.877411644761304e-06, + "loss": 1.0724, + "step": 735 + }, + { + "epoch": 0.6852886405959032, + "grad_norm": 1.5734553337097168, + "learning_rate": 4.877028435067117e-06, + "loss": 1.0929, + "step": 736 + }, + { + "epoch": 0.686219739292365, + "grad_norm": 1.570406198501587, + "learning_rate": 4.876644642454529e-06, + "loss": 1.0725, + "step": 737 + }, + { + "epoch": 0.6871508379888268, + "grad_norm": 1.5357011556625366, + "learning_rate": 4.8762602670176574e-06, + "loss": 1.0723, + "step": 738 + }, + { + "epoch": 0.6880819366852886, + "grad_norm": 1.5737360715866089, + "learning_rate": 4.875875308850762e-06, + "loss": 1.1451, + "step": 739 + }, + { + "epoch": 0.6890130353817505, + "grad_norm": 1.6345967054367065, + "learning_rate": 4.875489768048247e-06, + "loss": 1.157, + "step": 740 + }, + { + "epoch": 0.6899441340782123, + "grad_norm": 1.5567008256912231, + "learning_rate": 4.87510364470466e-06, + "loss": 1.0703, + "step": 741 + }, + { + "epoch": 0.6908752327746741, + "grad_norm": 1.5474246740341187, + "learning_rate": 4.874716938914686e-06, + "loss": 1.0862, + "step": 742 + }, + { + "epoch": 0.6918063314711359, + "grad_norm": 1.5408908128738403, + "learning_rate": 4.87432965077316e-06, + "loss": 1.0408, + "step": 743 + }, + { + "epoch": 0.6927374301675978, + "grad_norm": 1.5973511934280396, + "learning_rate": 4.873941780375055e-06, + "loss": 1.0974, + "step": 744 + }, + { + "epoch": 0.6936685288640596, + "grad_norm": 1.5032771825790405, + "learning_rate": 4.873553327815489e-06, + "loss": 1.0607, + "step": 745 + }, + { + "epoch": 0.6945996275605214, + "grad_norm": 1.5170742273330688, + "learning_rate": 4.873164293189723e-06, + "loss": 1.0646, + "step": 746 + }, + { + "epoch": 0.6955307262569832, + "grad_norm": 1.6156871318817139, + "learning_rate": 4.872774676593158e-06, + "loss": 1.1065, + "step": 747 + }, + { + "epoch": 0.6964618249534451, + "grad_norm": 1.5451059341430664, + "learning_rate": 4.872384478121342e-06, + "loss": 1.087, + "step": 748 + }, + { + "epoch": 0.6973929236499069, + "grad_norm": 1.5151008367538452, + "learning_rate": 4.871993697869961e-06, + "loss": 1.0941, + "step": 749 + }, + { + "epoch": 0.6983240223463687, + "grad_norm": 1.5981560945510864, + "learning_rate": 4.871602335934847e-06, + "loss": 1.0877, + "step": 750 + }, + { + "epoch": 0.6992551210428305, + "grad_norm": 1.584849238395691, + "learning_rate": 4.8712103924119744e-06, + "loss": 1.1211, + "step": 751 + }, + { + "epoch": 0.7001862197392924, + "grad_norm": 1.5590605735778809, + "learning_rate": 4.870817867397459e-06, + "loss": 1.1352, + "step": 752 + }, + { + "epoch": 0.7011173184357542, + "grad_norm": 1.5966578722000122, + "learning_rate": 4.870424760987559e-06, + "loss": 1.1345, + "step": 753 + }, + { + "epoch": 0.702048417132216, + "grad_norm": 1.4746921062469482, + "learning_rate": 4.870031073278676e-06, + "loss": 1.0907, + "step": 754 + }, + { + "epoch": 0.7029795158286778, + "grad_norm": 1.5347262620925903, + "learning_rate": 4.869636804367355e-06, + "loss": 1.0769, + "step": 755 + }, + { + "epoch": 0.7039106145251397, + "grad_norm": 1.5220005512237549, + "learning_rate": 4.869241954350281e-06, + "loss": 1.0585, + "step": 756 + }, + { + "epoch": 0.7048417132216015, + "grad_norm": 1.5988545417785645, + "learning_rate": 4.868846523324284e-06, + "loss": 1.1004, + "step": 757 + }, + { + "epoch": 0.7057728119180633, + "grad_norm": 1.574826717376709, + "learning_rate": 4.868450511386336e-06, + "loss": 1.1146, + "step": 758 + }, + { + "epoch": 0.7067039106145251, + "grad_norm": 1.5736562013626099, + "learning_rate": 4.868053918633549e-06, + "loss": 1.0703, + "step": 759 + }, + { + "epoch": 0.707635009310987, + "grad_norm": 1.464430809020996, + "learning_rate": 4.867656745163182e-06, + "loss": 1.0535, + "step": 760 + }, + { + "epoch": 0.7085661080074488, + "grad_norm": 1.561993956565857, + "learning_rate": 4.8672589910726305e-06, + "loss": 1.0615, + "step": 761 + }, + { + "epoch": 0.7094972067039106, + "grad_norm": 1.5395992994308472, + "learning_rate": 4.86686065645944e-06, + "loss": 1.0774, + "step": 762 + }, + { + "epoch": 0.7104283054003724, + "grad_norm": 1.5870630741119385, + "learning_rate": 4.86646174142129e-06, + "loss": 1.0823, + "step": 763 + }, + { + "epoch": 0.7113594040968343, + "grad_norm": 1.6562172174453735, + "learning_rate": 4.8660622460560096e-06, + "loss": 1.1273, + "step": 764 + }, + { + "epoch": 0.7122905027932961, + "grad_norm": 1.6088863611221313, + "learning_rate": 4.865662170461564e-06, + "loss": 1.1102, + "step": 765 + }, + { + "epoch": 0.7132216014897579, + "grad_norm": 1.506174921989441, + "learning_rate": 4.865261514736066e-06, + "loss": 1.0233, + "step": 766 + }, + { + "epoch": 0.7141527001862198, + "grad_norm": 1.5461585521697998, + "learning_rate": 4.864860278977767e-06, + "loss": 1.1069, + "step": 767 + }, + { + "epoch": 0.7150837988826816, + "grad_norm": 1.6047543287277222, + "learning_rate": 4.864458463285063e-06, + "loss": 1.1141, + "step": 768 + }, + { + "epoch": 0.7160148975791434, + "grad_norm": 1.550565481185913, + "learning_rate": 4.864056067756491e-06, + "loss": 1.0916, + "step": 769 + }, + { + "epoch": 0.7169459962756052, + "grad_norm": 1.6570130586624146, + "learning_rate": 4.86365309249073e-06, + "loss": 1.1312, + "step": 770 + }, + { + "epoch": 0.7178770949720671, + "grad_norm": 1.5110983848571777, + "learning_rate": 4.863249537586601e-06, + "loss": 1.08, + "step": 771 + }, + { + "epoch": 0.7188081936685289, + "grad_norm": 1.571300745010376, + "learning_rate": 4.86284540314307e-06, + "loss": 1.0873, + "step": 772 + }, + { + "epoch": 0.7197392923649907, + "grad_norm": 1.5201406478881836, + "learning_rate": 4.8624406892592394e-06, + "loss": 1.1234, + "step": 773 + }, + { + "epoch": 0.7206703910614525, + "grad_norm": 1.4831037521362305, + "learning_rate": 4.862035396034359e-06, + "loss": 1.0649, + "step": 774 + }, + { + "epoch": 0.7216014897579144, + "grad_norm": 1.5336039066314697, + "learning_rate": 4.86162952356782e-06, + "loss": 1.0121, + "step": 775 + }, + { + "epoch": 0.7225325884543762, + "grad_norm": 1.5172268152236938, + "learning_rate": 4.8612230719591535e-06, + "loss": 1.1539, + "step": 776 + }, + { + "epoch": 0.723463687150838, + "grad_norm": 1.5218273401260376, + "learning_rate": 4.860816041308033e-06, + "loss": 1.064, + "step": 777 + }, + { + "epoch": 0.7243947858472998, + "grad_norm": 1.5292189121246338, + "learning_rate": 4.860408431714275e-06, + "loss": 1.0914, + "step": 778 + }, + { + "epoch": 0.7253258845437617, + "grad_norm": 1.5450135469436646, + "learning_rate": 4.860000243277837e-06, + "loss": 1.082, + "step": 779 + }, + { + "epoch": 0.7262569832402235, + "grad_norm": 1.6913230419158936, + "learning_rate": 4.85959147609882e-06, + "loss": 1.0812, + "step": 780 + }, + { + "epoch": 0.7271880819366853, + "grad_norm": 1.5307987928390503, + "learning_rate": 4.859182130277465e-06, + "loss": 1.0964, + "step": 781 + }, + { + "epoch": 0.7281191806331471, + "grad_norm": 1.5680224895477295, + "learning_rate": 4.858772205914158e-06, + "loss": 1.1084, + "step": 782 + }, + { + "epoch": 0.729050279329609, + "grad_norm": 1.5493946075439453, + "learning_rate": 4.85836170310942e-06, + "loss": 1.066, + "step": 783 + }, + { + "epoch": 0.7299813780260708, + "grad_norm": 1.5324965715408325, + "learning_rate": 4.857950621963924e-06, + "loss": 1.0525, + "step": 784 + }, + { + "epoch": 0.7309124767225326, + "grad_norm": 1.5298123359680176, + "learning_rate": 4.857538962578475e-06, + "loss": 1.0998, + "step": 785 + }, + { + "epoch": 0.7318435754189944, + "grad_norm": 1.6049745082855225, + "learning_rate": 4.857126725054028e-06, + "loss": 1.1404, + "step": 786 + }, + { + "epoch": 0.7327746741154563, + "grad_norm": 1.6233011484146118, + "learning_rate": 4.856713909491673e-06, + "loss": 1.0963, + "step": 787 + }, + { + "epoch": 0.7337057728119181, + "grad_norm": 1.594020128250122, + "learning_rate": 4.856300515992645e-06, + "loss": 1.0649, + "step": 788 + }, + { + "epoch": 0.7346368715083799, + "grad_norm": 1.6714119911193848, + "learning_rate": 4.855886544658322e-06, + "loss": 1.0816, + "step": 789 + }, + { + "epoch": 0.7355679702048417, + "grad_norm": 1.5608378648757935, + "learning_rate": 4.8554719955902215e-06, + "loss": 1.0909, + "step": 790 + }, + { + "epoch": 0.7364990689013036, + "grad_norm": 1.4881634712219238, + "learning_rate": 4.855056868890004e-06, + "loss": 1.0887, + "step": 791 + }, + { + "epoch": 0.7374301675977654, + "grad_norm": 1.5242223739624023, + "learning_rate": 4.854641164659468e-06, + "loss": 1.102, + "step": 792 + }, + { + "epoch": 0.7383612662942272, + "grad_norm": 1.6237317323684692, + "learning_rate": 4.854224883000561e-06, + "loss": 1.0703, + "step": 793 + }, + { + "epoch": 0.7392923649906891, + "grad_norm": 1.5481069087982178, + "learning_rate": 4.853808024015364e-06, + "loss": 1.0754, + "step": 794 + }, + { + "epoch": 0.7402234636871509, + "grad_norm": 1.6555005311965942, + "learning_rate": 4.853390587806105e-06, + "loss": 1.0886, + "step": 795 + }, + { + "epoch": 0.7411545623836127, + "grad_norm": 1.604873776435852, + "learning_rate": 4.852972574475151e-06, + "loss": 1.1246, + "step": 796 + }, + { + "epoch": 0.7420856610800745, + "grad_norm": 1.5767509937286377, + "learning_rate": 4.852553984125013e-06, + "loss": 1.0772, + "step": 797 + }, + { + "epoch": 0.7430167597765364, + "grad_norm": 1.6008780002593994, + "learning_rate": 4.852134816858341e-06, + "loss": 1.1223, + "step": 798 + }, + { + "epoch": 0.7439478584729982, + "grad_norm": 1.5163817405700684, + "learning_rate": 4.851715072777926e-06, + "loss": 1.0917, + "step": 799 + }, + { + "epoch": 0.74487895716946, + "grad_norm": 1.5727468729019165, + "learning_rate": 4.8512947519867025e-06, + "loss": 1.1302, + "step": 800 + }, + { + "epoch": 0.7458100558659218, + "grad_norm": 1.613973617553711, + "learning_rate": 4.850873854587747e-06, + "loss": 1.0977, + "step": 801 + }, + { + "epoch": 0.7467411545623837, + "grad_norm": 1.7014856338500977, + "learning_rate": 4.850452380684275e-06, + "loss": 1.0715, + "step": 802 + }, + { + "epoch": 0.7476722532588455, + "grad_norm": 1.5576146841049194, + "learning_rate": 4.850030330379645e-06, + "loss": 1.0662, + "step": 803 + }, + { + "epoch": 0.7486033519553073, + "grad_norm": 1.6135509014129639, + "learning_rate": 4.849607703777356e-06, + "loss": 1.1317, + "step": 804 + }, + { + "epoch": 0.749534450651769, + "grad_norm": 1.5958330631256104, + "learning_rate": 4.849184500981048e-06, + "loss": 1.121, + "step": 805 + }, + { + "epoch": 0.750465549348231, + "grad_norm": 1.4927066564559937, + "learning_rate": 4.848760722094504e-06, + "loss": 1.0863, + "step": 806 + }, + { + "epoch": 0.7513966480446927, + "grad_norm": 1.5335276126861572, + "learning_rate": 4.8483363672216475e-06, + "loss": 1.0748, + "step": 807 + }, + { + "epoch": 0.7523277467411545, + "grad_norm": 1.4925564527511597, + "learning_rate": 4.8479114364665425e-06, + "loss": 1.0899, + "step": 808 + }, + { + "epoch": 0.7532588454376163, + "grad_norm": 1.5200743675231934, + "learning_rate": 4.847485929933395e-06, + "loss": 1.0907, + "step": 809 + }, + { + "epoch": 0.7541899441340782, + "grad_norm": 1.5277286767959595, + "learning_rate": 4.84705984772655e-06, + "loss": 1.0982, + "step": 810 + }, + { + "epoch": 0.75512104283054, + "grad_norm": 1.5960828065872192, + "learning_rate": 4.846633189950498e-06, + "loss": 1.0875, + "step": 811 + }, + { + "epoch": 0.7560521415270018, + "grad_norm": 1.5886591672897339, + "learning_rate": 4.846205956709868e-06, + "loss": 1.0638, + "step": 812 + }, + { + "epoch": 0.7569832402234636, + "grad_norm": 1.5943529605865479, + "learning_rate": 4.845778148109429e-06, + "loss": 1.1377, + "step": 813 + }, + { + "epoch": 0.7579143389199255, + "grad_norm": 1.5340068340301514, + "learning_rate": 4.8453497642540935e-06, + "loss": 1.0761, + "step": 814 + }, + { + "epoch": 0.7588454376163873, + "grad_norm": 1.4797906875610352, + "learning_rate": 4.844920805248914e-06, + "loss": 1.0684, + "step": 815 + }, + { + "epoch": 0.7597765363128491, + "grad_norm": 1.5927391052246094, + "learning_rate": 4.844491271199083e-06, + "loss": 1.0687, + "step": 816 + }, + { + "epoch": 0.7607076350093109, + "grad_norm": 1.531162977218628, + "learning_rate": 4.844061162209937e-06, + "loss": 1.1113, + "step": 817 + }, + { + "epoch": 0.7616387337057728, + "grad_norm": 1.5677597522735596, + "learning_rate": 4.84363047838695e-06, + "loss": 1.1091, + "step": 818 + }, + { + "epoch": 0.7625698324022346, + "grad_norm": 1.511192798614502, + "learning_rate": 4.843199219835739e-06, + "loss": 1.0765, + "step": 819 + }, + { + "epoch": 0.7635009310986964, + "grad_norm": 1.5178874731063843, + "learning_rate": 4.842767386662062e-06, + "loss": 1.0829, + "step": 820 + }, + { + "epoch": 0.7644320297951583, + "grad_norm": 1.5365880727767944, + "learning_rate": 4.842334978971815e-06, + "loss": 1.0914, + "step": 821 + }, + { + "epoch": 0.7653631284916201, + "grad_norm": 1.5762591361999512, + "learning_rate": 4.8419019968710415e-06, + "loss": 1.1028, + "step": 822 + }, + { + "epoch": 0.7662942271880819, + "grad_norm": 1.5376968383789062, + "learning_rate": 4.841468440465918e-06, + "loss": 1.1049, + "step": 823 + }, + { + "epoch": 0.7672253258845437, + "grad_norm": 1.476027488708496, + "learning_rate": 4.841034309862768e-06, + "loss": 1.0825, + "step": 824 + }, + { + "epoch": 0.7681564245810056, + "grad_norm": 1.4843603372573853, + "learning_rate": 4.8405996051680505e-06, + "loss": 1.0894, + "step": 825 + }, + { + "epoch": 0.7690875232774674, + "grad_norm": 1.5072650909423828, + "learning_rate": 4.84016432648837e-06, + "loss": 1.0703, + "step": 826 + }, + { + "epoch": 0.7700186219739292, + "grad_norm": 1.519260287284851, + "learning_rate": 4.839728473930471e-06, + "loss": 1.0286, + "step": 827 + }, + { + "epoch": 0.770949720670391, + "grad_norm": 1.636559247970581, + "learning_rate": 4.839292047601234e-06, + "loss": 1.1477, + "step": 828 + }, + { + "epoch": 0.7718808193668529, + "grad_norm": 1.5612605810165405, + "learning_rate": 4.838855047607688e-06, + "loss": 1.093, + "step": 829 + }, + { + "epoch": 0.7728119180633147, + "grad_norm": 1.7009735107421875, + "learning_rate": 4.838417474056994e-06, + "loss": 1.0631, + "step": 830 + }, + { + "epoch": 0.7737430167597765, + "grad_norm": 1.5374367237091064, + "learning_rate": 4.8379793270564625e-06, + "loss": 1.0809, + "step": 831 + }, + { + "epoch": 0.7746741154562383, + "grad_norm": 1.537718653678894, + "learning_rate": 4.837540606713538e-06, + "loss": 1.0802, + "step": 832 + }, + { + "epoch": 0.7756052141527002, + "grad_norm": 1.6281205415725708, + "learning_rate": 4.837101313135807e-06, + "loss": 1.0901, + "step": 833 + }, + { + "epoch": 0.776536312849162, + "grad_norm": 1.516488790512085, + "learning_rate": 4.836661446430999e-06, + "loss": 1.0512, + "step": 834 + }, + { + "epoch": 0.7774674115456238, + "grad_norm": 1.5857281684875488, + "learning_rate": 4.836221006706982e-06, + "loss": 1.1061, + "step": 835 + }, + { + "epoch": 0.7783985102420856, + "grad_norm": 1.579249620437622, + "learning_rate": 4.8357799940717644e-06, + "loss": 1.1102, + "step": 836 + }, + { + "epoch": 0.7793296089385475, + "grad_norm": 1.4527626037597656, + "learning_rate": 4.8353384086334965e-06, + "loss": 1.1271, + "step": 837 + }, + { + "epoch": 0.7802607076350093, + "grad_norm": 1.55983304977417, + "learning_rate": 4.834896250500467e-06, + "loss": 1.097, + "step": 838 + }, + { + "epoch": 0.7811918063314711, + "grad_norm": 1.5362640619277954, + "learning_rate": 4.834453519781108e-06, + "loss": 1.0798, + "step": 839 + }, + { + "epoch": 0.7821229050279329, + "grad_norm": 1.5832512378692627, + "learning_rate": 4.83401021658399e-06, + "loss": 1.1011, + "step": 840 + }, + { + "epoch": 0.7830540037243948, + "grad_norm": 1.5125010013580322, + "learning_rate": 4.833566341017823e-06, + "loss": 1.1151, + "step": 841 + }, + { + "epoch": 0.7839851024208566, + "grad_norm": 1.5389574766159058, + "learning_rate": 4.833121893191459e-06, + "loss": 1.0744, + "step": 842 + }, + { + "epoch": 0.7849162011173184, + "grad_norm": 1.5336229801177979, + "learning_rate": 4.832676873213891e-06, + "loss": 1.0641, + "step": 843 + }, + { + "epoch": 0.7858472998137802, + "grad_norm": 1.525045394897461, + "learning_rate": 4.83223128119425e-06, + "loss": 1.1052, + "step": 844 + }, + { + "epoch": 0.7867783985102421, + "grad_norm": 1.5727417469024658, + "learning_rate": 4.831785117241809e-06, + "loss": 1.1179, + "step": 845 + }, + { + "epoch": 0.7877094972067039, + "grad_norm": 1.6233503818511963, + "learning_rate": 4.831338381465979e-06, + "loss": 1.0909, + "step": 846 + }, + { + "epoch": 0.7886405959031657, + "grad_norm": 1.517408013343811, + "learning_rate": 4.830891073976316e-06, + "loss": 1.0837, + "step": 847 + }, + { + "epoch": 0.7895716945996276, + "grad_norm": 1.5409659147262573, + "learning_rate": 4.830443194882511e-06, + "loss": 1.1028, + "step": 848 + }, + { + "epoch": 0.7905027932960894, + "grad_norm": 1.5416734218597412, + "learning_rate": 4.829994744294398e-06, + "loss": 1.0996, + "step": 849 + }, + { + "epoch": 0.7914338919925512, + "grad_norm": 1.5739552974700928, + "learning_rate": 4.82954572232195e-06, + "loss": 1.138, + "step": 850 + }, + { + "epoch": 0.792364990689013, + "grad_norm": 1.5199893712997437, + "learning_rate": 4.8290961290752825e-06, + "loss": 1.0863, + "step": 851 + }, + { + "epoch": 0.7932960893854749, + "grad_norm": 1.5755997896194458, + "learning_rate": 4.8286459646646465e-06, + "loss": 1.1103, + "step": 852 + }, + { + "epoch": 0.7942271880819367, + "grad_norm": 1.5463471412658691, + "learning_rate": 4.828195229200438e-06, + "loss": 1.0744, + "step": 853 + }, + { + "epoch": 0.7951582867783985, + "grad_norm": 1.519218921661377, + "learning_rate": 4.827743922793189e-06, + "loss": 1.0795, + "step": 854 + }, + { + "epoch": 0.7960893854748603, + "grad_norm": 1.5830116271972656, + "learning_rate": 4.827292045553574e-06, + "loss": 1.1171, + "step": 855 + }, + { + "epoch": 0.7970204841713222, + "grad_norm": 1.5137747526168823, + "learning_rate": 4.826839597592408e-06, + "loss": 1.0656, + "step": 856 + }, + { + "epoch": 0.797951582867784, + "grad_norm": 1.5891633033752441, + "learning_rate": 4.826386579020643e-06, + "loss": 1.1629, + "step": 857 + }, + { + "epoch": 0.7988826815642458, + "grad_norm": 1.5737459659576416, + "learning_rate": 4.825932989949373e-06, + "loss": 1.0798, + "step": 858 + }, + { + "epoch": 0.7998137802607076, + "grad_norm": 1.4494706392288208, + "learning_rate": 4.8254788304898335e-06, + "loss": 1.0276, + "step": 859 + }, + { + "epoch": 0.8007448789571695, + "grad_norm": 1.5734295845031738, + "learning_rate": 4.825024100753395e-06, + "loss": 1.0917, + "step": 860 + }, + { + "epoch": 0.8016759776536313, + "grad_norm": 1.5180836915969849, + "learning_rate": 4.824568800851574e-06, + "loss": 1.0714, + "step": 861 + }, + { + "epoch": 0.8026070763500931, + "grad_norm": 1.5582964420318604, + "learning_rate": 4.82411293089602e-06, + "loss": 1.1214, + "step": 862 + }, + { + "epoch": 0.8035381750465549, + "grad_norm": 1.6028590202331543, + "learning_rate": 4.823656490998529e-06, + "loss": 1.1307, + "step": 863 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 1.7554177045822144, + "learning_rate": 4.823199481271031e-06, + "loss": 1.0846, + "step": 864 + }, + { + "epoch": 0.8054003724394786, + "grad_norm": 1.5387828350067139, + "learning_rate": 4.822741901825602e-06, + "loss": 1.0349, + "step": 865 + }, + { + "epoch": 0.8063314711359404, + "grad_norm": 1.5495141744613647, + "learning_rate": 4.8222837527744514e-06, + "loss": 1.1161, + "step": 866 + }, + { + "epoch": 0.8072625698324022, + "grad_norm": 1.5196735858917236, + "learning_rate": 4.8218250342299314e-06, + "loss": 1.0591, + "step": 867 + }, + { + "epoch": 0.8081936685288641, + "grad_norm": 1.6387678384780884, + "learning_rate": 4.821365746304535e-06, + "loss": 1.0852, + "step": 868 + }, + { + "epoch": 0.8091247672253259, + "grad_norm": 1.5407074689865112, + "learning_rate": 4.8209058891108905e-06, + "loss": 1.0724, + "step": 869 + }, + { + "epoch": 0.8100558659217877, + "grad_norm": 1.6038005352020264, + "learning_rate": 4.820445462761771e-06, + "loss": 1.0933, + "step": 870 + }, + { + "epoch": 0.8109869646182495, + "grad_norm": 1.5694278478622437, + "learning_rate": 4.819984467370087e-06, + "loss": 1.077, + "step": 871 + }, + { + "epoch": 0.8119180633147114, + "grad_norm": 1.564446210861206, + "learning_rate": 4.819522903048887e-06, + "loss": 1.046, + "step": 872 + }, + { + "epoch": 0.8128491620111732, + "grad_norm": 1.5972493886947632, + "learning_rate": 4.819060769911361e-06, + "loss": 1.1444, + "step": 873 + }, + { + "epoch": 0.813780260707635, + "grad_norm": 1.5357850790023804, + "learning_rate": 4.8185980680708375e-06, + "loss": 1.1271, + "step": 874 + }, + { + "epoch": 0.8147113594040968, + "grad_norm": 1.5091062784194946, + "learning_rate": 4.818134797640785e-06, + "loss": 1.0779, + "step": 875 + }, + { + "epoch": 0.8156424581005587, + "grad_norm": 1.523606538772583, + "learning_rate": 4.817670958734812e-06, + "loss": 1.0626, + "step": 876 + }, + { + "epoch": 0.8165735567970205, + "grad_norm": 1.566107988357544, + "learning_rate": 4.8172065514666634e-06, + "loss": 1.1188, + "step": 877 + }, + { + "epoch": 0.8175046554934823, + "grad_norm": 1.4585009813308716, + "learning_rate": 4.8167415759502275e-06, + "loss": 1.0964, + "step": 878 + }, + { + "epoch": 0.8184357541899442, + "grad_norm": 1.5572326183319092, + "learning_rate": 4.8162760322995314e-06, + "loss": 1.1198, + "step": 879 + }, + { + "epoch": 0.819366852886406, + "grad_norm": 1.5219098329544067, + "learning_rate": 4.815809920628738e-06, + "loss": 1.0747, + "step": 880 + }, + { + "epoch": 0.8202979515828678, + "grad_norm": 1.5456024408340454, + "learning_rate": 4.815343241052153e-06, + "loss": 1.0782, + "step": 881 + }, + { + "epoch": 0.8212290502793296, + "grad_norm": 1.5535281896591187, + "learning_rate": 4.8148759936842196e-06, + "loss": 1.0902, + "step": 882 + }, + { + "epoch": 0.8221601489757915, + "grad_norm": 1.5116961002349854, + "learning_rate": 4.81440817863952e-06, + "loss": 1.0608, + "step": 883 + }, + { + "epoch": 0.8230912476722533, + "grad_norm": 1.5352904796600342, + "learning_rate": 4.813939796032779e-06, + "loss": 1.128, + "step": 884 + }, + { + "epoch": 0.8240223463687151, + "grad_norm": 1.5234910249710083, + "learning_rate": 4.813470845978856e-06, + "loss": 1.1008, + "step": 885 + }, + { + "epoch": 0.8249534450651769, + "grad_norm": 1.5409780740737915, + "learning_rate": 4.813001328592752e-06, + "loss": 1.0754, + "step": 886 + }, + { + "epoch": 0.8258845437616388, + "grad_norm": 1.53434419631958, + "learning_rate": 4.812531243989608e-06, + "loss": 1.1088, + "step": 887 + }, + { + "epoch": 0.8268156424581006, + "grad_norm": 1.6033190488815308, + "learning_rate": 4.8120605922847e-06, + "loss": 1.1187, + "step": 888 + }, + { + "epoch": 0.8277467411545624, + "grad_norm": 1.5325690507888794, + "learning_rate": 4.811589373593448e-06, + "loss": 1.0988, + "step": 889 + }, + { + "epoch": 0.8286778398510242, + "grad_norm": 1.487269639968872, + "learning_rate": 4.811117588031409e-06, + "loss": 1.0486, + "step": 890 + }, + { + "epoch": 0.8296089385474861, + "grad_norm": 1.545894742012024, + "learning_rate": 4.810645235714277e-06, + "loss": 1.0542, + "step": 891 + }, + { + "epoch": 0.8305400372439479, + "grad_norm": 1.5531600713729858, + "learning_rate": 4.810172316757889e-06, + "loss": 1.0589, + "step": 892 + }, + { + "epoch": 0.8314711359404097, + "grad_norm": 1.579483151435852, + "learning_rate": 4.809698831278217e-06, + "loss": 1.0866, + "step": 893 + }, + { + "epoch": 0.8324022346368715, + "grad_norm": 1.5545785427093506, + "learning_rate": 4.809224779391376e-06, + "loss": 1.1418, + "step": 894 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.6231085062026978, + "learning_rate": 4.808750161213615e-06, + "loss": 1.0997, + "step": 895 + }, + { + "epoch": 0.8342644320297952, + "grad_norm": 1.6763361692428589, + "learning_rate": 4.8082749768613275e-06, + "loss": 1.1184, + "step": 896 + }, + { + "epoch": 0.835195530726257, + "grad_norm": 1.5546660423278809, + "learning_rate": 4.807799226451041e-06, + "loss": 1.1144, + "step": 897 + }, + { + "epoch": 0.8361266294227188, + "grad_norm": 1.5605677366256714, + "learning_rate": 4.807322910099425e-06, + "loss": 1.081, + "step": 898 + }, + { + "epoch": 0.8370577281191807, + "grad_norm": 1.603249430656433, + "learning_rate": 4.806846027923284e-06, + "loss": 1.0923, + "step": 899 + }, + { + "epoch": 0.8379888268156425, + "grad_norm": 1.5507538318634033, + "learning_rate": 4.806368580039567e-06, + "loss": 1.0814, + "step": 900 + }, + { + "epoch": 0.8389199255121043, + "grad_norm": 1.527549386024475, + "learning_rate": 4.805890566565356e-06, + "loss": 1.0446, + "step": 901 + }, + { + "epoch": 0.839851024208566, + "grad_norm": 1.5054140090942383, + "learning_rate": 4.805411987617875e-06, + "loss": 1.092, + "step": 902 + }, + { + "epoch": 0.840782122905028, + "grad_norm": 1.5621106624603271, + "learning_rate": 4.804932843314487e-06, + "loss": 1.0731, + "step": 903 + }, + { + "epoch": 0.8417132216014898, + "grad_norm": 1.5620564222335815, + "learning_rate": 4.804453133772691e-06, + "loss": 1.1233, + "step": 904 + }, + { + "epoch": 0.8426443202979516, + "grad_norm": 1.5361804962158203, + "learning_rate": 4.803972859110126e-06, + "loss": 1.1221, + "step": 905 + }, + { + "epoch": 0.8435754189944135, + "grad_norm": 1.6729267835617065, + "learning_rate": 4.803492019444571e-06, + "loss": 1.1071, + "step": 906 + }, + { + "epoch": 0.8445065176908753, + "grad_norm": 1.5836952924728394, + "learning_rate": 4.80301061489394e-06, + "loss": 1.1474, + "step": 907 + }, + { + "epoch": 0.845437616387337, + "grad_norm": 1.5220308303833008, + "learning_rate": 4.8025286455762905e-06, + "loss": 1.056, + "step": 908 + }, + { + "epoch": 0.8463687150837989, + "grad_norm": 1.6178399324417114, + "learning_rate": 4.802046111609815e-06, + "loss": 1.079, + "step": 909 + }, + { + "epoch": 0.8472998137802608, + "grad_norm": 1.501449465751648, + "learning_rate": 4.8015630131128446e-06, + "loss": 1.0707, + "step": 910 + }, + { + "epoch": 0.8482309124767226, + "grad_norm": 1.543421745300293, + "learning_rate": 4.801079350203849e-06, + "loss": 1.1117, + "step": 911 + }, + { + "epoch": 0.8491620111731844, + "grad_norm": 1.4939581155776978, + "learning_rate": 4.800595123001439e-06, + "loss": 1.0927, + "step": 912 + }, + { + "epoch": 0.8500931098696461, + "grad_norm": 1.5124197006225586, + "learning_rate": 4.8001103316243585e-06, + "loss": 1.1018, + "step": 913 + }, + { + "epoch": 0.851024208566108, + "grad_norm": 1.535314917564392, + "learning_rate": 4.799624976191495e-06, + "loss": 1.1157, + "step": 914 + }, + { + "epoch": 0.8519553072625698, + "grad_norm": 1.6102347373962402, + "learning_rate": 4.799139056821872e-06, + "loss": 1.0903, + "step": 915 + }, + { + "epoch": 0.8528864059590316, + "grad_norm": 1.5451605319976807, + "learning_rate": 4.798652573634651e-06, + "loss": 1.1138, + "step": 916 + }, + { + "epoch": 0.8538175046554934, + "grad_norm": 1.6535327434539795, + "learning_rate": 4.798165526749132e-06, + "loss": 1.1009, + "step": 917 + }, + { + "epoch": 0.8547486033519553, + "grad_norm": 1.5381852388381958, + "learning_rate": 4.797677916284753e-06, + "loss": 1.0654, + "step": 918 + }, + { + "epoch": 0.8556797020484171, + "grad_norm": 1.5182949304580688, + "learning_rate": 4.7971897423610925e-06, + "loss": 1.1074, + "step": 919 + }, + { + "epoch": 0.8566108007448789, + "grad_norm": 1.5372347831726074, + "learning_rate": 4.796701005097863e-06, + "loss": 1.0757, + "step": 920 + }, + { + "epoch": 0.8575418994413407, + "grad_norm": 1.5097237825393677, + "learning_rate": 4.7962117046149205e-06, + "loss": 1.0554, + "step": 921 + }, + { + "epoch": 0.8584729981378026, + "grad_norm": 1.5212228298187256, + "learning_rate": 4.795721841032253e-06, + "loss": 1.0421, + "step": 922 + }, + { + "epoch": 0.8594040968342644, + "grad_norm": 1.6038963794708252, + "learning_rate": 4.795231414469991e-06, + "loss": 1.1119, + "step": 923 + }, + { + "epoch": 0.8603351955307262, + "grad_norm": 1.5137346982955933, + "learning_rate": 4.794740425048402e-06, + "loss": 1.0676, + "step": 924 + }, + { + "epoch": 0.861266294227188, + "grad_norm": 1.5880333185195923, + "learning_rate": 4.794248872887891e-06, + "loss": 1.1038, + "step": 925 + }, + { + "epoch": 0.8621973929236499, + "grad_norm": 1.553910732269287, + "learning_rate": 4.793756758109e-06, + "loss": 1.1233, + "step": 926 + }, + { + "epoch": 0.8631284916201117, + "grad_norm": 1.525766372680664, + "learning_rate": 4.793264080832414e-06, + "loss": 1.0828, + "step": 927 + }, + { + "epoch": 0.8640595903165735, + "grad_norm": 1.5134408473968506, + "learning_rate": 4.792770841178947e-06, + "loss": 1.1278, + "step": 928 + }, + { + "epoch": 0.8649906890130353, + "grad_norm": 1.5016069412231445, + "learning_rate": 4.792277039269561e-06, + "loss": 1.0933, + "step": 929 + }, + { + "epoch": 0.8659217877094972, + "grad_norm": 1.592409610748291, + "learning_rate": 4.791782675225348e-06, + "loss": 1.106, + "step": 930 + }, + { + "epoch": 0.866852886405959, + "grad_norm": 1.62725031375885, + "learning_rate": 4.791287749167541e-06, + "loss": 1.1183, + "step": 931 + }, + { + "epoch": 0.8677839851024208, + "grad_norm": 1.5325590372085571, + "learning_rate": 4.790792261217513e-06, + "loss": 1.0763, + "step": 932 + }, + { + "epoch": 0.8687150837988827, + "grad_norm": 1.4817781448364258, + "learning_rate": 4.790296211496769e-06, + "loss": 1.0604, + "step": 933 + }, + { + "epoch": 0.8696461824953445, + "grad_norm": 1.5212786197662354, + "learning_rate": 4.789799600126957e-06, + "loss": 1.0935, + "step": 934 + }, + { + "epoch": 0.8705772811918063, + "grad_norm": 1.580877661705017, + "learning_rate": 4.7893024272298615e-06, + "loss": 1.0778, + "step": 935 + }, + { + "epoch": 0.8715083798882681, + "grad_norm": 1.5551815032958984, + "learning_rate": 4.788804692927403e-06, + "loss": 1.0734, + "step": 936 + }, + { + "epoch": 0.87243947858473, + "grad_norm": 1.5052282810211182, + "learning_rate": 4.788306397341643e-06, + "loss": 1.0978, + "step": 937 + }, + { + "epoch": 0.8733705772811918, + "grad_norm": 1.5590263605117798, + "learning_rate": 4.7878075405947755e-06, + "loss": 1.0542, + "step": 938 + }, + { + "epoch": 0.8743016759776536, + "grad_norm": 1.5098881721496582, + "learning_rate": 4.787308122809137e-06, + "loss": 1.0664, + "step": 939 + }, + { + "epoch": 0.8752327746741154, + "grad_norm": 1.5522726774215698, + "learning_rate": 4.7868081441071975e-06, + "loss": 1.1083, + "step": 940 + }, + { + "epoch": 0.8761638733705773, + "grad_norm": 1.5374565124511719, + "learning_rate": 4.78630760461157e-06, + "loss": 1.0984, + "step": 941 + }, + { + "epoch": 0.8770949720670391, + "grad_norm": 1.546431303024292, + "learning_rate": 4.785806504445e-06, + "loss": 1.0553, + "step": 942 + }, + { + "epoch": 0.8780260707635009, + "grad_norm": 1.510360836982727, + "learning_rate": 4.7853048437303716e-06, + "loss": 1.0387, + "step": 943 + }, + { + "epoch": 0.8789571694599627, + "grad_norm": 1.5364389419555664, + "learning_rate": 4.784802622590707e-06, + "loss": 1.0961, + "step": 944 + }, + { + "epoch": 0.8798882681564246, + "grad_norm": 1.6120407581329346, + "learning_rate": 4.784299841149168e-06, + "loss": 1.0731, + "step": 945 + }, + { + "epoch": 0.8808193668528864, + "grad_norm": 1.5982757806777954, + "learning_rate": 4.78379649952905e-06, + "loss": 1.0507, + "step": 946 + }, + { + "epoch": 0.8817504655493482, + "grad_norm": 1.5438737869262695, + "learning_rate": 4.783292597853787e-06, + "loss": 1.0688, + "step": 947 + }, + { + "epoch": 0.88268156424581, + "grad_norm": 1.5605790615081787, + "learning_rate": 4.782788136246951e-06, + "loss": 1.137, + "step": 948 + }, + { + "epoch": 0.8836126629422719, + "grad_norm": 1.6020355224609375, + "learning_rate": 4.782283114832252e-06, + "loss": 1.0574, + "step": 949 + }, + { + "epoch": 0.8845437616387337, + "grad_norm": 1.6144306659698486, + "learning_rate": 4.781777533733535e-06, + "loss": 1.0832, + "step": 950 + }, + { + "epoch": 0.8854748603351955, + "grad_norm": 1.5760427713394165, + "learning_rate": 4.781271393074785e-06, + "loss": 1.1107, + "step": 951 + }, + { + "epoch": 0.8864059590316573, + "grad_norm": 1.572853684425354, + "learning_rate": 4.780764692980122e-06, + "loss": 1.1229, + "step": 952 + }, + { + "epoch": 0.8873370577281192, + "grad_norm": 1.629422903060913, + "learning_rate": 4.780257433573804e-06, + "loss": 1.0583, + "step": 953 + }, + { + "epoch": 0.888268156424581, + "grad_norm": 1.4656858444213867, + "learning_rate": 4.779749614980225e-06, + "loss": 1.052, + "step": 954 + }, + { + "epoch": 0.8891992551210428, + "grad_norm": 1.571095585823059, + "learning_rate": 4.77924123732392e-06, + "loss": 1.0923, + "step": 955 + }, + { + "epoch": 0.8901303538175046, + "grad_norm": 1.621502161026001, + "learning_rate": 4.7787323007295575e-06, + "loss": 1.086, + "step": 956 + }, + { + "epoch": 0.8910614525139665, + "grad_norm": 1.5676844120025635, + "learning_rate": 4.778222805321942e-06, + "loss": 1.0927, + "step": 957 + }, + { + "epoch": 0.8919925512104283, + "grad_norm": 1.539885401725769, + "learning_rate": 4.777712751226019e-06, + "loss": 1.0733, + "step": 958 + }, + { + "epoch": 0.8929236499068901, + "grad_norm": 1.4780341386795044, + "learning_rate": 4.777202138566869e-06, + "loss": 1.0664, + "step": 959 + }, + { + "epoch": 0.8938547486033519, + "grad_norm": 1.502509593963623, + "learning_rate": 4.776690967469708e-06, + "loss": 1.0859, + "step": 960 + }, + { + "epoch": 0.8947858472998138, + "grad_norm": 1.5148625373840332, + "learning_rate": 4.7761792380598916e-06, + "loss": 1.0833, + "step": 961 + }, + { + "epoch": 0.8957169459962756, + "grad_norm": 1.5759106874465942, + "learning_rate": 4.775666950462911e-06, + "loss": 1.1062, + "step": 962 + }, + { + "epoch": 0.8966480446927374, + "grad_norm": 1.485351324081421, + "learning_rate": 4.775154104804393e-06, + "loss": 1.0503, + "step": 963 + }, + { + "epoch": 0.8975791433891993, + "grad_norm": 1.509264349937439, + "learning_rate": 4.774640701210106e-06, + "loss": 1.104, + "step": 964 + }, + { + "epoch": 0.8985102420856611, + "grad_norm": 1.5137369632720947, + "learning_rate": 4.77412673980595e-06, + "loss": 1.0921, + "step": 965 + }, + { + "epoch": 0.8994413407821229, + "grad_norm": 1.502916932106018, + "learning_rate": 4.773612220717962e-06, + "loss": 1.1148, + "step": 966 + }, + { + "epoch": 0.9003724394785847, + "grad_norm": 1.5210751295089722, + "learning_rate": 4.7730971440723196e-06, + "loss": 1.0766, + "step": 967 + }, + { + "epoch": 0.9013035381750466, + "grad_norm": 1.5958471298217773, + "learning_rate": 4.7725815099953344e-06, + "loss": 1.11, + "step": 968 + }, + { + "epoch": 0.9022346368715084, + "grad_norm": 1.5454305410385132, + "learning_rate": 4.772065318613456e-06, + "loss": 1.0933, + "step": 969 + }, + { + "epoch": 0.9031657355679702, + "grad_norm": 1.5663084983825684, + "learning_rate": 4.771548570053268e-06, + "loss": 1.0882, + "step": 970 + }, + { + "epoch": 0.904096834264432, + "grad_norm": 1.4800734519958496, + "learning_rate": 4.771031264441494e-06, + "loss": 1.06, + "step": 971 + }, + { + "epoch": 0.9050279329608939, + "grad_norm": 1.574017882347107, + "learning_rate": 4.770513401904994e-06, + "loss": 1.0682, + "step": 972 + }, + { + "epoch": 0.9059590316573557, + "grad_norm": 1.4770042896270752, + "learning_rate": 4.76999498257076e-06, + "loss": 1.0528, + "step": 973 + }, + { + "epoch": 0.9068901303538175, + "grad_norm": 1.5239113569259644, + "learning_rate": 4.7694760065659275e-06, + "loss": 1.1145, + "step": 974 + }, + { + "epoch": 0.9078212290502793, + "grad_norm": 1.4944844245910645, + "learning_rate": 4.768956474017763e-06, + "loss": 1.0629, + "step": 975 + }, + { + "epoch": 0.9087523277467412, + "grad_norm": 1.534879207611084, + "learning_rate": 4.7684363850536715e-06, + "loss": 1.0944, + "step": 976 + }, + { + "epoch": 0.909683426443203, + "grad_norm": 1.5772992372512817, + "learning_rate": 4.767915739801194e-06, + "loss": 1.1354, + "step": 977 + }, + { + "epoch": 0.9106145251396648, + "grad_norm": 1.5922162532806396, + "learning_rate": 4.76739453838801e-06, + "loss": 1.0478, + "step": 978 + }, + { + "epoch": 0.9115456238361266, + "grad_norm": 1.4816398620605469, + "learning_rate": 4.766872780941933e-06, + "loss": 1.0951, + "step": 979 + }, + { + "epoch": 0.9124767225325885, + "grad_norm": 1.4929137229919434, + "learning_rate": 4.766350467590911e-06, + "loss": 1.0562, + "step": 980 + }, + { + "epoch": 0.9134078212290503, + "grad_norm": 1.577746868133545, + "learning_rate": 4.7658275984630345e-06, + "loss": 1.0923, + "step": 981 + }, + { + "epoch": 0.9143389199255121, + "grad_norm": 1.5916051864624023, + "learning_rate": 4.765304173686525e-06, + "loss": 1.0723, + "step": 982 + }, + { + "epoch": 0.9152700186219739, + "grad_norm": 1.5880569219589233, + "learning_rate": 4.764780193389741e-06, + "loss": 1.0696, + "step": 983 + }, + { + "epoch": 0.9162011173184358, + "grad_norm": 1.5580594539642334, + "learning_rate": 4.764255657701179e-06, + "loss": 1.0591, + "step": 984 + }, + { + "epoch": 0.9171322160148976, + "grad_norm": 1.6069998741149902, + "learning_rate": 4.763730566749472e-06, + "loss": 1.0753, + "step": 985 + }, + { + "epoch": 0.9180633147113594, + "grad_norm": 1.6108243465423584, + "learning_rate": 4.763204920663386e-06, + "loss": 1.0907, + "step": 986 + }, + { + "epoch": 0.9189944134078212, + "grad_norm": 1.5411450862884521, + "learning_rate": 4.762678719571826e-06, + "loss": 1.0865, + "step": 987 + }, + { + "epoch": 0.9199255121042831, + "grad_norm": 1.5383867025375366, + "learning_rate": 4.762151963603832e-06, + "loss": 1.0598, + "step": 988 + }, + { + "epoch": 0.9208566108007449, + "grad_norm": 1.537736177444458, + "learning_rate": 4.761624652888581e-06, + "loss": 1.0907, + "step": 989 + }, + { + "epoch": 0.9217877094972067, + "grad_norm": 1.5310419797897339, + "learning_rate": 4.761096787555385e-06, + "loss": 1.1208, + "step": 990 + }, + { + "epoch": 0.9227188081936686, + "grad_norm": 1.5367666482925415, + "learning_rate": 4.760568367733691e-06, + "loss": 1.1241, + "step": 991 + }, + { + "epoch": 0.9236499068901304, + "grad_norm": 1.493673324584961, + "learning_rate": 4.7600393935530865e-06, + "loss": 1.0776, + "step": 992 + }, + { + "epoch": 0.9245810055865922, + "grad_norm": 1.5405055284500122, + "learning_rate": 4.759509865143289e-06, + "loss": 1.1081, + "step": 993 + }, + { + "epoch": 0.925512104283054, + "grad_norm": 1.518207311630249, + "learning_rate": 4.758979782634155e-06, + "loss": 1.0798, + "step": 994 + }, + { + "epoch": 0.9264432029795159, + "grad_norm": 1.4959720373153687, + "learning_rate": 4.758449146155677e-06, + "loss": 1.0789, + "step": 995 + }, + { + "epoch": 0.9273743016759777, + "grad_norm": 1.509627103805542, + "learning_rate": 4.757917955837984e-06, + "loss": 1.1218, + "step": 996 + }, + { + "epoch": 0.9283054003724395, + "grad_norm": 1.5125606060028076, + "learning_rate": 4.757386211811338e-06, + "loss": 1.0951, + "step": 997 + }, + { + "epoch": 0.9292364990689013, + "grad_norm": 1.458593726158142, + "learning_rate": 4.7568539142061395e-06, + "loss": 1.0851, + "step": 998 + }, + { + "epoch": 0.9301675977653632, + "grad_norm": 1.560383677482605, + "learning_rate": 4.756321063152924e-06, + "loss": 1.0952, + "step": 999 + }, + { + "epoch": 0.931098696461825, + "grad_norm": 1.5245909690856934, + "learning_rate": 4.755787658782361e-06, + "loss": 1.0847, + "step": 1000 + }, + { + "epoch": 0.9320297951582868, + "grad_norm": 1.5548995733261108, + "learning_rate": 4.755253701225259e-06, + "loss": 1.0239, + "step": 1001 + }, + { + "epoch": 0.9329608938547486, + "grad_norm": 1.5523535013198853, + "learning_rate": 4.754719190612559e-06, + "loss": 1.1271, + "step": 1002 + }, + { + "epoch": 0.9338919925512105, + "grad_norm": 1.4968879222869873, + "learning_rate": 4.75418412707534e-06, + "loss": 1.0334, + "step": 1003 + }, + { + "epoch": 0.9348230912476723, + "grad_norm": 1.5370330810546875, + "learning_rate": 4.753648510744815e-06, + "loss": 1.0667, + "step": 1004 + }, + { + "epoch": 0.9357541899441341, + "grad_norm": 1.5096780061721802, + "learning_rate": 4.753112341752333e-06, + "loss": 1.0667, + "step": 1005 + }, + { + "epoch": 0.9366852886405959, + "grad_norm": 1.585119366645813, + "learning_rate": 4.752575620229379e-06, + "loss": 1.0813, + "step": 1006 + }, + { + "epoch": 0.9376163873370578, + "grad_norm": 1.5708082914352417, + "learning_rate": 4.752038346307573e-06, + "loss": 1.0632, + "step": 1007 + }, + { + "epoch": 0.9385474860335196, + "grad_norm": 1.6266708374023438, + "learning_rate": 4.751500520118671e-06, + "loss": 1.1227, + "step": 1008 + }, + { + "epoch": 0.9394785847299814, + "grad_norm": 1.5830202102661133, + "learning_rate": 4.750962141794565e-06, + "loss": 1.0873, + "step": 1009 + }, + { + "epoch": 0.9404096834264432, + "grad_norm": 1.624855399131775, + "learning_rate": 4.750423211467278e-06, + "loss": 1.0675, + "step": 1010 + }, + { + "epoch": 0.9413407821229051, + "grad_norm": 1.5412731170654297, + "learning_rate": 4.749883729268975e-06, + "loss": 1.0973, + "step": 1011 + }, + { + "epoch": 0.9422718808193669, + "grad_norm": 1.4659055471420288, + "learning_rate": 4.749343695331952e-06, + "loss": 1.0657, + "step": 1012 + }, + { + "epoch": 0.9432029795158287, + "grad_norm": 1.5228955745697021, + "learning_rate": 4.748803109788642e-06, + "loss": 1.042, + "step": 1013 + }, + { + "epoch": 0.9441340782122905, + "grad_norm": 1.5122032165527344, + "learning_rate": 4.748261972771612e-06, + "loss": 1.0389, + "step": 1014 + }, + { + "epoch": 0.9450651769087524, + "grad_norm": 1.5943773984909058, + "learning_rate": 4.747720284413565e-06, + "loss": 1.1077, + "step": 1015 + }, + { + "epoch": 0.9459962756052142, + "grad_norm": 1.5315780639648438, + "learning_rate": 4.74717804484734e-06, + "loss": 1.0354, + "step": 1016 + }, + { + "epoch": 0.946927374301676, + "grad_norm": 1.5775771141052246, + "learning_rate": 4.74663525420591e-06, + "loss": 1.0711, + "step": 1017 + }, + { + "epoch": 0.9478584729981379, + "grad_norm": 1.4658482074737549, + "learning_rate": 4.7460919126223825e-06, + "loss": 1.0556, + "step": 1018 + }, + { + "epoch": 0.9487895716945997, + "grad_norm": 1.5183653831481934, + "learning_rate": 4.745548020230003e-06, + "loss": 1.1033, + "step": 1019 + }, + { + "epoch": 0.9497206703910615, + "grad_norm": 1.5640509128570557, + "learning_rate": 4.745003577162148e-06, + "loss": 1.0281, + "step": 1020 + }, + { + "epoch": 0.9506517690875232, + "grad_norm": 1.539595603942871, + "learning_rate": 4.7444585835523335e-06, + "loss": 1.0728, + "step": 1021 + }, + { + "epoch": 0.9515828677839852, + "grad_norm": 1.4784270524978638, + "learning_rate": 4.743913039534206e-06, + "loss": 1.1092, + "step": 1022 + }, + { + "epoch": 0.952513966480447, + "grad_norm": 1.55125892162323, + "learning_rate": 4.74336694524155e-06, + "loss": 1.0749, + "step": 1023 + }, + { + "epoch": 0.9534450651769087, + "grad_norm": 1.5337117910385132, + "learning_rate": 4.7428203008082855e-06, + "loss": 1.0731, + "step": 1024 + }, + { + "epoch": 0.9543761638733705, + "grad_norm": 1.5211478471755981, + "learning_rate": 4.742273106368464e-06, + "loss": 1.0597, + "step": 1025 + }, + { + "epoch": 0.9553072625698324, + "grad_norm": 1.4872453212738037, + "learning_rate": 4.741725362056275e-06, + "loss": 1.0439, + "step": 1026 + }, + { + "epoch": 0.9562383612662942, + "grad_norm": 1.5244958400726318, + "learning_rate": 4.741177068006042e-06, + "loss": 1.0959, + "step": 1027 + }, + { + "epoch": 0.957169459962756, + "grad_norm": 1.8209069967269897, + "learning_rate": 4.7406282243522224e-06, + "loss": 1.1022, + "step": 1028 + }, + { + "epoch": 0.9581005586592178, + "grad_norm": 1.5457854270935059, + "learning_rate": 4.740078831229408e-06, + "loss": 1.0691, + "step": 1029 + }, + { + "epoch": 0.9590316573556797, + "grad_norm": 1.4670771360397339, + "learning_rate": 4.7395288887723304e-06, + "loss": 1.0195, + "step": 1030 + }, + { + "epoch": 0.9599627560521415, + "grad_norm": 1.5621514320373535, + "learning_rate": 4.738978397115848e-06, + "loss": 1.0913, + "step": 1031 + }, + { + "epoch": 0.9608938547486033, + "grad_norm": 1.5232383012771606, + "learning_rate": 4.738427356394959e-06, + "loss": 1.0054, + "step": 1032 + }, + { + "epoch": 0.9618249534450651, + "grad_norm": 1.5279157161712646, + "learning_rate": 4.737875766744795e-06, + "loss": 1.0832, + "step": 1033 + }, + { + "epoch": 0.962756052141527, + "grad_norm": 1.6728878021240234, + "learning_rate": 4.737323628300622e-06, + "loss": 1.121, + "step": 1034 + }, + { + "epoch": 0.9636871508379888, + "grad_norm": 1.5027371644973755, + "learning_rate": 4.736770941197842e-06, + "loss": 1.0849, + "step": 1035 + }, + { + "epoch": 0.9646182495344506, + "grad_norm": 1.6248314380645752, + "learning_rate": 4.736217705571989e-06, + "loss": 1.0889, + "step": 1036 + }, + { + "epoch": 0.9655493482309124, + "grad_norm": 1.536523699760437, + "learning_rate": 4.735663921558734e-06, + "loss": 1.0619, + "step": 1037 + }, + { + "epoch": 0.9664804469273743, + "grad_norm": 1.6086184978485107, + "learning_rate": 4.735109589293881e-06, + "loss": 1.1171, + "step": 1038 + }, + { + "epoch": 0.9674115456238361, + "grad_norm": 1.5546424388885498, + "learning_rate": 4.734554708913368e-06, + "loss": 1.0684, + "step": 1039 + }, + { + "epoch": 0.9683426443202979, + "grad_norm": 1.589662790298462, + "learning_rate": 4.73399928055327e-06, + "loss": 1.1086, + "step": 1040 + }, + { + "epoch": 0.9692737430167597, + "grad_norm": 1.4940744638442993, + "learning_rate": 4.733443304349793e-06, + "loss": 1.0359, + "step": 1041 + }, + { + "epoch": 0.9702048417132216, + "grad_norm": 1.6239373683929443, + "learning_rate": 4.7328867804392805e-06, + "loss": 1.0987, + "step": 1042 + }, + { + "epoch": 0.9711359404096834, + "grad_norm": 1.4926732778549194, + "learning_rate": 4.732329708958208e-06, + "loss": 1.0538, + "step": 1043 + }, + { + "epoch": 0.9720670391061452, + "grad_norm": 1.48908269405365, + "learning_rate": 4.731772090043184e-06, + "loss": 1.0165, + "step": 1044 + }, + { + "epoch": 0.972998137802607, + "grad_norm": 1.4550187587738037, + "learning_rate": 4.7312139238309574e-06, + "loss": 1.0609, + "step": 1045 + }, + { + "epoch": 0.9739292364990689, + "grad_norm": 1.53258216381073, + "learning_rate": 4.730655210458404e-06, + "loss": 1.0263, + "step": 1046 + }, + { + "epoch": 0.9748603351955307, + "grad_norm": 1.5673062801361084, + "learning_rate": 4.730095950062539e-06, + "loss": 1.0963, + "step": 1047 + }, + { + "epoch": 0.9757914338919925, + "grad_norm": 1.6262962818145752, + "learning_rate": 4.729536142780507e-06, + "loss": 1.0938, + "step": 1048 + }, + { + "epoch": 0.9767225325884544, + "grad_norm": 1.5321420431137085, + "learning_rate": 4.7289757887495935e-06, + "loss": 1.0464, + "step": 1049 + }, + { + "epoch": 0.9776536312849162, + "grad_norm": 1.6310549974441528, + "learning_rate": 4.728414888107211e-06, + "loss": 1.0772, + "step": 1050 + }, + { + "epoch": 0.978584729981378, + "grad_norm": 1.585007667541504, + "learning_rate": 4.7278534409909106e-06, + "loss": 1.0536, + "step": 1051 + }, + { + "epoch": 0.9795158286778398, + "grad_norm": 1.567671298980713, + "learning_rate": 4.727291447538375e-06, + "loss": 1.0715, + "step": 1052 + }, + { + "epoch": 0.9804469273743017, + "grad_norm": 1.5334957838058472, + "learning_rate": 4.726728907887422e-06, + "loss": 1.0591, + "step": 1053 + }, + { + "epoch": 0.9813780260707635, + "grad_norm": 1.603265643119812, + "learning_rate": 4.726165822176003e-06, + "loss": 1.1033, + "step": 1054 + }, + { + "epoch": 0.9823091247672253, + "grad_norm": 1.4996964931488037, + "learning_rate": 4.725602190542204e-06, + "loss": 1.0485, + "step": 1055 + }, + { + "epoch": 0.9832402234636871, + "grad_norm": 1.5344486236572266, + "learning_rate": 4.725038013124245e-06, + "loss": 1.0936, + "step": 1056 + }, + { + "epoch": 0.984171322160149, + "grad_norm": 1.5993379354476929, + "learning_rate": 4.724473290060477e-06, + "loss": 1.0931, + "step": 1057 + }, + { + "epoch": 0.9851024208566108, + "grad_norm": 1.5253984928131104, + "learning_rate": 4.7239080214893885e-06, + "loss": 1.1228, + "step": 1058 + }, + { + "epoch": 0.9860335195530726, + "grad_norm": 1.5465527772903442, + "learning_rate": 4.7233422075496e-06, + "loss": 1.0769, + "step": 1059 + }, + { + "epoch": 0.9869646182495344, + "grad_norm": 1.5549031496047974, + "learning_rate": 4.722775848379866e-06, + "loss": 1.0596, + "step": 1060 + }, + { + "epoch": 0.9878957169459963, + "grad_norm": 1.5188418626785278, + "learning_rate": 4.722208944119075e-06, + "loss": 1.082, + "step": 1061 + }, + { + "epoch": 0.9888268156424581, + "grad_norm": 1.4955778121948242, + "learning_rate": 4.721641494906247e-06, + "loss": 1.0732, + "step": 1062 + }, + { + "epoch": 0.9897579143389199, + "grad_norm": 1.6066385507583618, + "learning_rate": 4.7210735008805395e-06, + "loss": 1.098, + "step": 1063 + }, + { + "epoch": 0.9906890130353817, + "grad_norm": 1.5475724935531616, + "learning_rate": 4.720504962181241e-06, + "loss": 1.0925, + "step": 1064 + }, + { + "epoch": 0.9916201117318436, + "grad_norm": 1.5275098085403442, + "learning_rate": 4.719935878947775e-06, + "loss": 1.1194, + "step": 1065 + }, + { + "epoch": 0.9925512104283054, + "grad_norm": 1.5309367179870605, + "learning_rate": 4.719366251319696e-06, + "loss": 1.0911, + "step": 1066 + }, + { + "epoch": 0.9934823091247672, + "grad_norm": 1.573983907699585, + "learning_rate": 4.718796079436696e-06, + "loss": 1.0641, + "step": 1067 + }, + { + "epoch": 0.994413407821229, + "grad_norm": 1.5268535614013672, + "learning_rate": 4.718225363438595e-06, + "loss": 1.079, + "step": 1068 + }, + { + "epoch": 0.9953445065176909, + "grad_norm": 1.4826828241348267, + "learning_rate": 4.717654103465354e-06, + "loss": 1.0295, + "step": 1069 + }, + { + "epoch": 0.9962756052141527, + "grad_norm": 1.4942678213119507, + "learning_rate": 4.717082299657058e-06, + "loss": 1.0894, + "step": 1070 + }, + { + "epoch": 0.9972067039106145, + "grad_norm": 1.5466221570968628, + "learning_rate": 4.716509952153934e-06, + "loss": 1.0756, + "step": 1071 + }, + { + "epoch": 0.9981378026070763, + "grad_norm": 1.5310014486312866, + "learning_rate": 4.715937061096337e-06, + "loss": 1.099, + "step": 1072 + }, + { + "epoch": 0.9990689013035382, + "grad_norm": 1.5186617374420166, + "learning_rate": 4.7153636266247586e-06, + "loss": 1.1122, + "step": 1073 + }, + { + "epoch": 1.0, + "grad_norm": 1.5495269298553467, + "learning_rate": 4.71478964887982e-06, + "loss": 1.1152, + "step": 1074 + }, + { + "epoch": 1.000931098696462, + "grad_norm": 1.487399935722351, + "learning_rate": 4.714215128002279e-06, + "loss": 1.049, + "step": 1075 + }, + { + "epoch": 1.0018621973929236, + "grad_norm": 1.4866782426834106, + "learning_rate": 4.7136400641330245e-06, + "loss": 1.0436, + "step": 1076 + }, + { + "epoch": 1.0027932960893855, + "grad_norm": 1.4660440683364868, + "learning_rate": 4.713064457413081e-06, + "loss": 1.0009, + "step": 1077 + }, + { + "epoch": 1.0037243947858474, + "grad_norm": 1.575087070465088, + "learning_rate": 4.712488307983603e-06, + "loss": 1.0632, + "step": 1078 + }, + { + "epoch": 1.004655493482309, + "grad_norm": 1.6018379926681519, + "learning_rate": 4.7119116159858795e-06, + "loss": 1.0791, + "step": 1079 + }, + { + "epoch": 1.005586592178771, + "grad_norm": 1.611703634262085, + "learning_rate": 4.711334381561333e-06, + "loss": 1.0551, + "step": 1080 + }, + { + "epoch": 1.0065176908752327, + "grad_norm": 1.5527249574661255, + "learning_rate": 4.710756604851519e-06, + "loss": 1.0131, + "step": 1081 + }, + { + "epoch": 1.0074487895716946, + "grad_norm": 1.5393624305725098, + "learning_rate": 4.710178285998125e-06, + "loss": 1.0383, + "step": 1082 + }, + { + "epoch": 1.0083798882681565, + "grad_norm": 1.5630712509155273, + "learning_rate": 4.709599425142973e-06, + "loss": 1.0142, + "step": 1083 + }, + { + "epoch": 1.0093109869646182, + "grad_norm": 1.448293924331665, + "learning_rate": 4.709020022428016e-06, + "loss": 0.9501, + "step": 1084 + }, + { + "epoch": 1.01024208566108, + "grad_norm": 1.540922999382019, + "learning_rate": 4.70844007799534e-06, + "loss": 1.0014, + "step": 1085 + }, + { + "epoch": 1.011173184357542, + "grad_norm": 1.4855278730392456, + "learning_rate": 4.707859591987167e-06, + "loss": 1.0071, + "step": 1086 + }, + { + "epoch": 1.0121042830540037, + "grad_norm": 1.5268491506576538, + "learning_rate": 4.707278564545849e-06, + "loss": 1.0471, + "step": 1087 + }, + { + "epoch": 1.0130353817504656, + "grad_norm": 1.5671162605285645, + "learning_rate": 4.706696995813869e-06, + "loss": 1.0394, + "step": 1088 + }, + { + "epoch": 1.0139664804469273, + "grad_norm": 1.5690945386886597, + "learning_rate": 4.706114885933847e-06, + "loss": 1.0595, + "step": 1089 + }, + { + "epoch": 1.0148975791433892, + "grad_norm": 1.4803555011749268, + "learning_rate": 4.705532235048534e-06, + "loss": 1.0331, + "step": 1090 + }, + { + "epoch": 1.015828677839851, + "grad_norm": 1.6176484823226929, + "learning_rate": 4.7049490433008125e-06, + "loss": 0.9892, + "step": 1091 + }, + { + "epoch": 1.0167597765363128, + "grad_norm": 1.5290471315383911, + "learning_rate": 4.7043653108337e-06, + "loss": 0.9948, + "step": 1092 + }, + { + "epoch": 1.0176908752327747, + "grad_norm": 1.6027026176452637, + "learning_rate": 4.703781037790342e-06, + "loss": 1.0675, + "step": 1093 + }, + { + "epoch": 1.0186219739292366, + "grad_norm": 1.58138108253479, + "learning_rate": 4.703196224314023e-06, + "loss": 1.0456, + "step": 1094 + }, + { + "epoch": 1.0195530726256983, + "grad_norm": 1.620977520942688, + "learning_rate": 4.702610870548155e-06, + "loss": 1.0387, + "step": 1095 + }, + { + "epoch": 1.0204841713221602, + "grad_norm": 1.5576655864715576, + "learning_rate": 4.702024976636286e-06, + "loss": 1.0722, + "step": 1096 + }, + { + "epoch": 1.0214152700186219, + "grad_norm": 1.4967273473739624, + "learning_rate": 4.701438542722092e-06, + "loss": 1.0387, + "step": 1097 + }, + { + "epoch": 1.0223463687150838, + "grad_norm": 1.5341986417770386, + "learning_rate": 4.700851568949386e-06, + "loss": 1.0523, + "step": 1098 + }, + { + "epoch": 1.0232774674115457, + "grad_norm": 1.54250168800354, + "learning_rate": 4.70026405546211e-06, + "loss": 0.9612, + "step": 1099 + }, + { + "epoch": 1.0242085661080074, + "grad_norm": 1.5784211158752441, + "learning_rate": 4.699676002404342e-06, + "loss": 1.0184, + "step": 1100 + }, + { + "epoch": 1.0251396648044693, + "grad_norm": 1.5577175617218018, + "learning_rate": 4.699087409920289e-06, + "loss": 1.0436, + "step": 1101 + }, + { + "epoch": 1.0260707635009312, + "grad_norm": 1.5604437589645386, + "learning_rate": 4.698498278154291e-06, + "loss": 1.02, + "step": 1102 + }, + { + "epoch": 1.0270018621973929, + "grad_norm": 1.577594518661499, + "learning_rate": 4.697908607250822e-06, + "loss": 1.0243, + "step": 1103 + }, + { + "epoch": 1.0279329608938548, + "grad_norm": 1.5273542404174805, + "learning_rate": 4.6973183973544854e-06, + "loss": 1.0192, + "step": 1104 + }, + { + "epoch": 1.0288640595903167, + "grad_norm": 1.6009721755981445, + "learning_rate": 4.69672764861002e-06, + "loss": 1.0125, + "step": 1105 + }, + { + "epoch": 1.0297951582867784, + "grad_norm": 1.5656324625015259, + "learning_rate": 4.696136361162293e-06, + "loss": 1.0289, + "step": 1106 + }, + { + "epoch": 1.0307262569832403, + "grad_norm": 1.5725963115692139, + "learning_rate": 4.695544535156308e-06, + "loss": 1.0008, + "step": 1107 + }, + { + "epoch": 1.031657355679702, + "grad_norm": 1.5703850984573364, + "learning_rate": 4.694952170737197e-06, + "loss": 1.0054, + "step": 1108 + }, + { + "epoch": 1.0325884543761639, + "grad_norm": 1.5812164545059204, + "learning_rate": 4.694359268050225e-06, + "loss": 1.0616, + "step": 1109 + }, + { + "epoch": 1.0335195530726258, + "grad_norm": 1.4778999090194702, + "learning_rate": 4.693765827240791e-06, + "loss": 0.9779, + "step": 1110 + }, + { + "epoch": 1.0344506517690875, + "grad_norm": 1.5594961643218994, + "learning_rate": 4.693171848454423e-06, + "loss": 1.0284, + "step": 1111 + }, + { + "epoch": 1.0353817504655494, + "grad_norm": 1.4904820919036865, + "learning_rate": 4.692577331836784e-06, + "loss": 0.996, + "step": 1112 + }, + { + "epoch": 1.0363128491620113, + "grad_norm": 1.5186986923217773, + "learning_rate": 4.691982277533665e-06, + "loss": 1.0641, + "step": 1113 + }, + { + "epoch": 1.037243947858473, + "grad_norm": 1.5642173290252686, + "learning_rate": 4.691386685690993e-06, + "loss": 1.0516, + "step": 1114 + }, + { + "epoch": 1.0381750465549349, + "grad_norm": 1.5472195148468018, + "learning_rate": 4.690790556454824e-06, + "loss": 1.0251, + "step": 1115 + }, + { + "epoch": 1.0391061452513966, + "grad_norm": 1.57706880569458, + "learning_rate": 4.690193889971346e-06, + "loss": 1.0486, + "step": 1116 + }, + { + "epoch": 1.0400372439478585, + "grad_norm": 1.5180350542068481, + "learning_rate": 4.689596686386882e-06, + "loss": 1.0376, + "step": 1117 + }, + { + "epoch": 1.0409683426443204, + "grad_norm": 1.584374189376831, + "learning_rate": 4.688998945847881e-06, + "loss": 0.9991, + "step": 1118 + }, + { + "epoch": 1.041899441340782, + "grad_norm": 1.5857142210006714, + "learning_rate": 4.6884006685009295e-06, + "loss": 0.995, + "step": 1119 + }, + { + "epoch": 1.042830540037244, + "grad_norm": 1.5227071046829224, + "learning_rate": 4.6878018544927415e-06, + "loss": 1.0097, + "step": 1120 + }, + { + "epoch": 1.0437616387337059, + "grad_norm": 1.551161289215088, + "learning_rate": 4.687202503970165e-06, + "loss": 1.0228, + "step": 1121 + }, + { + "epoch": 1.0446927374301676, + "grad_norm": 1.5544378757476807, + "learning_rate": 4.686602617080177e-06, + "loss": 1.0527, + "step": 1122 + }, + { + "epoch": 1.0456238361266295, + "grad_norm": 1.5288114547729492, + "learning_rate": 4.68600219396989e-06, + "loss": 1.006, + "step": 1123 + }, + { + "epoch": 1.0465549348230911, + "grad_norm": 1.5617018938064575, + "learning_rate": 4.685401234786544e-06, + "loss": 1.0623, + "step": 1124 + }, + { + "epoch": 1.047486033519553, + "grad_norm": 1.4837076663970947, + "learning_rate": 4.6847997396775125e-06, + "loss": 0.9941, + "step": 1125 + }, + { + "epoch": 1.048417132216015, + "grad_norm": 1.5589977502822876, + "learning_rate": 4.6841977087903e-06, + "loss": 1.0266, + "step": 1126 + }, + { + "epoch": 1.0493482309124766, + "grad_norm": 1.542898178100586, + "learning_rate": 4.683595142272544e-06, + "loss": 1.0045, + "step": 1127 + }, + { + "epoch": 1.0502793296089385, + "grad_norm": 1.5327948331832886, + "learning_rate": 4.682992040272008e-06, + "loss": 0.9836, + "step": 1128 + }, + { + "epoch": 1.0512104283054005, + "grad_norm": 1.5851460695266724, + "learning_rate": 4.682388402936595e-06, + "loss": 1.0505, + "step": 1129 + }, + { + "epoch": 1.0521415270018621, + "grad_norm": 1.578179121017456, + "learning_rate": 4.6817842304143325e-06, + "loss": 1.0409, + "step": 1130 + }, + { + "epoch": 1.053072625698324, + "grad_norm": 1.5836224555969238, + "learning_rate": 4.681179522853383e-06, + "loss": 1.0505, + "step": 1131 + }, + { + "epoch": 1.0540037243947857, + "grad_norm": 1.5516797304153442, + "learning_rate": 4.680574280402037e-06, + "loss": 0.9993, + "step": 1132 + }, + { + "epoch": 1.0549348230912476, + "grad_norm": 1.5390299558639526, + "learning_rate": 4.67996850320872e-06, + "loss": 0.9907, + "step": 1133 + }, + { + "epoch": 1.0558659217877095, + "grad_norm": 1.6459771394729614, + "learning_rate": 4.679362191421984e-06, + "loss": 1.0276, + "step": 1134 + }, + { + "epoch": 1.0567970204841712, + "grad_norm": 1.6367114782333374, + "learning_rate": 4.678755345190517e-06, + "loss": 1.0565, + "step": 1135 + }, + { + "epoch": 1.0577281191806331, + "grad_norm": 1.5795568227767944, + "learning_rate": 4.678147964663137e-06, + "loss": 0.9907, + "step": 1136 + }, + { + "epoch": 1.058659217877095, + "grad_norm": 1.6172724962234497, + "learning_rate": 4.6775400499887894e-06, + "loss": 1.0417, + "step": 1137 + }, + { + "epoch": 1.0595903165735567, + "grad_norm": 1.5257259607315063, + "learning_rate": 4.676931601316553e-06, + "loss": 1.0324, + "step": 1138 + }, + { + "epoch": 1.0605214152700186, + "grad_norm": 1.5546989440917969, + "learning_rate": 4.67632261879564e-06, + "loss": 1.0049, + "step": 1139 + }, + { + "epoch": 1.0614525139664805, + "grad_norm": 1.5146756172180176, + "learning_rate": 4.675713102575389e-06, + "loss": 1.0025, + "step": 1140 + }, + { + "epoch": 1.0623836126629422, + "grad_norm": 1.5709218978881836, + "learning_rate": 4.675103052805271e-06, + "loss": 1.027, + "step": 1141 + }, + { + "epoch": 1.0633147113594041, + "grad_norm": 1.533260703086853, + "learning_rate": 4.6744924696348906e-06, + "loss": 1.0348, + "step": 1142 + }, + { + "epoch": 1.0642458100558658, + "grad_norm": 1.5131194591522217, + "learning_rate": 4.67388135321398e-06, + "loss": 1.0271, + "step": 1143 + }, + { + "epoch": 1.0651769087523277, + "grad_norm": 1.5011087656021118, + "learning_rate": 4.673269703692403e-06, + "loss": 1.0093, + "step": 1144 + }, + { + "epoch": 1.0661080074487896, + "grad_norm": 1.5335296392440796, + "learning_rate": 4.672657521220155e-06, + "loss": 1.0285, + "step": 1145 + }, + { + "epoch": 1.0670391061452513, + "grad_norm": 1.5683685541152954, + "learning_rate": 4.67204480594736e-06, + "loss": 1.0436, + "step": 1146 + }, + { + "epoch": 1.0679702048417132, + "grad_norm": 1.5955976247787476, + "learning_rate": 4.671431558024276e-06, + "loss": 1.0619, + "step": 1147 + }, + { + "epoch": 1.0689013035381751, + "grad_norm": 1.5921692848205566, + "learning_rate": 4.670817777601289e-06, + "loss": 1.0398, + "step": 1148 + }, + { + "epoch": 1.0698324022346368, + "grad_norm": 1.5250576734542847, + "learning_rate": 4.670203464828915e-06, + "loss": 1.0437, + "step": 1149 + }, + { + "epoch": 1.0707635009310987, + "grad_norm": 1.5908156633377075, + "learning_rate": 4.669588619857804e-06, + "loss": 1.0793, + "step": 1150 + }, + { + "epoch": 1.0716945996275604, + "grad_norm": 1.577242136001587, + "learning_rate": 4.668973242838733e-06, + "loss": 1.0361, + "step": 1151 + }, + { + "epoch": 1.0726256983240223, + "grad_norm": 1.5931251049041748, + "learning_rate": 4.6683573339226105e-06, + "loss": 1.0279, + "step": 1152 + }, + { + "epoch": 1.0735567970204842, + "grad_norm": 1.6169497966766357, + "learning_rate": 4.667740893260477e-06, + "loss": 1.0306, + "step": 1153 + }, + { + "epoch": 1.074487895716946, + "grad_norm": 1.531818151473999, + "learning_rate": 4.667123921003502e-06, + "loss": 0.9994, + "step": 1154 + }, + { + "epoch": 1.0754189944134078, + "grad_norm": 1.6143760681152344, + "learning_rate": 4.6665064173029845e-06, + "loss": 1.0164, + "step": 1155 + }, + { + "epoch": 1.0763500931098697, + "grad_norm": 1.6116831302642822, + "learning_rate": 4.6658883823103555e-06, + "loss": 1.0356, + "step": 1156 + }, + { + "epoch": 1.0772811918063314, + "grad_norm": 1.5921729803085327, + "learning_rate": 4.665269816177176e-06, + "loss": 1.0267, + "step": 1157 + }, + { + "epoch": 1.0782122905027933, + "grad_norm": 1.5189334154129028, + "learning_rate": 4.664650719055136e-06, + "loss": 1.0014, + "step": 1158 + }, + { + "epoch": 1.0791433891992552, + "grad_norm": 1.5935183763504028, + "learning_rate": 4.664031091096058e-06, + "loss": 1.0025, + "step": 1159 + }, + { + "epoch": 1.080074487895717, + "grad_norm": 1.50205659866333, + "learning_rate": 4.663410932451892e-06, + "loss": 0.9977, + "step": 1160 + }, + { + "epoch": 1.0810055865921788, + "grad_norm": 1.5565632581710815, + "learning_rate": 4.66279024327472e-06, + "loss": 1.0247, + "step": 1161 + }, + { + "epoch": 1.0819366852886405, + "grad_norm": 1.5481642484664917, + "learning_rate": 4.6621690237167525e-06, + "loss": 1.023, + "step": 1162 + }, + { + "epoch": 1.0828677839851024, + "grad_norm": 1.5725595951080322, + "learning_rate": 4.661547273930333e-06, + "loss": 1.0354, + "step": 1163 + }, + { + "epoch": 1.0837988826815643, + "grad_norm": 1.5911078453063965, + "learning_rate": 4.6609249940679316e-06, + "loss": 0.9993, + "step": 1164 + }, + { + "epoch": 1.084729981378026, + "grad_norm": 1.5475032329559326, + "learning_rate": 4.6603021842821504e-06, + "loss": 1.012, + "step": 1165 + }, + { + "epoch": 1.085661080074488, + "grad_norm": 1.6458382606506348, + "learning_rate": 4.659678844725722e-06, + "loss": 1.0992, + "step": 1166 + }, + { + "epoch": 1.0865921787709498, + "grad_norm": 1.4876608848571777, + "learning_rate": 4.6590549755515055e-06, + "loss": 1.0253, + "step": 1167 + }, + { + "epoch": 1.0875232774674115, + "grad_norm": 1.5520896911621094, + "learning_rate": 4.658430576912495e-06, + "loss": 1.0218, + "step": 1168 + }, + { + "epoch": 1.0884543761638734, + "grad_norm": 1.4791535139083862, + "learning_rate": 4.657805648961809e-06, + "loss": 1.0252, + "step": 1169 + }, + { + "epoch": 1.089385474860335, + "grad_norm": 1.5296577215194702, + "learning_rate": 4.657180191852701e-06, + "loss": 1.0488, + "step": 1170 + }, + { + "epoch": 1.090316573556797, + "grad_norm": 1.541603922843933, + "learning_rate": 4.65655420573855e-06, + "loss": 1.0759, + "step": 1171 + }, + { + "epoch": 1.091247672253259, + "grad_norm": 1.632855772972107, + "learning_rate": 4.655927690772868e-06, + "loss": 1.0103, + "step": 1172 + }, + { + "epoch": 1.0921787709497206, + "grad_norm": 1.6715481281280518, + "learning_rate": 4.655300647109293e-06, + "loss": 1.041, + "step": 1173 + }, + { + "epoch": 1.0931098696461825, + "grad_norm": 1.5514028072357178, + "learning_rate": 4.654673074901596e-06, + "loss": 1.0283, + "step": 1174 + }, + { + "epoch": 1.0940409683426444, + "grad_norm": 1.5209184885025024, + "learning_rate": 4.654044974303679e-06, + "loss": 1.0072, + "step": 1175 + }, + { + "epoch": 1.094972067039106, + "grad_norm": 1.562577247619629, + "learning_rate": 4.653416345469567e-06, + "loss": 1.0396, + "step": 1176 + }, + { + "epoch": 1.095903165735568, + "grad_norm": 1.6706187725067139, + "learning_rate": 4.65278718855342e-06, + "loss": 1.046, + "step": 1177 + }, + { + "epoch": 1.0968342644320297, + "grad_norm": 1.4715631008148193, + "learning_rate": 4.652157503709527e-06, + "loss": 1.007, + "step": 1178 + }, + { + "epoch": 1.0977653631284916, + "grad_norm": 1.5649617910385132, + "learning_rate": 4.651527291092305e-06, + "loss": 1.0125, + "step": 1179 + }, + { + "epoch": 1.0986964618249535, + "grad_norm": 1.675752878189087, + "learning_rate": 4.6508965508563e-06, + "loss": 1.0586, + "step": 1180 + }, + { + "epoch": 1.0996275605214152, + "grad_norm": 1.6852096319198608, + "learning_rate": 4.650265283156189e-06, + "loss": 1.0755, + "step": 1181 + }, + { + "epoch": 1.100558659217877, + "grad_norm": 1.493811845779419, + "learning_rate": 4.649633488146779e-06, + "loss": 1.0031, + "step": 1182 + }, + { + "epoch": 1.101489757914339, + "grad_norm": 1.600391149520874, + "learning_rate": 4.6490011659830035e-06, + "loss": 1.0142, + "step": 1183 + }, + { + "epoch": 1.1024208566108007, + "grad_norm": 1.6148325204849243, + "learning_rate": 4.648368316819927e-06, + "loss": 1.0453, + "step": 1184 + }, + { + "epoch": 1.1033519553072626, + "grad_norm": 1.5433683395385742, + "learning_rate": 4.647734940812743e-06, + "loss": 0.9821, + "step": 1185 + }, + { + "epoch": 1.1042830540037243, + "grad_norm": 1.5970865488052368, + "learning_rate": 4.647101038116775e-06, + "loss": 1.0295, + "step": 1186 + }, + { + "epoch": 1.1052141527001862, + "grad_norm": 1.570863127708435, + "learning_rate": 4.646466608887474e-06, + "loss": 1.0342, + "step": 1187 + }, + { + "epoch": 1.106145251396648, + "grad_norm": 1.64969801902771, + "learning_rate": 4.645831653280421e-06, + "loss": 1.0779, + "step": 1188 + }, + { + "epoch": 1.1070763500931098, + "grad_norm": 1.5375019311904907, + "learning_rate": 4.645196171451327e-06, + "loss": 1.0346, + "step": 1189 + }, + { + "epoch": 1.1080074487895717, + "grad_norm": 1.5763280391693115, + "learning_rate": 4.6445601635560305e-06, + "loss": 1.0118, + "step": 1190 + }, + { + "epoch": 1.1089385474860336, + "grad_norm": 1.623668909072876, + "learning_rate": 4.6439236297505e-06, + "loss": 1.0476, + "step": 1191 + }, + { + "epoch": 1.1098696461824953, + "grad_norm": 1.6408751010894775, + "learning_rate": 4.643286570190832e-06, + "loss": 1.0261, + "step": 1192 + }, + { + "epoch": 1.1108007448789572, + "grad_norm": 1.563445806503296, + "learning_rate": 4.6426489850332515e-06, + "loss": 1.024, + "step": 1193 + }, + { + "epoch": 1.111731843575419, + "grad_norm": 1.5648657083511353, + "learning_rate": 4.642010874434116e-06, + "loss": 1.0513, + "step": 1194 + }, + { + "epoch": 1.1126629422718808, + "grad_norm": 1.5322439670562744, + "learning_rate": 4.641372238549909e-06, + "loss": 0.9958, + "step": 1195 + }, + { + "epoch": 1.1135940409683427, + "grad_norm": 1.521952748298645, + "learning_rate": 4.640733077537241e-06, + "loss": 1.0329, + "step": 1196 + }, + { + "epoch": 1.1145251396648044, + "grad_norm": 1.5218099355697632, + "learning_rate": 4.640093391552854e-06, + "loss": 0.9921, + "step": 1197 + }, + { + "epoch": 1.1154562383612663, + "grad_norm": 1.6294655799865723, + "learning_rate": 4.639453180753619e-06, + "loss": 1.0276, + "step": 1198 + }, + { + "epoch": 1.1163873370577282, + "grad_norm": 1.5066817998886108, + "learning_rate": 4.638812445296535e-06, + "loss": 1.0537, + "step": 1199 + }, + { + "epoch": 1.1173184357541899, + "grad_norm": 1.5842376947402954, + "learning_rate": 4.638171185338729e-06, + "loss": 1.0, + "step": 1200 + }, + { + "epoch": 1.1182495344506518, + "grad_norm": 1.5416522026062012, + "learning_rate": 4.637529401037456e-06, + "loss": 1.0459, + "step": 1201 + }, + { + "epoch": 1.1191806331471137, + "grad_norm": 1.5635859966278076, + "learning_rate": 4.636887092550103e-06, + "loss": 1.0746, + "step": 1202 + }, + { + "epoch": 1.1201117318435754, + "grad_norm": 1.5437886714935303, + "learning_rate": 4.636244260034182e-06, + "loss": 1.0373, + "step": 1203 + }, + { + "epoch": 1.1210428305400373, + "grad_norm": 1.4998105764389038, + "learning_rate": 4.635600903647334e-06, + "loss": 1.0077, + "step": 1204 + }, + { + "epoch": 1.121973929236499, + "grad_norm": 1.4908320903778076, + "learning_rate": 4.63495702354733e-06, + "loss": 1.0359, + "step": 1205 + }, + { + "epoch": 1.1229050279329609, + "grad_norm": 1.551636815071106, + "learning_rate": 4.634312619892069e-06, + "loss": 1.0086, + "step": 1206 + }, + { + "epoch": 1.1238361266294228, + "grad_norm": 1.5259814262390137, + "learning_rate": 4.633667692839577e-06, + "loss": 1.0517, + "step": 1207 + }, + { + "epoch": 1.1247672253258845, + "grad_norm": 1.5764120817184448, + "learning_rate": 4.6330222425480095e-06, + "loss": 1.0802, + "step": 1208 + }, + { + "epoch": 1.1256983240223464, + "grad_norm": 1.5756115913391113, + "learning_rate": 4.632376269175653e-06, + "loss": 1.0305, + "step": 1209 + }, + { + "epoch": 1.1266294227188083, + "grad_norm": 1.6039260625839233, + "learning_rate": 4.631729772880914e-06, + "loss": 1.0446, + "step": 1210 + }, + { + "epoch": 1.12756052141527, + "grad_norm": 1.570855975151062, + "learning_rate": 4.631082753822338e-06, + "loss": 1.048, + "step": 1211 + }, + { + "epoch": 1.1284916201117319, + "grad_norm": 1.5383471250534058, + "learning_rate": 4.630435212158591e-06, + "loss": 1.0401, + "step": 1212 + }, + { + "epoch": 1.1294227188081938, + "grad_norm": 1.5055118799209595, + "learning_rate": 4.6297871480484694e-06, + "loss": 0.9918, + "step": 1213 + }, + { + "epoch": 1.1303538175046555, + "grad_norm": 1.5805336236953735, + "learning_rate": 4.629138561650899e-06, + "loss": 1.0582, + "step": 1214 + }, + { + "epoch": 1.1312849162011174, + "grad_norm": 1.4991474151611328, + "learning_rate": 4.628489453124931e-06, + "loss": 0.9945, + "step": 1215 + }, + { + "epoch": 1.132216014897579, + "grad_norm": 1.5370382070541382, + "learning_rate": 4.627839822629748e-06, + "loss": 0.9885, + "step": 1216 + }, + { + "epoch": 1.133147113594041, + "grad_norm": 1.5403425693511963, + "learning_rate": 4.627189670324657e-06, + "loss": 1.0336, + "step": 1217 + }, + { + "epoch": 1.1340782122905029, + "grad_norm": 1.6169673204421997, + "learning_rate": 4.626538996369096e-06, + "loss": 1.0285, + "step": 1218 + }, + { + "epoch": 1.1350093109869646, + "grad_norm": 1.4898627996444702, + "learning_rate": 4.62588780092263e-06, + "loss": 0.9995, + "step": 1219 + }, + { + "epoch": 1.1359404096834265, + "grad_norm": 1.5606540441513062, + "learning_rate": 4.6252360841449504e-06, + "loss": 0.9895, + "step": 1220 + }, + { + "epoch": 1.1368715083798882, + "grad_norm": 1.5657724142074585, + "learning_rate": 4.624583846195878e-06, + "loss": 1.051, + "step": 1221 + }, + { + "epoch": 1.13780260707635, + "grad_norm": 1.6210291385650635, + "learning_rate": 4.623931087235361e-06, + "loss": 1.0703, + "step": 1222 + }, + { + "epoch": 1.138733705772812, + "grad_norm": 1.4690114259719849, + "learning_rate": 4.623277807423477e-06, + "loss": 0.9946, + "step": 1223 + }, + { + "epoch": 1.1396648044692737, + "grad_norm": 1.558742642402649, + "learning_rate": 4.622624006920426e-06, + "loss": 1.0388, + "step": 1224 + }, + { + "epoch": 1.1405959031657356, + "grad_norm": 1.565818428993225, + "learning_rate": 4.621969685886544e-06, + "loss": 1.0368, + "step": 1225 + }, + { + "epoch": 1.1415270018621975, + "grad_norm": 1.5682542324066162, + "learning_rate": 4.621314844482287e-06, + "loss": 1.0392, + "step": 1226 + }, + { + "epoch": 1.1424581005586592, + "grad_norm": 1.4746406078338623, + "learning_rate": 4.6206594828682425e-06, + "loss": 0.9779, + "step": 1227 + }, + { + "epoch": 1.143389199255121, + "grad_norm": 1.5437195301055908, + "learning_rate": 4.620003601205125e-06, + "loss": 1.0153, + "step": 1228 + }, + { + "epoch": 1.144320297951583, + "grad_norm": 1.542839527130127, + "learning_rate": 4.619347199653775e-06, + "loss": 1.0044, + "step": 1229 + }, + { + "epoch": 1.1452513966480447, + "grad_norm": 1.4853402376174927, + "learning_rate": 4.6186902783751645e-06, + "loss": 1.0036, + "step": 1230 + }, + { + "epoch": 1.1461824953445066, + "grad_norm": 1.5345730781555176, + "learning_rate": 4.6180328375303876e-06, + "loss": 1.0376, + "step": 1231 + }, + { + "epoch": 1.1471135940409685, + "grad_norm": 1.5952215194702148, + "learning_rate": 4.617374877280669e-06, + "loss": 1.039, + "step": 1232 + }, + { + "epoch": 1.1480446927374302, + "grad_norm": 1.5528311729431152, + "learning_rate": 4.616716397787362e-06, + "loss": 1.0234, + "step": 1233 + }, + { + "epoch": 1.148975791433892, + "grad_norm": 1.5732342004776, + "learning_rate": 4.616057399211943e-06, + "loss": 1.0117, + "step": 1234 + }, + { + "epoch": 1.1499068901303537, + "grad_norm": 1.5569034814834595, + "learning_rate": 4.615397881716019e-06, + "loss": 1.0582, + "step": 1235 + }, + { + "epoch": 1.1508379888268156, + "grad_norm": 1.5574308633804321, + "learning_rate": 4.614737845461325e-06, + "loss": 1.0399, + "step": 1236 + }, + { + "epoch": 1.1517690875232776, + "grad_norm": 1.820462703704834, + "learning_rate": 4.614077290609719e-06, + "loss": 1.0413, + "step": 1237 + }, + { + "epoch": 1.1527001862197392, + "grad_norm": 1.6015551090240479, + "learning_rate": 4.61341621732319e-06, + "loss": 1.0486, + "step": 1238 + }, + { + "epoch": 1.1536312849162011, + "grad_norm": 1.5389423370361328, + "learning_rate": 4.612754625763854e-06, + "loss": 1.0387, + "step": 1239 + }, + { + "epoch": 1.1545623836126628, + "grad_norm": 1.5318962335586548, + "learning_rate": 4.61209251609395e-06, + "loss": 1.0429, + "step": 1240 + }, + { + "epoch": 1.1554934823091247, + "grad_norm": 1.5230445861816406, + "learning_rate": 4.61142988847585e-06, + "loss": 1.0348, + "step": 1241 + }, + { + "epoch": 1.1564245810055866, + "grad_norm": 1.472663402557373, + "learning_rate": 4.61076674307205e-06, + "loss": 0.9974, + "step": 1242 + }, + { + "epoch": 1.1573556797020483, + "grad_norm": 1.6207207441329956, + "learning_rate": 4.610103080045171e-06, + "loss": 1.0286, + "step": 1243 + }, + { + "epoch": 1.1582867783985102, + "grad_norm": 1.5338255167007446, + "learning_rate": 4.609438899557964e-06, + "loss": 1.0252, + "step": 1244 + }, + { + "epoch": 1.1592178770949721, + "grad_norm": 1.560064673423767, + "learning_rate": 4.608774201773307e-06, + "loss": 1.0351, + "step": 1245 + }, + { + "epoch": 1.1601489757914338, + "grad_norm": 1.538946270942688, + "learning_rate": 4.608108986854202e-06, + "loss": 1.0156, + "step": 1246 + }, + { + "epoch": 1.1610800744878957, + "grad_norm": 1.6004894971847534, + "learning_rate": 4.607443254963782e-06, + "loss": 1.0239, + "step": 1247 + }, + { + "epoch": 1.1620111731843576, + "grad_norm": 1.6527992486953735, + "learning_rate": 4.606777006265302e-06, + "loss": 0.9598, + "step": 1248 + }, + { + "epoch": 1.1629422718808193, + "grad_norm": 1.5303829908370972, + "learning_rate": 4.606110240922146e-06, + "loss": 1.0364, + "step": 1249 + }, + { + "epoch": 1.1638733705772812, + "grad_norm": 1.5249247550964355, + "learning_rate": 4.605442959097826e-06, + "loss": 1.0127, + "step": 1250 + }, + { + "epoch": 1.164804469273743, + "grad_norm": 1.5821163654327393, + "learning_rate": 4.604775160955979e-06, + "loss": 1.0117, + "step": 1251 + }, + { + "epoch": 1.1657355679702048, + "grad_norm": 1.56100594997406, + "learning_rate": 4.60410684666037e-06, + "loss": 1.044, + "step": 1252 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 1.534448266029358, + "learning_rate": 4.603438016374888e-06, + "loss": 1.0152, + "step": 1253 + }, + { + "epoch": 1.1675977653631284, + "grad_norm": 1.5872039794921875, + "learning_rate": 4.602768670263551e-06, + "loss": 1.0006, + "step": 1254 + }, + { + "epoch": 1.1685288640595903, + "grad_norm": 1.58054518699646, + "learning_rate": 4.602098808490503e-06, + "loss": 1.0303, + "step": 1255 + }, + { + "epoch": 1.169459962756052, + "grad_norm": 1.5923792123794556, + "learning_rate": 4.6014284312200134e-06, + "loss": 0.9982, + "step": 1256 + }, + { + "epoch": 1.170391061452514, + "grad_norm": 1.5675373077392578, + "learning_rate": 4.600757538616479e-06, + "loss": 1.0388, + "step": 1257 + }, + { + "epoch": 1.1713221601489758, + "grad_norm": 1.5689672231674194, + "learning_rate": 4.600086130844424e-06, + "loss": 1.0358, + "step": 1258 + }, + { + "epoch": 1.1722532588454375, + "grad_norm": 1.5517081022262573, + "learning_rate": 4.5994142080684956e-06, + "loss": 1.0056, + "step": 1259 + }, + { + "epoch": 1.1731843575418994, + "grad_norm": 1.5735931396484375, + "learning_rate": 4.5987417704534695e-06, + "loss": 1.0255, + "step": 1260 + }, + { + "epoch": 1.1741154562383613, + "grad_norm": 1.5739511251449585, + "learning_rate": 4.598068818164249e-06, + "loss": 0.9954, + "step": 1261 + }, + { + "epoch": 1.175046554934823, + "grad_norm": 1.588541865348816, + "learning_rate": 4.597395351365861e-06, + "loss": 1.0068, + "step": 1262 + }, + { + "epoch": 1.175977653631285, + "grad_norm": 1.6256474256515503, + "learning_rate": 4.596721370223461e-06, + "loss": 1.05, + "step": 1263 + }, + { + "epoch": 1.1769087523277468, + "grad_norm": 1.5730301141738892, + "learning_rate": 4.5960468749023265e-06, + "loss": 1.0262, + "step": 1264 + }, + { + "epoch": 1.1778398510242085, + "grad_norm": 1.6214555501937866, + "learning_rate": 4.595371865567866e-06, + "loss": 1.051, + "step": 1265 + }, + { + "epoch": 1.1787709497206704, + "grad_norm": 1.581821322441101, + "learning_rate": 4.5946963423856125e-06, + "loss": 1.0208, + "step": 1266 + }, + { + "epoch": 1.1797020484171323, + "grad_norm": 1.6067352294921875, + "learning_rate": 4.594020305521223e-06, + "loss": 1.014, + "step": 1267 + }, + { + "epoch": 1.180633147113594, + "grad_norm": 1.6256908178329468, + "learning_rate": 4.593343755140484e-06, + "loss": 1.0744, + "step": 1268 + }, + { + "epoch": 1.181564245810056, + "grad_norm": 1.5307555198669434, + "learning_rate": 4.592666691409303e-06, + "loss": 0.9939, + "step": 1269 + }, + { + "epoch": 1.1824953445065176, + "grad_norm": 1.5120658874511719, + "learning_rate": 4.591989114493718e-06, + "loss": 1.0313, + "step": 1270 + }, + { + "epoch": 1.1834264432029795, + "grad_norm": 1.5359355211257935, + "learning_rate": 4.591311024559891e-06, + "loss": 1.0259, + "step": 1271 + }, + { + "epoch": 1.1843575418994414, + "grad_norm": 1.6252797842025757, + "learning_rate": 4.59063242177411e-06, + "loss": 1.0344, + "step": 1272 + }, + { + "epoch": 1.185288640595903, + "grad_norm": 1.6357321739196777, + "learning_rate": 4.589953306302787e-06, + "loss": 1.0615, + "step": 1273 + }, + { + "epoch": 1.186219739292365, + "grad_norm": 1.6225498914718628, + "learning_rate": 4.5892736783124635e-06, + "loss": 1.0005, + "step": 1274 + }, + { + "epoch": 1.1871508379888267, + "grad_norm": 1.6100691556930542, + "learning_rate": 4.588593537969805e-06, + "loss": 1.0212, + "step": 1275 + }, + { + "epoch": 1.1880819366852886, + "grad_norm": 1.5478780269622803, + "learning_rate": 4.5879128854415996e-06, + "loss": 1.0574, + "step": 1276 + }, + { + "epoch": 1.1890130353817505, + "grad_norm": 1.5705329179763794, + "learning_rate": 4.5872317208947656e-06, + "loss": 1.0777, + "step": 1277 + }, + { + "epoch": 1.1899441340782122, + "grad_norm": 1.463334321975708, + "learning_rate": 4.586550044496345e-06, + "loss": 1.0335, + "step": 1278 + }, + { + "epoch": 1.190875232774674, + "grad_norm": 1.537367820739746, + "learning_rate": 4.585867856413505e-06, + "loss": 1.0388, + "step": 1279 + }, + { + "epoch": 1.191806331471136, + "grad_norm": 1.6201003789901733, + "learning_rate": 4.5851851568135376e-06, + "loss": 1.0516, + "step": 1280 + }, + { + "epoch": 1.1927374301675977, + "grad_norm": 1.52261483669281, + "learning_rate": 4.5845019458638614e-06, + "loss": 0.9937, + "step": 1281 + }, + { + "epoch": 1.1936685288640596, + "grad_norm": 1.5462313890457153, + "learning_rate": 4.583818223732021e-06, + "loss": 1.0214, + "step": 1282 + }, + { + "epoch": 1.1945996275605215, + "grad_norm": 1.5227042436599731, + "learning_rate": 4.583133990585684e-06, + "loss": 1.0141, + "step": 1283 + }, + { + "epoch": 1.1955307262569832, + "grad_norm": 1.5403075218200684, + "learning_rate": 4.5824492465926474e-06, + "loss": 1.0451, + "step": 1284 + }, + { + "epoch": 1.196461824953445, + "grad_norm": 1.556834101676941, + "learning_rate": 4.581763991920829e-06, + "loss": 1.0162, + "step": 1285 + }, + { + "epoch": 1.197392923649907, + "grad_norm": 1.5314834117889404, + "learning_rate": 4.5810782267382736e-06, + "loss": 1.0482, + "step": 1286 + }, + { + "epoch": 1.1983240223463687, + "grad_norm": 1.6128249168395996, + "learning_rate": 4.580391951213151e-06, + "loss": 1.0176, + "step": 1287 + }, + { + "epoch": 1.1992551210428306, + "grad_norm": 1.5465834140777588, + "learning_rate": 4.579705165513758e-06, + "loss": 1.029, + "step": 1288 + }, + { + "epoch": 1.2001862197392923, + "grad_norm": 1.5844080448150635, + "learning_rate": 4.579017869808514e-06, + "loss": 1.0404, + "step": 1289 + }, + { + "epoch": 1.2011173184357542, + "grad_norm": 1.6418040990829468, + "learning_rate": 4.578330064265965e-06, + "loss": 1.0137, + "step": 1290 + }, + { + "epoch": 1.202048417132216, + "grad_norm": 1.597098708152771, + "learning_rate": 4.57764174905478e-06, + "loss": 0.9727, + "step": 1291 + }, + { + "epoch": 1.2029795158286778, + "grad_norm": 1.6501483917236328, + "learning_rate": 4.576952924343756e-06, + "loss": 1.0167, + "step": 1292 + }, + { + "epoch": 1.2039106145251397, + "grad_norm": 1.6502028703689575, + "learning_rate": 4.576263590301814e-06, + "loss": 1.0172, + "step": 1293 + }, + { + "epoch": 1.2048417132216014, + "grad_norm": 1.546657681465149, + "learning_rate": 4.575573747097996e-06, + "loss": 1.034, + "step": 1294 + }, + { + "epoch": 1.2057728119180633, + "grad_norm": 1.6021524667739868, + "learning_rate": 4.5748833949014766e-06, + "loss": 1.0109, + "step": 1295 + }, + { + "epoch": 1.2067039106145252, + "grad_norm": 1.5246201753616333, + "learning_rate": 4.574192533881547e-06, + "loss": 0.9966, + "step": 1296 + }, + { + "epoch": 1.2076350093109869, + "grad_norm": 1.6043974161148071, + "learning_rate": 4.57350116420763e-06, + "loss": 1.0458, + "step": 1297 + }, + { + "epoch": 1.2085661080074488, + "grad_norm": 1.6759452819824219, + "learning_rate": 4.572809286049268e-06, + "loss": 1.0335, + "step": 1298 + }, + { + "epoch": 1.2094972067039107, + "grad_norm": 1.5275174379348755, + "learning_rate": 4.572116899576131e-06, + "loss": 1.0203, + "step": 1299 + }, + { + "epoch": 1.2104283054003724, + "grad_norm": 1.6002684831619263, + "learning_rate": 4.571424004958012e-06, + "loss": 1.0415, + "step": 1300 + }, + { + "epoch": 1.2113594040968343, + "grad_norm": 1.5781075954437256, + "learning_rate": 4.570730602364831e-06, + "loss": 1.0681, + "step": 1301 + }, + { + "epoch": 1.2122905027932962, + "grad_norm": 1.5570040941238403, + "learning_rate": 4.5700366919666294e-06, + "loss": 1.0484, + "step": 1302 + }, + { + "epoch": 1.2132216014897579, + "grad_norm": 1.5383262634277344, + "learning_rate": 4.569342273933576e-06, + "loss": 1.0193, + "step": 1303 + }, + { + "epoch": 1.2141527001862198, + "grad_norm": 1.5316905975341797, + "learning_rate": 4.568647348435963e-06, + "loss": 1.0248, + "step": 1304 + }, + { + "epoch": 1.2150837988826815, + "grad_norm": 1.4854018688201904, + "learning_rate": 4.567951915644205e-06, + "loss": 1.0422, + "step": 1305 + }, + { + "epoch": 1.2160148975791434, + "grad_norm": 1.5791949033737183, + "learning_rate": 4.567255975728846e-06, + "loss": 1.0302, + "step": 1306 + }, + { + "epoch": 1.2169459962756053, + "grad_norm": 1.6900380849838257, + "learning_rate": 4.566559528860548e-06, + "loss": 1.0799, + "step": 1307 + }, + { + "epoch": 1.217877094972067, + "grad_norm": 1.6398446559906006, + "learning_rate": 4.565862575210102e-06, + "loss": 1.0161, + "step": 1308 + }, + { + "epoch": 1.2188081936685289, + "grad_norm": 1.5573872327804565, + "learning_rate": 4.565165114948423e-06, + "loss": 0.9874, + "step": 1309 + }, + { + "epoch": 1.2197392923649906, + "grad_norm": 1.5711264610290527, + "learning_rate": 4.564467148246548e-06, + "loss": 1.0225, + "step": 1310 + }, + { + "epoch": 1.2206703910614525, + "grad_norm": 1.58504319190979, + "learning_rate": 4.563768675275639e-06, + "loss": 1.0156, + "step": 1311 + }, + { + "epoch": 1.2216014897579144, + "grad_norm": 1.845436930656433, + "learning_rate": 4.563069696206982e-06, + "loss": 1.0516, + "step": 1312 + }, + { + "epoch": 1.222532588454376, + "grad_norm": 1.6350762844085693, + "learning_rate": 4.56237021121199e-06, + "loss": 1.0155, + "step": 1313 + }, + { + "epoch": 1.223463687150838, + "grad_norm": 1.5159375667572021, + "learning_rate": 4.561670220462194e-06, + "loss": 1.019, + "step": 1314 + }, + { + "epoch": 1.2243947858472999, + "grad_norm": 1.6053423881530762, + "learning_rate": 4.560969724129256e-06, + "loss": 1.0054, + "step": 1315 + }, + { + "epoch": 1.2253258845437616, + "grad_norm": 1.5837332010269165, + "learning_rate": 4.560268722384956e-06, + "loss": 0.9912, + "step": 1316 + }, + { + "epoch": 1.2262569832402235, + "grad_norm": 1.707977294921875, + "learning_rate": 4.559567215401203e-06, + "loss": 1.0047, + "step": 1317 + }, + { + "epoch": 1.2271880819366854, + "grad_norm": 1.5581670999526978, + "learning_rate": 4.558865203350026e-06, + "loss": 1.0079, + "step": 1318 + }, + { + "epoch": 1.228119180633147, + "grad_norm": 1.537711501121521, + "learning_rate": 4.558162686403579e-06, + "loss": 1.0281, + "step": 1319 + }, + { + "epoch": 1.229050279329609, + "grad_norm": 1.6358866691589355, + "learning_rate": 4.5574596647341414e-06, + "loss": 1.0651, + "step": 1320 + }, + { + "epoch": 1.2299813780260709, + "grad_norm": 1.6060906648635864, + "learning_rate": 4.556756138514114e-06, + "loss": 1.0007, + "step": 1321 + }, + { + "epoch": 1.2309124767225326, + "grad_norm": 1.5629162788391113, + "learning_rate": 4.556052107916023e-06, + "loss": 0.9999, + "step": 1322 + }, + { + "epoch": 1.2318435754189945, + "grad_norm": 1.5924534797668457, + "learning_rate": 4.555347573112519e-06, + "loss": 1.0452, + "step": 1323 + }, + { + "epoch": 1.2327746741154562, + "grad_norm": 1.632931113243103, + "learning_rate": 4.5546425342763715e-06, + "loss": 1.0224, + "step": 1324 + }, + { + "epoch": 1.233705772811918, + "grad_norm": 1.5387897491455078, + "learning_rate": 4.55393699158048e-06, + "loss": 1.0182, + "step": 1325 + }, + { + "epoch": 1.23463687150838, + "grad_norm": 1.515651822090149, + "learning_rate": 4.553230945197864e-06, + "loss": 0.9789, + "step": 1326 + }, + { + "epoch": 1.2355679702048417, + "grad_norm": 1.5131114721298218, + "learning_rate": 4.552524395301667e-06, + "loss": 1.0067, + "step": 1327 + }, + { + "epoch": 1.2364990689013036, + "grad_norm": 1.6065956354141235, + "learning_rate": 4.551817342065157e-06, + "loss": 1.0358, + "step": 1328 + }, + { + "epoch": 1.2374301675977653, + "grad_norm": 1.5832058191299438, + "learning_rate": 4.551109785661722e-06, + "loss": 1.0342, + "step": 1329 + }, + { + "epoch": 1.2383612662942272, + "grad_norm": 1.5282360315322876, + "learning_rate": 4.55040172626488e-06, + "loss": 1.0214, + "step": 1330 + }, + { + "epoch": 1.239292364990689, + "grad_norm": 1.595635175704956, + "learning_rate": 4.549693164048265e-06, + "loss": 1.0128, + "step": 1331 + }, + { + "epoch": 1.2402234636871508, + "grad_norm": 1.6100645065307617, + "learning_rate": 4.548984099185638e-06, + "loss": 1.0168, + "step": 1332 + }, + { + "epoch": 1.2411545623836127, + "grad_norm": 1.5903205871582031, + "learning_rate": 4.548274531850885e-06, + "loss": 1.0006, + "step": 1333 + }, + { + "epoch": 1.2420856610800746, + "grad_norm": 1.5367014408111572, + "learning_rate": 4.5475644622180105e-06, + "loss": 0.9752, + "step": 1334 + }, + { + "epoch": 1.2430167597765363, + "grad_norm": 1.553397536277771, + "learning_rate": 4.546853890461147e-06, + "loss": 1.0178, + "step": 1335 + }, + { + "epoch": 1.2439478584729982, + "grad_norm": 1.5958094596862793, + "learning_rate": 4.546142816754546e-06, + "loss": 1.0039, + "step": 1336 + }, + { + "epoch": 1.24487895716946, + "grad_norm": 1.5748871564865112, + "learning_rate": 4.545431241272585e-06, + "loss": 0.9865, + "step": 1337 + }, + { + "epoch": 1.2458100558659218, + "grad_norm": 1.6458196640014648, + "learning_rate": 4.5447191641897645e-06, + "loss": 1.0063, + "step": 1338 + }, + { + "epoch": 1.2467411545623837, + "grad_norm": 1.6087242364883423, + "learning_rate": 4.544006585680706e-06, + "loss": 1.0252, + "step": 1339 + }, + { + "epoch": 1.2476722532588453, + "grad_norm": 1.6460249423980713, + "learning_rate": 4.543293505920155e-06, + "loss": 1.0613, + "step": 1340 + }, + { + "epoch": 1.2486033519553073, + "grad_norm": 1.590306043624878, + "learning_rate": 4.542579925082979e-06, + "loss": 1.0291, + "step": 1341 + }, + { + "epoch": 1.2495344506517692, + "grad_norm": 1.529325008392334, + "learning_rate": 4.541865843344171e-06, + "loss": 0.9829, + "step": 1342 + }, + { + "epoch": 1.2504655493482308, + "grad_norm": 1.5599408149719238, + "learning_rate": 4.5411512608788454e-06, + "loss": 0.9913, + "step": 1343 + }, + { + "epoch": 1.2513966480446927, + "grad_norm": 1.6064761877059937, + "learning_rate": 4.540436177862237e-06, + "loss": 1.0504, + "step": 1344 + }, + { + "epoch": 1.2523277467411544, + "grad_norm": 1.5412648916244507, + "learning_rate": 4.5397205944697084e-06, + "loss": 1.0459, + "step": 1345 + }, + { + "epoch": 1.2532588454376163, + "grad_norm": 1.5696524381637573, + "learning_rate": 4.53900451087674e-06, + "loss": 1.0483, + "step": 1346 + }, + { + "epoch": 1.2541899441340782, + "grad_norm": 1.5144925117492676, + "learning_rate": 4.538287927258937e-06, + "loss": 0.9966, + "step": 1347 + }, + { + "epoch": 1.25512104283054, + "grad_norm": 1.576715111732483, + "learning_rate": 4.537570843792028e-06, + "loss": 1.0069, + "step": 1348 + }, + { + "epoch": 1.2560521415270018, + "grad_norm": 1.5711338520050049, + "learning_rate": 4.536853260651863e-06, + "loss": 1.0318, + "step": 1349 + }, + { + "epoch": 1.2569832402234637, + "grad_norm": 1.5961750745773315, + "learning_rate": 4.536135178014416e-06, + "loss": 1.0429, + "step": 1350 + }, + { + "epoch": 1.2579143389199254, + "grad_norm": 1.564344048500061, + "learning_rate": 4.535416596055779e-06, + "loss": 1.0069, + "step": 1351 + }, + { + "epoch": 1.2588454376163873, + "grad_norm": 1.5682843923568726, + "learning_rate": 4.534697514952172e-06, + "loss": 1.0283, + "step": 1352 + }, + { + "epoch": 1.2597765363128492, + "grad_norm": 1.581691026687622, + "learning_rate": 4.533977934879936e-06, + "loss": 1.0382, + "step": 1353 + }, + { + "epoch": 1.260707635009311, + "grad_norm": 1.5946353673934937, + "learning_rate": 4.533257856015532e-06, + "loss": 1.0676, + "step": 1354 + }, + { + "epoch": 1.2616387337057728, + "grad_norm": 1.5726041793823242, + "learning_rate": 4.532537278535545e-06, + "loss": 0.974, + "step": 1355 + }, + { + "epoch": 1.2625698324022347, + "grad_norm": 1.545648217201233, + "learning_rate": 4.531816202616682e-06, + "loss": 1.04, + "step": 1356 + }, + { + "epoch": 1.2635009310986964, + "grad_norm": 1.5651519298553467, + "learning_rate": 4.531094628435774e-06, + "loss": 1.0368, + "step": 1357 + }, + { + "epoch": 1.2644320297951583, + "grad_norm": 1.5435962677001953, + "learning_rate": 4.530372556169771e-06, + "loss": 1.0279, + "step": 1358 + }, + { + "epoch": 1.2653631284916202, + "grad_norm": 1.5451003313064575, + "learning_rate": 4.5296499859957475e-06, + "loss": 1.0387, + "step": 1359 + }, + { + "epoch": 1.266294227188082, + "grad_norm": 1.5139414072036743, + "learning_rate": 4.528926918090898e-06, + "loss": 1.0284, + "step": 1360 + }, + { + "epoch": 1.2672253258845438, + "grad_norm": 1.6322270631790161, + "learning_rate": 4.528203352632542e-06, + "loss": 1.0512, + "step": 1361 + }, + { + "epoch": 1.2681564245810055, + "grad_norm": 1.5866525173187256, + "learning_rate": 4.527479289798118e-06, + "loss": 1.0128, + "step": 1362 + }, + { + "epoch": 1.2690875232774674, + "grad_norm": 1.8181447982788086, + "learning_rate": 4.526754729765188e-06, + "loss": 1.0794, + "step": 1363 + }, + { + "epoch": 1.2700186219739291, + "grad_norm": 1.521165132522583, + "learning_rate": 4.526029672711437e-06, + "loss": 1.0179, + "step": 1364 + }, + { + "epoch": 1.270949720670391, + "grad_norm": 1.547357201576233, + "learning_rate": 4.525304118814671e-06, + "loss": 1.0195, + "step": 1365 + }, + { + "epoch": 1.271880819366853, + "grad_norm": 1.5587042570114136, + "learning_rate": 4.524578068252815e-06, + "loss": 1.0101, + "step": 1366 + }, + { + "epoch": 1.2728119180633146, + "grad_norm": 1.5763295888900757, + "learning_rate": 4.52385152120392e-06, + "loss": 1.056, + "step": 1367 + }, + { + "epoch": 1.2737430167597765, + "grad_norm": 1.5521553754806519, + "learning_rate": 4.523124477846156e-06, + "loss": 1.0119, + "step": 1368 + }, + { + "epoch": 1.2746741154562384, + "grad_norm": 1.5642468929290771, + "learning_rate": 4.522396938357817e-06, + "loss": 1.0362, + "step": 1369 + }, + { + "epoch": 1.2756052141527001, + "grad_norm": 1.5667338371276855, + "learning_rate": 4.5216689029173175e-06, + "loss": 1.0295, + "step": 1370 + }, + { + "epoch": 1.276536312849162, + "grad_norm": 1.5273995399475098, + "learning_rate": 4.520940371703192e-06, + "loss": 1.0064, + "step": 1371 + }, + { + "epoch": 1.277467411545624, + "grad_norm": 1.5165363550186157, + "learning_rate": 4.5202113448941e-06, + "loss": 0.984, + "step": 1372 + }, + { + "epoch": 1.2783985102420856, + "grad_norm": 1.538142204284668, + "learning_rate": 4.519481822668819e-06, + "loss": 1.0077, + "step": 1373 + }, + { + "epoch": 1.2793296089385475, + "grad_norm": 1.5915653705596924, + "learning_rate": 4.518751805206251e-06, + "loss": 1.0175, + "step": 1374 + }, + { + "epoch": 1.2802607076350094, + "grad_norm": 1.5434679985046387, + "learning_rate": 4.518021292685417e-06, + "loss": 1.0021, + "step": 1375 + }, + { + "epoch": 1.2811918063314711, + "grad_norm": 1.5857088565826416, + "learning_rate": 4.5172902852854604e-06, + "loss": 1.0481, + "step": 1376 + }, + { + "epoch": 1.282122905027933, + "grad_norm": 1.5424895286560059, + "learning_rate": 4.516558783185647e-06, + "loss": 1.0049, + "step": 1377 + }, + { + "epoch": 1.2830540037243947, + "grad_norm": 1.5552512407302856, + "learning_rate": 4.5158267865653636e-06, + "loss": 1.0189, + "step": 1378 + }, + { + "epoch": 1.2839851024208566, + "grad_norm": 1.6167283058166504, + "learning_rate": 4.515094295604115e-06, + "loss": 1.0196, + "step": 1379 + }, + { + "epoch": 1.2849162011173183, + "grad_norm": 1.6639224290847778, + "learning_rate": 4.514361310481533e-06, + "loss": 1.0404, + "step": 1380 + }, + { + "epoch": 1.2858472998137802, + "grad_norm": 1.5814796686172485, + "learning_rate": 4.513627831377365e-06, + "loss": 1.0158, + "step": 1381 + }, + { + "epoch": 1.2867783985102421, + "grad_norm": 1.6043853759765625, + "learning_rate": 4.512893858471483e-06, + "loss": 1.0288, + "step": 1382 + }, + { + "epoch": 1.2877094972067038, + "grad_norm": 1.6249885559082031, + "learning_rate": 4.51215939194388e-06, + "loss": 1.0724, + "step": 1383 + }, + { + "epoch": 1.2886405959031657, + "grad_norm": 1.6318695545196533, + "learning_rate": 4.511424431974667e-06, + "loss": 0.9819, + "step": 1384 + }, + { + "epoch": 1.2895716945996276, + "grad_norm": 1.5797929763793945, + "learning_rate": 4.51068897874408e-06, + "loss": 1.0519, + "step": 1385 + }, + { + "epoch": 1.2905027932960893, + "grad_norm": 1.6023569107055664, + "learning_rate": 4.509953032432474e-06, + "loss": 1.0146, + "step": 1386 + }, + { + "epoch": 1.2914338919925512, + "grad_norm": 1.6638710498809814, + "learning_rate": 4.509216593220324e-06, + "loss": 1.0712, + "step": 1387 + }, + { + "epoch": 1.2923649906890131, + "grad_norm": 1.6138070821762085, + "learning_rate": 4.508479661288227e-06, + "loss": 1.0388, + "step": 1388 + }, + { + "epoch": 1.2932960893854748, + "grad_norm": 1.5506830215454102, + "learning_rate": 4.507742236816901e-06, + "loss": 0.9748, + "step": 1389 + }, + { + "epoch": 1.2942271880819367, + "grad_norm": 1.5509533882141113, + "learning_rate": 4.507004319987185e-06, + "loss": 1.0001, + "step": 1390 + }, + { + "epoch": 1.2951582867783986, + "grad_norm": 1.5898278951644897, + "learning_rate": 4.506265910980038e-06, + "loss": 1.0029, + "step": 1391 + }, + { + "epoch": 1.2960893854748603, + "grad_norm": 1.6037395000457764, + "learning_rate": 4.5055270099765396e-06, + "loss": 1.0227, + "step": 1392 + }, + { + "epoch": 1.2970204841713222, + "grad_norm": 1.5255653858184814, + "learning_rate": 4.50478761715789e-06, + "loss": 1.0208, + "step": 1393 + }, + { + "epoch": 1.2979515828677841, + "grad_norm": 1.5290815830230713, + "learning_rate": 4.504047732705412e-06, + "loss": 0.996, + "step": 1394 + }, + { + "epoch": 1.2988826815642458, + "grad_norm": 1.578757405281067, + "learning_rate": 4.503307356800546e-06, + "loss": 0.9945, + "step": 1395 + }, + { + "epoch": 1.2998137802607077, + "grad_norm": 1.5922045707702637, + "learning_rate": 4.502566489624855e-06, + "loss": 1.0281, + "step": 1396 + }, + { + "epoch": 1.3007448789571694, + "grad_norm": 1.6003899574279785, + "learning_rate": 4.501825131360022e-06, + "loss": 1.0071, + "step": 1397 + }, + { + "epoch": 1.3016759776536313, + "grad_norm": 1.5832836627960205, + "learning_rate": 4.501083282187848e-06, + "loss": 1.0216, + "step": 1398 + }, + { + "epoch": 1.302607076350093, + "grad_norm": 1.5400363206863403, + "learning_rate": 4.500340942290259e-06, + "loss": 1.0267, + "step": 1399 + }, + { + "epoch": 1.303538175046555, + "grad_norm": 1.6731758117675781, + "learning_rate": 4.499598111849299e-06, + "loss": 1.0123, + "step": 1400 + }, + { + "epoch": 1.3044692737430168, + "grad_norm": 1.6038299798965454, + "learning_rate": 4.498854791047131e-06, + "loss": 1.0038, + "step": 1401 + }, + { + "epoch": 1.3054003724394785, + "grad_norm": 1.5957401990890503, + "learning_rate": 4.4981109800660395e-06, + "loss": 0.9816, + "step": 1402 + }, + { + "epoch": 1.3063314711359404, + "grad_norm": 1.628294587135315, + "learning_rate": 4.49736667908843e-06, + "loss": 0.9827, + "step": 1403 + }, + { + "epoch": 1.3072625698324023, + "grad_norm": 1.552747130393982, + "learning_rate": 4.496621888296827e-06, + "loss": 1.0261, + "step": 1404 + }, + { + "epoch": 1.308193668528864, + "grad_norm": 1.5063432455062866, + "learning_rate": 4.4958766078738745e-06, + "loss": 1.0045, + "step": 1405 + }, + { + "epoch": 1.309124767225326, + "grad_norm": 1.618623971939087, + "learning_rate": 4.495130838002339e-06, + "loss": 1.0185, + "step": 1406 + }, + { + "epoch": 1.3100558659217878, + "grad_norm": 1.5620105266571045, + "learning_rate": 4.4943845788651055e-06, + "loss": 1.0118, + "step": 1407 + }, + { + "epoch": 1.3109869646182495, + "grad_norm": 1.5479755401611328, + "learning_rate": 4.493637830645178e-06, + "loss": 1.0042, + "step": 1408 + }, + { + "epoch": 1.3119180633147114, + "grad_norm": 1.5699485540390015, + "learning_rate": 4.492890593525682e-06, + "loss": 1.012, + "step": 1409 + }, + { + "epoch": 1.3128491620111733, + "grad_norm": 1.6348872184753418, + "learning_rate": 4.492142867689861e-06, + "loss": 0.9904, + "step": 1410 + }, + { + "epoch": 1.313780260707635, + "grad_norm": 1.5649633407592773, + "learning_rate": 4.491394653321083e-06, + "loss": 1.0283, + "step": 1411 + }, + { + "epoch": 1.314711359404097, + "grad_norm": 1.5797923803329468, + "learning_rate": 4.49064595060283e-06, + "loss": 1.0507, + "step": 1412 + }, + { + "epoch": 1.3156424581005586, + "grad_norm": 1.5442322492599487, + "learning_rate": 4.489896759718706e-06, + "loss": 1.0131, + "step": 1413 + }, + { + "epoch": 1.3165735567970205, + "grad_norm": 1.5516129732131958, + "learning_rate": 4.489147080852437e-06, + "loss": 1.0079, + "step": 1414 + }, + { + "epoch": 1.3175046554934824, + "grad_norm": 1.6696643829345703, + "learning_rate": 4.488396914187865e-06, + "loss": 1.0633, + "step": 1415 + }, + { + "epoch": 1.318435754189944, + "grad_norm": 1.5874762535095215, + "learning_rate": 4.487646259908955e-06, + "loss": 1.0289, + "step": 1416 + }, + { + "epoch": 1.319366852886406, + "grad_norm": 1.5189284086227417, + "learning_rate": 4.486895118199787e-06, + "loss": 1.017, + "step": 1417 + }, + { + "epoch": 1.3202979515828677, + "grad_norm": 1.4921741485595703, + "learning_rate": 4.4861434892445645e-06, + "loss": 1.0013, + "step": 1418 + }, + { + "epoch": 1.3212290502793296, + "grad_norm": 1.5352593660354614, + "learning_rate": 4.485391373227611e-06, + "loss": 1.0171, + "step": 1419 + }, + { + "epoch": 1.3221601489757915, + "grad_norm": 1.5205575227737427, + "learning_rate": 4.484638770333367e-06, + "loss": 0.9941, + "step": 1420 + }, + { + "epoch": 1.3230912476722532, + "grad_norm": 1.538454294204712, + "learning_rate": 4.483885680746393e-06, + "loss": 0.9701, + "step": 1421 + }, + { + "epoch": 1.324022346368715, + "grad_norm": 1.5228955745697021, + "learning_rate": 4.483132104651369e-06, + "loss": 0.9875, + "step": 1422 + }, + { + "epoch": 1.324953445065177, + "grad_norm": 1.6079059839248657, + "learning_rate": 4.4823780422330935e-06, + "loss": 1.0667, + "step": 1423 + }, + { + "epoch": 1.3258845437616387, + "grad_norm": 1.5907213687896729, + "learning_rate": 4.481623493676487e-06, + "loss": 1.0127, + "step": 1424 + }, + { + "epoch": 1.3268156424581006, + "grad_norm": 1.5828615427017212, + "learning_rate": 4.480868459166586e-06, + "loss": 1.0278, + "step": 1425 + }, + { + "epoch": 1.3277467411545625, + "grad_norm": 1.6869210004806519, + "learning_rate": 4.4801129388885475e-06, + "loss": 1.0464, + "step": 1426 + }, + { + "epoch": 1.3286778398510242, + "grad_norm": 1.5865498781204224, + "learning_rate": 4.479356933027649e-06, + "loss": 1.0158, + "step": 1427 + }, + { + "epoch": 1.329608938547486, + "grad_norm": 1.5338958501815796, + "learning_rate": 4.478600441769284e-06, + "loss": 1.0471, + "step": 1428 + }, + { + "epoch": 1.330540037243948, + "grad_norm": 1.5718988180160522, + "learning_rate": 4.477843465298968e-06, + "loss": 0.9899, + "step": 1429 + }, + { + "epoch": 1.3314711359404097, + "grad_norm": 1.5694533586502075, + "learning_rate": 4.477086003802333e-06, + "loss": 1.0218, + "step": 1430 + }, + { + "epoch": 1.3324022346368716, + "grad_norm": 1.5479017496109009, + "learning_rate": 4.476328057465133e-06, + "loss": 0.9838, + "step": 1431 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.5717720985412598, + "learning_rate": 4.475569626473238e-06, + "loss": 1.0287, + "step": 1432 + }, + { + "epoch": 1.3342644320297952, + "grad_norm": 1.7197667360305786, + "learning_rate": 4.474810711012637e-06, + "loss": 1.0404, + "step": 1433 + }, + { + "epoch": 1.3351955307262569, + "grad_norm": 1.569809913635254, + "learning_rate": 4.474051311269441e-06, + "loss": 1.0421, + "step": 1434 + }, + { + "epoch": 1.3361266294227188, + "grad_norm": 1.6082358360290527, + "learning_rate": 4.473291427429876e-06, + "loss": 1.0136, + "step": 1435 + }, + { + "epoch": 1.3370577281191807, + "grad_norm": 1.5059759616851807, + "learning_rate": 4.472531059680289e-06, + "loss": 0.9791, + "step": 1436 + }, + { + "epoch": 1.3379888268156424, + "grad_norm": 1.6574952602386475, + "learning_rate": 4.471770208207143e-06, + "loss": 1.0398, + "step": 1437 + }, + { + "epoch": 1.3389199255121043, + "grad_norm": 1.6103168725967407, + "learning_rate": 4.4710088731970245e-06, + "loss": 1.0475, + "step": 1438 + }, + { + "epoch": 1.3398510242085662, + "grad_norm": 1.6753737926483154, + "learning_rate": 4.470247054836633e-06, + "loss": 1.0361, + "step": 1439 + }, + { + "epoch": 1.3407821229050279, + "grad_norm": 1.6116617918014526, + "learning_rate": 4.469484753312791e-06, + "loss": 1.0348, + "step": 1440 + }, + { + "epoch": 1.3417132216014898, + "grad_norm": 1.5506784915924072, + "learning_rate": 4.468721968812435e-06, + "loss": 1.0374, + "step": 1441 + }, + { + "epoch": 1.3426443202979517, + "grad_norm": 1.6124831438064575, + "learning_rate": 4.4679587015226255e-06, + "loss": 0.995, + "step": 1442 + }, + { + "epoch": 1.3435754189944134, + "grad_norm": 1.5187324285507202, + "learning_rate": 4.467194951630538e-06, + "loss": 0.9978, + "step": 1443 + }, + { + "epoch": 1.3445065176908753, + "grad_norm": 1.521330714225769, + "learning_rate": 4.466430719323464e-06, + "loss": 1.0057, + "step": 1444 + }, + { + "epoch": 1.3454376163873372, + "grad_norm": 1.5347623825073242, + "learning_rate": 4.46566600478882e-06, + "loss": 1.0561, + "step": 1445 + }, + { + "epoch": 1.3463687150837989, + "grad_norm": 1.619405746459961, + "learning_rate": 4.464900808214134e-06, + "loss": 1.0268, + "step": 1446 + }, + { + "epoch": 1.3472998137802608, + "grad_norm": 1.498252272605896, + "learning_rate": 4.464135129787057e-06, + "loss": 0.9973, + "step": 1447 + }, + { + "epoch": 1.3482309124767227, + "grad_norm": 1.6767488718032837, + "learning_rate": 4.463368969695355e-06, + "loss": 1.039, + "step": 1448 + }, + { + "epoch": 1.3491620111731844, + "grad_norm": 1.6489245891571045, + "learning_rate": 4.462602328126913e-06, + "loss": 1.015, + "step": 1449 + }, + { + "epoch": 1.3500931098696463, + "grad_norm": 1.6135965585708618, + "learning_rate": 4.461835205269736e-06, + "loss": 1.0581, + "step": 1450 + }, + { + "epoch": 1.351024208566108, + "grad_norm": 1.527063250541687, + "learning_rate": 4.461067601311944e-06, + "loss": 0.996, + "step": 1451 + }, + { + "epoch": 1.3519553072625698, + "grad_norm": 1.5755585432052612, + "learning_rate": 4.460299516441777e-06, + "loss": 1.0149, + "step": 1452 + }, + { + "epoch": 1.3528864059590315, + "grad_norm": 1.5916379690170288, + "learning_rate": 4.459530950847591e-06, + "loss": 1.0469, + "step": 1453 + }, + { + "epoch": 1.3538175046554934, + "grad_norm": 1.6131742000579834, + "learning_rate": 4.458761904717864e-06, + "loss": 1.0034, + "step": 1454 + }, + { + "epoch": 1.3547486033519553, + "grad_norm": 1.567866325378418, + "learning_rate": 4.457992378241188e-06, + "loss": 1.0275, + "step": 1455 + }, + { + "epoch": 1.355679702048417, + "grad_norm": 1.5476313829421997, + "learning_rate": 4.4572223716062725e-06, + "loss": 0.9842, + "step": 1456 + }, + { + "epoch": 1.356610800744879, + "grad_norm": 1.5024303197860718, + "learning_rate": 4.456451885001948e-06, + "loss": 0.9864, + "step": 1457 + }, + { + "epoch": 1.3575418994413408, + "grad_norm": 1.5810939073562622, + "learning_rate": 4.455680918617159e-06, + "loss": 1.0143, + "step": 1458 + }, + { + "epoch": 1.3584729981378025, + "grad_norm": 1.6658401489257812, + "learning_rate": 4.454909472640972e-06, + "loss": 1.0468, + "step": 1459 + }, + { + "epoch": 1.3594040968342644, + "grad_norm": 1.58247971534729, + "learning_rate": 4.454137547262566e-06, + "loss": 1.0124, + "step": 1460 + }, + { + "epoch": 1.3603351955307263, + "grad_norm": 1.5933088064193726, + "learning_rate": 4.453365142671241e-06, + "loss": 1.0646, + "step": 1461 + }, + { + "epoch": 1.361266294227188, + "grad_norm": 1.5584853887557983, + "learning_rate": 4.4525922590564144e-06, + "loss": 1.0202, + "step": 1462 + }, + { + "epoch": 1.36219739292365, + "grad_norm": 1.535901665687561, + "learning_rate": 4.45181889660762e-06, + "loss": 1.0196, + "step": 1463 + }, + { + "epoch": 1.3631284916201118, + "grad_norm": 1.5713459253311157, + "learning_rate": 4.45104505551451e-06, + "loss": 1.0111, + "step": 1464 + }, + { + "epoch": 1.3640595903165735, + "grad_norm": 1.5734684467315674, + "learning_rate": 4.4502707359668515e-06, + "loss": 1.0329, + "step": 1465 + }, + { + "epoch": 1.3649906890130354, + "grad_norm": 1.4879016876220703, + "learning_rate": 4.4494959381545325e-06, + "loss": 1.0261, + "step": 1466 + }, + { + "epoch": 1.3659217877094971, + "grad_norm": 1.565568447113037, + "learning_rate": 4.448720662267556e-06, + "loss": 1.0189, + "step": 1467 + }, + { + "epoch": 1.366852886405959, + "grad_norm": 1.5577555894851685, + "learning_rate": 4.447944908496042e-06, + "loss": 1.0312, + "step": 1468 + }, + { + "epoch": 1.3677839851024207, + "grad_norm": 1.570672869682312, + "learning_rate": 4.44716867703023e-06, + "loss": 1.0695, + "step": 1469 + }, + { + "epoch": 1.3687150837988826, + "grad_norm": 1.5811032056808472, + "learning_rate": 4.446391968060475e-06, + "loss": 1.0106, + "step": 1470 + }, + { + "epoch": 1.3696461824953445, + "grad_norm": 1.6152100563049316, + "learning_rate": 4.445614781777248e-06, + "loss": 1.0293, + "step": 1471 + }, + { + "epoch": 1.3705772811918062, + "grad_norm": 1.5285683870315552, + "learning_rate": 4.444837118371139e-06, + "loss": 0.992, + "step": 1472 + }, + { + "epoch": 1.3715083798882681, + "grad_norm": 1.5448665618896484, + "learning_rate": 4.444058978032855e-06, + "loss": 1.0511, + "step": 1473 + }, + { + "epoch": 1.37243947858473, + "grad_norm": 1.6055021286010742, + "learning_rate": 4.443280360953218e-06, + "loss": 1.034, + "step": 1474 + }, + { + "epoch": 1.3733705772811917, + "grad_norm": 1.5773824453353882, + "learning_rate": 4.442501267323169e-06, + "loss": 1.0308, + "step": 1475 + }, + { + "epoch": 1.3743016759776536, + "grad_norm": 1.5723717212677002, + "learning_rate": 4.441721697333765e-06, + "loss": 1.012, + "step": 1476 + }, + { + "epoch": 1.3752327746741155, + "grad_norm": 1.5339237451553345, + "learning_rate": 4.440941651176181e-06, + "loss": 0.9972, + "step": 1477 + }, + { + "epoch": 1.3761638733705772, + "grad_norm": 1.5498487949371338, + "learning_rate": 4.440161129041704e-06, + "loss": 1.0377, + "step": 1478 + }, + { + "epoch": 1.3770949720670391, + "grad_norm": 1.6140390634536743, + "learning_rate": 4.439380131121744e-06, + "loss": 1.0794, + "step": 1479 + }, + { + "epoch": 1.378026070763501, + "grad_norm": 1.5776286125183105, + "learning_rate": 4.438598657607826e-06, + "loss": 1.0189, + "step": 1480 + }, + { + "epoch": 1.3789571694599627, + "grad_norm": 1.6249407529830933, + "learning_rate": 4.437816708691588e-06, + "loss": 1.023, + "step": 1481 + }, + { + "epoch": 1.3798882681564246, + "grad_norm": 1.561781406402588, + "learning_rate": 4.437034284564789e-06, + "loss": 1.0348, + "step": 1482 + }, + { + "epoch": 1.3808193668528865, + "grad_norm": 1.6608562469482422, + "learning_rate": 4.436251385419302e-06, + "loss": 1.0405, + "step": 1483 + }, + { + "epoch": 1.3817504655493482, + "grad_norm": 1.5895798206329346, + "learning_rate": 4.4354680114471184e-06, + "loss": 1.0264, + "step": 1484 + }, + { + "epoch": 1.3826815642458101, + "grad_norm": 1.554221749305725, + "learning_rate": 4.434684162840344e-06, + "loss": 0.994, + "step": 1485 + }, + { + "epoch": 1.3836126629422718, + "grad_norm": 1.5708531141281128, + "learning_rate": 4.433899839791202e-06, + "loss": 1.0285, + "step": 1486 + }, + { + "epoch": 1.3845437616387337, + "grad_norm": 1.6230992078781128, + "learning_rate": 4.433115042492031e-06, + "loss": 1.0506, + "step": 1487 + }, + { + "epoch": 1.3854748603351954, + "grad_norm": 1.6104018688201904, + "learning_rate": 4.4323297711352885e-06, + "loss": 1.052, + "step": 1488 + }, + { + "epoch": 1.3864059590316573, + "grad_norm": 1.5672563314437866, + "learning_rate": 4.431544025913546e-06, + "loss": 1.0258, + "step": 1489 + }, + { + "epoch": 1.3873370577281192, + "grad_norm": 1.5545810461044312, + "learning_rate": 4.430757807019491e-06, + "loss": 0.9816, + "step": 1490 + }, + { + "epoch": 1.388268156424581, + "grad_norm": 1.576515555381775, + "learning_rate": 4.429971114645928e-06, + "loss": 1.0404, + "step": 1491 + }, + { + "epoch": 1.3891992551210428, + "grad_norm": 1.5094834566116333, + "learning_rate": 4.4291839489857775e-06, + "loss": 0.9768, + "step": 1492 + }, + { + "epoch": 1.3901303538175047, + "grad_norm": 1.6064214706420898, + "learning_rate": 4.428396310232076e-06, + "loss": 1.0372, + "step": 1493 + }, + { + "epoch": 1.3910614525139664, + "grad_norm": 1.6829825639724731, + "learning_rate": 4.427608198577976e-06, + "loss": 1.0051, + "step": 1494 + }, + { + "epoch": 1.3919925512104283, + "grad_norm": 1.6020152568817139, + "learning_rate": 4.426819614216747e-06, + "loss": 1.0152, + "step": 1495 + }, + { + "epoch": 1.3929236499068902, + "grad_norm": 1.6294755935668945, + "learning_rate": 4.426030557341774e-06, + "loss": 1.0368, + "step": 1496 + }, + { + "epoch": 1.393854748603352, + "grad_norm": 1.5974087715148926, + "learning_rate": 4.425241028146554e-06, + "loss": 1.026, + "step": 1497 + }, + { + "epoch": 1.3947858472998138, + "grad_norm": 1.516837477684021, + "learning_rate": 4.424451026824707e-06, + "loss": 1.042, + "step": 1498 + }, + { + "epoch": 1.3957169459962757, + "grad_norm": 1.560184359550476, + "learning_rate": 4.423660553569961e-06, + "loss": 1.0287, + "step": 1499 + }, + { + "epoch": 1.3966480446927374, + "grad_norm": 1.6382440328598022, + "learning_rate": 4.4228696085761665e-06, + "loss": 1.0682, + "step": 1500 + }, + { + "epoch": 1.3975791433891993, + "grad_norm": 1.6879359483718872, + "learning_rate": 4.422078192037287e-06, + "loss": 1.0623, + "step": 1501 + }, + { + "epoch": 1.3985102420856612, + "grad_norm": 1.498881459236145, + "learning_rate": 4.4212863041474005e-06, + "loss": 0.9874, + "step": 1502 + }, + { + "epoch": 1.399441340782123, + "grad_norm": 1.5965639352798462, + "learning_rate": 4.420493945100702e-06, + "loss": 1.08, + "step": 1503 + }, + { + "epoch": 1.4003724394785848, + "grad_norm": 1.5797770023345947, + "learning_rate": 4.4197011150915e-06, + "loss": 1.025, + "step": 1504 + }, + { + "epoch": 1.4013035381750465, + "grad_norm": 1.603064775466919, + "learning_rate": 4.418907814314223e-06, + "loss": 1.0367, + "step": 1505 + }, + { + "epoch": 1.4022346368715084, + "grad_norm": 1.594980239868164, + "learning_rate": 4.418114042963409e-06, + "loss": 0.9674, + "step": 1506 + }, + { + "epoch": 1.40316573556797, + "grad_norm": 1.5968581438064575, + "learning_rate": 4.417319801233717e-06, + "loss": 0.9967, + "step": 1507 + }, + { + "epoch": 1.404096834264432, + "grad_norm": 1.6094814538955688, + "learning_rate": 4.416525089319917e-06, + "loss": 1.0468, + "step": 1508 + }, + { + "epoch": 1.405027932960894, + "grad_norm": 1.589282751083374, + "learning_rate": 4.415729907416898e-06, + "loss": 1.029, + "step": 1509 + }, + { + "epoch": 1.4059590316573556, + "grad_norm": 1.6730644702911377, + "learning_rate": 4.4149342557196605e-06, + "loss": 0.9902, + "step": 1510 + }, + { + "epoch": 1.4068901303538175, + "grad_norm": 1.6359583139419556, + "learning_rate": 4.414138134423323e-06, + "loss": 1.0121, + "step": 1511 + }, + { + "epoch": 1.4078212290502794, + "grad_norm": 1.587175726890564, + "learning_rate": 4.413341543723118e-06, + "loss": 1.0261, + "step": 1512 + }, + { + "epoch": 1.408752327746741, + "grad_norm": 1.5310614109039307, + "learning_rate": 4.412544483814394e-06, + "loss": 1.0227, + "step": 1513 + }, + { + "epoch": 1.409683426443203, + "grad_norm": 1.5506380796432495, + "learning_rate": 4.411746954892612e-06, + "loss": 1.0063, + "step": 1514 + }, + { + "epoch": 1.410614525139665, + "grad_norm": 1.5596989393234253, + "learning_rate": 4.410948957153351e-06, + "loss": 1.0346, + "step": 1515 + }, + { + "epoch": 1.4115456238361266, + "grad_norm": 1.5759638547897339, + "learning_rate": 4.410150490792306e-06, + "loss": 0.9939, + "step": 1516 + }, + { + "epoch": 1.4124767225325885, + "grad_norm": 1.5548573732376099, + "learning_rate": 4.409351556005281e-06, + "loss": 1.0091, + "step": 1517 + }, + { + "epoch": 1.4134078212290504, + "grad_norm": 1.5779527425765991, + "learning_rate": 4.408552152988202e-06, + "loss": 1.0538, + "step": 1518 + }, + { + "epoch": 1.414338919925512, + "grad_norm": 1.5253233909606934, + "learning_rate": 4.407752281937104e-06, + "loss": 1.0112, + "step": 1519 + }, + { + "epoch": 1.415270018621974, + "grad_norm": 1.5554537773132324, + "learning_rate": 4.406951943048141e-06, + "loss": 1.0002, + "step": 1520 + }, + { + "epoch": 1.4162011173184357, + "grad_norm": 1.5797187089920044, + "learning_rate": 4.406151136517579e-06, + "loss": 0.996, + "step": 1521 + }, + { + "epoch": 1.4171322160148976, + "grad_norm": 1.5718674659729004, + "learning_rate": 4.405349862541801e-06, + "loss": 1.0041, + "step": 1522 + }, + { + "epoch": 1.4180633147113593, + "grad_norm": 1.608933448791504, + "learning_rate": 4.404548121317301e-06, + "loss": 1.0468, + "step": 1523 + }, + { + "epoch": 1.4189944134078212, + "grad_norm": 1.651956558227539, + "learning_rate": 4.403745913040692e-06, + "loss": 1.0345, + "step": 1524 + }, + { + "epoch": 1.419925512104283, + "grad_norm": 1.5862330198287964, + "learning_rate": 4.402943237908699e-06, + "loss": 1.0425, + "step": 1525 + }, + { + "epoch": 1.4208566108007448, + "grad_norm": 1.631689429283142, + "learning_rate": 4.4021400961181614e-06, + "loss": 1.0175, + "step": 1526 + }, + { + "epoch": 1.4217877094972067, + "grad_norm": 1.5958492755889893, + "learning_rate": 4.401336487866035e-06, + "loss": 1.037, + "step": 1527 + }, + { + "epoch": 1.4227188081936686, + "grad_norm": 1.5291881561279297, + "learning_rate": 4.400532413349385e-06, + "loss": 0.995, + "step": 1528 + }, + { + "epoch": 1.4236499068901303, + "grad_norm": 1.5820049047470093, + "learning_rate": 4.399727872765399e-06, + "loss": 1.0117, + "step": 1529 + }, + { + "epoch": 1.4245810055865922, + "grad_norm": 1.4826836585998535, + "learning_rate": 4.398922866311371e-06, + "loss": 1.0209, + "step": 1530 + }, + { + "epoch": 1.425512104283054, + "grad_norm": 1.5255011320114136, + "learning_rate": 4.398117394184715e-06, + "loss": 0.99, + "step": 1531 + }, + { + "epoch": 1.4264432029795158, + "grad_norm": 1.5269635915756226, + "learning_rate": 4.397311456582955e-06, + "loss": 1.0077, + "step": 1532 + }, + { + "epoch": 1.4273743016759777, + "grad_norm": 1.5802326202392578, + "learning_rate": 4.3965050537037325e-06, + "loss": 0.9775, + "step": 1533 + }, + { + "epoch": 1.4283054003724396, + "grad_norm": 1.6086894273757935, + "learning_rate": 4.3956981857448e-06, + "loss": 0.997, + "step": 1534 + }, + { + "epoch": 1.4292364990689013, + "grad_norm": 1.5799283981323242, + "learning_rate": 4.394890852904027e-06, + "loss": 0.9973, + "step": 1535 + }, + { + "epoch": 1.4301675977653632, + "grad_norm": 1.5577309131622314, + "learning_rate": 4.3940830553793946e-06, + "loss": 0.963, + "step": 1536 + }, + { + "epoch": 1.431098696461825, + "grad_norm": 1.6366912126541138, + "learning_rate": 4.393274793368999e-06, + "loss": 1.0124, + "step": 1537 + }, + { + "epoch": 1.4320297951582868, + "grad_norm": 1.6136518716812134, + "learning_rate": 4.392466067071051e-06, + "loss": 1.0405, + "step": 1538 + }, + { + "epoch": 1.4329608938547487, + "grad_norm": 1.6023740768432617, + "learning_rate": 4.391656876683875e-06, + "loss": 0.9989, + "step": 1539 + }, + { + "epoch": 1.4338919925512104, + "grad_norm": 1.6204147338867188, + "learning_rate": 4.390847222405907e-06, + "loss": 1.0159, + "step": 1540 + }, + { + "epoch": 1.4348230912476723, + "grad_norm": 1.537744164466858, + "learning_rate": 4.3900371044357e-06, + "loss": 0.9919, + "step": 1541 + }, + { + "epoch": 1.435754189944134, + "grad_norm": 1.5629849433898926, + "learning_rate": 4.389226522971917e-06, + "loss": 1.0466, + "step": 1542 + }, + { + "epoch": 1.4366852886405959, + "grad_norm": 1.5706342458724976, + "learning_rate": 4.388415478213337e-06, + "loss": 1.0111, + "step": 1543 + }, + { + "epoch": 1.4376163873370578, + "grad_norm": 1.628672480583191, + "learning_rate": 4.387603970358856e-06, + "loss": 1.0371, + "step": 1544 + }, + { + "epoch": 1.4385474860335195, + "grad_norm": 1.5431934595108032, + "learning_rate": 4.386791999607476e-06, + "loss": 1.0496, + "step": 1545 + }, + { + "epoch": 1.4394785847299814, + "grad_norm": 1.5898290872573853, + "learning_rate": 4.385979566158318e-06, + "loss": 1.0132, + "step": 1546 + }, + { + "epoch": 1.4404096834264433, + "grad_norm": 1.5480161905288696, + "learning_rate": 4.385166670210615e-06, + "loss": 1.0285, + "step": 1547 + }, + { + "epoch": 1.441340782122905, + "grad_norm": 1.5950345993041992, + "learning_rate": 4.384353311963713e-06, + "loss": 1.0059, + "step": 1548 + }, + { + "epoch": 1.4422718808193669, + "grad_norm": 1.5423212051391602, + "learning_rate": 4.383539491617073e-06, + "loss": 1.0412, + "step": 1549 + }, + { + "epoch": 1.4432029795158288, + "grad_norm": 1.5239518880844116, + "learning_rate": 4.382725209370266e-06, + "loss": 0.984, + "step": 1550 + }, + { + "epoch": 1.4441340782122905, + "grad_norm": 1.771134853363037, + "learning_rate": 4.38191046542298e-06, + "loss": 1.0527, + "step": 1551 + }, + { + "epoch": 1.4450651769087524, + "grad_norm": 1.5928163528442383, + "learning_rate": 4.381095259975013e-06, + "loss": 1.0041, + "step": 1552 + }, + { + "epoch": 1.4459962756052143, + "grad_norm": 1.5586247444152832, + "learning_rate": 4.38027959322628e-06, + "loss": 1.0317, + "step": 1553 + }, + { + "epoch": 1.446927374301676, + "grad_norm": 1.601347804069519, + "learning_rate": 4.379463465376805e-06, + "loss": 1.0223, + "step": 1554 + }, + { + "epoch": 1.4478584729981379, + "grad_norm": 1.5670347213745117, + "learning_rate": 4.378646876626728e-06, + "loss": 0.986, + "step": 1555 + }, + { + "epoch": 1.4487895716945998, + "grad_norm": 1.5701487064361572, + "learning_rate": 4.3778298271762995e-06, + "loss": 1.0203, + "step": 1556 + }, + { + "epoch": 1.4497206703910615, + "grad_norm": 1.572339653968811, + "learning_rate": 4.377012317225886e-06, + "loss": 1.0236, + "step": 1557 + }, + { + "epoch": 1.4506517690875234, + "grad_norm": 1.579890251159668, + "learning_rate": 4.376194346975965e-06, + "loss": 1.0269, + "step": 1558 + }, + { + "epoch": 1.451582867783985, + "grad_norm": 1.661680817604065, + "learning_rate": 4.375375916627127e-06, + "loss": 0.9753, + "step": 1559 + }, + { + "epoch": 1.452513966480447, + "grad_norm": 1.7139066457748413, + "learning_rate": 4.374557026380075e-06, + "loss": 1.0403, + "step": 1560 + }, + { + "epoch": 1.4534450651769086, + "grad_norm": 1.5265493392944336, + "learning_rate": 4.373737676435627e-06, + "loss": 1.0078, + "step": 1561 + }, + { + "epoch": 1.4543761638733705, + "grad_norm": 1.4877139329910278, + "learning_rate": 4.37291786699471e-06, + "loss": 0.99, + "step": 1562 + }, + { + "epoch": 1.4553072625698324, + "grad_norm": 1.6461578607559204, + "learning_rate": 4.372097598258368e-06, + "loss": 1.0565, + "step": 1563 + }, + { + "epoch": 1.4562383612662941, + "grad_norm": 1.5626616477966309, + "learning_rate": 4.3712768704277535e-06, + "loss": 1.0245, + "step": 1564 + }, + { + "epoch": 1.457169459962756, + "grad_norm": 1.5262725353240967, + "learning_rate": 4.370455683704134e-06, + "loss": 0.9997, + "step": 1565 + }, + { + "epoch": 1.458100558659218, + "grad_norm": 1.5577821731567383, + "learning_rate": 4.369634038288889e-06, + "loss": 1.0397, + "step": 1566 + }, + { + "epoch": 1.4590316573556796, + "grad_norm": 1.558105230331421, + "learning_rate": 4.368811934383511e-06, + "loss": 1.0504, + "step": 1567 + }, + { + "epoch": 1.4599627560521415, + "grad_norm": 1.6111888885498047, + "learning_rate": 4.367989372189605e-06, + "loss": 1.0177, + "step": 1568 + }, + { + "epoch": 1.4608938547486034, + "grad_norm": 1.5383460521697998, + "learning_rate": 4.367166351908886e-06, + "loss": 0.9705, + "step": 1569 + }, + { + "epoch": 1.4618249534450651, + "grad_norm": 1.5505338907241821, + "learning_rate": 4.366342873743184e-06, + "loss": 1.0018, + "step": 1570 + }, + { + "epoch": 1.462756052141527, + "grad_norm": 1.5472747087478638, + "learning_rate": 4.365518937894442e-06, + "loss": 1.0328, + "step": 1571 + }, + { + "epoch": 1.463687150837989, + "grad_norm": 1.6104297637939453, + "learning_rate": 4.364694544564711e-06, + "loss": 1.0077, + "step": 1572 + }, + { + "epoch": 1.4646182495344506, + "grad_norm": 1.5873563289642334, + "learning_rate": 4.3638696939561595e-06, + "loss": 1.0187, + "step": 1573 + }, + { + "epoch": 1.4655493482309125, + "grad_norm": 1.6917014122009277, + "learning_rate": 4.363044386271063e-06, + "loss": 1.0458, + "step": 1574 + }, + { + "epoch": 1.4664804469273742, + "grad_norm": 1.612997055053711, + "learning_rate": 4.3622186217118135e-06, + "loss": 1.0292, + "step": 1575 + }, + { + "epoch": 1.4674115456238361, + "grad_norm": 1.602902889251709, + "learning_rate": 4.361392400480912e-06, + "loss": 1.019, + "step": 1576 + }, + { + "epoch": 1.4683426443202978, + "grad_norm": 1.5190726518630981, + "learning_rate": 4.360565722780974e-06, + "loss": 0.9725, + "step": 1577 + }, + { + "epoch": 1.4692737430167597, + "grad_norm": 1.535841464996338, + "learning_rate": 4.3597385888147235e-06, + "loss": 0.988, + "step": 1578 + }, + { + "epoch": 1.4702048417132216, + "grad_norm": 1.4984718561172485, + "learning_rate": 4.358910998785001e-06, + "loss": 0.9924, + "step": 1579 + }, + { + "epoch": 1.4711359404096833, + "grad_norm": 1.6019173860549927, + "learning_rate": 4.358082952894753e-06, + "loss": 1.053, + "step": 1580 + }, + { + "epoch": 1.4720670391061452, + "grad_norm": 1.569354772567749, + "learning_rate": 4.357254451347045e-06, + "loss": 1.0012, + "step": 1581 + }, + { + "epoch": 1.4729981378026071, + "grad_norm": 1.5266960859298706, + "learning_rate": 4.356425494345047e-06, + "loss": 0.9885, + "step": 1582 + }, + { + "epoch": 1.4739292364990688, + "grad_norm": 1.5382027626037598, + "learning_rate": 4.3555960820920465e-06, + "loss": 1.0262, + "step": 1583 + }, + { + "epoch": 1.4748603351955307, + "grad_norm": 1.5143629312515259, + "learning_rate": 4.354766214791439e-06, + "loss": 0.9881, + "step": 1584 + }, + { + "epoch": 1.4757914338919926, + "grad_norm": 1.5462934970855713, + "learning_rate": 4.353935892646732e-06, + "loss": 1.029, + "step": 1585 + }, + { + "epoch": 1.4767225325884543, + "grad_norm": 1.58848237991333, + "learning_rate": 4.353105115861546e-06, + "loss": 1.0053, + "step": 1586 + }, + { + "epoch": 1.4776536312849162, + "grad_norm": 1.5712151527404785, + "learning_rate": 4.352273884639613e-06, + "loss": 0.9747, + "step": 1587 + }, + { + "epoch": 1.4785847299813781, + "grad_norm": 1.526538372039795, + "learning_rate": 4.351442199184776e-06, + "loss": 1.0062, + "step": 1588 + }, + { + "epoch": 1.4795158286778398, + "grad_norm": 1.5572766065597534, + "learning_rate": 4.350610059700986e-06, + "loss": 1.0126, + "step": 1589 + }, + { + "epoch": 1.4804469273743017, + "grad_norm": 1.522646427154541, + "learning_rate": 4.349777466392313e-06, + "loss": 0.9894, + "step": 1590 + }, + { + "epoch": 1.4813780260707636, + "grad_norm": 1.5419663190841675, + "learning_rate": 4.34894441946293e-06, + "loss": 0.9672, + "step": 1591 + }, + { + "epoch": 1.4823091247672253, + "grad_norm": 1.551070213317871, + "learning_rate": 4.348110919117128e-06, + "loss": 1.0119, + "step": 1592 + }, + { + "epoch": 1.4832402234636872, + "grad_norm": 1.5017564296722412, + "learning_rate": 4.3472769655593035e-06, + "loss": 1.0066, + "step": 1593 + }, + { + "epoch": 1.484171322160149, + "grad_norm": 1.5424667596817017, + "learning_rate": 4.346442558993969e-06, + "loss": 1.025, + "step": 1594 + }, + { + "epoch": 1.4851024208566108, + "grad_norm": 1.5252048969268799, + "learning_rate": 4.345607699625744e-06, + "loss": 1.0331, + "step": 1595 + }, + { + "epoch": 1.4860335195530725, + "grad_norm": 1.5838569402694702, + "learning_rate": 4.344772387659362e-06, + "loss": 1.0777, + "step": 1596 + }, + { + "epoch": 1.4869646182495344, + "grad_norm": 1.491519570350647, + "learning_rate": 4.343936623299667e-06, + "loss": 0.9835, + "step": 1597 + }, + { + "epoch": 1.4878957169459963, + "grad_norm": 1.5592520236968994, + "learning_rate": 4.343100406751612e-06, + "loss": 0.9963, + "step": 1598 + }, + { + "epoch": 1.488826815642458, + "grad_norm": 1.525503158569336, + "learning_rate": 4.342263738220264e-06, + "loss": 0.9823, + "step": 1599 + }, + { + "epoch": 1.48975791433892, + "grad_norm": 1.5770426988601685, + "learning_rate": 4.3414266179107975e-06, + "loss": 1.0168, + "step": 1600 + }, + { + "epoch": 1.4906890130353818, + "grad_norm": 1.5425713062286377, + "learning_rate": 4.340589046028501e-06, + "loss": 0.9938, + "step": 1601 + }, + { + "epoch": 1.4916201117318435, + "grad_norm": 1.5985755920410156, + "learning_rate": 4.33975102277877e-06, + "loss": 0.9895, + "step": 1602 + }, + { + "epoch": 1.4925512104283054, + "grad_norm": 1.6213315725326538, + "learning_rate": 4.3389125483671145e-06, + "loss": 1.0204, + "step": 1603 + }, + { + "epoch": 1.4934823091247673, + "grad_norm": 1.5698057413101196, + "learning_rate": 4.3380736229991535e-06, + "loss": 1.0563, + "step": 1604 + }, + { + "epoch": 1.494413407821229, + "grad_norm": 1.6291435956954956, + "learning_rate": 4.337234246880616e-06, + "loss": 1.0361, + "step": 1605 + }, + { + "epoch": 1.495344506517691, + "grad_norm": 1.4916856288909912, + "learning_rate": 4.336394420217342e-06, + "loss": 1.0341, + "step": 1606 + }, + { + "epoch": 1.4962756052141528, + "grad_norm": 1.5709009170532227, + "learning_rate": 4.3355541432152826e-06, + "loss": 1.0507, + "step": 1607 + }, + { + "epoch": 1.4972067039106145, + "grad_norm": 1.554448127746582, + "learning_rate": 4.334713416080498e-06, + "loss": 1.0495, + "step": 1608 + }, + { + "epoch": 1.4981378026070764, + "grad_norm": 1.5857200622558594, + "learning_rate": 4.3338722390191615e-06, + "loss": 1.065, + "step": 1609 + }, + { + "epoch": 1.499068901303538, + "grad_norm": 1.514541745185852, + "learning_rate": 4.3330306122375524e-06, + "loss": 1.0344, + "step": 1610 + }, + { + "epoch": 1.5, + "grad_norm": 1.496036410331726, + "learning_rate": 4.3321885359420635e-06, + "loss": 1.0013, + "step": 1611 + }, + { + "epoch": 1.5009310986964617, + "grad_norm": 1.5402806997299194, + "learning_rate": 4.331346010339198e-06, + "loss": 1.049, + "step": 1612 + }, + { + "epoch": 1.5018621973929238, + "grad_norm": 1.5287505388259888, + "learning_rate": 4.330503035635568e-06, + "loss": 1.0085, + "step": 1613 + }, + { + "epoch": 1.5027932960893855, + "grad_norm": 1.5743392705917358, + "learning_rate": 4.329659612037895e-06, + "loss": 0.996, + "step": 1614 + }, + { + "epoch": 1.5037243947858472, + "grad_norm": 1.5880268812179565, + "learning_rate": 4.3288157397530135e-06, + "loss": 1.0273, + "step": 1615 + }, + { + "epoch": 1.504655493482309, + "grad_norm": 1.5983085632324219, + "learning_rate": 4.327971418987866e-06, + "loss": 0.9985, + "step": 1616 + }, + { + "epoch": 1.505586592178771, + "grad_norm": 1.6333311796188354, + "learning_rate": 4.327126649949504e-06, + "loss": 1.0232, + "step": 1617 + }, + { + "epoch": 1.5065176908752327, + "grad_norm": 1.74416184425354, + "learning_rate": 4.326281432845089e-06, + "loss": 1.0844, + "step": 1618 + }, + { + "epoch": 1.5074487895716946, + "grad_norm": 1.5786820650100708, + "learning_rate": 4.325435767881896e-06, + "loss": 1.0042, + "step": 1619 + }, + { + "epoch": 1.5083798882681565, + "grad_norm": 1.5413870811462402, + "learning_rate": 4.324589655267306e-06, + "loss": 1.0145, + "step": 1620 + }, + { + "epoch": 1.5093109869646182, + "grad_norm": 1.6098166704177856, + "learning_rate": 4.323743095208812e-06, + "loss": 1.0154, + "step": 1621 + }, + { + "epoch": 1.51024208566108, + "grad_norm": 1.5322644710540771, + "learning_rate": 4.322896087914016e-06, + "loss": 1.0256, + "step": 1622 + }, + { + "epoch": 1.511173184357542, + "grad_norm": 1.534049153327942, + "learning_rate": 4.322048633590628e-06, + "loss": 1.0344, + "step": 1623 + }, + { + "epoch": 1.5121042830540037, + "grad_norm": 1.6334002017974854, + "learning_rate": 4.3212007324464684e-06, + "loss": 1.0057, + "step": 1624 + }, + { + "epoch": 1.5130353817504656, + "grad_norm": 1.5892307758331299, + "learning_rate": 4.3203523846894715e-06, + "loss": 0.985, + "step": 1625 + }, + { + "epoch": 1.5139664804469275, + "grad_norm": 1.5497863292694092, + "learning_rate": 4.319503590527675e-06, + "loss": 1.0127, + "step": 1626 + }, + { + "epoch": 1.5148975791433892, + "grad_norm": 1.5987825393676758, + "learning_rate": 4.318654350169228e-06, + "loss": 0.9887, + "step": 1627 + }, + { + "epoch": 1.5158286778398509, + "grad_norm": 1.5655267238616943, + "learning_rate": 4.317804663822391e-06, + "loss": 0.9991, + "step": 1628 + }, + { + "epoch": 1.516759776536313, + "grad_norm": 1.5631802082061768, + "learning_rate": 4.316954531695533e-06, + "loss": 0.988, + "step": 1629 + }, + { + "epoch": 1.5176908752327747, + "grad_norm": 1.5858218669891357, + "learning_rate": 4.31610395399713e-06, + "loss": 1.0262, + "step": 1630 + }, + { + "epoch": 1.5186219739292364, + "grad_norm": 1.6221550703048706, + "learning_rate": 4.315252930935771e-06, + "loss": 1.0221, + "step": 1631 + }, + { + "epoch": 1.5195530726256983, + "grad_norm": 1.6089915037155151, + "learning_rate": 4.31440146272015e-06, + "loss": 1.0106, + "step": 1632 + }, + { + "epoch": 1.5204841713221602, + "grad_norm": 1.5913159847259521, + "learning_rate": 4.313549549559074e-06, + "loss": 1.0143, + "step": 1633 + }, + { + "epoch": 1.5214152700186219, + "grad_norm": 1.49701726436615, + "learning_rate": 4.312697191661457e-06, + "loss": 0.9691, + "step": 1634 + }, + { + "epoch": 1.5223463687150838, + "grad_norm": 1.5818606615066528, + "learning_rate": 4.311844389236324e-06, + "loss": 1.0178, + "step": 1635 + }, + { + "epoch": 1.5232774674115457, + "grad_norm": 1.660035490989685, + "learning_rate": 4.310991142492806e-06, + "loss": 0.98, + "step": 1636 + }, + { + "epoch": 1.5242085661080074, + "grad_norm": 1.5978046655654907, + "learning_rate": 4.310137451640144e-06, + "loss": 1.0154, + "step": 1637 + }, + { + "epoch": 1.5251396648044693, + "grad_norm": 1.5881710052490234, + "learning_rate": 4.309283316887691e-06, + "loss": 1.0269, + "step": 1638 + }, + { + "epoch": 1.5260707635009312, + "grad_norm": 1.5614145994186401, + "learning_rate": 4.308428738444904e-06, + "loss": 1.0075, + "step": 1639 + }, + { + "epoch": 1.5270018621973929, + "grad_norm": 1.546956181526184, + "learning_rate": 4.307573716521353e-06, + "loss": 1.0178, + "step": 1640 + }, + { + "epoch": 1.5279329608938548, + "grad_norm": 1.6735785007476807, + "learning_rate": 4.306718251326714e-06, + "loss": 1.0419, + "step": 1641 + }, + { + "epoch": 1.5288640595903167, + "grad_norm": 1.528800129890442, + "learning_rate": 4.305862343070772e-06, + "loss": 0.9834, + "step": 1642 + }, + { + "epoch": 1.5297951582867784, + "grad_norm": 1.5266664028167725, + "learning_rate": 4.305005991963423e-06, + "loss": 1.0035, + "step": 1643 + }, + { + "epoch": 1.5307262569832403, + "grad_norm": 1.5515495538711548, + "learning_rate": 4.304149198214669e-06, + "loss": 1.0017, + "step": 1644 + }, + { + "epoch": 1.5316573556797022, + "grad_norm": 1.508175253868103, + "learning_rate": 4.30329196203462e-06, + "loss": 0.95, + "step": 1645 + }, + { + "epoch": 1.5325884543761639, + "grad_norm": 1.6205800771713257, + "learning_rate": 4.302434283633499e-06, + "loss": 1.0154, + "step": 1646 + }, + { + "epoch": 1.5335195530726256, + "grad_norm": 1.5815434455871582, + "learning_rate": 4.301576163221631e-06, + "loss": 1.0083, + "step": 1647 + }, + { + "epoch": 1.5344506517690877, + "grad_norm": 1.5696470737457275, + "learning_rate": 4.3007176010094545e-06, + "loss": 1.0549, + "step": 1648 + }, + { + "epoch": 1.5353817504655494, + "grad_norm": 1.6360572576522827, + "learning_rate": 4.299858597207514e-06, + "loss": 1.0432, + "step": 1649 + }, + { + "epoch": 1.536312849162011, + "grad_norm": 1.6072953939437866, + "learning_rate": 4.298999152026465e-06, + "loss": 1.0322, + "step": 1650 + }, + { + "epoch": 1.537243947858473, + "grad_norm": 1.5885045528411865, + "learning_rate": 4.298139265677067e-06, + "loss": 0.9939, + "step": 1651 + }, + { + "epoch": 1.5381750465549349, + "grad_norm": 1.584114909172058, + "learning_rate": 4.29727893837019e-06, + "loss": 0.9872, + "step": 1652 + }, + { + "epoch": 1.5391061452513966, + "grad_norm": 1.5574947595596313, + "learning_rate": 4.296418170316813e-06, + "loss": 1.04, + "step": 1653 + }, + { + "epoch": 1.5400372439478585, + "grad_norm": 1.6010804176330566, + "learning_rate": 4.29555696172802e-06, + "loss": 1.0243, + "step": 1654 + }, + { + "epoch": 1.5409683426443204, + "grad_norm": 1.6047048568725586, + "learning_rate": 4.294695312815008e-06, + "loss": 1.043, + "step": 1655 + }, + { + "epoch": 1.541899441340782, + "grad_norm": 1.6073797941207886, + "learning_rate": 4.293833223789076e-06, + "loss": 1.019, + "step": 1656 + }, + { + "epoch": 1.542830540037244, + "grad_norm": 1.5430538654327393, + "learning_rate": 4.292970694861636e-06, + "loss": 0.9849, + "step": 1657 + }, + { + "epoch": 1.5437616387337059, + "grad_norm": 1.6261460781097412, + "learning_rate": 4.292107726244206e-06, + "loss": 1.0406, + "step": 1658 + }, + { + "epoch": 1.5446927374301676, + "grad_norm": 1.582450270652771, + "learning_rate": 4.291244318148411e-06, + "loss": 1.021, + "step": 1659 + }, + { + "epoch": 1.5456238361266295, + "grad_norm": 1.5819180011749268, + "learning_rate": 4.290380470785984e-06, + "loss": 1.0121, + "step": 1660 + }, + { + "epoch": 1.5465549348230914, + "grad_norm": 1.5410597324371338, + "learning_rate": 4.289516184368766e-06, + "loss": 1.012, + "step": 1661 + }, + { + "epoch": 1.547486033519553, + "grad_norm": 1.5834349393844604, + "learning_rate": 4.288651459108708e-06, + "loss": 0.9964, + "step": 1662 + }, + { + "epoch": 1.5484171322160147, + "grad_norm": 1.5701252222061157, + "learning_rate": 4.287786295217864e-06, + "loss": 1.0471, + "step": 1663 + }, + { + "epoch": 1.5493482309124769, + "grad_norm": 1.610573172569275, + "learning_rate": 4.286920692908399e-06, + "loss": 1.0419, + "step": 1664 + }, + { + "epoch": 1.5502793296089385, + "grad_norm": 1.5417912006378174, + "learning_rate": 4.286054652392586e-06, + "loss": 0.9699, + "step": 1665 + }, + { + "epoch": 1.5512104283054002, + "grad_norm": 1.555499792098999, + "learning_rate": 4.285188173882802e-06, + "loss": 1.0109, + "step": 1666 + }, + { + "epoch": 1.5521415270018621, + "grad_norm": 1.5716497898101807, + "learning_rate": 4.284321257591533e-06, + "loss": 1.0581, + "step": 1667 + }, + { + "epoch": 1.553072625698324, + "grad_norm": 1.6050313711166382, + "learning_rate": 4.283453903731375e-06, + "loss": 0.9937, + "step": 1668 + }, + { + "epoch": 1.5540037243947857, + "grad_norm": 1.5794506072998047, + "learning_rate": 4.282586112515027e-06, + "loss": 1.0522, + "step": 1669 + }, + { + "epoch": 1.5549348230912476, + "grad_norm": 1.5321145057678223, + "learning_rate": 4.2817178841552985e-06, + "loss": 1.0093, + "step": 1670 + }, + { + "epoch": 1.5558659217877095, + "grad_norm": 1.5704569816589355, + "learning_rate": 4.2808492188651054e-06, + "loss": 1.0276, + "step": 1671 + }, + { + "epoch": 1.5567970204841712, + "grad_norm": 1.5735173225402832, + "learning_rate": 4.279980116857469e-06, + "loss": 1.0586, + "step": 1672 + }, + { + "epoch": 1.5577281191806331, + "grad_norm": 1.586900234222412, + "learning_rate": 4.27911057834552e-06, + "loss": 1.0028, + "step": 1673 + }, + { + "epoch": 1.558659217877095, + "grad_norm": 1.5465446710586548, + "learning_rate": 4.278240603542496e-06, + "loss": 0.9711, + "step": 1674 + }, + { + "epoch": 1.5595903165735567, + "grad_norm": 1.613728404045105, + "learning_rate": 4.27737019266174e-06, + "loss": 1.0134, + "step": 1675 + }, + { + "epoch": 1.5605214152700186, + "grad_norm": 1.5634862184524536, + "learning_rate": 4.276499345916701e-06, + "loss": 0.9971, + "step": 1676 + }, + { + "epoch": 1.5614525139664805, + "grad_norm": 1.5275167226791382, + "learning_rate": 4.275628063520939e-06, + "loss": 1.0019, + "step": 1677 + }, + { + "epoch": 1.5623836126629422, + "grad_norm": 1.6941787004470825, + "learning_rate": 4.274756345688118e-06, + "loss": 1.016, + "step": 1678 + }, + { + "epoch": 1.5633147113594041, + "grad_norm": 1.5935890674591064, + "learning_rate": 4.2738841926320095e-06, + "loss": 1.0739, + "step": 1679 + }, + { + "epoch": 1.564245810055866, + "grad_norm": 1.5766217708587646, + "learning_rate": 4.27301160456649e-06, + "loss": 0.99, + "step": 1680 + }, + { + "epoch": 1.5651769087523277, + "grad_norm": 1.532702922821045, + "learning_rate": 4.2721385817055465e-06, + "loss": 0.9736, + "step": 1681 + }, + { + "epoch": 1.5661080074487894, + "grad_norm": 1.4808470010757446, + "learning_rate": 4.271265124263267e-06, + "loss": 0.9814, + "step": 1682 + }, + { + "epoch": 1.5670391061452515, + "grad_norm": 1.5192599296569824, + "learning_rate": 4.270391232453853e-06, + "loss": 1.0212, + "step": 1683 + }, + { + "epoch": 1.5679702048417132, + "grad_norm": 1.5223890542984009, + "learning_rate": 4.269516906491607e-06, + "loss": 1.0154, + "step": 1684 + }, + { + "epoch": 1.568901303538175, + "grad_norm": 1.5646499395370483, + "learning_rate": 4.26864214659094e-06, + "loss": 0.9994, + "step": 1685 + }, + { + "epoch": 1.5698324022346368, + "grad_norm": 1.5947473049163818, + "learning_rate": 4.267766952966369e-06, + "loss": 1.0549, + "step": 1686 + }, + { + "epoch": 1.5707635009310987, + "grad_norm": 1.5698765516281128, + "learning_rate": 4.2668913258325186e-06, + "loss": 0.9759, + "step": 1687 + }, + { + "epoch": 1.5716945996275604, + "grad_norm": 1.563552975654602, + "learning_rate": 4.266015265404118e-06, + "loss": 1.0288, + "step": 1688 + }, + { + "epoch": 1.5726256983240223, + "grad_norm": 1.5093134641647339, + "learning_rate": 4.265138771896003e-06, + "loss": 0.9821, + "step": 1689 + }, + { + "epoch": 1.5735567970204842, + "grad_norm": 1.45540189743042, + "learning_rate": 4.264261845523117e-06, + "loss": 0.9767, + "step": 1690 + }, + { + "epoch": 1.574487895716946, + "grad_norm": 1.6118829250335693, + "learning_rate": 4.263384486500508e-06, + "loss": 1.0468, + "step": 1691 + }, + { + "epoch": 1.5754189944134078, + "grad_norm": 1.5699982643127441, + "learning_rate": 4.2625066950433305e-06, + "loss": 1.0319, + "step": 1692 + }, + { + "epoch": 1.5763500931098697, + "grad_norm": 1.5150530338287354, + "learning_rate": 4.261628471366845e-06, + "loss": 0.9948, + "step": 1693 + }, + { + "epoch": 1.5772811918063314, + "grad_norm": 1.581607460975647, + "learning_rate": 4.260749815686419e-06, + "loss": 1.0573, + "step": 1694 + }, + { + "epoch": 1.5782122905027933, + "grad_norm": 1.5703697204589844, + "learning_rate": 4.259870728217525e-06, + "loss": 1.0074, + "step": 1695 + }, + { + "epoch": 1.5791433891992552, + "grad_norm": 1.597098469734192, + "learning_rate": 4.2589912091757415e-06, + "loss": 1.0501, + "step": 1696 + }, + { + "epoch": 1.580074487895717, + "grad_norm": 1.5741326808929443, + "learning_rate": 4.258111258776751e-06, + "loss": 1.0303, + "step": 1697 + }, + { + "epoch": 1.5810055865921788, + "grad_norm": 1.583868145942688, + "learning_rate": 4.257230877236347e-06, + "loss": 1.0344, + "step": 1698 + }, + { + "epoch": 1.5819366852886407, + "grad_norm": 1.5371819734573364, + "learning_rate": 4.256350064770424e-06, + "loss": 1.0098, + "step": 1699 + }, + { + "epoch": 1.5828677839851024, + "grad_norm": 1.5712038278579712, + "learning_rate": 4.255468821594981e-06, + "loss": 1.0294, + "step": 1700 + }, + { + "epoch": 1.583798882681564, + "grad_norm": 1.5606169700622559, + "learning_rate": 4.254587147926129e-06, + "loss": 1.0818, + "step": 1701 + }, + { + "epoch": 1.5847299813780262, + "grad_norm": 1.6024121046066284, + "learning_rate": 4.2537050439800775e-06, + "loss": 1.0258, + "step": 1702 + }, + { + "epoch": 1.585661080074488, + "grad_norm": 1.604117512702942, + "learning_rate": 4.252822509973148e-06, + "loss": 1.0501, + "step": 1703 + }, + { + "epoch": 1.5865921787709496, + "grad_norm": 1.5062448978424072, + "learning_rate": 4.251939546121761e-06, + "loss": 1.0455, + "step": 1704 + }, + { + "epoch": 1.5875232774674115, + "grad_norm": 1.592667579650879, + "learning_rate": 4.251056152642448e-06, + "loss": 0.9994, + "step": 1705 + }, + { + "epoch": 1.5884543761638734, + "grad_norm": 1.587271809577942, + "learning_rate": 4.250172329751843e-06, + "loss": 0.9718, + "step": 1706 + }, + { + "epoch": 1.589385474860335, + "grad_norm": 1.6200915575027466, + "learning_rate": 4.249288077666684e-06, + "loss": 1.0291, + "step": 1707 + }, + { + "epoch": 1.590316573556797, + "grad_norm": 1.5204311609268188, + "learning_rate": 4.248403396603818e-06, + "loss": 0.9555, + "step": 1708 + }, + { + "epoch": 1.591247672253259, + "grad_norm": 1.5054893493652344, + "learning_rate": 4.2475182867801945e-06, + "loss": 0.985, + "step": 1709 + }, + { + "epoch": 1.5921787709497206, + "grad_norm": 1.698233723640442, + "learning_rate": 4.246632748412869e-06, + "loss": 1.0638, + "step": 1710 + }, + { + "epoch": 1.5931098696461825, + "grad_norm": 1.586362361907959, + "learning_rate": 4.245746781719002e-06, + "loss": 0.9907, + "step": 1711 + }, + { + "epoch": 1.5940409683426444, + "grad_norm": 1.6033439636230469, + "learning_rate": 4.2448603869158585e-06, + "loss": 1.0306, + "step": 1712 + }, + { + "epoch": 1.594972067039106, + "grad_norm": 1.551690697669983, + "learning_rate": 4.243973564220811e-06, + "loss": 0.9998, + "step": 1713 + }, + { + "epoch": 1.595903165735568, + "grad_norm": 1.5957772731781006, + "learning_rate": 4.243086313851332e-06, + "loss": 1.0524, + "step": 1714 + }, + { + "epoch": 1.59683426443203, + "grad_norm": 1.5673012733459473, + "learning_rate": 4.242198636025004e-06, + "loss": 1.0687, + "step": 1715 + }, + { + "epoch": 1.5977653631284916, + "grad_norm": 1.5716279745101929, + "learning_rate": 4.241310530959511e-06, + "loss": 1.0395, + "step": 1716 + }, + { + "epoch": 1.5986964618249533, + "grad_norm": 1.5629993677139282, + "learning_rate": 4.240421998872643e-06, + "loss": 1.0239, + "step": 1717 + }, + { + "epoch": 1.5996275605214154, + "grad_norm": 1.5452954769134521, + "learning_rate": 4.239533039982295e-06, + "loss": 0.9931, + "step": 1718 + }, + { + "epoch": 1.600558659217877, + "grad_norm": 1.561960220336914, + "learning_rate": 4.238643654506466e-06, + "loss": 1.0137, + "step": 1719 + }, + { + "epoch": 1.6014897579143388, + "grad_norm": 1.5363296270370483, + "learning_rate": 4.237753842663259e-06, + "loss": 1.0249, + "step": 1720 + }, + { + "epoch": 1.6024208566108007, + "grad_norm": 1.5614672899246216, + "learning_rate": 4.236863604670885e-06, + "loss": 1.0258, + "step": 1721 + }, + { + "epoch": 1.6033519553072626, + "grad_norm": 1.493328332901001, + "learning_rate": 4.235972940747655e-06, + "loss": 1.0081, + "step": 1722 + }, + { + "epoch": 1.6042830540037243, + "grad_norm": 1.5374797582626343, + "learning_rate": 4.235081851111987e-06, + "loss": 1.0028, + "step": 1723 + }, + { + "epoch": 1.6052141527001862, + "grad_norm": 1.5515986680984497, + "learning_rate": 4.234190335982402e-06, + "loss": 1.0305, + "step": 1724 + }, + { + "epoch": 1.606145251396648, + "grad_norm": 1.576271891593933, + "learning_rate": 4.233298395577527e-06, + "loss": 1.0491, + "step": 1725 + }, + { + "epoch": 1.6070763500931098, + "grad_norm": 1.4784661531448364, + "learning_rate": 4.232406030116093e-06, + "loss": 0.9514, + "step": 1726 + }, + { + "epoch": 1.6080074487895717, + "grad_norm": 1.55087149143219, + "learning_rate": 4.231513239816933e-06, + "loss": 1.035, + "step": 1727 + }, + { + "epoch": 1.6089385474860336, + "grad_norm": 1.5262744426727295, + "learning_rate": 4.230620024898987e-06, + "loss": 0.9637, + "step": 1728 + }, + { + "epoch": 1.6098696461824953, + "grad_norm": 1.5404300689697266, + "learning_rate": 4.229726385581298e-06, + "loss": 0.9982, + "step": 1729 + }, + { + "epoch": 1.6108007448789572, + "grad_norm": 1.5411280393600464, + "learning_rate": 4.228832322083013e-06, + "loss": 0.9682, + "step": 1730 + }, + { + "epoch": 1.611731843575419, + "grad_norm": 1.5592252016067505, + "learning_rate": 4.227937834623382e-06, + "loss": 0.995, + "step": 1731 + }, + { + "epoch": 1.6126629422718808, + "grad_norm": 1.5622261762619019, + "learning_rate": 4.227042923421762e-06, + "loss": 1.0032, + "step": 1732 + }, + { + "epoch": 1.6135940409683427, + "grad_norm": 1.6128736734390259, + "learning_rate": 4.22614758869761e-06, + "loss": 1.0497, + "step": 1733 + }, + { + "epoch": 1.6145251396648046, + "grad_norm": 1.5503853559494019, + "learning_rate": 4.22525183067049e-06, + "loss": 1.0408, + "step": 1734 + }, + { + "epoch": 1.6154562383612663, + "grad_norm": 1.5317991971969604, + "learning_rate": 4.224355649560069e-06, + "loss": 0.975, + "step": 1735 + }, + { + "epoch": 1.616387337057728, + "grad_norm": 1.544202446937561, + "learning_rate": 4.223459045586115e-06, + "loss": 0.9766, + "step": 1736 + }, + { + "epoch": 1.61731843575419, + "grad_norm": 1.6110060214996338, + "learning_rate": 4.222562018968506e-06, + "loss": 1.0375, + "step": 1737 + }, + { + "epoch": 1.6182495344506518, + "grad_norm": 1.545624017715454, + "learning_rate": 4.221664569927217e-06, + "loss": 1.0226, + "step": 1738 + }, + { + "epoch": 1.6191806331471135, + "grad_norm": 1.5491719245910645, + "learning_rate": 4.2207666986823295e-06, + "loss": 0.9853, + "step": 1739 + }, + { + "epoch": 1.6201117318435754, + "grad_norm": 1.5949021577835083, + "learning_rate": 4.219868405454029e-06, + "loss": 1.0244, + "step": 1740 + }, + { + "epoch": 1.6210428305400373, + "grad_norm": 1.5667827129364014, + "learning_rate": 4.218969690462603e-06, + "loss": 1.0295, + "step": 1741 + }, + { + "epoch": 1.621973929236499, + "grad_norm": 1.5576833486557007, + "learning_rate": 4.218070553928444e-06, + "loss": 1.0323, + "step": 1742 + }, + { + "epoch": 1.6229050279329609, + "grad_norm": 1.5543231964111328, + "learning_rate": 4.217170996072048e-06, + "loss": 1.0304, + "step": 1743 + }, + { + "epoch": 1.6238361266294228, + "grad_norm": 1.523926854133606, + "learning_rate": 4.216271017114012e-06, + "loss": 1.0071, + "step": 1744 + }, + { + "epoch": 1.6247672253258845, + "grad_norm": 1.5696873664855957, + "learning_rate": 4.2153706172750375e-06, + "loss": 0.9715, + "step": 1745 + }, + { + "epoch": 1.6256983240223464, + "grad_norm": 1.656225562095642, + "learning_rate": 4.2144697967759315e-06, + "loss": 1.0515, + "step": 1746 + }, + { + "epoch": 1.6266294227188083, + "grad_norm": 1.5823173522949219, + "learning_rate": 4.2135685558376e-06, + "loss": 1.0513, + "step": 1747 + }, + { + "epoch": 1.62756052141527, + "grad_norm": 1.5069504976272583, + "learning_rate": 4.212666894681054e-06, + "loss": 0.9981, + "step": 1748 + }, + { + "epoch": 1.6284916201117319, + "grad_norm": 1.5372499227523804, + "learning_rate": 4.211764813527411e-06, + "loss": 0.9588, + "step": 1749 + }, + { + "epoch": 1.6294227188081938, + "grad_norm": 1.6231273412704468, + "learning_rate": 4.210862312597884e-06, + "loss": 1.032, + "step": 1750 + }, + { + "epoch": 1.6303538175046555, + "grad_norm": 1.5773149728775024, + "learning_rate": 4.209959392113796e-06, + "loss": 1.0025, + "step": 1751 + }, + { + "epoch": 1.6312849162011172, + "grad_norm": 1.5779573917388916, + "learning_rate": 4.209056052296569e-06, + "loss": 1.007, + "step": 1752 + }, + { + "epoch": 1.6322160148975793, + "grad_norm": 1.6042360067367554, + "learning_rate": 4.20815229336773e-06, + "loss": 1.0212, + "step": 1753 + }, + { + "epoch": 1.633147113594041, + "grad_norm": 1.5667567253112793, + "learning_rate": 4.207248115548906e-06, + "loss": 0.9833, + "step": 1754 + }, + { + "epoch": 1.6340782122905027, + "grad_norm": 1.601111650466919, + "learning_rate": 4.20634351906183e-06, + "loss": 1.0281, + "step": 1755 + }, + { + "epoch": 1.6350093109869648, + "grad_norm": 1.6138057708740234, + "learning_rate": 4.205438504128335e-06, + "loss": 1.02, + "step": 1756 + }, + { + "epoch": 1.6359404096834265, + "grad_norm": 1.5983747243881226, + "learning_rate": 4.204533070970358e-06, + "loss": 0.9883, + "step": 1757 + }, + { + "epoch": 1.6368715083798882, + "grad_norm": 1.6062757968902588, + "learning_rate": 4.2036272198099384e-06, + "loss": 1.0299, + "step": 1758 + }, + { + "epoch": 1.63780260707635, + "grad_norm": 1.5497345924377441, + "learning_rate": 4.202720950869218e-06, + "loss": 1.0102, + "step": 1759 + }, + { + "epoch": 1.638733705772812, + "grad_norm": 1.6185259819030762, + "learning_rate": 4.201814264370441e-06, + "loss": 1.0081, + "step": 1760 + }, + { + "epoch": 1.6396648044692737, + "grad_norm": 1.5611456632614136, + "learning_rate": 4.200907160535954e-06, + "loss": 1.0048, + "step": 1761 + }, + { + "epoch": 1.6405959031657356, + "grad_norm": 1.5442298650741577, + "learning_rate": 4.199999639588206e-06, + "loss": 0.9999, + "step": 1762 + }, + { + "epoch": 1.6415270018621975, + "grad_norm": 1.5870798826217651, + "learning_rate": 4.199091701749748e-06, + "loss": 0.9889, + "step": 1763 + }, + { + "epoch": 1.6424581005586592, + "grad_norm": 1.5467183589935303, + "learning_rate": 4.198183347243233e-06, + "loss": 1.0103, + "step": 1764 + }, + { + "epoch": 1.643389199255121, + "grad_norm": 1.5057183504104614, + "learning_rate": 4.197274576291418e-06, + "loss": 0.9773, + "step": 1765 + }, + { + "epoch": 1.644320297951583, + "grad_norm": 1.5750811100006104, + "learning_rate": 4.196365389117161e-06, + "loss": 0.9911, + "step": 1766 + }, + { + "epoch": 1.6452513966480447, + "grad_norm": 1.6021958589553833, + "learning_rate": 4.19545578594342e-06, + "loss": 0.9964, + "step": 1767 + }, + { + "epoch": 1.6461824953445066, + "grad_norm": 1.5625004768371582, + "learning_rate": 4.1945457669932575e-06, + "loss": 1.0119, + "step": 1768 + }, + { + "epoch": 1.6471135940409685, + "grad_norm": 1.5571106672286987, + "learning_rate": 4.193635332489839e-06, + "loss": 1.0412, + "step": 1769 + }, + { + "epoch": 1.6480446927374302, + "grad_norm": 1.5201088190078735, + "learning_rate": 4.192724482656429e-06, + "loss": 1.011, + "step": 1770 + }, + { + "epoch": 1.6489757914338918, + "grad_norm": 1.5553494691848755, + "learning_rate": 4.191813217716394e-06, + "loss": 1.0398, + "step": 1771 + }, + { + "epoch": 1.649906890130354, + "grad_norm": 1.5127707719802856, + "learning_rate": 4.190901537893205e-06, + "loss": 0.9685, + "step": 1772 + }, + { + "epoch": 1.6508379888268156, + "grad_norm": 1.5730139017105103, + "learning_rate": 4.189989443410432e-06, + "loss": 1.0446, + "step": 1773 + }, + { + "epoch": 1.6517690875232773, + "grad_norm": 1.5341529846191406, + "learning_rate": 4.189076934491749e-06, + "loss": 1.0077, + "step": 1774 + }, + { + "epoch": 1.6527001862197392, + "grad_norm": 1.6464959383010864, + "learning_rate": 4.18816401136093e-06, + "loss": 1.0555, + "step": 1775 + }, + { + "epoch": 1.6536312849162011, + "grad_norm": 1.5322630405426025, + "learning_rate": 4.187250674241851e-06, + "loss": 0.9992, + "step": 1776 + }, + { + "epoch": 1.6545623836126628, + "grad_norm": 1.5465375185012817, + "learning_rate": 4.186336923358488e-06, + "loss": 0.9907, + "step": 1777 + }, + { + "epoch": 1.6554934823091247, + "grad_norm": 1.518801212310791, + "learning_rate": 4.185422758934923e-06, + "loss": 1.0328, + "step": 1778 + }, + { + "epoch": 1.6564245810055866, + "grad_norm": 1.5929596424102783, + "learning_rate": 4.1845081811953345e-06, + "loss": 1.0329, + "step": 1779 + }, + { + "epoch": 1.6573556797020483, + "grad_norm": 1.5499824285507202, + "learning_rate": 4.183593190364005e-06, + "loss": 0.9968, + "step": 1780 + }, + { + "epoch": 1.6582867783985102, + "grad_norm": 1.517143726348877, + "learning_rate": 4.182677786665317e-06, + "loss": 0.9946, + "step": 1781 + }, + { + "epoch": 1.6592178770949721, + "grad_norm": 1.5566442012786865, + "learning_rate": 4.181761970323756e-06, + "loss": 0.9972, + "step": 1782 + }, + { + "epoch": 1.6601489757914338, + "grad_norm": 1.5849595069885254, + "learning_rate": 4.180845741563905e-06, + "loss": 1.0058, + "step": 1783 + }, + { + "epoch": 1.6610800744878957, + "grad_norm": 1.5653990507125854, + "learning_rate": 4.179929100610454e-06, + "loss": 1.0369, + "step": 1784 + }, + { + "epoch": 1.6620111731843576, + "grad_norm": 1.56673264503479, + "learning_rate": 4.1790120476881875e-06, + "loss": 1.0497, + "step": 1785 + }, + { + "epoch": 1.6629422718808193, + "grad_norm": 1.6064715385437012, + "learning_rate": 4.178094583021997e-06, + "loss": 1.0392, + "step": 1786 + }, + { + "epoch": 1.6638733705772812, + "grad_norm": 1.5575141906738281, + "learning_rate": 4.177176706836871e-06, + "loss": 1.0179, + "step": 1787 + }, + { + "epoch": 1.6648044692737431, + "grad_norm": 1.5933163166046143, + "learning_rate": 4.1762584193578996e-06, + "loss": 1.0264, + "step": 1788 + }, + { + "epoch": 1.6657355679702048, + "grad_norm": 1.533276081085205, + "learning_rate": 4.175339720810276e-06, + "loss": 1.0351, + "step": 1789 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.5583761930465698, + "learning_rate": 4.1744206114192895e-06, + "loss": 1.0436, + "step": 1790 + }, + { + "epoch": 1.6675977653631286, + "grad_norm": 1.5703086853027344, + "learning_rate": 4.173501091410338e-06, + "loss": 0.9978, + "step": 1791 + }, + { + "epoch": 1.6685288640595903, + "grad_norm": 1.5729907751083374, + "learning_rate": 4.172581161008911e-06, + "loss": 1.0299, + "step": 1792 + }, + { + "epoch": 1.669459962756052, + "grad_norm": 1.5342987775802612, + "learning_rate": 4.171660820440605e-06, + "loss": 1.0342, + "step": 1793 + }, + { + "epoch": 1.670391061452514, + "grad_norm": 1.5894691944122314, + "learning_rate": 4.170740069931114e-06, + "loss": 1.0314, + "step": 1794 + }, + { + "epoch": 1.6713221601489758, + "grad_norm": 1.6253712177276611, + "learning_rate": 4.169818909706234e-06, + "loss": 1.032, + "step": 1795 + }, + { + "epoch": 1.6722532588454375, + "grad_norm": 1.6311748027801514, + "learning_rate": 4.168897339991862e-06, + "loss": 1.008, + "step": 1796 + }, + { + "epoch": 1.6731843575418994, + "grad_norm": 1.6183005571365356, + "learning_rate": 4.167975361013992e-06, + "loss": 1.0035, + "step": 1797 + }, + { + "epoch": 1.6741154562383613, + "grad_norm": 1.533651351928711, + "learning_rate": 4.167052972998723e-06, + "loss": 1.0347, + "step": 1798 + }, + { + "epoch": 1.675046554934823, + "grad_norm": 1.4968410730361938, + "learning_rate": 4.166130176172251e-06, + "loss": 1.0056, + "step": 1799 + }, + { + "epoch": 1.675977653631285, + "grad_norm": 1.605269193649292, + "learning_rate": 4.165206970760874e-06, + "loss": 1.0483, + "step": 1800 + }, + { + "epoch": 1.6769087523277468, + "grad_norm": 1.6567280292510986, + "learning_rate": 4.164283356990989e-06, + "loss": 1.0096, + "step": 1801 + }, + { + "epoch": 1.6778398510242085, + "grad_norm": 1.5661386251449585, + "learning_rate": 4.1633593350890945e-06, + "loss": 1.0253, + "step": 1802 + }, + { + "epoch": 1.6787709497206704, + "grad_norm": 1.632533073425293, + "learning_rate": 4.162434905281787e-06, + "loss": 0.9913, + "step": 1803 + }, + { + "epoch": 1.6797020484171323, + "grad_norm": 1.564606785774231, + "learning_rate": 4.1615100677957655e-06, + "loss": 0.9877, + "step": 1804 + }, + { + "epoch": 1.680633147113594, + "grad_norm": 1.518401861190796, + "learning_rate": 4.160584822857827e-06, + "loss": 0.9998, + "step": 1805 + }, + { + "epoch": 1.6815642458100557, + "grad_norm": 1.6043238639831543, + "learning_rate": 4.1596591706948695e-06, + "loss": 1.031, + "step": 1806 + }, + { + "epoch": 1.6824953445065178, + "grad_norm": 1.655834674835205, + "learning_rate": 4.158733111533892e-06, + "loss": 1.0401, + "step": 1807 + }, + { + "epoch": 1.6834264432029795, + "grad_norm": 1.5445494651794434, + "learning_rate": 4.1578066456019885e-06, + "loss": 1.0024, + "step": 1808 + }, + { + "epoch": 1.6843575418994412, + "grad_norm": 1.5908763408660889, + "learning_rate": 4.156879773126359e-06, + "loss": 1.016, + "step": 1809 + }, + { + "epoch": 1.6852886405959033, + "grad_norm": 1.5460089445114136, + "learning_rate": 4.155952494334299e-06, + "loss": 1.0061, + "step": 1810 + }, + { + "epoch": 1.686219739292365, + "grad_norm": 1.6270462274551392, + "learning_rate": 4.1550248094532055e-06, + "loss": 1.0111, + "step": 1811 + }, + { + "epoch": 1.6871508379888267, + "grad_norm": 1.6202434301376343, + "learning_rate": 4.154096718710575e-06, + "loss": 1.0104, + "step": 1812 + }, + { + "epoch": 1.6880819366852886, + "grad_norm": 1.6059703826904297, + "learning_rate": 4.153168222334002e-06, + "loss": 1.0274, + "step": 1813 + }, + { + "epoch": 1.6890130353817505, + "grad_norm": 1.5943083763122559, + "learning_rate": 4.152239320551182e-06, + "loss": 1.0276, + "step": 1814 + }, + { + "epoch": 1.6899441340782122, + "grad_norm": 1.5937108993530273, + "learning_rate": 4.151310013589911e-06, + "loss": 0.9987, + "step": 1815 + }, + { + "epoch": 1.690875232774674, + "grad_norm": 1.5884298086166382, + "learning_rate": 4.15038030167808e-06, + "loss": 1.0126, + "step": 1816 + }, + { + "epoch": 1.691806331471136, + "grad_norm": 1.5987244844436646, + "learning_rate": 4.149450185043684e-06, + "loss": 1.0124, + "step": 1817 + }, + { + "epoch": 1.6927374301675977, + "grad_norm": 1.5544172525405884, + "learning_rate": 4.148519663914814e-06, + "loss": 1.0055, + "step": 1818 + }, + { + "epoch": 1.6936685288640596, + "grad_norm": 1.6529874801635742, + "learning_rate": 4.1475887385196635e-06, + "loss": 0.9751, + "step": 1819 + }, + { + "epoch": 1.6945996275605215, + "grad_norm": 1.5711724758148193, + "learning_rate": 4.146657409086522e-06, + "loss": 1.0334, + "step": 1820 + }, + { + "epoch": 1.6955307262569832, + "grad_norm": 1.590208888053894, + "learning_rate": 4.1457256758437795e-06, + "loss": 0.9884, + "step": 1821 + }, + { + "epoch": 1.696461824953445, + "grad_norm": 1.5541894435882568, + "learning_rate": 4.144793539019926e-06, + "loss": 1.0326, + "step": 1822 + }, + { + "epoch": 1.697392923649907, + "grad_norm": 1.5732228755950928, + "learning_rate": 4.143860998843546e-06, + "loss": 0.9827, + "step": 1823 + }, + { + "epoch": 1.6983240223463687, + "grad_norm": 1.5404633283615112, + "learning_rate": 4.1429280555433305e-06, + "loss": 0.992, + "step": 1824 + }, + { + "epoch": 1.6992551210428304, + "grad_norm": 1.616262674331665, + "learning_rate": 4.141994709348062e-06, + "loss": 1.0318, + "step": 1825 + }, + { + "epoch": 1.7001862197392925, + "grad_norm": 1.5422700643539429, + "learning_rate": 4.141060960486626e-06, + "loss": 1.0038, + "step": 1826 + }, + { + "epoch": 1.7011173184357542, + "grad_norm": 1.6094343662261963, + "learning_rate": 4.1401268091880054e-06, + "loss": 1.0101, + "step": 1827 + }, + { + "epoch": 1.7020484171322159, + "grad_norm": 1.5428720712661743, + "learning_rate": 4.139192255681281e-06, + "loss": 1.0174, + "step": 1828 + }, + { + "epoch": 1.7029795158286778, + "grad_norm": 1.570723056793213, + "learning_rate": 4.138257300195636e-06, + "loss": 1.0162, + "step": 1829 + }, + { + "epoch": 1.7039106145251397, + "grad_norm": 1.4893836975097656, + "learning_rate": 4.137321942960348e-06, + "loss": 1.0002, + "step": 1830 + }, + { + "epoch": 1.7048417132216014, + "grad_norm": 1.5964380502700806, + "learning_rate": 4.136386184204793e-06, + "loss": 1.0456, + "step": 1831 + }, + { + "epoch": 1.7057728119180633, + "grad_norm": 1.5750490427017212, + "learning_rate": 4.135450024158448e-06, + "loss": 1.0276, + "step": 1832 + }, + { + "epoch": 1.7067039106145252, + "grad_norm": 1.5630074739456177, + "learning_rate": 4.134513463050889e-06, + "loss": 0.9788, + "step": 1833 + }, + { + "epoch": 1.7076350093109869, + "grad_norm": 1.5705476999282837, + "learning_rate": 4.133576501111787e-06, + "loss": 1.0317, + "step": 1834 + }, + { + "epoch": 1.7085661080074488, + "grad_norm": 1.5644968748092651, + "learning_rate": 4.132639138570913e-06, + "loss": 1.0207, + "step": 1835 + }, + { + "epoch": 1.7094972067039107, + "grad_norm": 1.5199353694915771, + "learning_rate": 4.131701375658138e-06, + "loss": 0.9853, + "step": 1836 + }, + { + "epoch": 1.7104283054003724, + "grad_norm": 1.5593675374984741, + "learning_rate": 4.130763212603428e-06, + "loss": 0.9939, + "step": 1837 + }, + { + "epoch": 1.7113594040968343, + "grad_norm": 1.580325961112976, + "learning_rate": 4.129824649636849e-06, + "loss": 1.0683, + "step": 1838 + }, + { + "epoch": 1.7122905027932962, + "grad_norm": 1.5195567607879639, + "learning_rate": 4.128885686988564e-06, + "loss": 1.0409, + "step": 1839 + }, + { + "epoch": 1.7132216014897579, + "grad_norm": 1.5187969207763672, + "learning_rate": 4.127946324888836e-06, + "loss": 1.0059, + "step": 1840 + }, + { + "epoch": 1.7141527001862198, + "grad_norm": 1.5323264598846436, + "learning_rate": 4.127006563568024e-06, + "loss": 0.993, + "step": 1841 + }, + { + "epoch": 1.7150837988826817, + "grad_norm": 1.602422833442688, + "learning_rate": 4.126066403256585e-06, + "loss": 1.0041, + "step": 1842 + }, + { + "epoch": 1.7160148975791434, + "grad_norm": 1.577170968055725, + "learning_rate": 4.125125844185076e-06, + "loss": 1.0102, + "step": 1843 + }, + { + "epoch": 1.716945996275605, + "grad_norm": 1.5555764436721802, + "learning_rate": 4.1241848865841485e-06, + "loss": 1.0169, + "step": 1844 + }, + { + "epoch": 1.7178770949720672, + "grad_norm": 1.5609252452850342, + "learning_rate": 4.123243530684554e-06, + "loss": 0.9838, + "step": 1845 + }, + { + "epoch": 1.7188081936685289, + "grad_norm": 1.5773577690124512, + "learning_rate": 4.122301776717141e-06, + "loss": 1.0646, + "step": 1846 + }, + { + "epoch": 1.7197392923649906, + "grad_norm": 1.5558104515075684, + "learning_rate": 4.121359624912856e-06, + "loss": 1.0225, + "step": 1847 + }, + { + "epoch": 1.7206703910614525, + "grad_norm": 1.6140307188034058, + "learning_rate": 4.120417075502743e-06, + "loss": 1.0302, + "step": 1848 + }, + { + "epoch": 1.7216014897579144, + "grad_norm": 1.6136341094970703, + "learning_rate": 4.119474128717943e-06, + "loss": 1.0444, + "step": 1849 + }, + { + "epoch": 1.722532588454376, + "grad_norm": 1.5411615371704102, + "learning_rate": 4.118530784789694e-06, + "loss": 0.9695, + "step": 1850 + }, + { + "epoch": 1.723463687150838, + "grad_norm": 1.5063066482543945, + "learning_rate": 4.117587043949334e-06, + "loss": 1.0043, + "step": 1851 + }, + { + "epoch": 1.7243947858472999, + "grad_norm": 1.5670325756072998, + "learning_rate": 4.116642906428294e-06, + "loss": 1.0175, + "step": 1852 + }, + { + "epoch": 1.7253258845437616, + "grad_norm": 1.5807342529296875, + "learning_rate": 4.115698372458107e-06, + "loss": 1.0311, + "step": 1853 + }, + { + "epoch": 1.7262569832402235, + "grad_norm": 1.539474606513977, + "learning_rate": 4.114753442270399e-06, + "loss": 0.9885, + "step": 1854 + }, + { + "epoch": 1.7271880819366854, + "grad_norm": 1.572718620300293, + "learning_rate": 4.113808116096897e-06, + "loss": 1.0285, + "step": 1855 + }, + { + "epoch": 1.728119180633147, + "grad_norm": 1.5126920938491821, + "learning_rate": 4.112862394169422e-06, + "loss": 1.0006, + "step": 1856 + }, + { + "epoch": 1.729050279329609, + "grad_norm": 1.5438448190689087, + "learning_rate": 4.111916276719892e-06, + "loss": 1.0345, + "step": 1857 + }, + { + "epoch": 1.7299813780260709, + "grad_norm": 1.55963933467865, + "learning_rate": 4.110969763980326e-06, + "loss": 1.0144, + "step": 1858 + }, + { + "epoch": 1.7309124767225326, + "grad_norm": 1.5334101915359497, + "learning_rate": 4.110022856182836e-06, + "loss": 1.0181, + "step": 1859 + }, + { + "epoch": 1.7318435754189943, + "grad_norm": 1.5000497102737427, + "learning_rate": 4.109075553559633e-06, + "loss": 0.9822, + "step": 1860 + }, + { + "epoch": 1.7327746741154564, + "grad_norm": 1.6354038715362549, + "learning_rate": 4.108127856343022e-06, + "loss": 1.0128, + "step": 1861 + }, + { + "epoch": 1.733705772811918, + "grad_norm": 1.625012993812561, + "learning_rate": 4.107179764765408e-06, + "loss": 0.9666, + "step": 1862 + }, + { + "epoch": 1.7346368715083798, + "grad_norm": 1.60771906375885, + "learning_rate": 4.106231279059291e-06, + "loss": 1.0731, + "step": 1863 + }, + { + "epoch": 1.7355679702048417, + "grad_norm": 1.5370794534683228, + "learning_rate": 4.105282399457268e-06, + "loss": 0.9957, + "step": 1864 + }, + { + "epoch": 1.7364990689013036, + "grad_norm": 1.6249788999557495, + "learning_rate": 4.1043331261920325e-06, + "loss": 1.0013, + "step": 1865 + }, + { + "epoch": 1.7374301675977653, + "grad_norm": 1.5934343338012695, + "learning_rate": 4.103383459496376e-06, + "loss": 0.9848, + "step": 1866 + }, + { + "epoch": 1.7383612662942272, + "grad_norm": 1.6212162971496582, + "learning_rate": 4.102433399603183e-06, + "loss": 0.9908, + "step": 1867 + }, + { + "epoch": 1.739292364990689, + "grad_norm": 1.5215398073196411, + "learning_rate": 4.101482946745438e-06, + "loss": 0.9839, + "step": 1868 + }, + { + "epoch": 1.7402234636871508, + "grad_norm": 1.6186209917068481, + "learning_rate": 4.10053210115622e-06, + "loss": 1.0727, + "step": 1869 + }, + { + "epoch": 1.7411545623836127, + "grad_norm": 1.563692569732666, + "learning_rate": 4.099580863068706e-06, + "loss": 0.9986, + "step": 1870 + }, + { + "epoch": 1.7420856610800746, + "grad_norm": 1.573208212852478, + "learning_rate": 4.098629232716166e-06, + "loss": 1.0244, + "step": 1871 + }, + { + "epoch": 1.7430167597765363, + "grad_norm": 1.5983200073242188, + "learning_rate": 4.097677210331968e-06, + "loss": 1.0464, + "step": 1872 + }, + { + "epoch": 1.7439478584729982, + "grad_norm": 1.5691585540771484, + "learning_rate": 4.096724796149578e-06, + "loss": 0.9984, + "step": 1873 + }, + { + "epoch": 1.74487895716946, + "grad_norm": 1.5949965715408325, + "learning_rate": 4.095771990402556e-06, + "loss": 1.0507, + "step": 1874 + }, + { + "epoch": 1.7458100558659218, + "grad_norm": 1.5889729261398315, + "learning_rate": 4.0948187933245585e-06, + "loss": 1.0005, + "step": 1875 + }, + { + "epoch": 1.7467411545623837, + "grad_norm": 1.5813437700271606, + "learning_rate": 4.093865205149338e-06, + "loss": 1.0553, + "step": 1876 + }, + { + "epoch": 1.7476722532588456, + "grad_norm": 1.5170701742172241, + "learning_rate": 4.092911226110742e-06, + "loss": 1.0448, + "step": 1877 + }, + { + "epoch": 1.7486033519553073, + "grad_norm": 1.5029665231704712, + "learning_rate": 4.091956856442715e-06, + "loss": 0.9802, + "step": 1878 + }, + { + "epoch": 1.749534450651769, + "grad_norm": 1.572916030883789, + "learning_rate": 4.0910020963792966e-06, + "loss": 1.0368, + "step": 1879 + }, + { + "epoch": 1.750465549348231, + "grad_norm": 1.5514036417007446, + "learning_rate": 4.090046946154624e-06, + "loss": 1.037, + "step": 1880 + }, + { + "epoch": 1.7513966480446927, + "grad_norm": 1.5397133827209473, + "learning_rate": 4.089091406002926e-06, + "loss": 1.0147, + "step": 1881 + }, + { + "epoch": 1.7523277467411544, + "grad_norm": 1.5901501178741455, + "learning_rate": 4.088135476158533e-06, + "loss": 1.015, + "step": 1882 + }, + { + "epoch": 1.7532588454376163, + "grad_norm": 1.468786597251892, + "learning_rate": 4.087179156855865e-06, + "loss": 0.9723, + "step": 1883 + }, + { + "epoch": 1.7541899441340782, + "grad_norm": 1.7198137044906616, + "learning_rate": 4.086222448329441e-06, + "loss": 0.9827, + "step": 1884 + }, + { + "epoch": 1.75512104283054, + "grad_norm": 1.5916560888290405, + "learning_rate": 4.085265350813873e-06, + "loss": 1.0184, + "step": 1885 + }, + { + "epoch": 1.7560521415270018, + "grad_norm": 1.5931979417800903, + "learning_rate": 4.084307864543873e-06, + "loss": 0.9839, + "step": 1886 + }, + { + "epoch": 1.7569832402234637, + "grad_norm": 1.5414892435073853, + "learning_rate": 4.0833499897542425e-06, + "loss": 1.002, + "step": 1887 + }, + { + "epoch": 1.7579143389199254, + "grad_norm": 1.5468543767929077, + "learning_rate": 4.082391726679882e-06, + "loss": 0.9616, + "step": 1888 + }, + { + "epoch": 1.7588454376163873, + "grad_norm": 1.537461757659912, + "learning_rate": 4.081433075555786e-06, + "loss": 1.0143, + "step": 1889 + }, + { + "epoch": 1.7597765363128492, + "grad_norm": 1.552990436553955, + "learning_rate": 4.080474036617045e-06, + "loss": 1.0131, + "step": 1890 + }, + { + "epoch": 1.760707635009311, + "grad_norm": 1.620948314666748, + "learning_rate": 4.079514610098844e-06, + "loss": 1.013, + "step": 1891 + }, + { + "epoch": 1.7616387337057728, + "grad_norm": 1.562349557876587, + "learning_rate": 4.078554796236462e-06, + "loss": 0.9969, + "step": 1892 + }, + { + "epoch": 1.7625698324022347, + "grad_norm": 1.5185693502426147, + "learning_rate": 4.077594595265275e-06, + "loss": 0.9843, + "step": 1893 + }, + { + "epoch": 1.7635009310986964, + "grad_norm": 1.6121083498001099, + "learning_rate": 4.076634007420754e-06, + "loss": 1.0312, + "step": 1894 + }, + { + "epoch": 1.7644320297951583, + "grad_norm": 1.5583982467651367, + "learning_rate": 4.07567303293846e-06, + "loss": 1.0156, + "step": 1895 + }, + { + "epoch": 1.7653631284916202, + "grad_norm": 1.58591890335083, + "learning_rate": 4.074711672054057e-06, + "loss": 0.9978, + "step": 1896 + }, + { + "epoch": 1.766294227188082, + "grad_norm": 1.5953869819641113, + "learning_rate": 4.073749925003297e-06, + "loss": 1.0297, + "step": 1897 + }, + { + "epoch": 1.7672253258845436, + "grad_norm": 1.6581158638000488, + "learning_rate": 4.07278779202203e-06, + "loss": 0.9896, + "step": 1898 + }, + { + "epoch": 1.7681564245810057, + "grad_norm": 1.5670716762542725, + "learning_rate": 4.0718252733461995e-06, + "loss": 1.0352, + "step": 1899 + }, + { + "epoch": 1.7690875232774674, + "grad_norm": 1.5343213081359863, + "learning_rate": 4.070862369211843e-06, + "loss": 0.9853, + "step": 1900 + }, + { + "epoch": 1.7700186219739291, + "grad_norm": 1.519212007522583, + "learning_rate": 4.069899079855095e-06, + "loss": 1.012, + "step": 1901 + }, + { + "epoch": 1.770949720670391, + "grad_norm": 1.54884934425354, + "learning_rate": 4.068935405512182e-06, + "loss": 1.006, + "step": 1902 + }, + { + "epoch": 1.771880819366853, + "grad_norm": 1.5804592370986938, + "learning_rate": 4.067971346419425e-06, + "loss": 1.0343, + "step": 1903 + }, + { + "epoch": 1.7728119180633146, + "grad_norm": 1.6775153875350952, + "learning_rate": 4.0670069028132414e-06, + "loss": 1.0048, + "step": 1904 + }, + { + "epoch": 1.7737430167597765, + "grad_norm": 1.5643740892410278, + "learning_rate": 4.066042074930141e-06, + "loss": 0.9855, + "step": 1905 + }, + { + "epoch": 1.7746741154562384, + "grad_norm": 1.5684539079666138, + "learning_rate": 4.065076863006729e-06, + "loss": 1.0003, + "step": 1906 + }, + { + "epoch": 1.7756052141527001, + "grad_norm": 1.6218197345733643, + "learning_rate": 4.064111267279703e-06, + "loss": 1.0386, + "step": 1907 + }, + { + "epoch": 1.776536312849162, + "grad_norm": 1.6494494676589966, + "learning_rate": 4.063145287985857e-06, + "loss": 1.0209, + "step": 1908 + }, + { + "epoch": 1.777467411545624, + "grad_norm": 1.5487157106399536, + "learning_rate": 4.062178925362077e-06, + "loss": 1.011, + "step": 1909 + }, + { + "epoch": 1.7783985102420856, + "grad_norm": 1.53665030002594, + "learning_rate": 4.0612121796453455e-06, + "loss": 1.0126, + "step": 1910 + }, + { + "epoch": 1.7793296089385475, + "grad_norm": 1.5955960750579834, + "learning_rate": 4.060245051072736e-06, + "loss": 1.0329, + "step": 1911 + }, + { + "epoch": 1.7802607076350094, + "grad_norm": 1.5115478038787842, + "learning_rate": 4.059277539881418e-06, + "loss": 0.9835, + "step": 1912 + }, + { + "epoch": 1.7811918063314711, + "grad_norm": 1.5240764617919922, + "learning_rate": 4.058309646308654e-06, + "loss": 1.0095, + "step": 1913 + }, + { + "epoch": 1.7821229050279328, + "grad_norm": 1.6130045652389526, + "learning_rate": 4.0573413705918e-06, + "loss": 0.9779, + "step": 1914 + }, + { + "epoch": 1.783054003724395, + "grad_norm": 1.540324330329895, + "learning_rate": 4.056372712968308e-06, + "loss": 1.0292, + "step": 1915 + }, + { + "epoch": 1.7839851024208566, + "grad_norm": 1.581092119216919, + "learning_rate": 4.055403673675718e-06, + "loss": 0.998, + "step": 1916 + }, + { + "epoch": 1.7849162011173183, + "grad_norm": 1.6607609987258911, + "learning_rate": 4.054434252951671e-06, + "loss": 1.0253, + "step": 1917 + }, + { + "epoch": 1.7858472998137802, + "grad_norm": 1.5878102779388428, + "learning_rate": 4.0534644510338976e-06, + "loss": 1.0132, + "step": 1918 + }, + { + "epoch": 1.7867783985102421, + "grad_norm": 1.5497157573699951, + "learning_rate": 4.052494268160219e-06, + "loss": 0.9781, + "step": 1919 + }, + { + "epoch": 1.7877094972067038, + "grad_norm": 1.6174079179763794, + "learning_rate": 4.051523704568557e-06, + "loss": 0.9921, + "step": 1920 + }, + { + "epoch": 1.7886405959031657, + "grad_norm": 1.569042682647705, + "learning_rate": 4.050552760496921e-06, + "loss": 1.0481, + "step": 1921 + }, + { + "epoch": 1.7895716945996276, + "grad_norm": 1.5762373208999634, + "learning_rate": 4.049581436183416e-06, + "loss": 0.9592, + "step": 1922 + }, + { + "epoch": 1.7905027932960893, + "grad_norm": 1.5199081897735596, + "learning_rate": 4.048609731866239e-06, + "loss": 1.0148, + "step": 1923 + }, + { + "epoch": 1.7914338919925512, + "grad_norm": 1.5941567420959473, + "learning_rate": 4.047637647783681e-06, + "loss": 1.0775, + "step": 1924 + }, + { + "epoch": 1.7923649906890131, + "grad_norm": 1.5598386526107788, + "learning_rate": 4.046665184174126e-06, + "loss": 1.0571, + "step": 1925 + }, + { + "epoch": 1.7932960893854748, + "grad_norm": 1.5563305616378784, + "learning_rate": 4.0456923412760516e-06, + "loss": 1.0311, + "step": 1926 + }, + { + "epoch": 1.7942271880819367, + "grad_norm": 1.5213959217071533, + "learning_rate": 4.044719119328029e-06, + "loss": 0.9834, + "step": 1927 + }, + { + "epoch": 1.7951582867783986, + "grad_norm": 1.5801384449005127, + "learning_rate": 4.043745518568719e-06, + "loss": 1.0473, + "step": 1928 + }, + { + "epoch": 1.7960893854748603, + "grad_norm": 1.5230423212051392, + "learning_rate": 4.042771539236879e-06, + "loss": 1.0417, + "step": 1929 + }, + { + "epoch": 1.7970204841713222, + "grad_norm": 1.491419792175293, + "learning_rate": 4.041797181571358e-06, + "loss": 1.0325, + "step": 1930 + }, + { + "epoch": 1.7979515828677841, + "grad_norm": 1.5568187236785889, + "learning_rate": 4.040822445811097e-06, + "loss": 1.016, + "step": 1931 + }, + { + "epoch": 1.7988826815642458, + "grad_norm": 1.5121091604232788, + "learning_rate": 4.03984733219513e-06, + "loss": 0.9922, + "step": 1932 + }, + { + "epoch": 1.7998137802607075, + "grad_norm": 1.5466973781585693, + "learning_rate": 4.038871840962585e-06, + "loss": 1.0187, + "step": 1933 + }, + { + "epoch": 1.8007448789571696, + "grad_norm": 1.5360347032546997, + "learning_rate": 4.037895972352681e-06, + "loss": 1.0495, + "step": 1934 + }, + { + "epoch": 1.8016759776536313, + "grad_norm": 1.5188478231430054, + "learning_rate": 4.036919726604731e-06, + "loss": 0.9754, + "step": 1935 + }, + { + "epoch": 1.802607076350093, + "grad_norm": 1.5918091535568237, + "learning_rate": 4.035943103958138e-06, + "loss": 1.0008, + "step": 1936 + }, + { + "epoch": 1.803538175046555, + "grad_norm": 1.551121473312378, + "learning_rate": 4.0349661046524e-06, + "loss": 1.0466, + "step": 1937 + }, + { + "epoch": 1.8044692737430168, + "grad_norm": 1.613853096961975, + "learning_rate": 4.033988728927108e-06, + "loss": 1.0346, + "step": 1938 + }, + { + "epoch": 1.8054003724394785, + "grad_norm": 1.580567479133606, + "learning_rate": 4.03301097702194e-06, + "loss": 1.0191, + "step": 1939 + }, + { + "epoch": 1.8063314711359404, + "grad_norm": 1.4997655153274536, + "learning_rate": 4.032032849176672e-06, + "loss": 1.013, + "step": 1940 + }, + { + "epoch": 1.8072625698324023, + "grad_norm": 1.550525426864624, + "learning_rate": 4.031054345631172e-06, + "loss": 1.0317, + "step": 1941 + }, + { + "epoch": 1.808193668528864, + "grad_norm": 1.586842656135559, + "learning_rate": 4.030075466625395e-06, + "loss": 1.0114, + "step": 1942 + }, + { + "epoch": 1.809124767225326, + "grad_norm": 1.5969867706298828, + "learning_rate": 4.029096212399394e-06, + "loss": 1.0094, + "step": 1943 + }, + { + "epoch": 1.8100558659217878, + "grad_norm": 1.5217729806900024, + "learning_rate": 4.02811658319331e-06, + "loss": 0.9947, + "step": 1944 + }, + { + "epoch": 1.8109869646182495, + "grad_norm": 1.5831209421157837, + "learning_rate": 4.0271365792473774e-06, + "loss": 0.9888, + "step": 1945 + }, + { + "epoch": 1.8119180633147114, + "grad_norm": 1.5278444290161133, + "learning_rate": 4.026156200801924e-06, + "loss": 1.0594, + "step": 1946 + }, + { + "epoch": 1.8128491620111733, + "grad_norm": 1.5892688035964966, + "learning_rate": 4.025175448097365e-06, + "loss": 1.0214, + "step": 1947 + }, + { + "epoch": 1.813780260707635, + "grad_norm": 1.658461332321167, + "learning_rate": 4.024194321374213e-06, + "loss": 1.0635, + "step": 1948 + }, + { + "epoch": 1.8147113594040967, + "grad_norm": 1.5656095743179321, + "learning_rate": 4.023212820873068e-06, + "loss": 1.0001, + "step": 1949 + }, + { + "epoch": 1.8156424581005588, + "grad_norm": 1.538415789604187, + "learning_rate": 4.022230946834624e-06, + "loss": 0.973, + "step": 1950 + }, + { + "epoch": 1.8165735567970205, + "grad_norm": 1.6055189371109009, + "learning_rate": 4.021248699499666e-06, + "loss": 0.9945, + "step": 1951 + }, + { + "epoch": 1.8175046554934822, + "grad_norm": 1.5532666444778442, + "learning_rate": 4.02026607910907e-06, + "loss": 1.0098, + "step": 1952 + }, + { + "epoch": 1.8184357541899443, + "grad_norm": 1.6120880842208862, + "learning_rate": 4.019283085903803e-06, + "loss": 1.0449, + "step": 1953 + }, + { + "epoch": 1.819366852886406, + "grad_norm": 1.5207961797714233, + "learning_rate": 4.0182997201249255e-06, + "loss": 0.9536, + "step": 1954 + }, + { + "epoch": 1.8202979515828677, + "grad_norm": 1.5699912309646606, + "learning_rate": 4.017315982013588e-06, + "loss": 1.0186, + "step": 1955 + }, + { + "epoch": 1.8212290502793296, + "grad_norm": 1.5570015907287598, + "learning_rate": 4.0163318718110324e-06, + "loss": 1.0087, + "step": 1956 + }, + { + "epoch": 1.8221601489757915, + "grad_norm": 1.597419261932373, + "learning_rate": 4.015347389758592e-06, + "loss": 0.9972, + "step": 1957 + }, + { + "epoch": 1.8230912476722532, + "grad_norm": 1.5418192148208618, + "learning_rate": 4.014362536097691e-06, + "loss": 1.0293, + "step": 1958 + }, + { + "epoch": 1.824022346368715, + "grad_norm": 1.5408064126968384, + "learning_rate": 4.0133773110698454e-06, + "loss": 0.9764, + "step": 1959 + }, + { + "epoch": 1.824953445065177, + "grad_norm": 1.5946096181869507, + "learning_rate": 4.012391714916662e-06, + "loss": 1.009, + "step": 1960 + }, + { + "epoch": 1.8258845437616387, + "grad_norm": 1.6126819849014282, + "learning_rate": 4.011405747879836e-06, + "loss": 1.0125, + "step": 1961 + }, + { + "epoch": 1.8268156424581006, + "grad_norm": 1.5833483934402466, + "learning_rate": 4.010419410201159e-06, + "loss": 1.0153, + "step": 1962 + }, + { + "epoch": 1.8277467411545625, + "grad_norm": 1.5596377849578857, + "learning_rate": 4.00943270212251e-06, + "loss": 1.0023, + "step": 1963 + }, + { + "epoch": 1.8286778398510242, + "grad_norm": 1.5329121351242065, + "learning_rate": 4.008445623885857e-06, + "loss": 0.9717, + "step": 1964 + }, + { + "epoch": 1.829608938547486, + "grad_norm": 1.5359712839126587, + "learning_rate": 4.007458175733264e-06, + "loss": 1.0082, + "step": 1965 + }, + { + "epoch": 1.830540037243948, + "grad_norm": 1.5251449346542358, + "learning_rate": 4.0064703579068805e-06, + "loss": 0.9666, + "step": 1966 + }, + { + "epoch": 1.8314711359404097, + "grad_norm": 1.6319305896759033, + "learning_rate": 4.005482170648951e-06, + "loss": 0.9903, + "step": 1967 + }, + { + "epoch": 1.8324022346368714, + "grad_norm": 1.5564286708831787, + "learning_rate": 4.004493614201808e-06, + "loss": 0.9939, + "step": 1968 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 1.6011050939559937, + "learning_rate": 4.003504688807873e-06, + "loss": 1.0329, + "step": 1969 + }, + { + "epoch": 1.8342644320297952, + "grad_norm": 1.5545549392700195, + "learning_rate": 4.002515394709663e-06, + "loss": 0.9753, + "step": 1970 + }, + { + "epoch": 1.8351955307262569, + "grad_norm": 1.5623029470443726, + "learning_rate": 4.00152573214978e-06, + "loss": 1.0234, + "step": 1971 + }, + { + "epoch": 1.8361266294227188, + "grad_norm": 1.4882252216339111, + "learning_rate": 4.0005357013709215e-06, + "loss": 0.948, + "step": 1972 + }, + { + "epoch": 1.8370577281191807, + "grad_norm": 1.515845775604248, + "learning_rate": 3.999545302615869e-06, + "loss": 0.9678, + "step": 1973 + }, + { + "epoch": 1.8379888268156424, + "grad_norm": 1.6062061786651611, + "learning_rate": 3.998554536127502e-06, + "loss": 1.02, + "step": 1974 + }, + { + "epoch": 1.8389199255121043, + "grad_norm": 1.6199564933776855, + "learning_rate": 3.997563402148783e-06, + "loss": 1.0337, + "step": 1975 + }, + { + "epoch": 1.8398510242085662, + "grad_norm": 1.4950660467147827, + "learning_rate": 3.996571900922769e-06, + "loss": 0.9718, + "step": 1976 + }, + { + "epoch": 1.8407821229050279, + "grad_norm": 1.546147346496582, + "learning_rate": 3.995580032692604e-06, + "loss": 1.0317, + "step": 1977 + }, + { + "epoch": 1.8417132216014898, + "grad_norm": 1.5217554569244385, + "learning_rate": 3.994587797701527e-06, + "loss": 0.9711, + "step": 1978 + }, + { + "epoch": 1.8426443202979517, + "grad_norm": 1.492216944694519, + "learning_rate": 3.993595196192861e-06, + "loss": 1.011, + "step": 1979 + }, + { + "epoch": 1.8435754189944134, + "grad_norm": 1.574309229850769, + "learning_rate": 3.992602228410023e-06, + "loss": 0.9773, + "step": 1980 + }, + { + "epoch": 1.8445065176908753, + "grad_norm": 1.548874020576477, + "learning_rate": 3.9916088945965165e-06, + "loss": 1.0043, + "step": 1981 + }, + { + "epoch": 1.8454376163873372, + "grad_norm": 1.4939056634902954, + "learning_rate": 3.990615194995939e-06, + "loss": 0.978, + "step": 1982 + }, + { + "epoch": 1.8463687150837989, + "grad_norm": 1.5849956274032593, + "learning_rate": 3.9896211298519735e-06, + "loss": 1.0117, + "step": 1983 + }, + { + "epoch": 1.8472998137802608, + "grad_norm": 1.5433441400527954, + "learning_rate": 3.988626699408396e-06, + "loss": 1.0079, + "step": 1984 + }, + { + "epoch": 1.8482309124767227, + "grad_norm": 1.5646699666976929, + "learning_rate": 3.987631903909068e-06, + "loss": 1.0014, + "step": 1985 + }, + { + "epoch": 1.8491620111731844, + "grad_norm": 1.5790705680847168, + "learning_rate": 3.986636743597946e-06, + "loss": 1.0123, + "step": 1986 + }, + { + "epoch": 1.850093109869646, + "grad_norm": 1.536658525466919, + "learning_rate": 3.9856412187190715e-06, + "loss": 0.9933, + "step": 1987 + }, + { + "epoch": 1.8510242085661082, + "grad_norm": 1.5895529985427856, + "learning_rate": 3.984645329516578e-06, + "loss": 0.9964, + "step": 1988 + }, + { + "epoch": 1.8519553072625698, + "grad_norm": 1.5518970489501953, + "learning_rate": 3.9836490762346866e-06, + "loss": 1.0266, + "step": 1989 + }, + { + "epoch": 1.8528864059590315, + "grad_norm": 1.576671838760376, + "learning_rate": 3.9826524591177075e-06, + "loss": 1.0368, + "step": 1990 + }, + { + "epoch": 1.8538175046554934, + "grad_norm": 1.5843610763549805, + "learning_rate": 3.981655478410043e-06, + "loss": 1.0336, + "step": 1991 + }, + { + "epoch": 1.8547486033519553, + "grad_norm": 1.5642727613449097, + "learning_rate": 3.98065813435618e-06, + "loss": 0.9767, + "step": 1992 + }, + { + "epoch": 1.855679702048417, + "grad_norm": 1.5783385038375854, + "learning_rate": 3.979660427200699e-06, + "loss": 1.0132, + "step": 1993 + }, + { + "epoch": 1.856610800744879, + "grad_norm": 1.6256749629974365, + "learning_rate": 3.978662357188268e-06, + "loss": 1.0206, + "step": 1994 + }, + { + "epoch": 1.8575418994413408, + "grad_norm": 1.5093574523925781, + "learning_rate": 3.977663924563642e-06, + "loss": 0.9927, + "step": 1995 + }, + { + "epoch": 1.8584729981378025, + "grad_norm": 1.561970591545105, + "learning_rate": 3.976665129571667e-06, + "loss": 0.9956, + "step": 1996 + }, + { + "epoch": 1.8594040968342644, + "grad_norm": 1.5527724027633667, + "learning_rate": 3.975665972457278e-06, + "loss": 1.0261, + "step": 1997 + }, + { + "epoch": 1.8603351955307263, + "grad_norm": 1.6018401384353638, + "learning_rate": 3.9746664534654975e-06, + "loss": 1.0463, + "step": 1998 + }, + { + "epoch": 1.861266294227188, + "grad_norm": 1.5350617170333862, + "learning_rate": 3.973666572841438e-06, + "loss": 1.0295, + "step": 1999 + }, + { + "epoch": 1.86219739292365, + "grad_norm": 1.4940292835235596, + "learning_rate": 3.9726663308302995e-06, + "loss": 0.9825, + "step": 2000 + }, + { + "epoch": 1.8631284916201118, + "grad_norm": 1.593605875968933, + "learning_rate": 3.971665727677371e-06, + "loss": 1.0417, + "step": 2001 + }, + { + "epoch": 1.8640595903165735, + "grad_norm": 1.5552031993865967, + "learning_rate": 3.970664763628032e-06, + "loss": 1.0131, + "step": 2002 + }, + { + "epoch": 1.8649906890130352, + "grad_norm": 1.5304882526397705, + "learning_rate": 3.969663438927747e-06, + "loss": 1.0129, + "step": 2003 + }, + { + "epoch": 1.8659217877094973, + "grad_norm": 1.5545238256454468, + "learning_rate": 3.96866175382207e-06, + "loss": 1.0015, + "step": 2004 + }, + { + "epoch": 1.866852886405959, + "grad_norm": 1.5410882234573364, + "learning_rate": 3.967659708556647e-06, + "loss": 1.0546, + "step": 2005 + }, + { + "epoch": 1.8677839851024207, + "grad_norm": 1.518667221069336, + "learning_rate": 3.966657303377209e-06, + "loss": 0.9997, + "step": 2006 + }, + { + "epoch": 1.8687150837988828, + "grad_norm": 1.5257500410079956, + "learning_rate": 3.965654538529572e-06, + "loss": 0.9914, + "step": 2007 + }, + { + "epoch": 1.8696461824953445, + "grad_norm": 1.586334228515625, + "learning_rate": 3.964651414259648e-06, + "loss": 0.9874, + "step": 2008 + }, + { + "epoch": 1.8705772811918062, + "grad_norm": 1.5263134241104126, + "learning_rate": 3.963647930813432e-06, + "loss": 1.0115, + "step": 2009 + }, + { + "epoch": 1.8715083798882681, + "grad_norm": 1.5681990385055542, + "learning_rate": 3.962644088437006e-06, + "loss": 1.0251, + "step": 2010 + }, + { + "epoch": 1.87243947858473, + "grad_norm": 1.5999913215637207, + "learning_rate": 3.961639887376546e-06, + "loss": 1.0047, + "step": 2011 + }, + { + "epoch": 1.8733705772811917, + "grad_norm": 1.5397801399230957, + "learning_rate": 3.9606353278783085e-06, + "loss": 0.9562, + "step": 2012 + }, + { + "epoch": 1.8743016759776536, + "grad_norm": 1.6281425952911377, + "learning_rate": 3.959630410188643e-06, + "loss": 0.9745, + "step": 2013 + }, + { + "epoch": 1.8752327746741155, + "grad_norm": 1.5694729089736938, + "learning_rate": 3.958625134553985e-06, + "loss": 0.9789, + "step": 2014 + }, + { + "epoch": 1.8761638733705772, + "grad_norm": 1.5906692743301392, + "learning_rate": 3.95761950122086e-06, + "loss": 1.0406, + "step": 2015 + }, + { + "epoch": 1.8770949720670391, + "grad_norm": 1.592438817024231, + "learning_rate": 3.956613510435876e-06, + "loss": 0.986, + "step": 2016 + }, + { + "epoch": 1.878026070763501, + "grad_norm": 1.57843816280365, + "learning_rate": 3.955607162445735e-06, + "loss": 1.0199, + "step": 2017 + }, + { + "epoch": 1.8789571694599627, + "grad_norm": 1.5538474321365356, + "learning_rate": 3.9546004574972215e-06, + "loss": 1.0128, + "step": 2018 + }, + { + "epoch": 1.8798882681564246, + "grad_norm": 1.5547465085983276, + "learning_rate": 3.953593395837211e-06, + "loss": 1.0451, + "step": 2019 + }, + { + "epoch": 1.8808193668528865, + "grad_norm": 1.537627935409546, + "learning_rate": 3.952585977712664e-06, + "loss": 1.0052, + "step": 2020 + }, + { + "epoch": 1.8817504655493482, + "grad_norm": 1.548630714416504, + "learning_rate": 3.9515782033706305e-06, + "loss": 1.0159, + "step": 2021 + }, + { + "epoch": 1.88268156424581, + "grad_norm": 1.5562455654144287, + "learning_rate": 3.950570073058247e-06, + "loss": 0.9971, + "step": 2022 + }, + { + "epoch": 1.883612662942272, + "grad_norm": 1.559014081954956, + "learning_rate": 3.949561587022736e-06, + "loss": 1.0016, + "step": 2023 + }, + { + "epoch": 1.8845437616387337, + "grad_norm": 1.589950680732727, + "learning_rate": 3.9485527455114095e-06, + "loss": 1.0237, + "step": 2024 + }, + { + "epoch": 1.8854748603351954, + "grad_norm": 1.5429989099502563, + "learning_rate": 3.9475435487716655e-06, + "loss": 1.01, + "step": 2025 + }, + { + "epoch": 1.8864059590316573, + "grad_norm": 1.6332148313522339, + "learning_rate": 3.946533997050988e-06, + "loss": 1.018, + "step": 2026 + }, + { + "epoch": 1.8873370577281192, + "grad_norm": 1.5471529960632324, + "learning_rate": 3.94552409059695e-06, + "loss": 1.0015, + "step": 2027 + }, + { + "epoch": 1.888268156424581, + "grad_norm": 1.5839293003082275, + "learning_rate": 3.944513829657211e-06, + "loss": 1.0256, + "step": 2028 + }, + { + "epoch": 1.8891992551210428, + "grad_norm": 1.5847171545028687, + "learning_rate": 3.9435032144795185e-06, + "loss": 1.0063, + "step": 2029 + }, + { + "epoch": 1.8901303538175047, + "grad_norm": 1.5821939706802368, + "learning_rate": 3.9424922453117036e-06, + "loss": 1.0064, + "step": 2030 + }, + { + "epoch": 1.8910614525139664, + "grad_norm": 1.626847267150879, + "learning_rate": 3.941480922401685e-06, + "loss": 0.9998, + "step": 2031 + }, + { + "epoch": 1.8919925512104283, + "grad_norm": 1.6688166856765747, + "learning_rate": 3.940469245997473e-06, + "loss": 1.0122, + "step": 2032 + }, + { + "epoch": 1.8929236499068902, + "grad_norm": 1.5906034708023071, + "learning_rate": 3.939457216347157e-06, + "loss": 1.0132, + "step": 2033 + }, + { + "epoch": 1.893854748603352, + "grad_norm": 1.5552074909210205, + "learning_rate": 3.93844483369892e-06, + "loss": 0.9796, + "step": 2034 + }, + { + "epoch": 1.8947858472998138, + "grad_norm": 1.6331313848495483, + "learning_rate": 3.937432098301026e-06, + "loss": 1.0066, + "step": 2035 + }, + { + "epoch": 1.8957169459962757, + "grad_norm": 1.6071209907531738, + "learning_rate": 3.936419010401831e-06, + "loss": 1.0172, + "step": 2036 + }, + { + "epoch": 1.8966480446927374, + "grad_norm": 1.590070366859436, + "learning_rate": 3.9354055702497715e-06, + "loss": 0.9958, + "step": 2037 + }, + { + "epoch": 1.8975791433891993, + "grad_norm": 1.5456080436706543, + "learning_rate": 3.934391778093374e-06, + "loss": 0.9578, + "step": 2038 + }, + { + "epoch": 1.8985102420856612, + "grad_norm": 1.5467373132705688, + "learning_rate": 3.933377634181251e-06, + "loss": 1.0239, + "step": 2039 + }, + { + "epoch": 1.899441340782123, + "grad_norm": 1.611258625984192, + "learning_rate": 3.932363138762102e-06, + "loss": 1.0315, + "step": 2040 + }, + { + "epoch": 1.9003724394785846, + "grad_norm": 1.6488022804260254, + "learning_rate": 3.93134829208471e-06, + "loss": 1.0337, + "step": 2041 + }, + { + "epoch": 1.9013035381750467, + "grad_norm": 1.5456761121749878, + "learning_rate": 3.9303330943979465e-06, + "loss": 1.0238, + "step": 2042 + }, + { + "epoch": 1.9022346368715084, + "grad_norm": 1.5027612447738647, + "learning_rate": 3.929317545950767e-06, + "loss": 1.0046, + "step": 2043 + }, + { + "epoch": 1.90316573556797, + "grad_norm": 1.5116413831710815, + "learning_rate": 3.9283016469922165e-06, + "loss": 0.9841, + "step": 2044 + }, + { + "epoch": 1.904096834264432, + "grad_norm": 1.6401547193527222, + "learning_rate": 3.927285397771422e-06, + "loss": 1.0078, + "step": 2045 + }, + { + "epoch": 1.905027932960894, + "grad_norm": 1.5742777585983276, + "learning_rate": 3.9262687985376e-06, + "loss": 0.9828, + "step": 2046 + }, + { + "epoch": 1.9059590316573556, + "grad_norm": 1.5467455387115479, + "learning_rate": 3.925251849540048e-06, + "loss": 0.9998, + "step": 2047 + }, + { + "epoch": 1.9068901303538175, + "grad_norm": 1.534225344657898, + "learning_rate": 3.9242345510281555e-06, + "loss": 0.994, + "step": 2048 + }, + { + "epoch": 1.9078212290502794, + "grad_norm": 1.4439020156860352, + "learning_rate": 3.9232169032513934e-06, + "loss": 0.9708, + "step": 2049 + }, + { + "epoch": 1.908752327746741, + "grad_norm": 1.616662621498108, + "learning_rate": 3.922198906459318e-06, + "loss": 0.9909, + "step": 2050 + }, + { + "epoch": 1.909683426443203, + "grad_norm": 1.6388851404190063, + "learning_rate": 3.921180560901574e-06, + "loss": 1.0037, + "step": 2051 + }, + { + "epoch": 1.910614525139665, + "grad_norm": 1.625038504600525, + "learning_rate": 3.92016186682789e-06, + "loss": 1.028, + "step": 2052 + }, + { + "epoch": 1.9115456238361266, + "grad_norm": 1.6417691707611084, + "learning_rate": 3.91914282448808e-06, + "loss": 1.0026, + "step": 2053 + }, + { + "epoch": 1.9124767225325885, + "grad_norm": 1.5444130897521973, + "learning_rate": 3.918123434132043e-06, + "loss": 0.9872, + "step": 2054 + }, + { + "epoch": 1.9134078212290504, + "grad_norm": 1.6016485691070557, + "learning_rate": 3.9171036960097655e-06, + "loss": 1.0215, + "step": 2055 + }, + { + "epoch": 1.914338919925512, + "grad_norm": 1.5767979621887207, + "learning_rate": 3.9160836103713165e-06, + "loss": 1.025, + "step": 2056 + }, + { + "epoch": 1.9152700186219738, + "grad_norm": 1.6227903366088867, + "learning_rate": 3.915063177466851e-06, + "loss": 0.9902, + "step": 2057 + }, + { + "epoch": 1.916201117318436, + "grad_norm": 1.5350115299224854, + "learning_rate": 3.914042397546611e-06, + "loss": 0.9852, + "step": 2058 + }, + { + "epoch": 1.9171322160148976, + "grad_norm": 1.5428268909454346, + "learning_rate": 3.9130212708609225e-06, + "loss": 0.9821, + "step": 2059 + }, + { + "epoch": 1.9180633147113593, + "grad_norm": 1.5585871934890747, + "learning_rate": 3.911999797660195e-06, + "loss": 1.0135, + "step": 2060 + }, + { + "epoch": 1.9189944134078212, + "grad_norm": 1.589049220085144, + "learning_rate": 3.910977978194925e-06, + "loss": 0.9644, + "step": 2061 + }, + { + "epoch": 1.919925512104283, + "grad_norm": 1.6130081415176392, + "learning_rate": 3.909955812715692e-06, + "loss": 1.0156, + "step": 2062 + }, + { + "epoch": 1.9208566108007448, + "grad_norm": 1.604012131690979, + "learning_rate": 3.908933301473163e-06, + "loss": 1.0444, + "step": 2063 + }, + { + "epoch": 1.9217877094972067, + "grad_norm": 1.5679888725280762, + "learning_rate": 3.907910444718088e-06, + "loss": 1.0237, + "step": 2064 + }, + { + "epoch": 1.9227188081936686, + "grad_norm": 1.6054996252059937, + "learning_rate": 3.906887242701302e-06, + "loss": 1.0414, + "step": 2065 + }, + { + "epoch": 1.9236499068901303, + "grad_norm": 1.5020970106124878, + "learning_rate": 3.9058636956737235e-06, + "loss": 0.9605, + "step": 2066 + }, + { + "epoch": 1.9245810055865922, + "grad_norm": 1.5365707874298096, + "learning_rate": 3.904839803886359e-06, + "loss": 1.0201, + "step": 2067 + }, + { + "epoch": 1.925512104283054, + "grad_norm": 1.5040481090545654, + "learning_rate": 3.903815567590296e-06, + "loss": 1.0102, + "step": 2068 + }, + { + "epoch": 1.9264432029795158, + "grad_norm": 1.502992868423462, + "learning_rate": 3.902790987036707e-06, + "loss": 1.0013, + "step": 2069 + }, + { + "epoch": 1.9273743016759777, + "grad_norm": 1.5323235988616943, + "learning_rate": 3.901766062476852e-06, + "loss": 0.9882, + "step": 2070 + }, + { + "epoch": 1.9283054003724396, + "grad_norm": 1.5687792301177979, + "learning_rate": 3.90074079416207e-06, + "loss": 0.9641, + "step": 2071 + }, + { + "epoch": 1.9292364990689013, + "grad_norm": 1.5298455953598022, + "learning_rate": 3.8997151823437915e-06, + "loss": 0.9996, + "step": 2072 + }, + { + "epoch": 1.9301675977653632, + "grad_norm": 1.5928900241851807, + "learning_rate": 3.8986892272735235e-06, + "loss": 0.9994, + "step": 2073 + }, + { + "epoch": 1.931098696461825, + "grad_norm": 1.631481647491455, + "learning_rate": 3.897662929202863e-06, + "loss": 1.0374, + "step": 2074 + }, + { + "epoch": 1.9320297951582868, + "grad_norm": 1.5729954242706299, + "learning_rate": 3.896636288383489e-06, + "loss": 0.942, + "step": 2075 + }, + { + "epoch": 1.9329608938547485, + "grad_norm": 1.5741610527038574, + "learning_rate": 3.895609305067162e-06, + "loss": 1.0351, + "step": 2076 + }, + { + "epoch": 1.9338919925512106, + "grad_norm": 1.606162667274475, + "learning_rate": 3.894581979505732e-06, + "loss": 1.0079, + "step": 2077 + }, + { + "epoch": 1.9348230912476723, + "grad_norm": 1.5142172574996948, + "learning_rate": 3.8935543119511285e-06, + "loss": 0.9669, + "step": 2078 + }, + { + "epoch": 1.935754189944134, + "grad_norm": 1.596543550491333, + "learning_rate": 3.892526302655367e-06, + "loss": 1.0376, + "step": 2079 + }, + { + "epoch": 1.9366852886405959, + "grad_norm": 1.5575776100158691, + "learning_rate": 3.8914979518705455e-06, + "loss": 1.0272, + "step": 2080 + }, + { + "epoch": 1.9376163873370578, + "grad_norm": 1.5682884454727173, + "learning_rate": 3.8904692598488454e-06, + "loss": 0.9919, + "step": 2081 + }, + { + "epoch": 1.9385474860335195, + "grad_norm": 1.6136903762817383, + "learning_rate": 3.889440226842535e-06, + "loss": 1.0278, + "step": 2082 + }, + { + "epoch": 1.9394785847299814, + "grad_norm": 1.556594729423523, + "learning_rate": 3.8884108531039625e-06, + "loss": 1.037, + "step": 2083 + }, + { + "epoch": 1.9404096834264433, + "grad_norm": 1.5085079669952393, + "learning_rate": 3.887381138885561e-06, + "loss": 1.0262, + "step": 2084 + }, + { + "epoch": 1.941340782122905, + "grad_norm": 1.5283361673355103, + "learning_rate": 3.886351084439847e-06, + "loss": 0.9872, + "step": 2085 + }, + { + "epoch": 1.9422718808193669, + "grad_norm": 1.53825044631958, + "learning_rate": 3.885320690019422e-06, + "loss": 0.9761, + "step": 2086 + }, + { + "epoch": 1.9432029795158288, + "grad_norm": 1.5831376314163208, + "learning_rate": 3.884289955876968e-06, + "loss": 1.0072, + "step": 2087 + }, + { + "epoch": 1.9441340782122905, + "grad_norm": 1.556359052658081, + "learning_rate": 3.883258882265253e-06, + "loss": 1.0276, + "step": 2088 + }, + { + "epoch": 1.9450651769087524, + "grad_norm": 1.6506770849227905, + "learning_rate": 3.882227469437126e-06, + "loss": 1.0758, + "step": 2089 + }, + { + "epoch": 1.9459962756052143, + "grad_norm": 1.5666788816452026, + "learning_rate": 3.881195717645522e-06, + "loss": 1.0241, + "step": 2090 + }, + { + "epoch": 1.946927374301676, + "grad_norm": 1.5779006481170654, + "learning_rate": 3.880163627143454e-06, + "loss": 1.008, + "step": 2091 + }, + { + "epoch": 1.9478584729981379, + "grad_norm": 1.5439555644989014, + "learning_rate": 3.879131198184026e-06, + "loss": 0.9954, + "step": 2092 + }, + { + "epoch": 1.9487895716945998, + "grad_norm": 1.50613534450531, + "learning_rate": 3.878098431020416e-06, + "loss": 1.0075, + "step": 2093 + }, + { + "epoch": 1.9497206703910615, + "grad_norm": 1.5468829870224, + "learning_rate": 3.8770653259058924e-06, + "loss": 1.0132, + "step": 2094 + }, + { + "epoch": 1.9506517690875231, + "grad_norm": 1.4966695308685303, + "learning_rate": 3.876031883093802e-06, + "loss": 0.98, + "step": 2095 + }, + { + "epoch": 1.9515828677839853, + "grad_norm": 1.5314258337020874, + "learning_rate": 3.874998102837577e-06, + "loss": 0.9633, + "step": 2096 + }, + { + "epoch": 1.952513966480447, + "grad_norm": 1.6018353700637817, + "learning_rate": 3.873963985390729e-06, + "loss": 1.0623, + "step": 2097 + }, + { + "epoch": 1.9534450651769086, + "grad_norm": 1.5945348739624023, + "learning_rate": 3.872929531006858e-06, + "loss": 1.0109, + "step": 2098 + }, + { + "epoch": 1.9543761638733705, + "grad_norm": 1.5638896226882935, + "learning_rate": 3.87189473993964e-06, + "loss": 0.9776, + "step": 2099 + }, + { + "epoch": 1.9553072625698324, + "grad_norm": 1.555023193359375, + "learning_rate": 3.870859612442837e-06, + "loss": 1.0104, + "step": 2100 + }, + { + "epoch": 1.9562383612662941, + "grad_norm": 1.6065038442611694, + "learning_rate": 3.869824148770295e-06, + "loss": 1.0362, + "step": 2101 + }, + { + "epoch": 1.957169459962756, + "grad_norm": 1.5318093299865723, + "learning_rate": 3.868788349175939e-06, + "loss": 0.9929, + "step": 2102 + }, + { + "epoch": 1.958100558659218, + "grad_norm": 1.661205768585205, + "learning_rate": 3.867752213913779e-06, + "loss": 1.0048, + "step": 2103 + }, + { + "epoch": 1.9590316573556796, + "grad_norm": 1.540018916130066, + "learning_rate": 3.866715743237906e-06, + "loss": 1.016, + "step": 2104 + }, + { + "epoch": 1.9599627560521415, + "grad_norm": 1.7148969173431396, + "learning_rate": 3.865678937402494e-06, + "loss": 0.988, + "step": 2105 + }, + { + "epoch": 1.9608938547486034, + "grad_norm": 1.6059976816177368, + "learning_rate": 3.864641796661798e-06, + "loss": 1.0014, + "step": 2106 + }, + { + "epoch": 1.9618249534450651, + "grad_norm": 1.5444613695144653, + "learning_rate": 3.863604321270156e-06, + "loss": 0.9694, + "step": 2107 + }, + { + "epoch": 1.962756052141527, + "grad_norm": 1.566719889640808, + "learning_rate": 3.862566511481987e-06, + "loss": 1.009, + "step": 2108 + }, + { + "epoch": 1.963687150837989, + "grad_norm": 1.6250853538513184, + "learning_rate": 3.8615283675517965e-06, + "loss": 1.0017, + "step": 2109 + }, + { + "epoch": 1.9646182495344506, + "grad_norm": 1.5748748779296875, + "learning_rate": 3.860489889734165e-06, + "loss": 0.9933, + "step": 2110 + }, + { + "epoch": 1.9655493482309123, + "grad_norm": 1.5659135580062866, + "learning_rate": 3.859451078283759e-06, + "loss": 0.9979, + "step": 2111 + }, + { + "epoch": 1.9664804469273744, + "grad_norm": 1.5945208072662354, + "learning_rate": 3.858411933455326e-06, + "loss": 1.0218, + "step": 2112 + }, + { + "epoch": 1.9674115456238361, + "grad_norm": 1.6475328207015991, + "learning_rate": 3.857372455503698e-06, + "loss": 1.0261, + "step": 2113 + }, + { + "epoch": 1.9683426443202978, + "grad_norm": 1.5161283016204834, + "learning_rate": 3.856332644683781e-06, + "loss": 0.9859, + "step": 2114 + }, + { + "epoch": 1.9692737430167597, + "grad_norm": 1.5574990510940552, + "learning_rate": 3.855292501250573e-06, + "loss": 1.0285, + "step": 2115 + }, + { + "epoch": 1.9702048417132216, + "grad_norm": 1.5281257629394531, + "learning_rate": 3.854252025459144e-06, + "loss": 1.0304, + "step": 2116 + }, + { + "epoch": 1.9711359404096833, + "grad_norm": 1.5524795055389404, + "learning_rate": 3.853211217564653e-06, + "loss": 0.9691, + "step": 2117 + }, + { + "epoch": 1.9720670391061452, + "grad_norm": 1.481471300125122, + "learning_rate": 3.852170077822335e-06, + "loss": 0.9963, + "step": 2118 + }, + { + "epoch": 1.9729981378026071, + "grad_norm": 1.5913469791412354, + "learning_rate": 3.851128606487509e-06, + "loss": 0.9942, + "step": 2119 + }, + { + "epoch": 1.9739292364990688, + "grad_norm": 1.5506529808044434, + "learning_rate": 3.850086803815576e-06, + "loss": 1.0041, + "step": 2120 + }, + { + "epoch": 1.9748603351955307, + "grad_norm": 1.5746383666992188, + "learning_rate": 3.849044670062016e-06, + "loss": 1.0189, + "step": 2121 + }, + { + "epoch": 1.9757914338919926, + "grad_norm": 1.5480257272720337, + "learning_rate": 3.848002205482392e-06, + "loss": 1.0129, + "step": 2122 + }, + { + "epoch": 1.9767225325884543, + "grad_norm": 1.5308213233947754, + "learning_rate": 3.8469594103323475e-06, + "loss": 1.0026, + "step": 2123 + }, + { + "epoch": 1.9776536312849162, + "grad_norm": 1.5448893308639526, + "learning_rate": 3.845916284867606e-06, + "loss": 0.9704, + "step": 2124 + }, + { + "epoch": 1.9785847299813781, + "grad_norm": 1.517349362373352, + "learning_rate": 3.844872829343973e-06, + "loss": 1.0055, + "step": 2125 + }, + { + "epoch": 1.9795158286778398, + "grad_norm": 1.5437157154083252, + "learning_rate": 3.843829044017337e-06, + "loss": 1.0325, + "step": 2126 + }, + { + "epoch": 1.9804469273743017, + "grad_norm": 1.5579521656036377, + "learning_rate": 3.842784929143663e-06, + "loss": 1.0458, + "step": 2127 + }, + { + "epoch": 1.9813780260707636, + "grad_norm": 1.5024526119232178, + "learning_rate": 3.841740484979002e-06, + "loss": 0.9796, + "step": 2128 + }, + { + "epoch": 1.9823091247672253, + "grad_norm": 1.5668773651123047, + "learning_rate": 3.840695711779479e-06, + "loss": 0.9917, + "step": 2129 + }, + { + "epoch": 1.983240223463687, + "grad_norm": 1.5908945798873901, + "learning_rate": 3.839650609801307e-06, + "loss": 0.9805, + "step": 2130 + }, + { + "epoch": 1.9841713221601491, + "grad_norm": 1.5236387252807617, + "learning_rate": 3.838605179300775e-06, + "loss": 0.9831, + "step": 2131 + }, + { + "epoch": 1.9851024208566108, + "grad_norm": 1.5576443672180176, + "learning_rate": 3.837559420534253e-06, + "loss": 1.0294, + "step": 2132 + }, + { + "epoch": 1.9860335195530725, + "grad_norm": 1.5494862794876099, + "learning_rate": 3.836513333758195e-06, + "loss": 1.0007, + "step": 2133 + }, + { + "epoch": 1.9869646182495344, + "grad_norm": 1.5600168704986572, + "learning_rate": 3.835466919229129e-06, + "loss": 0.9943, + "step": 2134 + }, + { + "epoch": 1.9878957169459963, + "grad_norm": 1.4927576780319214, + "learning_rate": 3.83442017720367e-06, + "loss": 0.9551, + "step": 2135 + }, + { + "epoch": 1.988826815642458, + "grad_norm": 1.5802292823791504, + "learning_rate": 3.833373107938509e-06, + "loss": 1.0159, + "step": 2136 + }, + { + "epoch": 1.98975791433892, + "grad_norm": 1.5049283504486084, + "learning_rate": 3.832325711690419e-06, + "loss": 1.0163, + "step": 2137 + }, + { + "epoch": 1.9906890130353818, + "grad_norm": 1.6011770963668823, + "learning_rate": 3.831277988716252e-06, + "loss": 1.0292, + "step": 2138 + }, + { + "epoch": 1.9916201117318435, + "grad_norm": 1.5500998497009277, + "learning_rate": 3.830229939272943e-06, + "loss": 1.0216, + "step": 2139 + }, + { + "epoch": 1.9925512104283054, + "grad_norm": 1.6106137037277222, + "learning_rate": 3.829181563617504e-06, + "loss": 1.0065, + "step": 2140 + }, + { + "epoch": 1.9934823091247673, + "grad_norm": 1.5394093990325928, + "learning_rate": 3.828132862007027e-06, + "loss": 0.9953, + "step": 2141 + }, + { + "epoch": 1.994413407821229, + "grad_norm": 1.4998019933700562, + "learning_rate": 3.827083834698687e-06, + "loss": 0.9858, + "step": 2142 + }, + { + "epoch": 1.995344506517691, + "grad_norm": 1.6170512437820435, + "learning_rate": 3.826034481949734e-06, + "loss": 1.0158, + "step": 2143 + }, + { + "epoch": 1.9962756052141528, + "grad_norm": 1.5653436183929443, + "learning_rate": 3.824984804017505e-06, + "loss": 0.981, + "step": 2144 + }, + { + "epoch": 1.9972067039106145, + "grad_norm": 1.4922889471054077, + "learning_rate": 3.823934801159408e-06, + "loss": 0.9946, + "step": 2145 + }, + { + "epoch": 1.9981378026070762, + "grad_norm": 1.5930675268173218, + "learning_rate": 3.822884473632937e-06, + "loss": 1.0029, + "step": 2146 + }, + { + "epoch": 1.9990689013035383, + "grad_norm": 1.6091874837875366, + "learning_rate": 3.821833821695664e-06, + "loss": 1.0429, + "step": 2147 + }, + { + "epoch": 2.0, + "grad_norm": 1.80600905418396, + "learning_rate": 3.82078284560524e-06, + "loss": 1.0268, + "step": 2148 + }, + { + "epoch": 2.0009310986964617, + "grad_norm": 1.5678883790969849, + "learning_rate": 3.819731545619395e-06, + "loss": 0.9571, + "step": 2149 + }, + { + "epoch": 2.001862197392924, + "grad_norm": 1.5916420221328735, + "learning_rate": 3.81867992199594e-06, + "loss": 0.9762, + "step": 2150 + }, + { + "epoch": 2.0027932960893855, + "grad_norm": 1.561748743057251, + "learning_rate": 3.817627974992765e-06, + "loss": 0.9838, + "step": 2151 + }, + { + "epoch": 2.003724394785847, + "grad_norm": 1.5832701921463013, + "learning_rate": 3.816575704867836e-06, + "loss": 0.9845, + "step": 2152 + }, + { + "epoch": 2.0046554934823093, + "grad_norm": 1.5434550046920776, + "learning_rate": 3.815523111879206e-06, + "loss": 0.9923, + "step": 2153 + }, + { + "epoch": 2.005586592178771, + "grad_norm": 1.609713077545166, + "learning_rate": 3.8144701962849973e-06, + "loss": 0.9661, + "step": 2154 + }, + { + "epoch": 2.0065176908752327, + "grad_norm": 1.5397262573242188, + "learning_rate": 3.81341695834342e-06, + "loss": 0.928, + "step": 2155 + }, + { + "epoch": 2.007448789571695, + "grad_norm": 1.503147840499878, + "learning_rate": 3.812363398312757e-06, + "loss": 0.9651, + "step": 2156 + }, + { + "epoch": 2.0083798882681565, + "grad_norm": 1.5558241605758667, + "learning_rate": 3.8113095164513737e-06, + "loss": 0.9624, + "step": 2157 + }, + { + "epoch": 2.009310986964618, + "grad_norm": 1.6071900129318237, + "learning_rate": 3.8102553130177133e-06, + "loss": 0.9785, + "step": 2158 + }, + { + "epoch": 2.01024208566108, + "grad_norm": 1.5680207014083862, + "learning_rate": 3.8092007882702973e-06, + "loss": 0.9524, + "step": 2159 + }, + { + "epoch": 2.011173184357542, + "grad_norm": 1.6078006029129028, + "learning_rate": 3.808145942467729e-06, + "loss": 0.9604, + "step": 2160 + }, + { + "epoch": 2.0121042830540037, + "grad_norm": 1.5493887662887573, + "learning_rate": 3.807090775868686e-06, + "loss": 0.9508, + "step": 2161 + }, + { + "epoch": 2.0130353817504654, + "grad_norm": 1.6327743530273438, + "learning_rate": 3.8060352887319264e-06, + "loss": 1.0041, + "step": 2162 + }, + { + "epoch": 2.0139664804469275, + "grad_norm": 1.5337445735931396, + "learning_rate": 3.804979481316289e-06, + "loss": 0.9469, + "step": 2163 + }, + { + "epoch": 2.014897579143389, + "grad_norm": 1.581568717956543, + "learning_rate": 3.8039233538806873e-06, + "loss": 0.9633, + "step": 2164 + }, + { + "epoch": 2.015828677839851, + "grad_norm": 1.6584010124206543, + "learning_rate": 3.8028669066841172e-06, + "loss": 1.0006, + "step": 2165 + }, + { + "epoch": 2.016759776536313, + "grad_norm": 1.5558737516403198, + "learning_rate": 3.80181013998565e-06, + "loss": 0.971, + "step": 2166 + }, + { + "epoch": 2.0176908752327747, + "grad_norm": 1.5990629196166992, + "learning_rate": 3.800753054044437e-06, + "loss": 0.9652, + "step": 2167 + }, + { + "epoch": 2.0186219739292364, + "grad_norm": 1.5948203802108765, + "learning_rate": 3.799695649119706e-06, + "loss": 0.9601, + "step": 2168 + }, + { + "epoch": 2.0195530726256985, + "grad_norm": 1.646085500717163, + "learning_rate": 3.7986379254707663e-06, + "loss": 0.9891, + "step": 2169 + }, + { + "epoch": 2.02048417132216, + "grad_norm": 1.5336343050003052, + "learning_rate": 3.797579883357002e-06, + "loss": 0.9692, + "step": 2170 + }, + { + "epoch": 2.021415270018622, + "grad_norm": 1.5603774785995483, + "learning_rate": 3.7965215230378766e-06, + "loss": 0.9762, + "step": 2171 + }, + { + "epoch": 2.022346368715084, + "grad_norm": 1.5954972505569458, + "learning_rate": 3.7954628447729326e-06, + "loss": 0.9657, + "step": 2172 + }, + { + "epoch": 2.0232774674115457, + "grad_norm": 1.5735665559768677, + "learning_rate": 3.7944038488217884e-06, + "loss": 0.9386, + "step": 2173 + }, + { + "epoch": 2.0242085661080074, + "grad_norm": 1.5985512733459473, + "learning_rate": 3.793344535444142e-06, + "loss": 0.9596, + "step": 2174 + }, + { + "epoch": 2.0251396648044695, + "grad_norm": 1.658065915107727, + "learning_rate": 3.7922849048997688e-06, + "loss": 0.9683, + "step": 2175 + }, + { + "epoch": 2.026070763500931, + "grad_norm": 1.6113154888153076, + "learning_rate": 3.7912249574485226e-06, + "loss": 0.963, + "step": 2176 + }, + { + "epoch": 2.027001862197393, + "grad_norm": 1.6236968040466309, + "learning_rate": 3.7901646933503323e-06, + "loss": 0.9318, + "step": 2177 + }, + { + "epoch": 2.0279329608938546, + "grad_norm": 1.576192021369934, + "learning_rate": 3.7891041128652085e-06, + "loss": 0.9485, + "step": 2178 + }, + { + "epoch": 2.0288640595903167, + "grad_norm": 1.5539826154708862, + "learning_rate": 3.7880432162532354e-06, + "loss": 0.9295, + "step": 2179 + }, + { + "epoch": 2.0297951582867784, + "grad_norm": 1.624599575996399, + "learning_rate": 3.7869820037745773e-06, + "loss": 0.9312, + "step": 2180 + }, + { + "epoch": 2.03072625698324, + "grad_norm": 1.5872856378555298, + "learning_rate": 3.7859204756894754e-06, + "loss": 0.9381, + "step": 2181 + }, + { + "epoch": 2.031657355679702, + "grad_norm": 1.6143959760665894, + "learning_rate": 3.7848586322582475e-06, + "loss": 0.9437, + "step": 2182 + }, + { + "epoch": 2.032588454376164, + "grad_norm": 1.548168420791626, + "learning_rate": 3.78379647374129e-06, + "loss": 0.9271, + "step": 2183 + }, + { + "epoch": 2.0335195530726256, + "grad_norm": 1.6314013004302979, + "learning_rate": 3.7827340003990752e-06, + "loss": 1.0021, + "step": 2184 + }, + { + "epoch": 2.0344506517690877, + "grad_norm": 1.6110419034957886, + "learning_rate": 3.7816712124921553e-06, + "loss": 0.9576, + "step": 2185 + }, + { + "epoch": 2.0353817504655494, + "grad_norm": 1.6009128093719482, + "learning_rate": 3.7806081102811542e-06, + "loss": 0.965, + "step": 2186 + }, + { + "epoch": 2.036312849162011, + "grad_norm": 1.5964843034744263, + "learning_rate": 3.77954469402678e-06, + "loss": 0.9919, + "step": 2187 + }, + { + "epoch": 2.037243947858473, + "grad_norm": 1.5717049837112427, + "learning_rate": 3.7784809639898114e-06, + "loss": 0.9531, + "step": 2188 + }, + { + "epoch": 2.038175046554935, + "grad_norm": 1.5899442434310913, + "learning_rate": 3.777416920431108e-06, + "loss": 0.9588, + "step": 2189 + }, + { + "epoch": 2.0391061452513966, + "grad_norm": 1.6514818668365479, + "learning_rate": 3.776352563611604e-06, + "loss": 0.9617, + "step": 2190 + }, + { + "epoch": 2.0400372439478587, + "grad_norm": 1.5895659923553467, + "learning_rate": 3.775287893792314e-06, + "loss": 0.9674, + "step": 2191 + }, + { + "epoch": 2.0409683426443204, + "grad_norm": 1.5712453126907349, + "learning_rate": 3.7742229112343245e-06, + "loss": 0.9766, + "step": 2192 + }, + { + "epoch": 2.041899441340782, + "grad_norm": 1.5232397317886353, + "learning_rate": 3.7731576161988005e-06, + "loss": 0.9601, + "step": 2193 + }, + { + "epoch": 2.0428305400372437, + "grad_norm": 1.5671443939208984, + "learning_rate": 3.772092008946987e-06, + "loss": 0.9401, + "step": 2194 + }, + { + "epoch": 2.043761638733706, + "grad_norm": 1.6170986890792847, + "learning_rate": 3.7710260897402e-06, + "loss": 0.9331, + "step": 2195 + }, + { + "epoch": 2.0446927374301676, + "grad_norm": 1.6012412309646606, + "learning_rate": 3.7699598588398367e-06, + "loss": 0.9889, + "step": 2196 + }, + { + "epoch": 2.0456238361266292, + "grad_norm": 1.5996229648590088, + "learning_rate": 3.7688933165073676e-06, + "loss": 0.9558, + "step": 2197 + }, + { + "epoch": 2.0465549348230914, + "grad_norm": 1.5653645992279053, + "learning_rate": 3.7678264630043416e-06, + "loss": 0.9898, + "step": 2198 + }, + { + "epoch": 2.047486033519553, + "grad_norm": 1.6206297874450684, + "learning_rate": 3.7667592985923827e-06, + "loss": 0.9878, + "step": 2199 + }, + { + "epoch": 2.0484171322160147, + "grad_norm": 1.6035964488983154, + "learning_rate": 3.7656918235331906e-06, + "loss": 0.9504, + "step": 2200 + }, + { + "epoch": 2.049348230912477, + "grad_norm": 1.5887678861618042, + "learning_rate": 3.7646240380885434e-06, + "loss": 0.9061, + "step": 2201 + }, + { + "epoch": 2.0502793296089385, + "grad_norm": 1.528428554534912, + "learning_rate": 3.763555942520293e-06, + "loss": 0.9024, + "step": 2202 + }, + { + "epoch": 2.0512104283054002, + "grad_norm": 1.601758599281311, + "learning_rate": 3.7624875370903695e-06, + "loss": 0.9566, + "step": 2203 + }, + { + "epoch": 2.0521415270018624, + "grad_norm": 1.5678621530532837, + "learning_rate": 3.7614188220607767e-06, + "loss": 0.9635, + "step": 2204 + }, + { + "epoch": 2.053072625698324, + "grad_norm": 1.6154072284698486, + "learning_rate": 3.7603497976935967e-06, + "loss": 0.9341, + "step": 2205 + }, + { + "epoch": 2.0540037243947857, + "grad_norm": 1.5354888439178467, + "learning_rate": 3.7592804642509844e-06, + "loss": 0.9631, + "step": 2206 + }, + { + "epoch": 2.054934823091248, + "grad_norm": 1.5997956991195679, + "learning_rate": 3.758210821995174e-06, + "loss": 0.9818, + "step": 2207 + }, + { + "epoch": 2.0558659217877095, + "grad_norm": 1.6126775741577148, + "learning_rate": 3.7571408711884726e-06, + "loss": 1.0008, + "step": 2208 + }, + { + "epoch": 2.0567970204841712, + "grad_norm": 1.6038191318511963, + "learning_rate": 3.756070612093265e-06, + "loss": 0.9348, + "step": 2209 + }, + { + "epoch": 2.0577281191806334, + "grad_norm": 1.617152452468872, + "learning_rate": 3.7550000449720103e-06, + "loss": 0.9586, + "step": 2210 + }, + { + "epoch": 2.058659217877095, + "grad_norm": 1.6337593793869019, + "learning_rate": 3.7539291700872426e-06, + "loss": 0.9399, + "step": 2211 + }, + { + "epoch": 2.0595903165735567, + "grad_norm": 1.5585325956344604, + "learning_rate": 3.752857987701575e-06, + "loss": 0.9364, + "step": 2212 + }, + { + "epoch": 2.0605214152700184, + "grad_norm": 1.645774483680725, + "learning_rate": 3.751786498077691e-06, + "loss": 0.983, + "step": 2213 + }, + { + "epoch": 2.0614525139664805, + "grad_norm": 1.5499606132507324, + "learning_rate": 3.7507147014783523e-06, + "loss": 0.9152, + "step": 2214 + }, + { + "epoch": 2.0623836126629422, + "grad_norm": 1.648878574371338, + "learning_rate": 3.7496425981663965e-06, + "loss": 0.9616, + "step": 2215 + }, + { + "epoch": 2.063314711359404, + "grad_norm": 1.6605618000030518, + "learning_rate": 3.748570188404734e-06, + "loss": 0.9531, + "step": 2216 + }, + { + "epoch": 2.064245810055866, + "grad_norm": 1.619666576385498, + "learning_rate": 3.747497472456353e-06, + "loss": 0.9419, + "step": 2217 + }, + { + "epoch": 2.0651769087523277, + "grad_norm": 1.5980136394500732, + "learning_rate": 3.746424450584315e-06, + "loss": 0.9534, + "step": 2218 + }, + { + "epoch": 2.0661080074487894, + "grad_norm": 1.578047513961792, + "learning_rate": 3.7453511230517563e-06, + "loss": 0.9449, + "step": 2219 + }, + { + "epoch": 2.0670391061452515, + "grad_norm": 1.6305173635482788, + "learning_rate": 3.7442774901218903e-06, + "loss": 0.955, + "step": 2220 + }, + { + "epoch": 2.0679702048417132, + "grad_norm": 1.612795352935791, + "learning_rate": 3.7432035520580025e-06, + "loss": 0.9186, + "step": 2221 + }, + { + "epoch": 2.068901303538175, + "grad_norm": 1.5992122888565063, + "learning_rate": 3.7421293091234555e-06, + "loss": 0.945, + "step": 2222 + }, + { + "epoch": 2.069832402234637, + "grad_norm": 1.6819311380386353, + "learning_rate": 3.741054761581686e-06, + "loss": 0.9983, + "step": 2223 + }, + { + "epoch": 2.0707635009310987, + "grad_norm": 1.6414536237716675, + "learning_rate": 3.7399799096962035e-06, + "loss": 0.953, + "step": 2224 + }, + { + "epoch": 2.0716945996275604, + "grad_norm": 1.5660078525543213, + "learning_rate": 3.738904753730596e-06, + "loss": 0.9565, + "step": 2225 + }, + { + "epoch": 2.0726256983240225, + "grad_norm": 1.6420103311538696, + "learning_rate": 3.7378292939485218e-06, + "loss": 0.9476, + "step": 2226 + }, + { + "epoch": 2.0735567970204842, + "grad_norm": 1.6457041501998901, + "learning_rate": 3.7367535306137175e-06, + "loss": 0.9656, + "step": 2227 + }, + { + "epoch": 2.074487895716946, + "grad_norm": 1.6613293886184692, + "learning_rate": 3.735677463989992e-06, + "loss": 0.9705, + "step": 2228 + }, + { + "epoch": 2.0754189944134076, + "grad_norm": 1.5280052423477173, + "learning_rate": 3.7346010943412282e-06, + "loss": 0.94, + "step": 2229 + }, + { + "epoch": 2.0763500931098697, + "grad_norm": 1.5736757516860962, + "learning_rate": 3.733524421931385e-06, + "loss": 0.9651, + "step": 2230 + }, + { + "epoch": 2.0772811918063314, + "grad_norm": 1.640153169631958, + "learning_rate": 3.732447447024493e-06, + "loss": 0.986, + "step": 2231 + }, + { + "epoch": 2.078212290502793, + "grad_norm": 1.59702467918396, + "learning_rate": 3.7313701698846616e-06, + "loss": 0.9461, + "step": 2232 + }, + { + "epoch": 2.0791433891992552, + "grad_norm": 1.6325974464416504, + "learning_rate": 3.7302925907760682e-06, + "loss": 0.9514, + "step": 2233 + }, + { + "epoch": 2.080074487895717, + "grad_norm": 1.5375553369522095, + "learning_rate": 3.7292147099629707e-06, + "loss": 0.9078, + "step": 2234 + }, + { + "epoch": 2.0810055865921786, + "grad_norm": 1.582912802696228, + "learning_rate": 3.7281365277096937e-06, + "loss": 0.9446, + "step": 2235 + }, + { + "epoch": 2.0819366852886407, + "grad_norm": 1.616733193397522, + "learning_rate": 3.7270580442806425e-06, + "loss": 0.9491, + "step": 2236 + }, + { + "epoch": 2.0828677839851024, + "grad_norm": 1.6279919147491455, + "learning_rate": 3.725979259940293e-06, + "loss": 0.9445, + "step": 2237 + }, + { + "epoch": 2.083798882681564, + "grad_norm": 1.6524853706359863, + "learning_rate": 3.7249001749531955e-06, + "loss": 0.9977, + "step": 2238 + }, + { + "epoch": 2.0847299813780262, + "grad_norm": 1.6813267469406128, + "learning_rate": 3.723820789583973e-06, + "loss": 0.9726, + "step": 2239 + }, + { + "epoch": 2.085661080074488, + "grad_norm": 1.6247060298919678, + "learning_rate": 3.7227411040973232e-06, + "loss": 0.9426, + "step": 2240 + }, + { + "epoch": 2.0865921787709496, + "grad_norm": 1.6058865785598755, + "learning_rate": 3.7216611187580188e-06, + "loss": 0.9519, + "step": 2241 + }, + { + "epoch": 2.0875232774674117, + "grad_norm": 1.6968483924865723, + "learning_rate": 3.7205808338309023e-06, + "loss": 0.987, + "step": 2242 + }, + { + "epoch": 2.0884543761638734, + "grad_norm": 1.563751220703125, + "learning_rate": 3.719500249580893e-06, + "loss": 0.915, + "step": 2243 + }, + { + "epoch": 2.089385474860335, + "grad_norm": 1.5815222263336182, + "learning_rate": 3.718419366272982e-06, + "loss": 0.9558, + "step": 2244 + }, + { + "epoch": 2.0903165735567972, + "grad_norm": 1.536697268486023, + "learning_rate": 3.7173381841722344e-06, + "loss": 0.9312, + "step": 2245 + }, + { + "epoch": 2.091247672253259, + "grad_norm": 1.6369175910949707, + "learning_rate": 3.7162567035437897e-06, + "loss": 0.9809, + "step": 2246 + }, + { + "epoch": 2.0921787709497206, + "grad_norm": 1.7439841032028198, + "learning_rate": 3.7151749246528567e-06, + "loss": 0.982, + "step": 2247 + }, + { + "epoch": 2.0931098696461823, + "grad_norm": 2.0753540992736816, + "learning_rate": 3.714092847764722e-06, + "loss": 0.9526, + "step": 2248 + }, + { + "epoch": 2.0940409683426444, + "grad_norm": 1.5672013759613037, + "learning_rate": 3.7130104731447415e-06, + "loss": 0.9388, + "step": 2249 + }, + { + "epoch": 2.094972067039106, + "grad_norm": 1.6285289525985718, + "learning_rate": 3.711927801058347e-06, + "loss": 0.9833, + "step": 2250 + }, + { + "epoch": 2.095903165735568, + "grad_norm": 1.6870088577270508, + "learning_rate": 3.710844831771042e-06, + "loss": 0.9354, + "step": 2251 + }, + { + "epoch": 2.09683426443203, + "grad_norm": 1.6003613471984863, + "learning_rate": 3.7097615655484024e-06, + "loss": 0.9592, + "step": 2252 + }, + { + "epoch": 2.0977653631284916, + "grad_norm": 1.6126164197921753, + "learning_rate": 3.708678002656078e-06, + "loss": 0.924, + "step": 2253 + }, + { + "epoch": 2.0986964618249533, + "grad_norm": 1.697689414024353, + "learning_rate": 3.70759414335979e-06, + "loss": 0.9904, + "step": 2254 + }, + { + "epoch": 2.0996275605214154, + "grad_norm": 1.6612224578857422, + "learning_rate": 3.7065099879253343e-06, + "loss": 0.9832, + "step": 2255 + }, + { + "epoch": 2.100558659217877, + "grad_norm": 1.532278060913086, + "learning_rate": 3.7054255366185763e-06, + "loss": 0.9333, + "step": 2256 + }, + { + "epoch": 2.101489757914339, + "grad_norm": 1.6134192943572998, + "learning_rate": 3.7043407897054585e-06, + "loss": 0.9766, + "step": 2257 + }, + { + "epoch": 2.102420856610801, + "grad_norm": 1.6757482290267944, + "learning_rate": 3.703255747451991e-06, + "loss": 0.9844, + "step": 2258 + }, + { + "epoch": 2.1033519553072626, + "grad_norm": 1.5740753412246704, + "learning_rate": 3.7021704101242596e-06, + "loss": 0.9228, + "step": 2259 + }, + { + "epoch": 2.1042830540037243, + "grad_norm": 1.6292288303375244, + "learning_rate": 3.7010847779884207e-06, + "loss": 0.9713, + "step": 2260 + }, + { + "epoch": 2.1052141527001864, + "grad_norm": 1.6327106952667236, + "learning_rate": 3.6999988513107047e-06, + "loss": 0.9618, + "step": 2261 + }, + { + "epoch": 2.106145251396648, + "grad_norm": 1.5581427812576294, + "learning_rate": 3.698912630357413e-06, + "loss": 0.9708, + "step": 2262 + }, + { + "epoch": 2.10707635009311, + "grad_norm": 1.6215169429779053, + "learning_rate": 3.6978261153949197e-06, + "loss": 0.9514, + "step": 2263 + }, + { + "epoch": 2.1080074487895715, + "grad_norm": 1.6247423887252808, + "learning_rate": 3.6967393066896697e-06, + "loss": 0.9442, + "step": 2264 + }, + { + "epoch": 2.1089385474860336, + "grad_norm": 1.639685869216919, + "learning_rate": 3.6956522045081815e-06, + "loss": 0.969, + "step": 2265 + }, + { + "epoch": 2.1098696461824953, + "grad_norm": 1.6477811336517334, + "learning_rate": 3.6945648091170454e-06, + "loss": 0.9996, + "step": 2266 + }, + { + "epoch": 2.110800744878957, + "grad_norm": 1.6385444402694702, + "learning_rate": 3.693477120782923e-06, + "loss": 0.9448, + "step": 2267 + }, + { + "epoch": 2.111731843575419, + "grad_norm": 1.6664146184921265, + "learning_rate": 3.692389139772548e-06, + "loss": 0.9739, + "step": 2268 + }, + { + "epoch": 2.112662942271881, + "grad_norm": 1.606868028640747, + "learning_rate": 3.6913008663527254e-06, + "loss": 0.9649, + "step": 2269 + }, + { + "epoch": 2.1135940409683425, + "grad_norm": 1.6053881645202637, + "learning_rate": 3.690212300790333e-06, + "loss": 0.9266, + "step": 2270 + }, + { + "epoch": 2.1145251396648046, + "grad_norm": 1.6166749000549316, + "learning_rate": 3.689123443352319e-06, + "loss": 0.9652, + "step": 2271 + }, + { + "epoch": 2.1154562383612663, + "grad_norm": 1.7247328758239746, + "learning_rate": 3.688034294305705e-06, + "loss": 1.0175, + "step": 2272 + }, + { + "epoch": 2.116387337057728, + "grad_norm": 1.593933343887329, + "learning_rate": 3.686944853917582e-06, + "loss": 0.9756, + "step": 2273 + }, + { + "epoch": 2.11731843575419, + "grad_norm": 1.5824462175369263, + "learning_rate": 3.6858551224551127e-06, + "loss": 0.9258, + "step": 2274 + }, + { + "epoch": 2.118249534450652, + "grad_norm": 1.7062172889709473, + "learning_rate": 3.6847651001855336e-06, + "loss": 0.9449, + "step": 2275 + }, + { + "epoch": 2.1191806331471135, + "grad_norm": 1.5969610214233398, + "learning_rate": 3.683674787376148e-06, + "loss": 0.9855, + "step": 2276 + }, + { + "epoch": 2.1201117318435756, + "grad_norm": 1.61667799949646, + "learning_rate": 3.6825841842943362e-06, + "loss": 0.9656, + "step": 2277 + }, + { + "epoch": 2.1210428305400373, + "grad_norm": 1.6104581356048584, + "learning_rate": 3.681493291207544e-06, + "loss": 0.9553, + "step": 2278 + }, + { + "epoch": 2.121973929236499, + "grad_norm": 1.6627345085144043, + "learning_rate": 3.680402108383293e-06, + "loss": 0.9833, + "step": 2279 + }, + { + "epoch": 2.122905027932961, + "grad_norm": 1.6288490295410156, + "learning_rate": 3.679310636089174e-06, + "loss": 0.9731, + "step": 2280 + }, + { + "epoch": 2.123836126629423, + "grad_norm": 1.6077247858047485, + "learning_rate": 3.678218874592846e-06, + "loss": 0.9668, + "step": 2281 + }, + { + "epoch": 2.1247672253258845, + "grad_norm": 1.5970665216445923, + "learning_rate": 3.6771268241620444e-06, + "loss": 0.9408, + "step": 2282 + }, + { + "epoch": 2.1256983240223466, + "grad_norm": 1.5959385633468628, + "learning_rate": 3.6760344850645707e-06, + "loss": 0.9184, + "step": 2283 + }, + { + "epoch": 2.1266294227188083, + "grad_norm": 1.637071132659912, + "learning_rate": 3.6749418575683005e-06, + "loss": 0.9414, + "step": 2284 + }, + { + "epoch": 2.12756052141527, + "grad_norm": 1.5946859121322632, + "learning_rate": 3.6738489419411775e-06, + "loss": 0.9392, + "step": 2285 + }, + { + "epoch": 2.1284916201117317, + "grad_norm": 1.6978176832199097, + "learning_rate": 3.6727557384512187e-06, + "loss": 0.9659, + "step": 2286 + }, + { + "epoch": 2.129422718808194, + "grad_norm": 1.620904803276062, + "learning_rate": 3.6716622473665085e-06, + "loss": 0.954, + "step": 2287 + }, + { + "epoch": 2.1303538175046555, + "grad_norm": 1.6063650846481323, + "learning_rate": 3.670568468955205e-06, + "loss": 0.9505, + "step": 2288 + }, + { + "epoch": 2.131284916201117, + "grad_norm": 1.5732965469360352, + "learning_rate": 3.6694744034855347e-06, + "loss": 0.9194, + "step": 2289 + }, + { + "epoch": 2.1322160148975793, + "grad_norm": 1.5468127727508545, + "learning_rate": 3.6683800512257945e-06, + "loss": 0.9728, + "step": 2290 + }, + { + "epoch": 2.133147113594041, + "grad_norm": 1.596658706665039, + "learning_rate": 3.667285412444354e-06, + "loss": 0.9436, + "step": 2291 + }, + { + "epoch": 2.1340782122905027, + "grad_norm": 1.5953705310821533, + "learning_rate": 3.6661904874096506e-06, + "loss": 0.9453, + "step": 2292 + }, + { + "epoch": 2.135009310986965, + "grad_norm": 1.610055923461914, + "learning_rate": 3.6650952763901913e-06, + "loss": 0.9292, + "step": 2293 + }, + { + "epoch": 2.1359404096834265, + "grad_norm": 1.6304115056991577, + "learning_rate": 3.6639997796545567e-06, + "loss": 0.9441, + "step": 2294 + }, + { + "epoch": 2.136871508379888, + "grad_norm": 1.6363351345062256, + "learning_rate": 3.662903997471394e-06, + "loss": 0.9426, + "step": 2295 + }, + { + "epoch": 2.1378026070763503, + "grad_norm": 1.6297560930252075, + "learning_rate": 3.661807930109422e-06, + "loss": 0.9981, + "step": 2296 + }, + { + "epoch": 2.138733705772812, + "grad_norm": 1.6084342002868652, + "learning_rate": 3.660711577837429e-06, + "loss": 0.986, + "step": 2297 + }, + { + "epoch": 2.1396648044692737, + "grad_norm": 1.5748109817504883, + "learning_rate": 3.6596149409242735e-06, + "loss": 0.9023, + "step": 2298 + }, + { + "epoch": 2.1405959031657353, + "grad_norm": 1.623326301574707, + "learning_rate": 3.6585180196388844e-06, + "loss": 0.9698, + "step": 2299 + }, + { + "epoch": 2.1415270018621975, + "grad_norm": 1.6260778903961182, + "learning_rate": 3.6574208142502582e-06, + "loss": 0.9415, + "step": 2300 + }, + { + "epoch": 2.142458100558659, + "grad_norm": 1.6899642944335938, + "learning_rate": 3.656323325027463e-06, + "loss": 0.9999, + "step": 2301 + }, + { + "epoch": 2.143389199255121, + "grad_norm": 1.654906988143921, + "learning_rate": 3.6552255522396367e-06, + "loss": 0.9616, + "step": 2302 + }, + { + "epoch": 2.144320297951583, + "grad_norm": 1.6278623342514038, + "learning_rate": 3.6541274961559854e-06, + "loss": 0.9397, + "step": 2303 + }, + { + "epoch": 2.1452513966480447, + "grad_norm": 1.6147063970565796, + "learning_rate": 3.653029157045785e-06, + "loss": 0.9506, + "step": 2304 + }, + { + "epoch": 2.1461824953445063, + "grad_norm": 1.640665054321289, + "learning_rate": 3.6519305351783814e-06, + "loss": 0.954, + "step": 2305 + }, + { + "epoch": 2.1471135940409685, + "grad_norm": 1.5741463899612427, + "learning_rate": 3.650831630823189e-06, + "loss": 0.9804, + "step": 2306 + }, + { + "epoch": 2.14804469273743, + "grad_norm": 1.5746980905532837, + "learning_rate": 3.649732444249693e-06, + "loss": 0.9251, + "step": 2307 + }, + { + "epoch": 2.148975791433892, + "grad_norm": 1.572954773902893, + "learning_rate": 3.6486329757274454e-06, + "loss": 0.9174, + "step": 2308 + }, + { + "epoch": 2.149906890130354, + "grad_norm": 1.587693214416504, + "learning_rate": 3.6475332255260697e-06, + "loss": 0.9928, + "step": 2309 + }, + { + "epoch": 2.1508379888268156, + "grad_norm": 1.6290266513824463, + "learning_rate": 3.6464331939152576e-06, + "loss": 0.9989, + "step": 2310 + }, + { + "epoch": 2.1517690875232773, + "grad_norm": 1.6513193845748901, + "learning_rate": 3.64533288116477e-06, + "loss": 0.9571, + "step": 2311 + }, + { + "epoch": 2.1527001862197395, + "grad_norm": 1.6338671445846558, + "learning_rate": 3.644232287544435e-06, + "loss": 0.9754, + "step": 2312 + }, + { + "epoch": 2.153631284916201, + "grad_norm": 1.615729808807373, + "learning_rate": 3.6431314133241526e-06, + "loss": 0.957, + "step": 2313 + }, + { + "epoch": 2.154562383612663, + "grad_norm": 1.576277256011963, + "learning_rate": 3.6420302587738886e-06, + "loss": 0.9765, + "step": 2314 + }, + { + "epoch": 2.155493482309125, + "grad_norm": 1.6133235692977905, + "learning_rate": 3.6409288241636808e-06, + "loss": 0.9698, + "step": 2315 + }, + { + "epoch": 2.1564245810055866, + "grad_norm": 1.6441055536270142, + "learning_rate": 3.6398271097636322e-06, + "loss": 0.9425, + "step": 2316 + }, + { + "epoch": 2.1573556797020483, + "grad_norm": 1.6123785972595215, + "learning_rate": 3.6387251158439173e-06, + "loss": 0.9623, + "step": 2317 + }, + { + "epoch": 2.1582867783985105, + "grad_norm": 1.6008034944534302, + "learning_rate": 3.637622842674777e-06, + "loss": 0.9389, + "step": 2318 + }, + { + "epoch": 2.159217877094972, + "grad_norm": 1.6267389059066772, + "learning_rate": 3.6365202905265224e-06, + "loss": 0.9516, + "step": 2319 + }, + { + "epoch": 2.160148975791434, + "grad_norm": 1.635219693183899, + "learning_rate": 3.6354174596695324e-06, + "loss": 0.9528, + "step": 2320 + }, + { + "epoch": 2.1610800744878955, + "grad_norm": 1.6462581157684326, + "learning_rate": 3.6343143503742524e-06, + "loss": 0.9308, + "step": 2321 + }, + { + "epoch": 2.1620111731843576, + "grad_norm": 1.5747215747833252, + "learning_rate": 3.633210962911199e-06, + "loss": 0.9676, + "step": 2322 + }, + { + "epoch": 2.1629422718808193, + "grad_norm": 1.6596400737762451, + "learning_rate": 3.6321072975509564e-06, + "loss": 0.9738, + "step": 2323 + }, + { + "epoch": 2.163873370577281, + "grad_norm": 1.5887181758880615, + "learning_rate": 3.6310033545641753e-06, + "loss": 0.9101, + "step": 2324 + }, + { + "epoch": 2.164804469273743, + "grad_norm": 1.6391619443893433, + "learning_rate": 3.629899134221576e-06, + "loss": 0.9517, + "step": 2325 + }, + { + "epoch": 2.165735567970205, + "grad_norm": 1.5836042165756226, + "learning_rate": 3.6287946367939455e-06, + "loss": 0.9443, + "step": 2326 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 1.7963414192199707, + "learning_rate": 3.6276898625521413e-06, + "loss": 0.9787, + "step": 2327 + }, + { + "epoch": 2.1675977653631286, + "grad_norm": 1.5905530452728271, + "learning_rate": 3.6265848117670847e-06, + "loss": 0.928, + "step": 2328 + }, + { + "epoch": 2.1685288640595903, + "grad_norm": 1.6398658752441406, + "learning_rate": 3.6254794847097695e-06, + "loss": 0.9697, + "step": 2329 + }, + { + "epoch": 2.169459962756052, + "grad_norm": 1.6280919313430786, + "learning_rate": 3.624373881651254e-06, + "loss": 0.9456, + "step": 2330 + }, + { + "epoch": 2.170391061452514, + "grad_norm": 1.691656231880188, + "learning_rate": 3.6232680028626644e-06, + "loss": 0.9969, + "step": 2331 + }, + { + "epoch": 2.171322160148976, + "grad_norm": 1.6128681898117065, + "learning_rate": 3.6221618486151953e-06, + "loss": 0.9372, + "step": 2332 + }, + { + "epoch": 2.1722532588454375, + "grad_norm": 1.6305981874465942, + "learning_rate": 3.6210554191801102e-06, + "loss": 0.9642, + "step": 2333 + }, + { + "epoch": 2.1731843575418996, + "grad_norm": 1.6264302730560303, + "learning_rate": 3.6199487148287376e-06, + "loss": 0.9315, + "step": 2334 + }, + { + "epoch": 2.1741154562383613, + "grad_norm": 1.5924581289291382, + "learning_rate": 3.618841735832474e-06, + "loss": 0.962, + "step": 2335 + }, + { + "epoch": 2.175046554934823, + "grad_norm": 1.5694663524627686, + "learning_rate": 3.6177344824627854e-06, + "loss": 0.9778, + "step": 2336 + }, + { + "epoch": 2.1759776536312847, + "grad_norm": 1.6225179433822632, + "learning_rate": 3.6166269549912013e-06, + "loss": 0.9664, + "step": 2337 + }, + { + "epoch": 2.176908752327747, + "grad_norm": 1.6614285707473755, + "learning_rate": 3.6155191536893225e-06, + "loss": 0.9783, + "step": 2338 + }, + { + "epoch": 2.1778398510242085, + "grad_norm": 1.5901024341583252, + "learning_rate": 3.6144110788288135e-06, + "loss": 0.9345, + "step": 2339 + }, + { + "epoch": 2.17877094972067, + "grad_norm": 1.5973265171051025, + "learning_rate": 3.6133027306814085e-06, + "loss": 0.9681, + "step": 2340 + }, + { + "epoch": 2.1797020484171323, + "grad_norm": 1.6260349750518799, + "learning_rate": 3.612194109518906e-06, + "loss": 0.9398, + "step": 2341 + }, + { + "epoch": 2.180633147113594, + "grad_norm": 1.604882001876831, + "learning_rate": 3.6110852156131746e-06, + "loss": 0.9414, + "step": 2342 + }, + { + "epoch": 2.1815642458100557, + "grad_norm": 1.6518100500106812, + "learning_rate": 3.609976049236148e-06, + "loss": 0.968, + "step": 2343 + }, + { + "epoch": 2.182495344506518, + "grad_norm": 1.56865656375885, + "learning_rate": 3.6088666106598265e-06, + "loss": 0.9395, + "step": 2344 + }, + { + "epoch": 2.1834264432029795, + "grad_norm": 1.7989506721496582, + "learning_rate": 3.6077569001562775e-06, + "loss": 1.0015, + "step": 2345 + }, + { + "epoch": 2.184357541899441, + "grad_norm": 1.6323250532150269, + "learning_rate": 3.6066469179976347e-06, + "loss": 0.9175, + "step": 2346 + }, + { + "epoch": 2.1852886405959033, + "grad_norm": 1.6387637853622437, + "learning_rate": 3.6055366644561006e-06, + "loss": 0.9317, + "step": 2347 + }, + { + "epoch": 2.186219739292365, + "grad_norm": 1.6584813594818115, + "learning_rate": 3.6044261398039416e-06, + "loss": 0.9766, + "step": 2348 + }, + { + "epoch": 2.1871508379888267, + "grad_norm": 1.5685219764709473, + "learning_rate": 3.6033153443134903e-06, + "loss": 0.9108, + "step": 2349 + }, + { + "epoch": 2.188081936685289, + "grad_norm": 1.6261427402496338, + "learning_rate": 3.602204278257149e-06, + "loss": 0.9839, + "step": 2350 + }, + { + "epoch": 2.1890130353817505, + "grad_norm": 1.5805799961090088, + "learning_rate": 3.601092941907384e-06, + "loss": 0.9655, + "step": 2351 + }, + { + "epoch": 2.189944134078212, + "grad_norm": 1.6410917043685913, + "learning_rate": 3.5999813355367262e-06, + "loss": 0.9629, + "step": 2352 + }, + { + "epoch": 2.1908752327746743, + "grad_norm": 1.6219847202301025, + "learning_rate": 3.598869459417777e-06, + "loss": 0.9617, + "step": 2353 + }, + { + "epoch": 2.191806331471136, + "grad_norm": 1.6232783794403076, + "learning_rate": 3.5977573138232e-06, + "loss": 0.9682, + "step": 2354 + }, + { + "epoch": 2.1927374301675977, + "grad_norm": 1.5942046642303467, + "learning_rate": 3.596644899025728e-06, + "loss": 0.9381, + "step": 2355 + }, + { + "epoch": 2.1936685288640594, + "grad_norm": 1.6250545978546143, + "learning_rate": 3.5955322152981575e-06, + "loss": 0.9658, + "step": 2356 + }, + { + "epoch": 2.1945996275605215, + "grad_norm": 1.648079752922058, + "learning_rate": 3.594419262913351e-06, + "loss": 0.9646, + "step": 2357 + }, + { + "epoch": 2.195530726256983, + "grad_norm": 1.5967533588409424, + "learning_rate": 3.59330604214424e-06, + "loss": 0.9281, + "step": 2358 + }, + { + "epoch": 2.196461824953445, + "grad_norm": 1.6187447309494019, + "learning_rate": 3.592192553263817e-06, + "loss": 0.981, + "step": 2359 + }, + { + "epoch": 2.197392923649907, + "grad_norm": 1.5705369710922241, + "learning_rate": 3.5910787965451444e-06, + "loss": 0.9382, + "step": 2360 + }, + { + "epoch": 2.1983240223463687, + "grad_norm": 1.5949432849884033, + "learning_rate": 3.5899647722613482e-06, + "loss": 0.963, + "step": 2361 + }, + { + "epoch": 2.1992551210428304, + "grad_norm": 1.608249306678772, + "learning_rate": 3.5888504806856194e-06, + "loss": 0.964, + "step": 2362 + }, + { + "epoch": 2.2001862197392925, + "grad_norm": 1.6186444759368896, + "learning_rate": 3.5877359220912174e-06, + "loss": 0.9722, + "step": 2363 + }, + { + "epoch": 2.201117318435754, + "grad_norm": 1.613977074623108, + "learning_rate": 3.5866210967514635e-06, + "loss": 0.9208, + "step": 2364 + }, + { + "epoch": 2.202048417132216, + "grad_norm": 1.5724316835403442, + "learning_rate": 3.585506004939748e-06, + "loss": 0.9136, + "step": 2365 + }, + { + "epoch": 2.202979515828678, + "grad_norm": 1.63250732421875, + "learning_rate": 3.5843906469295226e-06, + "loss": 1.0011, + "step": 2366 + }, + { + "epoch": 2.2039106145251397, + "grad_norm": 1.6363906860351562, + "learning_rate": 3.583275022994308e-06, + "loss": 0.9643, + "step": 2367 + }, + { + "epoch": 2.2048417132216014, + "grad_norm": 1.6174997091293335, + "learning_rate": 3.5821591334076893e-06, + "loss": 0.9573, + "step": 2368 + }, + { + "epoch": 2.2057728119180635, + "grad_norm": 1.6489567756652832, + "learning_rate": 3.5810429784433133e-06, + "loss": 0.9676, + "step": 2369 + }, + { + "epoch": 2.206703910614525, + "grad_norm": 1.6908695697784424, + "learning_rate": 3.579926558374897e-06, + "loss": 0.9682, + "step": 2370 + }, + { + "epoch": 2.207635009310987, + "grad_norm": 1.647871494293213, + "learning_rate": 3.5788098734762177e-06, + "loss": 0.9665, + "step": 2371 + }, + { + "epoch": 2.2085661080074486, + "grad_norm": 1.6591806411743164, + "learning_rate": 3.5776929240211227e-06, + "loss": 0.9527, + "step": 2372 + }, + { + "epoch": 2.2094972067039107, + "grad_norm": 1.6059842109680176, + "learning_rate": 3.5765757102835197e-06, + "loss": 0.9841, + "step": 2373 + }, + { + "epoch": 2.2104283054003724, + "grad_norm": 1.6544197797775269, + "learning_rate": 3.5754582325373823e-06, + "loss": 0.9641, + "step": 2374 + }, + { + "epoch": 2.211359404096834, + "grad_norm": 1.6366745233535767, + "learning_rate": 3.574340491056751e-06, + "loss": 0.9386, + "step": 2375 + }, + { + "epoch": 2.212290502793296, + "grad_norm": 1.6419512033462524, + "learning_rate": 3.573222486115727e-06, + "loss": 1.0125, + "step": 2376 + }, + { + "epoch": 2.213221601489758, + "grad_norm": 1.604271650314331, + "learning_rate": 3.5721042179884824e-06, + "loss": 0.9483, + "step": 2377 + }, + { + "epoch": 2.2141527001862196, + "grad_norm": 1.6592247486114502, + "learning_rate": 3.570985686949246e-06, + "loss": 0.9794, + "step": 2378 + }, + { + "epoch": 2.2150837988826817, + "grad_norm": 1.619288682937622, + "learning_rate": 3.569866893272318e-06, + "loss": 0.9412, + "step": 2379 + }, + { + "epoch": 2.2160148975791434, + "grad_norm": 1.6512722969055176, + "learning_rate": 3.568747837232058e-06, + "loss": 0.9479, + "step": 2380 + }, + { + "epoch": 2.216945996275605, + "grad_norm": 1.602245807647705, + "learning_rate": 3.5676285191028926e-06, + "loss": 0.9529, + "step": 2381 + }, + { + "epoch": 2.217877094972067, + "grad_norm": 1.6473281383514404, + "learning_rate": 3.566508939159312e-06, + "loss": 0.961, + "step": 2382 + }, + { + "epoch": 2.218808193668529, + "grad_norm": 1.6244070529937744, + "learning_rate": 3.565389097675872e-06, + "loss": 0.9704, + "step": 2383 + }, + { + "epoch": 2.2197392923649906, + "grad_norm": 1.6561599969863892, + "learning_rate": 3.5642689949271892e-06, + "loss": 0.9874, + "step": 2384 + }, + { + "epoch": 2.2206703910614527, + "grad_norm": 1.6253948211669922, + "learning_rate": 3.563148631187947e-06, + "loss": 0.9708, + "step": 2385 + }, + { + "epoch": 2.2216014897579144, + "grad_norm": 1.6938610076904297, + "learning_rate": 3.562028006732893e-06, + "loss": 0.9425, + "step": 2386 + }, + { + "epoch": 2.222532588454376, + "grad_norm": 1.5649609565734863, + "learning_rate": 3.5609071218368363e-06, + "loss": 0.9248, + "step": 2387 + }, + { + "epoch": 2.223463687150838, + "grad_norm": 1.6441490650177002, + "learning_rate": 3.5597859767746524e-06, + "loss": 0.9543, + "step": 2388 + }, + { + "epoch": 2.2243947858473, + "grad_norm": 1.5588210821151733, + "learning_rate": 3.5586645718212787e-06, + "loss": 0.9334, + "step": 2389 + }, + { + "epoch": 2.2253258845437616, + "grad_norm": 1.6426608562469482, + "learning_rate": 3.557542907251718e-06, + "loss": 0.9572, + "step": 2390 + }, + { + "epoch": 2.2262569832402237, + "grad_norm": 1.6280072927474976, + "learning_rate": 3.5564209833410357e-06, + "loss": 0.9539, + "step": 2391 + }, + { + "epoch": 2.2271880819366854, + "grad_norm": 1.5709218978881836, + "learning_rate": 3.5552988003643613e-06, + "loss": 0.901, + "step": 2392 + }, + { + "epoch": 2.228119180633147, + "grad_norm": 1.657243251800537, + "learning_rate": 3.5541763585968874e-06, + "loss": 0.9599, + "step": 2393 + }, + { + "epoch": 2.2290502793296088, + "grad_norm": 1.68219792842865, + "learning_rate": 3.55305365831387e-06, + "loss": 0.9478, + "step": 2394 + }, + { + "epoch": 2.229981378026071, + "grad_norm": 1.6564964056015015, + "learning_rate": 3.551930699790629e-06, + "loss": 0.9289, + "step": 2395 + }, + { + "epoch": 2.2309124767225326, + "grad_norm": 1.6154779195785522, + "learning_rate": 3.5508074833025476e-06, + "loss": 0.9397, + "step": 2396 + }, + { + "epoch": 2.2318435754189943, + "grad_norm": 1.6187154054641724, + "learning_rate": 3.5496840091250716e-06, + "loss": 0.9596, + "step": 2397 + }, + { + "epoch": 2.2327746741154564, + "grad_norm": 1.6263182163238525, + "learning_rate": 3.548560277533711e-06, + "loss": 0.9812, + "step": 2398 + }, + { + "epoch": 2.233705772811918, + "grad_norm": 1.5656661987304688, + "learning_rate": 3.547436288804038e-06, + "loss": 0.9455, + "step": 2399 + }, + { + "epoch": 2.2346368715083798, + "grad_norm": 1.5954402685165405, + "learning_rate": 3.546312043211687e-06, + "loss": 0.92, + "step": 2400 + }, + { + "epoch": 2.235567970204842, + "grad_norm": 1.6701622009277344, + "learning_rate": 3.5451875410323587e-06, + "loss": 0.9477, + "step": 2401 + }, + { + "epoch": 2.2364990689013036, + "grad_norm": 1.6704421043395996, + "learning_rate": 3.5440627825418143e-06, + "loss": 0.9595, + "step": 2402 + }, + { + "epoch": 2.2374301675977653, + "grad_norm": 1.6545499563217163, + "learning_rate": 3.542937768015877e-06, + "loss": 0.9771, + "step": 2403 + }, + { + "epoch": 2.2383612662942274, + "grad_norm": 1.6518737077713013, + "learning_rate": 3.541812497730435e-06, + "loss": 0.9653, + "step": 2404 + }, + { + "epoch": 2.239292364990689, + "grad_norm": 1.6180881261825562, + "learning_rate": 3.540686971961438e-06, + "loss": 0.9728, + "step": 2405 + }, + { + "epoch": 2.2402234636871508, + "grad_norm": 1.6360397338867188, + "learning_rate": 3.5395611909848986e-06, + "loss": 0.9367, + "step": 2406 + }, + { + "epoch": 2.2411545623836124, + "grad_norm": 1.682340383529663, + "learning_rate": 3.5384351550768916e-06, + "loss": 0.9692, + "step": 2407 + }, + { + "epoch": 2.2420856610800746, + "grad_norm": 1.6571754217147827, + "learning_rate": 3.5373088645135543e-06, + "loss": 1.0257, + "step": 2408 + }, + { + "epoch": 2.2430167597765363, + "grad_norm": 1.5855857133865356, + "learning_rate": 3.5361823195710874e-06, + "loss": 0.948, + "step": 2409 + }, + { + "epoch": 2.243947858472998, + "grad_norm": 1.8361417055130005, + "learning_rate": 3.535055520525753e-06, + "loss": 0.9757, + "step": 2410 + }, + { + "epoch": 2.24487895716946, + "grad_norm": 1.6739006042480469, + "learning_rate": 3.5339284676538774e-06, + "loss": 1.0081, + "step": 2411 + }, + { + "epoch": 2.2458100558659218, + "grad_norm": 1.6460050344467163, + "learning_rate": 3.5328011612318454e-06, + "loss": 0.9522, + "step": 2412 + }, + { + "epoch": 2.2467411545623834, + "grad_norm": 1.601426124572754, + "learning_rate": 3.531673601536108e-06, + "loss": 0.9687, + "step": 2413 + }, + { + "epoch": 2.2476722532588456, + "grad_norm": 1.6364185810089111, + "learning_rate": 3.5305457888431747e-06, + "loss": 1.0097, + "step": 2414 + }, + { + "epoch": 2.2486033519553073, + "grad_norm": 1.6102561950683594, + "learning_rate": 3.529417723429621e-06, + "loss": 0.9442, + "step": 2415 + }, + { + "epoch": 2.249534450651769, + "grad_norm": 1.646165370941162, + "learning_rate": 3.5282894055720803e-06, + "loss": 0.9272, + "step": 2416 + }, + { + "epoch": 2.250465549348231, + "grad_norm": 1.6000772714614868, + "learning_rate": 3.5271608355472513e-06, + "loss": 0.941, + "step": 2417 + }, + { + "epoch": 2.2513966480446927, + "grad_norm": 1.7036372423171997, + "learning_rate": 3.5260320136318927e-06, + "loss": 0.9855, + "step": 2418 + }, + { + "epoch": 2.2523277467411544, + "grad_norm": 1.646814227104187, + "learning_rate": 3.5249029401028247e-06, + "loss": 0.9612, + "step": 2419 + }, + { + "epoch": 2.2532588454376166, + "grad_norm": 1.5991038084030151, + "learning_rate": 3.5237736152369305e-06, + "loss": 0.9251, + "step": 2420 + }, + { + "epoch": 2.2541899441340782, + "grad_norm": 1.5567059516906738, + "learning_rate": 3.522644039311154e-06, + "loss": 0.9331, + "step": 2421 + }, + { + "epoch": 2.25512104283054, + "grad_norm": 1.5913971662521362, + "learning_rate": 3.5215142126025013e-06, + "loss": 0.919, + "step": 2422 + }, + { + "epoch": 2.256052141527002, + "grad_norm": 1.616965651512146, + "learning_rate": 3.52038413538804e-06, + "loss": 0.964, + "step": 2423 + }, + { + "epoch": 2.2569832402234637, + "grad_norm": 1.612487554550171, + "learning_rate": 3.519253807944898e-06, + "loss": 0.972, + "step": 2424 + }, + { + "epoch": 2.2579143389199254, + "grad_norm": 1.5858962535858154, + "learning_rate": 3.5181232305502657e-06, + "loss": 0.9513, + "step": 2425 + }, + { + "epoch": 2.2588454376163876, + "grad_norm": 1.7430367469787598, + "learning_rate": 3.516992403481394e-06, + "loss": 0.9933, + "step": 2426 + }, + { + "epoch": 2.2597765363128492, + "grad_norm": 1.5940974950790405, + "learning_rate": 3.515861327015596e-06, + "loss": 0.964, + "step": 2427 + }, + { + "epoch": 2.260707635009311, + "grad_norm": 1.5860908031463623, + "learning_rate": 3.514730001430246e-06, + "loss": 0.9204, + "step": 2428 + }, + { + "epoch": 2.2616387337057726, + "grad_norm": 1.585479497909546, + "learning_rate": 3.513598427002778e-06, + "loss": 0.9387, + "step": 2429 + }, + { + "epoch": 2.2625698324022347, + "grad_norm": 1.726728916168213, + "learning_rate": 3.512466604010688e-06, + "loss": 0.9552, + "step": 2430 + }, + { + "epoch": 2.2635009310986964, + "grad_norm": 1.5958534479141235, + "learning_rate": 3.511334532731533e-06, + "loss": 0.9718, + "step": 2431 + }, + { + "epoch": 2.264432029795158, + "grad_norm": 1.6186741590499878, + "learning_rate": 3.51020221344293e-06, + "loss": 0.9475, + "step": 2432 + }, + { + "epoch": 2.2653631284916202, + "grad_norm": 1.6689815521240234, + "learning_rate": 3.5090696464225587e-06, + "loss": 0.9716, + "step": 2433 + }, + { + "epoch": 2.266294227188082, + "grad_norm": 1.5765066146850586, + "learning_rate": 3.507936831948158e-06, + "loss": 0.8982, + "step": 2434 + }, + { + "epoch": 2.2672253258845436, + "grad_norm": 1.6396182775497437, + "learning_rate": 3.5068037702975266e-06, + "loss": 0.9696, + "step": 2435 + }, + { + "epoch": 2.2681564245810057, + "grad_norm": 1.5978269577026367, + "learning_rate": 3.5056704617485273e-06, + "loss": 0.9596, + "step": 2436 + }, + { + "epoch": 2.2690875232774674, + "grad_norm": 1.629413366317749, + "learning_rate": 3.504536906579079e-06, + "loss": 0.968, + "step": 2437 + }, + { + "epoch": 2.270018621973929, + "grad_norm": 1.5942696332931519, + "learning_rate": 3.503403105067165e-06, + "loss": 0.9502, + "step": 2438 + }, + { + "epoch": 2.2709497206703912, + "grad_norm": 1.6827514171600342, + "learning_rate": 3.502269057490827e-06, + "loss": 0.9928, + "step": 2439 + }, + { + "epoch": 2.271880819366853, + "grad_norm": 1.6567050218582153, + "learning_rate": 3.501134764128167e-06, + "loss": 0.9822, + "step": 2440 + }, + { + "epoch": 2.2728119180633146, + "grad_norm": 1.6781362295150757, + "learning_rate": 3.5000002252573466e-06, + "loss": 0.9575, + "step": 2441 + }, + { + "epoch": 2.2737430167597763, + "grad_norm": 1.5720270872116089, + "learning_rate": 3.498865441156591e-06, + "loss": 0.9085, + "step": 2442 + }, + { + "epoch": 2.2746741154562384, + "grad_norm": 1.6472513675689697, + "learning_rate": 3.497730412104181e-06, + "loss": 0.969, + "step": 2443 + }, + { + "epoch": 2.2756052141527, + "grad_norm": 1.5873229503631592, + "learning_rate": 3.4965951383784603e-06, + "loss": 0.9627, + "step": 2444 + }, + { + "epoch": 2.276536312849162, + "grad_norm": 1.5857559442520142, + "learning_rate": 3.495459620257833e-06, + "loss": 0.9274, + "step": 2445 + }, + { + "epoch": 2.277467411545624, + "grad_norm": 1.5823582410812378, + "learning_rate": 3.4943238580207604e-06, + "loss": 0.9593, + "step": 2446 + }, + { + "epoch": 2.2783985102420856, + "grad_norm": 1.5935879945755005, + "learning_rate": 3.4931878519457664e-06, + "loss": 0.99, + "step": 2447 + }, + { + "epoch": 2.2793296089385473, + "grad_norm": 1.6486268043518066, + "learning_rate": 3.4920516023114337e-06, + "loss": 0.9198, + "step": 2448 + }, + { + "epoch": 2.2802607076350094, + "grad_norm": 1.6650099754333496, + "learning_rate": 3.4909151093964046e-06, + "loss": 0.9869, + "step": 2449 + }, + { + "epoch": 2.281191806331471, + "grad_norm": 1.77230703830719, + "learning_rate": 3.4897783734793794e-06, + "loss": 1.0144, + "step": 2450 + }, + { + "epoch": 2.282122905027933, + "grad_norm": 1.5813177824020386, + "learning_rate": 3.488641394839123e-06, + "loss": 0.9611, + "step": 2451 + }, + { + "epoch": 2.283054003724395, + "grad_norm": 1.6739068031311035, + "learning_rate": 3.487504173754453e-06, + "loss": 0.9774, + "step": 2452 + }, + { + "epoch": 2.2839851024208566, + "grad_norm": 1.6267461776733398, + "learning_rate": 3.4863667105042526e-06, + "loss": 0.9745, + "step": 2453 + }, + { + "epoch": 2.2849162011173183, + "grad_norm": 1.5787975788116455, + "learning_rate": 3.485229005367461e-06, + "loss": 0.8997, + "step": 2454 + }, + { + "epoch": 2.2858472998137804, + "grad_norm": 1.6535475254058838, + "learning_rate": 3.4840910586230768e-06, + "loss": 0.9749, + "step": 2455 + }, + { + "epoch": 2.286778398510242, + "grad_norm": 1.5885213613510132, + "learning_rate": 3.4829528705501605e-06, + "loss": 0.9455, + "step": 2456 + }, + { + "epoch": 2.287709497206704, + "grad_norm": 1.5992566347122192, + "learning_rate": 3.4818144414278266e-06, + "loss": 0.9605, + "step": 2457 + }, + { + "epoch": 2.288640595903166, + "grad_norm": 1.6213765144348145, + "learning_rate": 3.4806757715352552e-06, + "loss": 0.9389, + "step": 2458 + }, + { + "epoch": 2.2895716945996276, + "grad_norm": 1.6163054704666138, + "learning_rate": 3.4795368611516795e-06, + "loss": 0.9622, + "step": 2459 + }, + { + "epoch": 2.2905027932960893, + "grad_norm": 1.6298826932907104, + "learning_rate": 3.4783977105563973e-06, + "loss": 1.002, + "step": 2460 + }, + { + "epoch": 2.2914338919925514, + "grad_norm": 1.593178629875183, + "learning_rate": 3.477258320028759e-06, + "loss": 0.9383, + "step": 2461 + }, + { + "epoch": 2.292364990689013, + "grad_norm": 1.6418095827102661, + "learning_rate": 3.47611868984818e-06, + "loss": 0.9743, + "step": 2462 + }, + { + "epoch": 2.293296089385475, + "grad_norm": 1.6952311992645264, + "learning_rate": 3.4749788202941297e-06, + "loss": 0.9733, + "step": 2463 + }, + { + "epoch": 2.294227188081937, + "grad_norm": 1.6906559467315674, + "learning_rate": 3.473838711646139e-06, + "loss": 0.956, + "step": 2464 + }, + { + "epoch": 2.2951582867783986, + "grad_norm": 1.6767266988754272, + "learning_rate": 3.472698364183798e-06, + "loss": 0.9933, + "step": 2465 + }, + { + "epoch": 2.2960893854748603, + "grad_norm": 1.6053396463394165, + "learning_rate": 3.4715577781867516e-06, + "loss": 0.926, + "step": 2466 + }, + { + "epoch": 2.297020484171322, + "grad_norm": 1.6009371280670166, + "learning_rate": 3.4704169539347066e-06, + "loss": 0.9401, + "step": 2467 + }, + { + "epoch": 2.297951582867784, + "grad_norm": 1.6782152652740479, + "learning_rate": 3.469275891707428e-06, + "loss": 0.9905, + "step": 2468 + }, + { + "epoch": 2.298882681564246, + "grad_norm": 1.7365431785583496, + "learning_rate": 3.4681345917847363e-06, + "loss": 0.9765, + "step": 2469 + }, + { + "epoch": 2.2998137802607075, + "grad_norm": 1.6472665071487427, + "learning_rate": 3.4669930544465147e-06, + "loss": 0.9605, + "step": 2470 + }, + { + "epoch": 2.3007448789571696, + "grad_norm": 1.6339211463928223, + "learning_rate": 3.4658512799727006e-06, + "loss": 0.9462, + "step": 2471 + }, + { + "epoch": 2.3016759776536313, + "grad_norm": 1.6361422538757324, + "learning_rate": 3.4647092686432917e-06, + "loss": 0.96, + "step": 2472 + }, + { + "epoch": 2.302607076350093, + "grad_norm": 1.7519464492797852, + "learning_rate": 3.4635670207383438e-06, + "loss": 0.9925, + "step": 2473 + }, + { + "epoch": 2.303538175046555, + "grad_norm": 1.6213538646697998, + "learning_rate": 3.4624245365379694e-06, + "loss": 0.9396, + "step": 2474 + }, + { + "epoch": 2.304469273743017, + "grad_norm": 1.6561614274978638, + "learning_rate": 3.46128181632234e-06, + "loss": 1.0284, + "step": 2475 + }, + { + "epoch": 2.3054003724394785, + "grad_norm": 1.6527241468429565, + "learning_rate": 3.4601388603716853e-06, + "loss": 0.9487, + "step": 2476 + }, + { + "epoch": 2.30633147113594, + "grad_norm": 1.5925219058990479, + "learning_rate": 3.458995668966292e-06, + "loss": 0.9012, + "step": 2477 + }, + { + "epoch": 2.3072625698324023, + "grad_norm": 1.712669849395752, + "learning_rate": 3.4578522423865042e-06, + "loss": 1.0153, + "step": 2478 + }, + { + "epoch": 2.308193668528864, + "grad_norm": 1.626772403717041, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.9233, + "step": 2479 + }, + { + "epoch": 2.3091247672253257, + "grad_norm": 1.6258918046951294, + "learning_rate": 3.455564684825414e-06, + "loss": 0.961, + "step": 2480 + }, + { + "epoch": 2.310055865921788, + "grad_norm": 1.6405956745147705, + "learning_rate": 3.4544205544050886e-06, + "loss": 0.977, + "step": 2481 + }, + { + "epoch": 2.3109869646182495, + "grad_norm": 1.6471235752105713, + "learning_rate": 3.453276189932324e-06, + "loss": 0.9703, + "step": 2482 + }, + { + "epoch": 2.311918063314711, + "grad_norm": 1.6240640878677368, + "learning_rate": 3.452131591687753e-06, + "loss": 0.9616, + "step": 2483 + }, + { + "epoch": 2.3128491620111733, + "grad_norm": 1.6593929529190063, + "learning_rate": 3.450986759952064e-06, + "loss": 1.0255, + "step": 2484 + }, + { + "epoch": 2.313780260707635, + "grad_norm": 1.6254647970199585, + "learning_rate": 3.4498416950060056e-06, + "loss": 0.9551, + "step": 2485 + }, + { + "epoch": 2.3147113594040967, + "grad_norm": 1.606094479560852, + "learning_rate": 3.4486963971303805e-06, + "loss": 0.9408, + "step": 2486 + }, + { + "epoch": 2.315642458100559, + "grad_norm": 1.569433569908142, + "learning_rate": 3.447550866606051e-06, + "loss": 0.9511, + "step": 2487 + }, + { + "epoch": 2.3165735567970205, + "grad_norm": 1.6102689504623413, + "learning_rate": 3.4464051037139346e-06, + "loss": 0.9534, + "step": 2488 + }, + { + "epoch": 2.317504655493482, + "grad_norm": 1.6579538583755493, + "learning_rate": 3.4452591087350067e-06, + "loss": 0.9577, + "step": 2489 + }, + { + "epoch": 2.3184357541899443, + "grad_norm": 1.5535004138946533, + "learning_rate": 3.4441128819503e-06, + "loss": 0.9063, + "step": 2490 + }, + { + "epoch": 2.319366852886406, + "grad_norm": 1.6399790048599243, + "learning_rate": 3.4429664236409034e-06, + "loss": 0.9651, + "step": 2491 + }, + { + "epoch": 2.3202979515828677, + "grad_norm": 1.6075857877731323, + "learning_rate": 3.441819734087963e-06, + "loss": 0.9414, + "step": 2492 + }, + { + "epoch": 2.32122905027933, + "grad_norm": 1.6000339984893799, + "learning_rate": 3.440672813572681e-06, + "loss": 0.9064, + "step": 2493 + }, + { + "epoch": 2.3221601489757915, + "grad_norm": 1.6694415807724, + "learning_rate": 3.439525662376317e-06, + "loss": 0.9531, + "step": 2494 + }, + { + "epoch": 2.323091247672253, + "grad_norm": 1.6475422382354736, + "learning_rate": 3.4383782807801846e-06, + "loss": 0.9548, + "step": 2495 + }, + { + "epoch": 2.3240223463687153, + "grad_norm": 1.7334296703338623, + "learning_rate": 3.437230669065659e-06, + "loss": 0.9952, + "step": 2496 + }, + { + "epoch": 2.324953445065177, + "grad_norm": 1.6021231412887573, + "learning_rate": 3.4360828275141677e-06, + "loss": 0.9259, + "step": 2497 + }, + { + "epoch": 2.3258845437616387, + "grad_norm": 1.5765581130981445, + "learning_rate": 3.4349347564071956e-06, + "loss": 0.9581, + "step": 2498 + }, + { + "epoch": 2.326815642458101, + "grad_norm": 1.6334996223449707, + "learning_rate": 3.433786456026285e-06, + "loss": 0.8949, + "step": 2499 + }, + { + "epoch": 2.3277467411545625, + "grad_norm": 1.5742268562316895, + "learning_rate": 3.432637926653031e-06, + "loss": 0.9344, + "step": 2500 + }, + { + "epoch": 2.328677839851024, + "grad_norm": 1.620252251625061, + "learning_rate": 3.431489168569091e-06, + "loss": 0.9185, + "step": 2501 + }, + { + "epoch": 2.329608938547486, + "grad_norm": 1.6275135278701782, + "learning_rate": 3.430340182056171e-06, + "loss": 0.9599, + "step": 2502 + }, + { + "epoch": 2.330540037243948, + "grad_norm": 1.6273826360702515, + "learning_rate": 3.4291909673960392e-06, + "loss": 0.946, + "step": 2503 + }, + { + "epoch": 2.3314711359404097, + "grad_norm": 1.6474202871322632, + "learning_rate": 3.4280415248705173e-06, + "loss": 0.9829, + "step": 2504 + }, + { + "epoch": 2.3324022346368714, + "grad_norm": 1.5774883031845093, + "learning_rate": 3.4268918547614814e-06, + "loss": 0.9115, + "step": 2505 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.6210507154464722, + "learning_rate": 3.425741957350867e-06, + "loss": 0.943, + "step": 2506 + }, + { + "epoch": 2.334264432029795, + "grad_norm": 1.6329035758972168, + "learning_rate": 3.424591832920661e-06, + "loss": 0.9272, + "step": 2507 + }, + { + "epoch": 2.335195530726257, + "grad_norm": 1.5920088291168213, + "learning_rate": 3.423441481752911e-06, + "loss": 0.9525, + "step": 2508 + }, + { + "epoch": 2.336126629422719, + "grad_norm": 1.6059393882751465, + "learning_rate": 3.422290904129715e-06, + "loss": 0.951, + "step": 2509 + }, + { + "epoch": 2.3370577281191807, + "grad_norm": 1.6441189050674438, + "learning_rate": 3.421140100333231e-06, + "loss": 0.9659, + "step": 2510 + }, + { + "epoch": 2.3379888268156424, + "grad_norm": 1.5974291563034058, + "learning_rate": 3.4199890706456697e-06, + "loss": 0.9425, + "step": 2511 + }, + { + "epoch": 2.338919925512104, + "grad_norm": 1.66739821434021, + "learning_rate": 3.4188378153492967e-06, + "loss": 1.025, + "step": 2512 + }, + { + "epoch": 2.339851024208566, + "grad_norm": 1.6239019632339478, + "learning_rate": 3.4176863347264355e-06, + "loss": 0.989, + "step": 2513 + }, + { + "epoch": 2.340782122905028, + "grad_norm": 1.6041226387023926, + "learning_rate": 3.4165346290594642e-06, + "loss": 0.9669, + "step": 2514 + }, + { + "epoch": 2.3417132216014895, + "grad_norm": 1.586902379989624, + "learning_rate": 3.4153826986308143e-06, + "loss": 0.9116, + "step": 2515 + }, + { + "epoch": 2.3426443202979517, + "grad_norm": 1.6596691608428955, + "learning_rate": 3.414230543722973e-06, + "loss": 0.9187, + "step": 2516 + }, + { + "epoch": 2.3435754189944134, + "grad_norm": 1.6154959201812744, + "learning_rate": 3.4130781646184852e-06, + "loss": 0.9626, + "step": 2517 + }, + { + "epoch": 2.344506517690875, + "grad_norm": 1.6038247346878052, + "learning_rate": 3.411925561599947e-06, + "loss": 0.9795, + "step": 2518 + }, + { + "epoch": 2.345437616387337, + "grad_norm": 1.6716678142547607, + "learning_rate": 3.4107727349500114e-06, + "loss": 1.0093, + "step": 2519 + }, + { + "epoch": 2.346368715083799, + "grad_norm": 1.6727886199951172, + "learning_rate": 3.4096196849513863e-06, + "loss": 0.9405, + "step": 2520 + }, + { + "epoch": 2.3472998137802605, + "grad_norm": 1.592605471611023, + "learning_rate": 3.4084664118868336e-06, + "loss": 0.9467, + "step": 2521 + }, + { + "epoch": 2.3482309124767227, + "grad_norm": 1.5989329814910889, + "learning_rate": 3.407312916039171e-06, + "loss": 0.9397, + "step": 2522 + }, + { + "epoch": 2.3491620111731844, + "grad_norm": 1.6715210676193237, + "learning_rate": 3.40615919769127e-06, + "loss": 0.9699, + "step": 2523 + }, + { + "epoch": 2.350093109869646, + "grad_norm": 1.577141284942627, + "learning_rate": 3.405005257126056e-06, + "loss": 0.9236, + "step": 2524 + }, + { + "epoch": 2.351024208566108, + "grad_norm": 1.5585591793060303, + "learning_rate": 3.40385109462651e-06, + "loss": 0.9187, + "step": 2525 + }, + { + "epoch": 2.35195530726257, + "grad_norm": 1.6048600673675537, + "learning_rate": 3.402696710475668e-06, + "loss": 0.9369, + "step": 2526 + }, + { + "epoch": 2.3528864059590315, + "grad_norm": 1.7114615440368652, + "learning_rate": 3.4015421049566185e-06, + "loss": 0.9871, + "step": 2527 + }, + { + "epoch": 2.3538175046554937, + "grad_norm": 1.6561464071273804, + "learning_rate": 3.4003872783525054e-06, + "loss": 0.9469, + "step": 2528 + }, + { + "epoch": 2.3547486033519553, + "grad_norm": 1.5900381803512573, + "learning_rate": 3.3992322309465274e-06, + "loss": 0.912, + "step": 2529 + }, + { + "epoch": 2.355679702048417, + "grad_norm": 1.7404963970184326, + "learning_rate": 3.3980769630219357e-06, + "loss": 0.9895, + "step": 2530 + }, + { + "epoch": 2.356610800744879, + "grad_norm": 1.5682674646377563, + "learning_rate": 3.396921474862037e-06, + "loss": 0.9672, + "step": 2531 + }, + { + "epoch": 2.357541899441341, + "grad_norm": 1.5718039274215698, + "learning_rate": 3.395765766750192e-06, + "loss": 0.9715, + "step": 2532 + }, + { + "epoch": 2.3584729981378025, + "grad_norm": 1.6884418725967407, + "learning_rate": 3.394609838969814e-06, + "loss": 0.939, + "step": 2533 + }, + { + "epoch": 2.3594040968342647, + "grad_norm": 1.6072217226028442, + "learning_rate": 3.3934536918043713e-06, + "loss": 0.963, + "step": 2534 + }, + { + "epoch": 2.3603351955307263, + "grad_norm": 1.6443567276000977, + "learning_rate": 3.3922973255373857e-06, + "loss": 1.0078, + "step": 2535 + }, + { + "epoch": 2.361266294227188, + "grad_norm": 1.6523720026016235, + "learning_rate": 3.3911407404524333e-06, + "loss": 0.9699, + "step": 2536 + }, + { + "epoch": 2.3621973929236497, + "grad_norm": 1.635683536529541, + "learning_rate": 3.389983936833143e-06, + "loss": 0.9842, + "step": 2537 + }, + { + "epoch": 2.363128491620112, + "grad_norm": 1.565341830253601, + "learning_rate": 3.3888269149631977e-06, + "loss": 0.9148, + "step": 2538 + }, + { + "epoch": 2.3640595903165735, + "grad_norm": 1.6393283605575562, + "learning_rate": 3.3876696751263333e-06, + "loss": 0.9789, + "step": 2539 + }, + { + "epoch": 2.364990689013035, + "grad_norm": 1.6058269739151, + "learning_rate": 3.386512217606339e-06, + "loss": 0.9404, + "step": 2540 + }, + { + "epoch": 2.3659217877094973, + "grad_norm": 1.5703598260879517, + "learning_rate": 3.38535454268706e-06, + "loss": 0.9168, + "step": 2541 + }, + { + "epoch": 2.366852886405959, + "grad_norm": 1.671377182006836, + "learning_rate": 3.3841966506523916e-06, + "loss": 0.961, + "step": 2542 + }, + { + "epoch": 2.3677839851024207, + "grad_norm": 1.6794096231460571, + "learning_rate": 3.383038541786282e-06, + "loss": 0.9689, + "step": 2543 + }, + { + "epoch": 2.368715083798883, + "grad_norm": 1.619209885597229, + "learning_rate": 3.3818802163727377e-06, + "loss": 0.9762, + "step": 2544 + }, + { + "epoch": 2.3696461824953445, + "grad_norm": 1.6613311767578125, + "learning_rate": 3.380721674695811e-06, + "loss": 0.9633, + "step": 2545 + }, + { + "epoch": 2.370577281191806, + "grad_norm": 1.6107838153839111, + "learning_rate": 3.379562917039614e-06, + "loss": 0.9326, + "step": 2546 + }, + { + "epoch": 2.3715083798882683, + "grad_norm": 1.6407833099365234, + "learning_rate": 3.3784039436883055e-06, + "loss": 0.9685, + "step": 2547 + }, + { + "epoch": 2.37243947858473, + "grad_norm": 1.6014596223831177, + "learning_rate": 3.377244754926104e-06, + "loss": 0.9425, + "step": 2548 + }, + { + "epoch": 2.3733705772811917, + "grad_norm": 1.6465659141540527, + "learning_rate": 3.376085351037274e-06, + "loss": 0.9691, + "step": 2549 + }, + { + "epoch": 2.3743016759776534, + "grad_norm": 1.6096807718276978, + "learning_rate": 3.374925732306138e-06, + "loss": 0.9214, + "step": 2550 + }, + { + "epoch": 2.3752327746741155, + "grad_norm": 1.6267849206924438, + "learning_rate": 3.3737658990170684e-06, + "loss": 0.9871, + "step": 2551 + }, + { + "epoch": 2.376163873370577, + "grad_norm": 1.6364647150039673, + "learning_rate": 3.3726058514544915e-06, + "loss": 0.9673, + "step": 2552 + }, + { + "epoch": 2.377094972067039, + "grad_norm": 1.588962197303772, + "learning_rate": 3.3714455899028847e-06, + "loss": 0.9555, + "step": 2553 + }, + { + "epoch": 2.378026070763501, + "grad_norm": 1.5935996770858765, + "learning_rate": 3.3702851146467797e-06, + "loss": 0.9042, + "step": 2554 + }, + { + "epoch": 2.3789571694599627, + "grad_norm": 1.5623427629470825, + "learning_rate": 3.369124425970759e-06, + "loss": 0.9053, + "step": 2555 + }, + { + "epoch": 2.3798882681564244, + "grad_norm": 1.6203159093856812, + "learning_rate": 3.3679635241594586e-06, + "loss": 0.9456, + "step": 2556 + }, + { + "epoch": 2.3808193668528865, + "grad_norm": 1.614363670349121, + "learning_rate": 3.3668024094975665e-06, + "loss": 0.948, + "step": 2557 + }, + { + "epoch": 2.381750465549348, + "grad_norm": 1.640191674232483, + "learning_rate": 3.365641082269822e-06, + "loss": 1.0115, + "step": 2558 + }, + { + "epoch": 2.38268156424581, + "grad_norm": 1.643373966217041, + "learning_rate": 3.364479542761018e-06, + "loss": 0.95, + "step": 2559 + }, + { + "epoch": 2.383612662942272, + "grad_norm": 1.6399234533309937, + "learning_rate": 3.3633177912559982e-06, + "loss": 0.9584, + "step": 2560 + }, + { + "epoch": 2.3845437616387337, + "grad_norm": 1.6302703619003296, + "learning_rate": 3.3621558280396594e-06, + "loss": 0.9707, + "step": 2561 + }, + { + "epoch": 2.3854748603351954, + "grad_norm": 1.6424598693847656, + "learning_rate": 3.360993653396949e-06, + "loss": 0.9836, + "step": 2562 + }, + { + "epoch": 2.3864059590316575, + "grad_norm": 1.6595407724380493, + "learning_rate": 3.3598312676128665e-06, + "loss": 0.958, + "step": 2563 + }, + { + "epoch": 2.387337057728119, + "grad_norm": 1.6464163064956665, + "learning_rate": 3.3586686709724647e-06, + "loss": 0.9656, + "step": 2564 + }, + { + "epoch": 2.388268156424581, + "grad_norm": 1.5871891975402832, + "learning_rate": 3.357505863760847e-06, + "loss": 0.926, + "step": 2565 + }, + { + "epoch": 2.389199255121043, + "grad_norm": 1.5294058322906494, + "learning_rate": 3.356342846263168e-06, + "loss": 0.9535, + "step": 2566 + }, + { + "epoch": 2.3901303538175047, + "grad_norm": 1.6083580255508423, + "learning_rate": 3.3551796187646345e-06, + "loss": 0.9587, + "step": 2567 + }, + { + "epoch": 2.3910614525139664, + "grad_norm": 1.5972957611083984, + "learning_rate": 3.3540161815505046e-06, + "loss": 0.9117, + "step": 2568 + }, + { + "epoch": 2.3919925512104285, + "grad_norm": 1.5899473428726196, + "learning_rate": 3.3528525349060873e-06, + "loss": 0.9214, + "step": 2569 + }, + { + "epoch": 2.39292364990689, + "grad_norm": 1.6294020414352417, + "learning_rate": 3.3516886791167446e-06, + "loss": 0.9497, + "step": 2570 + }, + { + "epoch": 2.393854748603352, + "grad_norm": 1.8026212453842163, + "learning_rate": 3.3505246144678884e-06, + "loss": 0.9189, + "step": 2571 + }, + { + "epoch": 2.394785847299814, + "grad_norm": 1.6396327018737793, + "learning_rate": 3.3493603412449815e-06, + "loss": 0.9805, + "step": 2572 + }, + { + "epoch": 2.3957169459962757, + "grad_norm": 1.7179055213928223, + "learning_rate": 3.34819585973354e-06, + "loss": 0.9952, + "step": 2573 + }, + { + "epoch": 2.3966480446927374, + "grad_norm": 1.6350146532058716, + "learning_rate": 3.347031170219127e-06, + "loss": 0.9306, + "step": 2574 + }, + { + "epoch": 2.397579143389199, + "grad_norm": 1.6674777269363403, + "learning_rate": 3.3458662729873614e-06, + "loss": 0.96, + "step": 2575 + }, + { + "epoch": 2.398510242085661, + "grad_norm": 1.5673600435256958, + "learning_rate": 3.3447011683239104e-06, + "loss": 0.9062, + "step": 2576 + }, + { + "epoch": 2.399441340782123, + "grad_norm": 1.6294329166412354, + "learning_rate": 3.343535856514492e-06, + "loss": 0.9479, + "step": 2577 + }, + { + "epoch": 2.4003724394785846, + "grad_norm": 1.7142693996429443, + "learning_rate": 3.342370337844876e-06, + "loss": 0.9603, + "step": 2578 + }, + { + "epoch": 2.4013035381750467, + "grad_norm": 1.614809513092041, + "learning_rate": 3.3412046126008814e-06, + "loss": 0.9635, + "step": 2579 + }, + { + "epoch": 2.4022346368715084, + "grad_norm": 1.6972476243972778, + "learning_rate": 3.34003868106838e-06, + "loss": 0.9607, + "step": 2580 + }, + { + "epoch": 2.40316573556797, + "grad_norm": 1.7378531694412231, + "learning_rate": 3.3388725435332915e-06, + "loss": 0.9986, + "step": 2581 + }, + { + "epoch": 2.404096834264432, + "grad_norm": 1.6341021060943604, + "learning_rate": 3.33770620028159e-06, + "loss": 0.978, + "step": 2582 + }, + { + "epoch": 2.405027932960894, + "grad_norm": 1.6068378686904907, + "learning_rate": 3.3365396515992954e-06, + "loss": 0.9471, + "step": 2583 + }, + { + "epoch": 2.4059590316573556, + "grad_norm": 1.668980360031128, + "learning_rate": 3.335372897772482e-06, + "loss": 1.0031, + "step": 2584 + }, + { + "epoch": 2.4068901303538173, + "grad_norm": 1.6691441535949707, + "learning_rate": 3.334205939087272e-06, + "loss": 0.95, + "step": 2585 + }, + { + "epoch": 2.4078212290502794, + "grad_norm": 1.6827136278152466, + "learning_rate": 3.333038775829839e-06, + "loss": 0.9729, + "step": 2586 + }, + { + "epoch": 2.408752327746741, + "grad_norm": 1.6189643144607544, + "learning_rate": 3.331871408286406e-06, + "loss": 1.009, + "step": 2587 + }, + { + "epoch": 2.4096834264432028, + "grad_norm": 1.6583327054977417, + "learning_rate": 3.330703836743245e-06, + "loss": 1.003, + "step": 2588 + }, + { + "epoch": 2.410614525139665, + "grad_norm": 1.6170976161956787, + "learning_rate": 3.329536061486682e-06, + "loss": 0.9728, + "step": 2589 + }, + { + "epoch": 2.4115456238361266, + "grad_norm": 1.6288248300552368, + "learning_rate": 3.328368082803088e-06, + "loss": 0.9571, + "step": 2590 + }, + { + "epoch": 2.4124767225325883, + "grad_norm": 1.6497873067855835, + "learning_rate": 3.3271999009788886e-06, + "loss": 0.941, + "step": 2591 + }, + { + "epoch": 2.4134078212290504, + "grad_norm": 1.7231147289276123, + "learning_rate": 3.3260315163005552e-06, + "loss": 0.9537, + "step": 2592 + }, + { + "epoch": 2.414338919925512, + "grad_norm": 1.6208795309066772, + "learning_rate": 3.32486292905461e-06, + "loss": 0.9503, + "step": 2593 + }, + { + "epoch": 2.4152700186219738, + "grad_norm": 1.665986180305481, + "learning_rate": 3.3236941395276283e-06, + "loss": 0.9516, + "step": 2594 + }, + { + "epoch": 2.416201117318436, + "grad_norm": 1.6351374387741089, + "learning_rate": 3.3225251480062296e-06, + "loss": 0.9594, + "step": 2595 + }, + { + "epoch": 2.4171322160148976, + "grad_norm": 1.627670407295227, + "learning_rate": 3.3213559547770873e-06, + "loss": 0.9799, + "step": 2596 + }, + { + "epoch": 2.4180633147113593, + "grad_norm": 1.6770120859146118, + "learning_rate": 3.3201865601269206e-06, + "loss": 0.9445, + "step": 2597 + }, + { + "epoch": 2.4189944134078214, + "grad_norm": 1.7052509784698486, + "learning_rate": 3.3190169643425025e-06, + "loss": 1.0043, + "step": 2598 + }, + { + "epoch": 2.419925512104283, + "grad_norm": 1.6855852603912354, + "learning_rate": 3.3178471677106504e-06, + "loss": 0.9495, + "step": 2599 + }, + { + "epoch": 2.4208566108007448, + "grad_norm": 1.743435025215149, + "learning_rate": 3.316677170518235e-06, + "loss": 0.9987, + "step": 2600 + }, + { + "epoch": 2.421787709497207, + "grad_norm": 1.6144195795059204, + "learning_rate": 3.315506973052174e-06, + "loss": 0.9481, + "step": 2601 + }, + { + "epoch": 2.4227188081936686, + "grad_norm": 1.718222975730896, + "learning_rate": 3.3143365755994346e-06, + "loss": 0.9586, + "step": 2602 + }, + { + "epoch": 2.4236499068901303, + "grad_norm": 1.644515037536621, + "learning_rate": 3.3131659784470334e-06, + "loss": 0.9423, + "step": 2603 + }, + { + "epoch": 2.4245810055865924, + "grad_norm": 1.6273270845413208, + "learning_rate": 3.3119951818820357e-06, + "loss": 0.9598, + "step": 2604 + }, + { + "epoch": 2.425512104283054, + "grad_norm": 1.6700594425201416, + "learning_rate": 3.3108241861915565e-06, + "loss": 0.9155, + "step": 2605 + }, + { + "epoch": 2.4264432029795158, + "grad_norm": 1.6600873470306396, + "learning_rate": 3.309652991662758e-06, + "loss": 0.9351, + "step": 2606 + }, + { + "epoch": 2.427374301675978, + "grad_norm": 1.6999610662460327, + "learning_rate": 3.3084815985828524e-06, + "loss": 0.9712, + "step": 2607 + }, + { + "epoch": 2.4283054003724396, + "grad_norm": 1.643938660621643, + "learning_rate": 3.3073100072391e-06, + "loss": 0.9507, + "step": 2608 + }, + { + "epoch": 2.4292364990689013, + "grad_norm": 1.6479452848434448, + "learning_rate": 3.306138217918811e-06, + "loss": 0.9557, + "step": 2609 + }, + { + "epoch": 2.430167597765363, + "grad_norm": 1.6434718370437622, + "learning_rate": 3.304966230909342e-06, + "loss": 0.9475, + "step": 2610 + }, + { + "epoch": 2.431098696461825, + "grad_norm": 1.676967740058899, + "learning_rate": 3.3037940464981005e-06, + "loss": 0.9573, + "step": 2611 + }, + { + "epoch": 2.4320297951582868, + "grad_norm": 1.6404533386230469, + "learning_rate": 3.30262166497254e-06, + "loss": 0.932, + "step": 2612 + }, + { + "epoch": 2.4329608938547485, + "grad_norm": 1.6470792293548584, + "learning_rate": 3.301449086620164e-06, + "loss": 0.9507, + "step": 2613 + }, + { + "epoch": 2.4338919925512106, + "grad_norm": 1.6286709308624268, + "learning_rate": 3.300276311728523e-06, + "loss": 0.986, + "step": 2614 + }, + { + "epoch": 2.4348230912476723, + "grad_norm": 1.6010756492614746, + "learning_rate": 3.299103340585218e-06, + "loss": 0.9355, + "step": 2615 + }, + { + "epoch": 2.435754189944134, + "grad_norm": 1.6273709535598755, + "learning_rate": 3.297930173477895e-06, + "loss": 0.9762, + "step": 2616 + }, + { + "epoch": 2.436685288640596, + "grad_norm": 1.6747825145721436, + "learning_rate": 3.2967568106942504e-06, + "loss": 0.9455, + "step": 2617 + }, + { + "epoch": 2.4376163873370578, + "grad_norm": 1.5681103467941284, + "learning_rate": 3.295583252522028e-06, + "loss": 0.9678, + "step": 2618 + }, + { + "epoch": 2.4385474860335195, + "grad_norm": 1.6230740547180176, + "learning_rate": 3.294409499249019e-06, + "loss": 0.9732, + "step": 2619 + }, + { + "epoch": 2.439478584729981, + "grad_norm": 1.7424641847610474, + "learning_rate": 3.2932355511630627e-06, + "loss": 0.9887, + "step": 2620 + }, + { + "epoch": 2.4404096834264433, + "grad_norm": 1.6812171936035156, + "learning_rate": 3.2920614085520465e-06, + "loss": 0.9808, + "step": 2621 + }, + { + "epoch": 2.441340782122905, + "grad_norm": 1.6341584920883179, + "learning_rate": 3.290887071703905e-06, + "loss": 0.9699, + "step": 2622 + }, + { + "epoch": 2.4422718808193666, + "grad_norm": 1.6232788562774658, + "learning_rate": 3.289712540906621e-06, + "loss": 0.9573, + "step": 2623 + }, + { + "epoch": 2.4432029795158288, + "grad_norm": 1.6105865240097046, + "learning_rate": 3.2885378164482235e-06, + "loss": 0.9222, + "step": 2624 + }, + { + "epoch": 2.4441340782122905, + "grad_norm": 1.6497597694396973, + "learning_rate": 3.287362898616792e-06, + "loss": 0.9804, + "step": 2625 + }, + { + "epoch": 2.445065176908752, + "grad_norm": 1.6729713678359985, + "learning_rate": 3.2861877877004495e-06, + "loss": 0.9674, + "step": 2626 + }, + { + "epoch": 2.4459962756052143, + "grad_norm": 1.700015902519226, + "learning_rate": 3.2850124839873693e-06, + "loss": 0.9498, + "step": 2627 + }, + { + "epoch": 2.446927374301676, + "grad_norm": 1.598156452178955, + "learning_rate": 3.283836987765771e-06, + "loss": 0.959, + "step": 2628 + }, + { + "epoch": 2.4478584729981376, + "grad_norm": 1.6494011878967285, + "learning_rate": 3.2826612993239213e-06, + "loss": 0.986, + "step": 2629 + }, + { + "epoch": 2.4487895716945998, + "grad_norm": 1.6884851455688477, + "learning_rate": 3.2814854189501343e-06, + "loss": 0.9648, + "step": 2630 + }, + { + "epoch": 2.4497206703910615, + "grad_norm": 1.6454182863235474, + "learning_rate": 3.28030934693277e-06, + "loss": 0.9667, + "step": 2631 + }, + { + "epoch": 2.450651769087523, + "grad_norm": 1.6368962526321411, + "learning_rate": 3.2791330835602385e-06, + "loss": 0.9348, + "step": 2632 + }, + { + "epoch": 2.4515828677839853, + "grad_norm": 1.6652709245681763, + "learning_rate": 3.2779566291209918e-06, + "loss": 0.9571, + "step": 2633 + }, + { + "epoch": 2.452513966480447, + "grad_norm": 1.6055703163146973, + "learning_rate": 3.2767799839035347e-06, + "loss": 0.9356, + "step": 2634 + }, + { + "epoch": 2.4534450651769086, + "grad_norm": 1.6307944059371948, + "learning_rate": 3.2756031481964134e-06, + "loss": 0.9324, + "step": 2635 + }, + { + "epoch": 2.4543761638733708, + "grad_norm": 1.5875563621520996, + "learning_rate": 3.274426122288225e-06, + "loss": 0.9381, + "step": 2636 + }, + { + "epoch": 2.4553072625698324, + "grad_norm": 1.6253734827041626, + "learning_rate": 3.2732489064676096e-06, + "loss": 0.9514, + "step": 2637 + }, + { + "epoch": 2.456238361266294, + "grad_norm": 1.6338211297988892, + "learning_rate": 3.2720715010232572e-06, + "loss": 0.9504, + "step": 2638 + }, + { + "epoch": 2.4571694599627563, + "grad_norm": 1.6506370306015015, + "learning_rate": 3.2708939062439027e-06, + "loss": 0.9667, + "step": 2639 + }, + { + "epoch": 2.458100558659218, + "grad_norm": 1.6240171194076538, + "learning_rate": 3.269716122418326e-06, + "loss": 0.9284, + "step": 2640 + }, + { + "epoch": 2.4590316573556796, + "grad_norm": 1.6600265502929688, + "learning_rate": 3.2685381498353574e-06, + "loss": 0.9666, + "step": 2641 + }, + { + "epoch": 2.4599627560521418, + "grad_norm": 1.6203696727752686, + "learning_rate": 3.267359988783869e-06, + "loss": 0.9705, + "step": 2642 + }, + { + "epoch": 2.4608938547486034, + "grad_norm": 1.5917997360229492, + "learning_rate": 3.266181639552781e-06, + "loss": 0.9284, + "step": 2643 + }, + { + "epoch": 2.461824953445065, + "grad_norm": 1.6019238233566284, + "learning_rate": 3.2650031024310607e-06, + "loss": 0.9538, + "step": 2644 + }, + { + "epoch": 2.462756052141527, + "grad_norm": 1.6653268337249756, + "learning_rate": 3.2638243777077204e-06, + "loss": 0.9551, + "step": 2645 + }, + { + "epoch": 2.463687150837989, + "grad_norm": 1.6435784101486206, + "learning_rate": 3.262645465671819e-06, + "loss": 0.9281, + "step": 2646 + }, + { + "epoch": 2.4646182495344506, + "grad_norm": 1.611222743988037, + "learning_rate": 3.26146636661246e-06, + "loss": 0.9646, + "step": 2647 + }, + { + "epoch": 2.4655493482309123, + "grad_norm": 1.6042946577072144, + "learning_rate": 3.2602870808187955e-06, + "loss": 0.9553, + "step": 2648 + }, + { + "epoch": 2.4664804469273744, + "grad_norm": 1.6262257099151611, + "learning_rate": 3.2591076085800193e-06, + "loss": 0.9423, + "step": 2649 + }, + { + "epoch": 2.467411545623836, + "grad_norm": 1.7350205183029175, + "learning_rate": 3.2579279501853746e-06, + "loss": 0.9583, + "step": 2650 + }, + { + "epoch": 2.468342644320298, + "grad_norm": 1.658310055732727, + "learning_rate": 3.256748105924149e-06, + "loss": 0.9655, + "step": 2651 + }, + { + "epoch": 2.46927374301676, + "grad_norm": 1.5956592559814453, + "learning_rate": 3.255568076085675e-06, + "loss": 0.9628, + "step": 2652 + }, + { + "epoch": 2.4702048417132216, + "grad_norm": 1.6241995096206665, + "learning_rate": 3.2543878609593314e-06, + "loss": 0.941, + "step": 2653 + }, + { + "epoch": 2.4711359404096833, + "grad_norm": 1.5923924446105957, + "learning_rate": 3.253207460834542e-06, + "loss": 0.9342, + "step": 2654 + }, + { + "epoch": 2.472067039106145, + "grad_norm": 1.6418542861938477, + "learning_rate": 3.2520268760007768e-06, + "loss": 0.9534, + "step": 2655 + }, + { + "epoch": 2.472998137802607, + "grad_norm": 1.634240746498108, + "learning_rate": 3.25084610674755e-06, + "loss": 0.9608, + "step": 2656 + }, + { + "epoch": 2.473929236499069, + "grad_norm": 1.5932693481445312, + "learning_rate": 3.249665153364421e-06, + "loss": 0.963, + "step": 2657 + }, + { + "epoch": 2.4748603351955305, + "grad_norm": 1.617982029914856, + "learning_rate": 3.248484016140996e-06, + "loss": 0.9391, + "step": 2658 + }, + { + "epoch": 2.4757914338919926, + "grad_norm": 1.6907939910888672, + "learning_rate": 3.2473026953669245e-06, + "loss": 0.9438, + "step": 2659 + }, + { + "epoch": 2.4767225325884543, + "grad_norm": 1.5538761615753174, + "learning_rate": 3.246121191331902e-06, + "loss": 0.941, + "step": 2660 + }, + { + "epoch": 2.477653631284916, + "grad_norm": 1.5928725004196167, + "learning_rate": 3.2449395043256683e-06, + "loss": 0.9695, + "step": 2661 + }, + { + "epoch": 2.478584729981378, + "grad_norm": 1.6139932870864868, + "learning_rate": 3.2437576346380077e-06, + "loss": 0.9563, + "step": 2662 + }, + { + "epoch": 2.47951582867784, + "grad_norm": 1.7027126550674438, + "learning_rate": 3.2425755825587515e-06, + "loss": 0.9372, + "step": 2663 + }, + { + "epoch": 2.4804469273743015, + "grad_norm": 1.6509934663772583, + "learning_rate": 3.2413933483777725e-06, + "loss": 0.9442, + "step": 2664 + }, + { + "epoch": 2.4813780260707636, + "grad_norm": 1.5893349647521973, + "learning_rate": 3.240210932384991e-06, + "loss": 0.9307, + "step": 2665 + }, + { + "epoch": 2.4823091247672253, + "grad_norm": 1.6430909633636475, + "learning_rate": 3.239028334870371e-06, + "loss": 0.9626, + "step": 2666 + }, + { + "epoch": 2.483240223463687, + "grad_norm": 1.6869335174560547, + "learning_rate": 3.23784555612392e-06, + "loss": 0.9607, + "step": 2667 + }, + { + "epoch": 2.484171322160149, + "grad_norm": 1.588243007659912, + "learning_rate": 3.2366625964356906e-06, + "loss": 0.9608, + "step": 2668 + }, + { + "epoch": 2.485102420856611, + "grad_norm": 1.6660819053649902, + "learning_rate": 3.2354794560957793e-06, + "loss": 0.9824, + "step": 2669 + }, + { + "epoch": 2.4860335195530725, + "grad_norm": 1.652782678604126, + "learning_rate": 3.234296135394329e-06, + "loss": 0.9175, + "step": 2670 + }, + { + "epoch": 2.4869646182495346, + "grad_norm": 1.6150346994400024, + "learning_rate": 3.2331126346215247e-06, + "loss": 0.9775, + "step": 2671 + }, + { + "epoch": 2.4878957169459963, + "grad_norm": 1.6447911262512207, + "learning_rate": 3.2319289540675963e-06, + "loss": 0.9344, + "step": 2672 + }, + { + "epoch": 2.488826815642458, + "grad_norm": 1.6614223718643188, + "learning_rate": 3.230745094022818e-06, + "loss": 0.9467, + "step": 2673 + }, + { + "epoch": 2.48975791433892, + "grad_norm": 1.6187084913253784, + "learning_rate": 3.2295610547775054e-06, + "loss": 0.9545, + "step": 2674 + }, + { + "epoch": 2.490689013035382, + "grad_norm": 1.6777342557907104, + "learning_rate": 3.228376836622023e-06, + "loss": 0.985, + "step": 2675 + }, + { + "epoch": 2.4916201117318435, + "grad_norm": 1.6157910823822021, + "learning_rate": 3.2271924398467746e-06, + "loss": 0.9573, + "step": 2676 + }, + { + "epoch": 2.4925512104283056, + "grad_norm": 1.5595381259918213, + "learning_rate": 3.2260078647422116e-06, + "loss": 0.9273, + "step": 2677 + }, + { + "epoch": 2.4934823091247673, + "grad_norm": 1.653050422668457, + "learning_rate": 3.2248231115988253e-06, + "loss": 0.9611, + "step": 2678 + }, + { + "epoch": 2.494413407821229, + "grad_norm": 1.6171876192092896, + "learning_rate": 3.2236381807071543e-06, + "loss": 0.9452, + "step": 2679 + }, + { + "epoch": 2.4953445065176907, + "grad_norm": 1.6414589881896973, + "learning_rate": 3.2224530723577775e-06, + "loss": 0.9668, + "step": 2680 + }, + { + "epoch": 2.496275605214153, + "grad_norm": 1.6396350860595703, + "learning_rate": 3.221267786841319e-06, + "loss": 0.9429, + "step": 2681 + }, + { + "epoch": 2.4972067039106145, + "grad_norm": 1.6193255186080933, + "learning_rate": 3.220082324448448e-06, + "loss": 0.983, + "step": 2682 + }, + { + "epoch": 2.498137802607076, + "grad_norm": 1.6369428634643555, + "learning_rate": 3.2188966854698724e-06, + "loss": 0.965, + "step": 2683 + }, + { + "epoch": 2.4990689013035383, + "grad_norm": 1.6261485815048218, + "learning_rate": 3.2177108701963494e-06, + "loss": 0.9932, + "step": 2684 + }, + { + "epoch": 2.5, + "grad_norm": 1.6400699615478516, + "learning_rate": 3.2165248789186744e-06, + "loss": 0.9876, + "step": 2685 + }, + { + "epoch": 2.5009310986964617, + "grad_norm": 1.5728873014450073, + "learning_rate": 3.2153387119276886e-06, + "loss": 0.9208, + "step": 2686 + }, + { + "epoch": 2.501862197392924, + "grad_norm": 1.5843089818954468, + "learning_rate": 3.214152369514275e-06, + "loss": 0.9705, + "step": 2687 + }, + { + "epoch": 2.5027932960893855, + "grad_norm": 1.6322309970855713, + "learning_rate": 3.2129658519693613e-06, + "loss": 0.9442, + "step": 2688 + }, + { + "epoch": 2.503724394785847, + "grad_norm": 1.5921897888183594, + "learning_rate": 3.211779159583916e-06, + "loss": 0.9573, + "step": 2689 + }, + { + "epoch": 2.504655493482309, + "grad_norm": 1.6905947923660278, + "learning_rate": 3.2105922926489507e-06, + "loss": 0.985, + "step": 2690 + }, + { + "epoch": 2.505586592178771, + "grad_norm": 1.6740881204605103, + "learning_rate": 3.209405251455524e-06, + "loss": 0.9897, + "step": 2691 + }, + { + "epoch": 2.5065176908752327, + "grad_norm": 1.6153020858764648, + "learning_rate": 3.20821803629473e-06, + "loss": 0.9173, + "step": 2692 + }, + { + "epoch": 2.5074487895716944, + "grad_norm": 1.6218596696853638, + "learning_rate": 3.2070306474577123e-06, + "loss": 0.9639, + "step": 2693 + }, + { + "epoch": 2.5083798882681565, + "grad_norm": 1.6769354343414307, + "learning_rate": 3.205843085235652e-06, + "loss": 0.9687, + "step": 2694 + }, + { + "epoch": 2.509310986964618, + "grad_norm": 1.6009342670440674, + "learning_rate": 3.204655349919776e-06, + "loss": 0.9274, + "step": 2695 + }, + { + "epoch": 2.51024208566108, + "grad_norm": 1.6258291006088257, + "learning_rate": 3.2034674418013523e-06, + "loss": 0.9475, + "step": 2696 + }, + { + "epoch": 2.511173184357542, + "grad_norm": 1.649918794631958, + "learning_rate": 3.202279361171691e-06, + "loss": 0.9828, + "step": 2697 + }, + { + "epoch": 2.5121042830540037, + "grad_norm": 1.6119354963302612, + "learning_rate": 3.2010911083221453e-06, + "loss": 0.953, + "step": 2698 + }, + { + "epoch": 2.5130353817504654, + "grad_norm": 1.6593668460845947, + "learning_rate": 3.1999026835441104e-06, + "loss": 0.9554, + "step": 2699 + }, + { + "epoch": 2.5139664804469275, + "grad_norm": 1.7150312662124634, + "learning_rate": 3.198714087129024e-06, + "loss": 1.0175, + "step": 2700 + }, + { + "epoch": 2.514897579143389, + "grad_norm": 1.5864650011062622, + "learning_rate": 3.197525319368365e-06, + "loss": 0.9561, + "step": 2701 + }, + { + "epoch": 2.515828677839851, + "grad_norm": 1.6027896404266357, + "learning_rate": 3.1963363805536542e-06, + "loss": 0.9817, + "step": 2702 + }, + { + "epoch": 2.516759776536313, + "grad_norm": 1.668905258178711, + "learning_rate": 3.195147270976455e-06, + "loss": 0.9692, + "step": 2703 + }, + { + "epoch": 2.5176908752327747, + "grad_norm": 1.641294002532959, + "learning_rate": 3.193957990928374e-06, + "loss": 0.9391, + "step": 2704 + }, + { + "epoch": 2.5186219739292364, + "grad_norm": 1.6915578842163086, + "learning_rate": 3.1927685407010574e-06, + "loss": 0.9842, + "step": 2705 + }, + { + "epoch": 2.5195530726256985, + "grad_norm": 1.6631584167480469, + "learning_rate": 3.191578920586193e-06, + "loss": 1.0038, + "step": 2706 + }, + { + "epoch": 2.52048417132216, + "grad_norm": 1.6747456789016724, + "learning_rate": 3.1903891308755125e-06, + "loss": 0.8991, + "step": 2707 + }, + { + "epoch": 2.521415270018622, + "grad_norm": 1.651414155960083, + "learning_rate": 3.1891991718607874e-06, + "loss": 0.9691, + "step": 2708 + }, + { + "epoch": 2.522346368715084, + "grad_norm": 1.6247169971466064, + "learning_rate": 3.1880090438338308e-06, + "loss": 0.9746, + "step": 2709 + }, + { + "epoch": 2.5232774674115457, + "grad_norm": 1.7061047554016113, + "learning_rate": 3.1868187470864986e-06, + "loss": 0.9578, + "step": 2710 + }, + { + "epoch": 2.5242085661080074, + "grad_norm": 1.6343475580215454, + "learning_rate": 3.1856282819106867e-06, + "loss": 0.9196, + "step": 2711 + }, + { + "epoch": 2.5251396648044695, + "grad_norm": 1.637317419052124, + "learning_rate": 3.184437648598332e-06, + "loss": 0.9537, + "step": 2712 + }, + { + "epoch": 2.526070763500931, + "grad_norm": 1.6752567291259766, + "learning_rate": 3.1832468474414148e-06, + "loss": 0.9653, + "step": 2713 + }, + { + "epoch": 2.527001862197393, + "grad_norm": 1.614477276802063, + "learning_rate": 3.1820558787319528e-06, + "loss": 0.9844, + "step": 2714 + }, + { + "epoch": 2.527932960893855, + "grad_norm": 1.6819109916687012, + "learning_rate": 3.18086474276201e-06, + "loss": 1.0123, + "step": 2715 + }, + { + "epoch": 2.5288640595903167, + "grad_norm": 1.6218712329864502, + "learning_rate": 3.1796734398236863e-06, + "loss": 0.9654, + "step": 2716 + }, + { + "epoch": 2.5297951582867784, + "grad_norm": 1.6440956592559814, + "learning_rate": 3.1784819702091263e-06, + "loss": 0.9451, + "step": 2717 + }, + { + "epoch": 2.5307262569832405, + "grad_norm": 1.6846622228622437, + "learning_rate": 3.1772903342105135e-06, + "loss": 0.9828, + "step": 2718 + }, + { + "epoch": 2.531657355679702, + "grad_norm": 1.7056374549865723, + "learning_rate": 3.176098532120071e-06, + "loss": 0.963, + "step": 2719 + }, + { + "epoch": 2.532588454376164, + "grad_norm": 1.6760709285736084, + "learning_rate": 3.1749065642300677e-06, + "loss": 0.954, + "step": 2720 + }, + { + "epoch": 2.5335195530726256, + "grad_norm": 1.6642045974731445, + "learning_rate": 3.173714430832806e-06, + "loss": 0.9972, + "step": 2721 + }, + { + "epoch": 2.5344506517690877, + "grad_norm": 1.6921284198760986, + "learning_rate": 3.1725221322206355e-06, + "loss": 0.9384, + "step": 2722 + }, + { + "epoch": 2.5353817504655494, + "grad_norm": 1.6488786935806274, + "learning_rate": 3.171329668685942e-06, + "loss": 0.9436, + "step": 2723 + }, + { + "epoch": 2.536312849162011, + "grad_norm": 1.70298433303833, + "learning_rate": 3.1701370405211536e-06, + "loss": 0.9577, + "step": 2724 + }, + { + "epoch": 2.5372439478584727, + "grad_norm": 1.6803982257843018, + "learning_rate": 3.1689442480187388e-06, + "loss": 0.9582, + "step": 2725 + }, + { + "epoch": 2.538175046554935, + "grad_norm": 1.6304399967193604, + "learning_rate": 3.1677512914712044e-06, + "loss": 0.948, + "step": 2726 + }, + { + "epoch": 2.5391061452513966, + "grad_norm": 1.6413767337799072, + "learning_rate": 3.1665581711711014e-06, + "loss": 0.9774, + "step": 2727 + }, + { + "epoch": 2.5400372439478582, + "grad_norm": 1.6617759466171265, + "learning_rate": 3.165364887411016e-06, + "loss": 0.926, + "step": 2728 + }, + { + "epoch": 2.5409683426443204, + "grad_norm": 1.6656525135040283, + "learning_rate": 3.164171440483579e-06, + "loss": 0.9859, + "step": 2729 + }, + { + "epoch": 2.541899441340782, + "grad_norm": 1.6810792684555054, + "learning_rate": 3.1629778306814586e-06, + "loss": 0.9433, + "step": 2730 + }, + { + "epoch": 2.5428305400372437, + "grad_norm": 1.6987841129302979, + "learning_rate": 3.161784058297363e-06, + "loss": 0.9508, + "step": 2731 + }, + { + "epoch": 2.543761638733706, + "grad_norm": 1.5758877992630005, + "learning_rate": 3.160590123624041e-06, + "loss": 0.9221, + "step": 2732 + }, + { + "epoch": 2.5446927374301676, + "grad_norm": 1.6114469766616821, + "learning_rate": 3.1593960269542817e-06, + "loss": 0.9251, + "step": 2733 + }, + { + "epoch": 2.5456238361266292, + "grad_norm": 1.7200394868850708, + "learning_rate": 3.1582017685809136e-06, + "loss": 0.9773, + "step": 2734 + }, + { + "epoch": 2.5465549348230914, + "grad_norm": 1.576918601989746, + "learning_rate": 3.1570073487968035e-06, + "loss": 0.9137, + "step": 2735 + }, + { + "epoch": 2.547486033519553, + "grad_norm": 1.5539870262145996, + "learning_rate": 3.155812767894859e-06, + "loss": 0.9061, + "step": 2736 + }, + { + "epoch": 2.5484171322160147, + "grad_norm": 1.6504672765731812, + "learning_rate": 3.1546180261680283e-06, + "loss": 0.941, + "step": 2737 + }, + { + "epoch": 2.549348230912477, + "grad_norm": 1.614476203918457, + "learning_rate": 3.1534231239092957e-06, + "loss": 0.896, + "step": 2738 + }, + { + "epoch": 2.5502793296089385, + "grad_norm": 1.616269826889038, + "learning_rate": 3.1522280614116886e-06, + "loss": 0.945, + "step": 2739 + }, + { + "epoch": 2.5512104283054002, + "grad_norm": 1.6424388885498047, + "learning_rate": 3.1510328389682708e-06, + "loss": 0.95, + "step": 2740 + }, + { + "epoch": 2.5521415270018624, + "grad_norm": 1.5913423299789429, + "learning_rate": 3.1498374568721473e-06, + "loss": 0.9496, + "step": 2741 + }, + { + "epoch": 2.553072625698324, + "grad_norm": 1.5946433544158936, + "learning_rate": 3.1486419154164615e-06, + "loss": 0.9832, + "step": 2742 + }, + { + "epoch": 2.5540037243947857, + "grad_norm": 1.606937289237976, + "learning_rate": 3.1474462148943963e-06, + "loss": 0.981, + "step": 2743 + }, + { + "epoch": 2.554934823091248, + "grad_norm": 1.6524548530578613, + "learning_rate": 3.146250355599172e-06, + "loss": 1.0038, + "step": 2744 + }, + { + "epoch": 2.5558659217877095, + "grad_norm": 1.6219244003295898, + "learning_rate": 3.14505433782405e-06, + "loss": 0.9715, + "step": 2745 + }, + { + "epoch": 2.5567970204841712, + "grad_norm": 1.663824200630188, + "learning_rate": 3.1438581618623293e-06, + "loss": 0.9773, + "step": 2746 + }, + { + "epoch": 2.5577281191806334, + "grad_norm": 1.6206378936767578, + "learning_rate": 3.1426618280073485e-06, + "loss": 0.951, + "step": 2747 + }, + { + "epoch": 2.558659217877095, + "grad_norm": 1.6190366744995117, + "learning_rate": 3.1414653365524827e-06, + "loss": 0.9686, + "step": 2748 + }, + { + "epoch": 2.5595903165735567, + "grad_norm": 1.6203886270523071, + "learning_rate": 3.1402686877911494e-06, + "loss": 0.9125, + "step": 2749 + }, + { + "epoch": 2.560521415270019, + "grad_norm": 1.583175539970398, + "learning_rate": 3.139071882016802e-06, + "loss": 0.9413, + "step": 2750 + }, + { + "epoch": 2.5614525139664805, + "grad_norm": 1.6100095510482788, + "learning_rate": 3.1378749195229325e-06, + "loss": 0.9386, + "step": 2751 + }, + { + "epoch": 2.5623836126629422, + "grad_norm": 1.6055479049682617, + "learning_rate": 3.1366778006030717e-06, + "loss": 0.9745, + "step": 2752 + }, + { + "epoch": 2.5633147113594044, + "grad_norm": 1.6939977407455444, + "learning_rate": 3.1354805255507902e-06, + "loss": 0.9899, + "step": 2753 + }, + { + "epoch": 2.564245810055866, + "grad_norm": 1.599851131439209, + "learning_rate": 3.134283094659695e-06, + "loss": 0.9089, + "step": 2754 + }, + { + "epoch": 2.5651769087523277, + "grad_norm": 1.6958569288253784, + "learning_rate": 3.1330855082234313e-06, + "loss": 0.9982, + "step": 2755 + }, + { + "epoch": 2.5661080074487894, + "grad_norm": 1.58479642868042, + "learning_rate": 3.131887766535684e-06, + "loss": 0.9103, + "step": 2756 + }, + { + "epoch": 2.5670391061452515, + "grad_norm": 1.6718580722808838, + "learning_rate": 3.1306898698901744e-06, + "loss": 0.9578, + "step": 2757 + }, + { + "epoch": 2.5679702048417132, + "grad_norm": 1.6546039581298828, + "learning_rate": 3.1294918185806627e-06, + "loss": 0.9321, + "step": 2758 + }, + { + "epoch": 2.568901303538175, + "grad_norm": 1.6734904050827026, + "learning_rate": 3.1282936129009473e-06, + "loss": 0.9425, + "step": 2759 + }, + { + "epoch": 2.5698324022346366, + "grad_norm": 1.6155725717544556, + "learning_rate": 3.127095253144864e-06, + "loss": 0.9266, + "step": 2760 + }, + { + "epoch": 2.5707635009310987, + "grad_norm": 1.6590644121170044, + "learning_rate": 3.125896739606286e-06, + "loss": 0.942, + "step": 2761 + }, + { + "epoch": 2.5716945996275604, + "grad_norm": 1.6101138591766357, + "learning_rate": 3.124698072579125e-06, + "loss": 0.9837, + "step": 2762 + }, + { + "epoch": 2.572625698324022, + "grad_norm": 1.6097989082336426, + "learning_rate": 3.12349925235733e-06, + "loss": 0.9496, + "step": 2763 + }, + { + "epoch": 2.5735567970204842, + "grad_norm": 1.6388952732086182, + "learning_rate": 3.122300279234886e-06, + "loss": 0.9519, + "step": 2764 + }, + { + "epoch": 2.574487895716946, + "grad_norm": 1.5914781093597412, + "learning_rate": 3.12110115350582e-06, + "loss": 0.93, + "step": 2765 + }, + { + "epoch": 2.5754189944134076, + "grad_norm": 1.5782808065414429, + "learning_rate": 3.1199018754641907e-06, + "loss": 0.9343, + "step": 2766 + }, + { + "epoch": 2.5763500931098697, + "grad_norm": 1.5984432697296143, + "learning_rate": 3.1187024454040993e-06, + "loss": 0.9514, + "step": 2767 + }, + { + "epoch": 2.5772811918063314, + "grad_norm": 1.5987129211425781, + "learning_rate": 3.11750286361968e-06, + "loss": 0.921, + "step": 2768 + }, + { + "epoch": 2.578212290502793, + "grad_norm": 1.5802693367004395, + "learning_rate": 3.1163031304051065e-06, + "loss": 0.9252, + "step": 2769 + }, + { + "epoch": 2.5791433891992552, + "grad_norm": 1.6504684686660767, + "learning_rate": 3.1151032460545906e-06, + "loss": 0.9065, + "step": 2770 + }, + { + "epoch": 2.580074487895717, + "grad_norm": 1.6109102964401245, + "learning_rate": 3.1139032108623773e-06, + "loss": 0.9553, + "step": 2771 + }, + { + "epoch": 2.5810055865921786, + "grad_norm": 1.6237130165100098, + "learning_rate": 3.1127030251227534e-06, + "loss": 0.9649, + "step": 2772 + }, + { + "epoch": 2.5819366852886407, + "grad_norm": 1.5690436363220215, + "learning_rate": 3.111502689130039e-06, + "loss": 0.9843, + "step": 2773 + }, + { + "epoch": 2.5828677839851024, + "grad_norm": 1.632887840270996, + "learning_rate": 3.110302203178593e-06, + "loss": 0.9335, + "step": 2774 + }, + { + "epoch": 2.583798882681564, + "grad_norm": 1.753038763999939, + "learning_rate": 3.10910156756281e-06, + "loss": 0.98, + "step": 2775 + }, + { + "epoch": 2.5847299813780262, + "grad_norm": 1.584394931793213, + "learning_rate": 3.1079007825771217e-06, + "loss": 0.9226, + "step": 2776 + }, + { + "epoch": 2.585661080074488, + "grad_norm": 1.5974069833755493, + "learning_rate": 3.1066998485159965e-06, + "loss": 0.9383, + "step": 2777 + }, + { + "epoch": 2.5865921787709496, + "grad_norm": 1.6693968772888184, + "learning_rate": 3.1054987656739395e-06, + "loss": 0.9654, + "step": 2778 + }, + { + "epoch": 2.5875232774674117, + "grad_norm": 1.673758864402771, + "learning_rate": 3.1042975343454927e-06, + "loss": 0.9673, + "step": 2779 + }, + { + "epoch": 2.5884543761638734, + "grad_norm": 1.6719874143600464, + "learning_rate": 3.103096154825233e-06, + "loss": 0.9402, + "step": 2780 + }, + { + "epoch": 2.589385474860335, + "grad_norm": 1.63694167137146, + "learning_rate": 3.1018946274077748e-06, + "loss": 0.9231, + "step": 2781 + }, + { + "epoch": 2.5903165735567972, + "grad_norm": 1.603509783744812, + "learning_rate": 3.100692952387769e-06, + "loss": 0.9154, + "step": 2782 + }, + { + "epoch": 2.591247672253259, + "grad_norm": 1.6750872135162354, + "learning_rate": 3.0994911300599013e-06, + "loss": 0.9394, + "step": 2783 + }, + { + "epoch": 2.5921787709497206, + "grad_norm": 1.6520224809646606, + "learning_rate": 3.0982891607188948e-06, + "loss": 0.9231, + "step": 2784 + }, + { + "epoch": 2.5931098696461827, + "grad_norm": 1.6294194459915161, + "learning_rate": 3.0970870446595087e-06, + "loss": 0.9222, + "step": 2785 + }, + { + "epoch": 2.5940409683426444, + "grad_norm": 1.70402193069458, + "learning_rate": 3.0958847821765377e-06, + "loss": 0.9699, + "step": 2786 + }, + { + "epoch": 2.594972067039106, + "grad_norm": 1.700933575630188, + "learning_rate": 3.094682373564812e-06, + "loss": 0.9437, + "step": 2787 + }, + { + "epoch": 2.5959031657355682, + "grad_norm": 1.5616328716278076, + "learning_rate": 3.0934798191191986e-06, + "loss": 0.9535, + "step": 2788 + }, + { + "epoch": 2.59683426443203, + "grad_norm": 1.6358249187469482, + "learning_rate": 3.092277119134599e-06, + "loss": 0.9425, + "step": 2789 + }, + { + "epoch": 2.5977653631284916, + "grad_norm": 1.6383639574050903, + "learning_rate": 3.091074273905953e-06, + "loss": 0.982, + "step": 2790 + }, + { + "epoch": 2.5986964618249533, + "grad_norm": 1.5911756753921509, + "learning_rate": 3.089871283728232e-06, + "loss": 0.9508, + "step": 2791 + }, + { + "epoch": 2.5996275605214154, + "grad_norm": 1.633705496788025, + "learning_rate": 3.0886681488964466e-06, + "loss": 0.9524, + "step": 2792 + }, + { + "epoch": 2.600558659217877, + "grad_norm": 1.6034173965454102, + "learning_rate": 3.0874648697056403e-06, + "loss": 0.943, + "step": 2793 + }, + { + "epoch": 2.601489757914339, + "grad_norm": 1.6653735637664795, + "learning_rate": 3.0862614464508944e-06, + "loss": 0.9616, + "step": 2794 + }, + { + "epoch": 2.6024208566108005, + "grad_norm": 1.5828442573547363, + "learning_rate": 3.0850578794273236e-06, + "loss": 0.9045, + "step": 2795 + }, + { + "epoch": 2.6033519553072626, + "grad_norm": 1.6605607271194458, + "learning_rate": 3.083854168930078e-06, + "loss": 0.9554, + "step": 2796 + }, + { + "epoch": 2.6042830540037243, + "grad_norm": 1.7061210870742798, + "learning_rate": 3.082650315254344e-06, + "loss": 0.9633, + "step": 2797 + }, + { + "epoch": 2.605214152700186, + "grad_norm": 1.6655867099761963, + "learning_rate": 3.0814463186953424e-06, + "loss": 0.9708, + "step": 2798 + }, + { + "epoch": 2.606145251396648, + "grad_norm": 1.5771557092666626, + "learning_rate": 3.08024217954833e-06, + "loss": 0.9222, + "step": 2799 + }, + { + "epoch": 2.60707635009311, + "grad_norm": 1.6326788663864136, + "learning_rate": 3.0790378981085957e-06, + "loss": 0.9345, + "step": 2800 + }, + { + "epoch": 2.6080074487895715, + "grad_norm": 1.6208207607269287, + "learning_rate": 3.077833474671467e-06, + "loss": 0.9574, + "step": 2801 + }, + { + "epoch": 2.6089385474860336, + "grad_norm": 1.6612160205841064, + "learning_rate": 3.076628909532303e-06, + "loss": 0.9509, + "step": 2802 + }, + { + "epoch": 2.6098696461824953, + "grad_norm": 1.69960618019104, + "learning_rate": 3.0754242029865005e-06, + "loss": 0.9156, + "step": 2803 + }, + { + "epoch": 2.610800744878957, + "grad_norm": 1.7071855068206787, + "learning_rate": 3.0742193553294896e-06, + "loss": 0.9748, + "step": 2804 + }, + { + "epoch": 2.611731843575419, + "grad_norm": 1.6477720737457275, + "learning_rate": 3.073014366856733e-06, + "loss": 0.9873, + "step": 2805 + }, + { + "epoch": 2.612662942271881, + "grad_norm": 1.680709719657898, + "learning_rate": 3.0718092378637325e-06, + "loss": 0.9412, + "step": 2806 + }, + { + "epoch": 2.6135940409683425, + "grad_norm": 1.6208752393722534, + "learning_rate": 3.07060396864602e-06, + "loss": 0.9422, + "step": 2807 + }, + { + "epoch": 2.6145251396648046, + "grad_norm": 1.6615691184997559, + "learning_rate": 3.0693985594991643e-06, + "loss": 0.9973, + "step": 2808 + }, + { + "epoch": 2.6154562383612663, + "grad_norm": 1.6101716756820679, + "learning_rate": 3.0681930107187667e-06, + "loss": 0.9604, + "step": 2809 + }, + { + "epoch": 2.616387337057728, + "grad_norm": 1.5833975076675415, + "learning_rate": 3.0669873226004655e-06, + "loss": 0.8983, + "step": 2810 + }, + { + "epoch": 2.61731843575419, + "grad_norm": 1.6398487091064453, + "learning_rate": 3.06578149543993e-06, + "loss": 0.9398, + "step": 2811 + }, + { + "epoch": 2.618249534450652, + "grad_norm": 1.7333900928497314, + "learning_rate": 3.064575529532865e-06, + "loss": 0.9354, + "step": 2812 + }, + { + "epoch": 2.6191806331471135, + "grad_norm": 1.6353338956832886, + "learning_rate": 3.063369425175011e-06, + "loss": 0.9272, + "step": 2813 + }, + { + "epoch": 2.6201117318435756, + "grad_norm": 1.5639485120773315, + "learning_rate": 3.062163182662139e-06, + "loss": 0.9171, + "step": 2814 + }, + { + "epoch": 2.6210428305400373, + "grad_norm": 1.578440546989441, + "learning_rate": 3.060956802290057e-06, + "loss": 0.9036, + "step": 2815 + }, + { + "epoch": 2.621973929236499, + "grad_norm": 1.6571640968322754, + "learning_rate": 3.0597502843546044e-06, + "loss": 0.9671, + "step": 2816 + }, + { + "epoch": 2.622905027932961, + "grad_norm": 1.6397638320922852, + "learning_rate": 3.058543629151657e-06, + "loss": 0.982, + "step": 2817 + }, + { + "epoch": 2.623836126629423, + "grad_norm": 1.668396234512329, + "learning_rate": 3.0573368369771204e-06, + "loss": 0.9567, + "step": 2818 + }, + { + "epoch": 2.6247672253258845, + "grad_norm": 1.600182294845581, + "learning_rate": 3.056129908126938e-06, + "loss": 0.9538, + "step": 2819 + }, + { + "epoch": 2.6256983240223466, + "grad_norm": 1.6866743564605713, + "learning_rate": 3.0549228428970844e-06, + "loss": 0.9791, + "step": 2820 + }, + { + "epoch": 2.6266294227188083, + "grad_norm": 1.5799665451049805, + "learning_rate": 3.053715641583567e-06, + "loss": 0.9613, + "step": 2821 + }, + { + "epoch": 2.62756052141527, + "grad_norm": 1.5834141969680786, + "learning_rate": 3.0525083044824306e-06, + "loss": 0.9783, + "step": 2822 + }, + { + "epoch": 2.628491620111732, + "grad_norm": 1.6275930404663086, + "learning_rate": 3.0513008318897468e-06, + "loss": 1.0146, + "step": 2823 + }, + { + "epoch": 2.629422718808194, + "grad_norm": 1.6697674989700317, + "learning_rate": 3.0500932241016255e-06, + "loss": 0.9296, + "step": 2824 + }, + { + "epoch": 2.6303538175046555, + "grad_norm": 1.5969644784927368, + "learning_rate": 3.0488854814142083e-06, + "loss": 0.9779, + "step": 2825 + }, + { + "epoch": 2.631284916201117, + "grad_norm": 1.6485135555267334, + "learning_rate": 3.04767760412367e-06, + "loss": 0.9463, + "step": 2826 + }, + { + "epoch": 2.6322160148975793, + "grad_norm": 1.6014982461929321, + "learning_rate": 3.0464695925262173e-06, + "loss": 0.9296, + "step": 2827 + }, + { + "epoch": 2.633147113594041, + "grad_norm": 1.6272295713424683, + "learning_rate": 3.045261446918092e-06, + "loss": 1.0032, + "step": 2828 + }, + { + "epoch": 2.6340782122905027, + "grad_norm": 1.607211947441101, + "learning_rate": 3.044053167595566e-06, + "loss": 0.9617, + "step": 2829 + }, + { + "epoch": 2.635009310986965, + "grad_norm": 1.6082310676574707, + "learning_rate": 3.0428447548549466e-06, + "loss": 0.9552, + "step": 2830 + }, + { + "epoch": 2.6359404096834265, + "grad_norm": 1.5937196016311646, + "learning_rate": 3.041636208992572e-06, + "loss": 0.9673, + "step": 2831 + }, + { + "epoch": 2.636871508379888, + "grad_norm": 1.6141903400421143, + "learning_rate": 3.0404275303048152e-06, + "loss": 0.9033, + "step": 2832 + }, + { + "epoch": 2.63780260707635, + "grad_norm": 1.6257685422897339, + "learning_rate": 3.0392187190880786e-06, + "loss": 0.9767, + "step": 2833 + }, + { + "epoch": 2.638733705772812, + "grad_norm": 1.5968188047409058, + "learning_rate": 3.0380097756387996e-06, + "loss": 0.9525, + "step": 2834 + }, + { + "epoch": 2.6396648044692737, + "grad_norm": 1.6123281717300415, + "learning_rate": 3.0368007002534474e-06, + "loss": 0.9457, + "step": 2835 + }, + { + "epoch": 2.6405959031657353, + "grad_norm": 1.6483839750289917, + "learning_rate": 3.035591493228523e-06, + "loss": 0.95, + "step": 2836 + }, + { + "epoch": 2.6415270018621975, + "grad_norm": 1.6793286800384521, + "learning_rate": 3.03438215486056e-06, + "loss": 0.965, + "step": 2837 + }, + { + "epoch": 2.642458100558659, + "grad_norm": 1.690168023109436, + "learning_rate": 3.033172685446125e-06, + "loss": 0.9667, + "step": 2838 + }, + { + "epoch": 2.643389199255121, + "grad_norm": 1.640297770500183, + "learning_rate": 3.031963085281816e-06, + "loss": 0.9258, + "step": 2839 + }, + { + "epoch": 2.644320297951583, + "grad_norm": 1.7079721689224243, + "learning_rate": 3.030753354664262e-06, + "loss": 0.9913, + "step": 2840 + }, + { + "epoch": 2.6452513966480447, + "grad_norm": 1.5586864948272705, + "learning_rate": 3.0295434938901263e-06, + "loss": 0.9342, + "step": 2841 + }, + { + "epoch": 2.6461824953445063, + "grad_norm": 1.6786353588104248, + "learning_rate": 3.028333503256103e-06, + "loss": 0.9496, + "step": 2842 + }, + { + "epoch": 2.6471135940409685, + "grad_norm": 1.613917350769043, + "learning_rate": 3.0271233830589162e-06, + "loss": 0.9445, + "step": 2843 + }, + { + "epoch": 2.64804469273743, + "grad_norm": 1.6379549503326416, + "learning_rate": 3.025913133595325e-06, + "loss": 0.9509, + "step": 2844 + }, + { + "epoch": 2.648975791433892, + "grad_norm": 1.6311167478561401, + "learning_rate": 3.0247027551621187e-06, + "loss": 0.9568, + "step": 2845 + }, + { + "epoch": 2.649906890130354, + "grad_norm": 1.5794034004211426, + "learning_rate": 3.0234922480561187e-06, + "loss": 0.9651, + "step": 2846 + }, + { + "epoch": 2.6508379888268156, + "grad_norm": 1.5947290658950806, + "learning_rate": 3.022281612574176e-06, + "loss": 0.9511, + "step": 2847 + }, + { + "epoch": 2.6517690875232773, + "grad_norm": 1.6354261636734009, + "learning_rate": 3.021070849013176e-06, + "loss": 0.9405, + "step": 2848 + }, + { + "epoch": 2.6527001862197395, + "grad_norm": 1.6305598020553589, + "learning_rate": 3.019859957670034e-06, + "loss": 0.9263, + "step": 2849 + }, + { + "epoch": 2.653631284916201, + "grad_norm": 1.6536580324172974, + "learning_rate": 3.018648938841695e-06, + "loss": 0.9455, + "step": 2850 + }, + { + "epoch": 2.654562383612663, + "grad_norm": 1.6688307523727417, + "learning_rate": 3.0174377928251392e-06, + "loss": 1.0027, + "step": 2851 + }, + { + "epoch": 2.655493482309125, + "grad_norm": 1.6771334409713745, + "learning_rate": 3.016226519917374e-06, + "loss": 0.9978, + "step": 2852 + }, + { + "epoch": 2.6564245810055866, + "grad_norm": 1.6501225233078003, + "learning_rate": 3.0150151204154423e-06, + "loss": 0.9535, + "step": 2853 + }, + { + "epoch": 2.6573556797020483, + "grad_norm": 1.6267509460449219, + "learning_rate": 3.0138035946164125e-06, + "loss": 1.0055, + "step": 2854 + }, + { + "epoch": 2.6582867783985105, + "grad_norm": 1.6203352212905884, + "learning_rate": 3.0125919428173876e-06, + "loss": 0.9498, + "step": 2855 + }, + { + "epoch": 2.659217877094972, + "grad_norm": 1.6645296812057495, + "learning_rate": 3.011380165315503e-06, + "loss": 0.9502, + "step": 2856 + }, + { + "epoch": 2.660148975791434, + "grad_norm": 1.5809940099716187, + "learning_rate": 3.010168262407919e-06, + "loss": 0.9448, + "step": 2857 + }, + { + "epoch": 2.661080074487896, + "grad_norm": 1.5849182605743408, + "learning_rate": 3.008956234391835e-06, + "loss": 0.9243, + "step": 2858 + }, + { + "epoch": 2.6620111731843576, + "grad_norm": 1.621230125427246, + "learning_rate": 3.0077440815644722e-06, + "loss": 0.957, + "step": 2859 + }, + { + "epoch": 2.6629422718808193, + "grad_norm": 1.6482765674591064, + "learning_rate": 3.00653180422309e-06, + "loss": 0.9411, + "step": 2860 + }, + { + "epoch": 2.6638733705772815, + "grad_norm": 1.5941625833511353, + "learning_rate": 3.005319402664973e-06, + "loss": 0.9388, + "step": 2861 + }, + { + "epoch": 2.664804469273743, + "grad_norm": 1.6318464279174805, + "learning_rate": 3.0041068771874387e-06, + "loss": 0.958, + "step": 2862 + }, + { + "epoch": 2.665735567970205, + "grad_norm": 1.6854920387268066, + "learning_rate": 3.0028942280878347e-06, + "loss": 0.9576, + "step": 2863 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.6749870777130127, + "learning_rate": 3.0016814556635387e-06, + "loss": 0.9924, + "step": 2864 + }, + { + "epoch": 2.6675977653631286, + "grad_norm": 1.5535457134246826, + "learning_rate": 3.00046856021196e-06, + "loss": 0.95, + "step": 2865 + }, + { + "epoch": 2.6685288640595903, + "grad_norm": 1.5978206396102905, + "learning_rate": 2.999255542030534e-06, + "loss": 0.9508, + "step": 2866 + }, + { + "epoch": 2.669459962756052, + "grad_norm": 1.6399273872375488, + "learning_rate": 2.998042401416732e-06, + "loss": 0.9616, + "step": 2867 + }, + { + "epoch": 2.6703910614525137, + "grad_norm": 1.5954216718673706, + "learning_rate": 2.9968291386680505e-06, + "loss": 0.9421, + "step": 2868 + }, + { + "epoch": 2.671322160148976, + "grad_norm": 1.7154549360275269, + "learning_rate": 2.9956157540820186e-06, + "loss": 1.0109, + "step": 2869 + }, + { + "epoch": 2.6722532588454375, + "grad_norm": 1.7107621431350708, + "learning_rate": 2.994402247956194e-06, + "loss": 1.0035, + "step": 2870 + }, + { + "epoch": 2.673184357541899, + "grad_norm": 1.6286007165908813, + "learning_rate": 2.9931886205881642e-06, + "loss": 0.9785, + "step": 2871 + }, + { + "epoch": 2.6741154562383613, + "grad_norm": 1.6210182905197144, + "learning_rate": 2.9919748722755485e-06, + "loss": 0.9403, + "step": 2872 + }, + { + "epoch": 2.675046554934823, + "grad_norm": 1.6527222394943237, + "learning_rate": 2.9907610033159927e-06, + "loss": 0.9398, + "step": 2873 + }, + { + "epoch": 2.6759776536312847, + "grad_norm": 1.6232794523239136, + "learning_rate": 2.989547014007175e-06, + "loss": 0.9371, + "step": 2874 + }, + { + "epoch": 2.676908752327747, + "grad_norm": 1.618330478668213, + "learning_rate": 2.9883329046468e-06, + "loss": 0.951, + "step": 2875 + }, + { + "epoch": 2.6778398510242085, + "grad_norm": 1.6865403652191162, + "learning_rate": 2.987118675532606e-06, + "loss": 0.975, + "step": 2876 + }, + { + "epoch": 2.67877094972067, + "grad_norm": 1.668399453163147, + "learning_rate": 2.985904326962357e-06, + "loss": 0.9715, + "step": 2877 + }, + { + "epoch": 2.6797020484171323, + "grad_norm": 1.7188880443572998, + "learning_rate": 2.9846898592338465e-06, + "loss": 0.9705, + "step": 2878 + }, + { + "epoch": 2.680633147113594, + "grad_norm": 1.6627426147460938, + "learning_rate": 2.9834752726449e-06, + "loss": 0.9891, + "step": 2879 + }, + { + "epoch": 2.6815642458100557, + "grad_norm": 1.6442075967788696, + "learning_rate": 2.9822605674933696e-06, + "loss": 0.9561, + "step": 2880 + }, + { + "epoch": 2.682495344506518, + "grad_norm": 1.6207232475280762, + "learning_rate": 2.9810457440771374e-06, + "loss": 0.9354, + "step": 2881 + }, + { + "epoch": 2.6834264432029795, + "grad_norm": 1.6638420820236206, + "learning_rate": 2.9798308026941147e-06, + "loss": 0.9404, + "step": 2882 + }, + { + "epoch": 2.684357541899441, + "grad_norm": 1.5670677423477173, + "learning_rate": 2.9786157436422413e-06, + "loss": 0.9166, + "step": 2883 + }, + { + "epoch": 2.6852886405959033, + "grad_norm": 1.6316684484481812, + "learning_rate": 2.9774005672194854e-06, + "loss": 0.9695, + "step": 2884 + }, + { + "epoch": 2.686219739292365, + "grad_norm": 1.5999932289123535, + "learning_rate": 2.9761852737238462e-06, + "loss": 0.9539, + "step": 2885 + }, + { + "epoch": 2.6871508379888267, + "grad_norm": 1.797905445098877, + "learning_rate": 2.9749698634533476e-06, + "loss": 0.9529, + "step": 2886 + }, + { + "epoch": 2.688081936685289, + "grad_norm": 1.74045729637146, + "learning_rate": 2.9737543367060475e-06, + "loss": 0.9387, + "step": 2887 + }, + { + "epoch": 2.6890130353817505, + "grad_norm": 1.6801518201828003, + "learning_rate": 2.9725386937800254e-06, + "loss": 0.9607, + "step": 2888 + }, + { + "epoch": 2.689944134078212, + "grad_norm": 1.661042332649231, + "learning_rate": 2.9713229349733965e-06, + "loss": 0.9621, + "step": 2889 + }, + { + "epoch": 2.6908752327746743, + "grad_norm": 1.6822454929351807, + "learning_rate": 2.9701070605843e-06, + "loss": 0.9812, + "step": 2890 + }, + { + "epoch": 2.691806331471136, + "grad_norm": 1.7273681163787842, + "learning_rate": 2.9688910709109052e-06, + "loss": 0.9304, + "step": 2891 + }, + { + "epoch": 2.6927374301675977, + "grad_norm": 1.7504971027374268, + "learning_rate": 2.967674966251409e-06, + "loss": 0.9818, + "step": 2892 + }, + { + "epoch": 2.69366852886406, + "grad_norm": 1.7233527898788452, + "learning_rate": 2.966458746904036e-06, + "loss": 0.9391, + "step": 2893 + }, + { + "epoch": 2.6945996275605215, + "grad_norm": 1.5805985927581787, + "learning_rate": 2.9652424131670404e-06, + "loss": 0.923, + "step": 2894 + }, + { + "epoch": 2.695530726256983, + "grad_norm": 1.5961047410964966, + "learning_rate": 2.964025965338702e-06, + "loss": 0.947, + "step": 2895 + }, + { + "epoch": 2.6964618249534453, + "grad_norm": 1.6449295282363892, + "learning_rate": 2.962809403717332e-06, + "loss": 0.9858, + "step": 2896 + }, + { + "epoch": 2.697392923649907, + "grad_norm": 1.668685793876648, + "learning_rate": 2.9615927286012664e-06, + "loss": 0.9773, + "step": 2897 + }, + { + "epoch": 2.6983240223463687, + "grad_norm": 1.6844929456710815, + "learning_rate": 2.960375940288871e-06, + "loss": 0.9953, + "step": 2898 + }, + { + "epoch": 2.6992551210428304, + "grad_norm": 1.6631215810775757, + "learning_rate": 2.959159039078539e-06, + "loss": 0.9509, + "step": 2899 + }, + { + "epoch": 2.7001862197392925, + "grad_norm": 1.6469553709030151, + "learning_rate": 2.957942025268689e-06, + "loss": 0.9356, + "step": 2900 + }, + { + "epoch": 2.701117318435754, + "grad_norm": 1.6841636896133423, + "learning_rate": 2.956724899157772e-06, + "loss": 0.9985, + "step": 2901 + }, + { + "epoch": 2.702048417132216, + "grad_norm": 1.6484923362731934, + "learning_rate": 2.9555076610442605e-06, + "loss": 0.9745, + "step": 2902 + }, + { + "epoch": 2.7029795158286776, + "grad_norm": 1.6595820188522339, + "learning_rate": 2.9542903112266613e-06, + "loss": 0.9586, + "step": 2903 + }, + { + "epoch": 2.7039106145251397, + "grad_norm": 1.7325596809387207, + "learning_rate": 2.953072850003502e-06, + "loss": 0.9571, + "step": 2904 + }, + { + "epoch": 2.7048417132216014, + "grad_norm": 1.6623222827911377, + "learning_rate": 2.9518552776733416e-06, + "loss": 1.0077, + "step": 2905 + }, + { + "epoch": 2.705772811918063, + "grad_norm": 1.6257872581481934, + "learning_rate": 2.950637594534765e-06, + "loss": 0.9116, + "step": 2906 + }, + { + "epoch": 2.706703910614525, + "grad_norm": 1.6201913356781006, + "learning_rate": 2.9494198008863843e-06, + "loss": 0.9217, + "step": 2907 + }, + { + "epoch": 2.707635009310987, + "grad_norm": 1.6633226871490479, + "learning_rate": 2.9482018970268395e-06, + "loss": 0.9638, + "step": 2908 + }, + { + "epoch": 2.7085661080074486, + "grad_norm": 1.828963041305542, + "learning_rate": 2.9469838832547964e-06, + "loss": 0.9927, + "step": 2909 + }, + { + "epoch": 2.7094972067039107, + "grad_norm": 1.6324316263198853, + "learning_rate": 2.9457657598689493e-06, + "loss": 0.9536, + "step": 2910 + }, + { + "epoch": 2.7104283054003724, + "grad_norm": 1.6936625242233276, + "learning_rate": 2.9445475271680175e-06, + "loss": 0.9468, + "step": 2911 + }, + { + "epoch": 2.711359404096834, + "grad_norm": 1.6941533088684082, + "learning_rate": 2.9433291854507483e-06, + "loss": 0.9363, + "step": 2912 + }, + { + "epoch": 2.712290502793296, + "grad_norm": 1.643308401107788, + "learning_rate": 2.9421107350159156e-06, + "loss": 0.9686, + "step": 2913 + }, + { + "epoch": 2.713221601489758, + "grad_norm": 1.685700535774231, + "learning_rate": 2.94089217616232e-06, + "loss": 0.9778, + "step": 2914 + }, + { + "epoch": 2.7141527001862196, + "grad_norm": 1.6530426740646362, + "learning_rate": 2.9396735091887883e-06, + "loss": 0.9866, + "step": 2915 + }, + { + "epoch": 2.7150837988826817, + "grad_norm": 1.6582716703414917, + "learning_rate": 2.9384547343941742e-06, + "loss": 0.9543, + "step": 2916 + }, + { + "epoch": 2.7160148975791434, + "grad_norm": 1.6577575206756592, + "learning_rate": 2.9372358520773575e-06, + "loss": 0.9201, + "step": 2917 + }, + { + "epoch": 2.716945996275605, + "grad_norm": 1.6079766750335693, + "learning_rate": 2.936016862537245e-06, + "loss": 0.9468, + "step": 2918 + }, + { + "epoch": 2.717877094972067, + "grad_norm": 1.615461826324463, + "learning_rate": 2.934797766072769e-06, + "loss": 0.9059, + "step": 2919 + }, + { + "epoch": 2.718808193668529, + "grad_norm": 1.6259170770645142, + "learning_rate": 2.933578562982888e-06, + "loss": 0.9722, + "step": 2920 + }, + { + "epoch": 2.7197392923649906, + "grad_norm": 1.6534541845321655, + "learning_rate": 2.932359253566588e-06, + "loss": 0.9857, + "step": 2921 + }, + { + "epoch": 2.7206703910614527, + "grad_norm": 1.5887418985366821, + "learning_rate": 2.931139838122879e-06, + "loss": 0.8908, + "step": 2922 + }, + { + "epoch": 2.7216014897579144, + "grad_norm": 1.7287448644638062, + "learning_rate": 2.929920316950799e-06, + "loss": 0.9871, + "step": 2923 + }, + { + "epoch": 2.722532588454376, + "grad_norm": 1.6595737934112549, + "learning_rate": 2.928700690349411e-06, + "loss": 0.968, + "step": 2924 + }, + { + "epoch": 2.723463687150838, + "grad_norm": 1.8576184511184692, + "learning_rate": 2.9274809586178026e-06, + "loss": 0.9522, + "step": 2925 + }, + { + "epoch": 2.7243947858473, + "grad_norm": 1.6479943990707397, + "learning_rate": 2.9262611220550906e-06, + "loss": 0.9813, + "step": 2926 + }, + { + "epoch": 2.7253258845437616, + "grad_norm": 1.6889318227767944, + "learning_rate": 2.9250411809604136e-06, + "loss": 0.979, + "step": 2927 + }, + { + "epoch": 2.7262569832402237, + "grad_norm": 1.651171326637268, + "learning_rate": 2.923821135632938e-06, + "loss": 0.9693, + "step": 2928 + }, + { + "epoch": 2.7271880819366854, + "grad_norm": 1.6054799556732178, + "learning_rate": 2.922600986371856e-06, + "loss": 0.9596, + "step": 2929 + }, + { + "epoch": 2.728119180633147, + "grad_norm": 1.677786111831665, + "learning_rate": 2.9213807334763857e-06, + "loss": 0.949, + "step": 2930 + }, + { + "epoch": 2.729050279329609, + "grad_norm": 1.6429251432418823, + "learning_rate": 2.920160377245766e-06, + "loss": 0.9775, + "step": 2931 + }, + { + "epoch": 2.729981378026071, + "grad_norm": 1.6588455438613892, + "learning_rate": 2.9189399179792675e-06, + "loss": 0.9177, + "step": 2932 + }, + { + "epoch": 2.7309124767225326, + "grad_norm": 1.6935231685638428, + "learning_rate": 2.917719355976183e-06, + "loss": 0.9392, + "step": 2933 + }, + { + "epoch": 2.7318435754189943, + "grad_norm": 1.578336477279663, + "learning_rate": 2.91649869153583e-06, + "loss": 0.9171, + "step": 2934 + }, + { + "epoch": 2.7327746741154564, + "grad_norm": 1.6603213548660278, + "learning_rate": 2.9152779249575536e-06, + "loss": 0.9416, + "step": 2935 + }, + { + "epoch": 2.733705772811918, + "grad_norm": 1.6804225444793701, + "learning_rate": 2.9140570565407194e-06, + "loss": 0.9519, + "step": 2936 + }, + { + "epoch": 2.7346368715083798, + "grad_norm": 1.6281383037567139, + "learning_rate": 2.9128360865847235e-06, + "loss": 0.9511, + "step": 2937 + }, + { + "epoch": 2.7355679702048414, + "grad_norm": 1.6681561470031738, + "learning_rate": 2.911615015388982e-06, + "loss": 0.9447, + "step": 2938 + }, + { + "epoch": 2.7364990689013036, + "grad_norm": 1.6646918058395386, + "learning_rate": 2.91039384325294e-06, + "loss": 0.9626, + "step": 2939 + }, + { + "epoch": 2.7374301675977653, + "grad_norm": 1.6071377992630005, + "learning_rate": 2.9091725704760637e-06, + "loss": 0.9286, + "step": 2940 + }, + { + "epoch": 2.738361266294227, + "grad_norm": 1.5857560634613037, + "learning_rate": 2.9079511973578467e-06, + "loss": 0.9536, + "step": 2941 + }, + { + "epoch": 2.739292364990689, + "grad_norm": 1.610556721687317, + "learning_rate": 2.906729724197807e-06, + "loss": 0.9714, + "step": 2942 + }, + { + "epoch": 2.7402234636871508, + "grad_norm": 1.5707459449768066, + "learning_rate": 2.905508151295484e-06, + "loss": 0.9564, + "step": 2943 + }, + { + "epoch": 2.7411545623836124, + "grad_norm": 1.5549044609069824, + "learning_rate": 2.9042864789504465e-06, + "loss": 0.9252, + "step": 2944 + }, + { + "epoch": 2.7420856610800746, + "grad_norm": 1.581815242767334, + "learning_rate": 2.9030647074622824e-06, + "loss": 0.9302, + "step": 2945 + }, + { + "epoch": 2.7430167597765363, + "grad_norm": 1.6523064374923706, + "learning_rate": 2.9018428371306097e-06, + "loss": 0.9348, + "step": 2946 + }, + { + "epoch": 2.743947858472998, + "grad_norm": 1.6628508567810059, + "learning_rate": 2.900620868255064e-06, + "loss": 0.9592, + "step": 2947 + }, + { + "epoch": 2.74487895716946, + "grad_norm": 1.6209479570388794, + "learning_rate": 2.8993988011353113e-06, + "loss": 0.9587, + "step": 2948 + }, + { + "epoch": 2.7458100558659218, + "grad_norm": 1.6480685472488403, + "learning_rate": 2.8981766360710377e-06, + "loss": 0.9909, + "step": 2949 + }, + { + "epoch": 2.7467411545623834, + "grad_norm": 1.6479328870773315, + "learning_rate": 2.8969543733619553e-06, + "loss": 0.9748, + "step": 2950 + }, + { + "epoch": 2.7476722532588456, + "grad_norm": 1.6769737005233765, + "learning_rate": 2.8957320133077987e-06, + "loss": 0.9729, + "step": 2951 + }, + { + "epoch": 2.7486033519553073, + "grad_norm": 1.6814030408859253, + "learning_rate": 2.894509556208327e-06, + "loss": 0.9305, + "step": 2952 + }, + { + "epoch": 2.749534450651769, + "grad_norm": 1.6598000526428223, + "learning_rate": 2.893287002363324e-06, + "loss": 0.9597, + "step": 2953 + }, + { + "epoch": 2.750465549348231, + "grad_norm": 1.5595886707305908, + "learning_rate": 2.8920643520725967e-06, + "loss": 0.9185, + "step": 2954 + }, + { + "epoch": 2.7513966480446927, + "grad_norm": 1.6225324869155884, + "learning_rate": 2.8908416056359743e-06, + "loss": 0.9801, + "step": 2955 + }, + { + "epoch": 2.7523277467411544, + "grad_norm": 1.6058166027069092, + "learning_rate": 2.8896187633533112e-06, + "loss": 0.9456, + "step": 2956 + }, + { + "epoch": 2.7532588454376166, + "grad_norm": 1.6955596208572388, + "learning_rate": 2.8883958255244855e-06, + "loss": 0.9439, + "step": 2957 + }, + { + "epoch": 2.7541899441340782, + "grad_norm": 1.7024625539779663, + "learning_rate": 2.8871727924493976e-06, + "loss": 0.95, + "step": 2958 + }, + { + "epoch": 2.75512104283054, + "grad_norm": 1.6556583642959595, + "learning_rate": 2.885949664427972e-06, + "loss": 0.9536, + "step": 2959 + }, + { + "epoch": 2.756052141527002, + "grad_norm": 1.6698493957519531, + "learning_rate": 2.884726441760155e-06, + "loss": 0.9951, + "step": 2960 + }, + { + "epoch": 2.7569832402234637, + "grad_norm": 1.6347700357437134, + "learning_rate": 2.8835031247459187e-06, + "loss": 0.9715, + "step": 2961 + }, + { + "epoch": 2.7579143389199254, + "grad_norm": 1.6315240859985352, + "learning_rate": 2.882279713685257e-06, + "loss": 0.9595, + "step": 2962 + }, + { + "epoch": 2.7588454376163876, + "grad_norm": 1.6313539743423462, + "learning_rate": 2.881056208878186e-06, + "loss": 0.9395, + "step": 2963 + }, + { + "epoch": 2.7597765363128492, + "grad_norm": 1.6100709438323975, + "learning_rate": 2.8798326106247472e-06, + "loss": 0.9141, + "step": 2964 + }, + { + "epoch": 2.760707635009311, + "grad_norm": 1.6138430833816528, + "learning_rate": 2.878608919225001e-06, + "loss": 0.9412, + "step": 2965 + }, + { + "epoch": 2.761638733705773, + "grad_norm": 1.6096676588058472, + "learning_rate": 2.8773851349790357e-06, + "loss": 0.9372, + "step": 2966 + }, + { + "epoch": 2.7625698324022347, + "grad_norm": 1.608506441116333, + "learning_rate": 2.876161258186958e-06, + "loss": 0.9344, + "step": 2967 + }, + { + "epoch": 2.7635009310986964, + "grad_norm": 1.6560670137405396, + "learning_rate": 2.8749372891488998e-06, + "loss": 0.9043, + "step": 2968 + }, + { + "epoch": 2.7644320297951586, + "grad_norm": 1.609519362449646, + "learning_rate": 2.873713228165014e-06, + "loss": 0.9625, + "step": 2969 + }, + { + "epoch": 2.7653631284916202, + "grad_norm": 1.5880539417266846, + "learning_rate": 2.8724890755354784e-06, + "loss": 0.9201, + "step": 2970 + }, + { + "epoch": 2.766294227188082, + "grad_norm": 1.6771913766860962, + "learning_rate": 2.8712648315604905e-06, + "loss": 0.9694, + "step": 2971 + }, + { + "epoch": 2.7672253258845436, + "grad_norm": 1.6371620893478394, + "learning_rate": 2.8700404965402728e-06, + "loss": 0.9564, + "step": 2972 + }, + { + "epoch": 2.7681564245810057, + "grad_norm": 1.6186131238937378, + "learning_rate": 2.8688160707750678e-06, + "loss": 0.9439, + "step": 2973 + }, + { + "epoch": 2.7690875232774674, + "grad_norm": 1.6411898136138916, + "learning_rate": 2.867591554565141e-06, + "loss": 0.9422, + "step": 2974 + }, + { + "epoch": 2.770018621973929, + "grad_norm": 1.6392754316329956, + "learning_rate": 2.866366948210781e-06, + "loss": 0.9719, + "step": 2975 + }, + { + "epoch": 2.770949720670391, + "grad_norm": 1.6504026651382446, + "learning_rate": 2.865142252012298e-06, + "loss": 0.942, + "step": 2976 + }, + { + "epoch": 2.771880819366853, + "grad_norm": 1.5859071016311646, + "learning_rate": 2.863917466270024e-06, + "loss": 0.9234, + "step": 2977 + }, + { + "epoch": 2.7728119180633146, + "grad_norm": 1.6110219955444336, + "learning_rate": 2.862692591284313e-06, + "loss": 0.9222, + "step": 2978 + }, + { + "epoch": 2.7737430167597763, + "grad_norm": 1.6395645141601562, + "learning_rate": 2.861467627355541e-06, + "loss": 0.9906, + "step": 2979 + }, + { + "epoch": 2.7746741154562384, + "grad_norm": 1.720544695854187, + "learning_rate": 2.8602425747841054e-06, + "loss": 0.9639, + "step": 2980 + }, + { + "epoch": 2.7756052141527, + "grad_norm": 1.661580204963684, + "learning_rate": 2.8590174338704256e-06, + "loss": 0.9635, + "step": 2981 + }, + { + "epoch": 2.776536312849162, + "grad_norm": 1.5911599397659302, + "learning_rate": 2.8577922049149443e-06, + "loss": 0.9593, + "step": 2982 + }, + { + "epoch": 2.777467411545624, + "grad_norm": 1.6180620193481445, + "learning_rate": 2.856566888218121e-06, + "loss": 0.9348, + "step": 2983 + }, + { + "epoch": 2.7783985102420856, + "grad_norm": 1.6100971698760986, + "learning_rate": 2.8553414840804446e-06, + "loss": 0.9376, + "step": 2984 + }, + { + "epoch": 2.7793296089385473, + "grad_norm": 1.6387910842895508, + "learning_rate": 2.8541159928024167e-06, + "loss": 0.9479, + "step": 2985 + }, + { + "epoch": 2.7802607076350094, + "grad_norm": 1.654554009437561, + "learning_rate": 2.8528904146845652e-06, + "loss": 0.9996, + "step": 2986 + }, + { + "epoch": 2.781191806331471, + "grad_norm": 1.6798985004425049, + "learning_rate": 2.851664750027441e-06, + "loss": 0.966, + "step": 2987 + }, + { + "epoch": 2.782122905027933, + "grad_norm": 1.6659077405929565, + "learning_rate": 2.8504389991316107e-06, + "loss": 0.9521, + "step": 2988 + }, + { + "epoch": 2.783054003724395, + "grad_norm": 1.6424578428268433, + "learning_rate": 2.849213162297667e-06, + "loss": 0.9747, + "step": 2989 + }, + { + "epoch": 2.7839851024208566, + "grad_norm": 1.6100014448165894, + "learning_rate": 2.8479872398262197e-06, + "loss": 0.941, + "step": 2990 + }, + { + "epoch": 2.7849162011173183, + "grad_norm": 1.6316558122634888, + "learning_rate": 2.8467612320179046e-06, + "loss": 0.9519, + "step": 2991 + }, + { + "epoch": 2.7858472998137804, + "grad_norm": 1.676816701889038, + "learning_rate": 2.8455351391733726e-06, + "loss": 1.0057, + "step": 2992 + }, + { + "epoch": 2.786778398510242, + "grad_norm": 1.6258941888809204, + "learning_rate": 2.8443089615933002e-06, + "loss": 0.9546, + "step": 2993 + }, + { + "epoch": 2.787709497206704, + "grad_norm": 1.622909426689148, + "learning_rate": 2.843082699578381e-06, + "loss": 0.9706, + "step": 2994 + }, + { + "epoch": 2.788640595903166, + "grad_norm": 1.6182568073272705, + "learning_rate": 2.841856353429332e-06, + "loss": 0.9454, + "step": 2995 + }, + { + "epoch": 2.7895716945996276, + "grad_norm": 1.6602880954742432, + "learning_rate": 2.8406299234468914e-06, + "loss": 0.9623, + "step": 2996 + }, + { + "epoch": 2.7905027932960893, + "grad_norm": 1.587006688117981, + "learning_rate": 2.839403409931814e-06, + "loss": 0.9345, + "step": 2997 + }, + { + "epoch": 2.7914338919925514, + "grad_norm": 1.6170670986175537, + "learning_rate": 2.8381768131848796e-06, + "loss": 0.9874, + "step": 2998 + }, + { + "epoch": 2.792364990689013, + "grad_norm": 1.6449601650238037, + "learning_rate": 2.836950133506885e-06, + "loss": 0.9658, + "step": 2999 + }, + { + "epoch": 2.793296089385475, + "grad_norm": 1.6289868354797363, + "learning_rate": 2.8357233711986487e-06, + "loss": 0.9559, + "step": 3000 + }, + { + "epoch": 2.794227188081937, + "grad_norm": 1.611748218536377, + "learning_rate": 2.8344965265610107e-06, + "loss": 0.9572, + "step": 3001 + }, + { + "epoch": 2.7951582867783986, + "grad_norm": 1.5705057382583618, + "learning_rate": 2.833269599894829e-06, + "loss": 0.9514, + "step": 3002 + }, + { + "epoch": 2.7960893854748603, + "grad_norm": 1.5393644571304321, + "learning_rate": 2.8320425915009825e-06, + "loss": 0.9308, + "step": 3003 + }, + { + "epoch": 2.7970204841713224, + "grad_norm": 1.6054822206497192, + "learning_rate": 2.8308155016803706e-06, + "loss": 0.9441, + "step": 3004 + }, + { + "epoch": 2.797951582867784, + "grad_norm": 1.6108616590499878, + "learning_rate": 2.829588330733913e-06, + "loss": 0.9299, + "step": 3005 + }, + { + "epoch": 2.798882681564246, + "grad_norm": 1.6474404335021973, + "learning_rate": 2.8283610789625483e-06, + "loss": 0.9654, + "step": 3006 + }, + { + "epoch": 2.7998137802607075, + "grad_norm": 1.5784049034118652, + "learning_rate": 2.8271337466672343e-06, + "loss": 0.9391, + "step": 3007 + }, + { + "epoch": 2.8007448789571696, + "grad_norm": 1.6093735694885254, + "learning_rate": 2.8259063341489514e-06, + "loss": 0.9102, + "step": 3008 + }, + { + "epoch": 2.8016759776536313, + "grad_norm": 1.6307052373886108, + "learning_rate": 2.8246788417086964e-06, + "loss": 0.9106, + "step": 3009 + }, + { + "epoch": 2.802607076350093, + "grad_norm": 1.6352678537368774, + "learning_rate": 2.8234512696474875e-06, + "loss": 0.9917, + "step": 3010 + }, + { + "epoch": 2.8035381750465547, + "grad_norm": 1.594149112701416, + "learning_rate": 2.8222236182663624e-06, + "loss": 0.9294, + "step": 3011 + }, + { + "epoch": 2.804469273743017, + "grad_norm": 1.5843313932418823, + "learning_rate": 2.820995887866378e-06, + "loss": 0.9624, + "step": 3012 + }, + { + "epoch": 2.8054003724394785, + "grad_norm": 1.559574007987976, + "learning_rate": 2.819768078748609e-06, + "loss": 0.9146, + "step": 3013 + }, + { + "epoch": 2.80633147113594, + "grad_norm": 1.6503013372421265, + "learning_rate": 2.8185401912141532e-06, + "loss": 0.9558, + "step": 3014 + }, + { + "epoch": 2.8072625698324023, + "grad_norm": 1.6319999694824219, + "learning_rate": 2.8173122255641234e-06, + "loss": 0.9347, + "step": 3015 + }, + { + "epoch": 2.808193668528864, + "grad_norm": 1.638600468635559, + "learning_rate": 2.8160841820996547e-06, + "loss": 0.9692, + "step": 3016 + }, + { + "epoch": 2.8091247672253257, + "grad_norm": 1.6471275091171265, + "learning_rate": 2.8148560611218987e-06, + "loss": 0.9525, + "step": 3017 + }, + { + "epoch": 2.810055865921788, + "grad_norm": 1.6110758781433105, + "learning_rate": 2.8136278629320294e-06, + "loss": 0.9576, + "step": 3018 + }, + { + "epoch": 2.8109869646182495, + "grad_norm": 1.6616575717926025, + "learning_rate": 2.8123995878312356e-06, + "loss": 0.9588, + "step": 3019 + }, + { + "epoch": 2.811918063314711, + "grad_norm": 1.6166157722473145, + "learning_rate": 2.811171236120728e-06, + "loss": 0.944, + "step": 3020 + }, + { + "epoch": 2.8128491620111733, + "grad_norm": 1.6271874904632568, + "learning_rate": 2.8099428081017353e-06, + "loss": 0.9796, + "step": 3021 + }, + { + "epoch": 2.813780260707635, + "grad_norm": 1.61025869846344, + "learning_rate": 2.808714304075505e-06, + "loss": 0.9339, + "step": 3022 + }, + { + "epoch": 2.8147113594040967, + "grad_norm": 1.6462604999542236, + "learning_rate": 2.807485724343303e-06, + "loss": 0.9791, + "step": 3023 + }, + { + "epoch": 2.815642458100559, + "grad_norm": 1.6522613763809204, + "learning_rate": 2.806257069206412e-06, + "loss": 0.963, + "step": 3024 + }, + { + "epoch": 2.8165735567970205, + "grad_norm": 1.6467231512069702, + "learning_rate": 2.805028338966137e-06, + "loss": 1.001, + "step": 3025 + }, + { + "epoch": 2.817504655493482, + "grad_norm": 1.604615569114685, + "learning_rate": 2.803799533923798e-06, + "loss": 0.9258, + "step": 3026 + }, + { + "epoch": 2.8184357541899443, + "grad_norm": 1.6463779211044312, + "learning_rate": 2.8025706543807364e-06, + "loss": 0.9679, + "step": 3027 + }, + { + "epoch": 2.819366852886406, + "grad_norm": 1.6304740905761719, + "learning_rate": 2.8013417006383078e-06, + "loss": 0.9912, + "step": 3028 + }, + { + "epoch": 2.8202979515828677, + "grad_norm": 1.6669981479644775, + "learning_rate": 2.8001126729978907e-06, + "loss": 0.9279, + "step": 3029 + }, + { + "epoch": 2.82122905027933, + "grad_norm": 1.5770384073257446, + "learning_rate": 2.7988835717608785e-06, + "loss": 0.9259, + "step": 3030 + }, + { + "epoch": 2.8221601489757915, + "grad_norm": 1.6221460103988647, + "learning_rate": 2.7976543972286824e-06, + "loss": 0.9441, + "step": 3031 + }, + { + "epoch": 2.823091247672253, + "grad_norm": 1.6051779985427856, + "learning_rate": 2.796425149702735e-06, + "loss": 0.9333, + "step": 3032 + }, + { + "epoch": 2.8240223463687153, + "grad_norm": 1.6150850057601929, + "learning_rate": 2.795195829484483e-06, + "loss": 0.9316, + "step": 3033 + }, + { + "epoch": 2.824953445065177, + "grad_norm": 1.7130149602890015, + "learning_rate": 2.7939664368753925e-06, + "loss": 0.9654, + "step": 3034 + }, + { + "epoch": 2.8258845437616387, + "grad_norm": 1.7591301202774048, + "learning_rate": 2.792736972176948e-06, + "loss": 0.9578, + "step": 3035 + }, + { + "epoch": 2.826815642458101, + "grad_norm": 1.6858676671981812, + "learning_rate": 2.79150743569065e-06, + "loss": 0.923, + "step": 3036 + }, + { + "epoch": 2.8277467411545625, + "grad_norm": 1.6735764741897583, + "learning_rate": 2.7902778277180187e-06, + "loss": 0.9377, + "step": 3037 + }, + { + "epoch": 2.828677839851024, + "grad_norm": 1.6521159410476685, + "learning_rate": 2.7890481485605898e-06, + "loss": 0.948, + "step": 3038 + }, + { + "epoch": 2.8296089385474863, + "grad_norm": 1.6810277700424194, + "learning_rate": 2.787818398519918e-06, + "loss": 0.9623, + "step": 3039 + }, + { + "epoch": 2.830540037243948, + "grad_norm": 1.734470009803772, + "learning_rate": 2.7865885778975742e-06, + "loss": 0.9737, + "step": 3040 + }, + { + "epoch": 2.8314711359404097, + "grad_norm": 1.5533194541931152, + "learning_rate": 2.7853586869951484e-06, + "loss": 0.9278, + "step": 3041 + }, + { + "epoch": 2.8324022346368714, + "grad_norm": 1.583024263381958, + "learning_rate": 2.784128726114245e-06, + "loss": 0.9148, + "step": 3042 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 1.6349562406539917, + "learning_rate": 2.782898695556487e-06, + "loss": 0.9693, + "step": 3043 + }, + { + "epoch": 2.834264432029795, + "grad_norm": 1.613322377204895, + "learning_rate": 2.7816685956235167e-06, + "loss": 0.9425, + "step": 3044 + }, + { + "epoch": 2.835195530726257, + "grad_norm": 1.5986024141311646, + "learning_rate": 2.7804384266169897e-06, + "loss": 0.9586, + "step": 3045 + }, + { + "epoch": 2.8361266294227185, + "grad_norm": 1.6297509670257568, + "learning_rate": 2.77920818883858e-06, + "loss": 0.9195, + "step": 3046 + }, + { + "epoch": 2.8370577281191807, + "grad_norm": 1.7266979217529297, + "learning_rate": 2.7779778825899804e-06, + "loss": 0.9818, + "step": 3047 + }, + { + "epoch": 2.8379888268156424, + "grad_norm": 1.6487879753112793, + "learning_rate": 2.776747508172897e-06, + "loss": 0.9603, + "step": 3048 + }, + { + "epoch": 2.838919925512104, + "grad_norm": 1.64192795753479, + "learning_rate": 2.775517065889055e-06, + "loss": 0.9191, + "step": 3049 + }, + { + "epoch": 2.839851024208566, + "grad_norm": 1.6581125259399414, + "learning_rate": 2.774286556040196e-06, + "loss": 0.9706, + "step": 3050 + }, + { + "epoch": 2.840782122905028, + "grad_norm": 1.5849334001541138, + "learning_rate": 2.7730559789280774e-06, + "loss": 0.9312, + "step": 3051 + }, + { + "epoch": 2.8417132216014895, + "grad_norm": 1.6276161670684814, + "learning_rate": 2.7718253348544734e-06, + "loss": 0.9773, + "step": 3052 + }, + { + "epoch": 2.8426443202979517, + "grad_norm": 1.6801183223724365, + "learning_rate": 2.7705946241211746e-06, + "loss": 0.958, + "step": 3053 + }, + { + "epoch": 2.8435754189944134, + "grad_norm": 1.6204458475112915, + "learning_rate": 2.7693638470299883e-06, + "loss": 0.949, + "step": 3054 + }, + { + "epoch": 2.844506517690875, + "grad_norm": 1.6157158613204956, + "learning_rate": 2.768133003882738e-06, + "loss": 0.9332, + "step": 3055 + }, + { + "epoch": 2.845437616387337, + "grad_norm": 1.6545689105987549, + "learning_rate": 2.7669020949812626e-06, + "loss": 0.9309, + "step": 3056 + }, + { + "epoch": 2.846368715083799, + "grad_norm": 1.6190471649169922, + "learning_rate": 2.7656711206274184e-06, + "loss": 0.9426, + "step": 3057 + }, + { + "epoch": 2.8472998137802605, + "grad_norm": 1.6490509510040283, + "learning_rate": 2.764440081123077e-06, + "loss": 0.9429, + "step": 3058 + }, + { + "epoch": 2.8482309124767227, + "grad_norm": 1.626109004020691, + "learning_rate": 2.7632089767701258e-06, + "loss": 0.9661, + "step": 3059 + }, + { + "epoch": 2.8491620111731844, + "grad_norm": 1.557902455329895, + "learning_rate": 2.7619778078704683e-06, + "loss": 0.926, + "step": 3060 + }, + { + "epoch": 2.850093109869646, + "grad_norm": 1.6351616382598877, + "learning_rate": 2.760746574726025e-06, + "loss": 0.9849, + "step": 3061 + }, + { + "epoch": 2.851024208566108, + "grad_norm": 1.630013346672058, + "learning_rate": 2.7595152776387283e-06, + "loss": 0.9583, + "step": 3062 + }, + { + "epoch": 2.85195530726257, + "grad_norm": 1.649207592010498, + "learning_rate": 2.758283916910532e-06, + "loss": 0.9624, + "step": 3063 + }, + { + "epoch": 2.8528864059590315, + "grad_norm": 1.732132077217102, + "learning_rate": 2.757052492843401e-06, + "loss": 0.9508, + "step": 3064 + }, + { + "epoch": 2.8538175046554937, + "grad_norm": 1.614696979522705, + "learning_rate": 2.755821005739318e-06, + "loss": 0.9374, + "step": 3065 + }, + { + "epoch": 2.8547486033519553, + "grad_norm": 1.610378623008728, + "learning_rate": 2.7545894559002806e-06, + "loss": 0.9471, + "step": 3066 + }, + { + "epoch": 2.855679702048417, + "grad_norm": 1.6785856485366821, + "learning_rate": 2.7533578436283005e-06, + "loss": 0.9765, + "step": 3067 + }, + { + "epoch": 2.856610800744879, + "grad_norm": 1.6911710500717163, + "learning_rate": 2.752126169225407e-06, + "loss": 0.9738, + "step": 3068 + }, + { + "epoch": 2.857541899441341, + "grad_norm": 1.6244384050369263, + "learning_rate": 2.750894432993642e-06, + "loss": 0.9087, + "step": 3069 + }, + { + "epoch": 2.8584729981378025, + "grad_norm": 1.6339046955108643, + "learning_rate": 2.7496626352350662e-06, + "loss": 0.9439, + "step": 3070 + }, + { + "epoch": 2.8594040968342647, + "grad_norm": 1.6325255632400513, + "learning_rate": 2.748430776251751e-06, + "loss": 0.9313, + "step": 3071 + }, + { + "epoch": 2.8603351955307263, + "grad_norm": 1.6449286937713623, + "learning_rate": 2.747198856345786e-06, + "loss": 0.9531, + "step": 3072 + }, + { + "epoch": 2.861266294227188, + "grad_norm": 1.6812504529953003, + "learning_rate": 2.745966875819276e-06, + "loss": 0.9332, + "step": 3073 + }, + { + "epoch": 2.86219739292365, + "grad_norm": 1.6733043193817139, + "learning_rate": 2.744734834974337e-06, + "loss": 0.9592, + "step": 3074 + }, + { + "epoch": 2.863128491620112, + "grad_norm": 1.6880135536193848, + "learning_rate": 2.7435027341131043e-06, + "loss": 0.9909, + "step": 3075 + }, + { + "epoch": 2.8640595903165735, + "grad_norm": 1.7200161218643188, + "learning_rate": 2.7422705735377243e-06, + "loss": 0.9699, + "step": 3076 + }, + { + "epoch": 2.864990689013035, + "grad_norm": 1.6862751245498657, + "learning_rate": 2.7410383535503616e-06, + "loss": 0.9148, + "step": 3077 + }, + { + "epoch": 2.8659217877094973, + "grad_norm": 1.5981451272964478, + "learning_rate": 2.73980607445319e-06, + "loss": 0.9852, + "step": 3078 + }, + { + "epoch": 2.866852886405959, + "grad_norm": 1.6795841455459595, + "learning_rate": 2.738573736548405e-06, + "loss": 0.9464, + "step": 3079 + }, + { + "epoch": 2.8677839851024207, + "grad_norm": 1.5939457416534424, + "learning_rate": 2.7373413401382104e-06, + "loss": 0.9513, + "step": 3080 + }, + { + "epoch": 2.868715083798883, + "grad_norm": 1.6223682165145874, + "learning_rate": 2.736108885524827e-06, + "loss": 0.914, + "step": 3081 + }, + { + "epoch": 2.8696461824953445, + "grad_norm": 1.6074382066726685, + "learning_rate": 2.73487637301049e-06, + "loss": 0.9667, + "step": 3082 + }, + { + "epoch": 2.870577281191806, + "grad_norm": 1.61403489112854, + "learning_rate": 2.7336438028974465e-06, + "loss": 0.9254, + "step": 3083 + }, + { + "epoch": 2.871508379888268, + "grad_norm": 1.6451538801193237, + "learning_rate": 2.732411175487963e-06, + "loss": 0.9691, + "step": 3084 + }, + { + "epoch": 2.87243947858473, + "grad_norm": 1.6541868448257446, + "learning_rate": 2.7311784910843135e-06, + "loss": 0.9379, + "step": 3085 + }, + { + "epoch": 2.8733705772811917, + "grad_norm": 1.7156344652175903, + "learning_rate": 2.72994574998879e-06, + "loss": 0.9369, + "step": 3086 + }, + { + "epoch": 2.8743016759776534, + "grad_norm": 1.7009023427963257, + "learning_rate": 2.7287129525036977e-06, + "loss": 0.9635, + "step": 3087 + }, + { + "epoch": 2.8752327746741155, + "grad_norm": 1.588074803352356, + "learning_rate": 2.7274800989313557e-06, + "loss": 0.8856, + "step": 3088 + }, + { + "epoch": 2.876163873370577, + "grad_norm": 1.5667760372161865, + "learning_rate": 2.726247189574095e-06, + "loss": 0.9399, + "step": 3089 + }, + { + "epoch": 2.877094972067039, + "grad_norm": 1.6495251655578613, + "learning_rate": 2.7250142247342637e-06, + "loss": 0.9757, + "step": 3090 + }, + { + "epoch": 2.878026070763501, + "grad_norm": 1.6756880283355713, + "learning_rate": 2.7237812047142204e-06, + "loss": 0.9813, + "step": 3091 + }, + { + "epoch": 2.8789571694599627, + "grad_norm": 1.696001648902893, + "learning_rate": 2.7225481298163388e-06, + "loss": 0.9784, + "step": 3092 + }, + { + "epoch": 2.8798882681564244, + "grad_norm": 1.6369796991348267, + "learning_rate": 2.7213150003430054e-06, + "loss": 0.9286, + "step": 3093 + }, + { + "epoch": 2.8808193668528865, + "grad_norm": 1.6162023544311523, + "learning_rate": 2.7200818165966213e-06, + "loss": 0.9484, + "step": 3094 + }, + { + "epoch": 2.881750465549348, + "grad_norm": 1.6477746963500977, + "learning_rate": 2.718848578879599e-06, + "loss": 0.9389, + "step": 3095 + }, + { + "epoch": 2.88268156424581, + "grad_norm": 1.6024819612503052, + "learning_rate": 2.7176152874943667e-06, + "loss": 0.9621, + "step": 3096 + }, + { + "epoch": 2.883612662942272, + "grad_norm": 1.5970337390899658, + "learning_rate": 2.7163819427433623e-06, + "loss": 0.9418, + "step": 3097 + }, + { + "epoch": 2.8845437616387337, + "grad_norm": 1.6287981271743774, + "learning_rate": 2.7151485449290405e-06, + "loss": 0.973, + "step": 3098 + }, + { + "epoch": 2.8854748603351954, + "grad_norm": 1.624208927154541, + "learning_rate": 2.7139150943538657e-06, + "loss": 0.9561, + "step": 3099 + }, + { + "epoch": 2.8864059590316575, + "grad_norm": 1.6084685325622559, + "learning_rate": 2.712681591320318e-06, + "loss": 0.9476, + "step": 3100 + }, + { + "epoch": 2.887337057728119, + "grad_norm": 1.651850700378418, + "learning_rate": 2.7114480361308892e-06, + "loss": 0.9776, + "step": 3101 + }, + { + "epoch": 2.888268156424581, + "grad_norm": 1.6913796663284302, + "learning_rate": 2.7102144290880834e-06, + "loss": 0.9782, + "step": 3102 + }, + { + "epoch": 2.889199255121043, + "grad_norm": 1.6258907318115234, + "learning_rate": 2.7089807704944184e-06, + "loss": 0.9245, + "step": 3103 + }, + { + "epoch": 2.8901303538175047, + "grad_norm": 1.6570173501968384, + "learning_rate": 2.707747060652424e-06, + "loss": 0.9823, + "step": 3104 + }, + { + "epoch": 2.8910614525139664, + "grad_norm": 1.649380087852478, + "learning_rate": 2.7065132998646414e-06, + "loss": 0.9363, + "step": 3105 + }, + { + "epoch": 2.8919925512104285, + "grad_norm": 1.6097255945205688, + "learning_rate": 2.7052794884336282e-06, + "loss": 0.9232, + "step": 3106 + }, + { + "epoch": 2.89292364990689, + "grad_norm": 1.6660614013671875, + "learning_rate": 2.70404562666195e-06, + "loss": 0.9478, + "step": 3107 + }, + { + "epoch": 2.893854748603352, + "grad_norm": 1.636584758758545, + "learning_rate": 2.7028117148521864e-06, + "loss": 0.9156, + "step": 3108 + }, + { + "epoch": 2.894785847299814, + "grad_norm": 1.614495873451233, + "learning_rate": 2.70157775330693e-06, + "loss": 0.9213, + "step": 3109 + }, + { + "epoch": 2.8957169459962757, + "grad_norm": 1.6499046087265015, + "learning_rate": 2.700343742328786e-06, + "loss": 0.9381, + "step": 3110 + }, + { + "epoch": 2.8966480446927374, + "grad_norm": 1.6998428106307983, + "learning_rate": 2.69910968222037e-06, + "loss": 0.9828, + "step": 3111 + }, + { + "epoch": 2.8975791433891995, + "grad_norm": 1.638384222984314, + "learning_rate": 2.6978755732843086e-06, + "loss": 0.9534, + "step": 3112 + }, + { + "epoch": 2.898510242085661, + "grad_norm": 1.6330877542495728, + "learning_rate": 2.696641415823246e-06, + "loss": 0.9542, + "step": 3113 + }, + { + "epoch": 2.899441340782123, + "grad_norm": 1.6594738960266113, + "learning_rate": 2.6954072101398305e-06, + "loss": 0.9036, + "step": 3114 + }, + { + "epoch": 2.9003724394785846, + "grad_norm": 1.6305596828460693, + "learning_rate": 2.6941729565367285e-06, + "loss": 0.9041, + "step": 3115 + }, + { + "epoch": 2.9013035381750467, + "grad_norm": 1.641000509262085, + "learning_rate": 2.6929386553166165e-06, + "loss": 0.9449, + "step": 3116 + }, + { + "epoch": 2.9022346368715084, + "grad_norm": 1.6552766561508179, + "learning_rate": 2.6917043067821796e-06, + "loss": 0.9586, + "step": 3117 + }, + { + "epoch": 2.90316573556797, + "grad_norm": 1.6130967140197754, + "learning_rate": 2.6904699112361195e-06, + "loss": 0.9568, + "step": 3118 + }, + { + "epoch": 2.9040968342644318, + "grad_norm": 1.5935466289520264, + "learning_rate": 2.6892354689811445e-06, + "loss": 0.9716, + "step": 3119 + }, + { + "epoch": 2.905027932960894, + "grad_norm": 1.7267063856124878, + "learning_rate": 2.688000980319979e-06, + "loss": 1.0024, + "step": 3120 + }, + { + "epoch": 2.9059590316573556, + "grad_norm": 1.7677205801010132, + "learning_rate": 2.686766445555354e-06, + "loss": 0.9305, + "step": 3121 + }, + { + "epoch": 2.9068901303538173, + "grad_norm": 1.6599723100662231, + "learning_rate": 2.6855318649900175e-06, + "loss": 0.9838, + "step": 3122 + }, + { + "epoch": 2.9078212290502794, + "grad_norm": 1.576768159866333, + "learning_rate": 2.684297238926723e-06, + "loss": 0.9306, + "step": 3123 + }, + { + "epoch": 2.908752327746741, + "grad_norm": 1.6890748739242554, + "learning_rate": 2.6830625676682383e-06, + "loss": 0.9659, + "step": 3124 + }, + { + "epoch": 2.9096834264432028, + "grad_norm": 1.6318072080612183, + "learning_rate": 2.6818278515173417e-06, + "loss": 0.952, + "step": 3125 + }, + { + "epoch": 2.910614525139665, + "grad_norm": 1.61650812625885, + "learning_rate": 2.6805930907768227e-06, + "loss": 0.9411, + "step": 3126 + }, + { + "epoch": 2.9115456238361266, + "grad_norm": 1.7172917127609253, + "learning_rate": 2.679358285749482e-06, + "loss": 1.0007, + "step": 3127 + }, + { + "epoch": 2.9124767225325883, + "grad_norm": 1.6232937574386597, + "learning_rate": 2.678123436738129e-06, + "loss": 0.9547, + "step": 3128 + }, + { + "epoch": 2.9134078212290504, + "grad_norm": 1.6304243803024292, + "learning_rate": 2.6768885440455887e-06, + "loss": 0.9458, + "step": 3129 + }, + { + "epoch": 2.914338919925512, + "grad_norm": 1.6008975505828857, + "learning_rate": 2.675653607974691e-06, + "loss": 0.9198, + "step": 3130 + }, + { + "epoch": 2.9152700186219738, + "grad_norm": 1.6223965883255005, + "learning_rate": 2.674418628828279e-06, + "loss": 0.9755, + "step": 3131 + }, + { + "epoch": 2.916201117318436, + "grad_norm": 1.7025574445724487, + "learning_rate": 2.6731836069092083e-06, + "loss": 0.9995, + "step": 3132 + }, + { + "epoch": 2.9171322160148976, + "grad_norm": 1.6130601167678833, + "learning_rate": 2.6719485425203415e-06, + "loss": 0.9468, + "step": 3133 + }, + { + "epoch": 2.9180633147113593, + "grad_norm": 1.6327604055404663, + "learning_rate": 2.6707134359645544e-06, + "loss": 0.9648, + "step": 3134 + }, + { + "epoch": 2.9189944134078214, + "grad_norm": 1.690895915031433, + "learning_rate": 2.6694782875447317e-06, + "loss": 0.931, + "step": 3135 + }, + { + "epoch": 2.919925512104283, + "grad_norm": 1.5851452350616455, + "learning_rate": 2.6682430975637687e-06, + "loss": 0.9232, + "step": 3136 + }, + { + "epoch": 2.9208566108007448, + "grad_norm": 1.7316862344741821, + "learning_rate": 2.6670078663245707e-06, + "loss": 0.9454, + "step": 3137 + }, + { + "epoch": 2.921787709497207, + "grad_norm": 1.651943325996399, + "learning_rate": 2.6657725941300533e-06, + "loss": 0.9533, + "step": 3138 + }, + { + "epoch": 2.9227188081936686, + "grad_norm": 1.6796914339065552, + "learning_rate": 2.664537281283143e-06, + "loss": 0.9699, + "step": 3139 + }, + { + "epoch": 2.9236499068901303, + "grad_norm": 1.6089403629302979, + "learning_rate": 2.663301928086774e-06, + "loss": 0.8837, + "step": 3140 + }, + { + "epoch": 2.9245810055865924, + "grad_norm": 1.6756631135940552, + "learning_rate": 2.662066534843893e-06, + "loss": 0.988, + "step": 3141 + }, + { + "epoch": 2.925512104283054, + "grad_norm": 1.572462558746338, + "learning_rate": 2.6608311018574545e-06, + "loss": 0.9402, + "step": 3142 + }, + { + "epoch": 2.9264432029795158, + "grad_norm": 1.6604962348937988, + "learning_rate": 2.659595629430424e-06, + "loss": 0.9766, + "step": 3143 + }, + { + "epoch": 2.927374301675978, + "grad_norm": 1.6363316774368286, + "learning_rate": 2.658360117865777e-06, + "loss": 0.9537, + "step": 3144 + }, + { + "epoch": 2.9283054003724396, + "grad_norm": 1.5668021440505981, + "learning_rate": 2.6571245674664964e-06, + "loss": 0.95, + "step": 3145 + }, + { + "epoch": 2.9292364990689013, + "grad_norm": 1.707266092300415, + "learning_rate": 2.6558889785355767e-06, + "loss": 0.9538, + "step": 3146 + }, + { + "epoch": 2.9301675977653634, + "grad_norm": 1.636340856552124, + "learning_rate": 2.6546533513760213e-06, + "loss": 0.9557, + "step": 3147 + }, + { + "epoch": 2.931098696461825, + "grad_norm": 1.629612684249878, + "learning_rate": 2.6534176862908434e-06, + "loss": 0.9362, + "step": 3148 + }, + { + "epoch": 2.9320297951582868, + "grad_norm": 1.6107096672058105, + "learning_rate": 2.652181983583064e-06, + "loss": 0.9299, + "step": 3149 + }, + { + "epoch": 2.9329608938547485, + "grad_norm": 1.625532865524292, + "learning_rate": 2.6509462435557155e-06, + "loss": 0.9558, + "step": 3150 + }, + { + "epoch": 2.9338919925512106, + "grad_norm": 1.6218911409378052, + "learning_rate": 2.6497104665118373e-06, + "loss": 0.9378, + "step": 3151 + }, + { + "epoch": 2.9348230912476723, + "grad_norm": 1.6273612976074219, + "learning_rate": 2.6484746527544786e-06, + "loss": 0.9315, + "step": 3152 + }, + { + "epoch": 2.935754189944134, + "grad_norm": 1.5820204019546509, + "learning_rate": 2.6472388025866993e-06, + "loss": 0.9616, + "step": 3153 + }, + { + "epoch": 2.9366852886405956, + "grad_norm": 1.6987463235855103, + "learning_rate": 2.646002916311566e-06, + "loss": 0.9663, + "step": 3154 + }, + { + "epoch": 2.9376163873370578, + "grad_norm": 1.595864176750183, + "learning_rate": 2.6447669942321535e-06, + "loss": 0.9353, + "step": 3155 + }, + { + "epoch": 2.9385474860335195, + "grad_norm": 1.6008108854293823, + "learning_rate": 2.6435310366515497e-06, + "loss": 0.9526, + "step": 3156 + }, + { + "epoch": 2.939478584729981, + "grad_norm": 1.6466643810272217, + "learning_rate": 2.6422950438728454e-06, + "loss": 0.9296, + "step": 3157 + }, + { + "epoch": 2.9404096834264433, + "grad_norm": 1.636741280555725, + "learning_rate": 2.6410590161991463e-06, + "loss": 0.9622, + "step": 3158 + }, + { + "epoch": 2.941340782122905, + "grad_norm": 1.6602981090545654, + "learning_rate": 2.6398229539335597e-06, + "loss": 0.979, + "step": 3159 + }, + { + "epoch": 2.9422718808193666, + "grad_norm": 1.658056378364563, + "learning_rate": 2.6385868573792074e-06, + "loss": 0.9597, + "step": 3160 + }, + { + "epoch": 2.9432029795158288, + "grad_norm": 1.7455765008926392, + "learning_rate": 2.6373507268392167e-06, + "loss": 0.9359, + "step": 3161 + }, + { + "epoch": 2.9441340782122905, + "grad_norm": 1.624936819076538, + "learning_rate": 2.6361145626167227e-06, + "loss": 0.9655, + "step": 3162 + }, + { + "epoch": 2.945065176908752, + "grad_norm": 1.6546506881713867, + "learning_rate": 2.634878365014872e-06, + "loss": 0.95, + "step": 3163 + }, + { + "epoch": 2.9459962756052143, + "grad_norm": 1.6638516187667847, + "learning_rate": 2.633642134336814e-06, + "loss": 0.947, + "step": 3164 + }, + { + "epoch": 2.946927374301676, + "grad_norm": 1.69620943069458, + "learning_rate": 2.632405870885713e-06, + "loss": 1.0029, + "step": 3165 + }, + { + "epoch": 2.9478584729981376, + "grad_norm": 1.6695328950881958, + "learning_rate": 2.6311695749647352e-06, + "loss": 0.9448, + "step": 3166 + }, + { + "epoch": 2.9487895716945998, + "grad_norm": 1.6961241960525513, + "learning_rate": 2.6299332468770583e-06, + "loss": 0.9598, + "step": 3167 + }, + { + "epoch": 2.9497206703910615, + "grad_norm": 1.6267427206039429, + "learning_rate": 2.6286968869258666e-06, + "loss": 0.9482, + "step": 3168 + }, + { + "epoch": 2.950651769087523, + "grad_norm": 1.6254559755325317, + "learning_rate": 2.627460495414352e-06, + "loss": 0.9316, + "step": 3169 + }, + { + "epoch": 2.9515828677839853, + "grad_norm": 1.6306689977645874, + "learning_rate": 2.6262240726457165e-06, + "loss": 0.9421, + "step": 3170 + }, + { + "epoch": 2.952513966480447, + "grad_norm": 1.6224042177200317, + "learning_rate": 2.624987618923166e-06, + "loss": 0.9272, + "step": 3171 + }, + { + "epoch": 2.9534450651769086, + "grad_norm": 1.599687099456787, + "learning_rate": 2.623751134549917e-06, + "loss": 0.9634, + "step": 3172 + }, + { + "epoch": 2.9543761638733708, + "grad_norm": 1.6378819942474365, + "learning_rate": 2.6225146198291915e-06, + "loss": 0.95, + "step": 3173 + }, + { + "epoch": 2.9553072625698324, + "grad_norm": 1.5943334102630615, + "learning_rate": 2.6212780750642203e-06, + "loss": 0.9431, + "step": 3174 + }, + { + "epoch": 2.956238361266294, + "grad_norm": 1.6969786882400513, + "learning_rate": 2.6200415005582414e-06, + "loss": 0.9583, + "step": 3175 + }, + { + "epoch": 2.9571694599627563, + "grad_norm": 1.6389068365097046, + "learning_rate": 2.618804896614499e-06, + "loss": 0.9671, + "step": 3176 + }, + { + "epoch": 2.958100558659218, + "grad_norm": 1.6660076379776, + "learning_rate": 2.6175682635362463e-06, + "loss": 0.9643, + "step": 3177 + }, + { + "epoch": 2.9590316573556796, + "grad_norm": 1.6488064527511597, + "learning_rate": 2.616331601626742e-06, + "loss": 0.9624, + "step": 3178 + }, + { + "epoch": 2.9599627560521418, + "grad_norm": 1.6433595418930054, + "learning_rate": 2.615094911189254e-06, + "loss": 0.9304, + "step": 3179 + }, + { + "epoch": 2.9608938547486034, + "grad_norm": 1.6492249965667725, + "learning_rate": 2.6138581925270536e-06, + "loss": 0.9602, + "step": 3180 + }, + { + "epoch": 2.961824953445065, + "grad_norm": 1.6754956245422363, + "learning_rate": 2.6126214459434223e-06, + "loss": 0.9699, + "step": 3181 + }, + { + "epoch": 2.9627560521415273, + "grad_norm": 1.6222394704818726, + "learning_rate": 2.611384671741647e-06, + "loss": 0.9408, + "step": 3182 + }, + { + "epoch": 2.963687150837989, + "grad_norm": 1.6233813762664795, + "learning_rate": 2.610147870225022e-06, + "loss": 0.9871, + "step": 3183 + }, + { + "epoch": 2.9646182495344506, + "grad_norm": 1.718134880065918, + "learning_rate": 2.608911041696848e-06, + "loss": 0.9432, + "step": 3184 + }, + { + "epoch": 2.9655493482309123, + "grad_norm": 1.6321865320205688, + "learning_rate": 2.607674186460432e-06, + "loss": 0.9787, + "step": 3185 + }, + { + "epoch": 2.9664804469273744, + "grad_norm": 1.7013866901397705, + "learning_rate": 2.6064373048190884e-06, + "loss": 0.9676, + "step": 3186 + }, + { + "epoch": 2.967411545623836, + "grad_norm": 1.6143778562545776, + "learning_rate": 2.605200397076137e-06, + "loss": 0.9424, + "step": 3187 + }, + { + "epoch": 2.968342644320298, + "grad_norm": 1.6556555032730103, + "learning_rate": 2.6039634635349044e-06, + "loss": 0.9698, + "step": 3188 + }, + { + "epoch": 2.9692737430167595, + "grad_norm": 1.6135071516036987, + "learning_rate": 2.602726504498724e-06, + "loss": 0.9193, + "step": 3189 + }, + { + "epoch": 2.9702048417132216, + "grad_norm": 1.6962298154830933, + "learning_rate": 2.6014895202709354e-06, + "loss": 0.9831, + "step": 3190 + }, + { + "epoch": 2.9711359404096833, + "grad_norm": 1.635617733001709, + "learning_rate": 2.600252511154884e-06, + "loss": 0.9779, + "step": 3191 + }, + { + "epoch": 2.972067039106145, + "grad_norm": 1.6644952297210693, + "learning_rate": 2.5990154774539213e-06, + "loss": 0.951, + "step": 3192 + }, + { + "epoch": 2.972998137802607, + "grad_norm": 1.6300113201141357, + "learning_rate": 2.5977784194714036e-06, + "loss": 0.9815, + "step": 3193 + }, + { + "epoch": 2.973929236499069, + "grad_norm": 1.6248083114624023, + "learning_rate": 2.5965413375106965e-06, + "loss": 0.9385, + "step": 3194 + }, + { + "epoch": 2.9748603351955305, + "grad_norm": 1.6800507307052612, + "learning_rate": 2.5953042318751686e-06, + "loss": 0.9469, + "step": 3195 + }, + { + "epoch": 2.9757914338919926, + "grad_norm": 1.644413709640503, + "learning_rate": 2.5940671028681954e-06, + "loss": 0.9257, + "step": 3196 + }, + { + "epoch": 2.9767225325884543, + "grad_norm": 1.654010534286499, + "learning_rate": 2.5928299507931574e-06, + "loss": 0.9792, + "step": 3197 + }, + { + "epoch": 2.977653631284916, + "grad_norm": 1.6688764095306396, + "learning_rate": 2.591592775953442e-06, + "loss": 0.9659, + "step": 3198 + }, + { + "epoch": 2.978584729981378, + "grad_norm": 1.67815363407135, + "learning_rate": 2.5903555786524413e-06, + "loss": 0.9418, + "step": 3199 + }, + { + "epoch": 2.97951582867784, + "grad_norm": 1.6087619066238403, + "learning_rate": 2.5891183591935514e-06, + "loss": 0.9498, + "step": 3200 + }, + { + "epoch": 2.9804469273743015, + "grad_norm": 1.616249442100525, + "learning_rate": 2.587881117880179e-06, + "loss": 0.9735, + "step": 3201 + }, + { + "epoch": 2.9813780260707636, + "grad_norm": 1.6133146286010742, + "learning_rate": 2.5866438550157284e-06, + "loss": 0.9327, + "step": 3202 + }, + { + "epoch": 2.9823091247672253, + "grad_norm": 1.6440492868423462, + "learning_rate": 2.585406570903616e-06, + "loss": 0.9456, + "step": 3203 + }, + { + "epoch": 2.983240223463687, + "grad_norm": 1.6011227369308472, + "learning_rate": 2.5841692658472616e-06, + "loss": 0.9421, + "step": 3204 + }, + { + "epoch": 2.984171322160149, + "grad_norm": 1.6756632328033447, + "learning_rate": 2.5829319401500867e-06, + "loss": 1.0086, + "step": 3205 + }, + { + "epoch": 2.985102420856611, + "grad_norm": 1.5915745496749878, + "learning_rate": 2.581694594115523e-06, + "loss": 0.9242, + "step": 3206 + }, + { + "epoch": 2.9860335195530725, + "grad_norm": 1.6238288879394531, + "learning_rate": 2.5804572280470027e-06, + "loss": 0.973, + "step": 3207 + }, + { + "epoch": 2.9869646182495346, + "grad_norm": 1.5941739082336426, + "learning_rate": 2.5792198422479668e-06, + "loss": 0.9684, + "step": 3208 + }, + { + "epoch": 2.9878957169459963, + "grad_norm": 1.6146596670150757, + "learning_rate": 2.5779824370218575e-06, + "loss": 0.9749, + "step": 3209 + }, + { + "epoch": 2.988826815642458, + "grad_norm": 1.5792685747146606, + "learning_rate": 2.5767450126721254e-06, + "loss": 0.9422, + "step": 3210 + }, + { + "epoch": 2.98975791433892, + "grad_norm": 1.5883413553237915, + "learning_rate": 2.5755075695022223e-06, + "loss": 0.9395, + "step": 3211 + }, + { + "epoch": 2.990689013035382, + "grad_norm": 1.6765646934509277, + "learning_rate": 2.574270107815607e-06, + "loss": 0.9924, + "step": 3212 + }, + { + "epoch": 2.9916201117318435, + "grad_norm": 1.6381527185440063, + "learning_rate": 2.5730326279157426e-06, + "loss": 0.9296, + "step": 3213 + }, + { + "epoch": 2.9925512104283056, + "grad_norm": 1.6947510242462158, + "learning_rate": 2.5717951301060947e-06, + "loss": 0.9788, + "step": 3214 + }, + { + "epoch": 2.9934823091247673, + "grad_norm": 1.7428909540176392, + "learning_rate": 2.5705576146901364e-06, + "loss": 0.9851, + "step": 3215 + }, + { + "epoch": 2.994413407821229, + "grad_norm": 1.6258810758590698, + "learning_rate": 2.5693200819713414e-06, + "loss": 0.9569, + "step": 3216 + }, + { + "epoch": 2.995344506517691, + "grad_norm": 1.6402835845947266, + "learning_rate": 2.5680825322531923e-06, + "loss": 0.9147, + "step": 3217 + }, + { + "epoch": 2.996275605214153, + "grad_norm": 1.5955889225006104, + "learning_rate": 2.566844965839171e-06, + "loss": 0.9239, + "step": 3218 + }, + { + "epoch": 2.9972067039106145, + "grad_norm": 1.5961555242538452, + "learning_rate": 2.5656073830327665e-06, + "loss": 0.9238, + "step": 3219 + }, + { + "epoch": 2.998137802607076, + "grad_norm": 1.5453569889068604, + "learning_rate": 2.5643697841374722e-06, + "loss": 0.928, + "step": 3220 + }, + { + "epoch": 2.9990689013035383, + "grad_norm": 1.69312584400177, + "learning_rate": 2.563132169456782e-06, + "loss": 0.9641, + "step": 3221 + }, + { + "epoch": 3.0, + "grad_norm": 1.7103936672210693, + "learning_rate": 2.5618945392941984e-06, + "loss": 0.9236, + "step": 3222 + }, + { + "epoch": 3.0009310986964617, + "grad_norm": 1.6348861455917358, + "learning_rate": 2.5606568939532243e-06, + "loss": 0.8808, + "step": 3223 + }, + { + "epoch": 3.001862197392924, + "grad_norm": 1.5315836668014526, + "learning_rate": 2.559419233737367e-06, + "loss": 0.8667, + "step": 3224 + }, + { + "epoch": 3.0027932960893855, + "grad_norm": 1.564589500427246, + "learning_rate": 2.558181558950138e-06, + "loss": 0.8484, + "step": 3225 + }, + { + "epoch": 3.003724394785847, + "grad_norm": 1.6595914363861084, + "learning_rate": 2.5569438698950523e-06, + "loss": 0.9022, + "step": 3226 + }, + { + "epoch": 3.0046554934823093, + "grad_norm": 1.5895179510116577, + "learning_rate": 2.5557061668756284e-06, + "loss": 0.9148, + "step": 3227 + }, + { + "epoch": 3.005586592178771, + "grad_norm": 1.7316328287124634, + "learning_rate": 2.5544684501953876e-06, + "loss": 0.9026, + "step": 3228 + }, + { + "epoch": 3.0065176908752327, + "grad_norm": 1.5359153747558594, + "learning_rate": 2.5532307201578548e-06, + "loss": 0.8577, + "step": 3229 + }, + { + "epoch": 3.007448789571695, + "grad_norm": 1.5917328596115112, + "learning_rate": 2.5519929770665596e-06, + "loss": 0.8823, + "step": 3230 + }, + { + "epoch": 3.0083798882681565, + "grad_norm": 1.6393204927444458, + "learning_rate": 2.5507552212250324e-06, + "loss": 0.9357, + "step": 3231 + }, + { + "epoch": 3.009310986964618, + "grad_norm": 1.6072674989700317, + "learning_rate": 2.5495174529368084e-06, + "loss": 0.9027, + "step": 3232 + }, + { + "epoch": 3.01024208566108, + "grad_norm": 1.6603189706802368, + "learning_rate": 2.5482796725054247e-06, + "loss": 0.91, + "step": 3233 + }, + { + "epoch": 3.011173184357542, + "grad_norm": 1.6961463689804077, + "learning_rate": 2.547041880234424e-06, + "loss": 0.9258, + "step": 3234 + }, + { + "epoch": 3.0121042830540037, + "grad_norm": 1.6351066827774048, + "learning_rate": 2.545804076427348e-06, + "loss": 0.8773, + "step": 3235 + }, + { + "epoch": 3.0130353817504654, + "grad_norm": 1.6111769676208496, + "learning_rate": 2.544566261387743e-06, + "loss": 0.8677, + "step": 3236 + }, + { + "epoch": 3.0139664804469275, + "grad_norm": 1.627763032913208, + "learning_rate": 2.5433284354191595e-06, + "loss": 0.9188, + "step": 3237 + }, + { + "epoch": 3.014897579143389, + "grad_norm": 1.680160641670227, + "learning_rate": 2.5420905988251488e-06, + "loss": 0.913, + "step": 3238 + }, + { + "epoch": 3.015828677839851, + "grad_norm": 1.7062358856201172, + "learning_rate": 2.5408527519092656e-06, + "loss": 0.9194, + "step": 3239 + }, + { + "epoch": 3.016759776536313, + "grad_norm": 1.6547439098358154, + "learning_rate": 2.539614894975067e-06, + "loss": 0.8822, + "step": 3240 + }, + { + "epoch": 3.0176908752327747, + "grad_norm": 1.7282648086547852, + "learning_rate": 2.5383770283261126e-06, + "loss": 0.9289, + "step": 3241 + }, + { + "epoch": 3.0186219739292364, + "grad_norm": 1.6317826509475708, + "learning_rate": 2.5371391522659645e-06, + "loss": 0.921, + "step": 3242 + }, + { + "epoch": 3.0195530726256985, + "grad_norm": 1.6277379989624023, + "learning_rate": 2.5359012670981853e-06, + "loss": 0.9567, + "step": 3243 + }, + { + "epoch": 3.02048417132216, + "grad_norm": 1.6117379665374756, + "learning_rate": 2.5346633731263444e-06, + "loss": 0.9047, + "step": 3244 + }, + { + "epoch": 3.021415270018622, + "grad_norm": 1.715893268585205, + "learning_rate": 2.533425470654007e-06, + "loss": 0.9291, + "step": 3245 + }, + { + "epoch": 3.022346368715084, + "grad_norm": 1.680368185043335, + "learning_rate": 2.5321875599847456e-06, + "loss": 0.9226, + "step": 3246 + }, + { + "epoch": 3.0232774674115457, + "grad_norm": 1.5958926677703857, + "learning_rate": 2.530949641422133e-06, + "loss": 0.9247, + "step": 3247 + }, + { + "epoch": 3.0242085661080074, + "grad_norm": 1.6338911056518555, + "learning_rate": 2.529711715269743e-06, + "loss": 0.9021, + "step": 3248 + }, + { + "epoch": 3.0251396648044695, + "grad_norm": 1.6195487976074219, + "learning_rate": 2.5284737818311537e-06, + "loss": 0.8914, + "step": 3249 + }, + { + "epoch": 3.026070763500931, + "grad_norm": 1.6505372524261475, + "learning_rate": 2.527235841409941e-06, + "loss": 0.8979, + "step": 3250 + }, + { + "epoch": 3.027001862197393, + "grad_norm": 1.685255527496338, + "learning_rate": 2.525997894309688e-06, + "loss": 0.9199, + "step": 3251 + }, + { + "epoch": 3.0279329608938546, + "grad_norm": 1.575434923171997, + "learning_rate": 2.5247599408339724e-06, + "loss": 0.867, + "step": 3252 + }, + { + "epoch": 3.0288640595903167, + "grad_norm": 1.7103954553604126, + "learning_rate": 2.523521981286381e-06, + "loss": 0.9454, + "step": 3253 + }, + { + "epoch": 3.0297951582867784, + "grad_norm": 1.6721266508102417, + "learning_rate": 2.5222840159704957e-06, + "loss": 0.9143, + "step": 3254 + }, + { + "epoch": 3.03072625698324, + "grad_norm": 1.69712495803833, + "learning_rate": 2.521046045189905e-06, + "loss": 0.9241, + "step": 3255 + }, + { + "epoch": 3.031657355679702, + "grad_norm": 1.6615443229675293, + "learning_rate": 2.519808069248194e-06, + "loss": 0.8948, + "step": 3256 + }, + { + "epoch": 3.032588454376164, + "grad_norm": 1.701850414276123, + "learning_rate": 2.5185700884489527e-06, + "loss": 0.8984, + "step": 3257 + }, + { + "epoch": 3.0335195530726256, + "grad_norm": 1.635740041732788, + "learning_rate": 2.5173321030957716e-06, + "loss": 0.9336, + "step": 3258 + }, + { + "epoch": 3.0344506517690877, + "grad_norm": 1.6742604970932007, + "learning_rate": 2.51609411349224e-06, + "loss": 0.9297, + "step": 3259 + }, + { + "epoch": 3.0353817504655494, + "grad_norm": 1.7093249559402466, + "learning_rate": 2.514856119941952e-06, + "loss": 0.9176, + "step": 3260 + }, + { + "epoch": 3.036312849162011, + "grad_norm": 1.6714253425598145, + "learning_rate": 2.5136181227484983e-06, + "loss": 0.9162, + "step": 3261 + }, + { + "epoch": 3.037243947858473, + "grad_norm": 1.629442572593689, + "learning_rate": 2.512380122215475e-06, + "loss": 0.9021, + "step": 3262 + }, + { + "epoch": 3.038175046554935, + "grad_norm": 1.6980584859848022, + "learning_rate": 2.5111421186464747e-06, + "loss": 0.9064, + "step": 3263 + }, + { + "epoch": 3.0391061452513966, + "grad_norm": 1.7323260307312012, + "learning_rate": 2.5099041123450948e-06, + "loss": 0.937, + "step": 3264 + }, + { + "epoch": 3.0400372439478587, + "grad_norm": 1.637475848197937, + "learning_rate": 2.50866610361493e-06, + "loss": 0.8936, + "step": 3265 + }, + { + "epoch": 3.0409683426443204, + "grad_norm": 1.6962095499038696, + "learning_rate": 2.507428092759578e-06, + "loss": 0.9293, + "step": 3266 + }, + { + "epoch": 3.041899441340782, + "grad_norm": 1.7129355669021606, + "learning_rate": 2.5061900800826355e-06, + "loss": 0.9075, + "step": 3267 + }, + { + "epoch": 3.0428305400372437, + "grad_norm": 1.664898157119751, + "learning_rate": 2.504952065887701e-06, + "loss": 0.91, + "step": 3268 + }, + { + "epoch": 3.043761638733706, + "grad_norm": 1.734605073928833, + "learning_rate": 2.5037140504783714e-06, + "loss": 0.8936, + "step": 3269 + }, + { + "epoch": 3.0446927374301676, + "grad_norm": 1.7139325141906738, + "learning_rate": 2.5024760341582455e-06, + "loss": 0.9331, + "step": 3270 + }, + { + "epoch": 3.0456238361266292, + "grad_norm": 1.717179298400879, + "learning_rate": 2.5012380172309224e-06, + "loss": 0.9096, + "step": 3271 + }, + { + "epoch": 3.0465549348230914, + "grad_norm": 1.6623668670654297, + "learning_rate": 2.5e-06, + "loss": 0.9106, + "step": 3272 + }, + { + "epoch": 3.047486033519553, + "grad_norm": 1.6601907014846802, + "learning_rate": 2.4987619827690784e-06, + "loss": 0.8966, + "step": 3273 + }, + { + "epoch": 3.0484171322160147, + "grad_norm": 1.6603847742080688, + "learning_rate": 2.4975239658417557e-06, + "loss": 0.9007, + "step": 3274 + }, + { + "epoch": 3.049348230912477, + "grad_norm": 1.7118208408355713, + "learning_rate": 2.4962859495216295e-06, + "loss": 0.9512, + "step": 3275 + }, + { + "epoch": 3.0502793296089385, + "grad_norm": 1.6912704706192017, + "learning_rate": 2.4950479341123e-06, + "loss": 0.9446, + "step": 3276 + }, + { + "epoch": 3.0512104283054002, + "grad_norm": 1.6684913635253906, + "learning_rate": 2.4938099199173645e-06, + "loss": 0.9156, + "step": 3277 + }, + { + "epoch": 3.0521415270018624, + "grad_norm": 1.657727599143982, + "learning_rate": 2.492571907240423e-06, + "loss": 0.9253, + "step": 3278 + }, + { + "epoch": 3.053072625698324, + "grad_norm": 1.6968016624450684, + "learning_rate": 2.4913338963850704e-06, + "loss": 0.8988, + "step": 3279 + }, + { + "epoch": 3.0540037243947857, + "grad_norm": 1.7171710729599, + "learning_rate": 2.490095887654906e-06, + "loss": 0.9049, + "step": 3280 + }, + { + "epoch": 3.054934823091248, + "grad_norm": 1.7257767915725708, + "learning_rate": 2.488857881353526e-06, + "loss": 0.9327, + "step": 3281 + }, + { + "epoch": 3.0558659217877095, + "grad_norm": 1.8294111490249634, + "learning_rate": 2.4876198777845263e-06, + "loss": 0.9482, + "step": 3282 + }, + { + "epoch": 3.0567970204841712, + "grad_norm": 1.726961612701416, + "learning_rate": 2.486381877251502e-06, + "loss": 0.9213, + "step": 3283 + }, + { + "epoch": 3.0577281191806334, + "grad_norm": 1.6841015815734863, + "learning_rate": 2.485143880058049e-06, + "loss": 0.9021, + "step": 3284 + }, + { + "epoch": 3.058659217877095, + "grad_norm": 1.6948037147521973, + "learning_rate": 2.4839058865077607e-06, + "loss": 0.9414, + "step": 3285 + }, + { + "epoch": 3.0595903165735567, + "grad_norm": 1.6763288974761963, + "learning_rate": 2.4826678969042292e-06, + "loss": 0.9183, + "step": 3286 + }, + { + "epoch": 3.0605214152700184, + "grad_norm": 1.658308982849121, + "learning_rate": 2.4814299115510477e-06, + "loss": 0.8916, + "step": 3287 + }, + { + "epoch": 3.0614525139664805, + "grad_norm": 1.7754426002502441, + "learning_rate": 2.480191930751807e-06, + "loss": 0.8561, + "step": 3288 + }, + { + "epoch": 3.0623836126629422, + "grad_norm": 1.7293765544891357, + "learning_rate": 2.478953954810096e-06, + "loss": 0.9501, + "step": 3289 + }, + { + "epoch": 3.063314711359404, + "grad_norm": 1.6309765577316284, + "learning_rate": 2.4777159840295047e-06, + "loss": 0.9146, + "step": 3290 + }, + { + "epoch": 3.064245810055866, + "grad_norm": 1.7002781629562378, + "learning_rate": 2.47647801871362e-06, + "loss": 0.9249, + "step": 3291 + }, + { + "epoch": 3.0651769087523277, + "grad_norm": 1.694254755973816, + "learning_rate": 2.4752400591660284e-06, + "loss": 0.9045, + "step": 3292 + }, + { + "epoch": 3.0661080074487894, + "grad_norm": 1.6246881484985352, + "learning_rate": 2.474002105690313e-06, + "loss": 0.9315, + "step": 3293 + }, + { + "epoch": 3.0670391061452515, + "grad_norm": 1.660295009613037, + "learning_rate": 2.472764158590059e-06, + "loss": 0.8825, + "step": 3294 + }, + { + "epoch": 3.0679702048417132, + "grad_norm": 1.688385248184204, + "learning_rate": 2.4715262181688475e-06, + "loss": 0.9207, + "step": 3295 + }, + { + "epoch": 3.068901303538175, + "grad_norm": 1.634578824043274, + "learning_rate": 2.4702882847302573e-06, + "loss": 0.9101, + "step": 3296 + }, + { + "epoch": 3.069832402234637, + "grad_norm": 1.6727747917175293, + "learning_rate": 2.469050358577867e-06, + "loss": 0.9342, + "step": 3297 + }, + { + "epoch": 3.0707635009310987, + "grad_norm": 1.6537649631500244, + "learning_rate": 2.467812440015255e-06, + "loss": 0.8723, + "step": 3298 + }, + { + "epoch": 3.0716945996275604, + "grad_norm": 1.6903131008148193, + "learning_rate": 2.466574529345994e-06, + "loss": 0.9111, + "step": 3299 + }, + { + "epoch": 3.0726256983240225, + "grad_norm": 1.656028389930725, + "learning_rate": 2.465336626873657e-06, + "loss": 0.904, + "step": 3300 + }, + { + "epoch": 3.0735567970204842, + "grad_norm": 1.6640174388885498, + "learning_rate": 2.4640987329018147e-06, + "loss": 0.9014, + "step": 3301 + }, + { + "epoch": 3.074487895716946, + "grad_norm": 1.6518352031707764, + "learning_rate": 2.4628608477340368e-06, + "loss": 0.8969, + "step": 3302 + }, + { + "epoch": 3.0754189944134076, + "grad_norm": 1.6160846948623657, + "learning_rate": 2.461622971673888e-06, + "loss": 0.9276, + "step": 3303 + }, + { + "epoch": 3.0763500931098697, + "grad_norm": 1.7019175291061401, + "learning_rate": 2.4603851050249327e-06, + "loss": 0.9182, + "step": 3304 + }, + { + "epoch": 3.0772811918063314, + "grad_norm": 1.6947954893112183, + "learning_rate": 2.4591472480907348e-06, + "loss": 0.9317, + "step": 3305 + }, + { + "epoch": 3.078212290502793, + "grad_norm": 1.66879403591156, + "learning_rate": 2.4579094011748517e-06, + "loss": 0.8858, + "step": 3306 + }, + { + "epoch": 3.0791433891992552, + "grad_norm": 1.6757324934005737, + "learning_rate": 2.4566715645808413e-06, + "loss": 0.8714, + "step": 3307 + }, + { + "epoch": 3.080074487895717, + "grad_norm": 1.6933484077453613, + "learning_rate": 2.4554337386122575e-06, + "loss": 0.9289, + "step": 3308 + }, + { + "epoch": 3.0810055865921786, + "grad_norm": 1.6270999908447266, + "learning_rate": 2.4541959235726534e-06, + "loss": 0.866, + "step": 3309 + }, + { + "epoch": 3.0819366852886407, + "grad_norm": 1.6904125213623047, + "learning_rate": 2.452958119765577e-06, + "loss": 0.938, + "step": 3310 + }, + { + "epoch": 3.0828677839851024, + "grad_norm": 1.825324296951294, + "learning_rate": 2.451720327494575e-06, + "loss": 0.9581, + "step": 3311 + }, + { + "epoch": 3.083798882681564, + "grad_norm": 1.7476593255996704, + "learning_rate": 2.450482547063193e-06, + "loss": 0.8942, + "step": 3312 + }, + { + "epoch": 3.0847299813780262, + "grad_norm": 1.7949507236480713, + "learning_rate": 2.4492447787749684e-06, + "loss": 0.9288, + "step": 3313 + }, + { + "epoch": 3.085661080074488, + "grad_norm": 1.719113826751709, + "learning_rate": 2.4480070229334413e-06, + "loss": 0.8886, + "step": 3314 + }, + { + "epoch": 3.0865921787709496, + "grad_norm": 1.7233141660690308, + "learning_rate": 2.446769279842145e-06, + "loss": 0.9106, + "step": 3315 + }, + { + "epoch": 3.0875232774674117, + "grad_norm": 1.7058930397033691, + "learning_rate": 2.4455315498046132e-06, + "loss": 0.8962, + "step": 3316 + }, + { + "epoch": 3.0884543761638734, + "grad_norm": 1.724055290222168, + "learning_rate": 2.4442938331243724e-06, + "loss": 0.8723, + "step": 3317 + }, + { + "epoch": 3.089385474860335, + "grad_norm": 1.727042555809021, + "learning_rate": 2.443056130104948e-06, + "loss": 0.9233, + "step": 3318 + }, + { + "epoch": 3.0903165735567972, + "grad_norm": 1.670355200767517, + "learning_rate": 2.441818441049863e-06, + "loss": 0.911, + "step": 3319 + }, + { + "epoch": 3.091247672253259, + "grad_norm": 1.7252628803253174, + "learning_rate": 2.440580766262634e-06, + "loss": 0.9492, + "step": 3320 + }, + { + "epoch": 3.0921787709497206, + "grad_norm": 1.6504132747650146, + "learning_rate": 2.4393431060467765e-06, + "loss": 0.919, + "step": 3321 + }, + { + "epoch": 3.0931098696461823, + "grad_norm": 1.7550840377807617, + "learning_rate": 2.438105460705803e-06, + "loss": 0.951, + "step": 3322 + }, + { + "epoch": 3.0940409683426444, + "grad_norm": 1.6968220472335815, + "learning_rate": 2.4368678305432182e-06, + "loss": 0.9516, + "step": 3323 + }, + { + "epoch": 3.094972067039106, + "grad_norm": 1.6675012111663818, + "learning_rate": 2.435630215862529e-06, + "loss": 0.8952, + "step": 3324 + }, + { + "epoch": 3.095903165735568, + "grad_norm": 1.6590911149978638, + "learning_rate": 2.434392616967234e-06, + "loss": 0.9139, + "step": 3325 + }, + { + "epoch": 3.09683426443203, + "grad_norm": 1.7370678186416626, + "learning_rate": 2.4331550341608304e-06, + "loss": 0.9367, + "step": 3326 + }, + { + "epoch": 3.0977653631284916, + "grad_norm": 1.6874971389770508, + "learning_rate": 2.431917467746809e-06, + "loss": 0.9108, + "step": 3327 + }, + { + "epoch": 3.0986964618249533, + "grad_norm": 1.65764582157135, + "learning_rate": 2.430679918028659e-06, + "loss": 0.8995, + "step": 3328 + }, + { + "epoch": 3.0996275605214154, + "grad_norm": 1.7074867486953735, + "learning_rate": 2.4294423853098653e-06, + "loss": 0.9226, + "step": 3329 + }, + { + "epoch": 3.100558659217877, + "grad_norm": 1.715795874595642, + "learning_rate": 2.4282048698939066e-06, + "loss": 0.9263, + "step": 3330 + }, + { + "epoch": 3.101489757914339, + "grad_norm": 1.6853233575820923, + "learning_rate": 2.426967372084258e-06, + "loss": 0.9209, + "step": 3331 + }, + { + "epoch": 3.102420856610801, + "grad_norm": 1.7638869285583496, + "learning_rate": 2.4257298921843935e-06, + "loss": 0.9018, + "step": 3332 + }, + { + "epoch": 3.1033519553072626, + "grad_norm": 1.7023661136627197, + "learning_rate": 2.4244924304977785e-06, + "loss": 0.8811, + "step": 3333 + }, + { + "epoch": 3.1042830540037243, + "grad_norm": 1.6923280954360962, + "learning_rate": 2.423254987327875e-06, + "loss": 0.9129, + "step": 3334 + }, + { + "epoch": 3.1052141527001864, + "grad_norm": 1.7066457271575928, + "learning_rate": 2.4220175629781425e-06, + "loss": 0.9022, + "step": 3335 + }, + { + "epoch": 3.106145251396648, + "grad_norm": 1.6826326847076416, + "learning_rate": 2.4207801577520345e-06, + "loss": 0.9095, + "step": 3336 + }, + { + "epoch": 3.10707635009311, + "grad_norm": 1.756100058555603, + "learning_rate": 2.4195427719529977e-06, + "loss": 0.9269, + "step": 3337 + }, + { + "epoch": 3.1080074487895715, + "grad_norm": 1.6395617723464966, + "learning_rate": 2.4183054058844775e-06, + "loss": 0.8798, + "step": 3338 + }, + { + "epoch": 3.1089385474860336, + "grad_norm": 1.6417088508605957, + "learning_rate": 2.4170680598499137e-06, + "loss": 0.9377, + "step": 3339 + }, + { + "epoch": 3.1098696461824953, + "grad_norm": 1.7448179721832275, + "learning_rate": 2.4158307341527396e-06, + "loss": 0.9021, + "step": 3340 + }, + { + "epoch": 3.110800744878957, + "grad_norm": 1.7119693756103516, + "learning_rate": 2.4145934290963842e-06, + "loss": 0.9027, + "step": 3341 + }, + { + "epoch": 3.111731843575419, + "grad_norm": 1.646247148513794, + "learning_rate": 2.413356144984272e-06, + "loss": 0.8591, + "step": 3342 + }, + { + "epoch": 3.112662942271881, + "grad_norm": 1.7105697393417358, + "learning_rate": 2.412118882119823e-06, + "loss": 0.91, + "step": 3343 + }, + { + "epoch": 3.1135940409683425, + "grad_norm": 1.656224250793457, + "learning_rate": 2.410881640806449e-06, + "loss": 0.9112, + "step": 3344 + }, + { + "epoch": 3.1145251396648046, + "grad_norm": 1.6878925561904907, + "learning_rate": 2.4096444213475595e-06, + "loss": 0.9162, + "step": 3345 + }, + { + "epoch": 3.1154562383612663, + "grad_norm": 1.707326889038086, + "learning_rate": 2.4084072240465585e-06, + "loss": 0.9485, + "step": 3346 + }, + { + "epoch": 3.116387337057728, + "grad_norm": 1.7247391939163208, + "learning_rate": 2.407170049206843e-06, + "loss": 0.8914, + "step": 3347 + }, + { + "epoch": 3.11731843575419, + "grad_norm": 1.6792007684707642, + "learning_rate": 2.4059328971318054e-06, + "loss": 0.926, + "step": 3348 + }, + { + "epoch": 3.118249534450652, + "grad_norm": 1.7814252376556396, + "learning_rate": 2.4046957681248314e-06, + "loss": 0.8685, + "step": 3349 + }, + { + "epoch": 3.1191806331471135, + "grad_norm": 1.6012327671051025, + "learning_rate": 2.403458662489304e-06, + "loss": 0.8687, + "step": 3350 + }, + { + "epoch": 3.1201117318435756, + "grad_norm": 1.6993153095245361, + "learning_rate": 2.4022215805285973e-06, + "loss": 0.9373, + "step": 3351 + }, + { + "epoch": 3.1210428305400373, + "grad_norm": 1.707701325416565, + "learning_rate": 2.4009845225460795e-06, + "loss": 0.923, + "step": 3352 + }, + { + "epoch": 3.121973929236499, + "grad_norm": 1.6501392126083374, + "learning_rate": 2.3997474888451165e-06, + "loss": 0.8701, + "step": 3353 + }, + { + "epoch": 3.122905027932961, + "grad_norm": 1.6571564674377441, + "learning_rate": 2.3985104797290654e-06, + "loss": 0.8881, + "step": 3354 + }, + { + "epoch": 3.123836126629423, + "grad_norm": 1.6818349361419678, + "learning_rate": 2.3972734955012766e-06, + "loss": 0.8899, + "step": 3355 + }, + { + "epoch": 3.1247672253258845, + "grad_norm": 1.6249483823776245, + "learning_rate": 2.396036536465096e-06, + "loss": 0.8891, + "step": 3356 + }, + { + "epoch": 3.1256983240223466, + "grad_norm": 1.73823881149292, + "learning_rate": 2.394799602923864e-06, + "loss": 0.9463, + "step": 3357 + }, + { + "epoch": 3.1266294227188083, + "grad_norm": 1.72157621383667, + "learning_rate": 2.393562695180913e-06, + "loss": 0.8692, + "step": 3358 + }, + { + "epoch": 3.12756052141527, + "grad_norm": 1.64155912399292, + "learning_rate": 2.3923258135395688e-06, + "loss": 0.8962, + "step": 3359 + }, + { + "epoch": 3.1284916201117317, + "grad_norm": 1.783261775970459, + "learning_rate": 2.3910889583031533e-06, + "loss": 0.9376, + "step": 3360 + }, + { + "epoch": 3.129422718808194, + "grad_norm": 1.684281349182129, + "learning_rate": 2.3898521297749785e-06, + "loss": 0.8981, + "step": 3361 + }, + { + "epoch": 3.1303538175046555, + "grad_norm": 1.691046118736267, + "learning_rate": 2.388615328258354e-06, + "loss": 0.9489, + "step": 3362 + }, + { + "epoch": 3.131284916201117, + "grad_norm": 1.683213710784912, + "learning_rate": 2.387378554056578e-06, + "loss": 0.9331, + "step": 3363 + }, + { + "epoch": 3.1322160148975793, + "grad_norm": 1.6740621328353882, + "learning_rate": 2.3861418074729477e-06, + "loss": 0.9037, + "step": 3364 + }, + { + "epoch": 3.133147113594041, + "grad_norm": 1.697553277015686, + "learning_rate": 2.384905088810747e-06, + "loss": 0.9186, + "step": 3365 + }, + { + "epoch": 3.1340782122905027, + "grad_norm": 1.6715878248214722, + "learning_rate": 2.3836683983732583e-06, + "loss": 0.9397, + "step": 3366 + }, + { + "epoch": 3.135009310986965, + "grad_norm": 1.7044919729232788, + "learning_rate": 2.382431736463755e-06, + "loss": 0.9084, + "step": 3367 + }, + { + "epoch": 3.1359404096834265, + "grad_norm": 1.687350869178772, + "learning_rate": 2.3811951033855015e-06, + "loss": 0.9025, + "step": 3368 + }, + { + "epoch": 3.136871508379888, + "grad_norm": 1.6968352794647217, + "learning_rate": 2.3799584994417594e-06, + "loss": 0.8827, + "step": 3369 + }, + { + "epoch": 3.1378026070763503, + "grad_norm": 1.6833912134170532, + "learning_rate": 2.3787219249357805e-06, + "loss": 0.945, + "step": 3370 + }, + { + "epoch": 3.138733705772812, + "grad_norm": 1.6953020095825195, + "learning_rate": 2.3774853801708097e-06, + "loss": 0.949, + "step": 3371 + }, + { + "epoch": 3.1396648044692737, + "grad_norm": 1.7083728313446045, + "learning_rate": 2.3762488654500836e-06, + "loss": 0.8975, + "step": 3372 + }, + { + "epoch": 3.1405959031657353, + "grad_norm": 1.6619200706481934, + "learning_rate": 2.3750123810768344e-06, + "loss": 0.8989, + "step": 3373 + }, + { + "epoch": 3.1415270018621975, + "grad_norm": 1.684128761291504, + "learning_rate": 2.3737759273542843e-06, + "loss": 0.9292, + "step": 3374 + }, + { + "epoch": 3.142458100558659, + "grad_norm": 1.6813548803329468, + "learning_rate": 2.372539504585648e-06, + "loss": 0.9007, + "step": 3375 + }, + { + "epoch": 3.143389199255121, + "grad_norm": 1.667564034461975, + "learning_rate": 2.371303113074134e-06, + "loss": 0.8933, + "step": 3376 + }, + { + "epoch": 3.144320297951583, + "grad_norm": 1.7173337936401367, + "learning_rate": 2.370066753122942e-06, + "loss": 0.904, + "step": 3377 + }, + { + "epoch": 3.1452513966480447, + "grad_norm": 1.646213173866272, + "learning_rate": 2.368830425035266e-06, + "loss": 0.8946, + "step": 3378 + }, + { + "epoch": 3.1461824953445063, + "grad_norm": 1.709409236907959, + "learning_rate": 2.367594129114288e-06, + "loss": 0.9175, + "step": 3379 + }, + { + "epoch": 3.1471135940409685, + "grad_norm": 1.6985713243484497, + "learning_rate": 2.366357865663186e-06, + "loss": 0.9487, + "step": 3380 + }, + { + "epoch": 3.14804469273743, + "grad_norm": 1.693822979927063, + "learning_rate": 2.3651216349851297e-06, + "loss": 0.9266, + "step": 3381 + }, + { + "epoch": 3.148975791433892, + "grad_norm": 1.6887474060058594, + "learning_rate": 2.363885437383278e-06, + "loss": 0.921, + "step": 3382 + }, + { + "epoch": 3.149906890130354, + "grad_norm": 1.6518738269805908, + "learning_rate": 2.362649273160784e-06, + "loss": 0.8776, + "step": 3383 + }, + { + "epoch": 3.1508379888268156, + "grad_norm": 1.7738643884658813, + "learning_rate": 2.361413142620793e-06, + "loss": 0.9344, + "step": 3384 + }, + { + "epoch": 3.1517690875232773, + "grad_norm": 1.6904795169830322, + "learning_rate": 2.3601770460664415e-06, + "loss": 0.9688, + "step": 3385 + }, + { + "epoch": 3.1527001862197395, + "grad_norm": 1.626943588256836, + "learning_rate": 2.358940983800855e-06, + "loss": 0.8789, + "step": 3386 + }, + { + "epoch": 3.153631284916201, + "grad_norm": 1.6936602592468262, + "learning_rate": 2.3577049561271545e-06, + "loss": 0.9073, + "step": 3387 + }, + { + "epoch": 3.154562383612663, + "grad_norm": 1.6696503162384033, + "learning_rate": 2.3564689633484515e-06, + "loss": 0.8763, + "step": 3388 + }, + { + "epoch": 3.155493482309125, + "grad_norm": 1.7532196044921875, + "learning_rate": 2.3552330057678473e-06, + "loss": 0.9078, + "step": 3389 + }, + { + "epoch": 3.1564245810055866, + "grad_norm": 1.7158058881759644, + "learning_rate": 2.353997083688435e-06, + "loss": 0.8885, + "step": 3390 + }, + { + "epoch": 3.1573556797020483, + "grad_norm": 1.6331466436386108, + "learning_rate": 2.3527611974133016e-06, + "loss": 0.9263, + "step": 3391 + }, + { + "epoch": 3.1582867783985105, + "grad_norm": 1.708320140838623, + "learning_rate": 2.351525347245522e-06, + "loss": 0.9185, + "step": 3392 + }, + { + "epoch": 3.159217877094972, + "grad_norm": 1.6720097064971924, + "learning_rate": 2.3502895334881635e-06, + "loss": 0.9187, + "step": 3393 + }, + { + "epoch": 3.160148975791434, + "grad_norm": 1.7563456296920776, + "learning_rate": 2.349053756444285e-06, + "loss": 0.9286, + "step": 3394 + }, + { + "epoch": 3.1610800744878955, + "grad_norm": 1.7602083683013916, + "learning_rate": 2.3478180164169366e-06, + "loss": 0.9178, + "step": 3395 + }, + { + "epoch": 3.1620111731843576, + "grad_norm": 1.681713342666626, + "learning_rate": 2.3465823137091574e-06, + "loss": 0.9244, + "step": 3396 + }, + { + "epoch": 3.1629422718808193, + "grad_norm": 1.6967233419418335, + "learning_rate": 2.3453466486239783e-06, + "loss": 0.9221, + "step": 3397 + }, + { + "epoch": 3.163873370577281, + "grad_norm": 1.7396472692489624, + "learning_rate": 2.3441110214644246e-06, + "loss": 0.917, + "step": 3398 + }, + { + "epoch": 3.164804469273743, + "grad_norm": 1.7534763813018799, + "learning_rate": 2.3428754325335044e-06, + "loss": 0.9176, + "step": 3399 + }, + { + "epoch": 3.165735567970205, + "grad_norm": 1.7043342590332031, + "learning_rate": 2.341639882134224e-06, + "loss": 0.9037, + "step": 3400 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 1.685696005821228, + "learning_rate": 2.340404370569576e-06, + "loss": 0.904, + "step": 3401 + }, + { + "epoch": 3.1675977653631286, + "grad_norm": 1.7363678216934204, + "learning_rate": 2.3391688981425464e-06, + "loss": 0.9145, + "step": 3402 + }, + { + "epoch": 3.1685288640595903, + "grad_norm": 1.6908007860183716, + "learning_rate": 2.337933465156108e-06, + "loss": 0.8958, + "step": 3403 + }, + { + "epoch": 3.169459962756052, + "grad_norm": 1.7950772047042847, + "learning_rate": 2.3366980719132268e-06, + "loss": 0.915, + "step": 3404 + }, + { + "epoch": 3.170391061452514, + "grad_norm": 1.7826727628707886, + "learning_rate": 2.3354627187168584e-06, + "loss": 0.8764, + "step": 3405 + }, + { + "epoch": 3.171322160148976, + "grad_norm": 1.6985899209976196, + "learning_rate": 2.3342274058699475e-06, + "loss": 0.9165, + "step": 3406 + }, + { + "epoch": 3.1722532588454375, + "grad_norm": 1.7099257707595825, + "learning_rate": 2.33299213367543e-06, + "loss": 0.877, + "step": 3407 + }, + { + "epoch": 3.1731843575418996, + "grad_norm": 1.7201907634735107, + "learning_rate": 2.3317569024362317e-06, + "loss": 0.9108, + "step": 3408 + }, + { + "epoch": 3.1741154562383613, + "grad_norm": 1.7022032737731934, + "learning_rate": 2.3305217124552696e-06, + "loss": 0.89, + "step": 3409 + }, + { + "epoch": 3.175046554934823, + "grad_norm": 1.7690213918685913, + "learning_rate": 2.329286564035446e-06, + "loss": 0.9562, + "step": 3410 + }, + { + "epoch": 3.1759776536312847, + "grad_norm": 1.7318713665008545, + "learning_rate": 2.3280514574796593e-06, + "loss": 0.9034, + "step": 3411 + }, + { + "epoch": 3.176908752327747, + "grad_norm": 1.6821590662002563, + "learning_rate": 2.3268163930907934e-06, + "loss": 0.9395, + "step": 3412 + }, + { + "epoch": 3.1778398510242085, + "grad_norm": 1.6919987201690674, + "learning_rate": 2.3255813711717216e-06, + "loss": 0.8768, + "step": 3413 + }, + { + "epoch": 3.17877094972067, + "grad_norm": 1.6836568117141724, + "learning_rate": 2.3243463920253103e-06, + "loss": 0.9124, + "step": 3414 + }, + { + "epoch": 3.1797020484171323, + "grad_norm": 1.7388001680374146, + "learning_rate": 2.3231114559544117e-06, + "loss": 0.9167, + "step": 3415 + }, + { + "epoch": 3.180633147113594, + "grad_norm": 1.7960084676742554, + "learning_rate": 2.321876563261871e-06, + "loss": 0.9004, + "step": 3416 + }, + { + "epoch": 3.1815642458100557, + "grad_norm": 1.6765440702438354, + "learning_rate": 2.3206417142505187e-06, + "loss": 0.9384, + "step": 3417 + }, + { + "epoch": 3.182495344506518, + "grad_norm": 1.6785004138946533, + "learning_rate": 2.3194069092231777e-06, + "loss": 0.9173, + "step": 3418 + }, + { + "epoch": 3.1834264432029795, + "grad_norm": 1.6729774475097656, + "learning_rate": 2.318172148482659e-06, + "loss": 0.892, + "step": 3419 + }, + { + "epoch": 3.184357541899441, + "grad_norm": 1.678394079208374, + "learning_rate": 2.316937432331762e-06, + "loss": 0.9129, + "step": 3420 + }, + { + "epoch": 3.1852886405959033, + "grad_norm": 1.6302787065505981, + "learning_rate": 2.3157027610732775e-06, + "loss": 0.8923, + "step": 3421 + }, + { + "epoch": 3.186219739292365, + "grad_norm": 1.7189890146255493, + "learning_rate": 2.3144681350099837e-06, + "loss": 0.9103, + "step": 3422 + }, + { + "epoch": 3.1871508379888267, + "grad_norm": 1.6748616695404053, + "learning_rate": 2.3132335544446462e-06, + "loss": 0.8764, + "step": 3423 + }, + { + "epoch": 3.188081936685289, + "grad_norm": 1.7060880661010742, + "learning_rate": 2.3119990196800218e-06, + "loss": 0.9425, + "step": 3424 + }, + { + "epoch": 3.1890130353817505, + "grad_norm": 1.6806586980819702, + "learning_rate": 2.3107645310188555e-06, + "loss": 0.9121, + "step": 3425 + }, + { + "epoch": 3.189944134078212, + "grad_norm": 1.674682378768921, + "learning_rate": 2.309530088763882e-06, + "loss": 0.8912, + "step": 3426 + }, + { + "epoch": 3.1908752327746743, + "grad_norm": 1.7024798393249512, + "learning_rate": 2.3082956932178212e-06, + "loss": 0.9466, + "step": 3427 + }, + { + "epoch": 3.191806331471136, + "grad_norm": 1.6897104978561401, + "learning_rate": 2.3070613446833843e-06, + "loss": 0.8994, + "step": 3428 + }, + { + "epoch": 3.1927374301675977, + "grad_norm": 1.6608905792236328, + "learning_rate": 2.305827043463272e-06, + "loss": 0.8684, + "step": 3429 + }, + { + "epoch": 3.1936685288640594, + "grad_norm": 1.7487759590148926, + "learning_rate": 2.3045927898601703e-06, + "loss": 0.937, + "step": 3430 + }, + { + "epoch": 3.1945996275605215, + "grad_norm": 1.7491610050201416, + "learning_rate": 2.303358584176755e-06, + "loss": 0.8712, + "step": 3431 + }, + { + "epoch": 3.195530726256983, + "grad_norm": 1.7019641399383545, + "learning_rate": 2.302124426715691e-06, + "loss": 0.8719, + "step": 3432 + }, + { + "epoch": 3.196461824953445, + "grad_norm": 1.7703745365142822, + "learning_rate": 2.3008903177796318e-06, + "loss": 0.8891, + "step": 3433 + }, + { + "epoch": 3.197392923649907, + "grad_norm": 1.6748254299163818, + "learning_rate": 2.2996562576712145e-06, + "loss": 0.8984, + "step": 3434 + }, + { + "epoch": 3.1983240223463687, + "grad_norm": 1.711982250213623, + "learning_rate": 2.2984222466930698e-06, + "loss": 0.8878, + "step": 3435 + }, + { + "epoch": 3.1992551210428304, + "grad_norm": 1.6686856746673584, + "learning_rate": 2.2971882851478144e-06, + "loss": 0.886, + "step": 3436 + }, + { + "epoch": 3.2001862197392925, + "grad_norm": 1.7037968635559082, + "learning_rate": 2.2959543733380514e-06, + "loss": 0.8974, + "step": 3437 + }, + { + "epoch": 3.201117318435754, + "grad_norm": 1.713139295578003, + "learning_rate": 2.294720511566373e-06, + "loss": 0.9088, + "step": 3438 + }, + { + "epoch": 3.202048417132216, + "grad_norm": 1.6898949146270752, + "learning_rate": 2.293486700135358e-06, + "loss": 0.8976, + "step": 3439 + }, + { + "epoch": 3.202979515828678, + "grad_norm": 1.6984347105026245, + "learning_rate": 2.292252939347577e-06, + "loss": 0.8946, + "step": 3440 + }, + { + "epoch": 3.2039106145251397, + "grad_norm": 1.6554429531097412, + "learning_rate": 2.2910192295055825e-06, + "loss": 0.8954, + "step": 3441 + }, + { + "epoch": 3.2048417132216014, + "grad_norm": 1.7220181226730347, + "learning_rate": 2.2897855709119166e-06, + "loss": 0.9305, + "step": 3442 + }, + { + "epoch": 3.2057728119180635, + "grad_norm": 1.739747405052185, + "learning_rate": 2.288551963869112e-06, + "loss": 0.9027, + "step": 3443 + }, + { + "epoch": 3.206703910614525, + "grad_norm": 1.7780839204788208, + "learning_rate": 2.2873184086796825e-06, + "loss": 0.9107, + "step": 3444 + }, + { + "epoch": 3.207635009310987, + "grad_norm": 1.7119289636611938, + "learning_rate": 2.2860849056461347e-06, + "loss": 0.8865, + "step": 3445 + }, + { + "epoch": 3.2085661080074486, + "grad_norm": 1.7334108352661133, + "learning_rate": 2.28485145507096e-06, + "loss": 0.9614, + "step": 3446 + }, + { + "epoch": 3.2094972067039107, + "grad_norm": 1.858974575996399, + "learning_rate": 2.283618057256638e-06, + "loss": 0.8966, + "step": 3447 + }, + { + "epoch": 3.2104283054003724, + "grad_norm": 1.789520025253296, + "learning_rate": 2.282384712505634e-06, + "loss": 0.9359, + "step": 3448 + }, + { + "epoch": 3.211359404096834, + "grad_norm": 1.6848711967468262, + "learning_rate": 2.2811514211204004e-06, + "loss": 0.8967, + "step": 3449 + }, + { + "epoch": 3.212290502793296, + "grad_norm": 1.7142305374145508, + "learning_rate": 2.27991818340338e-06, + "loss": 0.9115, + "step": 3450 + }, + { + "epoch": 3.213221601489758, + "grad_norm": 1.6918210983276367, + "learning_rate": 2.278684999656995e-06, + "loss": 0.8723, + "step": 3451 + }, + { + "epoch": 3.2141527001862196, + "grad_norm": 1.7474030256271362, + "learning_rate": 2.277451870183662e-06, + "loss": 0.9524, + "step": 3452 + }, + { + "epoch": 3.2150837988826817, + "grad_norm": 1.7189279794692993, + "learning_rate": 2.27621879528578e-06, + "loss": 0.9311, + "step": 3453 + }, + { + "epoch": 3.2160148975791434, + "grad_norm": 1.6171016693115234, + "learning_rate": 2.274985775265737e-06, + "loss": 0.9057, + "step": 3454 + }, + { + "epoch": 3.216945996275605, + "grad_norm": 1.7530337572097778, + "learning_rate": 2.273752810425906e-06, + "loss": 0.9257, + "step": 3455 + }, + { + "epoch": 3.217877094972067, + "grad_norm": 1.7092812061309814, + "learning_rate": 2.2725199010686456e-06, + "loss": 0.8976, + "step": 3456 + }, + { + "epoch": 3.218808193668529, + "grad_norm": 1.6773228645324707, + "learning_rate": 2.2712870474963036e-06, + "loss": 0.9133, + "step": 3457 + }, + { + "epoch": 3.2197392923649906, + "grad_norm": 1.6792818307876587, + "learning_rate": 2.270054250011211e-06, + "loss": 0.9434, + "step": 3458 + }, + { + "epoch": 3.2206703910614527, + "grad_norm": 1.7077932357788086, + "learning_rate": 2.2688215089156874e-06, + "loss": 0.9152, + "step": 3459 + }, + { + "epoch": 3.2216014897579144, + "grad_norm": 1.6712348461151123, + "learning_rate": 2.2675888245120384e-06, + "loss": 0.9235, + "step": 3460 + }, + { + "epoch": 3.222532588454376, + "grad_norm": 1.6330510377883911, + "learning_rate": 2.266356197102554e-06, + "loss": 0.9006, + "step": 3461 + }, + { + "epoch": 3.223463687150838, + "grad_norm": 1.6948401927947998, + "learning_rate": 2.265123626989511e-06, + "loss": 0.9428, + "step": 3462 + }, + { + "epoch": 3.2243947858473, + "grad_norm": 1.68091881275177, + "learning_rate": 2.2638911144751734e-06, + "loss": 0.9243, + "step": 3463 + }, + { + "epoch": 3.2253258845437616, + "grad_norm": 1.7108176946640015, + "learning_rate": 2.262658659861791e-06, + "loss": 0.938, + "step": 3464 + }, + { + "epoch": 3.2262569832402237, + "grad_norm": 1.680197834968567, + "learning_rate": 2.2614262634515953e-06, + "loss": 0.9002, + "step": 3465 + }, + { + "epoch": 3.2271880819366854, + "grad_norm": 1.7151423692703247, + "learning_rate": 2.26019392554681e-06, + "loss": 0.9125, + "step": 3466 + }, + { + "epoch": 3.228119180633147, + "grad_norm": 1.6826694011688232, + "learning_rate": 2.25896164644964e-06, + "loss": 0.9196, + "step": 3467 + }, + { + "epoch": 3.2290502793296088, + "grad_norm": 1.7032185792922974, + "learning_rate": 2.2577294264622765e-06, + "loss": 0.9164, + "step": 3468 + }, + { + "epoch": 3.229981378026071, + "grad_norm": 1.716027021408081, + "learning_rate": 2.256497265886896e-06, + "loss": 0.9518, + "step": 3469 + }, + { + "epoch": 3.2309124767225326, + "grad_norm": 1.7424236536026, + "learning_rate": 2.2552651650256634e-06, + "loss": 0.959, + "step": 3470 + }, + { + "epoch": 3.2318435754189943, + "grad_norm": 1.6295639276504517, + "learning_rate": 2.254033124180725e-06, + "loss": 0.8792, + "step": 3471 + }, + { + "epoch": 3.2327746741154564, + "grad_norm": 1.6773732900619507, + "learning_rate": 2.2528011436542142e-06, + "loss": 0.8981, + "step": 3472 + }, + { + "epoch": 3.233705772811918, + "grad_norm": 1.7503982782363892, + "learning_rate": 2.251569223748249e-06, + "loss": 0.9512, + "step": 3473 + }, + { + "epoch": 3.2346368715083798, + "grad_norm": 1.7218401432037354, + "learning_rate": 2.250337364764935e-06, + "loss": 0.9261, + "step": 3474 + }, + { + "epoch": 3.235567970204842, + "grad_norm": 1.6968414783477783, + "learning_rate": 2.2491055670063584e-06, + "loss": 0.8908, + "step": 3475 + }, + { + "epoch": 3.2364990689013036, + "grad_norm": 1.7379018068313599, + "learning_rate": 2.2478738307745937e-06, + "loss": 0.9323, + "step": 3476 + }, + { + "epoch": 3.2374301675977653, + "grad_norm": 1.6510965824127197, + "learning_rate": 2.2466421563717e-06, + "loss": 0.8853, + "step": 3477 + }, + { + "epoch": 3.2383612662942274, + "grad_norm": 1.7129486799240112, + "learning_rate": 2.24541054409972e-06, + "loss": 0.9482, + "step": 3478 + }, + { + "epoch": 3.239292364990689, + "grad_norm": 1.749428629875183, + "learning_rate": 2.2441789942606827e-06, + "loss": 0.9357, + "step": 3479 + }, + { + "epoch": 3.2402234636871508, + "grad_norm": 1.6841530799865723, + "learning_rate": 2.242947507156599e-06, + "loss": 0.9128, + "step": 3480 + }, + { + "epoch": 3.2411545623836124, + "grad_norm": 1.700286626815796, + "learning_rate": 2.2417160830894688e-06, + "loss": 0.8929, + "step": 3481 + }, + { + "epoch": 3.2420856610800746, + "grad_norm": 1.7240166664123535, + "learning_rate": 2.2404847223612725e-06, + "loss": 0.9307, + "step": 3482 + }, + { + "epoch": 3.2430167597765363, + "grad_norm": 1.7347228527069092, + "learning_rate": 2.239253425273976e-06, + "loss": 0.8893, + "step": 3483 + }, + { + "epoch": 3.243947858472998, + "grad_norm": 1.7174723148345947, + "learning_rate": 2.238022192129532e-06, + "loss": 0.8987, + "step": 3484 + }, + { + "epoch": 3.24487895716946, + "grad_norm": 1.6831586360931396, + "learning_rate": 2.236791023229875e-06, + "loss": 0.902, + "step": 3485 + }, + { + "epoch": 3.2458100558659218, + "grad_norm": 1.7524018287658691, + "learning_rate": 2.235559918876924e-06, + "loss": 0.957, + "step": 3486 + }, + { + "epoch": 3.2467411545623834, + "grad_norm": 1.7050849199295044, + "learning_rate": 2.2343288793725816e-06, + "loss": 0.9503, + "step": 3487 + }, + { + "epoch": 3.2476722532588456, + "grad_norm": 1.6524884700775146, + "learning_rate": 2.233097905018738e-06, + "loss": 0.9114, + "step": 3488 + }, + { + "epoch": 3.2486033519553073, + "grad_norm": 1.7306863069534302, + "learning_rate": 2.2318669961172627e-06, + "loss": 0.9093, + "step": 3489 + }, + { + "epoch": 3.249534450651769, + "grad_norm": 1.7213823795318604, + "learning_rate": 2.2306361529700125e-06, + "loss": 0.9007, + "step": 3490 + }, + { + "epoch": 3.250465549348231, + "grad_norm": 1.7477878332138062, + "learning_rate": 2.2294053758788267e-06, + "loss": 0.9367, + "step": 3491 + }, + { + "epoch": 3.2513966480446927, + "grad_norm": 1.6408740282058716, + "learning_rate": 2.2281746651455275e-06, + "loss": 0.8728, + "step": 3492 + }, + { + "epoch": 3.2523277467411544, + "grad_norm": 1.7176867723464966, + "learning_rate": 2.2269440210719234e-06, + "loss": 0.9092, + "step": 3493 + }, + { + "epoch": 3.2532588454376166, + "grad_norm": 1.7315559387207031, + "learning_rate": 2.2257134439598043e-06, + "loss": 0.9237, + "step": 3494 + }, + { + "epoch": 3.2541899441340782, + "grad_norm": 1.6877254247665405, + "learning_rate": 2.2244829341109463e-06, + "loss": 0.8952, + "step": 3495 + }, + { + "epoch": 3.25512104283054, + "grad_norm": 1.6631169319152832, + "learning_rate": 2.2232524918271036e-06, + "loss": 0.9074, + "step": 3496 + }, + { + "epoch": 3.256052141527002, + "grad_norm": 1.7043720483779907, + "learning_rate": 2.2220221174100204e-06, + "loss": 0.9091, + "step": 3497 + }, + { + "epoch": 3.2569832402234637, + "grad_norm": 1.7908573150634766, + "learning_rate": 2.220791811161421e-06, + "loss": 0.9532, + "step": 3498 + }, + { + "epoch": 3.2579143389199254, + "grad_norm": 1.732521891593933, + "learning_rate": 2.219561573383011e-06, + "loss": 0.9163, + "step": 3499 + }, + { + "epoch": 3.2588454376163876, + "grad_norm": 1.6356511116027832, + "learning_rate": 2.218331404376484e-06, + "loss": 0.8871, + "step": 3500 + }, + { + "epoch": 3.2597765363128492, + "grad_norm": 1.6799367666244507, + "learning_rate": 2.2171013044435132e-06, + "loss": 0.8903, + "step": 3501 + }, + { + "epoch": 3.260707635009311, + "grad_norm": 1.6938735246658325, + "learning_rate": 2.2158712738857564e-06, + "loss": 0.9461, + "step": 3502 + }, + { + "epoch": 3.2616387337057726, + "grad_norm": 1.7129855155944824, + "learning_rate": 2.2146413130048524e-06, + "loss": 0.9481, + "step": 3503 + }, + { + "epoch": 3.2625698324022347, + "grad_norm": 1.6919702291488647, + "learning_rate": 2.213411422102426e-06, + "loss": 0.9585, + "step": 3504 + }, + { + "epoch": 3.2635009310986964, + "grad_norm": 1.6424874067306519, + "learning_rate": 2.212181601480083e-06, + "loss": 0.9107, + "step": 3505 + }, + { + "epoch": 3.264432029795158, + "grad_norm": 1.6698466539382935, + "learning_rate": 2.210951851439411e-06, + "loss": 0.9099, + "step": 3506 + }, + { + "epoch": 3.2653631284916202, + "grad_norm": 1.6773866415023804, + "learning_rate": 2.2097221722819817e-06, + "loss": 0.9162, + "step": 3507 + }, + { + "epoch": 3.266294227188082, + "grad_norm": 1.744491457939148, + "learning_rate": 2.2084925643093502e-06, + "loss": 0.9262, + "step": 3508 + }, + { + "epoch": 3.2672253258845436, + "grad_norm": 1.7049018144607544, + "learning_rate": 2.207263027823053e-06, + "loss": 0.9251, + "step": 3509 + }, + { + "epoch": 3.2681564245810057, + "grad_norm": 1.6969716548919678, + "learning_rate": 2.206033563124608e-06, + "loss": 0.9365, + "step": 3510 + }, + { + "epoch": 3.2690875232774674, + "grad_norm": 1.697211742401123, + "learning_rate": 2.2048041705155175e-06, + "loss": 0.8812, + "step": 3511 + }, + { + "epoch": 3.270018621973929, + "grad_norm": 1.7172445058822632, + "learning_rate": 2.2035748502972658e-06, + "loss": 0.8945, + "step": 3512 + }, + { + "epoch": 3.2709497206703912, + "grad_norm": 1.645365834236145, + "learning_rate": 2.202345602771318e-06, + "loss": 0.8789, + "step": 3513 + }, + { + "epoch": 3.271880819366853, + "grad_norm": 1.6403355598449707, + "learning_rate": 2.2011164282391223e-06, + "loss": 0.8674, + "step": 3514 + }, + { + "epoch": 3.2728119180633146, + "grad_norm": 1.6744697093963623, + "learning_rate": 2.1998873270021097e-06, + "loss": 0.9114, + "step": 3515 + }, + { + "epoch": 3.2737430167597763, + "grad_norm": 1.7862238883972168, + "learning_rate": 2.1986582993616926e-06, + "loss": 0.9383, + "step": 3516 + }, + { + "epoch": 3.2746741154562384, + "grad_norm": 1.7182978391647339, + "learning_rate": 2.197429345619265e-06, + "loss": 0.8933, + "step": 3517 + }, + { + "epoch": 3.2756052141527, + "grad_norm": 1.748374104499817, + "learning_rate": 2.1962004660762025e-06, + "loss": 0.9279, + "step": 3518 + }, + { + "epoch": 3.276536312849162, + "grad_norm": 1.7625303268432617, + "learning_rate": 2.194971661033864e-06, + "loss": 0.9081, + "step": 3519 + }, + { + "epoch": 3.277467411545624, + "grad_norm": 1.655259370803833, + "learning_rate": 2.193742930793589e-06, + "loss": 0.8837, + "step": 3520 + }, + { + "epoch": 3.2783985102420856, + "grad_norm": 1.6879080533981323, + "learning_rate": 2.192514275656698e-06, + "loss": 0.896, + "step": 3521 + }, + { + "epoch": 3.2793296089385473, + "grad_norm": 1.6889163255691528, + "learning_rate": 2.1912856959244958e-06, + "loss": 0.8757, + "step": 3522 + }, + { + "epoch": 3.2802607076350094, + "grad_norm": 1.6968783140182495, + "learning_rate": 2.190057191898265e-06, + "loss": 0.938, + "step": 3523 + }, + { + "epoch": 3.281191806331471, + "grad_norm": 1.696675181388855, + "learning_rate": 2.1888287638792722e-06, + "loss": 0.9038, + "step": 3524 + }, + { + "epoch": 3.282122905027933, + "grad_norm": 1.6698479652404785, + "learning_rate": 2.1876004121687644e-06, + "loss": 0.8684, + "step": 3525 + }, + { + "epoch": 3.283054003724395, + "grad_norm": 1.6960601806640625, + "learning_rate": 2.186372137067972e-06, + "loss": 0.92, + "step": 3526 + }, + { + "epoch": 3.2839851024208566, + "grad_norm": 1.6478750705718994, + "learning_rate": 2.1851439388781017e-06, + "loss": 0.9147, + "step": 3527 + }, + { + "epoch": 3.2849162011173183, + "grad_norm": 1.7124546766281128, + "learning_rate": 2.1839158179003457e-06, + "loss": 0.8868, + "step": 3528 + }, + { + "epoch": 3.2858472998137804, + "grad_norm": 1.6764276027679443, + "learning_rate": 2.182687774435878e-06, + "loss": 0.8683, + "step": 3529 + }, + { + "epoch": 3.286778398510242, + "grad_norm": 1.8640626668930054, + "learning_rate": 2.1814598087858476e-06, + "loss": 0.9044, + "step": 3530 + }, + { + "epoch": 3.287709497206704, + "grad_norm": 1.934227466583252, + "learning_rate": 2.1802319212513913e-06, + "loss": 0.966, + "step": 3531 + }, + { + "epoch": 3.288640595903166, + "grad_norm": 1.7664982080459595, + "learning_rate": 2.1790041121336223e-06, + "loss": 0.9185, + "step": 3532 + }, + { + "epoch": 3.2895716945996276, + "grad_norm": 1.7142821550369263, + "learning_rate": 2.1777763817336384e-06, + "loss": 0.9193, + "step": 3533 + }, + { + "epoch": 3.2905027932960893, + "grad_norm": 1.735909342765808, + "learning_rate": 2.176548730352513e-06, + "loss": 0.9219, + "step": 3534 + }, + { + "epoch": 3.2914338919925514, + "grad_norm": 1.7370494604110718, + "learning_rate": 2.175321158291304e-06, + "loss": 0.9395, + "step": 3535 + }, + { + "epoch": 3.292364990689013, + "grad_norm": 1.7619099617004395, + "learning_rate": 2.17409366585105e-06, + "loss": 0.8983, + "step": 3536 + }, + { + "epoch": 3.293296089385475, + "grad_norm": 1.7213809490203857, + "learning_rate": 2.172866253332766e-06, + "loss": 0.8812, + "step": 3537 + }, + { + "epoch": 3.294227188081937, + "grad_norm": 1.7410892248153687, + "learning_rate": 2.171638921037453e-06, + "loss": 0.9048, + "step": 3538 + }, + { + "epoch": 3.2951582867783986, + "grad_norm": 1.7433421611785889, + "learning_rate": 2.1704116692660872e-06, + "loss": 0.9084, + "step": 3539 + }, + { + "epoch": 3.2960893854748603, + "grad_norm": 1.793975830078125, + "learning_rate": 2.1691844983196302e-06, + "loss": 0.8724, + "step": 3540 + }, + { + "epoch": 3.297020484171322, + "grad_norm": 1.6627880334854126, + "learning_rate": 2.1679574084990184e-06, + "loss": 0.8882, + "step": 3541 + }, + { + "epoch": 3.297951582867784, + "grad_norm": 1.8283196687698364, + "learning_rate": 2.166730400105172e-06, + "loss": 0.9405, + "step": 3542 + }, + { + "epoch": 3.298882681564246, + "grad_norm": 1.6557303667068481, + "learning_rate": 2.1655034734389906e-06, + "loss": 0.8783, + "step": 3543 + }, + { + "epoch": 3.2998137802607075, + "grad_norm": 1.696735143661499, + "learning_rate": 2.1642766288013517e-06, + "loss": 0.8668, + "step": 3544 + }, + { + "epoch": 3.3007448789571696, + "grad_norm": 1.704197883605957, + "learning_rate": 2.1630498664931156e-06, + "loss": 0.9217, + "step": 3545 + }, + { + "epoch": 3.3016759776536313, + "grad_norm": 1.6723155975341797, + "learning_rate": 2.161823186815121e-06, + "loss": 0.906, + "step": 3546 + }, + { + "epoch": 3.302607076350093, + "grad_norm": 1.6761142015457153, + "learning_rate": 2.160596590068187e-06, + "loss": 0.9172, + "step": 3547 + }, + { + "epoch": 3.303538175046555, + "grad_norm": 1.7704648971557617, + "learning_rate": 2.159370076553109e-06, + "loss": 0.8914, + "step": 3548 + }, + { + "epoch": 3.304469273743017, + "grad_norm": 1.6825358867645264, + "learning_rate": 2.158143646570668e-06, + "loss": 0.9232, + "step": 3549 + }, + { + "epoch": 3.3054003724394785, + "grad_norm": 1.7193504571914673, + "learning_rate": 2.15691730042162e-06, + "loss": 0.8553, + "step": 3550 + }, + { + "epoch": 3.30633147113594, + "grad_norm": 1.7455410957336426, + "learning_rate": 2.155691038406701e-06, + "loss": 0.934, + "step": 3551 + }, + { + "epoch": 3.3072625698324023, + "grad_norm": 1.6663957834243774, + "learning_rate": 2.1544648608266282e-06, + "loss": 0.8788, + "step": 3552 + }, + { + "epoch": 3.308193668528864, + "grad_norm": 1.7086024284362793, + "learning_rate": 2.1532387679820967e-06, + "loss": 0.9155, + "step": 3553 + }, + { + "epoch": 3.3091247672253257, + "grad_norm": 1.7731077671051025, + "learning_rate": 2.1520127601737807e-06, + "loss": 0.9755, + "step": 3554 + }, + { + "epoch": 3.310055865921788, + "grad_norm": 1.768649935722351, + "learning_rate": 2.1507868377023337e-06, + "loss": 0.9071, + "step": 3555 + }, + { + "epoch": 3.3109869646182495, + "grad_norm": 1.7404378652572632, + "learning_rate": 2.1495610008683897e-06, + "loss": 0.9018, + "step": 3556 + }, + { + "epoch": 3.311918063314711, + "grad_norm": 1.6915026903152466, + "learning_rate": 2.1483352499725604e-06, + "loss": 0.946, + "step": 3557 + }, + { + "epoch": 3.3128491620111733, + "grad_norm": 1.746828556060791, + "learning_rate": 2.147109585315435e-06, + "loss": 0.9038, + "step": 3558 + }, + { + "epoch": 3.313780260707635, + "grad_norm": 1.7067960500717163, + "learning_rate": 2.145884007197584e-06, + "loss": 0.9339, + "step": 3559 + }, + { + "epoch": 3.3147113594040967, + "grad_norm": 1.7131682634353638, + "learning_rate": 2.144658515919557e-06, + "loss": 0.8702, + "step": 3560 + }, + { + "epoch": 3.315642458100559, + "grad_norm": 1.6957186460494995, + "learning_rate": 2.143433111781879e-06, + "loss": 0.9072, + "step": 3561 + }, + { + "epoch": 3.3165735567970205, + "grad_norm": 1.6971725225448608, + "learning_rate": 2.1422077950850565e-06, + "loss": 0.9189, + "step": 3562 + }, + { + "epoch": 3.317504655493482, + "grad_norm": 1.7515807151794434, + "learning_rate": 2.140982566129575e-06, + "loss": 0.9088, + "step": 3563 + }, + { + "epoch": 3.3184357541899443, + "grad_norm": 1.684130311012268, + "learning_rate": 2.1397574252158954e-06, + "loss": 0.9014, + "step": 3564 + }, + { + "epoch": 3.319366852886406, + "grad_norm": 1.7420741319656372, + "learning_rate": 2.13853237264446e-06, + "loss": 0.924, + "step": 3565 + }, + { + "epoch": 3.3202979515828677, + "grad_norm": 1.6821070909500122, + "learning_rate": 2.1373074087156874e-06, + "loss": 0.9173, + "step": 3566 + }, + { + "epoch": 3.32122905027933, + "grad_norm": 1.7626503705978394, + "learning_rate": 2.1360825337299766e-06, + "loss": 0.9113, + "step": 3567 + }, + { + "epoch": 3.3221601489757915, + "grad_norm": 1.6721113920211792, + "learning_rate": 2.1348577479877025e-06, + "loss": 0.8961, + "step": 3568 + }, + { + "epoch": 3.323091247672253, + "grad_norm": 1.711659550666809, + "learning_rate": 2.1336330517892195e-06, + "loss": 0.9276, + "step": 3569 + }, + { + "epoch": 3.3240223463687153, + "grad_norm": 1.7217745780944824, + "learning_rate": 2.1324084454348592e-06, + "loss": 0.9278, + "step": 3570 + }, + { + "epoch": 3.324953445065177, + "grad_norm": 1.7459384202957153, + "learning_rate": 2.1311839292249335e-06, + "loss": 0.8922, + "step": 3571 + }, + { + "epoch": 3.3258845437616387, + "grad_norm": 1.7842185497283936, + "learning_rate": 2.129959503459728e-06, + "loss": 0.8986, + "step": 3572 + }, + { + "epoch": 3.326815642458101, + "grad_norm": 1.7478796243667603, + "learning_rate": 2.1287351684395094e-06, + "loss": 0.932, + "step": 3573 + }, + { + "epoch": 3.3277467411545625, + "grad_norm": 1.7591707706451416, + "learning_rate": 2.1275109244645224e-06, + "loss": 0.9474, + "step": 3574 + }, + { + "epoch": 3.328677839851024, + "grad_norm": 1.7879152297973633, + "learning_rate": 2.1262867718349867e-06, + "loss": 0.9286, + "step": 3575 + }, + { + "epoch": 3.329608938547486, + "grad_norm": 1.6960433721542358, + "learning_rate": 2.125062710851101e-06, + "loss": 0.8789, + "step": 3576 + }, + { + "epoch": 3.330540037243948, + "grad_norm": 1.6222604513168335, + "learning_rate": 2.1238387418130425e-06, + "loss": 0.8959, + "step": 3577 + }, + { + "epoch": 3.3314711359404097, + "grad_norm": 1.7148146629333496, + "learning_rate": 2.122614865020965e-06, + "loss": 0.9323, + "step": 3578 + }, + { + "epoch": 3.3324022346368714, + "grad_norm": 1.718290090560913, + "learning_rate": 2.1213910807749995e-06, + "loss": 0.9226, + "step": 3579 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 1.7199945449829102, + "learning_rate": 2.120167389375253e-06, + "loss": 0.9269, + "step": 3580 + }, + { + "epoch": 3.334264432029795, + "grad_norm": 1.7048648595809937, + "learning_rate": 2.118943791121815e-06, + "loss": 0.9035, + "step": 3581 + }, + { + "epoch": 3.335195530726257, + "grad_norm": 1.664337158203125, + "learning_rate": 2.1177202863147436e-06, + "loss": 0.9267, + "step": 3582 + }, + { + "epoch": 3.336126629422719, + "grad_norm": 1.72476065158844, + "learning_rate": 2.1164968752540817e-06, + "loss": 0.9389, + "step": 3583 + }, + { + "epoch": 3.3370577281191807, + "grad_norm": 1.7327197790145874, + "learning_rate": 2.1152735582398453e-06, + "loss": 0.9282, + "step": 3584 + }, + { + "epoch": 3.3379888268156424, + "grad_norm": 1.755927324295044, + "learning_rate": 2.1140503355720295e-06, + "loss": 0.9149, + "step": 3585 + }, + { + "epoch": 3.338919925512104, + "grad_norm": 1.6904610395431519, + "learning_rate": 2.1128272075506036e-06, + "loss": 0.873, + "step": 3586 + }, + { + "epoch": 3.339851024208566, + "grad_norm": 1.7619491815567017, + "learning_rate": 2.1116041744755153e-06, + "loss": 0.9276, + "step": 3587 + }, + { + "epoch": 3.340782122905028, + "grad_norm": 1.7055435180664062, + "learning_rate": 2.1103812366466896e-06, + "loss": 0.9215, + "step": 3588 + }, + { + "epoch": 3.3417132216014895, + "grad_norm": 1.774766445159912, + "learning_rate": 2.1091583943640265e-06, + "loss": 0.9031, + "step": 3589 + }, + { + "epoch": 3.3426443202979517, + "grad_norm": 1.7082542181015015, + "learning_rate": 2.107935647927404e-06, + "loss": 0.8945, + "step": 3590 + }, + { + "epoch": 3.3435754189944134, + "grad_norm": 1.674393892288208, + "learning_rate": 2.1067129976366767e-06, + "loss": 0.8965, + "step": 3591 + }, + { + "epoch": 3.344506517690875, + "grad_norm": 1.6963931322097778, + "learning_rate": 2.105490443791674e-06, + "loss": 0.8976, + "step": 3592 + }, + { + "epoch": 3.345437616387337, + "grad_norm": 1.6533441543579102, + "learning_rate": 2.104267986692202e-06, + "loss": 0.9075, + "step": 3593 + }, + { + "epoch": 3.346368715083799, + "grad_norm": 1.6994211673736572, + "learning_rate": 2.1030456266380455e-06, + "loss": 0.9111, + "step": 3594 + }, + { + "epoch": 3.3472998137802605, + "grad_norm": 1.7067980766296387, + "learning_rate": 2.1018233639289636e-06, + "loss": 0.9104, + "step": 3595 + }, + { + "epoch": 3.3482309124767227, + "grad_norm": 1.719502329826355, + "learning_rate": 2.1006011988646895e-06, + "loss": 0.9107, + "step": 3596 + }, + { + "epoch": 3.3491620111731844, + "grad_norm": 1.7006198167800903, + "learning_rate": 2.0993791317449362e-06, + "loss": 0.9026, + "step": 3597 + }, + { + "epoch": 3.350093109869646, + "grad_norm": 1.6677294969558716, + "learning_rate": 2.098157162869392e-06, + "loss": 0.9156, + "step": 3598 + }, + { + "epoch": 3.351024208566108, + "grad_norm": 1.785241961479187, + "learning_rate": 2.096935292537718e-06, + "loss": 0.9246, + "step": 3599 + }, + { + "epoch": 3.35195530726257, + "grad_norm": 1.6744484901428223, + "learning_rate": 2.0957135210495543e-06, + "loss": 0.9146, + "step": 3600 + }, + { + "epoch": 3.3528864059590315, + "grad_norm": 1.6849631071090698, + "learning_rate": 2.094491848704516e-06, + "loss": 0.8868, + "step": 3601 + }, + { + "epoch": 3.3538175046554937, + "grad_norm": 1.699573040008545, + "learning_rate": 2.093270275802194e-06, + "loss": 0.8887, + "step": 3602 + }, + { + "epoch": 3.3547486033519553, + "grad_norm": 1.684644103050232, + "learning_rate": 2.0920488026421537e-06, + "loss": 0.8873, + "step": 3603 + }, + { + "epoch": 3.355679702048417, + "grad_norm": 1.6562538146972656, + "learning_rate": 2.0908274295239367e-06, + "loss": 0.8809, + "step": 3604 + }, + { + "epoch": 3.356610800744879, + "grad_norm": 1.7715096473693848, + "learning_rate": 2.089606156747061e-06, + "loss": 0.9317, + "step": 3605 + }, + { + "epoch": 3.357541899441341, + "grad_norm": 1.8907216787338257, + "learning_rate": 2.0883849846110186e-06, + "loss": 0.9325, + "step": 3606 + }, + { + "epoch": 3.3584729981378025, + "grad_norm": 1.7321754693984985, + "learning_rate": 2.0871639134152773e-06, + "loss": 0.9316, + "step": 3607 + }, + { + "epoch": 3.3594040968342647, + "grad_norm": 1.7374184131622314, + "learning_rate": 2.085942943459281e-06, + "loss": 0.8875, + "step": 3608 + }, + { + "epoch": 3.3603351955307263, + "grad_norm": 1.7196762561798096, + "learning_rate": 2.084722075042448e-06, + "loss": 0.9262, + "step": 3609 + }, + { + "epoch": 3.361266294227188, + "grad_norm": 1.8062974214553833, + "learning_rate": 2.0835013084641704e-06, + "loss": 0.8704, + "step": 3610 + }, + { + "epoch": 3.3621973929236497, + "grad_norm": 1.6995432376861572, + "learning_rate": 2.0822806440238173e-06, + "loss": 0.8831, + "step": 3611 + }, + { + "epoch": 3.363128491620112, + "grad_norm": 1.709960699081421, + "learning_rate": 2.081060082020733e-06, + "loss": 0.883, + "step": 3612 + }, + { + "epoch": 3.3640595903165735, + "grad_norm": 1.767360806465149, + "learning_rate": 2.079839622754235e-06, + "loss": 0.9391, + "step": 3613 + }, + { + "epoch": 3.364990689013035, + "grad_norm": 1.7222709655761719, + "learning_rate": 2.078619266523615e-06, + "loss": 0.8884, + "step": 3614 + }, + { + "epoch": 3.3659217877094973, + "grad_norm": 1.757759928703308, + "learning_rate": 2.0773990136281435e-06, + "loss": 0.9416, + "step": 3615 + }, + { + "epoch": 3.366852886405959, + "grad_norm": 1.7415378093719482, + "learning_rate": 2.0761788643670623e-06, + "loss": 0.9303, + "step": 3616 + }, + { + "epoch": 3.3677839851024207, + "grad_norm": 1.687500238418579, + "learning_rate": 2.0749588190395868e-06, + "loss": 0.8661, + "step": 3617 + }, + { + "epoch": 3.368715083798883, + "grad_norm": 1.7491952180862427, + "learning_rate": 2.0737388779449098e-06, + "loss": 0.9242, + "step": 3618 + }, + { + "epoch": 3.3696461824953445, + "grad_norm": 1.8421630859375, + "learning_rate": 2.0725190413821978e-06, + "loss": 0.9117, + "step": 3619 + }, + { + "epoch": 3.370577281191806, + "grad_norm": 1.7336701154708862, + "learning_rate": 2.0712993096505902e-06, + "loss": 0.9022, + "step": 3620 + }, + { + "epoch": 3.3715083798882683, + "grad_norm": 1.7102673053741455, + "learning_rate": 2.0700796830492016e-06, + "loss": 0.8834, + "step": 3621 + }, + { + "epoch": 3.37243947858473, + "grad_norm": 1.728464961051941, + "learning_rate": 2.068860161877122e-06, + "loss": 0.9096, + "step": 3622 + }, + { + "epoch": 3.3733705772811917, + "grad_norm": 1.7444391250610352, + "learning_rate": 2.067640746433413e-06, + "loss": 0.9049, + "step": 3623 + }, + { + "epoch": 3.3743016759776534, + "grad_norm": 1.709729552268982, + "learning_rate": 2.066421437017113e-06, + "loss": 0.9237, + "step": 3624 + }, + { + "epoch": 3.3752327746741155, + "grad_norm": 1.6979856491088867, + "learning_rate": 2.0652022339272314e-06, + "loss": 0.9058, + "step": 3625 + }, + { + "epoch": 3.376163873370577, + "grad_norm": 1.6928445100784302, + "learning_rate": 2.0639831374627563e-06, + "loss": 0.9381, + "step": 3626 + }, + { + "epoch": 3.377094972067039, + "grad_norm": 1.7880457639694214, + "learning_rate": 2.0627641479226434e-06, + "loss": 0.9574, + "step": 3627 + }, + { + "epoch": 3.378026070763501, + "grad_norm": 1.672634243965149, + "learning_rate": 2.0615452656058266e-06, + "loss": 0.91, + "step": 3628 + }, + { + "epoch": 3.3789571694599627, + "grad_norm": 1.7423460483551025, + "learning_rate": 2.060326490811213e-06, + "loss": 0.9427, + "step": 3629 + }, + { + "epoch": 3.3798882681564244, + "grad_norm": 1.7408066987991333, + "learning_rate": 2.0591078238376804e-06, + "loss": 0.9179, + "step": 3630 + }, + { + "epoch": 3.3808193668528865, + "grad_norm": 1.7179259061813354, + "learning_rate": 2.057889264984085e-06, + "loss": 0.8984, + "step": 3631 + }, + { + "epoch": 3.381750465549348, + "grad_norm": 1.8055065870285034, + "learning_rate": 2.056670814549252e-06, + "loss": 0.9687, + "step": 3632 + }, + { + "epoch": 3.38268156424581, + "grad_norm": 1.6409751176834106, + "learning_rate": 2.0554524728319837e-06, + "loss": 0.9099, + "step": 3633 + }, + { + "epoch": 3.383612662942272, + "grad_norm": 1.7668641805648804, + "learning_rate": 2.0542342401310515e-06, + "loss": 0.9434, + "step": 3634 + }, + { + "epoch": 3.3845437616387337, + "grad_norm": 1.6830002069473267, + "learning_rate": 2.053016116745204e-06, + "loss": 0.8506, + "step": 3635 + }, + { + "epoch": 3.3854748603351954, + "grad_norm": 1.707519769668579, + "learning_rate": 2.0517981029731613e-06, + "loss": 0.887, + "step": 3636 + }, + { + "epoch": 3.3864059590316575, + "grad_norm": 1.7540509700775146, + "learning_rate": 2.050580199113616e-06, + "loss": 0.9065, + "step": 3637 + }, + { + "epoch": 3.387337057728119, + "grad_norm": 1.7131767272949219, + "learning_rate": 2.049362405465236e-06, + "loss": 0.9048, + "step": 3638 + }, + { + "epoch": 3.388268156424581, + "grad_norm": 1.7063047885894775, + "learning_rate": 2.0481447223266593e-06, + "loss": 0.9117, + "step": 3639 + }, + { + "epoch": 3.389199255121043, + "grad_norm": 1.744381070137024, + "learning_rate": 2.0469271499964995e-06, + "loss": 0.9218, + "step": 3640 + }, + { + "epoch": 3.3901303538175047, + "grad_norm": 1.7229853868484497, + "learning_rate": 2.0457096887733395e-06, + "loss": 0.8999, + "step": 3641 + }, + { + "epoch": 3.3910614525139664, + "grad_norm": 1.6572545766830444, + "learning_rate": 2.044492338955739e-06, + "loss": 0.8681, + "step": 3642 + }, + { + "epoch": 3.3919925512104285, + "grad_norm": 1.7188464403152466, + "learning_rate": 2.0432751008422293e-06, + "loss": 0.9177, + "step": 3643 + }, + { + "epoch": 3.39292364990689, + "grad_norm": 1.7729672193527222, + "learning_rate": 2.0420579747313114e-06, + "loss": 0.9408, + "step": 3644 + }, + { + "epoch": 3.393854748603352, + "grad_norm": 1.7475430965423584, + "learning_rate": 2.040840960921462e-06, + "loss": 0.9109, + "step": 3645 + }, + { + "epoch": 3.394785847299814, + "grad_norm": 1.7914328575134277, + "learning_rate": 2.039624059711129e-06, + "loss": 0.933, + "step": 3646 + }, + { + "epoch": 3.3957169459962757, + "grad_norm": 1.7425769567489624, + "learning_rate": 2.0384072713987345e-06, + "loss": 0.9491, + "step": 3647 + }, + { + "epoch": 3.3966480446927374, + "grad_norm": 1.7524155378341675, + "learning_rate": 2.0371905962826684e-06, + "loss": 0.9544, + "step": 3648 + }, + { + "epoch": 3.397579143389199, + "grad_norm": 1.738939642906189, + "learning_rate": 2.0359740346612982e-06, + "loss": 0.9082, + "step": 3649 + }, + { + "epoch": 3.398510242085661, + "grad_norm": 1.6907044649124146, + "learning_rate": 2.034757586832961e-06, + "loss": 0.9215, + "step": 3650 + }, + { + "epoch": 3.399441340782123, + "grad_norm": 1.6801565885543823, + "learning_rate": 2.0335412530959647e-06, + "loss": 0.9007, + "step": 3651 + }, + { + "epoch": 3.4003724394785846, + "grad_norm": 1.7847118377685547, + "learning_rate": 2.0323250337485913e-06, + "loss": 0.9443, + "step": 3652 + }, + { + "epoch": 3.4013035381750467, + "grad_norm": 1.7194799184799194, + "learning_rate": 2.031108929089095e-06, + "loss": 0.9563, + "step": 3653 + }, + { + "epoch": 3.4022346368715084, + "grad_norm": 1.7634360790252686, + "learning_rate": 2.0298929394157e-06, + "loss": 0.8962, + "step": 3654 + }, + { + "epoch": 3.40316573556797, + "grad_norm": 1.6883530616760254, + "learning_rate": 2.028677065026604e-06, + "loss": 0.8953, + "step": 3655 + }, + { + "epoch": 3.404096834264432, + "grad_norm": 1.6946510076522827, + "learning_rate": 2.0274613062199746e-06, + "loss": 0.9169, + "step": 3656 + }, + { + "epoch": 3.405027932960894, + "grad_norm": 1.7746978998184204, + "learning_rate": 2.0262456632939542e-06, + "loss": 0.9428, + "step": 3657 + }, + { + "epoch": 3.4059590316573556, + "grad_norm": 1.7683535814285278, + "learning_rate": 2.0250301365466528e-06, + "loss": 0.937, + "step": 3658 + }, + { + "epoch": 3.4068901303538173, + "grad_norm": 1.7677348852157593, + "learning_rate": 2.023814726276154e-06, + "loss": 0.9354, + "step": 3659 + }, + { + "epoch": 3.4078212290502794, + "grad_norm": 1.674815058708191, + "learning_rate": 2.0225994327805154e-06, + "loss": 0.8908, + "step": 3660 + }, + { + "epoch": 3.408752327746741, + "grad_norm": 1.854444980621338, + "learning_rate": 2.0213842563577595e-06, + "loss": 0.9402, + "step": 3661 + }, + { + "epoch": 3.4096834264432028, + "grad_norm": 1.6401361227035522, + "learning_rate": 2.020169197305886e-06, + "loss": 0.8829, + "step": 3662 + }, + { + "epoch": 3.410614525139665, + "grad_norm": 1.7089415788650513, + "learning_rate": 2.0189542559228626e-06, + "loss": 0.8877, + "step": 3663 + }, + { + "epoch": 3.4115456238361266, + "grad_norm": 1.7052597999572754, + "learning_rate": 2.0177394325066312e-06, + "loss": 0.9098, + "step": 3664 + }, + { + "epoch": 3.4124767225325883, + "grad_norm": 1.6806219816207886, + "learning_rate": 2.016524727355101e-06, + "loss": 0.9219, + "step": 3665 + }, + { + "epoch": 3.4134078212290504, + "grad_norm": 1.6951138973236084, + "learning_rate": 2.0153101407661544e-06, + "loss": 0.8858, + "step": 3666 + }, + { + "epoch": 3.414338919925512, + "grad_norm": 1.7071776390075684, + "learning_rate": 2.014095673037645e-06, + "loss": 0.9188, + "step": 3667 + }, + { + "epoch": 3.4152700186219738, + "grad_norm": 1.6961917877197266, + "learning_rate": 2.0128813244673947e-06, + "loss": 0.9074, + "step": 3668 + }, + { + "epoch": 3.416201117318436, + "grad_norm": 1.6834344863891602, + "learning_rate": 2.0116670953532004e-06, + "loss": 0.883, + "step": 3669 + }, + { + "epoch": 3.4171322160148976, + "grad_norm": 1.7505922317504883, + "learning_rate": 2.010452985992825e-06, + "loss": 0.9253, + "step": 3670 + }, + { + "epoch": 3.4180633147113593, + "grad_norm": 1.6882200241088867, + "learning_rate": 2.0092389966840077e-06, + "loss": 0.8687, + "step": 3671 + }, + { + "epoch": 3.4189944134078214, + "grad_norm": 1.7411258220672607, + "learning_rate": 2.0080251277244523e-06, + "loss": 0.9343, + "step": 3672 + }, + { + "epoch": 3.419925512104283, + "grad_norm": 1.731019377708435, + "learning_rate": 2.006811379411836e-06, + "loss": 0.939, + "step": 3673 + }, + { + "epoch": 3.4208566108007448, + "grad_norm": 1.7165260314941406, + "learning_rate": 2.0055977520438075e-06, + "loss": 0.931, + "step": 3674 + }, + { + "epoch": 3.421787709497207, + "grad_norm": 1.7198034524917603, + "learning_rate": 2.0043842459179823e-06, + "loss": 0.8711, + "step": 3675 + }, + { + "epoch": 3.4227188081936686, + "grad_norm": 1.680818796157837, + "learning_rate": 2.00317086133195e-06, + "loss": 0.9176, + "step": 3676 + }, + { + "epoch": 3.4236499068901303, + "grad_norm": 1.6482501029968262, + "learning_rate": 2.0019575985832684e-06, + "loss": 0.9031, + "step": 3677 + }, + { + "epoch": 3.4245810055865924, + "grad_norm": 1.7547640800476074, + "learning_rate": 2.000744457969467e-06, + "loss": 0.9087, + "step": 3678 + }, + { + "epoch": 3.425512104283054, + "grad_norm": 1.8160649538040161, + "learning_rate": 1.9995314397880412e-06, + "loss": 0.9311, + "step": 3679 + }, + { + "epoch": 3.4264432029795158, + "grad_norm": 1.7029207944869995, + "learning_rate": 1.9983185443364617e-06, + "loss": 0.899, + "step": 3680 + }, + { + "epoch": 3.427374301675978, + "grad_norm": 1.7521311044692993, + "learning_rate": 1.9971057719121666e-06, + "loss": 0.9135, + "step": 3681 + }, + { + "epoch": 3.4283054003724396, + "grad_norm": 1.7538201808929443, + "learning_rate": 1.9958931228125617e-06, + "loss": 0.914, + "step": 3682 + }, + { + "epoch": 3.4292364990689013, + "grad_norm": 1.6631975173950195, + "learning_rate": 1.9946805973350277e-06, + "loss": 0.8881, + "step": 3683 + }, + { + "epoch": 3.430167597765363, + "grad_norm": 1.8578765392303467, + "learning_rate": 1.993468195776911e-06, + "loss": 0.9325, + "step": 3684 + }, + { + "epoch": 3.431098696461825, + "grad_norm": 1.7013373374938965, + "learning_rate": 1.992255918435528e-06, + "loss": 0.8848, + "step": 3685 + }, + { + "epoch": 3.4320297951582868, + "grad_norm": 1.7043328285217285, + "learning_rate": 1.9910437656081658e-06, + "loss": 0.9025, + "step": 3686 + }, + { + "epoch": 3.4329608938547485, + "grad_norm": 1.6889464855194092, + "learning_rate": 1.9898317375920805e-06, + "loss": 0.9057, + "step": 3687 + }, + { + "epoch": 3.4338919925512106, + "grad_norm": 1.7357399463653564, + "learning_rate": 1.988619834684499e-06, + "loss": 0.9387, + "step": 3688 + }, + { + "epoch": 3.4348230912476723, + "grad_norm": 1.7146764993667603, + "learning_rate": 1.9874080571826132e-06, + "loss": 0.9334, + "step": 3689 + }, + { + "epoch": 3.435754189944134, + "grad_norm": 1.702122449874878, + "learning_rate": 1.9861964053835887e-06, + "loss": 0.8971, + "step": 3690 + }, + { + "epoch": 3.436685288640596, + "grad_norm": 1.6351912021636963, + "learning_rate": 1.9849848795845594e-06, + "loss": 0.9124, + "step": 3691 + }, + { + "epoch": 3.4376163873370578, + "grad_norm": 1.7232487201690674, + "learning_rate": 1.9837734800826267e-06, + "loss": 0.9056, + "step": 3692 + }, + { + "epoch": 3.4385474860335195, + "grad_norm": 1.7359774112701416, + "learning_rate": 1.9825622071748616e-06, + "loss": 0.9461, + "step": 3693 + }, + { + "epoch": 3.439478584729981, + "grad_norm": 1.67306649684906, + "learning_rate": 1.9813510611583054e-06, + "loss": 0.9207, + "step": 3694 + }, + { + "epoch": 3.4404096834264433, + "grad_norm": 1.7530717849731445, + "learning_rate": 1.9801400423299673e-06, + "loss": 0.9013, + "step": 3695 + }, + { + "epoch": 3.441340782122905, + "grad_norm": 1.7043278217315674, + "learning_rate": 1.9789291509868246e-06, + "loss": 0.9168, + "step": 3696 + }, + { + "epoch": 3.4422718808193666, + "grad_norm": 1.6729575395584106, + "learning_rate": 1.9777183874258242e-06, + "loss": 0.9174, + "step": 3697 + }, + { + "epoch": 3.4432029795158288, + "grad_norm": 1.8327440023422241, + "learning_rate": 1.976507751943882e-06, + "loss": 0.966, + "step": 3698 + }, + { + "epoch": 3.4441340782122905, + "grad_norm": 1.6883249282836914, + "learning_rate": 1.9752972448378817e-06, + "loss": 0.9182, + "step": 3699 + }, + { + "epoch": 3.445065176908752, + "grad_norm": 1.7073884010314941, + "learning_rate": 1.9740868664046754e-06, + "loss": 0.9128, + "step": 3700 + }, + { + "epoch": 3.4459962756052143, + "grad_norm": 1.7522189617156982, + "learning_rate": 1.972876616941084e-06, + "loss": 0.9069, + "step": 3701 + }, + { + "epoch": 3.446927374301676, + "grad_norm": 1.672468662261963, + "learning_rate": 1.9716664967438983e-06, + "loss": 0.8487, + "step": 3702 + }, + { + "epoch": 3.4478584729981376, + "grad_norm": 1.718237280845642, + "learning_rate": 1.970456506109874e-06, + "loss": 0.9069, + "step": 3703 + }, + { + "epoch": 3.4487895716945998, + "grad_norm": 1.736087441444397, + "learning_rate": 1.969246645335738e-06, + "loss": 0.9169, + "step": 3704 + }, + { + "epoch": 3.4497206703910615, + "grad_norm": 1.6877533197402954, + "learning_rate": 1.9680369147181847e-06, + "loss": 0.9257, + "step": 3705 + }, + { + "epoch": 3.450651769087523, + "grad_norm": 1.737767219543457, + "learning_rate": 1.9668273145538754e-06, + "loss": 0.9517, + "step": 3706 + }, + { + "epoch": 3.4515828677839853, + "grad_norm": 1.7471284866333008, + "learning_rate": 1.9656178451394404e-06, + "loss": 0.9254, + "step": 3707 + }, + { + "epoch": 3.452513966480447, + "grad_norm": 1.9232438802719116, + "learning_rate": 1.964408506771477e-06, + "loss": 0.9296, + "step": 3708 + }, + { + "epoch": 3.4534450651769086, + "grad_norm": 1.7236459255218506, + "learning_rate": 1.9631992997465535e-06, + "loss": 0.9221, + "step": 3709 + }, + { + "epoch": 3.4543761638733708, + "grad_norm": 1.7353768348693848, + "learning_rate": 1.961990224361201e-06, + "loss": 0.9305, + "step": 3710 + }, + { + "epoch": 3.4553072625698324, + "grad_norm": 1.7400027513504028, + "learning_rate": 1.9607812809119214e-06, + "loss": 0.9193, + "step": 3711 + }, + { + "epoch": 3.456238361266294, + "grad_norm": 1.8154343366622925, + "learning_rate": 1.959572469695186e-06, + "loss": 0.9334, + "step": 3712 + }, + { + "epoch": 3.4571694599627563, + "grad_norm": 1.7508608102798462, + "learning_rate": 1.9583637910074283e-06, + "loss": 0.8952, + "step": 3713 + }, + { + "epoch": 3.458100558659218, + "grad_norm": 1.7181293964385986, + "learning_rate": 1.9571552451450542e-06, + "loss": 0.9095, + "step": 3714 + }, + { + "epoch": 3.4590316573556796, + "grad_norm": 1.6842950582504272, + "learning_rate": 1.9559468324044343e-06, + "loss": 0.91, + "step": 3715 + }, + { + "epoch": 3.4599627560521418, + "grad_norm": 1.7393920421600342, + "learning_rate": 1.954738553081909e-06, + "loss": 0.9008, + "step": 3716 + }, + { + "epoch": 3.4608938547486034, + "grad_norm": 1.7108683586120605, + "learning_rate": 1.953530407473783e-06, + "loss": 0.9252, + "step": 3717 + }, + { + "epoch": 3.461824953445065, + "grad_norm": 1.7097468376159668, + "learning_rate": 1.952322395876331e-06, + "loss": 0.9425, + "step": 3718 + }, + { + "epoch": 3.462756052141527, + "grad_norm": 1.7436959743499756, + "learning_rate": 1.9511145185857925e-06, + "loss": 0.9465, + "step": 3719 + }, + { + "epoch": 3.463687150837989, + "grad_norm": 1.6939001083374023, + "learning_rate": 1.9499067758983753e-06, + "loss": 0.9184, + "step": 3720 + }, + { + "epoch": 3.4646182495344506, + "grad_norm": 1.771569848060608, + "learning_rate": 1.948699168110254e-06, + "loss": 0.933, + "step": 3721 + }, + { + "epoch": 3.4655493482309123, + "grad_norm": 1.7227600812911987, + "learning_rate": 1.947491695517571e-06, + "loss": 0.9264, + "step": 3722 + }, + { + "epoch": 3.4664804469273744, + "grad_norm": 1.6653895378112793, + "learning_rate": 1.9462843584164333e-06, + "loss": 0.8951, + "step": 3723 + }, + { + "epoch": 3.467411545623836, + "grad_norm": 1.6405330896377563, + "learning_rate": 1.945077157102916e-06, + "loss": 0.8724, + "step": 3724 + }, + { + "epoch": 3.468342644320298, + "grad_norm": 1.77916419506073, + "learning_rate": 1.9438700918730624e-06, + "loss": 0.9016, + "step": 3725 + }, + { + "epoch": 3.46927374301676, + "grad_norm": 1.7160097360610962, + "learning_rate": 1.942663163022881e-06, + "loss": 0.8992, + "step": 3726 + }, + { + "epoch": 3.4702048417132216, + "grad_norm": 1.7439091205596924, + "learning_rate": 1.941456370848344e-06, + "loss": 0.9197, + "step": 3727 + }, + { + "epoch": 3.4711359404096833, + "grad_norm": 1.7196481227874756, + "learning_rate": 1.940249715645396e-06, + "loss": 0.8867, + "step": 3728 + }, + { + "epoch": 3.472067039106145, + "grad_norm": 1.7669312953948975, + "learning_rate": 1.9390431977099444e-06, + "loss": 0.9277, + "step": 3729 + }, + { + "epoch": 3.472998137802607, + "grad_norm": 1.6947247982025146, + "learning_rate": 1.937836817337862e-06, + "loss": 0.8697, + "step": 3730 + }, + { + "epoch": 3.473929236499069, + "grad_norm": 1.7258371114730835, + "learning_rate": 1.9366305748249893e-06, + "loss": 0.9306, + "step": 3731 + }, + { + "epoch": 3.4748603351955305, + "grad_norm": 1.6535981893539429, + "learning_rate": 1.935424470467135e-06, + "loss": 0.8931, + "step": 3732 + }, + { + "epoch": 3.4757914338919926, + "grad_norm": 1.7044140100479126, + "learning_rate": 1.934218504560071e-06, + "loss": 0.9179, + "step": 3733 + }, + { + "epoch": 3.4767225325884543, + "grad_norm": 1.7087328433990479, + "learning_rate": 1.933012677399535e-06, + "loss": 0.9154, + "step": 3734 + }, + { + "epoch": 3.477653631284916, + "grad_norm": 1.8075371980667114, + "learning_rate": 1.9318069892812333e-06, + "loss": 0.947, + "step": 3735 + }, + { + "epoch": 3.478584729981378, + "grad_norm": 1.7017734050750732, + "learning_rate": 1.9306014405008365e-06, + "loss": 0.9186, + "step": 3736 + }, + { + "epoch": 3.47951582867784, + "grad_norm": 1.7372357845306396, + "learning_rate": 1.929396031353981e-06, + "loss": 0.9109, + "step": 3737 + }, + { + "epoch": 3.4804469273743015, + "grad_norm": 1.7136497497558594, + "learning_rate": 1.928190762136268e-06, + "loss": 0.8936, + "step": 3738 + }, + { + "epoch": 3.4813780260707636, + "grad_norm": 1.7523882389068604, + "learning_rate": 1.926985633143267e-06, + "loss": 0.912, + "step": 3739 + }, + { + "epoch": 3.4823091247672253, + "grad_norm": 1.7151979207992554, + "learning_rate": 1.9257806446705116e-06, + "loss": 0.9324, + "step": 3740 + }, + { + "epoch": 3.483240223463687, + "grad_norm": 1.8397040367126465, + "learning_rate": 1.9245757970135e-06, + "loss": 0.9392, + "step": 3741 + }, + { + "epoch": 3.484171322160149, + "grad_norm": 1.7208216190338135, + "learning_rate": 1.9233710904676973e-06, + "loss": 0.9126, + "step": 3742 + }, + { + "epoch": 3.485102420856611, + "grad_norm": 1.6682169437408447, + "learning_rate": 1.9221665253285344e-06, + "loss": 0.8867, + "step": 3743 + }, + { + "epoch": 3.4860335195530725, + "grad_norm": 1.6920920610427856, + "learning_rate": 1.9209621018914056e-06, + "loss": 0.8853, + "step": 3744 + }, + { + "epoch": 3.4869646182495346, + "grad_norm": 1.6885992288589478, + "learning_rate": 1.9197578204516707e-06, + "loss": 0.8793, + "step": 3745 + }, + { + "epoch": 3.4878957169459963, + "grad_norm": 1.6868942975997925, + "learning_rate": 1.918553681304657e-06, + "loss": 0.8969, + "step": 3746 + }, + { + "epoch": 3.488826815642458, + "grad_norm": 1.6382404565811157, + "learning_rate": 1.9173496847456567e-06, + "loss": 0.8694, + "step": 3747 + }, + { + "epoch": 3.48975791433892, + "grad_norm": 1.7104731798171997, + "learning_rate": 1.9161458310699227e-06, + "loss": 0.9172, + "step": 3748 + }, + { + "epoch": 3.490689013035382, + "grad_norm": 1.7463269233703613, + "learning_rate": 1.914942120572677e-06, + "loss": 0.9552, + "step": 3749 + }, + { + "epoch": 3.4916201117318435, + "grad_norm": 1.7733818292617798, + "learning_rate": 1.9137385535491064e-06, + "loss": 0.921, + "step": 3750 + }, + { + "epoch": 3.4925512104283056, + "grad_norm": 1.7402420043945312, + "learning_rate": 1.91253513029436e-06, + "loss": 0.9041, + "step": 3751 + }, + { + "epoch": 3.4934823091247673, + "grad_norm": 1.78652024269104, + "learning_rate": 1.9113318511035543e-06, + "loss": 0.9058, + "step": 3752 + }, + { + "epoch": 3.494413407821229, + "grad_norm": 1.7097171545028687, + "learning_rate": 1.9101287162717694e-06, + "loss": 0.9026, + "step": 3753 + }, + { + "epoch": 3.4953445065176907, + "grad_norm": 1.7454807758331299, + "learning_rate": 1.908925726094048e-06, + "loss": 0.9257, + "step": 3754 + }, + { + "epoch": 3.496275605214153, + "grad_norm": 1.700960397720337, + "learning_rate": 1.9077228808654012e-06, + "loss": 0.9014, + "step": 3755 + }, + { + "epoch": 3.4972067039106145, + "grad_norm": 1.694154977798462, + "learning_rate": 1.9065201808808018e-06, + "loss": 0.9043, + "step": 3756 + }, + { + "epoch": 3.498137802607076, + "grad_norm": 1.7099239826202393, + "learning_rate": 1.9053176264351894e-06, + "loss": 0.8785, + "step": 3757 + }, + { + "epoch": 3.4990689013035383, + "grad_norm": 1.8350014686584473, + "learning_rate": 1.9041152178234631e-06, + "loss": 0.9105, + "step": 3758 + }, + { + "epoch": 3.5, + "grad_norm": 1.7365843057632446, + "learning_rate": 1.9029129553404921e-06, + "loss": 0.8887, + "step": 3759 + }, + { + "epoch": 3.5009310986964617, + "grad_norm": 1.7611134052276611, + "learning_rate": 1.9017108392811065e-06, + "loss": 0.9398, + "step": 3760 + }, + { + "epoch": 3.501862197392924, + "grad_norm": 1.701502799987793, + "learning_rate": 1.9005088699400998e-06, + "loss": 0.9134, + "step": 3761 + }, + { + "epoch": 3.5027932960893855, + "grad_norm": 1.7850090265274048, + "learning_rate": 1.8993070476122318e-06, + "loss": 0.9112, + "step": 3762 + }, + { + "epoch": 3.503724394785847, + "grad_norm": 1.804706335067749, + "learning_rate": 1.8981053725922258e-06, + "loss": 0.9187, + "step": 3763 + }, + { + "epoch": 3.504655493482309, + "grad_norm": 1.6850451231002808, + "learning_rate": 1.8969038451747682e-06, + "loss": 0.8729, + "step": 3764 + }, + { + "epoch": 3.505586592178771, + "grad_norm": 1.7206321954727173, + "learning_rate": 1.895702465654508e-06, + "loss": 0.8743, + "step": 3765 + }, + { + "epoch": 3.5065176908752327, + "grad_norm": 1.7220323085784912, + "learning_rate": 1.8945012343260605e-06, + "loss": 0.9265, + "step": 3766 + }, + { + "epoch": 3.5074487895716944, + "grad_norm": 1.797593355178833, + "learning_rate": 1.8933001514840043e-06, + "loss": 0.9193, + "step": 3767 + }, + { + "epoch": 3.5083798882681565, + "grad_norm": 1.7038910388946533, + "learning_rate": 1.8920992174228792e-06, + "loss": 0.877, + "step": 3768 + }, + { + "epoch": 3.509310986964618, + "grad_norm": 1.7656110525131226, + "learning_rate": 1.890898432437191e-06, + "loss": 0.9062, + "step": 3769 + }, + { + "epoch": 3.51024208566108, + "grad_norm": 1.7884465456008911, + "learning_rate": 1.8896977968214078e-06, + "loss": 0.8961, + "step": 3770 + }, + { + "epoch": 3.511173184357542, + "grad_norm": 1.7180119752883911, + "learning_rate": 1.8884973108699623e-06, + "loss": 0.8757, + "step": 3771 + }, + { + "epoch": 3.5121042830540037, + "grad_norm": 1.7336595058441162, + "learning_rate": 1.8872969748772474e-06, + "loss": 0.908, + "step": 3772 + }, + { + "epoch": 3.5130353817504654, + "grad_norm": 1.7699941396713257, + "learning_rate": 1.886096789137623e-06, + "loss": 0.8661, + "step": 3773 + }, + { + "epoch": 3.5139664804469275, + "grad_norm": 1.726975917816162, + "learning_rate": 1.8848967539454109e-06, + "loss": 0.8995, + "step": 3774 + }, + { + "epoch": 3.514897579143389, + "grad_norm": 1.6452610492706299, + "learning_rate": 1.8836968695948944e-06, + "loss": 0.8486, + "step": 3775 + }, + { + "epoch": 3.515828677839851, + "grad_norm": 1.7326273918151855, + "learning_rate": 1.8824971363803205e-06, + "loss": 0.8939, + "step": 3776 + }, + { + "epoch": 3.516759776536313, + "grad_norm": 1.7196978330612183, + "learning_rate": 1.8812975545959011e-06, + "loss": 0.8934, + "step": 3777 + }, + { + "epoch": 3.5176908752327747, + "grad_norm": 1.7697244882583618, + "learning_rate": 1.8800981245358097e-06, + "loss": 0.9103, + "step": 3778 + }, + { + "epoch": 3.5186219739292364, + "grad_norm": 1.7580496072769165, + "learning_rate": 1.8788988464941804e-06, + "loss": 0.943, + "step": 3779 + }, + { + "epoch": 3.5195530726256985, + "grad_norm": 1.7845139503479004, + "learning_rate": 1.8776997207651137e-06, + "loss": 0.9133, + "step": 3780 + }, + { + "epoch": 3.52048417132216, + "grad_norm": 1.724279522895813, + "learning_rate": 1.8765007476426714e-06, + "loss": 0.9305, + "step": 3781 + }, + { + "epoch": 3.521415270018622, + "grad_norm": 1.751499891281128, + "learning_rate": 1.8753019274208762e-06, + "loss": 0.9397, + "step": 3782 + }, + { + "epoch": 3.522346368715084, + "grad_norm": 1.72331702709198, + "learning_rate": 1.8741032603937142e-06, + "loss": 0.924, + "step": 3783 + }, + { + "epoch": 3.5232774674115457, + "grad_norm": 1.779765248298645, + "learning_rate": 1.8729047468551365e-06, + "loss": 0.9207, + "step": 3784 + }, + { + "epoch": 3.5242085661080074, + "grad_norm": 1.6782563924789429, + "learning_rate": 1.8717063870990535e-06, + "loss": 0.871, + "step": 3785 + }, + { + "epoch": 3.5251396648044695, + "grad_norm": 1.6838434934616089, + "learning_rate": 1.8705081814193381e-06, + "loss": 0.9243, + "step": 3786 + }, + { + "epoch": 3.526070763500931, + "grad_norm": 1.7279614210128784, + "learning_rate": 1.869310130109826e-06, + "loss": 0.9378, + "step": 3787 + }, + { + "epoch": 3.527001862197393, + "grad_norm": 1.7303756475448608, + "learning_rate": 1.868112233464317e-06, + "loss": 0.9089, + "step": 3788 + }, + { + "epoch": 3.527932960893855, + "grad_norm": 1.6787052154541016, + "learning_rate": 1.8669144917765694e-06, + "loss": 0.8852, + "step": 3789 + }, + { + "epoch": 3.5288640595903167, + "grad_norm": 1.7224739789962769, + "learning_rate": 1.8657169053403052e-06, + "loss": 0.9187, + "step": 3790 + }, + { + "epoch": 3.5297951582867784, + "grad_norm": 1.7588502168655396, + "learning_rate": 1.8645194744492106e-06, + "loss": 0.931, + "step": 3791 + }, + { + "epoch": 3.5307262569832405, + "grad_norm": 1.7237749099731445, + "learning_rate": 1.8633221993969285e-06, + "loss": 0.9052, + "step": 3792 + }, + { + "epoch": 3.531657355679702, + "grad_norm": 1.775160789489746, + "learning_rate": 1.8621250804770683e-06, + "loss": 0.9254, + "step": 3793 + }, + { + "epoch": 3.532588454376164, + "grad_norm": 1.705663800239563, + "learning_rate": 1.8609281179831984e-06, + "loss": 0.9256, + "step": 3794 + }, + { + "epoch": 3.5335195530726256, + "grad_norm": 1.7833622694015503, + "learning_rate": 1.8597313122088513e-06, + "loss": 0.8922, + "step": 3795 + }, + { + "epoch": 3.5344506517690877, + "grad_norm": 1.7586307525634766, + "learning_rate": 1.8585346634475177e-06, + "loss": 0.8932, + "step": 3796 + }, + { + "epoch": 3.5353817504655494, + "grad_norm": 1.7682034969329834, + "learning_rate": 1.857338171992652e-06, + "loss": 0.9043, + "step": 3797 + }, + { + "epoch": 3.536312849162011, + "grad_norm": 1.7299585342407227, + "learning_rate": 1.8561418381376717e-06, + "loss": 0.8587, + "step": 3798 + }, + { + "epoch": 3.5372439478584727, + "grad_norm": 1.7286533117294312, + "learning_rate": 1.8549456621759506e-06, + "loss": 0.9318, + "step": 3799 + }, + { + "epoch": 3.538175046554935, + "grad_norm": 1.761705994606018, + "learning_rate": 1.8537496444008285e-06, + "loss": 0.9185, + "step": 3800 + }, + { + "epoch": 3.5391061452513966, + "grad_norm": 1.7503806352615356, + "learning_rate": 1.852553785105604e-06, + "loss": 0.9316, + "step": 3801 + }, + { + "epoch": 3.5400372439478582, + "grad_norm": 1.7258548736572266, + "learning_rate": 1.8513580845835387e-06, + "loss": 0.918, + "step": 3802 + }, + { + "epoch": 3.5409683426443204, + "grad_norm": 1.762282371520996, + "learning_rate": 1.8501625431278533e-06, + "loss": 0.9176, + "step": 3803 + }, + { + "epoch": 3.541899441340782, + "grad_norm": 1.7405894994735718, + "learning_rate": 1.84896716103173e-06, + "loss": 0.9356, + "step": 3804 + }, + { + "epoch": 3.5428305400372437, + "grad_norm": 1.7192190885543823, + "learning_rate": 1.847771938588313e-06, + "loss": 0.9344, + "step": 3805 + }, + { + "epoch": 3.543761638733706, + "grad_norm": 1.7265293598175049, + "learning_rate": 1.846576876090705e-06, + "loss": 0.8953, + "step": 3806 + }, + { + "epoch": 3.5446927374301676, + "grad_norm": 1.725490689277649, + "learning_rate": 1.8453819738319728e-06, + "loss": 0.9133, + "step": 3807 + }, + { + "epoch": 3.5456238361266292, + "grad_norm": 1.6897168159484863, + "learning_rate": 1.8441872321051406e-06, + "loss": 0.9312, + "step": 3808 + }, + { + "epoch": 3.5465549348230914, + "grad_norm": 1.7965730428695679, + "learning_rate": 1.8429926512031976e-06, + "loss": 0.9131, + "step": 3809 + }, + { + "epoch": 3.547486033519553, + "grad_norm": 1.733558177947998, + "learning_rate": 1.8417982314190868e-06, + "loss": 0.9284, + "step": 3810 + }, + { + "epoch": 3.5484171322160147, + "grad_norm": 1.709241509437561, + "learning_rate": 1.8406039730457185e-06, + "loss": 0.8946, + "step": 3811 + }, + { + "epoch": 3.549348230912477, + "grad_norm": 1.7721717357635498, + "learning_rate": 1.83940987637596e-06, + "loss": 0.9202, + "step": 3812 + }, + { + "epoch": 3.5502793296089385, + "grad_norm": 1.756757140159607, + "learning_rate": 1.838215941702638e-06, + "loss": 0.8986, + "step": 3813 + }, + { + "epoch": 3.5512104283054002, + "grad_norm": 1.7835568189620972, + "learning_rate": 1.8370221693185424e-06, + "loss": 0.9355, + "step": 3814 + }, + { + "epoch": 3.5521415270018624, + "grad_norm": 1.786633849143982, + "learning_rate": 1.8358285595164216e-06, + "loss": 0.9102, + "step": 3815 + }, + { + "epoch": 3.553072625698324, + "grad_norm": 1.702560305595398, + "learning_rate": 1.8346351125889849e-06, + "loss": 0.9205, + "step": 3816 + }, + { + "epoch": 3.5540037243947857, + "grad_norm": 1.6461273431777954, + "learning_rate": 1.8334418288288995e-06, + "loss": 0.8601, + "step": 3817 + }, + { + "epoch": 3.554934823091248, + "grad_norm": 1.731619954109192, + "learning_rate": 1.8322487085287953e-06, + "loss": 0.9504, + "step": 3818 + }, + { + "epoch": 3.5558659217877095, + "grad_norm": 1.7459062337875366, + "learning_rate": 1.831055751981262e-06, + "loss": 0.9443, + "step": 3819 + }, + { + "epoch": 3.5567970204841712, + "grad_norm": 1.8265029191970825, + "learning_rate": 1.829862959478847e-06, + "loss": 0.9443, + "step": 3820 + }, + { + "epoch": 3.5577281191806334, + "grad_norm": 1.7825050354003906, + "learning_rate": 1.828670331314058e-06, + "loss": 0.8552, + "step": 3821 + }, + { + "epoch": 3.558659217877095, + "grad_norm": 1.7627006769180298, + "learning_rate": 1.8274778677793653e-06, + "loss": 0.9402, + "step": 3822 + }, + { + "epoch": 3.5595903165735567, + "grad_norm": 1.7195544242858887, + "learning_rate": 1.8262855691671944e-06, + "loss": 0.9229, + "step": 3823 + }, + { + "epoch": 3.560521415270019, + "grad_norm": 1.7175885438919067, + "learning_rate": 1.825093435769933e-06, + "loss": 0.916, + "step": 3824 + }, + { + "epoch": 3.5614525139664805, + "grad_norm": 1.7534878253936768, + "learning_rate": 1.823901467879929e-06, + "loss": 0.9417, + "step": 3825 + }, + { + "epoch": 3.5623836126629422, + "grad_norm": 1.7108118534088135, + "learning_rate": 1.8227096657894878e-06, + "loss": 0.8908, + "step": 3826 + }, + { + "epoch": 3.5633147113594044, + "grad_norm": 1.6790822744369507, + "learning_rate": 1.8215180297908746e-06, + "loss": 0.8999, + "step": 3827 + }, + { + "epoch": 3.564245810055866, + "grad_norm": 1.7114630937576294, + "learning_rate": 1.8203265601763137e-06, + "loss": 0.8944, + "step": 3828 + }, + { + "epoch": 3.5651769087523277, + "grad_norm": 1.6899574995040894, + "learning_rate": 1.819135257237991e-06, + "loss": 0.91, + "step": 3829 + }, + { + "epoch": 3.5661080074487894, + "grad_norm": 1.6954869031906128, + "learning_rate": 1.8179441212680479e-06, + "loss": 0.874, + "step": 3830 + }, + { + "epoch": 3.5670391061452515, + "grad_norm": 1.6966147422790527, + "learning_rate": 1.8167531525585863e-06, + "loss": 0.9008, + "step": 3831 + }, + { + "epoch": 3.5679702048417132, + "grad_norm": 1.714116096496582, + "learning_rate": 1.8155623514016685e-06, + "loss": 0.9516, + "step": 3832 + }, + { + "epoch": 3.568901303538175, + "grad_norm": 1.796891212463379, + "learning_rate": 1.8143717180893144e-06, + "loss": 0.926, + "step": 3833 + }, + { + "epoch": 3.5698324022346366, + "grad_norm": 1.7023240327835083, + "learning_rate": 1.8131812529135024e-06, + "loss": 0.9359, + "step": 3834 + }, + { + "epoch": 3.5707635009310987, + "grad_norm": 1.7283834218978882, + "learning_rate": 1.8119909561661692e-06, + "loss": 0.921, + "step": 3835 + }, + { + "epoch": 3.5716945996275604, + "grad_norm": 1.807257890701294, + "learning_rate": 1.8108008281392136e-06, + "loss": 0.9173, + "step": 3836 + }, + { + "epoch": 3.572625698324022, + "grad_norm": 1.7010875940322876, + "learning_rate": 1.8096108691244884e-06, + "loss": 0.8992, + "step": 3837 + }, + { + "epoch": 3.5735567970204842, + "grad_norm": 1.81072998046875, + "learning_rate": 1.8084210794138076e-06, + "loss": 0.9329, + "step": 3838 + }, + { + "epoch": 3.574487895716946, + "grad_norm": 1.6628484725952148, + "learning_rate": 1.8072314592989432e-06, + "loss": 0.8697, + "step": 3839 + }, + { + "epoch": 3.5754189944134076, + "grad_norm": 1.7389262914657593, + "learning_rate": 1.8060420090716266e-06, + "loss": 0.9095, + "step": 3840 + }, + { + "epoch": 3.5763500931098697, + "grad_norm": 1.686619758605957, + "learning_rate": 1.8048527290235452e-06, + "loss": 0.884, + "step": 3841 + }, + { + "epoch": 3.5772811918063314, + "grad_norm": 1.6599712371826172, + "learning_rate": 1.8036636194463462e-06, + "loss": 0.8856, + "step": 3842 + }, + { + "epoch": 3.578212290502793, + "grad_norm": 1.774749517440796, + "learning_rate": 1.8024746806316369e-06, + "loss": 0.8936, + "step": 3843 + }, + { + "epoch": 3.5791433891992552, + "grad_norm": 1.7699576616287231, + "learning_rate": 1.8012859128709766e-06, + "loss": 0.9504, + "step": 3844 + }, + { + "epoch": 3.580074487895717, + "grad_norm": 1.7629735469818115, + "learning_rate": 1.80009731645589e-06, + "loss": 0.8941, + "step": 3845 + }, + { + "epoch": 3.5810055865921786, + "grad_norm": 1.768296241760254, + "learning_rate": 1.7989088916778546e-06, + "loss": 0.9206, + "step": 3846 + }, + { + "epoch": 3.5819366852886407, + "grad_norm": 1.7568581104278564, + "learning_rate": 1.7977206388283098e-06, + "loss": 0.8989, + "step": 3847 + }, + { + "epoch": 3.5828677839851024, + "grad_norm": 1.7615503072738647, + "learning_rate": 1.7965325581986487e-06, + "loss": 0.9256, + "step": 3848 + }, + { + "epoch": 3.583798882681564, + "grad_norm": 1.723462700843811, + "learning_rate": 1.7953446500802246e-06, + "loss": 0.8967, + "step": 3849 + }, + { + "epoch": 3.5847299813780262, + "grad_norm": 1.7438347339630127, + "learning_rate": 1.7941569147643493e-06, + "loss": 0.9299, + "step": 3850 + }, + { + "epoch": 3.585661080074488, + "grad_norm": 1.6896212100982666, + "learning_rate": 1.7929693525422887e-06, + "loss": 0.8743, + "step": 3851 + }, + { + "epoch": 3.5865921787709496, + "grad_norm": 1.7458972930908203, + "learning_rate": 1.7917819637052702e-06, + "loss": 0.8805, + "step": 3852 + }, + { + "epoch": 3.5875232774674117, + "grad_norm": 1.7452510595321655, + "learning_rate": 1.7905947485444775e-06, + "loss": 0.9088, + "step": 3853 + }, + { + "epoch": 3.5884543761638734, + "grad_norm": 1.7439093589782715, + "learning_rate": 1.7894077073510497e-06, + "loss": 0.8998, + "step": 3854 + }, + { + "epoch": 3.589385474860335, + "grad_norm": 1.7456589937210083, + "learning_rate": 1.788220840416085e-06, + "loss": 0.9257, + "step": 3855 + }, + { + "epoch": 3.5903165735567972, + "grad_norm": 1.722548246383667, + "learning_rate": 1.7870341480306397e-06, + "loss": 0.8935, + "step": 3856 + }, + { + "epoch": 3.591247672253259, + "grad_norm": 1.8748631477355957, + "learning_rate": 1.7858476304857259e-06, + "loss": 0.9039, + "step": 3857 + }, + { + "epoch": 3.5921787709497206, + "grad_norm": 1.6816240549087524, + "learning_rate": 1.7846612880723118e-06, + "loss": 0.896, + "step": 3858 + }, + { + "epoch": 3.5931098696461827, + "grad_norm": 1.692328691482544, + "learning_rate": 1.7834751210813262e-06, + "loss": 0.8662, + "step": 3859 + }, + { + "epoch": 3.5940409683426444, + "grad_norm": 1.7272828817367554, + "learning_rate": 1.7822891298036514e-06, + "loss": 0.9349, + "step": 3860 + }, + { + "epoch": 3.594972067039106, + "grad_norm": 1.672918677330017, + "learning_rate": 1.7811033145301282e-06, + "loss": 0.925, + "step": 3861 + }, + { + "epoch": 3.5959031657355682, + "grad_norm": 1.824395775794983, + "learning_rate": 1.7799176755515529e-06, + "loss": 0.9268, + "step": 3862 + }, + { + "epoch": 3.59683426443203, + "grad_norm": 1.709234595298767, + "learning_rate": 1.7787322131586815e-06, + "loss": 0.8982, + "step": 3863 + }, + { + "epoch": 3.5977653631284916, + "grad_norm": 1.7000312805175781, + "learning_rate": 1.7775469276422238e-06, + "loss": 0.9213, + "step": 3864 + }, + { + "epoch": 3.5986964618249533, + "grad_norm": 1.7053449153900146, + "learning_rate": 1.7763618192928468e-06, + "loss": 0.8863, + "step": 3865 + }, + { + "epoch": 3.5996275605214154, + "grad_norm": 1.7321999073028564, + "learning_rate": 1.775176888401175e-06, + "loss": 0.9507, + "step": 3866 + }, + { + "epoch": 3.600558659217877, + "grad_norm": 1.754015564918518, + "learning_rate": 1.7739921352577894e-06, + "loss": 0.8876, + "step": 3867 + }, + { + "epoch": 3.601489757914339, + "grad_norm": 1.6962555646896362, + "learning_rate": 1.7728075601532258e-06, + "loss": 0.9022, + "step": 3868 + }, + { + "epoch": 3.6024208566108005, + "grad_norm": 1.7329515218734741, + "learning_rate": 1.7716231633779774e-06, + "loss": 0.8734, + "step": 3869 + }, + { + "epoch": 3.6033519553072626, + "grad_norm": 1.7151148319244385, + "learning_rate": 1.7704389452224945e-06, + "loss": 0.9207, + "step": 3870 + }, + { + "epoch": 3.6042830540037243, + "grad_norm": 1.7513630390167236, + "learning_rate": 1.7692549059771835e-06, + "loss": 0.9082, + "step": 3871 + }, + { + "epoch": 3.605214152700186, + "grad_norm": 1.7094073295593262, + "learning_rate": 1.7680710459324043e-06, + "loss": 0.8941, + "step": 3872 + }, + { + "epoch": 3.606145251396648, + "grad_norm": 1.750381350517273, + "learning_rate": 1.766887365378475e-06, + "loss": 0.9715, + "step": 3873 + }, + { + "epoch": 3.60707635009311, + "grad_norm": 1.787935733795166, + "learning_rate": 1.7657038646056713e-06, + "loss": 0.8934, + "step": 3874 + }, + { + "epoch": 3.6080074487895715, + "grad_norm": 1.6999529600143433, + "learning_rate": 1.7645205439042213e-06, + "loss": 0.8942, + "step": 3875 + }, + { + "epoch": 3.6089385474860336, + "grad_norm": 1.8665771484375, + "learning_rate": 1.7633374035643103e-06, + "loss": 0.925, + "step": 3876 + }, + { + "epoch": 3.6098696461824953, + "grad_norm": 1.785208821296692, + "learning_rate": 1.7621544438760807e-06, + "loss": 0.9325, + "step": 3877 + }, + { + "epoch": 3.610800744878957, + "grad_norm": 1.6994990110397339, + "learning_rate": 1.76097166512963e-06, + "loss": 0.8469, + "step": 3878 + }, + { + "epoch": 3.611731843575419, + "grad_norm": 1.7353748083114624, + "learning_rate": 1.7597890676150096e-06, + "loss": 0.9276, + "step": 3879 + }, + { + "epoch": 3.612662942271881, + "grad_norm": 1.7782478332519531, + "learning_rate": 1.7586066516222277e-06, + "loss": 0.9373, + "step": 3880 + }, + { + "epoch": 3.6135940409683425, + "grad_norm": 2.06430983543396, + "learning_rate": 1.7574244174412498e-06, + "loss": 0.9152, + "step": 3881 + }, + { + "epoch": 3.6145251396648046, + "grad_norm": 1.729926586151123, + "learning_rate": 1.7562423653619931e-06, + "loss": 0.9395, + "step": 3882 + }, + { + "epoch": 3.6154562383612663, + "grad_norm": 1.798756718635559, + "learning_rate": 1.7550604956743328e-06, + "loss": 0.9276, + "step": 3883 + }, + { + "epoch": 3.616387337057728, + "grad_norm": 1.7765427827835083, + "learning_rate": 1.7538788086680994e-06, + "loss": 0.892, + "step": 3884 + }, + { + "epoch": 3.61731843575419, + "grad_norm": 1.7441920042037964, + "learning_rate": 1.752697304633076e-06, + "loss": 0.8823, + "step": 3885 + }, + { + "epoch": 3.618249534450652, + "grad_norm": 1.6643478870391846, + "learning_rate": 1.7515159838590046e-06, + "loss": 0.8948, + "step": 3886 + }, + { + "epoch": 3.6191806331471135, + "grad_norm": 1.7136600017547607, + "learning_rate": 1.750334846635579e-06, + "loss": 0.9205, + "step": 3887 + }, + { + "epoch": 3.6201117318435756, + "grad_norm": 1.6596384048461914, + "learning_rate": 1.7491538932524514e-06, + "loss": 0.872, + "step": 3888 + }, + { + "epoch": 3.6210428305400373, + "grad_norm": 1.7417312860488892, + "learning_rate": 1.747973123999224e-06, + "loss": 0.929, + "step": 3889 + }, + { + "epoch": 3.621973929236499, + "grad_norm": 1.6772305965423584, + "learning_rate": 1.7467925391654587e-06, + "loss": 0.9075, + "step": 3890 + }, + { + "epoch": 3.622905027932961, + "grad_norm": 1.7425562143325806, + "learning_rate": 1.74561213904067e-06, + "loss": 0.8965, + "step": 3891 + }, + { + "epoch": 3.623836126629423, + "grad_norm": 1.7270545959472656, + "learning_rate": 1.744431923914326e-06, + "loss": 0.8976, + "step": 3892 + }, + { + "epoch": 3.6247672253258845, + "grad_norm": 1.7174123525619507, + "learning_rate": 1.7432518940758519e-06, + "loss": 0.9245, + "step": 3893 + }, + { + "epoch": 3.6256983240223466, + "grad_norm": 1.7063301801681519, + "learning_rate": 1.7420720498146254e-06, + "loss": 0.9172, + "step": 3894 + }, + { + "epoch": 3.6266294227188083, + "grad_norm": 1.698531150817871, + "learning_rate": 1.7408923914199818e-06, + "loss": 0.8942, + "step": 3895 + }, + { + "epoch": 3.62756052141527, + "grad_norm": 1.7554367780685425, + "learning_rate": 1.7397129191812058e-06, + "loss": 0.8929, + "step": 3896 + }, + { + "epoch": 3.628491620111732, + "grad_norm": 1.718575119972229, + "learning_rate": 1.7385336333875403e-06, + "loss": 0.9067, + "step": 3897 + }, + { + "epoch": 3.629422718808194, + "grad_norm": 1.8120148181915283, + "learning_rate": 1.7373545343281822e-06, + "loss": 0.9292, + "step": 3898 + }, + { + "epoch": 3.6303538175046555, + "grad_norm": 1.76980721950531, + "learning_rate": 1.7361756222922798e-06, + "loss": 0.9431, + "step": 3899 + }, + { + "epoch": 3.631284916201117, + "grad_norm": 1.8019706010818481, + "learning_rate": 1.73499689756894e-06, + "loss": 0.9483, + "step": 3900 + }, + { + "epoch": 3.6322160148975793, + "grad_norm": 1.709527850151062, + "learning_rate": 1.7338183604472198e-06, + "loss": 0.9054, + "step": 3901 + }, + { + "epoch": 3.633147113594041, + "grad_norm": 1.767380952835083, + "learning_rate": 1.7326400112161329e-06, + "loss": 0.9074, + "step": 3902 + }, + { + "epoch": 3.6340782122905027, + "grad_norm": 1.7089146375656128, + "learning_rate": 1.7314618501646435e-06, + "loss": 0.9118, + "step": 3903 + }, + { + "epoch": 3.635009310986965, + "grad_norm": 1.8244383335113525, + "learning_rate": 1.730283877581674e-06, + "loss": 0.9024, + "step": 3904 + }, + { + "epoch": 3.6359404096834265, + "grad_norm": 1.777852177619934, + "learning_rate": 1.7291060937560985e-06, + "loss": 0.9283, + "step": 3905 + }, + { + "epoch": 3.636871508379888, + "grad_norm": 1.771607756614685, + "learning_rate": 1.7279284989767436e-06, + "loss": 0.9338, + "step": 3906 + }, + { + "epoch": 3.63780260707635, + "grad_norm": 1.7060426473617554, + "learning_rate": 1.7267510935323906e-06, + "loss": 0.9047, + "step": 3907 + }, + { + "epoch": 3.638733705772812, + "grad_norm": 1.7086195945739746, + "learning_rate": 1.7255738777117758e-06, + "loss": 0.9195, + "step": 3908 + }, + { + "epoch": 3.6396648044692737, + "grad_norm": 1.6890431642532349, + "learning_rate": 1.7243968518035874e-06, + "loss": 0.9024, + "step": 3909 + }, + { + "epoch": 3.6405959031657353, + "grad_norm": 1.7672970294952393, + "learning_rate": 1.7232200160964657e-06, + "loss": 0.967, + "step": 3910 + }, + { + "epoch": 3.6415270018621975, + "grad_norm": 1.6634522676467896, + "learning_rate": 1.7220433708790082e-06, + "loss": 0.8917, + "step": 3911 + }, + { + "epoch": 3.642458100558659, + "grad_norm": 1.7099052667617798, + "learning_rate": 1.720866916439763e-06, + "loss": 0.8967, + "step": 3912 + }, + { + "epoch": 3.643389199255121, + "grad_norm": 1.7886919975280762, + "learning_rate": 1.7196906530672306e-06, + "loss": 0.8804, + "step": 3913 + }, + { + "epoch": 3.644320297951583, + "grad_norm": 2.3211326599121094, + "learning_rate": 1.7185145810498663e-06, + "loss": 0.8589, + "step": 3914 + }, + { + "epoch": 3.6452513966480447, + "grad_norm": 1.695995807647705, + "learning_rate": 1.7173387006760789e-06, + "loss": 0.9519, + "step": 3915 + }, + { + "epoch": 3.6461824953445063, + "grad_norm": 1.670315146446228, + "learning_rate": 1.7161630122342299e-06, + "loss": 0.8887, + "step": 3916 + }, + { + "epoch": 3.6471135940409685, + "grad_norm": 1.7106984853744507, + "learning_rate": 1.7149875160126315e-06, + "loss": 0.8978, + "step": 3917 + }, + { + "epoch": 3.64804469273743, + "grad_norm": 1.7007761001586914, + "learning_rate": 1.713812212299551e-06, + "loss": 0.8804, + "step": 3918 + }, + { + "epoch": 3.648975791433892, + "grad_norm": 1.6932612657546997, + "learning_rate": 1.7126371013832093e-06, + "loss": 0.8936, + "step": 3919 + }, + { + "epoch": 3.649906890130354, + "grad_norm": 1.7997294664382935, + "learning_rate": 1.7114621835517771e-06, + "loss": 0.912, + "step": 3920 + }, + { + "epoch": 3.6508379888268156, + "grad_norm": 1.7106883525848389, + "learning_rate": 1.71028745909338e-06, + "loss": 0.8954, + "step": 3921 + }, + { + "epoch": 3.6517690875232773, + "grad_norm": 1.765065312385559, + "learning_rate": 1.7091129282960966e-06, + "loss": 0.9305, + "step": 3922 + }, + { + "epoch": 3.6527001862197395, + "grad_norm": 1.7304763793945312, + "learning_rate": 1.7079385914479545e-06, + "loss": 0.9079, + "step": 3923 + }, + { + "epoch": 3.653631284916201, + "grad_norm": 1.788089394569397, + "learning_rate": 1.7067644488369381e-06, + "loss": 0.9261, + "step": 3924 + }, + { + "epoch": 3.654562383612663, + "grad_norm": 1.7470579147338867, + "learning_rate": 1.705590500750981e-06, + "loss": 0.9251, + "step": 3925 + }, + { + "epoch": 3.655493482309125, + "grad_norm": 1.7565187215805054, + "learning_rate": 1.7044167474779727e-06, + "loss": 0.9067, + "step": 3926 + }, + { + "epoch": 3.6564245810055866, + "grad_norm": 1.7824771404266357, + "learning_rate": 1.70324318930575e-06, + "loss": 0.9555, + "step": 3927 + }, + { + "epoch": 3.6573556797020483, + "grad_norm": 1.664537787437439, + "learning_rate": 1.702069826522105e-06, + "loss": 0.8739, + "step": 3928 + }, + { + "epoch": 3.6582867783985105, + "grad_norm": 1.7708044052124023, + "learning_rate": 1.7008966594147833e-06, + "loss": 0.9262, + "step": 3929 + }, + { + "epoch": 3.659217877094972, + "grad_norm": 1.7169562578201294, + "learning_rate": 1.6997236882714774e-06, + "loss": 0.9157, + "step": 3930 + }, + { + "epoch": 3.660148975791434, + "grad_norm": 1.7896921634674072, + "learning_rate": 1.6985509133798367e-06, + "loss": 0.9081, + "step": 3931 + }, + { + "epoch": 3.661080074487896, + "grad_norm": 1.713206171989441, + "learning_rate": 1.6973783350274603e-06, + "loss": 0.8746, + "step": 3932 + }, + { + "epoch": 3.6620111731843576, + "grad_norm": 1.8007779121398926, + "learning_rate": 1.6962059535019001e-06, + "loss": 0.9185, + "step": 3933 + }, + { + "epoch": 3.6629422718808193, + "grad_norm": 1.7197685241699219, + "learning_rate": 1.6950337690906582e-06, + "loss": 0.917, + "step": 3934 + }, + { + "epoch": 3.6638733705772815, + "grad_norm": 1.7353636026382446, + "learning_rate": 1.6938617820811899e-06, + "loss": 0.8867, + "step": 3935 + }, + { + "epoch": 3.664804469273743, + "grad_norm": 1.6639593839645386, + "learning_rate": 1.692689992760901e-06, + "loss": 0.8644, + "step": 3936 + }, + { + "epoch": 3.665735567970205, + "grad_norm": 1.7221115827560425, + "learning_rate": 1.6915184014171484e-06, + "loss": 0.8906, + "step": 3937 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.6684808731079102, + "learning_rate": 1.690347008337243e-06, + "loss": 0.8812, + "step": 3938 + }, + { + "epoch": 3.6675977653631286, + "grad_norm": 1.7499772310256958, + "learning_rate": 1.6891758138084441e-06, + "loss": 0.9105, + "step": 3939 + }, + { + "epoch": 3.6685288640595903, + "grad_norm": 1.7787092924118042, + "learning_rate": 1.6880048181179653e-06, + "loss": 0.9388, + "step": 3940 + }, + { + "epoch": 3.669459962756052, + "grad_norm": 1.7139168977737427, + "learning_rate": 1.6868340215529674e-06, + "loss": 0.9015, + "step": 3941 + }, + { + "epoch": 3.6703910614525137, + "grad_norm": 1.7353609800338745, + "learning_rate": 1.6856634244005662e-06, + "loss": 0.9073, + "step": 3942 + }, + { + "epoch": 3.671322160148976, + "grad_norm": 1.727271556854248, + "learning_rate": 1.6844930269478274e-06, + "loss": 0.9021, + "step": 3943 + }, + { + "epoch": 3.6722532588454375, + "grad_norm": 1.8015776872634888, + "learning_rate": 1.6833228294817656e-06, + "loss": 0.9229, + "step": 3944 + }, + { + "epoch": 3.673184357541899, + "grad_norm": 1.7594966888427734, + "learning_rate": 1.6821528322893498e-06, + "loss": 0.9205, + "step": 3945 + }, + { + "epoch": 3.6741154562383613, + "grad_norm": 1.6897203922271729, + "learning_rate": 1.6809830356574982e-06, + "loss": 0.9033, + "step": 3946 + }, + { + "epoch": 3.675046554934823, + "grad_norm": 1.7579700946807861, + "learning_rate": 1.6798134398730798e-06, + "loss": 0.9091, + "step": 3947 + }, + { + "epoch": 3.6759776536312847, + "grad_norm": 1.8155449628829956, + "learning_rate": 1.6786440452229134e-06, + "loss": 0.8815, + "step": 3948 + }, + { + "epoch": 3.676908752327747, + "grad_norm": 1.7368659973144531, + "learning_rate": 1.6774748519937706e-06, + "loss": 0.9471, + "step": 3949 + }, + { + "epoch": 3.6778398510242085, + "grad_norm": 1.7323052883148193, + "learning_rate": 1.6763058604723725e-06, + "loss": 0.9392, + "step": 3950 + }, + { + "epoch": 3.67877094972067, + "grad_norm": 1.7457796335220337, + "learning_rate": 1.67513707094539e-06, + "loss": 0.9205, + "step": 3951 + }, + { + "epoch": 3.6797020484171323, + "grad_norm": 1.7468905448913574, + "learning_rate": 1.6739684836994458e-06, + "loss": 0.9137, + "step": 3952 + }, + { + "epoch": 3.680633147113594, + "grad_norm": 1.6871715784072876, + "learning_rate": 1.6728000990211124e-06, + "loss": 0.8508, + "step": 3953 + }, + { + "epoch": 3.6815642458100557, + "grad_norm": 1.7542946338653564, + "learning_rate": 1.6716319171969126e-06, + "loss": 0.9126, + "step": 3954 + }, + { + "epoch": 3.682495344506518, + "grad_norm": 1.7160470485687256, + "learning_rate": 1.6704639385133187e-06, + "loss": 0.8812, + "step": 3955 + }, + { + "epoch": 3.6834264432029795, + "grad_norm": 1.789633870124817, + "learning_rate": 1.669296163256755e-06, + "loss": 0.9538, + "step": 3956 + }, + { + "epoch": 3.684357541899441, + "grad_norm": 1.7571874856948853, + "learning_rate": 1.6681285917135952e-06, + "loss": 0.9509, + "step": 3957 + }, + { + "epoch": 3.6852886405959033, + "grad_norm": 1.67544424533844, + "learning_rate": 1.6669612241701622e-06, + "loss": 0.908, + "step": 3958 + }, + { + "epoch": 3.686219739292365, + "grad_norm": 1.7213146686553955, + "learning_rate": 1.665794060912728e-06, + "loss": 0.9236, + "step": 3959 + }, + { + "epoch": 3.6871508379888267, + "grad_norm": 1.732373595237732, + "learning_rate": 1.6646271022275185e-06, + "loss": 0.9216, + "step": 3960 + }, + { + "epoch": 3.688081936685289, + "grad_norm": 1.6794570684432983, + "learning_rate": 1.663460348400705e-06, + "loss": 0.9523, + "step": 3961 + }, + { + "epoch": 3.6890130353817505, + "grad_norm": 1.7191836833953857, + "learning_rate": 1.6622937997184106e-06, + "loss": 0.9, + "step": 3962 + }, + { + "epoch": 3.689944134078212, + "grad_norm": 1.7113410234451294, + "learning_rate": 1.6611274564667085e-06, + "loss": 0.9006, + "step": 3963 + }, + { + "epoch": 3.6908752327746743, + "grad_norm": 1.7398282289505005, + "learning_rate": 1.6599613189316213e-06, + "loss": 0.8882, + "step": 3964 + }, + { + "epoch": 3.691806331471136, + "grad_norm": 1.9544732570648193, + "learning_rate": 1.6587953873991198e-06, + "loss": 0.9377, + "step": 3965 + }, + { + "epoch": 3.6927374301675977, + "grad_norm": 1.8131192922592163, + "learning_rate": 1.6576296621551246e-06, + "loss": 0.9431, + "step": 3966 + }, + { + "epoch": 3.69366852886406, + "grad_norm": 1.8333956003189087, + "learning_rate": 1.656464143485509e-06, + "loss": 0.9295, + "step": 3967 + }, + { + "epoch": 3.6945996275605215, + "grad_norm": 1.7353899478912354, + "learning_rate": 1.6552988316760904e-06, + "loss": 0.8784, + "step": 3968 + }, + { + "epoch": 3.695530726256983, + "grad_norm": 1.7982414960861206, + "learning_rate": 1.654133727012639e-06, + "loss": 0.9251, + "step": 3969 + }, + { + "epoch": 3.6964618249534453, + "grad_norm": 1.6911696195602417, + "learning_rate": 1.6529688297808727e-06, + "loss": 0.9399, + "step": 3970 + }, + { + "epoch": 3.697392923649907, + "grad_norm": 1.6974493265151978, + "learning_rate": 1.651804140266461e-06, + "loss": 0.9237, + "step": 3971 + }, + { + "epoch": 3.6983240223463687, + "grad_norm": 1.7594558000564575, + "learning_rate": 1.650639658755019e-06, + "loss": 0.9622, + "step": 3972 + }, + { + "epoch": 3.6992551210428304, + "grad_norm": 1.779521107673645, + "learning_rate": 1.6494753855321116e-06, + "loss": 0.9512, + "step": 3973 + }, + { + "epoch": 3.7001862197392925, + "grad_norm": 1.7489949464797974, + "learning_rate": 1.6483113208832562e-06, + "loss": 0.9262, + "step": 3974 + }, + { + "epoch": 3.701117318435754, + "grad_norm": 1.6858429908752441, + "learning_rate": 1.647147465093913e-06, + "loss": 0.8642, + "step": 3975 + }, + { + "epoch": 3.702048417132216, + "grad_norm": 1.761678695678711, + "learning_rate": 1.6459838184494964e-06, + "loss": 0.8623, + "step": 3976 + }, + { + "epoch": 3.7029795158286776, + "grad_norm": 1.6939162015914917, + "learning_rate": 1.6448203812353657e-06, + "loss": 0.9389, + "step": 3977 + }, + { + "epoch": 3.7039106145251397, + "grad_norm": 1.7257338762283325, + "learning_rate": 1.6436571537368328e-06, + "loss": 0.9168, + "step": 3978 + }, + { + "epoch": 3.7048417132216014, + "grad_norm": 1.6770213842391968, + "learning_rate": 1.6424941362391539e-06, + "loss": 0.9101, + "step": 3979 + }, + { + "epoch": 3.705772811918063, + "grad_norm": 1.7241153717041016, + "learning_rate": 1.6413313290275357e-06, + "loss": 0.8996, + "step": 3980 + }, + { + "epoch": 3.706703910614525, + "grad_norm": 1.7888871431350708, + "learning_rate": 1.6401687323871346e-06, + "loss": 0.9858, + "step": 3981 + }, + { + "epoch": 3.707635009310987, + "grad_norm": 1.7111773490905762, + "learning_rate": 1.639006346603052e-06, + "loss": 0.9281, + "step": 3982 + }, + { + "epoch": 3.7085661080074486, + "grad_norm": 1.6771979331970215, + "learning_rate": 1.6378441719603417e-06, + "loss": 0.8948, + "step": 3983 + }, + { + "epoch": 3.7094972067039107, + "grad_norm": 1.7977564334869385, + "learning_rate": 1.6366822087440026e-06, + "loss": 0.9514, + "step": 3984 + }, + { + "epoch": 3.7104283054003724, + "grad_norm": 1.7720965147018433, + "learning_rate": 1.6355204572389832e-06, + "loss": 0.9168, + "step": 3985 + }, + { + "epoch": 3.711359404096834, + "grad_norm": 1.7549103498458862, + "learning_rate": 1.6343589177301783e-06, + "loss": 0.9071, + "step": 3986 + }, + { + "epoch": 3.712290502793296, + "grad_norm": 1.7695749998092651, + "learning_rate": 1.6331975905024341e-06, + "loss": 0.9641, + "step": 3987 + }, + { + "epoch": 3.713221601489758, + "grad_norm": 1.7933679819107056, + "learning_rate": 1.6320364758405422e-06, + "loss": 0.9461, + "step": 3988 + }, + { + "epoch": 3.7141527001862196, + "grad_norm": 1.7250888347625732, + "learning_rate": 1.6308755740292415e-06, + "loss": 0.9268, + "step": 3989 + }, + { + "epoch": 3.7150837988826817, + "grad_norm": 1.6932573318481445, + "learning_rate": 1.629714885353221e-06, + "loss": 0.8801, + "step": 3990 + }, + { + "epoch": 3.7160148975791434, + "grad_norm": 1.744909644126892, + "learning_rate": 1.6285544100971163e-06, + "loss": 0.9293, + "step": 3991 + }, + { + "epoch": 3.716945996275605, + "grad_norm": 1.7552919387817383, + "learning_rate": 1.6273941485455098e-06, + "loss": 0.9323, + "step": 3992 + }, + { + "epoch": 3.717877094972067, + "grad_norm": 1.6713666915893555, + "learning_rate": 1.6262341009829318e-06, + "loss": 0.9094, + "step": 3993 + }, + { + "epoch": 3.718808193668529, + "grad_norm": 1.679654836654663, + "learning_rate": 1.6250742676938625e-06, + "loss": 0.8952, + "step": 3994 + }, + { + "epoch": 3.7197392923649906, + "grad_norm": 1.6698956489562988, + "learning_rate": 1.6239146489627266e-06, + "loss": 0.8893, + "step": 3995 + }, + { + "epoch": 3.7206703910614527, + "grad_norm": 1.6208436489105225, + "learning_rate": 1.622755245073897e-06, + "loss": 0.8775, + "step": 3996 + }, + { + "epoch": 3.7216014897579144, + "grad_norm": 1.7489488124847412, + "learning_rate": 1.6215960563116945e-06, + "loss": 0.9083, + "step": 3997 + }, + { + "epoch": 3.722532588454376, + "grad_norm": 1.782345175743103, + "learning_rate": 1.6204370829603874e-06, + "loss": 0.8902, + "step": 3998 + }, + { + "epoch": 3.723463687150838, + "grad_norm": 1.7041456699371338, + "learning_rate": 1.6192783253041896e-06, + "loss": 0.8999, + "step": 3999 + }, + { + "epoch": 3.7243947858473, + "grad_norm": 1.7831697463989258, + "learning_rate": 1.618119783627263e-06, + "loss": 0.9237, + "step": 4000 + }, + { + "epoch": 3.7253258845437616, + "grad_norm": 1.8157907724380493, + "learning_rate": 1.6169614582137177e-06, + "loss": 0.9669, + "step": 4001 + }, + { + "epoch": 3.7262569832402237, + "grad_norm": 1.680737018585205, + "learning_rate": 1.6158033493476099e-06, + "loss": 0.9043, + "step": 4002 + }, + { + "epoch": 3.7271880819366854, + "grad_norm": 1.7769601345062256, + "learning_rate": 1.614645457312941e-06, + "loss": 0.9462, + "step": 4003 + }, + { + "epoch": 3.728119180633147, + "grad_norm": 1.732236385345459, + "learning_rate": 1.613487782393661e-06, + "loss": 0.9309, + "step": 4004 + }, + { + "epoch": 3.729050279329609, + "grad_norm": 1.6602421998977661, + "learning_rate": 1.6123303248736678e-06, + "loss": 0.8757, + "step": 4005 + }, + { + "epoch": 3.729981378026071, + "grad_norm": 1.7312138080596924, + "learning_rate": 1.6111730850368034e-06, + "loss": 0.9091, + "step": 4006 + }, + { + "epoch": 3.7309124767225326, + "grad_norm": 1.7424546480178833, + "learning_rate": 1.6100160631668572e-06, + "loss": 0.9067, + "step": 4007 + }, + { + "epoch": 3.7318435754189943, + "grad_norm": 1.6888412237167358, + "learning_rate": 1.6088592595475667e-06, + "loss": 0.8795, + "step": 4008 + }, + { + "epoch": 3.7327746741154564, + "grad_norm": 1.7270209789276123, + "learning_rate": 1.6077026744626145e-06, + "loss": 0.8942, + "step": 4009 + }, + { + "epoch": 3.733705772811918, + "grad_norm": 1.7854082584381104, + "learning_rate": 1.6065463081956293e-06, + "loss": 0.9028, + "step": 4010 + }, + { + "epoch": 3.7346368715083798, + "grad_norm": 1.7233459949493408, + "learning_rate": 1.605390161030186e-06, + "loss": 0.9115, + "step": 4011 + }, + { + "epoch": 3.7355679702048414, + "grad_norm": 1.7517027854919434, + "learning_rate": 1.6042342332498089e-06, + "loss": 0.898, + "step": 4012 + }, + { + "epoch": 3.7364990689013036, + "grad_norm": 1.7661665678024292, + "learning_rate": 1.6030785251379635e-06, + "loss": 0.9272, + "step": 4013 + }, + { + "epoch": 3.7374301675977653, + "grad_norm": 1.7863315343856812, + "learning_rate": 1.601923036978065e-06, + "loss": 0.9195, + "step": 4014 + }, + { + "epoch": 3.738361266294227, + "grad_norm": 1.6750671863555908, + "learning_rate": 1.6007677690534728e-06, + "loss": 0.9089, + "step": 4015 + }, + { + "epoch": 3.739292364990689, + "grad_norm": 1.7570219039916992, + "learning_rate": 1.5996127216474953e-06, + "loss": 0.9184, + "step": 4016 + }, + { + "epoch": 3.7402234636871508, + "grad_norm": 1.7448984384536743, + "learning_rate": 1.5984578950433823e-06, + "loss": 0.9409, + "step": 4017 + }, + { + "epoch": 3.7411545623836124, + "grad_norm": 1.6719850301742554, + "learning_rate": 1.5973032895243324e-06, + "loss": 0.9105, + "step": 4018 + }, + { + "epoch": 3.7420856610800746, + "grad_norm": 1.7397119998931885, + "learning_rate": 1.5961489053734908e-06, + "loss": 0.9321, + "step": 4019 + }, + { + "epoch": 3.7430167597765363, + "grad_norm": 1.7455973625183105, + "learning_rate": 1.5949947428739448e-06, + "loss": 0.8734, + "step": 4020 + }, + { + "epoch": 3.743947858472998, + "grad_norm": 1.7380287647247314, + "learning_rate": 1.5938408023087309e-06, + "loss": 0.9054, + "step": 4021 + }, + { + "epoch": 3.74487895716946, + "grad_norm": 1.7269648313522339, + "learning_rate": 1.59268708396083e-06, + "loss": 0.939, + "step": 4022 + }, + { + "epoch": 3.7458100558659218, + "grad_norm": 1.7250744104385376, + "learning_rate": 1.5915335881131666e-06, + "loss": 0.9365, + "step": 4023 + }, + { + "epoch": 3.7467411545623834, + "grad_norm": 1.7136321067810059, + "learning_rate": 1.590380315048614e-06, + "loss": 0.9138, + "step": 4024 + }, + { + "epoch": 3.7476722532588456, + "grad_norm": 1.7721107006072998, + "learning_rate": 1.5892272650499886e-06, + "loss": 0.9199, + "step": 4025 + }, + { + "epoch": 3.7486033519553073, + "grad_norm": 1.65714430809021, + "learning_rate": 1.5880744384000544e-06, + "loss": 0.9008, + "step": 4026 + }, + { + "epoch": 3.749534450651769, + "grad_norm": 1.722625732421875, + "learning_rate": 1.5869218353815158e-06, + "loss": 0.9126, + "step": 4027 + }, + { + "epoch": 3.750465549348231, + "grad_norm": 1.7553139925003052, + "learning_rate": 1.5857694562770273e-06, + "loss": 0.9111, + "step": 4028 + }, + { + "epoch": 3.7513966480446927, + "grad_norm": 1.7488781213760376, + "learning_rate": 1.5846173013691874e-06, + "loss": 0.921, + "step": 4029 + }, + { + "epoch": 3.7523277467411544, + "grad_norm": 1.72938871383667, + "learning_rate": 1.5834653709405368e-06, + "loss": 0.9112, + "step": 4030 + }, + { + "epoch": 3.7532588454376166, + "grad_norm": 1.7150977849960327, + "learning_rate": 1.582313665273565e-06, + "loss": 0.9133, + "step": 4031 + }, + { + "epoch": 3.7541899441340782, + "grad_norm": 1.7561137676239014, + "learning_rate": 1.581162184650704e-06, + "loss": 0.8987, + "step": 4032 + }, + { + "epoch": 3.75512104283054, + "grad_norm": 1.712423324584961, + "learning_rate": 1.580010929354332e-06, + "loss": 0.9485, + "step": 4033 + }, + { + "epoch": 3.756052141527002, + "grad_norm": 1.7377071380615234, + "learning_rate": 1.5788598996667695e-06, + "loss": 0.9128, + "step": 4034 + }, + { + "epoch": 3.7569832402234637, + "grad_norm": 1.7843185663223267, + "learning_rate": 1.577709095870285e-06, + "loss": 0.9394, + "step": 4035 + }, + { + "epoch": 3.7579143389199254, + "grad_norm": 1.732096791267395, + "learning_rate": 1.57655851824709e-06, + "loss": 0.9155, + "step": 4036 + }, + { + "epoch": 3.7588454376163876, + "grad_norm": 1.7659202814102173, + "learning_rate": 1.5754081670793395e-06, + "loss": 0.9083, + "step": 4037 + }, + { + "epoch": 3.7597765363128492, + "grad_norm": 1.743561029434204, + "learning_rate": 1.5742580426491338e-06, + "loss": 0.8927, + "step": 4038 + }, + { + "epoch": 3.760707635009311, + "grad_norm": 1.7321566343307495, + "learning_rate": 1.5731081452385188e-06, + "loss": 0.9204, + "step": 4039 + }, + { + "epoch": 3.761638733705773, + "grad_norm": 1.7810157537460327, + "learning_rate": 1.5719584751294842e-06, + "loss": 0.9043, + "step": 4040 + }, + { + "epoch": 3.7625698324022347, + "grad_norm": 1.823854684829712, + "learning_rate": 1.570809032603961e-06, + "loss": 0.954, + "step": 4041 + }, + { + "epoch": 3.7635009310986964, + "grad_norm": 1.723378300666809, + "learning_rate": 1.5696598179438293e-06, + "loss": 0.8818, + "step": 4042 + }, + { + "epoch": 3.7644320297951586, + "grad_norm": 1.7377586364746094, + "learning_rate": 1.5685108314309105e-06, + "loss": 0.902, + "step": 4043 + }, + { + "epoch": 3.7653631284916202, + "grad_norm": 1.7482560873031616, + "learning_rate": 1.5673620733469694e-06, + "loss": 0.9164, + "step": 4044 + }, + { + "epoch": 3.766294227188082, + "grad_norm": 1.7988420724868774, + "learning_rate": 1.5662135439737159e-06, + "loss": 0.9426, + "step": 4045 + }, + { + "epoch": 3.7672253258845436, + "grad_norm": 1.6999481916427612, + "learning_rate": 1.5650652435928043e-06, + "loss": 0.9025, + "step": 4046 + }, + { + "epoch": 3.7681564245810057, + "grad_norm": 1.8339227437973022, + "learning_rate": 1.5639171724858327e-06, + "loss": 0.9609, + "step": 4047 + }, + { + "epoch": 3.7690875232774674, + "grad_norm": 1.746368169784546, + "learning_rate": 1.5627693309343413e-06, + "loss": 0.9176, + "step": 4048 + }, + { + "epoch": 3.770018621973929, + "grad_norm": 1.7249951362609863, + "learning_rate": 1.5616217192198151e-06, + "loss": 0.9196, + "step": 4049 + }, + { + "epoch": 3.770949720670391, + "grad_norm": 1.6959333419799805, + "learning_rate": 1.5604743376236847e-06, + "loss": 0.9396, + "step": 4050 + }, + { + "epoch": 3.771880819366853, + "grad_norm": 1.8023675680160522, + "learning_rate": 1.5593271864273198e-06, + "loss": 0.9029, + "step": 4051 + }, + { + "epoch": 3.7728119180633146, + "grad_norm": 1.7501859664916992, + "learning_rate": 1.558180265912037e-06, + "loss": 0.9616, + "step": 4052 + }, + { + "epoch": 3.7737430167597763, + "grad_norm": 1.8011305332183838, + "learning_rate": 1.5570335763590972e-06, + "loss": 0.9092, + "step": 4053 + }, + { + "epoch": 3.7746741154562384, + "grad_norm": 1.827329397201538, + "learning_rate": 1.5558871180497004e-06, + "loss": 0.919, + "step": 4054 + }, + { + "epoch": 3.7756052141527, + "grad_norm": 1.671895980834961, + "learning_rate": 1.5547408912649942e-06, + "loss": 0.9416, + "step": 4055 + }, + { + "epoch": 3.776536312849162, + "grad_norm": 1.7218687534332275, + "learning_rate": 1.5535948962860658e-06, + "loss": 0.9169, + "step": 4056 + }, + { + "epoch": 3.777467411545624, + "grad_norm": 1.7514137029647827, + "learning_rate": 1.5524491333939501e-06, + "loss": 0.9409, + "step": 4057 + }, + { + "epoch": 3.7783985102420856, + "grad_norm": 1.6476129293441772, + "learning_rate": 1.5513036028696204e-06, + "loss": 0.8372, + "step": 4058 + }, + { + "epoch": 3.7793296089385473, + "grad_norm": 1.714404582977295, + "learning_rate": 1.550158304993995e-06, + "loss": 0.9221, + "step": 4059 + }, + { + "epoch": 3.7802607076350094, + "grad_norm": 1.6984009742736816, + "learning_rate": 1.549013240047937e-06, + "loss": 0.9143, + "step": 4060 + }, + { + "epoch": 3.781191806331471, + "grad_norm": 1.6875550746917725, + "learning_rate": 1.5478684083122481e-06, + "loss": 0.887, + "step": 4061 + }, + { + "epoch": 3.782122905027933, + "grad_norm": 1.7351542711257935, + "learning_rate": 1.5467238100676768e-06, + "loss": 0.9187, + "step": 4062 + }, + { + "epoch": 3.783054003724395, + "grad_norm": 1.7538175582885742, + "learning_rate": 1.5455794455949116e-06, + "loss": 0.8947, + "step": 4063 + }, + { + "epoch": 3.7839851024208566, + "grad_norm": 1.7356371879577637, + "learning_rate": 1.544435315174587e-06, + "loss": 0.9072, + "step": 4064 + }, + { + "epoch": 3.7849162011173183, + "grad_norm": 1.7276921272277832, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.8663, + "step": 4065 + }, + { + "epoch": 3.7858472998137804, + "grad_norm": 1.7000435590744019, + "learning_rate": 1.5421477576134966e-06, + "loss": 0.9123, + "step": 4066 + }, + { + "epoch": 3.786778398510242, + "grad_norm": 1.789954662322998, + "learning_rate": 1.5410043310337095e-06, + "loss": 0.9101, + "step": 4067 + }, + { + "epoch": 3.787709497206704, + "grad_norm": 1.7146666049957275, + "learning_rate": 1.5398611396283153e-06, + "loss": 0.9223, + "step": 4068 + }, + { + "epoch": 3.788640595903166, + "grad_norm": 1.7471160888671875, + "learning_rate": 1.5387181836776604e-06, + "loss": 0.9261, + "step": 4069 + }, + { + "epoch": 3.7895716945996276, + "grad_norm": 1.7684365510940552, + "learning_rate": 1.537575463462031e-06, + "loss": 0.9519, + "step": 4070 + }, + { + "epoch": 3.7905027932960893, + "grad_norm": 1.7900761365890503, + "learning_rate": 1.5364329792616577e-06, + "loss": 0.9018, + "step": 4071 + }, + { + "epoch": 3.7914338919925514, + "grad_norm": 1.843151330947876, + "learning_rate": 1.535290731356709e-06, + "loss": 0.9397, + "step": 4072 + }, + { + "epoch": 3.792364990689013, + "grad_norm": 1.6838605403900146, + "learning_rate": 1.5341487200273003e-06, + "loss": 0.881, + "step": 4073 + }, + { + "epoch": 3.793296089385475, + "grad_norm": 1.744537353515625, + "learning_rate": 1.5330069455534868e-06, + "loss": 0.9048, + "step": 4074 + }, + { + "epoch": 3.794227188081937, + "grad_norm": 1.799020528793335, + "learning_rate": 1.5318654082152639e-06, + "loss": 0.9034, + "step": 4075 + }, + { + "epoch": 3.7951582867783986, + "grad_norm": 1.6442123651504517, + "learning_rate": 1.530724108292573e-06, + "loss": 0.8921, + "step": 4076 + }, + { + "epoch": 3.7960893854748603, + "grad_norm": 1.6862736940383911, + "learning_rate": 1.5295830460652938e-06, + "loss": 0.9082, + "step": 4077 + }, + { + "epoch": 3.7970204841713224, + "grad_norm": 1.7750946283340454, + "learning_rate": 1.5284422218132495e-06, + "loss": 0.8917, + "step": 4078 + }, + { + "epoch": 3.797951582867784, + "grad_norm": 1.7139170169830322, + "learning_rate": 1.527301635816203e-06, + "loss": 0.9092, + "step": 4079 + }, + { + "epoch": 3.798882681564246, + "grad_norm": 1.7798914909362793, + "learning_rate": 1.526161288353861e-06, + "loss": 0.916, + "step": 4080 + }, + { + "epoch": 3.7998137802607075, + "grad_norm": 1.8249850273132324, + "learning_rate": 1.5250211797058712e-06, + "loss": 0.9659, + "step": 4081 + }, + { + "epoch": 3.8007448789571696, + "grad_norm": 1.728442668914795, + "learning_rate": 1.5238813101518208e-06, + "loss": 0.8815, + "step": 4082 + }, + { + "epoch": 3.8016759776536313, + "grad_norm": 1.6928693056106567, + "learning_rate": 1.5227416799712414e-06, + "loss": 0.8964, + "step": 4083 + }, + { + "epoch": 3.802607076350093, + "grad_norm": 1.705544114112854, + "learning_rate": 1.5216022894436044e-06, + "loss": 0.9163, + "step": 4084 + }, + { + "epoch": 3.8035381750465547, + "grad_norm": 1.8003267049789429, + "learning_rate": 1.5204631388483213e-06, + "loss": 0.9213, + "step": 4085 + }, + { + "epoch": 3.804469273743017, + "grad_norm": 1.760697364807129, + "learning_rate": 1.5193242284647458e-06, + "loss": 0.8991, + "step": 4086 + }, + { + "epoch": 3.8054003724394785, + "grad_norm": 1.7088054418563843, + "learning_rate": 1.5181855585721738e-06, + "loss": 0.8996, + "step": 4087 + }, + { + "epoch": 3.80633147113594, + "grad_norm": 1.6870173215866089, + "learning_rate": 1.5170471294498412e-06, + "loss": 0.9147, + "step": 4088 + }, + { + "epoch": 3.8072625698324023, + "grad_norm": 1.7374681234359741, + "learning_rate": 1.515908941376924e-06, + "loss": 0.9506, + "step": 4089 + }, + { + "epoch": 3.808193668528864, + "grad_norm": 1.6433149576187134, + "learning_rate": 1.5147709946325395e-06, + "loss": 0.8907, + "step": 4090 + }, + { + "epoch": 3.8091247672253257, + "grad_norm": 1.762100338935852, + "learning_rate": 1.5136332894957484e-06, + "loss": 0.8929, + "step": 4091 + }, + { + "epoch": 3.810055865921788, + "grad_norm": 1.7047054767608643, + "learning_rate": 1.5124958262455477e-06, + "loss": 0.8679, + "step": 4092 + }, + { + "epoch": 3.8109869646182495, + "grad_norm": 1.7583643198013306, + "learning_rate": 1.5113586051608782e-06, + "loss": 0.9434, + "step": 4093 + }, + { + "epoch": 3.811918063314711, + "grad_norm": 1.73088538646698, + "learning_rate": 1.5102216265206208e-06, + "loss": 0.9231, + "step": 4094 + }, + { + "epoch": 3.8128491620111733, + "grad_norm": 1.7437982559204102, + "learning_rate": 1.509084890603597e-06, + "loss": 0.9224, + "step": 4095 + }, + { + "epoch": 3.813780260707635, + "grad_norm": 1.7670544385910034, + "learning_rate": 1.5079483976885672e-06, + "loss": 0.9434, + "step": 4096 + }, + { + "epoch": 3.8147113594040967, + "grad_norm": 1.770229458808899, + "learning_rate": 1.5068121480542335e-06, + "loss": 0.8782, + "step": 4097 + }, + { + "epoch": 3.815642458100559, + "grad_norm": 1.7599060535430908, + "learning_rate": 1.5056761419792404e-06, + "loss": 0.9072, + "step": 4098 + }, + { + "epoch": 3.8165735567970205, + "grad_norm": 1.739033818244934, + "learning_rate": 1.5045403797421681e-06, + "loss": 0.9472, + "step": 4099 + }, + { + "epoch": 3.817504655493482, + "grad_norm": 1.755418062210083, + "learning_rate": 1.50340486162154e-06, + "loss": 0.8881, + "step": 4100 + }, + { + "epoch": 3.8184357541899443, + "grad_norm": 2.0345919132232666, + "learning_rate": 1.5022695878958194e-06, + "loss": 0.9399, + "step": 4101 + }, + { + "epoch": 3.819366852886406, + "grad_norm": 1.6922738552093506, + "learning_rate": 1.50113455884341e-06, + "loss": 0.8856, + "step": 4102 + }, + { + "epoch": 3.8202979515828677, + "grad_norm": 1.6994667053222656, + "learning_rate": 1.4999997747426538e-06, + "loss": 0.9008, + "step": 4103 + }, + { + "epoch": 3.82122905027933, + "grad_norm": 1.7431222200393677, + "learning_rate": 1.4988652358718336e-06, + "loss": 0.8982, + "step": 4104 + }, + { + "epoch": 3.8221601489757915, + "grad_norm": 1.7737343311309814, + "learning_rate": 1.4977309425091742e-06, + "loss": 0.9293, + "step": 4105 + }, + { + "epoch": 3.823091247672253, + "grad_norm": 1.700769305229187, + "learning_rate": 1.4965968949328352e-06, + "loss": 0.9152, + "step": 4106 + }, + { + "epoch": 3.8240223463687153, + "grad_norm": 1.7748693227767944, + "learning_rate": 1.4954630934209213e-06, + "loss": 0.9649, + "step": 4107 + }, + { + "epoch": 3.824953445065177, + "grad_norm": 1.7902367115020752, + "learning_rate": 1.494329538251473e-06, + "loss": 0.932, + "step": 4108 + }, + { + "epoch": 3.8258845437616387, + "grad_norm": 1.692631721496582, + "learning_rate": 1.4931962297024738e-06, + "loss": 0.8596, + "step": 4109 + }, + { + "epoch": 3.826815642458101, + "grad_norm": 1.7029507160186768, + "learning_rate": 1.4920631680518432e-06, + "loss": 0.9208, + "step": 4110 + }, + { + "epoch": 3.8277467411545625, + "grad_norm": 1.6942970752716064, + "learning_rate": 1.4909303535774421e-06, + "loss": 0.9032, + "step": 4111 + }, + { + "epoch": 3.828677839851024, + "grad_norm": 1.6594653129577637, + "learning_rate": 1.4897977865570713e-06, + "loss": 0.8836, + "step": 4112 + }, + { + "epoch": 3.8296089385474863, + "grad_norm": 1.7085827589035034, + "learning_rate": 1.488665467268468e-06, + "loss": 0.9315, + "step": 4113 + }, + { + "epoch": 3.830540037243948, + "grad_norm": 1.746665596961975, + "learning_rate": 1.487533395989313e-06, + "loss": 0.9154, + "step": 4114 + }, + { + "epoch": 3.8314711359404097, + "grad_norm": 1.7758121490478516, + "learning_rate": 1.4864015729972232e-06, + "loss": 0.9339, + "step": 4115 + }, + { + "epoch": 3.8324022346368714, + "grad_norm": 1.7491447925567627, + "learning_rate": 1.4852699985697546e-06, + "loss": 0.9125, + "step": 4116 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 1.7201588153839111, + "learning_rate": 1.4841386729844043e-06, + "loss": 0.8898, + "step": 4117 + }, + { + "epoch": 3.834264432029795, + "grad_norm": 1.71219003200531, + "learning_rate": 1.4830075965186064e-06, + "loss": 0.8557, + "step": 4118 + }, + { + "epoch": 3.835195530726257, + "grad_norm": 1.758597493171692, + "learning_rate": 1.4818767694497354e-06, + "loss": 0.925, + "step": 4119 + }, + { + "epoch": 3.8361266294227185, + "grad_norm": 1.7303849458694458, + "learning_rate": 1.4807461920551028e-06, + "loss": 0.9076, + "step": 4120 + }, + { + "epoch": 3.8370577281191807, + "grad_norm": 1.7226804494857788, + "learning_rate": 1.4796158646119607e-06, + "loss": 0.9126, + "step": 4121 + }, + { + "epoch": 3.8379888268156424, + "grad_norm": 1.7260617017745972, + "learning_rate": 1.4784857873974996e-06, + "loss": 0.8967, + "step": 4122 + }, + { + "epoch": 3.838919925512104, + "grad_norm": 1.6722670793533325, + "learning_rate": 1.477355960688847e-06, + "loss": 0.868, + "step": 4123 + }, + { + "epoch": 3.839851024208566, + "grad_norm": 1.7288227081298828, + "learning_rate": 1.4762263847630701e-06, + "loss": 0.9071, + "step": 4124 + }, + { + "epoch": 3.840782122905028, + "grad_norm": 1.7335927486419678, + "learning_rate": 1.475097059897176e-06, + "loss": 0.9051, + "step": 4125 + }, + { + "epoch": 3.8417132216014895, + "grad_norm": 1.8620281219482422, + "learning_rate": 1.4739679863681086e-06, + "loss": 0.9595, + "step": 4126 + }, + { + "epoch": 3.8426443202979517, + "grad_norm": 1.7311451435089111, + "learning_rate": 1.4728391644527494e-06, + "loss": 0.9315, + "step": 4127 + }, + { + "epoch": 3.8435754189944134, + "grad_norm": 1.7223219871520996, + "learning_rate": 1.4717105944279201e-06, + "loss": 0.9291, + "step": 4128 + }, + { + "epoch": 3.844506517690875, + "grad_norm": 1.8028019666671753, + "learning_rate": 1.4705822765703804e-06, + "loss": 0.9195, + "step": 4129 + }, + { + "epoch": 3.845437616387337, + "grad_norm": 1.7164208889007568, + "learning_rate": 1.4694542111568261e-06, + "loss": 0.9082, + "step": 4130 + }, + { + "epoch": 3.846368715083799, + "grad_norm": 1.698807716369629, + "learning_rate": 1.4683263984638929e-06, + "loss": 0.915, + "step": 4131 + }, + { + "epoch": 3.8472998137802605, + "grad_norm": 1.7113651037216187, + "learning_rate": 1.4671988387681548e-06, + "loss": 0.9033, + "step": 4132 + }, + { + "epoch": 3.8482309124767227, + "grad_norm": 1.8233782052993774, + "learning_rate": 1.4660715323461238e-06, + "loss": 0.9269, + "step": 4133 + }, + { + "epoch": 3.8491620111731844, + "grad_norm": 1.7255011796951294, + "learning_rate": 1.4649444794742474e-06, + "loss": 0.9298, + "step": 4134 + }, + { + "epoch": 3.850093109869646, + "grad_norm": 1.7078388929367065, + "learning_rate": 1.4638176804289128e-06, + "loss": 0.9132, + "step": 4135 + }, + { + "epoch": 3.851024208566108, + "grad_norm": 1.7611697912216187, + "learning_rate": 1.4626911354864465e-06, + "loss": 0.9237, + "step": 4136 + }, + { + "epoch": 3.85195530726257, + "grad_norm": 1.7190121412277222, + "learning_rate": 1.4615648449231095e-06, + "loss": 0.9199, + "step": 4137 + }, + { + "epoch": 3.8528864059590315, + "grad_norm": 1.7358964681625366, + "learning_rate": 1.4604388090151016e-06, + "loss": 0.9289, + "step": 4138 + }, + { + "epoch": 3.8538175046554937, + "grad_norm": 1.7040410041809082, + "learning_rate": 1.4593130280385614e-06, + "loss": 0.9017, + "step": 4139 + }, + { + "epoch": 3.8547486033519553, + "grad_norm": 1.7929447889328003, + "learning_rate": 1.4581875022695655e-06, + "loss": 0.9467, + "step": 4140 + }, + { + "epoch": 3.855679702048417, + "grad_norm": 1.7423875331878662, + "learning_rate": 1.4570622319841232e-06, + "loss": 0.9309, + "step": 4141 + }, + { + "epoch": 3.856610800744879, + "grad_norm": 1.756166696548462, + "learning_rate": 1.4559372174581865e-06, + "loss": 0.9572, + "step": 4142 + }, + { + "epoch": 3.857541899441341, + "grad_norm": 1.727579116821289, + "learning_rate": 1.4548124589676417e-06, + "loss": 0.9594, + "step": 4143 + }, + { + "epoch": 3.8584729981378025, + "grad_norm": 1.7996478080749512, + "learning_rate": 1.4536879567883133e-06, + "loss": 0.9497, + "step": 4144 + }, + { + "epoch": 3.8594040968342647, + "grad_norm": 1.7171953916549683, + "learning_rate": 1.4525637111959634e-06, + "loss": 0.8674, + "step": 4145 + }, + { + "epoch": 3.8603351955307263, + "grad_norm": 1.745754599571228, + "learning_rate": 1.4514397224662902e-06, + "loss": 0.9116, + "step": 4146 + }, + { + "epoch": 3.861266294227188, + "grad_norm": 1.7001436948776245, + "learning_rate": 1.4503159908749292e-06, + "loss": 0.8944, + "step": 4147 + }, + { + "epoch": 3.86219739292365, + "grad_norm": 1.8127472400665283, + "learning_rate": 1.4491925166974533e-06, + "loss": 0.9166, + "step": 4148 + }, + { + "epoch": 3.863128491620112, + "grad_norm": 1.7718563079833984, + "learning_rate": 1.4480693002093715e-06, + "loss": 0.916, + "step": 4149 + }, + { + "epoch": 3.8640595903165735, + "grad_norm": 1.7531687021255493, + "learning_rate": 1.4469463416861307e-06, + "loss": 0.9367, + "step": 4150 + }, + { + "epoch": 3.864990689013035, + "grad_norm": 1.7965582609176636, + "learning_rate": 1.4458236414031134e-06, + "loss": 0.9461, + "step": 4151 + }, + { + "epoch": 3.8659217877094973, + "grad_norm": 1.7462981939315796, + "learning_rate": 1.444701199635639e-06, + "loss": 0.8741, + "step": 4152 + }, + { + "epoch": 3.866852886405959, + "grad_norm": 1.7626858949661255, + "learning_rate": 1.4435790166589647e-06, + "loss": 0.8978, + "step": 4153 + }, + { + "epoch": 3.8677839851024207, + "grad_norm": 1.7848018407821655, + "learning_rate": 1.4424570927482826e-06, + "loss": 0.9353, + "step": 4154 + }, + { + "epoch": 3.868715083798883, + "grad_norm": 1.6567825078964233, + "learning_rate": 1.4413354281787217e-06, + "loss": 0.8812, + "step": 4155 + }, + { + "epoch": 3.8696461824953445, + "grad_norm": 1.779465675354004, + "learning_rate": 1.4402140232253486e-06, + "loss": 0.9216, + "step": 4156 + }, + { + "epoch": 3.870577281191806, + "grad_norm": 1.738989233970642, + "learning_rate": 1.4390928781631647e-06, + "loss": 0.9017, + "step": 4157 + }, + { + "epoch": 3.871508379888268, + "grad_norm": 1.7417113780975342, + "learning_rate": 1.437971993267108e-06, + "loss": 0.8857, + "step": 4158 + }, + { + "epoch": 3.87243947858473, + "grad_norm": 1.738136649131775, + "learning_rate": 1.4368513688120534e-06, + "loss": 0.9133, + "step": 4159 + }, + { + "epoch": 3.8733705772811917, + "grad_norm": 1.8069320917129517, + "learning_rate": 1.4357310050728116e-06, + "loss": 0.9184, + "step": 4160 + }, + { + "epoch": 3.8743016759776534, + "grad_norm": 1.7768930196762085, + "learning_rate": 1.434610902324129e-06, + "loss": 0.9325, + "step": 4161 + }, + { + "epoch": 3.8752327746741155, + "grad_norm": 1.7043150663375854, + "learning_rate": 1.4334910608406881e-06, + "loss": 0.917, + "step": 4162 + }, + { + "epoch": 3.876163873370577, + "grad_norm": 1.7589137554168701, + "learning_rate": 1.4323714808971078e-06, + "loss": 0.9163, + "step": 4163 + }, + { + "epoch": 3.877094972067039, + "grad_norm": 1.8015676736831665, + "learning_rate": 1.431252162767943e-06, + "loss": 0.8728, + "step": 4164 + }, + { + "epoch": 3.878026070763501, + "grad_norm": 1.7375624179840088, + "learning_rate": 1.430133106727683e-06, + "loss": 0.8913, + "step": 4165 + }, + { + "epoch": 3.8789571694599627, + "grad_norm": 1.756018042564392, + "learning_rate": 1.4290143130507544e-06, + "loss": 0.9051, + "step": 4166 + }, + { + "epoch": 3.8798882681564244, + "grad_norm": 1.6689399480819702, + "learning_rate": 1.4278957820115187e-06, + "loss": 0.8907, + "step": 4167 + }, + { + "epoch": 3.8808193668528865, + "grad_norm": 1.6374313831329346, + "learning_rate": 1.4267775138842726e-06, + "loss": 0.8721, + "step": 4168 + }, + { + "epoch": 3.881750465549348, + "grad_norm": 1.7690523862838745, + "learning_rate": 1.4256595089432502e-06, + "loss": 0.9368, + "step": 4169 + }, + { + "epoch": 3.88268156424581, + "grad_norm": 1.7883734703063965, + "learning_rate": 1.4245417674626183e-06, + "loss": 0.9404, + "step": 4170 + }, + { + "epoch": 3.883612662942272, + "grad_norm": 1.8173571825027466, + "learning_rate": 1.4234242897164814e-06, + "loss": 0.9077, + "step": 4171 + }, + { + "epoch": 3.8845437616387337, + "grad_norm": 1.8200242519378662, + "learning_rate": 1.4223070759788777e-06, + "loss": 0.9186, + "step": 4172 + }, + { + "epoch": 3.8854748603351954, + "grad_norm": 1.7700036764144897, + "learning_rate": 1.4211901265237821e-06, + "loss": 0.904, + "step": 4173 + }, + { + "epoch": 3.8864059590316575, + "grad_norm": 1.8249975442886353, + "learning_rate": 1.4200734416251047e-06, + "loss": 0.8984, + "step": 4174 + }, + { + "epoch": 3.887337057728119, + "grad_norm": 1.7791286706924438, + "learning_rate": 1.418957021556687e-06, + "loss": 0.9425, + "step": 4175 + }, + { + "epoch": 3.888268156424581, + "grad_norm": 1.8396724462509155, + "learning_rate": 1.4178408665923115e-06, + "loss": 0.9386, + "step": 4176 + }, + { + "epoch": 3.889199255121043, + "grad_norm": 1.7584431171417236, + "learning_rate": 1.4167249770056918e-06, + "loss": 0.8921, + "step": 4177 + }, + { + "epoch": 3.8901303538175047, + "grad_norm": 1.7347151041030884, + "learning_rate": 1.4156093530704774e-06, + "loss": 0.8947, + "step": 4178 + }, + { + "epoch": 3.8910614525139664, + "grad_norm": 1.752658724784851, + "learning_rate": 1.4144939950602527e-06, + "loss": 0.9384, + "step": 4179 + }, + { + "epoch": 3.8919925512104285, + "grad_norm": 1.7194240093231201, + "learning_rate": 1.4133789032485367e-06, + "loss": 0.8973, + "step": 4180 + }, + { + "epoch": 3.89292364990689, + "grad_norm": 1.7423033714294434, + "learning_rate": 1.4122640779087842e-06, + "loss": 0.9023, + "step": 4181 + }, + { + "epoch": 3.893854748603352, + "grad_norm": 1.7129592895507812, + "learning_rate": 1.411149519314381e-06, + "loss": 0.899, + "step": 4182 + }, + { + "epoch": 3.894785847299814, + "grad_norm": 1.8546561002731323, + "learning_rate": 1.4100352277386526e-06, + "loss": 0.9093, + "step": 4183 + }, + { + "epoch": 3.8957169459962757, + "grad_norm": 1.7067517042160034, + "learning_rate": 1.4089212034548572e-06, + "loss": 0.8734, + "step": 4184 + }, + { + "epoch": 3.8966480446927374, + "grad_norm": 1.7312023639678955, + "learning_rate": 1.407807446736184e-06, + "loss": 0.8872, + "step": 4185 + }, + { + "epoch": 3.8975791433891995, + "grad_norm": 1.7390625476837158, + "learning_rate": 1.4066939578557604e-06, + "loss": 0.9156, + "step": 4186 + }, + { + "epoch": 3.898510242085661, + "grad_norm": 1.6948963403701782, + "learning_rate": 1.4055807370866488e-06, + "loss": 0.9313, + "step": 4187 + }, + { + "epoch": 3.899441340782123, + "grad_norm": 1.7991451025009155, + "learning_rate": 1.404467784701844e-06, + "loss": 0.9527, + "step": 4188 + }, + { + "epoch": 3.9003724394785846, + "grad_norm": 1.7324836254119873, + "learning_rate": 1.403355100974272e-06, + "loss": 0.8844, + "step": 4189 + }, + { + "epoch": 3.9013035381750467, + "grad_norm": 1.7347400188446045, + "learning_rate": 1.4022426861767999e-06, + "loss": 0.8792, + "step": 4190 + }, + { + "epoch": 3.9022346368715084, + "grad_norm": 1.750918984413147, + "learning_rate": 1.4011305405822242e-06, + "loss": 0.9356, + "step": 4191 + }, + { + "epoch": 3.90316573556797, + "grad_norm": 1.7328245639801025, + "learning_rate": 1.4000186644632746e-06, + "loss": 0.8797, + "step": 4192 + }, + { + "epoch": 3.9040968342644318, + "grad_norm": 1.769477367401123, + "learning_rate": 1.3989070580926167e-06, + "loss": 0.9287, + "step": 4193 + }, + { + "epoch": 3.905027932960894, + "grad_norm": 1.7599049806594849, + "learning_rate": 1.3977957217428507e-06, + "loss": 0.9192, + "step": 4194 + }, + { + "epoch": 3.9059590316573556, + "grad_norm": 1.7609810829162598, + "learning_rate": 1.3966846556865105e-06, + "loss": 0.9351, + "step": 4195 + }, + { + "epoch": 3.9068901303538173, + "grad_norm": 1.7171039581298828, + "learning_rate": 1.395573860196059e-06, + "loss": 0.8922, + "step": 4196 + }, + { + "epoch": 3.9078212290502794, + "grad_norm": 1.691245675086975, + "learning_rate": 1.3944633355438994e-06, + "loss": 0.9075, + "step": 4197 + }, + { + "epoch": 3.908752327746741, + "grad_norm": 1.7755759954452515, + "learning_rate": 1.3933530820023661e-06, + "loss": 0.8995, + "step": 4198 + }, + { + "epoch": 3.9096834264432028, + "grad_norm": 1.766641616821289, + "learning_rate": 1.392243099843724e-06, + "loss": 0.9629, + "step": 4199 + }, + { + "epoch": 3.910614525139665, + "grad_norm": 1.7448575496673584, + "learning_rate": 1.3911333893401742e-06, + "loss": 0.9003, + "step": 4200 + }, + { + "epoch": 3.9115456238361266, + "grad_norm": 1.7452454566955566, + "learning_rate": 1.3900239507638525e-06, + "loss": 0.9341, + "step": 4201 + }, + { + "epoch": 3.9124767225325883, + "grad_norm": 1.7095787525177002, + "learning_rate": 1.3889147843868264e-06, + "loss": 0.8974, + "step": 4202 + }, + { + "epoch": 3.9134078212290504, + "grad_norm": 1.7270052433013916, + "learning_rate": 1.387805890481095e-06, + "loss": 0.9203, + "step": 4203 + }, + { + "epoch": 3.914338919925512, + "grad_norm": 1.765599250793457, + "learning_rate": 1.3866972693185921e-06, + "loss": 0.8957, + "step": 4204 + }, + { + "epoch": 3.9152700186219738, + "grad_norm": 1.7474619150161743, + "learning_rate": 1.3855889211711875e-06, + "loss": 0.9101, + "step": 4205 + }, + { + "epoch": 3.916201117318436, + "grad_norm": 1.8109654188156128, + "learning_rate": 1.3844808463106788e-06, + "loss": 0.9356, + "step": 4206 + }, + { + "epoch": 3.9171322160148976, + "grad_norm": 1.6738662719726562, + "learning_rate": 1.3833730450087985e-06, + "loss": 0.8954, + "step": 4207 + }, + { + "epoch": 3.9180633147113593, + "grad_norm": 1.7239640951156616, + "learning_rate": 1.3822655175372148e-06, + "loss": 0.8762, + "step": 4208 + }, + { + "epoch": 3.9189944134078214, + "grad_norm": 1.6647493839263916, + "learning_rate": 1.3811582641675266e-06, + "loss": 0.8794, + "step": 4209 + }, + { + "epoch": 3.919925512104283, + "grad_norm": 1.7524518966674805, + "learning_rate": 1.3800512851712636e-06, + "loss": 0.8951, + "step": 4210 + }, + { + "epoch": 3.9208566108007448, + "grad_norm": 1.7558668851852417, + "learning_rate": 1.3789445808198898e-06, + "loss": 0.9081, + "step": 4211 + }, + { + "epoch": 3.921787709497207, + "grad_norm": 1.8038939237594604, + "learning_rate": 1.3778381513848056e-06, + "loss": 0.8974, + "step": 4212 + }, + { + "epoch": 3.9227188081936686, + "grad_norm": 1.7405784130096436, + "learning_rate": 1.3767319971373369e-06, + "loss": 0.9448, + "step": 4213 + }, + { + "epoch": 3.9236499068901303, + "grad_norm": 1.757226824760437, + "learning_rate": 1.3756261183487473e-06, + "loss": 0.9157, + "step": 4214 + }, + { + "epoch": 3.9245810055865924, + "grad_norm": 1.6415494680404663, + "learning_rate": 1.3745205152902313e-06, + "loss": 0.8923, + "step": 4215 + }, + { + "epoch": 3.925512104283054, + "grad_norm": 1.7430733442306519, + "learning_rate": 1.3734151882329157e-06, + "loss": 0.8992, + "step": 4216 + }, + { + "epoch": 3.9264432029795158, + "grad_norm": 1.680516004562378, + "learning_rate": 1.3723101374478598e-06, + "loss": 0.9135, + "step": 4217 + }, + { + "epoch": 3.927374301675978, + "grad_norm": 1.7036300897598267, + "learning_rate": 1.371205363206054e-06, + "loss": 0.8859, + "step": 4218 + }, + { + "epoch": 3.9283054003724396, + "grad_norm": 1.7933826446533203, + "learning_rate": 1.370100865778425e-06, + "loss": 0.8925, + "step": 4219 + }, + { + "epoch": 3.9292364990689013, + "grad_norm": 1.6897965669631958, + "learning_rate": 1.3689966454358255e-06, + "loss": 0.8869, + "step": 4220 + }, + { + "epoch": 3.9301675977653634, + "grad_norm": 1.769063949584961, + "learning_rate": 1.3678927024490446e-06, + "loss": 0.9002, + "step": 4221 + }, + { + "epoch": 3.931098696461825, + "grad_norm": 1.729137659072876, + "learning_rate": 1.3667890370888016e-06, + "loss": 0.8762, + "step": 4222 + }, + { + "epoch": 3.9320297951582868, + "grad_norm": 1.7462197542190552, + "learning_rate": 1.3656856496257486e-06, + "loss": 0.909, + "step": 4223 + }, + { + "epoch": 3.9329608938547485, + "grad_norm": 1.7002540826797485, + "learning_rate": 1.364582540330469e-06, + "loss": 0.8908, + "step": 4224 + }, + { + "epoch": 3.9338919925512106, + "grad_norm": 1.711059331893921, + "learning_rate": 1.3634797094734776e-06, + "loss": 0.8971, + "step": 4225 + }, + { + "epoch": 3.9348230912476723, + "grad_norm": 1.7691084146499634, + "learning_rate": 1.3623771573252237e-06, + "loss": 0.942, + "step": 4226 + }, + { + "epoch": 3.935754189944134, + "grad_norm": 1.795163631439209, + "learning_rate": 1.3612748841560835e-06, + "loss": 0.9528, + "step": 4227 + }, + { + "epoch": 3.9366852886405956, + "grad_norm": 1.7376372814178467, + "learning_rate": 1.3601728902363682e-06, + "loss": 0.9344, + "step": 4228 + }, + { + "epoch": 3.9376163873370578, + "grad_norm": 1.747847080230713, + "learning_rate": 1.35907117583632e-06, + "loss": 0.91, + "step": 4229 + }, + { + "epoch": 3.9385474860335195, + "grad_norm": 1.7352440357208252, + "learning_rate": 1.3579697412261116e-06, + "loss": 0.9061, + "step": 4230 + }, + { + "epoch": 3.939478584729981, + "grad_norm": 1.7595494985580444, + "learning_rate": 1.3568685866758483e-06, + "loss": 0.9509, + "step": 4231 + }, + { + "epoch": 3.9404096834264433, + "grad_norm": 1.70807945728302, + "learning_rate": 1.3557677124555656e-06, + "loss": 0.898, + "step": 4232 + }, + { + "epoch": 3.941340782122905, + "grad_norm": 1.8331208229064941, + "learning_rate": 1.354667118835231e-06, + "loss": 0.9125, + "step": 4233 + }, + { + "epoch": 3.9422718808193666, + "grad_norm": 1.8426227569580078, + "learning_rate": 1.3535668060847428e-06, + "loss": 0.9327, + "step": 4234 + }, + { + "epoch": 3.9432029795158288, + "grad_norm": 1.745550274848938, + "learning_rate": 1.3524667744739305e-06, + "loss": 0.9143, + "step": 4235 + }, + { + "epoch": 3.9441340782122905, + "grad_norm": 1.727669596672058, + "learning_rate": 1.3513670242725552e-06, + "loss": 0.8857, + "step": 4236 + }, + { + "epoch": 3.945065176908752, + "grad_norm": 1.639237642288208, + "learning_rate": 1.350267555750308e-06, + "loss": 0.8758, + "step": 4237 + }, + { + "epoch": 3.9459962756052143, + "grad_norm": 1.7214468717575073, + "learning_rate": 1.3491683691768118e-06, + "loss": 0.9018, + "step": 4238 + }, + { + "epoch": 3.946927374301676, + "grad_norm": 1.7821786403656006, + "learning_rate": 1.3480694648216197e-06, + "loss": 0.8893, + "step": 4239 + }, + { + "epoch": 3.9478584729981376, + "grad_norm": 1.7453091144561768, + "learning_rate": 1.3469708429542157e-06, + "loss": 0.878, + "step": 4240 + }, + { + "epoch": 3.9487895716945998, + "grad_norm": 1.7219913005828857, + "learning_rate": 1.3458725038440154e-06, + "loss": 0.8881, + "step": 4241 + }, + { + "epoch": 3.9497206703910615, + "grad_norm": 1.684158444404602, + "learning_rate": 1.3447744477603639e-06, + "loss": 0.8995, + "step": 4242 + }, + { + "epoch": 3.950651769087523, + "grad_norm": 1.7291629314422607, + "learning_rate": 1.3436766749725372e-06, + "loss": 0.8966, + "step": 4243 + }, + { + "epoch": 3.9515828677839853, + "grad_norm": 1.786635160446167, + "learning_rate": 1.3425791857497422e-06, + "loss": 0.9089, + "step": 4244 + }, + { + "epoch": 3.952513966480447, + "grad_norm": 1.7477847337722778, + "learning_rate": 1.3414819803611165e-06, + "loss": 0.9289, + "step": 4245 + }, + { + "epoch": 3.9534450651769086, + "grad_norm": 1.695330023765564, + "learning_rate": 1.3403850590757267e-06, + "loss": 0.9412, + "step": 4246 + }, + { + "epoch": 3.9543761638733708, + "grad_norm": 1.7521381378173828, + "learning_rate": 1.3392884221625718e-06, + "loss": 0.9098, + "step": 4247 + }, + { + "epoch": 3.9553072625698324, + "grad_norm": 1.6534643173217773, + "learning_rate": 1.3381920698905788e-06, + "loss": 0.8957, + "step": 4248 + }, + { + "epoch": 3.956238361266294, + "grad_norm": 1.7063037157058716, + "learning_rate": 1.3370960025286068e-06, + "loss": 0.926, + "step": 4249 + }, + { + "epoch": 3.9571694599627563, + "grad_norm": 1.7220642566680908, + "learning_rate": 1.3360002203454441e-06, + "loss": 0.9278, + "step": 4250 + }, + { + "epoch": 3.958100558659218, + "grad_norm": 1.7003141641616821, + "learning_rate": 1.3349047236098089e-06, + "loss": 0.8983, + "step": 4251 + }, + { + "epoch": 3.9590316573556796, + "grad_norm": 1.6883808374404907, + "learning_rate": 1.3338095125903504e-06, + "loss": 0.9157, + "step": 4252 + }, + { + "epoch": 3.9599627560521418, + "grad_norm": 1.7049285173416138, + "learning_rate": 1.3327145875556475e-06, + "loss": 0.8966, + "step": 4253 + }, + { + "epoch": 3.9608938547486034, + "grad_norm": 1.7231625318527222, + "learning_rate": 1.3316199487742057e-06, + "loss": 0.927, + "step": 4254 + }, + { + "epoch": 3.961824953445065, + "grad_norm": 1.7194812297821045, + "learning_rate": 1.330525596514466e-06, + "loss": 0.894, + "step": 4255 + }, + { + "epoch": 3.9627560521415273, + "grad_norm": 1.7682898044586182, + "learning_rate": 1.3294315310447958e-06, + "loss": 0.9238, + "step": 4256 + }, + { + "epoch": 3.963687150837989, + "grad_norm": 1.6532021760940552, + "learning_rate": 1.3283377526334921e-06, + "loss": 0.8631, + "step": 4257 + }, + { + "epoch": 3.9646182495344506, + "grad_norm": 1.7085129022598267, + "learning_rate": 1.3272442615487822e-06, + "loss": 0.9009, + "step": 4258 + }, + { + "epoch": 3.9655493482309123, + "grad_norm": 1.7306479215621948, + "learning_rate": 1.3261510580588227e-06, + "loss": 0.9284, + "step": 4259 + }, + { + "epoch": 3.9664804469273744, + "grad_norm": 1.719754934310913, + "learning_rate": 1.3250581424317012e-06, + "loss": 0.902, + "step": 4260 + }, + { + "epoch": 3.967411545623836, + "grad_norm": 1.7374979257583618, + "learning_rate": 1.3239655149354297e-06, + "loss": 0.9353, + "step": 4261 + }, + { + "epoch": 3.968342644320298, + "grad_norm": 1.6877343654632568, + "learning_rate": 1.3228731758379562e-06, + "loss": 0.908, + "step": 4262 + }, + { + "epoch": 3.9692737430167595, + "grad_norm": 1.689611554145813, + "learning_rate": 1.3217811254071544e-06, + "loss": 0.886, + "step": 4263 + }, + { + "epoch": 3.9702048417132216, + "grad_norm": 1.7651277780532837, + "learning_rate": 1.320689363910827e-06, + "loss": 0.8875, + "step": 4264 + }, + { + "epoch": 3.9711359404096833, + "grad_norm": 1.7377010583877563, + "learning_rate": 1.319597891616707e-06, + "loss": 0.8883, + "step": 4265 + }, + { + "epoch": 3.972067039106145, + "grad_norm": 1.7432407140731812, + "learning_rate": 1.318506708792456e-06, + "loss": 0.8782, + "step": 4266 + }, + { + "epoch": 3.972998137802607, + "grad_norm": 1.7551065683364868, + "learning_rate": 1.3174158157056654e-06, + "loss": 0.898, + "step": 4267 + }, + { + "epoch": 3.973929236499069, + "grad_norm": 1.7748680114746094, + "learning_rate": 1.3163252126238524e-06, + "loss": 0.9389, + "step": 4268 + }, + { + "epoch": 3.9748603351955305, + "grad_norm": 1.691055178642273, + "learning_rate": 1.3152348998144677e-06, + "loss": 0.8876, + "step": 4269 + }, + { + "epoch": 3.9757914338919926, + "grad_norm": 1.7495986223220825, + "learning_rate": 1.3141448775448875e-06, + "loss": 0.885, + "step": 4270 + }, + { + "epoch": 3.9767225325884543, + "grad_norm": 1.6924501657485962, + "learning_rate": 1.3130551460824196e-06, + "loss": 0.9332, + "step": 4271 + }, + { + "epoch": 3.977653631284916, + "grad_norm": 1.6970319747924805, + "learning_rate": 1.3119657056942952e-06, + "loss": 0.8884, + "step": 4272 + }, + { + "epoch": 3.978584729981378, + "grad_norm": 1.7221189737319946, + "learning_rate": 1.3108765566476805e-06, + "loss": 0.8864, + "step": 4273 + }, + { + "epoch": 3.97951582867784, + "grad_norm": 1.7274166345596313, + "learning_rate": 1.309787699209668e-06, + "loss": 0.9295, + "step": 4274 + }, + { + "epoch": 3.9804469273743015, + "grad_norm": 1.6894820928573608, + "learning_rate": 1.3086991336472748e-06, + "loss": 0.8784, + "step": 4275 + }, + { + "epoch": 3.9813780260707636, + "grad_norm": 1.782052755355835, + "learning_rate": 1.3076108602274523e-06, + "loss": 0.8826, + "step": 4276 + }, + { + "epoch": 3.9823091247672253, + "grad_norm": 1.732740879058838, + "learning_rate": 1.3065228792170772e-06, + "loss": 0.9184, + "step": 4277 + }, + { + "epoch": 3.983240223463687, + "grad_norm": 1.6925077438354492, + "learning_rate": 1.3054351908829558e-06, + "loss": 0.8922, + "step": 4278 + }, + { + "epoch": 3.984171322160149, + "grad_norm": 1.7218379974365234, + "learning_rate": 1.3043477954918189e-06, + "loss": 0.936, + "step": 4279 + }, + { + "epoch": 3.985102420856611, + "grad_norm": 1.7616764307022095, + "learning_rate": 1.3032606933103305e-06, + "loss": 0.8905, + "step": 4280 + }, + { + "epoch": 3.9860335195530725, + "grad_norm": 1.7263654470443726, + "learning_rate": 1.302173884605082e-06, + "loss": 0.9289, + "step": 4281 + }, + { + "epoch": 3.9869646182495346, + "grad_norm": 1.785617470741272, + "learning_rate": 1.301087369642588e-06, + "loss": 0.9303, + "step": 4282 + }, + { + "epoch": 3.9878957169459963, + "grad_norm": 1.753913164138794, + "learning_rate": 1.3000011486892948e-06, + "loss": 0.9088, + "step": 4283 + }, + { + "epoch": 3.988826815642458, + "grad_norm": 1.7531358003616333, + "learning_rate": 1.2989152220115803e-06, + "loss": 0.9382, + "step": 4284 + }, + { + "epoch": 3.98975791433892, + "grad_norm": 1.7975728511810303, + "learning_rate": 1.2978295898757414e-06, + "loss": 0.9357, + "step": 4285 + }, + { + "epoch": 3.990689013035382, + "grad_norm": 1.7662535905838013, + "learning_rate": 1.2967442525480092e-06, + "loss": 0.9669, + "step": 4286 + }, + { + "epoch": 3.9916201117318435, + "grad_norm": 1.771039366722107, + "learning_rate": 1.295659210294542e-06, + "loss": 0.8885, + "step": 4287 + }, + { + "epoch": 3.9925512104283056, + "grad_norm": 1.7708089351654053, + "learning_rate": 1.2945744633814245e-06, + "loss": 0.933, + "step": 4288 + }, + { + "epoch": 3.9934823091247673, + "grad_norm": 1.7533818483352661, + "learning_rate": 1.2934900120746672e-06, + "loss": 0.9461, + "step": 4289 + }, + { + "epoch": 3.994413407821229, + "grad_norm": 1.743216872215271, + "learning_rate": 1.2924058566402097e-06, + "loss": 0.8616, + "step": 4290 + }, + { + "epoch": 3.995344506517691, + "grad_norm": 1.7505429983139038, + "learning_rate": 1.2913219973439234e-06, + "loss": 0.9026, + "step": 4291 + }, + { + "epoch": 3.996275605214153, + "grad_norm": 1.7968896627426147, + "learning_rate": 1.2902384344515984e-06, + "loss": 0.9293, + "step": 4292 + }, + { + "epoch": 3.9972067039106145, + "grad_norm": 1.7324241399765015, + "learning_rate": 1.2891551682289582e-06, + "loss": 0.8965, + "step": 4293 + }, + { + "epoch": 3.998137802607076, + "grad_norm": 1.7534737586975098, + "learning_rate": 1.2880721989416528e-06, + "loss": 0.8998, + "step": 4294 + }, + { + "epoch": 3.9990689013035383, + "grad_norm": 1.8052053451538086, + "learning_rate": 1.2869895268552596e-06, + "loss": 0.9414, + "step": 4295 + }, + { + "epoch": 4.0, + "grad_norm": 1.813524603843689, + "learning_rate": 1.2859071522352794e-06, + "loss": 0.931, + "step": 4296 + }, + { + "epoch": 4.000931098696462, + "grad_norm": 1.7110408544540405, + "learning_rate": 1.284825075347143e-06, + "loss": 0.9168, + "step": 4297 + }, + { + "epoch": 4.001862197392923, + "grad_norm": 1.6881792545318604, + "learning_rate": 1.2837432964562115e-06, + "loss": 0.8928, + "step": 4298 + }, + { + "epoch": 4.0027932960893855, + "grad_norm": 1.6802314519882202, + "learning_rate": 1.282661815827766e-06, + "loss": 0.8728, + "step": 4299 + }, + { + "epoch": 4.003724394785848, + "grad_norm": 1.7116115093231201, + "learning_rate": 1.2815806337270186e-06, + "loss": 0.8876, + "step": 4300 + }, + { + "epoch": 4.004655493482309, + "grad_norm": 1.6894326210021973, + "learning_rate": 1.2804997504191068e-06, + "loss": 0.8959, + "step": 4301 + }, + { + "epoch": 4.005586592178771, + "grad_norm": 1.734230399131775, + "learning_rate": 1.2794191661690987e-06, + "loss": 0.8683, + "step": 4302 + }, + { + "epoch": 4.006517690875233, + "grad_norm": 1.6565067768096924, + "learning_rate": 1.2783388812419825e-06, + "loss": 0.8579, + "step": 4303 + }, + { + "epoch": 4.007448789571694, + "grad_norm": 1.7714468240737915, + "learning_rate": 1.2772588959026763e-06, + "loss": 0.9291, + "step": 4304 + }, + { + "epoch": 4.0083798882681565, + "grad_norm": 1.8090442419052124, + "learning_rate": 1.2761792104160282e-06, + "loss": 0.9154, + "step": 4305 + }, + { + "epoch": 4.009310986964619, + "grad_norm": 1.748423457145691, + "learning_rate": 1.2750998250468055e-06, + "loss": 0.8539, + "step": 4306 + }, + { + "epoch": 4.01024208566108, + "grad_norm": 1.7488853931427002, + "learning_rate": 1.2740207400597076e-06, + "loss": 0.8778, + "step": 4307 + }, + { + "epoch": 4.011173184357542, + "grad_norm": 1.7151849269866943, + "learning_rate": 1.2729419557193573e-06, + "loss": 0.8903, + "step": 4308 + }, + { + "epoch": 4.012104283054004, + "grad_norm": 1.7538071870803833, + "learning_rate": 1.2718634722903073e-06, + "loss": 0.8607, + "step": 4309 + }, + { + "epoch": 4.013035381750465, + "grad_norm": 1.729733943939209, + "learning_rate": 1.270785290037031e-06, + "loss": 0.8959, + "step": 4310 + }, + { + "epoch": 4.0139664804469275, + "grad_norm": 1.81771981716156, + "learning_rate": 1.2697074092239322e-06, + "loss": 0.9031, + "step": 4311 + }, + { + "epoch": 4.01489757914339, + "grad_norm": 1.7719171047210693, + "learning_rate": 1.2686298301153394e-06, + "loss": 0.9052, + "step": 4312 + }, + { + "epoch": 4.015828677839851, + "grad_norm": 1.774204969406128, + "learning_rate": 1.2675525529755073e-06, + "loss": 0.8669, + "step": 4313 + }, + { + "epoch": 4.016759776536313, + "grad_norm": 1.7817857265472412, + "learning_rate": 1.266475578068616e-06, + "loss": 0.8597, + "step": 4314 + }, + { + "epoch": 4.017690875232774, + "grad_norm": 1.7808914184570312, + "learning_rate": 1.2653989056587728e-06, + "loss": 0.8837, + "step": 4315 + }, + { + "epoch": 4.018621973929236, + "grad_norm": 1.8062949180603027, + "learning_rate": 1.2643225360100089e-06, + "loss": 0.8648, + "step": 4316 + }, + { + "epoch": 4.0195530726256985, + "grad_norm": 1.7420310974121094, + "learning_rate": 1.2632464693862832e-06, + "loss": 0.8694, + "step": 4317 + }, + { + "epoch": 4.02048417132216, + "grad_norm": 1.859114408493042, + "learning_rate": 1.2621707060514786e-06, + "loss": 0.9073, + "step": 4318 + }, + { + "epoch": 4.021415270018622, + "grad_norm": 1.712108850479126, + "learning_rate": 1.2610952462694052e-06, + "loss": 0.8917, + "step": 4319 + }, + { + "epoch": 4.022346368715084, + "grad_norm": 1.7661962509155273, + "learning_rate": 1.260020090303797e-06, + "loss": 0.9052, + "step": 4320 + }, + { + "epoch": 4.023277467411545, + "grad_norm": 1.7091705799102783, + "learning_rate": 1.2589452384183154e-06, + "loss": 0.8834, + "step": 4321 + }, + { + "epoch": 4.024208566108007, + "grad_norm": 1.7769159078598022, + "learning_rate": 1.2578706908765453e-06, + "loss": 0.8811, + "step": 4322 + }, + { + "epoch": 4.0251396648044695, + "grad_norm": 1.894944429397583, + "learning_rate": 1.2567964479419982e-06, + "loss": 0.8724, + "step": 4323 + }, + { + "epoch": 4.026070763500931, + "grad_norm": 1.782280445098877, + "learning_rate": 1.2557225098781107e-06, + "loss": 0.9093, + "step": 4324 + }, + { + "epoch": 4.027001862197393, + "grad_norm": 1.7875630855560303, + "learning_rate": 1.2546488769482444e-06, + "loss": 0.8912, + "step": 4325 + }, + { + "epoch": 4.027932960893855, + "grad_norm": 1.7538729906082153, + "learning_rate": 1.253575549415686e-06, + "loss": 0.8449, + "step": 4326 + }, + { + "epoch": 4.028864059590316, + "grad_norm": 1.6861121654510498, + "learning_rate": 1.2525025275436476e-06, + "loss": 0.8503, + "step": 4327 + }, + { + "epoch": 4.029795158286778, + "grad_norm": 1.7636982202529907, + "learning_rate": 1.2514298115952666e-06, + "loss": 0.8826, + "step": 4328 + }, + { + "epoch": 4.0307262569832405, + "grad_norm": 1.7648496627807617, + "learning_rate": 1.2503574018336046e-06, + "loss": 0.8691, + "step": 4329 + }, + { + "epoch": 4.031657355679702, + "grad_norm": 1.7533055543899536, + "learning_rate": 1.2492852985216483e-06, + "loss": 0.8922, + "step": 4330 + }, + { + "epoch": 4.032588454376164, + "grad_norm": 1.7577660083770752, + "learning_rate": 1.2482135019223102e-06, + "loss": 0.8897, + "step": 4331 + }, + { + "epoch": 4.033519553072626, + "grad_norm": 1.7907830476760864, + "learning_rate": 1.247142012298426e-06, + "loss": 0.8936, + "step": 4332 + }, + { + "epoch": 4.034450651769087, + "grad_norm": 1.6967543363571167, + "learning_rate": 1.2460708299127578e-06, + "loss": 0.887, + "step": 4333 + }, + { + "epoch": 4.035381750465549, + "grad_norm": 1.8088047504425049, + "learning_rate": 1.2449999550279907e-06, + "loss": 0.9034, + "step": 4334 + }, + { + "epoch": 4.0363128491620115, + "grad_norm": 1.7461236715316772, + "learning_rate": 1.2439293879067357e-06, + "loss": 0.8826, + "step": 4335 + }, + { + "epoch": 4.037243947858473, + "grad_norm": 1.760558009147644, + "learning_rate": 1.242859128811528e-06, + "loss": 0.9035, + "step": 4336 + }, + { + "epoch": 4.038175046554935, + "grad_norm": 1.7760361433029175, + "learning_rate": 1.2417891780048269e-06, + "loss": 0.8723, + "step": 4337 + }, + { + "epoch": 4.039106145251397, + "grad_norm": 1.776084303855896, + "learning_rate": 1.2407195357490163e-06, + "loss": 0.8922, + "step": 4338 + }, + { + "epoch": 4.040037243947858, + "grad_norm": 1.7506184577941895, + "learning_rate": 1.2396502023064044e-06, + "loss": 0.8917, + "step": 4339 + }, + { + "epoch": 4.04096834264432, + "grad_norm": 1.7576179504394531, + "learning_rate": 1.2385811779392237e-06, + "loss": 0.9125, + "step": 4340 + }, + { + "epoch": 4.0418994413407825, + "grad_norm": 1.9415502548217773, + "learning_rate": 1.2375124629096311e-06, + "loss": 0.9047, + "step": 4341 + }, + { + "epoch": 4.042830540037244, + "grad_norm": 1.7783660888671875, + "learning_rate": 1.2364440574797071e-06, + "loss": 0.8724, + "step": 4342 + }, + { + "epoch": 4.043761638733706, + "grad_norm": 1.7987761497497559, + "learning_rate": 1.2353759619114572e-06, + "loss": 0.9143, + "step": 4343 + }, + { + "epoch": 4.044692737430168, + "grad_norm": 1.8013794422149658, + "learning_rate": 1.2343081764668096e-06, + "loss": 0.9041, + "step": 4344 + }, + { + "epoch": 4.045623836126629, + "grad_norm": 1.7497551441192627, + "learning_rate": 1.2332407014076181e-06, + "loss": 0.9112, + "step": 4345 + }, + { + "epoch": 4.046554934823091, + "grad_norm": 1.7631731033325195, + "learning_rate": 1.2321735369956597e-06, + "loss": 0.8718, + "step": 4346 + }, + { + "epoch": 4.0474860335195535, + "grad_norm": 1.8263648748397827, + "learning_rate": 1.2311066834926324e-06, + "loss": 0.8847, + "step": 4347 + }, + { + "epoch": 4.048417132216015, + "grad_norm": 1.700120449066162, + "learning_rate": 1.2300401411601637e-06, + "loss": 0.8509, + "step": 4348 + }, + { + "epoch": 4.049348230912477, + "grad_norm": 1.8069944381713867, + "learning_rate": 1.2289739102598e-06, + "loss": 0.8965, + "step": 4349 + }, + { + "epoch": 4.050279329608939, + "grad_norm": 1.7350928783416748, + "learning_rate": 1.2279079910530147e-06, + "loss": 0.8891, + "step": 4350 + }, + { + "epoch": 4.0512104283054, + "grad_norm": 1.883237600326538, + "learning_rate": 1.2268423838011997e-06, + "loss": 0.908, + "step": 4351 + }, + { + "epoch": 4.052141527001862, + "grad_norm": 1.7705849409103394, + "learning_rate": 1.2257770887656768e-06, + "loss": 0.8864, + "step": 4352 + }, + { + "epoch": 4.053072625698324, + "grad_norm": 1.744147539138794, + "learning_rate": 1.224712106207688e-06, + "loss": 0.8762, + "step": 4353 + }, + { + "epoch": 4.054003724394786, + "grad_norm": 1.824399471282959, + "learning_rate": 1.223647436388396e-06, + "loss": 0.9317, + "step": 4354 + }, + { + "epoch": 4.054934823091248, + "grad_norm": 1.7568637132644653, + "learning_rate": 1.2225830795688928e-06, + "loss": 0.8484, + "step": 4355 + }, + { + "epoch": 4.055865921787709, + "grad_norm": 1.7624163627624512, + "learning_rate": 1.2215190360101893e-06, + "loss": 0.865, + "step": 4356 + }, + { + "epoch": 4.056797020484171, + "grad_norm": 1.7121779918670654, + "learning_rate": 1.2204553059732216e-06, + "loss": 0.8644, + "step": 4357 + }, + { + "epoch": 4.057728119180633, + "grad_norm": 1.7766534090042114, + "learning_rate": 1.2193918897188456e-06, + "loss": 0.9041, + "step": 4358 + }, + { + "epoch": 4.058659217877095, + "grad_norm": 1.7455562353134155, + "learning_rate": 1.2183287875078454e-06, + "loss": 0.9232, + "step": 4359 + }, + { + "epoch": 4.059590316573557, + "grad_norm": 1.7383538484573364, + "learning_rate": 1.2172659996009254e-06, + "loss": 0.8824, + "step": 4360 + }, + { + "epoch": 4.060521415270019, + "grad_norm": 1.7370450496673584, + "learning_rate": 1.21620352625871e-06, + "loss": 0.9004, + "step": 4361 + }, + { + "epoch": 4.06145251396648, + "grad_norm": 1.6868106126785278, + "learning_rate": 1.215141367741753e-06, + "loss": 0.8649, + "step": 4362 + }, + { + "epoch": 4.062383612662942, + "grad_norm": 1.780739665031433, + "learning_rate": 1.2140795243105252e-06, + "loss": 0.8599, + "step": 4363 + }, + { + "epoch": 4.063314711359404, + "grad_norm": 1.7509766817092896, + "learning_rate": 1.213017996225424e-06, + "loss": 0.8334, + "step": 4364 + }, + { + "epoch": 4.064245810055866, + "grad_norm": 1.7510299682617188, + "learning_rate": 1.211956783746765e-06, + "loss": 0.8993, + "step": 4365 + }, + { + "epoch": 4.065176908752328, + "grad_norm": 1.795722484588623, + "learning_rate": 1.210895887134792e-06, + "loss": 0.8661, + "step": 4366 + }, + { + "epoch": 4.06610800744879, + "grad_norm": 1.753238558769226, + "learning_rate": 1.2098353066496686e-06, + "loss": 0.8881, + "step": 4367 + }, + { + "epoch": 4.067039106145251, + "grad_norm": 1.7655822038650513, + "learning_rate": 1.2087750425514789e-06, + "loss": 0.8578, + "step": 4368 + }, + { + "epoch": 4.067970204841713, + "grad_norm": 1.7908645868301392, + "learning_rate": 1.2077150951002308e-06, + "loss": 0.8703, + "step": 4369 + }, + { + "epoch": 4.068901303538175, + "grad_norm": 1.6975619792938232, + "learning_rate": 1.2066554645558578e-06, + "loss": 0.8286, + "step": 4370 + }, + { + "epoch": 4.069832402234637, + "grad_norm": 1.7243984937667847, + "learning_rate": 1.2055961511782126e-06, + "loss": 0.8707, + "step": 4371 + }, + { + "epoch": 4.070763500931099, + "grad_norm": 1.7782244682312012, + "learning_rate": 1.204537155227068e-06, + "loss": 0.8762, + "step": 4372 + }, + { + "epoch": 4.071694599627561, + "grad_norm": 1.7820193767547607, + "learning_rate": 1.2034784769621236e-06, + "loss": 0.8626, + "step": 4373 + }, + { + "epoch": 4.072625698324022, + "grad_norm": 1.8210536241531372, + "learning_rate": 1.2024201166429995e-06, + "loss": 0.9175, + "step": 4374 + }, + { + "epoch": 4.073556797020484, + "grad_norm": 1.8006234169006348, + "learning_rate": 1.2013620745292348e-06, + "loss": 0.9012, + "step": 4375 + }, + { + "epoch": 4.074487895716946, + "grad_norm": 1.7434529066085815, + "learning_rate": 1.2003043508802939e-06, + "loss": 0.8593, + "step": 4376 + }, + { + "epoch": 4.075418994413408, + "grad_norm": 1.784889578819275, + "learning_rate": 1.1992469459555635e-06, + "loss": 0.8591, + "step": 4377 + }, + { + "epoch": 4.07635009310987, + "grad_norm": 1.8174901008605957, + "learning_rate": 1.198189860014351e-06, + "loss": 0.9022, + "step": 4378 + }, + { + "epoch": 4.077281191806332, + "grad_norm": 1.868676781654358, + "learning_rate": 1.197133093315884e-06, + "loss": 0.8875, + "step": 4379 + }, + { + "epoch": 4.078212290502793, + "grad_norm": 1.837681531906128, + "learning_rate": 1.1960766461193125e-06, + "loss": 0.8702, + "step": 4380 + }, + { + "epoch": 4.079143389199255, + "grad_norm": 1.7518106698989868, + "learning_rate": 1.1950205186837124e-06, + "loss": 0.8797, + "step": 4381 + }, + { + "epoch": 4.080074487895717, + "grad_norm": 1.7747879028320312, + "learning_rate": 1.1939647112680744e-06, + "loss": 0.8676, + "step": 4382 + }, + { + "epoch": 4.081005586592179, + "grad_norm": 1.714235782623291, + "learning_rate": 1.1929092241313145e-06, + "loss": 0.8817, + "step": 4383 + }, + { + "epoch": 4.081936685288641, + "grad_norm": 1.782502293586731, + "learning_rate": 1.1918540575322724e-06, + "loss": 0.8755, + "step": 4384 + }, + { + "epoch": 4.082867783985103, + "grad_norm": 1.8014475107192993, + "learning_rate": 1.190799211729703e-06, + "loss": 0.8411, + "step": 4385 + }, + { + "epoch": 4.083798882681564, + "grad_norm": 1.7665168046951294, + "learning_rate": 1.189744686982288e-06, + "loss": 0.8872, + "step": 4386 + }, + { + "epoch": 4.084729981378026, + "grad_norm": 1.8126320838928223, + "learning_rate": 1.1886904835486269e-06, + "loss": 0.898, + "step": 4387 + }, + { + "epoch": 4.0856610800744875, + "grad_norm": 1.7646896839141846, + "learning_rate": 1.1876366016872446e-06, + "loss": 0.9312, + "step": 4388 + }, + { + "epoch": 4.08659217877095, + "grad_norm": 1.7641549110412598, + "learning_rate": 1.1865830416565816e-06, + "loss": 0.8716, + "step": 4389 + }, + { + "epoch": 4.087523277467412, + "grad_norm": 1.860168218612671, + "learning_rate": 1.1855298037150022e-06, + "loss": 0.9511, + "step": 4390 + }, + { + "epoch": 4.088454376163873, + "grad_norm": 1.7818304300308228, + "learning_rate": 1.1844768881207954e-06, + "loss": 0.8892, + "step": 4391 + }, + { + "epoch": 4.089385474860335, + "grad_norm": 1.7205288410186768, + "learning_rate": 1.183424295132164e-06, + "loss": 0.8502, + "step": 4392 + }, + { + "epoch": 4.090316573556797, + "grad_norm": 1.7936553955078125, + "learning_rate": 1.1823720250072366e-06, + "loss": 0.8489, + "step": 4393 + }, + { + "epoch": 4.0912476722532585, + "grad_norm": 1.813077449798584, + "learning_rate": 1.1813200780040598e-06, + "loss": 0.9098, + "step": 4394 + }, + { + "epoch": 4.092178770949721, + "grad_norm": 1.7216224670410156, + "learning_rate": 1.1802684543806059e-06, + "loss": 0.8657, + "step": 4395 + }, + { + "epoch": 4.093109869646183, + "grad_norm": 1.7604587078094482, + "learning_rate": 1.1792171543947612e-06, + "loss": 0.8857, + "step": 4396 + }, + { + "epoch": 4.094040968342644, + "grad_norm": 1.7869776487350464, + "learning_rate": 1.178166178304337e-06, + "loss": 0.907, + "step": 4397 + }, + { + "epoch": 4.094972067039106, + "grad_norm": 1.7652859687805176, + "learning_rate": 1.1771155263670635e-06, + "loss": 0.8434, + "step": 4398 + }, + { + "epoch": 4.095903165735568, + "grad_norm": 1.789361834526062, + "learning_rate": 1.176065198840593e-06, + "loss": 0.8807, + "step": 4399 + }, + { + "epoch": 4.0968342644320295, + "grad_norm": 1.6860939264297485, + "learning_rate": 1.1750151959824963e-06, + "loss": 0.8283, + "step": 4400 + }, + { + "epoch": 4.097765363128492, + "grad_norm": 1.7737427949905396, + "learning_rate": 1.1739655180502652e-06, + "loss": 0.8827, + "step": 4401 + }, + { + "epoch": 4.098696461824954, + "grad_norm": 1.8738794326782227, + "learning_rate": 1.172916165301314e-06, + "loss": 0.9142, + "step": 4402 + }, + { + "epoch": 4.099627560521415, + "grad_norm": 1.7536805868148804, + "learning_rate": 1.1718671379929736e-06, + "loss": 0.8916, + "step": 4403 + }, + { + "epoch": 4.100558659217877, + "grad_norm": 1.7699639797210693, + "learning_rate": 1.170818436382497e-06, + "loss": 0.8335, + "step": 4404 + }, + { + "epoch": 4.101489757914339, + "grad_norm": 1.7860591411590576, + "learning_rate": 1.1697700607270575e-06, + "loss": 0.9168, + "step": 4405 + }, + { + "epoch": 4.1024208566108005, + "grad_norm": 1.7285155057907104, + "learning_rate": 1.1687220112837482e-06, + "loss": 0.8698, + "step": 4406 + }, + { + "epoch": 4.103351955307263, + "grad_norm": 1.8007099628448486, + "learning_rate": 1.167674288309582e-06, + "loss": 0.8978, + "step": 4407 + }, + { + "epoch": 4.104283054003725, + "grad_norm": 1.7533308267593384, + "learning_rate": 1.166626892061492e-06, + "loss": 0.8354, + "step": 4408 + }, + { + "epoch": 4.105214152700186, + "grad_norm": 1.7615728378295898, + "learning_rate": 1.1655798227963308e-06, + "loss": 0.8351, + "step": 4409 + }, + { + "epoch": 4.106145251396648, + "grad_norm": 1.792166829109192, + "learning_rate": 1.1645330807708713e-06, + "loss": 0.867, + "step": 4410 + }, + { + "epoch": 4.10707635009311, + "grad_norm": 1.8389972448349, + "learning_rate": 1.163486666241806e-06, + "loss": 0.8976, + "step": 4411 + }, + { + "epoch": 4.1080074487895715, + "grad_norm": 1.7422645092010498, + "learning_rate": 1.1624405794657468e-06, + "loss": 0.8438, + "step": 4412 + }, + { + "epoch": 4.108938547486034, + "grad_norm": 1.8288774490356445, + "learning_rate": 1.1613948206992253e-06, + "loss": 0.9034, + "step": 4413 + }, + { + "epoch": 4.109869646182496, + "grad_norm": 1.7706986665725708, + "learning_rate": 1.1603493901986931e-06, + "loss": 0.8709, + "step": 4414 + }, + { + "epoch": 4.110800744878957, + "grad_norm": 1.8355355262756348, + "learning_rate": 1.159304288220521e-06, + "loss": 0.8909, + "step": 4415 + }, + { + "epoch": 4.111731843575419, + "grad_norm": 1.770878791809082, + "learning_rate": 1.158259515020999e-06, + "loss": 0.8835, + "step": 4416 + }, + { + "epoch": 4.112662942271881, + "grad_norm": 1.794213891029358, + "learning_rate": 1.1572150708563371e-06, + "loss": 0.8955, + "step": 4417 + }, + { + "epoch": 4.1135940409683425, + "grad_norm": 1.798050880432129, + "learning_rate": 1.1561709559826637e-06, + "loss": 0.8742, + "step": 4418 + }, + { + "epoch": 4.114525139664805, + "grad_norm": 1.800785779953003, + "learning_rate": 1.1551271706560275e-06, + "loss": 0.8796, + "step": 4419 + }, + { + "epoch": 4.115456238361267, + "grad_norm": 1.7599496841430664, + "learning_rate": 1.1540837151323953e-06, + "loss": 0.8285, + "step": 4420 + }, + { + "epoch": 4.116387337057728, + "grad_norm": 1.7706929445266724, + "learning_rate": 1.1530405896676538e-06, + "loss": 0.8249, + "step": 4421 + }, + { + "epoch": 4.11731843575419, + "grad_norm": 1.891851544380188, + "learning_rate": 1.151997794517609e-06, + "loss": 0.8458, + "step": 4422 + }, + { + "epoch": 4.118249534450651, + "grad_norm": 1.8159939050674438, + "learning_rate": 1.1509553299379847e-06, + "loss": 0.8948, + "step": 4423 + }, + { + "epoch": 4.1191806331471135, + "grad_norm": 1.8008509874343872, + "learning_rate": 1.149913196184425e-06, + "loss": 0.8781, + "step": 4424 + }, + { + "epoch": 4.120111731843576, + "grad_norm": 1.7960692644119263, + "learning_rate": 1.1488713935124916e-06, + "loss": 0.8916, + "step": 4425 + }, + { + "epoch": 4.121042830540037, + "grad_norm": 1.7519283294677734, + "learning_rate": 1.147829922177666e-06, + "loss": 0.8765, + "step": 4426 + }, + { + "epoch": 4.121973929236499, + "grad_norm": 1.7753019332885742, + "learning_rate": 1.146788782435348e-06, + "loss": 0.8853, + "step": 4427 + }, + { + "epoch": 4.122905027932961, + "grad_norm": 1.781790018081665, + "learning_rate": 1.1457479745408562e-06, + "loss": 0.8885, + "step": 4428 + }, + { + "epoch": 4.123836126629422, + "grad_norm": 1.769366979598999, + "learning_rate": 1.144707498749428e-06, + "loss": 0.8683, + "step": 4429 + }, + { + "epoch": 4.1247672253258845, + "grad_norm": 1.7504181861877441, + "learning_rate": 1.143667355316219e-06, + "loss": 0.9049, + "step": 4430 + }, + { + "epoch": 4.125698324022347, + "grad_norm": 1.852437973022461, + "learning_rate": 1.1426275444963033e-06, + "loss": 0.8724, + "step": 4431 + }, + { + "epoch": 4.126629422718808, + "grad_norm": 1.8044884204864502, + "learning_rate": 1.141588066544674e-06, + "loss": 0.8662, + "step": 4432 + }, + { + "epoch": 4.12756052141527, + "grad_norm": 1.8438327312469482, + "learning_rate": 1.1405489217162416e-06, + "loss": 0.8773, + "step": 4433 + }, + { + "epoch": 4.128491620111732, + "grad_norm": 1.8161712884902954, + "learning_rate": 1.1395101102658359e-06, + "loss": 0.9022, + "step": 4434 + }, + { + "epoch": 4.129422718808193, + "grad_norm": 1.7462180852890015, + "learning_rate": 1.1384716324482043e-06, + "loss": 0.8334, + "step": 4435 + }, + { + "epoch": 4.1303538175046555, + "grad_norm": 1.7567801475524902, + "learning_rate": 1.1374334885180136e-06, + "loss": 0.8438, + "step": 4436 + }, + { + "epoch": 4.131284916201118, + "grad_norm": 1.7532851696014404, + "learning_rate": 1.1363956787298447e-06, + "loss": 0.8986, + "step": 4437 + }, + { + "epoch": 4.132216014897579, + "grad_norm": 1.7308322191238403, + "learning_rate": 1.1353582033382027e-06, + "loss": 0.8546, + "step": 4438 + }, + { + "epoch": 4.133147113594041, + "grad_norm": 1.807786226272583, + "learning_rate": 1.1343210625975066e-06, + "loss": 0.8618, + "step": 4439 + }, + { + "epoch": 4.134078212290503, + "grad_norm": 1.7453243732452393, + "learning_rate": 1.133284256762094e-06, + "loss": 0.8765, + "step": 4440 + }, + { + "epoch": 4.135009310986964, + "grad_norm": 1.764945149421692, + "learning_rate": 1.132247786086221e-06, + "loss": 0.9008, + "step": 4441 + }, + { + "epoch": 4.1359404096834265, + "grad_norm": 1.7618006467819214, + "learning_rate": 1.1312116508240612e-06, + "loss": 0.8858, + "step": 4442 + }, + { + "epoch": 4.136871508379889, + "grad_norm": 1.7674936056137085, + "learning_rate": 1.1301758512297064e-06, + "loss": 0.87, + "step": 4443 + }, + { + "epoch": 4.13780260707635, + "grad_norm": 1.7640964984893799, + "learning_rate": 1.1291403875571632e-06, + "loss": 0.8616, + "step": 4444 + }, + { + "epoch": 4.138733705772812, + "grad_norm": 1.7371081113815308, + "learning_rate": 1.128105260060361e-06, + "loss": 0.8624, + "step": 4445 + }, + { + "epoch": 4.139664804469274, + "grad_norm": 1.8491816520690918, + "learning_rate": 1.1270704689931438e-06, + "loss": 0.8944, + "step": 4446 + }, + { + "epoch": 4.140595903165735, + "grad_norm": 1.830702543258667, + "learning_rate": 1.1260360146092709e-06, + "loss": 0.897, + "step": 4447 + }, + { + "epoch": 4.1415270018621975, + "grad_norm": 1.793031930923462, + "learning_rate": 1.1250018971624235e-06, + "loss": 0.922, + "step": 4448 + }, + { + "epoch": 4.14245810055866, + "grad_norm": 1.7762935161590576, + "learning_rate": 1.1239681169061981e-06, + "loss": 0.8623, + "step": 4449 + }, + { + "epoch": 4.143389199255121, + "grad_norm": 1.749521017074585, + "learning_rate": 1.1229346740941088e-06, + "loss": 0.8649, + "step": 4450 + }, + { + "epoch": 4.144320297951583, + "grad_norm": 1.8171783685684204, + "learning_rate": 1.121901568979584e-06, + "loss": 0.8926, + "step": 4451 + }, + { + "epoch": 4.145251396648045, + "grad_norm": 1.7553768157958984, + "learning_rate": 1.1208688018159747e-06, + "loss": 0.8577, + "step": 4452 + }, + { + "epoch": 4.146182495344506, + "grad_norm": 1.8062329292297363, + "learning_rate": 1.1198363728565465e-06, + "loss": 0.8652, + "step": 4453 + }, + { + "epoch": 4.1471135940409685, + "grad_norm": 1.7485045194625854, + "learning_rate": 1.1188042823544797e-06, + "loss": 0.8768, + "step": 4454 + }, + { + "epoch": 4.148044692737431, + "grad_norm": 1.717211127281189, + "learning_rate": 1.117772530562874e-06, + "loss": 0.8793, + "step": 4455 + }, + { + "epoch": 4.148975791433892, + "grad_norm": 1.789741039276123, + "learning_rate": 1.1167411177347473e-06, + "loss": 0.8557, + "step": 4456 + }, + { + "epoch": 4.149906890130354, + "grad_norm": 1.8083410263061523, + "learning_rate": 1.1157100441230328e-06, + "loss": 0.8924, + "step": 4457 + }, + { + "epoch": 4.150837988826815, + "grad_norm": 1.8228853940963745, + "learning_rate": 1.1146793099805784e-06, + "loss": 0.8975, + "step": 4458 + }, + { + "epoch": 4.151769087523277, + "grad_norm": 1.8562049865722656, + "learning_rate": 1.1136489155601532e-06, + "loss": 0.9054, + "step": 4459 + }, + { + "epoch": 4.1527001862197395, + "grad_norm": 1.7762516736984253, + "learning_rate": 1.1126188611144406e-06, + "loss": 0.8875, + "step": 4460 + }, + { + "epoch": 4.153631284916201, + "grad_norm": 1.7911391258239746, + "learning_rate": 1.111589146896039e-06, + "loss": 0.8836, + "step": 4461 + }, + { + "epoch": 4.154562383612663, + "grad_norm": 1.7966173887252808, + "learning_rate": 1.1105597731574654e-06, + "loss": 0.8872, + "step": 4462 + }, + { + "epoch": 4.155493482309125, + "grad_norm": 1.8051522970199585, + "learning_rate": 1.1095307401511546e-06, + "loss": 0.8534, + "step": 4463 + }, + { + "epoch": 4.156424581005586, + "grad_norm": 1.7415368556976318, + "learning_rate": 1.1085020481294561e-06, + "loss": 0.876, + "step": 4464 + }, + { + "epoch": 4.157355679702048, + "grad_norm": 1.7819180488586426, + "learning_rate": 1.1074736973446344e-06, + "loss": 0.923, + "step": 4465 + }, + { + "epoch": 4.1582867783985105, + "grad_norm": 1.7169443368911743, + "learning_rate": 1.1064456880488713e-06, + "loss": 0.8318, + "step": 4466 + }, + { + "epoch": 4.159217877094972, + "grad_norm": 1.7604836225509644, + "learning_rate": 1.105418020494269e-06, + "loss": 0.8727, + "step": 4467 + }, + { + "epoch": 4.160148975791434, + "grad_norm": 1.8176007270812988, + "learning_rate": 1.1043906949328387e-06, + "loss": 0.8506, + "step": 4468 + }, + { + "epoch": 4.161080074487896, + "grad_norm": 1.820765495300293, + "learning_rate": 1.103363711616512e-06, + "loss": 0.8855, + "step": 4469 + }, + { + "epoch": 4.162011173184357, + "grad_norm": 1.8011748790740967, + "learning_rate": 1.102337070797137e-06, + "loss": 0.9134, + "step": 4470 + }, + { + "epoch": 4.162942271880819, + "grad_norm": 1.7202539443969727, + "learning_rate": 1.1013107727264773e-06, + "loss": 0.818, + "step": 4471 + }, + { + "epoch": 4.1638733705772815, + "grad_norm": 1.8643807172775269, + "learning_rate": 1.10028481765621e-06, + "loss": 0.8795, + "step": 4472 + }, + { + "epoch": 4.164804469273743, + "grad_norm": 1.759416103363037, + "learning_rate": 1.0992592058379298e-06, + "loss": 0.8522, + "step": 4473 + }, + { + "epoch": 4.165735567970205, + "grad_norm": 1.8463172912597656, + "learning_rate": 1.0982339375231499e-06, + "loss": 0.879, + "step": 4474 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 1.8800420761108398, + "learning_rate": 1.0972090129632943e-06, + "loss": 0.9045, + "step": 4475 + }, + { + "epoch": 4.167597765363128, + "grad_norm": 1.746372103691101, + "learning_rate": 1.0961844324097057e-06, + "loss": 0.8455, + "step": 4476 + }, + { + "epoch": 4.16852886405959, + "grad_norm": 1.81632661819458, + "learning_rate": 1.0951601961136413e-06, + "loss": 0.8888, + "step": 4477 + }, + { + "epoch": 4.1694599627560525, + "grad_norm": 1.8126124143600464, + "learning_rate": 1.0941363043262771e-06, + "loss": 0.8775, + "step": 4478 + }, + { + "epoch": 4.170391061452514, + "grad_norm": 1.7821956872940063, + "learning_rate": 1.0931127572986991e-06, + "loss": 0.8843, + "step": 4479 + }, + { + "epoch": 4.171322160148976, + "grad_norm": 1.8142513036727905, + "learning_rate": 1.0920895552819118e-06, + "loss": 0.8867, + "step": 4480 + }, + { + "epoch": 4.172253258845438, + "grad_norm": 1.8370848894119263, + "learning_rate": 1.0910666985268375e-06, + "loss": 0.8861, + "step": 4481 + }, + { + "epoch": 4.173184357541899, + "grad_norm": 1.7372386455535889, + "learning_rate": 1.0900441872843083e-06, + "loss": 0.8624, + "step": 4482 + }, + { + "epoch": 4.174115456238361, + "grad_norm": 1.8046278953552246, + "learning_rate": 1.0890220218050762e-06, + "loss": 0.871, + "step": 4483 + }, + { + "epoch": 4.1750465549348235, + "grad_norm": 1.831581473350525, + "learning_rate": 1.088000202339806e-06, + "loss": 0.8612, + "step": 4484 + }, + { + "epoch": 4.175977653631285, + "grad_norm": 1.7874442338943481, + "learning_rate": 1.086978729139078e-06, + "loss": 0.8773, + "step": 4485 + }, + { + "epoch": 4.176908752327747, + "grad_norm": 1.8056275844573975, + "learning_rate": 1.0859576024533892e-06, + "loss": 0.9044, + "step": 4486 + }, + { + "epoch": 4.177839851024209, + "grad_norm": 1.694891095161438, + "learning_rate": 1.0849368225331485e-06, + "loss": 0.8497, + "step": 4487 + }, + { + "epoch": 4.17877094972067, + "grad_norm": 1.821637511253357, + "learning_rate": 1.0839163896286848e-06, + "loss": 0.9041, + "step": 4488 + }, + { + "epoch": 4.179702048417132, + "grad_norm": 1.766053318977356, + "learning_rate": 1.0828963039902358e-06, + "loss": 0.8854, + "step": 4489 + }, + { + "epoch": 4.1806331471135945, + "grad_norm": 1.8046610355377197, + "learning_rate": 1.0818765658679576e-06, + "loss": 0.8748, + "step": 4490 + }, + { + "epoch": 4.181564245810056, + "grad_norm": 1.7503914833068848, + "learning_rate": 1.080857175511921e-06, + "loss": 0.8753, + "step": 4491 + }, + { + "epoch": 4.182495344506518, + "grad_norm": 1.7624198198318481, + "learning_rate": 1.079838133172111e-06, + "loss": 0.8527, + "step": 4492 + }, + { + "epoch": 4.183426443202979, + "grad_norm": 1.8143476247787476, + "learning_rate": 1.0788194390984265e-06, + "loss": 0.8599, + "step": 4493 + }, + { + "epoch": 4.184357541899441, + "grad_norm": 1.7354788780212402, + "learning_rate": 1.0778010935406826e-06, + "loss": 0.864, + "step": 4494 + }, + { + "epoch": 4.185288640595903, + "grad_norm": 1.7747478485107422, + "learning_rate": 1.0767830967486078e-06, + "loss": 0.8545, + "step": 4495 + }, + { + "epoch": 4.186219739292365, + "grad_norm": 1.764509916305542, + "learning_rate": 1.075765448971845e-06, + "loss": 0.8533, + "step": 4496 + }, + { + "epoch": 4.187150837988827, + "grad_norm": 1.7566726207733154, + "learning_rate": 1.0747481504599523e-06, + "loss": 0.836, + "step": 4497 + }, + { + "epoch": 4.188081936685289, + "grad_norm": 1.8027533292770386, + "learning_rate": 1.0737312014624012e-06, + "loss": 0.864, + "step": 4498 + }, + { + "epoch": 4.18901303538175, + "grad_norm": 1.793636441230774, + "learning_rate": 1.0727146022285783e-06, + "loss": 0.8959, + "step": 4499 + }, + { + "epoch": 4.189944134078212, + "grad_norm": 1.8309879302978516, + "learning_rate": 1.0716983530077843e-06, + "loss": 0.8964, + "step": 4500 + }, + { + "epoch": 4.190875232774674, + "grad_norm": 1.8565000295639038, + "learning_rate": 1.0706824540492332e-06, + "loss": 0.8889, + "step": 4501 + }, + { + "epoch": 4.191806331471136, + "grad_norm": 1.8123760223388672, + "learning_rate": 1.0696669056020545e-06, + "loss": 0.8996, + "step": 4502 + }, + { + "epoch": 4.192737430167598, + "grad_norm": 1.8181185722351074, + "learning_rate": 1.0686517079152908e-06, + "loss": 0.9241, + "step": 4503 + }, + { + "epoch": 4.19366852886406, + "grad_norm": 1.7272616624832153, + "learning_rate": 1.0676368612378987e-06, + "loss": 0.8581, + "step": 4504 + }, + { + "epoch": 4.194599627560521, + "grad_norm": 1.8613497018814087, + "learning_rate": 1.066622365818749e-06, + "loss": 0.9081, + "step": 4505 + }, + { + "epoch": 4.195530726256983, + "grad_norm": 1.8543237447738647, + "learning_rate": 1.0656082219066267e-06, + "loss": 0.8972, + "step": 4506 + }, + { + "epoch": 4.196461824953445, + "grad_norm": 1.788077712059021, + "learning_rate": 1.0645944297502295e-06, + "loss": 0.8789, + "step": 4507 + }, + { + "epoch": 4.197392923649907, + "grad_norm": 1.7470952272415161, + "learning_rate": 1.06358098959817e-06, + "loss": 0.8744, + "step": 4508 + }, + { + "epoch": 4.198324022346369, + "grad_norm": 1.7785993814468384, + "learning_rate": 1.062567901698974e-06, + "loss": 0.8542, + "step": 4509 + }, + { + "epoch": 4.199255121042831, + "grad_norm": 1.799004077911377, + "learning_rate": 1.0615551663010805e-06, + "loss": 0.8659, + "step": 4510 + }, + { + "epoch": 4.200186219739292, + "grad_norm": 1.7550920248031616, + "learning_rate": 1.0605427836528432e-06, + "loss": 0.905, + "step": 4511 + }, + { + "epoch": 4.201117318435754, + "grad_norm": 1.8334801197052002, + "learning_rate": 1.059530754002528e-06, + "loss": 0.9007, + "step": 4512 + }, + { + "epoch": 4.202048417132216, + "grad_norm": 1.8100594282150269, + "learning_rate": 1.058519077598315e-06, + "loss": 0.9053, + "step": 4513 + }, + { + "epoch": 4.202979515828678, + "grad_norm": 1.7733691930770874, + "learning_rate": 1.0575077546882975e-06, + "loss": 0.8543, + "step": 4514 + }, + { + "epoch": 4.20391061452514, + "grad_norm": 1.9022310972213745, + "learning_rate": 1.056496785520482e-06, + "loss": 0.9017, + "step": 4515 + }, + { + "epoch": 4.204841713221602, + "grad_norm": 1.828141212463379, + "learning_rate": 1.0554861703427884e-06, + "loss": 0.8566, + "step": 4516 + }, + { + "epoch": 4.205772811918063, + "grad_norm": 1.8148683309555054, + "learning_rate": 1.0544759094030498e-06, + "loss": 0.8902, + "step": 4517 + }, + { + "epoch": 4.206703910614525, + "grad_norm": 1.7541577816009521, + "learning_rate": 1.0534660029490124e-06, + "loss": 0.8622, + "step": 4518 + }, + { + "epoch": 4.207635009310987, + "grad_norm": 1.7714163064956665, + "learning_rate": 1.0524564512283353e-06, + "loss": 0.8576, + "step": 4519 + }, + { + "epoch": 4.208566108007449, + "grad_norm": 1.7779289484024048, + "learning_rate": 1.051447254488591e-06, + "loss": 0.9081, + "step": 4520 + }, + { + "epoch": 4.209497206703911, + "grad_norm": 1.7867318391799927, + "learning_rate": 1.0504384129772644e-06, + "loss": 0.9155, + "step": 4521 + }, + { + "epoch": 4.210428305400373, + "grad_norm": 1.7814818620681763, + "learning_rate": 1.0494299269417543e-06, + "loss": 0.8847, + "step": 4522 + }, + { + "epoch": 4.211359404096834, + "grad_norm": 1.7456698417663574, + "learning_rate": 1.0484217966293695e-06, + "loss": 0.8758, + "step": 4523 + }, + { + "epoch": 4.212290502793296, + "grad_norm": 1.8181489706039429, + "learning_rate": 1.047414022287336e-06, + "loss": 0.895, + "step": 4524 + }, + { + "epoch": 4.213221601489758, + "grad_norm": 1.8468031883239746, + "learning_rate": 1.0464066041627896e-06, + "loss": 0.8984, + "step": 4525 + }, + { + "epoch": 4.21415270018622, + "grad_norm": 1.75785493850708, + "learning_rate": 1.0453995425027789e-06, + "loss": 0.8693, + "step": 4526 + }, + { + "epoch": 4.215083798882682, + "grad_norm": 1.8183516263961792, + "learning_rate": 1.0443928375542656e-06, + "loss": 0.9029, + "step": 4527 + }, + { + "epoch": 4.216014897579143, + "grad_norm": 1.8567568063735962, + "learning_rate": 1.043386489564124e-06, + "loss": 0.8997, + "step": 4528 + }, + { + "epoch": 4.216945996275605, + "grad_norm": 1.7914568185806274, + "learning_rate": 1.0423804987791417e-06, + "loss": 0.8906, + "step": 4529 + }, + { + "epoch": 4.217877094972067, + "grad_norm": 1.7404948472976685, + "learning_rate": 1.0413748654460148e-06, + "loss": 0.8689, + "step": 4530 + }, + { + "epoch": 4.218808193668528, + "grad_norm": 1.7938181161880493, + "learning_rate": 1.0403695898113571e-06, + "loss": 0.9176, + "step": 4531 + }, + { + "epoch": 4.219739292364991, + "grad_norm": 1.7656139135360718, + "learning_rate": 1.039364672121692e-06, + "loss": 0.9035, + "step": 4532 + }, + { + "epoch": 4.220670391061453, + "grad_norm": 1.7751072645187378, + "learning_rate": 1.0383601126234557e-06, + "loss": 0.8436, + "step": 4533 + }, + { + "epoch": 4.221601489757914, + "grad_norm": 1.8128849267959595, + "learning_rate": 1.0373559115629939e-06, + "loss": 0.8713, + "step": 4534 + }, + { + "epoch": 4.222532588454376, + "grad_norm": 1.74265456199646, + "learning_rate": 1.036352069186569e-06, + "loss": 0.9107, + "step": 4535 + }, + { + "epoch": 4.223463687150838, + "grad_norm": 1.7891944646835327, + "learning_rate": 1.035348585740353e-06, + "loss": 0.9158, + "step": 4536 + }, + { + "epoch": 4.224394785847299, + "grad_norm": 1.761500358581543, + "learning_rate": 1.034345461470428e-06, + "loss": 0.8456, + "step": 4537 + }, + { + "epoch": 4.225325884543762, + "grad_norm": 1.810757040977478, + "learning_rate": 1.0333426966227922e-06, + "loss": 0.9021, + "step": 4538 + }, + { + "epoch": 4.226256983240224, + "grad_norm": 1.7782764434814453, + "learning_rate": 1.0323402914433527e-06, + "loss": 0.8733, + "step": 4539 + }, + { + "epoch": 4.227188081936685, + "grad_norm": 1.7242146730422974, + "learning_rate": 1.0313382461779305e-06, + "loss": 0.8661, + "step": 4540 + }, + { + "epoch": 4.228119180633147, + "grad_norm": 1.796790599822998, + "learning_rate": 1.0303365610722537e-06, + "loss": 0.8934, + "step": 4541 + }, + { + "epoch": 4.229050279329609, + "grad_norm": 1.8492344617843628, + "learning_rate": 1.0293352363719688e-06, + "loss": 0.8663, + "step": 4542 + }, + { + "epoch": 4.22998137802607, + "grad_norm": 1.7374297380447388, + "learning_rate": 1.02833427232263e-06, + "loss": 0.8341, + "step": 4543 + }, + { + "epoch": 4.230912476722533, + "grad_norm": 1.8108088970184326, + "learning_rate": 1.027333669169701e-06, + "loss": 0.8983, + "step": 4544 + }, + { + "epoch": 4.231843575418995, + "grad_norm": 1.8245278596878052, + "learning_rate": 1.0263334271585625e-06, + "loss": 0.8807, + "step": 4545 + }, + { + "epoch": 4.232774674115456, + "grad_norm": 1.9147135019302368, + "learning_rate": 1.0253335465345037e-06, + "loss": 0.8545, + "step": 4546 + }, + { + "epoch": 4.233705772811918, + "grad_norm": 1.8330035209655762, + "learning_rate": 1.0243340275427232e-06, + "loss": 0.9075, + "step": 4547 + }, + { + "epoch": 4.23463687150838, + "grad_norm": 1.7763220071792603, + "learning_rate": 1.0233348704283332e-06, + "loss": 0.8934, + "step": 4548 + }, + { + "epoch": 4.235567970204841, + "grad_norm": 1.8318358659744263, + "learning_rate": 1.0223360754363584e-06, + "loss": 0.9068, + "step": 4549 + }, + { + "epoch": 4.236499068901304, + "grad_norm": 1.795265793800354, + "learning_rate": 1.0213376428117333e-06, + "loss": 0.8688, + "step": 4550 + }, + { + "epoch": 4.237430167597766, + "grad_norm": 1.7980766296386719, + "learning_rate": 1.0203395727993016e-06, + "loss": 0.9042, + "step": 4551 + }, + { + "epoch": 4.238361266294227, + "grad_norm": 1.7689661979675293, + "learning_rate": 1.01934186564382e-06, + "loss": 0.8557, + "step": 4552 + }, + { + "epoch": 4.239292364990689, + "grad_norm": 1.7639997005462646, + "learning_rate": 1.0183445215899585e-06, + "loss": 0.881, + "step": 4553 + }, + { + "epoch": 4.240223463687151, + "grad_norm": 1.8100800514221191, + "learning_rate": 1.0173475408822933e-06, + "loss": 0.8674, + "step": 4554 + }, + { + "epoch": 4.241154562383612, + "grad_norm": 1.81223464012146, + "learning_rate": 1.0163509237653138e-06, + "loss": 0.8884, + "step": 4555 + }, + { + "epoch": 4.242085661080075, + "grad_norm": 1.7740631103515625, + "learning_rate": 1.015354670483422e-06, + "loss": 0.868, + "step": 4556 + }, + { + "epoch": 4.243016759776537, + "grad_norm": 1.7722138166427612, + "learning_rate": 1.014358781280929e-06, + "loss": 0.842, + "step": 4557 + }, + { + "epoch": 4.243947858472998, + "grad_norm": 1.7707154750823975, + "learning_rate": 1.0133632564020546e-06, + "loss": 0.8864, + "step": 4558 + }, + { + "epoch": 4.24487895716946, + "grad_norm": 1.7235485315322876, + "learning_rate": 1.0123680960909319e-06, + "loss": 0.86, + "step": 4559 + }, + { + "epoch": 4.245810055865922, + "grad_norm": 1.812963843345642, + "learning_rate": 1.0113733005916058e-06, + "loss": 0.9149, + "step": 4560 + }, + { + "epoch": 4.246741154562383, + "grad_norm": 1.751678466796875, + "learning_rate": 1.0103788701480278e-06, + "loss": 0.8665, + "step": 4561 + }, + { + "epoch": 4.247672253258846, + "grad_norm": 1.964104175567627, + "learning_rate": 1.0093848050040622e-06, + "loss": 0.9244, + "step": 4562 + }, + { + "epoch": 4.248603351955307, + "grad_norm": 1.7948815822601318, + "learning_rate": 1.0083911054034835e-06, + "loss": 0.849, + "step": 4563 + }, + { + "epoch": 4.249534450651769, + "grad_norm": 1.7437764406204224, + "learning_rate": 1.0073977715899785e-06, + "loss": 0.884, + "step": 4564 + }, + { + "epoch": 4.250465549348231, + "grad_norm": 1.8104383945465088, + "learning_rate": 1.00640480380714e-06, + "loss": 0.8926, + "step": 4565 + }, + { + "epoch": 4.251396648044693, + "grad_norm": 1.8229525089263916, + "learning_rate": 1.005412202298473e-06, + "loss": 0.8779, + "step": 4566 + }, + { + "epoch": 4.252327746741154, + "grad_norm": 1.781477689743042, + "learning_rate": 1.0044199673073963e-06, + "loss": 0.8833, + "step": 4567 + }, + { + "epoch": 4.253258845437617, + "grad_norm": 1.757774829864502, + "learning_rate": 1.0034280990772324e-06, + "loss": 0.8035, + "step": 4568 + }, + { + "epoch": 4.254189944134078, + "grad_norm": 2.0378730297088623, + "learning_rate": 1.0024365978512181e-06, + "loss": 0.8938, + "step": 4569 + }, + { + "epoch": 4.25512104283054, + "grad_norm": 1.7808213233947754, + "learning_rate": 1.0014454638724983e-06, + "loss": 0.8856, + "step": 4570 + }, + { + "epoch": 4.256052141527002, + "grad_norm": 1.7794196605682373, + "learning_rate": 1.0004546973841314e-06, + "loss": 0.8698, + "step": 4571 + }, + { + "epoch": 4.256983240223463, + "grad_norm": 1.792229413986206, + "learning_rate": 9.994642986290797e-07, + "loss": 0.8733, + "step": 4572 + }, + { + "epoch": 4.257914338919925, + "grad_norm": 1.8139854669570923, + "learning_rate": 9.984742678502197e-07, + "loss": 0.8755, + "step": 4573 + }, + { + "epoch": 4.258845437616388, + "grad_norm": 1.7683613300323486, + "learning_rate": 9.97484605290338e-07, + "loss": 0.8501, + "step": 4574 + }, + { + "epoch": 4.259776536312849, + "grad_norm": 1.7386735677719116, + "learning_rate": 9.964953111921273e-07, + "loss": 0.8367, + "step": 4575 + }, + { + "epoch": 4.260707635009311, + "grad_norm": 1.790756106376648, + "learning_rate": 9.95506385798193e-07, + "loss": 0.8818, + "step": 4576 + }, + { + "epoch": 4.261638733705773, + "grad_norm": 1.8507716655731201, + "learning_rate": 9.945178293510485e-07, + "loss": 0.9153, + "step": 4577 + }, + { + "epoch": 4.262569832402234, + "grad_norm": 1.7573734521865845, + "learning_rate": 9.935296420931195e-07, + "loss": 0.8464, + "step": 4578 + }, + { + "epoch": 4.263500931098696, + "grad_norm": 1.8272173404693604, + "learning_rate": 9.925418242667367e-07, + "loss": 0.8963, + "step": 4579 + }, + { + "epoch": 4.264432029795159, + "grad_norm": 1.7940114736557007, + "learning_rate": 9.915543761141432e-07, + "loss": 0.8559, + "step": 4580 + }, + { + "epoch": 4.26536312849162, + "grad_norm": 1.7538626194000244, + "learning_rate": 9.905672978774913e-07, + "loss": 0.8908, + "step": 4581 + }, + { + "epoch": 4.266294227188082, + "grad_norm": 1.7882611751556396, + "learning_rate": 9.895805897988417e-07, + "loss": 0.9024, + "step": 4582 + }, + { + "epoch": 4.267225325884544, + "grad_norm": 1.8068485260009766, + "learning_rate": 9.885942521201644e-07, + "loss": 0.9008, + "step": 4583 + }, + { + "epoch": 4.268156424581005, + "grad_norm": 1.7912070751190186, + "learning_rate": 9.876082850833395e-07, + "loss": 0.8992, + "step": 4584 + }, + { + "epoch": 4.269087523277467, + "grad_norm": 1.831703543663025, + "learning_rate": 9.866226889301552e-07, + "loss": 0.9056, + "step": 4585 + }, + { + "epoch": 4.27001862197393, + "grad_norm": 1.8025038242340088, + "learning_rate": 9.856374639023095e-07, + "loss": 0.8738, + "step": 4586 + }, + { + "epoch": 4.270949720670391, + "grad_norm": 1.7474888563156128, + "learning_rate": 9.846526102414083e-07, + "loss": 0.8479, + "step": 4587 + }, + { + "epoch": 4.271880819366853, + "grad_norm": 1.704208254814148, + "learning_rate": 9.83668128188968e-07, + "loss": 0.8298, + "step": 4588 + }, + { + "epoch": 4.272811918063315, + "grad_norm": 1.775575041770935, + "learning_rate": 9.826840179864125e-07, + "loss": 0.8702, + "step": 4589 + }, + { + "epoch": 4.273743016759776, + "grad_norm": 1.7267366647720337, + "learning_rate": 9.81700279875075e-07, + "loss": 0.8392, + "step": 4590 + }, + { + "epoch": 4.274674115456238, + "grad_norm": 1.7947123050689697, + "learning_rate": 9.807169140961979e-07, + "loss": 0.8426, + "step": 4591 + }, + { + "epoch": 4.275605214152701, + "grad_norm": 1.7853509187698364, + "learning_rate": 9.797339208909312e-07, + "loss": 0.8981, + "step": 4592 + }, + { + "epoch": 4.276536312849162, + "grad_norm": 1.847640872001648, + "learning_rate": 9.78751300500335e-07, + "loss": 0.9105, + "step": 4593 + }, + { + "epoch": 4.277467411545624, + "grad_norm": 1.7890129089355469, + "learning_rate": 9.777690531653763e-07, + "loss": 0.8567, + "step": 4594 + }, + { + "epoch": 4.278398510242086, + "grad_norm": 1.77309250831604, + "learning_rate": 9.767871791269324e-07, + "loss": 0.8689, + "step": 4595 + }, + { + "epoch": 4.279329608938547, + "grad_norm": 1.8659417629241943, + "learning_rate": 9.758056786257874e-07, + "loss": 0.9131, + "step": 4596 + }, + { + "epoch": 4.280260707635009, + "grad_norm": 1.7237368822097778, + "learning_rate": 9.748245519026353e-07, + "loss": 0.8567, + "step": 4597 + }, + { + "epoch": 4.281191806331471, + "grad_norm": 1.7701530456542969, + "learning_rate": 9.73843799198077e-07, + "loss": 0.8998, + "step": 4598 + }, + { + "epoch": 4.282122905027933, + "grad_norm": 1.7479349374771118, + "learning_rate": 9.728634207526228e-07, + "loss": 0.8876, + "step": 4599 + }, + { + "epoch": 4.283054003724395, + "grad_norm": 1.7546591758728027, + "learning_rate": 9.718834168066904e-07, + "loss": 0.8733, + "step": 4600 + }, + { + "epoch": 4.283985102420857, + "grad_norm": 1.8057647943496704, + "learning_rate": 9.709037876006065e-07, + "loss": 0.8738, + "step": 4601 + }, + { + "epoch": 4.284916201117318, + "grad_norm": 1.82762610912323, + "learning_rate": 9.699245333746052e-07, + "loss": 0.9078, + "step": 4602 + }, + { + "epoch": 4.28584729981378, + "grad_norm": 1.8134896755218506, + "learning_rate": 9.689456543688288e-07, + "loss": 0.913, + "step": 4603 + }, + { + "epoch": 4.286778398510242, + "grad_norm": 1.846264123916626, + "learning_rate": 9.67967150823328e-07, + "loss": 0.8755, + "step": 4604 + }, + { + "epoch": 4.287709497206704, + "grad_norm": 1.7718585729599, + "learning_rate": 9.669890229780607e-07, + "loss": 0.8952, + "step": 4605 + }, + { + "epoch": 4.288640595903166, + "grad_norm": 1.8315508365631104, + "learning_rate": 9.660112710728934e-07, + "loss": 0.89, + "step": 4606 + }, + { + "epoch": 4.289571694599627, + "grad_norm": 1.7807248830795288, + "learning_rate": 9.650338953476002e-07, + "loss": 0.8756, + "step": 4607 + }, + { + "epoch": 4.290502793296089, + "grad_norm": 1.7499626874923706, + "learning_rate": 9.640568960418622e-07, + "loss": 0.8445, + "step": 4608 + }, + { + "epoch": 4.291433891992551, + "grad_norm": 1.7912813425064087, + "learning_rate": 9.630802733952697e-07, + "loss": 0.9042, + "step": 4609 + }, + { + "epoch": 4.292364990689013, + "grad_norm": 1.7681941986083984, + "learning_rate": 9.62104027647319e-07, + "loss": 0.8726, + "step": 4610 + }, + { + "epoch": 4.293296089385475, + "grad_norm": 1.8276888132095337, + "learning_rate": 9.61128159037415e-07, + "loss": 0.8988, + "step": 4611 + }, + { + "epoch": 4.294227188081937, + "grad_norm": 1.8095066547393799, + "learning_rate": 9.6015266780487e-07, + "loss": 0.8897, + "step": 4612 + }, + { + "epoch": 4.295158286778398, + "grad_norm": 1.8648639917373657, + "learning_rate": 9.591775541889033e-07, + "loss": 0.8843, + "step": 4613 + }, + { + "epoch": 4.29608938547486, + "grad_norm": 1.7964116334915161, + "learning_rate": 9.582028184286423e-07, + "loss": 0.8506, + "step": 4614 + }, + { + "epoch": 4.297020484171322, + "grad_norm": 1.7880585193634033, + "learning_rate": 9.572284607631219e-07, + "loss": 0.9102, + "step": 4615 + }, + { + "epoch": 4.297951582867784, + "grad_norm": 1.8076939582824707, + "learning_rate": 9.562544814312812e-07, + "loss": 0.851, + "step": 4616 + }, + { + "epoch": 4.298882681564246, + "grad_norm": 1.7638685703277588, + "learning_rate": 9.552808806719716e-07, + "loss": 0.8666, + "step": 4617 + }, + { + "epoch": 4.299813780260708, + "grad_norm": 1.8273146152496338, + "learning_rate": 9.543076587239484e-07, + "loss": 0.8871, + "step": 4618 + }, + { + "epoch": 4.300744878957169, + "grad_norm": 1.8195624351501465, + "learning_rate": 9.533348158258751e-07, + "loss": 0.8527, + "step": 4619 + }, + { + "epoch": 4.301675977653631, + "grad_norm": 1.7724034786224365, + "learning_rate": 9.523623522163197e-07, + "loss": 0.8627, + "step": 4620 + }, + { + "epoch": 4.302607076350093, + "grad_norm": 1.8399306535720825, + "learning_rate": 9.513902681337619e-07, + "loss": 0.8793, + "step": 4621 + }, + { + "epoch": 4.303538175046555, + "grad_norm": 1.733980655670166, + "learning_rate": 9.504185638165855e-07, + "loss": 0.8721, + "step": 4622 + }, + { + "epoch": 4.304469273743017, + "grad_norm": 1.782527208328247, + "learning_rate": 9.49447239503079e-07, + "loss": 0.9109, + "step": 4623 + }, + { + "epoch": 4.305400372439479, + "grad_norm": 1.7658100128173828, + "learning_rate": 9.48476295431443e-07, + "loss": 0.866, + "step": 4624 + }, + { + "epoch": 4.30633147113594, + "grad_norm": 1.7950258255004883, + "learning_rate": 9.475057318397807e-07, + "loss": 0.9106, + "step": 4625 + }, + { + "epoch": 4.307262569832402, + "grad_norm": 1.7671973705291748, + "learning_rate": 9.465355489661043e-07, + "loss": 0.8755, + "step": 4626 + }, + { + "epoch": 4.308193668528864, + "grad_norm": 1.7818858623504639, + "learning_rate": 9.455657470483293e-07, + "loss": 0.8639, + "step": 4627 + }, + { + "epoch": 4.309124767225326, + "grad_norm": 1.845274567604065, + "learning_rate": 9.445963263242822e-07, + "loss": 0.8808, + "step": 4628 + }, + { + "epoch": 4.310055865921788, + "grad_norm": 1.7942404747009277, + "learning_rate": 9.436272870316942e-07, + "loss": 0.8677, + "step": 4629 + }, + { + "epoch": 4.31098696461825, + "grad_norm": 1.8842107057571411, + "learning_rate": 9.426586294082013e-07, + "loss": 0.8922, + "step": 4630 + }, + { + "epoch": 4.311918063314711, + "grad_norm": 1.7267545461654663, + "learning_rate": 9.416903536913466e-07, + "loss": 0.8335, + "step": 4631 + }, + { + "epoch": 4.312849162011173, + "grad_norm": 1.8038636445999146, + "learning_rate": 9.407224601185824e-07, + "loss": 0.8596, + "step": 4632 + }, + { + "epoch": 4.3137802607076345, + "grad_norm": 1.8320026397705078, + "learning_rate": 9.397549489272653e-07, + "loss": 0.8924, + "step": 4633 + }, + { + "epoch": 4.314711359404097, + "grad_norm": 1.8169190883636475, + "learning_rate": 9.387878203546549e-07, + "loss": 0.8985, + "step": 4634 + }, + { + "epoch": 4.315642458100559, + "grad_norm": 1.7514070272445679, + "learning_rate": 9.378210746379229e-07, + "loss": 0.8734, + "step": 4635 + }, + { + "epoch": 4.316573556797021, + "grad_norm": 1.7552448511123657, + "learning_rate": 9.368547120141441e-07, + "loss": 0.8569, + "step": 4636 + }, + { + "epoch": 4.317504655493482, + "grad_norm": 1.800512671470642, + "learning_rate": 9.358887327202981e-07, + "loss": 0.8657, + "step": 4637 + }, + { + "epoch": 4.318435754189944, + "grad_norm": 1.79988694190979, + "learning_rate": 9.349231369932715e-07, + "loss": 0.9028, + "step": 4638 + }, + { + "epoch": 4.3193668528864055, + "grad_norm": 1.797728180885315, + "learning_rate": 9.339579250698589e-07, + "loss": 0.9025, + "step": 4639 + }, + { + "epoch": 4.320297951582868, + "grad_norm": 1.7657859325408936, + "learning_rate": 9.329930971867596e-07, + "loss": 0.9081, + "step": 4640 + }, + { + "epoch": 4.32122905027933, + "grad_norm": 1.79705810546875, + "learning_rate": 9.32028653580575e-07, + "loss": 0.8876, + "step": 4641 + }, + { + "epoch": 4.322160148975791, + "grad_norm": 1.7535821199417114, + "learning_rate": 9.310645944878185e-07, + "loss": 0.8633, + "step": 4642 + }, + { + "epoch": 4.323091247672253, + "grad_norm": 1.756882905960083, + "learning_rate": 9.301009201449063e-07, + "loss": 0.875, + "step": 4643 + }, + { + "epoch": 4.324022346368715, + "grad_norm": 1.7465287446975708, + "learning_rate": 9.291376307881581e-07, + "loss": 0.8579, + "step": 4644 + }, + { + "epoch": 4.3249534450651765, + "grad_norm": 1.7324336767196655, + "learning_rate": 9.281747266538011e-07, + "loss": 0.8749, + "step": 4645 + }, + { + "epoch": 4.325884543761639, + "grad_norm": 1.9030883312225342, + "learning_rate": 9.272122079779713e-07, + "loss": 0.9162, + "step": 4646 + }, + { + "epoch": 4.326815642458101, + "grad_norm": 1.698173999786377, + "learning_rate": 9.262500749967041e-07, + "loss": 0.846, + "step": 4647 + }, + { + "epoch": 4.327746741154562, + "grad_norm": 1.8473979234695435, + "learning_rate": 9.25288327945944e-07, + "loss": 0.8634, + "step": 4648 + }, + { + "epoch": 4.328677839851024, + "grad_norm": 1.8178915977478027, + "learning_rate": 9.243269670615396e-07, + "loss": 0.8646, + "step": 4649 + }, + { + "epoch": 4.329608938547486, + "grad_norm": 1.8191344738006592, + "learning_rate": 9.233659925792476e-07, + "loss": 0.8783, + "step": 4650 + }, + { + "epoch": 4.3305400372439475, + "grad_norm": 1.7983900308609009, + "learning_rate": 9.224054047347256e-07, + "loss": 0.8442, + "step": 4651 + }, + { + "epoch": 4.33147113594041, + "grad_norm": 1.7840402126312256, + "learning_rate": 9.214452037635377e-07, + "loss": 0.8936, + "step": 4652 + }, + { + "epoch": 4.332402234636872, + "grad_norm": 1.8222264051437378, + "learning_rate": 9.204853899011567e-07, + "loss": 0.9108, + "step": 4653 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.866690993309021, + "learning_rate": 9.195259633829553e-07, + "loss": 0.9053, + "step": 4654 + }, + { + "epoch": 4.334264432029795, + "grad_norm": 1.916879653930664, + "learning_rate": 9.185669244442144e-07, + "loss": 0.9094, + "step": 4655 + }, + { + "epoch": 4.335195530726257, + "grad_norm": 1.8632758855819702, + "learning_rate": 9.176082733201181e-07, + "loss": 0.8561, + "step": 4656 + }, + { + "epoch": 4.3361266294227185, + "grad_norm": 1.8305007219314575, + "learning_rate": 9.166500102457584e-07, + "loss": 0.86, + "step": 4657 + }, + { + "epoch": 4.337057728119181, + "grad_norm": 1.7265515327453613, + "learning_rate": 9.156921354561282e-07, + "loss": 0.8427, + "step": 4658 + }, + { + "epoch": 4.337988826815643, + "grad_norm": 1.8293341398239136, + "learning_rate": 9.147346491861276e-07, + "loss": 0.8669, + "step": 4659 + }, + { + "epoch": 4.338919925512104, + "grad_norm": 1.7909905910491943, + "learning_rate": 9.137775516705604e-07, + "loss": 0.881, + "step": 4660 + }, + { + "epoch": 4.339851024208566, + "grad_norm": 1.7975465059280396, + "learning_rate": 9.12820843144136e-07, + "loss": 0.8377, + "step": 4661 + }, + { + "epoch": 4.340782122905028, + "grad_norm": 1.7864269018173218, + "learning_rate": 9.118645238414681e-07, + "loss": 0.8858, + "step": 4662 + }, + { + "epoch": 4.3417132216014895, + "grad_norm": 1.7692947387695312, + "learning_rate": 9.109085939970733e-07, + "loss": 0.8966, + "step": 4663 + }, + { + "epoch": 4.342644320297952, + "grad_norm": 1.8217580318450928, + "learning_rate": 9.099530538453771e-07, + "loss": 0.8893, + "step": 4664 + }, + { + "epoch": 4.343575418994414, + "grad_norm": 1.7615762948989868, + "learning_rate": 9.089979036207042e-07, + "loss": 0.829, + "step": 4665 + }, + { + "epoch": 4.344506517690875, + "grad_norm": 1.8003782033920288, + "learning_rate": 9.080431435572862e-07, + "loss": 0.884, + "step": 4666 + }, + { + "epoch": 4.345437616387337, + "grad_norm": 1.7892361879348755, + "learning_rate": 9.070887738892592e-07, + "loss": 0.8633, + "step": 4667 + }, + { + "epoch": 4.346368715083799, + "grad_norm": 1.8369395732879639, + "learning_rate": 9.06134794850663e-07, + "loss": 0.9176, + "step": 4668 + }, + { + "epoch": 4.3472998137802605, + "grad_norm": 1.730631709098816, + "learning_rate": 9.05181206675442e-07, + "loss": 0.8588, + "step": 4669 + }, + { + "epoch": 4.348230912476723, + "grad_norm": 1.7462267875671387, + "learning_rate": 9.042280095974435e-07, + "loss": 0.8609, + "step": 4670 + }, + { + "epoch": 4.349162011173185, + "grad_norm": 1.8034248352050781, + "learning_rate": 9.032752038504222e-07, + "loss": 0.8632, + "step": 4671 + }, + { + "epoch": 4.350093109869646, + "grad_norm": 1.77486252784729, + "learning_rate": 9.023227896680325e-07, + "loss": 0.8681, + "step": 4672 + }, + { + "epoch": 4.351024208566108, + "grad_norm": 1.7913492918014526, + "learning_rate": 9.013707672838353e-07, + "loss": 0.9128, + "step": 4673 + }, + { + "epoch": 4.351955307262569, + "grad_norm": 1.7689261436462402, + "learning_rate": 9.004191369312953e-07, + "loss": 0.8858, + "step": 4674 + }, + { + "epoch": 4.3528864059590315, + "grad_norm": 1.7693501710891724, + "learning_rate": 8.994678988437802e-07, + "loss": 0.8444, + "step": 4675 + }, + { + "epoch": 4.353817504655494, + "grad_norm": 1.756178617477417, + "learning_rate": 8.985170532545623e-07, + "loss": 0.8803, + "step": 4676 + }, + { + "epoch": 4.354748603351955, + "grad_norm": 1.7510316371917725, + "learning_rate": 8.975666003968175e-07, + "loss": 0.8904, + "step": 4677 + }, + { + "epoch": 4.355679702048417, + "grad_norm": 1.8701618909835815, + "learning_rate": 8.966165405036248e-07, + "loss": 0.8507, + "step": 4678 + }, + { + "epoch": 4.356610800744879, + "grad_norm": 1.873578667640686, + "learning_rate": 8.956668738079676e-07, + "loss": 0.8934, + "step": 4679 + }, + { + "epoch": 4.35754189944134, + "grad_norm": 1.8069595098495483, + "learning_rate": 8.947176005427324e-07, + "loss": 0.8939, + "step": 4680 + }, + { + "epoch": 4.3584729981378025, + "grad_norm": 1.7824443578720093, + "learning_rate": 8.937687209407098e-07, + "loss": 0.8886, + "step": 4681 + }, + { + "epoch": 4.359404096834265, + "grad_norm": 1.8041117191314697, + "learning_rate": 8.928202352345927e-07, + "loss": 0.8693, + "step": 4682 + }, + { + "epoch": 4.360335195530726, + "grad_norm": 1.8021392822265625, + "learning_rate": 8.918721436569786e-07, + "loss": 0.8645, + "step": 4683 + }, + { + "epoch": 4.361266294227188, + "grad_norm": 1.8212915658950806, + "learning_rate": 8.90924446440368e-07, + "loss": 0.9079, + "step": 4684 + }, + { + "epoch": 4.36219739292365, + "grad_norm": 1.7713114023208618, + "learning_rate": 8.899771438171642e-07, + "loss": 0.9082, + "step": 4685 + }, + { + "epoch": 4.363128491620111, + "grad_norm": 1.8199948072433472, + "learning_rate": 8.890302360196742e-07, + "loss": 0.8771, + "step": 4686 + }, + { + "epoch": 4.3640595903165735, + "grad_norm": 1.7907487154006958, + "learning_rate": 8.880837232801081e-07, + "loss": 0.8802, + "step": 4687 + }, + { + "epoch": 4.364990689013036, + "grad_norm": 1.846829891204834, + "learning_rate": 8.871376058305794e-07, + "loss": 0.8485, + "step": 4688 + }, + { + "epoch": 4.365921787709497, + "grad_norm": 1.8024052381515503, + "learning_rate": 8.861918839031042e-07, + "loss": 0.8761, + "step": 4689 + }, + { + "epoch": 4.366852886405959, + "grad_norm": 1.778832197189331, + "learning_rate": 8.852465577296016e-07, + "loss": 0.9069, + "step": 4690 + }, + { + "epoch": 4.367783985102421, + "grad_norm": 1.8085780143737793, + "learning_rate": 8.843016275418939e-07, + "loss": 0.8985, + "step": 4691 + }, + { + "epoch": 4.368715083798882, + "grad_norm": 1.801321029663086, + "learning_rate": 8.833570935717065e-07, + "loss": 0.8841, + "step": 4692 + }, + { + "epoch": 4.3696461824953445, + "grad_norm": 1.7649714946746826, + "learning_rate": 8.82412956050667e-07, + "loss": 0.8423, + "step": 4693 + }, + { + "epoch": 4.370577281191807, + "grad_norm": 1.800034523010254, + "learning_rate": 8.814692152103063e-07, + "loss": 0.9159, + "step": 4694 + }, + { + "epoch": 4.371508379888268, + "grad_norm": 1.820361614227295, + "learning_rate": 8.805258712820578e-07, + "loss": 0.8456, + "step": 4695 + }, + { + "epoch": 4.37243947858473, + "grad_norm": 1.8390058279037476, + "learning_rate": 8.795829244972573e-07, + "loss": 0.8961, + "step": 4696 + }, + { + "epoch": 4.373370577281192, + "grad_norm": 1.78786301612854, + "learning_rate": 8.786403750871442e-07, + "loss": 0.8789, + "step": 4697 + }, + { + "epoch": 4.374301675977653, + "grad_norm": 1.7607841491699219, + "learning_rate": 8.776982232828602e-07, + "loss": 0.8435, + "step": 4698 + }, + { + "epoch": 4.3752327746741155, + "grad_norm": 1.8964289426803589, + "learning_rate": 8.767564693154465e-07, + "loss": 0.9152, + "step": 4699 + }, + { + "epoch": 4.376163873370578, + "grad_norm": 1.8596910238265991, + "learning_rate": 8.758151134158521e-07, + "loss": 0.9227, + "step": 4700 + }, + { + "epoch": 4.377094972067039, + "grad_norm": 1.765878438949585, + "learning_rate": 8.748741558149246e-07, + "loss": 0.8701, + "step": 4701 + }, + { + "epoch": 4.378026070763501, + "grad_norm": 1.7889503240585327, + "learning_rate": 8.739335967434151e-07, + "loss": 0.8751, + "step": 4702 + }, + { + "epoch": 4.378957169459963, + "grad_norm": 1.7116212844848633, + "learning_rate": 8.729934364319764e-07, + "loss": 0.8452, + "step": 4703 + }, + { + "epoch": 4.379888268156424, + "grad_norm": 1.7578794956207275, + "learning_rate": 8.720536751111641e-07, + "loss": 0.8952, + "step": 4704 + }, + { + "epoch": 4.3808193668528865, + "grad_norm": 1.789943814277649, + "learning_rate": 8.711143130114369e-07, + "loss": 0.8935, + "step": 4705 + }, + { + "epoch": 4.381750465549349, + "grad_norm": 1.8069759607315063, + "learning_rate": 8.701753503631516e-07, + "loss": 0.9326, + "step": 4706 + }, + { + "epoch": 4.38268156424581, + "grad_norm": 1.8406575918197632, + "learning_rate": 8.692367873965724e-07, + "loss": 0.924, + "step": 4707 + }, + { + "epoch": 4.383612662942272, + "grad_norm": 1.744400978088379, + "learning_rate": 8.68298624341862e-07, + "loss": 0.9003, + "step": 4708 + }, + { + "epoch": 4.384543761638733, + "grad_norm": 1.7727299928665161, + "learning_rate": 8.673608614290865e-07, + "loss": 0.8783, + "step": 4709 + }, + { + "epoch": 4.385474860335195, + "grad_norm": 1.7119126319885254, + "learning_rate": 8.664234988882131e-07, + "loss": 0.8738, + "step": 4710 + }, + { + "epoch": 4.3864059590316575, + "grad_norm": 1.7791107892990112, + "learning_rate": 8.654865369491111e-07, + "loss": 0.8765, + "step": 4711 + }, + { + "epoch": 4.387337057728119, + "grad_norm": 1.8230277299880981, + "learning_rate": 8.645499758415526e-07, + "loss": 0.8738, + "step": 4712 + }, + { + "epoch": 4.388268156424581, + "grad_norm": 1.781645655632019, + "learning_rate": 8.636138157952076e-07, + "loss": 0.8764, + "step": 4713 + }, + { + "epoch": 4.389199255121043, + "grad_norm": 1.852137565612793, + "learning_rate": 8.626780570396531e-07, + "loss": 0.8935, + "step": 4714 + }, + { + "epoch": 4.390130353817504, + "grad_norm": 1.8440238237380981, + "learning_rate": 8.617426998043652e-07, + "loss": 0.8797, + "step": 4715 + }, + { + "epoch": 4.391061452513966, + "grad_norm": 1.898415446281433, + "learning_rate": 8.608077443187193e-07, + "loss": 0.9281, + "step": 4716 + }, + { + "epoch": 4.3919925512104285, + "grad_norm": 1.8493984937667847, + "learning_rate": 8.598731908119953e-07, + "loss": 0.9342, + "step": 4717 + }, + { + "epoch": 4.39292364990689, + "grad_norm": 1.8758444786071777, + "learning_rate": 8.589390395133748e-07, + "loss": 0.9026, + "step": 4718 + }, + { + "epoch": 4.393854748603352, + "grad_norm": 1.8137794733047485, + "learning_rate": 8.580052906519396e-07, + "loss": 0.9219, + "step": 4719 + }, + { + "epoch": 4.394785847299814, + "grad_norm": 1.7815420627593994, + "learning_rate": 8.570719444566703e-07, + "loss": 0.8534, + "step": 4720 + }, + { + "epoch": 4.395716945996275, + "grad_norm": 1.806694507598877, + "learning_rate": 8.561390011564538e-07, + "loss": 0.886, + "step": 4721 + }, + { + "epoch": 4.396648044692737, + "grad_norm": 1.8989911079406738, + "learning_rate": 8.552064609800759e-07, + "loss": 0.8667, + "step": 4722 + }, + { + "epoch": 4.3975791433891995, + "grad_norm": 1.821148157119751, + "learning_rate": 8.542743241562212e-07, + "loss": 0.9103, + "step": 4723 + }, + { + "epoch": 4.398510242085661, + "grad_norm": 1.8119088411331177, + "learning_rate": 8.533425909134779e-07, + "loss": 0.854, + "step": 4724 + }, + { + "epoch": 4.399441340782123, + "grad_norm": 1.8332998752593994, + "learning_rate": 8.524112614803365e-07, + "loss": 0.9038, + "step": 4725 + }, + { + "epoch": 4.400372439478585, + "grad_norm": 1.7998125553131104, + "learning_rate": 8.514803360851867e-07, + "loss": 0.9222, + "step": 4726 + }, + { + "epoch": 4.401303538175046, + "grad_norm": 1.7936910390853882, + "learning_rate": 8.505498149563174e-07, + "loss": 0.842, + "step": 4727 + }, + { + "epoch": 4.402234636871508, + "grad_norm": 1.7958439588546753, + "learning_rate": 8.496196983219205e-07, + "loss": 0.8719, + "step": 4728 + }, + { + "epoch": 4.4031657355679705, + "grad_norm": 1.759028673171997, + "learning_rate": 8.486899864100906e-07, + "loss": 0.9072, + "step": 4729 + }, + { + "epoch": 4.404096834264432, + "grad_norm": 1.821743369102478, + "learning_rate": 8.477606794488182e-07, + "loss": 0.9106, + "step": 4730 + }, + { + "epoch": 4.405027932960894, + "grad_norm": 1.7689661979675293, + "learning_rate": 8.468317776659979e-07, + "loss": 0.878, + "step": 4731 + }, + { + "epoch": 4.405959031657356, + "grad_norm": 1.81678307056427, + "learning_rate": 8.459032812894252e-07, + "loss": 0.8897, + "step": 4732 + }, + { + "epoch": 4.406890130353817, + "grad_norm": 1.8086432218551636, + "learning_rate": 8.449751905467951e-07, + "loss": 0.9132, + "step": 4733 + }, + { + "epoch": 4.407821229050279, + "grad_norm": 1.8080083131790161, + "learning_rate": 8.440475056657019e-07, + "loss": 0.8832, + "step": 4734 + }, + { + "epoch": 4.4087523277467415, + "grad_norm": 1.8164284229278564, + "learning_rate": 8.431202268736413e-07, + "loss": 0.8978, + "step": 4735 + }, + { + "epoch": 4.409683426443203, + "grad_norm": 1.7544344663619995, + "learning_rate": 8.421933543980126e-07, + "loss": 0.8701, + "step": 4736 + }, + { + "epoch": 4.410614525139665, + "grad_norm": 1.892697811126709, + "learning_rate": 8.412668884661099e-07, + "loss": 0.8689, + "step": 4737 + }, + { + "epoch": 4.411545623836127, + "grad_norm": 1.8120418787002563, + "learning_rate": 8.403408293051302e-07, + "loss": 0.8702, + "step": 4738 + }, + { + "epoch": 4.412476722532588, + "grad_norm": 1.8047159910202026, + "learning_rate": 8.39415177142173e-07, + "loss": 0.8939, + "step": 4739 + }, + { + "epoch": 4.41340782122905, + "grad_norm": 1.7815989255905151, + "learning_rate": 8.384899322042355e-07, + "loss": 0.8939, + "step": 4740 + }, + { + "epoch": 4.4143389199255125, + "grad_norm": 1.7623627185821533, + "learning_rate": 8.375650947182137e-07, + "loss": 0.8865, + "step": 4741 + }, + { + "epoch": 4.415270018621974, + "grad_norm": 1.8120049238204956, + "learning_rate": 8.366406649109058e-07, + "loss": 0.9085, + "step": 4742 + }, + { + "epoch": 4.416201117318436, + "grad_norm": 1.806450366973877, + "learning_rate": 8.357166430090119e-07, + "loss": 0.8701, + "step": 4743 + }, + { + "epoch": 4.417132216014897, + "grad_norm": 1.811549425125122, + "learning_rate": 8.347930292391268e-07, + "loss": 0.9026, + "step": 4744 + }, + { + "epoch": 4.418063314711359, + "grad_norm": 1.7355504035949707, + "learning_rate": 8.338698238277498e-07, + "loss": 0.8586, + "step": 4745 + }, + { + "epoch": 4.418994413407821, + "grad_norm": 1.8455661535263062, + "learning_rate": 8.32947027001278e-07, + "loss": 0.8883, + "step": 4746 + }, + { + "epoch": 4.419925512104283, + "grad_norm": 1.821180820465088, + "learning_rate": 8.32024638986009e-07, + "loss": 0.8946, + "step": 4747 + }, + { + "epoch": 4.420856610800745, + "grad_norm": 1.758108377456665, + "learning_rate": 8.311026600081395e-07, + "loss": 0.8502, + "step": 4748 + }, + { + "epoch": 4.421787709497207, + "grad_norm": 1.8256518840789795, + "learning_rate": 8.30181090293766e-07, + "loss": 0.9042, + "step": 4749 + }, + { + "epoch": 4.422718808193668, + "grad_norm": 1.7990506887435913, + "learning_rate": 8.292599300688869e-07, + "loss": 0.9337, + "step": 4750 + }, + { + "epoch": 4.42364990689013, + "grad_norm": 1.8311277627944946, + "learning_rate": 8.283391795593962e-07, + "loss": 0.9241, + "step": 4751 + }, + { + "epoch": 4.424581005586592, + "grad_norm": 1.8120989799499512, + "learning_rate": 8.274188389910898e-07, + "loss": 0.8706, + "step": 4752 + }, + { + "epoch": 4.425512104283054, + "grad_norm": 1.794127345085144, + "learning_rate": 8.264989085896633e-07, + "loss": 0.8747, + "step": 4753 + }, + { + "epoch": 4.426443202979516, + "grad_norm": 1.7502238750457764, + "learning_rate": 8.255793885807104e-07, + "loss": 0.8588, + "step": 4754 + }, + { + "epoch": 4.427374301675978, + "grad_norm": 1.7457607984542847, + "learning_rate": 8.246602791897255e-07, + "loss": 0.8671, + "step": 4755 + }, + { + "epoch": 4.428305400372439, + "grad_norm": 1.7715762853622437, + "learning_rate": 8.237415806421015e-07, + "loss": 0.8698, + "step": 4756 + }, + { + "epoch": 4.429236499068901, + "grad_norm": 1.7810612916946411, + "learning_rate": 8.228232931631305e-07, + "loss": 0.8229, + "step": 4757 + }, + { + "epoch": 4.430167597765363, + "grad_norm": 1.8673218488693237, + "learning_rate": 8.219054169780041e-07, + "loss": 0.8691, + "step": 4758 + }, + { + "epoch": 4.431098696461825, + "grad_norm": 1.7508617639541626, + "learning_rate": 8.209879523118133e-07, + "loss": 0.8337, + "step": 4759 + }, + { + "epoch": 4.432029795158287, + "grad_norm": 1.8408684730529785, + "learning_rate": 8.200708993895476e-07, + "loss": 0.91, + "step": 4760 + }, + { + "epoch": 4.432960893854749, + "grad_norm": 1.7703893184661865, + "learning_rate": 8.191542584360957e-07, + "loss": 0.8761, + "step": 4761 + }, + { + "epoch": 4.43389199255121, + "grad_norm": 1.785624384880066, + "learning_rate": 8.182380296762457e-07, + "loss": 0.8717, + "step": 4762 + }, + { + "epoch": 4.434823091247672, + "grad_norm": 1.7466684579849243, + "learning_rate": 8.173222133346837e-07, + "loss": 0.8914, + "step": 4763 + }, + { + "epoch": 4.435754189944134, + "grad_norm": 1.7974501848220825, + "learning_rate": 8.164068096359959e-07, + "loss": 0.8645, + "step": 4764 + }, + { + "epoch": 4.436685288640596, + "grad_norm": 1.8699138164520264, + "learning_rate": 8.15491818804666e-07, + "loss": 0.9228, + "step": 4765 + }, + { + "epoch": 4.437616387337058, + "grad_norm": 1.8788704872131348, + "learning_rate": 8.145772410650777e-07, + "loss": 0.9019, + "step": 4766 + }, + { + "epoch": 4.43854748603352, + "grad_norm": 1.8652856349945068, + "learning_rate": 8.136630766415121e-07, + "loss": 0.8888, + "step": 4767 + }, + { + "epoch": 4.439478584729981, + "grad_norm": 1.8666926622390747, + "learning_rate": 8.127493257581503e-07, + "loss": 0.9068, + "step": 4768 + }, + { + "epoch": 4.440409683426443, + "grad_norm": 1.845198154449463, + "learning_rate": 8.118359886390709e-07, + "loss": 0.9087, + "step": 4769 + }, + { + "epoch": 4.441340782122905, + "grad_norm": 1.7786020040512085, + "learning_rate": 8.109230655082517e-07, + "loss": 0.8649, + "step": 4770 + }, + { + "epoch": 4.442271880819367, + "grad_norm": 1.830166220664978, + "learning_rate": 8.100105565895685e-07, + "loss": 0.8873, + "step": 4771 + }, + { + "epoch": 4.443202979515829, + "grad_norm": 1.826613187789917, + "learning_rate": 8.090984621067963e-07, + "loss": 0.8915, + "step": 4772 + }, + { + "epoch": 4.444134078212291, + "grad_norm": 1.9588901996612549, + "learning_rate": 8.08186782283607e-07, + "loss": 0.9208, + "step": 4773 + }, + { + "epoch": 4.445065176908752, + "grad_norm": 1.809730887413025, + "learning_rate": 8.072755173435726e-07, + "loss": 0.8746, + "step": 4774 + }, + { + "epoch": 4.445996275605214, + "grad_norm": 1.8880361318588257, + "learning_rate": 8.063646675101619e-07, + "loss": 0.9107, + "step": 4775 + }, + { + "epoch": 4.446927374301676, + "grad_norm": 1.7470858097076416, + "learning_rate": 8.054542330067428e-07, + "loss": 0.8691, + "step": 4776 + }, + { + "epoch": 4.447858472998138, + "grad_norm": 1.8388022184371948, + "learning_rate": 8.045442140565807e-07, + "loss": 0.9108, + "step": 4777 + }, + { + "epoch": 4.4487895716946, + "grad_norm": 1.7911401987075806, + "learning_rate": 8.0363461088284e-07, + "loss": 0.8753, + "step": 4778 + }, + { + "epoch": 4.449720670391061, + "grad_norm": 1.830960750579834, + "learning_rate": 8.027254237085822e-07, + "loss": 0.9063, + "step": 4779 + }, + { + "epoch": 4.450651769087523, + "grad_norm": 2.1027004718780518, + "learning_rate": 8.018166527567672e-07, + "loss": 0.902, + "step": 4780 + }, + { + "epoch": 4.451582867783985, + "grad_norm": 1.6773428916931152, + "learning_rate": 8.009082982502531e-07, + "loss": 0.8285, + "step": 4781 + }, + { + "epoch": 4.452513966480447, + "grad_norm": 1.7870938777923584, + "learning_rate": 8.000003604117951e-07, + "loss": 0.877, + "step": 4782 + }, + { + "epoch": 4.453445065176909, + "grad_norm": 1.757403016090393, + "learning_rate": 7.990928394640469e-07, + "loss": 0.8425, + "step": 4783 + }, + { + "epoch": 4.454376163873371, + "grad_norm": 1.812857747077942, + "learning_rate": 7.981857356295605e-07, + "loss": 0.8489, + "step": 4784 + }, + { + "epoch": 4.455307262569832, + "grad_norm": 1.8535525798797607, + "learning_rate": 7.972790491307827e-07, + "loss": 0.8429, + "step": 4785 + }, + { + "epoch": 4.456238361266294, + "grad_norm": 1.7605764865875244, + "learning_rate": 7.963727801900623e-07, + "loss": 0.8978, + "step": 4786 + }, + { + "epoch": 4.457169459962756, + "grad_norm": 1.8446509838104248, + "learning_rate": 7.954669290296427e-07, + "loss": 0.9481, + "step": 4787 + }, + { + "epoch": 4.4581005586592175, + "grad_norm": 1.833816409111023, + "learning_rate": 7.945614958716658e-07, + "loss": 0.8749, + "step": 4788 + }, + { + "epoch": 4.45903165735568, + "grad_norm": 1.8399702310562134, + "learning_rate": 7.936564809381709e-07, + "loss": 0.9163, + "step": 4789 + }, + { + "epoch": 4.459962756052142, + "grad_norm": 1.7930002212524414, + "learning_rate": 7.927518844510943e-07, + "loss": 0.9261, + "step": 4790 + }, + { + "epoch": 4.460893854748603, + "grad_norm": 1.8535377979278564, + "learning_rate": 7.918477066322714e-07, + "loss": 0.8952, + "step": 4791 + }, + { + "epoch": 4.461824953445065, + "grad_norm": 1.792407751083374, + "learning_rate": 7.909439477034309e-07, + "loss": 0.8688, + "step": 4792 + }, + { + "epoch": 4.462756052141527, + "grad_norm": 1.800287127494812, + "learning_rate": 7.900406078862042e-07, + "loss": 0.8789, + "step": 4793 + }, + { + "epoch": 4.4636871508379885, + "grad_norm": 1.8146910667419434, + "learning_rate": 7.891376874021162e-07, + "loss": 0.8775, + "step": 4794 + }, + { + "epoch": 4.464618249534451, + "grad_norm": 1.7478212118148804, + "learning_rate": 7.882351864725898e-07, + "loss": 0.8623, + "step": 4795 + }, + { + "epoch": 4.465549348230913, + "grad_norm": 1.845484972000122, + "learning_rate": 7.873331053189456e-07, + "loss": 0.9242, + "step": 4796 + }, + { + "epoch": 4.466480446927374, + "grad_norm": 1.81783926486969, + "learning_rate": 7.864314441624005e-07, + "loss": 0.872, + "step": 4797 + }, + { + "epoch": 4.467411545623836, + "grad_norm": 1.8407878875732422, + "learning_rate": 7.8553020322407e-07, + "loss": 0.9186, + "step": 4798 + }, + { + "epoch": 4.468342644320298, + "grad_norm": 1.8442916870117188, + "learning_rate": 7.846293827249624e-07, + "loss": 0.8786, + "step": 4799 + }, + { + "epoch": 4.4692737430167595, + "grad_norm": 1.8789836168289185, + "learning_rate": 7.837289828859884e-07, + "loss": 0.895, + "step": 4800 + }, + { + "epoch": 4.470204841713222, + "grad_norm": 1.7947131395339966, + "learning_rate": 7.828290039279523e-07, + "loss": 0.8016, + "step": 4801 + }, + { + "epoch": 4.471135940409684, + "grad_norm": 1.774171233177185, + "learning_rate": 7.819294460715566e-07, + "loss": 0.8639, + "step": 4802 + }, + { + "epoch": 4.472067039106145, + "grad_norm": 1.8351422548294067, + "learning_rate": 7.810303095373969e-07, + "loss": 0.8463, + "step": 4803 + }, + { + "epoch": 4.472998137802607, + "grad_norm": 1.7560762166976929, + "learning_rate": 7.801315945459714e-07, + "loss": 0.876, + "step": 4804 + }, + { + "epoch": 4.473929236499069, + "grad_norm": 1.7687538862228394, + "learning_rate": 7.792333013176717e-07, + "loss": 0.8656, + "step": 4805 + }, + { + "epoch": 4.4748603351955305, + "grad_norm": 1.8887925148010254, + "learning_rate": 7.783354300727835e-07, + "loss": 0.9304, + "step": 4806 + }, + { + "epoch": 4.475791433891993, + "grad_norm": 1.7851064205169678, + "learning_rate": 7.774379810314942e-07, + "loss": 0.9208, + "step": 4807 + }, + { + "epoch": 4.476722532588455, + "grad_norm": 1.8485733270645142, + "learning_rate": 7.765409544138843e-07, + "loss": 0.9229, + "step": 4808 + }, + { + "epoch": 4.477653631284916, + "grad_norm": 1.771230697631836, + "learning_rate": 7.756443504399325e-07, + "loss": 0.9154, + "step": 4809 + }, + { + "epoch": 4.478584729981378, + "grad_norm": 1.7369855642318726, + "learning_rate": 7.7474816932951e-07, + "loss": 0.8894, + "step": 4810 + }, + { + "epoch": 4.47951582867784, + "grad_norm": 1.8127371072769165, + "learning_rate": 7.738524113023901e-07, + "loss": 0.8556, + "step": 4811 + }, + { + "epoch": 4.4804469273743015, + "grad_norm": 1.8133959770202637, + "learning_rate": 7.729570765782393e-07, + "loss": 0.871, + "step": 4812 + }, + { + "epoch": 4.481378026070764, + "grad_norm": 1.802704095840454, + "learning_rate": 7.720621653766189e-07, + "loss": 0.8986, + "step": 4813 + }, + { + "epoch": 4.482309124767225, + "grad_norm": 1.7631899118423462, + "learning_rate": 7.711676779169875e-07, + "loss": 0.8373, + "step": 4814 + }, + { + "epoch": 4.483240223463687, + "grad_norm": 1.7754631042480469, + "learning_rate": 7.702736144187028e-07, + "loss": 0.888, + "step": 4815 + }, + { + "epoch": 4.484171322160149, + "grad_norm": 1.8155254125595093, + "learning_rate": 7.693799751010136e-07, + "loss": 0.8713, + "step": 4816 + }, + { + "epoch": 4.485102420856611, + "grad_norm": 1.751901626586914, + "learning_rate": 7.68486760183067e-07, + "loss": 0.8647, + "step": 4817 + }, + { + "epoch": 4.4860335195530725, + "grad_norm": 1.8239877223968506, + "learning_rate": 7.675939698839072e-07, + "loss": 0.9024, + "step": 4818 + }, + { + "epoch": 4.486964618249535, + "grad_norm": 1.7677234411239624, + "learning_rate": 7.667016044224735e-07, + "loss": 0.8792, + "step": 4819 + }, + { + "epoch": 4.487895716945996, + "grad_norm": 1.7824747562408447, + "learning_rate": 7.658096640175985e-07, + "loss": 0.9181, + "step": 4820 + }, + { + "epoch": 4.488826815642458, + "grad_norm": 1.8281453847885132, + "learning_rate": 7.649181488880131e-07, + "loss": 0.9055, + "step": 4821 + }, + { + "epoch": 4.48975791433892, + "grad_norm": 1.8496227264404297, + "learning_rate": 7.640270592523457e-07, + "loss": 0.8776, + "step": 4822 + }, + { + "epoch": 4.490689013035381, + "grad_norm": 1.775539517402649, + "learning_rate": 7.631363953291158e-07, + "loss": 0.8656, + "step": 4823 + }, + { + "epoch": 4.4916201117318435, + "grad_norm": 1.778825044631958, + "learning_rate": 7.622461573367413e-07, + "loss": 0.8775, + "step": 4824 + }, + { + "epoch": 4.492551210428306, + "grad_norm": 1.788356900215149, + "learning_rate": 7.613563454935344e-07, + "loss": 0.8646, + "step": 4825 + }, + { + "epoch": 4.493482309124767, + "grad_norm": 1.7931007146835327, + "learning_rate": 7.604669600177061e-07, + "loss": 0.879, + "step": 4826 + }, + { + "epoch": 4.494413407821229, + "grad_norm": 1.8443933725357056, + "learning_rate": 7.595780011273582e-07, + "loss": 0.9035, + "step": 4827 + }, + { + "epoch": 4.495344506517691, + "grad_norm": 1.781461238861084, + "learning_rate": 7.586894690404895e-07, + "loss": 0.8829, + "step": 4828 + }, + { + "epoch": 4.496275605214152, + "grad_norm": 1.8117586374282837, + "learning_rate": 7.57801363974997e-07, + "loss": 0.8628, + "step": 4829 + }, + { + "epoch": 4.4972067039106145, + "grad_norm": 1.8341553211212158, + "learning_rate": 7.569136861486687e-07, + "loss": 0.8809, + "step": 4830 + }, + { + "epoch": 4.498137802607077, + "grad_norm": 1.8261924982070923, + "learning_rate": 7.560264357791902e-07, + "loss": 0.8545, + "step": 4831 + }, + { + "epoch": 4.499068901303538, + "grad_norm": 1.797815203666687, + "learning_rate": 7.551396130841406e-07, + "loss": 0.8914, + "step": 4832 + }, + { + "epoch": 4.5, + "grad_norm": 1.808333396911621, + "learning_rate": 7.542532182809987e-07, + "loss": 0.8774, + "step": 4833 + }, + { + "epoch": 4.500931098696462, + "grad_norm": 1.766706943511963, + "learning_rate": 7.533672515871315e-07, + "loss": 0.8841, + "step": 4834 + }, + { + "epoch": 4.501862197392923, + "grad_norm": 1.878431797027588, + "learning_rate": 7.524817132198053e-07, + "loss": 0.9212, + "step": 4835 + }, + { + "epoch": 4.5027932960893855, + "grad_norm": 1.8471415042877197, + "learning_rate": 7.51596603396183e-07, + "loss": 0.9114, + "step": 4836 + }, + { + "epoch": 4.503724394785848, + "grad_norm": 1.8093715906143188, + "learning_rate": 7.507119223333168e-07, + "loss": 0.8828, + "step": 4837 + }, + { + "epoch": 4.504655493482309, + "grad_norm": 1.8035056591033936, + "learning_rate": 7.498276702481585e-07, + "loss": 0.9112, + "step": 4838 + }, + { + "epoch": 4.505586592178771, + "grad_norm": 1.7963917255401611, + "learning_rate": 7.489438473575519e-07, + "loss": 0.9205, + "step": 4839 + }, + { + "epoch": 4.506517690875233, + "grad_norm": 1.7810622453689575, + "learning_rate": 7.480604538782393e-07, + "loss": 0.8793, + "step": 4840 + }, + { + "epoch": 4.507448789571694, + "grad_norm": 1.785102128982544, + "learning_rate": 7.471774900268531e-07, + "loss": 0.8424, + "step": 4841 + }, + { + "epoch": 4.5083798882681565, + "grad_norm": 1.7570117712020874, + "learning_rate": 7.462949560199226e-07, + "loss": 0.8482, + "step": 4842 + }, + { + "epoch": 4.509310986964619, + "grad_norm": 1.9101332426071167, + "learning_rate": 7.454128520738721e-07, + "loss": 0.882, + "step": 4843 + }, + { + "epoch": 4.51024208566108, + "grad_norm": 1.8033291101455688, + "learning_rate": 7.445311784050193e-07, + "loss": 0.8853, + "step": 4844 + }, + { + "epoch": 4.511173184357542, + "grad_norm": 1.7979122400283813, + "learning_rate": 7.436499352295775e-07, + "loss": 0.8934, + "step": 4845 + }, + { + "epoch": 4.512104283054004, + "grad_norm": 1.7845351696014404, + "learning_rate": 7.427691227636536e-07, + "loss": 0.8824, + "step": 4846 + }, + { + "epoch": 4.513035381750465, + "grad_norm": 1.8282980918884277, + "learning_rate": 7.41888741223249e-07, + "loss": 0.9121, + "step": 4847 + }, + { + "epoch": 4.5139664804469275, + "grad_norm": 1.8121670484542847, + "learning_rate": 7.410087908242597e-07, + "loss": 0.855, + "step": 4848 + }, + { + "epoch": 4.514897579143389, + "grad_norm": 1.7613215446472168, + "learning_rate": 7.401292717824757e-07, + "loss": 0.8652, + "step": 4849 + }, + { + "epoch": 4.515828677839851, + "grad_norm": 1.8187661170959473, + "learning_rate": 7.392501843135815e-07, + "loss": 0.8528, + "step": 4850 + }, + { + "epoch": 4.516759776536313, + "grad_norm": 1.821890115737915, + "learning_rate": 7.383715286331555e-07, + "loss": 0.8563, + "step": 4851 + }, + { + "epoch": 4.517690875232775, + "grad_norm": 1.8196319341659546, + "learning_rate": 7.374933049566704e-07, + "loss": 0.8841, + "step": 4852 + }, + { + "epoch": 4.518621973929236, + "grad_norm": 1.7726709842681885, + "learning_rate": 7.366155134994931e-07, + "loss": 0.8974, + "step": 4853 + }, + { + "epoch": 4.5195530726256985, + "grad_norm": 1.7774194478988647, + "learning_rate": 7.35738154476884e-07, + "loss": 0.8706, + "step": 4854 + }, + { + "epoch": 4.52048417132216, + "grad_norm": 1.8048646450042725, + "learning_rate": 7.348612281039977e-07, + "loss": 0.894, + "step": 4855 + }, + { + "epoch": 4.521415270018622, + "grad_norm": 1.7804933786392212, + "learning_rate": 7.339847345958831e-07, + "loss": 0.9412, + "step": 4856 + }, + { + "epoch": 4.522346368715084, + "grad_norm": 1.8368923664093018, + "learning_rate": 7.331086741674823e-07, + "loss": 0.9013, + "step": 4857 + }, + { + "epoch": 4.523277467411545, + "grad_norm": 1.8403913974761963, + "learning_rate": 7.322330470336314e-07, + "loss": 0.91, + "step": 4858 + }, + { + "epoch": 4.524208566108007, + "grad_norm": 1.9273053407669067, + "learning_rate": 7.313578534090607e-07, + "loss": 0.8826, + "step": 4859 + }, + { + "epoch": 4.5251396648044695, + "grad_norm": 1.7729002237319946, + "learning_rate": 7.304830935083934e-07, + "loss": 0.8708, + "step": 4860 + }, + { + "epoch": 4.526070763500931, + "grad_norm": 1.8027873039245605, + "learning_rate": 7.296087675461475e-07, + "loss": 0.8594, + "step": 4861 + }, + { + "epoch": 4.527001862197393, + "grad_norm": 1.8889786005020142, + "learning_rate": 7.287348757367329e-07, + "loss": 0.8738, + "step": 4862 + }, + { + "epoch": 4.527932960893855, + "grad_norm": 1.7991100549697876, + "learning_rate": 7.278614182944547e-07, + "loss": 0.8682, + "step": 4863 + }, + { + "epoch": 4.528864059590316, + "grad_norm": 1.7868523597717285, + "learning_rate": 7.269883954335102e-07, + "loss": 0.8688, + "step": 4864 + }, + { + "epoch": 4.529795158286778, + "grad_norm": 1.8228610754013062, + "learning_rate": 7.261158073679913e-07, + "loss": 0.8905, + "step": 4865 + }, + { + "epoch": 4.5307262569832405, + "grad_norm": 1.866552710533142, + "learning_rate": 7.252436543118824e-07, + "loss": 0.8919, + "step": 4866 + }, + { + "epoch": 4.531657355679702, + "grad_norm": 1.7169156074523926, + "learning_rate": 7.243719364790613e-07, + "loss": 0.8136, + "step": 4867 + }, + { + "epoch": 4.532588454376164, + "grad_norm": 1.8556427955627441, + "learning_rate": 7.235006540832995e-07, + "loss": 0.9232, + "step": 4868 + }, + { + "epoch": 4.533519553072626, + "grad_norm": 1.7709615230560303, + "learning_rate": 7.226298073382612e-07, + "loss": 0.8551, + "step": 4869 + }, + { + "epoch": 4.534450651769087, + "grad_norm": 1.8398112058639526, + "learning_rate": 7.217593964575045e-07, + "loss": 0.8516, + "step": 4870 + }, + { + "epoch": 4.535381750465549, + "grad_norm": 1.8458967208862305, + "learning_rate": 7.208894216544798e-07, + "loss": 0.8586, + "step": 4871 + }, + { + "epoch": 4.5363128491620115, + "grad_norm": 1.8237415552139282, + "learning_rate": 7.200198831425309e-07, + "loss": 0.905, + "step": 4872 + }, + { + "epoch": 4.537243947858473, + "grad_norm": 1.841162085533142, + "learning_rate": 7.19150781134895e-07, + "loss": 0.8997, + "step": 4873 + }, + { + "epoch": 4.538175046554935, + "grad_norm": 1.7494664192199707, + "learning_rate": 7.182821158447015e-07, + "loss": 0.8693, + "step": 4874 + }, + { + "epoch": 4.539106145251397, + "grad_norm": 1.7804515361785889, + "learning_rate": 7.174138874849731e-07, + "loss": 0.863, + "step": 4875 + }, + { + "epoch": 4.540037243947858, + "grad_norm": 1.7727586030960083, + "learning_rate": 7.165460962686258e-07, + "loss": 0.8698, + "step": 4876 + }, + { + "epoch": 4.54096834264432, + "grad_norm": 1.7638684511184692, + "learning_rate": 7.156787424084679e-07, + "loss": 0.8326, + "step": 4877 + }, + { + "epoch": 4.5418994413407825, + "grad_norm": 1.7807317972183228, + "learning_rate": 7.148118261171991e-07, + "loss": 0.858, + "step": 4878 + }, + { + "epoch": 4.542830540037244, + "grad_norm": 1.7517857551574707, + "learning_rate": 7.139453476074145e-07, + "loss": 0.8309, + "step": 4879 + }, + { + "epoch": 4.543761638733706, + "grad_norm": 1.7257810831069946, + "learning_rate": 7.130793070916006e-07, + "loss": 0.8608, + "step": 4880 + }, + { + "epoch": 4.544692737430168, + "grad_norm": 1.8250972032546997, + "learning_rate": 7.122137047821371e-07, + "loss": 0.8671, + "step": 4881 + }, + { + "epoch": 4.545623836126629, + "grad_norm": 1.823484182357788, + "learning_rate": 7.113485408912926e-07, + "loss": 0.8655, + "step": 4882 + }, + { + "epoch": 4.546554934823091, + "grad_norm": 1.78757643699646, + "learning_rate": 7.104838156312338e-07, + "loss": 0.9049, + "step": 4883 + }, + { + "epoch": 4.547486033519553, + "grad_norm": 1.7498877048492432, + "learning_rate": 7.096195292140173e-07, + "loss": 0.8949, + "step": 4884 + }, + { + "epoch": 4.548417132216015, + "grad_norm": 1.867876410484314, + "learning_rate": 7.087556818515897e-07, + "loss": 0.8757, + "step": 4885 + }, + { + "epoch": 4.549348230912477, + "grad_norm": 1.8328959941864014, + "learning_rate": 7.078922737557944e-07, + "loss": 0.8756, + "step": 4886 + }, + { + "epoch": 4.550279329608939, + "grad_norm": 1.89278244972229, + "learning_rate": 7.07029305138364e-07, + "loss": 0.9246, + "step": 4887 + }, + { + "epoch": 4.5512104283054, + "grad_norm": 1.8167186975479126, + "learning_rate": 7.061667762109247e-07, + "loss": 0.8689, + "step": 4888 + }, + { + "epoch": 4.552141527001862, + "grad_norm": 1.8296791315078735, + "learning_rate": 7.053046871849927e-07, + "loss": 0.8662, + "step": 4889 + }, + { + "epoch": 4.553072625698324, + "grad_norm": 1.7703067064285278, + "learning_rate": 7.0444303827198e-07, + "loss": 0.8637, + "step": 4890 + }, + { + "epoch": 4.554003724394786, + "grad_norm": 1.8201336860656738, + "learning_rate": 7.035818296831887e-07, + "loss": 0.8772, + "step": 4891 + }, + { + "epoch": 4.554934823091248, + "grad_norm": 1.829553484916687, + "learning_rate": 7.027210616298102e-07, + "loss": 0.8894, + "step": 4892 + }, + { + "epoch": 4.55586592178771, + "grad_norm": 1.805327296257019, + "learning_rate": 7.018607343229333e-07, + "loss": 0.8844, + "step": 4893 + }, + { + "epoch": 4.556797020484171, + "grad_norm": 1.7788612842559814, + "learning_rate": 7.01000847973535e-07, + "loss": 0.8792, + "step": 4894 + }, + { + "epoch": 4.557728119180633, + "grad_norm": 1.7963896989822388, + "learning_rate": 7.001414027924861e-07, + "loss": 0.8731, + "step": 4895 + }, + { + "epoch": 4.558659217877095, + "grad_norm": 1.8227648735046387, + "learning_rate": 6.992823989905456e-07, + "loss": 0.8982, + "step": 4896 + }, + { + "epoch": 4.559590316573557, + "grad_norm": 1.829454779624939, + "learning_rate": 6.984238367783696e-07, + "loss": 0.9231, + "step": 4897 + }, + { + "epoch": 4.560521415270019, + "grad_norm": 1.978028416633606, + "learning_rate": 6.975657163665028e-07, + "loss": 0.8919, + "step": 4898 + }, + { + "epoch": 4.56145251396648, + "grad_norm": 1.8329576253890991, + "learning_rate": 6.967080379653807e-07, + "loss": 0.8684, + "step": 4899 + }, + { + "epoch": 4.562383612662942, + "grad_norm": 1.79604172706604, + "learning_rate": 6.958508017853316e-07, + "loss": 0.8698, + "step": 4900 + }, + { + "epoch": 4.563314711359404, + "grad_norm": 1.8160115480422974, + "learning_rate": 6.949940080365772e-07, + "loss": 0.8956, + "step": 4901 + }, + { + "epoch": 4.564245810055866, + "grad_norm": 1.7716492414474487, + "learning_rate": 6.941376569292282e-07, + "loss": 0.8813, + "step": 4902 + }, + { + "epoch": 4.565176908752328, + "grad_norm": 1.7929052114486694, + "learning_rate": 6.932817486732862e-07, + "loss": 0.91, + "step": 4903 + }, + { + "epoch": 4.56610800744879, + "grad_norm": 1.8840560913085938, + "learning_rate": 6.924262834786469e-07, + "loss": 0.9229, + "step": 4904 + }, + { + "epoch": 4.567039106145251, + "grad_norm": 1.7833038568496704, + "learning_rate": 6.915712615550965e-07, + "loss": 0.8425, + "step": 4905 + }, + { + "epoch": 4.567970204841713, + "grad_norm": 1.8236291408538818, + "learning_rate": 6.9071668311231e-07, + "loss": 0.8854, + "step": 4906 + }, + { + "epoch": 4.568901303538175, + "grad_norm": 1.7866694927215576, + "learning_rate": 6.898625483598556e-07, + "loss": 0.8597, + "step": 4907 + }, + { + "epoch": 4.569832402234637, + "grad_norm": 1.8275761604309082, + "learning_rate": 6.890088575071954e-07, + "loss": 0.877, + "step": 4908 + }, + { + "epoch": 4.570763500931099, + "grad_norm": 1.8281338214874268, + "learning_rate": 6.881556107636772e-07, + "loss": 0.8805, + "step": 4909 + }, + { + "epoch": 4.571694599627561, + "grad_norm": 1.8619472980499268, + "learning_rate": 6.873028083385436e-07, + "loss": 0.8958, + "step": 4910 + }, + { + "epoch": 4.572625698324022, + "grad_norm": 1.7753404378890991, + "learning_rate": 6.864504504409261e-07, + "loss": 0.893, + "step": 4911 + }, + { + "epoch": 4.573556797020484, + "grad_norm": 1.8085682392120361, + "learning_rate": 6.855985372798509e-07, + "loss": 0.8987, + "step": 4912 + }, + { + "epoch": 4.574487895716946, + "grad_norm": 1.8660725355148315, + "learning_rate": 6.847470690642305e-07, + "loss": 0.9169, + "step": 4913 + }, + { + "epoch": 4.575418994413408, + "grad_norm": 1.7359118461608887, + "learning_rate": 6.838960460028702e-07, + "loss": 0.859, + "step": 4914 + }, + { + "epoch": 4.57635009310987, + "grad_norm": 1.8842617273330688, + "learning_rate": 6.830454683044679e-07, + "loss": 0.8829, + "step": 4915 + }, + { + "epoch": 4.577281191806332, + "grad_norm": 1.8600102663040161, + "learning_rate": 6.821953361776093e-07, + "loss": 0.8953, + "step": 4916 + }, + { + "epoch": 4.578212290502793, + "grad_norm": 1.7984110116958618, + "learning_rate": 6.813456498307727e-07, + "loss": 0.8859, + "step": 4917 + }, + { + "epoch": 4.579143389199255, + "grad_norm": 1.7906854152679443, + "learning_rate": 6.804964094723255e-07, + "loss": 0.8822, + "step": 4918 + }, + { + "epoch": 4.5800744878957165, + "grad_norm": 1.783321738243103, + "learning_rate": 6.796476153105294e-07, + "loss": 0.8731, + "step": 4919 + }, + { + "epoch": 4.581005586592179, + "grad_norm": 1.8085970878601074, + "learning_rate": 6.787992675535319e-07, + "loss": 0.9087, + "step": 4920 + }, + { + "epoch": 4.581936685288641, + "grad_norm": 1.7378994226455688, + "learning_rate": 6.779513664093734e-07, + "loss": 0.8417, + "step": 4921 + }, + { + "epoch": 4.582867783985103, + "grad_norm": 1.8187665939331055, + "learning_rate": 6.771039120859854e-07, + "loss": 0.8649, + "step": 4922 + }, + { + "epoch": 4.583798882681564, + "grad_norm": 1.834657073020935, + "learning_rate": 6.762569047911885e-07, + "loss": 0.8497, + "step": 4923 + }, + { + "epoch": 4.584729981378026, + "grad_norm": 1.9246387481689453, + "learning_rate": 6.754103447326943e-07, + "loss": 0.89, + "step": 4924 + }, + { + "epoch": 4.5856610800744875, + "grad_norm": 1.8200368881225586, + "learning_rate": 6.745642321181039e-07, + "loss": 0.8843, + "step": 4925 + }, + { + "epoch": 4.58659217877095, + "grad_norm": 1.8476002216339111, + "learning_rate": 6.737185671549118e-07, + "loss": 0.8985, + "step": 4926 + }, + { + "epoch": 4.587523277467412, + "grad_norm": 1.780079960823059, + "learning_rate": 6.728733500504977e-07, + "loss": 0.8565, + "step": 4927 + }, + { + "epoch": 4.588454376163874, + "grad_norm": 1.7679234743118286, + "learning_rate": 6.720285810121352e-07, + "loss": 0.873, + "step": 4928 + }, + { + "epoch": 4.589385474860335, + "grad_norm": 1.7956304550170898, + "learning_rate": 6.711842602469868e-07, + "loss": 0.8987, + "step": 4929 + }, + { + "epoch": 4.590316573556797, + "grad_norm": 1.7678791284561157, + "learning_rate": 6.703403879621048e-07, + "loss": 0.8717, + "step": 4930 + }, + { + "epoch": 4.5912476722532585, + "grad_norm": 1.7883708477020264, + "learning_rate": 6.694969643644328e-07, + "loss": 0.8851, + "step": 4931 + }, + { + "epoch": 4.592178770949721, + "grad_norm": 1.8250255584716797, + "learning_rate": 6.686539896608019e-07, + "loss": 0.9169, + "step": 4932 + }, + { + "epoch": 4.593109869646183, + "grad_norm": 1.782662034034729, + "learning_rate": 6.678114640579369e-07, + "loss": 0.8511, + "step": 4933 + }, + { + "epoch": 4.594040968342644, + "grad_norm": 1.816391110420227, + "learning_rate": 6.669693877624486e-07, + "loss": 0.8864, + "step": 4934 + }, + { + "epoch": 4.594972067039106, + "grad_norm": 1.89912748336792, + "learning_rate": 6.6612776098084e-07, + "loss": 0.8668, + "step": 4935 + }, + { + "epoch": 4.595903165735568, + "grad_norm": 1.7913681268692017, + "learning_rate": 6.652865839195025e-07, + "loss": 0.8664, + "step": 4936 + }, + { + "epoch": 4.5968342644320295, + "grad_norm": 1.82980477809906, + "learning_rate": 6.644458567847184e-07, + "loss": 0.8985, + "step": 4937 + }, + { + "epoch": 4.597765363128492, + "grad_norm": 1.7817455530166626, + "learning_rate": 6.636055797826588e-07, + "loss": 0.8496, + "step": 4938 + }, + { + "epoch": 4.598696461824954, + "grad_norm": 1.7660399675369263, + "learning_rate": 6.627657531193848e-07, + "loss": 0.864, + "step": 4939 + }, + { + "epoch": 4.599627560521415, + "grad_norm": 1.8422852754592896, + "learning_rate": 6.619263770008472e-07, + "loss": 0.9031, + "step": 4940 + }, + { + "epoch": 4.600558659217877, + "grad_norm": 1.836295247077942, + "learning_rate": 6.610874516328861e-07, + "loss": 0.8532, + "step": 4941 + }, + { + "epoch": 4.601489757914339, + "grad_norm": 1.8881785869598389, + "learning_rate": 6.602489772212307e-07, + "loss": 0.9003, + "step": 4942 + }, + { + "epoch": 4.6024208566108005, + "grad_norm": 1.7739884853363037, + "learning_rate": 6.594109539715002e-07, + "loss": 0.8936, + "step": 4943 + }, + { + "epoch": 4.603351955307263, + "grad_norm": 1.836268663406372, + "learning_rate": 6.585733820892029e-07, + "loss": 0.9345, + "step": 4944 + }, + { + "epoch": 4.604283054003725, + "grad_norm": 1.80372154712677, + "learning_rate": 6.577362617797367e-07, + "loss": 0.8634, + "step": 4945 + }, + { + "epoch": 4.605214152700186, + "grad_norm": 1.8344762325286865, + "learning_rate": 6.568995932483882e-07, + "loss": 0.8956, + "step": 4946 + }, + { + "epoch": 4.606145251396648, + "grad_norm": 1.8161683082580566, + "learning_rate": 6.560633767003336e-07, + "loss": 0.9548, + "step": 4947 + }, + { + "epoch": 4.60707635009311, + "grad_norm": 1.8277242183685303, + "learning_rate": 6.552276123406384e-07, + "loss": 0.8646, + "step": 4948 + }, + { + "epoch": 4.6080074487895715, + "grad_norm": 1.8167372941970825, + "learning_rate": 6.543923003742567e-07, + "loss": 0.9268, + "step": 4949 + }, + { + "epoch": 4.608938547486034, + "grad_norm": 1.7665027379989624, + "learning_rate": 6.535574410060322e-07, + "loss": 0.852, + "step": 4950 + }, + { + "epoch": 4.609869646182496, + "grad_norm": 1.810671329498291, + "learning_rate": 6.527230344406971e-07, + "loss": 0.9031, + "step": 4951 + }, + { + "epoch": 4.610800744878957, + "grad_norm": 1.8120412826538086, + "learning_rate": 6.518890808828729e-07, + "loss": 0.9006, + "step": 4952 + }, + { + "epoch": 4.611731843575419, + "grad_norm": 1.784206509590149, + "learning_rate": 6.510555805370699e-07, + "loss": 0.8823, + "step": 4953 + }, + { + "epoch": 4.61266294227188, + "grad_norm": 1.7191848754882812, + "learning_rate": 6.502225336076878e-07, + "loss": 0.8111, + "step": 4954 + }, + { + "epoch": 4.6135940409683425, + "grad_norm": 1.7657610177993774, + "learning_rate": 6.49389940299014e-07, + "loss": 0.8705, + "step": 4955 + }, + { + "epoch": 4.614525139664805, + "grad_norm": 1.8224600553512573, + "learning_rate": 6.485578008152254e-07, + "loss": 0.8801, + "step": 4956 + }, + { + "epoch": 4.615456238361267, + "grad_norm": 1.8079572916030884, + "learning_rate": 6.477261153603876e-07, + "loss": 0.8799, + "step": 4957 + }, + { + "epoch": 4.616387337057728, + "grad_norm": 1.8472644090652466, + "learning_rate": 6.468948841384545e-07, + "loss": 0.8905, + "step": 4958 + }, + { + "epoch": 4.61731843575419, + "grad_norm": 1.8030836582183838, + "learning_rate": 6.460641073532689e-07, + "loss": 0.8778, + "step": 4959 + }, + { + "epoch": 4.618249534450651, + "grad_norm": 1.778321623802185, + "learning_rate": 6.452337852085622e-07, + "loss": 0.8602, + "step": 4960 + }, + { + "epoch": 4.6191806331471135, + "grad_norm": 1.805877447128296, + "learning_rate": 6.444039179079545e-07, + "loss": 0.8566, + "step": 4961 + }, + { + "epoch": 4.620111731843576, + "grad_norm": 1.8114385604858398, + "learning_rate": 6.435745056549533e-07, + "loss": 0.9083, + "step": 4962 + }, + { + "epoch": 4.621042830540038, + "grad_norm": 1.7827532291412354, + "learning_rate": 6.427455486529557e-07, + "loss": 0.8419, + "step": 4963 + }, + { + "epoch": 4.621973929236499, + "grad_norm": 1.7670725584030151, + "learning_rate": 6.419170471052472e-07, + "loss": 0.8582, + "step": 4964 + }, + { + "epoch": 4.622905027932961, + "grad_norm": 1.845465064048767, + "learning_rate": 6.41089001215e-07, + "loss": 0.8815, + "step": 4965 + }, + { + "epoch": 4.623836126629422, + "grad_norm": 1.8419194221496582, + "learning_rate": 6.402614111852767e-07, + "loss": 0.8776, + "step": 4966 + }, + { + "epoch": 4.6247672253258845, + "grad_norm": 1.7398557662963867, + "learning_rate": 6.394342772190276e-07, + "loss": 0.8209, + "step": 4967 + }, + { + "epoch": 4.625698324022347, + "grad_norm": 1.793521523475647, + "learning_rate": 6.386075995190882e-07, + "loss": 0.8982, + "step": 4968 + }, + { + "epoch": 4.626629422718808, + "grad_norm": 1.8188248872756958, + "learning_rate": 6.37781378288187e-07, + "loss": 0.9188, + "step": 4969 + }, + { + "epoch": 4.62756052141527, + "grad_norm": 1.8143765926361084, + "learning_rate": 6.369556137289373e-07, + "loss": 0.8738, + "step": 4970 + }, + { + "epoch": 4.628491620111732, + "grad_norm": 1.8234052658081055, + "learning_rate": 6.361303060438412e-07, + "loss": 0.9132, + "step": 4971 + }, + { + "epoch": 4.629422718808193, + "grad_norm": 1.827946662902832, + "learning_rate": 6.353054554352889e-07, + "loss": 0.905, + "step": 4972 + }, + { + "epoch": 4.6303538175046555, + "grad_norm": 1.8162509202957153, + "learning_rate": 6.344810621055583e-07, + "loss": 0.9115, + "step": 4973 + }, + { + "epoch": 4.631284916201118, + "grad_norm": 1.737370252609253, + "learning_rate": 6.336571262568164e-07, + "loss": 0.8352, + "step": 4974 + }, + { + "epoch": 4.632216014897579, + "grad_norm": 1.7803428173065186, + "learning_rate": 6.328336480911143e-07, + "loss": 0.8753, + "step": 4975 + }, + { + "epoch": 4.633147113594041, + "grad_norm": 1.8970229625701904, + "learning_rate": 6.320106278103954e-07, + "loss": 0.9031, + "step": 4976 + }, + { + "epoch": 4.634078212290503, + "grad_norm": 1.839725136756897, + "learning_rate": 6.311880656164895e-07, + "loss": 0.8566, + "step": 4977 + }, + { + "epoch": 4.635009310986964, + "grad_norm": 1.8428024053573608, + "learning_rate": 6.303659617111119e-07, + "loss": 0.9007, + "step": 4978 + }, + { + "epoch": 4.6359404096834265, + "grad_norm": 1.8503795862197876, + "learning_rate": 6.295443162958664e-07, + "loss": 0.8852, + "step": 4979 + }, + { + "epoch": 4.636871508379889, + "grad_norm": 1.8668088912963867, + "learning_rate": 6.28723129572247e-07, + "loss": 0.888, + "step": 4980 + }, + { + "epoch": 4.63780260707635, + "grad_norm": 1.797541856765747, + "learning_rate": 6.279024017416332e-07, + "loss": 0.8759, + "step": 4981 + }, + { + "epoch": 4.638733705772812, + "grad_norm": 1.7873948812484741, + "learning_rate": 6.270821330052898e-07, + "loss": 0.8961, + "step": 4982 + }, + { + "epoch": 4.639664804469274, + "grad_norm": 1.7875641584396362, + "learning_rate": 6.262623235643733e-07, + "loss": 0.8917, + "step": 4983 + }, + { + "epoch": 4.640595903165735, + "grad_norm": 1.7803682088851929, + "learning_rate": 6.254429736199255e-07, + "loss": 0.8832, + "step": 4984 + }, + { + "epoch": 4.6415270018621975, + "grad_norm": 1.8096002340316772, + "learning_rate": 6.246240833728737e-07, + "loss": 0.8582, + "step": 4985 + }, + { + "epoch": 4.64245810055866, + "grad_norm": 1.7872393131256104, + "learning_rate": 6.238056530240349e-07, + "loss": 0.8528, + "step": 4986 + }, + { + "epoch": 4.643389199255121, + "grad_norm": 1.8547927141189575, + "learning_rate": 6.229876827741136e-07, + "loss": 0.9333, + "step": 4987 + }, + { + "epoch": 4.644320297951583, + "grad_norm": 1.8264391422271729, + "learning_rate": 6.221701728237008e-07, + "loss": 0.8928, + "step": 4988 + }, + { + "epoch": 4.645251396648044, + "grad_norm": 1.8246098756790161, + "learning_rate": 6.213531233732723e-07, + "loss": 0.9007, + "step": 4989 + }, + { + "epoch": 4.646182495344506, + "grad_norm": 1.9142588376998901, + "learning_rate": 6.205365346231947e-07, + "loss": 0.91, + "step": 4990 + }, + { + "epoch": 4.6471135940409685, + "grad_norm": 1.8723204135894775, + "learning_rate": 6.197204067737205e-07, + "loss": 0.8795, + "step": 4991 + }, + { + "epoch": 4.648044692737431, + "grad_norm": 1.822513461112976, + "learning_rate": 6.189047400249873e-07, + "loss": 0.8977, + "step": 4992 + }, + { + "epoch": 4.648975791433892, + "grad_norm": 1.8235722780227661, + "learning_rate": 6.180895345770202e-07, + "loss": 0.8907, + "step": 4993 + }, + { + "epoch": 4.649906890130354, + "grad_norm": 1.8615317344665527, + "learning_rate": 6.172747906297341e-07, + "loss": 0.9209, + "step": 4994 + }, + { + "epoch": 4.650837988826815, + "grad_norm": 1.830812692642212, + "learning_rate": 6.164605083829284e-07, + "loss": 0.8782, + "step": 4995 + }, + { + "epoch": 4.651769087523277, + "grad_norm": 1.7503819465637207, + "learning_rate": 6.156466880362877e-07, + "loss": 0.8936, + "step": 4996 + }, + { + "epoch": 4.6527001862197395, + "grad_norm": 1.809609293937683, + "learning_rate": 6.148333297893852e-07, + "loss": 0.9126, + "step": 4997 + }, + { + "epoch": 4.653631284916202, + "grad_norm": 1.8414077758789062, + "learning_rate": 6.140204338416831e-07, + "loss": 0.9044, + "step": 4998 + }, + { + "epoch": 4.654562383612663, + "grad_norm": 1.8051362037658691, + "learning_rate": 6.132080003925254e-07, + "loss": 0.8612, + "step": 4999 + }, + { + "epoch": 4.655493482309125, + "grad_norm": 1.7567815780639648, + "learning_rate": 6.123960296411449e-07, + "loss": 0.8422, + "step": 5000 + }, + { + "epoch": 4.656424581005586, + "grad_norm": 1.8074955940246582, + "learning_rate": 6.115845217866625e-07, + "loss": 0.8647, + "step": 5001 + }, + { + "epoch": 4.657355679702048, + "grad_norm": 1.8724664449691772, + "learning_rate": 6.107734770280848e-07, + "loss": 0.877, + "step": 5002 + }, + { + "epoch": 4.6582867783985105, + "grad_norm": 1.8518455028533936, + "learning_rate": 6.09962895564302e-07, + "loss": 0.9245, + "step": 5003 + }, + { + "epoch": 4.659217877094972, + "grad_norm": 1.7816811800003052, + "learning_rate": 6.091527775940934e-07, + "loss": 0.8788, + "step": 5004 + }, + { + "epoch": 4.660148975791434, + "grad_norm": 1.8519641160964966, + "learning_rate": 6.08343123316126e-07, + "loss": 0.9079, + "step": 5005 + }, + { + "epoch": 4.661080074487896, + "grad_norm": 1.7912018299102783, + "learning_rate": 6.075339329289492e-07, + "loss": 0.8926, + "step": 5006 + }, + { + "epoch": 4.662011173184357, + "grad_norm": 1.7806576490402222, + "learning_rate": 6.067252066310014e-07, + "loss": 0.8449, + "step": 5007 + }, + { + "epoch": 4.662942271880819, + "grad_norm": 1.8023836612701416, + "learning_rate": 6.059169446206065e-07, + "loss": 0.9112, + "step": 5008 + }, + { + "epoch": 4.6638733705772815, + "grad_norm": 1.80609130859375, + "learning_rate": 6.051091470959744e-07, + "loss": 0.8632, + "step": 5009 + }, + { + "epoch": 4.664804469273743, + "grad_norm": 1.8198128938674927, + "learning_rate": 6.04301814255201e-07, + "loss": 0.8814, + "step": 5010 + }, + { + "epoch": 4.665735567970205, + "grad_norm": 1.8338043689727783, + "learning_rate": 6.03494946296268e-07, + "loss": 0.9378, + "step": 5011 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 1.7799285650253296, + "learning_rate": 6.026885434170457e-07, + "loss": 0.8822, + "step": 5012 + }, + { + "epoch": 4.667597765363128, + "grad_norm": 1.8207662105560303, + "learning_rate": 6.018826058152861e-07, + "loss": 0.9013, + "step": 5013 + }, + { + "epoch": 4.66852886405959, + "grad_norm": 1.8377959728240967, + "learning_rate": 6.010771336886292e-07, + "loss": 0.9182, + "step": 5014 + }, + { + "epoch": 4.6694599627560525, + "grad_norm": 1.7941144704818726, + "learning_rate": 6.002721272346019e-07, + "loss": 0.9061, + "step": 5015 + }, + { + "epoch": 4.670391061452514, + "grad_norm": 1.8215397596359253, + "learning_rate": 5.99467586650615e-07, + "loss": 0.8493, + "step": 5016 + }, + { + "epoch": 4.671322160148976, + "grad_norm": 1.8245929479599, + "learning_rate": 5.986635121339665e-07, + "loss": 0.9132, + "step": 5017 + }, + { + "epoch": 4.672253258845438, + "grad_norm": 1.7518024444580078, + "learning_rate": 5.978599038818381e-07, + "loss": 0.8606, + "step": 5018 + }, + { + "epoch": 4.673184357541899, + "grad_norm": 1.8537676334381104, + "learning_rate": 5.970567620913015e-07, + "loss": 0.8844, + "step": 5019 + }, + { + "epoch": 4.674115456238361, + "grad_norm": 1.8300917148590088, + "learning_rate": 5.962540869593081e-07, + "loss": 0.8584, + "step": 5020 + }, + { + "epoch": 4.6750465549348235, + "grad_norm": 1.734847068786621, + "learning_rate": 5.954518786826993e-07, + "loss": 0.8798, + "step": 5021 + }, + { + "epoch": 4.675977653631285, + "grad_norm": 1.853626012802124, + "learning_rate": 5.946501374582e-07, + "loss": 0.9203, + "step": 5022 + }, + { + "epoch": 4.676908752327747, + "grad_norm": 1.808036208152771, + "learning_rate": 5.938488634824213e-07, + "loss": 0.8492, + "step": 5023 + }, + { + "epoch": 4.677839851024208, + "grad_norm": 1.8057547807693481, + "learning_rate": 5.930480569518595e-07, + "loss": 0.9076, + "step": 5024 + }, + { + "epoch": 4.67877094972067, + "grad_norm": 1.75794517993927, + "learning_rate": 5.922477180628963e-07, + "loss": 0.8761, + "step": 5025 + }, + { + "epoch": 4.679702048417132, + "grad_norm": 1.831925630569458, + "learning_rate": 5.914478470117991e-07, + "loss": 0.8668, + "step": 5026 + }, + { + "epoch": 4.6806331471135945, + "grad_norm": 1.9117985963821411, + "learning_rate": 5.906484439947194e-07, + "loss": 0.8772, + "step": 5027 + }, + { + "epoch": 4.681564245810056, + "grad_norm": 1.7736573219299316, + "learning_rate": 5.89849509207695e-07, + "loss": 0.8669, + "step": 5028 + }, + { + "epoch": 4.682495344506518, + "grad_norm": 1.808447003364563, + "learning_rate": 5.890510428466489e-07, + "loss": 0.8471, + "step": 5029 + }, + { + "epoch": 4.683426443202979, + "grad_norm": 1.8561689853668213, + "learning_rate": 5.882530451073887e-07, + "loss": 0.8973, + "step": 5030 + }, + { + "epoch": 4.684357541899441, + "grad_norm": 1.775284767150879, + "learning_rate": 5.874555161856075e-07, + "loss": 0.854, + "step": 5031 + }, + { + "epoch": 4.685288640595903, + "grad_norm": 1.8542137145996094, + "learning_rate": 5.866584562768826e-07, + "loss": 0.8948, + "step": 5032 + }, + { + "epoch": 4.6862197392923655, + "grad_norm": 1.8357772827148438, + "learning_rate": 5.858618655766776e-07, + "loss": 0.8865, + "step": 5033 + }, + { + "epoch": 4.687150837988827, + "grad_norm": 1.835835576057434, + "learning_rate": 5.850657442803398e-07, + "loss": 0.907, + "step": 5034 + }, + { + "epoch": 4.688081936685289, + "grad_norm": 1.7783231735229492, + "learning_rate": 5.842700925831025e-07, + "loss": 0.86, + "step": 5035 + }, + { + "epoch": 4.68901303538175, + "grad_norm": 1.8519706726074219, + "learning_rate": 5.834749106800827e-07, + "loss": 0.8887, + "step": 5036 + }, + { + "epoch": 4.689944134078212, + "grad_norm": 1.800399661064148, + "learning_rate": 5.826801987662834e-07, + "loss": 0.8854, + "step": 5037 + }, + { + "epoch": 4.690875232774674, + "grad_norm": 1.8473293781280518, + "learning_rate": 5.818859570365908e-07, + "loss": 0.9353, + "step": 5038 + }, + { + "epoch": 4.691806331471136, + "grad_norm": 2.219167470932007, + "learning_rate": 5.81092185685778e-07, + "loss": 0.8923, + "step": 5039 + }, + { + "epoch": 4.692737430167598, + "grad_norm": 1.910131573677063, + "learning_rate": 5.802988849085001e-07, + "loss": 0.92, + "step": 5040 + }, + { + "epoch": 4.69366852886406, + "grad_norm": 1.7853119373321533, + "learning_rate": 5.79506054899299e-07, + "loss": 0.8777, + "step": 5041 + }, + { + "epoch": 4.694599627560521, + "grad_norm": 1.8372584581375122, + "learning_rate": 5.787136958526e-07, + "loss": 0.8998, + "step": 5042 + }, + { + "epoch": 4.695530726256983, + "grad_norm": 1.8630038499832153, + "learning_rate": 5.77921807962713e-07, + "loss": 0.8885, + "step": 5043 + }, + { + "epoch": 4.696461824953445, + "grad_norm": 1.791494369506836, + "learning_rate": 5.771303914238332e-07, + "loss": 0.8709, + "step": 5044 + }, + { + "epoch": 4.697392923649907, + "grad_norm": 1.7751961946487427, + "learning_rate": 5.763394464300393e-07, + "loss": 0.8868, + "step": 5045 + }, + { + "epoch": 4.698324022346369, + "grad_norm": 2.1380443572998047, + "learning_rate": 5.75548973175295e-07, + "loss": 0.8836, + "step": 5046 + }, + { + "epoch": 4.699255121042831, + "grad_norm": 1.9395332336425781, + "learning_rate": 5.747589718534463e-07, + "loss": 0.8848, + "step": 5047 + }, + { + "epoch": 4.700186219739292, + "grad_norm": 1.8002208471298218, + "learning_rate": 5.739694426582271e-07, + "loss": 0.8729, + "step": 5048 + }, + { + "epoch": 4.701117318435754, + "grad_norm": 1.77738618850708, + "learning_rate": 5.731803857832527e-07, + "loss": 0.8924, + "step": 5049 + }, + { + "epoch": 4.702048417132216, + "grad_norm": 1.7977443933486938, + "learning_rate": 5.723918014220237e-07, + "loss": 0.894, + "step": 5050 + }, + { + "epoch": 4.702979515828678, + "grad_norm": 1.7824580669403076, + "learning_rate": 5.716036897679242e-07, + "loss": 0.8583, + "step": 5051 + }, + { + "epoch": 4.70391061452514, + "grad_norm": 1.7997050285339355, + "learning_rate": 5.70816051014223e-07, + "loss": 0.8841, + "step": 5052 + }, + { + "epoch": 4.704841713221602, + "grad_norm": 1.8803186416625977, + "learning_rate": 5.700288853540733e-07, + "loss": 0.8773, + "step": 5053 + }, + { + "epoch": 4.705772811918063, + "grad_norm": 1.8349497318267822, + "learning_rate": 5.692421929805097e-07, + "loss": 0.9195, + "step": 5054 + }, + { + "epoch": 4.706703910614525, + "grad_norm": 1.9127777814865112, + "learning_rate": 5.684559740864545e-07, + "loss": 0.9291, + "step": 5055 + }, + { + "epoch": 4.707635009310987, + "grad_norm": 1.8481069803237915, + "learning_rate": 5.676702288647116e-07, + "loss": 0.8967, + "step": 5056 + }, + { + "epoch": 4.708566108007449, + "grad_norm": 1.829714059829712, + "learning_rate": 5.668849575079688e-07, + "loss": 0.8895, + "step": 5057 + }, + { + "epoch": 4.709497206703911, + "grad_norm": 1.8021724224090576, + "learning_rate": 5.661001602087984e-07, + "loss": 0.8892, + "step": 5058 + }, + { + "epoch": 4.710428305400372, + "grad_norm": 1.8210375308990479, + "learning_rate": 5.653158371596563e-07, + "loss": 0.9072, + "step": 5059 + }, + { + "epoch": 4.711359404096834, + "grad_norm": 1.805121898651123, + "learning_rate": 5.645319885528824e-07, + "loss": 0.8909, + "step": 5060 + }, + { + "epoch": 4.712290502793296, + "grad_norm": 1.8106958866119385, + "learning_rate": 5.637486145806978e-07, + "loss": 0.8825, + "step": 5061 + }, + { + "epoch": 4.713221601489758, + "grad_norm": 1.8526642322540283, + "learning_rate": 5.629657154352111e-07, + "loss": 0.9452, + "step": 5062 + }, + { + "epoch": 4.71415270018622, + "grad_norm": 1.811888575553894, + "learning_rate": 5.621832913084122e-07, + "loss": 0.8579, + "step": 5063 + }, + { + "epoch": 4.715083798882682, + "grad_norm": 1.9366356134414673, + "learning_rate": 5.614013423921754e-07, + "loss": 0.8734, + "step": 5064 + }, + { + "epoch": 4.716014897579143, + "grad_norm": 1.8188077211380005, + "learning_rate": 5.60619868878256e-07, + "loss": 0.896, + "step": 5065 + }, + { + "epoch": 4.716945996275605, + "grad_norm": 1.7806713581085205, + "learning_rate": 5.598388709582963e-07, + "loss": 0.8639, + "step": 5066 + }, + { + "epoch": 4.717877094972067, + "grad_norm": 1.8256219625473022, + "learning_rate": 5.59058348823821e-07, + "loss": 0.855, + "step": 5067 + }, + { + "epoch": 4.718808193668529, + "grad_norm": 1.81294846534729, + "learning_rate": 5.58278302666235e-07, + "loss": 0.8855, + "step": 5068 + }, + { + "epoch": 4.719739292364991, + "grad_norm": 1.805155634880066, + "learning_rate": 5.574987326768308e-07, + "loss": 0.875, + "step": 5069 + }, + { + "epoch": 4.720670391061453, + "grad_norm": 1.7777005434036255, + "learning_rate": 5.567196390467819e-07, + "loss": 0.8961, + "step": 5070 + }, + { + "epoch": 4.721601489757914, + "grad_norm": 1.7865101099014282, + "learning_rate": 5.559410219671457e-07, + "loss": 0.9106, + "step": 5071 + }, + { + "epoch": 4.722532588454376, + "grad_norm": 1.809126853942871, + "learning_rate": 5.551628816288607e-07, + "loss": 0.8782, + "step": 5072 + }, + { + "epoch": 4.723463687150838, + "grad_norm": 1.822241187095642, + "learning_rate": 5.543852182227522e-07, + "loss": 0.8624, + "step": 5073 + }, + { + "epoch": 4.724394785847299, + "grad_norm": 1.7868822813034058, + "learning_rate": 5.53608031939526e-07, + "loss": 0.8474, + "step": 5074 + }, + { + "epoch": 4.725325884543762, + "grad_norm": 1.8801569938659668, + "learning_rate": 5.528313229697704e-07, + "loss": 0.8705, + "step": 5075 + }, + { + "epoch": 4.726256983240224, + "grad_norm": 1.7596426010131836, + "learning_rate": 5.520550915039579e-07, + "loss": 0.8802, + "step": 5076 + }, + { + "epoch": 4.727188081936685, + "grad_norm": 1.8773366212844849, + "learning_rate": 5.512793377324452e-07, + "loss": 0.8971, + "step": 5077 + }, + { + "epoch": 4.728119180633147, + "grad_norm": 1.857252597808838, + "learning_rate": 5.505040618454688e-07, + "loss": 0.9377, + "step": 5078 + }, + { + "epoch": 4.729050279329609, + "grad_norm": 1.8370282649993896, + "learning_rate": 5.497292640331489e-07, + "loss": 0.9096, + "step": 5079 + }, + { + "epoch": 4.72998137802607, + "grad_norm": 1.825424075126648, + "learning_rate": 5.489549444854908e-07, + "loss": 0.9136, + "step": 5080 + }, + { + "epoch": 4.730912476722533, + "grad_norm": 1.8377336263656616, + "learning_rate": 5.481811033923807e-07, + "loss": 0.9198, + "step": 5081 + }, + { + "epoch": 4.731843575418995, + "grad_norm": 1.8035860061645508, + "learning_rate": 5.474077409435863e-07, + "loss": 0.8737, + "step": 5082 + }, + { + "epoch": 4.732774674115456, + "grad_norm": 1.7601449489593506, + "learning_rate": 5.466348573287589e-07, + "loss": 0.8725, + "step": 5083 + }, + { + "epoch": 4.733705772811918, + "grad_norm": 1.8170442581176758, + "learning_rate": 5.458624527374351e-07, + "loss": 0.9252, + "step": 5084 + }, + { + "epoch": 4.73463687150838, + "grad_norm": 1.8412610292434692, + "learning_rate": 5.450905273590293e-07, + "loss": 0.8494, + "step": 5085 + }, + { + "epoch": 4.735567970204841, + "grad_norm": 1.8355096578598022, + "learning_rate": 5.443190813828406e-07, + "loss": 0.9018, + "step": 5086 + }, + { + "epoch": 4.736499068901304, + "grad_norm": 1.831904411315918, + "learning_rate": 5.435481149980524e-07, + "loss": 0.8654, + "step": 5087 + }, + { + "epoch": 4.737430167597766, + "grad_norm": 1.8129584789276123, + "learning_rate": 5.427776283937281e-07, + "loss": 0.8547, + "step": 5088 + }, + { + "epoch": 4.738361266294227, + "grad_norm": 1.8119863271713257, + "learning_rate": 5.42007621758813e-07, + "loss": 0.8907, + "step": 5089 + }, + { + "epoch": 4.739292364990689, + "grad_norm": 1.7668596506118774, + "learning_rate": 5.412380952821358e-07, + "loss": 0.8635, + "step": 5090 + }, + { + "epoch": 4.740223463687151, + "grad_norm": 1.8402607440948486, + "learning_rate": 5.404690491524092e-07, + "loss": 0.9295, + "step": 5091 + }, + { + "epoch": 4.741154562383612, + "grad_norm": 1.9112495183944702, + "learning_rate": 5.397004835582242e-07, + "loss": 0.9106, + "step": 5092 + }, + { + "epoch": 4.742085661080075, + "grad_norm": 1.8426498174667358, + "learning_rate": 5.389323986880574e-07, + "loss": 0.8685, + "step": 5093 + }, + { + "epoch": 4.743016759776537, + "grad_norm": 1.7979059219360352, + "learning_rate": 5.381647947302646e-07, + "loss": 0.8746, + "step": 5094 + }, + { + "epoch": 4.743947858472998, + "grad_norm": 1.7704663276672363, + "learning_rate": 5.373976718730877e-07, + "loss": 0.8686, + "step": 5095 + }, + { + "epoch": 4.74487895716946, + "grad_norm": 1.8759069442749023, + "learning_rate": 5.36631030304646e-07, + "loss": 0.8964, + "step": 5096 + }, + { + "epoch": 4.745810055865922, + "grad_norm": 1.7853178977966309, + "learning_rate": 5.358648702129432e-07, + "loss": 0.8733, + "step": 5097 + }, + { + "epoch": 4.746741154562383, + "grad_norm": 1.8158758878707886, + "learning_rate": 5.350991917858662e-07, + "loss": 0.8555, + "step": 5098 + }, + { + "epoch": 4.747672253258846, + "grad_norm": 1.842551350593567, + "learning_rate": 5.343339952111806e-07, + "loss": 0.8879, + "step": 5099 + }, + { + "epoch": 4.748603351955307, + "grad_norm": 1.7902145385742188, + "learning_rate": 5.33569280676536e-07, + "loss": 0.8717, + "step": 5100 + }, + { + "epoch": 4.749534450651769, + "grad_norm": 1.83100163936615, + "learning_rate": 5.328050483694624e-07, + "loss": 0.8652, + "step": 5101 + }, + { + "epoch": 4.750465549348231, + "grad_norm": 1.811651587486267, + "learning_rate": 5.320412984773749e-07, + "loss": 0.8949, + "step": 5102 + }, + { + "epoch": 4.751396648044693, + "grad_norm": 1.7863266468048096, + "learning_rate": 5.312780311875654e-07, + "loss": 0.8956, + "step": 5103 + }, + { + "epoch": 4.752327746741154, + "grad_norm": 1.797908067703247, + "learning_rate": 5.305152466872104e-07, + "loss": 0.8698, + "step": 5104 + }, + { + "epoch": 4.753258845437617, + "grad_norm": 1.8201910257339478, + "learning_rate": 5.297529451633679e-07, + "loss": 0.9215, + "step": 5105 + }, + { + "epoch": 4.754189944134078, + "grad_norm": 1.8083101511001587, + "learning_rate": 5.289911268029766e-07, + "loss": 0.8684, + "step": 5106 + }, + { + "epoch": 4.75512104283054, + "grad_norm": 1.890697717666626, + "learning_rate": 5.282297917928572e-07, + "loss": 0.9042, + "step": 5107 + }, + { + "epoch": 4.756052141527002, + "grad_norm": 1.8234819173812866, + "learning_rate": 5.27468940319712e-07, + "loss": 0.882, + "step": 5108 + }, + { + "epoch": 4.756983240223463, + "grad_norm": 1.8932137489318848, + "learning_rate": 5.267085725701243e-07, + "loss": 0.886, + "step": 5109 + }, + { + "epoch": 4.757914338919925, + "grad_norm": 1.7871971130371094, + "learning_rate": 5.259486887305593e-07, + "loss": 0.9042, + "step": 5110 + }, + { + "epoch": 4.758845437616388, + "grad_norm": 1.7817273139953613, + "learning_rate": 5.251892889873628e-07, + "loss": 0.886, + "step": 5111 + }, + { + "epoch": 4.759776536312849, + "grad_norm": 1.8653682470321655, + "learning_rate": 5.244303735267626e-07, + "loss": 0.8945, + "step": 5112 + }, + { + "epoch": 4.760707635009311, + "grad_norm": 1.8966317176818848, + "learning_rate": 5.236719425348675e-07, + "loss": 0.9041, + "step": 5113 + }, + { + "epoch": 4.761638733705773, + "grad_norm": 1.8076449632644653, + "learning_rate": 5.229139961976671e-07, + "loss": 0.8954, + "step": 5114 + }, + { + "epoch": 4.762569832402234, + "grad_norm": 1.7871696949005127, + "learning_rate": 5.221565347010327e-07, + "loss": 0.873, + "step": 5115 + }, + { + "epoch": 4.763500931098696, + "grad_norm": 1.797074317932129, + "learning_rate": 5.213995582307166e-07, + "loss": 0.8358, + "step": 5116 + }, + { + "epoch": 4.764432029795159, + "grad_norm": 1.755550742149353, + "learning_rate": 5.20643066972352e-07, + "loss": 0.8956, + "step": 5117 + }, + { + "epoch": 4.76536312849162, + "grad_norm": 1.7301123142242432, + "learning_rate": 5.198870611114529e-07, + "loss": 0.8505, + "step": 5118 + }, + { + "epoch": 4.766294227188082, + "grad_norm": 1.7457224130630493, + "learning_rate": 5.19131540833415e-07, + "loss": 0.8836, + "step": 5119 + }, + { + "epoch": 4.767225325884544, + "grad_norm": 1.7832577228546143, + "learning_rate": 5.183765063235138e-07, + "loss": 0.8714, + "step": 5120 + }, + { + "epoch": 4.768156424581005, + "grad_norm": 1.8090815544128418, + "learning_rate": 5.17621957766907e-07, + "loss": 0.8909, + "step": 5121 + }, + { + "epoch": 4.769087523277467, + "grad_norm": 1.728471040725708, + "learning_rate": 5.168678953486323e-07, + "loss": 0.9122, + "step": 5122 + }, + { + "epoch": 4.77001862197393, + "grad_norm": 1.7956172227859497, + "learning_rate": 5.161143192536078e-07, + "loss": 0.8692, + "step": 5123 + }, + { + "epoch": 4.770949720670391, + "grad_norm": 1.7532689571380615, + "learning_rate": 5.153612296666335e-07, + "loss": 0.8777, + "step": 5124 + }, + { + "epoch": 4.771880819366853, + "grad_norm": 1.829037070274353, + "learning_rate": 5.146086267723891e-07, + "loss": 0.8995, + "step": 5125 + }, + { + "epoch": 4.772811918063315, + "grad_norm": 1.853463053703308, + "learning_rate": 5.138565107554355e-07, + "loss": 0.8985, + "step": 5126 + }, + { + "epoch": 4.773743016759776, + "grad_norm": 1.8086717128753662, + "learning_rate": 5.131048818002141e-07, + "loss": 0.8772, + "step": 5127 + }, + { + "epoch": 4.774674115456238, + "grad_norm": 1.7624129056930542, + "learning_rate": 5.123537400910467e-07, + "loss": 0.8401, + "step": 5128 + }, + { + "epoch": 4.775605214152701, + "grad_norm": 1.717605710029602, + "learning_rate": 5.116030858121354e-07, + "loss": 0.866, + "step": 5129 + }, + { + "epoch": 4.776536312849162, + "grad_norm": 1.7506386041641235, + "learning_rate": 5.108529191475637e-07, + "loss": 0.8824, + "step": 5130 + }, + { + "epoch": 4.777467411545624, + "grad_norm": 1.8472968339920044, + "learning_rate": 5.101032402812941e-07, + "loss": 0.8959, + "step": 5131 + }, + { + "epoch": 4.778398510242086, + "grad_norm": 1.8495606184005737, + "learning_rate": 5.093540493971708e-07, + "loss": 0.8856, + "step": 5132 + }, + { + "epoch": 4.779329608938547, + "grad_norm": 1.8525441884994507, + "learning_rate": 5.086053466789176e-07, + "loss": 0.9129, + "step": 5133 + }, + { + "epoch": 4.780260707635009, + "grad_norm": 1.845147728919983, + "learning_rate": 5.078571323101389e-07, + "loss": 0.8693, + "step": 5134 + }, + { + "epoch": 4.781191806331471, + "grad_norm": 1.8429573774337769, + "learning_rate": 5.071094064743193e-07, + "loss": 0.9145, + "step": 5135 + }, + { + "epoch": 4.782122905027933, + "grad_norm": 1.8528730869293213, + "learning_rate": 5.063621693548229e-07, + "loss": 0.8947, + "step": 5136 + }, + { + "epoch": 4.783054003724395, + "grad_norm": 1.8296526670455933, + "learning_rate": 5.056154211348954e-07, + "loss": 0.8711, + "step": 5137 + }, + { + "epoch": 4.783985102420857, + "grad_norm": 1.8636882305145264, + "learning_rate": 5.048691619976614e-07, + "loss": 0.9184, + "step": 5138 + }, + { + "epoch": 4.784916201117318, + "grad_norm": 1.8199172019958496, + "learning_rate": 5.041233921261265e-07, + "loss": 0.9192, + "step": 5139 + }, + { + "epoch": 4.78584729981378, + "grad_norm": 1.836961030960083, + "learning_rate": 5.033781117031739e-07, + "loss": 0.8819, + "step": 5140 + }, + { + "epoch": 4.786778398510242, + "grad_norm": 1.8954859972000122, + "learning_rate": 5.026333209115705e-07, + "loss": 0.9057, + "step": 5141 + }, + { + "epoch": 4.787709497206704, + "grad_norm": 1.8213351964950562, + "learning_rate": 5.01889019933961e-07, + "loss": 0.8714, + "step": 5142 + }, + { + "epoch": 4.788640595903166, + "grad_norm": 1.7947514057159424, + "learning_rate": 5.011452089528704e-07, + "loss": 0.8881, + "step": 5143 + }, + { + "epoch": 4.789571694599628, + "grad_norm": 1.8564882278442383, + "learning_rate": 5.004018881507016e-07, + "loss": 0.8557, + "step": 5144 + }, + { + "epoch": 4.790502793296089, + "grad_norm": 1.7946056127548218, + "learning_rate": 4.99659057709741e-07, + "loss": 0.8766, + "step": 5145 + }, + { + "epoch": 4.791433891992551, + "grad_norm": 1.793413519859314, + "learning_rate": 4.989167178121528e-07, + "loss": 0.8482, + "step": 5146 + }, + { + "epoch": 4.792364990689013, + "grad_norm": 1.864393711090088, + "learning_rate": 4.981748686399793e-07, + "loss": 0.8803, + "step": 5147 + }, + { + "epoch": 4.793296089385475, + "grad_norm": 1.8586562871932983, + "learning_rate": 4.974335103751454e-07, + "loss": 0.921, + "step": 5148 + }, + { + "epoch": 4.794227188081937, + "grad_norm": 1.8611735105514526, + "learning_rate": 4.966926431994543e-07, + "loss": 0.8682, + "step": 5149 + }, + { + "epoch": 4.795158286778398, + "grad_norm": 1.8674283027648926, + "learning_rate": 4.95952267294589e-07, + "loss": 0.9299, + "step": 5150 + }, + { + "epoch": 4.79608938547486, + "grad_norm": 1.898272156715393, + "learning_rate": 4.952123828421102e-07, + "loss": 0.9231, + "step": 5151 + }, + { + "epoch": 4.797020484171322, + "grad_norm": 1.8655104637145996, + "learning_rate": 4.94472990023461e-07, + "loss": 0.8966, + "step": 5152 + }, + { + "epoch": 4.797951582867784, + "grad_norm": 1.762320637702942, + "learning_rate": 4.937340890199632e-07, + "loss": 0.8999, + "step": 5153 + }, + { + "epoch": 4.798882681564246, + "grad_norm": 1.8819562196731567, + "learning_rate": 4.929956800128155e-07, + "loss": 0.9176, + "step": 5154 + }, + { + "epoch": 4.799813780260708, + "grad_norm": 1.8319530487060547, + "learning_rate": 4.922577631830993e-07, + "loss": 0.9234, + "step": 5155 + }, + { + "epoch": 4.800744878957169, + "grad_norm": 1.8255839347839355, + "learning_rate": 4.915203387117737e-07, + "loss": 0.8814, + "step": 5156 + }, + { + "epoch": 4.801675977653631, + "grad_norm": 1.7960742712020874, + "learning_rate": 4.907834067796774e-07, + "loss": 0.8619, + "step": 5157 + }, + { + "epoch": 4.802607076350093, + "grad_norm": 1.8356237411499023, + "learning_rate": 4.900469675675266e-07, + "loss": 0.925, + "step": 5158 + }, + { + "epoch": 4.803538175046555, + "grad_norm": 1.7555537223815918, + "learning_rate": 4.893110212559199e-07, + "loss": 0.8518, + "step": 5159 + }, + { + "epoch": 4.804469273743017, + "grad_norm": 1.7940551042556763, + "learning_rate": 4.885755680253334e-07, + "loss": 0.881, + "step": 5160 + }, + { + "epoch": 4.805400372439479, + "grad_norm": 1.7643146514892578, + "learning_rate": 4.87840608056121e-07, + "loss": 0.8579, + "step": 5161 + }, + { + "epoch": 4.80633147113594, + "grad_norm": 1.7674238681793213, + "learning_rate": 4.871061415285167e-07, + "loss": 0.8317, + "step": 5162 + }, + { + "epoch": 4.807262569832402, + "grad_norm": 1.825547456741333, + "learning_rate": 4.86372168622635e-07, + "loss": 0.9031, + "step": 5163 + }, + { + "epoch": 4.808193668528864, + "grad_norm": 1.8205349445343018, + "learning_rate": 4.856386895184681e-07, + "loss": 0.9006, + "step": 5164 + }, + { + "epoch": 4.809124767225326, + "grad_norm": 1.867270588874817, + "learning_rate": 4.849057043958846e-07, + "loss": 0.9012, + "step": 5165 + }, + { + "epoch": 4.810055865921788, + "grad_norm": 1.782147765159607, + "learning_rate": 4.841732134346369e-07, + "loss": 0.8806, + "step": 5166 + }, + { + "epoch": 4.81098696461825, + "grad_norm": 1.7375408411026, + "learning_rate": 4.834412168143535e-07, + "loss": 0.8354, + "step": 5167 + }, + { + "epoch": 4.811918063314711, + "grad_norm": 1.7347445487976074, + "learning_rate": 4.827097147145404e-07, + "loss": 0.8291, + "step": 5168 + }, + { + "epoch": 4.812849162011173, + "grad_norm": 1.7636226415634155, + "learning_rate": 4.819787073145837e-07, + "loss": 0.8523, + "step": 5169 + }, + { + "epoch": 4.8137802607076345, + "grad_norm": 1.8694385290145874, + "learning_rate": 4.812481947937498e-07, + "loss": 0.9025, + "step": 5170 + }, + { + "epoch": 4.814711359404097, + "grad_norm": 1.8160954713821411, + "learning_rate": 4.80518177331182e-07, + "loss": 0.8806, + "step": 5171 + }, + { + "epoch": 4.815642458100559, + "grad_norm": 1.7991918325424194, + "learning_rate": 4.797886551059011e-07, + "loss": 0.8964, + "step": 5172 + }, + { + "epoch": 4.816573556797021, + "grad_norm": 1.8306747674942017, + "learning_rate": 4.790596282968079e-07, + "loss": 0.8977, + "step": 5173 + }, + { + "epoch": 4.817504655493482, + "grad_norm": 1.8051376342773438, + "learning_rate": 4.783310970826835e-07, + "loss": 0.8904, + "step": 5174 + }, + { + "epoch": 4.818435754189944, + "grad_norm": 1.7542917728424072, + "learning_rate": 4.776030616421837e-07, + "loss": 0.8642, + "step": 5175 + }, + { + "epoch": 4.8193668528864055, + "grad_norm": 1.9030481576919556, + "learning_rate": 4.76875522153844e-07, + "loss": 0.9176, + "step": 5176 + }, + { + "epoch": 4.820297951582868, + "grad_norm": 1.7993779182434082, + "learning_rate": 4.761484787960813e-07, + "loss": 0.8479, + "step": 5177 + }, + { + "epoch": 4.82122905027933, + "grad_norm": 1.809257984161377, + "learning_rate": 4.7542193174718615e-07, + "loss": 0.8974, + "step": 5178 + }, + { + "epoch": 4.822160148975792, + "grad_norm": 1.731986165046692, + "learning_rate": 4.746958811853303e-07, + "loss": 0.8324, + "step": 5179 + }, + { + "epoch": 4.823091247672253, + "grad_norm": 1.768441081047058, + "learning_rate": 4.739703272885626e-07, + "loss": 0.8797, + "step": 5180 + }, + { + "epoch": 4.824022346368715, + "grad_norm": 1.848097562789917, + "learning_rate": 4.732452702348123e-07, + "loss": 0.9086, + "step": 5181 + }, + { + "epoch": 4.8249534450651765, + "grad_norm": 1.8118599653244019, + "learning_rate": 4.7252071020188277e-07, + "loss": 0.8447, + "step": 5182 + }, + { + "epoch": 4.825884543761639, + "grad_norm": 1.7663260698318481, + "learning_rate": 4.7179664736745845e-07, + "loss": 0.8941, + "step": 5183 + }, + { + "epoch": 4.826815642458101, + "grad_norm": 1.8207805156707764, + "learning_rate": 4.710730819091028e-07, + "loss": 0.886, + "step": 5184 + }, + { + "epoch": 4.827746741154562, + "grad_norm": 1.7996714115142822, + "learning_rate": 4.7035001400425356e-07, + "loss": 0.8406, + "step": 5185 + }, + { + "epoch": 4.828677839851024, + "grad_norm": 1.8655352592468262, + "learning_rate": 4.6962744383022977e-07, + "loss": 0.9092, + "step": 5186 + }, + { + "epoch": 4.829608938547486, + "grad_norm": 1.7827436923980713, + "learning_rate": 4.689053715642261e-07, + "loss": 0.8738, + "step": 5187 + }, + { + "epoch": 4.8305400372439475, + "grad_norm": 1.782157301902771, + "learning_rate": 4.681837973833181e-07, + "loss": 0.8611, + "step": 5188 + }, + { + "epoch": 4.83147113594041, + "grad_norm": 1.7509452104568481, + "learning_rate": 4.6746272146445585e-07, + "loss": 0.8659, + "step": 5189 + }, + { + "epoch": 4.832402234636872, + "grad_norm": 1.8279353380203247, + "learning_rate": 4.6674214398446907e-07, + "loss": 0.885, + "step": 5190 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 1.855201005935669, + "learning_rate": 4.6602206512006503e-07, + "loss": 0.8783, + "step": 5191 + }, + { + "epoch": 4.834264432029795, + "grad_norm": 1.7778265476226807, + "learning_rate": 4.653024850478283e-07, + "loss": 0.8998, + "step": 5192 + }, + { + "epoch": 4.835195530726257, + "grad_norm": 1.7887818813323975, + "learning_rate": 4.645834039442218e-07, + "loss": 0.8683, + "step": 5193 + }, + { + "epoch": 4.8361266294227185, + "grad_norm": 1.8530457019805908, + "learning_rate": 4.6386482198558487e-07, + "loss": 0.911, + "step": 5194 + }, + { + "epoch": 4.837057728119181, + "grad_norm": 1.7800312042236328, + "learning_rate": 4.6314673934813704e-07, + "loss": 0.8883, + "step": 5195 + }, + { + "epoch": 4.837988826815643, + "grad_norm": 1.8029181957244873, + "learning_rate": 4.624291562079719e-07, + "loss": 0.9018, + "step": 5196 + }, + { + "epoch": 4.838919925512104, + "grad_norm": 1.798660159111023, + "learning_rate": 4.617120727410629e-07, + "loss": 0.8771, + "step": 5197 + }, + { + "epoch": 4.839851024208566, + "grad_norm": 1.8968807458877563, + "learning_rate": 4.609954891232604e-07, + "loss": 0.9362, + "step": 5198 + }, + { + "epoch": 4.840782122905028, + "grad_norm": 1.7889306545257568, + "learning_rate": 4.6027940553029197e-07, + "loss": 0.8798, + "step": 5199 + }, + { + "epoch": 4.8417132216014895, + "grad_norm": 1.7940516471862793, + "learning_rate": 4.5956382213776294e-07, + "loss": 0.8981, + "step": 5200 + }, + { + "epoch": 4.842644320297952, + "grad_norm": 1.8786224126815796, + "learning_rate": 4.5884873912115545e-07, + "loss": 0.9378, + "step": 5201 + }, + { + "epoch": 4.843575418994414, + "grad_norm": 1.8178499937057495, + "learning_rate": 4.581341566558292e-07, + "loss": 0.8709, + "step": 5202 + }, + { + "epoch": 4.844506517690875, + "grad_norm": 1.7694302797317505, + "learning_rate": 4.5742007491702157e-07, + "loss": 0.859, + "step": 5203 + }, + { + "epoch": 4.845437616387337, + "grad_norm": 1.8367363214492798, + "learning_rate": 4.567064940798463e-07, + "loss": 0.8851, + "step": 5204 + }, + { + "epoch": 4.846368715083798, + "grad_norm": 1.8030140399932861, + "learning_rate": 4.559934143192951e-07, + "loss": 0.8496, + "step": 5205 + }, + { + "epoch": 4.8472998137802605, + "grad_norm": 1.8215893507003784, + "learning_rate": 4.552808358102362e-07, + "loss": 0.9088, + "step": 5206 + }, + { + "epoch": 4.848230912476723, + "grad_norm": 1.7975656986236572, + "learning_rate": 4.5456875872741503e-07, + "loss": 0.8729, + "step": 5207 + }, + { + "epoch": 4.849162011173185, + "grad_norm": 1.828337550163269, + "learning_rate": 4.5385718324545444e-07, + "loss": 0.8794, + "step": 5208 + }, + { + "epoch": 4.850093109869646, + "grad_norm": 1.788475751876831, + "learning_rate": 4.5314610953885396e-07, + "loss": 0.9089, + "step": 5209 + }, + { + "epoch": 4.851024208566108, + "grad_norm": 1.8540245294570923, + "learning_rate": 4.524355377819897e-07, + "loss": 0.8871, + "step": 5210 + }, + { + "epoch": 4.851955307262569, + "grad_norm": 1.783108115196228, + "learning_rate": 4.517254681491159e-07, + "loss": 0.8939, + "step": 5211 + }, + { + "epoch": 4.8528864059590315, + "grad_norm": 1.8092074394226074, + "learning_rate": 4.5101590081436217e-07, + "loss": 0.8906, + "step": 5212 + }, + { + "epoch": 4.853817504655494, + "grad_norm": 1.9343314170837402, + "learning_rate": 4.5030683595173566e-07, + "loss": 0.9306, + "step": 5213 + }, + { + "epoch": 4.854748603351956, + "grad_norm": 1.7881536483764648, + "learning_rate": 4.4959827373512066e-07, + "loss": 0.8823, + "step": 5214 + }, + { + "epoch": 4.855679702048417, + "grad_norm": 1.7947165966033936, + "learning_rate": 4.4889021433827756e-07, + "loss": 0.8994, + "step": 5215 + }, + { + "epoch": 4.856610800744879, + "grad_norm": 1.7124853134155273, + "learning_rate": 4.481826579348439e-07, + "loss": 0.8531, + "step": 5216 + }, + { + "epoch": 4.85754189944134, + "grad_norm": 1.78708016872406, + "learning_rate": 4.4747560469833305e-07, + "loss": 0.8274, + "step": 5217 + }, + { + "epoch": 4.8584729981378025, + "grad_norm": 1.7872692346572876, + "learning_rate": 4.467690548021361e-07, + "loss": 0.8888, + "step": 5218 + }, + { + "epoch": 4.859404096834265, + "grad_norm": 1.8036590814590454, + "learning_rate": 4.460630084195203e-07, + "loss": 0.8867, + "step": 5219 + }, + { + "epoch": 4.860335195530726, + "grad_norm": 1.8243296146392822, + "learning_rate": 4.4535746572362896e-07, + "loss": 0.894, + "step": 5220 + }, + { + "epoch": 4.861266294227188, + "grad_norm": 1.8128516674041748, + "learning_rate": 4.4465242688748244e-07, + "loss": 0.8648, + "step": 5221 + }, + { + "epoch": 4.86219739292365, + "grad_norm": 1.794575810432434, + "learning_rate": 4.439478920839771e-07, + "loss": 0.8481, + "step": 5222 + }, + { + "epoch": 4.863128491620111, + "grad_norm": 2.0917880535125732, + "learning_rate": 4.432438614858864e-07, + "loss": 0.8974, + "step": 5223 + }, + { + "epoch": 4.8640595903165735, + "grad_norm": 1.8327561616897583, + "learning_rate": 4.4254033526585917e-07, + "loss": 0.8932, + "step": 5224 + }, + { + "epoch": 4.864990689013036, + "grad_norm": 1.8219003677368164, + "learning_rate": 4.418373135964213e-07, + "loss": 0.8291, + "step": 5225 + }, + { + "epoch": 4.865921787709497, + "grad_norm": 1.8667099475860596, + "learning_rate": 4.411347966499746e-07, + "loss": 0.9215, + "step": 5226 + }, + { + "epoch": 4.866852886405959, + "grad_norm": 1.8412296772003174, + "learning_rate": 4.404327845987974e-07, + "loss": 0.8733, + "step": 5227 + }, + { + "epoch": 4.867783985102421, + "grad_norm": 1.8619046211242676, + "learning_rate": 4.397312776150439e-07, + "loss": 0.8871, + "step": 5228 + }, + { + "epoch": 4.868715083798882, + "grad_norm": 1.724885106086731, + "learning_rate": 4.3903027587074513e-07, + "loss": 0.8403, + "step": 5229 + }, + { + "epoch": 4.8696461824953445, + "grad_norm": 1.8432973623275757, + "learning_rate": 4.383297795378061e-07, + "loss": 0.8954, + "step": 5230 + }, + { + "epoch": 4.870577281191807, + "grad_norm": 1.8216813802719116, + "learning_rate": 4.37629788788011e-07, + "loss": 0.882, + "step": 5231 + }, + { + "epoch": 4.871508379888268, + "grad_norm": 1.7992639541625977, + "learning_rate": 4.3693030379301784e-07, + "loss": 0.9107, + "step": 5232 + }, + { + "epoch": 4.87243947858473, + "grad_norm": 1.8469982147216797, + "learning_rate": 4.362313247243616e-07, + "loss": 0.9036, + "step": 5233 + }, + { + "epoch": 4.873370577281192, + "grad_norm": 1.7853496074676514, + "learning_rate": 4.355328517534524e-07, + "loss": 0.8423, + "step": 5234 + }, + { + "epoch": 4.874301675977653, + "grad_norm": 1.823366641998291, + "learning_rate": 4.3483488505157713e-07, + "loss": 0.9167, + "step": 5235 + }, + { + "epoch": 4.8752327746741155, + "grad_norm": 1.8025858402252197, + "learning_rate": 4.3413742478989827e-07, + "loss": 0.8911, + "step": 5236 + }, + { + "epoch": 4.876163873370578, + "grad_norm": 1.7982072830200195, + "learning_rate": 4.334404711394521e-07, + "loss": 0.8411, + "step": 5237 + }, + { + "epoch": 4.877094972067039, + "grad_norm": 1.8426830768585205, + "learning_rate": 4.3274402427115476e-07, + "loss": 0.8756, + "step": 5238 + }, + { + "epoch": 4.878026070763501, + "grad_norm": 1.7709715366363525, + "learning_rate": 4.3204808435579533e-07, + "loss": 0.909, + "step": 5239 + }, + { + "epoch": 4.878957169459962, + "grad_norm": 1.775206208229065, + "learning_rate": 4.313526515640376e-07, + "loss": 0.8615, + "step": 5240 + }, + { + "epoch": 4.879888268156424, + "grad_norm": 1.8167459964752197, + "learning_rate": 4.306577260664241e-07, + "loss": 0.8719, + "step": 5241 + }, + { + "epoch": 4.8808193668528865, + "grad_norm": 1.789886236190796, + "learning_rate": 4.2996330803337067e-07, + "loss": 0.8475, + "step": 5242 + }, + { + "epoch": 4.881750465549349, + "grad_norm": 1.7546665668487549, + "learning_rate": 4.292693976351703e-07, + "loss": 0.8695, + "step": 5243 + }, + { + "epoch": 4.88268156424581, + "grad_norm": 1.785735845565796, + "learning_rate": 4.2857599504198824e-07, + "loss": 0.9081, + "step": 5244 + }, + { + "epoch": 4.883612662942272, + "grad_norm": 1.8095381259918213, + "learning_rate": 4.278831004238698e-07, + "loss": 0.8664, + "step": 5245 + }, + { + "epoch": 4.884543761638733, + "grad_norm": 1.798492193222046, + "learning_rate": 4.2719071395073307e-07, + "loss": 0.8756, + "step": 5246 + }, + { + "epoch": 4.885474860335195, + "grad_norm": 1.7602156400680542, + "learning_rate": 4.264988357923711e-07, + "loss": 0.8789, + "step": 5247 + }, + { + "epoch": 4.8864059590316575, + "grad_norm": 1.8550902605056763, + "learning_rate": 4.2580746611845273e-07, + "loss": 0.906, + "step": 5248 + }, + { + "epoch": 4.88733705772812, + "grad_norm": 1.848646640777588, + "learning_rate": 4.251166050985239e-07, + "loss": 0.8978, + "step": 5249 + }, + { + "epoch": 4.888268156424581, + "grad_norm": 1.865384578704834, + "learning_rate": 4.2442625290200407e-07, + "loss": 0.9286, + "step": 5250 + }, + { + "epoch": 4.889199255121043, + "grad_norm": 1.7855846881866455, + "learning_rate": 4.237364096981869e-07, + "loss": 0.8763, + "step": 5251 + }, + { + "epoch": 4.890130353817504, + "grad_norm": 1.7755378484725952, + "learning_rate": 4.230470756562438e-07, + "loss": 0.8734, + "step": 5252 + }, + { + "epoch": 4.891061452513966, + "grad_norm": 1.8568631410598755, + "learning_rate": 4.2235825094522067e-07, + "loss": 0.9158, + "step": 5253 + }, + { + "epoch": 4.8919925512104285, + "grad_norm": 1.8164997100830078, + "learning_rate": 4.216699357340362e-07, + "loss": 0.8855, + "step": 5254 + }, + { + "epoch": 4.89292364990689, + "grad_norm": 1.7983100414276123, + "learning_rate": 4.2098213019148625e-07, + "loss": 0.8748, + "step": 5255 + }, + { + "epoch": 4.893854748603352, + "grad_norm": 1.7970737218856812, + "learning_rate": 4.202948344862423e-07, + "loss": 0.8835, + "step": 5256 + }, + { + "epoch": 4.894785847299814, + "grad_norm": 1.8008607625961304, + "learning_rate": 4.196080487868495e-07, + "loss": 0.8925, + "step": 5257 + }, + { + "epoch": 4.895716945996275, + "grad_norm": 1.7305234670639038, + "learning_rate": 4.189217732617276e-07, + "loss": 0.8579, + "step": 5258 + }, + { + "epoch": 4.896648044692737, + "grad_norm": 1.730043888092041, + "learning_rate": 4.182360080791714e-07, + "loss": 0.8686, + "step": 5259 + }, + { + "epoch": 4.8975791433891995, + "grad_norm": 1.7659695148468018, + "learning_rate": 4.1755075340735335e-07, + "loss": 0.8841, + "step": 5260 + }, + { + "epoch": 4.898510242085661, + "grad_norm": 1.7888668775558472, + "learning_rate": 4.168660094143159e-07, + "loss": 0.8917, + "step": 5261 + }, + { + "epoch": 4.899441340782123, + "grad_norm": 1.8407880067825317, + "learning_rate": 4.16181776267979e-07, + "loss": 0.9151, + "step": 5262 + }, + { + "epoch": 4.900372439478585, + "grad_norm": 1.8631680011749268, + "learning_rate": 4.1549805413613865e-07, + "loss": 0.9031, + "step": 5263 + }, + { + "epoch": 4.901303538175046, + "grad_norm": 1.804580569267273, + "learning_rate": 4.148148431864632e-07, + "loss": 0.9356, + "step": 5264 + }, + { + "epoch": 4.902234636871508, + "grad_norm": 1.8344786167144775, + "learning_rate": 4.141321435864959e-07, + "loss": 0.8782, + "step": 5265 + }, + { + "epoch": 4.9031657355679705, + "grad_norm": 1.7794095277786255, + "learning_rate": 4.134499555036547e-07, + "loss": 0.8773, + "step": 5266 + }, + { + "epoch": 4.904096834264432, + "grad_norm": 1.8422811031341553, + "learning_rate": 4.1276827910523454e-07, + "loss": 0.9339, + "step": 5267 + }, + { + "epoch": 4.905027932960894, + "grad_norm": 1.8134912252426147, + "learning_rate": 4.1208711455840075e-07, + "loss": 0.8637, + "step": 5268 + }, + { + "epoch": 4.905959031657356, + "grad_norm": 1.7933812141418457, + "learning_rate": 4.1140646203019607e-07, + "loss": 0.8531, + "step": 5269 + }, + { + "epoch": 4.906890130353817, + "grad_norm": 1.936010718345642, + "learning_rate": 4.1072632168753624e-07, + "loss": 0.9169, + "step": 5270 + }, + { + "epoch": 4.907821229050279, + "grad_norm": 1.7585618495941162, + "learning_rate": 4.1004669369721364e-07, + "loss": 0.883, + "step": 5271 + }, + { + "epoch": 4.9087523277467415, + "grad_norm": 1.8700791597366333, + "learning_rate": 4.093675782258916e-07, + "loss": 0.9018, + "step": 5272 + }, + { + "epoch": 4.909683426443203, + "grad_norm": 1.9047349691390991, + "learning_rate": 4.086889754401094e-07, + "loss": 0.891, + "step": 5273 + }, + { + "epoch": 4.910614525139665, + "grad_norm": 1.7711817026138306, + "learning_rate": 4.0801088550628307e-07, + "loss": 0.8866, + "step": 5274 + }, + { + "epoch": 4.911545623836126, + "grad_norm": 1.7585043907165527, + "learning_rate": 4.0733330859069803e-07, + "loss": 0.8537, + "step": 5275 + }, + { + "epoch": 4.912476722532588, + "grad_norm": 1.8376471996307373, + "learning_rate": 4.0665624485951736e-07, + "loss": 0.9003, + "step": 5276 + }, + { + "epoch": 4.91340782122905, + "grad_norm": 1.8508752584457397, + "learning_rate": 4.059796944787772e-07, + "loss": 0.9529, + "step": 5277 + }, + { + "epoch": 4.9143389199255125, + "grad_norm": 1.8277952671051025, + "learning_rate": 4.053036576143882e-07, + "loss": 0.8982, + "step": 5278 + }, + { + "epoch": 4.915270018621974, + "grad_norm": 1.8005878925323486, + "learning_rate": 4.046281344321343e-07, + "loss": 0.8841, + "step": 5279 + }, + { + "epoch": 4.916201117318436, + "grad_norm": 1.8858959674835205, + "learning_rate": 4.039531250976736e-07, + "loss": 0.9437, + "step": 5280 + }, + { + "epoch": 4.917132216014897, + "grad_norm": 1.7643753290176392, + "learning_rate": 4.0327862977654064e-07, + "loss": 0.8679, + "step": 5281 + }, + { + "epoch": 4.918063314711359, + "grad_norm": 1.7742254734039307, + "learning_rate": 4.026046486341398e-07, + "loss": 0.8787, + "step": 5282 + }, + { + "epoch": 4.918994413407821, + "grad_norm": 1.7775417566299438, + "learning_rate": 4.0193118183575184e-07, + "loss": 0.8858, + "step": 5283 + }, + { + "epoch": 4.9199255121042835, + "grad_norm": 1.7834526300430298, + "learning_rate": 4.0125822954653114e-07, + "loss": 0.8437, + "step": 5284 + }, + { + "epoch": 4.920856610800745, + "grad_norm": 1.809555172920227, + "learning_rate": 4.0058579193150537e-07, + "loss": 0.8339, + "step": 5285 + }, + { + "epoch": 4.921787709497207, + "grad_norm": 1.8201770782470703, + "learning_rate": 3.999138691555771e-07, + "loss": 0.8188, + "step": 5286 + }, + { + "epoch": 4.922718808193668, + "grad_norm": 1.7378122806549072, + "learning_rate": 3.9924246138352106e-07, + "loss": 0.8583, + "step": 5287 + }, + { + "epoch": 4.92364990689013, + "grad_norm": 1.8543678522109985, + "learning_rate": 3.985715687799868e-07, + "loss": 0.8453, + "step": 5288 + }, + { + "epoch": 4.924581005586592, + "grad_norm": 1.7985683679580688, + "learning_rate": 3.9790119150949743e-07, + "loss": 0.887, + "step": 5289 + }, + { + "epoch": 4.925512104283054, + "grad_norm": 1.8846176862716675, + "learning_rate": 3.972313297364494e-07, + "loss": 0.9262, + "step": 5290 + }, + { + "epoch": 4.926443202979516, + "grad_norm": 1.8048757314682007, + "learning_rate": 3.965619836251125e-07, + "loss": 0.8608, + "step": 5291 + }, + { + "epoch": 4.927374301675978, + "grad_norm": 1.84028160572052, + "learning_rate": 3.958931533396307e-07, + "loss": 0.8779, + "step": 5292 + }, + { + "epoch": 4.928305400372439, + "grad_norm": 1.8262813091278076, + "learning_rate": 3.952248390440211e-07, + "loss": 0.8963, + "step": 5293 + }, + { + "epoch": 4.929236499068901, + "grad_norm": 1.825722098350525, + "learning_rate": 3.9455704090217454e-07, + "loss": 0.8674, + "step": 5294 + }, + { + "epoch": 4.930167597765363, + "grad_norm": 1.907501459121704, + "learning_rate": 3.938897590778545e-07, + "loss": 0.9149, + "step": 5295 + }, + { + "epoch": 4.931098696461825, + "grad_norm": 1.820899248123169, + "learning_rate": 3.932229937346993e-07, + "loss": 0.8803, + "step": 5296 + }, + { + "epoch": 4.932029795158287, + "grad_norm": 1.8526872396469116, + "learning_rate": 3.925567450362189e-07, + "loss": 0.8951, + "step": 5297 + }, + { + "epoch": 4.932960893854749, + "grad_norm": 1.8724814653396606, + "learning_rate": 3.9189101314579784e-07, + "loss": 0.9217, + "step": 5298 + }, + { + "epoch": 4.93389199255121, + "grad_norm": 1.8225414752960205, + "learning_rate": 3.9122579822669343e-07, + "loss": 0.8932, + "step": 5299 + }, + { + "epoch": 4.934823091247672, + "grad_norm": 1.880174994468689, + "learning_rate": 3.9056110044203594e-07, + "loss": 0.9198, + "step": 5300 + }, + { + "epoch": 4.935754189944134, + "grad_norm": 1.8608263731002808, + "learning_rate": 3.898969199548297e-07, + "loss": 0.8766, + "step": 5301 + }, + { + "epoch": 4.936685288640596, + "grad_norm": 1.885520577430725, + "learning_rate": 3.892332569279511e-07, + "loss": 0.9036, + "step": 5302 + }, + { + "epoch": 4.937616387337058, + "grad_norm": 1.8405259847640991, + "learning_rate": 3.8857011152415026e-07, + "loss": 0.8695, + "step": 5303 + }, + { + "epoch": 4.93854748603352, + "grad_norm": 1.8087760210037231, + "learning_rate": 3.8790748390605035e-07, + "loss": 0.8598, + "step": 5304 + }, + { + "epoch": 4.939478584729981, + "grad_norm": 1.8715178966522217, + "learning_rate": 3.872453742361473e-07, + "loss": 0.9, + "step": 5305 + }, + { + "epoch": 4.940409683426443, + "grad_norm": 1.7672086954116821, + "learning_rate": 3.865837826768107e-07, + "loss": 0.8841, + "step": 5306 + }, + { + "epoch": 4.941340782122905, + "grad_norm": 1.829879879951477, + "learning_rate": 3.8592270939028174e-07, + "loss": 0.8987, + "step": 5307 + }, + { + "epoch": 4.942271880819367, + "grad_norm": 1.929194450378418, + "learning_rate": 3.8526215453867614e-07, + "loss": 0.8991, + "step": 5308 + }, + { + "epoch": 4.943202979515829, + "grad_norm": 1.9536426067352295, + "learning_rate": 3.84602118283981e-07, + "loss": 0.9078, + "step": 5309 + }, + { + "epoch": 4.94413407821229, + "grad_norm": 1.8799476623535156, + "learning_rate": 3.839426007880576e-07, + "loss": 0.92, + "step": 5310 + }, + { + "epoch": 4.945065176908752, + "grad_norm": 1.8520907163619995, + "learning_rate": 3.832836022126388e-07, + "loss": 0.8927, + "step": 5311 + }, + { + "epoch": 4.945996275605214, + "grad_norm": 1.9198991060256958, + "learning_rate": 3.826251227193309e-07, + "loss": 0.8882, + "step": 5312 + }, + { + "epoch": 4.946927374301676, + "grad_norm": 1.8624638319015503, + "learning_rate": 3.819671624696128e-07, + "loss": 0.8954, + "step": 5313 + }, + { + "epoch": 4.947858472998138, + "grad_norm": 1.8187952041625977, + "learning_rate": 3.81309721624836e-07, + "loss": 0.9032, + "step": 5314 + }, + { + "epoch": 4.9487895716946, + "grad_norm": 1.7800878286361694, + "learning_rate": 3.8065280034622536e-07, + "loss": 0.8361, + "step": 5315 + }, + { + "epoch": 4.949720670391061, + "grad_norm": 1.7987459897994995, + "learning_rate": 3.799963987948757e-07, + "loss": 0.8782, + "step": 5316 + }, + { + "epoch": 4.950651769087523, + "grad_norm": 1.777549147605896, + "learning_rate": 3.7934051713175796e-07, + "loss": 0.9035, + "step": 5317 + }, + { + "epoch": 4.951582867783985, + "grad_norm": 1.824741005897522, + "learning_rate": 3.7868515551771343e-07, + "loss": 0.908, + "step": 5318 + }, + { + "epoch": 4.952513966480447, + "grad_norm": 1.795250654220581, + "learning_rate": 3.7803031411345644e-07, + "loss": 0.8527, + "step": 5319 + }, + { + "epoch": 4.953445065176909, + "grad_norm": 1.823562741279602, + "learning_rate": 3.773759930795737e-07, + "loss": 0.8608, + "step": 5320 + }, + { + "epoch": 4.954376163873371, + "grad_norm": 1.814447283744812, + "learning_rate": 3.767221925765241e-07, + "loss": 0.8705, + "step": 5321 + }, + { + "epoch": 4.955307262569832, + "grad_norm": 1.8410872220993042, + "learning_rate": 3.760689127646397e-07, + "loss": 0.8869, + "step": 5322 + }, + { + "epoch": 4.956238361266294, + "grad_norm": 1.7979161739349365, + "learning_rate": 3.754161538041226e-07, + "loss": 0.8476, + "step": 5323 + }, + { + "epoch": 4.957169459962756, + "grad_norm": 1.7886043787002563, + "learning_rate": 3.7476391585505034e-07, + "loss": 0.8643, + "step": 5324 + }, + { + "epoch": 4.9581005586592175, + "grad_norm": 1.9121350049972534, + "learning_rate": 3.741121990773708e-07, + "loss": 0.8706, + "step": 5325 + }, + { + "epoch": 4.95903165735568, + "grad_norm": 1.8424707651138306, + "learning_rate": 3.734610036309047e-07, + "loss": 0.9012, + "step": 5326 + }, + { + "epoch": 4.959962756052142, + "grad_norm": 1.8037383556365967, + "learning_rate": 3.728103296753433e-07, + "loss": 0.909, + "step": 5327 + }, + { + "epoch": 4.960893854748603, + "grad_norm": 1.778655767440796, + "learning_rate": 3.721601773702527e-07, + "loss": 0.9074, + "step": 5328 + }, + { + "epoch": 4.961824953445065, + "grad_norm": 1.8090509176254272, + "learning_rate": 3.715105468750699e-07, + "loss": 0.8592, + "step": 5329 + }, + { + "epoch": 4.962756052141527, + "grad_norm": 1.8085049390792847, + "learning_rate": 3.708614383491016e-07, + "loss": 0.8532, + "step": 5330 + }, + { + "epoch": 4.9636871508379885, + "grad_norm": 1.7577404975891113, + "learning_rate": 3.7021285195153076e-07, + "loss": 0.8754, + "step": 5331 + }, + { + "epoch": 4.964618249534451, + "grad_norm": 1.7330533266067505, + "learning_rate": 3.6956478784140937e-07, + "loss": 0.8609, + "step": 5332 + }, + { + "epoch": 4.965549348230913, + "grad_norm": 1.830278754234314, + "learning_rate": 3.6891724617766274e-07, + "loss": 0.8586, + "step": 5333 + }, + { + "epoch": 4.966480446927374, + "grad_norm": 1.7677191495895386, + "learning_rate": 3.6827022711908577e-07, + "loss": 0.8426, + "step": 5334 + }, + { + "epoch": 4.967411545623836, + "grad_norm": 1.9078500270843506, + "learning_rate": 3.676237308243483e-07, + "loss": 0.9344, + "step": 5335 + }, + { + "epoch": 4.968342644320298, + "grad_norm": 1.8793221712112427, + "learning_rate": 3.669777574519909e-07, + "loss": 0.8947, + "step": 5336 + }, + { + "epoch": 4.9692737430167595, + "grad_norm": 1.880645513534546, + "learning_rate": 3.6633230716042357e-07, + "loss": 0.8943, + "step": 5337 + }, + { + "epoch": 4.970204841713222, + "grad_norm": 1.8634294271469116, + "learning_rate": 3.656873801079319e-07, + "loss": 0.9014, + "step": 5338 + }, + { + "epoch": 4.971135940409684, + "grad_norm": 1.851664662361145, + "learning_rate": 3.650429764526711e-07, + "loss": 0.8453, + "step": 5339 + }, + { + "epoch": 4.972067039106145, + "grad_norm": 1.802804708480835, + "learning_rate": 3.6439909635266726e-07, + "loss": 0.8725, + "step": 5340 + }, + { + "epoch": 4.972998137802607, + "grad_norm": 1.7787460088729858, + "learning_rate": 3.6375573996581897e-07, + "loss": 0.8523, + "step": 5341 + }, + { + "epoch": 4.973929236499069, + "grad_norm": 1.8363609313964844, + "learning_rate": 3.6311290744989735e-07, + "loss": 0.8727, + "step": 5342 + }, + { + "epoch": 4.9748603351955305, + "grad_norm": 1.8185356855392456, + "learning_rate": 3.6247059896254435e-07, + "loss": 0.9108, + "step": 5343 + }, + { + "epoch": 4.975791433891993, + "grad_norm": 1.8040950298309326, + "learning_rate": 3.618288146612722e-07, + "loss": 0.898, + "step": 5344 + }, + { + "epoch": 4.976722532588455, + "grad_norm": 1.7791231870651245, + "learning_rate": 3.611875547034652e-07, + "loss": 0.8873, + "step": 5345 + }, + { + "epoch": 4.977653631284916, + "grad_norm": 1.854645013809204, + "learning_rate": 3.6054681924638154e-07, + "loss": 0.9041, + "step": 5346 + }, + { + "epoch": 4.978584729981378, + "grad_norm": 1.8184573650360107, + "learning_rate": 3.59906608447147e-07, + "loss": 0.8887, + "step": 5347 + }, + { + "epoch": 4.97951582867784, + "grad_norm": 1.8097659349441528, + "learning_rate": 3.5926692246275985e-07, + "loss": 0.8943, + "step": 5348 + }, + { + "epoch": 4.9804469273743015, + "grad_norm": 1.7808564901351929, + "learning_rate": 3.5862776145009214e-07, + "loss": 0.8737, + "step": 5349 + }, + { + "epoch": 4.981378026070764, + "grad_norm": 1.8281110525131226, + "learning_rate": 3.579891255658846e-07, + "loss": 0.8811, + "step": 5350 + }, + { + "epoch": 4.982309124767225, + "grad_norm": 1.8108853101730347, + "learning_rate": 3.573510149667489e-07, + "loss": 0.8889, + "step": 5351 + }, + { + "epoch": 4.983240223463687, + "grad_norm": 1.8407871723175049, + "learning_rate": 3.56713429809169e-07, + "loss": 0.9154, + "step": 5352 + }, + { + "epoch": 4.984171322160149, + "grad_norm": 1.8406052589416504, + "learning_rate": 3.5607637024950124e-07, + "loss": 0.8866, + "step": 5353 + }, + { + "epoch": 4.985102420856611, + "grad_norm": 1.8075628280639648, + "learning_rate": 3.554398364439704e-07, + "loss": 0.8923, + "step": 5354 + }, + { + "epoch": 4.9860335195530725, + "grad_norm": 1.793916940689087, + "learning_rate": 3.548038285486735e-07, + "loss": 0.8762, + "step": 5355 + }, + { + "epoch": 4.986964618249535, + "grad_norm": 1.9662938117980957, + "learning_rate": 3.541683467195789e-07, + "loss": 0.896, + "step": 5356 + }, + { + "epoch": 4.987895716945996, + "grad_norm": 2.2229628562927246, + "learning_rate": 3.535333911125269e-07, + "loss": 0.8808, + "step": 5357 + }, + { + "epoch": 4.988826815642458, + "grad_norm": 1.8215160369873047, + "learning_rate": 3.528989618832257e-07, + "loss": 0.8885, + "step": 5358 + }, + { + "epoch": 4.98975791433892, + "grad_norm": 1.7979981899261475, + "learning_rate": 3.52265059187257e-07, + "loss": 0.8796, + "step": 5359 + }, + { + "epoch": 4.990689013035381, + "grad_norm": 1.8188625574111938, + "learning_rate": 3.516316831800737e-07, + "loss": 0.8891, + "step": 5360 + }, + { + "epoch": 4.9916201117318435, + "grad_norm": 1.8791908025741577, + "learning_rate": 3.509988340169973e-07, + "loss": 0.9092, + "step": 5361 + }, + { + "epoch": 4.992551210428306, + "grad_norm": 1.8184922933578491, + "learning_rate": 3.5036651185322177e-07, + "loss": 0.8721, + "step": 5362 + }, + { + "epoch": 4.993482309124767, + "grad_norm": 1.8595117330551147, + "learning_rate": 3.497347168438106e-07, + "loss": 0.9051, + "step": 5363 + }, + { + "epoch": 4.994413407821229, + "grad_norm": 1.8297067880630493, + "learning_rate": 3.491034491437009e-07, + "loss": 0.9228, + "step": 5364 + }, + { + "epoch": 4.995344506517691, + "grad_norm": 1.7535303831100464, + "learning_rate": 3.4847270890769615e-07, + "loss": 0.8593, + "step": 5365 + }, + { + "epoch": 4.996275605214152, + "grad_norm": 1.8438254594802856, + "learning_rate": 3.4784249629047335e-07, + "loss": 0.8442, + "step": 5366 + }, + { + "epoch": 4.9972067039106145, + "grad_norm": 1.884529709815979, + "learning_rate": 3.472128114465806e-07, + "loss": 0.906, + "step": 5367 + }, + { + "epoch": 4.998137802607077, + "grad_norm": 1.8196370601654053, + "learning_rate": 3.465836545304341e-07, + "loss": 0.8904, + "step": 5368 + }, + { + "epoch": 4.999068901303538, + "grad_norm": 1.8597924709320068, + "learning_rate": 3.4595502569632203e-07, + "loss": 0.8818, + "step": 5369 + }, + { + "epoch": 5.0, + "grad_norm": 2.0305917263031006, + "learning_rate": 3.4532692509840315e-07, + "loss": 0.8773, + "step": 5370 + }, + { + "epoch": 5.000931098696462, + "grad_norm": 1.7566059827804565, + "learning_rate": 3.446993528907075e-07, + "loss": 0.8603, + "step": 5371 + }, + { + "epoch": 5.001862197392923, + "grad_norm": 1.8853473663330078, + "learning_rate": 3.440723092271331e-07, + "loss": 0.8939, + "step": 5372 + }, + { + "epoch": 5.0027932960893855, + "grad_norm": 1.8207406997680664, + "learning_rate": 3.4344579426145055e-07, + "loss": 0.8756, + "step": 5373 + }, + { + "epoch": 5.003724394785848, + "grad_norm": 1.7697628736495972, + "learning_rate": 3.428198081472997e-07, + "loss": 0.8407, + "step": 5374 + }, + { + "epoch": 5.004655493482309, + "grad_norm": 1.7912861108779907, + "learning_rate": 3.4219435103819116e-07, + "loss": 0.896, + "step": 5375 + }, + { + "epoch": 5.005586592178771, + "grad_norm": 1.7579586505889893, + "learning_rate": 3.415694230875058e-07, + "loss": 0.8525, + "step": 5376 + }, + { + "epoch": 5.006517690875233, + "grad_norm": 1.747606873512268, + "learning_rate": 3.409450244484946e-07, + "loss": 0.8341, + "step": 5377 + }, + { + "epoch": 5.007448789571694, + "grad_norm": 1.7693923711776733, + "learning_rate": 3.403211552742788e-07, + "loss": 0.8674, + "step": 5378 + }, + { + "epoch": 5.0083798882681565, + "grad_norm": 1.8055145740509033, + "learning_rate": 3.3969781571784985e-07, + "loss": 0.898, + "step": 5379 + }, + { + "epoch": 5.009310986964619, + "grad_norm": 1.7849164009094238, + "learning_rate": 3.3907500593206884e-07, + "loss": 0.8478, + "step": 5380 + }, + { + "epoch": 5.01024208566108, + "grad_norm": 1.7852933406829834, + "learning_rate": 3.3845272606966767e-07, + "loss": 0.855, + "step": 5381 + }, + { + "epoch": 5.011173184357542, + "grad_norm": 1.8394616842269897, + "learning_rate": 3.3783097628324765e-07, + "loss": 0.8814, + "step": 5382 + }, + { + "epoch": 5.012104283054004, + "grad_norm": 1.79812490940094, + "learning_rate": 3.3720975672528107e-07, + "loss": 0.8974, + "step": 5383 + }, + { + "epoch": 5.013035381750465, + "grad_norm": 1.7571392059326172, + "learning_rate": 3.365890675481087e-07, + "loss": 0.8494, + "step": 5384 + }, + { + "epoch": 5.0139664804469275, + "grad_norm": 1.8584692478179932, + "learning_rate": 3.359689089039428e-07, + "loss": 0.8651, + "step": 5385 + }, + { + "epoch": 5.01489757914339, + "grad_norm": 1.8004257678985596, + "learning_rate": 3.353492809448641e-07, + "loss": 0.8321, + "step": 5386 + }, + { + "epoch": 5.015828677839851, + "grad_norm": 1.8346531391143799, + "learning_rate": 3.3473018382282454e-07, + "loss": 0.8754, + "step": 5387 + }, + { + "epoch": 5.016759776536313, + "grad_norm": 1.8508727550506592, + "learning_rate": 3.341116176896447e-07, + "loss": 0.8956, + "step": 5388 + }, + { + "epoch": 5.017690875232774, + "grad_norm": 1.7843397855758667, + "learning_rate": 3.334935826970159e-07, + "loss": 0.8494, + "step": 5389 + }, + { + "epoch": 5.018621973929236, + "grad_norm": 1.7446109056472778, + "learning_rate": 3.328760789964988e-07, + "loss": 0.8572, + "step": 5390 + }, + { + "epoch": 5.0195530726256985, + "grad_norm": 1.8667515516281128, + "learning_rate": 3.3225910673952337e-07, + "loss": 0.8636, + "step": 5391 + }, + { + "epoch": 5.02048417132216, + "grad_norm": 1.7960751056671143, + "learning_rate": 3.316426660773897e-07, + "loss": 0.8586, + "step": 5392 + }, + { + "epoch": 5.021415270018622, + "grad_norm": 1.8366155624389648, + "learning_rate": 3.31026757161268e-07, + "loss": 0.8618, + "step": 5393 + }, + { + "epoch": 5.022346368715084, + "grad_norm": 1.7764601707458496, + "learning_rate": 3.304113801421968e-07, + "loss": 0.8428, + "step": 5394 + }, + { + "epoch": 5.023277467411545, + "grad_norm": 1.7742637395858765, + "learning_rate": 3.2979653517108523e-07, + "loss": 0.8593, + "step": 5395 + }, + { + "epoch": 5.024208566108007, + "grad_norm": 1.7575278282165527, + "learning_rate": 3.2918222239871207e-07, + "loss": 0.8513, + "step": 5396 + }, + { + "epoch": 5.0251396648044695, + "grad_norm": 1.7508395910263062, + "learning_rate": 3.2856844197572456e-07, + "loss": 0.8448, + "step": 5397 + }, + { + "epoch": 5.026070763500931, + "grad_norm": 1.8747994899749756, + "learning_rate": 3.2795519405264015e-07, + "loss": 0.8882, + "step": 5398 + }, + { + "epoch": 5.027001862197393, + "grad_norm": 1.847557783126831, + "learning_rate": 3.27342478779846e-07, + "loss": 0.8557, + "step": 5399 + }, + { + "epoch": 5.027932960893855, + "grad_norm": 1.882828712463379, + "learning_rate": 3.2673029630759745e-07, + "loss": 0.872, + "step": 5400 + }, + { + "epoch": 5.028864059590316, + "grad_norm": 1.8301234245300293, + "learning_rate": 3.2611864678602055e-07, + "loss": 0.8712, + "step": 5401 + }, + { + "epoch": 5.029795158286778, + "grad_norm": 1.8880056142807007, + "learning_rate": 3.255075303651098e-07, + "loss": 0.8758, + "step": 5402 + }, + { + "epoch": 5.0307262569832405, + "grad_norm": 1.795169711112976, + "learning_rate": 3.2489694719472916e-07, + "loss": 0.8799, + "step": 5403 + }, + { + "epoch": 5.031657355679702, + "grad_norm": 1.827068567276001, + "learning_rate": 3.2428689742461187e-07, + "loss": 0.8661, + "step": 5404 + }, + { + "epoch": 5.032588454376164, + "grad_norm": 1.8163630962371826, + "learning_rate": 3.236773812043606e-07, + "loss": 0.834, + "step": 5405 + }, + { + "epoch": 5.033519553072626, + "grad_norm": 1.780513048171997, + "learning_rate": 3.230683986834468e-07, + "loss": 0.7927, + "step": 5406 + }, + { + "epoch": 5.034450651769087, + "grad_norm": 1.831838607788086, + "learning_rate": 3.2245995001121103e-07, + "loss": 0.8529, + "step": 5407 + }, + { + "epoch": 5.035381750465549, + "grad_norm": 1.8557733297348022, + "learning_rate": 3.21852035336864e-07, + "loss": 0.861, + "step": 5408 + }, + { + "epoch": 5.0363128491620115, + "grad_norm": 1.8243850469589233, + "learning_rate": 3.2124465480948247e-07, + "loss": 0.8864, + "step": 5409 + }, + { + "epoch": 5.037243947858473, + "grad_norm": 1.8537359237670898, + "learning_rate": 3.2063780857801595e-07, + "loss": 0.8966, + "step": 5410 + }, + { + "epoch": 5.038175046554935, + "grad_norm": 1.749690055847168, + "learning_rate": 3.2003149679128125e-07, + "loss": 0.8426, + "step": 5411 + }, + { + "epoch": 5.039106145251397, + "grad_norm": 1.841686725616455, + "learning_rate": 3.1942571959796416e-07, + "loss": 0.898, + "step": 5412 + }, + { + "epoch": 5.040037243947858, + "grad_norm": 1.7795865535736084, + "learning_rate": 3.18820477146618e-07, + "loss": 0.8564, + "step": 5413 + }, + { + "epoch": 5.04096834264432, + "grad_norm": 1.8428759574890137, + "learning_rate": 3.1821576958566775e-07, + "loss": 0.917, + "step": 5414 + }, + { + "epoch": 5.0418994413407825, + "grad_norm": 1.8622180223464966, + "learning_rate": 3.1761159706340584e-07, + "loss": 0.9, + "step": 5415 + }, + { + "epoch": 5.042830540037244, + "grad_norm": 1.8199769258499146, + "learning_rate": 3.170079597279918e-07, + "loss": 0.8637, + "step": 5416 + }, + { + "epoch": 5.043761638733706, + "grad_norm": 1.8588244915008545, + "learning_rate": 3.164048577274573e-07, + "loss": 0.8718, + "step": 5417 + }, + { + "epoch": 5.044692737430168, + "grad_norm": 1.8281551599502563, + "learning_rate": 3.158022912097e-07, + "loss": 0.8688, + "step": 5418 + }, + { + "epoch": 5.045623836126629, + "grad_norm": 1.8638731241226196, + "learning_rate": 3.152002603224882e-07, + "loss": 0.8832, + "step": 5419 + }, + { + "epoch": 5.046554934823091, + "grad_norm": 1.8578804731369019, + "learning_rate": 3.145987652134563e-07, + "loss": 0.8329, + "step": 5420 + }, + { + "epoch": 5.0474860335195535, + "grad_norm": 1.77105712890625, + "learning_rate": 3.139978060301102e-07, + "loss": 0.8402, + "step": 5421 + }, + { + "epoch": 5.048417132216015, + "grad_norm": 1.8198322057724, + "learning_rate": 3.133973829198234e-07, + "loss": 0.878, + "step": 5422 + }, + { + "epoch": 5.049348230912477, + "grad_norm": 1.8618695735931396, + "learning_rate": 3.127974960298358e-07, + "loss": 0.8567, + "step": 5423 + }, + { + "epoch": 5.050279329608939, + "grad_norm": 1.871352195739746, + "learning_rate": 3.1219814550725844e-07, + "loss": 0.8492, + "step": 5424 + }, + { + "epoch": 5.0512104283054, + "grad_norm": 1.79216468334198, + "learning_rate": 3.1159933149907035e-07, + "loss": 0.8633, + "step": 5425 + }, + { + "epoch": 5.052141527001862, + "grad_norm": 1.7753634452819824, + "learning_rate": 3.110010541521191e-07, + "loss": 0.8215, + "step": 5426 + }, + { + "epoch": 5.053072625698324, + "grad_norm": 1.8090475797653198, + "learning_rate": 3.1040331361311847e-07, + "loss": 0.8557, + "step": 5427 + }, + { + "epoch": 5.054003724394786, + "grad_norm": 1.8700013160705566, + "learning_rate": 3.0980611002865366e-07, + "loss": 0.8601, + "step": 5428 + }, + { + "epoch": 5.054934823091248, + "grad_norm": 1.7824888229370117, + "learning_rate": 3.092094435451773e-07, + "loss": 0.8528, + "step": 5429 + }, + { + "epoch": 5.055865921787709, + "grad_norm": 1.8186638355255127, + "learning_rate": 3.086133143090081e-07, + "loss": 0.8364, + "step": 5430 + }, + { + "epoch": 5.056797020484171, + "grad_norm": 1.849936604499817, + "learning_rate": 3.0801772246633555e-07, + "loss": 0.8869, + "step": 5431 + }, + { + "epoch": 5.057728119180633, + "grad_norm": 1.7524813413619995, + "learning_rate": 3.07422668163217e-07, + "loss": 0.8398, + "step": 5432 + }, + { + "epoch": 5.058659217877095, + "grad_norm": 1.767720103263855, + "learning_rate": 3.068281515455776e-07, + "loss": 0.8114, + "step": 5433 + }, + { + "epoch": 5.059590316573557, + "grad_norm": 1.785894513130188, + "learning_rate": 3.062341727592094e-07, + "loss": 0.8436, + "step": 5434 + }, + { + "epoch": 5.060521415270019, + "grad_norm": 1.8394719362258911, + "learning_rate": 3.0564073194977506e-07, + "loss": 0.8672, + "step": 5435 + }, + { + "epoch": 5.06145251396648, + "grad_norm": 1.7850011587142944, + "learning_rate": 3.0504782926280426e-07, + "loss": 0.8549, + "step": 5436 + }, + { + "epoch": 5.062383612662942, + "grad_norm": 1.877810001373291, + "learning_rate": 3.0445546484369286e-07, + "loss": 0.8579, + "step": 5437 + }, + { + "epoch": 5.063314711359404, + "grad_norm": 1.7908408641815186, + "learning_rate": 3.038636388377067e-07, + "loss": 0.8444, + "step": 5438 + }, + { + "epoch": 5.064245810055866, + "grad_norm": 1.8714141845703125, + "learning_rate": 3.03272351389981e-07, + "loss": 0.8891, + "step": 5439 + }, + { + "epoch": 5.065176908752328, + "grad_norm": 1.8753211498260498, + "learning_rate": 3.026816026455148e-07, + "loss": 0.8949, + "step": 5440 + }, + { + "epoch": 5.06610800744879, + "grad_norm": 1.7906402349472046, + "learning_rate": 3.0209139274917867e-07, + "loss": 0.8617, + "step": 5441 + }, + { + "epoch": 5.067039106145251, + "grad_norm": 1.8123400211334229, + "learning_rate": 3.015017218457089e-07, + "loss": 0.8516, + "step": 5442 + }, + { + "epoch": 5.067970204841713, + "grad_norm": 1.8339799642562866, + "learning_rate": 3.009125900797116e-07, + "loss": 0.8675, + "step": 5443 + }, + { + "epoch": 5.068901303538175, + "grad_norm": 1.7807270288467407, + "learning_rate": 3.003239975956584e-07, + "loss": 0.8492, + "step": 5444 + }, + { + "epoch": 5.069832402234637, + "grad_norm": 1.8044829368591309, + "learning_rate": 2.9973594453788976e-07, + "loss": 0.871, + "step": 5445 + }, + { + "epoch": 5.070763500931099, + "grad_norm": 1.881725788116455, + "learning_rate": 2.9914843105061514e-07, + "loss": 0.8797, + "step": 5446 + }, + { + "epoch": 5.071694599627561, + "grad_norm": 1.7961394786834717, + "learning_rate": 2.9856145727790885e-07, + "loss": 0.8184, + "step": 5447 + }, + { + "epoch": 5.072625698324022, + "grad_norm": 1.8004388809204102, + "learning_rate": 2.979750233637152e-07, + "loss": 0.8077, + "step": 5448 + }, + { + "epoch": 5.073556797020484, + "grad_norm": 1.9360809326171875, + "learning_rate": 2.9738912945184465e-07, + "loss": 0.9063, + "step": 5449 + }, + { + "epoch": 5.074487895716946, + "grad_norm": 1.8166086673736572, + "learning_rate": 2.968037756859776e-07, + "loss": 0.8501, + "step": 5450 + }, + { + "epoch": 5.075418994413408, + "grad_norm": 1.8365291357040405, + "learning_rate": 2.962189622096584e-07, + "loss": 0.8731, + "step": 5451 + }, + { + "epoch": 5.07635009310987, + "grad_norm": 1.86810302734375, + "learning_rate": 2.9563468916630144e-07, + "loss": 0.8741, + "step": 5452 + }, + { + "epoch": 5.077281191806332, + "grad_norm": 1.775658130645752, + "learning_rate": 2.9505095669918796e-07, + "loss": 0.8129, + "step": 5453 + }, + { + "epoch": 5.078212290502793, + "grad_norm": 1.8750908374786377, + "learning_rate": 2.9446776495146656e-07, + "loss": 0.8836, + "step": 5454 + }, + { + "epoch": 5.079143389199255, + "grad_norm": 1.9212899208068848, + "learning_rate": 2.9388511406615333e-07, + "loss": 0.8943, + "step": 5455 + }, + { + "epoch": 5.080074487895717, + "grad_norm": 1.86331045627594, + "learning_rate": 2.933030041861312e-07, + "loss": 0.888, + "step": 5456 + }, + { + "epoch": 5.081005586592179, + "grad_norm": 1.7795807123184204, + "learning_rate": 2.9272143545415245e-07, + "loss": 0.8406, + "step": 5457 + }, + { + "epoch": 5.081936685288641, + "grad_norm": 1.799129843711853, + "learning_rate": 2.9214040801283347e-07, + "loss": 0.8522, + "step": 5458 + }, + { + "epoch": 5.082867783985103, + "grad_norm": 1.7958507537841797, + "learning_rate": 2.9155992200466025e-07, + "loss": 0.8674, + "step": 5459 + }, + { + "epoch": 5.083798882681564, + "grad_norm": 1.789273977279663, + "learning_rate": 2.9097997757198515e-07, + "loss": 0.8176, + "step": 5460 + }, + { + "epoch": 5.084729981378026, + "grad_norm": 1.8099762201309204, + "learning_rate": 2.9040057485702764e-07, + "loss": 0.8349, + "step": 5461 + }, + { + "epoch": 5.0856610800744875, + "grad_norm": 1.785123348236084, + "learning_rate": 2.8982171400187523e-07, + "loss": 0.837, + "step": 5462 + }, + { + "epoch": 5.08659217877095, + "grad_norm": 1.846750259399414, + "learning_rate": 2.8924339514848074e-07, + "loss": 0.8658, + "step": 5463 + }, + { + "epoch": 5.087523277467412, + "grad_norm": 1.8544315099716187, + "learning_rate": 2.8866561843866713e-07, + "loss": 0.8421, + "step": 5464 + }, + { + "epoch": 5.088454376163873, + "grad_norm": 1.828311562538147, + "learning_rate": 2.880883840141208e-07, + "loss": 0.8723, + "step": 5465 + }, + { + "epoch": 5.089385474860335, + "grad_norm": 1.8241746425628662, + "learning_rate": 2.8751169201639756e-07, + "loss": 0.8507, + "step": 5466 + }, + { + "epoch": 5.090316573556797, + "grad_norm": 1.8431018590927124, + "learning_rate": 2.8693554258691943e-07, + "loss": 0.8443, + "step": 5467 + }, + { + "epoch": 5.0912476722532585, + "grad_norm": 1.7918264865875244, + "learning_rate": 2.8635993586697555e-07, + "loss": 0.8833, + "step": 5468 + }, + { + "epoch": 5.092178770949721, + "grad_norm": 1.9013649225234985, + "learning_rate": 2.857848719977216e-07, + "loss": 0.8653, + "step": 5469 + }, + { + "epoch": 5.093109869646183, + "grad_norm": 1.7994608879089355, + "learning_rate": 2.8521035112018063e-07, + "loss": 0.8495, + "step": 5470 + }, + { + "epoch": 5.094040968342644, + "grad_norm": 1.8009556531906128, + "learning_rate": 2.846363733752425e-07, + "loss": 0.8524, + "step": 5471 + }, + { + "epoch": 5.094972067039106, + "grad_norm": 1.8510315418243408, + "learning_rate": 2.840629389036634e-07, + "loss": 0.8665, + "step": 5472 + }, + { + "epoch": 5.095903165735568, + "grad_norm": 1.8891124725341797, + "learning_rate": 2.834900478460667e-07, + "loss": 0.8929, + "step": 5473 + }, + { + "epoch": 5.0968342644320295, + "grad_norm": 1.8537659645080566, + "learning_rate": 2.829177003429426e-07, + "loss": 0.8691, + "step": 5474 + }, + { + "epoch": 5.097765363128492, + "grad_norm": 1.831802487373352, + "learning_rate": 2.823458965346476e-07, + "loss": 0.8456, + "step": 5475 + }, + { + "epoch": 5.098696461824954, + "grad_norm": 1.8554143905639648, + "learning_rate": 2.817746365614049e-07, + "loss": 0.8457, + "step": 5476 + }, + { + "epoch": 5.099627560521415, + "grad_norm": 1.8172563314437866, + "learning_rate": 2.812039205633049e-07, + "loss": 0.9034, + "step": 5477 + }, + { + "epoch": 5.100558659217877, + "grad_norm": 1.8462687730789185, + "learning_rate": 2.80633748680304e-07, + "loss": 0.8995, + "step": 5478 + }, + { + "epoch": 5.101489757914339, + "grad_norm": 1.8126068115234375, + "learning_rate": 2.800641210522256e-07, + "loss": 0.8885, + "step": 5479 + }, + { + "epoch": 5.1024208566108005, + "grad_norm": 1.8060555458068848, + "learning_rate": 2.794950378187591e-07, + "loss": 0.8505, + "step": 5480 + }, + { + "epoch": 5.103351955307263, + "grad_norm": 1.8111025094985962, + "learning_rate": 2.7892649911946095e-07, + "loss": 0.8837, + "step": 5481 + }, + { + "epoch": 5.104283054003725, + "grad_norm": 1.8687167167663574, + "learning_rate": 2.783585050937537e-07, + "loss": 0.8957, + "step": 5482 + }, + { + "epoch": 5.105214152700186, + "grad_norm": 1.8816295862197876, + "learning_rate": 2.777910558809263e-07, + "loss": 0.8691, + "step": 5483 + }, + { + "epoch": 5.106145251396648, + "grad_norm": 1.7675544023513794, + "learning_rate": 2.772241516201349e-07, + "loss": 0.8605, + "step": 5484 + }, + { + "epoch": 5.10707635009311, + "grad_norm": 1.8695759773254395, + "learning_rate": 2.766577924504005e-07, + "loss": 0.8931, + "step": 5485 + }, + { + "epoch": 5.1080074487895715, + "grad_norm": 1.8622584342956543, + "learning_rate": 2.7609197851061196e-07, + "loss": 0.9014, + "step": 5486 + }, + { + "epoch": 5.108938547486034, + "grad_norm": 1.8537383079528809, + "learning_rate": 2.755267099395234e-07, + "loss": 0.8636, + "step": 5487 + }, + { + "epoch": 5.109869646182496, + "grad_norm": 1.824541449546814, + "learning_rate": 2.749619868757558e-07, + "loss": 0.8548, + "step": 5488 + }, + { + "epoch": 5.110800744878957, + "grad_norm": 1.906436562538147, + "learning_rate": 2.743978094577959e-07, + "loss": 0.8972, + "step": 5489 + }, + { + "epoch": 5.111731843575419, + "grad_norm": 1.748932957649231, + "learning_rate": 2.7383417782399696e-07, + "loss": 0.7915, + "step": 5490 + }, + { + "epoch": 5.112662942271881, + "grad_norm": 1.824517846107483, + "learning_rate": 2.732710921125789e-07, + "loss": 0.8737, + "step": 5491 + }, + { + "epoch": 5.1135940409683425, + "grad_norm": 1.761131763458252, + "learning_rate": 2.727085524616255e-07, + "loss": 0.8288, + "step": 5492 + }, + { + "epoch": 5.114525139664805, + "grad_norm": 1.827260136604309, + "learning_rate": 2.721465590090899e-07, + "loss": 0.8625, + "step": 5493 + }, + { + "epoch": 5.115456238361267, + "grad_norm": 1.8775804042816162, + "learning_rate": 2.7158511189278926e-07, + "loss": 0.8948, + "step": 5494 + }, + { + "epoch": 5.116387337057728, + "grad_norm": 1.8295022249221802, + "learning_rate": 2.710242112504069e-07, + "loss": 0.8592, + "step": 5495 + }, + { + "epoch": 5.11731843575419, + "grad_norm": 1.8081015348434448, + "learning_rate": 2.7046385721949256e-07, + "loss": 0.827, + "step": 5496 + }, + { + "epoch": 5.118249534450651, + "grad_norm": 1.823494553565979, + "learning_rate": 2.6990404993746194e-07, + "loss": 0.832, + "step": 5497 + }, + { + "epoch": 5.1191806331471135, + "grad_norm": 1.809525728225708, + "learning_rate": 2.693447895415968e-07, + "loss": 0.8756, + "step": 5498 + }, + { + "epoch": 5.120111731843576, + "grad_norm": 1.875980019569397, + "learning_rate": 2.6878607616904334e-07, + "loss": 0.8923, + "step": 5499 + }, + { + "epoch": 5.121042830540037, + "grad_norm": 1.978255033493042, + "learning_rate": 2.6822790995681583e-07, + "loss": 0.9004, + "step": 5500 + }, + { + "epoch": 5.121973929236499, + "grad_norm": 1.8139119148254395, + "learning_rate": 2.6767029104179304e-07, + "loss": 0.8878, + "step": 5501 + }, + { + "epoch": 5.122905027932961, + "grad_norm": 1.8909567594528198, + "learning_rate": 2.671132195607198e-07, + "loss": 0.8882, + "step": 5502 + }, + { + "epoch": 5.123836126629422, + "grad_norm": 1.8698506355285645, + "learning_rate": 2.665566956502069e-07, + "loss": 0.8759, + "step": 5503 + }, + { + "epoch": 5.1247672253258845, + "grad_norm": 1.8118741512298584, + "learning_rate": 2.6600071944673014e-07, + "loss": 0.8522, + "step": 5504 + }, + { + "epoch": 5.125698324022347, + "grad_norm": 1.7702304124832153, + "learning_rate": 2.6544529108663204e-07, + "loss": 0.8771, + "step": 5505 + }, + { + "epoch": 5.126629422718808, + "grad_norm": 1.7991243600845337, + "learning_rate": 2.6489041070611927e-07, + "loss": 0.8799, + "step": 5506 + }, + { + "epoch": 5.12756052141527, + "grad_norm": 1.8362014293670654, + "learning_rate": 2.64336078441266e-07, + "loss": 0.8736, + "step": 5507 + }, + { + "epoch": 5.128491620111732, + "grad_norm": 1.9170562028884888, + "learning_rate": 2.6378229442801163e-07, + "loss": 0.8787, + "step": 5508 + }, + { + "epoch": 5.129422718808193, + "grad_norm": 1.8220125436782837, + "learning_rate": 2.632290588021588e-07, + "loss": 0.8677, + "step": 5509 + }, + { + "epoch": 5.1303538175046555, + "grad_norm": 1.7650326490402222, + "learning_rate": 2.62676371699378e-07, + "loss": 0.8578, + "step": 5510 + }, + { + "epoch": 5.131284916201118, + "grad_norm": 1.8190217018127441, + "learning_rate": 2.621242332552057e-07, + "loss": 0.8841, + "step": 5511 + }, + { + "epoch": 5.132216014897579, + "grad_norm": 1.7713018655776978, + "learning_rate": 2.6157264360504224e-07, + "loss": 0.8372, + "step": 5512 + }, + { + "epoch": 5.133147113594041, + "grad_norm": 1.78468918800354, + "learning_rate": 2.6102160288415277e-07, + "loss": 0.8623, + "step": 5513 + }, + { + "epoch": 5.134078212290503, + "grad_norm": 1.8052679300308228, + "learning_rate": 2.6047111122767034e-07, + "loss": 0.8818, + "step": 5514 + }, + { + "epoch": 5.135009310986964, + "grad_norm": 1.8052629232406616, + "learning_rate": 2.599211687705916e-07, + "loss": 0.8665, + "step": 5515 + }, + { + "epoch": 5.1359404096834265, + "grad_norm": 1.826066255569458, + "learning_rate": 2.593717756477787e-07, + "loss": 0.8767, + "step": 5516 + }, + { + "epoch": 5.136871508379889, + "grad_norm": 1.7910984754562378, + "learning_rate": 2.5882293199395836e-07, + "loss": 0.8517, + "step": 5517 + }, + { + "epoch": 5.13780260707635, + "grad_norm": 1.9318630695343018, + "learning_rate": 2.582746379437251e-07, + "loss": 0.8784, + "step": 5518 + }, + { + "epoch": 5.138733705772812, + "grad_norm": 1.8510595560073853, + "learning_rate": 2.577268936315366e-07, + "loss": 0.8701, + "step": 5519 + }, + { + "epoch": 5.139664804469274, + "grad_norm": 1.7714552879333496, + "learning_rate": 2.5717969919171553e-07, + "loss": 0.8563, + "step": 5520 + }, + { + "epoch": 5.140595903165735, + "grad_norm": 1.8220475912094116, + "learning_rate": 2.566330547584497e-07, + "loss": 0.8839, + "step": 5521 + }, + { + "epoch": 5.1415270018621975, + "grad_norm": 1.8357771635055542, + "learning_rate": 2.560869604657948e-07, + "loss": 0.8202, + "step": 5522 + }, + { + "epoch": 5.14245810055866, + "grad_norm": 1.801833987236023, + "learning_rate": 2.5554141644766774e-07, + "loss": 0.8672, + "step": 5523 + }, + { + "epoch": 5.143389199255121, + "grad_norm": 1.8583965301513672, + "learning_rate": 2.549964228378518e-07, + "loss": 0.8718, + "step": 5524 + }, + { + "epoch": 5.144320297951583, + "grad_norm": 1.8148066997528076, + "learning_rate": 2.5445197976999735e-07, + "loss": 0.8708, + "step": 5525 + }, + { + "epoch": 5.145251396648045, + "grad_norm": 1.7894881963729858, + "learning_rate": 2.5390808737761785e-07, + "loss": 0.7989, + "step": 5526 + }, + { + "epoch": 5.146182495344506, + "grad_norm": 1.904588222503662, + "learning_rate": 2.5336474579409074e-07, + "loss": 0.9013, + "step": 5527 + }, + { + "epoch": 5.1471135940409685, + "grad_norm": 1.8083937168121338, + "learning_rate": 2.5282195515266007e-07, + "loss": 0.8717, + "step": 5528 + }, + { + "epoch": 5.148044692737431, + "grad_norm": 1.852403998374939, + "learning_rate": 2.522797155864354e-07, + "loss": 0.9021, + "step": 5529 + }, + { + "epoch": 5.148975791433892, + "grad_norm": 1.813109040260315, + "learning_rate": 2.517380272283887e-07, + "loss": 0.8306, + "step": 5530 + }, + { + "epoch": 5.149906890130354, + "grad_norm": 1.7569769620895386, + "learning_rate": 2.5119689021135846e-07, + "loss": 0.8281, + "step": 5531 + }, + { + "epoch": 5.150837988826815, + "grad_norm": 1.8379690647125244, + "learning_rate": 2.50656304668048e-07, + "loss": 0.8677, + "step": 5532 + }, + { + "epoch": 5.151769087523277, + "grad_norm": 1.8273141384124756, + "learning_rate": 2.5011627073102563e-07, + "loss": 0.8778, + "step": 5533 + }, + { + "epoch": 5.1527001862197395, + "grad_norm": 1.860378384590149, + "learning_rate": 2.4957678853272246e-07, + "loss": 0.8996, + "step": 5534 + }, + { + "epoch": 5.153631284916201, + "grad_norm": 1.9000273942947388, + "learning_rate": 2.490378582054359e-07, + "loss": 0.9069, + "step": 5535 + }, + { + "epoch": 5.154562383612663, + "grad_norm": 1.8875058889389038, + "learning_rate": 2.4849947988132894e-07, + "loss": 0.8734, + "step": 5536 + }, + { + "epoch": 5.155493482309125, + "grad_norm": 1.8373746871948242, + "learning_rate": 2.47961653692427e-07, + "loss": 0.8459, + "step": 5537 + }, + { + "epoch": 5.156424581005586, + "grad_norm": 1.8489917516708374, + "learning_rate": 2.4742437977062134e-07, + "loss": 0.8736, + "step": 5538 + }, + { + "epoch": 5.157355679702048, + "grad_norm": 1.8276985883712769, + "learning_rate": 2.468876582476673e-07, + "loss": 0.8836, + "step": 5539 + }, + { + "epoch": 5.1582867783985105, + "grad_norm": 1.8339565992355347, + "learning_rate": 2.4635148925518576e-07, + "loss": 0.8869, + "step": 5540 + }, + { + "epoch": 5.159217877094972, + "grad_norm": 1.8088316917419434, + "learning_rate": 2.458158729246607e-07, + "loss": 0.858, + "step": 5541 + }, + { + "epoch": 5.160148975791434, + "grad_norm": 1.882627248764038, + "learning_rate": 2.452808093874412e-07, + "loss": 0.8737, + "step": 5542 + }, + { + "epoch": 5.161080074487896, + "grad_norm": 1.8557496070861816, + "learning_rate": 2.4474629877474206e-07, + "loss": 0.8938, + "step": 5543 + }, + { + "epoch": 5.162011173184357, + "grad_norm": 1.776591420173645, + "learning_rate": 2.442123412176398e-07, + "loss": 0.8488, + "step": 5544 + }, + { + "epoch": 5.162942271880819, + "grad_norm": 1.8062365055084229, + "learning_rate": 2.436789368470771e-07, + "loss": 0.8398, + "step": 5545 + }, + { + "epoch": 5.1638733705772815, + "grad_norm": 1.7667319774627686, + "learning_rate": 2.4314608579386113e-07, + "loss": 0.8336, + "step": 5546 + }, + { + "epoch": 5.164804469273743, + "grad_norm": 1.821705937385559, + "learning_rate": 2.4261378818866256e-07, + "loss": 0.8874, + "step": 5547 + }, + { + "epoch": 5.165735567970205, + "grad_norm": 1.8624881505966187, + "learning_rate": 2.420820441620167e-07, + "loss": 0.9236, + "step": 5548 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 1.8150385618209839, + "learning_rate": 2.4155085384432314e-07, + "loss": 0.8762, + "step": 5549 + }, + { + "epoch": 5.167597765363128, + "grad_norm": 1.821459174156189, + "learning_rate": 2.410202173658457e-07, + "loss": 0.8724, + "step": 5550 + }, + { + "epoch": 5.16852886405959, + "grad_norm": 1.8151764869689941, + "learning_rate": 2.4049013485671206e-07, + "loss": 0.8452, + "step": 5551 + }, + { + "epoch": 5.1694599627560525, + "grad_norm": 1.8244167566299438, + "learning_rate": 2.399606064469143e-07, + "loss": 0.8581, + "step": 5552 + }, + { + "epoch": 5.170391061452514, + "grad_norm": 1.8199540376663208, + "learning_rate": 2.394316322663087e-07, + "loss": 0.8341, + "step": 5553 + }, + { + "epoch": 5.171322160148976, + "grad_norm": 1.8278475999832153, + "learning_rate": 2.3890321244461586e-07, + "loss": 0.8695, + "step": 5554 + }, + { + "epoch": 5.172253258845438, + "grad_norm": 1.8635401725769043, + "learning_rate": 2.3837534711141946e-07, + "loss": 0.8737, + "step": 5555 + }, + { + "epoch": 5.173184357541899, + "grad_norm": 1.838105320930481, + "learning_rate": 2.3784803639616856e-07, + "loss": 0.8664, + "step": 5556 + }, + { + "epoch": 5.174115456238361, + "grad_norm": 1.8818881511688232, + "learning_rate": 2.3732128042817494e-07, + "loss": 0.8625, + "step": 5557 + }, + { + "epoch": 5.1750465549348235, + "grad_norm": 1.8053056001663208, + "learning_rate": 2.36795079336615e-07, + "loss": 0.8343, + "step": 5558 + }, + { + "epoch": 5.175977653631285, + "grad_norm": 1.850576639175415, + "learning_rate": 2.3626943325052915e-07, + "loss": 0.8922, + "step": 5559 + }, + { + "epoch": 5.176908752327747, + "grad_norm": 1.8420586585998535, + "learning_rate": 2.357443422988215e-07, + "loss": 0.9098, + "step": 5560 + }, + { + "epoch": 5.177839851024209, + "grad_norm": 1.8008335828781128, + "learning_rate": 2.3521980661025984e-07, + "loss": 0.8662, + "step": 5561 + }, + { + "epoch": 5.17877094972067, + "grad_norm": 1.799306035041809, + "learning_rate": 2.346958263134763e-07, + "loss": 0.8735, + "step": 5562 + }, + { + "epoch": 5.179702048417132, + "grad_norm": 1.8064424991607666, + "learning_rate": 2.341724015369662e-07, + "loss": 0.8367, + "step": 5563 + }, + { + "epoch": 5.1806331471135945, + "grad_norm": 1.8104950189590454, + "learning_rate": 2.3364953240908928e-07, + "loss": 0.8853, + "step": 5564 + }, + { + "epoch": 5.181564245810056, + "grad_norm": 1.8647595643997192, + "learning_rate": 2.3312721905806828e-07, + "loss": 0.8516, + "step": 5565 + }, + { + "epoch": 5.182495344506518, + "grad_norm": 1.808050274848938, + "learning_rate": 2.3260546161199048e-07, + "loss": 0.8723, + "step": 5566 + }, + { + "epoch": 5.183426443202979, + "grad_norm": 1.8571369647979736, + "learning_rate": 2.3208426019880575e-07, + "loss": 0.8183, + "step": 5567 + }, + { + "epoch": 5.184357541899441, + "grad_norm": 1.7758479118347168, + "learning_rate": 2.3156361494632906e-07, + "loss": 0.8573, + "step": 5568 + }, + { + "epoch": 5.185288640595903, + "grad_norm": 1.8457156419754028, + "learning_rate": 2.3104352598223744e-07, + "loss": 0.8689, + "step": 5569 + }, + { + "epoch": 5.186219739292365, + "grad_norm": 1.796854019165039, + "learning_rate": 2.305239934340728e-07, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 5.187150837988827, + "grad_norm": 2.1376373767852783, + "learning_rate": 2.300050174292398e-07, + "loss": 0.8597, + "step": 5571 + }, + { + "epoch": 5.188081936685289, + "grad_norm": 1.8045355081558228, + "learning_rate": 2.2948659809500702e-07, + "loss": 0.8517, + "step": 5572 + }, + { + "epoch": 5.18901303538175, + "grad_norm": 1.817205548286438, + "learning_rate": 2.28968735558506e-07, + "loss": 0.8594, + "step": 5573 + }, + { + "epoch": 5.189944134078212, + "grad_norm": 1.8470455408096313, + "learning_rate": 2.2845142994673246e-07, + "loss": 0.8917, + "step": 5574 + }, + { + "epoch": 5.190875232774674, + "grad_norm": 1.8841384649276733, + "learning_rate": 2.2793468138654517e-07, + "loss": 0.8631, + "step": 5575 + }, + { + "epoch": 5.191806331471136, + "grad_norm": 1.7934656143188477, + "learning_rate": 2.274184900046661e-07, + "loss": 0.8289, + "step": 5576 + }, + { + "epoch": 5.192737430167598, + "grad_norm": 1.8413658142089844, + "learning_rate": 2.2690285592768148e-07, + "loss": 0.8972, + "step": 5577 + }, + { + "epoch": 5.19366852886406, + "grad_norm": 1.8960078954696655, + "learning_rate": 2.263877792820385e-07, + "loss": 0.8898, + "step": 5578 + }, + { + "epoch": 5.194599627560521, + "grad_norm": 1.864646315574646, + "learning_rate": 2.2587326019405108e-07, + "loss": 0.8868, + "step": 5579 + }, + { + "epoch": 5.195530726256983, + "grad_norm": 1.8086352348327637, + "learning_rate": 2.2535929878989422e-07, + "loss": 0.8645, + "step": 5580 + }, + { + "epoch": 5.196461824953445, + "grad_norm": 1.949941635131836, + "learning_rate": 2.2484589519560645e-07, + "loss": 0.8765, + "step": 5581 + }, + { + "epoch": 5.197392923649907, + "grad_norm": 1.931442379951477, + "learning_rate": 2.243330495370896e-07, + "loss": 0.9163, + "step": 5582 + }, + { + "epoch": 5.198324022346369, + "grad_norm": 1.802750825881958, + "learning_rate": 2.2382076194010893e-07, + "loss": 0.8529, + "step": 5583 + }, + { + "epoch": 5.199255121042831, + "grad_norm": 1.785375714302063, + "learning_rate": 2.2330903253029306e-07, + "loss": 0.859, + "step": 5584 + }, + { + "epoch": 5.200186219739292, + "grad_norm": 1.8155770301818848, + "learning_rate": 2.2279786143313192e-07, + "loss": 0.9178, + "step": 5585 + }, + { + "epoch": 5.201117318435754, + "grad_norm": 1.8184212446212769, + "learning_rate": 2.2228724877398134e-07, + "loss": 0.8806, + "step": 5586 + }, + { + "epoch": 5.202048417132216, + "grad_norm": 1.8158317804336548, + "learning_rate": 2.2177719467805835e-07, + "loss": 0.8672, + "step": 5587 + }, + { + "epoch": 5.202979515828678, + "grad_norm": 1.8042694330215454, + "learning_rate": 2.212676992704435e-07, + "loss": 0.8701, + "step": 5588 + }, + { + "epoch": 5.20391061452514, + "grad_norm": 1.8759335279464722, + "learning_rate": 2.2075876267608016e-07, + "loss": 0.8707, + "step": 5589 + }, + { + "epoch": 5.204841713221602, + "grad_norm": 1.881545901298523, + "learning_rate": 2.2025038501977485e-07, + "loss": 0.8579, + "step": 5590 + }, + { + "epoch": 5.205772811918063, + "grad_norm": 1.8436025381088257, + "learning_rate": 2.1974256642619736e-07, + "loss": 0.8874, + "step": 5591 + }, + { + "epoch": 5.206703910614525, + "grad_norm": 1.8864734172821045, + "learning_rate": 2.1923530701987882e-07, + "loss": 0.8606, + "step": 5592 + }, + { + "epoch": 5.207635009310987, + "grad_norm": 1.8368639945983887, + "learning_rate": 2.1872860692521565e-07, + "loss": 0.8937, + "step": 5593 + }, + { + "epoch": 5.208566108007449, + "grad_norm": 1.8785499334335327, + "learning_rate": 2.1822246626646532e-07, + "loss": 0.8996, + "step": 5594 + }, + { + "epoch": 5.209497206703911, + "grad_norm": 1.805644154548645, + "learning_rate": 2.1771688516774913e-07, + "loss": 0.8605, + "step": 5595 + }, + { + "epoch": 5.210428305400373, + "grad_norm": 1.7644168138504028, + "learning_rate": 2.172118637530493e-07, + "loss": 0.8227, + "step": 5596 + }, + { + "epoch": 5.211359404096834, + "grad_norm": 1.8062853813171387, + "learning_rate": 2.1670740214621373e-07, + "loss": 0.8802, + "step": 5597 + }, + { + "epoch": 5.212290502793296, + "grad_norm": 1.7668213844299316, + "learning_rate": 2.1620350047095117e-07, + "loss": 0.8537, + "step": 5598 + }, + { + "epoch": 5.213221601489758, + "grad_norm": 1.8006895780563354, + "learning_rate": 2.1570015885083228e-07, + "loss": 0.8784, + "step": 5599 + }, + { + "epoch": 5.21415270018622, + "grad_norm": 1.8297566175460815, + "learning_rate": 2.1519737740929276e-07, + "loss": 0.8533, + "step": 5600 + }, + { + "epoch": 5.215083798882682, + "grad_norm": 1.8320248126983643, + "learning_rate": 2.1469515626962895e-07, + "loss": 0.8637, + "step": 5601 + }, + { + "epoch": 5.216014897579143, + "grad_norm": 1.8250216245651245, + "learning_rate": 2.1419349555500125e-07, + "loss": 0.8972, + "step": 5602 + }, + { + "epoch": 5.216945996275605, + "grad_norm": 1.8227274417877197, + "learning_rate": 2.136923953884304e-07, + "loss": 0.9336, + "step": 5603 + }, + { + "epoch": 5.217877094972067, + "grad_norm": 1.9002243280410767, + "learning_rate": 2.131918558928023e-07, + "loss": 0.8964, + "step": 5604 + }, + { + "epoch": 5.218808193668528, + "grad_norm": 1.814197301864624, + "learning_rate": 2.1269187719086454e-07, + "loss": 0.8602, + "step": 5605 + }, + { + "epoch": 5.219739292364991, + "grad_norm": 1.7803332805633545, + "learning_rate": 2.1219245940522548e-07, + "loss": 0.8767, + "step": 5606 + }, + { + "epoch": 5.220670391061453, + "grad_norm": 1.7914694547653198, + "learning_rate": 2.116936026583577e-07, + "loss": 0.8292, + "step": 5607 + }, + { + "epoch": 5.221601489757914, + "grad_norm": 1.82744300365448, + "learning_rate": 2.1119530707259695e-07, + "loss": 0.8742, + "step": 5608 + }, + { + "epoch": 5.222532588454376, + "grad_norm": 1.83793306350708, + "learning_rate": 2.106975727701391e-07, + "loss": 0.8677, + "step": 5609 + }, + { + "epoch": 5.223463687150838, + "grad_norm": 1.8640384674072266, + "learning_rate": 2.1020039987304285e-07, + "loss": 0.8609, + "step": 5610 + }, + { + "epoch": 5.224394785847299, + "grad_norm": 1.8460770845413208, + "learning_rate": 2.097037885032316e-07, + "loss": 0.885, + "step": 5611 + }, + { + "epoch": 5.225325884543762, + "grad_norm": 1.7314318418502808, + "learning_rate": 2.092077387824884e-07, + "loss": 0.8424, + "step": 5612 + }, + { + "epoch": 5.226256983240224, + "grad_norm": 1.8478906154632568, + "learning_rate": 2.0871225083245933e-07, + "loss": 0.8671, + "step": 5613 + }, + { + "epoch": 5.227188081936685, + "grad_norm": 1.853240728378296, + "learning_rate": 2.0821732477465245e-07, + "loss": 0.8682, + "step": 5614 + }, + { + "epoch": 5.228119180633147, + "grad_norm": 1.8116685152053833, + "learning_rate": 2.0772296073043984e-07, + "loss": 0.8688, + "step": 5615 + }, + { + "epoch": 5.229050279329609, + "grad_norm": 1.808295726776123, + "learning_rate": 2.0722915882105288e-07, + "loss": 0.8537, + "step": 5616 + }, + { + "epoch": 5.22998137802607, + "grad_norm": 1.8658608198165894, + "learning_rate": 2.0673591916758717e-07, + "loss": 0.8844, + "step": 5617 + }, + { + "epoch": 5.230912476722533, + "grad_norm": 1.7789552211761475, + "learning_rate": 2.062432418909993e-07, + "loss": 0.8534, + "step": 5618 + }, + { + "epoch": 5.231843575418995, + "grad_norm": 1.7944704294204712, + "learning_rate": 2.0575112711210988e-07, + "loss": 0.8397, + "step": 5619 + }, + { + "epoch": 5.232774674115456, + "grad_norm": 1.8074396848678589, + "learning_rate": 2.05259574951599e-07, + "loss": 0.8646, + "step": 5620 + }, + { + "epoch": 5.233705772811918, + "grad_norm": 1.829878330230713, + "learning_rate": 2.047685855300094e-07, + "loss": 0.9038, + "step": 5621 + }, + { + "epoch": 5.23463687150838, + "grad_norm": 1.889670491218567, + "learning_rate": 2.0427815896774784e-07, + "loss": 0.8464, + "step": 5622 + }, + { + "epoch": 5.235567970204841, + "grad_norm": 1.7938532829284668, + "learning_rate": 2.0378829538508062e-07, + "loss": 0.8736, + "step": 5623 + }, + { + "epoch": 5.236499068901304, + "grad_norm": 1.9136334657669067, + "learning_rate": 2.032989949021369e-07, + "loss": 0.9113, + "step": 5624 + }, + { + "epoch": 5.237430167597766, + "grad_norm": 1.8047664165496826, + "learning_rate": 2.0281025763890767e-07, + "loss": 0.873, + "step": 5625 + }, + { + "epoch": 5.238361266294227, + "grad_norm": 1.9355945587158203, + "learning_rate": 2.023220837152473e-07, + "loss": 0.8287, + "step": 5626 + }, + { + "epoch": 5.239292364990689, + "grad_norm": 1.9562362432479858, + "learning_rate": 2.0183447325086898e-07, + "loss": 0.9056, + "step": 5627 + }, + { + "epoch": 5.240223463687151, + "grad_norm": 1.8980822563171387, + "learning_rate": 2.013474263653495e-07, + "loss": 0.8925, + "step": 5628 + }, + { + "epoch": 5.241154562383612, + "grad_norm": 1.8333706855773926, + "learning_rate": 2.0086094317812859e-07, + "loss": 0.8578, + "step": 5629 + }, + { + "epoch": 5.242085661080075, + "grad_norm": 1.8562443256378174, + "learning_rate": 2.0037502380850533e-07, + "loss": 0.8844, + "step": 5630 + }, + { + "epoch": 5.243016759776537, + "grad_norm": 1.8450020551681519, + "learning_rate": 1.9988966837564184e-07, + "loss": 0.8587, + "step": 5631 + }, + { + "epoch": 5.243947858472998, + "grad_norm": 1.789536476135254, + "learning_rate": 1.9940487699856153e-07, + "loss": 0.8075, + "step": 5632 + }, + { + "epoch": 5.24487895716946, + "grad_norm": 1.9112567901611328, + "learning_rate": 1.98920649796151e-07, + "loss": 0.9269, + "step": 5633 + }, + { + "epoch": 5.245810055865922, + "grad_norm": 1.8516530990600586, + "learning_rate": 1.9843698688715607e-07, + "loss": 0.8734, + "step": 5634 + }, + { + "epoch": 5.246741154562383, + "grad_norm": 1.9026501178741455, + "learning_rate": 1.9795388839018548e-07, + "loss": 0.8833, + "step": 5635 + }, + { + "epoch": 5.247672253258846, + "grad_norm": 1.8442684412002563, + "learning_rate": 1.9747135442370946e-07, + "loss": 0.8567, + "step": 5636 + }, + { + "epoch": 5.248603351955307, + "grad_norm": 1.8109804391860962, + "learning_rate": 1.9698938510606002e-07, + "loss": 0.8403, + "step": 5637 + }, + { + "epoch": 5.249534450651769, + "grad_norm": 1.8949041366577148, + "learning_rate": 1.9650798055543014e-07, + "loss": 0.8773, + "step": 5638 + }, + { + "epoch": 5.250465549348231, + "grad_norm": 1.8307126760482788, + "learning_rate": 1.9602714088987478e-07, + "loss": 0.8746, + "step": 5639 + }, + { + "epoch": 5.251396648044693, + "grad_norm": 1.837146282196045, + "learning_rate": 1.9554686622730996e-07, + "loss": 0.8923, + "step": 5640 + }, + { + "epoch": 5.252327746741154, + "grad_norm": 1.8622758388519287, + "learning_rate": 1.950671566855139e-07, + "loss": 0.8895, + "step": 5641 + }, + { + "epoch": 5.253258845437617, + "grad_norm": 1.805173635482788, + "learning_rate": 1.9458801238212506e-07, + "loss": 0.8335, + "step": 5642 + }, + { + "epoch": 5.254189944134078, + "grad_norm": 1.7952090501785278, + "learning_rate": 1.9410943343464439e-07, + "loss": 0.8459, + "step": 5643 + }, + { + "epoch": 5.25512104283054, + "grad_norm": 1.8102829456329346, + "learning_rate": 1.9363141996043362e-07, + "loss": 0.8584, + "step": 5644 + }, + { + "epoch": 5.256052141527002, + "grad_norm": 1.7681130170822144, + "learning_rate": 1.9315397207671587e-07, + "loss": 0.8714, + "step": 5645 + }, + { + "epoch": 5.256983240223463, + "grad_norm": 1.833443522453308, + "learning_rate": 1.9267708990057586e-07, + "loss": 0.855, + "step": 5646 + }, + { + "epoch": 5.257914338919925, + "grad_norm": 1.8147053718566895, + "learning_rate": 1.9220077354895894e-07, + "loss": 0.8618, + "step": 5647 + }, + { + "epoch": 5.258845437616388, + "grad_norm": 1.7715224027633667, + "learning_rate": 1.9172502313867248e-07, + "loss": 0.8522, + "step": 5648 + }, + { + "epoch": 5.259776536312849, + "grad_norm": 1.838584065437317, + "learning_rate": 1.9124983878638453e-07, + "loss": 0.8637, + "step": 5649 + }, + { + "epoch": 5.260707635009311, + "grad_norm": 1.9571820497512817, + "learning_rate": 1.907752206086247e-07, + "loss": 0.9294, + "step": 5650 + }, + { + "epoch": 5.261638733705773, + "grad_norm": 1.8208258152008057, + "learning_rate": 1.9030116872178317e-07, + "loss": 0.8617, + "step": 5651 + }, + { + "epoch": 5.262569832402234, + "grad_norm": 1.7992171049118042, + "learning_rate": 1.89827683242112e-07, + "loss": 0.8848, + "step": 5652 + }, + { + "epoch": 5.263500931098696, + "grad_norm": 2.673541307449341, + "learning_rate": 1.893547642857238e-07, + "loss": 0.8668, + "step": 5653 + }, + { + "epoch": 5.264432029795159, + "grad_norm": 1.8477216958999634, + "learning_rate": 1.8888241196859225e-07, + "loss": 0.8769, + "step": 5654 + }, + { + "epoch": 5.26536312849162, + "grad_norm": 1.864316463470459, + "learning_rate": 1.8841062640655244e-07, + "loss": 0.8744, + "step": 5655 + }, + { + "epoch": 5.266294227188082, + "grad_norm": 1.8753626346588135, + "learning_rate": 1.8793940771530044e-07, + "loss": 0.9001, + "step": 5656 + }, + { + "epoch": 5.267225325884544, + "grad_norm": 1.9373072385787964, + "learning_rate": 1.87468756010393e-07, + "loss": 0.9083, + "step": 5657 + }, + { + "epoch": 5.268156424581005, + "grad_norm": 1.7712498903274536, + "learning_rate": 1.8699867140724803e-07, + "loss": 0.8606, + "step": 5658 + }, + { + "epoch": 5.269087523277467, + "grad_norm": 1.7829817533493042, + "learning_rate": 1.8652915402114418e-07, + "loss": 0.8507, + "step": 5659 + }, + { + "epoch": 5.27001862197393, + "grad_norm": 1.8625760078430176, + "learning_rate": 1.860602039672213e-07, + "loss": 0.877, + "step": 5660 + }, + { + "epoch": 5.270949720670391, + "grad_norm": 1.8658002614974976, + "learning_rate": 1.8559182136047988e-07, + "loss": 0.8813, + "step": 5661 + }, + { + "epoch": 5.271880819366853, + "grad_norm": 1.765851616859436, + "learning_rate": 1.8512400631578138e-07, + "loss": 0.8034, + "step": 5662 + }, + { + "epoch": 5.272811918063315, + "grad_norm": 1.894692301750183, + "learning_rate": 1.846567589478479e-07, + "loss": 0.9064, + "step": 5663 + }, + { + "epoch": 5.273743016759776, + "grad_norm": 1.8007086515426636, + "learning_rate": 1.8419007937126254e-07, + "loss": 0.8355, + "step": 5664 + }, + { + "epoch": 5.274674115456238, + "grad_norm": 1.8173516988754272, + "learning_rate": 1.837239677004693e-07, + "loss": 0.8885, + "step": 5665 + }, + { + "epoch": 5.275605214152701, + "grad_norm": 1.786285161972046, + "learning_rate": 1.8325842404977228e-07, + "loss": 0.8704, + "step": 5666 + }, + { + "epoch": 5.276536312849162, + "grad_norm": 1.7949508428573608, + "learning_rate": 1.8279344853333713e-07, + "loss": 0.8779, + "step": 5667 + }, + { + "epoch": 5.277467411545624, + "grad_norm": 1.7903894186019897, + "learning_rate": 1.823290412651893e-07, + "loss": 0.8659, + "step": 5668 + }, + { + "epoch": 5.278398510242086, + "grad_norm": 1.8247913122177124, + "learning_rate": 1.8186520235921546e-07, + "loss": 0.8857, + "step": 5669 + }, + { + "epoch": 5.279329608938547, + "grad_norm": 1.8549444675445557, + "learning_rate": 1.814019319291635e-07, + "loss": 0.8587, + "step": 5670 + }, + { + "epoch": 5.280260707635009, + "grad_norm": 1.8179452419281006, + "learning_rate": 1.809392300886395e-07, + "loss": 0.8649, + "step": 5671 + }, + { + "epoch": 5.281191806331471, + "grad_norm": 1.8239386081695557, + "learning_rate": 1.8047709695111327e-07, + "loss": 0.8459, + "step": 5672 + }, + { + "epoch": 5.282122905027933, + "grad_norm": 1.886542558670044, + "learning_rate": 1.8001553262991327e-07, + "loss": 0.9, + "step": 5673 + }, + { + "epoch": 5.283054003724395, + "grad_norm": 1.8134373426437378, + "learning_rate": 1.7955453723822903e-07, + "loss": 0.897, + "step": 5674 + }, + { + "epoch": 5.283985102420857, + "grad_norm": 1.874125599861145, + "learning_rate": 1.790941108891095e-07, + "loss": 0.8443, + "step": 5675 + }, + { + "epoch": 5.284916201117318, + "grad_norm": 1.8224263191223145, + "learning_rate": 1.7863425369546606e-07, + "loss": 0.8927, + "step": 5676 + }, + { + "epoch": 5.28584729981378, + "grad_norm": 1.76023268699646, + "learning_rate": 1.781749657700693e-07, + "loss": 0.847, + "step": 5677 + }, + { + "epoch": 5.286778398510242, + "grad_norm": 1.838577389717102, + "learning_rate": 1.777162472255492e-07, + "loss": 0.8982, + "step": 5678 + }, + { + "epoch": 5.287709497206704, + "grad_norm": 1.854857087135315, + "learning_rate": 1.772580981743985e-07, + "loss": 0.9061, + "step": 5679 + }, + { + "epoch": 5.288640595903166, + "grad_norm": 1.8174827098846436, + "learning_rate": 1.768005187289687e-07, + "loss": 0.8963, + "step": 5680 + }, + { + "epoch": 5.289571694599627, + "grad_norm": 1.7759593725204468, + "learning_rate": 1.763435090014723e-07, + "loss": 0.8731, + "step": 5681 + }, + { + "epoch": 5.290502793296089, + "grad_norm": 1.8693389892578125, + "learning_rate": 1.7588706910398045e-07, + "loss": 0.8704, + "step": 5682 + }, + { + "epoch": 5.291433891992551, + "grad_norm": 1.8118977546691895, + "learning_rate": 1.7543119914842726e-07, + "loss": 0.8591, + "step": 5683 + }, + { + "epoch": 5.292364990689013, + "grad_norm": 1.860899567604065, + "learning_rate": 1.749758992466055e-07, + "loss": 0.9184, + "step": 5684 + }, + { + "epoch": 5.293296089385475, + "grad_norm": 1.7810683250427246, + "learning_rate": 1.74521169510167e-07, + "loss": 0.8789, + "step": 5685 + }, + { + "epoch": 5.294227188081937, + "grad_norm": 1.8945982456207275, + "learning_rate": 1.740670100506267e-07, + "loss": 0.8692, + "step": 5686 + }, + { + "epoch": 5.295158286778398, + "grad_norm": 1.9001511335372925, + "learning_rate": 1.7361342097935717e-07, + "loss": 0.8891, + "step": 5687 + }, + { + "epoch": 5.29608938547486, + "grad_norm": 1.8172762393951416, + "learning_rate": 1.731604024075928e-07, + "loss": 0.8903, + "step": 5688 + }, + { + "epoch": 5.297020484171322, + "grad_norm": 1.8063994646072388, + "learning_rate": 1.727079544464258e-07, + "loss": 0.8669, + "step": 5689 + }, + { + "epoch": 5.297951582867784, + "grad_norm": 1.8083016872406006, + "learning_rate": 1.7225607720681132e-07, + "loss": 0.8669, + "step": 5690 + }, + { + "epoch": 5.298882681564246, + "grad_norm": 1.8618804216384888, + "learning_rate": 1.7180477079956293e-07, + "loss": 0.9041, + "step": 5691 + }, + { + "epoch": 5.299813780260708, + "grad_norm": 1.8071315288543701, + "learning_rate": 1.7135403533535406e-07, + "loss": 0.8567, + "step": 5692 + }, + { + "epoch": 5.300744878957169, + "grad_norm": 1.854587197303772, + "learning_rate": 1.7090387092471795e-07, + "loss": 0.8932, + "step": 5693 + }, + { + "epoch": 5.301675977653631, + "grad_norm": 1.8421626091003418, + "learning_rate": 1.704542776780496e-07, + "loss": 0.8854, + "step": 5694 + }, + { + "epoch": 5.302607076350093, + "grad_norm": 1.8113654851913452, + "learning_rate": 1.7000525570560284e-07, + "loss": 0.8715, + "step": 5695 + }, + { + "epoch": 5.303538175046555, + "grad_norm": 1.8999992609024048, + "learning_rate": 1.695568051174895e-07, + "loss": 0.849, + "step": 5696 + }, + { + "epoch": 5.304469273743017, + "grad_norm": 1.7969335317611694, + "learning_rate": 1.6910892602368474e-07, + "loss": 0.8654, + "step": 5697 + }, + { + "epoch": 5.305400372439479, + "grad_norm": 1.805788278579712, + "learning_rate": 1.6866161853402174e-07, + "loss": 0.8687, + "step": 5698 + }, + { + "epoch": 5.30633147113594, + "grad_norm": 1.8328217267990112, + "learning_rate": 1.6821488275819282e-07, + "loss": 0.8688, + "step": 5699 + }, + { + "epoch": 5.307262569832402, + "grad_norm": 1.8140062093734741, + "learning_rate": 1.6776871880575085e-07, + "loss": 0.8655, + "step": 5700 + }, + { + "epoch": 5.308193668528864, + "grad_norm": 1.8496992588043213, + "learning_rate": 1.6732312678611003e-07, + "loss": 0.8976, + "step": 5701 + }, + { + "epoch": 5.309124767225326, + "grad_norm": 1.7581454515457153, + "learning_rate": 1.668781068085415e-07, + "loss": 0.864, + "step": 5702 + }, + { + "epoch": 5.310055865921788, + "grad_norm": 1.8535518646240234, + "learning_rate": 1.6643365898217774e-07, + "loss": 0.8578, + "step": 5703 + }, + { + "epoch": 5.31098696461825, + "grad_norm": 1.819464087486267, + "learning_rate": 1.659897834160104e-07, + "loss": 0.916, + "step": 5704 + }, + { + "epoch": 5.311918063314711, + "grad_norm": 1.7941104173660278, + "learning_rate": 1.655464802188922e-07, + "loss": 0.8627, + "step": 5705 + }, + { + "epoch": 5.312849162011173, + "grad_norm": 1.8299163579940796, + "learning_rate": 1.6510374949953335e-07, + "loss": 0.8484, + "step": 5706 + }, + { + "epoch": 5.3137802607076345, + "grad_norm": 1.8308309316635132, + "learning_rate": 1.6466159136650422e-07, + "loss": 0.883, + "step": 5707 + }, + { + "epoch": 5.314711359404097, + "grad_norm": 1.8018893003463745, + "learning_rate": 1.6422000592823645e-07, + "loss": 0.8912, + "step": 5708 + }, + { + "epoch": 5.315642458100559, + "grad_norm": 1.8617347478866577, + "learning_rate": 1.6377899329301922e-07, + "loss": 0.8996, + "step": 5709 + }, + { + "epoch": 5.316573556797021, + "grad_norm": 1.84953773021698, + "learning_rate": 1.6333855356900184e-07, + "loss": 0.8679, + "step": 5710 + }, + { + "epoch": 5.317504655493482, + "grad_norm": 1.84372878074646, + "learning_rate": 1.6289868686419323e-07, + "loss": 0.867, + "step": 5711 + }, + { + "epoch": 5.318435754189944, + "grad_norm": 1.7670540809631348, + "learning_rate": 1.6245939328646322e-07, + "loss": 0.8438, + "step": 5712 + }, + { + "epoch": 5.3193668528864055, + "grad_norm": 1.8588340282440186, + "learning_rate": 1.6202067294353808e-07, + "loss": 0.8626, + "step": 5713 + }, + { + "epoch": 5.320297951582868, + "grad_norm": 1.769828200340271, + "learning_rate": 1.6158252594300593e-07, + "loss": 0.8453, + "step": 5714 + }, + { + "epoch": 5.32122905027933, + "grad_norm": 1.8881471157073975, + "learning_rate": 1.611449523923131e-07, + "loss": 0.8519, + "step": 5715 + }, + { + "epoch": 5.322160148975791, + "grad_norm": 1.7839561700820923, + "learning_rate": 1.607079523987662e-07, + "loss": 0.8202, + "step": 5716 + }, + { + "epoch": 5.323091247672253, + "grad_norm": 1.8396462202072144, + "learning_rate": 1.6027152606953007e-07, + "loss": 0.8762, + "step": 5717 + }, + { + "epoch": 5.324022346368715, + "grad_norm": 1.9111498594284058, + "learning_rate": 1.5983567351162964e-07, + "loss": 0.8913, + "step": 5718 + }, + { + "epoch": 5.3249534450651765, + "grad_norm": 1.8251688480377197, + "learning_rate": 1.5940039483195003e-07, + "loss": 0.8733, + "step": 5719 + }, + { + "epoch": 5.325884543761639, + "grad_norm": 1.8269546031951904, + "learning_rate": 1.589656901372333e-07, + "loss": 0.8699, + "step": 5720 + }, + { + "epoch": 5.326815642458101, + "grad_norm": 1.8492262363433838, + "learning_rate": 1.585315595340825e-07, + "loss": 0.8625, + "step": 5721 + }, + { + "epoch": 5.327746741154562, + "grad_norm": 1.8260760307312012, + "learning_rate": 1.5809800312895944e-07, + "loss": 0.902, + "step": 5722 + }, + { + "epoch": 5.328677839851024, + "grad_norm": 1.8590718507766724, + "learning_rate": 1.5766502102818491e-07, + "loss": 0.8665, + "step": 5723 + }, + { + "epoch": 5.329608938547486, + "grad_norm": 1.8169605731964111, + "learning_rate": 1.5723261333793927e-07, + "loss": 0.8537, + "step": 5724 + }, + { + "epoch": 5.3305400372439475, + "grad_norm": 1.7744882106781006, + "learning_rate": 1.5680078016426126e-07, + "loss": 0.8595, + "step": 5725 + }, + { + "epoch": 5.33147113594041, + "grad_norm": 1.8484734296798706, + "learning_rate": 1.5636952161305063e-07, + "loss": 0.8599, + "step": 5726 + }, + { + "epoch": 5.332402234636872, + "grad_norm": 1.813636302947998, + "learning_rate": 1.5593883779006336e-07, + "loss": 0.8815, + "step": 5727 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 1.8746895790100098, + "learning_rate": 1.5550872880091689e-07, + "loss": 0.8846, + "step": 5728 + }, + { + "epoch": 5.334264432029795, + "grad_norm": 1.7752913236618042, + "learning_rate": 1.5507919475108656e-07, + "loss": 0.841, + "step": 5729 + }, + { + "epoch": 5.335195530726257, + "grad_norm": 1.8907978534698486, + "learning_rate": 1.5465023574590677e-07, + "loss": 0.8867, + "step": 5730 + }, + { + "epoch": 5.3361266294227185, + "grad_norm": 1.832651138305664, + "learning_rate": 1.542218518905711e-07, + "loss": 0.8661, + "step": 5731 + }, + { + "epoch": 5.337057728119181, + "grad_norm": 1.9040380716323853, + "learning_rate": 1.5379404329013248e-07, + "loss": 0.8962, + "step": 5732 + }, + { + "epoch": 5.337988826815643, + "grad_norm": 1.903331995010376, + "learning_rate": 1.5336681004950198e-07, + "loss": 0.8954, + "step": 5733 + }, + { + "epoch": 5.338919925512104, + "grad_norm": 1.8832815885543823, + "learning_rate": 1.5294015227345028e-07, + "loss": 0.879, + "step": 5734 + }, + { + "epoch": 5.339851024208566, + "grad_norm": 1.8371176719665527, + "learning_rate": 1.5251407006660612e-07, + "loss": 0.8802, + "step": 5735 + }, + { + "epoch": 5.340782122905028, + "grad_norm": 1.8024457693099976, + "learning_rate": 1.5208856353345792e-07, + "loss": 0.8637, + "step": 5736 + }, + { + "epoch": 5.3417132216014895, + "grad_norm": 1.8594163656234741, + "learning_rate": 1.5166363277835273e-07, + "loss": 0.9111, + "step": 5737 + }, + { + "epoch": 5.342644320297952, + "grad_norm": 1.8056954145431519, + "learning_rate": 1.5123927790549582e-07, + "loss": 0.889, + "step": 5738 + }, + { + "epoch": 5.343575418994414, + "grad_norm": 1.8333896398544312, + "learning_rate": 1.5081549901895226e-07, + "loss": 0.8782, + "step": 5739 + }, + { + "epoch": 5.344506517690875, + "grad_norm": 1.8662219047546387, + "learning_rate": 1.5039229622264478e-07, + "loss": 0.8833, + "step": 5740 + }, + { + "epoch": 5.345437616387337, + "grad_norm": 1.8866630792617798, + "learning_rate": 1.4996966962035563e-07, + "loss": 0.885, + "step": 5741 + }, + { + "epoch": 5.346368715083799, + "grad_norm": 1.8318750858306885, + "learning_rate": 1.4954761931572526e-07, + "loss": 0.9043, + "step": 5742 + }, + { + "epoch": 5.3472998137802605, + "grad_norm": 1.8347046375274658, + "learning_rate": 1.4912614541225335e-07, + "loss": 0.8481, + "step": 5743 + }, + { + "epoch": 5.348230912476723, + "grad_norm": 1.828386664390564, + "learning_rate": 1.487052480132975e-07, + "loss": 0.8914, + "step": 5744 + }, + { + "epoch": 5.349162011173185, + "grad_norm": 1.7987037897109985, + "learning_rate": 1.482849272220749e-07, + "loss": 0.8669, + "step": 5745 + }, + { + "epoch": 5.350093109869646, + "grad_norm": 1.813961148262024, + "learning_rate": 1.4786518314166026e-07, + "loss": 0.8502, + "step": 5746 + }, + { + "epoch": 5.351024208566108, + "grad_norm": 1.7850428819656372, + "learning_rate": 1.474460158749874e-07, + "loss": 0.8644, + "step": 5747 + }, + { + "epoch": 5.351955307262569, + "grad_norm": 1.8678715229034424, + "learning_rate": 1.4702742552484884e-07, + "loss": 0.8551, + "step": 5748 + }, + { + "epoch": 5.3528864059590315, + "grad_norm": 1.8905242681503296, + "learning_rate": 1.4660941219389545e-07, + "loss": 0.8691, + "step": 5749 + }, + { + "epoch": 5.353817504655494, + "grad_norm": 1.8166905641555786, + "learning_rate": 1.4619197598463642e-07, + "loss": 0.8788, + "step": 5750 + }, + { + "epoch": 5.354748603351955, + "grad_norm": 1.7772449254989624, + "learning_rate": 1.4577511699943981e-07, + "loss": 0.845, + "step": 5751 + }, + { + "epoch": 5.355679702048417, + "grad_norm": 1.8455473184585571, + "learning_rate": 1.4535883534053163e-07, + "loss": 0.8529, + "step": 5752 + }, + { + "epoch": 5.356610800744879, + "grad_norm": 1.8661731481552124, + "learning_rate": 1.4494313110999692e-07, + "loss": 0.8884, + "step": 5753 + }, + { + "epoch": 5.35754189944134, + "grad_norm": 1.8801934719085693, + "learning_rate": 1.4452800440977826e-07, + "loss": 0.8787, + "step": 5754 + }, + { + "epoch": 5.3584729981378025, + "grad_norm": 1.8702919483184814, + "learning_rate": 1.4411345534167758e-07, + "loss": 0.841, + "step": 5755 + }, + { + "epoch": 5.359404096834265, + "grad_norm": 1.8658889532089233, + "learning_rate": 1.4369948400735469e-07, + "loss": 0.8869, + "step": 5756 + }, + { + "epoch": 5.360335195530726, + "grad_norm": 1.8539886474609375, + "learning_rate": 1.4328609050832775e-07, + "loss": 0.8857, + "step": 5757 + }, + { + "epoch": 5.361266294227188, + "grad_norm": 1.7972381114959717, + "learning_rate": 1.428732749459727e-07, + "loss": 0.8764, + "step": 5758 + }, + { + "epoch": 5.36219739292365, + "grad_norm": 1.8580195903778076, + "learning_rate": 1.4246103742152488e-07, + "loss": 0.8881, + "step": 5759 + }, + { + "epoch": 5.363128491620111, + "grad_norm": 1.8434388637542725, + "learning_rate": 1.420493780360771e-07, + "loss": 0.8874, + "step": 5760 + }, + { + "epoch": 5.3640595903165735, + "grad_norm": 1.8023698329925537, + "learning_rate": 1.4163829689057995e-07, + "loss": 0.8876, + "step": 5761 + }, + { + "epoch": 5.364990689013036, + "grad_norm": 1.7933152914047241, + "learning_rate": 1.4122779408584337e-07, + "loss": 0.8695, + "step": 5762 + }, + { + "epoch": 5.365921787709497, + "grad_norm": 1.8327285051345825, + "learning_rate": 1.4081786972253487e-07, + "loss": 0.887, + "step": 5763 + }, + { + "epoch": 5.366852886405959, + "grad_norm": 1.825608491897583, + "learning_rate": 1.4040852390118042e-07, + "loss": 0.8717, + "step": 5764 + }, + { + "epoch": 5.367783985102421, + "grad_norm": 1.8387831449508667, + "learning_rate": 1.3999975672216308e-07, + "loss": 0.9069, + "step": 5765 + }, + { + "epoch": 5.368715083798882, + "grad_norm": 1.7883532047271729, + "learning_rate": 1.3959156828572546e-07, + "loss": 0.8294, + "step": 5766 + }, + { + "epoch": 5.3696461824953445, + "grad_norm": 1.8172831535339355, + "learning_rate": 1.391839586919677e-07, + "loss": 0.8626, + "step": 5767 + }, + { + "epoch": 5.370577281191807, + "grad_norm": 1.8603065013885498, + "learning_rate": 1.3877692804084687e-07, + "loss": 0.8694, + "step": 5768 + }, + { + "epoch": 5.371508379888268, + "grad_norm": 1.842187523841858, + "learning_rate": 1.3837047643218e-07, + "loss": 0.8695, + "step": 5769 + }, + { + "epoch": 5.37243947858473, + "grad_norm": 1.8172756433486938, + "learning_rate": 1.3796460396564098e-07, + "loss": 0.8203, + "step": 5770 + }, + { + "epoch": 5.373370577281192, + "grad_norm": 1.907992959022522, + "learning_rate": 1.3755931074076134e-07, + "loss": 0.8948, + "step": 5771 + }, + { + "epoch": 5.374301675977653, + "grad_norm": 1.8834540843963623, + "learning_rate": 1.3715459685693127e-07, + "loss": 0.8582, + "step": 5772 + }, + { + "epoch": 5.3752327746741155, + "grad_norm": 1.8254954814910889, + "learning_rate": 1.3675046241339918e-07, + "loss": 0.8793, + "step": 5773 + }, + { + "epoch": 5.376163873370578, + "grad_norm": 1.85944664478302, + "learning_rate": 1.3634690750927105e-07, + "loss": 0.8837, + "step": 5774 + }, + { + "epoch": 5.377094972067039, + "grad_norm": 1.7900830507278442, + "learning_rate": 1.3594393224350965e-07, + "loss": 0.8265, + "step": 5775 + }, + { + "epoch": 5.378026070763501, + "grad_norm": 1.8265702724456787, + "learning_rate": 1.355415367149371e-07, + "loss": 0.8424, + "step": 5776 + }, + { + "epoch": 5.378957169459963, + "grad_norm": 1.8546820878982544, + "learning_rate": 1.3513972102223355e-07, + "loss": 0.8909, + "step": 5777 + }, + { + "epoch": 5.379888268156424, + "grad_norm": 1.820887565612793, + "learning_rate": 1.3473848526393468e-07, + "loss": 0.8768, + "step": 5778 + }, + { + "epoch": 5.3808193668528865, + "grad_norm": 1.8493762016296387, + "learning_rate": 1.3433782953843622e-07, + "loss": 0.8449, + "step": 5779 + }, + { + "epoch": 5.381750465549349, + "grad_norm": 1.8084982633590698, + "learning_rate": 1.3393775394399122e-07, + "loss": 0.8382, + "step": 5780 + }, + { + "epoch": 5.38268156424581, + "grad_norm": 1.8738248348236084, + "learning_rate": 1.3353825857871038e-07, + "loss": 0.8992, + "step": 5781 + }, + { + "epoch": 5.383612662942272, + "grad_norm": 1.8066442012786865, + "learning_rate": 1.3313934354056064e-07, + "loss": 0.8762, + "step": 5782 + }, + { + "epoch": 5.384543761638733, + "grad_norm": 1.8063591718673706, + "learning_rate": 1.3274100892736895e-07, + "loss": 0.873, + "step": 5783 + }, + { + "epoch": 5.385474860335195, + "grad_norm": 1.846178412437439, + "learning_rate": 1.323432548368192e-07, + "loss": 0.8735, + "step": 5784 + }, + { + "epoch": 5.3864059590316575, + "grad_norm": 1.8837950229644775, + "learning_rate": 1.3194608136645164e-07, + "loss": 0.8887, + "step": 5785 + }, + { + "epoch": 5.387337057728119, + "grad_norm": 1.821520209312439, + "learning_rate": 1.315494886136648e-07, + "loss": 0.8367, + "step": 5786 + }, + { + "epoch": 5.388268156424581, + "grad_norm": 1.793386459350586, + "learning_rate": 1.3115347667571637e-07, + "loss": 0.9091, + "step": 5787 + }, + { + "epoch": 5.389199255121043, + "grad_norm": 1.7829147577285767, + "learning_rate": 1.3075804564971984e-07, + "loss": 0.8466, + "step": 5788 + }, + { + "epoch": 5.390130353817504, + "grad_norm": 1.8293867111206055, + "learning_rate": 1.3036319563264592e-07, + "loss": 0.8692, + "step": 5789 + }, + { + "epoch": 5.391061452513966, + "grad_norm": 1.8555165529251099, + "learning_rate": 1.2996892672132417e-07, + "loss": 0.8977, + "step": 5790 + }, + { + "epoch": 5.3919925512104285, + "grad_norm": 1.7732361555099487, + "learning_rate": 1.295752390124419e-07, + "loss": 0.8239, + "step": 5791 + }, + { + "epoch": 5.39292364990689, + "grad_norm": 1.8599721193313599, + "learning_rate": 1.2918213260254192e-07, + "loss": 0.8494, + "step": 5792 + }, + { + "epoch": 5.393854748603352, + "grad_norm": 1.81840181350708, + "learning_rate": 1.287896075880257e-07, + "loss": 0.8669, + "step": 5793 + }, + { + "epoch": 5.394785847299814, + "grad_norm": 1.8548130989074707, + "learning_rate": 1.2839766406515296e-07, + "loss": 0.8545, + "step": 5794 + }, + { + "epoch": 5.395716945996275, + "grad_norm": 1.870778203010559, + "learning_rate": 1.2800630213003957e-07, + "loss": 0.9024, + "step": 5795 + }, + { + "epoch": 5.396648044692737, + "grad_norm": 1.8104339838027954, + "learning_rate": 1.27615521878659e-07, + "loss": 0.8826, + "step": 5796 + }, + { + "epoch": 5.3975791433891995, + "grad_norm": 1.7976478338241577, + "learning_rate": 1.2722532340684214e-07, + "loss": 0.8692, + "step": 5797 + }, + { + "epoch": 5.398510242085661, + "grad_norm": 1.7807847261428833, + "learning_rate": 1.2683570681027797e-07, + "loss": 0.8642, + "step": 5798 + }, + { + "epoch": 5.399441340782123, + "grad_norm": 1.8029394149780273, + "learning_rate": 1.2644667218451146e-07, + "loss": 0.8657, + "step": 5799 + }, + { + "epoch": 5.400372439478585, + "grad_norm": 1.813448190689087, + "learning_rate": 1.2605821962494576e-07, + "loss": 0.8619, + "step": 5800 + }, + { + "epoch": 5.401303538175046, + "grad_norm": 1.7840532064437866, + "learning_rate": 1.2567034922684102e-07, + "loss": 0.8585, + "step": 5801 + }, + { + "epoch": 5.402234636871508, + "grad_norm": 1.8277539014816284, + "learning_rate": 1.252830610853148e-07, + "loss": 0.8703, + "step": 5802 + }, + { + "epoch": 5.4031657355679705, + "grad_norm": 1.8130937814712524, + "learning_rate": 1.2489635529534138e-07, + "loss": 0.8608, + "step": 5803 + }, + { + "epoch": 5.404096834264432, + "grad_norm": 1.8168296813964844, + "learning_rate": 1.2451023195175244e-07, + "loss": 0.8381, + "step": 5804 + }, + { + "epoch": 5.405027932960894, + "grad_norm": 1.814274787902832, + "learning_rate": 1.241246911492383e-07, + "loss": 0.8527, + "step": 5805 + }, + { + "epoch": 5.405959031657356, + "grad_norm": 1.8428293466567993, + "learning_rate": 1.2373973298234337e-07, + "loss": 0.8576, + "step": 5806 + }, + { + "epoch": 5.406890130353817, + "grad_norm": 1.8432629108428955, + "learning_rate": 1.2335535754547183e-07, + "loss": 0.8386, + "step": 5807 + }, + { + "epoch": 5.407821229050279, + "grad_norm": 1.8969058990478516, + "learning_rate": 1.2297156493288387e-07, + "loss": 0.905, + "step": 5808 + }, + { + "epoch": 5.4087523277467415, + "grad_norm": 1.8513367176055908, + "learning_rate": 1.2258835523869667e-07, + "loss": 0.8621, + "step": 5809 + }, + { + "epoch": 5.409683426443203, + "grad_norm": 1.8026094436645508, + "learning_rate": 1.2220572855688506e-07, + "loss": 0.8438, + "step": 5810 + }, + { + "epoch": 5.410614525139665, + "grad_norm": 1.910266637802124, + "learning_rate": 1.2182368498127985e-07, + "loss": 0.8587, + "step": 5811 + }, + { + "epoch": 5.411545623836127, + "grad_norm": 1.8390917778015137, + "learning_rate": 1.2144222460557075e-07, + "loss": 0.8512, + "step": 5812 + }, + { + "epoch": 5.412476722532588, + "grad_norm": 1.804792046546936, + "learning_rate": 1.2106134752330219e-07, + "loss": 0.8554, + "step": 5813 + }, + { + "epoch": 5.41340782122905, + "grad_norm": 1.8106787204742432, + "learning_rate": 1.2068105382787714e-07, + "loss": 0.8709, + "step": 5814 + }, + { + "epoch": 5.4143389199255125, + "grad_norm": 1.7567534446716309, + "learning_rate": 1.2030134361255468e-07, + "loss": 0.8683, + "step": 5815 + }, + { + "epoch": 5.415270018621974, + "grad_norm": 1.8089193105697632, + "learning_rate": 1.1992221697045115e-07, + "loss": 0.851, + "step": 5816 + }, + { + "epoch": 5.416201117318436, + "grad_norm": 1.8304556608200073, + "learning_rate": 1.1954367399453993e-07, + "loss": 0.8715, + "step": 5817 + }, + { + "epoch": 5.417132216014897, + "grad_norm": 1.770833134651184, + "learning_rate": 1.1916571477765093e-07, + "loss": 0.8601, + "step": 5818 + }, + { + "epoch": 5.418063314711359, + "grad_norm": 1.8414725065231323, + "learning_rate": 1.1878833941247086e-07, + "loss": 0.8672, + "step": 5819 + }, + { + "epoch": 5.418994413407821, + "grad_norm": 1.7772095203399658, + "learning_rate": 1.1841154799154376e-07, + "loss": 0.8398, + "step": 5820 + }, + { + "epoch": 5.419925512104283, + "grad_norm": 1.8422200679779053, + "learning_rate": 1.1803534060727012e-07, + "loss": 0.8546, + "step": 5821 + }, + { + "epoch": 5.420856610800745, + "grad_norm": 1.864350438117981, + "learning_rate": 1.1765971735190701e-07, + "loss": 0.8908, + "step": 5822 + }, + { + "epoch": 5.421787709497207, + "grad_norm": 1.7925831079483032, + "learning_rate": 1.1728467831756851e-07, + "loss": 0.8231, + "step": 5823 + }, + { + "epoch": 5.422718808193668, + "grad_norm": 1.8903722763061523, + "learning_rate": 1.169102235962255e-07, + "loss": 0.9123, + "step": 5824 + }, + { + "epoch": 5.42364990689013, + "grad_norm": 1.8547897338867188, + "learning_rate": 1.1653635327970563e-07, + "loss": 0.8427, + "step": 5825 + }, + { + "epoch": 5.424581005586592, + "grad_norm": 1.8049408197402954, + "learning_rate": 1.1616306745969252e-07, + "loss": 0.8578, + "step": 5826 + }, + { + "epoch": 5.425512104283054, + "grad_norm": 1.8548072576522827, + "learning_rate": 1.1579036622772766e-07, + "loss": 0.8866, + "step": 5827 + }, + { + "epoch": 5.426443202979516, + "grad_norm": 1.8266637325286865, + "learning_rate": 1.1541824967520821e-07, + "loss": 0.8887, + "step": 5828 + }, + { + "epoch": 5.427374301675978, + "grad_norm": 1.7744497060775757, + "learning_rate": 1.1504671789338839e-07, + "loss": 0.8638, + "step": 5829 + }, + { + "epoch": 5.428305400372439, + "grad_norm": 1.848803997039795, + "learning_rate": 1.1467577097337867e-07, + "loss": 0.8865, + "step": 5830 + }, + { + "epoch": 5.429236499068901, + "grad_norm": 1.8362623453140259, + "learning_rate": 1.1430540900614657e-07, + "loss": 0.8662, + "step": 5831 + }, + { + "epoch": 5.430167597765363, + "grad_norm": 1.8043354749679565, + "learning_rate": 1.1393563208251579e-07, + "loss": 0.8809, + "step": 5832 + }, + { + "epoch": 5.431098696461825, + "grad_norm": 1.8668596744537354, + "learning_rate": 1.1356644029316661e-07, + "loss": 0.8728, + "step": 5833 + }, + { + "epoch": 5.432029795158287, + "grad_norm": 1.8812181949615479, + "learning_rate": 1.1319783372863601e-07, + "loss": 0.8538, + "step": 5834 + }, + { + "epoch": 5.432960893854749, + "grad_norm": 1.8391902446746826, + "learning_rate": 1.1282981247931729e-07, + "loss": 0.8714, + "step": 5835 + }, + { + "epoch": 5.43389199255121, + "grad_norm": 1.8737964630126953, + "learning_rate": 1.1246237663546016e-07, + "loss": 0.8989, + "step": 5836 + }, + { + "epoch": 5.434823091247672, + "grad_norm": 1.794968605041504, + "learning_rate": 1.1209552628717118e-07, + "loss": 0.8746, + "step": 5837 + }, + { + "epoch": 5.435754189944134, + "grad_norm": 1.877576470375061, + "learning_rate": 1.1172926152441255e-07, + "loss": 0.9019, + "step": 5838 + }, + { + "epoch": 5.436685288640596, + "grad_norm": 1.8380403518676758, + "learning_rate": 1.1136358243700407e-07, + "loss": 0.8836, + "step": 5839 + }, + { + "epoch": 5.437616387337058, + "grad_norm": 1.849636435508728, + "learning_rate": 1.1099848911462013e-07, + "loss": 0.8775, + "step": 5840 + }, + { + "epoch": 5.43854748603352, + "grad_norm": 1.8090568780899048, + "learning_rate": 1.106339816467933e-07, + "loss": 0.8492, + "step": 5841 + }, + { + "epoch": 5.439478584729981, + "grad_norm": 1.829854130744934, + "learning_rate": 1.1027006012291147e-07, + "loss": 0.8469, + "step": 5842 + }, + { + "epoch": 5.440409683426443, + "grad_norm": 1.9034913778305054, + "learning_rate": 1.0990672463221913e-07, + "loss": 0.9182, + "step": 5843 + }, + { + "epoch": 5.441340782122905, + "grad_norm": 1.8548129796981812, + "learning_rate": 1.0954397526381694e-07, + "loss": 0.8811, + "step": 5844 + }, + { + "epoch": 5.442271880819367, + "grad_norm": 1.8454434871673584, + "learning_rate": 1.0918181210666207e-07, + "loss": 0.9029, + "step": 5845 + }, + { + "epoch": 5.443202979515829, + "grad_norm": 1.776666283607483, + "learning_rate": 1.0882023524956764e-07, + "loss": 0.8233, + "step": 5846 + }, + { + "epoch": 5.444134078212291, + "grad_norm": 1.778867244720459, + "learning_rate": 1.0845924478120273e-07, + "loss": 0.8362, + "step": 5847 + }, + { + "epoch": 5.445065176908752, + "grad_norm": 1.783746600151062, + "learning_rate": 1.0809884079009347e-07, + "loss": 0.87, + "step": 5848 + }, + { + "epoch": 5.445996275605214, + "grad_norm": 1.8371528387069702, + "learning_rate": 1.0773902336462139e-07, + "loss": 0.8456, + "step": 5849 + }, + { + "epoch": 5.446927374301676, + "grad_norm": 1.8315542936325073, + "learning_rate": 1.0737979259302477e-07, + "loss": 0.8798, + "step": 5850 + }, + { + "epoch": 5.447858472998138, + "grad_norm": 1.8452399969100952, + "learning_rate": 1.0702114856339762e-07, + "loss": 0.9038, + "step": 5851 + }, + { + "epoch": 5.4487895716946, + "grad_norm": 1.8346343040466309, + "learning_rate": 1.0666309136368985e-07, + "loss": 0.8547, + "step": 5852 + }, + { + "epoch": 5.449720670391061, + "grad_norm": 1.7427295446395874, + "learning_rate": 1.0630562108170872e-07, + "loss": 0.828, + "step": 5853 + }, + { + "epoch": 5.450651769087523, + "grad_norm": 1.816021203994751, + "learning_rate": 1.0594873780511522e-07, + "loss": 0.9031, + "step": 5854 + }, + { + "epoch": 5.451582867783985, + "grad_norm": 1.9366120100021362, + "learning_rate": 1.0559244162142879e-07, + "loss": 0.9009, + "step": 5855 + }, + { + "epoch": 5.452513966480447, + "grad_norm": 1.781479835510254, + "learning_rate": 1.0523673261802369e-07, + "loss": 0.8358, + "step": 5856 + }, + { + "epoch": 5.453445065176909, + "grad_norm": 1.8144054412841797, + "learning_rate": 1.0488161088213045e-07, + "loss": 0.8967, + "step": 5857 + }, + { + "epoch": 5.454376163873371, + "grad_norm": 1.765842080116272, + "learning_rate": 1.0452707650083493e-07, + "loss": 0.8526, + "step": 5858 + }, + { + "epoch": 5.455307262569832, + "grad_norm": 1.934830665588379, + "learning_rate": 1.0417312956108067e-07, + "loss": 0.8876, + "step": 5859 + }, + { + "epoch": 5.456238361266294, + "grad_norm": 1.8614606857299805, + "learning_rate": 1.0381977014966543e-07, + "loss": 0.8291, + "step": 5860 + }, + { + "epoch": 5.457169459962756, + "grad_norm": 1.8687238693237305, + "learning_rate": 1.0346699835324297e-07, + "loss": 0.8733, + "step": 5861 + }, + { + "epoch": 5.4581005586592175, + "grad_norm": 1.845079779624939, + "learning_rate": 1.0311481425832409e-07, + "loss": 0.9049, + "step": 5862 + }, + { + "epoch": 5.45903165735568, + "grad_norm": 1.7980704307556152, + "learning_rate": 1.0276321795127497e-07, + "loss": 0.8959, + "step": 5863 + }, + { + "epoch": 5.459962756052142, + "grad_norm": 1.8096553087234497, + "learning_rate": 1.0241220951831776e-07, + "loss": 0.8701, + "step": 5864 + }, + { + "epoch": 5.460893854748603, + "grad_norm": 1.872987985610962, + "learning_rate": 1.0206178904552887e-07, + "loss": 0.915, + "step": 5865 + }, + { + "epoch": 5.461824953445065, + "grad_norm": 1.7858076095581055, + "learning_rate": 1.017119566188432e-07, + "loss": 0.8754, + "step": 5866 + }, + { + "epoch": 5.462756052141527, + "grad_norm": 1.8438210487365723, + "learning_rate": 1.0136271232405016e-07, + "loss": 0.9065, + "step": 5867 + }, + { + "epoch": 5.4636871508379885, + "grad_norm": 1.776132345199585, + "learning_rate": 1.01014056246794e-07, + "loss": 0.8547, + "step": 5868 + }, + { + "epoch": 5.464618249534451, + "grad_norm": 1.7897425889968872, + "learning_rate": 1.0066598847257553e-07, + "loss": 0.8684, + "step": 5869 + }, + { + "epoch": 5.465549348230913, + "grad_norm": 1.8708760738372803, + "learning_rate": 1.0031850908675284e-07, + "loss": 0.8877, + "step": 5870 + }, + { + "epoch": 5.466480446927374, + "grad_norm": 1.8119221925735474, + "learning_rate": 9.997161817453665e-08, + "loss": 0.8845, + "step": 5871 + }, + { + "epoch": 5.467411545623836, + "grad_norm": 1.8637148141860962, + "learning_rate": 9.96253158209956e-08, + "loss": 0.8953, + "step": 5872 + }, + { + "epoch": 5.468342644320298, + "grad_norm": 1.7755182981491089, + "learning_rate": 9.927960211105342e-08, + "loss": 0.8622, + "step": 5873 + }, + { + "epoch": 5.4692737430167595, + "grad_norm": 1.831527590751648, + "learning_rate": 9.893447712948978e-08, + "loss": 0.8921, + "step": 5874 + }, + { + "epoch": 5.470204841713222, + "grad_norm": 1.787359356880188, + "learning_rate": 9.858994096093922e-08, + "loss": 0.8708, + "step": 5875 + }, + { + "epoch": 5.471135940409684, + "grad_norm": 1.8702908754348755, + "learning_rate": 9.824599368989163e-08, + "loss": 0.8597, + "step": 5876 + }, + { + "epoch": 5.472067039106145, + "grad_norm": 1.81734037399292, + "learning_rate": 9.790263540069484e-08, + "loss": 0.8801, + "step": 5877 + }, + { + "epoch": 5.472998137802607, + "grad_norm": 1.848582148551941, + "learning_rate": 9.755986617754926e-08, + "loss": 0.8367, + "step": 5878 + }, + { + "epoch": 5.473929236499069, + "grad_norm": 1.8562015295028687, + "learning_rate": 9.72176861045121e-08, + "loss": 0.8579, + "step": 5879 + }, + { + "epoch": 5.4748603351955305, + "grad_norm": 1.889733910560608, + "learning_rate": 9.68760952654968e-08, + "loss": 0.8828, + "step": 5880 + }, + { + "epoch": 5.475791433891993, + "grad_norm": 1.9114090204238892, + "learning_rate": 9.653509374427162e-08, + "loss": 0.8747, + "step": 5881 + }, + { + "epoch": 5.476722532588455, + "grad_norm": 1.7627687454223633, + "learning_rate": 9.619468162445994e-08, + "loss": 0.8634, + "step": 5882 + }, + { + "epoch": 5.477653631284916, + "grad_norm": 1.8396098613739014, + "learning_rate": 9.585485898954056e-08, + "loss": 0.8694, + "step": 5883 + }, + { + "epoch": 5.478584729981378, + "grad_norm": 1.8447896242141724, + "learning_rate": 9.551562592284958e-08, + "loss": 0.8616, + "step": 5884 + }, + { + "epoch": 5.47951582867784, + "grad_norm": 1.8846668004989624, + "learning_rate": 9.517698250757574e-08, + "loss": 0.8771, + "step": 5885 + }, + { + "epoch": 5.4804469273743015, + "grad_norm": 1.8343228101730347, + "learning_rate": 9.48389288267651e-08, + "loss": 0.8617, + "step": 5886 + }, + { + "epoch": 5.481378026070764, + "grad_norm": 1.8732435703277588, + "learning_rate": 9.450146496331802e-08, + "loss": 0.8935, + "step": 5887 + }, + { + "epoch": 5.482309124767225, + "grad_norm": 1.8333910703659058, + "learning_rate": 9.416459099999164e-08, + "loss": 0.8753, + "step": 5888 + }, + { + "epoch": 5.483240223463687, + "grad_norm": 1.8192590475082397, + "learning_rate": 9.382830701939683e-08, + "loss": 0.8646, + "step": 5889 + }, + { + "epoch": 5.484171322160149, + "grad_norm": 1.8255399465560913, + "learning_rate": 9.349261310400037e-08, + "loss": 0.8579, + "step": 5890 + }, + { + "epoch": 5.485102420856611, + "grad_norm": 1.850782871246338, + "learning_rate": 9.315750933612533e-08, + "loss": 0.862, + "step": 5891 + }, + { + "epoch": 5.4860335195530725, + "grad_norm": 1.817672848701477, + "learning_rate": 9.28229957979479e-08, + "loss": 0.8697, + "step": 5892 + }, + { + "epoch": 5.486964618249535, + "grad_norm": 1.8011173009872437, + "learning_rate": 9.248907257150192e-08, + "loss": 0.8644, + "step": 5893 + }, + { + "epoch": 5.487895716945996, + "grad_norm": 1.9103015661239624, + "learning_rate": 9.215573973867437e-08, + "loss": 0.9203, + "step": 5894 + }, + { + "epoch": 5.488826815642458, + "grad_norm": 1.860225796699524, + "learning_rate": 9.182299738120931e-08, + "loss": 0.8563, + "step": 5895 + }, + { + "epoch": 5.48975791433892, + "grad_norm": 1.8558484315872192, + "learning_rate": 9.149084558070454e-08, + "loss": 0.9142, + "step": 5896 + }, + { + "epoch": 5.490689013035381, + "grad_norm": 1.8385810852050781, + "learning_rate": 9.115928441861405e-08, + "loss": 0.8471, + "step": 5897 + }, + { + "epoch": 5.4916201117318435, + "grad_norm": 1.863092064857483, + "learning_rate": 9.082831397624586e-08, + "loss": 0.8624, + "step": 5898 + }, + { + "epoch": 5.492551210428306, + "grad_norm": 1.8237619400024414, + "learning_rate": 9.04979343347645e-08, + "loss": 0.8601, + "step": 5899 + }, + { + "epoch": 5.493482309124767, + "grad_norm": 1.8285719156265259, + "learning_rate": 9.01681455751885e-08, + "loss": 0.8609, + "step": 5900 + }, + { + "epoch": 5.494413407821229, + "grad_norm": 1.81782865524292, + "learning_rate": 8.98389477783923e-08, + "loss": 0.8439, + "step": 5901 + }, + { + "epoch": 5.495344506517691, + "grad_norm": 1.7677713632583618, + "learning_rate": 8.95103410251047e-08, + "loss": 0.8482, + "step": 5902 + }, + { + "epoch": 5.496275605214152, + "grad_norm": 1.7472922801971436, + "learning_rate": 8.918232539590982e-08, + "loss": 0.8619, + "step": 5903 + }, + { + "epoch": 5.4972067039106145, + "grad_norm": 1.8732259273529053, + "learning_rate": 8.885490097124721e-08, + "loss": 0.9093, + "step": 5904 + }, + { + "epoch": 5.498137802607077, + "grad_norm": 1.8685342073440552, + "learning_rate": 8.852806783141099e-08, + "loss": 0.8521, + "step": 5905 + }, + { + "epoch": 5.499068901303538, + "grad_norm": 1.9260287284851074, + "learning_rate": 8.820182605655065e-08, + "loss": 0.8836, + "step": 5906 + }, + { + "epoch": 5.5, + "grad_norm": 1.8825738430023193, + "learning_rate": 8.787617572666996e-08, + "loss": 0.8992, + "step": 5907 + }, + { + "epoch": 5.500931098696462, + "grad_norm": 1.841812252998352, + "learning_rate": 8.755111692162837e-08, + "loss": 0.8708, + "step": 5908 + }, + { + "epoch": 5.501862197392923, + "grad_norm": 1.8100802898406982, + "learning_rate": 8.722664972113992e-08, + "loss": 0.8508, + "step": 5909 + }, + { + "epoch": 5.5027932960893855, + "grad_norm": 1.7384934425354004, + "learning_rate": 8.690277420477372e-08, + "loss": 0.8579, + "step": 5910 + }, + { + "epoch": 5.503724394785848, + "grad_norm": 1.7884365320205688, + "learning_rate": 8.657949045195374e-08, + "loss": 0.8693, + "step": 5911 + }, + { + "epoch": 5.504655493482309, + "grad_norm": 1.8689733743667603, + "learning_rate": 8.625679854195878e-08, + "loss": 0.8391, + "step": 5912 + }, + { + "epoch": 5.505586592178771, + "grad_norm": 1.9041293859481812, + "learning_rate": 8.593469855392278e-08, + "loss": 0.8482, + "step": 5913 + }, + { + "epoch": 5.506517690875233, + "grad_norm": 1.795282006263733, + "learning_rate": 8.561319056683392e-08, + "loss": 0.8766, + "step": 5914 + }, + { + "epoch": 5.507448789571694, + "grad_norm": 1.8776143789291382, + "learning_rate": 8.529227465953554e-08, + "loss": 0.8985, + "step": 5915 + }, + { + "epoch": 5.5083798882681565, + "grad_norm": 1.7925331592559814, + "learning_rate": 8.497195091072635e-08, + "loss": 0.8457, + "step": 5916 + }, + { + "epoch": 5.509310986964619, + "grad_norm": 1.8036510944366455, + "learning_rate": 8.465221939895851e-08, + "loss": 0.9248, + "step": 5917 + }, + { + "epoch": 5.51024208566108, + "grad_norm": 1.859997034072876, + "learning_rate": 8.433308020264042e-08, + "loss": 0.8878, + "step": 5918 + }, + { + "epoch": 5.511173184357542, + "grad_norm": 1.8175991773605347, + "learning_rate": 8.401453340003446e-08, + "loss": 0.8595, + "step": 5919 + }, + { + "epoch": 5.512104283054004, + "grad_norm": 1.8907252550125122, + "learning_rate": 8.369657906925732e-08, + "loss": 0.8627, + "step": 5920 + }, + { + "epoch": 5.513035381750465, + "grad_norm": 1.8853684663772583, + "learning_rate": 8.337921728828135e-08, + "loss": 0.8905, + "step": 5921 + }, + { + "epoch": 5.5139664804469275, + "grad_norm": 1.9330977201461792, + "learning_rate": 8.30624481349332e-08, + "loss": 0.9092, + "step": 5922 + }, + { + "epoch": 5.514897579143389, + "grad_norm": 1.856805443763733, + "learning_rate": 8.274627168689375e-08, + "loss": 0.849, + "step": 5923 + }, + { + "epoch": 5.515828677839851, + "grad_norm": 1.7932672500610352, + "learning_rate": 8.243068802169906e-08, + "loss": 0.8239, + "step": 5924 + }, + { + "epoch": 5.516759776536313, + "grad_norm": 1.89778470993042, + "learning_rate": 8.211569721674001e-08, + "loss": 0.8812, + "step": 5925 + }, + { + "epoch": 5.517690875232775, + "grad_norm": 1.7920472621917725, + "learning_rate": 8.180129934926145e-08, + "loss": 0.8546, + "step": 5926 + }, + { + "epoch": 5.518621973929236, + "grad_norm": 1.881791591644287, + "learning_rate": 8.148749449636312e-08, + "loss": 0.9219, + "step": 5927 + }, + { + "epoch": 5.5195530726256985, + "grad_norm": 1.8393127918243408, + "learning_rate": 8.117428273499927e-08, + "loss": 0.8497, + "step": 5928 + }, + { + "epoch": 5.52048417132216, + "grad_norm": 1.881919264793396, + "learning_rate": 8.086166414197905e-08, + "loss": 0.8969, + "step": 5929 + }, + { + "epoch": 5.521415270018622, + "grad_norm": 1.9552165269851685, + "learning_rate": 8.054963879396554e-08, + "loss": 0.8534, + "step": 5930 + }, + { + "epoch": 5.522346368715084, + "grad_norm": 1.7836925983428955, + "learning_rate": 8.0238206767477e-08, + "loss": 0.8185, + "step": 5931 + }, + { + "epoch": 5.523277467411545, + "grad_norm": 1.8438024520874023, + "learning_rate": 7.992736813888624e-08, + "loss": 0.8647, + "step": 5932 + }, + { + "epoch": 5.524208566108007, + "grad_norm": 1.9446122646331787, + "learning_rate": 7.961712298441892e-08, + "loss": 0.8451, + "step": 5933 + }, + { + "epoch": 5.5251396648044695, + "grad_norm": 1.8564201593399048, + "learning_rate": 7.930747138015754e-08, + "loss": 0.8814, + "step": 5934 + }, + { + "epoch": 5.526070763500931, + "grad_norm": 2.047893524169922, + "learning_rate": 7.899841340203746e-08, + "loss": 0.8684, + "step": 5935 + }, + { + "epoch": 5.527001862197393, + "grad_norm": 1.823179841041565, + "learning_rate": 7.868994912584943e-08, + "loss": 0.9005, + "step": 5936 + }, + { + "epoch": 5.527932960893855, + "grad_norm": 1.8654087781906128, + "learning_rate": 7.838207862723712e-08, + "loss": 0.9033, + "step": 5937 + }, + { + "epoch": 5.528864059590316, + "grad_norm": 1.8572924137115479, + "learning_rate": 7.80748019817007e-08, + "loss": 0.9045, + "step": 5938 + }, + { + "epoch": 5.529795158286778, + "grad_norm": 1.810759425163269, + "learning_rate": 7.776811926459293e-08, + "loss": 0.8733, + "step": 5939 + }, + { + "epoch": 5.5307262569832405, + "grad_norm": 1.7960491180419922, + "learning_rate": 7.746203055112144e-08, + "loss": 0.8852, + "step": 5940 + }, + { + "epoch": 5.531657355679702, + "grad_norm": 1.7855067253112793, + "learning_rate": 7.715653591634898e-08, + "loss": 0.8478, + "step": 5941 + }, + { + "epoch": 5.532588454376164, + "grad_norm": 1.7808315753936768, + "learning_rate": 7.685163543519142e-08, + "loss": 0.8618, + "step": 5942 + }, + { + "epoch": 5.533519553072626, + "grad_norm": 1.79416024684906, + "learning_rate": 7.65473291824198e-08, + "loss": 0.8732, + "step": 5943 + }, + { + "epoch": 5.534450651769087, + "grad_norm": 1.8522075414657593, + "learning_rate": 7.624361723265861e-08, + "loss": 0.8592, + "step": 5944 + }, + { + "epoch": 5.535381750465549, + "grad_norm": 1.8400535583496094, + "learning_rate": 7.594049966038769e-08, + "loss": 0.857, + "step": 5945 + }, + { + "epoch": 5.5363128491620115, + "grad_norm": 1.8692697286605835, + "learning_rate": 7.563797653994037e-08, + "loss": 0.8504, + "step": 5946 + }, + { + "epoch": 5.537243947858473, + "grad_norm": 1.8113665580749512, + "learning_rate": 7.53360479455037e-08, + "loss": 0.8739, + "step": 5947 + }, + { + "epoch": 5.538175046554935, + "grad_norm": 1.8228604793548584, + "learning_rate": 7.503471395112066e-08, + "loss": 0.8559, + "step": 5948 + }, + { + "epoch": 5.539106145251397, + "grad_norm": 2.015920639038086, + "learning_rate": 7.47339746306866e-08, + "loss": 0.8921, + "step": 5949 + }, + { + "epoch": 5.540037243947858, + "grad_norm": 1.8189797401428223, + "learning_rate": 7.443383005795224e-08, + "loss": 0.8799, + "step": 5950 + }, + { + "epoch": 5.54096834264432, + "grad_norm": 1.8223755359649658, + "learning_rate": 7.413428030652148e-08, + "loss": 0.8582, + "step": 5951 + }, + { + "epoch": 5.5418994413407825, + "grad_norm": 1.8254444599151611, + "learning_rate": 7.383532544985334e-08, + "loss": 0.8631, + "step": 5952 + }, + { + "epoch": 5.542830540037244, + "grad_norm": 1.8069080114364624, + "learning_rate": 7.353696556126056e-08, + "loss": 0.8896, + "step": 5953 + }, + { + "epoch": 5.543761638733706, + "grad_norm": 1.8386244773864746, + "learning_rate": 7.323920071390933e-08, + "loss": 0.8919, + "step": 5954 + }, + { + "epoch": 5.544692737430168, + "grad_norm": 1.8454148769378662, + "learning_rate": 7.29420309808207e-08, + "loss": 0.8506, + "step": 5955 + }, + { + "epoch": 5.545623836126629, + "grad_norm": 1.8871666193008423, + "learning_rate": 7.264545643486997e-08, + "loss": 0.895, + "step": 5956 + }, + { + "epoch": 5.546554934823091, + "grad_norm": 1.8775079250335693, + "learning_rate": 7.234947714878593e-08, + "loss": 0.8792, + "step": 5957 + }, + { + "epoch": 5.547486033519553, + "grad_norm": 1.7605834007263184, + "learning_rate": 7.205409319515078e-08, + "loss": 0.8639, + "step": 5958 + }, + { + "epoch": 5.548417132216015, + "grad_norm": 1.8546974658966064, + "learning_rate": 7.175930464640268e-08, + "loss": 0.8767, + "step": 5959 + }, + { + "epoch": 5.549348230912477, + "grad_norm": 1.7971640825271606, + "learning_rate": 7.146511157483216e-08, + "loss": 0.8724, + "step": 5960 + }, + { + "epoch": 5.550279329608939, + "grad_norm": 1.8387715816497803, + "learning_rate": 7.117151405258371e-08, + "loss": 0.8359, + "step": 5961 + }, + { + "epoch": 5.5512104283054, + "grad_norm": 1.8456361293792725, + "learning_rate": 7.087851215165614e-08, + "loss": 0.8685, + "step": 5962 + }, + { + "epoch": 5.552141527001862, + "grad_norm": 1.9173833131790161, + "learning_rate": 7.058610594390308e-08, + "loss": 0.8801, + "step": 5963 + }, + { + "epoch": 5.553072625698324, + "grad_norm": 1.7916356325149536, + "learning_rate": 7.029429550103106e-08, + "loss": 0.8637, + "step": 5964 + }, + { + "epoch": 5.554003724394786, + "grad_norm": 1.8275203704833984, + "learning_rate": 7.000308089460034e-08, + "loss": 0.8413, + "step": 5965 + }, + { + "epoch": 5.554934823091248, + "grad_norm": 1.8480234146118164, + "learning_rate": 6.971246219602517e-08, + "loss": 0.8806, + "step": 5966 + }, + { + "epoch": 5.55586592178771, + "grad_norm": 1.8012070655822754, + "learning_rate": 6.942243947657523e-08, + "loss": 0.8608, + "step": 5967 + }, + { + "epoch": 5.556797020484171, + "grad_norm": 1.7715802192687988, + "learning_rate": 6.913301280737139e-08, + "loss": 0.8346, + "step": 5968 + }, + { + "epoch": 5.557728119180633, + "grad_norm": 1.8419996500015259, + "learning_rate": 6.884418225938993e-08, + "loss": 0.8746, + "step": 5969 + }, + { + "epoch": 5.558659217877095, + "grad_norm": 1.8256666660308838, + "learning_rate": 6.85559479034617e-08, + "loss": 0.8583, + "step": 5970 + }, + { + "epoch": 5.559590316573557, + "grad_norm": 1.785621166229248, + "learning_rate": 6.826830981026933e-08, + "loss": 0.8152, + "step": 5971 + }, + { + "epoch": 5.560521415270019, + "grad_norm": 1.8242906332015991, + "learning_rate": 6.798126805035083e-08, + "loss": 0.8395, + "step": 5972 + }, + { + "epoch": 5.56145251396648, + "grad_norm": 1.83283269405365, + "learning_rate": 6.769482269409684e-08, + "loss": 0.8735, + "step": 5973 + }, + { + "epoch": 5.562383612662942, + "grad_norm": 1.8381223678588867, + "learning_rate": 6.740897381175338e-08, + "loss": 0.891, + "step": 5974 + }, + { + "epoch": 5.563314711359404, + "grad_norm": 1.8236596584320068, + "learning_rate": 6.712372147341827e-08, + "loss": 0.886, + "step": 5975 + }, + { + "epoch": 5.564245810055866, + "grad_norm": 1.8263452053070068, + "learning_rate": 6.683906574904364e-08, + "loss": 0.8749, + "step": 5976 + }, + { + "epoch": 5.565176908752328, + "grad_norm": 1.7623769044876099, + "learning_rate": 6.655500670843668e-08, + "loss": 0.863, + "step": 5977 + }, + { + "epoch": 5.56610800744879, + "grad_norm": 1.820884346961975, + "learning_rate": 6.627154442125638e-08, + "loss": 0.8365, + "step": 5978 + }, + { + "epoch": 5.567039106145251, + "grad_norm": 1.8408656120300293, + "learning_rate": 6.598867895701605e-08, + "loss": 0.8523, + "step": 5979 + }, + { + "epoch": 5.567970204841713, + "grad_norm": 1.9006643295288086, + "learning_rate": 6.570641038508296e-08, + "loss": 0.914, + "step": 5980 + }, + { + "epoch": 5.568901303538175, + "grad_norm": 1.898682951927185, + "learning_rate": 6.542473877467842e-08, + "loss": 0.8906, + "step": 5981 + }, + { + "epoch": 5.569832402234637, + "grad_norm": 1.881328821182251, + "learning_rate": 6.514366419487577e-08, + "loss": 0.9016, + "step": 5982 + }, + { + "epoch": 5.570763500931099, + "grad_norm": 1.8356987237930298, + "learning_rate": 6.48631867146035e-08, + "loss": 0.8501, + "step": 5983 + }, + { + "epoch": 5.571694599627561, + "grad_norm": 1.8315398693084717, + "learning_rate": 6.458330640264271e-08, + "loss": 0.886, + "step": 5984 + }, + { + "epoch": 5.572625698324022, + "grad_norm": 1.9021910429000854, + "learning_rate": 6.430402332762847e-08, + "loss": 0.8359, + "step": 5985 + }, + { + "epoch": 5.573556797020484, + "grad_norm": 1.830389380455017, + "learning_rate": 6.402533755804963e-08, + "loss": 0.8515, + "step": 5986 + }, + { + "epoch": 5.574487895716946, + "grad_norm": 1.8068532943725586, + "learning_rate": 6.374724916224761e-08, + "loss": 0.8679, + "step": 5987 + }, + { + "epoch": 5.575418994413408, + "grad_norm": 1.8922332525253296, + "learning_rate": 6.346975820841928e-08, + "loss": 0.87, + "step": 5988 + }, + { + "epoch": 5.57635009310987, + "grad_norm": 1.7715585231781006, + "learning_rate": 6.319286476461239e-08, + "loss": 0.852, + "step": 5989 + }, + { + "epoch": 5.577281191806332, + "grad_norm": 1.8676999807357788, + "learning_rate": 6.291656889873016e-08, + "loss": 0.8594, + "step": 5990 + }, + { + "epoch": 5.578212290502793, + "grad_norm": 1.7969449758529663, + "learning_rate": 6.26408706785281e-08, + "loss": 0.8739, + "step": 5991 + }, + { + "epoch": 5.579143389199255, + "grad_norm": 1.8363115787506104, + "learning_rate": 6.236577017161626e-08, + "loss": 0.8504, + "step": 5992 + }, + { + "epoch": 5.5800744878957165, + "grad_norm": 1.812180995941162, + "learning_rate": 6.209126744545713e-08, + "loss": 0.8787, + "step": 5993 + }, + { + "epoch": 5.581005586592179, + "grad_norm": 1.8263033628463745, + "learning_rate": 6.181736256736709e-08, + "loss": 0.8927, + "step": 5994 + }, + { + "epoch": 5.581936685288641, + "grad_norm": 1.8455986976623535, + "learning_rate": 6.154405560451577e-08, + "loss": 0.8653, + "step": 5995 + }, + { + "epoch": 5.582867783985103, + "grad_norm": 1.8830068111419678, + "learning_rate": 6.12713466239262e-08, + "loss": 0.9108, + "step": 5996 + }, + { + "epoch": 5.583798882681564, + "grad_norm": 1.7748769521713257, + "learning_rate": 6.099923569247463e-08, + "loss": 0.8633, + "step": 5997 + }, + { + "epoch": 5.584729981378026, + "grad_norm": 1.8017303943634033, + "learning_rate": 6.072772287689099e-08, + "loss": 0.8759, + "step": 5998 + }, + { + "epoch": 5.5856610800744875, + "grad_norm": 1.7511383295059204, + "learning_rate": 6.045680824375844e-08, + "loss": 0.8592, + "step": 5999 + }, + { + "epoch": 5.58659217877095, + "grad_norm": 1.8421976566314697, + "learning_rate": 6.018649185951325e-08, + "loss": 0.8554, + "step": 6000 + }, + { + "epoch": 5.587523277467412, + "grad_norm": 1.8167592287063599, + "learning_rate": 5.991677379044492e-08, + "loss": 0.8389, + "step": 6001 + }, + { + "epoch": 5.588454376163874, + "grad_norm": 1.7896063327789307, + "learning_rate": 5.964765410269635e-08, + "loss": 0.8777, + "step": 6002 + }, + { + "epoch": 5.589385474860335, + "grad_norm": 1.8245593309402466, + "learning_rate": 5.937913286226393e-08, + "loss": 0.8473, + "step": 6003 + }, + { + "epoch": 5.590316573556797, + "grad_norm": 1.8093490600585938, + "learning_rate": 5.911121013499721e-08, + "loss": 0.8651, + "step": 6004 + }, + { + "epoch": 5.5912476722532585, + "grad_norm": 1.841791033744812, + "learning_rate": 5.884388598659835e-08, + "loss": 0.8718, + "step": 6005 + }, + { + "epoch": 5.592178770949721, + "grad_norm": 1.7751057147979736, + "learning_rate": 5.8577160482623796e-08, + "loss": 0.8494, + "step": 6006 + }, + { + "epoch": 5.593109869646183, + "grad_norm": 1.796666145324707, + "learning_rate": 5.8311033688482064e-08, + "loss": 0.868, + "step": 6007 + }, + { + "epoch": 5.594040968342644, + "grad_norm": 1.8459514379501343, + "learning_rate": 5.804550566943595e-08, + "loss": 0.921, + "step": 6008 + }, + { + "epoch": 5.594972067039106, + "grad_norm": 1.8086484670639038, + "learning_rate": 5.7780576490600315e-08, + "loss": 0.9093, + "step": 6009 + }, + { + "epoch": 5.595903165735568, + "grad_norm": 1.836516261100769, + "learning_rate": 5.751624621694429e-08, + "loss": 0.9018, + "step": 6010 + }, + { + "epoch": 5.5968342644320295, + "grad_norm": 1.841781735420227, + "learning_rate": 5.725251491328909e-08, + "loss": 0.8785, + "step": 6011 + }, + { + "epoch": 5.597765363128492, + "grad_norm": 1.8107742071151733, + "learning_rate": 5.6989382644309645e-08, + "loss": 0.879, + "step": 6012 + }, + { + "epoch": 5.598696461824954, + "grad_norm": 1.8795465230941772, + "learning_rate": 5.6726849474533775e-08, + "loss": 0.8816, + "step": 6013 + }, + { + "epoch": 5.599627560521415, + "grad_norm": 1.810094952583313, + "learning_rate": 5.646491546834248e-08, + "loss": 0.8789, + "step": 6014 + }, + { + "epoch": 5.600558659217877, + "grad_norm": 1.8345750570297241, + "learning_rate": 5.6203580689970225e-08, + "loss": 0.8741, + "step": 6015 + }, + { + "epoch": 5.601489757914339, + "grad_norm": 1.838657021522522, + "learning_rate": 5.594284520350352e-08, + "loss": 0.8732, + "step": 6016 + }, + { + "epoch": 5.6024208566108005, + "grad_norm": 1.851117491722107, + "learning_rate": 5.568270907288287e-08, + "loss": 0.885, + "step": 6017 + }, + { + "epoch": 5.603351955307263, + "grad_norm": 1.8094843626022339, + "learning_rate": 5.542317236190115e-08, + "loss": 0.8976, + "step": 6018 + }, + { + "epoch": 5.604283054003725, + "grad_norm": 1.8319776058197021, + "learning_rate": 5.516423513420466e-08, + "loss": 0.8843, + "step": 6019 + }, + { + "epoch": 5.605214152700186, + "grad_norm": 1.80388605594635, + "learning_rate": 5.490589745329261e-08, + "loss": 0.8834, + "step": 6020 + }, + { + "epoch": 5.606145251396648, + "grad_norm": 1.824034571647644, + "learning_rate": 5.464815938251711e-08, + "loss": 0.8653, + "step": 6021 + }, + { + "epoch": 5.60707635009311, + "grad_norm": 1.76934015750885, + "learning_rate": 5.439102098508342e-08, + "loss": 0.877, + "step": 6022 + }, + { + "epoch": 5.6080074487895715, + "grad_norm": 1.782578706741333, + "learning_rate": 5.413448232404889e-08, + "loss": 0.8301, + "step": 6023 + }, + { + "epoch": 5.608938547486034, + "grad_norm": 1.8552275896072388, + "learning_rate": 5.387854346232485e-08, + "loss": 0.9031, + "step": 6024 + }, + { + "epoch": 5.609869646182496, + "grad_norm": 1.8113971948623657, + "learning_rate": 5.362320446267555e-08, + "loss": 0.8796, + "step": 6025 + }, + { + "epoch": 5.610800744878957, + "grad_norm": 1.7795312404632568, + "learning_rate": 5.3368465387717286e-08, + "loss": 0.8646, + "step": 6026 + }, + { + "epoch": 5.611731843575419, + "grad_norm": 1.889571189880371, + "learning_rate": 5.311432629991953e-08, + "loss": 0.8465, + "step": 6027 + }, + { + "epoch": 5.61266294227188, + "grad_norm": 1.778803825378418, + "learning_rate": 5.2860787261605485e-08, + "loss": 0.8655, + "step": 6028 + }, + { + "epoch": 5.6135940409683425, + "grad_norm": 1.830640196800232, + "learning_rate": 5.260784833494986e-08, + "loss": 0.8731, + "step": 6029 + }, + { + "epoch": 5.614525139664805, + "grad_norm": 1.8325234651565552, + "learning_rate": 5.235550958198082e-08, + "loss": 0.8838, + "step": 6030 + }, + { + "epoch": 5.615456238361267, + "grad_norm": 1.8577615022659302, + "learning_rate": 5.210377106457998e-08, + "loss": 0.8799, + "step": 6031 + }, + { + "epoch": 5.616387337057728, + "grad_norm": 2.0453357696533203, + "learning_rate": 5.185263284448072e-08, + "loss": 0.9199, + "step": 6032 + }, + { + "epoch": 5.61731843575419, + "grad_norm": 1.8782248497009277, + "learning_rate": 5.160209498326935e-08, + "loss": 0.877, + "step": 6033 + }, + { + "epoch": 5.618249534450651, + "grad_norm": 1.8369654417037964, + "learning_rate": 5.135215754238587e-08, + "loss": 0.8678, + "step": 6034 + }, + { + "epoch": 5.6191806331471135, + "grad_norm": 1.8032546043395996, + "learning_rate": 5.1102820583121826e-08, + "loss": 0.8726, + "step": 6035 + }, + { + "epoch": 5.620111731843576, + "grad_norm": 1.8382453918457031, + "learning_rate": 5.085408416662274e-08, + "loss": 0.8757, + "step": 6036 + }, + { + "epoch": 5.621042830540038, + "grad_norm": 1.8226993083953857, + "learning_rate": 5.060594835388538e-08, + "loss": 0.8952, + "step": 6037 + }, + { + "epoch": 5.621973929236499, + "grad_norm": 1.8010362386703491, + "learning_rate": 5.035841320576079e-08, + "loss": 0.8608, + "step": 6038 + }, + { + "epoch": 5.622905027932961, + "grad_norm": 1.8344913721084595, + "learning_rate": 5.0111478782952084e-08, + "loss": 0.8301, + "step": 6039 + }, + { + "epoch": 5.623836126629422, + "grad_norm": 1.7734776735305786, + "learning_rate": 4.986514514601415e-08, + "loss": 0.8515, + "step": 6040 + }, + { + "epoch": 5.6247672253258845, + "grad_norm": 1.8486405611038208, + "learning_rate": 4.9619412355355615e-08, + "loss": 0.8806, + "step": 6041 + }, + { + "epoch": 5.625698324022347, + "grad_norm": 1.871982455253601, + "learning_rate": 4.937428047123799e-08, + "loss": 0.8865, + "step": 6042 + }, + { + "epoch": 5.626629422718808, + "grad_norm": 1.8615424633026123, + "learning_rate": 4.912974955377486e-08, + "loss": 0.8552, + "step": 6043 + }, + { + "epoch": 5.62756052141527, + "grad_norm": 1.9197529554367065, + "learning_rate": 4.88858196629316e-08, + "loss": 0.8691, + "step": 6044 + }, + { + "epoch": 5.628491620111732, + "grad_norm": 1.8208160400390625, + "learning_rate": 4.86424908585284e-08, + "loss": 0.8849, + "step": 6045 + }, + { + "epoch": 5.629422718808193, + "grad_norm": 1.8656160831451416, + "learning_rate": 4.839976320023615e-08, + "loss": 0.8859, + "step": 6046 + }, + { + "epoch": 5.6303538175046555, + "grad_norm": 1.8428510427474976, + "learning_rate": 4.815763674757862e-08, + "loss": 0.8972, + "step": 6047 + }, + { + "epoch": 5.631284916201118, + "grad_norm": 1.8668882846832275, + "learning_rate": 4.7916111559932754e-08, + "loss": 0.9174, + "step": 6048 + }, + { + "epoch": 5.632216014897579, + "grad_norm": 1.8282147645950317, + "learning_rate": 4.767518769652785e-08, + "loss": 0.8468, + "step": 6049 + }, + { + "epoch": 5.633147113594041, + "grad_norm": 1.786958932876587, + "learning_rate": 4.7434865216445804e-08, + "loss": 0.8784, + "step": 6050 + }, + { + "epoch": 5.634078212290503, + "grad_norm": 1.8544881343841553, + "learning_rate": 4.719514417862031e-08, + "loss": 0.8922, + "step": 6051 + }, + { + "epoch": 5.635009310986964, + "grad_norm": 1.818051815032959, + "learning_rate": 4.695602464183824e-08, + "loss": 0.867, + "step": 6052 + }, + { + "epoch": 5.6359404096834265, + "grad_norm": 1.8366373777389526, + "learning_rate": 4.6717506664739344e-08, + "loss": 0.8655, + "step": 6053 + }, + { + "epoch": 5.636871508379889, + "grad_norm": 1.8343926668167114, + "learning_rate": 4.647959030581517e-08, + "loss": 0.8709, + "step": 6054 + }, + { + "epoch": 5.63780260707635, + "grad_norm": 1.8226313591003418, + "learning_rate": 4.624227562340933e-08, + "loss": 0.8706, + "step": 6055 + }, + { + "epoch": 5.638733705772812, + "grad_norm": 1.8533669710159302, + "learning_rate": 4.600556267571915e-08, + "loss": 0.9131, + "step": 6056 + }, + { + "epoch": 5.639664804469274, + "grad_norm": 1.83689546585083, + "learning_rate": 4.576945152079376e-08, + "loss": 0.8744, + "step": 6057 + }, + { + "epoch": 5.640595903165735, + "grad_norm": 1.8460745811462402, + "learning_rate": 4.5533942216534064e-08, + "loss": 0.897, + "step": 6058 + }, + { + "epoch": 5.6415270018621975, + "grad_norm": 1.7994188070297241, + "learning_rate": 4.529903482069442e-08, + "loss": 0.8498, + "step": 6059 + }, + { + "epoch": 5.64245810055866, + "grad_norm": 1.8268492221832275, + "learning_rate": 4.5064729390881244e-08, + "loss": 0.8833, + "step": 6060 + }, + { + "epoch": 5.643389199255121, + "grad_norm": 1.8405555486679077, + "learning_rate": 4.4831025984553025e-08, + "loss": 0.8406, + "step": 6061 + }, + { + "epoch": 5.644320297951583, + "grad_norm": 1.847129464149475, + "learning_rate": 4.4597924659020573e-08, + "loss": 0.8564, + "step": 6062 + }, + { + "epoch": 5.645251396648044, + "grad_norm": 1.8487014770507812, + "learning_rate": 4.436542547144762e-08, + "loss": 0.8543, + "step": 6063 + }, + { + "epoch": 5.646182495344506, + "grad_norm": 1.8839802742004395, + "learning_rate": 4.4133528478850216e-08, + "loss": 0.8895, + "step": 6064 + }, + { + "epoch": 5.6471135940409685, + "grad_norm": 1.8739949464797974, + "learning_rate": 4.390223373809566e-08, + "loss": 0.8685, + "step": 6065 + }, + { + "epoch": 5.648044692737431, + "grad_norm": 1.9083195924758911, + "learning_rate": 4.3671541305904677e-08, + "loss": 0.83, + "step": 6066 + }, + { + "epoch": 5.648975791433892, + "grad_norm": 1.8619877099990845, + "learning_rate": 4.3441451238850354e-08, + "loss": 0.9348, + "step": 6067 + }, + { + "epoch": 5.649906890130354, + "grad_norm": 1.7633413076400757, + "learning_rate": 4.3211963593357275e-08, + "loss": 0.8219, + "step": 6068 + }, + { + "epoch": 5.650837988826815, + "grad_norm": 1.8129841089248657, + "learning_rate": 4.298307842570265e-08, + "loss": 0.8629, + "step": 6069 + }, + { + "epoch": 5.651769087523277, + "grad_norm": 1.9705711603164673, + "learning_rate": 4.2754795792016025e-08, + "loss": 0.8809, + "step": 6070 + }, + { + "epoch": 5.6527001862197395, + "grad_norm": 1.8835104703903198, + "learning_rate": 4.2527115748279005e-08, + "loss": 0.9017, + "step": 6071 + }, + { + "epoch": 5.653631284916202, + "grad_norm": 1.8339288234710693, + "learning_rate": 4.2300038350325535e-08, + "loss": 0.8317, + "step": 6072 + }, + { + "epoch": 5.654562383612663, + "grad_norm": 1.8654881715774536, + "learning_rate": 4.207356365384191e-08, + "loss": 0.8716, + "step": 6073 + }, + { + "epoch": 5.655493482309125, + "grad_norm": 1.8331904411315918, + "learning_rate": 4.184769171436676e-08, + "loss": 0.8997, + "step": 6074 + }, + { + "epoch": 5.656424581005586, + "grad_norm": 1.8411519527435303, + "learning_rate": 4.162242258729021e-08, + "loss": 0.9461, + "step": 6075 + }, + { + "epoch": 5.657355679702048, + "grad_norm": 1.797438144683838, + "learning_rate": 4.139775632785531e-08, + "loss": 0.8419, + "step": 6076 + }, + { + "epoch": 5.6582867783985105, + "grad_norm": 1.840567708015442, + "learning_rate": 4.1173692991156586e-08, + "loss": 0.8823, + "step": 6077 + }, + { + "epoch": 5.659217877094972, + "grad_norm": 1.8347073793411255, + "learning_rate": 4.0950232632141205e-08, + "loss": 0.8729, + "step": 6078 + }, + { + "epoch": 5.660148975791434, + "grad_norm": 1.8677629232406616, + "learning_rate": 4.0727375305608384e-08, + "loss": 0.8949, + "step": 6079 + }, + { + "epoch": 5.661080074487896, + "grad_norm": 1.8658162355422974, + "learning_rate": 4.050512106620913e-08, + "loss": 0.862, + "step": 6080 + }, + { + "epoch": 5.662011173184357, + "grad_norm": 1.8640216588974, + "learning_rate": 4.028346996844734e-08, + "loss": 0.8999, + "step": 6081 + }, + { + "epoch": 5.662942271880819, + "grad_norm": 1.8421566486358643, + "learning_rate": 4.006242206667815e-08, + "loss": 0.876, + "step": 6082 + }, + { + "epoch": 5.6638733705772815, + "grad_norm": 1.7869415283203125, + "learning_rate": 3.984197741510903e-08, + "loss": 0.8422, + "step": 6083 + }, + { + "epoch": 5.664804469273743, + "grad_norm": 1.7873530387878418, + "learning_rate": 3.96221360677998e-08, + "loss": 0.8605, + "step": 6084 + }, + { + "epoch": 5.665735567970205, + "grad_norm": 1.8138799667358398, + "learning_rate": 3.9402898078662043e-08, + "loss": 0.8588, + "step": 6085 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 1.85843825340271, + "learning_rate": 3.918426350145971e-08, + "loss": 0.884, + "step": 6086 + }, + { + "epoch": 5.667597765363128, + "grad_norm": 1.8989852666854858, + "learning_rate": 3.896623238980796e-08, + "loss": 0.8864, + "step": 6087 + }, + { + "epoch": 5.66852886405959, + "grad_norm": 1.8271750211715698, + "learning_rate": 3.8748804797175134e-08, + "loss": 0.8805, + "step": 6088 + }, + { + "epoch": 5.6694599627560525, + "grad_norm": 1.8207539319992065, + "learning_rate": 3.853198077688053e-08, + "loss": 0.8314, + "step": 6089 + }, + { + "epoch": 5.670391061452514, + "grad_norm": 1.81073796749115, + "learning_rate": 3.831576038209606e-08, + "loss": 0.8825, + "step": 6090 + }, + { + "epoch": 5.671322160148976, + "grad_norm": 1.8539766073226929, + "learning_rate": 3.810014366584569e-08, + "loss": 0.8815, + "step": 6091 + }, + { + "epoch": 5.672253258845438, + "grad_norm": 1.814493179321289, + "learning_rate": 3.7885130681004636e-08, + "loss": 0.8533, + "step": 6092 + }, + { + "epoch": 5.673184357541899, + "grad_norm": 1.8028181791305542, + "learning_rate": 3.767072148030071e-08, + "loss": 0.8599, + "step": 6093 + }, + { + "epoch": 5.674115456238361, + "grad_norm": 1.8319977521896362, + "learning_rate": 3.745691611631325e-08, + "loss": 0.8691, + "step": 6094 + }, + { + "epoch": 5.6750465549348235, + "grad_norm": 1.8139729499816895, + "learning_rate": 3.724371464147419e-08, + "loss": 0.8709, + "step": 6095 + }, + { + "epoch": 5.675977653631285, + "grad_norm": 1.7646998167037964, + "learning_rate": 3.703111710806645e-08, + "loss": 0.8579, + "step": 6096 + }, + { + "epoch": 5.676908752327747, + "grad_norm": 1.8442840576171875, + "learning_rate": 3.681912356822554e-08, + "loss": 0.8487, + "step": 6097 + }, + { + "epoch": 5.677839851024208, + "grad_norm": 1.8518794775009155, + "learning_rate": 3.660773407393847e-08, + "loss": 0.8959, + "step": 6098 + }, + { + "epoch": 5.67877094972067, + "grad_norm": 1.812034010887146, + "learning_rate": 3.6396948677044355e-08, + "loss": 0.8852, + "step": 6099 + }, + { + "epoch": 5.679702048417132, + "grad_norm": 1.8986492156982422, + "learning_rate": 3.6186767429234323e-08, + "loss": 0.877, + "step": 6100 + }, + { + "epoch": 5.6806331471135945, + "grad_norm": 1.8779871463775635, + "learning_rate": 3.59771903820505e-08, + "loss": 0.8714, + "step": 6101 + }, + { + "epoch": 5.681564245810056, + "grad_norm": 1.8665193319320679, + "learning_rate": 3.576821758688787e-08, + "loss": 0.8497, + "step": 6102 + }, + { + "epoch": 5.682495344506518, + "grad_norm": 1.7920194864273071, + "learning_rate": 3.555984909499266e-08, + "loss": 0.8612, + "step": 6103 + }, + { + "epoch": 5.683426443202979, + "grad_norm": 1.865971326828003, + "learning_rate": 3.535208495746317e-08, + "loss": 0.8798, + "step": 6104 + }, + { + "epoch": 5.684357541899441, + "grad_norm": 1.7738081216812134, + "learning_rate": 3.51449252252492e-08, + "loss": 0.8449, + "step": 6105 + }, + { + "epoch": 5.685288640595903, + "grad_norm": 1.8473042249679565, + "learning_rate": 3.4938369949152616e-08, + "loss": 0.8312, + "step": 6106 + }, + { + "epoch": 5.6862197392923655, + "grad_norm": 1.8980828523635864, + "learning_rate": 3.473241917982678e-08, + "loss": 0.8977, + "step": 6107 + }, + { + "epoch": 5.687150837988827, + "grad_norm": 1.8025346994400024, + "learning_rate": 3.4527072967777406e-08, + "loss": 0.8667, + "step": 6108 + }, + { + "epoch": 5.688081936685289, + "grad_norm": 1.8227332830429077, + "learning_rate": 3.4322331363360886e-08, + "loss": 0.8724, + "step": 6109 + }, + { + "epoch": 5.68901303538175, + "grad_norm": 1.8765535354614258, + "learning_rate": 3.411819441678677e-08, + "loss": 0.8903, + "step": 6110 + }, + { + "epoch": 5.689944134078212, + "grad_norm": 1.7993947267532349, + "learning_rate": 3.391466217811473e-08, + "loss": 0.8372, + "step": 6111 + }, + { + "epoch": 5.690875232774674, + "grad_norm": 1.843091368675232, + "learning_rate": 3.371173469725736e-08, + "loss": 0.8421, + "step": 6112 + }, + { + "epoch": 5.691806331471136, + "grad_norm": 1.9149556159973145, + "learning_rate": 3.3509412023978727e-08, + "loss": 0.8826, + "step": 6113 + }, + { + "epoch": 5.692737430167598, + "grad_norm": 1.8525922298431396, + "learning_rate": 3.330769420789415e-08, + "loss": 0.8569, + "step": 6114 + }, + { + "epoch": 5.69366852886406, + "grad_norm": 1.9141684770584106, + "learning_rate": 3.3106581298470993e-08, + "loss": 0.8801, + "step": 6115 + }, + { + "epoch": 5.694599627560521, + "grad_norm": 1.8517714738845825, + "learning_rate": 3.29060733450276e-08, + "loss": 0.8732, + "step": 6116 + }, + { + "epoch": 5.695530726256983, + "grad_norm": 1.8187402486801147, + "learning_rate": 3.270617039673518e-08, + "loss": 0.8723, + "step": 6117 + }, + { + "epoch": 5.696461824953445, + "grad_norm": 1.9003697633743286, + "learning_rate": 3.250687250261564e-08, + "loss": 0.8995, + "step": 6118 + }, + { + "epoch": 5.697392923649907, + "grad_norm": 1.8452231884002686, + "learning_rate": 3.2308179711543206e-08, + "loss": 0.8645, + "step": 6119 + }, + { + "epoch": 5.698324022346369, + "grad_norm": 1.8136553764343262, + "learning_rate": 3.2110092072242246e-08, + "loss": 0.8911, + "step": 6120 + }, + { + "epoch": 5.699255121042831, + "grad_norm": 1.833276629447937, + "learning_rate": 3.191260963329085e-08, + "loss": 0.825, + "step": 6121 + }, + { + "epoch": 5.700186219739292, + "grad_norm": 1.7938638925552368, + "learning_rate": 3.171573244311721e-08, + "loss": 0.8768, + "step": 6122 + }, + { + "epoch": 5.701117318435754, + "grad_norm": 1.8911608457565308, + "learning_rate": 3.151946055000132e-08, + "loss": 0.8766, + "step": 6123 + }, + { + "epoch": 5.702048417132216, + "grad_norm": 1.802827000617981, + "learning_rate": 3.132379400207497e-08, + "loss": 0.8143, + "step": 6124 + }, + { + "epoch": 5.702979515828678, + "grad_norm": 1.8331093788146973, + "learning_rate": 3.112873284732171e-08, + "loss": 0.8709, + "step": 6125 + }, + { + "epoch": 5.70391061452514, + "grad_norm": 1.9000455141067505, + "learning_rate": 3.093427713357633e-08, + "loss": 0.8983, + "step": 6126 + }, + { + "epoch": 5.704841713221602, + "grad_norm": 1.8474172353744507, + "learning_rate": 3.074042690852486e-08, + "loss": 0.8555, + "step": 6127 + }, + { + "epoch": 5.705772811918063, + "grad_norm": 1.8938302993774414, + "learning_rate": 3.054718221970537e-08, + "loss": 0.9134, + "step": 6128 + }, + { + "epoch": 5.706703910614525, + "grad_norm": 1.8721483945846558, + "learning_rate": 3.035454311450747e-08, + "loss": 0.8619, + "step": 6129 + }, + { + "epoch": 5.707635009310987, + "grad_norm": 1.8344711065292358, + "learning_rate": 3.0162509640171424e-08, + "loss": 0.8927, + "step": 6130 + }, + { + "epoch": 5.708566108007449, + "grad_norm": 1.8288629055023193, + "learning_rate": 2.997108184379011e-08, + "loss": 0.9075, + "step": 6131 + }, + { + "epoch": 5.709497206703911, + "grad_norm": 1.893816590309143, + "learning_rate": 2.9780259772307362e-08, + "loss": 0.8475, + "step": 6132 + }, + { + "epoch": 5.710428305400372, + "grad_norm": 1.84168541431427, + "learning_rate": 2.959004347251798e-08, + "loss": 0.8919, + "step": 6133 + }, + { + "epoch": 5.711359404096834, + "grad_norm": 1.7987818717956543, + "learning_rate": 2.9400432991068816e-08, + "loss": 0.8592, + "step": 6134 + }, + { + "epoch": 5.712290502793296, + "grad_norm": 1.7837445735931396, + "learning_rate": 2.9211428374458228e-08, + "loss": 0.8601, + "step": 6135 + }, + { + "epoch": 5.713221601489758, + "grad_norm": 1.8974809646606445, + "learning_rate": 2.9023029669036084e-08, + "loss": 0.8985, + "step": 6136 + }, + { + "epoch": 5.71415270018622, + "grad_norm": 1.8392573595046997, + "learning_rate": 2.883523692100293e-08, + "loss": 0.8709, + "step": 6137 + }, + { + "epoch": 5.715083798882682, + "grad_norm": 1.8795548677444458, + "learning_rate": 2.8648050176410813e-08, + "loss": 0.8933, + "step": 6138 + }, + { + "epoch": 5.716014897579143, + "grad_norm": 1.7658891677856445, + "learning_rate": 2.8461469481164682e-08, + "loss": 0.8567, + "step": 6139 + }, + { + "epoch": 5.716945996275605, + "grad_norm": 1.8110045194625854, + "learning_rate": 2.8275494881018495e-08, + "loss": 0.9024, + "step": 6140 + }, + { + "epoch": 5.717877094972067, + "grad_norm": 1.831350564956665, + "learning_rate": 2.80901264215791e-08, + "loss": 0.8456, + "step": 6141 + }, + { + "epoch": 5.718808193668529, + "grad_norm": 1.8483390808105469, + "learning_rate": 2.790536414830486e-08, + "loss": 0.8522, + "step": 6142 + }, + { + "epoch": 5.719739292364991, + "grad_norm": 1.8632031679153442, + "learning_rate": 2.7721208106504805e-08, + "loss": 0.9092, + "step": 6143 + }, + { + "epoch": 5.720670391061453, + "grad_norm": 1.859823226928711, + "learning_rate": 2.75376583413392e-08, + "loss": 0.8615, + "step": 6144 + }, + { + "epoch": 5.721601489757914, + "grad_norm": 1.9285625219345093, + "learning_rate": 2.73547148978201e-08, + "loss": 0.8461, + "step": 6145 + }, + { + "epoch": 5.722532588454376, + "grad_norm": 1.7940329313278198, + "learning_rate": 2.717237782081078e-08, + "loss": 0.8799, + "step": 6146 + }, + { + "epoch": 5.723463687150838, + "grad_norm": 1.8296020030975342, + "learning_rate": 2.699064715502575e-08, + "loss": 0.8651, + "step": 6147 + }, + { + "epoch": 5.724394785847299, + "grad_norm": 1.8407827615737915, + "learning_rate": 2.680952294503075e-08, + "loss": 0.8652, + "step": 6148 + }, + { + "epoch": 5.725325884543762, + "grad_norm": 1.8058922290802002, + "learning_rate": 2.6629005235242478e-08, + "loss": 0.8298, + "step": 6149 + }, + { + "epoch": 5.726256983240224, + "grad_norm": 1.7690619230270386, + "learning_rate": 2.644909406992996e-08, + "loss": 0.8516, + "step": 6150 + }, + { + "epoch": 5.727188081936685, + "grad_norm": 1.8922557830810547, + "learning_rate": 2.626978949321235e-08, + "loss": 0.8964, + "step": 6151 + }, + { + "epoch": 5.728119180633147, + "grad_norm": 1.79352867603302, + "learning_rate": 2.6091091549060588e-08, + "loss": 0.869, + "step": 6152 + }, + { + "epoch": 5.729050279329609, + "grad_norm": 1.8331267833709717, + "learning_rate": 2.591300028129684e-08, + "loss": 0.8928, + "step": 6153 + }, + { + "epoch": 5.72998137802607, + "grad_norm": 1.8078433275222778, + "learning_rate": 2.573551573359423e-08, + "loss": 0.8406, + "step": 6154 + }, + { + "epoch": 5.730912476722533, + "grad_norm": 1.7971630096435547, + "learning_rate": 2.5558637949477384e-08, + "loss": 0.8803, + "step": 6155 + }, + { + "epoch": 5.731843575418995, + "grad_norm": 1.8573946952819824, + "learning_rate": 2.538236697232188e-08, + "loss": 0.8775, + "step": 6156 + }, + { + "epoch": 5.732774674115456, + "grad_norm": 1.7852193117141724, + "learning_rate": 2.520670284535509e-08, + "loss": 0.8426, + "step": 6157 + }, + { + "epoch": 5.733705772811918, + "grad_norm": 1.8414041996002197, + "learning_rate": 2.5031645611654497e-08, + "loss": 0.9165, + "step": 6158 + }, + { + "epoch": 5.73463687150838, + "grad_norm": 2.3889214992523193, + "learning_rate": 2.4857195314149374e-08, + "loss": 0.8984, + "step": 6159 + }, + { + "epoch": 5.735567970204841, + "grad_norm": 1.8619686365127563, + "learning_rate": 2.468335199562105e-08, + "loss": 0.8721, + "step": 6160 + }, + { + "epoch": 5.736499068901304, + "grad_norm": 1.9081919193267822, + "learning_rate": 2.4510115698700153e-08, + "loss": 0.8635, + "step": 6161 + }, + { + "epoch": 5.737430167597766, + "grad_norm": 1.8540387153625488, + "learning_rate": 2.433748646586964e-08, + "loss": 0.8297, + "step": 6162 + }, + { + "epoch": 5.738361266294227, + "grad_norm": 1.7734057903289795, + "learning_rate": 2.416546433946315e-08, + "loss": 0.8325, + "step": 6163 + }, + { + "epoch": 5.739292364990689, + "grad_norm": 1.8043057918548584, + "learning_rate": 2.399404936166638e-08, + "loss": 0.8521, + "step": 6164 + }, + { + "epoch": 5.740223463687151, + "grad_norm": 1.784746766090393, + "learning_rate": 2.3823241574514878e-08, + "loss": 0.8357, + "step": 6165 + }, + { + "epoch": 5.741154562383612, + "grad_norm": 1.8335862159729004, + "learning_rate": 2.3653041019895686e-08, + "loss": 0.8624, + "step": 6166 + }, + { + "epoch": 5.742085661080075, + "grad_norm": 1.8390240669250488, + "learning_rate": 2.348344773954736e-08, + "loss": 0.8604, + "step": 6167 + }, + { + "epoch": 5.743016759776537, + "grad_norm": 1.8676220178604126, + "learning_rate": 2.331446177505914e-08, + "loss": 0.874, + "step": 6168 + }, + { + "epoch": 5.743947858472998, + "grad_norm": 1.798399806022644, + "learning_rate": 2.314608316787148e-08, + "loss": 0.8369, + "step": 6169 + }, + { + "epoch": 5.74487895716946, + "grad_norm": 1.8409217596054077, + "learning_rate": 2.29783119592758e-08, + "loss": 0.8685, + "step": 6170 + }, + { + "epoch": 5.745810055865922, + "grad_norm": 1.7821255922317505, + "learning_rate": 2.2811148190414468e-08, + "loss": 0.8562, + "step": 6171 + }, + { + "epoch": 5.746741154562383, + "grad_norm": 1.8352800607681274, + "learning_rate": 2.2644591902281353e-08, + "loss": 0.8854, + "step": 6172 + }, + { + "epoch": 5.747672253258846, + "grad_norm": 1.8495738506317139, + "learning_rate": 2.2478643135720735e-08, + "loss": 0.8155, + "step": 6173 + }, + { + "epoch": 5.748603351955307, + "grad_norm": 1.816753625869751, + "learning_rate": 2.2313301931428388e-08, + "loss": 0.8728, + "step": 6174 + }, + { + "epoch": 5.749534450651769, + "grad_norm": 1.8112695217132568, + "learning_rate": 2.2148568329951047e-08, + "loss": 0.8734, + "step": 6175 + }, + { + "epoch": 5.750465549348231, + "grad_norm": 1.795685052871704, + "learning_rate": 2.1984442371685845e-08, + "loss": 0.8545, + "step": 6176 + }, + { + "epoch": 5.751396648044693, + "grad_norm": 1.8227074146270752, + "learning_rate": 2.182092409688169e-08, + "loss": 0.907, + "step": 6177 + }, + { + "epoch": 5.752327746741154, + "grad_norm": 1.8278332948684692, + "learning_rate": 2.1658013545638457e-08, + "loss": 0.8541, + "step": 6178 + }, + { + "epoch": 5.753258845437617, + "grad_norm": 1.8437132835388184, + "learning_rate": 2.1495710757905854e-08, + "loss": 0.8798, + "step": 6179 + }, + { + "epoch": 5.754189944134078, + "grad_norm": 1.8876391649246216, + "learning_rate": 2.1334015773486204e-08, + "loss": 0.8932, + "step": 6180 + }, + { + "epoch": 5.75512104283054, + "grad_norm": 1.7828001976013184, + "learning_rate": 2.1172928632031687e-08, + "loss": 0.8933, + "step": 6181 + }, + { + "epoch": 5.756052141527002, + "grad_norm": 1.9025543928146362, + "learning_rate": 2.1012449373045417e-08, + "loss": 0.8838, + "step": 6182 + }, + { + "epoch": 5.756983240223463, + "grad_norm": 1.8180935382843018, + "learning_rate": 2.0852578035881754e-08, + "loss": 0.85, + "step": 6183 + }, + { + "epoch": 5.757914338919925, + "grad_norm": 1.8774948120117188, + "learning_rate": 2.0693314659746276e-08, + "loss": 0.9177, + "step": 6184 + }, + { + "epoch": 5.758845437616388, + "grad_norm": 2.102290391921997, + "learning_rate": 2.053465928369497e-08, + "loss": 0.8787, + "step": 6185 + }, + { + "epoch": 5.759776536312849, + "grad_norm": 1.9274338483810425, + "learning_rate": 2.0376611946634482e-08, + "loss": 0.8802, + "step": 6186 + }, + { + "epoch": 5.760707635009311, + "grad_norm": 1.7935447692871094, + "learning_rate": 2.0219172687323263e-08, + "loss": 0.8325, + "step": 6187 + }, + { + "epoch": 5.761638733705773, + "grad_norm": 1.8201677799224854, + "learning_rate": 2.0062341544370146e-08, + "loss": 0.8685, + "step": 6188 + }, + { + "epoch": 5.762569832402234, + "grad_norm": 1.8810665607452393, + "learning_rate": 1.9906118556234365e-08, + "loss": 0.853, + "step": 6189 + }, + { + "epoch": 5.763500931098696, + "grad_norm": 1.8570650815963745, + "learning_rate": 1.975050376122667e-08, + "loss": 0.865, + "step": 6190 + }, + { + "epoch": 5.764432029795159, + "grad_norm": 1.8043748140335083, + "learning_rate": 1.959549719750875e-08, + "loss": 0.8336, + "step": 6191 + }, + { + "epoch": 5.76536312849162, + "grad_norm": 1.8313766717910767, + "learning_rate": 1.9441098903092427e-08, + "loss": 0.9102, + "step": 6192 + }, + { + "epoch": 5.766294227188082, + "grad_norm": 1.9020510911941528, + "learning_rate": 1.928730891584074e-08, + "loss": 0.901, + "step": 6193 + }, + { + "epoch": 5.767225325884544, + "grad_norm": 1.842392921447754, + "learning_rate": 1.9134127273468252e-08, + "loss": 0.888, + "step": 6194 + }, + { + "epoch": 5.768156424581005, + "grad_norm": 1.805274248123169, + "learning_rate": 1.8981554013538794e-08, + "loss": 0.8665, + "step": 6195 + }, + { + "epoch": 5.769087523277467, + "grad_norm": 1.9056750535964966, + "learning_rate": 1.8829589173468554e-08, + "loss": 0.8971, + "step": 6196 + }, + { + "epoch": 5.77001862197393, + "grad_norm": 1.7973355054855347, + "learning_rate": 1.8678232790523553e-08, + "loss": 0.8049, + "step": 6197 + }, + { + "epoch": 5.770949720670391, + "grad_norm": 1.8343547582626343, + "learning_rate": 1.852748490182077e-08, + "loss": 0.8845, + "step": 6198 + }, + { + "epoch": 5.771880819366853, + "grad_norm": 1.8544816970825195, + "learning_rate": 1.8377345544328406e-08, + "loss": 0.905, + "step": 6199 + }, + { + "epoch": 5.772811918063315, + "grad_norm": 1.9474034309387207, + "learning_rate": 1.822781475486507e-08, + "loss": 0.9156, + "step": 6200 + }, + { + "epoch": 5.773743016759776, + "grad_norm": 1.8517043590545654, + "learning_rate": 1.8078892570100036e-08, + "loss": 0.8798, + "step": 6201 + }, + { + "epoch": 5.774674115456238, + "grad_norm": 1.8375986814498901, + "learning_rate": 1.793057902655354e-08, + "loss": 0.8668, + "step": 6202 + }, + { + "epoch": 5.775605214152701, + "grad_norm": 1.828906536102295, + "learning_rate": 1.7782874160596485e-08, + "loss": 0.8855, + "step": 6203 + }, + { + "epoch": 5.776536312849162, + "grad_norm": 1.7894619703292847, + "learning_rate": 1.7635778008450456e-08, + "loss": 0.8583, + "step": 6204 + }, + { + "epoch": 5.777467411545624, + "grad_norm": 1.8066133260726929, + "learning_rate": 1.748929060618798e-08, + "loss": 0.8775, + "step": 6205 + }, + { + "epoch": 5.778398510242086, + "grad_norm": 1.8445091247558594, + "learning_rate": 1.7343411989731996e-08, + "loss": 0.8321, + "step": 6206 + }, + { + "epoch": 5.779329608938547, + "grad_norm": 1.81353759765625, + "learning_rate": 1.719814219485638e-08, + "loss": 0.8645, + "step": 6207 + }, + { + "epoch": 5.780260707635009, + "grad_norm": 1.8154714107513428, + "learning_rate": 1.70534812571857e-08, + "loss": 0.8729, + "step": 6208 + }, + { + "epoch": 5.781191806331471, + "grad_norm": 1.858802080154419, + "learning_rate": 1.6909429212194628e-08, + "loss": 0.8693, + "step": 6209 + }, + { + "epoch": 5.782122905027933, + "grad_norm": 1.750205636024475, + "learning_rate": 1.6765986095209906e-08, + "loss": 0.819, + "step": 6210 + }, + { + "epoch": 5.783054003724395, + "grad_norm": 1.857749581336975, + "learning_rate": 1.6623151941407566e-08, + "loss": 0.8196, + "step": 6211 + }, + { + "epoch": 5.783985102420857, + "grad_norm": 1.8542076349258423, + "learning_rate": 1.6480926785814865e-08, + "loss": 0.9032, + "step": 6212 + }, + { + "epoch": 5.784916201117318, + "grad_norm": 1.8493772745132446, + "learning_rate": 1.633931066330946e-08, + "loss": 0.8478, + "step": 6213 + }, + { + "epoch": 5.78584729981378, + "grad_norm": 1.896150827407837, + "learning_rate": 1.6198303608619958e-08, + "loss": 0.8897, + "step": 6214 + }, + { + "epoch": 5.786778398510242, + "grad_norm": 1.787654995918274, + "learning_rate": 1.6057905656325924e-08, + "loss": 0.8469, + "step": 6215 + }, + { + "epoch": 5.787709497206704, + "grad_norm": 1.865329384803772, + "learning_rate": 1.5918116840856758e-08, + "loss": 0.8385, + "step": 6216 + }, + { + "epoch": 5.788640595903166, + "grad_norm": 1.8444942235946655, + "learning_rate": 1.5778937196492538e-08, + "loss": 0.8891, + "step": 6217 + }, + { + "epoch": 5.789571694599628, + "grad_norm": 1.8012547492980957, + "learning_rate": 1.5640366757364856e-08, + "loss": 0.8695, + "step": 6218 + }, + { + "epoch": 5.790502793296089, + "grad_norm": 1.8336769342422485, + "learning_rate": 1.5502405557455413e-08, + "loss": 0.8531, + "step": 6219 + }, + { + "epoch": 5.791433891992551, + "grad_norm": 1.8199506998062134, + "learning_rate": 1.536505363059576e-08, + "loss": 0.8299, + "step": 6220 + }, + { + "epoch": 5.792364990689013, + "grad_norm": 1.858475685119629, + "learning_rate": 1.5228311010468956e-08, + "loss": 0.8539, + "step": 6221 + }, + { + "epoch": 5.793296089385475, + "grad_norm": 1.8624709844589233, + "learning_rate": 1.5092177730608726e-08, + "loss": 0.876, + "step": 6222 + }, + { + "epoch": 5.794227188081937, + "grad_norm": 1.8539190292358398, + "learning_rate": 1.4956653824398647e-08, + "loss": 0.8614, + "step": 6223 + }, + { + "epoch": 5.795158286778398, + "grad_norm": 1.8114997148513794, + "learning_rate": 1.4821739325073247e-08, + "loss": 0.856, + "step": 6224 + }, + { + "epoch": 5.79608938547486, + "grad_norm": 1.8629541397094727, + "learning_rate": 1.4687434265717727e-08, + "loss": 0.8689, + "step": 6225 + }, + { + "epoch": 5.797020484171322, + "grad_norm": 1.8459246158599854, + "learning_rate": 1.4553738679267959e-08, + "loss": 0.9058, + "step": 6226 + }, + { + "epoch": 5.797951582867784, + "grad_norm": 1.8861335515975952, + "learning_rate": 1.4420652598509388e-08, + "loss": 0.8864, + "step": 6227 + }, + { + "epoch": 5.798882681564246, + "grad_norm": 1.8663744926452637, + "learning_rate": 1.4288176056079239e-08, + "loss": 0.8525, + "step": 6228 + }, + { + "epoch": 5.799813780260708, + "grad_norm": 1.7854936122894287, + "learning_rate": 1.415630908446458e-08, + "loss": 0.8498, + "step": 6229 + }, + { + "epoch": 5.800744878957169, + "grad_norm": 1.8029377460479736, + "learning_rate": 1.4025051716003157e-08, + "loss": 0.8433, + "step": 6230 + }, + { + "epoch": 5.801675977653631, + "grad_norm": 1.8740154504776, + "learning_rate": 1.3894403982883109e-08, + "loss": 0.9215, + "step": 6231 + }, + { + "epoch": 5.802607076350093, + "grad_norm": 1.8007084131240845, + "learning_rate": 1.3764365917143529e-08, + "loss": 0.8822, + "step": 6232 + }, + { + "epoch": 5.803538175046555, + "grad_norm": 1.8130390644073486, + "learning_rate": 1.3634937550673077e-08, + "loss": 0.8988, + "step": 6233 + }, + { + "epoch": 5.804469273743017, + "grad_norm": 1.7794862985610962, + "learning_rate": 1.3506118915211642e-08, + "loss": 0.8649, + "step": 6234 + }, + { + "epoch": 5.805400372439479, + "grad_norm": 1.8455464839935303, + "learning_rate": 1.3377910042349507e-08, + "loss": 0.8748, + "step": 6235 + }, + { + "epoch": 5.80633147113594, + "grad_norm": 1.8462884426116943, + "learning_rate": 1.3250310963527358e-08, + "loss": 0.8952, + "step": 6236 + }, + { + "epoch": 5.807262569832402, + "grad_norm": 1.8136041164398193, + "learning_rate": 1.3123321710036274e-08, + "loss": 0.8519, + "step": 6237 + }, + { + "epoch": 5.808193668528864, + "grad_norm": 1.8250521421432495, + "learning_rate": 1.2996942313017735e-08, + "loss": 0.876, + "step": 6238 + }, + { + "epoch": 5.809124767225326, + "grad_norm": 1.8242619037628174, + "learning_rate": 1.2871172803463894e-08, + "loss": 0.8569, + "step": 6239 + }, + { + "epoch": 5.810055865921788, + "grad_norm": 1.8360867500305176, + "learning_rate": 1.2746013212217024e-08, + "loss": 0.8774, + "step": 6240 + }, + { + "epoch": 5.81098696461825, + "grad_norm": 1.790242075920105, + "learning_rate": 1.2621463569969793e-08, + "loss": 0.885, + "step": 6241 + }, + { + "epoch": 5.811918063314711, + "grad_norm": 1.7905941009521484, + "learning_rate": 1.2497523907266108e-08, + "loss": 0.8459, + "step": 6242 + }, + { + "epoch": 5.812849162011173, + "grad_norm": 1.839233160018921, + "learning_rate": 1.237419425449915e-08, + "loss": 0.8535, + "step": 6243 + }, + { + "epoch": 5.8137802607076345, + "grad_norm": 1.8010543584823608, + "learning_rate": 1.2251474641913342e-08, + "loss": 0.8565, + "step": 6244 + }, + { + "epoch": 5.814711359404097, + "grad_norm": 1.8343545198440552, + "learning_rate": 1.2129365099602941e-08, + "loss": 0.8687, + "step": 6245 + }, + { + "epoch": 5.815642458100559, + "grad_norm": 1.7444266080856323, + "learning_rate": 1.2007865657512885e-08, + "loss": 0.8454, + "step": 6246 + }, + { + "epoch": 5.816573556797021, + "grad_norm": 1.8072973489761353, + "learning_rate": 1.1886976345438783e-08, + "loss": 0.8401, + "step": 6247 + }, + { + "epoch": 5.817504655493482, + "grad_norm": 1.830601692199707, + "learning_rate": 1.1766697193026088e-08, + "loss": 0.8294, + "step": 6248 + }, + { + "epoch": 5.818435754189944, + "grad_norm": 1.8093454837799072, + "learning_rate": 1.1647028229770651e-08, + "loss": 0.8383, + "step": 6249 + }, + { + "epoch": 5.8193668528864055, + "grad_norm": 1.7983765602111816, + "learning_rate": 1.1527969485019275e-08, + "loss": 0.8727, + "step": 6250 + }, + { + "epoch": 5.820297951582868, + "grad_norm": 1.8720824718475342, + "learning_rate": 1.1409520987968604e-08, + "loss": 0.8928, + "step": 6251 + }, + { + "epoch": 5.82122905027933, + "grad_norm": 1.864741325378418, + "learning_rate": 1.1291682767665402e-08, + "loss": 0.8458, + "step": 6252 + }, + { + "epoch": 5.822160148975792, + "grad_norm": 1.8344504833221436, + "learning_rate": 1.1174454853007665e-08, + "loss": 0.8802, + "step": 6253 + }, + { + "epoch": 5.823091247672253, + "grad_norm": 1.8307900428771973, + "learning_rate": 1.1057837272742678e-08, + "loss": 0.8538, + "step": 6254 + }, + { + "epoch": 5.824022346368715, + "grad_norm": 1.8457612991333008, + "learning_rate": 1.0941830055468672e-08, + "loss": 0.874, + "step": 6255 + }, + { + "epoch": 5.8249534450651765, + "grad_norm": 1.8891717195510864, + "learning_rate": 1.0826433229634281e-08, + "loss": 0.8762, + "step": 6256 + }, + { + "epoch": 5.825884543761639, + "grad_norm": 1.8372935056686401, + "learning_rate": 1.0711646823538535e-08, + "loss": 0.8421, + "step": 6257 + }, + { + "epoch": 5.826815642458101, + "grad_norm": 1.819162368774414, + "learning_rate": 1.059747086532975e-08, + "loss": 0.8644, + "step": 6258 + }, + { + "epoch": 5.827746741154562, + "grad_norm": 1.7953081130981445, + "learning_rate": 1.0483905383007753e-08, + "loss": 0.852, + "step": 6259 + }, + { + "epoch": 5.828677839851024, + "grad_norm": 1.8138349056243896, + "learning_rate": 1.037095040442193e-08, + "loss": 0.8621, + "step": 6260 + }, + { + "epoch": 5.829608938547486, + "grad_norm": 1.7997726202011108, + "learning_rate": 1.0258605957272627e-08, + "loss": 0.8588, + "step": 6261 + }, + { + "epoch": 5.8305400372439475, + "grad_norm": 1.8210526704788208, + "learning_rate": 1.0146872069109748e-08, + "loss": 0.8623, + "step": 6262 + }, + { + "epoch": 5.83147113594041, + "grad_norm": 1.8615189790725708, + "learning_rate": 1.0035748767333876e-08, + "loss": 0.8806, + "step": 6263 + }, + { + "epoch": 5.832402234636872, + "grad_norm": 1.8190451860427856, + "learning_rate": 9.925236079195434e-09, + "loss": 0.8377, + "step": 6264 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 1.8366278409957886, + "learning_rate": 9.815334031796076e-09, + "loss": 0.8953, + "step": 6265 + }, + { + "epoch": 5.834264432029795, + "grad_norm": 1.8088374137878418, + "learning_rate": 9.706042652086744e-09, + "loss": 0.8736, + "step": 6266 + }, + { + "epoch": 5.835195530726257, + "grad_norm": 1.8467051982879639, + "learning_rate": 9.597361966868779e-09, + "loss": 0.8649, + "step": 6267 + }, + { + "epoch": 5.8361266294227185, + "grad_norm": 1.8143903017044067, + "learning_rate": 9.489292002793915e-09, + "loss": 0.8451, + "step": 6268 + }, + { + "epoch": 5.837057728119181, + "grad_norm": 1.837988257408142, + "learning_rate": 9.381832786364563e-09, + "loss": 0.8493, + "step": 6269 + }, + { + "epoch": 5.837988826815643, + "grad_norm": 1.8164803981781006, + "learning_rate": 9.274984343932702e-09, + "loss": 0.8732, + "step": 6270 + }, + { + "epoch": 5.838919925512104, + "grad_norm": 1.8181922435760498, + "learning_rate": 9.168746701700704e-09, + "loss": 0.8853, + "step": 6271 + }, + { + "epoch": 5.839851024208566, + "grad_norm": 1.7118419408798218, + "learning_rate": 9.063119885721061e-09, + "loss": 0.7968, + "step": 6272 + }, + { + "epoch": 5.840782122905028, + "grad_norm": 1.8169853687286377, + "learning_rate": 8.958103921896943e-09, + "loss": 0.8448, + "step": 6273 + }, + { + "epoch": 5.8417132216014895, + "grad_norm": 1.7918485403060913, + "learning_rate": 8.853698835981362e-09, + "loss": 0.865, + "step": 6274 + }, + { + "epoch": 5.842644320297952, + "grad_norm": 2.061112403869629, + "learning_rate": 8.749904653577446e-09, + "loss": 0.8595, + "step": 6275 + }, + { + "epoch": 5.843575418994414, + "grad_norm": 1.841090440750122, + "learning_rate": 8.646721400138725e-09, + "loss": 0.8746, + "step": 6276 + }, + { + "epoch": 5.844506517690875, + "grad_norm": 1.8588043451309204, + "learning_rate": 8.544149100968569e-09, + "loss": 0.8966, + "step": 6277 + }, + { + "epoch": 5.845437616387337, + "grad_norm": 1.876405119895935, + "learning_rate": 8.442187781221022e-09, + "loss": 0.8855, + "step": 6278 + }, + { + "epoch": 5.846368715083798, + "grad_norm": 1.9351667165756226, + "learning_rate": 8.340837465899975e-09, + "loss": 0.8803, + "step": 6279 + }, + { + "epoch": 5.8472998137802605, + "grad_norm": 1.8581575155258179, + "learning_rate": 8.240098179859712e-09, + "loss": 0.8802, + "step": 6280 + }, + { + "epoch": 5.848230912476723, + "grad_norm": 1.87726628780365, + "learning_rate": 8.139969947804084e-09, + "loss": 0.8557, + "step": 6281 + }, + { + "epoch": 5.849162011173185, + "grad_norm": 1.7543176412582397, + "learning_rate": 8.04045279428789e-09, + "loss": 0.8093, + "step": 6282 + }, + { + "epoch": 5.850093109869646, + "grad_norm": 1.8025380373001099, + "learning_rate": 7.941546743715778e-09, + "loss": 0.8262, + "step": 6283 + }, + { + "epoch": 5.851024208566108, + "grad_norm": 1.8235831260681152, + "learning_rate": 7.843251820342234e-09, + "loss": 0.8874, + "step": 6284 + }, + { + "epoch": 5.851955307262569, + "grad_norm": 1.8280607461929321, + "learning_rate": 7.745568048272145e-09, + "loss": 0.8666, + "step": 6285 + }, + { + "epoch": 5.8528864059590315, + "grad_norm": 1.7822529077529907, + "learning_rate": 7.648495451460513e-09, + "loss": 0.8778, + "step": 6286 + }, + { + "epoch": 5.853817504655494, + "grad_norm": 1.836317777633667, + "learning_rate": 7.552034053712465e-09, + "loss": 0.8394, + "step": 6287 + }, + { + "epoch": 5.854748603351956, + "grad_norm": 1.917044758796692, + "learning_rate": 7.456183878683243e-09, + "loss": 0.8615, + "step": 6288 + }, + { + "epoch": 5.855679702048417, + "grad_norm": 1.7980895042419434, + "learning_rate": 7.360944949878213e-09, + "loss": 0.7999, + "step": 6289 + }, + { + "epoch": 5.856610800744879, + "grad_norm": 1.8164446353912354, + "learning_rate": 7.26631729065258e-09, + "loss": 0.8704, + "step": 6290 + }, + { + "epoch": 5.85754189944134, + "grad_norm": 1.7821286916732788, + "learning_rate": 7.172300924212228e-09, + "loss": 0.8535, + "step": 6291 + }, + { + "epoch": 5.8584729981378025, + "grad_norm": 1.8831591606140137, + "learning_rate": 7.078895873612601e-09, + "loss": 0.8577, + "step": 6292 + }, + { + "epoch": 5.859404096834265, + "grad_norm": 1.8710130453109741, + "learning_rate": 6.986102161759267e-09, + "loss": 0.8649, + "step": 6293 + }, + { + "epoch": 5.860335195530726, + "grad_norm": 1.8668495416641235, + "learning_rate": 6.893919811407912e-09, + "loss": 0.8883, + "step": 6294 + }, + { + "epoch": 5.861266294227188, + "grad_norm": 1.8426321744918823, + "learning_rate": 6.802348845164897e-09, + "loss": 0.8618, + "step": 6295 + }, + { + "epoch": 5.86219739292365, + "grad_norm": 1.8313816785812378, + "learning_rate": 6.71138928548587e-09, + "loss": 0.8468, + "step": 6296 + }, + { + "epoch": 5.863128491620111, + "grad_norm": 1.8673412799835205, + "learning_rate": 6.621041154676877e-09, + "loss": 0.8805, + "step": 6297 + }, + { + "epoch": 5.8640595903165735, + "grad_norm": 1.8632373809814453, + "learning_rate": 6.531304474894084e-09, + "loss": 0.8644, + "step": 6298 + }, + { + "epoch": 5.864990689013036, + "grad_norm": 1.8402752876281738, + "learning_rate": 6.442179268143222e-09, + "loss": 0.8824, + "step": 6299 + }, + { + "epoch": 5.865921787709497, + "grad_norm": 1.820428729057312, + "learning_rate": 6.353665556280697e-09, + "loss": 0.8737, + "step": 6300 + }, + { + "epoch": 5.866852886405959, + "grad_norm": 1.8779093027114868, + "learning_rate": 6.265763361013033e-09, + "loss": 0.8917, + "step": 6301 + }, + { + "epoch": 5.867783985102421, + "grad_norm": 1.8311855792999268, + "learning_rate": 6.178472703895766e-09, + "loss": 0.861, + "step": 6302 + }, + { + "epoch": 5.868715083798882, + "grad_norm": 1.9788854122161865, + "learning_rate": 6.0917936063356605e-09, + "loss": 0.9175, + "step": 6303 + }, + { + "epoch": 5.8696461824953445, + "grad_norm": 1.8169821500778198, + "learning_rate": 6.005726089589325e-09, + "loss": 0.8999, + "step": 6304 + }, + { + "epoch": 5.870577281191807, + "grad_norm": 1.8846689462661743, + "learning_rate": 5.9202701747623766e-09, + "loss": 0.8543, + "step": 6305 + }, + { + "epoch": 5.871508379888268, + "grad_norm": 1.8466196060180664, + "learning_rate": 5.835425882811385e-09, + "loss": 0.8484, + "step": 6306 + }, + { + "epoch": 5.87243947858473, + "grad_norm": 1.8588248491287231, + "learning_rate": 5.75119323454304e-09, + "loss": 0.8644, + "step": 6307 + }, + { + "epoch": 5.873370577281192, + "grad_norm": 1.8642096519470215, + "learning_rate": 5.667572250613595e-09, + "loss": 0.846, + "step": 6308 + }, + { + "epoch": 5.874301675977653, + "grad_norm": 1.8254890441894531, + "learning_rate": 5.584562951529149e-09, + "loss": 0.8649, + "step": 6309 + }, + { + "epoch": 5.8752327746741155, + "grad_norm": 1.8642079830169678, + "learning_rate": 5.502165357645917e-09, + "loss": 0.8939, + "step": 6310 + }, + { + "epoch": 5.876163873370578, + "grad_norm": 1.8301225900650024, + "learning_rate": 5.4203794891707906e-09, + "loss": 0.8878, + "step": 6311 + }, + { + "epoch": 5.877094972067039, + "grad_norm": 1.8181841373443604, + "learning_rate": 5.339205366159949e-09, + "loss": 0.9065, + "step": 6312 + }, + { + "epoch": 5.878026070763501, + "grad_norm": 1.8410382270812988, + "learning_rate": 5.258643008519693e-09, + "loss": 0.8497, + "step": 6313 + }, + { + "epoch": 5.878957169459962, + "grad_norm": 1.8262267112731934, + "learning_rate": 5.178692436005883e-09, + "loss": 0.8395, + "step": 6314 + }, + { + "epoch": 5.879888268156424, + "grad_norm": 1.8950258493423462, + "learning_rate": 5.0993536682253375e-09, + "loss": 0.8125, + "step": 6315 + }, + { + "epoch": 5.8808193668528865, + "grad_norm": 1.9102938175201416, + "learning_rate": 5.020626724634159e-09, + "loss": 0.9264, + "step": 6316 + }, + { + "epoch": 5.881750465549349, + "grad_norm": 1.8354554176330566, + "learning_rate": 4.942511624538293e-09, + "loss": 0.8788, + "step": 6317 + }, + { + "epoch": 5.88268156424581, + "grad_norm": 1.80685293674469, + "learning_rate": 4.865008387094361e-09, + "loss": 0.8436, + "step": 6318 + }, + { + "epoch": 5.883612662942272, + "grad_norm": 1.7940880060195923, + "learning_rate": 4.7881170313082705e-09, + "loss": 0.8746, + "step": 6319 + }, + { + "epoch": 5.884543761638733, + "grad_norm": 1.833531141281128, + "learning_rate": 4.711837576035772e-09, + "loss": 0.8585, + "step": 6320 + }, + { + "epoch": 5.885474860335195, + "grad_norm": 1.859403371810913, + "learning_rate": 4.636170039983012e-09, + "loss": 0.8832, + "step": 6321 + }, + { + "epoch": 5.8864059590316575, + "grad_norm": 1.8519070148468018, + "learning_rate": 4.56111444170626e-09, + "loss": 0.8789, + "step": 6322 + }, + { + "epoch": 5.88733705772812, + "grad_norm": 1.788859486579895, + "learning_rate": 4.4866707996113476e-09, + "loss": 0.8377, + "step": 6323 + }, + { + "epoch": 5.888268156424581, + "grad_norm": 1.840727686882019, + "learning_rate": 4.412839131953394e-09, + "loss": 0.8567, + "step": 6324 + }, + { + "epoch": 5.889199255121043, + "grad_norm": 1.8736189603805542, + "learning_rate": 4.339619456839028e-09, + "loss": 0.8574, + "step": 6325 + }, + { + "epoch": 5.890130353817504, + "grad_norm": 1.8728331327438354, + "learning_rate": 4.26701179222333e-09, + "loss": 0.9107, + "step": 6326 + }, + { + "epoch": 5.891061452513966, + "grad_norm": 1.8076170682907104, + "learning_rate": 4.195016155912057e-09, + "loss": 0.8512, + "step": 6327 + }, + { + "epoch": 5.8919925512104285, + "grad_norm": 1.8310598134994507, + "learning_rate": 4.123632565560809e-09, + "loss": 0.869, + "step": 6328 + }, + { + "epoch": 5.89292364990689, + "grad_norm": 1.8443015813827515, + "learning_rate": 4.052861038674749e-09, + "loss": 0.8836, + "step": 6329 + }, + { + "epoch": 5.893854748603352, + "grad_norm": 1.8418281078338623, + "learning_rate": 3.982701592609439e-09, + "loss": 0.8705, + "step": 6330 + }, + { + "epoch": 5.894785847299814, + "grad_norm": 1.8712201118469238, + "learning_rate": 3.913154244569728e-09, + "loss": 0.875, + "step": 6331 + }, + { + "epoch": 5.895716945996275, + "grad_norm": 1.9113802909851074, + "learning_rate": 3.844219011610861e-09, + "loss": 0.8602, + "step": 6332 + }, + { + "epoch": 5.896648044692737, + "grad_norm": 1.844139814376831, + "learning_rate": 3.7758959106379274e-09, + "loss": 0.8822, + "step": 6333 + }, + { + "epoch": 5.8975791433891995, + "grad_norm": 1.8023875951766968, + "learning_rate": 3.7081849584055805e-09, + "loss": 0.8585, + "step": 6334 + }, + { + "epoch": 5.898510242085661, + "grad_norm": 1.7824209928512573, + "learning_rate": 3.641086171518593e-09, + "loss": 0.8277, + "step": 6335 + }, + { + "epoch": 5.899441340782123, + "grad_norm": 1.8526780605316162, + "learning_rate": 3.574599566431858e-09, + "loss": 0.8725, + "step": 6336 + }, + { + "epoch": 5.900372439478585, + "grad_norm": 1.8569629192352295, + "learning_rate": 3.5087251594498328e-09, + "loss": 0.8444, + "step": 6337 + }, + { + "epoch": 5.901303538175046, + "grad_norm": 1.8355720043182373, + "learning_rate": 3.4434629667265405e-09, + "loss": 0.8502, + "step": 6338 + }, + { + "epoch": 5.902234636871508, + "grad_norm": 1.8026421070098877, + "learning_rate": 3.3788130042664015e-09, + "loss": 0.8605, + "step": 6339 + }, + { + "epoch": 5.9031657355679705, + "grad_norm": 1.8531574010849, + "learning_rate": 3.3147752879236773e-09, + "loss": 0.8925, + "step": 6340 + }, + { + "epoch": 5.904096834264432, + "grad_norm": 1.8416420221328735, + "learning_rate": 3.2513498334021953e-09, + "loss": 0.8866, + "step": 6341 + }, + { + "epoch": 5.905027932960894, + "grad_norm": 1.8038629293441772, + "learning_rate": 3.188536656255903e-09, + "loss": 0.8511, + "step": 6342 + }, + { + "epoch": 5.905959031657356, + "grad_norm": 1.8304471969604492, + "learning_rate": 3.1263357718883112e-09, + "loss": 0.8726, + "step": 6343 + }, + { + "epoch": 5.906890130353817, + "grad_norm": 1.8365386724472046, + "learning_rate": 3.0647471955527752e-09, + "loss": 0.8784, + "step": 6344 + }, + { + "epoch": 5.907821229050279, + "grad_norm": 1.782964825630188, + "learning_rate": 3.003770942353046e-09, + "loss": 0.8615, + "step": 6345 + }, + { + "epoch": 5.9087523277467415, + "grad_norm": 1.836837887763977, + "learning_rate": 2.943407027242162e-09, + "loss": 0.8602, + "step": 6346 + }, + { + "epoch": 5.909683426443203, + "grad_norm": 1.8097761869430542, + "learning_rate": 2.8836554650230055e-09, + "loss": 0.8568, + "step": 6347 + }, + { + "epoch": 5.910614525139665, + "grad_norm": 1.847347617149353, + "learning_rate": 2.824516270348576e-09, + "loss": 0.8707, + "step": 6348 + }, + { + "epoch": 5.911545623836126, + "grad_norm": 1.827873706817627, + "learning_rate": 2.7659894577217183e-09, + "loss": 0.9115, + "step": 6349 + }, + { + "epoch": 5.912476722532588, + "grad_norm": 1.8255314826965332, + "learning_rate": 2.708075041494562e-09, + "loss": 0.8534, + "step": 6350 + }, + { + "epoch": 5.91340782122905, + "grad_norm": 1.873690128326416, + "learning_rate": 2.6507730358699135e-09, + "loss": 0.8874, + "step": 6351 + }, + { + "epoch": 5.9143389199255125, + "grad_norm": 1.8573442697525024, + "learning_rate": 2.5940834548995873e-09, + "loss": 0.8673, + "step": 6352 + }, + { + "epoch": 5.915270018621974, + "grad_norm": 1.810296893119812, + "learning_rate": 2.5380063124857968e-09, + "loss": 0.8669, + "step": 6353 + }, + { + "epoch": 5.916201117318436, + "grad_norm": 1.897988200187683, + "learning_rate": 2.482541622380319e-09, + "loss": 0.8973, + "step": 6354 + }, + { + "epoch": 5.917132216014897, + "grad_norm": 1.8114992380142212, + "learning_rate": 2.4276893981844963e-09, + "loss": 0.8788, + "step": 6355 + }, + { + "epoch": 5.918063314711359, + "grad_norm": 1.8560186624526978, + "learning_rate": 2.373449653349791e-09, + "loss": 0.8832, + "step": 6356 + }, + { + "epoch": 5.918994413407821, + "grad_norm": 1.8644506931304932, + "learning_rate": 2.3198224011777847e-09, + "loss": 0.8615, + "step": 6357 + }, + { + "epoch": 5.9199255121042835, + "grad_norm": 1.827688455581665, + "learning_rate": 2.26680765481907e-09, + "loss": 0.9039, + "step": 6358 + }, + { + "epoch": 5.920856610800745, + "grad_norm": 1.9086414575576782, + "learning_rate": 2.2144054272746352e-09, + "loss": 0.8884, + "step": 6359 + }, + { + "epoch": 5.921787709497207, + "grad_norm": 1.7982131242752075, + "learning_rate": 2.1626157313950347e-09, + "loss": 0.8951, + "step": 6360 + }, + { + "epoch": 5.922718808193668, + "grad_norm": 1.8264929056167603, + "learning_rate": 2.1114385798806645e-09, + "loss": 0.8574, + "step": 6361 + }, + { + "epoch": 5.92364990689013, + "grad_norm": 1.8348910808563232, + "learning_rate": 2.0608739852814863e-09, + "loss": 0.8725, + "step": 6362 + }, + { + "epoch": 5.924581005586592, + "grad_norm": 1.8792465925216675, + "learning_rate": 2.010921959997858e-09, + "loss": 0.9016, + "step": 6363 + }, + { + "epoch": 5.925512104283054, + "grad_norm": 1.884018898010254, + "learning_rate": 1.961582516279148e-09, + "loss": 0.8371, + "step": 6364 + }, + { + "epoch": 5.926443202979516, + "grad_norm": 1.9077988862991333, + "learning_rate": 1.912855666225122e-09, + "loss": 0.9016, + "step": 6365 + }, + { + "epoch": 5.927374301675978, + "grad_norm": 1.870465636253357, + "learning_rate": 1.8647414217848325e-09, + "loss": 0.8639, + "step": 6366 + }, + { + "epoch": 5.928305400372439, + "grad_norm": 1.8063478469848633, + "learning_rate": 1.8172397947574527e-09, + "loss": 0.8495, + "step": 6367 + }, + { + "epoch": 5.929236499068901, + "grad_norm": 1.8246792554855347, + "learning_rate": 1.7703507967919976e-09, + "loss": 0.8713, + "step": 6368 + }, + { + "epoch": 5.930167597765363, + "grad_norm": 1.8659306764602661, + "learning_rate": 1.7240744393864917e-09, + "loss": 0.8857, + "step": 6369 + }, + { + "epoch": 5.931098696461825, + "grad_norm": 1.9049984216690063, + "learning_rate": 1.6784107338899125e-09, + "loss": 0.8915, + "step": 6370 + }, + { + "epoch": 5.932029795158287, + "grad_norm": 1.8529447317123413, + "learning_rate": 1.6333596914999695e-09, + "loss": 0.8801, + "step": 6371 + }, + { + "epoch": 5.932960893854749, + "grad_norm": 1.8948285579681396, + "learning_rate": 1.5889213232644919e-09, + "loss": 0.8697, + "step": 6372 + }, + { + "epoch": 5.93389199255121, + "grad_norm": 1.8201647996902466, + "learning_rate": 1.5450956400814287e-09, + "loss": 0.8522, + "step": 6373 + }, + { + "epoch": 5.934823091247672, + "grad_norm": 1.735663890838623, + "learning_rate": 1.5018826526977394e-09, + "loss": 0.8473, + "step": 6374 + }, + { + "epoch": 5.935754189944134, + "grad_norm": 1.8138455152511597, + "learning_rate": 1.4592823717110572e-09, + "loss": 0.8624, + "step": 6375 + }, + { + "epoch": 5.936685288640596, + "grad_norm": 1.8177839517593384, + "learning_rate": 1.4172948075677483e-09, + "loss": 0.8934, + "step": 6376 + }, + { + "epoch": 5.937616387337058, + "grad_norm": 1.827480435371399, + "learning_rate": 1.3759199705645764e-09, + "loss": 0.8423, + "step": 6377 + }, + { + "epoch": 5.93854748603352, + "grad_norm": 1.797321081161499, + "learning_rate": 1.3351578708478696e-09, + "loss": 0.8285, + "step": 6378 + }, + { + "epoch": 5.939478584729981, + "grad_norm": 1.8046422004699707, + "learning_rate": 1.2950085184140759e-09, + "loss": 0.8631, + "step": 6379 + }, + { + "epoch": 5.940409683426443, + "grad_norm": 1.8134069442749023, + "learning_rate": 1.2554719231083757e-09, + "loss": 0.8555, + "step": 6380 + }, + { + "epoch": 5.941340782122905, + "grad_norm": 1.7775695323944092, + "learning_rate": 1.2165480946269014e-09, + "loss": 0.8697, + "step": 6381 + }, + { + "epoch": 5.942271880819367, + "grad_norm": 1.8263392448425293, + "learning_rate": 1.1782370425145183e-09, + "loss": 0.8751, + "step": 6382 + }, + { + "epoch": 5.943202979515829, + "grad_norm": 1.8601995706558228, + "learning_rate": 1.1405387761664888e-09, + "loss": 0.8929, + "step": 6383 + }, + { + "epoch": 5.94413407821229, + "grad_norm": 1.9737921953201294, + "learning_rate": 1.1034533048276396e-09, + "loss": 0.8587, + "step": 6384 + }, + { + "epoch": 5.945065176908752, + "grad_norm": 1.881279706954956, + "learning_rate": 1.0669806375920854e-09, + "loss": 0.8961, + "step": 6385 + }, + { + "epoch": 5.945996275605214, + "grad_norm": 1.8054611682891846, + "learning_rate": 1.0311207834040604e-09, + "loss": 0.8369, + "step": 6386 + }, + { + "epoch": 5.946927374301676, + "grad_norm": 1.8753248453140259, + "learning_rate": 9.958737510579187e-10, + "loss": 0.8656, + "step": 6387 + }, + { + "epoch": 5.947858472998138, + "grad_norm": 1.8040980100631714, + "learning_rate": 9.612395491970239e-10, + "loss": 0.8518, + "step": 6388 + }, + { + "epoch": 5.9487895716946, + "grad_norm": 1.8181818723678589, + "learning_rate": 9.272181863143048e-10, + "loss": 0.8575, + "step": 6389 + }, + { + "epoch": 5.949720670391061, + "grad_norm": 1.9011802673339844, + "learning_rate": 8.93809670753365e-10, + "loss": 0.9031, + "step": 6390 + }, + { + "epoch": 5.950651769087523, + "grad_norm": 1.7996485233306885, + "learning_rate": 8.610140107070953e-10, + "loss": 0.858, + "step": 6391 + }, + { + "epoch": 5.951582867783985, + "grad_norm": 1.8360109329223633, + "learning_rate": 8.28831214217396e-10, + "loss": 0.9081, + "step": 6392 + }, + { + "epoch": 5.952513966480447, + "grad_norm": 1.8669235706329346, + "learning_rate": 7.972612891765652e-10, + "loss": 0.9065, + "step": 6393 + }, + { + "epoch": 5.953445065176909, + "grad_norm": 1.8490556478500366, + "learning_rate": 7.663042433267431e-10, + "loss": 0.8736, + "step": 6394 + }, + { + "epoch": 5.954376163873371, + "grad_norm": 1.8064029216766357, + "learning_rate": 7.359600842596349e-10, + "loss": 0.8479, + "step": 6395 + }, + { + "epoch": 5.955307262569832, + "grad_norm": 1.8017925024032593, + "learning_rate": 7.062288194162326e-10, + "loss": 0.8665, + "step": 6396 + }, + { + "epoch": 5.956238361266294, + "grad_norm": 1.8381091356277466, + "learning_rate": 6.771104560876484e-10, + "loss": 0.847, + "step": 6397 + }, + { + "epoch": 5.957169459962756, + "grad_norm": 1.7793654203414917, + "learning_rate": 6.486050014145596e-10, + "loss": 0.8418, + "step": 6398 + }, + { + "epoch": 5.9581005586592175, + "grad_norm": 2.4611287117004395, + "learning_rate": 6.207124623872074e-10, + "loss": 0.8281, + "step": 6399 + }, + { + "epoch": 5.95903165735568, + "grad_norm": 1.8994784355163574, + "learning_rate": 5.934328458459537e-10, + "loss": 0.892, + "step": 6400 + }, + { + "epoch": 5.959962756052142, + "grad_norm": 1.8682223558425903, + "learning_rate": 5.667661584804474e-10, + "loss": 0.8565, + "step": 6401 + }, + { + "epoch": 5.960893854748603, + "grad_norm": 1.8080297708511353, + "learning_rate": 5.407124068301794e-10, + "loss": 0.8461, + "step": 6402 + }, + { + "epoch": 5.961824953445065, + "grad_norm": 1.8252531290054321, + "learning_rate": 5.152715972842059e-10, + "loss": 0.8732, + "step": 6403 + }, + { + "epoch": 5.962756052141527, + "grad_norm": 1.8170905113220215, + "learning_rate": 4.904437360814251e-10, + "loss": 0.8366, + "step": 6404 + }, + { + "epoch": 5.9636871508379885, + "grad_norm": 1.841629147529602, + "learning_rate": 4.662288293105776e-10, + "loss": 0.8992, + "step": 6405 + }, + { + "epoch": 5.964618249534451, + "grad_norm": 1.7900701761245728, + "learning_rate": 4.426268829094138e-10, + "loss": 0.8376, + "step": 6406 + }, + { + "epoch": 5.965549348230913, + "grad_norm": 1.782887578010559, + "learning_rate": 4.1963790266635887e-10, + "loss": 0.8646, + "step": 6407 + }, + { + "epoch": 5.966480446927374, + "grad_norm": 1.864758014678955, + "learning_rate": 3.9726189421857017e-10, + "loss": 0.8442, + "step": 6408 + }, + { + "epoch": 5.967411545623836, + "grad_norm": 1.8656058311462402, + "learning_rate": 3.754988630536027e-10, + "loss": 0.9112, + "step": 6409 + }, + { + "epoch": 5.968342644320298, + "grad_norm": 1.856855034828186, + "learning_rate": 3.543488145082985e-10, + "loss": 0.8832, + "step": 6410 + }, + { + "epoch": 5.9692737430167595, + "grad_norm": 1.8055484294891357, + "learning_rate": 3.3381175376934195e-10, + "loss": 0.8785, + "step": 6411 + }, + { + "epoch": 5.970204841713222, + "grad_norm": 1.8106634616851807, + "learning_rate": 3.1388768587298224e-10, + "loss": 0.847, + "step": 6412 + }, + { + "epoch": 5.971135940409684, + "grad_norm": 1.8128814697265625, + "learning_rate": 2.945766157050334e-10, + "loss": 0.855, + "step": 6413 + }, + { + "epoch": 5.972067039106145, + "grad_norm": 1.8292309045791626, + "learning_rate": 2.758785480014292e-10, + "loss": 0.8532, + "step": 6414 + }, + { + "epoch": 5.972998137802607, + "grad_norm": 1.8674393892288208, + "learning_rate": 2.5779348734739083e-10, + "loss": 0.8401, + "step": 6415 + }, + { + "epoch": 5.973929236499069, + "grad_norm": 1.7960641384124756, + "learning_rate": 2.403214381779817e-10, + "loss": 0.8335, + "step": 6416 + }, + { + "epoch": 5.9748603351955305, + "grad_norm": 1.7746152877807617, + "learning_rate": 2.2346240477755243e-10, + "loss": 0.8345, + "step": 6417 + }, + { + "epoch": 5.975791433891993, + "grad_norm": 1.8095226287841797, + "learning_rate": 2.0721639128085113e-10, + "loss": 0.8776, + "step": 6418 + }, + { + "epoch": 5.976722532588455, + "grad_norm": 1.828633189201355, + "learning_rate": 1.9158340167163557e-10, + "loss": 0.8537, + "step": 6419 + }, + { + "epoch": 5.977653631284916, + "grad_norm": 1.8022271394729614, + "learning_rate": 1.7656343978378342e-10, + "loss": 0.8385, + "step": 6420 + }, + { + "epoch": 5.978584729981378, + "grad_norm": 1.7816201448440552, + "learning_rate": 1.6215650930018202e-10, + "loss": 0.8625, + "step": 6421 + }, + { + "epoch": 5.97951582867784, + "grad_norm": 1.863553762435913, + "learning_rate": 1.483626137546712e-10, + "loss": 0.8452, + "step": 6422 + }, + { + "epoch": 5.9804469273743015, + "grad_norm": 1.8710039854049683, + "learning_rate": 1.3518175652899035e-10, + "loss": 0.8666, + "step": 6423 + }, + { + "epoch": 5.981378026070764, + "grad_norm": 1.8460798263549805, + "learning_rate": 1.226139408561089e-10, + "loss": 0.8461, + "step": 6424 + }, + { + "epoch": 5.982309124767225, + "grad_norm": 1.8002387285232544, + "learning_rate": 1.106591698177284e-10, + "loss": 0.876, + "step": 6425 + }, + { + "epoch": 5.983240223463687, + "grad_norm": 1.8913145065307617, + "learning_rate": 9.931744634567031e-11, + "loss": 0.9186, + "step": 6426 + }, + { + "epoch": 5.984171322160149, + "grad_norm": 1.8086674213409424, + "learning_rate": 8.858877322132087e-11, + "loss": 0.8692, + "step": 6427 + }, + { + "epoch": 5.985102420856611, + "grad_norm": 1.8788095712661743, + "learning_rate": 7.847315307535352e-11, + "loss": 0.8573, + "step": 6428 + }, + { + "epoch": 5.9860335195530725, + "grad_norm": 1.8312733173370361, + "learning_rate": 6.897058838883919e-11, + "loss": 0.8882, + "step": 6429 + }, + { + "epoch": 5.986964618249535, + "grad_norm": 1.8287769556045532, + "learning_rate": 6.008108149185843e-11, + "loss": 0.8785, + "step": 6430 + }, + { + "epoch": 5.987895716945996, + "grad_norm": 1.8518235683441162, + "learning_rate": 5.1804634564334154e-11, + "loss": 0.842, + "step": 6431 + }, + { + "epoch": 5.988826815642458, + "grad_norm": 1.8558229207992554, + "learning_rate": 4.414124963603161e-11, + "loss": 0.8582, + "step": 6432 + }, + { + "epoch": 5.98975791433892, + "grad_norm": 1.7795523405075073, + "learning_rate": 3.7090928586003275e-11, + "loss": 0.8478, + "step": 6433 + }, + { + "epoch": 5.990689013035381, + "grad_norm": 1.8624168634414673, + "learning_rate": 3.065367314342149e-11, + "loss": 0.8648, + "step": 6434 + }, + { + "epoch": 5.9916201117318435, + "grad_norm": 1.867603063583374, + "learning_rate": 2.4829484887023413e-11, + "loss": 0.8826, + "step": 6435 + }, + { + "epoch": 5.992551210428306, + "grad_norm": 1.8553563356399536, + "learning_rate": 1.9618365244833405e-11, + "loss": 0.8976, + "step": 6436 + }, + { + "epoch": 5.993482309124767, + "grad_norm": 1.892018437385559, + "learning_rate": 1.5020315494995717e-11, + "loss": 0.9111, + "step": 6437 + }, + { + "epoch": 5.994413407821229, + "grad_norm": 1.8219844102859497, + "learning_rate": 1.1035336764664283e-11, + "loss": 0.8694, + "step": 6438 + }, + { + "epoch": 5.995344506517691, + "grad_norm": 1.7894152402877808, + "learning_rate": 7.663430031390474e-12, + "loss": 0.8507, + "step": 6439 + }, + { + "epoch": 5.996275605214152, + "grad_norm": 1.8014150857925415, + "learning_rate": 4.904596122290439e-12, + "loss": 0.8413, + "step": 6440 + }, + { + "epoch": 5.9972067039106145, + "grad_norm": 1.9747562408447266, + "learning_rate": 2.7588357134900047e-12, + "loss": 0.879, + "step": 6441 + }, + { + "epoch": 5.998137802607077, + "grad_norm": 1.8461476564407349, + "learning_rate": 1.2261493312348827e-12, + "loss": 0.837, + "step": 6442 + }, + { + "epoch": 5.999068901303538, + "grad_norm": 1.8115543127059937, + "learning_rate": 3.0653735161312317e-13, + "loss": 0.869, + "step": 6443 + }, + { + "epoch": 6.0, + "grad_norm": 2.1417829990386963, + "learning_rate": 0.0, + "loss": 0.879, + "step": 6444 + } + ], + "logging_steps": 1, + "max_steps": 6444, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 1074, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6276306952517059e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}