diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17536 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9970005998800238, + "eval_steps": 500, + "global_step": 2499, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011997600479904018, + "grad_norm": 36.17856378068024, + "learning_rate": 0.0, + "loss": 6.5806, + "step": 1 + }, + { + "epoch": 0.0023995200959808036, + "grad_norm": 34.358282645308925, + "learning_rate": 4e-08, + "loss": 5.5196, + "step": 2 + }, + { + "epoch": 0.003599280143971206, + "grad_norm": 34.63248495550367, + "learning_rate": 8e-08, + "loss": 5.6467, + "step": 3 + }, + { + "epoch": 0.004799040191961607, + "grad_norm": 37.406685178178456, + "learning_rate": 1.2000000000000002e-07, + "loss": 7.1574, + "step": 4 + }, + { + "epoch": 0.00599880023995201, + "grad_norm": 38.40041370330666, + "learning_rate": 1.6e-07, + "loss": 5.0629, + "step": 5 + }, + { + "epoch": 0.007198560287942412, + "grad_norm": 32.30499073167661, + "learning_rate": 2.0000000000000002e-07, + "loss": 5.4575, + "step": 6 + }, + { + "epoch": 0.008398320335932814, + "grad_norm": 37.67738186414492, + "learning_rate": 2.4000000000000003e-07, + "loss": 6.8774, + "step": 7 + }, + { + "epoch": 0.009598080383923215, + "grad_norm": 29.847807359572833, + "learning_rate": 2.8e-07, + "loss": 5.0336, + "step": 8 + }, + { + "epoch": 0.010797840431913617, + "grad_norm": 33.87761978237807, + "learning_rate": 3.2e-07, + "loss": 5.3535, + "step": 9 + }, + { + "epoch": 0.01199760047990402, + "grad_norm": 36.02370970827611, + "learning_rate": 3.6e-07, + "loss": 6.5137, + "step": 10 + }, + { + "epoch": 0.013197360527894421, + "grad_norm": 30.794586404273485, + "learning_rate": 4.0000000000000003e-07, + "loss": 4.6923, + "step": 11 + }, + { + "epoch": 0.014397120575884824, + "grad_norm": 36.095798724282986, + "learning_rate": 4.4e-07, + "loss": 5.5734, + "step": 12 + }, + { + "epoch": 0.015596880623875225, + "grad_norm": 36.625714284092496, + "learning_rate": 4.800000000000001e-07, + "loss": 6.2458, + "step": 13 + }, + { + "epoch": 0.016796640671865627, + "grad_norm": 38.38403373240824, + "learning_rate": 5.2e-07, + "loss": 6.8462, + "step": 14 + }, + { + "epoch": 0.017996400719856028, + "grad_norm": 36.381283291167584, + "learning_rate": 5.6e-07, + "loss": 6.8765, + "step": 15 + }, + { + "epoch": 0.01919616076784643, + "grad_norm": 32.176585157359426, + "learning_rate": 6.000000000000001e-07, + "loss": 5.0196, + "step": 16 + }, + { + "epoch": 0.020395920815836834, + "grad_norm": 36.517388905365706, + "learning_rate": 6.4e-07, + "loss": 7.2797, + "step": 17 + }, + { + "epoch": 0.021595680863827234, + "grad_norm": 33.5867632244418, + "learning_rate": 6.800000000000001e-07, + "loss": 4.8825, + "step": 18 + }, + { + "epoch": 0.022795440911817635, + "grad_norm": 37.36761591767671, + "learning_rate": 7.2e-07, + "loss": 6.9421, + "step": 19 + }, + { + "epoch": 0.02399520095980804, + "grad_norm": 34.62445788638932, + "learning_rate": 7.6e-07, + "loss": 5.6175, + "step": 20 + }, + { + "epoch": 0.02519496100779844, + "grad_norm": 35.15779448273933, + "learning_rate": 8.000000000000001e-07, + "loss": 5.8947, + "step": 21 + }, + { + "epoch": 0.026394721055788842, + "grad_norm": 29.819457245845634, + "learning_rate": 8.400000000000001e-07, + "loss": 4.8043, + "step": 22 + }, + { + "epoch": 0.027594481103779243, + "grad_norm": 33.64012081915802, + "learning_rate": 8.8e-07, + "loss": 5.4804, + "step": 23 + }, + { + "epoch": 0.028794241151769647, + "grad_norm": 35.618352356220015, + "learning_rate": 9.200000000000001e-07, + "loss": 6.6431, + "step": 24 + }, + { + "epoch": 0.029994001199760048, + "grad_norm": 34.112079688785066, + "learning_rate": 9.600000000000001e-07, + "loss": 6.0226, + "step": 25 + }, + { + "epoch": 0.03119376124775045, + "grad_norm": 30.405498175713877, + "learning_rate": 1.0000000000000002e-06, + "loss": 5.2086, + "step": 26 + }, + { + "epoch": 0.03239352129574085, + "grad_norm": 26.22721194386834, + "learning_rate": 1.04e-06, + "loss": 3.9648, + "step": 27 + }, + { + "epoch": 0.033593281343731254, + "grad_norm": 32.474716650442616, + "learning_rate": 1.08e-06, + "loss": 5.42, + "step": 28 + }, + { + "epoch": 0.03479304139172166, + "grad_norm": 29.509812337295603, + "learning_rate": 1.12e-06, + "loss": 4.4588, + "step": 29 + }, + { + "epoch": 0.035992801439712056, + "grad_norm": 25.26807361414514, + "learning_rate": 1.1600000000000001e-06, + "loss": 4.4965, + "step": 30 + }, + { + "epoch": 0.03719256148770246, + "grad_norm": 17.211406917325288, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.1409, + "step": 31 + }, + { + "epoch": 0.03839232153569286, + "grad_norm": 16.58496319508288, + "learning_rate": 1.2400000000000002e-06, + "loss": 3.2392, + "step": 32 + }, + { + "epoch": 0.03959208158368326, + "grad_norm": 20.256709229042606, + "learning_rate": 1.28e-06, + "loss": 3.9943, + "step": 33 + }, + { + "epoch": 0.04079184163167367, + "grad_norm": 19.386587675168716, + "learning_rate": 1.32e-06, + "loss": 4.0614, + "step": 34 + }, + { + "epoch": 0.041991601679664065, + "grad_norm": 21.31388017416675, + "learning_rate": 1.3600000000000001e-06, + "loss": 4.8848, + "step": 35 + }, + { + "epoch": 0.04319136172765447, + "grad_norm": 16.440197605883533, + "learning_rate": 1.4000000000000001e-06, + "loss": 4.1091, + "step": 36 + }, + { + "epoch": 0.04439112177564487, + "grad_norm": 14.127689181062076, + "learning_rate": 1.44e-06, + "loss": 3.9764, + "step": 37 + }, + { + "epoch": 0.04559088182363527, + "grad_norm": 11.382233478930736, + "learning_rate": 1.48e-06, + "loss": 3.1144, + "step": 38 + }, + { + "epoch": 0.046790641871625675, + "grad_norm": 10.94943616284639, + "learning_rate": 1.52e-06, + "loss": 3.4877, + "step": 39 + }, + { + "epoch": 0.04799040191961608, + "grad_norm": 8.0400059342567, + "learning_rate": 1.56e-06, + "loss": 3.048, + "step": 40 + }, + { + "epoch": 0.04919016196760648, + "grad_norm": 13.04940759359585, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.2501, + "step": 41 + }, + { + "epoch": 0.05038992201559688, + "grad_norm": 11.668121415877163, + "learning_rate": 1.6400000000000002e-06, + "loss": 2.6705, + "step": 42 + }, + { + "epoch": 0.05158968206358728, + "grad_norm": 15.351939200735895, + "learning_rate": 1.6800000000000002e-06, + "loss": 3.3811, + "step": 43 + }, + { + "epoch": 0.052789442111577684, + "grad_norm": 9.517039626972272, + "learning_rate": 1.72e-06, + "loss": 3.0159, + "step": 44 + }, + { + "epoch": 0.05398920215956809, + "grad_norm": 9.580365005252844, + "learning_rate": 1.76e-06, + "loss": 3.2343, + "step": 45 + }, + { + "epoch": 0.055188962207558485, + "grad_norm": 5.1751309603875235, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.7038, + "step": 46 + }, + { + "epoch": 0.05638872225554889, + "grad_norm": 5.6080543773161775, + "learning_rate": 1.8400000000000002e-06, + "loss": 3.1211, + "step": 47 + }, + { + "epoch": 0.057588482303539294, + "grad_norm": 4.310981133231568, + "learning_rate": 1.8800000000000002e-06, + "loss": 2.6889, + "step": 48 + }, + { + "epoch": 0.05878824235152969, + "grad_norm": 4.3584143180433745, + "learning_rate": 1.9200000000000003e-06, + "loss": 2.5729, + "step": 49 + }, + { + "epoch": 0.059988002399520096, + "grad_norm": 5.498808904661779, + "learning_rate": 1.9600000000000003e-06, + "loss": 2.8867, + "step": 50 + }, + { + "epoch": 0.0611877624475105, + "grad_norm": 2.6927761049069723, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.6145, + "step": 51 + }, + { + "epoch": 0.0623875224955009, + "grad_norm": 5.4214774551834735, + "learning_rate": 2.04e-06, + "loss": 3.1939, + "step": 52 + }, + { + "epoch": 0.0635872825434913, + "grad_norm": 5.406801526942302, + "learning_rate": 2.08e-06, + "loss": 2.6354, + "step": 53 + }, + { + "epoch": 0.0647870425914817, + "grad_norm": 3.714353613351617, + "learning_rate": 2.12e-06, + "loss": 2.5379, + "step": 54 + }, + { + "epoch": 0.06598680263947211, + "grad_norm": 2.9173663728629085, + "learning_rate": 2.16e-06, + "loss": 2.1469, + "step": 55 + }, + { + "epoch": 0.06718656268746251, + "grad_norm": 3.608786726950732, + "learning_rate": 2.2e-06, + "loss": 2.6828, + "step": 56 + }, + { + "epoch": 0.0683863227354529, + "grad_norm": 3.763597670515027, + "learning_rate": 2.24e-06, + "loss": 2.6978, + "step": 57 + }, + { + "epoch": 0.06958608278344332, + "grad_norm": 3.4041860826087795, + "learning_rate": 2.28e-06, + "loss": 2.3874, + "step": 58 + }, + { + "epoch": 0.07078584283143372, + "grad_norm": 3.4740369457844755, + "learning_rate": 2.3200000000000002e-06, + "loss": 2.9403, + "step": 59 + }, + { + "epoch": 0.07198560287942411, + "grad_norm": 3.307937151104892, + "learning_rate": 2.3600000000000003e-06, + "loss": 2.6198, + "step": 60 + }, + { + "epoch": 0.07318536292741452, + "grad_norm": 2.7933319240440566, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2052, + "step": 61 + }, + { + "epoch": 0.07438512297540492, + "grad_norm": 2.5093361221749064, + "learning_rate": 2.4400000000000004e-06, + "loss": 2.0, + "step": 62 + }, + { + "epoch": 0.07558488302339532, + "grad_norm": 3.7731916081151016, + "learning_rate": 2.4800000000000004e-06, + "loss": 2.5814, + "step": 63 + }, + { + "epoch": 0.07678464307138572, + "grad_norm": 2.7883126052257388, + "learning_rate": 2.52e-06, + "loss": 2.2478, + "step": 64 + }, + { + "epoch": 0.07798440311937613, + "grad_norm": 2.973289118741531, + "learning_rate": 2.56e-06, + "loss": 2.6007, + "step": 65 + }, + { + "epoch": 0.07918416316736653, + "grad_norm": 2.6177891365858152, + "learning_rate": 2.6e-06, + "loss": 2.3398, + "step": 66 + }, + { + "epoch": 0.08038392321535692, + "grad_norm": 3.5881470293988573, + "learning_rate": 2.64e-06, + "loss": 2.3946, + "step": 67 + }, + { + "epoch": 0.08158368326334733, + "grad_norm": 2.3121432357226976, + "learning_rate": 2.68e-06, + "loss": 1.8217, + "step": 68 + }, + { + "epoch": 0.08278344331133773, + "grad_norm": 2.9061157483024544, + "learning_rate": 2.7200000000000002e-06, + "loss": 2.1346, + "step": 69 + }, + { + "epoch": 0.08398320335932813, + "grad_norm": 3.0322906921976975, + "learning_rate": 2.7600000000000003e-06, + "loss": 2.3916, + "step": 70 + }, + { + "epoch": 0.08518296340731854, + "grad_norm": 3.0090713796463433, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.0897, + "step": 71 + }, + { + "epoch": 0.08638272345530894, + "grad_norm": 2.5732336424427413, + "learning_rate": 2.84e-06, + "loss": 1.8334, + "step": 72 + }, + { + "epoch": 0.08758248350329934, + "grad_norm": 2.511452530855231, + "learning_rate": 2.88e-06, + "loss": 1.8359, + "step": 73 + }, + { + "epoch": 0.08878224355128975, + "grad_norm": 2.1722861397430075, + "learning_rate": 2.92e-06, + "loss": 1.7097, + "step": 74 + }, + { + "epoch": 0.08998200359928014, + "grad_norm": 2.2110913265788446, + "learning_rate": 2.96e-06, + "loss": 1.8823, + "step": 75 + }, + { + "epoch": 0.09118176364727054, + "grad_norm": 3.1903636836969853, + "learning_rate": 3e-06, + "loss": 2.1091, + "step": 76 + }, + { + "epoch": 0.09238152369526095, + "grad_norm": 2.139050909268807, + "learning_rate": 3.04e-06, + "loss": 1.7386, + "step": 77 + }, + { + "epoch": 0.09358128374325135, + "grad_norm": 2.781821724211591, + "learning_rate": 3.08e-06, + "loss": 1.7271, + "step": 78 + }, + { + "epoch": 0.09478104379124175, + "grad_norm": 2.1741472134292894, + "learning_rate": 3.12e-06, + "loss": 1.7486, + "step": 79 + }, + { + "epoch": 0.09598080383923216, + "grad_norm": 2.373728486452905, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.9319, + "step": 80 + }, + { + "epoch": 0.09718056388722256, + "grad_norm": 1.7379021917646496, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.4739, + "step": 81 + }, + { + "epoch": 0.09838032393521295, + "grad_norm": 1.7320610918875294, + "learning_rate": 3.2400000000000003e-06, + "loss": 1.5421, + "step": 82 + }, + { + "epoch": 0.09958008398320337, + "grad_norm": 2.144805174214645, + "learning_rate": 3.2800000000000004e-06, + "loss": 1.2531, + "step": 83 + }, + { + "epoch": 0.10077984403119376, + "grad_norm": 1.4760948538526937, + "learning_rate": 3.3200000000000004e-06, + "loss": 1.3801, + "step": 84 + }, + { + "epoch": 0.10197960407918416, + "grad_norm": 1.6876558775793726, + "learning_rate": 3.3600000000000004e-06, + "loss": 1.5244, + "step": 85 + }, + { + "epoch": 0.10317936412717456, + "grad_norm": 2.6308562265498656, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.6274, + "step": 86 + }, + { + "epoch": 0.10437912417516497, + "grad_norm": 2.079207911605781, + "learning_rate": 3.44e-06, + "loss": 1.4456, + "step": 87 + }, + { + "epoch": 0.10557888422315537, + "grad_norm": 1.949221335018367, + "learning_rate": 3.48e-06, + "loss": 1.5871, + "step": 88 + }, + { + "epoch": 0.10677864427114576, + "grad_norm": 1.8368415904194118, + "learning_rate": 3.52e-06, + "loss": 1.3269, + "step": 89 + }, + { + "epoch": 0.10797840431913618, + "grad_norm": 1.7700541461140875, + "learning_rate": 3.5600000000000002e-06, + "loss": 1.5883, + "step": 90 + }, + { + "epoch": 0.10917816436712657, + "grad_norm": 1.1320104168531329, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.0229, + "step": 91 + }, + { + "epoch": 0.11037792441511697, + "grad_norm": 1.9859979163482226, + "learning_rate": 3.6400000000000003e-06, + "loss": 1.4403, + "step": 92 + }, + { + "epoch": 0.11157768446310738, + "grad_norm": 1.6894108669888084, + "learning_rate": 3.6800000000000003e-06, + "loss": 1.2201, + "step": 93 + }, + { + "epoch": 0.11277744451109778, + "grad_norm": 1.5071412717650037, + "learning_rate": 3.7200000000000004e-06, + "loss": 1.4269, + "step": 94 + }, + { + "epoch": 0.11397720455908818, + "grad_norm": 1.6485135378859557, + "learning_rate": 3.7600000000000004e-06, + "loss": 1.4154, + "step": 95 + }, + { + "epoch": 0.11517696460707859, + "grad_norm": 1.6754056219688747, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3267, + "step": 96 + }, + { + "epoch": 0.11637672465506899, + "grad_norm": 1.2972761759551974, + "learning_rate": 3.8400000000000005e-06, + "loss": 1.2842, + "step": 97 + }, + { + "epoch": 0.11757648470305938, + "grad_norm": 1.2720323324361733, + "learning_rate": 3.88e-06, + "loss": 1.131, + "step": 98 + }, + { + "epoch": 0.1187762447510498, + "grad_norm": 1.2532439771283779, + "learning_rate": 3.920000000000001e-06, + "loss": 1.2646, + "step": 99 + }, + { + "epoch": 0.11997600479904019, + "grad_norm": 1.7000249287596594, + "learning_rate": 3.96e-06, + "loss": 1.3158, + "step": 100 + }, + { + "epoch": 0.12117576484703059, + "grad_norm": 1.5235594363476401, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3772, + "step": 101 + }, + { + "epoch": 0.122375524895021, + "grad_norm": 1.301839610617614, + "learning_rate": 4.04e-06, + "loss": 1.2182, + "step": 102 + }, + { + "epoch": 0.1235752849430114, + "grad_norm": 1.210313514607747, + "learning_rate": 4.08e-06, + "loss": 1.1669, + "step": 103 + }, + { + "epoch": 0.1247750449910018, + "grad_norm": 1.30142947639613, + "learning_rate": 4.12e-06, + "loss": 1.0672, + "step": 104 + }, + { + "epoch": 0.1259748050389922, + "grad_norm": 3.128370236473383, + "learning_rate": 4.16e-06, + "loss": 1.2039, + "step": 105 + }, + { + "epoch": 0.1271745650869826, + "grad_norm": 1.4045390458763363, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.3575, + "step": 106 + }, + { + "epoch": 0.12837432513497302, + "grad_norm": 1.3667067390135492, + "learning_rate": 4.24e-06, + "loss": 1.2138, + "step": 107 + }, + { + "epoch": 0.1295740851829634, + "grad_norm": 1.2285297203972647, + "learning_rate": 4.2800000000000005e-06, + "loss": 1.2163, + "step": 108 + }, + { + "epoch": 0.1307738452309538, + "grad_norm": 1.3090900238777463, + "learning_rate": 4.32e-06, + "loss": 1.2605, + "step": 109 + }, + { + "epoch": 0.13197360527894422, + "grad_norm": 2.052730934540717, + "learning_rate": 4.360000000000001e-06, + "loss": 1.2653, + "step": 110 + }, + { + "epoch": 0.1331733653269346, + "grad_norm": 1.144841186760904, + "learning_rate": 4.4e-06, + "loss": 1.0678, + "step": 111 + }, + { + "epoch": 0.13437312537492502, + "grad_norm": 1.246084710168215, + "learning_rate": 4.440000000000001e-06, + "loss": 1.2, + "step": 112 + }, + { + "epoch": 0.13557288542291543, + "grad_norm": 1.6933198526774484, + "learning_rate": 4.48e-06, + "loss": 1.3187, + "step": 113 + }, + { + "epoch": 0.1367726454709058, + "grad_norm": 1.1999792730510719, + "learning_rate": 4.520000000000001e-06, + "loss": 1.1219, + "step": 114 + }, + { + "epoch": 0.13797240551889622, + "grad_norm": 1.0620219067763197, + "learning_rate": 4.56e-06, + "loss": 1.1237, + "step": 115 + }, + { + "epoch": 0.13917216556688664, + "grad_norm": 1.2599012736624202, + "learning_rate": 4.600000000000001e-06, + "loss": 1.1174, + "step": 116 + }, + { + "epoch": 0.14037192561487702, + "grad_norm": 1.0542956424602081, + "learning_rate": 4.6400000000000005e-06, + "loss": 1.0556, + "step": 117 + }, + { + "epoch": 0.14157168566286743, + "grad_norm": 1.4137377397969535, + "learning_rate": 4.680000000000001e-06, + "loss": 1.2119, + "step": 118 + }, + { + "epoch": 0.14277144571085784, + "grad_norm": 1.3907909495429018, + "learning_rate": 4.7200000000000005e-06, + "loss": 1.1808, + "step": 119 + }, + { + "epoch": 0.14397120575884823, + "grad_norm": 1.1932500680807583, + "learning_rate": 4.76e-06, + "loss": 1.1249, + "step": 120 + }, + { + "epoch": 0.14517096580683864, + "grad_norm": 1.0846222377259351, + "learning_rate": 4.800000000000001e-06, + "loss": 1.0507, + "step": 121 + }, + { + "epoch": 0.14637072585482905, + "grad_norm": 1.0610851208774656, + "learning_rate": 4.84e-06, + "loss": 1.1116, + "step": 122 + }, + { + "epoch": 0.14757048590281943, + "grad_norm": 0.886202868439996, + "learning_rate": 4.880000000000001e-06, + "loss": 0.9611, + "step": 123 + }, + { + "epoch": 0.14877024595080984, + "grad_norm": 1.6115026528324286, + "learning_rate": 4.92e-06, + "loss": 1.115, + "step": 124 + }, + { + "epoch": 0.14997000599880023, + "grad_norm": 1.0806217358873074, + "learning_rate": 4.960000000000001e-06, + "loss": 1.0021, + "step": 125 + }, + { + "epoch": 0.15116976604679064, + "grad_norm": 0.9920194026980443, + "learning_rate": 5e-06, + "loss": 1.062, + "step": 126 + }, + { + "epoch": 0.15236952609478105, + "grad_norm": 0.9327889709338532, + "learning_rate": 5.04e-06, + "loss": 1.0048, + "step": 127 + }, + { + "epoch": 0.15356928614277143, + "grad_norm": 1.0056043354123612, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.9148, + "step": 128 + }, + { + "epoch": 0.15476904619076184, + "grad_norm": 1.0190425193471366, + "learning_rate": 5.12e-06, + "loss": 1.0364, + "step": 129 + }, + { + "epoch": 0.15596880623875226, + "grad_norm": 0.8777918587504914, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.014, + "step": 130 + }, + { + "epoch": 0.15716856628674264, + "grad_norm": 1.1201006530409465, + "learning_rate": 5.2e-06, + "loss": 1.0061, + "step": 131 + }, + { + "epoch": 0.15836832633473305, + "grad_norm": 1.0665861855030594, + "learning_rate": 5.240000000000001e-06, + "loss": 1.0281, + "step": 132 + }, + { + "epoch": 0.15956808638272346, + "grad_norm": 1.2894515134551434, + "learning_rate": 5.28e-06, + "loss": 0.9923, + "step": 133 + }, + { + "epoch": 0.16076784643071385, + "grad_norm": 1.0389990888777316, + "learning_rate": 5.320000000000001e-06, + "loss": 1.1106, + "step": 134 + }, + { + "epoch": 0.16196760647870426, + "grad_norm": 1.0917817440032695, + "learning_rate": 5.36e-06, + "loss": 1.0392, + "step": 135 + }, + { + "epoch": 0.16316736652669467, + "grad_norm": 1.0587762299260288, + "learning_rate": 5.400000000000001e-06, + "loss": 1.0769, + "step": 136 + }, + { + "epoch": 0.16436712657468505, + "grad_norm": 0.8716176938258128, + "learning_rate": 5.4400000000000004e-06, + "loss": 1.0114, + "step": 137 + }, + { + "epoch": 0.16556688662267546, + "grad_norm": 1.1810297418673554, + "learning_rate": 5.480000000000001e-06, + "loss": 1.0641, + "step": 138 + }, + { + "epoch": 0.16676664667066587, + "grad_norm": 0.97161665734718, + "learning_rate": 5.5200000000000005e-06, + "loss": 1.0058, + "step": 139 + }, + { + "epoch": 0.16796640671865626, + "grad_norm": 1.0199814307298578, + "learning_rate": 5.560000000000001e-06, + "loss": 0.9774, + "step": 140 + }, + { + "epoch": 0.16916616676664667, + "grad_norm": 1.2324988972290545, + "learning_rate": 5.600000000000001e-06, + "loss": 1.0081, + "step": 141 + }, + { + "epoch": 0.17036592681463708, + "grad_norm": 0.9165804188035397, + "learning_rate": 5.64e-06, + "loss": 0.9421, + "step": 142 + }, + { + "epoch": 0.17156568686262746, + "grad_norm": 1.0948757075942301, + "learning_rate": 5.68e-06, + "loss": 0.9672, + "step": 143 + }, + { + "epoch": 0.17276544691061788, + "grad_norm": 1.3547639743881061, + "learning_rate": 5.72e-06, + "loss": 1.0737, + "step": 144 + }, + { + "epoch": 0.1739652069586083, + "grad_norm": 0.8432488035480182, + "learning_rate": 5.76e-06, + "loss": 0.9153, + "step": 145 + }, + { + "epoch": 0.17516496700659867, + "grad_norm": 0.9801210666456533, + "learning_rate": 5.8e-06, + "loss": 0.9635, + "step": 146 + }, + { + "epoch": 0.17636472705458908, + "grad_norm": 0.9391319754858495, + "learning_rate": 5.84e-06, + "loss": 0.9016, + "step": 147 + }, + { + "epoch": 0.1775644871025795, + "grad_norm": 1.0482432220624154, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.9298, + "step": 148 + }, + { + "epoch": 0.17876424715056988, + "grad_norm": 0.8909811362011276, + "learning_rate": 5.92e-06, + "loss": 0.8352, + "step": 149 + }, + { + "epoch": 0.1799640071985603, + "grad_norm": 0.9822132770617572, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.9224, + "step": 150 + }, + { + "epoch": 0.1811637672465507, + "grad_norm": 1.01230157575695, + "learning_rate": 6e-06, + "loss": 1.1265, + "step": 151 + }, + { + "epoch": 0.18236352729454108, + "grad_norm": 0.9437580880985108, + "learning_rate": 6.040000000000001e-06, + "loss": 0.958, + "step": 152 + }, + { + "epoch": 0.1835632873425315, + "grad_norm": 0.972791293358992, + "learning_rate": 6.08e-06, + "loss": 0.9232, + "step": 153 + }, + { + "epoch": 0.1847630473905219, + "grad_norm": 0.8858717763933377, + "learning_rate": 6.120000000000001e-06, + "loss": 0.9367, + "step": 154 + }, + { + "epoch": 0.1859628074385123, + "grad_norm": 0.9341273071118985, + "learning_rate": 6.16e-06, + "loss": 0.9434, + "step": 155 + }, + { + "epoch": 0.1871625674865027, + "grad_norm": 0.9816475511556988, + "learning_rate": 6.200000000000001e-06, + "loss": 1.0585, + "step": 156 + }, + { + "epoch": 0.1883623275344931, + "grad_norm": 0.8700040253263074, + "learning_rate": 6.24e-06, + "loss": 0.8949, + "step": 157 + }, + { + "epoch": 0.1895620875824835, + "grad_norm": 0.8317898088035741, + "learning_rate": 6.280000000000001e-06, + "loss": 0.9559, + "step": 158 + }, + { + "epoch": 0.1907618476304739, + "grad_norm": 1.1531747673452089, + "learning_rate": 6.3200000000000005e-06, + "loss": 1.0648, + "step": 159 + }, + { + "epoch": 0.19196160767846432, + "grad_norm": 0.8882450955578516, + "learning_rate": 6.360000000000001e-06, + "loss": 0.911, + "step": 160 + }, + { + "epoch": 0.1931613677264547, + "grad_norm": 0.8890920488919913, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.8869, + "step": 161 + }, + { + "epoch": 0.1943611277744451, + "grad_norm": 0.9109956956678691, + "learning_rate": 6.440000000000001e-06, + "loss": 0.9941, + "step": 162 + }, + { + "epoch": 0.19556088782243553, + "grad_norm": 0.831996363776618, + "learning_rate": 6.480000000000001e-06, + "loss": 0.8425, + "step": 163 + }, + { + "epoch": 0.1967606478704259, + "grad_norm": 0.8344823541062902, + "learning_rate": 6.520000000000001e-06, + "loss": 0.8864, + "step": 164 + }, + { + "epoch": 0.19796040791841632, + "grad_norm": 1.093300270347132, + "learning_rate": 6.560000000000001e-06, + "loss": 0.9468, + "step": 165 + }, + { + "epoch": 0.19916016796640673, + "grad_norm": 0.9247298608370305, + "learning_rate": 6.600000000000001e-06, + "loss": 0.9053, + "step": 166 + }, + { + "epoch": 0.20035992801439712, + "grad_norm": 0.8701588667382361, + "learning_rate": 6.640000000000001e-06, + "loss": 0.9582, + "step": 167 + }, + { + "epoch": 0.20155968806238753, + "grad_norm": 0.9402890562149786, + "learning_rate": 6.680000000000001e-06, + "loss": 0.9583, + "step": 168 + }, + { + "epoch": 0.20275944811037794, + "grad_norm": 0.9482865539009554, + "learning_rate": 6.720000000000001e-06, + "loss": 0.9701, + "step": 169 + }, + { + "epoch": 0.20395920815836832, + "grad_norm": 0.9602660237737364, + "learning_rate": 6.760000000000001e-06, + "loss": 0.8874, + "step": 170 + }, + { + "epoch": 0.20515896820635873, + "grad_norm": 0.9616673957223468, + "learning_rate": 6.800000000000001e-06, + "loss": 0.9417, + "step": 171 + }, + { + "epoch": 0.20635872825434912, + "grad_norm": 0.9294318758101818, + "learning_rate": 6.8400000000000014e-06, + "loss": 1.0462, + "step": 172 + }, + { + "epoch": 0.20755848830233953, + "grad_norm": 1.025443081693529, + "learning_rate": 6.88e-06, + "loss": 1.0368, + "step": 173 + }, + { + "epoch": 0.20875824835032994, + "grad_norm": 0.9243795087667857, + "learning_rate": 6.92e-06, + "loss": 0.9266, + "step": 174 + }, + { + "epoch": 0.20995800839832032, + "grad_norm": 0.8913356558613602, + "learning_rate": 6.96e-06, + "loss": 0.8184, + "step": 175 + }, + { + "epoch": 0.21115776844631073, + "grad_norm": 0.8235026040452708, + "learning_rate": 7e-06, + "loss": 0.8625, + "step": 176 + }, + { + "epoch": 0.21235752849430115, + "grad_norm": 0.8610225245675168, + "learning_rate": 7.04e-06, + "loss": 0.8752, + "step": 177 + }, + { + "epoch": 0.21355728854229153, + "grad_norm": 0.9258435972085604, + "learning_rate": 7.08e-06, + "loss": 0.8718, + "step": 178 + }, + { + "epoch": 0.21475704859028194, + "grad_norm": 0.9663599667481519, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.9455, + "step": 179 + }, + { + "epoch": 0.21595680863827235, + "grad_norm": 0.8891513081143456, + "learning_rate": 7.16e-06, + "loss": 0.8913, + "step": 180 + }, + { + "epoch": 0.21715656868626274, + "grad_norm": 0.8901809205969442, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.806, + "step": 181 + }, + { + "epoch": 0.21835632873425315, + "grad_norm": 0.9125751194829773, + "learning_rate": 7.24e-06, + "loss": 0.91, + "step": 182 + }, + { + "epoch": 0.21955608878224356, + "grad_norm": 0.8419253308107584, + "learning_rate": 7.280000000000001e-06, + "loss": 0.899, + "step": 183 + }, + { + "epoch": 0.22075584883023394, + "grad_norm": 0.9065419690594954, + "learning_rate": 7.32e-06, + "loss": 0.8456, + "step": 184 + }, + { + "epoch": 0.22195560887822435, + "grad_norm": 0.8982973381361369, + "learning_rate": 7.360000000000001e-06, + "loss": 0.8708, + "step": 185 + }, + { + "epoch": 0.22315536892621476, + "grad_norm": 0.8089289556734092, + "learning_rate": 7.4e-06, + "loss": 0.8851, + "step": 186 + }, + { + "epoch": 0.22435512897420515, + "grad_norm": 0.9659392279213572, + "learning_rate": 7.440000000000001e-06, + "loss": 0.9239, + "step": 187 + }, + { + "epoch": 0.22555488902219556, + "grad_norm": 0.9918304286118667, + "learning_rate": 7.48e-06, + "loss": 0.8922, + "step": 188 + }, + { + "epoch": 0.22675464907018597, + "grad_norm": 0.8870035129018767, + "learning_rate": 7.520000000000001e-06, + "loss": 0.7712, + "step": 189 + }, + { + "epoch": 0.22795440911817635, + "grad_norm": 0.8304274635703632, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.874, + "step": 190 + }, + { + "epoch": 0.22915416916616677, + "grad_norm": 0.9364956937637816, + "learning_rate": 7.600000000000001e-06, + "loss": 0.9543, + "step": 191 + }, + { + "epoch": 0.23035392921415718, + "grad_norm": 0.8742969550728668, + "learning_rate": 7.640000000000001e-06, + "loss": 0.9191, + "step": 192 + }, + { + "epoch": 0.23155368926214756, + "grad_norm": 0.9691851065618883, + "learning_rate": 7.680000000000001e-06, + "loss": 0.8632, + "step": 193 + }, + { + "epoch": 0.23275344931013797, + "grad_norm": 0.8438033215721245, + "learning_rate": 7.72e-06, + "loss": 0.9172, + "step": 194 + }, + { + "epoch": 0.23395320935812838, + "grad_norm": 0.9049771481466592, + "learning_rate": 7.76e-06, + "loss": 0.9471, + "step": 195 + }, + { + "epoch": 0.23515296940611877, + "grad_norm": 0.989123646719365, + "learning_rate": 7.800000000000002e-06, + "loss": 0.8708, + "step": 196 + }, + { + "epoch": 0.23635272945410918, + "grad_norm": 0.9706825120362642, + "learning_rate": 7.840000000000001e-06, + "loss": 0.9137, + "step": 197 + }, + { + "epoch": 0.2375524895020996, + "grad_norm": 0.8874052204518389, + "learning_rate": 7.88e-06, + "loss": 0.9451, + "step": 198 + }, + { + "epoch": 0.23875224955008997, + "grad_norm": 0.822402836098356, + "learning_rate": 7.92e-06, + "loss": 0.8645, + "step": 199 + }, + { + "epoch": 0.23995200959808038, + "grad_norm": 0.8547125621693455, + "learning_rate": 7.960000000000002e-06, + "loss": 0.846, + "step": 200 + }, + { + "epoch": 0.2411517696460708, + "grad_norm": 1.1387174253435128, + "learning_rate": 8.000000000000001e-06, + "loss": 0.952, + "step": 201 + }, + { + "epoch": 0.24235152969406118, + "grad_norm": 0.9321636106022483, + "learning_rate": 8.040000000000001e-06, + "loss": 0.7929, + "step": 202 + }, + { + "epoch": 0.2435512897420516, + "grad_norm": 0.8972716580989439, + "learning_rate": 8.08e-06, + "loss": 0.9002, + "step": 203 + }, + { + "epoch": 0.244751049790042, + "grad_norm": 0.8627127688612859, + "learning_rate": 8.120000000000002e-06, + "loss": 0.8346, + "step": 204 + }, + { + "epoch": 0.24595080983803239, + "grad_norm": 0.9804340797810391, + "learning_rate": 8.16e-06, + "loss": 0.8275, + "step": 205 + }, + { + "epoch": 0.2471505698860228, + "grad_norm": 0.9665883639615581, + "learning_rate": 8.2e-06, + "loss": 0.8732, + "step": 206 + }, + { + "epoch": 0.2483503299340132, + "grad_norm": 0.9978598494460422, + "learning_rate": 8.24e-06, + "loss": 0.8873, + "step": 207 + }, + { + "epoch": 0.2495500899820036, + "grad_norm": 0.835752279975912, + "learning_rate": 8.28e-06, + "loss": 0.7762, + "step": 208 + }, + { + "epoch": 0.250749850029994, + "grad_norm": 0.8056768392805612, + "learning_rate": 8.32e-06, + "loss": 0.8065, + "step": 209 + }, + { + "epoch": 0.2519496100779844, + "grad_norm": 1.0454200086020167, + "learning_rate": 8.36e-06, + "loss": 0.9384, + "step": 210 + }, + { + "epoch": 0.2531493701259748, + "grad_norm": 0.8460330286684768, + "learning_rate": 8.400000000000001e-06, + "loss": 0.8941, + "step": 211 + }, + { + "epoch": 0.2543491301739652, + "grad_norm": 0.8447583533784957, + "learning_rate": 8.44e-06, + "loss": 0.8714, + "step": 212 + }, + { + "epoch": 0.2555488902219556, + "grad_norm": 0.8376650804593687, + "learning_rate": 8.48e-06, + "loss": 0.8926, + "step": 213 + }, + { + "epoch": 0.25674865026994603, + "grad_norm": 0.8489358021207093, + "learning_rate": 8.52e-06, + "loss": 0.8818, + "step": 214 + }, + { + "epoch": 0.2579484103179364, + "grad_norm": 0.9236326997232677, + "learning_rate": 8.560000000000001e-06, + "loss": 0.9013, + "step": 215 + }, + { + "epoch": 0.2591481703659268, + "grad_norm": 0.8525808314403832, + "learning_rate": 8.6e-06, + "loss": 0.8728, + "step": 216 + }, + { + "epoch": 0.26034793041391724, + "grad_norm": 0.79761436147496, + "learning_rate": 8.64e-06, + "loss": 0.8644, + "step": 217 + }, + { + "epoch": 0.2615476904619076, + "grad_norm": 0.8960550194483666, + "learning_rate": 8.68e-06, + "loss": 0.8378, + "step": 218 + }, + { + "epoch": 0.262747450509898, + "grad_norm": 0.8592564745427348, + "learning_rate": 8.720000000000001e-06, + "loss": 0.7725, + "step": 219 + }, + { + "epoch": 0.26394721055788845, + "grad_norm": 0.9498899750514396, + "learning_rate": 8.76e-06, + "loss": 0.9275, + "step": 220 + }, + { + "epoch": 0.26514697060587883, + "grad_norm": 0.9793881370045231, + "learning_rate": 8.8e-06, + "loss": 0.9023, + "step": 221 + }, + { + "epoch": 0.2663467306538692, + "grad_norm": 0.7546525454223989, + "learning_rate": 8.84e-06, + "loss": 0.8087, + "step": 222 + }, + { + "epoch": 0.26754649070185965, + "grad_norm": 0.895743262472989, + "learning_rate": 8.880000000000001e-06, + "loss": 0.8462, + "step": 223 + }, + { + "epoch": 0.26874625074985004, + "grad_norm": 0.8508391890954724, + "learning_rate": 8.920000000000001e-06, + "loss": 0.9041, + "step": 224 + }, + { + "epoch": 0.2699460107978404, + "grad_norm": 0.8441234806191734, + "learning_rate": 8.96e-06, + "loss": 0.8307, + "step": 225 + }, + { + "epoch": 0.27114577084583086, + "grad_norm": 0.8792012530549744, + "learning_rate": 9e-06, + "loss": 0.8054, + "step": 226 + }, + { + "epoch": 0.27234553089382124, + "grad_norm": 0.8862693866179844, + "learning_rate": 9.040000000000002e-06, + "loss": 0.8082, + "step": 227 + }, + { + "epoch": 0.2735452909418116, + "grad_norm": 0.936765475346329, + "learning_rate": 9.080000000000001e-06, + "loss": 0.8213, + "step": 228 + }, + { + "epoch": 0.27474505098980206, + "grad_norm": 0.8845364940014634, + "learning_rate": 9.12e-06, + "loss": 0.8575, + "step": 229 + }, + { + "epoch": 0.27594481103779245, + "grad_norm": 0.8943979321402359, + "learning_rate": 9.16e-06, + "loss": 0.8593, + "step": 230 + }, + { + "epoch": 0.27714457108578283, + "grad_norm": 0.8746312602873112, + "learning_rate": 9.200000000000002e-06, + "loss": 0.8003, + "step": 231 + }, + { + "epoch": 0.27834433113377327, + "grad_norm": 0.8863435690762195, + "learning_rate": 9.240000000000001e-06, + "loss": 0.8141, + "step": 232 + }, + { + "epoch": 0.27954409118176365, + "grad_norm": 0.8886892400062681, + "learning_rate": 9.280000000000001e-06, + "loss": 0.8593, + "step": 233 + }, + { + "epoch": 0.28074385122975404, + "grad_norm": 0.8938765308959884, + "learning_rate": 9.32e-06, + "loss": 0.7926, + "step": 234 + }, + { + "epoch": 0.2819436112777445, + "grad_norm": 0.8905592256018503, + "learning_rate": 9.360000000000002e-06, + "loss": 0.7802, + "step": 235 + }, + { + "epoch": 0.28314337132573486, + "grad_norm": 0.944437847777848, + "learning_rate": 9.4e-06, + "loss": 0.8699, + "step": 236 + }, + { + "epoch": 0.28434313137372524, + "grad_norm": 0.9264151484947536, + "learning_rate": 9.440000000000001e-06, + "loss": 0.767, + "step": 237 + }, + { + "epoch": 0.2855428914217157, + "grad_norm": 0.7480605034270361, + "learning_rate": 9.48e-06, + "loss": 0.7868, + "step": 238 + }, + { + "epoch": 0.28674265146970607, + "grad_norm": 1.1013740308750852, + "learning_rate": 9.52e-06, + "loss": 0.8251, + "step": 239 + }, + { + "epoch": 0.28794241151769645, + "grad_norm": 0.817306124632072, + "learning_rate": 9.56e-06, + "loss": 0.7849, + "step": 240 + }, + { + "epoch": 0.2891421715656869, + "grad_norm": 0.8677167723424389, + "learning_rate": 9.600000000000001e-06, + "loss": 0.8002, + "step": 241 + }, + { + "epoch": 0.2903419316136773, + "grad_norm": 0.9154139015776457, + "learning_rate": 9.640000000000001e-06, + "loss": 0.8361, + "step": 242 + }, + { + "epoch": 0.29154169166166766, + "grad_norm": 0.9346294929702866, + "learning_rate": 9.68e-06, + "loss": 0.8009, + "step": 243 + }, + { + "epoch": 0.2927414517096581, + "grad_norm": 0.7915758604161942, + "learning_rate": 9.72e-06, + "loss": 0.828, + "step": 244 + }, + { + "epoch": 0.2939412117576485, + "grad_norm": 0.9537088344426334, + "learning_rate": 9.760000000000001e-06, + "loss": 0.8671, + "step": 245 + }, + { + "epoch": 0.29514097180563886, + "grad_norm": 1.0483821102743403, + "learning_rate": 9.800000000000001e-06, + "loss": 0.885, + "step": 246 + }, + { + "epoch": 0.29634073185362925, + "grad_norm": 0.9102427378891443, + "learning_rate": 9.84e-06, + "loss": 0.7805, + "step": 247 + }, + { + "epoch": 0.2975404919016197, + "grad_norm": 0.9252259031766393, + "learning_rate": 9.88e-06, + "loss": 0.8074, + "step": 248 + }, + { + "epoch": 0.29874025194961007, + "grad_norm": 0.8499107060128288, + "learning_rate": 9.920000000000002e-06, + "loss": 0.776, + "step": 249 + }, + { + "epoch": 0.29994001199760045, + "grad_norm": 1.281569882769472, + "learning_rate": 9.960000000000001e-06, + "loss": 0.8668, + "step": 250 + }, + { + "epoch": 0.3011397720455909, + "grad_norm": 0.8619792688405079, + "learning_rate": 1e-05, + "loss": 0.8864, + "step": 251 + }, + { + "epoch": 0.3023395320935813, + "grad_norm": 0.8951400482973322, + "learning_rate": 9.99999512178685e-06, + "loss": 0.8235, + "step": 252 + }, + { + "epoch": 0.30353929214157166, + "grad_norm": 0.967921131549879, + "learning_rate": 9.99998048715692e-06, + "loss": 0.7992, + "step": 253 + }, + { + "epoch": 0.3047390521895621, + "grad_norm": 0.840452525603444, + "learning_rate": 9.999956096138765e-06, + "loss": 0.7427, + "step": 254 + }, + { + "epoch": 0.3059388122375525, + "grad_norm": 1.0000789029765795, + "learning_rate": 9.999921948779978e-06, + "loss": 0.873, + "step": 255 + }, + { + "epoch": 0.30713857228554287, + "grad_norm": 0.8363738524608543, + "learning_rate": 9.99987804514719e-06, + "loss": 0.8984, + "step": 256 + }, + { + "epoch": 0.3083383323335333, + "grad_norm": 0.903315872911508, + "learning_rate": 9.999824385326073e-06, + "loss": 0.8936, + "step": 257 + }, + { + "epoch": 0.3095380923815237, + "grad_norm": 0.8112526213382579, + "learning_rate": 9.99976096942133e-06, + "loss": 0.8415, + "step": 258 + }, + { + "epoch": 0.31073785242951407, + "grad_norm": 0.8540518199641305, + "learning_rate": 9.999687797556705e-06, + "loss": 0.8041, + "step": 259 + }, + { + "epoch": 0.3119376124775045, + "grad_norm": 0.7614123112280738, + "learning_rate": 9.999604869874975e-06, + "loss": 0.9015, + "step": 260 + }, + { + "epoch": 0.3131373725254949, + "grad_norm": 0.8245082227173874, + "learning_rate": 9.999512186537957e-06, + "loss": 0.8514, + "step": 261 + }, + { + "epoch": 0.3143371325734853, + "grad_norm": 0.8054911223663823, + "learning_rate": 9.999409747726502e-06, + "loss": 0.8167, + "step": 262 + }, + { + "epoch": 0.3155368926214757, + "grad_norm": 0.8903884772939344, + "learning_rate": 9.999297553640498e-06, + "loss": 0.8892, + "step": 263 + }, + { + "epoch": 0.3167366526694661, + "grad_norm": 0.8458806392274165, + "learning_rate": 9.999175604498867e-06, + "loss": 0.8686, + "step": 264 + }, + { + "epoch": 0.3179364127174565, + "grad_norm": 0.8306255702145542, + "learning_rate": 9.999043900539568e-06, + "loss": 0.8528, + "step": 265 + }, + { + "epoch": 0.3191361727654469, + "grad_norm": 0.942771419645441, + "learning_rate": 9.99890244201959e-06, + "loss": 0.7698, + "step": 266 + }, + { + "epoch": 0.3203359328134373, + "grad_norm": 0.9116916527123582, + "learning_rate": 9.998751229214962e-06, + "loss": 0.7459, + "step": 267 + }, + { + "epoch": 0.3215356928614277, + "grad_norm": 0.8315868072518188, + "learning_rate": 9.998590262420743e-06, + "loss": 0.7962, + "step": 268 + }, + { + "epoch": 0.32273545290941813, + "grad_norm": 0.8560623972580833, + "learning_rate": 9.998419541951024e-06, + "loss": 0.8476, + "step": 269 + }, + { + "epoch": 0.3239352129574085, + "grad_norm": 0.9529061379410192, + "learning_rate": 9.998239068138929e-06, + "loss": 0.8171, + "step": 270 + }, + { + "epoch": 0.3251349730053989, + "grad_norm": 0.8807774080549793, + "learning_rate": 9.998048841336615e-06, + "loss": 0.8538, + "step": 271 + }, + { + "epoch": 0.32633473305338934, + "grad_norm": 0.85447879331899, + "learning_rate": 9.997848861915268e-06, + "loss": 0.8696, + "step": 272 + }, + { + "epoch": 0.3275344931013797, + "grad_norm": 1.018623091493027, + "learning_rate": 9.997639130265104e-06, + "loss": 0.7725, + "step": 273 + }, + { + "epoch": 0.3287342531493701, + "grad_norm": 0.8746987739592668, + "learning_rate": 9.997419646795372e-06, + "loss": 0.86, + "step": 274 + }, + { + "epoch": 0.32993401319736054, + "grad_norm": 0.921512734396811, + "learning_rate": 9.997190411934346e-06, + "loss": 0.7502, + "step": 275 + }, + { + "epoch": 0.3311337732453509, + "grad_norm": 0.9161536633828824, + "learning_rate": 9.996951426129327e-06, + "loss": 0.8332, + "step": 276 + }, + { + "epoch": 0.3323335332933413, + "grad_norm": 0.8829663327810717, + "learning_rate": 9.996702689846645e-06, + "loss": 0.7447, + "step": 277 + }, + { + "epoch": 0.33353329334133175, + "grad_norm": 0.8870347194650321, + "learning_rate": 9.996444203571655e-06, + "loss": 0.7957, + "step": 278 + }, + { + "epoch": 0.33473305338932213, + "grad_norm": 1.0771228842939597, + "learning_rate": 9.996175967808738e-06, + "loss": 0.8305, + "step": 279 + }, + { + "epoch": 0.3359328134373125, + "grad_norm": 1.0172416792396413, + "learning_rate": 9.995897983081301e-06, + "loss": 0.8696, + "step": 280 + }, + { + "epoch": 0.33713257348530296, + "grad_norm": 0.9606670435787978, + "learning_rate": 9.995610249931769e-06, + "loss": 0.8481, + "step": 281 + }, + { + "epoch": 0.33833233353329334, + "grad_norm": 0.9026045069176893, + "learning_rate": 9.995312768921592e-06, + "loss": 0.8514, + "step": 282 + }, + { + "epoch": 0.3395320935812837, + "grad_norm": 0.9391209813217448, + "learning_rate": 9.995005540631239e-06, + "loss": 0.7811, + "step": 283 + }, + { + "epoch": 0.34073185362927416, + "grad_norm": 0.8262010905204108, + "learning_rate": 9.994688565660201e-06, + "loss": 0.7676, + "step": 284 + }, + { + "epoch": 0.34193161367726455, + "grad_norm": 0.8447905564304076, + "learning_rate": 9.994361844626988e-06, + "loss": 0.7969, + "step": 285 + }, + { + "epoch": 0.34313137372525493, + "grad_norm": 0.8682002293146721, + "learning_rate": 9.994025378169123e-06, + "loss": 0.7306, + "step": 286 + }, + { + "epoch": 0.34433113377324537, + "grad_norm": 1.0013520695774265, + "learning_rate": 9.99367916694315e-06, + "loss": 0.8426, + "step": 287 + }, + { + "epoch": 0.34553089382123575, + "grad_norm": 0.8399202189103822, + "learning_rate": 9.993323211624625e-06, + "loss": 0.7884, + "step": 288 + }, + { + "epoch": 0.34673065386922614, + "grad_norm": 0.7564780955692382, + "learning_rate": 9.992957512908121e-06, + "loss": 0.7966, + "step": 289 + }, + { + "epoch": 0.3479304139172166, + "grad_norm": 0.8786188259825324, + "learning_rate": 9.992582071507217e-06, + "loss": 0.83, + "step": 290 + }, + { + "epoch": 0.34913017396520696, + "grad_norm": 0.8263855819597197, + "learning_rate": 9.992196888154508e-06, + "loss": 0.7298, + "step": 291 + }, + { + "epoch": 0.35032993401319734, + "grad_norm": 0.8335583134477816, + "learning_rate": 9.991801963601595e-06, + "loss": 0.7603, + "step": 292 + }, + { + "epoch": 0.3515296940611878, + "grad_norm": 0.8523253593159443, + "learning_rate": 9.991397298619091e-06, + "loss": 0.7732, + "step": 293 + }, + { + "epoch": 0.35272945410917816, + "grad_norm": 0.9147643300032103, + "learning_rate": 9.990982893996612e-06, + "loss": 0.8953, + "step": 294 + }, + { + "epoch": 0.35392921415716855, + "grad_norm": 0.8053352375638536, + "learning_rate": 9.990558750542778e-06, + "loss": 0.8504, + "step": 295 + }, + { + "epoch": 0.355128974205159, + "grad_norm": 0.8642158057007054, + "learning_rate": 9.990124869085216e-06, + "loss": 0.8018, + "step": 296 + }, + { + "epoch": 0.35632873425314937, + "grad_norm": 0.8944881258422162, + "learning_rate": 9.989681250470551e-06, + "loss": 0.7362, + "step": 297 + }, + { + "epoch": 0.35752849430113975, + "grad_norm": 0.8445661968651367, + "learning_rate": 9.98922789556441e-06, + "loss": 0.7657, + "step": 298 + }, + { + "epoch": 0.3587282543491302, + "grad_norm": 0.7985049700643867, + "learning_rate": 9.988764805251417e-06, + "loss": 0.7836, + "step": 299 + }, + { + "epoch": 0.3599280143971206, + "grad_norm": 0.814789600523998, + "learning_rate": 9.988291980435194e-06, + "loss": 0.8507, + "step": 300 + }, + { + "epoch": 0.36112777444511096, + "grad_norm": 0.877151130445411, + "learning_rate": 9.98780942203836e-06, + "loss": 0.8123, + "step": 301 + }, + { + "epoch": 0.3623275344931014, + "grad_norm": 0.8341864995834861, + "learning_rate": 9.987317131002519e-06, + "loss": 0.795, + "step": 302 + }, + { + "epoch": 0.3635272945410918, + "grad_norm": 0.9050260956788025, + "learning_rate": 9.986815108288273e-06, + "loss": 0.7917, + "step": 303 + }, + { + "epoch": 0.36472705458908217, + "grad_norm": 0.8567073056297471, + "learning_rate": 9.986303354875213e-06, + "loss": 0.8435, + "step": 304 + }, + { + "epoch": 0.3659268146370726, + "grad_norm": 0.8858524400221668, + "learning_rate": 9.985781871761915e-06, + "loss": 0.7367, + "step": 305 + }, + { + "epoch": 0.367126574685063, + "grad_norm": 0.8350475034028912, + "learning_rate": 9.98525065996594e-06, + "loss": 0.7849, + "step": 306 + }, + { + "epoch": 0.3683263347330534, + "grad_norm": 1.0440799632179578, + "learning_rate": 9.984709720523836e-06, + "loss": 0.8932, + "step": 307 + }, + { + "epoch": 0.3695260947810438, + "grad_norm": 0.724952917346773, + "learning_rate": 9.98415905449113e-06, + "loss": 0.7721, + "step": 308 + }, + { + "epoch": 0.3707258548290342, + "grad_norm": 0.8473537207653666, + "learning_rate": 9.983598662942325e-06, + "loss": 0.8477, + "step": 309 + }, + { + "epoch": 0.3719256148770246, + "grad_norm": 1.1641508886486074, + "learning_rate": 9.983028546970908e-06, + "loss": 0.817, + "step": 310 + }, + { + "epoch": 0.373125374925015, + "grad_norm": 0.8475121849998504, + "learning_rate": 9.982448707689338e-06, + "loss": 0.7466, + "step": 311 + }, + { + "epoch": 0.3743251349730054, + "grad_norm": 0.9417474319435428, + "learning_rate": 9.981859146229047e-06, + "loss": 0.7882, + "step": 312 + }, + { + "epoch": 0.3755248950209958, + "grad_norm": 0.8832965885957832, + "learning_rate": 9.981259863740435e-06, + "loss": 0.8116, + "step": 313 + }, + { + "epoch": 0.3767246550689862, + "grad_norm": 0.81201642834222, + "learning_rate": 9.980650861392877e-06, + "loss": 0.8432, + "step": 314 + }, + { + "epoch": 0.3779244151169766, + "grad_norm": 0.84708845432052, + "learning_rate": 9.980032140374706e-06, + "loss": 0.8342, + "step": 315 + }, + { + "epoch": 0.379124175164967, + "grad_norm": 0.9425876218381913, + "learning_rate": 9.979403701893226e-06, + "loss": 0.8462, + "step": 316 + }, + { + "epoch": 0.38032393521295743, + "grad_norm": 0.8137886430648797, + "learning_rate": 9.9787655471747e-06, + "loss": 0.7919, + "step": 317 + }, + { + "epoch": 0.3815236952609478, + "grad_norm": 0.7908051128093959, + "learning_rate": 9.97811767746435e-06, + "loss": 0.8084, + "step": 318 + }, + { + "epoch": 0.3827234553089382, + "grad_norm": 0.7982496312542597, + "learning_rate": 9.97746009402635e-06, + "loss": 0.8085, + "step": 319 + }, + { + "epoch": 0.38392321535692864, + "grad_norm": 0.776071189899833, + "learning_rate": 9.976792798143837e-06, + "loss": 0.7764, + "step": 320 + }, + { + "epoch": 0.385122975404919, + "grad_norm": 1.4618534145096531, + "learning_rate": 9.976115791118897e-06, + "loss": 0.8238, + "step": 321 + }, + { + "epoch": 0.3863227354529094, + "grad_norm": 0.7150889271178531, + "learning_rate": 9.97542907427256e-06, + "loss": 0.7196, + "step": 322 + }, + { + "epoch": 0.38752249550089984, + "grad_norm": 0.8050788339997501, + "learning_rate": 9.97473264894481e-06, + "loss": 0.7565, + "step": 323 + }, + { + "epoch": 0.3887222555488902, + "grad_norm": 0.9983272035179562, + "learning_rate": 9.97402651649457e-06, + "loss": 0.8429, + "step": 324 + }, + { + "epoch": 0.3899220155968806, + "grad_norm": 0.8623842545763719, + "learning_rate": 9.973310678299704e-06, + "loss": 0.7245, + "step": 325 + }, + { + "epoch": 0.39112177564487105, + "grad_norm": 0.8591131060288989, + "learning_rate": 9.97258513575702e-06, + "loss": 0.7125, + "step": 326 + }, + { + "epoch": 0.39232153569286143, + "grad_norm": 0.6711400146323111, + "learning_rate": 9.971849890282255e-06, + "loss": 0.7545, + "step": 327 + }, + { + "epoch": 0.3935212957408518, + "grad_norm": 0.768957585508672, + "learning_rate": 9.971104943310086e-06, + "loss": 0.7789, + "step": 328 + }, + { + "epoch": 0.39472105578884226, + "grad_norm": 1.1384395543413788, + "learning_rate": 9.970350296294114e-06, + "loss": 0.8106, + "step": 329 + }, + { + "epoch": 0.39592081583683264, + "grad_norm": 0.877220895635206, + "learning_rate": 9.969585950706872e-06, + "loss": 0.7892, + "step": 330 + }, + { + "epoch": 0.397120575884823, + "grad_norm": 0.8104918613582576, + "learning_rate": 9.968811908039818e-06, + "loss": 0.7972, + "step": 331 + }, + { + "epoch": 0.39832033593281346, + "grad_norm": 0.8234689194129275, + "learning_rate": 9.968028169803326e-06, + "loss": 0.809, + "step": 332 + }, + { + "epoch": 0.39952009598080385, + "grad_norm": 0.8441849338357255, + "learning_rate": 9.967234737526695e-06, + "loss": 0.775, + "step": 333 + }, + { + "epoch": 0.40071985602879423, + "grad_norm": 0.842204245498853, + "learning_rate": 9.966431612758138e-06, + "loss": 0.6613, + "step": 334 + }, + { + "epoch": 0.40191961607678467, + "grad_norm": 0.8800128959101974, + "learning_rate": 9.965618797064782e-06, + "loss": 0.747, + "step": 335 + }, + { + "epoch": 0.40311937612477505, + "grad_norm": 0.9511458805031532, + "learning_rate": 9.96479629203266e-06, + "loss": 0.7791, + "step": 336 + }, + { + "epoch": 0.40431913617276544, + "grad_norm": 0.9726969708248384, + "learning_rate": 9.963964099266715e-06, + "loss": 0.7684, + "step": 337 + }, + { + "epoch": 0.4055188962207559, + "grad_norm": 1.0222745919628025, + "learning_rate": 9.963122220390791e-06, + "loss": 0.7814, + "step": 338 + }, + { + "epoch": 0.40671865626874626, + "grad_norm": 1.0179699232834631, + "learning_rate": 9.962270657047634e-06, + "loss": 0.7701, + "step": 339 + }, + { + "epoch": 0.40791841631673664, + "grad_norm": 0.8559764361792539, + "learning_rate": 9.96140941089889e-06, + "loss": 0.8081, + "step": 340 + }, + { + "epoch": 0.4091181763647271, + "grad_norm": 0.8008244277348966, + "learning_rate": 9.960538483625092e-06, + "loss": 0.7081, + "step": 341 + }, + { + "epoch": 0.41031793641271747, + "grad_norm": 0.802937705951165, + "learning_rate": 9.959657876925671e-06, + "loss": 0.8017, + "step": 342 + }, + { + "epoch": 0.41151769646070785, + "grad_norm": 0.8658546979578301, + "learning_rate": 9.95876759251894e-06, + "loss": 0.7642, + "step": 343 + }, + { + "epoch": 0.41271745650869823, + "grad_norm": 0.8358056079222644, + "learning_rate": 9.957867632142098e-06, + "loss": 0.7632, + "step": 344 + }, + { + "epoch": 0.41391721655668867, + "grad_norm": 3.1390710745913935, + "learning_rate": 9.956957997551224e-06, + "loss": 0.7971, + "step": 345 + }, + { + "epoch": 0.41511697660467906, + "grad_norm": 1.0243036157293175, + "learning_rate": 9.956038690521276e-06, + "loss": 0.8726, + "step": 346 + }, + { + "epoch": 0.41631673665266944, + "grad_norm": 0.8289269012612019, + "learning_rate": 9.955109712846083e-06, + "loss": 0.7728, + "step": 347 + }, + { + "epoch": 0.4175164967006599, + "grad_norm": 0.8245470625798709, + "learning_rate": 9.954171066338346e-06, + "loss": 0.7399, + "step": 348 + }, + { + "epoch": 0.41871625674865026, + "grad_norm": 0.7171137486202372, + "learning_rate": 9.95322275282963e-06, + "loss": 0.7375, + "step": 349 + }, + { + "epoch": 0.41991601679664065, + "grad_norm": 0.8337532347907729, + "learning_rate": 9.952264774170368e-06, + "loss": 0.8555, + "step": 350 + }, + { + "epoch": 0.4211157768446311, + "grad_norm": 0.8802297139352729, + "learning_rate": 9.95129713222985e-06, + "loss": 0.7439, + "step": 351 + }, + { + "epoch": 0.42231553689262147, + "grad_norm": 0.871172679451122, + "learning_rate": 9.95031982889622e-06, + "loss": 0.7549, + "step": 352 + }, + { + "epoch": 0.42351529694061185, + "grad_norm": 0.8808416904757895, + "learning_rate": 9.949332866076475e-06, + "loss": 0.7092, + "step": 353 + }, + { + "epoch": 0.4247150569886023, + "grad_norm": 0.8948904416493283, + "learning_rate": 9.948336245696461e-06, + "loss": 0.7929, + "step": 354 + }, + { + "epoch": 0.4259148170365927, + "grad_norm": 0.7020740256232711, + "learning_rate": 9.94732996970087e-06, + "loss": 0.7058, + "step": 355 + }, + { + "epoch": 0.42711457708458306, + "grad_norm": 0.7923237942501681, + "learning_rate": 9.946314040053234e-06, + "loss": 0.7346, + "step": 356 + }, + { + "epoch": 0.4283143371325735, + "grad_norm": 0.7968620685418168, + "learning_rate": 9.945288458735919e-06, + "loss": 0.781, + "step": 357 + }, + { + "epoch": 0.4295140971805639, + "grad_norm": 0.8543323867178862, + "learning_rate": 9.94425322775013e-06, + "loss": 0.816, + "step": 358 + }, + { + "epoch": 0.43071385722855426, + "grad_norm": 0.9090120863126556, + "learning_rate": 9.943208349115893e-06, + "loss": 0.7682, + "step": 359 + }, + { + "epoch": 0.4319136172765447, + "grad_norm": 0.816617430755013, + "learning_rate": 9.94215382487207e-06, + "loss": 0.7139, + "step": 360 + }, + { + "epoch": 0.4331133773245351, + "grad_norm": 0.7208369485157738, + "learning_rate": 9.941089657076334e-06, + "loss": 0.7975, + "step": 361 + }, + { + "epoch": 0.43431313737252547, + "grad_norm": 0.7129151616725876, + "learning_rate": 9.940015847805183e-06, + "loss": 0.761, + "step": 362 + }, + { + "epoch": 0.4355128974205159, + "grad_norm": 0.8814622211192246, + "learning_rate": 9.938932399153924e-06, + "loss": 0.8132, + "step": 363 + }, + { + "epoch": 0.4367126574685063, + "grad_norm": 0.7244971715773292, + "learning_rate": 9.937839313236675e-06, + "loss": 0.7698, + "step": 364 + }, + { + "epoch": 0.4379124175164967, + "grad_norm": 0.8575978639150249, + "learning_rate": 9.936736592186357e-06, + "loss": 0.8339, + "step": 365 + }, + { + "epoch": 0.4391121775644871, + "grad_norm": 0.8461733576826898, + "learning_rate": 9.935624238154696e-06, + "loss": 0.7724, + "step": 366 + }, + { + "epoch": 0.4403119376124775, + "grad_norm": 0.7924350476401562, + "learning_rate": 9.934502253312211e-06, + "loss": 0.6884, + "step": 367 + }, + { + "epoch": 0.4415116976604679, + "grad_norm": 0.7096586005883623, + "learning_rate": 9.93337063984821e-06, + "loss": 0.7898, + "step": 368 + }, + { + "epoch": 0.4427114577084583, + "grad_norm": 0.8066448342385316, + "learning_rate": 9.932229399970801e-06, + "loss": 0.7898, + "step": 369 + }, + { + "epoch": 0.4439112177564487, + "grad_norm": 0.8256485114482758, + "learning_rate": 9.931078535906863e-06, + "loss": 0.839, + "step": 370 + }, + { + "epoch": 0.4451109778044391, + "grad_norm": 0.7650729169319722, + "learning_rate": 9.929918049902062e-06, + "loss": 0.7912, + "step": 371 + }, + { + "epoch": 0.44631073785242953, + "grad_norm": 0.7731359524210224, + "learning_rate": 9.92874794422084e-06, + "loss": 0.7476, + "step": 372 + }, + { + "epoch": 0.4475104979004199, + "grad_norm": 0.8188756125557376, + "learning_rate": 9.927568221146401e-06, + "loss": 0.8029, + "step": 373 + }, + { + "epoch": 0.4487102579484103, + "grad_norm": 0.8780583103974543, + "learning_rate": 9.926378882980728e-06, + "loss": 0.8268, + "step": 374 + }, + { + "epoch": 0.44991001799640074, + "grad_norm": 0.8976930887917707, + "learning_rate": 9.925179932044552e-06, + "loss": 0.7914, + "step": 375 + }, + { + "epoch": 0.4511097780443911, + "grad_norm": 0.790418734596251, + "learning_rate": 9.923971370677375e-06, + "loss": 0.7754, + "step": 376 + }, + { + "epoch": 0.4523095380923815, + "grad_norm": 0.7540539012125508, + "learning_rate": 9.922753201237441e-06, + "loss": 0.8271, + "step": 377 + }, + { + "epoch": 0.45350929814037194, + "grad_norm": 0.8114545834788608, + "learning_rate": 9.921525426101745e-06, + "loss": 0.7598, + "step": 378 + }, + { + "epoch": 0.4547090581883623, + "grad_norm": 0.8148141210694403, + "learning_rate": 9.920288047666031e-06, + "loss": 0.7773, + "step": 379 + }, + { + "epoch": 0.4559088182363527, + "grad_norm": 0.7760783698206488, + "learning_rate": 9.919041068344774e-06, + "loss": 0.7527, + "step": 380 + }, + { + "epoch": 0.45710857828434315, + "grad_norm": 0.7331163486277703, + "learning_rate": 9.917784490571188e-06, + "loss": 0.7976, + "step": 381 + }, + { + "epoch": 0.45830833833233353, + "grad_norm": 0.8178911597875124, + "learning_rate": 9.916518316797212e-06, + "loss": 0.7989, + "step": 382 + }, + { + "epoch": 0.4595080983803239, + "grad_norm": 0.7708390841568354, + "learning_rate": 9.915242549493513e-06, + "loss": 0.7658, + "step": 383 + }, + { + "epoch": 0.46070785842831435, + "grad_norm": 0.7346178145033802, + "learning_rate": 9.91395719114948e-06, + "loss": 0.7896, + "step": 384 + }, + { + "epoch": 0.46190761847630474, + "grad_norm": 0.7831346158780439, + "learning_rate": 9.912662244273213e-06, + "loss": 0.7912, + "step": 385 + }, + { + "epoch": 0.4631073785242951, + "grad_norm": 0.7685210278278765, + "learning_rate": 9.911357711391519e-06, + "loss": 0.7363, + "step": 386 + }, + { + "epoch": 0.46430713857228556, + "grad_norm": 0.8417914054386219, + "learning_rate": 9.910043595049918e-06, + "loss": 0.8178, + "step": 387 + }, + { + "epoch": 0.46550689862027594, + "grad_norm": 0.7672805808964465, + "learning_rate": 9.908719897812622e-06, + "loss": 0.7157, + "step": 388 + }, + { + "epoch": 0.4667066586682663, + "grad_norm": 0.8649515383741985, + "learning_rate": 9.907386622262547e-06, + "loss": 0.8493, + "step": 389 + }, + { + "epoch": 0.46790641871625677, + "grad_norm": 0.7738870524107353, + "learning_rate": 9.906043771001289e-06, + "loss": 0.8194, + "step": 390 + }, + { + "epoch": 0.46910617876424715, + "grad_norm": 0.8084944877365888, + "learning_rate": 9.904691346649137e-06, + "loss": 0.7827, + "step": 391 + }, + { + "epoch": 0.47030593881223753, + "grad_norm": 0.8645182910322615, + "learning_rate": 9.903329351845055e-06, + "loss": 0.7888, + "step": 392 + }, + { + "epoch": 0.471505698860228, + "grad_norm": 0.7753940783542463, + "learning_rate": 9.901957789246683e-06, + "loss": 0.708, + "step": 393 + }, + { + "epoch": 0.47270545890821836, + "grad_norm": 0.7535155544641968, + "learning_rate": 9.900576661530334e-06, + "loss": 0.758, + "step": 394 + }, + { + "epoch": 0.47390521895620874, + "grad_norm": 0.7990248815331882, + "learning_rate": 9.89918597139098e-06, + "loss": 0.7712, + "step": 395 + }, + { + "epoch": 0.4751049790041992, + "grad_norm": 0.7872305119483683, + "learning_rate": 9.89778572154225e-06, + "loss": 0.8311, + "step": 396 + }, + { + "epoch": 0.47630473905218956, + "grad_norm": 0.790424891523676, + "learning_rate": 9.89637591471644e-06, + "loss": 0.7826, + "step": 397 + }, + { + "epoch": 0.47750449910017995, + "grad_norm": 0.9628608337412052, + "learning_rate": 9.894956553664479e-06, + "loss": 0.8336, + "step": 398 + }, + { + "epoch": 0.4787042591481704, + "grad_norm": 0.9252288172565076, + "learning_rate": 9.893527641155945e-06, + "loss": 0.7676, + "step": 399 + }, + { + "epoch": 0.47990401919616077, + "grad_norm": 0.8662616102690699, + "learning_rate": 9.892089179979056e-06, + "loss": 0.7821, + "step": 400 + }, + { + "epoch": 0.48110377924415115, + "grad_norm": 0.7515686991319386, + "learning_rate": 9.89064117294066e-06, + "loss": 0.7426, + "step": 401 + }, + { + "epoch": 0.4823035392921416, + "grad_norm": 1.0924459166312195, + "learning_rate": 9.889183622866232e-06, + "loss": 0.8409, + "step": 402 + }, + { + "epoch": 0.483503299340132, + "grad_norm": 0.9035652732859879, + "learning_rate": 9.887716532599868e-06, + "loss": 0.7565, + "step": 403 + }, + { + "epoch": 0.48470305938812236, + "grad_norm": 0.8786197862572435, + "learning_rate": 9.886239905004277e-06, + "loss": 0.832, + "step": 404 + }, + { + "epoch": 0.4859028194361128, + "grad_norm": 0.7910128169701829, + "learning_rate": 9.884753742960784e-06, + "loss": 0.7388, + "step": 405 + }, + { + "epoch": 0.4871025794841032, + "grad_norm": 0.9082566293088858, + "learning_rate": 9.883258049369313e-06, + "loss": 0.7749, + "step": 406 + }, + { + "epoch": 0.48830233953209357, + "grad_norm": 5.393295746353944, + "learning_rate": 9.881752827148391e-06, + "loss": 0.6713, + "step": 407 + }, + { + "epoch": 0.489502099580084, + "grad_norm": 0.8808053301797711, + "learning_rate": 9.880238079235134e-06, + "loss": 0.7838, + "step": 408 + }, + { + "epoch": 0.4907018596280744, + "grad_norm": 0.8549263656066898, + "learning_rate": 9.878713808585247e-06, + "loss": 0.7639, + "step": 409 + }, + { + "epoch": 0.49190161967606477, + "grad_norm": 0.7105156902384112, + "learning_rate": 9.877180018173019e-06, + "loss": 0.7337, + "step": 410 + }, + { + "epoch": 0.4931013797240552, + "grad_norm": 0.7739159427592175, + "learning_rate": 9.87563671099131e-06, + "loss": 0.7576, + "step": 411 + }, + { + "epoch": 0.4943011397720456, + "grad_norm": 0.8040600363561748, + "learning_rate": 9.874083890051553e-06, + "loss": 0.7507, + "step": 412 + }, + { + "epoch": 0.495500899820036, + "grad_norm": 0.9575415088088016, + "learning_rate": 9.872521558383748e-06, + "loss": 0.758, + "step": 413 + }, + { + "epoch": 0.4967006598680264, + "grad_norm": 0.8925178432779028, + "learning_rate": 9.870949719036446e-06, + "loss": 0.7158, + "step": 414 + }, + { + "epoch": 0.4979004199160168, + "grad_norm": 0.8368122703058055, + "learning_rate": 9.869368375076755e-06, + "loss": 0.7411, + "step": 415 + }, + { + "epoch": 0.4991001799640072, + "grad_norm": 0.8523083439993232, + "learning_rate": 9.86777752959033e-06, + "loss": 0.75, + "step": 416 + }, + { + "epoch": 0.5002999400119976, + "grad_norm": 0.7765675505221259, + "learning_rate": 9.866177185681361e-06, + "loss": 0.7819, + "step": 417 + }, + { + "epoch": 0.501499700059988, + "grad_norm": 0.8129680747900232, + "learning_rate": 9.864567346472578e-06, + "loss": 0.7734, + "step": 418 + }, + { + "epoch": 0.5026994601079784, + "grad_norm": 0.7888227168815115, + "learning_rate": 9.862948015105233e-06, + "loss": 0.7784, + "step": 419 + }, + { + "epoch": 0.5038992201559688, + "grad_norm": 0.7485734740834548, + "learning_rate": 9.861319194739109e-06, + "loss": 0.7798, + "step": 420 + }, + { + "epoch": 0.5050989802039592, + "grad_norm": 0.783877019342748, + "learning_rate": 9.859680888552497e-06, + "loss": 0.7235, + "step": 421 + }, + { + "epoch": 0.5062987402519497, + "grad_norm": 0.933538619958105, + "learning_rate": 9.858033099742197e-06, + "loss": 0.7915, + "step": 422 + }, + { + "epoch": 0.50749850029994, + "grad_norm": 0.6991884169094089, + "learning_rate": 9.856375831523517e-06, + "loss": 0.7194, + "step": 423 + }, + { + "epoch": 0.5086982603479304, + "grad_norm": 0.8599435760285431, + "learning_rate": 9.854709087130261e-06, + "loss": 0.7195, + "step": 424 + }, + { + "epoch": 0.5098980203959208, + "grad_norm": 0.7761136844143638, + "learning_rate": 9.853032869814721e-06, + "loss": 0.7657, + "step": 425 + }, + { + "epoch": 0.5110977804439112, + "grad_norm": 1.193889743974957, + "learning_rate": 9.851347182847678e-06, + "loss": 0.8358, + "step": 426 + }, + { + "epoch": 0.5122975404919016, + "grad_norm": 0.7678561049828674, + "learning_rate": 9.849652029518384e-06, + "loss": 0.7357, + "step": 427 + }, + { + "epoch": 0.5134973005398921, + "grad_norm": 0.7865687358146413, + "learning_rate": 9.847947413134568e-06, + "loss": 0.7414, + "step": 428 + }, + { + "epoch": 0.5146970605878824, + "grad_norm": 0.7346520114156417, + "learning_rate": 9.846233337022427e-06, + "loss": 0.7832, + "step": 429 + }, + { + "epoch": 0.5158968206358728, + "grad_norm": 0.9350176986428819, + "learning_rate": 9.844509804526606e-06, + "loss": 0.7505, + "step": 430 + }, + { + "epoch": 0.5170965806838632, + "grad_norm": 0.8296219956459046, + "learning_rate": 9.842776819010213e-06, + "loss": 0.7832, + "step": 431 + }, + { + "epoch": 0.5182963407318536, + "grad_norm": 0.8476492086914537, + "learning_rate": 9.841034383854795e-06, + "loss": 0.7001, + "step": 432 + }, + { + "epoch": 0.519496100779844, + "grad_norm": 0.7240816239335365, + "learning_rate": 9.83928250246034e-06, + "loss": 0.6933, + "step": 433 + }, + { + "epoch": 0.5206958608278345, + "grad_norm": 0.7405341057000968, + "learning_rate": 9.837521178245273e-06, + "loss": 0.7561, + "step": 434 + }, + { + "epoch": 0.5218956208758249, + "grad_norm": 0.7692071981948461, + "learning_rate": 9.835750414646433e-06, + "loss": 0.693, + "step": 435 + }, + { + "epoch": 0.5230953809238152, + "grad_norm": 0.8287045028686297, + "learning_rate": 9.833970215119088e-06, + "loss": 0.7489, + "step": 436 + }, + { + "epoch": 0.5242951409718056, + "grad_norm": 0.795961787816489, + "learning_rate": 9.832180583136916e-06, + "loss": 0.7446, + "step": 437 + }, + { + "epoch": 0.525494901019796, + "grad_norm": 0.7581333038407717, + "learning_rate": 9.830381522191997e-06, + "loss": 0.7231, + "step": 438 + }, + { + "epoch": 0.5266946610677864, + "grad_norm": 0.9303998090134032, + "learning_rate": 9.828573035794815e-06, + "loss": 0.8329, + "step": 439 + }, + { + "epoch": 0.5278944211157769, + "grad_norm": 0.6566928859534891, + "learning_rate": 9.826755127474241e-06, + "loss": 0.6941, + "step": 440 + }, + { + "epoch": 0.5290941811637673, + "grad_norm": 0.7135091281559202, + "learning_rate": 9.824927800777535e-06, + "loss": 0.7822, + "step": 441 + }, + { + "epoch": 0.5302939412117577, + "grad_norm": 0.7534927462043957, + "learning_rate": 9.823091059270329e-06, + "loss": 0.8006, + "step": 442 + }, + { + "epoch": 0.531493701259748, + "grad_norm": 0.7406618710423156, + "learning_rate": 9.821244906536633e-06, + "loss": 0.7802, + "step": 443 + }, + { + "epoch": 0.5326934613077384, + "grad_norm": 0.7491038976546027, + "learning_rate": 9.819389346178814e-06, + "loss": 0.7735, + "step": 444 + }, + { + "epoch": 0.5338932213557288, + "grad_norm": 0.7822185677172192, + "learning_rate": 9.817524381817603e-06, + "loss": 0.7542, + "step": 445 + }, + { + "epoch": 0.5350929814037193, + "grad_norm": 0.7128977961072269, + "learning_rate": 9.815650017092078e-06, + "loss": 0.7778, + "step": 446 + }, + { + "epoch": 0.5362927414517097, + "grad_norm": 0.6904057914443502, + "learning_rate": 9.813766255659654e-06, + "loss": 0.734, + "step": 447 + }, + { + "epoch": 0.5374925014997001, + "grad_norm": 0.8427999544381056, + "learning_rate": 9.811873101196093e-06, + "loss": 0.7819, + "step": 448 + }, + { + "epoch": 0.5386922615476905, + "grad_norm": 0.7866934841281968, + "learning_rate": 9.809970557395476e-06, + "loss": 0.7153, + "step": 449 + }, + { + "epoch": 0.5398920215956808, + "grad_norm": 0.8032501027155347, + "learning_rate": 9.80805862797021e-06, + "loss": 0.7641, + "step": 450 + }, + { + "epoch": 0.5410917816436712, + "grad_norm": 0.7686919182685255, + "learning_rate": 9.806137316651013e-06, + "loss": 0.7456, + "step": 451 + }, + { + "epoch": 0.5422915416916617, + "grad_norm": 0.8177582066471052, + "learning_rate": 9.804206627186911e-06, + "loss": 0.7931, + "step": 452 + }, + { + "epoch": 0.5434913017396521, + "grad_norm": 0.9006022094358479, + "learning_rate": 9.802266563345235e-06, + "loss": 0.8071, + "step": 453 + }, + { + "epoch": 0.5446910617876425, + "grad_norm": 0.7518170142436402, + "learning_rate": 9.8003171289116e-06, + "loss": 0.8075, + "step": 454 + }, + { + "epoch": 0.5458908218356329, + "grad_norm": 0.7514481594832335, + "learning_rate": 9.798358327689906e-06, + "loss": 0.6912, + "step": 455 + }, + { + "epoch": 0.5470905818836233, + "grad_norm": 0.7915234537215006, + "learning_rate": 9.796390163502334e-06, + "loss": 0.7001, + "step": 456 + }, + { + "epoch": 0.5482903419316136, + "grad_norm": 0.8439870437962147, + "learning_rate": 9.794412640189336e-06, + "loss": 0.7564, + "step": 457 + }, + { + "epoch": 0.5494901019796041, + "grad_norm": 0.7780134611809413, + "learning_rate": 9.792425761609623e-06, + "loss": 0.7974, + "step": 458 + }, + { + "epoch": 0.5506898620275945, + "grad_norm": 0.9166536013833003, + "learning_rate": 9.790429531640163e-06, + "loss": 0.8225, + "step": 459 + }, + { + "epoch": 0.5518896220755849, + "grad_norm": 0.9079256575884883, + "learning_rate": 9.788423954176166e-06, + "loss": 0.7794, + "step": 460 + }, + { + "epoch": 0.5530893821235753, + "grad_norm": 0.7992584892279518, + "learning_rate": 9.786409033131092e-06, + "loss": 0.7594, + "step": 461 + }, + { + "epoch": 0.5542891421715657, + "grad_norm": 0.7316252955597614, + "learning_rate": 9.784384772436624e-06, + "loss": 0.7028, + "step": 462 + }, + { + "epoch": 0.555488902219556, + "grad_norm": 0.8776441197086907, + "learning_rate": 9.782351176042668e-06, + "loss": 0.7024, + "step": 463 + }, + { + "epoch": 0.5566886622675465, + "grad_norm": 0.8485818018512938, + "learning_rate": 9.780308247917355e-06, + "loss": 0.816, + "step": 464 + }, + { + "epoch": 0.5578884223155369, + "grad_norm": 0.7292056958121589, + "learning_rate": 9.778255992047024e-06, + "loss": 0.7556, + "step": 465 + }, + { + "epoch": 0.5590881823635273, + "grad_norm": 0.886905755147264, + "learning_rate": 9.776194412436202e-06, + "loss": 0.7597, + "step": 466 + }, + { + "epoch": 0.5602879424115177, + "grad_norm": 0.8700121060467031, + "learning_rate": 9.774123513107628e-06, + "loss": 0.7258, + "step": 467 + }, + { + "epoch": 0.5614877024595081, + "grad_norm": 0.8348518898244337, + "learning_rate": 9.77204329810221e-06, + "loss": 0.6959, + "step": 468 + }, + { + "epoch": 0.5626874625074985, + "grad_norm": 0.8063492786549548, + "learning_rate": 9.769953771479048e-06, + "loss": 0.7216, + "step": 469 + }, + { + "epoch": 0.563887222555489, + "grad_norm": 15.790661371937125, + "learning_rate": 9.767854937315398e-06, + "loss": 0.9547, + "step": 470 + }, + { + "epoch": 0.5650869826034793, + "grad_norm": 0.7455923646791719, + "learning_rate": 9.765746799706688e-06, + "loss": 0.7034, + "step": 471 + }, + { + "epoch": 0.5662867426514697, + "grad_norm": 0.7161568027914286, + "learning_rate": 9.763629362766495e-06, + "loss": 0.7697, + "step": 472 + }, + { + "epoch": 0.5674865026994601, + "grad_norm": 0.7960011292767376, + "learning_rate": 9.761502630626544e-06, + "loss": 0.74, + "step": 473 + }, + { + "epoch": 0.5686862627474505, + "grad_norm": 0.8386461066974339, + "learning_rate": 9.759366607436694e-06, + "loss": 0.7591, + "step": 474 + }, + { + "epoch": 0.5698860227954409, + "grad_norm": 0.8219184010947729, + "learning_rate": 9.757221297364936e-06, + "loss": 0.7985, + "step": 475 + }, + { + "epoch": 0.5710857828434314, + "grad_norm": 0.8526023917338333, + "learning_rate": 9.75506670459738e-06, + "loss": 0.7363, + "step": 476 + }, + { + "epoch": 0.5722855428914218, + "grad_norm": 0.7158601779007123, + "learning_rate": 9.752902833338256e-06, + "loss": 0.784, + "step": 477 + }, + { + "epoch": 0.5734853029394121, + "grad_norm": 0.9009368283263316, + "learning_rate": 9.75072968780989e-06, + "loss": 0.76, + "step": 478 + }, + { + "epoch": 0.5746850629874025, + "grad_norm": 0.844246397660529, + "learning_rate": 9.74854727225271e-06, + "loss": 0.7456, + "step": 479 + }, + { + "epoch": 0.5758848230353929, + "grad_norm": 0.9418669055461818, + "learning_rate": 9.746355590925231e-06, + "loss": 0.7488, + "step": 480 + }, + { + "epoch": 0.5770845830833833, + "grad_norm": 0.8489283035012338, + "learning_rate": 9.744154648104049e-06, + "loss": 0.7222, + "step": 481 + }, + { + "epoch": 0.5782843431313738, + "grad_norm": 0.7383667346126159, + "learning_rate": 9.74194444808383e-06, + "loss": 0.7387, + "step": 482 + }, + { + "epoch": 0.5794841031793642, + "grad_norm": 0.838663810803923, + "learning_rate": 9.739724995177309e-06, + "loss": 0.7478, + "step": 483 + }, + { + "epoch": 0.5806838632273545, + "grad_norm": 0.7732760292403525, + "learning_rate": 9.737496293715268e-06, + "loss": 0.7577, + "step": 484 + }, + { + "epoch": 0.5818836232753449, + "grad_norm": 0.8358539665976709, + "learning_rate": 9.735258348046538e-06, + "loss": 0.7199, + "step": 485 + }, + { + "epoch": 0.5830833833233353, + "grad_norm": 0.931301011982876, + "learning_rate": 9.73301116253799e-06, + "loss": 0.751, + "step": 486 + }, + { + "epoch": 0.5842831433713257, + "grad_norm": 0.8287516764022196, + "learning_rate": 9.730754741574529e-06, + "loss": 0.7583, + "step": 487 + }, + { + "epoch": 0.5854829034193162, + "grad_norm": 0.824571405077036, + "learning_rate": 9.72848908955907e-06, + "loss": 0.7018, + "step": 488 + }, + { + "epoch": 0.5866826634673066, + "grad_norm": 0.8092772472268522, + "learning_rate": 9.726214210912549e-06, + "loss": 0.7899, + "step": 489 + }, + { + "epoch": 0.587882423515297, + "grad_norm": 0.7579661335080632, + "learning_rate": 9.723930110073902e-06, + "loss": 0.6777, + "step": 490 + }, + { + "epoch": 0.5890821835632873, + "grad_norm": 0.9801294633681042, + "learning_rate": 9.721636791500065e-06, + "loss": 0.8294, + "step": 491 + }, + { + "epoch": 0.5902819436112777, + "grad_norm": 0.8218005486888227, + "learning_rate": 9.719334259665951e-06, + "loss": 0.7115, + "step": 492 + }, + { + "epoch": 0.5914817036592681, + "grad_norm": 0.8044698534033967, + "learning_rate": 9.71702251906446e-06, + "loss": 0.7619, + "step": 493 + }, + { + "epoch": 0.5926814637072585, + "grad_norm": 0.7541226470722168, + "learning_rate": 9.714701574206457e-06, + "loss": 0.7595, + "step": 494 + }, + { + "epoch": 0.593881223755249, + "grad_norm": 0.8013622981520949, + "learning_rate": 9.712371429620767e-06, + "loss": 0.7405, + "step": 495 + }, + { + "epoch": 0.5950809838032394, + "grad_norm": 0.6836569320702858, + "learning_rate": 9.710032089854166e-06, + "loss": 0.6942, + "step": 496 + }, + { + "epoch": 0.5962807438512298, + "grad_norm": 0.8065057972585213, + "learning_rate": 9.707683559471374e-06, + "loss": 0.8385, + "step": 497 + }, + { + "epoch": 0.5974805038992201, + "grad_norm": 0.7580471644835146, + "learning_rate": 9.705325843055045e-06, + "loss": 0.7433, + "step": 498 + }, + { + "epoch": 0.5986802639472105, + "grad_norm": 0.8606247011714901, + "learning_rate": 9.702958945205753e-06, + "loss": 0.7035, + "step": 499 + }, + { + "epoch": 0.5998800239952009, + "grad_norm": 1.5950934849613543, + "learning_rate": 9.700582870541995e-06, + "loss": 0.7909, + "step": 500 + }, + { + "epoch": 0.6010797840431914, + "grad_norm": 0.7731565625014056, + "learning_rate": 9.698197623700168e-06, + "loss": 0.7965, + "step": 501 + }, + { + "epoch": 0.6022795440911818, + "grad_norm": 0.8349098467362701, + "learning_rate": 9.69580320933457e-06, + "loss": 0.7261, + "step": 502 + }, + { + "epoch": 0.6034793041391722, + "grad_norm": 0.754057594438619, + "learning_rate": 9.693399632117385e-06, + "loss": 0.6971, + "step": 503 + }, + { + "epoch": 0.6046790641871626, + "grad_norm": 0.8023211344060523, + "learning_rate": 9.69098689673868e-06, + "loss": 0.7167, + "step": 504 + }, + { + "epoch": 0.6058788242351529, + "grad_norm": 1.0058172149504045, + "learning_rate": 9.688565007906387e-06, + "loss": 0.7819, + "step": 505 + }, + { + "epoch": 0.6070785842831433, + "grad_norm": 0.7242215853525499, + "learning_rate": 9.686133970346305e-06, + "loss": 0.6986, + "step": 506 + }, + { + "epoch": 0.6082783443311338, + "grad_norm": 0.7901538256806555, + "learning_rate": 9.68369378880208e-06, + "loss": 0.7646, + "step": 507 + }, + { + "epoch": 0.6094781043791242, + "grad_norm": 0.7503104666431076, + "learning_rate": 9.6812444680352e-06, + "loss": 0.7561, + "step": 508 + }, + { + "epoch": 0.6106778644271146, + "grad_norm": 0.7788288768425731, + "learning_rate": 9.678786012824995e-06, + "loss": 0.7616, + "step": 509 + }, + { + "epoch": 0.611877624475105, + "grad_norm": 0.7603265669161717, + "learning_rate": 9.676318427968608e-06, + "loss": 0.7311, + "step": 510 + }, + { + "epoch": 0.6130773845230953, + "grad_norm": 0.7078386385583856, + "learning_rate": 9.673841718281e-06, + "loss": 0.6879, + "step": 511 + }, + { + "epoch": 0.6142771445710857, + "grad_norm": 0.8055381057080542, + "learning_rate": 9.67135588859494e-06, + "loss": 0.7147, + "step": 512 + }, + { + "epoch": 0.6154769046190762, + "grad_norm": 0.7631878298047712, + "learning_rate": 9.668860943760992e-06, + "loss": 0.6773, + "step": 513 + }, + { + "epoch": 0.6166766646670666, + "grad_norm": 0.6465663248068814, + "learning_rate": 9.6663568886475e-06, + "loss": 0.7829, + "step": 514 + }, + { + "epoch": 0.617876424715057, + "grad_norm": 0.8571734522586008, + "learning_rate": 9.663843728140597e-06, + "loss": 0.7633, + "step": 515 + }, + { + "epoch": 0.6190761847630474, + "grad_norm": 0.8017497846966288, + "learning_rate": 9.661321467144172e-06, + "loss": 0.7087, + "step": 516 + }, + { + "epoch": 0.6202759448110378, + "grad_norm": 0.7759613883579511, + "learning_rate": 9.658790110579876e-06, + "loss": 0.7867, + "step": 517 + }, + { + "epoch": 0.6214757048590281, + "grad_norm": 0.8554914259201436, + "learning_rate": 9.656249663387107e-06, + "loss": 0.6902, + "step": 518 + }, + { + "epoch": 0.6226754649070186, + "grad_norm": 0.8565972170846713, + "learning_rate": 9.653700130523004e-06, + "loss": 0.816, + "step": 519 + }, + { + "epoch": 0.623875224955009, + "grad_norm": 0.8071076828122373, + "learning_rate": 9.651141516962431e-06, + "loss": 0.7227, + "step": 520 + }, + { + "epoch": 0.6250749850029994, + "grad_norm": 0.7660995482695625, + "learning_rate": 9.648573827697974e-06, + "loss": 0.7861, + "step": 521 + }, + { + "epoch": 0.6262747450509898, + "grad_norm": 1.0283097998930535, + "learning_rate": 9.645997067739929e-06, + "loss": 0.7303, + "step": 522 + }, + { + "epoch": 0.6274745050989802, + "grad_norm": 0.7745683261007134, + "learning_rate": 9.643411242116286e-06, + "loss": 0.6725, + "step": 523 + }, + { + "epoch": 0.6286742651469706, + "grad_norm": 0.8609117214762857, + "learning_rate": 9.64081635587273e-06, + "loss": 0.796, + "step": 524 + }, + { + "epoch": 0.629874025194961, + "grad_norm": 0.7864348864977745, + "learning_rate": 9.638212414072625e-06, + "loss": 0.7589, + "step": 525 + }, + { + "epoch": 0.6310737852429514, + "grad_norm": 0.7066437056556076, + "learning_rate": 9.635599421797004e-06, + "loss": 0.7414, + "step": 526 + }, + { + "epoch": 0.6322735452909418, + "grad_norm": 0.7634711558250465, + "learning_rate": 9.63297738414456e-06, + "loss": 0.7523, + "step": 527 + }, + { + "epoch": 0.6334733053389322, + "grad_norm": 0.7460018288720783, + "learning_rate": 9.630346306231637e-06, + "loss": 0.6864, + "step": 528 + }, + { + "epoch": 0.6346730653869226, + "grad_norm": 0.7895072856945539, + "learning_rate": 9.627706193192219e-06, + "loss": 0.7236, + "step": 529 + }, + { + "epoch": 0.635872825434913, + "grad_norm": 0.8164390104749236, + "learning_rate": 9.625057050177918e-06, + "loss": 0.6658, + "step": 530 + }, + { + "epoch": 0.6370725854829035, + "grad_norm": 0.65954773123744, + "learning_rate": 9.622398882357968e-06, + "loss": 0.7595, + "step": 531 + }, + { + "epoch": 0.6382723455308938, + "grad_norm": 0.9010898382276785, + "learning_rate": 9.619731694919212e-06, + "loss": 0.7702, + "step": 532 + }, + { + "epoch": 0.6394721055788842, + "grad_norm": 0.8386415071967853, + "learning_rate": 9.617055493066097e-06, + "loss": 0.6642, + "step": 533 + }, + { + "epoch": 0.6406718656268746, + "grad_norm": 0.8006666484897155, + "learning_rate": 9.614370282020652e-06, + "loss": 0.7895, + "step": 534 + }, + { + "epoch": 0.641871625674865, + "grad_norm": 0.6746633069935659, + "learning_rate": 9.611676067022492e-06, + "loss": 0.7577, + "step": 535 + }, + { + "epoch": 0.6430713857228554, + "grad_norm": 0.8148039263840821, + "learning_rate": 9.6089728533288e-06, + "loss": 0.7268, + "step": 536 + }, + { + "epoch": 0.6442711457708459, + "grad_norm": 0.8016142578877647, + "learning_rate": 9.606260646214314e-06, + "loss": 0.6941, + "step": 537 + }, + { + "epoch": 0.6454709058188363, + "grad_norm": 0.718756601136374, + "learning_rate": 9.603539450971326e-06, + "loss": 0.7473, + "step": 538 + }, + { + "epoch": 0.6466706658668266, + "grad_norm": 0.6757552684273367, + "learning_rate": 9.600809272909663e-06, + "loss": 0.7058, + "step": 539 + }, + { + "epoch": 0.647870425914817, + "grad_norm": 0.8704025257703171, + "learning_rate": 9.598070117356684e-06, + "loss": 0.8438, + "step": 540 + }, + { + "epoch": 0.6490701859628074, + "grad_norm": 0.667088769868094, + "learning_rate": 9.59532198965726e-06, + "loss": 0.7659, + "step": 541 + }, + { + "epoch": 0.6502699460107978, + "grad_norm": 0.9260561307588099, + "learning_rate": 9.59256489517377e-06, + "loss": 0.8057, + "step": 542 + }, + { + "epoch": 0.6514697060587883, + "grad_norm": 0.8775666263474623, + "learning_rate": 9.589798839286098e-06, + "loss": 0.7711, + "step": 543 + }, + { + "epoch": 0.6526694661067787, + "grad_norm": 0.6877550211780249, + "learning_rate": 9.587023827391603e-06, + "loss": 0.7835, + "step": 544 + }, + { + "epoch": 0.6538692261547691, + "grad_norm": 0.7349863305923766, + "learning_rate": 9.584239864905128e-06, + "loss": 0.7296, + "step": 545 + }, + { + "epoch": 0.6550689862027594, + "grad_norm": 0.8218995233024665, + "learning_rate": 9.581446957258975e-06, + "loss": 0.7829, + "step": 546 + }, + { + "epoch": 0.6562687462507498, + "grad_norm": 0.8134993669043972, + "learning_rate": 9.578645109902904e-06, + "loss": 0.765, + "step": 547 + }, + { + "epoch": 0.6574685062987402, + "grad_norm": 0.8557360032972663, + "learning_rate": 9.575834328304121e-06, + "loss": 0.7334, + "step": 548 + }, + { + "epoch": 0.6586682663467307, + "grad_norm": 0.7914397185522426, + "learning_rate": 9.57301461794726e-06, + "loss": 0.7127, + "step": 549 + }, + { + "epoch": 0.6598680263947211, + "grad_norm": 0.7734053557921267, + "learning_rate": 9.570185984334383e-06, + "loss": 0.7695, + "step": 550 + }, + { + "epoch": 0.6610677864427115, + "grad_norm": 0.880648269242868, + "learning_rate": 9.567348432984957e-06, + "loss": 0.7167, + "step": 551 + }, + { + "epoch": 0.6622675464907019, + "grad_norm": 0.7940200917011428, + "learning_rate": 9.56450196943586e-06, + "loss": 0.7364, + "step": 552 + }, + { + "epoch": 0.6634673065386922, + "grad_norm": 0.6781100978857986, + "learning_rate": 9.561646599241349e-06, + "loss": 0.6847, + "step": 553 + }, + { + "epoch": 0.6646670665866826, + "grad_norm": 0.7121065187027151, + "learning_rate": 9.558782327973068e-06, + "loss": 0.759, + "step": 554 + }, + { + "epoch": 0.6658668266346731, + "grad_norm": 0.9493717892282374, + "learning_rate": 9.555909161220025e-06, + "loss": 0.8074, + "step": 555 + }, + { + "epoch": 0.6670665866826635, + "grad_norm": 0.8350439792211257, + "learning_rate": 9.553027104588593e-06, + "loss": 0.7336, + "step": 556 + }, + { + "epoch": 0.6682663467306539, + "grad_norm": 0.7868865491202452, + "learning_rate": 9.550136163702482e-06, + "loss": 0.7655, + "step": 557 + }, + { + "epoch": 0.6694661067786443, + "grad_norm": 0.7655933329400806, + "learning_rate": 9.547236344202744e-06, + "loss": 0.7437, + "step": 558 + }, + { + "epoch": 0.6706658668266346, + "grad_norm": 0.9690309207482182, + "learning_rate": 9.544327651747753e-06, + "loss": 0.8109, + "step": 559 + }, + { + "epoch": 0.671865626874625, + "grad_norm": 0.7032506626870428, + "learning_rate": 9.541410092013201e-06, + "loss": 0.6855, + "step": 560 + }, + { + "epoch": 0.6730653869226155, + "grad_norm": 0.7619673285229566, + "learning_rate": 9.538483670692076e-06, + "loss": 0.6919, + "step": 561 + }, + { + "epoch": 0.6742651469706059, + "grad_norm": 0.7359622527888945, + "learning_rate": 9.535548393494661e-06, + "loss": 0.7804, + "step": 562 + }, + { + "epoch": 0.6754649070185963, + "grad_norm": 0.7216172279516673, + "learning_rate": 9.532604266148521e-06, + "loss": 0.7119, + "step": 563 + }, + { + "epoch": 0.6766646670665867, + "grad_norm": 0.7947602490045758, + "learning_rate": 9.529651294398485e-06, + "loss": 0.7116, + "step": 564 + }, + { + "epoch": 0.6778644271145771, + "grad_norm": 0.7938540244579113, + "learning_rate": 9.526689484006649e-06, + "loss": 0.7139, + "step": 565 + }, + { + "epoch": 0.6790641871625674, + "grad_norm": 0.7557012771757716, + "learning_rate": 9.523718840752343e-06, + "loss": 0.7737, + "step": 566 + }, + { + "epoch": 0.6802639472105579, + "grad_norm": 0.7130200527476641, + "learning_rate": 9.520739370432143e-06, + "loss": 0.7284, + "step": 567 + }, + { + "epoch": 0.6814637072585483, + "grad_norm": 0.8339374298828425, + "learning_rate": 9.517751078859848e-06, + "loss": 0.759, + "step": 568 + }, + { + "epoch": 0.6826634673065387, + "grad_norm": 0.7492909405177227, + "learning_rate": 9.514753971866463e-06, + "loss": 0.7311, + "step": 569 + }, + { + "epoch": 0.6838632273545291, + "grad_norm": 0.9066211146549824, + "learning_rate": 9.511748055300201e-06, + "loss": 0.8017, + "step": 570 + }, + { + "epoch": 0.6850629874025195, + "grad_norm": 0.720161620025462, + "learning_rate": 9.508733335026462e-06, + "loss": 0.773, + "step": 571 + }, + { + "epoch": 0.6862627474505099, + "grad_norm": 0.6967812860809943, + "learning_rate": 9.505709816927823e-06, + "loss": 0.7461, + "step": 572 + }, + { + "epoch": 0.6874625074985004, + "grad_norm": 0.7860373824966781, + "learning_rate": 9.502677506904035e-06, + "loss": 0.7483, + "step": 573 + }, + { + "epoch": 0.6886622675464907, + "grad_norm": 0.7275359182122862, + "learning_rate": 9.499636410871994e-06, + "loss": 0.8021, + "step": 574 + }, + { + "epoch": 0.6898620275944811, + "grad_norm": 0.7520613473149373, + "learning_rate": 9.496586534765752e-06, + "loss": 0.7328, + "step": 575 + }, + { + "epoch": 0.6910617876424715, + "grad_norm": 0.7301675898688976, + "learning_rate": 9.493527884536487e-06, + "loss": 0.6881, + "step": 576 + }, + { + "epoch": 0.6922615476904619, + "grad_norm": 0.7678401412943116, + "learning_rate": 9.490460466152491e-06, + "loss": 0.7324, + "step": 577 + }, + { + "epoch": 0.6934613077384523, + "grad_norm": 0.8612062026391113, + "learning_rate": 9.487384285599179e-06, + "loss": 0.768, + "step": 578 + }, + { + "epoch": 0.6946610677864428, + "grad_norm": 0.8147151721250593, + "learning_rate": 9.484299348879055e-06, + "loss": 0.6829, + "step": 579 + }, + { + "epoch": 0.6958608278344331, + "grad_norm": 0.8738236748718973, + "learning_rate": 9.48120566201171e-06, + "loss": 0.82, + "step": 580 + }, + { + "epoch": 0.6970605878824235, + "grad_norm": 0.8986436192452325, + "learning_rate": 9.478103231033808e-06, + "loss": 0.6793, + "step": 581 + }, + { + "epoch": 0.6982603479304139, + "grad_norm": 0.8904464145598673, + "learning_rate": 9.474992061999081e-06, + "loss": 0.7572, + "step": 582 + }, + { + "epoch": 0.6994601079784043, + "grad_norm": 0.6574883638307059, + "learning_rate": 9.471872160978303e-06, + "loss": 0.6443, + "step": 583 + }, + { + "epoch": 0.7006598680263947, + "grad_norm": 0.8985559380717492, + "learning_rate": 9.468743534059295e-06, + "loss": 0.7783, + "step": 584 + }, + { + "epoch": 0.7018596280743852, + "grad_norm": 0.7431036219557079, + "learning_rate": 9.465606187346897e-06, + "loss": 0.6254, + "step": 585 + }, + { + "epoch": 0.7030593881223756, + "grad_norm": 0.7728865866981772, + "learning_rate": 9.46246012696297e-06, + "loss": 0.7561, + "step": 586 + }, + { + "epoch": 0.704259148170366, + "grad_norm": 0.7501647430361906, + "learning_rate": 9.459305359046372e-06, + "loss": 0.6625, + "step": 587 + }, + { + "epoch": 0.7054589082183563, + "grad_norm": 0.7594681233852942, + "learning_rate": 9.456141889752959e-06, + "loss": 0.7741, + "step": 588 + }, + { + "epoch": 0.7066586682663467, + "grad_norm": 0.8174670353526049, + "learning_rate": 9.452969725255558e-06, + "loss": 0.7341, + "step": 589 + }, + { + "epoch": 0.7078584283143371, + "grad_norm": 0.7618253465793706, + "learning_rate": 9.449788871743972e-06, + "loss": 0.7444, + "step": 590 + }, + { + "epoch": 0.7090581883623275, + "grad_norm": 0.7474309464649311, + "learning_rate": 9.446599335424948e-06, + "loss": 0.7273, + "step": 591 + }, + { + "epoch": 0.710257948410318, + "grad_norm": 0.7187702974171543, + "learning_rate": 9.443401122522185e-06, + "loss": 0.7459, + "step": 592 + }, + { + "epoch": 0.7114577084583084, + "grad_norm": 0.7185805793407908, + "learning_rate": 9.440194239276308e-06, + "loss": 0.6859, + "step": 593 + }, + { + "epoch": 0.7126574685062987, + "grad_norm": 0.8506171645399014, + "learning_rate": 9.43697869194486e-06, + "loss": 0.6996, + "step": 594 + }, + { + "epoch": 0.7138572285542891, + "grad_norm": 0.9093408934214373, + "learning_rate": 9.433754486802291e-06, + "loss": 0.7084, + "step": 595 + }, + { + "epoch": 0.7150569886022795, + "grad_norm": 0.7899145578993906, + "learning_rate": 9.430521630139946e-06, + "loss": 0.6313, + "step": 596 + }, + { + "epoch": 0.7162567486502699, + "grad_norm": 0.871030688739406, + "learning_rate": 9.427280128266049e-06, + "loss": 0.7541, + "step": 597 + }, + { + "epoch": 0.7174565086982604, + "grad_norm": 0.7650457947902857, + "learning_rate": 9.424029987505699e-06, + "loss": 0.6741, + "step": 598 + }, + { + "epoch": 0.7186562687462508, + "grad_norm": 0.7472119241841967, + "learning_rate": 9.420771214200843e-06, + "loss": 0.7183, + "step": 599 + }, + { + "epoch": 0.7198560287942412, + "grad_norm": 0.8238613774473978, + "learning_rate": 9.417503814710277e-06, + "loss": 0.7379, + "step": 600 + }, + { + "epoch": 0.7210557888422315, + "grad_norm": 0.6805508800539326, + "learning_rate": 9.414227795409634e-06, + "loss": 0.6928, + "step": 601 + }, + { + "epoch": 0.7222555488902219, + "grad_norm": 0.8624131473450718, + "learning_rate": 9.410943162691359e-06, + "loss": 0.7423, + "step": 602 + }, + { + "epoch": 0.7234553089382123, + "grad_norm": 0.747745459350849, + "learning_rate": 9.407649922964708e-06, + "loss": 0.6918, + "step": 603 + }, + { + "epoch": 0.7246550689862028, + "grad_norm": 0.7044757854005275, + "learning_rate": 9.404348082655732e-06, + "loss": 0.7604, + "step": 604 + }, + { + "epoch": 0.7258548290341932, + "grad_norm": 0.7140315477847774, + "learning_rate": 9.40103764820726e-06, + "loss": 0.7346, + "step": 605 + }, + { + "epoch": 0.7270545890821836, + "grad_norm": 0.7921784179018161, + "learning_rate": 9.3977186260789e-06, + "loss": 0.6944, + "step": 606 + }, + { + "epoch": 0.728254349130174, + "grad_norm": 0.6872232338475704, + "learning_rate": 9.394391022747005e-06, + "loss": 0.7004, + "step": 607 + }, + { + "epoch": 0.7294541091781643, + "grad_norm": 0.8873585465398308, + "learning_rate": 9.39105484470468e-06, + "loss": 0.7299, + "step": 608 + }, + { + "epoch": 0.7306538692261547, + "grad_norm": 0.7736862318160932, + "learning_rate": 9.387710098461765e-06, + "loss": 0.7765, + "step": 609 + }, + { + "epoch": 0.7318536292741452, + "grad_norm": 0.6927992969964133, + "learning_rate": 9.384356790544807e-06, + "loss": 0.7472, + "step": 610 + }, + { + "epoch": 0.7330533893221356, + "grad_norm": 0.8082225325172666, + "learning_rate": 9.380994927497068e-06, + "loss": 0.7758, + "step": 611 + }, + { + "epoch": 0.734253149370126, + "grad_norm": 0.8724820015048785, + "learning_rate": 9.377624515878504e-06, + "loss": 0.7804, + "step": 612 + }, + { + "epoch": 0.7354529094181164, + "grad_norm": 0.82067275868822, + "learning_rate": 9.37424556226575e-06, + "loss": 0.7215, + "step": 613 + }, + { + "epoch": 0.7366526694661067, + "grad_norm": 0.7802391975377598, + "learning_rate": 9.370858073252104e-06, + "loss": 0.7595, + "step": 614 + }, + { + "epoch": 0.7378524295140971, + "grad_norm": 0.695511573939589, + "learning_rate": 9.367462055447528e-06, + "loss": 0.7495, + "step": 615 + }, + { + "epoch": 0.7390521895620876, + "grad_norm": 0.7210971676399367, + "learning_rate": 9.36405751547862e-06, + "loss": 0.7504, + "step": 616 + }, + { + "epoch": 0.740251949610078, + "grad_norm": 0.7127870994797852, + "learning_rate": 9.360644459988606e-06, + "loss": 0.6989, + "step": 617 + }, + { + "epoch": 0.7414517096580684, + "grad_norm": 0.8252434951513874, + "learning_rate": 9.357222895637337e-06, + "loss": 0.7435, + "step": 618 + }, + { + "epoch": 0.7426514697060588, + "grad_norm": 0.6777237241417108, + "learning_rate": 9.353792829101255e-06, + "loss": 0.7181, + "step": 619 + }, + { + "epoch": 0.7438512297540492, + "grad_norm": 0.7246319587284746, + "learning_rate": 9.3503542670734e-06, + "loss": 0.7542, + "step": 620 + }, + { + "epoch": 0.7450509898020395, + "grad_norm": 0.7233632732400332, + "learning_rate": 9.34690721626339e-06, + "loss": 0.6969, + "step": 621 + }, + { + "epoch": 0.74625074985003, + "grad_norm": 0.6981036869918213, + "learning_rate": 9.343451683397402e-06, + "loss": 0.6936, + "step": 622 + }, + { + "epoch": 0.7474505098980204, + "grad_norm": 0.6881147839478524, + "learning_rate": 9.339987675218165e-06, + "loss": 0.7398, + "step": 623 + }, + { + "epoch": 0.7486502699460108, + "grad_norm": 0.8089969231321731, + "learning_rate": 9.33651519848495e-06, + "loss": 0.7676, + "step": 624 + }, + { + "epoch": 0.7498500299940012, + "grad_norm": 0.7172080435840271, + "learning_rate": 9.333034259973551e-06, + "loss": 0.7167, + "step": 625 + }, + { + "epoch": 0.7510497900419916, + "grad_norm": 0.7256057710440875, + "learning_rate": 9.329544866476265e-06, + "loss": 0.6851, + "step": 626 + }, + { + "epoch": 0.752249550089982, + "grad_norm": 0.8188587519438086, + "learning_rate": 9.326047024801903e-06, + "loss": 0.7775, + "step": 627 + }, + { + "epoch": 0.7534493101379725, + "grad_norm": 0.762491836568235, + "learning_rate": 9.322540741775745e-06, + "loss": 0.7412, + "step": 628 + }, + { + "epoch": 0.7546490701859628, + "grad_norm": 0.6861742416181088, + "learning_rate": 9.319026024239552e-06, + "loss": 0.7172, + "step": 629 + }, + { + "epoch": 0.7558488302339532, + "grad_norm": 0.7425371085335192, + "learning_rate": 9.315502879051541e-06, + "loss": 0.744, + "step": 630 + }, + { + "epoch": 0.7570485902819436, + "grad_norm": 0.860120319975177, + "learning_rate": 9.311971313086371e-06, + "loss": 0.6923, + "step": 631 + }, + { + "epoch": 0.758248350329934, + "grad_norm": 0.7346323032138261, + "learning_rate": 9.30843133323514e-06, + "loss": 0.7782, + "step": 632 + }, + { + "epoch": 0.7594481103779244, + "grad_norm": 0.7544285377877584, + "learning_rate": 9.304882946405352e-06, + "loss": 0.6255, + "step": 633 + }, + { + "epoch": 0.7606478704259149, + "grad_norm": 0.736655907630535, + "learning_rate": 9.301326159520925e-06, + "loss": 0.6643, + "step": 634 + }, + { + "epoch": 0.7618476304739052, + "grad_norm": 0.7427962509814233, + "learning_rate": 9.297760979522166e-06, + "loss": 0.791, + "step": 635 + }, + { + "epoch": 0.7630473905218956, + "grad_norm": 0.748355158635932, + "learning_rate": 9.294187413365755e-06, + "loss": 0.8099, + "step": 636 + }, + { + "epoch": 0.764247150569886, + "grad_norm": 0.6991205935285665, + "learning_rate": 9.290605468024742e-06, + "loss": 0.7769, + "step": 637 + }, + { + "epoch": 0.7654469106178764, + "grad_norm": 0.7202702597169961, + "learning_rate": 9.287015150488522e-06, + "loss": 0.707, + "step": 638 + }, + { + "epoch": 0.7666466706658668, + "grad_norm": 0.8358327107940074, + "learning_rate": 9.28341646776283e-06, + "loss": 0.7207, + "step": 639 + }, + { + "epoch": 0.7678464307138573, + "grad_norm": 0.7159463048010999, + "learning_rate": 9.279809426869723e-06, + "loss": 0.6979, + "step": 640 + }, + { + "epoch": 0.7690461907618477, + "grad_norm": 0.861983749589215, + "learning_rate": 9.276194034847565e-06, + "loss": 0.8149, + "step": 641 + }, + { + "epoch": 0.770245950809838, + "grad_norm": 0.7787392983292577, + "learning_rate": 9.272570298751019e-06, + "loss": 0.6745, + "step": 642 + }, + { + "epoch": 0.7714457108578284, + "grad_norm": 0.7104910752402336, + "learning_rate": 9.268938225651027e-06, + "loss": 0.6715, + "step": 643 + }, + { + "epoch": 0.7726454709058188, + "grad_norm": 0.8049250722530529, + "learning_rate": 9.265297822634798e-06, + "loss": 0.7453, + "step": 644 + }, + { + "epoch": 0.7738452309538092, + "grad_norm": 0.6857302660826016, + "learning_rate": 9.261649096805798e-06, + "loss": 0.7502, + "step": 645 + }, + { + "epoch": 0.7750449910017997, + "grad_norm": 0.6553885497808106, + "learning_rate": 9.257992055283735e-06, + "loss": 0.7036, + "step": 646 + }, + { + "epoch": 0.7762447510497901, + "grad_norm": 0.7577789705457922, + "learning_rate": 9.254326705204534e-06, + "loss": 0.8082, + "step": 647 + }, + { + "epoch": 0.7774445110977805, + "grad_norm": 0.7691264141555266, + "learning_rate": 9.250653053720345e-06, + "loss": 0.8038, + "step": 648 + }, + { + "epoch": 0.7786442711457708, + "grad_norm": 0.7499407632048067, + "learning_rate": 9.246971107999505e-06, + "loss": 0.8091, + "step": 649 + }, + { + "epoch": 0.7798440311937612, + "grad_norm": 0.7775805611228338, + "learning_rate": 9.243280875226544e-06, + "loss": 0.8139, + "step": 650 + }, + { + "epoch": 0.7810437912417516, + "grad_norm": 0.6563095758055505, + "learning_rate": 9.239582362602155e-06, + "loss": 0.6856, + "step": 651 + }, + { + "epoch": 0.7822435512897421, + "grad_norm": 0.9321681932072071, + "learning_rate": 9.235875577343195e-06, + "loss": 0.7461, + "step": 652 + }, + { + "epoch": 0.7834433113377325, + "grad_norm": 0.870840392576577, + "learning_rate": 9.232160526682659e-06, + "loss": 0.7332, + "step": 653 + }, + { + "epoch": 0.7846430713857229, + "grad_norm": 0.8479916292898527, + "learning_rate": 9.228437217869668e-06, + "loss": 0.7548, + "step": 654 + }, + { + "epoch": 0.7858428314337133, + "grad_norm": 0.7577898264784557, + "learning_rate": 9.22470565816946e-06, + "loss": 0.7112, + "step": 655 + }, + { + "epoch": 0.7870425914817036, + "grad_norm": 0.7300874136086064, + "learning_rate": 9.220965854863375e-06, + "loss": 0.7645, + "step": 656 + }, + { + "epoch": 0.788242351529694, + "grad_norm": 0.8105013430146889, + "learning_rate": 9.217217815248834e-06, + "loss": 0.7181, + "step": 657 + }, + { + "epoch": 0.7894421115776845, + "grad_norm": 0.6988905401190743, + "learning_rate": 9.213461546639334e-06, + "loss": 0.8008, + "step": 658 + }, + { + "epoch": 0.7906418716256749, + "grad_norm": 0.9367110211078484, + "learning_rate": 9.209697056364421e-06, + "loss": 0.6649, + "step": 659 + }, + { + "epoch": 0.7918416316736653, + "grad_norm": 0.8415790618546976, + "learning_rate": 9.205924351769695e-06, + "loss": 0.6454, + "step": 660 + }, + { + "epoch": 0.7930413917216557, + "grad_norm": 0.779708965565287, + "learning_rate": 9.202143440216776e-06, + "loss": 0.7348, + "step": 661 + }, + { + "epoch": 0.794241151769646, + "grad_norm": 0.7584769779508698, + "learning_rate": 9.198354329083304e-06, + "loss": 0.7771, + "step": 662 + }, + { + "epoch": 0.7954409118176364, + "grad_norm": 0.752690256345137, + "learning_rate": 9.194557025762911e-06, + "loss": 0.6826, + "step": 663 + }, + { + "epoch": 0.7966406718656269, + "grad_norm": 0.6987843714144564, + "learning_rate": 9.190751537665222e-06, + "loss": 0.7299, + "step": 664 + }, + { + "epoch": 0.7978404319136173, + "grad_norm": 0.7684852943780471, + "learning_rate": 9.186937872215831e-06, + "loss": 0.6788, + "step": 665 + }, + { + "epoch": 0.7990401919616077, + "grad_norm": 0.7291940251139142, + "learning_rate": 9.183116036856283e-06, + "loss": 0.7404, + "step": 666 + }, + { + "epoch": 0.8002399520095981, + "grad_norm": 0.7059763113053509, + "learning_rate": 9.179286039044072e-06, + "loss": 0.7853, + "step": 667 + }, + { + "epoch": 0.8014397120575885, + "grad_norm": 0.8840293556447637, + "learning_rate": 9.175447886252618e-06, + "loss": 0.7489, + "step": 668 + }, + { + "epoch": 0.8026394721055788, + "grad_norm": 0.8385407380699331, + "learning_rate": 9.171601585971248e-06, + "loss": 0.75, + "step": 669 + }, + { + "epoch": 0.8038392321535693, + "grad_norm": 0.7991173151829092, + "learning_rate": 9.167747145705195e-06, + "loss": 0.6904, + "step": 670 + }, + { + "epoch": 0.8050389922015597, + "grad_norm": 0.6799107722840952, + "learning_rate": 9.163884572975566e-06, + "loss": 0.7309, + "step": 671 + }, + { + "epoch": 0.8062387522495501, + "grad_norm": 0.8564949850489171, + "learning_rate": 9.160013875319348e-06, + "loss": 0.6968, + "step": 672 + }, + { + "epoch": 0.8074385122975405, + "grad_norm": 0.797552540085641, + "learning_rate": 9.156135060289374e-06, + "loss": 0.6622, + "step": 673 + }, + { + "epoch": 0.8086382723455309, + "grad_norm": 0.7142660355705535, + "learning_rate": 9.152248135454315e-06, + "loss": 0.7295, + "step": 674 + }, + { + "epoch": 0.8098380323935213, + "grad_norm": 0.7496127091364704, + "learning_rate": 9.148353108398677e-06, + "loss": 0.7389, + "step": 675 + }, + { + "epoch": 0.8110377924415118, + "grad_norm": 0.8373650209653336, + "learning_rate": 9.144449986722764e-06, + "loss": 0.6162, + "step": 676 + }, + { + "epoch": 0.8122375524895021, + "grad_norm": 0.7508306658345354, + "learning_rate": 9.140538778042682e-06, + "loss": 0.814, + "step": 677 + }, + { + "epoch": 0.8134373125374925, + "grad_norm": 0.7764431964930828, + "learning_rate": 9.136619489990313e-06, + "loss": 0.6989, + "step": 678 + }, + { + "epoch": 0.8146370725854829, + "grad_norm": 0.7645974259736653, + "learning_rate": 9.132692130213307e-06, + "loss": 0.7694, + "step": 679 + }, + { + "epoch": 0.8158368326334733, + "grad_norm": 0.7556080783634802, + "learning_rate": 9.128756706375065e-06, + "loss": 0.7806, + "step": 680 + }, + { + "epoch": 0.8170365926814637, + "grad_norm": 0.7970652457497799, + "learning_rate": 9.124813226154719e-06, + "loss": 0.748, + "step": 681 + }, + { + "epoch": 0.8182363527294542, + "grad_norm": 1.0254876320720594, + "learning_rate": 9.120861697247125e-06, + "loss": 0.7371, + "step": 682 + }, + { + "epoch": 0.8194361127774445, + "grad_norm": 0.7158919072080201, + "learning_rate": 9.116902127362841e-06, + "loss": 0.7136, + "step": 683 + }, + { + "epoch": 0.8206358728254349, + "grad_norm": 0.8533405560260696, + "learning_rate": 9.11293452422812e-06, + "loss": 0.7545, + "step": 684 + }, + { + "epoch": 0.8218356328734253, + "grad_norm": 0.8259141057451911, + "learning_rate": 9.108958895584888e-06, + "loss": 0.7221, + "step": 685 + }, + { + "epoch": 0.8230353929214157, + "grad_norm": 0.8168844953059569, + "learning_rate": 9.104975249190728e-06, + "loss": 0.7437, + "step": 686 + }, + { + "epoch": 0.8242351529694061, + "grad_norm": 0.7572408428890092, + "learning_rate": 9.100983592818873e-06, + "loss": 0.7147, + "step": 687 + }, + { + "epoch": 0.8254349130173965, + "grad_norm": 0.7417806524544673, + "learning_rate": 9.09698393425818e-06, + "loss": 0.7825, + "step": 688 + }, + { + "epoch": 0.826634673065387, + "grad_norm": 0.9109315486736959, + "learning_rate": 9.092976281313127e-06, + "loss": 0.6836, + "step": 689 + }, + { + "epoch": 0.8278344331133773, + "grad_norm": 0.7700193136501021, + "learning_rate": 9.088960641803786e-06, + "loss": 0.7101, + "step": 690 + }, + { + "epoch": 0.8290341931613677, + "grad_norm": 0.7063368254506233, + "learning_rate": 9.084937023565817e-06, + "loss": 0.7607, + "step": 691 + }, + { + "epoch": 0.8302339532093581, + "grad_norm": 0.7335228407782676, + "learning_rate": 9.080905434450446e-06, + "loss": 0.793, + "step": 692 + }, + { + "epoch": 0.8314337132573485, + "grad_norm": 0.757472933993561, + "learning_rate": 9.076865882324453e-06, + "loss": 0.7252, + "step": 693 + }, + { + "epoch": 0.8326334733053389, + "grad_norm": 0.7775955900228193, + "learning_rate": 9.072818375070156e-06, + "loss": 0.7111, + "step": 694 + }, + { + "epoch": 0.8338332333533294, + "grad_norm": 0.7222525868463516, + "learning_rate": 9.068762920585399e-06, + "loss": 0.6625, + "step": 695 + }, + { + "epoch": 0.8350329934013198, + "grad_norm": 0.7291752779428845, + "learning_rate": 9.064699526783527e-06, + "loss": 0.7153, + "step": 696 + }, + { + "epoch": 0.8362327534493101, + "grad_norm": 0.7526637151804322, + "learning_rate": 9.060628201593383e-06, + "loss": 0.7601, + "step": 697 + }, + { + "epoch": 0.8374325134973005, + "grad_norm": 0.8495898517083398, + "learning_rate": 9.056548952959283e-06, + "loss": 0.663, + "step": 698 + }, + { + "epoch": 0.8386322735452909, + "grad_norm": 0.7919350740237497, + "learning_rate": 9.052461788841005e-06, + "loss": 0.7653, + "step": 699 + }, + { + "epoch": 0.8398320335932813, + "grad_norm": 0.6139199206157331, + "learning_rate": 9.048366717213772e-06, + "loss": 0.6735, + "step": 700 + }, + { + "epoch": 0.8410317936412718, + "grad_norm": 0.6954314325669156, + "learning_rate": 9.044263746068236e-06, + "loss": 0.7085, + "step": 701 + }, + { + "epoch": 0.8422315536892622, + "grad_norm": 0.7703628098346507, + "learning_rate": 9.040152883410465e-06, + "loss": 0.7534, + "step": 702 + }, + { + "epoch": 0.8434313137372526, + "grad_norm": 0.7962857304460215, + "learning_rate": 9.036034137261924e-06, + "loss": 0.6811, + "step": 703 + }, + { + "epoch": 0.8446310737852429, + "grad_norm": 0.6355076254938227, + "learning_rate": 9.031907515659464e-06, + "loss": 0.7239, + "step": 704 + }, + { + "epoch": 0.8458308338332333, + "grad_norm": 0.6583975844454037, + "learning_rate": 9.027773026655298e-06, + "loss": 0.716, + "step": 705 + }, + { + "epoch": 0.8470305938812237, + "grad_norm": 0.7138960727557581, + "learning_rate": 9.023630678316994e-06, + "loss": 0.7629, + "step": 706 + }, + { + "epoch": 0.8482303539292142, + "grad_norm": 0.759517500163122, + "learning_rate": 9.019480478727458e-06, + "loss": 0.712, + "step": 707 + }, + { + "epoch": 0.8494301139772046, + "grad_norm": 0.7617781662555174, + "learning_rate": 9.015322435984909e-06, + "loss": 0.6963, + "step": 708 + }, + { + "epoch": 0.850629874025195, + "grad_norm": 0.8017913620380468, + "learning_rate": 9.011156558202877e-06, + "loss": 0.75, + "step": 709 + }, + { + "epoch": 0.8518296340731853, + "grad_norm": 0.6428736050904517, + "learning_rate": 9.006982853510177e-06, + "loss": 0.7252, + "step": 710 + }, + { + "epoch": 0.8530293941211757, + "grad_norm": 0.6897099281544877, + "learning_rate": 9.0028013300509e-06, + "loss": 0.7084, + "step": 711 + }, + { + "epoch": 0.8542291541691661, + "grad_norm": 0.8692697405988467, + "learning_rate": 8.998611995984388e-06, + "loss": 0.7003, + "step": 712 + }, + { + "epoch": 0.8554289142171566, + "grad_norm": 0.6913961519935883, + "learning_rate": 8.994414859485228e-06, + "loss": 0.7201, + "step": 713 + }, + { + "epoch": 0.856628674265147, + "grad_norm": 0.8601053884938681, + "learning_rate": 8.99020992874323e-06, + "loss": 0.7144, + "step": 714 + }, + { + "epoch": 0.8578284343131374, + "grad_norm": 0.6869846626792715, + "learning_rate": 8.985997211963414e-06, + "loss": 0.6746, + "step": 715 + }, + { + "epoch": 0.8590281943611278, + "grad_norm": 0.8118651300115735, + "learning_rate": 8.981776717365991e-06, + "loss": 0.6991, + "step": 716 + }, + { + "epoch": 0.8602279544091181, + "grad_norm": 0.8515786561839657, + "learning_rate": 8.977548453186354e-06, + "loss": 0.6962, + "step": 717 + }, + { + "epoch": 0.8614277144571085, + "grad_norm": 0.6595328779321165, + "learning_rate": 8.973312427675047e-06, + "loss": 0.7054, + "step": 718 + }, + { + "epoch": 0.862627474505099, + "grad_norm": 0.7348146209601154, + "learning_rate": 8.969068649097766e-06, + "loss": 0.7633, + "step": 719 + }, + { + "epoch": 0.8638272345530894, + "grad_norm": 0.8617055777943434, + "learning_rate": 8.964817125735337e-06, + "loss": 0.776, + "step": 720 + }, + { + "epoch": 0.8650269946010798, + "grad_norm": 0.8590136330003421, + "learning_rate": 8.96055786588369e-06, + "loss": 0.7528, + "step": 721 + }, + { + "epoch": 0.8662267546490702, + "grad_norm": 0.7281656999282983, + "learning_rate": 8.956290877853857e-06, + "loss": 0.657, + "step": 722 + }, + { + "epoch": 0.8674265146970606, + "grad_norm": 0.7673981705874487, + "learning_rate": 8.95201616997195e-06, + "loss": 0.7062, + "step": 723 + }, + { + "epoch": 0.8686262747450509, + "grad_norm": 0.8162483335235116, + "learning_rate": 8.947733750579147e-06, + "loss": 0.7224, + "step": 724 + }, + { + "epoch": 0.8698260347930414, + "grad_norm": 0.7829593234370885, + "learning_rate": 8.943443628031663e-06, + "loss": 0.7262, + "step": 725 + }, + { + "epoch": 0.8710257948410318, + "grad_norm": 0.8091531239120493, + "learning_rate": 8.939145810700755e-06, + "loss": 0.6739, + "step": 726 + }, + { + "epoch": 0.8722255548890222, + "grad_norm": 0.7558985566945485, + "learning_rate": 8.93484030697269e-06, + "loss": 0.7302, + "step": 727 + }, + { + "epoch": 0.8734253149370126, + "grad_norm": 0.836085347122929, + "learning_rate": 8.930527125248733e-06, + "loss": 0.7361, + "step": 728 + }, + { + "epoch": 0.874625074985003, + "grad_norm": 0.7320634563463685, + "learning_rate": 8.926206273945133e-06, + "loss": 0.7481, + "step": 729 + }, + { + "epoch": 0.8758248350329934, + "grad_norm": 0.7362413271525446, + "learning_rate": 8.921877761493103e-06, + "loss": 0.6921, + "step": 730 + }, + { + "epoch": 0.8770245950809838, + "grad_norm": 0.7337908243444188, + "learning_rate": 8.917541596338806e-06, + "loss": 0.7598, + "step": 731 + }, + { + "epoch": 0.8782243551289742, + "grad_norm": 0.7532399005677989, + "learning_rate": 8.913197786943335e-06, + "loss": 0.671, + "step": 732 + }, + { + "epoch": 0.8794241151769646, + "grad_norm": 0.720119023479152, + "learning_rate": 8.908846341782705e-06, + "loss": 0.6855, + "step": 733 + }, + { + "epoch": 0.880623875224955, + "grad_norm": 0.8340213195042716, + "learning_rate": 8.904487269347824e-06, + "loss": 0.8062, + "step": 734 + }, + { + "epoch": 0.8818236352729454, + "grad_norm": 0.8537544717020235, + "learning_rate": 8.900120578144487e-06, + "loss": 0.7572, + "step": 735 + }, + { + "epoch": 0.8830233953209358, + "grad_norm": 0.7711195264883798, + "learning_rate": 8.895746276693354e-06, + "loss": 0.6446, + "step": 736 + }, + { + "epoch": 0.8842231553689263, + "grad_norm": 0.6887520424872645, + "learning_rate": 8.891364373529933e-06, + "loss": 0.7289, + "step": 737 + }, + { + "epoch": 0.8854229154169166, + "grad_norm": 0.8719145209704985, + "learning_rate": 8.886974877204571e-06, + "loss": 0.7299, + "step": 738 + }, + { + "epoch": 0.886622675464907, + "grad_norm": 0.6511567150951192, + "learning_rate": 8.882577796282423e-06, + "loss": 0.775, + "step": 739 + }, + { + "epoch": 0.8878224355128974, + "grad_norm": 0.6579623722688875, + "learning_rate": 8.87817313934345e-06, + "loss": 0.6634, + "step": 740 + }, + { + "epoch": 0.8890221955608878, + "grad_norm": 0.7008815480410427, + "learning_rate": 8.873760914982398e-06, + "loss": 0.6636, + "step": 741 + }, + { + "epoch": 0.8902219556088782, + "grad_norm": 1.4279455662876566, + "learning_rate": 8.86934113180877e-06, + "loss": 0.7955, + "step": 742 + }, + { + "epoch": 0.8914217156568687, + "grad_norm": 0.7713950135088316, + "learning_rate": 8.864913798446825e-06, + "loss": 0.7636, + "step": 743 + }, + { + "epoch": 0.8926214757048591, + "grad_norm": 0.8260504764129432, + "learning_rate": 8.860478923535556e-06, + "loss": 0.6612, + "step": 744 + }, + { + "epoch": 0.8938212357528494, + "grad_norm": 0.7201716538315065, + "learning_rate": 8.856036515728666e-06, + "loss": 0.6913, + "step": 745 + }, + { + "epoch": 0.8950209958008398, + "grad_norm": 0.7214346438900352, + "learning_rate": 8.85158658369456e-06, + "loss": 0.768, + "step": 746 + }, + { + "epoch": 0.8962207558488302, + "grad_norm": 0.7259238301199039, + "learning_rate": 8.847129136116326e-06, + "loss": 0.6861, + "step": 747 + }, + { + "epoch": 0.8974205158968206, + "grad_norm": 1.5664870972162603, + "learning_rate": 8.842664181691716e-06, + "loss": 0.7551, + "step": 748 + }, + { + "epoch": 0.8986202759448111, + "grad_norm": 0.7374710992889468, + "learning_rate": 8.83819172913313e-06, + "loss": 0.6725, + "step": 749 + }, + { + "epoch": 0.8998200359928015, + "grad_norm": 0.6867731164372635, + "learning_rate": 8.833711787167596e-06, + "loss": 0.7708, + "step": 750 + }, + { + "epoch": 0.9010197960407919, + "grad_norm": 0.7433539004061774, + "learning_rate": 8.829224364536762e-06, + "loss": 0.66, + "step": 751 + }, + { + "epoch": 0.9022195560887822, + "grad_norm": 0.6976954630092328, + "learning_rate": 8.82472946999687e-06, + "loss": 0.7192, + "step": 752 + }, + { + "epoch": 0.9034193161367726, + "grad_norm": 0.7243742893502679, + "learning_rate": 8.820227112318737e-06, + "loss": 0.6881, + "step": 753 + }, + { + "epoch": 0.904619076184763, + "grad_norm": 0.6601153532474624, + "learning_rate": 8.815717300287751e-06, + "loss": 0.681, + "step": 754 + }, + { + "epoch": 0.9058188362327535, + "grad_norm": 0.7442440370029324, + "learning_rate": 8.81120004270384e-06, + "loss": 0.7264, + "step": 755 + }, + { + "epoch": 0.9070185962807439, + "grad_norm": 0.7407327729335063, + "learning_rate": 8.806675348381465e-06, + "loss": 0.6697, + "step": 756 + }, + { + "epoch": 0.9082183563287343, + "grad_norm": 0.6749557172369749, + "learning_rate": 8.80214322614959e-06, + "loss": 0.7387, + "step": 757 + }, + { + "epoch": 0.9094181163767247, + "grad_norm": 0.7888777311895881, + "learning_rate": 8.797603684851685e-06, + "loss": 0.7901, + "step": 758 + }, + { + "epoch": 0.910617876424715, + "grad_norm": 0.8381921280616869, + "learning_rate": 8.793056733345685e-06, + "loss": 0.7785, + "step": 759 + }, + { + "epoch": 0.9118176364727054, + "grad_norm": 0.9634129064332837, + "learning_rate": 8.788502380503991e-06, + "loss": 0.6772, + "step": 760 + }, + { + "epoch": 0.9130173965206959, + "grad_norm": 0.6957936617015104, + "learning_rate": 8.783940635213443e-06, + "loss": 0.7519, + "step": 761 + }, + { + "epoch": 0.9142171565686863, + "grad_norm": 0.8261484130782292, + "learning_rate": 8.779371506375312e-06, + "loss": 0.7726, + "step": 762 + }, + { + "epoch": 0.9154169166166767, + "grad_norm": 0.8443144653539115, + "learning_rate": 8.774795002905266e-06, + "loss": 0.7667, + "step": 763 + }, + { + "epoch": 0.9166166766646671, + "grad_norm": 0.6997642773177474, + "learning_rate": 8.770211133733373e-06, + "loss": 0.6865, + "step": 764 + }, + { + "epoch": 0.9178164367126574, + "grad_norm": 0.9107408755390968, + "learning_rate": 8.765619907804068e-06, + "loss": 0.7522, + "step": 765 + }, + { + "epoch": 0.9190161967606478, + "grad_norm": 0.7972813404134526, + "learning_rate": 8.76102133407614e-06, + "loss": 0.7496, + "step": 766 + }, + { + "epoch": 0.9202159568086383, + "grad_norm": 0.8575564826675797, + "learning_rate": 8.756415421522723e-06, + "loss": 0.7115, + "step": 767 + }, + { + "epoch": 0.9214157168566287, + "grad_norm": 0.784169782895324, + "learning_rate": 8.751802179131261e-06, + "loss": 0.7411, + "step": 768 + }, + { + "epoch": 0.9226154769046191, + "grad_norm": 0.8257712462719169, + "learning_rate": 8.74718161590351e-06, + "loss": 0.722, + "step": 769 + }, + { + "epoch": 0.9238152369526095, + "grad_norm": 1.1175941018251747, + "learning_rate": 8.742553740855507e-06, + "loss": 0.8218, + "step": 770 + }, + { + "epoch": 0.9250149970005999, + "grad_norm": 0.9189804422676542, + "learning_rate": 8.737918563017553e-06, + "loss": 0.6823, + "step": 771 + }, + { + "epoch": 0.9262147570485902, + "grad_norm": 0.7037201795139563, + "learning_rate": 8.733276091434204e-06, + "loss": 0.7116, + "step": 772 + }, + { + "epoch": 0.9274145170965807, + "grad_norm": 0.7875872081694147, + "learning_rate": 8.728626335164247e-06, + "loss": 0.7094, + "step": 773 + }, + { + "epoch": 0.9286142771445711, + "grad_norm": 0.6815464687911802, + "learning_rate": 8.723969303280682e-06, + "loss": 0.7418, + "step": 774 + }, + { + "epoch": 0.9298140371925615, + "grad_norm": 0.7265610401869881, + "learning_rate": 8.719305004870705e-06, + "loss": 0.7806, + "step": 775 + }, + { + "epoch": 0.9310137972405519, + "grad_norm": 0.717266987553133, + "learning_rate": 8.714633449035699e-06, + "loss": 0.7155, + "step": 776 + }, + { + "epoch": 0.9322135572885423, + "grad_norm": 0.7642623821787721, + "learning_rate": 8.709954644891195e-06, + "loss": 0.7438, + "step": 777 + }, + { + "epoch": 0.9334133173365327, + "grad_norm": 0.8504999037092563, + "learning_rate": 8.705268601566877e-06, + "loss": 0.7389, + "step": 778 + }, + { + "epoch": 0.9346130773845231, + "grad_norm": 0.8259898115474855, + "learning_rate": 8.700575328206554e-06, + "loss": 0.7779, + "step": 779 + }, + { + "epoch": 0.9358128374325135, + "grad_norm": 0.6695114476759497, + "learning_rate": 8.695874833968136e-06, + "loss": 0.7475, + "step": 780 + }, + { + "epoch": 0.9370125974805039, + "grad_norm": 0.7666424504704847, + "learning_rate": 8.691167128023637e-06, + "loss": 0.6744, + "step": 781 + }, + { + "epoch": 0.9382123575284943, + "grad_norm": 0.6682439375073063, + "learning_rate": 8.686452219559125e-06, + "loss": 0.651, + "step": 782 + }, + { + "epoch": 0.9394121175764847, + "grad_norm": 0.751684373428545, + "learning_rate": 8.681730117774736e-06, + "loss": 0.7075, + "step": 783 + }, + { + "epoch": 0.9406118776244751, + "grad_norm": 0.7024570630082877, + "learning_rate": 8.677000831884639e-06, + "loss": 0.6648, + "step": 784 + }, + { + "epoch": 0.9418116376724655, + "grad_norm": 0.7583152269181472, + "learning_rate": 8.672264371117016e-06, + "loss": 0.751, + "step": 785 + }, + { + "epoch": 0.943011397720456, + "grad_norm": 0.7316407315135922, + "learning_rate": 8.667520744714055e-06, + "loss": 0.7648, + "step": 786 + }, + { + "epoch": 0.9442111577684463, + "grad_norm": 0.8269882990695296, + "learning_rate": 8.662769961931926e-06, + "loss": 0.6869, + "step": 787 + }, + { + "epoch": 0.9454109178164367, + "grad_norm": 0.9002478494140085, + "learning_rate": 8.658012032040758e-06, + "loss": 0.7247, + "step": 788 + }, + { + "epoch": 0.9466106778644271, + "grad_norm": 0.7721188467093039, + "learning_rate": 8.653246964324631e-06, + "loss": 0.6905, + "step": 789 + }, + { + "epoch": 0.9478104379124175, + "grad_norm": 0.7001233970231492, + "learning_rate": 8.648474768081553e-06, + "loss": 0.7291, + "step": 790 + }, + { + "epoch": 0.9490101979604079, + "grad_norm": 0.6839392449808562, + "learning_rate": 8.643695452623437e-06, + "loss": 0.6951, + "step": 791 + }, + { + "epoch": 0.9502099580083984, + "grad_norm": 0.7310041234694719, + "learning_rate": 8.638909027276093e-06, + "loss": 0.6974, + "step": 792 + }, + { + "epoch": 0.9514097180563887, + "grad_norm": 0.7067703460134739, + "learning_rate": 8.634115501379202e-06, + "loss": 0.6696, + "step": 793 + }, + { + "epoch": 0.9526094781043791, + "grad_norm": 0.7156477215190924, + "learning_rate": 8.629314884286299e-06, + "loss": 0.7316, + "step": 794 + }, + { + "epoch": 0.9538092381523695, + "grad_norm": 0.743906326807422, + "learning_rate": 8.62450718536476e-06, + "loss": 0.7036, + "step": 795 + }, + { + "epoch": 0.9550089982003599, + "grad_norm": 0.8704938925080471, + "learning_rate": 8.619692413995775e-06, + "loss": 0.6215, + "step": 796 + }, + { + "epoch": 0.9562087582483503, + "grad_norm": 0.7599568654719119, + "learning_rate": 8.614870579574338e-06, + "loss": 0.6672, + "step": 797 + }, + { + "epoch": 0.9574085182963408, + "grad_norm": 0.8316313659598846, + "learning_rate": 8.61004169150922e-06, + "loss": 0.754, + "step": 798 + }, + { + "epoch": 0.9586082783443312, + "grad_norm": 0.7315128118767747, + "learning_rate": 8.605205759222963e-06, + "loss": 0.7195, + "step": 799 + }, + { + "epoch": 0.9598080383923215, + "grad_norm": 0.7171949806066087, + "learning_rate": 8.600362792151849e-06, + "loss": 0.6753, + "step": 800 + }, + { + "epoch": 0.9610077984403119, + "grad_norm": 0.7630107755408194, + "learning_rate": 8.595512799745887e-06, + "loss": 0.7561, + "step": 801 + }, + { + "epoch": 0.9622075584883023, + "grad_norm": 0.6960112659463641, + "learning_rate": 8.590655791468798e-06, + "loss": 0.7178, + "step": 802 + }, + { + "epoch": 0.9634073185362927, + "grad_norm": 0.7785762158178057, + "learning_rate": 8.585791776797989e-06, + "loss": 0.775, + "step": 803 + }, + { + "epoch": 0.9646070785842832, + "grad_norm": 0.7625428223240929, + "learning_rate": 8.580920765224541e-06, + "loss": 0.6772, + "step": 804 + }, + { + "epoch": 0.9658068386322736, + "grad_norm": 0.6918983117535726, + "learning_rate": 8.576042766253186e-06, + "loss": 0.7432, + "step": 805 + }, + { + "epoch": 0.967006598680264, + "grad_norm": 0.6725832160053664, + "learning_rate": 8.571157789402292e-06, + "loss": 0.7227, + "step": 806 + }, + { + "epoch": 0.9682063587282543, + "grad_norm": 0.8274038768697642, + "learning_rate": 8.566265844203843e-06, + "loss": 0.6806, + "step": 807 + }, + { + "epoch": 0.9694061187762447, + "grad_norm": 0.8338463853279005, + "learning_rate": 8.56136694020342e-06, + "loss": 0.7171, + "step": 808 + }, + { + "epoch": 0.9706058788242351, + "grad_norm": 0.7590280630412745, + "learning_rate": 8.556461086960179e-06, + "loss": 0.6099, + "step": 809 + }, + { + "epoch": 0.9718056388722256, + "grad_norm": 0.6679038582059358, + "learning_rate": 8.551548294046843e-06, + "loss": 0.6973, + "step": 810 + }, + { + "epoch": 0.973005398920216, + "grad_norm": 0.909079297394524, + "learning_rate": 8.546628571049671e-06, + "loss": 0.7315, + "step": 811 + }, + { + "epoch": 0.9742051589682064, + "grad_norm": 0.9645613467378517, + "learning_rate": 8.541701927568444e-06, + "loss": 0.7391, + "step": 812 + }, + { + "epoch": 0.9754049190161967, + "grad_norm": 0.7697405806994291, + "learning_rate": 8.536768373216454e-06, + "loss": 0.6878, + "step": 813 + }, + { + "epoch": 0.9766046790641871, + "grad_norm": 0.7701896004261258, + "learning_rate": 8.531827917620466e-06, + "loss": 0.6798, + "step": 814 + }, + { + "epoch": 0.9778044391121775, + "grad_norm": 0.7096781450038352, + "learning_rate": 8.526880570420721e-06, + "loss": 0.7076, + "step": 815 + }, + { + "epoch": 0.979004199160168, + "grad_norm": 0.6409247185038676, + "learning_rate": 8.521926341270908e-06, + "loss": 0.7069, + "step": 816 + }, + { + "epoch": 0.9802039592081584, + "grad_norm": 0.706560411878609, + "learning_rate": 8.516965239838137e-06, + "loss": 0.7162, + "step": 817 + }, + { + "epoch": 0.9814037192561488, + "grad_norm": 0.9947129450737373, + "learning_rate": 8.511997275802934e-06, + "loss": 0.7489, + "step": 818 + }, + { + "epoch": 0.9826034793041392, + "grad_norm": 0.8735057070309135, + "learning_rate": 8.507022458859215e-06, + "loss": 0.7346, + "step": 819 + }, + { + "epoch": 0.9838032393521295, + "grad_norm": 0.6508191410059432, + "learning_rate": 8.502040798714263e-06, + "loss": 0.6725, + "step": 820 + }, + { + "epoch": 0.9850029994001199, + "grad_norm": 0.7443473259703014, + "learning_rate": 8.497052305088722e-06, + "loss": 0.6889, + "step": 821 + }, + { + "epoch": 0.9862027594481104, + "grad_norm": 1.2455924883181906, + "learning_rate": 8.492056987716566e-06, + "loss": 0.6847, + "step": 822 + }, + { + "epoch": 0.9874025194961008, + "grad_norm": 0.7944880426727919, + "learning_rate": 8.487054856345081e-06, + "loss": 0.749, + "step": 823 + }, + { + "epoch": 0.9886022795440912, + "grad_norm": 0.7262988013969764, + "learning_rate": 8.482045920734855e-06, + "loss": 0.6876, + "step": 824 + }, + { + "epoch": 0.9898020395920816, + "grad_norm": 0.7812167032222277, + "learning_rate": 8.47703019065975e-06, + "loss": 0.6905, + "step": 825 + }, + { + "epoch": 0.991001799640072, + "grad_norm": 0.8887583534089065, + "learning_rate": 8.472007675906883e-06, + "loss": 0.701, + "step": 826 + }, + { + "epoch": 0.9922015596880623, + "grad_norm": 0.8180330273352874, + "learning_rate": 8.466978386276618e-06, + "loss": 0.6619, + "step": 827 + }, + { + "epoch": 0.9934013197360528, + "grad_norm": 0.7550253827882137, + "learning_rate": 8.46194233158253e-06, + "loss": 0.6879, + "step": 828 + }, + { + "epoch": 0.9946010797840432, + "grad_norm": 0.8094156521246557, + "learning_rate": 8.456899521651401e-06, + "loss": 0.7122, + "step": 829 + }, + { + "epoch": 0.9958008398320336, + "grad_norm": 0.7764238822386331, + "learning_rate": 8.451849966323189e-06, + "loss": 0.7269, + "step": 830 + }, + { + "epoch": 0.997000599880024, + "grad_norm": 0.6943839690282205, + "learning_rate": 8.446793675451017e-06, + "loss": 0.6991, + "step": 831 + }, + { + "epoch": 0.9982003599280144, + "grad_norm": 0.7890307349482584, + "learning_rate": 8.441730658901154e-06, + "loss": 0.722, + "step": 832 + }, + { + "epoch": 0.9994001199760048, + "grad_norm": 0.7177227139393145, + "learning_rate": 8.436660926552987e-06, + "loss": 0.6527, + "step": 833 + }, + { + "epoch": 1.0, + "grad_norm": 0.7177227139393145, + "learning_rate": 8.431584488299009e-06, + "loss": 0.6803, + "step": 834 + }, + { + "epoch": 1.0011997600479905, + "grad_norm": 1.2017300303291012, + "learning_rate": 8.426501354044802e-06, + "loss": 0.7494, + "step": 835 + }, + { + "epoch": 1.0023995200959808, + "grad_norm": 1.0195681094503213, + "learning_rate": 8.42141153370901e-06, + "loss": 0.6218, + "step": 836 + }, + { + "epoch": 1.0035992801439713, + "grad_norm": 1.2864757513523697, + "learning_rate": 8.416315037223322e-06, + "loss": 0.7156, + "step": 837 + }, + { + "epoch": 1.0047990401919615, + "grad_norm": 0.7178806399171327, + "learning_rate": 8.41121187453246e-06, + "loss": 0.6571, + "step": 838 + }, + { + "epoch": 1.005998800239952, + "grad_norm": 0.8446232852801754, + "learning_rate": 8.406102055594148e-06, + "loss": 0.6132, + "step": 839 + }, + { + "epoch": 1.0071985602879423, + "grad_norm": 0.6838103195827099, + "learning_rate": 8.400985590379102e-06, + "loss": 0.5841, + "step": 840 + }, + { + "epoch": 1.0083983203359328, + "grad_norm": 0.6677261351678965, + "learning_rate": 8.395862488871003e-06, + "loss": 0.6029, + "step": 841 + }, + { + "epoch": 1.0095980803839233, + "grad_norm": 0.812967264899385, + "learning_rate": 8.390732761066484e-06, + "loss": 0.6174, + "step": 842 + }, + { + "epoch": 1.0107978404319136, + "grad_norm": 0.8837166380270197, + "learning_rate": 8.38559641697511e-06, + "loss": 0.6194, + "step": 843 + }, + { + "epoch": 1.011997600479904, + "grad_norm": 0.5840251307941942, + "learning_rate": 8.38045346661935e-06, + "loss": 0.6831, + "step": 844 + }, + { + "epoch": 1.0131973605278943, + "grad_norm": 0.8540882477821887, + "learning_rate": 8.375303920034567e-06, + "loss": 0.6423, + "step": 845 + }, + { + "epoch": 1.0143971205758848, + "grad_norm": 0.8173963023070573, + "learning_rate": 8.370147787269001e-06, + "loss": 0.6051, + "step": 846 + }, + { + "epoch": 1.0155968806238753, + "grad_norm": 0.7778251780116223, + "learning_rate": 8.364985078383729e-06, + "loss": 0.5686, + "step": 847 + }, + { + "epoch": 1.0167966406718656, + "grad_norm": 0.7777169141031703, + "learning_rate": 8.359815803452677e-06, + "loss": 0.6723, + "step": 848 + }, + { + "epoch": 1.017996400719856, + "grad_norm": 0.7375802737519649, + "learning_rate": 8.35463997256257e-06, + "loss": 0.6736, + "step": 849 + }, + { + "epoch": 1.0191961607678464, + "grad_norm": 0.5812046566626188, + "learning_rate": 8.349457595812934e-06, + "loss": 0.5867, + "step": 850 + }, + { + "epoch": 1.0203959208158369, + "grad_norm": 0.7775044961284155, + "learning_rate": 8.34426868331606e-06, + "loss": 0.6513, + "step": 851 + }, + { + "epoch": 1.0215956808638271, + "grad_norm": 0.6633412981042944, + "learning_rate": 8.339073245197001e-06, + "loss": 0.6703, + "step": 852 + }, + { + "epoch": 1.0227954409118176, + "grad_norm": 0.8696629740675844, + "learning_rate": 8.333871291593534e-06, + "loss": 0.6518, + "step": 853 + }, + { + "epoch": 1.0239952009598081, + "grad_norm": 0.6240149906719957, + "learning_rate": 8.328662832656159e-06, + "loss": 0.5998, + "step": 854 + }, + { + "epoch": 1.0251949610077984, + "grad_norm": 0.8402359587949699, + "learning_rate": 8.323447878548063e-06, + "loss": 0.6399, + "step": 855 + }, + { + "epoch": 1.0263947210557889, + "grad_norm": 0.9284063049240554, + "learning_rate": 8.318226439445107e-06, + "loss": 0.5422, + "step": 856 + }, + { + "epoch": 1.0275944811037792, + "grad_norm": 0.6819000269369228, + "learning_rate": 8.312998525535813e-06, + "loss": 0.5781, + "step": 857 + }, + { + "epoch": 1.0287942411517697, + "grad_norm": 0.6889190567412302, + "learning_rate": 8.307764147021328e-06, + "loss": 0.6427, + "step": 858 + }, + { + "epoch": 1.0299940011997601, + "grad_norm": 0.8990639088926144, + "learning_rate": 8.302523314115421e-06, + "loss": 0.5896, + "step": 859 + }, + { + "epoch": 1.0311937612477504, + "grad_norm": 0.6367860307845866, + "learning_rate": 8.297276037044451e-06, + "loss": 0.6469, + "step": 860 + }, + { + "epoch": 1.032393521295741, + "grad_norm": 0.628397539390034, + "learning_rate": 8.29202232604735e-06, + "loss": 0.6532, + "step": 861 + }, + { + "epoch": 1.0335932813437312, + "grad_norm": 0.7168283069895726, + "learning_rate": 8.28676219137561e-06, + "loss": 0.6416, + "step": 862 + }, + { + "epoch": 1.0347930413917217, + "grad_norm": 0.7469325210173965, + "learning_rate": 8.281495643293254e-06, + "loss": 0.6107, + "step": 863 + }, + { + "epoch": 1.035992801439712, + "grad_norm": 0.7806686875957286, + "learning_rate": 8.276222692076817e-06, + "loss": 0.642, + "step": 864 + }, + { + "epoch": 1.0371925614877024, + "grad_norm": 0.7758783574735141, + "learning_rate": 8.270943348015333e-06, + "loss": 0.6457, + "step": 865 + }, + { + "epoch": 1.038392321535693, + "grad_norm": 0.6626976713714866, + "learning_rate": 8.265657621410308e-06, + "loss": 0.6046, + "step": 866 + }, + { + "epoch": 1.0395920815836832, + "grad_norm": 0.7180729110513119, + "learning_rate": 8.2603655225757e-06, + "loss": 0.6085, + "step": 867 + }, + { + "epoch": 1.0407918416316737, + "grad_norm": 0.9268142513774826, + "learning_rate": 8.255067061837908e-06, + "loss": 0.5917, + "step": 868 + }, + { + "epoch": 1.041991601679664, + "grad_norm": 0.7650068515227395, + "learning_rate": 8.249762249535738e-06, + "loss": 0.6483, + "step": 869 + }, + { + "epoch": 1.0431913617276545, + "grad_norm": 0.6173847612200957, + "learning_rate": 8.244451096020392e-06, + "loss": 0.6258, + "step": 870 + }, + { + "epoch": 1.044391121775645, + "grad_norm": 0.5401877202785061, + "learning_rate": 8.239133611655445e-06, + "loss": 0.6582, + "step": 871 + }, + { + "epoch": 1.0455908818236352, + "grad_norm": 0.8339913450126345, + "learning_rate": 8.233809806816826e-06, + "loss": 0.702, + "step": 872 + }, + { + "epoch": 1.0467906418716257, + "grad_norm": 0.7421783902223927, + "learning_rate": 8.228479691892798e-06, + "loss": 0.6043, + "step": 873 + }, + { + "epoch": 1.047990401919616, + "grad_norm": 0.6485869690651025, + "learning_rate": 8.223143277283937e-06, + "loss": 0.6802, + "step": 874 + }, + { + "epoch": 1.0491901619676065, + "grad_norm": 0.8376775424330809, + "learning_rate": 8.217800573403105e-06, + "loss": 0.5938, + "step": 875 + }, + { + "epoch": 1.0503899220155968, + "grad_norm": 0.6241377838561342, + "learning_rate": 8.212451590675445e-06, + "loss": 0.6256, + "step": 876 + }, + { + "epoch": 1.0515896820635873, + "grad_norm": 0.7333742917325202, + "learning_rate": 8.20709633953835e-06, + "loss": 0.6947, + "step": 877 + }, + { + "epoch": 1.0527894421115778, + "grad_norm": 0.7659926461397386, + "learning_rate": 8.20173483044144e-06, + "loss": 0.5769, + "step": 878 + }, + { + "epoch": 1.053989202159568, + "grad_norm": 0.5499942656083657, + "learning_rate": 8.196367073846548e-06, + "loss": 0.7144, + "step": 879 + }, + { + "epoch": 1.0551889622075585, + "grad_norm": 0.8596591269856485, + "learning_rate": 8.190993080227699e-06, + "loss": 0.6613, + "step": 880 + }, + { + "epoch": 1.0563887222555488, + "grad_norm": 0.799981814356054, + "learning_rate": 8.185612860071088e-06, + "loss": 0.5757, + "step": 881 + }, + { + "epoch": 1.0575884823035393, + "grad_norm": 0.5822804320615319, + "learning_rate": 8.18022642387506e-06, + "loss": 0.6399, + "step": 882 + }, + { + "epoch": 1.0587882423515298, + "grad_norm": 0.7513579482146487, + "learning_rate": 8.174833782150087e-06, + "loss": 0.6252, + "step": 883 + }, + { + "epoch": 1.05998800239952, + "grad_norm": 0.747723338953572, + "learning_rate": 8.169434945418753e-06, + "loss": 0.6896, + "step": 884 + }, + { + "epoch": 1.0611877624475106, + "grad_norm": 0.9348483655270661, + "learning_rate": 8.164029924215726e-06, + "loss": 0.7288, + "step": 885 + }, + { + "epoch": 1.0623875224955008, + "grad_norm": 0.8017838449602085, + "learning_rate": 8.158618729087746e-06, + "loss": 0.6362, + "step": 886 + }, + { + "epoch": 1.0635872825434913, + "grad_norm": 0.6526350086776308, + "learning_rate": 8.1532013705936e-06, + "loss": 0.6259, + "step": 887 + }, + { + "epoch": 1.0647870425914816, + "grad_norm": 0.7686722113617305, + "learning_rate": 8.147777859304095e-06, + "loss": 0.6296, + "step": 888 + }, + { + "epoch": 1.065986802639472, + "grad_norm": 0.7302411966675888, + "learning_rate": 8.142348205802053e-06, + "loss": 0.6991, + "step": 889 + }, + { + "epoch": 1.0671865626874626, + "grad_norm": 0.5601035737922527, + "learning_rate": 8.136912420682275e-06, + "loss": 0.6498, + "step": 890 + }, + { + "epoch": 1.0683863227354529, + "grad_norm": 0.805045152553524, + "learning_rate": 8.13147051455153e-06, + "loss": 0.7064, + "step": 891 + }, + { + "epoch": 1.0695860827834434, + "grad_norm": 0.5928026959639249, + "learning_rate": 8.126022498028527e-06, + "loss": 0.6043, + "step": 892 + }, + { + "epoch": 1.0707858428314336, + "grad_norm": 0.7909305590908955, + "learning_rate": 8.120568381743901e-06, + "loss": 0.6051, + "step": 893 + }, + { + "epoch": 1.0719856028794241, + "grad_norm": 0.8191799284024129, + "learning_rate": 8.11510817634019e-06, + "loss": 0.6285, + "step": 894 + }, + { + "epoch": 1.0731853629274146, + "grad_norm": 0.6985090097745722, + "learning_rate": 8.10964189247181e-06, + "loss": 0.5839, + "step": 895 + }, + { + "epoch": 1.074385122975405, + "grad_norm": 0.7633055880376761, + "learning_rate": 8.10416954080504e-06, + "loss": 0.6305, + "step": 896 + }, + { + "epoch": 1.0755848830233954, + "grad_norm": 0.7557111172959706, + "learning_rate": 8.098691132018004e-06, + "loss": 0.5791, + "step": 897 + }, + { + "epoch": 1.0767846430713857, + "grad_norm": 0.5709527709412924, + "learning_rate": 8.093206676800635e-06, + "loss": 0.6336, + "step": 898 + }, + { + "epoch": 1.0779844031193762, + "grad_norm": 0.6652917879121284, + "learning_rate": 8.087716185854673e-06, + "loss": 0.6635, + "step": 899 + }, + { + "epoch": 1.0791841631673664, + "grad_norm": 0.7647659312405806, + "learning_rate": 8.08221966989363e-06, + "loss": 0.6738, + "step": 900 + }, + { + "epoch": 1.080383923215357, + "grad_norm": 0.790523330347467, + "learning_rate": 8.076717139642775e-06, + "loss": 0.5743, + "step": 901 + }, + { + "epoch": 1.0815836832633474, + "grad_norm": 0.7415457762653487, + "learning_rate": 8.071208605839118e-06, + "loss": 0.6239, + "step": 902 + }, + { + "epoch": 1.0827834433113377, + "grad_norm": 0.6364375401106082, + "learning_rate": 8.065694079231379e-06, + "loss": 0.5959, + "step": 903 + }, + { + "epoch": 1.0839832033593282, + "grad_norm": 0.7322831814610964, + "learning_rate": 8.06017357057997e-06, + "loss": 0.7109, + "step": 904 + }, + { + "epoch": 1.0851829634073185, + "grad_norm": 0.7326664889329088, + "learning_rate": 8.05464709065698e-06, + "loss": 0.5572, + "step": 905 + }, + { + "epoch": 1.086382723455309, + "grad_norm": 0.8097688842435833, + "learning_rate": 8.049114650246149e-06, + "loss": 0.5968, + "step": 906 + }, + { + "epoch": 1.0875824835032994, + "grad_norm": 0.8082297761876786, + "learning_rate": 8.043576260142844e-06, + "loss": 0.5504, + "step": 907 + }, + { + "epoch": 1.0887822435512897, + "grad_norm": 0.7700931670572386, + "learning_rate": 8.038031931154044e-06, + "loss": 0.6451, + "step": 908 + }, + { + "epoch": 1.0899820035992802, + "grad_norm": 0.6369168250832012, + "learning_rate": 8.03248167409832e-06, + "loss": 0.6843, + "step": 909 + }, + { + "epoch": 1.0911817636472705, + "grad_norm": 0.7380681634425306, + "learning_rate": 8.026925499805802e-06, + "loss": 0.6699, + "step": 910 + }, + { + "epoch": 1.092381523695261, + "grad_norm": 0.6481000672770302, + "learning_rate": 8.021363419118173e-06, + "loss": 0.6482, + "step": 911 + }, + { + "epoch": 1.0935812837432513, + "grad_norm": 0.6254888477119547, + "learning_rate": 8.01579544288864e-06, + "loss": 0.6728, + "step": 912 + }, + { + "epoch": 1.0947810437912417, + "grad_norm": 0.8215650406361228, + "learning_rate": 8.010221581981912e-06, + "loss": 0.6579, + "step": 913 + }, + { + "epoch": 1.0959808038392322, + "grad_norm": 0.7334573953644762, + "learning_rate": 8.004641847274182e-06, + "loss": 0.6592, + "step": 914 + }, + { + "epoch": 1.0971805638872225, + "grad_norm": 0.64502117270513, + "learning_rate": 7.999056249653106e-06, + "loss": 0.67, + "step": 915 + }, + { + "epoch": 1.098380323935213, + "grad_norm": 0.7423289771175321, + "learning_rate": 7.993464800017774e-06, + "loss": 0.6587, + "step": 916 + }, + { + "epoch": 1.0995800839832033, + "grad_norm": 0.6328514541805109, + "learning_rate": 7.9878675092787e-06, + "loss": 0.6558, + "step": 917 + }, + { + "epoch": 1.1007798440311938, + "grad_norm": 0.6315258751159936, + "learning_rate": 7.9822643883578e-06, + "loss": 0.6311, + "step": 918 + }, + { + "epoch": 1.101979604079184, + "grad_norm": 0.7830994090026224, + "learning_rate": 7.976655448188356e-06, + "loss": 0.7285, + "step": 919 + }, + { + "epoch": 1.1031793641271745, + "grad_norm": 0.656409629518676, + "learning_rate": 7.971040699715008e-06, + "loss": 0.5734, + "step": 920 + }, + { + "epoch": 1.104379124175165, + "grad_norm": 0.7995809349422637, + "learning_rate": 7.965420153893741e-06, + "loss": 0.6572, + "step": 921 + }, + { + "epoch": 1.1055788842231553, + "grad_norm": 0.6045764821458152, + "learning_rate": 7.959793821691838e-06, + "loss": 0.6165, + "step": 922 + }, + { + "epoch": 1.1067786442711458, + "grad_norm": 0.6254737760421022, + "learning_rate": 7.954161714087876e-06, + "loss": 0.6607, + "step": 923 + }, + { + "epoch": 1.107978404319136, + "grad_norm": 0.8767118032126044, + "learning_rate": 7.948523842071707e-06, + "loss": 0.6039, + "step": 924 + }, + { + "epoch": 1.1091781643671266, + "grad_norm": 0.6164747668994374, + "learning_rate": 7.942880216644426e-06, + "loss": 0.6112, + "step": 925 + }, + { + "epoch": 1.110377924415117, + "grad_norm": 0.7393019211191909, + "learning_rate": 7.937230848818355e-06, + "loss": 0.5887, + "step": 926 + }, + { + "epoch": 1.1115776844631073, + "grad_norm": 0.7090151969865134, + "learning_rate": 7.931575749617027e-06, + "loss": 0.6331, + "step": 927 + }, + { + "epoch": 1.1127774445110978, + "grad_norm": 0.8196729176899866, + "learning_rate": 7.925914930075147e-06, + "loss": 0.645, + "step": 928 + }, + { + "epoch": 1.113977204559088, + "grad_norm": 0.5956719830391075, + "learning_rate": 7.920248401238593e-06, + "loss": 0.5781, + "step": 929 + }, + { + "epoch": 1.1151769646070786, + "grad_norm": 0.6762218315155455, + "learning_rate": 7.914576174164378e-06, + "loss": 0.6568, + "step": 930 + }, + { + "epoch": 1.116376724655069, + "grad_norm": 0.636496405234365, + "learning_rate": 7.908898259920636e-06, + "loss": 0.7224, + "step": 931 + }, + { + "epoch": 1.1175764847030594, + "grad_norm": 0.865234836801879, + "learning_rate": 7.903214669586596e-06, + "loss": 0.6938, + "step": 932 + }, + { + "epoch": 1.1187762447510499, + "grad_norm": 0.8873977059438501, + "learning_rate": 7.897525414252564e-06, + "loss": 0.6277, + "step": 933 + }, + { + "epoch": 1.1199760047990401, + "grad_norm": 0.7977055991778375, + "learning_rate": 7.891830505019905e-06, + "loss": 0.6089, + "step": 934 + }, + { + "epoch": 1.1211757648470306, + "grad_norm": 0.7985938779341601, + "learning_rate": 7.886129953001002e-06, + "loss": 0.6699, + "step": 935 + }, + { + "epoch": 1.122375524895021, + "grad_norm": 0.7529036412800081, + "learning_rate": 7.880423769319265e-06, + "loss": 0.6115, + "step": 936 + }, + { + "epoch": 1.1235752849430114, + "grad_norm": 0.7728641750560808, + "learning_rate": 7.874711965109085e-06, + "loss": 0.5787, + "step": 937 + }, + { + "epoch": 1.124775044991002, + "grad_norm": 0.7603933905232403, + "learning_rate": 7.86899455151582e-06, + "loss": 0.6112, + "step": 938 + }, + { + "epoch": 1.1259748050389922, + "grad_norm": 0.8582898047056604, + "learning_rate": 7.863271539695777e-06, + "loss": 0.6396, + "step": 939 + }, + { + "epoch": 1.1271745650869827, + "grad_norm": 0.5942730046106529, + "learning_rate": 7.857542940816183e-06, + "loss": 0.5919, + "step": 940 + }, + { + "epoch": 1.128374325134973, + "grad_norm": 0.6646323437342261, + "learning_rate": 7.85180876605517e-06, + "loss": 0.576, + "step": 941 + }, + { + "epoch": 1.1295740851829634, + "grad_norm": 0.7939071629516148, + "learning_rate": 7.846069026601745e-06, + "loss": 0.6983, + "step": 942 + }, + { + "epoch": 1.1307738452309537, + "grad_norm": 0.7676942847848861, + "learning_rate": 7.84032373365578e-06, + "loss": 0.6937, + "step": 943 + }, + { + "epoch": 1.1319736052789442, + "grad_norm": 0.6185861404100774, + "learning_rate": 7.83457289842798e-06, + "loss": 0.6797, + "step": 944 + }, + { + "epoch": 1.1331733653269347, + "grad_norm": 0.8840067772768776, + "learning_rate": 7.828816532139866e-06, + "loss": 0.6588, + "step": 945 + }, + { + "epoch": 1.134373125374925, + "grad_norm": 0.720664166016145, + "learning_rate": 7.823054646023749e-06, + "loss": 0.6417, + "step": 946 + }, + { + "epoch": 1.1355728854229155, + "grad_norm": 0.655102377172491, + "learning_rate": 7.817287251322712e-06, + "loss": 0.6567, + "step": 947 + }, + { + "epoch": 1.1367726454709057, + "grad_norm": 0.7295884468134075, + "learning_rate": 7.811514359290591e-06, + "loss": 0.6704, + "step": 948 + }, + { + "epoch": 1.1379724055188962, + "grad_norm": 0.6055585752339337, + "learning_rate": 7.805735981191939e-06, + "loss": 0.6365, + "step": 949 + }, + { + "epoch": 1.1391721655668867, + "grad_norm": 0.9045297618528259, + "learning_rate": 7.799952128302027e-06, + "loss": 0.6148, + "step": 950 + }, + { + "epoch": 1.140371925614877, + "grad_norm": 0.7573009176463668, + "learning_rate": 7.794162811906798e-06, + "loss": 0.6422, + "step": 951 + }, + { + "epoch": 1.1415716856628675, + "grad_norm": 0.5189738295277829, + "learning_rate": 7.78836804330286e-06, + "loss": 0.6972, + "step": 952 + }, + { + "epoch": 1.1427714457108578, + "grad_norm": 0.761896631397838, + "learning_rate": 7.782567833797458e-06, + "loss": 0.6136, + "step": 953 + }, + { + "epoch": 1.1439712057588483, + "grad_norm": 1.1206412330960442, + "learning_rate": 7.776762194708459e-06, + "loss": 0.6781, + "step": 954 + }, + { + "epoch": 1.1451709658068387, + "grad_norm": 0.582336141346009, + "learning_rate": 7.770951137364317e-06, + "loss": 0.6204, + "step": 955 + }, + { + "epoch": 1.146370725854829, + "grad_norm": 0.711849105566516, + "learning_rate": 7.765134673104065e-06, + "loss": 0.7008, + "step": 956 + }, + { + "epoch": 1.1475704859028195, + "grad_norm": 0.7048388709901285, + "learning_rate": 7.759312813277285e-06, + "loss": 0.6661, + "step": 957 + }, + { + "epoch": 1.1487702459508098, + "grad_norm": 0.7260048434291833, + "learning_rate": 7.753485569244083e-06, + "loss": 0.6246, + "step": 958 + }, + { + "epoch": 1.1499700059988003, + "grad_norm": 0.6785224096803231, + "learning_rate": 7.747652952375078e-06, + "loss": 0.613, + "step": 959 + }, + { + "epoch": 1.1511697660467906, + "grad_norm": 0.7560505404058467, + "learning_rate": 7.741814974051366e-06, + "loss": 0.5887, + "step": 960 + }, + { + "epoch": 1.152369526094781, + "grad_norm": 0.7965630156892449, + "learning_rate": 7.73597164566451e-06, + "loss": 0.6341, + "step": 961 + }, + { + "epoch": 1.1535692861427713, + "grad_norm": 0.6923330802478767, + "learning_rate": 7.730122978616511e-06, + "loss": 0.6093, + "step": 962 + }, + { + "epoch": 1.1547690461907618, + "grad_norm": 0.6879908653795777, + "learning_rate": 7.724268984319785e-06, + "loss": 0.6887, + "step": 963 + }, + { + "epoch": 1.1559688062387523, + "grad_norm": 0.6108013120938313, + "learning_rate": 7.718409674197147e-06, + "loss": 0.6261, + "step": 964 + }, + { + "epoch": 1.1571685662867426, + "grad_norm": 0.7592662810855833, + "learning_rate": 7.71254505968178e-06, + "loss": 0.5789, + "step": 965 + }, + { + "epoch": 1.158368326334733, + "grad_norm": 0.803730538996608, + "learning_rate": 7.70667515221722e-06, + "loss": 0.6657, + "step": 966 + }, + { + "epoch": 1.1595680863827234, + "grad_norm": 0.9674342724300058, + "learning_rate": 7.700799963257335e-06, + "loss": 0.6145, + "step": 967 + }, + { + "epoch": 1.1607678464307138, + "grad_norm": 0.5854325160848438, + "learning_rate": 7.69491950426629e-06, + "loss": 0.6678, + "step": 968 + }, + { + "epoch": 1.1619676064787043, + "grad_norm": 0.6718578895332913, + "learning_rate": 7.689033786718539e-06, + "loss": 0.6125, + "step": 969 + }, + { + "epoch": 1.1631673665266946, + "grad_norm": 0.7289857412083824, + "learning_rate": 7.683142822098795e-06, + "loss": 0.6835, + "step": 970 + }, + { + "epoch": 1.164367126574685, + "grad_norm": 0.5641877014830333, + "learning_rate": 7.677246621902014e-06, + "loss": 0.6334, + "step": 971 + }, + { + "epoch": 1.1655668866226754, + "grad_norm": 0.8539371224490931, + "learning_rate": 7.671345197633361e-06, + "loss": 0.6436, + "step": 972 + }, + { + "epoch": 1.1667666466706659, + "grad_norm": 0.8045170582715613, + "learning_rate": 7.6654385608082e-06, + "loss": 0.6559, + "step": 973 + }, + { + "epoch": 1.1679664067186564, + "grad_norm": 0.565700728135143, + "learning_rate": 7.659526722952066e-06, + "loss": 0.5513, + "step": 974 + }, + { + "epoch": 1.1691661667666466, + "grad_norm": 0.8743238461492279, + "learning_rate": 7.653609695600636e-06, + "loss": 0.6148, + "step": 975 + }, + { + "epoch": 1.1703659268146371, + "grad_norm": 0.6122517836173725, + "learning_rate": 7.647687490299723e-06, + "loss": 0.7412, + "step": 976 + }, + { + "epoch": 1.1715656868626274, + "grad_norm": 0.7449414224622495, + "learning_rate": 7.641760118605237e-06, + "loss": 0.6942, + "step": 977 + }, + { + "epoch": 1.172765446910618, + "grad_norm": 0.877399381789804, + "learning_rate": 7.635827592083168e-06, + "loss": 0.6523, + "step": 978 + }, + { + "epoch": 1.1739652069586084, + "grad_norm": 0.6643133900992996, + "learning_rate": 7.629889922309576e-06, + "loss": 0.5856, + "step": 979 + }, + { + "epoch": 1.1751649670065987, + "grad_norm": 0.728087789468, + "learning_rate": 7.6239471208705404e-06, + "loss": 0.6325, + "step": 980 + }, + { + "epoch": 1.1763647270545892, + "grad_norm": 1.2168375793316153, + "learning_rate": 7.617999199362166e-06, + "loss": 0.6837, + "step": 981 + }, + { + "epoch": 1.1775644871025794, + "grad_norm": 0.5792305435721297, + "learning_rate": 7.612046169390543e-06, + "loss": 0.6752, + "step": 982 + }, + { + "epoch": 1.17876424715057, + "grad_norm": 0.758139899265491, + "learning_rate": 7.606088042571731e-06, + "loss": 0.5868, + "step": 983 + }, + { + "epoch": 1.1799640071985602, + "grad_norm": 0.7826574118053469, + "learning_rate": 7.600124830531737e-06, + "loss": 0.5982, + "step": 984 + }, + { + "epoch": 1.1811637672465507, + "grad_norm": 0.6000356797051724, + "learning_rate": 7.594156544906484e-06, + "loss": 0.6364, + "step": 985 + }, + { + "epoch": 1.182363527294541, + "grad_norm": 0.9687253001599708, + "learning_rate": 7.588183197341805e-06, + "loss": 0.6904, + "step": 986 + }, + { + "epoch": 1.1835632873425315, + "grad_norm": 0.6188352397748479, + "learning_rate": 7.582204799493402e-06, + "loss": 0.7003, + "step": 987 + }, + { + "epoch": 1.184763047390522, + "grad_norm": 0.7202260204090514, + "learning_rate": 7.576221363026834e-06, + "loss": 0.684, + "step": 988 + }, + { + "epoch": 1.1859628074385122, + "grad_norm": 0.7586597687175531, + "learning_rate": 7.570232899617497e-06, + "loss": 0.6204, + "step": 989 + }, + { + "epoch": 1.1871625674865027, + "grad_norm": 0.6651953021064884, + "learning_rate": 7.564239420950587e-06, + "loss": 0.718, + "step": 990 + }, + { + "epoch": 1.188362327534493, + "grad_norm": 0.8721655660530172, + "learning_rate": 7.558240938721091e-06, + "loss": 0.6177, + "step": 991 + }, + { + "epoch": 1.1895620875824835, + "grad_norm": 0.8435311574009753, + "learning_rate": 7.55223746463376e-06, + "loss": 0.6586, + "step": 992 + }, + { + "epoch": 1.190761847630474, + "grad_norm": 0.622077937555054, + "learning_rate": 7.546229010403086e-06, + "loss": 0.6699, + "step": 993 + }, + { + "epoch": 1.1919616076784643, + "grad_norm": 0.5482286323460653, + "learning_rate": 7.540215587753274e-06, + "loss": 0.5602, + "step": 994 + }, + { + "epoch": 1.1931613677264548, + "grad_norm": 0.7358949985009436, + "learning_rate": 7.534197208418228e-06, + "loss": 0.6869, + "step": 995 + }, + { + "epoch": 1.194361127774445, + "grad_norm": 0.6697867463290572, + "learning_rate": 7.5281738841415256e-06, + "loss": 0.6525, + "step": 996 + }, + { + "epoch": 1.1955608878224355, + "grad_norm": 0.8182406510517298, + "learning_rate": 7.522145626676386e-06, + "loss": 0.6842, + "step": 997 + }, + { + "epoch": 1.196760647870426, + "grad_norm": 0.5186713299722027, + "learning_rate": 7.5161124477856636e-06, + "loss": 0.613, + "step": 998 + }, + { + "epoch": 1.1979604079184163, + "grad_norm": 0.8244793743836701, + "learning_rate": 7.510074359241808e-06, + "loss": 0.6283, + "step": 999 + }, + { + "epoch": 1.1991601679664068, + "grad_norm": 0.6792712021878325, + "learning_rate": 7.5040313728268544e-06, + "loss": 0.6142, + "step": 1000 + }, + { + "epoch": 1.200359928014397, + "grad_norm": 0.6717251990019677, + "learning_rate": 7.497983500332392e-06, + "loss": 0.6295, + "step": 1001 + }, + { + "epoch": 1.2015596880623876, + "grad_norm": 0.5937700301061093, + "learning_rate": 7.491930753559547e-06, + "loss": 0.5831, + "step": 1002 + }, + { + "epoch": 1.202759448110378, + "grad_norm": 0.743402154852896, + "learning_rate": 7.4858731443189526e-06, + "loss": 0.6018, + "step": 1003 + }, + { + "epoch": 1.2039592081583683, + "grad_norm": 0.6412286431039477, + "learning_rate": 7.4798106844307325e-06, + "loss": 0.7176, + "step": 1004 + }, + { + "epoch": 1.2051589682063588, + "grad_norm": 0.712722659936385, + "learning_rate": 7.473743385724478e-06, + "loss": 0.6298, + "step": 1005 + }, + { + "epoch": 1.206358728254349, + "grad_norm": 0.6689458591312148, + "learning_rate": 7.467671260039218e-06, + "loss": 0.6576, + "step": 1006 + }, + { + "epoch": 1.2075584883023396, + "grad_norm": 0.7929731468061676, + "learning_rate": 7.4615943192234e-06, + "loss": 0.6488, + "step": 1007 + }, + { + "epoch": 1.2087582483503299, + "grad_norm": 0.8070861175008949, + "learning_rate": 7.4555125751348735e-06, + "loss": 0.6181, + "step": 1008 + }, + { + "epoch": 1.2099580083983204, + "grad_norm": 0.6500828766089807, + "learning_rate": 7.449426039640853e-06, + "loss": 0.6571, + "step": 1009 + }, + { + "epoch": 1.2111577684463106, + "grad_norm": 0.6679890203007678, + "learning_rate": 7.443334724617904e-06, + "loss": 0.6086, + "step": 1010 + }, + { + "epoch": 1.2123575284943011, + "grad_norm": 0.5709805644821978, + "learning_rate": 7.437238641951922e-06, + "loss": 0.6289, + "step": 1011 + }, + { + "epoch": 1.2135572885422916, + "grad_norm": 0.6959911040226187, + "learning_rate": 7.431137803538105e-06, + "loss": 0.7094, + "step": 1012 + }, + { + "epoch": 1.2147570485902819, + "grad_norm": 0.6389305357842437, + "learning_rate": 7.425032221280925e-06, + "loss": 0.6112, + "step": 1013 + }, + { + "epoch": 1.2159568086382724, + "grad_norm": 0.6980973604370527, + "learning_rate": 7.418921907094117e-06, + "loss": 0.6821, + "step": 1014 + }, + { + "epoch": 1.2171565686862627, + "grad_norm": 0.8056526394131966, + "learning_rate": 7.412806872900648e-06, + "loss": 0.5926, + "step": 1015 + }, + { + "epoch": 1.2183563287342531, + "grad_norm": 0.5509146634760903, + "learning_rate": 7.406687130632692e-06, + "loss": 0.6358, + "step": 1016 + }, + { + "epoch": 1.2195560887822436, + "grad_norm": 0.6980703149971923, + "learning_rate": 7.400562692231611e-06, + "loss": 0.6635, + "step": 1017 + }, + { + "epoch": 1.220755848830234, + "grad_norm": 0.6236557163845787, + "learning_rate": 7.394433569647935e-06, + "loss": 0.669, + "step": 1018 + }, + { + "epoch": 1.2219556088782244, + "grad_norm": 0.6846420411643493, + "learning_rate": 7.3882997748413285e-06, + "loss": 0.6305, + "step": 1019 + }, + { + "epoch": 1.2231553689262147, + "grad_norm": 0.7357860502422784, + "learning_rate": 7.382161319780574e-06, + "loss": 0.719, + "step": 1020 + }, + { + "epoch": 1.2243551289742052, + "grad_norm": 0.6768304157113985, + "learning_rate": 7.37601821644355e-06, + "loss": 0.6303, + "step": 1021 + }, + { + "epoch": 1.2255548890221957, + "grad_norm": 0.7669717553744075, + "learning_rate": 7.3698704768172026e-06, + "loss": 0.7411, + "step": 1022 + }, + { + "epoch": 1.226754649070186, + "grad_norm": 0.7701810105912146, + "learning_rate": 7.363718112897525e-06, + "loss": 0.6904, + "step": 1023 + }, + { + "epoch": 1.2279544091181764, + "grad_norm": 0.6981186558383136, + "learning_rate": 7.357561136689536e-06, + "loss": 0.6152, + "step": 1024 + }, + { + "epoch": 1.2291541691661667, + "grad_norm": 0.6666100717443457, + "learning_rate": 7.351399560207253e-06, + "loss": 0.6399, + "step": 1025 + }, + { + "epoch": 1.2303539292141572, + "grad_norm": 0.6869300699310232, + "learning_rate": 7.345233395473664e-06, + "loss": 0.7237, + "step": 1026 + }, + { + "epoch": 1.2315536892621475, + "grad_norm": 0.6965197593345103, + "learning_rate": 7.339062654520724e-06, + "loss": 0.6339, + "step": 1027 + }, + { + "epoch": 1.232753449310138, + "grad_norm": 0.6292232680942303, + "learning_rate": 7.332887349389301e-06, + "loss": 0.6196, + "step": 1028 + }, + { + "epoch": 1.2339532093581285, + "grad_norm": 0.7687665423496974, + "learning_rate": 7.3267074921291795e-06, + "loss": 0.6551, + "step": 1029 + }, + { + "epoch": 1.2351529694061187, + "grad_norm": 0.6249840034898358, + "learning_rate": 7.320523094799025e-06, + "loss": 0.6302, + "step": 1030 + }, + { + "epoch": 1.2363527294541092, + "grad_norm": 0.6468018329037355, + "learning_rate": 7.3143341694663604e-06, + "loss": 0.6683, + "step": 1031 + }, + { + "epoch": 1.2375524895020995, + "grad_norm": 0.7867545650766382, + "learning_rate": 7.308140728207544e-06, + "loss": 0.6594, + "step": 1032 + }, + { + "epoch": 1.23875224955009, + "grad_norm": 0.7323728579960705, + "learning_rate": 7.301942783107747e-06, + "loss": 0.6151, + "step": 1033 + }, + { + "epoch": 1.2399520095980803, + "grad_norm": 0.7095345504633144, + "learning_rate": 7.2957403462609276e-06, + "loss": 0.6424, + "step": 1034 + }, + { + "epoch": 1.2411517696460708, + "grad_norm": 0.7230751286642569, + "learning_rate": 7.289533429769812e-06, + "loss": 0.6643, + "step": 1035 + }, + { + "epoch": 1.2423515296940613, + "grad_norm": 0.7438701965456714, + "learning_rate": 7.2833220457458596e-06, + "loss": 0.639, + "step": 1036 + }, + { + "epoch": 1.2435512897420515, + "grad_norm": 0.6573622376094815, + "learning_rate": 7.277106206309258e-06, + "loss": 0.6497, + "step": 1037 + }, + { + "epoch": 1.244751049790042, + "grad_norm": 0.7426817035433573, + "learning_rate": 7.270885923588879e-06, + "loss": 0.6933, + "step": 1038 + }, + { + "epoch": 1.2459508098380323, + "grad_norm": 0.6219922286802084, + "learning_rate": 7.26466120972227e-06, + "loss": 0.6609, + "step": 1039 + }, + { + "epoch": 1.2471505698860228, + "grad_norm": 0.7422857469589963, + "learning_rate": 7.258432076855625e-06, + "loss": 0.6354, + "step": 1040 + }, + { + "epoch": 1.2483503299340133, + "grad_norm": 0.8573633191476585, + "learning_rate": 7.2521985371437575e-06, + "loss": 0.6304, + "step": 1041 + }, + { + "epoch": 1.2495500899820036, + "grad_norm": 0.8093256278783655, + "learning_rate": 7.245960602750081e-06, + "loss": 0.6303, + "step": 1042 + }, + { + "epoch": 1.250749850029994, + "grad_norm": 0.581070725446635, + "learning_rate": 7.239718285846586e-06, + "loss": 0.581, + "step": 1043 + }, + { + "epoch": 1.2519496100779843, + "grad_norm": 0.6549868295461385, + "learning_rate": 7.233471598613815e-06, + "loss": 0.6951, + "step": 1044 + }, + { + "epoch": 1.2531493701259748, + "grad_norm": 0.9138529056050184, + "learning_rate": 7.227220553240835e-06, + "loss": 0.6326, + "step": 1045 + }, + { + "epoch": 1.2543491301739653, + "grad_norm": 0.704514262769512, + "learning_rate": 7.220965161925215e-06, + "loss": 0.5955, + "step": 1046 + }, + { + "epoch": 1.2555488902219556, + "grad_norm": 0.813534183420755, + "learning_rate": 7.214705436873016e-06, + "loss": 0.6434, + "step": 1047 + }, + { + "epoch": 1.256748650269946, + "grad_norm": 0.7262085566877929, + "learning_rate": 7.208441390298742e-06, + "loss": 0.6835, + "step": 1048 + }, + { + "epoch": 1.2579484103179364, + "grad_norm": 0.550271216349488, + "learning_rate": 7.202173034425334e-06, + "loss": 0.6457, + "step": 1049 + }, + { + "epoch": 1.2591481703659269, + "grad_norm": 0.7468724934410274, + "learning_rate": 7.195900381484144e-06, + "loss": 0.6574, + "step": 1050 + }, + { + "epoch": 1.2603479304139174, + "grad_norm": 0.572209941881098, + "learning_rate": 7.189623443714908e-06, + "loss": 0.6297, + "step": 1051 + }, + { + "epoch": 1.2615476904619076, + "grad_norm": 0.8468906643732181, + "learning_rate": 7.183342233365721e-06, + "loss": 0.6229, + "step": 1052 + }, + { + "epoch": 1.262747450509898, + "grad_norm": 0.6943785807969055, + "learning_rate": 7.177056762693018e-06, + "loss": 0.6624, + "step": 1053 + }, + { + "epoch": 1.2639472105578884, + "grad_norm": 0.6925625991995398, + "learning_rate": 7.170767043961542e-06, + "loss": 0.6311, + "step": 1054 + }, + { + "epoch": 1.2651469706058789, + "grad_norm": 0.6762621801494739, + "learning_rate": 7.1644730894443305e-06, + "loss": 0.5871, + "step": 1055 + }, + { + "epoch": 1.2663467306538692, + "grad_norm": 0.6459618641769452, + "learning_rate": 7.158174911422685e-06, + "loss": 0.6532, + "step": 1056 + }, + { + "epoch": 1.2675464907018597, + "grad_norm": 0.8083476414846833, + "learning_rate": 7.151872522186147e-06, + "loss": 0.594, + "step": 1057 + }, + { + "epoch": 1.26874625074985, + "grad_norm": 0.6951131175229895, + "learning_rate": 7.145565934032472e-06, + "loss": 0.624, + "step": 1058 + }, + { + "epoch": 1.2699460107978404, + "grad_norm": 0.5042926125416562, + "learning_rate": 7.139255159267619e-06, + "loss": 0.6358, + "step": 1059 + }, + { + "epoch": 1.271145770845831, + "grad_norm": 0.9478685294428895, + "learning_rate": 7.132940210205705e-06, + "loss": 0.5738, + "step": 1060 + }, + { + "epoch": 1.2723455308938212, + "grad_norm": 0.694451255964263, + "learning_rate": 7.126621099168998e-06, + "loss": 0.6865, + "step": 1061 + }, + { + "epoch": 1.2735452909418117, + "grad_norm": 0.6123286837725577, + "learning_rate": 7.120297838487887e-06, + "loss": 0.622, + "step": 1062 + }, + { + "epoch": 1.274745050989802, + "grad_norm": 0.8242058769471933, + "learning_rate": 7.113970440500858e-06, + "loss": 0.6465, + "step": 1063 + }, + { + "epoch": 1.2759448110377924, + "grad_norm": 0.6682649397103331, + "learning_rate": 7.107638917554468e-06, + "loss": 0.6296, + "step": 1064 + }, + { + "epoch": 1.277144571085783, + "grad_norm": 0.5673261780539186, + "learning_rate": 7.101303282003324e-06, + "loss": 0.585, + "step": 1065 + }, + { + "epoch": 1.2783443311337732, + "grad_norm": 0.7034650786971746, + "learning_rate": 7.094963546210059e-06, + "loss": 0.6322, + "step": 1066 + }, + { + "epoch": 1.2795440911817637, + "grad_norm": 0.6856469553412491, + "learning_rate": 7.088619722545307e-06, + "loss": 0.6143, + "step": 1067 + }, + { + "epoch": 1.280743851229754, + "grad_norm": 0.6117789618536958, + "learning_rate": 7.082271823387675e-06, + "loss": 0.6641, + "step": 1068 + }, + { + "epoch": 1.2819436112777445, + "grad_norm": 0.7097706181718433, + "learning_rate": 7.07591986112373e-06, + "loss": 0.7064, + "step": 1069 + }, + { + "epoch": 1.283143371325735, + "grad_norm": 0.6841325747730488, + "learning_rate": 7.0695638481479565e-06, + "loss": 0.6184, + "step": 1070 + }, + { + "epoch": 1.2843431313737252, + "grad_norm": 0.8318445369442227, + "learning_rate": 7.063203796862752e-06, + "loss": 0.6399, + "step": 1071 + }, + { + "epoch": 1.2855428914217157, + "grad_norm": 0.4578231661291545, + "learning_rate": 7.056839719678391e-06, + "loss": 0.6473, + "step": 1072 + }, + { + "epoch": 1.286742651469706, + "grad_norm": 0.8006195088069488, + "learning_rate": 7.050471629013003e-06, + "loss": 0.6145, + "step": 1073 + }, + { + "epoch": 1.2879424115176965, + "grad_norm": 0.6727364270890672, + "learning_rate": 7.044099537292548e-06, + "loss": 0.6281, + "step": 1074 + }, + { + "epoch": 1.289142171565687, + "grad_norm": 1.0538331322539352, + "learning_rate": 7.037723456950797e-06, + "loss": 0.6266, + "step": 1075 + }, + { + "epoch": 1.2903419316136773, + "grad_norm": 0.626784875264221, + "learning_rate": 7.0313434004293015e-06, + "loss": 0.6819, + "step": 1076 + }, + { + "epoch": 1.2915416916616675, + "grad_norm": 0.5464614778448484, + "learning_rate": 7.02495938017737e-06, + "loss": 0.586, + "step": 1077 + }, + { + "epoch": 1.292741451709658, + "grad_norm": 0.6690811552578267, + "learning_rate": 7.018571408652045e-06, + "loss": 0.6915, + "step": 1078 + }, + { + "epoch": 1.2939412117576485, + "grad_norm": 0.730862264750046, + "learning_rate": 7.012179498318088e-06, + "loss": 0.6646, + "step": 1079 + }, + { + "epoch": 1.2951409718056388, + "grad_norm": 0.5329908348747406, + "learning_rate": 7.0057836616479334e-06, + "loss": 0.7027, + "step": 1080 + }, + { + "epoch": 1.2963407318536293, + "grad_norm": 0.8233431844664252, + "learning_rate": 6.9993839111216865e-06, + "loss": 0.6272, + "step": 1081 + }, + { + "epoch": 1.2975404919016196, + "grad_norm": 0.6808125676138477, + "learning_rate": 6.992980259227085e-06, + "loss": 0.6851, + "step": 1082 + }, + { + "epoch": 1.29874025194961, + "grad_norm": 0.9643095276335473, + "learning_rate": 6.986572718459479e-06, + "loss": 0.6963, + "step": 1083 + }, + { + "epoch": 1.2999400119976006, + "grad_norm": 0.44399583003612864, + "learning_rate": 6.98016130132181e-06, + "loss": 0.5905, + "step": 1084 + }, + { + "epoch": 1.3011397720455908, + "grad_norm": 0.7485852209698809, + "learning_rate": 6.973746020324581e-06, + "loss": 0.6267, + "step": 1085 + }, + { + "epoch": 1.3023395320935813, + "grad_norm": 0.6343967189087504, + "learning_rate": 6.967326887985837e-06, + "loss": 0.5925, + "step": 1086 + }, + { + "epoch": 1.3035392921415716, + "grad_norm": 0.8508973665476871, + "learning_rate": 6.960903916831132e-06, + "loss": 0.6478, + "step": 1087 + }, + { + "epoch": 1.304739052189562, + "grad_norm": 0.49693666140528264, + "learning_rate": 6.954477119393521e-06, + "loss": 0.6109, + "step": 1088 + }, + { + "epoch": 1.3059388122375526, + "grad_norm": 0.768961587133166, + "learning_rate": 6.948046508213515e-06, + "loss": 0.6555, + "step": 1089 + }, + { + "epoch": 1.3071385722855429, + "grad_norm": 0.7314059891873381, + "learning_rate": 6.94161209583907e-06, + "loss": 0.6874, + "step": 1090 + }, + { + "epoch": 1.3083383323335334, + "grad_norm": 0.7742341576454875, + "learning_rate": 6.9351738948255645e-06, + "loss": 0.6901, + "step": 1091 + }, + { + "epoch": 1.3095380923815236, + "grad_norm": 0.5975730969202686, + "learning_rate": 6.9287319177357615e-06, + "loss": 0.7018, + "step": 1092 + }, + { + "epoch": 1.3107378524295141, + "grad_norm": 0.764953793274789, + "learning_rate": 6.922286177139797e-06, + "loss": 0.5935, + "step": 1093 + }, + { + "epoch": 1.3119376124775046, + "grad_norm": 0.6493804506074498, + "learning_rate": 6.915836685615149e-06, + "loss": 0.6654, + "step": 1094 + }, + { + "epoch": 1.313137372525495, + "grad_norm": 0.7936377495763912, + "learning_rate": 6.9093834557466164e-06, + "loss": 0.6357, + "step": 1095 + }, + { + "epoch": 1.3143371325734852, + "grad_norm": 0.5796146895088004, + "learning_rate": 6.902926500126292e-06, + "loss": 0.6089, + "step": 1096 + }, + { + "epoch": 1.3155368926214757, + "grad_norm": 0.746242101409277, + "learning_rate": 6.896465831353535e-06, + "loss": 0.7047, + "step": 1097 + }, + { + "epoch": 1.3167366526694662, + "grad_norm": 0.5367920357525211, + "learning_rate": 6.890001462034957e-06, + "loss": 0.6199, + "step": 1098 + }, + { + "epoch": 1.3179364127174564, + "grad_norm": 0.7319144508274227, + "learning_rate": 6.883533404784384e-06, + "loss": 0.5796, + "step": 1099 + }, + { + "epoch": 1.319136172765447, + "grad_norm": 0.8063686483869226, + "learning_rate": 6.877061672222842e-06, + "loss": 0.6073, + "step": 1100 + }, + { + "epoch": 1.3203359328134372, + "grad_norm": 0.7251354738596527, + "learning_rate": 6.870586276978526e-06, + "loss": 0.6583, + "step": 1101 + }, + { + "epoch": 1.3215356928614277, + "grad_norm": 0.6011859592461312, + "learning_rate": 6.864107231686781e-06, + "loss": 0.6082, + "step": 1102 + }, + { + "epoch": 1.3227354529094182, + "grad_norm": 0.6832987527658642, + "learning_rate": 6.857624548990071e-06, + "loss": 0.6734, + "step": 1103 + }, + { + "epoch": 1.3239352129574085, + "grad_norm": 0.8621672410348915, + "learning_rate": 6.851138241537961e-06, + "loss": 0.6446, + "step": 1104 + }, + { + "epoch": 1.325134973005399, + "grad_norm": 0.6048231145547539, + "learning_rate": 6.8446483219870855e-06, + "loss": 0.6603, + "step": 1105 + }, + { + "epoch": 1.3263347330533892, + "grad_norm": 0.8589489889623938, + "learning_rate": 6.838154803001131e-06, + "loss": 0.7011, + "step": 1106 + }, + { + "epoch": 1.3275344931013797, + "grad_norm": 0.6661232123237989, + "learning_rate": 6.831657697250801e-06, + "loss": 0.637, + "step": 1107 + }, + { + "epoch": 1.3287342531493702, + "grad_norm": 0.6147162030404598, + "learning_rate": 6.825157017413808e-06, + "loss": 0.5849, + "step": 1108 + }, + { + "epoch": 1.3299340131973605, + "grad_norm": 0.7662173418303564, + "learning_rate": 6.818652776174828e-06, + "loss": 0.7015, + "step": 1109 + }, + { + "epoch": 1.331133773245351, + "grad_norm": 0.45937740526986315, + "learning_rate": 6.812144986225493e-06, + "loss": 0.6484, + "step": 1110 + }, + { + "epoch": 1.3323335332933413, + "grad_norm": 0.5298575907336551, + "learning_rate": 6.805633660264358e-06, + "loss": 0.6436, + "step": 1111 + }, + { + "epoch": 1.3335332933413317, + "grad_norm": 0.7406696355689192, + "learning_rate": 6.7991188109968766e-06, + "loss": 0.671, + "step": 1112 + }, + { + "epoch": 1.3347330533893222, + "grad_norm": 0.6034998910180078, + "learning_rate": 6.792600451135378e-06, + "loss": 0.5948, + "step": 1113 + }, + { + "epoch": 1.3359328134373125, + "grad_norm": 0.6245548231844243, + "learning_rate": 6.786078593399041e-06, + "loss": 0.6184, + "step": 1114 + }, + { + "epoch": 1.337132573485303, + "grad_norm": 0.7830690088927433, + "learning_rate": 6.779553250513875e-06, + "loss": 0.6353, + "step": 1115 + }, + { + "epoch": 1.3383323335332933, + "grad_norm": 0.5056698958171423, + "learning_rate": 6.773024435212678e-06, + "loss": 0.6272, + "step": 1116 + }, + { + "epoch": 1.3395320935812838, + "grad_norm": 0.8131497915203822, + "learning_rate": 6.766492160235037e-06, + "loss": 0.5875, + "step": 1117 + }, + { + "epoch": 1.3407318536292743, + "grad_norm": 0.7013985136781519, + "learning_rate": 6.759956438327283e-06, + "loss": 0.6156, + "step": 1118 + }, + { + "epoch": 1.3419316136772645, + "grad_norm": 0.5893473658594538, + "learning_rate": 6.75341728224247e-06, + "loss": 0.6636, + "step": 1119 + }, + { + "epoch": 1.3431313737252548, + "grad_norm": 0.7317031927045132, + "learning_rate": 6.746874704740363e-06, + "loss": 0.6148, + "step": 1120 + }, + { + "epoch": 1.3443311337732453, + "grad_norm": 0.8112527565928205, + "learning_rate": 6.74032871858739e-06, + "loss": 0.644, + "step": 1121 + }, + { + "epoch": 1.3455308938212358, + "grad_norm": 0.6641460501561001, + "learning_rate": 6.733779336556643e-06, + "loss": 0.5878, + "step": 1122 + }, + { + "epoch": 1.346730653869226, + "grad_norm": 0.7381532106113696, + "learning_rate": 6.727226571427832e-06, + "loss": 0.6282, + "step": 1123 + }, + { + "epoch": 1.3479304139172166, + "grad_norm": 0.6770212013493809, + "learning_rate": 6.7206704359872706e-06, + "loss": 0.6388, + "step": 1124 + }, + { + "epoch": 1.3491301739652068, + "grad_norm": 0.5750273845268145, + "learning_rate": 6.714110943027853e-06, + "loss": 0.556, + "step": 1125 + }, + { + "epoch": 1.3503299340131973, + "grad_norm": 0.9108358628622016, + "learning_rate": 6.707548105349016e-06, + "loss": 0.6073, + "step": 1126 + }, + { + "epoch": 1.3515296940611878, + "grad_norm": 0.7322466065246941, + "learning_rate": 6.700981935756732e-06, + "loss": 0.5416, + "step": 1127 + }, + { + "epoch": 1.352729454109178, + "grad_norm": 0.6079719258021722, + "learning_rate": 6.694412447063467e-06, + "loss": 0.6685, + "step": 1128 + }, + { + "epoch": 1.3539292141571686, + "grad_norm": 0.8370293574026989, + "learning_rate": 6.68783965208817e-06, + "loss": 0.6738, + "step": 1129 + }, + { + "epoch": 1.3551289742051589, + "grad_norm": 0.565962142703862, + "learning_rate": 6.68126356365624e-06, + "loss": 0.58, + "step": 1130 + }, + { + "epoch": 1.3563287342531494, + "grad_norm": 0.6388448863130164, + "learning_rate": 6.6746841945995e-06, + "loss": 0.6354, + "step": 1131 + }, + { + "epoch": 1.3575284943011399, + "grad_norm": 0.6653541085891204, + "learning_rate": 6.668101557756176e-06, + "loss": 0.6083, + "step": 1132 + }, + { + "epoch": 1.3587282543491301, + "grad_norm": 0.7599696151343085, + "learning_rate": 6.661515665970869e-06, + "loss": 0.6793, + "step": 1133 + }, + { + "epoch": 1.3599280143971206, + "grad_norm": 0.6741621620992139, + "learning_rate": 6.6549265320945345e-06, + "loss": 0.5406, + "step": 1134 + }, + { + "epoch": 1.361127774445111, + "grad_norm": 0.6832030336773401, + "learning_rate": 6.648334168984452e-06, + "loss": 0.6509, + "step": 1135 + }, + { + "epoch": 1.3623275344931014, + "grad_norm": 0.7042153457501232, + "learning_rate": 6.641738589504202e-06, + "loss": 0.6252, + "step": 1136 + }, + { + "epoch": 1.363527294541092, + "grad_norm": 0.7129225998390416, + "learning_rate": 6.635139806523642e-06, + "loss": 0.6705, + "step": 1137 + }, + { + "epoch": 1.3647270545890822, + "grad_norm": 0.6633480074567827, + "learning_rate": 6.6285378329188784e-06, + "loss": 0.6667, + "step": 1138 + }, + { + "epoch": 1.3659268146370727, + "grad_norm": 0.7298944730666421, + "learning_rate": 6.621932681572245e-06, + "loss": 0.6684, + "step": 1139 + }, + { + "epoch": 1.367126574685063, + "grad_norm": 0.7535335352629341, + "learning_rate": 6.615324365372281e-06, + "loss": 0.6122, + "step": 1140 + }, + { + "epoch": 1.3683263347330534, + "grad_norm": 0.6883059661899033, + "learning_rate": 6.608712897213691e-06, + "loss": 0.6674, + "step": 1141 + }, + { + "epoch": 1.369526094781044, + "grad_norm": 0.7904357771826949, + "learning_rate": 6.602098289997336e-06, + "loss": 0.6171, + "step": 1142 + }, + { + "epoch": 1.3707258548290342, + "grad_norm": 0.7843313126783715, + "learning_rate": 6.5954805566302035e-06, + "loss": 0.6219, + "step": 1143 + }, + { + "epoch": 1.3719256148770245, + "grad_norm": 0.8312569687830124, + "learning_rate": 6.588859710025378e-06, + "loss": 0.61, + "step": 1144 + }, + { + "epoch": 1.373125374925015, + "grad_norm": 0.5669030307035022, + "learning_rate": 6.5822357631020205e-06, + "loss": 0.6347, + "step": 1145 + }, + { + "epoch": 1.3743251349730055, + "grad_norm": 0.6927951715863442, + "learning_rate": 6.575608728785341e-06, + "loss": 0.6055, + "step": 1146 + }, + { + "epoch": 1.3755248950209957, + "grad_norm": 0.7869796862407736, + "learning_rate": 6.568978620006574e-06, + "loss": 0.68, + "step": 1147 + }, + { + "epoch": 1.3767246550689862, + "grad_norm": 0.6329606513199126, + "learning_rate": 6.562345449702952e-06, + "loss": 0.6729, + "step": 1148 + }, + { + "epoch": 1.3779244151169765, + "grad_norm": 0.7470692094435768, + "learning_rate": 6.555709230817683e-06, + "loss": 0.6922, + "step": 1149 + }, + { + "epoch": 1.379124175164967, + "grad_norm": 0.6397194190507423, + "learning_rate": 6.5490699762999226e-06, + "loss": 0.6609, + "step": 1150 + }, + { + "epoch": 1.3803239352129575, + "grad_norm": 0.7495978599469907, + "learning_rate": 6.54242769910475e-06, + "loss": 0.6081, + "step": 1151 + }, + { + "epoch": 1.3815236952609478, + "grad_norm": 0.7735457645872734, + "learning_rate": 6.535782412193146e-06, + "loss": 0.6591, + "step": 1152 + }, + { + "epoch": 1.3827234553089383, + "grad_norm": 0.6756589768594755, + "learning_rate": 6.529134128531956e-06, + "loss": 0.6439, + "step": 1153 + }, + { + "epoch": 1.3839232153569285, + "grad_norm": 0.7556235859296331, + "learning_rate": 6.522482861093882e-06, + "loss": 0.6511, + "step": 1154 + }, + { + "epoch": 1.385122975404919, + "grad_norm": 0.8105361367882571, + "learning_rate": 6.515828622857443e-06, + "loss": 0.5748, + "step": 1155 + }, + { + "epoch": 1.3863227354529095, + "grad_norm": 0.5900221272166272, + "learning_rate": 6.509171426806954e-06, + "loss": 0.6394, + "step": 1156 + }, + { + "epoch": 1.3875224955008998, + "grad_norm": 0.49277049926647637, + "learning_rate": 6.502511285932507e-06, + "loss": 0.6577, + "step": 1157 + }, + { + "epoch": 1.3887222555488903, + "grad_norm": 0.865199565640867, + "learning_rate": 6.495848213229933e-06, + "loss": 0.6189, + "step": 1158 + }, + { + "epoch": 1.3899220155968806, + "grad_norm": 0.6212103321219965, + "learning_rate": 6.4891822217007904e-06, + "loss": 0.6561, + "step": 1159 + }, + { + "epoch": 1.391121775644871, + "grad_norm": 0.650567408341297, + "learning_rate": 6.482513324352328e-06, + "loss": 0.6505, + "step": 1160 + }, + { + "epoch": 1.3923215356928615, + "grad_norm": 0.8464647346033654, + "learning_rate": 6.4758415341974705e-06, + "loss": 0.6155, + "step": 1161 + }, + { + "epoch": 1.3935212957408518, + "grad_norm": 0.6799298795578801, + "learning_rate": 6.469166864254779e-06, + "loss": 0.6417, + "step": 1162 + }, + { + "epoch": 1.3947210557888423, + "grad_norm": 0.5391827736330186, + "learning_rate": 6.462489327548442e-06, + "loss": 0.6379, + "step": 1163 + }, + { + "epoch": 1.3959208158368326, + "grad_norm": 0.7018397816358515, + "learning_rate": 6.455808937108237e-06, + "loss": 0.7058, + "step": 1164 + }, + { + "epoch": 1.397120575884823, + "grad_norm": 0.840076044364674, + "learning_rate": 6.4491257059695116e-06, + "loss": 0.667, + "step": 1165 + }, + { + "epoch": 1.3983203359328136, + "grad_norm": 0.6515030138359833, + "learning_rate": 6.442439647173155e-06, + "loss": 0.5647, + "step": 1166 + }, + { + "epoch": 1.3995200959808038, + "grad_norm": 0.5717150990683747, + "learning_rate": 6.435750773765579e-06, + "loss": 0.6355, + "step": 1167 + }, + { + "epoch": 1.4007198560287941, + "grad_norm": 0.7895379356993755, + "learning_rate": 6.429059098798679e-06, + "loss": 0.6481, + "step": 1168 + }, + { + "epoch": 1.4019196160767846, + "grad_norm": 0.698749126512253, + "learning_rate": 6.4223646353298264e-06, + "loss": 0.6304, + "step": 1169 + }, + { + "epoch": 1.403119376124775, + "grad_norm": 0.6577509725302786, + "learning_rate": 6.415667396421825e-06, + "loss": 0.6315, + "step": 1170 + }, + { + "epoch": 1.4043191361727654, + "grad_norm": 0.6922907089938489, + "learning_rate": 6.4089673951429e-06, + "loss": 0.6636, + "step": 1171 + }, + { + "epoch": 1.4055188962207559, + "grad_norm": 0.617477414124189, + "learning_rate": 6.402264644566665e-06, + "loss": 0.6193, + "step": 1172 + }, + { + "epoch": 1.4067186562687461, + "grad_norm": 0.6576230106720494, + "learning_rate": 6.395559157772098e-06, + "loss": 0.6295, + "step": 1173 + }, + { + "epoch": 1.4079184163167366, + "grad_norm": 0.6715729203526053, + "learning_rate": 6.388850947843517e-06, + "loss": 0.6243, + "step": 1174 + }, + { + "epoch": 1.4091181763647271, + "grad_norm": 0.7679879928940876, + "learning_rate": 6.382140027870553e-06, + "loss": 0.6648, + "step": 1175 + }, + { + "epoch": 1.4103179364127174, + "grad_norm": 0.8037472260249561, + "learning_rate": 6.375426410948128e-06, + "loss": 0.6517, + "step": 1176 + }, + { + "epoch": 1.411517696460708, + "grad_norm": 0.6349627119181712, + "learning_rate": 6.3687101101764185e-06, + "loss": 0.676, + "step": 1177 + }, + { + "epoch": 1.4127174565086982, + "grad_norm": 0.6801899051106781, + "learning_rate": 6.361991138660846e-06, + "loss": 0.6266, + "step": 1178 + }, + { + "epoch": 1.4139172165566887, + "grad_norm": 0.5350755164375657, + "learning_rate": 6.355269509512042e-06, + "loss": 0.6375, + "step": 1179 + }, + { + "epoch": 1.4151169766046792, + "grad_norm": 0.7239692027078118, + "learning_rate": 6.3485452358458185e-06, + "loss": 0.6139, + "step": 1180 + }, + { + "epoch": 1.4163167366526694, + "grad_norm": 0.695469325517881, + "learning_rate": 6.341818330783156e-06, + "loss": 0.625, + "step": 1181 + }, + { + "epoch": 1.41751649670066, + "grad_norm": 0.4780893249383196, + "learning_rate": 6.335088807450163e-06, + "loss": 0.7342, + "step": 1182 + }, + { + "epoch": 1.4187162567486502, + "grad_norm": 0.927274137868549, + "learning_rate": 6.328356678978058e-06, + "loss": 0.5985, + "step": 1183 + }, + { + "epoch": 1.4199160167966407, + "grad_norm": 0.72395308753094, + "learning_rate": 6.321621958503146e-06, + "loss": 0.6434, + "step": 1184 + }, + { + "epoch": 1.4211157768446312, + "grad_norm": 0.8789638735851689, + "learning_rate": 6.3148846591667865e-06, + "loss": 0.6734, + "step": 1185 + }, + { + "epoch": 1.4223155368926215, + "grad_norm": 0.5581994386487651, + "learning_rate": 6.308144794115374e-06, + "loss": 0.6156, + "step": 1186 + }, + { + "epoch": 1.4235152969406117, + "grad_norm": 0.6272793758583736, + "learning_rate": 6.301402376500306e-06, + "loss": 0.6515, + "step": 1187 + }, + { + "epoch": 1.4247150569886022, + "grad_norm": 0.6447701993231002, + "learning_rate": 6.2946574194779645e-06, + "loss": 0.6583, + "step": 1188 + }, + { + "epoch": 1.4259148170365927, + "grad_norm": 0.6209843693848559, + "learning_rate": 6.287909936209682e-06, + "loss": 0.6319, + "step": 1189 + }, + { + "epoch": 1.427114577084583, + "grad_norm": 0.6912573791199007, + "learning_rate": 6.281159939861725e-06, + "loss": 0.643, + "step": 1190 + }, + { + "epoch": 1.4283143371325735, + "grad_norm": 0.6415838122000773, + "learning_rate": 6.274407443605264e-06, + "loss": 0.6287, + "step": 1191 + }, + { + "epoch": 1.4295140971805638, + "grad_norm": 0.7097184462078711, + "learning_rate": 6.26765246061634e-06, + "loss": 0.6561, + "step": 1192 + }, + { + "epoch": 1.4307138572285543, + "grad_norm": 0.7213328291617336, + "learning_rate": 6.2608950040758575e-06, + "loss": 0.6163, + "step": 1193 + }, + { + "epoch": 1.4319136172765448, + "grad_norm": 0.6342851785100996, + "learning_rate": 6.254135087169537e-06, + "loss": 0.6688, + "step": 1194 + }, + { + "epoch": 1.433113377324535, + "grad_norm": 0.6609542753691291, + "learning_rate": 6.2473727230879076e-06, + "loss": 0.6745, + "step": 1195 + }, + { + "epoch": 1.4343131373725255, + "grad_norm": 0.7165029004297052, + "learning_rate": 6.240607925026271e-06, + "loss": 0.6552, + "step": 1196 + }, + { + "epoch": 1.4355128974205158, + "grad_norm": 0.65710539720105, + "learning_rate": 6.2338407061846765e-06, + "loss": 0.5844, + "step": 1197 + }, + { + "epoch": 1.4367126574685063, + "grad_norm": 0.6067918304499893, + "learning_rate": 6.227071079767899e-06, + "loss": 0.6143, + "step": 1198 + }, + { + "epoch": 1.4379124175164968, + "grad_norm": 0.6505049285655436, + "learning_rate": 6.2202990589854095e-06, + "loss": 0.5458, + "step": 1199 + }, + { + "epoch": 1.439112177564487, + "grad_norm": 0.7325345130366029, + "learning_rate": 6.213524657051354e-06, + "loss": 0.6436, + "step": 1200 + }, + { + "epoch": 1.4403119376124776, + "grad_norm": 0.589110413780781, + "learning_rate": 6.2067478871845236e-06, + "loss": 0.6423, + "step": 1201 + }, + { + "epoch": 1.4415116976604678, + "grad_norm": 0.7616775529555003, + "learning_rate": 6.199968762608326e-06, + "loss": 0.6762, + "step": 1202 + }, + { + "epoch": 1.4427114577084583, + "grad_norm": 0.5459706578178842, + "learning_rate": 6.193187296550771e-06, + "loss": 0.6944, + "step": 1203 + }, + { + "epoch": 1.4439112177564488, + "grad_norm": 0.622979747002905, + "learning_rate": 6.1864035022444325e-06, + "loss": 0.6007, + "step": 1204 + }, + { + "epoch": 1.445110977804439, + "grad_norm": 0.7495542665123462, + "learning_rate": 6.179617392926427e-06, + "loss": 0.6425, + "step": 1205 + }, + { + "epoch": 1.4463107378524296, + "grad_norm": 0.8002049020027295, + "learning_rate": 6.17282898183839e-06, + "loss": 0.5747, + "step": 1206 + }, + { + "epoch": 1.4475104979004199, + "grad_norm": 0.49958690427656804, + "learning_rate": 6.166038282226449e-06, + "loss": 0.6046, + "step": 1207 + }, + { + "epoch": 1.4487102579484104, + "grad_norm": 0.7221357839193259, + "learning_rate": 6.159245307341195e-06, + "loss": 0.6211, + "step": 1208 + }, + { + "epoch": 1.4499100179964008, + "grad_norm": 0.8801610973062568, + "learning_rate": 6.152450070437659e-06, + "loss": 0.6477, + "step": 1209 + }, + { + "epoch": 1.4511097780443911, + "grad_norm": 0.6659853219207728, + "learning_rate": 6.14565258477529e-06, + "loss": 0.6441, + "step": 1210 + }, + { + "epoch": 1.4523095380923814, + "grad_norm": 0.6760897828611526, + "learning_rate": 6.138852863617918e-06, + "loss": 0.569, + "step": 1211 + }, + { + "epoch": 1.4535092981403719, + "grad_norm": 0.6098179229000052, + "learning_rate": 6.132050920233739e-06, + "loss": 0.6635, + "step": 1212 + }, + { + "epoch": 1.4547090581883624, + "grad_norm": 0.813858537026656, + "learning_rate": 6.125246767895287e-06, + "loss": 0.6488, + "step": 1213 + }, + { + "epoch": 1.4559088182363527, + "grad_norm": 0.5711502698398734, + "learning_rate": 6.118440419879402e-06, + "loss": 0.6072, + "step": 1214 + }, + { + "epoch": 1.4571085782843431, + "grad_norm": 0.6667021617430856, + "learning_rate": 6.111631889467213e-06, + "loss": 0.6966, + "step": 1215 + }, + { + "epoch": 1.4583083383323334, + "grad_norm": 0.7557909481255707, + "learning_rate": 6.104821189944102e-06, + "loss": 0.67, + "step": 1216 + }, + { + "epoch": 1.459508098380324, + "grad_norm": 0.5640947675429846, + "learning_rate": 6.098008334599689e-06, + "loss": 0.6869, + "step": 1217 + }, + { + "epoch": 1.4607078584283144, + "grad_norm": 0.7139040670249647, + "learning_rate": 6.0911933367277984e-06, + "loss": 0.6422, + "step": 1218 + }, + { + "epoch": 1.4619076184763047, + "grad_norm": 0.6342341991664133, + "learning_rate": 6.084376209626433e-06, + "loss": 0.667, + "step": 1219 + }, + { + "epoch": 1.4631073785242952, + "grad_norm": 0.7585758162626316, + "learning_rate": 6.077556966597753e-06, + "loss": 0.6154, + "step": 1220 + }, + { + "epoch": 1.4643071385722854, + "grad_norm": 0.5965885737089633, + "learning_rate": 6.070735620948049e-06, + "loss": 0.6181, + "step": 1221 + }, + { + "epoch": 1.465506898620276, + "grad_norm": 0.7167205284378334, + "learning_rate": 6.063912185987708e-06, + "loss": 0.7338, + "step": 1222 + }, + { + "epoch": 1.4667066586682664, + "grad_norm": 0.6249955365345098, + "learning_rate": 6.057086675031202e-06, + "loss": 0.6121, + "step": 1223 + }, + { + "epoch": 1.4679064187162567, + "grad_norm": 0.7612387736488212, + "learning_rate": 6.050259101397048e-06, + "loss": 0.6384, + "step": 1224 + }, + { + "epoch": 1.4691061787642472, + "grad_norm": 0.6565025743667317, + "learning_rate": 6.0434294784077905e-06, + "loss": 0.6335, + "step": 1225 + }, + { + "epoch": 1.4703059388122375, + "grad_norm": 0.5840172467296441, + "learning_rate": 6.036597819389972e-06, + "loss": 0.5546, + "step": 1226 + }, + { + "epoch": 1.471505698860228, + "grad_norm": 0.8134562582624975, + "learning_rate": 6.02976413767411e-06, + "loss": 0.6825, + "step": 1227 + }, + { + "epoch": 1.4727054589082185, + "grad_norm": 0.7013684567318601, + "learning_rate": 6.022928446594662e-06, + "loss": 0.6696, + "step": 1228 + }, + { + "epoch": 1.4739052189562087, + "grad_norm": 0.476068932069486, + "learning_rate": 6.016090759490014e-06, + "loss": 0.6119, + "step": 1229 + }, + { + "epoch": 1.4751049790041992, + "grad_norm": 0.7243788321830064, + "learning_rate": 6.009251089702447e-06, + "loss": 0.6987, + "step": 1230 + }, + { + "epoch": 1.4763047390521895, + "grad_norm": 0.7913274804741819, + "learning_rate": 6.002409450578104e-06, + "loss": 0.6777, + "step": 1231 + }, + { + "epoch": 1.47750449910018, + "grad_norm": 0.6735369529497145, + "learning_rate": 5.995565855466974e-06, + "loss": 0.6325, + "step": 1232 + }, + { + "epoch": 1.4787042591481705, + "grad_norm": 0.5554440981407642, + "learning_rate": 5.988720317722866e-06, + "loss": 0.6515, + "step": 1233 + }, + { + "epoch": 1.4799040191961608, + "grad_norm": 0.6834658022330459, + "learning_rate": 5.981872850703376e-06, + "loss": 0.5841, + "step": 1234 + }, + { + "epoch": 1.481103779244151, + "grad_norm": 0.6933068366611141, + "learning_rate": 5.975023467769865e-06, + "loss": 0.6415, + "step": 1235 + }, + { + "epoch": 1.4823035392921415, + "grad_norm": 0.7325088249095414, + "learning_rate": 5.968172182287433e-06, + "loss": 0.5846, + "step": 1236 + }, + { + "epoch": 1.483503299340132, + "grad_norm": 0.7036937352178297, + "learning_rate": 5.9613190076248935e-06, + "loss": 0.6728, + "step": 1237 + }, + { + "epoch": 1.4847030593881223, + "grad_norm": 0.6323787187969813, + "learning_rate": 5.954463957154743e-06, + "loss": 0.5662, + "step": 1238 + }, + { + "epoch": 1.4859028194361128, + "grad_norm": 0.737462376786475, + "learning_rate": 5.947607044253142e-06, + "loss": 0.6503, + "step": 1239 + }, + { + "epoch": 1.487102579484103, + "grad_norm": 0.6486858074761898, + "learning_rate": 5.940748282299885e-06, + "loss": 0.5894, + "step": 1240 + }, + { + "epoch": 1.4883023395320936, + "grad_norm": 0.6100466907434766, + "learning_rate": 5.933887684678369e-06, + "loss": 0.5828, + "step": 1241 + }, + { + "epoch": 1.489502099580084, + "grad_norm": 0.8409578952042881, + "learning_rate": 5.927025264775581e-06, + "loss": 0.589, + "step": 1242 + }, + { + "epoch": 1.4907018596280743, + "grad_norm": 0.582264752223249, + "learning_rate": 5.920161035982058e-06, + "loss": 0.6669, + "step": 1243 + }, + { + "epoch": 1.4919016196760648, + "grad_norm": 0.6079716398734951, + "learning_rate": 5.913295011691869e-06, + "loss": 0.6101, + "step": 1244 + }, + { + "epoch": 1.493101379724055, + "grad_norm": 0.7467107653736639, + "learning_rate": 5.906427205302584e-06, + "loss": 0.6203, + "step": 1245 + }, + { + "epoch": 1.4943011397720456, + "grad_norm": 0.623686442929491, + "learning_rate": 5.899557630215256e-06, + "loss": 0.624, + "step": 1246 + }, + { + "epoch": 1.495500899820036, + "grad_norm": 0.6732483651744011, + "learning_rate": 5.892686299834382e-06, + "loss": 0.6519, + "step": 1247 + }, + { + "epoch": 1.4967006598680264, + "grad_norm": 0.8058906996878299, + "learning_rate": 5.88581322756789e-06, + "loss": 0.6071, + "step": 1248 + }, + { + "epoch": 1.4979004199160169, + "grad_norm": 0.5320089904676821, + "learning_rate": 5.878938426827106e-06, + "loss": 0.648, + "step": 1249 + }, + { + "epoch": 1.4991001799640071, + "grad_norm": 0.7532237642342396, + "learning_rate": 5.872061911026723e-06, + "loss": 0.5871, + "step": 1250 + }, + { + "epoch": 1.5002999400119976, + "grad_norm": 0.6431714703702812, + "learning_rate": 5.865183693584786e-06, + "loss": 0.6295, + "step": 1251 + }, + { + "epoch": 1.5014997000599881, + "grad_norm": 0.6235215837163856, + "learning_rate": 5.858303787922663e-06, + "loss": 0.7042, + "step": 1252 + }, + { + "epoch": 1.5026994601079784, + "grad_norm": 0.7383379478172799, + "learning_rate": 5.851422207465009e-06, + "loss": 0.6177, + "step": 1253 + }, + { + "epoch": 1.5038992201559687, + "grad_norm": 0.6367437664682997, + "learning_rate": 5.8445389656397525e-06, + "loss": 0.7393, + "step": 1254 + }, + { + "epoch": 1.5050989802039592, + "grad_norm": 0.6840499847058636, + "learning_rate": 5.83765407587806e-06, + "loss": 0.6664, + "step": 1255 + }, + { + "epoch": 1.5062987402519497, + "grad_norm": 0.7082182417706232, + "learning_rate": 5.830767551614316e-06, + "loss": 0.6624, + "step": 1256 + }, + { + "epoch": 1.5074985002999401, + "grad_norm": 0.7448387921989272, + "learning_rate": 5.823879406286095e-06, + "loss": 0.6458, + "step": 1257 + }, + { + "epoch": 1.5086982603479304, + "grad_norm": 0.6314122065500002, + "learning_rate": 5.816989653334131e-06, + "loss": 0.6356, + "step": 1258 + }, + { + "epoch": 1.5098980203959207, + "grad_norm": 0.7077408943810496, + "learning_rate": 5.810098306202301e-06, + "loss": 0.6376, + "step": 1259 + }, + { + "epoch": 1.5110977804439112, + "grad_norm": 0.5255442417829472, + "learning_rate": 5.803205378337586e-06, + "loss": 0.6054, + "step": 1260 + }, + { + "epoch": 1.5122975404919017, + "grad_norm": 0.7423020316453411, + "learning_rate": 5.796310883190055e-06, + "loss": 0.5346, + "step": 1261 + }, + { + "epoch": 1.5134973005398922, + "grad_norm": 0.72373854141275, + "learning_rate": 5.789414834212837e-06, + "loss": 0.6513, + "step": 1262 + }, + { + "epoch": 1.5146970605878824, + "grad_norm": 0.6268195313504005, + "learning_rate": 5.782517244862087e-06, + "loss": 0.6355, + "step": 1263 + }, + { + "epoch": 1.5158968206358727, + "grad_norm": 0.7470521652894679, + "learning_rate": 5.775618128596972e-06, + "loss": 0.6039, + "step": 1264 + }, + { + "epoch": 1.5170965806838632, + "grad_norm": 0.581894195138229, + "learning_rate": 5.768717498879635e-06, + "loss": 0.6927, + "step": 1265 + }, + { + "epoch": 1.5182963407318537, + "grad_norm": 0.641456779328077, + "learning_rate": 5.761815369175175e-06, + "loss": 0.6586, + "step": 1266 + }, + { + "epoch": 1.519496100779844, + "grad_norm": 0.5650571897087318, + "learning_rate": 5.754911752951612e-06, + "loss": 0.5913, + "step": 1267 + }, + { + "epoch": 1.5206958608278345, + "grad_norm": 0.7021394162807891, + "learning_rate": 5.7480066636798735e-06, + "loss": 0.6405, + "step": 1268 + }, + { + "epoch": 1.5218956208758248, + "grad_norm": 0.7911515180750498, + "learning_rate": 5.74110011483376e-06, + "loss": 0.5796, + "step": 1269 + }, + { + "epoch": 1.5230953809238152, + "grad_norm": 0.7166122110519555, + "learning_rate": 5.734192119889913e-06, + "loss": 0.6079, + "step": 1270 + }, + { + "epoch": 1.5242951409718057, + "grad_norm": 0.5612041476172737, + "learning_rate": 5.7272826923278065e-06, + "loss": 0.6609, + "step": 1271 + }, + { + "epoch": 1.525494901019796, + "grad_norm": 0.6213354163393838, + "learning_rate": 5.720371845629703e-06, + "loss": 0.6125, + "step": 1272 + }, + { + "epoch": 1.5266946610677863, + "grad_norm": 0.6474864927657513, + "learning_rate": 5.713459593280635e-06, + "loss": 0.6444, + "step": 1273 + }, + { + "epoch": 1.5278944211157768, + "grad_norm": 0.5908040200841372, + "learning_rate": 5.706545948768378e-06, + "loss": 0.5962, + "step": 1274 + }, + { + "epoch": 1.5290941811637673, + "grad_norm": 0.6227188076704627, + "learning_rate": 5.699630925583426e-06, + "loss": 0.6565, + "step": 1275 + }, + { + "epoch": 1.5302939412117578, + "grad_norm": 0.7599470245412667, + "learning_rate": 5.692714537218963e-06, + "loss": 0.6916, + "step": 1276 + }, + { + "epoch": 1.531493701259748, + "grad_norm": 0.7502609001812292, + "learning_rate": 5.685796797170831e-06, + "loss": 0.6004, + "step": 1277 + }, + { + "epoch": 1.5326934613077383, + "grad_norm": 0.5411325249265353, + "learning_rate": 5.67887771893752e-06, + "loss": 0.6619, + "step": 1278 + }, + { + "epoch": 1.5338932213557288, + "grad_norm": 0.7761314801897223, + "learning_rate": 5.671957316020123e-06, + "loss": 0.6192, + "step": 1279 + }, + { + "epoch": 1.5350929814037193, + "grad_norm": 0.6862836201660087, + "learning_rate": 5.665035601922317e-06, + "loss": 0.5928, + "step": 1280 + }, + { + "epoch": 1.5362927414517098, + "grad_norm": 0.7866975019317517, + "learning_rate": 5.658112590150348e-06, + "loss": 0.6476, + "step": 1281 + }, + { + "epoch": 1.5374925014997, + "grad_norm": 0.48330894009913766, + "learning_rate": 5.651188294212979e-06, + "loss": 0.6323, + "step": 1282 + }, + { + "epoch": 1.5386922615476903, + "grad_norm": 0.8543107937867963, + "learning_rate": 5.644262727621491e-06, + "loss": 0.6185, + "step": 1283 + }, + { + "epoch": 1.5398920215956808, + "grad_norm": 0.7477891341738985, + "learning_rate": 5.637335903889639e-06, + "loss": 0.6001, + "step": 1284 + }, + { + "epoch": 1.5410917816436713, + "grad_norm": 0.525523421014255, + "learning_rate": 5.630407836533632e-06, + "loss": 0.6311, + "step": 1285 + }, + { + "epoch": 1.5422915416916618, + "grad_norm": 1.0471427028757878, + "learning_rate": 5.623478539072106e-06, + "loss": 0.5697, + "step": 1286 + }, + { + "epoch": 1.543491301739652, + "grad_norm": 0.76087236543128, + "learning_rate": 5.616548025026096e-06, + "loss": 0.6693, + "step": 1287 + }, + { + "epoch": 1.5446910617876424, + "grad_norm": 0.6268895731868466, + "learning_rate": 5.609616307919015e-06, + "loss": 0.6079, + "step": 1288 + }, + { + "epoch": 1.5458908218356329, + "grad_norm": 0.6984978410322382, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.6549, + "step": 1289 + }, + { + "epoch": 1.5470905818836234, + "grad_norm": 0.5098259790983851, + "learning_rate": 5.59574931862698e-06, + "loss": 0.6849, + "step": 1290 + }, + { + "epoch": 1.5482903419316136, + "grad_norm": 0.74458734342187, + "learning_rate": 5.588814073500481e-06, + "loss": 0.6239, + "step": 1291 + }, + { + "epoch": 1.5494901019796041, + "grad_norm": 0.5624204820252244, + "learning_rate": 5.581877679429758e-06, + "loss": 0.6346, + "step": 1292 + }, + { + "epoch": 1.5506898620275944, + "grad_norm": 0.8345891432596939, + "learning_rate": 5.574940149949697e-06, + "loss": 0.6148, + "step": 1293 + }, + { + "epoch": 1.551889622075585, + "grad_norm": 0.7357972047845589, + "learning_rate": 5.568001498597395e-06, + "loss": 0.5902, + "step": 1294 + }, + { + "epoch": 1.5530893821235754, + "grad_norm": 0.6544965040507694, + "learning_rate": 5.561061738912143e-06, + "loss": 0.6137, + "step": 1295 + }, + { + "epoch": 1.5542891421715657, + "grad_norm": 0.542534811709464, + "learning_rate": 5.5541208844353875e-06, + "loss": 0.6494, + "step": 1296 + }, + { + "epoch": 1.555488902219556, + "grad_norm": 0.7380424075936614, + "learning_rate": 5.547178948710719e-06, + "loss": 0.6584, + "step": 1297 + }, + { + "epoch": 1.5566886622675464, + "grad_norm": 0.6423461323690198, + "learning_rate": 5.540235945283835e-06, + "loss": 0.6675, + "step": 1298 + }, + { + "epoch": 1.557888422315537, + "grad_norm": 0.7083491579052724, + "learning_rate": 5.533291887702512e-06, + "loss": 0.606, + "step": 1299 + }, + { + "epoch": 1.5590881823635274, + "grad_norm": 0.6406839795704589, + "learning_rate": 5.526346789516591e-06, + "loss": 0.601, + "step": 1300 + }, + { + "epoch": 1.5602879424115177, + "grad_norm": 0.6174645592082069, + "learning_rate": 5.519400664277936e-06, + "loss": 0.7061, + "step": 1301 + }, + { + "epoch": 1.561487702459508, + "grad_norm": 0.704019949782006, + "learning_rate": 5.512453525540421e-06, + "loss": 0.6713, + "step": 1302 + }, + { + "epoch": 1.5626874625074985, + "grad_norm": 0.6448062770664398, + "learning_rate": 5.505505386859898e-06, + "loss": 0.6642, + "step": 1303 + }, + { + "epoch": 1.563887222555489, + "grad_norm": 0.7065362159390188, + "learning_rate": 5.498556261794161e-06, + "loss": 0.6278, + "step": 1304 + }, + { + "epoch": 1.5650869826034794, + "grad_norm": 0.7260098687088066, + "learning_rate": 5.491606163902942e-06, + "loss": 0.5843, + "step": 1305 + }, + { + "epoch": 1.5662867426514697, + "grad_norm": 0.6334867367558583, + "learning_rate": 5.4846551067478595e-06, + "loss": 0.6244, + "step": 1306 + }, + { + "epoch": 1.56748650269946, + "grad_norm": 0.6941300206724687, + "learning_rate": 5.477703103892412e-06, + "loss": 0.6038, + "step": 1307 + }, + { + "epoch": 1.5686862627474505, + "grad_norm": 0.7633621862009159, + "learning_rate": 5.47075016890194e-06, + "loss": 0.5862, + "step": 1308 + }, + { + "epoch": 1.569886022795441, + "grad_norm": 0.9369076335853657, + "learning_rate": 5.4637963153436e-06, + "loss": 0.6297, + "step": 1309 + }, + { + "epoch": 1.5710857828434315, + "grad_norm": 0.5555133609295394, + "learning_rate": 5.4568415567863485e-06, + "loss": 0.6275, + "step": 1310 + }, + { + "epoch": 1.5722855428914218, + "grad_norm": 0.7770497074003324, + "learning_rate": 5.449885906800901e-06, + "loss": 0.6219, + "step": 1311 + }, + { + "epoch": 1.573485302939412, + "grad_norm": 0.6168582319141467, + "learning_rate": 5.442929378959713e-06, + "loss": 0.6439, + "step": 1312 + }, + { + "epoch": 1.5746850629874025, + "grad_norm": 0.6524715422067409, + "learning_rate": 5.435971986836958e-06, + "loss": 0.6573, + "step": 1313 + }, + { + "epoch": 1.575884823035393, + "grad_norm": 0.7842695350118289, + "learning_rate": 5.429013744008491e-06, + "loss": 0.6485, + "step": 1314 + }, + { + "epoch": 1.5770845830833833, + "grad_norm": 0.5362782200254684, + "learning_rate": 5.42205466405183e-06, + "loss": 0.6324, + "step": 1315 + }, + { + "epoch": 1.5782843431313738, + "grad_norm": 0.7618827490812607, + "learning_rate": 5.4150947605461225e-06, + "loss": 0.6343, + "step": 1316 + }, + { + "epoch": 1.579484103179364, + "grad_norm": 0.5253757301879358, + "learning_rate": 5.4081340470721286e-06, + "loss": 0.6574, + "step": 1317 + }, + { + "epoch": 1.5806838632273545, + "grad_norm": 0.5647718553683049, + "learning_rate": 5.401172537212183e-06, + "loss": 0.6078, + "step": 1318 + }, + { + "epoch": 1.581883623275345, + "grad_norm": 0.7617002418019216, + "learning_rate": 5.3942102445501795e-06, + "loss": 0.6249, + "step": 1319 + }, + { + "epoch": 1.5830833833233353, + "grad_norm": 0.7260704174604635, + "learning_rate": 5.387247182671539e-06, + "loss": 0.6495, + "step": 1320 + }, + { + "epoch": 1.5842831433713256, + "grad_norm": 0.7150652300708271, + "learning_rate": 5.380283365163175e-06, + "loss": 0.6716, + "step": 1321 + }, + { + "epoch": 1.585482903419316, + "grad_norm": 0.6810037087806773, + "learning_rate": 5.373318805613489e-06, + "loss": 0.6083, + "step": 1322 + }, + { + "epoch": 1.5866826634673066, + "grad_norm": 0.7478140636419214, + "learning_rate": 5.366353517612319e-06, + "loss": 0.5957, + "step": 1323 + }, + { + "epoch": 1.587882423515297, + "grad_norm": 0.7161538437371096, + "learning_rate": 5.35938751475093e-06, + "loss": 0.6425, + "step": 1324 + }, + { + "epoch": 1.5890821835632873, + "grad_norm": 0.6063804079438616, + "learning_rate": 5.352420810621981e-06, + "loss": 0.6298, + "step": 1325 + }, + { + "epoch": 1.5902819436112776, + "grad_norm": 0.5841230926203578, + "learning_rate": 5.345453418819499e-06, + "loss": 0.6152, + "step": 1326 + }, + { + "epoch": 1.591481703659268, + "grad_norm": 0.6787445465536185, + "learning_rate": 5.338485352938854e-06, + "loss": 0.5985, + "step": 1327 + }, + { + "epoch": 1.5926814637072586, + "grad_norm": 0.6924927583320081, + "learning_rate": 5.3315166265767274e-06, + "loss": 0.6117, + "step": 1328 + }, + { + "epoch": 1.593881223755249, + "grad_norm": 0.7352710603579419, + "learning_rate": 5.324547253331094e-06, + "loss": 0.5997, + "step": 1329 + }, + { + "epoch": 1.5950809838032394, + "grad_norm": 0.6977482174008609, + "learning_rate": 5.31757724680119e-06, + "loss": 0.6009, + "step": 1330 + }, + { + "epoch": 1.5962807438512296, + "grad_norm": 0.6382832332344649, + "learning_rate": 5.310606620587484e-06, + "loss": 0.5573, + "step": 1331 + }, + { + "epoch": 1.5974805038992201, + "grad_norm": 0.6971060593932868, + "learning_rate": 5.303635388291659e-06, + "loss": 0.5733, + "step": 1332 + }, + { + "epoch": 1.5986802639472106, + "grad_norm": 0.6316557545076512, + "learning_rate": 5.296663563516576e-06, + "loss": 0.6096, + "step": 1333 + }, + { + "epoch": 1.599880023995201, + "grad_norm": 0.7608501254340002, + "learning_rate": 5.289691159866254e-06, + "loss": 0.6572, + "step": 1334 + }, + { + "epoch": 1.6010797840431914, + "grad_norm": 0.8037027202981949, + "learning_rate": 5.282718190945842e-06, + "loss": 0.7289, + "step": 1335 + }, + { + "epoch": 1.6022795440911817, + "grad_norm": 0.7421398229841776, + "learning_rate": 5.275744670361591e-06, + "loss": 0.6547, + "step": 1336 + }, + { + "epoch": 1.6034793041391722, + "grad_norm": 0.6625938252607363, + "learning_rate": 5.268770611720831e-06, + "loss": 0.6152, + "step": 1337 + }, + { + "epoch": 1.6046790641871627, + "grad_norm": 0.8199782776015508, + "learning_rate": 5.261796028631935e-06, + "loss": 0.6317, + "step": 1338 + }, + { + "epoch": 1.605878824235153, + "grad_norm": 0.49806684477338437, + "learning_rate": 5.254820934704309e-06, + "loss": 0.6433, + "step": 1339 + }, + { + "epoch": 1.6070785842831432, + "grad_norm": 0.7916961270191544, + "learning_rate": 5.24784534354835e-06, + "loss": 0.6764, + "step": 1340 + }, + { + "epoch": 1.6082783443311337, + "grad_norm": 0.6868484859480074, + "learning_rate": 5.240869268775422e-06, + "loss": 0.6777, + "step": 1341 + }, + { + "epoch": 1.6094781043791242, + "grad_norm": 0.7051861937478411, + "learning_rate": 5.2338927239978446e-06, + "loss": 0.6278, + "step": 1342 + }, + { + "epoch": 1.6106778644271147, + "grad_norm": 0.6532318157109314, + "learning_rate": 5.22691572282884e-06, + "loss": 0.5676, + "step": 1343 + }, + { + "epoch": 1.611877624475105, + "grad_norm": 0.9654876160289116, + "learning_rate": 5.21993827888253e-06, + "loss": 0.6753, + "step": 1344 + }, + { + "epoch": 1.6130773845230952, + "grad_norm": 0.5877057698919808, + "learning_rate": 5.2129604057739e-06, + "loss": 0.5589, + "step": 1345 + }, + { + "epoch": 1.6142771445710857, + "grad_norm": 0.6121179015957432, + "learning_rate": 5.205982117118768e-06, + "loss": 0.6621, + "step": 1346 + }, + { + "epoch": 1.6154769046190762, + "grad_norm": 0.7429376189213517, + "learning_rate": 5.199003426533767e-06, + "loss": 0.6336, + "step": 1347 + }, + { + "epoch": 1.6166766646670667, + "grad_norm": 0.6341338353766705, + "learning_rate": 5.192024347636313e-06, + "loss": 0.6051, + "step": 1348 + }, + { + "epoch": 1.617876424715057, + "grad_norm": 0.7824479742838273, + "learning_rate": 5.185044894044582e-06, + "loss": 0.6389, + "step": 1349 + }, + { + "epoch": 1.6190761847630473, + "grad_norm": 0.8642004701087276, + "learning_rate": 5.178065079377473e-06, + "loss": 0.5368, + "step": 1350 + }, + { + "epoch": 1.6202759448110378, + "grad_norm": 0.6679893599433782, + "learning_rate": 5.171084917254602e-06, + "loss": 0.6081, + "step": 1351 + }, + { + "epoch": 1.6214757048590283, + "grad_norm": 0.7236938887490497, + "learning_rate": 5.164104421296254e-06, + "loss": 0.5892, + "step": 1352 + }, + { + "epoch": 1.6226754649070187, + "grad_norm": 0.6099431977403261, + "learning_rate": 5.157123605123366e-06, + "loss": 0.5639, + "step": 1353 + }, + { + "epoch": 1.623875224955009, + "grad_norm": 0.7748564110848425, + "learning_rate": 5.150142482357505e-06, + "loss": 0.5525, + "step": 1354 + }, + { + "epoch": 1.6250749850029993, + "grad_norm": 0.6297212838369077, + "learning_rate": 5.14316106662083e-06, + "loss": 0.6638, + "step": 1355 + }, + { + "epoch": 1.6262747450509898, + "grad_norm": 0.9192127022014425, + "learning_rate": 5.136179371536076e-06, + "loss": 0.6518, + "step": 1356 + }, + { + "epoch": 1.6274745050989803, + "grad_norm": 0.5657648988393104, + "learning_rate": 5.129197410726522e-06, + "loss": 0.6914, + "step": 1357 + }, + { + "epoch": 1.6286742651469706, + "grad_norm": 0.779290014963476, + "learning_rate": 5.122215197815965e-06, + "loss": 0.6128, + "step": 1358 + }, + { + "epoch": 1.629874025194961, + "grad_norm": 0.743576865686689, + "learning_rate": 5.115232746428694e-06, + "loss": 0.6733, + "step": 1359 + }, + { + "epoch": 1.6310737852429513, + "grad_norm": 0.7455700611458127, + "learning_rate": 5.108250070189462e-06, + "loss": 0.6036, + "step": 1360 + }, + { + "epoch": 1.6322735452909418, + "grad_norm": 0.7410112167182892, + "learning_rate": 5.1012671827234664e-06, + "loss": 0.6891, + "step": 1361 + }, + { + "epoch": 1.6334733053389323, + "grad_norm": 0.7360720344296737, + "learning_rate": 5.094284097656308e-06, + "loss": 0.6906, + "step": 1362 + }, + { + "epoch": 1.6346730653869226, + "grad_norm": 0.8699066333508229, + "learning_rate": 5.087300828613978e-06, + "loss": 0.64, + "step": 1363 + }, + { + "epoch": 1.6358728254349129, + "grad_norm": 0.5635231414162302, + "learning_rate": 5.08031738922283e-06, + "loss": 0.6577, + "step": 1364 + }, + { + "epoch": 1.6370725854829034, + "grad_norm": 0.5908428663621056, + "learning_rate": 5.073333793109545e-06, + "loss": 0.678, + "step": 1365 + }, + { + "epoch": 1.6382723455308938, + "grad_norm": 0.8734239583999815, + "learning_rate": 5.066350053901109e-06, + "loss": 0.5877, + "step": 1366 + }, + { + "epoch": 1.6394721055788843, + "grad_norm": 0.7409036971656349, + "learning_rate": 5.059366185224791e-06, + "loss": 0.6378, + "step": 1367 + }, + { + "epoch": 1.6406718656268746, + "grad_norm": 0.6545105385418288, + "learning_rate": 5.052382200708111e-06, + "loss": 0.6184, + "step": 1368 + }, + { + "epoch": 1.6418716256748649, + "grad_norm": 0.697345909377518, + "learning_rate": 5.045398113978816e-06, + "loss": 0.6545, + "step": 1369 + }, + { + "epoch": 1.6430713857228554, + "grad_norm": 0.5220491623520667, + "learning_rate": 5.038413938664849e-06, + "loss": 0.5996, + "step": 1370 + }, + { + "epoch": 1.6442711457708459, + "grad_norm": 0.5669323994294941, + "learning_rate": 5.03142968839433e-06, + "loss": 0.6126, + "step": 1371 + }, + { + "epoch": 1.6454709058188364, + "grad_norm": 0.6387534834755425, + "learning_rate": 5.024445376795524e-06, + "loss": 0.5562, + "step": 1372 + }, + { + "epoch": 1.6466706658668266, + "grad_norm": 0.7008635230548818, + "learning_rate": 5.017461017496814e-06, + "loss": 0.6342, + "step": 1373 + }, + { + "epoch": 1.647870425914817, + "grad_norm": 0.6515918866550156, + "learning_rate": 5.010476624126678e-06, + "loss": 0.6251, + "step": 1374 + }, + { + "epoch": 1.6490701859628074, + "grad_norm": 0.5081521953065723, + "learning_rate": 5.00349221031366e-06, + "loss": 0.663, + "step": 1375 + }, + { + "epoch": 1.650269946010798, + "grad_norm": 0.7432546157072372, + "learning_rate": 4.996507789686342e-06, + "loss": 0.642, + "step": 1376 + }, + { + "epoch": 1.6514697060587884, + "grad_norm": 0.552207404022524, + "learning_rate": 4.989523375873325e-06, + "loss": 0.6261, + "step": 1377 + }, + { + "epoch": 1.6526694661067787, + "grad_norm": 0.6493854028503266, + "learning_rate": 4.982538982503188e-06, + "loss": 0.6748, + "step": 1378 + }, + { + "epoch": 1.653869226154769, + "grad_norm": 0.7591599367795824, + "learning_rate": 4.975554623204478e-06, + "loss": 0.7019, + "step": 1379 + }, + { + "epoch": 1.6550689862027594, + "grad_norm": 0.5672636560414229, + "learning_rate": 4.9685703116056715e-06, + "loss": 0.6207, + "step": 1380 + }, + { + "epoch": 1.65626874625075, + "grad_norm": 0.6634334406291118, + "learning_rate": 4.961586061335153e-06, + "loss": 0.6331, + "step": 1381 + }, + { + "epoch": 1.6574685062987402, + "grad_norm": 0.5677574979637521, + "learning_rate": 4.9546018860211845e-06, + "loss": 0.6381, + "step": 1382 + }, + { + "epoch": 1.6586682663467307, + "grad_norm": 0.5986843581839568, + "learning_rate": 4.94761779929189e-06, + "loss": 0.6833, + "step": 1383 + }, + { + "epoch": 1.659868026394721, + "grad_norm": 0.6519456536042523, + "learning_rate": 4.9406338147752094e-06, + "loss": 0.5493, + "step": 1384 + }, + { + "epoch": 1.6610677864427115, + "grad_norm": 0.6893166733936632, + "learning_rate": 4.933649946098892e-06, + "loss": 0.6549, + "step": 1385 + }, + { + "epoch": 1.662267546490702, + "grad_norm": 0.7457906133470886, + "learning_rate": 4.926666206890457e-06, + "loss": 0.6356, + "step": 1386 + }, + { + "epoch": 1.6634673065386922, + "grad_norm": 0.5601863451041117, + "learning_rate": 4.91968261077717e-06, + "loss": 0.6127, + "step": 1387 + }, + { + "epoch": 1.6646670665866825, + "grad_norm": 0.6284409463777942, + "learning_rate": 4.912699171386023e-06, + "loss": 0.6172, + "step": 1388 + }, + { + "epoch": 1.665866826634673, + "grad_norm": 0.5949620807781346, + "learning_rate": 4.905715902343695e-06, + "loss": 0.6274, + "step": 1389 + }, + { + "epoch": 1.6670665866826635, + "grad_norm": 0.8324715418723123, + "learning_rate": 4.898732817276535e-06, + "loss": 0.6103, + "step": 1390 + }, + { + "epoch": 1.668266346730654, + "grad_norm": 0.6575519307953094, + "learning_rate": 4.891749929810538e-06, + "loss": 0.6174, + "step": 1391 + }, + { + "epoch": 1.6694661067786443, + "grad_norm": 0.7097139058095568, + "learning_rate": 4.884767253571308e-06, + "loss": 0.6218, + "step": 1392 + }, + { + "epoch": 1.6706658668266345, + "grad_norm": 0.8040567508035438, + "learning_rate": 4.877784802184037e-06, + "loss": 0.6645, + "step": 1393 + }, + { + "epoch": 1.671865626874625, + "grad_norm": 0.7822401523231658, + "learning_rate": 4.870802589273478e-06, + "loss": 0.5774, + "step": 1394 + }, + { + "epoch": 1.6730653869226155, + "grad_norm": 0.616670401217597, + "learning_rate": 4.863820628463925e-06, + "loss": 0.6616, + "step": 1395 + }, + { + "epoch": 1.674265146970606, + "grad_norm": 0.5030978714474055, + "learning_rate": 4.856838933379171e-06, + "loss": 0.6501, + "step": 1396 + }, + { + "epoch": 1.6754649070185963, + "grad_norm": 0.6174359209713869, + "learning_rate": 4.8498575176424955e-06, + "loss": 0.5841, + "step": 1397 + }, + { + "epoch": 1.6766646670665866, + "grad_norm": 0.7981596232184236, + "learning_rate": 4.842876394876636e-06, + "loss": 0.5693, + "step": 1398 + }, + { + "epoch": 1.677864427114577, + "grad_norm": 0.5773863788433636, + "learning_rate": 4.8358955787037476e-06, + "loss": 0.5577, + "step": 1399 + }, + { + "epoch": 1.6790641871625676, + "grad_norm": 0.8147173648555551, + "learning_rate": 4.828915082745398e-06, + "loss": 0.6019, + "step": 1400 + }, + { + "epoch": 1.680263947210558, + "grad_norm": 0.6998856064970432, + "learning_rate": 4.821934920622528e-06, + "loss": 0.6435, + "step": 1401 + }, + { + "epoch": 1.6814637072585483, + "grad_norm": 0.6301096966575788, + "learning_rate": 4.81495510595542e-06, + "loss": 0.5991, + "step": 1402 + }, + { + "epoch": 1.6826634673065386, + "grad_norm": 0.5376164948997293, + "learning_rate": 4.807975652363687e-06, + "loss": 0.6178, + "step": 1403 + }, + { + "epoch": 1.683863227354529, + "grad_norm": 0.6960603742114515, + "learning_rate": 4.8009965734662345e-06, + "loss": 0.6492, + "step": 1404 + }, + { + "epoch": 1.6850629874025196, + "grad_norm": 0.8949959217017778, + "learning_rate": 4.794017882881233e-06, + "loss": 0.6691, + "step": 1405 + }, + { + "epoch": 1.6862627474505099, + "grad_norm": 0.6958345832251298, + "learning_rate": 4.7870395942261005e-06, + "loss": 0.6615, + "step": 1406 + }, + { + "epoch": 1.6874625074985004, + "grad_norm": 0.586209277636213, + "learning_rate": 4.7800617211174705e-06, + "loss": 0.6434, + "step": 1407 + }, + { + "epoch": 1.6886622675464906, + "grad_norm": 0.73110751774531, + "learning_rate": 4.773084277171161e-06, + "loss": 0.6084, + "step": 1408 + }, + { + "epoch": 1.6898620275944811, + "grad_norm": 0.7239656781429334, + "learning_rate": 4.766107276002158e-06, + "loss": 0.5835, + "step": 1409 + }, + { + "epoch": 1.6910617876424716, + "grad_norm": 0.6031929392948729, + "learning_rate": 4.759130731224579e-06, + "loss": 0.6424, + "step": 1410 + }, + { + "epoch": 1.6922615476904619, + "grad_norm": 0.8232354870495764, + "learning_rate": 4.752154656451652e-06, + "loss": 0.5513, + "step": 1411 + }, + { + "epoch": 1.6934613077384522, + "grad_norm": 0.7640962339418518, + "learning_rate": 4.745179065295692e-06, + "loss": 0.671, + "step": 1412 + }, + { + "epoch": 1.6946610677864427, + "grad_norm": 0.7514613522929842, + "learning_rate": 4.738203971368066e-06, + "loss": 0.6191, + "step": 1413 + }, + { + "epoch": 1.6958608278344331, + "grad_norm": 0.5324176220078973, + "learning_rate": 4.73122938827917e-06, + "loss": 0.6231, + "step": 1414 + }, + { + "epoch": 1.6970605878824236, + "grad_norm": 0.7887179892123806, + "learning_rate": 4.724255329638411e-06, + "loss": 0.617, + "step": 1415 + }, + { + "epoch": 1.698260347930414, + "grad_norm": 0.5051402718166775, + "learning_rate": 4.717281809054159e-06, + "loss": 0.5679, + "step": 1416 + }, + { + "epoch": 1.6994601079784042, + "grad_norm": 0.7305596217490118, + "learning_rate": 4.710308840133747e-06, + "loss": 0.6135, + "step": 1417 + }, + { + "epoch": 1.7006598680263947, + "grad_norm": 0.718026410793312, + "learning_rate": 4.703336436483426e-06, + "loss": 0.6765, + "step": 1418 + }, + { + "epoch": 1.7018596280743852, + "grad_norm": 0.752441248195329, + "learning_rate": 4.696364611708342e-06, + "loss": 0.6097, + "step": 1419 + }, + { + "epoch": 1.7030593881223757, + "grad_norm": 0.6721419890454678, + "learning_rate": 4.689393379412517e-06, + "loss": 0.6639, + "step": 1420 + }, + { + "epoch": 1.704259148170366, + "grad_norm": 0.7818993504772956, + "learning_rate": 4.682422753198812e-06, + "loss": 0.6074, + "step": 1421 + }, + { + "epoch": 1.7054589082183562, + "grad_norm": 0.6065727544273684, + "learning_rate": 4.6754527466689086e-06, + "loss": 0.6049, + "step": 1422 + }, + { + "epoch": 1.7066586682663467, + "grad_norm": 0.6877103580556603, + "learning_rate": 4.668483373423273e-06, + "loss": 0.6457, + "step": 1423 + }, + { + "epoch": 1.7078584283143372, + "grad_norm": 0.6648773775650588, + "learning_rate": 4.661514647061149e-06, + "loss": 0.5718, + "step": 1424 + }, + { + "epoch": 1.7090581883623275, + "grad_norm": 0.7531066166259784, + "learning_rate": 4.654546581180502e-06, + "loss": 0.6491, + "step": 1425 + }, + { + "epoch": 1.710257948410318, + "grad_norm": 0.7177465989254774, + "learning_rate": 4.647579189378019e-06, + "loss": 0.6496, + "step": 1426 + }, + { + "epoch": 1.7114577084583082, + "grad_norm": 0.6506887097762082, + "learning_rate": 4.640612485249072e-06, + "loss": 0.6098, + "step": 1427 + }, + { + "epoch": 1.7126574685062987, + "grad_norm": 0.6342494598879174, + "learning_rate": 4.6336464823876826e-06, + "loss": 0.585, + "step": 1428 + }, + { + "epoch": 1.7138572285542892, + "grad_norm": 0.7498553556828876, + "learning_rate": 4.6266811943865124e-06, + "loss": 0.6213, + "step": 1429 + }, + { + "epoch": 1.7150569886022795, + "grad_norm": 0.5775583134180416, + "learning_rate": 4.619716634836826e-06, + "loss": 0.6555, + "step": 1430 + }, + { + "epoch": 1.7162567486502698, + "grad_norm": 0.6937580595546063, + "learning_rate": 4.612752817328463e-06, + "loss": 0.6028, + "step": 1431 + }, + { + "epoch": 1.7174565086982603, + "grad_norm": 0.7321126076487201, + "learning_rate": 4.60578975544982e-06, + "loss": 0.6546, + "step": 1432 + }, + { + "epoch": 1.7186562687462508, + "grad_norm": 0.8260819299546153, + "learning_rate": 4.5988274627878175e-06, + "loss": 0.684, + "step": 1433 + }, + { + "epoch": 1.7198560287942413, + "grad_norm": 0.6295125038484417, + "learning_rate": 4.591865952927873e-06, + "loss": 0.6312, + "step": 1434 + }, + { + "epoch": 1.7210557888422315, + "grad_norm": 0.7764441386243321, + "learning_rate": 4.5849052394538775e-06, + "loss": 0.6707, + "step": 1435 + }, + { + "epoch": 1.7222555488902218, + "grad_norm": 0.5909491911998985, + "learning_rate": 4.577945335948172e-06, + "loss": 0.5888, + "step": 1436 + }, + { + "epoch": 1.7234553089382123, + "grad_norm": 0.6214967272966967, + "learning_rate": 4.570986255991509e-06, + "loss": 0.686, + "step": 1437 + }, + { + "epoch": 1.7246550689862028, + "grad_norm": 0.7019193004149545, + "learning_rate": 4.564028013163043e-06, + "loss": 0.6136, + "step": 1438 + }, + { + "epoch": 1.7258548290341933, + "grad_norm": 0.6201886115016094, + "learning_rate": 4.5570706210402874e-06, + "loss": 0.5633, + "step": 1439 + }, + { + "epoch": 1.7270545890821836, + "grad_norm": 0.8722538070364205, + "learning_rate": 4.5501140931991e-06, + "loss": 0.5883, + "step": 1440 + }, + { + "epoch": 1.7282543491301738, + "grad_norm": 0.6686952418096178, + "learning_rate": 4.543158443213653e-06, + "loss": 0.6282, + "step": 1441 + }, + { + "epoch": 1.7294541091781643, + "grad_norm": 0.7937448111380455, + "learning_rate": 4.5362036846564015e-06, + "loss": 0.5691, + "step": 1442 + }, + { + "epoch": 1.7306538692261548, + "grad_norm": 0.6396496808668831, + "learning_rate": 4.529249831098062e-06, + "loss": 0.6303, + "step": 1443 + }, + { + "epoch": 1.7318536292741453, + "grad_norm": 0.6642865948086335, + "learning_rate": 4.52229689610759e-06, + "loss": 0.6523, + "step": 1444 + }, + { + "epoch": 1.7330533893221356, + "grad_norm": 0.7621455130419058, + "learning_rate": 4.515344893252142e-06, + "loss": 0.6516, + "step": 1445 + }, + { + "epoch": 1.7342531493701259, + "grad_norm": 0.5789503449826376, + "learning_rate": 4.508393836097061e-06, + "loss": 0.6032, + "step": 1446 + }, + { + "epoch": 1.7354529094181164, + "grad_norm": 0.6277333953284421, + "learning_rate": 4.501443738205841e-06, + "loss": 0.6478, + "step": 1447 + }, + { + "epoch": 1.7366526694661069, + "grad_norm": 0.6816407236510691, + "learning_rate": 4.494494613140105e-06, + "loss": 0.6952, + "step": 1448 + }, + { + "epoch": 1.7378524295140971, + "grad_norm": 0.5982908395568053, + "learning_rate": 4.487546474459579e-06, + "loss": 0.5816, + "step": 1449 + }, + { + "epoch": 1.7390521895620876, + "grad_norm": 0.7660687758767524, + "learning_rate": 4.480599335722066e-06, + "loss": 0.619, + "step": 1450 + }, + { + "epoch": 1.740251949610078, + "grad_norm": 0.7223849213293781, + "learning_rate": 4.4736532104834105e-06, + "loss": 0.6366, + "step": 1451 + }, + { + "epoch": 1.7414517096580684, + "grad_norm": 0.5146072094360288, + "learning_rate": 4.466708112297489e-06, + "loss": 0.6983, + "step": 1452 + }, + { + "epoch": 1.7426514697060589, + "grad_norm": 0.7024831535983507, + "learning_rate": 4.459764054716167e-06, + "loss": 0.6105, + "step": 1453 + }, + { + "epoch": 1.7438512297540492, + "grad_norm": 0.4880549556650625, + "learning_rate": 4.4528210512892814e-06, + "loss": 0.5857, + "step": 1454 + }, + { + "epoch": 1.7450509898020394, + "grad_norm": 0.8018785675884215, + "learning_rate": 4.445879115564612e-06, + "loss": 0.6069, + "step": 1455 + }, + { + "epoch": 1.74625074985003, + "grad_norm": 0.571914332481684, + "learning_rate": 4.438938261087859e-06, + "loss": 0.6243, + "step": 1456 + }, + { + "epoch": 1.7474505098980204, + "grad_norm": 0.5807256525801378, + "learning_rate": 4.431998501402605e-06, + "loss": 0.6511, + "step": 1457 + }, + { + "epoch": 1.748650269946011, + "grad_norm": 0.7345672940171202, + "learning_rate": 4.4250598500503045e-06, + "loss": 0.555, + "step": 1458 + }, + { + "epoch": 1.7498500299940012, + "grad_norm": 0.7940960330578659, + "learning_rate": 4.4181223205702435e-06, + "loss": 0.61, + "step": 1459 + }, + { + "epoch": 1.7510497900419915, + "grad_norm": 0.7993824179585033, + "learning_rate": 4.41118592649952e-06, + "loss": 0.6065, + "step": 1460 + }, + { + "epoch": 1.752249550089982, + "grad_norm": 0.6358705050170755, + "learning_rate": 4.404250681373021e-06, + "loss": 0.6099, + "step": 1461 + }, + { + "epoch": 1.7534493101379725, + "grad_norm": 0.6965743907502828, + "learning_rate": 4.397316598723385e-06, + "loss": 0.6742, + "step": 1462 + }, + { + "epoch": 1.754649070185963, + "grad_norm": 0.6328725130943839, + "learning_rate": 4.390383692080986e-06, + "loss": 0.5806, + "step": 1463 + }, + { + "epoch": 1.7558488302339532, + "grad_norm": 0.6311566335345002, + "learning_rate": 4.383451974973903e-06, + "loss": 0.6053, + "step": 1464 + }, + { + "epoch": 1.7570485902819435, + "grad_norm": 0.7052395609633253, + "learning_rate": 4.376521460927896e-06, + "loss": 0.6321, + "step": 1465 + }, + { + "epoch": 1.758248350329934, + "grad_norm": 0.7535406881872222, + "learning_rate": 4.36959216346637e-06, + "loss": 0.5487, + "step": 1466 + }, + { + "epoch": 1.7594481103779245, + "grad_norm": 0.6338229748455944, + "learning_rate": 4.362664096110361e-06, + "loss": 0.7053, + "step": 1467 + }, + { + "epoch": 1.760647870425915, + "grad_norm": 0.6309233539496764, + "learning_rate": 4.35573727237851e-06, + "loss": 0.6439, + "step": 1468 + }, + { + "epoch": 1.7618476304739052, + "grad_norm": 0.6150574415134877, + "learning_rate": 4.3488117057870225e-06, + "loss": 0.6238, + "step": 1469 + }, + { + "epoch": 1.7630473905218955, + "grad_norm": 0.5685790958617888, + "learning_rate": 4.341887409849656e-06, + "loss": 0.6298, + "step": 1470 + }, + { + "epoch": 1.764247150569886, + "grad_norm": 0.6752528106945532, + "learning_rate": 4.334964398077684e-06, + "loss": 0.5655, + "step": 1471 + }, + { + "epoch": 1.7654469106178765, + "grad_norm": 0.7649358108814712, + "learning_rate": 4.328042683979878e-06, + "loss": 0.6546, + "step": 1472 + }, + { + "epoch": 1.7666466706658668, + "grad_norm": 0.7905896064810997, + "learning_rate": 4.321122281062481e-06, + "loss": 0.6105, + "step": 1473 + }, + { + "epoch": 1.7678464307138573, + "grad_norm": 0.5660683589294571, + "learning_rate": 4.3142032028291695e-06, + "loss": 0.5924, + "step": 1474 + }, + { + "epoch": 1.7690461907618475, + "grad_norm": 0.7846283436256514, + "learning_rate": 4.307285462781038e-06, + "loss": 0.6607, + "step": 1475 + }, + { + "epoch": 1.770245950809838, + "grad_norm": 0.6580703214396916, + "learning_rate": 4.3003690744165765e-06, + "loss": 0.578, + "step": 1476 + }, + { + "epoch": 1.7714457108578285, + "grad_norm": 0.5237471095482176, + "learning_rate": 4.293454051231624e-06, + "loss": 0.5871, + "step": 1477 + }, + { + "epoch": 1.7726454709058188, + "grad_norm": 0.6327731192095628, + "learning_rate": 4.286540406719367e-06, + "loss": 0.6445, + "step": 1478 + }, + { + "epoch": 1.773845230953809, + "grad_norm": 0.8070222294599864, + "learning_rate": 4.2796281543703e-06, + "loss": 0.63, + "step": 1479 + }, + { + "epoch": 1.7750449910017996, + "grad_norm": 0.6062725964397185, + "learning_rate": 4.272717307672194e-06, + "loss": 0.6437, + "step": 1480 + }, + { + "epoch": 1.77624475104979, + "grad_norm": 0.6722768114469376, + "learning_rate": 4.265807880110087e-06, + "loss": 0.5731, + "step": 1481 + }, + { + "epoch": 1.7774445110977806, + "grad_norm": 0.5678106199265854, + "learning_rate": 4.258899885166242e-06, + "loss": 0.6429, + "step": 1482 + }, + { + "epoch": 1.7786442711457708, + "grad_norm": 0.5571672309900889, + "learning_rate": 4.251993336320127e-06, + "loss": 0.6058, + "step": 1483 + }, + { + "epoch": 1.7798440311937611, + "grad_norm": 0.7036183857353746, + "learning_rate": 4.245088247048388e-06, + "loss": 0.6477, + "step": 1484 + }, + { + "epoch": 1.7810437912417516, + "grad_norm": 0.7123575403550069, + "learning_rate": 4.238184630824828e-06, + "loss": 0.6218, + "step": 1485 + }, + { + "epoch": 1.782243551289742, + "grad_norm": 0.7751307578636013, + "learning_rate": 4.231282501120366e-06, + "loss": 0.6096, + "step": 1486 + }, + { + "epoch": 1.7834433113377326, + "grad_norm": 0.5388334838746758, + "learning_rate": 4.224381871403028e-06, + "loss": 0.608, + "step": 1487 + }, + { + "epoch": 1.7846430713857229, + "grad_norm": 0.7485815957038062, + "learning_rate": 4.217482755137916e-06, + "loss": 0.5487, + "step": 1488 + }, + { + "epoch": 1.7858428314337131, + "grad_norm": 0.6686635243767864, + "learning_rate": 4.210585165787166e-06, + "loss": 0.5603, + "step": 1489 + }, + { + "epoch": 1.7870425914817036, + "grad_norm": 0.7980471135553686, + "learning_rate": 4.203689116809946e-06, + "loss": 0.5965, + "step": 1490 + }, + { + "epoch": 1.7882423515296941, + "grad_norm": 0.7711362904906689, + "learning_rate": 4.196794621662417e-06, + "loss": 0.6021, + "step": 1491 + }, + { + "epoch": 1.7894421115776846, + "grad_norm": 0.6478401612668694, + "learning_rate": 4.1899016937977e-06, + "loss": 0.5521, + "step": 1492 + }, + { + "epoch": 1.790641871625675, + "grad_norm": 0.7301532095236675, + "learning_rate": 4.183010346665869e-06, + "loss": 0.5973, + "step": 1493 + }, + { + "epoch": 1.7918416316736652, + "grad_norm": 0.7504012219009591, + "learning_rate": 4.1761205937139075e-06, + "loss": 0.5623, + "step": 1494 + }, + { + "epoch": 1.7930413917216557, + "grad_norm": 0.8575530566266752, + "learning_rate": 4.169232448385685e-06, + "loss": 0.6651, + "step": 1495 + }, + { + "epoch": 1.7942411517696462, + "grad_norm": 0.5991870908982022, + "learning_rate": 4.162345924121941e-06, + "loss": 0.5979, + "step": 1496 + }, + { + "epoch": 1.7954409118176364, + "grad_norm": 0.727844764668071, + "learning_rate": 4.15546103436025e-06, + "loss": 0.6081, + "step": 1497 + }, + { + "epoch": 1.796640671865627, + "grad_norm": 0.7160612221748286, + "learning_rate": 4.148577792534992e-06, + "loss": 0.6317, + "step": 1498 + }, + { + "epoch": 1.7978404319136172, + "grad_norm": 0.7569049873211127, + "learning_rate": 4.14169621207734e-06, + "loss": 0.5891, + "step": 1499 + }, + { + "epoch": 1.7990401919616077, + "grad_norm": 0.626522678372816, + "learning_rate": 4.134816306415216e-06, + "loss": 0.6095, + "step": 1500 + }, + { + "epoch": 1.8002399520095982, + "grad_norm": 0.6719967537153403, + "learning_rate": 4.127938088973279e-06, + "loss": 0.6506, + "step": 1501 + }, + { + "epoch": 1.8014397120575885, + "grad_norm": 0.5512021941132703, + "learning_rate": 4.121061573172898e-06, + "loss": 0.6205, + "step": 1502 + }, + { + "epoch": 1.8026394721055787, + "grad_norm": 0.7297583764268756, + "learning_rate": 4.1141867724321115e-06, + "loss": 0.6474, + "step": 1503 + }, + { + "epoch": 1.8038392321535692, + "grad_norm": 0.45406727311456535, + "learning_rate": 4.1073137001656184e-06, + "loss": 0.665, + "step": 1504 + }, + { + "epoch": 1.8050389922015597, + "grad_norm": 0.770605560110556, + "learning_rate": 4.100442369784746e-06, + "loss": 0.6455, + "step": 1505 + }, + { + "epoch": 1.8062387522495502, + "grad_norm": 0.6900802474911933, + "learning_rate": 4.0935727946974176e-06, + "loss": 0.6682, + "step": 1506 + }, + { + "epoch": 1.8074385122975405, + "grad_norm": 0.5556687244755016, + "learning_rate": 4.086704988308134e-06, + "loss": 0.6034, + "step": 1507 + }, + { + "epoch": 1.8086382723455308, + "grad_norm": 0.6873341697662002, + "learning_rate": 4.079838964017945e-06, + "loss": 0.5547, + "step": 1508 + }, + { + "epoch": 1.8098380323935213, + "grad_norm": 0.6874227965615172, + "learning_rate": 4.072974735224421e-06, + "loss": 0.6071, + "step": 1509 + }, + { + "epoch": 1.8110377924415118, + "grad_norm": 0.6122799863116688, + "learning_rate": 4.066112315321632e-06, + "loss": 0.6301, + "step": 1510 + }, + { + "epoch": 1.8122375524895022, + "grad_norm": 0.6156415651630566, + "learning_rate": 4.059251717700118e-06, + "loss": 0.6434, + "step": 1511 + }, + { + "epoch": 1.8134373125374925, + "grad_norm": 0.7039193843462168, + "learning_rate": 4.05239295574686e-06, + "loss": 0.5995, + "step": 1512 + }, + { + "epoch": 1.8146370725854828, + "grad_norm": 0.7226905662344233, + "learning_rate": 4.045536042845257e-06, + "loss": 0.6785, + "step": 1513 + }, + { + "epoch": 1.8158368326334733, + "grad_norm": 0.7656410951997662, + "learning_rate": 4.038680992375108e-06, + "loss": 0.5813, + "step": 1514 + }, + { + "epoch": 1.8170365926814638, + "grad_norm": 0.6446736657739984, + "learning_rate": 4.031827817712568e-06, + "loss": 0.5885, + "step": 1515 + }, + { + "epoch": 1.8182363527294543, + "grad_norm": 0.6008413183656197, + "learning_rate": 4.024976532230135e-06, + "loss": 0.626, + "step": 1516 + }, + { + "epoch": 1.8194361127774445, + "grad_norm": 0.6617307462453432, + "learning_rate": 4.018127149296626e-06, + "loss": 0.6027, + "step": 1517 + }, + { + "epoch": 1.8206358728254348, + "grad_norm": 0.6356581631265987, + "learning_rate": 4.011279682277135e-06, + "loss": 0.5932, + "step": 1518 + }, + { + "epoch": 1.8218356328734253, + "grad_norm": 0.6686838163259288, + "learning_rate": 4.004434144533027e-06, + "loss": 0.6644, + "step": 1519 + }, + { + "epoch": 1.8230353929214158, + "grad_norm": 0.7226742772232984, + "learning_rate": 3.9975905494218995e-06, + "loss": 0.6478, + "step": 1520 + }, + { + "epoch": 1.824235152969406, + "grad_norm": 0.6254638417669428, + "learning_rate": 3.9907489102975546e-06, + "loss": 0.5728, + "step": 1521 + }, + { + "epoch": 1.8254349130173964, + "grad_norm": 0.7162896495952091, + "learning_rate": 3.9839092405099856e-06, + "loss": 0.6315, + "step": 1522 + }, + { + "epoch": 1.8266346730653868, + "grad_norm": 0.6635180291450986, + "learning_rate": 3.97707155340534e-06, + "loss": 0.5864, + "step": 1523 + }, + { + "epoch": 1.8278344331133773, + "grad_norm": 0.5896170786586302, + "learning_rate": 3.970235862325892e-06, + "loss": 0.6109, + "step": 1524 + }, + { + "epoch": 1.8290341931613678, + "grad_norm": 0.6741198016269524, + "learning_rate": 3.963402180610028e-06, + "loss": 0.6371, + "step": 1525 + }, + { + "epoch": 1.830233953209358, + "grad_norm": 0.6608823149096729, + "learning_rate": 3.95657052159221e-06, + "loss": 0.6146, + "step": 1526 + }, + { + "epoch": 1.8314337132573484, + "grad_norm": 0.6141701612238413, + "learning_rate": 3.949740898602953e-06, + "loss": 0.6445, + "step": 1527 + }, + { + "epoch": 1.8326334733053389, + "grad_norm": 0.5381899289584886, + "learning_rate": 3.942913324968799e-06, + "loss": 0.6448, + "step": 1528 + }, + { + "epoch": 1.8338332333533294, + "grad_norm": 0.6289407950918525, + "learning_rate": 3.936087814012293e-06, + "loss": 0.5984, + "step": 1529 + }, + { + "epoch": 1.8350329934013199, + "grad_norm": 0.7430299369735149, + "learning_rate": 3.929264379051954e-06, + "loss": 0.615, + "step": 1530 + }, + { + "epoch": 1.8362327534493101, + "grad_norm": 0.6034032294138236, + "learning_rate": 3.922443033402249e-06, + "loss": 0.6413, + "step": 1531 + }, + { + "epoch": 1.8374325134973004, + "grad_norm": 0.5886494650231635, + "learning_rate": 3.91562379037357e-06, + "loss": 0.5852, + "step": 1532 + }, + { + "epoch": 1.838632273545291, + "grad_norm": 0.5068433432677715, + "learning_rate": 3.908806663272202e-06, + "loss": 0.6635, + "step": 1533 + }, + { + "epoch": 1.8398320335932814, + "grad_norm": 0.6607006051476574, + "learning_rate": 3.901991665400312e-06, + "loss": 0.6946, + "step": 1534 + }, + { + "epoch": 1.841031793641272, + "grad_norm": 0.7126897580438574, + "learning_rate": 3.895178810055899e-06, + "loss": 0.6821, + "step": 1535 + }, + { + "epoch": 1.8422315536892622, + "grad_norm": 0.6167775006271451, + "learning_rate": 3.888368110532788e-06, + "loss": 0.6649, + "step": 1536 + }, + { + "epoch": 1.8434313137372524, + "grad_norm": 0.7069325448529817, + "learning_rate": 3.8815595801206e-06, + "loss": 0.7081, + "step": 1537 + }, + { + "epoch": 1.844631073785243, + "grad_norm": 0.5739397130561915, + "learning_rate": 3.874753232104714e-06, + "loss": 0.5863, + "step": 1538 + }, + { + "epoch": 1.8458308338332334, + "grad_norm": 0.6899181546698574, + "learning_rate": 3.867949079766262e-06, + "loss": 0.5816, + "step": 1539 + }, + { + "epoch": 1.8470305938812237, + "grad_norm": 0.5441258043275793, + "learning_rate": 3.861147136382085e-06, + "loss": 0.5964, + "step": 1540 + }, + { + "epoch": 1.8482303539292142, + "grad_norm": 0.8088124006931219, + "learning_rate": 3.854347415224712e-06, + "loss": 0.6622, + "step": 1541 + }, + { + "epoch": 1.8494301139772045, + "grad_norm": 0.7350261458150595, + "learning_rate": 3.847549929562341e-06, + "loss": 0.614, + "step": 1542 + }, + { + "epoch": 1.850629874025195, + "grad_norm": 0.5648819785524932, + "learning_rate": 3.840754692658807e-06, + "loss": 0.5854, + "step": 1543 + }, + { + "epoch": 1.8518296340731855, + "grad_norm": 0.5970755135089987, + "learning_rate": 3.833961717773554e-06, + "loss": 0.6509, + "step": 1544 + }, + { + "epoch": 1.8530293941211757, + "grad_norm": 0.7247750021092256, + "learning_rate": 3.8271710181616106e-06, + "loss": 0.5358, + "step": 1545 + }, + { + "epoch": 1.854229154169166, + "grad_norm": 0.7132939979536987, + "learning_rate": 3.820382607073575e-06, + "loss": 0.5935, + "step": 1546 + }, + { + "epoch": 1.8554289142171565, + "grad_norm": 0.6616048635749737, + "learning_rate": 3.8135964977555696e-06, + "loss": 0.6382, + "step": 1547 + }, + { + "epoch": 1.856628674265147, + "grad_norm": 0.5171970812831564, + "learning_rate": 3.8068127034492287e-06, + "loss": 0.6577, + "step": 1548 + }, + { + "epoch": 1.8578284343131375, + "grad_norm": 0.7182240258341588, + "learning_rate": 3.800031237391676e-06, + "loss": 0.628, + "step": 1549 + }, + { + "epoch": 1.8590281943611278, + "grad_norm": 0.5717078220686386, + "learning_rate": 3.7932521128154785e-06, + "loss": 0.6608, + "step": 1550 + }, + { + "epoch": 1.860227954409118, + "grad_norm": 0.6749749924395927, + "learning_rate": 3.7864753429486475e-06, + "loss": 0.5582, + "step": 1551 + }, + { + "epoch": 1.8614277144571085, + "grad_norm": 0.6451774623688347, + "learning_rate": 3.779700941014593e-06, + "loss": 0.6287, + "step": 1552 + }, + { + "epoch": 1.862627474505099, + "grad_norm": 0.6596254606412011, + "learning_rate": 3.772928920232103e-06, + "loss": 0.6363, + "step": 1553 + }, + { + "epoch": 1.8638272345530895, + "grad_norm": 0.7688241705078183, + "learning_rate": 3.7661592938153248e-06, + "loss": 0.6424, + "step": 1554 + }, + { + "epoch": 1.8650269946010798, + "grad_norm": 0.639799196330518, + "learning_rate": 3.7593920749737307e-06, + "loss": 0.6, + "step": 1555 + }, + { + "epoch": 1.86622675464907, + "grad_norm": 0.6901288327510051, + "learning_rate": 3.7526272769120933e-06, + "loss": 0.6375, + "step": 1556 + }, + { + "epoch": 1.8674265146970606, + "grad_norm": 0.6477346369269456, + "learning_rate": 3.745864912830463e-06, + "loss": 0.6143, + "step": 1557 + }, + { + "epoch": 1.868626274745051, + "grad_norm": 0.7668025609143018, + "learning_rate": 3.739104995924144e-06, + "loss": 0.6667, + "step": 1558 + }, + { + "epoch": 1.8698260347930415, + "grad_norm": 0.5984068174104991, + "learning_rate": 3.7323475393836606e-06, + "loss": 0.6416, + "step": 1559 + }, + { + "epoch": 1.8710257948410318, + "grad_norm": 0.7428673455418627, + "learning_rate": 3.725592556394737e-06, + "loss": 0.615, + "step": 1560 + }, + { + "epoch": 1.872225554889022, + "grad_norm": 0.6109891944291239, + "learning_rate": 3.7188400601382768e-06, + "loss": 0.6209, + "step": 1561 + }, + { + "epoch": 1.8734253149370126, + "grad_norm": 0.6302325104487722, + "learning_rate": 3.712090063790319e-06, + "loss": 0.5695, + "step": 1562 + }, + { + "epoch": 1.874625074985003, + "grad_norm": 0.6689292962337307, + "learning_rate": 3.7053425805220385e-06, + "loss": 0.6115, + "step": 1563 + }, + { + "epoch": 1.8758248350329934, + "grad_norm": 0.7362877890092404, + "learning_rate": 3.6985976234996957e-06, + "loss": 0.5754, + "step": 1564 + }, + { + "epoch": 1.8770245950809838, + "grad_norm": 0.5993796529437634, + "learning_rate": 3.691855205884627e-06, + "loss": 0.6283, + "step": 1565 + }, + { + "epoch": 1.8782243551289741, + "grad_norm": 0.6836726848169472, + "learning_rate": 3.685115340833215e-06, + "loss": 0.622, + "step": 1566 + }, + { + "epoch": 1.8794241151769646, + "grad_norm": 0.679556193178331, + "learning_rate": 3.678378041496856e-06, + "loss": 0.6357, + "step": 1567 + }, + { + "epoch": 1.880623875224955, + "grad_norm": 0.6254658107874946, + "learning_rate": 3.6716433210219437e-06, + "loss": 0.618, + "step": 1568 + }, + { + "epoch": 1.8818236352729454, + "grad_norm": 0.7162368032094661, + "learning_rate": 3.66491119254984e-06, + "loss": 0.6344, + "step": 1569 + }, + { + "epoch": 1.8830233953209357, + "grad_norm": 0.5923415229221276, + "learning_rate": 3.6581816692168454e-06, + "loss": 0.6638, + "step": 1570 + }, + { + "epoch": 1.8842231553689262, + "grad_norm": 0.5789337203250047, + "learning_rate": 3.651454764154182e-06, + "loss": 0.6832, + "step": 1571 + }, + { + "epoch": 1.8854229154169166, + "grad_norm": 0.6858589129555117, + "learning_rate": 3.644730490487961e-06, + "loss": 0.6212, + "step": 1572 + }, + { + "epoch": 1.8866226754649071, + "grad_norm": 0.7585622324613859, + "learning_rate": 3.638008861339156e-06, + "loss": 0.599, + "step": 1573 + }, + { + "epoch": 1.8878224355128974, + "grad_norm": 0.6791926906418987, + "learning_rate": 3.6312898898235828e-06, + "loss": 0.6323, + "step": 1574 + }, + { + "epoch": 1.8890221955608877, + "grad_norm": 0.6789403422597844, + "learning_rate": 3.6245735890518746e-06, + "loss": 0.647, + "step": 1575 + }, + { + "epoch": 1.8902219556088782, + "grad_norm": 0.6475201273403531, + "learning_rate": 3.6178599721294476e-06, + "loss": 0.5911, + "step": 1576 + }, + { + "epoch": 1.8914217156568687, + "grad_norm": 0.9632926156368794, + "learning_rate": 3.611149052156483e-06, + "loss": 0.6901, + "step": 1577 + }, + { + "epoch": 1.8926214757048592, + "grad_norm": 0.9992430340877584, + "learning_rate": 3.604440842227904e-06, + "loss": 0.6734, + "step": 1578 + }, + { + "epoch": 1.8938212357528494, + "grad_norm": 0.5662345641902004, + "learning_rate": 3.5977353554333373e-06, + "loss": 0.6472, + "step": 1579 + }, + { + "epoch": 1.8950209958008397, + "grad_norm": 0.6833948607274497, + "learning_rate": 3.591032604857102e-06, + "loss": 0.618, + "step": 1580 + }, + { + "epoch": 1.8962207558488302, + "grad_norm": 0.5504537043144545, + "learning_rate": 3.584332603578178e-06, + "loss": 0.6248, + "step": 1581 + }, + { + "epoch": 1.8974205158968207, + "grad_norm": 0.5973518111867986, + "learning_rate": 3.5776353646701757e-06, + "loss": 0.6231, + "step": 1582 + }, + { + "epoch": 1.8986202759448112, + "grad_norm": 0.6890278607358444, + "learning_rate": 3.5709409012013207e-06, + "loss": 0.6166, + "step": 1583 + }, + { + "epoch": 1.8998200359928015, + "grad_norm": 0.6893800529744167, + "learning_rate": 3.564249226234423e-06, + "loss": 0.577, + "step": 1584 + }, + { + "epoch": 1.9010197960407917, + "grad_norm": 0.6950537843364714, + "learning_rate": 3.5575603528268454e-06, + "loss": 0.654, + "step": 1585 + }, + { + "epoch": 1.9022195560887822, + "grad_norm": 0.6343195501551226, + "learning_rate": 3.5508742940304893e-06, + "loss": 0.6121, + "step": 1586 + }, + { + "epoch": 1.9034193161367727, + "grad_norm": 0.6572797354811389, + "learning_rate": 3.5441910628917646e-06, + "loss": 0.6396, + "step": 1587 + }, + { + "epoch": 1.904619076184763, + "grad_norm": 0.7168995016643164, + "learning_rate": 3.5375106724515594e-06, + "loss": 0.6449, + "step": 1588 + }, + { + "epoch": 1.9058188362327535, + "grad_norm": 0.6522301267570335, + "learning_rate": 3.5308331357452206e-06, + "loss": 0.6397, + "step": 1589 + }, + { + "epoch": 1.9070185962807438, + "grad_norm": 0.5185299455083228, + "learning_rate": 3.524158465802531e-06, + "loss": 0.5938, + "step": 1590 + }, + { + "epoch": 1.9082183563287343, + "grad_norm": 0.6507834551656557, + "learning_rate": 3.517486675647672e-06, + "loss": 0.6077, + "step": 1591 + }, + { + "epoch": 1.9094181163767248, + "grad_norm": 0.5007079150029254, + "learning_rate": 3.5108177782992112e-06, + "loss": 0.5797, + "step": 1592 + }, + { + "epoch": 1.910617876424715, + "grad_norm": 0.6518483853622233, + "learning_rate": 3.5041517867700686e-06, + "loss": 0.6257, + "step": 1593 + }, + { + "epoch": 1.9118176364727053, + "grad_norm": 0.6171506924153792, + "learning_rate": 3.497488714067494e-06, + "loss": 0.6237, + "step": 1594 + }, + { + "epoch": 1.9130173965206958, + "grad_norm": 0.5877533330592362, + "learning_rate": 3.4908285731930467e-06, + "loss": 0.6141, + "step": 1595 + }, + { + "epoch": 1.9142171565686863, + "grad_norm": 0.6652036567138812, + "learning_rate": 3.484171377142559e-06, + "loss": 0.6327, + "step": 1596 + }, + { + "epoch": 1.9154169166166768, + "grad_norm": 0.526253481313353, + "learning_rate": 3.4775171389061193e-06, + "loss": 0.6375, + "step": 1597 + }, + { + "epoch": 1.916616676664667, + "grad_norm": 0.7534939416117871, + "learning_rate": 3.470865871468046e-06, + "loss": 0.6776, + "step": 1598 + }, + { + "epoch": 1.9178164367126573, + "grad_norm": 0.6263147900496997, + "learning_rate": 3.4642175878068562e-06, + "loss": 0.6927, + "step": 1599 + }, + { + "epoch": 1.9190161967606478, + "grad_norm": 0.7617577055615982, + "learning_rate": 3.4575723008952503e-06, + "loss": 0.6378, + "step": 1600 + }, + { + "epoch": 1.9202159568086383, + "grad_norm": 0.6064889940890927, + "learning_rate": 3.45093002370008e-06, + "loss": 0.6825, + "step": 1601 + }, + { + "epoch": 1.9214157168566288, + "grad_norm": 0.8528410463243201, + "learning_rate": 3.4442907691823193e-06, + "loss": 0.6226, + "step": 1602 + }, + { + "epoch": 1.922615476904619, + "grad_norm": 0.5302074932770431, + "learning_rate": 3.437654550297049e-06, + "loss": 0.6454, + "step": 1603 + }, + { + "epoch": 1.9238152369526094, + "grad_norm": 0.5529349035112056, + "learning_rate": 3.431021379993428e-06, + "loss": 0.6355, + "step": 1604 + }, + { + "epoch": 1.9250149970005999, + "grad_norm": 0.7120435058914695, + "learning_rate": 3.4243912712146606e-06, + "loss": 0.6604, + "step": 1605 + }, + { + "epoch": 1.9262147570485904, + "grad_norm": 0.7601134981983861, + "learning_rate": 3.417764236897979e-06, + "loss": 0.6125, + "step": 1606 + }, + { + "epoch": 1.9274145170965808, + "grad_norm": 0.7066131442900028, + "learning_rate": 3.4111402899746226e-06, + "loss": 0.6166, + "step": 1607 + }, + { + "epoch": 1.9286142771445711, + "grad_norm": 0.8372429490650506, + "learning_rate": 3.404519443369798e-06, + "loss": 0.611, + "step": 1608 + }, + { + "epoch": 1.9298140371925614, + "grad_norm": 0.5658543889572308, + "learning_rate": 3.3979017100026645e-06, + "loss": 0.6069, + "step": 1609 + }, + { + "epoch": 1.9310137972405519, + "grad_norm": 0.6589818674295754, + "learning_rate": 3.391287102786312e-06, + "loss": 0.5221, + "step": 1610 + }, + { + "epoch": 1.9322135572885424, + "grad_norm": 0.6756834191950231, + "learning_rate": 3.384675634627721e-06, + "loss": 0.6089, + "step": 1611 + }, + { + "epoch": 1.9334133173365327, + "grad_norm": 0.63275711529377, + "learning_rate": 3.3780673184277547e-06, + "loss": 0.6663, + "step": 1612 + }, + { + "epoch": 1.9346130773845231, + "grad_norm": 0.8138389230087799, + "learning_rate": 3.3714621670811236e-06, + "loss": 0.651, + "step": 1613 + }, + { + "epoch": 1.9358128374325134, + "grad_norm": 0.6481188306140845, + "learning_rate": 3.364860193476359e-06, + "loss": 0.6578, + "step": 1614 + }, + { + "epoch": 1.937012597480504, + "grad_norm": 0.5926771545560223, + "learning_rate": 3.358261410495799e-06, + "loss": 0.5994, + "step": 1615 + }, + { + "epoch": 1.9382123575284944, + "grad_norm": 0.5468743795966091, + "learning_rate": 3.3516658310155493e-06, + "loss": 0.6623, + "step": 1616 + }, + { + "epoch": 1.9394121175764847, + "grad_norm": 0.8235416182606894, + "learning_rate": 3.3450734679054663e-06, + "loss": 0.5928, + "step": 1617 + }, + { + "epoch": 1.940611877624475, + "grad_norm": 0.7211676880287002, + "learning_rate": 3.3384843340291308e-06, + "loss": 0.6084, + "step": 1618 + }, + { + "epoch": 1.9418116376724655, + "grad_norm": 0.6389688609459044, + "learning_rate": 3.331898442243826e-06, + "loss": 0.5666, + "step": 1619 + }, + { + "epoch": 1.943011397720456, + "grad_norm": 0.659744054555252, + "learning_rate": 3.3253158054005015e-06, + "loss": 0.6762, + "step": 1620 + }, + { + "epoch": 1.9442111577684464, + "grad_norm": 0.7401291382849171, + "learning_rate": 3.3187364363437604e-06, + "loss": 0.599, + "step": 1621 + }, + { + "epoch": 1.9454109178164367, + "grad_norm": 0.5857916376908441, + "learning_rate": 3.3121603479118315e-06, + "loss": 0.625, + "step": 1622 + }, + { + "epoch": 1.946610677864427, + "grad_norm": 0.6566329837613745, + "learning_rate": 3.3055875529365346e-06, + "loss": 0.613, + "step": 1623 + }, + { + "epoch": 1.9478104379124175, + "grad_norm": 0.4807287603523042, + "learning_rate": 3.2990180642432712e-06, + "loss": 0.6251, + "step": 1624 + }, + { + "epoch": 1.949010197960408, + "grad_norm": 0.6483447443970377, + "learning_rate": 3.292451894650986e-06, + "loss": 0.6355, + "step": 1625 + }, + { + "epoch": 1.9502099580083985, + "grad_norm": 0.6429062695240559, + "learning_rate": 3.285889056972148e-06, + "loss": 0.6118, + "step": 1626 + }, + { + "epoch": 1.9514097180563887, + "grad_norm": 0.7961405043126901, + "learning_rate": 3.279329564012731e-06, + "loss": 0.5941, + "step": 1627 + }, + { + "epoch": 1.952609478104379, + "grad_norm": 0.6899134331766983, + "learning_rate": 3.272773428572169e-06, + "loss": 0.6666, + "step": 1628 + }, + { + "epoch": 1.9538092381523695, + "grad_norm": 0.5947428608906596, + "learning_rate": 3.266220663443358e-06, + "loss": 0.6597, + "step": 1629 + }, + { + "epoch": 1.95500899820036, + "grad_norm": 0.8022747878699585, + "learning_rate": 3.259671281412612e-06, + "loss": 0.6407, + "step": 1630 + }, + { + "epoch": 1.9562087582483503, + "grad_norm": 0.6435528226556113, + "learning_rate": 3.253125295259639e-06, + "loss": 0.6103, + "step": 1631 + }, + { + "epoch": 1.9574085182963408, + "grad_norm": 0.6160566369973391, + "learning_rate": 3.2465827177575302e-06, + "loss": 0.7168, + "step": 1632 + }, + { + "epoch": 1.958608278344331, + "grad_norm": 0.6646541607513289, + "learning_rate": 3.2400435616727198e-06, + "loss": 0.6397, + "step": 1633 + }, + { + "epoch": 1.9598080383923215, + "grad_norm": 0.6301402280538672, + "learning_rate": 3.2335078397649646e-06, + "loss": 0.5901, + "step": 1634 + }, + { + "epoch": 1.961007798440312, + "grad_norm": 0.7735228178846464, + "learning_rate": 3.226975564787322e-06, + "loss": 0.6473, + "step": 1635 + }, + { + "epoch": 1.9622075584883023, + "grad_norm": 0.5614535765622817, + "learning_rate": 3.220446749486128e-06, + "loss": 0.6503, + "step": 1636 + }, + { + "epoch": 1.9634073185362926, + "grad_norm": 0.5106206875601105, + "learning_rate": 3.2139214066009595e-06, + "loss": 0.7028, + "step": 1637 + }, + { + "epoch": 1.964607078584283, + "grad_norm": 0.7156699211997962, + "learning_rate": 3.2073995488646225e-06, + "loss": 0.6614, + "step": 1638 + }, + { + "epoch": 1.9658068386322736, + "grad_norm": 0.642822707358646, + "learning_rate": 3.200881189003127e-06, + "loss": 0.6372, + "step": 1639 + }, + { + "epoch": 1.967006598680264, + "grad_norm": 0.655597372390054, + "learning_rate": 3.1943663397356437e-06, + "loss": 0.6272, + "step": 1640 + }, + { + "epoch": 1.9682063587282543, + "grad_norm": 0.7004155570951615, + "learning_rate": 3.187855013774508e-06, + "loss": 0.625, + "step": 1641 + }, + { + "epoch": 1.9694061187762446, + "grad_norm": 0.5392430860345746, + "learning_rate": 3.1813472238251742e-06, + "loss": 0.6249, + "step": 1642 + }, + { + "epoch": 1.970605878824235, + "grad_norm": 0.69825795555644, + "learning_rate": 3.1748429825861937e-06, + "loss": 0.659, + "step": 1643 + }, + { + "epoch": 1.9718056388722256, + "grad_norm": 0.5836952294843116, + "learning_rate": 3.1683423027491987e-06, + "loss": 0.6314, + "step": 1644 + }, + { + "epoch": 1.973005398920216, + "grad_norm": 0.7106771954157693, + "learning_rate": 3.1618451969988716e-06, + "loss": 0.6216, + "step": 1645 + }, + { + "epoch": 1.9742051589682064, + "grad_norm": 0.6079799375663077, + "learning_rate": 3.1553516780129157e-06, + "loss": 0.6255, + "step": 1646 + }, + { + "epoch": 1.9754049190161966, + "grad_norm": 0.6533033933369583, + "learning_rate": 3.1488617584620396e-06, + "loss": 0.6016, + "step": 1647 + }, + { + "epoch": 1.9766046790641871, + "grad_norm": 0.647873061009936, + "learning_rate": 3.1423754510099304e-06, + "loss": 0.6823, + "step": 1648 + }, + { + "epoch": 1.9778044391121776, + "grad_norm": 0.6100569670661736, + "learning_rate": 3.13589276831322e-06, + "loss": 0.6436, + "step": 1649 + }, + { + "epoch": 1.9790041991601681, + "grad_norm": 0.7053313473498782, + "learning_rate": 3.129413723021474e-06, + "loss": 0.6495, + "step": 1650 + }, + { + "epoch": 1.9802039592081584, + "grad_norm": 0.6057256781443613, + "learning_rate": 3.1229383277771595e-06, + "loss": 0.6333, + "step": 1651 + }, + { + "epoch": 1.9814037192561487, + "grad_norm": 0.6372818155791644, + "learning_rate": 3.116466595215617e-06, + "loss": 0.6111, + "step": 1652 + }, + { + "epoch": 1.9826034793041392, + "grad_norm": 0.7174431458091668, + "learning_rate": 3.1099985379650454e-06, + "loss": 0.6921, + "step": 1653 + }, + { + "epoch": 1.9838032393521297, + "grad_norm": 0.7915955539043774, + "learning_rate": 3.1035341686464664e-06, + "loss": 0.6267, + "step": 1654 + }, + { + "epoch": 1.98500299940012, + "grad_norm": 0.6050217020619578, + "learning_rate": 3.0970734998737095e-06, + "loss": 0.6578, + "step": 1655 + }, + { + "epoch": 1.9862027594481104, + "grad_norm": 0.7268494061378847, + "learning_rate": 3.090616544253385e-06, + "loss": 0.5408, + "step": 1656 + }, + { + "epoch": 1.9874025194961007, + "grad_norm": 0.6127371326001422, + "learning_rate": 3.084163314384852e-06, + "loss": 0.6581, + "step": 1657 + }, + { + "epoch": 1.9886022795440912, + "grad_norm": 0.6323484798333604, + "learning_rate": 3.0777138228602045e-06, + "loss": 0.6357, + "step": 1658 + }, + { + "epoch": 1.9898020395920817, + "grad_norm": 0.7300351753850884, + "learning_rate": 3.071268082264241e-06, + "loss": 0.6547, + "step": 1659 + }, + { + "epoch": 1.991001799640072, + "grad_norm": 0.41278449269264283, + "learning_rate": 3.0648261051744367e-06, + "loss": 0.6348, + "step": 1660 + }, + { + "epoch": 1.9922015596880622, + "grad_norm": 0.6580197911643658, + "learning_rate": 3.05838790416093e-06, + "loss": 0.6357, + "step": 1661 + }, + { + "epoch": 1.9934013197360527, + "grad_norm": 0.813218363737704, + "learning_rate": 3.051953491786487e-06, + "loss": 0.5861, + "step": 1662 + }, + { + "epoch": 1.9946010797840432, + "grad_norm": 0.5836776878674278, + "learning_rate": 3.04552288060648e-06, + "loss": 0.6315, + "step": 1663 + }, + { + "epoch": 1.9958008398320337, + "grad_norm": 0.9820496254307773, + "learning_rate": 3.0390960831688676e-06, + "loss": 0.6234, + "step": 1664 + }, + { + "epoch": 1.997000599880024, + "grad_norm": 0.6584441414557831, + "learning_rate": 3.0326731120141663e-06, + "loss": 0.5642, + "step": 1665 + }, + { + "epoch": 1.9982003599280143, + "grad_norm": 0.4613016565652139, + "learning_rate": 3.026253979675421e-06, + "loss": 0.6433, + "step": 1666 + }, + { + "epoch": 1.9994001199760048, + "grad_norm": 0.5993227231828745, + "learning_rate": 3.019838698678191e-06, + "loss": 0.6251, + "step": 1667 + }, + { + "epoch": 2.0, + "grad_norm": 1.1286287068986351, + "learning_rate": 3.013427281540523e-06, + "loss": 0.6533, + "step": 1668 + }, + { + "epoch": 2.0011997600479905, + "grad_norm": 0.6553687343168106, + "learning_rate": 3.0070197407729174e-06, + "loss": 0.546, + "step": 1669 + }, + { + "epoch": 2.002399520095981, + "grad_norm": 0.7232483187926956, + "learning_rate": 3.000616088878315e-06, + "loss": 0.5026, + "step": 1670 + }, + { + "epoch": 2.003599280143971, + "grad_norm": 0.635507837834464, + "learning_rate": 2.9942163383520682e-06, + "loss": 0.5566, + "step": 1671 + }, + { + "epoch": 2.0047990401919615, + "grad_norm": 0.586070052604526, + "learning_rate": 2.9878205016819133e-06, + "loss": 0.5692, + "step": 1672 + }, + { + "epoch": 2.005998800239952, + "grad_norm": 0.5744491184365953, + "learning_rate": 2.981428591347955e-06, + "loss": 0.6042, + "step": 1673 + }, + { + "epoch": 2.0071985602879425, + "grad_norm": 0.5533414219666576, + "learning_rate": 2.9750406198226333e-06, + "loss": 0.5313, + "step": 1674 + }, + { + "epoch": 2.008398320335933, + "grad_norm": 0.527974323446745, + "learning_rate": 2.9686565995707006e-06, + "loss": 0.5716, + "step": 1675 + }, + { + "epoch": 2.009598080383923, + "grad_norm": 0.6581060126537326, + "learning_rate": 2.962276543049204e-06, + "loss": 0.5912, + "step": 1676 + }, + { + "epoch": 2.0107978404319136, + "grad_norm": 0.6284377755426946, + "learning_rate": 2.9559004627074535e-06, + "loss": 0.6311, + "step": 1677 + }, + { + "epoch": 2.011997600479904, + "grad_norm": 0.6136973438415546, + "learning_rate": 2.9495283709869993e-06, + "loss": 0.563, + "step": 1678 + }, + { + "epoch": 2.0131973605278946, + "grad_norm": 0.7255219368682616, + "learning_rate": 2.94316028032161e-06, + "loss": 0.5456, + "step": 1679 + }, + { + "epoch": 2.0143971205758846, + "grad_norm": 0.6531829585887703, + "learning_rate": 2.936796203137249e-06, + "loss": 0.607, + "step": 1680 + }, + { + "epoch": 2.015596880623875, + "grad_norm": 0.5763977931898144, + "learning_rate": 2.9304361518520447e-06, + "loss": 0.5683, + "step": 1681 + }, + { + "epoch": 2.0167966406718656, + "grad_norm": 0.5910528949546794, + "learning_rate": 2.9240801388762707e-06, + "loss": 0.5357, + "step": 1682 + }, + { + "epoch": 2.017996400719856, + "grad_norm": 0.5898805007247918, + "learning_rate": 2.9177281766123254e-06, + "loss": 0.4886, + "step": 1683 + }, + { + "epoch": 2.0191961607678466, + "grad_norm": 0.5960173782729209, + "learning_rate": 2.911380277454695e-06, + "loss": 0.5925, + "step": 1684 + }, + { + "epoch": 2.0203959208158366, + "grad_norm": 0.6167640235101897, + "learning_rate": 2.905036453789942e-06, + "loss": 0.5941, + "step": 1685 + }, + { + "epoch": 2.021595680863827, + "grad_norm": 0.6138548143862488, + "learning_rate": 2.898696717996678e-06, + "loss": 0.5909, + "step": 1686 + }, + { + "epoch": 2.0227954409118176, + "grad_norm": 0.5839889378518093, + "learning_rate": 2.892361082445534e-06, + "loss": 0.516, + "step": 1687 + }, + { + "epoch": 2.023995200959808, + "grad_norm": 0.7247860919816828, + "learning_rate": 2.886029559499144e-06, + "loss": 0.5563, + "step": 1688 + }, + { + "epoch": 2.0251949610077986, + "grad_norm": 0.7298128666554536, + "learning_rate": 2.8797021615121134e-06, + "loss": 0.5459, + "step": 1689 + }, + { + "epoch": 2.0263947210557887, + "grad_norm": 0.7316860323201141, + "learning_rate": 2.8733789008310033e-06, + "loss": 0.5754, + "step": 1690 + }, + { + "epoch": 2.027594481103779, + "grad_norm": 0.6870139028142205, + "learning_rate": 2.8670597897942964e-06, + "loss": 0.5671, + "step": 1691 + }, + { + "epoch": 2.0287942411517697, + "grad_norm": 0.5965131558498986, + "learning_rate": 2.8607448407323824e-06, + "loss": 0.5275, + "step": 1692 + }, + { + "epoch": 2.02999400119976, + "grad_norm": 0.6436418134198707, + "learning_rate": 2.854434065967528e-06, + "loss": 0.5991, + "step": 1693 + }, + { + "epoch": 2.0311937612477506, + "grad_norm": 0.7620520831649701, + "learning_rate": 2.8481274778138567e-06, + "loss": 0.5569, + "step": 1694 + }, + { + "epoch": 2.0323935212957407, + "grad_norm": 0.620553773544572, + "learning_rate": 2.8418250885773156e-06, + "loss": 0.6529, + "step": 1695 + }, + { + "epoch": 2.033593281343731, + "grad_norm": 0.5900016668327399, + "learning_rate": 2.8355269105556695e-06, + "loss": 0.5883, + "step": 1696 + }, + { + "epoch": 2.0347930413917217, + "grad_norm": 0.5751741742801739, + "learning_rate": 2.82923295603846e-06, + "loss": 0.54, + "step": 1697 + }, + { + "epoch": 2.035992801439712, + "grad_norm": 0.7322012852297586, + "learning_rate": 2.822943237306983e-06, + "loss": 0.5295, + "step": 1698 + }, + { + "epoch": 2.0371925614877027, + "grad_norm": 0.6465495010276673, + "learning_rate": 2.8166577666342786e-06, + "loss": 0.5136, + "step": 1699 + }, + { + "epoch": 2.0383923215356927, + "grad_norm": 0.5951509571107764, + "learning_rate": 2.8103765562850942e-06, + "loss": 0.5782, + "step": 1700 + }, + { + "epoch": 2.039592081583683, + "grad_norm": 0.7817384358122911, + "learning_rate": 2.804099618515858e-06, + "loss": 0.5821, + "step": 1701 + }, + { + "epoch": 2.0407918416316737, + "grad_norm": 0.6031361130877024, + "learning_rate": 2.797826965574667e-06, + "loss": 0.5593, + "step": 1702 + }, + { + "epoch": 2.041991601679664, + "grad_norm": 0.5957045599311164, + "learning_rate": 2.7915586097012613e-06, + "loss": 0.5458, + "step": 1703 + }, + { + "epoch": 2.0431913617276543, + "grad_norm": 0.6360695872826579, + "learning_rate": 2.785294563126987e-06, + "loss": 0.5656, + "step": 1704 + }, + { + "epoch": 2.0443911217756447, + "grad_norm": 0.5655789747161657, + "learning_rate": 2.7790348380747832e-06, + "loss": 0.5162, + "step": 1705 + }, + { + "epoch": 2.0455908818236352, + "grad_norm": 0.6266463387114029, + "learning_rate": 2.7727794467591682e-06, + "loss": 0.5836, + "step": 1706 + }, + { + "epoch": 2.0467906418716257, + "grad_norm": 0.6397476354687799, + "learning_rate": 2.766528401386187e-06, + "loss": 0.5469, + "step": 1707 + }, + { + "epoch": 2.0479904019196162, + "grad_norm": 0.6424712078878715, + "learning_rate": 2.760281714153414e-06, + "loss": 0.6244, + "step": 1708 + }, + { + "epoch": 2.0491901619676063, + "grad_norm": 0.6454459575829394, + "learning_rate": 2.7540393972499203e-06, + "loss": 0.5179, + "step": 1709 + }, + { + "epoch": 2.0503899220155968, + "grad_norm": 0.6701297156785533, + "learning_rate": 2.747801462856244e-06, + "loss": 0.5488, + "step": 1710 + }, + { + "epoch": 2.0515896820635873, + "grad_norm": 0.6389965775957791, + "learning_rate": 2.741567923144376e-06, + "loss": 0.5138, + "step": 1711 + }, + { + "epoch": 2.0527894421115778, + "grad_norm": 0.63361030302146, + "learning_rate": 2.7353387902777304e-06, + "loss": 0.5385, + "step": 1712 + }, + { + "epoch": 2.0539892021595683, + "grad_norm": 0.6214392044652081, + "learning_rate": 2.7291140764111223e-06, + "loss": 0.6765, + "step": 1713 + }, + { + "epoch": 2.0551889622075583, + "grad_norm": 0.6284153913170616, + "learning_rate": 2.722893793690744e-06, + "loss": 0.6256, + "step": 1714 + }, + { + "epoch": 2.056388722255549, + "grad_norm": 0.6045362882434572, + "learning_rate": 2.716677954254141e-06, + "loss": 0.5945, + "step": 1715 + }, + { + "epoch": 2.0575884823035393, + "grad_norm": 0.7832123459310987, + "learning_rate": 2.7104665702301895e-06, + "loss": 0.5517, + "step": 1716 + }, + { + "epoch": 2.05878824235153, + "grad_norm": 0.599016337999235, + "learning_rate": 2.704259653739074e-06, + "loss": 0.6268, + "step": 1717 + }, + { + "epoch": 2.0599880023995203, + "grad_norm": 0.594891933671642, + "learning_rate": 2.6980572168922555e-06, + "loss": 0.5233, + "step": 1718 + }, + { + "epoch": 2.0611877624475103, + "grad_norm": 0.6773121292151333, + "learning_rate": 2.6918592717924563e-06, + "loss": 0.5777, + "step": 1719 + }, + { + "epoch": 2.062387522495501, + "grad_norm": 0.7031827325029097, + "learning_rate": 2.685665830533642e-06, + "loss": 0.5999, + "step": 1720 + }, + { + "epoch": 2.0635872825434913, + "grad_norm": 0.6371739899003336, + "learning_rate": 2.679476905200977e-06, + "loss": 0.5485, + "step": 1721 + }, + { + "epoch": 2.064787042591482, + "grad_norm": 0.6566325322148608, + "learning_rate": 2.6732925078708205e-06, + "loss": 0.5708, + "step": 1722 + }, + { + "epoch": 2.0659868026394723, + "grad_norm": 0.5948043795465717, + "learning_rate": 2.667112650610702e-06, + "loss": 0.6146, + "step": 1723 + }, + { + "epoch": 2.0671865626874624, + "grad_norm": 0.5999379851132288, + "learning_rate": 2.6609373454792797e-06, + "loss": 0.588, + "step": 1724 + }, + { + "epoch": 2.068386322735453, + "grad_norm": 0.6423600146443728, + "learning_rate": 2.6547666045263365e-06, + "loss": 0.5946, + "step": 1725 + }, + { + "epoch": 2.0695860827834434, + "grad_norm": 0.5936111859457325, + "learning_rate": 2.64860043979275e-06, + "loss": 0.5742, + "step": 1726 + }, + { + "epoch": 2.070785842831434, + "grad_norm": 0.6003496127801714, + "learning_rate": 2.6424388633104646e-06, + "loss": 0.6131, + "step": 1727 + }, + { + "epoch": 2.071985602879424, + "grad_norm": 0.5663334196056212, + "learning_rate": 2.6362818871024754e-06, + "loss": 0.6251, + "step": 1728 + }, + { + "epoch": 2.0731853629274144, + "grad_norm": 0.5997400925623815, + "learning_rate": 2.6301295231827983e-06, + "loss": 0.5446, + "step": 1729 + }, + { + "epoch": 2.074385122975405, + "grad_norm": 0.6463961773967529, + "learning_rate": 2.6239817835564514e-06, + "loss": 0.5936, + "step": 1730 + }, + { + "epoch": 2.0755848830233954, + "grad_norm": 0.6755373596792922, + "learning_rate": 2.6178386802194267e-06, + "loss": 0.5836, + "step": 1731 + }, + { + "epoch": 2.076784643071386, + "grad_norm": 0.6141145857607571, + "learning_rate": 2.611700225158673e-06, + "loss": 0.5602, + "step": 1732 + }, + { + "epoch": 2.077984403119376, + "grad_norm": 0.6293863917640159, + "learning_rate": 2.6055664303520655e-06, + "loss": 0.586, + "step": 1733 + }, + { + "epoch": 2.0791841631673664, + "grad_norm": 0.6042059842491126, + "learning_rate": 2.599437307768389e-06, + "loss": 0.5728, + "step": 1734 + }, + { + "epoch": 2.080383923215357, + "grad_norm": 0.6425208877547268, + "learning_rate": 2.5933128693673093e-06, + "loss": 0.5447, + "step": 1735 + }, + { + "epoch": 2.0815836832633474, + "grad_norm": 0.6252508061699878, + "learning_rate": 2.5871931270993533e-06, + "loss": 0.5833, + "step": 1736 + }, + { + "epoch": 2.082783443311338, + "grad_norm": 0.5753613975248165, + "learning_rate": 2.581078092905883e-06, + "loss": 0.5572, + "step": 1737 + }, + { + "epoch": 2.083983203359328, + "grad_norm": 0.6190082489851423, + "learning_rate": 2.5749677787190773e-06, + "loss": 0.5895, + "step": 1738 + }, + { + "epoch": 2.0851829634073185, + "grad_norm": 0.6483439527059782, + "learning_rate": 2.568862196461896e-06, + "loss": 0.5809, + "step": 1739 + }, + { + "epoch": 2.086382723455309, + "grad_norm": 0.6049753604492797, + "learning_rate": 2.5627613580480777e-06, + "loss": 0.5745, + "step": 1740 + }, + { + "epoch": 2.0875824835032994, + "grad_norm": 0.6019511442996845, + "learning_rate": 2.5566652753820986e-06, + "loss": 0.5828, + "step": 1741 + }, + { + "epoch": 2.08878224355129, + "grad_norm": 0.5637693698154113, + "learning_rate": 2.5505739603591485e-06, + "loss": 0.5205, + "step": 1742 + }, + { + "epoch": 2.08998200359928, + "grad_norm": 0.5664060694906904, + "learning_rate": 2.544487424865127e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 2.0911817636472705, + "grad_norm": 0.7064759661824541, + "learning_rate": 2.5384056807766013e-06, + "loss": 0.5034, + "step": 1744 + }, + { + "epoch": 2.092381523695261, + "grad_norm": 0.6981691256033558, + "learning_rate": 2.5323287399607844e-06, + "loss": 0.6892, + "step": 1745 + }, + { + "epoch": 2.0935812837432515, + "grad_norm": 0.7045947459947126, + "learning_rate": 2.526256614275524e-06, + "loss": 0.6233, + "step": 1746 + }, + { + "epoch": 2.094781043791242, + "grad_norm": 0.648809316236553, + "learning_rate": 2.5201893155692687e-06, + "loss": 0.6659, + "step": 1747 + }, + { + "epoch": 2.095980803839232, + "grad_norm": 0.662631188788291, + "learning_rate": 2.5141268556810495e-06, + "loss": 0.5075, + "step": 1748 + }, + { + "epoch": 2.0971805638872225, + "grad_norm": 0.6193505883910716, + "learning_rate": 2.5080692464404557e-06, + "loss": 0.5616, + "step": 1749 + }, + { + "epoch": 2.098380323935213, + "grad_norm": 0.7122822339221859, + "learning_rate": 2.5020164996676087e-06, + "loss": 0.5347, + "step": 1750 + }, + { + "epoch": 2.0995800839832035, + "grad_norm": 0.5625146971918583, + "learning_rate": 2.4959686271731472e-06, + "loss": 0.5681, + "step": 1751 + }, + { + "epoch": 2.1007798440311936, + "grad_norm": 0.6549716986169428, + "learning_rate": 2.489925640758193e-06, + "loss": 0.5175, + "step": 1752 + }, + { + "epoch": 2.101979604079184, + "grad_norm": 0.577542516177496, + "learning_rate": 2.483887552214338e-06, + "loss": 0.6132, + "step": 1753 + }, + { + "epoch": 2.1031793641271745, + "grad_norm": 0.6344390216058583, + "learning_rate": 2.4778543733236144e-06, + "loss": 0.5836, + "step": 1754 + }, + { + "epoch": 2.104379124175165, + "grad_norm": 0.6597128044852273, + "learning_rate": 2.471826115858478e-06, + "loss": 0.5651, + "step": 1755 + }, + { + "epoch": 2.1055788842231555, + "grad_norm": 0.6004318635958817, + "learning_rate": 2.4658027915817717e-06, + "loss": 0.562, + "step": 1756 + }, + { + "epoch": 2.1067786442711456, + "grad_norm": 0.6826868530541879, + "learning_rate": 2.4597844122467267e-06, + "loss": 0.5516, + "step": 1757 + }, + { + "epoch": 2.107978404319136, + "grad_norm": 0.6731413540794284, + "learning_rate": 2.453770989596917e-06, + "loss": 0.5182, + "step": 1758 + }, + { + "epoch": 2.1091781643671266, + "grad_norm": 0.6275693290935841, + "learning_rate": 2.44776253536624e-06, + "loss": 0.5585, + "step": 1759 + }, + { + "epoch": 2.110377924415117, + "grad_norm": 0.5874746966605109, + "learning_rate": 2.4417590612789094e-06, + "loss": 0.5465, + "step": 1760 + }, + { + "epoch": 2.1115776844631076, + "grad_norm": 0.5974898845267843, + "learning_rate": 2.4357605790494158e-06, + "loss": 0.6207, + "step": 1761 + }, + { + "epoch": 2.1127774445110976, + "grad_norm": 0.6061825734549284, + "learning_rate": 2.4297671003825058e-06, + "loss": 0.4986, + "step": 1762 + }, + { + "epoch": 2.113977204559088, + "grad_norm": 0.6439695766776677, + "learning_rate": 2.4237786369731655e-06, + "loss": 0.5817, + "step": 1763 + }, + { + "epoch": 2.1151769646070786, + "grad_norm": 0.6447339343313097, + "learning_rate": 2.4177952005066006e-06, + "loss": 0.5764, + "step": 1764 + }, + { + "epoch": 2.116376724655069, + "grad_norm": 0.6139730437589196, + "learning_rate": 2.4118168026581976e-06, + "loss": 0.5558, + "step": 1765 + }, + { + "epoch": 2.1175764847030596, + "grad_norm": 0.5680150493815513, + "learning_rate": 2.4058434550935163e-06, + "loss": 0.6039, + "step": 1766 + }, + { + "epoch": 2.1187762447510496, + "grad_norm": 0.6345243664025577, + "learning_rate": 2.3998751694682666e-06, + "loss": 0.5321, + "step": 1767 + }, + { + "epoch": 2.11997600479904, + "grad_norm": 0.764950759999074, + "learning_rate": 2.3939119574282706e-06, + "loss": 0.5613, + "step": 1768 + }, + { + "epoch": 2.1211757648470306, + "grad_norm": 0.6816883523166991, + "learning_rate": 2.3879538306094587e-06, + "loss": 0.5428, + "step": 1769 + }, + { + "epoch": 2.122375524895021, + "grad_norm": 0.6669704645020077, + "learning_rate": 2.3820008006378355e-06, + "loss": 0.6003, + "step": 1770 + }, + { + "epoch": 2.123575284943011, + "grad_norm": 0.6234045791660564, + "learning_rate": 2.37605287912946e-06, + "loss": 0.56, + "step": 1771 + }, + { + "epoch": 2.1247750449910017, + "grad_norm": 0.5950638939401581, + "learning_rate": 2.370110077690425e-06, + "loss": 0.5683, + "step": 1772 + }, + { + "epoch": 2.125974805038992, + "grad_norm": 0.6512885551567529, + "learning_rate": 2.3641724079168303e-06, + "loss": 0.5468, + "step": 1773 + }, + { + "epoch": 2.1271745650869827, + "grad_norm": 0.5747798498609408, + "learning_rate": 2.3582398813947642e-06, + "loss": 0.5684, + "step": 1774 + }, + { + "epoch": 2.128374325134973, + "grad_norm": 0.6611106061274544, + "learning_rate": 2.352312509700278e-06, + "loss": 0.6094, + "step": 1775 + }, + { + "epoch": 2.129574085182963, + "grad_norm": 0.6917106334768783, + "learning_rate": 2.3463903043993643e-06, + "loss": 0.5543, + "step": 1776 + }, + { + "epoch": 2.1307738452309537, + "grad_norm": 0.6511261294062615, + "learning_rate": 2.3404732770479355e-06, + "loss": 0.5143, + "step": 1777 + }, + { + "epoch": 2.131973605278944, + "grad_norm": 0.5835121301193615, + "learning_rate": 2.3345614391918013e-06, + "loss": 0.5327, + "step": 1778 + }, + { + "epoch": 2.1331733653269347, + "grad_norm": 0.5995561327066398, + "learning_rate": 2.328654802366641e-06, + "loss": 0.549, + "step": 1779 + }, + { + "epoch": 2.134373125374925, + "grad_norm": 0.6270294499218602, + "learning_rate": 2.3227533780979866e-06, + "loss": 0.5662, + "step": 1780 + }, + { + "epoch": 2.1355728854229152, + "grad_norm": 0.6066283235428948, + "learning_rate": 2.3168571779012073e-06, + "loss": 0.5755, + "step": 1781 + }, + { + "epoch": 2.1367726454709057, + "grad_norm": 0.6018231343746554, + "learning_rate": 2.310966213281465e-06, + "loss": 0.5028, + "step": 1782 + }, + { + "epoch": 2.1379724055188962, + "grad_norm": 0.6771968625455023, + "learning_rate": 2.305080495733712e-06, + "loss": 0.6507, + "step": 1783 + }, + { + "epoch": 2.1391721655668867, + "grad_norm": 0.5914151795227278, + "learning_rate": 2.299200036742668e-06, + "loss": 0.5812, + "step": 1784 + }, + { + "epoch": 2.140371925614877, + "grad_norm": 0.6638265537528982, + "learning_rate": 2.2933248477827814e-06, + "loss": 0.5296, + "step": 1785 + }, + { + "epoch": 2.1415716856628673, + "grad_norm": 0.6743505439103027, + "learning_rate": 2.2874549403182224e-06, + "loss": 0.6193, + "step": 1786 + }, + { + "epoch": 2.1427714457108578, + "grad_norm": 0.6093655720042745, + "learning_rate": 2.2815903258028554e-06, + "loss": 0.5518, + "step": 1787 + }, + { + "epoch": 2.1439712057588483, + "grad_norm": 0.6607671944638438, + "learning_rate": 2.2757310156802158e-06, + "loss": 0.5959, + "step": 1788 + }, + { + "epoch": 2.1451709658068387, + "grad_norm": 0.6864152138905839, + "learning_rate": 2.2698770213834904e-06, + "loss": 0.4978, + "step": 1789 + }, + { + "epoch": 2.1463707258548292, + "grad_norm": 0.6756950013938815, + "learning_rate": 2.2640283543354907e-06, + "loss": 0.5446, + "step": 1790 + }, + { + "epoch": 2.1475704859028193, + "grad_norm": 0.6961267024749692, + "learning_rate": 2.258185025948635e-06, + "loss": 0.4971, + "step": 1791 + }, + { + "epoch": 2.14877024595081, + "grad_norm": 0.5991236172755423, + "learning_rate": 2.2523470476249232e-06, + "loss": 0.5322, + "step": 1792 + }, + { + "epoch": 2.1499700059988003, + "grad_norm": 0.7345552094596932, + "learning_rate": 2.2465144307559172e-06, + "loss": 0.5695, + "step": 1793 + }, + { + "epoch": 2.1511697660467908, + "grad_norm": 0.6706955037760028, + "learning_rate": 2.2406871867227163e-06, + "loss": 0.5616, + "step": 1794 + }, + { + "epoch": 2.1523695260947813, + "grad_norm": 0.5887206055969618, + "learning_rate": 2.2348653268959353e-06, + "loss": 0.511, + "step": 1795 + }, + { + "epoch": 2.1535692861427713, + "grad_norm": 0.7224310469698312, + "learning_rate": 2.2290488626356836e-06, + "loss": 0.5691, + "step": 1796 + }, + { + "epoch": 2.154769046190762, + "grad_norm": 0.7170794492801699, + "learning_rate": 2.2232378052915428e-06, + "loss": 0.5402, + "step": 1797 + }, + { + "epoch": 2.1559688062387523, + "grad_norm": 0.6126640435973594, + "learning_rate": 2.2174321662025427e-06, + "loss": 0.6472, + "step": 1798 + }, + { + "epoch": 2.157168566286743, + "grad_norm": 0.5886685204838491, + "learning_rate": 2.2116319566971433e-06, + "loss": 0.5853, + "step": 1799 + }, + { + "epoch": 2.158368326334733, + "grad_norm": 0.7093491324672906, + "learning_rate": 2.2058371880932028e-06, + "loss": 0.5987, + "step": 1800 + }, + { + "epoch": 2.1595680863827234, + "grad_norm": 0.5759850720608728, + "learning_rate": 2.2000478716979727e-06, + "loss": 0.608, + "step": 1801 + }, + { + "epoch": 2.160767846430714, + "grad_norm": 0.6527246415338012, + "learning_rate": 2.194264018808061e-06, + "loss": 0.6342, + "step": 1802 + }, + { + "epoch": 2.1619676064787043, + "grad_norm": 0.9156170459502739, + "learning_rate": 2.188485640709411e-06, + "loss": 0.5886, + "step": 1803 + }, + { + "epoch": 2.163167366526695, + "grad_norm": 0.5950925395063658, + "learning_rate": 2.182712748677287e-06, + "loss": 0.537, + "step": 1804 + }, + { + "epoch": 2.164367126574685, + "grad_norm": 0.6694812889645247, + "learning_rate": 2.176945353976253e-06, + "loss": 0.5663, + "step": 1805 + }, + { + "epoch": 2.1655668866226754, + "grad_norm": 0.5956168088449492, + "learning_rate": 2.171183467860136e-06, + "loss": 0.569, + "step": 1806 + }, + { + "epoch": 2.166766646670666, + "grad_norm": 0.6008079381234008, + "learning_rate": 2.1654271015720213e-06, + "loss": 0.5288, + "step": 1807 + }, + { + "epoch": 2.1679664067186564, + "grad_norm": 0.702051221306401, + "learning_rate": 2.159676266344222e-06, + "loss": 0.5867, + "step": 1808 + }, + { + "epoch": 2.169166166766647, + "grad_norm": 0.6182868356474166, + "learning_rate": 2.1539309733982574e-06, + "loss": 0.5342, + "step": 1809 + }, + { + "epoch": 2.170365926814637, + "grad_norm": 0.5258453454474016, + "learning_rate": 2.148191233944833e-06, + "loss": 0.5732, + "step": 1810 + }, + { + "epoch": 2.1715656868626274, + "grad_norm": 0.6285807905161801, + "learning_rate": 2.1424570591838184e-06, + "loss": 0.5281, + "step": 1811 + }, + { + "epoch": 2.172765446910618, + "grad_norm": 0.7210098999907698, + "learning_rate": 2.136728460304224e-06, + "loss": 0.5957, + "step": 1812 + }, + { + "epoch": 2.1739652069586084, + "grad_norm": 0.6136648287223856, + "learning_rate": 2.1310054484841804e-06, + "loss": 0.5206, + "step": 1813 + }, + { + "epoch": 2.175164967006599, + "grad_norm": 0.5721710134462407, + "learning_rate": 2.1252880348909154e-06, + "loss": 0.5455, + "step": 1814 + }, + { + "epoch": 2.176364727054589, + "grad_norm": 0.5829787754472344, + "learning_rate": 2.1195762306807354e-06, + "loss": 0.5988, + "step": 1815 + }, + { + "epoch": 2.1775644871025794, + "grad_norm": 0.6073107932285269, + "learning_rate": 2.113870046999e-06, + "loss": 0.5854, + "step": 1816 + }, + { + "epoch": 2.17876424715057, + "grad_norm": 0.6634043222962149, + "learning_rate": 2.1081694949800972e-06, + "loss": 0.5948, + "step": 1817 + }, + { + "epoch": 2.1799640071985604, + "grad_norm": 0.5952194802441421, + "learning_rate": 2.102474585747435e-06, + "loss": 0.566, + "step": 1818 + }, + { + "epoch": 2.1811637672465505, + "grad_norm": 0.6401399556311126, + "learning_rate": 2.096785330413406e-06, + "loss": 0.6252, + "step": 1819 + }, + { + "epoch": 2.182363527294541, + "grad_norm": 0.5965514929018709, + "learning_rate": 2.091101740079364e-06, + "loss": 0.5862, + "step": 1820 + }, + { + "epoch": 2.1835632873425315, + "grad_norm": 0.6449129682843583, + "learning_rate": 2.085423825835622e-06, + "loss": 0.5684, + "step": 1821 + }, + { + "epoch": 2.184763047390522, + "grad_norm": 0.6076189884120574, + "learning_rate": 2.0797515987614084e-06, + "loss": 0.5475, + "step": 1822 + }, + { + "epoch": 2.1859628074385125, + "grad_norm": 0.6850039662885326, + "learning_rate": 2.074085069924855e-06, + "loss": 0.5261, + "step": 1823 + }, + { + "epoch": 2.1871625674865025, + "grad_norm": 0.5817182709374599, + "learning_rate": 2.068424250382974e-06, + "loss": 0.5711, + "step": 1824 + }, + { + "epoch": 2.188362327534493, + "grad_norm": 0.7508580988668893, + "learning_rate": 2.0627691511816453e-06, + "loss": 0.5366, + "step": 1825 + }, + { + "epoch": 2.1895620875824835, + "grad_norm": 0.6464934239515318, + "learning_rate": 2.0571197833555756e-06, + "loss": 0.5638, + "step": 1826 + }, + { + "epoch": 2.190761847630474, + "grad_norm": 0.6322795581774937, + "learning_rate": 2.0514761579282926e-06, + "loss": 0.5796, + "step": 1827 + }, + { + "epoch": 2.1919616076784645, + "grad_norm": 0.6151031350395569, + "learning_rate": 2.045838285912125e-06, + "loss": 0.5823, + "step": 1828 + }, + { + "epoch": 2.1931613677264545, + "grad_norm": 0.9645888747249992, + "learning_rate": 2.040206178308164e-06, + "loss": 0.4862, + "step": 1829 + }, + { + "epoch": 2.194361127774445, + "grad_norm": 0.6181538881404581, + "learning_rate": 2.0345798461062594e-06, + "loss": 0.5455, + "step": 1830 + }, + { + "epoch": 2.1955608878224355, + "grad_norm": 0.7168161618227379, + "learning_rate": 2.028959300284991e-06, + "loss": 0.4991, + "step": 1831 + }, + { + "epoch": 2.196760647870426, + "grad_norm": 0.5593225461837725, + "learning_rate": 2.023344551811647e-06, + "loss": 0.5718, + "step": 1832 + }, + { + "epoch": 2.1979604079184165, + "grad_norm": 0.6297978896378902, + "learning_rate": 2.0177356116422025e-06, + "loss": 0.586, + "step": 1833 + }, + { + "epoch": 2.1991601679664066, + "grad_norm": 0.6019293943984955, + "learning_rate": 2.0121324907213003e-06, + "loss": 0.5756, + "step": 1834 + }, + { + "epoch": 2.200359928014397, + "grad_norm": 0.5867637519763697, + "learning_rate": 2.006535199982228e-06, + "loss": 0.5619, + "step": 1835 + }, + { + "epoch": 2.2015596880623876, + "grad_norm": 0.5787485857506453, + "learning_rate": 2.000943750346896e-06, + "loss": 0.6323, + "step": 1836 + }, + { + "epoch": 2.202759448110378, + "grad_norm": 0.6453884378409138, + "learning_rate": 1.995358152725818e-06, + "loss": 0.5689, + "step": 1837 + }, + { + "epoch": 2.203959208158368, + "grad_norm": 0.7534093765701781, + "learning_rate": 1.989778418018088e-06, + "loss": 0.6421, + "step": 1838 + }, + { + "epoch": 2.2051589682063586, + "grad_norm": 0.6859433373701794, + "learning_rate": 1.9842045571113623e-06, + "loss": 0.5641, + "step": 1839 + }, + { + "epoch": 2.206358728254349, + "grad_norm": 0.6513989642933808, + "learning_rate": 1.978636580881829e-06, + "loss": 0.5017, + "step": 1840 + }, + { + "epoch": 2.2075584883023396, + "grad_norm": 0.5656305153730823, + "learning_rate": 1.973074500194199e-06, + "loss": 0.5369, + "step": 1841 + }, + { + "epoch": 2.20875824835033, + "grad_norm": 0.5825701804911828, + "learning_rate": 1.967518325901683e-06, + "loss": 0.6088, + "step": 1842 + }, + { + "epoch": 2.20995800839832, + "grad_norm": 0.6557230732903017, + "learning_rate": 1.9619680688459574e-06, + "loss": 0.6454, + "step": 1843 + }, + { + "epoch": 2.2111577684463106, + "grad_norm": 0.6135776891881046, + "learning_rate": 1.9564237398571565e-06, + "loss": 0.5716, + "step": 1844 + }, + { + "epoch": 2.212357528494301, + "grad_norm": 0.5859319693232549, + "learning_rate": 1.9508853497538533e-06, + "loss": 0.6105, + "step": 1845 + }, + { + "epoch": 2.2135572885422916, + "grad_norm": 0.5963528780810635, + "learning_rate": 1.945352909343021e-06, + "loss": 0.5885, + "step": 1846 + }, + { + "epoch": 2.214757048590282, + "grad_norm": 0.6196620299527998, + "learning_rate": 1.9398264294200315e-06, + "loss": 0.5963, + "step": 1847 + }, + { + "epoch": 2.215956808638272, + "grad_norm": 0.690298706312048, + "learning_rate": 1.9343059207686233e-06, + "loss": 0.5237, + "step": 1848 + }, + { + "epoch": 2.2171565686862627, + "grad_norm": 0.6464422212185456, + "learning_rate": 1.9287913941608833e-06, + "loss": 0.5286, + "step": 1849 + }, + { + "epoch": 2.218356328734253, + "grad_norm": 0.6701580975562744, + "learning_rate": 1.9232828603572255e-06, + "loss": 0.4676, + "step": 1850 + }, + { + "epoch": 2.2195560887822436, + "grad_norm": 0.7331563604028161, + "learning_rate": 1.9177803301063726e-06, + "loss": 0.6865, + "step": 1851 + }, + { + "epoch": 2.220755848830234, + "grad_norm": 0.6404123124096934, + "learning_rate": 1.9122838141453287e-06, + "loss": 0.5835, + "step": 1852 + }, + { + "epoch": 2.221955608878224, + "grad_norm": 0.6725869376483263, + "learning_rate": 1.9067933231993656e-06, + "loss": 0.5538, + "step": 1853 + }, + { + "epoch": 2.2231553689262147, + "grad_norm": 0.618777309548578, + "learning_rate": 1.9013088679819968e-06, + "loss": 0.6463, + "step": 1854 + }, + { + "epoch": 2.224355128974205, + "grad_norm": 0.601708285350291, + "learning_rate": 1.8958304591949594e-06, + "loss": 0.5473, + "step": 1855 + }, + { + "epoch": 2.2255548890221957, + "grad_norm": 0.6762916720677586, + "learning_rate": 1.8903581075281912e-06, + "loss": 0.5765, + "step": 1856 + }, + { + "epoch": 2.226754649070186, + "grad_norm": 0.5819918345353873, + "learning_rate": 1.8848918236598112e-06, + "loss": 0.5443, + "step": 1857 + }, + { + "epoch": 2.227954409118176, + "grad_norm": 0.5970533348240304, + "learning_rate": 1.8794316182560994e-06, + "loss": 0.507, + "step": 1858 + }, + { + "epoch": 2.2291541691661667, + "grad_norm": 0.592778211817268, + "learning_rate": 1.8739775019714735e-06, + "loss": 0.5656, + "step": 1859 + }, + { + "epoch": 2.230353929214157, + "grad_norm": 0.6466163136909957, + "learning_rate": 1.868529485448472e-06, + "loss": 0.5749, + "step": 1860 + }, + { + "epoch": 2.2315536892621477, + "grad_norm": 0.6461789354205344, + "learning_rate": 1.8630875793177245e-06, + "loss": 0.5744, + "step": 1861 + }, + { + "epoch": 2.232753449310138, + "grad_norm": 0.5990001671730018, + "learning_rate": 1.8576517941979466e-06, + "loss": 0.5665, + "step": 1862 + }, + { + "epoch": 2.2339532093581282, + "grad_norm": 0.6786058775623247, + "learning_rate": 1.8522221406959063e-06, + "loss": 0.5481, + "step": 1863 + }, + { + "epoch": 2.2351529694061187, + "grad_norm": 0.5890200409884031, + "learning_rate": 1.8467986294064029e-06, + "loss": 0.5426, + "step": 1864 + }, + { + "epoch": 2.2363527294541092, + "grad_norm": 0.566197078035473, + "learning_rate": 1.8413812709122535e-06, + "loss": 0.5251, + "step": 1865 + }, + { + "epoch": 2.2375524895020997, + "grad_norm": 0.5416952672510823, + "learning_rate": 1.8359700757842757e-06, + "loss": 0.5652, + "step": 1866 + }, + { + "epoch": 2.23875224955009, + "grad_norm": 0.6852543834616891, + "learning_rate": 1.8305650545812499e-06, + "loss": 0.5909, + "step": 1867 + }, + { + "epoch": 2.2399520095980803, + "grad_norm": 0.811544879193826, + "learning_rate": 1.8251662178499136e-06, + "loss": 0.5932, + "step": 1868 + }, + { + "epoch": 2.2411517696460708, + "grad_norm": 0.5762869874446908, + "learning_rate": 1.8197735761249424e-06, + "loss": 0.6263, + "step": 1869 + }, + { + "epoch": 2.2423515296940613, + "grad_norm": 0.5488879273291238, + "learning_rate": 1.8143871399289137e-06, + "loss": 0.5617, + "step": 1870 + }, + { + "epoch": 2.2435512897420518, + "grad_norm": 0.5629041301403368, + "learning_rate": 1.8090069197723032e-06, + "loss": 0.5402, + "step": 1871 + }, + { + "epoch": 2.244751049790042, + "grad_norm": 0.613623805105581, + "learning_rate": 1.8036329261534546e-06, + "loss": 0.5841, + "step": 1872 + }, + { + "epoch": 2.2459508098380323, + "grad_norm": 0.530537229404665, + "learning_rate": 1.7982651695585623e-06, + "loss": 0.51, + "step": 1873 + }, + { + "epoch": 2.247150569886023, + "grad_norm": 0.573926051224772, + "learning_rate": 1.7929036604616518e-06, + "loss": 0.5295, + "step": 1874 + }, + { + "epoch": 2.2483503299340133, + "grad_norm": 0.6723771525346085, + "learning_rate": 1.7875484093245555e-06, + "loss": 0.5417, + "step": 1875 + }, + { + "epoch": 2.249550089982004, + "grad_norm": 0.6500871435131167, + "learning_rate": 1.7821994265968962e-06, + "loss": 0.6398, + "step": 1876 + }, + { + "epoch": 2.250749850029994, + "grad_norm": 0.6648278777836503, + "learning_rate": 1.776856722716067e-06, + "loss": 0.5293, + "step": 1877 + }, + { + "epoch": 2.2519496100779843, + "grad_norm": 0.5759898973309917, + "learning_rate": 1.7715203081072025e-06, + "loss": 0.6463, + "step": 1878 + }, + { + "epoch": 2.253149370125975, + "grad_norm": 0.6066032014974669, + "learning_rate": 1.766190193183175e-06, + "loss": 0.514, + "step": 1879 + }, + { + "epoch": 2.2543491301739653, + "grad_norm": 0.6287796350012775, + "learning_rate": 1.7608663883445582e-06, + "loss": 0.5563, + "step": 1880 + }, + { + "epoch": 2.255548890221956, + "grad_norm": 0.6187048867104146, + "learning_rate": 1.7555489039796093e-06, + "loss": 0.5151, + "step": 1881 + }, + { + "epoch": 2.256748650269946, + "grad_norm": 0.5403178157587972, + "learning_rate": 1.7502377504642631e-06, + "loss": 0.5299, + "step": 1882 + }, + { + "epoch": 2.2579484103179364, + "grad_norm": 0.6300808545625, + "learning_rate": 1.7449329381620938e-06, + "loss": 0.595, + "step": 1883 + }, + { + "epoch": 2.259148170365927, + "grad_norm": 0.5671233702525605, + "learning_rate": 1.7396344774243013e-06, + "loss": 0.5507, + "step": 1884 + }, + { + "epoch": 2.2603479304139174, + "grad_norm": 0.5235190922216324, + "learning_rate": 1.7343423785896934e-06, + "loss": 0.5609, + "step": 1885 + }, + { + "epoch": 2.2615476904619074, + "grad_norm": 0.7559982150412795, + "learning_rate": 1.7290566519846697e-06, + "loss": 0.5258, + "step": 1886 + }, + { + "epoch": 2.262747450509898, + "grad_norm": 0.6092653856814914, + "learning_rate": 1.723777307923185e-06, + "loss": 0.5163, + "step": 1887 + }, + { + "epoch": 2.2639472105578884, + "grad_norm": 0.5677836138018956, + "learning_rate": 1.7185043567067466e-06, + "loss": 0.5546, + "step": 1888 + }, + { + "epoch": 2.265146970605879, + "grad_norm": 0.6187893833043113, + "learning_rate": 1.7132378086243907e-06, + "loss": 0.5551, + "step": 1889 + }, + { + "epoch": 2.2663467306538694, + "grad_norm": 0.708337066742395, + "learning_rate": 1.7079776739526505e-06, + "loss": 0.5433, + "step": 1890 + }, + { + "epoch": 2.26754649070186, + "grad_norm": 0.563463133541644, + "learning_rate": 1.7027239629555503e-06, + "loss": 0.567, + "step": 1891 + }, + { + "epoch": 2.26874625074985, + "grad_norm": 0.6579698507083996, + "learning_rate": 1.6974766858845792e-06, + "loss": 0.5555, + "step": 1892 + }, + { + "epoch": 2.2699460107978404, + "grad_norm": 0.6877758286059601, + "learning_rate": 1.692235852978672e-06, + "loss": 0.5324, + "step": 1893 + }, + { + "epoch": 2.271145770845831, + "grad_norm": 0.6399348315226369, + "learning_rate": 1.6870014744641882e-06, + "loss": 0.5188, + "step": 1894 + }, + { + "epoch": 2.2723455308938214, + "grad_norm": 0.5486291401853911, + "learning_rate": 1.6817735605548936e-06, + "loss": 0.6143, + "step": 1895 + }, + { + "epoch": 2.2735452909418115, + "grad_norm": 0.5805810155866001, + "learning_rate": 1.6765521214519393e-06, + "loss": 0.5345, + "step": 1896 + }, + { + "epoch": 2.274745050989802, + "grad_norm": 0.7942331671780003, + "learning_rate": 1.6713371673438427e-06, + "loss": 0.6989, + "step": 1897 + }, + { + "epoch": 2.2759448110377924, + "grad_norm": 0.6246634234338206, + "learning_rate": 1.666128708406467e-06, + "loss": 0.5657, + "step": 1898 + }, + { + "epoch": 2.277144571085783, + "grad_norm": 0.6477679105114238, + "learning_rate": 1.6609267548030011e-06, + "loss": 0.5589, + "step": 1899 + }, + { + "epoch": 2.2783443311337734, + "grad_norm": 1.5909813424098906, + "learning_rate": 1.6557313166839429e-06, + "loss": 0.6089, + "step": 1900 + }, + { + "epoch": 2.2795440911817635, + "grad_norm": 0.5726043180383195, + "learning_rate": 1.6505424041870693e-06, + "loss": 0.6009, + "step": 1901 + }, + { + "epoch": 2.280743851229754, + "grad_norm": 0.6290546283029272, + "learning_rate": 1.64536002743743e-06, + "loss": 0.5536, + "step": 1902 + }, + { + "epoch": 2.2819436112777445, + "grad_norm": 0.6065446291792649, + "learning_rate": 1.6401841965473248e-06, + "loss": 0.5703, + "step": 1903 + }, + { + "epoch": 2.283143371325735, + "grad_norm": 0.6387978612018363, + "learning_rate": 1.635014921616272e-06, + "loss": 0.5529, + "step": 1904 + }, + { + "epoch": 2.284343131373725, + "grad_norm": 0.5525726365824832, + "learning_rate": 1.6298522127310002e-06, + "loss": 0.596, + "step": 1905 + }, + { + "epoch": 2.2855428914217155, + "grad_norm": 0.6718164458112008, + "learning_rate": 1.6246960799654337e-06, + "loss": 0.5944, + "step": 1906 + }, + { + "epoch": 2.286742651469706, + "grad_norm": 0.6021082270319865, + "learning_rate": 1.619546533380652e-06, + "loss": 0.5542, + "step": 1907 + }, + { + "epoch": 2.2879424115176965, + "grad_norm": 0.5195251990068274, + "learning_rate": 1.614403583024892e-06, + "loss": 0.6108, + "step": 1908 + }, + { + "epoch": 2.289142171565687, + "grad_norm": 0.6037277381699504, + "learning_rate": 1.6092672389335169e-06, + "loss": 0.5581, + "step": 1909 + }, + { + "epoch": 2.2903419316136775, + "grad_norm": 0.6132642852995489, + "learning_rate": 1.6041375111289987e-06, + "loss": 0.6014, + "step": 1910 + }, + { + "epoch": 2.2915416916616675, + "grad_norm": 0.5869390782704332, + "learning_rate": 1.5990144096208998e-06, + "loss": 0.6195, + "step": 1911 + }, + { + "epoch": 2.292741451709658, + "grad_norm": 0.7480947459686417, + "learning_rate": 1.5938979444058527e-06, + "loss": 0.4999, + "step": 1912 + }, + { + "epoch": 2.2939412117576485, + "grad_norm": 0.6503912322641925, + "learning_rate": 1.5887881254675414e-06, + "loss": 0.5348, + "step": 1913 + }, + { + "epoch": 2.295140971805639, + "grad_norm": 0.5927596306455847, + "learning_rate": 1.5836849627766787e-06, + "loss": 0.6018, + "step": 1914 + }, + { + "epoch": 2.296340731853629, + "grad_norm": 0.5863027916401812, + "learning_rate": 1.5785884662909917e-06, + "loss": 0.5754, + "step": 1915 + }, + { + "epoch": 2.2975404919016196, + "grad_norm": 0.5287286367507306, + "learning_rate": 1.5734986459551994e-06, + "loss": 0.5533, + "step": 1916 + }, + { + "epoch": 2.29874025194961, + "grad_norm": 0.683394217140752, + "learning_rate": 1.568415511700992e-06, + "loss": 0.623, + "step": 1917 + }, + { + "epoch": 2.2999400119976006, + "grad_norm": 0.5975790778744007, + "learning_rate": 1.563339073447016e-06, + "loss": 0.5726, + "step": 1918 + }, + { + "epoch": 2.301139772045591, + "grad_norm": 0.6729930876497099, + "learning_rate": 1.5582693410988469e-06, + "loss": 0.5701, + "step": 1919 + }, + { + "epoch": 2.302339532093581, + "grad_norm": 0.6390721570846494, + "learning_rate": 1.553206324548983e-06, + "loss": 0.5314, + "step": 1920 + }, + { + "epoch": 2.3035392921415716, + "grad_norm": 0.6375196247502823, + "learning_rate": 1.548150033676814e-06, + "loss": 0.5239, + "step": 1921 + }, + { + "epoch": 2.304739052189562, + "grad_norm": 0.633062354492987, + "learning_rate": 1.5431004783486004e-06, + "loss": 0.5109, + "step": 1922 + }, + { + "epoch": 2.3059388122375526, + "grad_norm": 0.6168277970639483, + "learning_rate": 1.5380576684174698e-06, + "loss": 0.5909, + "step": 1923 + }, + { + "epoch": 2.3071385722855426, + "grad_norm": 0.6394106835366958, + "learning_rate": 1.5330216137233838e-06, + "loss": 0.562, + "step": 1924 + }, + { + "epoch": 2.308338332333533, + "grad_norm": 0.5875706845731299, + "learning_rate": 1.5279923240931176e-06, + "loss": 0.6031, + "step": 1925 + }, + { + "epoch": 2.3095380923815236, + "grad_norm": 0.5722294455291087, + "learning_rate": 1.5229698093402505e-06, + "loss": 0.5993, + "step": 1926 + }, + { + "epoch": 2.310737852429514, + "grad_norm": 0.5564432556484449, + "learning_rate": 1.5179540792651464e-06, + "loss": 0.5627, + "step": 1927 + }, + { + "epoch": 2.3119376124775046, + "grad_norm": 0.641828291089028, + "learning_rate": 1.5129451436549203e-06, + "loss": 0.5383, + "step": 1928 + }, + { + "epoch": 2.313137372525495, + "grad_norm": 0.5726544285669013, + "learning_rate": 1.5079430122834343e-06, + "loss": 0.5737, + "step": 1929 + }, + { + "epoch": 2.314337132573485, + "grad_norm": 0.8717905620475225, + "learning_rate": 1.5029476949112787e-06, + "loss": 0.6155, + "step": 1930 + }, + { + "epoch": 2.3155368926214757, + "grad_norm": 0.6063397584759121, + "learning_rate": 1.4979592012857375e-06, + "loss": 0.5923, + "step": 1931 + }, + { + "epoch": 2.316736652669466, + "grad_norm": 0.6089932443469553, + "learning_rate": 1.4929775411407871e-06, + "loss": 0.6011, + "step": 1932 + }, + { + "epoch": 2.3179364127174567, + "grad_norm": 0.6062511474466317, + "learning_rate": 1.4880027241970668e-06, + "loss": 0.5431, + "step": 1933 + }, + { + "epoch": 2.3191361727654467, + "grad_norm": 0.59304838560386, + "learning_rate": 1.4830347601618639e-06, + "loss": 0.5945, + "step": 1934 + }, + { + "epoch": 2.320335932813437, + "grad_norm": 0.643578110697501, + "learning_rate": 1.4780736587290927e-06, + "loss": 0.5187, + "step": 1935 + }, + { + "epoch": 2.3215356928614277, + "grad_norm": 0.5817383170621565, + "learning_rate": 1.4731194295792788e-06, + "loss": 0.5428, + "step": 1936 + }, + { + "epoch": 2.322735452909418, + "grad_norm": 0.6605528620423613, + "learning_rate": 1.4681720823795353e-06, + "loss": 0.5942, + "step": 1937 + }, + { + "epoch": 2.3239352129574087, + "grad_norm": 0.5914015474201617, + "learning_rate": 1.46323162678355e-06, + "loss": 0.5933, + "step": 1938 + }, + { + "epoch": 2.3251349730053987, + "grad_norm": 0.542788224900796, + "learning_rate": 1.458298072431556e-06, + "loss": 0.5954, + "step": 1939 + }, + { + "epoch": 2.3263347330533892, + "grad_norm": 0.569600511641598, + "learning_rate": 1.4533714289503297e-06, + "loss": 0.5392, + "step": 1940 + }, + { + "epoch": 2.3275344931013797, + "grad_norm": 0.592861908888043, + "learning_rate": 1.4484517059531588e-06, + "loss": 0.5719, + "step": 1941 + }, + { + "epoch": 2.32873425314937, + "grad_norm": 0.6265611773472458, + "learning_rate": 1.4435389130398208e-06, + "loss": 0.58, + "step": 1942 + }, + { + "epoch": 2.3299340131973607, + "grad_norm": 0.6304403704739078, + "learning_rate": 1.438633059796581e-06, + "loss": 0.5154, + "step": 1943 + }, + { + "epoch": 2.3311337732453508, + "grad_norm": 0.5543815663073327, + "learning_rate": 1.4337341557961587e-06, + "loss": 0.5859, + "step": 1944 + }, + { + "epoch": 2.3323335332933413, + "grad_norm": 0.605843532077587, + "learning_rate": 1.4288422105977096e-06, + "loss": 0.5499, + "step": 1945 + }, + { + "epoch": 2.3335332933413317, + "grad_norm": 0.6619172150696679, + "learning_rate": 1.4239572337468144e-06, + "loss": 0.528, + "step": 1946 + }, + { + "epoch": 2.3347330533893222, + "grad_norm": 0.6475994414381125, + "learning_rate": 1.4190792347754612e-06, + "loss": 0.4947, + "step": 1947 + }, + { + "epoch": 2.3359328134373127, + "grad_norm": 0.6041381388503334, + "learning_rate": 1.4142082232020127e-06, + "loss": 0.5507, + "step": 1948 + }, + { + "epoch": 2.337132573485303, + "grad_norm": 0.5982926224915779, + "learning_rate": 1.4093442085312036e-06, + "loss": 0.5908, + "step": 1949 + }, + { + "epoch": 2.3383323335332933, + "grad_norm": 0.6189361684892732, + "learning_rate": 1.4044872002541137e-06, + "loss": 0.5565, + "step": 1950 + }, + { + "epoch": 2.339532093581284, + "grad_norm": 0.6693290788425857, + "learning_rate": 1.3996372078481524e-06, + "loss": 0.5737, + "step": 1951 + }, + { + "epoch": 2.3407318536292743, + "grad_norm": 0.6402845104708695, + "learning_rate": 1.3947942407770382e-06, + "loss": 0.5414, + "step": 1952 + }, + { + "epoch": 2.3419316136772643, + "grad_norm": 0.616135532565906, + "learning_rate": 1.3899583084907808e-06, + "loss": 0.5517, + "step": 1953 + }, + { + "epoch": 2.343131373725255, + "grad_norm": 0.6461494511051894, + "learning_rate": 1.3851294204256638e-06, + "loss": 0.5728, + "step": 1954 + }, + { + "epoch": 2.3443311337732453, + "grad_norm": 0.6861833846520821, + "learning_rate": 1.3803075860042258e-06, + "loss": 0.5806, + "step": 1955 + }, + { + "epoch": 2.345530893821236, + "grad_norm": 0.6014821091393561, + "learning_rate": 1.3754928146352408e-06, + "loss": 0.6158, + "step": 1956 + }, + { + "epoch": 2.3467306538692263, + "grad_norm": 0.5475636439705546, + "learning_rate": 1.3706851157137008e-06, + "loss": 0.5431, + "step": 1957 + }, + { + "epoch": 2.347930413917217, + "grad_norm": 0.635894078553141, + "learning_rate": 1.3658844986207987e-06, + "loss": 0.5246, + "step": 1958 + }, + { + "epoch": 2.349130173965207, + "grad_norm": 0.6110431358884112, + "learning_rate": 1.3610909727239074e-06, + "loss": 0.6095, + "step": 1959 + }, + { + "epoch": 2.3503299340131973, + "grad_norm": 0.6877673486069413, + "learning_rate": 1.3563045473765634e-06, + "loss": 0.5491, + "step": 1960 + }, + { + "epoch": 2.351529694061188, + "grad_norm": 0.5904116741620707, + "learning_rate": 1.3515252319184497e-06, + "loss": 0.5307, + "step": 1961 + }, + { + "epoch": 2.3527294541091783, + "grad_norm": 0.5757789707012982, + "learning_rate": 1.3467530356753705e-06, + "loss": 0.545, + "step": 1962 + }, + { + "epoch": 2.3539292141571684, + "grad_norm": 0.6074692348087055, + "learning_rate": 1.3419879679592423e-06, + "loss": 0.5854, + "step": 1963 + }, + { + "epoch": 2.355128974205159, + "grad_norm": 0.6340604837018596, + "learning_rate": 1.3372300380680765e-06, + "loss": 0.5274, + "step": 1964 + }, + { + "epoch": 2.3563287342531494, + "grad_norm": 0.5595653101922524, + "learning_rate": 1.3324792552859456e-06, + "loss": 0.5705, + "step": 1965 + }, + { + "epoch": 2.35752849430114, + "grad_norm": 0.6038322319383336, + "learning_rate": 1.327735628882984e-06, + "loss": 0.5967, + "step": 1966 + }, + { + "epoch": 2.3587282543491304, + "grad_norm": 0.6053575074959013, + "learning_rate": 1.3229991681153632e-06, + "loss": 0.5921, + "step": 1967 + }, + { + "epoch": 2.3599280143971204, + "grad_norm": 0.7111278599605177, + "learning_rate": 1.3182698822252648e-06, + "loss": 0.531, + "step": 1968 + }, + { + "epoch": 2.361127774445111, + "grad_norm": 0.6305780252744821, + "learning_rate": 1.3135477804408763e-06, + "loss": 0.5621, + "step": 1969 + }, + { + "epoch": 2.3623275344931014, + "grad_norm": 0.6419371901578995, + "learning_rate": 1.308832871976366e-06, + "loss": 0.5162, + "step": 1970 + }, + { + "epoch": 2.363527294541092, + "grad_norm": 0.5597564694748165, + "learning_rate": 1.3041251660318643e-06, + "loss": 0.5992, + "step": 1971 + }, + { + "epoch": 2.364727054589082, + "grad_norm": 0.6223790891939087, + "learning_rate": 1.2994246717934488e-06, + "loss": 0.5711, + "step": 1972 + }, + { + "epoch": 2.3659268146370724, + "grad_norm": 0.7933541406230165, + "learning_rate": 1.2947313984331244e-06, + "loss": 0.5178, + "step": 1973 + }, + { + "epoch": 2.367126574685063, + "grad_norm": 0.5654858787055241, + "learning_rate": 1.2900453551088071e-06, + "loss": 0.5364, + "step": 1974 + }, + { + "epoch": 2.3683263347330534, + "grad_norm": 0.6515659326283025, + "learning_rate": 1.2853665509643032e-06, + "loss": 0.5745, + "step": 1975 + }, + { + "epoch": 2.369526094781044, + "grad_norm": 0.6874196766144196, + "learning_rate": 1.2806949951292945e-06, + "loss": 0.563, + "step": 1976 + }, + { + "epoch": 2.3707258548290344, + "grad_norm": 0.6517047305231014, + "learning_rate": 1.2760306967193191e-06, + "loss": 0.5392, + "step": 1977 + }, + { + "epoch": 2.3719256148770245, + "grad_norm": 0.7324117552434645, + "learning_rate": 1.2713736648357545e-06, + "loss": 0.5036, + "step": 1978 + }, + { + "epoch": 2.373125374925015, + "grad_norm": 0.6192932403368865, + "learning_rate": 1.2667239085657983e-06, + "loss": 0.5777, + "step": 1979 + }, + { + "epoch": 2.3743251349730055, + "grad_norm": 0.5380945260489934, + "learning_rate": 1.262081436982448e-06, + "loss": 0.5881, + "step": 1980 + }, + { + "epoch": 2.375524895020996, + "grad_norm": 0.7003327766357957, + "learning_rate": 1.257446259144494e-06, + "loss": 0.519, + "step": 1981 + }, + { + "epoch": 2.376724655068986, + "grad_norm": 0.6167640023652006, + "learning_rate": 1.2528183840964913e-06, + "loss": 0.5325, + "step": 1982 + }, + { + "epoch": 2.3779244151169765, + "grad_norm": 0.5850934143132089, + "learning_rate": 1.248197820868739e-06, + "loss": 0.5664, + "step": 1983 + }, + { + "epoch": 2.379124175164967, + "grad_norm": 0.6297352724034515, + "learning_rate": 1.2435845784772783e-06, + "loss": 0.5641, + "step": 1984 + }, + { + "epoch": 2.3803239352129575, + "grad_norm": 0.5442818523295648, + "learning_rate": 1.238978665923861e-06, + "loss": 0.5983, + "step": 1985 + }, + { + "epoch": 2.381523695260948, + "grad_norm": 0.7001314080471295, + "learning_rate": 1.2343800921959348e-06, + "loss": 0.6551, + "step": 1986 + }, + { + "epoch": 2.382723455308938, + "grad_norm": 0.5938647577049746, + "learning_rate": 1.2297888662666275e-06, + "loss": 0.5488, + "step": 1987 + }, + { + "epoch": 2.3839232153569285, + "grad_norm": 0.6845150089494614, + "learning_rate": 1.2252049970947355e-06, + "loss": 0.4832, + "step": 1988 + }, + { + "epoch": 2.385122975404919, + "grad_norm": 0.5537726896596256, + "learning_rate": 1.2206284936246903e-06, + "loss": 0.621, + "step": 1989 + }, + { + "epoch": 2.3863227354529095, + "grad_norm": 0.5956832825946745, + "learning_rate": 1.216059364786556e-06, + "loss": 0.6012, + "step": 1990 + }, + { + "epoch": 2.3875224955009, + "grad_norm": 0.5883351697255591, + "learning_rate": 1.211497619496011e-06, + "loss": 0.5384, + "step": 1991 + }, + { + "epoch": 2.38872225554889, + "grad_norm": 0.609285464500927, + "learning_rate": 1.2069432666543167e-06, + "loss": 0.5799, + "step": 1992 + }, + { + "epoch": 2.3899220155968806, + "grad_norm": 0.6326702607786753, + "learning_rate": 1.2023963151483165e-06, + "loss": 0.5632, + "step": 1993 + }, + { + "epoch": 2.391121775644871, + "grad_norm": 0.6649244538257834, + "learning_rate": 1.1978567738504099e-06, + "loss": 0.5383, + "step": 1994 + }, + { + "epoch": 2.3923215356928615, + "grad_norm": 0.5446010496554285, + "learning_rate": 1.1933246516185372e-06, + "loss": 0.553, + "step": 1995 + }, + { + "epoch": 2.393521295740852, + "grad_norm": 0.6395049348350018, + "learning_rate": 1.1887999572961605e-06, + "loss": 0.5323, + "step": 1996 + }, + { + "epoch": 2.394721055788842, + "grad_norm": 0.5624977453415554, + "learning_rate": 1.1842826997122504e-06, + "loss": 0.5583, + "step": 1997 + }, + { + "epoch": 2.3959208158368326, + "grad_norm": 0.6597021331336324, + "learning_rate": 1.1797728876812643e-06, + "loss": 0.5321, + "step": 1998 + }, + { + "epoch": 2.397120575884823, + "grad_norm": 0.6019638446046344, + "learning_rate": 1.1752705300031336e-06, + "loss": 0.5535, + "step": 1999 + }, + { + "epoch": 2.3983203359328136, + "grad_norm": 0.6228151763589116, + "learning_rate": 1.1707756354632382e-06, + "loss": 0.5591, + "step": 2000 + }, + { + "epoch": 2.3995200959808036, + "grad_norm": 0.6312014950596989, + "learning_rate": 1.1662882128324044e-06, + "loss": 0.5698, + "step": 2001 + }, + { + "epoch": 2.400719856028794, + "grad_norm": 0.7290597527034558, + "learning_rate": 1.1618082708668725e-06, + "loss": 0.5857, + "step": 2002 + }, + { + "epoch": 2.4019196160767846, + "grad_norm": 0.5542132677584134, + "learning_rate": 1.1573358183082861e-06, + "loss": 0.5852, + "step": 2003 + }, + { + "epoch": 2.403119376124775, + "grad_norm": 0.5953507728815691, + "learning_rate": 1.1528708638836744e-06, + "loss": 0.5464, + "step": 2004 + }, + { + "epoch": 2.4043191361727656, + "grad_norm": 0.5587521569960043, + "learning_rate": 1.1484134163054423e-06, + "loss": 0.5384, + "step": 2005 + }, + { + "epoch": 2.405518896220756, + "grad_norm": 0.5002152020323153, + "learning_rate": 1.1439634842713371e-06, + "loss": 0.6208, + "step": 2006 + }, + { + "epoch": 2.406718656268746, + "grad_norm": 0.5942264281550809, + "learning_rate": 1.1395210764644454e-06, + "loss": 0.6291, + "step": 2007 + }, + { + "epoch": 2.4079184163167366, + "grad_norm": 0.6363610661455074, + "learning_rate": 1.1350862015531766e-06, + "loss": 0.615, + "step": 2008 + }, + { + "epoch": 2.409118176364727, + "grad_norm": 0.6811779985331536, + "learning_rate": 1.1306588681912318e-06, + "loss": 0.5607, + "step": 2009 + }, + { + "epoch": 2.4103179364127176, + "grad_norm": 0.6209782474828403, + "learning_rate": 1.1262390850176042e-06, + "loss": 0.541, + "step": 2010 + }, + { + "epoch": 2.4115176964607077, + "grad_norm": 0.5541181532088056, + "learning_rate": 1.1218268606565497e-06, + "loss": 0.5851, + "step": 2011 + }, + { + "epoch": 2.412717456508698, + "grad_norm": 0.650537648990647, + "learning_rate": 1.117422203717578e-06, + "loss": 0.5038, + "step": 2012 + }, + { + "epoch": 2.4139172165566887, + "grad_norm": 0.670922801022453, + "learning_rate": 1.1130251227954313e-06, + "loss": 0.5934, + "step": 2013 + }, + { + "epoch": 2.415116976604679, + "grad_norm": 0.6601396976807905, + "learning_rate": 1.1086356264700683e-06, + "loss": 0.5824, + "step": 2014 + }, + { + "epoch": 2.4163167366526697, + "grad_norm": 0.5633180549832768, + "learning_rate": 1.104253723306648e-06, + "loss": 0.5778, + "step": 2015 + }, + { + "epoch": 2.4175164967006597, + "grad_norm": 0.5997911293984551, + "learning_rate": 1.0998794218555141e-06, + "loss": 0.5644, + "step": 2016 + }, + { + "epoch": 2.41871625674865, + "grad_norm": 0.5152887028635624, + "learning_rate": 1.0955127306521768e-06, + "loss": 0.5438, + "step": 2017 + }, + { + "epoch": 2.4199160167966407, + "grad_norm": 0.6229782943785084, + "learning_rate": 1.091153658217296e-06, + "loss": 0.5625, + "step": 2018 + }, + { + "epoch": 2.421115776844631, + "grad_norm": 0.5777102576408649, + "learning_rate": 1.0868022130566652e-06, + "loss": 0.6141, + "step": 2019 + }, + { + "epoch": 2.4223155368926212, + "grad_norm": 0.519362477250475, + "learning_rate": 1.0824584036611952e-06, + "loss": 0.5948, + "step": 2020 + }, + { + "epoch": 2.4235152969406117, + "grad_norm": 0.6323279030159387, + "learning_rate": 1.0781222385068974e-06, + "loss": 0.5099, + "step": 2021 + }, + { + "epoch": 2.4247150569886022, + "grad_norm": 0.6096341213942442, + "learning_rate": 1.0737937260548675e-06, + "loss": 0.5425, + "step": 2022 + }, + { + "epoch": 2.4259148170365927, + "grad_norm": 0.5847459771908484, + "learning_rate": 1.0694728747512688e-06, + "loss": 0.5576, + "step": 2023 + }, + { + "epoch": 2.4271145770845832, + "grad_norm": 0.6310530150131601, + "learning_rate": 1.0651596930273107e-06, + "loss": 0.5516, + "step": 2024 + }, + { + "epoch": 2.4283143371325737, + "grad_norm": 0.6063344779401464, + "learning_rate": 1.0608541892992464e-06, + "loss": 0.563, + "step": 2025 + }, + { + "epoch": 2.4295140971805638, + "grad_norm": 0.6688338028777848, + "learning_rate": 1.0565563719683387e-06, + "loss": 0.5099, + "step": 2026 + }, + { + "epoch": 2.4307138572285543, + "grad_norm": 0.828679681934732, + "learning_rate": 1.0522662494208535e-06, + "loss": 0.5441, + "step": 2027 + }, + { + "epoch": 2.4319136172765448, + "grad_norm": 0.5121616235981272, + "learning_rate": 1.0479838300280499e-06, + "loss": 0.6223, + "step": 2028 + }, + { + "epoch": 2.4331133773245353, + "grad_norm": 0.6018457847836182, + "learning_rate": 1.0437091221461438e-06, + "loss": 0.5202, + "step": 2029 + }, + { + "epoch": 2.4343131373725253, + "grad_norm": 0.5854513098809797, + "learning_rate": 1.0394421341163123e-06, + "loss": 0.5088, + "step": 2030 + }, + { + "epoch": 2.435512897420516, + "grad_norm": 0.6316958864507666, + "learning_rate": 1.0351828742646658e-06, + "loss": 0.5061, + "step": 2031 + }, + { + "epoch": 2.4367126574685063, + "grad_norm": 0.6489235005842126, + "learning_rate": 1.030931350902235e-06, + "loss": 0.6063, + "step": 2032 + }, + { + "epoch": 2.437912417516497, + "grad_norm": 0.5824017683498028, + "learning_rate": 1.0266875723249547e-06, + "loss": 0.5601, + "step": 2033 + }, + { + "epoch": 2.4391121775644873, + "grad_norm": 0.6200243422536237, + "learning_rate": 1.0224515468136481e-06, + "loss": 0.5506, + "step": 2034 + }, + { + "epoch": 2.4403119376124773, + "grad_norm": 0.5720863280705479, + "learning_rate": 1.0182232826340095e-06, + "loss": 0.5558, + "step": 2035 + }, + { + "epoch": 2.441511697660468, + "grad_norm": 0.6205568182114685, + "learning_rate": 1.014002788036587e-06, + "loss": 0.5641, + "step": 2036 + }, + { + "epoch": 2.4427114577084583, + "grad_norm": 0.6206921709526924, + "learning_rate": 1.0097900712567716e-06, + "loss": 0.4522, + "step": 2037 + }, + { + "epoch": 2.443911217756449, + "grad_norm": 0.5979265289150865, + "learning_rate": 1.005585140514773e-06, + "loss": 0.5393, + "step": 2038 + }, + { + "epoch": 2.445110977804439, + "grad_norm": 0.6106838156167552, + "learning_rate": 1.001388004015613e-06, + "loss": 0.5683, + "step": 2039 + }, + { + "epoch": 2.4463107378524294, + "grad_norm": 0.608285288407351, + "learning_rate": 9.97198669949102e-07, + "loss": 0.5913, + "step": 2040 + }, + { + "epoch": 2.44751049790042, + "grad_norm": 0.5438934210875334, + "learning_rate": 9.930171464898224e-07, + "loss": 0.6163, + "step": 2041 + }, + { + "epoch": 2.4487102579484104, + "grad_norm": 0.5543471267140387, + "learning_rate": 9.888434417971231e-07, + "loss": 0.5392, + "step": 2042 + }, + { + "epoch": 2.449910017996401, + "grad_norm": 0.6119791506740563, + "learning_rate": 9.846775640150929e-07, + "loss": 0.5483, + "step": 2043 + }, + { + "epoch": 2.4511097780443913, + "grad_norm": 0.626406196205996, + "learning_rate": 9.805195212725432e-07, + "loss": 0.5641, + "step": 2044 + }, + { + "epoch": 2.4523095380923814, + "grad_norm": 0.5706517719297208, + "learning_rate": 9.763693216830055e-07, + "loss": 0.5396, + "step": 2045 + }, + { + "epoch": 2.453509298140372, + "grad_norm": 0.6232083000658682, + "learning_rate": 9.722269733447038e-07, + "loss": 0.5458, + "step": 2046 + }, + { + "epoch": 2.4547090581883624, + "grad_norm": 0.6360627530329451, + "learning_rate": 9.680924843405377e-07, + "loss": 0.5637, + "step": 2047 + }, + { + "epoch": 2.455908818236353, + "grad_norm": 0.5964220775139456, + "learning_rate": 9.639658627380755e-07, + "loss": 0.5884, + "step": 2048 + }, + { + "epoch": 2.457108578284343, + "grad_norm": 0.639571658876441, + "learning_rate": 9.598471165895361e-07, + "loss": 0.5329, + "step": 2049 + }, + { + "epoch": 2.4583083383323334, + "grad_norm": 0.585403973447277, + "learning_rate": 9.557362539317656e-07, + "loss": 0.5455, + "step": 2050 + }, + { + "epoch": 2.459508098380324, + "grad_norm": 0.5243149220065292, + "learning_rate": 9.516332827862284e-07, + "loss": 0.5215, + "step": 2051 + }, + { + "epoch": 2.4607078584283144, + "grad_norm": 0.6437331191238298, + "learning_rate": 9.475382111589965e-07, + "loss": 0.5345, + "step": 2052 + }, + { + "epoch": 2.461907618476305, + "grad_norm": 0.6279857956904481, + "learning_rate": 9.434510470407182e-07, + "loss": 0.6214, + "step": 2053 + }, + { + "epoch": 2.463107378524295, + "grad_norm": 0.5517051975103123, + "learning_rate": 9.393717984066181e-07, + "loss": 0.5662, + "step": 2054 + }, + { + "epoch": 2.4643071385722854, + "grad_norm": 0.5512005435549526, + "learning_rate": 9.353004732164739e-07, + "loss": 0.5303, + "step": 2055 + }, + { + "epoch": 2.465506898620276, + "grad_norm": 0.72031566792074, + "learning_rate": 9.312370794146025e-07, + "loss": 0.4986, + "step": 2056 + }, + { + "epoch": 2.4667066586682664, + "grad_norm": 0.6545253273196747, + "learning_rate": 9.271816249298449e-07, + "loss": 0.5527, + "step": 2057 + }, + { + "epoch": 2.467906418716257, + "grad_norm": 0.5422272008238539, + "learning_rate": 9.231341176755487e-07, + "loss": 0.5918, + "step": 2058 + }, + { + "epoch": 2.469106178764247, + "grad_norm": 0.5958161709563157, + "learning_rate": 9.190945655495553e-07, + "loss": 0.5588, + "step": 2059 + }, + { + "epoch": 2.4703059388122375, + "grad_norm": 0.7328783616345088, + "learning_rate": 9.15062976434185e-07, + "loss": 0.5479, + "step": 2060 + }, + { + "epoch": 2.471505698860228, + "grad_norm": 0.6278689600428524, + "learning_rate": 9.110393581962146e-07, + "loss": 0.5794, + "step": 2061 + }, + { + "epoch": 2.4727054589082185, + "grad_norm": 0.6492780524869001, + "learning_rate": 9.070237186868741e-07, + "loss": 0.5853, + "step": 2062 + }, + { + "epoch": 2.473905218956209, + "grad_norm": 0.5522041253309613, + "learning_rate": 9.030160657418218e-07, + "loss": 0.6022, + "step": 2063 + }, + { + "epoch": 2.475104979004199, + "grad_norm": 0.6684264219716441, + "learning_rate": 8.990164071811297e-07, + "loss": 0.5468, + "step": 2064 + }, + { + "epoch": 2.4763047390521895, + "grad_norm": 0.6135364422730409, + "learning_rate": 8.950247508092724e-07, + "loss": 0.5553, + "step": 2065 + }, + { + "epoch": 2.47750449910018, + "grad_norm": 0.5842931943708874, + "learning_rate": 8.910411044151135e-07, + "loss": 0.6056, + "step": 2066 + }, + { + "epoch": 2.4787042591481705, + "grad_norm": 0.6777460725695204, + "learning_rate": 8.870654757718805e-07, + "loss": 0.5567, + "step": 2067 + }, + { + "epoch": 2.4799040191961605, + "grad_norm": 0.6673476797934147, + "learning_rate": 8.830978726371586e-07, + "loss": 0.5377, + "step": 2068 + }, + { + "epoch": 2.481103779244151, + "grad_norm": 0.6182567506606317, + "learning_rate": 8.791383027528772e-07, + "loss": 0.5056, + "step": 2069 + }, + { + "epoch": 2.4823035392921415, + "grad_norm": 0.5761850561096625, + "learning_rate": 8.75186773845283e-07, + "loss": 0.5573, + "step": 2070 + }, + { + "epoch": 2.483503299340132, + "grad_norm": 0.606360236325203, + "learning_rate": 8.712432936249365e-07, + "loss": 0.6207, + "step": 2071 + }, + { + "epoch": 2.4847030593881225, + "grad_norm": 0.603595466271828, + "learning_rate": 8.673078697866933e-07, + "loss": 0.6215, + "step": 2072 + }, + { + "epoch": 2.485902819436113, + "grad_norm": 0.6749154407409601, + "learning_rate": 8.633805100096881e-07, + "loss": 0.5485, + "step": 2073 + }, + { + "epoch": 2.487102579484103, + "grad_norm": 0.7045002622056066, + "learning_rate": 8.594612219573195e-07, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 2.4883023395320936, + "grad_norm": 0.5950945639339188, + "learning_rate": 8.555500132772371e-07, + "loss": 0.5867, + "step": 2075 + }, + { + "epoch": 2.489502099580084, + "grad_norm": 0.5732665057845471, + "learning_rate": 8.516468916013243e-07, + "loss": 0.5325, + "step": 2076 + }, + { + "epoch": 2.4907018596280746, + "grad_norm": 0.6730573537624812, + "learning_rate": 8.47751864545685e-07, + "loss": 0.5712, + "step": 2077 + }, + { + "epoch": 2.4919016196760646, + "grad_norm": 0.6103788351739281, + "learning_rate": 8.438649397106285e-07, + "loss": 0.5506, + "step": 2078 + }, + { + "epoch": 2.493101379724055, + "grad_norm": 0.6331513008649531, + "learning_rate": 8.399861246806534e-07, + "loss": 0.5603, + "step": 2079 + }, + { + "epoch": 2.4943011397720456, + "grad_norm": 0.6559565087341892, + "learning_rate": 8.361154270244342e-07, + "loss": 0.4952, + "step": 2080 + }, + { + "epoch": 2.495500899820036, + "grad_norm": 0.6259786029877843, + "learning_rate": 8.322528542948067e-07, + "loss": 0.5476, + "step": 2081 + }, + { + "epoch": 2.4967006598680266, + "grad_norm": 0.6397549274955587, + "learning_rate": 8.283984140287521e-07, + "loss": 0.5335, + "step": 2082 + }, + { + "epoch": 2.4979004199160166, + "grad_norm": 0.7385966603303945, + "learning_rate": 8.245521137473822e-07, + "loss": 0.647, + "step": 2083 + }, + { + "epoch": 2.499100179964007, + "grad_norm": 0.6137631792373935, + "learning_rate": 8.207139609559284e-07, + "loss": 0.5315, + "step": 2084 + }, + { + "epoch": 2.5002999400119976, + "grad_norm": 0.6350875453667628, + "learning_rate": 8.16883963143717e-07, + "loss": 0.5511, + "step": 2085 + }, + { + "epoch": 2.501499700059988, + "grad_norm": 0.6032552621237215, + "learning_rate": 8.130621277841716e-07, + "loss": 0.5984, + "step": 2086 + }, + { + "epoch": 2.502699460107978, + "grad_norm": 0.5810034597194302, + "learning_rate": 8.09248462334779e-07, + "loss": 0.5514, + "step": 2087 + }, + { + "epoch": 2.5038992201559687, + "grad_norm": 0.564087058663493, + "learning_rate": 8.054429742370906e-07, + "loss": 0.5841, + "step": 2088 + }, + { + "epoch": 2.505098980203959, + "grad_norm": 0.62931338569948, + "learning_rate": 8.016456709166986e-07, + "loss": 0.5231, + "step": 2089 + }, + { + "epoch": 2.5062987402519497, + "grad_norm": 0.5686764121520594, + "learning_rate": 7.97856559783225e-07, + "loss": 0.5367, + "step": 2090 + }, + { + "epoch": 2.50749850029994, + "grad_norm": 0.6061303901603898, + "learning_rate": 7.940756482303064e-07, + "loss": 0.5871, + "step": 2091 + }, + { + "epoch": 2.5086982603479306, + "grad_norm": 0.637376733778738, + "learning_rate": 7.903029436355802e-07, + "loss": 0.6376, + "step": 2092 + }, + { + "epoch": 2.5098980203959207, + "grad_norm": 0.6070290680506467, + "learning_rate": 7.86538453360669e-07, + "loss": 0.5408, + "step": 2093 + }, + { + "epoch": 2.511097780443911, + "grad_norm": 0.6605654652869697, + "learning_rate": 7.827821847511668e-07, + "loss": 0.6269, + "step": 2094 + }, + { + "epoch": 2.5122975404919017, + "grad_norm": 0.6225011194684036, + "learning_rate": 7.790341451366263e-07, + "loss": 0.5601, + "step": 2095 + }, + { + "epoch": 2.513497300539892, + "grad_norm": 0.5458392403706084, + "learning_rate": 7.752943418305408e-07, + "loss": 0.5221, + "step": 2096 + }, + { + "epoch": 2.5146970605878822, + "grad_norm": 0.6360773622632925, + "learning_rate": 7.71562782130334e-07, + "loss": 0.6115, + "step": 2097 + }, + { + "epoch": 2.5158968206358727, + "grad_norm": 0.5782806887863192, + "learning_rate": 7.678394733173427e-07, + "loss": 0.5208, + "step": 2098 + }, + { + "epoch": 2.517096580683863, + "grad_norm": 0.5044811505319163, + "learning_rate": 7.641244226568056e-07, + "loss": 0.6212, + "step": 2099 + }, + { + "epoch": 2.5182963407318537, + "grad_norm": 0.6499090066208069, + "learning_rate": 7.60417637397845e-07, + "loss": 0.6085, + "step": 2100 + }, + { + "epoch": 2.519496100779844, + "grad_norm": 0.546825659889867, + "learning_rate": 7.567191247734589e-07, + "loss": 0.5505, + "step": 2101 + }, + { + "epoch": 2.5206958608278347, + "grad_norm": 0.5661118081548666, + "learning_rate": 7.530288920004957e-07, + "loss": 0.5503, + "step": 2102 + }, + { + "epoch": 2.5218956208758248, + "grad_norm": 0.5758245642104611, + "learning_rate": 7.493469462796565e-07, + "loss": 0.5188, + "step": 2103 + }, + { + "epoch": 2.5230953809238152, + "grad_norm": 0.65196002526569, + "learning_rate": 7.45673294795467e-07, + "loss": 0.6145, + "step": 2104 + }, + { + "epoch": 2.5242951409718057, + "grad_norm": 0.6387451554071346, + "learning_rate": 7.420079447162665e-07, + "loss": 0.5421, + "step": 2105 + }, + { + "epoch": 2.525494901019796, + "grad_norm": 0.6660052812587164, + "learning_rate": 7.383509031942016e-07, + "loss": 0.5441, + "step": 2106 + }, + { + "epoch": 2.5266946610677863, + "grad_norm": 0.6643620782254298, + "learning_rate": 7.347021773652035e-07, + "loss": 0.5121, + "step": 2107 + }, + { + "epoch": 2.527894421115777, + "grad_norm": 0.5942439165853299, + "learning_rate": 7.31061774348975e-07, + "loss": 0.5815, + "step": 2108 + }, + { + "epoch": 2.5290941811637673, + "grad_norm": 0.6514630475774669, + "learning_rate": 7.274297012489812e-07, + "loss": 0.5866, + "step": 2109 + }, + { + "epoch": 2.5302939412117578, + "grad_norm": 0.6417301950385145, + "learning_rate": 7.238059651524354e-07, + "loss": 0.5927, + "step": 2110 + }, + { + "epoch": 2.5314937012597483, + "grad_norm": 0.6574104218840121, + "learning_rate": 7.201905731302777e-07, + "loss": 0.6499, + "step": 2111 + }, + { + "epoch": 2.5326934613077383, + "grad_norm": 0.6806120471137596, + "learning_rate": 7.165835322371689e-07, + "loss": 0.5817, + "step": 2112 + }, + { + "epoch": 2.533893221355729, + "grad_norm": 0.6370789687688718, + "learning_rate": 7.129848495114783e-07, + "loss": 0.5126, + "step": 2113 + }, + { + "epoch": 2.5350929814037193, + "grad_norm": 0.6451567782463036, + "learning_rate": 7.093945319752588e-07, + "loss": 0.5185, + "step": 2114 + }, + { + "epoch": 2.53629274145171, + "grad_norm": 0.6322934547855595, + "learning_rate": 7.058125866342453e-07, + "loss": 0.6352, + "step": 2115 + }, + { + "epoch": 2.5374925014997, + "grad_norm": 0.5974029199120204, + "learning_rate": 7.022390204778351e-07, + "loss": 0.5667, + "step": 2116 + }, + { + "epoch": 2.5386922615476903, + "grad_norm": 0.8295297775898799, + "learning_rate": 6.986738404790755e-07, + "loss": 0.5237, + "step": 2117 + }, + { + "epoch": 2.539892021595681, + "grad_norm": 0.5713910785693439, + "learning_rate": 6.9511705359465e-07, + "loss": 0.6225, + "step": 2118 + }, + { + "epoch": 2.5410917816436713, + "grad_norm": 0.566923531902045, + "learning_rate": 6.915686667648619e-07, + "loss": 0.5867, + "step": 2119 + }, + { + "epoch": 2.542291541691662, + "grad_norm": 0.6492417746075824, + "learning_rate": 6.880286869136288e-07, + "loss": 0.5182, + "step": 2120 + }, + { + "epoch": 2.5434913017396523, + "grad_norm": 0.7956927113017503, + "learning_rate": 6.844971209484613e-07, + "loss": 0.5674, + "step": 2121 + }, + { + "epoch": 2.5446910617876424, + "grad_norm": 0.5641376870362562, + "learning_rate": 6.809739757604494e-07, + "loss": 0.5253, + "step": 2122 + }, + { + "epoch": 2.545890821835633, + "grad_norm": 0.5912812937401165, + "learning_rate": 6.774592582242567e-07, + "loss": 0.5383, + "step": 2123 + }, + { + "epoch": 2.5470905818836234, + "grad_norm": 0.5580922133198004, + "learning_rate": 6.739529751981e-07, + "loss": 0.5861, + "step": 2124 + }, + { + "epoch": 2.5482903419316134, + "grad_norm": 0.6298162645996003, + "learning_rate": 6.704551335237358e-07, + "loss": 0.5748, + "step": 2125 + }, + { + "epoch": 2.549490101979604, + "grad_norm": 0.5868891574360139, + "learning_rate": 6.669657400264512e-07, + "loss": 0.574, + "step": 2126 + }, + { + "epoch": 2.5506898620275944, + "grad_norm": 0.5976636824154793, + "learning_rate": 6.634848015150508e-07, + "loss": 0.545, + "step": 2127 + }, + { + "epoch": 2.551889622075585, + "grad_norm": 0.6020475860573156, + "learning_rate": 6.600123247818357e-07, + "loss": 0.5106, + "step": 2128 + }, + { + "epoch": 2.5530893821235754, + "grad_norm": 0.5902722858962282, + "learning_rate": 6.565483166025987e-07, + "loss": 0.5626, + "step": 2129 + }, + { + "epoch": 2.554289142171566, + "grad_norm": 0.5587928729210574, + "learning_rate": 6.530927837366113e-07, + "loss": 0.5722, + "step": 2130 + }, + { + "epoch": 2.555488902219556, + "grad_norm": 0.632512991280109, + "learning_rate": 6.496457329266003e-07, + "loss": 0.5234, + "step": 2131 + }, + { + "epoch": 2.5566886622675464, + "grad_norm": 0.6377743211690943, + "learning_rate": 6.462071708987467e-07, + "loss": 0.5435, + "step": 2132 + }, + { + "epoch": 2.557888422315537, + "grad_norm": 0.5842896842093335, + "learning_rate": 6.427771043626646e-07, + "loss": 0.5487, + "step": 2133 + }, + { + "epoch": 2.5590881823635274, + "grad_norm": 0.5957269315639774, + "learning_rate": 6.393555400113938e-07, + "loss": 0.5712, + "step": 2134 + }, + { + "epoch": 2.5602879424115175, + "grad_norm": 0.607664147978379, + "learning_rate": 6.359424845213819e-07, + "loss": 0.51, + "step": 2135 + }, + { + "epoch": 2.561487702459508, + "grad_norm": 0.6481620130889848, + "learning_rate": 6.325379445524732e-07, + "loss": 0.5201, + "step": 2136 + }, + { + "epoch": 2.5626874625074985, + "grad_norm": 0.5655635655291085, + "learning_rate": 6.291419267478965e-07, + "loss": 0.5335, + "step": 2137 + }, + { + "epoch": 2.563887222555489, + "grad_norm": 0.777957284332089, + "learning_rate": 6.257544377342523e-07, + "loss": 0.5303, + "step": 2138 + }, + { + "epoch": 2.5650869826034794, + "grad_norm": 0.62107869014827, + "learning_rate": 6.223754841214969e-07, + "loss": 0.5255, + "step": 2139 + }, + { + "epoch": 2.56628674265147, + "grad_norm": 0.6111004858053942, + "learning_rate": 6.190050725029328e-07, + "loss": 0.5165, + "step": 2140 + }, + { + "epoch": 2.56748650269946, + "grad_norm": 0.6142744373421746, + "learning_rate": 6.156432094551951e-07, + "loss": 0.593, + "step": 2141 + }, + { + "epoch": 2.5686862627474505, + "grad_norm": 0.6028771942191737, + "learning_rate": 6.122899015382372e-07, + "loss": 0.5702, + "step": 2142 + }, + { + "epoch": 2.569886022795441, + "grad_norm": 0.6055817514723558, + "learning_rate": 6.089451552953191e-07, + "loss": 0.5333, + "step": 2143 + }, + { + "epoch": 2.5710857828434315, + "grad_norm": 0.6190509492084894, + "learning_rate": 6.056089772529955e-07, + "loss": 0.5611, + "step": 2144 + }, + { + "epoch": 2.5722855428914215, + "grad_norm": 0.5863993150494622, + "learning_rate": 6.022813739211025e-07, + "loss": 0.5748, + "step": 2145 + }, + { + "epoch": 2.573485302939412, + "grad_norm": 0.6463470257981921, + "learning_rate": 5.989623517927395e-07, + "loss": 0.5035, + "step": 2146 + }, + { + "epoch": 2.5746850629874025, + "grad_norm": 0.5974658144499296, + "learning_rate": 5.956519173442704e-07, + "loss": 0.5744, + "step": 2147 + }, + { + "epoch": 2.575884823035393, + "grad_norm": 0.6047039800838729, + "learning_rate": 5.923500770352935e-07, + "loss": 0.5541, + "step": 2148 + }, + { + "epoch": 2.5770845830833835, + "grad_norm": 0.6412243636579218, + "learning_rate": 5.890568373086425e-07, + "loss": 0.5502, + "step": 2149 + }, + { + "epoch": 2.578284343131374, + "grad_norm": 0.6420558939025194, + "learning_rate": 5.857722045903674e-07, + "loss": 0.6135, + "step": 2150 + }, + { + "epoch": 2.579484103179364, + "grad_norm": 0.6692959800290453, + "learning_rate": 5.824961852897231e-07, + "loss": 0.5778, + "step": 2151 + }, + { + "epoch": 2.5806838632273545, + "grad_norm": 0.5791690455093836, + "learning_rate": 5.792287857991591e-07, + "loss": 0.6185, + "step": 2152 + }, + { + "epoch": 2.581883623275345, + "grad_norm": 0.5876943201180187, + "learning_rate": 5.759700124943029e-07, + "loss": 0.5751, + "step": 2153 + }, + { + "epoch": 2.583083383323335, + "grad_norm": 0.6354825100788556, + "learning_rate": 5.727198717339511e-07, + "loss": 0.5939, + "step": 2154 + }, + { + "epoch": 2.5842831433713256, + "grad_norm": 0.5648854282439252, + "learning_rate": 5.694783698600548e-07, + "loss": 0.5738, + "step": 2155 + }, + { + "epoch": 2.585482903419316, + "grad_norm": 0.5688007543004594, + "learning_rate": 5.662455131977102e-07, + "loss": 0.5758, + "step": 2156 + }, + { + "epoch": 2.5866826634673066, + "grad_norm": 0.5622931780417261, + "learning_rate": 5.63021308055141e-07, + "loss": 0.5811, + "step": 2157 + }, + { + "epoch": 2.587882423515297, + "grad_norm": 0.5936911421046386, + "learning_rate": 5.598057607236928e-07, + "loss": 0.5717, + "step": 2158 + }, + { + "epoch": 2.5890821835632876, + "grad_norm": 0.634008275839511, + "learning_rate": 5.565988774778153e-07, + "loss": 0.5215, + "step": 2159 + }, + { + "epoch": 2.5902819436112776, + "grad_norm": 0.6018185932926242, + "learning_rate": 5.534006645750523e-07, + "loss": 0.5197, + "step": 2160 + }, + { + "epoch": 2.591481703659268, + "grad_norm": 0.7005719436138388, + "learning_rate": 5.502111282560291e-07, + "loss": 0.5887, + "step": 2161 + }, + { + "epoch": 2.5926814637072586, + "grad_norm": 0.7301197459611951, + "learning_rate": 5.470302747444428e-07, + "loss": 0.6252, + "step": 2162 + }, + { + "epoch": 2.593881223755249, + "grad_norm": 0.6942438861370627, + "learning_rate": 5.438581102470419e-07, + "loss": 0.5182, + "step": 2163 + }, + { + "epoch": 2.595080983803239, + "grad_norm": 0.6917749064390649, + "learning_rate": 5.406946409536284e-07, + "loss": 0.6221, + "step": 2164 + }, + { + "epoch": 2.5962807438512296, + "grad_norm": 0.6215078882375504, + "learning_rate": 5.375398730370323e-07, + "loss": 0.5242, + "step": 2165 + }, + { + "epoch": 2.59748050389922, + "grad_norm": 0.6185351010861437, + "learning_rate": 5.343938126531034e-07, + "loss": 0.5276, + "step": 2166 + }, + { + "epoch": 2.5986802639472106, + "grad_norm": 0.6043673893011502, + "learning_rate": 5.312564659407054e-07, + "loss": 0.6079, + "step": 2167 + }, + { + "epoch": 2.599880023995201, + "grad_norm": 0.5734256754088662, + "learning_rate": 5.281278390216976e-07, + "loss": 0.5979, + "step": 2168 + }, + { + "epoch": 2.6010797840431916, + "grad_norm": 0.5800707816384436, + "learning_rate": 5.250079380009205e-07, + "loss": 0.568, + "step": 2169 + }, + { + "epoch": 2.6022795440911817, + "grad_norm": 0.5313291939600484, + "learning_rate": 5.218967689661914e-07, + "loss": 0.5976, + "step": 2170 + }, + { + "epoch": 2.603479304139172, + "grad_norm": 0.5736923527583822, + "learning_rate": 5.18794337988292e-07, + "loss": 0.5387, + "step": 2171 + }, + { + "epoch": 2.6046790641871627, + "grad_norm": 0.6999475757676541, + "learning_rate": 5.157006511209467e-07, + "loss": 0.4878, + "step": 2172 + }, + { + "epoch": 2.6058788242351527, + "grad_norm": 0.628131409078769, + "learning_rate": 5.126157144008209e-07, + "loss": 0.4561, + "step": 2173 + }, + { + "epoch": 2.607078584283143, + "grad_norm": 0.6090807924772929, + "learning_rate": 5.095395338475095e-07, + "loss": 0.547, + "step": 2174 + }, + { + "epoch": 2.6082783443311337, + "grad_norm": 0.6568840037108555, + "learning_rate": 5.064721154635155e-07, + "loss": 0.5179, + "step": 2175 + }, + { + "epoch": 2.609478104379124, + "grad_norm": 0.6077345432598468, + "learning_rate": 5.034134652342477e-07, + "loss": 0.5896, + "step": 2176 + }, + { + "epoch": 2.6106778644271147, + "grad_norm": 0.5683475107240232, + "learning_rate": 5.003635891280051e-07, + "loss": 0.577, + "step": 2177 + }, + { + "epoch": 2.611877624475105, + "grad_norm": 0.5940425103459605, + "learning_rate": 4.973224930959669e-07, + "loss": 0.5775, + "step": 2178 + }, + { + "epoch": 2.6130773845230952, + "grad_norm": 0.6807554600883522, + "learning_rate": 4.942901830721786e-07, + "loss": 0.5676, + "step": 2179 + }, + { + "epoch": 2.6142771445710857, + "grad_norm": 0.6501107217177522, + "learning_rate": 4.912666649735403e-07, + "loss": 0.5832, + "step": 2180 + }, + { + "epoch": 2.6154769046190762, + "grad_norm": 0.6529578897175722, + "learning_rate": 4.882519446998007e-07, + "loss": 0.5626, + "step": 2181 + }, + { + "epoch": 2.6166766646670667, + "grad_norm": 0.5592171901908932, + "learning_rate": 4.852460281335386e-07, + "loss": 0.5317, + "step": 2182 + }, + { + "epoch": 2.6178764247150568, + "grad_norm": 0.5899871633157431, + "learning_rate": 4.822489211401526e-07, + "loss": 0.5589, + "step": 2183 + }, + { + "epoch": 2.6190761847630473, + "grad_norm": 0.6203897260792866, + "learning_rate": 4.792606295678564e-07, + "loss": 0.5781, + "step": 2184 + }, + { + "epoch": 2.6202759448110378, + "grad_norm": 0.5954957072253819, + "learning_rate": 4.762811592476585e-07, + "loss": 0.5792, + "step": 2185 + }, + { + "epoch": 2.6214757048590283, + "grad_norm": 0.6842825105432413, + "learning_rate": 4.7331051599335377e-07, + "loss": 0.5621, + "step": 2186 + }, + { + "epoch": 2.6226754649070187, + "grad_norm": 0.5835272067634589, + "learning_rate": 4.7034870560151435e-07, + "loss": 0.607, + "step": 2187 + }, + { + "epoch": 2.6238752249550092, + "grad_norm": 0.6096228214681819, + "learning_rate": 4.673957338514812e-07, + "loss": 0.5736, + "step": 2188 + }, + { + "epoch": 2.6250749850029993, + "grad_norm": 0.566074085668047, + "learning_rate": 4.644516065053406e-07, + "loss": 0.591, + "step": 2189 + }, + { + "epoch": 2.62627474505099, + "grad_norm": 0.6278033549800017, + "learning_rate": 4.61516329307925e-07, + "loss": 0.6223, + "step": 2190 + }, + { + "epoch": 2.6274745050989803, + "grad_norm": 0.666495192770457, + "learning_rate": 4.585899079868006e-07, + "loss": 0.5843, + "step": 2191 + }, + { + "epoch": 2.6286742651469703, + "grad_norm": 0.6002593159128293, + "learning_rate": 4.5567234825224736e-07, + "loss": 0.5601, + "step": 2192 + }, + { + "epoch": 2.629874025194961, + "grad_norm": 0.6059102396663301, + "learning_rate": 4.52763655797257e-07, + "loss": 0.5704, + "step": 2193 + }, + { + "epoch": 2.6310737852429513, + "grad_norm": 0.5962329324448424, + "learning_rate": 4.498638362975194e-07, + "loss": 0.6028, + "step": 2194 + }, + { + "epoch": 2.632273545290942, + "grad_norm": 0.5800985716686567, + "learning_rate": 4.469728954114083e-07, + "loss": 0.573, + "step": 2195 + }, + { + "epoch": 2.6334733053389323, + "grad_norm": 0.550969607398562, + "learning_rate": 4.440908387799753e-07, + "loss": 0.5627, + "step": 2196 + }, + { + "epoch": 2.634673065386923, + "grad_norm": 0.5858661147935802, + "learning_rate": 4.41217672026934e-07, + "loss": 0.5717, + "step": 2197 + }, + { + "epoch": 2.635872825434913, + "grad_norm": 0.565563272849404, + "learning_rate": 4.3835340075865263e-07, + "loss": 0.567, + "step": 2198 + }, + { + "epoch": 2.6370725854829034, + "grad_norm": 0.7154679915472364, + "learning_rate": 4.354980305641415e-07, + "loss": 0.4806, + "step": 2199 + }, + { + "epoch": 2.638272345530894, + "grad_norm": 0.6277957077530519, + "learning_rate": 4.3265156701504274e-07, + "loss": 0.5637, + "step": 2200 + }, + { + "epoch": 2.6394721055788843, + "grad_norm": 0.5817270389510264, + "learning_rate": 4.298140156656178e-07, + "loss": 0.5409, + "step": 2201 + }, + { + "epoch": 2.6406718656268744, + "grad_norm": 0.5737173928522017, + "learning_rate": 4.269853820527403e-07, + "loss": 0.5886, + "step": 2202 + }, + { + "epoch": 2.641871625674865, + "grad_norm": 0.6314989133285192, + "learning_rate": 4.2416567169588086e-07, + "loss": 0.5606, + "step": 2203 + }, + { + "epoch": 2.6430713857228554, + "grad_norm": 0.6795554928383936, + "learning_rate": 4.213548900970965e-07, + "loss": 0.4729, + "step": 2204 + }, + { + "epoch": 2.644271145770846, + "grad_norm": 0.6145336139358205, + "learning_rate": 4.185530427410267e-07, + "loss": 0.6001, + "step": 2205 + }, + { + "epoch": 2.6454709058188364, + "grad_norm": 0.5157672326801428, + "learning_rate": 4.157601350948748e-07, + "loss": 0.5769, + "step": 2206 + }, + { + "epoch": 2.646670665866827, + "grad_norm": 0.5875060652510343, + "learning_rate": 4.1297617260839753e-07, + "loss": 0.5676, + "step": 2207 + }, + { + "epoch": 2.647870425914817, + "grad_norm": 0.7477248765689066, + "learning_rate": 4.102011607139039e-07, + "loss": 0.5785, + "step": 2208 + }, + { + "epoch": 2.6490701859628074, + "grad_norm": 0.6293565517750546, + "learning_rate": 4.074351048262304e-07, + "loss": 0.5925, + "step": 2209 + }, + { + "epoch": 2.650269946010798, + "grad_norm": 0.5569274950930724, + "learning_rate": 4.046780103427422e-07, + "loss": 0.6122, + "step": 2210 + }, + { + "epoch": 2.6514697060587884, + "grad_norm": 0.6226911550571225, + "learning_rate": 4.0192988264331765e-07, + "loss": 0.5728, + "step": 2211 + }, + { + "epoch": 2.6526694661067785, + "grad_norm": 0.7467205426182734, + "learning_rate": 3.9919072709033724e-07, + "loss": 0.5054, + "step": 2212 + }, + { + "epoch": 2.653869226154769, + "grad_norm": 0.5628270181417315, + "learning_rate": 3.9646054902867495e-07, + "loss": 0.5441, + "step": 2213 + }, + { + "epoch": 2.6550689862027594, + "grad_norm": 0.691376073275601, + "learning_rate": 3.937393537856871e-07, + "loss": 0.5514, + "step": 2214 + }, + { + "epoch": 2.65626874625075, + "grad_norm": 0.6045653367982666, + "learning_rate": 3.9102714667120177e-07, + "loss": 0.6203, + "step": 2215 + }, + { + "epoch": 2.6574685062987404, + "grad_norm": 0.6399541518504948, + "learning_rate": 3.8832393297750905e-07, + "loss": 0.533, + "step": 2216 + }, + { + "epoch": 2.658668266346731, + "grad_norm": 0.5815400951783926, + "learning_rate": 3.8562971797934947e-07, + "loss": 0.5676, + "step": 2217 + }, + { + "epoch": 2.659868026394721, + "grad_norm": 0.6824231088462228, + "learning_rate": 3.8294450693390496e-07, + "loss": 0.5527, + "step": 2218 + }, + { + "epoch": 2.6610677864427115, + "grad_norm": 0.630113655846623, + "learning_rate": 3.8026830508078873e-07, + "loss": 0.5274, + "step": 2219 + }, + { + "epoch": 2.662267546490702, + "grad_norm": 0.5548091593081005, + "learning_rate": 3.776011176420341e-07, + "loss": 0.5691, + "step": 2220 + }, + { + "epoch": 2.663467306538692, + "grad_norm": 0.7411474172071046, + "learning_rate": 3.749429498220841e-07, + "loss": 0.5111, + "step": 2221 + }, + { + "epoch": 2.6646670665866825, + "grad_norm": 0.5731552377828735, + "learning_rate": 3.722938068077825e-07, + "loss": 0.6043, + "step": 2222 + }, + { + "epoch": 2.665866826634673, + "grad_norm": 0.5699287440850611, + "learning_rate": 3.6965369376836427e-07, + "loss": 0.5435, + "step": 2223 + }, + { + "epoch": 2.6670665866826635, + "grad_norm": 0.6513608062882078, + "learning_rate": 3.670226158554396e-07, + "loss": 0.499, + "step": 2224 + }, + { + "epoch": 2.668266346730654, + "grad_norm": 0.575845986646372, + "learning_rate": 3.6440057820299577e-07, + "loss": 0.5676, + "step": 2225 + }, + { + "epoch": 2.6694661067786445, + "grad_norm": 0.5656523199158348, + "learning_rate": 3.617875859273756e-07, + "loss": 0.5739, + "step": 2226 + }, + { + "epoch": 2.6706658668266345, + "grad_norm": 0.5928884294996334, + "learning_rate": 3.5918364412727004e-07, + "loss": 0.5755, + "step": 2227 + }, + { + "epoch": 2.671865626874625, + "grad_norm": 0.6500031397126891, + "learning_rate": 3.565887578837152e-07, + "loss": 0.5345, + "step": 2228 + }, + { + "epoch": 2.6730653869226155, + "grad_norm": 0.6618145889563011, + "learning_rate": 3.5400293226007296e-07, + "loss": 0.473, + "step": 2229 + }, + { + "epoch": 2.674265146970606, + "grad_norm": 0.5874887604233205, + "learning_rate": 3.514261723020268e-07, + "loss": 0.5536, + "step": 2230 + }, + { + "epoch": 2.675464907018596, + "grad_norm": 0.7232082935107603, + "learning_rate": 3.4885848303756963e-07, + "loss": 0.4964, + "step": 2231 + }, + { + "epoch": 2.6766646670665866, + "grad_norm": 0.6363229555016645, + "learning_rate": 3.462998694769976e-07, + "loss": 0.5929, + "step": 2232 + }, + { + "epoch": 2.677864427114577, + "grad_norm": 0.5800160738933541, + "learning_rate": 3.4375033661289413e-07, + "loss": 0.5662, + "step": 2233 + }, + { + "epoch": 2.6790641871625676, + "grad_norm": 0.8804672806059494, + "learning_rate": 3.412098894201255e-07, + "loss": 0.6181, + "step": 2234 + }, + { + "epoch": 2.680263947210558, + "grad_norm": 0.524971145657382, + "learning_rate": 3.3867853285582894e-07, + "loss": 0.5407, + "step": 2235 + }, + { + "epoch": 2.6814637072585485, + "grad_norm": 0.6100957559110444, + "learning_rate": 3.361562718594036e-07, + "loss": 0.504, + "step": 2236 + }, + { + "epoch": 2.6826634673065386, + "grad_norm": 0.5354284577217275, + "learning_rate": 3.336431113524996e-07, + "loss": 0.556, + "step": 2237 + }, + { + "epoch": 2.683863227354529, + "grad_norm": 0.5678265948723596, + "learning_rate": 3.311390562390099e-07, + "loss": 0.6007, + "step": 2238 + }, + { + "epoch": 2.6850629874025196, + "grad_norm": 0.685672769832599, + "learning_rate": 3.2864411140506094e-07, + "loss": 0.6002, + "step": 2239 + }, + { + "epoch": 2.6862627474505096, + "grad_norm": 0.6067651086831962, + "learning_rate": 3.2615828171900234e-07, + "loss": 0.5837, + "step": 2240 + }, + { + "epoch": 2.6874625074985, + "grad_norm": 0.5540343155655241, + "learning_rate": 3.2368157203139404e-07, + "loss": 0.6169, + "step": 2241 + }, + { + "epoch": 2.6886622675464906, + "grad_norm": 0.5243216125695884, + "learning_rate": 3.212139871750064e-07, + "loss": 0.5947, + "step": 2242 + }, + { + "epoch": 2.689862027594481, + "grad_norm": 0.5166214552324191, + "learning_rate": 3.1875553196480056e-07, + "loss": 0.5611, + "step": 2243 + }, + { + "epoch": 2.6910617876424716, + "grad_norm": 0.5380333721802236, + "learning_rate": 3.1630621119792203e-07, + "loss": 0.4938, + "step": 2244 + }, + { + "epoch": 2.692261547690462, + "grad_norm": 0.5420827111585474, + "learning_rate": 3.138660296536966e-07, + "loss": 0.5731, + "step": 2245 + }, + { + "epoch": 2.693461307738452, + "grad_norm": 0.6244285648915006, + "learning_rate": 3.114349920936144e-07, + "loss": 0.5156, + "step": 2246 + }, + { + "epoch": 2.6946610677864427, + "grad_norm": 0.6573495030556705, + "learning_rate": 3.090131032613225e-07, + "loss": 0.547, + "step": 2247 + }, + { + "epoch": 2.695860827834433, + "grad_norm": 0.5442357185305635, + "learning_rate": 3.066003678826152e-07, + "loss": 0.5845, + "step": 2248 + }, + { + "epoch": 2.6970605878824236, + "grad_norm": 0.544062706584872, + "learning_rate": 3.041967906654314e-07, + "loss": 0.5996, + "step": 2249 + }, + { + "epoch": 2.6982603479304137, + "grad_norm": 0.9578707468901981, + "learning_rate": 3.018023762998329e-07, + "loss": 0.5733, + "step": 2250 + }, + { + "epoch": 2.699460107978404, + "grad_norm": 0.5953350417695669, + "learning_rate": 2.994171294580045e-07, + "loss": 0.5834, + "step": 2251 + }, + { + "epoch": 2.7006598680263947, + "grad_norm": 0.5622708003061522, + "learning_rate": 2.9704105479424705e-07, + "loss": 0.5551, + "step": 2252 + }, + { + "epoch": 2.701859628074385, + "grad_norm": 0.5305691541375912, + "learning_rate": 2.946741569449563e-07, + "loss": 0.5684, + "step": 2253 + }, + { + "epoch": 2.7030593881223757, + "grad_norm": 0.5249312174708018, + "learning_rate": 2.923164405286266e-07, + "loss": 0.5662, + "step": 2254 + }, + { + "epoch": 2.704259148170366, + "grad_norm": 0.6204603022129775, + "learning_rate": 2.8996791014583523e-07, + "loss": 0.613, + "step": 2255 + }, + { + "epoch": 2.705458908218356, + "grad_norm": 0.6978090453715121, + "learning_rate": 2.8762857037923475e-07, + "loss": 0.5123, + "step": 2256 + }, + { + "epoch": 2.7066586682663467, + "grad_norm": 0.5800125254374642, + "learning_rate": 2.852984257935448e-07, + "loss": 0.5672, + "step": 2257 + }, + { + "epoch": 2.707858428314337, + "grad_norm": 1.2967018180608374, + "learning_rate": 2.8297748093554125e-07, + "loss": 0.6101, + "step": 2258 + }, + { + "epoch": 2.7090581883623273, + "grad_norm": 0.5642756332587243, + "learning_rate": 2.8066574033405014e-07, + "loss": 0.6105, + "step": 2259 + }, + { + "epoch": 2.7102579484103178, + "grad_norm": 0.7239876886430663, + "learning_rate": 2.783632084999366e-07, + "loss": 0.5856, + "step": 2260 + }, + { + "epoch": 2.7114577084583082, + "grad_norm": 0.5860625340774893, + "learning_rate": 2.7606988992609773e-07, + "loss": 0.6328, + "step": 2261 + }, + { + "epoch": 2.7126574685062987, + "grad_norm": 0.5655581547072107, + "learning_rate": 2.737857890874518e-07, + "loss": 0.5996, + "step": 2262 + }, + { + "epoch": 2.7138572285542892, + "grad_norm": 16.76224385771302, + "learning_rate": 2.7151091044093083e-07, + "loss": 0.9452, + "step": 2263 + }, + { + "epoch": 2.7150569886022797, + "grad_norm": 0.6382218253421764, + "learning_rate": 2.6924525842547286e-07, + "loss": 0.5567, + "step": 2264 + }, + { + "epoch": 2.71625674865027, + "grad_norm": 0.6517148416150133, + "learning_rate": 2.669888374620094e-07, + "loss": 0.5537, + "step": 2265 + }, + { + "epoch": 2.7174565086982603, + "grad_norm": 0.5644663327138808, + "learning_rate": 2.6474165195346346e-07, + "loss": 0.5069, + "step": 2266 + }, + { + "epoch": 2.7186562687462508, + "grad_norm": 0.6619100893862593, + "learning_rate": 2.6250370628473465e-07, + "loss": 0.5838, + "step": 2267 + }, + { + "epoch": 2.7198560287942413, + "grad_norm": 0.6201250184997715, + "learning_rate": 2.6027500482269153e-07, + "loss": 0.5597, + "step": 2268 + }, + { + "epoch": 2.7210557888422313, + "grad_norm": 0.6363157832150368, + "learning_rate": 2.580555519161698e-07, + "loss": 0.5632, + "step": 2269 + }, + { + "epoch": 2.722255548890222, + "grad_norm": 0.6064969590377077, + "learning_rate": 2.5584535189595193e-07, + "loss": 0.5688, + "step": 2270 + }, + { + "epoch": 2.7234553089382123, + "grad_norm": 0.591823448277897, + "learning_rate": 2.536444090747703e-07, + "loss": 0.5613, + "step": 2271 + }, + { + "epoch": 2.724655068986203, + "grad_norm": 0.6261058741329462, + "learning_rate": 2.514527277472917e-07, + "loss": 0.5932, + "step": 2272 + }, + { + "epoch": 2.7258548290341933, + "grad_norm": 0.5497707289720722, + "learning_rate": 2.4927031219011133e-07, + "loss": 0.5965, + "step": 2273 + }, + { + "epoch": 2.727054589082184, + "grad_norm": 0.6196870230550074, + "learning_rate": 2.470971666617455e-07, + "loss": 0.5696, + "step": 2274 + }, + { + "epoch": 2.728254349130174, + "grad_norm": 0.6006894934472979, + "learning_rate": 2.449332954026201e-07, + "loss": 0.5852, + "step": 2275 + }, + { + "epoch": 2.7294541091781643, + "grad_norm": 0.5761744307987052, + "learning_rate": 2.427787026350659e-07, + "loss": 0.5648, + "step": 2276 + }, + { + "epoch": 2.730653869226155, + "grad_norm": 0.6287395982883524, + "learning_rate": 2.4063339256330695e-07, + "loss": 0.5975, + "step": 2277 + }, + { + "epoch": 2.7318536292741453, + "grad_norm": 0.6167967352976029, + "learning_rate": 2.3849736937345657e-07, + "loss": 0.5998, + "step": 2278 + }, + { + "epoch": 2.7330533893221354, + "grad_norm": 0.5976809733384075, + "learning_rate": 2.363706372335045e-07, + "loss": 0.6361, + "step": 2279 + }, + { + "epoch": 2.734253149370126, + "grad_norm": 0.5309139536680249, + "learning_rate": 2.3425320029331177e-07, + "loss": 0.6062, + "step": 2280 + }, + { + "epoch": 2.7354529094181164, + "grad_norm": 0.5643678683098926, + "learning_rate": 2.3214506268460224e-07, + "loss": 0.5474, + "step": 2281 + }, + { + "epoch": 2.736652669466107, + "grad_norm": 0.6645838525987101, + "learning_rate": 2.3004622852095372e-07, + "loss": 0.4881, + "step": 2282 + }, + { + "epoch": 2.7378524295140974, + "grad_norm": 0.5958003697026015, + "learning_rate": 2.279567018977902e-07, + "loss": 0.5989, + "step": 2283 + }, + { + "epoch": 2.739052189562088, + "grad_norm": 0.6578014953406578, + "learning_rate": 2.2587648689237461e-07, + "loss": 0.6215, + "step": 2284 + }, + { + "epoch": 2.740251949610078, + "grad_norm": 0.6076524618518147, + "learning_rate": 2.2380558756379788e-07, + "loss": 0.5891, + "step": 2285 + }, + { + "epoch": 2.7414517096580684, + "grad_norm": 0.6184516057759724, + "learning_rate": 2.2174400795297814e-07, + "loss": 0.5446, + "step": 2286 + }, + { + "epoch": 2.742651469706059, + "grad_norm": 0.5844299039413995, + "learning_rate": 2.196917520826447e-07, + "loss": 0.5247, + "step": 2287 + }, + { + "epoch": 2.743851229754049, + "grad_norm": 0.5706212187640874, + "learning_rate": 2.1764882395733266e-07, + "loss": 0.5421, + "step": 2288 + }, + { + "epoch": 2.7450509898020394, + "grad_norm": 0.63442375775085, + "learning_rate": 2.1561522756337773e-07, + "loss": 0.6292, + "step": 2289 + }, + { + "epoch": 2.74625074985003, + "grad_norm": 0.6434067374513008, + "learning_rate": 2.1359096686890845e-07, + "loss": 0.5952, + "step": 2290 + }, + { + "epoch": 2.7474505098980204, + "grad_norm": 0.6033121054333428, + "learning_rate": 2.1157604582383308e-07, + "loss": 0.5912, + "step": 2291 + }, + { + "epoch": 2.748650269946011, + "grad_norm": 0.6444524526744027, + "learning_rate": 2.0957046835983764e-07, + "loss": 0.5709, + "step": 2292 + }, + { + "epoch": 2.7498500299940014, + "grad_norm": 0.6249267289836973, + "learning_rate": 2.0757423839037728e-07, + "loss": 0.5662, + "step": 2293 + }, + { + "epoch": 2.7510497900419915, + "grad_norm": 0.6596582768889766, + "learning_rate": 2.0558735981066445e-07, + "loss": 0.566, + "step": 2294 + }, + { + "epoch": 2.752249550089982, + "grad_norm": 0.6574313280601249, + "learning_rate": 2.0360983649766685e-07, + "loss": 0.6004, + "step": 2295 + }, + { + "epoch": 2.7534493101379725, + "grad_norm": 0.6199832270141975, + "learning_rate": 2.0164167231009612e-07, + "loss": 0.5257, + "step": 2296 + }, + { + "epoch": 2.754649070185963, + "grad_norm": 0.6445693406967722, + "learning_rate": 1.9968287108840246e-07, + "loss": 0.5262, + "step": 2297 + }, + { + "epoch": 2.755848830233953, + "grad_norm": 0.6079282676234945, + "learning_rate": 1.9773343665476563e-07, + "loss": 0.545, + "step": 2298 + }, + { + "epoch": 2.7570485902819435, + "grad_norm": 0.6526380557697038, + "learning_rate": 1.957933728130884e-07, + "loss": 0.5556, + "step": 2299 + }, + { + "epoch": 2.758248350329934, + "grad_norm": 0.6277477478771757, + "learning_rate": 1.9386268334898817e-07, + "loss": 0.5109, + "step": 2300 + }, + { + "epoch": 2.7594481103779245, + "grad_norm": 0.9598483982453556, + "learning_rate": 1.9194137202979246e-07, + "loss": 0.5581, + "step": 2301 + }, + { + "epoch": 2.760647870425915, + "grad_norm": 0.5113688585036608, + "learning_rate": 1.9002944260452517e-07, + "loss": 0.5743, + "step": 2302 + }, + { + "epoch": 2.7618476304739055, + "grad_norm": 0.529723461042736, + "learning_rate": 1.8812689880390812e-07, + "loss": 0.6466, + "step": 2303 + }, + { + "epoch": 2.7630473905218955, + "grad_norm": 0.5898958642033482, + "learning_rate": 1.862337443403467e-07, + "loss": 0.5929, + "step": 2304 + }, + { + "epoch": 2.764247150569886, + "grad_norm": 0.6898063775649348, + "learning_rate": 1.8434998290792373e-07, + "loss": 0.5399, + "step": 2305 + }, + { + "epoch": 2.7654469106178765, + "grad_norm": 0.5779799023026893, + "learning_rate": 1.8247561818239723e-07, + "loss": 0.5641, + "step": 2306 + }, + { + "epoch": 2.7666466706658666, + "grad_norm": 0.6202471124125889, + "learning_rate": 1.806106538211866e-07, + "loss": 0.5772, + "step": 2307 + }, + { + "epoch": 2.767846430713857, + "grad_norm": 0.5851019466251818, + "learning_rate": 1.7875509346336917e-07, + "loss": 0.5765, + "step": 2308 + }, + { + "epoch": 2.7690461907618475, + "grad_norm": 0.6024107261524307, + "learning_rate": 1.7690894072967146e-07, + "loss": 0.581, + "step": 2309 + }, + { + "epoch": 2.770245950809838, + "grad_norm": 0.5510832385681983, + "learning_rate": 1.7507219922246688e-07, + "loss": 0.5625, + "step": 2310 + }, + { + "epoch": 2.7714457108578285, + "grad_norm": 0.589613404873875, + "learning_rate": 1.7324487252575907e-07, + "loss": 0.5761, + "step": 2311 + }, + { + "epoch": 2.772645470905819, + "grad_norm": 0.5682011883468383, + "learning_rate": 1.7142696420518523e-07, + "loss": 0.5825, + "step": 2312 + }, + { + "epoch": 2.773845230953809, + "grad_norm": 0.5897501239285461, + "learning_rate": 1.6961847780800346e-07, + "loss": 0.607, + "step": 2313 + }, + { + "epoch": 2.7750449910017996, + "grad_norm": 0.4809755268612014, + "learning_rate": 1.678194168630859e-07, + "loss": 0.5588, + "step": 2314 + }, + { + "epoch": 2.77624475104979, + "grad_norm": 0.6118928390334075, + "learning_rate": 1.6602978488091336e-07, + "loss": 0.6085, + "step": 2315 + }, + { + "epoch": 2.7774445110977806, + "grad_norm": 0.6209341488451864, + "learning_rate": 1.642495853535686e-07, + "loss": 0.5666, + "step": 2316 + }, + { + "epoch": 2.7786442711457706, + "grad_norm": 0.586087352341755, + "learning_rate": 1.6247882175472906e-07, + "loss": 0.5868, + "step": 2317 + }, + { + "epoch": 2.779844031193761, + "grad_norm": 0.5936490028024038, + "learning_rate": 1.6071749753965914e-07, + "loss": 0.5987, + "step": 2318 + }, + { + "epoch": 2.7810437912417516, + "grad_norm": 0.5932244561766362, + "learning_rate": 1.5896561614520578e-07, + "loss": 0.5096, + "step": 2319 + }, + { + "epoch": 2.782243551289742, + "grad_norm": 0.6959133173132499, + "learning_rate": 1.5722318098978783e-07, + "loss": 0.5321, + "step": 2320 + }, + { + "epoch": 2.7834433113377326, + "grad_norm": 0.5360416449049131, + "learning_rate": 1.5549019547339506e-07, + "loss": 0.5818, + "step": 2321 + }, + { + "epoch": 2.784643071385723, + "grad_norm": 0.5863954015074843, + "learning_rate": 1.537666629775747e-07, + "loss": 0.5521, + "step": 2322 + }, + { + "epoch": 2.785842831433713, + "grad_norm": 0.4947675242004059, + "learning_rate": 1.5205258686543212e-07, + "loss": 0.5907, + "step": 2323 + }, + { + "epoch": 2.7870425914817036, + "grad_norm": 0.5860510775749931, + "learning_rate": 1.5034797048161742e-07, + "loss": 0.5598, + "step": 2324 + }, + { + "epoch": 2.788242351529694, + "grad_norm": 0.6242552752538505, + "learning_rate": 1.4865281715232437e-07, + "loss": 0.6003, + "step": 2325 + }, + { + "epoch": 2.7894421115776846, + "grad_norm": 0.5525242477859322, + "learning_rate": 1.4696713018527932e-07, + "loss": 0.5539, + "step": 2326 + }, + { + "epoch": 2.7906418716256747, + "grad_norm": 0.5416778071383951, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.5364, + "step": 2327 + }, + { + "epoch": 2.791841631673665, + "grad_norm": 0.5563327851110774, + "learning_rate": 1.4362416847648387e-07, + "loss": 0.6012, + "step": 2328 + }, + { + "epoch": 2.7930413917216557, + "grad_norm": 0.6265224455532714, + "learning_rate": 1.419669002578039e-07, + "loss": 0.5578, + "step": 2329 + }, + { + "epoch": 2.794241151769646, + "grad_norm": 0.6064562054059573, + "learning_rate": 1.4031911144750443e-07, + "loss": 0.6092, + "step": 2330 + }, + { + "epoch": 2.7954409118176367, + "grad_norm": 0.6221941054992836, + "learning_rate": 1.386808052608918e-07, + "loss": 0.5907, + "step": 2331 + }, + { + "epoch": 2.796640671865627, + "grad_norm": 0.6617135526417072, + "learning_rate": 1.3705198489476712e-07, + "loss": 0.6039, + "step": 2332 + }, + { + "epoch": 2.797840431913617, + "grad_norm": 0.5885914853696609, + "learning_rate": 1.3543265352742418e-07, + "loss": 0.5131, + "step": 2333 + }, + { + "epoch": 2.7990401919616077, + "grad_norm": 0.5369215495584861, + "learning_rate": 1.338228143186404e-07, + "loss": 0.5668, + "step": 2334 + }, + { + "epoch": 2.800239952009598, + "grad_norm": 0.7015199653828252, + "learning_rate": 1.3222247040967195e-07, + "loss": 0.539, + "step": 2335 + }, + { + "epoch": 2.8014397120575882, + "grad_norm": 0.6373537615543214, + "learning_rate": 1.3063162492324533e-07, + "loss": 0.6098, + "step": 2336 + }, + { + "epoch": 2.8026394721055787, + "grad_norm": 0.6182980392879887, + "learning_rate": 1.2905028096355465e-07, + "loss": 0.5617, + "step": 2337 + }, + { + "epoch": 2.8038392321535692, + "grad_norm": 0.656616322139744, + "learning_rate": 1.2747844161625277e-07, + "loss": 0.5108, + "step": 2338 + }, + { + "epoch": 2.8050389922015597, + "grad_norm": 0.6934710935426451, + "learning_rate": 1.2591610994844682e-07, + "loss": 0.6423, + "step": 2339 + }, + { + "epoch": 2.80623875224955, + "grad_norm": 0.6103801170457688, + "learning_rate": 1.2436328900869098e-07, + "loss": 0.5763, + "step": 2340 + }, + { + "epoch": 2.8074385122975407, + "grad_norm": 0.5358157555678392, + "learning_rate": 1.228199818269826e-07, + "loss": 0.6331, + "step": 2341 + }, + { + "epoch": 2.8086382723455308, + "grad_norm": 0.5499433489191992, + "learning_rate": 1.2128619141475394e-07, + "loss": 0.5712, + "step": 2342 + }, + { + "epoch": 2.8098380323935213, + "grad_norm": 0.7654236897587715, + "learning_rate": 1.1976192076486703e-07, + "loss": 0.5075, + "step": 2343 + }, + { + "epoch": 2.8110377924415118, + "grad_norm": 0.5805466981639074, + "learning_rate": 1.1824717285160992e-07, + "loss": 0.5489, + "step": 2344 + }, + { + "epoch": 2.8122375524895022, + "grad_norm": 0.5859214634697024, + "learning_rate": 1.1674195063068771e-07, + "loss": 0.5515, + "step": 2345 + }, + { + "epoch": 2.8134373125374923, + "grad_norm": 0.615826387549419, + "learning_rate": 1.1524625703921654e-07, + "loss": 0.5766, + "step": 2346 + }, + { + "epoch": 2.814637072585483, + "grad_norm": 0.6412381130303643, + "learning_rate": 1.137600949957235e-07, + "loss": 0.6163, + "step": 2347 + }, + { + "epoch": 2.8158368326334733, + "grad_norm": 0.5747136469427545, + "learning_rate": 1.1228346740013385e-07, + "loss": 0.6252, + "step": 2348 + }, + { + "epoch": 2.817036592681464, + "grad_norm": 0.6039502257496154, + "learning_rate": 1.1081637713376892e-07, + "loss": 0.5536, + "step": 2349 + }, + { + "epoch": 2.8182363527294543, + "grad_norm": 0.5639887024242807, + "learning_rate": 1.0935882705933987e-07, + "loss": 0.5199, + "step": 2350 + }, + { + "epoch": 2.8194361127774448, + "grad_norm": 0.6164265957182835, + "learning_rate": 1.0791082002094444e-07, + "loss": 0.5222, + "step": 2351 + }, + { + "epoch": 2.820635872825435, + "grad_norm": 0.5936091059699564, + "learning_rate": 1.0647235884405638e-07, + "loss": 0.6108, + "step": 2352 + }, + { + "epoch": 2.8218356328734253, + "grad_norm": 0.5932654147724347, + "learning_rate": 1.0504344633552265e-07, + "loss": 0.5337, + "step": 2353 + }, + { + "epoch": 2.823035392921416, + "grad_norm": 0.5184278300606726, + "learning_rate": 1.0362408528356127e-07, + "loss": 0.5649, + "step": 2354 + }, + { + "epoch": 2.824235152969406, + "grad_norm": 0.5589941396527119, + "learning_rate": 1.0221427845774956e-07, + "loss": 0.5803, + "step": 2355 + }, + { + "epoch": 2.8254349130173964, + "grad_norm": 0.671790775421853, + "learning_rate": 1.0081402860902257e-07, + "loss": 0.5729, + "step": 2356 + }, + { + "epoch": 2.826634673065387, + "grad_norm": 0.5891561264062914, + "learning_rate": 9.942333846966745e-08, + "loss": 0.6147, + "step": 2357 + }, + { + "epoch": 2.8278344331133773, + "grad_norm": 0.654399315559274, + "learning_rate": 9.804221075331688e-08, + "loss": 0.5565, + "step": 2358 + }, + { + "epoch": 2.829034193161368, + "grad_norm": 0.5758731263307098, + "learning_rate": 9.66706481549462e-08, + "loss": 0.5397, + "step": 2359 + }, + { + "epoch": 2.8302339532093583, + "grad_norm": 0.7214093965701747, + "learning_rate": 9.530865335086403e-08, + "loss": 0.5648, + "step": 2360 + }, + { + "epoch": 2.8314337132573484, + "grad_norm": 0.6203261667989486, + "learning_rate": 9.395622899871171e-08, + "loss": 0.5816, + "step": 2361 + }, + { + "epoch": 2.832633473305339, + "grad_norm": 0.5976424430809402, + "learning_rate": 9.261337773745437e-08, + "loss": 0.5973, + "step": 2362 + }, + { + "epoch": 2.8338332333533294, + "grad_norm": 0.5973645134564878, + "learning_rate": 9.128010218737771e-08, + "loss": 0.5933, + "step": 2363 + }, + { + "epoch": 2.83503299340132, + "grad_norm": 0.6093648571259452, + "learning_rate": 8.99564049500834e-08, + "loss": 0.6046, + "step": 2364 + }, + { + "epoch": 2.83623275344931, + "grad_norm": 0.6355614257808114, + "learning_rate": 8.8642288608482e-08, + "loss": 0.5631, + "step": 2365 + }, + { + "epoch": 2.8374325134973004, + "grad_norm": 0.6078813029467055, + "learning_rate": 8.733775572678849e-08, + "loss": 0.5822, + "step": 2366 + }, + { + "epoch": 2.838632273545291, + "grad_norm": 0.6311747353130119, + "learning_rate": 8.604280885051996e-08, + "loss": 0.5456, + "step": 2367 + }, + { + "epoch": 2.8398320335932814, + "grad_norm": 0.6648083150307312, + "learning_rate": 8.475745050648687e-08, + "loss": 0.5614, + "step": 2368 + }, + { + "epoch": 2.841031793641272, + "grad_norm": 0.5762667824699731, + "learning_rate": 8.34816832027896e-08, + "loss": 0.6001, + "step": 2369 + }, + { + "epoch": 2.8422315536892624, + "grad_norm": 0.5498158183758537, + "learning_rate": 8.221550942881406e-08, + "loss": 0.5935, + "step": 2370 + }, + { + "epoch": 2.8434313137372524, + "grad_norm": 0.6219391351167715, + "learning_rate": 8.095893165522728e-08, + "loss": 0.5799, + "step": 2371 + }, + { + "epoch": 2.844631073785243, + "grad_norm": 0.6413960554893885, + "learning_rate": 7.971195233397011e-08, + "loss": 0.5447, + "step": 2372 + }, + { + "epoch": 2.8458308338332334, + "grad_norm": 0.5827133421232401, + "learning_rate": 7.847457389825507e-08, + "loss": 0.5537, + "step": 2373 + }, + { + "epoch": 2.8470305938812235, + "grad_norm": 0.6326857277662814, + "learning_rate": 7.724679876256024e-08, + "loss": 0.5823, + "step": 2374 + }, + { + "epoch": 2.848230353929214, + "grad_norm": 0.6354885512998139, + "learning_rate": 7.602862932262589e-08, + "loss": 0.5863, + "step": 2375 + }, + { + "epoch": 2.8494301139772045, + "grad_norm": 0.6748462246435095, + "learning_rate": 7.482006795544783e-08, + "loss": 0.5455, + "step": 2376 + }, + { + "epoch": 2.850629874025195, + "grad_norm": 0.5673661368960419, + "learning_rate": 7.362111701927354e-08, + "loss": 0.5251, + "step": 2377 + }, + { + "epoch": 2.8518296340731855, + "grad_norm": 0.5556661562224987, + "learning_rate": 7.243177885359831e-08, + "loss": 0.5417, + "step": 2378 + }, + { + "epoch": 2.853029394121176, + "grad_norm": 0.6581598350545725, + "learning_rate": 7.125205577916072e-08, + "loss": 0.5949, + "step": 2379 + }, + { + "epoch": 2.854229154169166, + "grad_norm": 0.5916519804123358, + "learning_rate": 7.008195009793717e-08, + "loss": 0.6273, + "step": 2380 + }, + { + "epoch": 2.8554289142171565, + "grad_norm": 0.6805425376195103, + "learning_rate": 6.892146409313738e-08, + "loss": 0.5406, + "step": 2381 + }, + { + "epoch": 2.856628674265147, + "grad_norm": 0.5976912878047412, + "learning_rate": 6.77706000292e-08, + "loss": 0.5711, + "step": 2382 + }, + { + "epoch": 2.8578284343131375, + "grad_norm": 0.5729419043066288, + "learning_rate": 6.662936015178978e-08, + "loss": 0.6223, + "step": 2383 + }, + { + "epoch": 2.8590281943611275, + "grad_norm": 0.6842825240551157, + "learning_rate": 6.549774668779097e-08, + "loss": 0.5314, + "step": 2384 + }, + { + "epoch": 2.860227954409118, + "grad_norm": 0.5627449880625564, + "learning_rate": 6.437576184530447e-08, + "loss": 0.5852, + "step": 2385 + }, + { + "epoch": 2.8614277144571085, + "grad_norm": 0.6166357064036266, + "learning_rate": 6.326340781364238e-08, + "loss": 0.5233, + "step": 2386 + }, + { + "epoch": 2.862627474505099, + "grad_norm": 0.6309257685754714, + "learning_rate": 6.21606867633251e-08, + "loss": 0.5371, + "step": 2387 + }, + { + "epoch": 2.8638272345530895, + "grad_norm": 0.5931088942775689, + "learning_rate": 6.106760084607589e-08, + "loss": 0.5298, + "step": 2388 + }, + { + "epoch": 2.86502699460108, + "grad_norm": 0.6740502744496198, + "learning_rate": 5.998415219481746e-08, + "loss": 0.5738, + "step": 2389 + }, + { + "epoch": 2.86622675464907, + "grad_norm": 0.5451730412062963, + "learning_rate": 5.8910342923666486e-08, + "loss": 0.5101, + "step": 2390 + }, + { + "epoch": 2.8674265146970606, + "grad_norm": 0.5889645164578342, + "learning_rate": 5.784617512793134e-08, + "loss": 0.5556, + "step": 2391 + }, + { + "epoch": 2.868626274745051, + "grad_norm": 0.5443396605516243, + "learning_rate": 5.679165088410765e-08, + "loss": 0.6002, + "step": 2392 + }, + { + "epoch": 2.8698260347930415, + "grad_norm": 0.5962558841554856, + "learning_rate": 5.5746772249872797e-08, + "loss": 0.5479, + "step": 2393 + }, + { + "epoch": 2.8710257948410316, + "grad_norm": 0.6561710368807768, + "learning_rate": 5.4711541264082516e-08, + "loss": 0.563, + "step": 2394 + }, + { + "epoch": 2.872225554889022, + "grad_norm": 0.5435010579982906, + "learning_rate": 5.368595994676762e-08, + "loss": 0.5816, + "step": 2395 + }, + { + "epoch": 2.8734253149370126, + "grad_norm": 0.5893099934206002, + "learning_rate": 5.267003029913065e-08, + "loss": 0.5841, + "step": 2396 + }, + { + "epoch": 2.874625074985003, + "grad_norm": 0.5737803336900801, + "learning_rate": 5.166375430353976e-08, + "loss": 0.5323, + "step": 2397 + }, + { + "epoch": 2.8758248350329936, + "grad_norm": 0.6553772837801549, + "learning_rate": 5.066713392352651e-08, + "loss": 0.4994, + "step": 2398 + }, + { + "epoch": 2.877024595080984, + "grad_norm": 0.6562504677178241, + "learning_rate": 4.9680171103781425e-08, + "loss": 0.6203, + "step": 2399 + }, + { + "epoch": 2.878224355128974, + "grad_norm": 0.5349033934562512, + "learning_rate": 4.8702867770150655e-08, + "loss": 0.5742, + "step": 2400 + }, + { + "epoch": 2.8794241151769646, + "grad_norm": 0.5570332821601537, + "learning_rate": 4.7735225829632084e-08, + "loss": 0.5724, + "step": 2401 + }, + { + "epoch": 2.880623875224955, + "grad_norm": 0.6557660398768234, + "learning_rate": 4.677724717037035e-08, + "loss": 0.6064, + "step": 2402 + }, + { + "epoch": 2.881823635272945, + "grad_norm": 0.5688158694700273, + "learning_rate": 4.582893366165631e-08, + "loss": 0.5199, + "step": 2403 + }, + { + "epoch": 2.8830233953209357, + "grad_norm": 0.7807570238399966, + "learning_rate": 4.489028715391808e-08, + "loss": 0.5327, + "step": 2404 + }, + { + "epoch": 2.884223155368926, + "grad_norm": 0.5401166517202306, + "learning_rate": 4.396130947872501e-08, + "loss": 0.5663, + "step": 2405 + }, + { + "epoch": 2.8854229154169166, + "grad_norm": 0.6043981734915697, + "learning_rate": 4.304200244877654e-08, + "loss": 0.5759, + "step": 2406 + }, + { + "epoch": 2.886622675464907, + "grad_norm": 0.593386744178214, + "learning_rate": 4.213236785790275e-08, + "loss": 0.5814, + "step": 2407 + }, + { + "epoch": 2.8878224355128976, + "grad_norm": 0.5952394648525249, + "learning_rate": 4.12324074810605e-08, + "loss": 0.6064, + "step": 2408 + }, + { + "epoch": 2.8890221955608877, + "grad_norm": 0.5647464562397199, + "learning_rate": 4.03421230743295e-08, + "loss": 0.5369, + "step": 2409 + }, + { + "epoch": 2.890221955608878, + "grad_norm": 0.5457714401248307, + "learning_rate": 3.946151637490792e-08, + "loss": 0.5919, + "step": 2410 + }, + { + "epoch": 2.8914217156568687, + "grad_norm": 0.6228834155462756, + "learning_rate": 3.859058910111069e-08, + "loss": 0.6165, + "step": 2411 + }, + { + "epoch": 2.892621475704859, + "grad_norm": 0.6845194732842805, + "learning_rate": 3.772934295236619e-08, + "loss": 0.5278, + "step": 2412 + }, + { + "epoch": 2.893821235752849, + "grad_norm": 0.7300397421212467, + "learning_rate": 3.6877779609210686e-08, + "loss": 0.5539, + "step": 2413 + }, + { + "epoch": 2.8950209958008397, + "grad_norm": 0.615417448310366, + "learning_rate": 3.6035900733286667e-08, + "loss": 0.5605, + "step": 2414 + }, + { + "epoch": 2.89622075584883, + "grad_norm": 0.5390319632170048, + "learning_rate": 3.5203707967341184e-08, + "loss": 0.6184, + "step": 2415 + }, + { + "epoch": 2.8974205158968207, + "grad_norm": 0.6135132318318581, + "learning_rate": 3.438120293521918e-08, + "loss": 0.4843, + "step": 2416 + }, + { + "epoch": 2.898620275944811, + "grad_norm": 0.6615575451967282, + "learning_rate": 3.35683872418624e-08, + "loss": 0.6066, + "step": 2417 + }, + { + "epoch": 2.8998200359928017, + "grad_norm": 0.7081757860107213, + "learning_rate": 3.276526247330602e-08, + "loss": 0.6354, + "step": 2418 + }, + { + "epoch": 2.9010197960407917, + "grad_norm": 0.6064346017295151, + "learning_rate": 3.197183019667594e-08, + "loss": 0.5965, + "step": 2419 + }, + { + "epoch": 2.9022195560887822, + "grad_norm": 0.6021962892961614, + "learning_rate": 3.118809196018424e-08, + "loss": 0.5247, + "step": 2420 + }, + { + "epoch": 2.9034193161367727, + "grad_norm": 0.5059717113947267, + "learning_rate": 3.0414049293128744e-08, + "loss": 0.6033, + "step": 2421 + }, + { + "epoch": 2.904619076184763, + "grad_norm": 0.5794323760670155, + "learning_rate": 2.964970370588738e-08, + "loss": 0.4934, + "step": 2422 + }, + { + "epoch": 2.9058188362327533, + "grad_norm": 0.6680157195088374, + "learning_rate": 2.889505668991599e-08, + "loss": 0.5562, + "step": 2423 + }, + { + "epoch": 2.9070185962807438, + "grad_norm": 0.669071371015542, + "learning_rate": 2.8150109717745566e-08, + "loss": 0.538, + "step": 2424 + }, + { + "epoch": 2.9082183563287343, + "grad_norm": 0.6467881845154935, + "learning_rate": 2.741486424298112e-08, + "loss": 0.5068, + "step": 2425 + }, + { + "epoch": 2.9094181163767248, + "grad_norm": 0.5615104696063138, + "learning_rate": 2.6689321700296144e-08, + "loss": 0.6049, + "step": 2426 + }, + { + "epoch": 2.9106178764247153, + "grad_norm": 0.5337904628569098, + "learning_rate": 2.597348350543094e-08, + "loss": 0.5577, + "step": 2427 + }, + { + "epoch": 2.9118176364727053, + "grad_norm": 0.6039259193524817, + "learning_rate": 2.5267351055190404e-08, + "loss": 0.6093, + "step": 2428 + }, + { + "epoch": 2.913017396520696, + "grad_norm": 0.5885852772977639, + "learning_rate": 2.4570925727439576e-08, + "loss": 0.5727, + "step": 2429 + }, + { + "epoch": 2.9142171565686863, + "grad_norm": 0.5597545658591652, + "learning_rate": 2.3884208881103654e-08, + "loss": 0.5568, + "step": 2430 + }, + { + "epoch": 2.915416916616677, + "grad_norm": 0.7074707681715321, + "learning_rate": 2.3207201856162432e-08, + "loss": 0.5879, + "step": 2431 + }, + { + "epoch": 2.916616676664667, + "grad_norm": 0.5535623547416776, + "learning_rate": 2.2539905973650855e-08, + "loss": 0.6107, + "step": 2432 + }, + { + "epoch": 2.9178164367126573, + "grad_norm": 0.6002257711241739, + "learning_rate": 2.1882322535652923e-08, + "loss": 0.5723, + "step": 2433 + }, + { + "epoch": 2.919016196760648, + "grad_norm": 0.7927136704041635, + "learning_rate": 2.1234452825300565e-08, + "loss": 0.5036, + "step": 2434 + }, + { + "epoch": 2.9202159568086383, + "grad_norm": 0.6268379548387949, + "learning_rate": 2.0596298106774214e-08, + "loss": 0.5084, + "step": 2435 + }, + { + "epoch": 2.921415716856629, + "grad_norm": 0.5571291168516115, + "learning_rate": 1.9967859625293905e-08, + "loss": 0.5239, + "step": 2436 + }, + { + "epoch": 2.9226154769046193, + "grad_norm": 0.5562198335939383, + "learning_rate": 1.934913860712373e-08, + "loss": 0.5042, + "step": 2437 + }, + { + "epoch": 2.9238152369526094, + "grad_norm": 0.6745555259181267, + "learning_rate": 1.8740136259564612e-08, + "loss": 0.573, + "step": 2438 + }, + { + "epoch": 2.9250149970006, + "grad_norm": 0.5578468264382872, + "learning_rate": 1.8140853770953758e-08, + "loss": 0.5526, + "step": 2439 + }, + { + "epoch": 2.9262147570485904, + "grad_norm": 0.48928727530060995, + "learning_rate": 1.7551292310661883e-08, + "loss": 0.5701, + "step": 2440 + }, + { + "epoch": 2.927414517096581, + "grad_norm": 0.6335310178542912, + "learning_rate": 1.697145302909209e-08, + "loss": 0.5229, + "step": 2441 + }, + { + "epoch": 2.928614277144571, + "grad_norm": 0.6410518832303463, + "learning_rate": 1.6401337057675437e-08, + "loss": 0.6148, + "step": 2442 + }, + { + "epoch": 2.9298140371925614, + "grad_norm": 0.6393388135655955, + "learning_rate": 1.584094550887205e-08, + "loss": 0.5227, + "step": 2443 + }, + { + "epoch": 2.931013797240552, + "grad_norm": 0.7323633673448885, + "learning_rate": 1.5290279476165014e-08, + "loss": 0.5459, + "step": 2444 + }, + { + "epoch": 2.9322135572885424, + "grad_norm": 0.6045893347103232, + "learning_rate": 1.4749340034060367e-08, + "loss": 0.5908, + "step": 2445 + }, + { + "epoch": 2.933413317336533, + "grad_norm": 0.6857979509852705, + "learning_rate": 1.4218128238086548e-08, + "loss": 0.5103, + "step": 2446 + }, + { + "epoch": 2.9346130773845234, + "grad_norm": 0.6239867493238006, + "learning_rate": 1.36966451247883e-08, + "loss": 0.5371, + "step": 2447 + }, + { + "epoch": 2.9358128374325134, + "grad_norm": 0.5863751459074298, + "learning_rate": 1.3184891711727766e-08, + "loss": 0.5873, + "step": 2448 + }, + { + "epoch": 2.937012597480504, + "grad_norm": 0.6548800154129585, + "learning_rate": 1.2682868997482279e-08, + "loss": 0.5607, + "step": 2449 + }, + { + "epoch": 2.9382123575284944, + "grad_norm": 0.6205175088081284, + "learning_rate": 1.2190577961641581e-08, + "loss": 0.546, + "step": 2450 + }, + { + "epoch": 2.9394121175764845, + "grad_norm": 0.5730112925542632, + "learning_rate": 1.1708019564805607e-08, + "loss": 0.4884, + "step": 2451 + }, + { + "epoch": 2.940611877624475, + "grad_norm": 0.5576631496485588, + "learning_rate": 1.1235194748583367e-08, + "loss": 0.6219, + "step": 2452 + }, + { + "epoch": 2.9418116376724655, + "grad_norm": 0.6177446793222537, + "learning_rate": 1.0772104435591291e-08, + "loss": 0.5779, + "step": 2453 + }, + { + "epoch": 2.943011397720456, + "grad_norm": 0.5488751270134903, + "learning_rate": 1.0318749529450445e-08, + "loss": 0.5111, + "step": 2454 + }, + { + "epoch": 2.9442111577684464, + "grad_norm": 0.5853988884195377, + "learning_rate": 9.875130914785424e-09, + "loss": 0.4794, + "step": 2455 + }, + { + "epoch": 2.945410917816437, + "grad_norm": 0.5746464436940514, + "learning_rate": 9.44124945722269e-09, + "loss": 0.5213, + "step": 2456 + }, + { + "epoch": 2.946610677864427, + "grad_norm": 0.5958493842054852, + "learning_rate": 9.017106003389453e-09, + "loss": 0.5468, + "step": 2457 + }, + { + "epoch": 2.9478104379124175, + "grad_norm": 0.5536530116313324, + "learning_rate": 8.602701380909795e-09, + "loss": 0.6127, + "step": 2458 + }, + { + "epoch": 2.949010197960408, + "grad_norm": 0.6009468479285284, + "learning_rate": 8.198036398405217e-09, + "loss": 0.5661, + "step": 2459 + }, + { + "epoch": 2.9502099580083985, + "grad_norm": 0.5506053937698066, + "learning_rate": 7.803111845493539e-09, + "loss": 0.5562, + "step": 2460 + }, + { + "epoch": 2.9514097180563885, + "grad_norm": 0.6695927647551572, + "learning_rate": 7.417928492784443e-09, + "loss": 0.5259, + "step": 2461 + }, + { + "epoch": 2.952609478104379, + "grad_norm": 0.6277211369298169, + "learning_rate": 7.042487091880046e-09, + "loss": 0.5805, + "step": 2462 + }, + { + "epoch": 2.9538092381523695, + "grad_norm": 0.6151065472376642, + "learning_rate": 6.676788375374887e-09, + "loss": 0.5123, + "step": 2463 + }, + { + "epoch": 2.95500899820036, + "grad_norm": 0.6197422431335191, + "learning_rate": 6.320833056850939e-09, + "loss": 0.5593, + "step": 2464 + }, + { + "epoch": 2.9562087582483505, + "grad_norm": 0.5881896045872286, + "learning_rate": 5.9746218308781576e-09, + "loss": 0.5372, + "step": 2465 + }, + { + "epoch": 2.957408518296341, + "grad_norm": 0.5461184226742699, + "learning_rate": 5.638155373013931e-09, + "loss": 0.6064, + "step": 2466 + }, + { + "epoch": 2.958608278344331, + "grad_norm": 0.5740321047604405, + "learning_rate": 5.311434339799748e-09, + "loss": 0.5821, + "step": 2467 + }, + { + "epoch": 2.9598080383923215, + "grad_norm": 0.5209288885463851, + "learning_rate": 4.994459368762306e-09, + "loss": 0.5358, + "step": 2468 + }, + { + "epoch": 2.961007798440312, + "grad_norm": 0.6130283562317994, + "learning_rate": 4.6872310784096266e-09, + "loss": 0.582, + "step": 2469 + }, + { + "epoch": 2.962207558488302, + "grad_norm": 0.6166228177199247, + "learning_rate": 4.389750068231613e-09, + "loss": 0.5364, + "step": 2470 + }, + { + "epoch": 2.9634073185362926, + "grad_norm": 0.6014700381489093, + "learning_rate": 4.102016918698937e-09, + "loss": 0.5706, + "step": 2471 + }, + { + "epoch": 2.964607078584283, + "grad_norm": 0.5523814773020451, + "learning_rate": 3.824032191261373e-09, + "loss": 0.5369, + "step": 2472 + }, + { + "epoch": 2.9658068386322736, + "grad_norm": 0.5692347250551049, + "learning_rate": 3.5557964283455814e-09, + "loss": 0.5781, + "step": 2473 + }, + { + "epoch": 2.967006598680264, + "grad_norm": 0.6636113231878251, + "learning_rate": 3.2973101533567698e-09, + "loss": 0.5406, + "step": 2474 + }, + { + "epoch": 2.9682063587282546, + "grad_norm": 0.5418052771926479, + "learning_rate": 3.048573870674809e-09, + "loss": 0.5626, + "step": 2475 + }, + { + "epoch": 2.9694061187762446, + "grad_norm": 0.552204788057103, + "learning_rate": 2.809588065655899e-09, + "loss": 0.5345, + "step": 2476 + }, + { + "epoch": 2.970605878824235, + "grad_norm": 0.601451785587039, + "learning_rate": 2.580353204628683e-09, + "loss": 0.5226, + "step": 2477 + }, + { + "epoch": 2.9718056388722256, + "grad_norm": 0.6129126790989069, + "learning_rate": 2.360869734896465e-09, + "loss": 0.5999, + "step": 2478 + }, + { + "epoch": 2.973005398920216, + "grad_norm": 0.560396843514585, + "learning_rate": 2.1511380847333287e-09, + "loss": 0.5495, + "step": 2479 + }, + { + "epoch": 2.974205158968206, + "grad_norm": 0.6612438320348933, + "learning_rate": 1.9511586633863545e-09, + "loss": 0.5255, + "step": 2480 + }, + { + "epoch": 2.9754049190161966, + "grad_norm": 0.566045451367403, + "learning_rate": 1.7609318610722902e-09, + "loss": 0.5438, + "step": 2481 + }, + { + "epoch": 2.976604679064187, + "grad_norm": 0.5837424311079994, + "learning_rate": 1.580458048976996e-09, + "loss": 0.5038, + "step": 2482 + }, + { + "epoch": 2.9778044391121776, + "grad_norm": 0.5534621281042921, + "learning_rate": 1.409737579257664e-09, + "loss": 0.6024, + "step": 2483 + }, + { + "epoch": 2.979004199160168, + "grad_norm": 0.6026104152195847, + "learning_rate": 1.2487707850378227e-09, + "loss": 0.6059, + "step": 2484 + }, + { + "epoch": 2.9802039592081586, + "grad_norm": 0.6385190115377162, + "learning_rate": 1.0975579804101132e-09, + "loss": 0.5992, + "step": 2485 + }, + { + "epoch": 2.9814037192561487, + "grad_norm": 0.649119435899077, + "learning_rate": 9.560994604329576e-10, + "loss": 0.5002, + "step": 2486 + }, + { + "epoch": 2.982603479304139, + "grad_norm": 0.5676956087255925, + "learning_rate": 8.243955011333349e-10, + "loss": 0.5228, + "step": 2487 + }, + { + "epoch": 2.9838032393521297, + "grad_norm": 0.6041311035308607, + "learning_rate": 7.024463595028952e-10, + "loss": 0.5831, + "step": 2488 + }, + { + "epoch": 2.9850029994001197, + "grad_norm": 0.6071102452722618, + "learning_rate": 5.90252273498515e-10, + "loss": 0.5929, + "step": 2489 + }, + { + "epoch": 2.98620275944811, + "grad_norm": 0.5810758766628411, + "learning_rate": 4.878134620439623e-10, + "loss": 0.5855, + "step": 2490 + }, + { + "epoch": 2.9874025194961007, + "grad_norm": 0.5902052789115674, + "learning_rate": 3.951301250260109e-10, + "loss": 0.5904, + "step": 2491 + }, + { + "epoch": 2.988602279544091, + "grad_norm": 0.5486510541340834, + "learning_rate": 3.122024432961057e-10, + "loss": 0.6075, + "step": 2492 + }, + { + "epoch": 2.9898020395920817, + "grad_norm": 0.6679678462887061, + "learning_rate": 2.390305786698077e-10, + "loss": 0.5877, + "step": 2493 + }, + { + "epoch": 2.991001799640072, + "grad_norm": 0.6475178912393099, + "learning_rate": 1.7561467392679387e-10, + "loss": 0.5602, + "step": 2494 + }, + { + "epoch": 2.9922015596880622, + "grad_norm": 0.6573905852671234, + "learning_rate": 1.2195485280974694e-10, + "loss": 0.5506, + "step": 2495 + }, + { + "epoch": 2.9934013197360527, + "grad_norm": 0.6839301032257503, + "learning_rate": 7.805122002324527e-11, + "loss": 0.5639, + "step": 2496 + }, + { + "epoch": 2.994601079784043, + "grad_norm": 0.6982652385995234, + "learning_rate": 4.3903861236538336e-11, + "loss": 0.6393, + "step": 2497 + }, + { + "epoch": 2.9958008398320337, + "grad_norm": 0.6221337547855504, + "learning_rate": 1.951284308077117e-11, + "loss": 0.5426, + "step": 2498 + }, + { + "epoch": 2.9970005998800238, + "grad_norm": 0.5416322473354866, + "learning_rate": 4.878213150094624e-12, + "loss": 0.6198, + "step": 2499 + }, + { + "epoch": 2.9970005998800238, + "step": 2499, + "total_flos": 461172498612224.0, + "train_loss": 0.7641865797355777, + "train_runtime": 21559.3093, + "train_samples_per_second": 1.392, + "train_steps_per_second": 0.116 + } + ], + "logging_steps": 1, + "max_steps": 2499, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 461172498612224.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}