diff --git "a/outputs_pretrained/checkpoint-846/trainer_state.json" "b/outputs_pretrained/checkpoint-846/trainer_state.json" new file mode 100644--- /dev/null +++ "b/outputs_pretrained/checkpoint-846/trainer_state.json" @@ -0,0 +1,5956 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 846, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002364066193853428, + "grad_norm": 9.786882400512695, + "learning_rate": 0.0, + "loss": 3.2811, + "step": 1 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 10.056135177612305, + "learning_rate": 5e-06, + "loss": 3.2727, + "step": 2 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 10.060759544372559, + "learning_rate": 1e-05, + "loss": 3.4002, + "step": 3 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 8.956620216369629, + "learning_rate": 1.5e-05, + "loss": 3.3498, + "step": 4 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 7.450173854827881, + "learning_rate": 2e-05, + "loss": 3.3485, + "step": 5 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 7.106024265289307, + "learning_rate": 2.5e-05, + "loss": 3.1664, + "step": 6 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 7.5257415771484375, + "learning_rate": 3e-05, + "loss": 3.087, + "step": 7 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 7.290092468261719, + "learning_rate": 3.5e-05, + "loss": 2.9256, + "step": 8 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 6.230826377868652, + "learning_rate": 4e-05, + "loss": 2.83, + "step": 9 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 7.3913679122924805, + "learning_rate": 4.5e-05, + "loss": 2.8809, + "step": 10 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 7.500810146331787, + "learning_rate": 5e-05, + "loss": 2.9329, + "step": 11 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 6.774663925170898, + "learning_rate": 4.994019138755981e-05, + "loss": 2.8633, + "step": 12 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 8.217077255249023, + "learning_rate": 4.988038277511962e-05, + "loss": 2.779, + "step": 13 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 6.935091495513916, + "learning_rate": 4.982057416267943e-05, + "loss": 2.7578, + "step": 14 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 7.576630592346191, + "learning_rate": 4.9760765550239234e-05, + "loss": 2.7023, + "step": 15 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 5.840672492980957, + "learning_rate": 4.970095693779905e-05, + "loss": 2.6723, + "step": 16 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 5.728421211242676, + "learning_rate": 4.964114832535885e-05, + "loss": 2.6571, + "step": 17 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 5.6147990226745605, + "learning_rate": 4.958133971291866e-05, + "loss": 2.712, + "step": 18 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 5.601494789123535, + "learning_rate": 4.952153110047847e-05, + "loss": 2.6546, + "step": 19 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 5.493714332580566, + "learning_rate": 4.946172248803828e-05, + "loss": 2.599, + "step": 20 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 5.742792129516602, + "learning_rate": 4.940191387559809e-05, + "loss": 2.6123, + "step": 21 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 4.954222202301025, + "learning_rate": 4.9342105263157894e-05, + "loss": 2.4297, + "step": 22 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 5.282480239868164, + "learning_rate": 4.928229665071771e-05, + "loss": 2.5279, + "step": 23 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 5.26470947265625, + "learning_rate": 4.922248803827751e-05, + "loss": 2.4522, + "step": 24 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 5.375247001647949, + "learning_rate": 4.916267942583732e-05, + "loss": 2.5739, + "step": 25 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 5.001519203186035, + "learning_rate": 4.910287081339713e-05, + "loss": 2.5496, + "step": 26 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 5.700991630554199, + "learning_rate": 4.904306220095694e-05, + "loss": 2.5462, + "step": 27 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 5.051942348480225, + "learning_rate": 4.898325358851675e-05, + "loss": 2.3694, + "step": 28 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 5.411677837371826, + "learning_rate": 4.892344497607656e-05, + "loss": 2.5297, + "step": 29 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.419405937194824, + "learning_rate": 4.886363636363637e-05, + "loss": 2.5009, + "step": 30 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 5.185110569000244, + "learning_rate": 4.880382775119617e-05, + "loss": 2.4033, + "step": 31 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 5.229886054992676, + "learning_rate": 4.874401913875598e-05, + "loss": 2.4766, + "step": 32 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 4.489797115325928, + "learning_rate": 4.868421052631579e-05, + "loss": 2.4468, + "step": 33 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 4.7152419090271, + "learning_rate": 4.86244019138756e-05, + "loss": 2.4374, + "step": 34 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 5.1662278175354, + "learning_rate": 4.8564593301435404e-05, + "loss": 2.3898, + "step": 35 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 4.726010322570801, + "learning_rate": 4.850478468899522e-05, + "loss": 2.5208, + "step": 36 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 4.929504871368408, + "learning_rate": 4.844497607655503e-05, + "loss": 2.4105, + "step": 37 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 4.676382064819336, + "learning_rate": 4.838516746411483e-05, + "loss": 2.3955, + "step": 38 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 5.068330764770508, + "learning_rate": 4.832535885167465e-05, + "loss": 2.3171, + "step": 39 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 4.925624847412109, + "learning_rate": 4.826555023923445e-05, + "loss": 2.3936, + "step": 40 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 4.296761989593506, + "learning_rate": 4.820574162679426e-05, + "loss": 2.3958, + "step": 41 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 4.61798095703125, + "learning_rate": 4.8145933014354064e-05, + "loss": 2.4068, + "step": 42 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 5.4643025398254395, + "learning_rate": 4.808612440191388e-05, + "loss": 2.3982, + "step": 43 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 4.580990314483643, + "learning_rate": 4.802631578947368e-05, + "loss": 2.346, + "step": 44 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 4.549551963806152, + "learning_rate": 4.796650717703349e-05, + "loss": 2.2992, + "step": 45 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 4.603874206542969, + "learning_rate": 4.790669856459331e-05, + "loss": 2.3436, + "step": 46 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 4.180109977722168, + "learning_rate": 4.784688995215311e-05, + "loss": 2.3465, + "step": 47 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 4.274153232574463, + "learning_rate": 4.778708133971292e-05, + "loss": 2.3173, + "step": 48 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 4.787023544311523, + "learning_rate": 4.772727272727273e-05, + "loss": 2.3225, + "step": 49 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 4.5308146476745605, + "learning_rate": 4.766746411483254e-05, + "loss": 2.2716, + "step": 50 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 4.548710823059082, + "learning_rate": 4.760765550239234e-05, + "loss": 2.277, + "step": 51 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 4.569216251373291, + "learning_rate": 4.754784688995216e-05, + "loss": 2.3076, + "step": 52 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 4.4847493171691895, + "learning_rate": 4.748803827751196e-05, + "loss": 2.3058, + "step": 53 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 4.511356353759766, + "learning_rate": 4.742822966507177e-05, + "loss": 2.1524, + "step": 54 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 4.245441436767578, + "learning_rate": 4.736842105263158e-05, + "loss": 2.2795, + "step": 55 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 4.903697490692139, + "learning_rate": 4.730861244019139e-05, + "loss": 2.2934, + "step": 56 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 4.915858745574951, + "learning_rate": 4.72488038277512e-05, + "loss": 2.271, + "step": 57 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 4.528074264526367, + "learning_rate": 4.7188995215311e-05, + "loss": 2.2712, + "step": 58 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 4.402848720550537, + "learning_rate": 4.712918660287082e-05, + "loss": 2.2441, + "step": 59 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 4.317126274108887, + "learning_rate": 4.706937799043062e-05, + "loss": 2.3081, + "step": 60 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 4.551846027374268, + "learning_rate": 4.700956937799043e-05, + "loss": 2.33, + "step": 61 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 4.294714450836182, + "learning_rate": 4.694976076555024e-05, + "loss": 2.2291, + "step": 62 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 4.264118194580078, + "learning_rate": 4.688995215311005e-05, + "loss": 2.187, + "step": 63 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 4.644858360290527, + "learning_rate": 4.683014354066986e-05, + "loss": 2.2767, + "step": 64 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 4.4240522384643555, + "learning_rate": 4.677033492822967e-05, + "loss": 2.2365, + "step": 65 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 4.994328022003174, + "learning_rate": 4.671052631578948e-05, + "loss": 2.1967, + "step": 66 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 4.091360092163086, + "learning_rate": 4.665071770334928e-05, + "loss": 2.1983, + "step": 67 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 4.4661760330200195, + "learning_rate": 4.659090909090909e-05, + "loss": 2.2006, + "step": 68 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 4.753389358520508, + "learning_rate": 4.65311004784689e-05, + "loss": 2.2816, + "step": 69 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 4.429995059967041, + "learning_rate": 4.647129186602871e-05, + "loss": 2.2732, + "step": 70 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 4.254610061645508, + "learning_rate": 4.641148325358852e-05, + "loss": 2.1097, + "step": 71 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 4.007920265197754, + "learning_rate": 4.635167464114833e-05, + "loss": 2.2057, + "step": 72 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 4.3318772315979, + "learning_rate": 4.629186602870814e-05, + "loss": 2.2074, + "step": 73 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 5.013289928436279, + "learning_rate": 4.623205741626794e-05, + "loss": 2.3091, + "step": 74 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 4.14231538772583, + "learning_rate": 4.617224880382776e-05, + "loss": 2.2149, + "step": 75 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 4.8862762451171875, + "learning_rate": 4.611244019138756e-05, + "loss": 2.2153, + "step": 76 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 4.4072489738464355, + "learning_rate": 4.605263157894737e-05, + "loss": 2.256, + "step": 77 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 4.19016170501709, + "learning_rate": 4.599282296650718e-05, + "loss": 2.1268, + "step": 78 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 4.425796031951904, + "learning_rate": 4.593301435406699e-05, + "loss": 2.2441, + "step": 79 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 4.253169536590576, + "learning_rate": 4.58732057416268e-05, + "loss": 2.2179, + "step": 80 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 4.226646900177002, + "learning_rate": 4.58133971291866e-05, + "loss": 2.2049, + "step": 81 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 4.134627342224121, + "learning_rate": 4.575358851674642e-05, + "loss": 2.1709, + "step": 82 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 3.9358890056610107, + "learning_rate": 4.569377990430622e-05, + "loss": 2.1614, + "step": 83 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 4.453876972198486, + "learning_rate": 4.563397129186603e-05, + "loss": 2.2435, + "step": 84 + }, + { + "epoch": 0.20094562647754138, + "grad_norm": 4.1704535484313965, + "learning_rate": 4.557416267942584e-05, + "loss": 2.1753, + "step": 85 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 4.117349624633789, + "learning_rate": 4.551435406698565e-05, + "loss": 2.196, + "step": 86 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.9573557376861572, + "learning_rate": 4.545454545454546e-05, + "loss": 2.1901, + "step": 87 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 4.639995098114014, + "learning_rate": 4.539473684210527e-05, + "loss": 2.2147, + "step": 88 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 4.055258750915527, + "learning_rate": 4.533492822966508e-05, + "loss": 2.1608, + "step": 89 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 4.474514484405518, + "learning_rate": 4.527511961722488e-05, + "loss": 2.1507, + "step": 90 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 4.639707088470459, + "learning_rate": 4.521531100478469e-05, + "loss": 2.2826, + "step": 91 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 4.863740921020508, + "learning_rate": 4.51555023923445e-05, + "loss": 2.1766, + "step": 92 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 4.412019729614258, + "learning_rate": 4.509569377990431e-05, + "loss": 2.1956, + "step": 93 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 4.765653610229492, + "learning_rate": 4.503588516746411e-05, + "loss": 2.2213, + "step": 94 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 4.181426525115967, + "learning_rate": 4.497607655502393e-05, + "loss": 2.1097, + "step": 95 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 4.018885612487793, + "learning_rate": 4.491626794258373e-05, + "loss": 2.1055, + "step": 96 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 4.294673919677734, + "learning_rate": 4.485645933014354e-05, + "loss": 2.1984, + "step": 97 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 4.040910720825195, + "learning_rate": 4.4796650717703357e-05, + "loss": 2.1209, + "step": 98 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 4.413674354553223, + "learning_rate": 4.473684210526316e-05, + "loss": 2.177, + "step": 99 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 4.586551189422607, + "learning_rate": 4.467703349282297e-05, + "loss": 2.2419, + "step": 100 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 3.753817319869995, + "learning_rate": 4.461722488038278e-05, + "loss": 2.1044, + "step": 101 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 4.756337642669678, + "learning_rate": 4.455741626794259e-05, + "loss": 2.2236, + "step": 102 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 4.174933433532715, + "learning_rate": 4.449760765550239e-05, + "loss": 2.1759, + "step": 103 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 4.282530784606934, + "learning_rate": 4.44377990430622e-05, + "loss": 2.16, + "step": 104 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 4.547479629516602, + "learning_rate": 4.437799043062201e-05, + "loss": 2.0593, + "step": 105 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 4.146774768829346, + "learning_rate": 4.431818181818182e-05, + "loss": 2.1806, + "step": 106 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 4.418383598327637, + "learning_rate": 4.425837320574163e-05, + "loss": 2.1936, + "step": 107 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 4.2527947425842285, + "learning_rate": 4.419856459330144e-05, + "loss": 2.0821, + "step": 108 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 4.108152389526367, + "learning_rate": 4.413875598086125e-05, + "loss": 2.1622, + "step": 109 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 4.083064556121826, + "learning_rate": 4.407894736842105e-05, + "loss": 2.1178, + "step": 110 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 4.213859558105469, + "learning_rate": 4.401913875598087e-05, + "loss": 2.1575, + "step": 111 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 5.222052574157715, + "learning_rate": 4.395933014354067e-05, + "loss": 2.0866, + "step": 112 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 4.5867743492126465, + "learning_rate": 4.389952153110048e-05, + "loss": 2.156, + "step": 113 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 4.399920463562012, + "learning_rate": 4.383971291866029e-05, + "loss": 2.1911, + "step": 114 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.858452081680298, + "learning_rate": 4.37799043062201e-05, + "loss": 2.152, + "step": 115 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 4.103243350982666, + "learning_rate": 4.372009569377991e-05, + "loss": 2.1825, + "step": 116 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 4.9622344970703125, + "learning_rate": 4.366028708133971e-05, + "loss": 2.1182, + "step": 117 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 4.199765205383301, + "learning_rate": 4.360047846889953e-05, + "loss": 2.0638, + "step": 118 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 4.943136215209961, + "learning_rate": 4.354066985645933e-05, + "loss": 2.1906, + "step": 119 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 4.076112270355225, + "learning_rate": 4.348086124401914e-05, + "loss": 2.1208, + "step": 120 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 4.323102951049805, + "learning_rate": 4.342105263157895e-05, + "loss": 2.1642, + "step": 121 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 3.8718385696411133, + "learning_rate": 4.336124401913876e-05, + "loss": 2.0946, + "step": 122 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 4.6664862632751465, + "learning_rate": 4.330143540669857e-05, + "loss": 2.1411, + "step": 123 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 4.570885181427002, + "learning_rate": 4.324162679425838e-05, + "loss": 2.1268, + "step": 124 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.7459053993225098, + "learning_rate": 4.318181818181819e-05, + "loss": 2.042, + "step": 125 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 4.332980155944824, + "learning_rate": 4.312200956937799e-05, + "loss": 2.1462, + "step": 126 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 4.2524919509887695, + "learning_rate": 4.3062200956937806e-05, + "loss": 2.136, + "step": 127 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 4.230786323547363, + "learning_rate": 4.300239234449761e-05, + "loss": 2.1397, + "step": 128 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 4.281345844268799, + "learning_rate": 4.294258373205742e-05, + "loss": 2.1, + "step": 129 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 4.491425514221191, + "learning_rate": 4.288277511961723e-05, + "loss": 2.1304, + "step": 130 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 4.068315029144287, + "learning_rate": 4.282296650717704e-05, + "loss": 2.1201, + "step": 131 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 4.424421787261963, + "learning_rate": 4.2763157894736847e-05, + "loss": 2.0387, + "step": 132 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 4.6968865394592285, + "learning_rate": 4.270334928229665e-05, + "loss": 2.0306, + "step": 133 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.468080520629883, + "learning_rate": 4.2643540669856466e-05, + "loss": 2.0499, + "step": 134 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 4.1545186042785645, + "learning_rate": 4.258373205741627e-05, + "loss": 2.0953, + "step": 135 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 4.665497779846191, + "learning_rate": 4.252392344497608e-05, + "loss": 2.0821, + "step": 136 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 4.307028770446777, + "learning_rate": 4.246411483253589e-05, + "loss": 2.0566, + "step": 137 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 4.265110969543457, + "learning_rate": 4.24043062200957e-05, + "loss": 2.1148, + "step": 138 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 4.940046787261963, + "learning_rate": 4.2344497607655506e-05, + "loss": 2.0617, + "step": 139 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 4.576653480529785, + "learning_rate": 4.2284688995215316e-05, + "loss": 2.1433, + "step": 140 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 4.502660274505615, + "learning_rate": 4.2224880382775126e-05, + "loss": 2.0935, + "step": 141 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 5.039083480834961, + "learning_rate": 4.216507177033493e-05, + "loss": 2.1362, + "step": 142 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 4.402792930603027, + "learning_rate": 4.210526315789474e-05, + "loss": 2.0993, + "step": 143 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 4.509486675262451, + "learning_rate": 4.204545454545455e-05, + "loss": 2.0299, + "step": 144 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 4.011101722717285, + "learning_rate": 4.198564593301436e-05, + "loss": 2.0929, + "step": 145 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 5.007168292999268, + "learning_rate": 4.192583732057416e-05, + "loss": 2.0842, + "step": 146 + }, + { + "epoch": 0.3475177304964539, + "grad_norm": 4.631533145904541, + "learning_rate": 4.1866028708133976e-05, + "loss": 2.0474, + "step": 147 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 4.278231620788574, + "learning_rate": 4.1806220095693785e-05, + "loss": 2.0819, + "step": 148 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 4.484694480895996, + "learning_rate": 4.174641148325359e-05, + "loss": 2.0502, + "step": 149 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 4.756961345672607, + "learning_rate": 4.1686602870813404e-05, + "loss": 2.0529, + "step": 150 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 4.347701549530029, + "learning_rate": 4.162679425837321e-05, + "loss": 2.0666, + "step": 151 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.76229190826416, + "learning_rate": 4.156698564593302e-05, + "loss": 2.0763, + "step": 152 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.8186378479003906, + "learning_rate": 4.150717703349282e-05, + "loss": 2.0392, + "step": 153 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 4.353139877319336, + "learning_rate": 4.1447368421052636e-05, + "loss": 1.9854, + "step": 154 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 5.3801422119140625, + "learning_rate": 4.138755980861244e-05, + "loss": 1.9959, + "step": 155 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 4.44751501083374, + "learning_rate": 4.132775119617225e-05, + "loss": 2.0911, + "step": 156 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 4.025294303894043, + "learning_rate": 4.1267942583732064e-05, + "loss": 2.0192, + "step": 157 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 4.605421543121338, + "learning_rate": 4.120813397129187e-05, + "loss": 2.1464, + "step": 158 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 5.045236110687256, + "learning_rate": 4.114832535885168e-05, + "loss": 2.0756, + "step": 159 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 4.459848403930664, + "learning_rate": 4.1088516746411486e-05, + "loss": 2.1544, + "step": 160 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 4.420864105224609, + "learning_rate": 4.1028708133971296e-05, + "loss": 2.0296, + "step": 161 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 3.998237133026123, + "learning_rate": 4.09688995215311e-05, + "loss": 2.1039, + "step": 162 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 4.448668003082275, + "learning_rate": 4.0909090909090915e-05, + "loss": 2.0762, + "step": 163 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 4.266834259033203, + "learning_rate": 4.084928229665072e-05, + "loss": 2.0557, + "step": 164 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 4.796644687652588, + "learning_rate": 4.078947368421053e-05, + "loss": 2.1027, + "step": 165 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 4.560353755950928, + "learning_rate": 4.0729665071770337e-05, + "loss": 2.0245, + "step": 166 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 3.673262119293213, + "learning_rate": 4.0669856459330146e-05, + "loss": 2.033, + "step": 167 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.134026050567627, + "learning_rate": 4.0610047846889956e-05, + "loss": 2.0079, + "step": 168 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 4.677055835723877, + "learning_rate": 4.055023923444976e-05, + "loss": 2.0874, + "step": 169 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 4.731474876403809, + "learning_rate": 4.0490430622009575e-05, + "loss": 1.9878, + "step": 170 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 4.689650058746338, + "learning_rate": 4.043062200956938e-05, + "loss": 2.0933, + "step": 171 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 4.23575496673584, + "learning_rate": 4.037081339712919e-05, + "loss": 2.0436, + "step": 172 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.8167171478271484, + "learning_rate": 4.0311004784688996e-05, + "loss": 1.9843, + "step": 173 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 4.35087251663208, + "learning_rate": 4.0251196172248806e-05, + "loss": 2.0386, + "step": 174 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 4.764196395874023, + "learning_rate": 4.0191387559808616e-05, + "loss": 2.0783, + "step": 175 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 4.374221324920654, + "learning_rate": 4.0131578947368425e-05, + "loss": 1.9711, + "step": 176 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 4.542954444885254, + "learning_rate": 4.0071770334928235e-05, + "loss": 2.0036, + "step": 177 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 3.849266767501831, + "learning_rate": 4.001196172248804e-05, + "loss": 2.0571, + "step": 178 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 4.014004230499268, + "learning_rate": 3.995215311004785e-05, + "loss": 2.0642, + "step": 179 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 4.369685649871826, + "learning_rate": 3.9892344497607656e-05, + "loss": 2.0502, + "step": 180 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 4.280391216278076, + "learning_rate": 3.9832535885167466e-05, + "loss": 2.0025, + "step": 181 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 4.444298267364502, + "learning_rate": 3.9772727272727275e-05, + "loss": 1.9828, + "step": 182 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 3.794656991958618, + "learning_rate": 3.9712918660287085e-05, + "loss": 2.048, + "step": 183 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 4.498697280883789, + "learning_rate": 3.9653110047846894e-05, + "loss": 2.1017, + "step": 184 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 4.182841777801514, + "learning_rate": 3.95933014354067e-05, + "loss": 2.0588, + "step": 185 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 4.118316173553467, + "learning_rate": 3.9533492822966514e-05, + "loss": 2.0654, + "step": 186 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 5.026358127593994, + "learning_rate": 3.9473684210526316e-05, + "loss": 2.0808, + "step": 187 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 4.171792984008789, + "learning_rate": 3.9413875598086126e-05, + "loss": 1.9832, + "step": 188 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 4.625768661499023, + "learning_rate": 3.9354066985645935e-05, + "loss": 2.0244, + "step": 189 + }, + { + "epoch": 0.4491725768321513, + "grad_norm": 4.119114398956299, + "learning_rate": 3.9294258373205745e-05, + "loss": 2.011, + "step": 190 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 4.071448802947998, + "learning_rate": 3.9234449760765554e-05, + "loss": 2.0469, + "step": 191 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 4.00705099105835, + "learning_rate": 3.917464114832536e-05, + "loss": 1.9889, + "step": 192 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 4.969683647155762, + "learning_rate": 3.9114832535885173e-05, + "loss": 2.0097, + "step": 193 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 4.4712605476379395, + "learning_rate": 3.9055023923444976e-05, + "loss": 2.0413, + "step": 194 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 4.194468021392822, + "learning_rate": 3.8995215311004786e-05, + "loss": 2.0623, + "step": 195 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 3.9785311222076416, + "learning_rate": 3.8935406698564595e-05, + "loss": 2.0606, + "step": 196 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 4.314589977264404, + "learning_rate": 3.8875598086124405e-05, + "loss": 2.0411, + "step": 197 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.732752561569214, + "learning_rate": 3.8815789473684214e-05, + "loss": 1.8543, + "step": 198 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 4.228579998016357, + "learning_rate": 3.8755980861244024e-05, + "loss": 2.0262, + "step": 199 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.9367430210113525, + "learning_rate": 3.869617224880383e-05, + "loss": 2.0398, + "step": 200 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 3.8661906719207764, + "learning_rate": 3.8636363636363636e-05, + "loss": 1.9708, + "step": 201 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.8464865684509277, + "learning_rate": 3.8576555023923446e-05, + "loss": 2.0388, + "step": 202 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.867969274520874, + "learning_rate": 3.8516746411483255e-05, + "loss": 2.0592, + "step": 203 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 4.027868270874023, + "learning_rate": 3.8456937799043065e-05, + "loss": 2.0641, + "step": 204 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 4.061384677886963, + "learning_rate": 3.839712918660287e-05, + "loss": 2.0542, + "step": 205 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 3.954545021057129, + "learning_rate": 3.8337320574162684e-05, + "loss": 1.9951, + "step": 206 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 3.8164191246032715, + "learning_rate": 3.8277511961722486e-05, + "loss": 2.0094, + "step": 207 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 4.0749053955078125, + "learning_rate": 3.8217703349282296e-05, + "loss": 1.9861, + "step": 208 + }, + { + "epoch": 0.4940898345153664, + "grad_norm": 3.8733890056610107, + "learning_rate": 3.815789473684211e-05, + "loss": 1.9821, + "step": 209 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.9854531288146973, + "learning_rate": 3.8098086124401915e-05, + "loss": 1.9549, + "step": 210 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 4.046158313751221, + "learning_rate": 3.8038277511961725e-05, + "loss": 2.02, + "step": 211 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 4.107903003692627, + "learning_rate": 3.7978468899521534e-05, + "loss": 1.9814, + "step": 212 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 4.039123058319092, + "learning_rate": 3.7918660287081344e-05, + "loss": 1.9969, + "step": 213 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 4.200449466705322, + "learning_rate": 3.7858851674641146e-05, + "loss": 1.9756, + "step": 214 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 4.346997261047363, + "learning_rate": 3.7799043062200956e-05, + "loss": 2.0102, + "step": 215 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 4.00998067855835, + "learning_rate": 3.7739234449760765e-05, + "loss": 2.0895, + "step": 216 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 4.083645820617676, + "learning_rate": 3.7679425837320575e-05, + "loss": 1.9927, + "step": 217 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 4.116786956787109, + "learning_rate": 3.7619617224880384e-05, + "loss": 1.9798, + "step": 218 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 3.8103268146514893, + "learning_rate": 3.7559808612440194e-05, + "loss": 1.9616, + "step": 219 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 3.789548635482788, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.995, + "step": 220 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 4.23507022857666, + "learning_rate": 3.7440191387559806e-05, + "loss": 2.0384, + "step": 221 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 4.319752216339111, + "learning_rate": 3.738038277511962e-05, + "loss": 2.0136, + "step": 222 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 4.502188682556152, + "learning_rate": 3.7320574162679425e-05, + "loss": 1.993, + "step": 223 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 3.8698880672454834, + "learning_rate": 3.7260765550239235e-05, + "loss": 1.9742, + "step": 224 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 3.9415745735168457, + "learning_rate": 3.7200956937799044e-05, + "loss": 1.9259, + "step": 225 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 4.509599208831787, + "learning_rate": 3.7141148325358854e-05, + "loss": 1.9809, + "step": 226 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 4.538198471069336, + "learning_rate": 3.7081339712918663e-05, + "loss": 2.0455, + "step": 227 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 4.268890857696533, + "learning_rate": 3.7021531100478466e-05, + "loss": 2.0385, + "step": 228 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 4.729723930358887, + "learning_rate": 3.696172248803828e-05, + "loss": 2.0132, + "step": 229 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 3.6644513607025146, + "learning_rate": 3.6901913875598085e-05, + "loss": 1.9425, + "step": 230 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.711594343185425, + "learning_rate": 3.6842105263157895e-05, + "loss": 2.0302, + "step": 231 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 4.120182991027832, + "learning_rate": 3.6782296650717704e-05, + "loss": 1.9863, + "step": 232 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 4.521392822265625, + "learning_rate": 3.6722488038277514e-05, + "loss": 1.9855, + "step": 233 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 4.531548023223877, + "learning_rate": 3.666267942583732e-05, + "loss": 1.9397, + "step": 234 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 3.672027111053467, + "learning_rate": 3.660287081339713e-05, + "loss": 2.0197, + "step": 235 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.798299789428711, + "learning_rate": 3.654306220095694e-05, + "loss": 1.9749, + "step": 236 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 4.396960258483887, + "learning_rate": 3.6483253588516745e-05, + "loss": 1.9643, + "step": 237 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 3.955106019973755, + "learning_rate": 3.642344497607656e-05, + "loss": 1.9963, + "step": 238 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 4.263886451721191, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.9553, + "step": 239 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 4.148135662078857, + "learning_rate": 3.6303827751196174e-05, + "loss": 1.9899, + "step": 240 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 4.090165138244629, + "learning_rate": 3.624401913875598e-05, + "loss": 1.9922, + "step": 241 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 3.7597265243530273, + "learning_rate": 3.618421052631579e-05, + "loss": 1.9846, + "step": 242 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 4.532017707824707, + "learning_rate": 3.61244019138756e-05, + "loss": 1.9955, + "step": 243 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 4.268376350402832, + "learning_rate": 3.6064593301435405e-05, + "loss": 1.928, + "step": 244 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 4.414393901824951, + "learning_rate": 3.600478468899522e-05, + "loss": 1.9792, + "step": 245 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 4.180037498474121, + "learning_rate": 3.5944976076555024e-05, + "loss": 1.9661, + "step": 246 + }, + { + "epoch": 0.5839243498817966, + "grad_norm": 4.057360649108887, + "learning_rate": 3.5885167464114834e-05, + "loss": 1.9004, + "step": 247 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 3.8866353034973145, + "learning_rate": 3.582535885167464e-05, + "loss": 1.9582, + "step": 248 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 4.187384605407715, + "learning_rate": 3.576555023923445e-05, + "loss": 1.992, + "step": 249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 3.99149751663208, + "learning_rate": 3.570574162679426e-05, + "loss": 1.9069, + "step": 250 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 3.89758563041687, + "learning_rate": 3.5645933014354065e-05, + "loss": 2.0696, + "step": 251 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 4.572390556335449, + "learning_rate": 3.558612440191388e-05, + "loss": 2.0519, + "step": 252 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 3.7999329566955566, + "learning_rate": 3.5526315789473684e-05, + "loss": 1.9143, + "step": 253 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 3.8050220012664795, + "learning_rate": 3.5466507177033493e-05, + "loss": 1.919, + "step": 254 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 4.685467720031738, + "learning_rate": 3.54066985645933e-05, + "loss": 2.0639, + "step": 255 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 4.132735252380371, + "learning_rate": 3.534688995215311e-05, + "loss": 2.0094, + "step": 256 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 3.779338836669922, + "learning_rate": 3.5287081339712915e-05, + "loss": 1.9425, + "step": 257 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 3.988375186920166, + "learning_rate": 3.522727272727273e-05, + "loss": 1.9529, + "step": 258 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 3.8638248443603516, + "learning_rate": 3.516746411483254e-05, + "loss": 1.9326, + "step": 259 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 3.718116283416748, + "learning_rate": 3.5107655502392344e-05, + "loss": 1.9376, + "step": 260 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 3.8833279609680176, + "learning_rate": 3.504784688995216e-05, + "loss": 2.0297, + "step": 261 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 3.518829345703125, + "learning_rate": 3.498803827751196e-05, + "loss": 1.991, + "step": 262 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 3.8007776737213135, + "learning_rate": 3.492822966507177e-05, + "loss": 1.9722, + "step": 263 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 3.51373028755188, + "learning_rate": 3.4868421052631575e-05, + "loss": 1.8961, + "step": 264 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 3.6159632205963135, + "learning_rate": 3.480861244019139e-05, + "loss": 1.9719, + "step": 265 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 3.923194408416748, + "learning_rate": 3.4748803827751194e-05, + "loss": 1.9133, + "step": 266 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 3.849912166595459, + "learning_rate": 3.4688995215311004e-05, + "loss": 1.9508, + "step": 267 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 3.8065907955169678, + "learning_rate": 3.462918660287082e-05, + "loss": 1.9426, + "step": 268 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 4.141129493713379, + "learning_rate": 3.456937799043062e-05, + "loss": 2.0031, + "step": 269 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 4.0821123123168945, + "learning_rate": 3.450956937799043e-05, + "loss": 1.9503, + "step": 270 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 3.863445997238159, + "learning_rate": 3.444976076555024e-05, + "loss": 1.918, + "step": 271 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 3.9590418338775635, + "learning_rate": 3.438995215311005e-05, + "loss": 1.9768, + "step": 272 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.7065553665161133, + "learning_rate": 3.4330143540669854e-05, + "loss": 1.95, + "step": 273 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 3.657320737838745, + "learning_rate": 3.427033492822967e-05, + "loss": 1.9733, + "step": 274 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 4.384260654449463, + "learning_rate": 3.421052631578947e-05, + "loss": 2.0237, + "step": 275 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 4.627539157867432, + "learning_rate": 3.415071770334928e-05, + "loss": 1.9492, + "step": 276 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 3.8552463054656982, + "learning_rate": 3.409090909090909e-05, + "loss": 1.9489, + "step": 277 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 3.8745601177215576, + "learning_rate": 3.40311004784689e-05, + "loss": 1.9182, + "step": 278 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 4.04236364364624, + "learning_rate": 3.397129186602871e-05, + "loss": 1.9581, + "step": 279 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 3.5719153881073, + "learning_rate": 3.3911483253588514e-05, + "loss": 1.9321, + "step": 280 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 4.358823776245117, + "learning_rate": 3.385167464114833e-05, + "loss": 1.9567, + "step": 281 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 4.5859293937683105, + "learning_rate": 3.379186602870813e-05, + "loss": 1.966, + "step": 282 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 4.17390251159668, + "learning_rate": 3.373205741626794e-05, + "loss": 2.0129, + "step": 283 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 4.25508975982666, + "learning_rate": 3.367224880382775e-05, + "loss": 1.9553, + "step": 284 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 3.8857264518737793, + "learning_rate": 3.361244019138756e-05, + "loss": 1.9755, + "step": 285 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 4.436855792999268, + "learning_rate": 3.355263157894737e-05, + "loss": 1.9313, + "step": 286 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 3.8671867847442627, + "learning_rate": 3.349282296650718e-05, + "loss": 1.9402, + "step": 287 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 3.6638526916503906, + "learning_rate": 3.343301435406699e-05, + "loss": 1.871, + "step": 288 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 4.047444820404053, + "learning_rate": 3.337320574162679e-05, + "loss": 1.9283, + "step": 289 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.7109580039978027, + "learning_rate": 3.33133971291866e-05, + "loss": 1.9663, + "step": 290 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 3.8624162673950195, + "learning_rate": 3.325358851674641e-05, + "loss": 1.9802, + "step": 291 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.6883432865142822, + "learning_rate": 3.319377990430622e-05, + "loss": 1.9165, + "step": 292 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.7664642333984375, + "learning_rate": 3.313397129186603e-05, + "loss": 1.9822, + "step": 293 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 3.6505606174468994, + "learning_rate": 3.307416267942584e-05, + "loss": 1.8814, + "step": 294 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.5809507369995117, + "learning_rate": 3.301435406698565e-05, + "loss": 1.9044, + "step": 295 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.8675575256347656, + "learning_rate": 3.295454545454545e-05, + "loss": 1.8942, + "step": 296 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.916057586669922, + "learning_rate": 3.289473684210527e-05, + "loss": 2.0424, + "step": 297 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 3.671261787414551, + "learning_rate": 3.283492822966507e-05, + "loss": 1.9595, + "step": 298 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 4.06015682220459, + "learning_rate": 3.277511961722488e-05, + "loss": 1.8396, + "step": 299 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 3.9253146648406982, + "learning_rate": 3.271531100478469e-05, + "loss": 1.9823, + "step": 300 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 4.20823335647583, + "learning_rate": 3.26555023923445e-05, + "loss": 1.9768, + "step": 301 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 4.035942554473877, + "learning_rate": 3.259569377990431e-05, + "loss": 2.0076, + "step": 302 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 3.9924399852752686, + "learning_rate": 3.253588516746411e-05, + "loss": 1.9217, + "step": 303 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 3.8201253414154053, + "learning_rate": 3.247607655502393e-05, + "loss": 1.9539, + "step": 304 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 4.5720696449279785, + "learning_rate": 3.241626794258373e-05, + "loss": 1.9805, + "step": 305 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 4.060619354248047, + "learning_rate": 3.235645933014354e-05, + "loss": 1.9667, + "step": 306 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 4.097210884094238, + "learning_rate": 3.229665071770335e-05, + "loss": 1.9609, + "step": 307 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 3.757749080657959, + "learning_rate": 3.223684210526316e-05, + "loss": 2.0129, + "step": 308 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 4.25848388671875, + "learning_rate": 3.217703349282297e-05, + "loss": 2.0603, + "step": 309 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.9824750423431396, + "learning_rate": 3.211722488038278e-05, + "loss": 1.8766, + "step": 310 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 4.179236888885498, + "learning_rate": 3.205741626794259e-05, + "loss": 1.9354, + "step": 311 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 3.9761154651641846, + "learning_rate": 3.199760765550239e-05, + "loss": 1.9749, + "step": 312 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 3.731112241744995, + "learning_rate": 3.19377990430622e-05, + "loss": 1.9597, + "step": 313 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 3.762098789215088, + "learning_rate": 3.187799043062201e-05, + "loss": 1.8983, + "step": 314 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 4.024747371673584, + "learning_rate": 3.181818181818182e-05, + "loss": 1.9149, + "step": 315 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 4.643849849700928, + "learning_rate": 3.175837320574162e-05, + "loss": 1.9244, + "step": 316 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 4.206188201904297, + "learning_rate": 3.169856459330144e-05, + "loss": 1.9201, + "step": 317 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 3.647243022918701, + "learning_rate": 3.163875598086124e-05, + "loss": 1.8717, + "step": 318 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.7720701694488525, + "learning_rate": 3.157894736842105e-05, + "loss": 1.8891, + "step": 319 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 3.8284690380096436, + "learning_rate": 3.151913875598087e-05, + "loss": 1.9221, + "step": 320 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 4.109698295593262, + "learning_rate": 3.145933014354067e-05, + "loss": 1.8875, + "step": 321 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 3.684807062149048, + "learning_rate": 3.139952153110048e-05, + "loss": 1.9002, + "step": 322 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 3.700782299041748, + "learning_rate": 3.133971291866029e-05, + "loss": 1.9455, + "step": 323 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 3.9737021923065186, + "learning_rate": 3.12799043062201e-05, + "loss": 1.9065, + "step": 324 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 3.3986127376556396, + "learning_rate": 3.12200956937799e-05, + "loss": 1.8713, + "step": 325 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 3.8407750129699707, + "learning_rate": 3.116028708133971e-05, + "loss": 1.8779, + "step": 326 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 4.058707237243652, + "learning_rate": 3.110047846889952e-05, + "loss": 1.8802, + "step": 327 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 3.9604055881500244, + "learning_rate": 3.104066985645933e-05, + "loss": 1.8403, + "step": 328 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 4.052743434906006, + "learning_rate": 3.098086124401914e-05, + "loss": 1.9294, + "step": 329 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 3.3787264823913574, + "learning_rate": 3.092105263157895e-05, + "loss": 1.8994, + "step": 330 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 3.8170766830444336, + "learning_rate": 3.086124401913876e-05, + "loss": 1.9975, + "step": 331 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.8066794872283936, + "learning_rate": 3.080143540669856e-05, + "loss": 1.968, + "step": 332 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 4.192262172698975, + "learning_rate": 3.074162679425838e-05, + "loss": 1.9784, + "step": 333 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 3.9428586959838867, + "learning_rate": 3.068181818181818e-05, + "loss": 1.9362, + "step": 334 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 4.186812400817871, + "learning_rate": 3.062200956937799e-05, + "loss": 1.9212, + "step": 335 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 4.045288562774658, + "learning_rate": 3.05622009569378e-05, + "loss": 1.9017, + "step": 336 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 4.203200340270996, + "learning_rate": 3.050239234449761e-05, + "loss": 1.9844, + "step": 337 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 4.394277095794678, + "learning_rate": 3.0442583732057416e-05, + "loss": 1.9332, + "step": 338 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 3.7559564113616943, + "learning_rate": 3.0382775119617225e-05, + "loss": 1.9142, + "step": 339 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 3.5842766761779785, + "learning_rate": 3.0322966507177035e-05, + "loss": 1.8843, + "step": 340 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.8942437171936035, + "learning_rate": 3.0263157894736844e-05, + "loss": 1.825, + "step": 341 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 4.06583833694458, + "learning_rate": 3.020334928229665e-05, + "loss": 1.8859, + "step": 342 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 3.9897472858428955, + "learning_rate": 3.0143540669856463e-05, + "loss": 2.0088, + "step": 343 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 4.120972633361816, + "learning_rate": 3.008373205741627e-05, + "loss": 1.9644, + "step": 344 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 3.761667013168335, + "learning_rate": 3.0023923444976076e-05, + "loss": 1.8468, + "step": 345 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 4.167830944061279, + "learning_rate": 2.996411483253589e-05, + "loss": 1.8995, + "step": 346 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 3.732332944869995, + "learning_rate": 2.9904306220095695e-05, + "loss": 1.8529, + "step": 347 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 4.0308003425598145, + "learning_rate": 2.9844497607655504e-05, + "loss": 1.9401, + "step": 348 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 3.981724500656128, + "learning_rate": 2.9784688995215314e-05, + "loss": 1.8821, + "step": 349 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 3.8602871894836426, + "learning_rate": 2.9724880382775123e-05, + "loss": 1.8909, + "step": 350 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 3.8556690216064453, + "learning_rate": 2.966507177033493e-05, + "loss": 1.9972, + "step": 351 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.716454029083252, + "learning_rate": 2.9605263157894735e-05, + "loss": 1.9235, + "step": 352 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 4.057294845581055, + "learning_rate": 2.954545454545455e-05, + "loss": 1.8717, + "step": 353 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 3.5962278842926025, + "learning_rate": 2.9485645933014355e-05, + "loss": 1.9414, + "step": 354 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 4.190985679626465, + "learning_rate": 2.942583732057416e-05, + "loss": 1.9724, + "step": 355 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 3.9760379791259766, + "learning_rate": 2.9366028708133974e-05, + "loss": 1.839, + "step": 356 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.629091501235962, + "learning_rate": 2.9306220095693783e-05, + "loss": 1.8866, + "step": 357 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 3.752070188522339, + "learning_rate": 2.924641148325359e-05, + "loss": 1.8868, + "step": 358 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 3.5992238521575928, + "learning_rate": 2.9186602870813402e-05, + "loss": 1.8586, + "step": 359 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 3.47458553314209, + "learning_rate": 2.912679425837321e-05, + "loss": 1.9659, + "step": 360 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 3.6117656230926514, + "learning_rate": 2.9066985645933014e-05, + "loss": 1.8892, + "step": 361 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 4.080473899841309, + "learning_rate": 2.900717703349282e-05, + "loss": 1.8809, + "step": 362 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 4.260461330413818, + "learning_rate": 2.8947368421052634e-05, + "loss": 1.9461, + "step": 363 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 4.231245994567871, + "learning_rate": 2.888755980861244e-05, + "loss": 1.9389, + "step": 364 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 3.64261794090271, + "learning_rate": 2.882775119617225e-05, + "loss": 1.922, + "step": 365 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 3.591475009918213, + "learning_rate": 2.8767942583732062e-05, + "loss": 1.9616, + "step": 366 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 3.9587414264678955, + "learning_rate": 2.8708133971291868e-05, + "loss": 1.9045, + "step": 367 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 3.6751394271850586, + "learning_rate": 2.8648325358851674e-05, + "loss": 1.9077, + "step": 368 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 4.2092790603637695, + "learning_rate": 2.8588516746411487e-05, + "loss": 1.9413, + "step": 369 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 3.814706325531006, + "learning_rate": 2.8528708133971293e-05, + "loss": 1.9203, + "step": 370 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 3.674201250076294, + "learning_rate": 2.84688995215311e-05, + "loss": 1.8488, + "step": 371 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 3.5166468620300293, + "learning_rate": 2.8409090909090912e-05, + "loss": 1.9175, + "step": 372 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 3.619014024734497, + "learning_rate": 2.834928229665072e-05, + "loss": 1.8396, + "step": 373 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.923396110534668, + "learning_rate": 2.8289473684210528e-05, + "loss": 1.8317, + "step": 374 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 3.934695243835449, + "learning_rate": 2.8229665071770334e-05, + "loss": 1.9562, + "step": 375 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 3.761104106903076, + "learning_rate": 2.8169856459330147e-05, + "loss": 1.7913, + "step": 376 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 3.7853753566741943, + "learning_rate": 2.8110047846889953e-05, + "loss": 1.8132, + "step": 377 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 3.526927947998047, + "learning_rate": 2.805023923444976e-05, + "loss": 1.6913, + "step": 378 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 3.75763201713562, + "learning_rate": 2.7990430622009572e-05, + "loss": 1.8787, + "step": 379 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.601562023162842, + "learning_rate": 2.793062200956938e-05, + "loss": 1.9166, + "step": 380 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 3.5951952934265137, + "learning_rate": 2.7870813397129185e-05, + "loss": 1.885, + "step": 381 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.5643539428710938, + "learning_rate": 2.7811004784688998e-05, + "loss": 1.9044, + "step": 382 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 3.7953860759735107, + "learning_rate": 2.7751196172248807e-05, + "loss": 1.9872, + "step": 383 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 4.0880913734436035, + "learning_rate": 2.7691387559808613e-05, + "loss": 1.9078, + "step": 384 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 3.8961236476898193, + "learning_rate": 2.7631578947368426e-05, + "loss": 1.8843, + "step": 385 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 3.7427453994750977, + "learning_rate": 2.7571770334928232e-05, + "loss": 1.8713, + "step": 386 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 3.7328555583953857, + "learning_rate": 2.751196172248804e-05, + "loss": 1.8305, + "step": 387 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 3.890418291091919, + "learning_rate": 2.7452153110047845e-05, + "loss": 1.8197, + "step": 388 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 3.7216286659240723, + "learning_rate": 2.7392344497607657e-05, + "loss": 1.9201, + "step": 389 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 3.705873489379883, + "learning_rate": 2.7332535885167464e-05, + "loss": 1.8885, + "step": 390 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 3.5170631408691406, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.895, + "step": 391 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 3.632924795150757, + "learning_rate": 2.7212918660287086e-05, + "loss": 1.9286, + "step": 392 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 4.132338523864746, + "learning_rate": 2.7153110047846892e-05, + "loss": 1.9079, + "step": 393 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 3.8694465160369873, + "learning_rate": 2.7093301435406698e-05, + "loss": 1.856, + "step": 394 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 4.146971702575684, + "learning_rate": 2.703349282296651e-05, + "loss": 1.9795, + "step": 395 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 3.581249952316284, + "learning_rate": 2.6973684210526317e-05, + "loss": 1.868, + "step": 396 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 3.779081106185913, + "learning_rate": 2.6913875598086123e-05, + "loss": 1.8462, + "step": 397 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 3.373218536376953, + "learning_rate": 2.6854066985645936e-05, + "loss": 1.8336, + "step": 398 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 3.7768990993499756, + "learning_rate": 2.6794258373205743e-05, + "loss": 1.9062, + "step": 399 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 3.4512805938720703, + "learning_rate": 2.6734449760765552e-05, + "loss": 1.873, + "step": 400 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 3.38236927986145, + "learning_rate": 2.6674641148325358e-05, + "loss": 1.8911, + "step": 401 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 3.191875696182251, + "learning_rate": 2.661483253588517e-05, + "loss": 1.8732, + "step": 402 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 3.671778440475464, + "learning_rate": 2.6555023923444977e-05, + "loss": 1.8879, + "step": 403 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 3.831817150115967, + "learning_rate": 2.6495215311004783e-05, + "loss": 1.8256, + "step": 404 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 3.432061195373535, + "learning_rate": 2.6435406698564596e-05, + "loss": 1.8245, + "step": 405 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 3.591796398162842, + "learning_rate": 2.6375598086124402e-05, + "loss": 1.8954, + "step": 406 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 3.4541237354278564, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.8886, + "step": 407 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 3.4565842151641846, + "learning_rate": 2.625598086124402e-05, + "loss": 1.8766, + "step": 408 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 3.812185049057007, + "learning_rate": 2.619617224880383e-05, + "loss": 1.8797, + "step": 409 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 4.211532115936279, + "learning_rate": 2.6136363636363637e-05, + "loss": 1.8563, + "step": 410 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 3.5806126594543457, + "learning_rate": 2.6076555023923443e-05, + "loss": 1.9192, + "step": 411 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 3.7554843425750732, + "learning_rate": 2.6016746411483256e-05, + "loss": 1.8267, + "step": 412 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 3.9262287616729736, + "learning_rate": 2.5956937799043062e-05, + "loss": 1.9415, + "step": 413 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 3.754761219024658, + "learning_rate": 2.589712918660287e-05, + "loss": 1.8816, + "step": 414 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 3.463529586791992, + "learning_rate": 2.583732057416268e-05, + "loss": 1.8487, + "step": 415 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 3.6155738830566406, + "learning_rate": 2.5777511961722488e-05, + "loss": 1.8339, + "step": 416 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.745180130004883, + "learning_rate": 2.5717703349282297e-05, + "loss": 1.8555, + "step": 417 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 3.9255855083465576, + "learning_rate": 2.565789473684211e-05, + "loss": 1.8148, + "step": 418 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 4.076484203338623, + "learning_rate": 2.5598086124401916e-05, + "loss": 1.8527, + "step": 419 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 3.6310737133026123, + "learning_rate": 2.5538277511961722e-05, + "loss": 1.8875, + "step": 420 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 3.757092237472534, + "learning_rate": 2.5478468899521535e-05, + "loss": 1.9363, + "step": 421 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 3.754251003265381, + "learning_rate": 2.541866028708134e-05, + "loss": 1.8591, + "step": 422 + }, + { + "epoch": 1.0, + "grad_norm": 3.948606014251709, + "learning_rate": 2.5358851674641147e-05, + "loss": 1.8516, + "step": 423 + }, + { + "epoch": 1.0023640661938533, + "grad_norm": 3.393385171890259, + "learning_rate": 2.5299043062200957e-05, + "loss": 1.7089, + "step": 424 + }, + { + "epoch": 1.0047281323877069, + "grad_norm": 3.5038678646087646, + "learning_rate": 2.5239234449760766e-05, + "loss": 1.6513, + "step": 425 + }, + { + "epoch": 1.0070921985815602, + "grad_norm": 3.546327590942383, + "learning_rate": 2.5179425837320576e-05, + "loss": 1.6913, + "step": 426 + }, + { + "epoch": 1.0094562647754137, + "grad_norm": 3.012467384338379, + "learning_rate": 2.5119617224880382e-05, + "loss": 1.671, + "step": 427 + }, + { + "epoch": 1.011820330969267, + "grad_norm": 3.5642528533935547, + "learning_rate": 2.5059808612440195e-05, + "loss": 1.7187, + "step": 428 + }, + { + "epoch": 1.0141843971631206, + "grad_norm": 3.3508832454681396, + "learning_rate": 2.5e-05, + "loss": 1.6118, + "step": 429 + }, + { + "epoch": 1.016548463356974, + "grad_norm": 3.450350761413574, + "learning_rate": 2.494019138755981e-05, + "loss": 1.6704, + "step": 430 + }, + { + "epoch": 1.0189125295508275, + "grad_norm": 3.859874725341797, + "learning_rate": 2.4880382775119617e-05, + "loss": 1.6719, + "step": 431 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 3.572866439819336, + "learning_rate": 2.4820574162679426e-05, + "loss": 1.6766, + "step": 432 + }, + { + "epoch": 1.0236406619385343, + "grad_norm": 3.1181817054748535, + "learning_rate": 2.4760765550239236e-05, + "loss": 1.623, + "step": 433 + }, + { + "epoch": 1.0260047281323876, + "grad_norm": 3.3449785709381104, + "learning_rate": 2.4700956937799045e-05, + "loss": 1.6767, + "step": 434 + }, + { + "epoch": 1.0283687943262412, + "grad_norm": 3.494570732116699, + "learning_rate": 2.4641148325358855e-05, + "loss": 1.6433, + "step": 435 + }, + { + "epoch": 1.0307328605200945, + "grad_norm": 3.3296971321105957, + "learning_rate": 2.458133971291866e-05, + "loss": 1.6977, + "step": 436 + }, + { + "epoch": 1.033096926713948, + "grad_norm": 3.5671586990356445, + "learning_rate": 2.452153110047847e-05, + "loss": 1.6876, + "step": 437 + }, + { + "epoch": 1.0354609929078014, + "grad_norm": 3.4455606937408447, + "learning_rate": 2.446172248803828e-05, + "loss": 1.5463, + "step": 438 + }, + { + "epoch": 1.037825059101655, + "grad_norm": 3.561481237411499, + "learning_rate": 2.4401913875598086e-05, + "loss": 1.6733, + "step": 439 + }, + { + "epoch": 1.0401891252955082, + "grad_norm": 3.6918563842773438, + "learning_rate": 2.4342105263157896e-05, + "loss": 1.6511, + "step": 440 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 3.360909938812256, + "learning_rate": 2.4282296650717702e-05, + "loss": 1.6931, + "step": 441 + }, + { + "epoch": 1.044917257683215, + "grad_norm": 3.5804953575134277, + "learning_rate": 2.4222488038277515e-05, + "loss": 1.6333, + "step": 442 + }, + { + "epoch": 1.0472813238770686, + "grad_norm": 3.5099520683288574, + "learning_rate": 2.4162679425837324e-05, + "loss": 1.5951, + "step": 443 + }, + { + "epoch": 1.049645390070922, + "grad_norm": 3.4295504093170166, + "learning_rate": 2.410287081339713e-05, + "loss": 1.6764, + "step": 444 + }, + { + "epoch": 1.0520094562647755, + "grad_norm": 3.5318591594696045, + "learning_rate": 2.404306220095694e-05, + "loss": 1.6894, + "step": 445 + }, + { + "epoch": 1.0543735224586288, + "grad_norm": 3.4848272800445557, + "learning_rate": 2.3983253588516746e-05, + "loss": 1.6769, + "step": 446 + }, + { + "epoch": 1.0567375886524824, + "grad_norm": 3.7782180309295654, + "learning_rate": 2.3923444976076556e-05, + "loss": 1.6239, + "step": 447 + }, + { + "epoch": 1.0591016548463357, + "grad_norm": 3.2487025260925293, + "learning_rate": 2.3863636363636365e-05, + "loss": 1.5877, + "step": 448 + }, + { + "epoch": 1.0614657210401892, + "grad_norm": 3.6180076599121094, + "learning_rate": 2.380382775119617e-05, + "loss": 1.685, + "step": 449 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 3.6394782066345215, + "learning_rate": 2.374401913875598e-05, + "loss": 1.6699, + "step": 450 + }, + { + "epoch": 1.066193853427896, + "grad_norm": 3.7263615131378174, + "learning_rate": 2.368421052631579e-05, + "loss": 1.6681, + "step": 451 + }, + { + "epoch": 1.0685579196217494, + "grad_norm": 3.455543279647827, + "learning_rate": 2.36244019138756e-05, + "loss": 1.5929, + "step": 452 + }, + { + "epoch": 1.070921985815603, + "grad_norm": 3.379056930541992, + "learning_rate": 2.356459330143541e-05, + "loss": 1.7262, + "step": 453 + }, + { + "epoch": 1.0732860520094563, + "grad_norm": 3.415682792663574, + "learning_rate": 2.3504784688995216e-05, + "loss": 1.6502, + "step": 454 + }, + { + "epoch": 1.0756501182033098, + "grad_norm": 3.3975017070770264, + "learning_rate": 2.3444976076555025e-05, + "loss": 1.6846, + "step": 455 + }, + { + "epoch": 1.0780141843971631, + "grad_norm": 3.844403028488159, + "learning_rate": 2.3385167464114835e-05, + "loss": 1.6581, + "step": 456 + }, + { + "epoch": 1.0803782505910164, + "grad_norm": 3.237973690032959, + "learning_rate": 2.332535885167464e-05, + "loss": 1.6452, + "step": 457 + }, + { + "epoch": 1.08274231678487, + "grad_norm": 3.138275384902954, + "learning_rate": 2.326555023923445e-05, + "loss": 1.671, + "step": 458 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 3.2867116928100586, + "learning_rate": 2.320574162679426e-05, + "loss": 1.7039, + "step": 459 + }, + { + "epoch": 1.0874704491725768, + "grad_norm": 3.331429958343506, + "learning_rate": 2.314593301435407e-05, + "loss": 1.6251, + "step": 460 + }, + { + "epoch": 1.0898345153664302, + "grad_norm": 3.517249822616577, + "learning_rate": 2.308612440191388e-05, + "loss": 1.6588, + "step": 461 + }, + { + "epoch": 1.0921985815602837, + "grad_norm": 3.447352170944214, + "learning_rate": 2.3026315789473685e-05, + "loss": 1.6494, + "step": 462 + }, + { + "epoch": 1.094562647754137, + "grad_norm": 3.820619583129883, + "learning_rate": 2.2966507177033495e-05, + "loss": 1.6767, + "step": 463 + }, + { + "epoch": 1.0969267139479906, + "grad_norm": 3.612136125564575, + "learning_rate": 2.29066985645933e-05, + "loss": 1.6123, + "step": 464 + }, + { + "epoch": 1.099290780141844, + "grad_norm": 3.2653629779815674, + "learning_rate": 2.284688995215311e-05, + "loss": 1.639, + "step": 465 + }, + { + "epoch": 1.1016548463356974, + "grad_norm": 3.241689443588257, + "learning_rate": 2.278708133971292e-05, + "loss": 1.6561, + "step": 466 + }, + { + "epoch": 1.1040189125295508, + "grad_norm": 3.3771729469299316, + "learning_rate": 2.272727272727273e-05, + "loss": 1.6457, + "step": 467 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 3.3176181316375732, + "learning_rate": 2.266746411483254e-05, + "loss": 1.6047, + "step": 468 + }, + { + "epoch": 1.1087470449172576, + "grad_norm": 3.281697988510132, + "learning_rate": 2.2607655502392345e-05, + "loss": 1.6211, + "step": 469 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 3.4810190200805664, + "learning_rate": 2.2547846889952154e-05, + "loss": 1.7409, + "step": 470 + }, + { + "epoch": 1.1134751773049645, + "grad_norm": 3.873317003250122, + "learning_rate": 2.2488038277511964e-05, + "loss": 1.6596, + "step": 471 + }, + { + "epoch": 1.115839243498818, + "grad_norm": 3.9520647525787354, + "learning_rate": 2.242822966507177e-05, + "loss": 1.7332, + "step": 472 + }, + { + "epoch": 1.1182033096926713, + "grad_norm": 3.922635555267334, + "learning_rate": 2.236842105263158e-05, + "loss": 1.6934, + "step": 473 + }, + { + "epoch": 1.1205673758865249, + "grad_norm": 3.404571056365967, + "learning_rate": 2.230861244019139e-05, + "loss": 1.6456, + "step": 474 + }, + { + "epoch": 1.1229314420803782, + "grad_norm": 3.497051239013672, + "learning_rate": 2.2248803827751195e-05, + "loss": 1.712, + "step": 475 + }, + { + "epoch": 1.1252955082742317, + "grad_norm": 3.632838249206543, + "learning_rate": 2.2188995215311005e-05, + "loss": 1.6521, + "step": 476 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 3.8431527614593506, + "learning_rate": 2.2129186602870814e-05, + "loss": 1.6738, + "step": 477 + }, + { + "epoch": 1.1300236406619386, + "grad_norm": 3.709177255630493, + "learning_rate": 2.2069377990430624e-05, + "loss": 1.622, + "step": 478 + }, + { + "epoch": 1.132387706855792, + "grad_norm": 3.3974366188049316, + "learning_rate": 2.2009569377990433e-05, + "loss": 1.7082, + "step": 479 + }, + { + "epoch": 1.1347517730496455, + "grad_norm": 3.6680588722229004, + "learning_rate": 2.194976076555024e-05, + "loss": 1.6759, + "step": 480 + }, + { + "epoch": 1.1371158392434988, + "grad_norm": 3.3660480976104736, + "learning_rate": 2.188995215311005e-05, + "loss": 1.6893, + "step": 481 + }, + { + "epoch": 1.1394799054373523, + "grad_norm": 3.4249792098999023, + "learning_rate": 2.1830143540669855e-05, + "loss": 1.6269, + "step": 482 + }, + { + "epoch": 1.1418439716312057, + "grad_norm": 3.5676686763763428, + "learning_rate": 2.1770334928229665e-05, + "loss": 1.5851, + "step": 483 + }, + { + "epoch": 1.1442080378250592, + "grad_norm": 3.6361424922943115, + "learning_rate": 2.1710526315789474e-05, + "loss": 1.683, + "step": 484 + }, + { + "epoch": 1.1465721040189125, + "grad_norm": 3.530165910720825, + "learning_rate": 2.1650717703349284e-05, + "loss": 1.6617, + "step": 485 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 3.3330204486846924, + "learning_rate": 2.1590909090909093e-05, + "loss": 1.6294, + "step": 486 + }, + { + "epoch": 1.1513002364066194, + "grad_norm": 3.3433423042297363, + "learning_rate": 2.1531100478468903e-05, + "loss": 1.5875, + "step": 487 + }, + { + "epoch": 1.1536643026004727, + "grad_norm": 3.511631488800049, + "learning_rate": 2.147129186602871e-05, + "loss": 1.6799, + "step": 488 + }, + { + "epoch": 1.1560283687943262, + "grad_norm": 3.3675262928009033, + "learning_rate": 2.141148325358852e-05, + "loss": 1.7247, + "step": 489 + }, + { + "epoch": 1.1583924349881798, + "grad_norm": 3.7090232372283936, + "learning_rate": 2.1351674641148325e-05, + "loss": 1.6302, + "step": 490 + }, + { + "epoch": 1.160756501182033, + "grad_norm": 3.7816193103790283, + "learning_rate": 2.1291866028708134e-05, + "loss": 1.7691, + "step": 491 + }, + { + "epoch": 1.1631205673758864, + "grad_norm": 3.2552871704101562, + "learning_rate": 2.1232057416267944e-05, + "loss": 1.5899, + "step": 492 + }, + { + "epoch": 1.16548463356974, + "grad_norm": 3.853459119796753, + "learning_rate": 2.1172248803827753e-05, + "loss": 1.7019, + "step": 493 + }, + { + "epoch": 1.1678486997635933, + "grad_norm": 3.5649783611297607, + "learning_rate": 2.1112440191387563e-05, + "loss": 1.6619, + "step": 494 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 3.476576805114746, + "learning_rate": 2.105263157894737e-05, + "loss": 1.611, + "step": 495 + }, + { + "epoch": 1.1725768321513002, + "grad_norm": 3.5537772178649902, + "learning_rate": 2.099282296650718e-05, + "loss": 1.637, + "step": 496 + }, + { + "epoch": 1.1749408983451537, + "grad_norm": 3.6302125453948975, + "learning_rate": 2.0933014354066988e-05, + "loss": 1.6317, + "step": 497 + }, + { + "epoch": 1.177304964539007, + "grad_norm": 3.622593879699707, + "learning_rate": 2.0873205741626794e-05, + "loss": 1.6086, + "step": 498 + }, + { + "epoch": 1.1796690307328606, + "grad_norm": 3.44309663772583, + "learning_rate": 2.0813397129186604e-05, + "loss": 1.6525, + "step": 499 + }, + { + "epoch": 1.1820330969267139, + "grad_norm": 3.0509703159332275, + "learning_rate": 2.075358851674641e-05, + "loss": 1.6, + "step": 500 + }, + { + "epoch": 1.1843971631205674, + "grad_norm": 3.322601556777954, + "learning_rate": 2.069377990430622e-05, + "loss": 1.5762, + "step": 501 + }, + { + "epoch": 1.1867612293144207, + "grad_norm": 3.471799612045288, + "learning_rate": 2.0633971291866032e-05, + "loss": 1.6196, + "step": 502 + }, + { + "epoch": 1.1891252955082743, + "grad_norm": 3.549744129180908, + "learning_rate": 2.057416267942584e-05, + "loss": 1.6898, + "step": 503 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 3.763566493988037, + "learning_rate": 2.0514354066985648e-05, + "loss": 1.6695, + "step": 504 + }, + { + "epoch": 1.1938534278959811, + "grad_norm": 3.3992598056793213, + "learning_rate": 2.0454545454545457e-05, + "loss": 1.6777, + "step": 505 + }, + { + "epoch": 1.1962174940898345, + "grad_norm": 3.303372621536255, + "learning_rate": 2.0394736842105264e-05, + "loss": 1.6111, + "step": 506 + }, + { + "epoch": 1.198581560283688, + "grad_norm": 3.443369150161743, + "learning_rate": 2.0334928229665073e-05, + "loss": 1.5834, + "step": 507 + }, + { + "epoch": 1.2009456264775413, + "grad_norm": 3.5040507316589355, + "learning_rate": 2.027511961722488e-05, + "loss": 1.6172, + "step": 508 + }, + { + "epoch": 1.2033096926713949, + "grad_norm": 3.334400177001953, + "learning_rate": 2.021531100478469e-05, + "loss": 1.6028, + "step": 509 + }, + { + "epoch": 1.2056737588652482, + "grad_norm": 3.5346176624298096, + "learning_rate": 2.0155502392344498e-05, + "loss": 1.696, + "step": 510 + }, + { + "epoch": 1.2080378250591017, + "grad_norm": 3.8326451778411865, + "learning_rate": 2.0095693779904308e-05, + "loss": 1.6668, + "step": 511 + }, + { + "epoch": 1.210401891252955, + "grad_norm": 3.3983545303344727, + "learning_rate": 2.0035885167464117e-05, + "loss": 1.6307, + "step": 512 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 3.451599597930908, + "learning_rate": 1.9976076555023923e-05, + "loss": 1.6307, + "step": 513 + }, + { + "epoch": 1.215130023640662, + "grad_norm": 3.855468988418579, + "learning_rate": 1.9916267942583733e-05, + "loss": 1.6825, + "step": 514 + }, + { + "epoch": 1.2174940898345155, + "grad_norm": 3.5130748748779297, + "learning_rate": 1.9856459330143542e-05, + "loss": 1.6431, + "step": 515 + }, + { + "epoch": 1.2198581560283688, + "grad_norm": 3.419294595718384, + "learning_rate": 1.979665071770335e-05, + "loss": 1.6686, + "step": 516 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 3.5099575519561768, + "learning_rate": 1.9736842105263158e-05, + "loss": 1.6568, + "step": 517 + }, + { + "epoch": 1.2245862884160756, + "grad_norm": 3.4354982376098633, + "learning_rate": 1.9677033492822968e-05, + "loss": 1.6038, + "step": 518 + }, + { + "epoch": 1.226950354609929, + "grad_norm": 3.4908761978149414, + "learning_rate": 1.9617224880382777e-05, + "loss": 1.6258, + "step": 519 + }, + { + "epoch": 1.2293144208037825, + "grad_norm": 3.4986984729766846, + "learning_rate": 1.9557416267942587e-05, + "loss": 1.6308, + "step": 520 + }, + { + "epoch": 1.231678486997636, + "grad_norm": 3.4746501445770264, + "learning_rate": 1.9497607655502393e-05, + "loss": 1.5862, + "step": 521 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 3.19508957862854, + "learning_rate": 1.9437799043062202e-05, + "loss": 1.6315, + "step": 522 + }, + { + "epoch": 1.2364066193853427, + "grad_norm": 3.50010347366333, + "learning_rate": 1.9377990430622012e-05, + "loss": 1.6374, + "step": 523 + }, + { + "epoch": 1.2387706855791962, + "grad_norm": 3.509312391281128, + "learning_rate": 1.9318181818181818e-05, + "loss": 1.6422, + "step": 524 + }, + { + "epoch": 1.2411347517730495, + "grad_norm": 3.383615732192993, + "learning_rate": 1.9258373205741628e-05, + "loss": 1.6133, + "step": 525 + }, + { + "epoch": 1.243498817966903, + "grad_norm": 3.5364954471588135, + "learning_rate": 1.9198564593301434e-05, + "loss": 1.6137, + "step": 526 + }, + { + "epoch": 1.2458628841607564, + "grad_norm": 3.60488224029541, + "learning_rate": 1.9138755980861243e-05, + "loss": 1.6931, + "step": 527 + }, + { + "epoch": 1.24822695035461, + "grad_norm": 3.1032845973968506, + "learning_rate": 1.9078947368421056e-05, + "loss": 1.6491, + "step": 528 + }, + { + "epoch": 1.2505910165484633, + "grad_norm": 3.539658784866333, + "learning_rate": 1.9019138755980862e-05, + "loss": 1.5981, + "step": 529 + }, + { + "epoch": 1.2529550827423168, + "grad_norm": 3.2954676151275635, + "learning_rate": 1.8959330143540672e-05, + "loss": 1.702, + "step": 530 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 3.566760540008545, + "learning_rate": 1.8899521531100478e-05, + "loss": 1.7553, + "step": 531 + }, + { + "epoch": 1.2576832151300237, + "grad_norm": 3.6372859477996826, + "learning_rate": 1.8839712918660287e-05, + "loss": 1.6513, + "step": 532 + }, + { + "epoch": 1.260047281323877, + "grad_norm": 3.449582576751709, + "learning_rate": 1.8779904306220097e-05, + "loss": 1.6247, + "step": 533 + }, + { + "epoch": 1.2624113475177305, + "grad_norm": 3.4251697063446045, + "learning_rate": 1.8720095693779903e-05, + "loss": 1.6476, + "step": 534 + }, + { + "epoch": 1.2647754137115839, + "grad_norm": 3.401045799255371, + "learning_rate": 1.8660287081339713e-05, + "loss": 1.6616, + "step": 535 + }, + { + "epoch": 1.2671394799054374, + "grad_norm": 3.2664437294006348, + "learning_rate": 1.8600478468899522e-05, + "loss": 1.5472, + "step": 536 + }, + { + "epoch": 1.2695035460992907, + "grad_norm": 3.605675458908081, + "learning_rate": 1.8540669856459332e-05, + "loss": 1.673, + "step": 537 + }, + { + "epoch": 1.2718676122931443, + "grad_norm": 3.743053674697876, + "learning_rate": 1.848086124401914e-05, + "loss": 1.7022, + "step": 538 + }, + { + "epoch": 1.2742316784869976, + "grad_norm": 3.6139564514160156, + "learning_rate": 1.8421052631578947e-05, + "loss": 1.7027, + "step": 539 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 3.537170648574829, + "learning_rate": 1.8361244019138757e-05, + "loss": 1.6479, + "step": 540 + }, + { + "epoch": 1.2789598108747045, + "grad_norm": 3.652331590652466, + "learning_rate": 1.8301435406698566e-05, + "loss": 1.6372, + "step": 541 + }, + { + "epoch": 1.281323877068558, + "grad_norm": 3.5538816452026367, + "learning_rate": 1.8241626794258373e-05, + "loss": 1.6375, + "step": 542 + }, + { + "epoch": 1.2836879432624113, + "grad_norm": 3.6458072662353516, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.6305, + "step": 543 + }, + { + "epoch": 1.2860520094562649, + "grad_norm": 3.471625328063965, + "learning_rate": 1.812200956937799e-05, + "loss": 1.5957, + "step": 544 + }, + { + "epoch": 1.2884160756501182, + "grad_norm": 3.323765516281128, + "learning_rate": 1.80622009569378e-05, + "loss": 1.5751, + "step": 545 + }, + { + "epoch": 1.2907801418439715, + "grad_norm": 3.2532002925872803, + "learning_rate": 1.800239234449761e-05, + "loss": 1.5689, + "step": 546 + }, + { + "epoch": 1.293144208037825, + "grad_norm": 3.2593157291412354, + "learning_rate": 1.7942583732057417e-05, + "loss": 1.5601, + "step": 547 + }, + { + "epoch": 1.2955082742316786, + "grad_norm": 3.3987958431243896, + "learning_rate": 1.7882775119617226e-05, + "loss": 1.6927, + "step": 548 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 3.6468160152435303, + "learning_rate": 1.7822966507177032e-05, + "loss": 1.6122, + "step": 549 + }, + { + "epoch": 1.3002364066193852, + "grad_norm": 3.36958646774292, + "learning_rate": 1.7763157894736842e-05, + "loss": 1.702, + "step": 550 + }, + { + "epoch": 1.3026004728132388, + "grad_norm": 3.3941123485565186, + "learning_rate": 1.770334928229665e-05, + "loss": 1.6855, + "step": 551 + }, + { + "epoch": 1.3049645390070923, + "grad_norm": 3.241328001022339, + "learning_rate": 1.7643540669856458e-05, + "loss": 1.6427, + "step": 552 + }, + { + "epoch": 1.3073286052009456, + "grad_norm": 3.470787763595581, + "learning_rate": 1.758373205741627e-05, + "loss": 1.6664, + "step": 553 + }, + { + "epoch": 1.309692671394799, + "grad_norm": 3.4991133213043213, + "learning_rate": 1.752392344497608e-05, + "loss": 1.6903, + "step": 554 + }, + { + "epoch": 1.3120567375886525, + "grad_norm": 3.3299062252044678, + "learning_rate": 1.7464114832535886e-05, + "loss": 1.6424, + "step": 555 + }, + { + "epoch": 1.314420803782506, + "grad_norm": 3.72178316116333, + "learning_rate": 1.7404306220095696e-05, + "loss": 1.6782, + "step": 556 + }, + { + "epoch": 1.3167848699763594, + "grad_norm": 3.3736157417297363, + "learning_rate": 1.7344497607655502e-05, + "loss": 1.6784, + "step": 557 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 3.2792179584503174, + "learning_rate": 1.728468899521531e-05, + "loss": 1.6388, + "step": 558 + }, + { + "epoch": 1.3215130023640662, + "grad_norm": 3.2111778259277344, + "learning_rate": 1.722488038277512e-05, + "loss": 1.7016, + "step": 559 + }, + { + "epoch": 1.3238770685579198, + "grad_norm": 3.1302785873413086, + "learning_rate": 1.7165071770334927e-05, + "loss": 1.6088, + "step": 560 + }, + { + "epoch": 1.326241134751773, + "grad_norm": 3.296743869781494, + "learning_rate": 1.7105263157894737e-05, + "loss": 1.6173, + "step": 561 + }, + { + "epoch": 1.3286052009456264, + "grad_norm": 3.36970853805542, + "learning_rate": 1.7045454545454546e-05, + "loss": 1.6424, + "step": 562 + }, + { + "epoch": 1.33096926713948, + "grad_norm": 3.7065136432647705, + "learning_rate": 1.6985645933014356e-05, + "loss": 1.6204, + "step": 563 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 3.59222412109375, + "learning_rate": 1.6925837320574165e-05, + "loss": 1.6838, + "step": 564 + }, + { + "epoch": 1.3356973995271868, + "grad_norm": 3.18104887008667, + "learning_rate": 1.686602870813397e-05, + "loss": 1.6503, + "step": 565 + }, + { + "epoch": 1.3380614657210401, + "grad_norm": 3.9115757942199707, + "learning_rate": 1.680622009569378e-05, + "loss": 1.6485, + "step": 566 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 3.2171802520751953, + "learning_rate": 1.674641148325359e-05, + "loss": 1.6396, + "step": 567 + }, + { + "epoch": 1.342789598108747, + "grad_norm": 3.288642406463623, + "learning_rate": 1.6686602870813396e-05, + "loss": 1.6966, + "step": 568 + }, + { + "epoch": 1.3451536643026005, + "grad_norm": 3.4783761501312256, + "learning_rate": 1.6626794258373206e-05, + "loss": 1.6339, + "step": 569 + }, + { + "epoch": 1.3475177304964538, + "grad_norm": 3.4193568229675293, + "learning_rate": 1.6566985645933016e-05, + "loss": 1.666, + "step": 570 + }, + { + "epoch": 1.3498817966903074, + "grad_norm": 3.327106237411499, + "learning_rate": 1.6507177033492825e-05, + "loss": 1.6518, + "step": 571 + }, + { + "epoch": 1.3522458628841607, + "grad_norm": 3.579259157180786, + "learning_rate": 1.6447368421052635e-05, + "loss": 1.6352, + "step": 572 + }, + { + "epoch": 1.3546099290780143, + "grad_norm": 3.3811655044555664, + "learning_rate": 1.638755980861244e-05, + "loss": 1.6311, + "step": 573 + }, + { + "epoch": 1.3569739952718676, + "grad_norm": 3.1911275386810303, + "learning_rate": 1.632775119617225e-05, + "loss": 1.6124, + "step": 574 + }, + { + "epoch": 1.3593380614657211, + "grad_norm": 3.3752293586730957, + "learning_rate": 1.6267942583732056e-05, + "loss": 1.6206, + "step": 575 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 3.6627418994903564, + "learning_rate": 1.6208133971291866e-05, + "loss": 1.568, + "step": 576 + }, + { + "epoch": 1.364066193853428, + "grad_norm": 3.476933717727661, + "learning_rate": 1.6148325358851675e-05, + "loss": 1.6019, + "step": 577 + }, + { + "epoch": 1.3664302600472813, + "grad_norm": 3.566840648651123, + "learning_rate": 1.6088516746411485e-05, + "loss": 1.6348, + "step": 578 + }, + { + "epoch": 1.3687943262411348, + "grad_norm": 3.4907350540161133, + "learning_rate": 1.6028708133971294e-05, + "loss": 1.6803, + "step": 579 + }, + { + "epoch": 1.3711583924349882, + "grad_norm": 3.323463201522827, + "learning_rate": 1.59688995215311e-05, + "loss": 1.6502, + "step": 580 + }, + { + "epoch": 1.3735224586288415, + "grad_norm": 3.230318069458008, + "learning_rate": 1.590909090909091e-05, + "loss": 1.5683, + "step": 581 + }, + { + "epoch": 1.375886524822695, + "grad_norm": 3.3330845832824707, + "learning_rate": 1.584928229665072e-05, + "loss": 1.6215, + "step": 582 + }, + { + "epoch": 1.3782505910165486, + "grad_norm": 3.4185128211975098, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.6787, + "step": 583 + }, + { + "epoch": 1.3806146572104019, + "grad_norm": 3.7025232315063477, + "learning_rate": 1.5729665071770335e-05, + "loss": 1.6466, + "step": 584 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 3.770089626312256, + "learning_rate": 1.5669856459330145e-05, + "loss": 1.673, + "step": 585 + }, + { + "epoch": 1.3853427895981087, + "grad_norm": 3.472599506378174, + "learning_rate": 1.561004784688995e-05, + "loss": 1.6181, + "step": 586 + }, + { + "epoch": 1.3877068557919623, + "grad_norm": 3.397357702255249, + "learning_rate": 1.555023923444976e-05, + "loss": 1.6728, + "step": 587 + }, + { + "epoch": 1.3900709219858156, + "grad_norm": 3.5820982456207275, + "learning_rate": 1.549043062200957e-05, + "loss": 1.6025, + "step": 588 + }, + { + "epoch": 1.392434988179669, + "grad_norm": 3.512296438217163, + "learning_rate": 1.543062200956938e-05, + "loss": 1.6413, + "step": 589 + }, + { + "epoch": 1.3947990543735225, + "grad_norm": 3.4280846118927, + "learning_rate": 1.537081339712919e-05, + "loss": 1.6388, + "step": 590 + }, + { + "epoch": 1.397163120567376, + "grad_norm": 3.3687398433685303, + "learning_rate": 1.5311004784688995e-05, + "loss": 1.651, + "step": 591 + }, + { + "epoch": 1.3995271867612293, + "grad_norm": 3.170372486114502, + "learning_rate": 1.5251196172248805e-05, + "loss": 1.6449, + "step": 592 + }, + { + "epoch": 1.4018912529550827, + "grad_norm": 3.245079755783081, + "learning_rate": 1.5191387559808613e-05, + "loss": 1.6725, + "step": 593 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 3.2267603874206543, + "learning_rate": 1.5131578947368422e-05, + "loss": 1.6259, + "step": 594 + }, + { + "epoch": 1.4066193853427895, + "grad_norm": 3.304009199142456, + "learning_rate": 1.5071770334928232e-05, + "loss": 1.6362, + "step": 595 + }, + { + "epoch": 1.408983451536643, + "grad_norm": 3.2211010456085205, + "learning_rate": 1.5011961722488038e-05, + "loss": 1.6423, + "step": 596 + }, + { + "epoch": 1.4113475177304964, + "grad_norm": 3.5229361057281494, + "learning_rate": 1.4952153110047847e-05, + "loss": 1.604, + "step": 597 + }, + { + "epoch": 1.41371158392435, + "grad_norm": 3.3903708457946777, + "learning_rate": 1.4892344497607657e-05, + "loss": 1.54, + "step": 598 + }, + { + "epoch": 1.4160756501182032, + "grad_norm": 3.6783154010772705, + "learning_rate": 1.4832535885167465e-05, + "loss": 1.6891, + "step": 599 + }, + { + "epoch": 1.4184397163120568, + "grad_norm": 3.3614747524261475, + "learning_rate": 1.4772727272727274e-05, + "loss": 1.5927, + "step": 600 + }, + { + "epoch": 1.42080378250591, + "grad_norm": 3.4144439697265625, + "learning_rate": 1.471291866028708e-05, + "loss": 1.5759, + "step": 601 + }, + { + "epoch": 1.4231678486997636, + "grad_norm": 3.5212013721466064, + "learning_rate": 1.4653110047846892e-05, + "loss": 1.5951, + "step": 602 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 3.3338160514831543, + "learning_rate": 1.4593301435406701e-05, + "loss": 1.5867, + "step": 603 + }, + { + "epoch": 1.4278959810874705, + "grad_norm": 3.3701469898223877, + "learning_rate": 1.4533492822966507e-05, + "loss": 1.6722, + "step": 604 + }, + { + "epoch": 1.4302600472813238, + "grad_norm": 3.4370622634887695, + "learning_rate": 1.4473684210526317e-05, + "loss": 1.6592, + "step": 605 + }, + { + "epoch": 1.4326241134751774, + "grad_norm": 3.3094122409820557, + "learning_rate": 1.4413875598086125e-05, + "loss": 1.6039, + "step": 606 + }, + { + "epoch": 1.4349881796690307, + "grad_norm": 3.21444034576416, + "learning_rate": 1.4354066985645934e-05, + "loss": 1.6889, + "step": 607 + }, + { + "epoch": 1.4373522458628842, + "grad_norm": 3.4152231216430664, + "learning_rate": 1.4294258373205744e-05, + "loss": 1.6215, + "step": 608 + }, + { + "epoch": 1.4397163120567376, + "grad_norm": 3.1800973415374756, + "learning_rate": 1.423444976076555e-05, + "loss": 1.5442, + "step": 609 + }, + { + "epoch": 1.442080378250591, + "grad_norm": 3.6048243045806885, + "learning_rate": 1.417464114832536e-05, + "loss": 1.6962, + "step": 610 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 3.259493589401245, + "learning_rate": 1.4114832535885167e-05, + "loss": 1.5805, + "step": 611 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 3.352950096130371, + "learning_rate": 1.4055023923444977e-05, + "loss": 1.5834, + "step": 612 + }, + { + "epoch": 1.4491725768321513, + "grad_norm": 3.435502529144287, + "learning_rate": 1.3995215311004786e-05, + "loss": 1.6109, + "step": 613 + }, + { + "epoch": 1.4515366430260048, + "grad_norm": 3.175809621810913, + "learning_rate": 1.3935406698564592e-05, + "loss": 1.5312, + "step": 614 + }, + { + "epoch": 1.4539007092198581, + "grad_norm": 3.328200340270996, + "learning_rate": 1.3875598086124404e-05, + "loss": 1.5559, + "step": 615 + }, + { + "epoch": 1.4562647754137115, + "grad_norm": 3.227952480316162, + "learning_rate": 1.3815789473684213e-05, + "loss": 1.5322, + "step": 616 + }, + { + "epoch": 1.458628841607565, + "grad_norm": 3.272395610809326, + "learning_rate": 1.375598086124402e-05, + "loss": 1.6339, + "step": 617 + }, + { + "epoch": 1.4609929078014185, + "grad_norm": 3.417051315307617, + "learning_rate": 1.3696172248803829e-05, + "loss": 1.6676, + "step": 618 + }, + { + "epoch": 1.4633569739952719, + "grad_norm": 3.299630641937256, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.6343, + "step": 619 + }, + { + "epoch": 1.4657210401891252, + "grad_norm": 3.3762664794921875, + "learning_rate": 1.3576555023923446e-05, + "loss": 1.5982, + "step": 620 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 2.958192825317383, + "learning_rate": 1.3516746411483256e-05, + "loss": 1.598, + "step": 621 + }, + { + "epoch": 1.4704491725768323, + "grad_norm": 3.270052671432495, + "learning_rate": 1.3456937799043062e-05, + "loss": 1.7025, + "step": 622 + }, + { + "epoch": 1.4728132387706856, + "grad_norm": 3.4359071254730225, + "learning_rate": 1.3397129186602871e-05, + "loss": 1.6344, + "step": 623 + }, + { + "epoch": 1.475177304964539, + "grad_norm": 3.2563626766204834, + "learning_rate": 1.3337320574162679e-05, + "loss": 1.6154, + "step": 624 + }, + { + "epoch": 1.4775413711583925, + "grad_norm": 3.3198134899139404, + "learning_rate": 1.3277511961722489e-05, + "loss": 1.5953, + "step": 625 + }, + { + "epoch": 1.479905437352246, + "grad_norm": 3.1364660263061523, + "learning_rate": 1.3217703349282298e-05, + "loss": 1.572, + "step": 626 + }, + { + "epoch": 1.4822695035460993, + "grad_norm": 3.346433162689209, + "learning_rate": 1.3157894736842106e-05, + "loss": 1.6064, + "step": 627 + }, + { + "epoch": 1.4846335697399526, + "grad_norm": 3.1672523021698, + "learning_rate": 1.3098086124401916e-05, + "loss": 1.6319, + "step": 628 + }, + { + "epoch": 1.4869976359338062, + "grad_norm": 3.061640977859497, + "learning_rate": 1.3038277511961722e-05, + "loss": 1.6271, + "step": 629 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 3.298517942428589, + "learning_rate": 1.2978468899521531e-05, + "loss": 1.5318, + "step": 630 + }, + { + "epoch": 1.491725768321513, + "grad_norm": 3.5346083641052246, + "learning_rate": 1.291866028708134e-05, + "loss": 1.6172, + "step": 631 + }, + { + "epoch": 1.4940898345153664, + "grad_norm": 3.271876335144043, + "learning_rate": 1.2858851674641149e-05, + "loss": 1.6122, + "step": 632 + }, + { + "epoch": 1.49645390070922, + "grad_norm": 3.443845748901367, + "learning_rate": 1.2799043062200958e-05, + "loss": 1.6147, + "step": 633 + }, + { + "epoch": 1.4988179669030732, + "grad_norm": 3.336913585662842, + "learning_rate": 1.2739234449760768e-05, + "loss": 1.5498, + "step": 634 + }, + { + "epoch": 1.5011820330969265, + "grad_norm": 3.355677366256714, + "learning_rate": 1.2679425837320574e-05, + "loss": 1.619, + "step": 635 + }, + { + "epoch": 1.50354609929078, + "grad_norm": 3.250521421432495, + "learning_rate": 1.2619617224880383e-05, + "loss": 1.5994, + "step": 636 + }, + { + "epoch": 1.5059101654846336, + "grad_norm": 3.4614198207855225, + "learning_rate": 1.2559808612440191e-05, + "loss": 1.6519, + "step": 637 + }, + { + "epoch": 1.508274231678487, + "grad_norm": 3.2225258350372314, + "learning_rate": 1.25e-05, + "loss": 1.678, + "step": 638 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 3.3898487091064453, + "learning_rate": 1.2440191387559808e-05, + "loss": 1.6185, + "step": 639 + }, + { + "epoch": 1.5130023640661938, + "grad_norm": 3.226099967956543, + "learning_rate": 1.2380382775119618e-05, + "loss": 1.5081, + "step": 640 + }, + { + "epoch": 1.5153664302600474, + "grad_norm": 3.1962661743164062, + "learning_rate": 1.2320574162679427e-05, + "loss": 1.645, + "step": 641 + }, + { + "epoch": 1.5177304964539007, + "grad_norm": 3.3228158950805664, + "learning_rate": 1.2260765550239235e-05, + "loss": 1.6331, + "step": 642 + }, + { + "epoch": 1.520094562647754, + "grad_norm": 3.6790809631347656, + "learning_rate": 1.2200956937799043e-05, + "loss": 1.5544, + "step": 643 + }, + { + "epoch": 1.5224586288416075, + "grad_norm": 3.375044822692871, + "learning_rate": 1.2141148325358851e-05, + "loss": 1.6076, + "step": 644 + }, + { + "epoch": 1.524822695035461, + "grad_norm": 3.324845790863037, + "learning_rate": 1.2081339712918662e-05, + "loss": 1.5041, + "step": 645 + }, + { + "epoch": 1.5271867612293144, + "grad_norm": 3.6021666526794434, + "learning_rate": 1.202153110047847e-05, + "loss": 1.6022, + "step": 646 + }, + { + "epoch": 1.5295508274231677, + "grad_norm": 3.412616729736328, + "learning_rate": 1.1961722488038278e-05, + "loss": 1.5649, + "step": 647 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 3.1025540828704834, + "learning_rate": 1.1901913875598086e-05, + "loss": 1.5535, + "step": 648 + }, + { + "epoch": 1.5342789598108748, + "grad_norm": 3.1715688705444336, + "learning_rate": 1.1842105263157895e-05, + "loss": 1.5612, + "step": 649 + }, + { + "epoch": 1.5366430260047281, + "grad_norm": 3.3327136039733887, + "learning_rate": 1.1782296650717705e-05, + "loss": 1.602, + "step": 650 + }, + { + "epoch": 1.5390070921985815, + "grad_norm": 3.278064727783203, + "learning_rate": 1.1722488038277513e-05, + "loss": 1.62, + "step": 651 + }, + { + "epoch": 1.541371158392435, + "grad_norm": 3.3567280769348145, + "learning_rate": 1.166267942583732e-05, + "loss": 1.6035, + "step": 652 + }, + { + "epoch": 1.5437352245862885, + "grad_norm": 3.2232353687286377, + "learning_rate": 1.160287081339713e-05, + "loss": 1.569, + "step": 653 + }, + { + "epoch": 1.5460992907801419, + "grad_norm": 3.1278183460235596, + "learning_rate": 1.154306220095694e-05, + "loss": 1.6563, + "step": 654 + }, + { + "epoch": 1.5484633569739952, + "grad_norm": 3.2195639610290527, + "learning_rate": 1.1483253588516747e-05, + "loss": 1.5537, + "step": 655 + }, + { + "epoch": 1.5508274231678487, + "grad_norm": 3.479424238204956, + "learning_rate": 1.1423444976076555e-05, + "loss": 1.6301, + "step": 656 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 3.478548765182495, + "learning_rate": 1.1363636363636365e-05, + "loss": 1.6581, + "step": 657 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 3.495012044906616, + "learning_rate": 1.1303827751196172e-05, + "loss": 1.6673, + "step": 658 + }, + { + "epoch": 1.557919621749409, + "grad_norm": 3.327958822250366, + "learning_rate": 1.1244019138755982e-05, + "loss": 1.6435, + "step": 659 + }, + { + "epoch": 1.5602836879432624, + "grad_norm": 3.364741325378418, + "learning_rate": 1.118421052631579e-05, + "loss": 1.6802, + "step": 660 + }, + { + "epoch": 1.562647754137116, + "grad_norm": 3.15824031829834, + "learning_rate": 1.1124401913875598e-05, + "loss": 1.6495, + "step": 661 + }, + { + "epoch": 1.5650118203309693, + "grad_norm": 3.103550910949707, + "learning_rate": 1.1064593301435407e-05, + "loss": 1.6271, + "step": 662 + }, + { + "epoch": 1.5673758865248226, + "grad_norm": 3.3356754779815674, + "learning_rate": 1.1004784688995217e-05, + "loss": 1.5932, + "step": 663 + }, + { + "epoch": 1.5697399527186762, + "grad_norm": 3.3036398887634277, + "learning_rate": 1.0944976076555025e-05, + "loss": 1.6617, + "step": 664 + }, + { + "epoch": 1.5721040189125297, + "grad_norm": 3.20703387260437, + "learning_rate": 1.0885167464114832e-05, + "loss": 1.6287, + "step": 665 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 3.3921687602996826, + "learning_rate": 1.0825358851674642e-05, + "loss": 1.5584, + "step": 666 + }, + { + "epoch": 1.5768321513002364, + "grad_norm": 3.2371623516082764, + "learning_rate": 1.0765550239234451e-05, + "loss": 1.5664, + "step": 667 + }, + { + "epoch": 1.57919621749409, + "grad_norm": 3.1653621196746826, + "learning_rate": 1.070574162679426e-05, + "loss": 1.5695, + "step": 668 + }, + { + "epoch": 1.5815602836879432, + "grad_norm": 3.467496871948242, + "learning_rate": 1.0645933014354067e-05, + "loss": 1.5485, + "step": 669 + }, + { + "epoch": 1.5839243498817965, + "grad_norm": 3.112905263900757, + "learning_rate": 1.0586124401913877e-05, + "loss": 1.5365, + "step": 670 + }, + { + "epoch": 1.58628841607565, + "grad_norm": 3.1907830238342285, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5456, + "step": 671 + }, + { + "epoch": 1.5886524822695036, + "grad_norm": 3.382817268371582, + "learning_rate": 1.0466507177033494e-05, + "loss": 1.6402, + "step": 672 + }, + { + "epoch": 1.591016548463357, + "grad_norm": 3.159687042236328, + "learning_rate": 1.0406698564593302e-05, + "loss": 1.5578, + "step": 673 + }, + { + "epoch": 1.5933806146572103, + "grad_norm": 3.244880199432373, + "learning_rate": 1.034688995215311e-05, + "loss": 1.6561, + "step": 674 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 2.9089481830596924, + "learning_rate": 1.028708133971292e-05, + "loss": 1.5732, + "step": 675 + }, + { + "epoch": 1.5981087470449173, + "grad_norm": 3.061993360519409, + "learning_rate": 1.0227272727272729e-05, + "loss": 1.5751, + "step": 676 + }, + { + "epoch": 1.6004728132387707, + "grad_norm": 3.3294806480407715, + "learning_rate": 1.0167464114832537e-05, + "loss": 1.6372, + "step": 677 + }, + { + "epoch": 1.602836879432624, + "grad_norm": 3.1197781562805176, + "learning_rate": 1.0107655502392344e-05, + "loss": 1.6208, + "step": 678 + }, + { + "epoch": 1.6052009456264775, + "grad_norm": 3.3565282821655273, + "learning_rate": 1.0047846889952154e-05, + "loss": 1.6708, + "step": 679 + }, + { + "epoch": 1.607565011820331, + "grad_norm": 3.3783833980560303, + "learning_rate": 9.988038277511962e-06, + "loss": 1.6521, + "step": 680 + }, + { + "epoch": 1.6099290780141844, + "grad_norm": 3.4118947982788086, + "learning_rate": 9.928229665071771e-06, + "loss": 1.5684, + "step": 681 + }, + { + "epoch": 1.6122931442080377, + "grad_norm": 3.194164752960205, + "learning_rate": 9.868421052631579e-06, + "loss": 1.5699, + "step": 682 + }, + { + "epoch": 1.6146572104018913, + "grad_norm": 3.446110725402832, + "learning_rate": 9.808612440191389e-06, + "loss": 1.5395, + "step": 683 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 3.4659085273742676, + "learning_rate": 9.748803827751196e-06, + "loss": 1.5732, + "step": 684 + }, + { + "epoch": 1.6193853427895981, + "grad_norm": 3.388087749481201, + "learning_rate": 9.688995215311006e-06, + "loss": 1.6181, + "step": 685 + }, + { + "epoch": 1.6217494089834514, + "grad_norm": 3.297330379486084, + "learning_rate": 9.629186602870814e-06, + "loss": 1.6206, + "step": 686 + }, + { + "epoch": 1.624113475177305, + "grad_norm": 3.3777146339416504, + "learning_rate": 9.569377990430622e-06, + "loss": 1.6021, + "step": 687 + }, + { + "epoch": 1.6264775413711585, + "grad_norm": 3.2018673419952393, + "learning_rate": 9.509569377990431e-06, + "loss": 1.5968, + "step": 688 + }, + { + "epoch": 1.6288416075650118, + "grad_norm": 3.416879892349243, + "learning_rate": 9.449760765550239e-06, + "loss": 1.6182, + "step": 689 + }, + { + "epoch": 1.6312056737588652, + "grad_norm": 3.445861339569092, + "learning_rate": 9.389952153110048e-06, + "loss": 1.7168, + "step": 690 + }, + { + "epoch": 1.6335697399527187, + "grad_norm": 3.3097856044769287, + "learning_rate": 9.330143540669856e-06, + "loss": 1.617, + "step": 691 + }, + { + "epoch": 1.6359338061465722, + "grad_norm": 3.3081119060516357, + "learning_rate": 9.270334928229666e-06, + "loss": 1.6162, + "step": 692 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 3.2553486824035645, + "learning_rate": 9.210526315789474e-06, + "loss": 1.5849, + "step": 693 + }, + { + "epoch": 1.6406619385342789, + "grad_norm": 3.4111063480377197, + "learning_rate": 9.150717703349283e-06, + "loss": 1.5933, + "step": 694 + }, + { + "epoch": 1.6430260047281324, + "grad_norm": 3.365490436553955, + "learning_rate": 9.090909090909091e-06, + "loss": 1.694, + "step": 695 + }, + { + "epoch": 1.645390070921986, + "grad_norm": 3.191035032272339, + "learning_rate": 9.0311004784689e-06, + "loss": 1.5617, + "step": 696 + }, + { + "epoch": 1.6477541371158393, + "grad_norm": 3.4338693618774414, + "learning_rate": 8.971291866028708e-06, + "loss": 1.6369, + "step": 697 + }, + { + "epoch": 1.6501182033096926, + "grad_norm": 3.3550338745117188, + "learning_rate": 8.911483253588516e-06, + "loss": 1.6562, + "step": 698 + }, + { + "epoch": 1.6524822695035462, + "grad_norm": 3.4408254623413086, + "learning_rate": 8.851674641148326e-06, + "loss": 1.5803, + "step": 699 + }, + { + "epoch": 1.6548463356973995, + "grad_norm": 3.332296371459961, + "learning_rate": 8.791866028708135e-06, + "loss": 1.6199, + "step": 700 + }, + { + "epoch": 1.6572104018912528, + "grad_norm": 3.1053788661956787, + "learning_rate": 8.732057416267943e-06, + "loss": 1.5932, + "step": 701 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 3.337559700012207, + "learning_rate": 8.672248803827751e-06, + "loss": 1.6057, + "step": 702 + }, + { + "epoch": 1.6619385342789599, + "grad_norm": 3.354663372039795, + "learning_rate": 8.61244019138756e-06, + "loss": 1.6222, + "step": 703 + }, + { + "epoch": 1.6643026004728132, + "grad_norm": 3.2696797847747803, + "learning_rate": 8.552631578947368e-06, + "loss": 1.6551, + "step": 704 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 3.4090421199798584, + "learning_rate": 8.492822966507178e-06, + "loss": 1.6064, + "step": 705 + }, + { + "epoch": 1.66903073286052, + "grad_norm": 3.2877285480499268, + "learning_rate": 8.433014354066986e-06, + "loss": 1.5979, + "step": 706 + }, + { + "epoch": 1.6713947990543736, + "grad_norm": 3.5218770503997803, + "learning_rate": 8.373205741626795e-06, + "loss": 1.6531, + "step": 707 + }, + { + "epoch": 1.673758865248227, + "grad_norm": 3.3162100315093994, + "learning_rate": 8.313397129186603e-06, + "loss": 1.6029, + "step": 708 + }, + { + "epoch": 1.6761229314420802, + "grad_norm": 3.282989978790283, + "learning_rate": 8.253588516746413e-06, + "loss": 1.6072, + "step": 709 + }, + { + "epoch": 1.6784869976359338, + "grad_norm": 3.0652577877044678, + "learning_rate": 8.19377990430622e-06, + "loss": 1.5495, + "step": 710 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 3.1217284202575684, + "learning_rate": 8.133971291866028e-06, + "loss": 1.5687, + "step": 711 + }, + { + "epoch": 1.6832151300236406, + "grad_norm": 3.3797202110290527, + "learning_rate": 8.074162679425838e-06, + "loss": 1.6669, + "step": 712 + }, + { + "epoch": 1.685579196217494, + "grad_norm": 3.2859535217285156, + "learning_rate": 8.014354066985647e-06, + "loss": 1.6696, + "step": 713 + }, + { + "epoch": 1.6879432624113475, + "grad_norm": 3.3486392498016357, + "learning_rate": 7.954545454545455e-06, + "loss": 1.5806, + "step": 714 + }, + { + "epoch": 1.690307328605201, + "grad_norm": 3.3833365440368652, + "learning_rate": 7.894736842105263e-06, + "loss": 1.5656, + "step": 715 + }, + { + "epoch": 1.6926713947990544, + "grad_norm": 3.4777348041534424, + "learning_rate": 7.834928229665072e-06, + "loss": 1.6621, + "step": 716 + }, + { + "epoch": 1.6950354609929077, + "grad_norm": 3.3407208919525146, + "learning_rate": 7.77511961722488e-06, + "loss": 1.6283, + "step": 717 + }, + { + "epoch": 1.6973995271867612, + "grad_norm": 3.4209749698638916, + "learning_rate": 7.71531100478469e-06, + "loss": 1.6341, + "step": 718 + }, + { + "epoch": 1.6997635933806148, + "grad_norm": 3.4124741554260254, + "learning_rate": 7.655502392344498e-06, + "loss": 1.5899, + "step": 719 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 3.362297296524048, + "learning_rate": 7.595693779904306e-06, + "loss": 1.6455, + "step": 720 + }, + { + "epoch": 1.7044917257683214, + "grad_norm": 3.364933967590332, + "learning_rate": 7.535885167464116e-06, + "loss": 1.6634, + "step": 721 + }, + { + "epoch": 1.706855791962175, + "grad_norm": 3.102181911468506, + "learning_rate": 7.476076555023924e-06, + "loss": 1.5939, + "step": 722 + }, + { + "epoch": 1.7092198581560285, + "grad_norm": 3.2613308429718018, + "learning_rate": 7.416267942583732e-06, + "loss": 1.6737, + "step": 723 + }, + { + "epoch": 1.7115839243498818, + "grad_norm": 3.2267072200775146, + "learning_rate": 7.35645933014354e-06, + "loss": 1.6096, + "step": 724 + }, + { + "epoch": 1.7139479905437351, + "grad_norm": 3.3645620346069336, + "learning_rate": 7.2966507177033505e-06, + "loss": 1.5925, + "step": 725 + }, + { + "epoch": 1.7163120567375887, + "grad_norm": 3.501962184906006, + "learning_rate": 7.236842105263158e-06, + "loss": 1.6251, + "step": 726 + }, + { + "epoch": 1.7186761229314422, + "grad_norm": 3.2537877559661865, + "learning_rate": 7.177033492822967e-06, + "loss": 1.6472, + "step": 727 + }, + { + "epoch": 1.7210401891252955, + "grad_norm": 3.2607951164245605, + "learning_rate": 7.117224880382775e-06, + "loss": 1.5956, + "step": 728 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 3.262928009033203, + "learning_rate": 7.0574162679425836e-06, + "loss": 1.6301, + "step": 729 + }, + { + "epoch": 1.7257683215130024, + "grad_norm": 3.369541883468628, + "learning_rate": 6.997607655502393e-06, + "loss": 1.6375, + "step": 730 + }, + { + "epoch": 1.728132387706856, + "grad_norm": 3.094522714614868, + "learning_rate": 6.937799043062202e-06, + "loss": 1.6489, + "step": 731 + }, + { + "epoch": 1.7304964539007093, + "grad_norm": 3.576575517654419, + "learning_rate": 6.87799043062201e-06, + "loss": 1.5887, + "step": 732 + }, + { + "epoch": 1.7328605200945626, + "grad_norm": 3.258538246154785, + "learning_rate": 6.818181818181818e-06, + "loss": 1.605, + "step": 733 + }, + { + "epoch": 1.7352245862884161, + "grad_norm": 3.1668145656585693, + "learning_rate": 6.758373205741628e-06, + "loss": 1.5805, + "step": 734 + }, + { + "epoch": 1.7375886524822695, + "grad_norm": 3.51381778717041, + "learning_rate": 6.698564593301436e-06, + "loss": 1.5987, + "step": 735 + }, + { + "epoch": 1.7399527186761228, + "grad_norm": 3.3210718631744385, + "learning_rate": 6.638755980861244e-06, + "loss": 1.5751, + "step": 736 + }, + { + "epoch": 1.7423167848699763, + "grad_norm": 3.149791717529297, + "learning_rate": 6.578947368421053e-06, + "loss": 1.5559, + "step": 737 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.152360200881958, + "learning_rate": 6.519138755980861e-06, + "loss": 1.4452, + "step": 738 + }, + { + "epoch": 1.7470449172576832, + "grad_norm": 3.0398778915405273, + "learning_rate": 6.45933014354067e-06, + "loss": 1.5646, + "step": 739 + }, + { + "epoch": 1.7494089834515365, + "grad_norm": 3.1679794788360596, + "learning_rate": 6.399521531100479e-06, + "loss": 1.5783, + "step": 740 + }, + { + "epoch": 1.75177304964539, + "grad_norm": 3.173020839691162, + "learning_rate": 6.339712918660287e-06, + "loss": 1.549, + "step": 741 + }, + { + "epoch": 1.7541371158392436, + "grad_norm": 3.451342821121216, + "learning_rate": 6.2799043062200955e-06, + "loss": 1.6184, + "step": 742 + }, + { + "epoch": 1.756501182033097, + "grad_norm": 3.26377010345459, + "learning_rate": 6.220095693779904e-06, + "loss": 1.6053, + "step": 743 + }, + { + "epoch": 1.7588652482269502, + "grad_norm": 3.3404619693756104, + "learning_rate": 6.160287081339714e-06, + "loss": 1.585, + "step": 744 + }, + { + "epoch": 1.7612293144208038, + "grad_norm": 3.3841774463653564, + "learning_rate": 6.1004784688995216e-06, + "loss": 1.5859, + "step": 745 + }, + { + "epoch": 1.7635933806146573, + "grad_norm": 3.4458372592926025, + "learning_rate": 6.040669856459331e-06, + "loss": 1.5913, + "step": 746 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 3.323127508163452, + "learning_rate": 5.980861244019139e-06, + "loss": 1.5518, + "step": 747 + }, + { + "epoch": 1.768321513002364, + "grad_norm": 3.858598232269287, + "learning_rate": 5.921052631578948e-06, + "loss": 1.6864, + "step": 748 + }, + { + "epoch": 1.7706855791962175, + "grad_norm": 2.968590259552002, + "learning_rate": 5.861244019138756e-06, + "loss": 1.6128, + "step": 749 + }, + { + "epoch": 1.773049645390071, + "grad_norm": 3.0373711585998535, + "learning_rate": 5.801435406698565e-06, + "loss": 1.543, + "step": 750 + }, + { + "epoch": 1.7754137115839244, + "grad_norm": 3.524451494216919, + "learning_rate": 5.741626794258374e-06, + "loss": 1.5866, + "step": 751 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 3.0094456672668457, + "learning_rate": 5.681818181818182e-06, + "loss": 1.5617, + "step": 752 + }, + { + "epoch": 1.7801418439716312, + "grad_norm": 3.0851380825042725, + "learning_rate": 5.622009569377991e-06, + "loss": 1.4814, + "step": 753 + }, + { + "epoch": 1.7825059101654848, + "grad_norm": 3.3271324634552, + "learning_rate": 5.562200956937799e-06, + "loss": 1.6402, + "step": 754 + }, + { + "epoch": 1.784869976359338, + "grad_norm": 3.0717716217041016, + "learning_rate": 5.502392344497608e-06, + "loss": 1.6037, + "step": 755 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 3.099236249923706, + "learning_rate": 5.442583732057416e-06, + "loss": 1.5689, + "step": 756 + }, + { + "epoch": 1.789598108747045, + "grad_norm": 3.1861002445220947, + "learning_rate": 5.382775119617226e-06, + "loss": 1.5622, + "step": 757 + }, + { + "epoch": 1.7919621749408985, + "grad_norm": 3.010728359222412, + "learning_rate": 5.3229665071770335e-06, + "loss": 1.6116, + "step": 758 + }, + { + "epoch": 1.7943262411347518, + "grad_norm": 3.0583670139312744, + "learning_rate": 5.263157894736842e-06, + "loss": 1.5781, + "step": 759 + }, + { + "epoch": 1.7966903073286051, + "grad_norm": 3.046917676925659, + "learning_rate": 5.203349282296651e-06, + "loss": 1.4681, + "step": 760 + }, + { + "epoch": 1.7990543735224587, + "grad_norm": 3.2571654319763184, + "learning_rate": 5.14354066985646e-06, + "loss": 1.6099, + "step": 761 + }, + { + "epoch": 1.8014184397163122, + "grad_norm": 3.372274160385132, + "learning_rate": 5.083732057416268e-06, + "loss": 1.5611, + "step": 762 + }, + { + "epoch": 1.8037825059101655, + "grad_norm": 3.326112747192383, + "learning_rate": 5.023923444976077e-06, + "loss": 1.653, + "step": 763 + }, + { + "epoch": 1.8061465721040189, + "grad_norm": 3.1222290992736816, + "learning_rate": 4.964114832535886e-06, + "loss": 1.5228, + "step": 764 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 3.0824484825134277, + "learning_rate": 4.904306220095694e-06, + "loss": 1.6368, + "step": 765 + }, + { + "epoch": 1.8108747044917257, + "grad_norm": 3.1424765586853027, + "learning_rate": 4.844497607655503e-06, + "loss": 1.5466, + "step": 766 + }, + { + "epoch": 1.813238770685579, + "grad_norm": 3.1939899921417236, + "learning_rate": 4.784688995215311e-06, + "loss": 1.6732, + "step": 767 + }, + { + "epoch": 1.8156028368794326, + "grad_norm": 3.301724672317505, + "learning_rate": 4.7248803827751195e-06, + "loss": 1.6021, + "step": 768 + }, + { + "epoch": 1.8179669030732861, + "grad_norm": 3.067897319793701, + "learning_rate": 4.665071770334928e-06, + "loss": 1.6226, + "step": 769 + }, + { + "epoch": 1.8203309692671394, + "grad_norm": 3.0447728633880615, + "learning_rate": 4.605263157894737e-06, + "loss": 1.6111, + "step": 770 + }, + { + "epoch": 1.8226950354609928, + "grad_norm": 3.2400856018066406, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.5478, + "step": 771 + }, + { + "epoch": 1.8250591016548463, + "grad_norm": 3.619302749633789, + "learning_rate": 4.485645933014354e-06, + "loss": 1.5489, + "step": 772 + }, + { + "epoch": 1.8274231678486998, + "grad_norm": 3.1459033489227295, + "learning_rate": 4.425837320574163e-06, + "loss": 1.6253, + "step": 773 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 3.0557096004486084, + "learning_rate": 4.3660287081339716e-06, + "loss": 1.6382, + "step": 774 + }, + { + "epoch": 1.8321513002364065, + "grad_norm": 3.4371113777160645, + "learning_rate": 4.30622009569378e-06, + "loss": 1.6007, + "step": 775 + }, + { + "epoch": 1.83451536643026, + "grad_norm": 3.019963026046753, + "learning_rate": 4.246411483253589e-06, + "loss": 1.5758, + "step": 776 + }, + { + "epoch": 1.8368794326241136, + "grad_norm": 3.315235137939453, + "learning_rate": 4.186602870813398e-06, + "loss": 1.6083, + "step": 777 + }, + { + "epoch": 1.839243498817967, + "grad_norm": 2.9758803844451904, + "learning_rate": 4.126794258373206e-06, + "loss": 1.5595, + "step": 778 + }, + { + "epoch": 1.8416075650118202, + "grad_norm": 3.254204273223877, + "learning_rate": 4.066985645933014e-06, + "loss": 1.5773, + "step": 779 + }, + { + "epoch": 1.8439716312056738, + "grad_norm": 3.199317216873169, + "learning_rate": 4.007177033492824e-06, + "loss": 1.53, + "step": 780 + }, + { + "epoch": 1.8463356973995273, + "grad_norm": 2.9678032398223877, + "learning_rate": 3.9473684210526315e-06, + "loss": 1.5161, + "step": 781 + }, + { + "epoch": 1.8486997635933806, + "grad_norm": 3.191781759262085, + "learning_rate": 3.88755980861244e-06, + "loss": 1.659, + "step": 782 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 3.0926475524902344, + "learning_rate": 3.827751196172249e-06, + "loss": 1.5918, + "step": 783 + }, + { + "epoch": 1.8534278959810875, + "grad_norm": 3.0538089275360107, + "learning_rate": 3.767942583732058e-06, + "loss": 1.5246, + "step": 784 + }, + { + "epoch": 1.855791962174941, + "grad_norm": 3.3812143802642822, + "learning_rate": 3.708133971291866e-06, + "loss": 1.6095, + "step": 785 + }, + { + "epoch": 1.8581560283687943, + "grad_norm": 3.296877861022949, + "learning_rate": 3.6483253588516753e-06, + "loss": 1.6096, + "step": 786 + }, + { + "epoch": 1.8605200945626477, + "grad_norm": 3.0140068531036377, + "learning_rate": 3.5885167464114835e-06, + "loss": 1.5117, + "step": 787 + }, + { + "epoch": 1.8628841607565012, + "grad_norm": 3.225166082382202, + "learning_rate": 3.5287081339712918e-06, + "loss": 1.5789, + "step": 788 + }, + { + "epoch": 1.8652482269503547, + "grad_norm": 3.1967225074768066, + "learning_rate": 3.468899521531101e-06, + "loss": 1.5403, + "step": 789 + }, + { + "epoch": 1.867612293144208, + "grad_norm": 3.2307610511779785, + "learning_rate": 3.409090909090909e-06, + "loss": 1.5959, + "step": 790 + }, + { + "epoch": 1.8699763593380614, + "grad_norm": 3.252063274383545, + "learning_rate": 3.349282296650718e-06, + "loss": 1.6492, + "step": 791 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 3.333066940307617, + "learning_rate": 3.2894736842105265e-06, + "loss": 1.6399, + "step": 792 + }, + { + "epoch": 1.8747044917257685, + "grad_norm": 3.0975208282470703, + "learning_rate": 3.229665071770335e-06, + "loss": 1.544, + "step": 793 + }, + { + "epoch": 1.8770685579196218, + "grad_norm": 3.1042416095733643, + "learning_rate": 3.1698564593301434e-06, + "loss": 1.5577, + "step": 794 + }, + { + "epoch": 1.8794326241134751, + "grad_norm": 3.2555060386657715, + "learning_rate": 3.110047846889952e-06, + "loss": 1.655, + "step": 795 + }, + { + "epoch": 1.8817966903073287, + "grad_norm": 3.113212823867798, + "learning_rate": 3.0502392344497608e-06, + "loss": 1.6079, + "step": 796 + }, + { + "epoch": 1.8841607565011822, + "grad_norm": 3.1746702194213867, + "learning_rate": 2.9904306220095695e-06, + "loss": 1.5433, + "step": 797 + }, + { + "epoch": 1.8865248226950353, + "grad_norm": 3.156456708908081, + "learning_rate": 2.930622009569378e-06, + "loss": 1.5069, + "step": 798 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 3.4995877742767334, + "learning_rate": 2.870813397129187e-06, + "loss": 1.557, + "step": 799 + }, + { + "epoch": 1.8912529550827424, + "grad_norm": 3.097641706466675, + "learning_rate": 2.8110047846889955e-06, + "loss": 1.5456, + "step": 800 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 3.0806541442871094, + "learning_rate": 2.751196172248804e-06, + "loss": 1.5219, + "step": 801 + }, + { + "epoch": 1.895981087470449, + "grad_norm": 3.0909721851348877, + "learning_rate": 2.691387559808613e-06, + "loss": 1.6775, + "step": 802 + }, + { + "epoch": 1.8983451536643026, + "grad_norm": 3.172172784805298, + "learning_rate": 2.631578947368421e-06, + "loss": 1.58, + "step": 803 + }, + { + "epoch": 1.900709219858156, + "grad_norm": 3.4585001468658447, + "learning_rate": 2.57177033492823e-06, + "loss": 1.6051, + "step": 804 + }, + { + "epoch": 1.9030732860520094, + "grad_norm": 3.037222385406494, + "learning_rate": 2.5119617224880385e-06, + "loss": 1.6135, + "step": 805 + }, + { + "epoch": 1.9054373522458627, + "grad_norm": 3.203216791152954, + "learning_rate": 2.452153110047847e-06, + "loss": 1.5594, + "step": 806 + }, + { + "epoch": 1.9078014184397163, + "grad_norm": 3.219123363494873, + "learning_rate": 2.3923444976076554e-06, + "loss": 1.6568, + "step": 807 + }, + { + "epoch": 1.9101654846335698, + "grad_norm": 3.3200933933258057, + "learning_rate": 2.332535885167464e-06, + "loss": 1.5685, + "step": 808 + }, + { + "epoch": 1.9125295508274232, + "grad_norm": 3.247377395629883, + "learning_rate": 2.2727272727272728e-06, + "loss": 1.616, + "step": 809 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 3.124732255935669, + "learning_rate": 2.2129186602870814e-06, + "loss": 1.599, + "step": 810 + }, + { + "epoch": 1.91725768321513, + "grad_norm": 3.5280542373657227, + "learning_rate": 2.15311004784689e-06, + "loss": 1.6445, + "step": 811 + }, + { + "epoch": 1.9196217494089836, + "grad_norm": 3.254999876022339, + "learning_rate": 2.093301435406699e-06, + "loss": 1.6179, + "step": 812 + }, + { + "epoch": 1.9219858156028369, + "grad_norm": 2.8884024620056152, + "learning_rate": 2.033492822966507e-06, + "loss": 1.5094, + "step": 813 + }, + { + "epoch": 1.9243498817966902, + "grad_norm": 3.245943546295166, + "learning_rate": 1.9736842105263157e-06, + "loss": 1.6167, + "step": 814 + }, + { + "epoch": 1.9267139479905437, + "grad_norm": 3.1370460987091064, + "learning_rate": 1.9138755980861244e-06, + "loss": 1.5627, + "step": 815 + }, + { + "epoch": 1.9290780141843973, + "grad_norm": 3.339740753173828, + "learning_rate": 1.854066985645933e-06, + "loss": 1.6373, + "step": 816 + }, + { + "epoch": 1.9314420803782506, + "grad_norm": 3.221043348312378, + "learning_rate": 1.7942583732057418e-06, + "loss": 1.6013, + "step": 817 + }, + { + "epoch": 1.933806146572104, + "grad_norm": 3.43192458152771, + "learning_rate": 1.7344497607655504e-06, + "loss": 1.638, + "step": 818 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 3.0341379642486572, + "learning_rate": 1.674641148325359e-06, + "loss": 1.5758, + "step": 819 + }, + { + "epoch": 1.938534278959811, + "grad_norm": 3.183203935623169, + "learning_rate": 1.6148325358851676e-06, + "loss": 1.5632, + "step": 820 + }, + { + "epoch": 1.9408983451536643, + "grad_norm": 3.2515642642974854, + "learning_rate": 1.555023923444976e-06, + "loss": 1.5571, + "step": 821 + }, + { + "epoch": 1.9432624113475176, + "grad_norm": 3.4239914417266846, + "learning_rate": 1.4952153110047847e-06, + "loss": 1.6051, + "step": 822 + }, + { + "epoch": 1.9456264775413712, + "grad_norm": 3.138653039932251, + "learning_rate": 1.4354066985645934e-06, + "loss": 1.5776, + "step": 823 + }, + { + "epoch": 1.9479905437352247, + "grad_norm": 3.192990779876709, + "learning_rate": 1.375598086124402e-06, + "loss": 1.6117, + "step": 824 + }, + { + "epoch": 1.950354609929078, + "grad_norm": 3.2178995609283447, + "learning_rate": 1.3157894736842106e-06, + "loss": 1.6727, + "step": 825 + }, + { + "epoch": 1.9527186761229314, + "grad_norm": 3.2581796646118164, + "learning_rate": 1.2559808612440192e-06, + "loss": 1.6165, + "step": 826 + }, + { + "epoch": 1.955082742316785, + "grad_norm": 3.419954538345337, + "learning_rate": 1.1961722488038277e-06, + "loss": 1.581, + "step": 827 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 3.3294622898101807, + "learning_rate": 1.1363636363636364e-06, + "loss": 1.6156, + "step": 828 + }, + { + "epoch": 1.9598108747044918, + "grad_norm": 3.2191033363342285, + "learning_rate": 1.076555023923445e-06, + "loss": 1.5636, + "step": 829 + }, + { + "epoch": 1.962174940898345, + "grad_norm": 3.038212537765503, + "learning_rate": 1.0167464114832535e-06, + "loss": 1.5989, + "step": 830 + }, + { + "epoch": 1.9645390070921986, + "grad_norm": 3.1238808631896973, + "learning_rate": 9.569377990430622e-07, + "loss": 1.6156, + "step": 831 + }, + { + "epoch": 1.966903073286052, + "grad_norm": 3.3783342838287354, + "learning_rate": 8.971291866028709e-07, + "loss": 1.5802, + "step": 832 + }, + { + "epoch": 1.9692671394799053, + "grad_norm": 3.407663106918335, + "learning_rate": 8.373205741626795e-07, + "loss": 1.6369, + "step": 833 + }, + { + "epoch": 1.9716312056737588, + "grad_norm": 3.149191379547119, + "learning_rate": 7.77511961722488e-07, + "loss": 1.5683, + "step": 834 + }, + { + "epoch": 1.9739952718676124, + "grad_norm": 3.09270977973938, + "learning_rate": 7.177033492822967e-07, + "loss": 1.5947, + "step": 835 + }, + { + "epoch": 1.9763593380614657, + "grad_norm": 3.1009361743927, + "learning_rate": 6.578947368421053e-07, + "loss": 1.5997, + "step": 836 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 3.1177103519439697, + "learning_rate": 5.980861244019139e-07, + "loss": 1.645, + "step": 837 + }, + { + "epoch": 1.9810874704491725, + "grad_norm": 3.130715847015381, + "learning_rate": 5.382775119617225e-07, + "loss": 1.564, + "step": 838 + }, + { + "epoch": 1.983451536643026, + "grad_norm": 3.048459529876709, + "learning_rate": 4.784688995215311e-07, + "loss": 1.6382, + "step": 839 + }, + { + "epoch": 1.9858156028368794, + "grad_norm": 2.779651403427124, + "learning_rate": 4.1866028708133973e-07, + "loss": 1.408, + "step": 840 + }, + { + "epoch": 1.9881796690307327, + "grad_norm": 3.279956579208374, + "learning_rate": 3.5885167464114835e-07, + "loss": 1.585, + "step": 841 + }, + { + "epoch": 1.9905437352245863, + "grad_norm": 3.1854429244995117, + "learning_rate": 2.990430622009569e-07, + "loss": 1.6264, + "step": 842 + }, + { + "epoch": 1.9929078014184398, + "grad_norm": 3.018812417984009, + "learning_rate": 2.3923444976076555e-07, + "loss": 1.6454, + "step": 843 + }, + { + "epoch": 1.9952718676122931, + "grad_norm": 2.9629745483398438, + "learning_rate": 1.7942583732057418e-07, + "loss": 1.5372, + "step": 844 + }, + { + "epoch": 1.9976359338061465, + "grad_norm": 3.0809407234191895, + "learning_rate": 1.1961722488038278e-07, + "loss": 1.5961, + "step": 845 + }, + { + "epoch": 2.0, + "grad_norm": 3.675367593765259, + "learning_rate": 5.980861244019139e-08, + "loss": 1.585, + "step": 846 + } + ], + "logging_steps": 1, + "max_steps": 846, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0786650102367027e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}