{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5336179295624333, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.10173338651657104, "learning_rate": 5.319148936170213e-07, "loss": 1.4985, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.10725362598896027, "learning_rate": 1.0638297872340427e-06, "loss": 1.4086, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.1187792494893074, "learning_rate": 1.5957446808510639e-06, "loss": 1.4706, "step": 3 }, { "epoch": 0.0, "grad_norm": 0.10263431817293167, "learning_rate": 2.1276595744680853e-06, "loss": 1.4262, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.10243014246225357, "learning_rate": 2.6595744680851065e-06, "loss": 1.4314, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.11368783563375473, "learning_rate": 3.1914893617021277e-06, "loss": 1.3997, "step": 6 }, { "epoch": 0.01, "grad_norm": 0.11974532902240753, "learning_rate": 3.723404255319149e-06, "loss": 1.4356, "step": 7 }, { "epoch": 0.01, "grad_norm": 0.10068795830011368, "learning_rate": 4.255319148936171e-06, "loss": 1.4006, "step": 8 }, { "epoch": 0.01, "grad_norm": 0.11169079691171646, "learning_rate": 4.787234042553191e-06, "loss": 1.4415, "step": 9 }, { "epoch": 0.01, "grad_norm": 0.10073692351579666, "learning_rate": 5.319148936170213e-06, "loss": 1.3752, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.1180361658334732, "learning_rate": 5.851063829787235e-06, "loss": 1.425, "step": 11 }, { "epoch": 0.01, "grad_norm": 0.09457835555076599, "learning_rate": 6.3829787234042555e-06, "loss": 1.3732, "step": 12 }, { "epoch": 0.01, "grad_norm": 0.10676431655883789, "learning_rate": 6.914893617021277e-06, "loss": 1.5031, "step": 13 }, { "epoch": 0.01, "grad_norm": 0.13407254219055176, "learning_rate": 7.446808510638298e-06, "loss": 1.4144, "step": 14 }, { "epoch": 0.02, "grad_norm": 0.10287946462631226, "learning_rate": 7.97872340425532e-06, "loss": 1.4369, "step": 15 }, { "epoch": 0.02, "grad_norm": 0.10514385998249054, "learning_rate": 8.510638297872341e-06, "loss": 1.39, "step": 16 }, { "epoch": 0.02, "grad_norm": 0.10802456736564636, "learning_rate": 9.042553191489362e-06, "loss": 1.4308, "step": 17 }, { "epoch": 0.02, "grad_norm": 0.11478875577449799, "learning_rate": 9.574468085106383e-06, "loss": 1.4301, "step": 18 }, { "epoch": 0.02, "grad_norm": 0.11263708025217056, "learning_rate": 1.0106382978723404e-05, "loss": 1.3662, "step": 19 }, { "epoch": 0.02, "grad_norm": 0.1158255934715271, "learning_rate": 1.0638297872340426e-05, "loss": 1.4532, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.10566612333059311, "learning_rate": 1.1170212765957447e-05, "loss": 1.4037, "step": 21 }, { "epoch": 0.02, "grad_norm": 0.11653206497430801, "learning_rate": 1.170212765957447e-05, "loss": 1.5074, "step": 22 }, { "epoch": 0.02, "grad_norm": 0.1109832152724266, "learning_rate": 1.223404255319149e-05, "loss": 1.3657, "step": 23 }, { "epoch": 0.03, "grad_norm": 0.11619631946086884, "learning_rate": 1.2765957446808511e-05, "loss": 1.4038, "step": 24 }, { "epoch": 0.03, "grad_norm": 0.1255435049533844, "learning_rate": 1.3297872340425532e-05, "loss": 1.3373, "step": 25 }, { "epoch": 0.03, "grad_norm": 0.11004912108182907, "learning_rate": 1.3829787234042554e-05, "loss": 1.4355, "step": 26 }, { "epoch": 0.03, "grad_norm": 0.103106789290905, "learning_rate": 1.4361702127659577e-05, "loss": 1.4139, "step": 27 }, { "epoch": 0.03, "grad_norm": 0.11609970778226852, "learning_rate": 1.4893617021276596e-05, "loss": 1.3901, "step": 28 }, { "epoch": 0.03, "grad_norm": 0.10627258569002151, "learning_rate": 1.5425531914893617e-05, "loss": 1.4591, "step": 29 }, { "epoch": 0.03, "grad_norm": 0.10686346888542175, "learning_rate": 1.595744680851064e-05, "loss": 1.3809, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.09971443563699722, "learning_rate": 1.6489361702127658e-05, "loss": 1.3737, "step": 31 }, { "epoch": 0.03, "grad_norm": 0.09937144815921783, "learning_rate": 1.7021276595744682e-05, "loss": 1.3764, "step": 32 }, { "epoch": 0.04, "grad_norm": 0.1083306223154068, "learning_rate": 1.7553191489361703e-05, "loss": 1.3584, "step": 33 }, { "epoch": 0.04, "grad_norm": 0.10381340235471725, "learning_rate": 1.8085106382978724e-05, "loss": 1.4051, "step": 34 }, { "epoch": 0.04, "grad_norm": 0.09269995987415314, "learning_rate": 1.8617021276595745e-05, "loss": 1.3287, "step": 35 }, { "epoch": 0.04, "grad_norm": 0.10057959705591202, "learning_rate": 1.9148936170212766e-05, "loss": 1.4432, "step": 36 }, { "epoch": 0.04, "grad_norm": 0.09498997777700424, "learning_rate": 1.968085106382979e-05, "loss": 1.5223, "step": 37 }, { "epoch": 0.04, "grad_norm": 0.09475935995578766, "learning_rate": 2.0212765957446807e-05, "loss": 1.4697, "step": 38 }, { "epoch": 0.04, "grad_norm": 0.09633958339691162, "learning_rate": 2.074468085106383e-05, "loss": 1.3877, "step": 39 }, { "epoch": 0.04, "grad_norm": 0.09594912081956863, "learning_rate": 2.1276595744680852e-05, "loss": 1.4528, "step": 40 }, { "epoch": 0.04, "grad_norm": 0.11214554309844971, "learning_rate": 2.1808510638297873e-05, "loss": 1.4098, "step": 41 }, { "epoch": 0.04, "grad_norm": 0.19427792727947235, "learning_rate": 2.2340425531914894e-05, "loss": 1.3801, "step": 42 }, { "epoch": 0.05, "grad_norm": 0.09848932921886444, "learning_rate": 2.2872340425531915e-05, "loss": 1.4141, "step": 43 }, { "epoch": 0.05, "grad_norm": 0.09963933378458023, "learning_rate": 2.340425531914894e-05, "loss": 1.4553, "step": 44 }, { "epoch": 0.05, "grad_norm": 0.09407307952642441, "learning_rate": 2.393617021276596e-05, "loss": 1.4168, "step": 45 }, { "epoch": 0.05, "grad_norm": 0.10419266670942307, "learning_rate": 2.446808510638298e-05, "loss": 1.3287, "step": 46 }, { "epoch": 0.05, "grad_norm": 0.09864490479230881, "learning_rate": 2.5e-05, "loss": 1.3317, "step": 47 }, { "epoch": 0.05, "grad_norm": 0.11262151598930359, "learning_rate": 2.5531914893617022e-05, "loss": 1.3246, "step": 48 }, { "epoch": 0.05, "grad_norm": 0.10121612995862961, "learning_rate": 2.6063829787234046e-05, "loss": 1.4149, "step": 49 }, { "epoch": 0.05, "grad_norm": 0.0978160873055458, "learning_rate": 2.6595744680851064e-05, "loss": 1.3421, "step": 50 }, { "epoch": 0.05, "grad_norm": 0.08920774608850479, "learning_rate": 2.7127659574468084e-05, "loss": 1.3861, "step": 51 }, { "epoch": 0.06, "grad_norm": 0.09108138084411621, "learning_rate": 2.765957446808511e-05, "loss": 1.3964, "step": 52 }, { "epoch": 0.06, "grad_norm": 0.1604197472333908, "learning_rate": 2.819148936170213e-05, "loss": 1.3571, "step": 53 }, { "epoch": 0.06, "grad_norm": 0.10113083571195602, "learning_rate": 2.8723404255319154e-05, "loss": 1.3825, "step": 54 }, { "epoch": 0.06, "grad_norm": 0.11050890386104584, "learning_rate": 2.925531914893617e-05, "loss": 1.3951, "step": 55 }, { "epoch": 0.06, "grad_norm": 0.0961427092552185, "learning_rate": 2.9787234042553192e-05, "loss": 1.3434, "step": 56 }, { "epoch": 0.06, "grad_norm": 0.1315707117319107, "learning_rate": 3.0319148936170216e-05, "loss": 1.3636, "step": 57 }, { "epoch": 0.06, "grad_norm": 0.10595009475946426, "learning_rate": 3.085106382978723e-05, "loss": 1.4939, "step": 58 }, { "epoch": 0.06, "grad_norm": 0.14528846740722656, "learning_rate": 3.1382978723404254e-05, "loss": 1.454, "step": 59 }, { "epoch": 0.06, "grad_norm": 0.0951959565281868, "learning_rate": 3.191489361702128e-05, "loss": 1.4043, "step": 60 }, { "epoch": 0.07, "grad_norm": 0.10170764476060867, "learning_rate": 3.2446808510638296e-05, "loss": 1.3428, "step": 61 }, { "epoch": 0.07, "grad_norm": 0.1816103458404541, "learning_rate": 3.2978723404255317e-05, "loss": 1.3395, "step": 62 }, { "epoch": 0.07, "grad_norm": 0.1011623665690422, "learning_rate": 3.3510638297872344e-05, "loss": 1.4016, "step": 63 }, { "epoch": 0.07, "grad_norm": 0.09644798189401627, "learning_rate": 3.4042553191489365e-05, "loss": 1.3991, "step": 64 }, { "epoch": 0.07, "grad_norm": 0.11246529221534729, "learning_rate": 3.4574468085106386e-05, "loss": 1.4112, "step": 65 }, { "epoch": 0.07, "grad_norm": 0.0963212326169014, "learning_rate": 3.5106382978723407e-05, "loss": 1.3987, "step": 66 }, { "epoch": 0.07, "grad_norm": 0.10353317856788635, "learning_rate": 3.563829787234043e-05, "loss": 1.4398, "step": 67 }, { "epoch": 0.07, "grad_norm": 0.1050473302602768, "learning_rate": 3.617021276595745e-05, "loss": 1.4308, "step": 68 }, { "epoch": 0.07, "grad_norm": 0.10414024442434311, "learning_rate": 3.670212765957447e-05, "loss": 1.3739, "step": 69 }, { "epoch": 0.07, "grad_norm": 0.1070413663983345, "learning_rate": 3.723404255319149e-05, "loss": 1.2944, "step": 70 }, { "epoch": 0.08, "grad_norm": 0.08770282566547394, "learning_rate": 3.776595744680852e-05, "loss": 1.2979, "step": 71 }, { "epoch": 0.08, "grad_norm": 0.09537198394536972, "learning_rate": 3.829787234042553e-05, "loss": 1.3194, "step": 72 }, { "epoch": 0.08, "grad_norm": 0.168814554810524, "learning_rate": 3.882978723404255e-05, "loss": 1.3255, "step": 73 }, { "epoch": 0.08, "grad_norm": 0.10750961303710938, "learning_rate": 3.936170212765958e-05, "loss": 1.4541, "step": 74 }, { "epoch": 0.08, "grad_norm": 0.09767494350671768, "learning_rate": 3.9893617021276594e-05, "loss": 1.3108, "step": 75 }, { "epoch": 0.08, "grad_norm": 0.09643732011318207, "learning_rate": 4.0425531914893614e-05, "loss": 1.4451, "step": 76 }, { "epoch": 0.08, "grad_norm": 0.10322108119726181, "learning_rate": 4.095744680851064e-05, "loss": 1.4286, "step": 77 }, { "epoch": 0.08, "grad_norm": 0.10831727087497711, "learning_rate": 4.148936170212766e-05, "loss": 1.415, "step": 78 }, { "epoch": 0.08, "grad_norm": 0.11692611873149872, "learning_rate": 4.2021276595744684e-05, "loss": 1.3944, "step": 79 }, { "epoch": 0.09, "grad_norm": 0.10344197601079941, "learning_rate": 4.2553191489361704e-05, "loss": 1.4005, "step": 80 }, { "epoch": 0.09, "grad_norm": 0.10011729598045349, "learning_rate": 4.3085106382978725e-05, "loss": 1.4402, "step": 81 }, { "epoch": 0.09, "grad_norm": 0.22549670934677124, "learning_rate": 4.3617021276595746e-05, "loss": 1.3343, "step": 82 }, { "epoch": 0.09, "grad_norm": 0.11193282157182693, "learning_rate": 4.414893617021277e-05, "loss": 1.4489, "step": 83 }, { "epoch": 0.09, "grad_norm": 0.10785133391618729, "learning_rate": 4.468085106382979e-05, "loss": 1.402, "step": 84 }, { "epoch": 0.09, "grad_norm": 0.10537637770175934, "learning_rate": 4.5212765957446815e-05, "loss": 1.3552, "step": 85 }, { "epoch": 0.09, "grad_norm": 0.1029570996761322, "learning_rate": 4.574468085106383e-05, "loss": 1.3307, "step": 86 }, { "epoch": 0.09, "grad_norm": 0.10009574890136719, "learning_rate": 4.627659574468085e-05, "loss": 1.3954, "step": 87 }, { "epoch": 0.09, "grad_norm": 0.11556259542703629, "learning_rate": 4.680851063829788e-05, "loss": 1.3959, "step": 88 }, { "epoch": 0.09, "grad_norm": 0.10029179602861404, "learning_rate": 4.734042553191489e-05, "loss": 1.3343, "step": 89 }, { "epoch": 0.1, "grad_norm": 0.11009006202220917, "learning_rate": 4.787234042553192e-05, "loss": 1.5279, "step": 90 }, { "epoch": 0.1, "grad_norm": 0.10678206384181976, "learning_rate": 4.840425531914894e-05, "loss": 1.341, "step": 91 }, { "epoch": 0.1, "grad_norm": 0.1142154335975647, "learning_rate": 4.893617021276596e-05, "loss": 1.4263, "step": 92 }, { "epoch": 0.1, "grad_norm": 0.09466603398323059, "learning_rate": 4.946808510638298e-05, "loss": 1.3956, "step": 93 }, { "epoch": 0.1, "grad_norm": 0.1106942892074585, "learning_rate": 5e-05, "loss": 1.3708, "step": 94 }, { "epoch": 0.1, "grad_norm": 0.12606613337993622, "learning_rate": 4.9999961062358615e-05, "loss": 1.3991, "step": 95 }, { "epoch": 0.1, "grad_norm": 0.10292167216539383, "learning_rate": 4.999984424955572e-05, "loss": 1.3572, "step": 96 }, { "epoch": 0.1, "grad_norm": 0.09828190505504608, "learning_rate": 4.9999649561955216e-05, "loss": 1.31, "step": 97 }, { "epoch": 0.1, "grad_norm": 0.109276182949543, "learning_rate": 4.9999377000163536e-05, "loss": 1.3833, "step": 98 }, { "epoch": 0.11, "grad_norm": 0.10516379773616791, "learning_rate": 4.999902656502973e-05, "loss": 1.3872, "step": 99 }, { "epoch": 0.11, "grad_norm": 0.0997815951704979, "learning_rate": 4.99985982576454e-05, "loss": 1.4308, "step": 100 }, { "epoch": 0.11, "grad_norm": 0.1041920930147171, "learning_rate": 4.9998092079344723e-05, "loss": 1.3832, "step": 101 }, { "epoch": 0.11, "grad_norm": 0.09994788467884064, "learning_rate": 4.9997508031704456e-05, "loss": 1.3408, "step": 102 }, { "epoch": 0.11, "grad_norm": 0.13418225944042206, "learning_rate": 4.9996846116543915e-05, "loss": 1.349, "step": 103 }, { "epoch": 0.11, "grad_norm": 0.12500610947608948, "learning_rate": 4.999610633592496e-05, "loss": 1.308, "step": 104 }, { "epoch": 0.11, "grad_norm": 0.15584518015384674, "learning_rate": 4.9995288692152044e-05, "loss": 1.4055, "step": 105 }, { "epoch": 0.11, "grad_norm": 0.10717131942510605, "learning_rate": 4.999439318777212e-05, "loss": 1.3719, "step": 106 }, { "epoch": 0.11, "grad_norm": 0.09883825480937958, "learning_rate": 4.999341982557468e-05, "loss": 1.3671, "step": 107 }, { "epoch": 0.12, "grad_norm": 0.09858126938343048, "learning_rate": 4.9992368608591775e-05, "loss": 1.4032, "step": 108 }, { "epoch": 0.12, "grad_norm": 0.09398377686738968, "learning_rate": 4.9991239540097965e-05, "loss": 1.2763, "step": 109 }, { "epoch": 0.12, "grad_norm": 0.10111579298973083, "learning_rate": 4.9990032623610296e-05, "loss": 1.3361, "step": 110 }, { "epoch": 0.12, "grad_norm": 0.10265881568193436, "learning_rate": 4.998874786288833e-05, "loss": 1.3745, "step": 111 }, { "epoch": 0.12, "grad_norm": 0.10393088310956955, "learning_rate": 4.998738526193412e-05, "loss": 1.4737, "step": 112 }, { "epoch": 0.12, "grad_norm": 0.10454674810171127, "learning_rate": 4.9985944824992166e-05, "loss": 1.3587, "step": 113 }, { "epoch": 0.12, "grad_norm": 0.09539375454187393, "learning_rate": 4.9984426556549456e-05, "loss": 1.3285, "step": 114 }, { "epoch": 0.12, "grad_norm": 0.10025190562009811, "learning_rate": 4.998283046133542e-05, "loss": 1.3573, "step": 115 }, { "epoch": 0.12, "grad_norm": 0.3517231345176697, "learning_rate": 4.9981156544321906e-05, "loss": 1.3057, "step": 116 }, { "epoch": 0.12, "grad_norm": 0.11298110336065292, "learning_rate": 4.9979404810723175e-05, "loss": 1.376, "step": 117 }, { "epoch": 0.13, "grad_norm": 0.0932416245341301, "learning_rate": 4.997757526599591e-05, "loss": 1.3948, "step": 118 }, { "epoch": 0.13, "grad_norm": 0.0937485620379448, "learning_rate": 4.997566791583916e-05, "loss": 1.2327, "step": 119 }, { "epoch": 0.13, "grad_norm": 0.09703949838876724, "learning_rate": 4.997368276619435e-05, "loss": 1.4109, "step": 120 }, { "epoch": 0.13, "grad_norm": 0.10851214081048965, "learning_rate": 4.997161982324523e-05, "loss": 1.3715, "step": 121 }, { "epoch": 0.13, "grad_norm": 0.09600158035755157, "learning_rate": 4.9969479093417894e-05, "loss": 1.3202, "step": 122 }, { "epoch": 0.13, "grad_norm": 0.09330842643976212, "learning_rate": 4.996726058338075e-05, "loss": 1.3079, "step": 123 }, { "epoch": 0.13, "grad_norm": 0.12271397560834885, "learning_rate": 4.996496430004446e-05, "loss": 1.4488, "step": 124 }, { "epoch": 0.13, "grad_norm": 0.09032147377729416, "learning_rate": 4.9962590250561996e-05, "loss": 1.3593, "step": 125 }, { "epoch": 0.13, "grad_norm": 0.0970826968550682, "learning_rate": 4.9960138442328534e-05, "loss": 1.3624, "step": 126 }, { "epoch": 0.14, "grad_norm": 0.09610634297132492, "learning_rate": 4.995760888298149e-05, "loss": 1.2691, "step": 127 }, { "epoch": 0.14, "grad_norm": 0.10096735507249832, "learning_rate": 4.9955001580400475e-05, "loss": 1.4252, "step": 128 }, { "epoch": 0.14, "grad_norm": 0.15890567004680634, "learning_rate": 4.9952316542707253e-05, "loss": 1.3557, "step": 129 }, { "epoch": 0.14, "grad_norm": 0.11183907091617584, "learning_rate": 4.994955377826577e-05, "loss": 1.2537, "step": 130 }, { "epoch": 0.14, "grad_norm": 0.09637223184108734, "learning_rate": 4.9946713295682034e-05, "loss": 1.2973, "step": 131 }, { "epoch": 0.14, "grad_norm": 0.08964424580335617, "learning_rate": 4.994379510380421e-05, "loss": 1.3916, "step": 132 }, { "epoch": 0.14, "grad_norm": 0.0993800014257431, "learning_rate": 4.9940799211722484e-05, "loss": 1.4148, "step": 133 }, { "epoch": 0.14, "grad_norm": 0.09357187151908875, "learning_rate": 4.9937725628769094e-05, "loss": 1.2904, "step": 134 }, { "epoch": 0.14, "grad_norm": 0.09439032524824142, "learning_rate": 4.9934574364518295e-05, "loss": 1.3445, "step": 135 }, { "epoch": 0.15, "grad_norm": 0.10771452635526657, "learning_rate": 4.993134542878631e-05, "loss": 1.3755, "step": 136 }, { "epoch": 0.15, "grad_norm": 0.09271609783172607, "learning_rate": 4.99280388316313e-05, "loss": 1.2766, "step": 137 }, { "epoch": 0.15, "grad_norm": 0.1042882427573204, "learning_rate": 4.992465458335335e-05, "loss": 1.3967, "step": 138 }, { "epoch": 0.15, "grad_norm": 0.10192878544330597, "learning_rate": 4.9921192694494454e-05, "loss": 1.3401, "step": 139 }, { "epoch": 0.15, "grad_norm": 0.09331267327070236, "learning_rate": 4.9917653175838405e-05, "loss": 1.3251, "step": 140 }, { "epoch": 0.15, "grad_norm": 0.09516970068216324, "learning_rate": 4.991403603841087e-05, "loss": 1.3637, "step": 141 }, { "epoch": 0.15, "grad_norm": 0.09037785977125168, "learning_rate": 4.9910341293479265e-05, "loss": 1.417, "step": 142 }, { "epoch": 0.15, "grad_norm": 0.10086451470851898, "learning_rate": 4.9906568952552754e-05, "loss": 1.4657, "step": 143 }, { "epoch": 0.15, "grad_norm": 0.09110086411237717, "learning_rate": 4.990271902738223e-05, "loss": 1.3173, "step": 144 }, { "epoch": 0.15, "grad_norm": 0.09085503220558167, "learning_rate": 4.989879152996025e-05, "loss": 1.3234, "step": 145 }, { "epoch": 0.16, "grad_norm": 0.0876404270529747, "learning_rate": 4.989478647252101e-05, "loss": 1.3367, "step": 146 }, { "epoch": 0.16, "grad_norm": 0.09222956746816635, "learning_rate": 4.9890703867540314e-05, "loss": 1.3408, "step": 147 }, { "epoch": 0.16, "grad_norm": 0.09831474721431732, "learning_rate": 4.988654372773552e-05, "loss": 1.3904, "step": 148 }, { "epoch": 0.16, "grad_norm": 0.09171093255281448, "learning_rate": 4.988230606606552e-05, "loss": 1.3178, "step": 149 }, { "epoch": 0.16, "grad_norm": 0.11647266149520874, "learning_rate": 4.9877990895730666e-05, "loss": 1.3282, "step": 150 }, { "epoch": 0.16, "grad_norm": 0.08659853786230087, "learning_rate": 4.9873598230172764e-05, "loss": 1.3531, "step": 151 }, { "epoch": 0.16, "grad_norm": 0.09219101071357727, "learning_rate": 4.986912808307502e-05, "loss": 1.2975, "step": 152 }, { "epoch": 0.16, "grad_norm": 0.0935453400015831, "learning_rate": 4.986458046836199e-05, "loss": 1.258, "step": 153 }, { "epoch": 0.16, "grad_norm": 0.09370335936546326, "learning_rate": 4.985995540019955e-05, "loss": 1.4368, "step": 154 }, { "epoch": 0.17, "grad_norm": 0.08816273510456085, "learning_rate": 4.9855252892994844e-05, "loss": 1.3168, "step": 155 }, { "epoch": 0.17, "grad_norm": 0.10105711221694946, "learning_rate": 4.9850472961396215e-05, "loss": 1.2625, "step": 156 }, { "epoch": 0.17, "grad_norm": 0.09414559602737427, "learning_rate": 4.984561562029323e-05, "loss": 1.34, "step": 157 }, { "epoch": 0.17, "grad_norm": 0.09968777745962143, "learning_rate": 4.9840680884816536e-05, "loss": 1.4364, "step": 158 }, { "epoch": 0.17, "grad_norm": 0.0899839997291565, "learning_rate": 4.983566877033791e-05, "loss": 1.2951, "step": 159 }, { "epoch": 0.17, "grad_norm": 0.08910808712244034, "learning_rate": 4.9830579292470136e-05, "loss": 1.3533, "step": 160 }, { "epoch": 0.17, "grad_norm": 0.09147724509239197, "learning_rate": 4.9825412467067e-05, "loss": 1.3941, "step": 161 }, { "epoch": 0.17, "grad_norm": 0.09149125963449478, "learning_rate": 4.9820168310223215e-05, "loss": 1.3223, "step": 162 }, { "epoch": 0.17, "grad_norm": 0.09467878937721252, "learning_rate": 4.981484683827439e-05, "loss": 1.2784, "step": 163 }, { "epoch": 0.18, "grad_norm": 0.1013043224811554, "learning_rate": 4.9809448067796974e-05, "loss": 1.3415, "step": 164 }, { "epoch": 0.18, "grad_norm": 0.10649001598358154, "learning_rate": 4.9803972015608205e-05, "loss": 1.3784, "step": 165 }, { "epoch": 0.18, "grad_norm": 0.09393008053302765, "learning_rate": 4.9798418698766034e-05, "loss": 1.3084, "step": 166 }, { "epoch": 0.18, "grad_norm": 0.09331285208463669, "learning_rate": 4.979278813456911e-05, "loss": 1.4026, "step": 167 }, { "epoch": 0.18, "grad_norm": 0.10828982293605804, "learning_rate": 4.97870803405567e-05, "loss": 1.3845, "step": 168 }, { "epoch": 0.18, "grad_norm": 0.09335343539714813, "learning_rate": 4.978129533450866e-05, "loss": 1.3743, "step": 169 }, { "epoch": 0.18, "grad_norm": 0.09739939123392105, "learning_rate": 4.977543313444534e-05, "loss": 1.357, "step": 170 }, { "epoch": 0.18, "grad_norm": 0.10757780820131302, "learning_rate": 4.976949375862756e-05, "loss": 1.4467, "step": 171 }, { "epoch": 0.18, "grad_norm": 0.13449633121490479, "learning_rate": 4.976347722555654e-05, "loss": 1.3811, "step": 172 }, { "epoch": 0.18, "grad_norm": 0.10207029432058334, "learning_rate": 4.975738355397386e-05, "loss": 1.3571, "step": 173 }, { "epoch": 0.19, "grad_norm": 0.09364256262779236, "learning_rate": 4.975121276286136e-05, "loss": 1.298, "step": 174 }, { "epoch": 0.19, "grad_norm": 0.10099958628416061, "learning_rate": 4.974496487144114e-05, "loss": 1.3605, "step": 175 }, { "epoch": 0.19, "grad_norm": 0.08877628296613693, "learning_rate": 4.973863989917544e-05, "loss": 1.3566, "step": 176 }, { "epoch": 0.19, "grad_norm": 0.09996306151151657, "learning_rate": 4.973223786576663e-05, "loss": 1.5077, "step": 177 }, { "epoch": 0.19, "grad_norm": 0.08964492380619049, "learning_rate": 4.972575879115711e-05, "loss": 1.3556, "step": 178 }, { "epoch": 0.19, "grad_norm": 0.10891906917095184, "learning_rate": 4.971920269552927e-05, "loss": 1.3015, "step": 179 }, { "epoch": 0.19, "grad_norm": 0.09565063565969467, "learning_rate": 4.971256959930541e-05, "loss": 1.2955, "step": 180 }, { "epoch": 0.19, "grad_norm": 0.09602455049753189, "learning_rate": 4.970585952314774e-05, "loss": 1.4334, "step": 181 }, { "epoch": 0.19, "grad_norm": 0.09518374502658844, "learning_rate": 4.969907248795818e-05, "loss": 1.3537, "step": 182 }, { "epoch": 0.2, "grad_norm": 0.09421328455209732, "learning_rate": 4.9692208514878444e-05, "loss": 1.4354, "step": 183 }, { "epoch": 0.2, "grad_norm": 0.09471164643764496, "learning_rate": 4.9685267625289886e-05, "loss": 1.278, "step": 184 }, { "epoch": 0.2, "grad_norm": 0.10142439603805542, "learning_rate": 4.967824984081344e-05, "loss": 1.4494, "step": 185 }, { "epoch": 0.2, "grad_norm": 0.0980624407529831, "learning_rate": 4.9671155183309595e-05, "loss": 1.3545, "step": 186 }, { "epoch": 0.2, "grad_norm": 0.09215401858091354, "learning_rate": 4.9663983674878296e-05, "loss": 1.2775, "step": 187 }, { "epoch": 0.2, "grad_norm": 0.10461422055959702, "learning_rate": 4.965673533785886e-05, "loss": 1.3267, "step": 188 }, { "epoch": 0.2, "grad_norm": 0.0982186496257782, "learning_rate": 4.9649410194829945e-05, "loss": 1.3111, "step": 189 }, { "epoch": 0.2, "grad_norm": 0.09877320379018784, "learning_rate": 4.9642008268609454e-05, "loss": 1.3306, "step": 190 }, { "epoch": 0.2, "grad_norm": 0.10117904841899872, "learning_rate": 4.9634529582254466e-05, "loss": 1.3162, "step": 191 }, { "epoch": 0.2, "grad_norm": 0.10001292824745178, "learning_rate": 4.962697415906118e-05, "loss": 1.3379, "step": 192 }, { "epoch": 0.21, "grad_norm": 0.09294575452804565, "learning_rate": 4.961934202256482e-05, "loss": 1.3189, "step": 193 }, { "epoch": 0.21, "grad_norm": 0.09697602689266205, "learning_rate": 4.9611633196539584e-05, "loss": 1.3325, "step": 194 }, { "epoch": 0.21, "grad_norm": 0.09828122705221176, "learning_rate": 4.960384770499855e-05, "loss": 1.4084, "step": 195 }, { "epoch": 0.21, "grad_norm": 0.0948810800909996, "learning_rate": 4.95959855721936e-05, "loss": 1.2597, "step": 196 }, { "epoch": 0.21, "grad_norm": 0.10147445648908615, "learning_rate": 4.958804682261539e-05, "loss": 1.2529, "step": 197 }, { "epoch": 0.21, "grad_norm": 0.14694844186306, "learning_rate": 4.95800314809932e-05, "loss": 1.3571, "step": 198 }, { "epoch": 0.21, "grad_norm": 0.21705757081508636, "learning_rate": 4.957193957229491e-05, "loss": 1.4377, "step": 199 }, { "epoch": 0.21, "grad_norm": 0.12128579616546631, "learning_rate": 4.956377112172691e-05, "loss": 1.3725, "step": 200 }, { "epoch": 0.21, "grad_norm": 0.09640059620141983, "learning_rate": 4.955552615473401e-05, "loss": 1.3913, "step": 201 }, { "epoch": 0.22, "grad_norm": 0.19373305141925812, "learning_rate": 4.954720469699938e-05, "loss": 1.4382, "step": 202 }, { "epoch": 0.22, "grad_norm": 0.0959777757525444, "learning_rate": 4.953880677444446e-05, "loss": 1.3134, "step": 203 }, { "epoch": 0.22, "grad_norm": 0.10545915365219116, "learning_rate": 4.953033241322886e-05, "loss": 1.3788, "step": 204 }, { "epoch": 0.22, "grad_norm": 0.1017264649271965, "learning_rate": 4.952178163975033e-05, "loss": 1.3932, "step": 205 }, { "epoch": 0.22, "grad_norm": 0.1435861438512802, "learning_rate": 4.951315448064462e-05, "loss": 1.2377, "step": 206 }, { "epoch": 0.22, "grad_norm": 0.10849795490503311, "learning_rate": 4.950445096278541e-05, "loss": 1.2753, "step": 207 }, { "epoch": 0.22, "grad_norm": 0.09382300078868866, "learning_rate": 4.949567111328428e-05, "loss": 1.2816, "step": 208 }, { "epoch": 0.22, "grad_norm": 0.09901299327611923, "learning_rate": 4.9486814959490544e-05, "loss": 1.3306, "step": 209 }, { "epoch": 0.22, "grad_norm": 0.09280100464820862, "learning_rate": 4.947788252899124e-05, "loss": 1.2148, "step": 210 }, { "epoch": 0.23, "grad_norm": 0.10457581281661987, "learning_rate": 4.946887384961097e-05, "loss": 1.4392, "step": 211 }, { "epoch": 0.23, "grad_norm": 0.09088419377803802, "learning_rate": 4.9459788949411886e-05, "loss": 1.3393, "step": 212 }, { "epoch": 0.23, "grad_norm": 0.09645824134349823, "learning_rate": 4.9450627856693545e-05, "loss": 1.4496, "step": 213 }, { "epoch": 0.23, "grad_norm": 0.09751391410827637, "learning_rate": 4.9441390599992864e-05, "loss": 1.3765, "step": 214 }, { "epoch": 0.23, "grad_norm": 0.09599944204092026, "learning_rate": 4.943207720808399e-05, "loss": 1.3751, "step": 215 }, { "epoch": 0.23, "grad_norm": 0.16549742221832275, "learning_rate": 4.9422687709978254e-05, "loss": 1.3439, "step": 216 }, { "epoch": 0.23, "grad_norm": 0.09610256552696228, "learning_rate": 4.941322213492405e-05, "loss": 1.2716, "step": 217 }, { "epoch": 0.23, "grad_norm": 0.09371083229780197, "learning_rate": 4.940368051240675e-05, "loss": 1.2652, "step": 218 }, { "epoch": 0.23, "grad_norm": 0.09995076060295105, "learning_rate": 4.9394062872148604e-05, "loss": 1.2524, "step": 219 }, { "epoch": 0.23, "grad_norm": 0.09488219022750854, "learning_rate": 4.9384369244108685e-05, "loss": 1.2505, "step": 220 }, { "epoch": 0.24, "grad_norm": 0.1144493967294693, "learning_rate": 4.937459965848275e-05, "loss": 1.3485, "step": 221 }, { "epoch": 0.24, "grad_norm": 0.09584493935108185, "learning_rate": 4.9364754145703165e-05, "loss": 1.2722, "step": 222 }, { "epoch": 0.24, "grad_norm": 0.12084224820137024, "learning_rate": 4.935483273643882e-05, "loss": 1.4047, "step": 223 }, { "epoch": 0.24, "grad_norm": 0.10730363428592682, "learning_rate": 4.9344835461595014e-05, "loss": 1.2585, "step": 224 }, { "epoch": 0.24, "grad_norm": 0.08327766507863998, "learning_rate": 4.933476235231337e-05, "loss": 1.205, "step": 225 }, { "epoch": 0.24, "grad_norm": 0.09607108682394028, "learning_rate": 4.9324613439971736e-05, "loss": 1.3635, "step": 226 }, { "epoch": 0.24, "grad_norm": 0.09505125880241394, "learning_rate": 4.931438875618408e-05, "loss": 1.3224, "step": 227 }, { "epoch": 0.24, "grad_norm": 0.09247023612260818, "learning_rate": 4.930408833280043e-05, "loss": 1.3292, "step": 228 }, { "epoch": 0.24, "grad_norm": 0.09089571982622147, "learning_rate": 4.929371220190671e-05, "loss": 1.3271, "step": 229 }, { "epoch": 0.25, "grad_norm": 0.0896812379360199, "learning_rate": 4.928326039582468e-05, "loss": 1.281, "step": 230 }, { "epoch": 0.25, "grad_norm": 0.10084407031536102, "learning_rate": 4.927273294711184e-05, "loss": 1.354, "step": 231 }, { "epoch": 0.25, "grad_norm": 0.09202654659748077, "learning_rate": 4.9262129888561316e-05, "loss": 1.3373, "step": 232 }, { "epoch": 0.25, "grad_norm": 0.09410908818244934, "learning_rate": 4.925145125320175e-05, "loss": 1.4251, "step": 233 }, { "epoch": 0.25, "grad_norm": 0.08918359875679016, "learning_rate": 4.9240697074297206e-05, "loss": 1.2926, "step": 234 }, { "epoch": 0.25, "grad_norm": 0.09744445979595184, "learning_rate": 4.9229867385347086e-05, "loss": 1.2261, "step": 235 }, { "epoch": 0.25, "grad_norm": 0.09546810388565063, "learning_rate": 4.921896222008598e-05, "loss": 1.3187, "step": 236 }, { "epoch": 0.25, "grad_norm": 0.09971386194229126, "learning_rate": 4.920798161248361e-05, "loss": 1.3097, "step": 237 }, { "epoch": 0.25, "grad_norm": 0.09423212707042694, "learning_rate": 4.9196925596744684e-05, "loss": 1.3812, "step": 238 }, { "epoch": 0.26, "grad_norm": 0.09651963412761688, "learning_rate": 4.918579420730883e-05, "loss": 1.2818, "step": 239 }, { "epoch": 0.26, "grad_norm": 0.10617819428443909, "learning_rate": 4.9174587478850445e-05, "loss": 1.3433, "step": 240 }, { "epoch": 0.26, "grad_norm": 0.09576583653688431, "learning_rate": 4.916330544627861e-05, "loss": 1.3034, "step": 241 }, { "epoch": 0.26, "grad_norm": 0.09386958181858063, "learning_rate": 4.915194814473699e-05, "loss": 1.2635, "step": 242 }, { "epoch": 0.26, "grad_norm": 0.1020546555519104, "learning_rate": 4.914051560960371e-05, "loss": 1.4071, "step": 243 }, { "epoch": 0.26, "grad_norm": 0.09851020574569702, "learning_rate": 4.912900787649124e-05, "loss": 1.3625, "step": 244 }, { "epoch": 0.26, "grad_norm": 0.09171383827924728, "learning_rate": 4.9117424981246304e-05, "loss": 1.3349, "step": 245 }, { "epoch": 0.26, "grad_norm": 0.09428778290748596, "learning_rate": 4.910576695994975e-05, "loss": 1.3469, "step": 246 }, { "epoch": 0.26, "grad_norm": 0.09907223284244537, "learning_rate": 4.909403384891644e-05, "loss": 1.3375, "step": 247 }, { "epoch": 0.26, "grad_norm": 0.09412197768688202, "learning_rate": 4.908222568469516e-05, "loss": 1.3386, "step": 248 }, { "epoch": 0.27, "grad_norm": 0.13928060233592987, "learning_rate": 4.907034250406846e-05, "loss": 1.3403, "step": 249 }, { "epoch": 0.27, "grad_norm": 0.09467838704586029, "learning_rate": 4.9058384344052587e-05, "loss": 1.3549, "step": 250 }, { "epoch": 0.27, "grad_norm": 0.08943888545036316, "learning_rate": 4.904635124189736e-05, "loss": 1.2899, "step": 251 }, { "epoch": 0.27, "grad_norm": 0.08892600983381271, "learning_rate": 4.903424323508601e-05, "loss": 1.2404, "step": 252 }, { "epoch": 0.27, "grad_norm": 0.08917702734470367, "learning_rate": 4.902206036133512e-05, "loss": 1.368, "step": 253 }, { "epoch": 0.27, "grad_norm": 0.10863058269023895, "learning_rate": 4.900980265859448e-05, "loss": 1.4147, "step": 254 }, { "epoch": 0.27, "grad_norm": 0.09652639180421829, "learning_rate": 4.8997470165046976e-05, "loss": 1.2795, "step": 255 }, { "epoch": 0.27, "grad_norm": 0.09574037045240402, "learning_rate": 4.8985062919108474e-05, "loss": 1.3202, "step": 256 }, { "epoch": 0.27, "grad_norm": 0.10350500792264938, "learning_rate": 4.897258095942766e-05, "loss": 1.3874, "step": 257 }, { "epoch": 0.28, "grad_norm": 0.0920305848121643, "learning_rate": 4.896002432488599e-05, "loss": 1.3145, "step": 258 }, { "epoch": 0.28, "grad_norm": 0.12242481857538223, "learning_rate": 4.8947393054597534e-05, "loss": 1.3162, "step": 259 }, { "epoch": 0.28, "grad_norm": 0.08995360136032104, "learning_rate": 4.8934687187908834e-05, "loss": 1.3715, "step": 260 }, { "epoch": 0.28, "grad_norm": 0.09082052856683731, "learning_rate": 4.8921906764398805e-05, "loss": 1.2569, "step": 261 }, { "epoch": 0.28, "grad_norm": 0.09512560814619064, "learning_rate": 4.890905182387861e-05, "loss": 1.4848, "step": 262 }, { "epoch": 0.28, "grad_norm": 0.08932676166296005, "learning_rate": 4.8896122406391556e-05, "loss": 1.314, "step": 263 }, { "epoch": 0.28, "grad_norm": 0.0890311747789383, "learning_rate": 4.888311855221289e-05, "loss": 1.3381, "step": 264 }, { "epoch": 0.28, "grad_norm": 0.08900325000286102, "learning_rate": 4.887004030184979e-05, "loss": 1.3001, "step": 265 }, { "epoch": 0.28, "grad_norm": 0.08832228183746338, "learning_rate": 4.885688769604114e-05, "loss": 1.3714, "step": 266 }, { "epoch": 0.28, "grad_norm": 0.09763582050800323, "learning_rate": 4.884366077575747e-05, "loss": 1.4229, "step": 267 }, { "epoch": 0.29, "grad_norm": 0.1476629674434662, "learning_rate": 4.883035958220077e-05, "loss": 1.5083, "step": 268 }, { "epoch": 0.29, "grad_norm": 0.10069846361875534, "learning_rate": 4.881698415680442e-05, "loss": 1.3742, "step": 269 }, { "epoch": 0.29, "grad_norm": 0.0915488749742508, "learning_rate": 4.8803534541233014e-05, "loss": 1.3738, "step": 270 }, { "epoch": 0.29, "grad_norm": 0.09633270651102066, "learning_rate": 4.879001077738227e-05, "loss": 1.3044, "step": 271 }, { "epoch": 0.29, "grad_norm": 0.09047040343284607, "learning_rate": 4.877641290737884e-05, "loss": 1.2579, "step": 272 }, { "epoch": 0.29, "grad_norm": 0.09998361766338348, "learning_rate": 4.876274097358027e-05, "loss": 1.3221, "step": 273 }, { "epoch": 0.29, "grad_norm": 0.10295653343200684, "learning_rate": 4.874899501857477e-05, "loss": 1.3666, "step": 274 }, { "epoch": 0.29, "grad_norm": 0.09751718491315842, "learning_rate": 4.873517508518116e-05, "loss": 1.3144, "step": 275 }, { "epoch": 0.29, "grad_norm": 0.09435424208641052, "learning_rate": 4.872128121644868e-05, "loss": 1.3125, "step": 276 }, { "epoch": 0.3, "grad_norm": 0.08840420097112656, "learning_rate": 4.870731345565689e-05, "loss": 1.3244, "step": 277 }, { "epoch": 0.3, "grad_norm": 0.09967703372240067, "learning_rate": 4.8693271846315515e-05, "loss": 1.3218, "step": 278 }, { "epoch": 0.3, "grad_norm": 0.09583844244480133, "learning_rate": 4.867915643216434e-05, "loss": 1.3866, "step": 279 }, { "epoch": 0.3, "grad_norm": 0.09268955886363983, "learning_rate": 4.866496725717303e-05, "loss": 1.2995, "step": 280 }, { "epoch": 0.3, "grad_norm": 0.09287814050912857, "learning_rate": 4.8650704365541035e-05, "loss": 1.3388, "step": 281 }, { "epoch": 0.3, "grad_norm": 0.09162658452987671, "learning_rate": 4.863636780169742e-05, "loss": 1.2241, "step": 282 }, { "epoch": 0.3, "grad_norm": 0.10253284871578217, "learning_rate": 4.8621957610300736e-05, "loss": 1.3496, "step": 283 }, { "epoch": 0.3, "grad_norm": 0.09558489173650742, "learning_rate": 4.860747383623889e-05, "loss": 1.3608, "step": 284 }, { "epoch": 0.3, "grad_norm": 0.11398740112781525, "learning_rate": 4.859291652462903e-05, "loss": 1.3924, "step": 285 }, { "epoch": 0.31, "grad_norm": 0.10008247196674347, "learning_rate": 4.8578285720817314e-05, "loss": 1.2955, "step": 286 }, { "epoch": 0.31, "grad_norm": 0.09404585510492325, "learning_rate": 4.8563581470378875e-05, "loss": 1.4093, "step": 287 }, { "epoch": 0.31, "grad_norm": 0.09330190718173981, "learning_rate": 4.8548803819117614e-05, "loss": 1.413, "step": 288 }, { "epoch": 0.31, "grad_norm": 0.09668520838022232, "learning_rate": 4.8533952813066094e-05, "loss": 1.3729, "step": 289 }, { "epoch": 0.31, "grad_norm": 0.09101221710443497, "learning_rate": 4.851902849848536e-05, "loss": 1.3243, "step": 290 }, { "epoch": 0.31, "grad_norm": 0.09626659750938416, "learning_rate": 4.8504030921864816e-05, "loss": 1.2227, "step": 291 }, { "epoch": 0.31, "grad_norm": 0.10459615290164948, "learning_rate": 4.848896012992208e-05, "loss": 1.4128, "step": 292 }, { "epoch": 0.31, "grad_norm": 0.09928146004676819, "learning_rate": 4.847381616960286e-05, "loss": 1.4355, "step": 293 }, { "epoch": 0.31, "grad_norm": 0.09046845883131027, "learning_rate": 4.8458599088080735e-05, "loss": 1.3679, "step": 294 }, { "epoch": 0.31, "grad_norm": 0.10265221446752548, "learning_rate": 4.844330893275711e-05, "loss": 1.329, "step": 295 }, { "epoch": 0.32, "grad_norm": 0.09289583563804626, "learning_rate": 4.8427945751260986e-05, "loss": 1.3476, "step": 296 }, { "epoch": 0.32, "grad_norm": 0.09216302633285522, "learning_rate": 4.8412509591448835e-05, "loss": 1.2461, "step": 297 }, { "epoch": 0.32, "grad_norm": 0.09431559592485428, "learning_rate": 4.839700050140448e-05, "loss": 1.3014, "step": 298 }, { "epoch": 0.32, "grad_norm": 0.09718261659145355, "learning_rate": 4.838141852943891e-05, "loss": 1.4044, "step": 299 }, { "epoch": 0.32, "grad_norm": 0.09670992195606232, "learning_rate": 4.836576372409015e-05, "loss": 1.3345, "step": 300 }, { "epoch": 0.32, "grad_norm": 0.09435348957777023, "learning_rate": 4.835003613412308e-05, "loss": 1.254, "step": 301 }, { "epoch": 0.32, "grad_norm": 0.09258078038692474, "learning_rate": 4.8334235808529335e-05, "loss": 1.3898, "step": 302 }, { "epoch": 0.32, "grad_norm": 0.0906069427728653, "learning_rate": 4.83183627965271e-05, "loss": 1.3753, "step": 303 }, { "epoch": 0.32, "grad_norm": 0.159384086728096, "learning_rate": 4.830241714756099e-05, "loss": 1.2721, "step": 304 }, { "epoch": 0.33, "grad_norm": 0.0922141820192337, "learning_rate": 4.828639891130189e-05, "loss": 1.2806, "step": 305 }, { "epoch": 0.33, "grad_norm": 0.09928394109010696, "learning_rate": 4.827030813764677e-05, "loss": 1.3605, "step": 306 }, { "epoch": 0.33, "grad_norm": 0.09432511776685715, "learning_rate": 4.825414487671859e-05, "loss": 1.4276, "step": 307 }, { "epoch": 0.33, "grad_norm": 0.09470757842063904, "learning_rate": 4.823790917886607e-05, "loss": 1.2807, "step": 308 }, { "epoch": 0.33, "grad_norm": 0.09105392545461655, "learning_rate": 4.822160109466361e-05, "loss": 1.3163, "step": 309 }, { "epoch": 0.33, "grad_norm": 0.09101832658052444, "learning_rate": 4.8205220674911074e-05, "loss": 1.2988, "step": 310 }, { "epoch": 0.33, "grad_norm": 0.09433624148368835, "learning_rate": 4.8188767970633647e-05, "loss": 1.3743, "step": 311 }, { "epoch": 0.33, "grad_norm": 0.1052064523100853, "learning_rate": 4.8172243033081695e-05, "loss": 1.4047, "step": 312 }, { "epoch": 0.33, "grad_norm": 0.09831178188323975, "learning_rate": 4.8155645913730585e-05, "loss": 1.4189, "step": 313 }, { "epoch": 0.34, "grad_norm": 0.09389661997556686, "learning_rate": 4.8138976664280536e-05, "loss": 1.3911, "step": 314 }, { "epoch": 0.34, "grad_norm": 0.0961923897266388, "learning_rate": 4.812223533665643e-05, "loss": 1.3879, "step": 315 }, { "epoch": 0.34, "grad_norm": 0.09563787281513214, "learning_rate": 4.8105421983007715e-05, "loss": 1.3871, "step": 316 }, { "epoch": 0.34, "grad_norm": 0.0972226932644844, "learning_rate": 4.808853665570816e-05, "loss": 1.4138, "step": 317 }, { "epoch": 0.34, "grad_norm": 0.09669040143489838, "learning_rate": 4.807157940735577e-05, "loss": 1.3376, "step": 318 }, { "epoch": 0.34, "grad_norm": 0.10110729932785034, "learning_rate": 4.805455029077255e-05, "loss": 1.3442, "step": 319 }, { "epoch": 0.34, "grad_norm": 0.10544568300247192, "learning_rate": 4.803744935900439e-05, "loss": 1.259, "step": 320 }, { "epoch": 0.34, "grad_norm": 0.09788301587104797, "learning_rate": 4.802027666532089e-05, "loss": 1.3767, "step": 321 }, { "epoch": 0.34, "grad_norm": 0.09752319008111954, "learning_rate": 4.8003032263215185e-05, "loss": 1.4368, "step": 322 }, { "epoch": 0.34, "grad_norm": 0.1186918169260025, "learning_rate": 4.798571620640378e-05, "loss": 1.2999, "step": 323 }, { "epoch": 0.35, "grad_norm": 0.09278569370508194, "learning_rate": 4.79683285488264e-05, "loss": 1.3786, "step": 324 }, { "epoch": 0.35, "grad_norm": 0.11016564816236496, "learning_rate": 4.795086934464579e-05, "loss": 1.3475, "step": 325 }, { "epoch": 0.35, "grad_norm": 0.11838585883378983, "learning_rate": 4.7933338648247563e-05, "loss": 1.3264, "step": 326 }, { "epoch": 0.35, "grad_norm": 0.09825286269187927, "learning_rate": 4.791573651424003e-05, "loss": 1.314, "step": 327 }, { "epoch": 0.35, "grad_norm": 0.12940770387649536, "learning_rate": 4.789806299745405e-05, "loss": 1.2689, "step": 328 }, { "epoch": 0.35, "grad_norm": 0.09449135512113571, "learning_rate": 4.7880318152942816e-05, "loss": 1.3096, "step": 329 }, { "epoch": 0.35, "grad_norm": 0.10377182811498642, "learning_rate": 4.786250203598174e-05, "loss": 1.3784, "step": 330 }, { "epoch": 0.35, "grad_norm": 0.09918166697025299, "learning_rate": 4.78446147020682e-05, "loss": 1.44, "step": 331 }, { "epoch": 0.35, "grad_norm": 0.09931398928165436, "learning_rate": 4.782665620692147e-05, "loss": 1.3034, "step": 332 }, { "epoch": 0.36, "grad_norm": 0.20402568578720093, "learning_rate": 4.780862660648244e-05, "loss": 1.3637, "step": 333 }, { "epoch": 0.36, "grad_norm": 0.10577716678380966, "learning_rate": 4.779052595691355e-05, "loss": 1.2544, "step": 334 }, { "epoch": 0.36, "grad_norm": 0.1048274114727974, "learning_rate": 4.77723543145985e-05, "loss": 1.4564, "step": 335 }, { "epoch": 0.36, "grad_norm": 0.09773129224777222, "learning_rate": 4.775411173614218e-05, "loss": 1.3061, "step": 336 }, { "epoch": 0.36, "grad_norm": 0.10128381103277206, "learning_rate": 4.773579827837041e-05, "loss": 1.3197, "step": 337 }, { "epoch": 0.36, "grad_norm": 0.087751604616642, "learning_rate": 4.7717413998329844e-05, "loss": 1.2628, "step": 338 }, { "epoch": 0.36, "grad_norm": 0.09778417646884918, "learning_rate": 4.769895895328769e-05, "loss": 1.4227, "step": 339 }, { "epoch": 0.36, "grad_norm": 0.10379809141159058, "learning_rate": 4.768043320073165e-05, "loss": 1.347, "step": 340 }, { "epoch": 0.36, "grad_norm": 0.09904753416776657, "learning_rate": 4.766183679836964e-05, "loss": 1.3168, "step": 341 }, { "epoch": 0.36, "grad_norm": 0.2791866958141327, "learning_rate": 4.7643169804129665e-05, "loss": 1.2482, "step": 342 }, { "epoch": 0.37, "grad_norm": 0.24763400852680206, "learning_rate": 4.762443227615963e-05, "loss": 1.4319, "step": 343 }, { "epoch": 0.37, "grad_norm": 0.0879005715250969, "learning_rate": 4.7605624272827126e-05, "loss": 1.2904, "step": 344 }, { "epoch": 0.37, "grad_norm": 0.09311103075742722, "learning_rate": 4.7586745852719316e-05, "loss": 1.3017, "step": 345 }, { "epoch": 0.37, "grad_norm": 0.09575958549976349, "learning_rate": 4.756779707464269e-05, "loss": 1.354, "step": 346 }, { "epoch": 0.37, "grad_norm": 0.09556791186332703, "learning_rate": 4.754877799762291e-05, "loss": 1.3307, "step": 347 }, { "epoch": 0.37, "grad_norm": 0.09889087080955505, "learning_rate": 4.752968868090459e-05, "loss": 1.3329, "step": 348 }, { "epoch": 0.37, "grad_norm": 0.0879189744591713, "learning_rate": 4.75105291839512e-05, "loss": 1.3677, "step": 349 }, { "epoch": 0.37, "grad_norm": 0.09398270398378372, "learning_rate": 4.749129956644477e-05, "loss": 1.3096, "step": 350 }, { "epoch": 0.37, "grad_norm": 0.0942855104804039, "learning_rate": 4.747199988828579e-05, "loss": 1.4615, "step": 351 }, { "epoch": 0.38, "grad_norm": 0.09009843319654465, "learning_rate": 4.7452630209592966e-05, "loss": 1.3422, "step": 352 }, { "epoch": 0.38, "grad_norm": 0.11878379434347153, "learning_rate": 4.7433190590703055e-05, "loss": 1.3167, "step": 353 }, { "epoch": 0.38, "grad_norm": 0.10389739274978638, "learning_rate": 4.7413681092170715e-05, "loss": 1.3042, "step": 354 }, { "epoch": 0.38, "grad_norm": 0.09575341641902924, "learning_rate": 4.739410177476824e-05, "loss": 1.3589, "step": 355 }, { "epoch": 0.38, "grad_norm": 0.0868607833981514, "learning_rate": 4.7374452699485426e-05, "loss": 1.3461, "step": 356 }, { "epoch": 0.38, "grad_norm": 0.09217596054077148, "learning_rate": 4.7354733927529374e-05, "loss": 1.3186, "step": 357 }, { "epoch": 0.38, "grad_norm": 0.08772280067205429, "learning_rate": 4.7334945520324265e-05, "loss": 1.3425, "step": 358 }, { "epoch": 0.38, "grad_norm": 0.10162526369094849, "learning_rate": 4.731508753951122e-05, "loss": 1.4125, "step": 359 }, { "epoch": 0.38, "grad_norm": 0.09132665395736694, "learning_rate": 4.729516004694808e-05, "loss": 1.3316, "step": 360 }, { "epoch": 0.39, "grad_norm": 0.0943852961063385, "learning_rate": 4.72751631047092e-05, "loss": 1.2758, "step": 361 }, { "epoch": 0.39, "grad_norm": 0.08872327208518982, "learning_rate": 4.725509677508528e-05, "loss": 1.3476, "step": 362 }, { "epoch": 0.39, "grad_norm": 0.09403718262910843, "learning_rate": 4.7234961120583174e-05, "loss": 1.2631, "step": 363 }, { "epoch": 0.39, "grad_norm": 0.08866386115550995, "learning_rate": 4.7214756203925676e-05, "loss": 1.3324, "step": 364 }, { "epoch": 0.39, "grad_norm": 0.10122651606798172, "learning_rate": 4.719448208805132e-05, "loss": 1.3209, "step": 365 }, { "epoch": 0.39, "grad_norm": 0.09378830343484879, "learning_rate": 4.71741388361142e-05, "loss": 1.3856, "step": 366 }, { "epoch": 0.39, "grad_norm": 0.09723149240016937, "learning_rate": 4.7153726511483786e-05, "loss": 1.3676, "step": 367 }, { "epoch": 0.39, "grad_norm": 0.08986017853021622, "learning_rate": 4.713324517774471e-05, "loss": 1.324, "step": 368 }, { "epoch": 0.39, "grad_norm": 0.08899199962615967, "learning_rate": 4.711269489869653e-05, "loss": 1.213, "step": 369 }, { "epoch": 0.39, "grad_norm": 0.0900987833738327, "learning_rate": 4.709207573835363e-05, "loss": 1.3337, "step": 370 }, { "epoch": 0.4, "grad_norm": 0.0994141548871994, "learning_rate": 4.70713877609449e-05, "loss": 1.458, "step": 371 }, { "epoch": 0.4, "grad_norm": 0.10680098831653595, "learning_rate": 4.7050631030913644e-05, "loss": 1.3494, "step": 372 }, { "epoch": 0.4, "grad_norm": 0.17104795575141907, "learning_rate": 4.70298056129173e-05, "loss": 1.2644, "step": 373 }, { "epoch": 0.4, "grad_norm": 0.08228933811187744, "learning_rate": 4.700891157182729e-05, "loss": 1.2638, "step": 374 }, { "epoch": 0.4, "grad_norm": 0.08857174217700958, "learning_rate": 4.698794897272877e-05, "loss": 1.3473, "step": 375 }, { "epoch": 0.4, "grad_norm": 0.08300795406103134, "learning_rate": 4.696691788092049e-05, "loss": 1.3295, "step": 376 }, { "epoch": 0.4, "grad_norm": 0.11059122532606125, "learning_rate": 4.694581836191454e-05, "loss": 1.2383, "step": 377 }, { "epoch": 0.4, "grad_norm": 0.09859713912010193, "learning_rate": 4.692465048143615e-05, "loss": 1.3404, "step": 378 }, { "epoch": 0.4, "grad_norm": 0.08896754682064056, "learning_rate": 4.6903414305423507e-05, "loss": 1.3865, "step": 379 }, { "epoch": 0.41, "grad_norm": 0.091724693775177, "learning_rate": 4.688210990002755e-05, "loss": 1.3131, "step": 380 }, { "epoch": 0.41, "grad_norm": 0.09057147055864334, "learning_rate": 4.686073733161173e-05, "loss": 1.2792, "step": 381 }, { "epoch": 0.41, "grad_norm": 0.09829523414373398, "learning_rate": 4.683929666675185e-05, "loss": 1.331, "step": 382 }, { "epoch": 0.41, "grad_norm": 0.09669854491949081, "learning_rate": 4.681778797223582e-05, "loss": 1.3238, "step": 383 }, { "epoch": 0.41, "grad_norm": 0.0903388261795044, "learning_rate": 4.679621131506347e-05, "loss": 1.295, "step": 384 }, { "epoch": 0.41, "grad_norm": 0.09009499102830887, "learning_rate": 4.6774566762446324e-05, "loss": 1.3714, "step": 385 }, { "epoch": 0.41, "grad_norm": 0.09612104296684265, "learning_rate": 4.675285438180741e-05, "loss": 1.3032, "step": 386 }, { "epoch": 0.41, "grad_norm": 0.09140989184379578, "learning_rate": 4.673107424078105e-05, "loss": 1.4714, "step": 387 }, { "epoch": 0.41, "grad_norm": 0.0838828906416893, "learning_rate": 4.670922640721261e-05, "loss": 1.2755, "step": 388 }, { "epoch": 0.42, "grad_norm": 0.09146388620138168, "learning_rate": 4.6687310949158344e-05, "loss": 1.3965, "step": 389 }, { "epoch": 0.42, "grad_norm": 0.09024867415428162, "learning_rate": 4.6665327934885174e-05, "loss": 1.3613, "step": 390 }, { "epoch": 0.42, "grad_norm": 0.08649884164333344, "learning_rate": 4.664327743287041e-05, "loss": 1.2359, "step": 391 }, { "epoch": 0.42, "grad_norm": 0.0954437181353569, "learning_rate": 4.6621159511801635e-05, "loss": 1.315, "step": 392 }, { "epoch": 0.42, "grad_norm": 0.09183992445468903, "learning_rate": 4.6598974240576406e-05, "loss": 1.3338, "step": 393 }, { "epoch": 0.42, "grad_norm": 0.09896053373813629, "learning_rate": 4.6576721688302105e-05, "loss": 1.3156, "step": 394 }, { "epoch": 0.42, "grad_norm": 0.0989176332950592, "learning_rate": 4.6554401924295686e-05, "loss": 1.3154, "step": 395 }, { "epoch": 0.42, "grad_norm": 0.08492989093065262, "learning_rate": 4.653201501808346e-05, "loss": 1.3708, "step": 396 }, { "epoch": 0.42, "grad_norm": 0.09674088656902313, "learning_rate": 4.650956103940089e-05, "loss": 1.3821, "step": 397 }, { "epoch": 0.42, "grad_norm": 0.09763908386230469, "learning_rate": 4.648704005819238e-05, "loss": 1.4568, "step": 398 }, { "epoch": 0.43, "grad_norm": 0.09475667029619217, "learning_rate": 4.6464452144611046e-05, "loss": 1.3513, "step": 399 }, { "epoch": 0.43, "grad_norm": 0.09377523511648178, "learning_rate": 4.644179736901848e-05, "loss": 1.3554, "step": 400 }, { "epoch": 0.43, "grad_norm": 0.08964923769235611, "learning_rate": 4.641907580198458e-05, "loss": 1.3095, "step": 401 }, { "epoch": 0.43, "grad_norm": 0.08872678130865097, "learning_rate": 4.6396287514287275e-05, "loss": 1.3737, "step": 402 }, { "epoch": 0.43, "grad_norm": 0.0877537652850151, "learning_rate": 4.637343257691234e-05, "loss": 1.3265, "step": 403 }, { "epoch": 0.43, "grad_norm": 0.08949404209852219, "learning_rate": 4.635051106105316e-05, "loss": 1.3761, "step": 404 }, { "epoch": 0.43, "grad_norm": 0.1385626494884491, "learning_rate": 4.632752303811053e-05, "loss": 1.4187, "step": 405 }, { "epoch": 0.43, "grad_norm": 0.09133272618055344, "learning_rate": 4.6304468579692384e-05, "loss": 1.1747, "step": 406 }, { "epoch": 0.43, "grad_norm": 0.09598804265260696, "learning_rate": 4.6281347757613626e-05, "loss": 1.3986, "step": 407 }, { "epoch": 0.44, "grad_norm": 0.09424811601638794, "learning_rate": 4.625816064389589e-05, "loss": 1.3617, "step": 408 }, { "epoch": 0.44, "grad_norm": 0.10442469269037247, "learning_rate": 4.623490731076728e-05, "loss": 1.2631, "step": 409 }, { "epoch": 0.44, "grad_norm": 0.08816258609294891, "learning_rate": 4.62115878306622e-05, "loss": 1.3181, "step": 410 }, { "epoch": 0.44, "grad_norm": 0.09299376606941223, "learning_rate": 4.61882022762211e-05, "loss": 1.3832, "step": 411 }, { "epoch": 0.44, "grad_norm": 0.09412654489278793, "learning_rate": 4.616475072029024e-05, "loss": 1.2522, "step": 412 }, { "epoch": 0.44, "grad_norm": 0.16249871253967285, "learning_rate": 4.614123323592148e-05, "loss": 1.4142, "step": 413 }, { "epoch": 0.44, "grad_norm": 0.08951661735773087, "learning_rate": 4.611764989637205e-05, "loss": 1.26, "step": 414 }, { "epoch": 0.44, "grad_norm": 0.09598527103662491, "learning_rate": 4.609400077510433e-05, "loss": 1.3028, "step": 415 }, { "epoch": 0.44, "grad_norm": 0.09367496520280838, "learning_rate": 4.607028594578559e-05, "loss": 1.2739, "step": 416 }, { "epoch": 0.45, "grad_norm": 0.09410126507282257, "learning_rate": 4.6046505482287794e-05, "loss": 1.4011, "step": 417 }, { "epoch": 0.45, "grad_norm": 0.0993422120809555, "learning_rate": 4.602265945868735e-05, "loss": 1.2641, "step": 418 }, { "epoch": 0.45, "grad_norm": 0.08851709961891174, "learning_rate": 4.59987479492649e-05, "loss": 1.3415, "step": 419 }, { "epoch": 0.45, "grad_norm": 0.10108134895563126, "learning_rate": 4.5974771028505064e-05, "loss": 1.3679, "step": 420 }, { "epoch": 0.45, "grad_norm": 0.08684688806533813, "learning_rate": 4.595072877109622e-05, "loss": 1.3404, "step": 421 }, { "epoch": 0.45, "grad_norm": 0.102836474776268, "learning_rate": 4.592662125193027e-05, "loss": 1.2668, "step": 422 }, { "epoch": 0.45, "grad_norm": 0.08989983052015305, "learning_rate": 4.59024485461024e-05, "loss": 1.3408, "step": 423 }, { "epoch": 0.45, "grad_norm": 0.11563542485237122, "learning_rate": 4.5878210728910894e-05, "loss": 1.4273, "step": 424 }, { "epoch": 0.45, "grad_norm": 0.09047998487949371, "learning_rate": 4.585390787585679e-05, "loss": 1.3598, "step": 425 }, { "epoch": 0.45, "grad_norm": 0.17246578633785248, "learning_rate": 4.5829540062643773e-05, "loss": 1.4129, "step": 426 }, { "epoch": 0.46, "grad_norm": 0.09310886263847351, "learning_rate": 4.5805107365177844e-05, "loss": 1.3474, "step": 427 }, { "epoch": 0.46, "grad_norm": 0.12132611870765686, "learning_rate": 4.5780609859567136e-05, "loss": 1.2837, "step": 428 }, { "epoch": 0.46, "grad_norm": 0.09810657054185867, "learning_rate": 4.575604762212167e-05, "loss": 1.2571, "step": 429 }, { "epoch": 0.46, "grad_norm": 0.1014370396733284, "learning_rate": 4.573142072935307e-05, "loss": 1.3435, "step": 430 }, { "epoch": 0.46, "grad_norm": 0.09363662451505661, "learning_rate": 4.570672925797439e-05, "loss": 1.3725, "step": 431 }, { "epoch": 0.46, "grad_norm": 0.11186376214027405, "learning_rate": 4.568197328489986e-05, "loss": 1.3095, "step": 432 }, { "epoch": 0.46, "grad_norm": 0.09949055314064026, "learning_rate": 4.5657152887244606e-05, "loss": 1.2912, "step": 433 }, { "epoch": 0.46, "grad_norm": 0.10397903621196747, "learning_rate": 4.563226814232444e-05, "loss": 1.3221, "step": 434 }, { "epoch": 0.46, "grad_norm": 0.09242431074380875, "learning_rate": 4.5607319127655636e-05, "loss": 1.3132, "step": 435 }, { "epoch": 0.47, "grad_norm": 0.09308359771966934, "learning_rate": 4.5582305920954643e-05, "loss": 1.3784, "step": 436 }, { "epoch": 0.47, "grad_norm": 0.10053109377622604, "learning_rate": 4.555722860013789e-05, "loss": 1.3767, "step": 437 }, { "epoch": 0.47, "grad_norm": 0.0856412872672081, "learning_rate": 4.553208724332153e-05, "loss": 1.3518, "step": 438 }, { "epoch": 0.47, "grad_norm": 0.09756068885326385, "learning_rate": 4.550688192882115e-05, "loss": 1.5058, "step": 439 }, { "epoch": 0.47, "grad_norm": 0.10069140046834946, "learning_rate": 4.54816127351516e-05, "loss": 1.3025, "step": 440 }, { "epoch": 0.47, "grad_norm": 0.0943472757935524, "learning_rate": 4.545627974102671e-05, "loss": 1.3474, "step": 441 }, { "epoch": 0.47, "grad_norm": 0.094652459025383, "learning_rate": 4.543088302535903e-05, "loss": 1.3433, "step": 442 }, { "epoch": 0.47, "grad_norm": 0.09766332805156708, "learning_rate": 4.540542266725963e-05, "loss": 1.2912, "step": 443 }, { "epoch": 0.47, "grad_norm": 0.0971674770116806, "learning_rate": 4.5379898746037804e-05, "loss": 1.269, "step": 444 }, { "epoch": 0.47, "grad_norm": 0.09349120408296585, "learning_rate": 4.535431134120086e-05, "loss": 1.2813, "step": 445 }, { "epoch": 0.48, "grad_norm": 0.15912680327892303, "learning_rate": 4.532866053245385e-05, "loss": 1.323, "step": 446 }, { "epoch": 0.48, "grad_norm": 0.1172761395573616, "learning_rate": 4.530294639969934e-05, "loss": 1.2852, "step": 447 }, { "epoch": 0.48, "grad_norm": 0.09645481407642365, "learning_rate": 4.527716902303714e-05, "loss": 1.3793, "step": 448 }, { "epoch": 0.48, "grad_norm": 0.09305194765329361, "learning_rate": 4.525132848276406e-05, "loss": 1.4287, "step": 449 }, { "epoch": 0.48, "grad_norm": 0.09805100411176682, "learning_rate": 4.522542485937369e-05, "loss": 1.4003, "step": 450 }, { "epoch": 0.48, "grad_norm": 0.09443584084510803, "learning_rate": 4.51994582335561e-05, "loss": 1.4108, "step": 451 }, { "epoch": 0.48, "grad_norm": 0.09847624599933624, "learning_rate": 4.517342868619764e-05, "loss": 1.3774, "step": 452 }, { "epoch": 0.48, "grad_norm": 0.09076975286006927, "learning_rate": 4.514733629838063e-05, "loss": 1.3411, "step": 453 }, { "epoch": 0.48, "grad_norm": 0.09129495918750763, "learning_rate": 4.5121181151383143e-05, "loss": 1.2343, "step": 454 }, { "epoch": 0.49, "grad_norm": 0.09538775682449341, "learning_rate": 4.509496332667878e-05, "loss": 1.309, "step": 455 }, { "epoch": 0.49, "grad_norm": 0.0932200700044632, "learning_rate": 4.506868290593635e-05, "loss": 1.4046, "step": 456 }, { "epoch": 0.49, "grad_norm": 0.09325648844242096, "learning_rate": 4.5042339971019666e-05, "loss": 1.2966, "step": 457 }, { "epoch": 0.49, "grad_norm": 0.10446179658174515, "learning_rate": 4.501593460398726e-05, "loss": 1.3569, "step": 458 }, { "epoch": 0.49, "grad_norm": 0.09409455209970474, "learning_rate": 4.498946688709216e-05, "loss": 1.3351, "step": 459 }, { "epoch": 0.49, "grad_norm": 0.09080103039741516, "learning_rate": 4.4962936902781594e-05, "loss": 1.2639, "step": 460 }, { "epoch": 0.49, "grad_norm": 0.09168045967817307, "learning_rate": 4.493634473369677e-05, "loss": 1.3146, "step": 461 }, { "epoch": 0.49, "grad_norm": 0.08616971224546432, "learning_rate": 4.4909690462672585e-05, "loss": 1.3693, "step": 462 }, { "epoch": 0.49, "grad_norm": 0.09884229302406311, "learning_rate": 4.488297417273741e-05, "loss": 1.3904, "step": 463 }, { "epoch": 0.5, "grad_norm": 0.09650994092226028, "learning_rate": 4.4856195947112776e-05, "loss": 1.4442, "step": 464 }, { "epoch": 0.5, "grad_norm": 0.09592213481664658, "learning_rate": 4.482935586921316e-05, "loss": 1.3532, "step": 465 }, { "epoch": 0.5, "grad_norm": 0.1017376109957695, "learning_rate": 4.480245402264572e-05, "loss": 1.3927, "step": 466 }, { "epoch": 0.5, "grad_norm": 0.08732669800519943, "learning_rate": 4.4775490491210016e-05, "loss": 1.3695, "step": 467 }, { "epoch": 0.5, "grad_norm": 0.09283038228750229, "learning_rate": 4.474846535889773e-05, "loss": 1.3465, "step": 468 }, { "epoch": 0.5, "grad_norm": 0.09399215877056122, "learning_rate": 4.472137870989247e-05, "loss": 1.3504, "step": 469 }, { "epoch": 0.5, "grad_norm": 0.09396019577980042, "learning_rate": 4.4694230628569454e-05, "loss": 1.328, "step": 470 }, { "epoch": 0.5, "grad_norm": 0.09818755835294724, "learning_rate": 4.466702119949526e-05, "loss": 1.3597, "step": 471 }, { "epoch": 0.5, "grad_norm": 0.10373959690332413, "learning_rate": 4.463975050742757e-05, "loss": 1.2681, "step": 472 }, { "epoch": 0.5, "grad_norm": 0.0993281677365303, "learning_rate": 4.461241863731489e-05, "loss": 1.3057, "step": 473 }, { "epoch": 0.51, "grad_norm": 0.09096087515354156, "learning_rate": 4.4585025674296315e-05, "loss": 1.3061, "step": 474 }, { "epoch": 0.51, "grad_norm": 0.09843237698078156, "learning_rate": 4.4557571703701226e-05, "loss": 1.2969, "step": 475 }, { "epoch": 0.51, "grad_norm": 0.09445469826459885, "learning_rate": 4.453005681104906e-05, "loss": 1.2051, "step": 476 }, { "epoch": 0.51, "grad_norm": 0.10724281519651413, "learning_rate": 4.4502481082049016e-05, "loss": 1.3629, "step": 477 }, { "epoch": 0.51, "grad_norm": 0.08827083557844162, "learning_rate": 4.4474844602599795e-05, "loss": 1.342, "step": 478 }, { "epoch": 0.51, "grad_norm": 0.09455974400043488, "learning_rate": 4.444714745878936e-05, "loss": 1.2408, "step": 479 }, { "epoch": 0.51, "grad_norm": 0.0963449627161026, "learning_rate": 4.44193897368946e-05, "loss": 1.3956, "step": 480 }, { "epoch": 0.51, "grad_norm": 0.1244681105017662, "learning_rate": 4.439157152338116e-05, "loss": 1.3086, "step": 481 }, { "epoch": 0.51, "grad_norm": 0.10625208914279938, "learning_rate": 4.436369290490307e-05, "loss": 1.3869, "step": 482 }, { "epoch": 0.52, "grad_norm": 0.10267725586891174, "learning_rate": 4.433575396830256e-05, "loss": 1.3136, "step": 483 }, { "epoch": 0.52, "grad_norm": 0.09765701740980148, "learning_rate": 4.4307754800609725e-05, "loss": 1.2566, "step": 484 }, { "epoch": 0.52, "grad_norm": 0.08589779585599899, "learning_rate": 4.427969548904228e-05, "loss": 1.3314, "step": 485 }, { "epoch": 0.52, "grad_norm": 0.08467262983322144, "learning_rate": 4.4251576121005314e-05, "loss": 1.228, "step": 486 }, { "epoch": 0.52, "grad_norm": 0.09379052370786667, "learning_rate": 4.422339678409096e-05, "loss": 1.3658, "step": 487 }, { "epoch": 0.52, "grad_norm": 0.10410299897193909, "learning_rate": 4.4195157566078186e-05, "loss": 1.2542, "step": 488 }, { "epoch": 0.52, "grad_norm": 0.09087986499071121, "learning_rate": 4.416685855493247e-05, "loss": 1.2092, "step": 489 }, { "epoch": 0.52, "grad_norm": 0.09085581451654434, "learning_rate": 4.413849983880554e-05, "loss": 1.2962, "step": 490 }, { "epoch": 0.52, "grad_norm": 0.09474420547485352, "learning_rate": 4.411008150603514e-05, "loss": 1.3953, "step": 491 }, { "epoch": 0.53, "grad_norm": 0.09279070049524307, "learning_rate": 4.408160364514468e-05, "loss": 1.3184, "step": 492 }, { "epoch": 0.53, "grad_norm": 0.09756158292293549, "learning_rate": 4.405306634484303e-05, "loss": 1.3197, "step": 493 }, { "epoch": 0.53, "grad_norm": 0.09180128574371338, "learning_rate": 4.40244696940242e-05, "loss": 1.2856, "step": 494 }, { "epoch": 0.53, "grad_norm": 0.09457723051309586, "learning_rate": 4.399581378176707e-05, "loss": 1.3081, "step": 495 }, { "epoch": 0.53, "grad_norm": 0.09794706106185913, "learning_rate": 4.396709869733515e-05, "loss": 1.3562, "step": 496 }, { "epoch": 0.53, "grad_norm": 0.0874537006020546, "learning_rate": 4.3938324530176236e-05, "loss": 1.3235, "step": 497 }, { "epoch": 0.53, "grad_norm": 0.0974416732788086, "learning_rate": 4.39094913699222e-05, "loss": 1.4397, "step": 498 }, { "epoch": 0.53, "grad_norm": 0.09341360628604889, "learning_rate": 4.388059930638865e-05, "loss": 1.3448, "step": 499 }, { "epoch": 0.53, "grad_norm": 0.08748738467693329, "learning_rate": 4.385164842957469e-05, "loss": 1.2659, "step": 500 } ], "logging_steps": 1.0, "max_steps": 1874, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "total_flos": 1.3535591043956736e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }