{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 752, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026595744680851063, "grad_norm": 7.823265044222166, "learning_rate": 0.0, "loss": 3.0134, "step": 1 }, { "epoch": 0.005319148936170213, "grad_norm": 7.642957709635029, "learning_rate": 1.6005307325482135e-07, "loss": 3.1765, "step": 2 }, { "epoch": 0.007978723404255319, "grad_norm": 7.334880781186477, "learning_rate": 2.5367811923406806e-07, "loss": 3.015, "step": 3 }, { "epoch": 0.010638297872340425, "grad_norm": 7.622164840160959, "learning_rate": 3.201061465096427e-07, "loss": 3.0191, "step": 4 }, { "epoch": 0.013297872340425532, "grad_norm": 7.559561166288389, "learning_rate": 3.716317274634347e-07, "loss": 3.0604, "step": 5 }, { "epoch": 0.015957446808510637, "grad_norm": 7.537486932594524, "learning_rate": 4.137311924888894e-07, "loss": 3.0974, "step": 6 }, { "epoch": 0.018617021276595744, "grad_norm": 7.852202340999875, "learning_rate": 4.4932578299236894e-07, "loss": 3.0015, "step": 7 }, { "epoch": 0.02127659574468085, "grad_norm": 7.556325866974648, "learning_rate": 4.80159219764464e-07, "loss": 3.0507, "step": 8 }, { "epoch": 0.023936170212765957, "grad_norm": 7.332175625826142, "learning_rate": 5.073562384681361e-07, "loss": 3.127, "step": 9 }, { "epoch": 0.026595744680851064, "grad_norm": 7.725255533578292, "learning_rate": 5.316848007182561e-07, "loss": 3.0381, "step": 10 }, { "epoch": 0.02925531914893617, "grad_norm": 7.457326924140823, "learning_rate": 5.536926622778005e-07, "loss": 3.0634, "step": 11 }, { "epoch": 0.031914893617021274, "grad_norm": 7.693199711944396, "learning_rate": 5.737842657437107e-07, "loss": 3.0101, "step": 12 }, { "epoch": 0.034574468085106384, "grad_norm": 7.313122323445423, "learning_rate": 5.922667492826867e-07, "loss": 3.0967, "step": 13 }, { "epoch": 0.03723404255319149, "grad_norm": 7.381687042192129, "learning_rate": 6.093788562471904e-07, "loss": 3.0606, "step": 14 }, { "epoch": 0.0398936170212766, "grad_norm": 7.501689466289098, "learning_rate": 6.253098466975028e-07, "loss": 3.0923, "step": 15 }, { "epoch": 0.0425531914893617, "grad_norm": 7.764166896561166, "learning_rate": 6.402122930192854e-07, "loss": 3.0133, "step": 16 }, { "epoch": 0.04521276595744681, "grad_norm": 7.632126776388045, "learning_rate": 6.542109895570008e-07, "loss": 3.0261, "step": 17 }, { "epoch": 0.047872340425531915, "grad_norm": 7.5260472985128875, "learning_rate": 6.674093117229574e-07, "loss": 3.0122, "step": 18 }, { "epoch": 0.05053191489361702, "grad_norm": 7.760501268851623, "learning_rate": 6.798938534903572e-07, "loss": 2.8592, "step": 19 }, { "epoch": 0.05319148936170213, "grad_norm": 7.498060611474783, "learning_rate": 6.917378739730775e-07, "loss": 2.8595, "step": 20 }, { "epoch": 0.05585106382978723, "grad_norm": 7.527553046681602, "learning_rate": 7.030039022264371e-07, "loss": 2.7323, "step": 21 }, { "epoch": 0.05851063829787234, "grad_norm": 7.139599749462118, "learning_rate": 7.13745735532622e-07, "loss": 3.0506, "step": 22 }, { "epoch": 0.061170212765957445, "grad_norm": 7.6904167286597165, "learning_rate": 7.24009993125516e-07, "loss": 2.8334, "step": 23 }, { "epoch": 0.06382978723404255, "grad_norm": 7.13831891441032, "learning_rate": 7.338373389985321e-07, "loss": 3.0714, "step": 24 }, { "epoch": 0.06648936170212766, "grad_norm": 7.277657469838149, "learning_rate": 7.432634549268694e-07, "loss": 3.0034, "step": 25 }, { "epoch": 0.06914893617021277, "grad_norm": 7.601860581578553, "learning_rate": 7.52319822537508e-07, "loss": 2.8405, "step": 26 }, { "epoch": 0.07180851063829788, "grad_norm": 7.406160378215662, "learning_rate": 7.610343577022042e-07, "loss": 2.8776, "step": 27 }, { "epoch": 0.07446808510638298, "grad_norm": 7.501158815505739, "learning_rate": 7.694319295020116e-07, "loss": 2.8623, "step": 28 }, { "epoch": 0.07712765957446809, "grad_norm": 7.475182990956878, "learning_rate": 7.775347880836832e-07, "loss": 2.8176, "step": 29 }, { "epoch": 0.0797872340425532, "grad_norm": 6.90076598186568, "learning_rate": 7.853629199523242e-07, "loss": 2.6601, "step": 30 }, { "epoch": 0.08244680851063829, "grad_norm": 7.317267534729387, "learning_rate": 7.929343449851162e-07, "loss": 2.5921, "step": 31 }, { "epoch": 0.0851063829787234, "grad_norm": 6.540702440994457, "learning_rate": 8.002653662741068e-07, "loss": 2.7996, "step": 32 }, { "epoch": 0.08776595744680851, "grad_norm": 7.135880043595326, "learning_rate": 8.073707815118686e-07, "loss": 2.5778, "step": 33 }, { "epoch": 0.09042553191489362, "grad_norm": 6.507937939592382, "learning_rate": 8.142640628118222e-07, "loss": 2.7356, "step": 34 }, { "epoch": 0.09308510638297872, "grad_norm": 6.445617298664252, "learning_rate": 8.209575104558038e-07, "loss": 2.7163, "step": 35 }, { "epoch": 0.09574468085106383, "grad_norm": 6.62462859642164, "learning_rate": 8.274623849777788e-07, "loss": 2.6283, "step": 36 }, { "epoch": 0.09840425531914894, "grad_norm": 6.7066040969467995, "learning_rate": 8.337890211465859e-07, "loss": 2.5485, "step": 37 }, { "epoch": 0.10106382978723404, "grad_norm": 6.6019115107446815, "learning_rate": 8.399469267451787e-07, "loss": 2.5864, "step": 38 }, { "epoch": 0.10372340425531915, "grad_norm": 6.526338473606032, "learning_rate": 8.459448685167547e-07, "loss": 2.6546, "step": 39 }, { "epoch": 0.10638297872340426, "grad_norm": 6.677090538207485, "learning_rate": 8.517909472278988e-07, "loss": 2.6135, "step": 40 }, { "epoch": 0.10904255319148937, "grad_norm": 6.627853354895093, "learning_rate": 8.574926634616532e-07, "loss": 2.6117, "step": 41 }, { "epoch": 0.11170212765957446, "grad_norm": 6.210420944517809, "learning_rate": 8.630569754812584e-07, "loss": 2.6221, "step": 42 }, { "epoch": 0.11436170212765957, "grad_norm": 6.490293150689325, "learning_rate": 8.684903502843901e-07, "loss": 2.6191, "step": 43 }, { "epoch": 0.11702127659574468, "grad_norm": 6.5133887096434036, "learning_rate": 8.737988087874431e-07, "loss": 2.6479, "step": 44 }, { "epoch": 0.1196808510638298, "grad_norm": 6.5629458321401275, "learning_rate": 8.789879659315709e-07, "loss": 2.5919, "step": 45 }, { "epoch": 0.12234042553191489, "grad_norm": 6.263965110944729, "learning_rate": 8.840630663803374e-07, "loss": 2.564, "step": 46 }, { "epoch": 0.125, "grad_norm": 6.2925368290438355, "learning_rate": 8.890290163779749e-07, "loss": 2.4828, "step": 47 }, { "epoch": 0.1276595744680851, "grad_norm": 5.562972488052771, "learning_rate": 8.938904122533535e-07, "loss": 2.3536, "step": 48 }, { "epoch": 0.13031914893617022, "grad_norm": 5.051596119252882, "learning_rate": 8.986515659847379e-07, "loss": 2.2057, "step": 49 }, { "epoch": 0.13297872340425532, "grad_norm": 4.783115795828969, "learning_rate": 9.033165281816909e-07, "loss": 2.1078, "step": 50 }, { "epoch": 0.1356382978723404, "grad_norm": 4.0379873353666, "learning_rate": 9.078891087910689e-07, "loss": 2.1146, "step": 51 }, { "epoch": 0.13829787234042554, "grad_norm": 4.425461679286027, "learning_rate": 9.123728957923294e-07, "loss": 2.106, "step": 52 }, { "epoch": 0.14095744680851063, "grad_norm": 4.219063302168513, "learning_rate": 9.167712721119934e-07, "loss": 2.0448, "step": 53 }, { "epoch": 0.14361702127659576, "grad_norm": 4.177230312333208, "learning_rate": 9.210874309570255e-07, "loss": 2.106, "step": 54 }, { "epoch": 0.14627659574468085, "grad_norm": 3.58662530955563, "learning_rate": 9.253243897412354e-07, "loss": 2.1577, "step": 55 }, { "epoch": 0.14893617021276595, "grad_norm": 3.8076903225375607, "learning_rate": 9.294850027568331e-07, "loss": 2.044, "step": 56 }, { "epoch": 0.15159574468085107, "grad_norm": 3.9757823965828445, "learning_rate": 9.335719727244254e-07, "loss": 2.1354, "step": 57 }, { "epoch": 0.15425531914893617, "grad_norm": 3.8984355429604305, "learning_rate": 9.375878613385046e-07, "loss": 2.0297, "step": 58 }, { "epoch": 0.15691489361702127, "grad_norm": 4.006300970220442, "learning_rate": 9.415350989114764e-07, "loss": 1.8268, "step": 59 }, { "epoch": 0.1595744680851064, "grad_norm": 3.7231660155630126, "learning_rate": 9.454159932071455e-07, "loss": 1.8824, "step": 60 }, { "epoch": 0.1622340425531915, "grad_norm": 3.896921356096762, "learning_rate": 9.492327375440568e-07, "loss": 1.9475, "step": 61 }, { "epoch": 0.16489361702127658, "grad_norm": 3.1704796037774394, "learning_rate": 9.529874182399376e-07, "loss": 1.9461, "step": 62 }, { "epoch": 0.1675531914893617, "grad_norm": 3.624185273266048, "learning_rate": 9.566820214605051e-07, "loss": 2.0426, "step": 63 }, { "epoch": 0.1702127659574468, "grad_norm": 3.438777616799716, "learning_rate": 9.60318439528928e-07, "loss": 1.9094, "step": 64 }, { "epoch": 0.17287234042553193, "grad_norm": 4.546206080990496, "learning_rate": 9.638984767461214e-07, "loss": 1.9037, "step": 65 }, { "epoch": 0.17553191489361702, "grad_norm": 3.092553572071205, "learning_rate": 9.6742385476669e-07, "loss": 1.9928, "step": 66 }, { "epoch": 0.17819148936170212, "grad_norm": 3.3574221590495807, "learning_rate": 9.708962175706178e-07, "loss": 1.9752, "step": 67 }, { "epoch": 0.18085106382978725, "grad_norm": 3.0865121040891714, "learning_rate": 9.743171360666435e-07, "loss": 1.9853, "step": 68 }, { "epoch": 0.18351063829787234, "grad_norm": 3.25288028731065, "learning_rate": 9.776881123595842e-07, "loss": 1.8024, "step": 69 }, { "epoch": 0.18617021276595744, "grad_norm": 2.8739512645435865, "learning_rate": 9.810105837106252e-07, "loss": 2.0918, "step": 70 }, { "epoch": 0.18882978723404256, "grad_norm": 2.8379601865829414, "learning_rate": 9.842859262167094e-07, "loss": 1.801, "step": 71 }, { "epoch": 0.19148936170212766, "grad_norm": 2.634217473181439, "learning_rate": 9.875154582326002e-07, "loss": 1.9093, "step": 72 }, { "epoch": 0.19414893617021275, "grad_norm": 2.674409166489119, "learning_rate": 9.907004435569156e-07, "loss": 1.8468, "step": 73 }, { "epoch": 0.19680851063829788, "grad_norm": 2.5418462429291178, "learning_rate": 9.938420944014074e-07, "loss": 2.0187, "step": 74 }, { "epoch": 0.19946808510638298, "grad_norm": 2.249520992577069, "learning_rate": 9.969415741609375e-07, "loss": 1.8433, "step": 75 }, { "epoch": 0.20212765957446807, "grad_norm": 3.4070896898561567, "learning_rate": 1e-06, "loss": 1.6904, "step": 76 }, { "epoch": 0.2047872340425532, "grad_norm": 2.405587439537431, "learning_rate": 1e-06, "loss": 1.9263, "step": 77 }, { "epoch": 0.2074468085106383, "grad_norm": 2.1351508990882686, "learning_rate": 9.985207100591716e-07, "loss": 1.9915, "step": 78 }, { "epoch": 0.21010638297872342, "grad_norm": 2.253674050573154, "learning_rate": 9.97041420118343e-07, "loss": 1.8581, "step": 79 }, { "epoch": 0.2127659574468085, "grad_norm": 1.9816102720192281, "learning_rate": 9.955621301775147e-07, "loss": 1.8838, "step": 80 }, { "epoch": 0.2154255319148936, "grad_norm": 2.1585154035600502, "learning_rate": 9.940828402366864e-07, "loss": 1.7891, "step": 81 }, { "epoch": 0.21808510638297873, "grad_norm": 2.1927798177222466, "learning_rate": 9.92603550295858e-07, "loss": 1.6338, "step": 82 }, { "epoch": 0.22074468085106383, "grad_norm": 2.1195681092445606, "learning_rate": 9.911242603550295e-07, "loss": 1.8233, "step": 83 }, { "epoch": 0.22340425531914893, "grad_norm": 1.79894840198714, "learning_rate": 9.896449704142011e-07, "loss": 1.6456, "step": 84 }, { "epoch": 0.22606382978723405, "grad_norm": 2.1334819930653004, "learning_rate": 9.881656804733728e-07, "loss": 1.7721, "step": 85 }, { "epoch": 0.22872340425531915, "grad_norm": 2.2522144017174988, "learning_rate": 9.866863905325444e-07, "loss": 1.8225, "step": 86 }, { "epoch": 0.23138297872340424, "grad_norm": 2.1109395194449885, "learning_rate": 9.852071005917159e-07, "loss": 1.8103, "step": 87 }, { "epoch": 0.23404255319148937, "grad_norm": 1.889220101950186, "learning_rate": 9.837278106508875e-07, "loss": 1.7531, "step": 88 }, { "epoch": 0.23670212765957446, "grad_norm": 2.0103297503036797, "learning_rate": 9.822485207100592e-07, "loss": 1.4589, "step": 89 }, { "epoch": 0.2393617021276596, "grad_norm": 2.0087468092299012, "learning_rate": 9.807692307692306e-07, "loss": 1.7291, "step": 90 }, { "epoch": 0.24202127659574468, "grad_norm": 2.1537427838116603, "learning_rate": 9.792899408284023e-07, "loss": 1.8277, "step": 91 }, { "epoch": 0.24468085106382978, "grad_norm": 1.811327456337824, "learning_rate": 9.77810650887574e-07, "loss": 1.6806, "step": 92 }, { "epoch": 0.2473404255319149, "grad_norm": 1.8193216533171808, "learning_rate": 9.763313609467456e-07, "loss": 1.5615, "step": 93 }, { "epoch": 0.25, "grad_norm": 1.7880991378455267, "learning_rate": 9.748520710059172e-07, "loss": 1.6315, "step": 94 }, { "epoch": 0.2526595744680851, "grad_norm": 1.9732905510612142, "learning_rate": 9.733727810650887e-07, "loss": 1.6118, "step": 95 }, { "epoch": 0.2553191489361702, "grad_norm": 1.8507510310387487, "learning_rate": 9.718934911242603e-07, "loss": 1.562, "step": 96 }, { "epoch": 0.2579787234042553, "grad_norm": 1.937704198597928, "learning_rate": 9.704142011834318e-07, "loss": 1.5891, "step": 97 }, { "epoch": 0.26063829787234044, "grad_norm": 1.8949539814094551, "learning_rate": 9.689349112426034e-07, "loss": 1.55, "step": 98 }, { "epoch": 0.2632978723404255, "grad_norm": 1.7741323445830024, "learning_rate": 9.67455621301775e-07, "loss": 1.734, "step": 99 }, { "epoch": 0.26595744680851063, "grad_norm": 1.7021314190064671, "learning_rate": 9.659763313609467e-07, "loss": 1.4889, "step": 100 }, { "epoch": 0.26861702127659576, "grad_norm": 1.9644914618403917, "learning_rate": 9.644970414201184e-07, "loss": 1.7278, "step": 101 }, { "epoch": 0.2712765957446808, "grad_norm": 1.8634125925152643, "learning_rate": 9.630177514792898e-07, "loss": 1.5682, "step": 102 }, { "epoch": 0.27393617021276595, "grad_norm": 1.8401952841001055, "learning_rate": 9.615384615384615e-07, "loss": 1.565, "step": 103 }, { "epoch": 0.2765957446808511, "grad_norm": 1.804040900318666, "learning_rate": 9.600591715976331e-07, "loss": 1.5869, "step": 104 }, { "epoch": 0.27925531914893614, "grad_norm": 1.706090462740245, "learning_rate": 9.585798816568048e-07, "loss": 1.5148, "step": 105 }, { "epoch": 0.28191489361702127, "grad_norm": 1.728525487149655, "learning_rate": 9.571005917159762e-07, "loss": 1.5603, "step": 106 }, { "epoch": 0.2845744680851064, "grad_norm": 1.7524632420405768, "learning_rate": 9.556213017751479e-07, "loss": 1.6348, "step": 107 }, { "epoch": 0.2872340425531915, "grad_norm": 1.7478581672975904, "learning_rate": 9.541420118343195e-07, "loss": 1.6436, "step": 108 }, { "epoch": 0.2898936170212766, "grad_norm": 1.9985633100646443, "learning_rate": 9.526627218934911e-07, "loss": 1.7681, "step": 109 }, { "epoch": 0.2925531914893617, "grad_norm": 2.079481786216591, "learning_rate": 9.511834319526627e-07, "loss": 1.419, "step": 110 }, { "epoch": 0.29521276595744683, "grad_norm": 1.6892443930394687, "learning_rate": 9.497041420118342e-07, "loss": 1.6604, "step": 111 }, { "epoch": 0.2978723404255319, "grad_norm": 1.610960466459826, "learning_rate": 9.482248520710058e-07, "loss": 1.6354, "step": 112 }, { "epoch": 0.300531914893617, "grad_norm": 1.8544084912738468, "learning_rate": 9.467455621301774e-07, "loss": 1.596, "step": 113 }, { "epoch": 0.30319148936170215, "grad_norm": 1.9170248346565737, "learning_rate": 9.45266272189349e-07, "loss": 1.6547, "step": 114 }, { "epoch": 0.3058510638297872, "grad_norm": 1.7881140714522759, "learning_rate": 9.437869822485207e-07, "loss": 1.6039, "step": 115 }, { "epoch": 0.30851063829787234, "grad_norm": 1.9088889444538937, "learning_rate": 9.423076923076923e-07, "loss": 1.7795, "step": 116 }, { "epoch": 0.31117021276595747, "grad_norm": 2.2553186073976383, "learning_rate": 9.408284023668639e-07, "loss": 1.4582, "step": 117 }, { "epoch": 0.31382978723404253, "grad_norm": 1.843300845507743, "learning_rate": 9.393491124260355e-07, "loss": 1.5459, "step": 118 }, { "epoch": 0.31648936170212766, "grad_norm": 1.7169781655337961, "learning_rate": 9.378698224852071e-07, "loss": 1.5621, "step": 119 }, { "epoch": 0.3191489361702128, "grad_norm": 1.690585106680432, "learning_rate": 9.363905325443787e-07, "loss": 1.5449, "step": 120 }, { "epoch": 0.32180851063829785, "grad_norm": 1.634720340224596, "learning_rate": 9.349112426035502e-07, "loss": 1.5937, "step": 121 }, { "epoch": 0.324468085106383, "grad_norm": 1.905851103523696, "learning_rate": 9.334319526627219e-07, "loss": 1.6611, "step": 122 }, { "epoch": 0.3271276595744681, "grad_norm": 1.5977114642239374, "learning_rate": 9.319526627218934e-07, "loss": 1.3517, "step": 123 }, { "epoch": 0.32978723404255317, "grad_norm": 1.7739378101582597, "learning_rate": 9.304733727810651e-07, "loss": 1.6623, "step": 124 }, { "epoch": 0.3324468085106383, "grad_norm": 1.7433956729666737, "learning_rate": 9.289940828402366e-07, "loss": 1.565, "step": 125 }, { "epoch": 0.3351063829787234, "grad_norm": 1.8292660534852752, "learning_rate": 9.275147928994083e-07, "loss": 1.6095, "step": 126 }, { "epoch": 0.3377659574468085, "grad_norm": 1.8582148418654536, "learning_rate": 9.260355029585798e-07, "loss": 1.6689, "step": 127 }, { "epoch": 0.3404255319148936, "grad_norm": 1.5903483721719576, "learning_rate": 9.245562130177515e-07, "loss": 1.4741, "step": 128 }, { "epoch": 0.34308510638297873, "grad_norm": 1.5937293556222676, "learning_rate": 9.230769230769231e-07, "loss": 1.4603, "step": 129 }, { "epoch": 0.34574468085106386, "grad_norm": 1.8563999128333846, "learning_rate": 9.215976331360947e-07, "loss": 1.6907, "step": 130 }, { "epoch": 0.3484042553191489, "grad_norm": 1.7139518347374663, "learning_rate": 9.201183431952662e-07, "loss": 1.6474, "step": 131 }, { "epoch": 0.35106382978723405, "grad_norm": 1.7465122753927422, "learning_rate": 9.186390532544378e-07, "loss": 1.6324, "step": 132 }, { "epoch": 0.3537234042553192, "grad_norm": 1.6610443253134841, "learning_rate": 9.171597633136094e-07, "loss": 1.5045, "step": 133 }, { "epoch": 0.35638297872340424, "grad_norm": 1.655396079412198, "learning_rate": 9.15680473372781e-07, "loss": 1.5022, "step": 134 }, { "epoch": 0.35904255319148937, "grad_norm": 1.6336073697442315, "learning_rate": 9.142011834319526e-07, "loss": 1.5522, "step": 135 }, { "epoch": 0.3617021276595745, "grad_norm": 1.707143679480118, "learning_rate": 9.127218934911243e-07, "loss": 1.551, "step": 136 }, { "epoch": 0.36436170212765956, "grad_norm": 1.685651756020523, "learning_rate": 9.112426035502958e-07, "loss": 1.6122, "step": 137 }, { "epoch": 0.3670212765957447, "grad_norm": 2.358800653945757, "learning_rate": 9.097633136094675e-07, "loss": 1.5604, "step": 138 }, { "epoch": 0.3696808510638298, "grad_norm": 1.9426261400386715, "learning_rate": 9.08284023668639e-07, "loss": 1.4685, "step": 139 }, { "epoch": 0.3723404255319149, "grad_norm": 1.7028161565048658, "learning_rate": 9.068047337278106e-07, "loss": 1.4377, "step": 140 }, { "epoch": 0.375, "grad_norm": 1.6566345919926695, "learning_rate": 9.053254437869821e-07, "loss": 1.4231, "step": 141 }, { "epoch": 0.3776595744680851, "grad_norm": 1.750830521373255, "learning_rate": 9.038461538461538e-07, "loss": 1.5528, "step": 142 }, { "epoch": 0.3803191489361702, "grad_norm": 1.673773427490192, "learning_rate": 9.023668639053253e-07, "loss": 1.5206, "step": 143 }, { "epoch": 0.3829787234042553, "grad_norm": 1.9158107325263525, "learning_rate": 9.00887573964497e-07, "loss": 1.5624, "step": 144 }, { "epoch": 0.38563829787234044, "grad_norm": 1.863606934304487, "learning_rate": 8.994082840236686e-07, "loss": 1.7617, "step": 145 }, { "epoch": 0.3882978723404255, "grad_norm": 1.743890793561562, "learning_rate": 8.979289940828402e-07, "loss": 1.6736, "step": 146 }, { "epoch": 0.39095744680851063, "grad_norm": 1.7765187119696408, "learning_rate": 8.964497041420118e-07, "loss": 1.401, "step": 147 }, { "epoch": 0.39361702127659576, "grad_norm": 1.9492055399414594, "learning_rate": 8.949704142011834e-07, "loss": 1.7687, "step": 148 }, { "epoch": 0.3962765957446808, "grad_norm": 1.9491511698228168, "learning_rate": 8.93491124260355e-07, "loss": 1.5873, "step": 149 }, { "epoch": 0.39893617021276595, "grad_norm": 1.7732893423967535, "learning_rate": 8.920118343195265e-07, "loss": 1.4666, "step": 150 }, { "epoch": 0.4015957446808511, "grad_norm": 1.628295930467344, "learning_rate": 8.905325443786981e-07, "loss": 1.4253, "step": 151 }, { "epoch": 0.40425531914893614, "grad_norm": 1.9501879161375453, "learning_rate": 8.890532544378698e-07, "loss": 1.5748, "step": 152 }, { "epoch": 0.40691489361702127, "grad_norm": 1.6883577356837587, "learning_rate": 8.875739644970413e-07, "loss": 1.3546, "step": 153 }, { "epoch": 0.4095744680851064, "grad_norm": 1.7177779483356421, "learning_rate": 8.86094674556213e-07, "loss": 1.6715, "step": 154 }, { "epoch": 0.4122340425531915, "grad_norm": 1.806803334100437, "learning_rate": 8.846153846153846e-07, "loss": 1.5485, "step": 155 }, { "epoch": 0.4148936170212766, "grad_norm": 1.7522522193654075, "learning_rate": 8.831360946745562e-07, "loss": 1.5091, "step": 156 }, { "epoch": 0.4175531914893617, "grad_norm": 1.8508800423865754, "learning_rate": 8.816568047337278e-07, "loss": 1.7112, "step": 157 }, { "epoch": 0.42021276595744683, "grad_norm": 1.752024433296569, "learning_rate": 8.801775147928994e-07, "loss": 1.3702, "step": 158 }, { "epoch": 0.4228723404255319, "grad_norm": 2.0875697232783246, "learning_rate": 8.786982248520711e-07, "loss": 1.5972, "step": 159 }, { "epoch": 0.425531914893617, "grad_norm": 1.7852623572002673, "learning_rate": 8.772189349112425e-07, "loss": 1.5496, "step": 160 }, { "epoch": 0.42819148936170215, "grad_norm": 2.0049459574733968, "learning_rate": 8.757396449704142e-07, "loss": 1.5256, "step": 161 }, { "epoch": 0.4308510638297872, "grad_norm": 1.8394745863340762, "learning_rate": 8.742603550295857e-07, "loss": 1.5466, "step": 162 }, { "epoch": 0.43351063829787234, "grad_norm": 1.890821588557376, "learning_rate": 8.727810650887574e-07, "loss": 1.4839, "step": 163 }, { "epoch": 0.43617021276595747, "grad_norm": 1.6481011214712673, "learning_rate": 8.713017751479289e-07, "loss": 1.6322, "step": 164 }, { "epoch": 0.43882978723404253, "grad_norm": 1.6910215297075097, "learning_rate": 8.698224852071006e-07, "loss": 1.4294, "step": 165 }, { "epoch": 0.44148936170212766, "grad_norm": 2.2849926490581978, "learning_rate": 8.683431952662722e-07, "loss": 1.5214, "step": 166 }, { "epoch": 0.4441489361702128, "grad_norm": 1.6530282697158378, "learning_rate": 8.668639053254438e-07, "loss": 1.5387, "step": 167 }, { "epoch": 0.44680851063829785, "grad_norm": 1.8612064349812791, "learning_rate": 8.653846153846154e-07, "loss": 1.4698, "step": 168 }, { "epoch": 0.449468085106383, "grad_norm": 1.844773154127249, "learning_rate": 8.639053254437869e-07, "loss": 1.6155, "step": 169 }, { "epoch": 0.4521276595744681, "grad_norm": 1.7920892424117567, "learning_rate": 8.624260355029585e-07, "loss": 1.4435, "step": 170 }, { "epoch": 0.45478723404255317, "grad_norm": 1.7631889349519279, "learning_rate": 8.609467455621301e-07, "loss": 1.5657, "step": 171 }, { "epoch": 0.4574468085106383, "grad_norm": 1.850684750618834, "learning_rate": 8.594674556213017e-07, "loss": 1.4599, "step": 172 }, { "epoch": 0.4601063829787234, "grad_norm": 1.8936077580680233, "learning_rate": 8.579881656804734e-07, "loss": 1.4487, "step": 173 }, { "epoch": 0.4627659574468085, "grad_norm": 1.8302300226282981, "learning_rate": 8.565088757396449e-07, "loss": 1.3982, "step": 174 }, { "epoch": 0.4654255319148936, "grad_norm": 1.77826681795055, "learning_rate": 8.550295857988166e-07, "loss": 1.4513, "step": 175 }, { "epoch": 0.46808510638297873, "grad_norm": 1.687204688334926, "learning_rate": 8.535502958579881e-07, "loss": 1.4119, "step": 176 }, { "epoch": 0.47074468085106386, "grad_norm": 1.8120179028458203, "learning_rate": 8.520710059171598e-07, "loss": 1.6192, "step": 177 }, { "epoch": 0.4734042553191489, "grad_norm": 1.8795981293168291, "learning_rate": 8.505917159763313e-07, "loss": 1.4954, "step": 178 }, { "epoch": 0.47606382978723405, "grad_norm": 1.7065716786077503, "learning_rate": 8.491124260355029e-07, "loss": 1.5966, "step": 179 }, { "epoch": 0.4787234042553192, "grad_norm": 1.7627718668452295, "learning_rate": 8.476331360946745e-07, "loss": 1.4327, "step": 180 }, { "epoch": 0.48138297872340424, "grad_norm": 1.8665938451163775, "learning_rate": 8.461538461538461e-07, "loss": 1.4918, "step": 181 }, { "epoch": 0.48404255319148937, "grad_norm": 1.819110850294668, "learning_rate": 8.446745562130177e-07, "loss": 1.5539, "step": 182 }, { "epoch": 0.4867021276595745, "grad_norm": 1.8453397847354074, "learning_rate": 8.431952662721893e-07, "loss": 1.5331, "step": 183 }, { "epoch": 0.48936170212765956, "grad_norm": 2.622110865899153, "learning_rate": 8.417159763313609e-07, "loss": 1.3705, "step": 184 }, { "epoch": 0.4920212765957447, "grad_norm": 2.0496831369913378, "learning_rate": 8.402366863905325e-07, "loss": 1.434, "step": 185 }, { "epoch": 0.4946808510638298, "grad_norm": 1.7777460669960974, "learning_rate": 8.387573964497041e-07, "loss": 1.507, "step": 186 }, { "epoch": 0.4973404255319149, "grad_norm": 1.7648525067264564, "learning_rate": 8.372781065088757e-07, "loss": 1.5419, "step": 187 }, { "epoch": 0.5, "grad_norm": 1.7346580205717035, "learning_rate": 8.357988165680473e-07, "loss": 1.4474, "step": 188 }, { "epoch": 0.5026595744680851, "grad_norm": 1.8941186829293386, "learning_rate": 8.343195266272189e-07, "loss": 1.4239, "step": 189 }, { "epoch": 0.5053191489361702, "grad_norm": 1.793062893259623, "learning_rate": 8.328402366863904e-07, "loss": 1.5339, "step": 190 }, { "epoch": 0.5079787234042553, "grad_norm": 1.867427730668421, "learning_rate": 8.313609467455621e-07, "loss": 1.3395, "step": 191 }, { "epoch": 0.5106382978723404, "grad_norm": 1.836942681632619, "learning_rate": 8.298816568047336e-07, "loss": 1.6135, "step": 192 }, { "epoch": 0.5132978723404256, "grad_norm": 1.6942657192312134, "learning_rate": 8.284023668639053e-07, "loss": 1.5308, "step": 193 }, { "epoch": 0.5159574468085106, "grad_norm": 1.841411465300408, "learning_rate": 8.269230769230768e-07, "loss": 1.5345, "step": 194 }, { "epoch": 0.5186170212765957, "grad_norm": 1.8794098811564628, "learning_rate": 8.254437869822485e-07, "loss": 1.6901, "step": 195 }, { "epoch": 0.5212765957446809, "grad_norm": 1.7012388651957833, "learning_rate": 8.239644970414202e-07, "loss": 1.474, "step": 196 }, { "epoch": 0.523936170212766, "grad_norm": 1.7944418314011599, "learning_rate": 8.224852071005917e-07, "loss": 1.32, "step": 197 }, { "epoch": 0.526595744680851, "grad_norm": 1.7425046897179257, "learning_rate": 8.210059171597633e-07, "loss": 1.368, "step": 198 }, { "epoch": 0.5292553191489362, "grad_norm": 1.7880936710475852, "learning_rate": 8.195266272189348e-07, "loss": 1.645, "step": 199 }, { "epoch": 0.5319148936170213, "grad_norm": 1.6715457627732746, "learning_rate": 8.180473372781065e-07, "loss": 1.4261, "step": 200 }, { "epoch": 0.5345744680851063, "grad_norm": 1.822736509594185, "learning_rate": 8.16568047337278e-07, "loss": 1.624, "step": 201 }, { "epoch": 0.5372340425531915, "grad_norm": 1.6809290356200428, "learning_rate": 8.150887573964497e-07, "loss": 1.3937, "step": 202 }, { "epoch": 0.5398936170212766, "grad_norm": 1.7496410410443377, "learning_rate": 8.136094674556213e-07, "loss": 1.5429, "step": 203 }, { "epoch": 0.5425531914893617, "grad_norm": 1.9199038686131074, "learning_rate": 8.121301775147929e-07, "loss": 1.5023, "step": 204 }, { "epoch": 0.5452127659574468, "grad_norm": 1.6850758554154257, "learning_rate": 8.106508875739645e-07, "loss": 1.4991, "step": 205 }, { "epoch": 0.5478723404255319, "grad_norm": 1.9329660957508767, "learning_rate": 8.091715976331361e-07, "loss": 1.5398, "step": 206 }, { "epoch": 0.550531914893617, "grad_norm": 1.666476222919606, "learning_rate": 8.076923076923077e-07, "loss": 1.5637, "step": 207 }, { "epoch": 0.5531914893617021, "grad_norm": 2.224757697809998, "learning_rate": 8.062130177514792e-07, "loss": 1.6452, "step": 208 }, { "epoch": 0.5558510638297872, "grad_norm": 1.861249667872802, "learning_rate": 8.047337278106508e-07, "loss": 1.541, "step": 209 }, { "epoch": 0.5585106382978723, "grad_norm": 1.7749036870266581, "learning_rate": 8.032544378698225e-07, "loss": 1.4811, "step": 210 }, { "epoch": 0.5611702127659575, "grad_norm": 1.9725453562773687, "learning_rate": 8.01775147928994e-07, "loss": 1.4289, "step": 211 }, { "epoch": 0.5638297872340425, "grad_norm": 1.5771745142242444, "learning_rate": 8.002958579881657e-07, "loss": 1.1851, "step": 212 }, { "epoch": 0.5664893617021277, "grad_norm": 1.7671763938332208, "learning_rate": 7.988165680473372e-07, "loss": 1.4632, "step": 213 }, { "epoch": 0.5691489361702128, "grad_norm": 2.3539254619170147, "learning_rate": 7.973372781065089e-07, "loss": 1.4399, "step": 214 }, { "epoch": 0.5718085106382979, "grad_norm": 1.6811659582751803, "learning_rate": 7.958579881656804e-07, "loss": 1.3874, "step": 215 }, { "epoch": 0.574468085106383, "grad_norm": 1.816100982937805, "learning_rate": 7.943786982248521e-07, "loss": 1.3507, "step": 216 }, { "epoch": 0.5771276595744681, "grad_norm": 1.6054386033989114, "learning_rate": 7.928994082840237e-07, "loss": 1.3523, "step": 217 }, { "epoch": 0.5797872340425532, "grad_norm": 1.747109205347203, "learning_rate": 7.914201183431952e-07, "loss": 1.4471, "step": 218 }, { "epoch": 0.5824468085106383, "grad_norm": 2.544095072667201, "learning_rate": 7.899408284023668e-07, "loss": 1.4659, "step": 219 }, { "epoch": 0.5851063829787234, "grad_norm": 1.9052355208698295, "learning_rate": 7.884615384615384e-07, "loss": 1.6808, "step": 220 }, { "epoch": 0.5877659574468085, "grad_norm": 1.7475037482225553, "learning_rate": 7.8698224852071e-07, "loss": 1.4223, "step": 221 }, { "epoch": 0.5904255319148937, "grad_norm": 1.7030078252678653, "learning_rate": 7.855029585798816e-07, "loss": 1.558, "step": 222 }, { "epoch": 0.5930851063829787, "grad_norm": 1.6935707673119045, "learning_rate": 7.840236686390532e-07, "loss": 1.3466, "step": 223 }, { "epoch": 0.5957446808510638, "grad_norm": 1.9730090137474936, "learning_rate": 7.825443786982249e-07, "loss": 1.6373, "step": 224 }, { "epoch": 0.598404255319149, "grad_norm": 1.7329005942211182, "learning_rate": 7.810650887573964e-07, "loss": 1.3348, "step": 225 }, { "epoch": 0.601063829787234, "grad_norm": 2.033731441401403, "learning_rate": 7.795857988165681e-07, "loss": 1.5524, "step": 226 }, { "epoch": 0.6037234042553191, "grad_norm": 1.7207604873916247, "learning_rate": 7.781065088757395e-07, "loss": 1.3862, "step": 227 }, { "epoch": 0.6063829787234043, "grad_norm": 1.8075753757910789, "learning_rate": 7.766272189349112e-07, "loss": 1.5981, "step": 228 }, { "epoch": 0.6090425531914894, "grad_norm": 2.291978352476086, "learning_rate": 7.751479289940827e-07, "loss": 1.4514, "step": 229 }, { "epoch": 0.6117021276595744, "grad_norm": 1.8644568615293915, "learning_rate": 7.736686390532544e-07, "loss": 1.6587, "step": 230 }, { "epoch": 0.6143617021276596, "grad_norm": 2.594171053250292, "learning_rate": 7.721893491124259e-07, "loss": 1.6336, "step": 231 }, { "epoch": 0.6170212765957447, "grad_norm": 1.5011539788709316, "learning_rate": 7.707100591715976e-07, "loss": 1.2387, "step": 232 }, { "epoch": 0.6196808510638298, "grad_norm": 1.6819405282763624, "learning_rate": 7.692307692307693e-07, "loss": 1.5038, "step": 233 }, { "epoch": 0.6223404255319149, "grad_norm": 1.7251235005494032, "learning_rate": 7.677514792899408e-07, "loss": 1.5774, "step": 234 }, { "epoch": 0.625, "grad_norm": 1.864499827243002, "learning_rate": 7.662721893491125e-07, "loss": 1.5276, "step": 235 }, { "epoch": 0.6276595744680851, "grad_norm": 1.7781078666304035, "learning_rate": 7.64792899408284e-07, "loss": 1.5232, "step": 236 }, { "epoch": 0.6303191489361702, "grad_norm": 1.6599021088795032, "learning_rate": 7.633136094674556e-07, "loss": 1.4473, "step": 237 }, { "epoch": 0.6329787234042553, "grad_norm": 1.6721336663765791, "learning_rate": 7.618343195266271e-07, "loss": 1.3851, "step": 238 }, { "epoch": 0.6356382978723404, "grad_norm": 1.797473310291003, "learning_rate": 7.603550295857988e-07, "loss": 1.4871, "step": 239 }, { "epoch": 0.6382978723404256, "grad_norm": 1.68684289642348, "learning_rate": 7.588757396449704e-07, "loss": 1.3971, "step": 240 }, { "epoch": 0.6409574468085106, "grad_norm": 1.6548030218587813, "learning_rate": 7.57396449704142e-07, "loss": 1.4413, "step": 241 }, { "epoch": 0.6436170212765957, "grad_norm": 1.7764920048747164, "learning_rate": 7.559171597633136e-07, "loss": 1.5327, "step": 242 }, { "epoch": 0.6462765957446809, "grad_norm": 2.3776019048662627, "learning_rate": 7.544378698224852e-07, "loss": 1.3973, "step": 243 }, { "epoch": 0.648936170212766, "grad_norm": 2.180898241246454, "learning_rate": 7.529585798816568e-07, "loss": 1.4108, "step": 244 }, { "epoch": 0.651595744680851, "grad_norm": 1.7308120559219609, "learning_rate": 7.514792899408284e-07, "loss": 1.437, "step": 245 }, { "epoch": 0.6542553191489362, "grad_norm": 1.6797613083347633, "learning_rate": 7.5e-07, "loss": 1.4266, "step": 246 }, { "epoch": 0.6569148936170213, "grad_norm": 1.7244677372074293, "learning_rate": 7.485207100591716e-07, "loss": 1.4562, "step": 247 }, { "epoch": 0.6595744680851063, "grad_norm": 1.831008658275623, "learning_rate": 7.470414201183431e-07, "loss": 1.625, "step": 248 }, { "epoch": 0.6622340425531915, "grad_norm": 1.5987807515924746, "learning_rate": 7.455621301775148e-07, "loss": 1.351, "step": 249 }, { "epoch": 0.6648936170212766, "grad_norm": 1.657627324756177, "learning_rate": 7.440828402366863e-07, "loss": 1.3021, "step": 250 }, { "epoch": 0.6675531914893617, "grad_norm": 1.6806656229564951, "learning_rate": 7.42603550295858e-07, "loss": 1.4708, "step": 251 }, { "epoch": 0.6702127659574468, "grad_norm": 1.6469208307421896, "learning_rate": 7.411242603550295e-07, "loss": 1.4309, "step": 252 }, { "epoch": 0.6728723404255319, "grad_norm": 1.6396856616158755, "learning_rate": 7.396449704142012e-07, "loss": 1.501, "step": 253 }, { "epoch": 0.675531914893617, "grad_norm": 1.6377964159170837, "learning_rate": 7.381656804733728e-07, "loss": 1.5208, "step": 254 }, { "epoch": 0.6781914893617021, "grad_norm": 1.6580558864253538, "learning_rate": 7.366863905325444e-07, "loss": 1.4638, "step": 255 }, { "epoch": 0.6808510638297872, "grad_norm": 1.837851772242258, "learning_rate": 7.352071005917159e-07, "loss": 1.3164, "step": 256 }, { "epoch": 0.6835106382978723, "grad_norm": 2.224825104258165, "learning_rate": 7.337278106508875e-07, "loss": 1.6295, "step": 257 }, { "epoch": 0.6861702127659575, "grad_norm": 1.6131790535172048, "learning_rate": 7.322485207100591e-07, "loss": 1.4414, "step": 258 }, { "epoch": 0.6888297872340425, "grad_norm": 1.549489595607848, "learning_rate": 7.307692307692307e-07, "loss": 1.4455, "step": 259 }, { "epoch": 0.6914893617021277, "grad_norm": 1.761687284810298, "learning_rate": 7.292899408284023e-07, "loss": 1.4913, "step": 260 }, { "epoch": 0.6941489361702128, "grad_norm": 1.6593936380320258, "learning_rate": 7.27810650887574e-07, "loss": 1.6427, "step": 261 }, { "epoch": 0.6968085106382979, "grad_norm": 1.7879593292364175, "learning_rate": 7.263313609467455e-07, "loss": 1.6127, "step": 262 }, { "epoch": 0.699468085106383, "grad_norm": 1.559119726167982, "learning_rate": 7.248520710059172e-07, "loss": 1.3617, "step": 263 }, { "epoch": 0.7021276595744681, "grad_norm": 1.5376887507996986, "learning_rate": 7.233727810650887e-07, "loss": 1.3915, "step": 264 }, { "epoch": 0.7047872340425532, "grad_norm": 1.892877482230423, "learning_rate": 7.218934911242604e-07, "loss": 1.3938, "step": 265 }, { "epoch": 0.7074468085106383, "grad_norm": 2.1615047832844647, "learning_rate": 7.204142011834318e-07, "loss": 1.433, "step": 266 }, { "epoch": 0.7101063829787234, "grad_norm": 1.5754637988987956, "learning_rate": 7.189349112426035e-07, "loss": 1.3913, "step": 267 }, { "epoch": 0.7127659574468085, "grad_norm": 1.4917666655680848, "learning_rate": 7.17455621301775e-07, "loss": 1.4024, "step": 268 }, { "epoch": 0.7154255319148937, "grad_norm": 1.7371252437936426, "learning_rate": 7.159763313609467e-07, "loss": 1.5104, "step": 269 }, { "epoch": 0.7180851063829787, "grad_norm": 1.479255763133087, "learning_rate": 7.144970414201183e-07, "loss": 1.3533, "step": 270 }, { "epoch": 0.7207446808510638, "grad_norm": 1.6094715867178733, "learning_rate": 7.130177514792899e-07, "loss": 1.3532, "step": 271 }, { "epoch": 0.723404255319149, "grad_norm": 1.565198399335246, "learning_rate": 7.115384615384616e-07, "loss": 1.3988, "step": 272 }, { "epoch": 0.726063829787234, "grad_norm": 1.5067122007483011, "learning_rate": 7.100591715976331e-07, "loss": 1.3825, "step": 273 }, { "epoch": 0.7287234042553191, "grad_norm": 1.7140633929936213, "learning_rate": 7.085798816568048e-07, "loss": 1.4082, "step": 274 }, { "epoch": 0.7313829787234043, "grad_norm": 1.540948863934289, "learning_rate": 7.071005917159762e-07, "loss": 1.5153, "step": 275 }, { "epoch": 0.7340425531914894, "grad_norm": 1.7664241501358, "learning_rate": 7.056213017751479e-07, "loss": 1.2721, "step": 276 }, { "epoch": 0.7367021276595744, "grad_norm": 1.5709026992552224, "learning_rate": 7.041420118343195e-07, "loss": 1.3492, "step": 277 }, { "epoch": 0.7393617021276596, "grad_norm": 1.5068566647857482, "learning_rate": 7.026627218934911e-07, "loss": 1.362, "step": 278 }, { "epoch": 0.7420212765957447, "grad_norm": 1.9554416192824882, "learning_rate": 7.011834319526627e-07, "loss": 1.6618, "step": 279 }, { "epoch": 0.7446808510638298, "grad_norm": 1.6405976792740071, "learning_rate": 6.997041420118343e-07, "loss": 1.5917, "step": 280 }, { "epoch": 0.7473404255319149, "grad_norm": 1.7066156854813295, "learning_rate": 6.982248520710059e-07, "loss": 1.2984, "step": 281 }, { "epoch": 0.75, "grad_norm": 1.616654607721298, "learning_rate": 6.967455621301775e-07, "loss": 1.4085, "step": 282 }, { "epoch": 0.7526595744680851, "grad_norm": 1.6119917549130687, "learning_rate": 6.952662721893491e-07, "loss": 1.4059, "step": 283 }, { "epoch": 0.7553191489361702, "grad_norm": 1.4894224582399371, "learning_rate": 6.937869822485208e-07, "loss": 1.4205, "step": 284 }, { "epoch": 0.7579787234042553, "grad_norm": 1.7561130701083838, "learning_rate": 6.923076923076922e-07, "loss": 1.5931, "step": 285 }, { "epoch": 0.7606382978723404, "grad_norm": 1.5507572662266917, "learning_rate": 6.908284023668639e-07, "loss": 1.3968, "step": 286 }, { "epoch": 0.7632978723404256, "grad_norm": 1.4671913155048064, "learning_rate": 6.893491124260354e-07, "loss": 1.2951, "step": 287 }, { "epoch": 0.7659574468085106, "grad_norm": 1.5498863732312698, "learning_rate": 6.878698224852071e-07, "loss": 1.2232, "step": 288 }, { "epoch": 0.7686170212765957, "grad_norm": 1.680206076834721, "learning_rate": 6.863905325443786e-07, "loss": 1.4992, "step": 289 }, { "epoch": 0.7712765957446809, "grad_norm": 1.718088751084764, "learning_rate": 6.849112426035503e-07, "loss": 1.4422, "step": 290 }, { "epoch": 0.773936170212766, "grad_norm": 1.5282347438855142, "learning_rate": 6.834319526627219e-07, "loss": 1.4063, "step": 291 }, { "epoch": 0.776595744680851, "grad_norm": 1.9525999050003993, "learning_rate": 6.819526627218935e-07, "loss": 1.5957, "step": 292 }, { "epoch": 0.7792553191489362, "grad_norm": 2.258813866966866, "learning_rate": 6.804733727810651e-07, "loss": 1.4431, "step": 293 }, { "epoch": 0.7819148936170213, "grad_norm": 1.5364750834268603, "learning_rate": 6.789940828402367e-07, "loss": 1.3558, "step": 294 }, { "epoch": 0.7845744680851063, "grad_norm": 1.5393402313754123, "learning_rate": 6.775147928994082e-07, "loss": 1.439, "step": 295 }, { "epoch": 0.7872340425531915, "grad_norm": 1.6455162885770198, "learning_rate": 6.760355029585798e-07, "loss": 1.5158, "step": 296 }, { "epoch": 0.7898936170212766, "grad_norm": 1.6475778661453933, "learning_rate": 6.745562130177514e-07, "loss": 1.4278, "step": 297 }, { "epoch": 0.7925531914893617, "grad_norm": 1.502594611161215, "learning_rate": 6.730769230769231e-07, "loss": 1.3064, "step": 298 }, { "epoch": 0.7952127659574468, "grad_norm": 1.4819306978451936, "learning_rate": 6.715976331360946e-07, "loss": 1.4, "step": 299 }, { "epoch": 0.7978723404255319, "grad_norm": 1.6911681538448085, "learning_rate": 6.701183431952663e-07, "loss": 1.3364, "step": 300 }, { "epoch": 0.800531914893617, "grad_norm": 1.4712764033020207, "learning_rate": 6.686390532544378e-07, "loss": 1.3514, "step": 301 }, { "epoch": 0.8031914893617021, "grad_norm": 1.5453820007555663, "learning_rate": 6.671597633136095e-07, "loss": 1.252, "step": 302 }, { "epoch": 0.8058510638297872, "grad_norm": 1.6870546106387143, "learning_rate": 6.65680473372781e-07, "loss": 1.4819, "step": 303 }, { "epoch": 0.8085106382978723, "grad_norm": 1.539899104888, "learning_rate": 6.642011834319526e-07, "loss": 1.4248, "step": 304 }, { "epoch": 0.8111702127659575, "grad_norm": 1.8570540873303243, "learning_rate": 6.627218934911242e-07, "loss": 1.398, "step": 305 }, { "epoch": 0.8138297872340425, "grad_norm": 1.6462980732890118, "learning_rate": 6.612426035502958e-07, "loss": 1.472, "step": 306 }, { "epoch": 0.8164893617021277, "grad_norm": 5.047207753458083, "learning_rate": 6.597633136094674e-07, "loss": 1.4934, "step": 307 }, { "epoch": 0.8191489361702128, "grad_norm": 1.6578320558708661, "learning_rate": 6.58284023668639e-07, "loss": 1.4467, "step": 308 }, { "epoch": 0.8218085106382979, "grad_norm": 1.650877101009254, "learning_rate": 6.568047337278106e-07, "loss": 1.3491, "step": 309 }, { "epoch": 0.824468085106383, "grad_norm": 1.7139451577038085, "learning_rate": 6.553254437869822e-07, "loss": 1.4975, "step": 310 }, { "epoch": 0.8271276595744681, "grad_norm": 1.6275656326818695, "learning_rate": 6.538461538461538e-07, "loss": 1.4493, "step": 311 }, { "epoch": 0.8297872340425532, "grad_norm": 1.693438289435893, "learning_rate": 6.523668639053254e-07, "loss": 1.3593, "step": 312 }, { "epoch": 0.8324468085106383, "grad_norm": 1.5252049292780119, "learning_rate": 6.50887573964497e-07, "loss": 1.4798, "step": 313 }, { "epoch": 0.8351063829787234, "grad_norm": 1.7006952995622482, "learning_rate": 6.494082840236686e-07, "loss": 1.5054, "step": 314 }, { "epoch": 0.8377659574468085, "grad_norm": 1.7203889834996966, "learning_rate": 6.479289940828401e-07, "loss": 1.599, "step": 315 }, { "epoch": 0.8404255319148937, "grad_norm": 1.665289055188048, "learning_rate": 6.464497041420118e-07, "loss": 1.47, "step": 316 }, { "epoch": 0.8430851063829787, "grad_norm": 1.783255201333473, "learning_rate": 6.449704142011834e-07, "loss": 1.3293, "step": 317 }, { "epoch": 0.8457446808510638, "grad_norm": 1.5474686687545494, "learning_rate": 6.43491124260355e-07, "loss": 1.5827, "step": 318 }, { "epoch": 0.848404255319149, "grad_norm": 1.7096057045749924, "learning_rate": 6.420118343195266e-07, "loss": 1.4208, "step": 319 }, { "epoch": 0.851063829787234, "grad_norm": 1.660091264238197, "learning_rate": 6.405325443786982e-07, "loss": 1.3729, "step": 320 }, { "epoch": 0.8537234042553191, "grad_norm": 2.1515758550003663, "learning_rate": 6.390532544378699e-07, "loss": 1.6061, "step": 321 }, { "epoch": 0.8563829787234043, "grad_norm": 1.6705826372283528, "learning_rate": 6.375739644970414e-07, "loss": 1.3534, "step": 322 }, { "epoch": 0.8590425531914894, "grad_norm": 1.6232024300738965, "learning_rate": 6.360946745562131e-07, "loss": 1.425, "step": 323 }, { "epoch": 0.8617021276595744, "grad_norm": 1.7044169574045285, "learning_rate": 6.346153846153845e-07, "loss": 1.5695, "step": 324 }, { "epoch": 0.8643617021276596, "grad_norm": 1.7606258681853417, "learning_rate": 6.331360946745562e-07, "loss": 1.418, "step": 325 }, { "epoch": 0.8670212765957447, "grad_norm": 1.5280589114761016, "learning_rate": 6.316568047337277e-07, "loss": 1.4349, "step": 326 }, { "epoch": 0.8696808510638298, "grad_norm": 1.7139434884413298, "learning_rate": 6.301775147928994e-07, "loss": 1.4371, "step": 327 }, { "epoch": 0.8723404255319149, "grad_norm": 1.5926203744807812, "learning_rate": 6.28698224852071e-07, "loss": 1.4015, "step": 328 }, { "epoch": 0.875, "grad_norm": 1.6264161761425606, "learning_rate": 6.272189349112426e-07, "loss": 1.4729, "step": 329 }, { "epoch": 0.8776595744680851, "grad_norm": 1.5831228752137032, "learning_rate": 6.257396449704142e-07, "loss": 1.421, "step": 330 }, { "epoch": 0.8803191489361702, "grad_norm": 1.5811866295220025, "learning_rate": 6.242603550295858e-07, "loss": 1.3628, "step": 331 }, { "epoch": 0.8829787234042553, "grad_norm": 1.5679708453260865, "learning_rate": 6.227810650887574e-07, "loss": 1.2859, "step": 332 }, { "epoch": 0.8856382978723404, "grad_norm": 1.6766225130373726, "learning_rate": 6.213017751479289e-07, "loss": 1.4369, "step": 333 }, { "epoch": 0.8882978723404256, "grad_norm": 1.8047128650814857, "learning_rate": 6.198224852071005e-07, "loss": 1.5913, "step": 334 }, { "epoch": 0.8909574468085106, "grad_norm": 1.6456822515106042, "learning_rate": 6.183431952662722e-07, "loss": 1.4972, "step": 335 }, { "epoch": 0.8936170212765957, "grad_norm": 1.552523155961138, "learning_rate": 6.168639053254437e-07, "loss": 1.3171, "step": 336 }, { "epoch": 0.8962765957446809, "grad_norm": 1.788183804411441, "learning_rate": 6.153846153846154e-07, "loss": 1.5059, "step": 337 }, { "epoch": 0.898936170212766, "grad_norm": 1.5907686060024624, "learning_rate": 6.139053254437869e-07, "loss": 1.1485, "step": 338 }, { "epoch": 0.901595744680851, "grad_norm": 1.7254040314022046, "learning_rate": 6.124260355029586e-07, "loss": 1.5628, "step": 339 }, { "epoch": 0.9042553191489362, "grad_norm": 1.6347353623664331, "learning_rate": 6.109467455621301e-07, "loss": 1.3704, "step": 340 }, { "epoch": 0.9069148936170213, "grad_norm": 2.194464251540189, "learning_rate": 6.094674556213018e-07, "loss": 1.4758, "step": 341 }, { "epoch": 0.9095744680851063, "grad_norm": 1.5698776022464798, "learning_rate": 6.079881656804734e-07, "loss": 1.3871, "step": 342 }, { "epoch": 0.9122340425531915, "grad_norm": 1.8859732282362605, "learning_rate": 6.065088757396449e-07, "loss": 1.4136, "step": 343 }, { "epoch": 0.9148936170212766, "grad_norm": 1.7373147056080605, "learning_rate": 6.050295857988165e-07, "loss": 1.5494, "step": 344 }, { "epoch": 0.9175531914893617, "grad_norm": 1.6179407549268443, "learning_rate": 6.035502958579881e-07, "loss": 1.3776, "step": 345 }, { "epoch": 0.9202127659574468, "grad_norm": 1.77670135626407, "learning_rate": 6.020710059171597e-07, "loss": 1.3275, "step": 346 }, { "epoch": 0.9228723404255319, "grad_norm": 1.7482955670467306, "learning_rate": 6.005917159763313e-07, "loss": 1.4015, "step": 347 }, { "epoch": 0.925531914893617, "grad_norm": 1.6887523807534266, "learning_rate": 5.991124260355029e-07, "loss": 1.5069, "step": 348 }, { "epoch": 0.9281914893617021, "grad_norm": 1.514381055516736, "learning_rate": 5.976331360946746e-07, "loss": 1.3818, "step": 349 }, { "epoch": 0.9308510638297872, "grad_norm": 1.4907168186147164, "learning_rate": 5.961538461538461e-07, "loss": 1.4495, "step": 350 }, { "epoch": 0.9335106382978723, "grad_norm": 2.265910373999388, "learning_rate": 5.946745562130178e-07, "loss": 1.2853, "step": 351 }, { "epoch": 0.9361702127659575, "grad_norm": 1.7992082788491501, "learning_rate": 5.931952662721894e-07, "loss": 1.539, "step": 352 }, { "epoch": 0.9388297872340425, "grad_norm": 1.443028062263383, "learning_rate": 5.917159763313609e-07, "loss": 1.324, "step": 353 }, { "epoch": 0.9414893617021277, "grad_norm": 1.6139434859203183, "learning_rate": 5.902366863905324e-07, "loss": 1.3336, "step": 354 }, { "epoch": 0.9441489361702128, "grad_norm": 2.252829785523421, "learning_rate": 5.887573964497041e-07, "loss": 1.2986, "step": 355 }, { "epoch": 0.9468085106382979, "grad_norm": 1.7284412087838827, "learning_rate": 5.872781065088757e-07, "loss": 1.4817, "step": 356 }, { "epoch": 0.949468085106383, "grad_norm": 1.7787571244355151, "learning_rate": 5.857988165680473e-07, "loss": 1.5187, "step": 357 }, { "epoch": 0.9521276595744681, "grad_norm": 2.181835688354598, "learning_rate": 5.84319526627219e-07, "loss": 1.5578, "step": 358 }, { "epoch": 0.9547872340425532, "grad_norm": 1.4634212657053263, "learning_rate": 5.828402366863905e-07, "loss": 1.2286, "step": 359 }, { "epoch": 0.9574468085106383, "grad_norm": 1.687131629579792, "learning_rate": 5.813609467455622e-07, "loss": 1.3256, "step": 360 }, { "epoch": 0.9601063829787234, "grad_norm": 1.629444719409858, "learning_rate": 5.798816568047337e-07, "loss": 1.5522, "step": 361 }, { "epoch": 0.9627659574468085, "grad_norm": 1.6487449612370586, "learning_rate": 5.784023668639053e-07, "loss": 1.5252, "step": 362 }, { "epoch": 0.9654255319148937, "grad_norm": 1.5119623190054727, "learning_rate": 5.769230769230768e-07, "loss": 1.4479, "step": 363 }, { "epoch": 0.9680851063829787, "grad_norm": 1.529900871256959, "learning_rate": 5.754437869822485e-07, "loss": 1.4081, "step": 364 }, { "epoch": 0.9707446808510638, "grad_norm": 1.679158185017686, "learning_rate": 5.739644970414201e-07, "loss": 1.3219, "step": 365 }, { "epoch": 0.973404255319149, "grad_norm": 1.5743852626682602, "learning_rate": 5.724852071005917e-07, "loss": 1.4408, "step": 366 }, { "epoch": 0.976063829787234, "grad_norm": 1.4327135424204693, "learning_rate": 5.710059171597633e-07, "loss": 1.4267, "step": 367 }, { "epoch": 0.9787234042553191, "grad_norm": 1.693248001536766, "learning_rate": 5.695266272189349e-07, "loss": 1.459, "step": 368 }, { "epoch": 0.9813829787234043, "grad_norm": 1.6118417002332202, "learning_rate": 5.680473372781065e-07, "loss": 1.3239, "step": 369 }, { "epoch": 0.9840425531914894, "grad_norm": 1.5994817848229685, "learning_rate": 5.665680473372781e-07, "loss": 1.3316, "step": 370 }, { "epoch": 0.9867021276595744, "grad_norm": 1.734698428678095, "learning_rate": 5.650887573964497e-07, "loss": 1.4394, "step": 371 }, { "epoch": 0.9893617021276596, "grad_norm": 2.8750724783344626, "learning_rate": 5.636094674556213e-07, "loss": 1.3439, "step": 372 }, { "epoch": 0.9920212765957447, "grad_norm": 1.5483975094463054, "learning_rate": 5.621301775147928e-07, "loss": 1.3684, "step": 373 }, { "epoch": 0.9946808510638298, "grad_norm": 1.5202730618700395, "learning_rate": 5.606508875739645e-07, "loss": 1.3361, "step": 374 }, { "epoch": 0.9973404255319149, "grad_norm": 1.6144865765856777, "learning_rate": 5.59171597633136e-07, "loss": 1.3195, "step": 375 }, { "epoch": 1.0, "grad_norm": 1.6792801473937533, "learning_rate": 5.576923076923077e-07, "loss": 1.4384, "step": 376 }, { "epoch": 1.002659574468085, "grad_norm": 1.9175762077814629, "learning_rate": 5.562130177514792e-07, "loss": 1.4758, "step": 377 }, { "epoch": 1.0053191489361701, "grad_norm": 1.8048610851481421, "learning_rate": 5.547337278106509e-07, "loss": 1.4803, "step": 378 }, { "epoch": 1.0079787234042554, "grad_norm": 1.606071563190404, "learning_rate": 5.532544378698225e-07, "loss": 1.485, "step": 379 }, { "epoch": 1.0106382978723405, "grad_norm": 1.5572569044777356, "learning_rate": 5.517751479289941e-07, "loss": 1.2355, "step": 380 }, { "epoch": 1.0132978723404256, "grad_norm": 1.5959684601920348, "learning_rate": 5.502958579881657e-07, "loss": 1.2246, "step": 381 }, { "epoch": 1.0159574468085106, "grad_norm": 1.9674075560318893, "learning_rate": 5.488165680473372e-07, "loss": 1.5334, "step": 382 }, { "epoch": 1.0186170212765957, "grad_norm": 1.6680206362227628, "learning_rate": 5.473372781065088e-07, "loss": 1.4226, "step": 383 }, { "epoch": 1.0212765957446808, "grad_norm": 1.5700791218738284, "learning_rate": 5.458579881656804e-07, "loss": 1.3727, "step": 384 }, { "epoch": 1.023936170212766, "grad_norm": 1.5969942768737249, "learning_rate": 5.44378698224852e-07, "loss": 1.4911, "step": 385 }, { "epoch": 1.0265957446808511, "grad_norm": 1.5398360114287806, "learning_rate": 5.428994082840237e-07, "loss": 1.3769, "step": 386 }, { "epoch": 1.0292553191489362, "grad_norm": 1.5805625597294484, "learning_rate": 5.414201183431952e-07, "loss": 1.4166, "step": 387 }, { "epoch": 1.0319148936170213, "grad_norm": 1.5312252431931253, "learning_rate": 5.399408284023669e-07, "loss": 1.2332, "step": 388 }, { "epoch": 1.0345744680851063, "grad_norm": 2.185966499141712, "learning_rate": 5.384615384615384e-07, "loss": 1.3489, "step": 389 }, { "epoch": 1.0372340425531914, "grad_norm": 1.5033859343676257, "learning_rate": 5.369822485207101e-07, "loss": 1.4487, "step": 390 }, { "epoch": 1.0398936170212767, "grad_norm": 1.6054054860368354, "learning_rate": 5.355029585798815e-07, "loss": 1.4788, "step": 391 }, { "epoch": 1.0425531914893618, "grad_norm": 1.6494604615754016, "learning_rate": 5.340236686390532e-07, "loss": 1.479, "step": 392 }, { "epoch": 1.0452127659574468, "grad_norm": 1.7222866777780232, "learning_rate": 5.325443786982249e-07, "loss": 1.3891, "step": 393 }, { "epoch": 1.047872340425532, "grad_norm": 1.7350078493539867, "learning_rate": 5.310650887573964e-07, "loss": 1.5214, "step": 394 }, { "epoch": 1.050531914893617, "grad_norm": 1.677699700420203, "learning_rate": 5.295857988165681e-07, "loss": 1.4027, "step": 395 }, { "epoch": 1.053191489361702, "grad_norm": 1.7218061845324277, "learning_rate": 5.281065088757396e-07, "loss": 1.5612, "step": 396 }, { "epoch": 1.0558510638297873, "grad_norm": 2.0460338465780015, "learning_rate": 5.266272189349113e-07, "loss": 1.7095, "step": 397 }, { "epoch": 1.0585106382978724, "grad_norm": 1.8707733198479073, "learning_rate": 5.251479289940828e-07, "loss": 1.3582, "step": 398 }, { "epoch": 1.0611702127659575, "grad_norm": 1.6674094055135629, "learning_rate": 5.236686390532545e-07, "loss": 1.4667, "step": 399 }, { "epoch": 1.0638297872340425, "grad_norm": 1.9223542274996348, "learning_rate": 5.22189349112426e-07, "loss": 1.3237, "step": 400 }, { "epoch": 1.0664893617021276, "grad_norm": 1.442702870639783, "learning_rate": 5.207100591715976e-07, "loss": 1.3436, "step": 401 }, { "epoch": 1.0691489361702127, "grad_norm": 1.459623592531859, "learning_rate": 5.192307692307692e-07, "loss": 1.3075, "step": 402 }, { "epoch": 1.071808510638298, "grad_norm": 1.7736298040913328, "learning_rate": 5.177514792899408e-07, "loss": 1.55, "step": 403 }, { "epoch": 1.074468085106383, "grad_norm": 1.492584255658168, "learning_rate": 5.162721893491124e-07, "loss": 1.3287, "step": 404 }, { "epoch": 1.077127659574468, "grad_norm": 1.5311371897968131, "learning_rate": 5.14792899408284e-07, "loss": 1.2852, "step": 405 }, { "epoch": 1.0797872340425532, "grad_norm": 1.7056998990486645, "learning_rate": 5.133136094674556e-07, "loss": 1.3844, "step": 406 }, { "epoch": 1.0824468085106382, "grad_norm": 1.5754295217572547, "learning_rate": 5.118343195266272e-07, "loss": 1.4362, "step": 407 }, { "epoch": 1.0851063829787233, "grad_norm": 1.7118767376849466, "learning_rate": 5.103550295857988e-07, "loss": 1.4678, "step": 408 }, { "epoch": 1.0877659574468086, "grad_norm": 1.7720130880057632, "learning_rate": 5.088757396449705e-07, "loss": 1.407, "step": 409 }, { "epoch": 1.0904255319148937, "grad_norm": 1.6779654968724649, "learning_rate": 5.07396449704142e-07, "loss": 1.4306, "step": 410 }, { "epoch": 1.0930851063829787, "grad_norm": 1.6236129122592553, "learning_rate": 5.059171597633136e-07, "loss": 1.3498, "step": 411 }, { "epoch": 1.0957446808510638, "grad_norm": 1.6329048532167492, "learning_rate": 5.044378698224851e-07, "loss": 1.4461, "step": 412 }, { "epoch": 1.0984042553191489, "grad_norm": 1.6207024159387382, "learning_rate": 5.029585798816568e-07, "loss": 1.3772, "step": 413 }, { "epoch": 1.101063829787234, "grad_norm": 1.5324741841766363, "learning_rate": 5.014792899408283e-07, "loss": 1.1312, "step": 414 }, { "epoch": 1.1037234042553192, "grad_norm": 1.7401441557132455, "learning_rate": 5e-07, "loss": 1.1982, "step": 415 }, { "epoch": 1.1063829787234043, "grad_norm": 1.7504453773507886, "learning_rate": 4.985207100591715e-07, "loss": 1.4541, "step": 416 }, { "epoch": 1.1090425531914894, "grad_norm": 1.699882851098421, "learning_rate": 4.970414201183432e-07, "loss": 1.2368, "step": 417 }, { "epoch": 1.1117021276595744, "grad_norm": 1.6218516588828402, "learning_rate": 4.955621301775147e-07, "loss": 1.2906, "step": 418 }, { "epoch": 1.1143617021276595, "grad_norm": 1.6649091116123456, "learning_rate": 4.940828402366864e-07, "loss": 1.4454, "step": 419 }, { "epoch": 1.1170212765957448, "grad_norm": 1.728282227356823, "learning_rate": 4.926035502958579e-07, "loss": 1.4663, "step": 420 }, { "epoch": 1.1196808510638299, "grad_norm": 1.6435295189184387, "learning_rate": 4.911242603550296e-07, "loss": 1.4789, "step": 421 }, { "epoch": 1.122340425531915, "grad_norm": 1.8191659615562332, "learning_rate": 4.896449704142011e-07, "loss": 1.3986, "step": 422 }, { "epoch": 1.125, "grad_norm": 1.5470082389400086, "learning_rate": 4.881656804733728e-07, "loss": 1.4072, "step": 423 }, { "epoch": 1.127659574468085, "grad_norm": 1.581839768866324, "learning_rate": 4.866863905325443e-07, "loss": 1.3122, "step": 424 }, { "epoch": 1.1303191489361701, "grad_norm": 1.4620677635311095, "learning_rate": 4.852071005917159e-07, "loss": 1.2643, "step": 425 }, { "epoch": 1.1329787234042552, "grad_norm": 1.6707102916564711, "learning_rate": 4.837278106508875e-07, "loss": 1.3747, "step": 426 }, { "epoch": 1.1356382978723405, "grad_norm": 1.5396285202284683, "learning_rate": 4.822485207100592e-07, "loss": 1.3101, "step": 427 }, { "epoch": 1.1382978723404256, "grad_norm": 1.8606687901078265, "learning_rate": 4.807692307692307e-07, "loss": 1.3172, "step": 428 }, { "epoch": 1.1409574468085106, "grad_norm": 1.6119139560046312, "learning_rate": 4.792899408284024e-07, "loss": 1.3865, "step": 429 }, { "epoch": 1.1436170212765957, "grad_norm": 1.715672112601465, "learning_rate": 4.778106508875739e-07, "loss": 1.4168, "step": 430 }, { "epoch": 1.1462765957446808, "grad_norm": 1.6367162736314051, "learning_rate": 4.7633136094674555e-07, "loss": 1.6202, "step": 431 }, { "epoch": 1.148936170212766, "grad_norm": 1.6173047746530647, "learning_rate": 4.748520710059171e-07, "loss": 1.4345, "step": 432 }, { "epoch": 1.1515957446808511, "grad_norm": 1.591852292459417, "learning_rate": 4.733727810650887e-07, "loss": 1.3504, "step": 433 }, { "epoch": 1.1542553191489362, "grad_norm": 1.704091419091409, "learning_rate": 4.7189349112426035e-07, "loss": 1.3978, "step": 434 }, { "epoch": 1.1569148936170213, "grad_norm": 1.6750388468322808, "learning_rate": 4.7041420118343195e-07, "loss": 1.5323, "step": 435 }, { "epoch": 1.1595744680851063, "grad_norm": 1.550611356946591, "learning_rate": 4.6893491124260356e-07, "loss": 1.3516, "step": 436 }, { "epoch": 1.1622340425531914, "grad_norm": 1.6666235759250934, "learning_rate": 4.674556213017751e-07, "loss": 1.3193, "step": 437 }, { "epoch": 1.1648936170212765, "grad_norm": 1.6060648830034072, "learning_rate": 4.659763313609467e-07, "loss": 1.4802, "step": 438 }, { "epoch": 1.1675531914893618, "grad_norm": 2.7759623465499113, "learning_rate": 4.644970414201183e-07, "loss": 1.3673, "step": 439 }, { "epoch": 1.1702127659574468, "grad_norm": 1.6142500584687862, "learning_rate": 4.630177514792899e-07, "loss": 1.2367, "step": 440 }, { "epoch": 1.172872340425532, "grad_norm": 1.6293255382971552, "learning_rate": 4.6153846153846156e-07, "loss": 1.4771, "step": 441 }, { "epoch": 1.175531914893617, "grad_norm": 1.6166636037633662, "learning_rate": 4.600591715976331e-07, "loss": 1.3891, "step": 442 }, { "epoch": 1.178191489361702, "grad_norm": 1.6156668770120142, "learning_rate": 4.585798816568047e-07, "loss": 1.3015, "step": 443 }, { "epoch": 1.1808510638297873, "grad_norm": 1.541456190983287, "learning_rate": 4.571005917159763e-07, "loss": 1.325, "step": 444 }, { "epoch": 1.1835106382978724, "grad_norm": 1.5371528822910774, "learning_rate": 4.556213017751479e-07, "loss": 1.391, "step": 445 }, { "epoch": 1.1861702127659575, "grad_norm": 1.8047509120352834, "learning_rate": 4.541420118343195e-07, "loss": 1.3802, "step": 446 }, { "epoch": 1.1888297872340425, "grad_norm": 1.4772002442457595, "learning_rate": 4.5266272189349107e-07, "loss": 1.2972, "step": 447 }, { "epoch": 1.1914893617021276, "grad_norm": 1.4833680602448407, "learning_rate": 4.5118343195266267e-07, "loss": 1.3515, "step": 448 }, { "epoch": 1.1941489361702127, "grad_norm": 1.557530779220624, "learning_rate": 4.497041420118343e-07, "loss": 1.367, "step": 449 }, { "epoch": 1.196808510638298, "grad_norm": 1.8027220753490893, "learning_rate": 4.482248520710059e-07, "loss": 1.5443, "step": 450 }, { "epoch": 1.199468085106383, "grad_norm": 1.5684441226470547, "learning_rate": 4.467455621301775e-07, "loss": 1.3059, "step": 451 }, { "epoch": 1.202127659574468, "grad_norm": 1.593970040483734, "learning_rate": 4.4526627218934907e-07, "loss": 1.2474, "step": 452 }, { "epoch": 1.2047872340425532, "grad_norm": 1.7048839620218588, "learning_rate": 4.437869822485207e-07, "loss": 1.4689, "step": 453 }, { "epoch": 1.2074468085106382, "grad_norm": 1.6500745120708162, "learning_rate": 4.423076923076923e-07, "loss": 1.3768, "step": 454 }, { "epoch": 1.2101063829787235, "grad_norm": 1.6649022378945304, "learning_rate": 4.408284023668639e-07, "loss": 1.6992, "step": 455 }, { "epoch": 1.2127659574468086, "grad_norm": 2.150475218838757, "learning_rate": 4.3934911242603553e-07, "loss": 1.4338, "step": 456 }, { "epoch": 1.2154255319148937, "grad_norm": 1.4810681098612493, "learning_rate": 4.378698224852071e-07, "loss": 1.2523, "step": 457 }, { "epoch": 1.2180851063829787, "grad_norm": 1.5941194592252996, "learning_rate": 4.363905325443787e-07, "loss": 1.5144, "step": 458 }, { "epoch": 1.2207446808510638, "grad_norm": 2.9846606692055855, "learning_rate": 4.349112426035503e-07, "loss": 1.4394, "step": 459 }, { "epoch": 1.2234042553191489, "grad_norm": 1.5758645515570575, "learning_rate": 4.334319526627219e-07, "loss": 1.314, "step": 460 }, { "epoch": 1.226063829787234, "grad_norm": 2.0348791713600374, "learning_rate": 4.3195266272189343e-07, "loss": 1.3581, "step": 461 }, { "epoch": 1.2287234042553192, "grad_norm": 1.65492749945659, "learning_rate": 4.3047337278106503e-07, "loss": 1.5053, "step": 462 }, { "epoch": 1.2313829787234043, "grad_norm": 1.6722641251398465, "learning_rate": 4.289940828402367e-07, "loss": 1.4641, "step": 463 }, { "epoch": 1.2340425531914894, "grad_norm": 1.5474460973272384, "learning_rate": 4.275147928994083e-07, "loss": 1.4182, "step": 464 }, { "epoch": 1.2367021276595744, "grad_norm": 1.7345506046508428, "learning_rate": 4.260355029585799e-07, "loss": 1.3139, "step": 465 }, { "epoch": 1.2393617021276595, "grad_norm": 1.7713814803315784, "learning_rate": 4.2455621301775144e-07, "loss": 1.4832, "step": 466 }, { "epoch": 1.2420212765957448, "grad_norm": 1.5498103025703653, "learning_rate": 4.2307692307692304e-07, "loss": 1.4115, "step": 467 }, { "epoch": 1.2446808510638299, "grad_norm": 1.5577840972729278, "learning_rate": 4.2159763313609464e-07, "loss": 1.3256, "step": 468 }, { "epoch": 1.247340425531915, "grad_norm": 1.578861933203747, "learning_rate": 4.2011834319526624e-07, "loss": 1.2007, "step": 469 }, { "epoch": 1.25, "grad_norm": 1.6507686385229483, "learning_rate": 4.1863905325443785e-07, "loss": 1.3944, "step": 470 }, { "epoch": 1.252659574468085, "grad_norm": 1.7990714539210155, "learning_rate": 4.1715976331360945e-07, "loss": 1.4632, "step": 471 }, { "epoch": 1.2553191489361701, "grad_norm": 1.7618234269198014, "learning_rate": 4.1568047337278105e-07, "loss": 1.3313, "step": 472 }, { "epoch": 1.2579787234042552, "grad_norm": 1.5213599490802718, "learning_rate": 4.1420118343195265e-07, "loss": 1.5047, "step": 473 }, { "epoch": 1.2606382978723405, "grad_norm": 1.6052633557883167, "learning_rate": 4.1272189349112425e-07, "loss": 1.4177, "step": 474 }, { "epoch": 1.2632978723404256, "grad_norm": 1.9773267391803975, "learning_rate": 4.1124260355029585e-07, "loss": 1.2606, "step": 475 }, { "epoch": 1.2659574468085106, "grad_norm": 1.7023545368582853, "learning_rate": 4.097633136094674e-07, "loss": 1.3522, "step": 476 }, { "epoch": 1.2686170212765957, "grad_norm": 1.657218002450086, "learning_rate": 4.08284023668639e-07, "loss": 1.307, "step": 477 }, { "epoch": 1.2712765957446808, "grad_norm": 1.6560677482089055, "learning_rate": 4.0680473372781066e-07, "loss": 1.5599, "step": 478 }, { "epoch": 1.273936170212766, "grad_norm": 1.5827603390864668, "learning_rate": 4.0532544378698226e-07, "loss": 1.3864, "step": 479 }, { "epoch": 1.2765957446808511, "grad_norm": 1.490492812079521, "learning_rate": 4.0384615384615386e-07, "loss": 1.3238, "step": 480 }, { "epoch": 1.2792553191489362, "grad_norm": 1.4427306337618429, "learning_rate": 4.023668639053254e-07, "loss": 1.3381, "step": 481 }, { "epoch": 1.2819148936170213, "grad_norm": 1.8739427128710302, "learning_rate": 4.00887573964497e-07, "loss": 1.5195, "step": 482 }, { "epoch": 1.2845744680851063, "grad_norm": 1.4205586135195478, "learning_rate": 3.994082840236686e-07, "loss": 1.3342, "step": 483 }, { "epoch": 1.2872340425531914, "grad_norm": 1.4978308888768397, "learning_rate": 3.979289940828402e-07, "loss": 1.3198, "step": 484 }, { "epoch": 1.2898936170212765, "grad_norm": 1.453096779169849, "learning_rate": 3.9644970414201187e-07, "loss": 1.0572, "step": 485 }, { "epoch": 1.2925531914893618, "grad_norm": 1.9700050592115472, "learning_rate": 3.949704142011834e-07, "loss": 1.5476, "step": 486 }, { "epoch": 1.2952127659574468, "grad_norm": 1.52650807341244, "learning_rate": 3.93491124260355e-07, "loss": 1.3027, "step": 487 }, { "epoch": 1.297872340425532, "grad_norm": 1.6797022619264115, "learning_rate": 3.920118343195266e-07, "loss": 1.4014, "step": 488 }, { "epoch": 1.300531914893617, "grad_norm": 1.4684740172475148, "learning_rate": 3.905325443786982e-07, "loss": 1.2891, "step": 489 }, { "epoch": 1.3031914893617023, "grad_norm": 1.7009794386978352, "learning_rate": 3.8905325443786977e-07, "loss": 1.498, "step": 490 }, { "epoch": 1.3058510638297873, "grad_norm": 1.8679273089411261, "learning_rate": 3.8757396449704137e-07, "loss": 1.5135, "step": 491 }, { "epoch": 1.3085106382978724, "grad_norm": 2.6124670473785723, "learning_rate": 3.8609467455621297e-07, "loss": 1.4419, "step": 492 }, { "epoch": 1.3111702127659575, "grad_norm": 1.531497234704401, "learning_rate": 3.8461538461538463e-07, "loss": 1.441, "step": 493 }, { "epoch": 1.3138297872340425, "grad_norm": 1.6983808183380165, "learning_rate": 3.8313609467455623e-07, "loss": 1.3176, "step": 494 }, { "epoch": 1.3164893617021276, "grad_norm": 1.7106971746124235, "learning_rate": 3.816568047337278e-07, "loss": 1.2673, "step": 495 }, { "epoch": 1.3191489361702127, "grad_norm": 1.7661676163840787, "learning_rate": 3.801775147928994e-07, "loss": 1.6258, "step": 496 }, { "epoch": 1.3218085106382977, "grad_norm": 1.6248132891862335, "learning_rate": 3.78698224852071e-07, "loss": 1.3813, "step": 497 }, { "epoch": 1.324468085106383, "grad_norm": 1.5079876101311178, "learning_rate": 3.772189349112426e-07, "loss": 1.3491, "step": 498 }, { "epoch": 1.327127659574468, "grad_norm": 1.9080784267885529, "learning_rate": 3.757396449704142e-07, "loss": 1.4263, "step": 499 }, { "epoch": 1.3297872340425532, "grad_norm": 1.7134136936747053, "learning_rate": 3.742603550295858e-07, "loss": 1.4804, "step": 500 }, { "epoch": 1.3324468085106382, "grad_norm": 2.6890220767611934, "learning_rate": 3.727810650887574e-07, "loss": 1.4301, "step": 501 }, { "epoch": 1.3351063829787235, "grad_norm": 1.4491614153026324, "learning_rate": 3.71301775147929e-07, "loss": 1.2226, "step": 502 }, { "epoch": 1.3377659574468086, "grad_norm": 1.4673050610910694, "learning_rate": 3.698224852071006e-07, "loss": 1.2824, "step": 503 }, { "epoch": 1.3404255319148937, "grad_norm": 1.5811077672143066, "learning_rate": 3.683431952662722e-07, "loss": 1.4056, "step": 504 }, { "epoch": 1.3430851063829787, "grad_norm": 1.784207214911482, "learning_rate": 3.6686390532544374e-07, "loss": 1.4456, "step": 505 }, { "epoch": 1.3457446808510638, "grad_norm": 1.7545013437687231, "learning_rate": 3.6538461538461534e-07, "loss": 1.4255, "step": 506 }, { "epoch": 1.3484042553191489, "grad_norm": 1.5378814658235478, "learning_rate": 3.63905325443787e-07, "loss": 1.4752, "step": 507 }, { "epoch": 1.351063829787234, "grad_norm": 1.5663338737224375, "learning_rate": 3.624260355029586e-07, "loss": 1.4324, "step": 508 }, { "epoch": 1.3537234042553192, "grad_norm": 1.5530747526395428, "learning_rate": 3.609467455621302e-07, "loss": 1.3294, "step": 509 }, { "epoch": 1.3563829787234043, "grad_norm": 1.5555181536643647, "learning_rate": 3.5946745562130175e-07, "loss": 1.2615, "step": 510 }, { "epoch": 1.3590425531914894, "grad_norm": 1.561907923100703, "learning_rate": 3.5798816568047335e-07, "loss": 1.4247, "step": 511 }, { "epoch": 1.3617021276595744, "grad_norm": 1.561727561754077, "learning_rate": 3.5650887573964495e-07, "loss": 1.442, "step": 512 }, { "epoch": 1.3643617021276595, "grad_norm": 1.571729864924405, "learning_rate": 3.5502958579881655e-07, "loss": 1.3471, "step": 513 }, { "epoch": 1.3670212765957448, "grad_norm": 1.6501651767936791, "learning_rate": 3.535502958579881e-07, "loss": 1.4957, "step": 514 }, { "epoch": 1.3696808510638299, "grad_norm": 1.7712985007484374, "learning_rate": 3.5207100591715975e-07, "loss": 1.3116, "step": 515 }, { "epoch": 1.372340425531915, "grad_norm": 1.6021754882790804, "learning_rate": 3.5059171597633135e-07, "loss": 1.3507, "step": 516 }, { "epoch": 1.375, "grad_norm": 1.9744682223829157, "learning_rate": 3.4911242603550296e-07, "loss": 1.3187, "step": 517 }, { "epoch": 1.377659574468085, "grad_norm": 1.437548678030046, "learning_rate": 3.4763313609467456e-07, "loss": 1.3055, "step": 518 }, { "epoch": 1.3803191489361701, "grad_norm": 1.7376163882785898, "learning_rate": 3.461538461538461e-07, "loss": 1.3712, "step": 519 }, { "epoch": 1.3829787234042552, "grad_norm": 1.709895613646418, "learning_rate": 3.446745562130177e-07, "loss": 1.4941, "step": 520 }, { "epoch": 1.3856382978723405, "grad_norm": 1.5064773577923485, "learning_rate": 3.431952662721893e-07, "loss": 1.3598, "step": 521 }, { "epoch": 1.3882978723404256, "grad_norm": 1.6991123209979573, "learning_rate": 3.4171597633136096e-07, "loss": 1.3859, "step": 522 }, { "epoch": 1.3909574468085106, "grad_norm": 1.611358975201833, "learning_rate": 3.4023668639053256e-07, "loss": 1.3624, "step": 523 }, { "epoch": 1.3936170212765957, "grad_norm": 1.5235030722566782, "learning_rate": 3.387573964497041e-07, "loss": 1.306, "step": 524 }, { "epoch": 1.3962765957446808, "grad_norm": 1.5097567026286727, "learning_rate": 3.372781065088757e-07, "loss": 1.3098, "step": 525 }, { "epoch": 1.398936170212766, "grad_norm": 1.5501867735527708, "learning_rate": 3.357988165680473e-07, "loss": 1.2582, "step": 526 }, { "epoch": 1.4015957446808511, "grad_norm": 1.5737400889065642, "learning_rate": 3.343195266272189e-07, "loss": 1.4226, "step": 527 }, { "epoch": 1.4042553191489362, "grad_norm": 1.8163702192116935, "learning_rate": 3.328402366863905e-07, "loss": 1.45, "step": 528 }, { "epoch": 1.4069148936170213, "grad_norm": 1.6761526127572786, "learning_rate": 3.313609467455621e-07, "loss": 1.4133, "step": 529 }, { "epoch": 1.4095744680851063, "grad_norm": 1.7300976770863319, "learning_rate": 3.298816568047337e-07, "loss": 1.5036, "step": 530 }, { "epoch": 1.4122340425531914, "grad_norm": 1.7219520565452116, "learning_rate": 3.284023668639053e-07, "loss": 1.4172, "step": 531 }, { "epoch": 1.4148936170212765, "grad_norm": 1.8137826656078981, "learning_rate": 3.269230769230769e-07, "loss": 1.5673, "step": 532 }, { "epoch": 1.4175531914893618, "grad_norm": 1.9605494871424245, "learning_rate": 3.254437869822485e-07, "loss": 1.4421, "step": 533 }, { "epoch": 1.4202127659574468, "grad_norm": 1.5063443324517625, "learning_rate": 3.239644970414201e-07, "loss": 1.3858, "step": 534 }, { "epoch": 1.422872340425532, "grad_norm": 1.5929428001187216, "learning_rate": 3.224852071005917e-07, "loss": 1.4245, "step": 535 }, { "epoch": 1.425531914893617, "grad_norm": 1.5090052181328104, "learning_rate": 3.210059171597633e-07, "loss": 1.185, "step": 536 }, { "epoch": 1.4281914893617023, "grad_norm": 1.7599894966549008, "learning_rate": 3.1952662721893493e-07, "loss": 1.5936, "step": 537 }, { "epoch": 1.4308510638297873, "grad_norm": 1.8274682976599146, "learning_rate": 3.1804733727810653e-07, "loss": 1.5133, "step": 538 }, { "epoch": 1.4335106382978724, "grad_norm": 1.6304863965807304, "learning_rate": 3.165680473372781e-07, "loss": 1.4513, "step": 539 }, { "epoch": 1.4361702127659575, "grad_norm": 1.865748149954226, "learning_rate": 3.150887573964497e-07, "loss": 1.579, "step": 540 }, { "epoch": 1.4388297872340425, "grad_norm": 1.497890260310679, "learning_rate": 3.136094674556213e-07, "loss": 1.3996, "step": 541 }, { "epoch": 1.4414893617021276, "grad_norm": 1.5505684579290944, "learning_rate": 3.121301775147929e-07, "loss": 1.4765, "step": 542 }, { "epoch": 1.4441489361702127, "grad_norm": 1.5934674629645669, "learning_rate": 3.1065088757396443e-07, "loss": 1.2206, "step": 543 }, { "epoch": 1.4468085106382977, "grad_norm": 2.5003698075483776, "learning_rate": 3.091715976331361e-07, "loss": 1.4785, "step": 544 }, { "epoch": 1.449468085106383, "grad_norm": 1.5430363507491573, "learning_rate": 3.076923076923077e-07, "loss": 1.3596, "step": 545 }, { "epoch": 1.452127659574468, "grad_norm": 1.6114525579321486, "learning_rate": 3.062130177514793e-07, "loss": 1.3768, "step": 546 }, { "epoch": 1.4547872340425532, "grad_norm": 1.51705181171149, "learning_rate": 3.047337278106509e-07, "loss": 1.3161, "step": 547 }, { "epoch": 1.4574468085106382, "grad_norm": 1.659706683154854, "learning_rate": 3.0325443786982244e-07, "loss": 1.4808, "step": 548 }, { "epoch": 1.4601063829787235, "grad_norm": 1.6484483474446856, "learning_rate": 3.0177514792899404e-07, "loss": 1.398, "step": 549 }, { "epoch": 1.4627659574468086, "grad_norm": 1.6054531570011474, "learning_rate": 3.0029585798816564e-07, "loss": 1.1421, "step": 550 }, { "epoch": 1.4654255319148937, "grad_norm": 1.5260699880356663, "learning_rate": 2.988165680473373e-07, "loss": 1.4223, "step": 551 }, { "epoch": 1.4680851063829787, "grad_norm": 1.5022650148070196, "learning_rate": 2.973372781065089e-07, "loss": 1.3579, "step": 552 }, { "epoch": 1.4707446808510638, "grad_norm": 1.696210632092268, "learning_rate": 2.9585798816568045e-07, "loss": 1.4437, "step": 553 }, { "epoch": 1.4734042553191489, "grad_norm": 1.50505509525979, "learning_rate": 2.9437869822485205e-07, "loss": 1.3666, "step": 554 }, { "epoch": 1.476063829787234, "grad_norm": 1.6283581586889138, "learning_rate": 2.9289940828402365e-07, "loss": 1.3807, "step": 555 }, { "epoch": 1.4787234042553192, "grad_norm": 1.57845733466985, "learning_rate": 2.9142011834319525e-07, "loss": 1.4947, "step": 556 }, { "epoch": 1.4813829787234043, "grad_norm": 1.6269594263364617, "learning_rate": 2.8994082840236686e-07, "loss": 1.5315, "step": 557 }, { "epoch": 1.4840425531914894, "grad_norm": 1.4901674188093539, "learning_rate": 2.884615384615384e-07, "loss": 1.2067, "step": 558 }, { "epoch": 1.4867021276595744, "grad_norm": 1.608926803251607, "learning_rate": 2.8698224852071006e-07, "loss": 1.4501, "step": 559 }, { "epoch": 1.4893617021276595, "grad_norm": 1.5736272188001768, "learning_rate": 2.8550295857988166e-07, "loss": 1.4938, "step": 560 }, { "epoch": 1.4920212765957448, "grad_norm": 1.6178988306695008, "learning_rate": 2.8402366863905326e-07, "loss": 1.2858, "step": 561 }, { "epoch": 1.4946808510638299, "grad_norm": 1.612098241628475, "learning_rate": 2.8254437869822486e-07, "loss": 1.3793, "step": 562 }, { "epoch": 1.497340425531915, "grad_norm": 1.521850228548639, "learning_rate": 2.810650887573964e-07, "loss": 1.3616, "step": 563 }, { "epoch": 1.5, "grad_norm": 1.4283693834886921, "learning_rate": 2.79585798816568e-07, "loss": 1.2373, "step": 564 }, { "epoch": 1.502659574468085, "grad_norm": 1.4614575118454327, "learning_rate": 2.781065088757396e-07, "loss": 1.3506, "step": 565 }, { "epoch": 1.5053191489361701, "grad_norm": 4.833934856122629, "learning_rate": 2.7662721893491127e-07, "loss": 1.3368, "step": 566 }, { "epoch": 1.5079787234042552, "grad_norm": 1.5417407593664367, "learning_rate": 2.7514792899408287e-07, "loss": 1.3806, "step": 567 }, { "epoch": 1.5106382978723403, "grad_norm": 1.3942611390001125, "learning_rate": 2.736686390532544e-07, "loss": 1.2778, "step": 568 }, { "epoch": 1.5132978723404256, "grad_norm": 1.5232973474443783, "learning_rate": 2.72189349112426e-07, "loss": 1.5106, "step": 569 }, { "epoch": 1.5159574468085106, "grad_norm": 1.6181295111494955, "learning_rate": 2.707100591715976e-07, "loss": 1.3182, "step": 570 }, { "epoch": 1.5186170212765957, "grad_norm": 1.4905875051329172, "learning_rate": 2.692307692307692e-07, "loss": 1.359, "step": 571 }, { "epoch": 1.521276595744681, "grad_norm": 1.5438422326091557, "learning_rate": 2.6775147928994077e-07, "loss": 1.4581, "step": 572 }, { "epoch": 1.523936170212766, "grad_norm": 1.6689444553647594, "learning_rate": 2.662721893491124e-07, "loss": 1.4416, "step": 573 }, { "epoch": 1.5265957446808511, "grad_norm": 1.732092721800618, "learning_rate": 2.6479289940828403e-07, "loss": 1.4653, "step": 574 }, { "epoch": 1.5292553191489362, "grad_norm": 1.5939357125781168, "learning_rate": 2.6331360946745563e-07, "loss": 1.3659, "step": 575 }, { "epoch": 1.5319148936170213, "grad_norm": 1.619819379203523, "learning_rate": 2.6183431952662723e-07, "loss": 1.4057, "step": 576 }, { "epoch": 1.5345744680851063, "grad_norm": 1.5228031500567076, "learning_rate": 2.603550295857988e-07, "loss": 1.3322, "step": 577 }, { "epoch": 1.5372340425531914, "grad_norm": 1.6403075138073668, "learning_rate": 2.588757396449704e-07, "loss": 1.3243, "step": 578 }, { "epoch": 1.5398936170212765, "grad_norm": 1.6158463818930031, "learning_rate": 2.57396449704142e-07, "loss": 1.3743, "step": 579 }, { "epoch": 1.5425531914893615, "grad_norm": 1.4401607766731626, "learning_rate": 2.559171597633136e-07, "loss": 1.3209, "step": 580 }, { "epoch": 1.5452127659574468, "grad_norm": 1.610458527778034, "learning_rate": 2.5443786982248524e-07, "loss": 1.437, "step": 581 }, { "epoch": 1.547872340425532, "grad_norm": 1.4720391313596763, "learning_rate": 2.529585798816568e-07, "loss": 1.2406, "step": 582 }, { "epoch": 1.550531914893617, "grad_norm": 1.4693642812943966, "learning_rate": 2.514792899408284e-07, "loss": 1.3345, "step": 583 }, { "epoch": 1.5531914893617023, "grad_norm": 1.6024699547818029, "learning_rate": 2.5e-07, "loss": 1.4164, "step": 584 }, { "epoch": 1.5558510638297873, "grad_norm": 1.602502091357314, "learning_rate": 2.485207100591716e-07, "loss": 1.4412, "step": 585 }, { "epoch": 1.5585106382978724, "grad_norm": 1.7241679714315328, "learning_rate": 2.470414201183432e-07, "loss": 1.331, "step": 586 }, { "epoch": 1.5611702127659575, "grad_norm": 1.7371187244572857, "learning_rate": 2.455621301775148e-07, "loss": 1.4532, "step": 587 }, { "epoch": 1.5638297872340425, "grad_norm": 1.4995956670676633, "learning_rate": 2.440828402366864e-07, "loss": 1.2702, "step": 588 }, { "epoch": 1.5664893617021276, "grad_norm": 1.4659221291046236, "learning_rate": 2.4260355029585794e-07, "loss": 1.4754, "step": 589 }, { "epoch": 1.5691489361702127, "grad_norm": 1.5385805721266792, "learning_rate": 2.411242603550296e-07, "loss": 1.5509, "step": 590 }, { "epoch": 1.5718085106382977, "grad_norm": 1.5161262548508925, "learning_rate": 2.396449704142012e-07, "loss": 1.3936, "step": 591 }, { "epoch": 1.574468085106383, "grad_norm": 1.4666556990097799, "learning_rate": 2.3816568047337277e-07, "loss": 1.3143, "step": 592 }, { "epoch": 1.577127659574468, "grad_norm": 1.6300523417207398, "learning_rate": 2.3668639053254435e-07, "loss": 1.3385, "step": 593 }, { "epoch": 1.5797872340425532, "grad_norm": 1.5121985962743036, "learning_rate": 2.3520710059171598e-07, "loss": 1.4693, "step": 594 }, { "epoch": 1.5824468085106385, "grad_norm": 1.6977627534281994, "learning_rate": 2.3372781065088755e-07, "loss": 1.5827, "step": 595 }, { "epoch": 1.5851063829787235, "grad_norm": 1.4675438168952388, "learning_rate": 2.3224852071005915e-07, "loss": 1.4037, "step": 596 }, { "epoch": 1.5877659574468086, "grad_norm": 1.5087620408684652, "learning_rate": 2.3076923076923078e-07, "loss": 1.2713, "step": 597 }, { "epoch": 1.5904255319148937, "grad_norm": 1.853798719037303, "learning_rate": 2.2928994082840236e-07, "loss": 1.4517, "step": 598 }, { "epoch": 1.5930851063829787, "grad_norm": 1.5624391162454545, "learning_rate": 2.2781065088757396e-07, "loss": 1.5716, "step": 599 }, { "epoch": 1.5957446808510638, "grad_norm": 1.5647362537380562, "learning_rate": 2.2633136094674553e-07, "loss": 1.2679, "step": 600 }, { "epoch": 1.5984042553191489, "grad_norm": 1.5028293469540326, "learning_rate": 2.2485207100591716e-07, "loss": 1.3477, "step": 601 }, { "epoch": 1.601063829787234, "grad_norm": 1.5616178692766567, "learning_rate": 2.2337278106508876e-07, "loss": 1.349, "step": 602 }, { "epoch": 1.603723404255319, "grad_norm": 1.5652068533404448, "learning_rate": 2.2189349112426034e-07, "loss": 1.3408, "step": 603 }, { "epoch": 1.6063829787234043, "grad_norm": 1.8312281167867779, "learning_rate": 2.2041420118343194e-07, "loss": 1.5744, "step": 604 }, { "epoch": 1.6090425531914894, "grad_norm": 1.5113532834536092, "learning_rate": 2.1893491124260354e-07, "loss": 1.5116, "step": 605 }, { "epoch": 1.6117021276595744, "grad_norm": 1.4148002933798485, "learning_rate": 2.1745562130177514e-07, "loss": 1.3254, "step": 606 }, { "epoch": 1.6143617021276597, "grad_norm": 1.4128390757612144, "learning_rate": 2.1597633136094672e-07, "loss": 1.3424, "step": 607 }, { "epoch": 1.6170212765957448, "grad_norm": 1.664151543039297, "learning_rate": 2.1449704142011834e-07, "loss": 1.4507, "step": 608 }, { "epoch": 1.6196808510638299, "grad_norm": 1.5001892924079347, "learning_rate": 2.1301775147928995e-07, "loss": 1.3598, "step": 609 }, { "epoch": 1.622340425531915, "grad_norm": 1.7189011247258703, "learning_rate": 2.1153846153846152e-07, "loss": 1.4798, "step": 610 }, { "epoch": 1.625, "grad_norm": 1.4495039913652832, "learning_rate": 2.1005917159763312e-07, "loss": 1.1879, "step": 611 }, { "epoch": 1.627659574468085, "grad_norm": 1.4863964571390131, "learning_rate": 2.0857988165680472e-07, "loss": 1.4149, "step": 612 }, { "epoch": 1.6303191489361701, "grad_norm": 1.470842696782351, "learning_rate": 2.0710059171597633e-07, "loss": 1.5213, "step": 613 }, { "epoch": 1.6329787234042552, "grad_norm": 1.5332931589309218, "learning_rate": 2.0562130177514793e-07, "loss": 1.3847, "step": 614 }, { "epoch": 1.6356382978723403, "grad_norm": 1.5012230655181953, "learning_rate": 2.041420118343195e-07, "loss": 1.2194, "step": 615 }, { "epoch": 1.6382978723404256, "grad_norm": 1.4592244922211661, "learning_rate": 2.0266272189349113e-07, "loss": 1.2863, "step": 616 }, { "epoch": 1.6409574468085106, "grad_norm": 1.6194968573694928, "learning_rate": 2.011834319526627e-07, "loss": 1.563, "step": 617 }, { "epoch": 1.6436170212765957, "grad_norm": 1.5398995693701385, "learning_rate": 1.997041420118343e-07, "loss": 1.5, "step": 618 }, { "epoch": 1.646276595744681, "grad_norm": 1.803830954994613, "learning_rate": 1.9822485207100593e-07, "loss": 1.3459, "step": 619 }, { "epoch": 1.648936170212766, "grad_norm": 1.5731270083148248, "learning_rate": 1.967455621301775e-07, "loss": 1.3277, "step": 620 }, { "epoch": 1.6515957446808511, "grad_norm": 1.6370008858204694, "learning_rate": 1.952662721893491e-07, "loss": 1.4752, "step": 621 }, { "epoch": 1.6542553191489362, "grad_norm": 1.3905339157621093, "learning_rate": 1.9378698224852069e-07, "loss": 1.1591, "step": 622 }, { "epoch": 1.6569148936170213, "grad_norm": 1.521784820078054, "learning_rate": 1.9230769230769231e-07, "loss": 1.348, "step": 623 }, { "epoch": 1.6595744680851063, "grad_norm": 2.480779673395715, "learning_rate": 1.908284023668639e-07, "loss": 1.3468, "step": 624 }, { "epoch": 1.6622340425531914, "grad_norm": 1.5047866424190777, "learning_rate": 1.893491124260355e-07, "loss": 1.3808, "step": 625 }, { "epoch": 1.6648936170212765, "grad_norm": 1.5186127777273435, "learning_rate": 1.878698224852071e-07, "loss": 1.4201, "step": 626 }, { "epoch": 1.6675531914893615, "grad_norm": 1.4407427328000266, "learning_rate": 1.863905325443787e-07, "loss": 1.273, "step": 627 }, { "epoch": 1.6702127659574468, "grad_norm": 1.5224116074533014, "learning_rate": 1.849112426035503e-07, "loss": 1.2098, "step": 628 }, { "epoch": 1.672872340425532, "grad_norm": 1.527239003211648, "learning_rate": 1.8343195266272187e-07, "loss": 1.3724, "step": 629 }, { "epoch": 1.675531914893617, "grad_norm": 1.6525871512419401, "learning_rate": 1.819526627218935e-07, "loss": 1.3946, "step": 630 }, { "epoch": 1.6781914893617023, "grad_norm": 3.0200043340992933, "learning_rate": 1.804733727810651e-07, "loss": 1.4742, "step": 631 }, { "epoch": 1.6808510638297873, "grad_norm": 1.5029965510376364, "learning_rate": 1.7899408284023667e-07, "loss": 1.3623, "step": 632 }, { "epoch": 1.6835106382978724, "grad_norm": 1.5389625013367383, "learning_rate": 1.7751479289940827e-07, "loss": 1.5043, "step": 633 }, { "epoch": 1.6861702127659575, "grad_norm": 1.5608661501656413, "learning_rate": 1.7603550295857988e-07, "loss": 1.2883, "step": 634 }, { "epoch": 1.6888297872340425, "grad_norm": 1.6847845057440693, "learning_rate": 1.7455621301775148e-07, "loss": 1.4244, "step": 635 }, { "epoch": 1.6914893617021276, "grad_norm": 1.5793904433648327, "learning_rate": 1.7307692307692305e-07, "loss": 1.4062, "step": 636 }, { "epoch": 1.6941489361702127, "grad_norm": 1.4350293530642095, "learning_rate": 1.7159763313609465e-07, "loss": 1.2754, "step": 637 }, { "epoch": 1.6968085106382977, "grad_norm": 1.902506858522582, "learning_rate": 1.7011834319526628e-07, "loss": 1.4541, "step": 638 }, { "epoch": 1.699468085106383, "grad_norm": 1.478754263683889, "learning_rate": 1.6863905325443786e-07, "loss": 1.3463, "step": 639 }, { "epoch": 1.702127659574468, "grad_norm": 1.6464724285737642, "learning_rate": 1.6715976331360946e-07, "loss": 1.3807, "step": 640 }, { "epoch": 1.7047872340425532, "grad_norm": 1.6125752749357112, "learning_rate": 1.6568047337278106e-07, "loss": 1.2933, "step": 641 }, { "epoch": 1.7074468085106385, "grad_norm": 1.5928623495071816, "learning_rate": 1.6420118343195266e-07, "loss": 1.4326, "step": 642 }, { "epoch": 1.7101063829787235, "grad_norm": 1.5193190242572798, "learning_rate": 1.6272189349112426e-07, "loss": 1.3588, "step": 643 }, { "epoch": 1.7127659574468086, "grad_norm": 1.5482920311769846, "learning_rate": 1.6124260355029584e-07, "loss": 1.3839, "step": 644 }, { "epoch": 1.7154255319148937, "grad_norm": 1.8407335336806905, "learning_rate": 1.5976331360946747e-07, "loss": 1.3248, "step": 645 }, { "epoch": 1.7180851063829787, "grad_norm": 1.6055785649743377, "learning_rate": 1.5828402366863904e-07, "loss": 1.3872, "step": 646 }, { "epoch": 1.7207446808510638, "grad_norm": 1.6297496194969232, "learning_rate": 1.5680473372781064e-07, "loss": 1.389, "step": 647 }, { "epoch": 1.7234042553191489, "grad_norm": 1.577321745146047, "learning_rate": 1.5532544378698222e-07, "loss": 1.1947, "step": 648 }, { "epoch": 1.726063829787234, "grad_norm": 1.6447713137577962, "learning_rate": 1.5384615384615385e-07, "loss": 1.2652, "step": 649 }, { "epoch": 1.728723404255319, "grad_norm": 1.6234194331407543, "learning_rate": 1.5236686390532545e-07, "loss": 1.4239, "step": 650 }, { "epoch": 1.7313829787234043, "grad_norm": 1.532776130454777, "learning_rate": 1.5088757396449702e-07, "loss": 1.3875, "step": 651 }, { "epoch": 1.7340425531914894, "grad_norm": 1.4837535962878305, "learning_rate": 1.4940828402366865e-07, "loss": 1.2059, "step": 652 }, { "epoch": 1.7367021276595744, "grad_norm": 1.5395205053467318, "learning_rate": 1.4792899408284022e-07, "loss": 1.3513, "step": 653 }, { "epoch": 1.7393617021276597, "grad_norm": 1.4112077844892696, "learning_rate": 1.4644970414201183e-07, "loss": 1.3336, "step": 654 }, { "epoch": 1.7420212765957448, "grad_norm": 1.481010800777514, "learning_rate": 1.4497041420118343e-07, "loss": 1.4028, "step": 655 }, { "epoch": 1.7446808510638299, "grad_norm": 1.4564408238676725, "learning_rate": 1.4349112426035503e-07, "loss": 1.3502, "step": 656 }, { "epoch": 1.747340425531915, "grad_norm": 1.6956227102239596, "learning_rate": 1.4201183431952663e-07, "loss": 1.5672, "step": 657 }, { "epoch": 1.75, "grad_norm": 1.5705454639314052, "learning_rate": 1.405325443786982e-07, "loss": 1.4109, "step": 658 }, { "epoch": 1.752659574468085, "grad_norm": 1.5656622358755812, "learning_rate": 1.390532544378698e-07, "loss": 1.557, "step": 659 }, { "epoch": 1.7553191489361701, "grad_norm": 1.8848625197729474, "learning_rate": 1.3757396449704143e-07, "loss": 1.4017, "step": 660 }, { "epoch": 1.7579787234042552, "grad_norm": 1.4196764538431994, "learning_rate": 1.36094674556213e-07, "loss": 1.2331, "step": 661 }, { "epoch": 1.7606382978723403, "grad_norm": 1.4675927168298655, "learning_rate": 1.346153846153846e-07, "loss": 1.4689, "step": 662 }, { "epoch": 1.7632978723404256, "grad_norm": 1.6895719453339277, "learning_rate": 1.331360946745562e-07, "loss": 1.6055, "step": 663 }, { "epoch": 1.7659574468085106, "grad_norm": 1.6565509018980442, "learning_rate": 1.3165680473372781e-07, "loss": 1.346, "step": 664 }, { "epoch": 1.7686170212765957, "grad_norm": 1.6111421234975374, "learning_rate": 1.301775147928994e-07, "loss": 1.3318, "step": 665 }, { "epoch": 1.771276595744681, "grad_norm": 1.5477525938145107, "learning_rate": 1.28698224852071e-07, "loss": 1.4311, "step": 666 }, { "epoch": 1.773936170212766, "grad_norm": 1.4344548853484294, "learning_rate": 1.2721893491124262e-07, "loss": 1.4168, "step": 667 }, { "epoch": 1.7765957446808511, "grad_norm": 2.002400150167084, "learning_rate": 1.257396449704142e-07, "loss": 1.5304, "step": 668 }, { "epoch": 1.7792553191489362, "grad_norm": 1.6203137830914942, "learning_rate": 1.242603550295858e-07, "loss": 1.4902, "step": 669 }, { "epoch": 1.7819148936170213, "grad_norm": 1.653101321305009, "learning_rate": 1.227810650887574e-07, "loss": 1.523, "step": 670 }, { "epoch": 1.7845744680851063, "grad_norm": 1.4583067028702263, "learning_rate": 1.2130177514792897e-07, "loss": 1.3307, "step": 671 }, { "epoch": 1.7872340425531914, "grad_norm": 1.4416958484378999, "learning_rate": 1.198224852071006e-07, "loss": 1.2879, "step": 672 }, { "epoch": 1.7898936170212765, "grad_norm": 1.5342015216907867, "learning_rate": 1.1834319526627217e-07, "loss": 1.3491, "step": 673 }, { "epoch": 1.7925531914893615, "grad_norm": 1.5120417917571398, "learning_rate": 1.1686390532544378e-07, "loss": 1.5533, "step": 674 }, { "epoch": 1.7952127659574468, "grad_norm": 1.6448669091043147, "learning_rate": 1.1538461538461539e-07, "loss": 1.4507, "step": 675 }, { "epoch": 1.797872340425532, "grad_norm": 1.5744246355313867, "learning_rate": 1.1390532544378698e-07, "loss": 1.4762, "step": 676 }, { "epoch": 1.800531914893617, "grad_norm": 1.407351126310039, "learning_rate": 1.1242603550295858e-07, "loss": 1.2665, "step": 677 }, { "epoch": 1.8031914893617023, "grad_norm": 1.4428356495487928, "learning_rate": 1.1094674556213017e-07, "loss": 1.4222, "step": 678 }, { "epoch": 1.8058510638297873, "grad_norm": 1.4978022369408812, "learning_rate": 1.0946745562130177e-07, "loss": 1.3571, "step": 679 }, { "epoch": 1.8085106382978724, "grad_norm": 1.608694580830846, "learning_rate": 1.0798816568047336e-07, "loss": 1.2468, "step": 680 }, { "epoch": 1.8111702127659575, "grad_norm": 1.3671652219864612, "learning_rate": 1.0650887573964497e-07, "loss": 1.1918, "step": 681 }, { "epoch": 1.8138297872340425, "grad_norm": 1.5436563625586248, "learning_rate": 1.0502958579881656e-07, "loss": 1.3447, "step": 682 }, { "epoch": 1.8164893617021276, "grad_norm": 2.0668175329496448, "learning_rate": 1.0355029585798816e-07, "loss": 1.2851, "step": 683 }, { "epoch": 1.8191489361702127, "grad_norm": 1.4711737418040087, "learning_rate": 1.0207100591715975e-07, "loss": 1.4054, "step": 684 }, { "epoch": 1.8218085106382977, "grad_norm": 1.628475068104997, "learning_rate": 1.0059171597633135e-07, "loss": 1.2297, "step": 685 }, { "epoch": 1.824468085106383, "grad_norm": 1.6652537635356375, "learning_rate": 9.911242603550297e-08, "loss": 1.4249, "step": 686 }, { "epoch": 1.827127659574468, "grad_norm": 1.4549454801379844, "learning_rate": 9.763313609467456e-08, "loss": 1.4738, "step": 687 }, { "epoch": 1.8297872340425532, "grad_norm": 1.4571125733944477, "learning_rate": 9.615384615384616e-08, "loss": 1.2531, "step": 688 }, { "epoch": 1.8324468085106385, "grad_norm": 1.4934710030590315, "learning_rate": 9.467455621301774e-08, "loss": 1.4224, "step": 689 }, { "epoch": 1.8351063829787235, "grad_norm": 1.5068998589001918, "learning_rate": 9.319526627218935e-08, "loss": 1.4137, "step": 690 }, { "epoch": 1.8377659574468086, "grad_norm": 1.5592030646382606, "learning_rate": 9.171597633136093e-08, "loss": 1.4923, "step": 691 }, { "epoch": 1.8404255319148937, "grad_norm": 1.5420672523438603, "learning_rate": 9.023668639053255e-08, "loss": 1.3542, "step": 692 }, { "epoch": 1.8430851063829787, "grad_norm": 1.4933658760362354, "learning_rate": 8.875739644970414e-08, "loss": 1.4062, "step": 693 }, { "epoch": 1.8457446808510638, "grad_norm": 2.1197107348039648, "learning_rate": 8.727810650887574e-08, "loss": 1.3514, "step": 694 }, { "epoch": 1.8484042553191489, "grad_norm": 1.420310868366173, "learning_rate": 8.579881656804733e-08, "loss": 1.3865, "step": 695 }, { "epoch": 1.851063829787234, "grad_norm": 2.1476526664851083, "learning_rate": 8.431952662721893e-08, "loss": 1.1886, "step": 696 }, { "epoch": 1.853723404255319, "grad_norm": 1.3847908910859454, "learning_rate": 8.284023668639053e-08, "loss": 1.4107, "step": 697 }, { "epoch": 1.8563829787234043, "grad_norm": 1.6527903429011437, "learning_rate": 8.136094674556213e-08, "loss": 1.2876, "step": 698 }, { "epoch": 1.8590425531914894, "grad_norm": 1.5745014854949893, "learning_rate": 7.988165680473373e-08, "loss": 1.4558, "step": 699 }, { "epoch": 1.8617021276595744, "grad_norm": 1.5350363492855568, "learning_rate": 7.840236686390532e-08, "loss": 1.4523, "step": 700 }, { "epoch": 1.8643617021276597, "grad_norm": 1.4853786087332579, "learning_rate": 7.692307692307692e-08, "loss": 1.3292, "step": 701 }, { "epoch": 1.8670212765957448, "grad_norm": 1.4473821719214552, "learning_rate": 7.544378698224851e-08, "loss": 1.2034, "step": 702 }, { "epoch": 1.8696808510638299, "grad_norm": 1.4659266830367277, "learning_rate": 7.396449704142011e-08, "loss": 1.2584, "step": 703 }, { "epoch": 1.872340425531915, "grad_norm": 1.4759466915583441, "learning_rate": 7.248520710059171e-08, "loss": 1.3187, "step": 704 }, { "epoch": 1.875, "grad_norm": 2.111257320773056, "learning_rate": 7.100591715976332e-08, "loss": 1.3323, "step": 705 }, { "epoch": 1.877659574468085, "grad_norm": 1.5831480252428458, "learning_rate": 6.95266272189349e-08, "loss": 1.4302, "step": 706 }, { "epoch": 1.8803191489361701, "grad_norm": 1.6086043948043176, "learning_rate": 6.80473372781065e-08, "loss": 1.4017, "step": 707 }, { "epoch": 1.8829787234042552, "grad_norm": 2.0849492736061332, "learning_rate": 6.65680473372781e-08, "loss": 1.5211, "step": 708 }, { "epoch": 1.8856382978723403, "grad_norm": 1.5043217865201886, "learning_rate": 6.50887573964497e-08, "loss": 1.2166, "step": 709 }, { "epoch": 1.8882978723404256, "grad_norm": 1.5612635488662876, "learning_rate": 6.360946745562131e-08, "loss": 1.3481, "step": 710 }, { "epoch": 1.8909574468085106, "grad_norm": 1.4947402449036076, "learning_rate": 6.21301775147929e-08, "loss": 1.1988, "step": 711 }, { "epoch": 1.8936170212765957, "grad_norm": 1.7123431001612024, "learning_rate": 6.065088757396449e-08, "loss": 1.6215, "step": 712 }, { "epoch": 1.896276595744681, "grad_norm": 1.5722027689056413, "learning_rate": 5.917159763313609e-08, "loss": 1.435, "step": 713 }, { "epoch": 1.898936170212766, "grad_norm": 1.5736347184744337, "learning_rate": 5.7692307692307695e-08, "loss": 1.4243, "step": 714 }, { "epoch": 1.9015957446808511, "grad_norm": 1.4558857769282714, "learning_rate": 5.621301775147929e-08, "loss": 1.3732, "step": 715 }, { "epoch": 1.9042553191489362, "grad_norm": 1.430432818475582, "learning_rate": 5.4733727810650885e-08, "loss": 1.2582, "step": 716 }, { "epoch": 1.9069148936170213, "grad_norm": 1.4010572562597123, "learning_rate": 5.3254437869822486e-08, "loss": 1.2036, "step": 717 }, { "epoch": 1.9095744680851063, "grad_norm": 1.5030183623430164, "learning_rate": 5.177514792899408e-08, "loss": 1.3348, "step": 718 }, { "epoch": 1.9122340425531914, "grad_norm": 1.5264425521471463, "learning_rate": 5.0295857988165676e-08, "loss": 1.3486, "step": 719 }, { "epoch": 1.9148936170212765, "grad_norm": 1.6568867880777098, "learning_rate": 4.881656804733728e-08, "loss": 1.5102, "step": 720 }, { "epoch": 1.9175531914893615, "grad_norm": 1.426877705408139, "learning_rate": 4.733727810650887e-08, "loss": 1.2843, "step": 721 }, { "epoch": 1.9202127659574468, "grad_norm": 1.5745176121540452, "learning_rate": 4.585798816568047e-08, "loss": 1.5892, "step": 722 }, { "epoch": 1.922872340425532, "grad_norm": 1.5299374151207628, "learning_rate": 4.437869822485207e-08, "loss": 1.2833, "step": 723 }, { "epoch": 1.925531914893617, "grad_norm": 1.5485924328569498, "learning_rate": 4.2899408284023664e-08, "loss": 1.2528, "step": 724 }, { "epoch": 1.9281914893617023, "grad_norm": 1.5650812571579016, "learning_rate": 4.1420118343195265e-08, "loss": 1.4623, "step": 725 }, { "epoch": 1.9308510638297873, "grad_norm": 1.4874887834654986, "learning_rate": 3.9940828402366866e-08, "loss": 1.331, "step": 726 }, { "epoch": 1.9335106382978724, "grad_norm": 2.2386953559992606, "learning_rate": 3.846153846153846e-08, "loss": 1.3, "step": 727 }, { "epoch": 1.9361702127659575, "grad_norm": 1.6479534866842882, "learning_rate": 3.6982248520710056e-08, "loss": 1.5275, "step": 728 }, { "epoch": 1.9388297872340425, "grad_norm": 1.3421164941268597, "learning_rate": 3.550295857988166e-08, "loss": 1.2612, "step": 729 }, { "epoch": 1.9414893617021276, "grad_norm": 1.4738714329564195, "learning_rate": 3.402366863905325e-08, "loss": 1.4314, "step": 730 }, { "epoch": 1.9441489361702127, "grad_norm": 1.44156180682146, "learning_rate": 3.254437869822485e-08, "loss": 1.3338, "step": 731 }, { "epoch": 1.9468085106382977, "grad_norm": 1.5784061294788538, "learning_rate": 3.106508875739645e-08, "loss": 1.3386, "step": 732 }, { "epoch": 1.949468085106383, "grad_norm": 1.3371318363907538, "learning_rate": 2.9585798816568044e-08, "loss": 1.3028, "step": 733 }, { "epoch": 1.952127659574468, "grad_norm": 1.6852290803170833, "learning_rate": 2.8106508875739645e-08, "loss": 1.4942, "step": 734 }, { "epoch": 1.9547872340425532, "grad_norm": 1.4000270483265091, "learning_rate": 2.6627218934911243e-08, "loss": 1.2736, "step": 735 }, { "epoch": 1.9574468085106385, "grad_norm": 1.5150814064740574, "learning_rate": 2.5147928994082838e-08, "loss": 1.3892, "step": 736 }, { "epoch": 1.9601063829787235, "grad_norm": 1.6902820824629503, "learning_rate": 2.3668639053254436e-08, "loss": 1.4473, "step": 737 }, { "epoch": 1.9627659574468086, "grad_norm": 1.5540380607077866, "learning_rate": 2.2189349112426034e-08, "loss": 1.4118, "step": 738 }, { "epoch": 1.9654255319148937, "grad_norm": 1.7104646212150858, "learning_rate": 2.0710059171597633e-08, "loss": 1.3168, "step": 739 }, { "epoch": 1.9680851063829787, "grad_norm": 2.2605324172049865, "learning_rate": 1.923076923076923e-08, "loss": 1.5395, "step": 740 }, { "epoch": 1.9707446808510638, "grad_norm": 1.5974851484011308, "learning_rate": 1.775147928994083e-08, "loss": 1.6206, "step": 741 }, { "epoch": 1.9734042553191489, "grad_norm": 1.5065611553522427, "learning_rate": 1.6272189349112424e-08, "loss": 1.282, "step": 742 }, { "epoch": 1.976063829787234, "grad_norm": 1.5885436344655675, "learning_rate": 1.4792899408284022e-08, "loss": 1.3356, "step": 743 }, { "epoch": 1.978723404255319, "grad_norm": 1.503074753014641, "learning_rate": 1.3313609467455622e-08, "loss": 1.4519, "step": 744 }, { "epoch": 1.9813829787234043, "grad_norm": 1.4858939981761545, "learning_rate": 1.1834319526627218e-08, "loss": 1.4708, "step": 745 }, { "epoch": 1.9840425531914894, "grad_norm": 1.5483339923710784, "learning_rate": 1.0355029585798816e-08, "loss": 1.4574, "step": 746 }, { "epoch": 1.9867021276595744, "grad_norm": 1.4527735951794787, "learning_rate": 8.875739644970414e-09, "loss": 1.2918, "step": 747 }, { "epoch": 1.9893617021276597, "grad_norm": 1.6044461968692099, "learning_rate": 7.396449704142011e-09, "loss": 1.4377, "step": 748 }, { "epoch": 1.9920212765957448, "grad_norm": 1.530330381812826, "learning_rate": 5.917159763313609e-09, "loss": 1.4002, "step": 749 }, { "epoch": 1.9946808510638299, "grad_norm": 1.5188564969623919, "learning_rate": 4.437869822485207e-09, "loss": 1.3378, "step": 750 }, { "epoch": 1.997340425531915, "grad_norm": 1.470529168569605, "learning_rate": 2.9585798816568045e-09, "loss": 1.3217, "step": 751 }, { "epoch": 2.0, "grad_norm": 1.4650169982184404, "learning_rate": 1.4792899408284023e-09, "loss": 1.3323, "step": 752 } ], "logging_steps": 1.0, "max_steps": 752, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }