diff --git "a/checkpoint-10545/trainer_state.json" "b/checkpoint-10545/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10545/trainer_state.json" @@ -0,0 +1,73849 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.7975432211101, + "eval_steps": 500, + "global_step": 10545, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00045495905368516835, + "grad_norm": 9.461428161462043, + "learning_rate": 1e-05, + "loss": 0.1263, + "step": 1 + }, + { + "epoch": 0.0009099181073703367, + "grad_norm": 5.190780450250769, + "learning_rate": 9.99999979571129e-06, + "loss": 0.1723, + "step": 2 + }, + { + "epoch": 0.001364877161055505, + "grad_norm": 7.521926017130347, + "learning_rate": 9.999999182845177e-06, + "loss": 0.1327, + "step": 3 + }, + { + "epoch": 0.0018198362147406734, + "grad_norm": 2.5665810200307217, + "learning_rate": 9.99999816140171e-06, + "loss": 0.1095, + "step": 4 + }, + { + "epoch": 0.0022747952684258415, + "grad_norm": 2.738508706395883, + "learning_rate": 9.999996731380973e-06, + "loss": 0.1151, + "step": 5 + }, + { + "epoch": 0.00272975432211101, + "grad_norm": 2.67941899677245, + "learning_rate": 9.999994892783083e-06, + "loss": 0.0821, + "step": 6 + }, + { + "epoch": 0.0031847133757961785, + "grad_norm": 2.137586234420784, + "learning_rate": 9.99999264560819e-06, + "loss": 0.0729, + "step": 7 + }, + { + "epoch": 0.003639672429481347, + "grad_norm": 2.8221590420989164, + "learning_rate": 9.999989989856477e-06, + "loss": 0.0929, + "step": 8 + }, + { + "epoch": 0.004094631483166515, + "grad_norm": 1.6167314639784554, + "learning_rate": 9.999986925528164e-06, + "loss": 0.0466, + "step": 9 + }, + { + "epoch": 0.004549590536851683, + "grad_norm": 2.1773262431631313, + "learning_rate": 9.999983452623498e-06, + "loss": 0.0709, + "step": 10 + }, + { + "epoch": 0.005004549590536852, + "grad_norm": 7.6444390817806465, + "learning_rate": 9.999979571142765e-06, + "loss": 0.0809, + "step": 11 + }, + { + "epoch": 0.00545950864422202, + "grad_norm": 2.034523884241798, + "learning_rate": 9.999975281086278e-06, + "loss": 0.0839, + "step": 12 + }, + { + "epoch": 0.005914467697907188, + "grad_norm": 3.576108282005355, + "learning_rate": 9.999970582454392e-06, + "loss": 0.0728, + "step": 13 + }, + { + "epoch": 0.006369426751592357, + "grad_norm": 2.623641566468802, + "learning_rate": 9.999965475247491e-06, + "loss": 0.1052, + "step": 14 + }, + { + "epoch": 0.006824385805277525, + "grad_norm": 2.1413574998269085, + "learning_rate": 9.99995995946599e-06, + "loss": 0.0885, + "step": 15 + }, + { + "epoch": 0.007279344858962694, + "grad_norm": 1.4859066724415246, + "learning_rate": 9.999954035110342e-06, + "loss": 0.0644, + "step": 16 + }, + { + "epoch": 0.0077343039126478615, + "grad_norm": 2.851793157608408, + "learning_rate": 9.999947702181027e-06, + "loss": 0.1057, + "step": 17 + }, + { + "epoch": 0.00818926296633303, + "grad_norm": 4.693829546662477, + "learning_rate": 9.999940960678568e-06, + "loss": 0.0867, + "step": 18 + }, + { + "epoch": 0.008644222020018199, + "grad_norm": 2.2728033563417362, + "learning_rate": 9.999933810603513e-06, + "loss": 0.0789, + "step": 19 + }, + { + "epoch": 0.009099181073703366, + "grad_norm": 1.6705986173507794, + "learning_rate": 9.999926251956447e-06, + "loss": 0.0683, + "step": 20 + }, + { + "epoch": 0.009554140127388535, + "grad_norm": 2.187579869114393, + "learning_rate": 9.999918284737986e-06, + "loss": 0.0984, + "step": 21 + }, + { + "epoch": 0.010009099181073703, + "grad_norm": 2.328040268012338, + "learning_rate": 9.999909908948782e-06, + "loss": 0.0699, + "step": 22 + }, + { + "epoch": 0.010464058234758872, + "grad_norm": 5.572389775693198, + "learning_rate": 9.999901124589519e-06, + "loss": 0.0912, + "step": 23 + }, + { + "epoch": 0.01091901728844404, + "grad_norm": 1.84796719674859, + "learning_rate": 9.999891931660916e-06, + "loss": 0.1015, + "step": 24 + }, + { + "epoch": 0.011373976342129208, + "grad_norm": 1.7501762990792236, + "learning_rate": 9.999882330163725e-06, + "loss": 0.0909, + "step": 25 + }, + { + "epoch": 0.011828935395814377, + "grad_norm": 0.9922115950592263, + "learning_rate": 9.999872320098729e-06, + "loss": 0.0656, + "step": 26 + }, + { + "epoch": 0.012283894449499545, + "grad_norm": 1.5612370560987539, + "learning_rate": 9.999861901466746e-06, + "loss": 0.0974, + "step": 27 + }, + { + "epoch": 0.012738853503184714, + "grad_norm": 1.4617271794930395, + "learning_rate": 9.999851074268625e-06, + "loss": 0.0853, + "step": 28 + }, + { + "epoch": 0.013193812556869881, + "grad_norm": 1.8127085104491556, + "learning_rate": 9.999839838505257e-06, + "loss": 0.1081, + "step": 29 + }, + { + "epoch": 0.01364877161055505, + "grad_norm": 1.4710105512612208, + "learning_rate": 9.999828194177555e-06, + "loss": 0.0868, + "step": 30 + }, + { + "epoch": 0.014103730664240218, + "grad_norm": 1.3474487189311888, + "learning_rate": 9.999816141286472e-06, + "loss": 0.0817, + "step": 31 + }, + { + "epoch": 0.014558689717925387, + "grad_norm": 1.0967596652549403, + "learning_rate": 9.99980367983299e-06, + "loss": 0.0637, + "step": 32 + }, + { + "epoch": 0.015013648771610554, + "grad_norm": 3.179425671823194, + "learning_rate": 9.999790809818134e-06, + "loss": 0.069, + "step": 33 + }, + { + "epoch": 0.015468607825295723, + "grad_norm": 4.482257681577152, + "learning_rate": 9.999777531242951e-06, + "loss": 0.0915, + "step": 34 + }, + { + "epoch": 0.01592356687898089, + "grad_norm": 3.953299040475791, + "learning_rate": 9.999763844108528e-06, + "loss": 0.0562, + "step": 35 + }, + { + "epoch": 0.01637852593266606, + "grad_norm": 1.1127201050382067, + "learning_rate": 9.999749748415982e-06, + "loss": 0.0556, + "step": 36 + }, + { + "epoch": 0.01683348498635123, + "grad_norm": 79.45756094624792, + "learning_rate": 9.999735244166464e-06, + "loss": 0.1223, + "step": 37 + }, + { + "epoch": 0.017288444040036398, + "grad_norm": 2777.9092912017113, + "learning_rate": 9.99972033136116e-06, + "loss": 0.3211, + "step": 38 + }, + { + "epoch": 0.017743403093721567, + "grad_norm": 2.5204693177238466, + "learning_rate": 9.999705010001291e-06, + "loss": 0.0723, + "step": 39 + }, + { + "epoch": 0.018198362147406732, + "grad_norm": 2.2975907071135655, + "learning_rate": 9.999689280088105e-06, + "loss": 0.0696, + "step": 40 + }, + { + "epoch": 0.0186533212010919, + "grad_norm": 2.998434349074003, + "learning_rate": 9.99967314162289e-06, + "loss": 0.083, + "step": 41 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 3.882239448575704, + "learning_rate": 9.999656594606966e-06, + "loss": 0.1015, + "step": 42 + }, + { + "epoch": 0.019563239308462238, + "grad_norm": 3.5286596480512493, + "learning_rate": 9.999639639041681e-06, + "loss": 0.0817, + "step": 43 + }, + { + "epoch": 0.020018198362147407, + "grad_norm": 1.6933989447443707, + "learning_rate": 9.999622274928424e-06, + "loss": 0.1003, + "step": 44 + }, + { + "epoch": 0.020473157415832575, + "grad_norm": 1.2483160046323276, + "learning_rate": 9.999604502268614e-06, + "loss": 0.0952, + "step": 45 + }, + { + "epoch": 0.020928116469517744, + "grad_norm": 0.9417906124383243, + "learning_rate": 9.9995863210637e-06, + "loss": 0.0731, + "step": 46 + }, + { + "epoch": 0.021383075523202913, + "grad_norm": 2.8195414757816897, + "learning_rate": 9.99956773131517e-06, + "loss": 0.1845, + "step": 47 + }, + { + "epoch": 0.02183803457688808, + "grad_norm": 2.74390379471345, + "learning_rate": 9.999548733024545e-06, + "loss": 0.1826, + "step": 48 + }, + { + "epoch": 0.022292993630573247, + "grad_norm": 1.5138494619527987, + "learning_rate": 9.999529326193373e-06, + "loss": 0.0857, + "step": 49 + }, + { + "epoch": 0.022747952684258416, + "grad_norm": 1.215379974181271, + "learning_rate": 9.999509510823242e-06, + "loss": 0.0686, + "step": 50 + }, + { + "epoch": 0.023202911737943584, + "grad_norm": 1.292187967807859, + "learning_rate": 9.999489286915773e-06, + "loss": 0.0707, + "step": 51 + }, + { + "epoch": 0.023657870791628753, + "grad_norm": 1.7888013203563982, + "learning_rate": 9.999468654472614e-06, + "loss": 0.0682, + "step": 52 + }, + { + "epoch": 0.024112829845313922, + "grad_norm": 0.8979425621703144, + "learning_rate": 9.999447613495457e-06, + "loss": 0.0508, + "step": 53 + }, + { + "epoch": 0.02456778889899909, + "grad_norm": 1.9123835444775663, + "learning_rate": 9.99942616398602e-06, + "loss": 0.0689, + "step": 54 + }, + { + "epoch": 0.02502274795268426, + "grad_norm": 0.9393581994096443, + "learning_rate": 9.99940430594605e-06, + "loss": 0.0496, + "step": 55 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 1.0234476513644222, + "learning_rate": 9.999382039377339e-06, + "loss": 0.0601, + "step": 56 + }, + { + "epoch": 0.025932666060054597, + "grad_norm": 0.9291387208138827, + "learning_rate": 9.999359364281704e-06, + "loss": 0.0377, + "step": 57 + }, + { + "epoch": 0.026387625113739762, + "grad_norm": 1.8209170803663992, + "learning_rate": 9.999336280660999e-06, + "loss": 0.1144, + "step": 58 + }, + { + "epoch": 0.02684258416742493, + "grad_norm": 1.1214625046464874, + "learning_rate": 9.99931278851711e-06, + "loss": 0.0622, + "step": 59 + }, + { + "epoch": 0.0272975432211101, + "grad_norm": 1.0331723997917317, + "learning_rate": 9.999288887851956e-06, + "loss": 0.0667, + "step": 60 + }, + { + "epoch": 0.027752502274795268, + "grad_norm": 1.0412381501406744, + "learning_rate": 9.999264578667493e-06, + "loss": 0.0566, + "step": 61 + }, + { + "epoch": 0.028207461328480437, + "grad_norm": 1.4510603110658047, + "learning_rate": 9.999239860965703e-06, + "loss": 0.0845, + "step": 62 + }, + { + "epoch": 0.028662420382165606, + "grad_norm": 1.301162540669183, + "learning_rate": 9.999214734748609e-06, + "loss": 0.0759, + "step": 63 + }, + { + "epoch": 0.029117379435850774, + "grad_norm": 0.9977688847603402, + "learning_rate": 9.999189200018263e-06, + "loss": 0.0528, + "step": 64 + }, + { + "epoch": 0.029572338489535943, + "grad_norm": 1.2894688842348854, + "learning_rate": 9.99916325677675e-06, + "loss": 0.0899, + "step": 65 + }, + { + "epoch": 0.03002729754322111, + "grad_norm": 1.4627871680702638, + "learning_rate": 9.999136905026194e-06, + "loss": 0.1456, + "step": 66 + }, + { + "epoch": 0.030482256596906277, + "grad_norm": 1.2304385710214434, + "learning_rate": 9.999110144768745e-06, + "loss": 0.079, + "step": 67 + }, + { + "epoch": 0.030937215650591446, + "grad_norm": 1.085016380732753, + "learning_rate": 9.99908297600659e-06, + "loss": 0.0696, + "step": 68 + }, + { + "epoch": 0.03139217470427662, + "grad_norm": 0.989450558642297, + "learning_rate": 9.99905539874195e-06, + "loss": 0.069, + "step": 69 + }, + { + "epoch": 0.03184713375796178, + "grad_norm": 1.0510491151133208, + "learning_rate": 9.99902741297708e-06, + "loss": 0.0555, + "step": 70 + }, + { + "epoch": 0.03230209281164695, + "grad_norm": 0.8938033562648371, + "learning_rate": 9.998999018714264e-06, + "loss": 0.0783, + "step": 71 + }, + { + "epoch": 0.03275705186533212, + "grad_norm": 2.902512108322722, + "learning_rate": 9.998970215955824e-06, + "loss": 0.0702, + "step": 72 + }, + { + "epoch": 0.033212010919017286, + "grad_norm": 0.7661831894133686, + "learning_rate": 9.998941004704113e-06, + "loss": 0.0519, + "step": 73 + }, + { + "epoch": 0.03366696997270246, + "grad_norm": 1.1047249497744047, + "learning_rate": 9.998911384961518e-06, + "loss": 0.0773, + "step": 74 + }, + { + "epoch": 0.034121929026387623, + "grad_norm": 0.7750047299312716, + "learning_rate": 9.998881356730458e-06, + "loss": 0.0598, + "step": 75 + }, + { + "epoch": 0.034576888080072796, + "grad_norm": 0.9815801555720315, + "learning_rate": 9.99885092001339e-06, + "loss": 0.0661, + "step": 76 + }, + { + "epoch": 0.03503184713375796, + "grad_norm": 1.3090963451351905, + "learning_rate": 9.998820074812799e-06, + "loss": 0.0713, + "step": 77 + }, + { + "epoch": 0.03548680618744313, + "grad_norm": 1.1489338732270693, + "learning_rate": 9.998788821131207e-06, + "loss": 0.0946, + "step": 78 + }, + { + "epoch": 0.0359417652411283, + "grad_norm": 0.9040381990998293, + "learning_rate": 9.998757158971164e-06, + "loss": 0.067, + "step": 79 + }, + { + "epoch": 0.036396724294813464, + "grad_norm": 1.1019926198229115, + "learning_rate": 9.998725088335263e-06, + "loss": 0.0874, + "step": 80 + }, + { + "epoch": 0.036851683348498636, + "grad_norm": 0.5779852750462403, + "learning_rate": 9.99869260922612e-06, + "loss": 0.0492, + "step": 81 + }, + { + "epoch": 0.0373066424021838, + "grad_norm": 1.2769852710418472, + "learning_rate": 9.998659721646393e-06, + "loss": 0.0781, + "step": 82 + }, + { + "epoch": 0.03776160145586897, + "grad_norm": 0.9020624084974485, + "learning_rate": 9.998626425598766e-06, + "loss": 0.0734, + "step": 83 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 0.9626764462141776, + "learning_rate": 9.99859272108596e-06, + "loss": 0.0719, + "step": 84 + }, + { + "epoch": 0.03867151956323931, + "grad_norm": 0.9435885887029873, + "learning_rate": 9.998558608110733e-06, + "loss": 0.0835, + "step": 85 + }, + { + "epoch": 0.039126478616924476, + "grad_norm": 1.0578725525123687, + "learning_rate": 9.998524086675867e-06, + "loss": 0.0746, + "step": 86 + }, + { + "epoch": 0.03958143767060965, + "grad_norm": 1.0366588534208079, + "learning_rate": 9.998489156784188e-06, + "loss": 0.0933, + "step": 87 + }, + { + "epoch": 0.040036396724294813, + "grad_norm": 1.0595948680723846, + "learning_rate": 9.998453818438547e-06, + "loss": 0.0846, + "step": 88 + }, + { + "epoch": 0.04049135577797998, + "grad_norm": 0.8807515753016749, + "learning_rate": 9.998418071641833e-06, + "loss": 0.0649, + "step": 89 + }, + { + "epoch": 0.04094631483166515, + "grad_norm": 0.9034225145874141, + "learning_rate": 9.998381916396967e-06, + "loss": 0.0621, + "step": 90 + }, + { + "epoch": 0.041401273885350316, + "grad_norm": 0.6732889821553815, + "learning_rate": 9.998345352706901e-06, + "loss": 0.0367, + "step": 91 + }, + { + "epoch": 0.04185623293903549, + "grad_norm": 0.7136967603743426, + "learning_rate": 9.998308380574628e-06, + "loss": 0.0569, + "step": 92 + }, + { + "epoch": 0.042311191992720654, + "grad_norm": 1.1459385364035048, + "learning_rate": 9.998271000003166e-06, + "loss": 0.1184, + "step": 93 + }, + { + "epoch": 0.042766151046405826, + "grad_norm": 0.8224906129097734, + "learning_rate": 9.998233210995569e-06, + "loss": 0.0682, + "step": 94 + }, + { + "epoch": 0.04322111010009099, + "grad_norm": 1.5182946932236698, + "learning_rate": 9.998195013554926e-06, + "loss": 0.0875, + "step": 95 + }, + { + "epoch": 0.04367606915377616, + "grad_norm": 0.9355855711018981, + "learning_rate": 9.998156407684359e-06, + "loss": 0.0939, + "step": 96 + }, + { + "epoch": 0.04413102820746133, + "grad_norm": 0.7329840867165283, + "learning_rate": 9.998117393387022e-06, + "loss": 0.0466, + "step": 97 + }, + { + "epoch": 0.044585987261146494, + "grad_norm": 0.8701001036058451, + "learning_rate": 9.9980779706661e-06, + "loss": 0.0729, + "step": 98 + }, + { + "epoch": 0.045040946314831666, + "grad_norm": 1.0218896298663185, + "learning_rate": 9.99803813952482e-06, + "loss": 0.0828, + "step": 99 + }, + { + "epoch": 0.04549590536851683, + "grad_norm": 0.9044995357273884, + "learning_rate": 9.997997899966433e-06, + "loss": 0.0709, + "step": 100 + }, + { + "epoch": 0.045950864422202004, + "grad_norm": 0.9877796099816964, + "learning_rate": 9.99795725199423e-06, + "loss": 0.0903, + "step": 101 + }, + { + "epoch": 0.04640582347588717, + "grad_norm": 1.0061501994463906, + "learning_rate": 9.99791619561153e-06, + "loss": 0.0831, + "step": 102 + }, + { + "epoch": 0.04686078252957234, + "grad_norm": 0.8789173954818107, + "learning_rate": 9.997874730821689e-06, + "loss": 0.0714, + "step": 103 + }, + { + "epoch": 0.047315741583257506, + "grad_norm": 15.480920098194954, + "learning_rate": 9.997832857628093e-06, + "loss": 0.2603, + "step": 104 + }, + { + "epoch": 0.04777070063694268, + "grad_norm": 1.3806761301603454, + "learning_rate": 9.99779057603417e-06, + "loss": 0.1227, + "step": 105 + }, + { + "epoch": 0.048225659690627844, + "grad_norm": 0.8462176607269959, + "learning_rate": 9.997747886043368e-06, + "loss": 0.0605, + "step": 106 + }, + { + "epoch": 0.04868061874431301, + "grad_norm": 0.7467169847716549, + "learning_rate": 9.997704787659179e-06, + "loss": 0.0618, + "step": 107 + }, + { + "epoch": 0.04913557779799818, + "grad_norm": 1.5653334818977065, + "learning_rate": 9.997661280885125e-06, + "loss": 0.1253, + "step": 108 + }, + { + "epoch": 0.049590536851683346, + "grad_norm": 0.871706038604149, + "learning_rate": 9.99761736572476e-06, + "loss": 0.0716, + "step": 109 + }, + { + "epoch": 0.05004549590536852, + "grad_norm": 1.1398296008355844, + "learning_rate": 9.997573042181672e-06, + "loss": 0.0698, + "step": 110 + }, + { + "epoch": 0.050500454959053684, + "grad_norm": 1.0487992691419916, + "learning_rate": 9.997528310259485e-06, + "loss": 0.1102, + "step": 111 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 0.9112684449646818, + "learning_rate": 9.997483169961852e-06, + "loss": 0.1032, + "step": 112 + }, + { + "epoch": 0.05141037306642402, + "grad_norm": 0.9418790141923585, + "learning_rate": 9.997437621292463e-06, + "loss": 0.0771, + "step": 113 + }, + { + "epoch": 0.051865332120109194, + "grad_norm": 0.7796140692842074, + "learning_rate": 9.99739166425504e-06, + "loss": 0.0627, + "step": 114 + }, + { + "epoch": 0.05232029117379436, + "grad_norm": 1.5434421216734795, + "learning_rate": 9.997345298853339e-06, + "loss": 0.1495, + "step": 115 + }, + { + "epoch": 0.052775250227479524, + "grad_norm": 0.8898179660551836, + "learning_rate": 9.997298525091148e-06, + "loss": 0.0735, + "step": 116 + }, + { + "epoch": 0.053230209281164696, + "grad_norm": 0.8585916871524272, + "learning_rate": 9.997251342972288e-06, + "loss": 0.068, + "step": 117 + }, + { + "epoch": 0.05368516833484986, + "grad_norm": 0.812806800238708, + "learning_rate": 9.997203752500616e-06, + "loss": 0.0689, + "step": 118 + }, + { + "epoch": 0.054140127388535034, + "grad_norm": 0.9677722064277628, + "learning_rate": 9.997155753680021e-06, + "loss": 0.0795, + "step": 119 + }, + { + "epoch": 0.0545950864422202, + "grad_norm": 1.621934591654054, + "learning_rate": 9.997107346514425e-06, + "loss": 0.0707, + "step": 120 + }, + { + "epoch": 0.05505004549590537, + "grad_norm": 0.6750452750311531, + "learning_rate": 9.997058531007782e-06, + "loss": 0.0588, + "step": 121 + }, + { + "epoch": 0.055505004549590536, + "grad_norm": 0.9583870506818666, + "learning_rate": 9.997009307164083e-06, + "loss": 0.0859, + "step": 122 + }, + { + "epoch": 0.05595996360327571, + "grad_norm": 1.247483970027119, + "learning_rate": 9.99695967498735e-06, + "loss": 0.0952, + "step": 123 + }, + { + "epoch": 0.056414922656960874, + "grad_norm": 0.7937903902273558, + "learning_rate": 9.996909634481639e-06, + "loss": 0.0614, + "step": 124 + }, + { + "epoch": 0.05686988171064604, + "grad_norm": 4.855426128828546, + "learning_rate": 9.996859185651038e-06, + "loss": 0.1629, + "step": 125 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 1.0499970639607177, + "learning_rate": 9.99680832849967e-06, + "loss": 0.1031, + "step": 126 + }, + { + "epoch": 0.05777979981801638, + "grad_norm": 0.8730447821488512, + "learning_rate": 9.99675706303169e-06, + "loss": 0.0606, + "step": 127 + }, + { + "epoch": 0.05823475887170155, + "grad_norm": 1.2779985416162813, + "learning_rate": 9.99670538925129e-06, + "loss": 0.074, + "step": 128 + }, + { + "epoch": 0.058689717925386714, + "grad_norm": 0.8606157718419157, + "learning_rate": 9.996653307162687e-06, + "loss": 0.0703, + "step": 129 + }, + { + "epoch": 0.059144676979071886, + "grad_norm": 0.8920761218762643, + "learning_rate": 9.996600816770144e-06, + "loss": 0.0818, + "step": 130 + }, + { + "epoch": 0.05959963603275705, + "grad_norm": 1.1603462045917847, + "learning_rate": 9.996547918077944e-06, + "loss": 0.1148, + "step": 131 + }, + { + "epoch": 0.06005459508644222, + "grad_norm": 0.9108713801214797, + "learning_rate": 9.996494611090414e-06, + "loss": 0.0884, + "step": 132 + }, + { + "epoch": 0.06050955414012739, + "grad_norm": 0.6523725468628359, + "learning_rate": 9.996440895811907e-06, + "loss": 0.0535, + "step": 133 + }, + { + "epoch": 0.060964513193812554, + "grad_norm": 0.8812777694752004, + "learning_rate": 9.996386772246816e-06, + "loss": 0.087, + "step": 134 + }, + { + "epoch": 0.061419472247497726, + "grad_norm": 1.0622191207422995, + "learning_rate": 9.99633224039956e-06, + "loss": 0.0982, + "step": 135 + }, + { + "epoch": 0.06187443130118289, + "grad_norm": 3.7961077321923025, + "learning_rate": 9.996277300274596e-06, + "loss": 0.1526, + "step": 136 + }, + { + "epoch": 0.062329390354868064, + "grad_norm": 0.9444433559435487, + "learning_rate": 9.996221951876415e-06, + "loss": 0.0996, + "step": 137 + }, + { + "epoch": 0.06278434940855324, + "grad_norm": 1.444871481552235, + "learning_rate": 9.996166195209539e-06, + "loss": 0.1075, + "step": 138 + }, + { + "epoch": 0.0632393084622384, + "grad_norm": 0.7446446480732116, + "learning_rate": 9.996110030278522e-06, + "loss": 0.0561, + "step": 139 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 0.8913010543094952, + "learning_rate": 9.996053457087958e-06, + "loss": 0.0715, + "step": 140 + }, + { + "epoch": 0.06414922656960874, + "grad_norm": 0.7815821404043856, + "learning_rate": 9.995996475642466e-06, + "loss": 0.0796, + "step": 141 + }, + { + "epoch": 0.0646041856232939, + "grad_norm": 0.74337588448595, + "learning_rate": 9.995939085946704e-06, + "loss": 0.0661, + "step": 142 + }, + { + "epoch": 0.06505914467697907, + "grad_norm": 0.9974255688753435, + "learning_rate": 9.995881288005363e-06, + "loss": 0.0869, + "step": 143 + }, + { + "epoch": 0.06551410373066424, + "grad_norm": 1.2260290141946268, + "learning_rate": 9.995823081823162e-06, + "loss": 0.0766, + "step": 144 + }, + { + "epoch": 0.06596906278434941, + "grad_norm": 0.9751795993584637, + "learning_rate": 9.99576446740486e-06, + "loss": 0.091, + "step": 145 + }, + { + "epoch": 0.06642402183803457, + "grad_norm": 1.6175476325168967, + "learning_rate": 9.995705444755249e-06, + "loss": 0.1208, + "step": 146 + }, + { + "epoch": 0.06687898089171974, + "grad_norm": 0.7580083688127299, + "learning_rate": 9.995646013879147e-06, + "loss": 0.0622, + "step": 147 + }, + { + "epoch": 0.06733393994540492, + "grad_norm": 1.0194887039793072, + "learning_rate": 9.995586174781413e-06, + "loss": 0.0753, + "step": 148 + }, + { + "epoch": 0.06778889899909009, + "grad_norm": 0.9065646408503975, + "learning_rate": 9.995525927466936e-06, + "loss": 0.0848, + "step": 149 + }, + { + "epoch": 0.06824385805277525, + "grad_norm": 0.8871078738477127, + "learning_rate": 9.995465271940641e-06, + "loss": 0.0607, + "step": 150 + }, + { + "epoch": 0.06869881710646042, + "grad_norm": 1.1486707652049646, + "learning_rate": 9.995404208207485e-06, + "loss": 0.0809, + "step": 151 + }, + { + "epoch": 0.06915377616014559, + "grad_norm": 1.1473150526096232, + "learning_rate": 9.995342736272453e-06, + "loss": 0.1035, + "step": 152 + }, + { + "epoch": 0.06960873521383075, + "grad_norm": 1.3025683052462544, + "learning_rate": 9.995280856140572e-06, + "loss": 0.1197, + "step": 153 + }, + { + "epoch": 0.07006369426751592, + "grad_norm": 0.8069596755970996, + "learning_rate": 9.9952185678169e-06, + "loss": 0.0526, + "step": 154 + }, + { + "epoch": 0.0705186533212011, + "grad_norm": 0.8153700064848134, + "learning_rate": 9.995155871306524e-06, + "loss": 0.0613, + "step": 155 + }, + { + "epoch": 0.07097361237488627, + "grad_norm": 0.7319023745966868, + "learning_rate": 9.995092766614567e-06, + "loss": 0.0512, + "step": 156 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 1.0146656175738817, + "learning_rate": 9.995029253746186e-06, + "loss": 0.0846, + "step": 157 + }, + { + "epoch": 0.0718835304822566, + "grad_norm": 0.8015254985373994, + "learning_rate": 9.994965332706574e-06, + "loss": 0.0619, + "step": 158 + }, + { + "epoch": 0.07233848953594177, + "grad_norm": 1.0630207312416284, + "learning_rate": 9.994901003500952e-06, + "loss": 0.0796, + "step": 159 + }, + { + "epoch": 0.07279344858962693, + "grad_norm": 0.9431304991088505, + "learning_rate": 9.994836266134575e-06, + "loss": 0.0743, + "step": 160 + }, + { + "epoch": 0.0732484076433121, + "grad_norm": 1.023738915097686, + "learning_rate": 9.994771120612737e-06, + "loss": 0.0888, + "step": 161 + }, + { + "epoch": 0.07370336669699727, + "grad_norm": 0.9272637744585672, + "learning_rate": 9.994705566940757e-06, + "loss": 0.084, + "step": 162 + }, + { + "epoch": 0.07415832575068244, + "grad_norm": 1.122378326253592, + "learning_rate": 9.994639605123994e-06, + "loss": 0.0961, + "step": 163 + }, + { + "epoch": 0.0746132848043676, + "grad_norm": 0.753531768411978, + "learning_rate": 9.994573235167839e-06, + "loss": 0.0736, + "step": 164 + }, + { + "epoch": 0.07506824385805277, + "grad_norm": 0.9314766958597749, + "learning_rate": 9.994506457077715e-06, + "loss": 0.0838, + "step": 165 + }, + { + "epoch": 0.07552320291173795, + "grad_norm": 0.996008388557059, + "learning_rate": 9.994439270859077e-06, + "loss": 0.1076, + "step": 166 + }, + { + "epoch": 0.07597816196542312, + "grad_norm": 0.9199332464612126, + "learning_rate": 9.994371676517418e-06, + "loss": 0.0724, + "step": 167 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 0.8652292283168678, + "learning_rate": 9.994303674058259e-06, + "loss": 0.0628, + "step": 168 + }, + { + "epoch": 0.07688808007279345, + "grad_norm": 0.8176262426438138, + "learning_rate": 9.994235263487158e-06, + "loss": 0.0743, + "step": 169 + }, + { + "epoch": 0.07734303912647862, + "grad_norm": 0.8147855247941459, + "learning_rate": 9.994166444809705e-06, + "loss": 0.0559, + "step": 170 + }, + { + "epoch": 0.07779799818016378, + "grad_norm": 0.7853019575635352, + "learning_rate": 9.994097218031524e-06, + "loss": 0.0681, + "step": 171 + }, + { + "epoch": 0.07825295723384895, + "grad_norm": 0.8445610480134321, + "learning_rate": 9.994027583158272e-06, + "loss": 0.0785, + "step": 172 + }, + { + "epoch": 0.07870791628753412, + "grad_norm": 0.8555498692388026, + "learning_rate": 9.993957540195638e-06, + "loss": 0.077, + "step": 173 + }, + { + "epoch": 0.0791628753412193, + "grad_norm": 0.8281270493499452, + "learning_rate": 9.993887089149346e-06, + "loss": 0.0848, + "step": 174 + }, + { + "epoch": 0.07961783439490445, + "grad_norm": 0.7180425978661062, + "learning_rate": 9.993816230025152e-06, + "loss": 0.0588, + "step": 175 + }, + { + "epoch": 0.08007279344858963, + "grad_norm": 0.9287545326980071, + "learning_rate": 9.99374496282885e-06, + "loss": 0.0874, + "step": 176 + }, + { + "epoch": 0.0805277525022748, + "grad_norm": 1.5950603980195528, + "learning_rate": 9.993673287566261e-06, + "loss": 0.1301, + "step": 177 + }, + { + "epoch": 0.08098271155595996, + "grad_norm": 0.505966633973175, + "learning_rate": 9.99360120424324e-06, + "loss": 0.0459, + "step": 178 + }, + { + "epoch": 0.08143767060964513, + "grad_norm": 0.6170796905443107, + "learning_rate": 9.993528712865681e-06, + "loss": 0.0666, + "step": 179 + }, + { + "epoch": 0.0818926296633303, + "grad_norm": 0.8965600572228928, + "learning_rate": 9.993455813439507e-06, + "loss": 0.0648, + "step": 180 + }, + { + "epoch": 0.08234758871701547, + "grad_norm": 0.7555745664692847, + "learning_rate": 9.993382505970673e-06, + "loss": 0.0479, + "step": 181 + }, + { + "epoch": 0.08280254777070063, + "grad_norm": 0.7885826993774436, + "learning_rate": 9.99330879046517e-06, + "loss": 0.0605, + "step": 182 + }, + { + "epoch": 0.0832575068243858, + "grad_norm": 0.6970911126559147, + "learning_rate": 9.993234666929024e-06, + "loss": 0.0545, + "step": 183 + }, + { + "epoch": 0.08371246587807098, + "grad_norm": 0.8281240642020996, + "learning_rate": 9.99316013536829e-06, + "loss": 0.0651, + "step": 184 + }, + { + "epoch": 0.08416742493175614, + "grad_norm": 0.8497823551734951, + "learning_rate": 9.993085195789057e-06, + "loss": 0.098, + "step": 185 + }, + { + "epoch": 0.08462238398544131, + "grad_norm": 0.8425278224044996, + "learning_rate": 9.993009848197452e-06, + "loss": 0.0861, + "step": 186 + }, + { + "epoch": 0.08507734303912648, + "grad_norm": 0.729342450692031, + "learning_rate": 9.992934092599629e-06, + "loss": 0.0651, + "step": 187 + }, + { + "epoch": 0.08553230209281165, + "grad_norm": 0.8810253378927329, + "learning_rate": 9.99285792900178e-06, + "loss": 0.0995, + "step": 188 + }, + { + "epoch": 0.08598726114649681, + "grad_norm": 1.0402457083445067, + "learning_rate": 9.992781357410131e-06, + "loss": 0.1061, + "step": 189 + }, + { + "epoch": 0.08644222020018198, + "grad_norm": 0.7397036090930822, + "learning_rate": 9.992704377830934e-06, + "loss": 0.0571, + "step": 190 + }, + { + "epoch": 0.08689717925386715, + "grad_norm": 1.4783630598693296, + "learning_rate": 9.992626990270484e-06, + "loss": 0.1154, + "step": 191 + }, + { + "epoch": 0.08735213830755233, + "grad_norm": 1.1100322283473036, + "learning_rate": 9.992549194735101e-06, + "loss": 0.1179, + "step": 192 + }, + { + "epoch": 0.08780709736123748, + "grad_norm": 0.5797984556503705, + "learning_rate": 9.992470991231144e-06, + "loss": 0.0466, + "step": 193 + }, + { + "epoch": 0.08826205641492266, + "grad_norm": 1.059908713900853, + "learning_rate": 9.992392379765005e-06, + "loss": 0.0994, + "step": 194 + }, + { + "epoch": 0.08871701546860783, + "grad_norm": 1.1187885391430794, + "learning_rate": 9.992313360343104e-06, + "loss": 0.0986, + "step": 195 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 0.7509441330173129, + "learning_rate": 9.992233932971901e-06, + "loss": 0.0634, + "step": 196 + }, + { + "epoch": 0.08962693357597816, + "grad_norm": 0.9426276516690344, + "learning_rate": 9.992154097657888e-06, + "loss": 0.0857, + "step": 197 + }, + { + "epoch": 0.09008189262966333, + "grad_norm": 0.8754039034503873, + "learning_rate": 9.992073854407585e-06, + "loss": 0.0881, + "step": 198 + }, + { + "epoch": 0.0905368516833485, + "grad_norm": 2.8697219156120712, + "learning_rate": 9.99199320322755e-06, + "loss": 0.0851, + "step": 199 + }, + { + "epoch": 0.09099181073703366, + "grad_norm": 0.7429242681646778, + "learning_rate": 9.991912144124375e-06, + "loss": 0.0729, + "step": 200 + }, + { + "epoch": 0.09144676979071883, + "grad_norm": 1.0552979449251756, + "learning_rate": 9.991830677104682e-06, + "loss": 0.1066, + "step": 201 + }, + { + "epoch": 0.09190172884440401, + "grad_norm": 0.8812651371324355, + "learning_rate": 9.99174880217513e-06, + "loss": 0.0732, + "step": 202 + }, + { + "epoch": 0.09235668789808917, + "grad_norm": 1.0755107845413352, + "learning_rate": 9.991666519342407e-06, + "loss": 0.0977, + "step": 203 + }, + { + "epoch": 0.09281164695177434, + "grad_norm": 0.8925063431256136, + "learning_rate": 9.99158382861324e-06, + "loss": 0.0904, + "step": 204 + }, + { + "epoch": 0.09326660600545951, + "grad_norm": 0.8190206986922173, + "learning_rate": 9.991500729994384e-06, + "loss": 0.0729, + "step": 205 + }, + { + "epoch": 0.09372156505914468, + "grad_norm": 0.6635798147425112, + "learning_rate": 9.991417223492629e-06, + "loss": 0.0631, + "step": 206 + }, + { + "epoch": 0.09417652411282984, + "grad_norm": 1.0314655306023923, + "learning_rate": 9.991333309114798e-06, + "loss": 0.0852, + "step": 207 + }, + { + "epoch": 0.09463148316651501, + "grad_norm": 0.8533496857694978, + "learning_rate": 9.991248986867753e-06, + "loss": 0.0868, + "step": 208 + }, + { + "epoch": 0.09508644222020018, + "grad_norm": 1.039085255997433, + "learning_rate": 9.991164256758378e-06, + "loss": 0.095, + "step": 209 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 1.1484522866350177, + "learning_rate": 9.9910791187936e-06, + "loss": 0.1333, + "step": 210 + }, + { + "epoch": 0.09599636032757052, + "grad_norm": 0.8277820800102422, + "learning_rate": 9.99099357298038e-06, + "loss": 0.0664, + "step": 211 + }, + { + "epoch": 0.09645131938125569, + "grad_norm": 0.821796111319934, + "learning_rate": 9.9909076193257e-06, + "loss": 0.083, + "step": 212 + }, + { + "epoch": 0.09690627843494086, + "grad_norm": 0.9448800546720313, + "learning_rate": 9.990821257836589e-06, + "loss": 0.0873, + "step": 213 + }, + { + "epoch": 0.09736123748862602, + "grad_norm": 0.9002810379340489, + "learning_rate": 9.990734488520103e-06, + "loss": 0.099, + "step": 214 + }, + { + "epoch": 0.09781619654231119, + "grad_norm": 0.6145149717344348, + "learning_rate": 9.990647311383334e-06, + "loss": 0.0425, + "step": 215 + }, + { + "epoch": 0.09827115559599636, + "grad_norm": 1.1377497370761045, + "learning_rate": 9.990559726433404e-06, + "loss": 0.0903, + "step": 216 + }, + { + "epoch": 0.09872611464968153, + "grad_norm": 0.8401357673155365, + "learning_rate": 9.99047173367747e-06, + "loss": 0.0812, + "step": 217 + }, + { + "epoch": 0.09918107370336669, + "grad_norm": 0.6977882365614015, + "learning_rate": 9.990383333122722e-06, + "loss": 0.0613, + "step": 218 + }, + { + "epoch": 0.09963603275705187, + "grad_norm": 0.6751056796776193, + "learning_rate": 9.990294524776384e-06, + "loss": 0.0636, + "step": 219 + }, + { + "epoch": 0.10009099181073704, + "grad_norm": 0.7973250315161167, + "learning_rate": 9.990205308645716e-06, + "loss": 0.0655, + "step": 220 + }, + { + "epoch": 0.1005459508644222, + "grad_norm": 0.6494979859380491, + "learning_rate": 9.990115684738005e-06, + "loss": 0.0461, + "step": 221 + }, + { + "epoch": 0.10100090991810737, + "grad_norm": 0.7863907355652456, + "learning_rate": 9.990025653060574e-06, + "loss": 0.0881, + "step": 222 + }, + { + "epoch": 0.10145586897179254, + "grad_norm": 1.2756737972223395, + "learning_rate": 9.98993521362078e-06, + "loss": 0.1102, + "step": 223 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 1.1992554133605928, + "learning_rate": 9.989844366426018e-06, + "loss": 0.1147, + "step": 224 + }, + { + "epoch": 0.10236578707916287, + "grad_norm": 0.5034605400337953, + "learning_rate": 9.989753111483707e-06, + "loss": 0.0462, + "step": 225 + }, + { + "epoch": 0.10282074613284804, + "grad_norm": 0.9881921480518578, + "learning_rate": 9.989661448801305e-06, + "loss": 0.0848, + "step": 226 + }, + { + "epoch": 0.10327570518653321, + "grad_norm": 0.7581777568438945, + "learning_rate": 9.989569378386303e-06, + "loss": 0.079, + "step": 227 + }, + { + "epoch": 0.10373066424021839, + "grad_norm": 0.6464731162067388, + "learning_rate": 9.989476900246223e-06, + "loss": 0.0617, + "step": 228 + }, + { + "epoch": 0.10418562329390355, + "grad_norm": 0.8780639185859085, + "learning_rate": 9.989384014388624e-06, + "loss": 0.086, + "step": 229 + }, + { + "epoch": 0.10464058234758872, + "grad_norm": 0.6623808171307163, + "learning_rate": 9.989290720821095e-06, + "loss": 0.0694, + "step": 230 + }, + { + "epoch": 0.10509554140127389, + "grad_norm": 0.721054554263859, + "learning_rate": 9.98919701955126e-06, + "loss": 0.0735, + "step": 231 + }, + { + "epoch": 0.10555050045495905, + "grad_norm": 0.7868134014829404, + "learning_rate": 9.989102910586776e-06, + "loss": 0.0546, + "step": 232 + }, + { + "epoch": 0.10600545950864422, + "grad_norm": 0.9137158371163484, + "learning_rate": 9.989008393935331e-06, + "loss": 0.0771, + "step": 233 + }, + { + "epoch": 0.10646041856232939, + "grad_norm": 0.8326009579593463, + "learning_rate": 9.98891346960465e-06, + "loss": 0.0667, + "step": 234 + }, + { + "epoch": 0.10691537761601456, + "grad_norm": 0.6462724580348628, + "learning_rate": 9.988818137602494e-06, + "loss": 0.0717, + "step": 235 + }, + { + "epoch": 0.10737033666969972, + "grad_norm": 0.7513725247558808, + "learning_rate": 9.988722397936646e-06, + "loss": 0.0733, + "step": 236 + }, + { + "epoch": 0.1078252957233849, + "grad_norm": 1.094509848236789, + "learning_rate": 9.988626250614932e-06, + "loss": 0.1009, + "step": 237 + }, + { + "epoch": 0.10828025477707007, + "grad_norm": 0.8200579138639758, + "learning_rate": 9.98852969564521e-06, + "loss": 0.0844, + "step": 238 + }, + { + "epoch": 0.10873521383075523, + "grad_norm": 0.7417763562196316, + "learning_rate": 9.988432733035369e-06, + "loss": 0.0611, + "step": 239 + }, + { + "epoch": 0.1091901728844404, + "grad_norm": 0.8476475869820355, + "learning_rate": 9.988335362793333e-06, + "loss": 0.0863, + "step": 240 + }, + { + "epoch": 0.10964513193812557, + "grad_norm": 0.9998642783878469, + "learning_rate": 9.988237584927058e-06, + "loss": 0.0909, + "step": 241 + }, + { + "epoch": 0.11010009099181074, + "grad_norm": 1.1689324698997519, + "learning_rate": 9.988139399444534e-06, + "loss": 0.124, + "step": 242 + }, + { + "epoch": 0.1105550500454959, + "grad_norm": 0.790901332269412, + "learning_rate": 9.988040806353786e-06, + "loss": 0.0855, + "step": 243 + }, + { + "epoch": 0.11101000909918107, + "grad_norm": 0.8931785977847209, + "learning_rate": 9.987941805662869e-06, + "loss": 0.1023, + "step": 244 + }, + { + "epoch": 0.11146496815286625, + "grad_norm": 0.7352781929773609, + "learning_rate": 9.98784239737987e-06, + "loss": 0.0563, + "step": 245 + }, + { + "epoch": 0.11191992720655142, + "grad_norm": 0.7169092611535308, + "learning_rate": 9.987742581512919e-06, + "loss": 0.0683, + "step": 246 + }, + { + "epoch": 0.11237488626023658, + "grad_norm": 0.6767560569792272, + "learning_rate": 9.987642358070167e-06, + "loss": 0.0669, + "step": 247 + }, + { + "epoch": 0.11282984531392175, + "grad_norm": 0.8442319805699996, + "learning_rate": 9.987541727059805e-06, + "loss": 0.0768, + "step": 248 + }, + { + "epoch": 0.11328480436760692, + "grad_norm": 0.7700876798522618, + "learning_rate": 9.987440688490058e-06, + "loss": 0.0643, + "step": 249 + }, + { + "epoch": 0.11373976342129208, + "grad_norm": 0.7286087978317647, + "learning_rate": 9.98733924236918e-06, + "loss": 0.0698, + "step": 250 + }, + { + "epoch": 0.11419472247497725, + "grad_norm": 0.7917355018437868, + "learning_rate": 9.98723738870546e-06, + "loss": 0.0791, + "step": 251 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 1.0469499693242315, + "learning_rate": 9.987135127507226e-06, + "loss": 0.0761, + "step": 252 + }, + { + "epoch": 0.1151046405823476, + "grad_norm": 0.8361714930383379, + "learning_rate": 9.987032458782828e-06, + "loss": 0.0789, + "step": 253 + }, + { + "epoch": 0.11555959963603275, + "grad_norm": 0.5902853873046482, + "learning_rate": 9.986929382540662e-06, + "loss": 0.0479, + "step": 254 + }, + { + "epoch": 0.11601455868971793, + "grad_norm": 0.7349436304465384, + "learning_rate": 9.986825898789145e-06, + "loss": 0.0668, + "step": 255 + }, + { + "epoch": 0.1164695177434031, + "grad_norm": 0.7657107039148755, + "learning_rate": 9.986722007536737e-06, + "loss": 0.0617, + "step": 256 + }, + { + "epoch": 0.11692447679708826, + "grad_norm": 0.6450631027744769, + "learning_rate": 9.986617708791926e-06, + "loss": 0.0679, + "step": 257 + }, + { + "epoch": 0.11737943585077343, + "grad_norm": 0.6292930010016882, + "learning_rate": 9.986513002563236e-06, + "loss": 0.0482, + "step": 258 + }, + { + "epoch": 0.1178343949044586, + "grad_norm": 0.8758541343517451, + "learning_rate": 9.986407888859221e-06, + "loss": 0.0994, + "step": 259 + }, + { + "epoch": 0.11828935395814377, + "grad_norm": 0.6537445862223847, + "learning_rate": 9.986302367688473e-06, + "loss": 0.07, + "step": 260 + }, + { + "epoch": 0.11874431301182893, + "grad_norm": 0.8029660816844667, + "learning_rate": 9.986196439059613e-06, + "loss": 0.0623, + "step": 261 + }, + { + "epoch": 0.1191992720655141, + "grad_norm": 0.7339528606524214, + "learning_rate": 9.986090102981297e-06, + "loss": 0.0791, + "step": 262 + }, + { + "epoch": 0.11965423111919928, + "grad_norm": 0.7934112522002073, + "learning_rate": 9.985983359462215e-06, + "loss": 0.0672, + "step": 263 + }, + { + "epoch": 0.12010919017288443, + "grad_norm": 1.0186962263060808, + "learning_rate": 9.98587620851109e-06, + "loss": 0.1213, + "step": 264 + }, + { + "epoch": 0.1205641492265696, + "grad_norm": 0.6769843647605545, + "learning_rate": 9.985768650136679e-06, + "loss": 0.0685, + "step": 265 + }, + { + "epoch": 0.12101910828025478, + "grad_norm": 0.7543020935976431, + "learning_rate": 9.985660684347765e-06, + "loss": 0.0861, + "step": 266 + }, + { + "epoch": 0.12147406733393995, + "grad_norm": 0.9552124731299731, + "learning_rate": 9.985552311153178e-06, + "loss": 0.0922, + "step": 267 + }, + { + "epoch": 0.12192902638762511, + "grad_norm": 0.7436699167226903, + "learning_rate": 9.985443530561769e-06, + "loss": 0.0885, + "step": 268 + }, + { + "epoch": 0.12238398544131028, + "grad_norm": 1.329058937551934, + "learning_rate": 9.98533434258243e-06, + "loss": 0.1115, + "step": 269 + }, + { + "epoch": 0.12283894449499545, + "grad_norm": 0.6835909813818813, + "learning_rate": 9.985224747224083e-06, + "loss": 0.0586, + "step": 270 + }, + { + "epoch": 0.12329390354868063, + "grad_norm": 1.0733107060854794, + "learning_rate": 9.98511474449568e-06, + "loss": 0.0811, + "step": 271 + }, + { + "epoch": 0.12374886260236578, + "grad_norm": 0.5916007278667166, + "learning_rate": 9.985004334406215e-06, + "loss": 0.0696, + "step": 272 + }, + { + "epoch": 0.12420382165605096, + "grad_norm": 0.9149357508392912, + "learning_rate": 9.984893516964707e-06, + "loss": 0.0704, + "step": 273 + }, + { + "epoch": 0.12465878070973613, + "grad_norm": 1.1634742377762608, + "learning_rate": 9.984782292180212e-06, + "loss": 0.1178, + "step": 274 + }, + { + "epoch": 0.1251137397634213, + "grad_norm": 0.603957454908005, + "learning_rate": 9.98467066006182e-06, + "loss": 0.0585, + "step": 275 + }, + { + "epoch": 0.12556869881710647, + "grad_norm": 0.7735087790025026, + "learning_rate": 9.984558620618651e-06, + "loss": 0.0953, + "step": 276 + }, + { + "epoch": 0.12602365787079162, + "grad_norm": 1.2570182633873541, + "learning_rate": 9.984446173859863e-06, + "loss": 0.1353, + "step": 277 + }, + { + "epoch": 0.1264786169244768, + "grad_norm": 0.7275895818672663, + "learning_rate": 9.984333319794642e-06, + "loss": 0.0774, + "step": 278 + }, + { + "epoch": 0.12693357597816196, + "grad_norm": 0.6395006056363333, + "learning_rate": 9.984220058432212e-06, + "loss": 0.0591, + "step": 279 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 0.6563921850032347, + "learning_rate": 9.984106389781828e-06, + "loss": 0.0573, + "step": 280 + }, + { + "epoch": 0.1278434940855323, + "grad_norm": 0.9399157526953884, + "learning_rate": 9.983992313852776e-06, + "loss": 0.0793, + "step": 281 + }, + { + "epoch": 0.12829845313921748, + "grad_norm": 0.93528061821534, + "learning_rate": 9.983877830654381e-06, + "loss": 0.0807, + "step": 282 + }, + { + "epoch": 0.12875341219290265, + "grad_norm": 0.7192448233352142, + "learning_rate": 9.983762940195996e-06, + "loss": 0.0773, + "step": 283 + }, + { + "epoch": 0.1292083712465878, + "grad_norm": 0.7097381072031733, + "learning_rate": 9.98364764248701e-06, + "loss": 0.0698, + "step": 284 + }, + { + "epoch": 0.12966333030027297, + "grad_norm": 1.1635566012920768, + "learning_rate": 9.983531937536844e-06, + "loss": 0.0893, + "step": 285 + }, + { + "epoch": 0.13011828935395814, + "grad_norm": 0.8456555685011555, + "learning_rate": 9.983415825354954e-06, + "loss": 0.0628, + "step": 286 + }, + { + "epoch": 0.1305732484076433, + "grad_norm": 0.7151838393189083, + "learning_rate": 9.983299305950828e-06, + "loss": 0.0557, + "step": 287 + }, + { + "epoch": 0.13102820746132848, + "grad_norm": 0.7095193783870621, + "learning_rate": 9.983182379333989e-06, + "loss": 0.0604, + "step": 288 + }, + { + "epoch": 0.13148316651501366, + "grad_norm": 0.8581434444337498, + "learning_rate": 9.983065045513986e-06, + "loss": 0.0781, + "step": 289 + }, + { + "epoch": 0.13193812556869883, + "grad_norm": 0.5600994934804626, + "learning_rate": 9.982947304500414e-06, + "loss": 0.0498, + "step": 290 + }, + { + "epoch": 0.13239308462238397, + "grad_norm": 0.7355720212694087, + "learning_rate": 9.98282915630289e-06, + "loss": 0.0692, + "step": 291 + }, + { + "epoch": 0.13284804367606914, + "grad_norm": 1.6846985851500909, + "learning_rate": 9.98271060093107e-06, + "loss": 0.1687, + "step": 292 + }, + { + "epoch": 0.13330300272975432, + "grad_norm": 0.7959406174268434, + "learning_rate": 9.98259163839464e-06, + "loss": 0.0718, + "step": 293 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 0.6005858848115938, + "learning_rate": 9.982472268703323e-06, + "loss": 0.0465, + "step": 294 + }, + { + "epoch": 0.13421292083712466, + "grad_norm": 0.7865103977061746, + "learning_rate": 9.982352491866874e-06, + "loss": 0.071, + "step": 295 + }, + { + "epoch": 0.13466787989080983, + "grad_norm": 0.7167219429964851, + "learning_rate": 9.982232307895077e-06, + "loss": 0.0658, + "step": 296 + }, + { + "epoch": 0.135122838944495, + "grad_norm": 1.206398567596641, + "learning_rate": 9.982111716797758e-06, + "loss": 0.101, + "step": 297 + }, + { + "epoch": 0.13557779799818018, + "grad_norm": 1.0085912508470862, + "learning_rate": 9.981990718584768e-06, + "loss": 0.0959, + "step": 298 + }, + { + "epoch": 0.13603275705186532, + "grad_norm": 0.8594135430057543, + "learning_rate": 9.981869313265995e-06, + "loss": 0.0912, + "step": 299 + }, + { + "epoch": 0.1364877161055505, + "grad_norm": 0.9903339586980618, + "learning_rate": 9.981747500851357e-06, + "loss": 0.0692, + "step": 300 + }, + { + "epoch": 0.13694267515923567, + "grad_norm": 0.7623380548666351, + "learning_rate": 9.981625281350812e-06, + "loss": 0.0699, + "step": 301 + }, + { + "epoch": 0.13739763421292084, + "grad_norm": 0.6267143484055344, + "learning_rate": 9.981502654774349e-06, + "loss": 0.0499, + "step": 302 + }, + { + "epoch": 0.137852593266606, + "grad_norm": 0.8234150836820757, + "learning_rate": 9.98137962113198e-06, + "loss": 0.0788, + "step": 303 + }, + { + "epoch": 0.13830755232029118, + "grad_norm": 0.8158733102806115, + "learning_rate": 9.98125618043377e-06, + "loss": 0.089, + "step": 304 + }, + { + "epoch": 0.13876251137397635, + "grad_norm": 0.6372656549463032, + "learning_rate": 9.981132332689796e-06, + "loss": 0.0517, + "step": 305 + }, + { + "epoch": 0.1392174704276615, + "grad_norm": 0.7713863813548327, + "learning_rate": 9.981008077910184e-06, + "loss": 0.0769, + "step": 306 + }, + { + "epoch": 0.13967242948134667, + "grad_norm": 0.8883775702857831, + "learning_rate": 9.980883416105084e-06, + "loss": 0.0828, + "step": 307 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 0.6490936355626988, + "learning_rate": 9.980758347284687e-06, + "loss": 0.0618, + "step": 308 + }, + { + "epoch": 0.14058234758871702, + "grad_norm": 0.8359554084586713, + "learning_rate": 9.980632871459209e-06, + "loss": 0.0714, + "step": 309 + }, + { + "epoch": 0.1410373066424022, + "grad_norm": 0.7373523328454649, + "learning_rate": 9.980506988638906e-06, + "loss": 0.0836, + "step": 310 + }, + { + "epoch": 0.14149226569608736, + "grad_norm": 0.6644370731485183, + "learning_rate": 9.980380698834064e-06, + "loss": 0.0777, + "step": 311 + }, + { + "epoch": 0.14194722474977253, + "grad_norm": 0.870883965477211, + "learning_rate": 9.980254002055003e-06, + "loss": 0.0847, + "step": 312 + }, + { + "epoch": 0.14240218380345768, + "grad_norm": 0.6021065409531002, + "learning_rate": 9.980126898312074e-06, + "loss": 0.0583, + "step": 313 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.8705461588189498, + "learning_rate": 9.979999387615665e-06, + "loss": 0.0895, + "step": 314 + }, + { + "epoch": 0.14331210191082802, + "grad_norm": 0.9639410731114018, + "learning_rate": 9.979871469976197e-06, + "loss": 0.0901, + "step": 315 + }, + { + "epoch": 0.1437670609645132, + "grad_norm": 0.7554126383153169, + "learning_rate": 9.97974314540412e-06, + "loss": 0.0699, + "step": 316 + }, + { + "epoch": 0.14422202001819837, + "grad_norm": 1.1039648440512544, + "learning_rate": 9.979614413909922e-06, + "loss": 0.1013, + "step": 317 + }, + { + "epoch": 0.14467697907188354, + "grad_norm": 0.5258831871743486, + "learning_rate": 9.979485275504121e-06, + "loss": 0.0544, + "step": 318 + }, + { + "epoch": 0.1451319381255687, + "grad_norm": 1.3025897394440575, + "learning_rate": 9.979355730197271e-06, + "loss": 0.1067, + "step": 319 + }, + { + "epoch": 0.14558689717925385, + "grad_norm": 0.5206132423310033, + "learning_rate": 9.979225777999956e-06, + "loss": 0.0497, + "step": 320 + }, + { + "epoch": 0.14604185623293903, + "grad_norm": 0.7202189397663867, + "learning_rate": 9.9790954189228e-06, + "loss": 0.0807, + "step": 321 + }, + { + "epoch": 0.1464968152866242, + "grad_norm": 0.5738667169449175, + "learning_rate": 9.97896465297645e-06, + "loss": 0.0614, + "step": 322 + }, + { + "epoch": 0.14695177434030937, + "grad_norm": 0.7972440737628133, + "learning_rate": 9.978833480171592e-06, + "loss": 0.0906, + "step": 323 + }, + { + "epoch": 0.14740673339399454, + "grad_norm": 0.7697423454053598, + "learning_rate": 9.978701900518947e-06, + "loss": 0.0632, + "step": 324 + }, + { + "epoch": 0.14786169244767972, + "grad_norm": 0.8259885564233931, + "learning_rate": 9.978569914029267e-06, + "loss": 0.0944, + "step": 325 + }, + { + "epoch": 0.1483166515013649, + "grad_norm": 0.8450006655868962, + "learning_rate": 9.978437520713335e-06, + "loss": 0.0862, + "step": 326 + }, + { + "epoch": 0.14877161055505003, + "grad_norm": 0.7746078278616594, + "learning_rate": 9.978304720581973e-06, + "loss": 0.088, + "step": 327 + }, + { + "epoch": 0.1492265696087352, + "grad_norm": 0.9977734940815816, + "learning_rate": 9.97817151364603e-06, + "loss": 0.1036, + "step": 328 + }, + { + "epoch": 0.14968152866242038, + "grad_norm": 0.7800752301510507, + "learning_rate": 9.978037899916393e-06, + "loss": 0.0778, + "step": 329 + }, + { + "epoch": 0.15013648771610555, + "grad_norm": 0.7521153273438224, + "learning_rate": 9.97790387940398e-06, + "loss": 0.0532, + "step": 330 + }, + { + "epoch": 0.15059144676979072, + "grad_norm": 0.8046420256419254, + "learning_rate": 9.977769452119741e-06, + "loss": 0.0708, + "step": 331 + }, + { + "epoch": 0.1510464058234759, + "grad_norm": 0.9071770528791517, + "learning_rate": 9.97763461807466e-06, + "loss": 0.1006, + "step": 332 + }, + { + "epoch": 0.15150136487716107, + "grad_norm": 0.8824570234268595, + "learning_rate": 9.97749937727976e-06, + "loss": 0.0855, + "step": 333 + }, + { + "epoch": 0.15195632393084624, + "grad_norm": 0.8286075823730068, + "learning_rate": 9.977363729746088e-06, + "loss": 0.077, + "step": 334 + }, + { + "epoch": 0.15241128298453138, + "grad_norm": 0.6791233851472963, + "learning_rate": 9.977227675484729e-06, + "loss": 0.0698, + "step": 335 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 0.9813875260679181, + "learning_rate": 9.977091214506803e-06, + "loss": 0.0838, + "step": 336 + }, + { + "epoch": 0.15332120109190173, + "grad_norm": 0.9986284190120469, + "learning_rate": 9.976954346823456e-06, + "loss": 0.0789, + "step": 337 + }, + { + "epoch": 0.1537761601455869, + "grad_norm": 0.6456071732838817, + "learning_rate": 9.976817072445878e-06, + "loss": 0.0566, + "step": 338 + }, + { + "epoch": 0.15423111919927207, + "grad_norm": 0.7707362352402762, + "learning_rate": 9.976679391385283e-06, + "loss": 0.0677, + "step": 339 + }, + { + "epoch": 0.15468607825295724, + "grad_norm": 0.5804713825378958, + "learning_rate": 9.976541303652923e-06, + "loss": 0.0547, + "step": 340 + }, + { + "epoch": 0.15514103730664242, + "grad_norm": 0.7705377953828665, + "learning_rate": 9.976402809260083e-06, + "loss": 0.0673, + "step": 341 + }, + { + "epoch": 0.15559599636032756, + "grad_norm": 0.651002355082985, + "learning_rate": 9.976263908218076e-06, + "loss": 0.066, + "step": 342 + }, + { + "epoch": 0.15605095541401273, + "grad_norm": 1.0075230687249708, + "learning_rate": 9.976124600538257e-06, + "loss": 0.1151, + "step": 343 + }, + { + "epoch": 0.1565059144676979, + "grad_norm": 0.7110146200064966, + "learning_rate": 9.975984886232006e-06, + "loss": 0.0693, + "step": 344 + }, + { + "epoch": 0.15696087352138308, + "grad_norm": 0.782615076662302, + "learning_rate": 9.975844765310743e-06, + "loss": 0.071, + "step": 345 + }, + { + "epoch": 0.15741583257506825, + "grad_norm": 1.091513822496144, + "learning_rate": 9.975704237785915e-06, + "loss": 0.1277, + "step": 346 + }, + { + "epoch": 0.15787079162875342, + "grad_norm": 0.8244942271322709, + "learning_rate": 9.975563303669006e-06, + "loss": 0.092, + "step": 347 + }, + { + "epoch": 0.1583257506824386, + "grad_norm": 1.0997264747524325, + "learning_rate": 9.975421962971536e-06, + "loss": 0.102, + "step": 348 + }, + { + "epoch": 0.15878070973612374, + "grad_norm": 1.0471722358260585, + "learning_rate": 9.97528021570505e-06, + "loss": 0.1112, + "step": 349 + }, + { + "epoch": 0.1592356687898089, + "grad_norm": 0.6366013160292697, + "learning_rate": 9.975138061881135e-06, + "loss": 0.0629, + "step": 350 + }, + { + "epoch": 0.15969062784349408, + "grad_norm": 0.7145502784859615, + "learning_rate": 9.974995501511404e-06, + "loss": 0.0567, + "step": 351 + }, + { + "epoch": 0.16014558689717925, + "grad_norm": 1.0825694007542435, + "learning_rate": 9.974852534607506e-06, + "loss": 0.0897, + "step": 352 + }, + { + "epoch": 0.16060054595086443, + "grad_norm": 0.8874195306329471, + "learning_rate": 9.974709161181126e-06, + "loss": 0.0879, + "step": 353 + }, + { + "epoch": 0.1610555050045496, + "grad_norm": 0.8193025449594961, + "learning_rate": 9.974565381243982e-06, + "loss": 0.0969, + "step": 354 + }, + { + "epoch": 0.16151046405823477, + "grad_norm": 0.76528422131405, + "learning_rate": 9.974421194807815e-06, + "loss": 0.0786, + "step": 355 + }, + { + "epoch": 0.16196542311191992, + "grad_norm": 0.8836543328533641, + "learning_rate": 9.974276601884416e-06, + "loss": 0.0744, + "step": 356 + }, + { + "epoch": 0.1624203821656051, + "grad_norm": 0.7482952108426273, + "learning_rate": 9.974131602485596e-06, + "loss": 0.0772, + "step": 357 + }, + { + "epoch": 0.16287534121929026, + "grad_norm": 0.9122723647083647, + "learning_rate": 9.973986196623203e-06, + "loss": 0.0851, + "step": 358 + }, + { + "epoch": 0.16333030027297543, + "grad_norm": 0.8373653902978805, + "learning_rate": 9.973840384309121e-06, + "loss": 0.0865, + "step": 359 + }, + { + "epoch": 0.1637852593266606, + "grad_norm": 0.6360069343077157, + "learning_rate": 9.973694165555264e-06, + "loss": 0.0618, + "step": 360 + }, + { + "epoch": 0.16424021838034578, + "grad_norm": 0.7967304456611868, + "learning_rate": 9.973547540373582e-06, + "loss": 0.0865, + "step": 361 + }, + { + "epoch": 0.16469517743403095, + "grad_norm": 1.1699452577832765, + "learning_rate": 9.973400508776054e-06, + "loss": 0.1144, + "step": 362 + }, + { + "epoch": 0.1651501364877161, + "grad_norm": 0.6282867599706373, + "learning_rate": 9.973253070774698e-06, + "loss": 0.0633, + "step": 363 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 0.79942272506218, + "learning_rate": 9.973105226381559e-06, + "loss": 0.069, + "step": 364 + }, + { + "epoch": 0.16606005459508644, + "grad_norm": 0.9348674828410355, + "learning_rate": 9.972956975608719e-06, + "loss": 0.1019, + "step": 365 + }, + { + "epoch": 0.1665150136487716, + "grad_norm": 1.0942665884463076, + "learning_rate": 9.972808318468292e-06, + "loss": 0.0859, + "step": 366 + }, + { + "epoch": 0.16696997270245678, + "grad_norm": 0.6283579225277517, + "learning_rate": 9.972659254972426e-06, + "loss": 0.0589, + "step": 367 + }, + { + "epoch": 0.16742493175614195, + "grad_norm": 1.0989677054167046, + "learning_rate": 9.972509785133304e-06, + "loss": 0.1081, + "step": 368 + }, + { + "epoch": 0.16787989080982713, + "grad_norm": 0.7310198219540203, + "learning_rate": 9.972359908963137e-06, + "loss": 0.0675, + "step": 369 + }, + { + "epoch": 0.16833484986351227, + "grad_norm": 0.757671629194488, + "learning_rate": 9.972209626474172e-06, + "loss": 0.0734, + "step": 370 + }, + { + "epoch": 0.16878980891719744, + "grad_norm": 0.7966175159886519, + "learning_rate": 9.972058937678692e-06, + "loss": 0.075, + "step": 371 + }, + { + "epoch": 0.16924476797088261, + "grad_norm": 0.9805514159267839, + "learning_rate": 9.97190784258901e-06, + "loss": 0.1071, + "step": 372 + }, + { + "epoch": 0.1696997270245678, + "grad_norm": 0.7000612574442994, + "learning_rate": 9.971756341217471e-06, + "loss": 0.0526, + "step": 373 + }, + { + "epoch": 0.17015468607825296, + "grad_norm": 0.7917466702374949, + "learning_rate": 9.971604433576456e-06, + "loss": 0.0698, + "step": 374 + }, + { + "epoch": 0.17060964513193813, + "grad_norm": 0.8412692631182211, + "learning_rate": 9.97145211967838e-06, + "loss": 0.0783, + "step": 375 + }, + { + "epoch": 0.1710646041856233, + "grad_norm": 0.5615038895232536, + "learning_rate": 9.971299399535685e-06, + "loss": 0.053, + "step": 376 + }, + { + "epoch": 0.17151956323930848, + "grad_norm": 0.6849745369298482, + "learning_rate": 9.971146273160854e-06, + "loss": 0.0774, + "step": 377 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 0.6466596777060115, + "learning_rate": 9.9709927405664e-06, + "loss": 0.0606, + "step": 378 + }, + { + "epoch": 0.1724294813466788, + "grad_norm": 0.7169884074840761, + "learning_rate": 9.970838801764866e-06, + "loss": 0.0839, + "step": 379 + }, + { + "epoch": 0.17288444040036396, + "grad_norm": 0.9393396355410675, + "learning_rate": 9.970684456768836e-06, + "loss": 0.1132, + "step": 380 + }, + { + "epoch": 0.17333939945404914, + "grad_norm": 12.197098173453568, + "learning_rate": 9.970529705590918e-06, + "loss": 0.4858, + "step": 381 + }, + { + "epoch": 0.1737943585077343, + "grad_norm": 0.7355841274771772, + "learning_rate": 9.97037454824376e-06, + "loss": 0.0714, + "step": 382 + }, + { + "epoch": 0.17424931756141948, + "grad_norm": 1.050385265783733, + "learning_rate": 9.97021898474004e-06, + "loss": 0.1024, + "step": 383 + }, + { + "epoch": 0.17470427661510465, + "grad_norm": 0.8612087678995594, + "learning_rate": 9.970063015092469e-06, + "loss": 0.085, + "step": 384 + }, + { + "epoch": 0.1751592356687898, + "grad_norm": 1.3886472100476919, + "learning_rate": 9.969906639313793e-06, + "loss": 0.1212, + "step": 385 + }, + { + "epoch": 0.17561419472247497, + "grad_norm": 0.8238176964814595, + "learning_rate": 9.96974985741679e-06, + "loss": 0.0721, + "step": 386 + }, + { + "epoch": 0.17606915377616014, + "grad_norm": 0.8718897735731601, + "learning_rate": 9.969592669414272e-06, + "loss": 0.0959, + "step": 387 + }, + { + "epoch": 0.17652411282984531, + "grad_norm": 6.796752422837202, + "learning_rate": 9.969435075319083e-06, + "loss": 0.115, + "step": 388 + }, + { + "epoch": 0.1769790718835305, + "grad_norm": 0.58176536820322, + "learning_rate": 9.969277075144104e-06, + "loss": 0.0459, + "step": 389 + }, + { + "epoch": 0.17743403093721566, + "grad_norm": 0.7267253435076165, + "learning_rate": 9.969118668902242e-06, + "loss": 0.07, + "step": 390 + }, + { + "epoch": 0.17788898999090083, + "grad_norm": 0.7682389367523258, + "learning_rate": 9.968959856606442e-06, + "loss": 0.0542, + "step": 391 + }, + { + "epoch": 0.17834394904458598, + "grad_norm": 0.7873348185837048, + "learning_rate": 9.968800638269682e-06, + "loss": 0.0598, + "step": 392 + }, + { + "epoch": 0.17879890809827115, + "grad_norm": 1.287713292390112, + "learning_rate": 9.968641013904974e-06, + "loss": 0.1442, + "step": 393 + }, + { + "epoch": 0.17925386715195632, + "grad_norm": 1.085650814952146, + "learning_rate": 9.968480983525359e-06, + "loss": 0.0926, + "step": 394 + }, + { + "epoch": 0.1797088262056415, + "grad_norm": 0.6716676596759695, + "learning_rate": 9.968320547143918e-06, + "loss": 0.0767, + "step": 395 + }, + { + "epoch": 0.18016378525932666, + "grad_norm": 0.8467396807693714, + "learning_rate": 9.968159704773757e-06, + "loss": 0.0977, + "step": 396 + }, + { + "epoch": 0.18061874431301184, + "grad_norm": 0.6438855833782786, + "learning_rate": 9.967998456428021e-06, + "loss": 0.0586, + "step": 397 + }, + { + "epoch": 0.181073703366697, + "grad_norm": 0.7254140122399564, + "learning_rate": 9.967836802119886e-06, + "loss": 0.06, + "step": 398 + }, + { + "epoch": 0.18152866242038215, + "grad_norm": 0.87517545358881, + "learning_rate": 9.967674741862563e-06, + "loss": 0.1016, + "step": 399 + }, + { + "epoch": 0.18198362147406733, + "grad_norm": 1.0624206936058178, + "learning_rate": 9.967512275669294e-06, + "loss": 0.1296, + "step": 400 + }, + { + "epoch": 0.1824385805277525, + "grad_norm": 1.0284720738314184, + "learning_rate": 9.967349403553353e-06, + "loss": 0.0862, + "step": 401 + }, + { + "epoch": 0.18289353958143767, + "grad_norm": 0.8342932737384292, + "learning_rate": 9.967186125528053e-06, + "loss": 0.0873, + "step": 402 + }, + { + "epoch": 0.18334849863512284, + "grad_norm": 1.543095569701571, + "learning_rate": 9.967022441606734e-06, + "loss": 0.1209, + "step": 403 + }, + { + "epoch": 0.18380345768880801, + "grad_norm": 0.70731586616612, + "learning_rate": 9.966858351802773e-06, + "loss": 0.0726, + "step": 404 + }, + { + "epoch": 0.1842584167424932, + "grad_norm": 0.6660531988680356, + "learning_rate": 9.966693856129576e-06, + "loss": 0.0562, + "step": 405 + }, + { + "epoch": 0.18471337579617833, + "grad_norm": 0.8503640969928286, + "learning_rate": 9.966528954600587e-06, + "loss": 0.0838, + "step": 406 + }, + { + "epoch": 0.1851683348498635, + "grad_norm": 0.6021534124846688, + "learning_rate": 9.96636364722928e-06, + "loss": 0.0673, + "step": 407 + }, + { + "epoch": 0.18562329390354868, + "grad_norm": 0.8782816795828058, + "learning_rate": 9.966197934029165e-06, + "loss": 0.0845, + "step": 408 + }, + { + "epoch": 0.18607825295723385, + "grad_norm": 0.9030990654346936, + "learning_rate": 9.966031815013781e-06, + "loss": 0.0839, + "step": 409 + }, + { + "epoch": 0.18653321201091902, + "grad_norm": 0.8567507299712805, + "learning_rate": 9.965865290196703e-06, + "loss": 0.0935, + "step": 410 + }, + { + "epoch": 0.1869881710646042, + "grad_norm": 0.8099856489670021, + "learning_rate": 9.96569835959154e-06, + "loss": 0.0747, + "step": 411 + }, + { + "epoch": 0.18744313011828936, + "grad_norm": 0.8938878675243255, + "learning_rate": 9.965531023211931e-06, + "loss": 0.0854, + "step": 412 + }, + { + "epoch": 0.18789808917197454, + "grad_norm": 0.735313860104022, + "learning_rate": 9.965363281071551e-06, + "loss": 0.0865, + "step": 413 + }, + { + "epoch": 0.18835304822565968, + "grad_norm": 0.5495229598132649, + "learning_rate": 9.965195133184108e-06, + "loss": 0.0403, + "step": 414 + }, + { + "epoch": 0.18880800727934485, + "grad_norm": 1.0700416713113117, + "learning_rate": 9.965026579563342e-06, + "loss": 0.1086, + "step": 415 + }, + { + "epoch": 0.18926296633303002, + "grad_norm": 0.7118653717355078, + "learning_rate": 9.964857620223024e-06, + "loss": 0.0691, + "step": 416 + }, + { + "epoch": 0.1897179253867152, + "grad_norm": 0.6871481686027417, + "learning_rate": 9.964688255176963e-06, + "loss": 0.0667, + "step": 417 + }, + { + "epoch": 0.19017288444040037, + "grad_norm": 0.9848841869658392, + "learning_rate": 9.964518484438998e-06, + "loss": 0.0813, + "step": 418 + }, + { + "epoch": 0.19062784349408554, + "grad_norm": 0.6311750922074311, + "learning_rate": 9.964348308023001e-06, + "loss": 0.0592, + "step": 419 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 0.7813168734245782, + "learning_rate": 9.964177725942881e-06, + "loss": 0.0826, + "step": 420 + }, + { + "epoch": 0.19153776160145586, + "grad_norm": 0.8572110622332836, + "learning_rate": 9.964006738212574e-06, + "loss": 0.0853, + "step": 421 + }, + { + "epoch": 0.19199272065514103, + "grad_norm": 0.5304433423014596, + "learning_rate": 9.963835344846056e-06, + "loss": 0.048, + "step": 422 + }, + { + "epoch": 0.1924476797088262, + "grad_norm": 0.7598521228122416, + "learning_rate": 9.963663545857328e-06, + "loss": 0.0757, + "step": 423 + }, + { + "epoch": 0.19290263876251137, + "grad_norm": 1.1542546683489703, + "learning_rate": 9.963491341260432e-06, + "loss": 0.104, + "step": 424 + }, + { + "epoch": 0.19335759781619655, + "grad_norm": 0.7766563582253432, + "learning_rate": 9.963318731069437e-06, + "loss": 0.0952, + "step": 425 + }, + { + "epoch": 0.19381255686988172, + "grad_norm": 1.1319194983916299, + "learning_rate": 9.96314571529845e-06, + "loss": 0.1005, + "step": 426 + }, + { + "epoch": 0.1942675159235669, + "grad_norm": 0.7230559135257585, + "learning_rate": 9.962972293961608e-06, + "loss": 0.0647, + "step": 427 + }, + { + "epoch": 0.19472247497725204, + "grad_norm": 0.9863934566369588, + "learning_rate": 9.962798467073083e-06, + "loss": 0.0763, + "step": 428 + }, + { + "epoch": 0.1951774340309372, + "grad_norm": 0.8259784410005646, + "learning_rate": 9.96262423464708e-06, + "loss": 0.087, + "step": 429 + }, + { + "epoch": 0.19563239308462238, + "grad_norm": 0.7987139095182185, + "learning_rate": 9.962449596697834e-06, + "loss": 0.0671, + "step": 430 + }, + { + "epoch": 0.19608735213830755, + "grad_norm": 1.130208173229934, + "learning_rate": 9.962274553239619e-06, + "loss": 0.119, + "step": 431 + }, + { + "epoch": 0.19654231119199272, + "grad_norm": 0.7399696243677417, + "learning_rate": 9.962099104286735e-06, + "loss": 0.064, + "step": 432 + }, + { + "epoch": 0.1969972702456779, + "grad_norm": 1.156015767405528, + "learning_rate": 9.961923249853523e-06, + "loss": 0.1102, + "step": 433 + }, + { + "epoch": 0.19745222929936307, + "grad_norm": 0.972422739757894, + "learning_rate": 9.961746989954349e-06, + "loss": 0.1093, + "step": 434 + }, + { + "epoch": 0.1979071883530482, + "grad_norm": 0.7766700420403171, + "learning_rate": 9.96157032460362e-06, + "loss": 0.0655, + "step": 435 + }, + { + "epoch": 0.19836214740673339, + "grad_norm": 0.7460679115751414, + "learning_rate": 9.961393253815767e-06, + "loss": 0.0751, + "step": 436 + }, + { + "epoch": 0.19881710646041856, + "grad_norm": 1.0684214450487566, + "learning_rate": 9.961215777605266e-06, + "loss": 0.0789, + "step": 437 + }, + { + "epoch": 0.19927206551410373, + "grad_norm": 0.7683994291392229, + "learning_rate": 9.961037895986615e-06, + "loss": 0.0849, + "step": 438 + }, + { + "epoch": 0.1997270245677889, + "grad_norm": 0.7270368453251704, + "learning_rate": 9.960859608974352e-06, + "loss": 0.0779, + "step": 439 + }, + { + "epoch": 0.20018198362147407, + "grad_norm": 0.701460207303568, + "learning_rate": 9.960680916583042e-06, + "loss": 0.0639, + "step": 440 + }, + { + "epoch": 0.20063694267515925, + "grad_norm": 0.6784619280926262, + "learning_rate": 9.960501818827292e-06, + "loss": 0.077, + "step": 441 + }, + { + "epoch": 0.2010919017288444, + "grad_norm": 0.8064075868568972, + "learning_rate": 9.960322315721735e-06, + "loss": 0.0827, + "step": 442 + }, + { + "epoch": 0.20154686078252956, + "grad_norm": 0.9155026735417204, + "learning_rate": 9.960142407281039e-06, + "loss": 0.0841, + "step": 443 + }, + { + "epoch": 0.20200181983621474, + "grad_norm": 0.6167749294869733, + "learning_rate": 9.959962093519904e-06, + "loss": 0.054, + "step": 444 + }, + { + "epoch": 0.2024567788898999, + "grad_norm": 0.8127781985331358, + "learning_rate": 9.959781374453066e-06, + "loss": 0.0751, + "step": 445 + }, + { + "epoch": 0.20291173794358508, + "grad_norm": 0.98306444688532, + "learning_rate": 9.959600250095294e-06, + "loss": 0.075, + "step": 446 + }, + { + "epoch": 0.20336669699727025, + "grad_norm": 0.7982130269360888, + "learning_rate": 9.959418720461384e-06, + "loss": 0.0834, + "step": 447 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 0.7862225023823932, + "learning_rate": 9.959236785566175e-06, + "loss": 0.0704, + "step": 448 + }, + { + "epoch": 0.20427661510464057, + "grad_norm": 0.562107514296544, + "learning_rate": 9.959054445424532e-06, + "loss": 0.0644, + "step": 449 + }, + { + "epoch": 0.20473157415832574, + "grad_norm": 0.6089607791855781, + "learning_rate": 9.958871700051353e-06, + "loss": 0.0512, + "step": 450 + }, + { + "epoch": 0.2051865332120109, + "grad_norm": 0.6962095067981563, + "learning_rate": 9.958688549461573e-06, + "loss": 0.0712, + "step": 451 + }, + { + "epoch": 0.20564149226569609, + "grad_norm": 1.155217046291275, + "learning_rate": 9.958504993670158e-06, + "loss": 0.1049, + "step": 452 + }, + { + "epoch": 0.20609645131938126, + "grad_norm": 1.0913314226134752, + "learning_rate": 9.958321032692107e-06, + "loss": 0.1226, + "step": 453 + }, + { + "epoch": 0.20655141037306643, + "grad_norm": 22.735025633907238, + "learning_rate": 9.958136666542455e-06, + "loss": 0.8419, + "step": 454 + }, + { + "epoch": 0.2070063694267516, + "grad_norm": 1.184019553325164, + "learning_rate": 9.957951895236262e-06, + "loss": 0.1113, + "step": 455 + }, + { + "epoch": 0.20746132848043677, + "grad_norm": 0.7664792046331882, + "learning_rate": 9.957766718788632e-06, + "loss": 0.104, + "step": 456 + }, + { + "epoch": 0.20791628753412192, + "grad_norm": 0.8672883026786035, + "learning_rate": 9.957581137214695e-06, + "loss": 0.074, + "step": 457 + }, + { + "epoch": 0.2083712465878071, + "grad_norm": 0.8772220264781722, + "learning_rate": 9.957395150529615e-06, + "loss": 0.0986, + "step": 458 + }, + { + "epoch": 0.20882620564149226, + "grad_norm": 0.7016331971826193, + "learning_rate": 9.95720875874859e-06, + "loss": 0.0752, + "step": 459 + }, + { + "epoch": 0.20928116469517744, + "grad_norm": 0.6308822051977305, + "learning_rate": 9.957021961886855e-06, + "loss": 0.0608, + "step": 460 + }, + { + "epoch": 0.2097361237488626, + "grad_norm": 0.9803601042372939, + "learning_rate": 9.956834759959669e-06, + "loss": 0.0908, + "step": 461 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 0.7674462109758159, + "learning_rate": 9.95664715298233e-06, + "loss": 0.074, + "step": 462 + }, + { + "epoch": 0.21064604185623295, + "grad_norm": 0.7450186566335193, + "learning_rate": 9.95645914097017e-06, + "loss": 0.0817, + "step": 463 + }, + { + "epoch": 0.2111010009099181, + "grad_norm": 0.7225723661612439, + "learning_rate": 9.956270723938553e-06, + "loss": 0.0849, + "step": 464 + }, + { + "epoch": 0.21155595996360327, + "grad_norm": 0.7190355211871646, + "learning_rate": 9.956081901902875e-06, + "loss": 0.0748, + "step": 465 + }, + { + "epoch": 0.21201091901728844, + "grad_norm": 1.210684562087392, + "learning_rate": 9.955892674878565e-06, + "loss": 0.1272, + "step": 466 + }, + { + "epoch": 0.2124658780709736, + "grad_norm": 0.834170476650907, + "learning_rate": 9.955703042881087e-06, + "loss": 0.0992, + "step": 467 + }, + { + "epoch": 0.21292083712465878, + "grad_norm": 0.874478173291907, + "learning_rate": 9.955513005925934e-06, + "loss": 0.0858, + "step": 468 + }, + { + "epoch": 0.21337579617834396, + "grad_norm": 0.5510320150423565, + "learning_rate": 9.95532256402864e-06, + "loss": 0.0574, + "step": 469 + }, + { + "epoch": 0.21383075523202913, + "grad_norm": 0.5657171871822584, + "learning_rate": 9.955131717204762e-06, + "loss": 0.0671, + "step": 470 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.7564664653864259, + "learning_rate": 9.954940465469898e-06, + "loss": 0.085, + "step": 471 + }, + { + "epoch": 0.21474067333939945, + "grad_norm": 0.7594501005901694, + "learning_rate": 9.954748808839675e-06, + "loss": 0.0733, + "step": 472 + }, + { + "epoch": 0.21519563239308462, + "grad_norm": 0.6748092428366178, + "learning_rate": 9.954556747329754e-06, + "loss": 0.0707, + "step": 473 + }, + { + "epoch": 0.2156505914467698, + "grad_norm": 1.715089789819449, + "learning_rate": 9.954364280955832e-06, + "loss": 0.1045, + "step": 474 + }, + { + "epoch": 0.21610555050045496, + "grad_norm": 0.6668751648778155, + "learning_rate": 9.954171409733634e-06, + "loss": 0.0573, + "step": 475 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 0.5963716475430643, + "learning_rate": 9.95397813367892e-06, + "loss": 0.0752, + "step": 476 + }, + { + "epoch": 0.2170154686078253, + "grad_norm": 0.9917190233932158, + "learning_rate": 9.953784452807487e-06, + "loss": 0.1049, + "step": 477 + }, + { + "epoch": 0.21747042766151045, + "grad_norm": 0.5638529401686616, + "learning_rate": 9.953590367135159e-06, + "loss": 0.0547, + "step": 478 + }, + { + "epoch": 0.21792538671519562, + "grad_norm": 0.6477110515460727, + "learning_rate": 9.953395876677796e-06, + "loss": 0.0564, + "step": 479 + }, + { + "epoch": 0.2183803457688808, + "grad_norm": 0.5492055118574499, + "learning_rate": 9.95320098145129e-06, + "loss": 0.0505, + "step": 480 + }, + { + "epoch": 0.21883530482256597, + "grad_norm": 0.8954528378372288, + "learning_rate": 9.95300568147157e-06, + "loss": 0.126, + "step": 481 + }, + { + "epoch": 0.21929026387625114, + "grad_norm": 0.6155736143826033, + "learning_rate": 9.952809976754593e-06, + "loss": 0.0518, + "step": 482 + }, + { + "epoch": 0.2197452229299363, + "grad_norm": 1.1486004986445648, + "learning_rate": 9.952613867316351e-06, + "loss": 0.1142, + "step": 483 + }, + { + "epoch": 0.22020018198362148, + "grad_norm": 0.8236924325360948, + "learning_rate": 9.95241735317287e-06, + "loss": 0.1047, + "step": 484 + }, + { + "epoch": 0.22065514103730663, + "grad_norm": 0.832372102653505, + "learning_rate": 9.952220434340209e-06, + "loss": 0.0729, + "step": 485 + }, + { + "epoch": 0.2211101000909918, + "grad_norm": 0.7288716722109786, + "learning_rate": 9.952023110834456e-06, + "loss": 0.068, + "step": 486 + }, + { + "epoch": 0.22156505914467697, + "grad_norm": 0.5327254294033283, + "learning_rate": 9.951825382671739e-06, + "loss": 0.0614, + "step": 487 + }, + { + "epoch": 0.22202001819836215, + "grad_norm": 0.7204991379763186, + "learning_rate": 9.951627249868213e-06, + "loss": 0.0666, + "step": 488 + }, + { + "epoch": 0.22247497725204732, + "grad_norm": 0.7485835393026234, + "learning_rate": 9.95142871244007e-06, + "loss": 0.068, + "step": 489 + }, + { + "epoch": 0.2229299363057325, + "grad_norm": 0.45602532896445397, + "learning_rate": 9.951229770403531e-06, + "loss": 0.0414, + "step": 490 + }, + { + "epoch": 0.22338489535941766, + "grad_norm": 0.7240661348572547, + "learning_rate": 9.951030423774858e-06, + "loss": 0.0798, + "step": 491 + }, + { + "epoch": 0.22383985441310283, + "grad_norm": 0.7716352477687572, + "learning_rate": 9.950830672570337e-06, + "loss": 0.071, + "step": 492 + }, + { + "epoch": 0.22429481346678798, + "grad_norm": 1.22677184750836, + "learning_rate": 9.95063051680629e-06, + "loss": 0.1373, + "step": 493 + }, + { + "epoch": 0.22474977252047315, + "grad_norm": 0.7365431233953595, + "learning_rate": 9.950429956499074e-06, + "loss": 0.0699, + "step": 494 + }, + { + "epoch": 0.22520473157415832, + "grad_norm": 0.705654951368504, + "learning_rate": 9.950228991665078e-06, + "loss": 0.0741, + "step": 495 + }, + { + "epoch": 0.2256596906278435, + "grad_norm": 0.8261497906057415, + "learning_rate": 9.950027622320724e-06, + "loss": 0.0764, + "step": 496 + }, + { + "epoch": 0.22611464968152867, + "grad_norm": 0.9965395262255518, + "learning_rate": 9.949825848482465e-06, + "loss": 0.0852, + "step": 497 + }, + { + "epoch": 0.22656960873521384, + "grad_norm": 0.6807161957389707, + "learning_rate": 9.949623670166794e-06, + "loss": 0.074, + "step": 498 + }, + { + "epoch": 0.227024567788899, + "grad_norm": 1.1216390709095547, + "learning_rate": 9.949421087390228e-06, + "loss": 0.0931, + "step": 499 + }, + { + "epoch": 0.22747952684258416, + "grad_norm": 1.1278655216416786, + "learning_rate": 9.949218100169322e-06, + "loss": 0.1177, + "step": 500 + }, + { + "epoch": 0.22793448589626933, + "grad_norm": 0.9160591457448575, + "learning_rate": 9.949014708520664e-06, + "loss": 0.1015, + "step": 501 + }, + { + "epoch": 0.2283894449499545, + "grad_norm": 0.9377363057118697, + "learning_rate": 9.948810912460872e-06, + "loss": 0.1059, + "step": 502 + }, + { + "epoch": 0.22884440400363967, + "grad_norm": 0.8760932101779023, + "learning_rate": 9.948606712006601e-06, + "loss": 0.0812, + "step": 503 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 0.6962605051289937, + "learning_rate": 9.948402107174537e-06, + "loss": 0.0735, + "step": 504 + }, + { + "epoch": 0.22975432211101002, + "grad_norm": 0.6501265713488487, + "learning_rate": 9.948197097981401e-06, + "loss": 0.0551, + "step": 505 + }, + { + "epoch": 0.2302092811646952, + "grad_norm": 1.2156011775652311, + "learning_rate": 9.947991684443942e-06, + "loss": 0.1066, + "step": 506 + }, + { + "epoch": 0.23066424021838033, + "grad_norm": 0.9679794435610901, + "learning_rate": 9.947785866578951e-06, + "loss": 0.0981, + "step": 507 + }, + { + "epoch": 0.2311191992720655, + "grad_norm": 0.7195724631231237, + "learning_rate": 9.94757964440324e-06, + "loss": 0.0777, + "step": 508 + }, + { + "epoch": 0.23157415832575068, + "grad_norm": 0.549427502610929, + "learning_rate": 9.947373017933665e-06, + "loss": 0.0516, + "step": 509 + }, + { + "epoch": 0.23202911737943585, + "grad_norm": 0.5667212336170355, + "learning_rate": 9.947165987187108e-06, + "loss": 0.0583, + "step": 510 + }, + { + "epoch": 0.23248407643312102, + "grad_norm": 0.6638127935874616, + "learning_rate": 9.946958552180489e-06, + "loss": 0.0723, + "step": 511 + }, + { + "epoch": 0.2329390354868062, + "grad_norm": 0.5226768129517959, + "learning_rate": 9.946750712930756e-06, + "loss": 0.0482, + "step": 512 + }, + { + "epoch": 0.23339399454049137, + "grad_norm": 0.8358986518129136, + "learning_rate": 9.946542469454894e-06, + "loss": 0.1037, + "step": 513 + }, + { + "epoch": 0.2338489535941765, + "grad_norm": 0.6695809647699968, + "learning_rate": 9.94633382176992e-06, + "loss": 0.0728, + "step": 514 + }, + { + "epoch": 0.23430391264786168, + "grad_norm": 1.0608546974350634, + "learning_rate": 9.946124769892884e-06, + "loss": 0.1192, + "step": 515 + }, + { + "epoch": 0.23475887170154686, + "grad_norm": 0.5090717025630993, + "learning_rate": 9.945915313840869e-06, + "loss": 0.0612, + "step": 516 + }, + { + "epoch": 0.23521383075523203, + "grad_norm": 0.8105130307542814, + "learning_rate": 9.94570545363099e-06, + "loss": 0.0838, + "step": 517 + }, + { + "epoch": 0.2356687898089172, + "grad_norm": 0.7752986876049957, + "learning_rate": 9.945495189280394e-06, + "loss": 0.092, + "step": 518 + }, + { + "epoch": 0.23612374886260237, + "grad_norm": 0.869801315379322, + "learning_rate": 9.945284520806267e-06, + "loss": 0.077, + "step": 519 + }, + { + "epoch": 0.23657870791628755, + "grad_norm": 0.5427153243822386, + "learning_rate": 9.94507344822582e-06, + "loss": 0.0592, + "step": 520 + }, + { + "epoch": 0.2370336669699727, + "grad_norm": 0.7368670007832758, + "learning_rate": 9.944861971556305e-06, + "loss": 0.0608, + "step": 521 + }, + { + "epoch": 0.23748862602365786, + "grad_norm": 0.8141430793460733, + "learning_rate": 9.944650090814998e-06, + "loss": 0.0616, + "step": 522 + }, + { + "epoch": 0.23794358507734303, + "grad_norm": 2.1096588720516425, + "learning_rate": 9.944437806019216e-06, + "loss": 0.0938, + "step": 523 + }, + { + "epoch": 0.2383985441310282, + "grad_norm": 0.7014907085161215, + "learning_rate": 9.944225117186306e-06, + "loss": 0.0812, + "step": 524 + }, + { + "epoch": 0.23885350318471338, + "grad_norm": 0.5078467158211916, + "learning_rate": 9.944012024333647e-06, + "loss": 0.0561, + "step": 525 + }, + { + "epoch": 0.23930846223839855, + "grad_norm": 0.6379031604907951, + "learning_rate": 9.943798527478652e-06, + "loss": 0.0678, + "step": 526 + }, + { + "epoch": 0.23976342129208372, + "grad_norm": 0.799876019099874, + "learning_rate": 9.943584626638768e-06, + "loss": 0.0914, + "step": 527 + }, + { + "epoch": 0.24021838034576887, + "grad_norm": 0.6550229607349646, + "learning_rate": 9.943370321831474e-06, + "loss": 0.0668, + "step": 528 + }, + { + "epoch": 0.24067333939945404, + "grad_norm": 0.767534839542607, + "learning_rate": 9.943155613074279e-06, + "loss": 0.0711, + "step": 529 + }, + { + "epoch": 0.2411282984531392, + "grad_norm": 0.7571838990000624, + "learning_rate": 9.942940500384733e-06, + "loss": 0.0893, + "step": 530 + }, + { + "epoch": 0.24158325750682438, + "grad_norm": 17.807000846945513, + "learning_rate": 9.942724983780409e-06, + "loss": 0.3419, + "step": 531 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 1.2088422410181228, + "learning_rate": 9.942509063278922e-06, + "loss": 0.1173, + "step": 532 + }, + { + "epoch": 0.24249317561419473, + "grad_norm": 0.8811842157145667, + "learning_rate": 9.942292738897914e-06, + "loss": 0.1006, + "step": 533 + }, + { + "epoch": 0.2429481346678799, + "grad_norm": 0.7726281786442553, + "learning_rate": 9.942076010655063e-06, + "loss": 0.0909, + "step": 534 + }, + { + "epoch": 0.24340309372156507, + "grad_norm": 0.9942256398778268, + "learning_rate": 9.941858878568078e-06, + "loss": 0.134, + "step": 535 + }, + { + "epoch": 0.24385805277525022, + "grad_norm": 1.001596627292525, + "learning_rate": 9.941641342654702e-06, + "loss": 0.0977, + "step": 536 + }, + { + "epoch": 0.2443130118289354, + "grad_norm": 0.5064863363900076, + "learning_rate": 9.941423402932713e-06, + "loss": 0.0559, + "step": 537 + }, + { + "epoch": 0.24476797088262056, + "grad_norm": 0.8589680374278897, + "learning_rate": 9.94120505941992e-06, + "loss": 0.0992, + "step": 538 + }, + { + "epoch": 0.24522292993630573, + "grad_norm": 0.7830880681851201, + "learning_rate": 9.940986312134162e-06, + "loss": 0.0825, + "step": 539 + }, + { + "epoch": 0.2456778889899909, + "grad_norm": 0.5778344550660577, + "learning_rate": 9.940767161093316e-06, + "loss": 0.0637, + "step": 540 + }, + { + "epoch": 0.24613284804367608, + "grad_norm": 0.8661775200374767, + "learning_rate": 9.94054760631529e-06, + "loss": 0.0958, + "step": 541 + }, + { + "epoch": 0.24658780709736125, + "grad_norm": 0.6976226834296251, + "learning_rate": 9.940327647818026e-06, + "loss": 0.0752, + "step": 542 + }, + { + "epoch": 0.2470427661510464, + "grad_norm": 0.7530160135685138, + "learning_rate": 9.940107285619495e-06, + "loss": 0.077, + "step": 543 + }, + { + "epoch": 0.24749772520473157, + "grad_norm": 0.7997106896354084, + "learning_rate": 9.939886519737707e-06, + "loss": 0.0958, + "step": 544 + }, + { + "epoch": 0.24795268425841674, + "grad_norm": 0.8918061918047896, + "learning_rate": 9.939665350190702e-06, + "loss": 0.0822, + "step": 545 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 0.804115756264787, + "learning_rate": 9.93944377699655e-06, + "loss": 0.0915, + "step": 546 + }, + { + "epoch": 0.24886260236578708, + "grad_norm": 0.6234057941022288, + "learning_rate": 9.93922180017336e-06, + "loss": 0.0672, + "step": 547 + }, + { + "epoch": 0.24931756141947226, + "grad_norm": 0.8269450754551354, + "learning_rate": 9.93899941973927e-06, + "loss": 0.1102, + "step": 548 + }, + { + "epoch": 0.24977252047315743, + "grad_norm": 0.9233841316663005, + "learning_rate": 9.93877663571245e-06, + "loss": 0.0963, + "step": 549 + }, + { + "epoch": 0.2502274795268426, + "grad_norm": 0.9944861568923805, + "learning_rate": 9.938553448111108e-06, + "loss": 0.1127, + "step": 550 + }, + { + "epoch": 0.25068243858052774, + "grad_norm": 0.8423641298780182, + "learning_rate": 9.938329856953482e-06, + "loss": 0.0788, + "step": 551 + }, + { + "epoch": 0.25113739763421294, + "grad_norm": 0.8124861649110975, + "learning_rate": 9.938105862257839e-06, + "loss": 0.0831, + "step": 552 + }, + { + "epoch": 0.2515923566878981, + "grad_norm": 0.6612222253979325, + "learning_rate": 9.937881464042485e-06, + "loss": 0.0703, + "step": 553 + }, + { + "epoch": 0.25204731574158323, + "grad_norm": 0.854447666921162, + "learning_rate": 9.937656662325759e-06, + "loss": 0.1074, + "step": 554 + }, + { + "epoch": 0.25250227479526843, + "grad_norm": 0.74521770368624, + "learning_rate": 9.937431457126028e-06, + "loss": 0.0777, + "step": 555 + }, + { + "epoch": 0.2529572338489536, + "grad_norm": 0.5044600553216889, + "learning_rate": 9.937205848461694e-06, + "loss": 0.0482, + "step": 556 + }, + { + "epoch": 0.2534121929026388, + "grad_norm": 1.0949051966397356, + "learning_rate": 9.936979836351197e-06, + "loss": 0.0945, + "step": 557 + }, + { + "epoch": 0.2538671519563239, + "grad_norm": 1.0332199252594778, + "learning_rate": 9.936753420813003e-06, + "loss": 0.092, + "step": 558 + }, + { + "epoch": 0.2543221110100091, + "grad_norm": 0.7029577630748303, + "learning_rate": 9.936526601865612e-06, + "loss": 0.0612, + "step": 559 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 0.5251640812064944, + "learning_rate": 9.936299379527561e-06, + "loss": 0.0569, + "step": 560 + }, + { + "epoch": 0.2552320291173794, + "grad_norm": 0.6689496924283664, + "learning_rate": 9.936071753817416e-06, + "loss": 0.0831, + "step": 561 + }, + { + "epoch": 0.2556869881710646, + "grad_norm": 0.8094390650978945, + "learning_rate": 9.935843724753778e-06, + "loss": 0.0897, + "step": 562 + }, + { + "epoch": 0.25614194722474976, + "grad_norm": 0.9168849457874456, + "learning_rate": 9.935615292355283e-06, + "loss": 0.1002, + "step": 563 + }, + { + "epoch": 0.25659690627843496, + "grad_norm": 0.8829987760246157, + "learning_rate": 9.935386456640593e-06, + "loss": 0.0997, + "step": 564 + }, + { + "epoch": 0.2570518653321201, + "grad_norm": 0.9381858557170412, + "learning_rate": 9.93515721762841e-06, + "loss": 0.0926, + "step": 565 + }, + { + "epoch": 0.2575068243858053, + "grad_norm": 0.6555630906162114, + "learning_rate": 9.934927575337469e-06, + "loss": 0.0805, + "step": 566 + }, + { + "epoch": 0.25796178343949044, + "grad_norm": 0.49897284031908906, + "learning_rate": 9.93469752978653e-06, + "loss": 0.0545, + "step": 567 + }, + { + "epoch": 0.2584167424931756, + "grad_norm": 0.8528689809178094, + "learning_rate": 9.934467080994394e-06, + "loss": 0.071, + "step": 568 + }, + { + "epoch": 0.2588717015468608, + "grad_norm": 0.7999188284583189, + "learning_rate": 9.934236228979893e-06, + "loss": 0.0675, + "step": 569 + }, + { + "epoch": 0.25932666060054593, + "grad_norm": 0.6603615540899209, + "learning_rate": 9.934004973761888e-06, + "loss": 0.0584, + "step": 570 + }, + { + "epoch": 0.25978161965423113, + "grad_norm": 0.907545218090885, + "learning_rate": 9.933773315359281e-06, + "loss": 0.0912, + "step": 571 + }, + { + "epoch": 0.2602365787079163, + "grad_norm": 1.2225854103436529, + "learning_rate": 9.933541253790998e-06, + "loss": 0.0996, + "step": 572 + }, + { + "epoch": 0.2606915377616015, + "grad_norm": 0.821182112953313, + "learning_rate": 9.933308789076004e-06, + "loss": 0.0886, + "step": 573 + }, + { + "epoch": 0.2611464968152866, + "grad_norm": 0.5608593716975471, + "learning_rate": 9.933075921233292e-06, + "loss": 0.0597, + "step": 574 + }, + { + "epoch": 0.26160145586897177, + "grad_norm": 0.977094581221023, + "learning_rate": 9.932842650281897e-06, + "loss": 0.0796, + "step": 575 + }, + { + "epoch": 0.26205641492265697, + "grad_norm": 1.0086738407073246, + "learning_rate": 9.932608976240875e-06, + "loss": 0.1245, + "step": 576 + }, + { + "epoch": 0.2625113739763421, + "grad_norm": 0.7841605184531412, + "learning_rate": 9.932374899129323e-06, + "loss": 0.0798, + "step": 577 + }, + { + "epoch": 0.2629663330300273, + "grad_norm": 0.6360279282536222, + "learning_rate": 9.932140418966369e-06, + "loss": 0.0714, + "step": 578 + }, + { + "epoch": 0.26342129208371245, + "grad_norm": 0.8673569892639119, + "learning_rate": 9.931905535771174e-06, + "loss": 0.0805, + "step": 579 + }, + { + "epoch": 0.26387625113739765, + "grad_norm": 1.0489822111787226, + "learning_rate": 9.93167024956293e-06, + "loss": 0.1046, + "step": 580 + }, + { + "epoch": 0.2643312101910828, + "grad_norm": 0.5670611684906575, + "learning_rate": 9.931434560360864e-06, + "loss": 0.0662, + "step": 581 + }, + { + "epoch": 0.26478616924476794, + "grad_norm": 0.6786486717931198, + "learning_rate": 9.931198468184236e-06, + "loss": 0.0705, + "step": 582 + }, + { + "epoch": 0.26524112829845314, + "grad_norm": 0.7580601459978998, + "learning_rate": 9.93096197305234e-06, + "loss": 0.0852, + "step": 583 + }, + { + "epoch": 0.2656960873521383, + "grad_norm": 0.8802141056853473, + "learning_rate": 9.930725074984498e-06, + "loss": 0.0989, + "step": 584 + }, + { + "epoch": 0.2661510464058235, + "grad_norm": 0.6365186853726369, + "learning_rate": 9.930487774000071e-06, + "loss": 0.0639, + "step": 585 + }, + { + "epoch": 0.26660600545950863, + "grad_norm": 0.5301331320559389, + "learning_rate": 9.930250070118448e-06, + "loss": 0.0628, + "step": 586 + }, + { + "epoch": 0.26706096451319383, + "grad_norm": 0.6982626314754508, + "learning_rate": 9.930011963359055e-06, + "loss": 0.071, + "step": 587 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 1.0151988128038116, + "learning_rate": 9.929773453741346e-06, + "loss": 0.1074, + "step": 588 + }, + { + "epoch": 0.2679708826205642, + "grad_norm": 0.809050548171497, + "learning_rate": 9.929534541284814e-06, + "loss": 0.0715, + "step": 589 + }, + { + "epoch": 0.2684258416742493, + "grad_norm": 0.8254901916718546, + "learning_rate": 9.929295226008981e-06, + "loss": 0.0867, + "step": 590 + }, + { + "epoch": 0.26888080072793447, + "grad_norm": 0.695875393623419, + "learning_rate": 9.929055507933403e-06, + "loss": 0.0667, + "step": 591 + }, + { + "epoch": 0.26933575978161967, + "grad_norm": 0.6569370607259161, + "learning_rate": 9.928815387077668e-06, + "loss": 0.0667, + "step": 592 + }, + { + "epoch": 0.2697907188353048, + "grad_norm": 0.8509989554819866, + "learning_rate": 9.9285748634614e-06, + "loss": 0.0964, + "step": 593 + }, + { + "epoch": 0.27024567788899, + "grad_norm": 0.7743154017799978, + "learning_rate": 9.928333937104249e-06, + "loss": 0.1008, + "step": 594 + }, + { + "epoch": 0.27070063694267515, + "grad_norm": 0.6810806452813069, + "learning_rate": 9.928092608025905e-06, + "loss": 0.0623, + "step": 595 + }, + { + "epoch": 0.27115559599636035, + "grad_norm": 0.6757764847225584, + "learning_rate": 9.927850876246087e-06, + "loss": 0.0621, + "step": 596 + }, + { + "epoch": 0.2716105550500455, + "grad_norm": 0.7561897396028232, + "learning_rate": 9.927608741784551e-06, + "loss": 0.0769, + "step": 597 + }, + { + "epoch": 0.27206551410373064, + "grad_norm": 0.9087608421567758, + "learning_rate": 9.927366204661081e-06, + "loss": 0.1064, + "step": 598 + }, + { + "epoch": 0.27252047315741584, + "grad_norm": 0.6090969825991095, + "learning_rate": 9.927123264895497e-06, + "loss": 0.0596, + "step": 599 + }, + { + "epoch": 0.272975432211101, + "grad_norm": 0.5838273869575724, + "learning_rate": 9.926879922507651e-06, + "loss": 0.0581, + "step": 600 + }, + { + "epoch": 0.2734303912647862, + "grad_norm": 41.16319851924577, + "learning_rate": 9.926636177517427e-06, + "loss": 0.7305, + "step": 601 + }, + { + "epoch": 0.27388535031847133, + "grad_norm": 0.7159907538362364, + "learning_rate": 9.926392029944743e-06, + "loss": 0.0655, + "step": 602 + }, + { + "epoch": 0.27434030937215653, + "grad_norm": 0.6649118967721417, + "learning_rate": 9.92614747980955e-06, + "loss": 0.0676, + "step": 603 + }, + { + "epoch": 0.2747952684258417, + "grad_norm": 0.6955588874689645, + "learning_rate": 9.92590252713183e-06, + "loss": 0.0691, + "step": 604 + }, + { + "epoch": 0.2752502274795268, + "grad_norm": 1.0093833512385355, + "learning_rate": 9.925657171931603e-06, + "loss": 0.0788, + "step": 605 + }, + { + "epoch": 0.275705186533212, + "grad_norm": 0.7222760734094591, + "learning_rate": 9.925411414228913e-06, + "loss": 0.0765, + "step": 606 + }, + { + "epoch": 0.27616014558689717, + "grad_norm": 0.7901083190949632, + "learning_rate": 9.925165254043846e-06, + "loss": 0.0899, + "step": 607 + }, + { + "epoch": 0.27661510464058237, + "grad_norm": 0.9417411536264935, + "learning_rate": 9.924918691396516e-06, + "loss": 0.105, + "step": 608 + }, + { + "epoch": 0.2770700636942675, + "grad_norm": 0.8531576003982281, + "learning_rate": 9.924671726307073e-06, + "loss": 0.0943, + "step": 609 + }, + { + "epoch": 0.2775250227479527, + "grad_norm": 0.5771833327707789, + "learning_rate": 9.924424358795694e-06, + "loss": 0.0649, + "step": 610 + }, + { + "epoch": 0.27797998180163785, + "grad_norm": 0.6804808150530418, + "learning_rate": 9.924176588882597e-06, + "loss": 0.0591, + "step": 611 + }, + { + "epoch": 0.278434940855323, + "grad_norm": 0.6916110773643345, + "learning_rate": 9.923928416588027e-06, + "loss": 0.082, + "step": 612 + }, + { + "epoch": 0.2788898999090082, + "grad_norm": 0.7302341341594485, + "learning_rate": 9.923679841932261e-06, + "loss": 0.0858, + "step": 613 + }, + { + "epoch": 0.27934485896269334, + "grad_norm": 0.7190514572276734, + "learning_rate": 9.923430864935615e-06, + "loss": 0.0658, + "step": 614 + }, + { + "epoch": 0.27979981801637854, + "grad_norm": 0.6872892360375661, + "learning_rate": 9.923181485618432e-06, + "loss": 0.0639, + "step": 615 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 0.6937876338258171, + "learning_rate": 9.92293170400109e-06, + "loss": 0.0759, + "step": 616 + }, + { + "epoch": 0.2807097361237489, + "grad_norm": 0.8498928251372749, + "learning_rate": 9.922681520104002e-06, + "loss": 0.0777, + "step": 617 + }, + { + "epoch": 0.28116469517743403, + "grad_norm": 0.7409609990217324, + "learning_rate": 9.922430933947612e-06, + "loss": 0.0665, + "step": 618 + }, + { + "epoch": 0.2816196542311192, + "grad_norm": 1.2216942184143182, + "learning_rate": 9.922179945552393e-06, + "loss": 0.1405, + "step": 619 + }, + { + "epoch": 0.2820746132848044, + "grad_norm": 0.6637234254274302, + "learning_rate": 9.921928554938857e-06, + "loss": 0.062, + "step": 620 + }, + { + "epoch": 0.2825295723384895, + "grad_norm": 0.9463087936758936, + "learning_rate": 9.921676762127548e-06, + "loss": 0.0767, + "step": 621 + }, + { + "epoch": 0.2829845313921747, + "grad_norm": 1.089309305809361, + "learning_rate": 9.921424567139042e-06, + "loss": 0.1171, + "step": 622 + }, + { + "epoch": 0.28343949044585987, + "grad_norm": 0.8752119302288704, + "learning_rate": 9.921171969993942e-06, + "loss": 0.0813, + "step": 623 + }, + { + "epoch": 0.28389444949954507, + "grad_norm": 0.7870883299373892, + "learning_rate": 9.920918970712894e-06, + "loss": 0.0993, + "step": 624 + }, + { + "epoch": 0.2843494085532302, + "grad_norm": 0.6504873266789636, + "learning_rate": 9.92066556931657e-06, + "loss": 0.073, + "step": 625 + }, + { + "epoch": 0.28480436760691535, + "grad_norm": 1.1098031698420505, + "learning_rate": 9.920411765825679e-06, + "loss": 0.1218, + "step": 626 + }, + { + "epoch": 0.28525932666060055, + "grad_norm": 1.217844501512982, + "learning_rate": 9.920157560260957e-06, + "loss": 0.1549, + "step": 627 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.9728161223416268, + "learning_rate": 9.919902952643179e-06, + "loss": 0.0984, + "step": 628 + }, + { + "epoch": 0.2861692447679709, + "grad_norm": 0.5217007184455262, + "learning_rate": 9.91964794299315e-06, + "loss": 0.0636, + "step": 629 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 1.7394407973312302, + "learning_rate": 9.919392531331706e-06, + "loss": 0.1686, + "step": 630 + }, + { + "epoch": 0.28707916287534124, + "grad_norm": 0.5702940927618096, + "learning_rate": 9.919136717679723e-06, + "loss": 0.0465, + "step": 631 + }, + { + "epoch": 0.2875341219290264, + "grad_norm": 0.5990973378462472, + "learning_rate": 9.9188805020581e-06, + "loss": 0.0678, + "step": 632 + }, + { + "epoch": 0.28798908098271153, + "grad_norm": 0.9343816967111115, + "learning_rate": 9.918623884487777e-06, + "loss": 0.1068, + "step": 633 + }, + { + "epoch": 0.28844404003639673, + "grad_norm": 0.5997939637509836, + "learning_rate": 9.91836686498972e-06, + "loss": 0.0629, + "step": 634 + }, + { + "epoch": 0.2888989990900819, + "grad_norm": 0.8063617612610782, + "learning_rate": 9.918109443584938e-06, + "loss": 0.0904, + "step": 635 + }, + { + "epoch": 0.2893539581437671, + "grad_norm": 0.6625405697250593, + "learning_rate": 9.917851620294461e-06, + "loss": 0.0638, + "step": 636 + }, + { + "epoch": 0.2898089171974522, + "grad_norm": 0.7423789779714624, + "learning_rate": 9.917593395139358e-06, + "loss": 0.0714, + "step": 637 + }, + { + "epoch": 0.2902638762511374, + "grad_norm": 0.6102576569607258, + "learning_rate": 9.91733476814073e-06, + "loss": 0.0563, + "step": 638 + }, + { + "epoch": 0.29071883530482256, + "grad_norm": 0.8342620452233175, + "learning_rate": 9.91707573931971e-06, + "loss": 0.0934, + "step": 639 + }, + { + "epoch": 0.2911737943585077, + "grad_norm": 0.6397583044633867, + "learning_rate": 9.916816308697468e-06, + "loss": 0.0608, + "step": 640 + }, + { + "epoch": 0.2916287534121929, + "grad_norm": 0.7837909798874247, + "learning_rate": 9.9165564762952e-06, + "loss": 0.0936, + "step": 641 + }, + { + "epoch": 0.29208371246587805, + "grad_norm": 0.9915309549496408, + "learning_rate": 9.916296242134142e-06, + "loss": 0.1364, + "step": 642 + }, + { + "epoch": 0.29253867151956325, + "grad_norm": 0.7722166587924495, + "learning_rate": 9.916035606235555e-06, + "loss": 0.1022, + "step": 643 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 0.6446192951972597, + "learning_rate": 9.915774568620739e-06, + "loss": 0.0794, + "step": 644 + }, + { + "epoch": 0.2934485896269336, + "grad_norm": 0.7655996282008942, + "learning_rate": 9.915513129311025e-06, + "loss": 0.083, + "step": 645 + }, + { + "epoch": 0.29390354868061874, + "grad_norm": 0.7358761993420325, + "learning_rate": 9.915251288327776e-06, + "loss": 0.0927, + "step": 646 + }, + { + "epoch": 0.2943585077343039, + "grad_norm": 0.8417441236168001, + "learning_rate": 9.914989045692388e-06, + "loss": 0.0791, + "step": 647 + }, + { + "epoch": 0.2948134667879891, + "grad_norm": 0.8847229450668922, + "learning_rate": 9.914726401426293e-06, + "loss": 0.1114, + "step": 648 + }, + { + "epoch": 0.29526842584167423, + "grad_norm": 0.6805089048669102, + "learning_rate": 9.91446335555095e-06, + "loss": 0.0645, + "step": 649 + }, + { + "epoch": 0.29572338489535943, + "grad_norm": 0.9967907781154212, + "learning_rate": 9.914199908087856e-06, + "loss": 0.1125, + "step": 650 + }, + { + "epoch": 0.2961783439490446, + "grad_norm": 0.7069764233646496, + "learning_rate": 9.913936059058537e-06, + "loss": 0.0961, + "step": 651 + }, + { + "epoch": 0.2966333030027298, + "grad_norm": 0.8237259808163154, + "learning_rate": 9.913671808484554e-06, + "loss": 0.0863, + "step": 652 + }, + { + "epoch": 0.2970882620564149, + "grad_norm": 0.5595221349609915, + "learning_rate": 9.913407156387503e-06, + "loss": 0.0477, + "step": 653 + }, + { + "epoch": 0.29754322111010006, + "grad_norm": 0.8322598543263076, + "learning_rate": 9.913142102789005e-06, + "loss": 0.0785, + "step": 654 + }, + { + "epoch": 0.29799818016378526, + "grad_norm": 0.9426946452527044, + "learning_rate": 9.912876647710723e-06, + "loss": 0.0993, + "step": 655 + }, + { + "epoch": 0.2984531392174704, + "grad_norm": 0.8902481236790349, + "learning_rate": 9.912610791174348e-06, + "loss": 0.0981, + "step": 656 + }, + { + "epoch": 0.2989080982711556, + "grad_norm": 0.6714333609160019, + "learning_rate": 9.912344533201604e-06, + "loss": 0.0716, + "step": 657 + }, + { + "epoch": 0.29936305732484075, + "grad_norm": 0.6721636461789662, + "learning_rate": 9.91207787381425e-06, + "loss": 0.0675, + "step": 658 + }, + { + "epoch": 0.29981801637852595, + "grad_norm": 0.628744075340254, + "learning_rate": 9.911810813034073e-06, + "loss": 0.0583, + "step": 659 + }, + { + "epoch": 0.3002729754322111, + "grad_norm": 0.9172548581720068, + "learning_rate": 9.9115433508829e-06, + "loss": 0.0972, + "step": 660 + }, + { + "epoch": 0.30072793448589624, + "grad_norm": 0.914462327674233, + "learning_rate": 9.911275487382583e-06, + "loss": 0.089, + "step": 661 + }, + { + "epoch": 0.30118289353958144, + "grad_norm": 0.7410939383575923, + "learning_rate": 9.911007222555011e-06, + "loss": 0.0744, + "step": 662 + }, + { + "epoch": 0.3016378525932666, + "grad_norm": 0.6952942958219819, + "learning_rate": 9.91073855642211e-06, + "loss": 0.0627, + "step": 663 + }, + { + "epoch": 0.3020928116469518, + "grad_norm": 0.8802064643150562, + "learning_rate": 9.910469489005828e-06, + "loss": 0.0836, + "step": 664 + }, + { + "epoch": 0.30254777070063693, + "grad_norm": 0.9015922573736656, + "learning_rate": 9.910200020328158e-06, + "loss": 0.0934, + "step": 665 + }, + { + "epoch": 0.30300272975432213, + "grad_norm": 0.6635682732023674, + "learning_rate": 9.909930150411113e-06, + "loss": 0.0623, + "step": 666 + }, + { + "epoch": 0.3034576888080073, + "grad_norm": 1.928152977107998, + "learning_rate": 9.909659879276751e-06, + "loss": 0.1457, + "step": 667 + }, + { + "epoch": 0.3039126478616925, + "grad_norm": 0.7754006092902415, + "learning_rate": 9.909389206947156e-06, + "loss": 0.0621, + "step": 668 + }, + { + "epoch": 0.3043676069153776, + "grad_norm": 1.0461982822616211, + "learning_rate": 9.909118133444444e-06, + "loss": 0.1087, + "step": 669 + }, + { + "epoch": 0.30482256596906276, + "grad_norm": 0.7981897376851527, + "learning_rate": 9.90884665879077e-06, + "loss": 0.0921, + "step": 670 + }, + { + "epoch": 0.30527752502274796, + "grad_norm": 0.8941901965354629, + "learning_rate": 9.908574783008313e-06, + "loss": 0.1055, + "step": 671 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 1.0219508428898654, + "learning_rate": 9.908302506119291e-06, + "loss": 0.1152, + "step": 672 + }, + { + "epoch": 0.3061874431301183, + "grad_norm": 0.7623168423299865, + "learning_rate": 9.908029828145956e-06, + "loss": 0.0837, + "step": 673 + }, + { + "epoch": 0.30664240218380345, + "grad_norm": 0.7026665400337327, + "learning_rate": 9.907756749110587e-06, + "loss": 0.0785, + "step": 674 + }, + { + "epoch": 0.30709736123748865, + "grad_norm": 1.0861630797383492, + "learning_rate": 9.9074832690355e-06, + "loss": 0.1121, + "step": 675 + }, + { + "epoch": 0.3075523202911738, + "grad_norm": 0.8171913655631801, + "learning_rate": 9.907209387943042e-06, + "loss": 0.0759, + "step": 676 + }, + { + "epoch": 0.30800727934485894, + "grad_norm": 0.695009650682766, + "learning_rate": 9.906935105855595e-06, + "loss": 0.0508, + "step": 677 + }, + { + "epoch": 0.30846223839854414, + "grad_norm": 1.1629680848047237, + "learning_rate": 9.906660422795569e-06, + "loss": 0.1123, + "step": 678 + }, + { + "epoch": 0.3089171974522293, + "grad_norm": 1.1028006392582481, + "learning_rate": 9.906385338785411e-06, + "loss": 0.1048, + "step": 679 + }, + { + "epoch": 0.3093721565059145, + "grad_norm": 0.8590661780887954, + "learning_rate": 9.906109853847601e-06, + "loss": 0.0947, + "step": 680 + }, + { + "epoch": 0.30982711555959963, + "grad_norm": 0.9160314729851723, + "learning_rate": 9.90583396800465e-06, + "loss": 0.0928, + "step": 681 + }, + { + "epoch": 0.31028207461328483, + "grad_norm": 0.8935511298088069, + "learning_rate": 9.9055576812791e-06, + "loss": 0.0996, + "step": 682 + }, + { + "epoch": 0.31073703366697, + "grad_norm": 0.7005723015579258, + "learning_rate": 9.905280993693533e-06, + "loss": 0.0863, + "step": 683 + }, + { + "epoch": 0.3111919927206551, + "grad_norm": 0.6441434987399284, + "learning_rate": 9.905003905270553e-06, + "loss": 0.0682, + "step": 684 + }, + { + "epoch": 0.3116469517743403, + "grad_norm": 0.9609160991558658, + "learning_rate": 9.904726416032803e-06, + "loss": 0.1095, + "step": 685 + }, + { + "epoch": 0.31210191082802546, + "grad_norm": 0.723787688745946, + "learning_rate": 9.904448526002963e-06, + "loss": 0.0637, + "step": 686 + }, + { + "epoch": 0.31255686988171066, + "grad_norm": 0.5250433090776031, + "learning_rate": 9.904170235203737e-06, + "loss": 0.0587, + "step": 687 + }, + { + "epoch": 0.3130118289353958, + "grad_norm": 0.8819438583914972, + "learning_rate": 9.903891543657866e-06, + "loss": 0.1112, + "step": 688 + }, + { + "epoch": 0.313466787989081, + "grad_norm": 0.5413774773467063, + "learning_rate": 9.903612451388122e-06, + "loss": 0.0722, + "step": 689 + }, + { + "epoch": 0.31392174704276615, + "grad_norm": 0.8913097595158456, + "learning_rate": 9.903332958417315e-06, + "loss": 0.0893, + "step": 690 + }, + { + "epoch": 0.3143767060964513, + "grad_norm": 0.6466979890354269, + "learning_rate": 9.903053064768283e-06, + "loss": 0.0709, + "step": 691 + }, + { + "epoch": 0.3148316651501365, + "grad_norm": 0.8428101951038133, + "learning_rate": 9.902772770463892e-06, + "loss": 0.0814, + "step": 692 + }, + { + "epoch": 0.31528662420382164, + "grad_norm": 0.5832299371816577, + "learning_rate": 9.902492075527057e-06, + "loss": 0.0597, + "step": 693 + }, + { + "epoch": 0.31574158325750684, + "grad_norm": 0.7856263020740725, + "learning_rate": 9.902210979980705e-06, + "loss": 0.074, + "step": 694 + }, + { + "epoch": 0.316196542311192, + "grad_norm": 0.8507681095680276, + "learning_rate": 9.90192948384781e-06, + "loss": 0.0941, + "step": 695 + }, + { + "epoch": 0.3166515013648772, + "grad_norm": 0.7777857824270489, + "learning_rate": 9.901647587151376e-06, + "loss": 0.0708, + "step": 696 + }, + { + "epoch": 0.31710646041856233, + "grad_norm": 1.068022521735614, + "learning_rate": 9.901365289914437e-06, + "loss": 0.108, + "step": 697 + }, + { + "epoch": 0.3175614194722475, + "grad_norm": 1.1320770025873614, + "learning_rate": 9.901082592160059e-06, + "loss": 0.108, + "step": 698 + }, + { + "epoch": 0.3180163785259327, + "grad_norm": 0.803518334023751, + "learning_rate": 9.900799493911346e-06, + "loss": 0.0871, + "step": 699 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 0.8188444942805464, + "learning_rate": 9.900515995191431e-06, + "loss": 0.0808, + "step": 700 + }, + { + "epoch": 0.318926296633303, + "grad_norm": 0.8993527964087475, + "learning_rate": 9.900232096023478e-06, + "loss": 0.0821, + "step": 701 + }, + { + "epoch": 0.31938125568698816, + "grad_norm": 0.5600271316880729, + "learning_rate": 9.899947796430687e-06, + "loss": 0.0478, + "step": 702 + }, + { + "epoch": 0.31983621474067336, + "grad_norm": 0.8369718087747545, + "learning_rate": 9.899663096436292e-06, + "loss": 0.0871, + "step": 703 + }, + { + "epoch": 0.3202911737943585, + "grad_norm": 0.8993771893247359, + "learning_rate": 9.899377996063554e-06, + "loss": 0.0858, + "step": 704 + }, + { + "epoch": 0.32074613284804365, + "grad_norm": 0.6615773523414142, + "learning_rate": 9.899092495335772e-06, + "loss": 0.0601, + "step": 705 + }, + { + "epoch": 0.32120109190172885, + "grad_norm": 0.8278593900178107, + "learning_rate": 9.898806594276273e-06, + "loss": 0.0769, + "step": 706 + }, + { + "epoch": 0.321656050955414, + "grad_norm": 0.7866286577186284, + "learning_rate": 9.898520292908425e-06, + "loss": 0.0894, + "step": 707 + }, + { + "epoch": 0.3221110100090992, + "grad_norm": 0.8050313615570786, + "learning_rate": 9.89823359125562e-06, + "loss": 0.0732, + "step": 708 + }, + { + "epoch": 0.32256596906278434, + "grad_norm": 1.0243914254387991, + "learning_rate": 9.897946489341286e-06, + "loss": 0.0901, + "step": 709 + }, + { + "epoch": 0.32302092811646954, + "grad_norm": 0.7036337195424629, + "learning_rate": 9.897658987188882e-06, + "loss": 0.0686, + "step": 710 + }, + { + "epoch": 0.3234758871701547, + "grad_norm": 0.5593772745397846, + "learning_rate": 9.897371084821905e-06, + "loss": 0.045, + "step": 711 + }, + { + "epoch": 0.32393084622383983, + "grad_norm": 0.608867956874154, + "learning_rate": 9.897082782263878e-06, + "loss": 0.0692, + "step": 712 + }, + { + "epoch": 0.32438580527752503, + "grad_norm": 0.6488333561840038, + "learning_rate": 9.896794079538362e-06, + "loss": 0.0513, + "step": 713 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 0.5593745607285364, + "learning_rate": 9.896504976668948e-06, + "loss": 0.0437, + "step": 714 + }, + { + "epoch": 0.3252957233848954, + "grad_norm": 0.5072427035814352, + "learning_rate": 9.896215473679259e-06, + "loss": 0.0566, + "step": 715 + }, + { + "epoch": 0.3257506824385805, + "grad_norm": 0.7088539736923404, + "learning_rate": 9.895925570592952e-06, + "loss": 0.0878, + "step": 716 + }, + { + "epoch": 0.3262056414922657, + "grad_norm": 0.9653520712469312, + "learning_rate": 9.895635267433719e-06, + "loss": 0.101, + "step": 717 + }, + { + "epoch": 0.32666060054595086, + "grad_norm": 1.2323140645024868, + "learning_rate": 9.895344564225277e-06, + "loss": 0.1359, + "step": 718 + }, + { + "epoch": 0.327115559599636, + "grad_norm": 0.6826807669546061, + "learning_rate": 9.895053460991389e-06, + "loss": 0.0799, + "step": 719 + }, + { + "epoch": 0.3275705186533212, + "grad_norm": 0.9496304010026827, + "learning_rate": 9.894761957755834e-06, + "loss": 0.0928, + "step": 720 + }, + { + "epoch": 0.32802547770700635, + "grad_norm": 0.8578622125964999, + "learning_rate": 9.894470054542438e-06, + "loss": 0.1149, + "step": 721 + }, + { + "epoch": 0.32848043676069155, + "grad_norm": 0.5483719717114235, + "learning_rate": 9.894177751375053e-06, + "loss": 0.0621, + "step": 722 + }, + { + "epoch": 0.3289353958143767, + "grad_norm": 0.6341198897869947, + "learning_rate": 9.893885048277564e-06, + "loss": 0.0568, + "step": 723 + }, + { + "epoch": 0.3293903548680619, + "grad_norm": 0.7169738278552924, + "learning_rate": 9.893591945273888e-06, + "loss": 0.0752, + "step": 724 + }, + { + "epoch": 0.32984531392174704, + "grad_norm": 0.9839905963719277, + "learning_rate": 9.89329844238798e-06, + "loss": 0.1167, + "step": 725 + }, + { + "epoch": 0.3303002729754322, + "grad_norm": 0.6825969142747964, + "learning_rate": 9.89300453964382e-06, + "loss": 0.0693, + "step": 726 + }, + { + "epoch": 0.3307552320291174, + "grad_norm": 1.0420794853330364, + "learning_rate": 9.892710237065423e-06, + "loss": 0.1561, + "step": 727 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 1.0109988913697336, + "learning_rate": 9.892415534676844e-06, + "loss": 0.0813, + "step": 728 + }, + { + "epoch": 0.33166515013648773, + "grad_norm": 0.6237179977245606, + "learning_rate": 9.892120432502161e-06, + "loss": 0.063, + "step": 729 + }, + { + "epoch": 0.3321201091901729, + "grad_norm": 0.7047649578988654, + "learning_rate": 9.891824930565488e-06, + "loss": 0.0757, + "step": 730 + }, + { + "epoch": 0.3325750682438581, + "grad_norm": 0.8381336709785119, + "learning_rate": 9.891529028890974e-06, + "loss": 0.1137, + "step": 731 + }, + { + "epoch": 0.3330300272975432, + "grad_norm": 1.108812928457643, + "learning_rate": 9.891232727502797e-06, + "loss": 0.0971, + "step": 732 + }, + { + "epoch": 0.33348498635122836, + "grad_norm": 0.8911550238765422, + "learning_rate": 9.89093602642517e-06, + "loss": 0.0869, + "step": 733 + }, + { + "epoch": 0.33393994540491356, + "grad_norm": 0.7527062298816352, + "learning_rate": 9.890638925682339e-06, + "loss": 0.085, + "step": 734 + }, + { + "epoch": 0.3343949044585987, + "grad_norm": 0.8028637093759472, + "learning_rate": 9.89034142529858e-06, + "loss": 0.0866, + "step": 735 + }, + { + "epoch": 0.3348498635122839, + "grad_norm": 0.6620365400447171, + "learning_rate": 9.890043525298203e-06, + "loss": 0.053, + "step": 736 + }, + { + "epoch": 0.33530482256596905, + "grad_norm": 0.6606838089782118, + "learning_rate": 9.889745225705555e-06, + "loss": 0.0783, + "step": 737 + }, + { + "epoch": 0.33575978161965425, + "grad_norm": 0.6719238881234298, + "learning_rate": 9.889446526545007e-06, + "loss": 0.079, + "step": 738 + }, + { + "epoch": 0.3362147406733394, + "grad_norm": 0.7379881342173255, + "learning_rate": 9.88914742784097e-06, + "loss": 0.0848, + "step": 739 + }, + { + "epoch": 0.33666969972702454, + "grad_norm": 1.9725398231448836, + "learning_rate": 9.888847929617887e-06, + "loss": 0.1666, + "step": 740 + }, + { + "epoch": 0.33712465878070974, + "grad_norm": 0.7800667095330575, + "learning_rate": 9.888548031900226e-06, + "loss": 0.0779, + "step": 741 + }, + { + "epoch": 0.3375796178343949, + "grad_norm": 0.9725198572426639, + "learning_rate": 9.888247734712497e-06, + "loss": 0.0719, + "step": 742 + }, + { + "epoch": 0.3380345768880801, + "grad_norm": 0.9547104503470986, + "learning_rate": 9.887947038079238e-06, + "loss": 0.1119, + "step": 743 + }, + { + "epoch": 0.33848953594176523, + "grad_norm": 0.5879353672489683, + "learning_rate": 9.887645942025022e-06, + "loss": 0.0553, + "step": 744 + }, + { + "epoch": 0.33894449499545043, + "grad_norm": 0.5485885922626542, + "learning_rate": 9.887344446574452e-06, + "loss": 0.0494, + "step": 745 + }, + { + "epoch": 0.3393994540491356, + "grad_norm": 0.9640668269863656, + "learning_rate": 9.887042551752163e-06, + "loss": 0.1104, + "step": 746 + }, + { + "epoch": 0.3398544131028208, + "grad_norm": 0.8639463935480832, + "learning_rate": 9.886740257582827e-06, + "loss": 0.0655, + "step": 747 + }, + { + "epoch": 0.3403093721565059, + "grad_norm": 0.6489702107287116, + "learning_rate": 9.886437564091148e-06, + "loss": 0.0777, + "step": 748 + }, + { + "epoch": 0.34076433121019106, + "grad_norm": 0.8236523684362178, + "learning_rate": 9.886134471301854e-06, + "loss": 0.0916, + "step": 749 + }, + { + "epoch": 0.34121929026387626, + "grad_norm": 0.8459143900125461, + "learning_rate": 9.885830979239718e-06, + "loss": 0.1017, + "step": 750 + }, + { + "epoch": 0.3416742493175614, + "grad_norm": 0.7496065352262437, + "learning_rate": 9.885527087929541e-06, + "loss": 0.0861, + "step": 751 + }, + { + "epoch": 0.3421292083712466, + "grad_norm": 0.849292513666517, + "learning_rate": 9.88522279739615e-06, + "loss": 0.0839, + "step": 752 + }, + { + "epoch": 0.34258416742493175, + "grad_norm": 0.7756671663835698, + "learning_rate": 9.884918107664417e-06, + "loss": 0.0809, + "step": 753 + }, + { + "epoch": 0.34303912647861695, + "grad_norm": 0.7338987681003677, + "learning_rate": 9.884613018759234e-06, + "loss": 0.0721, + "step": 754 + }, + { + "epoch": 0.3434940855323021, + "grad_norm": 0.6003946948163056, + "learning_rate": 9.884307530705534e-06, + "loss": 0.0782, + "step": 755 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 0.5309561440373582, + "learning_rate": 9.88400164352828e-06, + "loss": 0.0563, + "step": 756 + }, + { + "epoch": 0.34440400363967244, + "grad_norm": 0.6551261739802692, + "learning_rate": 9.883695357252467e-06, + "loss": 0.061, + "step": 757 + }, + { + "epoch": 0.3448589626933576, + "grad_norm": 0.6598139820416582, + "learning_rate": 9.883388671903125e-06, + "loss": 0.084, + "step": 758 + }, + { + "epoch": 0.3453139217470428, + "grad_norm": 0.8678451615084499, + "learning_rate": 9.883081587505315e-06, + "loss": 0.0893, + "step": 759 + }, + { + "epoch": 0.34576888080072793, + "grad_norm": 0.8849976199871086, + "learning_rate": 9.882774104084127e-06, + "loss": 0.0938, + "step": 760 + }, + { + "epoch": 0.34622383985441313, + "grad_norm": 0.6157555054475868, + "learning_rate": 9.882466221664691e-06, + "loss": 0.0535, + "step": 761 + }, + { + "epoch": 0.3466787989080983, + "grad_norm": 0.9555128068667961, + "learning_rate": 9.882157940272165e-06, + "loss": 0.0984, + "step": 762 + }, + { + "epoch": 0.3471337579617834, + "grad_norm": 0.8431106213501941, + "learning_rate": 9.881849259931738e-06, + "loss": 0.1062, + "step": 763 + }, + { + "epoch": 0.3475887170154686, + "grad_norm": 0.6608166650909644, + "learning_rate": 9.881540180668637e-06, + "loss": 0.0589, + "step": 764 + }, + { + "epoch": 0.34804367606915376, + "grad_norm": 0.7177237690901401, + "learning_rate": 9.881230702508118e-06, + "loss": 0.0721, + "step": 765 + }, + { + "epoch": 0.34849863512283896, + "grad_norm": 0.49396541889218665, + "learning_rate": 9.880920825475468e-06, + "loss": 0.0582, + "step": 766 + }, + { + "epoch": 0.3489535941765241, + "grad_norm": 0.7008727540015932, + "learning_rate": 9.88061054959601e-06, + "loss": 0.0689, + "step": 767 + }, + { + "epoch": 0.3494085532302093, + "grad_norm": 0.6417543130209264, + "learning_rate": 9.880299874895098e-06, + "loss": 0.0859, + "step": 768 + }, + { + "epoch": 0.34986351228389445, + "grad_norm": 0.5325758158155319, + "learning_rate": 9.879988801398121e-06, + "loss": 0.0508, + "step": 769 + }, + { + "epoch": 0.3503184713375796, + "grad_norm": 0.653129374155715, + "learning_rate": 9.879677329130496e-06, + "loss": 0.0822, + "step": 770 + }, + { + "epoch": 0.3507734303912648, + "grad_norm": 0.6044703796770591, + "learning_rate": 9.879365458117678e-06, + "loss": 0.0662, + "step": 771 + }, + { + "epoch": 0.35122838944494994, + "grad_norm": 0.6417796330386928, + "learning_rate": 9.879053188385148e-06, + "loss": 0.0649, + "step": 772 + }, + { + "epoch": 0.35168334849863514, + "grad_norm": 0.6127493684308597, + "learning_rate": 9.878740519958425e-06, + "loss": 0.0601, + "step": 773 + }, + { + "epoch": 0.3521383075523203, + "grad_norm": 0.9092296350808027, + "learning_rate": 9.878427452863059e-06, + "loss": 0.1138, + "step": 774 + }, + { + "epoch": 0.3525932666060055, + "grad_norm": 0.8850379239223551, + "learning_rate": 9.878113987124633e-06, + "loss": 0.1135, + "step": 775 + }, + { + "epoch": 0.35304822565969063, + "grad_norm": 0.8106864823035035, + "learning_rate": 9.877800122768761e-06, + "loss": 0.084, + "step": 776 + }, + { + "epoch": 0.3535031847133758, + "grad_norm": 0.6717791100158048, + "learning_rate": 9.877485859821092e-06, + "loss": 0.0764, + "step": 777 + }, + { + "epoch": 0.353958143767061, + "grad_norm": 0.4266356830653338, + "learning_rate": 9.877171198307304e-06, + "loss": 0.0496, + "step": 778 + }, + { + "epoch": 0.3544131028207461, + "grad_norm": 0.7839112755574695, + "learning_rate": 9.87685613825311e-06, + "loss": 0.0864, + "step": 779 + }, + { + "epoch": 0.3548680618744313, + "grad_norm": 0.8928629316475961, + "learning_rate": 9.876540679684257e-06, + "loss": 0.0802, + "step": 780 + }, + { + "epoch": 0.35532302092811646, + "grad_norm": 0.7427060191976654, + "learning_rate": 9.876224822626522e-06, + "loss": 0.0809, + "step": 781 + }, + { + "epoch": 0.35577797998180166, + "grad_norm": 0.6618589317208607, + "learning_rate": 9.875908567105716e-06, + "loss": 0.0633, + "step": 782 + }, + { + "epoch": 0.3562329390354868, + "grad_norm": 0.9168643329932029, + "learning_rate": 9.87559191314768e-06, + "loss": 0.0977, + "step": 783 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 1.010661772545197, + "learning_rate": 9.87527486077829e-06, + "loss": 0.112, + "step": 784 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.7355960177801563, + "learning_rate": 9.874957410023458e-06, + "loss": 0.0578, + "step": 785 + }, + { + "epoch": 0.3575978161965423, + "grad_norm": 0.7012046376593928, + "learning_rate": 9.874639560909118e-06, + "loss": 0.0856, + "step": 786 + }, + { + "epoch": 0.3580527752502275, + "grad_norm": 0.629856671324697, + "learning_rate": 9.87432131346125e-06, + "loss": 0.079, + "step": 787 + }, + { + "epoch": 0.35850773430391264, + "grad_norm": 0.6605442679933491, + "learning_rate": 9.874002667705855e-06, + "loss": 0.0713, + "step": 788 + }, + { + "epoch": 0.35896269335759784, + "grad_norm": 0.6036439966816435, + "learning_rate": 9.873683623668972e-06, + "loss": 0.0734, + "step": 789 + }, + { + "epoch": 0.359417652411283, + "grad_norm": 0.9098464282834562, + "learning_rate": 9.873364181376674e-06, + "loss": 0.1273, + "step": 790 + }, + { + "epoch": 0.35987261146496813, + "grad_norm": 0.725232432410699, + "learning_rate": 9.873044340855062e-06, + "loss": 0.0704, + "step": 791 + }, + { + "epoch": 0.36032757051865333, + "grad_norm": 0.8275864687946802, + "learning_rate": 9.872724102130273e-06, + "loss": 0.0722, + "step": 792 + }, + { + "epoch": 0.3607825295723385, + "grad_norm": 0.6908762665090429, + "learning_rate": 9.872403465228476e-06, + "loss": 0.068, + "step": 793 + }, + { + "epoch": 0.3612374886260237, + "grad_norm": 0.8007479624540592, + "learning_rate": 9.872082430175871e-06, + "loss": 0.0792, + "step": 794 + }, + { + "epoch": 0.3616924476797088, + "grad_norm": 0.7580697654486878, + "learning_rate": 9.871760996998692e-06, + "loss": 0.0662, + "step": 795 + }, + { + "epoch": 0.362147406733394, + "grad_norm": 1.0378802589927232, + "learning_rate": 9.871439165723207e-06, + "loss": 0.0905, + "step": 796 + }, + { + "epoch": 0.36260236578707916, + "grad_norm": 0.9366156924362913, + "learning_rate": 9.87111693637571e-06, + "loss": 0.0966, + "step": 797 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 0.9568919919938076, + "learning_rate": 9.870794308982536e-06, + "loss": 0.1092, + "step": 798 + }, + { + "epoch": 0.3635122838944495, + "grad_norm": 1.0303944561108107, + "learning_rate": 9.870471283570046e-06, + "loss": 0.1214, + "step": 799 + }, + { + "epoch": 0.36396724294813465, + "grad_norm": 0.7123988620535131, + "learning_rate": 9.870147860164639e-06, + "loss": 0.0952, + "step": 800 + }, + { + "epoch": 0.36442220200181985, + "grad_norm": 0.6461145025804255, + "learning_rate": 9.86982403879274e-06, + "loss": 0.0653, + "step": 801 + }, + { + "epoch": 0.364877161055505, + "grad_norm": 0.761176238728339, + "learning_rate": 9.869499819480815e-06, + "loss": 0.0911, + "step": 802 + }, + { + "epoch": 0.3653321201091902, + "grad_norm": 0.6778284620896282, + "learning_rate": 9.869175202255354e-06, + "loss": 0.0726, + "step": 803 + }, + { + "epoch": 0.36578707916287534, + "grad_norm": 0.6378934869683002, + "learning_rate": 9.868850187142885e-06, + "loss": 0.0721, + "step": 804 + }, + { + "epoch": 0.3662420382165605, + "grad_norm": 0.725078464245391, + "learning_rate": 9.868524774169968e-06, + "loss": 0.0774, + "step": 805 + }, + { + "epoch": 0.3666969972702457, + "grad_norm": 0.7707907185217752, + "learning_rate": 9.86819896336319e-06, + "loss": 0.067, + "step": 806 + }, + { + "epoch": 0.36715195632393083, + "grad_norm": 0.8162851407409059, + "learning_rate": 9.867872754749178e-06, + "loss": 0.0908, + "step": 807 + }, + { + "epoch": 0.36760691537761603, + "grad_norm": 0.5330499489332517, + "learning_rate": 9.867546148354586e-06, + "loss": 0.066, + "step": 808 + }, + { + "epoch": 0.3680618744313012, + "grad_norm": 0.6649993383235931, + "learning_rate": 9.867219144206105e-06, + "loss": 0.0672, + "step": 809 + }, + { + "epoch": 0.3685168334849864, + "grad_norm": 0.9824606570699352, + "learning_rate": 9.866891742330458e-06, + "loss": 0.11, + "step": 810 + }, + { + "epoch": 0.3689717925386715, + "grad_norm": 0.6507791006697302, + "learning_rate": 9.866563942754394e-06, + "loss": 0.0622, + "step": 811 + }, + { + "epoch": 0.36942675159235666, + "grad_norm": 0.7455907568930894, + "learning_rate": 9.866235745504705e-06, + "loss": 0.0833, + "step": 812 + }, + { + "epoch": 0.36988171064604186, + "grad_norm": 0.9927293122267482, + "learning_rate": 9.865907150608203e-06, + "loss": 0.0978, + "step": 813 + }, + { + "epoch": 0.370336669699727, + "grad_norm": 0.817279180213694, + "learning_rate": 9.865578158091746e-06, + "loss": 0.1036, + "step": 814 + }, + { + "epoch": 0.3707916287534122, + "grad_norm": 0.9966504261459711, + "learning_rate": 9.865248767982211e-06, + "loss": 0.1027, + "step": 815 + }, + { + "epoch": 0.37124658780709735, + "grad_norm": 0.9561727776097537, + "learning_rate": 9.864918980306521e-06, + "loss": 0.1136, + "step": 816 + }, + { + "epoch": 0.37170154686078255, + "grad_norm": 0.6718095123705313, + "learning_rate": 9.86458879509162e-06, + "loss": 0.0762, + "step": 817 + }, + { + "epoch": 0.3721565059144677, + "grad_norm": 0.9803345299998187, + "learning_rate": 9.864258212364492e-06, + "loss": 0.0791, + "step": 818 + }, + { + "epoch": 0.37261146496815284, + "grad_norm": 0.8058679812037255, + "learning_rate": 9.86392723215215e-06, + "loss": 0.069, + "step": 819 + }, + { + "epoch": 0.37306642402183804, + "grad_norm": 0.5836160590759203, + "learning_rate": 9.86359585448164e-06, + "loss": 0.0621, + "step": 820 + }, + { + "epoch": 0.3735213830755232, + "grad_norm": 0.6511599091669776, + "learning_rate": 9.863264079380039e-06, + "loss": 0.0745, + "step": 821 + }, + { + "epoch": 0.3739763421292084, + "grad_norm": 0.9308266206126162, + "learning_rate": 9.862931906874461e-06, + "loss": 0.1132, + "step": 822 + }, + { + "epoch": 0.37443130118289353, + "grad_norm": 0.613775373571284, + "learning_rate": 9.862599336992048e-06, + "loss": 0.0545, + "step": 823 + }, + { + "epoch": 0.37488626023657873, + "grad_norm": 0.6991388893487894, + "learning_rate": 9.862266369759976e-06, + "loss": 0.0754, + "step": 824 + }, + { + "epoch": 0.37534121929026387, + "grad_norm": 0.6352968005261165, + "learning_rate": 9.861933005205454e-06, + "loss": 0.0576, + "step": 825 + }, + { + "epoch": 0.37579617834394907, + "grad_norm": 1.109194467922723, + "learning_rate": 9.861599243355725e-06, + "loss": 0.1281, + "step": 826 + }, + { + "epoch": 0.3762511373976342, + "grad_norm": 0.9742134289860664, + "learning_rate": 9.86126508423806e-06, + "loss": 0.1067, + "step": 827 + }, + { + "epoch": 0.37670609645131936, + "grad_norm": 0.6015820455914206, + "learning_rate": 9.860930527879763e-06, + "loss": 0.055, + "step": 828 + }, + { + "epoch": 0.37716105550500456, + "grad_norm": 1.0894948091440197, + "learning_rate": 9.860595574308179e-06, + "loss": 0.1147, + "step": 829 + }, + { + "epoch": 0.3776160145586897, + "grad_norm": 0.7023892750192133, + "learning_rate": 9.860260223550672e-06, + "loss": 0.0815, + "step": 830 + }, + { + "epoch": 0.3780709736123749, + "grad_norm": 0.4943868719085533, + "learning_rate": 9.859924475634649e-06, + "loss": 0.0476, + "step": 831 + }, + { + "epoch": 0.37852593266606005, + "grad_norm": 0.9974648765413693, + "learning_rate": 9.859588330587545e-06, + "loss": 0.1068, + "step": 832 + }, + { + "epoch": 0.37898089171974525, + "grad_norm": 0.5960289391531881, + "learning_rate": 9.859251788436829e-06, + "loss": 0.0715, + "step": 833 + }, + { + "epoch": 0.3794358507734304, + "grad_norm": 0.907079582974149, + "learning_rate": 9.85891484921e-06, + "loss": 0.0905, + "step": 834 + }, + { + "epoch": 0.37989080982711554, + "grad_norm": 0.8133034306250352, + "learning_rate": 9.858577512934592e-06, + "loss": 0.1012, + "step": 835 + }, + { + "epoch": 0.38034576888080074, + "grad_norm": 0.7828785203637737, + "learning_rate": 9.858239779638173e-06, + "loss": 0.0726, + "step": 836 + }, + { + "epoch": 0.3808007279344859, + "grad_norm": 1.3138864597148558, + "learning_rate": 9.857901649348338e-06, + "loss": 0.1307, + "step": 837 + }, + { + "epoch": 0.3812556869881711, + "grad_norm": 0.7000750227265026, + "learning_rate": 9.857563122092717e-06, + "loss": 0.0777, + "step": 838 + }, + { + "epoch": 0.3817106460418562, + "grad_norm": 0.757283984575844, + "learning_rate": 9.857224197898975e-06, + "loss": 0.083, + "step": 839 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 0.7113754486134378, + "learning_rate": 9.856884876794805e-06, + "loss": 0.0795, + "step": 840 + }, + { + "epoch": 0.38262056414922657, + "grad_norm": 0.6891370217065743, + "learning_rate": 9.856545158807938e-06, + "loss": 0.0576, + "step": 841 + }, + { + "epoch": 0.3830755232029117, + "grad_norm": 0.7230826558764609, + "learning_rate": 9.856205043966134e-06, + "loss": 0.0973, + "step": 842 + }, + { + "epoch": 0.3835304822565969, + "grad_norm": 0.9951638416419379, + "learning_rate": 9.855864532297181e-06, + "loss": 0.1225, + "step": 843 + }, + { + "epoch": 0.38398544131028206, + "grad_norm": 0.8272776971451865, + "learning_rate": 9.85552362382891e-06, + "loss": 0.0928, + "step": 844 + }, + { + "epoch": 0.38444040036396726, + "grad_norm": 0.662562460388915, + "learning_rate": 9.855182318589174e-06, + "loss": 0.0711, + "step": 845 + }, + { + "epoch": 0.3848953594176524, + "grad_norm": 1.185659176011977, + "learning_rate": 9.854840616605866e-06, + "loss": 0.0922, + "step": 846 + }, + { + "epoch": 0.3853503184713376, + "grad_norm": 0.7002426118833048, + "learning_rate": 9.854498517906908e-06, + "loss": 0.0828, + "step": 847 + }, + { + "epoch": 0.38580527752502275, + "grad_norm": 0.8957633348930525, + "learning_rate": 9.854156022520252e-06, + "loss": 0.0809, + "step": 848 + }, + { + "epoch": 0.3862602365787079, + "grad_norm": 1.0593251614278854, + "learning_rate": 9.853813130473887e-06, + "loss": 0.1109, + "step": 849 + }, + { + "epoch": 0.3867151956323931, + "grad_norm": 0.7751748709357449, + "learning_rate": 9.853469841795832e-06, + "loss": 0.0823, + "step": 850 + }, + { + "epoch": 0.38717015468607824, + "grad_norm": 0.5943868690351954, + "learning_rate": 9.853126156514142e-06, + "loss": 0.0758, + "step": 851 + }, + { + "epoch": 0.38762511373976344, + "grad_norm": 0.4901349757557767, + "learning_rate": 9.852782074656897e-06, + "loss": 0.064, + "step": 852 + }, + { + "epoch": 0.3880800727934486, + "grad_norm": 0.7531191508768753, + "learning_rate": 9.852437596252216e-06, + "loss": 0.0824, + "step": 853 + }, + { + "epoch": 0.3885350318471338, + "grad_norm": 0.7684236261792305, + "learning_rate": 9.852092721328248e-06, + "loss": 0.0674, + "step": 854 + }, + { + "epoch": 0.3889899909008189, + "grad_norm": 0.8624513661560378, + "learning_rate": 9.851747449913176e-06, + "loss": 0.09, + "step": 855 + }, + { + "epoch": 0.38944494995450407, + "grad_norm": 0.9125725996183891, + "learning_rate": 9.851401782035213e-06, + "loss": 0.129, + "step": 856 + }, + { + "epoch": 0.38989990900818927, + "grad_norm": 0.7630714638300728, + "learning_rate": 9.851055717722604e-06, + "loss": 0.068, + "step": 857 + }, + { + "epoch": 0.3903548680618744, + "grad_norm": 0.834756070401477, + "learning_rate": 9.850709257003628e-06, + "loss": 0.0831, + "step": 858 + }, + { + "epoch": 0.3908098271155596, + "grad_norm": 0.9864776662717517, + "learning_rate": 9.850362399906598e-06, + "loss": 0.0904, + "step": 859 + }, + { + "epoch": 0.39126478616924476, + "grad_norm": 0.6242730295284743, + "learning_rate": 9.850015146459857e-06, + "loss": 0.0754, + "step": 860 + }, + { + "epoch": 0.39171974522292996, + "grad_norm": 0.838271649072902, + "learning_rate": 9.84966749669178e-06, + "loss": 0.0899, + "step": 861 + }, + { + "epoch": 0.3921747042766151, + "grad_norm": 0.6826448278617049, + "learning_rate": 9.849319450630777e-06, + "loss": 0.0698, + "step": 862 + }, + { + "epoch": 0.39262966333030025, + "grad_norm": 0.5533993282250775, + "learning_rate": 9.848971008305288e-06, + "loss": 0.0688, + "step": 863 + }, + { + "epoch": 0.39308462238398545, + "grad_norm": 0.838673412156409, + "learning_rate": 9.848622169743784e-06, + "loss": 0.0815, + "step": 864 + }, + { + "epoch": 0.3935395814376706, + "grad_norm": 0.9783580500729582, + "learning_rate": 9.848272934974774e-06, + "loss": 0.0745, + "step": 865 + }, + { + "epoch": 0.3939945404913558, + "grad_norm": 0.5976030953641746, + "learning_rate": 9.847923304026793e-06, + "loss": 0.0664, + "step": 866 + }, + { + "epoch": 0.39444949954504094, + "grad_norm": 0.6999143793652887, + "learning_rate": 9.847573276928415e-06, + "loss": 0.0804, + "step": 867 + }, + { + "epoch": 0.39490445859872614, + "grad_norm": 0.6338725165728231, + "learning_rate": 9.847222853708239e-06, + "loss": 0.0655, + "step": 868 + }, + { + "epoch": 0.3953594176524113, + "grad_norm": 0.7010627446349382, + "learning_rate": 9.846872034394902e-06, + "loss": 0.0667, + "step": 869 + }, + { + "epoch": 0.3958143767060964, + "grad_norm": 0.6173227181881447, + "learning_rate": 9.84652081901707e-06, + "loss": 0.0674, + "step": 870 + }, + { + "epoch": 0.3962693357597816, + "grad_norm": 0.9673042020268607, + "learning_rate": 9.846169207603443e-06, + "loss": 0.1267, + "step": 871 + }, + { + "epoch": 0.39672429481346677, + "grad_norm": 0.6294912489479282, + "learning_rate": 9.845817200182755e-06, + "loss": 0.0588, + "step": 872 + }, + { + "epoch": 0.39717925386715197, + "grad_norm": 0.8477152807126976, + "learning_rate": 9.845464796783767e-06, + "loss": 0.1219, + "step": 873 + }, + { + "epoch": 0.3976342129208371, + "grad_norm": 0.5887483684825674, + "learning_rate": 9.845111997435279e-06, + "loss": 0.0731, + "step": 874 + }, + { + "epoch": 0.3980891719745223, + "grad_norm": 0.5630369277247907, + "learning_rate": 9.844758802166116e-06, + "loss": 0.0579, + "step": 875 + }, + { + "epoch": 0.39854413102820746, + "grad_norm": 0.6717541815357567, + "learning_rate": 9.844405211005145e-06, + "loss": 0.0711, + "step": 876 + }, + { + "epoch": 0.3989990900818926, + "grad_norm": 0.6571828619535791, + "learning_rate": 9.844051223981258e-06, + "loss": 0.0638, + "step": 877 + }, + { + "epoch": 0.3994540491355778, + "grad_norm": 0.6723710552364174, + "learning_rate": 9.84369684112338e-06, + "loss": 0.0676, + "step": 878 + }, + { + "epoch": 0.39990900818926295, + "grad_norm": 0.7014173744195523, + "learning_rate": 9.84334206246047e-06, + "loss": 0.0751, + "step": 879 + }, + { + "epoch": 0.40036396724294815, + "grad_norm": 0.7999660318519703, + "learning_rate": 9.842986888021518e-06, + "loss": 0.0895, + "step": 880 + }, + { + "epoch": 0.4008189262966333, + "grad_norm": 0.5578605501955606, + "learning_rate": 9.842631317835548e-06, + "loss": 0.0637, + "step": 881 + }, + { + "epoch": 0.4012738853503185, + "grad_norm": 0.6615256090849237, + "learning_rate": 9.842275351931617e-06, + "loss": 0.0664, + "step": 882 + }, + { + "epoch": 0.40172884440400364, + "grad_norm": 0.5263094198672195, + "learning_rate": 9.841918990338812e-06, + "loss": 0.0611, + "step": 883 + }, + { + "epoch": 0.4021838034576888, + "grad_norm": 0.8080883575450535, + "learning_rate": 9.841562233086252e-06, + "loss": 0.0912, + "step": 884 + }, + { + "epoch": 0.402638762511374, + "grad_norm": 0.6655757939327012, + "learning_rate": 9.841205080203092e-06, + "loss": 0.0601, + "step": 885 + }, + { + "epoch": 0.4030937215650591, + "grad_norm": 0.8701903481119097, + "learning_rate": 9.840847531718515e-06, + "loss": 0.0914, + "step": 886 + }, + { + "epoch": 0.4035486806187443, + "grad_norm": 0.7730206436987713, + "learning_rate": 9.840489587661738e-06, + "loss": 0.0747, + "step": 887 + }, + { + "epoch": 0.40400363967242947, + "grad_norm": 0.7410839527981146, + "learning_rate": 9.840131248062012e-06, + "loss": 0.079, + "step": 888 + }, + { + "epoch": 0.40445859872611467, + "grad_norm": 0.627620281196765, + "learning_rate": 9.839772512948618e-06, + "loss": 0.0715, + "step": 889 + }, + { + "epoch": 0.4049135577797998, + "grad_norm": 0.8746014124114054, + "learning_rate": 9.83941338235087e-06, + "loss": 0.0824, + "step": 890 + }, + { + "epoch": 0.40536851683348496, + "grad_norm": 1.0112737589697485, + "learning_rate": 9.839053856298116e-06, + "loss": 0.1251, + "step": 891 + }, + { + "epoch": 0.40582347588717016, + "grad_norm": 0.72216805525771, + "learning_rate": 9.838693934819734e-06, + "loss": 0.0893, + "step": 892 + }, + { + "epoch": 0.4062784349408553, + "grad_norm": 0.7544949830136005, + "learning_rate": 9.838333617945134e-06, + "loss": 0.0968, + "step": 893 + }, + { + "epoch": 0.4067333939945405, + "grad_norm": 0.9543024355165705, + "learning_rate": 9.837972905703762e-06, + "loss": 0.102, + "step": 894 + }, + { + "epoch": 0.40718835304822565, + "grad_norm": 1.02061795078975, + "learning_rate": 9.83761179812509e-06, + "loss": 0.0649, + "step": 895 + }, + { + "epoch": 0.40764331210191085, + "grad_norm": 0.39738812842187227, + "learning_rate": 9.837250295238629e-06, + "loss": 0.0428, + "step": 896 + }, + { + "epoch": 0.408098271155596, + "grad_norm": 0.8873895570319217, + "learning_rate": 9.836888397073919e-06, + "loss": 0.1068, + "step": 897 + }, + { + "epoch": 0.40855323020928114, + "grad_norm": 0.7492126364897504, + "learning_rate": 9.836526103660533e-06, + "loss": 0.0953, + "step": 898 + }, + { + "epoch": 0.40900818926296634, + "grad_norm": 0.821575499525911, + "learning_rate": 9.836163415028075e-06, + "loss": 0.0712, + "step": 899 + }, + { + "epoch": 0.4094631483166515, + "grad_norm": 1.0052579979241618, + "learning_rate": 9.835800331206183e-06, + "loss": 0.1138, + "step": 900 + }, + { + "epoch": 0.4099181073703367, + "grad_norm": 0.7848465428804848, + "learning_rate": 9.835436852224525e-06, + "loss": 0.0978, + "step": 901 + }, + { + "epoch": 0.4103730664240218, + "grad_norm": 0.9719856735481065, + "learning_rate": 9.835072978112804e-06, + "loss": 0.0846, + "step": 902 + }, + { + "epoch": 0.410828025477707, + "grad_norm": 0.6607308818506346, + "learning_rate": 9.834708708900755e-06, + "loss": 0.0654, + "step": 903 + }, + { + "epoch": 0.41128298453139217, + "grad_norm": 0.5191597312034261, + "learning_rate": 9.834344044618144e-06, + "loss": 0.0518, + "step": 904 + }, + { + "epoch": 0.41173794358507737, + "grad_norm": 0.5336391872354229, + "learning_rate": 9.83397898529477e-06, + "loss": 0.0535, + "step": 905 + }, + { + "epoch": 0.4121929026387625, + "grad_norm": 0.5687342550017563, + "learning_rate": 9.833613530960462e-06, + "loss": 0.0578, + "step": 906 + }, + { + "epoch": 0.41264786169244766, + "grad_norm": 0.8793783198642894, + "learning_rate": 9.833247681645083e-06, + "loss": 0.1286, + "step": 907 + }, + { + "epoch": 0.41310282074613286, + "grad_norm": 0.8073005899800644, + "learning_rate": 9.832881437378534e-06, + "loss": 0.0853, + "step": 908 + }, + { + "epoch": 0.413557779799818, + "grad_norm": 0.511699500000588, + "learning_rate": 9.832514798190738e-06, + "loss": 0.0504, + "step": 909 + }, + { + "epoch": 0.4140127388535032, + "grad_norm": 0.5082793074725768, + "learning_rate": 9.832147764111655e-06, + "loss": 0.056, + "step": 910 + }, + { + "epoch": 0.41446769790718835, + "grad_norm": 0.9876041013395295, + "learning_rate": 9.83178033517128e-06, + "loss": 0.0984, + "step": 911 + }, + { + "epoch": 0.41492265696087355, + "grad_norm": 0.7511273129930924, + "learning_rate": 9.831412511399633e-06, + "loss": 0.0969, + "step": 912 + }, + { + "epoch": 0.4153776160145587, + "grad_norm": 1.0144870263760433, + "learning_rate": 9.831044292826778e-06, + "loss": 0.1482, + "step": 913 + }, + { + "epoch": 0.41583257506824384, + "grad_norm": 0.70444400073401, + "learning_rate": 9.830675679482797e-06, + "loss": 0.0802, + "step": 914 + }, + { + "epoch": 0.41628753412192904, + "grad_norm": 1.0357251397748677, + "learning_rate": 9.830306671397816e-06, + "loss": 0.1061, + "step": 915 + }, + { + "epoch": 0.4167424931756142, + "grad_norm": 0.895894802940119, + "learning_rate": 9.829937268601988e-06, + "loss": 0.1005, + "step": 916 + }, + { + "epoch": 0.4171974522292994, + "grad_norm": 0.6004589977630954, + "learning_rate": 9.829567471125497e-06, + "loss": 0.0664, + "step": 917 + }, + { + "epoch": 0.4176524112829845, + "grad_norm": 0.6058859475834909, + "learning_rate": 9.829197278998562e-06, + "loss": 0.0728, + "step": 918 + }, + { + "epoch": 0.4181073703366697, + "grad_norm": 0.5886912548442098, + "learning_rate": 9.828826692251435e-06, + "loss": 0.074, + "step": 919 + }, + { + "epoch": 0.41856232939035487, + "grad_norm": 0.5982473215332103, + "learning_rate": 9.828455710914398e-06, + "loss": 0.0653, + "step": 920 + }, + { + "epoch": 0.41901728844404, + "grad_norm": 0.8647804622811079, + "learning_rate": 9.828084335017763e-06, + "loss": 0.0741, + "step": 921 + }, + { + "epoch": 0.4194722474977252, + "grad_norm": 0.653767178815679, + "learning_rate": 9.827712564591883e-06, + "loss": 0.0604, + "step": 922 + }, + { + "epoch": 0.41992720655141036, + "grad_norm": 0.7812500085225947, + "learning_rate": 9.827340399667132e-06, + "loss": 0.0708, + "step": 923 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 0.7314008563711142, + "learning_rate": 9.826967840273921e-06, + "loss": 0.0721, + "step": 924 + }, + { + "epoch": 0.4208371246587807, + "grad_norm": 0.8727413076803472, + "learning_rate": 9.8265948864427e-06, + "loss": 0.0892, + "step": 925 + }, + { + "epoch": 0.4212920837124659, + "grad_norm": 0.6051379056710864, + "learning_rate": 9.826221538203942e-06, + "loss": 0.0685, + "step": 926 + }, + { + "epoch": 0.42174704276615105, + "grad_norm": 0.7279887191787228, + "learning_rate": 9.825847795588154e-06, + "loss": 0.0766, + "step": 927 + }, + { + "epoch": 0.4222020018198362, + "grad_norm": 0.7126811268305303, + "learning_rate": 9.825473658625876e-06, + "loss": 0.0821, + "step": 928 + }, + { + "epoch": 0.4226569608735214, + "grad_norm": 0.8812960827967533, + "learning_rate": 9.825099127347684e-06, + "loss": 0.0982, + "step": 929 + }, + { + "epoch": 0.42311191992720654, + "grad_norm": 0.7462955906438729, + "learning_rate": 9.824724201784182e-06, + "loss": 0.1073, + "step": 930 + }, + { + "epoch": 0.42356687898089174, + "grad_norm": 0.5448066050338419, + "learning_rate": 9.824348881966004e-06, + "loss": 0.0637, + "step": 931 + }, + { + "epoch": 0.4240218380345769, + "grad_norm": 0.7750150802923693, + "learning_rate": 9.823973167923823e-06, + "loss": 0.09, + "step": 932 + }, + { + "epoch": 0.4244767970882621, + "grad_norm": 0.8695175796556455, + "learning_rate": 9.82359705968834e-06, + "loss": 0.0857, + "step": 933 + }, + { + "epoch": 0.4249317561419472, + "grad_norm": 0.653112477618241, + "learning_rate": 9.823220557290289e-06, + "loss": 0.0722, + "step": 934 + }, + { + "epoch": 0.42538671519563237, + "grad_norm": 0.7764742726938813, + "learning_rate": 9.822843660760434e-06, + "loss": 0.0582, + "step": 935 + }, + { + "epoch": 0.42584167424931757, + "grad_norm": 0.8338160462571067, + "learning_rate": 9.822466370129576e-06, + "loss": 0.0993, + "step": 936 + }, + { + "epoch": 0.4262966333030027, + "grad_norm": 0.7416650975880095, + "learning_rate": 9.822088685428543e-06, + "loss": 0.0782, + "step": 937 + }, + { + "epoch": 0.4267515923566879, + "grad_norm": 0.5969422348364739, + "learning_rate": 9.821710606688199e-06, + "loss": 0.0546, + "step": 938 + }, + { + "epoch": 0.42720655141037306, + "grad_norm": 0.6235404067325917, + "learning_rate": 9.82133213393944e-06, + "loss": 0.0638, + "step": 939 + }, + { + "epoch": 0.42766151046405826, + "grad_norm": 0.7910461101358781, + "learning_rate": 9.820953267213194e-06, + "loss": 0.0775, + "step": 940 + }, + { + "epoch": 0.4281164695177434, + "grad_norm": 0.692978452923811, + "learning_rate": 9.820574006540415e-06, + "loss": 0.053, + "step": 941 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.7310389759017597, + "learning_rate": 9.820194351952098e-06, + "loss": 0.0716, + "step": 942 + }, + { + "epoch": 0.42902638762511375, + "grad_norm": 0.6553331509390902, + "learning_rate": 9.819814303479268e-06, + "loss": 0.0612, + "step": 943 + }, + { + "epoch": 0.4294813466787989, + "grad_norm": 1.1310076957610966, + "learning_rate": 9.819433861152978e-06, + "loss": 0.1116, + "step": 944 + }, + { + "epoch": 0.4299363057324841, + "grad_norm": 0.6933766894953944, + "learning_rate": 9.819053025004316e-06, + "loss": 0.0932, + "step": 945 + }, + { + "epoch": 0.43039126478616924, + "grad_norm": 0.7823571557493696, + "learning_rate": 9.818671795064405e-06, + "loss": 0.0847, + "step": 946 + }, + { + "epoch": 0.43084622383985444, + "grad_norm": 0.8000794358590197, + "learning_rate": 9.818290171364396e-06, + "loss": 0.0916, + "step": 947 + }, + { + "epoch": 0.4313011828935396, + "grad_norm": 0.6207042654318157, + "learning_rate": 9.817908153935473e-06, + "loss": 0.0568, + "step": 948 + }, + { + "epoch": 0.4317561419472247, + "grad_norm": 0.7957970680354334, + "learning_rate": 9.817525742808854e-06, + "loss": 0.1203, + "step": 949 + }, + { + "epoch": 0.4322111010009099, + "grad_norm": 0.6607960765057979, + "learning_rate": 9.817142938015786e-06, + "loss": 0.069, + "step": 950 + }, + { + "epoch": 0.43266606005459507, + "grad_norm": 0.8132102265727185, + "learning_rate": 9.816759739587552e-06, + "loss": 0.0821, + "step": 951 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 0.6410149691778323, + "learning_rate": 9.816376147555464e-06, + "loss": 0.0612, + "step": 952 + }, + { + "epoch": 0.4335759781619654, + "grad_norm": 1.0196998859089288, + "learning_rate": 9.815992161950867e-06, + "loss": 0.1183, + "step": 953 + }, + { + "epoch": 0.4340309372156506, + "grad_norm": 0.5899375116434804, + "learning_rate": 9.81560778280514e-06, + "loss": 0.0604, + "step": 954 + }, + { + "epoch": 0.43448589626933576, + "grad_norm": 1.0046158107797931, + "learning_rate": 9.815223010149693e-06, + "loss": 0.0876, + "step": 955 + }, + { + "epoch": 0.4349408553230209, + "grad_norm": 0.7980339738331416, + "learning_rate": 9.814837844015966e-06, + "loss": 0.0894, + "step": 956 + }, + { + "epoch": 0.4353958143767061, + "grad_norm": 0.6974524248281853, + "learning_rate": 9.814452284435433e-06, + "loss": 0.0741, + "step": 957 + }, + { + "epoch": 0.43585077343039125, + "grad_norm": 0.7679692797858835, + "learning_rate": 9.814066331439603e-06, + "loss": 0.0796, + "step": 958 + }, + { + "epoch": 0.43630573248407645, + "grad_norm": 0.8183774417740679, + "learning_rate": 9.813679985060012e-06, + "loss": 0.0963, + "step": 959 + }, + { + "epoch": 0.4367606915377616, + "grad_norm": 0.7950656053104391, + "learning_rate": 9.81329324532823e-06, + "loss": 0.0837, + "step": 960 + }, + { + "epoch": 0.4372156505914468, + "grad_norm": 0.6056809369995887, + "learning_rate": 9.812906112275862e-06, + "loss": 0.0465, + "step": 961 + }, + { + "epoch": 0.43767060964513194, + "grad_norm": 1.0980359635620318, + "learning_rate": 9.81251858593454e-06, + "loss": 0.1206, + "step": 962 + }, + { + "epoch": 0.4381255686988171, + "grad_norm": 0.6123483237764059, + "learning_rate": 9.812130666335933e-06, + "loss": 0.08, + "step": 963 + }, + { + "epoch": 0.4385805277525023, + "grad_norm": 0.8151730014839008, + "learning_rate": 9.81174235351174e-06, + "loss": 0.0983, + "step": 964 + }, + { + "epoch": 0.4390354868061874, + "grad_norm": 0.7143828681073273, + "learning_rate": 9.811353647493691e-06, + "loss": 0.0809, + "step": 965 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 0.5647036962239634, + "learning_rate": 9.810964548313549e-06, + "loss": 0.0581, + "step": 966 + }, + { + "epoch": 0.43994540491355777, + "grad_norm": 0.7594400506736699, + "learning_rate": 9.81057505600311e-06, + "loss": 0.078, + "step": 967 + }, + { + "epoch": 0.44040036396724297, + "grad_norm": 0.6515426202345832, + "learning_rate": 9.810185170594205e-06, + "loss": 0.0688, + "step": 968 + }, + { + "epoch": 0.4408553230209281, + "grad_norm": 0.8798906332352223, + "learning_rate": 9.809794892118687e-06, + "loss": 0.0915, + "step": 969 + }, + { + "epoch": 0.44131028207461326, + "grad_norm": 0.7350866900672135, + "learning_rate": 9.809404220608451e-06, + "loss": 0.0671, + "step": 970 + }, + { + "epoch": 0.44176524112829846, + "grad_norm": 0.7216847217866104, + "learning_rate": 9.809013156095424e-06, + "loss": 0.0726, + "step": 971 + }, + { + "epoch": 0.4422202001819836, + "grad_norm": 0.8179702740752783, + "learning_rate": 9.808621698611557e-06, + "loss": 0.0758, + "step": 972 + }, + { + "epoch": 0.4426751592356688, + "grad_norm": 0.5533105745807706, + "learning_rate": 9.808229848188842e-06, + "loss": 0.0528, + "step": 973 + }, + { + "epoch": 0.44313011828935395, + "grad_norm": 0.7503486538749657, + "learning_rate": 9.807837604859296e-06, + "loss": 0.0878, + "step": 974 + }, + { + "epoch": 0.44358507734303915, + "grad_norm": 0.40510949005498975, + "learning_rate": 9.807444968654975e-06, + "loss": 0.0424, + "step": 975 + }, + { + "epoch": 0.4440400363967243, + "grad_norm": 0.8540666353042626, + "learning_rate": 9.807051939607959e-06, + "loss": 0.1108, + "step": 976 + }, + { + "epoch": 0.44449499545040944, + "grad_norm": 0.7543284179304937, + "learning_rate": 9.806658517750369e-06, + "loss": 0.0719, + "step": 977 + }, + { + "epoch": 0.44494995450409464, + "grad_norm": 0.6982493359241757, + "learning_rate": 9.80626470311435e-06, + "loss": 0.0777, + "step": 978 + }, + { + "epoch": 0.4454049135577798, + "grad_norm": 0.7275511253894157, + "learning_rate": 9.805870495732085e-06, + "loss": 0.0693, + "step": 979 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 0.8647890459895436, + "learning_rate": 9.805475895635787e-06, + "loss": 0.0882, + "step": 980 + }, + { + "epoch": 0.4463148316651501, + "grad_norm": 0.757804762973183, + "learning_rate": 9.8050809028577e-06, + "loss": 0.0724, + "step": 981 + }, + { + "epoch": 0.4467697907188353, + "grad_norm": 0.7515219153063712, + "learning_rate": 9.8046855174301e-06, + "loss": 0.0659, + "step": 982 + }, + { + "epoch": 0.44722474977252047, + "grad_norm": 1.0502681583017184, + "learning_rate": 9.804289739385297e-06, + "loss": 0.1207, + "step": 983 + }, + { + "epoch": 0.44767970882620567, + "grad_norm": 0.5780062486364612, + "learning_rate": 9.803893568755633e-06, + "loss": 0.0772, + "step": 984 + }, + { + "epoch": 0.4481346678798908, + "grad_norm": 0.5515644567052078, + "learning_rate": 9.80349700557348e-06, + "loss": 0.0628, + "step": 985 + }, + { + "epoch": 0.44858962693357596, + "grad_norm": 0.6432677095504179, + "learning_rate": 9.803100049871246e-06, + "loss": 0.0817, + "step": 986 + }, + { + "epoch": 0.44904458598726116, + "grad_norm": 0.5424958391196154, + "learning_rate": 9.802702701681366e-06, + "loss": 0.0649, + "step": 987 + }, + { + "epoch": 0.4494995450409463, + "grad_norm": 0.6556126282036931, + "learning_rate": 9.80230496103631e-06, + "loss": 0.0579, + "step": 988 + }, + { + "epoch": 0.4499545040946315, + "grad_norm": 0.5632646083130022, + "learning_rate": 9.801906827968578e-06, + "loss": 0.0591, + "step": 989 + }, + { + "epoch": 0.45040946314831665, + "grad_norm": 1.0464719217252296, + "learning_rate": 9.801508302510707e-06, + "loss": 0.124, + "step": 990 + }, + { + "epoch": 0.45086442220200185, + "grad_norm": 0.7231067459050019, + "learning_rate": 9.801109384695261e-06, + "loss": 0.0631, + "step": 991 + }, + { + "epoch": 0.451319381255687, + "grad_norm": 0.775594128230074, + "learning_rate": 9.800710074554837e-06, + "loss": 0.0924, + "step": 992 + }, + { + "epoch": 0.45177434030937214, + "grad_norm": 0.6340180385643369, + "learning_rate": 9.800310372122066e-06, + "loss": 0.068, + "step": 993 + }, + { + "epoch": 0.45222929936305734, + "grad_norm": 0.9703750136380557, + "learning_rate": 9.799910277429609e-06, + "loss": 0.0902, + "step": 994 + }, + { + "epoch": 0.4526842584167425, + "grad_norm": 0.5881925827197537, + "learning_rate": 9.79950979051016e-06, + "loss": 0.0662, + "step": 995 + }, + { + "epoch": 0.4531392174704277, + "grad_norm": 0.7583235380843109, + "learning_rate": 9.799108911396446e-06, + "loss": 0.0755, + "step": 996 + }, + { + "epoch": 0.4535941765241128, + "grad_norm": 0.6585135755735663, + "learning_rate": 9.798707640121224e-06, + "loss": 0.0669, + "step": 997 + }, + { + "epoch": 0.454049135577798, + "grad_norm": 0.9344579240939844, + "learning_rate": 9.798305976717286e-06, + "loss": 0.1028, + "step": 998 + }, + { + "epoch": 0.45450409463148317, + "grad_norm": 0.6238360425747993, + "learning_rate": 9.79790392121745e-06, + "loss": 0.0608, + "step": 999 + }, + { + "epoch": 0.4549590536851683, + "grad_norm": 0.715680092291253, + "learning_rate": 9.797501473654573e-06, + "loss": 0.0792, + "step": 1000 + }, + { + "epoch": 0.4554140127388535, + "grad_norm": 0.8167758856821831, + "learning_rate": 9.797098634061543e-06, + "loss": 0.0948, + "step": 1001 + }, + { + "epoch": 0.45586897179253866, + "grad_norm": 0.8318764431867516, + "learning_rate": 9.796695402471275e-06, + "loss": 0.0967, + "step": 1002 + }, + { + "epoch": 0.45632393084622386, + "grad_norm": 0.9700547030363569, + "learning_rate": 9.79629177891672e-06, + "loss": 0.1138, + "step": 1003 + }, + { + "epoch": 0.456778889899909, + "grad_norm": 0.7702596501705347, + "learning_rate": 9.79588776343086e-06, + "loss": 0.0826, + "step": 1004 + }, + { + "epoch": 0.4572338489535942, + "grad_norm": 0.833778163717652, + "learning_rate": 9.795483356046711e-06, + "loss": 0.0927, + "step": 1005 + }, + { + "epoch": 0.45768880800727935, + "grad_norm": 0.7006737675801851, + "learning_rate": 9.795078556797318e-06, + "loss": 0.0747, + "step": 1006 + }, + { + "epoch": 0.4581437670609645, + "grad_norm": 0.8810114143185821, + "learning_rate": 9.794673365715761e-06, + "loss": 0.0921, + "step": 1007 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 0.7286145380478113, + "learning_rate": 9.794267782835148e-06, + "loss": 0.0832, + "step": 1008 + }, + { + "epoch": 0.45905368516833484, + "grad_norm": 0.8181887559127218, + "learning_rate": 9.793861808188622e-06, + "loss": 0.0729, + "step": 1009 + }, + { + "epoch": 0.45950864422202004, + "grad_norm": 1.0821839097582124, + "learning_rate": 9.793455441809359e-06, + "loss": 0.1025, + "step": 1010 + }, + { + "epoch": 0.4599636032757052, + "grad_norm": 0.515896949523265, + "learning_rate": 9.793048683730564e-06, + "loss": 0.0512, + "step": 1011 + }, + { + "epoch": 0.4604185623293904, + "grad_norm": 0.7800604571516774, + "learning_rate": 9.792641533985474e-06, + "loss": 0.1065, + "step": 1012 + }, + { + "epoch": 0.4608735213830755, + "grad_norm": 0.48365424866268936, + "learning_rate": 9.792233992607365e-06, + "loss": 0.0622, + "step": 1013 + }, + { + "epoch": 0.46132848043676067, + "grad_norm": 0.8472876133123602, + "learning_rate": 9.791826059629532e-06, + "loss": 0.0713, + "step": 1014 + }, + { + "epoch": 0.46178343949044587, + "grad_norm": 0.935522534168844, + "learning_rate": 9.791417735085316e-06, + "loss": 0.0853, + "step": 1015 + }, + { + "epoch": 0.462238398544131, + "grad_norm": 0.8028819334602026, + "learning_rate": 9.791009019008078e-06, + "loss": 0.0795, + "step": 1016 + }, + { + "epoch": 0.4626933575978162, + "grad_norm": 0.6458928385673616, + "learning_rate": 9.79059991143122e-06, + "loss": 0.0836, + "step": 1017 + }, + { + "epoch": 0.46314831665150136, + "grad_norm": 0.8309912415690437, + "learning_rate": 9.790190412388173e-06, + "loss": 0.0895, + "step": 1018 + }, + { + "epoch": 0.46360327570518656, + "grad_norm": 0.6953691809158898, + "learning_rate": 9.789780521912396e-06, + "loss": 0.0686, + "step": 1019 + }, + { + "epoch": 0.4640582347588717, + "grad_norm": 0.7563151979586233, + "learning_rate": 9.789370240037385e-06, + "loss": 0.0879, + "step": 1020 + }, + { + "epoch": 0.46451319381255685, + "grad_norm": 0.6646619102460968, + "learning_rate": 9.788959566796667e-06, + "loss": 0.0761, + "step": 1021 + }, + { + "epoch": 0.46496815286624205, + "grad_norm": 0.8092527562913561, + "learning_rate": 9.788548502223801e-06, + "loss": 0.0863, + "step": 1022 + }, + { + "epoch": 0.4654231119199272, + "grad_norm": 2.0284506817542396, + "learning_rate": 9.788137046352374e-06, + "loss": 0.2011, + "step": 1023 + }, + { + "epoch": 0.4658780709736124, + "grad_norm": 0.6524644993097855, + "learning_rate": 9.787725199216011e-06, + "loss": 0.0765, + "step": 1024 + }, + { + "epoch": 0.46633303002729753, + "grad_norm": 0.48134373932870766, + "learning_rate": 9.787312960848368e-06, + "loss": 0.0505, + "step": 1025 + }, + { + "epoch": 0.46678798908098273, + "grad_norm": 0.6646547386252114, + "learning_rate": 9.786900331283128e-06, + "loss": 0.0825, + "step": 1026 + }, + { + "epoch": 0.4672429481346679, + "grad_norm": 0.5655812014606527, + "learning_rate": 9.78648731055401e-06, + "loss": 0.0659, + "step": 1027 + }, + { + "epoch": 0.467697907188353, + "grad_norm": 0.680196435092224, + "learning_rate": 9.786073898694766e-06, + "loss": 0.0734, + "step": 1028 + }, + { + "epoch": 0.4681528662420382, + "grad_norm": 0.6198434008496165, + "learning_rate": 9.785660095739176e-06, + "loss": 0.0687, + "step": 1029 + }, + { + "epoch": 0.46860782529572337, + "grad_norm": 0.5967309034966486, + "learning_rate": 9.785245901721054e-06, + "loss": 0.0443, + "step": 1030 + }, + { + "epoch": 0.46906278434940857, + "grad_norm": 0.588565790719301, + "learning_rate": 9.784831316674246e-06, + "loss": 0.0741, + "step": 1031 + }, + { + "epoch": 0.4695177434030937, + "grad_norm": 0.6384508627867143, + "learning_rate": 9.784416340632634e-06, + "loss": 0.0639, + "step": 1032 + }, + { + "epoch": 0.4699727024567789, + "grad_norm": 0.528980291125106, + "learning_rate": 9.784000973630124e-06, + "loss": 0.0506, + "step": 1033 + }, + { + "epoch": 0.47042766151046406, + "grad_norm": 0.6297922247581061, + "learning_rate": 9.783585215700656e-06, + "loss": 0.0704, + "step": 1034 + }, + { + "epoch": 0.4708826205641492, + "grad_norm": 1.1014615381108162, + "learning_rate": 9.783169066878208e-06, + "loss": 0.1063, + "step": 1035 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 0.7370811970547196, + "learning_rate": 9.782752527196785e-06, + "loss": 0.0888, + "step": 1036 + }, + { + "epoch": 0.47179253867151955, + "grad_norm": 0.6272964856361817, + "learning_rate": 9.782335596690425e-06, + "loss": 0.0683, + "step": 1037 + }, + { + "epoch": 0.47224749772520475, + "grad_norm": 0.9675945822898259, + "learning_rate": 9.781918275393196e-06, + "loss": 0.1031, + "step": 1038 + }, + { + "epoch": 0.4727024567788899, + "grad_norm": 0.8448129794628584, + "learning_rate": 9.781500563339202e-06, + "loss": 0.0818, + "step": 1039 + }, + { + "epoch": 0.4731574158325751, + "grad_norm": 0.5148120993988892, + "learning_rate": 9.781082460562574e-06, + "loss": 0.0525, + "step": 1040 + }, + { + "epoch": 0.47361237488626023, + "grad_norm": 0.7767251927940846, + "learning_rate": 9.780663967097477e-06, + "loss": 0.0869, + "step": 1041 + }, + { + "epoch": 0.4740673339399454, + "grad_norm": 0.9661754574144388, + "learning_rate": 9.780245082978112e-06, + "loss": 0.0923, + "step": 1042 + }, + { + "epoch": 0.4745222929936306, + "grad_norm": 0.780061387882855, + "learning_rate": 9.779825808238705e-06, + "loss": 0.095, + "step": 1043 + }, + { + "epoch": 0.4749772520473157, + "grad_norm": 0.8513172657519864, + "learning_rate": 9.77940614291352e-06, + "loss": 0.0772, + "step": 1044 + }, + { + "epoch": 0.4754322111010009, + "grad_norm": 0.6199453465731616, + "learning_rate": 9.778986087036846e-06, + "loss": 0.0701, + "step": 1045 + }, + { + "epoch": 0.47588717015468607, + "grad_norm": 0.5327629714743946, + "learning_rate": 9.778565640643011e-06, + "loss": 0.0447, + "step": 1046 + }, + { + "epoch": 0.47634212920837127, + "grad_norm": 0.8882337205809296, + "learning_rate": 9.778144803766375e-06, + "loss": 0.0788, + "step": 1047 + }, + { + "epoch": 0.4767970882620564, + "grad_norm": 0.6023343672839219, + "learning_rate": 9.77772357644132e-06, + "loss": 0.0693, + "step": 1048 + }, + { + "epoch": 0.47725204731574156, + "grad_norm": 0.8031515985448552, + "learning_rate": 9.777301958702273e-06, + "loss": 0.0911, + "step": 1049 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 0.8695877166802147, + "learning_rate": 9.776879950583683e-06, + "loss": 0.12, + "step": 1050 + }, + { + "epoch": 0.4781619654231119, + "grad_norm": 0.6077253389668626, + "learning_rate": 9.776457552120034e-06, + "loss": 0.0722, + "step": 1051 + }, + { + "epoch": 0.4786169244767971, + "grad_norm": 0.7976020915977983, + "learning_rate": 9.776034763345845e-06, + "loss": 0.0783, + "step": 1052 + }, + { + "epoch": 0.47907188353048225, + "grad_norm": 0.7091049596783572, + "learning_rate": 9.775611584295663e-06, + "loss": 0.0739, + "step": 1053 + }, + { + "epoch": 0.47952684258416745, + "grad_norm": 0.7919907245184465, + "learning_rate": 9.775188015004072e-06, + "loss": 0.0728, + "step": 1054 + }, + { + "epoch": 0.4799818016378526, + "grad_norm": 0.9227645018819045, + "learning_rate": 9.774764055505676e-06, + "loss": 0.0905, + "step": 1055 + }, + { + "epoch": 0.48043676069153773, + "grad_norm": 0.7130315690029604, + "learning_rate": 9.774339705835127e-06, + "loss": 0.09, + "step": 1056 + }, + { + "epoch": 0.48089171974522293, + "grad_norm": 0.7993270676292756, + "learning_rate": 9.773914966027098e-06, + "loss": 0.1011, + "step": 1057 + }, + { + "epoch": 0.4813466787989081, + "grad_norm": 0.8955668988276211, + "learning_rate": 9.773489836116297e-06, + "loss": 0.0963, + "step": 1058 + }, + { + "epoch": 0.4818016378525933, + "grad_norm": 0.7582155580680914, + "learning_rate": 9.773064316137464e-06, + "loss": 0.0766, + "step": 1059 + }, + { + "epoch": 0.4822565969062784, + "grad_norm": 0.6939955066308027, + "learning_rate": 9.772638406125367e-06, + "loss": 0.0687, + "step": 1060 + }, + { + "epoch": 0.4827115559599636, + "grad_norm": 0.8091635860789653, + "learning_rate": 9.772212106114816e-06, + "loss": 0.0754, + "step": 1061 + }, + { + "epoch": 0.48316651501364877, + "grad_norm": 0.8236012040739623, + "learning_rate": 9.77178541614064e-06, + "loss": 0.0951, + "step": 1062 + }, + { + "epoch": 0.48362147406733397, + "grad_norm": 0.6622501946117725, + "learning_rate": 9.77135833623771e-06, + "loss": 0.083, + "step": 1063 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 0.8689743387052602, + "learning_rate": 9.770930866440927e-06, + "loss": 0.1074, + "step": 1064 + }, + { + "epoch": 0.48453139217470426, + "grad_norm": 0.6733750246744147, + "learning_rate": 9.770503006785214e-06, + "loss": 0.0639, + "step": 1065 + }, + { + "epoch": 0.48498635122838946, + "grad_norm": 0.9485233745498586, + "learning_rate": 9.770074757305541e-06, + "loss": 0.1106, + "step": 1066 + }, + { + "epoch": 0.4854413102820746, + "grad_norm": 0.8288392949652397, + "learning_rate": 9.769646118036902e-06, + "loss": 0.0661, + "step": 1067 + }, + { + "epoch": 0.4858962693357598, + "grad_norm": 0.7475423805914638, + "learning_rate": 9.76921708901432e-06, + "loss": 0.0686, + "step": 1068 + }, + { + "epoch": 0.48635122838944495, + "grad_norm": 0.54120364671088, + "learning_rate": 9.768787670272855e-06, + "loss": 0.0629, + "step": 1069 + }, + { + "epoch": 0.48680618744313015, + "grad_norm": 0.7281619635509152, + "learning_rate": 9.768357861847598e-06, + "loss": 0.0723, + "step": 1070 + }, + { + "epoch": 0.4872611464968153, + "grad_norm": 0.8883321717067604, + "learning_rate": 9.767927663773668e-06, + "loss": 0.0832, + "step": 1071 + }, + { + "epoch": 0.48771610555050043, + "grad_norm": 0.7681469789077073, + "learning_rate": 9.767497076086223e-06, + "loss": 0.0786, + "step": 1072 + }, + { + "epoch": 0.48817106460418563, + "grad_norm": 0.6590861395931087, + "learning_rate": 9.767066098820446e-06, + "loss": 0.0704, + "step": 1073 + }, + { + "epoch": 0.4886260236578708, + "grad_norm": 0.7944203702948146, + "learning_rate": 9.766634732011557e-06, + "loss": 0.0867, + "step": 1074 + }, + { + "epoch": 0.489080982711556, + "grad_norm": 0.7832480468570255, + "learning_rate": 9.766202975694801e-06, + "loss": 0.0873, + "step": 1075 + }, + { + "epoch": 0.4895359417652411, + "grad_norm": 0.7232266679451883, + "learning_rate": 9.765770829905464e-06, + "loss": 0.0785, + "step": 1076 + }, + { + "epoch": 0.4899909008189263, + "grad_norm": 0.5406798309730716, + "learning_rate": 9.765338294678856e-06, + "loss": 0.0469, + "step": 1077 + }, + { + "epoch": 0.49044585987261147, + "grad_norm": 0.5866548164219128, + "learning_rate": 9.764905370050321e-06, + "loss": 0.0524, + "step": 1078 + }, + { + "epoch": 0.4909008189262966, + "grad_norm": 0.9915720236606885, + "learning_rate": 9.76447205605524e-06, + "loss": 0.1019, + "step": 1079 + }, + { + "epoch": 0.4913557779799818, + "grad_norm": 0.6838845303274752, + "learning_rate": 9.764038352729018e-06, + "loss": 0.0891, + "step": 1080 + }, + { + "epoch": 0.49181073703366696, + "grad_norm": 0.9385660559352969, + "learning_rate": 9.763604260107096e-06, + "loss": 0.1058, + "step": 1081 + }, + { + "epoch": 0.49226569608735216, + "grad_norm": 0.6710872617569944, + "learning_rate": 9.763169778224946e-06, + "loss": 0.0665, + "step": 1082 + }, + { + "epoch": 0.4927206551410373, + "grad_norm": 0.7878885609137168, + "learning_rate": 9.762734907118072e-06, + "loss": 0.0876, + "step": 1083 + }, + { + "epoch": 0.4931756141947225, + "grad_norm": 0.6302166766090778, + "learning_rate": 9.76229964682201e-06, + "loss": 0.0507, + "step": 1084 + }, + { + "epoch": 0.49363057324840764, + "grad_norm": 0.5833462678864086, + "learning_rate": 9.761863997372325e-06, + "loss": 0.0612, + "step": 1085 + }, + { + "epoch": 0.4940855323020928, + "grad_norm": 1.036522158484448, + "learning_rate": 9.761427958804621e-06, + "loss": 0.1395, + "step": 1086 + }, + { + "epoch": 0.494540491355778, + "grad_norm": 1.1502320115946314, + "learning_rate": 9.760991531154526e-06, + "loss": 0.1149, + "step": 1087 + }, + { + "epoch": 0.49499545040946313, + "grad_norm": 0.7616054217825209, + "learning_rate": 9.760554714457704e-06, + "loss": 0.0684, + "step": 1088 + }, + { + "epoch": 0.49545040946314833, + "grad_norm": 0.5129309167340426, + "learning_rate": 9.760117508749846e-06, + "loss": 0.0614, + "step": 1089 + }, + { + "epoch": 0.4959053685168335, + "grad_norm": 0.7147170789642256, + "learning_rate": 9.759679914066686e-06, + "loss": 0.0842, + "step": 1090 + }, + { + "epoch": 0.4963603275705187, + "grad_norm": 0.7513123367978354, + "learning_rate": 9.759241930443975e-06, + "loss": 0.0749, + "step": 1091 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 0.5462870672862663, + "learning_rate": 9.75880355791751e-06, + "loss": 0.0588, + "step": 1092 + }, + { + "epoch": 0.49727024567788897, + "grad_norm": 0.6158644897786469, + "learning_rate": 9.758364796523105e-06, + "loss": 0.0578, + "step": 1093 + }, + { + "epoch": 0.49772520473157417, + "grad_norm": 0.5248367448810554, + "learning_rate": 9.757925646296617e-06, + "loss": 0.0504, + "step": 1094 + }, + { + "epoch": 0.4981801637852593, + "grad_norm": 0.7801307646100064, + "learning_rate": 9.757486107273935e-06, + "loss": 0.0819, + "step": 1095 + }, + { + "epoch": 0.4986351228389445, + "grad_norm": 0.6822936325355138, + "learning_rate": 9.75704617949097e-06, + "loss": 0.0828, + "step": 1096 + }, + { + "epoch": 0.49909008189262966, + "grad_norm": 0.49379397863131413, + "learning_rate": 9.756605862983675e-06, + "loss": 0.0606, + "step": 1097 + }, + { + "epoch": 0.49954504094631486, + "grad_norm": 0.5236513133369656, + "learning_rate": 9.756165157788029e-06, + "loss": 0.0493, + "step": 1098 + }, + { + "epoch": 0.5, + "grad_norm": 0.7323812225903658, + "learning_rate": 9.755724063940047e-06, + "loss": 0.0794, + "step": 1099 + }, + { + "epoch": 0.5004549590536852, + "grad_norm": 0.853156508842135, + "learning_rate": 9.755282581475769e-06, + "loss": 0.08, + "step": 1100 + }, + { + "epoch": 0.5009099181073703, + "grad_norm": 0.7117091061791435, + "learning_rate": 9.754840710431274e-06, + "loss": 0.0773, + "step": 1101 + }, + { + "epoch": 0.5013648771610555, + "grad_norm": 0.9350752111669145, + "learning_rate": 9.754398450842668e-06, + "loss": 0.1046, + "step": 1102 + }, + { + "epoch": 0.5018198362147407, + "grad_norm": 0.8834833642233855, + "learning_rate": 9.753955802746091e-06, + "loss": 0.1284, + "step": 1103 + }, + { + "epoch": 0.5022747952684259, + "grad_norm": 0.9022387216275947, + "learning_rate": 9.753512766177717e-06, + "loss": 0.0898, + "step": 1104 + }, + { + "epoch": 0.502729754322111, + "grad_norm": 0.551248880180483, + "learning_rate": 9.753069341173745e-06, + "loss": 0.0596, + "step": 1105 + }, + { + "epoch": 0.5031847133757962, + "grad_norm": 0.5970423480352659, + "learning_rate": 9.752625527770409e-06, + "loss": 0.0723, + "step": 1106 + }, + { + "epoch": 0.5036396724294814, + "grad_norm": 0.7620108531589319, + "learning_rate": 9.75218132600398e-06, + "loss": 0.0856, + "step": 1107 + }, + { + "epoch": 0.5040946314831665, + "grad_norm": 0.7720887684681512, + "learning_rate": 9.751736735910753e-06, + "loss": 0.0904, + "step": 1108 + }, + { + "epoch": 0.5045495905368517, + "grad_norm": 0.8672659681858957, + "learning_rate": 9.75129175752706e-06, + "loss": 0.1043, + "step": 1109 + }, + { + "epoch": 0.5050045495905369, + "grad_norm": 0.7511079874116621, + "learning_rate": 9.75084639088926e-06, + "loss": 0.0719, + "step": 1110 + }, + { + "epoch": 0.5054595086442221, + "grad_norm": 0.7442062138473109, + "learning_rate": 9.750400636033746e-06, + "loss": 0.0805, + "step": 1111 + }, + { + "epoch": 0.5059144676979072, + "grad_norm": 0.716157443156474, + "learning_rate": 9.749954492996947e-06, + "loss": 0.0902, + "step": 1112 + }, + { + "epoch": 0.5063694267515924, + "grad_norm": 0.7655895172099163, + "learning_rate": 9.749507961815317e-06, + "loss": 0.0973, + "step": 1113 + }, + { + "epoch": 0.5068243858052776, + "grad_norm": 0.6288294239038802, + "learning_rate": 9.749061042525343e-06, + "loss": 0.0646, + "step": 1114 + }, + { + "epoch": 0.5072793448589626, + "grad_norm": 0.6709452216437115, + "learning_rate": 9.74861373516355e-06, + "loss": 0.0717, + "step": 1115 + }, + { + "epoch": 0.5077343039126478, + "grad_norm": 0.6522838269502338, + "learning_rate": 9.748166039766484e-06, + "loss": 0.0475, + "step": 1116 + }, + { + "epoch": 0.508189262966333, + "grad_norm": 0.7999784990978867, + "learning_rate": 9.747717956370735e-06, + "loss": 0.0925, + "step": 1117 + }, + { + "epoch": 0.5086442220200182, + "grad_norm": 1.0917998243863505, + "learning_rate": 9.747269485012913e-06, + "loss": 0.1293, + "step": 1118 + }, + { + "epoch": 0.5090991810737033, + "grad_norm": 0.7636715530766439, + "learning_rate": 9.746820625729667e-06, + "loss": 0.0774, + "step": 1119 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 0.6701230428761437, + "learning_rate": 9.746371378557677e-06, + "loss": 0.0623, + "step": 1120 + }, + { + "epoch": 0.5100090991810737, + "grad_norm": 0.972334707766994, + "learning_rate": 9.745921743533653e-06, + "loss": 0.113, + "step": 1121 + }, + { + "epoch": 0.5104640582347588, + "grad_norm": 0.6630727679984025, + "learning_rate": 9.745471720694335e-06, + "loss": 0.0828, + "step": 1122 + }, + { + "epoch": 0.510919017288444, + "grad_norm": 0.8798279960192045, + "learning_rate": 9.745021310076498e-06, + "loss": 0.0772, + "step": 1123 + }, + { + "epoch": 0.5113739763421292, + "grad_norm": 0.6337737332675445, + "learning_rate": 9.744570511716952e-06, + "loss": 0.0805, + "step": 1124 + }, + { + "epoch": 0.5118289353958144, + "grad_norm": 0.9171053674032225, + "learning_rate": 9.744119325652526e-06, + "loss": 0.0901, + "step": 1125 + }, + { + "epoch": 0.5122838944494995, + "grad_norm": 0.7437420002919692, + "learning_rate": 9.743667751920093e-06, + "loss": 0.0789, + "step": 1126 + }, + { + "epoch": 0.5127388535031847, + "grad_norm": 0.692440215965907, + "learning_rate": 9.743215790556556e-06, + "loss": 0.0885, + "step": 1127 + }, + { + "epoch": 0.5131938125568699, + "grad_norm": 0.5830998661595514, + "learning_rate": 9.742763441598841e-06, + "loss": 0.0571, + "step": 1128 + }, + { + "epoch": 0.513648771610555, + "grad_norm": 0.7409283851806759, + "learning_rate": 9.742310705083919e-06, + "loss": 0.0819, + "step": 1129 + }, + { + "epoch": 0.5141037306642402, + "grad_norm": 0.6329559817029019, + "learning_rate": 9.74185758104878e-06, + "loss": 0.0732, + "step": 1130 + }, + { + "epoch": 0.5145586897179254, + "grad_norm": 0.47102788261692413, + "learning_rate": 9.741404069530455e-06, + "loss": 0.0496, + "step": 1131 + }, + { + "epoch": 0.5150136487716106, + "grad_norm": 0.7193278988032876, + "learning_rate": 9.740950170566002e-06, + "loss": 0.0797, + "step": 1132 + }, + { + "epoch": 0.5154686078252957, + "grad_norm": 0.7827454423152818, + "learning_rate": 9.740495884192509e-06, + "loss": 0.0863, + "step": 1133 + }, + { + "epoch": 0.5159235668789809, + "grad_norm": 0.5187125000260286, + "learning_rate": 9.740041210447101e-06, + "loss": 0.048, + "step": 1134 + }, + { + "epoch": 0.5163785259326661, + "grad_norm": 0.7621657915309645, + "learning_rate": 9.739586149366932e-06, + "loss": 0.076, + "step": 1135 + }, + { + "epoch": 0.5168334849863512, + "grad_norm": 1.0691498364952807, + "learning_rate": 9.739130700989185e-06, + "loss": 0.1085, + "step": 1136 + }, + { + "epoch": 0.5172884440400364, + "grad_norm": 1.126943089011516, + "learning_rate": 9.738674865351081e-06, + "loss": 0.1197, + "step": 1137 + }, + { + "epoch": 0.5177434030937216, + "grad_norm": 0.5967935472543325, + "learning_rate": 9.738218642489864e-06, + "loss": 0.0715, + "step": 1138 + }, + { + "epoch": 0.5181983621474068, + "grad_norm": 0.6520369417533736, + "learning_rate": 9.73776203244282e-06, + "loss": 0.0812, + "step": 1139 + }, + { + "epoch": 0.5186533212010919, + "grad_norm": 0.6923655317783546, + "learning_rate": 9.737305035247258e-06, + "loss": 0.0607, + "step": 1140 + }, + { + "epoch": 0.5191082802547771, + "grad_norm": 0.5971267035932937, + "learning_rate": 9.73684765094052e-06, + "loss": 0.0597, + "step": 1141 + }, + { + "epoch": 0.5195632393084623, + "grad_norm": 0.6102979031011873, + "learning_rate": 9.736389879559984e-06, + "loss": 0.0464, + "step": 1142 + }, + { + "epoch": 0.5200181983621474, + "grad_norm": 0.5971210330968472, + "learning_rate": 9.735931721143058e-06, + "loss": 0.0674, + "step": 1143 + }, + { + "epoch": 0.5204731574158326, + "grad_norm": 0.9014574419537533, + "learning_rate": 9.735473175727178e-06, + "loss": 0.1071, + "step": 1144 + }, + { + "epoch": 0.5209281164695178, + "grad_norm": 1.024240239778721, + "learning_rate": 9.735014243349814e-06, + "loss": 0.1058, + "step": 1145 + }, + { + "epoch": 0.521383075523203, + "grad_norm": 0.740240244958144, + "learning_rate": 9.73455492404847e-06, + "loss": 0.0716, + "step": 1146 + }, + { + "epoch": 0.521838034576888, + "grad_norm": 0.8552793125149327, + "learning_rate": 9.734095217860679e-06, + "loss": 0.1116, + "step": 1147 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 0.8388846880500271, + "learning_rate": 9.733635124824007e-06, + "loss": 0.1195, + "step": 1148 + }, + { + "epoch": 0.5227479526842584, + "grad_norm": 0.7476616795889469, + "learning_rate": 9.733174644976047e-06, + "loss": 0.0982, + "step": 1149 + }, + { + "epoch": 0.5232029117379435, + "grad_norm": 1.247104578949049, + "learning_rate": 9.732713778354431e-06, + "loss": 0.1339, + "step": 1150 + }, + { + "epoch": 0.5236578707916287, + "grad_norm": 0.8127429979477634, + "learning_rate": 9.732252524996818e-06, + "loss": 0.0994, + "step": 1151 + }, + { + "epoch": 0.5241128298453139, + "grad_norm": 1.1678300434583342, + "learning_rate": 9.731790884940899e-06, + "loss": 0.1152, + "step": 1152 + }, + { + "epoch": 0.5245677888989991, + "grad_norm": 0.5209287069427062, + "learning_rate": 9.731328858224398e-06, + "loss": 0.0546, + "step": 1153 + }, + { + "epoch": 0.5250227479526842, + "grad_norm": 0.8363023252623251, + "learning_rate": 9.730866444885069e-06, + "loss": 0.0894, + "step": 1154 + }, + { + "epoch": 0.5254777070063694, + "grad_norm": 0.8202924553152645, + "learning_rate": 9.730403644960697e-06, + "loss": 0.0914, + "step": 1155 + }, + { + "epoch": 0.5259326660600546, + "grad_norm": 0.4900409376406188, + "learning_rate": 9.729940458489105e-06, + "loss": 0.0454, + "step": 1156 + }, + { + "epoch": 0.5263876251137397, + "grad_norm": 0.5631225499534328, + "learning_rate": 9.729476885508136e-06, + "loss": 0.0542, + "step": 1157 + }, + { + "epoch": 0.5268425841674249, + "grad_norm": 0.566596895824316, + "learning_rate": 9.729012926055674e-06, + "loss": 0.0625, + "step": 1158 + }, + { + "epoch": 0.5272975432211101, + "grad_norm": 0.9035766920121469, + "learning_rate": 9.728548580169632e-06, + "loss": 0.1013, + "step": 1159 + }, + { + "epoch": 0.5277525022747953, + "grad_norm": 0.8241016260766749, + "learning_rate": 9.728083847887955e-06, + "loss": 0.078, + "step": 1160 + }, + { + "epoch": 0.5282074613284804, + "grad_norm": 0.7435557294319748, + "learning_rate": 9.727618729248617e-06, + "loss": 0.0864, + "step": 1161 + }, + { + "epoch": 0.5286624203821656, + "grad_norm": 0.6611375262646607, + "learning_rate": 9.727153224289627e-06, + "loss": 0.0769, + "step": 1162 + }, + { + "epoch": 0.5291173794358508, + "grad_norm": 0.8275931946782299, + "learning_rate": 9.726687333049024e-06, + "loss": 0.0889, + "step": 1163 + }, + { + "epoch": 0.5295723384895359, + "grad_norm": 1.057751919756087, + "learning_rate": 9.726221055564874e-06, + "loss": 0.0851, + "step": 1164 + }, + { + "epoch": 0.5300272975432211, + "grad_norm": 0.7884543920060787, + "learning_rate": 9.725754391875287e-06, + "loss": 0.0746, + "step": 1165 + }, + { + "epoch": 0.5304822565969063, + "grad_norm": 0.8593529313000522, + "learning_rate": 9.72528734201839e-06, + "loss": 0.0828, + "step": 1166 + }, + { + "epoch": 0.5309372156505915, + "grad_norm": 0.5225417485901063, + "learning_rate": 9.72481990603235e-06, + "loss": 0.0794, + "step": 1167 + }, + { + "epoch": 0.5313921747042766, + "grad_norm": 0.8820660720540598, + "learning_rate": 9.724352083955366e-06, + "loss": 0.1059, + "step": 1168 + }, + { + "epoch": 0.5318471337579618, + "grad_norm": 0.6775105748188827, + "learning_rate": 9.723883875825664e-06, + "loss": 0.079, + "step": 1169 + }, + { + "epoch": 0.532302092811647, + "grad_norm": 0.5969175177573056, + "learning_rate": 9.723415281681505e-06, + "loss": 0.061, + "step": 1170 + }, + { + "epoch": 0.5327570518653321, + "grad_norm": 0.7165111743049339, + "learning_rate": 9.722946301561179e-06, + "loss": 0.0824, + "step": 1171 + }, + { + "epoch": 0.5332120109190173, + "grad_norm": 0.7771351455478163, + "learning_rate": 9.722476935503011e-06, + "loss": 0.0936, + "step": 1172 + }, + { + "epoch": 0.5336669699727025, + "grad_norm": 0.5612071801020553, + "learning_rate": 9.722007183545353e-06, + "loss": 0.0584, + "step": 1173 + }, + { + "epoch": 0.5341219290263877, + "grad_norm": 0.7630759308283642, + "learning_rate": 9.721537045726594e-06, + "loss": 0.0711, + "step": 1174 + }, + { + "epoch": 0.5345768880800728, + "grad_norm": 0.7415951616336062, + "learning_rate": 9.721066522085148e-06, + "loss": 0.0786, + "step": 1175 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 0.6697058559185771, + "learning_rate": 9.720595612659467e-06, + "loss": 0.0943, + "step": 1176 + }, + { + "epoch": 0.5354868061874432, + "grad_norm": 0.8294561042543531, + "learning_rate": 9.720124317488031e-06, + "loss": 0.0766, + "step": 1177 + }, + { + "epoch": 0.5359417652411284, + "grad_norm": 0.8069252663248169, + "learning_rate": 9.719652636609351e-06, + "loss": 0.1036, + "step": 1178 + }, + { + "epoch": 0.5363967242948134, + "grad_norm": 0.5216393236723873, + "learning_rate": 9.719180570061973e-06, + "loss": 0.0681, + "step": 1179 + }, + { + "epoch": 0.5368516833484986, + "grad_norm": 0.7561882785891234, + "learning_rate": 9.718708117884468e-06, + "loss": 0.0888, + "step": 1180 + }, + { + "epoch": 0.5373066424021838, + "grad_norm": 0.7101886443887773, + "learning_rate": 9.718235280115446e-06, + "loss": 0.0841, + "step": 1181 + }, + { + "epoch": 0.5377616014558689, + "grad_norm": 0.93883085852681, + "learning_rate": 9.717762056793545e-06, + "loss": 0.1116, + "step": 1182 + }, + { + "epoch": 0.5382165605095541, + "grad_norm": 0.8029318164759022, + "learning_rate": 9.717288447957433e-06, + "loss": 0.0817, + "step": 1183 + }, + { + "epoch": 0.5386715195632393, + "grad_norm": 0.7189629467174897, + "learning_rate": 9.716814453645811e-06, + "loss": 0.0913, + "step": 1184 + }, + { + "epoch": 0.5391264786169245, + "grad_norm": 0.6194922793353296, + "learning_rate": 9.716340073897414e-06, + "loss": 0.073, + "step": 1185 + }, + { + "epoch": 0.5395814376706096, + "grad_norm": 0.5862599296496694, + "learning_rate": 9.715865308751006e-06, + "loss": 0.0599, + "step": 1186 + }, + { + "epoch": 0.5400363967242948, + "grad_norm": 1.0638863826866105, + "learning_rate": 9.715390158245381e-06, + "loss": 0.1412, + "step": 1187 + }, + { + "epoch": 0.54049135577798, + "grad_norm": 0.6031416289368001, + "learning_rate": 9.714914622419367e-06, + "loss": 0.0694, + "step": 1188 + }, + { + "epoch": 0.5409463148316651, + "grad_norm": 0.5762096954254395, + "learning_rate": 9.714438701311822e-06, + "loss": 0.0627, + "step": 1189 + }, + { + "epoch": 0.5414012738853503, + "grad_norm": 0.6077021479661606, + "learning_rate": 9.713962394961636e-06, + "loss": 0.067, + "step": 1190 + }, + { + "epoch": 0.5418562329390355, + "grad_norm": 0.5381873559759192, + "learning_rate": 9.713485703407732e-06, + "loss": 0.0595, + "step": 1191 + }, + { + "epoch": 0.5423111919927207, + "grad_norm": 0.7866618609648011, + "learning_rate": 9.713008626689063e-06, + "loss": 0.1064, + "step": 1192 + }, + { + "epoch": 0.5427661510464058, + "grad_norm": 0.7100862231154079, + "learning_rate": 9.712531164844611e-06, + "loss": 0.07, + "step": 1193 + }, + { + "epoch": 0.543221110100091, + "grad_norm": 0.5579932774059501, + "learning_rate": 9.712053317913394e-06, + "loss": 0.0525, + "step": 1194 + }, + { + "epoch": 0.5436760691537762, + "grad_norm": 0.5454543895601387, + "learning_rate": 9.711575085934459e-06, + "loss": 0.0741, + "step": 1195 + }, + { + "epoch": 0.5441310282074613, + "grad_norm": 0.6754854519258514, + "learning_rate": 9.711096468946888e-06, + "loss": 0.101, + "step": 1196 + }, + { + "epoch": 0.5445859872611465, + "grad_norm": 0.8125002765504534, + "learning_rate": 9.710617466989787e-06, + "loss": 0.0937, + "step": 1197 + }, + { + "epoch": 0.5450409463148317, + "grad_norm": 0.5893498973936582, + "learning_rate": 9.710138080102298e-06, + "loss": 0.0658, + "step": 1198 + }, + { + "epoch": 0.5454959053685169, + "grad_norm": 0.8107633297228217, + "learning_rate": 9.709658308323597e-06, + "loss": 0.0955, + "step": 1199 + }, + { + "epoch": 0.545950864422202, + "grad_norm": 0.6726060122769176, + "learning_rate": 9.70917815169289e-06, + "loss": 0.084, + "step": 1200 + }, + { + "epoch": 0.5464058234758872, + "grad_norm": 0.6077011277694447, + "learning_rate": 9.708697610249407e-06, + "loss": 0.0756, + "step": 1201 + }, + { + "epoch": 0.5468607825295724, + "grad_norm": 0.7073007110523803, + "learning_rate": 9.70821668403242e-06, + "loss": 0.0818, + "step": 1202 + }, + { + "epoch": 0.5473157415832575, + "grad_norm": 0.9420816064988972, + "learning_rate": 9.707735373081231e-06, + "loss": 0.1197, + "step": 1203 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 0.552138579735494, + "learning_rate": 9.707253677435165e-06, + "loss": 0.0594, + "step": 1204 + }, + { + "epoch": 0.5482256596906279, + "grad_norm": 0.6375758502862188, + "learning_rate": 9.706771597133587e-06, + "loss": 0.0572, + "step": 1205 + }, + { + "epoch": 0.5486806187443131, + "grad_norm": 0.6581691945271008, + "learning_rate": 9.706289132215889e-06, + "loss": 0.0707, + "step": 1206 + }, + { + "epoch": 0.5491355777979982, + "grad_norm": 0.820106985355047, + "learning_rate": 9.705806282721498e-06, + "loss": 0.0865, + "step": 1207 + }, + { + "epoch": 0.5495905368516834, + "grad_norm": 0.5258555939105785, + "learning_rate": 9.705323048689866e-06, + "loss": 0.0462, + "step": 1208 + }, + { + "epoch": 0.5500454959053686, + "grad_norm": 0.7818892498713288, + "learning_rate": 9.704839430160487e-06, + "loss": 0.1005, + "step": 1209 + }, + { + "epoch": 0.5505004549590536, + "grad_norm": 0.6371281646305975, + "learning_rate": 9.704355427172874e-06, + "loss": 0.0712, + "step": 1210 + }, + { + "epoch": 0.5509554140127388, + "grad_norm": 0.5981165031558572, + "learning_rate": 9.70387103976658e-06, + "loss": 0.0669, + "step": 1211 + }, + { + "epoch": 0.551410373066424, + "grad_norm": 0.640233382171881, + "learning_rate": 9.703386267981188e-06, + "loss": 0.0629, + "step": 1212 + }, + { + "epoch": 0.5518653321201092, + "grad_norm": 0.5436666812285462, + "learning_rate": 9.70290111185631e-06, + "loss": 0.0527, + "step": 1213 + }, + { + "epoch": 0.5523202911737943, + "grad_norm": 0.9264418893677014, + "learning_rate": 9.702415571431594e-06, + "loss": 0.1392, + "step": 1214 + }, + { + "epoch": 0.5527752502274795, + "grad_norm": 0.6659444469982292, + "learning_rate": 9.70192964674671e-06, + "loss": 0.0948, + "step": 1215 + }, + { + "epoch": 0.5532302092811647, + "grad_norm": 0.5526163080676849, + "learning_rate": 9.70144333784137e-06, + "loss": 0.0661, + "step": 1216 + }, + { + "epoch": 0.5536851683348498, + "grad_norm": 0.7994476768514381, + "learning_rate": 9.700956644755313e-06, + "loss": 0.0966, + "step": 1217 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 0.7919884013199107, + "learning_rate": 9.700469567528307e-06, + "loss": 0.1082, + "step": 1218 + }, + { + "epoch": 0.5545950864422202, + "grad_norm": 0.7366932972024113, + "learning_rate": 9.699982106200155e-06, + "loss": 0.0841, + "step": 1219 + }, + { + "epoch": 0.5550500454959054, + "grad_norm": 0.8558659635343526, + "learning_rate": 9.699494260810692e-06, + "loss": 0.0866, + "step": 1220 + }, + { + "epoch": 0.5555050045495905, + "grad_norm": 0.8060928626360002, + "learning_rate": 9.699006031399779e-06, + "loss": 0.0777, + "step": 1221 + }, + { + "epoch": 0.5559599636032757, + "grad_norm": 0.6914626835020681, + "learning_rate": 9.698517418007314e-06, + "loss": 0.0775, + "step": 1222 + }, + { + "epoch": 0.5564149226569609, + "grad_norm": 0.8706739684427142, + "learning_rate": 9.698028420673224e-06, + "loss": 0.0984, + "step": 1223 + }, + { + "epoch": 0.556869881710646, + "grad_norm": 0.7863016327992207, + "learning_rate": 9.697539039437468e-06, + "loss": 0.1118, + "step": 1224 + }, + { + "epoch": 0.5573248407643312, + "grad_norm": 0.7719453440565228, + "learning_rate": 9.697049274340036e-06, + "loss": 0.0824, + "step": 1225 + }, + { + "epoch": 0.5577797998180164, + "grad_norm": 1.1509899845731206, + "learning_rate": 9.696559125420949e-06, + "loss": 0.1254, + "step": 1226 + }, + { + "epoch": 0.5582347588717016, + "grad_norm": 0.5202193771917482, + "learning_rate": 9.696068592720257e-06, + "loss": 0.0538, + "step": 1227 + }, + { + "epoch": 0.5586897179253867, + "grad_norm": 0.5880633286090164, + "learning_rate": 9.69557767627805e-06, + "loss": 0.0711, + "step": 1228 + }, + { + "epoch": 0.5591446769790719, + "grad_norm": 0.6342846572654288, + "learning_rate": 9.695086376134438e-06, + "loss": 0.0671, + "step": 1229 + }, + { + "epoch": 0.5595996360327571, + "grad_norm": 0.7541651906429654, + "learning_rate": 9.694594692329571e-06, + "loss": 0.0813, + "step": 1230 + }, + { + "epoch": 0.5600545950864422, + "grad_norm": 0.6416731945433944, + "learning_rate": 9.694102624903627e-06, + "loss": 0.0733, + "step": 1231 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 1.0012992796464886, + "learning_rate": 9.693610173896815e-06, + "loss": 0.096, + "step": 1232 + }, + { + "epoch": 0.5609645131938126, + "grad_norm": 0.725396699259508, + "learning_rate": 9.693117339349376e-06, + "loss": 0.0665, + "step": 1233 + }, + { + "epoch": 0.5614194722474978, + "grad_norm": 0.7481457641805567, + "learning_rate": 9.692624121301581e-06, + "loss": 0.0715, + "step": 1234 + }, + { + "epoch": 0.5618744313011829, + "grad_norm": 0.969766282604155, + "learning_rate": 9.692130519793734e-06, + "loss": 0.0991, + "step": 1235 + }, + { + "epoch": 0.5623293903548681, + "grad_norm": 0.8522169509206354, + "learning_rate": 9.691636534866172e-06, + "loss": 0.1025, + "step": 1236 + }, + { + "epoch": 0.5627843494085533, + "grad_norm": 0.7682304561659135, + "learning_rate": 9.691142166559259e-06, + "loss": 0.0846, + "step": 1237 + }, + { + "epoch": 0.5632393084622384, + "grad_norm": 0.5495617218791536, + "learning_rate": 9.690647414913392e-06, + "loss": 0.0766, + "step": 1238 + }, + { + "epoch": 0.5636942675159236, + "grad_norm": 0.6826816911759014, + "learning_rate": 9.690152279969003e-06, + "loss": 0.0729, + "step": 1239 + }, + { + "epoch": 0.5641492265696088, + "grad_norm": 0.8352406959674302, + "learning_rate": 9.689656761766548e-06, + "loss": 0.0896, + "step": 1240 + }, + { + "epoch": 0.564604185623294, + "grad_norm": 0.5908696548320724, + "learning_rate": 9.689160860346522e-06, + "loss": 0.0753, + "step": 1241 + }, + { + "epoch": 0.565059144676979, + "grad_norm": 0.4283914528398344, + "learning_rate": 9.688664575749447e-06, + "loss": 0.0414, + "step": 1242 + }, + { + "epoch": 0.5655141037306642, + "grad_norm": 0.6584468440229382, + "learning_rate": 9.688167908015877e-06, + "loss": 0.0733, + "step": 1243 + }, + { + "epoch": 0.5659690627843494, + "grad_norm": 0.9211218848648471, + "learning_rate": 9.687670857186396e-06, + "loss": 0.1171, + "step": 1244 + }, + { + "epoch": 0.5664240218380345, + "grad_norm": 0.9250852893692096, + "learning_rate": 9.68717342330162e-06, + "loss": 0.1061, + "step": 1245 + }, + { + "epoch": 0.5668789808917197, + "grad_norm": 0.8688266055790496, + "learning_rate": 9.686675606402203e-06, + "loss": 0.1213, + "step": 1246 + }, + { + "epoch": 0.5673339399454049, + "grad_norm": 0.7110325678190088, + "learning_rate": 9.686177406528819e-06, + "loss": 0.0836, + "step": 1247 + }, + { + "epoch": 0.5677888989990901, + "grad_norm": 0.8260984800022192, + "learning_rate": 9.685678823722178e-06, + "loss": 0.0907, + "step": 1248 + }, + { + "epoch": 0.5682438580527752, + "grad_norm": 0.6625042460625208, + "learning_rate": 9.685179858023026e-06, + "loss": 0.0777, + "step": 1249 + }, + { + "epoch": 0.5686988171064604, + "grad_norm": 0.711324638729454, + "learning_rate": 9.684680509472133e-06, + "loss": 0.0815, + "step": 1250 + }, + { + "epoch": 0.5691537761601456, + "grad_norm": 0.6863010294874783, + "learning_rate": 9.684180778110306e-06, + "loss": 0.0642, + "step": 1251 + }, + { + "epoch": 0.5696087352138307, + "grad_norm": 0.5978880624303593, + "learning_rate": 9.683680663978377e-06, + "loss": 0.065, + "step": 1252 + }, + { + "epoch": 0.5700636942675159, + "grad_norm": 0.6322068932784428, + "learning_rate": 9.683180167117216e-06, + "loss": 0.0681, + "step": 1253 + }, + { + "epoch": 0.5705186533212011, + "grad_norm": 0.7826720403434554, + "learning_rate": 9.682679287567722e-06, + "loss": 0.0881, + "step": 1254 + }, + { + "epoch": 0.5709736123748863, + "grad_norm": 0.794807695787425, + "learning_rate": 9.682178025370824e-06, + "loss": 0.1118, + "step": 1255 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7050268620804678, + "learning_rate": 9.681676380567482e-06, + "loss": 0.0839, + "step": 1256 + }, + { + "epoch": 0.5718835304822566, + "grad_norm": 0.5581694578677082, + "learning_rate": 9.681174353198687e-06, + "loss": 0.0482, + "step": 1257 + }, + { + "epoch": 0.5723384895359418, + "grad_norm": 0.6766600070725707, + "learning_rate": 9.680671943305465e-06, + "loss": 0.0679, + "step": 1258 + }, + { + "epoch": 0.5727934485896269, + "grad_norm": 0.6995276308642288, + "learning_rate": 9.680169150928868e-06, + "loss": 0.0823, + "step": 1259 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 0.6008334474427011, + "learning_rate": 9.679665976109985e-06, + "loss": 0.0669, + "step": 1260 + }, + { + "epoch": 0.5737033666969973, + "grad_norm": 0.6951316344905618, + "learning_rate": 9.679162418889932e-06, + "loss": 0.0644, + "step": 1261 + }, + { + "epoch": 0.5741583257506825, + "grad_norm": 0.7661270676130627, + "learning_rate": 9.678658479309854e-06, + "loss": 0.0837, + "step": 1262 + }, + { + "epoch": 0.5746132848043676, + "grad_norm": 0.7593531327031607, + "learning_rate": 9.678154157410937e-06, + "loss": 0.0646, + "step": 1263 + }, + { + "epoch": 0.5750682438580528, + "grad_norm": 0.7824619403016152, + "learning_rate": 9.677649453234388e-06, + "loss": 0.0907, + "step": 1264 + }, + { + "epoch": 0.575523202911738, + "grad_norm": 0.8187746029529864, + "learning_rate": 9.67714436682145e-06, + "loss": 0.0906, + "step": 1265 + }, + { + "epoch": 0.5759781619654231, + "grad_norm": 0.7676559233650921, + "learning_rate": 9.676638898213394e-06, + "loss": 0.0839, + "step": 1266 + }, + { + "epoch": 0.5764331210191083, + "grad_norm": 0.5944493207466681, + "learning_rate": 9.676133047451528e-06, + "loss": 0.0588, + "step": 1267 + }, + { + "epoch": 0.5768880800727935, + "grad_norm": 0.6734586229257056, + "learning_rate": 9.675626814577188e-06, + "loss": 0.0804, + "step": 1268 + }, + { + "epoch": 0.5773430391264787, + "grad_norm": 0.6315388478681175, + "learning_rate": 9.675120199631738e-06, + "loss": 0.0636, + "step": 1269 + }, + { + "epoch": 0.5777979981801638, + "grad_norm": 0.7252277920198784, + "learning_rate": 9.674613202656577e-06, + "loss": 0.0842, + "step": 1270 + }, + { + "epoch": 0.578252957233849, + "grad_norm": 0.58556718084403, + "learning_rate": 9.674105823693139e-06, + "loss": 0.0764, + "step": 1271 + }, + { + "epoch": 0.5787079162875342, + "grad_norm": 0.7635901125586164, + "learning_rate": 9.673598062782878e-06, + "loss": 0.0907, + "step": 1272 + }, + { + "epoch": 0.5791628753412192, + "grad_norm": 0.33852379656119563, + "learning_rate": 9.67308991996729e-06, + "loss": 0.0387, + "step": 1273 + }, + { + "epoch": 0.5796178343949044, + "grad_norm": 0.8984557509320932, + "learning_rate": 9.672581395287897e-06, + "loss": 0.0969, + "step": 1274 + }, + { + "epoch": 0.5800727934485896, + "grad_norm": 0.881696210059407, + "learning_rate": 9.672072488786254e-06, + "loss": 0.115, + "step": 1275 + }, + { + "epoch": 0.5805277525022748, + "grad_norm": 0.805394208652388, + "learning_rate": 9.671563200503947e-06, + "loss": 0.0916, + "step": 1276 + }, + { + "epoch": 0.5809827115559599, + "grad_norm": 0.5947193670178038, + "learning_rate": 9.67105353048259e-06, + "loss": 0.0645, + "step": 1277 + }, + { + "epoch": 0.5814376706096451, + "grad_norm": 0.9345719582841384, + "learning_rate": 9.670543478763834e-06, + "loss": 0.0853, + "step": 1278 + }, + { + "epoch": 0.5818926296633303, + "grad_norm": 0.46822310121822047, + "learning_rate": 9.670033045389356e-06, + "loss": 0.06, + "step": 1279 + }, + { + "epoch": 0.5823475887170154, + "grad_norm": 0.882335352298928, + "learning_rate": 9.669522230400868e-06, + "loss": 0.1288, + "step": 1280 + }, + { + "epoch": 0.5828025477707006, + "grad_norm": 0.7155876804587362, + "learning_rate": 9.66901103384011e-06, + "loss": 0.0923, + "step": 1281 + }, + { + "epoch": 0.5832575068243858, + "grad_norm": 0.758339057709363, + "learning_rate": 9.668499455748857e-06, + "loss": 0.0866, + "step": 1282 + }, + { + "epoch": 0.583712465878071, + "grad_norm": 0.5929990208040478, + "learning_rate": 9.66798749616891e-06, + "loss": 0.0571, + "step": 1283 + }, + { + "epoch": 0.5841674249317561, + "grad_norm": 0.5486564328594907, + "learning_rate": 9.667475155142104e-06, + "loss": 0.0551, + "step": 1284 + }, + { + "epoch": 0.5846223839854413, + "grad_norm": 0.6958253493282612, + "learning_rate": 9.666962432710307e-06, + "loss": 0.0731, + "step": 1285 + }, + { + "epoch": 0.5850773430391265, + "grad_norm": 1.1984701204529857, + "learning_rate": 9.666449328915418e-06, + "loss": 0.1248, + "step": 1286 + }, + { + "epoch": 0.5855323020928116, + "grad_norm": 1.07466414021835, + "learning_rate": 9.66593584379936e-06, + "loss": 0.0969, + "step": 1287 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 0.7365065558485686, + "learning_rate": 9.6654219774041e-06, + "loss": 0.0768, + "step": 1288 + }, + { + "epoch": 0.586442220200182, + "grad_norm": 0.7278778525375763, + "learning_rate": 9.664907729771622e-06, + "loss": 0.0931, + "step": 1289 + }, + { + "epoch": 0.5868971792538672, + "grad_norm": 0.6940342908894654, + "learning_rate": 9.664393100943951e-06, + "loss": 0.0716, + "step": 1290 + }, + { + "epoch": 0.5873521383075523, + "grad_norm": 0.7046475563496115, + "learning_rate": 9.663878090963142e-06, + "loss": 0.0833, + "step": 1291 + }, + { + "epoch": 0.5878070973612375, + "grad_norm": 0.6554863862272154, + "learning_rate": 9.663362699871275e-06, + "loss": 0.0705, + "step": 1292 + }, + { + "epoch": 0.5882620564149227, + "grad_norm": 0.610296786595235, + "learning_rate": 9.66284692771047e-06, + "loss": 0.0592, + "step": 1293 + }, + { + "epoch": 0.5887170154686078, + "grad_norm": 0.6866815075031769, + "learning_rate": 9.662330774522869e-06, + "loss": 0.0748, + "step": 1294 + }, + { + "epoch": 0.589171974522293, + "grad_norm": 0.5654106713312388, + "learning_rate": 9.661814240350653e-06, + "loss": 0.0546, + "step": 1295 + }, + { + "epoch": 0.5896269335759782, + "grad_norm": 1.271034489401823, + "learning_rate": 9.66129732523603e-06, + "loss": 0.1473, + "step": 1296 + }, + { + "epoch": 0.5900818926296634, + "grad_norm": 0.45734781465896296, + "learning_rate": 9.66078002922124e-06, + "loss": 0.0452, + "step": 1297 + }, + { + "epoch": 0.5905368516833485, + "grad_norm": 0.8001910391102482, + "learning_rate": 9.660262352348553e-06, + "loss": 0.0801, + "step": 1298 + }, + { + "epoch": 0.5909918107370337, + "grad_norm": 0.8095822615697389, + "learning_rate": 9.659744294660272e-06, + "loss": 0.0851, + "step": 1299 + }, + { + "epoch": 0.5914467697907189, + "grad_norm": 0.6222175915293906, + "learning_rate": 9.659225856198732e-06, + "loss": 0.0725, + "step": 1300 + }, + { + "epoch": 0.591901728844404, + "grad_norm": 0.5098172411498206, + "learning_rate": 9.658707037006294e-06, + "loss": 0.0586, + "step": 1301 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 0.5056342525545805, + "learning_rate": 9.658187837125357e-06, + "loss": 0.0552, + "step": 1302 + }, + { + "epoch": 0.5928116469517744, + "grad_norm": 0.8298114087640572, + "learning_rate": 9.657668256598347e-06, + "loss": 0.0976, + "step": 1303 + }, + { + "epoch": 0.5932666060054596, + "grad_norm": 0.9354418819253106, + "learning_rate": 9.657148295467719e-06, + "loss": 0.1128, + "step": 1304 + }, + { + "epoch": 0.5937215650591446, + "grad_norm": 0.732222390896743, + "learning_rate": 9.656627953775964e-06, + "loss": 0.0719, + "step": 1305 + }, + { + "epoch": 0.5941765241128298, + "grad_norm": 0.817074061431315, + "learning_rate": 9.6561072315656e-06, + "loss": 0.097, + "step": 1306 + }, + { + "epoch": 0.594631483166515, + "grad_norm": 0.6993010225350191, + "learning_rate": 9.655586128879185e-06, + "loss": 0.0866, + "step": 1307 + }, + { + "epoch": 0.5950864422202001, + "grad_norm": 0.6036033167422408, + "learning_rate": 9.655064645759291e-06, + "loss": 0.0615, + "step": 1308 + }, + { + "epoch": 0.5955414012738853, + "grad_norm": 0.4333029170805267, + "learning_rate": 9.654542782248539e-06, + "loss": 0.0333, + "step": 1309 + }, + { + "epoch": 0.5959963603275705, + "grad_norm": 0.5158856954901245, + "learning_rate": 9.65402053838957e-06, + "loss": 0.0534, + "step": 1310 + }, + { + "epoch": 0.5964513193812557, + "grad_norm": 0.8439407413306237, + "learning_rate": 9.653497914225059e-06, + "loss": 0.0886, + "step": 1311 + }, + { + "epoch": 0.5969062784349408, + "grad_norm": 1.097335021441692, + "learning_rate": 9.652974909797714e-06, + "loss": 0.1184, + "step": 1312 + }, + { + "epoch": 0.597361237488626, + "grad_norm": 0.6552117042192046, + "learning_rate": 9.652451525150272e-06, + "loss": 0.0719, + "step": 1313 + }, + { + "epoch": 0.5978161965423112, + "grad_norm": 0.6353863518066384, + "learning_rate": 9.651927760325504e-06, + "loss": 0.0696, + "step": 1314 + }, + { + "epoch": 0.5982711555959963, + "grad_norm": 0.9048456403488727, + "learning_rate": 9.651403615366204e-06, + "loss": 0.0859, + "step": 1315 + }, + { + "epoch": 0.5987261146496815, + "grad_norm": 0.7176841695337582, + "learning_rate": 9.650879090315207e-06, + "loss": 0.0821, + "step": 1316 + }, + { + "epoch": 0.5991810737033667, + "grad_norm": 0.696539124420045, + "learning_rate": 9.650354185215374e-06, + "loss": 0.0875, + "step": 1317 + }, + { + "epoch": 0.5996360327570519, + "grad_norm": 0.5924500205612657, + "learning_rate": 9.649828900109599e-06, + "loss": 0.0646, + "step": 1318 + }, + { + "epoch": 0.600090991810737, + "grad_norm": 0.5430407542910594, + "learning_rate": 9.649303235040803e-06, + "loss": 0.0486, + "step": 1319 + }, + { + "epoch": 0.6005459508644222, + "grad_norm": 0.6459813862779727, + "learning_rate": 9.648777190051944e-06, + "loss": 0.0903, + "step": 1320 + }, + { + "epoch": 0.6010009099181074, + "grad_norm": 0.6531397749427512, + "learning_rate": 9.648250765186006e-06, + "loss": 0.0638, + "step": 1321 + }, + { + "epoch": 0.6014558689717925, + "grad_norm": 0.6616813941465042, + "learning_rate": 9.647723960486006e-06, + "loss": 0.0861, + "step": 1322 + }, + { + "epoch": 0.6019108280254777, + "grad_norm": 0.8426003399558685, + "learning_rate": 9.647196775994995e-06, + "loss": 0.0928, + "step": 1323 + }, + { + "epoch": 0.6023657870791629, + "grad_norm": 0.6908471872127779, + "learning_rate": 9.646669211756049e-06, + "loss": 0.064, + "step": 1324 + }, + { + "epoch": 0.6028207461328481, + "grad_norm": 0.6969433310817453, + "learning_rate": 9.64614126781228e-06, + "loss": 0.0683, + "step": 1325 + }, + { + "epoch": 0.6032757051865332, + "grad_norm": 0.7506047981065134, + "learning_rate": 9.645612944206826e-06, + "loss": 0.0849, + "step": 1326 + }, + { + "epoch": 0.6037306642402184, + "grad_norm": 0.5624997977779479, + "learning_rate": 9.645084240982862e-06, + "loss": 0.064, + "step": 1327 + }, + { + "epoch": 0.6041856232939036, + "grad_norm": 0.43671100502349636, + "learning_rate": 9.644555158183592e-06, + "loss": 0.0615, + "step": 1328 + }, + { + "epoch": 0.6046405823475887, + "grad_norm": 0.553762280713577, + "learning_rate": 9.64402569585225e-06, + "loss": 0.0596, + "step": 1329 + }, + { + "epoch": 0.6050955414012739, + "grad_norm": 0.6580653378362663, + "learning_rate": 9.643495854032099e-06, + "loss": 0.0558, + "step": 1330 + }, + { + "epoch": 0.6055505004549591, + "grad_norm": 0.7656128172437318, + "learning_rate": 9.642965632766437e-06, + "loss": 0.0915, + "step": 1331 + }, + { + "epoch": 0.6060054595086443, + "grad_norm": 0.49008300515141723, + "learning_rate": 9.642435032098591e-06, + "loss": 0.0553, + "step": 1332 + }, + { + "epoch": 0.6064604185623294, + "grad_norm": 0.6058179105933948, + "learning_rate": 9.64190405207192e-06, + "loss": 0.0709, + "step": 1333 + }, + { + "epoch": 0.6069153776160146, + "grad_norm": 0.6707142568108124, + "learning_rate": 9.641372692729811e-06, + "loss": 0.0715, + "step": 1334 + }, + { + "epoch": 0.6073703366696998, + "grad_norm": 0.8710319334113071, + "learning_rate": 9.640840954115686e-06, + "loss": 0.091, + "step": 1335 + }, + { + "epoch": 0.607825295723385, + "grad_norm": 0.7496993600003082, + "learning_rate": 9.640308836272996e-06, + "loss": 0.0932, + "step": 1336 + }, + { + "epoch": 0.60828025477707, + "grad_norm": 0.9684583450547241, + "learning_rate": 9.639776339245225e-06, + "loss": 0.087, + "step": 1337 + }, + { + "epoch": 0.6087352138307552, + "grad_norm": 0.7857186962980957, + "learning_rate": 9.639243463075884e-06, + "loss": 0.1084, + "step": 1338 + }, + { + "epoch": 0.6091901728844404, + "grad_norm": 1.1677743182021476, + "learning_rate": 9.638710207808518e-06, + "loss": 0.0712, + "step": 1339 + }, + { + "epoch": 0.6096451319381255, + "grad_norm": 0.725604064535932, + "learning_rate": 9.6381765734867e-06, + "loss": 0.077, + "step": 1340 + }, + { + "epoch": 0.6101000909918107, + "grad_norm": 0.5923782964843433, + "learning_rate": 9.63764256015404e-06, + "loss": 0.0641, + "step": 1341 + }, + { + "epoch": 0.6105550500454959, + "grad_norm": 0.7069177546563966, + "learning_rate": 9.637108167854173e-06, + "loss": 0.0747, + "step": 1342 + }, + { + "epoch": 0.6110100090991811, + "grad_norm": 0.780384533965345, + "learning_rate": 9.636573396630767e-06, + "loss": 0.0709, + "step": 1343 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 0.7305821703239879, + "learning_rate": 9.636038246527523e-06, + "loss": 0.0955, + "step": 1344 + }, + { + "epoch": 0.6119199272065514, + "grad_norm": 0.6274215993935015, + "learning_rate": 9.635502717588168e-06, + "loss": 0.0656, + "step": 1345 + }, + { + "epoch": 0.6123748862602366, + "grad_norm": 0.6018866737558257, + "learning_rate": 9.634966809856465e-06, + "loss": 0.0729, + "step": 1346 + }, + { + "epoch": 0.6128298453139217, + "grad_norm": 0.9406786913650838, + "learning_rate": 9.634430523376207e-06, + "loss": 0.1105, + "step": 1347 + }, + { + "epoch": 0.6132848043676069, + "grad_norm": 0.6910930219074588, + "learning_rate": 9.633893858191214e-06, + "loss": 0.0652, + "step": 1348 + }, + { + "epoch": 0.6137397634212921, + "grad_norm": 0.6641071332456526, + "learning_rate": 9.633356814345342e-06, + "loss": 0.0896, + "step": 1349 + }, + { + "epoch": 0.6141947224749773, + "grad_norm": 0.6463461735454817, + "learning_rate": 9.632819391882475e-06, + "loss": 0.0691, + "step": 1350 + }, + { + "epoch": 0.6146496815286624, + "grad_norm": 0.6570738741447356, + "learning_rate": 9.63228159084653e-06, + "loss": 0.0726, + "step": 1351 + }, + { + "epoch": 0.6151046405823476, + "grad_norm": 0.9251372605740943, + "learning_rate": 9.631743411281451e-06, + "loss": 0.1089, + "step": 1352 + }, + { + "epoch": 0.6155595996360328, + "grad_norm": 1.0354136522724409, + "learning_rate": 9.631204853231219e-06, + "loss": 0.1065, + "step": 1353 + }, + { + "epoch": 0.6160145586897179, + "grad_norm": 0.7577345531084587, + "learning_rate": 9.630665916739839e-06, + "loss": 0.083, + "step": 1354 + }, + { + "epoch": 0.6164695177434031, + "grad_norm": 0.6775679844485006, + "learning_rate": 9.630126601851353e-06, + "loss": 0.065, + "step": 1355 + }, + { + "epoch": 0.6169244767970883, + "grad_norm": 0.6510409015870585, + "learning_rate": 9.62958690860983e-06, + "loss": 0.0842, + "step": 1356 + }, + { + "epoch": 0.6173794358507735, + "grad_norm": 0.6541401291987898, + "learning_rate": 9.629046837059373e-06, + "loss": 0.0809, + "step": 1357 + }, + { + "epoch": 0.6178343949044586, + "grad_norm": 0.6773644747284383, + "learning_rate": 9.628506387244111e-06, + "loss": 0.08, + "step": 1358 + }, + { + "epoch": 0.6182893539581438, + "grad_norm": 0.7401243921784199, + "learning_rate": 9.627965559208212e-06, + "loss": 0.0632, + "step": 1359 + }, + { + "epoch": 0.618744313011829, + "grad_norm": 0.6255731586329286, + "learning_rate": 9.627424352995866e-06, + "loss": 0.0836, + "step": 1360 + }, + { + "epoch": 0.6191992720655141, + "grad_norm": 0.8684189032240879, + "learning_rate": 9.626882768651298e-06, + "loss": 0.0918, + "step": 1361 + }, + { + "epoch": 0.6196542311191993, + "grad_norm": 0.5565014005760545, + "learning_rate": 9.626340806218765e-06, + "loss": 0.0508, + "step": 1362 + }, + { + "epoch": 0.6201091901728845, + "grad_norm": 0.580066419485805, + "learning_rate": 9.625798465742555e-06, + "loss": 0.0691, + "step": 1363 + }, + { + "epoch": 0.6205641492265697, + "grad_norm": 0.5980127746625918, + "learning_rate": 9.625255747266984e-06, + "loss": 0.0674, + "step": 1364 + }, + { + "epoch": 0.6210191082802548, + "grad_norm": 0.8518146992949526, + "learning_rate": 9.6247126508364e-06, + "loss": 0.1112, + "step": 1365 + }, + { + "epoch": 0.62147406733394, + "grad_norm": 0.8485700961520207, + "learning_rate": 9.624169176495185e-06, + "loss": 0.0966, + "step": 1366 + }, + { + "epoch": 0.6219290263876252, + "grad_norm": 0.9962639418238284, + "learning_rate": 9.623625324287747e-06, + "loss": 0.1047, + "step": 1367 + }, + { + "epoch": 0.6223839854413102, + "grad_norm": 0.7706385402975253, + "learning_rate": 9.623081094258527e-06, + "loss": 0.1229, + "step": 1368 + }, + { + "epoch": 0.6228389444949954, + "grad_norm": 0.9185957443221413, + "learning_rate": 9.622536486451997e-06, + "loss": 0.0981, + "step": 1369 + }, + { + "epoch": 0.6232939035486806, + "grad_norm": 0.5737112203779396, + "learning_rate": 9.621991500912662e-06, + "loss": 0.0615, + "step": 1370 + }, + { + "epoch": 0.6237488626023658, + "grad_norm": 0.8225187377418599, + "learning_rate": 9.621446137685051e-06, + "loss": 0.1032, + "step": 1371 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 0.911993563924521, + "learning_rate": 9.620900396813734e-06, + "loss": 0.1052, + "step": 1372 + }, + { + "epoch": 0.6246587807097361, + "grad_norm": 1.1969877300226637, + "learning_rate": 9.620354278343306e-06, + "loss": 0.1323, + "step": 1373 + }, + { + "epoch": 0.6251137397634213, + "grad_norm": 0.49674299728731663, + "learning_rate": 9.61980778231839e-06, + "loss": 0.0469, + "step": 1374 + }, + { + "epoch": 0.6255686988171064, + "grad_norm": 0.9419790098064809, + "learning_rate": 9.619260908783645e-06, + "loss": 0.0829, + "step": 1375 + }, + { + "epoch": 0.6260236578707916, + "grad_norm": 0.8648992102518269, + "learning_rate": 9.61871365778376e-06, + "loss": 0.1227, + "step": 1376 + }, + { + "epoch": 0.6264786169244768, + "grad_norm": 0.6855921150752273, + "learning_rate": 9.618166029363452e-06, + "loss": 0.0893, + "step": 1377 + }, + { + "epoch": 0.626933575978162, + "grad_norm": 0.7460350385490577, + "learning_rate": 9.61761802356747e-06, + "loss": 0.1029, + "step": 1378 + }, + { + "epoch": 0.6273885350318471, + "grad_norm": 0.6238948896650269, + "learning_rate": 9.617069640440598e-06, + "loss": 0.0671, + "step": 1379 + }, + { + "epoch": 0.6278434940855323, + "grad_norm": 0.8484782740935036, + "learning_rate": 9.616520880027645e-06, + "loss": 0.1094, + "step": 1380 + }, + { + "epoch": 0.6282984531392175, + "grad_norm": 0.4929008515621752, + "learning_rate": 9.615971742373453e-06, + "loss": 0.0621, + "step": 1381 + }, + { + "epoch": 0.6287534121929026, + "grad_norm": 0.8230508842215047, + "learning_rate": 9.615422227522897e-06, + "loss": 0.0873, + "step": 1382 + }, + { + "epoch": 0.6292083712465878, + "grad_norm": 0.8269677617343545, + "learning_rate": 9.614872335520879e-06, + "loss": 0.0996, + "step": 1383 + }, + { + "epoch": 0.629663330300273, + "grad_norm": 0.7039938726965704, + "learning_rate": 9.614322066412335e-06, + "loss": 0.084, + "step": 1384 + }, + { + "epoch": 0.6301182893539582, + "grad_norm": 0.7376546247757936, + "learning_rate": 9.613771420242229e-06, + "loss": 0.0857, + "step": 1385 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 0.6736142636267153, + "learning_rate": 9.613220397055558e-06, + "loss": 0.0732, + "step": 1386 + }, + { + "epoch": 0.6310282074613285, + "grad_norm": 0.7476942520500481, + "learning_rate": 9.612668996897351e-06, + "loss": 0.0713, + "step": 1387 + }, + { + "epoch": 0.6314831665150137, + "grad_norm": 0.7359465201312233, + "learning_rate": 9.612117219812662e-06, + "loss": 0.0847, + "step": 1388 + }, + { + "epoch": 0.6319381255686988, + "grad_norm": 0.9663363466846744, + "learning_rate": 9.611565065846583e-06, + "loss": 0.1015, + "step": 1389 + }, + { + "epoch": 0.632393084622384, + "grad_norm": 0.7893446645403931, + "learning_rate": 9.611012535044232e-06, + "loss": 0.0983, + "step": 1390 + }, + { + "epoch": 0.6328480436760692, + "grad_norm": 1.024989133088754, + "learning_rate": 9.61045962745076e-06, + "loss": 0.1102, + "step": 1391 + }, + { + "epoch": 0.6333030027297544, + "grad_norm": 0.4979683651622851, + "learning_rate": 9.609906343111348e-06, + "loss": 0.0586, + "step": 1392 + }, + { + "epoch": 0.6337579617834395, + "grad_norm": 1.1009002383858189, + "learning_rate": 9.609352682071209e-06, + "loss": 0.0963, + "step": 1393 + }, + { + "epoch": 0.6342129208371247, + "grad_norm": 1.0522149389130615, + "learning_rate": 9.608798644375583e-06, + "loss": 0.1189, + "step": 1394 + }, + { + "epoch": 0.6346678798908099, + "grad_norm": 0.9812979427333788, + "learning_rate": 9.608244230069745e-06, + "loss": 0.1216, + "step": 1395 + }, + { + "epoch": 0.635122838944495, + "grad_norm": 0.7352050689297358, + "learning_rate": 9.607689439199e-06, + "loss": 0.0875, + "step": 1396 + }, + { + "epoch": 0.6355777979981801, + "grad_norm": 0.8346962373874338, + "learning_rate": 9.60713427180868e-06, + "loss": 0.0872, + "step": 1397 + }, + { + "epoch": 0.6360327570518653, + "grad_norm": 0.9100484302304894, + "learning_rate": 9.606578727944156e-06, + "loss": 0.1014, + "step": 1398 + }, + { + "epoch": 0.6364877161055505, + "grad_norm": 0.6397054531308819, + "learning_rate": 9.606022807650819e-06, + "loss": 0.0661, + "step": 1399 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 0.7013671405977515, + "learning_rate": 9.6054665109741e-06, + "loss": 0.0788, + "step": 1400 + }, + { + "epoch": 0.6373976342129208, + "grad_norm": 0.7177935827049716, + "learning_rate": 9.604909837959456e-06, + "loss": 0.0739, + "step": 1401 + }, + { + "epoch": 0.637852593266606, + "grad_norm": 1.0034339624615456, + "learning_rate": 9.604352788652375e-06, + "loss": 0.125, + "step": 1402 + }, + { + "epoch": 0.6383075523202911, + "grad_norm": 0.7908500695821505, + "learning_rate": 9.603795363098377e-06, + "loss": 0.0626, + "step": 1403 + }, + { + "epoch": 0.6387625113739763, + "grad_norm": 0.7396845097003291, + "learning_rate": 9.603237561343013e-06, + "loss": 0.0845, + "step": 1404 + }, + { + "epoch": 0.6392174704276615, + "grad_norm": 0.6132031146325181, + "learning_rate": 9.602679383431864e-06, + "loss": 0.0832, + "step": 1405 + }, + { + "epoch": 0.6396724294813467, + "grad_norm": 0.5848815265706712, + "learning_rate": 9.602120829410539e-06, + "loss": 0.0609, + "step": 1406 + }, + { + "epoch": 0.6401273885350318, + "grad_norm": 1.1396916096380878, + "learning_rate": 9.601561899324685e-06, + "loss": 0.089, + "step": 1407 + }, + { + "epoch": 0.640582347588717, + "grad_norm": 0.6243784477376835, + "learning_rate": 9.601002593219972e-06, + "loss": 0.0629, + "step": 1408 + }, + { + "epoch": 0.6410373066424022, + "grad_norm": 0.7693306930944409, + "learning_rate": 9.600442911142107e-06, + "loss": 0.0975, + "step": 1409 + }, + { + "epoch": 0.6414922656960873, + "grad_norm": 0.5824222441008058, + "learning_rate": 9.599882853136821e-06, + "loss": 0.0668, + "step": 1410 + }, + { + "epoch": 0.6419472247497725, + "grad_norm": 0.7486427214965261, + "learning_rate": 9.59932241924988e-06, + "loss": 0.0885, + "step": 1411 + }, + { + "epoch": 0.6424021838034577, + "grad_norm": 0.7403442425812181, + "learning_rate": 9.598761609527084e-06, + "loss": 0.0764, + "step": 1412 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.8444168000337251, + "learning_rate": 9.598200424014255e-06, + "loss": 0.0901, + "step": 1413 + }, + { + "epoch": 0.643312101910828, + "grad_norm": 0.6214870203253012, + "learning_rate": 9.597638862757255e-06, + "loss": 0.0641, + "step": 1414 + }, + { + "epoch": 0.6437670609645132, + "grad_norm": 0.45639812216740483, + "learning_rate": 9.597076925801967e-06, + "loss": 0.0525, + "step": 1415 + }, + { + "epoch": 0.6442220200181984, + "grad_norm": 0.5879645013041995, + "learning_rate": 9.596514613194313e-06, + "loss": 0.0664, + "step": 1416 + }, + { + "epoch": 0.6446769790718835, + "grad_norm": 0.723485890557837, + "learning_rate": 9.595951924980245e-06, + "loss": 0.0878, + "step": 1417 + }, + { + "epoch": 0.6451319381255687, + "grad_norm": 0.49190939142236517, + "learning_rate": 9.595388861205738e-06, + "loss": 0.0446, + "step": 1418 + }, + { + "epoch": 0.6455868971792539, + "grad_norm": 0.8244975390610266, + "learning_rate": 9.59482542191681e-06, + "loss": 0.0927, + "step": 1419 + }, + { + "epoch": 0.6460418562329391, + "grad_norm": 0.8365340393723969, + "learning_rate": 9.594261607159494e-06, + "loss": 0.0944, + "step": 1420 + }, + { + "epoch": 0.6464968152866242, + "grad_norm": 0.9246231982112141, + "learning_rate": 9.59369741697987e-06, + "loss": 0.1132, + "step": 1421 + }, + { + "epoch": 0.6469517743403094, + "grad_norm": 0.7576903487594321, + "learning_rate": 9.593132851424036e-06, + "loss": 0.0968, + "step": 1422 + }, + { + "epoch": 0.6474067333939946, + "grad_norm": 0.7385455319846311, + "learning_rate": 9.59256791053813e-06, + "loss": 0.1045, + "step": 1423 + }, + { + "epoch": 0.6478616924476797, + "grad_norm": 0.8466333605064674, + "learning_rate": 9.592002594368312e-06, + "loss": 0.1058, + "step": 1424 + }, + { + "epoch": 0.6483166515013649, + "grad_norm": 0.9463191649116842, + "learning_rate": 9.59143690296078e-06, + "loss": 0.1179, + "step": 1425 + }, + { + "epoch": 0.6487716105550501, + "grad_norm": 0.49506567565602905, + "learning_rate": 9.590870836361758e-06, + "loss": 0.0679, + "step": 1426 + }, + { + "epoch": 0.6492265696087353, + "grad_norm": 0.9070193484568203, + "learning_rate": 9.590304394617506e-06, + "loss": 0.0889, + "step": 1427 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 0.4746970963167155, + "learning_rate": 9.589737577774308e-06, + "loss": 0.0474, + "step": 1428 + }, + { + "epoch": 0.6501364877161055, + "grad_norm": 0.7625565873276676, + "learning_rate": 9.58917038587848e-06, + "loss": 0.1052, + "step": 1429 + }, + { + "epoch": 0.6505914467697907, + "grad_norm": 0.5544350713091404, + "learning_rate": 9.588602818976374e-06, + "loss": 0.0602, + "step": 1430 + }, + { + "epoch": 0.6510464058234758, + "grad_norm": 0.8043877114109435, + "learning_rate": 9.588034877114367e-06, + "loss": 0.0714, + "step": 1431 + }, + { + "epoch": 0.651501364877161, + "grad_norm": 0.6177719048805246, + "learning_rate": 9.58746656033887e-06, + "loss": 0.0822, + "step": 1432 + }, + { + "epoch": 0.6519563239308462, + "grad_norm": 1.070732220715245, + "learning_rate": 9.586897868696323e-06, + "loss": 0.1203, + "step": 1433 + }, + { + "epoch": 0.6524112829845314, + "grad_norm": 1.183590915899486, + "learning_rate": 9.586328802233195e-06, + "loss": 0.0935, + "step": 1434 + }, + { + "epoch": 0.6528662420382165, + "grad_norm": 0.581772493938091, + "learning_rate": 9.58575936099599e-06, + "loss": 0.0682, + "step": 1435 + }, + { + "epoch": 0.6533212010919017, + "grad_norm": 0.7377901301818582, + "learning_rate": 9.58518954503124e-06, + "loss": 0.0824, + "step": 1436 + }, + { + "epoch": 0.6537761601455869, + "grad_norm": 0.9292214040800371, + "learning_rate": 9.584619354385505e-06, + "loss": 0.1138, + "step": 1437 + }, + { + "epoch": 0.654231119199272, + "grad_norm": 0.7573270642921373, + "learning_rate": 9.58404878910538e-06, + "loss": 0.074, + "step": 1438 + }, + { + "epoch": 0.6546860782529572, + "grad_norm": 0.5838864743945036, + "learning_rate": 9.58347784923749e-06, + "loss": 0.067, + "step": 1439 + }, + { + "epoch": 0.6551410373066424, + "grad_norm": 0.6730458126896756, + "learning_rate": 9.58290653482849e-06, + "loss": 0.0632, + "step": 1440 + }, + { + "epoch": 0.6555959963603276, + "grad_norm": 0.7216545389315259, + "learning_rate": 9.582334845925063e-06, + "loss": 0.0757, + "step": 1441 + }, + { + "epoch": 0.6560509554140127, + "grad_norm": 0.929819001740202, + "learning_rate": 9.581762782573926e-06, + "loss": 0.0973, + "step": 1442 + }, + { + "epoch": 0.6565059144676979, + "grad_norm": 0.7680577896195074, + "learning_rate": 9.581190344821827e-06, + "loss": 0.086, + "step": 1443 + }, + { + "epoch": 0.6569608735213831, + "grad_norm": 0.8746535076926352, + "learning_rate": 9.58061753271554e-06, + "loss": 0.1085, + "step": 1444 + }, + { + "epoch": 0.6574158325750682, + "grad_norm": 0.6364512825611769, + "learning_rate": 9.580044346301875e-06, + "loss": 0.0764, + "step": 1445 + }, + { + "epoch": 0.6578707916287534, + "grad_norm": 0.47118649986170347, + "learning_rate": 9.57947078562767e-06, + "loss": 0.0506, + "step": 1446 + }, + { + "epoch": 0.6583257506824386, + "grad_norm": 0.6564703457147261, + "learning_rate": 9.578896850739792e-06, + "loss": 0.0702, + "step": 1447 + }, + { + "epoch": 0.6587807097361238, + "grad_norm": 0.6786314185300042, + "learning_rate": 9.578322541685142e-06, + "loss": 0.0778, + "step": 1448 + }, + { + "epoch": 0.6592356687898089, + "grad_norm": 0.7866249519519628, + "learning_rate": 9.577747858510647e-06, + "loss": 0.1066, + "step": 1449 + }, + { + "epoch": 0.6596906278434941, + "grad_norm": 0.8352652198110325, + "learning_rate": 9.577172801263272e-06, + "loss": 0.0973, + "step": 1450 + }, + { + "epoch": 0.6601455868971793, + "grad_norm": 0.6694090591857538, + "learning_rate": 9.576597369990006e-06, + "loss": 0.077, + "step": 1451 + }, + { + "epoch": 0.6606005459508644, + "grad_norm": 0.6613042389515336, + "learning_rate": 9.576021564737871e-06, + "loss": 0.0608, + "step": 1452 + }, + { + "epoch": 0.6610555050045496, + "grad_norm": 0.7515982683897205, + "learning_rate": 9.575445385553917e-06, + "loss": 0.1003, + "step": 1453 + }, + { + "epoch": 0.6615104640582348, + "grad_norm": 0.9769815693335377, + "learning_rate": 9.57486883248523e-06, + "loss": 0.0946, + "step": 1454 + }, + { + "epoch": 0.66196542311192, + "grad_norm": 1.1665424395125852, + "learning_rate": 9.574291905578922e-06, + "loss": 0.1317, + "step": 1455 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 0.6942177292436024, + "learning_rate": 9.573714604882138e-06, + "loss": 0.0615, + "step": 1456 + }, + { + "epoch": 0.6628753412192903, + "grad_norm": 0.9194225981756011, + "learning_rate": 9.57313693044205e-06, + "loss": 0.0975, + "step": 1457 + }, + { + "epoch": 0.6633303002729755, + "grad_norm": 0.7117926275391128, + "learning_rate": 9.572558882305863e-06, + "loss": 0.0847, + "step": 1458 + }, + { + "epoch": 0.6637852593266605, + "grad_norm": 0.9546376743105418, + "learning_rate": 9.571980460520815e-06, + "loss": 0.1196, + "step": 1459 + }, + { + "epoch": 0.6642402183803457, + "grad_norm": 0.8937437496424256, + "learning_rate": 9.57140166513417e-06, + "loss": 0.096, + "step": 1460 + }, + { + "epoch": 0.664695177434031, + "grad_norm": 0.5937947199850856, + "learning_rate": 9.570822496193225e-06, + "loss": 0.058, + "step": 1461 + }, + { + "epoch": 0.6651501364877161, + "grad_norm": 0.5756039867728808, + "learning_rate": 9.570242953745307e-06, + "loss": 0.082, + "step": 1462 + }, + { + "epoch": 0.6656050955414012, + "grad_norm": 0.7416722804778516, + "learning_rate": 9.569663037837776e-06, + "loss": 0.098, + "step": 1463 + }, + { + "epoch": 0.6660600545950864, + "grad_norm": 0.6377485683281849, + "learning_rate": 9.569082748518017e-06, + "loss": 0.0723, + "step": 1464 + }, + { + "epoch": 0.6665150136487716, + "grad_norm": 0.7884664768500067, + "learning_rate": 9.568502085833449e-06, + "loss": 0.0884, + "step": 1465 + }, + { + "epoch": 0.6669699727024567, + "grad_norm": 0.7723350087530905, + "learning_rate": 9.567921049831522e-06, + "loss": 0.0967, + "step": 1466 + }, + { + "epoch": 0.6674249317561419, + "grad_norm": 0.7260885892233983, + "learning_rate": 9.567339640559716e-06, + "loss": 0.0812, + "step": 1467 + }, + { + "epoch": 0.6678798908098271, + "grad_norm": 0.5596294621225263, + "learning_rate": 9.566757858065538e-06, + "loss": 0.0631, + "step": 1468 + }, + { + "epoch": 0.6683348498635123, + "grad_norm": 0.7286352648100037, + "learning_rate": 9.566175702396534e-06, + "loss": 0.0823, + "step": 1469 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 0.9301493673689373, + "learning_rate": 9.565593173600271e-06, + "loss": 0.0987, + "step": 1470 + }, + { + "epoch": 0.6692447679708826, + "grad_norm": 0.6817718703338496, + "learning_rate": 9.565010271724353e-06, + "loss": 0.0755, + "step": 1471 + }, + { + "epoch": 0.6696997270245678, + "grad_norm": 0.7526239018301766, + "learning_rate": 9.56442699681641e-06, + "loss": 0.0876, + "step": 1472 + }, + { + "epoch": 0.6701546860782529, + "grad_norm": 0.7279647211742274, + "learning_rate": 9.563843348924105e-06, + "loss": 0.0681, + "step": 1473 + }, + { + "epoch": 0.6706096451319381, + "grad_norm": 0.8487044021854026, + "learning_rate": 9.563259328095132e-06, + "loss": 0.0903, + "step": 1474 + }, + { + "epoch": 0.6710646041856233, + "grad_norm": 0.609495225783116, + "learning_rate": 9.562674934377214e-06, + "loss": 0.0801, + "step": 1475 + }, + { + "epoch": 0.6715195632393085, + "grad_norm": 0.7638645194963899, + "learning_rate": 9.562090167818107e-06, + "loss": 0.0874, + "step": 1476 + }, + { + "epoch": 0.6719745222929936, + "grad_norm": 1.4076317151154771, + "learning_rate": 9.561505028465593e-06, + "loss": 0.0874, + "step": 1477 + }, + { + "epoch": 0.6724294813466788, + "grad_norm": 0.6311161675673277, + "learning_rate": 9.560919516367486e-06, + "loss": 0.0738, + "step": 1478 + }, + { + "epoch": 0.672884440400364, + "grad_norm": 0.638266808298586, + "learning_rate": 9.560333631571634e-06, + "loss": 0.0682, + "step": 1479 + }, + { + "epoch": 0.6733393994540491, + "grad_norm": 0.7097356519617585, + "learning_rate": 9.559747374125911e-06, + "loss": 0.0987, + "step": 1480 + }, + { + "epoch": 0.6737943585077343, + "grad_norm": 0.6502346745698145, + "learning_rate": 9.559160744078226e-06, + "loss": 0.0644, + "step": 1481 + }, + { + "epoch": 0.6742493175614195, + "grad_norm": 1.056681303492363, + "learning_rate": 9.558573741476513e-06, + "loss": 0.0939, + "step": 1482 + }, + { + "epoch": 0.6747042766151047, + "grad_norm": 0.7992268675141662, + "learning_rate": 9.557986366368742e-06, + "loss": 0.0733, + "step": 1483 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 1.0832399406974047, + "learning_rate": 9.557398618802907e-06, + "loss": 0.1123, + "step": 1484 + }, + { + "epoch": 0.675614194722475, + "grad_norm": 0.6543008513198456, + "learning_rate": 9.556810498827039e-06, + "loss": 0.0794, + "step": 1485 + }, + { + "epoch": 0.6760691537761602, + "grad_norm": 0.6306597614421026, + "learning_rate": 9.556222006489193e-06, + "loss": 0.0786, + "step": 1486 + }, + { + "epoch": 0.6765241128298453, + "grad_norm": 0.5618899284499352, + "learning_rate": 9.555633141837462e-06, + "loss": 0.0618, + "step": 1487 + }, + { + "epoch": 0.6769790718835305, + "grad_norm": 0.6434016854657288, + "learning_rate": 9.555043904919963e-06, + "loss": 0.0796, + "step": 1488 + }, + { + "epoch": 0.6774340309372157, + "grad_norm": 0.7512094182824542, + "learning_rate": 9.554454295784848e-06, + "loss": 0.0745, + "step": 1489 + }, + { + "epoch": 0.6778889899909009, + "grad_norm": 0.662429978970196, + "learning_rate": 9.553864314480294e-06, + "loss": 0.0788, + "step": 1490 + }, + { + "epoch": 0.678343949044586, + "grad_norm": 0.7125824073483379, + "learning_rate": 9.553273961054514e-06, + "loss": 0.072, + "step": 1491 + }, + { + "epoch": 0.6787989080982711, + "grad_norm": 0.8599367957772613, + "learning_rate": 9.552683235555749e-06, + "loss": 0.0765, + "step": 1492 + }, + { + "epoch": 0.6792538671519563, + "grad_norm": 0.7900843446637873, + "learning_rate": 9.55209213803227e-06, + "loss": 0.0861, + "step": 1493 + }, + { + "epoch": 0.6797088262056415, + "grad_norm": 0.9492542185178791, + "learning_rate": 9.551500668532377e-06, + "loss": 0.1036, + "step": 1494 + }, + { + "epoch": 0.6801637852593266, + "grad_norm": 0.5324340095596853, + "learning_rate": 9.550908827104404e-06, + "loss": 0.0509, + "step": 1495 + }, + { + "epoch": 0.6806187443130118, + "grad_norm": 1.4654919772375794, + "learning_rate": 9.550316613796716e-06, + "loss": 0.0891, + "step": 1496 + }, + { + "epoch": 0.681073703366697, + "grad_norm": 0.6964909028346599, + "learning_rate": 9.549724028657698e-06, + "loss": 0.0814, + "step": 1497 + }, + { + "epoch": 0.6815286624203821, + "grad_norm": 0.7118346157191014, + "learning_rate": 9.549131071735784e-06, + "loss": 0.0711, + "step": 1498 + }, + { + "epoch": 0.6819836214740673, + "grad_norm": 0.9814989838911676, + "learning_rate": 9.54853774307942e-06, + "loss": 0.0981, + "step": 1499 + }, + { + "epoch": 0.6824385805277525, + "grad_norm": 0.8030617514029292, + "learning_rate": 9.547944042737092e-06, + "loss": 0.0944, + "step": 1500 + }, + { + "epoch": 0.6828935395814377, + "grad_norm": 0.9091821467413523, + "learning_rate": 9.547349970757317e-06, + "loss": 0.1419, + "step": 1501 + }, + { + "epoch": 0.6833484986351228, + "grad_norm": 0.7604842345576438, + "learning_rate": 9.546755527188638e-06, + "loss": 0.0616, + "step": 1502 + }, + { + "epoch": 0.683803457688808, + "grad_norm": 0.7795635296832277, + "learning_rate": 9.546160712079629e-06, + "loss": 0.0819, + "step": 1503 + }, + { + "epoch": 0.6842584167424932, + "grad_norm": 0.6155010796235886, + "learning_rate": 9.545565525478896e-06, + "loss": 0.0737, + "step": 1504 + }, + { + "epoch": 0.6847133757961783, + "grad_norm": 0.6981564617213015, + "learning_rate": 9.544969967435079e-06, + "loss": 0.0786, + "step": 1505 + }, + { + "epoch": 0.6851683348498635, + "grad_norm": 0.8590705218017948, + "learning_rate": 9.54437403799684e-06, + "loss": 0.0835, + "step": 1506 + }, + { + "epoch": 0.6856232939035487, + "grad_norm": 0.8783591706447448, + "learning_rate": 9.543777737212876e-06, + "loss": 0.118, + "step": 1507 + }, + { + "epoch": 0.6860782529572339, + "grad_norm": 0.5312480753344904, + "learning_rate": 9.543181065131914e-06, + "loss": 0.0535, + "step": 1508 + }, + { + "epoch": 0.686533212010919, + "grad_norm": 0.6911478055364548, + "learning_rate": 9.542584021802715e-06, + "loss": 0.0651, + "step": 1509 + }, + { + "epoch": 0.6869881710646042, + "grad_norm": 0.910176403224045, + "learning_rate": 9.54198660727406e-06, + "loss": 0.0916, + "step": 1510 + }, + { + "epoch": 0.6874431301182894, + "grad_norm": 0.5369469100452242, + "learning_rate": 9.541388821594774e-06, + "loss": 0.064, + "step": 1511 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 0.7242695685667516, + "learning_rate": 9.540790664813702e-06, + "loss": 0.0725, + "step": 1512 + }, + { + "epoch": 0.6883530482256597, + "grad_norm": 0.7527422721071317, + "learning_rate": 9.540192136979722e-06, + "loss": 0.0863, + "step": 1513 + }, + { + "epoch": 0.6888080072793449, + "grad_norm": 0.5409793571909967, + "learning_rate": 9.539593238141745e-06, + "loss": 0.0678, + "step": 1514 + }, + { + "epoch": 0.6892629663330301, + "grad_norm": 0.5059270742296627, + "learning_rate": 9.538993968348706e-06, + "loss": 0.0613, + "step": 1515 + }, + { + "epoch": 0.6897179253867152, + "grad_norm": 0.8092866682697022, + "learning_rate": 9.538394327649581e-06, + "loss": 0.0816, + "step": 1516 + }, + { + "epoch": 0.6901728844404004, + "grad_norm": 0.7416822411067572, + "learning_rate": 9.537794316093366e-06, + "loss": 0.0736, + "step": 1517 + }, + { + "epoch": 0.6906278434940856, + "grad_norm": 0.6013123530792879, + "learning_rate": 9.537193933729092e-06, + "loss": 0.0637, + "step": 1518 + }, + { + "epoch": 0.6910828025477707, + "grad_norm": 1.0953662823641266, + "learning_rate": 9.53659318060582e-06, + "loss": 0.1381, + "step": 1519 + }, + { + "epoch": 0.6915377616014559, + "grad_norm": 0.7906081758139587, + "learning_rate": 9.535992056772639e-06, + "loss": 0.088, + "step": 1520 + }, + { + "epoch": 0.6919927206551411, + "grad_norm": 0.9984370937403453, + "learning_rate": 9.535390562278673e-06, + "loss": 0.086, + "step": 1521 + }, + { + "epoch": 0.6924476797088263, + "grad_norm": 0.7438661675719108, + "learning_rate": 9.53478869717307e-06, + "loss": 0.0771, + "step": 1522 + }, + { + "epoch": 0.6929026387625113, + "grad_norm": 0.85189844123529, + "learning_rate": 9.534186461505015e-06, + "loss": 0.1109, + "step": 1523 + }, + { + "epoch": 0.6933575978161965, + "grad_norm": 0.7215256903381998, + "learning_rate": 9.533583855323717e-06, + "loss": 0.0947, + "step": 1524 + }, + { + "epoch": 0.6938125568698817, + "grad_norm": 0.8936614524747819, + "learning_rate": 9.532980878678422e-06, + "loss": 0.0731, + "step": 1525 + }, + { + "epoch": 0.6942675159235668, + "grad_norm": 0.7734700292932609, + "learning_rate": 9.5323775316184e-06, + "loss": 0.0844, + "step": 1526 + }, + { + "epoch": 0.694722474977252, + "grad_norm": 0.7521845435610183, + "learning_rate": 9.531773814192953e-06, + "loss": 0.0878, + "step": 1527 + }, + { + "epoch": 0.6951774340309372, + "grad_norm": 0.890089227377408, + "learning_rate": 9.531169726451417e-06, + "loss": 0.1128, + "step": 1528 + }, + { + "epoch": 0.6956323930846224, + "grad_norm": 0.7682866565773229, + "learning_rate": 9.530565268443153e-06, + "loss": 0.0956, + "step": 1529 + }, + { + "epoch": 0.6960873521383075, + "grad_norm": 0.9617852359873308, + "learning_rate": 9.529960440217554e-06, + "loss": 0.1088, + "step": 1530 + }, + { + "epoch": 0.6965423111919927, + "grad_norm": 0.9775947633570551, + "learning_rate": 9.529355241824045e-06, + "loss": 0.107, + "step": 1531 + }, + { + "epoch": 0.6969972702456779, + "grad_norm": 0.6007455012792351, + "learning_rate": 9.528749673312082e-06, + "loss": 0.0743, + "step": 1532 + }, + { + "epoch": 0.697452229299363, + "grad_norm": 0.5419764603212612, + "learning_rate": 9.528143734731143e-06, + "loss": 0.0822, + "step": 1533 + }, + { + "epoch": 0.6979071883530482, + "grad_norm": 0.8185575482665152, + "learning_rate": 9.52753742613075e-06, + "loss": 0.0832, + "step": 1534 + }, + { + "epoch": 0.6983621474067334, + "grad_norm": 0.9643638751029543, + "learning_rate": 9.526930747560446e-06, + "loss": 0.1026, + "step": 1535 + }, + { + "epoch": 0.6988171064604186, + "grad_norm": 0.8502651132594353, + "learning_rate": 9.526323699069803e-06, + "loss": 0.0902, + "step": 1536 + }, + { + "epoch": 0.6992720655141037, + "grad_norm": 0.5376181329235236, + "learning_rate": 9.525716280708428e-06, + "loss": 0.068, + "step": 1537 + }, + { + "epoch": 0.6997270245677889, + "grad_norm": 0.7166675033334694, + "learning_rate": 9.525108492525957e-06, + "loss": 0.0752, + "step": 1538 + }, + { + "epoch": 0.7001819836214741, + "grad_norm": 0.43432195935007917, + "learning_rate": 9.524500334572054e-06, + "loss": 0.0417, + "step": 1539 + }, + { + "epoch": 0.7006369426751592, + "grad_norm": 0.8369054167821826, + "learning_rate": 9.523891806896417e-06, + "loss": 0.1098, + "step": 1540 + }, + { + "epoch": 0.7010919017288444, + "grad_norm": 0.49781336551041033, + "learning_rate": 9.523282909548773e-06, + "loss": 0.0618, + "step": 1541 + }, + { + "epoch": 0.7015468607825296, + "grad_norm": 0.9187882410427298, + "learning_rate": 9.522673642578873e-06, + "loss": 0.1247, + "step": 1542 + }, + { + "epoch": 0.7020018198362148, + "grad_norm": 0.5007920591193696, + "learning_rate": 9.522064006036509e-06, + "loss": 0.0601, + "step": 1543 + }, + { + "epoch": 0.7024567788898999, + "grad_norm": 0.582945252861272, + "learning_rate": 9.521453999971497e-06, + "loss": 0.0585, + "step": 1544 + }, + { + "epoch": 0.7029117379435851, + "grad_norm": 0.5749885951853907, + "learning_rate": 9.520843624433681e-06, + "loss": 0.0664, + "step": 1545 + }, + { + "epoch": 0.7033666969972703, + "grad_norm": 0.9724598324631707, + "learning_rate": 9.520232879472942e-06, + "loss": 0.1199, + "step": 1546 + }, + { + "epoch": 0.7038216560509554, + "grad_norm": 1.0592052108390146, + "learning_rate": 9.519621765139181e-06, + "loss": 0.1278, + "step": 1547 + }, + { + "epoch": 0.7042766151046406, + "grad_norm": 0.42374402440173636, + "learning_rate": 9.519010281482344e-06, + "loss": 0.0446, + "step": 1548 + }, + { + "epoch": 0.7047315741583258, + "grad_norm": 1.102301602930716, + "learning_rate": 9.518398428552393e-06, + "loss": 0.1226, + "step": 1549 + }, + { + "epoch": 0.705186533212011, + "grad_norm": 0.6842519583257138, + "learning_rate": 9.51778620639933e-06, + "loss": 0.0905, + "step": 1550 + }, + { + "epoch": 0.7056414922656961, + "grad_norm": 0.7530573117253311, + "learning_rate": 9.517173615073177e-06, + "loss": 0.0766, + "step": 1551 + }, + { + "epoch": 0.7060964513193813, + "grad_norm": 0.43285639961604566, + "learning_rate": 9.516560654623996e-06, + "loss": 0.0475, + "step": 1552 + }, + { + "epoch": 0.7065514103730665, + "grad_norm": 0.9094561094681402, + "learning_rate": 9.515947325101875e-06, + "loss": 0.0896, + "step": 1553 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 0.6097385256206468, + "learning_rate": 9.515333626556933e-06, + "loss": 0.0653, + "step": 1554 + }, + { + "epoch": 0.7074613284804367, + "grad_norm": 0.7304393114645329, + "learning_rate": 9.514719559039318e-06, + "loss": 0.0896, + "step": 1555 + }, + { + "epoch": 0.707916287534122, + "grad_norm": 0.8799769831067698, + "learning_rate": 9.514105122599208e-06, + "loss": 0.1176, + "step": 1556 + }, + { + "epoch": 0.7083712465878071, + "grad_norm": 1.0962688093811397, + "learning_rate": 9.513490317286815e-06, + "loss": 0.1174, + "step": 1557 + }, + { + "epoch": 0.7088262056414922, + "grad_norm": 0.8022559500547495, + "learning_rate": 9.512875143152373e-06, + "loss": 0.0969, + "step": 1558 + }, + { + "epoch": 0.7092811646951774, + "grad_norm": 0.37133918747574174, + "learning_rate": 9.512259600246156e-06, + "loss": 0.031, + "step": 1559 + }, + { + "epoch": 0.7097361237488626, + "grad_norm": 0.6214125216955318, + "learning_rate": 9.511643688618463e-06, + "loss": 0.0943, + "step": 1560 + }, + { + "epoch": 0.7101910828025477, + "grad_norm": 0.7097270108607417, + "learning_rate": 9.51102740831962e-06, + "loss": 0.0847, + "step": 1561 + }, + { + "epoch": 0.7106460418562329, + "grad_norm": 0.8290870913254417, + "learning_rate": 9.510410759399991e-06, + "loss": 0.0867, + "step": 1562 + }, + { + "epoch": 0.7111010009099181, + "grad_norm": 0.7141101307254801, + "learning_rate": 9.50979374190996e-06, + "loss": 0.0838, + "step": 1563 + }, + { + "epoch": 0.7115559599636033, + "grad_norm": 0.8532705780985276, + "learning_rate": 9.509176355899954e-06, + "loss": 0.09, + "step": 1564 + }, + { + "epoch": 0.7120109190172884, + "grad_norm": 0.6858037908830302, + "learning_rate": 9.508558601420417e-06, + "loss": 0.0637, + "step": 1565 + }, + { + "epoch": 0.7124658780709736, + "grad_norm": 0.7489578082911201, + "learning_rate": 9.507940478521833e-06, + "loss": 0.1059, + "step": 1566 + }, + { + "epoch": 0.7129208371246588, + "grad_norm": 0.5241685648277268, + "learning_rate": 9.507321987254712e-06, + "loss": 0.0474, + "step": 1567 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 0.9862924439076355, + "learning_rate": 9.50670312766959e-06, + "loss": 0.1047, + "step": 1568 + }, + { + "epoch": 0.7138307552320291, + "grad_norm": 0.8286292773017996, + "learning_rate": 9.506083899817043e-06, + "loss": 0.0808, + "step": 1569 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.8166629192761119, + "learning_rate": 9.505464303747667e-06, + "loss": 0.079, + "step": 1570 + }, + { + "epoch": 0.7147406733393995, + "grad_norm": 0.6651663578468047, + "learning_rate": 9.504844339512096e-06, + "loss": 0.0879, + "step": 1571 + }, + { + "epoch": 0.7151956323930846, + "grad_norm": 0.5230779536546156, + "learning_rate": 9.50422400716099e-06, + "loss": 0.0585, + "step": 1572 + }, + { + "epoch": 0.7156505914467698, + "grad_norm": 0.6543543054934573, + "learning_rate": 9.503603306745036e-06, + "loss": 0.0564, + "step": 1573 + }, + { + "epoch": 0.716105550500455, + "grad_norm": 0.7812592861176204, + "learning_rate": 9.502982238314962e-06, + "loss": 0.0874, + "step": 1574 + }, + { + "epoch": 0.7165605095541401, + "grad_norm": 0.5040232473993467, + "learning_rate": 9.502360801921512e-06, + "loss": 0.0532, + "step": 1575 + }, + { + "epoch": 0.7170154686078253, + "grad_norm": 0.8631279038726943, + "learning_rate": 9.501738997615471e-06, + "loss": 0.1045, + "step": 1576 + }, + { + "epoch": 0.7174704276615105, + "grad_norm": 0.7716014465645913, + "learning_rate": 9.501116825447648e-06, + "loss": 0.068, + "step": 1577 + }, + { + "epoch": 0.7179253867151957, + "grad_norm": 0.5327432187838176, + "learning_rate": 9.500494285468884e-06, + "loss": 0.053, + "step": 1578 + }, + { + "epoch": 0.7183803457688808, + "grad_norm": 0.8209926537375553, + "learning_rate": 9.499871377730053e-06, + "loss": 0.1164, + "step": 1579 + }, + { + "epoch": 0.718835304822566, + "grad_norm": 0.5454374508074649, + "learning_rate": 9.499248102282052e-06, + "loss": 0.0579, + "step": 1580 + }, + { + "epoch": 0.7192902638762512, + "grad_norm": 0.4944315103743207, + "learning_rate": 9.498624459175815e-06, + "loss": 0.0542, + "step": 1581 + }, + { + "epoch": 0.7197452229299363, + "grad_norm": 0.8372013648456964, + "learning_rate": 9.498000448462305e-06, + "loss": 0.0948, + "step": 1582 + }, + { + "epoch": 0.7202001819836215, + "grad_norm": 0.6792072434969908, + "learning_rate": 9.49737607019251e-06, + "loss": 0.0683, + "step": 1583 + }, + { + "epoch": 0.7206551410373067, + "grad_norm": 0.6679228302277659, + "learning_rate": 9.496751324417452e-06, + "loss": 0.0526, + "step": 1584 + }, + { + "epoch": 0.7211101000909919, + "grad_norm": 0.830168268257237, + "learning_rate": 9.496126211188184e-06, + "loss": 0.1049, + "step": 1585 + }, + { + "epoch": 0.721565059144677, + "grad_norm": 0.7614112606151382, + "learning_rate": 9.495500730555784e-06, + "loss": 0.0966, + "step": 1586 + }, + { + "epoch": 0.7220200181983621, + "grad_norm": 0.7574732623314945, + "learning_rate": 9.494874882571368e-06, + "loss": 0.0648, + "step": 1587 + }, + { + "epoch": 0.7224749772520473, + "grad_norm": 0.7541681951930181, + "learning_rate": 9.494248667286075e-06, + "loss": 0.0905, + "step": 1588 + }, + { + "epoch": 0.7229299363057324, + "grad_norm": 0.776748715422375, + "learning_rate": 9.493622084751076e-06, + "loss": 0.0841, + "step": 1589 + }, + { + "epoch": 0.7233848953594176, + "grad_norm": 0.6440945504942991, + "learning_rate": 9.492995135017574e-06, + "loss": 0.0779, + "step": 1590 + }, + { + "epoch": 0.7238398544131028, + "grad_norm": 0.658893968607762, + "learning_rate": 9.4923678181368e-06, + "loss": 0.0862, + "step": 1591 + }, + { + "epoch": 0.724294813466788, + "grad_norm": 0.764304310956247, + "learning_rate": 9.491740134160014e-06, + "loss": 0.0834, + "step": 1592 + }, + { + "epoch": 0.7247497725204731, + "grad_norm": 1.246667162089055, + "learning_rate": 9.491112083138509e-06, + "loss": 0.141, + "step": 1593 + }, + { + "epoch": 0.7252047315741583, + "grad_norm": 0.7827390484343668, + "learning_rate": 9.490483665123606e-06, + "loss": 0.0687, + "step": 1594 + }, + { + "epoch": 0.7256596906278435, + "grad_norm": 0.6055248563993239, + "learning_rate": 9.489854880166658e-06, + "loss": 0.0716, + "step": 1595 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 0.7067865427149594, + "learning_rate": 9.489225728319044e-06, + "loss": 0.0756, + "step": 1596 + }, + { + "epoch": 0.7265696087352138, + "grad_norm": 0.85395818798431, + "learning_rate": 9.488596209632179e-06, + "loss": 0.1099, + "step": 1597 + }, + { + "epoch": 0.727024567788899, + "grad_norm": 0.6870669290352402, + "learning_rate": 9.4879663241575e-06, + "loss": 0.0703, + "step": 1598 + }, + { + "epoch": 0.7274795268425842, + "grad_norm": 1.2809048497988667, + "learning_rate": 9.48733607194648e-06, + "loss": 0.1663, + "step": 1599 + }, + { + "epoch": 0.7279344858962693, + "grad_norm": 0.7180890087653823, + "learning_rate": 9.486705453050622e-06, + "loss": 0.0738, + "step": 1600 + }, + { + "epoch": 0.7283894449499545, + "grad_norm": 0.5662460892211576, + "learning_rate": 9.486074467521456e-06, + "loss": 0.0627, + "step": 1601 + }, + { + "epoch": 0.7288444040036397, + "grad_norm": 0.7172800606287587, + "learning_rate": 9.485443115410541e-06, + "loss": 0.0715, + "step": 1602 + }, + { + "epoch": 0.7292993630573248, + "grad_norm": 0.6146064647413995, + "learning_rate": 9.484811396769475e-06, + "loss": 0.0828, + "step": 1603 + }, + { + "epoch": 0.72975432211101, + "grad_norm": 0.8606888467276742, + "learning_rate": 9.484179311649873e-06, + "loss": 0.0962, + "step": 1604 + }, + { + "epoch": 0.7302092811646952, + "grad_norm": 0.46814164753859155, + "learning_rate": 9.483546860103388e-06, + "loss": 0.0477, + "step": 1605 + }, + { + "epoch": 0.7306642402183804, + "grad_norm": 0.7370090010007736, + "learning_rate": 9.4829140421817e-06, + "loss": 0.081, + "step": 1606 + }, + { + "epoch": 0.7311191992720655, + "grad_norm": 1.0689466216112777, + "learning_rate": 9.482280857936522e-06, + "loss": 0.109, + "step": 1607 + }, + { + "epoch": 0.7315741583257507, + "grad_norm": 0.4147348220425697, + "learning_rate": 9.481647307419594e-06, + "loss": 0.0479, + "step": 1608 + }, + { + "epoch": 0.7320291173794359, + "grad_norm": 0.4998747516198886, + "learning_rate": 9.481013390682687e-06, + "loss": 0.0634, + "step": 1609 + }, + { + "epoch": 0.732484076433121, + "grad_norm": 0.8673371359679307, + "learning_rate": 9.480379107777601e-06, + "loss": 0.1108, + "step": 1610 + }, + { + "epoch": 0.7329390354868062, + "grad_norm": 0.6369274329058493, + "learning_rate": 9.47974445875617e-06, + "loss": 0.0698, + "step": 1611 + }, + { + "epoch": 0.7333939945404914, + "grad_norm": 0.6434647227835387, + "learning_rate": 9.47910944367025e-06, + "loss": 0.0618, + "step": 1612 + }, + { + "epoch": 0.7338489535941766, + "grad_norm": 0.8035955314379585, + "learning_rate": 9.478474062571735e-06, + "loss": 0.0997, + "step": 1613 + }, + { + "epoch": 0.7343039126478617, + "grad_norm": 0.7996949463502321, + "learning_rate": 9.477838315512544e-06, + "loss": 0.0873, + "step": 1614 + }, + { + "epoch": 0.7347588717015469, + "grad_norm": 0.6484970204244012, + "learning_rate": 9.477202202544626e-06, + "loss": 0.0925, + "step": 1615 + }, + { + "epoch": 0.7352138307552321, + "grad_norm": 0.6478821974846899, + "learning_rate": 9.476565723719966e-06, + "loss": 0.0693, + "step": 1616 + }, + { + "epoch": 0.7356687898089171, + "grad_norm": 0.6896940284490023, + "learning_rate": 9.475928879090568e-06, + "loss": 0.0763, + "step": 1617 + }, + { + "epoch": 0.7361237488626023, + "grad_norm": 0.6758264439259065, + "learning_rate": 9.475291668708476e-06, + "loss": 0.0717, + "step": 1618 + }, + { + "epoch": 0.7365787079162875, + "grad_norm": 0.6285383601705616, + "learning_rate": 9.474654092625758e-06, + "loss": 0.0561, + "step": 1619 + }, + { + "epoch": 0.7370336669699727, + "grad_norm": 0.7488998942485512, + "learning_rate": 9.474016150894518e-06, + "loss": 0.0765, + "step": 1620 + }, + { + "epoch": 0.7374886260236578, + "grad_norm": 0.7511340475878087, + "learning_rate": 9.47337784356688e-06, + "loss": 0.0865, + "step": 1621 + }, + { + "epoch": 0.737943585077343, + "grad_norm": 0.6908706816034008, + "learning_rate": 9.472739170695006e-06, + "loss": 0.0879, + "step": 1622 + }, + { + "epoch": 0.7383985441310282, + "grad_norm": 0.9159671053782389, + "learning_rate": 9.472100132331089e-06, + "loss": 0.0862, + "step": 1623 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 0.8367180794291794, + "learning_rate": 9.471460728527342e-06, + "loss": 0.0988, + "step": 1624 + }, + { + "epoch": 0.7393084622383985, + "grad_norm": 0.6396536181540736, + "learning_rate": 9.470820959336018e-06, + "loss": 0.0742, + "step": 1625 + }, + { + "epoch": 0.7397634212920837, + "grad_norm": 0.7212059639642758, + "learning_rate": 9.470180824809394e-06, + "loss": 0.0887, + "step": 1626 + }, + { + "epoch": 0.7402183803457689, + "grad_norm": 0.6570480817818456, + "learning_rate": 9.469540324999782e-06, + "loss": 0.0654, + "step": 1627 + }, + { + "epoch": 0.740673339399454, + "grad_norm": 0.6780217435395393, + "learning_rate": 9.468899459959518e-06, + "loss": 0.0613, + "step": 1628 + }, + { + "epoch": 0.7411282984531392, + "grad_norm": 0.8367065537687267, + "learning_rate": 9.468258229740972e-06, + "loss": 0.087, + "step": 1629 + }, + { + "epoch": 0.7415832575068244, + "grad_norm": 0.6724757485261361, + "learning_rate": 9.467616634396542e-06, + "loss": 0.0513, + "step": 1630 + }, + { + "epoch": 0.7420382165605095, + "grad_norm": 0.5923362651506067, + "learning_rate": 9.466974673978654e-06, + "loss": 0.0668, + "step": 1631 + }, + { + "epoch": 0.7424931756141947, + "grad_norm": 0.8046255156703264, + "learning_rate": 9.466332348539772e-06, + "loss": 0.0888, + "step": 1632 + }, + { + "epoch": 0.7429481346678799, + "grad_norm": 0.7456071657218726, + "learning_rate": 9.465689658132379e-06, + "loss": 0.0872, + "step": 1633 + }, + { + "epoch": 0.7434030937215651, + "grad_norm": 0.8751254537474247, + "learning_rate": 9.465046602808994e-06, + "loss": 0.0901, + "step": 1634 + }, + { + "epoch": 0.7438580527752502, + "grad_norm": 0.9953711560207276, + "learning_rate": 9.464403182622164e-06, + "loss": 0.1175, + "step": 1635 + }, + { + "epoch": 0.7443130118289354, + "grad_norm": 0.738323897945569, + "learning_rate": 9.463759397624466e-06, + "loss": 0.1016, + "step": 1636 + }, + { + "epoch": 0.7447679708826206, + "grad_norm": 0.620705920516562, + "learning_rate": 9.46311524786851e-06, + "loss": 0.0654, + "step": 1637 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 1.2433273775382216, + "learning_rate": 9.462470733406929e-06, + "loss": 0.1403, + "step": 1638 + }, + { + "epoch": 0.7456778889899909, + "grad_norm": 1.0268174749706445, + "learning_rate": 9.461825854292394e-06, + "loss": 0.1065, + "step": 1639 + }, + { + "epoch": 0.7461328480436761, + "grad_norm": 0.6942991337802967, + "learning_rate": 9.4611806105776e-06, + "loss": 0.0736, + "step": 1640 + }, + { + "epoch": 0.7465878070973613, + "grad_norm": 0.8367822612372433, + "learning_rate": 9.460535002315272e-06, + "loss": 0.089, + "step": 1641 + }, + { + "epoch": 0.7470427661510464, + "grad_norm": 0.5929887457730553, + "learning_rate": 9.459889029558167e-06, + "loss": 0.0665, + "step": 1642 + }, + { + "epoch": 0.7474977252047316, + "grad_norm": 0.5692342733265978, + "learning_rate": 9.459242692359072e-06, + "loss": 0.0708, + "step": 1643 + }, + { + "epoch": 0.7479526842584168, + "grad_norm": 0.6049162715481944, + "learning_rate": 9.4585959907708e-06, + "loss": 0.0716, + "step": 1644 + }, + { + "epoch": 0.7484076433121019, + "grad_norm": 0.5865800556894495, + "learning_rate": 9.457948924846201e-06, + "loss": 0.0562, + "step": 1645 + }, + { + "epoch": 0.7488626023657871, + "grad_norm": 1.018263961729041, + "learning_rate": 9.457301494638147e-06, + "loss": 0.1129, + "step": 1646 + }, + { + "epoch": 0.7493175614194723, + "grad_norm": 0.8420303347709615, + "learning_rate": 9.456653700199542e-06, + "loss": 0.0982, + "step": 1647 + }, + { + "epoch": 0.7497725204731575, + "grad_norm": 0.6178217269864875, + "learning_rate": 9.456005541583326e-06, + "loss": 0.0777, + "step": 1648 + }, + { + "epoch": 0.7502274795268425, + "grad_norm": 0.6159701780113571, + "learning_rate": 9.455357018842458e-06, + "loss": 0.075, + "step": 1649 + }, + { + "epoch": 0.7506824385805277, + "grad_norm": 0.5563337669331565, + "learning_rate": 9.454708132029936e-06, + "loss": 0.0594, + "step": 1650 + }, + { + "epoch": 0.7511373976342129, + "grad_norm": 0.7796132603413727, + "learning_rate": 9.454058881198782e-06, + "loss": 0.0842, + "step": 1651 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 0.5977999349867541, + "learning_rate": 9.45340926640205e-06, + "loss": 0.0623, + "step": 1652 + }, + { + "epoch": 0.7520473157415832, + "grad_norm": 0.7762091660359064, + "learning_rate": 9.452759287692824e-06, + "loss": 0.0923, + "step": 1653 + }, + { + "epoch": 0.7525022747952684, + "grad_norm": 1.029286283612893, + "learning_rate": 9.452108945124218e-06, + "loss": 0.1114, + "step": 1654 + }, + { + "epoch": 0.7529572338489536, + "grad_norm": 0.5046695202197234, + "learning_rate": 9.451458238749375e-06, + "loss": 0.058, + "step": 1655 + }, + { + "epoch": 0.7534121929026387, + "grad_norm": 0.6262659207860063, + "learning_rate": 9.450807168621468e-06, + "loss": 0.0607, + "step": 1656 + }, + { + "epoch": 0.7538671519563239, + "grad_norm": 0.7451490801568118, + "learning_rate": 9.450155734793697e-06, + "loss": 0.0716, + "step": 1657 + }, + { + "epoch": 0.7543221110100091, + "grad_norm": 0.6504007368655154, + "learning_rate": 9.449503937319297e-06, + "loss": 0.0913, + "step": 1658 + }, + { + "epoch": 0.7547770700636943, + "grad_norm": 0.8923820492879996, + "learning_rate": 9.448851776251528e-06, + "loss": 0.0984, + "step": 1659 + }, + { + "epoch": 0.7552320291173794, + "grad_norm": 0.7256175088606572, + "learning_rate": 9.448199251643684e-06, + "loss": 0.0834, + "step": 1660 + }, + { + "epoch": 0.7556869881710646, + "grad_norm": 0.7778885787730276, + "learning_rate": 9.447546363549085e-06, + "loss": 0.0878, + "step": 1661 + }, + { + "epoch": 0.7561419472247498, + "grad_norm": 0.8265030986085233, + "learning_rate": 9.446893112021083e-06, + "loss": 0.0827, + "step": 1662 + }, + { + "epoch": 0.7565969062784349, + "grad_norm": 0.5801162274559535, + "learning_rate": 9.446239497113055e-06, + "loss": 0.0797, + "step": 1663 + }, + { + "epoch": 0.7570518653321201, + "grad_norm": 0.8974914764997551, + "learning_rate": 9.445585518878418e-06, + "loss": 0.1088, + "step": 1664 + }, + { + "epoch": 0.7575068243858053, + "grad_norm": 0.8878060872125964, + "learning_rate": 9.444931177370605e-06, + "loss": 0.1235, + "step": 1665 + }, + { + "epoch": 0.7579617834394905, + "grad_norm": 0.5088737676913533, + "learning_rate": 9.44427647264309e-06, + "loss": 0.0478, + "step": 1666 + }, + { + "epoch": 0.7584167424931756, + "grad_norm": 0.7484910765250183, + "learning_rate": 9.443621404749374e-06, + "loss": 0.0686, + "step": 1667 + }, + { + "epoch": 0.7588717015468608, + "grad_norm": 0.6292123912530658, + "learning_rate": 9.442965973742983e-06, + "loss": 0.0652, + "step": 1668 + }, + { + "epoch": 0.759326660600546, + "grad_norm": 1.037223955207567, + "learning_rate": 9.442310179677476e-06, + "loss": 0.0827, + "step": 1669 + }, + { + "epoch": 0.7597816196542311, + "grad_norm": 0.6769034013570638, + "learning_rate": 9.441654022606444e-06, + "loss": 0.0771, + "step": 1670 + }, + { + "epoch": 0.7602365787079163, + "grad_norm": 0.8310244395490821, + "learning_rate": 9.440997502583503e-06, + "loss": 0.091, + "step": 1671 + }, + { + "epoch": 0.7606915377616015, + "grad_norm": 1.0039785109365194, + "learning_rate": 9.4403406196623e-06, + "loss": 0.1251, + "step": 1672 + }, + { + "epoch": 0.7611464968152867, + "grad_norm": 0.7908056524331212, + "learning_rate": 9.439683373896515e-06, + "loss": 0.0876, + "step": 1673 + }, + { + "epoch": 0.7616014558689718, + "grad_norm": 1.0809832712577787, + "learning_rate": 9.439025765339852e-06, + "loss": 0.1256, + "step": 1674 + }, + { + "epoch": 0.762056414922657, + "grad_norm": 0.5964161616065347, + "learning_rate": 9.438367794046053e-06, + "loss": 0.0585, + "step": 1675 + }, + { + "epoch": 0.7625113739763422, + "grad_norm": 0.8617975528364193, + "learning_rate": 9.437709460068882e-06, + "loss": 0.0783, + "step": 1676 + }, + { + "epoch": 0.7629663330300273, + "grad_norm": 0.6361215357389327, + "learning_rate": 9.437050763462132e-06, + "loss": 0.0692, + "step": 1677 + }, + { + "epoch": 0.7634212920837125, + "grad_norm": 0.9790069893643866, + "learning_rate": 9.436391704279632e-06, + "loss": 0.1173, + "step": 1678 + }, + { + "epoch": 0.7638762511373977, + "grad_norm": 1.1287905857392149, + "learning_rate": 9.435732282575235e-06, + "loss": 0.1505, + "step": 1679 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 0.8195744592905398, + "learning_rate": 9.435072498402832e-06, + "loss": 0.0877, + "step": 1680 + }, + { + "epoch": 0.7647861692447679, + "grad_norm": 0.5293612997987346, + "learning_rate": 9.434412351816329e-06, + "loss": 0.0609, + "step": 1681 + }, + { + "epoch": 0.7652411282984531, + "grad_norm": 0.7565664140640663, + "learning_rate": 9.433751842869676e-06, + "loss": 0.0895, + "step": 1682 + }, + { + "epoch": 0.7656960873521383, + "grad_norm": 0.8390610329820178, + "learning_rate": 9.433090971616842e-06, + "loss": 0.0823, + "step": 1683 + }, + { + "epoch": 0.7661510464058234, + "grad_norm": 0.7979326314286513, + "learning_rate": 9.432429738111836e-06, + "loss": 0.0893, + "step": 1684 + }, + { + "epoch": 0.7666060054595086, + "grad_norm": 0.7985876042778349, + "learning_rate": 9.431768142408687e-06, + "loss": 0.0965, + "step": 1685 + }, + { + "epoch": 0.7670609645131938, + "grad_norm": 0.7008114448081032, + "learning_rate": 9.431106184561462e-06, + "loss": 0.0894, + "step": 1686 + }, + { + "epoch": 0.767515923566879, + "grad_norm": 0.8506122352220377, + "learning_rate": 9.430443864624249e-06, + "loss": 0.0949, + "step": 1687 + }, + { + "epoch": 0.7679708826205641, + "grad_norm": 1.0900644244466022, + "learning_rate": 9.429781182651171e-06, + "loss": 0.1211, + "step": 1688 + }, + { + "epoch": 0.7684258416742493, + "grad_norm": 0.585079487316927, + "learning_rate": 9.429118138696378e-06, + "loss": 0.0642, + "step": 1689 + }, + { + "epoch": 0.7688808007279345, + "grad_norm": 0.8727981223997378, + "learning_rate": 9.428454732814055e-06, + "loss": 0.0987, + "step": 1690 + }, + { + "epoch": 0.7693357597816196, + "grad_norm": 0.7032463083497149, + "learning_rate": 9.427790965058407e-06, + "loss": 0.0685, + "step": 1691 + }, + { + "epoch": 0.7697907188353048, + "grad_norm": 0.6784390616651746, + "learning_rate": 9.42712683548368e-06, + "loss": 0.079, + "step": 1692 + }, + { + "epoch": 0.77024567788899, + "grad_norm": 0.774501448184362, + "learning_rate": 9.426462344144138e-06, + "loss": 0.0784, + "step": 1693 + }, + { + "epoch": 0.7707006369426752, + "grad_norm": 0.7793988116138444, + "learning_rate": 9.425797491094086e-06, + "loss": 0.0801, + "step": 1694 + }, + { + "epoch": 0.7711555959963603, + "grad_norm": 0.7642360389143683, + "learning_rate": 9.425132276387847e-06, + "loss": 0.1009, + "step": 1695 + }, + { + "epoch": 0.7716105550500455, + "grad_norm": 0.6080046843370063, + "learning_rate": 9.424466700079785e-06, + "loss": 0.0688, + "step": 1696 + }, + { + "epoch": 0.7720655141037307, + "grad_norm": 0.6270167280264678, + "learning_rate": 9.423800762224283e-06, + "loss": 0.0626, + "step": 1697 + }, + { + "epoch": 0.7725204731574158, + "grad_norm": 0.5357586110049548, + "learning_rate": 9.42313446287576e-06, + "loss": 0.0626, + "step": 1698 + }, + { + "epoch": 0.772975432211101, + "grad_norm": 0.6233095813256608, + "learning_rate": 9.422467802088664e-06, + "loss": 0.0804, + "step": 1699 + }, + { + "epoch": 0.7734303912647862, + "grad_norm": 0.7158265191654914, + "learning_rate": 9.42180077991747e-06, + "loss": 0.0887, + "step": 1700 + }, + { + "epoch": 0.7738853503184714, + "grad_norm": 1.0305735114746193, + "learning_rate": 9.421133396416687e-06, + "loss": 0.1441, + "step": 1701 + }, + { + "epoch": 0.7743403093721565, + "grad_norm": 0.6965845039033058, + "learning_rate": 9.420465651640847e-06, + "loss": 0.079, + "step": 1702 + }, + { + "epoch": 0.7747952684258417, + "grad_norm": 0.4529773063241175, + "learning_rate": 9.419797545644516e-06, + "loss": 0.0443, + "step": 1703 + }, + { + "epoch": 0.7752502274795269, + "grad_norm": 0.5407082720421394, + "learning_rate": 9.41912907848229e-06, + "loss": 0.0625, + "step": 1704 + }, + { + "epoch": 0.775705186533212, + "grad_norm": 0.5625290405803486, + "learning_rate": 9.418460250208791e-06, + "loss": 0.0695, + "step": 1705 + }, + { + "epoch": 0.7761601455868972, + "grad_norm": 0.5288549658523206, + "learning_rate": 9.417791060878677e-06, + "loss": 0.0546, + "step": 1706 + }, + { + "epoch": 0.7766151046405824, + "grad_norm": 0.6390336517076213, + "learning_rate": 9.417121510546626e-06, + "loss": 0.0474, + "step": 1707 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 1.1628554226147039, + "learning_rate": 9.416451599267353e-06, + "loss": 0.1427, + "step": 1708 + }, + { + "epoch": 0.7775250227479527, + "grad_norm": 0.5775794942631142, + "learning_rate": 9.415781327095601e-06, + "loss": 0.0722, + "step": 1709 + }, + { + "epoch": 0.7779799818016379, + "grad_norm": 0.6702327788675698, + "learning_rate": 9.415110694086139e-06, + "loss": 0.0863, + "step": 1710 + }, + { + "epoch": 0.778434940855323, + "grad_norm": 1.0756620214218862, + "learning_rate": 9.41443970029377e-06, + "loss": 0.0916, + "step": 1711 + }, + { + "epoch": 0.7788898999090081, + "grad_norm": 0.6873597883249742, + "learning_rate": 9.413768345773324e-06, + "loss": 0.0928, + "step": 1712 + }, + { + "epoch": 0.7793448589626933, + "grad_norm": 0.546687059556293, + "learning_rate": 9.413096630579661e-06, + "loss": 0.0681, + "step": 1713 + }, + { + "epoch": 0.7797998180163785, + "grad_norm": 0.5882776722743176, + "learning_rate": 9.412424554767672e-06, + "loss": 0.0666, + "step": 1714 + }, + { + "epoch": 0.7802547770700637, + "grad_norm": 0.7757931395434748, + "learning_rate": 9.411752118392272e-06, + "loss": 0.0961, + "step": 1715 + }, + { + "epoch": 0.7807097361237488, + "grad_norm": 0.7533384044089068, + "learning_rate": 9.411079321508416e-06, + "loss": 0.0915, + "step": 1716 + }, + { + "epoch": 0.781164695177434, + "grad_norm": 0.6690633163427073, + "learning_rate": 9.410406164171076e-06, + "loss": 0.0757, + "step": 1717 + }, + { + "epoch": 0.7816196542311192, + "grad_norm": 0.9875033482174213, + "learning_rate": 9.40973264643526e-06, + "loss": 0.1016, + "step": 1718 + }, + { + "epoch": 0.7820746132848043, + "grad_norm": 0.7285855686862363, + "learning_rate": 9.409058768356007e-06, + "loss": 0.0777, + "step": 1719 + }, + { + "epoch": 0.7825295723384895, + "grad_norm": 0.5412833929378409, + "learning_rate": 9.408384529988385e-06, + "loss": 0.0596, + "step": 1720 + }, + { + "epoch": 0.7829845313921747, + "grad_norm": 0.48748390975323075, + "learning_rate": 9.407709931387486e-06, + "loss": 0.0451, + "step": 1721 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 0.8626755233369133, + "learning_rate": 9.407034972608436e-06, + "loss": 0.1093, + "step": 1722 + }, + { + "epoch": 0.783894449499545, + "grad_norm": 0.5986423081381415, + "learning_rate": 9.40635965370639e-06, + "loss": 0.0737, + "step": 1723 + }, + { + "epoch": 0.7843494085532302, + "grad_norm": 0.8697508747552452, + "learning_rate": 9.40568397473653e-06, + "loss": 0.0748, + "step": 1724 + }, + { + "epoch": 0.7848043676069154, + "grad_norm": 0.6651587535516658, + "learning_rate": 9.405007935754076e-06, + "loss": 0.0553, + "step": 1725 + }, + { + "epoch": 0.7852593266606005, + "grad_norm": 1.1307670638395897, + "learning_rate": 9.404331536814265e-06, + "loss": 0.1451, + "step": 1726 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.6724877006657928, + "learning_rate": 9.40365477797237e-06, + "loss": 0.0803, + "step": 1727 + }, + { + "epoch": 0.7861692447679709, + "grad_norm": 0.739524107451132, + "learning_rate": 9.40297765928369e-06, + "loss": 0.0713, + "step": 1728 + }, + { + "epoch": 0.7866242038216561, + "grad_norm": 0.6341880042511068, + "learning_rate": 9.402300180803563e-06, + "loss": 0.0739, + "step": 1729 + }, + { + "epoch": 0.7870791628753412, + "grad_norm": 0.5809522499341311, + "learning_rate": 9.401622342587346e-06, + "loss": 0.067, + "step": 1730 + }, + { + "epoch": 0.7875341219290264, + "grad_norm": 0.6208756444695567, + "learning_rate": 9.400944144690428e-06, + "loss": 0.0865, + "step": 1731 + }, + { + "epoch": 0.7879890809827116, + "grad_norm": 0.7358085271263743, + "learning_rate": 9.400265587168226e-06, + "loss": 0.0827, + "step": 1732 + }, + { + "epoch": 0.7884440400363967, + "grad_norm": 0.6985098389174249, + "learning_rate": 9.399586670076196e-06, + "loss": 0.0784, + "step": 1733 + }, + { + "epoch": 0.7888989990900819, + "grad_norm": 0.6524277365731544, + "learning_rate": 9.39890739346981e-06, + "loss": 0.0759, + "step": 1734 + }, + { + "epoch": 0.7893539581437671, + "grad_norm": 0.8500489687124628, + "learning_rate": 9.398227757404576e-06, + "loss": 0.1139, + "step": 1735 + }, + { + "epoch": 0.7898089171974523, + "grad_norm": 0.49161558761743546, + "learning_rate": 9.397547761936034e-06, + "loss": 0.0445, + "step": 1736 + }, + { + "epoch": 0.7902638762511374, + "grad_norm": 0.3886581827401007, + "learning_rate": 9.396867407119748e-06, + "loss": 0.0387, + "step": 1737 + }, + { + "epoch": 0.7907188353048226, + "grad_norm": 0.43315626329206963, + "learning_rate": 9.396186693011312e-06, + "loss": 0.0484, + "step": 1738 + }, + { + "epoch": 0.7911737943585078, + "grad_norm": 0.7578063731873546, + "learning_rate": 9.395505619666353e-06, + "loss": 0.0872, + "step": 1739 + }, + { + "epoch": 0.7916287534121929, + "grad_norm": 0.9087897001540515, + "learning_rate": 9.394824187140526e-06, + "loss": 0.0914, + "step": 1740 + }, + { + "epoch": 0.792083712465878, + "grad_norm": 0.5994634977370948, + "learning_rate": 9.394142395489512e-06, + "loss": 0.061, + "step": 1741 + }, + { + "epoch": 0.7925386715195633, + "grad_norm": 0.6263578026813904, + "learning_rate": 9.393460244769023e-06, + "loss": 0.0608, + "step": 1742 + }, + { + "epoch": 0.7929936305732485, + "grad_norm": 0.5753033056961346, + "learning_rate": 9.392777735034807e-06, + "loss": 0.0721, + "step": 1743 + }, + { + "epoch": 0.7934485896269335, + "grad_norm": 0.6561198773299641, + "learning_rate": 9.392094866342632e-06, + "loss": 0.0599, + "step": 1744 + }, + { + "epoch": 0.7939035486806187, + "grad_norm": 0.7317990056550264, + "learning_rate": 9.391411638748297e-06, + "loss": 0.0742, + "step": 1745 + }, + { + "epoch": 0.7943585077343039, + "grad_norm": 0.5011723772780661, + "learning_rate": 9.390728052307637e-06, + "loss": 0.0647, + "step": 1746 + }, + { + "epoch": 0.794813466787989, + "grad_norm": 0.6867846904523061, + "learning_rate": 9.390044107076506e-06, + "loss": 0.0779, + "step": 1747 + }, + { + "epoch": 0.7952684258416742, + "grad_norm": 0.9267872196876082, + "learning_rate": 9.389359803110796e-06, + "loss": 0.1001, + "step": 1748 + }, + { + "epoch": 0.7957233848953594, + "grad_norm": 3.487580179742763, + "learning_rate": 9.388675140466427e-06, + "loss": 0.1841, + "step": 1749 + }, + { + "epoch": 0.7961783439490446, + "grad_norm": 0.6520959532750612, + "learning_rate": 9.387990119199343e-06, + "loss": 0.0714, + "step": 1750 + }, + { + "epoch": 0.7966333030027297, + "grad_norm": 0.8129917876989495, + "learning_rate": 9.387304739365524e-06, + "loss": 0.0949, + "step": 1751 + }, + { + "epoch": 0.7970882620564149, + "grad_norm": 0.6276053555905522, + "learning_rate": 9.386619001020974e-06, + "loss": 0.0552, + "step": 1752 + }, + { + "epoch": 0.7975432211101001, + "grad_norm": 0.7632340875896291, + "learning_rate": 9.385932904221729e-06, + "loss": 0.0655, + "step": 1753 + }, + { + "epoch": 0.7979981801637852, + "grad_norm": 0.7239218776412117, + "learning_rate": 9.385246449023853e-06, + "loss": 0.1113, + "step": 1754 + }, + { + "epoch": 0.7984531392174704, + "grad_norm": 1.0468381569335767, + "learning_rate": 9.38455963548344e-06, + "loss": 0.1042, + "step": 1755 + }, + { + "epoch": 0.7989080982711556, + "grad_norm": 0.8019558864262506, + "learning_rate": 9.383872463656616e-06, + "loss": 0.0868, + "step": 1756 + }, + { + "epoch": 0.7993630573248408, + "grad_norm": 0.7449121488820226, + "learning_rate": 9.383184933599531e-06, + "loss": 0.0945, + "step": 1757 + }, + { + "epoch": 0.7998180163785259, + "grad_norm": 0.5905383438931077, + "learning_rate": 9.382497045368368e-06, + "loss": 0.0672, + "step": 1758 + }, + { + "epoch": 0.8002729754322111, + "grad_norm": 0.5337189472762474, + "learning_rate": 9.381808799019336e-06, + "loss": 0.0509, + "step": 1759 + }, + { + "epoch": 0.8007279344858963, + "grad_norm": 1.0483707789224317, + "learning_rate": 9.38112019460868e-06, + "loss": 0.1069, + "step": 1760 + }, + { + "epoch": 0.8011828935395814, + "grad_norm": 0.8974041640796228, + "learning_rate": 9.380431232192663e-06, + "loss": 0.1061, + "step": 1761 + }, + { + "epoch": 0.8016378525932666, + "grad_norm": 0.774987790741639, + "learning_rate": 9.379741911827591e-06, + "loss": 0.0971, + "step": 1762 + }, + { + "epoch": 0.8020928116469518, + "grad_norm": 0.5037991292329869, + "learning_rate": 9.379052233569788e-06, + "loss": 0.0545, + "step": 1763 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 0.7571282390818425, + "learning_rate": 9.37836219747561e-06, + "loss": 0.0774, + "step": 1764 + }, + { + "epoch": 0.8030027297543221, + "grad_norm": 0.47374252215612206, + "learning_rate": 9.377671803601447e-06, + "loss": 0.0479, + "step": 1765 + }, + { + "epoch": 0.8034576888080073, + "grad_norm": 0.563871853603133, + "learning_rate": 9.376981052003713e-06, + "loss": 0.0583, + "step": 1766 + }, + { + "epoch": 0.8039126478616925, + "grad_norm": 0.7260639419055305, + "learning_rate": 9.376289942738855e-06, + "loss": 0.0739, + "step": 1767 + }, + { + "epoch": 0.8043676069153776, + "grad_norm": 0.7704639306429572, + "learning_rate": 9.375598475863345e-06, + "loss": 0.08, + "step": 1768 + }, + { + "epoch": 0.8048225659690628, + "grad_norm": 0.8052864772012752, + "learning_rate": 9.374906651433689e-06, + "loss": 0.1155, + "step": 1769 + }, + { + "epoch": 0.805277525022748, + "grad_norm": 0.945940660466259, + "learning_rate": 9.374214469506416e-06, + "loss": 0.0942, + "step": 1770 + }, + { + "epoch": 0.8057324840764332, + "grad_norm": 0.8382092898318407, + "learning_rate": 9.373521930138092e-06, + "loss": 0.0831, + "step": 1771 + }, + { + "epoch": 0.8061874431301183, + "grad_norm": 0.5910933141386769, + "learning_rate": 9.372829033385306e-06, + "loss": 0.0825, + "step": 1772 + }, + { + "epoch": 0.8066424021838035, + "grad_norm": 0.7616883112365667, + "learning_rate": 9.37213577930468e-06, + "loss": 0.0907, + "step": 1773 + }, + { + "epoch": 0.8070973612374887, + "grad_norm": 0.9571485234330176, + "learning_rate": 9.37144216795286e-06, + "loss": 0.1322, + "step": 1774 + }, + { + "epoch": 0.8075523202911737, + "grad_norm": 0.770430324420924, + "learning_rate": 9.370748199386529e-06, + "loss": 0.0821, + "step": 1775 + }, + { + "epoch": 0.8080072793448589, + "grad_norm": 0.6303205378749905, + "learning_rate": 9.370053873662393e-06, + "loss": 0.0694, + "step": 1776 + }, + { + "epoch": 0.8084622383985441, + "grad_norm": 0.6777135846807264, + "learning_rate": 9.36935919083719e-06, + "loss": 0.0685, + "step": 1777 + }, + { + "epoch": 0.8089171974522293, + "grad_norm": 0.7319936383805717, + "learning_rate": 9.368664150967686e-06, + "loss": 0.0679, + "step": 1778 + }, + { + "epoch": 0.8093721565059144, + "grad_norm": 0.7990830113911501, + "learning_rate": 9.367968754110675e-06, + "loss": 0.1023, + "step": 1779 + }, + { + "epoch": 0.8098271155595996, + "grad_norm": 0.5223284241529513, + "learning_rate": 9.367273000322983e-06, + "loss": 0.063, + "step": 1780 + }, + { + "epoch": 0.8102820746132848, + "grad_norm": 1.040419010652034, + "learning_rate": 9.366576889661465e-06, + "loss": 0.1236, + "step": 1781 + }, + { + "epoch": 0.8107370336669699, + "grad_norm": 0.6404250074887077, + "learning_rate": 9.365880422183003e-06, + "loss": 0.0656, + "step": 1782 + }, + { + "epoch": 0.8111919927206551, + "grad_norm": 0.7564675990794105, + "learning_rate": 9.365183597944506e-06, + "loss": 0.0725, + "step": 1783 + }, + { + "epoch": 0.8116469517743403, + "grad_norm": 0.5955963027805166, + "learning_rate": 9.364486417002922e-06, + "loss": 0.07, + "step": 1784 + }, + { + "epoch": 0.8121019108280255, + "grad_norm": 0.6658882483856376, + "learning_rate": 9.363788879415217e-06, + "loss": 0.0616, + "step": 1785 + }, + { + "epoch": 0.8125568698817106, + "grad_norm": 0.6032274064354748, + "learning_rate": 9.36309098523839e-06, + "loss": 0.0688, + "step": 1786 + }, + { + "epoch": 0.8130118289353958, + "grad_norm": 0.7627355718580127, + "learning_rate": 9.362392734529472e-06, + "loss": 0.0841, + "step": 1787 + }, + { + "epoch": 0.813466787989081, + "grad_norm": 0.6581922552034235, + "learning_rate": 9.361694127345523e-06, + "loss": 0.0773, + "step": 1788 + }, + { + "epoch": 0.8139217470427661, + "grad_norm": 0.5723109702485146, + "learning_rate": 9.360995163743622e-06, + "loss": 0.0755, + "step": 1789 + }, + { + "epoch": 0.8143767060964513, + "grad_norm": 0.8492692664232014, + "learning_rate": 9.360295843780893e-06, + "loss": 0.084, + "step": 1790 + }, + { + "epoch": 0.8148316651501365, + "grad_norm": 0.7138327780528116, + "learning_rate": 9.35959616751448e-06, + "loss": 0.0754, + "step": 1791 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 0.7513269368015193, + "learning_rate": 9.358896135001555e-06, + "loss": 0.075, + "step": 1792 + }, + { + "epoch": 0.8157415832575068, + "grad_norm": 6.226904157676098, + "learning_rate": 9.35819574629932e-06, + "loss": 0.2447, + "step": 1793 + }, + { + "epoch": 0.816196542311192, + "grad_norm": 0.9632842432595244, + "learning_rate": 9.35749500146501e-06, + "loss": 0.0968, + "step": 1794 + }, + { + "epoch": 0.8166515013648772, + "grad_norm": 0.6910899092527569, + "learning_rate": 9.356793900555891e-06, + "loss": 0.0736, + "step": 1795 + }, + { + "epoch": 0.8171064604185623, + "grad_norm": 0.8430341812657529, + "learning_rate": 9.356092443629247e-06, + "loss": 0.0929, + "step": 1796 + }, + { + "epoch": 0.8175614194722475, + "grad_norm": 0.7425545237339678, + "learning_rate": 9.355390630742401e-06, + "loss": 0.1005, + "step": 1797 + }, + { + "epoch": 0.8180163785259327, + "grad_norm": 0.7004618898733044, + "learning_rate": 9.3546884619527e-06, + "loss": 0.0789, + "step": 1798 + }, + { + "epoch": 0.8184713375796179, + "grad_norm": 0.5461552026045962, + "learning_rate": 9.353985937317525e-06, + "loss": 0.0763, + "step": 1799 + }, + { + "epoch": 0.818926296633303, + "grad_norm": 0.6222175380121098, + "learning_rate": 9.35328305689428e-06, + "loss": 0.0754, + "step": 1800 + }, + { + "epoch": 0.8193812556869882, + "grad_norm": 0.7386705168753549, + "learning_rate": 9.352579820740404e-06, + "loss": 0.0641, + "step": 1801 + }, + { + "epoch": 0.8198362147406734, + "grad_norm": 1.2544587029581489, + "learning_rate": 9.351876228913363e-06, + "loss": 0.107, + "step": 1802 + }, + { + "epoch": 0.8202911737943585, + "grad_norm": 0.6546855629883478, + "learning_rate": 9.351172281470645e-06, + "loss": 0.0781, + "step": 1803 + }, + { + "epoch": 0.8207461328480437, + "grad_norm": 0.7485647273392206, + "learning_rate": 9.350467978469782e-06, + "loss": 0.0898, + "step": 1804 + }, + { + "epoch": 0.8212010919017289, + "grad_norm": 0.5530668925780788, + "learning_rate": 9.34976331996832e-06, + "loss": 0.057, + "step": 1805 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 0.870085999603916, + "learning_rate": 9.349058306023844e-06, + "loss": 0.1077, + "step": 1806 + }, + { + "epoch": 0.8221110100090991, + "grad_norm": 0.891036381079533, + "learning_rate": 9.348352936693964e-06, + "loss": 0.1082, + "step": 1807 + }, + { + "epoch": 0.8225659690627843, + "grad_norm": 0.5641275258385202, + "learning_rate": 9.347647212036316e-06, + "loss": 0.0613, + "step": 1808 + }, + { + "epoch": 0.8230209281164695, + "grad_norm": 0.7163257638587112, + "learning_rate": 9.346941132108575e-06, + "loss": 0.0842, + "step": 1809 + }, + { + "epoch": 0.8234758871701547, + "grad_norm": 0.7333770270884309, + "learning_rate": 9.346234696968435e-06, + "loss": 0.0782, + "step": 1810 + }, + { + "epoch": 0.8239308462238398, + "grad_norm": 0.5399164747367127, + "learning_rate": 9.345527906673622e-06, + "loss": 0.0676, + "step": 1811 + }, + { + "epoch": 0.824385805277525, + "grad_norm": 1.0476291790994476, + "learning_rate": 9.344820761281892e-06, + "loss": 0.0984, + "step": 1812 + }, + { + "epoch": 0.8248407643312102, + "grad_norm": 0.639304845804496, + "learning_rate": 9.344113260851031e-06, + "loss": 0.0764, + "step": 1813 + }, + { + "epoch": 0.8252957233848953, + "grad_norm": 0.6071291165528282, + "learning_rate": 9.343405405438852e-06, + "loss": 0.0707, + "step": 1814 + }, + { + "epoch": 0.8257506824385805, + "grad_norm": 0.6973111552871604, + "learning_rate": 9.342697195103199e-06, + "loss": 0.0917, + "step": 1815 + }, + { + "epoch": 0.8262056414922657, + "grad_norm": 0.6486872321285189, + "learning_rate": 9.341988629901942e-06, + "loss": 0.0725, + "step": 1816 + }, + { + "epoch": 0.8266606005459509, + "grad_norm": 0.5216883119977757, + "learning_rate": 9.341279709892981e-06, + "loss": 0.0572, + "step": 1817 + }, + { + "epoch": 0.827115559599636, + "grad_norm": 0.4472530755665983, + "learning_rate": 9.340570435134248e-06, + "loss": 0.0412, + "step": 1818 + }, + { + "epoch": 0.8275705186533212, + "grad_norm": 0.786165560489741, + "learning_rate": 9.339860805683703e-06, + "loss": 0.0905, + "step": 1819 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 0.8504390923669081, + "learning_rate": 9.33915082159933e-06, + "loss": 0.0761, + "step": 1820 + }, + { + "epoch": 0.8284804367606915, + "grad_norm": 0.5303034158640553, + "learning_rate": 9.338440482939146e-06, + "loss": 0.0735, + "step": 1821 + }, + { + "epoch": 0.8289353958143767, + "grad_norm": 0.6407993820931909, + "learning_rate": 9.337729789761199e-06, + "loss": 0.0829, + "step": 1822 + }, + { + "epoch": 0.8293903548680619, + "grad_norm": 2.670877671269915, + "learning_rate": 9.337018742123563e-06, + "loss": 0.1871, + "step": 1823 + }, + { + "epoch": 0.8298453139217471, + "grad_norm": 1.0355313595445745, + "learning_rate": 9.336307340084341e-06, + "loss": 0.0955, + "step": 1824 + }, + { + "epoch": 0.8303002729754322, + "grad_norm": 0.6127983226216669, + "learning_rate": 9.335595583701667e-06, + "loss": 0.0639, + "step": 1825 + }, + { + "epoch": 0.8307552320291174, + "grad_norm": 0.6196615465194765, + "learning_rate": 9.334883473033699e-06, + "loss": 0.0706, + "step": 1826 + }, + { + "epoch": 0.8312101910828026, + "grad_norm": 0.7243682512181147, + "learning_rate": 9.33417100813863e-06, + "loss": 0.0869, + "step": 1827 + }, + { + "epoch": 0.8316651501364877, + "grad_norm": 0.94108166831404, + "learning_rate": 9.33345818907468e-06, + "loss": 0.1349, + "step": 1828 + }, + { + "epoch": 0.8321201091901729, + "grad_norm": 4.6896190497823955, + "learning_rate": 9.332745015900097e-06, + "loss": 0.1125, + "step": 1829 + }, + { + "epoch": 0.8325750682438581, + "grad_norm": 0.7268733027831774, + "learning_rate": 9.332031488673156e-06, + "loss": 0.0651, + "step": 1830 + }, + { + "epoch": 0.8330300272975433, + "grad_norm": 0.5169699897246913, + "learning_rate": 9.331317607452166e-06, + "loss": 0.0683, + "step": 1831 + }, + { + "epoch": 0.8334849863512284, + "grad_norm": 0.5056561715785393, + "learning_rate": 9.330603372295463e-06, + "loss": 0.0568, + "step": 1832 + }, + { + "epoch": 0.8339399454049136, + "grad_norm": 0.5749009883761049, + "learning_rate": 9.329888783261408e-06, + "loss": 0.0594, + "step": 1833 + }, + { + "epoch": 0.8343949044585988, + "grad_norm": 0.6696966952437984, + "learning_rate": 9.329173840408394e-06, + "loss": 0.0764, + "step": 1834 + }, + { + "epoch": 0.8348498635122839, + "grad_norm": 0.7329039198928983, + "learning_rate": 9.328458543794844e-06, + "loss": 0.0729, + "step": 1835 + }, + { + "epoch": 0.835304822565969, + "grad_norm": 0.5892831520257552, + "learning_rate": 9.327742893479212e-06, + "loss": 0.0838, + "step": 1836 + }, + { + "epoch": 0.8357597816196543, + "grad_norm": 0.848350653615326, + "learning_rate": 9.327026889519973e-06, + "loss": 0.0778, + "step": 1837 + }, + { + "epoch": 0.8362147406733395, + "grad_norm": 0.939837339633871, + "learning_rate": 9.326310531975636e-06, + "loss": 0.1005, + "step": 1838 + }, + { + "epoch": 0.8366696997270245, + "grad_norm": 0.6312875650471034, + "learning_rate": 9.32559382090474e-06, + "loss": 0.0626, + "step": 1839 + }, + { + "epoch": 0.8371246587807097, + "grad_norm": 0.9586580739045799, + "learning_rate": 9.324876756365853e-06, + "loss": 0.1154, + "step": 1840 + }, + { + "epoch": 0.8375796178343949, + "grad_norm": 0.6108920091747637, + "learning_rate": 9.324159338417566e-06, + "loss": 0.0674, + "step": 1841 + }, + { + "epoch": 0.83803457688808, + "grad_norm": 0.9247779620401613, + "learning_rate": 9.323441567118508e-06, + "loss": 0.11, + "step": 1842 + }, + { + "epoch": 0.8384895359417652, + "grad_norm": 0.6152452902665, + "learning_rate": 9.322723442527328e-06, + "loss": 0.0657, + "step": 1843 + }, + { + "epoch": 0.8389444949954504, + "grad_norm": 0.6579130646316164, + "learning_rate": 9.32200496470271e-06, + "loss": 0.0721, + "step": 1844 + }, + { + "epoch": 0.8393994540491356, + "grad_norm": 0.6812573423845587, + "learning_rate": 9.321286133703365e-06, + "loss": 0.0627, + "step": 1845 + }, + { + "epoch": 0.8398544131028207, + "grad_norm": 0.5946100319565307, + "learning_rate": 9.320566949588031e-06, + "loss": 0.0708, + "step": 1846 + }, + { + "epoch": 0.8403093721565059, + "grad_norm": 0.6319246275087805, + "learning_rate": 9.319847412415477e-06, + "loss": 0.0651, + "step": 1847 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 0.6789460664352271, + "learning_rate": 9.3191275222445e-06, + "loss": 0.0707, + "step": 1848 + }, + { + "epoch": 0.8412192902638762, + "grad_norm": 0.4396253526793688, + "learning_rate": 9.31840727913393e-06, + "loss": 0.0431, + "step": 1849 + }, + { + "epoch": 0.8416742493175614, + "grad_norm": 0.6745617928769184, + "learning_rate": 9.317686683142616e-06, + "loss": 0.0747, + "step": 1850 + }, + { + "epoch": 0.8421292083712466, + "grad_norm": 0.6924165554321049, + "learning_rate": 9.316965734329447e-06, + "loss": 0.0575, + "step": 1851 + }, + { + "epoch": 0.8425841674249318, + "grad_norm": 0.7219679526943963, + "learning_rate": 9.316244432753332e-06, + "loss": 0.0935, + "step": 1852 + }, + { + "epoch": 0.8430391264786169, + "grad_norm": 1.0205930330831676, + "learning_rate": 9.315522778473214e-06, + "loss": 0.1213, + "step": 1853 + }, + { + "epoch": 0.8434940855323021, + "grad_norm": 1.009181015179975, + "learning_rate": 9.314800771548064e-06, + "loss": 0.1049, + "step": 1854 + }, + { + "epoch": 0.8439490445859873, + "grad_norm": 0.7263916504334191, + "learning_rate": 9.31407841203688e-06, + "loss": 0.1025, + "step": 1855 + }, + { + "epoch": 0.8444040036396724, + "grad_norm": 0.6276487176726284, + "learning_rate": 9.31335569999869e-06, + "loss": 0.0587, + "step": 1856 + }, + { + "epoch": 0.8448589626933576, + "grad_norm": 0.6171084743549562, + "learning_rate": 9.31263263549255e-06, + "loss": 0.0495, + "step": 1857 + }, + { + "epoch": 0.8453139217470428, + "grad_norm": 0.6730791565382994, + "learning_rate": 9.31190921857755e-06, + "loss": 0.0789, + "step": 1858 + }, + { + "epoch": 0.845768880800728, + "grad_norm": 0.7874386993734893, + "learning_rate": 9.311185449312798e-06, + "loss": 0.088, + "step": 1859 + }, + { + "epoch": 0.8462238398544131, + "grad_norm": 0.5073783803158326, + "learning_rate": 9.310461327757442e-06, + "loss": 0.0561, + "step": 1860 + }, + { + "epoch": 0.8466787989080983, + "grad_norm": 0.6051266904327832, + "learning_rate": 9.309736853970652e-06, + "loss": 0.0688, + "step": 1861 + }, + { + "epoch": 0.8471337579617835, + "grad_norm": 1.0483500354699085, + "learning_rate": 9.309012028011628e-06, + "loss": 0.1346, + "step": 1862 + }, + { + "epoch": 0.8475887170154686, + "grad_norm": 0.9049471090474998, + "learning_rate": 9.3082868499396e-06, + "loss": 0.0986, + "step": 1863 + }, + { + "epoch": 0.8480436760691538, + "grad_norm": 0.47381125867485346, + "learning_rate": 9.307561319813829e-06, + "loss": 0.058, + "step": 1864 + }, + { + "epoch": 0.848498635122839, + "grad_norm": 0.7964538075850383, + "learning_rate": 9.306835437693597e-06, + "loss": 0.0829, + "step": 1865 + }, + { + "epoch": 0.8489535941765242, + "grad_norm": 0.9919343521297046, + "learning_rate": 9.306109203638225e-06, + "loss": 0.0885, + "step": 1866 + }, + { + "epoch": 0.8494085532302093, + "grad_norm": 1.4502514405100166, + "learning_rate": 9.305382617707052e-06, + "loss": 0.1023, + "step": 1867 + }, + { + "epoch": 0.8498635122838945, + "grad_norm": 0.7238180713867792, + "learning_rate": 9.304655679959459e-06, + "loss": 0.0813, + "step": 1868 + }, + { + "epoch": 0.8503184713375797, + "grad_norm": 0.7360849022013412, + "learning_rate": 9.303928390454839e-06, + "loss": 0.0671, + "step": 1869 + }, + { + "epoch": 0.8507734303912647, + "grad_norm": 0.5803360108595549, + "learning_rate": 9.30320074925263e-06, + "loss": 0.075, + "step": 1870 + }, + { + "epoch": 0.8512283894449499, + "grad_norm": 0.6838093346854254, + "learning_rate": 9.302472756412288e-06, + "loss": 0.0812, + "step": 1871 + }, + { + "epoch": 0.8516833484986351, + "grad_norm": 0.8850924783689049, + "learning_rate": 9.301744411993302e-06, + "loss": 0.0991, + "step": 1872 + }, + { + "epoch": 0.8521383075523203, + "grad_norm": 0.8273381019086633, + "learning_rate": 9.30101571605519e-06, + "loss": 0.0803, + "step": 1873 + }, + { + "epoch": 0.8525932666060054, + "grad_norm": 0.6554434764444423, + "learning_rate": 9.300286668657495e-06, + "loss": 0.0737, + "step": 1874 + }, + { + "epoch": 0.8530482256596906, + "grad_norm": 0.8230660869280486, + "learning_rate": 9.299557269859795e-06, + "loss": 0.0748, + "step": 1875 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 0.609738768294497, + "learning_rate": 9.298827519721692e-06, + "loss": 0.0608, + "step": 1876 + }, + { + "epoch": 0.8539581437670609, + "grad_norm": 0.7433208516076715, + "learning_rate": 9.298097418302817e-06, + "loss": 0.0992, + "step": 1877 + }, + { + "epoch": 0.8544131028207461, + "grad_norm": 0.5414027711398505, + "learning_rate": 9.29736696566283e-06, + "loss": 0.0642, + "step": 1878 + }, + { + "epoch": 0.8548680618744313, + "grad_norm": 0.8950820233319129, + "learning_rate": 9.296636161861422e-06, + "loss": 0.1121, + "step": 1879 + }, + { + "epoch": 0.8553230209281165, + "grad_norm": 2.0225500877401617, + "learning_rate": 9.295905006958308e-06, + "loss": 0.1409, + "step": 1880 + }, + { + "epoch": 0.8557779799818016, + "grad_norm": 0.7783660516278756, + "learning_rate": 9.295173501013239e-06, + "loss": 0.0974, + "step": 1881 + }, + { + "epoch": 0.8562329390354868, + "grad_norm": 0.7064043776078144, + "learning_rate": 9.29444164408599e-06, + "loss": 0.0954, + "step": 1882 + }, + { + "epoch": 0.856687898089172, + "grad_norm": 0.6658976396134992, + "learning_rate": 9.29370943623636e-06, + "loss": 0.0636, + "step": 1883 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.6825106501213147, + "learning_rate": 9.292976877524189e-06, + "loss": 0.0908, + "step": 1884 + }, + { + "epoch": 0.8575978161965423, + "grad_norm": 0.8132731569130554, + "learning_rate": 9.292243968009332e-06, + "loss": 0.0952, + "step": 1885 + }, + { + "epoch": 0.8580527752502275, + "grad_norm": 1.283740720887758, + "learning_rate": 9.29151070775168e-06, + "loss": 0.1407, + "step": 1886 + }, + { + "epoch": 0.8585077343039127, + "grad_norm": 0.8987444265022443, + "learning_rate": 9.290777096811156e-06, + "loss": 0.1008, + "step": 1887 + }, + { + "epoch": 0.8589626933575978, + "grad_norm": 0.9027753674161602, + "learning_rate": 9.290043135247704e-06, + "loss": 0.0917, + "step": 1888 + }, + { + "epoch": 0.859417652411283, + "grad_norm": 0.7721264653335534, + "learning_rate": 9.289308823121302e-06, + "loss": 0.0876, + "step": 1889 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 0.8645055674602313, + "learning_rate": 9.28857416049195e-06, + "loss": 0.0775, + "step": 1890 + }, + { + "epoch": 0.8603275705186533, + "grad_norm": 0.7828026058785104, + "learning_rate": 9.287839147419685e-06, + "loss": 0.0953, + "step": 1891 + }, + { + "epoch": 0.8607825295723385, + "grad_norm": 0.7581321197025821, + "learning_rate": 9.287103783964571e-06, + "loss": 0.1004, + "step": 1892 + }, + { + "epoch": 0.8612374886260237, + "grad_norm": 0.5836098633522236, + "learning_rate": 9.286368070186696e-06, + "loss": 0.0586, + "step": 1893 + }, + { + "epoch": 0.8616924476797089, + "grad_norm": 0.8102404855384281, + "learning_rate": 9.285632006146178e-06, + "loss": 0.0809, + "step": 1894 + }, + { + "epoch": 0.862147406733394, + "grad_norm": 0.5684276012396848, + "learning_rate": 9.284895591903167e-06, + "loss": 0.0736, + "step": 1895 + }, + { + "epoch": 0.8626023657870792, + "grad_norm": 0.629014301705328, + "learning_rate": 9.284158827517838e-06, + "loss": 0.0707, + "step": 1896 + }, + { + "epoch": 0.8630573248407644, + "grad_norm": 0.6150335967135018, + "learning_rate": 9.283421713050398e-06, + "loss": 0.0665, + "step": 1897 + }, + { + "epoch": 0.8635122838944495, + "grad_norm": 0.7977181385850289, + "learning_rate": 9.282684248561078e-06, + "loss": 0.1077, + "step": 1898 + }, + { + "epoch": 0.8639672429481347, + "grad_norm": 0.5184482645002529, + "learning_rate": 9.281946434110141e-06, + "loss": 0.0594, + "step": 1899 + }, + { + "epoch": 0.8644222020018199, + "grad_norm": 0.7148270230091635, + "learning_rate": 9.28120826975788e-06, + "loss": 0.1005, + "step": 1900 + }, + { + "epoch": 0.864877161055505, + "grad_norm": 0.6020497479816633, + "learning_rate": 9.280469755564613e-06, + "loss": 0.0595, + "step": 1901 + }, + { + "epoch": 0.8653321201091901, + "grad_norm": 0.7725143836000526, + "learning_rate": 9.279730891590688e-06, + "loss": 0.063, + "step": 1902 + }, + { + "epoch": 0.8657870791628753, + "grad_norm": 0.5341160118168524, + "learning_rate": 9.27899167789648e-06, + "loss": 0.0649, + "step": 1903 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 0.78025783272878, + "learning_rate": 9.278252114542398e-06, + "loss": 0.0987, + "step": 1904 + }, + { + "epoch": 0.8666969972702456, + "grad_norm": 1.0383225939834173, + "learning_rate": 9.277512201588871e-06, + "loss": 0.1532, + "step": 1905 + }, + { + "epoch": 0.8671519563239308, + "grad_norm": 0.742851971816876, + "learning_rate": 9.276771939096367e-06, + "loss": 0.1083, + "step": 1906 + }, + { + "epoch": 0.867606915377616, + "grad_norm": 0.6246586544484709, + "learning_rate": 9.276031327125371e-06, + "loss": 0.0798, + "step": 1907 + }, + { + "epoch": 0.8680618744313012, + "grad_norm": 0.6937230711216974, + "learning_rate": 9.275290365736408e-06, + "loss": 0.0764, + "step": 1908 + }, + { + "epoch": 0.8685168334849863, + "grad_norm": 0.6405216327010745, + "learning_rate": 9.274549054990022e-06, + "loss": 0.0553, + "step": 1909 + }, + { + "epoch": 0.8689717925386715, + "grad_norm": 0.6118088958703919, + "learning_rate": 9.273807394946791e-06, + "loss": 0.0719, + "step": 1910 + }, + { + "epoch": 0.8694267515923567, + "grad_norm": 0.5929451056907732, + "learning_rate": 9.27306538566732e-06, + "loss": 0.0736, + "step": 1911 + }, + { + "epoch": 0.8698817106460418, + "grad_norm": 0.551189089448713, + "learning_rate": 9.272323027212244e-06, + "loss": 0.0802, + "step": 1912 + }, + { + "epoch": 0.870336669699727, + "grad_norm": 0.6964950682522272, + "learning_rate": 9.271580319642221e-06, + "loss": 0.0956, + "step": 1913 + }, + { + "epoch": 0.8707916287534122, + "grad_norm": 0.656523844824833, + "learning_rate": 9.270837263017947e-06, + "loss": 0.0716, + "step": 1914 + }, + { + "epoch": 0.8712465878070974, + "grad_norm": 0.5516956702822526, + "learning_rate": 9.270093857400138e-06, + "loss": 0.0756, + "step": 1915 + }, + { + "epoch": 0.8717015468607825, + "grad_norm": 0.6458984664434074, + "learning_rate": 9.269350102849542e-06, + "loss": 0.0762, + "step": 1916 + }, + { + "epoch": 0.8721565059144677, + "grad_norm": 0.6244797606471136, + "learning_rate": 9.268605999426936e-06, + "loss": 0.066, + "step": 1917 + }, + { + "epoch": 0.8726114649681529, + "grad_norm": 1.3051429800547985, + "learning_rate": 9.267861547193126e-06, + "loss": 0.1487, + "step": 1918 + }, + { + "epoch": 0.873066424021838, + "grad_norm": 0.9503536634109886, + "learning_rate": 9.267116746208944e-06, + "loss": 0.1088, + "step": 1919 + }, + { + "epoch": 0.8735213830755232, + "grad_norm": 0.6872044557187451, + "learning_rate": 9.26637159653525e-06, + "loss": 0.0952, + "step": 1920 + }, + { + "epoch": 0.8739763421292084, + "grad_norm": 0.8261797174841458, + "learning_rate": 9.265626098232934e-06, + "loss": 0.0917, + "step": 1921 + }, + { + "epoch": 0.8744313011828936, + "grad_norm": 0.6285868744907084, + "learning_rate": 9.26488025136292e-06, + "loss": 0.0736, + "step": 1922 + }, + { + "epoch": 0.8748862602365787, + "grad_norm": 0.95408072866655, + "learning_rate": 9.264134055986152e-06, + "loss": 0.09, + "step": 1923 + }, + { + "epoch": 0.8753412192902639, + "grad_norm": 0.8126928412084633, + "learning_rate": 9.263387512163604e-06, + "loss": 0.0861, + "step": 1924 + }, + { + "epoch": 0.8757961783439491, + "grad_norm": 0.628340619476289, + "learning_rate": 9.262640619956282e-06, + "loss": 0.0853, + "step": 1925 + }, + { + "epoch": 0.8762511373976342, + "grad_norm": 0.822645279842771, + "learning_rate": 9.261893379425218e-06, + "loss": 0.0921, + "step": 1926 + }, + { + "epoch": 0.8767060964513194, + "grad_norm": 0.664699910134531, + "learning_rate": 9.261145790631475e-06, + "loss": 0.0661, + "step": 1927 + }, + { + "epoch": 0.8771610555050046, + "grad_norm": 0.46120202232971963, + "learning_rate": 9.26039785363614e-06, + "loss": 0.0548, + "step": 1928 + }, + { + "epoch": 0.8776160145586898, + "grad_norm": 0.47348608915538554, + "learning_rate": 9.259649568500333e-06, + "loss": 0.0579, + "step": 1929 + }, + { + "epoch": 0.8780709736123748, + "grad_norm": 0.5421377090850338, + "learning_rate": 9.258900935285199e-06, + "loss": 0.0591, + "step": 1930 + }, + { + "epoch": 0.87852593266606, + "grad_norm": 0.5523212054660892, + "learning_rate": 9.258151954051914e-06, + "loss": 0.0757, + "step": 1931 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 0.733320680764707, + "learning_rate": 9.25740262486168e-06, + "loss": 0.0999, + "step": 1932 + }, + { + "epoch": 0.8794358507734303, + "grad_norm": 0.5636961368288687, + "learning_rate": 9.25665294777573e-06, + "loss": 0.0525, + "step": 1933 + }, + { + "epoch": 0.8798908098271155, + "grad_norm": 0.5613709035035684, + "learning_rate": 9.255902922855326e-06, + "loss": 0.0512, + "step": 1934 + }, + { + "epoch": 0.8803457688808007, + "grad_norm": 0.6266000159117329, + "learning_rate": 9.255152550161753e-06, + "loss": 0.0714, + "step": 1935 + }, + { + "epoch": 0.8808007279344859, + "grad_norm": 0.5624931761265524, + "learning_rate": 9.25440182975633e-06, + "loss": 0.0667, + "step": 1936 + }, + { + "epoch": 0.881255686988171, + "grad_norm": 0.8855653361345076, + "learning_rate": 9.253650761700401e-06, + "loss": 0.1104, + "step": 1937 + }, + { + "epoch": 0.8817106460418562, + "grad_norm": 0.4051324158485566, + "learning_rate": 9.252899346055343e-06, + "loss": 0.0447, + "step": 1938 + }, + { + "epoch": 0.8821656050955414, + "grad_norm": 0.6705030425420828, + "learning_rate": 9.252147582882556e-06, + "loss": 0.08, + "step": 1939 + }, + { + "epoch": 0.8826205641492265, + "grad_norm": 0.745395756906896, + "learning_rate": 9.25139547224347e-06, + "loss": 0.0892, + "step": 1940 + }, + { + "epoch": 0.8830755232029117, + "grad_norm": 0.9577657000178205, + "learning_rate": 9.250643014199547e-06, + "loss": 0.1144, + "step": 1941 + }, + { + "epoch": 0.8835304822565969, + "grad_norm": 0.6774410545148242, + "learning_rate": 9.24989020881227e-06, + "loss": 0.0753, + "step": 1942 + }, + { + "epoch": 0.8839854413102821, + "grad_norm": 0.7409774305157982, + "learning_rate": 9.249137056143159e-06, + "loss": 0.0722, + "step": 1943 + }, + { + "epoch": 0.8844404003639672, + "grad_norm": 0.6042335346844097, + "learning_rate": 9.248383556253758e-06, + "loss": 0.0775, + "step": 1944 + }, + { + "epoch": 0.8848953594176524, + "grad_norm": 0.8396643903072698, + "learning_rate": 9.247629709205635e-06, + "loss": 0.1051, + "step": 1945 + }, + { + "epoch": 0.8853503184713376, + "grad_norm": 0.6590167845553623, + "learning_rate": 9.246875515060396e-06, + "loss": 0.0774, + "step": 1946 + }, + { + "epoch": 0.8858052775250227, + "grad_norm": 0.5876827286169646, + "learning_rate": 9.24612097387967e-06, + "loss": 0.0768, + "step": 1947 + }, + { + "epoch": 0.8862602365787079, + "grad_norm": 0.8894868784932225, + "learning_rate": 9.245366085725111e-06, + "loss": 0.0983, + "step": 1948 + }, + { + "epoch": 0.8867151956323931, + "grad_norm": 0.5389319757607208, + "learning_rate": 9.24461085065841e-06, + "loss": 0.0571, + "step": 1949 + }, + { + "epoch": 0.8871701546860783, + "grad_norm": 0.4677621224916707, + "learning_rate": 9.243855268741275e-06, + "loss": 0.0534, + "step": 1950 + }, + { + "epoch": 0.8876251137397634, + "grad_norm": 0.6166575793819061, + "learning_rate": 9.243099340035454e-06, + "loss": 0.0679, + "step": 1951 + }, + { + "epoch": 0.8880800727934486, + "grad_norm": 0.684219803564928, + "learning_rate": 9.242343064602719e-06, + "loss": 0.0797, + "step": 1952 + }, + { + "epoch": 0.8885350318471338, + "grad_norm": 0.6543060915410528, + "learning_rate": 9.241586442504865e-06, + "loss": 0.0876, + "step": 1953 + }, + { + "epoch": 0.8889899909008189, + "grad_norm": 0.6916358607655352, + "learning_rate": 9.240829473803723e-06, + "loss": 0.0816, + "step": 1954 + }, + { + "epoch": 0.8894449499545041, + "grad_norm": 0.6650683160408256, + "learning_rate": 9.240072158561146e-06, + "loss": 0.0851, + "step": 1955 + }, + { + "epoch": 0.8898999090081893, + "grad_norm": 0.8336397769475173, + "learning_rate": 9.239314496839022e-06, + "loss": 0.1075, + "step": 1956 + }, + { + "epoch": 0.8903548680618745, + "grad_norm": 0.6498784190415388, + "learning_rate": 9.23855648869926e-06, + "loss": 0.0748, + "step": 1957 + }, + { + "epoch": 0.8908098271155596, + "grad_norm": 0.7894795440995916, + "learning_rate": 9.237798134203803e-06, + "loss": 0.1045, + "step": 1958 + }, + { + "epoch": 0.8912647861692448, + "grad_norm": 0.5980997509859944, + "learning_rate": 9.237039433414623e-06, + "loss": 0.079, + "step": 1959 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 0.8222326498301533, + "learning_rate": 9.236280386393712e-06, + "loss": 0.082, + "step": 1960 + }, + { + "epoch": 0.892174704276615, + "grad_norm": 0.6293204676003961, + "learning_rate": 9.2355209932031e-06, + "loss": 0.0741, + "step": 1961 + }, + { + "epoch": 0.8926296633303002, + "grad_norm": 0.47863668175134233, + "learning_rate": 9.23476125390484e-06, + "loss": 0.0524, + "step": 1962 + }, + { + "epoch": 0.8930846223839854, + "grad_norm": 0.7798093326874596, + "learning_rate": 9.234001168561013e-06, + "loss": 0.0691, + "step": 1963 + }, + { + "epoch": 0.8935395814376706, + "grad_norm": 0.7301612531501247, + "learning_rate": 9.233240737233733e-06, + "loss": 0.0965, + "step": 1964 + }, + { + "epoch": 0.8939945404913557, + "grad_norm": 1.0452984923884894, + "learning_rate": 9.232479959985136e-06, + "loss": 0.1293, + "step": 1965 + }, + { + "epoch": 0.8944494995450409, + "grad_norm": 0.6963389022030017, + "learning_rate": 9.23171883687739e-06, + "loss": 0.0767, + "step": 1966 + }, + { + "epoch": 0.8949044585987261, + "grad_norm": 0.45171069390219404, + "learning_rate": 9.23095736797269e-06, + "loss": 0.0522, + "step": 1967 + }, + { + "epoch": 0.8953594176524113, + "grad_norm": 1.0061313103020273, + "learning_rate": 9.230195553333263e-06, + "loss": 0.1277, + "step": 1968 + }, + { + "epoch": 0.8958143767060964, + "grad_norm": 1.5986138982364897, + "learning_rate": 9.229433393021358e-06, + "loss": 0.1405, + "step": 1969 + }, + { + "epoch": 0.8962693357597816, + "grad_norm": 0.6908357505139043, + "learning_rate": 9.228670887099256e-06, + "loss": 0.0739, + "step": 1970 + }, + { + "epoch": 0.8967242948134668, + "grad_norm": 0.5277345258701365, + "learning_rate": 9.227908035629266e-06, + "loss": 0.0526, + "step": 1971 + }, + { + "epoch": 0.8971792538671519, + "grad_norm": 0.6285224648148875, + "learning_rate": 9.227144838673724e-06, + "loss": 0.0706, + "step": 1972 + }, + { + "epoch": 0.8976342129208371, + "grad_norm": 0.949308919855668, + "learning_rate": 9.226381296294995e-06, + "loss": 0.1045, + "step": 1973 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 0.752138900094858, + "learning_rate": 9.225617408555471e-06, + "loss": 0.0907, + "step": 1974 + }, + { + "epoch": 0.8985441310282075, + "grad_norm": 0.9650799951574368, + "learning_rate": 9.224853175517578e-06, + "loss": 0.1261, + "step": 1975 + }, + { + "epoch": 0.8989990900818926, + "grad_norm": 0.6368811817284902, + "learning_rate": 9.224088597243762e-06, + "loss": 0.0759, + "step": 1976 + }, + { + "epoch": 0.8994540491355778, + "grad_norm": 0.7403608884362824, + "learning_rate": 9.223323673796503e-06, + "loss": 0.081, + "step": 1977 + }, + { + "epoch": 0.899909008189263, + "grad_norm": 0.8033696439311833, + "learning_rate": 9.222558405238303e-06, + "loss": 0.0968, + "step": 1978 + }, + { + "epoch": 0.9003639672429481, + "grad_norm": 0.7306511821068437, + "learning_rate": 9.2217927916317e-06, + "loss": 0.0916, + "step": 1979 + }, + { + "epoch": 0.9008189262966333, + "grad_norm": 0.8380967239417318, + "learning_rate": 9.221026833039256e-06, + "loss": 0.0945, + "step": 1980 + }, + { + "epoch": 0.9012738853503185, + "grad_norm": 0.7718744506924977, + "learning_rate": 9.220260529523561e-06, + "loss": 0.0918, + "step": 1981 + }, + { + "epoch": 0.9017288444040037, + "grad_norm": 0.7393925382776323, + "learning_rate": 9.219493881147234e-06, + "loss": 0.0816, + "step": 1982 + }, + { + "epoch": 0.9021838034576888, + "grad_norm": 0.7687427983757074, + "learning_rate": 9.218726887972923e-06, + "loss": 0.0835, + "step": 1983 + }, + { + "epoch": 0.902638762511374, + "grad_norm": 0.6785077320109779, + "learning_rate": 9.2179595500633e-06, + "loss": 0.0799, + "step": 1984 + }, + { + "epoch": 0.9030937215650592, + "grad_norm": 0.9172539926736025, + "learning_rate": 9.217191867481072e-06, + "loss": 0.1147, + "step": 1985 + }, + { + "epoch": 0.9035486806187443, + "grad_norm": 0.9222679238503178, + "learning_rate": 9.21642384028897e-06, + "loss": 0.127, + "step": 1986 + }, + { + "epoch": 0.9040036396724295, + "grad_norm": 0.8844523810912496, + "learning_rate": 9.215655468549752e-06, + "loss": 0.1013, + "step": 1987 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 0.5874811797706115, + "learning_rate": 9.214886752326208e-06, + "loss": 0.0528, + "step": 1988 + }, + { + "epoch": 0.9049135577797999, + "grad_norm": 0.6774186522730414, + "learning_rate": 9.214117691681152e-06, + "loss": 0.0749, + "step": 1989 + }, + { + "epoch": 0.905368516833485, + "grad_norm": 0.46678264083336873, + "learning_rate": 9.213348286677429e-06, + "loss": 0.0502, + "step": 1990 + }, + { + "epoch": 0.9058234758871702, + "grad_norm": 0.6369505909634797, + "learning_rate": 9.21257853737791e-06, + "loss": 0.0597, + "step": 1991 + }, + { + "epoch": 0.9062784349408554, + "grad_norm": 0.7872482528902512, + "learning_rate": 9.211808443845499e-06, + "loss": 0.0842, + "step": 1992 + }, + { + "epoch": 0.9067333939945404, + "grad_norm": 0.6991340678786092, + "learning_rate": 9.211038006143121e-06, + "loss": 0.0714, + "step": 1993 + }, + { + "epoch": 0.9071883530482256, + "grad_norm": 0.5842126029431552, + "learning_rate": 9.210267224333735e-06, + "loss": 0.0686, + "step": 1994 + }, + { + "epoch": 0.9076433121019108, + "grad_norm": 0.6405241386542652, + "learning_rate": 9.209496098480324e-06, + "loss": 0.0843, + "step": 1995 + }, + { + "epoch": 0.908098271155596, + "grad_norm": 0.6431855863004138, + "learning_rate": 9.208724628645901e-06, + "loss": 0.0781, + "step": 1996 + }, + { + "epoch": 0.9085532302092811, + "grad_norm": 0.6571372788631167, + "learning_rate": 9.207952814893511e-06, + "loss": 0.0746, + "step": 1997 + }, + { + "epoch": 0.9090081892629663, + "grad_norm": 0.6228847041781231, + "learning_rate": 9.207180657286216e-06, + "loss": 0.0563, + "step": 1998 + }, + { + "epoch": 0.9094631483166515, + "grad_norm": 0.6649592874484661, + "learning_rate": 9.20640815588712e-06, + "loss": 0.0737, + "step": 1999 + }, + { + "epoch": 0.9099181073703366, + "grad_norm": 0.6395827893566276, + "learning_rate": 9.205635310759344e-06, + "loss": 0.0864, + "step": 2000 + }, + { + "epoch": 0.9103730664240218, + "grad_norm": 0.6470816609318947, + "learning_rate": 9.204862121966044e-06, + "loss": 0.0819, + "step": 2001 + }, + { + "epoch": 0.910828025477707, + "grad_norm": 0.6954176357821441, + "learning_rate": 9.2040885895704e-06, + "loss": 0.0935, + "step": 2002 + }, + { + "epoch": 0.9112829845313922, + "grad_norm": 0.5250024400720148, + "learning_rate": 9.203314713635621e-06, + "loss": 0.0521, + "step": 2003 + }, + { + "epoch": 0.9117379435850773, + "grad_norm": 0.6765818316745539, + "learning_rate": 9.202540494224946e-06, + "loss": 0.1078, + "step": 2004 + }, + { + "epoch": 0.9121929026387625, + "grad_norm": 0.7602463030942905, + "learning_rate": 9.20176593140164e-06, + "loss": 0.068, + "step": 2005 + }, + { + "epoch": 0.9126478616924477, + "grad_norm": 0.4564764883431911, + "learning_rate": 9.200991025228998e-06, + "loss": 0.0576, + "step": 2006 + }, + { + "epoch": 0.9131028207461328, + "grad_norm": 0.87338946860691, + "learning_rate": 9.20021577577034e-06, + "loss": 0.1155, + "step": 2007 + }, + { + "epoch": 0.913557779799818, + "grad_norm": 0.67443699378812, + "learning_rate": 9.199440183089019e-06, + "loss": 0.0803, + "step": 2008 + }, + { + "epoch": 0.9140127388535032, + "grad_norm": 0.697779741574365, + "learning_rate": 9.198664247248408e-06, + "loss": 0.0886, + "step": 2009 + }, + { + "epoch": 0.9144676979071884, + "grad_norm": 0.6888292123310293, + "learning_rate": 9.197887968311917e-06, + "loss": 0.088, + "step": 2010 + }, + { + "epoch": 0.9149226569608735, + "grad_norm": 0.593887211300783, + "learning_rate": 9.197111346342979e-06, + "loss": 0.0597, + "step": 2011 + }, + { + "epoch": 0.9153776160145587, + "grad_norm": 0.5222048906208826, + "learning_rate": 9.196334381405055e-06, + "loss": 0.055, + "step": 2012 + }, + { + "epoch": 0.9158325750682439, + "grad_norm": 0.7406902681131339, + "learning_rate": 9.195557073561636e-06, + "loss": 0.0725, + "step": 2013 + }, + { + "epoch": 0.916287534121929, + "grad_norm": 0.7369752030698005, + "learning_rate": 9.194779422876242e-06, + "loss": 0.0725, + "step": 2014 + }, + { + "epoch": 0.9167424931756142, + "grad_norm": 0.5674786098045346, + "learning_rate": 9.194001429412414e-06, + "loss": 0.0528, + "step": 2015 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 0.9561188233992612, + "learning_rate": 9.19322309323373e-06, + "loss": 0.1213, + "step": 2016 + }, + { + "epoch": 0.9176524112829846, + "grad_norm": 0.7666480467189352, + "learning_rate": 9.192444414403792e-06, + "loss": 0.0788, + "step": 2017 + }, + { + "epoch": 0.9181073703366697, + "grad_norm": 1.0242939804657472, + "learning_rate": 9.19166539298623e-06, + "loss": 0.1341, + "step": 2018 + }, + { + "epoch": 0.9185623293903549, + "grad_norm": 0.6407407288510717, + "learning_rate": 9.1908860290447e-06, + "loss": 0.0702, + "step": 2019 + }, + { + "epoch": 0.9190172884440401, + "grad_norm": 0.9262978099585683, + "learning_rate": 9.190106322642888e-06, + "loss": 0.0962, + "step": 2020 + }, + { + "epoch": 0.9194722474977252, + "grad_norm": 0.6371294810639554, + "learning_rate": 9.189326273844512e-06, + "loss": 0.0716, + "step": 2021 + }, + { + "epoch": 0.9199272065514104, + "grad_norm": 0.616042736799084, + "learning_rate": 9.18854588271331e-06, + "loss": 0.0697, + "step": 2022 + }, + { + "epoch": 0.9203821656050956, + "grad_norm": 0.8652881040430276, + "learning_rate": 9.187765149313057e-06, + "loss": 0.0949, + "step": 2023 + }, + { + "epoch": 0.9208371246587808, + "grad_norm": 0.7171212404467417, + "learning_rate": 9.186984073707545e-06, + "loss": 0.0685, + "step": 2024 + }, + { + "epoch": 0.9212920837124658, + "grad_norm": 0.6434040420425213, + "learning_rate": 9.186202655960603e-06, + "loss": 0.0774, + "step": 2025 + }, + { + "epoch": 0.921747042766151, + "grad_norm": 0.6537324523008204, + "learning_rate": 9.185420896136086e-06, + "loss": 0.0786, + "step": 2026 + }, + { + "epoch": 0.9222020018198362, + "grad_norm": 0.6271186642997567, + "learning_rate": 9.184638794297873e-06, + "loss": 0.0636, + "step": 2027 + }, + { + "epoch": 0.9226569608735213, + "grad_norm": 0.7041069370791754, + "learning_rate": 9.183856350509877e-06, + "loss": 0.0809, + "step": 2028 + }, + { + "epoch": 0.9231119199272065, + "grad_norm": 0.8781019574614535, + "learning_rate": 9.183073564836033e-06, + "loss": 0.1051, + "step": 2029 + }, + { + "epoch": 0.9235668789808917, + "grad_norm": 0.48818413319632054, + "learning_rate": 9.182290437340308e-06, + "loss": 0.0474, + "step": 2030 + }, + { + "epoch": 0.9240218380345769, + "grad_norm": 0.8775797840737246, + "learning_rate": 9.181506968086696e-06, + "loss": 0.0949, + "step": 2031 + }, + { + "epoch": 0.924476797088262, + "grad_norm": 0.958612912496998, + "learning_rate": 9.180723157139218e-06, + "loss": 0.121, + "step": 2032 + }, + { + "epoch": 0.9249317561419472, + "grad_norm": 0.6245762602830833, + "learning_rate": 9.179939004561925e-06, + "loss": 0.0655, + "step": 2033 + }, + { + "epoch": 0.9253867151956324, + "grad_norm": 0.5017046465493271, + "learning_rate": 9.17915451041889e-06, + "loss": 0.0661, + "step": 2034 + }, + { + "epoch": 0.9258416742493175, + "grad_norm": 0.710064858137144, + "learning_rate": 9.178369674774224e-06, + "loss": 0.0791, + "step": 2035 + }, + { + "epoch": 0.9262966333030027, + "grad_norm": 0.587851189554333, + "learning_rate": 9.177584497692056e-06, + "loss": 0.0637, + "step": 2036 + }, + { + "epoch": 0.9267515923566879, + "grad_norm": 1.3023478543600886, + "learning_rate": 9.176798979236548e-06, + "loss": 0.1095, + "step": 2037 + }, + { + "epoch": 0.9272065514103731, + "grad_norm": 0.540716658575828, + "learning_rate": 9.17601311947189e-06, + "loss": 0.0693, + "step": 2038 + }, + { + "epoch": 0.9276615104640582, + "grad_norm": 0.6208372361565256, + "learning_rate": 9.175226918462298e-06, + "loss": 0.0718, + "step": 2039 + }, + { + "epoch": 0.9281164695177434, + "grad_norm": 0.7701609774864682, + "learning_rate": 9.174440376272021e-06, + "loss": 0.0976, + "step": 2040 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.7010494768516853, + "learning_rate": 9.173653492965325e-06, + "loss": 0.0993, + "step": 2041 + }, + { + "epoch": 0.9290263876251137, + "grad_norm": 0.6373763175184742, + "learning_rate": 9.172866268606514e-06, + "loss": 0.0724, + "step": 2042 + }, + { + "epoch": 0.9294813466787989, + "grad_norm": 0.701200286737339, + "learning_rate": 9.172078703259917e-06, + "loss": 0.0825, + "step": 2043 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 0.4368340952860916, + "learning_rate": 9.171290796989887e-06, + "loss": 0.0477, + "step": 2044 + }, + { + "epoch": 0.9303912647861693, + "grad_norm": 0.6370651977402901, + "learning_rate": 9.170502549860813e-06, + "loss": 0.0796, + "step": 2045 + }, + { + "epoch": 0.9308462238398544, + "grad_norm": 1.1692149001382897, + "learning_rate": 9.169713961937104e-06, + "loss": 0.122, + "step": 2046 + }, + { + "epoch": 0.9313011828935396, + "grad_norm": 0.694595823352437, + "learning_rate": 9.168925033283199e-06, + "loss": 0.0935, + "step": 2047 + }, + { + "epoch": 0.9317561419472248, + "grad_norm": 0.672175800896758, + "learning_rate": 9.168135763963567e-06, + "loss": 0.0763, + "step": 2048 + }, + { + "epoch": 0.9322111010009099, + "grad_norm": 0.5254037194744254, + "learning_rate": 9.167346154042705e-06, + "loss": 0.0535, + "step": 2049 + }, + { + "epoch": 0.9326660600545951, + "grad_norm": 0.6788074343357934, + "learning_rate": 9.166556203585134e-06, + "loss": 0.0804, + "step": 2050 + }, + { + "epoch": 0.9331210191082803, + "grad_norm": 0.6950456412782345, + "learning_rate": 9.165765912655407e-06, + "loss": 0.0727, + "step": 2051 + }, + { + "epoch": 0.9335759781619655, + "grad_norm": 0.8037111447772672, + "learning_rate": 9.1649752813181e-06, + "loss": 0.0811, + "step": 2052 + }, + { + "epoch": 0.9340309372156506, + "grad_norm": 0.6043473581913603, + "learning_rate": 9.164184309637824e-06, + "loss": 0.0773, + "step": 2053 + }, + { + "epoch": 0.9344858962693358, + "grad_norm": 0.6914300193057683, + "learning_rate": 9.16339299767921e-06, + "loss": 0.0888, + "step": 2054 + }, + { + "epoch": 0.934940855323021, + "grad_norm": 0.5973299516809696, + "learning_rate": 9.162601345506923e-06, + "loss": 0.0771, + "step": 2055 + }, + { + "epoch": 0.935395814376706, + "grad_norm": 0.5667027927032561, + "learning_rate": 9.161809353185651e-06, + "loss": 0.0589, + "step": 2056 + }, + { + "epoch": 0.9358507734303912, + "grad_norm": 0.5892355686848351, + "learning_rate": 9.161017020780114e-06, + "loss": 0.0562, + "step": 2057 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 0.8503563061945567, + "learning_rate": 9.160224348355057e-06, + "loss": 0.1075, + "step": 2058 + }, + { + "epoch": 0.9367606915377616, + "grad_norm": 0.8030569297687169, + "learning_rate": 9.159431335975255e-06, + "loss": 0.0651, + "step": 2059 + }, + { + "epoch": 0.9372156505914467, + "grad_norm": 0.6182029602806504, + "learning_rate": 9.158637983705505e-06, + "loss": 0.0908, + "step": 2060 + }, + { + "epoch": 0.9376706096451319, + "grad_norm": 0.6167088007283392, + "learning_rate": 9.157844291610641e-06, + "loss": 0.0719, + "step": 2061 + }, + { + "epoch": 0.9381255686988171, + "grad_norm": 1.0378949375185438, + "learning_rate": 9.157050259755519e-06, + "loss": 0.0925, + "step": 2062 + }, + { + "epoch": 0.9385805277525022, + "grad_norm": 0.6009053311569907, + "learning_rate": 9.156255888205021e-06, + "loss": 0.0868, + "step": 2063 + }, + { + "epoch": 0.9390354868061874, + "grad_norm": 0.6730461926983252, + "learning_rate": 9.155461177024062e-06, + "loss": 0.0791, + "step": 2064 + }, + { + "epoch": 0.9394904458598726, + "grad_norm": 0.8310142050561945, + "learning_rate": 9.154666126277582e-06, + "loss": 0.0882, + "step": 2065 + }, + { + "epoch": 0.9399454049135578, + "grad_norm": 0.5455153208822874, + "learning_rate": 9.153870736030549e-06, + "loss": 0.0651, + "step": 2066 + }, + { + "epoch": 0.9404003639672429, + "grad_norm": 0.8245922923142007, + "learning_rate": 9.153075006347957e-06, + "loss": 0.1357, + "step": 2067 + }, + { + "epoch": 0.9408553230209281, + "grad_norm": 0.7891736693746195, + "learning_rate": 9.15227893729483e-06, + "loss": 0.0879, + "step": 2068 + }, + { + "epoch": 0.9413102820746133, + "grad_norm": 0.6032022964433661, + "learning_rate": 9.151482528936222e-06, + "loss": 0.0594, + "step": 2069 + }, + { + "epoch": 0.9417652411282984, + "grad_norm": 0.8087071917107507, + "learning_rate": 9.150685781337207e-06, + "loss": 0.0872, + "step": 2070 + }, + { + "epoch": 0.9422202001819836, + "grad_norm": 1.1875700013397057, + "learning_rate": 9.149888694562896e-06, + "loss": 0.1447, + "step": 2071 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 0.7351727785498874, + "learning_rate": 9.149091268678423e-06, + "loss": 0.0708, + "step": 2072 + }, + { + "epoch": 0.943130118289354, + "grad_norm": 0.6792286796417435, + "learning_rate": 9.148293503748947e-06, + "loss": 0.0876, + "step": 2073 + }, + { + "epoch": 0.9435850773430391, + "grad_norm": 0.7417762096300724, + "learning_rate": 9.14749539983966e-06, + "loss": 0.0852, + "step": 2074 + }, + { + "epoch": 0.9440400363967243, + "grad_norm": 0.5155173170030183, + "learning_rate": 9.146696957015777e-06, + "loss": 0.0606, + "step": 2075 + }, + { + "epoch": 0.9444949954504095, + "grad_norm": 1.1023064832096257, + "learning_rate": 9.145898175342545e-06, + "loss": 0.1488, + "step": 2076 + }, + { + "epoch": 0.9449499545040946, + "grad_norm": 0.6914694719967308, + "learning_rate": 9.145099054885238e-06, + "loss": 0.0816, + "step": 2077 + }, + { + "epoch": 0.9454049135577798, + "grad_norm": 0.6905933706764309, + "learning_rate": 9.144299595709156e-06, + "loss": 0.0876, + "step": 2078 + }, + { + "epoch": 0.945859872611465, + "grad_norm": 0.5233906895741112, + "learning_rate": 9.143499797879626e-06, + "loss": 0.0562, + "step": 2079 + }, + { + "epoch": 0.9463148316651502, + "grad_norm": 0.5101515836442003, + "learning_rate": 9.142699661462005e-06, + "loss": 0.0559, + "step": 2080 + }, + { + "epoch": 0.9467697907188353, + "grad_norm": 0.48017157157527135, + "learning_rate": 9.141899186521675e-06, + "loss": 0.0503, + "step": 2081 + }, + { + "epoch": 0.9472247497725205, + "grad_norm": 0.6592673728640894, + "learning_rate": 9.141098373124048e-06, + "loss": 0.0797, + "step": 2082 + }, + { + "epoch": 0.9476797088262057, + "grad_norm": 0.86432014477488, + "learning_rate": 9.140297221334562e-06, + "loss": 0.0858, + "step": 2083 + }, + { + "epoch": 0.9481346678798908, + "grad_norm": 1.0397141319559977, + "learning_rate": 9.139495731218685e-06, + "loss": 0.1198, + "step": 2084 + }, + { + "epoch": 0.948589626933576, + "grad_norm": 0.862052866017664, + "learning_rate": 9.138693902841914e-06, + "loss": 0.1056, + "step": 2085 + }, + { + "epoch": 0.9490445859872612, + "grad_norm": 0.7709077621401632, + "learning_rate": 9.137891736269764e-06, + "loss": 0.0918, + "step": 2086 + }, + { + "epoch": 0.9494995450409464, + "grad_norm": 0.8691294728765458, + "learning_rate": 9.137089231567789e-06, + "loss": 0.0925, + "step": 2087 + }, + { + "epoch": 0.9499545040946314, + "grad_norm": 0.6098999809715144, + "learning_rate": 9.136286388801564e-06, + "loss": 0.0673, + "step": 2088 + }, + { + "epoch": 0.9504094631483166, + "grad_norm": 0.7157788293123913, + "learning_rate": 9.135483208036695e-06, + "loss": 0.0802, + "step": 2089 + }, + { + "epoch": 0.9508644222020018, + "grad_norm": 0.9397853662008804, + "learning_rate": 9.134679689338814e-06, + "loss": 0.1021, + "step": 2090 + }, + { + "epoch": 0.9513193812556869, + "grad_norm": 0.5449934450219076, + "learning_rate": 9.133875832773582e-06, + "loss": 0.0698, + "step": 2091 + }, + { + "epoch": 0.9517743403093721, + "grad_norm": 0.5678662789014983, + "learning_rate": 9.133071638406684e-06, + "loss": 0.0726, + "step": 2092 + }, + { + "epoch": 0.9522292993630573, + "grad_norm": 0.704718355722168, + "learning_rate": 9.132267106303836e-06, + "loss": 0.0949, + "step": 2093 + }, + { + "epoch": 0.9526842584167425, + "grad_norm": 0.7119333629649424, + "learning_rate": 9.131462236530784e-06, + "loss": 0.0815, + "step": 2094 + }, + { + "epoch": 0.9531392174704276, + "grad_norm": 0.9543831010874976, + "learning_rate": 9.130657029153293e-06, + "loss": 0.1037, + "step": 2095 + }, + { + "epoch": 0.9535941765241128, + "grad_norm": 0.4141088945678519, + "learning_rate": 9.129851484237165e-06, + "loss": 0.0438, + "step": 2096 + }, + { + "epoch": 0.954049135577798, + "grad_norm": 0.880955172212152, + "learning_rate": 9.129045601848222e-06, + "loss": 0.1139, + "step": 2097 + }, + { + "epoch": 0.9545040946314831, + "grad_norm": 0.5340666725025275, + "learning_rate": 9.12823938205232e-06, + "loss": 0.0662, + "step": 2098 + }, + { + "epoch": 0.9549590536851683, + "grad_norm": 0.7598809630255295, + "learning_rate": 9.127432824915339e-06, + "loss": 0.086, + "step": 2099 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 0.5889551801250265, + "learning_rate": 9.126625930503187e-06, + "loss": 0.0618, + "step": 2100 + }, + { + "epoch": 0.9558689717925387, + "grad_norm": 0.7452095277301981, + "learning_rate": 9.125818698881798e-06, + "loss": 0.0846, + "step": 2101 + }, + { + "epoch": 0.9563239308462238, + "grad_norm": 0.874570701264544, + "learning_rate": 9.125011130117139e-06, + "loss": 0.0711, + "step": 2102 + }, + { + "epoch": 0.956778889899909, + "grad_norm": 0.6700889468480424, + "learning_rate": 9.124203224275198e-06, + "loss": 0.0771, + "step": 2103 + }, + { + "epoch": 0.9572338489535942, + "grad_norm": 0.5713697589917575, + "learning_rate": 9.123394981421995e-06, + "loss": 0.0647, + "step": 2104 + }, + { + "epoch": 0.9576888080072793, + "grad_norm": 0.7416406361243658, + "learning_rate": 9.122586401623574e-06, + "loss": 0.0797, + "step": 2105 + }, + { + "epoch": 0.9581437670609645, + "grad_norm": 0.8792771411195691, + "learning_rate": 9.12177748494601e-06, + "loss": 0.1043, + "step": 2106 + }, + { + "epoch": 0.9585987261146497, + "grad_norm": 0.8409261244287831, + "learning_rate": 9.120968231455406e-06, + "loss": 0.0968, + "step": 2107 + }, + { + "epoch": 0.9590536851683349, + "grad_norm": 0.588499824544961, + "learning_rate": 9.120158641217885e-06, + "loss": 0.0675, + "step": 2108 + }, + { + "epoch": 0.95950864422202, + "grad_norm": 0.5664840104040384, + "learning_rate": 9.119348714299607e-06, + "loss": 0.0721, + "step": 2109 + }, + { + "epoch": 0.9599636032757052, + "grad_norm": 0.7544363313105896, + "learning_rate": 9.118538450766755e-06, + "loss": 0.0723, + "step": 2110 + }, + { + "epoch": 0.9604185623293904, + "grad_norm": 0.6699256182505398, + "learning_rate": 9.117727850685541e-06, + "loss": 0.0669, + "step": 2111 + }, + { + "epoch": 0.9608735213830755, + "grad_norm": 0.5711605071447146, + "learning_rate": 9.116916914122202e-06, + "loss": 0.0637, + "step": 2112 + }, + { + "epoch": 0.9613284804367607, + "grad_norm": 0.6965803730129388, + "learning_rate": 9.116105641143005e-06, + "loss": 0.0744, + "step": 2113 + }, + { + "epoch": 0.9617834394904459, + "grad_norm": 0.8598026014818454, + "learning_rate": 9.115294031814242e-06, + "loss": 0.0937, + "step": 2114 + }, + { + "epoch": 0.9622383985441311, + "grad_norm": 0.5794082624701737, + "learning_rate": 9.114482086202236e-06, + "loss": 0.0675, + "step": 2115 + }, + { + "epoch": 0.9626933575978162, + "grad_norm": 0.7600807206599288, + "learning_rate": 9.113669804373335e-06, + "loss": 0.1047, + "step": 2116 + }, + { + "epoch": 0.9631483166515014, + "grad_norm": 0.6377342056356247, + "learning_rate": 9.112857186393913e-06, + "loss": 0.0676, + "step": 2117 + }, + { + "epoch": 0.9636032757051866, + "grad_norm": 1.1042469320816768, + "learning_rate": 9.112044232330377e-06, + "loss": 0.1508, + "step": 2118 + }, + { + "epoch": 0.9640582347588716, + "grad_norm": 0.817690744261235, + "learning_rate": 9.111230942249156e-06, + "loss": 0.0904, + "step": 2119 + }, + { + "epoch": 0.9645131938125568, + "grad_norm": 0.7037231293816442, + "learning_rate": 9.110417316216708e-06, + "loss": 0.0636, + "step": 2120 + }, + { + "epoch": 0.964968152866242, + "grad_norm": 0.6588945759110881, + "learning_rate": 9.10960335429952e-06, + "loss": 0.0684, + "step": 2121 + }, + { + "epoch": 0.9654231119199272, + "grad_norm": 0.6220308381200076, + "learning_rate": 9.108789056564105e-06, + "loss": 0.0877, + "step": 2122 + }, + { + "epoch": 0.9658780709736123, + "grad_norm": 0.6262721502493606, + "learning_rate": 9.107974423077001e-06, + "loss": 0.0642, + "step": 2123 + }, + { + "epoch": 0.9663330300272975, + "grad_norm": 0.9510165739511419, + "learning_rate": 9.107159453904781e-06, + "loss": 0.0994, + "step": 2124 + }, + { + "epoch": 0.9667879890809827, + "grad_norm": 0.7410601791583596, + "learning_rate": 9.10634414911404e-06, + "loss": 0.0751, + "step": 2125 + }, + { + "epoch": 0.9672429481346679, + "grad_norm": 0.592927363864185, + "learning_rate": 9.105528508771395e-06, + "loss": 0.0785, + "step": 2126 + }, + { + "epoch": 0.967697907188353, + "grad_norm": 0.704125884709214, + "learning_rate": 9.104712532943502e-06, + "loss": 0.0672, + "step": 2127 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 0.6763649668606744, + "learning_rate": 9.10389622169704e-06, + "loss": 0.0813, + "step": 2128 + }, + { + "epoch": 0.9686078252957234, + "grad_norm": 1.0481681916194059, + "learning_rate": 9.103079575098708e-06, + "loss": 0.1165, + "step": 2129 + }, + { + "epoch": 0.9690627843494085, + "grad_norm": 0.6244343397167454, + "learning_rate": 9.102262593215246e-06, + "loss": 0.0548, + "step": 2130 + }, + { + "epoch": 0.9695177434030937, + "grad_norm": 0.6662772517701377, + "learning_rate": 9.101445276113407e-06, + "loss": 0.0672, + "step": 2131 + }, + { + "epoch": 0.9699727024567789, + "grad_norm": 0.7302079833291476, + "learning_rate": 9.100627623859985e-06, + "loss": 0.0747, + "step": 2132 + }, + { + "epoch": 0.9704276615104641, + "grad_norm": 0.7003598456468986, + "learning_rate": 9.09980963652179e-06, + "loss": 0.0763, + "step": 2133 + }, + { + "epoch": 0.9708826205641492, + "grad_norm": 0.8675523177046712, + "learning_rate": 9.098991314165668e-06, + "loss": 0.1123, + "step": 2134 + }, + { + "epoch": 0.9713375796178344, + "grad_norm": 0.6531391716615499, + "learning_rate": 9.098172656858484e-06, + "loss": 0.0626, + "step": 2135 + }, + { + "epoch": 0.9717925386715196, + "grad_norm": 0.8230462520119928, + "learning_rate": 9.097353664667138e-06, + "loss": 0.0873, + "step": 2136 + }, + { + "epoch": 0.9722474977252047, + "grad_norm": 0.6524897158303723, + "learning_rate": 9.096534337658558e-06, + "loss": 0.0658, + "step": 2137 + }, + { + "epoch": 0.9727024567788899, + "grad_norm": 0.7421742040769631, + "learning_rate": 9.095714675899688e-06, + "loss": 0.0782, + "step": 2138 + }, + { + "epoch": 0.9731574158325751, + "grad_norm": 0.6400011673563383, + "learning_rate": 9.094894679457511e-06, + "loss": 0.0605, + "step": 2139 + }, + { + "epoch": 0.9736123748862603, + "grad_norm": 0.5825220963314399, + "learning_rate": 9.094074348399034e-06, + "loss": 0.0711, + "step": 2140 + }, + { + "epoch": 0.9740673339399454, + "grad_norm": 0.9652267063711952, + "learning_rate": 9.09325368279129e-06, + "loss": 0.0996, + "step": 2141 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 0.9291202899333796, + "learning_rate": 9.09243268270134e-06, + "loss": 0.0818, + "step": 2142 + }, + { + "epoch": 0.9749772520473158, + "grad_norm": 0.8799622533298002, + "learning_rate": 9.091611348196272e-06, + "loss": 0.0904, + "step": 2143 + }, + { + "epoch": 0.9754322111010009, + "grad_norm": 0.8326816067428606, + "learning_rate": 9.090789679343201e-06, + "loss": 0.0931, + "step": 2144 + }, + { + "epoch": 0.9758871701546861, + "grad_norm": 0.783000579713321, + "learning_rate": 9.089967676209274e-06, + "loss": 0.0879, + "step": 2145 + }, + { + "epoch": 0.9763421292083713, + "grad_norm": 0.7001846964382422, + "learning_rate": 9.089145338861657e-06, + "loss": 0.0916, + "step": 2146 + }, + { + "epoch": 0.9767970882620565, + "grad_norm": 0.953946241556791, + "learning_rate": 9.08832266736755e-06, + "loss": 0.1205, + "step": 2147 + }, + { + "epoch": 0.9772520473157416, + "grad_norm": 0.7358151559070641, + "learning_rate": 9.087499661794177e-06, + "loss": 0.0915, + "step": 2148 + }, + { + "epoch": 0.9777070063694268, + "grad_norm": 0.8142291270830226, + "learning_rate": 9.08667632220879e-06, + "loss": 0.0995, + "step": 2149 + }, + { + "epoch": 0.978161965423112, + "grad_norm": 0.7106034630801776, + "learning_rate": 9.08585264867867e-06, + "loss": 0.0783, + "step": 2150 + }, + { + "epoch": 0.978616924476797, + "grad_norm": 0.826812478555379, + "learning_rate": 9.085028641271123e-06, + "loss": 0.1058, + "step": 2151 + }, + { + "epoch": 0.9790718835304822, + "grad_norm": 0.8960647231942128, + "learning_rate": 9.084204300053483e-06, + "loss": 0.108, + "step": 2152 + }, + { + "epoch": 0.9795268425841674, + "grad_norm": 0.7308955491972883, + "learning_rate": 9.083379625093111e-06, + "loss": 0.0963, + "step": 2153 + }, + { + "epoch": 0.9799818016378526, + "grad_norm": 0.854998609995297, + "learning_rate": 9.082554616457397e-06, + "loss": 0.1031, + "step": 2154 + }, + { + "epoch": 0.9804367606915377, + "grad_norm": 0.6134903423880519, + "learning_rate": 9.081729274213758e-06, + "loss": 0.0728, + "step": 2155 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 0.7494461465991118, + "learning_rate": 9.080903598429634e-06, + "loss": 0.0612, + "step": 2156 + }, + { + "epoch": 0.9813466787989081, + "grad_norm": 0.6477350071161301, + "learning_rate": 9.080077589172496e-06, + "loss": 0.0725, + "step": 2157 + }, + { + "epoch": 0.9818016378525932, + "grad_norm": 0.5949372826775987, + "learning_rate": 9.079251246509846e-06, + "loss": 0.0618, + "step": 2158 + }, + { + "epoch": 0.9822565969062784, + "grad_norm": 1.0457437129682037, + "learning_rate": 9.078424570509202e-06, + "loss": 0.134, + "step": 2159 + }, + { + "epoch": 0.9827115559599636, + "grad_norm": 0.7562918714504535, + "learning_rate": 9.077597561238123e-06, + "loss": 0.0746, + "step": 2160 + }, + { + "epoch": 0.9831665150136488, + "grad_norm": 0.705691881874251, + "learning_rate": 9.076770218764186e-06, + "loss": 0.0903, + "step": 2161 + }, + { + "epoch": 0.9836214740673339, + "grad_norm": 0.700571619924188, + "learning_rate": 9.075942543154996e-06, + "loss": 0.0905, + "step": 2162 + }, + { + "epoch": 0.9840764331210191, + "grad_norm": 0.5178609664739039, + "learning_rate": 9.075114534478187e-06, + "loss": 0.0623, + "step": 2163 + }, + { + "epoch": 0.9845313921747043, + "grad_norm": 0.5564063525132696, + "learning_rate": 9.074286192801423e-06, + "loss": 0.0622, + "step": 2164 + }, + { + "epoch": 0.9849863512283894, + "grad_norm": 0.8390150599738658, + "learning_rate": 9.07345751819239e-06, + "loss": 0.0894, + "step": 2165 + }, + { + "epoch": 0.9854413102820746, + "grad_norm": 0.6899304429749638, + "learning_rate": 9.072628510718804e-06, + "loss": 0.0715, + "step": 2166 + }, + { + "epoch": 0.9858962693357598, + "grad_norm": 0.7215157855324703, + "learning_rate": 9.071799170448409e-06, + "loss": 0.0767, + "step": 2167 + }, + { + "epoch": 0.986351228389445, + "grad_norm": 0.5513970488289187, + "learning_rate": 9.070969497448972e-06, + "loss": 0.0586, + "step": 2168 + }, + { + "epoch": 0.9868061874431301, + "grad_norm": 0.5126138943457034, + "learning_rate": 9.070139491788295e-06, + "loss": 0.0686, + "step": 2169 + }, + { + "epoch": 0.9872611464968153, + "grad_norm": 0.7021455623884609, + "learning_rate": 9.069309153534196e-06, + "loss": 0.0853, + "step": 2170 + }, + { + "epoch": 0.9877161055505005, + "grad_norm": 0.8937932838828458, + "learning_rate": 9.068478482754532e-06, + "loss": 0.1229, + "step": 2171 + }, + { + "epoch": 0.9881710646041856, + "grad_norm": 0.7580326063736847, + "learning_rate": 9.067647479517179e-06, + "loss": 0.1176, + "step": 2172 + }, + { + "epoch": 0.9886260236578708, + "grad_norm": 0.854693695415459, + "learning_rate": 9.066816143890042e-06, + "loss": 0.0624, + "step": 2173 + }, + { + "epoch": 0.989080982711556, + "grad_norm": 0.691622087221906, + "learning_rate": 9.065984475941056e-06, + "loss": 0.0821, + "step": 2174 + }, + { + "epoch": 0.9895359417652412, + "grad_norm": 0.5701976798754824, + "learning_rate": 9.065152475738182e-06, + "loss": 0.0525, + "step": 2175 + }, + { + "epoch": 0.9899909008189263, + "grad_norm": 0.5280985607821013, + "learning_rate": 9.064320143349405e-06, + "loss": 0.0532, + "step": 2176 + }, + { + "epoch": 0.9904458598726115, + "grad_norm": 0.7270073505569681, + "learning_rate": 9.063487478842738e-06, + "loss": 0.0729, + "step": 2177 + }, + { + "epoch": 0.9909008189262967, + "grad_norm": 0.5397573476737881, + "learning_rate": 9.062654482286228e-06, + "loss": 0.0546, + "step": 2178 + }, + { + "epoch": 0.9913557779799818, + "grad_norm": 0.8280519656078903, + "learning_rate": 9.061821153747938e-06, + "loss": 0.0794, + "step": 2179 + }, + { + "epoch": 0.991810737033667, + "grad_norm": 0.6367661759018886, + "learning_rate": 9.060987493295967e-06, + "loss": 0.0679, + "step": 2180 + }, + { + "epoch": 0.9922656960873522, + "grad_norm": 0.7859239736098618, + "learning_rate": 9.060153500998438e-06, + "loss": 0.0958, + "step": 2181 + }, + { + "epoch": 0.9927206551410374, + "grad_norm": 0.8770748630020422, + "learning_rate": 9.0593191769235e-06, + "loss": 0.1037, + "step": 2182 + }, + { + "epoch": 0.9931756141947224, + "grad_norm": 0.5493767625809909, + "learning_rate": 9.05848452113933e-06, + "loss": 0.0535, + "step": 2183 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 1.0509546431486094, + "learning_rate": 9.057649533714134e-06, + "loss": 0.1136, + "step": 2184 + }, + { + "epoch": 0.9940855323020928, + "grad_norm": 0.8067366260983323, + "learning_rate": 9.056814214716143e-06, + "loss": 0.0911, + "step": 2185 + }, + { + "epoch": 0.9945404913557779, + "grad_norm": 0.6708197750921108, + "learning_rate": 9.055978564213614e-06, + "loss": 0.0737, + "step": 2186 + }, + { + "epoch": 0.9949954504094631, + "grad_norm": 1.0620824544949425, + "learning_rate": 9.055142582274831e-06, + "loss": 0.1035, + "step": 2187 + }, + { + "epoch": 0.9954504094631483, + "grad_norm": 0.7809645088567875, + "learning_rate": 9.054306268968111e-06, + "loss": 0.0964, + "step": 2188 + }, + { + "epoch": 0.9959053685168335, + "grad_norm": 0.6922882332723763, + "learning_rate": 9.053469624361793e-06, + "loss": 0.0769, + "step": 2189 + }, + { + "epoch": 0.9963603275705186, + "grad_norm": 0.6135634693459231, + "learning_rate": 9.052632648524242e-06, + "loss": 0.0857, + "step": 2190 + }, + { + "epoch": 0.9968152866242038, + "grad_norm": 0.7230383107997012, + "learning_rate": 9.051795341523852e-06, + "loss": 0.0666, + "step": 2191 + }, + { + "epoch": 0.997270245677889, + "grad_norm": 0.7702877397526973, + "learning_rate": 9.050957703429044e-06, + "loss": 0.0861, + "step": 2192 + }, + { + "epoch": 0.9977252047315741, + "grad_norm": 0.79537510756259, + "learning_rate": 9.050119734308266e-06, + "loss": 0.0906, + "step": 2193 + }, + { + "epoch": 0.9981801637852593, + "grad_norm": 0.6318589660625535, + "learning_rate": 9.049281434229995e-06, + "loss": 0.0821, + "step": 2194 + }, + { + "epoch": 0.9986351228389445, + "grad_norm": 0.6618836956269952, + "learning_rate": 9.048442803262731e-06, + "loss": 0.0748, + "step": 2195 + }, + { + "epoch": 0.9990900818926297, + "grad_norm": 0.5469592163366095, + "learning_rate": 9.047603841475003e-06, + "loss": 0.066, + "step": 2196 + }, + { + "epoch": 0.9995450409463148, + "grad_norm": 0.6279887796401853, + "learning_rate": 9.046764548935368e-06, + "loss": 0.0743, + "step": 2197 + }, + { + "epoch": 1.0, + "grad_norm": 0.40519899960847033, + "learning_rate": 9.045924925712411e-06, + "loss": 0.0327, + "step": 2198 + }, + { + "epoch": 1.000454959053685, + "grad_norm": 0.41468311147935694, + "learning_rate": 9.045084971874738e-06, + "loss": 0.0243, + "step": 2199 + }, + { + "epoch": 1.0009099181073704, + "grad_norm": 0.5188055788021196, + "learning_rate": 9.04424468749099e-06, + "loss": 0.0375, + "step": 2200 + }, + { + "epoch": 1.0013648771610555, + "grad_norm": 0.4764585088866917, + "learning_rate": 9.04340407262983e-06, + "loss": 0.0395, + "step": 2201 + }, + { + "epoch": 1.0018198362147406, + "grad_norm": 0.28928828491344616, + "learning_rate": 9.042563127359946e-06, + "loss": 0.0208, + "step": 2202 + }, + { + "epoch": 1.0022747952684259, + "grad_norm": 0.5179468693343099, + "learning_rate": 9.041721851750063e-06, + "loss": 0.0322, + "step": 2203 + }, + { + "epoch": 1.002729754322111, + "grad_norm": 0.4198208723720039, + "learning_rate": 9.04088024586892e-06, + "loss": 0.0366, + "step": 2204 + }, + { + "epoch": 1.0031847133757963, + "grad_norm": 0.4784473138415427, + "learning_rate": 9.040038309785293e-06, + "loss": 0.0422, + "step": 2205 + }, + { + "epoch": 1.0036396724294814, + "grad_norm": 0.576332931747316, + "learning_rate": 9.039196043567979e-06, + "loss": 0.0387, + "step": 2206 + }, + { + "epoch": 1.0040946314831665, + "grad_norm": 0.5205582439898824, + "learning_rate": 9.038353447285807e-06, + "loss": 0.0551, + "step": 2207 + }, + { + "epoch": 1.0045495905368518, + "grad_norm": 0.7737994932982504, + "learning_rate": 9.037510521007626e-06, + "loss": 0.042, + "step": 2208 + }, + { + "epoch": 1.0050045495905369, + "grad_norm": 0.4056433108647087, + "learning_rate": 9.03666726480232e-06, + "loss": 0.0309, + "step": 2209 + }, + { + "epoch": 1.005459508644222, + "grad_norm": 0.31259616668647877, + "learning_rate": 9.035823678738795e-06, + "loss": 0.0247, + "step": 2210 + }, + { + "epoch": 1.0059144676979073, + "grad_norm": 0.545747512672262, + "learning_rate": 9.034979762885985e-06, + "loss": 0.0379, + "step": 2211 + }, + { + "epoch": 1.0063694267515924, + "grad_norm": 0.3531093457798414, + "learning_rate": 9.034135517312848e-06, + "loss": 0.0198, + "step": 2212 + }, + { + "epoch": 1.0068243858052774, + "grad_norm": 0.3471778421349368, + "learning_rate": 9.033290942088377e-06, + "loss": 0.0191, + "step": 2213 + }, + { + "epoch": 1.0072793448589628, + "grad_norm": 0.45123302926671505, + "learning_rate": 9.032446037281582e-06, + "loss": 0.0233, + "step": 2214 + }, + { + "epoch": 1.0077343039126478, + "grad_norm": 0.40498118740009004, + "learning_rate": 9.031600802961508e-06, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 1.008189262966333, + "grad_norm": 0.44404852807953515, + "learning_rate": 9.030755239197224e-06, + "loss": 0.0343, + "step": 2216 + }, + { + "epoch": 1.0086442220200182, + "grad_norm": 0.41886201143517243, + "learning_rate": 9.029909346057826e-06, + "loss": 0.0276, + "step": 2217 + }, + { + "epoch": 1.0090991810737033, + "grad_norm": 0.2879285343911946, + "learning_rate": 9.029063123612431e-06, + "loss": 0.02, + "step": 2218 + }, + { + "epoch": 1.0095541401273886, + "grad_norm": 0.5781677724909076, + "learning_rate": 9.028216571930197e-06, + "loss": 0.0339, + "step": 2219 + }, + { + "epoch": 1.0100090991810737, + "grad_norm": 0.42128445628125777, + "learning_rate": 9.027369691080292e-06, + "loss": 0.0329, + "step": 2220 + }, + { + "epoch": 1.0104640582347588, + "grad_norm": 0.4867304814601137, + "learning_rate": 9.026522481131925e-06, + "loss": 0.0451, + "step": 2221 + }, + { + "epoch": 1.0109190172884441, + "grad_norm": 0.35647532367363194, + "learning_rate": 9.025674942154325e-06, + "loss": 0.0202, + "step": 2222 + }, + { + "epoch": 1.0113739763421292, + "grad_norm": 0.6154778638320356, + "learning_rate": 9.024827074216748e-06, + "loss": 0.0619, + "step": 2223 + }, + { + "epoch": 1.0118289353958143, + "grad_norm": 0.46447780693049373, + "learning_rate": 9.023978877388479e-06, + "loss": 0.0265, + "step": 2224 + }, + { + "epoch": 1.0122838944494996, + "grad_norm": 0.4551756875246183, + "learning_rate": 9.02313035173883e-06, + "loss": 0.0167, + "step": 2225 + }, + { + "epoch": 1.0127388535031847, + "grad_norm": 0.4341660568896861, + "learning_rate": 9.022281497337133e-06, + "loss": 0.0257, + "step": 2226 + }, + { + "epoch": 1.0131938125568698, + "grad_norm": 0.37807969634776667, + "learning_rate": 9.021432314252758e-06, + "loss": 0.0235, + "step": 2227 + }, + { + "epoch": 1.013648771610555, + "grad_norm": 0.43791115876653813, + "learning_rate": 9.020582802555095e-06, + "loss": 0.0285, + "step": 2228 + }, + { + "epoch": 1.0141037306642402, + "grad_norm": 0.7541669794368306, + "learning_rate": 9.019732962313562e-06, + "loss": 0.0412, + "step": 2229 + }, + { + "epoch": 1.0145586897179253, + "grad_norm": 0.41591203424935613, + "learning_rate": 9.018882793597605e-06, + "loss": 0.0217, + "step": 2230 + }, + { + "epoch": 1.0150136487716106, + "grad_norm": 0.531675738557164, + "learning_rate": 9.018032296476695e-06, + "loss": 0.0259, + "step": 2231 + }, + { + "epoch": 1.0154686078252957, + "grad_norm": 0.4525534861298487, + "learning_rate": 9.017181471020331e-06, + "loss": 0.032, + "step": 2232 + }, + { + "epoch": 1.015923566878981, + "grad_norm": 0.5572932855598556, + "learning_rate": 9.016330317298038e-06, + "loss": 0.0321, + "step": 2233 + }, + { + "epoch": 1.016378525932666, + "grad_norm": 0.4880772464783955, + "learning_rate": 9.01547883537937e-06, + "loss": 0.0242, + "step": 2234 + }, + { + "epoch": 1.0168334849863512, + "grad_norm": 0.5290436879010799, + "learning_rate": 9.014627025333906e-06, + "loss": 0.0268, + "step": 2235 + }, + { + "epoch": 1.0172884440400365, + "grad_norm": 0.3469524553449946, + "learning_rate": 9.01377488723125e-06, + "loss": 0.0189, + "step": 2236 + }, + { + "epoch": 1.0177434030937216, + "grad_norm": 0.5381328202645719, + "learning_rate": 9.012922421141036e-06, + "loss": 0.0282, + "step": 2237 + }, + { + "epoch": 1.0181983621474067, + "grad_norm": 0.5437416204093511, + "learning_rate": 9.012069627132925e-06, + "loss": 0.0365, + "step": 2238 + }, + { + "epoch": 1.018653321201092, + "grad_norm": 0.5151432843211493, + "learning_rate": 9.011216505276601e-06, + "loss": 0.0327, + "step": 2239 + }, + { + "epoch": 1.019108280254777, + "grad_norm": 0.7194165832171175, + "learning_rate": 9.01036305564178e-06, + "loss": 0.0447, + "step": 2240 + }, + { + "epoch": 1.0195632393084622, + "grad_norm": 0.4895196525190099, + "learning_rate": 9.009509278298201e-06, + "loss": 0.0226, + "step": 2241 + }, + { + "epoch": 1.0200181983621475, + "grad_norm": 0.36403402277658775, + "learning_rate": 9.008655173315629e-06, + "loss": 0.0172, + "step": 2242 + }, + { + "epoch": 1.0204731574158326, + "grad_norm": 0.5192307375895406, + "learning_rate": 9.00780074076386e-06, + "loss": 0.0281, + "step": 2243 + }, + { + "epoch": 1.0209281164695176, + "grad_norm": 0.5855074570295021, + "learning_rate": 9.006945980712713e-06, + "loss": 0.039, + "step": 2244 + }, + { + "epoch": 1.021383075523203, + "grad_norm": 0.3530576777441414, + "learning_rate": 9.006090893232036e-06, + "loss": 0.0165, + "step": 2245 + }, + { + "epoch": 1.021838034576888, + "grad_norm": 0.46560015374930225, + "learning_rate": 9.005235478391704e-06, + "loss": 0.031, + "step": 2246 + }, + { + "epoch": 1.0222929936305734, + "grad_norm": 0.4320906337363968, + "learning_rate": 9.004379736261614e-06, + "loss": 0.0229, + "step": 2247 + }, + { + "epoch": 1.0227479526842584, + "grad_norm": 0.5843690219708401, + "learning_rate": 9.003523666911698e-06, + "loss": 0.0398, + "step": 2248 + }, + { + "epoch": 1.0232029117379435, + "grad_norm": 0.4876049343109499, + "learning_rate": 9.002667270411905e-06, + "loss": 0.0209, + "step": 2249 + }, + { + "epoch": 1.0236578707916288, + "grad_norm": 0.4996309287294051, + "learning_rate": 9.001810546832219e-06, + "loss": 0.0339, + "step": 2250 + }, + { + "epoch": 1.024112829845314, + "grad_norm": 0.44615485337683974, + "learning_rate": 9.000953496242648e-06, + "loss": 0.0367, + "step": 2251 + }, + { + "epoch": 1.024567788898999, + "grad_norm": 0.4816248261028461, + "learning_rate": 9.000096118713226e-06, + "loss": 0.0302, + "step": 2252 + }, + { + "epoch": 1.0250227479526843, + "grad_norm": 0.3202895454501902, + "learning_rate": 8.999238414314014e-06, + "loss": 0.018, + "step": 2253 + }, + { + "epoch": 1.0254777070063694, + "grad_norm": 0.39394390771447657, + "learning_rate": 8.998380383115098e-06, + "loss": 0.0203, + "step": 2254 + }, + { + "epoch": 1.0259326660600545, + "grad_norm": 0.6774965098079401, + "learning_rate": 8.997522025186592e-06, + "loss": 0.0444, + "step": 2255 + }, + { + "epoch": 1.0263876251137398, + "grad_norm": 0.6156285698131154, + "learning_rate": 8.996663340598642e-06, + "loss": 0.033, + "step": 2256 + }, + { + "epoch": 1.026842584167425, + "grad_norm": 0.6636465470342775, + "learning_rate": 8.995804329421408e-06, + "loss": 0.0282, + "step": 2257 + }, + { + "epoch": 1.02729754322111, + "grad_norm": 0.7643329557559453, + "learning_rate": 8.994944991725094e-06, + "loss": 0.0413, + "step": 2258 + }, + { + "epoch": 1.0277525022747953, + "grad_norm": 0.4484887858566329, + "learning_rate": 8.994085327579914e-06, + "loss": 0.0244, + "step": 2259 + }, + { + "epoch": 1.0282074613284804, + "grad_norm": 0.6046158805682427, + "learning_rate": 8.993225337056118e-06, + "loss": 0.0372, + "step": 2260 + }, + { + "epoch": 1.0286624203821657, + "grad_norm": 0.5297868937946675, + "learning_rate": 8.992365020223982e-06, + "loss": 0.0407, + "step": 2261 + }, + { + "epoch": 1.0291173794358508, + "grad_norm": 0.4805793953554321, + "learning_rate": 8.991504377153805e-06, + "loss": 0.0297, + "step": 2262 + }, + { + "epoch": 1.0295723384895359, + "grad_norm": 0.6196673347815759, + "learning_rate": 8.990643407915915e-06, + "loss": 0.0397, + "step": 2263 + }, + { + "epoch": 1.0300272975432212, + "grad_norm": 0.6223272220447811, + "learning_rate": 8.98978211258067e-06, + "loss": 0.0409, + "step": 2264 + }, + { + "epoch": 1.0304822565969063, + "grad_norm": 0.49952273986223505, + "learning_rate": 8.988920491218446e-06, + "loss": 0.0272, + "step": 2265 + }, + { + "epoch": 1.0309372156505914, + "grad_norm": 0.6292771186739616, + "learning_rate": 8.988058543899654e-06, + "loss": 0.0384, + "step": 2266 + }, + { + "epoch": 1.0313921747042767, + "grad_norm": 0.38772458827936923, + "learning_rate": 8.987196270694727e-06, + "loss": 0.024, + "step": 2267 + }, + { + "epoch": 1.0318471337579618, + "grad_norm": 0.8799833129039605, + "learning_rate": 8.986333671674128e-06, + "loss": 0.0341, + "step": 2268 + }, + { + "epoch": 1.0323020928116469, + "grad_norm": 0.6271731268799836, + "learning_rate": 8.985470746908342e-06, + "loss": 0.033, + "step": 2269 + }, + { + "epoch": 1.0327570518653322, + "grad_norm": 0.38786047905872434, + "learning_rate": 8.984607496467885e-06, + "loss": 0.021, + "step": 2270 + }, + { + "epoch": 1.0332120109190173, + "grad_norm": 0.6280644096851069, + "learning_rate": 8.9837439204233e-06, + "loss": 0.0491, + "step": 2271 + }, + { + "epoch": 1.0336669699727024, + "grad_norm": 0.5847841334225715, + "learning_rate": 8.98288001884515e-06, + "loss": 0.0337, + "step": 2272 + }, + { + "epoch": 1.0341219290263877, + "grad_norm": 0.36088677101245703, + "learning_rate": 8.982015791804032e-06, + "loss": 0.0156, + "step": 2273 + }, + { + "epoch": 1.0345768880800728, + "grad_norm": 0.4537884974426005, + "learning_rate": 8.981151239370566e-06, + "loss": 0.027, + "step": 2274 + }, + { + "epoch": 1.035031847133758, + "grad_norm": 0.6090066061447076, + "learning_rate": 8.9802863616154e-06, + "loss": 0.0378, + "step": 2275 + }, + { + "epoch": 1.0354868061874432, + "grad_norm": 0.7101749544755233, + "learning_rate": 8.979421158609206e-06, + "loss": 0.0439, + "step": 2276 + }, + { + "epoch": 1.0359417652411282, + "grad_norm": 0.5742339125588956, + "learning_rate": 8.978555630422686e-06, + "loss": 0.0328, + "step": 2277 + }, + { + "epoch": 1.0363967242948136, + "grad_norm": 0.632873074474985, + "learning_rate": 8.977689777126568e-06, + "loss": 0.0472, + "step": 2278 + }, + { + "epoch": 1.0368516833484986, + "grad_norm": 0.8069979527700195, + "learning_rate": 8.976823598791604e-06, + "loss": 0.0319, + "step": 2279 + }, + { + "epoch": 1.0373066424021837, + "grad_norm": 0.4015240288673539, + "learning_rate": 8.975957095488575e-06, + "loss": 0.0269, + "step": 2280 + }, + { + "epoch": 1.037761601455869, + "grad_norm": 0.5786381841993868, + "learning_rate": 8.975090267288286e-06, + "loss": 0.0296, + "step": 2281 + }, + { + "epoch": 1.0382165605095541, + "grad_norm": 0.5451914455456522, + "learning_rate": 8.974223114261574e-06, + "loss": 0.0343, + "step": 2282 + }, + { + "epoch": 1.0386715195632392, + "grad_norm": 0.6945170105788371, + "learning_rate": 8.973355636479294e-06, + "loss": 0.0476, + "step": 2283 + }, + { + "epoch": 1.0391264786169245, + "grad_norm": 0.5171663408691534, + "learning_rate": 8.972487834012338e-06, + "loss": 0.0301, + "step": 2284 + }, + { + "epoch": 1.0395814376706096, + "grad_norm": 0.494166229450044, + "learning_rate": 8.971619706931613e-06, + "loss": 0.0226, + "step": 2285 + }, + { + "epoch": 1.0400363967242947, + "grad_norm": 0.7676778552323048, + "learning_rate": 8.970751255308063e-06, + "loss": 0.045, + "step": 2286 + }, + { + "epoch": 1.04049135577798, + "grad_norm": 0.44323443611073776, + "learning_rate": 8.969882479212652e-06, + "loss": 0.0196, + "step": 2287 + }, + { + "epoch": 1.040946314831665, + "grad_norm": 0.41146000373164554, + "learning_rate": 8.969013378716371e-06, + "loss": 0.0196, + "step": 2288 + }, + { + "epoch": 1.0414012738853504, + "grad_norm": 0.3888711487160539, + "learning_rate": 8.968143953890242e-06, + "loss": 0.0228, + "step": 2289 + }, + { + "epoch": 1.0418562329390355, + "grad_norm": 0.49379959221935377, + "learning_rate": 8.96727420480531e-06, + "loss": 0.0306, + "step": 2290 + }, + { + "epoch": 1.0423111919927206, + "grad_norm": 0.48325360654642197, + "learning_rate": 8.966404131532645e-06, + "loss": 0.0265, + "step": 2291 + }, + { + "epoch": 1.042766151046406, + "grad_norm": 0.47493208719115093, + "learning_rate": 8.965533734143347e-06, + "loss": 0.0239, + "step": 2292 + }, + { + "epoch": 1.043221110100091, + "grad_norm": 0.556271091368108, + "learning_rate": 8.964663012708538e-06, + "loss": 0.0365, + "step": 2293 + }, + { + "epoch": 1.043676069153776, + "grad_norm": 0.8512257992210553, + "learning_rate": 8.963791967299375e-06, + "loss": 0.0332, + "step": 2294 + }, + { + "epoch": 1.0441310282074614, + "grad_norm": 0.4600946915818348, + "learning_rate": 8.96292059798703e-06, + "loss": 0.0254, + "step": 2295 + }, + { + "epoch": 1.0445859872611465, + "grad_norm": 0.5926927797370501, + "learning_rate": 8.962048904842713e-06, + "loss": 0.034, + "step": 2296 + }, + { + "epoch": 1.0450409463148316, + "grad_norm": 0.5174352508068348, + "learning_rate": 8.96117688793765e-06, + "loss": 0.0334, + "step": 2297 + }, + { + "epoch": 1.0454959053685169, + "grad_norm": 0.4726564762945724, + "learning_rate": 8.960304547343101e-06, + "loss": 0.0271, + "step": 2298 + }, + { + "epoch": 1.045950864422202, + "grad_norm": 0.49021838747059965, + "learning_rate": 8.959431883130348e-06, + "loss": 0.0272, + "step": 2299 + }, + { + "epoch": 1.046405823475887, + "grad_norm": 0.33392762330146264, + "learning_rate": 8.958558895370703e-06, + "loss": 0.0184, + "step": 2300 + }, + { + "epoch": 1.0468607825295724, + "grad_norm": 0.43970090494512293, + "learning_rate": 8.9576855841355e-06, + "loss": 0.0247, + "step": 2301 + }, + { + "epoch": 1.0473157415832575, + "grad_norm": 0.34961568768416074, + "learning_rate": 8.956811949496108e-06, + "loss": 0.0207, + "step": 2302 + }, + { + "epoch": 1.0477707006369428, + "grad_norm": 0.5047819086466443, + "learning_rate": 8.955937991523908e-06, + "loss": 0.0358, + "step": 2303 + }, + { + "epoch": 1.0482256596906279, + "grad_norm": 0.5502957295717672, + "learning_rate": 8.955063710290322e-06, + "loss": 0.0396, + "step": 2304 + }, + { + "epoch": 1.048680618744313, + "grad_norm": 0.4007555279082937, + "learning_rate": 8.95418910586679e-06, + "loss": 0.0205, + "step": 2305 + }, + { + "epoch": 1.0491355777979983, + "grad_norm": 0.37932885662916804, + "learning_rate": 8.953314178324782e-06, + "loss": 0.0261, + "step": 2306 + }, + { + "epoch": 1.0495905368516834, + "grad_norm": 0.6331059696275105, + "learning_rate": 8.952438927735793e-06, + "loss": 0.0397, + "step": 2307 + }, + { + "epoch": 1.0500454959053684, + "grad_norm": 0.5533999405103901, + "learning_rate": 8.951563354171343e-06, + "loss": 0.0216, + "step": 2308 + }, + { + "epoch": 1.0505004549590538, + "grad_norm": 0.5064049753801714, + "learning_rate": 8.950687457702981e-06, + "loss": 0.0253, + "step": 2309 + }, + { + "epoch": 1.0509554140127388, + "grad_norm": 0.7762514931128638, + "learning_rate": 8.94981123840228e-06, + "loss": 0.0257, + "step": 2310 + }, + { + "epoch": 1.051410373066424, + "grad_norm": 0.5258772784610919, + "learning_rate": 8.948934696340842e-06, + "loss": 0.0402, + "step": 2311 + }, + { + "epoch": 1.0518653321201092, + "grad_norm": 0.5179164003875761, + "learning_rate": 8.948057831590296e-06, + "loss": 0.0392, + "step": 2312 + }, + { + "epoch": 1.0523202911737943, + "grad_norm": 0.4873683404674824, + "learning_rate": 8.94718064422229e-06, + "loss": 0.0225, + "step": 2313 + }, + { + "epoch": 1.0527752502274794, + "grad_norm": 0.42294954664238593, + "learning_rate": 8.94630313430851e-06, + "loss": 0.0239, + "step": 2314 + }, + { + "epoch": 1.0532302092811647, + "grad_norm": 0.5120965619588207, + "learning_rate": 8.945425301920656e-06, + "loss": 0.0239, + "step": 2315 + }, + { + "epoch": 1.0536851683348498, + "grad_norm": 0.5274581767565953, + "learning_rate": 8.944547147130467e-06, + "loss": 0.0395, + "step": 2316 + }, + { + "epoch": 1.0541401273885351, + "grad_norm": 0.6240914390797723, + "learning_rate": 8.943668670009698e-06, + "loss": 0.04, + "step": 2317 + }, + { + "epoch": 1.0545950864422202, + "grad_norm": 0.588480807715609, + "learning_rate": 8.942789870630133e-06, + "loss": 0.0379, + "step": 2318 + }, + { + "epoch": 1.0550500454959053, + "grad_norm": 0.5328051168509789, + "learning_rate": 8.941910749063587e-06, + "loss": 0.0256, + "step": 2319 + }, + { + "epoch": 1.0555050045495906, + "grad_norm": 0.5662136884367794, + "learning_rate": 8.941031305381894e-06, + "loss": 0.0349, + "step": 2320 + }, + { + "epoch": 1.0559599636032757, + "grad_norm": 0.4080289916939306, + "learning_rate": 8.940151539656922e-06, + "loss": 0.0203, + "step": 2321 + }, + { + "epoch": 1.0564149226569608, + "grad_norm": 0.6644738842779135, + "learning_rate": 8.93927145196056e-06, + "loss": 0.0295, + "step": 2322 + }, + { + "epoch": 1.056869881710646, + "grad_norm": 0.43989425636246393, + "learning_rate": 8.938391042364723e-06, + "loss": 0.0257, + "step": 2323 + }, + { + "epoch": 1.0573248407643312, + "grad_norm": 0.5431541428763835, + "learning_rate": 8.937510310941358e-06, + "loss": 0.03, + "step": 2324 + }, + { + "epoch": 1.0577797998180163, + "grad_norm": 0.5122724279785533, + "learning_rate": 8.936629257762429e-06, + "loss": 0.0273, + "step": 2325 + }, + { + "epoch": 1.0582347588717016, + "grad_norm": 0.41195858239961775, + "learning_rate": 8.935747882899937e-06, + "loss": 0.0216, + "step": 2326 + }, + { + "epoch": 1.0586897179253867, + "grad_norm": 0.5171757707727286, + "learning_rate": 8.9348661864259e-06, + "loss": 0.0299, + "step": 2327 + }, + { + "epoch": 1.0591446769790718, + "grad_norm": 0.6216382380161013, + "learning_rate": 8.93398416841237e-06, + "loss": 0.0525, + "step": 2328 + }, + { + "epoch": 1.059599636032757, + "grad_norm": 0.47615445722593264, + "learning_rate": 8.933101828931418e-06, + "loss": 0.0229, + "step": 2329 + }, + { + "epoch": 1.0600545950864422, + "grad_norm": 0.5543921495737715, + "learning_rate": 8.932219168055146e-06, + "loss": 0.0353, + "step": 2330 + }, + { + "epoch": 1.0605095541401275, + "grad_norm": 0.4807073495602966, + "learning_rate": 8.931336185855682e-06, + "loss": 0.029, + "step": 2331 + }, + { + "epoch": 1.0609645131938126, + "grad_norm": 0.7132043951444881, + "learning_rate": 8.930452882405178e-06, + "loss": 0.0573, + "step": 2332 + }, + { + "epoch": 1.0614194722474977, + "grad_norm": 0.7323908092635573, + "learning_rate": 8.929569257775816e-06, + "loss": 0.031, + "step": 2333 + }, + { + "epoch": 1.061874431301183, + "grad_norm": 0.7282498524373471, + "learning_rate": 8.9286853120398e-06, + "loss": 0.0212, + "step": 2334 + }, + { + "epoch": 1.062329390354868, + "grad_norm": 0.5041730540211715, + "learning_rate": 8.92780104526936e-06, + "loss": 0.0219, + "step": 2335 + }, + { + "epoch": 1.0627843494085532, + "grad_norm": 0.5694707546108049, + "learning_rate": 8.926916457536755e-06, + "loss": 0.0277, + "step": 2336 + }, + { + "epoch": 1.0632393084622385, + "grad_norm": 0.4942987205465501, + "learning_rate": 8.926031548914274e-06, + "loss": 0.0283, + "step": 2337 + }, + { + "epoch": 1.0636942675159236, + "grad_norm": 0.7094719472889628, + "learning_rate": 8.925146319474225e-06, + "loss": 0.0484, + "step": 2338 + }, + { + "epoch": 1.0641492265696086, + "grad_norm": 0.5401572696577567, + "learning_rate": 8.924260769288944e-06, + "loss": 0.032, + "step": 2339 + }, + { + "epoch": 1.064604185623294, + "grad_norm": 0.6271229371930636, + "learning_rate": 8.923374898430794e-06, + "loss": 0.0417, + "step": 2340 + }, + { + "epoch": 1.065059144676979, + "grad_norm": 0.5384710947557135, + "learning_rate": 8.922488706972165e-06, + "loss": 0.028, + "step": 2341 + }, + { + "epoch": 1.0655141037306644, + "grad_norm": 0.5738095562796759, + "learning_rate": 8.921602194985473e-06, + "loss": 0.0251, + "step": 2342 + }, + { + "epoch": 1.0659690627843494, + "grad_norm": 0.4114388383836, + "learning_rate": 8.920715362543158e-06, + "loss": 0.0257, + "step": 2343 + }, + { + "epoch": 1.0664240218380345, + "grad_norm": 0.4407026853756295, + "learning_rate": 8.919828209717691e-06, + "loss": 0.0318, + "step": 2344 + }, + { + "epoch": 1.0668789808917198, + "grad_norm": 0.5795706484311789, + "learning_rate": 8.918940736581565e-06, + "loss": 0.0384, + "step": 2345 + }, + { + "epoch": 1.067333939945405, + "grad_norm": 0.4997138165488597, + "learning_rate": 8.918052943207298e-06, + "loss": 0.0339, + "step": 2346 + }, + { + "epoch": 1.06778889899909, + "grad_norm": 0.6466785074736559, + "learning_rate": 8.91716482966744e-06, + "loss": 0.0412, + "step": 2347 + }, + { + "epoch": 1.0682438580527753, + "grad_norm": 0.6101860514996267, + "learning_rate": 8.916276396034561e-06, + "loss": 0.0349, + "step": 2348 + }, + { + "epoch": 1.0686988171064604, + "grad_norm": 0.6648890763063255, + "learning_rate": 8.915387642381261e-06, + "loss": 0.0374, + "step": 2349 + }, + { + "epoch": 1.0691537761601455, + "grad_norm": 0.6435783427790035, + "learning_rate": 8.914498568780163e-06, + "loss": 0.0425, + "step": 2350 + }, + { + "epoch": 1.0696087352138308, + "grad_norm": 0.4168529921191238, + "learning_rate": 8.913609175303923e-06, + "loss": 0.0222, + "step": 2351 + }, + { + "epoch": 1.070063694267516, + "grad_norm": 0.370333742802149, + "learning_rate": 8.912719462025213e-06, + "loss": 0.018, + "step": 2352 + }, + { + "epoch": 1.070518653321201, + "grad_norm": 0.3929772094003772, + "learning_rate": 8.911829429016737e-06, + "loss": 0.0184, + "step": 2353 + }, + { + "epoch": 1.0709736123748863, + "grad_norm": 0.36777976145335695, + "learning_rate": 8.910939076351228e-06, + "loss": 0.0199, + "step": 2354 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.5445905742319043, + "learning_rate": 8.910048404101437e-06, + "loss": 0.0297, + "step": 2355 + }, + { + "epoch": 1.0718835304822565, + "grad_norm": 0.517651494476337, + "learning_rate": 8.90915741234015e-06, + "loss": 0.0244, + "step": 2356 + }, + { + "epoch": 1.0723384895359418, + "grad_norm": 0.6079868190664829, + "learning_rate": 8.908266101140173e-06, + "loss": 0.0327, + "step": 2357 + }, + { + "epoch": 1.0727934485896269, + "grad_norm": 0.5005614750938115, + "learning_rate": 8.907374470574339e-06, + "loss": 0.0288, + "step": 2358 + }, + { + "epoch": 1.0732484076433122, + "grad_norm": 0.41084278869296126, + "learning_rate": 8.906482520715508e-06, + "loss": 0.0196, + "step": 2359 + }, + { + "epoch": 1.0737033666969973, + "grad_norm": 0.42883961230062595, + "learning_rate": 8.905590251636566e-06, + "loss": 0.0201, + "step": 2360 + }, + { + "epoch": 1.0741583257506824, + "grad_norm": 0.7507509176249603, + "learning_rate": 8.904697663410429e-06, + "loss": 0.0519, + "step": 2361 + }, + { + "epoch": 1.0746132848043677, + "grad_norm": 0.35684834441788627, + "learning_rate": 8.90380475611003e-06, + "loss": 0.0193, + "step": 2362 + }, + { + "epoch": 1.0750682438580528, + "grad_norm": 0.359991301638448, + "learning_rate": 8.902911529808338e-06, + "loss": 0.02, + "step": 2363 + }, + { + "epoch": 1.0755232029117379, + "grad_norm": 0.6485293447004715, + "learning_rate": 8.90201798457834e-06, + "loss": 0.05, + "step": 2364 + }, + { + "epoch": 1.0759781619654232, + "grad_norm": 0.35596882973823685, + "learning_rate": 8.901124120493055e-06, + "loss": 0.0201, + "step": 2365 + }, + { + "epoch": 1.0764331210191083, + "grad_norm": 0.5195485453283638, + "learning_rate": 8.900229937625522e-06, + "loss": 0.0267, + "step": 2366 + }, + { + "epoch": 1.0768880800727934, + "grad_norm": 0.5121436407601963, + "learning_rate": 8.899335436048813e-06, + "loss": 0.0293, + "step": 2367 + }, + { + "epoch": 1.0773430391264787, + "grad_norm": 0.574083355691705, + "learning_rate": 8.898440615836021e-06, + "loss": 0.0314, + "step": 2368 + }, + { + "epoch": 1.0777979981801638, + "grad_norm": 0.36323016195490376, + "learning_rate": 8.897545477060268e-06, + "loss": 0.0164, + "step": 2369 + }, + { + "epoch": 1.078252957233849, + "grad_norm": 0.44874033315946665, + "learning_rate": 8.8966500197947e-06, + "loss": 0.0255, + "step": 2370 + }, + { + "epoch": 1.0787079162875342, + "grad_norm": 0.4549169634711705, + "learning_rate": 8.895754244112486e-06, + "loss": 0.0252, + "step": 2371 + }, + { + "epoch": 1.0791628753412192, + "grad_norm": 0.5188300138751303, + "learning_rate": 8.894858150086832e-06, + "loss": 0.022, + "step": 2372 + }, + { + "epoch": 1.0796178343949046, + "grad_norm": 0.5077854205250166, + "learning_rate": 8.893961737790957e-06, + "loss": 0.027, + "step": 2373 + }, + { + "epoch": 1.0800727934485896, + "grad_norm": 0.5080695970336, + "learning_rate": 8.893065007298116e-06, + "loss": 0.0293, + "step": 2374 + }, + { + "epoch": 1.0805277525022747, + "grad_norm": 0.49124016807194615, + "learning_rate": 8.89216795868158e-06, + "loss": 0.0253, + "step": 2375 + }, + { + "epoch": 1.08098271155596, + "grad_norm": 0.746420330430573, + "learning_rate": 8.891270592014658e-06, + "loss": 0.0393, + "step": 2376 + }, + { + "epoch": 1.0814376706096451, + "grad_norm": 0.5899621371906842, + "learning_rate": 8.890372907370677e-06, + "loss": 0.0325, + "step": 2377 + }, + { + "epoch": 1.0818926296633302, + "grad_norm": 0.538668781912988, + "learning_rate": 8.889474904822987e-06, + "loss": 0.0254, + "step": 2378 + }, + { + "epoch": 1.0823475887170155, + "grad_norm": 0.48796027217616167, + "learning_rate": 8.888576584444976e-06, + "loss": 0.0284, + "step": 2379 + }, + { + "epoch": 1.0828025477707006, + "grad_norm": 0.4607384499708701, + "learning_rate": 8.887677946310045e-06, + "loss": 0.0293, + "step": 2380 + }, + { + "epoch": 1.0832575068243857, + "grad_norm": 0.6691227522534325, + "learning_rate": 8.886778990491632e-06, + "loss": 0.0479, + "step": 2381 + }, + { + "epoch": 1.083712465878071, + "grad_norm": 0.4131339751828579, + "learning_rate": 8.885879717063189e-06, + "loss": 0.0232, + "step": 2382 + }, + { + "epoch": 1.084167424931756, + "grad_norm": 0.49834287436563, + "learning_rate": 8.884980126098206e-06, + "loss": 0.0261, + "step": 2383 + }, + { + "epoch": 1.0846223839854412, + "grad_norm": 0.49133678192638947, + "learning_rate": 8.88408021767019e-06, + "loss": 0.0217, + "step": 2384 + }, + { + "epoch": 1.0850773430391265, + "grad_norm": 0.4897177991752284, + "learning_rate": 8.88317999185268e-06, + "loss": 0.0304, + "step": 2385 + }, + { + "epoch": 1.0855323020928116, + "grad_norm": 0.5332982190122252, + "learning_rate": 8.882279448719235e-06, + "loss": 0.024, + "step": 2386 + }, + { + "epoch": 1.085987261146497, + "grad_norm": 0.39337001966991797, + "learning_rate": 8.881378588343448e-06, + "loss": 0.0195, + "step": 2387 + }, + { + "epoch": 1.086442220200182, + "grad_norm": 0.5648723431118464, + "learning_rate": 8.88047741079893e-06, + "loss": 0.0277, + "step": 2388 + }, + { + "epoch": 1.086897179253867, + "grad_norm": 0.38358401084782046, + "learning_rate": 8.879575916159323e-06, + "loss": 0.0234, + "step": 2389 + }, + { + "epoch": 1.0873521383075524, + "grad_norm": 0.4916039064871815, + "learning_rate": 8.878674104498293e-06, + "loss": 0.0196, + "step": 2390 + }, + { + "epoch": 1.0878070973612375, + "grad_norm": 0.4574406020630443, + "learning_rate": 8.877771975889529e-06, + "loss": 0.0266, + "step": 2391 + }, + { + "epoch": 1.0882620564149226, + "grad_norm": 1.2527103886930033, + "learning_rate": 8.876869530406753e-06, + "loss": 0.085, + "step": 2392 + }, + { + "epoch": 1.0887170154686079, + "grad_norm": 0.6740099441800771, + "learning_rate": 8.875966768123705e-06, + "loss": 0.0491, + "step": 2393 + }, + { + "epoch": 1.089171974522293, + "grad_norm": 0.8127319301316774, + "learning_rate": 8.875063689114157e-06, + "loss": 0.0351, + "step": 2394 + }, + { + "epoch": 1.089626933575978, + "grad_norm": 0.6883882884250196, + "learning_rate": 8.874160293451903e-06, + "loss": 0.0351, + "step": 2395 + }, + { + "epoch": 1.0900818926296634, + "grad_norm": 0.472050537765526, + "learning_rate": 8.873256581210767e-06, + "loss": 0.0281, + "step": 2396 + }, + { + "epoch": 1.0905368516833485, + "grad_norm": 0.43429585005126187, + "learning_rate": 8.872352552464594e-06, + "loss": 0.0217, + "step": 2397 + }, + { + "epoch": 1.0909918107370338, + "grad_norm": 0.7559591015285818, + "learning_rate": 8.871448207287259e-06, + "loss": 0.0234, + "step": 2398 + }, + { + "epoch": 1.0914467697907189, + "grad_norm": 1.295843093263791, + "learning_rate": 8.870543545752657e-06, + "loss": 0.0378, + "step": 2399 + }, + { + "epoch": 1.091901728844404, + "grad_norm": 0.687703240327456, + "learning_rate": 8.869638567934718e-06, + "loss": 0.0428, + "step": 2400 + }, + { + "epoch": 1.0923566878980893, + "grad_norm": 0.5316380088515792, + "learning_rate": 8.86873327390739e-06, + "loss": 0.0207, + "step": 2401 + }, + { + "epoch": 1.0928116469517744, + "grad_norm": 0.37080940024955544, + "learning_rate": 8.867827663744649e-06, + "loss": 0.014, + "step": 2402 + }, + { + "epoch": 1.0932666060054594, + "grad_norm": 0.551372034105751, + "learning_rate": 8.8669217375205e-06, + "loss": 0.0407, + "step": 2403 + }, + { + "epoch": 1.0937215650591448, + "grad_norm": 0.550827427093742, + "learning_rate": 8.866015495308967e-06, + "loss": 0.0295, + "step": 2404 + }, + { + "epoch": 1.0941765241128298, + "grad_norm": 0.5312346261037174, + "learning_rate": 8.865108937184108e-06, + "loss": 0.0329, + "step": 2405 + }, + { + "epoch": 1.094631483166515, + "grad_norm": 0.606116027973049, + "learning_rate": 8.864202063220003e-06, + "loss": 0.036, + "step": 2406 + }, + { + "epoch": 1.0950864422202002, + "grad_norm": 0.5039409256044083, + "learning_rate": 8.863294873490752e-06, + "loss": 0.0237, + "step": 2407 + }, + { + "epoch": 1.0955414012738853, + "grad_norm": 0.7088932326845141, + "learning_rate": 8.862387368070493e-06, + "loss": 0.0502, + "step": 2408 + }, + { + "epoch": 1.0959963603275704, + "grad_norm": 0.42457685669799344, + "learning_rate": 8.86147954703338e-06, + "loss": 0.0232, + "step": 2409 + }, + { + "epoch": 1.0964513193812557, + "grad_norm": 0.400049727629285, + "learning_rate": 8.860571410453598e-06, + "loss": 0.0137, + "step": 2410 + }, + { + "epoch": 1.0969062784349408, + "grad_norm": 0.5528326412344238, + "learning_rate": 8.859662958405352e-06, + "loss": 0.0259, + "step": 2411 + }, + { + "epoch": 1.097361237488626, + "grad_norm": 0.3740020218354164, + "learning_rate": 8.858754190962881e-06, + "loss": 0.0207, + "step": 2412 + }, + { + "epoch": 1.0978161965423112, + "grad_norm": 0.43380267454252947, + "learning_rate": 8.857845108200443e-06, + "loss": 0.03, + "step": 2413 + }, + { + "epoch": 1.0982711555959963, + "grad_norm": 0.41117776188244837, + "learning_rate": 8.856935710192326e-06, + "loss": 0.0217, + "step": 2414 + }, + { + "epoch": 1.0987261146496816, + "grad_norm": 0.7295481072089418, + "learning_rate": 8.856025997012837e-06, + "loss": 0.0355, + "step": 2415 + }, + { + "epoch": 1.0991810737033667, + "grad_norm": 0.6100308273835641, + "learning_rate": 8.85511596873632e-06, + "loss": 0.0369, + "step": 2416 + }, + { + "epoch": 1.0996360327570518, + "grad_norm": 0.41413261117443184, + "learning_rate": 8.854205625437135e-06, + "loss": 0.0198, + "step": 2417 + }, + { + "epoch": 1.100090991810737, + "grad_norm": 0.45865368499615844, + "learning_rate": 8.853294967189672e-06, + "loss": 0.0274, + "step": 2418 + }, + { + "epoch": 1.1005459508644222, + "grad_norm": 0.49503724640291885, + "learning_rate": 8.852383994068345e-06, + "loss": 0.039, + "step": 2419 + }, + { + "epoch": 1.1010009099181073, + "grad_norm": 0.3278139965958097, + "learning_rate": 8.851472706147595e-06, + "loss": 0.02, + "step": 2420 + }, + { + "epoch": 1.1014558689717926, + "grad_norm": 0.7072991481662654, + "learning_rate": 8.85056110350189e-06, + "loss": 0.0478, + "step": 2421 + }, + { + "epoch": 1.1019108280254777, + "grad_norm": 0.3754428113606483, + "learning_rate": 8.84964918620572e-06, + "loss": 0.0204, + "step": 2422 + }, + { + "epoch": 1.1023657870791628, + "grad_norm": 0.7096758634544409, + "learning_rate": 8.848736954333603e-06, + "loss": 0.0335, + "step": 2423 + }, + { + "epoch": 1.102820746132848, + "grad_norm": 0.5727995354405594, + "learning_rate": 8.847824407960083e-06, + "loss": 0.0323, + "step": 2424 + }, + { + "epoch": 1.1032757051865332, + "grad_norm": 0.6229568548114003, + "learning_rate": 8.84691154715973e-06, + "loss": 0.0309, + "step": 2425 + }, + { + "epoch": 1.1037306642402185, + "grad_norm": 0.5010513455704715, + "learning_rate": 8.845998372007136e-06, + "loss": 0.0286, + "step": 2426 + }, + { + "epoch": 1.1041856232939036, + "grad_norm": 0.34862957832393143, + "learning_rate": 8.845084882576924e-06, + "loss": 0.0165, + "step": 2427 + }, + { + "epoch": 1.1046405823475887, + "grad_norm": 0.5610710585811625, + "learning_rate": 8.84417107894374e-06, + "loss": 0.0381, + "step": 2428 + }, + { + "epoch": 1.105095541401274, + "grad_norm": 0.3998367702132408, + "learning_rate": 8.843256961182255e-06, + "loss": 0.0186, + "step": 2429 + }, + { + "epoch": 1.105550500454959, + "grad_norm": 0.6787215229828617, + "learning_rate": 8.842342529367167e-06, + "loss": 0.0487, + "step": 2430 + }, + { + "epoch": 1.1060054595086442, + "grad_norm": 0.6483563929183911, + "learning_rate": 8.8414277835732e-06, + "loss": 0.0409, + "step": 2431 + }, + { + "epoch": 1.1064604185623295, + "grad_norm": 0.6351823340870137, + "learning_rate": 8.840512723875103e-06, + "loss": 0.0497, + "step": 2432 + }, + { + "epoch": 1.1069153776160146, + "grad_norm": 0.3467791981865341, + "learning_rate": 8.839597350347648e-06, + "loss": 0.0172, + "step": 2433 + }, + { + "epoch": 1.1073703366696996, + "grad_norm": 0.4877926867999841, + "learning_rate": 8.838681663065638e-06, + "loss": 0.0268, + "step": 2434 + }, + { + "epoch": 1.107825295723385, + "grad_norm": 0.561052741145843, + "learning_rate": 8.837765662103898e-06, + "loss": 0.0351, + "step": 2435 + }, + { + "epoch": 1.10828025477707, + "grad_norm": 0.5339886977527083, + "learning_rate": 8.836849347537278e-06, + "loss": 0.0286, + "step": 2436 + }, + { + "epoch": 1.1087352138307551, + "grad_norm": 0.41940315295115715, + "learning_rate": 8.835932719440658e-06, + "loss": 0.016, + "step": 2437 + }, + { + "epoch": 1.1091901728844404, + "grad_norm": 0.500811248377599, + "learning_rate": 8.835015777888938e-06, + "loss": 0.0277, + "step": 2438 + }, + { + "epoch": 1.1096451319381255, + "grad_norm": 0.6905252242552301, + "learning_rate": 8.83409852295705e-06, + "loss": 0.0451, + "step": 2439 + }, + { + "epoch": 1.1101000909918108, + "grad_norm": 0.4932334291437054, + "learning_rate": 8.833180954719941e-06, + "loss": 0.023, + "step": 2440 + }, + { + "epoch": 1.110555050045496, + "grad_norm": 0.32570391119462067, + "learning_rate": 8.832263073252597e-06, + "loss": 0.0223, + "step": 2441 + }, + { + "epoch": 1.111010009099181, + "grad_norm": 0.5189620509513116, + "learning_rate": 8.831344878630022e-06, + "loss": 0.0345, + "step": 2442 + }, + { + "epoch": 1.1114649681528663, + "grad_norm": 0.35471915929013836, + "learning_rate": 8.830426370927246e-06, + "loss": 0.0178, + "step": 2443 + }, + { + "epoch": 1.1119199272065514, + "grad_norm": 0.4071867204646678, + "learning_rate": 8.829507550219323e-06, + "loss": 0.0187, + "step": 2444 + }, + { + "epoch": 1.1123748862602365, + "grad_norm": 0.5327053422443435, + "learning_rate": 8.828588416581338e-06, + "loss": 0.0321, + "step": 2445 + }, + { + "epoch": 1.1128298453139218, + "grad_norm": 0.4727447057361278, + "learning_rate": 8.827668970088397e-06, + "loss": 0.0256, + "step": 2446 + }, + { + "epoch": 1.113284804367607, + "grad_norm": 0.44344698021715867, + "learning_rate": 8.826749210815634e-06, + "loss": 0.0212, + "step": 2447 + }, + { + "epoch": 1.113739763421292, + "grad_norm": 0.48653354586078956, + "learning_rate": 8.825829138838206e-06, + "loss": 0.0252, + "step": 2448 + }, + { + "epoch": 1.1141947224749773, + "grad_norm": 0.4904789767614279, + "learning_rate": 8.824908754231299e-06, + "loss": 0.0219, + "step": 2449 + }, + { + "epoch": 1.1146496815286624, + "grad_norm": 0.5096306577344566, + "learning_rate": 8.823988057070122e-06, + "loss": 0.0269, + "step": 2450 + }, + { + "epoch": 1.1151046405823477, + "grad_norm": 0.4524604770972165, + "learning_rate": 8.823067047429908e-06, + "loss": 0.0197, + "step": 2451 + }, + { + "epoch": 1.1155595996360328, + "grad_norm": 0.6661762941224277, + "learning_rate": 8.82214572538592e-06, + "loss": 0.0432, + "step": 2452 + }, + { + "epoch": 1.1160145586897179, + "grad_norm": 0.45413808918893234, + "learning_rate": 8.821224091013445e-06, + "loss": 0.0252, + "step": 2453 + }, + { + "epoch": 1.1164695177434032, + "grad_norm": 0.4564359066247584, + "learning_rate": 8.820302144387794e-06, + "loss": 0.0305, + "step": 2454 + }, + { + "epoch": 1.1169244767970883, + "grad_norm": 0.5331752474098931, + "learning_rate": 8.819379885584303e-06, + "loss": 0.0285, + "step": 2455 + }, + { + "epoch": 1.1173794358507734, + "grad_norm": 0.8314482044455632, + "learning_rate": 8.818457314678336e-06, + "loss": 0.0474, + "step": 2456 + }, + { + "epoch": 1.1178343949044587, + "grad_norm": 0.5831509752587852, + "learning_rate": 8.817534431745283e-06, + "loss": 0.0204, + "step": 2457 + }, + { + "epoch": 1.1182893539581438, + "grad_norm": 0.42113991056064237, + "learning_rate": 8.816611236860554e-06, + "loss": 0.0207, + "step": 2458 + }, + { + "epoch": 1.1187443130118289, + "grad_norm": 0.5492674131587796, + "learning_rate": 8.815687730099594e-06, + "loss": 0.023, + "step": 2459 + }, + { + "epoch": 1.1191992720655142, + "grad_norm": 0.5627677712218775, + "learning_rate": 8.81476391153786e-06, + "loss": 0.0238, + "step": 2460 + }, + { + "epoch": 1.1196542311191993, + "grad_norm": 0.306412099822185, + "learning_rate": 8.813839781250848e-06, + "loss": 0.0136, + "step": 2461 + }, + { + "epoch": 1.1201091901728844, + "grad_norm": 0.4884139369729457, + "learning_rate": 8.812915339314073e-06, + "loss": 0.0325, + "step": 2462 + }, + { + "epoch": 1.1205641492265697, + "grad_norm": 0.6440331779678226, + "learning_rate": 8.811990585803074e-06, + "loss": 0.0462, + "step": 2463 + }, + { + "epoch": 1.1210191082802548, + "grad_norm": 0.6354635395644428, + "learning_rate": 8.81106552079342e-06, + "loss": 0.0326, + "step": 2464 + }, + { + "epoch": 1.1214740673339398, + "grad_norm": 0.4841057095746355, + "learning_rate": 8.810140144360701e-06, + "loss": 0.0288, + "step": 2465 + }, + { + "epoch": 1.1219290263876252, + "grad_norm": 0.7578064954916388, + "learning_rate": 8.809214456580539e-06, + "loss": 0.0444, + "step": 2466 + }, + { + "epoch": 1.1223839854413102, + "grad_norm": 0.36333027437030824, + "learning_rate": 8.80828845752857e-06, + "loss": 0.0166, + "step": 2467 + }, + { + "epoch": 1.1228389444949956, + "grad_norm": 1.0828419984965674, + "learning_rate": 8.80736214728047e-06, + "loss": 0.0509, + "step": 2468 + }, + { + "epoch": 1.1232939035486806, + "grad_norm": 0.41035853061268457, + "learning_rate": 8.806435525911927e-06, + "loss": 0.0152, + "step": 2469 + }, + { + "epoch": 1.1237488626023657, + "grad_norm": 0.48117366130842515, + "learning_rate": 8.805508593498662e-06, + "loss": 0.0358, + "step": 2470 + }, + { + "epoch": 1.124203821656051, + "grad_norm": 0.48865070302034325, + "learning_rate": 8.804581350116422e-06, + "loss": 0.0248, + "step": 2471 + }, + { + "epoch": 1.1246587807097361, + "grad_norm": 0.6166160347574816, + "learning_rate": 8.803653795840974e-06, + "loss": 0.0372, + "step": 2472 + }, + { + "epoch": 1.1251137397634212, + "grad_norm": 0.4235666133907878, + "learning_rate": 8.802725930748115e-06, + "loss": 0.0224, + "step": 2473 + }, + { + "epoch": 1.1255686988171065, + "grad_norm": 0.49371023402555386, + "learning_rate": 8.801797754913667e-06, + "loss": 0.0253, + "step": 2474 + }, + { + "epoch": 1.1260236578707916, + "grad_norm": 0.5375981231946215, + "learning_rate": 8.800869268413475e-06, + "loss": 0.0303, + "step": 2475 + }, + { + "epoch": 1.1264786169244767, + "grad_norm": 0.6200342528643785, + "learning_rate": 8.79994047132341e-06, + "loss": 0.0301, + "step": 2476 + }, + { + "epoch": 1.126933575978162, + "grad_norm": 0.7763567599332302, + "learning_rate": 8.79901136371937e-06, + "loss": 0.0367, + "step": 2477 + }, + { + "epoch": 1.127388535031847, + "grad_norm": 0.4168679527566863, + "learning_rate": 8.798081945677279e-06, + "loss": 0.0193, + "step": 2478 + }, + { + "epoch": 1.1278434940855324, + "grad_norm": 0.5499515478297102, + "learning_rate": 8.797152217273082e-06, + "loss": 0.0232, + "step": 2479 + }, + { + "epoch": 1.1282984531392175, + "grad_norm": 0.3629031290073349, + "learning_rate": 8.796222178582756e-06, + "loss": 0.0217, + "step": 2480 + }, + { + "epoch": 1.1287534121929026, + "grad_norm": 0.539897737827513, + "learning_rate": 8.795291829682293e-06, + "loss": 0.0272, + "step": 2481 + }, + { + "epoch": 1.129208371246588, + "grad_norm": 0.5636939303591514, + "learning_rate": 8.794361170647723e-06, + "loss": 0.0322, + "step": 2482 + }, + { + "epoch": 1.129663330300273, + "grad_norm": 0.6219815104303015, + "learning_rate": 8.793430201555095e-06, + "loss": 0.0274, + "step": 2483 + }, + { + "epoch": 1.130118289353958, + "grad_norm": 0.6542904517198702, + "learning_rate": 8.79249892248048e-06, + "loss": 0.0358, + "step": 2484 + }, + { + "epoch": 1.1305732484076434, + "grad_norm": 0.46666017679304383, + "learning_rate": 8.79156733349998e-06, + "loss": 0.0308, + "step": 2485 + }, + { + "epoch": 1.1310282074613285, + "grad_norm": 0.643787908195578, + "learning_rate": 8.790635434689722e-06, + "loss": 0.0325, + "step": 2486 + }, + { + "epoch": 1.1314831665150136, + "grad_norm": 0.6798497056398047, + "learning_rate": 8.789703226125853e-06, + "loss": 0.0388, + "step": 2487 + }, + { + "epoch": 1.1319381255686989, + "grad_norm": 0.45682700520723596, + "learning_rate": 8.78877070788455e-06, + "loss": 0.0248, + "step": 2488 + }, + { + "epoch": 1.132393084622384, + "grad_norm": 0.520494224107322, + "learning_rate": 8.787837880042016e-06, + "loss": 0.0251, + "step": 2489 + }, + { + "epoch": 1.132848043676069, + "grad_norm": 0.5608809735379154, + "learning_rate": 8.786904742674476e-06, + "loss": 0.0354, + "step": 2490 + }, + { + "epoch": 1.1333030027297544, + "grad_norm": 0.5383912877252518, + "learning_rate": 8.78597129585818e-06, + "loss": 0.0252, + "step": 2491 + }, + { + "epoch": 1.1337579617834395, + "grad_norm": 0.3952421850434973, + "learning_rate": 8.78503753966941e-06, + "loss": 0.0191, + "step": 2492 + }, + { + "epoch": 1.1342129208371245, + "grad_norm": 0.7660377240440205, + "learning_rate": 8.784103474184463e-06, + "loss": 0.0372, + "step": 2493 + }, + { + "epoch": 1.1346678798908099, + "grad_norm": 0.45419840808136375, + "learning_rate": 8.783169099479669e-06, + "loss": 0.0237, + "step": 2494 + }, + { + "epoch": 1.135122838944495, + "grad_norm": 0.6963944475868004, + "learning_rate": 8.782234415631381e-06, + "loss": 0.0402, + "step": 2495 + }, + { + "epoch": 1.1355777979981803, + "grad_norm": 0.43802475738162483, + "learning_rate": 8.781299422715979e-06, + "loss": 0.0238, + "step": 2496 + }, + { + "epoch": 1.1360327570518653, + "grad_norm": 0.6062845259672841, + "learning_rate": 8.780364120809863e-06, + "loss": 0.0299, + "step": 2497 + }, + { + "epoch": 1.1364877161055504, + "grad_norm": 0.44459971712814256, + "learning_rate": 8.779428509989463e-06, + "loss": 0.0205, + "step": 2498 + }, + { + "epoch": 1.1369426751592357, + "grad_norm": 0.8182256630287221, + "learning_rate": 8.778492590331234e-06, + "loss": 0.0358, + "step": 2499 + }, + { + "epoch": 1.1373976342129208, + "grad_norm": 0.35292113524041313, + "learning_rate": 8.777556361911652e-06, + "loss": 0.0188, + "step": 2500 + }, + { + "epoch": 1.137852593266606, + "grad_norm": 0.5495898839385301, + "learning_rate": 8.776619824807225e-06, + "loss": 0.0403, + "step": 2501 + }, + { + "epoch": 1.1383075523202912, + "grad_norm": 0.47715012261917683, + "learning_rate": 8.77568297909448e-06, + "loss": 0.0308, + "step": 2502 + }, + { + "epoch": 1.1387625113739763, + "grad_norm": 0.5057002315147829, + "learning_rate": 8.774745824849973e-06, + "loss": 0.0255, + "step": 2503 + }, + { + "epoch": 1.1392174704276614, + "grad_norm": 0.637445487028803, + "learning_rate": 8.773808362150284e-06, + "loss": 0.0441, + "step": 2504 + }, + { + "epoch": 1.1396724294813467, + "grad_norm": 0.46970000948757085, + "learning_rate": 8.772870591072016e-06, + "loss": 0.0203, + "step": 2505 + }, + { + "epoch": 1.1401273885350318, + "grad_norm": 0.48405940158780947, + "learning_rate": 8.771932511691805e-06, + "loss": 0.0248, + "step": 2506 + }, + { + "epoch": 1.1405823475887171, + "grad_norm": 0.5007699680851107, + "learning_rate": 8.7709941240863e-06, + "loss": 0.0299, + "step": 2507 + }, + { + "epoch": 1.1410373066424022, + "grad_norm": 0.47412512472759577, + "learning_rate": 8.770055428332187e-06, + "loss": 0.0289, + "step": 2508 + }, + { + "epoch": 1.1414922656960873, + "grad_norm": 0.6167640062421629, + "learning_rate": 8.769116424506168e-06, + "loss": 0.0308, + "step": 2509 + }, + { + "epoch": 1.1419472247497726, + "grad_norm": 0.39237316345479106, + "learning_rate": 8.768177112684976e-06, + "loss": 0.023, + "step": 2510 + }, + { + "epoch": 1.1424021838034577, + "grad_norm": 0.5186908295343413, + "learning_rate": 8.767237492945372e-06, + "loss": 0.0253, + "step": 2511 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.5056070603356543, + "learning_rate": 8.766297565364127e-06, + "loss": 0.0269, + "step": 2512 + }, + { + "epoch": 1.143312101910828, + "grad_norm": 0.572114404769031, + "learning_rate": 8.765357330018056e-06, + "loss": 0.04, + "step": 2513 + }, + { + "epoch": 1.1437670609645132, + "grad_norm": 0.5742667251635876, + "learning_rate": 8.764416786983987e-06, + "loss": 0.0341, + "step": 2514 + }, + { + "epoch": 1.1442220200181983, + "grad_norm": 0.7921946978016261, + "learning_rate": 8.763475936338778e-06, + "loss": 0.0297, + "step": 2515 + }, + { + "epoch": 1.1446769790718836, + "grad_norm": 0.5932003547457203, + "learning_rate": 8.762534778159313e-06, + "loss": 0.0329, + "step": 2516 + }, + { + "epoch": 1.1451319381255687, + "grad_norm": 0.4383972484081299, + "learning_rate": 8.761593312522496e-06, + "loss": 0.026, + "step": 2517 + }, + { + "epoch": 1.1455868971792538, + "grad_norm": 0.494406013971066, + "learning_rate": 8.76065153950526e-06, + "loss": 0.0252, + "step": 2518 + }, + { + "epoch": 1.146041856232939, + "grad_norm": 0.41600285124838154, + "learning_rate": 8.759709459184565e-06, + "loss": 0.03, + "step": 2519 + }, + { + "epoch": 1.1464968152866242, + "grad_norm": 0.7103449624996373, + "learning_rate": 8.758767071637391e-06, + "loss": 0.0293, + "step": 2520 + }, + { + "epoch": 1.1469517743403093, + "grad_norm": 0.7247596682387525, + "learning_rate": 8.757824376940748e-06, + "loss": 0.0534, + "step": 2521 + }, + { + "epoch": 1.1474067333939946, + "grad_norm": 0.5429066180348485, + "learning_rate": 8.756881375171664e-06, + "loss": 0.0366, + "step": 2522 + }, + { + "epoch": 1.1478616924476797, + "grad_norm": 0.5884373670939516, + "learning_rate": 8.755938066407201e-06, + "loss": 0.0335, + "step": 2523 + }, + { + "epoch": 1.148316651501365, + "grad_norm": 0.6156045708560577, + "learning_rate": 8.754994450724441e-06, + "loss": 0.0345, + "step": 2524 + }, + { + "epoch": 1.14877161055505, + "grad_norm": 0.5614699649040673, + "learning_rate": 8.754050528200493e-06, + "loss": 0.0329, + "step": 2525 + }, + { + "epoch": 1.1492265696087351, + "grad_norm": 0.6406021126928062, + "learning_rate": 8.753106298912488e-06, + "loss": 0.0306, + "step": 2526 + }, + { + "epoch": 1.1496815286624205, + "grad_norm": 0.5000438600163287, + "learning_rate": 8.752161762937586e-06, + "loss": 0.0223, + "step": 2527 + }, + { + "epoch": 1.1501364877161055, + "grad_norm": 0.3997197285041498, + "learning_rate": 8.751216920352967e-06, + "loss": 0.0221, + "step": 2528 + }, + { + "epoch": 1.1505914467697906, + "grad_norm": 0.5040179214810742, + "learning_rate": 8.750271771235844e-06, + "loss": 0.0196, + "step": 2529 + }, + { + "epoch": 1.151046405823476, + "grad_norm": 0.40549609696673644, + "learning_rate": 8.749326315663447e-06, + "loss": 0.0231, + "step": 2530 + }, + { + "epoch": 1.151501364877161, + "grad_norm": 0.406160230893779, + "learning_rate": 8.748380553713033e-06, + "loss": 0.0208, + "step": 2531 + }, + { + "epoch": 1.1519563239308463, + "grad_norm": 0.5844194685702613, + "learning_rate": 8.747434485461892e-06, + "loss": 0.0241, + "step": 2532 + }, + { + "epoch": 1.1524112829845314, + "grad_norm": 0.36029638509152084, + "learning_rate": 8.746488110987326e-06, + "loss": 0.015, + "step": 2533 + }, + { + "epoch": 1.1528662420382165, + "grad_norm": 0.7276197204807093, + "learning_rate": 8.745541430366671e-06, + "loss": 0.0418, + "step": 2534 + }, + { + "epoch": 1.1533212010919018, + "grad_norm": 1.5020467500828025, + "learning_rate": 8.744594443677284e-06, + "loss": 0.0582, + "step": 2535 + }, + { + "epoch": 1.153776160145587, + "grad_norm": 0.4311974728697227, + "learning_rate": 8.743647150996551e-06, + "loss": 0.0258, + "step": 2536 + }, + { + "epoch": 1.154231119199272, + "grad_norm": 0.6248463720530537, + "learning_rate": 8.742699552401878e-06, + "loss": 0.0398, + "step": 2537 + }, + { + "epoch": 1.1546860782529573, + "grad_norm": 0.5339944254155865, + "learning_rate": 8.7417516479707e-06, + "loss": 0.0252, + "step": 2538 + }, + { + "epoch": 1.1551410373066424, + "grad_norm": 0.3465118720450813, + "learning_rate": 8.740803437780474e-06, + "loss": 0.0183, + "step": 2539 + }, + { + "epoch": 1.1555959963603275, + "grad_norm": 0.6096918552154363, + "learning_rate": 8.739854921908684e-06, + "loss": 0.0318, + "step": 2540 + }, + { + "epoch": 1.1560509554140128, + "grad_norm": 0.42626286323793855, + "learning_rate": 8.73890610043284e-06, + "loss": 0.0292, + "step": 2541 + }, + { + "epoch": 1.156505914467698, + "grad_norm": 0.47325164391197866, + "learning_rate": 8.737956973430475e-06, + "loss": 0.0337, + "step": 2542 + }, + { + "epoch": 1.156960873521383, + "grad_norm": 0.6214186683671308, + "learning_rate": 8.737007540979146e-06, + "loss": 0.0235, + "step": 2543 + }, + { + "epoch": 1.1574158325750683, + "grad_norm": 0.4958886649213906, + "learning_rate": 8.736057803156436e-06, + "loss": 0.0255, + "step": 2544 + }, + { + "epoch": 1.1578707916287534, + "grad_norm": 0.3732620529932146, + "learning_rate": 8.735107760039954e-06, + "loss": 0.0197, + "step": 2545 + }, + { + "epoch": 1.1583257506824385, + "grad_norm": 0.5778213004705967, + "learning_rate": 8.734157411707334e-06, + "loss": 0.0277, + "step": 2546 + }, + { + "epoch": 1.1587807097361238, + "grad_norm": 0.4850677867721973, + "learning_rate": 8.733206758236235e-06, + "loss": 0.0235, + "step": 2547 + }, + { + "epoch": 1.1592356687898089, + "grad_norm": 0.5687049775983313, + "learning_rate": 8.732255799704337e-06, + "loss": 0.0335, + "step": 2548 + }, + { + "epoch": 1.159690627843494, + "grad_norm": 0.5063906062734673, + "learning_rate": 8.73130453618935e-06, + "loss": 0.0224, + "step": 2549 + }, + { + "epoch": 1.1601455868971793, + "grad_norm": 0.4830706957588217, + "learning_rate": 8.730352967769007e-06, + "loss": 0.026, + "step": 2550 + }, + { + "epoch": 1.1606005459508644, + "grad_norm": 0.4565903397736301, + "learning_rate": 8.729401094521066e-06, + "loss": 0.0171, + "step": 2551 + }, + { + "epoch": 1.1610555050045497, + "grad_norm": 0.5299141705331825, + "learning_rate": 8.728448916523309e-06, + "loss": 0.0283, + "step": 2552 + }, + { + "epoch": 1.1615104640582348, + "grad_norm": 0.5618467862878425, + "learning_rate": 8.727496433853543e-06, + "loss": 0.0289, + "step": 2553 + }, + { + "epoch": 1.1619654231119199, + "grad_norm": 0.464342731748468, + "learning_rate": 8.726543646589605e-06, + "loss": 0.0202, + "step": 2554 + }, + { + "epoch": 1.1624203821656052, + "grad_norm": 0.5984943035378484, + "learning_rate": 8.725590554809346e-06, + "loss": 0.0387, + "step": 2555 + }, + { + "epoch": 1.1628753412192903, + "grad_norm": 0.3103247899143151, + "learning_rate": 8.724637158590652e-06, + "loss": 0.0172, + "step": 2556 + }, + { + "epoch": 1.1633303002729753, + "grad_norm": 0.5719001232225214, + "learning_rate": 8.72368345801143e-06, + "loss": 0.0328, + "step": 2557 + }, + { + "epoch": 1.1637852593266607, + "grad_norm": 0.7184689253863656, + "learning_rate": 8.722729453149613e-06, + "loss": 0.0256, + "step": 2558 + }, + { + "epoch": 1.1642402183803457, + "grad_norm": 0.4264869300929295, + "learning_rate": 8.721775144083155e-06, + "loss": 0.0273, + "step": 2559 + }, + { + "epoch": 1.164695177434031, + "grad_norm": 0.6992959245688258, + "learning_rate": 8.72082053089004e-06, + "loss": 0.0391, + "step": 2560 + }, + { + "epoch": 1.1651501364877161, + "grad_norm": 0.5598830058244858, + "learning_rate": 8.719865613648276e-06, + "loss": 0.0348, + "step": 2561 + }, + { + "epoch": 1.1656050955414012, + "grad_norm": 0.4490293057873329, + "learning_rate": 8.718910392435892e-06, + "loss": 0.0185, + "step": 2562 + }, + { + "epoch": 1.1660600545950865, + "grad_norm": 0.3188239247752473, + "learning_rate": 8.717954867330943e-06, + "loss": 0.0118, + "step": 2563 + }, + { + "epoch": 1.1665150136487716, + "grad_norm": 0.529002754756549, + "learning_rate": 8.716999038411513e-06, + "loss": 0.0422, + "step": 2564 + }, + { + "epoch": 1.1669699727024567, + "grad_norm": 0.6102751055626958, + "learning_rate": 8.716042905755708e-06, + "loss": 0.0321, + "step": 2565 + }, + { + "epoch": 1.167424931756142, + "grad_norm": 0.4958464600211268, + "learning_rate": 8.715086469441659e-06, + "loss": 0.027, + "step": 2566 + }, + { + "epoch": 1.1678798908098271, + "grad_norm": 0.6925927485590572, + "learning_rate": 8.714129729547522e-06, + "loss": 0.0528, + "step": 2567 + }, + { + "epoch": 1.1683348498635122, + "grad_norm": 0.48346645004557054, + "learning_rate": 8.713172686151475e-06, + "loss": 0.0241, + "step": 2568 + }, + { + "epoch": 1.1687898089171975, + "grad_norm": 0.6160868757033329, + "learning_rate": 8.712215339331724e-06, + "loss": 0.0364, + "step": 2569 + }, + { + "epoch": 1.1692447679708826, + "grad_norm": 0.5521736841094272, + "learning_rate": 8.711257689166499e-06, + "loss": 0.0384, + "step": 2570 + }, + { + "epoch": 1.1696997270245677, + "grad_norm": 0.4358123533199606, + "learning_rate": 8.710299735734057e-06, + "loss": 0.0218, + "step": 2571 + }, + { + "epoch": 1.170154686078253, + "grad_norm": 0.49989161769199447, + "learning_rate": 8.709341479112676e-06, + "loss": 0.019, + "step": 2572 + }, + { + "epoch": 1.170609645131938, + "grad_norm": 0.6461070187412289, + "learning_rate": 8.70838291938066e-06, + "loss": 0.05, + "step": 2573 + }, + { + "epoch": 1.1710646041856232, + "grad_norm": 0.5015730644729591, + "learning_rate": 8.70742405661634e-06, + "loss": 0.0262, + "step": 2574 + }, + { + "epoch": 1.1715195632393085, + "grad_norm": 0.6731652049317264, + "learning_rate": 8.706464890898068e-06, + "loss": 0.0417, + "step": 2575 + }, + { + "epoch": 1.1719745222929936, + "grad_norm": 0.5953498514866105, + "learning_rate": 8.705505422304224e-06, + "loss": 0.0251, + "step": 2576 + }, + { + "epoch": 1.1724294813466787, + "grad_norm": 0.49337464142227694, + "learning_rate": 8.70454565091321e-06, + "loss": 0.0283, + "step": 2577 + }, + { + "epoch": 1.172884440400364, + "grad_norm": 0.40746621618427764, + "learning_rate": 8.703585576803455e-06, + "loss": 0.0235, + "step": 2578 + }, + { + "epoch": 1.173339399454049, + "grad_norm": 0.574388099759434, + "learning_rate": 8.702625200053412e-06, + "loss": 0.0357, + "step": 2579 + }, + { + "epoch": 1.1737943585077344, + "grad_norm": 0.49209063287204186, + "learning_rate": 8.701664520741558e-06, + "loss": 0.0271, + "step": 2580 + }, + { + "epoch": 1.1742493175614195, + "grad_norm": 0.49658769644628054, + "learning_rate": 8.700703538946396e-06, + "loss": 0.0312, + "step": 2581 + }, + { + "epoch": 1.1747042766151046, + "grad_norm": 0.48898735666034404, + "learning_rate": 8.699742254746452e-06, + "loss": 0.0308, + "step": 2582 + }, + { + "epoch": 1.1751592356687899, + "grad_norm": 0.6965571111870493, + "learning_rate": 8.698780668220281e-06, + "loss": 0.0587, + "step": 2583 + }, + { + "epoch": 1.175614194722475, + "grad_norm": 0.4680913844344663, + "learning_rate": 8.697818779446456e-06, + "loss": 0.0268, + "step": 2584 + }, + { + "epoch": 1.17606915377616, + "grad_norm": 0.5966094635320064, + "learning_rate": 8.696856588503582e-06, + "loss": 0.0441, + "step": 2585 + }, + { + "epoch": 1.1765241128298454, + "grad_norm": 0.41029105691286216, + "learning_rate": 8.69589409547028e-06, + "loss": 0.0238, + "step": 2586 + }, + { + "epoch": 1.1769790718835305, + "grad_norm": 0.4919555962191467, + "learning_rate": 8.694931300425204e-06, + "loss": 0.022, + "step": 2587 + }, + { + "epoch": 1.1774340309372158, + "grad_norm": 0.4941665993905159, + "learning_rate": 8.693968203447027e-06, + "loss": 0.0318, + "step": 2588 + }, + { + "epoch": 1.1778889899909009, + "grad_norm": 0.4471241857833498, + "learning_rate": 8.693004804614451e-06, + "loss": 0.0298, + "step": 2589 + }, + { + "epoch": 1.178343949044586, + "grad_norm": 0.42475689565329255, + "learning_rate": 8.692041104006201e-06, + "loss": 0.0245, + "step": 2590 + }, + { + "epoch": 1.1787989080982713, + "grad_norm": 0.7037247909228679, + "learning_rate": 8.691077101701024e-06, + "loss": 0.0422, + "step": 2591 + }, + { + "epoch": 1.1792538671519563, + "grad_norm": 0.4727292395507324, + "learning_rate": 8.690112797777695e-06, + "loss": 0.0286, + "step": 2592 + }, + { + "epoch": 1.1797088262056414, + "grad_norm": 0.4886187172760372, + "learning_rate": 8.689148192315013e-06, + "loss": 0.0253, + "step": 2593 + }, + { + "epoch": 1.1801637852593267, + "grad_norm": 0.4878895092851417, + "learning_rate": 8.6881832853918e-06, + "loss": 0.0294, + "step": 2594 + }, + { + "epoch": 1.1806187443130118, + "grad_norm": 0.3785632403936228, + "learning_rate": 8.687218077086905e-06, + "loss": 0.0262, + "step": 2595 + }, + { + "epoch": 1.181073703366697, + "grad_norm": 0.3032359273578328, + "learning_rate": 8.6862525674792e-06, + "loss": 0.0207, + "step": 2596 + }, + { + "epoch": 1.1815286624203822, + "grad_norm": 0.5805982565364416, + "learning_rate": 8.685286756647582e-06, + "loss": 0.0299, + "step": 2597 + }, + { + "epoch": 1.1819836214740673, + "grad_norm": 0.5312395563049912, + "learning_rate": 8.684320644670975e-06, + "loss": 0.0391, + "step": 2598 + }, + { + "epoch": 1.1824385805277524, + "grad_norm": 0.6427828501421616, + "learning_rate": 8.68335423162832e-06, + "loss": 0.0366, + "step": 2599 + }, + { + "epoch": 1.1828935395814377, + "grad_norm": 0.6549023820063344, + "learning_rate": 8.682387517598591e-06, + "loss": 0.0466, + "step": 2600 + }, + { + "epoch": 1.1833484986351228, + "grad_norm": 0.4191743788408071, + "learning_rate": 8.681420502660785e-06, + "loss": 0.0233, + "step": 2601 + }, + { + "epoch": 1.183803457688808, + "grad_norm": 0.4871715984486466, + "learning_rate": 8.68045318689392e-06, + "loss": 0.0271, + "step": 2602 + }, + { + "epoch": 1.1842584167424932, + "grad_norm": 0.6701976394432037, + "learning_rate": 8.679485570377043e-06, + "loss": 0.0306, + "step": 2603 + }, + { + "epoch": 1.1847133757961783, + "grad_norm": 0.6441120205935942, + "learning_rate": 8.678517653189222e-06, + "loss": 0.0394, + "step": 2604 + }, + { + "epoch": 1.1851683348498634, + "grad_norm": 0.5060858425158437, + "learning_rate": 8.677549435409548e-06, + "loss": 0.0217, + "step": 2605 + }, + { + "epoch": 1.1856232939035487, + "grad_norm": 0.6752485468046396, + "learning_rate": 8.676580917117144e-06, + "loss": 0.039, + "step": 2606 + }, + { + "epoch": 1.1860782529572338, + "grad_norm": 0.3957815075118571, + "learning_rate": 8.675612098391149e-06, + "loss": 0.0188, + "step": 2607 + }, + { + "epoch": 1.186533212010919, + "grad_norm": 0.5187116630942156, + "learning_rate": 8.674642979310732e-06, + "loss": 0.026, + "step": 2608 + }, + { + "epoch": 1.1869881710646042, + "grad_norm": 0.5769983660492354, + "learning_rate": 8.673673559955086e-06, + "loss": 0.0343, + "step": 2609 + }, + { + "epoch": 1.1874431301182893, + "grad_norm": 0.4743399882711679, + "learning_rate": 8.672703840403428e-06, + "loss": 0.0293, + "step": 2610 + }, + { + "epoch": 1.1878980891719746, + "grad_norm": 0.3693698002797069, + "learning_rate": 8.671733820734996e-06, + "loss": 0.0162, + "step": 2611 + }, + { + "epoch": 1.1883530482256597, + "grad_norm": 0.7143210340908582, + "learning_rate": 8.670763501029059e-06, + "loss": 0.0424, + "step": 2612 + }, + { + "epoch": 1.1888080072793448, + "grad_norm": 0.44099669973790273, + "learning_rate": 8.669792881364905e-06, + "loss": 0.0288, + "step": 2613 + }, + { + "epoch": 1.18926296633303, + "grad_norm": 0.47880134181841405, + "learning_rate": 8.668821961821848e-06, + "loss": 0.0356, + "step": 2614 + }, + { + "epoch": 1.1897179253867152, + "grad_norm": 0.49961852236193943, + "learning_rate": 8.66785074247923e-06, + "loss": 0.0264, + "step": 2615 + }, + { + "epoch": 1.1901728844404005, + "grad_norm": 0.6606861173434392, + "learning_rate": 8.666879223416413e-06, + "loss": 0.0402, + "step": 2616 + }, + { + "epoch": 1.1906278434940856, + "grad_norm": 0.5832250365729773, + "learning_rate": 8.665907404712786e-06, + "loss": 0.0349, + "step": 2617 + }, + { + "epoch": 1.1910828025477707, + "grad_norm": 0.47607736173413934, + "learning_rate": 8.66493528644776e-06, + "loss": 0.0275, + "step": 2618 + }, + { + "epoch": 1.191537761601456, + "grad_norm": 0.4323045066773957, + "learning_rate": 8.663962868700773e-06, + "loss": 0.0215, + "step": 2619 + }, + { + "epoch": 1.191992720655141, + "grad_norm": 0.6823901111258103, + "learning_rate": 8.662990151551288e-06, + "loss": 0.0367, + "step": 2620 + }, + { + "epoch": 1.1924476797088261, + "grad_norm": 0.568395741941641, + "learning_rate": 8.66201713507879e-06, + "loss": 0.0327, + "step": 2621 + }, + { + "epoch": 1.1929026387625115, + "grad_norm": 0.8032308375903047, + "learning_rate": 8.661043819362788e-06, + "loss": 0.0396, + "step": 2622 + }, + { + "epoch": 1.1933575978161965, + "grad_norm": 0.5352047847553939, + "learning_rate": 8.660070204482818e-06, + "loss": 0.0384, + "step": 2623 + }, + { + "epoch": 1.1938125568698816, + "grad_norm": 0.43266491785940075, + "learning_rate": 8.65909629051844e-06, + "loss": 0.0235, + "step": 2624 + }, + { + "epoch": 1.194267515923567, + "grad_norm": 0.5039359947320041, + "learning_rate": 8.658122077549239e-06, + "loss": 0.0332, + "step": 2625 + }, + { + "epoch": 1.194722474977252, + "grad_norm": 0.46282675009108876, + "learning_rate": 8.65714756565482e-06, + "loss": 0.028, + "step": 2626 + }, + { + "epoch": 1.1951774340309371, + "grad_norm": 0.42685254155176316, + "learning_rate": 8.656172754914818e-06, + "loss": 0.0193, + "step": 2627 + }, + { + "epoch": 1.1956323930846224, + "grad_norm": 0.5644652302861507, + "learning_rate": 8.655197645408889e-06, + "loss": 0.0327, + "step": 2628 + }, + { + "epoch": 1.1960873521383075, + "grad_norm": 0.6017102850762671, + "learning_rate": 8.654222237216714e-06, + "loss": 0.0395, + "step": 2629 + }, + { + "epoch": 1.1965423111919926, + "grad_norm": 0.4828717952370834, + "learning_rate": 8.653246530418003e-06, + "loss": 0.0296, + "step": 2630 + }, + { + "epoch": 1.196997270245678, + "grad_norm": 0.4718632798920294, + "learning_rate": 8.652270525092481e-06, + "loss": 0.0175, + "step": 2631 + }, + { + "epoch": 1.197452229299363, + "grad_norm": 0.9210566120370747, + "learning_rate": 8.651294221319907e-06, + "loss": 0.0532, + "step": 2632 + }, + { + "epoch": 1.197907188353048, + "grad_norm": 0.5973832244257986, + "learning_rate": 8.650317619180057e-06, + "loss": 0.0356, + "step": 2633 + }, + { + "epoch": 1.1983621474067334, + "grad_norm": 0.4056353546459655, + "learning_rate": 8.649340718752736e-06, + "loss": 0.0233, + "step": 2634 + }, + { + "epoch": 1.1988171064604185, + "grad_norm": 0.6383917144915527, + "learning_rate": 8.648363520117773e-06, + "loss": 0.0282, + "step": 2635 + }, + { + "epoch": 1.1992720655141038, + "grad_norm": 0.30187722032440356, + "learning_rate": 8.647386023355017e-06, + "loss": 0.015, + "step": 2636 + }, + { + "epoch": 1.199727024567789, + "grad_norm": 0.7620089776567717, + "learning_rate": 8.646408228544349e-06, + "loss": 0.0449, + "step": 2637 + }, + { + "epoch": 1.200181983621474, + "grad_norm": 0.7042927681153068, + "learning_rate": 8.645430135765667e-06, + "loss": 0.04, + "step": 2638 + }, + { + "epoch": 1.2006369426751593, + "grad_norm": 0.5117403840739881, + "learning_rate": 8.644451745098896e-06, + "loss": 0.0297, + "step": 2639 + }, + { + "epoch": 1.2010919017288444, + "grad_norm": 0.7659399394915278, + "learning_rate": 8.643473056623987e-06, + "loss": 0.0592, + "step": 2640 + }, + { + "epoch": 1.2015468607825295, + "grad_norm": 0.5678495394727697, + "learning_rate": 8.642494070420912e-06, + "loss": 0.032, + "step": 2641 + }, + { + "epoch": 1.2020018198362148, + "grad_norm": 0.4587046178873542, + "learning_rate": 8.641514786569674e-06, + "loss": 0.0273, + "step": 2642 + }, + { + "epoch": 1.2024567788898999, + "grad_norm": 0.5810971871142143, + "learning_rate": 8.640535205150291e-06, + "loss": 0.0436, + "step": 2643 + }, + { + "epoch": 1.2029117379435852, + "grad_norm": 0.49553783255896267, + "learning_rate": 8.639555326242812e-06, + "loss": 0.0375, + "step": 2644 + }, + { + "epoch": 1.2033666969972703, + "grad_norm": 0.700954373813157, + "learning_rate": 8.638575149927306e-06, + "loss": 0.0416, + "step": 2645 + }, + { + "epoch": 1.2038216560509554, + "grad_norm": 0.51916075076626, + "learning_rate": 8.637594676283872e-06, + "loss": 0.0301, + "step": 2646 + }, + { + "epoch": 1.2042766151046407, + "grad_norm": 0.5616014526557234, + "learning_rate": 8.636613905392628e-06, + "loss": 0.0333, + "step": 2647 + }, + { + "epoch": 1.2047315741583258, + "grad_norm": 0.3996003632999196, + "learning_rate": 8.635632837333719e-06, + "loss": 0.0203, + "step": 2648 + }, + { + "epoch": 1.2051865332120109, + "grad_norm": 0.5908400254903149, + "learning_rate": 8.634651472187312e-06, + "loss": 0.0355, + "step": 2649 + }, + { + "epoch": 1.2056414922656962, + "grad_norm": 0.5521857176836706, + "learning_rate": 8.633669810033601e-06, + "loss": 0.0302, + "step": 2650 + }, + { + "epoch": 1.2060964513193813, + "grad_norm": 0.47154629646415547, + "learning_rate": 8.632687850952803e-06, + "loss": 0.0254, + "step": 2651 + }, + { + "epoch": 1.2065514103730663, + "grad_norm": 0.5084600548265098, + "learning_rate": 8.63170559502516e-06, + "loss": 0.0263, + "step": 2652 + }, + { + "epoch": 1.2070063694267517, + "grad_norm": 0.41669809700741084, + "learning_rate": 8.630723042330934e-06, + "loss": 0.0235, + "step": 2653 + }, + { + "epoch": 1.2074613284804367, + "grad_norm": 0.4239984269262903, + "learning_rate": 8.629740192950418e-06, + "loss": 0.0258, + "step": 2654 + }, + { + "epoch": 1.2079162875341218, + "grad_norm": 0.5493755020180808, + "learning_rate": 8.628757046963925e-06, + "loss": 0.0312, + "step": 2655 + }, + { + "epoch": 1.2083712465878071, + "grad_norm": 0.44940260929025, + "learning_rate": 8.627773604451795e-06, + "loss": 0.0253, + "step": 2656 + }, + { + "epoch": 1.2088262056414922, + "grad_norm": 0.49748760446391493, + "learning_rate": 8.626789865494388e-06, + "loss": 0.029, + "step": 2657 + }, + { + "epoch": 1.2092811646951773, + "grad_norm": 0.4473696250717918, + "learning_rate": 8.62580583017209e-06, + "loss": 0.0265, + "step": 2658 + }, + { + "epoch": 1.2097361237488626, + "grad_norm": 0.634783340896908, + "learning_rate": 8.624821498565316e-06, + "loss": 0.0375, + "step": 2659 + }, + { + "epoch": 1.2101910828025477, + "grad_norm": 0.5688906906342468, + "learning_rate": 8.623836870754497e-06, + "loss": 0.0291, + "step": 2660 + }, + { + "epoch": 1.210646041856233, + "grad_norm": 0.524163167377845, + "learning_rate": 8.622851946820094e-06, + "loss": 0.0343, + "step": 2661 + }, + { + "epoch": 1.2111010009099181, + "grad_norm": 0.4184285347511745, + "learning_rate": 8.621866726842592e-06, + "loss": 0.0245, + "step": 2662 + }, + { + "epoch": 1.2115559599636032, + "grad_norm": 0.5452023193304021, + "learning_rate": 8.620881210902497e-06, + "loss": 0.0361, + "step": 2663 + }, + { + "epoch": 1.2120109190172885, + "grad_norm": 0.8825681885181793, + "learning_rate": 8.61989539908034e-06, + "loss": 0.0551, + "step": 2664 + }, + { + "epoch": 1.2124658780709736, + "grad_norm": 0.6606796283358398, + "learning_rate": 8.61890929145668e-06, + "loss": 0.0501, + "step": 2665 + }, + { + "epoch": 1.2129208371246587, + "grad_norm": 0.5383057502775304, + "learning_rate": 8.617922888112093e-06, + "loss": 0.0327, + "step": 2666 + }, + { + "epoch": 1.213375796178344, + "grad_norm": 0.456267646438963, + "learning_rate": 8.616936189127189e-06, + "loss": 0.0271, + "step": 2667 + }, + { + "epoch": 1.213830755232029, + "grad_norm": 0.6876820645690198, + "learning_rate": 8.615949194582591e-06, + "loss": 0.0522, + "step": 2668 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.4235510337955621, + "learning_rate": 8.614961904558956e-06, + "loss": 0.0178, + "step": 2669 + }, + { + "epoch": 1.2147406733393995, + "grad_norm": 0.31389612581359266, + "learning_rate": 8.613974319136959e-06, + "loss": 0.0142, + "step": 2670 + }, + { + "epoch": 1.2151956323930846, + "grad_norm": 0.5466534592913287, + "learning_rate": 8.6129864383973e-06, + "loss": 0.0325, + "step": 2671 + }, + { + "epoch": 1.21565059144677, + "grad_norm": 0.6256801141600264, + "learning_rate": 8.611998262420707e-06, + "loss": 0.031, + "step": 2672 + }, + { + "epoch": 1.216105550500455, + "grad_norm": 0.5060382153635896, + "learning_rate": 8.611009791287926e-06, + "loss": 0.0262, + "step": 2673 + }, + { + "epoch": 1.21656050955414, + "grad_norm": 0.5027235560302646, + "learning_rate": 8.610021025079734e-06, + "loss": 0.0254, + "step": 2674 + }, + { + "epoch": 1.2170154686078254, + "grad_norm": 0.5543017523957823, + "learning_rate": 8.609031963876924e-06, + "loss": 0.0308, + "step": 2675 + }, + { + "epoch": 1.2174704276615105, + "grad_norm": 0.4737161111249352, + "learning_rate": 8.608042607760322e-06, + "loss": 0.0326, + "step": 2676 + }, + { + "epoch": 1.2179253867151956, + "grad_norm": 0.4843464243684333, + "learning_rate": 8.607052956810772e-06, + "loss": 0.0258, + "step": 2677 + }, + { + "epoch": 1.2183803457688809, + "grad_norm": 0.5194322149503382, + "learning_rate": 8.606063011109143e-06, + "loss": 0.0358, + "step": 2678 + }, + { + "epoch": 1.218835304822566, + "grad_norm": 0.5930513493210321, + "learning_rate": 8.60507277073633e-06, + "loss": 0.0362, + "step": 2679 + }, + { + "epoch": 1.219290263876251, + "grad_norm": 0.32996053031100914, + "learning_rate": 8.604082235773249e-06, + "loss": 0.0131, + "step": 2680 + }, + { + "epoch": 1.2197452229299364, + "grad_norm": 0.4531032973363827, + "learning_rate": 8.603091406300845e-06, + "loss": 0.0264, + "step": 2681 + }, + { + "epoch": 1.2202001819836215, + "grad_norm": 0.4752447004618926, + "learning_rate": 8.602100282400082e-06, + "loss": 0.0222, + "step": 2682 + }, + { + "epoch": 1.2206551410373065, + "grad_norm": 0.48294135837077795, + "learning_rate": 8.60110886415195e-06, + "loss": 0.0286, + "step": 2683 + }, + { + "epoch": 1.2211101000909919, + "grad_norm": 0.8146460808068521, + "learning_rate": 8.600117151637465e-06, + "loss": 0.0553, + "step": 2684 + }, + { + "epoch": 1.221565059144677, + "grad_norm": 0.5348405988590901, + "learning_rate": 8.599125144937666e-06, + "loss": 0.0341, + "step": 2685 + }, + { + "epoch": 1.222020018198362, + "grad_norm": 0.5209228039836593, + "learning_rate": 8.598132844133614e-06, + "loss": 0.0285, + "step": 2686 + }, + { + "epoch": 1.2224749772520473, + "grad_norm": 0.8667405302686297, + "learning_rate": 8.597140249306393e-06, + "loss": 0.0554, + "step": 2687 + }, + { + "epoch": 1.2229299363057324, + "grad_norm": 0.3662245233762516, + "learning_rate": 8.596147360537115e-06, + "loss": 0.0186, + "step": 2688 + }, + { + "epoch": 1.2233848953594177, + "grad_norm": 0.5675330701823686, + "learning_rate": 8.595154177906915e-06, + "loss": 0.0252, + "step": 2689 + }, + { + "epoch": 1.2238398544131028, + "grad_norm": 0.5055412550341041, + "learning_rate": 8.594160701496951e-06, + "loss": 0.0359, + "step": 2690 + }, + { + "epoch": 1.224294813466788, + "grad_norm": 0.4636507359192646, + "learning_rate": 8.593166931388408e-06, + "loss": 0.0235, + "step": 2691 + }, + { + "epoch": 1.2247497725204732, + "grad_norm": 0.5789114485670152, + "learning_rate": 8.592172867662488e-06, + "loss": 0.0309, + "step": 2692 + }, + { + "epoch": 1.2252047315741583, + "grad_norm": 0.5362511549256743, + "learning_rate": 8.591178510400424e-06, + "loss": 0.0288, + "step": 2693 + }, + { + "epoch": 1.2256596906278434, + "grad_norm": 0.665176698679116, + "learning_rate": 8.590183859683469e-06, + "loss": 0.0381, + "step": 2694 + }, + { + "epoch": 1.2261146496815287, + "grad_norm": 0.5319510120853973, + "learning_rate": 8.589188915592903e-06, + "loss": 0.0359, + "step": 2695 + }, + { + "epoch": 1.2265696087352138, + "grad_norm": 0.4177494615666587, + "learning_rate": 8.588193678210026e-06, + "loss": 0.0194, + "step": 2696 + }, + { + "epoch": 1.2270245677888991, + "grad_norm": 0.34563423472616117, + "learning_rate": 8.587198147616166e-06, + "loss": 0.0188, + "step": 2697 + }, + { + "epoch": 1.2274795268425842, + "grad_norm": 0.5420023688259344, + "learning_rate": 8.586202323892675e-06, + "loss": 0.0322, + "step": 2698 + }, + { + "epoch": 1.2279344858962693, + "grad_norm": 0.5715046852040315, + "learning_rate": 8.585206207120925e-06, + "loss": 0.0248, + "step": 2699 + }, + { + "epoch": 1.2283894449499546, + "grad_norm": 0.6150293588585071, + "learning_rate": 8.584209797382313e-06, + "loss": 0.0349, + "step": 2700 + }, + { + "epoch": 1.2288444040036397, + "grad_norm": 0.7538546206140824, + "learning_rate": 8.583213094758262e-06, + "loss": 0.0415, + "step": 2701 + }, + { + "epoch": 1.2292993630573248, + "grad_norm": 0.41258699232239693, + "learning_rate": 8.582216099330218e-06, + "loss": 0.0252, + "step": 2702 + }, + { + "epoch": 1.22975432211101, + "grad_norm": 0.5992053934366026, + "learning_rate": 8.581218811179655e-06, + "loss": 0.0231, + "step": 2703 + }, + { + "epoch": 1.2302092811646952, + "grad_norm": 0.4911038111295034, + "learning_rate": 8.58022123038806e-06, + "loss": 0.0367, + "step": 2704 + }, + { + "epoch": 1.2306642402183803, + "grad_norm": 0.5415583441174247, + "learning_rate": 8.579223357036956e-06, + "loss": 0.0356, + "step": 2705 + }, + { + "epoch": 1.2311191992720656, + "grad_norm": 0.648050207407017, + "learning_rate": 8.578225191207881e-06, + "loss": 0.0322, + "step": 2706 + }, + { + "epoch": 1.2315741583257507, + "grad_norm": 0.6515223387873779, + "learning_rate": 8.577226732982405e-06, + "loss": 0.0424, + "step": 2707 + }, + { + "epoch": 1.2320291173794358, + "grad_norm": 0.7662318426027166, + "learning_rate": 8.576227982442114e-06, + "loss": 0.037, + "step": 2708 + }, + { + "epoch": 1.232484076433121, + "grad_norm": 0.4709920734770032, + "learning_rate": 8.575228939668623e-06, + "loss": 0.0284, + "step": 2709 + }, + { + "epoch": 1.2329390354868062, + "grad_norm": 0.7144313144730997, + "learning_rate": 8.574229604743566e-06, + "loss": 0.0316, + "step": 2710 + }, + { + "epoch": 1.2333939945404913, + "grad_norm": 0.4992331855484428, + "learning_rate": 8.573229977748609e-06, + "loss": 0.0345, + "step": 2711 + }, + { + "epoch": 1.2338489535941766, + "grad_norm": 0.6112686451914704, + "learning_rate": 8.572230058765434e-06, + "loss": 0.0358, + "step": 2712 + }, + { + "epoch": 1.2343039126478617, + "grad_norm": 0.8262726736467544, + "learning_rate": 8.571229847875751e-06, + "loss": 0.0641, + "step": 2713 + }, + { + "epoch": 1.2347588717015467, + "grad_norm": 0.4953827805427677, + "learning_rate": 8.570229345161293e-06, + "loss": 0.0247, + "step": 2714 + }, + { + "epoch": 1.235213830755232, + "grad_norm": 0.3801656553630412, + "learning_rate": 8.569228550703815e-06, + "loss": 0.0249, + "step": 2715 + }, + { + "epoch": 1.2356687898089171, + "grad_norm": 0.49612613452863535, + "learning_rate": 8.568227464585099e-06, + "loss": 0.0277, + "step": 2716 + }, + { + "epoch": 1.2361237488626025, + "grad_norm": 0.4582666835548743, + "learning_rate": 8.567226086886948e-06, + "loss": 0.0262, + "step": 2717 + }, + { + "epoch": 1.2365787079162875, + "grad_norm": 0.6697552955443566, + "learning_rate": 8.566224417691191e-06, + "loss": 0.0338, + "step": 2718 + }, + { + "epoch": 1.2370336669699726, + "grad_norm": 0.8001154357445661, + "learning_rate": 8.565222457079679e-06, + "loss": 0.0685, + "step": 2719 + }, + { + "epoch": 1.237488626023658, + "grad_norm": 0.4454996360487464, + "learning_rate": 8.56422020513429e-06, + "loss": 0.0233, + "step": 2720 + }, + { + "epoch": 1.237943585077343, + "grad_norm": 0.42231887554095254, + "learning_rate": 8.56321766193692e-06, + "loss": 0.0247, + "step": 2721 + }, + { + "epoch": 1.2383985441310281, + "grad_norm": 0.49520892835841024, + "learning_rate": 8.562214827569495e-06, + "loss": 0.0198, + "step": 2722 + }, + { + "epoch": 1.2388535031847134, + "grad_norm": 0.3119762559086726, + "learning_rate": 8.56121170211396e-06, + "loss": 0.0183, + "step": 2723 + }, + { + "epoch": 1.2393084622383985, + "grad_norm": 0.48127588980662994, + "learning_rate": 8.560208285652287e-06, + "loss": 0.0348, + "step": 2724 + }, + { + "epoch": 1.2397634212920838, + "grad_norm": 0.975980592939099, + "learning_rate": 8.559204578266471e-06, + "loss": 0.0712, + "step": 2725 + }, + { + "epoch": 1.240218380345769, + "grad_norm": 0.4739910877413602, + "learning_rate": 8.55820058003853e-06, + "loss": 0.027, + "step": 2726 + }, + { + "epoch": 1.240673339399454, + "grad_norm": 0.5358172750361924, + "learning_rate": 8.557196291050506e-06, + "loss": 0.0403, + "step": 2727 + }, + { + "epoch": 1.2411282984531393, + "grad_norm": 0.49464890318884047, + "learning_rate": 8.556191711384466e-06, + "loss": 0.0336, + "step": 2728 + }, + { + "epoch": 1.2415832575068244, + "grad_norm": 0.4046597291390638, + "learning_rate": 8.555186841122498e-06, + "loss": 0.024, + "step": 2729 + }, + { + "epoch": 1.2420382165605095, + "grad_norm": 0.656706108193742, + "learning_rate": 8.554181680346717e-06, + "loss": 0.0348, + "step": 2730 + }, + { + "epoch": 1.2424931756141948, + "grad_norm": 0.49134341156698247, + "learning_rate": 8.553176229139262e-06, + "loss": 0.033, + "step": 2731 + }, + { + "epoch": 1.24294813466788, + "grad_norm": 0.3673616941332998, + "learning_rate": 8.552170487582287e-06, + "loss": 0.0233, + "step": 2732 + }, + { + "epoch": 1.243403093721565, + "grad_norm": 0.3845834813421107, + "learning_rate": 8.551164455757985e-06, + "loss": 0.021, + "step": 2733 + }, + { + "epoch": 1.2438580527752503, + "grad_norm": 0.4219248857316413, + "learning_rate": 8.550158133748559e-06, + "loss": 0.0232, + "step": 2734 + }, + { + "epoch": 1.2443130118289354, + "grad_norm": 0.5359384657995739, + "learning_rate": 8.549151521636244e-06, + "loss": 0.0426, + "step": 2735 + }, + { + "epoch": 1.2447679708826205, + "grad_norm": 0.6147117803498731, + "learning_rate": 8.548144619503291e-06, + "loss": 0.0372, + "step": 2736 + }, + { + "epoch": 1.2452229299363058, + "grad_norm": 0.7816013628144164, + "learning_rate": 8.547137427431986e-06, + "loss": 0.0509, + "step": 2737 + }, + { + "epoch": 1.2456778889899909, + "grad_norm": 0.5732293106945054, + "learning_rate": 8.546129945504629e-06, + "loss": 0.0404, + "step": 2738 + }, + { + "epoch": 1.246132848043676, + "grad_norm": 0.5878496377747829, + "learning_rate": 8.545122173803547e-06, + "loss": 0.0349, + "step": 2739 + }, + { + "epoch": 1.2465878070973613, + "grad_norm": 0.5178543900697522, + "learning_rate": 8.544114112411088e-06, + "loss": 0.0317, + "step": 2740 + }, + { + "epoch": 1.2470427661510464, + "grad_norm": 0.44475184485600816, + "learning_rate": 8.54310576140963e-06, + "loss": 0.0246, + "step": 2741 + }, + { + "epoch": 1.2474977252047315, + "grad_norm": 0.41811991583751146, + "learning_rate": 8.542097120881572e-06, + "loss": 0.0264, + "step": 2742 + }, + { + "epoch": 1.2479526842584168, + "grad_norm": 0.504603909447871, + "learning_rate": 8.541088190909333e-06, + "loss": 0.037, + "step": 2743 + }, + { + "epoch": 1.2484076433121019, + "grad_norm": 0.5546565546187008, + "learning_rate": 8.540078971575355e-06, + "loss": 0.0321, + "step": 2744 + }, + { + "epoch": 1.2488626023657872, + "grad_norm": 0.5988533107048205, + "learning_rate": 8.539069462962115e-06, + "loss": 0.0356, + "step": 2745 + }, + { + "epoch": 1.2493175614194723, + "grad_norm": 0.5355497681868633, + "learning_rate": 8.538059665152097e-06, + "loss": 0.0219, + "step": 2746 + }, + { + "epoch": 1.2497725204731573, + "grad_norm": 0.5560216189929246, + "learning_rate": 8.537049578227823e-06, + "loss": 0.0318, + "step": 2747 + }, + { + "epoch": 1.2502274795268427, + "grad_norm": 0.41791265535852423, + "learning_rate": 8.536039202271828e-06, + "loss": 0.0296, + "step": 2748 + }, + { + "epoch": 1.2506824385805277, + "grad_norm": 0.6230283621476296, + "learning_rate": 8.53502853736668e-06, + "loss": 0.0229, + "step": 2749 + }, + { + "epoch": 1.251137397634213, + "grad_norm": 0.5883015192363978, + "learning_rate": 8.534017583594965e-06, + "loss": 0.0454, + "step": 2750 + }, + { + "epoch": 1.2515923566878981, + "grad_norm": 0.5657093936113446, + "learning_rate": 8.53300634103929e-06, + "loss": 0.0328, + "step": 2751 + }, + { + "epoch": 1.2520473157415832, + "grad_norm": 0.9286848475357391, + "learning_rate": 8.531994809782294e-06, + "loss": 0.0651, + "step": 2752 + }, + { + "epoch": 1.2525022747952685, + "grad_norm": 0.5306254596544426, + "learning_rate": 8.530982989906632e-06, + "loss": 0.0264, + "step": 2753 + }, + { + "epoch": 1.2529572338489536, + "grad_norm": 0.599793814100533, + "learning_rate": 8.529970881494985e-06, + "loss": 0.038, + "step": 2754 + }, + { + "epoch": 1.2534121929026387, + "grad_norm": 0.4592924108716034, + "learning_rate": 8.52895848463006e-06, + "loss": 0.0253, + "step": 2755 + }, + { + "epoch": 1.253867151956324, + "grad_norm": 0.5025180855718538, + "learning_rate": 8.527945799394584e-06, + "loss": 0.0269, + "step": 2756 + }, + { + "epoch": 1.2543221110100091, + "grad_norm": 0.3690223518853051, + "learning_rate": 8.526932825871308e-06, + "loss": 0.0214, + "step": 2757 + }, + { + "epoch": 1.2547770700636942, + "grad_norm": 0.38161446652737785, + "learning_rate": 8.52591956414301e-06, + "loss": 0.0166, + "step": 2758 + }, + { + "epoch": 1.2552320291173795, + "grad_norm": 0.611622699149414, + "learning_rate": 8.524906014292488e-06, + "loss": 0.0412, + "step": 2759 + }, + { + "epoch": 1.2556869881710646, + "grad_norm": 0.4022077081421061, + "learning_rate": 8.523892176402565e-06, + "loss": 0.0234, + "step": 2760 + }, + { + "epoch": 1.2561419472247497, + "grad_norm": 0.4085009912666225, + "learning_rate": 8.522878050556087e-06, + "loss": 0.0271, + "step": 2761 + }, + { + "epoch": 1.256596906278435, + "grad_norm": 0.591494783456256, + "learning_rate": 8.521863636835924e-06, + "loss": 0.0288, + "step": 2762 + }, + { + "epoch": 1.25705186533212, + "grad_norm": 0.4315940956441906, + "learning_rate": 8.520848935324968e-06, + "loss": 0.0257, + "step": 2763 + }, + { + "epoch": 1.2575068243858052, + "grad_norm": 0.4623767141710468, + "learning_rate": 8.519833946106139e-06, + "loss": 0.0293, + "step": 2764 + }, + { + "epoch": 1.2579617834394905, + "grad_norm": 0.5965051882391731, + "learning_rate": 8.518818669262373e-06, + "loss": 0.0367, + "step": 2765 + }, + { + "epoch": 1.2584167424931756, + "grad_norm": 0.5441954958905808, + "learning_rate": 8.517803104876638e-06, + "loss": 0.0314, + "step": 2766 + }, + { + "epoch": 1.2588717015468607, + "grad_norm": 0.5077782576820083, + "learning_rate": 8.51678725303192e-06, + "loss": 0.0261, + "step": 2767 + }, + { + "epoch": 1.259326660600546, + "grad_norm": 0.6376855259836618, + "learning_rate": 8.515771113811226e-06, + "loss": 0.0409, + "step": 2768 + }, + { + "epoch": 1.259781619654231, + "grad_norm": 6.915760462322178, + "learning_rate": 8.514754687297598e-06, + "loss": 0.1986, + "step": 2769 + }, + { + "epoch": 1.2602365787079162, + "grad_norm": 0.5889806379105973, + "learning_rate": 8.513737973574088e-06, + "loss": 0.0336, + "step": 2770 + }, + { + "epoch": 1.2606915377616015, + "grad_norm": 0.5275667357404193, + "learning_rate": 8.512720972723779e-06, + "loss": 0.0289, + "step": 2771 + }, + { + "epoch": 1.2611464968152866, + "grad_norm": 0.3147286633021264, + "learning_rate": 8.511703684829773e-06, + "loss": 0.0163, + "step": 2772 + }, + { + "epoch": 1.2616014558689717, + "grad_norm": 0.8013976464224811, + "learning_rate": 8.510686109975202e-06, + "loss": 0.0468, + "step": 2773 + }, + { + "epoch": 1.262056414922657, + "grad_norm": 0.4994061588441834, + "learning_rate": 8.509668248243217e-06, + "loss": 0.02, + "step": 2774 + }, + { + "epoch": 1.262511373976342, + "grad_norm": 0.5749302842677763, + "learning_rate": 8.508650099716991e-06, + "loss": 0.0362, + "step": 2775 + }, + { + "epoch": 1.2629663330300274, + "grad_norm": 0.40312955247530624, + "learning_rate": 8.507631664479725e-06, + "loss": 0.0229, + "step": 2776 + }, + { + "epoch": 1.2634212920837125, + "grad_norm": 0.4196449929909409, + "learning_rate": 8.506612942614639e-06, + "loss": 0.0195, + "step": 2777 + }, + { + "epoch": 1.2638762511373978, + "grad_norm": 0.4933487936946509, + "learning_rate": 8.505593934204978e-06, + "loss": 0.0325, + "step": 2778 + }, + { + "epoch": 1.2643312101910829, + "grad_norm": 0.5074169183177683, + "learning_rate": 8.504574639334013e-06, + "loss": 0.0302, + "step": 2779 + }, + { + "epoch": 1.264786169244768, + "grad_norm": 0.5508963628460979, + "learning_rate": 8.503555058085035e-06, + "loss": 0.0215, + "step": 2780 + }, + { + "epoch": 1.2652411282984533, + "grad_norm": 0.462497042914889, + "learning_rate": 8.502535190541362e-06, + "loss": 0.0249, + "step": 2781 + }, + { + "epoch": 1.2656960873521383, + "grad_norm": 0.8409491409991778, + "learning_rate": 8.501515036786327e-06, + "loss": 0.0464, + "step": 2782 + }, + { + "epoch": 1.2661510464058234, + "grad_norm": 0.5198784420945559, + "learning_rate": 8.500494596903298e-06, + "loss": 0.0337, + "step": 2783 + }, + { + "epoch": 1.2666060054595087, + "grad_norm": 0.45309440991920896, + "learning_rate": 8.499473870975657e-06, + "loss": 0.0253, + "step": 2784 + }, + { + "epoch": 1.2670609645131938, + "grad_norm": 0.49216033758494676, + "learning_rate": 8.498452859086816e-06, + "loss": 0.0287, + "step": 2785 + }, + { + "epoch": 1.267515923566879, + "grad_norm": 0.6649300772430892, + "learning_rate": 8.497431561320204e-06, + "loss": 0.0493, + "step": 2786 + }, + { + "epoch": 1.2679708826205642, + "grad_norm": 0.6238882043590698, + "learning_rate": 8.496409977759281e-06, + "loss": 0.0358, + "step": 2787 + }, + { + "epoch": 1.2684258416742493, + "grad_norm": 0.3865373802086272, + "learning_rate": 8.495388108487525e-06, + "loss": 0.0197, + "step": 2788 + }, + { + "epoch": 1.2688808007279344, + "grad_norm": 0.4201281559699742, + "learning_rate": 8.494365953588435e-06, + "loss": 0.0222, + "step": 2789 + }, + { + "epoch": 1.2693357597816197, + "grad_norm": 0.6045328676429113, + "learning_rate": 8.493343513145543e-06, + "loss": 0.0402, + "step": 2790 + }, + { + "epoch": 1.2697907188353048, + "grad_norm": 0.5549299619854191, + "learning_rate": 8.492320787242394e-06, + "loss": 0.0323, + "step": 2791 + }, + { + "epoch": 1.27024567788899, + "grad_norm": 0.4475795587111154, + "learning_rate": 8.491297775962561e-06, + "loss": 0.0235, + "step": 2792 + }, + { + "epoch": 1.2707006369426752, + "grad_norm": 0.7036026073161589, + "learning_rate": 8.49027447938964e-06, + "loss": 0.0433, + "step": 2793 + }, + { + "epoch": 1.2711555959963603, + "grad_norm": 0.6587509536956878, + "learning_rate": 8.48925089760725e-06, + "loss": 0.0381, + "step": 2794 + }, + { + "epoch": 1.2716105550500454, + "grad_norm": 0.5314238167851097, + "learning_rate": 8.488227030699034e-06, + "loss": 0.0347, + "step": 2795 + }, + { + "epoch": 1.2720655141037307, + "grad_norm": 0.40385254147988475, + "learning_rate": 8.487202878748659e-06, + "loss": 0.0153, + "step": 2796 + }, + { + "epoch": 1.2725204731574158, + "grad_norm": 0.4187276708435959, + "learning_rate": 8.486178441839812e-06, + "loss": 0.028, + "step": 2797 + }, + { + "epoch": 1.2729754322111009, + "grad_norm": 0.5320879394729393, + "learning_rate": 8.485153720056206e-06, + "loss": 0.0282, + "step": 2798 + }, + { + "epoch": 1.2734303912647862, + "grad_norm": 0.5626707275839432, + "learning_rate": 8.484128713481578e-06, + "loss": 0.0265, + "step": 2799 + }, + { + "epoch": 1.2738853503184713, + "grad_norm": 0.49217175534274415, + "learning_rate": 8.483103422199683e-06, + "loss": 0.024, + "step": 2800 + }, + { + "epoch": 1.2743403093721566, + "grad_norm": 1.7488430942036035, + "learning_rate": 8.48207784629431e-06, + "loss": 0.0627, + "step": 2801 + }, + { + "epoch": 1.2747952684258417, + "grad_norm": 0.5648200964805751, + "learning_rate": 8.481051985849259e-06, + "loss": 0.0276, + "step": 2802 + }, + { + "epoch": 1.2752502274795268, + "grad_norm": 0.5556231021278181, + "learning_rate": 8.480025840948357e-06, + "loss": 0.034, + "step": 2803 + }, + { + "epoch": 1.275705186533212, + "grad_norm": 0.6476049581350152, + "learning_rate": 8.478999411675461e-06, + "loss": 0.0375, + "step": 2804 + }, + { + "epoch": 1.2761601455868972, + "grad_norm": 0.5124346469861034, + "learning_rate": 8.477972698114446e-06, + "loss": 0.0272, + "step": 2805 + }, + { + "epoch": 1.2766151046405825, + "grad_norm": 0.6717078615438046, + "learning_rate": 8.476945700349206e-06, + "loss": 0.0375, + "step": 2806 + }, + { + "epoch": 1.2770700636942676, + "grad_norm": 0.6030848555640039, + "learning_rate": 8.475918418463665e-06, + "loss": 0.0358, + "step": 2807 + }, + { + "epoch": 1.2775250227479527, + "grad_norm": 0.7825586786731197, + "learning_rate": 8.474890852541768e-06, + "loss": 0.0522, + "step": 2808 + }, + { + "epoch": 1.277979981801638, + "grad_norm": 0.6153861535254155, + "learning_rate": 8.473863002667484e-06, + "loss": 0.0445, + "step": 2809 + }, + { + "epoch": 1.278434940855323, + "grad_norm": 0.5282220213793622, + "learning_rate": 8.472834868924803e-06, + "loss": 0.0481, + "step": 2810 + }, + { + "epoch": 1.2788898999090081, + "grad_norm": 0.5421116083773543, + "learning_rate": 8.47180645139774e-06, + "loss": 0.0344, + "step": 2811 + }, + { + "epoch": 1.2793448589626935, + "grad_norm": 0.6791388111543428, + "learning_rate": 8.470777750170331e-06, + "loss": 0.0435, + "step": 2812 + }, + { + "epoch": 1.2797998180163785, + "grad_norm": 0.5207371822148394, + "learning_rate": 8.469748765326639e-06, + "loss": 0.0322, + "step": 2813 + }, + { + "epoch": 1.2802547770700636, + "grad_norm": 0.45374031536978576, + "learning_rate": 8.468719496950748e-06, + "loss": 0.0202, + "step": 2814 + }, + { + "epoch": 1.280709736123749, + "grad_norm": 0.6220210270574482, + "learning_rate": 8.467689945126764e-06, + "loss": 0.0389, + "step": 2815 + }, + { + "epoch": 1.281164695177434, + "grad_norm": 0.44694663225442227, + "learning_rate": 8.466660109938817e-06, + "loss": 0.0277, + "step": 2816 + }, + { + "epoch": 1.2816196542311191, + "grad_norm": 0.45764799454231664, + "learning_rate": 8.46562999147106e-06, + "loss": 0.0214, + "step": 2817 + }, + { + "epoch": 1.2820746132848044, + "grad_norm": 0.46665783231694, + "learning_rate": 8.464599589807673e-06, + "loss": 0.0328, + "step": 2818 + }, + { + "epoch": 1.2825295723384895, + "grad_norm": 0.5791262823156262, + "learning_rate": 8.463568905032853e-06, + "loss": 0.0315, + "step": 2819 + }, + { + "epoch": 1.2829845313921746, + "grad_norm": 0.5688402724299256, + "learning_rate": 8.462537937230823e-06, + "loss": 0.0341, + "step": 2820 + }, + { + "epoch": 1.28343949044586, + "grad_norm": 0.509413677148141, + "learning_rate": 8.46150668648583e-06, + "loss": 0.0269, + "step": 2821 + }, + { + "epoch": 1.283894449499545, + "grad_norm": 0.4653446182519699, + "learning_rate": 8.460475152882142e-06, + "loss": 0.0283, + "step": 2822 + }, + { + "epoch": 1.28434940855323, + "grad_norm": 0.3972245449089319, + "learning_rate": 8.459443336504052e-06, + "loss": 0.025, + "step": 2823 + }, + { + "epoch": 1.2848043676069154, + "grad_norm": 0.7353018545774426, + "learning_rate": 8.458411237435875e-06, + "loss": 0.0404, + "step": 2824 + }, + { + "epoch": 1.2852593266606005, + "grad_norm": 0.6368251484118869, + "learning_rate": 8.45737885576195e-06, + "loss": 0.0304, + "step": 2825 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.41997329370972175, + "learning_rate": 8.456346191566638e-06, + "loss": 0.0198, + "step": 2826 + }, + { + "epoch": 1.286169244767971, + "grad_norm": 0.6490280661806181, + "learning_rate": 8.455313244934324e-06, + "loss": 0.0421, + "step": 2827 + }, + { + "epoch": 1.286624203821656, + "grad_norm": 0.5889007709220472, + "learning_rate": 8.454280015949417e-06, + "loss": 0.0284, + "step": 2828 + }, + { + "epoch": 1.2870791628753413, + "grad_norm": 0.5451280417854998, + "learning_rate": 8.453246504696345e-06, + "loss": 0.0226, + "step": 2829 + }, + { + "epoch": 1.2875341219290264, + "grad_norm": 0.4320016505130762, + "learning_rate": 8.452212711259562e-06, + "loss": 0.0248, + "step": 2830 + }, + { + "epoch": 1.2879890809827115, + "grad_norm": 0.639460311213598, + "learning_rate": 8.45117863572355e-06, + "loss": 0.0358, + "step": 2831 + }, + { + "epoch": 1.2884440400363968, + "grad_norm": 0.4572192713269325, + "learning_rate": 8.450144278172802e-06, + "loss": 0.0219, + "step": 2832 + }, + { + "epoch": 1.2888989990900819, + "grad_norm": 0.6056405312996962, + "learning_rate": 8.449109638691846e-06, + "loss": 0.0404, + "step": 2833 + }, + { + "epoch": 1.2893539581437672, + "grad_norm": 0.6311739331027462, + "learning_rate": 8.448074717365227e-06, + "loss": 0.0462, + "step": 2834 + }, + { + "epoch": 1.2898089171974523, + "grad_norm": 0.3815545778311228, + "learning_rate": 8.447039514277511e-06, + "loss": 0.0218, + "step": 2835 + }, + { + "epoch": 1.2902638762511374, + "grad_norm": 0.3978060829542109, + "learning_rate": 8.446004029513294e-06, + "loss": 0.0178, + "step": 2836 + }, + { + "epoch": 1.2907188353048227, + "grad_norm": 0.5275681469990177, + "learning_rate": 8.44496826315719e-06, + "loss": 0.0328, + "step": 2837 + }, + { + "epoch": 1.2911737943585078, + "grad_norm": 0.39047958443652486, + "learning_rate": 8.443932215293837e-06, + "loss": 0.0185, + "step": 2838 + }, + { + "epoch": 1.2916287534121929, + "grad_norm": 0.3956886222652548, + "learning_rate": 8.442895886007894e-06, + "loss": 0.0198, + "step": 2839 + }, + { + "epoch": 1.2920837124658782, + "grad_norm": 0.46209991899892167, + "learning_rate": 8.441859275384051e-06, + "loss": 0.0312, + "step": 2840 + }, + { + "epoch": 1.2925386715195633, + "grad_norm": 0.46877419670667825, + "learning_rate": 8.440822383507009e-06, + "loss": 0.0291, + "step": 2841 + }, + { + "epoch": 1.2929936305732483, + "grad_norm": 0.45005958214248154, + "learning_rate": 8.4397852104615e-06, + "loss": 0.0237, + "step": 2842 + }, + { + "epoch": 1.2934485896269337, + "grad_norm": 0.45818206860330574, + "learning_rate": 8.438747756332278e-06, + "loss": 0.031, + "step": 2843 + }, + { + "epoch": 1.2939035486806187, + "grad_norm": 0.6372108995312181, + "learning_rate": 8.43771002120412e-06, + "loss": 0.043, + "step": 2844 + }, + { + "epoch": 1.2943585077343038, + "grad_norm": 0.7913133192237457, + "learning_rate": 8.43667200516182e-06, + "loss": 0.0322, + "step": 2845 + }, + { + "epoch": 1.2948134667879891, + "grad_norm": 0.7449168452669471, + "learning_rate": 8.435633708290205e-06, + "loss": 0.0668, + "step": 2846 + }, + { + "epoch": 1.2952684258416742, + "grad_norm": 0.5449759524419574, + "learning_rate": 8.434595130674121e-06, + "loss": 0.0287, + "step": 2847 + }, + { + "epoch": 1.2957233848953593, + "grad_norm": 0.6854051716023769, + "learning_rate": 8.433556272398431e-06, + "loss": 0.04, + "step": 2848 + }, + { + "epoch": 1.2961783439490446, + "grad_norm": 0.5167363871048377, + "learning_rate": 8.43251713354803e-06, + "loss": 0.0306, + "step": 2849 + }, + { + "epoch": 1.2966333030027297, + "grad_norm": 0.5049406724574115, + "learning_rate": 8.43147771420783e-06, + "loss": 0.03, + "step": 2850 + }, + { + "epoch": 1.2970882620564148, + "grad_norm": 0.47114278348228805, + "learning_rate": 8.430438014462764e-06, + "loss": 0.0295, + "step": 2851 + }, + { + "epoch": 1.2975432211101001, + "grad_norm": 0.7186330165104124, + "learning_rate": 8.429398034397798e-06, + "loss": 0.0387, + "step": 2852 + }, + { + "epoch": 1.2979981801637852, + "grad_norm": 0.599939271911953, + "learning_rate": 8.428357774097913e-06, + "loss": 0.0311, + "step": 2853 + }, + { + "epoch": 1.2984531392174703, + "grad_norm": 0.5994585274722956, + "learning_rate": 8.42731723364811e-06, + "loss": 0.0391, + "step": 2854 + }, + { + "epoch": 1.2989080982711556, + "grad_norm": 0.5657660047246607, + "learning_rate": 8.426276413133422e-06, + "loss": 0.0301, + "step": 2855 + }, + { + "epoch": 1.2993630573248407, + "grad_norm": 0.4190238792112018, + "learning_rate": 8.4252353126389e-06, + "loss": 0.0245, + "step": 2856 + }, + { + "epoch": 1.299818016378526, + "grad_norm": 0.6420578682403528, + "learning_rate": 8.424193932249614e-06, + "loss": 0.0377, + "step": 2857 + }, + { + "epoch": 1.300272975432211, + "grad_norm": 0.5700032729383738, + "learning_rate": 8.423152272050665e-06, + "loss": 0.0338, + "step": 2858 + }, + { + "epoch": 1.3007279344858962, + "grad_norm": 0.5125782380711011, + "learning_rate": 8.42211033212717e-06, + "loss": 0.0279, + "step": 2859 + }, + { + "epoch": 1.3011828935395815, + "grad_norm": 0.3613001497946594, + "learning_rate": 8.421068112564272e-06, + "loss": 0.0142, + "step": 2860 + }, + { + "epoch": 1.3016378525932666, + "grad_norm": 0.49734554941531095, + "learning_rate": 8.42002561344714e-06, + "loss": 0.027, + "step": 2861 + }, + { + "epoch": 1.302092811646952, + "grad_norm": 0.5045918830965134, + "learning_rate": 8.418982834860958e-06, + "loss": 0.0371, + "step": 2862 + }, + { + "epoch": 1.302547770700637, + "grad_norm": 0.7937593563379391, + "learning_rate": 8.417939776890938e-06, + "loss": 0.0477, + "step": 2863 + }, + { + "epoch": 1.303002729754322, + "grad_norm": 0.770292717700032, + "learning_rate": 8.416896439622315e-06, + "loss": 0.0364, + "step": 2864 + }, + { + "epoch": 1.3034576888080074, + "grad_norm": 0.40217667555726205, + "learning_rate": 8.415852823140344e-06, + "loss": 0.0215, + "step": 2865 + }, + { + "epoch": 1.3039126478616925, + "grad_norm": 0.39410556190775164, + "learning_rate": 8.41480892753031e-06, + "loss": 0.0258, + "step": 2866 + }, + { + "epoch": 1.3043676069153776, + "grad_norm": 0.5772374070482569, + "learning_rate": 8.413764752877509e-06, + "loss": 0.0383, + "step": 2867 + }, + { + "epoch": 1.3048225659690629, + "grad_norm": 0.528432161004818, + "learning_rate": 8.41272029926727e-06, + "loss": 0.0309, + "step": 2868 + }, + { + "epoch": 1.305277525022748, + "grad_norm": 0.5477254790589614, + "learning_rate": 8.411675566784939e-06, + "loss": 0.0306, + "step": 2869 + }, + { + "epoch": 1.305732484076433, + "grad_norm": 0.5287075963479533, + "learning_rate": 8.410630555515887e-06, + "loss": 0.0379, + "step": 2870 + }, + { + "epoch": 1.3061874431301184, + "grad_norm": 0.6485432136738517, + "learning_rate": 8.409585265545509e-06, + "loss": 0.0507, + "step": 2871 + }, + { + "epoch": 1.3066424021838035, + "grad_norm": 0.4441608183133053, + "learning_rate": 8.408539696959222e-06, + "loss": 0.0181, + "step": 2872 + }, + { + "epoch": 1.3070973612374885, + "grad_norm": 0.581370080812639, + "learning_rate": 8.407493849842462e-06, + "loss": 0.0298, + "step": 2873 + }, + { + "epoch": 1.3075523202911739, + "grad_norm": 0.41095630327527305, + "learning_rate": 8.406447724280694e-06, + "loss": 0.0172, + "step": 2874 + }, + { + "epoch": 1.308007279344859, + "grad_norm": 0.5932824581914778, + "learning_rate": 8.4054013203594e-06, + "loss": 0.0396, + "step": 2875 + }, + { + "epoch": 1.308462238398544, + "grad_norm": 0.5059022446307072, + "learning_rate": 8.40435463816409e-06, + "loss": 0.0295, + "step": 2876 + }, + { + "epoch": 1.3089171974522293, + "grad_norm": 0.6107245381532688, + "learning_rate": 8.403307677780291e-06, + "loss": 0.0456, + "step": 2877 + }, + { + "epoch": 1.3093721565059144, + "grad_norm": 0.4990493825414314, + "learning_rate": 8.40226043929356e-06, + "loss": 0.0251, + "step": 2878 + }, + { + "epoch": 1.3098271155595995, + "grad_norm": 0.4714729010419501, + "learning_rate": 8.40121292278947e-06, + "loss": 0.0162, + "step": 2879 + }, + { + "epoch": 1.3102820746132848, + "grad_norm": 0.8705842073062648, + "learning_rate": 8.400165128353619e-06, + "loss": 0.0628, + "step": 2880 + }, + { + "epoch": 1.31073703366697, + "grad_norm": 0.5559037822076499, + "learning_rate": 8.399117056071628e-06, + "loss": 0.0301, + "step": 2881 + }, + { + "epoch": 1.311191992720655, + "grad_norm": 0.7209850941894482, + "learning_rate": 8.398068706029144e-06, + "loss": 0.0562, + "step": 2882 + }, + { + "epoch": 1.3116469517743403, + "grad_norm": 0.6600248319736058, + "learning_rate": 8.397020078311829e-06, + "loss": 0.0479, + "step": 2883 + }, + { + "epoch": 1.3121019108280254, + "grad_norm": 0.5745977024793432, + "learning_rate": 8.395971173005373e-06, + "loss": 0.0307, + "step": 2884 + }, + { + "epoch": 1.3125568698817107, + "grad_norm": 0.7612725338466626, + "learning_rate": 8.39492199019549e-06, + "loss": 0.045, + "step": 2885 + }, + { + "epoch": 1.3130118289353958, + "grad_norm": 0.5825437442447712, + "learning_rate": 8.393872529967913e-06, + "loss": 0.0373, + "step": 2886 + }, + { + "epoch": 1.3134667879890811, + "grad_norm": 0.4805516575623738, + "learning_rate": 8.3928227924084e-06, + "loss": 0.0257, + "step": 2887 + }, + { + "epoch": 1.3139217470427662, + "grad_norm": 0.5620132619283021, + "learning_rate": 8.391772777602729e-06, + "loss": 0.0392, + "step": 2888 + }, + { + "epoch": 1.3143767060964513, + "grad_norm": 0.4909960245970496, + "learning_rate": 8.390722485636707e-06, + "loss": 0.0329, + "step": 2889 + }, + { + "epoch": 1.3148316651501366, + "grad_norm": 0.7269269853225212, + "learning_rate": 8.389671916596152e-06, + "loss": 0.052, + "step": 2890 + }, + { + "epoch": 1.3152866242038217, + "grad_norm": 0.5201558195230915, + "learning_rate": 8.388621070566918e-06, + "loss": 0.0248, + "step": 2891 + }, + { + "epoch": 1.3157415832575068, + "grad_norm": 0.394106763274554, + "learning_rate": 8.387569947634872e-06, + "loss": 0.018, + "step": 2892 + }, + { + "epoch": 1.316196542311192, + "grad_norm": 0.6404544864402316, + "learning_rate": 8.386518547885907e-06, + "loss": 0.0476, + "step": 2893 + }, + { + "epoch": 1.3166515013648772, + "grad_norm": 0.5191807643177463, + "learning_rate": 8.385466871405942e-06, + "loss": 0.0275, + "step": 2894 + }, + { + "epoch": 1.3171064604185623, + "grad_norm": 0.5938811996011489, + "learning_rate": 8.384414918280912e-06, + "loss": 0.0381, + "step": 2895 + }, + { + "epoch": 1.3175614194722476, + "grad_norm": 0.5756131532663223, + "learning_rate": 8.383362688596779e-06, + "loss": 0.0338, + "step": 2896 + }, + { + "epoch": 1.3180163785259327, + "grad_norm": 0.5811779212201078, + "learning_rate": 8.382310182439526e-06, + "loss": 0.0376, + "step": 2897 + }, + { + "epoch": 1.3184713375796178, + "grad_norm": 0.448051940064308, + "learning_rate": 8.381257399895157e-06, + "loss": 0.0311, + "step": 2898 + }, + { + "epoch": 1.318926296633303, + "grad_norm": 0.5275685839417847, + "learning_rate": 8.380204341049706e-06, + "loss": 0.0301, + "step": 2899 + }, + { + "epoch": 1.3193812556869882, + "grad_norm": 0.5862324399838035, + "learning_rate": 8.37915100598922e-06, + "loss": 0.0326, + "step": 2900 + }, + { + "epoch": 1.3198362147406733, + "grad_norm": 0.4657130433511499, + "learning_rate": 8.378097394799774e-06, + "loss": 0.0292, + "step": 2901 + }, + { + "epoch": 1.3202911737943586, + "grad_norm": 0.43527864616078554, + "learning_rate": 8.377043507567464e-06, + "loss": 0.0198, + "step": 2902 + }, + { + "epoch": 1.3207461328480437, + "grad_norm": 0.5682424315090409, + "learning_rate": 8.37598934437841e-06, + "loss": 0.0337, + "step": 2903 + }, + { + "epoch": 1.3212010919017287, + "grad_norm": 0.5138915407015388, + "learning_rate": 8.374934905318753e-06, + "loss": 0.0361, + "step": 2904 + }, + { + "epoch": 1.321656050955414, + "grad_norm": 0.6053557583572503, + "learning_rate": 8.373880190474653e-06, + "loss": 0.0357, + "step": 2905 + }, + { + "epoch": 1.3221110100090991, + "grad_norm": 0.39758173287328924, + "learning_rate": 8.372825199932304e-06, + "loss": 0.0291, + "step": 2906 + }, + { + "epoch": 1.3225659690627842, + "grad_norm": 0.3641881015036813, + "learning_rate": 8.371769933777908e-06, + "loss": 0.0171, + "step": 2907 + }, + { + "epoch": 1.3230209281164695, + "grad_norm": 0.3905844015965015, + "learning_rate": 8.370714392097703e-06, + "loss": 0.0185, + "step": 2908 + }, + { + "epoch": 1.3234758871701546, + "grad_norm": 0.7229548107308235, + "learning_rate": 8.369658574977939e-06, + "loss": 0.0308, + "step": 2909 + }, + { + "epoch": 1.3239308462238397, + "grad_norm": 0.44397421085587824, + "learning_rate": 8.368602482504894e-06, + "loss": 0.0257, + "step": 2910 + }, + { + "epoch": 1.324385805277525, + "grad_norm": 0.3765050238419547, + "learning_rate": 8.367546114764863e-06, + "loss": 0.0188, + "step": 2911 + }, + { + "epoch": 1.3248407643312101, + "grad_norm": 0.6236537427570221, + "learning_rate": 8.366489471844174e-06, + "loss": 0.0341, + "step": 2912 + }, + { + "epoch": 1.3252957233848954, + "grad_norm": 0.5416006396662083, + "learning_rate": 8.36543255382917e-06, + "loss": 0.0275, + "step": 2913 + }, + { + "epoch": 1.3257506824385805, + "grad_norm": 0.4937949308281839, + "learning_rate": 8.364375360806214e-06, + "loss": 0.022, + "step": 2914 + }, + { + "epoch": 1.3262056414922658, + "grad_norm": 0.537820582130104, + "learning_rate": 8.363317892861695e-06, + "loss": 0.0319, + "step": 2915 + }, + { + "epoch": 1.326660600545951, + "grad_norm": 0.3658940967114081, + "learning_rate": 8.36226015008203e-06, + "loss": 0.0163, + "step": 2916 + }, + { + "epoch": 1.327115559599636, + "grad_norm": 0.42988876957689603, + "learning_rate": 8.361202132553647e-06, + "loss": 0.0248, + "step": 2917 + }, + { + "epoch": 1.3275705186533213, + "grad_norm": 0.4763202386564734, + "learning_rate": 8.360143840363006e-06, + "loss": 0.0267, + "step": 2918 + }, + { + "epoch": 1.3280254777070064, + "grad_norm": 0.5304487014834793, + "learning_rate": 8.359085273596583e-06, + "loss": 0.0232, + "step": 2919 + }, + { + "epoch": 1.3284804367606915, + "grad_norm": 0.5132885954286545, + "learning_rate": 8.358026432340883e-06, + "loss": 0.0296, + "step": 2920 + }, + { + "epoch": 1.3289353958143768, + "grad_norm": 0.5441398032146411, + "learning_rate": 8.356967316682427e-06, + "loss": 0.025, + "step": 2921 + }, + { + "epoch": 1.329390354868062, + "grad_norm": 0.8637700710210932, + "learning_rate": 8.35590792670776e-06, + "loss": 0.0546, + "step": 2922 + }, + { + "epoch": 1.329845313921747, + "grad_norm": 0.6961731263031892, + "learning_rate": 8.354848262503455e-06, + "loss": 0.0455, + "step": 2923 + }, + { + "epoch": 1.3303002729754323, + "grad_norm": 0.4361786339244665, + "learning_rate": 8.3537883241561e-06, + "loss": 0.0247, + "step": 2924 + }, + { + "epoch": 1.3307552320291174, + "grad_norm": 0.43356790518587945, + "learning_rate": 8.352728111752308e-06, + "loss": 0.0211, + "step": 2925 + }, + { + "epoch": 1.3312101910828025, + "grad_norm": 0.6497845011963704, + "learning_rate": 8.351667625378714e-06, + "loss": 0.0339, + "step": 2926 + }, + { + "epoch": 1.3316651501364878, + "grad_norm": 0.42248975914582426, + "learning_rate": 8.35060686512198e-06, + "loss": 0.0241, + "step": 2927 + }, + { + "epoch": 1.3321201091901729, + "grad_norm": 1.0254154838868987, + "learning_rate": 8.349545831068783e-06, + "loss": 0.0527, + "step": 2928 + }, + { + "epoch": 1.332575068243858, + "grad_norm": 0.5465891301505484, + "learning_rate": 8.348484523305828e-06, + "loss": 0.0321, + "step": 2929 + }, + { + "epoch": 1.3330300272975433, + "grad_norm": 0.4438432419153223, + "learning_rate": 8.347422941919839e-06, + "loss": 0.0246, + "step": 2930 + }, + { + "epoch": 1.3334849863512284, + "grad_norm": 0.612887112386914, + "learning_rate": 8.346361086997563e-06, + "loss": 0.0378, + "step": 2931 + }, + { + "epoch": 1.3339399454049135, + "grad_norm": 0.5224132598314238, + "learning_rate": 8.345298958625773e-06, + "loss": 0.0238, + "step": 2932 + }, + { + "epoch": 1.3343949044585988, + "grad_norm": 0.4483766382516411, + "learning_rate": 8.344236556891258e-06, + "loss": 0.0229, + "step": 2933 + }, + { + "epoch": 1.3348498635122839, + "grad_norm": 0.6468172061802943, + "learning_rate": 8.343173881880834e-06, + "loss": 0.0478, + "step": 2934 + }, + { + "epoch": 1.335304822565969, + "grad_norm": 0.5929266648467246, + "learning_rate": 8.342110933681338e-06, + "loss": 0.0327, + "step": 2935 + }, + { + "epoch": 1.3357597816196543, + "grad_norm": 0.9253130324099915, + "learning_rate": 8.341047712379629e-06, + "loss": 0.0561, + "step": 2936 + }, + { + "epoch": 1.3362147406733393, + "grad_norm": 0.42363571916667725, + "learning_rate": 8.33998421806259e-06, + "loss": 0.0201, + "step": 2937 + }, + { + "epoch": 1.3366696997270244, + "grad_norm": 0.46068482279133083, + "learning_rate": 8.338920450817124e-06, + "loss": 0.0328, + "step": 2938 + }, + { + "epoch": 1.3371246587807097, + "grad_norm": 0.5809098017908542, + "learning_rate": 8.337856410730157e-06, + "loss": 0.0325, + "step": 2939 + }, + { + "epoch": 1.3375796178343948, + "grad_norm": 0.4339828167803655, + "learning_rate": 8.336792097888636e-06, + "loss": 0.025, + "step": 2940 + }, + { + "epoch": 1.3380345768880801, + "grad_norm": 0.39427353503978124, + "learning_rate": 8.335727512379535e-06, + "loss": 0.0235, + "step": 2941 + }, + { + "epoch": 1.3384895359417652, + "grad_norm": 0.46862803888718435, + "learning_rate": 8.334662654289847e-06, + "loss": 0.0262, + "step": 2942 + }, + { + "epoch": 1.3389444949954505, + "grad_norm": 0.5771037970727136, + "learning_rate": 8.333597523706583e-06, + "loss": 0.0364, + "step": 2943 + }, + { + "epoch": 1.3393994540491356, + "grad_norm": 0.5439670424125458, + "learning_rate": 8.332532120716787e-06, + "loss": 0.0317, + "step": 2944 + }, + { + "epoch": 1.3398544131028207, + "grad_norm": 0.5794567514020939, + "learning_rate": 8.331466445407513e-06, + "loss": 0.0352, + "step": 2945 + }, + { + "epoch": 1.340309372156506, + "grad_norm": 0.6921033406520039, + "learning_rate": 8.330400497865847e-06, + "loss": 0.0372, + "step": 2946 + }, + { + "epoch": 1.3407643312101911, + "grad_norm": 0.547284959859392, + "learning_rate": 8.329334278178893e-06, + "loss": 0.0407, + "step": 2947 + }, + { + "epoch": 1.3412192902638762, + "grad_norm": 0.3600178778635221, + "learning_rate": 8.328267786433777e-06, + "loss": 0.0207, + "step": 2948 + }, + { + "epoch": 1.3416742493175615, + "grad_norm": 0.49898975870202994, + "learning_rate": 8.327201022717645e-06, + "loss": 0.0364, + "step": 2949 + }, + { + "epoch": 1.3421292083712466, + "grad_norm": 0.4818424249267445, + "learning_rate": 8.326133987117674e-06, + "loss": 0.0461, + "step": 2950 + }, + { + "epoch": 1.3425841674249317, + "grad_norm": 0.5346060862589315, + "learning_rate": 8.325066679721053e-06, + "loss": 0.0322, + "step": 2951 + }, + { + "epoch": 1.343039126478617, + "grad_norm": 0.4768637215194256, + "learning_rate": 8.323999100615e-06, + "loss": 0.0228, + "step": 2952 + }, + { + "epoch": 1.343494085532302, + "grad_norm": 0.6121311741443156, + "learning_rate": 8.32293124988675e-06, + "loss": 0.0389, + "step": 2953 + }, + { + "epoch": 1.3439490445859872, + "grad_norm": 0.41050069959015945, + "learning_rate": 8.321863127623565e-06, + "loss": 0.0186, + "step": 2954 + }, + { + "epoch": 1.3444040036396725, + "grad_norm": 0.51587593937067, + "learning_rate": 8.320794733912727e-06, + "loss": 0.033, + "step": 2955 + }, + { + "epoch": 1.3448589626933576, + "grad_norm": 0.4429693043138898, + "learning_rate": 8.319726068841541e-06, + "loss": 0.0218, + "step": 2956 + }, + { + "epoch": 1.3453139217470427, + "grad_norm": 0.4419902925455928, + "learning_rate": 8.31865713249733e-06, + "loss": 0.026, + "step": 2957 + }, + { + "epoch": 1.345768880800728, + "grad_norm": 1.2645808402043965, + "learning_rate": 8.317587924967445e-06, + "loss": 0.0557, + "step": 2958 + }, + { + "epoch": 1.346223839854413, + "grad_norm": 0.5397408570521678, + "learning_rate": 8.31651844633926e-06, + "loss": 0.0278, + "step": 2959 + }, + { + "epoch": 1.3466787989080982, + "grad_norm": 0.5231183718719437, + "learning_rate": 8.31544869670016e-06, + "loss": 0.028, + "step": 2960 + }, + { + "epoch": 1.3471337579617835, + "grad_norm": 0.6229462228070504, + "learning_rate": 8.31437867613757e-06, + "loss": 0.038, + "step": 2961 + }, + { + "epoch": 1.3475887170154686, + "grad_norm": 0.7194000029115405, + "learning_rate": 8.313308384738918e-06, + "loss": 0.0443, + "step": 2962 + }, + { + "epoch": 1.3480436760691537, + "grad_norm": 0.3970249499512815, + "learning_rate": 8.31223782259167e-06, + "loss": 0.0148, + "step": 2963 + }, + { + "epoch": 1.348498635122839, + "grad_norm": 0.5285088103191983, + "learning_rate": 8.311166989783303e-06, + "loss": 0.0364, + "step": 2964 + }, + { + "epoch": 1.348953594176524, + "grad_norm": 0.39954065377503273, + "learning_rate": 8.310095886401326e-06, + "loss": 0.0231, + "step": 2965 + }, + { + "epoch": 1.3494085532302094, + "grad_norm": 0.5024343953698133, + "learning_rate": 8.309024512533258e-06, + "loss": 0.0241, + "step": 2966 + }, + { + "epoch": 1.3498635122838945, + "grad_norm": 0.4085029836051291, + "learning_rate": 8.307952868266653e-06, + "loss": 0.0189, + "step": 2967 + }, + { + "epoch": 1.3503184713375795, + "grad_norm": 0.5144685273285099, + "learning_rate": 8.306880953689078e-06, + "loss": 0.0267, + "step": 2968 + }, + { + "epoch": 1.3507734303912649, + "grad_norm": 0.6709812500236393, + "learning_rate": 8.305808768888123e-06, + "loss": 0.0476, + "step": 2969 + }, + { + "epoch": 1.35122838944495, + "grad_norm": 0.7208231702150235, + "learning_rate": 8.304736313951407e-06, + "loss": 0.0504, + "step": 2970 + }, + { + "epoch": 1.3516833484986353, + "grad_norm": 0.4863482746478878, + "learning_rate": 8.303663588966562e-06, + "loss": 0.0343, + "step": 2971 + }, + { + "epoch": 1.3521383075523203, + "grad_norm": 0.3674961944159868, + "learning_rate": 8.302590594021246e-06, + "loss": 0.0211, + "step": 2972 + }, + { + "epoch": 1.3525932666060054, + "grad_norm": 0.7230295041278698, + "learning_rate": 8.301517329203144e-06, + "loss": 0.054, + "step": 2973 + }, + { + "epoch": 1.3530482256596907, + "grad_norm": 0.45096016710067965, + "learning_rate": 8.300443794599953e-06, + "loss": 0.0347, + "step": 2974 + }, + { + "epoch": 1.3535031847133758, + "grad_norm": 0.5073916445193536, + "learning_rate": 8.299369990299401e-06, + "loss": 0.0247, + "step": 2975 + }, + { + "epoch": 1.353958143767061, + "grad_norm": 0.48751506850799814, + "learning_rate": 8.298295916389234e-06, + "loss": 0.0253, + "step": 2976 + }, + { + "epoch": 1.3544131028207462, + "grad_norm": 0.4580906342189067, + "learning_rate": 8.297221572957219e-06, + "loss": 0.0285, + "step": 2977 + }, + { + "epoch": 1.3548680618744313, + "grad_norm": 0.5613436435308797, + "learning_rate": 8.296146960091147e-06, + "loss": 0.0293, + "step": 2978 + }, + { + "epoch": 1.3553230209281164, + "grad_norm": 0.40557778558942736, + "learning_rate": 8.295072077878831e-06, + "loss": 0.026, + "step": 2979 + }, + { + "epoch": 1.3557779799818017, + "grad_norm": 0.5936885393390249, + "learning_rate": 8.293996926408106e-06, + "loss": 0.0277, + "step": 2980 + }, + { + "epoch": 1.3562329390354868, + "grad_norm": 0.6190172575588174, + "learning_rate": 8.292921505766826e-06, + "loss": 0.0439, + "step": 2981 + }, + { + "epoch": 1.356687898089172, + "grad_norm": 0.5580565035724842, + "learning_rate": 8.291845816042872e-06, + "loss": 0.0293, + "step": 2982 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.5141691931429101, + "learning_rate": 8.290769857324144e-06, + "loss": 0.0293, + "step": 2983 + }, + { + "epoch": 1.3575978161965423, + "grad_norm": 0.4512712109432825, + "learning_rate": 8.289693629698564e-06, + "loss": 0.0246, + "step": 2984 + }, + { + "epoch": 1.3580527752502274, + "grad_norm": 0.6040999257078576, + "learning_rate": 8.288617133254075e-06, + "loss": 0.0372, + "step": 2985 + }, + { + "epoch": 1.3585077343039127, + "grad_norm": 0.6915500026797864, + "learning_rate": 8.287540368078648e-06, + "loss": 0.0425, + "step": 2986 + }, + { + "epoch": 1.3589626933575978, + "grad_norm": 0.5125738426632732, + "learning_rate": 8.286463334260268e-06, + "loss": 0.0359, + "step": 2987 + }, + { + "epoch": 1.3594176524112829, + "grad_norm": 0.556319160536806, + "learning_rate": 8.285386031886944e-06, + "loss": 0.0241, + "step": 2988 + }, + { + "epoch": 1.3598726114649682, + "grad_norm": 0.6102320846565004, + "learning_rate": 8.284308461046713e-06, + "loss": 0.0333, + "step": 2989 + }, + { + "epoch": 1.3603275705186533, + "grad_norm": 0.5361302245507936, + "learning_rate": 8.283230621827625e-06, + "loss": 0.0278, + "step": 2990 + }, + { + "epoch": 1.3607825295723384, + "grad_norm": 0.4699596538592812, + "learning_rate": 8.282152514317756e-06, + "loss": 0.0265, + "step": 2991 + }, + { + "epoch": 1.3612374886260237, + "grad_norm": 0.6895404803749348, + "learning_rate": 8.281074138605207e-06, + "loss": 0.0394, + "step": 2992 + }, + { + "epoch": 1.3616924476797088, + "grad_norm": 0.4695263884849979, + "learning_rate": 8.279995494778097e-06, + "loss": 0.0206, + "step": 2993 + }, + { + "epoch": 1.362147406733394, + "grad_norm": 0.429106119981996, + "learning_rate": 8.278916582924566e-06, + "loss": 0.0262, + "step": 2994 + }, + { + "epoch": 1.3626023657870792, + "grad_norm": 0.676343482596927, + "learning_rate": 8.27783740313278e-06, + "loss": 0.0552, + "step": 2995 + }, + { + "epoch": 1.3630573248407643, + "grad_norm": 0.3406816842285639, + "learning_rate": 8.276757955490924e-06, + "loss": 0.0194, + "step": 2996 + }, + { + "epoch": 1.3635122838944496, + "grad_norm": 0.5611480329253273, + "learning_rate": 8.275678240087206e-06, + "loss": 0.0329, + "step": 2997 + }, + { + "epoch": 1.3639672429481347, + "grad_norm": 0.479385138705802, + "learning_rate": 8.274598257009856e-06, + "loss": 0.0322, + "step": 2998 + }, + { + "epoch": 1.36442220200182, + "grad_norm": 0.45227339102007225, + "learning_rate": 8.273518006347122e-06, + "loss": 0.0203, + "step": 2999 + }, + { + "epoch": 1.364877161055505, + "grad_norm": 0.44241532318072835, + "learning_rate": 8.272437488187282e-06, + "loss": 0.0227, + "step": 3000 + }, + { + "epoch": 1.3653321201091901, + "grad_norm": 0.5114034038920364, + "learning_rate": 8.271356702618627e-06, + "loss": 0.0403, + "step": 3001 + }, + { + "epoch": 1.3657870791628755, + "grad_norm": 0.6162969431059591, + "learning_rate": 8.270275649729476e-06, + "loss": 0.0325, + "step": 3002 + }, + { + "epoch": 1.3662420382165605, + "grad_norm": 0.527092132342831, + "learning_rate": 8.269194329608168e-06, + "loss": 0.0284, + "step": 3003 + }, + { + "epoch": 1.3666969972702456, + "grad_norm": 0.46778673954946487, + "learning_rate": 8.268112742343062e-06, + "loss": 0.0245, + "step": 3004 + }, + { + "epoch": 1.367151956323931, + "grad_norm": 0.4147528449331875, + "learning_rate": 8.267030888022543e-06, + "loss": 0.0139, + "step": 3005 + }, + { + "epoch": 1.367606915377616, + "grad_norm": 0.5241363451016124, + "learning_rate": 8.26594876673501e-06, + "loss": 0.0247, + "step": 3006 + }, + { + "epoch": 1.3680618744313011, + "grad_norm": 0.5201590911331215, + "learning_rate": 8.264866378568897e-06, + "loss": 0.0177, + "step": 3007 + }, + { + "epoch": 1.3685168334849864, + "grad_norm": 0.5378167093969959, + "learning_rate": 8.263783723612644e-06, + "loss": 0.0261, + "step": 3008 + }, + { + "epoch": 1.3689717925386715, + "grad_norm": 0.3726813097721879, + "learning_rate": 8.262700801954726e-06, + "loss": 0.02, + "step": 3009 + }, + { + "epoch": 1.3694267515923566, + "grad_norm": 0.6126857162538056, + "learning_rate": 8.261617613683633e-06, + "loss": 0.0433, + "step": 3010 + }, + { + "epoch": 1.369881710646042, + "grad_norm": 1.0211897698494863, + "learning_rate": 8.260534158887878e-06, + "loss": 0.0696, + "step": 3011 + }, + { + "epoch": 1.370336669699727, + "grad_norm": 0.4760258275057717, + "learning_rate": 8.259450437655994e-06, + "loss": 0.0349, + "step": 3012 + }, + { + "epoch": 1.370791628753412, + "grad_norm": 0.5989088817313054, + "learning_rate": 8.258366450076541e-06, + "loss": 0.0491, + "step": 3013 + }, + { + "epoch": 1.3712465878070974, + "grad_norm": 0.6467128053552115, + "learning_rate": 8.257282196238097e-06, + "loss": 0.0367, + "step": 3014 + }, + { + "epoch": 1.3717015468607825, + "grad_norm": 0.6142108685785274, + "learning_rate": 8.256197676229262e-06, + "loss": 0.04, + "step": 3015 + }, + { + "epoch": 1.3721565059144676, + "grad_norm": 0.5514322815123416, + "learning_rate": 8.255112890138657e-06, + "loss": 0.0268, + "step": 3016 + }, + { + "epoch": 1.372611464968153, + "grad_norm": 0.45640389193979836, + "learning_rate": 8.254027838054925e-06, + "loss": 0.0273, + "step": 3017 + }, + { + "epoch": 1.373066424021838, + "grad_norm": 0.39439780413094305, + "learning_rate": 8.252942520066735e-06, + "loss": 0.0177, + "step": 3018 + }, + { + "epoch": 1.373521383075523, + "grad_norm": 0.4504055956288238, + "learning_rate": 8.251856936262774e-06, + "loss": 0.028, + "step": 3019 + }, + { + "epoch": 1.3739763421292084, + "grad_norm": 0.48489134818246765, + "learning_rate": 8.250771086731745e-06, + "loss": 0.0313, + "step": 3020 + }, + { + "epoch": 1.3744313011828935, + "grad_norm": 0.36762870096085365, + "learning_rate": 8.249684971562387e-06, + "loss": 0.0134, + "step": 3021 + }, + { + "epoch": 1.3748862602365788, + "grad_norm": 0.5182427734749195, + "learning_rate": 8.248598590843447e-06, + "loss": 0.0273, + "step": 3022 + }, + { + "epoch": 1.3753412192902639, + "grad_norm": 0.3485416699729316, + "learning_rate": 8.247511944663701e-06, + "loss": 0.0193, + "step": 3023 + }, + { + "epoch": 1.3757961783439492, + "grad_norm": 0.5920956861150252, + "learning_rate": 8.246425033111944e-06, + "loss": 0.0445, + "step": 3024 + }, + { + "epoch": 1.3762511373976343, + "grad_norm": 0.5525125917448483, + "learning_rate": 8.245337856276996e-06, + "loss": 0.0357, + "step": 3025 + }, + { + "epoch": 1.3767060964513194, + "grad_norm": 0.629783233447734, + "learning_rate": 8.244250414247692e-06, + "loss": 0.0413, + "step": 3026 + }, + { + "epoch": 1.3771610555050047, + "grad_norm": 0.35243668108000564, + "learning_rate": 8.243162707112895e-06, + "loss": 0.0163, + "step": 3027 + }, + { + "epoch": 1.3776160145586898, + "grad_norm": 0.32017923480563804, + "learning_rate": 8.242074734961489e-06, + "loss": 0.0217, + "step": 3028 + }, + { + "epoch": 1.3780709736123748, + "grad_norm": 0.5302331839428659, + "learning_rate": 8.240986497882376e-06, + "loss": 0.0246, + "step": 3029 + }, + { + "epoch": 1.3785259326660602, + "grad_norm": 0.513223225130444, + "learning_rate": 8.239897995964483e-06, + "loss": 0.0245, + "step": 3030 + }, + { + "epoch": 1.3789808917197452, + "grad_norm": 0.5941014343712512, + "learning_rate": 8.238809229296756e-06, + "loss": 0.0327, + "step": 3031 + }, + { + "epoch": 1.3794358507734303, + "grad_norm": 0.8390821773783546, + "learning_rate": 8.237720197968167e-06, + "loss": 0.0722, + "step": 3032 + }, + { + "epoch": 1.3798908098271156, + "grad_norm": 0.49003643455117596, + "learning_rate": 8.236630902067702e-06, + "loss": 0.0242, + "step": 3033 + }, + { + "epoch": 1.3803457688808007, + "grad_norm": 0.6184622682847121, + "learning_rate": 8.235541341684378e-06, + "loss": 0.0396, + "step": 3034 + }, + { + "epoch": 1.3808007279344858, + "grad_norm": 0.4436355063234884, + "learning_rate": 8.234451516907228e-06, + "loss": 0.0197, + "step": 3035 + }, + { + "epoch": 1.3812556869881711, + "grad_norm": 0.45830037526579676, + "learning_rate": 8.233361427825305e-06, + "loss": 0.0257, + "step": 3036 + }, + { + "epoch": 1.3817106460418562, + "grad_norm": 0.6393980461408286, + "learning_rate": 8.232271074527688e-06, + "loss": 0.048, + "step": 3037 + }, + { + "epoch": 1.3821656050955413, + "grad_norm": 0.40754186598011133, + "learning_rate": 8.231180457103477e-06, + "loss": 0.0212, + "step": 3038 + }, + { + "epoch": 1.3826205641492266, + "grad_norm": 0.5370935736948618, + "learning_rate": 8.23008957564179e-06, + "loss": 0.0309, + "step": 3039 + }, + { + "epoch": 1.3830755232029117, + "grad_norm": 0.5067459376653409, + "learning_rate": 8.22899843023177e-06, + "loss": 0.0372, + "step": 3040 + }, + { + "epoch": 1.3835304822565968, + "grad_norm": 0.4526706797906497, + "learning_rate": 8.227907020962578e-06, + "loss": 0.0253, + "step": 3041 + }, + { + "epoch": 1.3839854413102821, + "grad_norm": 0.536389038504338, + "learning_rate": 8.226815347923404e-06, + "loss": 0.0298, + "step": 3042 + }, + { + "epoch": 1.3844404003639672, + "grad_norm": 0.4802523068929896, + "learning_rate": 8.225723411203452e-06, + "loss": 0.0275, + "step": 3043 + }, + { + "epoch": 1.3848953594176523, + "grad_norm": 0.5706003382968008, + "learning_rate": 8.22463121089195e-06, + "loss": 0.0378, + "step": 3044 + }, + { + "epoch": 1.3853503184713376, + "grad_norm": 0.4968696815862661, + "learning_rate": 8.223538747078146e-06, + "loss": 0.0337, + "step": 3045 + }, + { + "epoch": 1.3858052775250227, + "grad_norm": 0.5978285446033724, + "learning_rate": 8.222446019851315e-06, + "loss": 0.0297, + "step": 3046 + }, + { + "epoch": 1.3862602365787078, + "grad_norm": 0.7562718290242261, + "learning_rate": 8.221353029300747e-06, + "loss": 0.046, + "step": 3047 + }, + { + "epoch": 1.386715195632393, + "grad_norm": 0.40943266637554676, + "learning_rate": 8.220259775515756e-06, + "loss": 0.0208, + "step": 3048 + }, + { + "epoch": 1.3871701546860782, + "grad_norm": 0.49567626101539297, + "learning_rate": 8.21916625858568e-06, + "loss": 0.0308, + "step": 3049 + }, + { + "epoch": 1.3876251137397635, + "grad_norm": 0.5102658380968296, + "learning_rate": 8.218072478599875e-06, + "loss": 0.027, + "step": 3050 + }, + { + "epoch": 1.3880800727934486, + "grad_norm": 0.5640260630506516, + "learning_rate": 8.216978435647718e-06, + "loss": 0.052, + "step": 3051 + }, + { + "epoch": 1.388535031847134, + "grad_norm": 0.47339074035185624, + "learning_rate": 8.215884129818612e-06, + "loss": 0.0216, + "step": 3052 + }, + { + "epoch": 1.388989990900819, + "grad_norm": 0.45861030037606987, + "learning_rate": 8.214789561201979e-06, + "loss": 0.0228, + "step": 3053 + }, + { + "epoch": 1.389444949954504, + "grad_norm": 0.5617691155764737, + "learning_rate": 8.21369472988726e-06, + "loss": 0.0376, + "step": 3054 + }, + { + "epoch": 1.3898999090081894, + "grad_norm": 0.7593333756738143, + "learning_rate": 8.21259963596392e-06, + "loss": 0.0443, + "step": 3055 + }, + { + "epoch": 1.3903548680618745, + "grad_norm": 0.5508343748423888, + "learning_rate": 8.211504279521445e-06, + "loss": 0.0348, + "step": 3056 + }, + { + "epoch": 1.3908098271155596, + "grad_norm": 0.4297088508988446, + "learning_rate": 8.210408660649346e-06, + "loss": 0.0236, + "step": 3057 + }, + { + "epoch": 1.3912647861692449, + "grad_norm": 0.4652687964406028, + "learning_rate": 8.209312779437147e-06, + "loss": 0.0195, + "step": 3058 + }, + { + "epoch": 1.39171974522293, + "grad_norm": 0.5833898595618995, + "learning_rate": 8.208216635974401e-06, + "loss": 0.0409, + "step": 3059 + }, + { + "epoch": 1.392174704276615, + "grad_norm": 0.59822842016674, + "learning_rate": 8.207120230350682e-06, + "loss": 0.0496, + "step": 3060 + }, + { + "epoch": 1.3926296633303004, + "grad_norm": 0.2680216564923973, + "learning_rate": 8.206023562655578e-06, + "loss": 0.0192, + "step": 3061 + }, + { + "epoch": 1.3930846223839854, + "grad_norm": 0.4548172021576498, + "learning_rate": 8.204926632978708e-06, + "loss": 0.0263, + "step": 3062 + }, + { + "epoch": 1.3935395814376705, + "grad_norm": 0.5200897444884447, + "learning_rate": 8.203829441409708e-06, + "loss": 0.0357, + "step": 3063 + }, + { + "epoch": 1.3939945404913558, + "grad_norm": 0.5810832537093169, + "learning_rate": 8.202731988038232e-06, + "loss": 0.0362, + "step": 3064 + }, + { + "epoch": 1.394449499545041, + "grad_norm": 0.6998441041100203, + "learning_rate": 8.201634272953963e-06, + "loss": 0.0482, + "step": 3065 + }, + { + "epoch": 1.394904458598726, + "grad_norm": 0.5366640695091471, + "learning_rate": 8.2005362962466e-06, + "loss": 0.0242, + "step": 3066 + }, + { + "epoch": 1.3953594176524113, + "grad_norm": 0.43762864600569806, + "learning_rate": 8.199438058005864e-06, + "loss": 0.0275, + "step": 3067 + }, + { + "epoch": 1.3958143767060964, + "grad_norm": 0.7172645055229739, + "learning_rate": 8.198339558321497e-06, + "loss": 0.044, + "step": 3068 + }, + { + "epoch": 1.3962693357597815, + "grad_norm": 0.4922329511547334, + "learning_rate": 8.197240797283266e-06, + "loss": 0.0238, + "step": 3069 + }, + { + "epoch": 1.3967242948134668, + "grad_norm": 0.43305804975583, + "learning_rate": 8.196141774980957e-06, + "loss": 0.0253, + "step": 3070 + }, + { + "epoch": 1.397179253867152, + "grad_norm": 0.4830253862903282, + "learning_rate": 8.195042491504373e-06, + "loss": 0.027, + "step": 3071 + }, + { + "epoch": 1.397634212920837, + "grad_norm": 0.5708344998642687, + "learning_rate": 8.193942946943348e-06, + "loss": 0.0363, + "step": 3072 + }, + { + "epoch": 1.3980891719745223, + "grad_norm": 0.5596831343818472, + "learning_rate": 8.192843141387727e-06, + "loss": 0.0334, + "step": 3073 + }, + { + "epoch": 1.3985441310282074, + "grad_norm": 0.7790965200858694, + "learning_rate": 8.191743074927385e-06, + "loss": 0.0402, + "step": 3074 + }, + { + "epoch": 1.3989990900818925, + "grad_norm": 0.3861231357505535, + "learning_rate": 8.19064274765221e-06, + "loss": 0.0266, + "step": 3075 + }, + { + "epoch": 1.3994540491355778, + "grad_norm": 0.45166903229483174, + "learning_rate": 8.189542159652122e-06, + "loss": 0.0211, + "step": 3076 + }, + { + "epoch": 1.399909008189263, + "grad_norm": 0.6031804236142754, + "learning_rate": 8.18844131101705e-06, + "loss": 0.035, + "step": 3077 + }, + { + "epoch": 1.4003639672429482, + "grad_norm": 1.3846733915903273, + "learning_rate": 8.187340201836955e-06, + "loss": 0.037, + "step": 3078 + }, + { + "epoch": 1.4008189262966333, + "grad_norm": 0.4992604754422713, + "learning_rate": 8.186238832201809e-06, + "loss": 0.0382, + "step": 3079 + }, + { + "epoch": 1.4012738853503186, + "grad_norm": 0.5098672644565495, + "learning_rate": 8.185137202201618e-06, + "loss": 0.0314, + "step": 3080 + }, + { + "epoch": 1.4017288444040037, + "grad_norm": 0.4607415273832947, + "learning_rate": 8.184035311926397e-06, + "loss": 0.0232, + "step": 3081 + }, + { + "epoch": 1.4021838034576888, + "grad_norm": 0.5536418146465356, + "learning_rate": 8.18293316146619e-06, + "loss": 0.0415, + "step": 3082 + }, + { + "epoch": 1.402638762511374, + "grad_norm": 0.5646125069514234, + "learning_rate": 8.18183075091106e-06, + "loss": 0.0274, + "step": 3083 + }, + { + "epoch": 1.4030937215650592, + "grad_norm": 0.4731283804868446, + "learning_rate": 8.18072808035109e-06, + "loss": 0.0213, + "step": 3084 + }, + { + "epoch": 1.4035486806187443, + "grad_norm": 0.6175450171343488, + "learning_rate": 8.179625149876384e-06, + "loss": 0.0425, + "step": 3085 + }, + { + "epoch": 1.4040036396724296, + "grad_norm": 0.7990078666071156, + "learning_rate": 8.178521959577069e-06, + "loss": 0.0504, + "step": 3086 + }, + { + "epoch": 1.4044585987261147, + "grad_norm": 0.4939494179351219, + "learning_rate": 8.177418509543296e-06, + "loss": 0.0348, + "step": 3087 + }, + { + "epoch": 1.4049135577797998, + "grad_norm": 0.5234647003423882, + "learning_rate": 8.17631479986523e-06, + "loss": 0.0242, + "step": 3088 + }, + { + "epoch": 1.405368516833485, + "grad_norm": 0.4543086003689767, + "learning_rate": 8.175210830633063e-06, + "loss": 0.0365, + "step": 3089 + }, + { + "epoch": 1.4058234758871702, + "grad_norm": 0.4270056688908246, + "learning_rate": 8.174106601937005e-06, + "loss": 0.0197, + "step": 3090 + }, + { + "epoch": 1.4062784349408552, + "grad_norm": 0.3545709323637256, + "learning_rate": 8.173002113867291e-06, + "loss": 0.0211, + "step": 3091 + }, + { + "epoch": 1.4067333939945406, + "grad_norm": 0.6194564321624038, + "learning_rate": 8.171897366514174e-06, + "loss": 0.0386, + "step": 3092 + }, + { + "epoch": 1.4071883530482256, + "grad_norm": 10.733523014095153, + "learning_rate": 8.170792359967926e-06, + "loss": 0.0351, + "step": 3093 + }, + { + "epoch": 1.4076433121019107, + "grad_norm": 2.954613313740392, + "learning_rate": 8.169687094318848e-06, + "loss": 0.0484, + "step": 3094 + }, + { + "epoch": 1.408098271155596, + "grad_norm": 0.47504988483410193, + "learning_rate": 8.168581569657253e-06, + "loss": 0.0293, + "step": 3095 + }, + { + "epoch": 1.4085532302092811, + "grad_norm": 0.3293476754938369, + "learning_rate": 8.167475786073483e-06, + "loss": 0.0153, + "step": 3096 + }, + { + "epoch": 1.4090081892629662, + "grad_norm": 0.4467649777544998, + "learning_rate": 8.166369743657894e-06, + "loss": 0.0246, + "step": 3097 + }, + { + "epoch": 1.4094631483166515, + "grad_norm": 0.4910843124231285, + "learning_rate": 8.165263442500869e-06, + "loss": 0.0307, + "step": 3098 + }, + { + "epoch": 1.4099181073703366, + "grad_norm": 0.45810467957956863, + "learning_rate": 8.164156882692811e-06, + "loss": 0.0234, + "step": 3099 + }, + { + "epoch": 1.4103730664240217, + "grad_norm": 0.5117781690908434, + "learning_rate": 8.16305006432414e-06, + "loss": 0.0262, + "step": 3100 + }, + { + "epoch": 1.410828025477707, + "grad_norm": 0.5143420044566899, + "learning_rate": 8.161942987485303e-06, + "loss": 0.0241, + "step": 3101 + }, + { + "epoch": 1.4112829845313921, + "grad_norm": 0.8775557486692402, + "learning_rate": 8.160835652266765e-06, + "loss": 0.036, + "step": 3102 + }, + { + "epoch": 1.4117379435850774, + "grad_norm": 0.591279163847277, + "learning_rate": 8.159728058759012e-06, + "loss": 0.0552, + "step": 3103 + }, + { + "epoch": 1.4121929026387625, + "grad_norm": 0.43025003609442686, + "learning_rate": 8.15862020705255e-06, + "loss": 0.0265, + "step": 3104 + }, + { + "epoch": 1.4126478616924476, + "grad_norm": 0.5960364379763011, + "learning_rate": 8.157512097237909e-06, + "loss": 0.029, + "step": 3105 + }, + { + "epoch": 1.413102820746133, + "grad_norm": 0.4699210144648856, + "learning_rate": 8.15640372940564e-06, + "loss": 0.0261, + "step": 3106 + }, + { + "epoch": 1.413557779799818, + "grad_norm": 0.40195772164768756, + "learning_rate": 8.15529510364631e-06, + "loss": 0.0251, + "step": 3107 + }, + { + "epoch": 1.4140127388535033, + "grad_norm": 0.38209950272348364, + "learning_rate": 8.154186220050516e-06, + "loss": 0.0342, + "step": 3108 + }, + { + "epoch": 1.4144676979071884, + "grad_norm": 0.6918635638402106, + "learning_rate": 8.153077078708867e-06, + "loss": 0.0384, + "step": 3109 + }, + { + "epoch": 1.4149226569608735, + "grad_norm": 0.564268707731014, + "learning_rate": 8.151967679711997e-06, + "loss": 0.04, + "step": 3110 + }, + { + "epoch": 1.4153776160145588, + "grad_norm": 0.4717867339000607, + "learning_rate": 8.150858023150563e-06, + "loss": 0.0366, + "step": 3111 + }, + { + "epoch": 1.415832575068244, + "grad_norm": 0.36126517477798215, + "learning_rate": 8.14974810911524e-06, + "loss": 0.0154, + "step": 3112 + }, + { + "epoch": 1.416287534121929, + "grad_norm": 0.48852954945790295, + "learning_rate": 8.148637937696728e-06, + "loss": 0.0318, + "step": 3113 + }, + { + "epoch": 1.4167424931756143, + "grad_norm": 0.698433091689678, + "learning_rate": 8.147527508985742e-06, + "loss": 0.0538, + "step": 3114 + }, + { + "epoch": 1.4171974522292994, + "grad_norm": 0.5209634235366688, + "learning_rate": 8.14641682307302e-06, + "loss": 0.0317, + "step": 3115 + }, + { + "epoch": 1.4176524112829845, + "grad_norm": 0.4461508909116904, + "learning_rate": 8.145305880049328e-06, + "loss": 0.0254, + "step": 3116 + }, + { + "epoch": 1.4181073703366698, + "grad_norm": 0.4294589539133548, + "learning_rate": 8.14419468000544e-06, + "loss": 0.0252, + "step": 3117 + }, + { + "epoch": 1.4185623293903549, + "grad_norm": 0.5390763643611722, + "learning_rate": 8.143083223032164e-06, + "loss": 0.0198, + "step": 3118 + }, + { + "epoch": 1.41901728844404, + "grad_norm": 0.6769964702086475, + "learning_rate": 8.141971509220321e-06, + "loss": 0.0417, + "step": 3119 + }, + { + "epoch": 1.4194722474977253, + "grad_norm": 0.5507224112797532, + "learning_rate": 8.140859538660755e-06, + "loss": 0.0341, + "step": 3120 + }, + { + "epoch": 1.4199272065514104, + "grad_norm": 0.5263510786416742, + "learning_rate": 8.139747311444331e-06, + "loss": 0.0263, + "step": 3121 + }, + { + "epoch": 1.4203821656050954, + "grad_norm": 0.44633836091071555, + "learning_rate": 8.138634827661936e-06, + "loss": 0.0206, + "step": 3122 + }, + { + "epoch": 1.4208371246587808, + "grad_norm": 0.4409238528743987, + "learning_rate": 8.137522087404474e-06, + "loss": 0.0235, + "step": 3123 + }, + { + "epoch": 1.4212920837124658, + "grad_norm": 0.53650644597874, + "learning_rate": 8.13640909076288e-06, + "loss": 0.0249, + "step": 3124 + }, + { + "epoch": 1.421747042766151, + "grad_norm": 0.43913353713243797, + "learning_rate": 8.135295837828097e-06, + "loss": 0.022, + "step": 3125 + }, + { + "epoch": 1.4222020018198362, + "grad_norm": 0.4411508266847209, + "learning_rate": 8.134182328691098e-06, + "loss": 0.021, + "step": 3126 + }, + { + "epoch": 1.4226569608735213, + "grad_norm": 0.5580383577936809, + "learning_rate": 8.133068563442873e-06, + "loss": 0.0253, + "step": 3127 + }, + { + "epoch": 1.4231119199272064, + "grad_norm": 0.6159942959143841, + "learning_rate": 8.131954542174433e-06, + "loss": 0.0369, + "step": 3128 + }, + { + "epoch": 1.4235668789808917, + "grad_norm": 0.5518845413386557, + "learning_rate": 8.130840264976812e-06, + "loss": 0.0318, + "step": 3129 + }, + { + "epoch": 1.4240218380345768, + "grad_norm": 0.5260026586781827, + "learning_rate": 8.129725731941063e-06, + "loss": 0.0276, + "step": 3130 + }, + { + "epoch": 1.4244767970882621, + "grad_norm": 0.5177138558113535, + "learning_rate": 8.128610943158262e-06, + "loss": 0.0236, + "step": 3131 + }, + { + "epoch": 1.4249317561419472, + "grad_norm": 0.552369709761377, + "learning_rate": 8.127495898719502e-06, + "loss": 0.0341, + "step": 3132 + }, + { + "epoch": 1.4253867151956323, + "grad_norm": 0.5264921326858812, + "learning_rate": 8.126380598715902e-06, + "loss": 0.0342, + "step": 3133 + }, + { + "epoch": 1.4258416742493176, + "grad_norm": 0.9269763934758377, + "learning_rate": 8.125265043238597e-06, + "loss": 0.0531, + "step": 3134 + }, + { + "epoch": 1.4262966333030027, + "grad_norm": 0.9603254771666904, + "learning_rate": 8.124149232378747e-06, + "loss": 0.0417, + "step": 3135 + }, + { + "epoch": 1.426751592356688, + "grad_norm": 0.6470486278536731, + "learning_rate": 8.12303316622753e-06, + "loss": 0.0405, + "step": 3136 + }, + { + "epoch": 1.4272065514103731, + "grad_norm": 0.48201293664894446, + "learning_rate": 8.121916844876145e-06, + "loss": 0.0321, + "step": 3137 + }, + { + "epoch": 1.4276615104640582, + "grad_norm": 0.5872475622012117, + "learning_rate": 8.120800268415815e-06, + "loss": 0.0326, + "step": 3138 + }, + { + "epoch": 1.4281164695177435, + "grad_norm": 0.5576014328083249, + "learning_rate": 8.11968343693778e-06, + "loss": 0.0245, + "step": 3139 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.6302514250098855, + "learning_rate": 8.118566350533304e-06, + "loss": 0.0478, + "step": 3140 + }, + { + "epoch": 1.4290263876251137, + "grad_norm": 0.6141352095304792, + "learning_rate": 8.117449009293668e-06, + "loss": 0.0341, + "step": 3141 + }, + { + "epoch": 1.429481346678799, + "grad_norm": 0.5457299413412613, + "learning_rate": 8.116331413310178e-06, + "loss": 0.0282, + "step": 3142 + }, + { + "epoch": 1.429936305732484, + "grad_norm": 0.5791172434046514, + "learning_rate": 8.115213562674158e-06, + "loss": 0.0345, + "step": 3143 + }, + { + "epoch": 1.4303912647861692, + "grad_norm": 0.5478313762128336, + "learning_rate": 8.114095457476954e-06, + "loss": 0.0316, + "step": 3144 + }, + { + "epoch": 1.4308462238398545, + "grad_norm": 0.9292226056717331, + "learning_rate": 8.112977097809932e-06, + "loss": 0.0642, + "step": 3145 + }, + { + "epoch": 1.4313011828935396, + "grad_norm": 0.6165602818367223, + "learning_rate": 8.111858483764478e-06, + "loss": 0.0406, + "step": 3146 + }, + { + "epoch": 1.4317561419472247, + "grad_norm": 0.5589066346288158, + "learning_rate": 8.110739615432005e-06, + "loss": 0.0309, + "step": 3147 + }, + { + "epoch": 1.43221110100091, + "grad_norm": 0.5221994826214766, + "learning_rate": 8.109620492903938e-06, + "loss": 0.0426, + "step": 3148 + }, + { + "epoch": 1.432666060054595, + "grad_norm": 0.6643254697112511, + "learning_rate": 8.108501116271725e-06, + "loss": 0.0449, + "step": 3149 + }, + { + "epoch": 1.4331210191082802, + "grad_norm": 0.40499081748743093, + "learning_rate": 8.10738148562684e-06, + "loss": 0.0259, + "step": 3150 + }, + { + "epoch": 1.4335759781619655, + "grad_norm": 0.5319408589709967, + "learning_rate": 8.106261601060773e-06, + "loss": 0.0235, + "step": 3151 + }, + { + "epoch": 1.4340309372156506, + "grad_norm": 0.502676216768999, + "learning_rate": 8.105141462665036e-06, + "loss": 0.029, + "step": 3152 + }, + { + "epoch": 1.4344858962693356, + "grad_norm": 0.4708835121259622, + "learning_rate": 8.104021070531161e-06, + "loss": 0.0291, + "step": 3153 + }, + { + "epoch": 1.434940855323021, + "grad_norm": 0.6763569542098118, + "learning_rate": 8.102900424750702e-06, + "loss": 0.0435, + "step": 3154 + }, + { + "epoch": 1.435395814376706, + "grad_norm": 0.599568518154828, + "learning_rate": 8.101779525415232e-06, + "loss": 0.0366, + "step": 3155 + }, + { + "epoch": 1.4358507734303911, + "grad_norm": 0.4968849199855825, + "learning_rate": 8.100658372616346e-06, + "loss": 0.0324, + "step": 3156 + }, + { + "epoch": 1.4363057324840764, + "grad_norm": 0.6290729255135113, + "learning_rate": 8.099536966445661e-06, + "loss": 0.0323, + "step": 3157 + }, + { + "epoch": 1.4367606915377615, + "grad_norm": 0.6666747225975457, + "learning_rate": 8.098415306994813e-06, + "loss": 0.0443, + "step": 3158 + }, + { + "epoch": 1.4372156505914468, + "grad_norm": 0.7245033011103253, + "learning_rate": 8.097293394355459e-06, + "loss": 0.0516, + "step": 3159 + }, + { + "epoch": 1.437670609645132, + "grad_norm": 0.42971274750201555, + "learning_rate": 8.096171228619276e-06, + "loss": 0.0247, + "step": 3160 + }, + { + "epoch": 1.438125568698817, + "grad_norm": 0.5833567897257396, + "learning_rate": 8.095048809877961e-06, + "loss": 0.0294, + "step": 3161 + }, + { + "epoch": 1.4385805277525023, + "grad_norm": 0.6728067449458445, + "learning_rate": 8.093926138223234e-06, + "loss": 0.0495, + "step": 3162 + }, + { + "epoch": 1.4390354868061874, + "grad_norm": 0.6121920243772246, + "learning_rate": 8.092803213746838e-06, + "loss": 0.0418, + "step": 3163 + }, + { + "epoch": 1.4394904458598727, + "grad_norm": 0.42743590888197097, + "learning_rate": 8.091680036540528e-06, + "loss": 0.0279, + "step": 3164 + }, + { + "epoch": 1.4399454049135578, + "grad_norm": 0.4126243156879265, + "learning_rate": 8.090556606696088e-06, + "loss": 0.0223, + "step": 3165 + }, + { + "epoch": 1.440400363967243, + "grad_norm": 0.5217770481149632, + "learning_rate": 8.089432924305319e-06, + "loss": 0.0265, + "step": 3166 + }, + { + "epoch": 1.4408553230209282, + "grad_norm": 0.3749847244592466, + "learning_rate": 8.08830898946004e-06, + "loss": 0.0246, + "step": 3167 + }, + { + "epoch": 1.4413102820746133, + "grad_norm": 0.5165056627108671, + "learning_rate": 8.087184802252102e-06, + "loss": 0.0314, + "step": 3168 + }, + { + "epoch": 1.4417652411282984, + "grad_norm": 0.523636258278207, + "learning_rate": 8.086060362773362e-06, + "loss": 0.0325, + "step": 3169 + }, + { + "epoch": 1.4422202001819837, + "grad_norm": 0.6072272924301817, + "learning_rate": 8.084935671115705e-06, + "loss": 0.0219, + "step": 3170 + }, + { + "epoch": 1.4426751592356688, + "grad_norm": 0.525973990821791, + "learning_rate": 8.083810727371037e-06, + "loss": 0.0318, + "step": 3171 + }, + { + "epoch": 1.443130118289354, + "grad_norm": 0.4683061240213502, + "learning_rate": 8.082685531631282e-06, + "loss": 0.0319, + "step": 3172 + }, + { + "epoch": 1.4435850773430392, + "grad_norm": 0.8031318133945788, + "learning_rate": 8.081560083988387e-06, + "loss": 0.0347, + "step": 3173 + }, + { + "epoch": 1.4440400363967243, + "grad_norm": 0.5449225628318263, + "learning_rate": 8.080434384534318e-06, + "loss": 0.0396, + "step": 3174 + }, + { + "epoch": 1.4444949954504094, + "grad_norm": 0.42918723160148686, + "learning_rate": 8.07930843336106e-06, + "loss": 0.0221, + "step": 3175 + }, + { + "epoch": 1.4449499545040947, + "grad_norm": 0.6193328526079885, + "learning_rate": 8.078182230560628e-06, + "loss": 0.0367, + "step": 3176 + }, + { + "epoch": 1.4454049135577798, + "grad_norm": 0.5347432504833302, + "learning_rate": 8.077055776225041e-06, + "loss": 0.0329, + "step": 3177 + }, + { + "epoch": 1.4458598726114649, + "grad_norm": 0.46998578040146427, + "learning_rate": 8.075929070446354e-06, + "loss": 0.021, + "step": 3178 + }, + { + "epoch": 1.4463148316651502, + "grad_norm": 0.4749547726682001, + "learning_rate": 8.074802113316633e-06, + "loss": 0.032, + "step": 3179 + }, + { + "epoch": 1.4467697907188353, + "grad_norm": 0.4196630072378444, + "learning_rate": 8.07367490492797e-06, + "loss": 0.0181, + "step": 3180 + }, + { + "epoch": 1.4472247497725204, + "grad_norm": 0.5737177118299412, + "learning_rate": 8.072547445372471e-06, + "loss": 0.0391, + "step": 3181 + }, + { + "epoch": 1.4476797088262057, + "grad_norm": 0.6430451121317332, + "learning_rate": 8.071419734742275e-06, + "loss": 0.0395, + "step": 3182 + }, + { + "epoch": 1.4481346678798908, + "grad_norm": 0.4956926457176075, + "learning_rate": 8.070291773129526e-06, + "loss": 0.0307, + "step": 3183 + }, + { + "epoch": 1.4485896269335758, + "grad_norm": 0.6279572303428038, + "learning_rate": 8.0691635606264e-06, + "loss": 0.0347, + "step": 3184 + }, + { + "epoch": 1.4490445859872612, + "grad_norm": 0.5041410724529021, + "learning_rate": 8.068035097325087e-06, + "loss": 0.0308, + "step": 3185 + }, + { + "epoch": 1.4494995450409462, + "grad_norm": 0.598159207447712, + "learning_rate": 8.066906383317801e-06, + "loss": 0.0365, + "step": 3186 + }, + { + "epoch": 1.4499545040946316, + "grad_norm": 0.7495960095780199, + "learning_rate": 8.065777418696775e-06, + "loss": 0.0397, + "step": 3187 + }, + { + "epoch": 1.4504094631483166, + "grad_norm": 0.6239207793483897, + "learning_rate": 8.064648203554264e-06, + "loss": 0.0333, + "step": 3188 + }, + { + "epoch": 1.450864422202002, + "grad_norm": 0.6099069881067322, + "learning_rate": 8.06351873798254e-06, + "loss": 0.0447, + "step": 3189 + }, + { + "epoch": 1.451319381255687, + "grad_norm": 0.3679602053637268, + "learning_rate": 8.062389022073901e-06, + "loss": 0.0227, + "step": 3190 + }, + { + "epoch": 1.4517743403093721, + "grad_norm": 0.5180394703575767, + "learning_rate": 8.061259055920661e-06, + "loss": 0.0265, + "step": 3191 + }, + { + "epoch": 1.4522292993630574, + "grad_norm": 0.5464173471525077, + "learning_rate": 8.060128839615155e-06, + "loss": 0.0358, + "step": 3192 + }, + { + "epoch": 1.4526842584167425, + "grad_norm": 0.5864419383217762, + "learning_rate": 8.05899837324974e-06, + "loss": 0.0329, + "step": 3193 + }, + { + "epoch": 1.4531392174704276, + "grad_norm": 0.5545494776633784, + "learning_rate": 8.057867656916793e-06, + "loss": 0.0324, + "step": 3194 + }, + { + "epoch": 1.453594176524113, + "grad_norm": 0.44836014586822326, + "learning_rate": 8.05673669070871e-06, + "loss": 0.0308, + "step": 3195 + }, + { + "epoch": 1.454049135577798, + "grad_norm": 0.6284948960555835, + "learning_rate": 8.055605474717908e-06, + "loss": 0.0432, + "step": 3196 + }, + { + "epoch": 1.4545040946314831, + "grad_norm": 0.3707138957956115, + "learning_rate": 8.054474009036826e-06, + "loss": 0.0145, + "step": 3197 + }, + { + "epoch": 1.4549590536851684, + "grad_norm": 0.3239258824372849, + "learning_rate": 8.05334229375792e-06, + "loss": 0.0199, + "step": 3198 + }, + { + "epoch": 1.4554140127388535, + "grad_norm": 0.5079580495736739, + "learning_rate": 8.052210328973673e-06, + "loss": 0.0318, + "step": 3199 + }, + { + "epoch": 1.4558689717925386, + "grad_norm": 0.5104623998753726, + "learning_rate": 8.051078114776581e-06, + "loss": 0.0246, + "step": 3200 + }, + { + "epoch": 1.456323930846224, + "grad_norm": 0.49461842129465483, + "learning_rate": 8.049945651259163e-06, + "loss": 0.0199, + "step": 3201 + }, + { + "epoch": 1.456778889899909, + "grad_norm": 0.6851413999422369, + "learning_rate": 8.048812938513958e-06, + "loss": 0.0437, + "step": 3202 + }, + { + "epoch": 1.457233848953594, + "grad_norm": 0.4560870320885985, + "learning_rate": 8.047679976633532e-06, + "loss": 0.0241, + "step": 3203 + }, + { + "epoch": 1.4576888080072794, + "grad_norm": 0.5352288092695029, + "learning_rate": 8.04654676571046e-06, + "loss": 0.0426, + "step": 3204 + }, + { + "epoch": 1.4581437670609645, + "grad_norm": 0.6643419689254415, + "learning_rate": 8.045413305837344e-06, + "loss": 0.033, + "step": 3205 + }, + { + "epoch": 1.4585987261146496, + "grad_norm": 0.43853553587407185, + "learning_rate": 8.044279597106807e-06, + "loss": 0.0361, + "step": 3206 + }, + { + "epoch": 1.459053685168335, + "grad_norm": 0.4961349795414473, + "learning_rate": 8.043145639611488e-06, + "loss": 0.03, + "step": 3207 + }, + { + "epoch": 1.45950864422202, + "grad_norm": 0.4711729889421299, + "learning_rate": 8.04201143344405e-06, + "loss": 0.022, + "step": 3208 + }, + { + "epoch": 1.459963603275705, + "grad_norm": 0.363199677939404, + "learning_rate": 8.040876978697174e-06, + "loss": 0.0236, + "step": 3209 + }, + { + "epoch": 1.4604185623293904, + "grad_norm": 0.38604446366020234, + "learning_rate": 8.039742275463566e-06, + "loss": 0.0226, + "step": 3210 + }, + { + "epoch": 1.4608735213830755, + "grad_norm": 0.45765271075247205, + "learning_rate": 8.038607323835946e-06, + "loss": 0.025, + "step": 3211 + }, + { + "epoch": 1.4613284804367606, + "grad_norm": 0.4395685757788466, + "learning_rate": 8.037472123907058e-06, + "loss": 0.0206, + "step": 3212 + }, + { + "epoch": 1.4617834394904459, + "grad_norm": 0.6738624693889801, + "learning_rate": 8.036336675769665e-06, + "loss": 0.042, + "step": 3213 + }, + { + "epoch": 1.462238398544131, + "grad_norm": 0.5686179308511357, + "learning_rate": 8.03520097951655e-06, + "loss": 0.0381, + "step": 3214 + }, + { + "epoch": 1.4626933575978163, + "grad_norm": 0.4915965693261835, + "learning_rate": 8.034065035240519e-06, + "loss": 0.0332, + "step": 3215 + }, + { + "epoch": 1.4631483166515014, + "grad_norm": 0.4051092749550177, + "learning_rate": 8.032928843034393e-06, + "loss": 0.0197, + "step": 3216 + }, + { + "epoch": 1.4636032757051867, + "grad_norm": 0.6425143148265581, + "learning_rate": 8.031792402991022e-06, + "loss": 0.0466, + "step": 3217 + }, + { + "epoch": 1.4640582347588718, + "grad_norm": 0.5423117548064644, + "learning_rate": 8.030655715203265e-06, + "loss": 0.0309, + "step": 3218 + }, + { + "epoch": 1.4645131938125568, + "grad_norm": 0.5826622186708784, + "learning_rate": 8.029518779764007e-06, + "loss": 0.0438, + "step": 3219 + }, + { + "epoch": 1.4649681528662422, + "grad_norm": 0.5576538577521253, + "learning_rate": 8.028381596766159e-06, + "loss": 0.0373, + "step": 3220 + }, + { + "epoch": 1.4654231119199272, + "grad_norm": 0.6507764614874492, + "learning_rate": 8.027244166302641e-06, + "loss": 0.0328, + "step": 3221 + }, + { + "epoch": 1.4658780709736123, + "grad_norm": 0.46590511948446056, + "learning_rate": 8.026106488466403e-06, + "loss": 0.037, + "step": 3222 + }, + { + "epoch": 1.4663330300272976, + "grad_norm": 0.3698047411401823, + "learning_rate": 8.024968563350406e-06, + "loss": 0.0159, + "step": 3223 + }, + { + "epoch": 1.4667879890809827, + "grad_norm": 0.5475583937083369, + "learning_rate": 8.02383039104764e-06, + "loss": 0.0343, + "step": 3224 + }, + { + "epoch": 1.4672429481346678, + "grad_norm": 0.967666097714969, + "learning_rate": 8.02269197165111e-06, + "loss": 0.0493, + "step": 3225 + }, + { + "epoch": 1.4676979071883531, + "grad_norm": 0.530611866566285, + "learning_rate": 8.021553305253841e-06, + "loss": 0.0342, + "step": 3226 + }, + { + "epoch": 1.4681528662420382, + "grad_norm": 0.36365102789295684, + "learning_rate": 8.020414391948882e-06, + "loss": 0.0197, + "step": 3227 + }, + { + "epoch": 1.4686078252957233, + "grad_norm": 0.786177107653995, + "learning_rate": 8.0192752318293e-06, + "loss": 0.0462, + "step": 3228 + }, + { + "epoch": 1.4690627843494086, + "grad_norm": 0.4832882680186476, + "learning_rate": 8.01813582498818e-06, + "loss": 0.0322, + "step": 3229 + }, + { + "epoch": 1.4695177434030937, + "grad_norm": 0.5440743945002253, + "learning_rate": 8.01699617151863e-06, + "loss": 0.041, + "step": 3230 + }, + { + "epoch": 1.4699727024567788, + "grad_norm": 0.45934929798428287, + "learning_rate": 8.015856271513777e-06, + "loss": 0.0244, + "step": 3231 + }, + { + "epoch": 1.4704276615104641, + "grad_norm": 0.5331795218085696, + "learning_rate": 8.014716125066771e-06, + "loss": 0.0342, + "step": 3232 + }, + { + "epoch": 1.4708826205641492, + "grad_norm": 0.6946319306233284, + "learning_rate": 8.013575732270775e-06, + "loss": 0.0543, + "step": 3233 + }, + { + "epoch": 1.4713375796178343, + "grad_norm": 0.41286436665712734, + "learning_rate": 8.012435093218982e-06, + "loss": 0.0238, + "step": 3234 + }, + { + "epoch": 1.4717925386715196, + "grad_norm": 0.5015834867051956, + "learning_rate": 8.011294208004596e-06, + "loss": 0.0303, + "step": 3235 + }, + { + "epoch": 1.4722474977252047, + "grad_norm": 0.346012187808125, + "learning_rate": 8.010153076720848e-06, + "loss": 0.0269, + "step": 3236 + }, + { + "epoch": 1.4727024567788898, + "grad_norm": 0.44295027382591184, + "learning_rate": 8.00901169946098e-06, + "loss": 0.034, + "step": 3237 + }, + { + "epoch": 1.473157415832575, + "grad_norm": 0.514028689971606, + "learning_rate": 8.007870076318268e-06, + "loss": 0.0337, + "step": 3238 + }, + { + "epoch": 1.4736123748862602, + "grad_norm": 0.614308759846028, + "learning_rate": 8.006728207385996e-06, + "loss": 0.0412, + "step": 3239 + }, + { + "epoch": 1.4740673339399453, + "grad_norm": 0.30523726833683307, + "learning_rate": 8.005586092757472e-06, + "loss": 0.0162, + "step": 3240 + }, + { + "epoch": 1.4745222929936306, + "grad_norm": 0.638750228351473, + "learning_rate": 8.004443732526026e-06, + "loss": 0.0364, + "step": 3241 + }, + { + "epoch": 1.4749772520473157, + "grad_norm": 0.6238842384536734, + "learning_rate": 8.003301126785007e-06, + "loss": 0.039, + "step": 3242 + }, + { + "epoch": 1.475432211101001, + "grad_norm": 0.4763894780054722, + "learning_rate": 8.002158275627783e-06, + "loss": 0.0303, + "step": 3243 + }, + { + "epoch": 1.475887170154686, + "grad_norm": 0.5018596408732007, + "learning_rate": 8.00101517914774e-06, + "loss": 0.0288, + "step": 3244 + }, + { + "epoch": 1.4763421292083714, + "grad_norm": 0.7564418877944006, + "learning_rate": 7.999871837438292e-06, + "loss": 0.0561, + "step": 3245 + }, + { + "epoch": 1.4767970882620565, + "grad_norm": 0.49218467944499245, + "learning_rate": 7.998728250592865e-06, + "loss": 0.0259, + "step": 3246 + }, + { + "epoch": 1.4772520473157416, + "grad_norm": 0.5418004463760446, + "learning_rate": 7.997584418704905e-06, + "loss": 0.0325, + "step": 3247 + }, + { + "epoch": 1.4777070063694269, + "grad_norm": 0.5379198820020985, + "learning_rate": 7.996440341867884e-06, + "loss": 0.0342, + "step": 3248 + }, + { + "epoch": 1.478161965423112, + "grad_norm": 0.3534734720936311, + "learning_rate": 7.99529602017529e-06, + "loss": 0.0165, + "step": 3249 + }, + { + "epoch": 1.478616924476797, + "grad_norm": 0.5380771099198766, + "learning_rate": 7.994151453720632e-06, + "loss": 0.0285, + "step": 3250 + }, + { + "epoch": 1.4790718835304824, + "grad_norm": 0.5333004127188998, + "learning_rate": 7.993006642597438e-06, + "loss": 0.0331, + "step": 3251 + }, + { + "epoch": 1.4795268425841674, + "grad_norm": 0.6692517171982048, + "learning_rate": 7.991861586899258e-06, + "loss": 0.0549, + "step": 3252 + }, + { + "epoch": 1.4799818016378525, + "grad_norm": 0.6481381791067767, + "learning_rate": 7.990716286719662e-06, + "loss": 0.0299, + "step": 3253 + }, + { + "epoch": 1.4804367606915378, + "grad_norm": 0.4516323080619454, + "learning_rate": 7.989570742152235e-06, + "loss": 0.0189, + "step": 3254 + }, + { + "epoch": 1.480891719745223, + "grad_norm": 0.40638032731762097, + "learning_rate": 7.988424953290588e-06, + "loss": 0.0296, + "step": 3255 + }, + { + "epoch": 1.481346678798908, + "grad_norm": 0.597331728437238, + "learning_rate": 7.98727892022835e-06, + "loss": 0.041, + "step": 3256 + }, + { + "epoch": 1.4818016378525933, + "grad_norm": 0.4627299367009861, + "learning_rate": 7.986132643059169e-06, + "loss": 0.0274, + "step": 3257 + }, + { + "epoch": 1.4822565969062784, + "grad_norm": 0.6031029641496045, + "learning_rate": 7.984986121876714e-06, + "loss": 0.0401, + "step": 3258 + }, + { + "epoch": 1.4827115559599635, + "grad_norm": 0.547521201895844, + "learning_rate": 7.983839356774671e-06, + "loss": 0.0396, + "step": 3259 + }, + { + "epoch": 1.4831665150136488, + "grad_norm": 0.866003940880254, + "learning_rate": 7.982692347846755e-06, + "loss": 0.0359, + "step": 3260 + }, + { + "epoch": 1.483621474067334, + "grad_norm": 0.4726902171500864, + "learning_rate": 7.981545095186684e-06, + "loss": 0.0204, + "step": 3261 + }, + { + "epoch": 1.484076433121019, + "grad_norm": 0.8705680413739901, + "learning_rate": 7.980397598888217e-06, + "loss": 0.0633, + "step": 3262 + }, + { + "epoch": 1.4845313921747043, + "grad_norm": 0.570357211847994, + "learning_rate": 7.979249859045117e-06, + "loss": 0.0315, + "step": 3263 + }, + { + "epoch": 1.4849863512283894, + "grad_norm": 0.40815032752344726, + "learning_rate": 7.978101875751173e-06, + "loss": 0.0212, + "step": 3264 + }, + { + "epoch": 1.4854413102820745, + "grad_norm": 0.4310123367818552, + "learning_rate": 7.97695364910019e-06, + "loss": 0.0303, + "step": 3265 + }, + { + "epoch": 1.4858962693357598, + "grad_norm": 0.8088215943900415, + "learning_rate": 7.975805179186001e-06, + "loss": 0.0515, + "step": 3266 + }, + { + "epoch": 1.486351228389445, + "grad_norm": 0.7314527739715035, + "learning_rate": 7.97465646610245e-06, + "loss": 0.0422, + "step": 3267 + }, + { + "epoch": 1.4868061874431302, + "grad_norm": 0.7721966723010093, + "learning_rate": 7.973507509943406e-06, + "loss": 0.0479, + "step": 3268 + }, + { + "epoch": 1.4872611464968153, + "grad_norm": 0.47220931052176524, + "learning_rate": 7.972358310802758e-06, + "loss": 0.0286, + "step": 3269 + }, + { + "epoch": 1.4877161055505004, + "grad_norm": 0.5764503158465304, + "learning_rate": 7.971208868774412e-06, + "loss": 0.0465, + "step": 3270 + }, + { + "epoch": 1.4881710646041857, + "grad_norm": 1.0302779784470453, + "learning_rate": 7.970059183952295e-06, + "loss": 0.0427, + "step": 3271 + }, + { + "epoch": 1.4886260236578708, + "grad_norm": 5.82527427597862, + "learning_rate": 7.968909256430352e-06, + "loss": 0.0953, + "step": 3272 + }, + { + "epoch": 1.489080982711556, + "grad_norm": 0.6281307778669729, + "learning_rate": 7.967759086302554e-06, + "loss": 0.0432, + "step": 3273 + }, + { + "epoch": 1.4895359417652412, + "grad_norm": 0.4654685533789216, + "learning_rate": 7.966608673662885e-06, + "loss": 0.0189, + "step": 3274 + }, + { + "epoch": 1.4899909008189263, + "grad_norm": 0.5419348533302625, + "learning_rate": 7.965458018605352e-06, + "loss": 0.0366, + "step": 3275 + }, + { + "epoch": 1.4904458598726116, + "grad_norm": 0.6193685619288958, + "learning_rate": 7.964307121223983e-06, + "loss": 0.0419, + "step": 3276 + }, + { + "epoch": 1.4909008189262967, + "grad_norm": 0.4517996667707405, + "learning_rate": 7.96315598161282e-06, + "loss": 0.0257, + "step": 3277 + }, + { + "epoch": 1.4913557779799818, + "grad_norm": 0.9132409180267741, + "learning_rate": 7.962004599865935e-06, + "loss": 0.0601, + "step": 3278 + }, + { + "epoch": 1.491810737033667, + "grad_norm": 0.41324571059066195, + "learning_rate": 7.960852976077406e-06, + "loss": 0.0259, + "step": 3279 + }, + { + "epoch": 1.4922656960873522, + "grad_norm": 0.6733326146756219, + "learning_rate": 7.959701110341346e-06, + "loss": 0.0419, + "step": 3280 + }, + { + "epoch": 1.4927206551410372, + "grad_norm": 0.5701229927572675, + "learning_rate": 7.958549002751879e-06, + "loss": 0.0418, + "step": 3281 + }, + { + "epoch": 1.4931756141947226, + "grad_norm": 0.7607287312568358, + "learning_rate": 7.957396653403145e-06, + "loss": 0.0552, + "step": 3282 + }, + { + "epoch": 1.4936305732484076, + "grad_norm": 0.7579360335222567, + "learning_rate": 7.956244062389313e-06, + "loss": 0.0418, + "step": 3283 + }, + { + "epoch": 1.4940855323020927, + "grad_norm": 0.6259733792239423, + "learning_rate": 7.955091229804568e-06, + "loss": 0.0347, + "step": 3284 + }, + { + "epoch": 1.494540491355778, + "grad_norm": 0.44634359243758226, + "learning_rate": 7.95393815574311e-06, + "loss": 0.0247, + "step": 3285 + }, + { + "epoch": 1.4949954504094631, + "grad_norm": 0.6518710394549171, + "learning_rate": 7.952784840299166e-06, + "loss": 0.0385, + "step": 3286 + }, + { + "epoch": 1.4954504094631482, + "grad_norm": 0.42025419748462395, + "learning_rate": 7.951631283566981e-06, + "loss": 0.0145, + "step": 3287 + }, + { + "epoch": 1.4959053685168335, + "grad_norm": 0.5219400243782572, + "learning_rate": 7.950477485640818e-06, + "loss": 0.0369, + "step": 3288 + }, + { + "epoch": 1.4963603275705186, + "grad_norm": 0.5014319884220756, + "learning_rate": 7.949323446614957e-06, + "loss": 0.0267, + "step": 3289 + }, + { + "epoch": 1.4968152866242037, + "grad_norm": 0.7694796958332192, + "learning_rate": 7.948169166583703e-06, + "loss": 0.049, + "step": 3290 + }, + { + "epoch": 1.497270245677889, + "grad_norm": 0.5816908738213662, + "learning_rate": 7.94701464564138e-06, + "loss": 0.0275, + "step": 3291 + }, + { + "epoch": 1.4977252047315741, + "grad_norm": 0.5076527248188425, + "learning_rate": 7.945859883882327e-06, + "loss": 0.0274, + "step": 3292 + }, + { + "epoch": 1.4981801637852592, + "grad_norm": 0.6039997527684581, + "learning_rate": 7.94470488140091e-06, + "loss": 0.0378, + "step": 3293 + }, + { + "epoch": 1.4986351228389445, + "grad_norm": 0.592557887847226, + "learning_rate": 7.943549638291507e-06, + "loss": 0.0301, + "step": 3294 + }, + { + "epoch": 1.4990900818926296, + "grad_norm": 0.5313149152774042, + "learning_rate": 7.94239415464852e-06, + "loss": 0.0373, + "step": 3295 + }, + { + "epoch": 1.499545040946315, + "grad_norm": 0.5031451885132354, + "learning_rate": 7.94123843056637e-06, + "loss": 0.036, + "step": 3296 + }, + { + "epoch": 1.5, + "grad_norm": 0.5804403409340608, + "learning_rate": 7.9400824661395e-06, + "loss": 0.0379, + "step": 3297 + }, + { + "epoch": 1.5004549590536853, + "grad_norm": 0.45207731359234127, + "learning_rate": 7.938926261462366e-06, + "loss": 0.0253, + "step": 3298 + }, + { + "epoch": 1.5009099181073702, + "grad_norm": 0.7319323709083051, + "learning_rate": 7.93776981662945e-06, + "loss": 0.0427, + "step": 3299 + }, + { + "epoch": 1.5013648771610555, + "grad_norm": 0.5950172451800795, + "learning_rate": 7.936613131735253e-06, + "loss": 0.026, + "step": 3300 + }, + { + "epoch": 1.5018198362147408, + "grad_norm": 0.5184269203338593, + "learning_rate": 7.935456206874292e-06, + "loss": 0.0236, + "step": 3301 + }, + { + "epoch": 1.5022747952684259, + "grad_norm": 0.45332160585967457, + "learning_rate": 7.934299042141107e-06, + "loss": 0.0289, + "step": 3302 + }, + { + "epoch": 1.502729754322111, + "grad_norm": 0.4539427619968088, + "learning_rate": 7.933141637630252e-06, + "loss": 0.0228, + "step": 3303 + }, + { + "epoch": 1.5031847133757963, + "grad_norm": 0.6349022485227811, + "learning_rate": 7.931983993436312e-06, + "loss": 0.0441, + "step": 3304 + }, + { + "epoch": 1.5036396724294814, + "grad_norm": 0.5901676639230142, + "learning_rate": 7.930826109653882e-06, + "loss": 0.0315, + "step": 3305 + }, + { + "epoch": 1.5040946314831665, + "grad_norm": 0.44019191896156384, + "learning_rate": 7.929667986377574e-06, + "loss": 0.0241, + "step": 3306 + }, + { + "epoch": 1.5045495905368518, + "grad_norm": 0.849223492428955, + "learning_rate": 7.92850962370203e-06, + "loss": 0.0672, + "step": 3307 + }, + { + "epoch": 1.5050045495905369, + "grad_norm": 0.6138944480286226, + "learning_rate": 7.927351021721905e-06, + "loss": 0.0475, + "step": 3308 + }, + { + "epoch": 1.505459508644222, + "grad_norm": 0.5308981954331117, + "learning_rate": 7.926192180531873e-06, + "loss": 0.0328, + "step": 3309 + }, + { + "epoch": 1.5059144676979073, + "grad_norm": 0.45288398749279407, + "learning_rate": 7.925033100226632e-06, + "loss": 0.0258, + "step": 3310 + }, + { + "epoch": 1.5063694267515924, + "grad_norm": 0.6804917839354758, + "learning_rate": 7.923873780900894e-06, + "loss": 0.0519, + "step": 3311 + }, + { + "epoch": 1.5068243858052774, + "grad_norm": 0.6877383357614162, + "learning_rate": 7.922714222649394e-06, + "loss": 0.0461, + "step": 3312 + }, + { + "epoch": 1.5072793448589628, + "grad_norm": 0.37307848806453825, + "learning_rate": 7.92155442556689e-06, + "loss": 0.0169, + "step": 3313 + }, + { + "epoch": 1.5077343039126478, + "grad_norm": 0.4649246667832013, + "learning_rate": 7.92039438974815e-06, + "loss": 0.0246, + "step": 3314 + }, + { + "epoch": 1.508189262966333, + "grad_norm": 0.5797951905427798, + "learning_rate": 7.919234115287969e-06, + "loss": 0.0264, + "step": 3315 + }, + { + "epoch": 1.5086442220200182, + "grad_norm": 0.5999502093130843, + "learning_rate": 7.918073602281158e-06, + "loss": 0.0408, + "step": 3316 + }, + { + "epoch": 1.5090991810737033, + "grad_norm": 0.37444817465605024, + "learning_rate": 7.91691285082255e-06, + "loss": 0.0199, + "step": 3317 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 0.43571431795450083, + "learning_rate": 7.915751861007e-06, + "loss": 0.0311, + "step": 3318 + }, + { + "epoch": 1.5100090991810737, + "grad_norm": 0.5718039363551488, + "learning_rate": 7.914590632929372e-06, + "loss": 0.0398, + "step": 3319 + }, + { + "epoch": 1.5104640582347588, + "grad_norm": 0.5094342970442877, + "learning_rate": 7.913429166684561e-06, + "loss": 0.0379, + "step": 3320 + }, + { + "epoch": 1.510919017288444, + "grad_norm": 0.5242594744129395, + "learning_rate": 7.912267462367473e-06, + "loss": 0.026, + "step": 3321 + }, + { + "epoch": 1.5113739763421292, + "grad_norm": 0.48884902053211027, + "learning_rate": 7.911105520073044e-06, + "loss": 0.0306, + "step": 3322 + }, + { + "epoch": 1.5118289353958145, + "grad_norm": 0.7394168313167301, + "learning_rate": 7.909943339896215e-06, + "loss": 0.0479, + "step": 3323 + }, + { + "epoch": 1.5122838944494994, + "grad_norm": 0.6469513917525913, + "learning_rate": 7.908780921931957e-06, + "loss": 0.0371, + "step": 3324 + }, + { + "epoch": 1.5127388535031847, + "grad_norm": 0.8592861455834359, + "learning_rate": 7.90761826627526e-06, + "loss": 0.0484, + "step": 3325 + }, + { + "epoch": 1.51319381255687, + "grad_norm": 0.42257868990927616, + "learning_rate": 7.90645537302113e-06, + "loss": 0.0232, + "step": 3326 + }, + { + "epoch": 1.5136487716105549, + "grad_norm": 0.45779799028489876, + "learning_rate": 7.905292242264591e-06, + "loss": 0.0278, + "step": 3327 + }, + { + "epoch": 1.5141037306642402, + "grad_norm": 0.5433890115321734, + "learning_rate": 7.904128874100689e-06, + "loss": 0.0365, + "step": 3328 + }, + { + "epoch": 1.5145586897179255, + "grad_norm": 0.5494344731912977, + "learning_rate": 7.902965268624491e-06, + "loss": 0.0282, + "step": 3329 + }, + { + "epoch": 1.5150136487716106, + "grad_norm": 0.6130934963882723, + "learning_rate": 7.901801425931082e-06, + "loss": 0.0424, + "step": 3330 + }, + { + "epoch": 1.5154686078252957, + "grad_norm": 0.589889212368702, + "learning_rate": 7.900637346115563e-06, + "loss": 0.0311, + "step": 3331 + }, + { + "epoch": 1.515923566878981, + "grad_norm": 0.5616386558187996, + "learning_rate": 7.899473029273061e-06, + "loss": 0.0399, + "step": 3332 + }, + { + "epoch": 1.516378525932666, + "grad_norm": 0.5559424467039881, + "learning_rate": 7.898308475498717e-06, + "loss": 0.0368, + "step": 3333 + }, + { + "epoch": 1.5168334849863512, + "grad_norm": 0.4805464009233816, + "learning_rate": 7.897143684887692e-06, + "loss": 0.0257, + "step": 3334 + }, + { + "epoch": 1.5172884440400365, + "grad_norm": 0.4773831435489601, + "learning_rate": 7.89597865753517e-06, + "loss": 0.0256, + "step": 3335 + }, + { + "epoch": 1.5177434030937216, + "grad_norm": 0.6381236025058032, + "learning_rate": 7.894813393536349e-06, + "loss": 0.0439, + "step": 3336 + }, + { + "epoch": 1.5181983621474067, + "grad_norm": 0.911328191968218, + "learning_rate": 7.893647892986448e-06, + "loss": 0.0674, + "step": 3337 + }, + { + "epoch": 1.518653321201092, + "grad_norm": 0.4575031300837551, + "learning_rate": 7.892482155980713e-06, + "loss": 0.028, + "step": 3338 + }, + { + "epoch": 1.519108280254777, + "grad_norm": 0.43138761248175833, + "learning_rate": 7.891316182614397e-06, + "loss": 0.0211, + "step": 3339 + }, + { + "epoch": 1.5195632393084622, + "grad_norm": 0.6073801276576956, + "learning_rate": 7.890149972982779e-06, + "loss": 0.0346, + "step": 3340 + }, + { + "epoch": 1.5200181983621475, + "grad_norm": 0.495487486191647, + "learning_rate": 7.888983527181157e-06, + "loss": 0.0252, + "step": 3341 + }, + { + "epoch": 1.5204731574158326, + "grad_norm": 0.40225955239465766, + "learning_rate": 7.887816845304847e-06, + "loss": 0.0197, + "step": 3342 + }, + { + "epoch": 1.5209281164695176, + "grad_norm": 0.5565854453398432, + "learning_rate": 7.886649927449189e-06, + "loss": 0.0291, + "step": 3343 + }, + { + "epoch": 1.521383075523203, + "grad_norm": 0.6454632437150152, + "learning_rate": 7.885482773709532e-06, + "loss": 0.0368, + "step": 3344 + }, + { + "epoch": 1.521838034576888, + "grad_norm": 0.4211140971021706, + "learning_rate": 7.884315384181254e-06, + "loss": 0.0213, + "step": 3345 + }, + { + "epoch": 1.5222929936305731, + "grad_norm": 0.46955263253373974, + "learning_rate": 7.883147758959748e-06, + "loss": 0.0298, + "step": 3346 + }, + { + "epoch": 1.5227479526842584, + "grad_norm": 0.562841156860276, + "learning_rate": 7.881979898140428e-06, + "loss": 0.0315, + "step": 3347 + }, + { + "epoch": 1.5232029117379435, + "grad_norm": 0.653622108778222, + "learning_rate": 7.880811801818724e-06, + "loss": 0.029, + "step": 3348 + }, + { + "epoch": 1.5236578707916286, + "grad_norm": 0.4989556791862373, + "learning_rate": 7.879643470090092e-06, + "loss": 0.0287, + "step": 3349 + }, + { + "epoch": 1.524112829845314, + "grad_norm": 0.6493806438408574, + "learning_rate": 7.878474903049997e-06, + "loss": 0.0379, + "step": 3350 + }, + { + "epoch": 1.5245677888989992, + "grad_norm": 0.9494130739652255, + "learning_rate": 7.877306100793934e-06, + "loss": 0.0732, + "step": 3351 + }, + { + "epoch": 1.525022747952684, + "grad_norm": 0.5297290866341118, + "learning_rate": 7.876137063417411e-06, + "loss": 0.0308, + "step": 3352 + }, + { + "epoch": 1.5254777070063694, + "grad_norm": 0.6319762265400439, + "learning_rate": 7.874967791015954e-06, + "loss": 0.0242, + "step": 3353 + }, + { + "epoch": 1.5259326660600547, + "grad_norm": 0.5767239975955621, + "learning_rate": 7.873798283685112e-06, + "loss": 0.0406, + "step": 3354 + }, + { + "epoch": 1.5263876251137396, + "grad_norm": 0.5537544990681309, + "learning_rate": 7.872628541520453e-06, + "loss": 0.0274, + "step": 3355 + }, + { + "epoch": 1.526842584167425, + "grad_norm": 0.45762117910575895, + "learning_rate": 7.871458564617562e-06, + "loss": 0.0294, + "step": 3356 + }, + { + "epoch": 1.5272975432211102, + "grad_norm": 0.6149649052094309, + "learning_rate": 7.870288353072046e-06, + "loss": 0.0318, + "step": 3357 + }, + { + "epoch": 1.5277525022747953, + "grad_norm": 0.5433678607043896, + "learning_rate": 7.869117906979526e-06, + "loss": 0.0351, + "step": 3358 + }, + { + "epoch": 1.5282074613284804, + "grad_norm": 0.893607675733148, + "learning_rate": 7.867947226435649e-06, + "loss": 0.0361, + "step": 3359 + }, + { + "epoch": 1.5286624203821657, + "grad_norm": 0.72028067546292, + "learning_rate": 7.866776311536075e-06, + "loss": 0.0266, + "step": 3360 + }, + { + "epoch": 1.5291173794358508, + "grad_norm": 0.4457333347562546, + "learning_rate": 7.865605162376485e-06, + "loss": 0.025, + "step": 3361 + }, + { + "epoch": 1.5295723384895359, + "grad_norm": 0.5843708022673314, + "learning_rate": 7.864433779052586e-06, + "loss": 0.0402, + "step": 3362 + }, + { + "epoch": 1.5300272975432212, + "grad_norm": 0.47900762284025383, + "learning_rate": 7.863262161660093e-06, + "loss": 0.0264, + "step": 3363 + }, + { + "epoch": 1.5304822565969063, + "grad_norm": 0.6320035641166531, + "learning_rate": 7.862090310294747e-06, + "loss": 0.0418, + "step": 3364 + }, + { + "epoch": 1.5309372156505914, + "grad_norm": 0.4362921274047482, + "learning_rate": 7.860918225052306e-06, + "loss": 0.0206, + "step": 3365 + }, + { + "epoch": 1.5313921747042767, + "grad_norm": 0.47511830394385196, + "learning_rate": 7.859745906028545e-06, + "loss": 0.0302, + "step": 3366 + }, + { + "epoch": 1.5318471337579618, + "grad_norm": 0.44734571803407175, + "learning_rate": 7.858573353319264e-06, + "loss": 0.0269, + "step": 3367 + }, + { + "epoch": 1.5323020928116469, + "grad_norm": 0.4240090335241271, + "learning_rate": 7.85740056702028e-06, + "loss": 0.0232, + "step": 3368 + }, + { + "epoch": 1.5327570518653322, + "grad_norm": 0.47124639738210494, + "learning_rate": 7.856227547227421e-06, + "loss": 0.0347, + "step": 3369 + }, + { + "epoch": 1.5332120109190173, + "grad_norm": 0.5925515695295828, + "learning_rate": 7.85505429403655e-06, + "loss": 0.0352, + "step": 3370 + }, + { + "epoch": 1.5336669699727024, + "grad_norm": 0.42740971855552534, + "learning_rate": 7.853880807543534e-06, + "loss": 0.0373, + "step": 3371 + }, + { + "epoch": 1.5341219290263877, + "grad_norm": 0.6808939567186573, + "learning_rate": 7.852707087844267e-06, + "loss": 0.0459, + "step": 3372 + }, + { + "epoch": 1.5345768880800728, + "grad_norm": 0.5327072494298575, + "learning_rate": 7.851533135034658e-06, + "loss": 0.034, + "step": 3373 + }, + { + "epoch": 1.5350318471337578, + "grad_norm": 0.49051246039059626, + "learning_rate": 7.850358949210639e-06, + "loss": 0.0307, + "step": 3374 + }, + { + "epoch": 1.5354868061874432, + "grad_norm": 0.4879153893556511, + "learning_rate": 7.84918453046816e-06, + "loss": 0.0253, + "step": 3375 + }, + { + "epoch": 1.5359417652411285, + "grad_norm": 0.4606644347025298, + "learning_rate": 7.848009878903187e-06, + "loss": 0.0261, + "step": 3376 + }, + { + "epoch": 1.5363967242948133, + "grad_norm": 0.7246412722247344, + "learning_rate": 7.846834994611707e-06, + "loss": 0.0508, + "step": 3377 + }, + { + "epoch": 1.5368516833484986, + "grad_norm": 0.5437097771161801, + "learning_rate": 7.845659877689729e-06, + "loss": 0.0256, + "step": 3378 + }, + { + "epoch": 1.537306642402184, + "grad_norm": 0.4513777273034115, + "learning_rate": 7.844484528233279e-06, + "loss": 0.0268, + "step": 3379 + }, + { + "epoch": 1.5377616014558688, + "grad_norm": 0.5767478416216448, + "learning_rate": 7.843308946338396e-06, + "loss": 0.0459, + "step": 3380 + }, + { + "epoch": 1.5382165605095541, + "grad_norm": 0.5278410996002848, + "learning_rate": 7.842133132101145e-06, + "loss": 0.0291, + "step": 3381 + }, + { + "epoch": 1.5386715195632394, + "grad_norm": 0.5863010112909317, + "learning_rate": 7.840957085617612e-06, + "loss": 0.0351, + "step": 3382 + }, + { + "epoch": 1.5391264786169245, + "grad_norm": 0.5692494495242041, + "learning_rate": 7.839780806983894e-06, + "loss": 0.0362, + "step": 3383 + }, + { + "epoch": 1.5395814376706096, + "grad_norm": 0.5765842865723461, + "learning_rate": 7.838604296296114e-06, + "loss": 0.0281, + "step": 3384 + }, + { + "epoch": 1.540036396724295, + "grad_norm": 0.5611416523170261, + "learning_rate": 7.837427553650409e-06, + "loss": 0.0301, + "step": 3385 + }, + { + "epoch": 1.54049135577798, + "grad_norm": 0.5227145172283483, + "learning_rate": 7.836250579142938e-06, + "loss": 0.0256, + "step": 3386 + }, + { + "epoch": 1.540946314831665, + "grad_norm": 0.5053608536514115, + "learning_rate": 7.835073372869878e-06, + "loss": 0.032, + "step": 3387 + }, + { + "epoch": 1.5414012738853504, + "grad_norm": 0.5523481686663522, + "learning_rate": 7.833895934927426e-06, + "loss": 0.0351, + "step": 3388 + }, + { + "epoch": 1.5418562329390355, + "grad_norm": 0.46085951723099483, + "learning_rate": 7.832718265411795e-06, + "loss": 0.0238, + "step": 3389 + }, + { + "epoch": 1.5423111919927206, + "grad_norm": 0.6406733101128618, + "learning_rate": 7.83154036441922e-06, + "loss": 0.0597, + "step": 3390 + }, + { + "epoch": 1.542766151046406, + "grad_norm": 0.6182774088480784, + "learning_rate": 7.830362232045953e-06, + "loss": 0.0457, + "step": 3391 + }, + { + "epoch": 1.543221110100091, + "grad_norm": 0.419525367767363, + "learning_rate": 7.829183868388269e-06, + "loss": 0.0284, + "step": 3392 + }, + { + "epoch": 1.543676069153776, + "grad_norm": 0.687986567801542, + "learning_rate": 7.828005273542452e-06, + "loss": 0.0352, + "step": 3393 + }, + { + "epoch": 1.5441310282074614, + "grad_norm": 0.6030172854686707, + "learning_rate": 7.826826447604815e-06, + "loss": 0.0284, + "step": 3394 + }, + { + "epoch": 1.5445859872611465, + "grad_norm": 0.4267704901425831, + "learning_rate": 7.82564739067169e-06, + "loss": 0.0283, + "step": 3395 + }, + { + "epoch": 1.5450409463148316, + "grad_norm": 0.7090173708752295, + "learning_rate": 7.82446810283942e-06, + "loss": 0.0492, + "step": 3396 + }, + { + "epoch": 1.5454959053685169, + "grad_norm": 0.5782679542081406, + "learning_rate": 7.82328858420437e-06, + "loss": 0.0315, + "step": 3397 + }, + { + "epoch": 1.545950864422202, + "grad_norm": 0.41616530573356497, + "learning_rate": 7.82210883486293e-06, + "loss": 0.0235, + "step": 3398 + }, + { + "epoch": 1.546405823475887, + "grad_norm": 0.5309283516266711, + "learning_rate": 7.820928854911497e-06, + "loss": 0.0379, + "step": 3399 + }, + { + "epoch": 1.5468607825295724, + "grad_norm": 0.699558728154947, + "learning_rate": 7.819748644446499e-06, + "loss": 0.0482, + "step": 3400 + }, + { + "epoch": 1.5473157415832575, + "grad_norm": 0.471820338463683, + "learning_rate": 7.818568203564375e-06, + "loss": 0.023, + "step": 3401 + }, + { + "epoch": 1.5477707006369426, + "grad_norm": 0.42929414000407534, + "learning_rate": 7.817387532361585e-06, + "loss": 0.0244, + "step": 3402 + }, + { + "epoch": 1.5482256596906279, + "grad_norm": 0.41613692792574736, + "learning_rate": 7.816206630934611e-06, + "loss": 0.0217, + "step": 3403 + }, + { + "epoch": 1.5486806187443132, + "grad_norm": 0.5150802413289814, + "learning_rate": 7.815025499379947e-06, + "loss": 0.0277, + "step": 3404 + }, + { + "epoch": 1.549135577797998, + "grad_norm": 0.4967119725409103, + "learning_rate": 7.813844137794114e-06, + "loss": 0.0406, + "step": 3405 + }, + { + "epoch": 1.5495905368516834, + "grad_norm": 0.3871111988995279, + "learning_rate": 7.812662546273643e-06, + "loss": 0.0179, + "step": 3406 + }, + { + "epoch": 1.5500454959053687, + "grad_norm": 0.549646943524987, + "learning_rate": 7.811480724915093e-06, + "loss": 0.0327, + "step": 3407 + }, + { + "epoch": 1.5505004549590535, + "grad_norm": 0.3543829777210757, + "learning_rate": 7.810298673815031e-06, + "loss": 0.017, + "step": 3408 + }, + { + "epoch": 1.5509554140127388, + "grad_norm": 0.6113532245733246, + "learning_rate": 7.809116393070057e-06, + "loss": 0.0397, + "step": 3409 + }, + { + "epoch": 1.5514103730664242, + "grad_norm": 0.8106140035945865, + "learning_rate": 7.807933882776774e-06, + "loss": 0.0474, + "step": 3410 + }, + { + "epoch": 1.5518653321201092, + "grad_norm": 0.6896922151206373, + "learning_rate": 7.806751143031817e-06, + "loss": 0.0404, + "step": 3411 + }, + { + "epoch": 1.5523202911737943, + "grad_norm": 0.5736907390195001, + "learning_rate": 7.80556817393183e-06, + "loss": 0.0244, + "step": 3412 + }, + { + "epoch": 1.5527752502274796, + "grad_norm": 0.5185448241286031, + "learning_rate": 7.804384975573482e-06, + "loss": 0.025, + "step": 3413 + }, + { + "epoch": 1.5532302092811647, + "grad_norm": 0.6057910883771316, + "learning_rate": 7.803201548053459e-06, + "loss": 0.0301, + "step": 3414 + }, + { + "epoch": 1.5536851683348498, + "grad_norm": 0.5693286233096472, + "learning_rate": 7.802017891468464e-06, + "loss": 0.0277, + "step": 3415 + }, + { + "epoch": 1.5541401273885351, + "grad_norm": 0.679306287123086, + "learning_rate": 7.80083400591522e-06, + "loss": 0.0427, + "step": 3416 + }, + { + "epoch": 1.5545950864422202, + "grad_norm": 0.6700442491171918, + "learning_rate": 7.799649891490472e-06, + "loss": 0.041, + "step": 3417 + }, + { + "epoch": 1.5550500454959053, + "grad_norm": 0.45486416732531165, + "learning_rate": 7.798465548290975e-06, + "loss": 0.0311, + "step": 3418 + }, + { + "epoch": 1.5555050045495906, + "grad_norm": 0.4474510317671961, + "learning_rate": 7.797280976413512e-06, + "loss": 0.0276, + "step": 3419 + }, + { + "epoch": 1.5559599636032757, + "grad_norm": 0.5430321466215652, + "learning_rate": 7.796096175954881e-06, + "loss": 0.0369, + "step": 3420 + }, + { + "epoch": 1.5564149226569608, + "grad_norm": 0.5723323718862299, + "learning_rate": 7.7949111470119e-06, + "loss": 0.0308, + "step": 3421 + }, + { + "epoch": 1.556869881710646, + "grad_norm": 0.5885577249765643, + "learning_rate": 7.793725889681396e-06, + "loss": 0.027, + "step": 3422 + }, + { + "epoch": 1.5573248407643312, + "grad_norm": 0.639570428654892, + "learning_rate": 7.792540404060232e-06, + "loss": 0.0602, + "step": 3423 + }, + { + "epoch": 1.5577797998180163, + "grad_norm": 0.4521381633452731, + "learning_rate": 7.791354690245276e-06, + "loss": 0.023, + "step": 3424 + }, + { + "epoch": 1.5582347588717016, + "grad_norm": 0.5227633621304679, + "learning_rate": 7.790168748333422e-06, + "loss": 0.0306, + "step": 3425 + }, + { + "epoch": 1.5586897179253867, + "grad_norm": 0.49798978950391587, + "learning_rate": 7.788982578421576e-06, + "loss": 0.0276, + "step": 3426 + }, + { + "epoch": 1.5591446769790718, + "grad_norm": 0.5793757794831385, + "learning_rate": 7.78779618060667e-06, + "loss": 0.0372, + "step": 3427 + }, + { + "epoch": 1.559599636032757, + "grad_norm": 0.534549847977574, + "learning_rate": 7.78660955498565e-06, + "loss": 0.0376, + "step": 3428 + }, + { + "epoch": 1.5600545950864422, + "grad_norm": 0.7215365048435313, + "learning_rate": 7.78542270165548e-06, + "loss": 0.0425, + "step": 3429 + }, + { + "epoch": 1.5605095541401273, + "grad_norm": 0.4278147394667788, + "learning_rate": 7.784235620713148e-06, + "loss": 0.0263, + "step": 3430 + }, + { + "epoch": 1.5609645131938126, + "grad_norm": 1.1898282213354985, + "learning_rate": 7.783048312255653e-06, + "loss": 0.0444, + "step": 3431 + }, + { + "epoch": 1.5614194722474979, + "grad_norm": 0.45023984506018094, + "learning_rate": 7.781860776380018e-06, + "loss": 0.0224, + "step": 3432 + }, + { + "epoch": 1.5618744313011828, + "grad_norm": 0.6704837664332035, + "learning_rate": 7.780673013183285e-06, + "loss": 0.0469, + "step": 3433 + }, + { + "epoch": 1.562329390354868, + "grad_norm": 0.6013246016752672, + "learning_rate": 7.779485022762507e-06, + "loss": 0.0347, + "step": 3434 + }, + { + "epoch": 1.5627843494085534, + "grad_norm": 0.8945038921473537, + "learning_rate": 7.778296805214768e-06, + "loss": 0.0474, + "step": 3435 + }, + { + "epoch": 1.5632393084622382, + "grad_norm": 0.5510688494568616, + "learning_rate": 7.77710836063716e-06, + "loss": 0.0383, + "step": 3436 + }, + { + "epoch": 1.5636942675159236, + "grad_norm": 0.6583580556515867, + "learning_rate": 7.775919689126798e-06, + "loss": 0.0336, + "step": 3437 + }, + { + "epoch": 1.5641492265696089, + "grad_norm": 0.7415363558720585, + "learning_rate": 7.774730790780814e-06, + "loss": 0.0331, + "step": 3438 + }, + { + "epoch": 1.564604185623294, + "grad_norm": 0.7507039745490994, + "learning_rate": 7.773541665696363e-06, + "loss": 0.0491, + "step": 3439 + }, + { + "epoch": 1.565059144676979, + "grad_norm": 0.5977637793388121, + "learning_rate": 7.77235231397061e-06, + "loss": 0.0356, + "step": 3440 + }, + { + "epoch": 1.5655141037306644, + "grad_norm": 0.4500431567893906, + "learning_rate": 7.771162735700746e-06, + "loss": 0.0282, + "step": 3441 + }, + { + "epoch": 1.5659690627843494, + "grad_norm": 0.4701512669605663, + "learning_rate": 7.769972930983977e-06, + "loss": 0.0274, + "step": 3442 + }, + { + "epoch": 1.5664240218380345, + "grad_norm": 0.595132942648599, + "learning_rate": 7.76878289991753e-06, + "loss": 0.0377, + "step": 3443 + }, + { + "epoch": 1.5668789808917198, + "grad_norm": 0.7296510448397537, + "learning_rate": 7.76759264259865e-06, + "loss": 0.0437, + "step": 3444 + }, + { + "epoch": 1.567333939945405, + "grad_norm": 0.5102548675863424, + "learning_rate": 7.766402159124595e-06, + "loss": 0.0327, + "step": 3445 + }, + { + "epoch": 1.56778889899909, + "grad_norm": 0.538408356737662, + "learning_rate": 7.765211449592649e-06, + "loss": 0.0337, + "step": 3446 + }, + { + "epoch": 1.5682438580527753, + "grad_norm": 0.47460946861088327, + "learning_rate": 7.764020514100112e-06, + "loss": 0.0271, + "step": 3447 + }, + { + "epoch": 1.5686988171064604, + "grad_norm": 0.561507703759553, + "learning_rate": 7.7628293527443e-06, + "loss": 0.0318, + "step": 3448 + }, + { + "epoch": 1.5691537761601455, + "grad_norm": 0.4792840867094543, + "learning_rate": 7.76163796562255e-06, + "loss": 0.0331, + "step": 3449 + }, + { + "epoch": 1.5696087352138308, + "grad_norm": 0.8607482079019099, + "learning_rate": 7.760446352832217e-06, + "loss": 0.0388, + "step": 3450 + }, + { + "epoch": 1.570063694267516, + "grad_norm": 0.5223043059792263, + "learning_rate": 7.759254514470675e-06, + "loss": 0.0339, + "step": 3451 + }, + { + "epoch": 1.570518653321201, + "grad_norm": 5.710898517541713, + "learning_rate": 7.758062450635313e-06, + "loss": 0.0555, + "step": 3452 + }, + { + "epoch": 1.5709736123748863, + "grad_norm": 0.5105552971743239, + "learning_rate": 7.756870161423544e-06, + "loss": 0.0262, + "step": 3453 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.4968688652691249, + "learning_rate": 7.755677646932796e-06, + "loss": 0.0224, + "step": 3454 + }, + { + "epoch": 1.5718835304822565, + "grad_norm": 0.4976523567379475, + "learning_rate": 7.754484907260513e-06, + "loss": 0.0273, + "step": 3455 + }, + { + "epoch": 1.5723384895359418, + "grad_norm": 0.5434947844040935, + "learning_rate": 7.753291942504165e-06, + "loss": 0.0305, + "step": 3456 + }, + { + "epoch": 1.5727934485896269, + "grad_norm": 0.47435089320259716, + "learning_rate": 7.75209875276123e-06, + "loss": 0.0295, + "step": 3457 + }, + { + "epoch": 1.573248407643312, + "grad_norm": 0.7090034377929653, + "learning_rate": 7.750905338129218e-06, + "loss": 0.0562, + "step": 3458 + }, + { + "epoch": 1.5737033666969973, + "grad_norm": 0.35682356138133664, + "learning_rate": 7.749711698705642e-06, + "loss": 0.0215, + "step": 3459 + }, + { + "epoch": 1.5741583257506826, + "grad_norm": 0.690125413698341, + "learning_rate": 7.748517834588041e-06, + "loss": 0.0507, + "step": 3460 + }, + { + "epoch": 1.5746132848043675, + "grad_norm": 0.8065897776167275, + "learning_rate": 7.747323745873978e-06, + "loss": 0.0619, + "step": 3461 + }, + { + "epoch": 1.5750682438580528, + "grad_norm": 0.5611610306466592, + "learning_rate": 7.746129432661026e-06, + "loss": 0.0437, + "step": 3462 + }, + { + "epoch": 1.575523202911738, + "grad_norm": 0.6156596061850024, + "learning_rate": 7.744934895046777e-06, + "loss": 0.0389, + "step": 3463 + }, + { + "epoch": 1.575978161965423, + "grad_norm": 0.6770117433407621, + "learning_rate": 7.743740133128844e-06, + "loss": 0.0425, + "step": 3464 + }, + { + "epoch": 1.5764331210191083, + "grad_norm": 0.5646976725890579, + "learning_rate": 7.742545147004859e-06, + "loss": 0.0354, + "step": 3465 + }, + { + "epoch": 1.5768880800727936, + "grad_norm": 0.5002058028310358, + "learning_rate": 7.741349936772468e-06, + "loss": 0.0304, + "step": 3466 + }, + { + "epoch": 1.5773430391264787, + "grad_norm": 0.42270356073255155, + "learning_rate": 7.74015450252934e-06, + "loss": 0.0275, + "step": 3467 + }, + { + "epoch": 1.5777979981801638, + "grad_norm": 0.5112403656545559, + "learning_rate": 7.738958844373164e-06, + "loss": 0.0284, + "step": 3468 + }, + { + "epoch": 1.578252957233849, + "grad_norm": 0.5654040077009538, + "learning_rate": 7.737762962401637e-06, + "loss": 0.0266, + "step": 3469 + }, + { + "epoch": 1.5787079162875342, + "grad_norm": 0.5043261297906985, + "learning_rate": 7.736566856712486e-06, + "loss": 0.0454, + "step": 3470 + }, + { + "epoch": 1.5791628753412192, + "grad_norm": 0.4594033411160087, + "learning_rate": 7.735370527403447e-06, + "loss": 0.0222, + "step": 3471 + }, + { + "epoch": 1.5796178343949046, + "grad_norm": 0.5209905943849851, + "learning_rate": 7.734173974572283e-06, + "loss": 0.0347, + "step": 3472 + }, + { + "epoch": 1.5800727934485896, + "grad_norm": 0.42941284083933845, + "learning_rate": 7.732977198316772e-06, + "loss": 0.0298, + "step": 3473 + }, + { + "epoch": 1.5805277525022747, + "grad_norm": 0.37167293538439883, + "learning_rate": 7.731780198734702e-06, + "loss": 0.0168, + "step": 3474 + }, + { + "epoch": 1.58098271155596, + "grad_norm": 0.6728592612271111, + "learning_rate": 7.730582975923892e-06, + "loss": 0.0444, + "step": 3475 + }, + { + "epoch": 1.5814376706096451, + "grad_norm": 0.704688894432512, + "learning_rate": 7.729385529982174e-06, + "loss": 0.0455, + "step": 3476 + }, + { + "epoch": 1.5818926296633302, + "grad_norm": 0.6254068661918205, + "learning_rate": 7.728187861007394e-06, + "loss": 0.0314, + "step": 3477 + }, + { + "epoch": 1.5823475887170155, + "grad_norm": 0.4853841882946067, + "learning_rate": 7.726989969097424e-06, + "loss": 0.0269, + "step": 3478 + }, + { + "epoch": 1.5828025477707006, + "grad_norm": 0.5633959034881754, + "learning_rate": 7.725791854350148e-06, + "loss": 0.0434, + "step": 3479 + }, + { + "epoch": 1.5832575068243857, + "grad_norm": 0.4754367671510815, + "learning_rate": 7.724593516863472e-06, + "loss": 0.0323, + "step": 3480 + }, + { + "epoch": 1.583712465878071, + "grad_norm": 0.5505428790889674, + "learning_rate": 7.723394956735316e-06, + "loss": 0.0273, + "step": 3481 + }, + { + "epoch": 1.584167424931756, + "grad_norm": 0.6739680763910496, + "learning_rate": 7.722196174063625e-06, + "loss": 0.0441, + "step": 3482 + }, + { + "epoch": 1.5846223839854412, + "grad_norm": 0.5513199772036493, + "learning_rate": 7.720997168946355e-06, + "loss": 0.0408, + "step": 3483 + }, + { + "epoch": 1.5850773430391265, + "grad_norm": 0.37523723040612444, + "learning_rate": 7.719797941481487e-06, + "loss": 0.0223, + "step": 3484 + }, + { + "epoch": 1.5855323020928116, + "grad_norm": 0.6066395634270424, + "learning_rate": 7.71859849176701e-06, + "loss": 0.0394, + "step": 3485 + }, + { + "epoch": 1.5859872611464967, + "grad_norm": 0.5392116492425955, + "learning_rate": 7.717398819900943e-06, + "loss": 0.0342, + "step": 3486 + }, + { + "epoch": 1.586442220200182, + "grad_norm": 0.4688172933226109, + "learning_rate": 7.716198925981316e-06, + "loss": 0.0272, + "step": 3487 + }, + { + "epoch": 1.5868971792538673, + "grad_norm": 0.4914165590714183, + "learning_rate": 7.714998810106178e-06, + "loss": 0.0365, + "step": 3488 + }, + { + "epoch": 1.5873521383075522, + "grad_norm": 0.4851376352933137, + "learning_rate": 7.713798472373598e-06, + "loss": 0.0318, + "step": 3489 + }, + { + "epoch": 1.5878070973612375, + "grad_norm": 0.636717106499792, + "learning_rate": 7.712597912881664e-06, + "loss": 0.0388, + "step": 3490 + }, + { + "epoch": 1.5882620564149228, + "grad_norm": 0.4759975612349714, + "learning_rate": 7.711397131728479e-06, + "loss": 0.0343, + "step": 3491 + }, + { + "epoch": 1.5887170154686077, + "grad_norm": 0.6121544784459866, + "learning_rate": 7.710196129012163e-06, + "loss": 0.0393, + "step": 3492 + }, + { + "epoch": 1.589171974522293, + "grad_norm": 0.459290097235983, + "learning_rate": 7.70899490483086e-06, + "loss": 0.0239, + "step": 3493 + }, + { + "epoch": 1.5896269335759783, + "grad_norm": 0.5006072426867193, + "learning_rate": 7.707793459282726e-06, + "loss": 0.0264, + "step": 3494 + }, + { + "epoch": 1.5900818926296634, + "grad_norm": 0.5392590451459817, + "learning_rate": 7.706591792465938e-06, + "loss": 0.0415, + "step": 3495 + }, + { + "epoch": 1.5905368516833485, + "grad_norm": 0.6427781964599646, + "learning_rate": 7.705389904478694e-06, + "loss": 0.0451, + "step": 3496 + }, + { + "epoch": 1.5909918107370338, + "grad_norm": 0.5904502422720772, + "learning_rate": 7.704187795419202e-06, + "loss": 0.0326, + "step": 3497 + }, + { + "epoch": 1.5914467697907189, + "grad_norm": 0.5254871759750529, + "learning_rate": 7.702985465385698e-06, + "loss": 0.0271, + "step": 3498 + }, + { + "epoch": 1.591901728844404, + "grad_norm": 0.8139706995395187, + "learning_rate": 7.701782914476425e-06, + "loss": 0.0471, + "step": 3499 + }, + { + "epoch": 1.5923566878980893, + "grad_norm": 0.45527893905535977, + "learning_rate": 7.700580142789656e-06, + "loss": 0.0275, + "step": 3500 + }, + { + "epoch": 1.5928116469517744, + "grad_norm": 0.6912461759234002, + "learning_rate": 7.699377150423673e-06, + "loss": 0.0429, + "step": 3501 + }, + { + "epoch": 1.5932666060054594, + "grad_norm": 0.6082047878888045, + "learning_rate": 7.698173937476779e-06, + "loss": 0.0341, + "step": 3502 + }, + { + "epoch": 1.5937215650591448, + "grad_norm": 0.6073667552436033, + "learning_rate": 7.696970504047295e-06, + "loss": 0.0275, + "step": 3503 + }, + { + "epoch": 1.5941765241128298, + "grad_norm": 0.8091516868172267, + "learning_rate": 7.695766850233562e-06, + "loss": 0.0571, + "step": 3504 + }, + { + "epoch": 1.594631483166515, + "grad_norm": 0.49571343354550057, + "learning_rate": 7.694562976133935e-06, + "loss": 0.0456, + "step": 3505 + }, + { + "epoch": 1.5950864422202002, + "grad_norm": 0.4862397941245227, + "learning_rate": 7.693358881846789e-06, + "loss": 0.0245, + "step": 3506 + }, + { + "epoch": 1.5955414012738853, + "grad_norm": 0.8324349327582877, + "learning_rate": 7.692154567470522e-06, + "loss": 0.0584, + "step": 3507 + }, + { + "epoch": 1.5959963603275704, + "grad_norm": 0.42161922635608196, + "learning_rate": 7.69095003310354e-06, + "loss": 0.0224, + "step": 3508 + }, + { + "epoch": 1.5964513193812557, + "grad_norm": 0.5720006424666347, + "learning_rate": 7.689745278844271e-06, + "loss": 0.0444, + "step": 3509 + }, + { + "epoch": 1.5969062784349408, + "grad_norm": 0.4329811248199684, + "learning_rate": 7.688540304791166e-06, + "loss": 0.028, + "step": 3510 + }, + { + "epoch": 1.597361237488626, + "grad_norm": 0.6590673444667859, + "learning_rate": 7.687335111042691e-06, + "loss": 0.0513, + "step": 3511 + }, + { + "epoch": 1.5978161965423112, + "grad_norm": 0.5940362949551716, + "learning_rate": 7.686129697697324e-06, + "loss": 0.0324, + "step": 3512 + }, + { + "epoch": 1.5982711555959963, + "grad_norm": 0.6462196543335987, + "learning_rate": 7.684924064853568e-06, + "loss": 0.0371, + "step": 3513 + }, + { + "epoch": 1.5987261146496814, + "grad_norm": 0.31438590420870827, + "learning_rate": 7.683718212609945e-06, + "loss": 0.0149, + "step": 3514 + }, + { + "epoch": 1.5991810737033667, + "grad_norm": 0.6635942498961731, + "learning_rate": 7.682512141064988e-06, + "loss": 0.0482, + "step": 3515 + }, + { + "epoch": 1.599636032757052, + "grad_norm": 0.8219386674967395, + "learning_rate": 7.681305850317252e-06, + "loss": 0.0569, + "step": 3516 + }, + { + "epoch": 1.6000909918107369, + "grad_norm": 0.5324553198705271, + "learning_rate": 7.680099340465312e-06, + "loss": 0.031, + "step": 3517 + }, + { + "epoch": 1.6005459508644222, + "grad_norm": 0.5022819826273329, + "learning_rate": 7.678892611607756e-06, + "loss": 0.0322, + "step": 3518 + }, + { + "epoch": 1.6010009099181075, + "grad_norm": 0.5200432351073436, + "learning_rate": 7.677685663843195e-06, + "loss": 0.0244, + "step": 3519 + }, + { + "epoch": 1.6014558689717924, + "grad_norm": 0.5079063474217674, + "learning_rate": 7.676478497270253e-06, + "loss": 0.0302, + "step": 3520 + }, + { + "epoch": 1.6019108280254777, + "grad_norm": 0.45086384597230217, + "learning_rate": 7.675271111987574e-06, + "loss": 0.0264, + "step": 3521 + }, + { + "epoch": 1.602365787079163, + "grad_norm": 0.5987447309633367, + "learning_rate": 7.674063508093823e-06, + "loss": 0.0345, + "step": 3522 + }, + { + "epoch": 1.602820746132848, + "grad_norm": 0.6694848585180148, + "learning_rate": 7.672855685687676e-06, + "loss": 0.0468, + "step": 3523 + }, + { + "epoch": 1.6032757051865332, + "grad_norm": 0.4884693910817342, + "learning_rate": 7.671647644867836e-06, + "loss": 0.0312, + "step": 3524 + }, + { + "epoch": 1.6037306642402185, + "grad_norm": 0.5243581522168983, + "learning_rate": 7.670439385733012e-06, + "loss": 0.0314, + "step": 3525 + }, + { + "epoch": 1.6041856232939036, + "grad_norm": 0.416015607301664, + "learning_rate": 7.669230908381944e-06, + "loss": 0.0273, + "step": 3526 + }, + { + "epoch": 1.6046405823475887, + "grad_norm": 0.5428096951326984, + "learning_rate": 7.668022212913378e-06, + "loss": 0.0368, + "step": 3527 + }, + { + "epoch": 1.605095541401274, + "grad_norm": 0.5817012639943427, + "learning_rate": 7.666813299426087e-06, + "loss": 0.0412, + "step": 3528 + }, + { + "epoch": 1.605550500454959, + "grad_norm": 0.5570959439337889, + "learning_rate": 7.665604168018856e-06, + "loss": 0.0472, + "step": 3529 + }, + { + "epoch": 1.6060054595086442, + "grad_norm": 0.6212569836098534, + "learning_rate": 7.66439481879049e-06, + "loss": 0.0364, + "step": 3530 + }, + { + "epoch": 1.6064604185623295, + "grad_norm": 0.44375581247796847, + "learning_rate": 7.663185251839813e-06, + "loss": 0.0263, + "step": 3531 + }, + { + "epoch": 1.6069153776160146, + "grad_norm": 0.34915723581136715, + "learning_rate": 7.661975467265661e-06, + "loss": 0.0195, + "step": 3532 + }, + { + "epoch": 1.6073703366696996, + "grad_norm": 0.505632210854433, + "learning_rate": 7.660765465166898e-06, + "loss": 0.0238, + "step": 3533 + }, + { + "epoch": 1.607825295723385, + "grad_norm": 0.4686118663668764, + "learning_rate": 7.659555245642396e-06, + "loss": 0.0347, + "step": 3534 + }, + { + "epoch": 1.60828025477707, + "grad_norm": 0.5197317168350211, + "learning_rate": 7.658344808791049e-06, + "loss": 0.0256, + "step": 3535 + }, + { + "epoch": 1.6087352138307551, + "grad_norm": 0.4039821082222526, + "learning_rate": 7.65713415471177e-06, + "loss": 0.0193, + "step": 3536 + }, + { + "epoch": 1.6091901728844404, + "grad_norm": 0.42336017856374314, + "learning_rate": 7.655923283503488e-06, + "loss": 0.0236, + "step": 3537 + }, + { + "epoch": 1.6096451319381255, + "grad_norm": 0.4159281988410182, + "learning_rate": 7.654712195265148e-06, + "loss": 0.031, + "step": 3538 + }, + { + "epoch": 1.6101000909918106, + "grad_norm": 0.46547916774453957, + "learning_rate": 7.653500890095718e-06, + "loss": 0.0236, + "step": 3539 + }, + { + "epoch": 1.610555050045496, + "grad_norm": 0.6814753024595361, + "learning_rate": 7.652289368094177e-06, + "loss": 0.0398, + "step": 3540 + }, + { + "epoch": 1.6110100090991812, + "grad_norm": 0.7072996547012244, + "learning_rate": 7.651077629359526e-06, + "loss": 0.0299, + "step": 3541 + }, + { + "epoch": 1.611464968152866, + "grad_norm": 0.4728878872440036, + "learning_rate": 7.649865673990784e-06, + "loss": 0.0249, + "step": 3542 + }, + { + "epoch": 1.6119199272065514, + "grad_norm": 0.5985278217148838, + "learning_rate": 7.648653502086987e-06, + "loss": 0.0504, + "step": 3543 + }, + { + "epoch": 1.6123748862602367, + "grad_norm": 0.4864758660970619, + "learning_rate": 7.647441113747183e-06, + "loss": 0.0243, + "step": 3544 + }, + { + "epoch": 1.6128298453139216, + "grad_norm": 0.5046056423212918, + "learning_rate": 7.646228509070452e-06, + "loss": 0.0283, + "step": 3545 + }, + { + "epoch": 1.613284804367607, + "grad_norm": 0.4166387607087943, + "learning_rate": 7.645015688155875e-06, + "loss": 0.0232, + "step": 3546 + }, + { + "epoch": 1.6137397634212922, + "grad_norm": 0.6346006347530964, + "learning_rate": 7.643802651102561e-06, + "loss": 0.0307, + "step": 3547 + }, + { + "epoch": 1.6141947224749773, + "grad_norm": 0.614457634488847, + "learning_rate": 7.642589398009632e-06, + "loss": 0.0337, + "step": 3548 + }, + { + "epoch": 1.6146496815286624, + "grad_norm": 0.4011091478272312, + "learning_rate": 7.641375928976234e-06, + "loss": 0.0222, + "step": 3549 + }, + { + "epoch": 1.6151046405823477, + "grad_norm": 0.44802583977127497, + "learning_rate": 7.64016224410152e-06, + "loss": 0.0307, + "step": 3550 + }, + { + "epoch": 1.6155595996360328, + "grad_norm": 0.6512304506397211, + "learning_rate": 7.638948343484673e-06, + "loss": 0.039, + "step": 3551 + }, + { + "epoch": 1.6160145586897179, + "grad_norm": 0.5094725225300599, + "learning_rate": 7.637734227224885e-06, + "loss": 0.0269, + "step": 3552 + }, + { + "epoch": 1.6164695177434032, + "grad_norm": 0.38774814089402904, + "learning_rate": 7.636519895421365e-06, + "loss": 0.0227, + "step": 3553 + }, + { + "epoch": 1.6169244767970883, + "grad_norm": 0.600146705549112, + "learning_rate": 7.63530534817335e-06, + "loss": 0.031, + "step": 3554 + }, + { + "epoch": 1.6173794358507734, + "grad_norm": 0.6396250853926813, + "learning_rate": 7.63409058558008e-06, + "loss": 0.0541, + "step": 3555 + }, + { + "epoch": 1.6178343949044587, + "grad_norm": 0.3674994725639151, + "learning_rate": 7.632875607740825e-06, + "loss": 0.0198, + "step": 3556 + }, + { + "epoch": 1.6182893539581438, + "grad_norm": 0.706365847864232, + "learning_rate": 7.631660414754862e-06, + "loss": 0.0464, + "step": 3557 + }, + { + "epoch": 1.6187443130118289, + "grad_norm": 0.5917150010241741, + "learning_rate": 7.630445006721497e-06, + "loss": 0.0413, + "step": 3558 + }, + { + "epoch": 1.6191992720655142, + "grad_norm": 0.49637791594441183, + "learning_rate": 7.629229383740042e-06, + "loss": 0.0216, + "step": 3559 + }, + { + "epoch": 1.6196542311191993, + "grad_norm": 0.4170834995352204, + "learning_rate": 7.628013545909838e-06, + "loss": 0.0221, + "step": 3560 + }, + { + "epoch": 1.6201091901728844, + "grad_norm": 0.45364271750881185, + "learning_rate": 7.626797493330235e-06, + "loss": 0.0268, + "step": 3561 + }, + { + "epoch": 1.6205641492265697, + "grad_norm": 0.611490193807076, + "learning_rate": 7.625581226100603e-06, + "loss": 0.0408, + "step": 3562 + }, + { + "epoch": 1.6210191082802548, + "grad_norm": 0.4607491745006535, + "learning_rate": 7.6243647443203295e-06, + "loss": 0.0201, + "step": 3563 + }, + { + "epoch": 1.6214740673339398, + "grad_norm": 0.5622724933371087, + "learning_rate": 7.623148048088821e-06, + "loss": 0.0345, + "step": 3564 + }, + { + "epoch": 1.6219290263876252, + "grad_norm": 0.6433742820185328, + "learning_rate": 7.6219311375055e-06, + "loss": 0.0303, + "step": 3565 + }, + { + "epoch": 1.6223839854413102, + "grad_norm": 0.5029378697434539, + "learning_rate": 7.620714012669807e-06, + "loss": 0.0345, + "step": 3566 + }, + { + "epoch": 1.6228389444949953, + "grad_norm": 0.7588833946097253, + "learning_rate": 7.619496673681201e-06, + "loss": 0.0659, + "step": 3567 + }, + { + "epoch": 1.6232939035486806, + "grad_norm": 0.5425276901336257, + "learning_rate": 7.618279120639154e-06, + "loss": 0.0303, + "step": 3568 + }, + { + "epoch": 1.623748862602366, + "grad_norm": 0.4113587117385332, + "learning_rate": 7.6170613536431625e-06, + "loss": 0.0116, + "step": 3569 + }, + { + "epoch": 1.6242038216560508, + "grad_norm": 0.5187414197115504, + "learning_rate": 7.615843372792735e-06, + "loss": 0.0291, + "step": 3570 + }, + { + "epoch": 1.6246587807097361, + "grad_norm": 0.3522854361880795, + "learning_rate": 7.614625178187402e-06, + "loss": 0.0216, + "step": 3571 + }, + { + "epoch": 1.6251137397634214, + "grad_norm": 0.6881608158668938, + "learning_rate": 7.613406769926706e-06, + "loss": 0.0375, + "step": 3572 + }, + { + "epoch": 1.6255686988171063, + "grad_norm": 0.73681812623654, + "learning_rate": 7.612188148110211e-06, + "loss": 0.0449, + "step": 3573 + }, + { + "epoch": 1.6260236578707916, + "grad_norm": 0.38921683099066845, + "learning_rate": 7.610969312837497e-06, + "loss": 0.0177, + "step": 3574 + }, + { + "epoch": 1.626478616924477, + "grad_norm": 0.5386370708102756, + "learning_rate": 7.609750264208161e-06, + "loss": 0.0374, + "step": 3575 + }, + { + "epoch": 1.626933575978162, + "grad_norm": 0.6893051198659124, + "learning_rate": 7.60853100232182e-06, + "loss": 0.0407, + "step": 3576 + }, + { + "epoch": 1.627388535031847, + "grad_norm": 0.5280242999427653, + "learning_rate": 7.6073115272781055e-06, + "loss": 0.0305, + "step": 3577 + }, + { + "epoch": 1.6278434940855324, + "grad_norm": 0.4378730785336437, + "learning_rate": 7.606091839176666e-06, + "loss": 0.0313, + "step": 3578 + }, + { + "epoch": 1.6282984531392175, + "grad_norm": 0.4508110009197861, + "learning_rate": 7.604871938117171e-06, + "loss": 0.0259, + "step": 3579 + }, + { + "epoch": 1.6287534121929026, + "grad_norm": 0.37231708257580964, + "learning_rate": 7.6036518241993055e-06, + "loss": 0.0206, + "step": 3580 + }, + { + "epoch": 1.629208371246588, + "grad_norm": 0.5023008053378739, + "learning_rate": 7.602431497522771e-06, + "loss": 0.0284, + "step": 3581 + }, + { + "epoch": 1.629663330300273, + "grad_norm": 1.0431521995815034, + "learning_rate": 7.601210958187286e-06, + "loss": 0.0628, + "step": 3582 + }, + { + "epoch": 1.630118289353958, + "grad_norm": 0.6004977507554499, + "learning_rate": 7.599990206292589e-06, + "loss": 0.038, + "step": 3583 + }, + { + "epoch": 1.6305732484076434, + "grad_norm": 0.4205149087751285, + "learning_rate": 7.598769241938435e-06, + "loss": 0.0221, + "step": 3584 + }, + { + "epoch": 1.6310282074613285, + "grad_norm": 0.6822993931535771, + "learning_rate": 7.597548065224594e-06, + "loss": 0.04, + "step": 3585 + }, + { + "epoch": 1.6314831665150136, + "grad_norm": 0.3585095136964591, + "learning_rate": 7.596326676250853e-06, + "loss": 0.0193, + "step": 3586 + }, + { + "epoch": 1.6319381255686989, + "grad_norm": 0.4523831063775864, + "learning_rate": 7.595105075117023e-06, + "loss": 0.0217, + "step": 3587 + }, + { + "epoch": 1.632393084622384, + "grad_norm": 0.5716166616117068, + "learning_rate": 7.593883261922927e-06, + "loss": 0.0322, + "step": 3588 + }, + { + "epoch": 1.632848043676069, + "grad_norm": 0.5367659095679785, + "learning_rate": 7.592661236768402e-06, + "loss": 0.0423, + "step": 3589 + }, + { + "epoch": 1.6333030027297544, + "grad_norm": 0.4242349443718962, + "learning_rate": 7.59143899975331e-06, + "loss": 0.021, + "step": 3590 + }, + { + "epoch": 1.6337579617834395, + "grad_norm": 0.58571921676948, + "learning_rate": 7.5902165509775276e-06, + "loss": 0.0237, + "step": 3591 + }, + { + "epoch": 1.6342129208371245, + "grad_norm": 0.5080071985034912, + "learning_rate": 7.588993890540943e-06, + "loss": 0.0328, + "step": 3592 + }, + { + "epoch": 1.6346678798908099, + "grad_norm": 0.6532991035653749, + "learning_rate": 7.587771018543471e-06, + "loss": 0.0409, + "step": 3593 + }, + { + "epoch": 1.635122838944495, + "grad_norm": 0.3891041407518566, + "learning_rate": 7.586547935085038e-06, + "loss": 0.0208, + "step": 3594 + }, + { + "epoch": 1.63557779799818, + "grad_norm": 0.5728865075835124, + "learning_rate": 7.585324640265588e-06, + "loss": 0.0352, + "step": 3595 + }, + { + "epoch": 1.6360327570518653, + "grad_norm": 0.34316777483010713, + "learning_rate": 7.584101134185084e-06, + "loss": 0.0256, + "step": 3596 + }, + { + "epoch": 1.6364877161055507, + "grad_norm": 0.67041665141859, + "learning_rate": 7.582877416943504e-06, + "loss": 0.0482, + "step": 3597 + }, + { + "epoch": 1.6369426751592355, + "grad_norm": 0.6944862376061023, + "learning_rate": 7.581653488640845e-06, + "loss": 0.0401, + "step": 3598 + }, + { + "epoch": 1.6373976342129208, + "grad_norm": 0.4926515756950819, + "learning_rate": 7.580429349377123e-06, + "loss": 0.036, + "step": 3599 + }, + { + "epoch": 1.6378525932666061, + "grad_norm": 0.6531006904735797, + "learning_rate": 7.579204999252368e-06, + "loss": 0.035, + "step": 3600 + }, + { + "epoch": 1.638307552320291, + "grad_norm": 0.5554109610782453, + "learning_rate": 7.577980438366628e-06, + "loss": 0.0352, + "step": 3601 + }, + { + "epoch": 1.6387625113739763, + "grad_norm": 0.42491724437145356, + "learning_rate": 7.5767556668199685e-06, + "loss": 0.0215, + "step": 3602 + }, + { + "epoch": 1.6392174704276616, + "grad_norm": 0.4860918427295927, + "learning_rate": 7.575530684712473e-06, + "loss": 0.0343, + "step": 3603 + }, + { + "epoch": 1.6396724294813467, + "grad_norm": 0.5124027672501053, + "learning_rate": 7.574305492144238e-06, + "loss": 0.0286, + "step": 3604 + }, + { + "epoch": 1.6401273885350318, + "grad_norm": 0.5514675811144069, + "learning_rate": 7.5730800892153866e-06, + "loss": 0.0305, + "step": 3605 + }, + { + "epoch": 1.6405823475887171, + "grad_norm": 0.524484642986204, + "learning_rate": 7.5718544760260496e-06, + "loss": 0.0215, + "step": 3606 + }, + { + "epoch": 1.6410373066424022, + "grad_norm": 0.4407331404446461, + "learning_rate": 7.570628652676378e-06, + "loss": 0.0263, + "step": 3607 + }, + { + "epoch": 1.6414922656960873, + "grad_norm": 0.47395155961080115, + "learning_rate": 7.569402619266544e-06, + "loss": 0.0237, + "step": 3608 + }, + { + "epoch": 1.6419472247497726, + "grad_norm": 0.5502832582406681, + "learning_rate": 7.568176375896729e-06, + "loss": 0.0249, + "step": 3609 + }, + { + "epoch": 1.6424021838034577, + "grad_norm": 0.6479111217338874, + "learning_rate": 7.566949922667141e-06, + "loss": 0.0289, + "step": 3610 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.4577092754363741, + "learning_rate": 7.565723259677994e-06, + "loss": 0.0264, + "step": 3611 + }, + { + "epoch": 1.643312101910828, + "grad_norm": 0.7670756095284599, + "learning_rate": 7.564496387029532e-06, + "loss": 0.0306, + "step": 3612 + }, + { + "epoch": 1.6437670609645132, + "grad_norm": 0.5175142660634636, + "learning_rate": 7.563269304822005e-06, + "loss": 0.024, + "step": 3613 + }, + { + "epoch": 1.6442220200181983, + "grad_norm": 0.4852960351529595, + "learning_rate": 7.562042013155686e-06, + "loss": 0.0239, + "step": 3614 + }, + { + "epoch": 1.6446769790718836, + "grad_norm": 0.6874391579402988, + "learning_rate": 7.560814512130864e-06, + "loss": 0.0509, + "step": 3615 + }, + { + "epoch": 1.6451319381255687, + "grad_norm": 0.6884415051879906, + "learning_rate": 7.559586801847845e-06, + "loss": 0.0421, + "step": 3616 + }, + { + "epoch": 1.6455868971792538, + "grad_norm": 0.5022590822197832, + "learning_rate": 7.558358882406951e-06, + "loss": 0.0358, + "step": 3617 + }, + { + "epoch": 1.646041856232939, + "grad_norm": 0.7191160994079118, + "learning_rate": 7.5571307539085226e-06, + "loss": 0.0362, + "step": 3618 + }, + { + "epoch": 1.6464968152866242, + "grad_norm": 0.5361701775024936, + "learning_rate": 7.555902416452917e-06, + "loss": 0.0347, + "step": 3619 + }, + { + "epoch": 1.6469517743403093, + "grad_norm": 0.40469894871847095, + "learning_rate": 7.55467387014051e-06, + "loss": 0.0262, + "step": 3620 + }, + { + "epoch": 1.6474067333939946, + "grad_norm": 0.4729190743690024, + "learning_rate": 7.553445115071687e-06, + "loss": 0.0298, + "step": 3621 + }, + { + "epoch": 1.6478616924476797, + "grad_norm": 0.6232910519424011, + "learning_rate": 7.5522161513468635e-06, + "loss": 0.0419, + "step": 3622 + }, + { + "epoch": 1.6483166515013647, + "grad_norm": 0.7210062870818522, + "learning_rate": 7.550986979066461e-06, + "loss": 0.0477, + "step": 3623 + }, + { + "epoch": 1.64877161055505, + "grad_norm": 0.5437977283564177, + "learning_rate": 7.549757598330925e-06, + "loss": 0.0315, + "step": 3624 + }, + { + "epoch": 1.6492265696087354, + "grad_norm": 0.6454727225977187, + "learning_rate": 7.54852800924071e-06, + "loss": 0.0468, + "step": 3625 + }, + { + "epoch": 1.6496815286624202, + "grad_norm": 0.4260615747515582, + "learning_rate": 7.547298211896295e-06, + "loss": 0.0239, + "step": 3626 + }, + { + "epoch": 1.6501364877161055, + "grad_norm": 0.48016862965087487, + "learning_rate": 7.546068206398175e-06, + "loss": 0.0342, + "step": 3627 + }, + { + "epoch": 1.6505914467697909, + "grad_norm": 0.7138927769394839, + "learning_rate": 7.544837992846856e-06, + "loss": 0.0471, + "step": 3628 + }, + { + "epoch": 1.6510464058234757, + "grad_norm": 0.7630868706471562, + "learning_rate": 7.543607571342873e-06, + "loss": 0.0312, + "step": 3629 + }, + { + "epoch": 1.651501364877161, + "grad_norm": 0.44730595153415403, + "learning_rate": 7.542376941986765e-06, + "loss": 0.0219, + "step": 3630 + }, + { + "epoch": 1.6519563239308463, + "grad_norm": 0.5364651255825534, + "learning_rate": 7.541146104879093e-06, + "loss": 0.0382, + "step": 3631 + }, + { + "epoch": 1.6524112829845314, + "grad_norm": 0.41349870049114545, + "learning_rate": 7.5399150601204375e-06, + "loss": 0.021, + "step": 3632 + }, + { + "epoch": 1.6528662420382165, + "grad_norm": 0.44535310210219453, + "learning_rate": 7.538683807811393e-06, + "loss": 0.025, + "step": 3633 + }, + { + "epoch": 1.6533212010919018, + "grad_norm": 0.38034270281211774, + "learning_rate": 7.537452348052574e-06, + "loss": 0.0232, + "step": 3634 + }, + { + "epoch": 1.653776160145587, + "grad_norm": 0.41145064601774145, + "learning_rate": 7.536220680944608e-06, + "loss": 0.0266, + "step": 3635 + }, + { + "epoch": 1.654231119199272, + "grad_norm": 0.5573614158305944, + "learning_rate": 7.534988806588139e-06, + "loss": 0.0279, + "step": 3636 + }, + { + "epoch": 1.6546860782529573, + "grad_norm": 0.6441991234238742, + "learning_rate": 7.533756725083836e-06, + "loss": 0.0415, + "step": 3637 + }, + { + "epoch": 1.6551410373066424, + "grad_norm": 0.46683199273540027, + "learning_rate": 7.532524436532373e-06, + "loss": 0.022, + "step": 3638 + }, + { + "epoch": 1.6555959963603275, + "grad_norm": 0.7897915034320098, + "learning_rate": 7.531291941034451e-06, + "loss": 0.0481, + "step": 3639 + }, + { + "epoch": 1.6560509554140128, + "grad_norm": 0.5629722196073099, + "learning_rate": 7.530059238690783e-06, + "loss": 0.0401, + "step": 3640 + }, + { + "epoch": 1.656505914467698, + "grad_norm": 0.7125776182901007, + "learning_rate": 7.528826329602099e-06, + "loss": 0.0352, + "step": 3641 + }, + { + "epoch": 1.656960873521383, + "grad_norm": 0.5622661248270417, + "learning_rate": 7.5275932138691485e-06, + "loss": 0.0332, + "step": 3642 + }, + { + "epoch": 1.6574158325750683, + "grad_norm": 0.7151312448681683, + "learning_rate": 7.5263598915926934e-06, + "loss": 0.038, + "step": 3643 + }, + { + "epoch": 1.6578707916287534, + "grad_norm": 0.5343940111933614, + "learning_rate": 7.525126362873519e-06, + "loss": 0.0388, + "step": 3644 + }, + { + "epoch": 1.6583257506824385, + "grad_norm": 0.5572916031792711, + "learning_rate": 7.5238926278124195e-06, + "loss": 0.0305, + "step": 3645 + }, + { + "epoch": 1.6587807097361238, + "grad_norm": 0.6268588453699686, + "learning_rate": 7.522658686510214e-06, + "loss": 0.0367, + "step": 3646 + }, + { + "epoch": 1.6592356687898089, + "grad_norm": 0.6407086406233048, + "learning_rate": 7.521424539067732e-06, + "loss": 0.0349, + "step": 3647 + }, + { + "epoch": 1.659690627843494, + "grad_norm": 0.7598194861893685, + "learning_rate": 7.520190185585823e-06, + "loss": 0.0473, + "step": 3648 + }, + { + "epoch": 1.6601455868971793, + "grad_norm": 0.4939114369203625, + "learning_rate": 7.518955626165354e-06, + "loss": 0.0376, + "step": 3649 + }, + { + "epoch": 1.6606005459508644, + "grad_norm": 0.5009430676312933, + "learning_rate": 7.517720860907205e-06, + "loss": 0.0323, + "step": 3650 + }, + { + "epoch": 1.6610555050045495, + "grad_norm": 0.4835427088563686, + "learning_rate": 7.51648588991228e-06, + "loss": 0.029, + "step": 3651 + }, + { + "epoch": 1.6615104640582348, + "grad_norm": 0.5264831926178674, + "learning_rate": 7.51525071328149e-06, + "loss": 0.0367, + "step": 3652 + }, + { + "epoch": 1.66196542311192, + "grad_norm": 0.4817533345459221, + "learning_rate": 7.514015331115772e-06, + "loss": 0.0303, + "step": 3653 + }, + { + "epoch": 1.662420382165605, + "grad_norm": 0.46866664090793914, + "learning_rate": 7.512779743516073e-06, + "loss": 0.0381, + "step": 3654 + }, + { + "epoch": 1.6628753412192903, + "grad_norm": 0.417154929021052, + "learning_rate": 7.511543950583362e-06, + "loss": 0.0216, + "step": 3655 + }, + { + "epoch": 1.6633303002729756, + "grad_norm": 0.524657118523136, + "learning_rate": 7.5103079524186206e-06, + "loss": 0.0297, + "step": 3656 + }, + { + "epoch": 1.6637852593266604, + "grad_norm": 0.5512423603507336, + "learning_rate": 7.509071749122849e-06, + "loss": 0.032, + "step": 3657 + }, + { + "epoch": 1.6642402183803457, + "grad_norm": 0.6354187109786534, + "learning_rate": 7.5078353407970675e-06, + "loss": 0.0368, + "step": 3658 + }, + { + "epoch": 1.664695177434031, + "grad_norm": 0.465755148211732, + "learning_rate": 7.506598727542305e-06, + "loss": 0.026, + "step": 3659 + }, + { + "epoch": 1.6651501364877161, + "grad_norm": 1.2489245060589367, + "learning_rate": 7.5053619094596144e-06, + "loss": 0.0695, + "step": 3660 + }, + { + "epoch": 1.6656050955414012, + "grad_norm": 0.4195788150765403, + "learning_rate": 7.504124886650064e-06, + "loss": 0.0297, + "step": 3661 + }, + { + "epoch": 1.6660600545950865, + "grad_norm": 0.4520027445439969, + "learning_rate": 7.5028876592147356e-06, + "loss": 0.0195, + "step": 3662 + }, + { + "epoch": 1.6665150136487716, + "grad_norm": 0.6079480787748858, + "learning_rate": 7.501650227254731e-06, + "loss": 0.0312, + "step": 3663 + }, + { + "epoch": 1.6669699727024567, + "grad_norm": 1.0717457296352204, + "learning_rate": 7.500412590871167e-06, + "loss": 0.0504, + "step": 3664 + }, + { + "epoch": 1.667424931756142, + "grad_norm": 0.43425899943762286, + "learning_rate": 7.499174750165178e-06, + "loss": 0.0195, + "step": 3665 + }, + { + "epoch": 1.6678798908098271, + "grad_norm": 0.493687208906982, + "learning_rate": 7.497936705237915e-06, + "loss": 0.0355, + "step": 3666 + }, + { + "epoch": 1.6683348498635122, + "grad_norm": 0.5379546390424125, + "learning_rate": 7.4966984561905435e-06, + "loss": 0.0359, + "step": 3667 + }, + { + "epoch": 1.6687898089171975, + "grad_norm": 0.49000291706954646, + "learning_rate": 7.49546000312425e-06, + "loss": 0.029, + "step": 3668 + }, + { + "epoch": 1.6692447679708826, + "grad_norm": 0.7850925100883417, + "learning_rate": 7.494221346140234e-06, + "loss": 0.0648, + "step": 3669 + }, + { + "epoch": 1.6696997270245677, + "grad_norm": 0.3895801016241115, + "learning_rate": 7.4929824853397135e-06, + "loss": 0.0176, + "step": 3670 + }, + { + "epoch": 1.670154686078253, + "grad_norm": 0.5834752225965807, + "learning_rate": 7.4917434208239235e-06, + "loss": 0.0317, + "step": 3671 + }, + { + "epoch": 1.670609645131938, + "grad_norm": 0.4548464902681102, + "learning_rate": 7.490504152694113e-06, + "loss": 0.0313, + "step": 3672 + }, + { + "epoch": 1.6710646041856232, + "grad_norm": 0.6896247279139939, + "learning_rate": 7.489264681051551e-06, + "loss": 0.0448, + "step": 3673 + }, + { + "epoch": 1.6715195632393085, + "grad_norm": 0.39980420203978523, + "learning_rate": 7.488025005997519e-06, + "loss": 0.0267, + "step": 3674 + }, + { + "epoch": 1.6719745222929936, + "grad_norm": 0.6586268912096953, + "learning_rate": 7.486785127633321e-06, + "loss": 0.0368, + "step": 3675 + }, + { + "epoch": 1.6724294813466787, + "grad_norm": 0.43582639609296714, + "learning_rate": 7.485545046060272e-06, + "loss": 0.0262, + "step": 3676 + }, + { + "epoch": 1.672884440400364, + "grad_norm": 0.5879108612573464, + "learning_rate": 7.484304761379706e-06, + "loss": 0.0329, + "step": 3677 + }, + { + "epoch": 1.673339399454049, + "grad_norm": 0.4684434902017037, + "learning_rate": 7.4830642736929745e-06, + "loss": 0.0326, + "step": 3678 + }, + { + "epoch": 1.6737943585077342, + "grad_norm": 0.5530781122820492, + "learning_rate": 7.481823583101444e-06, + "loss": 0.0238, + "step": 3679 + }, + { + "epoch": 1.6742493175614195, + "grad_norm": 0.6103374408082517, + "learning_rate": 7.4805826897064985e-06, + "loss": 0.0371, + "step": 3680 + }, + { + "epoch": 1.6747042766151048, + "grad_norm": 0.574122694602557, + "learning_rate": 7.479341593609535e-06, + "loss": 0.0377, + "step": 3681 + }, + { + "epoch": 1.6751592356687897, + "grad_norm": 0.6200957468248504, + "learning_rate": 7.478100294911977e-06, + "loss": 0.0433, + "step": 3682 + }, + { + "epoch": 1.675614194722475, + "grad_norm": 0.6642367843460708, + "learning_rate": 7.476858793715252e-06, + "loss": 0.0407, + "step": 3683 + }, + { + "epoch": 1.6760691537761603, + "grad_norm": 0.696515604442919, + "learning_rate": 7.475617090120811e-06, + "loss": 0.035, + "step": 3684 + }, + { + "epoch": 1.6765241128298451, + "grad_norm": 0.5790866750331873, + "learning_rate": 7.4743751842301225e-06, + "loss": 0.0299, + "step": 3685 + }, + { + "epoch": 1.6769790718835305, + "grad_norm": 0.4894119701250759, + "learning_rate": 7.473133076144667e-06, + "loss": 0.0307, + "step": 3686 + }, + { + "epoch": 1.6774340309372158, + "grad_norm": 0.4775329874958323, + "learning_rate": 7.471890765965947e-06, + "loss": 0.0277, + "step": 3687 + }, + { + "epoch": 1.6778889899909009, + "grad_norm": 0.8596143001090791, + "learning_rate": 7.470648253795475e-06, + "loss": 0.0443, + "step": 3688 + }, + { + "epoch": 1.678343949044586, + "grad_norm": 0.47716517673981357, + "learning_rate": 7.469405539734786e-06, + "loss": 0.0175, + "step": 3689 + }, + { + "epoch": 1.6787989080982713, + "grad_norm": 0.5635174716071238, + "learning_rate": 7.468162623885428e-06, + "loss": 0.0341, + "step": 3690 + }, + { + "epoch": 1.6792538671519563, + "grad_norm": 0.48265942529925676, + "learning_rate": 7.466919506348964e-06, + "loss": 0.0263, + "step": 3691 + }, + { + "epoch": 1.6797088262056414, + "grad_norm": 0.4995704136788404, + "learning_rate": 7.465676187226981e-06, + "loss": 0.0327, + "step": 3692 + }, + { + "epoch": 1.6801637852593267, + "grad_norm": 0.539650849512318, + "learning_rate": 7.464432666621074e-06, + "loss": 0.0336, + "step": 3693 + }, + { + "epoch": 1.6806187443130118, + "grad_norm": 0.5180565169948113, + "learning_rate": 7.4631889446328595e-06, + "loss": 0.0316, + "step": 3694 + }, + { + "epoch": 1.681073703366697, + "grad_norm": 0.6102235388801855, + "learning_rate": 7.461945021363968e-06, + "loss": 0.0397, + "step": 3695 + }, + { + "epoch": 1.6815286624203822, + "grad_norm": 0.4209026490907827, + "learning_rate": 7.460700896916047e-06, + "loss": 0.0211, + "step": 3696 + }, + { + "epoch": 1.6819836214740673, + "grad_norm": 0.7313054141207679, + "learning_rate": 7.459456571390762e-06, + "loss": 0.0393, + "step": 3697 + }, + { + "epoch": 1.6824385805277524, + "grad_norm": 0.3649702874908748, + "learning_rate": 7.4582120448897896e-06, + "loss": 0.0212, + "step": 3698 + }, + { + "epoch": 1.6828935395814377, + "grad_norm": 0.6424573468685715, + "learning_rate": 7.456967317514834e-06, + "loss": 0.0321, + "step": 3699 + }, + { + "epoch": 1.6833484986351228, + "grad_norm": 0.6293356488965565, + "learning_rate": 7.455722389367603e-06, + "loss": 0.0273, + "step": 3700 + }, + { + "epoch": 1.683803457688808, + "grad_norm": 0.7235481703397522, + "learning_rate": 7.454477260549828e-06, + "loss": 0.0327, + "step": 3701 + }, + { + "epoch": 1.6842584167424932, + "grad_norm": 0.6999848465561391, + "learning_rate": 7.453231931163256e-06, + "loss": 0.0624, + "step": 3702 + }, + { + "epoch": 1.6847133757961783, + "grad_norm": 12.090077564639325, + "learning_rate": 7.45198640130965e-06, + "loss": 0.4276, + "step": 3703 + }, + { + "epoch": 1.6851683348498634, + "grad_norm": 0.5455210735996605, + "learning_rate": 7.450740671090788e-06, + "loss": 0.0357, + "step": 3704 + }, + { + "epoch": 1.6856232939035487, + "grad_norm": 0.5024928241770569, + "learning_rate": 7.449494740608465e-06, + "loss": 0.0339, + "step": 3705 + }, + { + "epoch": 1.686078252957234, + "grad_norm": 0.4273476781631383, + "learning_rate": 7.448248609964495e-06, + "loss": 0.0252, + "step": 3706 + }, + { + "epoch": 1.6865332120109189, + "grad_norm": 0.5009529731705445, + "learning_rate": 7.447002279260704e-06, + "loss": 0.026, + "step": 3707 + }, + { + "epoch": 1.6869881710646042, + "grad_norm": 0.4953825250807911, + "learning_rate": 7.445755748598938e-06, + "loss": 0.0299, + "step": 3708 + }, + { + "epoch": 1.6874431301182895, + "grad_norm": 0.5099196504130769, + "learning_rate": 7.444509018081054e-06, + "loss": 0.0285, + "step": 3709 + }, + { + "epoch": 1.6878980891719744, + "grad_norm": 0.44873344215081523, + "learning_rate": 7.443262087808936e-06, + "loss": 0.0309, + "step": 3710 + }, + { + "epoch": 1.6883530482256597, + "grad_norm": 0.4804221750169978, + "learning_rate": 7.442014957884473e-06, + "loss": 0.0321, + "step": 3711 + }, + { + "epoch": 1.688808007279345, + "grad_norm": 0.5431087843128045, + "learning_rate": 7.440767628409575e-06, + "loss": 0.0398, + "step": 3712 + }, + { + "epoch": 1.68926296633303, + "grad_norm": 0.43205749502925345, + "learning_rate": 7.439520099486168e-06, + "loss": 0.032, + "step": 3713 + }, + { + "epoch": 1.6897179253867152, + "grad_norm": 0.7020261750553709, + "learning_rate": 7.438272371216198e-06, + "loss": 0.052, + "step": 3714 + }, + { + "epoch": 1.6901728844404005, + "grad_norm": 0.40232785104425167, + "learning_rate": 7.437024443701619e-06, + "loss": 0.0259, + "step": 3715 + }, + { + "epoch": 1.6906278434940856, + "grad_norm": 0.392818290322344, + "learning_rate": 7.435776317044408e-06, + "loss": 0.0276, + "step": 3716 + }, + { + "epoch": 1.6910828025477707, + "grad_norm": 0.4762436487265989, + "learning_rate": 7.434527991346556e-06, + "loss": 0.0309, + "step": 3717 + }, + { + "epoch": 1.691537761601456, + "grad_norm": 0.7231433730579884, + "learning_rate": 7.433279466710071e-06, + "loss": 0.0406, + "step": 3718 + }, + { + "epoch": 1.691992720655141, + "grad_norm": 0.5651212308899453, + "learning_rate": 7.432030743236977e-06, + "loss": 0.0403, + "step": 3719 + }, + { + "epoch": 1.6924476797088261, + "grad_norm": 0.5993536332498526, + "learning_rate": 7.430781821029313e-06, + "loss": 0.0254, + "step": 3720 + }, + { + "epoch": 1.6929026387625115, + "grad_norm": 0.6161358572154612, + "learning_rate": 7.4295327001891384e-06, + "loss": 0.0301, + "step": 3721 + }, + { + "epoch": 1.6933575978161965, + "grad_norm": 0.5199384098068588, + "learning_rate": 7.428283380818521e-06, + "loss": 0.0391, + "step": 3722 + }, + { + "epoch": 1.6938125568698816, + "grad_norm": 0.5206027617785847, + "learning_rate": 7.42703386301955e-06, + "loss": 0.031, + "step": 3723 + }, + { + "epoch": 1.694267515923567, + "grad_norm": 0.8161105210637801, + "learning_rate": 7.4257841468943355e-06, + "loss": 0.0517, + "step": 3724 + }, + { + "epoch": 1.694722474977252, + "grad_norm": 0.4986248164130022, + "learning_rate": 7.424534232544993e-06, + "loss": 0.0238, + "step": 3725 + }, + { + "epoch": 1.6951774340309371, + "grad_norm": 0.6150288798515563, + "learning_rate": 7.423284120073664e-06, + "loss": 0.0371, + "step": 3726 + }, + { + "epoch": 1.6956323930846224, + "grad_norm": 0.3854498238738396, + "learning_rate": 7.422033809582498e-06, + "loss": 0.0229, + "step": 3727 + }, + { + "epoch": 1.6960873521383075, + "grad_norm": 0.5497475163150745, + "learning_rate": 7.4207833011736685e-06, + "loss": 0.0333, + "step": 3728 + }, + { + "epoch": 1.6965423111919926, + "grad_norm": 0.546075976887808, + "learning_rate": 7.419532594949359e-06, + "loss": 0.0391, + "step": 3729 + }, + { + "epoch": 1.696997270245678, + "grad_norm": 0.5503279076609637, + "learning_rate": 7.4182816910117724e-06, + "loss": 0.032, + "step": 3730 + }, + { + "epoch": 1.697452229299363, + "grad_norm": 0.44577137948939505, + "learning_rate": 7.417030589463128e-06, + "loss": 0.0229, + "step": 3731 + }, + { + "epoch": 1.697907188353048, + "grad_norm": 0.4637424150287877, + "learning_rate": 7.415779290405658e-06, + "loss": 0.0276, + "step": 3732 + }, + { + "epoch": 1.6983621474067334, + "grad_norm": 0.5116482751824055, + "learning_rate": 7.414527793941614e-06, + "loss": 0.031, + "step": 3733 + }, + { + "epoch": 1.6988171064604187, + "grad_norm": 0.42887680446126897, + "learning_rate": 7.413276100173262e-06, + "loss": 0.0299, + "step": 3734 + }, + { + "epoch": 1.6992720655141036, + "grad_norm": 0.4103477386094312, + "learning_rate": 7.412024209202887e-06, + "loss": 0.0241, + "step": 3735 + }, + { + "epoch": 1.699727024567789, + "grad_norm": 0.34985347095670377, + "learning_rate": 7.410772121132785e-06, + "loss": 0.0212, + "step": 3736 + }, + { + "epoch": 1.7001819836214742, + "grad_norm": 0.867876090579481, + "learning_rate": 7.409519836065272e-06, + "loss": 0.0519, + "step": 3737 + }, + { + "epoch": 1.700636942675159, + "grad_norm": 0.5730098932055396, + "learning_rate": 7.4082673541026805e-06, + "loss": 0.0419, + "step": 3738 + }, + { + "epoch": 1.7010919017288444, + "grad_norm": 0.4719630921045215, + "learning_rate": 7.407014675347356e-06, + "loss": 0.0255, + "step": 3739 + }, + { + "epoch": 1.7015468607825297, + "grad_norm": 0.6245364317243467, + "learning_rate": 7.405761799901662e-06, + "loss": 0.0426, + "step": 3740 + }, + { + "epoch": 1.7020018198362148, + "grad_norm": 0.5465632899661947, + "learning_rate": 7.404508727867978e-06, + "loss": 0.0329, + "step": 3741 + }, + { + "epoch": 1.7024567788898999, + "grad_norm": 1.5070858916095438, + "learning_rate": 7.403255459348699e-06, + "loss": 0.058, + "step": 3742 + }, + { + "epoch": 1.7029117379435852, + "grad_norm": 0.5022020296420143, + "learning_rate": 7.402001994446237e-06, + "loss": 0.0227, + "step": 3743 + }, + { + "epoch": 1.7033666969972703, + "grad_norm": 0.5805315084933184, + "learning_rate": 7.400748333263019e-06, + "loss": 0.0343, + "step": 3744 + }, + { + "epoch": 1.7038216560509554, + "grad_norm": 0.7410975613111707, + "learning_rate": 7.399494475901491e-06, + "loss": 0.0482, + "step": 3745 + }, + { + "epoch": 1.7042766151046407, + "grad_norm": 0.6576165214272308, + "learning_rate": 7.398240422464109e-06, + "loss": 0.0331, + "step": 3746 + }, + { + "epoch": 1.7047315741583258, + "grad_norm": 0.48532107166258226, + "learning_rate": 7.396986173053349e-06, + "loss": 0.028, + "step": 3747 + }, + { + "epoch": 1.7051865332120109, + "grad_norm": 0.5585289141774178, + "learning_rate": 7.395731727771705e-06, + "loss": 0.0237, + "step": 3748 + }, + { + "epoch": 1.7056414922656962, + "grad_norm": 0.6547560638585251, + "learning_rate": 7.394477086721683e-06, + "loss": 0.0366, + "step": 3749 + }, + { + "epoch": 1.7060964513193813, + "grad_norm": 0.6673639283206183, + "learning_rate": 7.393222250005807e-06, + "loss": 0.0381, + "step": 3750 + }, + { + "epoch": 1.7065514103730663, + "grad_norm": 0.6628156221749939, + "learning_rate": 7.391967217726616e-06, + "loss": 0.0263, + "step": 3751 + }, + { + "epoch": 1.7070063694267517, + "grad_norm": 0.319113418020921, + "learning_rate": 7.390711989986667e-06, + "loss": 0.0104, + "step": 3752 + }, + { + "epoch": 1.7074613284804367, + "grad_norm": 0.4874963972768766, + "learning_rate": 7.389456566888529e-06, + "loss": 0.0297, + "step": 3753 + }, + { + "epoch": 1.7079162875341218, + "grad_norm": 0.6623867553871146, + "learning_rate": 7.3882009485347915e-06, + "loss": 0.0313, + "step": 3754 + }, + { + "epoch": 1.7083712465878071, + "grad_norm": 0.6948367292138308, + "learning_rate": 7.386945135028058e-06, + "loss": 0.0353, + "step": 3755 + }, + { + "epoch": 1.7088262056414922, + "grad_norm": 0.38970023134785947, + "learning_rate": 7.385689126470948e-06, + "loss": 0.0227, + "step": 3756 + }, + { + "epoch": 1.7092811646951773, + "grad_norm": 0.5652773203658376, + "learning_rate": 7.384432922966094e-06, + "loss": 0.0352, + "step": 3757 + }, + { + "epoch": 1.7097361237488626, + "grad_norm": 0.45850523783482594, + "learning_rate": 7.383176524616151e-06, + "loss": 0.0212, + "step": 3758 + }, + { + "epoch": 1.7101910828025477, + "grad_norm": 0.745749844037621, + "learning_rate": 7.381919931523786e-06, + "loss": 0.0483, + "step": 3759 + }, + { + "epoch": 1.7106460418562328, + "grad_norm": 0.5571287482700786, + "learning_rate": 7.3806631437916795e-06, + "loss": 0.0369, + "step": 3760 + }, + { + "epoch": 1.7111010009099181, + "grad_norm": 0.33863255848977414, + "learning_rate": 7.379406161522531e-06, + "loss": 0.0145, + "step": 3761 + }, + { + "epoch": 1.7115559599636034, + "grad_norm": 0.601596026810322, + "learning_rate": 7.378148984819058e-06, + "loss": 0.0402, + "step": 3762 + }, + { + "epoch": 1.7120109190172883, + "grad_norm": 0.6041830243958276, + "learning_rate": 7.376891613783987e-06, + "loss": 0.0382, + "step": 3763 + }, + { + "epoch": 1.7124658780709736, + "grad_norm": 0.5590057572877777, + "learning_rate": 7.37563404852007e-06, + "loss": 0.0322, + "step": 3764 + }, + { + "epoch": 1.712920837124659, + "grad_norm": 0.5783338305509084, + "learning_rate": 7.374376289130066e-06, + "loss": 0.0308, + "step": 3765 + }, + { + "epoch": 1.7133757961783438, + "grad_norm": 0.44184246273165895, + "learning_rate": 7.373118335716755e-06, + "loss": 0.0328, + "step": 3766 + }, + { + "epoch": 1.713830755232029, + "grad_norm": 0.608815547149246, + "learning_rate": 7.37186018838293e-06, + "loss": 0.0322, + "step": 3767 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.7043357325734398, + "learning_rate": 7.3706018472314e-06, + "loss": 0.0448, + "step": 3768 + }, + { + "epoch": 1.7147406733393995, + "grad_norm": 0.5870414993849291, + "learning_rate": 7.369343312364994e-06, + "loss": 0.0275, + "step": 3769 + }, + { + "epoch": 1.7151956323930846, + "grad_norm": 0.4144434553520093, + "learning_rate": 7.3680845838865524e-06, + "loss": 0.0226, + "step": 3770 + }, + { + "epoch": 1.71565059144677, + "grad_norm": 0.7724336484979811, + "learning_rate": 7.366825661898932e-06, + "loss": 0.0523, + "step": 3771 + }, + { + "epoch": 1.716105550500455, + "grad_norm": 0.44158179383570634, + "learning_rate": 7.3655665465050085e-06, + "loss": 0.0219, + "step": 3772 + }, + { + "epoch": 1.71656050955414, + "grad_norm": 0.46164501993590573, + "learning_rate": 7.364307237807669e-06, + "loss": 0.0277, + "step": 3773 + }, + { + "epoch": 1.7170154686078254, + "grad_norm": 0.3878859081616898, + "learning_rate": 7.363047735909818e-06, + "loss": 0.0174, + "step": 3774 + }, + { + "epoch": 1.7174704276615105, + "grad_norm": 0.3010933687776966, + "learning_rate": 7.361788040914379e-06, + "loss": 0.0149, + "step": 3775 + }, + { + "epoch": 1.7179253867151956, + "grad_norm": 0.5938272876776134, + "learning_rate": 7.3605281529242855e-06, + "loss": 0.0477, + "step": 3776 + }, + { + "epoch": 1.7183803457688809, + "grad_norm": 0.5314911050681621, + "learning_rate": 7.359268072042493e-06, + "loss": 0.0347, + "step": 3777 + }, + { + "epoch": 1.718835304822566, + "grad_norm": 0.5161094802276209, + "learning_rate": 7.358007798371966e-06, + "loss": 0.0271, + "step": 3778 + }, + { + "epoch": 1.719290263876251, + "grad_norm": 0.43329053175996995, + "learning_rate": 7.3567473320156925e-06, + "loss": 0.0223, + "step": 3779 + }, + { + "epoch": 1.7197452229299364, + "grad_norm": 0.4944751146860554, + "learning_rate": 7.3554866730766696e-06, + "loss": 0.0225, + "step": 3780 + }, + { + "epoch": 1.7202001819836215, + "grad_norm": 0.44064583732303525, + "learning_rate": 7.3542258216579136e-06, + "loss": 0.03, + "step": 3781 + }, + { + "epoch": 1.7206551410373065, + "grad_norm": 0.5157957700243019, + "learning_rate": 7.3529647778624525e-06, + "loss": 0.0212, + "step": 3782 + }, + { + "epoch": 1.7211101000909919, + "grad_norm": 0.3945957677244866, + "learning_rate": 7.351703541793338e-06, + "loss": 0.0221, + "step": 3783 + }, + { + "epoch": 1.721565059144677, + "grad_norm": 0.6180546881752076, + "learning_rate": 7.35044211355363e-06, + "loss": 0.0321, + "step": 3784 + }, + { + "epoch": 1.722020018198362, + "grad_norm": 0.35332318030215154, + "learning_rate": 7.3491804932464054e-06, + "loss": 0.0201, + "step": 3785 + }, + { + "epoch": 1.7224749772520473, + "grad_norm": 0.43770098659833917, + "learning_rate": 7.347918680974761e-06, + "loss": 0.0206, + "step": 3786 + }, + { + "epoch": 1.7229299363057324, + "grad_norm": 0.5533036364637789, + "learning_rate": 7.3466566768418045e-06, + "loss": 0.0366, + "step": 3787 + }, + { + "epoch": 1.7233848953594175, + "grad_norm": 0.3318485584811483, + "learning_rate": 7.345394480950663e-06, + "loss": 0.016, + "step": 3788 + }, + { + "epoch": 1.7238398544131028, + "grad_norm": 0.526066120808399, + "learning_rate": 7.344132093404474e-06, + "loss": 0.0306, + "step": 3789 + }, + { + "epoch": 1.7242948134667881, + "grad_norm": 0.49753343489307694, + "learning_rate": 7.342869514306399e-06, + "loss": 0.0245, + "step": 3790 + }, + { + "epoch": 1.724749772520473, + "grad_norm": 0.45513746059510923, + "learning_rate": 7.341606743759606e-06, + "loss": 0.0276, + "step": 3791 + }, + { + "epoch": 1.7252047315741583, + "grad_norm": 0.4081732067012268, + "learning_rate": 7.340343781867285e-06, + "loss": 0.0193, + "step": 3792 + }, + { + "epoch": 1.7256596906278436, + "grad_norm": 0.5326445316009483, + "learning_rate": 7.339080628732638e-06, + "loss": 0.0338, + "step": 3793 + }, + { + "epoch": 1.7261146496815285, + "grad_norm": 0.4806692608797013, + "learning_rate": 7.337817284458887e-06, + "loss": 0.0221, + "step": 3794 + }, + { + "epoch": 1.7265696087352138, + "grad_norm": 0.4064847766577533, + "learning_rate": 7.336553749149263e-06, + "loss": 0.0165, + "step": 3795 + }, + { + "epoch": 1.7270245677888991, + "grad_norm": 0.5186028514202176, + "learning_rate": 7.33529002290702e-06, + "loss": 0.031, + "step": 3796 + }, + { + "epoch": 1.7274795268425842, + "grad_norm": 0.600305480294502, + "learning_rate": 7.3340261058354215e-06, + "loss": 0.0292, + "step": 3797 + }, + { + "epoch": 1.7279344858962693, + "grad_norm": 0.45392043889591754, + "learning_rate": 7.3327619980377505e-06, + "loss": 0.0211, + "step": 3798 + }, + { + "epoch": 1.7283894449499546, + "grad_norm": 0.5213897904145063, + "learning_rate": 7.3314976996173035e-06, + "loss": 0.0194, + "step": 3799 + }, + { + "epoch": 1.7288444040036397, + "grad_norm": 0.4522011044749012, + "learning_rate": 7.330233210677393e-06, + "loss": 0.0272, + "step": 3800 + }, + { + "epoch": 1.7292993630573248, + "grad_norm": 0.6462641556015574, + "learning_rate": 7.32896853132135e-06, + "loss": 0.0285, + "step": 3801 + }, + { + "epoch": 1.72975432211101, + "grad_norm": 0.5251528408225381, + "learning_rate": 7.327703661652513e-06, + "loss": 0.0336, + "step": 3802 + }, + { + "epoch": 1.7302092811646952, + "grad_norm": 0.4981466687834417, + "learning_rate": 7.326438601774246e-06, + "loss": 0.0249, + "step": 3803 + }, + { + "epoch": 1.7306642402183803, + "grad_norm": 0.42897325009717824, + "learning_rate": 7.325173351789923e-06, + "loss": 0.0268, + "step": 3804 + }, + { + "epoch": 1.7311191992720656, + "grad_norm": 0.45147714794739996, + "learning_rate": 7.323907911802935e-06, + "loss": 0.0225, + "step": 3805 + }, + { + "epoch": 1.7315741583257507, + "grad_norm": 0.7370959239958824, + "learning_rate": 7.322642281916684e-06, + "loss": 0.0467, + "step": 3806 + }, + { + "epoch": 1.7320291173794358, + "grad_norm": 0.4524420819643831, + "learning_rate": 7.321376462234596e-06, + "loss": 0.0274, + "step": 3807 + }, + { + "epoch": 1.732484076433121, + "grad_norm": 0.5254868113533432, + "learning_rate": 7.320110452860108e-06, + "loss": 0.0329, + "step": 3808 + }, + { + "epoch": 1.7329390354868062, + "grad_norm": 0.9564816030024412, + "learning_rate": 7.318844253896671e-06, + "loss": 0.0559, + "step": 3809 + }, + { + "epoch": 1.7333939945404913, + "grad_norm": 0.5793168368997682, + "learning_rate": 7.317577865447752e-06, + "loss": 0.0358, + "step": 3810 + }, + { + "epoch": 1.7338489535941766, + "grad_norm": 0.48460233389571516, + "learning_rate": 7.316311287616837e-06, + "loss": 0.0254, + "step": 3811 + }, + { + "epoch": 1.7343039126478617, + "grad_norm": 0.5224913589925347, + "learning_rate": 7.3150445205074235e-06, + "loss": 0.0248, + "step": 3812 + }, + { + "epoch": 1.7347588717015467, + "grad_norm": 0.4532280068500263, + "learning_rate": 7.313777564223027e-06, + "loss": 0.024, + "step": 3813 + }, + { + "epoch": 1.735213830755232, + "grad_norm": 0.7398017091813359, + "learning_rate": 7.3125104188671756e-06, + "loss": 0.0511, + "step": 3814 + }, + { + "epoch": 1.7356687898089171, + "grad_norm": 0.6215664511261365, + "learning_rate": 7.311243084543418e-06, + "loss": 0.0357, + "step": 3815 + }, + { + "epoch": 1.7361237488626022, + "grad_norm": 0.5734922269319896, + "learning_rate": 7.309975561355312e-06, + "loss": 0.0333, + "step": 3816 + }, + { + "epoch": 1.7365787079162875, + "grad_norm": 0.48964645948557484, + "learning_rate": 7.308707849406434e-06, + "loss": 0.0256, + "step": 3817 + }, + { + "epoch": 1.7370336669699729, + "grad_norm": 0.419383294955176, + "learning_rate": 7.3074399488003786e-06, + "loss": 0.0169, + "step": 3818 + }, + { + "epoch": 1.7374886260236577, + "grad_norm": 0.7285131334252631, + "learning_rate": 7.306171859640749e-06, + "loss": 0.0384, + "step": 3819 + }, + { + "epoch": 1.737943585077343, + "grad_norm": 0.4592919908652582, + "learning_rate": 7.304903582031171e-06, + "loss": 0.0283, + "step": 3820 + }, + { + "epoch": 1.7383985441310283, + "grad_norm": 4.09629691031824, + "learning_rate": 7.30363511607528e-06, + "loss": 0.0879, + "step": 3821 + }, + { + "epoch": 1.7388535031847132, + "grad_norm": 0.5226168373603701, + "learning_rate": 7.302366461876731e-06, + "loss": 0.0284, + "step": 3822 + }, + { + "epoch": 1.7393084622383985, + "grad_norm": 0.5448610135045856, + "learning_rate": 7.301097619539193e-06, + "loss": 0.0291, + "step": 3823 + }, + { + "epoch": 1.7397634212920838, + "grad_norm": 0.7777497169855851, + "learning_rate": 7.2998285891663465e-06, + "loss": 0.0504, + "step": 3824 + }, + { + "epoch": 1.740218380345769, + "grad_norm": 0.4599172925615985, + "learning_rate": 7.298559370861896e-06, + "loss": 0.0239, + "step": 3825 + }, + { + "epoch": 1.740673339399454, + "grad_norm": 0.7806211292664905, + "learning_rate": 7.297289964729554e-06, + "loss": 0.0581, + "step": 3826 + }, + { + "epoch": 1.7411282984531393, + "grad_norm": 0.6073930539328547, + "learning_rate": 7.29602037087305e-06, + "loss": 0.038, + "step": 3827 + }, + { + "epoch": 1.7415832575068244, + "grad_norm": 0.4942760634250622, + "learning_rate": 7.294750589396129e-06, + "loss": 0.0278, + "step": 3828 + }, + { + "epoch": 1.7420382165605095, + "grad_norm": 0.5097614043226851, + "learning_rate": 7.293480620402553e-06, + "loss": 0.0327, + "step": 3829 + }, + { + "epoch": 1.7424931756141948, + "grad_norm": 0.677626075495248, + "learning_rate": 7.2922104639961e-06, + "loss": 0.0428, + "step": 3830 + }, + { + "epoch": 1.74294813466788, + "grad_norm": 0.42937454159522365, + "learning_rate": 7.290940120280557e-06, + "loss": 0.0245, + "step": 3831 + }, + { + "epoch": 1.743403093721565, + "grad_norm": 0.7118640826950644, + "learning_rate": 7.2896695893597344e-06, + "loss": 0.0443, + "step": 3832 + }, + { + "epoch": 1.7438580527752503, + "grad_norm": 0.7972677619215067, + "learning_rate": 7.288398871337453e-06, + "loss": 0.0465, + "step": 3833 + }, + { + "epoch": 1.7443130118289354, + "grad_norm": 0.6170583394100522, + "learning_rate": 7.28712796631755e-06, + "loss": 0.0465, + "step": 3834 + }, + { + "epoch": 1.7447679708826205, + "grad_norm": 0.4855998128585689, + "learning_rate": 7.285856874403878e-06, + "loss": 0.0291, + "step": 3835 + }, + { + "epoch": 1.7452229299363058, + "grad_norm": 0.6098175387505606, + "learning_rate": 7.284585595700306e-06, + "loss": 0.0409, + "step": 3836 + }, + { + "epoch": 1.7456778889899909, + "grad_norm": 0.5051469778613001, + "learning_rate": 7.283314130310716e-06, + "loss": 0.029, + "step": 3837 + }, + { + "epoch": 1.746132848043676, + "grad_norm": 0.7383032973710374, + "learning_rate": 7.282042478339005e-06, + "loss": 0.0346, + "step": 3838 + }, + { + "epoch": 1.7465878070973613, + "grad_norm": 0.407879313889853, + "learning_rate": 7.2807706398890895e-06, + "loss": 0.0202, + "step": 3839 + }, + { + "epoch": 1.7470427661510464, + "grad_norm": 0.5353921163068872, + "learning_rate": 7.279498615064897e-06, + "loss": 0.0244, + "step": 3840 + }, + { + "epoch": 1.7474977252047315, + "grad_norm": 0.5249889217718524, + "learning_rate": 7.278226403970371e-06, + "loss": 0.0303, + "step": 3841 + }, + { + "epoch": 1.7479526842584168, + "grad_norm": 0.5250520409728981, + "learning_rate": 7.276954006709473e-06, + "loss": 0.0195, + "step": 3842 + }, + { + "epoch": 1.7484076433121019, + "grad_norm": 0.5503025135768406, + "learning_rate": 7.275681423386176e-06, + "loss": 0.0308, + "step": 3843 + }, + { + "epoch": 1.748862602365787, + "grad_norm": 0.4540938115527505, + "learning_rate": 7.27440865410447e-06, + "loss": 0.031, + "step": 3844 + }, + { + "epoch": 1.7493175614194723, + "grad_norm": 0.4608493774662213, + "learning_rate": 7.273135698968359e-06, + "loss": 0.033, + "step": 3845 + }, + { + "epoch": 1.7497725204731576, + "grad_norm": 0.5434375722057718, + "learning_rate": 7.271862558081865e-06, + "loss": 0.0398, + "step": 3846 + }, + { + "epoch": 1.7502274795268424, + "grad_norm": 0.5014557314754617, + "learning_rate": 7.270589231549022e-06, + "loss": 0.0411, + "step": 3847 + }, + { + "epoch": 1.7506824385805277, + "grad_norm": 0.5605891749344477, + "learning_rate": 7.269315719473879e-06, + "loss": 0.0378, + "step": 3848 + }, + { + "epoch": 1.751137397634213, + "grad_norm": 0.6081085150245663, + "learning_rate": 7.268042021960508e-06, + "loss": 0.0412, + "step": 3849 + }, + { + "epoch": 1.7515923566878981, + "grad_norm": 0.667465979501546, + "learning_rate": 7.266768139112982e-06, + "loss": 0.0497, + "step": 3850 + }, + { + "epoch": 1.7520473157415832, + "grad_norm": 0.40853600862540174, + "learning_rate": 7.265494071035401e-06, + "loss": 0.0244, + "step": 3851 + }, + { + "epoch": 1.7525022747952685, + "grad_norm": 0.5287548034772686, + "learning_rate": 7.264219817831875e-06, + "loss": 0.0385, + "step": 3852 + }, + { + "epoch": 1.7529572338489536, + "grad_norm": 0.5795926874902304, + "learning_rate": 7.262945379606532e-06, + "loss": 0.0311, + "step": 3853 + }, + { + "epoch": 1.7534121929026387, + "grad_norm": 0.5696107705215291, + "learning_rate": 7.261670756463511e-06, + "loss": 0.0447, + "step": 3854 + }, + { + "epoch": 1.753867151956324, + "grad_norm": 0.53296867648118, + "learning_rate": 7.260395948506969e-06, + "loss": 0.0387, + "step": 3855 + }, + { + "epoch": 1.7543221110100091, + "grad_norm": 0.7261108823709813, + "learning_rate": 7.259120955841079e-06, + "loss": 0.0374, + "step": 3856 + }, + { + "epoch": 1.7547770700636942, + "grad_norm": 0.42299923831898156, + "learning_rate": 7.257845778570025e-06, + "loss": 0.0265, + "step": 3857 + }, + { + "epoch": 1.7552320291173795, + "grad_norm": 0.5908684889670753, + "learning_rate": 7.256570416798012e-06, + "loss": 0.0459, + "step": 3858 + }, + { + "epoch": 1.7556869881710646, + "grad_norm": 0.5168098805178122, + "learning_rate": 7.255294870629255e-06, + "loss": 0.0311, + "step": 3859 + }, + { + "epoch": 1.7561419472247497, + "grad_norm": 0.49943533767271087, + "learning_rate": 7.254019140167985e-06, + "loss": 0.0345, + "step": 3860 + }, + { + "epoch": 1.756596906278435, + "grad_norm": 0.6434720360032985, + "learning_rate": 7.252743225518451e-06, + "loss": 0.0347, + "step": 3861 + }, + { + "epoch": 1.75705186533212, + "grad_norm": 0.3347064769600789, + "learning_rate": 7.251467126784913e-06, + "loss": 0.0134, + "step": 3862 + }, + { + "epoch": 1.7575068243858052, + "grad_norm": 0.6959580811097871, + "learning_rate": 7.2501908440716495e-06, + "loss": 0.0383, + "step": 3863 + }, + { + "epoch": 1.7579617834394905, + "grad_norm": 0.7211232218619601, + "learning_rate": 7.248914377482952e-06, + "loss": 0.0521, + "step": 3864 + }, + { + "epoch": 1.7584167424931756, + "grad_norm": 0.5339237434249615, + "learning_rate": 7.247637727123127e-06, + "loss": 0.0331, + "step": 3865 + }, + { + "epoch": 1.7588717015468607, + "grad_norm": 0.5061058168841682, + "learning_rate": 7.246360893096497e-06, + "loss": 0.0185, + "step": 3866 + }, + { + "epoch": 1.759326660600546, + "grad_norm": 0.6438152399441944, + "learning_rate": 7.245083875507399e-06, + "loss": 0.037, + "step": 3867 + }, + { + "epoch": 1.759781619654231, + "grad_norm": 0.46135233899654055, + "learning_rate": 7.243806674460187e-06, + "loss": 0.0266, + "step": 3868 + }, + { + "epoch": 1.7602365787079162, + "grad_norm": 0.5348484519345554, + "learning_rate": 7.242529290059226e-06, + "loss": 0.0313, + "step": 3869 + }, + { + "epoch": 1.7606915377616015, + "grad_norm": 0.6355208997437304, + "learning_rate": 7.241251722408897e-06, + "loss": 0.0418, + "step": 3870 + }, + { + "epoch": 1.7611464968152868, + "grad_norm": 0.49532919357280714, + "learning_rate": 7.239973971613601e-06, + "loss": 0.0198, + "step": 3871 + }, + { + "epoch": 1.7616014558689717, + "grad_norm": 0.5490859209662113, + "learning_rate": 7.238696037777746e-06, + "loss": 0.0217, + "step": 3872 + }, + { + "epoch": 1.762056414922657, + "grad_norm": 0.44798778875478795, + "learning_rate": 7.237417921005762e-06, + "loss": 0.0301, + "step": 3873 + }, + { + "epoch": 1.7625113739763423, + "grad_norm": 0.5596271551200658, + "learning_rate": 7.236139621402087e-06, + "loss": 0.0454, + "step": 3874 + }, + { + "epoch": 1.7629663330300271, + "grad_norm": 0.6046555536937368, + "learning_rate": 7.234861139071184e-06, + "loss": 0.0415, + "step": 3875 + }, + { + "epoch": 1.7634212920837125, + "grad_norm": 0.5784244809042435, + "learning_rate": 7.23358247411752e-06, + "loss": 0.0365, + "step": 3876 + }, + { + "epoch": 1.7638762511373978, + "grad_norm": 0.6506098894981671, + "learning_rate": 7.232303626645582e-06, + "loss": 0.0332, + "step": 3877 + }, + { + "epoch": 1.7643312101910829, + "grad_norm": 0.517598977042059, + "learning_rate": 7.231024596759874e-06, + "loss": 0.0293, + "step": 3878 + }, + { + "epoch": 1.764786169244768, + "grad_norm": 0.5031709596106537, + "learning_rate": 7.229745384564909e-06, + "loss": 0.031, + "step": 3879 + }, + { + "epoch": 1.7652411282984533, + "grad_norm": 0.5939459609265596, + "learning_rate": 7.228465990165222e-06, + "loss": 0.0368, + "step": 3880 + }, + { + "epoch": 1.7656960873521383, + "grad_norm": 0.8032766875075776, + "learning_rate": 7.227186413665359e-06, + "loss": 0.0459, + "step": 3881 + }, + { + "epoch": 1.7661510464058234, + "grad_norm": 0.5038566660592529, + "learning_rate": 7.225906655169879e-06, + "loss": 0.0235, + "step": 3882 + }, + { + "epoch": 1.7666060054595087, + "grad_norm": 0.4295582387223101, + "learning_rate": 7.2246267147833585e-06, + "loss": 0.0193, + "step": 3883 + }, + { + "epoch": 1.7670609645131938, + "grad_norm": 0.5324112724062958, + "learning_rate": 7.223346592610389e-06, + "loss": 0.0288, + "step": 3884 + }, + { + "epoch": 1.767515923566879, + "grad_norm": 0.5085275637911066, + "learning_rate": 7.222066288755578e-06, + "loss": 0.0367, + "step": 3885 + }, + { + "epoch": 1.7679708826205642, + "grad_norm": 0.6991787121800683, + "learning_rate": 7.220785803323544e-06, + "loss": 0.0411, + "step": 3886 + }, + { + "epoch": 1.7684258416742493, + "grad_norm": 0.9547185096653158, + "learning_rate": 7.219505136418924e-06, + "loss": 0.0654, + "step": 3887 + }, + { + "epoch": 1.7688808007279344, + "grad_norm": 0.5058121552890327, + "learning_rate": 7.218224288146367e-06, + "loss": 0.0292, + "step": 3888 + }, + { + "epoch": 1.7693357597816197, + "grad_norm": 0.45480870790423666, + "learning_rate": 7.216943258610538e-06, + "loss": 0.0276, + "step": 3889 + }, + { + "epoch": 1.7697907188353048, + "grad_norm": 0.4458594164591656, + "learning_rate": 7.215662047916118e-06, + "loss": 0.0249, + "step": 3890 + }, + { + "epoch": 1.77024567788899, + "grad_norm": 0.5899132330173917, + "learning_rate": 7.214380656167801e-06, + "loss": 0.025, + "step": 3891 + }, + { + "epoch": 1.7707006369426752, + "grad_norm": 0.459963286150344, + "learning_rate": 7.213099083470296e-06, + "loss": 0.0311, + "step": 3892 + }, + { + "epoch": 1.7711555959963603, + "grad_norm": 0.406938597420023, + "learning_rate": 7.21181732992833e-06, + "loss": 0.0256, + "step": 3893 + }, + { + "epoch": 1.7716105550500454, + "grad_norm": 0.574528139737722, + "learning_rate": 7.210535395646638e-06, + "loss": 0.0366, + "step": 3894 + }, + { + "epoch": 1.7720655141037307, + "grad_norm": 0.48034518659388153, + "learning_rate": 7.2092532807299794e-06, + "loss": 0.036, + "step": 3895 + }, + { + "epoch": 1.7725204731574158, + "grad_norm": 0.5540331865344734, + "learning_rate": 7.207970985283117e-06, + "loss": 0.0232, + "step": 3896 + }, + { + "epoch": 1.7729754322111009, + "grad_norm": 0.5929018938747233, + "learning_rate": 7.206688509410838e-06, + "loss": 0.0308, + "step": 3897 + }, + { + "epoch": 1.7734303912647862, + "grad_norm": 0.5494749966707317, + "learning_rate": 7.205405853217939e-06, + "loss": 0.0219, + "step": 3898 + }, + { + "epoch": 1.7738853503184715, + "grad_norm": 0.4798377241717678, + "learning_rate": 7.204123016809232e-06, + "loss": 0.029, + "step": 3899 + }, + { + "epoch": 1.7743403093721564, + "grad_norm": 0.6834984444108951, + "learning_rate": 7.202840000289548e-06, + "loss": 0.0436, + "step": 3900 + }, + { + "epoch": 1.7747952684258417, + "grad_norm": 0.6325235486118592, + "learning_rate": 7.2015568037637255e-06, + "loss": 0.043, + "step": 3901 + }, + { + "epoch": 1.775250227479527, + "grad_norm": 0.6543473635592312, + "learning_rate": 7.200273427336623e-06, + "loss": 0.0386, + "step": 3902 + }, + { + "epoch": 1.7757051865332119, + "grad_norm": 0.5494135195180709, + "learning_rate": 7.198989871113113e-06, + "loss": 0.0357, + "step": 3903 + }, + { + "epoch": 1.7761601455868972, + "grad_norm": 0.6036518496810389, + "learning_rate": 7.197706135198082e-06, + "loss": 0.0294, + "step": 3904 + }, + { + "epoch": 1.7766151046405825, + "grad_norm": 0.5058727125301408, + "learning_rate": 7.196422219696429e-06, + "loss": 0.0276, + "step": 3905 + }, + { + "epoch": 1.7770700636942676, + "grad_norm": 0.5408005843276728, + "learning_rate": 7.195138124713073e-06, + "loss": 0.0288, + "step": 3906 + }, + { + "epoch": 1.7775250227479527, + "grad_norm": 0.5570097418642844, + "learning_rate": 7.193853850352941e-06, + "loss": 0.0318, + "step": 3907 + }, + { + "epoch": 1.777979981801638, + "grad_norm": 0.7516904407926818, + "learning_rate": 7.192569396720978e-06, + "loss": 0.0363, + "step": 3908 + }, + { + "epoch": 1.778434940855323, + "grad_norm": 0.5426361783495182, + "learning_rate": 7.1912847639221495e-06, + "loss": 0.0281, + "step": 3909 + }, + { + "epoch": 1.7788898999090081, + "grad_norm": 0.5988355876026382, + "learning_rate": 7.189999952061424e-06, + "loss": 0.0293, + "step": 3910 + }, + { + "epoch": 1.7793448589626935, + "grad_norm": 0.4940570073509149, + "learning_rate": 7.188714961243792e-06, + "loss": 0.0215, + "step": 3911 + }, + { + "epoch": 1.7797998180163785, + "grad_norm": 0.5460494752292816, + "learning_rate": 7.187429791574259e-06, + "loss": 0.0324, + "step": 3912 + }, + { + "epoch": 1.7802547770700636, + "grad_norm": 0.5877174034961691, + "learning_rate": 7.18614444315784e-06, + "loss": 0.0423, + "step": 3913 + }, + { + "epoch": 1.780709736123749, + "grad_norm": 0.5616239933360987, + "learning_rate": 7.1848589160995705e-06, + "loss": 0.0325, + "step": 3914 + }, + { + "epoch": 1.781164695177434, + "grad_norm": 0.5565765328246807, + "learning_rate": 7.1835732105044955e-06, + "loss": 0.0274, + "step": 3915 + }, + { + "epoch": 1.7816196542311191, + "grad_norm": 0.4775935924483279, + "learning_rate": 7.182287326477681e-06, + "loss": 0.0307, + "step": 3916 + }, + { + "epoch": 1.7820746132848044, + "grad_norm": 0.4826281170996024, + "learning_rate": 7.181001264124201e-06, + "loss": 0.0245, + "step": 3917 + }, + { + "epoch": 1.7825295723384895, + "grad_norm": 0.6381064122504433, + "learning_rate": 7.179715023549145e-06, + "loss": 0.0336, + "step": 3918 + }, + { + "epoch": 1.7829845313921746, + "grad_norm": 0.5015086145547383, + "learning_rate": 7.178428604857622e-06, + "loss": 0.0276, + "step": 3919 + }, + { + "epoch": 1.78343949044586, + "grad_norm": 0.4836511306822213, + "learning_rate": 7.1771420081547514e-06, + "loss": 0.0312, + "step": 3920 + }, + { + "epoch": 1.783894449499545, + "grad_norm": 0.5672466172555591, + "learning_rate": 7.175855233545669e-06, + "loss": 0.0452, + "step": 3921 + }, + { + "epoch": 1.78434940855323, + "grad_norm": 0.5529854651973736, + "learning_rate": 7.174568281135521e-06, + "loss": 0.033, + "step": 3922 + }, + { + "epoch": 1.7848043676069154, + "grad_norm": 0.4677563731963938, + "learning_rate": 7.173281151029473e-06, + "loss": 0.0284, + "step": 3923 + }, + { + "epoch": 1.7852593266606005, + "grad_norm": 0.8341654528612025, + "learning_rate": 7.171993843332705e-06, + "loss": 0.0448, + "step": 3924 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.7469351020696116, + "learning_rate": 7.170706358150408e-06, + "loss": 0.0411, + "step": 3925 + }, + { + "epoch": 1.786169244767971, + "grad_norm": 0.5434962552420755, + "learning_rate": 7.169418695587791e-06, + "loss": 0.0438, + "step": 3926 + }, + { + "epoch": 1.7866242038216562, + "grad_norm": 0.3555412569211614, + "learning_rate": 7.1681308557500755e-06, + "loss": 0.0214, + "step": 3927 + }, + { + "epoch": 1.787079162875341, + "grad_norm": 0.5544920715175138, + "learning_rate": 7.166842838742497e-06, + "loss": 0.0306, + "step": 3928 + }, + { + "epoch": 1.7875341219290264, + "grad_norm": 0.5786874917103659, + "learning_rate": 7.165554644670307e-06, + "loss": 0.042, + "step": 3929 + }, + { + "epoch": 1.7879890809827117, + "grad_norm": 0.6196487285635767, + "learning_rate": 7.164266273638771e-06, + "loss": 0.044, + "step": 3930 + }, + { + "epoch": 1.7884440400363966, + "grad_norm": 0.6477828820090044, + "learning_rate": 7.162977725753169e-06, + "loss": 0.0413, + "step": 3931 + }, + { + "epoch": 1.7888989990900819, + "grad_norm": 0.5499920986044309, + "learning_rate": 7.1616890011187945e-06, + "loss": 0.039, + "step": 3932 + }, + { + "epoch": 1.7893539581437672, + "grad_norm": 0.6737272879159145, + "learning_rate": 7.160400099840959e-06, + "loss": 0.0481, + "step": 3933 + }, + { + "epoch": 1.7898089171974523, + "grad_norm": 0.6104575447990194, + "learning_rate": 7.1591110220249826e-06, + "loss": 0.0362, + "step": 3934 + }, + { + "epoch": 1.7902638762511374, + "grad_norm": 0.455324541786891, + "learning_rate": 7.157821767776203e-06, + "loss": 0.0243, + "step": 3935 + }, + { + "epoch": 1.7907188353048227, + "grad_norm": 0.3953682511963992, + "learning_rate": 7.1565323371999725e-06, + "loss": 0.0209, + "step": 3936 + }, + { + "epoch": 1.7911737943585078, + "grad_norm": 0.48993108578342043, + "learning_rate": 7.15524273040166e-06, + "loss": 0.0243, + "step": 3937 + }, + { + "epoch": 1.7916287534121929, + "grad_norm": 0.6575529662736596, + "learning_rate": 7.153952947486645e-06, + "loss": 0.0528, + "step": 3938 + }, + { + "epoch": 1.7920837124658782, + "grad_norm": 0.615383967163662, + "learning_rate": 7.152662988560322e-06, + "loss": 0.0374, + "step": 3939 + }, + { + "epoch": 1.7925386715195633, + "grad_norm": 0.5617255711206577, + "learning_rate": 7.151372853728099e-06, + "loss": 0.0404, + "step": 3940 + }, + { + "epoch": 1.7929936305732483, + "grad_norm": 0.7324453621905366, + "learning_rate": 7.150082543095403e-06, + "loss": 0.0321, + "step": 3941 + }, + { + "epoch": 1.7934485896269337, + "grad_norm": 0.6300413924268614, + "learning_rate": 7.148792056767672e-06, + "loss": 0.0419, + "step": 3942 + }, + { + "epoch": 1.7939035486806187, + "grad_norm": 0.4149778419279514, + "learning_rate": 7.147501394850357e-06, + "loss": 0.0292, + "step": 3943 + }, + { + "epoch": 1.7943585077343038, + "grad_norm": 0.5442232866096656, + "learning_rate": 7.146210557448926e-06, + "loss": 0.0333, + "step": 3944 + }, + { + "epoch": 1.7948134667879891, + "grad_norm": 0.3725055725075537, + "learning_rate": 7.144919544668863e-06, + "loss": 0.0156, + "step": 3945 + }, + { + "epoch": 1.7952684258416742, + "grad_norm": 0.6523123894505088, + "learning_rate": 7.143628356615657e-06, + "loss": 0.0356, + "step": 3946 + }, + { + "epoch": 1.7957233848953593, + "grad_norm": 0.5848326338750129, + "learning_rate": 7.142336993394825e-06, + "loss": 0.0334, + "step": 3947 + }, + { + "epoch": 1.7961783439490446, + "grad_norm": 0.670071924436715, + "learning_rate": 7.141045455111888e-06, + "loss": 0.0484, + "step": 3948 + }, + { + "epoch": 1.7966333030027297, + "grad_norm": 0.5985665676795898, + "learning_rate": 7.139753741872385e-06, + "loss": 0.0346, + "step": 3949 + }, + { + "epoch": 1.7970882620564148, + "grad_norm": 0.6010858528292592, + "learning_rate": 7.13846185378187e-06, + "loss": 0.0336, + "step": 3950 + }, + { + "epoch": 1.7975432211101001, + "grad_norm": 0.507128365864404, + "learning_rate": 7.137169790945908e-06, + "loss": 0.0282, + "step": 3951 + }, + { + "epoch": 1.7979981801637852, + "grad_norm": 0.5443000453471494, + "learning_rate": 7.135877553470083e-06, + "loss": 0.0294, + "step": 3952 + }, + { + "epoch": 1.7984531392174703, + "grad_norm": 0.5191857848884796, + "learning_rate": 7.134585141459991e-06, + "loss": 0.033, + "step": 3953 + }, + { + "epoch": 1.7989080982711556, + "grad_norm": 0.3401039408774095, + "learning_rate": 7.133292555021239e-06, + "loss": 0.0181, + "step": 3954 + }, + { + "epoch": 1.799363057324841, + "grad_norm": 0.5449394868820434, + "learning_rate": 7.131999794259454e-06, + "loss": 0.0268, + "step": 3955 + }, + { + "epoch": 1.7998180163785258, + "grad_norm": 0.5380343118714457, + "learning_rate": 7.1307068592802745e-06, + "loss": 0.0273, + "step": 3956 + }, + { + "epoch": 1.800272975432211, + "grad_norm": 0.4790260426232836, + "learning_rate": 7.129413750189351e-06, + "loss": 0.0361, + "step": 3957 + }, + { + "epoch": 1.8007279344858964, + "grad_norm": 0.7795203644523895, + "learning_rate": 7.128120467092354e-06, + "loss": 0.0367, + "step": 3958 + }, + { + "epoch": 1.8011828935395813, + "grad_norm": 0.4963598697991848, + "learning_rate": 7.126827010094962e-06, + "loss": 0.0266, + "step": 3959 + }, + { + "epoch": 1.8016378525932666, + "grad_norm": 0.5848640070282111, + "learning_rate": 7.125533379302872e-06, + "loss": 0.0408, + "step": 3960 + }, + { + "epoch": 1.802092811646952, + "grad_norm": 0.538672038336608, + "learning_rate": 7.1242395748217915e-06, + "loss": 0.0413, + "step": 3961 + }, + { + "epoch": 1.802547770700637, + "grad_norm": 0.6170672028146802, + "learning_rate": 7.122945596757449e-06, + "loss": 0.0462, + "step": 3962 + }, + { + "epoch": 1.803002729754322, + "grad_norm": 0.685339047524874, + "learning_rate": 7.121651445215577e-06, + "loss": 0.0438, + "step": 3963 + }, + { + "epoch": 1.8034576888080074, + "grad_norm": 0.5758149767926731, + "learning_rate": 7.120357120301931e-06, + "loss": 0.0418, + "step": 3964 + }, + { + "epoch": 1.8039126478616925, + "grad_norm": 0.5758706450090999, + "learning_rate": 7.119062622122277e-06, + "loss": 0.039, + "step": 3965 + }, + { + "epoch": 1.8043676069153776, + "grad_norm": 0.5164499799409003, + "learning_rate": 7.117767950782394e-06, + "loss": 0.0345, + "step": 3966 + }, + { + "epoch": 1.8048225659690629, + "grad_norm": 0.6701744378878628, + "learning_rate": 7.1164731063880775e-06, + "loss": 0.0357, + "step": 3967 + }, + { + "epoch": 1.805277525022748, + "grad_norm": 0.5339530812209317, + "learning_rate": 7.115178089045137e-06, + "loss": 0.0307, + "step": 3968 + }, + { + "epoch": 1.805732484076433, + "grad_norm": 0.5616425335788408, + "learning_rate": 7.1138828988593964e-06, + "loss": 0.0377, + "step": 3969 + }, + { + "epoch": 1.8061874431301184, + "grad_norm": 0.6328052760754227, + "learning_rate": 7.112587535936691e-06, + "loss": 0.035, + "step": 3970 + }, + { + "epoch": 1.8066424021838035, + "grad_norm": 0.49133852954676155, + "learning_rate": 7.111292000382871e-06, + "loss": 0.0344, + "step": 3971 + }, + { + "epoch": 1.8070973612374885, + "grad_norm": 0.5991588106482821, + "learning_rate": 7.1099962923038055e-06, + "loss": 0.0239, + "step": 3972 + }, + { + "epoch": 1.8075523202911739, + "grad_norm": 0.5223998404891242, + "learning_rate": 7.10870041180537e-06, + "loss": 0.0372, + "step": 3973 + }, + { + "epoch": 1.808007279344859, + "grad_norm": 0.4946982797043428, + "learning_rate": 7.10740435899346e-06, + "loss": 0.0314, + "step": 3974 + }, + { + "epoch": 1.808462238398544, + "grad_norm": 0.6989947393162688, + "learning_rate": 7.106108133973983e-06, + "loss": 0.0497, + "step": 3975 + }, + { + "epoch": 1.8089171974522293, + "grad_norm": 0.5312108774836449, + "learning_rate": 7.104811736852861e-06, + "loss": 0.0325, + "step": 3976 + }, + { + "epoch": 1.8093721565059144, + "grad_norm": 0.5491635863938112, + "learning_rate": 7.10351516773603e-06, + "loss": 0.0219, + "step": 3977 + }, + { + "epoch": 1.8098271155595995, + "grad_norm": 0.6472056450996261, + "learning_rate": 7.102218426729434e-06, + "loss": 0.0388, + "step": 3978 + }, + { + "epoch": 1.8102820746132848, + "grad_norm": 0.52454943466676, + "learning_rate": 7.1009215139390475e-06, + "loss": 0.0344, + "step": 3979 + }, + { + "epoch": 1.81073703366697, + "grad_norm": 0.4375468836504088, + "learning_rate": 7.0996244294708395e-06, + "loss": 0.0223, + "step": 3980 + }, + { + "epoch": 1.811191992720655, + "grad_norm": 0.4454882660335665, + "learning_rate": 7.098327173430806e-06, + "loss": 0.0319, + "step": 3981 + }, + { + "epoch": 1.8116469517743403, + "grad_norm": 0.7417080914051789, + "learning_rate": 7.097029745924951e-06, + "loss": 0.0418, + "step": 3982 + }, + { + "epoch": 1.8121019108280256, + "grad_norm": 0.40394518683492026, + "learning_rate": 7.095732147059295e-06, + "loss": 0.018, + "step": 3983 + }, + { + "epoch": 1.8125568698817105, + "grad_norm": 0.434190114216561, + "learning_rate": 7.094434376939874e-06, + "loss": 0.0271, + "step": 3984 + }, + { + "epoch": 1.8130118289353958, + "grad_norm": 0.5649421222343399, + "learning_rate": 7.093136435672731e-06, + "loss": 0.0352, + "step": 3985 + }, + { + "epoch": 1.8134667879890811, + "grad_norm": 0.579850191763891, + "learning_rate": 7.091838323363935e-06, + "loss": 0.0383, + "step": 3986 + }, + { + "epoch": 1.813921747042766, + "grad_norm": 0.7918429567659951, + "learning_rate": 7.090540040119556e-06, + "loss": 0.0542, + "step": 3987 + }, + { + "epoch": 1.8143767060964513, + "grad_norm": 0.6425928510109524, + "learning_rate": 7.089241586045684e-06, + "loss": 0.0357, + "step": 3988 + }, + { + "epoch": 1.8148316651501366, + "grad_norm": 0.550612542420563, + "learning_rate": 7.087942961248428e-06, + "loss": 0.0261, + "step": 3989 + }, + { + "epoch": 1.8152866242038217, + "grad_norm": 0.6018700809779689, + "learning_rate": 7.086644165833899e-06, + "loss": 0.044, + "step": 3990 + }, + { + "epoch": 1.8157415832575068, + "grad_norm": 0.504372754878958, + "learning_rate": 7.085345199908234e-06, + "loss": 0.0243, + "step": 3991 + }, + { + "epoch": 1.816196542311192, + "grad_norm": 0.4546000934803039, + "learning_rate": 7.084046063577577e-06, + "loss": 0.023, + "step": 3992 + }, + { + "epoch": 1.8166515013648772, + "grad_norm": 0.4740803970183519, + "learning_rate": 7.0827467569480846e-06, + "loss": 0.0292, + "step": 3993 + }, + { + "epoch": 1.8171064604185623, + "grad_norm": 0.6679799664830866, + "learning_rate": 7.081447280125935e-06, + "loss": 0.0403, + "step": 3994 + }, + { + "epoch": 1.8175614194722476, + "grad_norm": 1.0216658476486054, + "learning_rate": 7.080147633217311e-06, + "loss": 0.0382, + "step": 3995 + }, + { + "epoch": 1.8180163785259327, + "grad_norm": 0.8756873391931521, + "learning_rate": 7.078847816328419e-06, + "loss": 0.0296, + "step": 3996 + }, + { + "epoch": 1.8184713375796178, + "grad_norm": 0.6994105648415206, + "learning_rate": 7.077547829565471e-06, + "loss": 0.0449, + "step": 3997 + }, + { + "epoch": 1.818926296633303, + "grad_norm": 0.5970649996996983, + "learning_rate": 7.076247673034696e-06, + "loss": 0.0449, + "step": 3998 + }, + { + "epoch": 1.8193812556869882, + "grad_norm": 0.49363215921320414, + "learning_rate": 7.074947346842337e-06, + "loss": 0.0301, + "step": 3999 + }, + { + "epoch": 1.8198362147406733, + "grad_norm": 0.34205539276628477, + "learning_rate": 7.073646851094651e-06, + "loss": 0.0161, + "step": 4000 + }, + { + "epoch": 1.8202911737943586, + "grad_norm": 0.4754376726073723, + "learning_rate": 7.07234618589791e-06, + "loss": 0.035, + "step": 4001 + }, + { + "epoch": 1.8207461328480437, + "grad_norm": 0.43989711731105835, + "learning_rate": 7.071045351358396e-06, + "loss": 0.0275, + "step": 4002 + }, + { + "epoch": 1.8212010919017287, + "grad_norm": 0.6022776161475787, + "learning_rate": 7.06974434758241e-06, + "loss": 0.0296, + "step": 4003 + }, + { + "epoch": 1.821656050955414, + "grad_norm": 0.5856739185783606, + "learning_rate": 7.068443174676262e-06, + "loss": 0.0419, + "step": 4004 + }, + { + "epoch": 1.8221110100090991, + "grad_norm": 0.5657144514568859, + "learning_rate": 7.067141832746279e-06, + "loss": 0.0323, + "step": 4005 + }, + { + "epoch": 1.8225659690627842, + "grad_norm": 0.7015862599769166, + "learning_rate": 7.0658403218988004e-06, + "loss": 0.0408, + "step": 4006 + }, + { + "epoch": 1.8230209281164695, + "grad_norm": 0.499501929992003, + "learning_rate": 7.06453864224018e-06, + "loss": 0.0268, + "step": 4007 + }, + { + "epoch": 1.8234758871701549, + "grad_norm": 0.5065496773444177, + "learning_rate": 7.063236793876785e-06, + "loss": 0.0317, + "step": 4008 + }, + { + "epoch": 1.8239308462238397, + "grad_norm": 0.4125612464785829, + "learning_rate": 7.061934776914997e-06, + "loss": 0.0289, + "step": 4009 + }, + { + "epoch": 1.824385805277525, + "grad_norm": 0.38547122342302553, + "learning_rate": 7.06063259146121e-06, + "loss": 0.0204, + "step": 4010 + }, + { + "epoch": 1.8248407643312103, + "grad_norm": 0.5094320858269077, + "learning_rate": 7.0593302376218355e-06, + "loss": 0.0341, + "step": 4011 + }, + { + "epoch": 1.8252957233848952, + "grad_norm": 0.7171503681661315, + "learning_rate": 7.058027715503292e-06, + "loss": 0.0387, + "step": 4012 + }, + { + "epoch": 1.8257506824385805, + "grad_norm": 0.560749436168927, + "learning_rate": 7.056725025212017e-06, + "loss": 0.0335, + "step": 4013 + }, + { + "epoch": 1.8262056414922658, + "grad_norm": 0.5660083707755492, + "learning_rate": 7.055422166854461e-06, + "loss": 0.0303, + "step": 4014 + }, + { + "epoch": 1.826660600545951, + "grad_norm": 0.45700289605611905, + "learning_rate": 7.05411914053709e-06, + "loss": 0.0315, + "step": 4015 + }, + { + "epoch": 1.827115559599636, + "grad_norm": 0.5276769885577551, + "learning_rate": 7.052815946366377e-06, + "loss": 0.0351, + "step": 4016 + }, + { + "epoch": 1.8275705186533213, + "grad_norm": 0.5502525643150208, + "learning_rate": 7.051512584448815e-06, + "loss": 0.0397, + "step": 4017 + }, + { + "epoch": 1.8280254777070064, + "grad_norm": 0.482778868702987, + "learning_rate": 7.050209054890911e-06, + "loss": 0.0274, + "step": 4018 + }, + { + "epoch": 1.8284804367606915, + "grad_norm": 0.5312301615199478, + "learning_rate": 7.048905357799181e-06, + "loss": 0.04, + "step": 4019 + }, + { + "epoch": 1.8289353958143768, + "grad_norm": 0.6096548276462953, + "learning_rate": 7.047601493280157e-06, + "loss": 0.0283, + "step": 4020 + }, + { + "epoch": 1.829390354868062, + "grad_norm": 0.47444463674377546, + "learning_rate": 7.046297461440387e-06, + "loss": 0.0363, + "step": 4021 + }, + { + "epoch": 1.829845313921747, + "grad_norm": 0.6124965706930082, + "learning_rate": 7.044993262386429e-06, + "loss": 0.0409, + "step": 4022 + }, + { + "epoch": 1.8303002729754323, + "grad_norm": 0.6896982876579423, + "learning_rate": 7.043688896224856e-06, + "loss": 0.0381, + "step": 4023 + }, + { + "epoch": 1.8307552320291174, + "grad_norm": 0.51933427018194, + "learning_rate": 7.042384363062257e-06, + "loss": 0.0408, + "step": 4024 + }, + { + "epoch": 1.8312101910828025, + "grad_norm": 0.49264804902028037, + "learning_rate": 7.041079663005231e-06, + "loss": 0.0296, + "step": 4025 + }, + { + "epoch": 1.8316651501364878, + "grad_norm": 0.4301249590257083, + "learning_rate": 7.039774796160391e-06, + "loss": 0.0241, + "step": 4026 + }, + { + "epoch": 1.8321201091901729, + "grad_norm": 0.3618689267935005, + "learning_rate": 7.038469762634368e-06, + "loss": 0.0176, + "step": 4027 + }, + { + "epoch": 1.832575068243858, + "grad_norm": 0.618004674757222, + "learning_rate": 7.0371645625338e-06, + "loss": 0.0331, + "step": 4028 + }, + { + "epoch": 1.8330300272975433, + "grad_norm": 0.49202152928312604, + "learning_rate": 7.035859195965344e-06, + "loss": 0.038, + "step": 4029 + }, + { + "epoch": 1.8334849863512284, + "grad_norm": 0.4024547889272229, + "learning_rate": 7.034553663035669e-06, + "loss": 0.0257, + "step": 4030 + }, + { + "epoch": 1.8339399454049135, + "grad_norm": 0.4873924197201483, + "learning_rate": 7.033247963851457e-06, + "loss": 0.0196, + "step": 4031 + }, + { + "epoch": 1.8343949044585988, + "grad_norm": 0.49561205032728406, + "learning_rate": 7.031942098519403e-06, + "loss": 0.0291, + "step": 4032 + }, + { + "epoch": 1.8348498635122839, + "grad_norm": 0.49352370428101255, + "learning_rate": 7.030636067146217e-06, + "loss": 0.0349, + "step": 4033 + }, + { + "epoch": 1.835304822565969, + "grad_norm": 0.6452194492215327, + "learning_rate": 7.0293298698386215e-06, + "loss": 0.0385, + "step": 4034 + }, + { + "epoch": 1.8357597816196543, + "grad_norm": 0.5933562882498339, + "learning_rate": 7.028023506703354e-06, + "loss": 0.0414, + "step": 4035 + }, + { + "epoch": 1.8362147406733396, + "grad_norm": 0.4610645783735472, + "learning_rate": 7.0267169778471635e-06, + "loss": 0.0196, + "step": 4036 + }, + { + "epoch": 1.8366696997270244, + "grad_norm": 0.6054267023221399, + "learning_rate": 7.0254102833768134e-06, + "loss": 0.0479, + "step": 4037 + }, + { + "epoch": 1.8371246587807097, + "grad_norm": 0.5589777226125918, + "learning_rate": 7.024103423399083e-06, + "loss": 0.0358, + "step": 4038 + }, + { + "epoch": 1.837579617834395, + "grad_norm": 0.5022138566645574, + "learning_rate": 7.022796398020761e-06, + "loss": 0.0339, + "step": 4039 + }, + { + "epoch": 1.83803457688808, + "grad_norm": 0.6392214580188212, + "learning_rate": 7.021489207348651e-06, + "loss": 0.0415, + "step": 4040 + }, + { + "epoch": 1.8384895359417652, + "grad_norm": 0.4152816013558989, + "learning_rate": 7.020181851489574e-06, + "loss": 0.031, + "step": 4041 + }, + { + "epoch": 1.8389444949954505, + "grad_norm": 0.4997589182807433, + "learning_rate": 7.018874330550359e-06, + "loss": 0.0354, + "step": 4042 + }, + { + "epoch": 1.8393994540491356, + "grad_norm": 0.4878815217419077, + "learning_rate": 7.01756664463785e-06, + "loss": 0.026, + "step": 4043 + }, + { + "epoch": 1.8398544131028207, + "grad_norm": 0.4882957111581977, + "learning_rate": 7.016258793858906e-06, + "loss": 0.0279, + "step": 4044 + }, + { + "epoch": 1.840309372156506, + "grad_norm": 0.5091348211222342, + "learning_rate": 7.014950778320399e-06, + "loss": 0.0233, + "step": 4045 + }, + { + "epoch": 1.8407643312101911, + "grad_norm": 0.5165970173238531, + "learning_rate": 7.013642598129213e-06, + "loss": 0.035, + "step": 4046 + }, + { + "epoch": 1.8412192902638762, + "grad_norm": 0.45148158952539325, + "learning_rate": 7.01233425339225e-06, + "loss": 0.0274, + "step": 4047 + }, + { + "epoch": 1.8416742493175615, + "grad_norm": 0.47591721709121404, + "learning_rate": 7.011025744216417e-06, + "loss": 0.0277, + "step": 4048 + }, + { + "epoch": 1.8421292083712466, + "grad_norm": 0.5657580806360729, + "learning_rate": 7.0097170707086425e-06, + "loss": 0.0367, + "step": 4049 + }, + { + "epoch": 1.8425841674249317, + "grad_norm": 0.44240401287913805, + "learning_rate": 7.008408232975865e-06, + "loss": 0.0274, + "step": 4050 + }, + { + "epoch": 1.843039126478617, + "grad_norm": 3.2583444075327703, + "learning_rate": 7.007099231125036e-06, + "loss": 0.0524, + "step": 4051 + }, + { + "epoch": 1.843494085532302, + "grad_norm": 0.6144454233999067, + "learning_rate": 7.005790065263123e-06, + "loss": 0.0335, + "step": 4052 + }, + { + "epoch": 1.8439490445859872, + "grad_norm": 0.7055540833310427, + "learning_rate": 7.004480735497102e-06, + "loss": 0.0316, + "step": 4053 + }, + { + "epoch": 1.8444040036396725, + "grad_norm": 0.5278918909415452, + "learning_rate": 7.00317124193397e-06, + "loss": 0.0269, + "step": 4054 + }, + { + "epoch": 1.8448589626933576, + "grad_norm": 0.6178451990779255, + "learning_rate": 7.001861584680727e-06, + "loss": 0.0376, + "step": 4055 + }, + { + "epoch": 1.8453139217470427, + "grad_norm": 0.5258992283471937, + "learning_rate": 7.000551763844399e-06, + "loss": 0.0322, + "step": 4056 + }, + { + "epoch": 1.845768880800728, + "grad_norm": 0.5775961051186342, + "learning_rate": 6.999241779532014e-06, + "loss": 0.0379, + "step": 4057 + }, + { + "epoch": 1.846223839854413, + "grad_norm": 0.42259640511196034, + "learning_rate": 6.997931631850619e-06, + "loss": 0.0298, + "step": 4058 + }, + { + "epoch": 1.8466787989080982, + "grad_norm": 0.4873844670640969, + "learning_rate": 6.996621320907273e-06, + "loss": 0.0241, + "step": 4059 + }, + { + "epoch": 1.8471337579617835, + "grad_norm": 0.5151220931944738, + "learning_rate": 6.995310846809051e-06, + "loss": 0.0362, + "step": 4060 + }, + { + "epoch": 1.8475887170154686, + "grad_norm": 0.4246894371554255, + "learning_rate": 6.994000209663037e-06, + "loss": 0.0213, + "step": 4061 + }, + { + "epoch": 1.8480436760691537, + "grad_norm": 0.4666895538233693, + "learning_rate": 6.99268940957633e-06, + "loss": 0.0252, + "step": 4062 + }, + { + "epoch": 1.848498635122839, + "grad_norm": 0.6927898464919332, + "learning_rate": 6.991378446656043e-06, + "loss": 0.0401, + "step": 4063 + }, + { + "epoch": 1.8489535941765243, + "grad_norm": 0.6790701349214193, + "learning_rate": 6.990067321009303e-06, + "loss": 0.0294, + "step": 4064 + }, + { + "epoch": 1.8494085532302091, + "grad_norm": 0.8008386886618306, + "learning_rate": 6.9887560327432465e-06, + "loss": 0.0529, + "step": 4065 + }, + { + "epoch": 1.8498635122838945, + "grad_norm": 0.31982778652087174, + "learning_rate": 6.9874445819650315e-06, + "loss": 0.013, + "step": 4066 + }, + { + "epoch": 1.8503184713375798, + "grad_norm": 0.7293561184884446, + "learning_rate": 6.986132968781818e-06, + "loss": 0.0426, + "step": 4067 + }, + { + "epoch": 1.8507734303912646, + "grad_norm": 0.44612197517231394, + "learning_rate": 6.9848211933007904e-06, + "loss": 0.0267, + "step": 4068 + }, + { + "epoch": 1.85122838944495, + "grad_norm": 0.5644789933631869, + "learning_rate": 6.983509255629136e-06, + "loss": 0.0256, + "step": 4069 + }, + { + "epoch": 1.8516833484986353, + "grad_norm": 0.630824695024137, + "learning_rate": 6.982197155874062e-06, + "loss": 0.0353, + "step": 4070 + }, + { + "epoch": 1.8521383075523203, + "grad_norm": 0.6309372938638874, + "learning_rate": 6.980884894142789e-06, + "loss": 0.0385, + "step": 4071 + }, + { + "epoch": 1.8525932666060054, + "grad_norm": 0.549135184924981, + "learning_rate": 6.979572470542549e-06, + "loss": 0.0286, + "step": 4072 + }, + { + "epoch": 1.8530482256596907, + "grad_norm": 0.39639697149656766, + "learning_rate": 6.978259885180585e-06, + "loss": 0.0288, + "step": 4073 + }, + { + "epoch": 1.8535031847133758, + "grad_norm": 0.6033987036469487, + "learning_rate": 6.976947138164157e-06, + "loss": 0.0366, + "step": 4074 + }, + { + "epoch": 1.853958143767061, + "grad_norm": 0.4554154294708478, + "learning_rate": 6.975634229600539e-06, + "loss": 0.0361, + "step": 4075 + }, + { + "epoch": 1.8544131028207462, + "grad_norm": 0.5039912781556998, + "learning_rate": 6.9743211595970105e-06, + "loss": 0.0298, + "step": 4076 + }, + { + "epoch": 1.8548680618744313, + "grad_norm": 0.6270034579834832, + "learning_rate": 6.973007928260874e-06, + "loss": 0.0329, + "step": 4077 + }, + { + "epoch": 1.8553230209281164, + "grad_norm": 0.4343606029179238, + "learning_rate": 6.971694535699441e-06, + "loss": 0.0226, + "step": 4078 + }, + { + "epoch": 1.8557779799818017, + "grad_norm": 0.511619098133748, + "learning_rate": 6.970380982020033e-06, + "loss": 0.0311, + "step": 4079 + }, + { + "epoch": 1.8562329390354868, + "grad_norm": 0.6291419962861428, + "learning_rate": 6.969067267329989e-06, + "loss": 0.0428, + "step": 4080 + }, + { + "epoch": 1.856687898089172, + "grad_norm": 0.7658450189704773, + "learning_rate": 6.967753391736662e-06, + "loss": 0.047, + "step": 4081 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.6066682501864417, + "learning_rate": 6.966439355347412e-06, + "loss": 0.0375, + "step": 4082 + }, + { + "epoch": 1.8575978161965423, + "grad_norm": 0.7124765630916889, + "learning_rate": 6.965125158269619e-06, + "loss": 0.0324, + "step": 4083 + }, + { + "epoch": 1.8580527752502274, + "grad_norm": 0.5157157965087068, + "learning_rate": 6.963810800610672e-06, + "loss": 0.0269, + "step": 4084 + }, + { + "epoch": 1.8585077343039127, + "grad_norm": 0.47744357360890555, + "learning_rate": 6.962496282477976e-06, + "loss": 0.0279, + "step": 4085 + }, + { + "epoch": 1.8589626933575978, + "grad_norm": 0.5092031639283099, + "learning_rate": 6.961181603978946e-06, + "loss": 0.03, + "step": 4086 + }, + { + "epoch": 1.8594176524112829, + "grad_norm": 0.5650533067317093, + "learning_rate": 6.959866765221012e-06, + "loss": 0.0278, + "step": 4087 + }, + { + "epoch": 1.8598726114649682, + "grad_norm": 0.6170436102843387, + "learning_rate": 6.958551766311616e-06, + "loss": 0.0306, + "step": 4088 + }, + { + "epoch": 1.8603275705186533, + "grad_norm": 0.7496398968567375, + "learning_rate": 6.957236607358216e-06, + "loss": 0.0383, + "step": 4089 + }, + { + "epoch": 1.8607825295723384, + "grad_norm": 0.6582698994869104, + "learning_rate": 6.955921288468277e-06, + "loss": 0.0395, + "step": 4090 + }, + { + "epoch": 1.8612374886260237, + "grad_norm": 0.5216081345029945, + "learning_rate": 6.954605809749284e-06, + "loss": 0.0366, + "step": 4091 + }, + { + "epoch": 1.861692447679709, + "grad_norm": 0.6946230560815133, + "learning_rate": 6.953290171308732e-06, + "loss": 0.0394, + "step": 4092 + }, + { + "epoch": 1.8621474067333939, + "grad_norm": 0.5790626531179943, + "learning_rate": 6.951974373254127e-06, + "loss": 0.0287, + "step": 4093 + }, + { + "epoch": 1.8626023657870792, + "grad_norm": 0.4183972019857722, + "learning_rate": 6.950658415692992e-06, + "loss": 0.0307, + "step": 4094 + }, + { + "epoch": 1.8630573248407645, + "grad_norm": 0.5568397799801404, + "learning_rate": 6.949342298732861e-06, + "loss": 0.0378, + "step": 4095 + }, + { + "epoch": 1.8635122838944493, + "grad_norm": 0.5743063582553911, + "learning_rate": 6.948026022481279e-06, + "loss": 0.029, + "step": 4096 + }, + { + "epoch": 1.8639672429481347, + "grad_norm": 0.39922835370133297, + "learning_rate": 6.946709587045808e-06, + "loss": 0.0216, + "step": 4097 + }, + { + "epoch": 1.86442220200182, + "grad_norm": 0.652730513196503, + "learning_rate": 6.945392992534022e-06, + "loss": 0.0427, + "step": 4098 + }, + { + "epoch": 1.864877161055505, + "grad_norm": 0.3791674162212088, + "learning_rate": 6.9440762390535046e-06, + "loss": 0.024, + "step": 4099 + }, + { + "epoch": 1.8653321201091901, + "grad_norm": 0.533439629209361, + "learning_rate": 6.9427593267118565e-06, + "loss": 0.0317, + "step": 4100 + }, + { + "epoch": 1.8657870791628755, + "grad_norm": 0.6602195993489012, + "learning_rate": 6.941442255616691e-06, + "loss": 0.048, + "step": 4101 + }, + { + "epoch": 1.8662420382165605, + "grad_norm": 0.5800092363666163, + "learning_rate": 6.94012502587563e-06, + "loss": 0.0448, + "step": 4102 + }, + { + "epoch": 1.8666969972702456, + "grad_norm": 0.5347550571270917, + "learning_rate": 6.938807637596315e-06, + "loss": 0.0307, + "step": 4103 + }, + { + "epoch": 1.867151956323931, + "grad_norm": 0.6828358934580386, + "learning_rate": 6.937490090886394e-06, + "loss": 0.0415, + "step": 4104 + }, + { + "epoch": 1.867606915377616, + "grad_norm": 0.49549687933880604, + "learning_rate": 6.936172385853534e-06, + "loss": 0.0334, + "step": 4105 + }, + { + "epoch": 1.8680618744313011, + "grad_norm": 0.37121930632346345, + "learning_rate": 6.934854522605409e-06, + "loss": 0.0148, + "step": 4106 + }, + { + "epoch": 1.8685168334849864, + "grad_norm": 0.5637249554892388, + "learning_rate": 6.9335365012497095e-06, + "loss": 0.0346, + "step": 4107 + }, + { + "epoch": 1.8689717925386715, + "grad_norm": 0.4079255439605901, + "learning_rate": 6.93221832189414e-06, + "loss": 0.0238, + "step": 4108 + }, + { + "epoch": 1.8694267515923566, + "grad_norm": 0.51260711597464, + "learning_rate": 6.930899984646416e-06, + "loss": 0.031, + "step": 4109 + }, + { + "epoch": 1.869881710646042, + "grad_norm": 0.6268143820434923, + "learning_rate": 6.929581489614263e-06, + "loss": 0.0494, + "step": 4110 + }, + { + "epoch": 1.870336669699727, + "grad_norm": 0.622134803563498, + "learning_rate": 6.928262836905426e-06, + "loss": 0.025, + "step": 4111 + }, + { + "epoch": 1.870791628753412, + "grad_norm": 0.5332846552633914, + "learning_rate": 6.926944026627658e-06, + "loss": 0.0276, + "step": 4112 + }, + { + "epoch": 1.8712465878070974, + "grad_norm": 0.5708974782402968, + "learning_rate": 6.925625058888725e-06, + "loss": 0.032, + "step": 4113 + }, + { + "epoch": 1.8717015468607825, + "grad_norm": 0.4806578138038823, + "learning_rate": 6.924305933796409e-06, + "loss": 0.03, + "step": 4114 + }, + { + "epoch": 1.8721565059144676, + "grad_norm": 0.39212455930029516, + "learning_rate": 6.922986651458503e-06, + "loss": 0.0264, + "step": 4115 + }, + { + "epoch": 1.872611464968153, + "grad_norm": 0.4259990007914431, + "learning_rate": 6.921667211982811e-06, + "loss": 0.0196, + "step": 4116 + }, + { + "epoch": 1.873066424021838, + "grad_norm": 0.5043990484033058, + "learning_rate": 6.920347615477153e-06, + "loss": 0.0272, + "step": 4117 + }, + { + "epoch": 1.873521383075523, + "grad_norm": 0.6315043827155147, + "learning_rate": 6.919027862049359e-06, + "loss": 0.0347, + "step": 4118 + }, + { + "epoch": 1.8739763421292084, + "grad_norm": 0.5086595636715445, + "learning_rate": 6.917707951807275e-06, + "loss": 0.0294, + "step": 4119 + }, + { + "epoch": 1.8744313011828937, + "grad_norm": 0.7059881241515326, + "learning_rate": 6.9163878848587585e-06, + "loss": 0.0375, + "step": 4120 + }, + { + "epoch": 1.8748862602365786, + "grad_norm": 0.4981489285964966, + "learning_rate": 6.915067661311676e-06, + "loss": 0.0229, + "step": 4121 + }, + { + "epoch": 1.8753412192902639, + "grad_norm": 0.7535656971705376, + "learning_rate": 6.913747281273916e-06, + "loss": 0.0441, + "step": 4122 + }, + { + "epoch": 1.8757961783439492, + "grad_norm": 0.5606563081185906, + "learning_rate": 6.912426744853368e-06, + "loss": 0.0182, + "step": 4123 + }, + { + "epoch": 1.876251137397634, + "grad_norm": 0.4800039645683506, + "learning_rate": 6.911106052157943e-06, + "loss": 0.0329, + "step": 4124 + }, + { + "epoch": 1.8767060964513194, + "grad_norm": 0.7422807931262311, + "learning_rate": 6.909785203295563e-06, + "loss": 0.0483, + "step": 4125 + }, + { + "epoch": 1.8771610555050047, + "grad_norm": 0.7428282082101595, + "learning_rate": 6.908464198374161e-06, + "loss": 0.0395, + "step": 4126 + }, + { + "epoch": 1.8776160145586898, + "grad_norm": 0.5567461218204124, + "learning_rate": 6.907143037501681e-06, + "loss": 0.0261, + "step": 4127 + }, + { + "epoch": 1.8780709736123748, + "grad_norm": 0.4535800153644064, + "learning_rate": 6.9058217207860856e-06, + "loss": 0.0205, + "step": 4128 + }, + { + "epoch": 1.8785259326660602, + "grad_norm": 0.5380949608863376, + "learning_rate": 6.904500248335348e-06, + "loss": 0.0284, + "step": 4129 + }, + { + "epoch": 1.8789808917197452, + "grad_norm": 0.5576353977632108, + "learning_rate": 6.903178620257448e-06, + "loss": 0.0315, + "step": 4130 + }, + { + "epoch": 1.8794358507734303, + "grad_norm": 0.4049573082389251, + "learning_rate": 6.901856836660386e-06, + "loss": 0.0202, + "step": 4131 + }, + { + "epoch": 1.8798908098271156, + "grad_norm": 0.843414674417525, + "learning_rate": 6.900534897652174e-06, + "loss": 0.0521, + "step": 4132 + }, + { + "epoch": 1.8803457688808007, + "grad_norm": 0.5932961835636155, + "learning_rate": 6.8992128033408316e-06, + "loss": 0.032, + "step": 4133 + }, + { + "epoch": 1.8808007279344858, + "grad_norm": 0.5235441475273466, + "learning_rate": 6.8978905538343965e-06, + "loss": 0.0317, + "step": 4134 + }, + { + "epoch": 1.8812556869881711, + "grad_norm": 0.5931362305529173, + "learning_rate": 6.8965681492409145e-06, + "loss": 0.0357, + "step": 4135 + }, + { + "epoch": 1.8817106460418562, + "grad_norm": 0.6866152295154555, + "learning_rate": 6.895245589668449e-06, + "loss": 0.0322, + "step": 4136 + }, + { + "epoch": 1.8821656050955413, + "grad_norm": 0.5571567976815832, + "learning_rate": 6.893922875225072e-06, + "loss": 0.0342, + "step": 4137 + }, + { + "epoch": 1.8826205641492266, + "grad_norm": 0.6131320785326742, + "learning_rate": 6.892600006018871e-06, + "loss": 0.0411, + "step": 4138 + }, + { + "epoch": 1.8830755232029117, + "grad_norm": 0.4805462802325607, + "learning_rate": 6.891276982157946e-06, + "loss": 0.028, + "step": 4139 + }, + { + "epoch": 1.8835304822565968, + "grad_norm": 0.611829755478912, + "learning_rate": 6.8899538037504055e-06, + "loss": 0.0414, + "step": 4140 + }, + { + "epoch": 1.8839854413102821, + "grad_norm": 0.7171505872571483, + "learning_rate": 6.8886304709043764e-06, + "loss": 0.04, + "step": 4141 + }, + { + "epoch": 1.8844404003639672, + "grad_norm": 0.6341903094003845, + "learning_rate": 6.8873069837279915e-06, + "loss": 0.0534, + "step": 4142 + }, + { + "epoch": 1.8848953594176523, + "grad_norm": 0.5384311264461159, + "learning_rate": 6.885983342329406e-06, + "loss": 0.027, + "step": 4143 + }, + { + "epoch": 1.8853503184713376, + "grad_norm": 0.5989581507822104, + "learning_rate": 6.884659546816777e-06, + "loss": 0.0359, + "step": 4144 + }, + { + "epoch": 1.8858052775250227, + "grad_norm": 0.43766012476693866, + "learning_rate": 6.883335597298279e-06, + "loss": 0.0209, + "step": 4145 + }, + { + "epoch": 1.8862602365787078, + "grad_norm": 0.5834755472420379, + "learning_rate": 6.882011493882105e-06, + "loss": 0.0233, + "step": 4146 + }, + { + "epoch": 1.886715195632393, + "grad_norm": 0.6972609731827578, + "learning_rate": 6.880687236676449e-06, + "loss": 0.0438, + "step": 4147 + }, + { + "epoch": 1.8871701546860784, + "grad_norm": 0.4316315646247104, + "learning_rate": 6.879362825789525e-06, + "loss": 0.0199, + "step": 4148 + }, + { + "epoch": 1.8876251137397633, + "grad_norm": 0.3699795833373929, + "learning_rate": 6.8780382613295575e-06, + "loss": 0.0214, + "step": 4149 + }, + { + "epoch": 1.8880800727934486, + "grad_norm": 0.5280634658210035, + "learning_rate": 6.876713543404785e-06, + "loss": 0.0294, + "step": 4150 + }, + { + "epoch": 1.888535031847134, + "grad_norm": 0.42649598275949047, + "learning_rate": 6.875388672123458e-06, + "loss": 0.0209, + "step": 4151 + }, + { + "epoch": 1.8889899909008188, + "grad_norm": 0.6006523238443225, + "learning_rate": 6.874063647593836e-06, + "loss": 0.0399, + "step": 4152 + }, + { + "epoch": 1.889444949954504, + "grad_norm": 0.7779759284956044, + "learning_rate": 6.872738469924198e-06, + "loss": 0.0507, + "step": 4153 + }, + { + "epoch": 1.8898999090081894, + "grad_norm": 0.3897094032440889, + "learning_rate": 6.871413139222827e-06, + "loss": 0.0186, + "step": 4154 + }, + { + "epoch": 1.8903548680618745, + "grad_norm": 1.0454124974291628, + "learning_rate": 6.870087655598028e-06, + "loss": 0.0839, + "step": 4155 + }, + { + "epoch": 1.8908098271155596, + "grad_norm": 0.39688930772130354, + "learning_rate": 6.8687620191581095e-06, + "loss": 0.0212, + "step": 4156 + }, + { + "epoch": 1.8912647861692449, + "grad_norm": 0.41441871720990314, + "learning_rate": 6.867436230011397e-06, + "loss": 0.0218, + "step": 4157 + }, + { + "epoch": 1.89171974522293, + "grad_norm": 0.45977519202691797, + "learning_rate": 6.866110288266232e-06, + "loss": 0.0255, + "step": 4158 + }, + { + "epoch": 1.892174704276615, + "grad_norm": 0.4251437168707794, + "learning_rate": 6.864784194030958e-06, + "loss": 0.0254, + "step": 4159 + }, + { + "epoch": 1.8926296633303004, + "grad_norm": 0.5804153437616366, + "learning_rate": 6.863457947413944e-06, + "loss": 0.037, + "step": 4160 + }, + { + "epoch": 1.8930846223839854, + "grad_norm": 0.6495689827219507, + "learning_rate": 6.862131548523561e-06, + "loss": 0.0358, + "step": 4161 + }, + { + "epoch": 1.8935395814376705, + "grad_norm": 0.5133413068665638, + "learning_rate": 6.8608049974681964e-06, + "loss": 0.0393, + "step": 4162 + }, + { + "epoch": 1.8939945404913558, + "grad_norm": 0.4862566575630052, + "learning_rate": 6.859478294356252e-06, + "loss": 0.0204, + "step": 4163 + }, + { + "epoch": 1.894449499545041, + "grad_norm": 0.5755819284310981, + "learning_rate": 6.858151439296137e-06, + "loss": 0.0349, + "step": 4164 + }, + { + "epoch": 1.894904458598726, + "grad_norm": 0.6013684152559066, + "learning_rate": 6.85682443239628e-06, + "loss": 0.0299, + "step": 4165 + }, + { + "epoch": 1.8953594176524113, + "grad_norm": 0.593056511577885, + "learning_rate": 6.855497273765113e-06, + "loss": 0.0338, + "step": 4166 + }, + { + "epoch": 1.8958143767060964, + "grad_norm": 0.5140109171479168, + "learning_rate": 6.85416996351109e-06, + "loss": 0.0295, + "step": 4167 + }, + { + "epoch": 1.8962693357597815, + "grad_norm": 0.580481113661126, + "learning_rate": 6.8528425017426715e-06, + "loss": 0.0385, + "step": 4168 + }, + { + "epoch": 1.8967242948134668, + "grad_norm": 0.4729425465735144, + "learning_rate": 6.851514888568329e-06, + "loss": 0.0278, + "step": 4169 + }, + { + "epoch": 1.897179253867152, + "grad_norm": 0.6053992371768653, + "learning_rate": 6.850187124096552e-06, + "loss": 0.0415, + "step": 4170 + }, + { + "epoch": 1.897634212920837, + "grad_norm": 2.476045237305704, + "learning_rate": 6.848859208435838e-06, + "loss": 0.0426, + "step": 4171 + }, + { + "epoch": 1.8980891719745223, + "grad_norm": 0.5716074235612479, + "learning_rate": 6.847531141694701e-06, + "loss": 0.0419, + "step": 4172 + }, + { + "epoch": 1.8985441310282076, + "grad_norm": 0.7002000989512744, + "learning_rate": 6.846202923981661e-06, + "loss": 0.0362, + "step": 4173 + }, + { + "epoch": 1.8989990900818925, + "grad_norm": 0.5410115757258703, + "learning_rate": 6.844874555405256e-06, + "loss": 0.0307, + "step": 4174 + }, + { + "epoch": 1.8994540491355778, + "grad_norm": 0.36725599427465627, + "learning_rate": 6.8435460360740336e-06, + "loss": 0.022, + "step": 4175 + }, + { + "epoch": 1.8999090081892631, + "grad_norm": 0.3603629432216531, + "learning_rate": 6.842217366096553e-06, + "loss": 0.0183, + "step": 4176 + }, + { + "epoch": 1.900363967242948, + "grad_norm": 0.538845244670838, + "learning_rate": 6.84088854558139e-06, + "loss": 0.031, + "step": 4177 + }, + { + "epoch": 1.9008189262966333, + "grad_norm": 0.61286557236097, + "learning_rate": 6.839559574637128e-06, + "loss": 0.0322, + "step": 4178 + }, + { + "epoch": 1.9012738853503186, + "grad_norm": 0.44454321228530824, + "learning_rate": 6.838230453372365e-06, + "loss": 0.0244, + "step": 4179 + }, + { + "epoch": 1.9017288444040037, + "grad_norm": 0.6436314987022439, + "learning_rate": 6.836901181895711e-06, + "loss": 0.038, + "step": 4180 + }, + { + "epoch": 1.9021838034576888, + "grad_norm": 0.6354133524673475, + "learning_rate": 6.835571760315788e-06, + "loss": 0.0442, + "step": 4181 + }, + { + "epoch": 1.902638762511374, + "grad_norm": 0.6508106654348409, + "learning_rate": 6.83424218874123e-06, + "loss": 0.0417, + "step": 4182 + }, + { + "epoch": 1.9030937215650592, + "grad_norm": 0.6031856657210548, + "learning_rate": 6.832912467280684e-06, + "loss": 0.0326, + "step": 4183 + }, + { + "epoch": 1.9035486806187443, + "grad_norm": 0.5143443237954904, + "learning_rate": 6.831582596042807e-06, + "loss": 0.0312, + "step": 4184 + }, + { + "epoch": 1.9040036396724296, + "grad_norm": 0.625403042295468, + "learning_rate": 6.8302525751362724e-06, + "loss": 0.04, + "step": 4185 + }, + { + "epoch": 1.9044585987261147, + "grad_norm": 0.42059937773327577, + "learning_rate": 6.8289224046697645e-06, + "loss": 0.0251, + "step": 4186 + }, + { + "epoch": 1.9049135577797998, + "grad_norm": 0.5024413136546955, + "learning_rate": 6.827592084751975e-06, + "loss": 0.0366, + "step": 4187 + }, + { + "epoch": 1.905368516833485, + "grad_norm": 0.5498246757421382, + "learning_rate": 6.826261615491614e-06, + "loss": 0.0288, + "step": 4188 + }, + { + "epoch": 1.9058234758871702, + "grad_norm": 0.5713051541552272, + "learning_rate": 6.824930996997401e-06, + "loss": 0.0342, + "step": 4189 + }, + { + "epoch": 1.9062784349408552, + "grad_norm": 0.47256159060913766, + "learning_rate": 6.823600229378069e-06, + "loss": 0.0231, + "step": 4190 + }, + { + "epoch": 1.9067333939945406, + "grad_norm": 0.5070885208390664, + "learning_rate": 6.82226931274236e-06, + "loss": 0.0217, + "step": 4191 + }, + { + "epoch": 1.9071883530482256, + "grad_norm": 0.6279779773842173, + "learning_rate": 6.820938247199035e-06, + "loss": 0.0339, + "step": 4192 + }, + { + "epoch": 1.9076433121019107, + "grad_norm": 0.6186141625393742, + "learning_rate": 6.819607032856857e-06, + "loss": 0.0374, + "step": 4193 + }, + { + "epoch": 1.908098271155596, + "grad_norm": 0.48779739664063554, + "learning_rate": 6.81827566982461e-06, + "loss": 0.0238, + "step": 4194 + }, + { + "epoch": 1.9085532302092811, + "grad_norm": 0.6950217579037524, + "learning_rate": 6.816944158211088e-06, + "loss": 0.0513, + "step": 4195 + }, + { + "epoch": 1.9090081892629662, + "grad_norm": 0.5355831607004834, + "learning_rate": 6.815612498125093e-06, + "loss": 0.0311, + "step": 4196 + }, + { + "epoch": 1.9094631483166515, + "grad_norm": 0.742063347605555, + "learning_rate": 6.814280689675444e-06, + "loss": 0.0511, + "step": 4197 + }, + { + "epoch": 1.9099181073703366, + "grad_norm": 0.47161758023745004, + "learning_rate": 6.812948732970971e-06, + "loss": 0.0228, + "step": 4198 + }, + { + "epoch": 1.9103730664240217, + "grad_norm": 0.6490262091704564, + "learning_rate": 6.811616628120514e-06, + "loss": 0.0432, + "step": 4199 + }, + { + "epoch": 1.910828025477707, + "grad_norm": 0.38283337091143843, + "learning_rate": 6.8102843752329286e-06, + "loss": 0.0186, + "step": 4200 + }, + { + "epoch": 1.9112829845313923, + "grad_norm": 0.4468451599677746, + "learning_rate": 6.808951974417077e-06, + "loss": 0.0312, + "step": 4201 + }, + { + "epoch": 1.9117379435850772, + "grad_norm": 0.5048947846466737, + "learning_rate": 6.807619425781841e-06, + "loss": 0.0361, + "step": 4202 + }, + { + "epoch": 1.9121929026387625, + "grad_norm": 0.8947871093541464, + "learning_rate": 6.806286729436109e-06, + "loss": 0.0819, + "step": 4203 + }, + { + "epoch": 1.9126478616924478, + "grad_norm": 0.38672962216968826, + "learning_rate": 6.804953885488783e-06, + "loss": 0.0221, + "step": 4204 + }, + { + "epoch": 1.9131028207461327, + "grad_norm": 0.6261240402378399, + "learning_rate": 6.803620894048773e-06, + "loss": 0.0537, + "step": 4205 + }, + { + "epoch": 1.913557779799818, + "grad_norm": 0.48593300752539076, + "learning_rate": 6.802287755225012e-06, + "loss": 0.0384, + "step": 4206 + }, + { + "epoch": 1.9140127388535033, + "grad_norm": 0.6664418812806434, + "learning_rate": 6.800954469126434e-06, + "loss": 0.0375, + "step": 4207 + }, + { + "epoch": 1.9144676979071884, + "grad_norm": 0.4580710122565072, + "learning_rate": 6.799621035861989e-06, + "loss": 0.0218, + "step": 4208 + }, + { + "epoch": 1.9149226569608735, + "grad_norm": 0.36316540191417757, + "learning_rate": 6.798287455540642e-06, + "loss": 0.0185, + "step": 4209 + }, + { + "epoch": 1.9153776160145588, + "grad_norm": 0.5676289334367329, + "learning_rate": 6.7969537282713624e-06, + "loss": 0.0279, + "step": 4210 + }, + { + "epoch": 1.915832575068244, + "grad_norm": 0.5582353676662519, + "learning_rate": 6.795619854163143e-06, + "loss": 0.0348, + "step": 4211 + }, + { + "epoch": 1.916287534121929, + "grad_norm": 0.7340230920663307, + "learning_rate": 6.794285833324973e-06, + "loss": 0.0397, + "step": 4212 + }, + { + "epoch": 1.9167424931756143, + "grad_norm": 0.497610110970188, + "learning_rate": 6.792951665865871e-06, + "loss": 0.0297, + "step": 4213 + }, + { + "epoch": 1.9171974522292994, + "grad_norm": 0.8901064327873458, + "learning_rate": 6.791617351894855e-06, + "loss": 0.0599, + "step": 4214 + }, + { + "epoch": 1.9176524112829845, + "grad_norm": 0.6574222134836127, + "learning_rate": 6.790282891520958e-06, + "loss": 0.0405, + "step": 4215 + }, + { + "epoch": 1.9181073703366698, + "grad_norm": 0.5366635685640508, + "learning_rate": 6.788948284853232e-06, + "loss": 0.0306, + "step": 4216 + }, + { + "epoch": 1.9185623293903549, + "grad_norm": 0.4726708314237089, + "learning_rate": 6.787613532000727e-06, + "loss": 0.021, + "step": 4217 + }, + { + "epoch": 1.91901728844404, + "grad_norm": 0.6795314395307376, + "learning_rate": 6.786278633072521e-06, + "loss": 0.0319, + "step": 4218 + }, + { + "epoch": 1.9194722474977253, + "grad_norm": 0.45126350652234226, + "learning_rate": 6.784943588177687e-06, + "loss": 0.0269, + "step": 4219 + }, + { + "epoch": 1.9199272065514104, + "grad_norm": 0.6942845005528151, + "learning_rate": 6.783608397425328e-06, + "loss": 0.0405, + "step": 4220 + }, + { + "epoch": 1.9203821656050954, + "grad_norm": 0.32180086236608474, + "learning_rate": 6.782273060924544e-06, + "loss": 0.0179, + "step": 4221 + }, + { + "epoch": 1.9208371246587808, + "grad_norm": 0.7724119213459093, + "learning_rate": 6.780937578784452e-06, + "loss": 0.0586, + "step": 4222 + }, + { + "epoch": 1.9212920837124658, + "grad_norm": 0.4289017376812481, + "learning_rate": 6.779601951114186e-06, + "loss": 0.0217, + "step": 4223 + }, + { + "epoch": 1.921747042766151, + "grad_norm": 0.5465287260644731, + "learning_rate": 6.778266178022884e-06, + "loss": 0.0344, + "step": 4224 + }, + { + "epoch": 1.9222020018198362, + "grad_norm": 0.38990536763795697, + "learning_rate": 6.776930259619703e-06, + "loss": 0.0282, + "step": 4225 + }, + { + "epoch": 1.9226569608735213, + "grad_norm": 0.7934359797785354, + "learning_rate": 6.775594196013803e-06, + "loss": 0.0323, + "step": 4226 + }, + { + "epoch": 1.9231119199272064, + "grad_norm": 0.678124033784331, + "learning_rate": 6.774257987314364e-06, + "loss": 0.0362, + "step": 4227 + }, + { + "epoch": 1.9235668789808917, + "grad_norm": 0.5797665311517408, + "learning_rate": 6.772921633630577e-06, + "loss": 0.0331, + "step": 4228 + }, + { + "epoch": 1.924021838034577, + "grad_norm": 0.48896590389820244, + "learning_rate": 6.7715851350716375e-06, + "loss": 0.0281, + "step": 4229 + }, + { + "epoch": 1.924476797088262, + "grad_norm": 0.5081972827747478, + "learning_rate": 6.7702484917467635e-06, + "loss": 0.0324, + "step": 4230 + }, + { + "epoch": 1.9249317561419472, + "grad_norm": 0.6821635948549074, + "learning_rate": 6.768911703765175e-06, + "loss": 0.0486, + "step": 4231 + }, + { + "epoch": 1.9253867151956325, + "grad_norm": 0.5407561356587544, + "learning_rate": 6.767574771236114e-06, + "loss": 0.0381, + "step": 4232 + }, + { + "epoch": 1.9258416742493174, + "grad_norm": 0.5749904174805588, + "learning_rate": 6.766237694268822e-06, + "loss": 0.0414, + "step": 4233 + }, + { + "epoch": 1.9262966333030027, + "grad_norm": 0.5171904308253861, + "learning_rate": 6.764900472972562e-06, + "loss": 0.0244, + "step": 4234 + }, + { + "epoch": 1.926751592356688, + "grad_norm": 0.5536805309306727, + "learning_rate": 6.763563107456607e-06, + "loss": 0.0303, + "step": 4235 + }, + { + "epoch": 1.9272065514103731, + "grad_norm": 0.671886081855136, + "learning_rate": 6.762225597830236e-06, + "loss": 0.0585, + "step": 4236 + }, + { + "epoch": 1.9276615104640582, + "grad_norm": 0.5428504655942143, + "learning_rate": 6.760887944202751e-06, + "loss": 0.0165, + "step": 4237 + }, + { + "epoch": 1.9281164695177435, + "grad_norm": 0.5289601666695274, + "learning_rate": 6.759550146683454e-06, + "loss": 0.0308, + "step": 4238 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.500870856273744, + "learning_rate": 6.758212205381665e-06, + "loss": 0.0303, + "step": 4239 + }, + { + "epoch": 1.9290263876251137, + "grad_norm": 0.548165583645003, + "learning_rate": 6.7568741204067145e-06, + "loss": 0.0299, + "step": 4240 + }, + { + "epoch": 1.929481346678799, + "grad_norm": 0.6690133186126083, + "learning_rate": 6.7555358918679435e-06, + "loss": 0.0332, + "step": 4241 + }, + { + "epoch": 1.929936305732484, + "grad_norm": 0.6155617299772874, + "learning_rate": 6.75419751987471e-06, + "loss": 0.0314, + "step": 4242 + }, + { + "epoch": 1.9303912647861692, + "grad_norm": 0.5839720790284428, + "learning_rate": 6.752859004536376e-06, + "loss": 0.035, + "step": 4243 + }, + { + "epoch": 1.9308462238398545, + "grad_norm": 0.6772350774170958, + "learning_rate": 6.751520345962319e-06, + "loss": 0.0425, + "step": 4244 + }, + { + "epoch": 1.9313011828935396, + "grad_norm": 0.44906985575019187, + "learning_rate": 6.7501815442619315e-06, + "loss": 0.0245, + "step": 4245 + }, + { + "epoch": 1.9317561419472247, + "grad_norm": 0.559749956318088, + "learning_rate": 6.74884259954461e-06, + "loss": 0.0309, + "step": 4246 + }, + { + "epoch": 1.93221110100091, + "grad_norm": 0.48000720862490154, + "learning_rate": 6.747503511919768e-06, + "loss": 0.0348, + "step": 4247 + }, + { + "epoch": 1.932666060054595, + "grad_norm": 0.48872163153009507, + "learning_rate": 6.746164281496832e-06, + "loss": 0.035, + "step": 4248 + }, + { + "epoch": 1.9331210191082802, + "grad_norm": 0.5850838894642061, + "learning_rate": 6.744824908385237e-06, + "loss": 0.0407, + "step": 4249 + }, + { + "epoch": 1.9335759781619655, + "grad_norm": 0.3777275153128007, + "learning_rate": 6.743485392694429e-06, + "loss": 0.011, + "step": 4250 + }, + { + "epoch": 1.9340309372156506, + "grad_norm": 0.4899042906423011, + "learning_rate": 6.742145734533868e-06, + "loss": 0.0268, + "step": 4251 + }, + { + "epoch": 1.9344858962693356, + "grad_norm": 0.497140311954729, + "learning_rate": 6.740805934013027e-06, + "loss": 0.0336, + "step": 4252 + }, + { + "epoch": 1.934940855323021, + "grad_norm": 0.5497428673732642, + "learning_rate": 6.739465991241385e-06, + "loss": 0.0402, + "step": 4253 + }, + { + "epoch": 1.935395814376706, + "grad_norm": 0.47312365256676847, + "learning_rate": 6.7381259063284375e-06, + "loss": 0.0213, + "step": 4254 + }, + { + "epoch": 1.9358507734303911, + "grad_norm": 0.48376291484120243, + "learning_rate": 6.7367856793836905e-06, + "loss": 0.0262, + "step": 4255 + }, + { + "epoch": 1.9363057324840764, + "grad_norm": 0.8142605312434946, + "learning_rate": 6.7354453105166615e-06, + "loss": 0.0508, + "step": 4256 + }, + { + "epoch": 1.9367606915377618, + "grad_norm": 0.42547889245737935, + "learning_rate": 6.734104799836878e-06, + "loss": 0.0166, + "step": 4257 + }, + { + "epoch": 1.9372156505914466, + "grad_norm": 0.5833429245001462, + "learning_rate": 6.7327641474538816e-06, + "loss": 0.0193, + "step": 4258 + }, + { + "epoch": 1.937670609645132, + "grad_norm": 0.5371727299945996, + "learning_rate": 6.731423353477224e-06, + "loss": 0.0337, + "step": 4259 + }, + { + "epoch": 1.9381255686988172, + "grad_norm": 0.5590256570552319, + "learning_rate": 6.73008241801647e-06, + "loss": 0.0384, + "step": 4260 + }, + { + "epoch": 1.9385805277525021, + "grad_norm": 0.49279712910657897, + "learning_rate": 6.7287413411811935e-06, + "loss": 0.0207, + "step": 4261 + }, + { + "epoch": 1.9390354868061874, + "grad_norm": 0.47385824774797464, + "learning_rate": 6.727400123080981e-06, + "loss": 0.0273, + "step": 4262 + }, + { + "epoch": 1.9394904458598727, + "grad_norm": 0.5136475532315747, + "learning_rate": 6.726058763825431e-06, + "loss": 0.0203, + "step": 4263 + }, + { + "epoch": 1.9399454049135578, + "grad_norm": 0.383378308522441, + "learning_rate": 6.724717263524154e-06, + "loss": 0.0206, + "step": 4264 + }, + { + "epoch": 1.940400363967243, + "grad_norm": 0.5024964516130972, + "learning_rate": 6.723375622286772e-06, + "loss": 0.0345, + "step": 4265 + }, + { + "epoch": 1.9408553230209282, + "grad_norm": 0.462858719824685, + "learning_rate": 6.722033840222917e-06, + "loss": 0.0316, + "step": 4266 + }, + { + "epoch": 1.9413102820746133, + "grad_norm": 0.5835999657969747, + "learning_rate": 6.720691917442232e-06, + "loss": 0.0216, + "step": 4267 + }, + { + "epoch": 1.9417652411282984, + "grad_norm": 0.42098842770143113, + "learning_rate": 6.7193498540543736e-06, + "loss": 0.0274, + "step": 4268 + }, + { + "epoch": 1.9422202001819837, + "grad_norm": 0.7112824280279982, + "learning_rate": 6.7180076501690105e-06, + "loss": 0.0428, + "step": 4269 + }, + { + "epoch": 1.9426751592356688, + "grad_norm": 0.6517418719527546, + "learning_rate": 6.716665305895821e-06, + "loss": 0.0375, + "step": 4270 + }, + { + "epoch": 1.943130118289354, + "grad_norm": 0.4606842807963288, + "learning_rate": 6.715322821344495e-06, + "loss": 0.0283, + "step": 4271 + }, + { + "epoch": 1.9435850773430392, + "grad_norm": 0.4262009261306818, + "learning_rate": 6.713980196624732e-06, + "loss": 0.0205, + "step": 4272 + }, + { + "epoch": 1.9440400363967243, + "grad_norm": 0.8172009202443107, + "learning_rate": 6.712637431846251e-06, + "loss": 0.0494, + "step": 4273 + }, + { + "epoch": 1.9444949954504094, + "grad_norm": 1.1652533424561522, + "learning_rate": 6.711294527118772e-06, + "loss": 0.0738, + "step": 4274 + }, + { + "epoch": 1.9449499545040947, + "grad_norm": 0.46193035782026587, + "learning_rate": 6.709951482552032e-06, + "loss": 0.0202, + "step": 4275 + }, + { + "epoch": 1.9454049135577798, + "grad_norm": 0.432957979144842, + "learning_rate": 6.708608298255778e-06, + "loss": 0.0179, + "step": 4276 + }, + { + "epoch": 1.9458598726114649, + "grad_norm": 0.6172675181416378, + "learning_rate": 6.707264974339772e-06, + "loss": 0.0357, + "step": 4277 + }, + { + "epoch": 1.9463148316651502, + "grad_norm": 0.5084300297630266, + "learning_rate": 6.705921510913781e-06, + "loss": 0.0393, + "step": 4278 + }, + { + "epoch": 1.9467697907188353, + "grad_norm": 0.5684094495685631, + "learning_rate": 6.704577908087589e-06, + "loss": 0.0434, + "step": 4279 + }, + { + "epoch": 1.9472247497725204, + "grad_norm": 0.7216629288754447, + "learning_rate": 6.7032341659709875e-06, + "loss": 0.0408, + "step": 4280 + }, + { + "epoch": 1.9476797088262057, + "grad_norm": 0.44704604969626477, + "learning_rate": 6.701890284673782e-06, + "loss": 0.0278, + "step": 4281 + }, + { + "epoch": 1.9481346678798908, + "grad_norm": 0.6359559201978502, + "learning_rate": 6.700546264305787e-06, + "loss": 0.0333, + "step": 4282 + }, + { + "epoch": 1.9485896269335758, + "grad_norm": 0.6380060738199653, + "learning_rate": 6.699202104976832e-06, + "loss": 0.0341, + "step": 4283 + }, + { + "epoch": 1.9490445859872612, + "grad_norm": 0.5494115223446472, + "learning_rate": 6.697857806796753e-06, + "loss": 0.0281, + "step": 4284 + }, + { + "epoch": 1.9494995450409465, + "grad_norm": 0.596507057397881, + "learning_rate": 6.696513369875403e-06, + "loss": 0.0419, + "step": 4285 + }, + { + "epoch": 1.9499545040946313, + "grad_norm": 0.4656177655917427, + "learning_rate": 6.695168794322642e-06, + "loss": 0.0216, + "step": 4286 + }, + { + "epoch": 1.9504094631483166, + "grad_norm": 0.5297924925475368, + "learning_rate": 6.693824080248341e-06, + "loss": 0.0237, + "step": 4287 + }, + { + "epoch": 1.950864422202002, + "grad_norm": 0.41739211017712247, + "learning_rate": 6.692479227762387e-06, + "loss": 0.0238, + "step": 4288 + }, + { + "epoch": 1.9513193812556868, + "grad_norm": 0.5024100245734225, + "learning_rate": 6.691134236974673e-06, + "loss": 0.0283, + "step": 4289 + }, + { + "epoch": 1.9517743403093721, + "grad_norm": 0.749115415166653, + "learning_rate": 6.6897891079951065e-06, + "loss": 0.0457, + "step": 4290 + }, + { + "epoch": 1.9522292993630574, + "grad_norm": 0.6087252514507477, + "learning_rate": 6.688443840933605e-06, + "loss": 0.0408, + "step": 4291 + }, + { + "epoch": 1.9526842584167425, + "grad_norm": 0.7019660276329671, + "learning_rate": 6.6870984359000964e-06, + "loss": 0.0371, + "step": 4292 + }, + { + "epoch": 1.9531392174704276, + "grad_norm": 0.44147466434114707, + "learning_rate": 6.6857528930045245e-06, + "loss": 0.0243, + "step": 4293 + }, + { + "epoch": 1.953594176524113, + "grad_norm": 0.73232677874317, + "learning_rate": 6.684407212356838e-06, + "loss": 0.0448, + "step": 4294 + }, + { + "epoch": 1.954049135577798, + "grad_norm": 0.8646603208146352, + "learning_rate": 6.683061394067002e-06, + "loss": 0.0471, + "step": 4295 + }, + { + "epoch": 1.9545040946314831, + "grad_norm": 0.7264235846217042, + "learning_rate": 6.6817154382449876e-06, + "loss": 0.0522, + "step": 4296 + }, + { + "epoch": 1.9549590536851684, + "grad_norm": 0.49328455085653516, + "learning_rate": 6.680369345000783e-06, + "loss": 0.0346, + "step": 4297 + }, + { + "epoch": 1.9554140127388535, + "grad_norm": 0.7462276090013412, + "learning_rate": 6.679023114444385e-06, + "loss": 0.0518, + "step": 4298 + }, + { + "epoch": 1.9558689717925386, + "grad_norm": 0.5443680438196182, + "learning_rate": 6.6776767466857974e-06, + "loss": 0.0379, + "step": 4299 + }, + { + "epoch": 1.956323930846224, + "grad_norm": 0.5168252132778363, + "learning_rate": 6.676330241835045e-06, + "loss": 0.0212, + "step": 4300 + }, + { + "epoch": 1.956778889899909, + "grad_norm": 0.48305900780596694, + "learning_rate": 6.674983600002155e-06, + "loss": 0.0219, + "step": 4301 + }, + { + "epoch": 1.957233848953594, + "grad_norm": 0.5604428637787447, + "learning_rate": 6.67363682129717e-06, + "loss": 0.0324, + "step": 4302 + }, + { + "epoch": 1.9576888080072794, + "grad_norm": 0.32607077104499727, + "learning_rate": 6.672289905830141e-06, + "loss": 0.0165, + "step": 4303 + }, + { + "epoch": 1.9581437670609645, + "grad_norm": 0.6360494656970666, + "learning_rate": 6.6709428537111336e-06, + "loss": 0.0375, + "step": 4304 + }, + { + "epoch": 1.9585987261146496, + "grad_norm": 0.5622414185374158, + "learning_rate": 6.669595665050223e-06, + "loss": 0.0317, + "step": 4305 + }, + { + "epoch": 1.959053685168335, + "grad_norm": 0.4737427909708252, + "learning_rate": 6.668248339957491e-06, + "loss": 0.0301, + "step": 4306 + }, + { + "epoch": 1.95950864422202, + "grad_norm": 0.49041443717670724, + "learning_rate": 6.666900878543041e-06, + "loss": 0.0318, + "step": 4307 + }, + { + "epoch": 1.959963603275705, + "grad_norm": 0.3981810185297644, + "learning_rate": 6.6655532809169785e-06, + "loss": 0.0304, + "step": 4308 + }, + { + "epoch": 1.9604185623293904, + "grad_norm": 0.7103157363923976, + "learning_rate": 6.664205547189424e-06, + "loss": 0.0531, + "step": 4309 + }, + { + "epoch": 1.9608735213830755, + "grad_norm": 0.2983514558093099, + "learning_rate": 6.662857677470508e-06, + "loss": 0.017, + "step": 4310 + }, + { + "epoch": 1.9613284804367606, + "grad_norm": 0.4795393028635018, + "learning_rate": 6.66150967187037e-06, + "loss": 0.0351, + "step": 4311 + }, + { + "epoch": 1.9617834394904459, + "grad_norm": 0.48386879103999797, + "learning_rate": 6.660161530499168e-06, + "loss": 0.0296, + "step": 4312 + }, + { + "epoch": 1.9622383985441312, + "grad_norm": 0.6086611680310994, + "learning_rate": 6.65881325346706e-06, + "loss": 0.0331, + "step": 4313 + }, + { + "epoch": 1.962693357597816, + "grad_norm": 0.7315193859463398, + "learning_rate": 6.657464840884225e-06, + "loss": 0.0448, + "step": 4314 + }, + { + "epoch": 1.9631483166515014, + "grad_norm": 0.4931020414003279, + "learning_rate": 6.656116292860849e-06, + "loss": 0.0383, + "step": 4315 + }, + { + "epoch": 1.9636032757051867, + "grad_norm": 0.6692604288867324, + "learning_rate": 6.654767609507127e-06, + "loss": 0.0569, + "step": 4316 + }, + { + "epoch": 1.9640582347588715, + "grad_norm": 0.6022950831552026, + "learning_rate": 6.65341879093327e-06, + "loss": 0.0344, + "step": 4317 + }, + { + "epoch": 1.9645131938125568, + "grad_norm": 0.5355104605232749, + "learning_rate": 6.652069837249495e-06, + "loss": 0.0261, + "step": 4318 + }, + { + "epoch": 1.9649681528662422, + "grad_norm": 0.5793528038518898, + "learning_rate": 6.650720748566035e-06, + "loss": 0.0386, + "step": 4319 + }, + { + "epoch": 1.9654231119199272, + "grad_norm": 0.686478667336817, + "learning_rate": 6.649371524993129e-06, + "loss": 0.0356, + "step": 4320 + }, + { + "epoch": 1.9658780709736123, + "grad_norm": 4.5997608223820885, + "learning_rate": 6.64802216664103e-06, + "loss": 0.1511, + "step": 4321 + }, + { + "epoch": 1.9663330300272976, + "grad_norm": 0.6232069446071732, + "learning_rate": 6.646672673620005e-06, + "loss": 0.0449, + "step": 4322 + }, + { + "epoch": 1.9667879890809827, + "grad_norm": 0.5406508477628449, + "learning_rate": 6.645323046040323e-06, + "loss": 0.0309, + "step": 4323 + }, + { + "epoch": 1.9672429481346678, + "grad_norm": 0.5261295765454279, + "learning_rate": 6.643973284012271e-06, + "loss": 0.0261, + "step": 4324 + }, + { + "epoch": 1.9676979071883531, + "grad_norm": 0.751170141815096, + "learning_rate": 6.642623387646148e-06, + "loss": 0.0216, + "step": 4325 + }, + { + "epoch": 1.9681528662420382, + "grad_norm": 0.6125631770685741, + "learning_rate": 6.64127335705226e-06, + "loss": 0.0426, + "step": 4326 + }, + { + "epoch": 1.9686078252957233, + "grad_norm": 0.46397179555598267, + "learning_rate": 6.639923192340923e-06, + "loss": 0.0247, + "step": 4327 + }, + { + "epoch": 1.9690627843494086, + "grad_norm": 0.5013740130400764, + "learning_rate": 6.63857289362247e-06, + "loss": 0.0297, + "step": 4328 + }, + { + "epoch": 1.9695177434030937, + "grad_norm": 0.46418829608866635, + "learning_rate": 6.637222461007241e-06, + "loss": 0.0291, + "step": 4329 + }, + { + "epoch": 1.9699727024567788, + "grad_norm": 0.44197278228763304, + "learning_rate": 6.635871894605585e-06, + "loss": 0.0323, + "step": 4330 + }, + { + "epoch": 1.9704276615104641, + "grad_norm": 0.35879639917254974, + "learning_rate": 6.634521194527865e-06, + "loss": 0.0201, + "step": 4331 + }, + { + "epoch": 1.9708826205641492, + "grad_norm": 0.4078620107559211, + "learning_rate": 6.633170360884455e-06, + "loss": 0.0261, + "step": 4332 + }, + { + "epoch": 1.9713375796178343, + "grad_norm": 0.4603460338520836, + "learning_rate": 6.6318193937857375e-06, + "loss": 0.0288, + "step": 4333 + }, + { + "epoch": 1.9717925386715196, + "grad_norm": 0.5356573066502125, + "learning_rate": 6.630468293342109e-06, + "loss": 0.0389, + "step": 4334 + }, + { + "epoch": 1.9722474977252047, + "grad_norm": 0.6230220298169801, + "learning_rate": 6.629117059663975e-06, + "loss": 0.0405, + "step": 4335 + }, + { + "epoch": 1.9727024567788898, + "grad_norm": 0.757992411373233, + "learning_rate": 6.627765692861752e-06, + "loss": 0.0546, + "step": 4336 + }, + { + "epoch": 1.973157415832575, + "grad_norm": 0.5349849941879612, + "learning_rate": 6.626414193045867e-06, + "loss": 0.0314, + "step": 4337 + }, + { + "epoch": 1.9736123748862604, + "grad_norm": 0.6693115397359995, + "learning_rate": 6.625062560326758e-06, + "loss": 0.0417, + "step": 4338 + }, + { + "epoch": 1.9740673339399453, + "grad_norm": 0.5222744642383346, + "learning_rate": 6.6237107948148785e-06, + "loss": 0.0291, + "step": 4339 + }, + { + "epoch": 1.9745222929936306, + "grad_norm": 0.42003903473507664, + "learning_rate": 6.622358896620682e-06, + "loss": 0.0216, + "step": 4340 + }, + { + "epoch": 1.974977252047316, + "grad_norm": 0.3460225532281504, + "learning_rate": 6.621006865854645e-06, + "loss": 0.017, + "step": 4341 + }, + { + "epoch": 1.9754322111010008, + "grad_norm": 0.5132175355903507, + "learning_rate": 6.619654702627246e-06, + "loss": 0.0324, + "step": 4342 + }, + { + "epoch": 1.975887170154686, + "grad_norm": 0.4372697417829231, + "learning_rate": 6.61830240704898e-06, + "loss": 0.0278, + "step": 4343 + }, + { + "epoch": 1.9763421292083714, + "grad_norm": 0.6015750404917026, + "learning_rate": 6.616949979230349e-06, + "loss": 0.0382, + "step": 4344 + }, + { + "epoch": 1.9767970882620565, + "grad_norm": 0.5658368711187638, + "learning_rate": 6.615597419281867e-06, + "loss": 0.0237, + "step": 4345 + }, + { + "epoch": 1.9772520473157416, + "grad_norm": 0.7546044236392536, + "learning_rate": 6.614244727314063e-06, + "loss": 0.0439, + "step": 4346 + }, + { + "epoch": 1.9777070063694269, + "grad_norm": 0.5610657717322657, + "learning_rate": 6.612891903437466e-06, + "loss": 0.0341, + "step": 4347 + }, + { + "epoch": 1.978161965423112, + "grad_norm": 0.5483801878678801, + "learning_rate": 6.611538947762628e-06, + "loss": 0.0321, + "step": 4348 + }, + { + "epoch": 1.978616924476797, + "grad_norm": 0.4906620796819385, + "learning_rate": 6.610185860400106e-06, + "loss": 0.0258, + "step": 4349 + }, + { + "epoch": 1.9790718835304824, + "grad_norm": 0.5491476478691184, + "learning_rate": 6.608832641460465e-06, + "loss": 0.041, + "step": 4350 + }, + { + "epoch": 1.9795268425841674, + "grad_norm": 0.5539074191353909, + "learning_rate": 6.607479291054288e-06, + "loss": 0.0362, + "step": 4351 + }, + { + "epoch": 1.9799818016378525, + "grad_norm": 0.4806168792141746, + "learning_rate": 6.6061258092921595e-06, + "loss": 0.0234, + "step": 4352 + }, + { + "epoch": 1.9804367606915378, + "grad_norm": 0.6160426143841132, + "learning_rate": 6.6047721962846854e-06, + "loss": 0.0348, + "step": 4353 + }, + { + "epoch": 1.980891719745223, + "grad_norm": 0.6686540794952565, + "learning_rate": 6.603418452142475e-06, + "loss": 0.0383, + "step": 4354 + }, + { + "epoch": 1.981346678798908, + "grad_norm": 0.5334646209689482, + "learning_rate": 6.602064576976148e-06, + "loss": 0.0264, + "step": 4355 + }, + { + "epoch": 1.9818016378525933, + "grad_norm": 0.666271260243879, + "learning_rate": 6.600710570896341e-06, + "loss": 0.0383, + "step": 4356 + }, + { + "epoch": 1.9822565969062784, + "grad_norm": 0.8503041725684195, + "learning_rate": 6.5993564340136915e-06, + "loss": 0.0647, + "step": 4357 + }, + { + "epoch": 1.9827115559599635, + "grad_norm": 0.37484521391292774, + "learning_rate": 6.598002166438859e-06, + "loss": 0.0168, + "step": 4358 + }, + { + "epoch": 1.9831665150136488, + "grad_norm": 0.515778668718774, + "learning_rate": 6.596647768282505e-06, + "loss": 0.0298, + "step": 4359 + }, + { + "epoch": 1.983621474067334, + "grad_norm": 0.6713659676731957, + "learning_rate": 6.595293239655307e-06, + "loss": 0.0345, + "step": 4360 + }, + { + "epoch": 1.984076433121019, + "grad_norm": 0.5925529457710818, + "learning_rate": 6.593938580667949e-06, + "loss": 0.0376, + "step": 4361 + }, + { + "epoch": 1.9845313921747043, + "grad_norm": 0.6579256270327525, + "learning_rate": 6.592583791431128e-06, + "loss": 0.0427, + "step": 4362 + }, + { + "epoch": 1.9849863512283894, + "grad_norm": 0.5299555195913671, + "learning_rate": 6.591228872055553e-06, + "loss": 0.0255, + "step": 4363 + }, + { + "epoch": 1.9854413102820745, + "grad_norm": 0.6607587462806197, + "learning_rate": 6.5898738226519396e-06, + "loss": 0.0465, + "step": 4364 + }, + { + "epoch": 1.9858962693357598, + "grad_norm": 0.5091472503350368, + "learning_rate": 6.588518643331018e-06, + "loss": 0.0249, + "step": 4365 + }, + { + "epoch": 1.9863512283894451, + "grad_norm": 0.4595426588781745, + "learning_rate": 6.5871633342035255e-06, + "loss": 0.0303, + "step": 4366 + }, + { + "epoch": 1.98680618744313, + "grad_norm": 0.5310500534756174, + "learning_rate": 6.585807895380212e-06, + "loss": 0.0322, + "step": 4367 + }, + { + "epoch": 1.9872611464968153, + "grad_norm": 0.5651708114452124, + "learning_rate": 6.584452326971841e-06, + "loss": 0.0421, + "step": 4368 + }, + { + "epoch": 1.9877161055505006, + "grad_norm": 0.632273854951826, + "learning_rate": 6.583096629089178e-06, + "loss": 0.037, + "step": 4369 + }, + { + "epoch": 1.9881710646041855, + "grad_norm": 0.43696218471863923, + "learning_rate": 6.5817408018430105e-06, + "loss": 0.0216, + "step": 4370 + }, + { + "epoch": 1.9886260236578708, + "grad_norm": 0.6068747591423251, + "learning_rate": 6.580384845344128e-06, + "loss": 0.0308, + "step": 4371 + }, + { + "epoch": 1.989080982711556, + "grad_norm": 0.4235191860692518, + "learning_rate": 6.579028759703332e-06, + "loss": 0.0248, + "step": 4372 + }, + { + "epoch": 1.9895359417652412, + "grad_norm": 0.5598435542102982, + "learning_rate": 6.577672545031436e-06, + "loss": 0.0321, + "step": 4373 + }, + { + "epoch": 1.9899909008189263, + "grad_norm": 0.37623320043924735, + "learning_rate": 6.576316201439264e-06, + "loss": 0.0238, + "step": 4374 + }, + { + "epoch": 1.9904458598726116, + "grad_norm": 0.5140371963530428, + "learning_rate": 6.574959729037653e-06, + "loss": 0.0301, + "step": 4375 + }, + { + "epoch": 1.9909008189262967, + "grad_norm": 0.49567280344957254, + "learning_rate": 6.573603127937443e-06, + "loss": 0.0342, + "step": 4376 + }, + { + "epoch": 1.9913557779799818, + "grad_norm": 0.504590903021897, + "learning_rate": 6.572246398249492e-06, + "loss": 0.0287, + "step": 4377 + }, + { + "epoch": 1.991810737033667, + "grad_norm": 0.5818147798282555, + "learning_rate": 6.570889540084666e-06, + "loss": 0.0332, + "step": 4378 + }, + { + "epoch": 1.9922656960873522, + "grad_norm": 0.4938323230947109, + "learning_rate": 6.569532553553841e-06, + "loss": 0.0297, + "step": 4379 + }, + { + "epoch": 1.9927206551410372, + "grad_norm": 0.543508024211706, + "learning_rate": 6.568175438767904e-06, + "loss": 0.0322, + "step": 4380 + }, + { + "epoch": 1.9931756141947226, + "grad_norm": 0.47867791548831073, + "learning_rate": 6.566818195837751e-06, + "loss": 0.0281, + "step": 4381 + }, + { + "epoch": 1.9936305732484076, + "grad_norm": 0.5598346075270535, + "learning_rate": 6.5654608248742924e-06, + "loss": 0.039, + "step": 4382 + }, + { + "epoch": 1.9940855323020927, + "grad_norm": 0.5553858519117557, + "learning_rate": 6.564103325988442e-06, + "loss": 0.0412, + "step": 4383 + }, + { + "epoch": 1.994540491355778, + "grad_norm": 0.711787912581307, + "learning_rate": 6.562745699291133e-06, + "loss": 0.0414, + "step": 4384 + }, + { + "epoch": 1.9949954504094631, + "grad_norm": 0.5244433760038992, + "learning_rate": 6.561387944893304e-06, + "loss": 0.0267, + "step": 4385 + }, + { + "epoch": 1.9954504094631482, + "grad_norm": 0.38337108463111075, + "learning_rate": 6.560030062905901e-06, + "loss": 0.0195, + "step": 4386 + }, + { + "epoch": 1.9959053685168335, + "grad_norm": 0.5588694456295312, + "learning_rate": 6.558672053439888e-06, + "loss": 0.0375, + "step": 4387 + }, + { + "epoch": 1.9963603275705186, + "grad_norm": 0.5692970330829459, + "learning_rate": 6.557313916606232e-06, + "loss": 0.0373, + "step": 4388 + }, + { + "epoch": 1.9968152866242037, + "grad_norm": 0.47534629481112917, + "learning_rate": 6.555955652515918e-06, + "loss": 0.0208, + "step": 4389 + }, + { + "epoch": 1.997270245677889, + "grad_norm": 0.6826078768399079, + "learning_rate": 6.554597261279932e-06, + "loss": 0.0479, + "step": 4390 + }, + { + "epoch": 1.9977252047315741, + "grad_norm": 0.4647646884318749, + "learning_rate": 6.553238743009278e-06, + "loss": 0.0257, + "step": 4391 + }, + { + "epoch": 1.9981801637852592, + "grad_norm": 0.5040993261207027, + "learning_rate": 6.551880097814971e-06, + "loss": 0.0371, + "step": 4392 + }, + { + "epoch": 1.9986351228389445, + "grad_norm": 0.5257593216912329, + "learning_rate": 6.550521325808029e-06, + "loss": 0.038, + "step": 4393 + }, + { + "epoch": 1.9990900818926298, + "grad_norm": 0.5498615738618472, + "learning_rate": 6.549162427099487e-06, + "loss": 0.0312, + "step": 4394 + }, + { + "epoch": 1.9995450409463147, + "grad_norm": 0.5898625475436916, + "learning_rate": 6.547803401800385e-06, + "loss": 0.0316, + "step": 4395 + }, + { + "epoch": 2.0, + "grad_norm": 0.6291682782694813, + "learning_rate": 6.546444250021783e-06, + "loss": 0.0359, + "step": 4396 + }, + { + "epoch": 2.0004549590536853, + "grad_norm": 0.29661485114623315, + "learning_rate": 6.545084971874738e-06, + "loss": 0.0111, + "step": 4397 + }, + { + "epoch": 2.00090991810737, + "grad_norm": 0.33133405147036854, + "learning_rate": 6.543725567470327e-06, + "loss": 0.0163, + "step": 4398 + }, + { + "epoch": 2.0013648771610555, + "grad_norm": 0.24619716513701012, + "learning_rate": 6.542366036919634e-06, + "loss": 0.0085, + "step": 4399 + }, + { + "epoch": 2.001819836214741, + "grad_norm": 0.227326455372897, + "learning_rate": 6.541006380333754e-06, + "loss": 0.009, + "step": 4400 + }, + { + "epoch": 2.0022747952684257, + "grad_norm": 0.3750038499488343, + "learning_rate": 6.539646597823791e-06, + "loss": 0.0174, + "step": 4401 + }, + { + "epoch": 2.002729754322111, + "grad_norm": 0.2741933372767153, + "learning_rate": 6.5382866895008625e-06, + "loss": 0.0122, + "step": 4402 + }, + { + "epoch": 2.0031847133757963, + "grad_norm": 0.3486640836851904, + "learning_rate": 6.536926655476092e-06, + "loss": 0.0153, + "step": 4403 + }, + { + "epoch": 2.003639672429481, + "grad_norm": 0.27151976968378777, + "learning_rate": 6.535566495860615e-06, + "loss": 0.0104, + "step": 4404 + }, + { + "epoch": 2.0040946314831665, + "grad_norm": 0.22619718539976288, + "learning_rate": 6.534206210765579e-06, + "loss": 0.0109, + "step": 4405 + }, + { + "epoch": 2.0045495905368518, + "grad_norm": 0.35042859460732206, + "learning_rate": 6.53284580030214e-06, + "loss": 0.0165, + "step": 4406 + }, + { + "epoch": 2.0050045495905366, + "grad_norm": 0.35024054234718227, + "learning_rate": 6.531485264581464e-06, + "loss": 0.014, + "step": 4407 + }, + { + "epoch": 2.005459508644222, + "grad_norm": 0.3523410861831411, + "learning_rate": 6.530124603714729e-06, + "loss": 0.0175, + "step": 4408 + }, + { + "epoch": 2.0059144676979073, + "grad_norm": 0.21154324422798507, + "learning_rate": 6.5287638178131216e-06, + "loss": 0.0085, + "step": 4409 + }, + { + "epoch": 2.0063694267515926, + "grad_norm": 0.2574154363605853, + "learning_rate": 6.527402906987838e-06, + "loss": 0.0102, + "step": 4410 + }, + { + "epoch": 2.0068243858052774, + "grad_norm": 0.4202359889540857, + "learning_rate": 6.526041871350086e-06, + "loss": 0.0147, + "step": 4411 + }, + { + "epoch": 2.0072793448589628, + "grad_norm": 0.512735944313641, + "learning_rate": 6.524680711011085e-06, + "loss": 0.0227, + "step": 4412 + }, + { + "epoch": 2.007734303912648, + "grad_norm": 0.46095481615562883, + "learning_rate": 6.523319426082062e-06, + "loss": 0.0169, + "step": 4413 + }, + { + "epoch": 2.008189262966333, + "grad_norm": 0.3436523183514405, + "learning_rate": 6.521958016674253e-06, + "loss": 0.016, + "step": 4414 + }, + { + "epoch": 2.0086442220200182, + "grad_norm": 0.34563237176708556, + "learning_rate": 6.52059648289891e-06, + "loss": 0.0098, + "step": 4415 + }, + { + "epoch": 2.0090991810737036, + "grad_norm": 0.20620877505979698, + "learning_rate": 6.519234824867288e-06, + "loss": 0.0086, + "step": 4416 + }, + { + "epoch": 2.0095541401273884, + "grad_norm": 0.29783321754422687, + "learning_rate": 6.517873042690658e-06, + "loss": 0.0106, + "step": 4417 + }, + { + "epoch": 2.0100090991810737, + "grad_norm": 0.3453682593057073, + "learning_rate": 6.516511136480297e-06, + "loss": 0.0111, + "step": 4418 + }, + { + "epoch": 2.010464058234759, + "grad_norm": 0.37926577488480717, + "learning_rate": 6.515149106347495e-06, + "loss": 0.0103, + "step": 4419 + }, + { + "epoch": 2.010919017288444, + "grad_norm": 0.2699076627022181, + "learning_rate": 6.513786952403549e-06, + "loss": 0.009, + "step": 4420 + }, + { + "epoch": 2.011373976342129, + "grad_norm": 0.4573462759532556, + "learning_rate": 6.512424674759772e-06, + "loss": 0.0174, + "step": 4421 + }, + { + "epoch": 2.0118289353958145, + "grad_norm": 0.3761340593210398, + "learning_rate": 6.511062273527478e-06, + "loss": 0.0117, + "step": 4422 + }, + { + "epoch": 2.0122838944494994, + "grad_norm": 0.2873044747781519, + "learning_rate": 6.509699748817999e-06, + "loss": 0.0077, + "step": 4423 + }, + { + "epoch": 2.0127388535031847, + "grad_norm": 0.4471565163894407, + "learning_rate": 6.5083371007426754e-06, + "loss": 0.0122, + "step": 4424 + }, + { + "epoch": 2.01319381255687, + "grad_norm": 0.18762503676748718, + "learning_rate": 6.506974329412855e-06, + "loss": 0.0055, + "step": 4425 + }, + { + "epoch": 2.013648771610555, + "grad_norm": 0.41747131105768737, + "learning_rate": 6.505611434939898e-06, + "loss": 0.0117, + "step": 4426 + }, + { + "epoch": 2.01410373066424, + "grad_norm": 0.28884035180311946, + "learning_rate": 6.504248417435174e-06, + "loss": 0.0117, + "step": 4427 + }, + { + "epoch": 2.0145586897179255, + "grad_norm": 0.3634594502134902, + "learning_rate": 6.502885277010063e-06, + "loss": 0.0177, + "step": 4428 + }, + { + "epoch": 2.0150136487716104, + "grad_norm": 0.4063657344059047, + "learning_rate": 6.501522013775951e-06, + "loss": 0.0147, + "step": 4429 + }, + { + "epoch": 2.0154686078252957, + "grad_norm": 0.4404980270121877, + "learning_rate": 6.500158627844245e-06, + "loss": 0.0119, + "step": 4430 + }, + { + "epoch": 2.015923566878981, + "grad_norm": 0.20532966469615765, + "learning_rate": 6.498795119326348e-06, + "loss": 0.0066, + "step": 4431 + }, + { + "epoch": 2.016378525932666, + "grad_norm": 0.404535372889522, + "learning_rate": 6.497431488333683e-06, + "loss": 0.0174, + "step": 4432 + }, + { + "epoch": 2.016833484986351, + "grad_norm": 0.5848970097518671, + "learning_rate": 6.496067734977681e-06, + "loss": 0.0205, + "step": 4433 + }, + { + "epoch": 2.0172884440400365, + "grad_norm": 0.25410677716502017, + "learning_rate": 6.494703859369778e-06, + "loss": 0.0049, + "step": 4434 + }, + { + "epoch": 2.0177434030937214, + "grad_norm": 0.47548386572656115, + "learning_rate": 6.493339861621426e-06, + "loss": 0.0113, + "step": 4435 + }, + { + "epoch": 2.0181983621474067, + "grad_norm": 0.4468900324730278, + "learning_rate": 6.491975741844083e-06, + "loss": 0.0129, + "step": 4436 + }, + { + "epoch": 2.018653321201092, + "grad_norm": 0.5481295616443194, + "learning_rate": 6.490611500149222e-06, + "loss": 0.0167, + "step": 4437 + }, + { + "epoch": 2.0191082802547773, + "grad_norm": 0.4155580311539728, + "learning_rate": 6.489247136648321e-06, + "loss": 0.0115, + "step": 4438 + }, + { + "epoch": 2.019563239308462, + "grad_norm": 0.27816921441583564, + "learning_rate": 6.487882651452867e-06, + "loss": 0.0113, + "step": 4439 + }, + { + "epoch": 2.0200181983621475, + "grad_norm": 0.6028146743323916, + "learning_rate": 6.486518044674364e-06, + "loss": 0.0143, + "step": 4440 + }, + { + "epoch": 2.0204731574158328, + "grad_norm": 0.4833297613211801, + "learning_rate": 6.4851533164243184e-06, + "loss": 0.0136, + "step": 4441 + }, + { + "epoch": 2.0209281164695176, + "grad_norm": 0.34567853809426624, + "learning_rate": 6.483788466814251e-06, + "loss": 0.0111, + "step": 4442 + }, + { + "epoch": 2.021383075523203, + "grad_norm": 0.4456901597501804, + "learning_rate": 6.482423495955692e-06, + "loss": 0.0116, + "step": 4443 + }, + { + "epoch": 2.0218380345768883, + "grad_norm": 0.5137637244388034, + "learning_rate": 6.4810584039601776e-06, + "loss": 0.0229, + "step": 4444 + }, + { + "epoch": 2.022292993630573, + "grad_norm": 0.5240103241593609, + "learning_rate": 6.4796931909392605e-06, + "loss": 0.0298, + "step": 4445 + }, + { + "epoch": 2.0227479526842584, + "grad_norm": 0.6996956460313095, + "learning_rate": 6.478327857004496e-06, + "loss": 0.0252, + "step": 4446 + }, + { + "epoch": 2.0232029117379438, + "grad_norm": 0.4864794904953008, + "learning_rate": 6.476962402267457e-06, + "loss": 0.0223, + "step": 4447 + }, + { + "epoch": 2.0236578707916286, + "grad_norm": 0.2717614212804149, + "learning_rate": 6.475596826839718e-06, + "loss": 0.0057, + "step": 4448 + }, + { + "epoch": 2.024112829845314, + "grad_norm": 0.3421850465764547, + "learning_rate": 6.474231130832873e-06, + "loss": 0.0149, + "step": 4449 + }, + { + "epoch": 2.0245677888989992, + "grad_norm": 0.54699171460358, + "learning_rate": 6.4728653143585165e-06, + "loss": 0.024, + "step": 4450 + }, + { + "epoch": 2.025022747952684, + "grad_norm": 0.3496883500317143, + "learning_rate": 6.4714993775282576e-06, + "loss": 0.0084, + "step": 4451 + }, + { + "epoch": 2.0254777070063694, + "grad_norm": 0.4957179416651556, + "learning_rate": 6.470133320453716e-06, + "loss": 0.0188, + "step": 4452 + }, + { + "epoch": 2.0259326660600547, + "grad_norm": 0.4593651031575888, + "learning_rate": 6.468767143246515e-06, + "loss": 0.0218, + "step": 4453 + }, + { + "epoch": 2.0263876251137396, + "grad_norm": 0.40386577359827, + "learning_rate": 6.467400846018299e-06, + "loss": 0.0162, + "step": 4454 + }, + { + "epoch": 2.026842584167425, + "grad_norm": 0.38025380370000395, + "learning_rate": 6.466034428880713e-06, + "loss": 0.0161, + "step": 4455 + }, + { + "epoch": 2.02729754322111, + "grad_norm": 0.5151203534522615, + "learning_rate": 6.464667891945413e-06, + "loss": 0.017, + "step": 4456 + }, + { + "epoch": 2.027752502274795, + "grad_norm": 0.34742835371586755, + "learning_rate": 6.463301235324066e-06, + "loss": 0.0093, + "step": 4457 + }, + { + "epoch": 2.0282074613284804, + "grad_norm": 0.3439303563428748, + "learning_rate": 6.461934459128351e-06, + "loss": 0.0124, + "step": 4458 + }, + { + "epoch": 2.0286624203821657, + "grad_norm": 0.45170347221061447, + "learning_rate": 6.460567563469956e-06, + "loss": 0.0165, + "step": 4459 + }, + { + "epoch": 2.0291173794358506, + "grad_norm": 0.48931317367198135, + "learning_rate": 6.459200548460574e-06, + "loss": 0.0184, + "step": 4460 + }, + { + "epoch": 2.029572338489536, + "grad_norm": 0.45094252471113627, + "learning_rate": 6.457833414211913e-06, + "loss": 0.0179, + "step": 4461 + }, + { + "epoch": 2.030027297543221, + "grad_norm": 0.3155385633049441, + "learning_rate": 6.4564661608356895e-06, + "loss": 0.0095, + "step": 4462 + }, + { + "epoch": 2.030482256596906, + "grad_norm": 0.4679983016076288, + "learning_rate": 6.455098788443628e-06, + "loss": 0.0151, + "step": 4463 + }, + { + "epoch": 2.0309372156505914, + "grad_norm": 0.3223606777694415, + "learning_rate": 6.453731297147464e-06, + "loss": 0.0065, + "step": 4464 + }, + { + "epoch": 2.0313921747042767, + "grad_norm": 0.3518840136193259, + "learning_rate": 6.452363687058944e-06, + "loss": 0.0142, + "step": 4465 + }, + { + "epoch": 2.031847133757962, + "grad_norm": 0.4551104474723514, + "learning_rate": 6.450995958289823e-06, + "loss": 0.0227, + "step": 4466 + }, + { + "epoch": 2.032302092811647, + "grad_norm": 0.4234721298392046, + "learning_rate": 6.449628110951864e-06, + "loss": 0.0128, + "step": 4467 + }, + { + "epoch": 2.032757051865332, + "grad_norm": 0.34735067289965277, + "learning_rate": 6.448260145156842e-06, + "loss": 0.0102, + "step": 4468 + }, + { + "epoch": 2.0332120109190175, + "grad_norm": 0.2925105437444775, + "learning_rate": 6.446892061016543e-06, + "loss": 0.0089, + "step": 4469 + }, + { + "epoch": 2.0336669699727024, + "grad_norm": 0.28557266020408856, + "learning_rate": 6.445523858642757e-06, + "loss": 0.0087, + "step": 4470 + }, + { + "epoch": 2.0341219290263877, + "grad_norm": 0.22950926949060813, + "learning_rate": 6.44415553814729e-06, + "loss": 0.0089, + "step": 4471 + }, + { + "epoch": 2.034576888080073, + "grad_norm": 0.3818053666366112, + "learning_rate": 6.442787099641954e-06, + "loss": 0.0148, + "step": 4472 + }, + { + "epoch": 2.035031847133758, + "grad_norm": 0.3988495192583969, + "learning_rate": 6.441418543238573e-06, + "loss": 0.0114, + "step": 4473 + }, + { + "epoch": 2.035486806187443, + "grad_norm": 0.1933907250862565, + "learning_rate": 6.440049869048975e-06, + "loss": 0.0045, + "step": 4474 + }, + { + "epoch": 2.0359417652411285, + "grad_norm": 0.29494059099515324, + "learning_rate": 6.438681077185007e-06, + "loss": 0.0081, + "step": 4475 + }, + { + "epoch": 2.0363967242948133, + "grad_norm": 0.34480547718786486, + "learning_rate": 6.43731216775852e-06, + "loss": 0.0124, + "step": 4476 + }, + { + "epoch": 2.0368516833484986, + "grad_norm": 0.37007488761569907, + "learning_rate": 6.435943140881371e-06, + "loss": 0.0213, + "step": 4477 + }, + { + "epoch": 2.037306642402184, + "grad_norm": 0.2748881269360316, + "learning_rate": 6.434573996665433e-06, + "loss": 0.0059, + "step": 4478 + }, + { + "epoch": 2.037761601455869, + "grad_norm": 0.26630809246208575, + "learning_rate": 6.433204735222588e-06, + "loss": 0.0091, + "step": 4479 + }, + { + "epoch": 2.038216560509554, + "grad_norm": 0.3096085485212809, + "learning_rate": 6.431835356664724e-06, + "loss": 0.0106, + "step": 4480 + }, + { + "epoch": 2.0386715195632394, + "grad_norm": 0.16181335219841153, + "learning_rate": 6.43046586110374e-06, + "loss": 0.0028, + "step": 4481 + }, + { + "epoch": 2.0391264786169243, + "grad_norm": 0.2831903057353729, + "learning_rate": 6.429096248651545e-06, + "loss": 0.0042, + "step": 4482 + }, + { + "epoch": 2.0395814376706096, + "grad_norm": 0.20289363916753683, + "learning_rate": 6.427726519420061e-06, + "loss": 0.0051, + "step": 4483 + }, + { + "epoch": 2.040036396724295, + "grad_norm": 0.3648528177635673, + "learning_rate": 6.426356673521211e-06, + "loss": 0.0139, + "step": 4484 + }, + { + "epoch": 2.04049135577798, + "grad_norm": 0.3622896287328724, + "learning_rate": 6.424986711066936e-06, + "loss": 0.012, + "step": 4485 + }, + { + "epoch": 2.040946314831665, + "grad_norm": 0.32884386613740535, + "learning_rate": 6.423616632169183e-06, + "loss": 0.0107, + "step": 4486 + }, + { + "epoch": 2.0414012738853504, + "grad_norm": 0.37933877465879645, + "learning_rate": 6.422246436939906e-06, + "loss": 0.0111, + "step": 4487 + }, + { + "epoch": 2.0418562329390353, + "grad_norm": 0.29523659467992736, + "learning_rate": 6.420876125491074e-06, + "loss": 0.0093, + "step": 4488 + }, + { + "epoch": 2.0423111919927206, + "grad_norm": 0.27326868398843224, + "learning_rate": 6.41950569793466e-06, + "loss": 0.0113, + "step": 4489 + }, + { + "epoch": 2.042766151046406, + "grad_norm": 0.3979596951225033, + "learning_rate": 6.418135154382655e-06, + "loss": 0.01, + "step": 4490 + }, + { + "epoch": 2.0432211101000908, + "grad_norm": 0.45553399667837624, + "learning_rate": 6.416764494947047e-06, + "loss": 0.0172, + "step": 4491 + }, + { + "epoch": 2.043676069153776, + "grad_norm": 0.37437585080804636, + "learning_rate": 6.4153937197398394e-06, + "loss": 0.007, + "step": 4492 + }, + { + "epoch": 2.0441310282074614, + "grad_norm": 0.40616957286227146, + "learning_rate": 6.414022828873053e-06, + "loss": 0.011, + "step": 4493 + }, + { + "epoch": 2.0445859872611467, + "grad_norm": 0.2827652219835658, + "learning_rate": 6.412651822458705e-06, + "loss": 0.0064, + "step": 4494 + }, + { + "epoch": 2.0450409463148316, + "grad_norm": 0.4462866764759693, + "learning_rate": 6.411280700608831e-06, + "loss": 0.0174, + "step": 4495 + }, + { + "epoch": 2.045495905368517, + "grad_norm": 0.5106540488008284, + "learning_rate": 6.409909463435471e-06, + "loss": 0.0158, + "step": 4496 + }, + { + "epoch": 2.045950864422202, + "grad_norm": 0.5023106121662962, + "learning_rate": 6.408538111050675e-06, + "loss": 0.0187, + "step": 4497 + }, + { + "epoch": 2.046405823475887, + "grad_norm": 0.2706502978948015, + "learning_rate": 6.407166643566507e-06, + "loss": 0.0068, + "step": 4498 + }, + { + "epoch": 2.0468607825295724, + "grad_norm": 0.1342442982265927, + "learning_rate": 6.405795061095035e-06, + "loss": 0.0026, + "step": 4499 + }, + { + "epoch": 2.0473157415832577, + "grad_norm": 0.30407723641667117, + "learning_rate": 6.40442336374834e-06, + "loss": 0.0053, + "step": 4500 + }, + { + "epoch": 2.0477707006369426, + "grad_norm": 0.36406400308330433, + "learning_rate": 6.4030515516385085e-06, + "loss": 0.013, + "step": 4501 + }, + { + "epoch": 2.048225659690628, + "grad_norm": 0.413690505210537, + "learning_rate": 6.401679624877641e-06, + "loss": 0.0088, + "step": 4502 + }, + { + "epoch": 2.048680618744313, + "grad_norm": 0.3138346298460819, + "learning_rate": 6.400307583577845e-06, + "loss": 0.0075, + "step": 4503 + }, + { + "epoch": 2.049135577797998, + "grad_norm": 0.28421717298116506, + "learning_rate": 6.3989354278512365e-06, + "loss": 0.0091, + "step": 4504 + }, + { + "epoch": 2.0495905368516834, + "grad_norm": 0.3902980730132667, + "learning_rate": 6.397563157809944e-06, + "loss": 0.0133, + "step": 4505 + }, + { + "epoch": 2.0500454959053687, + "grad_norm": 0.3419413032157435, + "learning_rate": 6.396190773566098e-06, + "loss": 0.008, + "step": 4506 + }, + { + "epoch": 2.0505004549590535, + "grad_norm": 0.47943799467739867, + "learning_rate": 6.39481827523185e-06, + "loss": 0.0224, + "step": 4507 + }, + { + "epoch": 2.050955414012739, + "grad_norm": 0.5374258981199744, + "learning_rate": 6.393445662919352e-06, + "loss": 0.0176, + "step": 4508 + }, + { + "epoch": 2.051410373066424, + "grad_norm": 0.4243266474425536, + "learning_rate": 6.3920729367407645e-06, + "loss": 0.0138, + "step": 4509 + }, + { + "epoch": 2.051865332120109, + "grad_norm": 0.18548240194284374, + "learning_rate": 6.390700096808266e-06, + "loss": 0.0035, + "step": 4510 + }, + { + "epoch": 2.0523202911737943, + "grad_norm": 0.4831143257226566, + "learning_rate": 6.389327143234033e-06, + "loss": 0.0155, + "step": 4511 + }, + { + "epoch": 2.0527752502274796, + "grad_norm": 0.818348652840465, + "learning_rate": 6.387954076130263e-06, + "loss": 0.0195, + "step": 4512 + }, + { + "epoch": 2.0532302092811645, + "grad_norm": 0.24451486435498174, + "learning_rate": 6.386580895609151e-06, + "loss": 0.0077, + "step": 4513 + }, + { + "epoch": 2.05368516833485, + "grad_norm": 0.6286193236656645, + "learning_rate": 6.385207601782912e-06, + "loss": 0.0141, + "step": 4514 + }, + { + "epoch": 2.054140127388535, + "grad_norm": 0.3183855842619917, + "learning_rate": 6.383834194763763e-06, + "loss": 0.008, + "step": 4515 + }, + { + "epoch": 2.05459508644222, + "grad_norm": 0.390485711720583, + "learning_rate": 6.382460674663932e-06, + "loss": 0.0092, + "step": 4516 + }, + { + "epoch": 2.0550500454959053, + "grad_norm": 0.5014940631434602, + "learning_rate": 6.381087041595659e-06, + "loss": 0.0193, + "step": 4517 + }, + { + "epoch": 2.0555050045495906, + "grad_norm": 0.4143102550753054, + "learning_rate": 6.379713295671189e-06, + "loss": 0.0086, + "step": 4518 + }, + { + "epoch": 2.055959963603276, + "grad_norm": 0.4132710071888765, + "learning_rate": 6.3783394370027785e-06, + "loss": 0.0173, + "step": 4519 + }, + { + "epoch": 2.056414922656961, + "grad_norm": 0.42016048890380997, + "learning_rate": 6.376965465702696e-06, + "loss": 0.0147, + "step": 4520 + }, + { + "epoch": 2.056869881710646, + "grad_norm": 0.4992153924894953, + "learning_rate": 6.375591381883213e-06, + "loss": 0.0165, + "step": 4521 + }, + { + "epoch": 2.0573248407643314, + "grad_norm": 0.28310436425069296, + "learning_rate": 6.374217185656614e-06, + "loss": 0.0095, + "step": 4522 + }, + { + "epoch": 2.0577797998180163, + "grad_norm": 0.5065260337851148, + "learning_rate": 6.372842877135191e-06, + "loss": 0.0122, + "step": 4523 + }, + { + "epoch": 2.0582347588717016, + "grad_norm": 0.4043323568079733, + "learning_rate": 6.37146845643125e-06, + "loss": 0.0116, + "step": 4524 + }, + { + "epoch": 2.058689717925387, + "grad_norm": 0.523207412624782, + "learning_rate": 6.370093923657099e-06, + "loss": 0.0111, + "step": 4525 + }, + { + "epoch": 2.0591446769790718, + "grad_norm": 0.29125802401521844, + "learning_rate": 6.36871927892506e-06, + "loss": 0.0078, + "step": 4526 + }, + { + "epoch": 2.059599636032757, + "grad_norm": 0.23957204726726347, + "learning_rate": 6.367344522347465e-06, + "loss": 0.0045, + "step": 4527 + }, + { + "epoch": 2.0600545950864424, + "grad_norm": 0.40895703978535547, + "learning_rate": 6.365969654036648e-06, + "loss": 0.0073, + "step": 4528 + }, + { + "epoch": 2.0605095541401273, + "grad_norm": 0.4095715269513763, + "learning_rate": 6.36459467410496e-06, + "loss": 0.0111, + "step": 4529 + }, + { + "epoch": 2.0609645131938126, + "grad_norm": 0.33090525933488096, + "learning_rate": 6.363219582664758e-06, + "loss": 0.01, + "step": 4530 + }, + { + "epoch": 2.061419472247498, + "grad_norm": 0.2919761694239399, + "learning_rate": 6.361844379828408e-06, + "loss": 0.0093, + "step": 4531 + }, + { + "epoch": 2.0618744313011828, + "grad_norm": 0.1761863838449364, + "learning_rate": 6.360469065708286e-06, + "loss": 0.0025, + "step": 4532 + }, + { + "epoch": 2.062329390354868, + "grad_norm": 0.4755051873680735, + "learning_rate": 6.359093640416773e-06, + "loss": 0.0145, + "step": 4533 + }, + { + "epoch": 2.0627843494085534, + "grad_norm": 0.3940944642294024, + "learning_rate": 6.357718104066267e-06, + "loss": 0.0102, + "step": 4534 + }, + { + "epoch": 2.0632393084622382, + "grad_norm": 0.32006471418527305, + "learning_rate": 6.356342456769169e-06, + "loss": 0.0134, + "step": 4535 + }, + { + "epoch": 2.0636942675159236, + "grad_norm": 0.4070164451436498, + "learning_rate": 6.354966698637892e-06, + "loss": 0.0086, + "step": 4536 + }, + { + "epoch": 2.064149226569609, + "grad_norm": 0.3806130387200204, + "learning_rate": 6.353590829784853e-06, + "loss": 0.0093, + "step": 4537 + }, + { + "epoch": 2.0646041856232937, + "grad_norm": 0.340282552287716, + "learning_rate": 6.352214850322486e-06, + "loss": 0.0045, + "step": 4538 + }, + { + "epoch": 2.065059144676979, + "grad_norm": 0.36706911447568497, + "learning_rate": 6.3508387603632275e-06, + "loss": 0.0109, + "step": 4539 + }, + { + "epoch": 2.0655141037306644, + "grad_norm": 0.287314127027193, + "learning_rate": 6.349462560019524e-06, + "loss": 0.0082, + "step": 4540 + }, + { + "epoch": 2.065969062784349, + "grad_norm": 0.5856303471113281, + "learning_rate": 6.348086249403836e-06, + "loss": 0.0302, + "step": 4541 + }, + { + "epoch": 2.0664240218380345, + "grad_norm": 0.41896863384200134, + "learning_rate": 6.3467098286286274e-06, + "loss": 0.01, + "step": 4542 + }, + { + "epoch": 2.06687898089172, + "grad_norm": 0.426921792024738, + "learning_rate": 6.3453332978063745e-06, + "loss": 0.0051, + "step": 4543 + }, + { + "epoch": 2.0673339399454047, + "grad_norm": 0.4348306158879752, + "learning_rate": 6.343956657049558e-06, + "loss": 0.012, + "step": 4544 + }, + { + "epoch": 2.06778889899909, + "grad_norm": 0.3622185695212827, + "learning_rate": 6.342579906470673e-06, + "loss": 0.0128, + "step": 4545 + }, + { + "epoch": 2.0682438580527753, + "grad_norm": 0.3970694799161914, + "learning_rate": 6.341203046182223e-06, + "loss": 0.0074, + "step": 4546 + }, + { + "epoch": 2.06869881710646, + "grad_norm": 0.6259473080603545, + "learning_rate": 6.339826076296715e-06, + "loss": 0.0157, + "step": 4547 + }, + { + "epoch": 2.0691537761601455, + "grad_norm": 0.30910267507061634, + "learning_rate": 6.338448996926671e-06, + "loss": 0.0108, + "step": 4548 + }, + { + "epoch": 2.069608735213831, + "grad_norm": 0.3797281042748299, + "learning_rate": 6.337071808184619e-06, + "loss": 0.0145, + "step": 4549 + }, + { + "epoch": 2.070063694267516, + "grad_norm": 0.24930199379338153, + "learning_rate": 6.335694510183098e-06, + "loss": 0.0072, + "step": 4550 + }, + { + "epoch": 2.070518653321201, + "grad_norm": 0.4107630216562235, + "learning_rate": 6.3343171030346525e-06, + "loss": 0.01, + "step": 4551 + }, + { + "epoch": 2.0709736123748863, + "grad_norm": 0.3737643389016472, + "learning_rate": 6.332939586851838e-06, + "loss": 0.0139, + "step": 4552 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.32194853200282086, + "learning_rate": 6.331561961747224e-06, + "loss": 0.0073, + "step": 4553 + }, + { + "epoch": 2.0718835304822565, + "grad_norm": 0.5983325080789265, + "learning_rate": 6.330184227833376e-06, + "loss": 0.0213, + "step": 4554 + }, + { + "epoch": 2.072338489535942, + "grad_norm": 0.7343643528516064, + "learning_rate": 6.328806385222881e-06, + "loss": 0.0168, + "step": 4555 + }, + { + "epoch": 2.072793448589627, + "grad_norm": 0.4020262446848935, + "learning_rate": 6.327428434028331e-06, + "loss": 0.0108, + "step": 4556 + }, + { + "epoch": 2.073248407643312, + "grad_norm": 0.4537001584900146, + "learning_rate": 6.326050374362322e-06, + "loss": 0.0158, + "step": 4557 + }, + { + "epoch": 2.0737033666969973, + "grad_norm": 0.38177530440217355, + "learning_rate": 6.324672206337465e-06, + "loss": 0.0083, + "step": 4558 + }, + { + "epoch": 2.0741583257506826, + "grad_norm": 0.4500572048722902, + "learning_rate": 6.323293930066377e-06, + "loss": 0.0148, + "step": 4559 + }, + { + "epoch": 2.0746132848043675, + "grad_norm": 0.46657307952440574, + "learning_rate": 6.3219155456616856e-06, + "loss": 0.0217, + "step": 4560 + }, + { + "epoch": 2.0750682438580528, + "grad_norm": 0.25860718016450046, + "learning_rate": 6.320537053236024e-06, + "loss": 0.007, + "step": 4561 + }, + { + "epoch": 2.075523202911738, + "grad_norm": 0.2535852497165178, + "learning_rate": 6.31915845290204e-06, + "loss": 0.0046, + "step": 4562 + }, + { + "epoch": 2.075978161965423, + "grad_norm": 0.4879797035856903, + "learning_rate": 6.317779744772384e-06, + "loss": 0.0168, + "step": 4563 + }, + { + "epoch": 2.0764331210191083, + "grad_norm": 0.5480357468481581, + "learning_rate": 6.316400928959718e-06, + "loss": 0.0167, + "step": 4564 + }, + { + "epoch": 2.0768880800727936, + "grad_norm": 0.4225876036094889, + "learning_rate": 6.315022005576713e-06, + "loss": 0.0105, + "step": 4565 + }, + { + "epoch": 2.0773430391264784, + "grad_norm": 0.25443563760292537, + "learning_rate": 6.31364297473605e-06, + "loss": 0.0084, + "step": 4566 + }, + { + "epoch": 2.0777979981801638, + "grad_norm": 0.3507842449241893, + "learning_rate": 6.312263836550413e-06, + "loss": 0.0086, + "step": 4567 + }, + { + "epoch": 2.078252957233849, + "grad_norm": 0.400370306980809, + "learning_rate": 6.310884591132501e-06, + "loss": 0.0111, + "step": 4568 + }, + { + "epoch": 2.078707916287534, + "grad_norm": 0.47168660585239236, + "learning_rate": 6.309505238595022e-06, + "loss": 0.0067, + "step": 4569 + }, + { + "epoch": 2.0791628753412192, + "grad_norm": 0.3683418005490081, + "learning_rate": 6.3081257790506875e-06, + "loss": 0.0086, + "step": 4570 + }, + { + "epoch": 2.0796178343949046, + "grad_norm": 0.43676959298159407, + "learning_rate": 6.306746212612222e-06, + "loss": 0.0145, + "step": 4571 + }, + { + "epoch": 2.0800727934485894, + "grad_norm": 0.3902808514258779, + "learning_rate": 6.305366539392358e-06, + "loss": 0.0096, + "step": 4572 + }, + { + "epoch": 2.0805277525022747, + "grad_norm": 0.4408050461319164, + "learning_rate": 6.303986759503835e-06, + "loss": 0.0132, + "step": 4573 + }, + { + "epoch": 2.08098271155596, + "grad_norm": 0.4850415360136034, + "learning_rate": 6.302606873059403e-06, + "loss": 0.0082, + "step": 4574 + }, + { + "epoch": 2.0814376706096454, + "grad_norm": 0.4259018931666596, + "learning_rate": 6.301226880171818e-06, + "loss": 0.0212, + "step": 4575 + }, + { + "epoch": 2.08189262966333, + "grad_norm": 0.37655761023061296, + "learning_rate": 6.29984678095385e-06, + "loss": 0.0105, + "step": 4576 + }, + { + "epoch": 2.0823475887170155, + "grad_norm": 0.3938914042491262, + "learning_rate": 6.2984665755182735e-06, + "loss": 0.0086, + "step": 4577 + }, + { + "epoch": 2.082802547770701, + "grad_norm": 0.6143724316889407, + "learning_rate": 6.297086263977872e-06, + "loss": 0.0153, + "step": 4578 + }, + { + "epoch": 2.0832575068243857, + "grad_norm": 0.5858433418813844, + "learning_rate": 6.295705846445439e-06, + "loss": 0.0334, + "step": 4579 + }, + { + "epoch": 2.083712465878071, + "grad_norm": 0.6002687806444671, + "learning_rate": 6.294325323033775e-06, + "loss": 0.0184, + "step": 4580 + }, + { + "epoch": 2.0841674249317563, + "grad_norm": 0.42666084226846, + "learning_rate": 6.29294469385569e-06, + "loss": 0.0183, + "step": 4581 + }, + { + "epoch": 2.084622383985441, + "grad_norm": 0.33115785714098306, + "learning_rate": 6.291563959024005e-06, + "loss": 0.0103, + "step": 4582 + }, + { + "epoch": 2.0850773430391265, + "grad_norm": 0.40795705419312395, + "learning_rate": 6.290183118651546e-06, + "loss": 0.0161, + "step": 4583 + }, + { + "epoch": 2.085532302092812, + "grad_norm": 0.24850035268721687, + "learning_rate": 6.2888021728511475e-06, + "loss": 0.0045, + "step": 4584 + }, + { + "epoch": 2.0859872611464967, + "grad_norm": 0.34080271467213186, + "learning_rate": 6.2874211217356574e-06, + "loss": 0.0114, + "step": 4585 + }, + { + "epoch": 2.086442220200182, + "grad_norm": 0.2859645446897584, + "learning_rate": 6.286039965417925e-06, + "loss": 0.0082, + "step": 4586 + }, + { + "epoch": 2.0868971792538673, + "grad_norm": 0.41872676985407964, + "learning_rate": 6.284658704010815e-06, + "loss": 0.0112, + "step": 4587 + }, + { + "epoch": 2.087352138307552, + "grad_norm": 0.5174279262156914, + "learning_rate": 6.283277337627198e-06, + "loss": 0.0217, + "step": 4588 + }, + { + "epoch": 2.0878070973612375, + "grad_norm": 0.4215936972981889, + "learning_rate": 6.281895866379951e-06, + "loss": 0.0135, + "step": 4589 + }, + { + "epoch": 2.088262056414923, + "grad_norm": 0.2934422956997878, + "learning_rate": 6.280514290381965e-06, + "loss": 0.0036, + "step": 4590 + }, + { + "epoch": 2.0887170154686077, + "grad_norm": 0.4206969344733646, + "learning_rate": 6.2791326097461324e-06, + "loss": 0.0132, + "step": 4591 + }, + { + "epoch": 2.089171974522293, + "grad_norm": 0.3891551053085789, + "learning_rate": 6.2777508245853605e-06, + "loss": 0.0161, + "step": 4592 + }, + { + "epoch": 2.0896269335759783, + "grad_norm": 0.2868441850896359, + "learning_rate": 6.276368935012559e-06, + "loss": 0.008, + "step": 4593 + }, + { + "epoch": 2.090081892629663, + "grad_norm": 0.4325567732140906, + "learning_rate": 6.274986941140654e-06, + "loss": 0.0128, + "step": 4594 + }, + { + "epoch": 2.0905368516833485, + "grad_norm": 0.49807938130393564, + "learning_rate": 6.273604843082573e-06, + "loss": 0.0136, + "step": 4595 + }, + { + "epoch": 2.0909918107370338, + "grad_norm": 0.5118888559284318, + "learning_rate": 6.272222640951257e-06, + "loss": 0.0105, + "step": 4596 + }, + { + "epoch": 2.0914467697907186, + "grad_norm": 0.28166633533659863, + "learning_rate": 6.270840334859651e-06, + "loss": 0.0113, + "step": 4597 + }, + { + "epoch": 2.091901728844404, + "grad_norm": 0.3900023284562493, + "learning_rate": 6.269457924920713e-06, + "loss": 0.011, + "step": 4598 + }, + { + "epoch": 2.0923566878980893, + "grad_norm": 0.41681279462432963, + "learning_rate": 6.2680754112474065e-06, + "loss": 0.0139, + "step": 4599 + }, + { + "epoch": 2.092811646951774, + "grad_norm": 0.4565442396926241, + "learning_rate": 6.266692793952702e-06, + "loss": 0.0106, + "step": 4600 + }, + { + "epoch": 2.0932666060054594, + "grad_norm": 0.5744912833702874, + "learning_rate": 6.265310073149584e-06, + "loss": 0.0263, + "step": 4601 + }, + { + "epoch": 2.0937215650591448, + "grad_norm": 0.43435981273700197, + "learning_rate": 6.263927248951042e-06, + "loss": 0.0131, + "step": 4602 + }, + { + "epoch": 2.0941765241128296, + "grad_norm": 0.25937892046827166, + "learning_rate": 6.26254432147007e-06, + "loss": 0.0077, + "step": 4603 + }, + { + "epoch": 2.094631483166515, + "grad_norm": 0.4005011124266399, + "learning_rate": 6.261161290819681e-06, + "loss": 0.0103, + "step": 4604 + }, + { + "epoch": 2.0950864422202002, + "grad_norm": 0.5898502910711467, + "learning_rate": 6.259778157112885e-06, + "loss": 0.0174, + "step": 4605 + }, + { + "epoch": 2.0955414012738856, + "grad_norm": 0.5079361789692335, + "learning_rate": 6.258394920462707e-06, + "loss": 0.0208, + "step": 4606 + }, + { + "epoch": 2.0959963603275704, + "grad_norm": 0.48058981648043236, + "learning_rate": 6.257011580982179e-06, + "loss": 0.0102, + "step": 4607 + }, + { + "epoch": 2.0964513193812557, + "grad_norm": 0.4851769092280976, + "learning_rate": 6.255628138784341e-06, + "loss": 0.0139, + "step": 4608 + }, + { + "epoch": 2.096906278434941, + "grad_norm": 0.29882629203864847, + "learning_rate": 6.254244593982244e-06, + "loss": 0.0098, + "step": 4609 + }, + { + "epoch": 2.097361237488626, + "grad_norm": 0.35145666589052676, + "learning_rate": 6.252860946688939e-06, + "loss": 0.0078, + "step": 4610 + }, + { + "epoch": 2.097816196542311, + "grad_norm": 0.4665876717085764, + "learning_rate": 6.251477197017498e-06, + "loss": 0.0159, + "step": 4611 + }, + { + "epoch": 2.0982711555959965, + "grad_norm": 0.5760290115746136, + "learning_rate": 6.250093345080992e-06, + "loss": 0.0283, + "step": 4612 + }, + { + "epoch": 2.0987261146496814, + "grad_norm": 0.2294038867724398, + "learning_rate": 6.248709390992504e-06, + "loss": 0.0046, + "step": 4613 + }, + { + "epoch": 2.0991810737033667, + "grad_norm": 0.4230616771741766, + "learning_rate": 6.247325334865121e-06, + "loss": 0.0116, + "step": 4614 + }, + { + "epoch": 2.099636032757052, + "grad_norm": 0.27642211082644613, + "learning_rate": 6.245941176811946e-06, + "loss": 0.0084, + "step": 4615 + }, + { + "epoch": 2.100090991810737, + "grad_norm": 0.6578946218287635, + "learning_rate": 6.244556916946085e-06, + "loss": 0.0179, + "step": 4616 + }, + { + "epoch": 2.100545950864422, + "grad_norm": 0.4064909225755356, + "learning_rate": 6.243172555380651e-06, + "loss": 0.0107, + "step": 4617 + }, + { + "epoch": 2.1010009099181075, + "grad_norm": 0.254458023879425, + "learning_rate": 6.241788092228773e-06, + "loss": 0.0109, + "step": 4618 + }, + { + "epoch": 2.1014558689717924, + "grad_norm": 0.3556847926268138, + "learning_rate": 6.240403527603579e-06, + "loss": 0.0117, + "step": 4619 + }, + { + "epoch": 2.1019108280254777, + "grad_norm": 0.5112993248015234, + "learning_rate": 6.23901886161821e-06, + "loss": 0.0118, + "step": 4620 + }, + { + "epoch": 2.102365787079163, + "grad_norm": 0.3782746457746778, + "learning_rate": 6.237634094385814e-06, + "loss": 0.0107, + "step": 4621 + }, + { + "epoch": 2.102820746132848, + "grad_norm": 0.3403380790753291, + "learning_rate": 6.23624922601955e-06, + "loss": 0.0121, + "step": 4622 + }, + { + "epoch": 2.103275705186533, + "grad_norm": 0.422132134092372, + "learning_rate": 6.234864256632582e-06, + "loss": 0.0134, + "step": 4623 + }, + { + "epoch": 2.1037306642402185, + "grad_norm": 0.6334762245502538, + "learning_rate": 6.233479186338084e-06, + "loss": 0.0195, + "step": 4624 + }, + { + "epoch": 2.1041856232939034, + "grad_norm": 0.3369816740149923, + "learning_rate": 6.232094015249236e-06, + "loss": 0.0095, + "step": 4625 + }, + { + "epoch": 2.1046405823475887, + "grad_norm": 0.5808306354860736, + "learning_rate": 6.230708743479231e-06, + "loss": 0.0166, + "step": 4626 + }, + { + "epoch": 2.105095541401274, + "grad_norm": 0.7841793961162681, + "learning_rate": 6.229323371141264e-06, + "loss": 0.0166, + "step": 4627 + }, + { + "epoch": 2.105550500454959, + "grad_norm": 0.330060841421579, + "learning_rate": 6.2279378983485415e-06, + "loss": 0.0166, + "step": 4628 + }, + { + "epoch": 2.106005459508644, + "grad_norm": 0.28734963065483526, + "learning_rate": 6.226552325214281e-06, + "loss": 0.0082, + "step": 4629 + }, + { + "epoch": 2.1064604185623295, + "grad_norm": 0.35806742065789043, + "learning_rate": 6.225166651851704e-06, + "loss": 0.0121, + "step": 4630 + }, + { + "epoch": 2.1069153776160148, + "grad_norm": 0.4378141175008406, + "learning_rate": 6.2237808783740395e-06, + "loss": 0.0104, + "step": 4631 + }, + { + "epoch": 2.1073703366696996, + "grad_norm": 0.5425819690818054, + "learning_rate": 6.2223950048945295e-06, + "loss": 0.0223, + "step": 4632 + }, + { + "epoch": 2.107825295723385, + "grad_norm": 0.4016986818697845, + "learning_rate": 6.22100903152642e-06, + "loss": 0.0137, + "step": 4633 + }, + { + "epoch": 2.1082802547770703, + "grad_norm": 0.36947648991630955, + "learning_rate": 6.219622958382965e-06, + "loss": 0.0165, + "step": 4634 + }, + { + "epoch": 2.108735213830755, + "grad_norm": 0.17005186534941416, + "learning_rate": 6.218236785577431e-06, + "loss": 0.0039, + "step": 4635 + }, + { + "epoch": 2.1091901728844404, + "grad_norm": 0.6340745866299279, + "learning_rate": 6.216850513223087e-06, + "loss": 0.0256, + "step": 4636 + }, + { + "epoch": 2.1096451319381258, + "grad_norm": 0.30127420891385065, + "learning_rate": 6.215464141433216e-06, + "loss": 0.0069, + "step": 4637 + }, + { + "epoch": 2.1101000909918106, + "grad_norm": 0.23026028296805445, + "learning_rate": 6.214077670321103e-06, + "loss": 0.0035, + "step": 4638 + }, + { + "epoch": 2.110555050045496, + "grad_norm": 0.45991686539056265, + "learning_rate": 6.212691100000046e-06, + "loss": 0.014, + "step": 4639 + }, + { + "epoch": 2.1110100090991812, + "grad_norm": 0.36611553659446155, + "learning_rate": 6.211304430583349e-06, + "loss": 0.0161, + "step": 4640 + }, + { + "epoch": 2.111464968152866, + "grad_norm": 0.3628732699246142, + "learning_rate": 6.209917662184324e-06, + "loss": 0.0084, + "step": 4641 + }, + { + "epoch": 2.1119199272065514, + "grad_norm": 0.32595313351396926, + "learning_rate": 6.208530794916291e-06, + "loss": 0.007, + "step": 4642 + }, + { + "epoch": 2.1123748862602367, + "grad_norm": 0.4699770478908922, + "learning_rate": 6.20714382889258e-06, + "loss": 0.0115, + "step": 4643 + }, + { + "epoch": 2.1128298453139216, + "grad_norm": 0.4628952236120021, + "learning_rate": 6.205756764226526e-06, + "loss": 0.0121, + "step": 4644 + }, + { + "epoch": 2.113284804367607, + "grad_norm": 0.5187753749264317, + "learning_rate": 6.2043696010314745e-06, + "loss": 0.0284, + "step": 4645 + }, + { + "epoch": 2.113739763421292, + "grad_norm": 0.3867323333746344, + "learning_rate": 6.202982339420778e-06, + "loss": 0.014, + "step": 4646 + }, + { + "epoch": 2.114194722474977, + "grad_norm": 0.3981493529790491, + "learning_rate": 6.201594979507798e-06, + "loss": 0.0093, + "step": 4647 + }, + { + "epoch": 2.1146496815286624, + "grad_norm": 0.27754154956912436, + "learning_rate": 6.2002075214059024e-06, + "loss": 0.0063, + "step": 4648 + }, + { + "epoch": 2.1151046405823477, + "grad_norm": 0.4835166644113581, + "learning_rate": 6.198819965228468e-06, + "loss": 0.0183, + "step": 4649 + }, + { + "epoch": 2.1155595996360326, + "grad_norm": 0.3388831775783709, + "learning_rate": 6.19743231108888e-06, + "loss": 0.0051, + "step": 4650 + }, + { + "epoch": 2.116014558689718, + "grad_norm": 0.49436350716636684, + "learning_rate": 6.196044559100531e-06, + "loss": 0.0175, + "step": 4651 + }, + { + "epoch": 2.116469517743403, + "grad_norm": 0.3239993158425095, + "learning_rate": 6.194656709376822e-06, + "loss": 0.0097, + "step": 4652 + }, + { + "epoch": 2.116924476797088, + "grad_norm": 0.2610206185014245, + "learning_rate": 6.193268762031162e-06, + "loss": 0.0134, + "step": 4653 + }, + { + "epoch": 2.1173794358507734, + "grad_norm": 0.4812889361595157, + "learning_rate": 6.1918807171769666e-06, + "loss": 0.0209, + "step": 4654 + }, + { + "epoch": 2.1178343949044587, + "grad_norm": 0.31728463901876697, + "learning_rate": 6.1904925749276635e-06, + "loss": 0.0096, + "step": 4655 + }, + { + "epoch": 2.1182893539581436, + "grad_norm": 0.28225278847668417, + "learning_rate": 6.189104335396681e-06, + "loss": 0.0073, + "step": 4656 + }, + { + "epoch": 2.118744313011829, + "grad_norm": 0.30348622765303085, + "learning_rate": 6.187715998697463e-06, + "loss": 0.0076, + "step": 4657 + }, + { + "epoch": 2.119199272065514, + "grad_norm": 0.3446741668542745, + "learning_rate": 6.1863275649434575e-06, + "loss": 0.0084, + "step": 4658 + }, + { + "epoch": 2.1196542311191995, + "grad_norm": 0.42149292621983275, + "learning_rate": 6.184939034248121e-06, + "loss": 0.0127, + "step": 4659 + }, + { + "epoch": 2.1201091901728844, + "grad_norm": 0.2888655467331999, + "learning_rate": 6.183550406724917e-06, + "loss": 0.0068, + "step": 4660 + }, + { + "epoch": 2.1205641492265697, + "grad_norm": 0.27747096934774546, + "learning_rate": 6.18216168248732e-06, + "loss": 0.0106, + "step": 4661 + }, + { + "epoch": 2.121019108280255, + "grad_norm": 0.4080394325964134, + "learning_rate": 6.1807728616488085e-06, + "loss": 0.0122, + "step": 4662 + }, + { + "epoch": 2.12147406733394, + "grad_norm": 0.30637128462500635, + "learning_rate": 6.1793839443228685e-06, + "loss": 0.0068, + "step": 4663 + }, + { + "epoch": 2.121929026387625, + "grad_norm": 0.44786866354627414, + "learning_rate": 6.177994930623001e-06, + "loss": 0.0162, + "step": 4664 + }, + { + "epoch": 2.1223839854413105, + "grad_norm": 0.3251280730970267, + "learning_rate": 6.176605820662707e-06, + "loss": 0.011, + "step": 4665 + }, + { + "epoch": 2.1228389444949953, + "grad_norm": 0.4069982323725549, + "learning_rate": 6.1752166145554996e-06, + "loss": 0.0081, + "step": 4666 + }, + { + "epoch": 2.1232939035486806, + "grad_norm": 0.32279978984882435, + "learning_rate": 6.173827312414897e-06, + "loss": 0.0114, + "step": 4667 + }, + { + "epoch": 2.123748862602366, + "grad_norm": 0.4372200402483741, + "learning_rate": 6.172437914354428e-06, + "loss": 0.011, + "step": 4668 + }, + { + "epoch": 2.124203821656051, + "grad_norm": 0.26049460733199026, + "learning_rate": 6.171048420487627e-06, + "loss": 0.0073, + "step": 4669 + }, + { + "epoch": 2.124658780709736, + "grad_norm": 0.3874247713956048, + "learning_rate": 6.169658830928037e-06, + "loss": 0.0114, + "step": 4670 + }, + { + "epoch": 2.1251137397634214, + "grad_norm": 0.36789539929279097, + "learning_rate": 6.168269145789211e-06, + "loss": 0.0128, + "step": 4671 + }, + { + "epoch": 2.1255686988171063, + "grad_norm": 0.41423701778830163, + "learning_rate": 6.166879365184705e-06, + "loss": 0.0125, + "step": 4672 + }, + { + "epoch": 2.1260236578707916, + "grad_norm": 0.3888237106440935, + "learning_rate": 6.165489489228086e-06, + "loss": 0.0097, + "step": 4673 + }, + { + "epoch": 2.126478616924477, + "grad_norm": 0.33622016893128404, + "learning_rate": 6.164099518032931e-06, + "loss": 0.0096, + "step": 4674 + }, + { + "epoch": 2.126933575978162, + "grad_norm": 0.5419369950038244, + "learning_rate": 6.16270945171282e-06, + "loss": 0.0223, + "step": 4675 + }, + { + "epoch": 2.127388535031847, + "grad_norm": 0.4666748588676364, + "learning_rate": 6.161319290381342e-06, + "loss": 0.0138, + "step": 4676 + }, + { + "epoch": 2.1278434940855324, + "grad_norm": 0.20680893492029834, + "learning_rate": 6.159929034152098e-06, + "loss": 0.0044, + "step": 4677 + }, + { + "epoch": 2.1282984531392173, + "grad_norm": 0.32100270225017463, + "learning_rate": 6.158538683138689e-06, + "loss": 0.0079, + "step": 4678 + }, + { + "epoch": 2.1287534121929026, + "grad_norm": 0.5240960762964716, + "learning_rate": 6.157148237454734e-06, + "loss": 0.0208, + "step": 4679 + }, + { + "epoch": 2.129208371246588, + "grad_norm": 0.4750588882235299, + "learning_rate": 6.155757697213848e-06, + "loss": 0.0317, + "step": 4680 + }, + { + "epoch": 2.1296633303002728, + "grad_norm": 0.6071814918296271, + "learning_rate": 6.154367062529663e-06, + "loss": 0.0162, + "step": 4681 + }, + { + "epoch": 2.130118289353958, + "grad_norm": 0.3483499351536633, + "learning_rate": 6.152976333515816e-06, + "loss": 0.0103, + "step": 4682 + }, + { + "epoch": 2.1305732484076434, + "grad_norm": 0.3176517571523393, + "learning_rate": 6.151585510285949e-06, + "loss": 0.022, + "step": 4683 + }, + { + "epoch": 2.1310282074613287, + "grad_norm": 0.2834270098676874, + "learning_rate": 6.150194592953714e-06, + "loss": 0.0078, + "step": 4684 + }, + { + "epoch": 2.1314831665150136, + "grad_norm": 0.20075819652666196, + "learning_rate": 6.1488035816327705e-06, + "loss": 0.0052, + "step": 4685 + }, + { + "epoch": 2.131938125568699, + "grad_norm": 0.35543231749418297, + "learning_rate": 6.147412476436789e-06, + "loss": 0.0188, + "step": 4686 + }, + { + "epoch": 2.132393084622384, + "grad_norm": 0.4005658010128812, + "learning_rate": 6.146021277479438e-06, + "loss": 0.0161, + "step": 4687 + }, + { + "epoch": 2.132848043676069, + "grad_norm": 0.43084722475538406, + "learning_rate": 6.1446299848744064e-06, + "loss": 0.0094, + "step": 4688 + }, + { + "epoch": 2.1333030027297544, + "grad_norm": 0.35225087592272825, + "learning_rate": 6.143238598735382e-06, + "loss": 0.0104, + "step": 4689 + }, + { + "epoch": 2.1337579617834397, + "grad_norm": 0.4328677661340792, + "learning_rate": 6.14184711917606e-06, + "loss": 0.0124, + "step": 4690 + }, + { + "epoch": 2.1342129208371245, + "grad_norm": 0.512463258271892, + "learning_rate": 6.140455546310149e-06, + "loss": 0.0218, + "step": 4691 + }, + { + "epoch": 2.13466787989081, + "grad_norm": 0.309462854786802, + "learning_rate": 6.13906388025136e-06, + "loss": 0.0079, + "step": 4692 + }, + { + "epoch": 2.135122838944495, + "grad_norm": 0.46781772579974923, + "learning_rate": 6.137672121113416e-06, + "loss": 0.0191, + "step": 4693 + }, + { + "epoch": 2.13557779799818, + "grad_norm": 2.517888895246575, + "learning_rate": 6.136280269010043e-06, + "loss": 0.0821, + "step": 4694 + }, + { + "epoch": 2.1360327570518653, + "grad_norm": 0.4572240952536275, + "learning_rate": 6.134888324054978e-06, + "loss": 0.016, + "step": 4695 + }, + { + "epoch": 2.1364877161055507, + "grad_norm": 0.24484263967956846, + "learning_rate": 6.133496286361965e-06, + "loss": 0.0085, + "step": 4696 + }, + { + "epoch": 2.1369426751592355, + "grad_norm": 0.3074001728531714, + "learning_rate": 6.132104156044753e-06, + "loss": 0.0082, + "step": 4697 + }, + { + "epoch": 2.137397634212921, + "grad_norm": 0.6501583812528575, + "learning_rate": 6.130711933217103e-06, + "loss": 0.0184, + "step": 4698 + }, + { + "epoch": 2.137852593266606, + "grad_norm": 0.43823203013909906, + "learning_rate": 6.12931961799278e-06, + "loss": 0.0161, + "step": 4699 + }, + { + "epoch": 2.138307552320291, + "grad_norm": 0.2380852798065643, + "learning_rate": 6.127927210485558e-06, + "loss": 0.0052, + "step": 4700 + }, + { + "epoch": 2.1387625113739763, + "grad_norm": 0.41066885343616005, + "learning_rate": 6.126534710809217e-06, + "loss": 0.0104, + "step": 4701 + }, + { + "epoch": 2.1392174704276616, + "grad_norm": 0.3193981628094812, + "learning_rate": 6.125142119077548e-06, + "loss": 0.009, + "step": 4702 + }, + { + "epoch": 2.1396724294813465, + "grad_norm": 0.27467948825988836, + "learning_rate": 6.123749435404345e-06, + "loss": 0.0067, + "step": 4703 + }, + { + "epoch": 2.140127388535032, + "grad_norm": 0.47760238173553504, + "learning_rate": 6.122356659903414e-06, + "loss": 0.0081, + "step": 4704 + }, + { + "epoch": 2.140582347588717, + "grad_norm": 0.33054107501935437, + "learning_rate": 6.1209637926885635e-06, + "loss": 0.0108, + "step": 4705 + }, + { + "epoch": 2.141037306642402, + "grad_norm": 0.15623333190508923, + "learning_rate": 6.119570833873616e-06, + "loss": 0.0035, + "step": 4706 + }, + { + "epoch": 2.1414922656960873, + "grad_norm": 0.6933431793628463, + "learning_rate": 6.118177783572394e-06, + "loss": 0.0378, + "step": 4707 + }, + { + "epoch": 2.1419472247497726, + "grad_norm": 0.24781879479870367, + "learning_rate": 6.116784641898734e-06, + "loss": 0.0074, + "step": 4708 + }, + { + "epoch": 2.1424021838034575, + "grad_norm": 0.4716485241111409, + "learning_rate": 6.115391408966478e-06, + "loss": 0.0097, + "step": 4709 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.3580072322855141, + "learning_rate": 6.113998084889472e-06, + "loss": 0.0042, + "step": 4710 + }, + { + "epoch": 2.143312101910828, + "grad_norm": 0.38016300805459524, + "learning_rate": 6.112604669781572e-06, + "loss": 0.012, + "step": 4711 + }, + { + "epoch": 2.143767060964513, + "grad_norm": 0.507619153938635, + "learning_rate": 6.111211163756644e-06, + "loss": 0.0146, + "step": 4712 + }, + { + "epoch": 2.1442220200181983, + "grad_norm": 0.42767402212513583, + "learning_rate": 6.10981756692856e-06, + "loss": 0.0115, + "step": 4713 + }, + { + "epoch": 2.1446769790718836, + "grad_norm": 0.5149753893607478, + "learning_rate": 6.108423879411193e-06, + "loss": 0.011, + "step": 4714 + }, + { + "epoch": 2.145131938125569, + "grad_norm": 0.23231944854908404, + "learning_rate": 6.107030101318433e-06, + "loss": 0.0056, + "step": 4715 + }, + { + "epoch": 2.1455868971792538, + "grad_norm": 0.35414801991752665, + "learning_rate": 6.1056362327641726e-06, + "loss": 0.0077, + "step": 4716 + }, + { + "epoch": 2.146041856232939, + "grad_norm": 0.4995095629811545, + "learning_rate": 6.104242273862313e-06, + "loss": 0.0187, + "step": 4717 + }, + { + "epoch": 2.1464968152866244, + "grad_norm": 0.40298438638076695, + "learning_rate": 6.102848224726761e-06, + "loss": 0.011, + "step": 4718 + }, + { + "epoch": 2.1469517743403093, + "grad_norm": 0.2879775600435006, + "learning_rate": 6.1014540854714324e-06, + "loss": 0.0056, + "step": 4719 + }, + { + "epoch": 2.1474067333939946, + "grad_norm": 0.2463958119740778, + "learning_rate": 6.100059856210251e-06, + "loss": 0.009, + "step": 4720 + }, + { + "epoch": 2.14786169244768, + "grad_norm": 0.6843603558724051, + "learning_rate": 6.098665537057145e-06, + "loss": 0.0192, + "step": 4721 + }, + { + "epoch": 2.1483166515013647, + "grad_norm": 0.3959988936762539, + "learning_rate": 6.097271128126052e-06, + "loss": 0.0117, + "step": 4722 + }, + { + "epoch": 2.14877161055505, + "grad_norm": 0.24838444785399422, + "learning_rate": 6.095876629530918e-06, + "loss": 0.0068, + "step": 4723 + }, + { + "epoch": 2.1492265696087354, + "grad_norm": 0.2528726815979166, + "learning_rate": 6.094482041385697e-06, + "loss": 0.0053, + "step": 4724 + }, + { + "epoch": 2.1496815286624202, + "grad_norm": 0.4836205323337117, + "learning_rate": 6.093087363804345e-06, + "loss": 0.0129, + "step": 4725 + }, + { + "epoch": 2.1501364877161055, + "grad_norm": 0.3494498814055253, + "learning_rate": 6.0916925969008275e-06, + "loss": 0.0146, + "step": 4726 + }, + { + "epoch": 2.150591446769791, + "grad_norm": 0.3399153756042083, + "learning_rate": 6.090297740789124e-06, + "loss": 0.0106, + "step": 4727 + }, + { + "epoch": 2.1510464058234757, + "grad_norm": 0.5278632665447198, + "learning_rate": 6.088902795583211e-06, + "loss": 0.016, + "step": 4728 + }, + { + "epoch": 2.151501364877161, + "grad_norm": 0.2242819039645525, + "learning_rate": 6.08750776139708e-06, + "loss": 0.0072, + "step": 4729 + }, + { + "epoch": 2.1519563239308463, + "grad_norm": 0.19418934363495513, + "learning_rate": 6.086112638344727e-06, + "loss": 0.0053, + "step": 4730 + }, + { + "epoch": 2.152411282984531, + "grad_norm": 0.27005944566872925, + "learning_rate": 6.084717426540152e-06, + "loss": 0.0097, + "step": 4731 + }, + { + "epoch": 2.1528662420382165, + "grad_norm": 0.3667775845948143, + "learning_rate": 6.08332212609737e-06, + "loss": 0.014, + "step": 4732 + }, + { + "epoch": 2.153321201091902, + "grad_norm": 0.37190070651539836, + "learning_rate": 6.081926737130392e-06, + "loss": 0.0106, + "step": 4733 + }, + { + "epoch": 2.1537761601455867, + "grad_norm": 0.3497323603964637, + "learning_rate": 6.080531259753251e-06, + "loss": 0.0084, + "step": 4734 + }, + { + "epoch": 2.154231119199272, + "grad_norm": 0.7224472318461973, + "learning_rate": 6.079135694079973e-06, + "loss": 0.026, + "step": 4735 + }, + { + "epoch": 2.1546860782529573, + "grad_norm": 0.4345968958519765, + "learning_rate": 6.0777400402246e-06, + "loss": 0.0149, + "step": 4736 + }, + { + "epoch": 2.1551410373066426, + "grad_norm": 0.2859755139669437, + "learning_rate": 6.076344298301178e-06, + "loss": 0.0084, + "step": 4737 + }, + { + "epoch": 2.1555959963603275, + "grad_norm": 0.4119900972119734, + "learning_rate": 6.0749484684237605e-06, + "loss": 0.0109, + "step": 4738 + }, + { + "epoch": 2.156050955414013, + "grad_norm": 0.3616480465966663, + "learning_rate": 6.073552550706408e-06, + "loss": 0.011, + "step": 4739 + }, + { + "epoch": 2.156505914467698, + "grad_norm": 0.30461442568532915, + "learning_rate": 6.0721565452631895e-06, + "loss": 0.006, + "step": 4740 + }, + { + "epoch": 2.156960873521383, + "grad_norm": 0.3764717952729475, + "learning_rate": 6.070760452208181e-06, + "loss": 0.0131, + "step": 4741 + }, + { + "epoch": 2.1574158325750683, + "grad_norm": 0.20780953253775353, + "learning_rate": 6.069364271655463e-06, + "loss": 0.0072, + "step": 4742 + }, + { + "epoch": 2.1578707916287536, + "grad_norm": 0.4013046783652124, + "learning_rate": 6.0679680037191245e-06, + "loss": 0.0104, + "step": 4743 + }, + { + "epoch": 2.1583257506824385, + "grad_norm": 0.5323515325728005, + "learning_rate": 6.0665716485132665e-06, + "loss": 0.0247, + "step": 4744 + }, + { + "epoch": 2.158780709736124, + "grad_norm": 0.3098061356634163, + "learning_rate": 6.065175206151988e-06, + "loss": 0.0071, + "step": 4745 + }, + { + "epoch": 2.159235668789809, + "grad_norm": 0.6711243538714666, + "learning_rate": 6.0637786767494035e-06, + "loss": 0.0213, + "step": 4746 + }, + { + "epoch": 2.159690627843494, + "grad_norm": 0.4331859916089256, + "learning_rate": 6.062382060419628e-06, + "loss": 0.0131, + "step": 4747 + }, + { + "epoch": 2.1601455868971793, + "grad_norm": 0.43169068625570167, + "learning_rate": 6.0609853572767886e-06, + "loss": 0.0122, + "step": 4748 + }, + { + "epoch": 2.1606005459508646, + "grad_norm": 0.40906162019120135, + "learning_rate": 6.0595885674350184e-06, + "loss": 0.0155, + "step": 4749 + }, + { + "epoch": 2.1610555050045495, + "grad_norm": 0.3653189390834431, + "learning_rate": 6.058191691008453e-06, + "loss": 0.0133, + "step": 4750 + }, + { + "epoch": 2.1615104640582348, + "grad_norm": 0.7215799730558953, + "learning_rate": 6.056794728111244e-06, + "loss": 0.0294, + "step": 4751 + }, + { + "epoch": 2.16196542311192, + "grad_norm": 0.28209095792273875, + "learning_rate": 6.0553976788575395e-06, + "loss": 0.0084, + "step": 4752 + }, + { + "epoch": 2.162420382165605, + "grad_norm": 0.26556119267985756, + "learning_rate": 6.054000543361506e-06, + "loss": 0.0067, + "step": 4753 + }, + { + "epoch": 2.1628753412192903, + "grad_norm": 0.46234399150367417, + "learning_rate": 6.052603321737306e-06, + "loss": 0.0228, + "step": 4754 + }, + { + "epoch": 2.1633303002729756, + "grad_norm": 0.2291718371880802, + "learning_rate": 6.051206014099116e-06, + "loss": 0.0043, + "step": 4755 + }, + { + "epoch": 2.1637852593266604, + "grad_norm": 0.37626130042703126, + "learning_rate": 6.049808620561118e-06, + "loss": 0.0108, + "step": 4756 + }, + { + "epoch": 2.1642402183803457, + "grad_norm": 0.4387336074321349, + "learning_rate": 6.0484111412375005e-06, + "loss": 0.0157, + "step": 4757 + }, + { + "epoch": 2.164695177434031, + "grad_norm": 0.33337756700143717, + "learning_rate": 6.047013576242459e-06, + "loss": 0.0072, + "step": 4758 + }, + { + "epoch": 2.165150136487716, + "grad_norm": 0.4185471804476035, + "learning_rate": 6.045615925690196e-06, + "loss": 0.0189, + "step": 4759 + }, + { + "epoch": 2.1656050955414012, + "grad_norm": 0.2661686896047376, + "learning_rate": 6.044218189694922e-06, + "loss": 0.0083, + "step": 4760 + }, + { + "epoch": 2.1660600545950865, + "grad_norm": 0.4272995989189421, + "learning_rate": 6.042820368370854e-06, + "loss": 0.0168, + "step": 4761 + }, + { + "epoch": 2.1665150136487714, + "grad_norm": 0.31015151921115824, + "learning_rate": 6.041422461832214e-06, + "loss": 0.0054, + "step": 4762 + }, + { + "epoch": 2.1669699727024567, + "grad_norm": 0.5715853686305054, + "learning_rate": 6.0400244701932334e-06, + "loss": 0.0325, + "step": 4763 + }, + { + "epoch": 2.167424931756142, + "grad_norm": 0.26231670013399255, + "learning_rate": 6.03862639356815e-06, + "loss": 0.0052, + "step": 4764 + }, + { + "epoch": 2.167879890809827, + "grad_norm": 0.4456886325034235, + "learning_rate": 6.037228232071207e-06, + "loss": 0.0185, + "step": 4765 + }, + { + "epoch": 2.168334849863512, + "grad_norm": 0.4069779934635853, + "learning_rate": 6.035829985816659e-06, + "loss": 0.0112, + "step": 4766 + }, + { + "epoch": 2.1687898089171975, + "grad_norm": 0.33798797647195855, + "learning_rate": 6.034431654918761e-06, + "loss": 0.0078, + "step": 4767 + }, + { + "epoch": 2.1692447679708824, + "grad_norm": 0.4244850264652735, + "learning_rate": 6.033033239491779e-06, + "loss": 0.0127, + "step": 4768 + }, + { + "epoch": 2.1696997270245677, + "grad_norm": 0.4582293511737658, + "learning_rate": 6.031634739649987e-06, + "loss": 0.0228, + "step": 4769 + }, + { + "epoch": 2.170154686078253, + "grad_norm": 0.3495427084098909, + "learning_rate": 6.030236155507663e-06, + "loss": 0.0135, + "step": 4770 + }, + { + "epoch": 2.1706096451319383, + "grad_norm": 0.3447277224746257, + "learning_rate": 6.028837487179092e-06, + "loss": 0.0086, + "step": 4771 + }, + { + "epoch": 2.171064604185623, + "grad_norm": 0.41684658949692854, + "learning_rate": 6.0274387347785675e-06, + "loss": 0.0131, + "step": 4772 + }, + { + "epoch": 2.1715195632393085, + "grad_norm": 2.161786263491943, + "learning_rate": 6.026039898420392e-06, + "loss": 0.0314, + "step": 4773 + }, + { + "epoch": 2.171974522292994, + "grad_norm": 0.23402137097262496, + "learning_rate": 6.024640978218867e-06, + "loss": 0.0077, + "step": 4774 + }, + { + "epoch": 2.1724294813466787, + "grad_norm": 0.4908623331090018, + "learning_rate": 6.023241974288308e-06, + "loss": 0.0137, + "step": 4775 + }, + { + "epoch": 2.172884440400364, + "grad_norm": 0.3139320367383749, + "learning_rate": 6.021842886743036e-06, + "loss": 0.0094, + "step": 4776 + }, + { + "epoch": 2.1733393994540493, + "grad_norm": 0.5267319213235393, + "learning_rate": 6.02044371569738e-06, + "loss": 0.0174, + "step": 4777 + }, + { + "epoch": 2.173794358507734, + "grad_norm": 0.3176716587564772, + "learning_rate": 6.019044461265672e-06, + "loss": 0.0077, + "step": 4778 + }, + { + "epoch": 2.1742493175614195, + "grad_norm": 0.5365896557338491, + "learning_rate": 6.01764512356225e-06, + "loss": 0.0151, + "step": 4779 + }, + { + "epoch": 2.174704276615105, + "grad_norm": 0.3069252575826205, + "learning_rate": 6.016245702701466e-06, + "loss": 0.0073, + "step": 4780 + }, + { + "epoch": 2.1751592356687897, + "grad_norm": 0.45555655741708767, + "learning_rate": 6.014846198797673e-06, + "loss": 0.0152, + "step": 4781 + }, + { + "epoch": 2.175614194722475, + "grad_norm": 0.577079432872511, + "learning_rate": 6.013446611965229e-06, + "loss": 0.0202, + "step": 4782 + }, + { + "epoch": 2.1760691537761603, + "grad_norm": 0.2647405580225935, + "learning_rate": 6.012046942318507e-06, + "loss": 0.0062, + "step": 4783 + }, + { + "epoch": 2.176524112829845, + "grad_norm": 0.38849884729419404, + "learning_rate": 6.0106471899718785e-06, + "loss": 0.0143, + "step": 4784 + }, + { + "epoch": 2.1769790718835305, + "grad_norm": 0.4252941734157271, + "learning_rate": 6.009247355039725e-06, + "loss": 0.0126, + "step": 4785 + }, + { + "epoch": 2.1774340309372158, + "grad_norm": 0.2569941751090335, + "learning_rate": 6.007847437636436e-06, + "loss": 0.0057, + "step": 4786 + }, + { + "epoch": 2.1778889899909006, + "grad_norm": 0.44171668049013985, + "learning_rate": 6.006447437876406e-06, + "loss": 0.0107, + "step": 4787 + }, + { + "epoch": 2.178343949044586, + "grad_norm": 0.39292596246211253, + "learning_rate": 6.005047355874036e-06, + "loss": 0.0103, + "step": 4788 + }, + { + "epoch": 2.1787989080982713, + "grad_norm": 0.4829001341608921, + "learning_rate": 6.003647191743734e-06, + "loss": 0.0139, + "step": 4789 + }, + { + "epoch": 2.179253867151956, + "grad_norm": 0.3070324978728649, + "learning_rate": 6.002246945599918e-06, + "loss": 0.0067, + "step": 4790 + }, + { + "epoch": 2.1797088262056414, + "grad_norm": 0.3008772991904148, + "learning_rate": 6.0008466175570066e-06, + "loss": 0.0058, + "step": 4791 + }, + { + "epoch": 2.1801637852593267, + "grad_norm": 0.47506214872913943, + "learning_rate": 5.999446207729429e-06, + "loss": 0.0192, + "step": 4792 + }, + { + "epoch": 2.180618744313012, + "grad_norm": 0.4593157770677093, + "learning_rate": 5.9980457162316206e-06, + "loss": 0.0168, + "step": 4793 + }, + { + "epoch": 2.181073703366697, + "grad_norm": 0.30301020683938856, + "learning_rate": 5.996645143178025e-06, + "loss": 0.0075, + "step": 4794 + }, + { + "epoch": 2.1815286624203822, + "grad_norm": 0.39741882552150204, + "learning_rate": 5.995244488683088e-06, + "loss": 0.009, + "step": 4795 + }, + { + "epoch": 2.1819836214740675, + "grad_norm": 0.6789778146399447, + "learning_rate": 5.993843752861266e-06, + "loss": 0.0265, + "step": 4796 + }, + { + "epoch": 2.1824385805277524, + "grad_norm": 0.45692103365533293, + "learning_rate": 5.992442935827021e-06, + "loss": 0.0123, + "step": 4797 + }, + { + "epoch": 2.1828935395814377, + "grad_norm": 0.2663643592767637, + "learning_rate": 5.99104203769482e-06, + "loss": 0.0065, + "step": 4798 + }, + { + "epoch": 2.183348498635123, + "grad_norm": 0.4192556124069004, + "learning_rate": 5.98964105857914e-06, + "loss": 0.0164, + "step": 4799 + }, + { + "epoch": 2.183803457688808, + "grad_norm": 0.3294174370540794, + "learning_rate": 5.988239998594463e-06, + "loss": 0.0111, + "step": 4800 + }, + { + "epoch": 2.184258416742493, + "grad_norm": 0.256413754235468, + "learning_rate": 5.9868388578552736e-06, + "loss": 0.0057, + "step": 4801 + }, + { + "epoch": 2.1847133757961785, + "grad_norm": 0.35476569934440916, + "learning_rate": 5.985437636476072e-06, + "loss": 0.013, + "step": 4802 + }, + { + "epoch": 2.1851683348498634, + "grad_norm": 0.5364039288146651, + "learning_rate": 5.984036334571354e-06, + "loss": 0.0188, + "step": 4803 + }, + { + "epoch": 2.1856232939035487, + "grad_norm": 0.5035275777717944, + "learning_rate": 5.982634952255633e-06, + "loss": 0.0111, + "step": 4804 + }, + { + "epoch": 2.186078252957234, + "grad_norm": 0.3242586079060238, + "learning_rate": 5.98123348964342e-06, + "loss": 0.0112, + "step": 4805 + }, + { + "epoch": 2.186533212010919, + "grad_norm": 0.3814719552488175, + "learning_rate": 5.979831946849237e-06, + "loss": 0.0208, + "step": 4806 + }, + { + "epoch": 2.186988171064604, + "grad_norm": 0.4091000994356109, + "learning_rate": 5.978430323987614e-06, + "loss": 0.0129, + "step": 4807 + }, + { + "epoch": 2.1874431301182895, + "grad_norm": 0.3493239318833215, + "learning_rate": 5.977028621173082e-06, + "loss": 0.0099, + "step": 4808 + }, + { + "epoch": 2.1878980891719744, + "grad_norm": 0.4579529662370525, + "learning_rate": 5.975626838520185e-06, + "loss": 0.0104, + "step": 4809 + }, + { + "epoch": 2.1883530482256597, + "grad_norm": 0.3544967487096724, + "learning_rate": 5.9742249761434665e-06, + "loss": 0.0094, + "step": 4810 + }, + { + "epoch": 2.188808007279345, + "grad_norm": 0.35437826425866875, + "learning_rate": 5.972823034157485e-06, + "loss": 0.0133, + "step": 4811 + }, + { + "epoch": 2.18926296633303, + "grad_norm": 0.29840883208459107, + "learning_rate": 5.971421012676796e-06, + "loss": 0.009, + "step": 4812 + }, + { + "epoch": 2.189717925386715, + "grad_norm": 0.32160309893785227, + "learning_rate": 5.970018911815969e-06, + "loss": 0.0075, + "step": 4813 + }, + { + "epoch": 2.1901728844404005, + "grad_norm": 0.40003893224302556, + "learning_rate": 5.9686167316895786e-06, + "loss": 0.01, + "step": 4814 + }, + { + "epoch": 2.1906278434940853, + "grad_norm": 0.5572564727276995, + "learning_rate": 5.967214472412202e-06, + "loss": 0.0185, + "step": 4815 + }, + { + "epoch": 2.1910828025477707, + "grad_norm": 0.37148152625028286, + "learning_rate": 5.965812134098428e-06, + "loss": 0.0131, + "step": 4816 + }, + { + "epoch": 2.191537761601456, + "grad_norm": 0.3321422926123032, + "learning_rate": 5.9644097168628455e-06, + "loss": 0.0113, + "step": 4817 + }, + { + "epoch": 2.191992720655141, + "grad_norm": 0.4280593116204736, + "learning_rate": 5.963007220820057e-06, + "loss": 0.0105, + "step": 4818 + }, + { + "epoch": 2.192447679708826, + "grad_norm": 0.48795039502378945, + "learning_rate": 5.9616046460846685e-06, + "loss": 0.0264, + "step": 4819 + }, + { + "epoch": 2.1929026387625115, + "grad_norm": 0.4345889767556356, + "learning_rate": 5.960201992771289e-06, + "loss": 0.0147, + "step": 4820 + }, + { + "epoch": 2.1933575978161963, + "grad_norm": 0.26992468902773464, + "learning_rate": 5.958799260994541e-06, + "loss": 0.005, + "step": 4821 + }, + { + "epoch": 2.1938125568698816, + "grad_norm": 0.3243729973073873, + "learning_rate": 5.957396450869046e-06, + "loss": 0.0059, + "step": 4822 + }, + { + "epoch": 2.194267515923567, + "grad_norm": 0.20256304912426243, + "learning_rate": 5.955993562509438e-06, + "loss": 0.0032, + "step": 4823 + }, + { + "epoch": 2.194722474977252, + "grad_norm": 0.3111318089177059, + "learning_rate": 5.954590596030352e-06, + "loss": 0.0134, + "step": 4824 + }, + { + "epoch": 2.195177434030937, + "grad_norm": 0.27959531047054953, + "learning_rate": 5.953187551546433e-06, + "loss": 0.0075, + "step": 4825 + }, + { + "epoch": 2.1956323930846224, + "grad_norm": 0.32796181877560665, + "learning_rate": 5.951784429172334e-06, + "loss": 0.0097, + "step": 4826 + }, + { + "epoch": 2.1960873521383077, + "grad_norm": 0.8261618815243253, + "learning_rate": 5.950381229022706e-06, + "loss": 0.0163, + "step": 4827 + }, + { + "epoch": 2.1965423111919926, + "grad_norm": 0.47960887226055954, + "learning_rate": 5.948977951212219e-06, + "loss": 0.0133, + "step": 4828 + }, + { + "epoch": 2.196997270245678, + "grad_norm": 0.6295428431852292, + "learning_rate": 5.947574595855539e-06, + "loss": 0.0157, + "step": 4829 + }, + { + "epoch": 2.1974522292993632, + "grad_norm": 0.5058231133904427, + "learning_rate": 5.946171163067341e-06, + "loss": 0.0136, + "step": 4830 + }, + { + "epoch": 2.197907188353048, + "grad_norm": 0.41769543575645507, + "learning_rate": 5.944767652962309e-06, + "loss": 0.0139, + "step": 4831 + }, + { + "epoch": 2.1983621474067334, + "grad_norm": 0.4026794930288648, + "learning_rate": 5.943364065655131e-06, + "loss": 0.0199, + "step": 4832 + }, + { + "epoch": 2.1988171064604187, + "grad_norm": 0.613796587142224, + "learning_rate": 5.941960401260502e-06, + "loss": 0.0241, + "step": 4833 + }, + { + "epoch": 2.1992720655141036, + "grad_norm": 0.44531019560955765, + "learning_rate": 5.940556659893123e-06, + "loss": 0.0111, + "step": 4834 + }, + { + "epoch": 2.199727024567789, + "grad_norm": 0.6064428736600246, + "learning_rate": 5.9391528416677e-06, + "loss": 0.0241, + "step": 4835 + }, + { + "epoch": 2.200181983621474, + "grad_norm": 0.659961666861241, + "learning_rate": 5.93774894669895e-06, + "loss": 0.0193, + "step": 4836 + }, + { + "epoch": 2.200636942675159, + "grad_norm": 0.35896669249236113, + "learning_rate": 5.936344975101589e-06, + "loss": 0.0162, + "step": 4837 + }, + { + "epoch": 2.2010919017288444, + "grad_norm": 0.2226024632735598, + "learning_rate": 5.934940926990346e-06, + "loss": 0.0059, + "step": 4838 + }, + { + "epoch": 2.2015468607825297, + "grad_norm": 0.6412521074792568, + "learning_rate": 5.933536802479952e-06, + "loss": 0.0268, + "step": 4839 + }, + { + "epoch": 2.2020018198362146, + "grad_norm": 0.2311707642063774, + "learning_rate": 5.9321326016851475e-06, + "loss": 0.0085, + "step": 4840 + }, + { + "epoch": 2.2024567788899, + "grad_norm": 0.2665377816064787, + "learning_rate": 5.930728324720676e-06, + "loss": 0.0075, + "step": 4841 + }, + { + "epoch": 2.202911737943585, + "grad_norm": 0.433629797866015, + "learning_rate": 5.929323971701287e-06, + "loss": 0.0091, + "step": 4842 + }, + { + "epoch": 2.20336669699727, + "grad_norm": 0.37633327644039666, + "learning_rate": 5.927919542741742e-06, + "loss": 0.0149, + "step": 4843 + }, + { + "epoch": 2.2038216560509554, + "grad_norm": 0.2997005850142501, + "learning_rate": 5.926515037956802e-06, + "loss": 0.0049, + "step": 4844 + }, + { + "epoch": 2.2042766151046407, + "grad_norm": 0.4106961641210632, + "learning_rate": 5.925110457461236e-06, + "loss": 0.0107, + "step": 4845 + }, + { + "epoch": 2.2047315741583255, + "grad_norm": 0.390639317481699, + "learning_rate": 5.923705801369822e-06, + "loss": 0.0184, + "step": 4846 + }, + { + "epoch": 2.205186533212011, + "grad_norm": 0.3714930458538043, + "learning_rate": 5.922301069797343e-06, + "loss": 0.0079, + "step": 4847 + }, + { + "epoch": 2.205641492265696, + "grad_norm": 0.37185622687701453, + "learning_rate": 5.920896262858583e-06, + "loss": 0.0096, + "step": 4848 + }, + { + "epoch": 2.2060964513193815, + "grad_norm": 0.21431297930910828, + "learning_rate": 5.919491380668341e-06, + "loss": 0.0027, + "step": 4849 + }, + { + "epoch": 2.2065514103730663, + "grad_norm": 0.26101335832628725, + "learning_rate": 5.918086423341415e-06, + "loss": 0.005, + "step": 4850 + }, + { + "epoch": 2.2070063694267517, + "grad_norm": 0.3174396421177979, + "learning_rate": 5.916681390992613e-06, + "loss": 0.0083, + "step": 4851 + }, + { + "epoch": 2.207461328480437, + "grad_norm": 0.37189332383009505, + "learning_rate": 5.915276283736746e-06, + "loss": 0.0125, + "step": 4852 + }, + { + "epoch": 2.207916287534122, + "grad_norm": 0.22839634936221256, + "learning_rate": 5.9138711016886364e-06, + "loss": 0.0082, + "step": 4853 + }, + { + "epoch": 2.208371246587807, + "grad_norm": 0.39902762102311734, + "learning_rate": 5.912465844963106e-06, + "loss": 0.0089, + "step": 4854 + }, + { + "epoch": 2.2088262056414925, + "grad_norm": 0.38011464793935024, + "learning_rate": 5.911060513674986e-06, + "loss": 0.0132, + "step": 4855 + }, + { + "epoch": 2.2092811646951773, + "grad_norm": 0.44117284633989806, + "learning_rate": 5.9096551079391175e-06, + "loss": 0.0098, + "step": 4856 + }, + { + "epoch": 2.2097361237488626, + "grad_norm": 0.5433595565488459, + "learning_rate": 5.908249627870342e-06, + "loss": 0.0211, + "step": 4857 + }, + { + "epoch": 2.210191082802548, + "grad_norm": 0.4540701154032658, + "learning_rate": 5.906844073583507e-06, + "loss": 0.0101, + "step": 4858 + }, + { + "epoch": 2.210646041856233, + "grad_norm": 0.6026998190118189, + "learning_rate": 5.90543844519347e-06, + "loss": 0.0261, + "step": 4859 + }, + { + "epoch": 2.211101000909918, + "grad_norm": 0.6184229542959041, + "learning_rate": 5.904032742815092e-06, + "loss": 0.0276, + "step": 4860 + }, + { + "epoch": 2.2115559599636034, + "grad_norm": 0.32119971093347577, + "learning_rate": 5.902626966563241e-06, + "loss": 0.0084, + "step": 4861 + }, + { + "epoch": 2.2120109190172883, + "grad_norm": 0.353281287967236, + "learning_rate": 5.901221116552791e-06, + "loss": 0.0125, + "step": 4862 + }, + { + "epoch": 2.2124658780709736, + "grad_norm": 0.535007533174073, + "learning_rate": 5.8998151928986205e-06, + "loss": 0.0285, + "step": 4863 + }, + { + "epoch": 2.212920837124659, + "grad_norm": 0.5308484524550808, + "learning_rate": 5.898409195715616e-06, + "loss": 0.0214, + "step": 4864 + }, + { + "epoch": 2.213375796178344, + "grad_norm": 0.44553792105412476, + "learning_rate": 5.89700312511867e-06, + "loss": 0.0107, + "step": 4865 + }, + { + "epoch": 2.213830755232029, + "grad_norm": 0.494225228414626, + "learning_rate": 5.895596981222679e-06, + "loss": 0.0131, + "step": 4866 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.28563660749753433, + "learning_rate": 5.894190764142547e-06, + "loss": 0.0077, + "step": 4867 + }, + { + "epoch": 2.2147406733393993, + "grad_norm": 0.36408312845008073, + "learning_rate": 5.892784473993184e-06, + "loss": 0.0161, + "step": 4868 + }, + { + "epoch": 2.2151956323930846, + "grad_norm": 0.31879770665449286, + "learning_rate": 5.891378110889505e-06, + "loss": 0.0124, + "step": 4869 + }, + { + "epoch": 2.21565059144677, + "grad_norm": 0.3031384330817416, + "learning_rate": 5.889971674946434e-06, + "loss": 0.0118, + "step": 4870 + }, + { + "epoch": 2.2161055505004548, + "grad_norm": 0.41800795118456197, + "learning_rate": 5.888565166278895e-06, + "loss": 0.0128, + "step": 4871 + }, + { + "epoch": 2.21656050955414, + "grad_norm": 0.5372077913709468, + "learning_rate": 5.887158585001825e-06, + "loss": 0.0132, + "step": 4872 + }, + { + "epoch": 2.2170154686078254, + "grad_norm": 0.5567453276448947, + "learning_rate": 5.885751931230159e-06, + "loss": 0.0153, + "step": 4873 + }, + { + "epoch": 2.2174704276615103, + "grad_norm": 0.26780172680459685, + "learning_rate": 5.884345205078847e-06, + "loss": 0.0079, + "step": 4874 + }, + { + "epoch": 2.2179253867151956, + "grad_norm": 0.2872420903479903, + "learning_rate": 5.8829384066628395e-06, + "loss": 0.0106, + "step": 4875 + }, + { + "epoch": 2.218380345768881, + "grad_norm": 0.2720204238574773, + "learning_rate": 5.881531536097091e-06, + "loss": 0.0085, + "step": 4876 + }, + { + "epoch": 2.2188353048225657, + "grad_norm": 0.2854866521087616, + "learning_rate": 5.8801245934965676e-06, + "loss": 0.0116, + "step": 4877 + }, + { + "epoch": 2.219290263876251, + "grad_norm": 0.2934997083257922, + "learning_rate": 5.878717578976236e-06, + "loss": 0.0091, + "step": 4878 + }, + { + "epoch": 2.2197452229299364, + "grad_norm": 0.2950040866228448, + "learning_rate": 5.877310492651073e-06, + "loss": 0.0091, + "step": 4879 + }, + { + "epoch": 2.2202001819836217, + "grad_norm": 0.38209288233432137, + "learning_rate": 5.875903334636056e-06, + "loss": 0.013, + "step": 4880 + }, + { + "epoch": 2.2206551410373065, + "grad_norm": 0.4351658467566086, + "learning_rate": 5.874496105046177e-06, + "loss": 0.0133, + "step": 4881 + }, + { + "epoch": 2.221110100090992, + "grad_norm": 0.452865040336361, + "learning_rate": 5.873088803996424e-06, + "loss": 0.0123, + "step": 4882 + }, + { + "epoch": 2.221565059144677, + "grad_norm": 0.3422021101237533, + "learning_rate": 5.871681431601797e-06, + "loss": 0.0107, + "step": 4883 + }, + { + "epoch": 2.222020018198362, + "grad_norm": 0.3445596196811098, + "learning_rate": 5.870273987977301e-06, + "loss": 0.0098, + "step": 4884 + }, + { + "epoch": 2.2224749772520473, + "grad_norm": 0.2666549337093361, + "learning_rate": 5.868866473237944e-06, + "loss": 0.0053, + "step": 4885 + }, + { + "epoch": 2.2229299363057327, + "grad_norm": 0.5124450260383374, + "learning_rate": 5.867458887498743e-06, + "loss": 0.0163, + "step": 4886 + }, + { + "epoch": 2.2233848953594175, + "grad_norm": 0.3993052473544218, + "learning_rate": 5.866051230874719e-06, + "loss": 0.0121, + "step": 4887 + }, + { + "epoch": 2.223839854413103, + "grad_norm": 0.4550765708618119, + "learning_rate": 5.8646435034808975e-06, + "loss": 0.0073, + "step": 4888 + }, + { + "epoch": 2.224294813466788, + "grad_norm": 0.49229903782323486, + "learning_rate": 5.863235705432317e-06, + "loss": 0.0161, + "step": 4889 + }, + { + "epoch": 2.224749772520473, + "grad_norm": 0.6784352478909106, + "learning_rate": 5.86182783684401e-06, + "loss": 0.0182, + "step": 4890 + }, + { + "epoch": 2.2252047315741583, + "grad_norm": 0.4082367694691016, + "learning_rate": 5.860419897831025e-06, + "loss": 0.0133, + "step": 4891 + }, + { + "epoch": 2.2256596906278436, + "grad_norm": 0.45059913620494113, + "learning_rate": 5.859011888508412e-06, + "loss": 0.0147, + "step": 4892 + }, + { + "epoch": 2.2261146496815285, + "grad_norm": 0.49608072863593244, + "learning_rate": 5.857603808991228e-06, + "loss": 0.0213, + "step": 4893 + }, + { + "epoch": 2.226569608735214, + "grad_norm": 0.2726929458301699, + "learning_rate": 5.856195659394531e-06, + "loss": 0.0064, + "step": 4894 + }, + { + "epoch": 2.227024567788899, + "grad_norm": 0.538526382842245, + "learning_rate": 5.8547874398333924e-06, + "loss": 0.0236, + "step": 4895 + }, + { + "epoch": 2.227479526842584, + "grad_norm": 0.38738433735005584, + "learning_rate": 5.853379150422885e-06, + "loss": 0.0104, + "step": 4896 + }, + { + "epoch": 2.2279344858962693, + "grad_norm": 1.5326763283968696, + "learning_rate": 5.851970791278086e-06, + "loss": 0.0201, + "step": 4897 + }, + { + "epoch": 2.2283894449499546, + "grad_norm": 0.2143038442053601, + "learning_rate": 5.850562362514083e-06, + "loss": 0.0052, + "step": 4898 + }, + { + "epoch": 2.2288444040036395, + "grad_norm": 0.37303622579677503, + "learning_rate": 5.849153864245963e-06, + "loss": 0.0071, + "step": 4899 + }, + { + "epoch": 2.229299363057325, + "grad_norm": 0.39372925777577983, + "learning_rate": 5.847745296588827e-06, + "loss": 0.0067, + "step": 4900 + }, + { + "epoch": 2.22975432211101, + "grad_norm": 0.4332279505365411, + "learning_rate": 5.8463366596577706e-06, + "loss": 0.0136, + "step": 4901 + }, + { + "epoch": 2.2302092811646954, + "grad_norm": 0.29820073293128874, + "learning_rate": 5.844927953567906e-06, + "loss": 0.0147, + "step": 4902 + }, + { + "epoch": 2.2306642402183803, + "grad_norm": 0.4984730828892178, + "learning_rate": 5.843519178434345e-06, + "loss": 0.0198, + "step": 4903 + }, + { + "epoch": 2.2311191992720656, + "grad_norm": 0.6088795409231883, + "learning_rate": 5.842110334372203e-06, + "loss": 0.0159, + "step": 4904 + }, + { + "epoch": 2.231574158325751, + "grad_norm": 0.40473599951548656, + "learning_rate": 5.840701421496611e-06, + "loss": 0.0084, + "step": 4905 + }, + { + "epoch": 2.2320291173794358, + "grad_norm": 0.3123276430739044, + "learning_rate": 5.8392924399226945e-06, + "loss": 0.005, + "step": 4906 + }, + { + "epoch": 2.232484076433121, + "grad_norm": 0.26646880519784627, + "learning_rate": 5.837883389765589e-06, + "loss": 0.0068, + "step": 4907 + }, + { + "epoch": 2.2329390354868064, + "grad_norm": 0.35407883713038, + "learning_rate": 5.8364742711404375e-06, + "loss": 0.0086, + "step": 4908 + }, + { + "epoch": 2.2333939945404913, + "grad_norm": 0.33912525466798915, + "learning_rate": 5.835065084162386e-06, + "loss": 0.005, + "step": 4909 + }, + { + "epoch": 2.2338489535941766, + "grad_norm": 0.5903475824416466, + "learning_rate": 5.833655828946587e-06, + "loss": 0.0139, + "step": 4910 + }, + { + "epoch": 2.234303912647862, + "grad_norm": 0.48780826821725437, + "learning_rate": 5.832246505608198e-06, + "loss": 0.0206, + "step": 4911 + }, + { + "epoch": 2.2347588717015467, + "grad_norm": 0.40209753389439074, + "learning_rate": 5.830837114262384e-06, + "loss": 0.0174, + "step": 4912 + }, + { + "epoch": 2.235213830755232, + "grad_norm": 0.4813709058978907, + "learning_rate": 5.829427655024312e-06, + "loss": 0.0161, + "step": 4913 + }, + { + "epoch": 2.2356687898089174, + "grad_norm": 0.21913261319584917, + "learning_rate": 5.828018128009156e-06, + "loss": 0.006, + "step": 4914 + }, + { + "epoch": 2.2361237488626022, + "grad_norm": 0.6013695489737468, + "learning_rate": 5.826608533332101e-06, + "loss": 0.0242, + "step": 4915 + }, + { + "epoch": 2.2365787079162875, + "grad_norm": 0.3203067204799793, + "learning_rate": 5.825198871108328e-06, + "loss": 0.0125, + "step": 4916 + }, + { + "epoch": 2.237033666969973, + "grad_norm": 0.5646445515057829, + "learning_rate": 5.823789141453031e-06, + "loss": 0.0181, + "step": 4917 + }, + { + "epoch": 2.2374886260236577, + "grad_norm": 0.49975574343761603, + "learning_rate": 5.822379344481404e-06, + "loss": 0.0162, + "step": 4918 + }, + { + "epoch": 2.237943585077343, + "grad_norm": 0.4065286004383984, + "learning_rate": 5.820969480308652e-06, + "loss": 0.0064, + "step": 4919 + }, + { + "epoch": 2.2383985441310283, + "grad_norm": 0.32281003186740637, + "learning_rate": 5.819559549049982e-06, + "loss": 0.0064, + "step": 4920 + }, + { + "epoch": 2.238853503184713, + "grad_norm": 0.47098734384379837, + "learning_rate": 5.8181495508206045e-06, + "loss": 0.0151, + "step": 4921 + }, + { + "epoch": 2.2393084622383985, + "grad_norm": 0.3570877991124948, + "learning_rate": 5.816739485735743e-06, + "loss": 0.0131, + "step": 4922 + }, + { + "epoch": 2.239763421292084, + "grad_norm": 0.4698502156284249, + "learning_rate": 5.815329353910618e-06, + "loss": 0.0144, + "step": 4923 + }, + { + "epoch": 2.2402183803457687, + "grad_norm": 0.5783311808405599, + "learning_rate": 5.81391915546046e-06, + "loss": 0.0158, + "step": 4924 + }, + { + "epoch": 2.240673339399454, + "grad_norm": 0.38897801889691475, + "learning_rate": 5.812508890500503e-06, + "loss": 0.0123, + "step": 4925 + }, + { + "epoch": 2.2411282984531393, + "grad_norm": 0.40295578720791275, + "learning_rate": 5.811098559145991e-06, + "loss": 0.0104, + "step": 4926 + }, + { + "epoch": 2.241583257506824, + "grad_norm": 1.4507216651716257, + "learning_rate": 5.809688161512167e-06, + "loss": 0.0176, + "step": 4927 + }, + { + "epoch": 2.2420382165605095, + "grad_norm": 0.5062657433386122, + "learning_rate": 5.808277697714283e-06, + "loss": 0.0184, + "step": 4928 + }, + { + "epoch": 2.242493175614195, + "grad_norm": 0.37838836624796074, + "learning_rate": 5.806867167867595e-06, + "loss": 0.0124, + "step": 4929 + }, + { + "epoch": 2.2429481346678797, + "grad_norm": 0.3740956013076256, + "learning_rate": 5.8054565720873665e-06, + "loss": 0.0172, + "step": 4930 + }, + { + "epoch": 2.243403093721565, + "grad_norm": 2.8682603949734697, + "learning_rate": 5.804045910488864e-06, + "loss": 0.0392, + "step": 4931 + }, + { + "epoch": 2.2438580527752503, + "grad_norm": 0.25586205171673104, + "learning_rate": 5.80263518318736e-06, + "loss": 0.0062, + "step": 4932 + }, + { + "epoch": 2.244313011828935, + "grad_norm": 0.5138290683522561, + "learning_rate": 5.801224390298135e-06, + "loss": 0.0223, + "step": 4933 + }, + { + "epoch": 2.2447679708826205, + "grad_norm": 0.31307652816681897, + "learning_rate": 5.79981353193647e-06, + "loss": 0.0083, + "step": 4934 + }, + { + "epoch": 2.245222929936306, + "grad_norm": 0.5007090142655471, + "learning_rate": 5.798402608217655e-06, + "loss": 0.0125, + "step": 4935 + }, + { + "epoch": 2.245677888989991, + "grad_norm": 0.3376887505199625, + "learning_rate": 5.7969916192569855e-06, + "loss": 0.0184, + "step": 4936 + }, + { + "epoch": 2.246132848043676, + "grad_norm": 0.3470931920015642, + "learning_rate": 5.7955805651697595e-06, + "loss": 0.0109, + "step": 4937 + }, + { + "epoch": 2.2465878070973613, + "grad_norm": 0.5101493862951201, + "learning_rate": 5.794169446071283e-06, + "loss": 0.012, + "step": 4938 + }, + { + "epoch": 2.2470427661510466, + "grad_norm": 0.4222936678567516, + "learning_rate": 5.792758262076864e-06, + "loss": 0.0181, + "step": 4939 + }, + { + "epoch": 2.2474977252047315, + "grad_norm": 0.3154935093336658, + "learning_rate": 5.7913470133018225e-06, + "loss": 0.0105, + "step": 4940 + }, + { + "epoch": 2.2479526842584168, + "grad_norm": 0.3089213208483222, + "learning_rate": 5.789935699861475e-06, + "loss": 0.009, + "step": 4941 + }, + { + "epoch": 2.248407643312102, + "grad_norm": 0.12832391205359286, + "learning_rate": 5.78852432187115e-06, + "loss": 0.004, + "step": 4942 + }, + { + "epoch": 2.248862602365787, + "grad_norm": 0.41319675096337877, + "learning_rate": 5.787112879446177e-06, + "loss": 0.0095, + "step": 4943 + }, + { + "epoch": 2.2493175614194723, + "grad_norm": 0.3204711489093298, + "learning_rate": 5.785701372701896e-06, + "loss": 0.0131, + "step": 4944 + }, + { + "epoch": 2.2497725204731576, + "grad_norm": 0.6014032291558712, + "learning_rate": 5.784289801753646e-06, + "loss": 0.0269, + "step": 4945 + }, + { + "epoch": 2.2502274795268424, + "grad_norm": 0.5374390381709199, + "learning_rate": 5.782878166716775e-06, + "loss": 0.0217, + "step": 4946 + }, + { + "epoch": 2.2506824385805277, + "grad_norm": 0.36820229425385403, + "learning_rate": 5.7814664677066364e-06, + "loss": 0.0126, + "step": 4947 + }, + { + "epoch": 2.251137397634213, + "grad_norm": 0.46592461070793456, + "learning_rate": 5.780054704838587e-06, + "loss": 0.0111, + "step": 4948 + }, + { + "epoch": 2.251592356687898, + "grad_norm": 0.25899334637267063, + "learning_rate": 5.77864287822799e-06, + "loss": 0.0081, + "step": 4949 + }, + { + "epoch": 2.2520473157415832, + "grad_norm": 0.28583117999916735, + "learning_rate": 5.777230987990212e-06, + "loss": 0.0102, + "step": 4950 + }, + { + "epoch": 2.2525022747952685, + "grad_norm": 0.3209514458106223, + "learning_rate": 5.775819034240629e-06, + "loss": 0.0096, + "step": 4951 + }, + { + "epoch": 2.2529572338489534, + "grad_norm": 0.24934042043358404, + "learning_rate": 5.774407017094618e-06, + "loss": 0.0085, + "step": 4952 + }, + { + "epoch": 2.2534121929026387, + "grad_norm": 0.47009835945401357, + "learning_rate": 5.772994936667562e-06, + "loss": 0.0197, + "step": 4953 + }, + { + "epoch": 2.253867151956324, + "grad_norm": 0.23128927069268837, + "learning_rate": 5.771582793074853e-06, + "loss": 0.0055, + "step": 4954 + }, + { + "epoch": 2.2543221110100093, + "grad_norm": 0.5715633849732003, + "learning_rate": 5.77017058643188e-06, + "loss": 0.0174, + "step": 4955 + }, + { + "epoch": 2.254777070063694, + "grad_norm": 0.43675819305589864, + "learning_rate": 5.768758316854045e-06, + "loss": 0.0137, + "step": 4956 + }, + { + "epoch": 2.2552320291173795, + "grad_norm": 0.2688189862498713, + "learning_rate": 5.767345984456751e-06, + "loss": 0.0082, + "step": 4957 + }, + { + "epoch": 2.255686988171065, + "grad_norm": 0.44499266839276047, + "learning_rate": 5.7659335893554115e-06, + "loss": 0.0168, + "step": 4958 + }, + { + "epoch": 2.2561419472247497, + "grad_norm": 0.44237827656028933, + "learning_rate": 5.764521131665437e-06, + "loss": 0.016, + "step": 4959 + }, + { + "epoch": 2.256596906278435, + "grad_norm": 0.37931892498888303, + "learning_rate": 5.7631086115022464e-06, + "loss": 0.0076, + "step": 4960 + }, + { + "epoch": 2.2570518653321203, + "grad_norm": 0.5485154743879561, + "learning_rate": 5.761696028981269e-06, + "loss": 0.0233, + "step": 4961 + }, + { + "epoch": 2.257506824385805, + "grad_norm": 0.3893754847059226, + "learning_rate": 5.7602833842179285e-06, + "loss": 0.0123, + "step": 4962 + }, + { + "epoch": 2.2579617834394905, + "grad_norm": 0.40854201839403814, + "learning_rate": 5.758870677327665e-06, + "loss": 0.0103, + "step": 4963 + }, + { + "epoch": 2.258416742493176, + "grad_norm": 0.49304151830282095, + "learning_rate": 5.7574579084259175e-06, + "loss": 0.0068, + "step": 4964 + }, + { + "epoch": 2.2588717015468607, + "grad_norm": 0.3101229238565357, + "learning_rate": 5.7560450776281295e-06, + "loss": 0.006, + "step": 4965 + }, + { + "epoch": 2.259326660600546, + "grad_norm": 0.3018619929319456, + "learning_rate": 5.754632185049753e-06, + "loss": 0.0153, + "step": 4966 + }, + { + "epoch": 2.2597816196542313, + "grad_norm": 0.40603320270712606, + "learning_rate": 5.75321923080624e-06, + "loss": 0.0134, + "step": 4967 + }, + { + "epoch": 2.260236578707916, + "grad_norm": 0.34605688909493, + "learning_rate": 5.751806215013055e-06, + "loss": 0.0132, + "step": 4968 + }, + { + "epoch": 2.2606915377616015, + "grad_norm": 0.2875706728087106, + "learning_rate": 5.75039313778566e-06, + "loss": 0.005, + "step": 4969 + }, + { + "epoch": 2.261146496815287, + "grad_norm": 0.3932509886436604, + "learning_rate": 5.748979999239528e-06, + "loss": 0.012, + "step": 4970 + }, + { + "epoch": 2.2616014558689717, + "grad_norm": 0.567212199212246, + "learning_rate": 5.7475667994901316e-06, + "loss": 0.0236, + "step": 4971 + }, + { + "epoch": 2.262056414922657, + "grad_norm": 0.21614892083477413, + "learning_rate": 5.746153538652953e-06, + "loss": 0.0046, + "step": 4972 + }, + { + "epoch": 2.2625113739763423, + "grad_norm": 0.370042314029447, + "learning_rate": 5.7447402168434775e-06, + "loss": 0.0134, + "step": 4973 + }, + { + "epoch": 2.262966333030027, + "grad_norm": 0.5187504110392387, + "learning_rate": 5.743326834177192e-06, + "loss": 0.017, + "step": 4974 + }, + { + "epoch": 2.2634212920837125, + "grad_norm": 0.4883200916567663, + "learning_rate": 5.741913390769597e-06, + "loss": 0.0205, + "step": 4975 + }, + { + "epoch": 2.2638762511373978, + "grad_norm": 0.5524738448804214, + "learning_rate": 5.74049988673619e-06, + "loss": 0.0169, + "step": 4976 + }, + { + "epoch": 2.2643312101910826, + "grad_norm": 0.47634028211297075, + "learning_rate": 5.739086322192474e-06, + "loss": 0.0127, + "step": 4977 + }, + { + "epoch": 2.264786169244768, + "grad_norm": 0.4926572378339637, + "learning_rate": 5.737672697253964e-06, + "loss": 0.0135, + "step": 4978 + }, + { + "epoch": 2.2652411282984533, + "grad_norm": 0.3576600424560595, + "learning_rate": 5.736259012036171e-06, + "loss": 0.0123, + "step": 4979 + }, + { + "epoch": 2.265696087352138, + "grad_norm": 0.5349258203707818, + "learning_rate": 5.734845266654619e-06, + "loss": 0.0131, + "step": 4980 + }, + { + "epoch": 2.2661510464058234, + "grad_norm": 0.4592868406741922, + "learning_rate": 5.733431461224828e-06, + "loss": 0.0098, + "step": 4981 + }, + { + "epoch": 2.2666060054595087, + "grad_norm": 0.2552801025598758, + "learning_rate": 5.732017595862329e-06, + "loss": 0.0055, + "step": 4982 + }, + { + "epoch": 2.2670609645131936, + "grad_norm": 0.35292700821310596, + "learning_rate": 5.730603670682661e-06, + "loss": 0.0116, + "step": 4983 + }, + { + "epoch": 2.267515923566879, + "grad_norm": 0.48567998975999815, + "learning_rate": 5.7291896858013574e-06, + "loss": 0.0163, + "step": 4984 + }, + { + "epoch": 2.2679708826205642, + "grad_norm": 0.43355116925628845, + "learning_rate": 5.727775641333968e-06, + "loss": 0.015, + "step": 4985 + }, + { + "epoch": 2.268425841674249, + "grad_norm": 0.4044202222636259, + "learning_rate": 5.726361537396038e-06, + "loss": 0.0157, + "step": 4986 + }, + { + "epoch": 2.2688808007279344, + "grad_norm": 0.3995526928549569, + "learning_rate": 5.724947374103125e-06, + "loss": 0.0099, + "step": 4987 + }, + { + "epoch": 2.2693357597816197, + "grad_norm": 0.35497847490830436, + "learning_rate": 5.723533151570785e-06, + "loss": 0.0097, + "step": 4988 + }, + { + "epoch": 2.2697907188353046, + "grad_norm": 0.2612014985047218, + "learning_rate": 5.722118869914583e-06, + "loss": 0.0061, + "step": 4989 + }, + { + "epoch": 2.27024567788899, + "grad_norm": 0.36502490742386334, + "learning_rate": 5.720704529250091e-06, + "loss": 0.0096, + "step": 4990 + }, + { + "epoch": 2.270700636942675, + "grad_norm": 0.4948103633382935, + "learning_rate": 5.719290129692876e-06, + "loss": 0.0226, + "step": 4991 + }, + { + "epoch": 2.2711555959963605, + "grad_norm": 0.6525924451495685, + "learning_rate": 5.717875671358521e-06, + "loss": 0.0095, + "step": 4992 + }, + { + "epoch": 2.2716105550500454, + "grad_norm": 0.272285400491665, + "learning_rate": 5.7164611543626094e-06, + "loss": 0.0084, + "step": 4993 + }, + { + "epoch": 2.2720655141037307, + "grad_norm": 0.40074776778608673, + "learning_rate": 5.715046578820726e-06, + "loss": 0.0078, + "step": 4994 + }, + { + "epoch": 2.272520473157416, + "grad_norm": 0.5321487800525837, + "learning_rate": 5.713631944848467e-06, + "loss": 0.0137, + "step": 4995 + }, + { + "epoch": 2.272975432211101, + "grad_norm": 0.5950192216486127, + "learning_rate": 5.712217252561426e-06, + "loss": 0.0273, + "step": 4996 + }, + { + "epoch": 2.273430391264786, + "grad_norm": 0.4284115464368724, + "learning_rate": 5.71080250207521e-06, + "loss": 0.0088, + "step": 4997 + }, + { + "epoch": 2.2738853503184715, + "grad_norm": 0.4443237885326474, + "learning_rate": 5.709387693505421e-06, + "loss": 0.0134, + "step": 4998 + }, + { + "epoch": 2.2743403093721564, + "grad_norm": 0.4168854902978304, + "learning_rate": 5.707972826967675e-06, + "loss": 0.0133, + "step": 4999 + }, + { + "epoch": 2.2747952684258417, + "grad_norm": 0.37898531127249696, + "learning_rate": 5.706557902577587e-06, + "loss": 0.0121, + "step": 5000 + }, + { + "epoch": 2.275250227479527, + "grad_norm": 0.345508399152065, + "learning_rate": 5.705142920450777e-06, + "loss": 0.0078, + "step": 5001 + }, + { + "epoch": 2.275705186533212, + "grad_norm": 0.3722891283110431, + "learning_rate": 5.703727880702872e-06, + "loss": 0.0095, + "step": 5002 + }, + { + "epoch": 2.276160145586897, + "grad_norm": 0.5259989621791551, + "learning_rate": 5.702312783449502e-06, + "loss": 0.0076, + "step": 5003 + }, + { + "epoch": 2.2766151046405825, + "grad_norm": 0.24442617647958984, + "learning_rate": 5.700897628806304e-06, + "loss": 0.0071, + "step": 5004 + }, + { + "epoch": 2.2770700636942673, + "grad_norm": 0.5788326829668876, + "learning_rate": 5.699482416888917e-06, + "loss": 0.0288, + "step": 5005 + }, + { + "epoch": 2.2775250227479527, + "grad_norm": 0.24644035360801245, + "learning_rate": 5.698067147812986e-06, + "loss": 0.0039, + "step": 5006 + }, + { + "epoch": 2.277979981801638, + "grad_norm": 0.3470487605226571, + "learning_rate": 5.696651821694159e-06, + "loss": 0.0115, + "step": 5007 + }, + { + "epoch": 2.278434940855323, + "grad_norm": 0.3709003071958226, + "learning_rate": 5.6952364386480915e-06, + "loss": 0.0122, + "step": 5008 + }, + { + "epoch": 2.278889899909008, + "grad_norm": 0.41802688093191903, + "learning_rate": 5.693820998790442e-06, + "loss": 0.0107, + "step": 5009 + }, + { + "epoch": 2.2793448589626935, + "grad_norm": 0.42250516619422673, + "learning_rate": 5.692405502236874e-06, + "loss": 0.0202, + "step": 5010 + }, + { + "epoch": 2.2797998180163788, + "grad_norm": 0.39930549644985364, + "learning_rate": 5.690989949103056e-06, + "loss": 0.0135, + "step": 5011 + }, + { + "epoch": 2.2802547770700636, + "grad_norm": 0.6672791426407672, + "learning_rate": 5.689574339504659e-06, + "loss": 0.0202, + "step": 5012 + }, + { + "epoch": 2.280709736123749, + "grad_norm": 0.28877596548959583, + "learning_rate": 5.68815867355736e-06, + "loss": 0.0086, + "step": 5013 + }, + { + "epoch": 2.2811646951774343, + "grad_norm": 0.23984631690190455, + "learning_rate": 5.686742951376844e-06, + "loss": 0.0055, + "step": 5014 + }, + { + "epoch": 2.281619654231119, + "grad_norm": 0.42296709808869476, + "learning_rate": 5.685327173078794e-06, + "loss": 0.0146, + "step": 5015 + }, + { + "epoch": 2.2820746132848044, + "grad_norm": 0.22652783021347725, + "learning_rate": 5.683911338778902e-06, + "loss": 0.0082, + "step": 5016 + }, + { + "epoch": 2.2825295723384897, + "grad_norm": 0.3828821098856581, + "learning_rate": 5.682495448592865e-06, + "loss": 0.0085, + "step": 5017 + }, + { + "epoch": 2.2829845313921746, + "grad_norm": 0.6481984981851953, + "learning_rate": 5.681079502636382e-06, + "loss": 0.0196, + "step": 5018 + }, + { + "epoch": 2.28343949044586, + "grad_norm": 0.3605305080965294, + "learning_rate": 5.6796635010251565e-06, + "loss": 0.0145, + "step": 5019 + }, + { + "epoch": 2.2838944494995452, + "grad_norm": 0.45871327603411105, + "learning_rate": 5.678247443874899e-06, + "loss": 0.0091, + "step": 5020 + }, + { + "epoch": 2.28434940855323, + "grad_norm": 0.4387078912852865, + "learning_rate": 5.676831331301326e-06, + "loss": 0.0174, + "step": 5021 + }, + { + "epoch": 2.2848043676069154, + "grad_norm": 0.7364232085228714, + "learning_rate": 5.67541516342015e-06, + "loss": 0.0175, + "step": 5022 + }, + { + "epoch": 2.2852593266606007, + "grad_norm": 0.38051913232432416, + "learning_rate": 5.673998940347098e-06, + "loss": 0.0105, + "step": 5023 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.4203835976556839, + "learning_rate": 5.672582662197897e-06, + "loss": 0.0176, + "step": 5024 + }, + { + "epoch": 2.286169244767971, + "grad_norm": 0.4657706550731654, + "learning_rate": 5.671166329088278e-06, + "loss": 0.0177, + "step": 5025 + }, + { + "epoch": 2.286624203821656, + "grad_norm": 0.6592102460769599, + "learning_rate": 5.669749941133978e-06, + "loss": 0.0212, + "step": 5026 + }, + { + "epoch": 2.287079162875341, + "grad_norm": 0.4071193965957663, + "learning_rate": 5.668333498450736e-06, + "loss": 0.0084, + "step": 5027 + }, + { + "epoch": 2.2875341219290264, + "grad_norm": 0.4983485656734523, + "learning_rate": 5.6669170011543e-06, + "loss": 0.0225, + "step": 5028 + }, + { + "epoch": 2.2879890809827117, + "grad_norm": 0.43354718301208217, + "learning_rate": 5.665500449360418e-06, + "loss": 0.0107, + "step": 5029 + }, + { + "epoch": 2.2884440400363966, + "grad_norm": 0.39977433431566556, + "learning_rate": 5.664083843184843e-06, + "loss": 0.013, + "step": 5030 + }, + { + "epoch": 2.288898999090082, + "grad_norm": 0.32376461410374807, + "learning_rate": 5.662667182743338e-06, + "loss": 0.0091, + "step": 5031 + }, + { + "epoch": 2.289353958143767, + "grad_norm": 0.3657112859204232, + "learning_rate": 5.661250468151662e-06, + "loss": 0.012, + "step": 5032 + }, + { + "epoch": 2.289808917197452, + "grad_norm": 0.3009564905495144, + "learning_rate": 5.659833699525584e-06, + "loss": 0.0072, + "step": 5033 + }, + { + "epoch": 2.2902638762511374, + "grad_norm": 0.39452593339125275, + "learning_rate": 5.6584168769808766e-06, + "loss": 0.022, + "step": 5034 + }, + { + "epoch": 2.2907188353048227, + "grad_norm": 0.360961425345943, + "learning_rate": 5.657000000633315e-06, + "loss": 0.0099, + "step": 5035 + }, + { + "epoch": 2.2911737943585075, + "grad_norm": 0.463328619990547, + "learning_rate": 5.655583070598681e-06, + "loss": 0.0128, + "step": 5036 + }, + { + "epoch": 2.291628753412193, + "grad_norm": 0.35998487951110436, + "learning_rate": 5.6541660869927565e-06, + "loss": 0.0125, + "step": 5037 + }, + { + "epoch": 2.292083712465878, + "grad_norm": 0.4405740492655455, + "learning_rate": 5.652749049931336e-06, + "loss": 0.0176, + "step": 5038 + }, + { + "epoch": 2.292538671519563, + "grad_norm": 0.6490718386759887, + "learning_rate": 5.65133195953021e-06, + "loss": 0.0309, + "step": 5039 + }, + { + "epoch": 2.2929936305732483, + "grad_norm": 0.3699204140788893, + "learning_rate": 5.6499148159051775e-06, + "loss": 0.0099, + "step": 5040 + }, + { + "epoch": 2.2934485896269337, + "grad_norm": 0.3902261568395218, + "learning_rate": 5.648497619172042e-06, + "loss": 0.0132, + "step": 5041 + }, + { + "epoch": 2.2939035486806185, + "grad_norm": 0.4404931751252235, + "learning_rate": 5.647080369446609e-06, + "loss": 0.0162, + "step": 5042 + }, + { + "epoch": 2.294358507734304, + "grad_norm": 0.2353589954428234, + "learning_rate": 5.645663066844692e-06, + "loss": 0.0067, + "step": 5043 + }, + { + "epoch": 2.294813466787989, + "grad_norm": 0.5950281333092574, + "learning_rate": 5.644245711482101e-06, + "loss": 0.0284, + "step": 5044 + }, + { + "epoch": 2.295268425841674, + "grad_norm": 0.4707885897948095, + "learning_rate": 5.642828303474665e-06, + "loss": 0.0163, + "step": 5045 + }, + { + "epoch": 2.2957233848953593, + "grad_norm": 0.4904893500955452, + "learning_rate": 5.6414108429382e-06, + "loss": 0.0183, + "step": 5046 + }, + { + "epoch": 2.2961783439490446, + "grad_norm": 0.29333798950057566, + "learning_rate": 5.639993329988537e-06, + "loss": 0.0094, + "step": 5047 + }, + { + "epoch": 2.29663330300273, + "grad_norm": 0.32263195452792615, + "learning_rate": 5.638575764741511e-06, + "loss": 0.0108, + "step": 5048 + }, + { + "epoch": 2.297088262056415, + "grad_norm": 0.2795409607827203, + "learning_rate": 5.637158147312956e-06, + "loss": 0.0065, + "step": 5049 + }, + { + "epoch": 2.2975432211101, + "grad_norm": 0.3823863568191496, + "learning_rate": 5.635740477818716e-06, + "loss": 0.0107, + "step": 5050 + }, + { + "epoch": 2.2979981801637854, + "grad_norm": 0.28033895048134977, + "learning_rate": 5.634322756374634e-06, + "loss": 0.0071, + "step": 5051 + }, + { + "epoch": 2.2984531392174703, + "grad_norm": 0.29199495113342167, + "learning_rate": 5.632904983096561e-06, + "loss": 0.0076, + "step": 5052 + }, + { + "epoch": 2.2989080982711556, + "grad_norm": 0.4174410155801214, + "learning_rate": 5.631487158100352e-06, + "loss": 0.0167, + "step": 5053 + }, + { + "epoch": 2.299363057324841, + "grad_norm": 0.18711042403073394, + "learning_rate": 5.630069281501862e-06, + "loss": 0.0046, + "step": 5054 + }, + { + "epoch": 2.299818016378526, + "grad_norm": 0.5876691796827814, + "learning_rate": 5.628651353416957e-06, + "loss": 0.0187, + "step": 5055 + }, + { + "epoch": 2.300272975432211, + "grad_norm": 0.7474700362276027, + "learning_rate": 5.627233373961503e-06, + "loss": 0.0173, + "step": 5056 + }, + { + "epoch": 2.3007279344858964, + "grad_norm": 0.3849472671771112, + "learning_rate": 5.62581534325137e-06, + "loss": 0.01, + "step": 5057 + }, + { + "epoch": 2.3011828935395813, + "grad_norm": 0.4551373080863021, + "learning_rate": 5.624397261402432e-06, + "loss": 0.011, + "step": 5058 + }, + { + "epoch": 2.3016378525932666, + "grad_norm": 0.3737734609919106, + "learning_rate": 5.62297912853057e-06, + "loss": 0.013, + "step": 5059 + }, + { + "epoch": 2.302092811646952, + "grad_norm": 0.3156504510256659, + "learning_rate": 5.621560944751668e-06, + "loss": 0.0069, + "step": 5060 + }, + { + "epoch": 2.3025477707006368, + "grad_norm": 0.3385850814330116, + "learning_rate": 5.6201427101816105e-06, + "loss": 0.0095, + "step": 5061 + }, + { + "epoch": 2.303002729754322, + "grad_norm": 0.3968512150117719, + "learning_rate": 5.618724424936295e-06, + "loss": 0.0119, + "step": 5062 + }, + { + "epoch": 2.3034576888080074, + "grad_norm": 0.3817144000944613, + "learning_rate": 5.61730608913161e-06, + "loss": 0.0137, + "step": 5063 + }, + { + "epoch": 2.3039126478616927, + "grad_norm": 0.44460642647912646, + "learning_rate": 5.615887702883462e-06, + "loss": 0.0164, + "step": 5064 + }, + { + "epoch": 2.3043676069153776, + "grad_norm": 0.42919077855817034, + "learning_rate": 5.61446926630775e-06, + "loss": 0.0108, + "step": 5065 + }, + { + "epoch": 2.304822565969063, + "grad_norm": 0.5802541888893671, + "learning_rate": 5.613050779520385e-06, + "loss": 0.0211, + "step": 5066 + }, + { + "epoch": 2.305277525022748, + "grad_norm": 0.46402502055509914, + "learning_rate": 5.611632242637279e-06, + "loss": 0.0204, + "step": 5067 + }, + { + "epoch": 2.305732484076433, + "grad_norm": 0.26674695532163656, + "learning_rate": 5.610213655774349e-06, + "loss": 0.0057, + "step": 5068 + }, + { + "epoch": 2.3061874431301184, + "grad_norm": 0.3823871062863482, + "learning_rate": 5.608795019047514e-06, + "loss": 0.0204, + "step": 5069 + }, + { + "epoch": 2.3066424021838037, + "grad_norm": 0.3923528299084095, + "learning_rate": 5.607376332572699e-06, + "loss": 0.0112, + "step": 5070 + }, + { + "epoch": 2.3070973612374885, + "grad_norm": 0.3614994681977554, + "learning_rate": 5.605957596465834e-06, + "loss": 0.0145, + "step": 5071 + }, + { + "epoch": 2.307552320291174, + "grad_norm": 0.45085748129160796, + "learning_rate": 5.60453881084285e-06, + "loss": 0.0175, + "step": 5072 + }, + { + "epoch": 2.308007279344859, + "grad_norm": 0.48780435224282037, + "learning_rate": 5.603119975819684e-06, + "loss": 0.0221, + "step": 5073 + }, + { + "epoch": 2.308462238398544, + "grad_norm": 0.3344774151298191, + "learning_rate": 5.601701091512279e-06, + "loss": 0.0095, + "step": 5074 + }, + { + "epoch": 2.3089171974522293, + "grad_norm": 0.42206035237573375, + "learning_rate": 5.600282158036575e-06, + "loss": 0.0072, + "step": 5075 + }, + { + "epoch": 2.3093721565059147, + "grad_norm": 0.5683375002520097, + "learning_rate": 5.598863175508526e-06, + "loss": 0.0209, + "step": 5076 + }, + { + "epoch": 2.3098271155595995, + "grad_norm": 0.48459850230370116, + "learning_rate": 5.597444144044083e-06, + "loss": 0.017, + "step": 5077 + }, + { + "epoch": 2.310282074613285, + "grad_norm": 0.4332383227088994, + "learning_rate": 5.596025063759202e-06, + "loss": 0.0171, + "step": 5078 + }, + { + "epoch": 2.31073703366697, + "grad_norm": 0.2758241308671077, + "learning_rate": 5.594605934769845e-06, + "loss": 0.0104, + "step": 5079 + }, + { + "epoch": 2.311191992720655, + "grad_norm": 0.3338536433140087, + "learning_rate": 5.593186757191974e-06, + "loss": 0.0089, + "step": 5080 + }, + { + "epoch": 2.3116469517743403, + "grad_norm": 0.2640392126447474, + "learning_rate": 5.591767531141563e-06, + "loss": 0.0082, + "step": 5081 + }, + { + "epoch": 2.3121019108280256, + "grad_norm": 0.3953893898741137, + "learning_rate": 5.59034825673458e-06, + "loss": 0.0083, + "step": 5082 + }, + { + "epoch": 2.3125568698817105, + "grad_norm": 0.2569213699979878, + "learning_rate": 5.588928934087003e-06, + "loss": 0.0086, + "step": 5083 + }, + { + "epoch": 2.313011828935396, + "grad_norm": 0.6313666359843114, + "learning_rate": 5.5875095633148146e-06, + "loss": 0.0246, + "step": 5084 + }, + { + "epoch": 2.313466787989081, + "grad_norm": 0.643173164905506, + "learning_rate": 5.586090144533998e-06, + "loss": 0.0305, + "step": 5085 + }, + { + "epoch": 2.313921747042766, + "grad_norm": 0.5062189665269613, + "learning_rate": 5.58467067786054e-06, + "loss": 0.0157, + "step": 5086 + }, + { + "epoch": 2.3143767060964513, + "grad_norm": 0.5819628723646005, + "learning_rate": 5.583251163410436e-06, + "loss": 0.0091, + "step": 5087 + }, + { + "epoch": 2.3148316651501366, + "grad_norm": 0.4972780006772306, + "learning_rate": 5.58183160129968e-06, + "loss": 0.0162, + "step": 5088 + }, + { + "epoch": 2.3152866242038215, + "grad_norm": 0.41694264361458766, + "learning_rate": 5.580411991644273e-06, + "loss": 0.0164, + "step": 5089 + }, + { + "epoch": 2.315741583257507, + "grad_norm": 0.489632550791834, + "learning_rate": 5.578992334560219e-06, + "loss": 0.0167, + "step": 5090 + }, + { + "epoch": 2.316196542311192, + "grad_norm": 0.3288945372054903, + "learning_rate": 5.577572630163527e-06, + "loss": 0.0086, + "step": 5091 + }, + { + "epoch": 2.316651501364877, + "grad_norm": 0.38969169771532974, + "learning_rate": 5.576152878570208e-06, + "loss": 0.014, + "step": 5092 + }, + { + "epoch": 2.3171064604185623, + "grad_norm": 0.2772452761090754, + "learning_rate": 5.5747330798962765e-06, + "loss": 0.0076, + "step": 5093 + }, + { + "epoch": 2.3175614194722476, + "grad_norm": 0.7398501879278008, + "learning_rate": 5.573313234257755e-06, + "loss": 0.0212, + "step": 5094 + }, + { + "epoch": 2.3180163785259325, + "grad_norm": 0.40366058849418684, + "learning_rate": 5.571893341770663e-06, + "loss": 0.0092, + "step": 5095 + }, + { + "epoch": 2.3184713375796178, + "grad_norm": 0.3973674281915543, + "learning_rate": 5.57047340255103e-06, + "loss": 0.0097, + "step": 5096 + }, + { + "epoch": 2.318926296633303, + "grad_norm": 0.46292198909608095, + "learning_rate": 5.569053416714887e-06, + "loss": 0.019, + "step": 5097 + }, + { + "epoch": 2.319381255686988, + "grad_norm": 0.4462893128572885, + "learning_rate": 5.56763338437827e-06, + "loss": 0.0144, + "step": 5098 + }, + { + "epoch": 2.3198362147406733, + "grad_norm": 0.48797923425342077, + "learning_rate": 5.566213305657215e-06, + "loss": 0.0129, + "step": 5099 + }, + { + "epoch": 2.3202911737943586, + "grad_norm": 0.412122799014893, + "learning_rate": 5.564793180667766e-06, + "loss": 0.0096, + "step": 5100 + }, + { + "epoch": 2.3207461328480434, + "grad_norm": 3.5850346149075736, + "learning_rate": 5.56337300952597e-06, + "loss": 0.0639, + "step": 5101 + }, + { + "epoch": 2.3212010919017287, + "grad_norm": 0.19091056292886274, + "learning_rate": 5.561952792347873e-06, + "loss": 0.0043, + "step": 5102 + }, + { + "epoch": 2.321656050955414, + "grad_norm": 0.3746308093194985, + "learning_rate": 5.5605325292495335e-06, + "loss": 0.0113, + "step": 5103 + }, + { + "epoch": 2.3221110100090994, + "grad_norm": 0.4288877488182419, + "learning_rate": 5.559112220347007e-06, + "loss": 0.0193, + "step": 5104 + }, + { + "epoch": 2.3225659690627842, + "grad_norm": 0.43634766681944853, + "learning_rate": 5.557691865756355e-06, + "loss": 0.0115, + "step": 5105 + }, + { + "epoch": 2.3230209281164695, + "grad_norm": 0.4211964374389764, + "learning_rate": 5.556271465593642e-06, + "loss": 0.0212, + "step": 5106 + }, + { + "epoch": 2.323475887170155, + "grad_norm": 0.30603182021946, + "learning_rate": 5.554851019974935e-06, + "loss": 0.0075, + "step": 5107 + }, + { + "epoch": 2.3239308462238397, + "grad_norm": 0.2871259949309822, + "learning_rate": 5.5534305290163115e-06, + "loss": 0.0075, + "step": 5108 + }, + { + "epoch": 2.324385805277525, + "grad_norm": 0.3468060233961574, + "learning_rate": 5.552009992833842e-06, + "loss": 0.0078, + "step": 5109 + }, + { + "epoch": 2.3248407643312103, + "grad_norm": 0.3542587721425399, + "learning_rate": 5.55058941154361e-06, + "loss": 0.0097, + "step": 5110 + }, + { + "epoch": 2.325295723384895, + "grad_norm": 0.38377233668808036, + "learning_rate": 5.549168785261698e-06, + "loss": 0.0094, + "step": 5111 + }, + { + "epoch": 2.3257506824385805, + "grad_norm": 0.7489040250614702, + "learning_rate": 5.547748114104192e-06, + "loss": 0.0191, + "step": 5112 + }, + { + "epoch": 2.326205641492266, + "grad_norm": 0.28559796075104366, + "learning_rate": 5.546327398187184e-06, + "loss": 0.0045, + "step": 5113 + }, + { + "epoch": 2.3266606005459507, + "grad_norm": 0.49166462721919674, + "learning_rate": 5.544906637626768e-06, + "loss": 0.0205, + "step": 5114 + }, + { + "epoch": 2.327115559599636, + "grad_norm": 0.3507371672526975, + "learning_rate": 5.543485832539043e-06, + "loss": 0.0076, + "step": 5115 + }, + { + "epoch": 2.3275705186533213, + "grad_norm": 0.16025051105052449, + "learning_rate": 5.54206498304011e-06, + "loss": 0.0035, + "step": 5116 + }, + { + "epoch": 2.328025477707006, + "grad_norm": 0.3514133884320373, + "learning_rate": 5.540644089246073e-06, + "loss": 0.0115, + "step": 5117 + }, + { + "epoch": 2.3284804367606915, + "grad_norm": 0.32838122382429447, + "learning_rate": 5.539223151273045e-06, + "loss": 0.0072, + "step": 5118 + }, + { + "epoch": 2.328935395814377, + "grad_norm": 0.3789507088231374, + "learning_rate": 5.537802169237134e-06, + "loss": 0.0077, + "step": 5119 + }, + { + "epoch": 2.329390354868062, + "grad_norm": 0.5026365616612761, + "learning_rate": 5.536381143254461e-06, + "loss": 0.017, + "step": 5120 + }, + { + "epoch": 2.329845313921747, + "grad_norm": 0.38103556357332163, + "learning_rate": 5.534960073441141e-06, + "loss": 0.0165, + "step": 5121 + }, + { + "epoch": 2.3303002729754323, + "grad_norm": 0.3273533156081994, + "learning_rate": 5.533538959913301e-06, + "loss": 0.0072, + "step": 5122 + }, + { + "epoch": 2.3307552320291176, + "grad_norm": 0.31684182975393166, + "learning_rate": 5.5321178027870655e-06, + "loss": 0.0096, + "step": 5123 + }, + { + "epoch": 2.3312101910828025, + "grad_norm": 0.2591271732965268, + "learning_rate": 5.530696602178566e-06, + "loss": 0.008, + "step": 5124 + }, + { + "epoch": 2.331665150136488, + "grad_norm": 0.3364534699488801, + "learning_rate": 5.529275358203938e-06, + "loss": 0.0094, + "step": 5125 + }, + { + "epoch": 2.332120109190173, + "grad_norm": 0.31871194877671855, + "learning_rate": 5.527854070979317e-06, + "loss": 0.0089, + "step": 5126 + }, + { + "epoch": 2.332575068243858, + "grad_norm": 0.5134652747659972, + "learning_rate": 5.526432740620846e-06, + "loss": 0.015, + "step": 5127 + }, + { + "epoch": 2.3330300272975433, + "grad_norm": 0.4040805564754702, + "learning_rate": 5.525011367244668e-06, + "loss": 0.0126, + "step": 5128 + }, + { + "epoch": 2.3334849863512286, + "grad_norm": 0.22849334940135976, + "learning_rate": 5.523589950966932e-06, + "loss": 0.0065, + "step": 5129 + }, + { + "epoch": 2.3339399454049135, + "grad_norm": 0.38056950548925816, + "learning_rate": 5.522168491903791e-06, + "loss": 0.0088, + "step": 5130 + }, + { + "epoch": 2.3343949044585988, + "grad_norm": 0.4479812046589512, + "learning_rate": 5.520746990171396e-06, + "loss": 0.0112, + "step": 5131 + }, + { + "epoch": 2.334849863512284, + "grad_norm": 0.35949104680649646, + "learning_rate": 5.5193254458859115e-06, + "loss": 0.0117, + "step": 5132 + }, + { + "epoch": 2.335304822565969, + "grad_norm": 0.3596141100734688, + "learning_rate": 5.517903859163496e-06, + "loss": 0.0114, + "step": 5133 + }, + { + "epoch": 2.3357597816196543, + "grad_norm": 0.3321917780898999, + "learning_rate": 5.516482230120316e-06, + "loss": 0.0104, + "step": 5134 + }, + { + "epoch": 2.3362147406733396, + "grad_norm": 0.4668618150432097, + "learning_rate": 5.515060558872541e-06, + "loss": 0.0116, + "step": 5135 + }, + { + "epoch": 2.3366696997270244, + "grad_norm": 0.21570878006243732, + "learning_rate": 5.513638845536341e-06, + "loss": 0.004, + "step": 5136 + }, + { + "epoch": 2.3371246587807097, + "grad_norm": 0.4585477462408327, + "learning_rate": 5.512217090227896e-06, + "loss": 0.0111, + "step": 5137 + }, + { + "epoch": 2.337579617834395, + "grad_norm": 0.597536026245206, + "learning_rate": 5.510795293063383e-06, + "loss": 0.0127, + "step": 5138 + }, + { + "epoch": 2.33803457688808, + "grad_norm": 0.3536513185606967, + "learning_rate": 5.509373454158986e-06, + "loss": 0.0067, + "step": 5139 + }, + { + "epoch": 2.3384895359417652, + "grad_norm": 0.20360766036589872, + "learning_rate": 5.5079515736308905e-06, + "loss": 0.005, + "step": 5140 + }, + { + "epoch": 2.3389444949954505, + "grad_norm": 0.2707652544579224, + "learning_rate": 5.506529651595286e-06, + "loss": 0.0054, + "step": 5141 + }, + { + "epoch": 2.3393994540491354, + "grad_norm": 0.39362868925558897, + "learning_rate": 5.5051076881683656e-06, + "loss": 0.0117, + "step": 5142 + }, + { + "epoch": 2.3398544131028207, + "grad_norm": 0.43413748804385666, + "learning_rate": 5.503685683466326e-06, + "loss": 0.0147, + "step": 5143 + }, + { + "epoch": 2.340309372156506, + "grad_norm": 0.4750004627533353, + "learning_rate": 5.502263637605368e-06, + "loss": 0.0177, + "step": 5144 + }, + { + "epoch": 2.340764331210191, + "grad_norm": 0.49510868242892087, + "learning_rate": 5.500841550701692e-06, + "loss": 0.0109, + "step": 5145 + }, + { + "epoch": 2.341219290263876, + "grad_norm": 0.47646236015820576, + "learning_rate": 5.499419422871506e-06, + "loss": 0.0181, + "step": 5146 + }, + { + "epoch": 2.3416742493175615, + "grad_norm": 0.5135583609690554, + "learning_rate": 5.4979972542310224e-06, + "loss": 0.0207, + "step": 5147 + }, + { + "epoch": 2.3421292083712464, + "grad_norm": 0.5281542299049714, + "learning_rate": 5.49657504489645e-06, + "loss": 0.0125, + "step": 5148 + }, + { + "epoch": 2.3425841674249317, + "grad_norm": 0.3803719712457962, + "learning_rate": 5.495152794984009e-06, + "loss": 0.012, + "step": 5149 + }, + { + "epoch": 2.343039126478617, + "grad_norm": 0.3549464506951641, + "learning_rate": 5.493730504609916e-06, + "loss": 0.0142, + "step": 5150 + }, + { + "epoch": 2.343494085532302, + "grad_norm": 0.5083323476527921, + "learning_rate": 5.492308173890398e-06, + "loss": 0.0172, + "step": 5151 + }, + { + "epoch": 2.343949044585987, + "grad_norm": 0.28678181509420586, + "learning_rate": 5.490885802941678e-06, + "loss": 0.0075, + "step": 5152 + }, + { + "epoch": 2.3444040036396725, + "grad_norm": 0.5466667218810686, + "learning_rate": 5.489463391879986e-06, + "loss": 0.0167, + "step": 5153 + }, + { + "epoch": 2.3448589626933574, + "grad_norm": 0.393248239534694, + "learning_rate": 5.488040940821558e-06, + "loss": 0.0135, + "step": 5154 + }, + { + "epoch": 2.3453139217470427, + "grad_norm": 0.5634806254809632, + "learning_rate": 5.486618449882628e-06, + "loss": 0.0235, + "step": 5155 + }, + { + "epoch": 2.345768880800728, + "grad_norm": 0.35741976769377365, + "learning_rate": 5.485195919179434e-06, + "loss": 0.0096, + "step": 5156 + }, + { + "epoch": 2.3462238398544133, + "grad_norm": 0.3287227460273248, + "learning_rate": 5.483773348828224e-06, + "loss": 0.0102, + "step": 5157 + }, + { + "epoch": 2.346678798908098, + "grad_norm": 0.34566261414427857, + "learning_rate": 5.482350738945238e-06, + "loss": 0.0186, + "step": 5158 + }, + { + "epoch": 2.3471337579617835, + "grad_norm": 0.4825164592835336, + "learning_rate": 5.4809280896467275e-06, + "loss": 0.0222, + "step": 5159 + }, + { + "epoch": 2.347588717015469, + "grad_norm": 0.41901111819882036, + "learning_rate": 5.479505401048947e-06, + "loss": 0.0117, + "step": 5160 + }, + { + "epoch": 2.3480436760691537, + "grad_norm": 0.4078878433271012, + "learning_rate": 5.4780826732681506e-06, + "loss": 0.0098, + "step": 5161 + }, + { + "epoch": 2.348498635122839, + "grad_norm": 0.39394380721957667, + "learning_rate": 5.476659906420596e-06, + "loss": 0.0083, + "step": 5162 + }, + { + "epoch": 2.3489535941765243, + "grad_norm": 0.5075398429488378, + "learning_rate": 5.4752371006225455e-06, + "loss": 0.0151, + "step": 5163 + }, + { + "epoch": 2.349408553230209, + "grad_norm": 0.4971659694533423, + "learning_rate": 5.4738142559902685e-06, + "loss": 0.0244, + "step": 5164 + }, + { + "epoch": 2.3498635122838945, + "grad_norm": 0.45965992406623524, + "learning_rate": 5.472391372640028e-06, + "loss": 0.0125, + "step": 5165 + }, + { + "epoch": 2.3503184713375798, + "grad_norm": 0.5015944290941781, + "learning_rate": 5.470968450688098e-06, + "loss": 0.0119, + "step": 5166 + }, + { + "epoch": 2.3507734303912646, + "grad_norm": 0.7219996034314764, + "learning_rate": 5.469545490250753e-06, + "loss": 0.0135, + "step": 5167 + }, + { + "epoch": 2.35122838944495, + "grad_norm": 0.5160262999277548, + "learning_rate": 5.468122491444271e-06, + "loss": 0.0149, + "step": 5168 + }, + { + "epoch": 2.3516833484986353, + "grad_norm": 0.30359649291817126, + "learning_rate": 5.466699454384934e-06, + "loss": 0.0133, + "step": 5169 + }, + { + "epoch": 2.35213830755232, + "grad_norm": 0.449765608795224, + "learning_rate": 5.465276379189024e-06, + "loss": 0.0239, + "step": 5170 + }, + { + "epoch": 2.3525932666060054, + "grad_norm": 0.2868592417951846, + "learning_rate": 5.4638532659728306e-06, + "loss": 0.0069, + "step": 5171 + }, + { + "epoch": 2.3530482256596907, + "grad_norm": 0.34360803220041697, + "learning_rate": 5.462430114852641e-06, + "loss": 0.0103, + "step": 5172 + }, + { + "epoch": 2.3535031847133756, + "grad_norm": 0.24947631338377824, + "learning_rate": 5.461006925944753e-06, + "loss": 0.0067, + "step": 5173 + }, + { + "epoch": 2.353958143767061, + "grad_norm": 0.353015504583996, + "learning_rate": 5.4595836993654605e-06, + "loss": 0.0103, + "step": 5174 + }, + { + "epoch": 2.3544131028207462, + "grad_norm": 0.2937630791965515, + "learning_rate": 5.458160435231062e-06, + "loss": 0.0096, + "step": 5175 + }, + { + "epoch": 2.3548680618744315, + "grad_norm": 0.28706192712106926, + "learning_rate": 5.456737133657865e-06, + "loss": 0.0063, + "step": 5176 + }, + { + "epoch": 2.3553230209281164, + "grad_norm": 0.41859742618093065, + "learning_rate": 5.455313794762167e-06, + "loss": 0.0145, + "step": 5177 + }, + { + "epoch": 2.3557779799818017, + "grad_norm": 0.315336713910296, + "learning_rate": 5.453890418660286e-06, + "loss": 0.01, + "step": 5178 + }, + { + "epoch": 2.356232939035487, + "grad_norm": 0.3971798519572188, + "learning_rate": 5.452467005468528e-06, + "loss": 0.0211, + "step": 5179 + }, + { + "epoch": 2.356687898089172, + "grad_norm": 0.3816503031725337, + "learning_rate": 5.45104355530321e-06, + "loss": 0.0178, + "step": 5180 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.31547098033189974, + "learning_rate": 5.44962006828065e-06, + "loss": 0.0102, + "step": 5181 + }, + { + "epoch": 2.3575978161965425, + "grad_norm": 0.33955988320842717, + "learning_rate": 5.448196544517168e-06, + "loss": 0.0108, + "step": 5182 + }, + { + "epoch": 2.3580527752502274, + "grad_norm": 0.39597746365649117, + "learning_rate": 5.44677298412909e-06, + "loss": 0.0181, + "step": 5183 + }, + { + "epoch": 2.3585077343039127, + "grad_norm": 0.43323282165822946, + "learning_rate": 5.445349387232738e-06, + "loss": 0.0094, + "step": 5184 + }, + { + "epoch": 2.358962693357598, + "grad_norm": 0.5537151721234318, + "learning_rate": 5.443925753944448e-06, + "loss": 0.0221, + "step": 5185 + }, + { + "epoch": 2.359417652411283, + "grad_norm": 0.2923319692625086, + "learning_rate": 5.4425020843805485e-06, + "loss": 0.0081, + "step": 5186 + }, + { + "epoch": 2.359872611464968, + "grad_norm": 0.560851840102983, + "learning_rate": 5.4410783786573785e-06, + "loss": 0.0186, + "step": 5187 + }, + { + "epoch": 2.3603275705186535, + "grad_norm": 0.3300258799514752, + "learning_rate": 5.439654636891275e-06, + "loss": 0.0121, + "step": 5188 + }, + { + "epoch": 2.3607825295723384, + "grad_norm": 0.34519215267290093, + "learning_rate": 5.4382308591985785e-06, + "loss": 0.0138, + "step": 5189 + }, + { + "epoch": 2.3612374886260237, + "grad_norm": 0.22487529661537098, + "learning_rate": 5.436807045695638e-06, + "loss": 0.0051, + "step": 5190 + }, + { + "epoch": 2.361692447679709, + "grad_norm": 0.3575446831077963, + "learning_rate": 5.435383196498795e-06, + "loss": 0.0096, + "step": 5191 + }, + { + "epoch": 2.362147406733394, + "grad_norm": 0.3945715478446503, + "learning_rate": 5.433959311724406e-06, + "loss": 0.0105, + "step": 5192 + }, + { + "epoch": 2.362602365787079, + "grad_norm": 0.4411210526095032, + "learning_rate": 5.432535391488821e-06, + "loss": 0.0128, + "step": 5193 + }, + { + "epoch": 2.3630573248407645, + "grad_norm": 0.4444325898238472, + "learning_rate": 5.431111435908396e-06, + "loss": 0.0121, + "step": 5194 + }, + { + "epoch": 2.3635122838944493, + "grad_norm": 0.31383347621890917, + "learning_rate": 5.429687445099493e-06, + "loss": 0.0147, + "step": 5195 + }, + { + "epoch": 2.3639672429481347, + "grad_norm": 0.506179144097311, + "learning_rate": 5.428263419178471e-06, + "loss": 0.0152, + "step": 5196 + }, + { + "epoch": 2.36442220200182, + "grad_norm": 0.31338140472658715, + "learning_rate": 5.4268393582616986e-06, + "loss": 0.0091, + "step": 5197 + }, + { + "epoch": 2.364877161055505, + "grad_norm": 0.6449391980468399, + "learning_rate": 5.425415262465539e-06, + "loss": 0.0112, + "step": 5198 + }, + { + "epoch": 2.36533212010919, + "grad_norm": 0.2714619504084544, + "learning_rate": 5.423991131906366e-06, + "loss": 0.0095, + "step": 5199 + }, + { + "epoch": 2.3657870791628755, + "grad_norm": 0.44637208265522926, + "learning_rate": 5.422566966700553e-06, + "loss": 0.0178, + "step": 5200 + }, + { + "epoch": 2.3662420382165603, + "grad_norm": 0.44473690474266697, + "learning_rate": 5.421142766964475e-06, + "loss": 0.0128, + "step": 5201 + }, + { + "epoch": 2.3666969972702456, + "grad_norm": 0.3251591627585468, + "learning_rate": 5.419718532814513e-06, + "loss": 0.0108, + "step": 5202 + }, + { + "epoch": 2.367151956323931, + "grad_norm": 0.43871324002278006, + "learning_rate": 5.418294264367046e-06, + "loss": 0.0184, + "step": 5203 + }, + { + "epoch": 2.367606915377616, + "grad_norm": 0.32143958977435755, + "learning_rate": 5.416869961738463e-06, + "loss": 0.0084, + "step": 5204 + }, + { + "epoch": 2.368061874431301, + "grad_norm": 0.6047381473561789, + "learning_rate": 5.415445625045148e-06, + "loss": 0.0222, + "step": 5205 + }, + { + "epoch": 2.3685168334849864, + "grad_norm": 0.2775712802500097, + "learning_rate": 5.414021254403493e-06, + "loss": 0.0086, + "step": 5206 + }, + { + "epoch": 2.3689717925386713, + "grad_norm": 0.3913688146883563, + "learning_rate": 5.412596849929892e-06, + "loss": 0.0123, + "step": 5207 + }, + { + "epoch": 2.3694267515923566, + "grad_norm": 0.4180126345527299, + "learning_rate": 5.411172411740737e-06, + "loss": 0.0127, + "step": 5208 + }, + { + "epoch": 2.369881710646042, + "grad_norm": 0.4145520895468016, + "learning_rate": 5.409747939952432e-06, + "loss": 0.0115, + "step": 5209 + }, + { + "epoch": 2.370336669699727, + "grad_norm": 0.46984567291181023, + "learning_rate": 5.408323434681375e-06, + "loss": 0.0119, + "step": 5210 + }, + { + "epoch": 2.370791628753412, + "grad_norm": 0.4370725716934907, + "learning_rate": 5.40689889604397e-06, + "loss": 0.0155, + "step": 5211 + }, + { + "epoch": 2.3712465878070974, + "grad_norm": 0.24972957311155186, + "learning_rate": 5.4054743241566255e-06, + "loss": 0.006, + "step": 5212 + }, + { + "epoch": 2.3717015468607827, + "grad_norm": 0.42555167464637306, + "learning_rate": 5.404049719135749e-06, + "loss": 0.0101, + "step": 5213 + }, + { + "epoch": 2.3721565059144676, + "grad_norm": 0.2732869198235925, + "learning_rate": 5.4026250810977565e-06, + "loss": 0.0061, + "step": 5214 + }, + { + "epoch": 2.372611464968153, + "grad_norm": 0.23664705125295948, + "learning_rate": 5.401200410159059e-06, + "loss": 0.0042, + "step": 5215 + }, + { + "epoch": 2.373066424021838, + "grad_norm": 0.3954694984893938, + "learning_rate": 5.3997757064360756e-06, + "loss": 0.013, + "step": 5216 + }, + { + "epoch": 2.373521383075523, + "grad_norm": 0.18180273546718598, + "learning_rate": 5.398350970045229e-06, + "loss": 0.0047, + "step": 5217 + }, + { + "epoch": 2.3739763421292084, + "grad_norm": 0.31477234286448397, + "learning_rate": 5.396926201102937e-06, + "loss": 0.0057, + "step": 5218 + }, + { + "epoch": 2.3744313011828937, + "grad_norm": 0.36759194140173296, + "learning_rate": 5.39550139972563e-06, + "loss": 0.0092, + "step": 5219 + }, + { + "epoch": 2.3748862602365786, + "grad_norm": 0.442239633811156, + "learning_rate": 5.394076566029733e-06, + "loss": 0.0204, + "step": 5220 + }, + { + "epoch": 2.375341219290264, + "grad_norm": 0.37882384654604506, + "learning_rate": 5.392651700131681e-06, + "loss": 0.0138, + "step": 5221 + }, + { + "epoch": 2.375796178343949, + "grad_norm": 0.45700123117896185, + "learning_rate": 5.391226802147904e-06, + "loss": 0.0202, + "step": 5222 + }, + { + "epoch": 2.376251137397634, + "grad_norm": 0.4173548911856212, + "learning_rate": 5.38980187219484e-06, + "loss": 0.013, + "step": 5223 + }, + { + "epoch": 2.3767060964513194, + "grad_norm": 0.501526585754104, + "learning_rate": 5.388376910388928e-06, + "loss": 0.0146, + "step": 5224 + }, + { + "epoch": 2.3771610555050047, + "grad_norm": 0.4972966797438422, + "learning_rate": 5.386951916846608e-06, + "loss": 0.023, + "step": 5225 + }, + { + "epoch": 2.3776160145586895, + "grad_norm": 0.3498558355060645, + "learning_rate": 5.385526891684324e-06, + "loss": 0.0138, + "step": 5226 + }, + { + "epoch": 2.378070973612375, + "grad_norm": 0.3700683880723432, + "learning_rate": 5.3841018350185244e-06, + "loss": 0.0133, + "step": 5227 + }, + { + "epoch": 2.37852593266606, + "grad_norm": 0.5585648096050481, + "learning_rate": 5.3826767469656585e-06, + "loss": 0.0156, + "step": 5228 + }, + { + "epoch": 2.3789808917197455, + "grad_norm": 0.3331006105880638, + "learning_rate": 5.381251627642174e-06, + "loss": 0.0092, + "step": 5229 + }, + { + "epoch": 2.3794358507734303, + "grad_norm": 0.3079202123421543, + "learning_rate": 5.37982647716453e-06, + "loss": 0.0094, + "step": 5230 + }, + { + "epoch": 2.3798908098271156, + "grad_norm": 0.5613934800867718, + "learning_rate": 5.378401295649182e-06, + "loss": 0.0172, + "step": 5231 + }, + { + "epoch": 2.380345768880801, + "grad_norm": 0.3703230818626966, + "learning_rate": 5.3769760832125884e-06, + "loss": 0.0076, + "step": 5232 + }, + { + "epoch": 2.380800727934486, + "grad_norm": 0.39393991401324163, + "learning_rate": 5.375550839971212e-06, + "loss": 0.0088, + "step": 5233 + }, + { + "epoch": 2.381255686988171, + "grad_norm": 0.2322305193916463, + "learning_rate": 5.374125566041517e-06, + "loss": 0.0059, + "step": 5234 + }, + { + "epoch": 2.3817106460418564, + "grad_norm": 0.6230743002805157, + "learning_rate": 5.37270026153997e-06, + "loss": 0.0217, + "step": 5235 + }, + { + "epoch": 2.3821656050955413, + "grad_norm": 0.23889548023386997, + "learning_rate": 5.37127492658304e-06, + "loss": 0.0056, + "step": 5236 + }, + { + "epoch": 2.3826205641492266, + "grad_norm": 0.45250805993738163, + "learning_rate": 5.3698495612872e-06, + "loss": 0.0141, + "step": 5237 + }, + { + "epoch": 2.383075523202912, + "grad_norm": 0.4417927166255723, + "learning_rate": 5.368424165768925e-06, + "loss": 0.0163, + "step": 5238 + }, + { + "epoch": 2.383530482256597, + "grad_norm": 0.601485164623354, + "learning_rate": 5.366998740144691e-06, + "loss": 0.0226, + "step": 5239 + }, + { + "epoch": 2.383985441310282, + "grad_norm": 0.447241403498978, + "learning_rate": 5.365573284530976e-06, + "loss": 0.022, + "step": 5240 + }, + { + "epoch": 2.3844404003639674, + "grad_norm": 0.48628577475368706, + "learning_rate": 5.3641477990442645e-06, + "loss": 0.0203, + "step": 5241 + }, + { + "epoch": 2.3848953594176523, + "grad_norm": 0.42848094818559307, + "learning_rate": 5.362722283801038e-06, + "loss": 0.0078, + "step": 5242 + }, + { + "epoch": 2.3853503184713376, + "grad_norm": 0.35278326637047613, + "learning_rate": 5.361296738917785e-06, + "loss": 0.0132, + "step": 5243 + }, + { + "epoch": 2.385805277525023, + "grad_norm": 0.46467384475872836, + "learning_rate": 5.359871164510995e-06, + "loss": 0.0207, + "step": 5244 + }, + { + "epoch": 2.386260236578708, + "grad_norm": 0.4096938326628108, + "learning_rate": 5.3584455606971576e-06, + "loss": 0.0136, + "step": 5245 + }, + { + "epoch": 2.386715195632393, + "grad_norm": 0.3279067148690326, + "learning_rate": 5.357019927592769e-06, + "loss": 0.0071, + "step": 5246 + }, + { + "epoch": 2.3871701546860784, + "grad_norm": 0.32010669985804296, + "learning_rate": 5.355594265314321e-06, + "loss": 0.0103, + "step": 5247 + }, + { + "epoch": 2.3876251137397633, + "grad_norm": 0.4281430187657181, + "learning_rate": 5.3541685739783186e-06, + "loss": 0.0089, + "step": 5248 + }, + { + "epoch": 2.3880800727934486, + "grad_norm": 0.6972921855537589, + "learning_rate": 5.352742853701259e-06, + "loss": 0.0188, + "step": 5249 + }, + { + "epoch": 2.388535031847134, + "grad_norm": 0.3369695950732781, + "learning_rate": 5.351317104599646e-06, + "loss": 0.0118, + "step": 5250 + }, + { + "epoch": 2.3889899909008188, + "grad_norm": 0.38649180583281173, + "learning_rate": 5.3498913267899864e-06, + "loss": 0.0102, + "step": 5251 + }, + { + "epoch": 2.389444949954504, + "grad_norm": 0.2538962592519053, + "learning_rate": 5.348465520388787e-06, + "loss": 0.0085, + "step": 5252 + }, + { + "epoch": 2.3898999090081894, + "grad_norm": 0.40680922435796585, + "learning_rate": 5.34703968551256e-06, + "loss": 0.0115, + "step": 5253 + }, + { + "epoch": 2.3903548680618742, + "grad_norm": 0.4516969702412574, + "learning_rate": 5.345613822277815e-06, + "loss": 0.0164, + "step": 5254 + }, + { + "epoch": 2.3908098271155596, + "grad_norm": 0.43021217049594745, + "learning_rate": 5.344187930801072e-06, + "loss": 0.0122, + "step": 5255 + }, + { + "epoch": 2.391264786169245, + "grad_norm": 0.5015071545210491, + "learning_rate": 5.342762011198843e-06, + "loss": 0.0119, + "step": 5256 + }, + { + "epoch": 2.3917197452229297, + "grad_norm": 0.6970788943828632, + "learning_rate": 5.341336063587651e-06, + "loss": 0.0208, + "step": 5257 + }, + { + "epoch": 2.392174704276615, + "grad_norm": 0.4780454188193996, + "learning_rate": 5.3399100880840185e-06, + "loss": 0.0117, + "step": 5258 + }, + { + "epoch": 2.3926296633303004, + "grad_norm": 0.5264740389456065, + "learning_rate": 5.338484084804467e-06, + "loss": 0.009, + "step": 5259 + }, + { + "epoch": 2.3930846223839852, + "grad_norm": 0.3439050966830248, + "learning_rate": 5.337058053865527e-06, + "loss": 0.0106, + "step": 5260 + }, + { + "epoch": 2.3935395814376705, + "grad_norm": 0.6715181229719871, + "learning_rate": 5.335631995383722e-06, + "loss": 0.0247, + "step": 5261 + }, + { + "epoch": 2.393994540491356, + "grad_norm": 0.574221604583767, + "learning_rate": 5.3342059094755885e-06, + "loss": 0.0156, + "step": 5262 + }, + { + "epoch": 2.3944494995450407, + "grad_norm": 0.47373211505761376, + "learning_rate": 5.332779796257656e-06, + "loss": 0.0119, + "step": 5263 + }, + { + "epoch": 2.394904458598726, + "grad_norm": 0.32577636597481435, + "learning_rate": 5.331353655846462e-06, + "loss": 0.0146, + "step": 5264 + }, + { + "epoch": 2.3953594176524113, + "grad_norm": 0.41695373722744733, + "learning_rate": 5.329927488358544e-06, + "loss": 0.0121, + "step": 5265 + }, + { + "epoch": 2.395814376706096, + "grad_norm": 0.31340022193006356, + "learning_rate": 5.3285012939104395e-06, + "loss": 0.0076, + "step": 5266 + }, + { + "epoch": 2.3962693357597815, + "grad_norm": 0.42920487838433324, + "learning_rate": 5.327075072618696e-06, + "loss": 0.0117, + "step": 5267 + }, + { + "epoch": 2.396724294813467, + "grad_norm": 0.36684847916393465, + "learning_rate": 5.325648824599853e-06, + "loss": 0.0117, + "step": 5268 + }, + { + "epoch": 2.397179253867152, + "grad_norm": 0.3279088478753296, + "learning_rate": 5.324222549970458e-06, + "loss": 0.0146, + "step": 5269 + }, + { + "epoch": 2.397634212920837, + "grad_norm": 0.6377109395841157, + "learning_rate": 5.322796248847062e-06, + "loss": 0.0188, + "step": 5270 + }, + { + "epoch": 2.3980891719745223, + "grad_norm": 0.24823435844273126, + "learning_rate": 5.321369921346211e-06, + "loss": 0.0063, + "step": 5271 + }, + { + "epoch": 2.3985441310282076, + "grad_norm": 0.3508243366725432, + "learning_rate": 5.3199435675844644e-06, + "loss": 0.011, + "step": 5272 + }, + { + "epoch": 2.3989990900818925, + "grad_norm": 0.35664589258764695, + "learning_rate": 5.318517187678374e-06, + "loss": 0.0126, + "step": 5273 + }, + { + "epoch": 2.399454049135578, + "grad_norm": 0.49922827351419585, + "learning_rate": 5.317090781744497e-06, + "loss": 0.0134, + "step": 5274 + }, + { + "epoch": 2.399909008189263, + "grad_norm": 0.42920600718528423, + "learning_rate": 5.315664349899393e-06, + "loss": 0.0197, + "step": 5275 + }, + { + "epoch": 2.400363967242948, + "grad_norm": 0.3636947535999247, + "learning_rate": 5.314237892259624e-06, + "loss": 0.0129, + "step": 5276 + }, + { + "epoch": 2.4008189262966333, + "grad_norm": 0.4114786573539662, + "learning_rate": 5.312811408941753e-06, + "loss": 0.0099, + "step": 5277 + }, + { + "epoch": 2.4012738853503186, + "grad_norm": 0.4031041535549888, + "learning_rate": 5.311384900062346e-06, + "loss": 0.0159, + "step": 5278 + }, + { + "epoch": 2.4017288444040035, + "grad_norm": 0.6712437758510306, + "learning_rate": 5.309958365737973e-06, + "loss": 0.0182, + "step": 5279 + }, + { + "epoch": 2.402183803457689, + "grad_norm": 0.43264061757597233, + "learning_rate": 5.308531806085202e-06, + "loss": 0.0159, + "step": 5280 + }, + { + "epoch": 2.402638762511374, + "grad_norm": 0.26441178822412903, + "learning_rate": 5.307105221220604e-06, + "loss": 0.0067, + "step": 5281 + }, + { + "epoch": 2.403093721565059, + "grad_norm": 0.4404124093235634, + "learning_rate": 5.3056786112607535e-06, + "loss": 0.0111, + "step": 5282 + }, + { + "epoch": 2.4035486806187443, + "grad_norm": 0.42989271959707304, + "learning_rate": 5.304251976322229e-06, + "loss": 0.0119, + "step": 5283 + }, + { + "epoch": 2.4040036396724296, + "grad_norm": 0.4631793935562148, + "learning_rate": 5.302825316521607e-06, + "loss": 0.0143, + "step": 5284 + }, + { + "epoch": 2.404458598726115, + "grad_norm": 0.58463774112294, + "learning_rate": 5.301398631975466e-06, + "loss": 0.0208, + "step": 5285 + }, + { + "epoch": 2.4049135577797998, + "grad_norm": 0.5531017946934959, + "learning_rate": 5.299971922800391e-06, + "loss": 0.0287, + "step": 5286 + }, + { + "epoch": 2.405368516833485, + "grad_norm": 0.33464890796105184, + "learning_rate": 5.298545189112966e-06, + "loss": 0.01, + "step": 5287 + }, + { + "epoch": 2.4058234758871704, + "grad_norm": 0.40252846455148866, + "learning_rate": 5.297118431029775e-06, + "loss": 0.0175, + "step": 5288 + }, + { + "epoch": 2.4062784349408552, + "grad_norm": 0.28830007619590997, + "learning_rate": 5.295691648667407e-06, + "loss": 0.0079, + "step": 5289 + }, + { + "epoch": 2.4067333939945406, + "grad_norm": 0.36136838045868824, + "learning_rate": 5.294264842142454e-06, + "loss": 0.0112, + "step": 5290 + }, + { + "epoch": 2.407188353048226, + "grad_norm": 0.42706958288079705, + "learning_rate": 5.292838011571507e-06, + "loss": 0.0103, + "step": 5291 + }, + { + "epoch": 2.4076433121019107, + "grad_norm": 0.5404021820517108, + "learning_rate": 5.2914111570711605e-06, + "loss": 0.0141, + "step": 5292 + }, + { + "epoch": 2.408098271155596, + "grad_norm": 0.40642751021129225, + "learning_rate": 5.289984278758009e-06, + "loss": 0.0153, + "step": 5293 + }, + { + "epoch": 2.4085532302092814, + "grad_norm": 0.30578502453889306, + "learning_rate": 5.2885573767486535e-06, + "loss": 0.0113, + "step": 5294 + }, + { + "epoch": 2.4090081892629662, + "grad_norm": 0.3209377054110551, + "learning_rate": 5.28713045115969e-06, + "loss": 0.0114, + "step": 5295 + }, + { + "epoch": 2.4094631483166515, + "grad_norm": 0.48096854465015776, + "learning_rate": 5.285703502107724e-06, + "loss": 0.017, + "step": 5296 + }, + { + "epoch": 2.409918107370337, + "grad_norm": 0.4219490254103711, + "learning_rate": 5.284276529709358e-06, + "loss": 0.0125, + "step": 5297 + }, + { + "epoch": 2.4103730664240217, + "grad_norm": 0.4636782537011902, + "learning_rate": 5.282849534081198e-06, + "loss": 0.0191, + "step": 5298 + }, + { + "epoch": 2.410828025477707, + "grad_norm": 0.42315034221489234, + "learning_rate": 5.28142251533985e-06, + "loss": 0.0175, + "step": 5299 + }, + { + "epoch": 2.4112829845313923, + "grad_norm": 0.4673610840212676, + "learning_rate": 5.2799954736019264e-06, + "loss": 0.016, + "step": 5300 + }, + { + "epoch": 2.411737943585077, + "grad_norm": 0.40097914710543014, + "learning_rate": 5.2785684089840375e-06, + "loss": 0.0146, + "step": 5301 + }, + { + "epoch": 2.4121929026387625, + "grad_norm": 0.445243466924135, + "learning_rate": 5.277141321602795e-06, + "loss": 0.0173, + "step": 5302 + }, + { + "epoch": 2.412647861692448, + "grad_norm": 0.37737962994500585, + "learning_rate": 5.275714211574816e-06, + "loss": 0.0106, + "step": 5303 + }, + { + "epoch": 2.4131028207461327, + "grad_norm": 0.4112204646422568, + "learning_rate": 5.274287079016717e-06, + "loss": 0.0157, + "step": 5304 + }, + { + "epoch": 2.413557779799818, + "grad_norm": 0.33025822861785786, + "learning_rate": 5.272859924045116e-06, + "loss": 0.0113, + "step": 5305 + }, + { + "epoch": 2.4140127388535033, + "grad_norm": 0.3300443428375169, + "learning_rate": 5.2714327467766335e-06, + "loss": 0.0107, + "step": 5306 + }, + { + "epoch": 2.414467697907188, + "grad_norm": 0.40680442981426845, + "learning_rate": 5.2700055473278935e-06, + "loss": 0.0111, + "step": 5307 + }, + { + "epoch": 2.4149226569608735, + "grad_norm": 0.3689628882005614, + "learning_rate": 5.268578325815521e-06, + "loss": 0.0149, + "step": 5308 + }, + { + "epoch": 2.415377616014559, + "grad_norm": 0.3866694235292162, + "learning_rate": 5.267151082356138e-06, + "loss": 0.0103, + "step": 5309 + }, + { + "epoch": 2.4158325750682437, + "grad_norm": 0.3141065437405475, + "learning_rate": 5.265723817066376e-06, + "loss": 0.0097, + "step": 5310 + }, + { + "epoch": 2.416287534121929, + "grad_norm": 0.27490602838646616, + "learning_rate": 5.264296530062865e-06, + "loss": 0.0079, + "step": 5311 + }, + { + "epoch": 2.4167424931756143, + "grad_norm": 0.38772971732292993, + "learning_rate": 5.262869221462234e-06, + "loss": 0.0061, + "step": 5312 + }, + { + "epoch": 2.417197452229299, + "grad_norm": 0.26314076127170805, + "learning_rate": 5.261441891381116e-06, + "loss": 0.0087, + "step": 5313 + }, + { + "epoch": 2.4176524112829845, + "grad_norm": 0.2792754343780329, + "learning_rate": 5.260014539936148e-06, + "loss": 0.0091, + "step": 5314 + }, + { + "epoch": 2.41810737033667, + "grad_norm": 0.6497302976664651, + "learning_rate": 5.258587167243968e-06, + "loss": 0.0184, + "step": 5315 + }, + { + "epoch": 2.4185623293903546, + "grad_norm": 0.4894356800746713, + "learning_rate": 5.257159773421211e-06, + "loss": 0.0137, + "step": 5316 + }, + { + "epoch": 2.41901728844404, + "grad_norm": 0.38163565505543456, + "learning_rate": 5.255732358584517e-06, + "loss": 0.0171, + "step": 5317 + }, + { + "epoch": 2.4194722474977253, + "grad_norm": 0.3234146579281654, + "learning_rate": 5.2543049228505326e-06, + "loss": 0.007, + "step": 5318 + }, + { + "epoch": 2.41992720655141, + "grad_norm": 0.44372590044724763, + "learning_rate": 5.252877466335897e-06, + "loss": 0.0182, + "step": 5319 + }, + { + "epoch": 2.4203821656050954, + "grad_norm": 0.5020596134338452, + "learning_rate": 5.251449989157257e-06, + "loss": 0.0227, + "step": 5320 + }, + { + "epoch": 2.4208371246587808, + "grad_norm": 0.4087790160266453, + "learning_rate": 5.250022491431259e-06, + "loss": 0.0189, + "step": 5321 + }, + { + "epoch": 2.421292083712466, + "grad_norm": 0.29776606030221714, + "learning_rate": 5.2485949732745525e-06, + "loss": 0.0064, + "step": 5322 + }, + { + "epoch": 2.421747042766151, + "grad_norm": 0.4447913938555693, + "learning_rate": 5.247167434803787e-06, + "loss": 0.0133, + "step": 5323 + }, + { + "epoch": 2.4222020018198362, + "grad_norm": 0.46365246362920237, + "learning_rate": 5.245739876135615e-06, + "loss": 0.0173, + "step": 5324 + }, + { + "epoch": 2.4226569608735216, + "grad_norm": 0.5178806777130259, + "learning_rate": 5.244312297386691e-06, + "loss": 0.0115, + "step": 5325 + }, + { + "epoch": 2.4231119199272064, + "grad_norm": 0.35093793454524563, + "learning_rate": 5.242884698673668e-06, + "loss": 0.0114, + "step": 5326 + }, + { + "epoch": 2.4235668789808917, + "grad_norm": 0.28422567507689994, + "learning_rate": 5.241457080113205e-06, + "loss": 0.0081, + "step": 5327 + }, + { + "epoch": 2.424021838034577, + "grad_norm": 0.48736935552274546, + "learning_rate": 5.24002944182196e-06, + "loss": 0.0189, + "step": 5328 + }, + { + "epoch": 2.424476797088262, + "grad_norm": 0.2964000798583057, + "learning_rate": 5.2386017839165925e-06, + "loss": 0.0147, + "step": 5329 + }, + { + "epoch": 2.4249317561419472, + "grad_norm": 0.5907284720265954, + "learning_rate": 5.237174106513764e-06, + "loss": 0.0095, + "step": 5330 + }, + { + "epoch": 2.4253867151956325, + "grad_norm": 0.4737712799278728, + "learning_rate": 5.235746409730139e-06, + "loss": 0.0141, + "step": 5331 + }, + { + "epoch": 2.4258416742493174, + "grad_norm": 0.5550673202170863, + "learning_rate": 5.234318693682384e-06, + "loss": 0.0173, + "step": 5332 + }, + { + "epoch": 2.4262966333030027, + "grad_norm": 0.14115228212962203, + "learning_rate": 5.232890958487162e-06, + "loss": 0.0018, + "step": 5333 + }, + { + "epoch": 2.426751592356688, + "grad_norm": 0.4631423032957736, + "learning_rate": 5.231463204261142e-06, + "loss": 0.0193, + "step": 5334 + }, + { + "epoch": 2.427206551410373, + "grad_norm": 0.4235314876677975, + "learning_rate": 5.2300354311209955e-06, + "loss": 0.0111, + "step": 5335 + }, + { + "epoch": 2.427661510464058, + "grad_norm": 0.4684844540332601, + "learning_rate": 5.228607639183392e-06, + "loss": 0.0198, + "step": 5336 + }, + { + "epoch": 2.4281164695177435, + "grad_norm": 0.32457884621106575, + "learning_rate": 5.227179828565003e-06, + "loss": 0.0089, + "step": 5337 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.23451179427681904, + "learning_rate": 5.225751999382507e-06, + "loss": 0.0068, + "step": 5338 + }, + { + "epoch": 2.4290263876251137, + "grad_norm": 0.23318095670248587, + "learning_rate": 5.224324151752575e-06, + "loss": 0.0063, + "step": 5339 + }, + { + "epoch": 2.429481346678799, + "grad_norm": 0.4347170570515523, + "learning_rate": 5.222896285791889e-06, + "loss": 0.0163, + "step": 5340 + }, + { + "epoch": 2.4299363057324843, + "grad_norm": 0.21176806980673593, + "learning_rate": 5.221468401617121e-06, + "loss": 0.0036, + "step": 5341 + }, + { + "epoch": 2.430391264786169, + "grad_norm": 0.3457671056366103, + "learning_rate": 5.220040499344958e-06, + "loss": 0.0077, + "step": 5342 + }, + { + "epoch": 2.4308462238398545, + "grad_norm": 0.32392634294488304, + "learning_rate": 5.2186125790920796e-06, + "loss": 0.0056, + "step": 5343 + }, + { + "epoch": 2.43130118289354, + "grad_norm": 0.34963533307201805, + "learning_rate": 5.217184640975167e-06, + "loss": 0.0074, + "step": 5344 + }, + { + "epoch": 2.4317561419472247, + "grad_norm": 0.5346383300170781, + "learning_rate": 5.2157566851109075e-06, + "loss": 0.0164, + "step": 5345 + }, + { + "epoch": 2.43221110100091, + "grad_norm": 0.4217985939266549, + "learning_rate": 5.214328711615984e-06, + "loss": 0.0133, + "step": 5346 + }, + { + "epoch": 2.4326660600545953, + "grad_norm": 0.4333819173618331, + "learning_rate": 5.2129007206070894e-06, + "loss": 0.0081, + "step": 5347 + }, + { + "epoch": 2.43312101910828, + "grad_norm": 0.49905999345083274, + "learning_rate": 5.211472712200905e-06, + "loss": 0.0105, + "step": 5348 + }, + { + "epoch": 2.4335759781619655, + "grad_norm": 0.45412017248375747, + "learning_rate": 5.210044686514129e-06, + "loss": 0.0113, + "step": 5349 + }, + { + "epoch": 2.434030937215651, + "grad_norm": 0.30414770683658676, + "learning_rate": 5.20861664366345e-06, + "loss": 0.0054, + "step": 5350 + }, + { + "epoch": 2.4344858962693356, + "grad_norm": 0.48930476002695283, + "learning_rate": 5.207188583765559e-06, + "loss": 0.016, + "step": 5351 + }, + { + "epoch": 2.434940855323021, + "grad_norm": 0.4009965835101347, + "learning_rate": 5.205760506937155e-06, + "loss": 0.0092, + "step": 5352 + }, + { + "epoch": 2.4353958143767063, + "grad_norm": 0.41809624341181123, + "learning_rate": 5.204332413294929e-06, + "loss": 0.0152, + "step": 5353 + }, + { + "epoch": 2.435850773430391, + "grad_norm": 0.21306333496842877, + "learning_rate": 5.202904302955582e-06, + "loss": 0.0039, + "step": 5354 + }, + { + "epoch": 2.4363057324840764, + "grad_norm": 0.48254684653454416, + "learning_rate": 5.201476176035813e-06, + "loss": 0.0167, + "step": 5355 + }, + { + "epoch": 2.4367606915377618, + "grad_norm": 0.4504698553713479, + "learning_rate": 5.200048032652319e-06, + "loss": 0.0139, + "step": 5356 + }, + { + "epoch": 2.4372156505914466, + "grad_norm": 0.6854774387741476, + "learning_rate": 5.198619872921804e-06, + "loss": 0.0226, + "step": 5357 + }, + { + "epoch": 2.437670609645132, + "grad_norm": 0.41483620737618115, + "learning_rate": 5.1971916969609685e-06, + "loss": 0.0132, + "step": 5358 + }, + { + "epoch": 2.4381255686988172, + "grad_norm": 0.5129994167682194, + "learning_rate": 5.195763504886518e-06, + "loss": 0.0261, + "step": 5359 + }, + { + "epoch": 2.438580527752502, + "grad_norm": 0.41405688069575036, + "learning_rate": 5.19433529681516e-06, + "loss": 0.0067, + "step": 5360 + }, + { + "epoch": 2.4390354868061874, + "grad_norm": 0.43578772096858115, + "learning_rate": 5.1929070728635985e-06, + "loss": 0.0159, + "step": 5361 + }, + { + "epoch": 2.4394904458598727, + "grad_norm": 0.31965492899761083, + "learning_rate": 5.1914788331485424e-06, + "loss": 0.0079, + "step": 5362 + }, + { + "epoch": 2.4399454049135576, + "grad_norm": 0.40154663104201666, + "learning_rate": 5.190050577786699e-06, + "loss": 0.0074, + "step": 5363 + }, + { + "epoch": 2.440400363967243, + "grad_norm": 0.44931254718914193, + "learning_rate": 5.188622306894783e-06, + "loss": 0.0106, + "step": 5364 + }, + { + "epoch": 2.4408553230209282, + "grad_norm": 0.33437106617358336, + "learning_rate": 5.187194020589501e-06, + "loss": 0.005, + "step": 5365 + }, + { + "epoch": 2.441310282074613, + "grad_norm": 0.20199436605898388, + "learning_rate": 5.185765718987571e-06, + "loss": 0.0025, + "step": 5366 + }, + { + "epoch": 2.4417652411282984, + "grad_norm": 0.46926903808957776, + "learning_rate": 5.184337402205705e-06, + "loss": 0.0194, + "step": 5367 + }, + { + "epoch": 2.4422202001819837, + "grad_norm": 0.367596857437341, + "learning_rate": 5.18290907036062e-06, + "loss": 0.0084, + "step": 5368 + }, + { + "epoch": 2.4426751592356686, + "grad_norm": 0.1327251181749855, + "learning_rate": 5.18148072356903e-06, + "loss": 0.0025, + "step": 5369 + }, + { + "epoch": 2.443130118289354, + "grad_norm": 0.4147742303890549, + "learning_rate": 5.180052361947656e-06, + "loss": 0.0104, + "step": 5370 + }, + { + "epoch": 2.443585077343039, + "grad_norm": 0.4630253091530012, + "learning_rate": 5.178623985613216e-06, + "loss": 0.0199, + "step": 5371 + }, + { + "epoch": 2.444040036396724, + "grad_norm": 0.34218287491377375, + "learning_rate": 5.177195594682431e-06, + "loss": 0.0108, + "step": 5372 + }, + { + "epoch": 2.4444949954504094, + "grad_norm": 0.409352010746614, + "learning_rate": 5.17576718927202e-06, + "loss": 0.0073, + "step": 5373 + }, + { + "epoch": 2.4449499545040947, + "grad_norm": 0.36350434011146243, + "learning_rate": 5.174338769498711e-06, + "loss": 0.0118, + "step": 5374 + }, + { + "epoch": 2.4454049135577796, + "grad_norm": 0.33996923505627474, + "learning_rate": 5.172910335479223e-06, + "loss": 0.0141, + "step": 5375 + }, + { + "epoch": 2.445859872611465, + "grad_norm": 0.5691354080089669, + "learning_rate": 5.171481887330283e-06, + "loss": 0.0275, + "step": 5376 + }, + { + "epoch": 2.44631483166515, + "grad_norm": 0.587573769102348, + "learning_rate": 5.170053425168619e-06, + "loss": 0.0171, + "step": 5377 + }, + { + "epoch": 2.4467697907188355, + "grad_norm": 0.4951503647581403, + "learning_rate": 5.168624949110956e-06, + "loss": 0.0107, + "step": 5378 + }, + { + "epoch": 2.4472247497725204, + "grad_norm": 0.5222877485943915, + "learning_rate": 5.167196459274024e-06, + "loss": 0.0233, + "step": 5379 + }, + { + "epoch": 2.4476797088262057, + "grad_norm": 0.3857745059781012, + "learning_rate": 5.165767955774554e-06, + "loss": 0.0122, + "step": 5380 + }, + { + "epoch": 2.448134667879891, + "grad_norm": 0.4280925056961366, + "learning_rate": 5.164339438729273e-06, + "loss": 0.0097, + "step": 5381 + }, + { + "epoch": 2.448589626933576, + "grad_norm": 0.4134588427952028, + "learning_rate": 5.162910908254917e-06, + "loss": 0.0167, + "step": 5382 + }, + { + "epoch": 2.449044585987261, + "grad_norm": 0.41863303981195715, + "learning_rate": 5.161482364468216e-06, + "loss": 0.0254, + "step": 5383 + }, + { + "epoch": 2.4494995450409465, + "grad_norm": 0.29417651115854027, + "learning_rate": 5.1600538074859045e-06, + "loss": 0.0059, + "step": 5384 + }, + { + "epoch": 2.4499545040946313, + "grad_norm": 0.3633088275042752, + "learning_rate": 5.15862523742472e-06, + "loss": 0.0115, + "step": 5385 + }, + { + "epoch": 2.4504094631483166, + "grad_norm": 0.44125434053934054, + "learning_rate": 5.157196654401397e-06, + "loss": 0.0157, + "step": 5386 + }, + { + "epoch": 2.450864422202002, + "grad_norm": 0.2642282335305633, + "learning_rate": 5.155768058532674e-06, + "loss": 0.0061, + "step": 5387 + }, + { + "epoch": 2.451319381255687, + "grad_norm": 0.2843265353073583, + "learning_rate": 5.15433944993529e-06, + "loss": 0.0105, + "step": 5388 + }, + { + "epoch": 2.451774340309372, + "grad_norm": 0.4197881250600456, + "learning_rate": 5.15291082872598e-06, + "loss": 0.0063, + "step": 5389 + }, + { + "epoch": 2.4522292993630574, + "grad_norm": 0.4789517931451006, + "learning_rate": 5.151482195021489e-06, + "loss": 0.0234, + "step": 5390 + }, + { + "epoch": 2.4526842584167423, + "grad_norm": 0.4717176046669697, + "learning_rate": 5.150053548938557e-06, + "loss": 0.022, + "step": 5391 + }, + { + "epoch": 2.4531392174704276, + "grad_norm": 0.23207011911006153, + "learning_rate": 5.148624890593927e-06, + "loss": 0.0062, + "step": 5392 + }, + { + "epoch": 2.453594176524113, + "grad_norm": 0.459972570924796, + "learning_rate": 5.147196220104342e-06, + "loss": 0.017, + "step": 5393 + }, + { + "epoch": 2.4540491355777982, + "grad_norm": 0.5567722874176256, + "learning_rate": 5.145767537586546e-06, + "loss": 0.0124, + "step": 5394 + }, + { + "epoch": 2.454504094631483, + "grad_norm": 0.30209411330804076, + "learning_rate": 5.144338843157286e-06, + "loss": 0.0096, + "step": 5395 + }, + { + "epoch": 2.4549590536851684, + "grad_norm": 0.2519017471529283, + "learning_rate": 5.1429101369333065e-06, + "loss": 0.0072, + "step": 5396 + }, + { + "epoch": 2.4554140127388537, + "grad_norm": 0.6563956864761441, + "learning_rate": 5.141481419031357e-06, + "loss": 0.0187, + "step": 5397 + }, + { + "epoch": 2.4558689717925386, + "grad_norm": 0.4708398496905736, + "learning_rate": 5.140052689568185e-06, + "loss": 0.0162, + "step": 5398 + }, + { + "epoch": 2.456323930846224, + "grad_norm": 0.17379437121204325, + "learning_rate": 5.1386239486605394e-06, + "loss": 0.0034, + "step": 5399 + }, + { + "epoch": 2.4567788898999092, + "grad_norm": 0.2611660630951624, + "learning_rate": 5.1371951964251695e-06, + "loss": 0.0078, + "step": 5400 + }, + { + "epoch": 2.457233848953594, + "grad_norm": 0.3978398002330195, + "learning_rate": 5.135766432978829e-06, + "loss": 0.0127, + "step": 5401 + }, + { + "epoch": 2.4576888080072794, + "grad_norm": 0.2922957300568856, + "learning_rate": 5.134337658438269e-06, + "loss": 0.0074, + "step": 5402 + }, + { + "epoch": 2.4581437670609647, + "grad_norm": 0.2539151326498751, + "learning_rate": 5.132908872920242e-06, + "loss": 0.0103, + "step": 5403 + }, + { + "epoch": 2.4585987261146496, + "grad_norm": 0.458182720404031, + "learning_rate": 5.1314800765415014e-06, + "loss": 0.0092, + "step": 5404 + }, + { + "epoch": 2.459053685168335, + "grad_norm": 0.3001758441171674, + "learning_rate": 5.130051269418804e-06, + "loss": 0.0062, + "step": 5405 + }, + { + "epoch": 2.45950864422202, + "grad_norm": 0.6882974671921861, + "learning_rate": 5.128622451668903e-06, + "loss": 0.0309, + "step": 5406 + }, + { + "epoch": 2.459963603275705, + "grad_norm": 0.49732412901573037, + "learning_rate": 5.127193623408556e-06, + "loss": 0.0141, + "step": 5407 + }, + { + "epoch": 2.4604185623293904, + "grad_norm": 0.41004480864420567, + "learning_rate": 5.125764784754521e-06, + "loss": 0.013, + "step": 5408 + }, + { + "epoch": 2.4608735213830757, + "grad_norm": 0.41635378566858544, + "learning_rate": 5.1243359358235555e-06, + "loss": 0.0145, + "step": 5409 + }, + { + "epoch": 2.4613284804367606, + "grad_norm": 0.5225584751956076, + "learning_rate": 5.12290707673242e-06, + "loss": 0.0136, + "step": 5410 + }, + { + "epoch": 2.461783439490446, + "grad_norm": 0.40487768289825765, + "learning_rate": 5.121478207597871e-06, + "loss": 0.0118, + "step": 5411 + }, + { + "epoch": 2.462238398544131, + "grad_norm": 0.32717360369468845, + "learning_rate": 5.120049328536674e-06, + "loss": 0.0094, + "step": 5412 + }, + { + "epoch": 2.462693357597816, + "grad_norm": 0.48756100640197947, + "learning_rate": 5.1186204396655855e-06, + "loss": 0.0178, + "step": 5413 + }, + { + "epoch": 2.4631483166515014, + "grad_norm": 0.6624070111868631, + "learning_rate": 5.117191541101372e-06, + "loss": 0.012, + "step": 5414 + }, + { + "epoch": 2.4636032757051867, + "grad_norm": 0.6506693017472739, + "learning_rate": 5.115762632960795e-06, + "loss": 0.0213, + "step": 5415 + }, + { + "epoch": 2.4640582347588715, + "grad_norm": 0.2881670840004332, + "learning_rate": 5.114333715360617e-06, + "loss": 0.0071, + "step": 5416 + }, + { + "epoch": 2.464513193812557, + "grad_norm": 0.26900122959065437, + "learning_rate": 5.1129047884176065e-06, + "loss": 0.0089, + "step": 5417 + }, + { + "epoch": 2.464968152866242, + "grad_norm": 0.3812297870362603, + "learning_rate": 5.111475852248523e-06, + "loss": 0.0139, + "step": 5418 + }, + { + "epoch": 2.465423111919927, + "grad_norm": 0.4476381917589943, + "learning_rate": 5.11004690697014e-06, + "loss": 0.0116, + "step": 5419 + }, + { + "epoch": 2.4658780709736123, + "grad_norm": 0.37913712833335467, + "learning_rate": 5.1086179526992185e-06, + "loss": 0.0112, + "step": 5420 + }, + { + "epoch": 2.4663330300272976, + "grad_norm": 0.35300019232667945, + "learning_rate": 5.107188989552529e-06, + "loss": 0.0157, + "step": 5421 + }, + { + "epoch": 2.4667879890809825, + "grad_norm": 0.483643978357322, + "learning_rate": 5.105760017646839e-06, + "loss": 0.0147, + "step": 5422 + }, + { + "epoch": 2.467242948134668, + "grad_norm": 0.4084233464230104, + "learning_rate": 5.1043310370989184e-06, + "loss": 0.0174, + "step": 5423 + }, + { + "epoch": 2.467697907188353, + "grad_norm": 0.3258662727119082, + "learning_rate": 5.102902048025538e-06, + "loss": 0.013, + "step": 5424 + }, + { + "epoch": 2.468152866242038, + "grad_norm": 0.47279595459088114, + "learning_rate": 5.101473050543464e-06, + "loss": 0.0146, + "step": 5425 + }, + { + "epoch": 2.4686078252957233, + "grad_norm": 0.3251070762000244, + "learning_rate": 5.100044044769472e-06, + "loss": 0.0089, + "step": 5426 + }, + { + "epoch": 2.4690627843494086, + "grad_norm": 0.3201788561181316, + "learning_rate": 5.098615030820333e-06, + "loss": 0.0071, + "step": 5427 + }, + { + "epoch": 2.4695177434030935, + "grad_norm": 0.28274405624464927, + "learning_rate": 5.097186008812818e-06, + "loss": 0.0056, + "step": 5428 + }, + { + "epoch": 2.469972702456779, + "grad_norm": 0.2820569423373688, + "learning_rate": 5.095756978863702e-06, + "loss": 0.006, + "step": 5429 + }, + { + "epoch": 2.470427661510464, + "grad_norm": 0.4647302280554121, + "learning_rate": 5.094327941089758e-06, + "loss": 0.0096, + "step": 5430 + }, + { + "epoch": 2.470882620564149, + "grad_norm": 0.250235327250325, + "learning_rate": 5.0928988956077604e-06, + "loss": 0.0084, + "step": 5431 + }, + { + "epoch": 2.4713375796178343, + "grad_norm": 0.3826893361598013, + "learning_rate": 5.0914698425344845e-06, + "loss": 0.0087, + "step": 5432 + }, + { + "epoch": 2.4717925386715196, + "grad_norm": 0.8139476684262739, + "learning_rate": 5.090040781986706e-06, + "loss": 0.0239, + "step": 5433 + }, + { + "epoch": 2.472247497725205, + "grad_norm": 0.2749894403917113, + "learning_rate": 5.088611714081203e-06, + "loss": 0.0068, + "step": 5434 + }, + { + "epoch": 2.47270245677889, + "grad_norm": 0.5239176380706023, + "learning_rate": 5.0871826389347475e-06, + "loss": 0.0172, + "step": 5435 + }, + { + "epoch": 2.473157415832575, + "grad_norm": 0.4481873000513952, + "learning_rate": 5.085753556664124e-06, + "loss": 0.0102, + "step": 5436 + }, + { + "epoch": 2.4736123748862604, + "grad_norm": 0.3097559481978516, + "learning_rate": 5.084324467386106e-06, + "loss": 0.0129, + "step": 5437 + }, + { + "epoch": 2.4740673339399453, + "grad_norm": 0.3057121182463589, + "learning_rate": 5.0828953712174735e-06, + "loss": 0.0083, + "step": 5438 + }, + { + "epoch": 2.4745222929936306, + "grad_norm": 0.4079199912576415, + "learning_rate": 5.081466268275005e-06, + "loss": 0.0104, + "step": 5439 + }, + { + "epoch": 2.474977252047316, + "grad_norm": 0.3809173575195753, + "learning_rate": 5.080037158675481e-06, + "loss": 0.0136, + "step": 5440 + }, + { + "epoch": 2.4754322111010008, + "grad_norm": 0.30222294439440717, + "learning_rate": 5.078608042535682e-06, + "loss": 0.0091, + "step": 5441 + }, + { + "epoch": 2.475887170154686, + "grad_norm": 0.43467376785926043, + "learning_rate": 5.077178919972388e-06, + "loss": 0.0138, + "step": 5442 + }, + { + "epoch": 2.4763421292083714, + "grad_norm": 0.5629966441322612, + "learning_rate": 5.075749791102382e-06, + "loss": 0.0124, + "step": 5443 + }, + { + "epoch": 2.4767970882620562, + "grad_norm": 0.44402811668745085, + "learning_rate": 5.074320656042446e-06, + "loss": 0.0148, + "step": 5444 + }, + { + "epoch": 2.4772520473157416, + "grad_norm": 0.3843873930847924, + "learning_rate": 5.072891514909361e-06, + "loss": 0.0102, + "step": 5445 + }, + { + "epoch": 2.477707006369427, + "grad_norm": 0.32882781000696565, + "learning_rate": 5.071462367819909e-06, + "loss": 0.0119, + "step": 5446 + }, + { + "epoch": 2.4781619654231117, + "grad_norm": 0.42795726773632586, + "learning_rate": 5.070033214890876e-06, + "loss": 0.016, + "step": 5447 + }, + { + "epoch": 2.478616924476797, + "grad_norm": 0.22838737121675054, + "learning_rate": 5.068604056239046e-06, + "loss": 0.0043, + "step": 5448 + }, + { + "epoch": 2.4790718835304824, + "grad_norm": 0.34419758854472965, + "learning_rate": 5.067174891981201e-06, + "loss": 0.0078, + "step": 5449 + }, + { + "epoch": 2.4795268425841677, + "grad_norm": 0.6802361006015856, + "learning_rate": 5.065745722234128e-06, + "loss": 0.0235, + "step": 5450 + }, + { + "epoch": 2.4799818016378525, + "grad_norm": 0.3091308707042837, + "learning_rate": 5.064316547114612e-06, + "loss": 0.0052, + "step": 5451 + }, + { + "epoch": 2.480436760691538, + "grad_norm": 0.3555379741422837, + "learning_rate": 5.0628873667394364e-06, + "loss": 0.0081, + "step": 5452 + }, + { + "epoch": 2.480891719745223, + "grad_norm": 0.3114904842834978, + "learning_rate": 5.061458181225391e-06, + "loss": 0.0062, + "step": 5453 + }, + { + "epoch": 2.481346678798908, + "grad_norm": 0.3427768927053479, + "learning_rate": 5.060028990689259e-06, + "loss": 0.0087, + "step": 5454 + }, + { + "epoch": 2.4818016378525933, + "grad_norm": 0.49459219472705657, + "learning_rate": 5.05859979524783e-06, + "loss": 0.0158, + "step": 5455 + }, + { + "epoch": 2.4822565969062786, + "grad_norm": 0.30078272464870365, + "learning_rate": 5.057170595017891e-06, + "loss": 0.0099, + "step": 5456 + }, + { + "epoch": 2.4827115559599635, + "grad_norm": 0.2252201083771656, + "learning_rate": 5.055741390116227e-06, + "loss": 0.0036, + "step": 5457 + }, + { + "epoch": 2.483166515013649, + "grad_norm": 0.5446063405835339, + "learning_rate": 5.054312180659631e-06, + "loss": 0.0102, + "step": 5458 + }, + { + "epoch": 2.483621474067334, + "grad_norm": 0.43403876230490246, + "learning_rate": 5.0528829667648875e-06, + "loss": 0.0092, + "step": 5459 + }, + { + "epoch": 2.484076433121019, + "grad_norm": 0.45501792804101815, + "learning_rate": 5.051453748548786e-06, + "loss": 0.015, + "step": 5460 + }, + { + "epoch": 2.4845313921747043, + "grad_norm": 1.3807537003371522, + "learning_rate": 5.050024526128118e-06, + "loss": 0.0087, + "step": 5461 + }, + { + "epoch": 2.4849863512283896, + "grad_norm": 0.4206908833428577, + "learning_rate": 5.048595299619671e-06, + "loss": 0.018, + "step": 5462 + }, + { + "epoch": 2.4854413102820745, + "grad_norm": 0.5526855525115705, + "learning_rate": 5.047166069140235e-06, + "loss": 0.0197, + "step": 5463 + }, + { + "epoch": 2.48589626933576, + "grad_norm": 0.43880393214773, + "learning_rate": 5.0457368348066e-06, + "loss": 0.0108, + "step": 5464 + }, + { + "epoch": 2.486351228389445, + "grad_norm": 0.42724520346313205, + "learning_rate": 5.04430759673556e-06, + "loss": 0.0078, + "step": 5465 + }, + { + "epoch": 2.48680618744313, + "grad_norm": 0.5998726297653039, + "learning_rate": 5.042878355043902e-06, + "loss": 0.0153, + "step": 5466 + }, + { + "epoch": 2.4872611464968153, + "grad_norm": 0.24920444582854095, + "learning_rate": 5.041449109848418e-06, + "loss": 0.0044, + "step": 5467 + }, + { + "epoch": 2.4877161055505006, + "grad_norm": 0.4596108651630432, + "learning_rate": 5.040019861265901e-06, + "loss": 0.0132, + "step": 5468 + }, + { + "epoch": 2.4881710646041855, + "grad_norm": 0.26925258867518487, + "learning_rate": 5.038590609413141e-06, + "loss": 0.0072, + "step": 5469 + }, + { + "epoch": 2.488626023657871, + "grad_norm": 0.350170449969174, + "learning_rate": 5.03716135440693e-06, + "loss": 0.0139, + "step": 5470 + }, + { + "epoch": 2.489080982711556, + "grad_norm": 0.49030337105475924, + "learning_rate": 5.035732096364061e-06, + "loss": 0.0322, + "step": 5471 + }, + { + "epoch": 2.489535941765241, + "grad_norm": 0.5060644314900122, + "learning_rate": 5.034302835401328e-06, + "loss": 0.0139, + "step": 5472 + }, + { + "epoch": 2.4899909008189263, + "grad_norm": 0.5299328884299171, + "learning_rate": 5.032873571635522e-06, + "loss": 0.0237, + "step": 5473 + }, + { + "epoch": 2.4904458598726116, + "grad_norm": 0.4599415921033778, + "learning_rate": 5.031444305183435e-06, + "loss": 0.0089, + "step": 5474 + }, + { + "epoch": 2.4909008189262964, + "grad_norm": 0.4580562875637281, + "learning_rate": 5.030015036161863e-06, + "loss": 0.0147, + "step": 5475 + }, + { + "epoch": 2.4913557779799818, + "grad_norm": 0.3369733129797372, + "learning_rate": 5.028585764687596e-06, + "loss": 0.0089, + "step": 5476 + }, + { + "epoch": 2.491810737033667, + "grad_norm": 0.3201648664458773, + "learning_rate": 5.02715649087743e-06, + "loss": 0.0076, + "step": 5477 + }, + { + "epoch": 2.492265696087352, + "grad_norm": 0.4890149308940846, + "learning_rate": 5.025727214848158e-06, + "loss": 0.0102, + "step": 5478 + }, + { + "epoch": 2.4927206551410372, + "grad_norm": 0.39312401492933874, + "learning_rate": 5.024297936716574e-06, + "loss": 0.0088, + "step": 5479 + }, + { + "epoch": 2.4931756141947226, + "grad_norm": 0.3618076509461657, + "learning_rate": 5.0228686565994745e-06, + "loss": 0.0109, + "step": 5480 + }, + { + "epoch": 2.4936305732484074, + "grad_norm": 0.4513013278322856, + "learning_rate": 5.021439374613648e-06, + "loss": 0.0165, + "step": 5481 + }, + { + "epoch": 2.4940855323020927, + "grad_norm": 0.671144246567914, + "learning_rate": 5.020010090875895e-06, + "loss": 0.0137, + "step": 5482 + }, + { + "epoch": 2.494540491355778, + "grad_norm": 0.4678025512741067, + "learning_rate": 5.018580805503007e-06, + "loss": 0.0172, + "step": 5483 + }, + { + "epoch": 2.494995450409463, + "grad_norm": 0.45375195687662523, + "learning_rate": 5.01715151861178e-06, + "loss": 0.015, + "step": 5484 + }, + { + "epoch": 2.4954504094631482, + "grad_norm": 0.27386671611422353, + "learning_rate": 5.015722230319009e-06, + "loss": 0.0078, + "step": 5485 + }, + { + "epoch": 2.4959053685168335, + "grad_norm": 0.47655367248773756, + "learning_rate": 5.014292940741487e-06, + "loss": 0.0172, + "step": 5486 + }, + { + "epoch": 2.496360327570519, + "grad_norm": 0.38754885419391666, + "learning_rate": 5.012863649996013e-06, + "loss": 0.0149, + "step": 5487 + }, + { + "epoch": 2.4968152866242037, + "grad_norm": 0.33897821581513504, + "learning_rate": 5.011434358199375e-06, + "loss": 0.0106, + "step": 5488 + }, + { + "epoch": 2.497270245677889, + "grad_norm": 0.47996766790953127, + "learning_rate": 5.0100050654683766e-06, + "loss": 0.0204, + "step": 5489 + }, + { + "epoch": 2.4977252047315743, + "grad_norm": 0.4268913339924163, + "learning_rate": 5.008575771919808e-06, + "loss": 0.0082, + "step": 5490 + }, + { + "epoch": 2.498180163785259, + "grad_norm": 0.3951902395496699, + "learning_rate": 5.007146477670466e-06, + "loss": 0.0112, + "step": 5491 + }, + { + "epoch": 2.4986351228389445, + "grad_norm": 0.7081438062828642, + "learning_rate": 5.005717182837147e-06, + "loss": 0.0305, + "step": 5492 + }, + { + "epoch": 2.49909008189263, + "grad_norm": 0.42096235736073284, + "learning_rate": 5.004287887536645e-06, + "loss": 0.0203, + "step": 5493 + }, + { + "epoch": 2.4995450409463147, + "grad_norm": 0.5346006200366389, + "learning_rate": 5.002858591885756e-06, + "loss": 0.0189, + "step": 5494 + }, + { + "epoch": 2.5, + "grad_norm": 0.49017838297375854, + "learning_rate": 5.001429296001275e-06, + "loss": 0.0145, + "step": 5495 + }, + { + "epoch": 2.5004549590536853, + "grad_norm": 0.4141991773280852, + "learning_rate": 5e-06, + "loss": 0.0096, + "step": 5496 + }, + { + "epoch": 2.50090991810737, + "grad_norm": 0.2467185146822804, + "learning_rate": 4.9985707039987256e-06, + "loss": 0.0064, + "step": 5497 + }, + { + "epoch": 2.5013648771610555, + "grad_norm": 0.35852650013127413, + "learning_rate": 4.9971414081142455e-06, + "loss": 0.0114, + "step": 5498 + }, + { + "epoch": 2.501819836214741, + "grad_norm": 0.2997727062951807, + "learning_rate": 4.995712112463358e-06, + "loss": 0.0083, + "step": 5499 + }, + { + "epoch": 2.502274795268426, + "grad_norm": 0.4678314645703148, + "learning_rate": 4.994282817162854e-06, + "loss": 0.0246, + "step": 5500 + }, + { + "epoch": 2.502729754322111, + "grad_norm": 0.37661683265971724, + "learning_rate": 4.992853522329535e-06, + "loss": 0.01, + "step": 5501 + }, + { + "epoch": 2.5031847133757963, + "grad_norm": 0.3003152893169655, + "learning_rate": 4.991424228080193e-06, + "loss": 0.0081, + "step": 5502 + }, + { + "epoch": 2.5036396724294816, + "grad_norm": 0.27232546572021565, + "learning_rate": 4.989994934531625e-06, + "loss": 0.0048, + "step": 5503 + }, + { + "epoch": 2.5040946314831665, + "grad_norm": 0.31903561416643506, + "learning_rate": 4.988565641800626e-06, + "loss": 0.0103, + "step": 5504 + }, + { + "epoch": 2.5045495905368518, + "grad_norm": 0.31597363231985026, + "learning_rate": 4.98713635000399e-06, + "loss": 0.0087, + "step": 5505 + }, + { + "epoch": 2.505004549590537, + "grad_norm": 0.3289676047625398, + "learning_rate": 4.985707059258515e-06, + "loss": 0.0092, + "step": 5506 + }, + { + "epoch": 2.505459508644222, + "grad_norm": 0.36358965738000726, + "learning_rate": 4.9842777696809925e-06, + "loss": 0.0107, + "step": 5507 + }, + { + "epoch": 2.5059144676979073, + "grad_norm": 0.3595581796565044, + "learning_rate": 4.98284848138822e-06, + "loss": 0.0156, + "step": 5508 + }, + { + "epoch": 2.5063694267515926, + "grad_norm": 0.29196970351493584, + "learning_rate": 4.9814191944969934e-06, + "loss": 0.0071, + "step": 5509 + }, + { + "epoch": 2.5068243858052774, + "grad_norm": 0.36578404478530974, + "learning_rate": 4.979989909124106e-06, + "loss": 0.0102, + "step": 5510 + }, + { + "epoch": 2.5072793448589628, + "grad_norm": 0.4041831002992017, + "learning_rate": 4.978560625386354e-06, + "loss": 0.0122, + "step": 5511 + }, + { + "epoch": 2.507734303912648, + "grad_norm": 0.4143905742161826, + "learning_rate": 4.977131343400528e-06, + "loss": 0.0137, + "step": 5512 + }, + { + "epoch": 2.508189262966333, + "grad_norm": 0.40720665401264816, + "learning_rate": 4.975702063283428e-06, + "loss": 0.0063, + "step": 5513 + }, + { + "epoch": 2.5086442220200182, + "grad_norm": 0.564393026317754, + "learning_rate": 4.974272785151843e-06, + "loss": 0.0201, + "step": 5514 + }, + { + "epoch": 2.5090991810737036, + "grad_norm": 0.30933261805319073, + "learning_rate": 4.972843509122571e-06, + "loss": 0.0112, + "step": 5515 + }, + { + "epoch": 2.5095541401273884, + "grad_norm": 0.3069960764395276, + "learning_rate": 4.971414235312406e-06, + "loss": 0.0082, + "step": 5516 + }, + { + "epoch": 2.5100090991810737, + "grad_norm": 0.5014702072334418, + "learning_rate": 4.9699849638381396e-06, + "loss": 0.0128, + "step": 5517 + }, + { + "epoch": 2.510464058234759, + "grad_norm": 0.36167069762141185, + "learning_rate": 4.968555694816567e-06, + "loss": 0.0161, + "step": 5518 + }, + { + "epoch": 2.510919017288444, + "grad_norm": 0.47630341434996465, + "learning_rate": 4.96712642836448e-06, + "loss": 0.0105, + "step": 5519 + }, + { + "epoch": 2.511373976342129, + "grad_norm": 0.3495169527672344, + "learning_rate": 4.965697164598674e-06, + "loss": 0.0086, + "step": 5520 + }, + { + "epoch": 2.5118289353958145, + "grad_norm": 0.4536091118396002, + "learning_rate": 4.964267903635939e-06, + "loss": 0.0174, + "step": 5521 + }, + { + "epoch": 2.5122838944494994, + "grad_norm": 0.5347034444205534, + "learning_rate": 4.96283864559307e-06, + "loss": 0.0094, + "step": 5522 + }, + { + "epoch": 2.5127388535031847, + "grad_norm": 0.34504812867014445, + "learning_rate": 4.96140939058686e-06, + "loss": 0.0131, + "step": 5523 + }, + { + "epoch": 2.51319381255687, + "grad_norm": 0.3159930637146742, + "learning_rate": 4.9599801387341e-06, + "loss": 0.007, + "step": 5524 + }, + { + "epoch": 2.513648771610555, + "grad_norm": 0.4629740972135442, + "learning_rate": 4.958550890151584e-06, + "loss": 0.011, + "step": 5525 + }, + { + "epoch": 2.51410373066424, + "grad_norm": 0.3702026628093378, + "learning_rate": 4.9571216449561e-06, + "loss": 0.0108, + "step": 5526 + }, + { + "epoch": 2.5145586897179255, + "grad_norm": 0.6242326412181597, + "learning_rate": 4.955692403264442e-06, + "loss": 0.014, + "step": 5527 + }, + { + "epoch": 2.5150136487716104, + "grad_norm": 0.5075724414262232, + "learning_rate": 4.9542631651934e-06, + "loss": 0.0131, + "step": 5528 + }, + { + "epoch": 2.5154686078252957, + "grad_norm": 0.8756432428133315, + "learning_rate": 4.952833930859766e-06, + "loss": 0.0436, + "step": 5529 + }, + { + "epoch": 2.515923566878981, + "grad_norm": 0.5147063715586754, + "learning_rate": 4.951404700380331e-06, + "loss": 0.0295, + "step": 5530 + }, + { + "epoch": 2.516378525932666, + "grad_norm": 0.30998361393225377, + "learning_rate": 4.9499754738718835e-06, + "loss": 0.0093, + "step": 5531 + }, + { + "epoch": 2.516833484986351, + "grad_norm": 0.3577270446223234, + "learning_rate": 4.948546251451216e-06, + "loss": 0.0115, + "step": 5532 + }, + { + "epoch": 2.5172884440400365, + "grad_norm": 0.3958399614718588, + "learning_rate": 4.947117033235116e-06, + "loss": 0.0187, + "step": 5533 + }, + { + "epoch": 2.5177434030937214, + "grad_norm": 0.2819495908306721, + "learning_rate": 4.945687819340372e-06, + "loss": 0.0051, + "step": 5534 + }, + { + "epoch": 2.5181983621474067, + "grad_norm": 0.5591416708777733, + "learning_rate": 4.944258609883773e-06, + "loss": 0.0172, + "step": 5535 + }, + { + "epoch": 2.518653321201092, + "grad_norm": 0.3741700766720031, + "learning_rate": 4.942829404982112e-06, + "loss": 0.0142, + "step": 5536 + }, + { + "epoch": 2.519108280254777, + "grad_norm": 0.4945911767497451, + "learning_rate": 4.9414002047521705e-06, + "loss": 0.019, + "step": 5537 + }, + { + "epoch": 2.519563239308462, + "grad_norm": 0.31958519221663395, + "learning_rate": 4.939971009310743e-06, + "loss": 0.0135, + "step": 5538 + }, + { + "epoch": 2.5200181983621475, + "grad_norm": 0.3592760701659723, + "learning_rate": 4.938541818774611e-06, + "loss": 0.0124, + "step": 5539 + }, + { + "epoch": 2.5204731574158323, + "grad_norm": 0.6461483543159348, + "learning_rate": 4.937112633260566e-06, + "loss": 0.0244, + "step": 5540 + }, + { + "epoch": 2.5209281164695176, + "grad_norm": 0.34428537904976697, + "learning_rate": 4.9356834528853905e-06, + "loss": 0.0097, + "step": 5541 + }, + { + "epoch": 2.521383075523203, + "grad_norm": 0.4758533567881116, + "learning_rate": 4.934254277765872e-06, + "loss": 0.016, + "step": 5542 + }, + { + "epoch": 2.521838034576888, + "grad_norm": 0.3571273207320487, + "learning_rate": 4.9328251080188e-06, + "loss": 0.0093, + "step": 5543 + }, + { + "epoch": 2.522292993630573, + "grad_norm": 0.31431089221926806, + "learning_rate": 4.931395943760955e-06, + "loss": 0.0084, + "step": 5544 + }, + { + "epoch": 2.5227479526842584, + "grad_norm": 4.747832584318776, + "learning_rate": 4.929966785109125e-06, + "loss": 0.111, + "step": 5545 + }, + { + "epoch": 2.5232029117379433, + "grad_norm": 0.39506910801267275, + "learning_rate": 4.928537632180092e-06, + "loss": 0.0102, + "step": 5546 + }, + { + "epoch": 2.5236578707916286, + "grad_norm": 0.6871172353377315, + "learning_rate": 4.927108485090643e-06, + "loss": 0.0281, + "step": 5547 + }, + { + "epoch": 2.524112829845314, + "grad_norm": 0.37466249576280947, + "learning_rate": 4.925679343957557e-06, + "loss": 0.012, + "step": 5548 + }, + { + "epoch": 2.5245677888989992, + "grad_norm": 0.34469889399890297, + "learning_rate": 4.924250208897619e-06, + "loss": 0.0109, + "step": 5549 + }, + { + "epoch": 2.525022747952684, + "grad_norm": 0.399821198857524, + "learning_rate": 4.922821080027613e-06, + "loss": 0.0125, + "step": 5550 + }, + { + "epoch": 2.5254777070063694, + "grad_norm": 0.39885120746996305, + "learning_rate": 4.921391957464319e-06, + "loss": 0.0129, + "step": 5551 + }, + { + "epoch": 2.5259326660600547, + "grad_norm": 0.5874602162370535, + "learning_rate": 4.919962841324521e-06, + "loss": 0.0245, + "step": 5552 + }, + { + "epoch": 2.5263876251137396, + "grad_norm": 0.33581082842146154, + "learning_rate": 4.918533731724997e-06, + "loss": 0.0115, + "step": 5553 + }, + { + "epoch": 2.526842584167425, + "grad_norm": 0.3470907353559372, + "learning_rate": 4.917104628782529e-06, + "loss": 0.0088, + "step": 5554 + }, + { + "epoch": 2.52729754322111, + "grad_norm": 0.5267945547914693, + "learning_rate": 4.915675532613896e-06, + "loss": 0.0142, + "step": 5555 + }, + { + "epoch": 2.5277525022747955, + "grad_norm": 0.3725294825015107, + "learning_rate": 4.914246443335876e-06, + "loss": 0.0111, + "step": 5556 + }, + { + "epoch": 2.5282074613284804, + "grad_norm": 0.2627866071882769, + "learning_rate": 4.9128173610652524e-06, + "loss": 0.0087, + "step": 5557 + }, + { + "epoch": 2.5286624203821657, + "grad_norm": 0.40804527856852457, + "learning_rate": 4.9113882859187985e-06, + "loss": 0.0118, + "step": 5558 + }, + { + "epoch": 2.529117379435851, + "grad_norm": 0.44875320925114015, + "learning_rate": 4.909959218013295e-06, + "loss": 0.0208, + "step": 5559 + }, + { + "epoch": 2.529572338489536, + "grad_norm": 0.439232146447453, + "learning_rate": 4.908530157465516e-06, + "loss": 0.012, + "step": 5560 + }, + { + "epoch": 2.530027297543221, + "grad_norm": 0.271794599047473, + "learning_rate": 4.907101104392242e-06, + "loss": 0.0078, + "step": 5561 + }, + { + "epoch": 2.5304822565969065, + "grad_norm": 0.8592142893631755, + "learning_rate": 4.905672058910243e-06, + "loss": 0.0294, + "step": 5562 + }, + { + "epoch": 2.5309372156505914, + "grad_norm": 0.41292625608557826, + "learning_rate": 4.904243021136298e-06, + "loss": 0.0107, + "step": 5563 + }, + { + "epoch": 2.5313921747042767, + "grad_norm": 0.38744962053142673, + "learning_rate": 4.902813991187183e-06, + "loss": 0.0144, + "step": 5564 + }, + { + "epoch": 2.531847133757962, + "grad_norm": 0.24441496751971636, + "learning_rate": 4.901384969179668e-06, + "loss": 0.0078, + "step": 5565 + }, + { + "epoch": 2.532302092811647, + "grad_norm": 0.4237942333287475, + "learning_rate": 4.8999559552305294e-06, + "loss": 0.0131, + "step": 5566 + }, + { + "epoch": 2.532757051865332, + "grad_norm": 0.5124564835590572, + "learning_rate": 4.898526949456537e-06, + "loss": 0.0092, + "step": 5567 + }, + { + "epoch": 2.5332120109190175, + "grad_norm": 0.539497107342928, + "learning_rate": 4.897097951974465e-06, + "loss": 0.0216, + "step": 5568 + }, + { + "epoch": 2.5336669699727024, + "grad_norm": 0.3139704212663251, + "learning_rate": 4.895668962901084e-06, + "loss": 0.0084, + "step": 5569 + }, + { + "epoch": 2.5341219290263877, + "grad_norm": 0.3789700818731656, + "learning_rate": 4.894239982353162e-06, + "loss": 0.0111, + "step": 5570 + }, + { + "epoch": 2.534576888080073, + "grad_norm": 0.270368556526887, + "learning_rate": 4.892811010447472e-06, + "loss": 0.0062, + "step": 5571 + }, + { + "epoch": 2.535031847133758, + "grad_norm": 0.8572665325618268, + "learning_rate": 4.891382047300783e-06, + "loss": 0.0237, + "step": 5572 + }, + { + "epoch": 2.535486806187443, + "grad_norm": 0.3680325899891936, + "learning_rate": 4.889953093029862e-06, + "loss": 0.0111, + "step": 5573 + }, + { + "epoch": 2.5359417652411285, + "grad_norm": 0.43888213024771167, + "learning_rate": 4.888524147751479e-06, + "loss": 0.0141, + "step": 5574 + }, + { + "epoch": 2.5363967242948133, + "grad_norm": 0.5154113179861864, + "learning_rate": 4.887095211582397e-06, + "loss": 0.0127, + "step": 5575 + }, + { + "epoch": 2.5368516833484986, + "grad_norm": 0.3662234379749456, + "learning_rate": 4.885666284639385e-06, + "loss": 0.0144, + "step": 5576 + }, + { + "epoch": 2.537306642402184, + "grad_norm": 0.6064861777011574, + "learning_rate": 4.884237367039207e-06, + "loss": 0.0258, + "step": 5577 + }, + { + "epoch": 2.537761601455869, + "grad_norm": 0.22811602993101804, + "learning_rate": 4.882808458898629e-06, + "loss": 0.0058, + "step": 5578 + }, + { + "epoch": 2.538216560509554, + "grad_norm": 0.47988155796684506, + "learning_rate": 4.881379560334416e-06, + "loss": 0.0112, + "step": 5579 + }, + { + "epoch": 2.5386715195632394, + "grad_norm": 0.48247690902291146, + "learning_rate": 4.879950671463328e-06, + "loss": 0.0113, + "step": 5580 + }, + { + "epoch": 2.5391264786169243, + "grad_norm": 0.712840557495972, + "learning_rate": 4.878521792402131e-06, + "loss": 0.0293, + "step": 5581 + }, + { + "epoch": 2.5395814376706096, + "grad_norm": 0.2824744664732004, + "learning_rate": 4.877092923267582e-06, + "loss": 0.0056, + "step": 5582 + }, + { + "epoch": 2.540036396724295, + "grad_norm": 0.3342297770263099, + "learning_rate": 4.875664064176447e-06, + "loss": 0.0112, + "step": 5583 + }, + { + "epoch": 2.54049135577798, + "grad_norm": 0.5261592758190419, + "learning_rate": 4.87423521524548e-06, + "loss": 0.0144, + "step": 5584 + }, + { + "epoch": 2.540946314831665, + "grad_norm": 0.37226086362158617, + "learning_rate": 4.8728063765914446e-06, + "loss": 0.0095, + "step": 5585 + }, + { + "epoch": 2.5414012738853504, + "grad_norm": 0.21794305516565435, + "learning_rate": 4.871377548331099e-06, + "loss": 0.0044, + "step": 5586 + }, + { + "epoch": 2.5418562329390353, + "grad_norm": 0.5972324079160436, + "learning_rate": 4.869948730581198e-06, + "loss": 0.0133, + "step": 5587 + }, + { + "epoch": 2.5423111919927206, + "grad_norm": 0.3574329092640865, + "learning_rate": 4.8685199234585e-06, + "loss": 0.013, + "step": 5588 + }, + { + "epoch": 2.542766151046406, + "grad_norm": 0.39022968181721507, + "learning_rate": 4.867091127079759e-06, + "loss": 0.0097, + "step": 5589 + }, + { + "epoch": 2.5432211101000908, + "grad_norm": 0.32666939246421944, + "learning_rate": 4.865662341561733e-06, + "loss": 0.0098, + "step": 5590 + }, + { + "epoch": 2.543676069153776, + "grad_norm": 0.4250077147805536, + "learning_rate": 4.864233567021171e-06, + "loss": 0.0113, + "step": 5591 + }, + { + "epoch": 2.5441310282074614, + "grad_norm": 0.41558963533530774, + "learning_rate": 4.8628048035748304e-06, + "loss": 0.011, + "step": 5592 + }, + { + "epoch": 2.5445859872611463, + "grad_norm": 0.41087461827146887, + "learning_rate": 4.861376051339462e-06, + "loss": 0.0159, + "step": 5593 + }, + { + "epoch": 2.5450409463148316, + "grad_norm": 0.3193621099240802, + "learning_rate": 4.859947310431816e-06, + "loss": 0.006, + "step": 5594 + }, + { + "epoch": 2.545495905368517, + "grad_norm": 0.35894260549027757, + "learning_rate": 4.858518580968644e-06, + "loss": 0.0084, + "step": 5595 + }, + { + "epoch": 2.5459508644222018, + "grad_norm": 0.3013580851069753, + "learning_rate": 4.857089863066694e-06, + "loss": 0.0118, + "step": 5596 + }, + { + "epoch": 2.546405823475887, + "grad_norm": 0.4268250695733099, + "learning_rate": 4.8556611568427165e-06, + "loss": 0.0174, + "step": 5597 + }, + { + "epoch": 2.5468607825295724, + "grad_norm": 0.4394317936188433, + "learning_rate": 4.854232462413455e-06, + "loss": 0.0138, + "step": 5598 + }, + { + "epoch": 2.5473157415832572, + "grad_norm": 0.40675101044833256, + "learning_rate": 4.852803779895658e-06, + "loss": 0.0168, + "step": 5599 + }, + { + "epoch": 2.5477707006369426, + "grad_norm": 0.2736244357234857, + "learning_rate": 4.8513751094060744e-06, + "loss": 0.0076, + "step": 5600 + }, + { + "epoch": 2.548225659690628, + "grad_norm": 0.3242077509300469, + "learning_rate": 4.849946451061444e-06, + "loss": 0.0079, + "step": 5601 + }, + { + "epoch": 2.548680618744313, + "grad_norm": 0.45386624160854155, + "learning_rate": 4.848517804978513e-06, + "loss": 0.0159, + "step": 5602 + }, + { + "epoch": 2.549135577797998, + "grad_norm": 0.34118928000231935, + "learning_rate": 4.847089171274022e-06, + "loss": 0.0161, + "step": 5603 + }, + { + "epoch": 2.5495905368516834, + "grad_norm": 0.2614801479276341, + "learning_rate": 4.845660550064714e-06, + "loss": 0.0053, + "step": 5604 + }, + { + "epoch": 2.5500454959053687, + "grad_norm": 0.4315174445267716, + "learning_rate": 4.8442319414673266e-06, + "loss": 0.0154, + "step": 5605 + }, + { + "epoch": 2.5505004549590535, + "grad_norm": 0.7845355806911353, + "learning_rate": 4.842803345598604e-06, + "loss": 0.0259, + "step": 5606 + }, + { + "epoch": 2.550955414012739, + "grad_norm": 0.33333613207861285, + "learning_rate": 4.841374762575281e-06, + "loss": 0.0093, + "step": 5607 + }, + { + "epoch": 2.551410373066424, + "grad_norm": 0.33005978750881937, + "learning_rate": 4.839946192514096e-06, + "loss": 0.0104, + "step": 5608 + }, + { + "epoch": 2.5518653321201095, + "grad_norm": 0.6088233555043218, + "learning_rate": 4.838517635531787e-06, + "loss": 0.0256, + "step": 5609 + }, + { + "epoch": 2.5523202911737943, + "grad_norm": 0.3382526567827965, + "learning_rate": 4.837089091745086e-06, + "loss": 0.0086, + "step": 5610 + }, + { + "epoch": 2.5527752502274796, + "grad_norm": 0.31057908536556494, + "learning_rate": 4.835660561270729e-06, + "loss": 0.0077, + "step": 5611 + }, + { + "epoch": 2.553230209281165, + "grad_norm": 0.551724445181761, + "learning_rate": 4.834232044225447e-06, + "loss": 0.0227, + "step": 5612 + }, + { + "epoch": 2.55368516833485, + "grad_norm": 0.29200280313332505, + "learning_rate": 4.832803540725977e-06, + "loss": 0.0111, + "step": 5613 + }, + { + "epoch": 2.554140127388535, + "grad_norm": 0.47928702896577746, + "learning_rate": 4.831375050889045e-06, + "loss": 0.0143, + "step": 5614 + }, + { + "epoch": 2.5545950864422204, + "grad_norm": 0.2709973614498435, + "learning_rate": 4.829946574831383e-06, + "loss": 0.0096, + "step": 5615 + }, + { + "epoch": 2.5550500454959053, + "grad_norm": 0.41319674005519985, + "learning_rate": 4.828518112669718e-06, + "loss": 0.0143, + "step": 5616 + }, + { + "epoch": 2.5555050045495906, + "grad_norm": 0.4179304240853926, + "learning_rate": 4.827089664520779e-06, + "loss": 0.0137, + "step": 5617 + }, + { + "epoch": 2.555959963603276, + "grad_norm": 0.6009269695179169, + "learning_rate": 4.8256612305012915e-06, + "loss": 0.0283, + "step": 5618 + }, + { + "epoch": 2.556414922656961, + "grad_norm": 0.5525798106238338, + "learning_rate": 4.8242328107279805e-06, + "loss": 0.0157, + "step": 5619 + }, + { + "epoch": 2.556869881710646, + "grad_norm": 0.3009813287525514, + "learning_rate": 4.822804405317571e-06, + "loss": 0.0065, + "step": 5620 + }, + { + "epoch": 2.5573248407643314, + "grad_norm": 0.5024514195761675, + "learning_rate": 4.821376014386785e-06, + "loss": 0.0141, + "step": 5621 + }, + { + "epoch": 2.5577797998180163, + "grad_norm": 0.38596782208931674, + "learning_rate": 4.819947638052345e-06, + "loss": 0.0121, + "step": 5622 + }, + { + "epoch": 2.5582347588717016, + "grad_norm": 0.4591430541180311, + "learning_rate": 4.818519276430971e-06, + "loss": 0.0102, + "step": 5623 + }, + { + "epoch": 2.558689717925387, + "grad_norm": 0.3284450771135685, + "learning_rate": 4.8170909296393824e-06, + "loss": 0.006, + "step": 5624 + }, + { + "epoch": 2.5591446769790718, + "grad_norm": 0.4315696126470542, + "learning_rate": 4.815662597794296e-06, + "loss": 0.0129, + "step": 5625 + }, + { + "epoch": 2.559599636032757, + "grad_norm": 0.2984890660335619, + "learning_rate": 4.814234281012429e-06, + "loss": 0.0091, + "step": 5626 + }, + { + "epoch": 2.5600545950864424, + "grad_norm": 0.28799942970226056, + "learning_rate": 4.812805979410499e-06, + "loss": 0.0097, + "step": 5627 + }, + { + "epoch": 2.5605095541401273, + "grad_norm": 0.5168975637701665, + "learning_rate": 4.811377693105219e-06, + "loss": 0.0211, + "step": 5628 + }, + { + "epoch": 2.5609645131938126, + "grad_norm": 0.4424202098069847, + "learning_rate": 4.809949422213303e-06, + "loss": 0.0091, + "step": 5629 + }, + { + "epoch": 2.561419472247498, + "grad_norm": 0.5069754961694984, + "learning_rate": 4.80852116685146e-06, + "loss": 0.0175, + "step": 5630 + }, + { + "epoch": 2.5618744313011828, + "grad_norm": 0.5715217237686248, + "learning_rate": 4.807092927136404e-06, + "loss": 0.0263, + "step": 5631 + }, + { + "epoch": 2.562329390354868, + "grad_norm": 0.3962821391351908, + "learning_rate": 4.805664703184842e-06, + "loss": 0.0125, + "step": 5632 + }, + { + "epoch": 2.5627843494085534, + "grad_norm": 0.4456271554372272, + "learning_rate": 4.804236495113481e-06, + "loss": 0.0136, + "step": 5633 + }, + { + "epoch": 2.5632393084622382, + "grad_norm": 0.3920443337679276, + "learning_rate": 4.802808303039032e-06, + "loss": 0.0163, + "step": 5634 + }, + { + "epoch": 2.5636942675159236, + "grad_norm": 0.20440097078867706, + "learning_rate": 4.801380127078198e-06, + "loss": 0.0042, + "step": 5635 + }, + { + "epoch": 2.564149226569609, + "grad_norm": 0.30323678238048796, + "learning_rate": 4.799951967347683e-06, + "loss": 0.0087, + "step": 5636 + }, + { + "epoch": 2.5646041856232937, + "grad_norm": 0.4519864852017533, + "learning_rate": 4.798523823964189e-06, + "loss": 0.0155, + "step": 5637 + }, + { + "epoch": 2.565059144676979, + "grad_norm": 0.45497642018573037, + "learning_rate": 4.79709569704442e-06, + "loss": 0.0112, + "step": 5638 + }, + { + "epoch": 2.5655141037306644, + "grad_norm": 0.22147517641216033, + "learning_rate": 4.795667586705073e-06, + "loss": 0.0062, + "step": 5639 + }, + { + "epoch": 2.565969062784349, + "grad_norm": 0.14301943004382262, + "learning_rate": 4.794239493062846e-06, + "loss": 0.0029, + "step": 5640 + }, + { + "epoch": 2.5664240218380345, + "grad_norm": 0.38343344825948156, + "learning_rate": 4.792811416234441e-06, + "loss": 0.011, + "step": 5641 + }, + { + "epoch": 2.56687898089172, + "grad_norm": 0.3722181884774487, + "learning_rate": 4.791383356336552e-06, + "loss": 0.0158, + "step": 5642 + }, + { + "epoch": 2.5673339399454047, + "grad_norm": 0.26893801764638015, + "learning_rate": 4.7899553134858715e-06, + "loss": 0.0075, + "step": 5643 + }, + { + "epoch": 2.56778889899909, + "grad_norm": 0.2892027359172782, + "learning_rate": 4.7885272877990955e-06, + "loss": 0.0069, + "step": 5644 + }, + { + "epoch": 2.5682438580527753, + "grad_norm": 0.44760714500350546, + "learning_rate": 4.787099279392913e-06, + "loss": 0.0201, + "step": 5645 + }, + { + "epoch": 2.56869881710646, + "grad_norm": 0.4511090540698227, + "learning_rate": 4.7856712883840174e-06, + "loss": 0.0074, + "step": 5646 + }, + { + "epoch": 2.5691537761601455, + "grad_norm": 0.42953413647564953, + "learning_rate": 4.784243314889094e-06, + "loss": 0.0263, + "step": 5647 + }, + { + "epoch": 2.569608735213831, + "grad_norm": 0.28324021307789976, + "learning_rate": 4.782815359024834e-06, + "loss": 0.0056, + "step": 5648 + }, + { + "epoch": 2.5700636942675157, + "grad_norm": 0.3945037003097103, + "learning_rate": 4.781387420907922e-06, + "loss": 0.0175, + "step": 5649 + }, + { + "epoch": 2.570518653321201, + "grad_norm": 0.4455129862590413, + "learning_rate": 4.779959500655043e-06, + "loss": 0.0146, + "step": 5650 + }, + { + "epoch": 2.5709736123748863, + "grad_norm": 0.3662410009937011, + "learning_rate": 4.77853159838288e-06, + "loss": 0.0083, + "step": 5651 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.38796879524415695, + "learning_rate": 4.777103714208114e-06, + "loss": 0.0088, + "step": 5652 + }, + { + "epoch": 2.5718835304822565, + "grad_norm": 0.46067658983642434, + "learning_rate": 4.775675848247427e-06, + "loss": 0.0245, + "step": 5653 + }, + { + "epoch": 2.572338489535942, + "grad_norm": 0.3793292965702815, + "learning_rate": 4.774248000617494e-06, + "loss": 0.014, + "step": 5654 + }, + { + "epoch": 2.5727934485896267, + "grad_norm": 0.5589395168221093, + "learning_rate": 4.772820171434997e-06, + "loss": 0.0218, + "step": 5655 + }, + { + "epoch": 2.573248407643312, + "grad_norm": 0.35660954216467167, + "learning_rate": 4.77139236081661e-06, + "loss": 0.0085, + "step": 5656 + }, + { + "epoch": 2.5737033666969973, + "grad_norm": 0.37115423687658833, + "learning_rate": 4.769964568879006e-06, + "loss": 0.017, + "step": 5657 + }, + { + "epoch": 2.5741583257506826, + "grad_norm": 0.5059021892593047, + "learning_rate": 4.768536795738859e-06, + "loss": 0.0161, + "step": 5658 + }, + { + "epoch": 2.5746132848043675, + "grad_norm": 0.39404637850315527, + "learning_rate": 4.76710904151284e-06, + "loss": 0.0169, + "step": 5659 + }, + { + "epoch": 2.5750682438580528, + "grad_norm": 0.3251630101652949, + "learning_rate": 4.765681306317618e-06, + "loss": 0.0087, + "step": 5660 + }, + { + "epoch": 2.575523202911738, + "grad_norm": 0.3729261098704293, + "learning_rate": 4.764253590269861e-06, + "loss": 0.0107, + "step": 5661 + }, + { + "epoch": 2.575978161965423, + "grad_norm": 0.5563546315346025, + "learning_rate": 4.762825893486236e-06, + "loss": 0.0251, + "step": 5662 + }, + { + "epoch": 2.5764331210191083, + "grad_norm": 0.46011067744887346, + "learning_rate": 4.761398216083409e-06, + "loss": 0.0175, + "step": 5663 + }, + { + "epoch": 2.5768880800727936, + "grad_norm": 0.5243980347635864, + "learning_rate": 4.7599705581780415e-06, + "loss": 0.0221, + "step": 5664 + }, + { + "epoch": 2.577343039126479, + "grad_norm": 0.22153214791109685, + "learning_rate": 4.7585429198867975e-06, + "loss": 0.005, + "step": 5665 + }, + { + "epoch": 2.5777979981801638, + "grad_norm": 0.3470983000687461, + "learning_rate": 4.757115301326334e-06, + "loss": 0.0083, + "step": 5666 + }, + { + "epoch": 2.578252957233849, + "grad_norm": 0.46582878099904523, + "learning_rate": 4.755687702613312e-06, + "loss": 0.0114, + "step": 5667 + }, + { + "epoch": 2.5787079162875344, + "grad_norm": 0.3366141366194447, + "learning_rate": 4.754260123864386e-06, + "loss": 0.0081, + "step": 5668 + }, + { + "epoch": 2.5791628753412192, + "grad_norm": 0.27418210374134405, + "learning_rate": 4.752832565196213e-06, + "loss": 0.009, + "step": 5669 + }, + { + "epoch": 2.5796178343949046, + "grad_norm": 0.4096441211837446, + "learning_rate": 4.751405026725449e-06, + "loss": 0.0157, + "step": 5670 + }, + { + "epoch": 2.58007279344859, + "grad_norm": 0.2989812908078907, + "learning_rate": 4.749977508568742e-06, + "loss": 0.0069, + "step": 5671 + }, + { + "epoch": 2.5805277525022747, + "grad_norm": 0.3161858485023435, + "learning_rate": 4.7485500108427456e-06, + "loss": 0.0125, + "step": 5672 + }, + { + "epoch": 2.58098271155596, + "grad_norm": 0.41529380289389317, + "learning_rate": 4.7471225336641045e-06, + "loss": 0.0126, + "step": 5673 + }, + { + "epoch": 2.5814376706096454, + "grad_norm": 0.3891313276093038, + "learning_rate": 4.74569507714947e-06, + "loss": 0.0091, + "step": 5674 + }, + { + "epoch": 2.58189262966333, + "grad_norm": 0.36763083573086686, + "learning_rate": 4.744267641415483e-06, + "loss": 0.0112, + "step": 5675 + }, + { + "epoch": 2.5823475887170155, + "grad_norm": 0.26062760370540944, + "learning_rate": 4.74284022657879e-06, + "loss": 0.0095, + "step": 5676 + }, + { + "epoch": 2.582802547770701, + "grad_norm": 0.4445073634493149, + "learning_rate": 4.741412832756034e-06, + "loss": 0.0198, + "step": 5677 + }, + { + "epoch": 2.5832575068243857, + "grad_norm": 0.2525167791360124, + "learning_rate": 4.7399854600638524e-06, + "loss": 0.0098, + "step": 5678 + }, + { + "epoch": 2.583712465878071, + "grad_norm": 0.38463953964064057, + "learning_rate": 4.738558108618885e-06, + "loss": 0.0153, + "step": 5679 + }, + { + "epoch": 2.5841674249317563, + "grad_norm": 0.40068511122357836, + "learning_rate": 4.737130778537769e-06, + "loss": 0.0166, + "step": 5680 + }, + { + "epoch": 2.584622383985441, + "grad_norm": 0.5621317411510426, + "learning_rate": 4.735703469937138e-06, + "loss": 0.0168, + "step": 5681 + }, + { + "epoch": 2.5850773430391265, + "grad_norm": 0.458585899105421, + "learning_rate": 4.734276182933624e-06, + "loss": 0.0235, + "step": 5682 + }, + { + "epoch": 2.585532302092812, + "grad_norm": 0.23298344983466068, + "learning_rate": 4.732848917643863e-06, + "loss": 0.0042, + "step": 5683 + }, + { + "epoch": 2.5859872611464967, + "grad_norm": 0.4890985050410502, + "learning_rate": 4.731421674184481e-06, + "loss": 0.0163, + "step": 5684 + }, + { + "epoch": 2.586442220200182, + "grad_norm": 0.3166420086538857, + "learning_rate": 4.729994452672108e-06, + "loss": 0.0088, + "step": 5685 + }, + { + "epoch": 2.5868971792538673, + "grad_norm": 0.431078529504673, + "learning_rate": 4.728567253223367e-06, + "loss": 0.0201, + "step": 5686 + }, + { + "epoch": 2.587352138307552, + "grad_norm": 0.5884834347804827, + "learning_rate": 4.727140075954887e-06, + "loss": 0.0236, + "step": 5687 + }, + { + "epoch": 2.5878070973612375, + "grad_norm": 0.41660555887864803, + "learning_rate": 4.725712920983286e-06, + "loss": 0.0193, + "step": 5688 + }, + { + "epoch": 2.588262056414923, + "grad_norm": 0.5891883848732339, + "learning_rate": 4.724285788425185e-06, + "loss": 0.0096, + "step": 5689 + }, + { + "epoch": 2.5887170154686077, + "grad_norm": 0.3155735538669077, + "learning_rate": 4.722858678397206e-06, + "loss": 0.0084, + "step": 5690 + }, + { + "epoch": 2.589171974522293, + "grad_norm": 0.3079750584560806, + "learning_rate": 4.721431591015963e-06, + "loss": 0.0061, + "step": 5691 + }, + { + "epoch": 2.5896269335759783, + "grad_norm": 0.37091097569234943, + "learning_rate": 4.720004526398075e-06, + "loss": 0.015, + "step": 5692 + }, + { + "epoch": 2.590081892629663, + "grad_norm": 0.4926220454682441, + "learning_rate": 4.7185774846601505e-06, + "loss": 0.0085, + "step": 5693 + }, + { + "epoch": 2.5905368516833485, + "grad_norm": 0.5546824530334568, + "learning_rate": 4.717150465918805e-06, + "loss": 0.016, + "step": 5694 + }, + { + "epoch": 2.5909918107370338, + "grad_norm": 0.4540780593471903, + "learning_rate": 4.715723470290644e-06, + "loss": 0.0153, + "step": 5695 + }, + { + "epoch": 2.5914467697907186, + "grad_norm": 0.3553708020812148, + "learning_rate": 4.714296497892277e-06, + "loss": 0.009, + "step": 5696 + }, + { + "epoch": 2.591901728844404, + "grad_norm": 0.39148283435969855, + "learning_rate": 4.712869548840311e-06, + "loss": 0.0109, + "step": 5697 + }, + { + "epoch": 2.5923566878980893, + "grad_norm": 0.43669578541265186, + "learning_rate": 4.711442623251349e-06, + "loss": 0.016, + "step": 5698 + }, + { + "epoch": 2.592811646951774, + "grad_norm": 0.27027201782885235, + "learning_rate": 4.710015721241993e-06, + "loss": 0.0081, + "step": 5699 + }, + { + "epoch": 2.5932666060054594, + "grad_norm": 0.4110807726380925, + "learning_rate": 4.708588842928842e-06, + "loss": 0.0162, + "step": 5700 + }, + { + "epoch": 2.5937215650591448, + "grad_norm": 0.35656479827200116, + "learning_rate": 4.7071619884284955e-06, + "loss": 0.0103, + "step": 5701 + }, + { + "epoch": 2.5941765241128296, + "grad_norm": 0.305326393352088, + "learning_rate": 4.705735157857548e-06, + "loss": 0.0112, + "step": 5702 + }, + { + "epoch": 2.594631483166515, + "grad_norm": 0.2792248986192964, + "learning_rate": 4.704308351332593e-06, + "loss": 0.01, + "step": 5703 + }, + { + "epoch": 2.5950864422202002, + "grad_norm": 0.621058188142095, + "learning_rate": 4.702881568970227e-06, + "loss": 0.0167, + "step": 5704 + }, + { + "epoch": 2.595541401273885, + "grad_norm": 0.3892333521186203, + "learning_rate": 4.701454810887036e-06, + "loss": 0.0114, + "step": 5705 + }, + { + "epoch": 2.5959963603275704, + "grad_norm": 0.35186659910320217, + "learning_rate": 4.70002807719961e-06, + "loss": 0.0069, + "step": 5706 + }, + { + "epoch": 2.5964513193812557, + "grad_norm": 0.40108911457025237, + "learning_rate": 4.698601368024535e-06, + "loss": 0.0172, + "step": 5707 + }, + { + "epoch": 2.5969062784349406, + "grad_norm": 0.42373634603180493, + "learning_rate": 4.697174683478396e-06, + "loss": 0.0157, + "step": 5708 + }, + { + "epoch": 2.597361237488626, + "grad_norm": 0.6394868852008742, + "learning_rate": 4.695748023677773e-06, + "loss": 0.0231, + "step": 5709 + }, + { + "epoch": 2.597816196542311, + "grad_norm": 0.5512344234887848, + "learning_rate": 4.6943213887392465e-06, + "loss": 0.0159, + "step": 5710 + }, + { + "epoch": 2.598271155595996, + "grad_norm": 0.1773299977692886, + "learning_rate": 4.692894778779398e-06, + "loss": 0.0044, + "step": 5711 + }, + { + "epoch": 2.5987261146496814, + "grad_norm": 0.46249397568887457, + "learning_rate": 4.691468193914799e-06, + "loss": 0.0172, + "step": 5712 + }, + { + "epoch": 2.5991810737033667, + "grad_norm": 0.533014196533797, + "learning_rate": 4.690041634262028e-06, + "loss": 0.0167, + "step": 5713 + }, + { + "epoch": 2.599636032757052, + "grad_norm": 0.412996319965304, + "learning_rate": 4.6886150999376554e-06, + "loss": 0.0093, + "step": 5714 + }, + { + "epoch": 2.600090991810737, + "grad_norm": 0.3674583428485953, + "learning_rate": 4.687188591058248e-06, + "loss": 0.012, + "step": 5715 + }, + { + "epoch": 2.600545950864422, + "grad_norm": 0.5397087960063365, + "learning_rate": 4.68576210774038e-06, + "loss": 0.0294, + "step": 5716 + }, + { + "epoch": 2.6010009099181075, + "grad_norm": 0.3718017390186533, + "learning_rate": 4.684335650100609e-06, + "loss": 0.0122, + "step": 5717 + }, + { + "epoch": 2.6014558689717924, + "grad_norm": 0.5260313811778944, + "learning_rate": 4.682909218255505e-06, + "loss": 0.0187, + "step": 5718 + }, + { + "epoch": 2.6019108280254777, + "grad_norm": 0.3005424847198216, + "learning_rate": 4.6814828123216285e-06, + "loss": 0.008, + "step": 5719 + }, + { + "epoch": 2.602365787079163, + "grad_norm": 0.33107833266033543, + "learning_rate": 4.680056432415536e-06, + "loss": 0.0088, + "step": 5720 + }, + { + "epoch": 2.6028207461328483, + "grad_norm": 0.2958430539835329, + "learning_rate": 4.6786300786537905e-06, + "loss": 0.0073, + "step": 5721 + }, + { + "epoch": 2.603275705186533, + "grad_norm": 0.31657097622558167, + "learning_rate": 4.677203751152941e-06, + "loss": 0.0118, + "step": 5722 + }, + { + "epoch": 2.6037306642402185, + "grad_norm": 0.4079268321662643, + "learning_rate": 4.675777450029545e-06, + "loss": 0.0165, + "step": 5723 + }, + { + "epoch": 2.604185623293904, + "grad_norm": 0.33480528730370973, + "learning_rate": 4.6743511754001495e-06, + "loss": 0.0083, + "step": 5724 + }, + { + "epoch": 2.6046405823475887, + "grad_norm": 0.29683596241044735, + "learning_rate": 4.6729249273813055e-06, + "loss": 0.0075, + "step": 5725 + }, + { + "epoch": 2.605095541401274, + "grad_norm": 0.4032644218320927, + "learning_rate": 4.671498706089561e-06, + "loss": 0.0153, + "step": 5726 + }, + { + "epoch": 2.6055505004549593, + "grad_norm": 0.4074746712570187, + "learning_rate": 4.670072511641458e-06, + "loss": 0.0131, + "step": 5727 + }, + { + "epoch": 2.606005459508644, + "grad_norm": 0.3692897583349627, + "learning_rate": 4.66864634415354e-06, + "loss": 0.0138, + "step": 5728 + }, + { + "epoch": 2.6064604185623295, + "grad_norm": 0.30749957189884386, + "learning_rate": 4.667220203742345e-06, + "loss": 0.0065, + "step": 5729 + }, + { + "epoch": 2.6069153776160148, + "grad_norm": 0.30623565693679805, + "learning_rate": 4.665794090524414e-06, + "loss": 0.0124, + "step": 5730 + }, + { + "epoch": 2.6073703366696996, + "grad_norm": 0.500005036560983, + "learning_rate": 4.6643680046162785e-06, + "loss": 0.0205, + "step": 5731 + }, + { + "epoch": 2.607825295723385, + "grad_norm": 0.4034681368915988, + "learning_rate": 4.662941946134474e-06, + "loss": 0.0151, + "step": 5732 + }, + { + "epoch": 2.6082802547770703, + "grad_norm": 0.43530520324662214, + "learning_rate": 4.661515915195534e-06, + "loss": 0.0165, + "step": 5733 + }, + { + "epoch": 2.608735213830755, + "grad_norm": 0.509796889816632, + "learning_rate": 4.660089911915983e-06, + "loss": 0.0162, + "step": 5734 + }, + { + "epoch": 2.6091901728844404, + "grad_norm": 0.3612159836198824, + "learning_rate": 4.6586639364123505e-06, + "loss": 0.0122, + "step": 5735 + }, + { + "epoch": 2.6096451319381258, + "grad_norm": 0.36888760672195897, + "learning_rate": 4.657237988801159e-06, + "loss": 0.0083, + "step": 5736 + }, + { + "epoch": 2.6101000909918106, + "grad_norm": 0.31423597167812467, + "learning_rate": 4.655812069198932e-06, + "loss": 0.0081, + "step": 5737 + }, + { + "epoch": 2.610555050045496, + "grad_norm": 0.3897289635639756, + "learning_rate": 4.654386177722185e-06, + "loss": 0.0131, + "step": 5738 + }, + { + "epoch": 2.6110100090991812, + "grad_norm": 0.2048674693935203, + "learning_rate": 4.652960314487441e-06, + "loss": 0.0054, + "step": 5739 + }, + { + "epoch": 2.611464968152866, + "grad_norm": 0.5026091275956444, + "learning_rate": 4.651534479611214e-06, + "loss": 0.0135, + "step": 5740 + }, + { + "epoch": 2.6119199272065514, + "grad_norm": 0.3423561108381752, + "learning_rate": 4.650108673210014e-06, + "loss": 0.0102, + "step": 5741 + }, + { + "epoch": 2.6123748862602367, + "grad_norm": 0.3257624307460199, + "learning_rate": 4.648682895400356e-06, + "loss": 0.0079, + "step": 5742 + }, + { + "epoch": 2.6128298453139216, + "grad_norm": 0.5221096495513938, + "learning_rate": 4.647257146298742e-06, + "loss": 0.0123, + "step": 5743 + }, + { + "epoch": 2.613284804367607, + "grad_norm": 1.364059336614433, + "learning_rate": 4.645831426021684e-06, + "loss": 0.0303, + "step": 5744 + }, + { + "epoch": 2.613739763421292, + "grad_norm": 0.551240674369172, + "learning_rate": 4.644405734685679e-06, + "loss": 0.0273, + "step": 5745 + }, + { + "epoch": 2.614194722474977, + "grad_norm": 0.38420589237028663, + "learning_rate": 4.642980072407233e-06, + "loss": 0.0104, + "step": 5746 + }, + { + "epoch": 2.6146496815286624, + "grad_norm": 0.5384659442716453, + "learning_rate": 4.641554439302843e-06, + "loss": 0.0143, + "step": 5747 + }, + { + "epoch": 2.6151046405823477, + "grad_norm": 0.3876530722237191, + "learning_rate": 4.640128835489007e-06, + "loss": 0.0119, + "step": 5748 + }, + { + "epoch": 2.6155595996360326, + "grad_norm": 0.3476457709085455, + "learning_rate": 4.6387032610822164e-06, + "loss": 0.011, + "step": 5749 + }, + { + "epoch": 2.616014558689718, + "grad_norm": 0.26694074076048757, + "learning_rate": 4.637277716198964e-06, + "loss": 0.0091, + "step": 5750 + }, + { + "epoch": 2.616469517743403, + "grad_norm": 0.4197615362874177, + "learning_rate": 4.635852200955738e-06, + "loss": 0.0176, + "step": 5751 + }, + { + "epoch": 2.616924476797088, + "grad_norm": 0.48739515766664343, + "learning_rate": 4.634426715469024e-06, + "loss": 0.022, + "step": 5752 + }, + { + "epoch": 2.6173794358507734, + "grad_norm": 0.26925881956883646, + "learning_rate": 4.633001259855311e-06, + "loss": 0.0066, + "step": 5753 + }, + { + "epoch": 2.6178343949044587, + "grad_norm": 0.3718053136267044, + "learning_rate": 4.631575834231076e-06, + "loss": 0.0142, + "step": 5754 + }, + { + "epoch": 2.6182893539581436, + "grad_norm": 0.35008894308345395, + "learning_rate": 4.630150438712801e-06, + "loss": 0.0137, + "step": 5755 + }, + { + "epoch": 2.618744313011829, + "grad_norm": 0.2634624680974572, + "learning_rate": 4.6287250734169605e-06, + "loss": 0.0088, + "step": 5756 + }, + { + "epoch": 2.619199272065514, + "grad_norm": 0.4188095871063324, + "learning_rate": 4.627299738460032e-06, + "loss": 0.0133, + "step": 5757 + }, + { + "epoch": 2.619654231119199, + "grad_norm": 0.37122477130171483, + "learning_rate": 4.6258744339584855e-06, + "loss": 0.0135, + "step": 5758 + }, + { + "epoch": 2.6201091901728844, + "grad_norm": 0.41017280481299545, + "learning_rate": 4.624449160028789e-06, + "loss": 0.011, + "step": 5759 + }, + { + "epoch": 2.6205641492265697, + "grad_norm": 0.28790189607168515, + "learning_rate": 4.623023916787412e-06, + "loss": 0.007, + "step": 5760 + }, + { + "epoch": 2.6210191082802545, + "grad_norm": 0.8095340989341101, + "learning_rate": 4.6215987043508185e-06, + "loss": 0.0188, + "step": 5761 + }, + { + "epoch": 2.62147406733394, + "grad_norm": 0.38160155583041777, + "learning_rate": 4.620173522835471e-06, + "loss": 0.0113, + "step": 5762 + }, + { + "epoch": 2.621929026387625, + "grad_norm": 0.3070713675822249, + "learning_rate": 4.618748372357827e-06, + "loss": 0.0076, + "step": 5763 + }, + { + "epoch": 2.62238398544131, + "grad_norm": 0.34069908038311614, + "learning_rate": 4.617323253034345e-06, + "loss": 0.0098, + "step": 5764 + }, + { + "epoch": 2.6228389444949953, + "grad_norm": 0.3435062113384841, + "learning_rate": 4.615898164981477e-06, + "loss": 0.0111, + "step": 5765 + }, + { + "epoch": 2.6232939035486806, + "grad_norm": 0.37577842389294913, + "learning_rate": 4.614473108315676e-06, + "loss": 0.015, + "step": 5766 + }, + { + "epoch": 2.623748862602366, + "grad_norm": 0.42903703048564007, + "learning_rate": 4.613048083153393e-06, + "loss": 0.0164, + "step": 5767 + }, + { + "epoch": 2.624203821656051, + "grad_norm": 0.5280151778638612, + "learning_rate": 4.611623089611073e-06, + "loss": 0.0141, + "step": 5768 + }, + { + "epoch": 2.624658780709736, + "grad_norm": 0.2636320380579304, + "learning_rate": 4.610198127805161e-06, + "loss": 0.0067, + "step": 5769 + }, + { + "epoch": 2.6251137397634214, + "grad_norm": 0.3750678231721881, + "learning_rate": 4.608773197852096e-06, + "loss": 0.0124, + "step": 5770 + }, + { + "epoch": 2.6255686988171063, + "grad_norm": 0.4404046939348186, + "learning_rate": 4.607348299868321e-06, + "loss": 0.0176, + "step": 5771 + }, + { + "epoch": 2.6260236578707916, + "grad_norm": 0.5118635401841247, + "learning_rate": 4.605923433970268e-06, + "loss": 0.0186, + "step": 5772 + }, + { + "epoch": 2.626478616924477, + "grad_norm": 0.5571878826845478, + "learning_rate": 4.604498600274371e-06, + "loss": 0.0211, + "step": 5773 + }, + { + "epoch": 2.6269335759781622, + "grad_norm": 0.22682777512616378, + "learning_rate": 4.603073798897064e-06, + "loss": 0.0059, + "step": 5774 + }, + { + "epoch": 2.627388535031847, + "grad_norm": 0.24317710327264414, + "learning_rate": 4.601649029954773e-06, + "loss": 0.006, + "step": 5775 + }, + { + "epoch": 2.6278434940855324, + "grad_norm": 0.41648038322407926, + "learning_rate": 4.600224293563926e-06, + "loss": 0.0092, + "step": 5776 + }, + { + "epoch": 2.6282984531392177, + "grad_norm": 0.5273847358730055, + "learning_rate": 4.598799589840943e-06, + "loss": 0.0092, + "step": 5777 + }, + { + "epoch": 2.6287534121929026, + "grad_norm": 0.32386046473781877, + "learning_rate": 4.597374918902247e-06, + "loss": 0.0113, + "step": 5778 + }, + { + "epoch": 2.629208371246588, + "grad_norm": 0.2967163856636597, + "learning_rate": 4.595950280864252e-06, + "loss": 0.0081, + "step": 5779 + }, + { + "epoch": 2.629663330300273, + "grad_norm": 0.3618730617503731, + "learning_rate": 4.594525675843375e-06, + "loss": 0.0088, + "step": 5780 + }, + { + "epoch": 2.630118289353958, + "grad_norm": 0.3308158172891041, + "learning_rate": 4.593101103956031e-06, + "loss": 0.0063, + "step": 5781 + }, + { + "epoch": 2.6305732484076434, + "grad_norm": 0.2711505150736749, + "learning_rate": 4.591676565318626e-06, + "loss": 0.0081, + "step": 5782 + }, + { + "epoch": 2.6310282074613287, + "grad_norm": 0.5925624878565041, + "learning_rate": 4.5902520600475694e-06, + "loss": 0.0168, + "step": 5783 + }, + { + "epoch": 2.6314831665150136, + "grad_norm": 0.5513124587083298, + "learning_rate": 4.588827588259265e-06, + "loss": 0.0178, + "step": 5784 + }, + { + "epoch": 2.631938125568699, + "grad_norm": 0.2665248519466739, + "learning_rate": 4.587403150070111e-06, + "loss": 0.007, + "step": 5785 + }, + { + "epoch": 2.632393084622384, + "grad_norm": 0.4397339921938562, + "learning_rate": 4.5859787455965095e-06, + "loss": 0.0133, + "step": 5786 + }, + { + "epoch": 2.632848043676069, + "grad_norm": 0.2800285530528093, + "learning_rate": 4.584554374954853e-06, + "loss": 0.0052, + "step": 5787 + }, + { + "epoch": 2.6333030027297544, + "grad_norm": 0.3129782224235985, + "learning_rate": 4.583130038261538e-06, + "loss": 0.0111, + "step": 5788 + }, + { + "epoch": 2.6337579617834397, + "grad_norm": 0.36646264066442885, + "learning_rate": 4.5817057356329545e-06, + "loss": 0.0121, + "step": 5789 + }, + { + "epoch": 2.6342129208371245, + "grad_norm": 0.30701057653401975, + "learning_rate": 4.580281467185488e-06, + "loss": 0.0084, + "step": 5790 + }, + { + "epoch": 2.63466787989081, + "grad_norm": 0.40444746509468976, + "learning_rate": 4.578857233035527e-06, + "loss": 0.0082, + "step": 5791 + }, + { + "epoch": 2.635122838944495, + "grad_norm": 0.385256018837849, + "learning_rate": 4.5774330332994485e-06, + "loss": 0.0114, + "step": 5792 + }, + { + "epoch": 2.63557779799818, + "grad_norm": 0.2883972426981428, + "learning_rate": 4.576008868093636e-06, + "loss": 0.0073, + "step": 5793 + }, + { + "epoch": 2.6360327570518653, + "grad_norm": 0.3895013306574242, + "learning_rate": 4.574584737534462e-06, + "loss": 0.0143, + "step": 5794 + }, + { + "epoch": 2.6364877161055507, + "grad_norm": 0.2324144680387847, + "learning_rate": 4.573160641738303e-06, + "loss": 0.0078, + "step": 5795 + }, + { + "epoch": 2.6369426751592355, + "grad_norm": 0.45599244569563824, + "learning_rate": 4.57173658082153e-06, + "loss": 0.0128, + "step": 5796 + }, + { + "epoch": 2.637397634212921, + "grad_norm": 0.5501655745067856, + "learning_rate": 4.570312554900508e-06, + "loss": 0.0258, + "step": 5797 + }, + { + "epoch": 2.637852593266606, + "grad_norm": 0.34996166563248526, + "learning_rate": 4.568888564091606e-06, + "loss": 0.014, + "step": 5798 + }, + { + "epoch": 2.638307552320291, + "grad_norm": 0.38384660960629524, + "learning_rate": 4.567464608511181e-06, + "loss": 0.0174, + "step": 5799 + }, + { + "epoch": 2.6387625113739763, + "grad_norm": 0.4623207748093834, + "learning_rate": 4.566040688275597e-06, + "loss": 0.0276, + "step": 5800 + }, + { + "epoch": 2.6392174704276616, + "grad_norm": 0.5223956569811683, + "learning_rate": 4.564616803501205e-06, + "loss": 0.0264, + "step": 5801 + }, + { + "epoch": 2.6396724294813465, + "grad_norm": 0.4449564284543771, + "learning_rate": 4.563192954304364e-06, + "loss": 0.016, + "step": 5802 + }, + { + "epoch": 2.640127388535032, + "grad_norm": 0.36251810804579676, + "learning_rate": 4.561769140801422e-06, + "loss": 0.013, + "step": 5803 + }, + { + "epoch": 2.640582347588717, + "grad_norm": 0.33855626891813634, + "learning_rate": 4.5603453631087265e-06, + "loss": 0.0127, + "step": 5804 + }, + { + "epoch": 2.641037306642402, + "grad_norm": 0.34777929454318607, + "learning_rate": 4.558921621342623e-06, + "loss": 0.0114, + "step": 5805 + }, + { + "epoch": 2.6414922656960873, + "grad_norm": 0.41291110565820316, + "learning_rate": 4.557497915619452e-06, + "loss": 0.015, + "step": 5806 + }, + { + "epoch": 2.6419472247497726, + "grad_norm": 0.5617402299104329, + "learning_rate": 4.556074246055555e-06, + "loss": 0.013, + "step": 5807 + }, + { + "epoch": 2.6424021838034575, + "grad_norm": 0.2687758103178183, + "learning_rate": 4.5546506127672625e-06, + "loss": 0.0086, + "step": 5808 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.22091279175143466, + "learning_rate": 4.553227015870912e-06, + "loss": 0.0046, + "step": 5809 + }, + { + "epoch": 2.643312101910828, + "grad_norm": 0.4313278689258374, + "learning_rate": 4.551803455482833e-06, + "loss": 0.0101, + "step": 5810 + }, + { + "epoch": 2.643767060964513, + "grad_norm": 0.6047817710725403, + "learning_rate": 4.550379931719351e-06, + "loss": 0.0096, + "step": 5811 + }, + { + "epoch": 2.6442220200181983, + "grad_norm": 0.32507370297772714, + "learning_rate": 4.548956444696791e-06, + "loss": 0.0083, + "step": 5812 + }, + { + "epoch": 2.6446769790718836, + "grad_norm": 0.2984733283454028, + "learning_rate": 4.5475329945314735e-06, + "loss": 0.0076, + "step": 5813 + }, + { + "epoch": 2.6451319381255685, + "grad_norm": 0.4613307935418929, + "learning_rate": 4.5461095813397164e-06, + "loss": 0.0185, + "step": 5814 + }, + { + "epoch": 2.6455868971792538, + "grad_norm": 0.4451412694131819, + "learning_rate": 4.544686205237833e-06, + "loss": 0.0224, + "step": 5815 + }, + { + "epoch": 2.646041856232939, + "grad_norm": 0.4145702427879785, + "learning_rate": 4.543262866342138e-06, + "loss": 0.0104, + "step": 5816 + }, + { + "epoch": 2.646496815286624, + "grad_norm": 0.3356934525716918, + "learning_rate": 4.541839564768939e-06, + "loss": 0.0082, + "step": 5817 + }, + { + "epoch": 2.6469517743403093, + "grad_norm": 0.439864585201127, + "learning_rate": 4.540416300634541e-06, + "loss": 0.0147, + "step": 5818 + }, + { + "epoch": 2.6474067333939946, + "grad_norm": 0.34928665902959727, + "learning_rate": 4.538993074055249e-06, + "loss": 0.0123, + "step": 5819 + }, + { + "epoch": 2.6478616924476794, + "grad_norm": 0.1589131959554854, + "learning_rate": 4.537569885147361e-06, + "loss": 0.0039, + "step": 5820 + }, + { + "epoch": 2.6483166515013647, + "grad_norm": 0.5476586369600871, + "learning_rate": 4.536146734027173e-06, + "loss": 0.0229, + "step": 5821 + }, + { + "epoch": 2.64877161055505, + "grad_norm": 0.3645269802636522, + "learning_rate": 4.534723620810977e-06, + "loss": 0.0148, + "step": 5822 + }, + { + "epoch": 2.6492265696087354, + "grad_norm": 0.4071429204997368, + "learning_rate": 4.533300545615068e-06, + "loss": 0.013, + "step": 5823 + }, + { + "epoch": 2.6496815286624202, + "grad_norm": 0.25993932014711674, + "learning_rate": 4.53187750855573e-06, + "loss": 0.0056, + "step": 5824 + }, + { + "epoch": 2.6501364877161055, + "grad_norm": 0.3610959108570182, + "learning_rate": 4.530454509749249e-06, + "loss": 0.0108, + "step": 5825 + }, + { + "epoch": 2.650591446769791, + "grad_norm": 0.3441179248113602, + "learning_rate": 4.529031549311904e-06, + "loss": 0.014, + "step": 5826 + }, + { + "epoch": 2.6510464058234757, + "grad_norm": 0.4457131297216828, + "learning_rate": 4.527608627359975e-06, + "loss": 0.018, + "step": 5827 + }, + { + "epoch": 2.651501364877161, + "grad_norm": 0.5060202653904013, + "learning_rate": 4.526185744009735e-06, + "loss": 0.0227, + "step": 5828 + }, + { + "epoch": 2.6519563239308463, + "grad_norm": 0.3673643525210136, + "learning_rate": 4.524762899377454e-06, + "loss": 0.0126, + "step": 5829 + }, + { + "epoch": 2.6524112829845317, + "grad_norm": 0.44996253774646483, + "learning_rate": 4.523340093579406e-06, + "loss": 0.0146, + "step": 5830 + }, + { + "epoch": 2.6528662420382165, + "grad_norm": 0.2976982932735961, + "learning_rate": 4.521917326731851e-06, + "loss": 0.0073, + "step": 5831 + }, + { + "epoch": 2.653321201091902, + "grad_norm": 0.33298774146807764, + "learning_rate": 4.520494598951055e-06, + "loss": 0.0088, + "step": 5832 + }, + { + "epoch": 2.653776160145587, + "grad_norm": 0.46122820715296065, + "learning_rate": 4.519071910353273e-06, + "loss": 0.0141, + "step": 5833 + }, + { + "epoch": 2.654231119199272, + "grad_norm": 0.45337578318684346, + "learning_rate": 4.5176492610547645e-06, + "loss": 0.0165, + "step": 5834 + }, + { + "epoch": 2.6546860782529573, + "grad_norm": 0.4976446291006916, + "learning_rate": 4.5162266511717786e-06, + "loss": 0.0164, + "step": 5835 + }, + { + "epoch": 2.6551410373066426, + "grad_norm": 0.3929025087484671, + "learning_rate": 4.514804080820565e-06, + "loss": 0.0125, + "step": 5836 + }, + { + "epoch": 2.6555959963603275, + "grad_norm": 0.579797174178738, + "learning_rate": 4.513381550117373e-06, + "loss": 0.0191, + "step": 5837 + }, + { + "epoch": 2.656050955414013, + "grad_norm": 0.23622730003541972, + "learning_rate": 4.511959059178443e-06, + "loss": 0.0038, + "step": 5838 + }, + { + "epoch": 2.656505914467698, + "grad_norm": 0.4543745917329778, + "learning_rate": 4.5105366081200145e-06, + "loss": 0.0204, + "step": 5839 + }, + { + "epoch": 2.656960873521383, + "grad_norm": 0.3278508136129625, + "learning_rate": 4.509114197058324e-06, + "loss": 0.0093, + "step": 5840 + }, + { + "epoch": 2.6574158325750683, + "grad_norm": 0.3779936944103423, + "learning_rate": 4.507691826109604e-06, + "loss": 0.0187, + "step": 5841 + }, + { + "epoch": 2.6578707916287536, + "grad_norm": 0.3390924411890412, + "learning_rate": 4.5062694953900844e-06, + "loss": 0.0106, + "step": 5842 + }, + { + "epoch": 2.6583257506824385, + "grad_norm": 0.3444507329542516, + "learning_rate": 4.504847205015992e-06, + "loss": 0.0115, + "step": 5843 + }, + { + "epoch": 2.658780709736124, + "grad_norm": 0.49310642630458146, + "learning_rate": 4.5034249551035506e-06, + "loss": 0.0171, + "step": 5844 + }, + { + "epoch": 2.659235668789809, + "grad_norm": 0.47513025298327166, + "learning_rate": 4.502002745768979e-06, + "loss": 0.0171, + "step": 5845 + }, + { + "epoch": 2.659690627843494, + "grad_norm": 0.3112837906022871, + "learning_rate": 4.500580577128495e-06, + "loss": 0.01, + "step": 5846 + }, + { + "epoch": 2.6601455868971793, + "grad_norm": 0.3597869037000204, + "learning_rate": 4.49915844929831e-06, + "loss": 0.0124, + "step": 5847 + }, + { + "epoch": 2.6606005459508646, + "grad_norm": 0.6453893116899386, + "learning_rate": 4.497736362394636e-06, + "loss": 0.0083, + "step": 5848 + }, + { + "epoch": 2.6610555050045495, + "grad_norm": 0.3001237241602498, + "learning_rate": 4.496314316533677e-06, + "loss": 0.0084, + "step": 5849 + }, + { + "epoch": 2.6615104640582348, + "grad_norm": 0.24020844944489492, + "learning_rate": 4.494892311831635e-06, + "loss": 0.0053, + "step": 5850 + }, + { + "epoch": 2.66196542311192, + "grad_norm": 0.4048867054822093, + "learning_rate": 4.493470348404716e-06, + "loss": 0.0105, + "step": 5851 + }, + { + "epoch": 2.662420382165605, + "grad_norm": 0.47597584535507365, + "learning_rate": 4.492048426369111e-06, + "loss": 0.0147, + "step": 5852 + }, + { + "epoch": 2.6628753412192903, + "grad_norm": 0.38895201531794266, + "learning_rate": 4.4906265458410155e-06, + "loss": 0.0148, + "step": 5853 + }, + { + "epoch": 2.6633303002729756, + "grad_norm": 0.5263749586215324, + "learning_rate": 4.489204706936618e-06, + "loss": 0.02, + "step": 5854 + }, + { + "epoch": 2.6637852593266604, + "grad_norm": 0.4070373856353329, + "learning_rate": 4.487782909772106e-06, + "loss": 0.0104, + "step": 5855 + }, + { + "epoch": 2.6642402183803457, + "grad_norm": 0.5081767752708876, + "learning_rate": 4.486361154463662e-06, + "loss": 0.0101, + "step": 5856 + }, + { + "epoch": 2.664695177434031, + "grad_norm": 0.464902831281099, + "learning_rate": 4.484939441127462e-06, + "loss": 0.0122, + "step": 5857 + }, + { + "epoch": 2.665150136487716, + "grad_norm": 0.2928487182953344, + "learning_rate": 4.483517769879686e-06, + "loss": 0.0074, + "step": 5858 + }, + { + "epoch": 2.6656050955414012, + "grad_norm": 0.34151745015555895, + "learning_rate": 4.482096140836506e-06, + "loss": 0.0137, + "step": 5859 + }, + { + "epoch": 2.6660600545950865, + "grad_norm": 0.41820223929717, + "learning_rate": 4.48067455411409e-06, + "loss": 0.0171, + "step": 5860 + }, + { + "epoch": 2.6665150136487714, + "grad_norm": 0.44755507302270947, + "learning_rate": 4.4792530098286055e-06, + "loss": 0.011, + "step": 5861 + }, + { + "epoch": 2.6669699727024567, + "grad_norm": 0.5075248732354104, + "learning_rate": 4.477831508096212e-06, + "loss": 0.0294, + "step": 5862 + }, + { + "epoch": 2.667424931756142, + "grad_norm": 0.3649971612422481, + "learning_rate": 4.476410049033071e-06, + "loss": 0.018, + "step": 5863 + }, + { + "epoch": 2.667879890809827, + "grad_norm": 0.6136006900786559, + "learning_rate": 4.474988632755333e-06, + "loss": 0.0188, + "step": 5864 + }, + { + "epoch": 2.668334849863512, + "grad_norm": 0.31952285704506544, + "learning_rate": 4.473567259379155e-06, + "loss": 0.0098, + "step": 5865 + }, + { + "epoch": 2.6687898089171975, + "grad_norm": 0.3303307149310139, + "learning_rate": 4.4721459290206845e-06, + "loss": 0.009, + "step": 5866 + }, + { + "epoch": 2.6692447679708824, + "grad_norm": 0.48624774371727625, + "learning_rate": 4.470724641796064e-06, + "loss": 0.0167, + "step": 5867 + }, + { + "epoch": 2.6696997270245677, + "grad_norm": 0.5994384400402291, + "learning_rate": 4.469303397821436e-06, + "loss": 0.0234, + "step": 5868 + }, + { + "epoch": 2.670154686078253, + "grad_norm": 0.41772145877689015, + "learning_rate": 4.467882197212936e-06, + "loss": 0.0221, + "step": 5869 + }, + { + "epoch": 2.670609645131938, + "grad_norm": 0.43795459902089234, + "learning_rate": 4.466461040086703e-06, + "loss": 0.0164, + "step": 5870 + }, + { + "epoch": 2.671064604185623, + "grad_norm": 0.47863779803985534, + "learning_rate": 4.46503992655886e-06, + "loss": 0.0208, + "step": 5871 + }, + { + "epoch": 2.6715195632393085, + "grad_norm": 0.5909524466084196, + "learning_rate": 4.46361885674554e-06, + "loss": 0.0139, + "step": 5872 + }, + { + "epoch": 2.6719745222929934, + "grad_norm": 0.19048398219418494, + "learning_rate": 4.462197830762867e-06, + "loss": 0.0053, + "step": 5873 + }, + { + "epoch": 2.6724294813466787, + "grad_norm": 0.26133628017831345, + "learning_rate": 4.460776848726956e-06, + "loss": 0.0083, + "step": 5874 + }, + { + "epoch": 2.672884440400364, + "grad_norm": 0.41149130816008755, + "learning_rate": 4.459355910753928e-06, + "loss": 0.0173, + "step": 5875 + }, + { + "epoch": 2.673339399454049, + "grad_norm": 0.4524287025288151, + "learning_rate": 4.4579350169598926e-06, + "loss": 0.0179, + "step": 5876 + }, + { + "epoch": 2.673794358507734, + "grad_norm": 0.5112956824842843, + "learning_rate": 4.456514167460959e-06, + "loss": 0.0122, + "step": 5877 + }, + { + "epoch": 2.6742493175614195, + "grad_norm": 0.25333799765495163, + "learning_rate": 4.4550933623732326e-06, + "loss": 0.0086, + "step": 5878 + }, + { + "epoch": 2.674704276615105, + "grad_norm": 0.3001308840457026, + "learning_rate": 4.4536726018128165e-06, + "loss": 0.0096, + "step": 5879 + }, + { + "epoch": 2.6751592356687897, + "grad_norm": 0.4096936505479323, + "learning_rate": 4.45225188589581e-06, + "loss": 0.0135, + "step": 5880 + }, + { + "epoch": 2.675614194722475, + "grad_norm": 0.5571718231482075, + "learning_rate": 4.450831214738303e-06, + "loss": 0.0205, + "step": 5881 + }, + { + "epoch": 2.6760691537761603, + "grad_norm": 0.3091438223005542, + "learning_rate": 4.4494105884563915e-06, + "loss": 0.0099, + "step": 5882 + }, + { + "epoch": 2.676524112829845, + "grad_norm": 0.25766124485336955, + "learning_rate": 4.447990007166159e-06, + "loss": 0.0081, + "step": 5883 + }, + { + "epoch": 2.6769790718835305, + "grad_norm": 0.5478957271033703, + "learning_rate": 4.446569470983692e-06, + "loss": 0.0141, + "step": 5884 + }, + { + "epoch": 2.6774340309372158, + "grad_norm": 0.2775414909517801, + "learning_rate": 4.445148980025065e-06, + "loss": 0.0102, + "step": 5885 + }, + { + "epoch": 2.677888989990901, + "grad_norm": 0.24580451578508553, + "learning_rate": 4.443728534406359e-06, + "loss": 0.0056, + "step": 5886 + }, + { + "epoch": 2.678343949044586, + "grad_norm": 0.4127465647091162, + "learning_rate": 4.442308134243647e-06, + "loss": 0.018, + "step": 5887 + }, + { + "epoch": 2.6787989080982713, + "grad_norm": 0.31431865508607826, + "learning_rate": 4.440887779652995e-06, + "loss": 0.0096, + "step": 5888 + }, + { + "epoch": 2.6792538671519566, + "grad_norm": 0.40643493771737643, + "learning_rate": 4.439467470750468e-06, + "loss": 0.0113, + "step": 5889 + }, + { + "epoch": 2.6797088262056414, + "grad_norm": 0.29721225550651337, + "learning_rate": 4.438047207652129e-06, + "loss": 0.0068, + "step": 5890 + }, + { + "epoch": 2.6801637852593267, + "grad_norm": 0.3345518511805347, + "learning_rate": 4.436626990474031e-06, + "loss": 0.0086, + "step": 5891 + }, + { + "epoch": 2.680618744313012, + "grad_norm": 0.5785048947016902, + "learning_rate": 4.435206819332235e-06, + "loss": 0.0174, + "step": 5892 + }, + { + "epoch": 2.681073703366697, + "grad_norm": 0.2507398612469783, + "learning_rate": 4.433786694342787e-06, + "loss": 0.0069, + "step": 5893 + }, + { + "epoch": 2.6815286624203822, + "grad_norm": 0.42109176587022173, + "learning_rate": 4.432366615621731e-06, + "loss": 0.0188, + "step": 5894 + }, + { + "epoch": 2.6819836214740675, + "grad_norm": 0.3116410675804666, + "learning_rate": 4.430946583285114e-06, + "loss": 0.0069, + "step": 5895 + }, + { + "epoch": 2.6824385805277524, + "grad_norm": 0.47691225592049874, + "learning_rate": 4.429526597448971e-06, + "loss": 0.0121, + "step": 5896 + }, + { + "epoch": 2.6828935395814377, + "grad_norm": 0.3169795585038183, + "learning_rate": 4.4281066582293395e-06, + "loss": 0.0079, + "step": 5897 + }, + { + "epoch": 2.683348498635123, + "grad_norm": 0.47180599726734984, + "learning_rate": 4.426686765742247e-06, + "loss": 0.0171, + "step": 5898 + }, + { + "epoch": 2.683803457688808, + "grad_norm": 0.3678746859384994, + "learning_rate": 4.425266920103724e-06, + "loss": 0.0095, + "step": 5899 + }, + { + "epoch": 2.684258416742493, + "grad_norm": 0.3346651706380877, + "learning_rate": 4.423847121429794e-06, + "loss": 0.0101, + "step": 5900 + }, + { + "epoch": 2.6847133757961785, + "grad_norm": 0.31662847366698194, + "learning_rate": 4.422427369836474e-06, + "loss": 0.0108, + "step": 5901 + }, + { + "epoch": 2.6851683348498634, + "grad_norm": 0.3108308777458586, + "learning_rate": 4.421007665439783e-06, + "loss": 0.0089, + "step": 5902 + }, + { + "epoch": 2.6856232939035487, + "grad_norm": 0.32253387780453324, + "learning_rate": 4.419588008355728e-06, + "loss": 0.0082, + "step": 5903 + }, + { + "epoch": 2.686078252957234, + "grad_norm": 0.3299627220985637, + "learning_rate": 4.418168398700323e-06, + "loss": 0.0103, + "step": 5904 + }, + { + "epoch": 2.686533212010919, + "grad_norm": 0.35129161189746655, + "learning_rate": 4.4167488365895655e-06, + "loss": 0.0083, + "step": 5905 + }, + { + "epoch": 2.686988171064604, + "grad_norm": 0.46063557831796464, + "learning_rate": 4.415329322139461e-06, + "loss": 0.0139, + "step": 5906 + }, + { + "epoch": 2.6874431301182895, + "grad_norm": 0.30436466186993605, + "learning_rate": 4.413909855466004e-06, + "loss": 0.0084, + "step": 5907 + }, + { + "epoch": 2.6878980891719744, + "grad_norm": 0.4276214090485729, + "learning_rate": 4.412490436685186e-06, + "loss": 0.012, + "step": 5908 + }, + { + "epoch": 2.6883530482256597, + "grad_norm": 0.43284047680467447, + "learning_rate": 4.411071065912998e-06, + "loss": 0.0124, + "step": 5909 + }, + { + "epoch": 2.688808007279345, + "grad_norm": 0.33056729075812286, + "learning_rate": 4.4096517432654214e-06, + "loss": 0.0109, + "step": 5910 + }, + { + "epoch": 2.68926296633303, + "grad_norm": 0.3622732918865968, + "learning_rate": 4.40823246885844e-06, + "loss": 0.0102, + "step": 5911 + }, + { + "epoch": 2.689717925386715, + "grad_norm": 0.3749288794205867, + "learning_rate": 4.406813242808026e-06, + "loss": 0.0091, + "step": 5912 + }, + { + "epoch": 2.6901728844404005, + "grad_norm": 0.4823955650158816, + "learning_rate": 4.405394065230156e-06, + "loss": 0.0248, + "step": 5913 + }, + { + "epoch": 2.6906278434940853, + "grad_norm": 0.4005168714710043, + "learning_rate": 4.4039749362408e-06, + "loss": 0.0138, + "step": 5914 + }, + { + "epoch": 2.6910828025477707, + "grad_norm": 0.24863550042056706, + "learning_rate": 4.402555855955919e-06, + "loss": 0.0065, + "step": 5915 + }, + { + "epoch": 2.691537761601456, + "grad_norm": 0.37360887296936135, + "learning_rate": 4.4011368244914755e-06, + "loss": 0.014, + "step": 5916 + }, + { + "epoch": 2.691992720655141, + "grad_norm": 0.30213901305179247, + "learning_rate": 4.399717841963426e-06, + "loss": 0.0072, + "step": 5917 + }, + { + "epoch": 2.692447679708826, + "grad_norm": 0.35931487388605293, + "learning_rate": 4.398298908487724e-06, + "loss": 0.0124, + "step": 5918 + }, + { + "epoch": 2.6929026387625115, + "grad_norm": 0.42471054550290893, + "learning_rate": 4.396880024180317e-06, + "loss": 0.0085, + "step": 5919 + }, + { + "epoch": 2.6933575978161963, + "grad_norm": 0.41595229154631397, + "learning_rate": 4.395461189157151e-06, + "loss": 0.0153, + "step": 5920 + }, + { + "epoch": 2.6938125568698816, + "grad_norm": 0.46265630144973596, + "learning_rate": 4.394042403534168e-06, + "loss": 0.0211, + "step": 5921 + }, + { + "epoch": 2.694267515923567, + "grad_norm": 0.3677226498607149, + "learning_rate": 4.3926236674273015e-06, + "loss": 0.0152, + "step": 5922 + }, + { + "epoch": 2.694722474977252, + "grad_norm": 0.3296085608612325, + "learning_rate": 4.391204980952488e-06, + "loss": 0.0053, + "step": 5923 + }, + { + "epoch": 2.695177434030937, + "grad_norm": 0.24938540793476988, + "learning_rate": 4.3897863442256524e-06, + "loss": 0.0064, + "step": 5924 + }, + { + "epoch": 2.6956323930846224, + "grad_norm": 0.3695274301851014, + "learning_rate": 4.388367757362722e-06, + "loss": 0.0106, + "step": 5925 + }, + { + "epoch": 2.6960873521383073, + "grad_norm": 0.42640183823398775, + "learning_rate": 4.386949220479615e-06, + "loss": 0.018, + "step": 5926 + }, + { + "epoch": 2.6965423111919926, + "grad_norm": 0.5087558783924742, + "learning_rate": 4.3855307336922506e-06, + "loss": 0.0113, + "step": 5927 + }, + { + "epoch": 2.696997270245678, + "grad_norm": 0.3870320579224213, + "learning_rate": 4.384112297116539e-06, + "loss": 0.0095, + "step": 5928 + }, + { + "epoch": 2.697452229299363, + "grad_norm": 0.3736399356844653, + "learning_rate": 4.382693910868391e-06, + "loss": 0.0114, + "step": 5929 + }, + { + "epoch": 2.697907188353048, + "grad_norm": 0.3062519866734188, + "learning_rate": 4.381275575063707e-06, + "loss": 0.0058, + "step": 5930 + }, + { + "epoch": 2.6983621474067334, + "grad_norm": 0.4299664104931159, + "learning_rate": 4.37985728981839e-06, + "loss": 0.0164, + "step": 5931 + }, + { + "epoch": 2.6988171064604187, + "grad_norm": 0.4447482202242733, + "learning_rate": 4.378439055248333e-06, + "loss": 0.0221, + "step": 5932 + }, + { + "epoch": 2.6992720655141036, + "grad_norm": 0.3920079999178914, + "learning_rate": 4.37702087146943e-06, + "loss": 0.0092, + "step": 5933 + }, + { + "epoch": 2.699727024567789, + "grad_norm": 0.5715092331290539, + "learning_rate": 4.3756027385975695e-06, + "loss": 0.0234, + "step": 5934 + }, + { + "epoch": 2.700181983621474, + "grad_norm": 0.45525462737236416, + "learning_rate": 4.374184656748631e-06, + "loss": 0.0104, + "step": 5935 + }, + { + "epoch": 2.700636942675159, + "grad_norm": 0.2209715620676055, + "learning_rate": 4.372766626038499e-06, + "loss": 0.0053, + "step": 5936 + }, + { + "epoch": 2.7010919017288444, + "grad_norm": 0.4108139339697753, + "learning_rate": 4.371348646583044e-06, + "loss": 0.0108, + "step": 5937 + }, + { + "epoch": 2.7015468607825297, + "grad_norm": 0.37570904277500317, + "learning_rate": 4.36993071849814e-06, + "loss": 0.0071, + "step": 5938 + }, + { + "epoch": 2.702001819836215, + "grad_norm": 0.7350727758399006, + "learning_rate": 4.368512841899651e-06, + "loss": 0.0249, + "step": 5939 + }, + { + "epoch": 2.7024567788899, + "grad_norm": 0.5080607083946161, + "learning_rate": 4.36709501690344e-06, + "loss": 0.0156, + "step": 5940 + }, + { + "epoch": 2.702911737943585, + "grad_norm": 0.3854516790863163, + "learning_rate": 4.365677243625367e-06, + "loss": 0.0081, + "step": 5941 + }, + { + "epoch": 2.7033666969972705, + "grad_norm": 0.3410881101028962, + "learning_rate": 4.364259522181286e-06, + "loss": 0.0103, + "step": 5942 + }, + { + "epoch": 2.7038216560509554, + "grad_norm": 0.35641324030309507, + "learning_rate": 4.362841852687045e-06, + "loss": 0.0093, + "step": 5943 + }, + { + "epoch": 2.7042766151046407, + "grad_norm": 0.5388055690150927, + "learning_rate": 4.361424235258491e-06, + "loss": 0.0166, + "step": 5944 + }, + { + "epoch": 2.704731574158326, + "grad_norm": 0.2631136612518011, + "learning_rate": 4.360006670011464e-06, + "loss": 0.0054, + "step": 5945 + }, + { + "epoch": 2.705186533212011, + "grad_norm": 0.2893901886937811, + "learning_rate": 4.3585891570618025e-06, + "loss": 0.0091, + "step": 5946 + }, + { + "epoch": 2.705641492265696, + "grad_norm": 0.5550008205804519, + "learning_rate": 4.357171696525336e-06, + "loss": 0.019, + "step": 5947 + }, + { + "epoch": 2.7060964513193815, + "grad_norm": 0.7036458097520657, + "learning_rate": 4.355754288517898e-06, + "loss": 0.0192, + "step": 5948 + }, + { + "epoch": 2.7065514103730663, + "grad_norm": 0.37716798265996226, + "learning_rate": 4.3543369331553094e-06, + "loss": 0.0098, + "step": 5949 + }, + { + "epoch": 2.7070063694267517, + "grad_norm": 0.35358119456607356, + "learning_rate": 4.352919630553393e-06, + "loss": 0.0122, + "step": 5950 + }, + { + "epoch": 2.707461328480437, + "grad_norm": 0.49091923935457776, + "learning_rate": 4.351502380827959e-06, + "loss": 0.0105, + "step": 5951 + }, + { + "epoch": 2.707916287534122, + "grad_norm": 0.3721912755689915, + "learning_rate": 4.350085184094824e-06, + "loss": 0.0114, + "step": 5952 + }, + { + "epoch": 2.708371246587807, + "grad_norm": 0.4595936797563464, + "learning_rate": 4.348668040469791e-06, + "loss": 0.0121, + "step": 5953 + }, + { + "epoch": 2.7088262056414925, + "grad_norm": 0.18308282044190585, + "learning_rate": 4.347250950068665e-06, + "loss": 0.0046, + "step": 5954 + }, + { + "epoch": 2.7092811646951773, + "grad_norm": 0.38133385208471704, + "learning_rate": 4.3458339130072435e-06, + "loss": 0.0134, + "step": 5955 + }, + { + "epoch": 2.7097361237488626, + "grad_norm": 0.3324887417715275, + "learning_rate": 4.34441692940132e-06, + "loss": 0.0086, + "step": 5956 + }, + { + "epoch": 2.710191082802548, + "grad_norm": 0.4121226392533765, + "learning_rate": 4.342999999366687e-06, + "loss": 0.011, + "step": 5957 + }, + { + "epoch": 2.710646041856233, + "grad_norm": 0.3664305268526003, + "learning_rate": 4.341583123019124e-06, + "loss": 0.0089, + "step": 5958 + }, + { + "epoch": 2.711101000909918, + "grad_norm": 0.30153585451944653, + "learning_rate": 4.340166300474418e-06, + "loss": 0.0108, + "step": 5959 + }, + { + "epoch": 2.7115559599636034, + "grad_norm": 0.2674138630433753, + "learning_rate": 4.338749531848339e-06, + "loss": 0.0079, + "step": 5960 + }, + { + "epoch": 2.7120109190172883, + "grad_norm": 0.4596102873673691, + "learning_rate": 4.337332817256662e-06, + "loss": 0.0158, + "step": 5961 + }, + { + "epoch": 2.7124658780709736, + "grad_norm": 0.36879563360197226, + "learning_rate": 4.3359161568151566e-06, + "loss": 0.0173, + "step": 5962 + }, + { + "epoch": 2.712920837124659, + "grad_norm": 0.31597139069597485, + "learning_rate": 4.334499550639583e-06, + "loss": 0.0092, + "step": 5963 + }, + { + "epoch": 2.713375796178344, + "grad_norm": 0.2618434781780756, + "learning_rate": 4.333082998845701e-06, + "loss": 0.008, + "step": 5964 + }, + { + "epoch": 2.713830755232029, + "grad_norm": 0.31157091138158305, + "learning_rate": 4.331666501549266e-06, + "loss": 0.0121, + "step": 5965 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.42119834558716646, + "learning_rate": 4.330250058866025e-06, + "loss": 0.0223, + "step": 5966 + }, + { + "epoch": 2.7147406733393993, + "grad_norm": 0.6547892014861587, + "learning_rate": 4.3288336709117246e-06, + "loss": 0.0219, + "step": 5967 + }, + { + "epoch": 2.7151956323930846, + "grad_norm": 0.4372691554822337, + "learning_rate": 4.327417337802104e-06, + "loss": 0.0121, + "step": 5968 + }, + { + "epoch": 2.71565059144677, + "grad_norm": 0.40059797983832385, + "learning_rate": 4.326001059652903e-06, + "loss": 0.0137, + "step": 5969 + }, + { + "epoch": 2.7161055505004548, + "grad_norm": 0.3263738631147659, + "learning_rate": 4.324584836579851e-06, + "loss": 0.0054, + "step": 5970 + }, + { + "epoch": 2.71656050955414, + "grad_norm": 0.32344836807620925, + "learning_rate": 4.323168668698677e-06, + "loss": 0.0138, + "step": 5971 + }, + { + "epoch": 2.7170154686078254, + "grad_norm": 0.45925229926678546, + "learning_rate": 4.321752556125103e-06, + "loss": 0.0124, + "step": 5972 + }, + { + "epoch": 2.7174704276615103, + "grad_norm": 0.5011067119815998, + "learning_rate": 4.320336498974845e-06, + "loss": 0.0228, + "step": 5973 + }, + { + "epoch": 2.7179253867151956, + "grad_norm": 0.7521044804384955, + "learning_rate": 4.3189204973636215e-06, + "loss": 0.0355, + "step": 5974 + }, + { + "epoch": 2.718380345768881, + "grad_norm": 0.32854602876346556, + "learning_rate": 4.317504551407136e-06, + "loss": 0.0077, + "step": 5975 + }, + { + "epoch": 2.7188353048225657, + "grad_norm": 0.3839985419288797, + "learning_rate": 4.316088661221099e-06, + "loss": 0.0118, + "step": 5976 + }, + { + "epoch": 2.719290263876251, + "grad_norm": 0.6425451914377049, + "learning_rate": 4.314672826921208e-06, + "loss": 0.0281, + "step": 5977 + }, + { + "epoch": 2.7197452229299364, + "grad_norm": 0.4744921390656975, + "learning_rate": 4.313257048623158e-06, + "loss": 0.0139, + "step": 5978 + }, + { + "epoch": 2.7202001819836212, + "grad_norm": 0.33409379873540984, + "learning_rate": 4.311841326442642e-06, + "loss": 0.008, + "step": 5979 + }, + { + "epoch": 2.7206551410373065, + "grad_norm": 0.3259174484089608, + "learning_rate": 4.310425660495343e-06, + "loss": 0.0105, + "step": 5980 + }, + { + "epoch": 2.721110100090992, + "grad_norm": 0.4861237315830944, + "learning_rate": 4.3090100508969465e-06, + "loss": 0.0195, + "step": 5981 + }, + { + "epoch": 2.7215650591446767, + "grad_norm": 0.5213304928328588, + "learning_rate": 4.307594497763127e-06, + "loss": 0.0148, + "step": 5982 + }, + { + "epoch": 2.722020018198362, + "grad_norm": 0.34951031591703374, + "learning_rate": 4.306179001209558e-06, + "loss": 0.0138, + "step": 5983 + }, + { + "epoch": 2.7224749772520473, + "grad_norm": 0.271038910785206, + "learning_rate": 4.304763561351909e-06, + "loss": 0.0088, + "step": 5984 + }, + { + "epoch": 2.722929936305732, + "grad_norm": 0.3836333079471888, + "learning_rate": 4.303348178305842e-06, + "loss": 0.0165, + "step": 5985 + }, + { + "epoch": 2.7233848953594175, + "grad_norm": 0.298228828505515, + "learning_rate": 4.301932852187016e-06, + "loss": 0.0108, + "step": 5986 + }, + { + "epoch": 2.723839854413103, + "grad_norm": 0.42299373707295224, + "learning_rate": 4.300517583111085e-06, + "loss": 0.0176, + "step": 5987 + }, + { + "epoch": 2.724294813466788, + "grad_norm": 0.2785147818525911, + "learning_rate": 4.299102371193698e-06, + "loss": 0.0082, + "step": 5988 + }, + { + "epoch": 2.724749772520473, + "grad_norm": 0.2568855967379187, + "learning_rate": 4.297687216550498e-06, + "loss": 0.0081, + "step": 5989 + }, + { + "epoch": 2.7252047315741583, + "grad_norm": 0.3593110390610532, + "learning_rate": 4.296272119297128e-06, + "loss": 0.0127, + "step": 5990 + }, + { + "epoch": 2.7256596906278436, + "grad_norm": 0.4960810888734688, + "learning_rate": 4.294857079549225e-06, + "loss": 0.0298, + "step": 5991 + }, + { + "epoch": 2.7261146496815285, + "grad_norm": 0.4111408777512469, + "learning_rate": 4.2934420974224145e-06, + "loss": 0.0071, + "step": 5992 + }, + { + "epoch": 2.726569608735214, + "grad_norm": 0.48783725226573443, + "learning_rate": 4.292027173032326e-06, + "loss": 0.0167, + "step": 5993 + }, + { + "epoch": 2.727024567788899, + "grad_norm": 0.6014951972548783, + "learning_rate": 4.29061230649458e-06, + "loss": 0.0366, + "step": 5994 + }, + { + "epoch": 2.7274795268425844, + "grad_norm": 0.5639786086544031, + "learning_rate": 4.289197497924792e-06, + "loss": 0.0158, + "step": 5995 + }, + { + "epoch": 2.7279344858962693, + "grad_norm": 0.511557736098475, + "learning_rate": 4.287782747438573e-06, + "loss": 0.0212, + "step": 5996 + }, + { + "epoch": 2.7283894449499546, + "grad_norm": 0.4175148879191534, + "learning_rate": 4.286368055151534e-06, + "loss": 0.0188, + "step": 5997 + }, + { + "epoch": 2.72884440400364, + "grad_norm": 0.42995148142816225, + "learning_rate": 4.2849534211792745e-06, + "loss": 0.0107, + "step": 5998 + }, + { + "epoch": 2.729299363057325, + "grad_norm": 0.32554889780250434, + "learning_rate": 4.283538845637391e-06, + "loss": 0.0062, + "step": 5999 + }, + { + "epoch": 2.72975432211101, + "grad_norm": 0.2858393798989467, + "learning_rate": 4.28212432864148e-06, + "loss": 0.0094, + "step": 6000 + }, + { + "epoch": 2.7302092811646954, + "grad_norm": 0.45764410917087667, + "learning_rate": 4.280709870307126e-06, + "loss": 0.0172, + "step": 6001 + }, + { + "epoch": 2.7306642402183803, + "grad_norm": 0.47934274558905793, + "learning_rate": 4.279295470749913e-06, + "loss": 0.0181, + "step": 6002 + }, + { + "epoch": 2.7311191992720656, + "grad_norm": 0.48026364129621, + "learning_rate": 4.277881130085417e-06, + "loss": 0.0108, + "step": 6003 + }, + { + "epoch": 2.731574158325751, + "grad_norm": 0.2637951056808094, + "learning_rate": 4.276466848429216e-06, + "loss": 0.0068, + "step": 6004 + }, + { + "epoch": 2.7320291173794358, + "grad_norm": 0.3407506054823702, + "learning_rate": 4.275052625896877e-06, + "loss": 0.0076, + "step": 6005 + }, + { + "epoch": 2.732484076433121, + "grad_norm": 0.2874278977902796, + "learning_rate": 4.273638462603963e-06, + "loss": 0.0069, + "step": 6006 + }, + { + "epoch": 2.7329390354868064, + "grad_norm": 0.34675813464582306, + "learning_rate": 4.272224358666034e-06, + "loss": 0.0094, + "step": 6007 + }, + { + "epoch": 2.7333939945404913, + "grad_norm": 0.4228462684046438, + "learning_rate": 4.270810314198644e-06, + "loss": 0.013, + "step": 6008 + }, + { + "epoch": 2.7338489535941766, + "grad_norm": 0.4965390314312646, + "learning_rate": 4.269396329317342e-06, + "loss": 0.0172, + "step": 6009 + }, + { + "epoch": 2.734303912647862, + "grad_norm": 0.5078479135463949, + "learning_rate": 4.2679824041376706e-06, + "loss": 0.0213, + "step": 6010 + }, + { + "epoch": 2.7347588717015467, + "grad_norm": 0.4779946394655433, + "learning_rate": 4.266568538775174e-06, + "loss": 0.0162, + "step": 6011 + }, + { + "epoch": 2.735213830755232, + "grad_norm": 0.5037789907695531, + "learning_rate": 4.265154733345383e-06, + "loss": 0.0222, + "step": 6012 + }, + { + "epoch": 2.7356687898089174, + "grad_norm": 0.48690389553033686, + "learning_rate": 4.2637409879638295e-06, + "loss": 0.0212, + "step": 6013 + }, + { + "epoch": 2.7361237488626022, + "grad_norm": 0.3585354660137679, + "learning_rate": 4.262327302746037e-06, + "loss": 0.0094, + "step": 6014 + }, + { + "epoch": 2.7365787079162875, + "grad_norm": 0.26747655226336553, + "learning_rate": 4.260913677807527e-06, + "loss": 0.0079, + "step": 6015 + }, + { + "epoch": 2.737033666969973, + "grad_norm": 0.45017825990090166, + "learning_rate": 4.259500113263812e-06, + "loss": 0.0132, + "step": 6016 + }, + { + "epoch": 2.7374886260236577, + "grad_norm": 0.35508190230188047, + "learning_rate": 4.258086609230403e-06, + "loss": 0.011, + "step": 6017 + }, + { + "epoch": 2.737943585077343, + "grad_norm": 0.44162569436143834, + "learning_rate": 4.256673165822808e-06, + "loss": 0.0102, + "step": 6018 + }, + { + "epoch": 2.7383985441310283, + "grad_norm": 0.5580313491582966, + "learning_rate": 4.255259783156524e-06, + "loss": 0.0283, + "step": 6019 + }, + { + "epoch": 2.738853503184713, + "grad_norm": 0.5012586476460804, + "learning_rate": 4.253846461347049e-06, + "loss": 0.0149, + "step": 6020 + }, + { + "epoch": 2.7393084622383985, + "grad_norm": 0.24032698607455938, + "learning_rate": 4.252433200509869e-06, + "loss": 0.0085, + "step": 6021 + }, + { + "epoch": 2.739763421292084, + "grad_norm": 0.4447599235839283, + "learning_rate": 4.251020000760474e-06, + "loss": 0.0228, + "step": 6022 + }, + { + "epoch": 2.7402183803457687, + "grad_norm": 0.37372332737273567, + "learning_rate": 4.2496068622143405e-06, + "loss": 0.019, + "step": 6023 + }, + { + "epoch": 2.740673339399454, + "grad_norm": 0.44428958710931893, + "learning_rate": 4.248193784986945e-06, + "loss": 0.0158, + "step": 6024 + }, + { + "epoch": 2.7411282984531393, + "grad_norm": 0.3596671825856569, + "learning_rate": 4.24678076919376e-06, + "loss": 0.0106, + "step": 6025 + }, + { + "epoch": 2.741583257506824, + "grad_norm": 0.6336535672429905, + "learning_rate": 4.2453678149502485e-06, + "loss": 0.034, + "step": 6026 + }, + { + "epoch": 2.7420382165605095, + "grad_norm": 0.41892272188920054, + "learning_rate": 4.243954922371872e-06, + "loss": 0.0162, + "step": 6027 + }, + { + "epoch": 2.742493175614195, + "grad_norm": 0.412957468127852, + "learning_rate": 4.242542091574083e-06, + "loss": 0.0146, + "step": 6028 + }, + { + "epoch": 2.7429481346678797, + "grad_norm": 0.2956409275628743, + "learning_rate": 4.241129322672336e-06, + "loss": 0.0074, + "step": 6029 + }, + { + "epoch": 2.743403093721565, + "grad_norm": 0.7754413600064651, + "learning_rate": 4.239716615782072e-06, + "loss": 0.0248, + "step": 6030 + }, + { + "epoch": 2.7438580527752503, + "grad_norm": 0.3767843229640008, + "learning_rate": 4.238303971018732e-06, + "loss": 0.0084, + "step": 6031 + }, + { + "epoch": 2.744313011828935, + "grad_norm": 0.83477873341383, + "learning_rate": 4.236891388497754e-06, + "loss": 0.0244, + "step": 6032 + }, + { + "epoch": 2.7447679708826205, + "grad_norm": 0.3487267676364768, + "learning_rate": 4.235478868334564e-06, + "loss": 0.0143, + "step": 6033 + }, + { + "epoch": 2.745222929936306, + "grad_norm": 0.4715632280454509, + "learning_rate": 4.23406641064459e-06, + "loss": 0.0212, + "step": 6034 + }, + { + "epoch": 2.7456778889899907, + "grad_norm": 0.13743236892341112, + "learning_rate": 4.2326540155432495e-06, + "loss": 0.0025, + "step": 6035 + }, + { + "epoch": 2.746132848043676, + "grad_norm": 0.3903137918449913, + "learning_rate": 4.231241683145957e-06, + "loss": 0.0169, + "step": 6036 + }, + { + "epoch": 2.7465878070973613, + "grad_norm": 0.33399784550717343, + "learning_rate": 4.229829413568123e-06, + "loss": 0.0098, + "step": 6037 + }, + { + "epoch": 2.747042766151046, + "grad_norm": 0.30915255490624294, + "learning_rate": 4.228417206925149e-06, + "loss": 0.0119, + "step": 6038 + }, + { + "epoch": 2.7474977252047315, + "grad_norm": 0.4498938401201024, + "learning_rate": 4.227005063332438e-06, + "loss": 0.0182, + "step": 6039 + }, + { + "epoch": 2.7479526842584168, + "grad_norm": 0.41696981771742386, + "learning_rate": 4.225592982905383e-06, + "loss": 0.011, + "step": 6040 + }, + { + "epoch": 2.7484076433121016, + "grad_norm": 0.5416946682788819, + "learning_rate": 4.224180965759371e-06, + "loss": 0.0188, + "step": 6041 + }, + { + "epoch": 2.748862602365787, + "grad_norm": 0.3394706961277096, + "learning_rate": 4.222769012009789e-06, + "loss": 0.0079, + "step": 6042 + }, + { + "epoch": 2.7493175614194723, + "grad_norm": 0.3145576845233851, + "learning_rate": 4.221357121772012e-06, + "loss": 0.0134, + "step": 6043 + }, + { + "epoch": 2.7497725204731576, + "grad_norm": 0.3780673339666677, + "learning_rate": 4.219945295161415e-06, + "loss": 0.0163, + "step": 6044 + }, + { + "epoch": 2.7502274795268424, + "grad_norm": 0.4413501139701228, + "learning_rate": 4.218533532293364e-06, + "loss": 0.022, + "step": 6045 + }, + { + "epoch": 2.7506824385805277, + "grad_norm": 0.4744950692812475, + "learning_rate": 4.2171218332832255e-06, + "loss": 0.0177, + "step": 6046 + }, + { + "epoch": 2.751137397634213, + "grad_norm": 0.2876589143233397, + "learning_rate": 4.215710198246355e-06, + "loss": 0.0119, + "step": 6047 + }, + { + "epoch": 2.7515923566878984, + "grad_norm": 0.3568692776496177, + "learning_rate": 4.2142986272981054e-06, + "loss": 0.0148, + "step": 6048 + }, + { + "epoch": 2.7520473157415832, + "grad_norm": 0.3987937262773912, + "learning_rate": 4.212887120553824e-06, + "loss": 0.0162, + "step": 6049 + }, + { + "epoch": 2.7525022747952685, + "grad_norm": 0.3970739293642638, + "learning_rate": 4.211475678128853e-06, + "loss": 0.016, + "step": 6050 + }, + { + "epoch": 2.752957233848954, + "grad_norm": 0.38257854830908083, + "learning_rate": 4.210064300138527e-06, + "loss": 0.0195, + "step": 6051 + }, + { + "epoch": 2.7534121929026387, + "grad_norm": 0.27539639933281646, + "learning_rate": 4.208652986698179e-06, + "loss": 0.0061, + "step": 6052 + }, + { + "epoch": 2.753867151956324, + "grad_norm": 0.276362732921801, + "learning_rate": 4.2072417379231366e-06, + "loss": 0.0092, + "step": 6053 + }, + { + "epoch": 2.7543221110100093, + "grad_norm": 0.30579099001049087, + "learning_rate": 4.205830553928719e-06, + "loss": 0.0072, + "step": 6054 + }, + { + "epoch": 2.754777070063694, + "grad_norm": 0.2791220968846108, + "learning_rate": 4.204419434830242e-06, + "loss": 0.0066, + "step": 6055 + }, + { + "epoch": 2.7552320291173795, + "grad_norm": 0.3965659540742579, + "learning_rate": 4.203008380743017e-06, + "loss": 0.0242, + "step": 6056 + }, + { + "epoch": 2.755686988171065, + "grad_norm": 0.4539489140290909, + "learning_rate": 4.201597391782346e-06, + "loss": 0.0139, + "step": 6057 + }, + { + "epoch": 2.7561419472247497, + "grad_norm": 0.23824613312272833, + "learning_rate": 4.200186468063532e-06, + "loss": 0.0082, + "step": 6058 + }, + { + "epoch": 2.756596906278435, + "grad_norm": 0.4477019339717532, + "learning_rate": 4.198775609701866e-06, + "loss": 0.0194, + "step": 6059 + }, + { + "epoch": 2.7570518653321203, + "grad_norm": 0.2994317448702024, + "learning_rate": 4.19736481681264e-06, + "loss": 0.0064, + "step": 6060 + }, + { + "epoch": 2.757506824385805, + "grad_norm": 0.37626624109614354, + "learning_rate": 4.195954089511138e-06, + "loss": 0.0109, + "step": 6061 + }, + { + "epoch": 2.7579617834394905, + "grad_norm": 0.4013219959387168, + "learning_rate": 4.194543427912635e-06, + "loss": 0.01, + "step": 6062 + }, + { + "epoch": 2.758416742493176, + "grad_norm": 0.7997522438204837, + "learning_rate": 4.1931328321324076e-06, + "loss": 0.0263, + "step": 6063 + }, + { + "epoch": 2.7588717015468607, + "grad_norm": 0.31119835503473015, + "learning_rate": 4.191722302285719e-06, + "loss": 0.0085, + "step": 6064 + }, + { + "epoch": 2.759326660600546, + "grad_norm": 0.2853810603396253, + "learning_rate": 4.190311838487835e-06, + "loss": 0.0092, + "step": 6065 + }, + { + "epoch": 2.7597816196542313, + "grad_norm": 0.28249320508886516, + "learning_rate": 4.18890144085401e-06, + "loss": 0.0113, + "step": 6066 + }, + { + "epoch": 2.760236578707916, + "grad_norm": 0.4242011220249303, + "learning_rate": 4.187491109499496e-06, + "loss": 0.0177, + "step": 6067 + }, + { + "epoch": 2.7606915377616015, + "grad_norm": 0.39698604392213305, + "learning_rate": 4.186080844539541e-06, + "loss": 0.0124, + "step": 6068 + }, + { + "epoch": 2.761146496815287, + "grad_norm": 0.5918489733481939, + "learning_rate": 4.1846706460893835e-06, + "loss": 0.0173, + "step": 6069 + }, + { + "epoch": 2.7616014558689717, + "grad_norm": 0.22072983320116718, + "learning_rate": 4.183260514264259e-06, + "loss": 0.0049, + "step": 6070 + }, + { + "epoch": 2.762056414922657, + "grad_norm": 0.33935785502715216, + "learning_rate": 4.181850449179397e-06, + "loss": 0.0096, + "step": 6071 + }, + { + "epoch": 2.7625113739763423, + "grad_norm": 0.2104903430711546, + "learning_rate": 4.180440450950021e-06, + "loss": 0.0053, + "step": 6072 + }, + { + "epoch": 2.762966333030027, + "grad_norm": 0.28401035184529577, + "learning_rate": 4.179030519691349e-06, + "loss": 0.01, + "step": 6073 + }, + { + "epoch": 2.7634212920837125, + "grad_norm": 0.2196656857103231, + "learning_rate": 4.1776206555185964e-06, + "loss": 0.0065, + "step": 6074 + }, + { + "epoch": 2.7638762511373978, + "grad_norm": 0.3298230170003272, + "learning_rate": 4.17621085854697e-06, + "loss": 0.009, + "step": 6075 + }, + { + "epoch": 2.7643312101910826, + "grad_norm": 0.4293989065704018, + "learning_rate": 4.174801128891673e-06, + "loss": 0.0163, + "step": 6076 + }, + { + "epoch": 2.764786169244768, + "grad_norm": 0.3193133909773695, + "learning_rate": 4.173391466667901e-06, + "loss": 0.0077, + "step": 6077 + }, + { + "epoch": 2.7652411282984533, + "grad_norm": 0.3839636462875678, + "learning_rate": 4.171981871990845e-06, + "loss": 0.0127, + "step": 6078 + }, + { + "epoch": 2.765696087352138, + "grad_norm": 0.3577973140064893, + "learning_rate": 4.1705723449756905e-06, + "loss": 0.0067, + "step": 6079 + }, + { + "epoch": 2.7661510464058234, + "grad_norm": 0.4196585376044528, + "learning_rate": 4.169162885737617e-06, + "loss": 0.0104, + "step": 6080 + }, + { + "epoch": 2.7666060054595087, + "grad_norm": 0.39230708506159845, + "learning_rate": 4.167753494391803e-06, + "loss": 0.0067, + "step": 6081 + }, + { + "epoch": 2.7670609645131936, + "grad_norm": 0.4203626802175535, + "learning_rate": 4.166344171053414e-06, + "loss": 0.0188, + "step": 6082 + }, + { + "epoch": 2.767515923566879, + "grad_norm": 0.38508313789767185, + "learning_rate": 4.164934915837616e-06, + "loss": 0.0114, + "step": 6083 + }, + { + "epoch": 2.7679708826205642, + "grad_norm": 0.2836512973439608, + "learning_rate": 4.163525728859564e-06, + "loss": 0.0071, + "step": 6084 + }, + { + "epoch": 2.768425841674249, + "grad_norm": 0.507854761946816, + "learning_rate": 4.162116610234413e-06, + "loss": 0.0168, + "step": 6085 + }, + { + "epoch": 2.7688808007279344, + "grad_norm": 0.44711296495064196, + "learning_rate": 4.160707560077308e-06, + "loss": 0.0114, + "step": 6086 + }, + { + "epoch": 2.7693357597816197, + "grad_norm": 0.4402637245154256, + "learning_rate": 4.15929857850339e-06, + "loss": 0.0135, + "step": 6087 + }, + { + "epoch": 2.7697907188353046, + "grad_norm": 0.38054718542309746, + "learning_rate": 4.157889665627797e-06, + "loss": 0.0205, + "step": 6088 + }, + { + "epoch": 2.77024567788899, + "grad_norm": 0.2969576548015972, + "learning_rate": 4.156480821565657e-06, + "loss": 0.0094, + "step": 6089 + }, + { + "epoch": 2.770700636942675, + "grad_norm": 0.40685272359600044, + "learning_rate": 4.155072046432096e-06, + "loss": 0.0106, + "step": 6090 + }, + { + "epoch": 2.77115559599636, + "grad_norm": 0.6099433784764327, + "learning_rate": 4.15366334034223e-06, + "loss": 0.0127, + "step": 6091 + }, + { + "epoch": 2.7716105550500454, + "grad_norm": 0.3697608331724075, + "learning_rate": 4.152254703411176e-06, + "loss": 0.0149, + "step": 6092 + }, + { + "epoch": 2.7720655141037307, + "grad_norm": 0.5553578973041321, + "learning_rate": 4.1508461357540375e-06, + "loss": 0.0261, + "step": 6093 + }, + { + "epoch": 2.7725204731574156, + "grad_norm": 0.41108563991522645, + "learning_rate": 4.149437637485917e-06, + "loss": 0.0131, + "step": 6094 + }, + { + "epoch": 2.772975432211101, + "grad_norm": 0.2981934428308497, + "learning_rate": 4.148029208721914e-06, + "loss": 0.0115, + "step": 6095 + }, + { + "epoch": 2.773430391264786, + "grad_norm": 0.27972718856948525, + "learning_rate": 4.146620849577116e-06, + "loss": 0.007, + "step": 6096 + }, + { + "epoch": 2.7738853503184715, + "grad_norm": 0.4735269301944073, + "learning_rate": 4.145212560166608e-06, + "loss": 0.0158, + "step": 6097 + }, + { + "epoch": 2.7743403093721564, + "grad_norm": 0.5992133569279419, + "learning_rate": 4.14380434060547e-06, + "loss": 0.024, + "step": 6098 + }, + { + "epoch": 2.7747952684258417, + "grad_norm": 0.41263465498683555, + "learning_rate": 4.142396191008775e-06, + "loss": 0.0136, + "step": 6099 + }, + { + "epoch": 2.775250227479527, + "grad_norm": 0.4540678890818895, + "learning_rate": 4.1409881114915895e-06, + "loss": 0.0149, + "step": 6100 + }, + { + "epoch": 2.775705186533212, + "grad_norm": 0.5350222753468424, + "learning_rate": 4.139580102168975e-06, + "loss": 0.0145, + "step": 6101 + }, + { + "epoch": 2.776160145586897, + "grad_norm": 0.5974686412677844, + "learning_rate": 4.138172163155991e-06, + "loss": 0.0133, + "step": 6102 + }, + { + "epoch": 2.7766151046405825, + "grad_norm": 0.2485634179454498, + "learning_rate": 4.136764294567684e-06, + "loss": 0.0069, + "step": 6103 + }, + { + "epoch": 2.777070063694268, + "grad_norm": 0.4455822554552264, + "learning_rate": 4.135356496519103e-06, + "loss": 0.0144, + "step": 6104 + }, + { + "epoch": 2.7775250227479527, + "grad_norm": 0.4095125624931906, + "learning_rate": 4.1339487691252835e-06, + "loss": 0.0115, + "step": 6105 + }, + { + "epoch": 2.777979981801638, + "grad_norm": 0.32133424970948393, + "learning_rate": 4.1325411125012596e-06, + "loss": 0.0108, + "step": 6106 + }, + { + "epoch": 2.7784349408553233, + "grad_norm": 0.363708717190287, + "learning_rate": 4.131133526762059e-06, + "loss": 0.0151, + "step": 6107 + }, + { + "epoch": 2.778889899909008, + "grad_norm": 0.4004201324139527, + "learning_rate": 4.129726012022699e-06, + "loss": 0.0142, + "step": 6108 + }, + { + "epoch": 2.7793448589626935, + "grad_norm": 0.40719233124918014, + "learning_rate": 4.128318568398203e-06, + "loss": 0.0123, + "step": 6109 + }, + { + "epoch": 2.7797998180163788, + "grad_norm": 0.1069858768999529, + "learning_rate": 4.126911196003577e-06, + "loss": 0.0016, + "step": 6110 + }, + { + "epoch": 2.7802547770700636, + "grad_norm": 0.4986757196985265, + "learning_rate": 4.125503894953824e-06, + "loss": 0.0151, + "step": 6111 + }, + { + "epoch": 2.780709736123749, + "grad_norm": 0.40006597528261445, + "learning_rate": 4.124096665363945e-06, + "loss": 0.0097, + "step": 6112 + }, + { + "epoch": 2.7811646951774343, + "grad_norm": 0.3556187303609429, + "learning_rate": 4.122689507348929e-06, + "loss": 0.0126, + "step": 6113 + }, + { + "epoch": 2.781619654231119, + "grad_norm": 0.2980746095953485, + "learning_rate": 4.121282421023766e-06, + "loss": 0.0064, + "step": 6114 + }, + { + "epoch": 2.7820746132848044, + "grad_norm": 0.3245688169288705, + "learning_rate": 4.119875406503434e-06, + "loss": 0.0072, + "step": 6115 + }, + { + "epoch": 2.7825295723384897, + "grad_norm": 0.3359587015267282, + "learning_rate": 4.11846846390291e-06, + "loss": 0.0114, + "step": 6116 + }, + { + "epoch": 2.7829845313921746, + "grad_norm": 0.3609356095211249, + "learning_rate": 4.117061593337163e-06, + "loss": 0.0108, + "step": 6117 + }, + { + "epoch": 2.78343949044586, + "grad_norm": 0.3814984149878645, + "learning_rate": 4.115654794921154e-06, + "loss": 0.012, + "step": 6118 + }, + { + "epoch": 2.7838944494995452, + "grad_norm": 0.3702545080140778, + "learning_rate": 4.114248068769843e-06, + "loss": 0.0105, + "step": 6119 + }, + { + "epoch": 2.78434940855323, + "grad_norm": 0.2089677221723774, + "learning_rate": 4.112841414998178e-06, + "loss": 0.0062, + "step": 6120 + }, + { + "epoch": 2.7848043676069154, + "grad_norm": 0.565495547702211, + "learning_rate": 4.111434833721108e-06, + "loss": 0.009, + "step": 6121 + }, + { + "epoch": 2.7852593266606007, + "grad_norm": 0.5192264102055605, + "learning_rate": 4.110028325053568e-06, + "loss": 0.0128, + "step": 6122 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.3855956011187438, + "learning_rate": 4.1086218891104955e-06, + "loss": 0.013, + "step": 6123 + }, + { + "epoch": 2.786169244767971, + "grad_norm": 0.3387686752427193, + "learning_rate": 4.107215526006818e-06, + "loss": 0.0063, + "step": 6124 + }, + { + "epoch": 2.786624203821656, + "grad_norm": 0.3411160755212013, + "learning_rate": 4.105809235857454e-06, + "loss": 0.0114, + "step": 6125 + }, + { + "epoch": 2.787079162875341, + "grad_norm": 0.4634161517162885, + "learning_rate": 4.104403018777323e-06, + "loss": 0.0135, + "step": 6126 + }, + { + "epoch": 2.7875341219290264, + "grad_norm": 0.43586130194323086, + "learning_rate": 4.102996874881332e-06, + "loss": 0.0144, + "step": 6127 + }, + { + "epoch": 2.7879890809827117, + "grad_norm": 0.2333417989509398, + "learning_rate": 4.101590804284386e-06, + "loss": 0.0044, + "step": 6128 + }, + { + "epoch": 2.7884440400363966, + "grad_norm": 0.37099998245746507, + "learning_rate": 4.10018480710138e-06, + "loss": 0.0129, + "step": 6129 + }, + { + "epoch": 2.788898999090082, + "grad_norm": 0.3605216388388408, + "learning_rate": 4.09877888344721e-06, + "loss": 0.0116, + "step": 6130 + }, + { + "epoch": 2.789353958143767, + "grad_norm": 0.32127702019694454, + "learning_rate": 4.09737303343676e-06, + "loss": 0.0078, + "step": 6131 + }, + { + "epoch": 2.789808917197452, + "grad_norm": 0.4105825824192487, + "learning_rate": 4.0959672571849085e-06, + "loss": 0.0116, + "step": 6132 + }, + { + "epoch": 2.7902638762511374, + "grad_norm": 0.5365128626959759, + "learning_rate": 4.094561554806532e-06, + "loss": 0.03, + "step": 6133 + }, + { + "epoch": 2.7907188353048227, + "grad_norm": 1.2024839942652652, + "learning_rate": 4.093155926416494e-06, + "loss": 0.0171, + "step": 6134 + }, + { + "epoch": 2.7911737943585075, + "grad_norm": 0.42772647366704053, + "learning_rate": 4.091750372129661e-06, + "loss": 0.0135, + "step": 6135 + }, + { + "epoch": 2.791628753412193, + "grad_norm": 0.6080903619430011, + "learning_rate": 4.090344892060883e-06, + "loss": 0.0143, + "step": 6136 + }, + { + "epoch": 2.792083712465878, + "grad_norm": 0.2714797294601795, + "learning_rate": 4.0889394863250135e-06, + "loss": 0.0095, + "step": 6137 + }, + { + "epoch": 2.792538671519563, + "grad_norm": 0.31361914857417433, + "learning_rate": 4.087534155036896e-06, + "loss": 0.0078, + "step": 6138 + }, + { + "epoch": 2.7929936305732483, + "grad_norm": 0.5677242944360218, + "learning_rate": 4.086128898311365e-06, + "loss": 0.0321, + "step": 6139 + }, + { + "epoch": 2.7934485896269337, + "grad_norm": 0.2690088037282102, + "learning_rate": 4.084723716263255e-06, + "loss": 0.0063, + "step": 6140 + }, + { + "epoch": 2.7939035486806185, + "grad_norm": 0.30559644460865787, + "learning_rate": 4.08331860900739e-06, + "loss": 0.0068, + "step": 6141 + }, + { + "epoch": 2.794358507734304, + "grad_norm": 0.4406939030506512, + "learning_rate": 4.0819135766585875e-06, + "loss": 0.0095, + "step": 6142 + }, + { + "epoch": 2.794813466787989, + "grad_norm": 0.2787607200087893, + "learning_rate": 4.08050861933166e-06, + "loss": 0.0111, + "step": 6143 + }, + { + "epoch": 2.795268425841674, + "grad_norm": 0.3172800639046428, + "learning_rate": 4.079103737141417e-06, + "loss": 0.0116, + "step": 6144 + }, + { + "epoch": 2.7957233848953593, + "grad_norm": 0.4815081625712856, + "learning_rate": 4.077698930202659e-06, + "loss": 0.0172, + "step": 6145 + }, + { + "epoch": 2.7961783439490446, + "grad_norm": 0.39982219763098237, + "learning_rate": 4.076294198630179e-06, + "loss": 0.0161, + "step": 6146 + }, + { + "epoch": 2.7966333030027295, + "grad_norm": 0.34409258783246904, + "learning_rate": 4.074889542538765e-06, + "loss": 0.0085, + "step": 6147 + }, + { + "epoch": 2.797088262056415, + "grad_norm": 0.3676107651287223, + "learning_rate": 4.073484962043201e-06, + "loss": 0.0109, + "step": 6148 + }, + { + "epoch": 2.7975432211101, + "grad_norm": 0.5426235224871229, + "learning_rate": 4.0720804572582604e-06, + "loss": 0.0179, + "step": 6149 + }, + { + "epoch": 2.797998180163785, + "grad_norm": 0.29564947110329504, + "learning_rate": 4.070676028298713e-06, + "loss": 0.0093, + "step": 6150 + }, + { + "epoch": 2.7984531392174703, + "grad_norm": 0.29361865044206104, + "learning_rate": 4.069271675279326e-06, + "loss": 0.0058, + "step": 6151 + }, + { + "epoch": 2.7989080982711556, + "grad_norm": 0.4724242565339271, + "learning_rate": 4.067867398314853e-06, + "loss": 0.0148, + "step": 6152 + }, + { + "epoch": 2.799363057324841, + "grad_norm": 0.3957167051302801, + "learning_rate": 4.06646319752005e-06, + "loss": 0.0126, + "step": 6153 + }, + { + "epoch": 2.799818016378526, + "grad_norm": 0.2952970132079702, + "learning_rate": 4.065059073009656e-06, + "loss": 0.0037, + "step": 6154 + }, + { + "epoch": 2.800272975432211, + "grad_norm": 0.5179131046118575, + "learning_rate": 4.063655024898413e-06, + "loss": 0.0155, + "step": 6155 + }, + { + "epoch": 2.8007279344858964, + "grad_norm": 0.32668303841341806, + "learning_rate": 4.062251053301053e-06, + "loss": 0.0057, + "step": 6156 + }, + { + "epoch": 2.8011828935395813, + "grad_norm": 0.4738822446233882, + "learning_rate": 4.0608471583323e-06, + "loss": 0.0123, + "step": 6157 + }, + { + "epoch": 2.8016378525932666, + "grad_norm": 0.4115028385994002, + "learning_rate": 4.059443340106879e-06, + "loss": 0.0131, + "step": 6158 + }, + { + "epoch": 2.802092811646952, + "grad_norm": 0.4445545794301525, + "learning_rate": 4.0580395987394985e-06, + "loss": 0.0135, + "step": 6159 + }, + { + "epoch": 2.802547770700637, + "grad_norm": 0.4974983954631685, + "learning_rate": 4.05663593434487e-06, + "loss": 0.0129, + "step": 6160 + }, + { + "epoch": 2.803002729754322, + "grad_norm": 0.5583654847070216, + "learning_rate": 4.0552323470376916e-06, + "loss": 0.0172, + "step": 6161 + }, + { + "epoch": 2.8034576888080074, + "grad_norm": 0.4321008907232332, + "learning_rate": 4.05382883693266e-06, + "loss": 0.0222, + "step": 6162 + }, + { + "epoch": 2.8039126478616927, + "grad_norm": 0.2227093598090755, + "learning_rate": 4.052425404144463e-06, + "loss": 0.0051, + "step": 6163 + }, + { + "epoch": 2.8043676069153776, + "grad_norm": 0.3694282504252912, + "learning_rate": 4.051022048787781e-06, + "loss": 0.0097, + "step": 6164 + }, + { + "epoch": 2.804822565969063, + "grad_norm": 0.5681438221233656, + "learning_rate": 4.049618770977294e-06, + "loss": 0.018, + "step": 6165 + }, + { + "epoch": 2.805277525022748, + "grad_norm": 0.2892593210016612, + "learning_rate": 4.048215570827668e-06, + "loss": 0.0047, + "step": 6166 + }, + { + "epoch": 2.805732484076433, + "grad_norm": 0.23883116102697122, + "learning_rate": 4.046812448453568e-06, + "loss": 0.0071, + "step": 6167 + }, + { + "epoch": 2.8061874431301184, + "grad_norm": 0.28257402229772005, + "learning_rate": 4.045409403969649e-06, + "loss": 0.0061, + "step": 6168 + }, + { + "epoch": 2.8066424021838037, + "grad_norm": 0.4509944900552927, + "learning_rate": 4.044006437490564e-06, + "loss": 0.0092, + "step": 6169 + }, + { + "epoch": 2.8070973612374885, + "grad_norm": 0.43804886630996964, + "learning_rate": 4.042603549130955e-06, + "loss": 0.0126, + "step": 6170 + }, + { + "epoch": 2.807552320291174, + "grad_norm": 0.25021342325412005, + "learning_rate": 4.041200739005459e-06, + "loss": 0.0045, + "step": 6171 + }, + { + "epoch": 2.808007279344859, + "grad_norm": 0.2863598785273324, + "learning_rate": 4.039798007228711e-06, + "loss": 0.0135, + "step": 6172 + }, + { + "epoch": 2.808462238398544, + "grad_norm": 0.28327946187334474, + "learning_rate": 4.038395353915332e-06, + "loss": 0.0095, + "step": 6173 + }, + { + "epoch": 2.8089171974522293, + "grad_norm": 0.4102568188771912, + "learning_rate": 4.036992779179944e-06, + "loss": 0.011, + "step": 6174 + }, + { + "epoch": 2.8093721565059147, + "grad_norm": 0.6984396517670316, + "learning_rate": 4.035590283137155e-06, + "loss": 0.0185, + "step": 6175 + }, + { + "epoch": 2.8098271155595995, + "grad_norm": 0.22159635474375433, + "learning_rate": 4.034187865901576e-06, + "loss": 0.0062, + "step": 6176 + }, + { + "epoch": 2.810282074613285, + "grad_norm": 0.5517320078633793, + "learning_rate": 4.0327855275878005e-06, + "loss": 0.022, + "step": 6177 + }, + { + "epoch": 2.81073703366697, + "grad_norm": 0.33894270663938314, + "learning_rate": 4.031383268310422e-06, + "loss": 0.0063, + "step": 6178 + }, + { + "epoch": 2.811191992720655, + "grad_norm": 0.23034929824612319, + "learning_rate": 4.029981088184031e-06, + "loss": 0.0061, + "step": 6179 + }, + { + "epoch": 2.8116469517743403, + "grad_norm": 0.3090425666502145, + "learning_rate": 4.028578987323206e-06, + "loss": 0.0118, + "step": 6180 + }, + { + "epoch": 2.8121019108280256, + "grad_norm": 0.500033020983605, + "learning_rate": 4.027176965842518e-06, + "loss": 0.0119, + "step": 6181 + }, + { + "epoch": 2.8125568698817105, + "grad_norm": 0.3249202590119885, + "learning_rate": 4.025775023856535e-06, + "loss": 0.0089, + "step": 6182 + }, + { + "epoch": 2.813011828935396, + "grad_norm": 0.3949380686526961, + "learning_rate": 4.024373161479817e-06, + "loss": 0.0102, + "step": 6183 + }, + { + "epoch": 2.813466787989081, + "grad_norm": 0.3923780176585343, + "learning_rate": 4.02297137882692e-06, + "loss": 0.0117, + "step": 6184 + }, + { + "epoch": 2.813921747042766, + "grad_norm": 0.4570940911572673, + "learning_rate": 4.0215696760123864e-06, + "loss": 0.0086, + "step": 6185 + }, + { + "epoch": 2.8143767060964513, + "grad_norm": 0.2693777188109058, + "learning_rate": 4.020168053150763e-06, + "loss": 0.0086, + "step": 6186 + }, + { + "epoch": 2.8148316651501366, + "grad_norm": 0.5132690913024266, + "learning_rate": 4.018766510356582e-06, + "loss": 0.0195, + "step": 6187 + }, + { + "epoch": 2.8152866242038215, + "grad_norm": 0.4280831276148228, + "learning_rate": 4.017365047744368e-06, + "loss": 0.0107, + "step": 6188 + }, + { + "epoch": 2.815741583257507, + "grad_norm": 0.39207033872188096, + "learning_rate": 4.015963665428647e-06, + "loss": 0.0131, + "step": 6189 + }, + { + "epoch": 2.816196542311192, + "grad_norm": 0.37182274264419013, + "learning_rate": 4.014562363523931e-06, + "loss": 0.0077, + "step": 6190 + }, + { + "epoch": 2.816651501364877, + "grad_norm": 0.36959480584241267, + "learning_rate": 4.013161142144729e-06, + "loss": 0.0081, + "step": 6191 + }, + { + "epoch": 2.8171064604185623, + "grad_norm": 0.48029848145136084, + "learning_rate": 4.011760001405539e-06, + "loss": 0.0196, + "step": 6192 + }, + { + "epoch": 2.8175614194722476, + "grad_norm": 0.39938984185530507, + "learning_rate": 4.010358941420861e-06, + "loss": 0.0144, + "step": 6193 + }, + { + "epoch": 2.8180163785259325, + "grad_norm": 0.5186541852711295, + "learning_rate": 4.008957962305181e-06, + "loss": 0.0077, + "step": 6194 + }, + { + "epoch": 2.8184713375796178, + "grad_norm": 0.4704822675115242, + "learning_rate": 4.007557064172981e-06, + "loss": 0.0148, + "step": 6195 + }, + { + "epoch": 2.818926296633303, + "grad_norm": 0.4576744056253656, + "learning_rate": 4.0061562471387364e-06, + "loss": 0.0168, + "step": 6196 + }, + { + "epoch": 2.819381255686988, + "grad_norm": 0.4640905261962327, + "learning_rate": 4.004755511316913e-06, + "loss": 0.0132, + "step": 6197 + }, + { + "epoch": 2.8198362147406733, + "grad_norm": 0.5765526220206479, + "learning_rate": 4.003354856821978e-06, + "loss": 0.012, + "step": 6198 + }, + { + "epoch": 2.8202911737943586, + "grad_norm": 0.4219405088038093, + "learning_rate": 4.001954283768379e-06, + "loss": 0.0133, + "step": 6199 + }, + { + "epoch": 2.8207461328480434, + "grad_norm": 0.46041164012862307, + "learning_rate": 4.0005537922705715e-06, + "loss": 0.0161, + "step": 6200 + }, + { + "epoch": 2.8212010919017287, + "grad_norm": 0.2979993492422895, + "learning_rate": 3.999153382442995e-06, + "loss": 0.011, + "step": 6201 + }, + { + "epoch": 2.821656050955414, + "grad_norm": 0.5193864738255584, + "learning_rate": 3.997753054400083e-06, + "loss": 0.0224, + "step": 6202 + }, + { + "epoch": 2.822111010009099, + "grad_norm": 0.263340828216671, + "learning_rate": 3.996352808256267e-06, + "loss": 0.0068, + "step": 6203 + }, + { + "epoch": 2.8225659690627842, + "grad_norm": 0.9075308243517225, + "learning_rate": 3.994952644125965e-06, + "loss": 0.0379, + "step": 6204 + }, + { + "epoch": 2.8230209281164695, + "grad_norm": 0.4606940433030956, + "learning_rate": 3.993552562123596e-06, + "loss": 0.0079, + "step": 6205 + }, + { + "epoch": 2.823475887170155, + "grad_norm": 0.46382897126081213, + "learning_rate": 3.9921525623635645e-06, + "loss": 0.0242, + "step": 6206 + }, + { + "epoch": 2.8239308462238397, + "grad_norm": 0.45456026962942275, + "learning_rate": 3.990752644960275e-06, + "loss": 0.0086, + "step": 6207 + }, + { + "epoch": 2.824385805277525, + "grad_norm": 0.4109895851058085, + "learning_rate": 3.989352810028123e-06, + "loss": 0.0144, + "step": 6208 + }, + { + "epoch": 2.8248407643312103, + "grad_norm": 0.44216401737597316, + "learning_rate": 3.987953057681494e-06, + "loss": 0.015, + "step": 6209 + }, + { + "epoch": 2.825295723384895, + "grad_norm": 0.47281748571493637, + "learning_rate": 3.986553388034772e-06, + "loss": 0.0098, + "step": 6210 + }, + { + "epoch": 2.8257506824385805, + "grad_norm": 0.5202086496180477, + "learning_rate": 3.985153801202329e-06, + "loss": 0.0115, + "step": 6211 + }, + { + "epoch": 2.826205641492266, + "grad_norm": 0.42007418723643686, + "learning_rate": 3.983754297298536e-06, + "loss": 0.013, + "step": 6212 + }, + { + "epoch": 2.826660600545951, + "grad_norm": 0.2580674306748603, + "learning_rate": 3.98235487643775e-06, + "loss": 0.0075, + "step": 6213 + }, + { + "epoch": 2.827115559599636, + "grad_norm": 0.4831461625018207, + "learning_rate": 3.980955538734329e-06, + "loss": 0.0148, + "step": 6214 + }, + { + "epoch": 2.8275705186533213, + "grad_norm": 0.4924236442432863, + "learning_rate": 3.9795562843026205e-06, + "loss": 0.0156, + "step": 6215 + }, + { + "epoch": 2.8280254777070066, + "grad_norm": 0.34345252750675803, + "learning_rate": 3.9781571132569644e-06, + "loss": 0.0068, + "step": 6216 + }, + { + "epoch": 2.8284804367606915, + "grad_norm": 0.3657755175918713, + "learning_rate": 3.976758025711693e-06, + "loss": 0.0131, + "step": 6217 + }, + { + "epoch": 2.828935395814377, + "grad_norm": 0.3521811277945949, + "learning_rate": 3.975359021781136e-06, + "loss": 0.0114, + "step": 6218 + }, + { + "epoch": 2.829390354868062, + "grad_norm": 0.6147186033564752, + "learning_rate": 3.973960101579611e-06, + "loss": 0.0241, + "step": 6219 + }, + { + "epoch": 2.829845313921747, + "grad_norm": 0.2926985229278627, + "learning_rate": 3.9725612652214325e-06, + "loss": 0.0073, + "step": 6220 + }, + { + "epoch": 2.8303002729754323, + "grad_norm": 0.5536621993472162, + "learning_rate": 3.971162512820909e-06, + "loss": 0.0288, + "step": 6221 + }, + { + "epoch": 2.8307552320291176, + "grad_norm": 0.2359701509054112, + "learning_rate": 3.969763844492338e-06, + "loss": 0.0039, + "step": 6222 + }, + { + "epoch": 2.8312101910828025, + "grad_norm": 0.45417416402719135, + "learning_rate": 3.968365260350014e-06, + "loss": 0.0095, + "step": 6223 + }, + { + "epoch": 2.831665150136488, + "grad_norm": 0.4692648445169428, + "learning_rate": 3.9669667605082215e-06, + "loss": 0.0165, + "step": 6224 + }, + { + "epoch": 2.832120109190173, + "grad_norm": 0.5301083101280761, + "learning_rate": 3.965568345081242e-06, + "loss": 0.0205, + "step": 6225 + }, + { + "epoch": 2.832575068243858, + "grad_norm": 0.408721164035492, + "learning_rate": 3.964170014183344e-06, + "loss": 0.0111, + "step": 6226 + }, + { + "epoch": 2.8330300272975433, + "grad_norm": 0.399372569637041, + "learning_rate": 3.962771767928793e-06, + "loss": 0.016, + "step": 6227 + }, + { + "epoch": 2.8334849863512286, + "grad_norm": 0.2714697972879553, + "learning_rate": 3.961373606431852e-06, + "loss": 0.0082, + "step": 6228 + }, + { + "epoch": 2.8339399454049135, + "grad_norm": 0.2513873152496805, + "learning_rate": 3.959975529806767e-06, + "loss": 0.006, + "step": 6229 + }, + { + "epoch": 2.8343949044585988, + "grad_norm": 0.27400233417289505, + "learning_rate": 3.958577538167788e-06, + "loss": 0.0081, + "step": 6230 + }, + { + "epoch": 2.834849863512284, + "grad_norm": 0.3389260811669458, + "learning_rate": 3.957179631629148e-06, + "loss": 0.0098, + "step": 6231 + }, + { + "epoch": 2.835304822565969, + "grad_norm": 0.307024384609634, + "learning_rate": 3.9557818103050794e-06, + "loss": 0.0093, + "step": 6232 + }, + { + "epoch": 2.8357597816196543, + "grad_norm": 0.16866023560329357, + "learning_rate": 3.954384074309805e-06, + "loss": 0.0032, + "step": 6233 + }, + { + "epoch": 2.8362147406733396, + "grad_norm": 0.46015536527315654, + "learning_rate": 3.952986423757541e-06, + "loss": 0.0135, + "step": 6234 + }, + { + "epoch": 2.8366696997270244, + "grad_norm": 0.3440336028007375, + "learning_rate": 3.9515888587625e-06, + "loss": 0.0089, + "step": 6235 + }, + { + "epoch": 2.8371246587807097, + "grad_norm": 0.5030475499553019, + "learning_rate": 3.9501913794388826e-06, + "loss": 0.0243, + "step": 6236 + }, + { + "epoch": 2.837579617834395, + "grad_norm": 0.2591496580193672, + "learning_rate": 3.9487939859008855e-06, + "loss": 0.0118, + "step": 6237 + }, + { + "epoch": 2.83803457688808, + "grad_norm": 0.3818984107723288, + "learning_rate": 3.947396678262696e-06, + "loss": 0.0192, + "step": 6238 + }, + { + "epoch": 2.8384895359417652, + "grad_norm": 0.31222358788521987, + "learning_rate": 3.9459994566384965e-06, + "loss": 0.0071, + "step": 6239 + }, + { + "epoch": 2.8389444949954505, + "grad_norm": 0.37310107381749025, + "learning_rate": 3.944602321142461e-06, + "loss": 0.0109, + "step": 6240 + }, + { + "epoch": 2.8393994540491354, + "grad_norm": 0.36063037626474487, + "learning_rate": 3.943205271888757e-06, + "loss": 0.0125, + "step": 6241 + }, + { + "epoch": 2.8398544131028207, + "grad_norm": 0.3976547008235038, + "learning_rate": 3.941808308991548e-06, + "loss": 0.0119, + "step": 6242 + }, + { + "epoch": 2.840309372156506, + "grad_norm": 0.42885646552793905, + "learning_rate": 3.940411432564983e-06, + "loss": 0.0198, + "step": 6243 + }, + { + "epoch": 2.840764331210191, + "grad_norm": 0.32609775183349216, + "learning_rate": 3.939014642723213e-06, + "loss": 0.0082, + "step": 6244 + }, + { + "epoch": 2.841219290263876, + "grad_norm": 0.2633611789673397, + "learning_rate": 3.937617939580374e-06, + "loss": 0.0095, + "step": 6245 + }, + { + "epoch": 2.8416742493175615, + "grad_norm": 0.7468469436321076, + "learning_rate": 3.936221323250599e-06, + "loss": 0.0387, + "step": 6246 + }, + { + "epoch": 2.8421292083712464, + "grad_norm": 0.46187922087775524, + "learning_rate": 3.9348247938480134e-06, + "loss": 0.0189, + "step": 6247 + }, + { + "epoch": 2.8425841674249317, + "grad_norm": 0.3614147118478118, + "learning_rate": 3.9334283514867334e-06, + "loss": 0.0154, + "step": 6248 + }, + { + "epoch": 2.843039126478617, + "grad_norm": 0.4203956918022225, + "learning_rate": 3.932031996280875e-06, + "loss": 0.0129, + "step": 6249 + }, + { + "epoch": 2.843494085532302, + "grad_norm": 0.36877925475134965, + "learning_rate": 3.9306357283445375e-06, + "loss": 0.0072, + "step": 6250 + }, + { + "epoch": 2.843949044585987, + "grad_norm": 0.31676464172553337, + "learning_rate": 3.929239547791821e-06, + "loss": 0.0113, + "step": 6251 + }, + { + "epoch": 2.8444040036396725, + "grad_norm": 0.33370319234845225, + "learning_rate": 3.927843454736812e-06, + "loss": 0.0083, + "step": 6252 + }, + { + "epoch": 2.8448589626933574, + "grad_norm": 0.4779432996177093, + "learning_rate": 3.926447449293593e-06, + "loss": 0.0165, + "step": 6253 + }, + { + "epoch": 2.8453139217470427, + "grad_norm": 0.3448796729681847, + "learning_rate": 3.925051531576242e-06, + "loss": 0.0107, + "step": 6254 + }, + { + "epoch": 2.845768880800728, + "grad_norm": 0.7920240022008865, + "learning_rate": 3.923655701698823e-06, + "loss": 0.0151, + "step": 6255 + }, + { + "epoch": 2.846223839854413, + "grad_norm": 0.3929181089463895, + "learning_rate": 3.922259959775401e-06, + "loss": 0.0074, + "step": 6256 + }, + { + "epoch": 2.846678798908098, + "grad_norm": 0.48258847782440906, + "learning_rate": 3.920864305920028e-06, + "loss": 0.0108, + "step": 6257 + }, + { + "epoch": 2.8471337579617835, + "grad_norm": 0.3296749800384298, + "learning_rate": 3.919468740246751e-06, + "loss": 0.0087, + "step": 6258 + }, + { + "epoch": 2.8475887170154683, + "grad_norm": 0.28240203229145405, + "learning_rate": 3.9180732628696085e-06, + "loss": 0.0069, + "step": 6259 + }, + { + "epoch": 2.8480436760691537, + "grad_norm": 0.43034205793785113, + "learning_rate": 3.916677873902633e-06, + "loss": 0.0086, + "step": 6260 + }, + { + "epoch": 2.848498635122839, + "grad_norm": 0.5889995269761055, + "learning_rate": 3.91528257345985e-06, + "loss": 0.0214, + "step": 6261 + }, + { + "epoch": 2.8489535941765243, + "grad_norm": 0.2561110247434447, + "learning_rate": 3.913887361655274e-06, + "loss": 0.0047, + "step": 6262 + }, + { + "epoch": 2.849408553230209, + "grad_norm": 0.4018927452114139, + "learning_rate": 3.91249223860292e-06, + "loss": 0.0103, + "step": 6263 + }, + { + "epoch": 2.8498635122838945, + "grad_norm": 0.37873751535407935, + "learning_rate": 3.9110972044167895e-06, + "loss": 0.01, + "step": 6264 + }, + { + "epoch": 2.8503184713375798, + "grad_norm": 0.30995861125402663, + "learning_rate": 3.909702259210877e-06, + "loss": 0.0122, + "step": 6265 + }, + { + "epoch": 2.8507734303912646, + "grad_norm": 0.3161837188029331, + "learning_rate": 3.908307403099173e-06, + "loss": 0.0093, + "step": 6266 + }, + { + "epoch": 2.85122838944495, + "grad_norm": 0.3327609618084383, + "learning_rate": 3.906912636195658e-06, + "loss": 0.0075, + "step": 6267 + }, + { + "epoch": 2.8516833484986353, + "grad_norm": 0.32864109799528896, + "learning_rate": 3.905517958614306e-06, + "loss": 0.0075, + "step": 6268 + }, + { + "epoch": 2.8521383075523206, + "grad_norm": 0.4799548165970622, + "learning_rate": 3.904123370469082e-06, + "loss": 0.0134, + "step": 6269 + }, + { + "epoch": 2.8525932666060054, + "grad_norm": 0.42389617533635005, + "learning_rate": 3.902728871873948e-06, + "loss": 0.013, + "step": 6270 + }, + { + "epoch": 2.8530482256596907, + "grad_norm": 0.33785952911566375, + "learning_rate": 3.901334462942857e-06, + "loss": 0.0109, + "step": 6271 + }, + { + "epoch": 2.853503184713376, + "grad_norm": 0.5898203473860996, + "learning_rate": 3.899940143789751e-06, + "loss": 0.0194, + "step": 6272 + }, + { + "epoch": 2.853958143767061, + "grad_norm": 0.3206891964404092, + "learning_rate": 3.898545914528569e-06, + "loss": 0.0101, + "step": 6273 + }, + { + "epoch": 2.8544131028207462, + "grad_norm": 0.4364208443227221, + "learning_rate": 3.89715177527324e-06, + "loss": 0.0107, + "step": 6274 + }, + { + "epoch": 2.8548680618744315, + "grad_norm": 0.5727255634194993, + "learning_rate": 3.895757726137689e-06, + "loss": 0.0149, + "step": 6275 + }, + { + "epoch": 2.8553230209281164, + "grad_norm": 0.3050702500498504, + "learning_rate": 3.894363767235827e-06, + "loss": 0.0065, + "step": 6276 + }, + { + "epoch": 2.8557779799818017, + "grad_norm": 0.45008964956059117, + "learning_rate": 3.892969898681567e-06, + "loss": 0.0128, + "step": 6277 + }, + { + "epoch": 2.856232939035487, + "grad_norm": 0.512783180225894, + "learning_rate": 3.891576120588808e-06, + "loss": 0.0124, + "step": 6278 + }, + { + "epoch": 2.856687898089172, + "grad_norm": 0.28781976414263555, + "learning_rate": 3.890182433071442e-06, + "loss": 0.0055, + "step": 6279 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.36475905568543177, + "learning_rate": 3.8887888362433565e-06, + "loss": 0.0114, + "step": 6280 + }, + { + "epoch": 2.8575978161965425, + "grad_norm": 0.3528387944542802, + "learning_rate": 3.887395330218429e-06, + "loss": 0.0127, + "step": 6281 + }, + { + "epoch": 2.8580527752502274, + "grad_norm": 0.49295591560837315, + "learning_rate": 3.88600191511053e-06, + "loss": 0.026, + "step": 6282 + }, + { + "epoch": 2.8585077343039127, + "grad_norm": 0.3833227027075929, + "learning_rate": 3.8846085910335226e-06, + "loss": 0.0116, + "step": 6283 + }, + { + "epoch": 2.858962693357598, + "grad_norm": 0.3048850317680643, + "learning_rate": 3.883215358101266e-06, + "loss": 0.0103, + "step": 6284 + }, + { + "epoch": 2.859417652411283, + "grad_norm": 0.3343925798847499, + "learning_rate": 3.881822216427607e-06, + "loss": 0.0057, + "step": 6285 + }, + { + "epoch": 2.859872611464968, + "grad_norm": 0.4629523464228457, + "learning_rate": 3.880429166126385e-06, + "loss": 0.0157, + "step": 6286 + }, + { + "epoch": 2.8603275705186535, + "grad_norm": 0.5882468591085122, + "learning_rate": 3.879036207311438e-06, + "loss": 0.011, + "step": 6287 + }, + { + "epoch": 2.8607825295723384, + "grad_norm": 0.35458341546101474, + "learning_rate": 3.87764334009659e-06, + "loss": 0.0163, + "step": 6288 + }, + { + "epoch": 2.8612374886260237, + "grad_norm": 0.2276758875960399, + "learning_rate": 3.876250564595658e-06, + "loss": 0.005, + "step": 6289 + }, + { + "epoch": 2.861692447679709, + "grad_norm": 0.29770107756072245, + "learning_rate": 3.874857880922453e-06, + "loss": 0.0064, + "step": 6290 + }, + { + "epoch": 2.862147406733394, + "grad_norm": 0.41935572575138813, + "learning_rate": 3.873465289190785e-06, + "loss": 0.0092, + "step": 6291 + }, + { + "epoch": 2.862602365787079, + "grad_norm": 0.3591339221779853, + "learning_rate": 3.872072789514444e-06, + "loss": 0.0071, + "step": 6292 + }, + { + "epoch": 2.8630573248407645, + "grad_norm": 0.4375891096587897, + "learning_rate": 3.870680382007223e-06, + "loss": 0.0113, + "step": 6293 + }, + { + "epoch": 2.8635122838944493, + "grad_norm": 0.3684019269047062, + "learning_rate": 3.869288066782898e-06, + "loss": 0.0117, + "step": 6294 + }, + { + "epoch": 2.8639672429481347, + "grad_norm": 0.40391472870105816, + "learning_rate": 3.867895843955249e-06, + "loss": 0.0065, + "step": 6295 + }, + { + "epoch": 2.86442220200182, + "grad_norm": 0.3903734654988649, + "learning_rate": 3.866503713638037e-06, + "loss": 0.0077, + "step": 6296 + }, + { + "epoch": 2.864877161055505, + "grad_norm": 0.46125587147120434, + "learning_rate": 3.8651116759450225e-06, + "loss": 0.0154, + "step": 6297 + }, + { + "epoch": 2.86533212010919, + "grad_norm": 0.5955787823821846, + "learning_rate": 3.863719730989958e-06, + "loss": 0.0172, + "step": 6298 + }, + { + "epoch": 2.8657870791628755, + "grad_norm": 0.366758042196921, + "learning_rate": 3.862327878886585e-06, + "loss": 0.009, + "step": 6299 + }, + { + "epoch": 2.8662420382165603, + "grad_norm": 0.5224307717050332, + "learning_rate": 3.860936119748642e-06, + "loss": 0.0194, + "step": 6300 + }, + { + "epoch": 2.8666969972702456, + "grad_norm": 0.40543025280630457, + "learning_rate": 3.859544453689853e-06, + "loss": 0.0133, + "step": 6301 + }, + { + "epoch": 2.867151956323931, + "grad_norm": 0.36560351191576285, + "learning_rate": 3.858152880823943e-06, + "loss": 0.0094, + "step": 6302 + }, + { + "epoch": 2.867606915377616, + "grad_norm": 0.4582924402202266, + "learning_rate": 3.856761401264621e-06, + "loss": 0.0232, + "step": 6303 + }, + { + "epoch": 2.868061874431301, + "grad_norm": 0.29322164088269265, + "learning_rate": 3.8553700151255935e-06, + "loss": 0.0057, + "step": 6304 + }, + { + "epoch": 2.8685168334849864, + "grad_norm": 0.391556775245077, + "learning_rate": 3.853978722520562e-06, + "loss": 0.014, + "step": 6305 + }, + { + "epoch": 2.8689717925386713, + "grad_norm": 0.3714143208914766, + "learning_rate": 3.8525875235632125e-06, + "loss": 0.011, + "step": 6306 + }, + { + "epoch": 2.8694267515923566, + "grad_norm": 0.3966665180003109, + "learning_rate": 3.85119641836723e-06, + "loss": 0.0178, + "step": 6307 + }, + { + "epoch": 2.869881710646042, + "grad_norm": 0.2592938059452948, + "learning_rate": 3.849805407046288e-06, + "loss": 0.0049, + "step": 6308 + }, + { + "epoch": 2.870336669699727, + "grad_norm": 0.5038771572877041, + "learning_rate": 3.848414489714054e-06, + "loss": 0.0157, + "step": 6309 + }, + { + "epoch": 2.870791628753412, + "grad_norm": 0.4286736006088806, + "learning_rate": 3.847023666484187e-06, + "loss": 0.0202, + "step": 6310 + }, + { + "epoch": 2.8712465878070974, + "grad_norm": 0.47034164071342965, + "learning_rate": 3.8456329374703375e-06, + "loss": 0.0179, + "step": 6311 + }, + { + "epoch": 2.8717015468607823, + "grad_norm": 0.34124052176861547, + "learning_rate": 3.844242302786153e-06, + "loss": 0.0087, + "step": 6312 + }, + { + "epoch": 2.8721565059144676, + "grad_norm": 0.3814836081503508, + "learning_rate": 3.8428517625452675e-06, + "loss": 0.0067, + "step": 6313 + }, + { + "epoch": 2.872611464968153, + "grad_norm": 0.34100076799914025, + "learning_rate": 3.841461316861312e-06, + "loss": 0.0105, + "step": 6314 + }, + { + "epoch": 2.8730664240218378, + "grad_norm": 0.6781933350158105, + "learning_rate": 3.840070965847904e-06, + "loss": 0.0182, + "step": 6315 + }, + { + "epoch": 2.873521383075523, + "grad_norm": 0.4155545550670509, + "learning_rate": 3.83868070961866e-06, + "loss": 0.0101, + "step": 6316 + }, + { + "epoch": 2.8739763421292084, + "grad_norm": 0.4086492845096628, + "learning_rate": 3.837290548287183e-06, + "loss": 0.0127, + "step": 6317 + }, + { + "epoch": 2.8744313011828937, + "grad_norm": 0.414005074607464, + "learning_rate": 3.83590048196707e-06, + "loss": 0.0216, + "step": 6318 + }, + { + "epoch": 2.8748862602365786, + "grad_norm": 0.3042880022208222, + "learning_rate": 3.834510510771914e-06, + "loss": 0.0085, + "step": 6319 + }, + { + "epoch": 2.875341219290264, + "grad_norm": 0.40440982052288527, + "learning_rate": 3.833120634815296e-06, + "loss": 0.019, + "step": 6320 + }, + { + "epoch": 2.875796178343949, + "grad_norm": 0.26802996831161435, + "learning_rate": 3.831730854210791e-06, + "loss": 0.0054, + "step": 6321 + }, + { + "epoch": 2.876251137397634, + "grad_norm": 0.7323334230102913, + "learning_rate": 3.830341169071965e-06, + "loss": 0.0242, + "step": 6322 + }, + { + "epoch": 2.8767060964513194, + "grad_norm": 0.4133971361129842, + "learning_rate": 3.828951579512374e-06, + "loss": 0.013, + "step": 6323 + }, + { + "epoch": 2.8771610555050047, + "grad_norm": 0.35954978352833555, + "learning_rate": 3.827562085645574e-06, + "loss": 0.0161, + "step": 6324 + }, + { + "epoch": 2.87761601455869, + "grad_norm": 0.30090268898226913, + "learning_rate": 3.826172687585104e-06, + "loss": 0.0094, + "step": 6325 + }, + { + "epoch": 2.878070973612375, + "grad_norm": 0.5403862929387374, + "learning_rate": 3.824783385444501e-06, + "loss": 0.0178, + "step": 6326 + }, + { + "epoch": 2.87852593266606, + "grad_norm": 0.3140167732550238, + "learning_rate": 3.8233941793372934e-06, + "loss": 0.0081, + "step": 6327 + }, + { + "epoch": 2.8789808917197455, + "grad_norm": 0.39000912243012076, + "learning_rate": 3.822005069377e-06, + "loss": 0.0124, + "step": 6328 + }, + { + "epoch": 2.8794358507734303, + "grad_norm": 0.3286182522237254, + "learning_rate": 3.820616055677132e-06, + "loss": 0.0082, + "step": 6329 + }, + { + "epoch": 2.8798908098271156, + "grad_norm": 0.3373701375250453, + "learning_rate": 3.819227138351194e-06, + "loss": 0.0096, + "step": 6330 + }, + { + "epoch": 2.880345768880801, + "grad_norm": 0.5454728178026315, + "learning_rate": 3.817838317512683e-06, + "loss": 0.0184, + "step": 6331 + }, + { + "epoch": 2.880800727934486, + "grad_norm": 0.48750702796293294, + "learning_rate": 3.8164495932750835e-06, + "loss": 0.0143, + "step": 6332 + }, + { + "epoch": 2.881255686988171, + "grad_norm": 0.3748304025492812, + "learning_rate": 3.81506096575188e-06, + "loss": 0.013, + "step": 6333 + }, + { + "epoch": 2.8817106460418564, + "grad_norm": 0.2617319646306041, + "learning_rate": 3.813672435056544e-06, + "loss": 0.0074, + "step": 6334 + }, + { + "epoch": 2.8821656050955413, + "grad_norm": 0.3476047473437665, + "learning_rate": 3.812284001302538e-06, + "loss": 0.0118, + "step": 6335 + }, + { + "epoch": 2.8826205641492266, + "grad_norm": 0.3286218231881052, + "learning_rate": 3.8108956646033214e-06, + "loss": 0.0083, + "step": 6336 + }, + { + "epoch": 2.883075523202912, + "grad_norm": 0.4079602279689255, + "learning_rate": 3.809507425072339e-06, + "loss": 0.0124, + "step": 6337 + }, + { + "epoch": 2.883530482256597, + "grad_norm": 0.3473277015907147, + "learning_rate": 3.808119282823035e-06, + "loss": 0.0078, + "step": 6338 + }, + { + "epoch": 2.883985441310282, + "grad_norm": 0.6991876672781839, + "learning_rate": 3.8067312379688393e-06, + "loss": 0.0194, + "step": 6339 + }, + { + "epoch": 2.8844404003639674, + "grad_norm": 0.22675332639990367, + "learning_rate": 3.8053432906231786e-06, + "loss": 0.0052, + "step": 6340 + }, + { + "epoch": 2.8848953594176523, + "grad_norm": 0.49560045249468987, + "learning_rate": 3.8039554408994707e-06, + "loss": 0.015, + "step": 6341 + }, + { + "epoch": 2.8853503184713376, + "grad_norm": 0.36076854600275526, + "learning_rate": 3.802567688911121e-06, + "loss": 0.0103, + "step": 6342 + }, + { + "epoch": 2.885805277525023, + "grad_norm": 0.40202946834776543, + "learning_rate": 3.801180034771534e-06, + "loss": 0.0113, + "step": 6343 + }, + { + "epoch": 2.886260236578708, + "grad_norm": 0.3789777648716472, + "learning_rate": 3.7997924785940992e-06, + "loss": 0.0128, + "step": 6344 + }, + { + "epoch": 2.886715195632393, + "grad_norm": 0.503523378489957, + "learning_rate": 3.798405020492204e-06, + "loss": 0.0177, + "step": 6345 + }, + { + "epoch": 2.8871701546860784, + "grad_norm": 0.45704213837144136, + "learning_rate": 3.7970176605792227e-06, + "loss": 0.0218, + "step": 6346 + }, + { + "epoch": 2.8876251137397633, + "grad_norm": 0.3884640190831784, + "learning_rate": 3.7956303989685263e-06, + "loss": 0.0132, + "step": 6347 + }, + { + "epoch": 2.8880800727934486, + "grad_norm": 0.3950973283624611, + "learning_rate": 3.7942432357734756e-06, + "loss": 0.0165, + "step": 6348 + }, + { + "epoch": 2.888535031847134, + "grad_norm": 0.36843284102790197, + "learning_rate": 3.7928561711074212e-06, + "loss": 0.005, + "step": 6349 + }, + { + "epoch": 2.8889899909008188, + "grad_norm": 0.40221758497654353, + "learning_rate": 3.791469205083711e-06, + "loss": 0.0147, + "step": 6350 + }, + { + "epoch": 2.889444949954504, + "grad_norm": 0.4207977527715021, + "learning_rate": 3.790082337815678e-06, + "loss": 0.0129, + "step": 6351 + }, + { + "epoch": 2.8898999090081894, + "grad_norm": 0.3079913969975454, + "learning_rate": 3.788695569416654e-06, + "loss": 0.0068, + "step": 6352 + }, + { + "epoch": 2.8903548680618742, + "grad_norm": 0.4885305823729606, + "learning_rate": 3.7873088999999553e-06, + "loss": 0.0181, + "step": 6353 + }, + { + "epoch": 2.8908098271155596, + "grad_norm": 0.3543962167794279, + "learning_rate": 3.785922329678898e-06, + "loss": 0.0098, + "step": 6354 + }, + { + "epoch": 2.891264786169245, + "grad_norm": 0.4228251555050592, + "learning_rate": 3.784535858566786e-06, + "loss": 0.0214, + "step": 6355 + }, + { + "epoch": 2.8917197452229297, + "grad_norm": 0.33124874600109977, + "learning_rate": 3.7831494867769134e-06, + "loss": 0.0071, + "step": 6356 + }, + { + "epoch": 2.892174704276615, + "grad_norm": 0.41657557881618507, + "learning_rate": 3.7817632144225713e-06, + "loss": 0.0113, + "step": 6357 + }, + { + "epoch": 2.8926296633303004, + "grad_norm": 0.3291065357445098, + "learning_rate": 3.780377041617037e-06, + "loss": 0.0062, + "step": 6358 + }, + { + "epoch": 2.8930846223839852, + "grad_norm": 0.3615507370786119, + "learning_rate": 3.7789909684735825e-06, + "loss": 0.0096, + "step": 6359 + }, + { + "epoch": 2.8935395814376705, + "grad_norm": 0.4012291148091594, + "learning_rate": 3.7776049951054718e-06, + "loss": 0.0084, + "step": 6360 + }, + { + "epoch": 2.893994540491356, + "grad_norm": 0.45028283597993485, + "learning_rate": 3.7762191216259613e-06, + "loss": 0.0134, + "step": 6361 + }, + { + "epoch": 2.8944494995450407, + "grad_norm": 0.3255175306272122, + "learning_rate": 3.7748333481482974e-06, + "loss": 0.0125, + "step": 6362 + }, + { + "epoch": 2.894904458598726, + "grad_norm": 0.31382961258144904, + "learning_rate": 3.77344767478572e-06, + "loss": 0.0078, + "step": 6363 + }, + { + "epoch": 2.8953594176524113, + "grad_norm": 0.21500250648441935, + "learning_rate": 3.7720621016514593e-06, + "loss": 0.0027, + "step": 6364 + }, + { + "epoch": 2.895814376706096, + "grad_norm": 0.521065049666442, + "learning_rate": 3.7706766288587386e-06, + "loss": 0.0173, + "step": 6365 + }, + { + "epoch": 2.8962693357597815, + "grad_norm": 0.319897868405258, + "learning_rate": 3.7692912565207716e-06, + "loss": 0.0071, + "step": 6366 + }, + { + "epoch": 2.896724294813467, + "grad_norm": 0.4516402028839931, + "learning_rate": 3.7679059847507643e-06, + "loss": 0.013, + "step": 6367 + }, + { + "epoch": 2.8971792538671517, + "grad_norm": 0.3595984074926371, + "learning_rate": 3.7665208136619176e-06, + "loss": 0.0127, + "step": 6368 + }, + { + "epoch": 2.897634212920837, + "grad_norm": 0.4642847466205341, + "learning_rate": 3.7651357433674187e-06, + "loss": 0.0159, + "step": 6369 + }, + { + "epoch": 2.8980891719745223, + "grad_norm": 0.33327002165436054, + "learning_rate": 3.763750773980451e-06, + "loss": 0.0067, + "step": 6370 + }, + { + "epoch": 2.8985441310282076, + "grad_norm": 0.4191259812612057, + "learning_rate": 3.762365905614187e-06, + "loss": 0.0221, + "step": 6371 + }, + { + "epoch": 2.8989990900818925, + "grad_norm": 0.4272225721959623, + "learning_rate": 3.760981138381793e-06, + "loss": 0.0135, + "step": 6372 + }, + { + "epoch": 2.899454049135578, + "grad_norm": 0.29475046938215244, + "learning_rate": 3.7595964723964236e-06, + "loss": 0.0068, + "step": 6373 + }, + { + "epoch": 2.899909008189263, + "grad_norm": 0.5355429451634685, + "learning_rate": 3.7582119077712277e-06, + "loss": 0.011, + "step": 6374 + }, + { + "epoch": 2.900363967242948, + "grad_norm": 0.4908348557141388, + "learning_rate": 3.7568274446193486e-06, + "loss": 0.0158, + "step": 6375 + }, + { + "epoch": 2.9008189262966333, + "grad_norm": 0.39466071047943796, + "learning_rate": 3.7554430830539164e-06, + "loss": 0.0124, + "step": 6376 + }, + { + "epoch": 2.9012738853503186, + "grad_norm": 0.2776723609453031, + "learning_rate": 3.7540588231880557e-06, + "loss": 0.0046, + "step": 6377 + }, + { + "epoch": 2.901728844404004, + "grad_norm": 0.7685620841223039, + "learning_rate": 3.75267466513488e-06, + "loss": 0.0245, + "step": 6378 + }, + { + "epoch": 2.902183803457689, + "grad_norm": 0.5251658551775495, + "learning_rate": 3.7512906090074997e-06, + "loss": 0.0172, + "step": 6379 + }, + { + "epoch": 2.902638762511374, + "grad_norm": 0.3884955696657931, + "learning_rate": 3.74990665491901e-06, + "loss": 0.0095, + "step": 6380 + }, + { + "epoch": 2.9030937215650594, + "grad_norm": 0.271630438033093, + "learning_rate": 3.748522802982502e-06, + "loss": 0.007, + "step": 6381 + }, + { + "epoch": 2.9035486806187443, + "grad_norm": 0.378895331137123, + "learning_rate": 3.747139053311061e-06, + "loss": 0.0141, + "step": 6382 + }, + { + "epoch": 2.9040036396724296, + "grad_norm": 0.3110405139589322, + "learning_rate": 3.745755406017758e-06, + "loss": 0.0095, + "step": 6383 + }, + { + "epoch": 2.904458598726115, + "grad_norm": 0.3398426847127566, + "learning_rate": 3.74437186121566e-06, + "loss": 0.0092, + "step": 6384 + }, + { + "epoch": 2.9049135577797998, + "grad_norm": 0.3518693864415104, + "learning_rate": 3.7429884190178224e-06, + "loss": 0.0096, + "step": 6385 + }, + { + "epoch": 2.905368516833485, + "grad_norm": 0.5256974818520146, + "learning_rate": 3.741605079537295e-06, + "loss": 0.0167, + "step": 6386 + }, + { + "epoch": 2.9058234758871704, + "grad_norm": 0.4092907395202486, + "learning_rate": 3.740221842887117e-06, + "loss": 0.0108, + "step": 6387 + }, + { + "epoch": 2.9062784349408552, + "grad_norm": 0.34404946823911237, + "learning_rate": 3.7388387091803204e-06, + "loss": 0.0113, + "step": 6388 + }, + { + "epoch": 2.9067333939945406, + "grad_norm": 0.28834987095222475, + "learning_rate": 3.73745567852993e-06, + "loss": 0.0107, + "step": 6389 + }, + { + "epoch": 2.907188353048226, + "grad_norm": 0.3095882977232282, + "learning_rate": 3.73607275104896e-06, + "loss": 0.0094, + "step": 6390 + }, + { + "epoch": 2.9076433121019107, + "grad_norm": 0.3807601767809069, + "learning_rate": 3.7346899268504174e-06, + "loss": 0.012, + "step": 6391 + }, + { + "epoch": 2.908098271155596, + "grad_norm": 0.3260098412889043, + "learning_rate": 3.7333072060472987e-06, + "loss": 0.009, + "step": 6392 + }, + { + "epoch": 2.9085532302092814, + "grad_norm": 0.5427951597890734, + "learning_rate": 3.7319245887525956e-06, + "loss": 0.0146, + "step": 6393 + }, + { + "epoch": 2.9090081892629662, + "grad_norm": 0.39000578811549785, + "learning_rate": 3.73054207507929e-06, + "loss": 0.0105, + "step": 6394 + }, + { + "epoch": 2.9094631483166515, + "grad_norm": 0.4186877479062313, + "learning_rate": 3.729159665140348e-06, + "loss": 0.0117, + "step": 6395 + }, + { + "epoch": 2.909918107370337, + "grad_norm": 0.4922890078770253, + "learning_rate": 3.7277773590487436e-06, + "loss": 0.0146, + "step": 6396 + }, + { + "epoch": 2.9103730664240217, + "grad_norm": 0.3147800308833414, + "learning_rate": 3.726395156917428e-06, + "loss": 0.0125, + "step": 6397 + }, + { + "epoch": 2.910828025477707, + "grad_norm": 0.2879171961223307, + "learning_rate": 3.7250130588593467e-06, + "loss": 0.0086, + "step": 6398 + }, + { + "epoch": 2.9112829845313923, + "grad_norm": 0.3067206030759571, + "learning_rate": 3.723631064987443e-06, + "loss": 0.0115, + "step": 6399 + }, + { + "epoch": 2.911737943585077, + "grad_norm": 0.3049912993314163, + "learning_rate": 3.722249175414643e-06, + "loss": 0.0103, + "step": 6400 + }, + { + "epoch": 2.9121929026387625, + "grad_norm": 0.4957445043143095, + "learning_rate": 3.7208673902538705e-06, + "loss": 0.0124, + "step": 6401 + }, + { + "epoch": 2.912647861692448, + "grad_norm": 0.41912048390994716, + "learning_rate": 3.7194857096180366e-06, + "loss": 0.0157, + "step": 6402 + }, + { + "epoch": 2.9131028207461327, + "grad_norm": 0.3220786981792336, + "learning_rate": 3.7181041336200485e-06, + "loss": 0.0055, + "step": 6403 + }, + { + "epoch": 2.913557779799818, + "grad_norm": 0.3922854078706956, + "learning_rate": 3.7167226623728035e-06, + "loss": 0.0125, + "step": 6404 + }, + { + "epoch": 2.9140127388535033, + "grad_norm": 0.41602097384647796, + "learning_rate": 3.7153412959891856e-06, + "loss": 0.0142, + "step": 6405 + }, + { + "epoch": 2.914467697907188, + "grad_norm": 0.40619529249645336, + "learning_rate": 3.713960034582077e-06, + "loss": 0.0132, + "step": 6406 + }, + { + "epoch": 2.9149226569608735, + "grad_norm": 0.2815417818515389, + "learning_rate": 3.712578878264345e-06, + "loss": 0.0094, + "step": 6407 + }, + { + "epoch": 2.915377616014559, + "grad_norm": 0.24659431801693033, + "learning_rate": 3.7111978271488546e-06, + "loss": 0.0063, + "step": 6408 + }, + { + "epoch": 2.9158325750682437, + "grad_norm": 0.23984002812085403, + "learning_rate": 3.709816881348456e-06, + "loss": 0.0055, + "step": 6409 + }, + { + "epoch": 2.916287534121929, + "grad_norm": 0.46157485439988916, + "learning_rate": 3.7084360409759956e-06, + "loss": 0.0106, + "step": 6410 + }, + { + "epoch": 2.9167424931756143, + "grad_norm": 0.4257357090766481, + "learning_rate": 3.7070553061443106e-06, + "loss": 0.0177, + "step": 6411 + }, + { + "epoch": 2.917197452229299, + "grad_norm": 0.4430704949620385, + "learning_rate": 3.705674676966226e-06, + "loss": 0.0082, + "step": 6412 + }, + { + "epoch": 2.9176524112829845, + "grad_norm": 0.43743754168504323, + "learning_rate": 3.7042941535545628e-06, + "loss": 0.0095, + "step": 6413 + }, + { + "epoch": 2.91810737033667, + "grad_norm": 0.2983734567492093, + "learning_rate": 3.702913736022129e-06, + "loss": 0.0071, + "step": 6414 + }, + { + "epoch": 2.9185623293903546, + "grad_norm": 0.282368325605843, + "learning_rate": 3.701533424481728e-06, + "loss": 0.0061, + "step": 6415 + }, + { + "epoch": 2.91901728844404, + "grad_norm": 0.2812285450319493, + "learning_rate": 3.7001532190461497e-06, + "loss": 0.0035, + "step": 6416 + }, + { + "epoch": 2.9194722474977253, + "grad_norm": 0.2779456517793231, + "learning_rate": 3.698773119828182e-06, + "loss": 0.0096, + "step": 6417 + }, + { + "epoch": 2.91992720655141, + "grad_norm": 0.36421628202503153, + "learning_rate": 3.6973931269405994e-06, + "loss": 0.0105, + "step": 6418 + }, + { + "epoch": 2.9203821656050954, + "grad_norm": 0.30592615966929854, + "learning_rate": 3.696013240496166e-06, + "loss": 0.0092, + "step": 6419 + }, + { + "epoch": 2.9208371246587808, + "grad_norm": 0.2455435362824365, + "learning_rate": 3.694633460607644e-06, + "loss": 0.0056, + "step": 6420 + }, + { + "epoch": 2.9212920837124656, + "grad_norm": 0.4095843281421876, + "learning_rate": 3.693253787387779e-06, + "loss": 0.0122, + "step": 6421 + }, + { + "epoch": 2.921747042766151, + "grad_norm": 0.37739285852106713, + "learning_rate": 3.691874220949314e-06, + "loss": 0.01, + "step": 6422 + }, + { + "epoch": 2.9222020018198362, + "grad_norm": 0.35981876257094425, + "learning_rate": 3.690494761404979e-06, + "loss": 0.0068, + "step": 6423 + }, + { + "epoch": 2.922656960873521, + "grad_norm": 0.38381891847358335, + "learning_rate": 3.6891154088674985e-06, + "loss": 0.0096, + "step": 6424 + }, + { + "epoch": 2.9231119199272064, + "grad_norm": 0.5358702715105391, + "learning_rate": 3.687736163449589e-06, + "loss": 0.0166, + "step": 6425 + }, + { + "epoch": 2.9235668789808917, + "grad_norm": 0.3463906828200289, + "learning_rate": 3.6863570252639522e-06, + "loss": 0.0092, + "step": 6426 + }, + { + "epoch": 2.924021838034577, + "grad_norm": 0.3390637291243101, + "learning_rate": 3.6849779944232885e-06, + "loss": 0.0069, + "step": 6427 + }, + { + "epoch": 2.924476797088262, + "grad_norm": 0.4496711440316595, + "learning_rate": 3.683599071040283e-06, + "loss": 0.0182, + "step": 6428 + }, + { + "epoch": 2.9249317561419472, + "grad_norm": 0.40106220012278543, + "learning_rate": 3.6822202552276176e-06, + "loss": 0.0118, + "step": 6429 + }, + { + "epoch": 2.9253867151956325, + "grad_norm": 0.41005481151564804, + "learning_rate": 3.6808415470979602e-06, + "loss": 0.0131, + "step": 6430 + }, + { + "epoch": 2.9258416742493174, + "grad_norm": 0.27832402660490174, + "learning_rate": 3.679462946763975e-06, + "loss": 0.0074, + "step": 6431 + }, + { + "epoch": 2.9262966333030027, + "grad_norm": 0.41817783001301695, + "learning_rate": 3.678084454338316e-06, + "loss": 0.0076, + "step": 6432 + }, + { + "epoch": 2.926751592356688, + "grad_norm": 0.40978520701397403, + "learning_rate": 3.6767060699336253e-06, + "loss": 0.0152, + "step": 6433 + }, + { + "epoch": 2.9272065514103733, + "grad_norm": 0.37563857566681513, + "learning_rate": 3.6753277936625374e-06, + "loss": 0.0055, + "step": 6434 + }, + { + "epoch": 2.927661510464058, + "grad_norm": 0.4103564455898853, + "learning_rate": 3.6739496256376816e-06, + "loss": 0.0069, + "step": 6435 + }, + { + "epoch": 2.9281164695177435, + "grad_norm": 0.38379673912392054, + "learning_rate": 3.672571565971672e-06, + "loss": 0.014, + "step": 6436 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.41343362721512666, + "learning_rate": 3.671193614777119e-06, + "loss": 0.0178, + "step": 6437 + }, + { + "epoch": 2.9290263876251137, + "grad_norm": 0.5279136335701177, + "learning_rate": 3.669815772166625e-06, + "loss": 0.0163, + "step": 6438 + }, + { + "epoch": 2.929481346678799, + "grad_norm": 0.715206835259677, + "learning_rate": 3.6684380382527784e-06, + "loss": 0.0252, + "step": 6439 + }, + { + "epoch": 2.9299363057324843, + "grad_norm": 0.4699215733523937, + "learning_rate": 3.6670604131481625e-06, + "loss": 0.0181, + "step": 6440 + }, + { + "epoch": 2.930391264786169, + "grad_norm": 0.43942198974305297, + "learning_rate": 3.665682896965349e-06, + "loss": 0.0204, + "step": 6441 + }, + { + "epoch": 2.9308462238398545, + "grad_norm": 0.3618401829398822, + "learning_rate": 3.664305489816905e-06, + "loss": 0.0114, + "step": 6442 + }, + { + "epoch": 2.93130118289354, + "grad_norm": 0.5945058832165803, + "learning_rate": 3.662928191815383e-06, + "loss": 0.0215, + "step": 6443 + }, + { + "epoch": 2.9317561419472247, + "grad_norm": 0.22962348680888112, + "learning_rate": 3.6615510030733302e-06, + "loss": 0.0076, + "step": 6444 + }, + { + "epoch": 2.93221110100091, + "grad_norm": 0.3394510298500849, + "learning_rate": 3.660173923703287e-06, + "loss": 0.0122, + "step": 6445 + }, + { + "epoch": 2.9326660600545953, + "grad_norm": 0.5258261357226683, + "learning_rate": 3.6587969538177793e-06, + "loss": 0.0169, + "step": 6446 + }, + { + "epoch": 2.93312101910828, + "grad_norm": 0.5012672727554425, + "learning_rate": 3.657420093529328e-06, + "loss": 0.0167, + "step": 6447 + }, + { + "epoch": 2.9335759781619655, + "grad_norm": 0.2731143247747597, + "learning_rate": 3.656043342950443e-06, + "loss": 0.0067, + "step": 6448 + }, + { + "epoch": 2.934030937215651, + "grad_norm": 0.41294977607930694, + "learning_rate": 3.6546667021936284e-06, + "loss": 0.0115, + "step": 6449 + }, + { + "epoch": 2.9344858962693356, + "grad_norm": 0.6537779837928394, + "learning_rate": 3.6532901713713742e-06, + "loss": 0.0189, + "step": 6450 + }, + { + "epoch": 2.934940855323021, + "grad_norm": 0.3978700759394602, + "learning_rate": 3.6519137505961636e-06, + "loss": 0.0115, + "step": 6451 + }, + { + "epoch": 2.9353958143767063, + "grad_norm": 0.3942439895320119, + "learning_rate": 3.650537439980476e-06, + "loss": 0.0144, + "step": 6452 + }, + { + "epoch": 2.935850773430391, + "grad_norm": 0.6015376869042698, + "learning_rate": 3.649161239636774e-06, + "loss": 0.0187, + "step": 6453 + }, + { + "epoch": 2.9363057324840764, + "grad_norm": 0.3245374606574856, + "learning_rate": 3.647785149677516e-06, + "loss": 0.0117, + "step": 6454 + }, + { + "epoch": 2.9367606915377618, + "grad_norm": 0.5037273302868972, + "learning_rate": 3.6464091702151484e-06, + "loss": 0.026, + "step": 6455 + }, + { + "epoch": 2.9372156505914466, + "grad_norm": 0.5511274356127235, + "learning_rate": 3.645033301362111e-06, + "loss": 0.0178, + "step": 6456 + }, + { + "epoch": 2.937670609645132, + "grad_norm": 0.42975626401719114, + "learning_rate": 3.643657543230832e-06, + "loss": 0.0127, + "step": 6457 + }, + { + "epoch": 2.9381255686988172, + "grad_norm": 0.48340126395934097, + "learning_rate": 3.6422818959337326e-06, + "loss": 0.0229, + "step": 6458 + }, + { + "epoch": 2.938580527752502, + "grad_norm": 0.3623128074406011, + "learning_rate": 3.640906359583228e-06, + "loss": 0.0094, + "step": 6459 + }, + { + "epoch": 2.9390354868061874, + "grad_norm": 0.3466250150212331, + "learning_rate": 3.6395309342917163e-06, + "loss": 0.0063, + "step": 6460 + }, + { + "epoch": 2.9394904458598727, + "grad_norm": 0.6088263100387632, + "learning_rate": 3.638155620171594e-06, + "loss": 0.0186, + "step": 6461 + }, + { + "epoch": 2.9399454049135576, + "grad_norm": 0.5086846303566309, + "learning_rate": 3.6367804173352434e-06, + "loss": 0.0161, + "step": 6462 + }, + { + "epoch": 2.940400363967243, + "grad_norm": 0.28554776742467947, + "learning_rate": 3.6354053258950416e-06, + "loss": 0.0116, + "step": 6463 + }, + { + "epoch": 2.9408553230209282, + "grad_norm": 0.33621246049434594, + "learning_rate": 3.6340303459633553e-06, + "loss": 0.0088, + "step": 6464 + }, + { + "epoch": 2.941310282074613, + "grad_norm": 0.313175095500926, + "learning_rate": 3.6326554776525357e-06, + "loss": 0.0079, + "step": 6465 + }, + { + "epoch": 2.9417652411282984, + "grad_norm": 0.42398447161072816, + "learning_rate": 3.63128072107494e-06, + "loss": 0.012, + "step": 6466 + }, + { + "epoch": 2.9422202001819837, + "grad_norm": 0.37481151861391304, + "learning_rate": 3.6299060763429016e-06, + "loss": 0.0126, + "step": 6467 + }, + { + "epoch": 2.9426751592356686, + "grad_norm": 0.3641669379794496, + "learning_rate": 3.628531543568751e-06, + "loss": 0.0175, + "step": 6468 + }, + { + "epoch": 2.943130118289354, + "grad_norm": 0.39785299336729835, + "learning_rate": 3.6271571228648108e-06, + "loss": 0.0082, + "step": 6469 + }, + { + "epoch": 2.943585077343039, + "grad_norm": 0.378413719535719, + "learning_rate": 3.6257828143433886e-06, + "loss": 0.0108, + "step": 6470 + }, + { + "epoch": 2.944040036396724, + "grad_norm": 0.5402550307213921, + "learning_rate": 3.6244086181167903e-06, + "loss": 0.0215, + "step": 6471 + }, + { + "epoch": 2.9444949954504094, + "grad_norm": 0.33770455857948545, + "learning_rate": 3.623034534297306e-06, + "loss": 0.0114, + "step": 6472 + }, + { + "epoch": 2.9449499545040947, + "grad_norm": 0.2765679731681435, + "learning_rate": 3.621660562997222e-06, + "loss": 0.0088, + "step": 6473 + }, + { + "epoch": 2.9454049135577796, + "grad_norm": 0.3523143978385394, + "learning_rate": 3.6202867043288126e-06, + "loss": 0.0081, + "step": 6474 + }, + { + "epoch": 2.945859872611465, + "grad_norm": 0.5230423166762636, + "learning_rate": 3.6189129584043426e-06, + "loss": 0.0174, + "step": 6475 + }, + { + "epoch": 2.94631483166515, + "grad_norm": 0.359405361480886, + "learning_rate": 3.6175393253360704e-06, + "loss": 0.0089, + "step": 6476 + }, + { + "epoch": 2.946769790718835, + "grad_norm": 0.3911789410996822, + "learning_rate": 3.6161658052362393e-06, + "loss": 0.0113, + "step": 6477 + }, + { + "epoch": 2.9472247497725204, + "grad_norm": 0.3744567594377753, + "learning_rate": 3.6147923982170906e-06, + "loss": 0.0124, + "step": 6478 + }, + { + "epoch": 2.9476797088262057, + "grad_norm": 0.3385001640787944, + "learning_rate": 3.6134191043908496e-06, + "loss": 0.0126, + "step": 6479 + }, + { + "epoch": 2.9481346678798905, + "grad_norm": 0.4244177104253812, + "learning_rate": 3.6120459238697387e-06, + "loss": 0.0232, + "step": 6480 + }, + { + "epoch": 2.948589626933576, + "grad_norm": 0.374756040858105, + "learning_rate": 3.610672856765968e-06, + "loss": 0.0152, + "step": 6481 + }, + { + "epoch": 2.949044585987261, + "grad_norm": 0.40130996389176443, + "learning_rate": 3.6092999031917366e-06, + "loss": 0.0057, + "step": 6482 + }, + { + "epoch": 2.9494995450409465, + "grad_norm": 0.6906072685479914, + "learning_rate": 3.607927063259237e-06, + "loss": 0.0253, + "step": 6483 + }, + { + "epoch": 2.9499545040946313, + "grad_norm": 0.41175261532553814, + "learning_rate": 3.6065543370806504e-06, + "loss": 0.007, + "step": 6484 + }, + { + "epoch": 2.9504094631483166, + "grad_norm": 0.4425109093139479, + "learning_rate": 3.605181724768152e-06, + "loss": 0.0119, + "step": 6485 + }, + { + "epoch": 2.950864422202002, + "grad_norm": 0.5125659469748413, + "learning_rate": 3.603809226433902e-06, + "loss": 0.0108, + "step": 6486 + }, + { + "epoch": 2.951319381255687, + "grad_norm": 0.21762571448281992, + "learning_rate": 3.602436842190058e-06, + "loss": 0.0061, + "step": 6487 + }, + { + "epoch": 2.951774340309372, + "grad_norm": 0.28720286244589455, + "learning_rate": 3.6010645721487648e-06, + "loss": 0.0061, + "step": 6488 + }, + { + "epoch": 2.9522292993630574, + "grad_norm": 0.2508069191871544, + "learning_rate": 3.5996924164221558e-06, + "loss": 0.0056, + "step": 6489 + }, + { + "epoch": 2.9526842584167428, + "grad_norm": 0.339259444336096, + "learning_rate": 3.5983203751223605e-06, + "loss": 0.0122, + "step": 6490 + }, + { + "epoch": 2.9531392174704276, + "grad_norm": 0.5387079386311976, + "learning_rate": 3.5969484483614923e-06, + "loss": 0.0269, + "step": 6491 + }, + { + "epoch": 2.953594176524113, + "grad_norm": 0.34316022717814354, + "learning_rate": 3.595576636251663e-06, + "loss": 0.0104, + "step": 6492 + }, + { + "epoch": 2.9540491355777982, + "grad_norm": 0.346253213252283, + "learning_rate": 3.594204938904966e-06, + "loss": 0.0095, + "step": 6493 + }, + { + "epoch": 2.954504094631483, + "grad_norm": 0.41042946731191354, + "learning_rate": 3.5928333564334937e-06, + "loss": 0.0072, + "step": 6494 + }, + { + "epoch": 2.9549590536851684, + "grad_norm": 0.3459032386002424, + "learning_rate": 3.591461888949326e-06, + "loss": 0.0133, + "step": 6495 + }, + { + "epoch": 2.9554140127388537, + "grad_norm": 0.33990718467059117, + "learning_rate": 3.590090536564531e-06, + "loss": 0.0111, + "step": 6496 + }, + { + "epoch": 2.9558689717925386, + "grad_norm": 0.42419889987542986, + "learning_rate": 3.588719299391171e-06, + "loss": 0.0206, + "step": 6497 + }, + { + "epoch": 2.956323930846224, + "grad_norm": 0.4111216623523516, + "learning_rate": 3.5873481775412957e-06, + "loss": 0.0092, + "step": 6498 + }, + { + "epoch": 2.9567788898999092, + "grad_norm": 0.438032802285073, + "learning_rate": 3.5859771711269486e-06, + "loss": 0.0139, + "step": 6499 + }, + { + "epoch": 2.957233848953594, + "grad_norm": 0.3627021964701024, + "learning_rate": 3.58460628026016e-06, + "loss": 0.011, + "step": 6500 + }, + { + "epoch": 2.9576888080072794, + "grad_norm": 0.5433206072238611, + "learning_rate": 3.583235505052955e-06, + "loss": 0.023, + "step": 6501 + }, + { + "epoch": 2.9581437670609647, + "grad_norm": 0.36726213720614953, + "learning_rate": 3.581864845617348e-06, + "loss": 0.0062, + "step": 6502 + }, + { + "epoch": 2.9585987261146496, + "grad_norm": 0.31629345606568876, + "learning_rate": 3.5804943020653403e-06, + "loss": 0.0124, + "step": 6503 + }, + { + "epoch": 2.959053685168335, + "grad_norm": 0.38915340933054543, + "learning_rate": 3.579123874508927e-06, + "loss": 0.0145, + "step": 6504 + }, + { + "epoch": 2.95950864422202, + "grad_norm": 0.2957833994150592, + "learning_rate": 3.5777535630600962e-06, + "loss": 0.0102, + "step": 6505 + }, + { + "epoch": 2.959963603275705, + "grad_norm": 0.32441200112227103, + "learning_rate": 3.57638336783082e-06, + "loss": 0.0116, + "step": 6506 + }, + { + "epoch": 2.9604185623293904, + "grad_norm": 0.31312273494997284, + "learning_rate": 3.575013288933065e-06, + "loss": 0.0113, + "step": 6507 + }, + { + "epoch": 2.9608735213830757, + "grad_norm": 0.42217355317451716, + "learning_rate": 3.5736433264787903e-06, + "loss": 0.011, + "step": 6508 + }, + { + "epoch": 2.9613284804367606, + "grad_norm": 0.43660776250825045, + "learning_rate": 3.572273480579941e-06, + "loss": 0.0203, + "step": 6509 + }, + { + "epoch": 2.961783439490446, + "grad_norm": 0.25376051638608066, + "learning_rate": 3.5709037513484555e-06, + "loss": 0.0077, + "step": 6510 + }, + { + "epoch": 2.962238398544131, + "grad_norm": 0.5652030775647546, + "learning_rate": 3.569534138896262e-06, + "loss": 0.0273, + "step": 6511 + }, + { + "epoch": 2.962693357597816, + "grad_norm": 0.5268701970464702, + "learning_rate": 3.568164643335279e-06, + "loss": 0.0228, + "step": 6512 + }, + { + "epoch": 2.9631483166515014, + "grad_norm": 0.4982147251025876, + "learning_rate": 3.566795264777414e-06, + "loss": 0.0125, + "step": 6513 + }, + { + "epoch": 2.9636032757051867, + "grad_norm": 0.6654372766865096, + "learning_rate": 3.565426003334567e-06, + "loss": 0.0237, + "step": 6514 + }, + { + "epoch": 2.9640582347588715, + "grad_norm": 0.32738325332042234, + "learning_rate": 3.564056859118631e-06, + "loss": 0.0102, + "step": 6515 + }, + { + "epoch": 2.964513193812557, + "grad_norm": 0.5996042803569864, + "learning_rate": 3.5626878322414824e-06, + "loss": 0.0159, + "step": 6516 + }, + { + "epoch": 2.964968152866242, + "grad_norm": 0.3353125547507947, + "learning_rate": 3.5613189228149947e-06, + "loss": 0.0146, + "step": 6517 + }, + { + "epoch": 2.965423111919927, + "grad_norm": 0.5474646876156293, + "learning_rate": 3.5599501309510252e-06, + "loss": 0.0153, + "step": 6518 + }, + { + "epoch": 2.9658780709736123, + "grad_norm": 0.435343334679233, + "learning_rate": 3.5585814567614307e-06, + "loss": 0.0133, + "step": 6519 + }, + { + "epoch": 2.9663330300272976, + "grad_norm": 0.49327260361722014, + "learning_rate": 3.557212900358048e-06, + "loss": 0.0217, + "step": 6520 + }, + { + "epoch": 2.9667879890809825, + "grad_norm": 0.4450911084721028, + "learning_rate": 3.555844461852711e-06, + "loss": 0.0161, + "step": 6521 + }, + { + "epoch": 2.967242948134668, + "grad_norm": 0.3584541153371727, + "learning_rate": 3.5544761413572444e-06, + "loss": 0.009, + "step": 6522 + }, + { + "epoch": 2.967697907188353, + "grad_norm": 0.3124643595442405, + "learning_rate": 3.5531079389834587e-06, + "loss": 0.0086, + "step": 6523 + }, + { + "epoch": 2.968152866242038, + "grad_norm": 0.32194436616331723, + "learning_rate": 3.5517398548431592e-06, + "loss": 0.0099, + "step": 6524 + }, + { + "epoch": 2.9686078252957233, + "grad_norm": 0.34931698332817945, + "learning_rate": 3.5503718890481376e-06, + "loss": 0.0091, + "step": 6525 + }, + { + "epoch": 2.9690627843494086, + "grad_norm": 0.37815788525833466, + "learning_rate": 3.5490040417101795e-06, + "loss": 0.0125, + "step": 6526 + }, + { + "epoch": 2.9695177434030935, + "grad_norm": 0.28238843041660866, + "learning_rate": 3.5476363129410576e-06, + "loss": 0.008, + "step": 6527 + }, + { + "epoch": 2.969972702456779, + "grad_norm": 0.3697340753396572, + "learning_rate": 3.5462687028525366e-06, + "loss": 0.0099, + "step": 6528 + }, + { + "epoch": 2.970427661510464, + "grad_norm": 0.32393130922575747, + "learning_rate": 3.544901211556374e-06, + "loss": 0.0087, + "step": 6529 + }, + { + "epoch": 2.970882620564149, + "grad_norm": 0.4461277427758584, + "learning_rate": 3.543533839164312e-06, + "loss": 0.0135, + "step": 6530 + }, + { + "epoch": 2.9713375796178343, + "grad_norm": 0.29835175560956406, + "learning_rate": 3.5421665857880887e-06, + "loss": 0.0059, + "step": 6531 + }, + { + "epoch": 2.9717925386715196, + "grad_norm": 0.3606052458056106, + "learning_rate": 3.5407994515394273e-06, + "loss": 0.0214, + "step": 6532 + }, + { + "epoch": 2.9722474977252045, + "grad_norm": 0.4187932062463131, + "learning_rate": 3.539432436530046e-06, + "loss": 0.011, + "step": 6533 + }, + { + "epoch": 2.97270245677889, + "grad_norm": 0.319007180889365, + "learning_rate": 3.538065540871649e-06, + "loss": 0.0116, + "step": 6534 + }, + { + "epoch": 2.973157415832575, + "grad_norm": 0.5531012482531121, + "learning_rate": 3.5366987646759333e-06, + "loss": 0.0152, + "step": 6535 + }, + { + "epoch": 2.9736123748862604, + "grad_norm": 0.47758964654481584, + "learning_rate": 3.535332108054589e-06, + "loss": 0.0179, + "step": 6536 + }, + { + "epoch": 2.9740673339399453, + "grad_norm": 0.3569075037008427, + "learning_rate": 3.5339655711192878e-06, + "loss": 0.015, + "step": 6537 + }, + { + "epoch": 2.9745222929936306, + "grad_norm": 0.5098602727032687, + "learning_rate": 3.532599153981702e-06, + "loss": 0.0147, + "step": 6538 + }, + { + "epoch": 2.974977252047316, + "grad_norm": 0.33681859522390956, + "learning_rate": 3.531232856753486e-06, + "loss": 0.0125, + "step": 6539 + }, + { + "epoch": 2.9754322111010008, + "grad_norm": 0.552580439927012, + "learning_rate": 3.5298666795462865e-06, + "loss": 0.0147, + "step": 6540 + }, + { + "epoch": 2.975887170154686, + "grad_norm": 0.3375126958474858, + "learning_rate": 3.528500622471745e-06, + "loss": 0.0086, + "step": 6541 + }, + { + "epoch": 2.9763421292083714, + "grad_norm": 0.5061059149911944, + "learning_rate": 3.5271346856414847e-06, + "loss": 0.0158, + "step": 6542 + }, + { + "epoch": 2.9767970882620567, + "grad_norm": 0.2708132117320371, + "learning_rate": 3.525768869167128e-06, + "loss": 0.0068, + "step": 6543 + }, + { + "epoch": 2.9772520473157416, + "grad_norm": 0.3515462809804613, + "learning_rate": 3.5244031731602824e-06, + "loss": 0.0157, + "step": 6544 + }, + { + "epoch": 2.977707006369427, + "grad_norm": 0.38160272549077745, + "learning_rate": 3.523037597732545e-06, + "loss": 0.0128, + "step": 6545 + }, + { + "epoch": 2.978161965423112, + "grad_norm": 0.5097781769845867, + "learning_rate": 3.521672142995506e-06, + "loss": 0.0253, + "step": 6546 + }, + { + "epoch": 2.978616924476797, + "grad_norm": 0.3929186750012416, + "learning_rate": 3.5203068090607416e-06, + "loss": 0.0093, + "step": 6547 + }, + { + "epoch": 2.9790718835304824, + "grad_norm": 0.5796891004760814, + "learning_rate": 3.518941596039825e-06, + "loss": 0.0181, + "step": 6548 + }, + { + "epoch": 2.9795268425841677, + "grad_norm": 0.4339014424134206, + "learning_rate": 3.51757650404431e-06, + "loss": 0.0277, + "step": 6549 + }, + { + "epoch": 2.9799818016378525, + "grad_norm": 0.41398487307020093, + "learning_rate": 3.5162115331857494e-06, + "loss": 0.0186, + "step": 6550 + }, + { + "epoch": 2.980436760691538, + "grad_norm": 0.3757799362980298, + "learning_rate": 3.514846683575683e-06, + "loss": 0.0146, + "step": 6551 + }, + { + "epoch": 2.980891719745223, + "grad_norm": 0.5281256320015878, + "learning_rate": 3.5134819553256374e-06, + "loss": 0.0146, + "step": 6552 + }, + { + "epoch": 2.981346678798908, + "grad_norm": 0.42468968336333995, + "learning_rate": 3.512117348547134e-06, + "loss": 0.0137, + "step": 6553 + }, + { + "epoch": 2.9818016378525933, + "grad_norm": 0.3359772115067771, + "learning_rate": 3.510752863351682e-06, + "loss": 0.0114, + "step": 6554 + }, + { + "epoch": 2.9822565969062786, + "grad_norm": 0.3638159398785847, + "learning_rate": 3.50938849985078e-06, + "loss": 0.0098, + "step": 6555 + }, + { + "epoch": 2.9827115559599635, + "grad_norm": 0.19591234810517247, + "learning_rate": 3.508024258155918e-06, + "loss": 0.0034, + "step": 6556 + }, + { + "epoch": 2.983166515013649, + "grad_norm": 0.4248982130335413, + "learning_rate": 3.506660138378575e-06, + "loss": 0.0132, + "step": 6557 + }, + { + "epoch": 2.983621474067334, + "grad_norm": 0.41451250175402327, + "learning_rate": 3.505296140630224e-06, + "loss": 0.0177, + "step": 6558 + }, + { + "epoch": 2.984076433121019, + "grad_norm": 0.38969374572008747, + "learning_rate": 3.5039322650223207e-06, + "loss": 0.0108, + "step": 6559 + }, + { + "epoch": 2.9845313921747043, + "grad_norm": 0.38980340160687305, + "learning_rate": 3.5025685116663176e-06, + "loss": 0.0094, + "step": 6560 + }, + { + "epoch": 2.9849863512283896, + "grad_norm": 0.4368813537831224, + "learning_rate": 3.5012048806736525e-06, + "loss": 0.0157, + "step": 6561 + }, + { + "epoch": 2.9854413102820745, + "grad_norm": 0.445846609961705, + "learning_rate": 3.499841372155757e-06, + "loss": 0.0111, + "step": 6562 + }, + { + "epoch": 2.98589626933576, + "grad_norm": 0.36904131373841426, + "learning_rate": 3.4984779862240483e-06, + "loss": 0.009, + "step": 6563 + }, + { + "epoch": 2.986351228389445, + "grad_norm": 0.5158383186613557, + "learning_rate": 3.497114722989938e-06, + "loss": 0.0108, + "step": 6564 + }, + { + "epoch": 2.98680618744313, + "grad_norm": 0.2709198700370854, + "learning_rate": 3.495751582564827e-06, + "loss": 0.0091, + "step": 6565 + }, + { + "epoch": 2.9872611464968153, + "grad_norm": 0.3360782617935061, + "learning_rate": 3.4943885650601028e-06, + "loss": 0.008, + "step": 6566 + }, + { + "epoch": 2.9877161055505006, + "grad_norm": 0.4780280782673142, + "learning_rate": 3.4930256705871467e-06, + "loss": 0.0208, + "step": 6567 + }, + { + "epoch": 2.9881710646041855, + "grad_norm": 0.44920864250946857, + "learning_rate": 3.4916628992573267e-06, + "loss": 0.0118, + "step": 6568 + }, + { + "epoch": 2.988626023657871, + "grad_norm": 0.29882332357265173, + "learning_rate": 3.490300251182003e-06, + "loss": 0.0075, + "step": 6569 + }, + { + "epoch": 2.989080982711556, + "grad_norm": 0.5988789623374708, + "learning_rate": 3.4889377264725233e-06, + "loss": 0.0234, + "step": 6570 + }, + { + "epoch": 2.989535941765241, + "grad_norm": 0.3983572447931714, + "learning_rate": 3.48757532524023e-06, + "loss": 0.0126, + "step": 6571 + }, + { + "epoch": 2.9899909008189263, + "grad_norm": 0.3764680155253722, + "learning_rate": 3.4862130475964516e-06, + "loss": 0.0115, + "step": 6572 + }, + { + "epoch": 2.9904458598726116, + "grad_norm": 0.35833475350018384, + "learning_rate": 3.4848508936525063e-06, + "loss": 0.0138, + "step": 6573 + }, + { + "epoch": 2.9909008189262964, + "grad_norm": 0.34962169597146103, + "learning_rate": 3.4834888635197044e-06, + "loss": 0.0075, + "step": 6574 + }, + { + "epoch": 2.9913557779799818, + "grad_norm": 0.3918155381217852, + "learning_rate": 3.482126957309344e-06, + "loss": 0.015, + "step": 6575 + }, + { + "epoch": 2.991810737033667, + "grad_norm": 0.46818754910044025, + "learning_rate": 3.4807651751327133e-06, + "loss": 0.0139, + "step": 6576 + }, + { + "epoch": 2.992265696087352, + "grad_norm": 0.48511393425607174, + "learning_rate": 3.479403517101091e-06, + "loss": 0.0097, + "step": 6577 + }, + { + "epoch": 2.9927206551410372, + "grad_norm": 0.6301894242314166, + "learning_rate": 3.478041983325747e-06, + "loss": 0.021, + "step": 6578 + }, + { + "epoch": 2.9931756141947226, + "grad_norm": 0.3476736060486332, + "learning_rate": 3.476680573917939e-06, + "loss": 0.0103, + "step": 6579 + }, + { + "epoch": 2.9936305732484074, + "grad_norm": 0.5028656074679961, + "learning_rate": 3.4753192889889166e-06, + "loss": 0.014, + "step": 6580 + }, + { + "epoch": 2.9940855323020927, + "grad_norm": 0.25322291430768457, + "learning_rate": 3.4739581286499147e-06, + "loss": 0.007, + "step": 6581 + }, + { + "epoch": 2.994540491355778, + "grad_norm": 0.36743018715888376, + "learning_rate": 3.4725970930121646e-06, + "loss": 0.0054, + "step": 6582 + }, + { + "epoch": 2.994995450409463, + "grad_norm": 0.32859431938405953, + "learning_rate": 3.4712361821868814e-06, + "loss": 0.0101, + "step": 6583 + }, + { + "epoch": 2.9954504094631482, + "grad_norm": 0.3683224458881264, + "learning_rate": 3.4698753962852715e-06, + "loss": 0.0097, + "step": 6584 + }, + { + "epoch": 2.9959053685168335, + "grad_norm": 0.44288889919259156, + "learning_rate": 3.468514735418537e-06, + "loss": 0.012, + "step": 6585 + }, + { + "epoch": 2.9963603275705184, + "grad_norm": 0.4542328497419673, + "learning_rate": 3.4671541996978607e-06, + "loss": 0.013, + "step": 6586 + }, + { + "epoch": 2.9968152866242037, + "grad_norm": 0.28826436118881377, + "learning_rate": 3.465793789234423e-06, + "loss": 0.0137, + "step": 6587 + }, + { + "epoch": 2.997270245677889, + "grad_norm": 0.3456106458761672, + "learning_rate": 3.4644335041393867e-06, + "loss": 0.0097, + "step": 6588 + }, + { + "epoch": 2.997725204731574, + "grad_norm": 0.36722041869566846, + "learning_rate": 3.463073344523911e-06, + "loss": 0.0105, + "step": 6589 + }, + { + "epoch": 2.998180163785259, + "grad_norm": 0.3671137298434854, + "learning_rate": 3.4617133104991396e-06, + "loss": 0.0163, + "step": 6590 + }, + { + "epoch": 2.9986351228389445, + "grad_norm": 0.26960385085062966, + "learning_rate": 3.4603534021762088e-06, + "loss": 0.0043, + "step": 6591 + }, + { + "epoch": 2.99909008189263, + "grad_norm": 0.4990967729111016, + "learning_rate": 3.458993619666248e-06, + "loss": 0.0104, + "step": 6592 + }, + { + "epoch": 2.9995450409463147, + "grad_norm": 0.5826797641151416, + "learning_rate": 3.4576339630803667e-06, + "loss": 0.0245, + "step": 6593 + }, + { + "epoch": 3.0, + "grad_norm": 0.23571962806449165, + "learning_rate": 3.456274432529675e-06, + "loss": 0.0083, + "step": 6594 + }, + { + "epoch": 3.0004549590536853, + "grad_norm": 0.0893191704649401, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.0018, + "step": 6595 + }, + { + "epoch": 3.00090991810737, + "grad_norm": 0.3347845618716773, + "learning_rate": 3.4535557499782195e-06, + "loss": 0.0056, + "step": 6596 + }, + { + "epoch": 3.0013648771610555, + "grad_norm": 0.13635346834634315, + "learning_rate": 3.452196598199615e-06, + "loss": 0.0024, + "step": 6597 + }, + { + "epoch": 3.001819836214741, + "grad_norm": 0.16089312403502545, + "learning_rate": 3.4508375729005137e-06, + "loss": 0.003, + "step": 6598 + }, + { + "epoch": 3.0022747952684257, + "grad_norm": 0.19438018052149716, + "learning_rate": 3.449478674191972e-06, + "loss": 0.0057, + "step": 6599 + }, + { + "epoch": 3.002729754322111, + "grad_norm": 0.1531692653094001, + "learning_rate": 3.44811990218503e-06, + "loss": 0.0029, + "step": 6600 + }, + { + "epoch": 3.0031847133757963, + "grad_norm": 0.14561522799142143, + "learning_rate": 3.4467612569907226e-06, + "loss": 0.0025, + "step": 6601 + }, + { + "epoch": 3.003639672429481, + "grad_norm": 0.21913303802077205, + "learning_rate": 3.4454027387200695e-06, + "loss": 0.0043, + "step": 6602 + }, + { + "epoch": 3.0040946314831665, + "grad_norm": 0.1478213208738507, + "learning_rate": 3.4440443474840857e-06, + "loss": 0.0021, + "step": 6603 + }, + { + "epoch": 3.0045495905368518, + "grad_norm": 0.2202665007894661, + "learning_rate": 3.4426860833937696e-06, + "loss": 0.0062, + "step": 6604 + }, + { + "epoch": 3.0050045495905366, + "grad_norm": 0.15933037907081252, + "learning_rate": 3.4413279465601136e-06, + "loss": 0.0022, + "step": 6605 + }, + { + "epoch": 3.005459508644222, + "grad_norm": 0.2995172416816985, + "learning_rate": 3.4399699370940996e-06, + "loss": 0.0062, + "step": 6606 + }, + { + "epoch": 3.0059144676979073, + "grad_norm": 0.34600386926789417, + "learning_rate": 3.4386120551066976e-06, + "loss": 0.0037, + "step": 6607 + }, + { + "epoch": 3.0063694267515926, + "grad_norm": 0.20426270040272176, + "learning_rate": 3.437254300708868e-06, + "loss": 0.0039, + "step": 6608 + }, + { + "epoch": 3.0068243858052774, + "grad_norm": 0.301874923402197, + "learning_rate": 3.4358966740115595e-06, + "loss": 0.0073, + "step": 6609 + }, + { + "epoch": 3.0072793448589628, + "grad_norm": 0.1122449943135855, + "learning_rate": 3.4345391751257105e-06, + "loss": 0.0014, + "step": 6610 + }, + { + "epoch": 3.007734303912648, + "grad_norm": 0.1996226528370291, + "learning_rate": 3.433181804162251e-06, + "loss": 0.0046, + "step": 6611 + }, + { + "epoch": 3.008189262966333, + "grad_norm": 0.1739934559568579, + "learning_rate": 3.4318245612320976e-06, + "loss": 0.0023, + "step": 6612 + }, + { + "epoch": 3.0086442220200182, + "grad_norm": 0.3493893395081199, + "learning_rate": 3.4304674464461597e-06, + "loss": 0.0091, + "step": 6613 + }, + { + "epoch": 3.0090991810737036, + "grad_norm": 0.18861323894491727, + "learning_rate": 3.429110459915336e-06, + "loss": 0.0041, + "step": 6614 + }, + { + "epoch": 3.0095541401273884, + "grad_norm": 0.2042787293335032, + "learning_rate": 3.427753601750509e-06, + "loss": 0.0045, + "step": 6615 + }, + { + "epoch": 3.0100090991810737, + "grad_norm": 0.07566522736113394, + "learning_rate": 3.4263968720625597e-06, + "loss": 0.0013, + "step": 6616 + }, + { + "epoch": 3.010464058234759, + "grad_norm": 0.4534280859980601, + "learning_rate": 3.4250402709623497e-06, + "loss": 0.0051, + "step": 6617 + }, + { + "epoch": 3.010919017288444, + "grad_norm": 0.23575231717807305, + "learning_rate": 3.423683798560738e-06, + "loss": 0.0063, + "step": 6618 + }, + { + "epoch": 3.011373976342129, + "grad_norm": 0.14894053974123447, + "learning_rate": 3.4223274549685653e-06, + "loss": 0.0032, + "step": 6619 + }, + { + "epoch": 3.0118289353958145, + "grad_norm": 0.11599223217040529, + "learning_rate": 3.4209712402966693e-06, + "loss": 0.0021, + "step": 6620 + }, + { + "epoch": 3.0122838944494994, + "grad_norm": 0.3253214585977508, + "learning_rate": 3.419615154655874e-06, + "loss": 0.0049, + "step": 6621 + }, + { + "epoch": 3.0127388535031847, + "grad_norm": 0.14413936950678644, + "learning_rate": 3.4182591981569903e-06, + "loss": 0.0026, + "step": 6622 + }, + { + "epoch": 3.01319381255687, + "grad_norm": 0.15902466126091303, + "learning_rate": 3.416903370910822e-06, + "loss": 0.0028, + "step": 6623 + }, + { + "epoch": 3.013648771610555, + "grad_norm": 0.3624443200913134, + "learning_rate": 3.415547673028161e-06, + "loss": 0.0058, + "step": 6624 + }, + { + "epoch": 3.01410373066424, + "grad_norm": 0.24355384727569604, + "learning_rate": 3.41419210461979e-06, + "loss": 0.0027, + "step": 6625 + }, + { + "epoch": 3.0145586897179255, + "grad_norm": 0.3643777170821261, + "learning_rate": 3.412836665796476e-06, + "loss": 0.012, + "step": 6626 + }, + { + "epoch": 3.0150136487716104, + "grad_norm": 0.19011536327909298, + "learning_rate": 3.4114813566689837e-06, + "loss": 0.0026, + "step": 6627 + }, + { + "epoch": 3.0154686078252957, + "grad_norm": 0.36758426824899176, + "learning_rate": 3.410126177348062e-06, + "loss": 0.0047, + "step": 6628 + }, + { + "epoch": 3.015923566878981, + "grad_norm": 0.27298808165345695, + "learning_rate": 3.408771127944448e-06, + "loss": 0.0033, + "step": 6629 + }, + { + "epoch": 3.016378525932666, + "grad_norm": 0.264986574263462, + "learning_rate": 3.4074162085688734e-06, + "loss": 0.0058, + "step": 6630 + }, + { + "epoch": 3.016833484986351, + "grad_norm": 0.12813541524178768, + "learning_rate": 3.4060614193320524e-06, + "loss": 0.0017, + "step": 6631 + }, + { + "epoch": 3.0172884440400365, + "grad_norm": 0.337075641842545, + "learning_rate": 3.4047067603446947e-06, + "loss": 0.0042, + "step": 6632 + }, + { + "epoch": 3.0177434030937214, + "grad_norm": 0.3163141420097077, + "learning_rate": 3.403352231717495e-06, + "loss": 0.0096, + "step": 6633 + }, + { + "epoch": 3.0181983621474067, + "grad_norm": 0.618042102760366, + "learning_rate": 3.4019978335611414e-06, + "loss": 0.0058, + "step": 6634 + }, + { + "epoch": 3.018653321201092, + "grad_norm": 0.2230934149262993, + "learning_rate": 3.400643565986309e-06, + "loss": 0.0034, + "step": 6635 + }, + { + "epoch": 3.0191082802547773, + "grad_norm": 0.11737139792400703, + "learning_rate": 3.3992894291036615e-06, + "loss": 0.0012, + "step": 6636 + }, + { + "epoch": 3.019563239308462, + "grad_norm": 0.1253771741531208, + "learning_rate": 3.3979354230238537e-06, + "loss": 0.0017, + "step": 6637 + }, + { + "epoch": 3.0200181983621475, + "grad_norm": 0.31309338322046454, + "learning_rate": 3.396581547857527e-06, + "loss": 0.0032, + "step": 6638 + }, + { + "epoch": 3.0204731574158328, + "grad_norm": 0.2721560884607357, + "learning_rate": 3.3952278037153162e-06, + "loss": 0.0059, + "step": 6639 + }, + { + "epoch": 3.0209281164695176, + "grad_norm": 0.19990261394723954, + "learning_rate": 3.3938741907078404e-06, + "loss": 0.0026, + "step": 6640 + }, + { + "epoch": 3.021383075523203, + "grad_norm": 0.24426365534373817, + "learning_rate": 3.3925207089457134e-06, + "loss": 0.0047, + "step": 6641 + }, + { + "epoch": 3.0218380345768883, + "grad_norm": 0.28575080423457644, + "learning_rate": 3.391167358539536e-06, + "loss": 0.0037, + "step": 6642 + }, + { + "epoch": 3.022292993630573, + "grad_norm": 0.138887174100726, + "learning_rate": 3.3898141395998957e-06, + "loss": 0.0018, + "step": 6643 + }, + { + "epoch": 3.0227479526842584, + "grad_norm": 0.3332499529648303, + "learning_rate": 3.388461052237373e-06, + "loss": 0.0105, + "step": 6644 + }, + { + "epoch": 3.0232029117379438, + "grad_norm": 0.27342978649469785, + "learning_rate": 3.3871080965625356e-06, + "loss": 0.0022, + "step": 6645 + }, + { + "epoch": 3.0236578707916286, + "grad_norm": 0.19489859654165118, + "learning_rate": 3.3857552726859398e-06, + "loss": 0.0039, + "step": 6646 + }, + { + "epoch": 3.024112829845314, + "grad_norm": 0.14270451832945846, + "learning_rate": 3.3844025807181325e-06, + "loss": 0.0018, + "step": 6647 + }, + { + "epoch": 3.0245677888989992, + "grad_norm": 0.35927759277977983, + "learning_rate": 3.383050020769652e-06, + "loss": 0.0052, + "step": 6648 + }, + { + "epoch": 3.025022747952684, + "grad_norm": 0.24924094124945648, + "learning_rate": 3.381697592951021e-06, + "loss": 0.0038, + "step": 6649 + }, + { + "epoch": 3.0254777070063694, + "grad_norm": 0.2909270553851987, + "learning_rate": 3.380345297372756e-06, + "loss": 0.0036, + "step": 6650 + }, + { + "epoch": 3.0259326660600547, + "grad_norm": 0.2911388642080158, + "learning_rate": 3.3789931341453564e-06, + "loss": 0.0068, + "step": 6651 + }, + { + "epoch": 3.0263876251137396, + "grad_norm": 0.2488324883508692, + "learning_rate": 3.3776411033793198e-06, + "loss": 0.0055, + "step": 6652 + }, + { + "epoch": 3.026842584167425, + "grad_norm": 0.10552758910776536, + "learning_rate": 3.376289205185125e-06, + "loss": 0.0028, + "step": 6653 + }, + { + "epoch": 3.02729754322111, + "grad_norm": 0.04360860869686373, + "learning_rate": 3.3749374396732413e-06, + "loss": 0.0005, + "step": 6654 + }, + { + "epoch": 3.027752502274795, + "grad_norm": 0.16426236232776292, + "learning_rate": 3.3735858069541345e-06, + "loss": 0.0023, + "step": 6655 + }, + { + "epoch": 3.0282074613284804, + "grad_norm": 0.2646664737834769, + "learning_rate": 3.372234307138249e-06, + "loss": 0.0038, + "step": 6656 + }, + { + "epoch": 3.0286624203821657, + "grad_norm": 0.12543008094547287, + "learning_rate": 3.3708829403360266e-06, + "loss": 0.0023, + "step": 6657 + }, + { + "epoch": 3.0291173794358506, + "grad_norm": 0.23685846621502793, + "learning_rate": 3.369531706657892e-06, + "loss": 0.0047, + "step": 6658 + }, + { + "epoch": 3.029572338489536, + "grad_norm": 0.27145480963050583, + "learning_rate": 3.368180606214264e-06, + "loss": 0.0074, + "step": 6659 + }, + { + "epoch": 3.030027297543221, + "grad_norm": 0.11211373787488198, + "learning_rate": 3.3668296391155473e-06, + "loss": 0.002, + "step": 6660 + }, + { + "epoch": 3.030482256596906, + "grad_norm": 0.22366057001345938, + "learning_rate": 3.3654788054721356e-06, + "loss": 0.0038, + "step": 6661 + }, + { + "epoch": 3.0309372156505914, + "grad_norm": 0.3061134159484588, + "learning_rate": 3.3641281053944165e-06, + "loss": 0.0062, + "step": 6662 + }, + { + "epoch": 3.0313921747042767, + "grad_norm": 0.19156535723123164, + "learning_rate": 3.36277753899276e-06, + "loss": 0.0048, + "step": 6663 + }, + { + "epoch": 3.031847133757962, + "grad_norm": 0.3513658408597901, + "learning_rate": 3.3614271063775306e-06, + "loss": 0.0042, + "step": 6664 + }, + { + "epoch": 3.032302092811647, + "grad_norm": 0.39642909000522336, + "learning_rate": 3.3600768076590772e-06, + "loss": 0.0088, + "step": 6665 + }, + { + "epoch": 3.032757051865332, + "grad_norm": 0.1440361583079107, + "learning_rate": 3.3587266429477426e-06, + "loss": 0.0014, + "step": 6666 + }, + { + "epoch": 3.0332120109190175, + "grad_norm": 0.4184718961994572, + "learning_rate": 3.3573766123538536e-06, + "loss": 0.0051, + "step": 6667 + }, + { + "epoch": 3.0336669699727024, + "grad_norm": 0.22512611342607328, + "learning_rate": 3.356026715987729e-06, + "loss": 0.0024, + "step": 6668 + }, + { + "epoch": 3.0341219290263877, + "grad_norm": 0.1610965509607099, + "learning_rate": 3.354676953959679e-06, + "loss": 0.002, + "step": 6669 + }, + { + "epoch": 3.034576888080073, + "grad_norm": 0.13527929271548852, + "learning_rate": 3.353327326379997e-06, + "loss": 0.0014, + "step": 6670 + }, + { + "epoch": 3.035031847133758, + "grad_norm": 0.28141741622039324, + "learning_rate": 3.3519778333589702e-06, + "loss": 0.0038, + "step": 6671 + }, + { + "epoch": 3.035486806187443, + "grad_norm": 0.2240547258601244, + "learning_rate": 3.3506284750068718e-06, + "loss": 0.0033, + "step": 6672 + }, + { + "epoch": 3.0359417652411285, + "grad_norm": 0.20360780883197763, + "learning_rate": 3.3492792514339672e-06, + "loss": 0.001, + "step": 6673 + }, + { + "epoch": 3.0363967242948133, + "grad_norm": 0.11469096789758414, + "learning_rate": 3.347930162750505e-06, + "loss": 0.001, + "step": 6674 + }, + { + "epoch": 3.0368516833484986, + "grad_norm": 0.21056526465288855, + "learning_rate": 3.3465812090667303e-06, + "loss": 0.0028, + "step": 6675 + }, + { + "epoch": 3.037306642402184, + "grad_norm": 0.2933373381535325, + "learning_rate": 3.3452323904928742e-06, + "loss": 0.0052, + "step": 6676 + }, + { + "epoch": 3.037761601455869, + "grad_norm": 0.14648108429263568, + "learning_rate": 3.343883707139153e-06, + "loss": 0.0016, + "step": 6677 + }, + { + "epoch": 3.038216560509554, + "grad_norm": 0.3595127209262766, + "learning_rate": 3.3425351591157766e-06, + "loss": 0.008, + "step": 6678 + }, + { + "epoch": 3.0386715195632394, + "grad_norm": 0.15990912197878135, + "learning_rate": 3.3411867465329416e-06, + "loss": 0.0025, + "step": 6679 + }, + { + "epoch": 3.0391264786169243, + "grad_norm": 0.3206118713392188, + "learning_rate": 3.3398384695008356e-06, + "loss": 0.0018, + "step": 6680 + }, + { + "epoch": 3.0395814376706096, + "grad_norm": 0.2639358400102524, + "learning_rate": 3.33849032812963e-06, + "loss": 0.0026, + "step": 6681 + }, + { + "epoch": 3.040036396724295, + "grad_norm": 0.6928160680099208, + "learning_rate": 3.337142322529493e-06, + "loss": 0.0184, + "step": 6682 + }, + { + "epoch": 3.04049135577798, + "grad_norm": 0.29813064246204063, + "learning_rate": 3.3357944528105767e-06, + "loss": 0.004, + "step": 6683 + }, + { + "epoch": 3.040946314831665, + "grad_norm": 0.246077961176862, + "learning_rate": 3.334446719083022e-06, + "loss": 0.0037, + "step": 6684 + }, + { + "epoch": 3.0414012738853504, + "grad_norm": 0.1168015668333905, + "learning_rate": 3.33309912145696e-06, + "loss": 0.001, + "step": 6685 + }, + { + "epoch": 3.0418562329390353, + "grad_norm": 0.26554698256243703, + "learning_rate": 3.3317516600425105e-06, + "loss": 0.0047, + "step": 6686 + }, + { + "epoch": 3.0423111919927206, + "grad_norm": 0.145936907656275, + "learning_rate": 3.33040433494978e-06, + "loss": 0.0015, + "step": 6687 + }, + { + "epoch": 3.042766151046406, + "grad_norm": 0.14191997224295935, + "learning_rate": 3.3290571462888664e-06, + "loss": 0.0022, + "step": 6688 + }, + { + "epoch": 3.0432211101000908, + "grad_norm": 0.2068582104615765, + "learning_rate": 3.3277100941698597e-06, + "loss": 0.0029, + "step": 6689 + }, + { + "epoch": 3.043676069153776, + "grad_norm": 0.12399299402302033, + "learning_rate": 3.3263631787028308e-06, + "loss": 0.0016, + "step": 6690 + }, + { + "epoch": 3.0441310282074614, + "grad_norm": 0.15741626964441308, + "learning_rate": 3.3250163999978457e-06, + "loss": 0.0018, + "step": 6691 + }, + { + "epoch": 3.0445859872611467, + "grad_norm": 0.3094470919955424, + "learning_rate": 3.3236697581649557e-06, + "loss": 0.0059, + "step": 6692 + }, + { + "epoch": 3.0450409463148316, + "grad_norm": 0.18765245098930736, + "learning_rate": 3.3223232533142034e-06, + "loss": 0.0015, + "step": 6693 + }, + { + "epoch": 3.045495905368517, + "grad_norm": 0.5725730085518594, + "learning_rate": 3.320976885555618e-06, + "loss": 0.0036, + "step": 6694 + }, + { + "epoch": 3.045950864422202, + "grad_norm": 0.23770536112447027, + "learning_rate": 3.3196306549992176e-06, + "loss": 0.0037, + "step": 6695 + }, + { + "epoch": 3.046405823475887, + "grad_norm": 0.16804086419320965, + "learning_rate": 3.3182845617550137e-06, + "loss": 0.0016, + "step": 6696 + }, + { + "epoch": 3.0468607825295724, + "grad_norm": 0.2221844581540461, + "learning_rate": 3.316938605933e-06, + "loss": 0.0028, + "step": 6697 + }, + { + "epoch": 3.0473157415832577, + "grad_norm": 0.10819946158612141, + "learning_rate": 3.315592787643164e-06, + "loss": 0.001, + "step": 6698 + }, + { + "epoch": 3.0477707006369426, + "grad_norm": 0.2660761098576769, + "learning_rate": 3.3142471069954767e-06, + "loss": 0.0025, + "step": 6699 + }, + { + "epoch": 3.048225659690628, + "grad_norm": 0.18950291654243787, + "learning_rate": 3.3129015640999052e-06, + "loss": 0.0022, + "step": 6700 + }, + { + "epoch": 3.048680618744313, + "grad_norm": 0.13500705608327346, + "learning_rate": 3.311556159066397e-06, + "loss": 0.0032, + "step": 6701 + }, + { + "epoch": 3.049135577797998, + "grad_norm": 0.42461346784601345, + "learning_rate": 3.3102108920048935e-06, + "loss": 0.0051, + "step": 6702 + }, + { + "epoch": 3.0495905368516834, + "grad_norm": 0.29661973482573345, + "learning_rate": 3.3088657630253278e-06, + "loss": 0.0059, + "step": 6703 + }, + { + "epoch": 3.0500454959053687, + "grad_norm": 0.21458789712042925, + "learning_rate": 3.3075207722376136e-06, + "loss": 0.0056, + "step": 6704 + }, + { + "epoch": 3.0505004549590535, + "grad_norm": 0.22927246194199355, + "learning_rate": 3.3061759197516598e-06, + "loss": 0.0047, + "step": 6705 + }, + { + "epoch": 3.050955414012739, + "grad_norm": 0.12383450067305045, + "learning_rate": 3.3048312056773592e-06, + "loss": 0.002, + "step": 6706 + }, + { + "epoch": 3.051410373066424, + "grad_norm": 0.28367765063276584, + "learning_rate": 3.3034866301245983e-06, + "loss": 0.006, + "step": 6707 + }, + { + "epoch": 3.051865332120109, + "grad_norm": 0.10539871834944999, + "learning_rate": 3.302142193203248e-06, + "loss": 0.0007, + "step": 6708 + }, + { + "epoch": 3.0523202911737943, + "grad_norm": 0.20715104476378063, + "learning_rate": 3.3007978950231684e-06, + "loss": 0.0009, + "step": 6709 + }, + { + "epoch": 3.0527752502274796, + "grad_norm": 0.12719221782871798, + "learning_rate": 3.2994537356942137e-06, + "loss": 0.0014, + "step": 6710 + }, + { + "epoch": 3.0532302092811645, + "grad_norm": 0.16971830470920765, + "learning_rate": 3.298109715326219e-06, + "loss": 0.0035, + "step": 6711 + }, + { + "epoch": 3.05368516833485, + "grad_norm": 0.38435683043787533, + "learning_rate": 3.296765834029014e-06, + "loss": 0.0063, + "step": 6712 + }, + { + "epoch": 3.054140127388535, + "grad_norm": 0.2242781790282236, + "learning_rate": 3.2954220919124125e-06, + "loss": 0.0031, + "step": 6713 + }, + { + "epoch": 3.05459508644222, + "grad_norm": 0.1947451320689837, + "learning_rate": 3.29407848908622e-06, + "loss": 0.0021, + "step": 6714 + }, + { + "epoch": 3.0550500454959053, + "grad_norm": 0.27425414421883815, + "learning_rate": 3.2927350256602293e-06, + "loss": 0.0022, + "step": 6715 + }, + { + "epoch": 3.0555050045495906, + "grad_norm": 0.342670858910268, + "learning_rate": 3.291391701744221e-06, + "loss": 0.0046, + "step": 6716 + }, + { + "epoch": 3.055959963603276, + "grad_norm": 0.40502422693178913, + "learning_rate": 3.290048517447969e-06, + "loss": 0.0095, + "step": 6717 + }, + { + "epoch": 3.056414922656961, + "grad_norm": 0.2447947801658237, + "learning_rate": 3.2887054728812284e-06, + "loss": 0.004, + "step": 6718 + }, + { + "epoch": 3.056869881710646, + "grad_norm": 0.07084556483498368, + "learning_rate": 3.2873625681537503e-06, + "loss": 0.0006, + "step": 6719 + }, + { + "epoch": 3.0573248407643314, + "grad_norm": 0.18491076189575661, + "learning_rate": 3.286019803375269e-06, + "loss": 0.0017, + "step": 6720 + }, + { + "epoch": 3.0577797998180163, + "grad_norm": 0.1518483405872594, + "learning_rate": 3.2846771786555075e-06, + "loss": 0.0016, + "step": 6721 + }, + { + "epoch": 3.0582347588717016, + "grad_norm": 0.1246677738383808, + "learning_rate": 3.2833346941041823e-06, + "loss": 0.0012, + "step": 6722 + }, + { + "epoch": 3.058689717925387, + "grad_norm": 0.24501469373553433, + "learning_rate": 3.2819923498309903e-06, + "loss": 0.0037, + "step": 6723 + }, + { + "epoch": 3.0591446769790718, + "grad_norm": 0.34766650866262405, + "learning_rate": 3.280650145945627e-06, + "loss": 0.0035, + "step": 6724 + }, + { + "epoch": 3.059599636032757, + "grad_norm": 0.14575844229273618, + "learning_rate": 3.27930808255777e-06, + "loss": 0.0007, + "step": 6725 + }, + { + "epoch": 3.0600545950864424, + "grad_norm": 0.2783510874130845, + "learning_rate": 3.277966159777085e-06, + "loss": 0.0068, + "step": 6726 + }, + { + "epoch": 3.0605095541401273, + "grad_norm": 0.07908330415622568, + "learning_rate": 3.27662437771323e-06, + "loss": 0.0006, + "step": 6727 + }, + { + "epoch": 3.0609645131938126, + "grad_norm": 0.2734536838160686, + "learning_rate": 3.2752827364758464e-06, + "loss": 0.0023, + "step": 6728 + }, + { + "epoch": 3.061419472247498, + "grad_norm": 0.18183981401188093, + "learning_rate": 3.2739412361745703e-06, + "loss": 0.0012, + "step": 6729 + }, + { + "epoch": 3.0618744313011828, + "grad_norm": 0.3419348643210425, + "learning_rate": 3.27259987691902e-06, + "loss": 0.0036, + "step": 6730 + }, + { + "epoch": 3.062329390354868, + "grad_norm": 0.32333906168309456, + "learning_rate": 3.2712586588188074e-06, + "loss": 0.0056, + "step": 6731 + }, + { + "epoch": 3.0627843494085534, + "grad_norm": 0.05072137352877194, + "learning_rate": 3.269917581983531e-06, + "loss": 0.0005, + "step": 6732 + }, + { + "epoch": 3.0632393084622382, + "grad_norm": 0.185579387618519, + "learning_rate": 3.2685766465227766e-06, + "loss": 0.0027, + "step": 6733 + }, + { + "epoch": 3.0636942675159236, + "grad_norm": 0.32487271522956185, + "learning_rate": 3.26723585254612e-06, + "loss": 0.0076, + "step": 6734 + }, + { + "epoch": 3.064149226569609, + "grad_norm": 0.41963868473177607, + "learning_rate": 3.265895200163123e-06, + "loss": 0.0074, + "step": 6735 + }, + { + "epoch": 3.0646041856232937, + "grad_norm": 0.43032963788354833, + "learning_rate": 3.2645546894833415e-06, + "loss": 0.0142, + "step": 6736 + }, + { + "epoch": 3.065059144676979, + "grad_norm": 0.23090880419977639, + "learning_rate": 3.2632143206163103e-06, + "loss": 0.0058, + "step": 6737 + }, + { + "epoch": 3.0655141037306644, + "grad_norm": 0.13971581409177106, + "learning_rate": 3.2618740936715633e-06, + "loss": 0.0027, + "step": 6738 + }, + { + "epoch": 3.065969062784349, + "grad_norm": 0.27559078548912813, + "learning_rate": 3.2605340087586168e-06, + "loss": 0.008, + "step": 6739 + }, + { + "epoch": 3.0664240218380345, + "grad_norm": 0.19149140193008823, + "learning_rate": 3.2591940659869747e-06, + "loss": 0.0029, + "step": 6740 + }, + { + "epoch": 3.06687898089172, + "grad_norm": 0.32214302850237414, + "learning_rate": 3.2578542654661326e-06, + "loss": 0.0045, + "step": 6741 + }, + { + "epoch": 3.0673339399454047, + "grad_norm": 0.3984638299308933, + "learning_rate": 3.256514607305572e-06, + "loss": 0.0156, + "step": 6742 + }, + { + "epoch": 3.06778889899909, + "grad_norm": 0.1549843083152256, + "learning_rate": 3.2551750916147656e-06, + "loss": 0.002, + "step": 6743 + }, + { + "epoch": 3.0682438580527753, + "grad_norm": 0.14445799642068108, + "learning_rate": 3.2538357185031688e-06, + "loss": 0.0015, + "step": 6744 + }, + { + "epoch": 3.06869881710646, + "grad_norm": 0.286337049654631, + "learning_rate": 3.2524964880802324e-06, + "loss": 0.0091, + "step": 6745 + }, + { + "epoch": 3.0691537761601455, + "grad_norm": 0.25286225331619305, + "learning_rate": 3.2511574004553924e-06, + "loss": 0.0034, + "step": 6746 + }, + { + "epoch": 3.069608735213831, + "grad_norm": 0.10431242281535952, + "learning_rate": 3.2498184557380705e-06, + "loss": 0.0012, + "step": 6747 + }, + { + "epoch": 3.070063694267516, + "grad_norm": 0.16269028958376114, + "learning_rate": 3.2484796540376824e-06, + "loss": 0.0024, + "step": 6748 + }, + { + "epoch": 3.070518653321201, + "grad_norm": 0.1963416886866191, + "learning_rate": 3.2471409954636256e-06, + "loss": 0.0047, + "step": 6749 + }, + { + "epoch": 3.0709736123748863, + "grad_norm": 0.24824216801087257, + "learning_rate": 3.245802480125292e-06, + "loss": 0.0027, + "step": 6750 + }, + { + "epoch": 3.0714285714285716, + "grad_norm": 0.3652436704817453, + "learning_rate": 3.244464108132056e-06, + "loss": 0.0126, + "step": 6751 + }, + { + "epoch": 3.0718835304822565, + "grad_norm": 0.256209447243996, + "learning_rate": 3.2431258795932863e-06, + "loss": 0.0027, + "step": 6752 + }, + { + "epoch": 3.072338489535942, + "grad_norm": 0.1885877672421738, + "learning_rate": 3.241787794618336e-06, + "loss": 0.0028, + "step": 6753 + }, + { + "epoch": 3.072793448589627, + "grad_norm": 0.1684478415436154, + "learning_rate": 3.240449853316548e-06, + "loss": 0.0038, + "step": 6754 + }, + { + "epoch": 3.073248407643312, + "grad_norm": 0.2539833830707047, + "learning_rate": 3.23911205579725e-06, + "loss": 0.0032, + "step": 6755 + }, + { + "epoch": 3.0737033666969973, + "grad_norm": 0.23607032119302146, + "learning_rate": 3.2377744021697643e-06, + "loss": 0.004, + "step": 6756 + }, + { + "epoch": 3.0741583257506826, + "grad_norm": 0.2461559638382031, + "learning_rate": 3.2364368925433954e-06, + "loss": 0.0057, + "step": 6757 + }, + { + "epoch": 3.0746132848043675, + "grad_norm": 0.19472830974496605, + "learning_rate": 3.235099527027438e-06, + "loss": 0.0051, + "step": 6758 + }, + { + "epoch": 3.0750682438580528, + "grad_norm": 0.30793589428434276, + "learning_rate": 3.2337623057311794e-06, + "loss": 0.0127, + "step": 6759 + }, + { + "epoch": 3.075523202911738, + "grad_norm": 0.2218162672251745, + "learning_rate": 3.232425228763888e-06, + "loss": 0.0016, + "step": 6760 + }, + { + "epoch": 3.075978161965423, + "grad_norm": 0.15274006563259715, + "learning_rate": 3.2310882962348257e-06, + "loss": 0.0027, + "step": 6761 + }, + { + "epoch": 3.0764331210191083, + "grad_norm": 0.35604068911246184, + "learning_rate": 3.229751508253238e-06, + "loss": 0.0041, + "step": 6762 + }, + { + "epoch": 3.0768880800727936, + "grad_norm": 0.27756735557983553, + "learning_rate": 3.228414864928364e-06, + "loss": 0.0088, + "step": 6763 + }, + { + "epoch": 3.0773430391264784, + "grad_norm": 0.1550615401595909, + "learning_rate": 3.2270783663694254e-06, + "loss": 0.0025, + "step": 6764 + }, + { + "epoch": 3.0777979981801638, + "grad_norm": 0.37469727087291216, + "learning_rate": 3.2257420126856357e-06, + "loss": 0.0054, + "step": 6765 + }, + { + "epoch": 3.078252957233849, + "grad_norm": 0.3148135804600808, + "learning_rate": 3.224405803986198e-06, + "loss": 0.0032, + "step": 6766 + }, + { + "epoch": 3.078707916287534, + "grad_norm": 0.19563974920154628, + "learning_rate": 3.223069740380299e-06, + "loss": 0.0036, + "step": 6767 + }, + { + "epoch": 3.0791628753412192, + "grad_norm": 0.15179995299084575, + "learning_rate": 3.2217338219771166e-06, + "loss": 0.0015, + "step": 6768 + }, + { + "epoch": 3.0796178343949046, + "grad_norm": 0.5307591429935765, + "learning_rate": 3.2203980488858154e-06, + "loss": 0.0065, + "step": 6769 + }, + { + "epoch": 3.0800727934485894, + "grad_norm": 0.09519582501742016, + "learning_rate": 3.2190624212155497e-06, + "loss": 0.0012, + "step": 6770 + }, + { + "epoch": 3.0805277525022747, + "grad_norm": 0.31477397210596536, + "learning_rate": 3.217726939075459e-06, + "loss": 0.0041, + "step": 6771 + }, + { + "epoch": 3.08098271155596, + "grad_norm": 0.10423859914988316, + "learning_rate": 3.2163916025746734e-06, + "loss": 0.0018, + "step": 6772 + }, + { + "epoch": 3.0814376706096454, + "grad_norm": 0.22066632675155598, + "learning_rate": 3.215056411822313e-06, + "loss": 0.0036, + "step": 6773 + }, + { + "epoch": 3.08189262966333, + "grad_norm": 0.3322288750822183, + "learning_rate": 3.213721366927481e-06, + "loss": 0.0102, + "step": 6774 + }, + { + "epoch": 3.0823475887170155, + "grad_norm": 0.3297821943068105, + "learning_rate": 3.2123864679992732e-06, + "loss": 0.0028, + "step": 6775 + }, + { + "epoch": 3.082802547770701, + "grad_norm": 0.2861879548721158, + "learning_rate": 3.21105171514677e-06, + "loss": 0.0035, + "step": 6776 + }, + { + "epoch": 3.0832575068243857, + "grad_norm": 0.23409510522565669, + "learning_rate": 3.209717108479042e-06, + "loss": 0.0011, + "step": 6777 + }, + { + "epoch": 3.083712465878071, + "grad_norm": 0.36240084758036684, + "learning_rate": 3.208382648105147e-06, + "loss": 0.0049, + "step": 6778 + }, + { + "epoch": 3.0841674249317563, + "grad_norm": 0.14284862077915064, + "learning_rate": 3.2070483341341295e-06, + "loss": 0.0018, + "step": 6779 + }, + { + "epoch": 3.084622383985441, + "grad_norm": 0.13574095398642788, + "learning_rate": 3.205714166675027e-06, + "loss": 0.0022, + "step": 6780 + }, + { + "epoch": 3.0850773430391265, + "grad_norm": 0.254705055854357, + "learning_rate": 3.2043801458368597e-06, + "loss": 0.0055, + "step": 6781 + }, + { + "epoch": 3.085532302092812, + "grad_norm": 0.047918884710824824, + "learning_rate": 3.203046271728638e-06, + "loss": 0.0004, + "step": 6782 + }, + { + "epoch": 3.0859872611464967, + "grad_norm": 0.23173230609638582, + "learning_rate": 3.2017125444593595e-06, + "loss": 0.0046, + "step": 6783 + }, + { + "epoch": 3.086442220200182, + "grad_norm": 0.39728780776283734, + "learning_rate": 3.2003789641380115e-06, + "loss": 0.0153, + "step": 6784 + }, + { + "epoch": 3.0868971792538673, + "grad_norm": 0.13449792478069442, + "learning_rate": 3.1990455308735667e-06, + "loss": 0.0014, + "step": 6785 + }, + { + "epoch": 3.087352138307552, + "grad_norm": 0.2541201258133481, + "learning_rate": 3.1977122447749876e-06, + "loss": 0.0032, + "step": 6786 + }, + { + "epoch": 3.0878070973612375, + "grad_norm": 0.1602163158655709, + "learning_rate": 3.196379105951226e-06, + "loss": 0.0034, + "step": 6787 + }, + { + "epoch": 3.088262056414923, + "grad_norm": 0.28588033239075733, + "learning_rate": 3.195046114511219e-06, + "loss": 0.0098, + "step": 6788 + }, + { + "epoch": 3.0887170154686077, + "grad_norm": 0.602944255822795, + "learning_rate": 3.193713270563892e-06, + "loss": 0.0135, + "step": 6789 + }, + { + "epoch": 3.089171974522293, + "grad_norm": 0.20656631772005207, + "learning_rate": 3.1923805742181603e-06, + "loss": 0.0026, + "step": 6790 + }, + { + "epoch": 3.0896269335759783, + "grad_norm": 0.21656989632053653, + "learning_rate": 3.1910480255829235e-06, + "loss": 0.0038, + "step": 6791 + }, + { + "epoch": 3.090081892629663, + "grad_norm": 0.14329498352974906, + "learning_rate": 3.189715624767074e-06, + "loss": 0.0022, + "step": 6792 + }, + { + "epoch": 3.0905368516833485, + "grad_norm": 0.1746950490511674, + "learning_rate": 3.1883833718794863e-06, + "loss": 0.0039, + "step": 6793 + }, + { + "epoch": 3.0909918107370338, + "grad_norm": 0.5528401786430039, + "learning_rate": 3.18705126702903e-06, + "loss": 0.0073, + "step": 6794 + }, + { + "epoch": 3.0914467697907186, + "grad_norm": 0.24112917565479014, + "learning_rate": 3.185719310324557e-06, + "loss": 0.0028, + "step": 6795 + }, + { + "epoch": 3.091901728844404, + "grad_norm": 0.25085510075313666, + "learning_rate": 3.184387501874908e-06, + "loss": 0.0027, + "step": 6796 + }, + { + "epoch": 3.0923566878980893, + "grad_norm": 0.13331791093439024, + "learning_rate": 3.1830558417889145e-06, + "loss": 0.001, + "step": 6797 + }, + { + "epoch": 3.092811646951774, + "grad_norm": 0.1334019090615344, + "learning_rate": 3.1817243301753912e-06, + "loss": 0.0009, + "step": 6798 + }, + { + "epoch": 3.0932666060054594, + "grad_norm": 0.2206446641484756, + "learning_rate": 3.1803929671431457e-06, + "loss": 0.0056, + "step": 6799 + }, + { + "epoch": 3.0937215650591448, + "grad_norm": 0.22346354807526733, + "learning_rate": 3.179061752800967e-06, + "loss": 0.0034, + "step": 6800 + }, + { + "epoch": 3.0941765241128296, + "grad_norm": 0.18159113071231728, + "learning_rate": 3.1777306872576396e-06, + "loss": 0.0039, + "step": 6801 + }, + { + "epoch": 3.094631483166515, + "grad_norm": 0.23189102644656986, + "learning_rate": 3.176399770621933e-06, + "loss": 0.0028, + "step": 6802 + }, + { + "epoch": 3.0950864422202002, + "grad_norm": 0.2646974563604349, + "learning_rate": 3.1750690030025998e-06, + "loss": 0.0046, + "step": 6803 + }, + { + "epoch": 3.0955414012738856, + "grad_norm": 0.3552353110594039, + "learning_rate": 3.173738384508388e-06, + "loss": 0.0061, + "step": 6804 + }, + { + "epoch": 3.0959963603275704, + "grad_norm": 0.27473740890782317, + "learning_rate": 3.172407915248027e-06, + "loss": 0.0059, + "step": 6805 + }, + { + "epoch": 3.0964513193812557, + "grad_norm": 0.21037904857030093, + "learning_rate": 3.171077595330239e-06, + "loss": 0.0025, + "step": 6806 + }, + { + "epoch": 3.096906278434941, + "grad_norm": 0.22035385629324789, + "learning_rate": 3.169747424863728e-06, + "loss": 0.0042, + "step": 6807 + }, + { + "epoch": 3.097361237488626, + "grad_norm": 0.37125583006058394, + "learning_rate": 3.168417403957193e-06, + "loss": 0.0051, + "step": 6808 + }, + { + "epoch": 3.097816196542311, + "grad_norm": 0.22099877904042967, + "learning_rate": 3.167087532719318e-06, + "loss": 0.005, + "step": 6809 + }, + { + "epoch": 3.0982711555959965, + "grad_norm": 0.277029472802014, + "learning_rate": 3.1657578112587713e-06, + "loss": 0.0033, + "step": 6810 + }, + { + "epoch": 3.0987261146496814, + "grad_norm": 0.13559192176704188, + "learning_rate": 3.1644282396842135e-06, + "loss": 0.002, + "step": 6811 + }, + { + "epoch": 3.0991810737033667, + "grad_norm": 0.2477328764641318, + "learning_rate": 3.16309881810429e-06, + "loss": 0.0035, + "step": 6812 + }, + { + "epoch": 3.099636032757052, + "grad_norm": 0.20415176153338863, + "learning_rate": 3.1617695466276364e-06, + "loss": 0.0022, + "step": 6813 + }, + { + "epoch": 3.100090991810737, + "grad_norm": 0.3369442240246176, + "learning_rate": 3.160440425362873e-06, + "loss": 0.0037, + "step": 6814 + }, + { + "epoch": 3.100545950864422, + "grad_norm": 0.45163771156229976, + "learning_rate": 3.1591114544186107e-06, + "loss": 0.0161, + "step": 6815 + }, + { + "epoch": 3.1010009099181075, + "grad_norm": 0.23015613448335928, + "learning_rate": 3.157782633903448e-06, + "loss": 0.0032, + "step": 6816 + }, + { + "epoch": 3.1014558689717924, + "grad_norm": 0.19549436314524332, + "learning_rate": 3.1564539639259685e-06, + "loss": 0.0025, + "step": 6817 + }, + { + "epoch": 3.1019108280254777, + "grad_norm": 0.21312866380568515, + "learning_rate": 3.1551254445947468e-06, + "loss": 0.0014, + "step": 6818 + }, + { + "epoch": 3.102365787079163, + "grad_norm": 0.47724103266672474, + "learning_rate": 3.1537970760183406e-06, + "loss": 0.0105, + "step": 6819 + }, + { + "epoch": 3.102820746132848, + "grad_norm": 0.18669670815297695, + "learning_rate": 3.1524688583053014e-06, + "loss": 0.0019, + "step": 6820 + }, + { + "epoch": 3.103275705186533, + "grad_norm": 0.12167249750420166, + "learning_rate": 3.151140791564162e-06, + "loss": 0.0012, + "step": 6821 + }, + { + "epoch": 3.1037306642402185, + "grad_norm": 0.3700964808925218, + "learning_rate": 3.1498128759034484e-06, + "loss": 0.0098, + "step": 6822 + }, + { + "epoch": 3.1041856232939034, + "grad_norm": 0.28064010484527385, + "learning_rate": 3.1484851114316724e-06, + "loss": 0.0035, + "step": 6823 + }, + { + "epoch": 3.1046405823475887, + "grad_norm": 0.27322401109860034, + "learning_rate": 3.1471574982573306e-06, + "loss": 0.007, + "step": 6824 + }, + { + "epoch": 3.105095541401274, + "grad_norm": 0.1695836303368951, + "learning_rate": 3.1458300364889118e-06, + "loss": 0.0037, + "step": 6825 + }, + { + "epoch": 3.105550500454959, + "grad_norm": 0.3666144555844518, + "learning_rate": 3.144502726234889e-06, + "loss": 0.0063, + "step": 6826 + }, + { + "epoch": 3.106005459508644, + "grad_norm": 0.1254979537275639, + "learning_rate": 3.143175567603723e-06, + "loss": 0.0008, + "step": 6827 + }, + { + "epoch": 3.1064604185623295, + "grad_norm": 0.09078910708998472, + "learning_rate": 3.141848560703863e-06, + "loss": 0.001, + "step": 6828 + }, + { + "epoch": 3.1069153776160148, + "grad_norm": 0.21858526744185838, + "learning_rate": 3.14052170564375e-06, + "loss": 0.0043, + "step": 6829 + }, + { + "epoch": 3.1073703366696996, + "grad_norm": 0.261118643262181, + "learning_rate": 3.139195002531804e-06, + "loss": 0.0079, + "step": 6830 + }, + { + "epoch": 3.107825295723385, + "grad_norm": 0.3114192070734795, + "learning_rate": 3.1378684514764413e-06, + "loss": 0.0021, + "step": 6831 + }, + { + "epoch": 3.1082802547770703, + "grad_norm": 0.18848825978171801, + "learning_rate": 3.1365420525860575e-06, + "loss": 0.0017, + "step": 6832 + }, + { + "epoch": 3.108735213830755, + "grad_norm": 0.2764956274626624, + "learning_rate": 3.135215805969043e-06, + "loss": 0.0031, + "step": 6833 + }, + { + "epoch": 3.1091901728844404, + "grad_norm": 0.09770119491979927, + "learning_rate": 3.133889711733771e-06, + "loss": 0.0013, + "step": 6834 + }, + { + "epoch": 3.1096451319381258, + "grad_norm": 0.28421163712249997, + "learning_rate": 3.1325637699886023e-06, + "loss": 0.0066, + "step": 6835 + }, + { + "epoch": 3.1101000909918106, + "grad_norm": 0.23556076856213476, + "learning_rate": 3.1312379808418926e-06, + "loss": 0.0025, + "step": 6836 + }, + { + "epoch": 3.110555050045496, + "grad_norm": 0.10250499014945226, + "learning_rate": 3.1299123444019737e-06, + "loss": 0.0008, + "step": 6837 + }, + { + "epoch": 3.1110100090991812, + "grad_norm": 0.28035545990831157, + "learning_rate": 3.128586860777174e-06, + "loss": 0.0037, + "step": 6838 + }, + { + "epoch": 3.111464968152866, + "grad_norm": 0.3360707081314679, + "learning_rate": 3.127261530075804e-06, + "loss": 0.0042, + "step": 6839 + }, + { + "epoch": 3.1119199272065514, + "grad_norm": 0.23777162093326473, + "learning_rate": 3.125936352406166e-06, + "loss": 0.0033, + "step": 6840 + }, + { + "epoch": 3.1123748862602367, + "grad_norm": 0.2611160470051368, + "learning_rate": 3.1246113278765442e-06, + "loss": 0.0072, + "step": 6841 + }, + { + "epoch": 3.1128298453139216, + "grad_norm": 0.1334675178409681, + "learning_rate": 3.123286456595215e-06, + "loss": 0.002, + "step": 6842 + }, + { + "epoch": 3.113284804367607, + "grad_norm": 0.18728210685285573, + "learning_rate": 3.1219617386704433e-06, + "loss": 0.0057, + "step": 6843 + }, + { + "epoch": 3.113739763421292, + "grad_norm": 0.18067937966525086, + "learning_rate": 3.1206371742104756e-06, + "loss": 0.0015, + "step": 6844 + }, + { + "epoch": 3.114194722474977, + "grad_norm": 0.2530804121374889, + "learning_rate": 3.119312763323553e-06, + "loss": 0.0021, + "step": 6845 + }, + { + "epoch": 3.1146496815286624, + "grad_norm": 0.07786261321008235, + "learning_rate": 3.1179885061178965e-06, + "loss": 0.001, + "step": 6846 + }, + { + "epoch": 3.1151046405823477, + "grad_norm": 0.3167869446803168, + "learning_rate": 3.116664402701721e-06, + "loss": 0.0031, + "step": 6847 + }, + { + "epoch": 3.1155595996360326, + "grad_norm": 0.3034780237339912, + "learning_rate": 3.1153404531832252e-06, + "loss": 0.0037, + "step": 6848 + }, + { + "epoch": 3.116014558689718, + "grad_norm": 0.1430869012028794, + "learning_rate": 3.1140166576705955e-06, + "loss": 0.0015, + "step": 6849 + }, + { + "epoch": 3.116469517743403, + "grad_norm": 0.2955575243852622, + "learning_rate": 3.1126930162720093e-06, + "loss": 0.0063, + "step": 6850 + }, + { + "epoch": 3.116924476797088, + "grad_norm": 0.18123834513066317, + "learning_rate": 3.1113695290956257e-06, + "loss": 0.0013, + "step": 6851 + }, + { + "epoch": 3.1173794358507734, + "grad_norm": 0.18820326429090645, + "learning_rate": 3.1100461962495966e-06, + "loss": 0.0009, + "step": 6852 + }, + { + "epoch": 3.1178343949044587, + "grad_norm": 0.3286260862725942, + "learning_rate": 3.1087230178420557e-06, + "loss": 0.0054, + "step": 6853 + }, + { + "epoch": 3.1182893539581436, + "grad_norm": 0.41650101855170824, + "learning_rate": 3.10739999398113e-06, + "loss": 0.014, + "step": 6854 + }, + { + "epoch": 3.118744313011829, + "grad_norm": 0.23364049257521766, + "learning_rate": 3.1060771247749287e-06, + "loss": 0.0029, + "step": 6855 + }, + { + "epoch": 3.119199272065514, + "grad_norm": 0.20554830683417963, + "learning_rate": 3.1047544103315515e-06, + "loss": 0.0053, + "step": 6856 + }, + { + "epoch": 3.1196542311191995, + "grad_norm": 0.18823120029122975, + "learning_rate": 3.1034318507590867e-06, + "loss": 0.0023, + "step": 6857 + }, + { + "epoch": 3.1201091901728844, + "grad_norm": 0.43839133144778, + "learning_rate": 3.102109446165605e-06, + "loss": 0.0034, + "step": 6858 + }, + { + "epoch": 3.1205641492265697, + "grad_norm": 0.05775456441691884, + "learning_rate": 3.10078719665917e-06, + "loss": 0.0003, + "step": 6859 + }, + { + "epoch": 3.121019108280255, + "grad_norm": 0.47133052133767855, + "learning_rate": 3.0994651023478273e-06, + "loss": 0.0043, + "step": 6860 + }, + { + "epoch": 3.12147406733394, + "grad_norm": 0.2967469487959148, + "learning_rate": 3.0981431633396153e-06, + "loss": 0.005, + "step": 6861 + }, + { + "epoch": 3.121929026387625, + "grad_norm": 0.18682640074147638, + "learning_rate": 3.0968213797425543e-06, + "loss": 0.0034, + "step": 6862 + }, + { + "epoch": 3.1223839854413105, + "grad_norm": 0.060623855381300544, + "learning_rate": 3.0954997516646535e-06, + "loss": 0.0005, + "step": 6863 + }, + { + "epoch": 3.1228389444949953, + "grad_norm": 0.23986937011875514, + "learning_rate": 3.094178279213914e-06, + "loss": 0.0048, + "step": 6864 + }, + { + "epoch": 3.1232939035486806, + "grad_norm": 0.030752265465777497, + "learning_rate": 3.09285696249832e-06, + "loss": 0.0003, + "step": 6865 + }, + { + "epoch": 3.123748862602366, + "grad_norm": 0.1249131422777919, + "learning_rate": 3.091535801625841e-06, + "loss": 0.0009, + "step": 6866 + }, + { + "epoch": 3.124203821656051, + "grad_norm": 0.20426069213763887, + "learning_rate": 3.090214796704439e-06, + "loss": 0.0021, + "step": 6867 + }, + { + "epoch": 3.124658780709736, + "grad_norm": 0.06901177869871337, + "learning_rate": 3.0888939478420583e-06, + "loss": 0.0008, + "step": 6868 + }, + { + "epoch": 3.1251137397634214, + "grad_norm": 0.10847243942239054, + "learning_rate": 3.087573255146634e-06, + "loss": 0.0011, + "step": 6869 + }, + { + "epoch": 3.1255686988171063, + "grad_norm": 0.13784424514041746, + "learning_rate": 3.086252718726086e-06, + "loss": 0.0015, + "step": 6870 + }, + { + "epoch": 3.1260236578707916, + "grad_norm": 0.1826519258113166, + "learning_rate": 3.0849323386883236e-06, + "loss": 0.0021, + "step": 6871 + }, + { + "epoch": 3.126478616924477, + "grad_norm": 0.14043473789225805, + "learning_rate": 3.0836121151412436e-06, + "loss": 0.0019, + "step": 6872 + }, + { + "epoch": 3.126933575978162, + "grad_norm": 0.36756843729354177, + "learning_rate": 3.0822920481927255e-06, + "loss": 0.0058, + "step": 6873 + }, + { + "epoch": 3.127388535031847, + "grad_norm": 0.18237617466251585, + "learning_rate": 3.080972137950643e-06, + "loss": 0.0016, + "step": 6874 + }, + { + "epoch": 3.1278434940855324, + "grad_norm": 0.3914557887957653, + "learning_rate": 3.079652384522849e-06, + "loss": 0.007, + "step": 6875 + }, + { + "epoch": 3.1282984531392173, + "grad_norm": 0.2644660494945359, + "learning_rate": 3.0783327880171916e-06, + "loss": 0.0032, + "step": 6876 + }, + { + "epoch": 3.1287534121929026, + "grad_norm": 0.4032487225341823, + "learning_rate": 3.077013348541498e-06, + "loss": 0.015, + "step": 6877 + }, + { + "epoch": 3.129208371246588, + "grad_norm": 0.2312442593073337, + "learning_rate": 3.075694066203591e-06, + "loss": 0.0027, + "step": 6878 + }, + { + "epoch": 3.1296633303002728, + "grad_norm": 0.08588400873012622, + "learning_rate": 3.0743749411112754e-06, + "loss": 0.0013, + "step": 6879 + }, + { + "epoch": 3.130118289353958, + "grad_norm": 0.22002806254206217, + "learning_rate": 3.073055973372343e-06, + "loss": 0.0027, + "step": 6880 + }, + { + "epoch": 3.1305732484076434, + "grad_norm": 0.1417886830228247, + "learning_rate": 3.071737163094576e-06, + "loss": 0.0012, + "step": 6881 + }, + { + "epoch": 3.1310282074613287, + "grad_norm": 0.1689950083208543, + "learning_rate": 3.0704185103857383e-06, + "loss": 0.0026, + "step": 6882 + }, + { + "epoch": 3.1314831665150136, + "grad_norm": 0.04820569512335356, + "learning_rate": 3.0691000153535864e-06, + "loss": 0.0006, + "step": 6883 + }, + { + "epoch": 3.131938125568699, + "grad_norm": 0.3576441567792067, + "learning_rate": 3.0677816781058604e-06, + "loss": 0.0053, + "step": 6884 + }, + { + "epoch": 3.132393084622384, + "grad_norm": 0.19423967620935864, + "learning_rate": 3.0664634987502905e-06, + "loss": 0.0032, + "step": 6885 + }, + { + "epoch": 3.132848043676069, + "grad_norm": 0.27304912751554217, + "learning_rate": 3.0651454773945926e-06, + "loss": 0.0041, + "step": 6886 + }, + { + "epoch": 3.1333030027297544, + "grad_norm": 0.14069924471421918, + "learning_rate": 3.063827614146468e-06, + "loss": 0.0013, + "step": 6887 + }, + { + "epoch": 3.1337579617834397, + "grad_norm": 0.33853261769032045, + "learning_rate": 3.062509909113608e-06, + "loss": 0.0018, + "step": 6888 + }, + { + "epoch": 3.1342129208371245, + "grad_norm": 0.3300657984131354, + "learning_rate": 3.061192362403687e-06, + "loss": 0.0035, + "step": 6889 + }, + { + "epoch": 3.13466787989081, + "grad_norm": 0.22586443625227232, + "learning_rate": 3.0598749741243717e-06, + "loss": 0.0043, + "step": 6890 + }, + { + "epoch": 3.135122838944495, + "grad_norm": 0.3142204030631487, + "learning_rate": 3.05855774438331e-06, + "loss": 0.0097, + "step": 6891 + }, + { + "epoch": 3.13557779799818, + "grad_norm": 0.5347180070280461, + "learning_rate": 3.057240673288144e-06, + "loss": 0.0071, + "step": 6892 + }, + { + "epoch": 3.1360327570518653, + "grad_norm": 0.07979869757978177, + "learning_rate": 3.0559237609464963e-06, + "loss": 0.0007, + "step": 6893 + }, + { + "epoch": 3.1364877161055507, + "grad_norm": 0.34513907395525717, + "learning_rate": 3.0546070074659796e-06, + "loss": 0.0094, + "step": 6894 + }, + { + "epoch": 3.1369426751592355, + "grad_norm": 0.24669283060965713, + "learning_rate": 3.0532904129541928e-06, + "loss": 0.0054, + "step": 6895 + }, + { + "epoch": 3.137397634212921, + "grad_norm": 0.26367498137508605, + "learning_rate": 3.0519739775187235e-06, + "loss": 0.0066, + "step": 6896 + }, + { + "epoch": 3.137852593266606, + "grad_norm": 0.3499818267724872, + "learning_rate": 3.050657701267142e-06, + "loss": 0.0033, + "step": 6897 + }, + { + "epoch": 3.138307552320291, + "grad_norm": 0.23822308511616982, + "learning_rate": 3.0493415843070086e-06, + "loss": 0.0089, + "step": 6898 + }, + { + "epoch": 3.1387625113739763, + "grad_norm": 0.3506298325425386, + "learning_rate": 3.048025626745874e-06, + "loss": 0.0108, + "step": 6899 + }, + { + "epoch": 3.1392174704276616, + "grad_norm": 0.16713557199923024, + "learning_rate": 3.0467098286912696e-06, + "loss": 0.0018, + "step": 6900 + }, + { + "epoch": 3.1396724294813465, + "grad_norm": 0.24614508266707702, + "learning_rate": 3.045394190250718e-06, + "loss": 0.0067, + "step": 6901 + }, + { + "epoch": 3.140127388535032, + "grad_norm": 0.15444915730686565, + "learning_rate": 3.0440787115317243e-06, + "loss": 0.0021, + "step": 6902 + }, + { + "epoch": 3.140582347588717, + "grad_norm": 0.5231792062704987, + "learning_rate": 3.0427633926417876e-06, + "loss": 0.0074, + "step": 6903 + }, + { + "epoch": 3.141037306642402, + "grad_norm": 0.3562110756967007, + "learning_rate": 3.0414482336883855e-06, + "loss": 0.004, + "step": 6904 + }, + { + "epoch": 3.1414922656960873, + "grad_norm": 0.5750481940157537, + "learning_rate": 3.040133234778989e-06, + "loss": 0.0024, + "step": 6905 + }, + { + "epoch": 3.1419472247497726, + "grad_norm": 0.04883965726067432, + "learning_rate": 3.0388183960210554e-06, + "loss": 0.0005, + "step": 6906 + }, + { + "epoch": 3.1424021838034575, + "grad_norm": 0.23197639701457073, + "learning_rate": 3.0375037175220247e-06, + "loss": 0.005, + "step": 6907 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 0.4011252824886896, + "learning_rate": 3.0361891993893287e-06, + "loss": 0.0109, + "step": 6908 + }, + { + "epoch": 3.143312101910828, + "grad_norm": 0.16329878320363317, + "learning_rate": 3.0348748417303826e-06, + "loss": 0.0031, + "step": 6909 + }, + { + "epoch": 3.143767060964513, + "grad_norm": 0.4667068330320742, + "learning_rate": 3.03356064465259e-06, + "loss": 0.006, + "step": 6910 + }, + { + "epoch": 3.1442220200181983, + "grad_norm": 0.2126685344140864, + "learning_rate": 3.0322466082633405e-06, + "loss": 0.0025, + "step": 6911 + }, + { + "epoch": 3.1446769790718836, + "grad_norm": 0.2922093884797465, + "learning_rate": 3.030932732670011e-06, + "loss": 0.0079, + "step": 6912 + }, + { + "epoch": 3.145131938125569, + "grad_norm": 0.3383758073430269, + "learning_rate": 3.029619017979969e-06, + "loss": 0.0106, + "step": 6913 + }, + { + "epoch": 3.1455868971792538, + "grad_norm": 0.1872380842348733, + "learning_rate": 3.0283054643005605e-06, + "loss": 0.0017, + "step": 6914 + }, + { + "epoch": 3.146041856232939, + "grad_norm": 0.22900712924516234, + "learning_rate": 3.026992071739127e-06, + "loss": 0.0039, + "step": 6915 + }, + { + "epoch": 3.1464968152866244, + "grad_norm": 0.1062966484968155, + "learning_rate": 3.0256788404029903e-06, + "loss": 0.0012, + "step": 6916 + }, + { + "epoch": 3.1469517743403093, + "grad_norm": 0.11574748561169698, + "learning_rate": 3.024365770399464e-06, + "loss": 0.0021, + "step": 6917 + }, + { + "epoch": 3.1474067333939946, + "grad_norm": 0.11166745590869884, + "learning_rate": 3.0230528618358435e-06, + "loss": 0.0009, + "step": 6918 + }, + { + "epoch": 3.14786169244768, + "grad_norm": 0.3276043045734579, + "learning_rate": 3.021740114819415e-06, + "loss": 0.0049, + "step": 6919 + }, + { + "epoch": 3.1483166515013647, + "grad_norm": 0.16322970016947816, + "learning_rate": 3.0204275294574526e-06, + "loss": 0.0013, + "step": 6920 + }, + { + "epoch": 3.14877161055505, + "grad_norm": 0.2662388850182962, + "learning_rate": 3.019115105857211e-06, + "loss": 0.0038, + "step": 6921 + }, + { + "epoch": 3.1492265696087354, + "grad_norm": 0.23585195831489067, + "learning_rate": 3.017802844125939e-06, + "loss": 0.0024, + "step": 6922 + }, + { + "epoch": 3.1496815286624202, + "grad_norm": 0.24780754269062924, + "learning_rate": 3.016490744370866e-06, + "loss": 0.0042, + "step": 6923 + }, + { + "epoch": 3.1501364877161055, + "grad_norm": 0.18098048291154423, + "learning_rate": 3.0151788066992125e-06, + "loss": 0.0051, + "step": 6924 + }, + { + "epoch": 3.150591446769791, + "grad_norm": 0.13069298547750907, + "learning_rate": 3.013867031218183e-06, + "loss": 0.0023, + "step": 6925 + }, + { + "epoch": 3.1510464058234757, + "grad_norm": 0.30992928731917874, + "learning_rate": 3.0125554180349694e-06, + "loss": 0.0052, + "step": 6926 + }, + { + "epoch": 3.151501364877161, + "grad_norm": 0.2334764034735816, + "learning_rate": 3.0112439672567526e-06, + "loss": 0.0052, + "step": 6927 + }, + { + "epoch": 3.1519563239308463, + "grad_norm": 0.4362554464038189, + "learning_rate": 3.009932678990698e-06, + "loss": 0.0108, + "step": 6928 + }, + { + "epoch": 3.152411282984531, + "grad_norm": 0.26549600857142336, + "learning_rate": 3.008621553343959e-06, + "loss": 0.0038, + "step": 6929 + }, + { + "epoch": 3.1528662420382165, + "grad_norm": 0.23576495783450618, + "learning_rate": 3.007310590423672e-06, + "loss": 0.0024, + "step": 6930 + }, + { + "epoch": 3.153321201091902, + "grad_norm": 0.18064147428010743, + "learning_rate": 3.0059997903369658e-06, + "loss": 0.0039, + "step": 6931 + }, + { + "epoch": 3.1537761601455867, + "grad_norm": 0.2725154124219658, + "learning_rate": 3.004689153190952e-06, + "loss": 0.006, + "step": 6932 + }, + { + "epoch": 3.154231119199272, + "grad_norm": 0.06262260074187467, + "learning_rate": 3.0033786790927266e-06, + "loss": 0.0006, + "step": 6933 + }, + { + "epoch": 3.1546860782529573, + "grad_norm": 0.1521036144057461, + "learning_rate": 3.0020683681493824e-06, + "loss": 0.0018, + "step": 6934 + }, + { + "epoch": 3.1551410373066426, + "grad_norm": 0.24234515722418318, + "learning_rate": 3.000758220467988e-06, + "loss": 0.0015, + "step": 6935 + }, + { + "epoch": 3.1555959963603275, + "grad_norm": 0.09706574994749276, + "learning_rate": 2.9994482361556026e-06, + "loss": 0.0011, + "step": 6936 + }, + { + "epoch": 3.156050955414013, + "grad_norm": 0.25646361450328253, + "learning_rate": 2.9981384153192737e-06, + "loss": 0.0057, + "step": 6937 + }, + { + "epoch": 3.156505914467698, + "grad_norm": 0.09333352082841011, + "learning_rate": 2.9968287580660328e-06, + "loss": 0.0011, + "step": 6938 + }, + { + "epoch": 3.156960873521383, + "grad_norm": 0.36507353945184257, + "learning_rate": 2.9955192645028995e-06, + "loss": 0.0069, + "step": 6939 + }, + { + "epoch": 3.1574158325750683, + "grad_norm": 0.3736052944418765, + "learning_rate": 2.994209934736879e-06, + "loss": 0.0077, + "step": 6940 + }, + { + "epoch": 3.1578707916287536, + "grad_norm": 0.19118053832227974, + "learning_rate": 2.9929007688749647e-06, + "loss": 0.0045, + "step": 6941 + }, + { + "epoch": 3.1583257506824385, + "grad_norm": 0.028668478996688996, + "learning_rate": 2.991591767024137e-06, + "loss": 0.0003, + "step": 6942 + }, + { + "epoch": 3.158780709736124, + "grad_norm": 0.5005650245540699, + "learning_rate": 2.990282929291359e-06, + "loss": 0.0041, + "step": 6943 + }, + { + "epoch": 3.159235668789809, + "grad_norm": 0.21948524129154579, + "learning_rate": 2.9889742557835855e-06, + "loss": 0.0062, + "step": 6944 + }, + { + "epoch": 3.159690627843494, + "grad_norm": 0.1688003229431753, + "learning_rate": 2.9876657466077523e-06, + "loss": 0.0016, + "step": 6945 + }, + { + "epoch": 3.1601455868971793, + "grad_norm": 0.3526875064917373, + "learning_rate": 2.9863574018707887e-06, + "loss": 0.0072, + "step": 6946 + }, + { + "epoch": 3.1606005459508646, + "grad_norm": 0.1432575929253344, + "learning_rate": 2.9850492216796016e-06, + "loss": 0.0025, + "step": 6947 + }, + { + "epoch": 3.1610555050045495, + "grad_norm": 0.2527655887021309, + "learning_rate": 2.983741206141094e-06, + "loss": 0.0027, + "step": 6948 + }, + { + "epoch": 3.1615104640582348, + "grad_norm": 0.1766949098242639, + "learning_rate": 2.9824333553621515e-06, + "loss": 0.0049, + "step": 6949 + }, + { + "epoch": 3.16196542311192, + "grad_norm": 0.17726795076389612, + "learning_rate": 2.9811256694496428e-06, + "loss": 0.0016, + "step": 6950 + }, + { + "epoch": 3.162420382165605, + "grad_norm": 0.1951559691219317, + "learning_rate": 2.979818148510427e-06, + "loss": 0.0026, + "step": 6951 + }, + { + "epoch": 3.1628753412192903, + "grad_norm": 0.3866619841764015, + "learning_rate": 2.978510792651349e-06, + "loss": 0.0087, + "step": 6952 + }, + { + "epoch": 3.1633303002729756, + "grad_norm": 0.1848239122137479, + "learning_rate": 2.9772036019792415e-06, + "loss": 0.0011, + "step": 6953 + }, + { + "epoch": 3.1637852593266604, + "grad_norm": 0.39702477405697256, + "learning_rate": 2.9758965766009187e-06, + "loss": 0.0055, + "step": 6954 + }, + { + "epoch": 3.1642402183803457, + "grad_norm": 0.21157337658248726, + "learning_rate": 2.974589716623187e-06, + "loss": 0.0059, + "step": 6955 + }, + { + "epoch": 3.164695177434031, + "grad_norm": 0.09650726779714776, + "learning_rate": 2.9732830221528386e-06, + "loss": 0.001, + "step": 6956 + }, + { + "epoch": 3.165150136487716, + "grad_norm": 0.1483336983921621, + "learning_rate": 2.9719764932966477e-06, + "loss": 0.0012, + "step": 6957 + }, + { + "epoch": 3.1656050955414012, + "grad_norm": 0.514786651956486, + "learning_rate": 2.9706701301613806e-06, + "loss": 0.0151, + "step": 6958 + }, + { + "epoch": 3.1660600545950865, + "grad_norm": 0.3691817509596384, + "learning_rate": 2.969363932853785e-06, + "loss": 0.0049, + "step": 6959 + }, + { + "epoch": 3.1665150136487714, + "grad_norm": 0.2928020644138154, + "learning_rate": 2.968057901480599e-06, + "loss": 0.0093, + "step": 6960 + }, + { + "epoch": 3.1669699727024567, + "grad_norm": 0.10110473195819367, + "learning_rate": 2.9667520361485435e-06, + "loss": 0.001, + "step": 6961 + }, + { + "epoch": 3.167424931756142, + "grad_norm": 0.21108152820646317, + "learning_rate": 2.9654463369643305e-06, + "loss": 0.0032, + "step": 6962 + }, + { + "epoch": 3.167879890809827, + "grad_norm": 0.20160032923496765, + "learning_rate": 2.9641408040346563e-06, + "loss": 0.0022, + "step": 6963 + }, + { + "epoch": 3.168334849863512, + "grad_norm": 0.1257116402451997, + "learning_rate": 2.9628354374662005e-06, + "loss": 0.0017, + "step": 6964 + }, + { + "epoch": 3.1687898089171975, + "grad_norm": 0.23255984994435436, + "learning_rate": 2.961530237365634e-06, + "loss": 0.0032, + "step": 6965 + }, + { + "epoch": 3.1692447679708824, + "grad_norm": 0.16078873437351757, + "learning_rate": 2.9602252038396097e-06, + "loss": 0.0031, + "step": 6966 + }, + { + "epoch": 3.1696997270245677, + "grad_norm": 0.11493960089780716, + "learning_rate": 2.958920336994772e-06, + "loss": 0.0007, + "step": 6967 + }, + { + "epoch": 3.170154686078253, + "grad_norm": 0.3582624200962682, + "learning_rate": 2.957615636937744e-06, + "loss": 0.0057, + "step": 6968 + }, + { + "epoch": 3.1706096451319383, + "grad_norm": 0.36328299559816346, + "learning_rate": 2.9563111037751437e-06, + "loss": 0.0114, + "step": 6969 + }, + { + "epoch": 3.171064604185623, + "grad_norm": 0.4499217887781241, + "learning_rate": 2.9550067376135723e-06, + "loss": 0.0071, + "step": 6970 + }, + { + "epoch": 3.1715195632393085, + "grad_norm": 0.3041462875162963, + "learning_rate": 2.953702538559615e-06, + "loss": 0.0048, + "step": 6971 + }, + { + "epoch": 3.171974522292994, + "grad_norm": 0.282479872234923, + "learning_rate": 2.952398506719844e-06, + "loss": 0.0017, + "step": 6972 + }, + { + "epoch": 3.1724294813466787, + "grad_norm": 0.1934463390572384, + "learning_rate": 2.951094642200821e-06, + "loss": 0.004, + "step": 6973 + }, + { + "epoch": 3.172884440400364, + "grad_norm": 0.19199945438383587, + "learning_rate": 2.9497909451090913e-06, + "loss": 0.0034, + "step": 6974 + }, + { + "epoch": 3.1733393994540493, + "grad_norm": 0.265309250667967, + "learning_rate": 2.948487415551185e-06, + "loss": 0.0056, + "step": 6975 + }, + { + "epoch": 3.173794358507734, + "grad_norm": 0.2828975870477447, + "learning_rate": 2.947184053633625e-06, + "loss": 0.004, + "step": 6976 + }, + { + "epoch": 3.1742493175614195, + "grad_norm": 0.2673498425994623, + "learning_rate": 2.9458808594629117e-06, + "loss": 0.0046, + "step": 6977 + }, + { + "epoch": 3.174704276615105, + "grad_norm": 0.17801218073857059, + "learning_rate": 2.94457783314554e-06, + "loss": 0.0032, + "step": 6978 + }, + { + "epoch": 3.1751592356687897, + "grad_norm": 0.4511777180967237, + "learning_rate": 2.9432749747879845e-06, + "loss": 0.0096, + "step": 6979 + }, + { + "epoch": 3.175614194722475, + "grad_norm": 0.3218999149293914, + "learning_rate": 2.9419722844967113e-06, + "loss": 0.0103, + "step": 6980 + }, + { + "epoch": 3.1760691537761603, + "grad_norm": 0.13031810988975212, + "learning_rate": 2.940669762378168e-06, + "loss": 0.0017, + "step": 6981 + }, + { + "epoch": 3.176524112829845, + "grad_norm": 0.16634347564999383, + "learning_rate": 2.93936740853879e-06, + "loss": 0.003, + "step": 6982 + }, + { + "epoch": 3.1769790718835305, + "grad_norm": 0.35073646452000296, + "learning_rate": 2.9380652230850036e-06, + "loss": 0.0055, + "step": 6983 + }, + { + "epoch": 3.1774340309372158, + "grad_norm": 0.13779006286259865, + "learning_rate": 2.9367632061232155e-06, + "loss": 0.0017, + "step": 6984 + }, + { + "epoch": 3.1778889899909006, + "grad_norm": 0.2066628657399087, + "learning_rate": 2.935461357759821e-06, + "loss": 0.0032, + "step": 6985 + }, + { + "epoch": 3.178343949044586, + "grad_norm": 0.19396580606858738, + "learning_rate": 2.9341596781012004e-06, + "loss": 0.0018, + "step": 6986 + }, + { + "epoch": 3.1787989080982713, + "grad_norm": 0.04884125106200917, + "learning_rate": 2.9328581672537227e-06, + "loss": 0.0005, + "step": 6987 + }, + { + "epoch": 3.179253867151956, + "grad_norm": 0.22919642298421322, + "learning_rate": 2.9315568253237394e-06, + "loss": 0.0018, + "step": 6988 + }, + { + "epoch": 3.1797088262056414, + "grad_norm": 0.13864618583544894, + "learning_rate": 2.930255652417591e-06, + "loss": 0.0012, + "step": 6989 + }, + { + "epoch": 3.1801637852593267, + "grad_norm": 0.3397982807660512, + "learning_rate": 2.9289546486416042e-06, + "loss": 0.0063, + "step": 6990 + }, + { + "epoch": 3.180618744313012, + "grad_norm": 0.27321968304937355, + "learning_rate": 2.9276538141020907e-06, + "loss": 0.0027, + "step": 6991 + }, + { + "epoch": 3.181073703366697, + "grad_norm": 0.27382106641037923, + "learning_rate": 2.92635314890535e-06, + "loss": 0.0077, + "step": 6992 + }, + { + "epoch": 3.1815286624203822, + "grad_norm": 0.1890246170642665, + "learning_rate": 2.925052653157664e-06, + "loss": 0.0046, + "step": 6993 + }, + { + "epoch": 3.1819836214740675, + "grad_norm": 0.17113397948676443, + "learning_rate": 2.923752326965306e-06, + "loss": 0.0038, + "step": 6994 + }, + { + "epoch": 3.1824385805277524, + "grad_norm": 0.46890350563403316, + "learning_rate": 2.922452170434531e-06, + "loss": 0.0072, + "step": 6995 + }, + { + "epoch": 3.1828935395814377, + "grad_norm": 0.18836950500834493, + "learning_rate": 2.9211521836715806e-06, + "loss": 0.0022, + "step": 6996 + }, + { + "epoch": 3.183348498635123, + "grad_norm": 0.058613057769979555, + "learning_rate": 2.9198523667826885e-06, + "loss": 0.0005, + "step": 6997 + }, + { + "epoch": 3.183803457688808, + "grad_norm": 0.14162693351389868, + "learning_rate": 2.9185527198740673e-06, + "loss": 0.0015, + "step": 6998 + }, + { + "epoch": 3.184258416742493, + "grad_norm": 0.2611812239293852, + "learning_rate": 2.917253243051915e-06, + "loss": 0.0055, + "step": 6999 + }, + { + "epoch": 3.1847133757961785, + "grad_norm": 0.21605526072712278, + "learning_rate": 2.9159539364224254e-06, + "loss": 0.004, + "step": 7000 + }, + { + "epoch": 3.1851683348498634, + "grad_norm": 0.19439346188644074, + "learning_rate": 2.914654800091768e-06, + "loss": 0.0012, + "step": 7001 + }, + { + "epoch": 3.1856232939035487, + "grad_norm": 0.33966533894608314, + "learning_rate": 2.9133558341661027e-06, + "loss": 0.0086, + "step": 7002 + }, + { + "epoch": 3.186078252957234, + "grad_norm": 0.3850582324069532, + "learning_rate": 2.912057038751574e-06, + "loss": 0.0056, + "step": 7003 + }, + { + "epoch": 3.186533212010919, + "grad_norm": 0.22403964117173933, + "learning_rate": 2.9107584139543147e-06, + "loss": 0.0053, + "step": 7004 + }, + { + "epoch": 3.186988171064604, + "grad_norm": 0.26608613177073764, + "learning_rate": 2.909459959880445e-06, + "loss": 0.0054, + "step": 7005 + }, + { + "epoch": 3.1874431301182895, + "grad_norm": 0.3506465274404851, + "learning_rate": 2.9081616766360665e-06, + "loss": 0.0049, + "step": 7006 + }, + { + "epoch": 3.1878980891719744, + "grad_norm": 0.0692620909517553, + "learning_rate": 2.906863564327269e-06, + "loss": 0.0007, + "step": 7007 + }, + { + "epoch": 3.1883530482256597, + "grad_norm": 0.26402863797053766, + "learning_rate": 2.9055656230601293e-06, + "loss": 0.0036, + "step": 7008 + }, + { + "epoch": 3.188808007279345, + "grad_norm": 0.4588868642887718, + "learning_rate": 2.904267852940705e-06, + "loss": 0.0071, + "step": 7009 + }, + { + "epoch": 3.18926296633303, + "grad_norm": 0.3152284566115162, + "learning_rate": 2.902970254075049e-06, + "loss": 0.0037, + "step": 7010 + }, + { + "epoch": 3.189717925386715, + "grad_norm": 0.4337128851726491, + "learning_rate": 2.901672826569195e-06, + "loss": 0.0132, + "step": 7011 + }, + { + "epoch": 3.1901728844404005, + "grad_norm": 0.5011694619624966, + "learning_rate": 2.900375570529162e-06, + "loss": 0.0069, + "step": 7012 + }, + { + "epoch": 3.1906278434940853, + "grad_norm": 0.3373852818093339, + "learning_rate": 2.8990784860609555e-06, + "loss": 0.0126, + "step": 7013 + }, + { + "epoch": 3.1910828025477707, + "grad_norm": 0.2903091018628997, + "learning_rate": 2.897781573270565e-06, + "loss": 0.0064, + "step": 7014 + }, + { + "epoch": 3.191537761601456, + "grad_norm": 0.3513440393628483, + "learning_rate": 2.896484832263973e-06, + "loss": 0.0037, + "step": 7015 + }, + { + "epoch": 3.191992720655141, + "grad_norm": 0.11824010187024156, + "learning_rate": 2.895188263147141e-06, + "loss": 0.0013, + "step": 7016 + }, + { + "epoch": 3.192447679708826, + "grad_norm": 0.6185486354105568, + "learning_rate": 2.8938918660260173e-06, + "loss": 0.0107, + "step": 7017 + }, + { + "epoch": 3.1929026387625115, + "grad_norm": 0.4431826813186897, + "learning_rate": 2.8925956410065414e-06, + "loss": 0.0072, + "step": 7018 + }, + { + "epoch": 3.1933575978161963, + "grad_norm": 0.19884086207376586, + "learning_rate": 2.8912995881946303e-06, + "loss": 0.0039, + "step": 7019 + }, + { + "epoch": 3.1938125568698816, + "grad_norm": 0.19480040542304444, + "learning_rate": 2.890003707696196e-06, + "loss": 0.0033, + "step": 7020 + }, + { + "epoch": 3.194267515923567, + "grad_norm": 0.1703072898911511, + "learning_rate": 2.88870799961713e-06, + "loss": 0.0026, + "step": 7021 + }, + { + "epoch": 3.194722474977252, + "grad_norm": 0.5755124786510741, + "learning_rate": 2.8874124640633115e-06, + "loss": 0.0082, + "step": 7022 + }, + { + "epoch": 3.195177434030937, + "grad_norm": 0.4408051162755984, + "learning_rate": 2.8861171011406052e-06, + "loss": 0.0161, + "step": 7023 + }, + { + "epoch": 3.1956323930846224, + "grad_norm": 0.22693617648642977, + "learning_rate": 2.8848219109548623e-06, + "loss": 0.0029, + "step": 7024 + }, + { + "epoch": 3.1960873521383077, + "grad_norm": 0.560357126303314, + "learning_rate": 2.8835268936119233e-06, + "loss": 0.0032, + "step": 7025 + }, + { + "epoch": 3.1965423111919926, + "grad_norm": 0.18668153440199636, + "learning_rate": 2.882232049217608e-06, + "loss": 0.0028, + "step": 7026 + }, + { + "epoch": 3.196997270245678, + "grad_norm": 0.27298531267990983, + "learning_rate": 2.8809373778777262e-06, + "loss": 0.0068, + "step": 7027 + }, + { + "epoch": 3.1974522292993632, + "grad_norm": 0.29394006000889444, + "learning_rate": 2.87964287969807e-06, + "loss": 0.0026, + "step": 7028 + }, + { + "epoch": 3.197907188353048, + "grad_norm": 0.13356383356886484, + "learning_rate": 2.8783485547844247e-06, + "loss": 0.0026, + "step": 7029 + }, + { + "epoch": 3.1983621474067334, + "grad_norm": 0.111298059352568, + "learning_rate": 2.877054403242554e-06, + "loss": 0.0008, + "step": 7030 + }, + { + "epoch": 3.1988171064604187, + "grad_norm": 0.14570272870682158, + "learning_rate": 2.8757604251782077e-06, + "loss": 0.0015, + "step": 7031 + }, + { + "epoch": 3.1992720655141036, + "grad_norm": 0.18254357335485907, + "learning_rate": 2.8744666206971295e-06, + "loss": 0.0051, + "step": 7032 + }, + { + "epoch": 3.199727024567789, + "grad_norm": 0.19640566322377645, + "learning_rate": 2.8731729899050374e-06, + "loss": 0.0066, + "step": 7033 + }, + { + "epoch": 3.200181983621474, + "grad_norm": 0.17340254397041094, + "learning_rate": 2.8718795329076465e-06, + "loss": 0.0019, + "step": 7034 + }, + { + "epoch": 3.200636942675159, + "grad_norm": 0.3315625381531741, + "learning_rate": 2.8705862498106496e-06, + "loss": 0.008, + "step": 7035 + }, + { + "epoch": 3.2010919017288444, + "grad_norm": 0.1320339384241569, + "learning_rate": 2.8692931407197276e-06, + "loss": 0.0012, + "step": 7036 + }, + { + "epoch": 3.2015468607825297, + "grad_norm": 0.07674289430099869, + "learning_rate": 2.8680002057405466e-06, + "loss": 0.0007, + "step": 7037 + }, + { + "epoch": 3.2020018198362146, + "grad_norm": 0.24346719572021686, + "learning_rate": 2.86670744497876e-06, + "loss": 0.0035, + "step": 7038 + }, + { + "epoch": 3.2024567788899, + "grad_norm": 0.17979439699527758, + "learning_rate": 2.86541485854001e-06, + "loss": 0.0029, + "step": 7039 + }, + { + "epoch": 3.202911737943585, + "grad_norm": 0.15306681712630646, + "learning_rate": 2.864122446529918e-06, + "loss": 0.0025, + "step": 7040 + }, + { + "epoch": 3.20336669699727, + "grad_norm": 0.16833353442459245, + "learning_rate": 2.8628302090540938e-06, + "loss": 0.0033, + "step": 7041 + }, + { + "epoch": 3.2038216560509554, + "grad_norm": 0.5090333769918682, + "learning_rate": 2.861538146218133e-06, + "loss": 0.0115, + "step": 7042 + }, + { + "epoch": 3.2042766151046407, + "grad_norm": 0.058486268020582495, + "learning_rate": 2.8602462581276166e-06, + "loss": 0.0006, + "step": 7043 + }, + { + "epoch": 3.2047315741583255, + "grad_norm": 0.2321510470825738, + "learning_rate": 2.858954544888114e-06, + "loss": 0.0064, + "step": 7044 + }, + { + "epoch": 3.205186533212011, + "grad_norm": 0.28357815960626026, + "learning_rate": 2.857663006605176e-06, + "loss": 0.0069, + "step": 7045 + }, + { + "epoch": 3.205641492265696, + "grad_norm": 0.3739588837097919, + "learning_rate": 2.8563716433843438e-06, + "loss": 0.0123, + "step": 7046 + }, + { + "epoch": 3.2060964513193815, + "grad_norm": 0.1739856858299897, + "learning_rate": 2.8550804553311407e-06, + "loss": 0.0026, + "step": 7047 + }, + { + "epoch": 3.2065514103730663, + "grad_norm": 0.3729315402442275, + "learning_rate": 2.8537894425510743e-06, + "loss": 0.0094, + "step": 7048 + }, + { + "epoch": 3.2070063694267517, + "grad_norm": 0.19879084739198827, + "learning_rate": 2.8524986051496438e-06, + "loss": 0.0054, + "step": 7049 + }, + { + "epoch": 3.207461328480437, + "grad_norm": 0.2507443550659683, + "learning_rate": 2.85120794323233e-06, + "loss": 0.0067, + "step": 7050 + }, + { + "epoch": 3.207916287534122, + "grad_norm": 0.2034723154478191, + "learning_rate": 2.8499174569045997e-06, + "loss": 0.0054, + "step": 7051 + }, + { + "epoch": 3.208371246587807, + "grad_norm": 0.19812395661781815, + "learning_rate": 2.8486271462719024e-06, + "loss": 0.0039, + "step": 7052 + }, + { + "epoch": 3.2088262056414925, + "grad_norm": 0.12151231660747276, + "learning_rate": 2.847337011439679e-06, + "loss": 0.0016, + "step": 7053 + }, + { + "epoch": 3.2092811646951773, + "grad_norm": 0.06625285711406216, + "learning_rate": 2.8460470525133565e-06, + "loss": 0.0009, + "step": 7054 + }, + { + "epoch": 3.2097361237488626, + "grad_norm": 0.2222268868967166, + "learning_rate": 2.8447572695983413e-06, + "loss": 0.0027, + "step": 7055 + }, + { + "epoch": 3.210191082802548, + "grad_norm": 0.22375109937504972, + "learning_rate": 2.843467662800029e-06, + "loss": 0.004, + "step": 7056 + }, + { + "epoch": 3.210646041856233, + "grad_norm": 0.24148247330602277, + "learning_rate": 2.8421782322237983e-06, + "loss": 0.0041, + "step": 7057 + }, + { + "epoch": 3.211101000909918, + "grad_norm": 0.4036283257293504, + "learning_rate": 2.8408889779750204e-06, + "loss": 0.0079, + "step": 7058 + }, + { + "epoch": 3.2115559599636034, + "grad_norm": 0.16012253927637513, + "learning_rate": 2.839599900159042e-06, + "loss": 0.0031, + "step": 7059 + }, + { + "epoch": 3.2120109190172883, + "grad_norm": 0.19575698891162982, + "learning_rate": 2.838310998881206e-06, + "loss": 0.004, + "step": 7060 + }, + { + "epoch": 3.2124658780709736, + "grad_norm": 0.23238409751911412, + "learning_rate": 2.8370222742468324e-06, + "loss": 0.0028, + "step": 7061 + }, + { + "epoch": 3.212920837124659, + "grad_norm": 1.1799028822947624, + "learning_rate": 2.8357337263612294e-06, + "loss": 0.0079, + "step": 7062 + }, + { + "epoch": 3.213375796178344, + "grad_norm": 0.11704122077802191, + "learning_rate": 2.8344453553296942e-06, + "loss": 0.0009, + "step": 7063 + }, + { + "epoch": 3.213830755232029, + "grad_norm": 0.25464919161887095, + "learning_rate": 2.833157161257505e-06, + "loss": 0.0039, + "step": 7064 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 0.1701445119296061, + "learning_rate": 2.8318691442499275e-06, + "loss": 0.0015, + "step": 7065 + }, + { + "epoch": 3.2147406733393993, + "grad_norm": 0.022517491579665552, + "learning_rate": 2.83058130441221e-06, + "loss": 0.0002, + "step": 7066 + }, + { + "epoch": 3.2151956323930846, + "grad_norm": 0.2986016634993806, + "learning_rate": 2.8292936418495913e-06, + "loss": 0.0025, + "step": 7067 + }, + { + "epoch": 3.21565059144677, + "grad_norm": 0.25424537343014675, + "learning_rate": 2.8280061566672957e-06, + "loss": 0.003, + "step": 7068 + }, + { + "epoch": 3.2161055505004548, + "grad_norm": 0.21837940611362922, + "learning_rate": 2.8267188489705275e-06, + "loss": 0.0022, + "step": 7069 + }, + { + "epoch": 3.21656050955414, + "grad_norm": 0.1724310669243258, + "learning_rate": 2.825431718864482e-06, + "loss": 0.0019, + "step": 7070 + }, + { + "epoch": 3.2170154686078254, + "grad_norm": 0.3434573885395083, + "learning_rate": 2.824144766454333e-06, + "loss": 0.0074, + "step": 7071 + }, + { + "epoch": 3.2174704276615103, + "grad_norm": 0.17924963646789135, + "learning_rate": 2.82285799184525e-06, + "loss": 0.0038, + "step": 7072 + }, + { + "epoch": 3.2179253867151956, + "grad_norm": 0.22808845598214292, + "learning_rate": 2.8215713951423772e-06, + "loss": 0.0049, + "step": 7073 + }, + { + "epoch": 3.218380345768881, + "grad_norm": 0.16599836575910004, + "learning_rate": 2.8202849764508554e-06, + "loss": 0.0022, + "step": 7074 + }, + { + "epoch": 3.2188353048225657, + "grad_norm": 0.3223551171330232, + "learning_rate": 2.8189987358758018e-06, + "loss": 0.0124, + "step": 7075 + }, + { + "epoch": 3.219290263876251, + "grad_norm": 0.12228389008917982, + "learning_rate": 2.8177126735223193e-06, + "loss": 0.0011, + "step": 7076 + }, + { + "epoch": 3.2197452229299364, + "grad_norm": 0.20551910180727626, + "learning_rate": 2.8164267894955045e-06, + "loss": 0.0038, + "step": 7077 + }, + { + "epoch": 3.2202001819836217, + "grad_norm": 0.15038531484880807, + "learning_rate": 2.8151410839004325e-06, + "loss": 0.0036, + "step": 7078 + }, + { + "epoch": 3.2206551410373065, + "grad_norm": 0.4052783988873605, + "learning_rate": 2.8138555568421625e-06, + "loss": 0.0071, + "step": 7079 + }, + { + "epoch": 3.221110100090992, + "grad_norm": 0.16447661337097008, + "learning_rate": 2.8125702084257432e-06, + "loss": 0.0019, + "step": 7080 + }, + { + "epoch": 3.221565059144677, + "grad_norm": 0.1262644516927067, + "learning_rate": 2.8112850387562097e-06, + "loss": 0.0008, + "step": 7081 + }, + { + "epoch": 3.222020018198362, + "grad_norm": 0.11041599551115908, + "learning_rate": 2.810000047938577e-06, + "loss": 0.0009, + "step": 7082 + }, + { + "epoch": 3.2224749772520473, + "grad_norm": 0.1798952670690953, + "learning_rate": 2.8087152360778513e-06, + "loss": 0.0021, + "step": 7083 + }, + { + "epoch": 3.2229299363057327, + "grad_norm": 0.060136204294505696, + "learning_rate": 2.807430603279022e-06, + "loss": 0.0005, + "step": 7084 + }, + { + "epoch": 3.2233848953594175, + "grad_norm": 0.13404582005182145, + "learning_rate": 2.806146149647062e-06, + "loss": 0.0012, + "step": 7085 + }, + { + "epoch": 3.223839854413103, + "grad_norm": 0.43716814240493523, + "learning_rate": 2.804861875286929e-06, + "loss": 0.0083, + "step": 7086 + }, + { + "epoch": 3.224294813466788, + "grad_norm": 0.3108530374716742, + "learning_rate": 2.8035777803035703e-06, + "loss": 0.0058, + "step": 7087 + }, + { + "epoch": 3.224749772520473, + "grad_norm": 0.2317295713780408, + "learning_rate": 2.8022938648019187e-06, + "loss": 0.002, + "step": 7088 + }, + { + "epoch": 3.2252047315741583, + "grad_norm": 0.11282755114913963, + "learning_rate": 2.801010128886888e-06, + "loss": 0.002, + "step": 7089 + }, + { + "epoch": 3.2256596906278436, + "grad_norm": 0.2165071297806928, + "learning_rate": 2.7997265726633783e-06, + "loss": 0.003, + "step": 7090 + }, + { + "epoch": 3.2261146496815285, + "grad_norm": 0.020343813297191718, + "learning_rate": 2.7984431962362758e-06, + "loss": 0.0002, + "step": 7091 + }, + { + "epoch": 3.226569608735214, + "grad_norm": 0.06077218066603958, + "learning_rate": 2.797159999710454e-06, + "loss": 0.0006, + "step": 7092 + }, + { + "epoch": 3.227024567788899, + "grad_norm": 0.12333300168926505, + "learning_rate": 2.7958769831907694e-06, + "loss": 0.0016, + "step": 7093 + }, + { + "epoch": 3.227479526842584, + "grad_norm": 0.5918758359390299, + "learning_rate": 2.7945941467820626e-06, + "loss": 0.0147, + "step": 7094 + }, + { + "epoch": 3.2279344858962693, + "grad_norm": 0.20177534987550425, + "learning_rate": 2.793311490589164e-06, + "loss": 0.002, + "step": 7095 + }, + { + "epoch": 3.2283894449499546, + "grad_norm": 0.14393835941534916, + "learning_rate": 2.792029014716883e-06, + "loss": 0.0017, + "step": 7096 + }, + { + "epoch": 3.2288444040036395, + "grad_norm": 0.14805804298902528, + "learning_rate": 2.7907467192700222e-06, + "loss": 0.0014, + "step": 7097 + }, + { + "epoch": 3.229299363057325, + "grad_norm": 0.2160902593340542, + "learning_rate": 2.7894646043533623e-06, + "loss": 0.0097, + "step": 7098 + }, + { + "epoch": 3.22975432211101, + "grad_norm": 0.33562528419967586, + "learning_rate": 2.7881826700716724e-06, + "loss": 0.0051, + "step": 7099 + }, + { + "epoch": 3.2302092811646954, + "grad_norm": 0.09317744494325432, + "learning_rate": 2.7869009165297046e-06, + "loss": 0.0009, + "step": 7100 + }, + { + "epoch": 3.2306642402183803, + "grad_norm": 0.1849985981344789, + "learning_rate": 2.785619343832199e-06, + "loss": 0.0023, + "step": 7101 + }, + { + "epoch": 3.2311191992720656, + "grad_norm": 0.04485944164390735, + "learning_rate": 2.784337952083883e-06, + "loss": 0.0005, + "step": 7102 + }, + { + "epoch": 3.231574158325751, + "grad_norm": 0.24686562422397823, + "learning_rate": 2.783056741389464e-06, + "loss": 0.0023, + "step": 7103 + }, + { + "epoch": 3.2320291173794358, + "grad_norm": 0.2815808173089651, + "learning_rate": 2.7817757118536354e-06, + "loss": 0.0039, + "step": 7104 + }, + { + "epoch": 3.232484076433121, + "grad_norm": 0.3456378578190567, + "learning_rate": 2.780494863581077e-06, + "loss": 0.0058, + "step": 7105 + }, + { + "epoch": 3.2329390354868064, + "grad_norm": 0.21264962376783553, + "learning_rate": 2.779214196676457e-06, + "loss": 0.002, + "step": 7106 + }, + { + "epoch": 3.2333939945404913, + "grad_norm": 0.3646219696749776, + "learning_rate": 2.7779337112444238e-06, + "loss": 0.009, + "step": 7107 + }, + { + "epoch": 3.2338489535941766, + "grad_norm": 0.4208559385557234, + "learning_rate": 2.776653407389611e-06, + "loss": 0.0063, + "step": 7108 + }, + { + "epoch": 3.234303912647862, + "grad_norm": 0.19330467771961016, + "learning_rate": 2.7753732852166428e-06, + "loss": 0.0043, + "step": 7109 + }, + { + "epoch": 3.2347588717015467, + "grad_norm": 0.1344147761618293, + "learning_rate": 2.774093344830122e-06, + "loss": 0.0015, + "step": 7110 + }, + { + "epoch": 3.235213830755232, + "grad_norm": 0.2657374909842001, + "learning_rate": 2.7728135863346427e-06, + "loss": 0.005, + "step": 7111 + }, + { + "epoch": 3.2356687898089174, + "grad_norm": 0.4072459750414462, + "learning_rate": 2.7715340098347794e-06, + "loss": 0.0026, + "step": 7112 + }, + { + "epoch": 3.2361237488626022, + "grad_norm": 0.1442266297551056, + "learning_rate": 2.770254615435093e-06, + "loss": 0.0036, + "step": 7113 + }, + { + "epoch": 3.2365787079162875, + "grad_norm": 0.22488065683105946, + "learning_rate": 2.76897540324013e-06, + "loss": 0.0024, + "step": 7114 + }, + { + "epoch": 3.237033666969973, + "grad_norm": 0.26262719206301965, + "learning_rate": 2.76769637335442e-06, + "loss": 0.0033, + "step": 7115 + }, + { + "epoch": 3.2374886260236577, + "grad_norm": 0.18923068007150018, + "learning_rate": 2.766417525882481e-06, + "loss": 0.0041, + "step": 7116 + }, + { + "epoch": 3.237943585077343, + "grad_norm": 0.3162808890372713, + "learning_rate": 2.7651388609288177e-06, + "loss": 0.0032, + "step": 7117 + }, + { + "epoch": 3.2383985441310283, + "grad_norm": 0.20015900847028084, + "learning_rate": 2.7638603785979133e-06, + "loss": 0.0054, + "step": 7118 + }, + { + "epoch": 3.238853503184713, + "grad_norm": 0.28257635956012495, + "learning_rate": 2.762582078994241e-06, + "loss": 0.0031, + "step": 7119 + }, + { + "epoch": 3.2393084622383985, + "grad_norm": 0.043847211302883014, + "learning_rate": 2.761303962222255e-06, + "loss": 0.0004, + "step": 7120 + }, + { + "epoch": 3.239763421292084, + "grad_norm": 0.09330151131243765, + "learning_rate": 2.760026028386401e-06, + "loss": 0.0008, + "step": 7121 + }, + { + "epoch": 3.2402183803457687, + "grad_norm": 0.15342091258456772, + "learning_rate": 2.7587482775911024e-06, + "loss": 0.0015, + "step": 7122 + }, + { + "epoch": 3.240673339399454, + "grad_norm": 0.20887414195070997, + "learning_rate": 2.757470709940776e-06, + "loss": 0.0034, + "step": 7123 + }, + { + "epoch": 3.2411282984531393, + "grad_norm": 0.24010222913705828, + "learning_rate": 2.7561933255398156e-06, + "loss": 0.0037, + "step": 7124 + }, + { + "epoch": 3.241583257506824, + "grad_norm": 0.275266451650375, + "learning_rate": 2.754916124492601e-06, + "loss": 0.0028, + "step": 7125 + }, + { + "epoch": 3.2420382165605095, + "grad_norm": 0.19890008052552316, + "learning_rate": 2.7536391069035046e-06, + "loss": 0.002, + "step": 7126 + }, + { + "epoch": 3.242493175614195, + "grad_norm": 0.1416871420440179, + "learning_rate": 2.7523622728768757e-06, + "loss": 0.0007, + "step": 7127 + }, + { + "epoch": 3.2429481346678797, + "grad_norm": 0.08239089863556529, + "learning_rate": 2.7510856225170513e-06, + "loss": 0.0006, + "step": 7128 + }, + { + "epoch": 3.243403093721565, + "grad_norm": 0.10984184017568707, + "learning_rate": 2.7498091559283525e-06, + "loss": 0.0007, + "step": 7129 + }, + { + "epoch": 3.2438580527752503, + "grad_norm": 0.21224609107233322, + "learning_rate": 2.7485328732150872e-06, + "loss": 0.0051, + "step": 7130 + }, + { + "epoch": 3.244313011828935, + "grad_norm": 0.3065831000641192, + "learning_rate": 2.7472567744815506e-06, + "loss": 0.0024, + "step": 7131 + }, + { + "epoch": 3.2447679708826205, + "grad_norm": 0.3190289370033862, + "learning_rate": 2.7459808598320165e-06, + "loss": 0.0028, + "step": 7132 + }, + { + "epoch": 3.245222929936306, + "grad_norm": 0.1621658989961227, + "learning_rate": 2.7447051293707476e-06, + "loss": 0.0015, + "step": 7133 + }, + { + "epoch": 3.245677888989991, + "grad_norm": 0.39736181091782363, + "learning_rate": 2.7434295832019887e-06, + "loss": 0.0061, + "step": 7134 + }, + { + "epoch": 3.246132848043676, + "grad_norm": 0.3990589807723281, + "learning_rate": 2.7421542214299756e-06, + "loss": 0.0075, + "step": 7135 + }, + { + "epoch": 3.2465878070973613, + "grad_norm": 0.20334170027250575, + "learning_rate": 2.7408790441589217e-06, + "loss": 0.007, + "step": 7136 + }, + { + "epoch": 3.2470427661510466, + "grad_norm": 0.1532240001827751, + "learning_rate": 2.7396040514930316e-06, + "loss": 0.0033, + "step": 7137 + }, + { + "epoch": 3.2474977252047315, + "grad_norm": 0.24004564455352015, + "learning_rate": 2.7383292435364908e-06, + "loss": 0.0028, + "step": 7138 + }, + { + "epoch": 3.2479526842584168, + "grad_norm": 0.05297090734355906, + "learning_rate": 2.737054620393469e-06, + "loss": 0.0003, + "step": 7139 + }, + { + "epoch": 3.248407643312102, + "grad_norm": 0.10141762631608119, + "learning_rate": 2.7357801821681255e-06, + "loss": 0.0008, + "step": 7140 + }, + { + "epoch": 3.248862602365787, + "grad_norm": 0.1890439388552718, + "learning_rate": 2.734505928964601e-06, + "loss": 0.0015, + "step": 7141 + }, + { + "epoch": 3.2493175614194723, + "grad_norm": 0.18501097364674118, + "learning_rate": 2.733231860887021e-06, + "loss": 0.0021, + "step": 7142 + }, + { + "epoch": 3.2497725204731576, + "grad_norm": 0.03364657933446476, + "learning_rate": 2.7319579780394943e-06, + "loss": 0.0002, + "step": 7143 + }, + { + "epoch": 3.2502274795268424, + "grad_norm": 0.34418281924681093, + "learning_rate": 2.730684280526119e-06, + "loss": 0.0057, + "step": 7144 + }, + { + "epoch": 3.2506824385805277, + "grad_norm": 0.05360713459967386, + "learning_rate": 2.729410768450979e-06, + "loss": 0.0003, + "step": 7145 + }, + { + "epoch": 3.251137397634213, + "grad_norm": 0.169200753952908, + "learning_rate": 2.7281374419181367e-06, + "loss": 0.0013, + "step": 7146 + }, + { + "epoch": 3.251592356687898, + "grad_norm": 0.14898552955724903, + "learning_rate": 2.726864301031643e-06, + "loss": 0.0012, + "step": 7147 + }, + { + "epoch": 3.2520473157415832, + "grad_norm": 0.0703381828959106, + "learning_rate": 2.725591345895533e-06, + "loss": 0.0005, + "step": 7148 + }, + { + "epoch": 3.2525022747952685, + "grad_norm": 0.42677499836340976, + "learning_rate": 2.7243185766138257e-06, + "loss": 0.0078, + "step": 7149 + }, + { + "epoch": 3.2529572338489534, + "grad_norm": 0.10280017033093113, + "learning_rate": 2.7230459932905275e-06, + "loss": 0.0008, + "step": 7150 + }, + { + "epoch": 3.2534121929026387, + "grad_norm": 0.28943015042820897, + "learning_rate": 2.7217735960296295e-06, + "loss": 0.0035, + "step": 7151 + }, + { + "epoch": 3.253867151956324, + "grad_norm": 0.12429545745869343, + "learning_rate": 2.720501384935105e-06, + "loss": 0.0005, + "step": 7152 + }, + { + "epoch": 3.2543221110100093, + "grad_norm": 0.06051664488385681, + "learning_rate": 2.7192293601109134e-06, + "loss": 0.0006, + "step": 7153 + }, + { + "epoch": 3.254777070063694, + "grad_norm": 0.24896660037657153, + "learning_rate": 2.717957521660996e-06, + "loss": 0.0049, + "step": 7154 + }, + { + "epoch": 3.2552320291173795, + "grad_norm": 0.17915147040713456, + "learning_rate": 2.7166858696892867e-06, + "loss": 0.0037, + "step": 7155 + }, + { + "epoch": 3.255686988171065, + "grad_norm": 0.20377455551361887, + "learning_rate": 2.715414404299697e-06, + "loss": 0.002, + "step": 7156 + }, + { + "epoch": 3.2561419472247497, + "grad_norm": 0.3373844136749358, + "learning_rate": 2.7141431255961228e-06, + "loss": 0.0038, + "step": 7157 + }, + { + "epoch": 3.256596906278435, + "grad_norm": 0.32355393248923936, + "learning_rate": 2.7128720336824523e-06, + "loss": 0.0041, + "step": 7158 + }, + { + "epoch": 3.2570518653321203, + "grad_norm": 0.14333946189776586, + "learning_rate": 2.7116011286625478e-06, + "loss": 0.0014, + "step": 7159 + }, + { + "epoch": 3.257506824385805, + "grad_norm": 0.2600585350353282, + "learning_rate": 2.710330410640267e-06, + "loss": 0.0023, + "step": 7160 + }, + { + "epoch": 3.2579617834394905, + "grad_norm": 0.09719011715006663, + "learning_rate": 2.7090598797194447e-06, + "loss": 0.0022, + "step": 7161 + }, + { + "epoch": 3.258416742493176, + "grad_norm": 0.11143143654325995, + "learning_rate": 2.707789536003903e-06, + "loss": 0.0017, + "step": 7162 + }, + { + "epoch": 3.2588717015468607, + "grad_norm": 0.34203956354533316, + "learning_rate": 2.7065193795974474e-06, + "loss": 0.0107, + "step": 7163 + }, + { + "epoch": 3.259326660600546, + "grad_norm": 0.41330817811248277, + "learning_rate": 2.705249410603871e-06, + "loss": 0.0086, + "step": 7164 + }, + { + "epoch": 3.2597816196542313, + "grad_norm": 0.09824676055711044, + "learning_rate": 2.7039796291269516e-06, + "loss": 0.0005, + "step": 7165 + }, + { + "epoch": 3.260236578707916, + "grad_norm": 0.06807850809841584, + "learning_rate": 2.7027100352704484e-06, + "loss": 0.0004, + "step": 7166 + }, + { + "epoch": 3.2606915377616015, + "grad_norm": 0.02428812660313921, + "learning_rate": 2.7014406291381057e-06, + "loss": 0.0003, + "step": 7167 + }, + { + "epoch": 3.261146496815287, + "grad_norm": 0.30857124081147036, + "learning_rate": 2.7001714108336535e-06, + "loss": 0.0093, + "step": 7168 + }, + { + "epoch": 3.2616014558689717, + "grad_norm": 0.4031854097512622, + "learning_rate": 2.6989023804608095e-06, + "loss": 0.0034, + "step": 7169 + }, + { + "epoch": 3.262056414922657, + "grad_norm": 0.10851853189900855, + "learning_rate": 2.697633538123271e-06, + "loss": 0.0006, + "step": 7170 + }, + { + "epoch": 3.2625113739763423, + "grad_norm": 0.2690155188250169, + "learning_rate": 2.696364883924721e-06, + "loss": 0.0033, + "step": 7171 + }, + { + "epoch": 3.262966333030027, + "grad_norm": 0.2181276629041794, + "learning_rate": 2.695096417968831e-06, + "loss": 0.0021, + "step": 7172 + }, + { + "epoch": 3.2634212920837125, + "grad_norm": 0.23758819563699266, + "learning_rate": 2.6938281403592508e-06, + "loss": 0.0018, + "step": 7173 + }, + { + "epoch": 3.2638762511373978, + "grad_norm": 0.7967636126170162, + "learning_rate": 2.692560051199623e-06, + "loss": 0.042, + "step": 7174 + }, + { + "epoch": 3.2643312101910826, + "grad_norm": 0.4211075960736797, + "learning_rate": 2.691292150593567e-06, + "loss": 0.0028, + "step": 7175 + }, + { + "epoch": 3.264786169244768, + "grad_norm": 0.20546727432724868, + "learning_rate": 2.6900244386446903e-06, + "loss": 0.003, + "step": 7176 + }, + { + "epoch": 3.2652411282984533, + "grad_norm": 0.20224565986671209, + "learning_rate": 2.688756915456583e-06, + "loss": 0.002, + "step": 7177 + }, + { + "epoch": 3.265696087352138, + "grad_norm": 0.32028901986308433, + "learning_rate": 2.6874895811328227e-06, + "loss": 0.0047, + "step": 7178 + }, + { + "epoch": 3.2661510464058234, + "grad_norm": 0.3517715475866459, + "learning_rate": 2.6862224357769735e-06, + "loss": 0.0034, + "step": 7179 + }, + { + "epoch": 3.2666060054595087, + "grad_norm": 0.16322044110343517, + "learning_rate": 2.684955479492577e-06, + "loss": 0.001, + "step": 7180 + }, + { + "epoch": 3.2670609645131936, + "grad_norm": 0.19287541425823002, + "learning_rate": 2.6836887123831646e-06, + "loss": 0.0026, + "step": 7181 + }, + { + "epoch": 3.267515923566879, + "grad_norm": 0.21650577938346618, + "learning_rate": 2.6824221345522485e-06, + "loss": 0.0015, + "step": 7182 + }, + { + "epoch": 3.2679708826205642, + "grad_norm": 0.26646322209002665, + "learning_rate": 2.6811557461033313e-06, + "loss": 0.0026, + "step": 7183 + }, + { + "epoch": 3.268425841674249, + "grad_norm": 0.25724749351765963, + "learning_rate": 2.6798895471398945e-06, + "loss": 0.0037, + "step": 7184 + }, + { + "epoch": 3.2688808007279344, + "grad_norm": 0.11269082970141436, + "learning_rate": 2.678623537765404e-06, + "loss": 0.0013, + "step": 7185 + }, + { + "epoch": 3.2693357597816197, + "grad_norm": 0.24740827637594645, + "learning_rate": 2.6773577180833173e-06, + "loss": 0.006, + "step": 7186 + }, + { + "epoch": 3.2697907188353046, + "grad_norm": 0.24586612229392948, + "learning_rate": 2.6760920881970688e-06, + "loss": 0.0032, + "step": 7187 + }, + { + "epoch": 3.27024567788899, + "grad_norm": 0.3197566711858623, + "learning_rate": 2.674826648210078e-06, + "loss": 0.0041, + "step": 7188 + }, + { + "epoch": 3.270700636942675, + "grad_norm": 0.15619097272415708, + "learning_rate": 2.673561398225755e-06, + "loss": 0.0012, + "step": 7189 + }, + { + "epoch": 3.2711555959963605, + "grad_norm": 0.18916658455143268, + "learning_rate": 2.6722963383474888e-06, + "loss": 0.0014, + "step": 7190 + }, + { + "epoch": 3.2716105550500454, + "grad_norm": 0.23624471707474365, + "learning_rate": 2.6710314686786544e-06, + "loss": 0.0046, + "step": 7191 + }, + { + "epoch": 3.2720655141037307, + "grad_norm": 0.4881128093896115, + "learning_rate": 2.6697667893226077e-06, + "loss": 0.0116, + "step": 7192 + }, + { + "epoch": 3.272520473157416, + "grad_norm": 0.2804352033956888, + "learning_rate": 2.6685023003826965e-06, + "loss": 0.0027, + "step": 7193 + }, + { + "epoch": 3.272975432211101, + "grad_norm": 0.2267541804256952, + "learning_rate": 2.6672380019622503e-06, + "loss": 0.0029, + "step": 7194 + }, + { + "epoch": 3.273430391264786, + "grad_norm": 0.2757278787292266, + "learning_rate": 2.6659738941645797e-06, + "loss": 0.003, + "step": 7195 + }, + { + "epoch": 3.2738853503184715, + "grad_norm": 0.09656253820978641, + "learning_rate": 2.6647099770929824e-06, + "loss": 0.0009, + "step": 7196 + }, + { + "epoch": 3.2743403093721564, + "grad_norm": 0.2568811260632643, + "learning_rate": 2.6634462508507375e-06, + "loss": 0.0021, + "step": 7197 + }, + { + "epoch": 3.2747952684258417, + "grad_norm": 0.16199808895737475, + "learning_rate": 2.662182715541115e-06, + "loss": 0.0015, + "step": 7198 + }, + { + "epoch": 3.275250227479527, + "grad_norm": 0.17744029718226378, + "learning_rate": 2.660919371267362e-06, + "loss": 0.0031, + "step": 7199 + }, + { + "epoch": 3.275705186533212, + "grad_norm": 0.29539641511409087, + "learning_rate": 2.659656218132717e-06, + "loss": 0.0053, + "step": 7200 + }, + { + "epoch": 3.276160145586897, + "grad_norm": 0.28386411505165304, + "learning_rate": 2.658393256240396e-06, + "loss": 0.0047, + "step": 7201 + }, + { + "epoch": 3.2766151046405825, + "grad_norm": 0.14718863575620184, + "learning_rate": 2.657130485693602e-06, + "loss": 0.0026, + "step": 7202 + }, + { + "epoch": 3.2770700636942673, + "grad_norm": 0.24779703909292675, + "learning_rate": 2.655867906595526e-06, + "loss": 0.0052, + "step": 7203 + }, + { + "epoch": 3.2775250227479527, + "grad_norm": 0.17435684825661954, + "learning_rate": 2.65460551904934e-06, + "loss": 0.002, + "step": 7204 + }, + { + "epoch": 3.277979981801638, + "grad_norm": 0.3549564471563982, + "learning_rate": 2.653343323158198e-06, + "loss": 0.0022, + "step": 7205 + }, + { + "epoch": 3.278434940855323, + "grad_norm": 0.1819617948665579, + "learning_rate": 2.6520813190252404e-06, + "loss": 0.0022, + "step": 7206 + }, + { + "epoch": 3.278889899909008, + "grad_norm": 0.22062113504077227, + "learning_rate": 2.6508195067535945e-06, + "loss": 0.002, + "step": 7207 + }, + { + "epoch": 3.2793448589626935, + "grad_norm": 0.31409163144651064, + "learning_rate": 2.649557886446372e-06, + "loss": 0.0049, + "step": 7208 + }, + { + "epoch": 3.2797998180163788, + "grad_norm": 0.2796765425652682, + "learning_rate": 2.648296458206664e-06, + "loss": 0.0021, + "step": 7209 + }, + { + "epoch": 3.2802547770700636, + "grad_norm": 0.25773002985936555, + "learning_rate": 2.6470352221375496e-06, + "loss": 0.0031, + "step": 7210 + }, + { + "epoch": 3.280709736123749, + "grad_norm": 0.241479713747486, + "learning_rate": 2.6457741783420885e-06, + "loss": 0.0019, + "step": 7211 + }, + { + "epoch": 3.2811646951774343, + "grad_norm": 0.3235988011783772, + "learning_rate": 2.6445133269233325e-06, + "loss": 0.0085, + "step": 7212 + }, + { + "epoch": 3.281619654231119, + "grad_norm": 0.12733756900378967, + "learning_rate": 2.6432526679843074e-06, + "loss": 0.0014, + "step": 7213 + }, + { + "epoch": 3.2820746132848044, + "grad_norm": 0.39314498928669095, + "learning_rate": 2.641992201628034e-06, + "loss": 0.0042, + "step": 7214 + }, + { + "epoch": 3.2825295723384897, + "grad_norm": 0.09141632687175844, + "learning_rate": 2.6407319279575088e-06, + "loss": 0.0011, + "step": 7215 + }, + { + "epoch": 3.2829845313921746, + "grad_norm": 0.2826436391514078, + "learning_rate": 2.639471847075714e-06, + "loss": 0.0057, + "step": 7216 + }, + { + "epoch": 3.28343949044586, + "grad_norm": 0.12212552924455057, + "learning_rate": 2.6382119590856226e-06, + "loss": 0.0011, + "step": 7217 + }, + { + "epoch": 3.2838944494995452, + "grad_norm": 0.24209539677389158, + "learning_rate": 2.6369522640901836e-06, + "loss": 0.0018, + "step": 7218 + }, + { + "epoch": 3.28434940855323, + "grad_norm": 0.35241312909161665, + "learning_rate": 2.6356927621923343e-06, + "loss": 0.0082, + "step": 7219 + }, + { + "epoch": 3.2848043676069154, + "grad_norm": 0.38443106051345766, + "learning_rate": 2.634433453494993e-06, + "loss": 0.0089, + "step": 7220 + }, + { + "epoch": 3.2852593266606007, + "grad_norm": 0.4712211357254818, + "learning_rate": 2.633174338101068e-06, + "loss": 0.0046, + "step": 7221 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 0.19369858066775172, + "learning_rate": 2.6319154161134484e-06, + "loss": 0.0021, + "step": 7222 + }, + { + "epoch": 3.286169244767971, + "grad_norm": 0.3276655948136777, + "learning_rate": 2.6306566876350072e-06, + "loss": 0.0067, + "step": 7223 + }, + { + "epoch": 3.286624203821656, + "grad_norm": 0.28093485046800604, + "learning_rate": 2.6293981527686018e-06, + "loss": 0.0036, + "step": 7224 + }, + { + "epoch": 3.287079162875341, + "grad_norm": 0.14654869426966358, + "learning_rate": 2.6281398116170736e-06, + "loss": 0.0014, + "step": 7225 + }, + { + "epoch": 3.2875341219290264, + "grad_norm": 0.2526362478616213, + "learning_rate": 2.626881664283247e-06, + "loss": 0.0032, + "step": 7226 + }, + { + "epoch": 3.2879890809827117, + "grad_norm": 0.11656170348832014, + "learning_rate": 2.625623710869934e-06, + "loss": 0.0008, + "step": 7227 + }, + { + "epoch": 3.2884440400363966, + "grad_norm": 0.419202347891847, + "learning_rate": 2.62436595147993e-06, + "loss": 0.0044, + "step": 7228 + }, + { + "epoch": 3.288898999090082, + "grad_norm": 0.36359780227547744, + "learning_rate": 2.6231083862160134e-06, + "loss": 0.0068, + "step": 7229 + }, + { + "epoch": 3.289353958143767, + "grad_norm": 0.12545000328430567, + "learning_rate": 2.621851015180945e-06, + "loss": 0.0011, + "step": 7230 + }, + { + "epoch": 3.289808917197452, + "grad_norm": 0.3613893940093683, + "learning_rate": 2.6205938384774698e-06, + "loss": 0.0122, + "step": 7231 + }, + { + "epoch": 3.2902638762511374, + "grad_norm": 0.20055896136181164, + "learning_rate": 2.6193368562083226e-06, + "loss": 0.0022, + "step": 7232 + }, + { + "epoch": 3.2907188353048227, + "grad_norm": 0.25846498960539516, + "learning_rate": 2.618080068476217e-06, + "loss": 0.0044, + "step": 7233 + }, + { + "epoch": 3.2911737943585075, + "grad_norm": 0.32379062949367765, + "learning_rate": 2.6168234753838497e-06, + "loss": 0.0038, + "step": 7234 + }, + { + "epoch": 3.291628753412193, + "grad_norm": 0.20795370844078756, + "learning_rate": 2.615567077033907e-06, + "loss": 0.0009, + "step": 7235 + }, + { + "epoch": 3.292083712465878, + "grad_norm": 0.04601790713926328, + "learning_rate": 2.6143108735290534e-06, + "loss": 0.0003, + "step": 7236 + }, + { + "epoch": 3.292538671519563, + "grad_norm": 0.3164377804861328, + "learning_rate": 2.6130548649719434e-06, + "loss": 0.0069, + "step": 7237 + }, + { + "epoch": 3.2929936305732483, + "grad_norm": 0.24218745571621716, + "learning_rate": 2.61179905146521e-06, + "loss": 0.0026, + "step": 7238 + }, + { + "epoch": 3.2934485896269337, + "grad_norm": 0.2755404432763007, + "learning_rate": 2.610543433111473e-06, + "loss": 0.0044, + "step": 7239 + }, + { + "epoch": 3.2939035486806185, + "grad_norm": 0.14076124438836563, + "learning_rate": 2.609288010013335e-06, + "loss": 0.002, + "step": 7240 + }, + { + "epoch": 3.294358507734304, + "grad_norm": 0.09307802009705414, + "learning_rate": 2.6080327822733837e-06, + "loss": 0.0005, + "step": 7241 + }, + { + "epoch": 3.294813466787989, + "grad_norm": 0.09960470798181198, + "learning_rate": 2.6067777499941937e-06, + "loss": 0.0008, + "step": 7242 + }, + { + "epoch": 3.295268425841674, + "grad_norm": 0.17779239899983187, + "learning_rate": 2.6055229132783175e-06, + "loss": 0.003, + "step": 7243 + }, + { + "epoch": 3.2957233848953593, + "grad_norm": 0.239092146999513, + "learning_rate": 2.6042682722282964e-06, + "loss": 0.0059, + "step": 7244 + }, + { + "epoch": 3.2961783439490446, + "grad_norm": 0.26744604832466934, + "learning_rate": 2.603013826946651e-06, + "loss": 0.003, + "step": 7245 + }, + { + "epoch": 3.29663330300273, + "grad_norm": 0.12128936745238342, + "learning_rate": 2.6017595775358928e-06, + "loss": 0.0011, + "step": 7246 + }, + { + "epoch": 3.297088262056415, + "grad_norm": 0.20695500254881521, + "learning_rate": 2.6005055240985113e-06, + "loss": 0.0057, + "step": 7247 + }, + { + "epoch": 3.2975432211101, + "grad_norm": 0.1419702376477733, + "learning_rate": 2.5992516667369805e-06, + "loss": 0.0025, + "step": 7248 + }, + { + "epoch": 3.2979981801637854, + "grad_norm": 0.05793491091204328, + "learning_rate": 2.597998005553764e-06, + "loss": 0.0005, + "step": 7249 + }, + { + "epoch": 3.2984531392174703, + "grad_norm": 0.08289586536331124, + "learning_rate": 2.5967445406513013e-06, + "loss": 0.0006, + "step": 7250 + }, + { + "epoch": 3.2989080982711556, + "grad_norm": 0.2714757900769244, + "learning_rate": 2.5954912721320237e-06, + "loss": 0.0028, + "step": 7251 + }, + { + "epoch": 3.299363057324841, + "grad_norm": 0.12976936546940726, + "learning_rate": 2.5942382000983403e-06, + "loss": 0.0019, + "step": 7252 + }, + { + "epoch": 3.299818016378526, + "grad_norm": 0.3248448443965714, + "learning_rate": 2.5929853246526466e-06, + "loss": 0.005, + "step": 7253 + }, + { + "epoch": 3.300272975432211, + "grad_norm": 0.17074397189179708, + "learning_rate": 2.5917326458973224e-06, + "loss": 0.0019, + "step": 7254 + }, + { + "epoch": 3.3007279344858964, + "grad_norm": 0.3065231414289718, + "learning_rate": 2.5904801639347273e-06, + "loss": 0.0056, + "step": 7255 + }, + { + "epoch": 3.3011828935395813, + "grad_norm": 0.20273530408422263, + "learning_rate": 2.5892278788672154e-06, + "loss": 0.002, + "step": 7256 + }, + { + "epoch": 3.3016378525932666, + "grad_norm": 0.38598289013176906, + "learning_rate": 2.5879757907971144e-06, + "loss": 0.0066, + "step": 7257 + }, + { + "epoch": 3.302092811646952, + "grad_norm": 0.136017382148316, + "learning_rate": 2.5867238998267386e-06, + "loss": 0.0017, + "step": 7258 + }, + { + "epoch": 3.3025477707006368, + "grad_norm": 0.05194067573736035, + "learning_rate": 2.585472206058388e-06, + "loss": 0.0005, + "step": 7259 + }, + { + "epoch": 3.303002729754322, + "grad_norm": 0.23793721769862095, + "learning_rate": 2.584220709594343e-06, + "loss": 0.0041, + "step": 7260 + }, + { + "epoch": 3.3034576888080074, + "grad_norm": 0.24718595782286687, + "learning_rate": 2.582969410536874e-06, + "loss": 0.0026, + "step": 7261 + }, + { + "epoch": 3.3039126478616927, + "grad_norm": 0.2704220442702209, + "learning_rate": 2.5817183089882275e-06, + "loss": 0.0052, + "step": 7262 + }, + { + "epoch": 3.3043676069153776, + "grad_norm": 0.19002404344345794, + "learning_rate": 2.580467405050642e-06, + "loss": 0.0043, + "step": 7263 + }, + { + "epoch": 3.304822565969063, + "grad_norm": 0.0949216897112524, + "learning_rate": 2.5792166988263336e-06, + "loss": 0.0007, + "step": 7264 + }, + { + "epoch": 3.305277525022748, + "grad_norm": 0.7628543889003218, + "learning_rate": 2.5779661904175022e-06, + "loss": 0.0011, + "step": 7265 + }, + { + "epoch": 3.305732484076433, + "grad_norm": 0.23094332899454187, + "learning_rate": 2.576715879926338e-06, + "loss": 0.0027, + "step": 7266 + }, + { + "epoch": 3.3061874431301184, + "grad_norm": 0.3845595692052192, + "learning_rate": 2.575465767455009e-06, + "loss": 0.002, + "step": 7267 + }, + { + "epoch": 3.3066424021838037, + "grad_norm": 0.20459573434649453, + "learning_rate": 2.574215853105667e-06, + "loss": 0.0017, + "step": 7268 + }, + { + "epoch": 3.3070973612374885, + "grad_norm": 0.3196180698891119, + "learning_rate": 2.5729661369804505e-06, + "loss": 0.0045, + "step": 7269 + }, + { + "epoch": 3.307552320291174, + "grad_norm": 0.2073632429701408, + "learning_rate": 2.5717166191814803e-06, + "loss": 0.0021, + "step": 7270 + }, + { + "epoch": 3.308007279344859, + "grad_norm": 0.5950699211150948, + "learning_rate": 2.5704672998108636e-06, + "loss": 0.0132, + "step": 7271 + }, + { + "epoch": 3.308462238398544, + "grad_norm": 0.2997257838226498, + "learning_rate": 2.569218178970688e-06, + "loss": 0.0027, + "step": 7272 + }, + { + "epoch": 3.3089171974522293, + "grad_norm": 0.2924224676762152, + "learning_rate": 2.5679692567630247e-06, + "loss": 0.0041, + "step": 7273 + }, + { + "epoch": 3.3093721565059147, + "grad_norm": 0.518688078078415, + "learning_rate": 2.5667205332899294e-06, + "loss": 0.005, + "step": 7274 + }, + { + "epoch": 3.3098271155595995, + "grad_norm": 0.13013443653274556, + "learning_rate": 2.5654720086534456e-06, + "loss": 0.0013, + "step": 7275 + }, + { + "epoch": 3.310282074613285, + "grad_norm": 0.37097330863903055, + "learning_rate": 2.5642236829555926e-06, + "loss": 0.0064, + "step": 7276 + }, + { + "epoch": 3.31073703366697, + "grad_norm": 0.4071420739095773, + "learning_rate": 2.5629755562983827e-06, + "loss": 0.0088, + "step": 7277 + }, + { + "epoch": 3.311191992720655, + "grad_norm": 0.3013908187952351, + "learning_rate": 2.5617276287838043e-06, + "loss": 0.0041, + "step": 7278 + }, + { + "epoch": 3.3116469517743403, + "grad_norm": 0.13989079123637085, + "learning_rate": 2.560479900513832e-06, + "loss": 0.0022, + "step": 7279 + }, + { + "epoch": 3.3121019108280256, + "grad_norm": 0.14256871449962055, + "learning_rate": 2.5592323715904266e-06, + "loss": 0.0029, + "step": 7280 + }, + { + "epoch": 3.3125568698817105, + "grad_norm": 0.12064162971566546, + "learning_rate": 2.5579850421155294e-06, + "loss": 0.0022, + "step": 7281 + }, + { + "epoch": 3.313011828935396, + "grad_norm": 0.14792415961492336, + "learning_rate": 2.5567379121910672e-06, + "loss": 0.0017, + "step": 7282 + }, + { + "epoch": 3.313466787989081, + "grad_norm": 0.1547308310214879, + "learning_rate": 2.5554909819189466e-06, + "loss": 0.0011, + "step": 7283 + }, + { + "epoch": 3.313921747042766, + "grad_norm": 0.2690986210507297, + "learning_rate": 2.5542442514010635e-06, + "loss": 0.0035, + "step": 7284 + }, + { + "epoch": 3.3143767060964513, + "grad_norm": 0.12918828812883756, + "learning_rate": 2.5529977207392977e-06, + "loss": 0.0019, + "step": 7285 + }, + { + "epoch": 3.3148316651501366, + "grad_norm": 0.5004581771599083, + "learning_rate": 2.551751390035507e-06, + "loss": 0.009, + "step": 7286 + }, + { + "epoch": 3.3152866242038215, + "grad_norm": 0.2188150802674912, + "learning_rate": 2.550505259391537e-06, + "loss": 0.0042, + "step": 7287 + }, + { + "epoch": 3.315741583257507, + "grad_norm": 0.32237899611928544, + "learning_rate": 2.549259328909214e-06, + "loss": 0.0038, + "step": 7288 + }, + { + "epoch": 3.316196542311192, + "grad_norm": 0.17433601843193414, + "learning_rate": 2.548013598690352e-06, + "loss": 0.0022, + "step": 7289 + }, + { + "epoch": 3.316651501364877, + "grad_norm": 0.32272099671720045, + "learning_rate": 2.5467680688367437e-06, + "loss": 0.0045, + "step": 7290 + }, + { + "epoch": 3.3171064604185623, + "grad_norm": 0.2581902236623843, + "learning_rate": 2.5455227394501726e-06, + "loss": 0.0069, + "step": 7291 + }, + { + "epoch": 3.3175614194722476, + "grad_norm": 0.35864153150367284, + "learning_rate": 2.5442776106323984e-06, + "loss": 0.0041, + "step": 7292 + }, + { + "epoch": 3.3180163785259325, + "grad_norm": 0.14772981343671274, + "learning_rate": 2.5430326824851693e-06, + "loss": 0.0009, + "step": 7293 + }, + { + "epoch": 3.3184713375796178, + "grad_norm": 0.13298019476464154, + "learning_rate": 2.5417879551102104e-06, + "loss": 0.001, + "step": 7294 + }, + { + "epoch": 3.318926296633303, + "grad_norm": 0.34506515118859876, + "learning_rate": 2.540543428609241e-06, + "loss": 0.0035, + "step": 7295 + }, + { + "epoch": 3.319381255686988, + "grad_norm": 0.13203322940229437, + "learning_rate": 2.539299103083956e-06, + "loss": 0.0012, + "step": 7296 + }, + { + "epoch": 3.3198362147406733, + "grad_norm": 0.19859864419203488, + "learning_rate": 2.5380549786360335e-06, + "loss": 0.0012, + "step": 7297 + }, + { + "epoch": 3.3202911737943586, + "grad_norm": 0.36402931140326145, + "learning_rate": 2.5368110553671426e-06, + "loss": 0.0093, + "step": 7298 + }, + { + "epoch": 3.3207461328480434, + "grad_norm": 0.49718953879928546, + "learning_rate": 2.5355673333789264e-06, + "loss": 0.0051, + "step": 7299 + }, + { + "epoch": 3.3212010919017287, + "grad_norm": 0.20120116580885394, + "learning_rate": 2.53432381277302e-06, + "loss": 0.0038, + "step": 7300 + }, + { + "epoch": 3.321656050955414, + "grad_norm": 0.20356369716200837, + "learning_rate": 2.5330804936510374e-06, + "loss": 0.0022, + "step": 7301 + }, + { + "epoch": 3.3221110100090994, + "grad_norm": 0.15078844520382614, + "learning_rate": 2.5318373761145757e-06, + "loss": 0.0028, + "step": 7302 + }, + { + "epoch": 3.3225659690627842, + "grad_norm": 0.2013139263014182, + "learning_rate": 2.530594460265217e-06, + "loss": 0.002, + "step": 7303 + }, + { + "epoch": 3.3230209281164695, + "grad_norm": 0.23668974653933753, + "learning_rate": 2.5293517462045254e-06, + "loss": 0.0039, + "step": 7304 + }, + { + "epoch": 3.323475887170155, + "grad_norm": 0.21387998420937504, + "learning_rate": 2.528109234034054e-06, + "loss": 0.0012, + "step": 7305 + }, + { + "epoch": 3.3239308462238397, + "grad_norm": 0.1727488684922544, + "learning_rate": 2.5268669238553337e-06, + "loss": 0.0042, + "step": 7306 + }, + { + "epoch": 3.324385805277525, + "grad_norm": 0.27516283501786926, + "learning_rate": 2.52562481576988e-06, + "loss": 0.0041, + "step": 7307 + }, + { + "epoch": 3.3248407643312103, + "grad_norm": 0.08728974391034555, + "learning_rate": 2.5243829098791894e-06, + "loss": 0.001, + "step": 7308 + }, + { + "epoch": 3.325295723384895, + "grad_norm": 0.36755171543684856, + "learning_rate": 2.5231412062847502e-06, + "loss": 0.0073, + "step": 7309 + }, + { + "epoch": 3.3257506824385805, + "grad_norm": 0.2876866632856369, + "learning_rate": 2.5218997050880262e-06, + "loss": 0.004, + "step": 7310 + }, + { + "epoch": 3.326205641492266, + "grad_norm": 0.11053008624322136, + "learning_rate": 2.5206584063904648e-06, + "loss": 0.0015, + "step": 7311 + }, + { + "epoch": 3.3266606005459507, + "grad_norm": 0.20030627286537156, + "learning_rate": 2.5194173102935044e-06, + "loss": 0.0026, + "step": 7312 + }, + { + "epoch": 3.327115559599636, + "grad_norm": 0.16950799475825842, + "learning_rate": 2.5181764168985566e-06, + "loss": 0.0033, + "step": 7313 + }, + { + "epoch": 3.3275705186533213, + "grad_norm": 0.2842261936564266, + "learning_rate": 2.516935726307027e-06, + "loss": 0.0027, + "step": 7314 + }, + { + "epoch": 3.328025477707006, + "grad_norm": 0.41526373932969185, + "learning_rate": 2.515695238620296e-06, + "loss": 0.0021, + "step": 7315 + }, + { + "epoch": 3.3284804367606915, + "grad_norm": 0.21075976304945018, + "learning_rate": 2.514454953939731e-06, + "loss": 0.0034, + "step": 7316 + }, + { + "epoch": 3.328935395814377, + "grad_norm": 0.07055641907169599, + "learning_rate": 2.5132148723666807e-06, + "loss": 0.0007, + "step": 7317 + }, + { + "epoch": 3.329390354868062, + "grad_norm": 1.0278173050813473, + "learning_rate": 2.5119749940024805e-06, + "loss": 0.0095, + "step": 7318 + }, + { + "epoch": 3.329845313921747, + "grad_norm": 0.1438870759025006, + "learning_rate": 2.5107353189484503e-06, + "loss": 0.0022, + "step": 7319 + }, + { + "epoch": 3.3303002729754323, + "grad_norm": 0.33844938099288047, + "learning_rate": 2.5094958473058883e-06, + "loss": 0.0051, + "step": 7320 + }, + { + "epoch": 3.3307552320291176, + "grad_norm": 0.3393158344571439, + "learning_rate": 2.508256579176078e-06, + "loss": 0.0068, + "step": 7321 + }, + { + "epoch": 3.3312101910828025, + "grad_norm": 0.3775418266642367, + "learning_rate": 2.5070175146602864e-06, + "loss": 0.0082, + "step": 7322 + }, + { + "epoch": 3.331665150136488, + "grad_norm": 0.3513833090670177, + "learning_rate": 2.5057786538597674e-06, + "loss": 0.0031, + "step": 7323 + }, + { + "epoch": 3.332120109190173, + "grad_norm": 0.07797340683218945, + "learning_rate": 2.504539996875752e-06, + "loss": 0.0009, + "step": 7324 + }, + { + "epoch": 3.332575068243858, + "grad_norm": 0.189112346004838, + "learning_rate": 2.5033015438094577e-06, + "loss": 0.0022, + "step": 7325 + }, + { + "epoch": 3.3330300272975433, + "grad_norm": 0.26356992815314156, + "learning_rate": 2.502063294762087e-06, + "loss": 0.005, + "step": 7326 + }, + { + "epoch": 3.3334849863512286, + "grad_norm": 0.10282269568175559, + "learning_rate": 2.500825249834823e-06, + "loss": 0.0009, + "step": 7327 + }, + { + "epoch": 3.3339399454049135, + "grad_norm": 0.1778713733816966, + "learning_rate": 2.4995874091288343e-06, + "loss": 0.0023, + "step": 7328 + }, + { + "epoch": 3.3343949044585988, + "grad_norm": 0.24928156816519922, + "learning_rate": 2.4983497727452704e-06, + "loss": 0.001, + "step": 7329 + }, + { + "epoch": 3.334849863512284, + "grad_norm": 0.33541814228695904, + "learning_rate": 2.4971123407852665e-06, + "loss": 0.01, + "step": 7330 + }, + { + "epoch": 3.335304822565969, + "grad_norm": 0.5295496967837838, + "learning_rate": 2.495875113349939e-06, + "loss": 0.0066, + "step": 7331 + }, + { + "epoch": 3.3357597816196543, + "grad_norm": 0.28325256526663395, + "learning_rate": 2.494638090540387e-06, + "loss": 0.0061, + "step": 7332 + }, + { + "epoch": 3.3362147406733396, + "grad_norm": 0.09104853173261164, + "learning_rate": 2.493401272457695e-06, + "loss": 0.001, + "step": 7333 + }, + { + "epoch": 3.3366696997270244, + "grad_norm": 0.36272303356851776, + "learning_rate": 2.492164659202934e-06, + "loss": 0.0128, + "step": 7334 + }, + { + "epoch": 3.3371246587807097, + "grad_norm": 0.35061986939040046, + "learning_rate": 2.490928250877152e-06, + "loss": 0.0111, + "step": 7335 + }, + { + "epoch": 3.337579617834395, + "grad_norm": 0.28032885436245897, + "learning_rate": 2.4896920475813824e-06, + "loss": 0.0047, + "step": 7336 + }, + { + "epoch": 3.33803457688808, + "grad_norm": 0.46575974164271244, + "learning_rate": 2.48845604941664e-06, + "loss": 0.0109, + "step": 7337 + }, + { + "epoch": 3.3384895359417652, + "grad_norm": 0.44632320593972635, + "learning_rate": 2.4872202564839293e-06, + "loss": 0.0105, + "step": 7338 + }, + { + "epoch": 3.3389444949954505, + "grad_norm": 0.29875495923786044, + "learning_rate": 2.48598466888423e-06, + "loss": 0.0046, + "step": 7339 + }, + { + "epoch": 3.3393994540491354, + "grad_norm": 0.2570759464781434, + "learning_rate": 2.4847492867185113e-06, + "loss": 0.0028, + "step": 7340 + }, + { + "epoch": 3.3398544131028207, + "grad_norm": 0.25893243182426584, + "learning_rate": 2.483514110087723e-06, + "loss": 0.0074, + "step": 7341 + }, + { + "epoch": 3.340309372156506, + "grad_norm": 0.26430627311799326, + "learning_rate": 2.482279139092795e-06, + "loss": 0.0016, + "step": 7342 + }, + { + "epoch": 3.340764331210191, + "grad_norm": 0.1980482553148076, + "learning_rate": 2.481044373834648e-06, + "loss": 0.0023, + "step": 7343 + }, + { + "epoch": 3.341219290263876, + "grad_norm": 0.364216865329497, + "learning_rate": 2.479809814414179e-06, + "loss": 0.0066, + "step": 7344 + }, + { + "epoch": 3.3416742493175615, + "grad_norm": 0.2557590655299494, + "learning_rate": 2.4785754609322714e-06, + "loss": 0.0059, + "step": 7345 + }, + { + "epoch": 3.3421292083712464, + "grad_norm": 0.1370077657388034, + "learning_rate": 2.477341313489788e-06, + "loss": 0.0017, + "step": 7346 + }, + { + "epoch": 3.3425841674249317, + "grad_norm": 0.06275266553004565, + "learning_rate": 2.4761073721875805e-06, + "loss": 0.0005, + "step": 7347 + }, + { + "epoch": 3.343039126478617, + "grad_norm": 0.10547302869649244, + "learning_rate": 2.4748736371264825e-06, + "loss": 0.0007, + "step": 7348 + }, + { + "epoch": 3.343494085532302, + "grad_norm": 0.2881291699184336, + "learning_rate": 2.4736401084073074e-06, + "loss": 0.0052, + "step": 7349 + }, + { + "epoch": 3.343949044585987, + "grad_norm": 0.23454407724560167, + "learning_rate": 2.4724067861308544e-06, + "loss": 0.0032, + "step": 7350 + }, + { + "epoch": 3.3444040036396725, + "grad_norm": 0.16237935164582346, + "learning_rate": 2.4711736703979015e-06, + "loss": 0.0012, + "step": 7351 + }, + { + "epoch": 3.3448589626933574, + "grad_norm": 0.17893016342068024, + "learning_rate": 2.4699407613092185e-06, + "loss": 0.0014, + "step": 7352 + }, + { + "epoch": 3.3453139217470427, + "grad_norm": 0.28489482923608267, + "learning_rate": 2.4687080589655494e-06, + "loss": 0.003, + "step": 7353 + }, + { + "epoch": 3.345768880800728, + "grad_norm": 0.40093990448855094, + "learning_rate": 2.467475563467628e-06, + "loss": 0.009, + "step": 7354 + }, + { + "epoch": 3.3462238398544133, + "grad_norm": 0.1603316142781402, + "learning_rate": 2.4662432749161664e-06, + "loss": 0.0028, + "step": 7355 + }, + { + "epoch": 3.346678798908098, + "grad_norm": 0.35500619515811727, + "learning_rate": 2.4650111934118604e-06, + "loss": 0.0032, + "step": 7356 + }, + { + "epoch": 3.3471337579617835, + "grad_norm": 0.04077092516052504, + "learning_rate": 2.463779319055394e-06, + "loss": 0.0004, + "step": 7357 + }, + { + "epoch": 3.347588717015469, + "grad_norm": 0.2930415679876122, + "learning_rate": 2.462547651947428e-06, + "loss": 0.0049, + "step": 7358 + }, + { + "epoch": 3.3480436760691537, + "grad_norm": 0.14966112595626418, + "learning_rate": 2.4613161921886087e-06, + "loss": 0.0016, + "step": 7359 + }, + { + "epoch": 3.348498635122839, + "grad_norm": 0.09521370717364151, + "learning_rate": 2.4600849398795633e-06, + "loss": 0.0013, + "step": 7360 + }, + { + "epoch": 3.3489535941765243, + "grad_norm": 0.26831455339138277, + "learning_rate": 2.458853895120907e-06, + "loss": 0.004, + "step": 7361 + }, + { + "epoch": 3.349408553230209, + "grad_norm": 0.3522142055830625, + "learning_rate": 2.4576230580132366e-06, + "loss": 0.0031, + "step": 7362 + }, + { + "epoch": 3.3498635122838945, + "grad_norm": 0.3136183709621097, + "learning_rate": 2.4563924286571285e-06, + "loss": 0.004, + "step": 7363 + }, + { + "epoch": 3.3503184713375798, + "grad_norm": 0.38469428663494953, + "learning_rate": 2.455162007153144e-06, + "loss": 0.0053, + "step": 7364 + }, + { + "epoch": 3.3507734303912646, + "grad_norm": 0.2608359331901631, + "learning_rate": 2.4539317936018287e-06, + "loss": 0.0043, + "step": 7365 + }, + { + "epoch": 3.35122838944495, + "grad_norm": 0.28166058044681713, + "learning_rate": 2.452701788103707e-06, + "loss": 0.0019, + "step": 7366 + }, + { + "epoch": 3.3516833484986353, + "grad_norm": 0.2311103793427457, + "learning_rate": 2.451471990759291e-06, + "loss": 0.0024, + "step": 7367 + }, + { + "epoch": 3.35213830755232, + "grad_norm": 0.2940347344513648, + "learning_rate": 2.4502424016690775e-06, + "loss": 0.0049, + "step": 7368 + }, + { + "epoch": 3.3525932666060054, + "grad_norm": 0.4669024471263398, + "learning_rate": 2.44901302093354e-06, + "loss": 0.0095, + "step": 7369 + }, + { + "epoch": 3.3530482256596907, + "grad_norm": 0.0961448192715872, + "learning_rate": 2.4477838486531386e-06, + "loss": 0.0023, + "step": 7370 + }, + { + "epoch": 3.3535031847133756, + "grad_norm": 0.138231832817281, + "learning_rate": 2.446554884928313e-06, + "loss": 0.0007, + "step": 7371 + }, + { + "epoch": 3.353958143767061, + "grad_norm": 0.4425843182085254, + "learning_rate": 2.445326129859493e-06, + "loss": 0.0042, + "step": 7372 + }, + { + "epoch": 3.3544131028207462, + "grad_norm": 0.13892725791334185, + "learning_rate": 2.4440975835470853e-06, + "loss": 0.0018, + "step": 7373 + }, + { + "epoch": 3.3548680618744315, + "grad_norm": 0.2050029208884926, + "learning_rate": 2.4428692460914783e-06, + "loss": 0.0034, + "step": 7374 + }, + { + "epoch": 3.3553230209281164, + "grad_norm": 0.5872584964516175, + "learning_rate": 2.441641117593051e-06, + "loss": 0.0157, + "step": 7375 + }, + { + "epoch": 3.3557779799818017, + "grad_norm": 0.2075077060962585, + "learning_rate": 2.440413198152156e-06, + "loss": 0.0025, + "step": 7376 + }, + { + "epoch": 3.356232939035487, + "grad_norm": 0.21060675410282897, + "learning_rate": 2.4391854878691374e-06, + "loss": 0.0021, + "step": 7377 + }, + { + "epoch": 3.356687898089172, + "grad_norm": 0.18972519948504624, + "learning_rate": 2.437957986844316e-06, + "loss": 0.0016, + "step": 7378 + }, + { + "epoch": 3.357142857142857, + "grad_norm": 0.5664575246739697, + "learning_rate": 2.436730695177998e-06, + "loss": 0.0037, + "step": 7379 + }, + { + "epoch": 3.3575978161965425, + "grad_norm": 0.35747484587848555, + "learning_rate": 2.43550361297047e-06, + "loss": 0.0081, + "step": 7380 + }, + { + "epoch": 3.3580527752502274, + "grad_norm": 0.2616627947472333, + "learning_rate": 2.434276740322005e-06, + "loss": 0.0094, + "step": 7381 + }, + { + "epoch": 3.3585077343039127, + "grad_norm": 0.2449991978069107, + "learning_rate": 2.4330500773328608e-06, + "loss": 0.0041, + "step": 7382 + }, + { + "epoch": 3.358962693357598, + "grad_norm": 0.3051261543090273, + "learning_rate": 2.4318236241032723e-06, + "loss": 0.0027, + "step": 7383 + }, + { + "epoch": 3.359417652411283, + "grad_norm": 0.1995524845485516, + "learning_rate": 2.430597380733459e-06, + "loss": 0.0055, + "step": 7384 + }, + { + "epoch": 3.359872611464968, + "grad_norm": 0.5443547108548171, + "learning_rate": 2.429371347323622e-06, + "loss": 0.0051, + "step": 7385 + }, + { + "epoch": 3.3603275705186535, + "grad_norm": 0.21093484627895018, + "learning_rate": 2.428145523973952e-06, + "loss": 0.0023, + "step": 7386 + }, + { + "epoch": 3.3607825295723384, + "grad_norm": 0.13761078709243324, + "learning_rate": 2.426919910784615e-06, + "loss": 0.0014, + "step": 7387 + }, + { + "epoch": 3.3612374886260237, + "grad_norm": 0.49437877181380013, + "learning_rate": 2.425694507855762e-06, + "loss": 0.0123, + "step": 7388 + }, + { + "epoch": 3.361692447679709, + "grad_norm": 0.5302513007493147, + "learning_rate": 2.42446931528753e-06, + "loss": 0.004, + "step": 7389 + }, + { + "epoch": 3.362147406733394, + "grad_norm": 0.11245617563980523, + "learning_rate": 2.423244333180032e-06, + "loss": 0.0009, + "step": 7390 + }, + { + "epoch": 3.362602365787079, + "grad_norm": 0.08739279737890837, + "learning_rate": 2.422019561633373e-06, + "loss": 0.0016, + "step": 7391 + }, + { + "epoch": 3.3630573248407645, + "grad_norm": 0.3547527842961154, + "learning_rate": 2.4207950007476335e-06, + "loss": 0.0057, + "step": 7392 + }, + { + "epoch": 3.3635122838944493, + "grad_norm": 0.33975658421196914, + "learning_rate": 2.4195706506228785e-06, + "loss": 0.0034, + "step": 7393 + }, + { + "epoch": 3.3639672429481347, + "grad_norm": 0.3250004927194024, + "learning_rate": 2.4183465113591547e-06, + "loss": 0.0038, + "step": 7394 + }, + { + "epoch": 3.36442220200182, + "grad_norm": 0.28608833459689476, + "learning_rate": 2.417122583056496e-06, + "loss": 0.0043, + "step": 7395 + }, + { + "epoch": 3.364877161055505, + "grad_norm": 0.2405906832521087, + "learning_rate": 2.4158988658149173e-06, + "loss": 0.0043, + "step": 7396 + }, + { + "epoch": 3.36533212010919, + "grad_norm": 0.4235606741284786, + "learning_rate": 2.4146753597344136e-06, + "loss": 0.006, + "step": 7397 + }, + { + "epoch": 3.3657870791628755, + "grad_norm": 0.09472292713394488, + "learning_rate": 2.4134520649149646e-06, + "loss": 0.0009, + "step": 7398 + }, + { + "epoch": 3.3662420382165603, + "grad_norm": 0.1446411078510499, + "learning_rate": 2.4122289814565312e-06, + "loss": 0.0021, + "step": 7399 + }, + { + "epoch": 3.3666969972702456, + "grad_norm": 0.15640355506080575, + "learning_rate": 2.4110061094590583e-06, + "loss": 0.0024, + "step": 7400 + }, + { + "epoch": 3.367151956323931, + "grad_norm": 0.2408129485245961, + "learning_rate": 2.4097834490224754e-06, + "loss": 0.0056, + "step": 7401 + }, + { + "epoch": 3.367606915377616, + "grad_norm": 0.335113110031722, + "learning_rate": 2.4085610002466904e-06, + "loss": 0.0086, + "step": 7402 + }, + { + "epoch": 3.368061874431301, + "grad_norm": 0.08420324284226373, + "learning_rate": 2.407338763231599e-06, + "loss": 0.0013, + "step": 7403 + }, + { + "epoch": 3.3685168334849864, + "grad_norm": 0.1396450007182465, + "learning_rate": 2.4061167380770763e-06, + "loss": 0.0011, + "step": 7404 + }, + { + "epoch": 3.3689717925386713, + "grad_norm": 0.16999508428944177, + "learning_rate": 2.404894924882977e-06, + "loss": 0.0018, + "step": 7405 + }, + { + "epoch": 3.3694267515923566, + "grad_norm": 0.3526039573547545, + "learning_rate": 2.4036733237491476e-06, + "loss": 0.0073, + "step": 7406 + }, + { + "epoch": 3.369881710646042, + "grad_norm": 0.23492435210891607, + "learning_rate": 2.4024519347754093e-06, + "loss": 0.0026, + "step": 7407 + }, + { + "epoch": 3.370336669699727, + "grad_norm": 0.15506720423048814, + "learning_rate": 2.4012307580615687e-06, + "loss": 0.0025, + "step": 7408 + }, + { + "epoch": 3.370791628753412, + "grad_norm": 0.3132982903346134, + "learning_rate": 2.400009793707412e-06, + "loss": 0.0026, + "step": 7409 + }, + { + "epoch": 3.3712465878070974, + "grad_norm": 0.5417676611954173, + "learning_rate": 2.3987890418127135e-06, + "loss": 0.0067, + "step": 7410 + }, + { + "epoch": 3.3717015468607827, + "grad_norm": 0.11962600915485652, + "learning_rate": 2.39756850247723e-06, + "loss": 0.0011, + "step": 7411 + }, + { + "epoch": 3.3721565059144676, + "grad_norm": 0.29016847422403347, + "learning_rate": 2.3963481758006958e-06, + "loss": 0.0047, + "step": 7412 + }, + { + "epoch": 3.372611464968153, + "grad_norm": 0.06873271751268913, + "learning_rate": 2.3951280618828305e-06, + "loss": 0.0006, + "step": 7413 + }, + { + "epoch": 3.373066424021838, + "grad_norm": 0.1891045428343446, + "learning_rate": 2.393908160823335e-06, + "loss": 0.0022, + "step": 7414 + }, + { + "epoch": 3.373521383075523, + "grad_norm": 0.2432783585012321, + "learning_rate": 2.3926884727218975e-06, + "loss": 0.0021, + "step": 7415 + }, + { + "epoch": 3.3739763421292084, + "grad_norm": 0.34727501362713886, + "learning_rate": 2.3914689976781807e-06, + "loss": 0.0069, + "step": 7416 + }, + { + "epoch": 3.3744313011828937, + "grad_norm": 0.28857604910638013, + "learning_rate": 2.3902497357918404e-06, + "loss": 0.0027, + "step": 7417 + }, + { + "epoch": 3.3748862602365786, + "grad_norm": 0.24525753408560597, + "learning_rate": 2.3890306871625058e-06, + "loss": 0.004, + "step": 7418 + }, + { + "epoch": 3.375341219290264, + "grad_norm": 0.2027320126370087, + "learning_rate": 2.3878118518897905e-06, + "loss": 0.0055, + "step": 7419 + }, + { + "epoch": 3.375796178343949, + "grad_norm": 0.20894508240817183, + "learning_rate": 2.3865932300732954e-06, + "loss": 0.0028, + "step": 7420 + }, + { + "epoch": 3.376251137397634, + "grad_norm": 0.26394897429274966, + "learning_rate": 2.3853748218126e-06, + "loss": 0.0027, + "step": 7421 + }, + { + "epoch": 3.3767060964513194, + "grad_norm": 0.2262767108381828, + "learning_rate": 2.384156627207267e-06, + "loss": 0.0019, + "step": 7422 + }, + { + "epoch": 3.3771610555050047, + "grad_norm": 0.15183545450199315, + "learning_rate": 2.3829386463568388e-06, + "loss": 0.0017, + "step": 7423 + }, + { + "epoch": 3.3776160145586895, + "grad_norm": 0.1648028546920776, + "learning_rate": 2.3817208793608467e-06, + "loss": 0.0016, + "step": 7424 + }, + { + "epoch": 3.378070973612375, + "grad_norm": 0.1332240175041799, + "learning_rate": 2.380503326318801e-06, + "loss": 0.0014, + "step": 7425 + }, + { + "epoch": 3.37852593266606, + "grad_norm": 0.05527216943138619, + "learning_rate": 2.379285987330195e-06, + "loss": 0.0006, + "step": 7426 + }, + { + "epoch": 3.3789808917197455, + "grad_norm": 0.22076078562524457, + "learning_rate": 2.3780688624945026e-06, + "loss": 0.002, + "step": 7427 + }, + { + "epoch": 3.3794358507734303, + "grad_norm": 0.22583059364757874, + "learning_rate": 2.3768519519111804e-06, + "loss": 0.007, + "step": 7428 + }, + { + "epoch": 3.3798908098271156, + "grad_norm": 0.24204613519324628, + "learning_rate": 2.3756352556796726e-06, + "loss": 0.0026, + "step": 7429 + }, + { + "epoch": 3.380345768880801, + "grad_norm": 0.19870045843796788, + "learning_rate": 2.374418773899398e-06, + "loss": 0.0037, + "step": 7430 + }, + { + "epoch": 3.380800727934486, + "grad_norm": 0.1063779813856768, + "learning_rate": 2.3732025066697667e-06, + "loss": 0.001, + "step": 7431 + }, + { + "epoch": 3.381255686988171, + "grad_norm": 0.26323668100938086, + "learning_rate": 2.3719864540901634e-06, + "loss": 0.0038, + "step": 7432 + }, + { + "epoch": 3.3817106460418564, + "grad_norm": 0.24662734188438992, + "learning_rate": 2.3707706162599573e-06, + "loss": 0.0074, + "step": 7433 + }, + { + "epoch": 3.3821656050955413, + "grad_norm": 0.27595005994695637, + "learning_rate": 2.369554993278505e-06, + "loss": 0.0033, + "step": 7434 + }, + { + "epoch": 3.3826205641492266, + "grad_norm": 0.127101063642665, + "learning_rate": 2.3683395852451396e-06, + "loss": 0.0007, + "step": 7435 + }, + { + "epoch": 3.383075523202912, + "grad_norm": 0.1374146327909923, + "learning_rate": 2.367124392259179e-06, + "loss": 0.0008, + "step": 7436 + }, + { + "epoch": 3.383530482256597, + "grad_norm": 0.13515885020106613, + "learning_rate": 2.3659094144199214e-06, + "loss": 0.0018, + "step": 7437 + }, + { + "epoch": 3.383985441310282, + "grad_norm": 0.40005077561699465, + "learning_rate": 2.3646946518266522e-06, + "loss": 0.0046, + "step": 7438 + }, + { + "epoch": 3.3844404003639674, + "grad_norm": 0.11524035274942056, + "learning_rate": 2.3634801045786338e-06, + "loss": 0.0015, + "step": 7439 + }, + { + "epoch": 3.3848953594176523, + "grad_norm": 0.1960265772957544, + "learning_rate": 2.362265772775116e-06, + "loss": 0.0017, + "step": 7440 + }, + { + "epoch": 3.3853503184713376, + "grad_norm": 0.2299423310415387, + "learning_rate": 2.361051656515328e-06, + "loss": 0.002, + "step": 7441 + }, + { + "epoch": 3.385805277525023, + "grad_norm": 0.22255715423427594, + "learning_rate": 2.3598377558984814e-06, + "loss": 0.0029, + "step": 7442 + }, + { + "epoch": 3.386260236578708, + "grad_norm": 0.430756961990995, + "learning_rate": 2.3586240710237685e-06, + "loss": 0.011, + "step": 7443 + }, + { + "epoch": 3.386715195632393, + "grad_norm": 0.03173160400022317, + "learning_rate": 2.3574106019903673e-06, + "loss": 0.0003, + "step": 7444 + }, + { + "epoch": 3.3871701546860784, + "grad_norm": 0.2615814142769935, + "learning_rate": 2.356197348897441e-06, + "loss": 0.0068, + "step": 7445 + }, + { + "epoch": 3.3876251137397633, + "grad_norm": 0.3088409936110604, + "learning_rate": 2.3549843118441275e-06, + "loss": 0.0057, + "step": 7446 + }, + { + "epoch": 3.3880800727934486, + "grad_norm": 0.17585992559994212, + "learning_rate": 2.3537714909295513e-06, + "loss": 0.0026, + "step": 7447 + }, + { + "epoch": 3.388535031847134, + "grad_norm": 0.30041111394092374, + "learning_rate": 2.3525588862528163e-06, + "loss": 0.0031, + "step": 7448 + }, + { + "epoch": 3.3889899909008188, + "grad_norm": 0.6346439900667901, + "learning_rate": 2.351346497913016e-06, + "loss": 0.0198, + "step": 7449 + }, + { + "epoch": 3.389444949954504, + "grad_norm": 0.4954806140291429, + "learning_rate": 2.350134326009218e-06, + "loss": 0.0033, + "step": 7450 + }, + { + "epoch": 3.3898999090081894, + "grad_norm": 0.3460371370826556, + "learning_rate": 2.348922370640475e-06, + "loss": 0.0059, + "step": 7451 + }, + { + "epoch": 3.3903548680618742, + "grad_norm": 0.41469027172683753, + "learning_rate": 2.347710631905825e-06, + "loss": 0.0083, + "step": 7452 + }, + { + "epoch": 3.3908098271155596, + "grad_norm": 0.29431864933926416, + "learning_rate": 2.3464991099042826e-06, + "loss": 0.0043, + "step": 7453 + }, + { + "epoch": 3.391264786169245, + "grad_norm": 0.21397868198234682, + "learning_rate": 2.345287804734852e-06, + "loss": 0.0029, + "step": 7454 + }, + { + "epoch": 3.3917197452229297, + "grad_norm": 0.21561317801769855, + "learning_rate": 2.3440767164965137e-06, + "loss": 0.0028, + "step": 7455 + }, + { + "epoch": 3.392174704276615, + "grad_norm": 0.3482130736958066, + "learning_rate": 2.342865845288232e-06, + "loss": 0.0037, + "step": 7456 + }, + { + "epoch": 3.3926296633303004, + "grad_norm": 0.24450700003734802, + "learning_rate": 2.3416551912089513e-06, + "loss": 0.0042, + "step": 7457 + }, + { + "epoch": 3.3930846223839852, + "grad_norm": 0.13762153687268255, + "learning_rate": 2.340444754357604e-06, + "loss": 0.0017, + "step": 7458 + }, + { + "epoch": 3.3935395814376705, + "grad_norm": 0.44899251478874885, + "learning_rate": 2.339234534833103e-06, + "loss": 0.0051, + "step": 7459 + }, + { + "epoch": 3.393994540491356, + "grad_norm": 0.4440731458246905, + "learning_rate": 2.33802453273434e-06, + "loss": 0.0094, + "step": 7460 + }, + { + "epoch": 3.3944494995450407, + "grad_norm": 0.32028308463595856, + "learning_rate": 2.33681474816019e-06, + "loss": 0.004, + "step": 7461 + }, + { + "epoch": 3.394904458598726, + "grad_norm": 0.19976710451337512, + "learning_rate": 2.3356051812095104e-06, + "loss": 0.0078, + "step": 7462 + }, + { + "epoch": 3.3953594176524113, + "grad_norm": 0.10010372295754415, + "learning_rate": 2.334395831981145e-06, + "loss": 0.0017, + "step": 7463 + }, + { + "epoch": 3.395814376706096, + "grad_norm": 0.31711059250387985, + "learning_rate": 2.3331867005739127e-06, + "loss": 0.0023, + "step": 7464 + }, + { + "epoch": 3.3962693357597815, + "grad_norm": 0.3296500083697068, + "learning_rate": 2.3319777870866217e-06, + "loss": 0.0042, + "step": 7465 + }, + { + "epoch": 3.396724294813467, + "grad_norm": 0.31665319933384933, + "learning_rate": 2.3307690916180575e-06, + "loss": 0.0026, + "step": 7466 + }, + { + "epoch": 3.397179253867152, + "grad_norm": 0.2826439077134469, + "learning_rate": 2.329560614266987e-06, + "loss": 0.0037, + "step": 7467 + }, + { + "epoch": 3.397634212920837, + "grad_norm": 0.2984721192995415, + "learning_rate": 2.328352355132165e-06, + "loss": 0.0074, + "step": 7468 + }, + { + "epoch": 3.3980891719745223, + "grad_norm": 0.1621392575248319, + "learning_rate": 2.327144314312324e-06, + "loss": 0.002, + "step": 7469 + }, + { + "epoch": 3.3985441310282076, + "grad_norm": 0.2551895988159972, + "learning_rate": 2.3259364919061795e-06, + "loss": 0.0049, + "step": 7470 + }, + { + "epoch": 3.3989990900818925, + "grad_norm": 0.2108534720772892, + "learning_rate": 2.3247288880124265e-06, + "loss": 0.0015, + "step": 7471 + }, + { + "epoch": 3.399454049135578, + "grad_norm": 0.13524097984038214, + "learning_rate": 2.323521502729747e-06, + "loss": 0.0023, + "step": 7472 + }, + { + "epoch": 3.399909008189263, + "grad_norm": 0.1704708922074107, + "learning_rate": 2.322314336156806e-06, + "loss": 0.002, + "step": 7473 + }, + { + "epoch": 3.400363967242948, + "grad_norm": 0.1267406621929312, + "learning_rate": 2.3211073883922447e-06, + "loss": 0.0019, + "step": 7474 + }, + { + "epoch": 3.4008189262966333, + "grad_norm": 0.3100928218013081, + "learning_rate": 2.31990065953469e-06, + "loss": 0.0046, + "step": 7475 + }, + { + "epoch": 3.4012738853503186, + "grad_norm": 0.20746714353843193, + "learning_rate": 2.31869414968275e-06, + "loss": 0.0032, + "step": 7476 + }, + { + "epoch": 3.4017288444040035, + "grad_norm": 0.11569388270920063, + "learning_rate": 2.3174878589350135e-06, + "loss": 0.0015, + "step": 7477 + }, + { + "epoch": 3.402183803457689, + "grad_norm": 0.39858592777381013, + "learning_rate": 2.3162817873900556e-06, + "loss": 0.0046, + "step": 7478 + }, + { + "epoch": 3.402638762511374, + "grad_norm": 0.2415748693095528, + "learning_rate": 2.315075935146432e-06, + "loss": 0.0037, + "step": 7479 + }, + { + "epoch": 3.403093721565059, + "grad_norm": 0.16769446953265724, + "learning_rate": 2.3138703023026775e-06, + "loss": 0.002, + "step": 7480 + }, + { + "epoch": 3.4035486806187443, + "grad_norm": 0.19158668731085754, + "learning_rate": 2.3126648889573124e-06, + "loss": 0.002, + "step": 7481 + }, + { + "epoch": 3.4040036396724296, + "grad_norm": 0.1710038402115662, + "learning_rate": 2.311459695208834e-06, + "loss": 0.0041, + "step": 7482 + }, + { + "epoch": 3.404458598726115, + "grad_norm": 0.12239022636371179, + "learning_rate": 2.31025472115573e-06, + "loss": 0.0012, + "step": 7483 + }, + { + "epoch": 3.4049135577797998, + "grad_norm": 0.20696613435734043, + "learning_rate": 2.3090499668964637e-06, + "loss": 0.0035, + "step": 7484 + }, + { + "epoch": 3.405368516833485, + "grad_norm": 0.5902100957614066, + "learning_rate": 2.3078454325294797e-06, + "loss": 0.0055, + "step": 7485 + }, + { + "epoch": 3.4058234758871704, + "grad_norm": 0.2742027427827176, + "learning_rate": 2.3066411181532113e-06, + "loss": 0.004, + "step": 7486 + }, + { + "epoch": 3.4062784349408552, + "grad_norm": 0.14160244714288026, + "learning_rate": 2.3054370238660655e-06, + "loss": 0.0028, + "step": 7487 + }, + { + "epoch": 3.4067333939945406, + "grad_norm": 0.10936957664965745, + "learning_rate": 2.30423314976644e-06, + "loss": 0.001, + "step": 7488 + }, + { + "epoch": 3.407188353048226, + "grad_norm": 0.27197434499031803, + "learning_rate": 2.3030294959527073e-06, + "loss": 0.0053, + "step": 7489 + }, + { + "epoch": 3.4076433121019107, + "grad_norm": 0.31420550405197456, + "learning_rate": 2.3018260625232246e-06, + "loss": 0.0088, + "step": 7490 + }, + { + "epoch": 3.408098271155596, + "grad_norm": 0.35484494683380813, + "learning_rate": 2.3006228495763295e-06, + "loss": 0.0057, + "step": 7491 + }, + { + "epoch": 3.4085532302092814, + "grad_norm": 0.29627024364808935, + "learning_rate": 2.299419857210345e-06, + "loss": 0.0025, + "step": 7492 + }, + { + "epoch": 3.4090081892629662, + "grad_norm": 0.3488972186549872, + "learning_rate": 2.298217085523576e-06, + "loss": 0.0076, + "step": 7493 + }, + { + "epoch": 3.4094631483166515, + "grad_norm": 0.2041676568134566, + "learning_rate": 2.2970145346143045e-06, + "loss": 0.0022, + "step": 7494 + }, + { + "epoch": 3.409918107370337, + "grad_norm": 0.3644252519731684, + "learning_rate": 2.2958122045808002e-06, + "loss": 0.0073, + "step": 7495 + }, + { + "epoch": 3.4103730664240217, + "grad_norm": 0.17796643268697715, + "learning_rate": 2.294610095521308e-06, + "loss": 0.0012, + "step": 7496 + }, + { + "epoch": 3.410828025477707, + "grad_norm": 0.26542944580127786, + "learning_rate": 2.293408207534063e-06, + "loss": 0.0015, + "step": 7497 + }, + { + "epoch": 3.4112829845313923, + "grad_norm": 0.02401730874842608, + "learning_rate": 2.2922065407172767e-06, + "loss": 0.0003, + "step": 7498 + }, + { + "epoch": 3.411737943585077, + "grad_norm": 0.15404227228074097, + "learning_rate": 2.2910050951691416e-06, + "loss": 0.0029, + "step": 7499 + }, + { + "epoch": 3.4121929026387625, + "grad_norm": 0.08787003145192555, + "learning_rate": 2.2898038709878386e-06, + "loss": 0.0006, + "step": 7500 + }, + { + "epoch": 3.412647861692448, + "grad_norm": 0.2818577725151577, + "learning_rate": 2.2886028682715217e-06, + "loss": 0.0033, + "step": 7501 + }, + { + "epoch": 3.4131028207461327, + "grad_norm": 0.18935098847517642, + "learning_rate": 2.287402087118336e-06, + "loss": 0.0024, + "step": 7502 + }, + { + "epoch": 3.413557779799818, + "grad_norm": 0.13854282299343776, + "learning_rate": 2.2862015276264016e-06, + "loss": 0.0029, + "step": 7503 + }, + { + "epoch": 3.4140127388535033, + "grad_norm": 0.3039917831277023, + "learning_rate": 2.2850011898938236e-06, + "loss": 0.0024, + "step": 7504 + }, + { + "epoch": 3.414467697907188, + "grad_norm": 0.23879128032948413, + "learning_rate": 2.283801074018685e-06, + "loss": 0.0045, + "step": 7505 + }, + { + "epoch": 3.4149226569608735, + "grad_norm": 0.40633003185693745, + "learning_rate": 2.2826011800990567e-06, + "loss": 0.0053, + "step": 7506 + }, + { + "epoch": 3.415377616014559, + "grad_norm": 0.16206510040827052, + "learning_rate": 2.28140150823299e-06, + "loss": 0.0031, + "step": 7507 + }, + { + "epoch": 3.4158325750682437, + "grad_norm": 0.14917788836790466, + "learning_rate": 2.280202058518515e-06, + "loss": 0.0022, + "step": 7508 + }, + { + "epoch": 3.416287534121929, + "grad_norm": 0.3271005830438918, + "learning_rate": 2.279002831053645e-06, + "loss": 0.006, + "step": 7509 + }, + { + "epoch": 3.4167424931756143, + "grad_norm": 0.07984297227321478, + "learning_rate": 2.277803825936376e-06, + "loss": 0.0005, + "step": 7510 + }, + { + "epoch": 3.417197452229299, + "grad_norm": 0.06900928897055819, + "learning_rate": 2.2766050432646835e-06, + "loss": 0.0007, + "step": 7511 + }, + { + "epoch": 3.4176524112829845, + "grad_norm": 0.24834426694486775, + "learning_rate": 2.2754064831365296e-06, + "loss": 0.0027, + "step": 7512 + }, + { + "epoch": 3.41810737033667, + "grad_norm": 0.07036224443674961, + "learning_rate": 2.2742081456498517e-06, + "loss": 0.0007, + "step": 7513 + }, + { + "epoch": 3.4185623293903546, + "grad_norm": 0.1131181170368926, + "learning_rate": 2.2730100309025765e-06, + "loss": 0.0009, + "step": 7514 + }, + { + "epoch": 3.41901728844404, + "grad_norm": 0.2982112452094316, + "learning_rate": 2.271812138992607e-06, + "loss": 0.006, + "step": 7515 + }, + { + "epoch": 3.4194722474977253, + "grad_norm": 0.2993485471057614, + "learning_rate": 2.270614470017827e-06, + "loss": 0.0043, + "step": 7516 + }, + { + "epoch": 3.41992720655141, + "grad_norm": 0.37038025888018483, + "learning_rate": 2.2694170240761086e-06, + "loss": 0.0048, + "step": 7517 + }, + { + "epoch": 3.4203821656050954, + "grad_norm": 0.23874094536173626, + "learning_rate": 2.2682198012653e-06, + "loss": 0.0026, + "step": 7518 + }, + { + "epoch": 3.4208371246587808, + "grad_norm": 0.22512967728517105, + "learning_rate": 2.2670228016832325e-06, + "loss": 0.0017, + "step": 7519 + }, + { + "epoch": 3.421292083712466, + "grad_norm": 0.28465327850834565, + "learning_rate": 2.2658260254277176e-06, + "loss": 0.0043, + "step": 7520 + }, + { + "epoch": 3.421747042766151, + "grad_norm": 0.17985399793542411, + "learning_rate": 2.2646294725965522e-06, + "loss": 0.003, + "step": 7521 + }, + { + "epoch": 3.4222020018198362, + "grad_norm": 0.11797323942667824, + "learning_rate": 2.2634331432875163e-06, + "loss": 0.0011, + "step": 7522 + }, + { + "epoch": 3.4226569608735216, + "grad_norm": 0.04658003610474827, + "learning_rate": 2.262237037598365e-06, + "loss": 0.0005, + "step": 7523 + }, + { + "epoch": 3.4231119199272064, + "grad_norm": 0.2269317789310602, + "learning_rate": 2.261041155626839e-06, + "loss": 0.0042, + "step": 7524 + }, + { + "epoch": 3.4235668789808917, + "grad_norm": 0.20585873166901278, + "learning_rate": 2.2598454974706595e-06, + "loss": 0.0022, + "step": 7525 + }, + { + "epoch": 3.424021838034577, + "grad_norm": 0.028660333040078968, + "learning_rate": 2.2586500632275333e-06, + "loss": 0.0002, + "step": 7526 + }, + { + "epoch": 3.424476797088262, + "grad_norm": 0.17937247856404254, + "learning_rate": 2.2574548529951423e-06, + "loss": 0.0021, + "step": 7527 + }, + { + "epoch": 3.4249317561419472, + "grad_norm": 0.36347026246348846, + "learning_rate": 2.256259866871157e-06, + "loss": 0.0088, + "step": 7528 + }, + { + "epoch": 3.4253867151956325, + "grad_norm": 0.29857263800353295, + "learning_rate": 2.2550651049532253e-06, + "loss": 0.004, + "step": 7529 + }, + { + "epoch": 3.4258416742493174, + "grad_norm": 0.2619645740528489, + "learning_rate": 2.2538705673389747e-06, + "loss": 0.0024, + "step": 7530 + }, + { + "epoch": 3.4262966333030027, + "grad_norm": 0.37703136795917297, + "learning_rate": 2.252676254126022e-06, + "loss": 0.0024, + "step": 7531 + }, + { + "epoch": 3.426751592356688, + "grad_norm": 0.09039222248055129, + "learning_rate": 2.251482165411959e-06, + "loss": 0.0005, + "step": 7532 + }, + { + "epoch": 3.427206551410373, + "grad_norm": 0.27748057243558133, + "learning_rate": 2.2502883012943614e-06, + "loss": 0.0033, + "step": 7533 + }, + { + "epoch": 3.427661510464058, + "grad_norm": 0.28617041874724686, + "learning_rate": 2.2490946618707844e-06, + "loss": 0.0039, + "step": 7534 + }, + { + "epoch": 3.4281164695177435, + "grad_norm": 0.47224638433142235, + "learning_rate": 2.2479012472387685e-06, + "loss": 0.007, + "step": 7535 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.0772165856397902, + "learning_rate": 2.2467080574958365e-06, + "loss": 0.0006, + "step": 7536 + }, + { + "epoch": 3.4290263876251137, + "grad_norm": 0.3347847903742624, + "learning_rate": 2.245515092739488e-06, + "loss": 0.0125, + "step": 7537 + }, + { + "epoch": 3.429481346678799, + "grad_norm": 0.48537822056152635, + "learning_rate": 2.244322353067207e-06, + "loss": 0.0038, + "step": 7538 + }, + { + "epoch": 3.4299363057324843, + "grad_norm": 0.29186139958278556, + "learning_rate": 2.2431298385764565e-06, + "loss": 0.0041, + "step": 7539 + }, + { + "epoch": 3.430391264786169, + "grad_norm": 0.2796719230790243, + "learning_rate": 2.241937549364688e-06, + "loss": 0.0032, + "step": 7540 + }, + { + "epoch": 3.4308462238398545, + "grad_norm": 0.14853241686614696, + "learning_rate": 2.240745485529326e-06, + "loss": 0.002, + "step": 7541 + }, + { + "epoch": 3.43130118289354, + "grad_norm": 0.23798983423878695, + "learning_rate": 2.2395536471677835e-06, + "loss": 0.0018, + "step": 7542 + }, + { + "epoch": 3.4317561419472247, + "grad_norm": 0.20385346619196504, + "learning_rate": 2.2383620343774516e-06, + "loss": 0.0018, + "step": 7543 + }, + { + "epoch": 3.43221110100091, + "grad_norm": 0.23529455413456232, + "learning_rate": 2.2371706472557026e-06, + "loss": 0.0013, + "step": 7544 + }, + { + "epoch": 3.4326660600545953, + "grad_norm": 0.0993919378156682, + "learning_rate": 2.2359794858998894e-06, + "loss": 0.0012, + "step": 7545 + }, + { + "epoch": 3.43312101910828, + "grad_norm": 0.17278011269367857, + "learning_rate": 2.2347885504073525e-06, + "loss": 0.0033, + "step": 7546 + }, + { + "epoch": 3.4335759781619655, + "grad_norm": 0.292090133582803, + "learning_rate": 2.233597840875407e-06, + "loss": 0.0069, + "step": 7547 + }, + { + "epoch": 3.434030937215651, + "grad_norm": 0.28677597066255733, + "learning_rate": 2.232407357401352e-06, + "loss": 0.0052, + "step": 7548 + }, + { + "epoch": 3.4344858962693356, + "grad_norm": 0.16510230830411007, + "learning_rate": 2.231217100082471e-06, + "loss": 0.0024, + "step": 7549 + }, + { + "epoch": 3.434940855323021, + "grad_norm": 0.21829917839424848, + "learning_rate": 2.230027069016023e-06, + "loss": 0.0037, + "step": 7550 + }, + { + "epoch": 3.4353958143767063, + "grad_norm": 0.18653216734594502, + "learning_rate": 2.2288372642992557e-06, + "loss": 0.0018, + "step": 7551 + }, + { + "epoch": 3.435850773430391, + "grad_norm": 0.3865186293895805, + "learning_rate": 2.227647686029392e-06, + "loss": 0.0043, + "step": 7552 + }, + { + "epoch": 3.4363057324840764, + "grad_norm": 0.21625871006312322, + "learning_rate": 2.2264583343036406e-06, + "loss": 0.0049, + "step": 7553 + }, + { + "epoch": 3.4367606915377618, + "grad_norm": 0.20052999210861674, + "learning_rate": 2.2252692092191864e-06, + "loss": 0.002, + "step": 7554 + }, + { + "epoch": 3.4372156505914466, + "grad_norm": 0.32467840049268315, + "learning_rate": 2.2240803108732024e-06, + "loss": 0.0059, + "step": 7555 + }, + { + "epoch": 3.437670609645132, + "grad_norm": 0.1844286177408229, + "learning_rate": 2.2228916393628407e-06, + "loss": 0.0015, + "step": 7556 + }, + { + "epoch": 3.4381255686988172, + "grad_norm": 0.2418682440631349, + "learning_rate": 2.2217031947852336e-06, + "loss": 0.0032, + "step": 7557 + }, + { + "epoch": 3.438580527752502, + "grad_norm": 0.4792988592408678, + "learning_rate": 2.220514977237494e-06, + "loss": 0.0089, + "step": 7558 + }, + { + "epoch": 3.4390354868061874, + "grad_norm": 0.38816660052508606, + "learning_rate": 2.219326986816717e-06, + "loss": 0.0093, + "step": 7559 + }, + { + "epoch": 3.4394904458598727, + "grad_norm": 0.12074161677617608, + "learning_rate": 2.218139223619983e-06, + "loss": 0.001, + "step": 7560 + }, + { + "epoch": 3.4399454049135576, + "grad_norm": 0.34773008386237036, + "learning_rate": 2.2169516877443487e-06, + "loss": 0.0048, + "step": 7561 + }, + { + "epoch": 3.440400363967243, + "grad_norm": 0.423207554559181, + "learning_rate": 2.215764379286853e-06, + "loss": 0.0064, + "step": 7562 + }, + { + "epoch": 3.4408553230209282, + "grad_norm": 0.04242146456268766, + "learning_rate": 2.21457729834452e-06, + "loss": 0.0004, + "step": 7563 + }, + { + "epoch": 3.441310282074613, + "grad_norm": 1.2470397646639153, + "learning_rate": 2.2133904450143502e-06, + "loss": 0.0155, + "step": 7564 + }, + { + "epoch": 3.4417652411282984, + "grad_norm": 0.13792975474096358, + "learning_rate": 2.2122038193933297e-06, + "loss": 0.0023, + "step": 7565 + }, + { + "epoch": 3.4422202001819837, + "grad_norm": 0.09370130857618474, + "learning_rate": 2.211017421578425e-06, + "loss": 0.0011, + "step": 7566 + }, + { + "epoch": 3.4426751592356686, + "grad_norm": 0.23737595228068492, + "learning_rate": 2.2098312516665806e-06, + "loss": 0.0054, + "step": 7567 + }, + { + "epoch": 3.443130118289354, + "grad_norm": 0.2664383212374511, + "learning_rate": 2.2086453097547244e-06, + "loss": 0.003, + "step": 7568 + }, + { + "epoch": 3.443585077343039, + "grad_norm": 0.16132888935395276, + "learning_rate": 2.2074595959397675e-06, + "loss": 0.0028, + "step": 7569 + }, + { + "epoch": 3.444040036396724, + "grad_norm": 0.5772967328366053, + "learning_rate": 2.2062741103186037e-06, + "loss": 0.0108, + "step": 7570 + }, + { + "epoch": 3.4444949954504094, + "grad_norm": 0.3199739605964402, + "learning_rate": 2.205088852988103e-06, + "loss": 0.0082, + "step": 7571 + }, + { + "epoch": 3.4449499545040947, + "grad_norm": 0.38774756555729817, + "learning_rate": 2.20390382404512e-06, + "loss": 0.0092, + "step": 7572 + }, + { + "epoch": 3.4454049135577796, + "grad_norm": 0.1104623776806025, + "learning_rate": 2.2027190235864875e-06, + "loss": 0.0008, + "step": 7573 + }, + { + "epoch": 3.445859872611465, + "grad_norm": 0.10013143827437533, + "learning_rate": 2.201534451709025e-06, + "loss": 0.0011, + "step": 7574 + }, + { + "epoch": 3.44631483166515, + "grad_norm": 0.2688118064240035, + "learning_rate": 2.20035010850953e-06, + "loss": 0.0021, + "step": 7575 + }, + { + "epoch": 3.4467697907188355, + "grad_norm": 0.2948865089535925, + "learning_rate": 2.1991659940847797e-06, + "loss": 0.0032, + "step": 7576 + }, + { + "epoch": 3.4472247497725204, + "grad_norm": 0.4027637446370561, + "learning_rate": 2.197982108531537e-06, + "loss": 0.003, + "step": 7577 + }, + { + "epoch": 3.4476797088262057, + "grad_norm": 0.081179246782656, + "learning_rate": 2.1967984519465414e-06, + "loss": 0.0006, + "step": 7578 + }, + { + "epoch": 3.448134667879891, + "grad_norm": 0.2759429853936018, + "learning_rate": 2.1956150244265184e-06, + "loss": 0.008, + "step": 7579 + }, + { + "epoch": 3.448589626933576, + "grad_norm": 0.06454804157775773, + "learning_rate": 2.1944318260681715e-06, + "loss": 0.0004, + "step": 7580 + }, + { + "epoch": 3.449044585987261, + "grad_norm": 0.22803415435311675, + "learning_rate": 2.193248856968185e-06, + "loss": 0.0012, + "step": 7581 + }, + { + "epoch": 3.4494995450409465, + "grad_norm": 0.03709175174358703, + "learning_rate": 2.192066117223228e-06, + "loss": 0.0003, + "step": 7582 + }, + { + "epoch": 3.4499545040946313, + "grad_norm": 0.22274667973839699, + "learning_rate": 2.190883606929945e-06, + "loss": 0.0027, + "step": 7583 + }, + { + "epoch": 3.4504094631483166, + "grad_norm": 0.27241348710092617, + "learning_rate": 2.1897013261849678e-06, + "loss": 0.0042, + "step": 7584 + }, + { + "epoch": 3.450864422202002, + "grad_norm": 0.2061567636885349, + "learning_rate": 2.1885192750849087e-06, + "loss": 0.0021, + "step": 7585 + }, + { + "epoch": 3.451319381255687, + "grad_norm": 0.25342431366619456, + "learning_rate": 2.187337453726358e-06, + "loss": 0.002, + "step": 7586 + }, + { + "epoch": 3.451774340309372, + "grad_norm": 0.20710263181165453, + "learning_rate": 2.186155862205889e-06, + "loss": 0.0021, + "step": 7587 + }, + { + "epoch": 3.4522292993630574, + "grad_norm": 0.08750458274873917, + "learning_rate": 2.1849745006200536e-06, + "loss": 0.0005, + "step": 7588 + }, + { + "epoch": 3.4526842584167423, + "grad_norm": 0.14585580421027522, + "learning_rate": 2.183793369065391e-06, + "loss": 0.0033, + "step": 7589 + }, + { + "epoch": 3.4531392174704276, + "grad_norm": 0.2129683517211805, + "learning_rate": 2.182612467638415e-06, + "loss": 0.0034, + "step": 7590 + }, + { + "epoch": 3.453594176524113, + "grad_norm": 0.18623176925640242, + "learning_rate": 2.181431796435627e-06, + "loss": 0.002, + "step": 7591 + }, + { + "epoch": 3.4540491355777982, + "grad_norm": 0.3411416919071153, + "learning_rate": 2.1802513555535038e-06, + "loss": 0.0074, + "step": 7592 + }, + { + "epoch": 3.454504094631483, + "grad_norm": 0.06228605865594398, + "learning_rate": 2.1790711450885038e-06, + "loss": 0.0007, + "step": 7593 + }, + { + "epoch": 3.4549590536851684, + "grad_norm": 0.20682244288660823, + "learning_rate": 2.1778911651370728e-06, + "loss": 0.003, + "step": 7594 + }, + { + "epoch": 3.4554140127388537, + "grad_norm": 0.37674912856008497, + "learning_rate": 2.176711415795631e-06, + "loss": 0.0049, + "step": 7595 + }, + { + "epoch": 3.4558689717925386, + "grad_norm": 0.20956696853928675, + "learning_rate": 2.1755318971605828e-06, + "loss": 0.0048, + "step": 7596 + }, + { + "epoch": 3.456323930846224, + "grad_norm": 0.13996537141149556, + "learning_rate": 2.1743526093283106e-06, + "loss": 0.0014, + "step": 7597 + }, + { + "epoch": 3.4567788898999092, + "grad_norm": 0.11111230907779905, + "learning_rate": 2.1731735523951832e-06, + "loss": 0.0013, + "step": 7598 + }, + { + "epoch": 3.457233848953594, + "grad_norm": 0.3765596737913546, + "learning_rate": 2.1719947264575484e-06, + "loss": 0.0091, + "step": 7599 + }, + { + "epoch": 3.4576888080072794, + "grad_norm": 0.21079256008288408, + "learning_rate": 2.1708161316117338e-06, + "loss": 0.0021, + "step": 7600 + }, + { + "epoch": 3.4581437670609647, + "grad_norm": 0.053408008752552555, + "learning_rate": 2.169637767954048e-06, + "loss": 0.0006, + "step": 7601 + }, + { + "epoch": 3.4585987261146496, + "grad_norm": 0.40635097820663696, + "learning_rate": 2.1684596355807807e-06, + "loss": 0.0068, + "step": 7602 + }, + { + "epoch": 3.459053685168335, + "grad_norm": 0.1806692460087327, + "learning_rate": 2.167281734588207e-06, + "loss": 0.0027, + "step": 7603 + }, + { + "epoch": 3.45950864422202, + "grad_norm": 0.4497160257864994, + "learning_rate": 2.166104065072575e-06, + "loss": 0.0101, + "step": 7604 + }, + { + "epoch": 3.459963603275705, + "grad_norm": 0.1287875528761719, + "learning_rate": 2.164926627130123e-06, + "loss": 0.0021, + "step": 7605 + }, + { + "epoch": 3.4604185623293904, + "grad_norm": 0.19435939703400862, + "learning_rate": 2.163749420857064e-06, + "loss": 0.0013, + "step": 7606 + }, + { + "epoch": 3.4608735213830757, + "grad_norm": 0.3117927694283147, + "learning_rate": 2.162572446349592e-06, + "loss": 0.0036, + "step": 7607 + }, + { + "epoch": 3.4613284804367606, + "grad_norm": 0.20144599399754456, + "learning_rate": 2.161395703703888e-06, + "loss": 0.0026, + "step": 7608 + }, + { + "epoch": 3.461783439490446, + "grad_norm": 0.20076451955079833, + "learning_rate": 2.160219193016108e-06, + "loss": 0.0047, + "step": 7609 + }, + { + "epoch": 3.462238398544131, + "grad_norm": 0.2855118399669933, + "learning_rate": 2.159042914382391e-06, + "loss": 0.0078, + "step": 7610 + }, + { + "epoch": 3.462693357597816, + "grad_norm": 0.2858193240528223, + "learning_rate": 2.1578668678988556e-06, + "loss": 0.0023, + "step": 7611 + }, + { + "epoch": 3.4631483166515014, + "grad_norm": 0.35197519842563885, + "learning_rate": 2.1566910536616052e-06, + "loss": 0.0031, + "step": 7612 + }, + { + "epoch": 3.4636032757051867, + "grad_norm": 0.6211274895977381, + "learning_rate": 2.155515471766723e-06, + "loss": 0.0085, + "step": 7613 + }, + { + "epoch": 3.4640582347588715, + "grad_norm": 0.1082607290666677, + "learning_rate": 2.154340122310271e-06, + "loss": 0.0014, + "step": 7614 + }, + { + "epoch": 3.464513193812557, + "grad_norm": 0.11953090664049322, + "learning_rate": 2.1531650053882934e-06, + "loss": 0.0014, + "step": 7615 + }, + { + "epoch": 3.464968152866242, + "grad_norm": 0.701244752820876, + "learning_rate": 2.151990121096816e-06, + "loss": 0.0082, + "step": 7616 + }, + { + "epoch": 3.465423111919927, + "grad_norm": 0.3539661775632979, + "learning_rate": 2.1508154695318417e-06, + "loss": 0.0054, + "step": 7617 + }, + { + "epoch": 3.4658780709736123, + "grad_norm": 0.1994463520339777, + "learning_rate": 2.1496410507893608e-06, + "loss": 0.0034, + "step": 7618 + }, + { + "epoch": 3.4663330300272976, + "grad_norm": 0.21445122631720664, + "learning_rate": 2.148466864965343e-06, + "loss": 0.001, + "step": 7619 + }, + { + "epoch": 3.4667879890809825, + "grad_norm": 0.5655065115031422, + "learning_rate": 2.147292912155735e-06, + "loss": 0.0091, + "step": 7620 + }, + { + "epoch": 3.467242948134668, + "grad_norm": 0.27343652443409633, + "learning_rate": 2.146119192456468e-06, + "loss": 0.0053, + "step": 7621 + }, + { + "epoch": 3.467697907188353, + "grad_norm": 0.3044766199111716, + "learning_rate": 2.1449457059634505e-06, + "loss": 0.0045, + "step": 7622 + }, + { + "epoch": 3.468152866242038, + "grad_norm": 0.20504372124105646, + "learning_rate": 2.1437724527725785e-06, + "loss": 0.0036, + "step": 7623 + }, + { + "epoch": 3.4686078252957233, + "grad_norm": 0.3428906131655945, + "learning_rate": 2.1425994329797233e-06, + "loss": 0.0038, + "step": 7624 + }, + { + "epoch": 3.4690627843494086, + "grad_norm": 0.32566883383029654, + "learning_rate": 2.1414266466807365e-06, + "loss": 0.0046, + "step": 7625 + }, + { + "epoch": 3.4695177434030935, + "grad_norm": 0.0828636824706321, + "learning_rate": 2.1402540939714565e-06, + "loss": 0.0004, + "step": 7626 + }, + { + "epoch": 3.469972702456779, + "grad_norm": 0.21200476841826338, + "learning_rate": 2.139081774947696e-06, + "loss": 0.0019, + "step": 7627 + }, + { + "epoch": 3.470427661510464, + "grad_norm": 0.2749722201481947, + "learning_rate": 2.1379096897052547e-06, + "loss": 0.0041, + "step": 7628 + }, + { + "epoch": 3.470882620564149, + "grad_norm": 0.2897640267601307, + "learning_rate": 2.136737838339909e-06, + "loss": 0.0036, + "step": 7629 + }, + { + "epoch": 3.4713375796178343, + "grad_norm": 0.3814692753140629, + "learning_rate": 2.135566220947416e-06, + "loss": 0.0112, + "step": 7630 + }, + { + "epoch": 3.4717925386715196, + "grad_norm": 0.2363076761425247, + "learning_rate": 2.1343948376235146e-06, + "loss": 0.0056, + "step": 7631 + }, + { + "epoch": 3.472247497725205, + "grad_norm": 0.7851429315806875, + "learning_rate": 2.1332236884639256e-06, + "loss": 0.0036, + "step": 7632 + }, + { + "epoch": 3.47270245677889, + "grad_norm": 0.21600176693922699, + "learning_rate": 2.1320527735643526e-06, + "loss": 0.0055, + "step": 7633 + }, + { + "epoch": 3.473157415832575, + "grad_norm": 0.21443155601202132, + "learning_rate": 2.1308820930204753e-06, + "loss": 0.0016, + "step": 7634 + }, + { + "epoch": 3.4736123748862604, + "grad_norm": 0.041329900558874755, + "learning_rate": 2.1297116469279566e-06, + "loss": 0.0004, + "step": 7635 + }, + { + "epoch": 3.4740673339399453, + "grad_norm": 0.42409122140892475, + "learning_rate": 2.128541435382438e-06, + "loss": 0.0041, + "step": 7636 + }, + { + "epoch": 3.4745222929936306, + "grad_norm": 0.36781015367786757, + "learning_rate": 2.127371458479548e-06, + "loss": 0.0097, + "step": 7637 + }, + { + "epoch": 3.474977252047316, + "grad_norm": 0.23064456548736212, + "learning_rate": 2.1262017163148895e-06, + "loss": 0.0053, + "step": 7638 + }, + { + "epoch": 3.4754322111010008, + "grad_norm": 0.06676349607223435, + "learning_rate": 2.1250322089840477e-06, + "loss": 0.0004, + "step": 7639 + }, + { + "epoch": 3.475887170154686, + "grad_norm": 0.25347342042534343, + "learning_rate": 2.1238629365825913e-06, + "loss": 0.003, + "step": 7640 + }, + { + "epoch": 3.4763421292083714, + "grad_norm": 0.236096168728993, + "learning_rate": 2.1226938992060658e-06, + "loss": 0.004, + "step": 7641 + }, + { + "epoch": 3.4767970882620562, + "grad_norm": 0.18380529756649028, + "learning_rate": 2.121525096950003e-06, + "loss": 0.0022, + "step": 7642 + }, + { + "epoch": 3.4772520473157416, + "grad_norm": 0.18749254168424617, + "learning_rate": 2.12035652990991e-06, + "loss": 0.0034, + "step": 7643 + }, + { + "epoch": 3.477707006369427, + "grad_norm": 0.4144093987784156, + "learning_rate": 2.1191881981812775e-06, + "loss": 0.0047, + "step": 7644 + }, + { + "epoch": 3.4781619654231117, + "grad_norm": 0.09958505928649422, + "learning_rate": 2.118020101859573e-06, + "loss": 0.0007, + "step": 7645 + }, + { + "epoch": 3.478616924476797, + "grad_norm": 0.06464789867528721, + "learning_rate": 2.116852241040252e-06, + "loss": 0.0005, + "step": 7646 + }, + { + "epoch": 3.4790718835304824, + "grad_norm": 0.27935140306616363, + "learning_rate": 2.115684615818747e-06, + "loss": 0.0072, + "step": 7647 + }, + { + "epoch": 3.4795268425841677, + "grad_norm": 0.27625905261306927, + "learning_rate": 2.1145172262904695e-06, + "loss": 0.0034, + "step": 7648 + }, + { + "epoch": 3.4799818016378525, + "grad_norm": 0.18352368089048088, + "learning_rate": 2.1133500725508138e-06, + "loss": 0.0015, + "step": 7649 + }, + { + "epoch": 3.480436760691538, + "grad_norm": 0.3003965541889959, + "learning_rate": 2.1121831546951523e-06, + "loss": 0.005, + "step": 7650 + }, + { + "epoch": 3.480891719745223, + "grad_norm": 0.3302318788975426, + "learning_rate": 2.1110164728188444e-06, + "loss": 0.0029, + "step": 7651 + }, + { + "epoch": 3.481346678798908, + "grad_norm": 0.2517251573498449, + "learning_rate": 2.1098500270172227e-06, + "loss": 0.0057, + "step": 7652 + }, + { + "epoch": 3.4818016378525933, + "grad_norm": 0.13703521113700673, + "learning_rate": 2.108683817385604e-06, + "loss": 0.0021, + "step": 7653 + }, + { + "epoch": 3.4822565969062786, + "grad_norm": 0.17465017489104626, + "learning_rate": 2.1075178440192883e-06, + "loss": 0.0026, + "step": 7654 + }, + { + "epoch": 3.4827115559599635, + "grad_norm": 0.3465028784605604, + "learning_rate": 2.1063521070135524e-06, + "loss": 0.0113, + "step": 7655 + }, + { + "epoch": 3.483166515013649, + "grad_norm": 0.3202340544807439, + "learning_rate": 2.105186606463653e-06, + "loss": 0.0031, + "step": 7656 + }, + { + "epoch": 3.483621474067334, + "grad_norm": 0.2195421656372889, + "learning_rate": 2.104021342464832e-06, + "loss": 0.0053, + "step": 7657 + }, + { + "epoch": 3.484076433121019, + "grad_norm": 0.3054300703290584, + "learning_rate": 2.10285631511231e-06, + "loss": 0.0039, + "step": 7658 + }, + { + "epoch": 3.4845313921747043, + "grad_norm": 0.23760591490469737, + "learning_rate": 2.101691524501286e-06, + "loss": 0.0021, + "step": 7659 + }, + { + "epoch": 3.4849863512283896, + "grad_norm": 0.04292499430560187, + "learning_rate": 2.10052697072694e-06, + "loss": 0.0004, + "step": 7660 + }, + { + "epoch": 3.4854413102820745, + "grad_norm": 0.14970135576185403, + "learning_rate": 2.099362653884436e-06, + "loss": 0.0022, + "step": 7661 + }, + { + "epoch": 3.48589626933576, + "grad_norm": 0.49591442435479727, + "learning_rate": 2.0981985740689186e-06, + "loss": 0.0122, + "step": 7662 + }, + { + "epoch": 3.486351228389445, + "grad_norm": 0.013466798394920708, + "learning_rate": 2.0970347313755095e-06, + "loss": 0.0001, + "step": 7663 + }, + { + "epoch": 3.48680618744313, + "grad_norm": 0.34123558150053246, + "learning_rate": 2.0958711258993126e-06, + "loss": 0.0035, + "step": 7664 + }, + { + "epoch": 3.4872611464968153, + "grad_norm": 0.45707477527665535, + "learning_rate": 2.0947077577354102e-06, + "loss": 0.0089, + "step": 7665 + }, + { + "epoch": 3.4877161055505006, + "grad_norm": 0.14902716143499972, + "learning_rate": 2.0935446269788718e-06, + "loss": 0.0011, + "step": 7666 + }, + { + "epoch": 3.4881710646041855, + "grad_norm": 0.096295257932685, + "learning_rate": 2.0923817337247394e-06, + "loss": 0.0006, + "step": 7667 + }, + { + "epoch": 3.488626023657871, + "grad_norm": 0.04511701075256628, + "learning_rate": 2.0912190780680425e-06, + "loss": 0.0004, + "step": 7668 + }, + { + "epoch": 3.489080982711556, + "grad_norm": 0.15709166808886962, + "learning_rate": 2.0900566601037865e-06, + "loss": 0.0024, + "step": 7669 + }, + { + "epoch": 3.489535941765241, + "grad_norm": 0.30102747564671173, + "learning_rate": 2.0888944799269573e-06, + "loss": 0.0034, + "step": 7670 + }, + { + "epoch": 3.4899909008189263, + "grad_norm": 0.04903034125974509, + "learning_rate": 2.087732537632527e-06, + "loss": 0.0004, + "step": 7671 + }, + { + "epoch": 3.4904458598726116, + "grad_norm": 0.13787578985762405, + "learning_rate": 2.0865708333154415e-06, + "loss": 0.0012, + "step": 7672 + }, + { + "epoch": 3.4909008189262964, + "grad_norm": 0.4930112044362682, + "learning_rate": 2.085409367070631e-06, + "loss": 0.0078, + "step": 7673 + }, + { + "epoch": 3.4913557779799818, + "grad_norm": 0.3331102491912277, + "learning_rate": 2.0842481389930024e-06, + "loss": 0.0042, + "step": 7674 + }, + { + "epoch": 3.491810737033667, + "grad_norm": 0.24518165619961485, + "learning_rate": 2.083087149177449e-06, + "loss": 0.0055, + "step": 7675 + }, + { + "epoch": 3.492265696087352, + "grad_norm": 0.26806674327420144, + "learning_rate": 2.0819263977188433e-06, + "loss": 0.0044, + "step": 7676 + }, + { + "epoch": 3.4927206551410372, + "grad_norm": 0.12831954387596262, + "learning_rate": 2.0807658847120336e-06, + "loss": 0.0011, + "step": 7677 + }, + { + "epoch": 3.4931756141947226, + "grad_norm": 0.3574771135467804, + "learning_rate": 2.079605610251853e-06, + "loss": 0.0088, + "step": 7678 + }, + { + "epoch": 3.4936305732484074, + "grad_norm": 0.9454144648403213, + "learning_rate": 2.0784455744331115e-06, + "loss": 0.0033, + "step": 7679 + }, + { + "epoch": 3.4940855323020927, + "grad_norm": 0.4379124975836111, + "learning_rate": 2.077285777350606e-06, + "loss": 0.0078, + "step": 7680 + }, + { + "epoch": 3.494540491355778, + "grad_norm": 0.4310839291740187, + "learning_rate": 2.0761262190991065e-06, + "loss": 0.0043, + "step": 7681 + }, + { + "epoch": 3.494995450409463, + "grad_norm": 0.2914926678941282, + "learning_rate": 2.07496689977337e-06, + "loss": 0.0031, + "step": 7682 + }, + { + "epoch": 3.4954504094631482, + "grad_norm": 0.24195137796044136, + "learning_rate": 2.073807819468129e-06, + "loss": 0.0016, + "step": 7683 + }, + { + "epoch": 3.4959053685168335, + "grad_norm": 0.21132836396361337, + "learning_rate": 2.072648978278096e-06, + "loss": 0.0026, + "step": 7684 + }, + { + "epoch": 3.496360327570519, + "grad_norm": 0.43372639824221204, + "learning_rate": 2.0714903762979716e-06, + "loss": 0.0091, + "step": 7685 + }, + { + "epoch": 3.4968152866242037, + "grad_norm": 0.4669176399759277, + "learning_rate": 2.0703320136224276e-06, + "loss": 0.0052, + "step": 7686 + }, + { + "epoch": 3.497270245677889, + "grad_norm": 0.34361418459078286, + "learning_rate": 2.0691738903461218e-06, + "loss": 0.0024, + "step": 7687 + }, + { + "epoch": 3.4977252047315743, + "grad_norm": 0.05989770662271216, + "learning_rate": 2.0680160065636883e-06, + "loss": 0.0004, + "step": 7688 + }, + { + "epoch": 3.498180163785259, + "grad_norm": 0.2598226848700686, + "learning_rate": 2.0668583623697473e-06, + "loss": 0.0057, + "step": 7689 + }, + { + "epoch": 3.4986351228389445, + "grad_norm": 0.12947657831849282, + "learning_rate": 2.065700957858894e-06, + "loss": 0.001, + "step": 7690 + }, + { + "epoch": 3.49909008189263, + "grad_norm": 0.26414260804908984, + "learning_rate": 2.0645437931257084e-06, + "loss": 0.0029, + "step": 7691 + }, + { + "epoch": 3.4995450409463147, + "grad_norm": 0.2324277480681154, + "learning_rate": 2.063386868264748e-06, + "loss": 0.0039, + "step": 7692 + }, + { + "epoch": 3.5, + "grad_norm": 0.3713831590109256, + "learning_rate": 2.062230183370551e-06, + "loss": 0.0129, + "step": 7693 + }, + { + "epoch": 3.5004549590536853, + "grad_norm": 0.3314732097380107, + "learning_rate": 2.061073738537635e-06, + "loss": 0.0049, + "step": 7694 + }, + { + "epoch": 3.50090991810737, + "grad_norm": 0.154796189639817, + "learning_rate": 2.0599175338605003e-06, + "loss": 0.0017, + "step": 7695 + }, + { + "epoch": 3.5013648771610555, + "grad_norm": 0.13762628370203114, + "learning_rate": 2.05876156943363e-06, + "loss": 0.0017, + "step": 7696 + }, + { + "epoch": 3.501819836214741, + "grad_norm": 0.24251629376883618, + "learning_rate": 2.0576058453514813e-06, + "loss": 0.0023, + "step": 7697 + }, + { + "epoch": 3.502274795268426, + "grad_norm": 0.09266297595642309, + "learning_rate": 2.056450361708495e-06, + "loss": 0.0007, + "step": 7698 + }, + { + "epoch": 3.502729754322111, + "grad_norm": 0.21739943901262987, + "learning_rate": 2.055295118599091e-06, + "loss": 0.0017, + "step": 7699 + }, + { + "epoch": 3.5031847133757963, + "grad_norm": 0.4813147020638217, + "learning_rate": 2.0541401161176734e-06, + "loss": 0.0056, + "step": 7700 + }, + { + "epoch": 3.5036396724294816, + "grad_norm": 0.28823743863945117, + "learning_rate": 2.052985354358622e-06, + "loss": 0.0034, + "step": 7701 + }, + { + "epoch": 3.5040946314831665, + "grad_norm": 0.5030958069763313, + "learning_rate": 2.0518308334162967e-06, + "loss": 0.0066, + "step": 7702 + }, + { + "epoch": 3.5045495905368518, + "grad_norm": 0.35179867111866353, + "learning_rate": 2.0506765533850443e-06, + "loss": 0.0027, + "step": 7703 + }, + { + "epoch": 3.505004549590537, + "grad_norm": 0.1695077708781563, + "learning_rate": 2.049522514359183e-06, + "loss": 0.0017, + "step": 7704 + }, + { + "epoch": 3.505459508644222, + "grad_norm": 0.13925839508068513, + "learning_rate": 2.048368716433019e-06, + "loss": 0.001, + "step": 7705 + }, + { + "epoch": 3.5059144676979073, + "grad_norm": 0.20857052666077638, + "learning_rate": 2.0472151597008343e-06, + "loss": 0.0018, + "step": 7706 + }, + { + "epoch": 3.5063694267515926, + "grad_norm": 0.32588152477384214, + "learning_rate": 2.046061844256892e-06, + "loss": 0.0063, + "step": 7707 + }, + { + "epoch": 3.5068243858052774, + "grad_norm": 0.2159640328455583, + "learning_rate": 2.044908770195434e-06, + "loss": 0.0045, + "step": 7708 + }, + { + "epoch": 3.5072793448589628, + "grad_norm": 0.0927780252449845, + "learning_rate": 2.043755937610686e-06, + "loss": 0.0009, + "step": 7709 + }, + { + "epoch": 3.507734303912648, + "grad_norm": 0.3699721690080576, + "learning_rate": 2.042603346596855e-06, + "loss": 0.0036, + "step": 7710 + }, + { + "epoch": 3.508189262966333, + "grad_norm": 0.14692602124506826, + "learning_rate": 2.041450997248123e-06, + "loss": 0.001, + "step": 7711 + }, + { + "epoch": 3.5086442220200182, + "grad_norm": 0.13063334895354267, + "learning_rate": 2.0402988896586544e-06, + "loss": 0.0016, + "step": 7712 + }, + { + "epoch": 3.5090991810737036, + "grad_norm": 0.11577327532530884, + "learning_rate": 2.039147023922593e-06, + "loss": 0.0009, + "step": 7713 + }, + { + "epoch": 3.5095541401273884, + "grad_norm": 0.27500033312287503, + "learning_rate": 2.0379954001340676e-06, + "loss": 0.0032, + "step": 7714 + }, + { + "epoch": 3.5100090991810737, + "grad_norm": 0.20955556319604213, + "learning_rate": 2.0368440183871812e-06, + "loss": 0.0029, + "step": 7715 + }, + { + "epoch": 3.510464058234759, + "grad_norm": 0.3950502179693419, + "learning_rate": 2.035692878776019e-06, + "loss": 0.0072, + "step": 7716 + }, + { + "epoch": 3.510919017288444, + "grad_norm": 0.12650653859475156, + "learning_rate": 2.03454198139465e-06, + "loss": 0.0021, + "step": 7717 + }, + { + "epoch": 3.511373976342129, + "grad_norm": 0.20906515268210074, + "learning_rate": 2.0333913263371157e-06, + "loss": 0.0059, + "step": 7718 + }, + { + "epoch": 3.5118289353958145, + "grad_norm": 0.12271949406361587, + "learning_rate": 2.032240913697448e-06, + "loss": 0.0011, + "step": 7719 + }, + { + "epoch": 3.5122838944494994, + "grad_norm": 0.17998330403633808, + "learning_rate": 2.03109074356965e-06, + "loss": 0.0026, + "step": 7720 + }, + { + "epoch": 3.5127388535031847, + "grad_norm": 0.3815183137353262, + "learning_rate": 2.0299408160477084e-06, + "loss": 0.003, + "step": 7721 + }, + { + "epoch": 3.51319381255687, + "grad_norm": 0.14660418518619533, + "learning_rate": 2.0287911312255916e-06, + "loss": 0.0022, + "step": 7722 + }, + { + "epoch": 3.513648771610555, + "grad_norm": 0.40032699275641115, + "learning_rate": 2.0276416891972416e-06, + "loss": 0.007, + "step": 7723 + }, + { + "epoch": 3.51410373066424, + "grad_norm": 0.09232605913048236, + "learning_rate": 2.0264924900565937e-06, + "loss": 0.0014, + "step": 7724 + }, + { + "epoch": 3.5145586897179255, + "grad_norm": 0.16072560504405328, + "learning_rate": 2.0253435338975506e-06, + "loss": 0.0021, + "step": 7725 + }, + { + "epoch": 3.5150136487716104, + "grad_norm": 0.0888364786911226, + "learning_rate": 2.024194820814001e-06, + "loss": 0.0009, + "step": 7726 + }, + { + "epoch": 3.5154686078252957, + "grad_norm": 0.2534815771670263, + "learning_rate": 2.023046350899812e-06, + "loss": 0.0039, + "step": 7727 + }, + { + "epoch": 3.515923566878981, + "grad_norm": 0.136855460118477, + "learning_rate": 2.0218981242488295e-06, + "loss": 0.0015, + "step": 7728 + }, + { + "epoch": 3.516378525932666, + "grad_norm": 0.2555215073819053, + "learning_rate": 2.0207501409548854e-06, + "loss": 0.0087, + "step": 7729 + }, + { + "epoch": 3.516833484986351, + "grad_norm": 0.3361774757234204, + "learning_rate": 2.019602401111783e-06, + "loss": 0.0119, + "step": 7730 + }, + { + "epoch": 3.5172884440400365, + "grad_norm": 0.19381321912426963, + "learning_rate": 2.0184549048133157e-06, + "loss": 0.0035, + "step": 7731 + }, + { + "epoch": 3.5177434030937214, + "grad_norm": 0.43141822202194136, + "learning_rate": 2.0173076521532485e-06, + "loss": 0.0064, + "step": 7732 + }, + { + "epoch": 3.5181983621474067, + "grad_norm": 0.19249594679272394, + "learning_rate": 2.016160643225329e-06, + "loss": 0.0035, + "step": 7733 + }, + { + "epoch": 3.518653321201092, + "grad_norm": 0.3965970263838499, + "learning_rate": 2.015013878123288e-06, + "loss": 0.0059, + "step": 7734 + }, + { + "epoch": 3.519108280254777, + "grad_norm": 0.23543948713337504, + "learning_rate": 2.013867356940833e-06, + "loss": 0.0032, + "step": 7735 + }, + { + "epoch": 3.519563239308462, + "grad_norm": 0.15592626464027823, + "learning_rate": 2.0127210797716522e-06, + "loss": 0.0008, + "step": 7736 + }, + { + "epoch": 3.5200181983621475, + "grad_norm": 0.2949229386168134, + "learning_rate": 2.0115750467094132e-06, + "loss": 0.0075, + "step": 7737 + }, + { + "epoch": 3.5204731574158323, + "grad_norm": 0.23403578862731103, + "learning_rate": 2.010429257847765e-06, + "loss": 0.0028, + "step": 7738 + }, + { + "epoch": 3.5209281164695176, + "grad_norm": 0.35966101897758035, + "learning_rate": 2.0092837132803396e-06, + "loss": 0.0085, + "step": 7739 + }, + { + "epoch": 3.521383075523203, + "grad_norm": 0.2564744225208268, + "learning_rate": 2.0081384131007425e-06, + "loss": 0.0023, + "step": 7740 + }, + { + "epoch": 3.521838034576888, + "grad_norm": 0.3584577571659201, + "learning_rate": 2.0069933574025634e-06, + "loss": 0.007, + "step": 7741 + }, + { + "epoch": 3.522292993630573, + "grad_norm": 0.39500534624058986, + "learning_rate": 2.0058485462793693e-06, + "loss": 0.0039, + "step": 7742 + }, + { + "epoch": 3.5227479526842584, + "grad_norm": 0.2156540854241804, + "learning_rate": 2.004703979824712e-06, + "loss": 0.0036, + "step": 7743 + }, + { + "epoch": 3.5232029117379433, + "grad_norm": 0.3463758948091541, + "learning_rate": 2.003559658132117e-06, + "loss": 0.0068, + "step": 7744 + }, + { + "epoch": 3.5236578707916286, + "grad_norm": 0.11934460967375501, + "learning_rate": 2.0024155812950967e-06, + "loss": 0.0021, + "step": 7745 + }, + { + "epoch": 3.524112829845314, + "grad_norm": 0.10718495765205331, + "learning_rate": 2.0012717494071384e-06, + "loss": 0.0022, + "step": 7746 + }, + { + "epoch": 3.5245677888989992, + "grad_norm": 0.2771520953346642, + "learning_rate": 2.0001281625617086e-06, + "loss": 0.0072, + "step": 7747 + }, + { + "epoch": 3.525022747952684, + "grad_norm": 0.2886675412571639, + "learning_rate": 1.99898482085226e-06, + "loss": 0.0054, + "step": 7748 + }, + { + "epoch": 3.5254777070063694, + "grad_norm": 0.29857230312345473, + "learning_rate": 1.9978417243722192e-06, + "loss": 0.0025, + "step": 7749 + }, + { + "epoch": 3.5259326660600547, + "grad_norm": 0.24868240714629586, + "learning_rate": 1.996698873214995e-06, + "loss": 0.0041, + "step": 7750 + }, + { + "epoch": 3.5263876251137396, + "grad_norm": 0.1859194084631994, + "learning_rate": 1.995556267473975e-06, + "loss": 0.0049, + "step": 7751 + }, + { + "epoch": 3.526842584167425, + "grad_norm": 0.12545191760091903, + "learning_rate": 1.9944139072425276e-06, + "loss": 0.0017, + "step": 7752 + }, + { + "epoch": 3.52729754322111, + "grad_norm": 0.2047816519840243, + "learning_rate": 1.9932717926140055e-06, + "loss": 0.0022, + "step": 7753 + }, + { + "epoch": 3.5277525022747955, + "grad_norm": 0.31282131817938136, + "learning_rate": 1.992129923681734e-06, + "loss": 0.0097, + "step": 7754 + }, + { + "epoch": 3.5282074613284804, + "grad_norm": 0.24047159717721744, + "learning_rate": 1.9909883005390213e-06, + "loss": 0.0029, + "step": 7755 + }, + { + "epoch": 3.5286624203821657, + "grad_norm": 0.3848924828105359, + "learning_rate": 1.9898469232791546e-06, + "loss": 0.0062, + "step": 7756 + }, + { + "epoch": 3.529117379435851, + "grad_norm": 0.20400027152249636, + "learning_rate": 1.9887057919954056e-06, + "loss": 0.0037, + "step": 7757 + }, + { + "epoch": 3.529572338489536, + "grad_norm": 0.21319407139990143, + "learning_rate": 1.9875649067810184e-06, + "loss": 0.0022, + "step": 7758 + }, + { + "epoch": 3.530027297543221, + "grad_norm": 0.20840203584149328, + "learning_rate": 1.9864242677292244e-06, + "loss": 0.0017, + "step": 7759 + }, + { + "epoch": 3.5304822565969065, + "grad_norm": 0.10516358843511583, + "learning_rate": 1.9852838749332304e-06, + "loss": 0.0015, + "step": 7760 + }, + { + "epoch": 3.5309372156505914, + "grad_norm": 0.19790041912850087, + "learning_rate": 1.984143728486224e-06, + "loss": 0.0041, + "step": 7761 + }, + { + "epoch": 3.5313921747042767, + "grad_norm": 0.21096348895533565, + "learning_rate": 1.9830038284813708e-06, + "loss": 0.0038, + "step": 7762 + }, + { + "epoch": 3.531847133757962, + "grad_norm": 0.12389181466543855, + "learning_rate": 1.981864175011822e-06, + "loss": 0.0022, + "step": 7763 + }, + { + "epoch": 3.532302092811647, + "grad_norm": 0.06832625849825473, + "learning_rate": 1.980724768170702e-06, + "loss": 0.0009, + "step": 7764 + }, + { + "epoch": 3.532757051865332, + "grad_norm": 0.26512545294472484, + "learning_rate": 1.9795856080511184e-06, + "loss": 0.0033, + "step": 7765 + }, + { + "epoch": 3.5332120109190175, + "grad_norm": 0.09913613064610376, + "learning_rate": 1.97844669474616e-06, + "loss": 0.0008, + "step": 7766 + }, + { + "epoch": 3.5336669699727024, + "grad_norm": 0.11892462331092149, + "learning_rate": 1.977308028348891e-06, + "loss": 0.0015, + "step": 7767 + }, + { + "epoch": 3.5341219290263877, + "grad_norm": 0.21379987765787165, + "learning_rate": 1.976169608952361e-06, + "loss": 0.0043, + "step": 7768 + }, + { + "epoch": 3.534576888080073, + "grad_norm": 0.4392171257258942, + "learning_rate": 1.9750314366495953e-06, + "loss": 0.0077, + "step": 7769 + }, + { + "epoch": 3.535031847133758, + "grad_norm": 0.5114805694341462, + "learning_rate": 1.9738935115336004e-06, + "loss": 0.0048, + "step": 7770 + }, + { + "epoch": 3.535486806187443, + "grad_norm": 0.3052064749925445, + "learning_rate": 1.9727558336973594e-06, + "loss": 0.0052, + "step": 7771 + }, + { + "epoch": 3.5359417652411285, + "grad_norm": 0.26725770758324674, + "learning_rate": 1.9716184032338415e-06, + "loss": 0.003, + "step": 7772 + }, + { + "epoch": 3.5363967242948133, + "grad_norm": 0.4597987824136517, + "learning_rate": 1.9704812202359928e-06, + "loss": 0.009, + "step": 7773 + }, + { + "epoch": 3.5368516833484986, + "grad_norm": 0.11087107979603508, + "learning_rate": 1.9693442847967383e-06, + "loss": 0.0008, + "step": 7774 + }, + { + "epoch": 3.537306642402184, + "grad_norm": 0.23878980076655165, + "learning_rate": 1.9682075970089815e-06, + "loss": 0.0049, + "step": 7775 + }, + { + "epoch": 3.537761601455869, + "grad_norm": 0.34025913021293663, + "learning_rate": 1.9670711569656076e-06, + "loss": 0.004, + "step": 7776 + }, + { + "epoch": 3.538216560509554, + "grad_norm": 0.2054945240914702, + "learning_rate": 1.9659349647594835e-06, + "loss": 0.004, + "step": 7777 + }, + { + "epoch": 3.5386715195632394, + "grad_norm": 0.26478068246536995, + "learning_rate": 1.964799020483452e-06, + "loss": 0.0067, + "step": 7778 + }, + { + "epoch": 3.5391264786169243, + "grad_norm": 0.11053642628877315, + "learning_rate": 1.9636633242303365e-06, + "loss": 0.0015, + "step": 7779 + }, + { + "epoch": 3.5395814376706096, + "grad_norm": 0.20097693948923082, + "learning_rate": 1.962527876092944e-06, + "loss": 0.0035, + "step": 7780 + }, + { + "epoch": 3.540036396724295, + "grad_norm": 0.3552994232820875, + "learning_rate": 1.9613926761640543e-06, + "loss": 0.0046, + "step": 7781 + }, + { + "epoch": 3.54049135577798, + "grad_norm": 0.2165973136872899, + "learning_rate": 1.9602577245364345e-06, + "loss": 0.0037, + "step": 7782 + }, + { + "epoch": 3.540946314831665, + "grad_norm": 0.22827804602151244, + "learning_rate": 1.9591230213028265e-06, + "loss": 0.0022, + "step": 7783 + }, + { + "epoch": 3.5414012738853504, + "grad_norm": 0.20769097356530997, + "learning_rate": 1.957988566555953e-06, + "loss": 0.003, + "step": 7784 + }, + { + "epoch": 3.5418562329390353, + "grad_norm": 0.25811787146593734, + "learning_rate": 1.9568543603885136e-06, + "loss": 0.0026, + "step": 7785 + }, + { + "epoch": 3.5423111919927206, + "grad_norm": 0.3676202785795054, + "learning_rate": 1.9557204028931936e-06, + "loss": 0.0089, + "step": 7786 + }, + { + "epoch": 3.542766151046406, + "grad_norm": 0.3947073011485478, + "learning_rate": 1.9545866941626563e-06, + "loss": 0.0084, + "step": 7787 + }, + { + "epoch": 3.5432211101000908, + "grad_norm": 0.16825838161603163, + "learning_rate": 1.9534532342895413e-06, + "loss": 0.0014, + "step": 7788 + }, + { + "epoch": 3.543676069153776, + "grad_norm": 0.3284350768006685, + "learning_rate": 1.9523200233664695e-06, + "loss": 0.0069, + "step": 7789 + }, + { + "epoch": 3.5441310282074614, + "grad_norm": 0.14180787120331206, + "learning_rate": 1.9511870614860407e-06, + "loss": 0.0019, + "step": 7790 + }, + { + "epoch": 3.5445859872611463, + "grad_norm": 0.14790928009158866, + "learning_rate": 1.950054348740839e-06, + "loss": 0.002, + "step": 7791 + }, + { + "epoch": 3.5450409463148316, + "grad_norm": 0.06680045802963924, + "learning_rate": 1.948921885223422e-06, + "loss": 0.0007, + "step": 7792 + }, + { + "epoch": 3.545495905368517, + "grad_norm": 0.2869613031072459, + "learning_rate": 1.9477896710263285e-06, + "loss": 0.0083, + "step": 7793 + }, + { + "epoch": 3.5459508644222018, + "grad_norm": 0.5072236925227637, + "learning_rate": 1.946657706242081e-06, + "loss": 0.0098, + "step": 7794 + }, + { + "epoch": 3.546405823475887, + "grad_norm": 0.22275331235240897, + "learning_rate": 1.945525990963176e-06, + "loss": 0.0026, + "step": 7795 + }, + { + "epoch": 3.5468607825295724, + "grad_norm": 0.2513136107376008, + "learning_rate": 1.944394525282094e-06, + "loss": 0.0028, + "step": 7796 + }, + { + "epoch": 3.5473157415832572, + "grad_norm": 0.12425764201258375, + "learning_rate": 1.9432633092912924e-06, + "loss": 0.0007, + "step": 7797 + }, + { + "epoch": 3.5477707006369426, + "grad_norm": 0.3561145669451658, + "learning_rate": 1.9421323430832097e-06, + "loss": 0.0056, + "step": 7798 + }, + { + "epoch": 3.548225659690628, + "grad_norm": 0.25548502182389277, + "learning_rate": 1.941001626750262e-06, + "loss": 0.0033, + "step": 7799 + }, + { + "epoch": 3.548680618744313, + "grad_norm": 0.3356442032343971, + "learning_rate": 1.939871160384846e-06, + "loss": 0.0029, + "step": 7800 + }, + { + "epoch": 3.549135577797998, + "grad_norm": 0.29472476245464385, + "learning_rate": 1.9387409440793387e-06, + "loss": 0.0061, + "step": 7801 + }, + { + "epoch": 3.5495905368516834, + "grad_norm": 0.08043940012829344, + "learning_rate": 1.9376109779260986e-06, + "loss": 0.0005, + "step": 7802 + }, + { + "epoch": 3.5500454959053687, + "grad_norm": 0.16168538080218328, + "learning_rate": 1.9364812620174607e-06, + "loss": 0.002, + "step": 7803 + }, + { + "epoch": 3.5505004549590535, + "grad_norm": 0.30321264558747174, + "learning_rate": 1.9353517964457386e-06, + "loss": 0.0026, + "step": 7804 + }, + { + "epoch": 3.550955414012739, + "grad_norm": 0.16581168105425526, + "learning_rate": 1.934222581303226e-06, + "loss": 0.0018, + "step": 7805 + }, + { + "epoch": 3.551410373066424, + "grad_norm": 0.20053437534470278, + "learning_rate": 1.933093616682201e-06, + "loss": 0.0034, + "step": 7806 + }, + { + "epoch": 3.5518653321201095, + "grad_norm": 0.16316560799208163, + "learning_rate": 1.9319649026749144e-06, + "loss": 0.002, + "step": 7807 + }, + { + "epoch": 3.5523202911737943, + "grad_norm": 0.12272105120085568, + "learning_rate": 1.9308364393736025e-06, + "loss": 0.0009, + "step": 7808 + }, + { + "epoch": 3.5527752502274796, + "grad_norm": 0.04301336602681302, + "learning_rate": 1.9297082268704758e-06, + "loss": 0.0004, + "step": 7809 + }, + { + "epoch": 3.553230209281165, + "grad_norm": 0.04921376032330394, + "learning_rate": 1.9285802652577262e-06, + "loss": 0.0005, + "step": 7810 + }, + { + "epoch": 3.55368516833485, + "grad_norm": 0.4000595697202519, + "learning_rate": 1.9274525546275284e-06, + "loss": 0.0022, + "step": 7811 + }, + { + "epoch": 3.554140127388535, + "grad_norm": 0.19376601167534413, + "learning_rate": 1.926325095072033e-06, + "loss": 0.0012, + "step": 7812 + }, + { + "epoch": 3.5545950864422204, + "grad_norm": 0.17456277815504023, + "learning_rate": 1.9251978866833696e-06, + "loss": 0.0015, + "step": 7813 + }, + { + "epoch": 3.5550500454959053, + "grad_norm": 0.271046223458997, + "learning_rate": 1.924070929553648e-06, + "loss": 0.0032, + "step": 7814 + }, + { + "epoch": 3.5555050045495906, + "grad_norm": 0.27826881314759416, + "learning_rate": 1.922944223774959e-06, + "loss": 0.0039, + "step": 7815 + }, + { + "epoch": 3.555959963603276, + "grad_norm": 0.066469035444489, + "learning_rate": 1.9218177694393737e-06, + "loss": 0.0007, + "step": 7816 + }, + { + "epoch": 3.556414922656961, + "grad_norm": 0.28801098648844486, + "learning_rate": 1.9206915666389396e-06, + "loss": 0.0031, + "step": 7817 + }, + { + "epoch": 3.556869881710646, + "grad_norm": 0.2410917775885526, + "learning_rate": 1.9195656154656844e-06, + "loss": 0.007, + "step": 7818 + }, + { + "epoch": 3.5573248407643314, + "grad_norm": 0.3350517672704401, + "learning_rate": 1.9184399160116146e-06, + "loss": 0.0043, + "step": 7819 + }, + { + "epoch": 3.5577797998180163, + "grad_norm": 0.07255469096683553, + "learning_rate": 1.91731446836872e-06, + "loss": 0.0006, + "step": 7820 + }, + { + "epoch": 3.5582347588717016, + "grad_norm": 0.2816743657966873, + "learning_rate": 1.9161892726289643e-06, + "loss": 0.0031, + "step": 7821 + }, + { + "epoch": 3.558689717925387, + "grad_norm": 0.23167362359732568, + "learning_rate": 1.9150643288842963e-06, + "loss": 0.0038, + "step": 7822 + }, + { + "epoch": 3.5591446769790718, + "grad_norm": 0.15723888410495052, + "learning_rate": 1.9139396372266407e-06, + "loss": 0.002, + "step": 7823 + }, + { + "epoch": 3.559599636032757, + "grad_norm": 8.291661300439266, + "learning_rate": 1.912815197747899e-06, + "loss": 0.0929, + "step": 7824 + }, + { + "epoch": 3.5600545950864424, + "grad_norm": 0.556252010177556, + "learning_rate": 1.9116910105399594e-06, + "loss": 0.0119, + "step": 7825 + }, + { + "epoch": 3.5605095541401273, + "grad_norm": 0.10335399382757667, + "learning_rate": 1.9105670756946836e-06, + "loss": 0.0017, + "step": 7826 + }, + { + "epoch": 3.5609645131938126, + "grad_norm": 0.3029963896566323, + "learning_rate": 1.909443393303915e-06, + "loss": 0.0062, + "step": 7827 + }, + { + "epoch": 3.561419472247498, + "grad_norm": 0.24255008071947937, + "learning_rate": 1.908319963459474e-06, + "loss": 0.0037, + "step": 7828 + }, + { + "epoch": 3.5618744313011828, + "grad_norm": 0.31374557388326757, + "learning_rate": 1.9071967862531632e-06, + "loss": 0.0027, + "step": 7829 + }, + { + "epoch": 3.562329390354868, + "grad_norm": 0.2839158633267819, + "learning_rate": 1.906073861776766e-06, + "loss": 0.0084, + "step": 7830 + }, + { + "epoch": 3.5627843494085534, + "grad_norm": 0.4561988777530323, + "learning_rate": 1.9049511901220409e-06, + "loss": 0.0099, + "step": 7831 + }, + { + "epoch": 3.5632393084622382, + "grad_norm": 0.19015944658395145, + "learning_rate": 1.9038287713807269e-06, + "loss": 0.0048, + "step": 7832 + }, + { + "epoch": 3.5636942675159236, + "grad_norm": 0.2685832904432481, + "learning_rate": 1.9027066056445437e-06, + "loss": 0.0031, + "step": 7833 + }, + { + "epoch": 3.564149226569609, + "grad_norm": 0.4365573333097983, + "learning_rate": 1.9015846930051879e-06, + "loss": 0.0092, + "step": 7834 + }, + { + "epoch": 3.5646041856232937, + "grad_norm": 0.2956580581449613, + "learning_rate": 1.900463033554339e-06, + "loss": 0.0073, + "step": 7835 + }, + { + "epoch": 3.565059144676979, + "grad_norm": 0.1827958823749623, + "learning_rate": 1.8993416273836546e-06, + "loss": 0.0013, + "step": 7836 + }, + { + "epoch": 3.5655141037306644, + "grad_norm": 0.2399800437682335, + "learning_rate": 1.8982204745847704e-06, + "loss": 0.0029, + "step": 7837 + }, + { + "epoch": 3.565969062784349, + "grad_norm": 0.17654268778036145, + "learning_rate": 1.8970995752493016e-06, + "loss": 0.0022, + "step": 7838 + }, + { + "epoch": 3.5664240218380345, + "grad_norm": 0.12701314508844222, + "learning_rate": 1.8959789294688408e-06, + "loss": 0.0019, + "step": 7839 + }, + { + "epoch": 3.56687898089172, + "grad_norm": 0.03969785212810466, + "learning_rate": 1.8948585373349665e-06, + "loss": 0.0003, + "step": 7840 + }, + { + "epoch": 3.5673339399454047, + "grad_norm": 0.05554512170704364, + "learning_rate": 1.8937383989392294e-06, + "loss": 0.0005, + "step": 7841 + }, + { + "epoch": 3.56778889899909, + "grad_norm": 0.19775444531922698, + "learning_rate": 1.8926185143731607e-06, + "loss": 0.0023, + "step": 7842 + }, + { + "epoch": 3.5682438580527753, + "grad_norm": 0.1915942271992767, + "learning_rate": 1.8914988837282767e-06, + "loss": 0.0025, + "step": 7843 + }, + { + "epoch": 3.56869881710646, + "grad_norm": 0.1826400534228145, + "learning_rate": 1.8903795070960635e-06, + "loss": 0.004, + "step": 7844 + }, + { + "epoch": 3.5691537761601455, + "grad_norm": 0.1141328912592994, + "learning_rate": 1.8892603845679963e-06, + "loss": 0.0014, + "step": 7845 + }, + { + "epoch": 3.569608735213831, + "grad_norm": 0.07758567332736764, + "learning_rate": 1.8881415162355222e-06, + "loss": 0.001, + "step": 7846 + }, + { + "epoch": 3.5700636942675157, + "grad_norm": 0.17736716274854417, + "learning_rate": 1.8870229021900706e-06, + "loss": 0.0028, + "step": 7847 + }, + { + "epoch": 3.570518653321201, + "grad_norm": 0.24901034747236278, + "learning_rate": 1.8859045425230477e-06, + "loss": 0.0072, + "step": 7848 + }, + { + "epoch": 3.5709736123748863, + "grad_norm": 0.15898630125207755, + "learning_rate": 1.8847864373258417e-06, + "loss": 0.0026, + "step": 7849 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.360024037313287, + "learning_rate": 1.8836685866898224e-06, + "loss": 0.0042, + "step": 7850 + }, + { + "epoch": 3.5718835304822565, + "grad_norm": 0.18931597313542467, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.0024, + "step": 7851 + }, + { + "epoch": 3.572338489535942, + "grad_norm": 0.3368471376035797, + "learning_rate": 1.8814336494666979e-06, + "loss": 0.0034, + "step": 7852 + }, + { + "epoch": 3.5727934485896267, + "grad_norm": 0.1877896723592328, + "learning_rate": 1.88031656306222e-06, + "loss": 0.0021, + "step": 7853 + }, + { + "epoch": 3.573248407643312, + "grad_norm": 0.30689448214652615, + "learning_rate": 1.8791997315841865e-06, + "loss": 0.003, + "step": 7854 + }, + { + "epoch": 3.5737033666969973, + "grad_norm": 0.37648531464817464, + "learning_rate": 1.8780831551238566e-06, + "loss": 0.0054, + "step": 7855 + }, + { + "epoch": 3.5741583257506826, + "grad_norm": 0.14790533285685548, + "learning_rate": 1.8769668337724717e-06, + "loss": 0.0019, + "step": 7856 + }, + { + "epoch": 3.5746132848043675, + "grad_norm": 0.3290089136734179, + "learning_rate": 1.875850767621255e-06, + "loss": 0.0081, + "step": 7857 + }, + { + "epoch": 3.5750682438580528, + "grad_norm": 0.2438159577849764, + "learning_rate": 1.8747349567614036e-06, + "loss": 0.0026, + "step": 7858 + }, + { + "epoch": 3.575523202911738, + "grad_norm": 0.44908224923679907, + "learning_rate": 1.8736194012840996e-06, + "loss": 0.0064, + "step": 7859 + }, + { + "epoch": 3.575978161965423, + "grad_norm": 0.2612222870249877, + "learning_rate": 1.8725041012804994e-06, + "loss": 0.0038, + "step": 7860 + }, + { + "epoch": 3.5764331210191083, + "grad_norm": 0.2538727072882192, + "learning_rate": 1.8713890568417408e-06, + "loss": 0.0047, + "step": 7861 + }, + { + "epoch": 3.5768880800727936, + "grad_norm": 0.37328615462974646, + "learning_rate": 1.870274268058938e-06, + "loss": 0.0044, + "step": 7862 + }, + { + "epoch": 3.577343039126479, + "grad_norm": 0.23852059542601314, + "learning_rate": 1.8691597350231877e-06, + "loss": 0.0036, + "step": 7863 + }, + { + "epoch": 3.5777979981801638, + "grad_norm": 0.1648965035834699, + "learning_rate": 1.8680454578255674e-06, + "loss": 0.002, + "step": 7864 + }, + { + "epoch": 3.578252957233849, + "grad_norm": 0.5781760248014696, + "learning_rate": 1.8669314365571285e-06, + "loss": 0.0057, + "step": 7865 + }, + { + "epoch": 3.5787079162875344, + "grad_norm": 0.15098789223124823, + "learning_rate": 1.8658176713089038e-06, + "loss": 0.0033, + "step": 7866 + }, + { + "epoch": 3.5791628753412192, + "grad_norm": 0.2569999285657466, + "learning_rate": 1.8647041621719047e-06, + "loss": 0.0062, + "step": 7867 + }, + { + "epoch": 3.5796178343949046, + "grad_norm": 0.22836082376197347, + "learning_rate": 1.8635909092371214e-06, + "loss": 0.0018, + "step": 7868 + }, + { + "epoch": 3.58007279344859, + "grad_norm": 0.2125034388435507, + "learning_rate": 1.862477912595526e-06, + "loss": 0.0029, + "step": 7869 + }, + { + "epoch": 3.5805277525022747, + "grad_norm": 0.14826069261954608, + "learning_rate": 1.861365172338066e-06, + "loss": 0.0008, + "step": 7870 + }, + { + "epoch": 3.58098271155596, + "grad_norm": 0.25791990914333773, + "learning_rate": 1.8602526885556715e-06, + "loss": 0.0035, + "step": 7871 + }, + { + "epoch": 3.5814376706096454, + "grad_norm": 0.24349151276618372, + "learning_rate": 1.8591404613392477e-06, + "loss": 0.0053, + "step": 7872 + }, + { + "epoch": 3.58189262966333, + "grad_norm": 0.2541523185075842, + "learning_rate": 1.8580284907796803e-06, + "loss": 0.0038, + "step": 7873 + }, + { + "epoch": 3.5823475887170155, + "grad_norm": 0.13711911289964399, + "learning_rate": 1.8569167769678375e-06, + "loss": 0.0023, + "step": 7874 + }, + { + "epoch": 3.582802547770701, + "grad_norm": 0.4239392401036632, + "learning_rate": 1.8558053199945613e-06, + "loss": 0.0128, + "step": 7875 + }, + { + "epoch": 3.5832575068243857, + "grad_norm": 0.21898069795300995, + "learning_rate": 1.8546941199506752e-06, + "loss": 0.0043, + "step": 7876 + }, + { + "epoch": 3.583712465878071, + "grad_norm": 0.35824719370429525, + "learning_rate": 1.8535831769269802e-06, + "loss": 0.0098, + "step": 7877 + }, + { + "epoch": 3.5841674249317563, + "grad_norm": 0.19549547883919538, + "learning_rate": 1.8524724910142588e-06, + "loss": 0.0015, + "step": 7878 + }, + { + "epoch": 3.584622383985441, + "grad_norm": 0.34835104718669907, + "learning_rate": 1.851362062303273e-06, + "loss": 0.007, + "step": 7879 + }, + { + "epoch": 3.5850773430391265, + "grad_norm": 0.04346302220808581, + "learning_rate": 1.85025189088476e-06, + "loss": 0.0005, + "step": 7880 + }, + { + "epoch": 3.585532302092812, + "grad_norm": 0.31511780552489604, + "learning_rate": 1.849141976849439e-06, + "loss": 0.0045, + "step": 7881 + }, + { + "epoch": 3.5859872611464967, + "grad_norm": 0.2975742152769068, + "learning_rate": 1.848032320288004e-06, + "loss": 0.0037, + "step": 7882 + }, + { + "epoch": 3.586442220200182, + "grad_norm": 0.09078263429941769, + "learning_rate": 1.8469229212911361e-06, + "loss": 0.0009, + "step": 7883 + }, + { + "epoch": 3.5868971792538673, + "grad_norm": 0.11573700181135811, + "learning_rate": 1.8458137799494857e-06, + "loss": 0.0006, + "step": 7884 + }, + { + "epoch": 3.587352138307552, + "grad_norm": 0.14498108831423218, + "learning_rate": 1.8447048963536908e-06, + "loss": 0.0011, + "step": 7885 + }, + { + "epoch": 3.5878070973612375, + "grad_norm": 0.0983077504321627, + "learning_rate": 1.8435962705943628e-06, + "loss": 0.0012, + "step": 7886 + }, + { + "epoch": 3.588262056414923, + "grad_norm": 0.3040958269451847, + "learning_rate": 1.8424879027620913e-06, + "loss": 0.0035, + "step": 7887 + }, + { + "epoch": 3.5887170154686077, + "grad_norm": 0.33771776352356425, + "learning_rate": 1.8413797929474515e-06, + "loss": 0.0058, + "step": 7888 + }, + { + "epoch": 3.589171974522293, + "grad_norm": 0.19632673041120569, + "learning_rate": 1.8402719412409904e-06, + "loss": 0.0041, + "step": 7889 + }, + { + "epoch": 3.5896269335759783, + "grad_norm": 0.22556125154060808, + "learning_rate": 1.8391643477332367e-06, + "loss": 0.0052, + "step": 7890 + }, + { + "epoch": 3.590081892629663, + "grad_norm": 0.16926219057323924, + "learning_rate": 1.8380570125146968e-06, + "loss": 0.0024, + "step": 7891 + }, + { + "epoch": 3.5905368516833485, + "grad_norm": 0.2824537823006504, + "learning_rate": 1.8369499356758592e-06, + "loss": 0.0023, + "step": 7892 + }, + { + "epoch": 3.5909918107370338, + "grad_norm": 0.2907508835953086, + "learning_rate": 1.83584311730719e-06, + "loss": 0.0039, + "step": 7893 + }, + { + "epoch": 3.5914467697907186, + "grad_norm": 0.39093362476591115, + "learning_rate": 1.8347365574991317e-06, + "loss": 0.0078, + "step": 7894 + }, + { + "epoch": 3.591901728844404, + "grad_norm": 0.21703840186106568, + "learning_rate": 1.8336302563421083e-06, + "loss": 0.0022, + "step": 7895 + }, + { + "epoch": 3.5923566878980893, + "grad_norm": 0.30147901577619696, + "learning_rate": 1.8325242139265192e-06, + "loss": 0.0021, + "step": 7896 + }, + { + "epoch": 3.592811646951774, + "grad_norm": 0.3958194604680672, + "learning_rate": 1.8314184303427484e-06, + "loss": 0.0024, + "step": 7897 + }, + { + "epoch": 3.5932666060054594, + "grad_norm": 0.11906332456432153, + "learning_rate": 1.830312905681153e-06, + "loss": 0.001, + "step": 7898 + }, + { + "epoch": 3.5937215650591448, + "grad_norm": 0.24610698279584053, + "learning_rate": 1.8292076400320746e-06, + "loss": 0.0028, + "step": 7899 + }, + { + "epoch": 3.5941765241128296, + "grad_norm": 0.21960152706123276, + "learning_rate": 1.8281026334858287e-06, + "loss": 0.0058, + "step": 7900 + }, + { + "epoch": 3.594631483166515, + "grad_norm": 0.1790492789710415, + "learning_rate": 1.8269978861327097e-06, + "loss": 0.0017, + "step": 7901 + }, + { + "epoch": 3.5950864422202002, + "grad_norm": 0.32270491510546734, + "learning_rate": 1.8258933980629957e-06, + "loss": 0.0045, + "step": 7902 + }, + { + "epoch": 3.595541401273885, + "grad_norm": 0.22759880789456016, + "learning_rate": 1.8247891693669394e-06, + "loss": 0.0013, + "step": 7903 + }, + { + "epoch": 3.5959963603275704, + "grad_norm": 0.5020246925343776, + "learning_rate": 1.8236852001347728e-06, + "loss": 0.009, + "step": 7904 + }, + { + "epoch": 3.5964513193812557, + "grad_norm": 0.08959192426591882, + "learning_rate": 1.8225814904567057e-06, + "loss": 0.0018, + "step": 7905 + }, + { + "epoch": 3.5969062784349406, + "grad_norm": 0.1358588758268522, + "learning_rate": 1.821478040422932e-06, + "loss": 0.0018, + "step": 7906 + }, + { + "epoch": 3.597361237488626, + "grad_norm": 0.13531912890129033, + "learning_rate": 1.8203748501236173e-06, + "loss": 0.0019, + "step": 7907 + }, + { + "epoch": 3.597816196542311, + "grad_norm": 0.19270377915409623, + "learning_rate": 1.8192719196489123e-06, + "loss": 0.0021, + "step": 7908 + }, + { + "epoch": 3.598271155595996, + "grad_norm": 0.07305642433280565, + "learning_rate": 1.8181692490889418e-06, + "loss": 0.0006, + "step": 7909 + }, + { + "epoch": 3.5987261146496814, + "grad_norm": 0.6475939154284186, + "learning_rate": 1.8170668385338113e-06, + "loss": 0.0101, + "step": 7910 + }, + { + "epoch": 3.5991810737033667, + "grad_norm": 0.2817718685548278, + "learning_rate": 1.8159646880736036e-06, + "loss": 0.002, + "step": 7911 + }, + { + "epoch": 3.599636032757052, + "grad_norm": 0.26163244962577487, + "learning_rate": 1.8148627977983817e-06, + "loss": 0.003, + "step": 7912 + }, + { + "epoch": 3.600090991810737, + "grad_norm": 0.1663032670859049, + "learning_rate": 1.8137611677981904e-06, + "loss": 0.002, + "step": 7913 + }, + { + "epoch": 3.600545950864422, + "grad_norm": 0.2697742508820168, + "learning_rate": 1.8126597981630474e-06, + "loss": 0.0027, + "step": 7914 + }, + { + "epoch": 3.6010009099181075, + "grad_norm": 0.228454837473255, + "learning_rate": 1.8115586889829517e-06, + "loss": 0.0025, + "step": 7915 + }, + { + "epoch": 3.6014558689717924, + "grad_norm": 0.22736868081564568, + "learning_rate": 1.8104578403478794e-06, + "loss": 0.0027, + "step": 7916 + }, + { + "epoch": 3.6019108280254777, + "grad_norm": 0.2126793718386172, + "learning_rate": 1.8093572523477904e-06, + "loss": 0.0018, + "step": 7917 + }, + { + "epoch": 3.602365787079163, + "grad_norm": 0.26137924219270103, + "learning_rate": 1.8082569250726179e-06, + "loss": 0.0051, + "step": 7918 + }, + { + "epoch": 3.6028207461328483, + "grad_norm": 0.1931062911369165, + "learning_rate": 1.8071568586122733e-06, + "loss": 0.0051, + "step": 7919 + }, + { + "epoch": 3.603275705186533, + "grad_norm": 0.1332182802182727, + "learning_rate": 1.806057053056654e-06, + "loss": 0.0018, + "step": 7920 + }, + { + "epoch": 3.6037306642402185, + "grad_norm": 0.21635079571295868, + "learning_rate": 1.8049575084956266e-06, + "loss": 0.0035, + "step": 7921 + }, + { + "epoch": 3.604185623293904, + "grad_norm": 0.3301968051704073, + "learning_rate": 1.8038582250190445e-06, + "loss": 0.0053, + "step": 7922 + }, + { + "epoch": 3.6046405823475887, + "grad_norm": 0.37722946083861625, + "learning_rate": 1.802759202716735e-06, + "loss": 0.0031, + "step": 7923 + }, + { + "epoch": 3.605095541401274, + "grad_norm": 0.1582487674967987, + "learning_rate": 1.8016604416785043e-06, + "loss": 0.0026, + "step": 7924 + }, + { + "epoch": 3.6055505004549593, + "grad_norm": 0.26033746966944415, + "learning_rate": 1.8005619419941372e-06, + "loss": 0.0049, + "step": 7925 + }, + { + "epoch": 3.606005459508644, + "grad_norm": 0.6888754296815256, + "learning_rate": 1.7994637037534003e-06, + "loss": 0.0048, + "step": 7926 + }, + { + "epoch": 3.6064604185623295, + "grad_norm": 0.17130425264638557, + "learning_rate": 1.798365727046037e-06, + "loss": 0.0015, + "step": 7927 + }, + { + "epoch": 3.6069153776160148, + "grad_norm": 0.18849109292648805, + "learning_rate": 1.797268011961768e-06, + "loss": 0.0033, + "step": 7928 + }, + { + "epoch": 3.6073703366696996, + "grad_norm": 0.22730640992688486, + "learning_rate": 1.7961705585902945e-06, + "loss": 0.0025, + "step": 7929 + }, + { + "epoch": 3.607825295723385, + "grad_norm": 0.4602083535252957, + "learning_rate": 1.7950733670212921e-06, + "loss": 0.0104, + "step": 7930 + }, + { + "epoch": 3.6082802547770703, + "grad_norm": 0.5105317258007144, + "learning_rate": 1.7939764373444223e-06, + "loss": 0.0069, + "step": 7931 + }, + { + "epoch": 3.608735213830755, + "grad_norm": 0.24042606244188347, + "learning_rate": 1.7928797696493204e-06, + "loss": 0.0032, + "step": 7932 + }, + { + "epoch": 3.6091901728844404, + "grad_norm": 0.37057816240218805, + "learning_rate": 1.7917833640255988e-06, + "loss": 0.0045, + "step": 7933 + }, + { + "epoch": 3.6096451319381258, + "grad_norm": 0.18373547765656353, + "learning_rate": 1.790687220562854e-06, + "loss": 0.0019, + "step": 7934 + }, + { + "epoch": 3.6101000909918106, + "grad_norm": 0.16822871767184222, + "learning_rate": 1.7895913393506547e-06, + "loss": 0.002, + "step": 7935 + }, + { + "epoch": 3.610555050045496, + "grad_norm": 0.9120036062826911, + "learning_rate": 1.7884957204785546e-06, + "loss": 0.0301, + "step": 7936 + }, + { + "epoch": 3.6110100090991812, + "grad_norm": 0.18320868208439767, + "learning_rate": 1.7874003640360816e-06, + "loss": 0.0025, + "step": 7937 + }, + { + "epoch": 3.611464968152866, + "grad_norm": 0.38050779048434175, + "learning_rate": 1.7863052701127426e-06, + "loss": 0.003, + "step": 7938 + }, + { + "epoch": 3.6119199272065514, + "grad_norm": 0.3179570980024949, + "learning_rate": 1.785210438798024e-06, + "loss": 0.0051, + "step": 7939 + }, + { + "epoch": 3.6123748862602367, + "grad_norm": 0.2466072577767851, + "learning_rate": 1.7841158701813872e-06, + "loss": 0.0039, + "step": 7940 + }, + { + "epoch": 3.6128298453139216, + "grad_norm": 0.2947505388506555, + "learning_rate": 1.7830215643522818e-06, + "loss": 0.0045, + "step": 7941 + }, + { + "epoch": 3.613284804367607, + "grad_norm": 0.1261551585566211, + "learning_rate": 1.7819275214001263e-06, + "loss": 0.0014, + "step": 7942 + }, + { + "epoch": 3.613739763421292, + "grad_norm": 0.376726998153657, + "learning_rate": 1.7808337414143218e-06, + "loss": 0.0082, + "step": 7943 + }, + { + "epoch": 3.614194722474977, + "grad_norm": 0.19294331327608216, + "learning_rate": 1.779740224484246e-06, + "loss": 0.0028, + "step": 7944 + }, + { + "epoch": 3.6146496815286624, + "grad_norm": 0.07491206481414248, + "learning_rate": 1.7786469706992542e-06, + "loss": 0.0007, + "step": 7945 + }, + { + "epoch": 3.6151046405823477, + "grad_norm": 0.3390974285656792, + "learning_rate": 1.7775539801486868e-06, + "loss": 0.0043, + "step": 7946 + }, + { + "epoch": 3.6155595996360326, + "grad_norm": 0.28674394998514324, + "learning_rate": 1.7764612529218538e-06, + "loss": 0.0028, + "step": 7947 + }, + { + "epoch": 3.616014558689718, + "grad_norm": 0.19709725492535993, + "learning_rate": 1.7753687891080517e-06, + "loss": 0.0017, + "step": 7948 + }, + { + "epoch": 3.616469517743403, + "grad_norm": 0.11491431563050752, + "learning_rate": 1.77427658879655e-06, + "loss": 0.0008, + "step": 7949 + }, + { + "epoch": 3.616924476797088, + "grad_norm": 0.32651563866476874, + "learning_rate": 1.7731846520765962e-06, + "loss": 0.0074, + "step": 7950 + }, + { + "epoch": 3.6173794358507734, + "grad_norm": 0.3122939474441103, + "learning_rate": 1.7720929790374225e-06, + "loss": 0.0038, + "step": 7951 + }, + { + "epoch": 3.6178343949044587, + "grad_norm": 0.37129059422111754, + "learning_rate": 1.7710015697682332e-06, + "loss": 0.0049, + "step": 7952 + }, + { + "epoch": 3.6182893539581436, + "grad_norm": 0.2332594988565104, + "learning_rate": 1.7699104243582133e-06, + "loss": 0.004, + "step": 7953 + }, + { + "epoch": 3.618744313011829, + "grad_norm": 0.09925808851744253, + "learning_rate": 1.768819542896525e-06, + "loss": 0.0009, + "step": 7954 + }, + { + "epoch": 3.619199272065514, + "grad_norm": 0.31852423028987725, + "learning_rate": 1.7677289254723124e-06, + "loss": 0.0084, + "step": 7955 + }, + { + "epoch": 3.619654231119199, + "grad_norm": 0.2439366003167992, + "learning_rate": 1.766638572174696e-06, + "loss": 0.0031, + "step": 7956 + }, + { + "epoch": 3.6201091901728844, + "grad_norm": 0.2634222070561646, + "learning_rate": 1.7655484830927743e-06, + "loss": 0.003, + "step": 7957 + }, + { + "epoch": 3.6205641492265697, + "grad_norm": 0.14678145811124868, + "learning_rate": 1.7644586583156237e-06, + "loss": 0.0017, + "step": 7958 + }, + { + "epoch": 3.6210191082802545, + "grad_norm": 0.07528320992932991, + "learning_rate": 1.7633690979322986e-06, + "loss": 0.0008, + "step": 7959 + }, + { + "epoch": 3.62147406733394, + "grad_norm": 0.19911298795477617, + "learning_rate": 1.7622798020318354e-06, + "loss": 0.0029, + "step": 7960 + }, + { + "epoch": 3.621929026387625, + "grad_norm": 0.17775238000837332, + "learning_rate": 1.7611907707032444e-06, + "loss": 0.0015, + "step": 7961 + }, + { + "epoch": 3.62238398544131, + "grad_norm": 0.10451954365692988, + "learning_rate": 1.7601020040355182e-06, + "loss": 0.001, + "step": 7962 + }, + { + "epoch": 3.6228389444949953, + "grad_norm": 0.3015366289508726, + "learning_rate": 1.7590135021176258e-06, + "loss": 0.0021, + "step": 7963 + }, + { + "epoch": 3.6232939035486806, + "grad_norm": 0.5457679843974467, + "learning_rate": 1.7579252650385114e-06, + "loss": 0.0042, + "step": 7964 + }, + { + "epoch": 3.623748862602366, + "grad_norm": 0.10670382197885002, + "learning_rate": 1.7568372928871053e-06, + "loss": 0.0007, + "step": 7965 + }, + { + "epoch": 3.624203821656051, + "grad_norm": 0.25542607912323717, + "learning_rate": 1.7557495857523098e-06, + "loss": 0.0021, + "step": 7966 + }, + { + "epoch": 3.624658780709736, + "grad_norm": 0.19018887122334371, + "learning_rate": 1.7546621437230071e-06, + "loss": 0.0008, + "step": 7967 + }, + { + "epoch": 3.6251137397634214, + "grad_norm": 0.3269645623973134, + "learning_rate": 1.7535749668880563e-06, + "loss": 0.0039, + "step": 7968 + }, + { + "epoch": 3.6255686988171063, + "grad_norm": 0.1778802123521978, + "learning_rate": 1.752488055336299e-06, + "loss": 0.002, + "step": 7969 + }, + { + "epoch": 3.6260236578707916, + "grad_norm": 0.1470751908641553, + "learning_rate": 1.7514014091565535e-06, + "loss": 0.0012, + "step": 7970 + }, + { + "epoch": 3.626478616924477, + "grad_norm": 0.36141962596043486, + "learning_rate": 1.7503150284376142e-06, + "loss": 0.0088, + "step": 7971 + }, + { + "epoch": 3.6269335759781622, + "grad_norm": 0.2724935962348301, + "learning_rate": 1.7492289132682554e-06, + "loss": 0.0027, + "step": 7972 + }, + { + "epoch": 3.627388535031847, + "grad_norm": 0.29107249038998095, + "learning_rate": 1.7481430637372298e-06, + "loss": 0.0041, + "step": 7973 + }, + { + "epoch": 3.6278434940855324, + "grad_norm": 0.13134200796209422, + "learning_rate": 1.7470574799332658e-06, + "loss": 0.0008, + "step": 7974 + }, + { + "epoch": 3.6282984531392177, + "grad_norm": 0.4090567447742412, + "learning_rate": 1.7459721619450743e-06, + "loss": 0.0064, + "step": 7975 + }, + { + "epoch": 3.6287534121929026, + "grad_norm": 0.20124356356315617, + "learning_rate": 1.7448871098613446e-06, + "loss": 0.0016, + "step": 7976 + }, + { + "epoch": 3.629208371246588, + "grad_norm": 0.15918472248295626, + "learning_rate": 1.7438023237707403e-06, + "loss": 0.0022, + "step": 7977 + }, + { + "epoch": 3.629663330300273, + "grad_norm": 0.20179590639826822, + "learning_rate": 1.7427178037619046e-06, + "loss": 0.0019, + "step": 7978 + }, + { + "epoch": 3.630118289353958, + "grad_norm": 0.11466157783046807, + "learning_rate": 1.7416335499234593e-06, + "loss": 0.0008, + "step": 7979 + }, + { + "epoch": 3.6305732484076434, + "grad_norm": 0.3021783554825343, + "learning_rate": 1.740549562344007e-06, + "loss": 0.0053, + "step": 7980 + }, + { + "epoch": 3.6310282074613287, + "grad_norm": 0.31465809219243523, + "learning_rate": 1.739465841112125e-06, + "loss": 0.003, + "step": 7981 + }, + { + "epoch": 3.6314831665150136, + "grad_norm": 0.17248181798206758, + "learning_rate": 1.7383823863163685e-06, + "loss": 0.002, + "step": 7982 + }, + { + "epoch": 3.631938125568699, + "grad_norm": 0.22433963000160198, + "learning_rate": 1.7372991980452753e-06, + "loss": 0.0019, + "step": 7983 + }, + { + "epoch": 3.632393084622384, + "grad_norm": 0.23492332221439477, + "learning_rate": 1.7362162763873557e-06, + "loss": 0.0066, + "step": 7984 + }, + { + "epoch": 3.632848043676069, + "grad_norm": 0.07711611078754155, + "learning_rate": 1.7351336214311055e-06, + "loss": 0.0007, + "step": 7985 + }, + { + "epoch": 3.6333030027297544, + "grad_norm": 0.36145023436743134, + "learning_rate": 1.7340512332649905e-06, + "loss": 0.0049, + "step": 7986 + }, + { + "epoch": 3.6337579617834397, + "grad_norm": 0.16489699725141843, + "learning_rate": 1.7329691119774606e-06, + "loss": 0.0035, + "step": 7987 + }, + { + "epoch": 3.6342129208371245, + "grad_norm": 0.07212114779748191, + "learning_rate": 1.7318872576569396e-06, + "loss": 0.0007, + "step": 7988 + }, + { + "epoch": 3.63466787989081, + "grad_norm": 0.15231096026220012, + "learning_rate": 1.7308056703918324e-06, + "loss": 0.001, + "step": 7989 + }, + { + "epoch": 3.635122838944495, + "grad_norm": 0.2462416491193636, + "learning_rate": 1.7297243502705247e-06, + "loss": 0.0038, + "step": 7990 + }, + { + "epoch": 3.63557779799818, + "grad_norm": 0.18542740966174365, + "learning_rate": 1.7286432973813744e-06, + "loss": 0.0016, + "step": 7991 + }, + { + "epoch": 3.6360327570518653, + "grad_norm": 0.3046324467845501, + "learning_rate": 1.7275625118127203e-06, + "loss": 0.0042, + "step": 7992 + }, + { + "epoch": 3.6364877161055507, + "grad_norm": 0.15460995311172396, + "learning_rate": 1.726481993652878e-06, + "loss": 0.0021, + "step": 7993 + }, + { + "epoch": 3.6369426751592355, + "grad_norm": 0.28842121453416686, + "learning_rate": 1.725401742990146e-06, + "loss": 0.0045, + "step": 7994 + }, + { + "epoch": 3.637397634212921, + "grad_norm": 0.20665613874312755, + "learning_rate": 1.7243217599127953e-06, + "loss": 0.003, + "step": 7995 + }, + { + "epoch": 3.637852593266606, + "grad_norm": 0.20818202801446936, + "learning_rate": 1.7232420445090765e-06, + "loss": 0.0028, + "step": 7996 + }, + { + "epoch": 3.638307552320291, + "grad_norm": 0.27258458591774315, + "learning_rate": 1.7221625968672212e-06, + "loss": 0.0056, + "step": 7997 + }, + { + "epoch": 3.6387625113739763, + "grad_norm": 0.06792279719819914, + "learning_rate": 1.7210834170754342e-06, + "loss": 0.0008, + "step": 7998 + }, + { + "epoch": 3.6392174704276616, + "grad_norm": 0.26170658844395617, + "learning_rate": 1.7200045052219044e-06, + "loss": 0.0031, + "step": 7999 + }, + { + "epoch": 3.6396724294813465, + "grad_norm": 0.09230026412826589, + "learning_rate": 1.7189258613947945e-06, + "loss": 0.0009, + "step": 8000 + }, + { + "epoch": 3.640127388535032, + "grad_norm": 0.3259220385786079, + "learning_rate": 1.7178474856822457e-06, + "loss": 0.0061, + "step": 8001 + }, + { + "epoch": 3.640582347588717, + "grad_norm": 0.4320532035267409, + "learning_rate": 1.716769378172377e-06, + "loss": 0.0056, + "step": 8002 + }, + { + "epoch": 3.641037306642402, + "grad_norm": 0.2802724673216492, + "learning_rate": 1.715691538953288e-06, + "loss": 0.0052, + "step": 8003 + }, + { + "epoch": 3.6414922656960873, + "grad_norm": 0.38924744222299684, + "learning_rate": 1.7146139681130557e-06, + "loss": 0.0033, + "step": 8004 + }, + { + "epoch": 3.6419472247497726, + "grad_norm": 0.5772229222338136, + "learning_rate": 1.7135366657397335e-06, + "loss": 0.0056, + "step": 8005 + }, + { + "epoch": 3.6424021838034575, + "grad_norm": 0.19733163639925325, + "learning_rate": 1.7124596319213532e-06, + "loss": 0.004, + "step": 8006 + }, + { + "epoch": 3.642857142857143, + "grad_norm": 0.2547387512497557, + "learning_rate": 1.7113828667459242e-06, + "loss": 0.0032, + "step": 8007 + }, + { + "epoch": 3.643312101910828, + "grad_norm": 0.20781975644724582, + "learning_rate": 1.7103063703014372e-06, + "loss": 0.0018, + "step": 8008 + }, + { + "epoch": 3.643767060964513, + "grad_norm": 0.18637383797821747, + "learning_rate": 1.709230142675858e-06, + "loss": 0.0038, + "step": 8009 + }, + { + "epoch": 3.6442220200181983, + "grad_norm": 0.24572797256397994, + "learning_rate": 1.7081541839571285e-06, + "loss": 0.0021, + "step": 8010 + }, + { + "epoch": 3.6446769790718836, + "grad_norm": 0.38239060579784134, + "learning_rate": 1.7070784942331753e-06, + "loss": 0.0044, + "step": 8011 + }, + { + "epoch": 3.6451319381255685, + "grad_norm": 0.30692054265423613, + "learning_rate": 1.7060030735918963e-06, + "loss": 0.0132, + "step": 8012 + }, + { + "epoch": 3.6455868971792538, + "grad_norm": 0.11880490417194524, + "learning_rate": 1.7049279221211696e-06, + "loss": 0.0009, + "step": 8013 + }, + { + "epoch": 3.646041856232939, + "grad_norm": 0.26737440297820153, + "learning_rate": 1.7038530399088538e-06, + "loss": 0.0073, + "step": 8014 + }, + { + "epoch": 3.646496815286624, + "grad_norm": 0.15832893330790165, + "learning_rate": 1.7027784270427822e-06, + "loss": 0.0012, + "step": 8015 + }, + { + "epoch": 3.6469517743403093, + "grad_norm": 0.22136569941688236, + "learning_rate": 1.701704083610768e-06, + "loss": 0.0011, + "step": 8016 + }, + { + "epoch": 3.6474067333939946, + "grad_norm": 0.15909962981524223, + "learning_rate": 1.700630009700599e-06, + "loss": 0.002, + "step": 8017 + }, + { + "epoch": 3.6478616924476794, + "grad_norm": 0.14564063705330876, + "learning_rate": 1.6995562054000459e-06, + "loss": 0.002, + "step": 8018 + }, + { + "epoch": 3.6483166515013647, + "grad_norm": 0.26482202710233876, + "learning_rate": 1.6984826707968566e-06, + "loss": 0.0035, + "step": 8019 + }, + { + "epoch": 3.64877161055505, + "grad_norm": 0.0418102071312338, + "learning_rate": 1.6974094059787544e-06, + "loss": 0.0003, + "step": 8020 + }, + { + "epoch": 3.6492265696087354, + "grad_norm": 0.2365175368480019, + "learning_rate": 1.6963364110334407e-06, + "loss": 0.002, + "step": 8021 + }, + { + "epoch": 3.6496815286624202, + "grad_norm": 0.18684998075962683, + "learning_rate": 1.6952636860485944e-06, + "loss": 0.0023, + "step": 8022 + }, + { + "epoch": 3.6501364877161055, + "grad_norm": 0.3184178875394961, + "learning_rate": 1.694191231111878e-06, + "loss": 0.0029, + "step": 8023 + }, + { + "epoch": 3.650591446769791, + "grad_norm": 0.3355623492931194, + "learning_rate": 1.6931190463109231e-06, + "loss": 0.0105, + "step": 8024 + }, + { + "epoch": 3.6510464058234757, + "grad_norm": 0.1440589001383886, + "learning_rate": 1.6920471317333476e-06, + "loss": 0.0013, + "step": 8025 + }, + { + "epoch": 3.651501364877161, + "grad_norm": 0.2217250047560994, + "learning_rate": 1.6909754874667422e-06, + "loss": 0.0031, + "step": 8026 + }, + { + "epoch": 3.6519563239308463, + "grad_norm": 0.07925508024230195, + "learning_rate": 1.689904113598675e-06, + "loss": 0.0006, + "step": 8027 + }, + { + "epoch": 3.6524112829845317, + "grad_norm": 0.23261960660017306, + "learning_rate": 1.6888330102166966e-06, + "loss": 0.0021, + "step": 8028 + }, + { + "epoch": 3.6528662420382165, + "grad_norm": 0.3905453071373662, + "learning_rate": 1.687762177408332e-06, + "loss": 0.0065, + "step": 8029 + }, + { + "epoch": 3.653321201091902, + "grad_norm": 0.643251963619827, + "learning_rate": 1.6866916152610836e-06, + "loss": 0.0067, + "step": 8030 + }, + { + "epoch": 3.653776160145587, + "grad_norm": 0.03428659765720983, + "learning_rate": 1.6856213238624324e-06, + "loss": 0.0003, + "step": 8031 + }, + { + "epoch": 3.654231119199272, + "grad_norm": 0.2959269804489528, + "learning_rate": 1.6845513032998389e-06, + "loss": 0.0018, + "step": 8032 + }, + { + "epoch": 3.6546860782529573, + "grad_norm": 0.20584201943035565, + "learning_rate": 1.6834815536607424e-06, + "loss": 0.0031, + "step": 8033 + }, + { + "epoch": 3.6551410373066426, + "grad_norm": 0.115908312475804, + "learning_rate": 1.6824120750325562e-06, + "loss": 0.0008, + "step": 8034 + }, + { + "epoch": 3.6555959963603275, + "grad_norm": 0.43484571627949, + "learning_rate": 1.6813428675026728e-06, + "loss": 0.0057, + "step": 8035 + }, + { + "epoch": 3.656050955414013, + "grad_norm": 0.2567462497537789, + "learning_rate": 1.6802739311584615e-06, + "loss": 0.0039, + "step": 8036 + }, + { + "epoch": 3.656505914467698, + "grad_norm": 0.07363581620074681, + "learning_rate": 1.6792052660872749e-06, + "loss": 0.0009, + "step": 8037 + }, + { + "epoch": 3.656960873521383, + "grad_norm": 0.138696403855459, + "learning_rate": 1.6781368723764352e-06, + "loss": 0.0014, + "step": 8038 + }, + { + "epoch": 3.6574158325750683, + "grad_norm": 0.24446612454313577, + "learning_rate": 1.677068750113251e-06, + "loss": 0.0056, + "step": 8039 + }, + { + "epoch": 3.6578707916287536, + "grad_norm": 0.19522023722854703, + "learning_rate": 1.6760008993850024e-06, + "loss": 0.0024, + "step": 8040 + }, + { + "epoch": 3.6583257506824385, + "grad_norm": 0.2936910334169346, + "learning_rate": 1.6749333202789474e-06, + "loss": 0.0055, + "step": 8041 + }, + { + "epoch": 3.658780709736124, + "grad_norm": 0.2594041278930158, + "learning_rate": 1.673866012882327e-06, + "loss": 0.0039, + "step": 8042 + }, + { + "epoch": 3.659235668789809, + "grad_norm": 0.256542480986476, + "learning_rate": 1.6727989772823556e-06, + "loss": 0.002, + "step": 8043 + }, + { + "epoch": 3.659690627843494, + "grad_norm": 0.05315534347099337, + "learning_rate": 1.6717322135662262e-06, + "loss": 0.0007, + "step": 8044 + }, + { + "epoch": 3.6601455868971793, + "grad_norm": 0.22839544401872766, + "learning_rate": 1.6706657218211087e-06, + "loss": 0.0018, + "step": 8045 + }, + { + "epoch": 3.6606005459508646, + "grad_norm": 0.2895208201895555, + "learning_rate": 1.6695995021341526e-06, + "loss": 0.0058, + "step": 8046 + }, + { + "epoch": 3.6610555050045495, + "grad_norm": 0.27113804214984594, + "learning_rate": 1.6685335545924874e-06, + "loss": 0.0017, + "step": 8047 + }, + { + "epoch": 3.6615104640582348, + "grad_norm": 0.20008265866310768, + "learning_rate": 1.667467879283215e-06, + "loss": 0.001, + "step": 8048 + }, + { + "epoch": 3.66196542311192, + "grad_norm": 0.36496285492596525, + "learning_rate": 1.6664024762934183e-06, + "loss": 0.0057, + "step": 8049 + }, + { + "epoch": 3.662420382165605, + "grad_norm": 0.22717373419090375, + "learning_rate": 1.6653373457101562e-06, + "loss": 0.0019, + "step": 8050 + }, + { + "epoch": 3.6628753412192903, + "grad_norm": 0.16463648813290777, + "learning_rate": 1.6642724876204658e-06, + "loss": 0.0008, + "step": 8051 + }, + { + "epoch": 3.6633303002729756, + "grad_norm": 0.34860700898580577, + "learning_rate": 1.663207902111364e-06, + "loss": 0.0023, + "step": 8052 + }, + { + "epoch": 3.6637852593266604, + "grad_norm": 0.1897024665327669, + "learning_rate": 1.6621435892698452e-06, + "loss": 0.0008, + "step": 8053 + }, + { + "epoch": 3.6642402183803457, + "grad_norm": 0.2538522484070175, + "learning_rate": 1.661079549182878e-06, + "loss": 0.0024, + "step": 8054 + }, + { + "epoch": 3.664695177434031, + "grad_norm": 0.13633945559507546, + "learning_rate": 1.660015781937412e-06, + "loss": 0.0009, + "step": 8055 + }, + { + "epoch": 3.665150136487716, + "grad_norm": 0.05667062613502048, + "learning_rate": 1.6589522876203717e-06, + "loss": 0.0005, + "step": 8056 + }, + { + "epoch": 3.6656050955414012, + "grad_norm": 0.9296746127418903, + "learning_rate": 1.6578890663186637e-06, + "loss": 0.0218, + "step": 8057 + }, + { + "epoch": 3.6660600545950865, + "grad_norm": 0.16994736190385232, + "learning_rate": 1.6568261181191687e-06, + "loss": 0.0018, + "step": 8058 + }, + { + "epoch": 3.6665150136487714, + "grad_norm": 0.20970246978452106, + "learning_rate": 1.6557634431087433e-06, + "loss": 0.0046, + "step": 8059 + }, + { + "epoch": 3.6669699727024567, + "grad_norm": 0.3238131627995037, + "learning_rate": 1.6547010413742292e-06, + "loss": 0.0021, + "step": 8060 + }, + { + "epoch": 3.667424931756142, + "grad_norm": 0.34887487019946234, + "learning_rate": 1.653638913002437e-06, + "loss": 0.0025, + "step": 8061 + }, + { + "epoch": 3.667879890809827, + "grad_norm": 0.21373648284817673, + "learning_rate": 1.6525770580801626e-06, + "loss": 0.0023, + "step": 8062 + }, + { + "epoch": 3.668334849863512, + "grad_norm": 0.26000117786148963, + "learning_rate": 1.6515154766941738e-06, + "loss": 0.0079, + "step": 8063 + }, + { + "epoch": 3.6687898089171975, + "grad_norm": 0.2316191875370578, + "learning_rate": 1.6504541689312186e-06, + "loss": 0.0019, + "step": 8064 + }, + { + "epoch": 3.6692447679708824, + "grad_norm": 0.2233798900373078, + "learning_rate": 1.6493931348780211e-06, + "loss": 0.0056, + "step": 8065 + }, + { + "epoch": 3.6696997270245677, + "grad_norm": 0.3112488602656874, + "learning_rate": 1.6483323746212854e-06, + "loss": 0.0028, + "step": 8066 + }, + { + "epoch": 3.670154686078253, + "grad_norm": 0.07397477435980715, + "learning_rate": 1.6472718882476934e-06, + "loss": 0.0005, + "step": 8067 + }, + { + "epoch": 3.670609645131938, + "grad_norm": 0.21372708894174636, + "learning_rate": 1.6462116758439018e-06, + "loss": 0.0023, + "step": 8068 + }, + { + "epoch": 3.671064604185623, + "grad_norm": 0.11587699128581494, + "learning_rate": 1.6451517374965465e-06, + "loss": 0.0005, + "step": 8069 + }, + { + "epoch": 3.6715195632393085, + "grad_norm": 0.23728992814098315, + "learning_rate": 1.6440920732922395e-06, + "loss": 0.0027, + "step": 8070 + }, + { + "epoch": 3.6719745222929934, + "grad_norm": 0.2498084519480596, + "learning_rate": 1.6430326833175747e-06, + "loss": 0.0007, + "step": 8071 + }, + { + "epoch": 3.6724294813466787, + "grad_norm": 0.2033303305879698, + "learning_rate": 1.6419735676591192e-06, + "loss": 0.0018, + "step": 8072 + }, + { + "epoch": 3.672884440400364, + "grad_norm": 0.09336571464239883, + "learning_rate": 1.640914726403417e-06, + "loss": 0.0007, + "step": 8073 + }, + { + "epoch": 3.673339399454049, + "grad_norm": 0.23707085945734882, + "learning_rate": 1.6398561596369955e-06, + "loss": 0.0015, + "step": 8074 + }, + { + "epoch": 3.673794358507734, + "grad_norm": 0.18057782993863786, + "learning_rate": 1.6387978674463528e-06, + "loss": 0.0025, + "step": 8075 + }, + { + "epoch": 3.6742493175614195, + "grad_norm": 0.1877921614722231, + "learning_rate": 1.6377398499179714e-06, + "loss": 0.004, + "step": 8076 + }, + { + "epoch": 3.674704276615105, + "grad_norm": 0.18128735508895163, + "learning_rate": 1.6366821071383054e-06, + "loss": 0.0034, + "step": 8077 + }, + { + "epoch": 3.6751592356687897, + "grad_norm": 0.4882826172237317, + "learning_rate": 1.6356246391937886e-06, + "loss": 0.0077, + "step": 8078 + }, + { + "epoch": 3.675614194722475, + "grad_norm": 0.1487306066833931, + "learning_rate": 1.6345674461708316e-06, + "loss": 0.0027, + "step": 8079 + }, + { + "epoch": 3.6760691537761603, + "grad_norm": 0.2807606238463104, + "learning_rate": 1.633510528155825e-06, + "loss": 0.0033, + "step": 8080 + }, + { + "epoch": 3.676524112829845, + "grad_norm": 0.6105029591761452, + "learning_rate": 1.6324538852351363e-06, + "loss": 0.0054, + "step": 8081 + }, + { + "epoch": 3.6769790718835305, + "grad_norm": 0.31109172914956656, + "learning_rate": 1.6313975174951085e-06, + "loss": 0.0094, + "step": 8082 + }, + { + "epoch": 3.6774340309372158, + "grad_norm": 0.07440605530093407, + "learning_rate": 1.6303414250220634e-06, + "loss": 0.0004, + "step": 8083 + }, + { + "epoch": 3.677888989990901, + "grad_norm": 0.2508677618629711, + "learning_rate": 1.6292856079022995e-06, + "loss": 0.0071, + "step": 8084 + }, + { + "epoch": 3.678343949044586, + "grad_norm": 0.11832609471604028, + "learning_rate": 1.6282300662220918e-06, + "loss": 0.0008, + "step": 8085 + }, + { + "epoch": 3.6787989080982713, + "grad_norm": 0.1776783336318369, + "learning_rate": 1.6271748000676984e-06, + "loss": 0.0029, + "step": 8086 + }, + { + "epoch": 3.6792538671519566, + "grad_norm": 0.18193949597186546, + "learning_rate": 1.6261198095253471e-06, + "loss": 0.0008, + "step": 8087 + }, + { + "epoch": 3.6797088262056414, + "grad_norm": 0.4059062526547424, + "learning_rate": 1.62506509468125e-06, + "loss": 0.0034, + "step": 8088 + }, + { + "epoch": 3.6801637852593267, + "grad_norm": 0.09739922675376, + "learning_rate": 1.6240106556215928e-06, + "loss": 0.001, + "step": 8089 + }, + { + "epoch": 3.680618744313012, + "grad_norm": 0.18279291383397514, + "learning_rate": 1.6229564924325368e-06, + "loss": 0.0013, + "step": 8090 + }, + { + "epoch": 3.681073703366697, + "grad_norm": 0.07938296220456725, + "learning_rate": 1.621902605200228e-06, + "loss": 0.0009, + "step": 8091 + }, + { + "epoch": 3.6815286624203822, + "grad_norm": 0.4295045590791644, + "learning_rate": 1.6208489940107824e-06, + "loss": 0.0133, + "step": 8092 + }, + { + "epoch": 3.6819836214740675, + "grad_norm": 0.13567236342070368, + "learning_rate": 1.6197956589502967e-06, + "loss": 0.0011, + "step": 8093 + }, + { + "epoch": 3.6824385805277524, + "grad_norm": 0.237125267127452, + "learning_rate": 1.6187426001048434e-06, + "loss": 0.0051, + "step": 8094 + }, + { + "epoch": 3.6828935395814377, + "grad_norm": 0.0921147379700198, + "learning_rate": 1.6176898175604756e-06, + "loss": 0.001, + "step": 8095 + }, + { + "epoch": 3.683348498635123, + "grad_norm": 0.45689369392245593, + "learning_rate": 1.616637311403223e-06, + "loss": 0.004, + "step": 8096 + }, + { + "epoch": 3.683803457688808, + "grad_norm": 0.0788016536765662, + "learning_rate": 1.61558508171909e-06, + "loss": 0.0003, + "step": 8097 + }, + { + "epoch": 3.684258416742493, + "grad_norm": 0.3398853455146879, + "learning_rate": 1.6145331285940603e-06, + "loss": 0.0041, + "step": 8098 + }, + { + "epoch": 3.6847133757961785, + "grad_norm": 0.2181534229852083, + "learning_rate": 1.613481452114093e-06, + "loss": 0.0045, + "step": 8099 + }, + { + "epoch": 3.6851683348498634, + "grad_norm": 0.15704801098337517, + "learning_rate": 1.6124300523651298e-06, + "loss": 0.0012, + "step": 8100 + }, + { + "epoch": 3.6856232939035487, + "grad_norm": 0.4041858294364238, + "learning_rate": 1.611378929433083e-06, + "loss": 0.0079, + "step": 8101 + }, + { + "epoch": 3.686078252957234, + "grad_norm": 0.32553459565945025, + "learning_rate": 1.6103280834038488e-06, + "loss": 0.0035, + "step": 8102 + }, + { + "epoch": 3.686533212010919, + "grad_norm": 0.1751796301007587, + "learning_rate": 1.609277514363296e-06, + "loss": 0.001, + "step": 8103 + }, + { + "epoch": 3.686988171064604, + "grad_norm": 0.17804523665708427, + "learning_rate": 1.6082272223972705e-06, + "loss": 0.0019, + "step": 8104 + }, + { + "epoch": 3.6874431301182895, + "grad_norm": 0.4336705678119796, + "learning_rate": 1.6071772075916015e-06, + "loss": 0.0039, + "step": 8105 + }, + { + "epoch": 3.6878980891719744, + "grad_norm": 0.17729697885502382, + "learning_rate": 1.6061274700320884e-06, + "loss": 0.002, + "step": 8106 + }, + { + "epoch": 3.6883530482256597, + "grad_norm": 0.3183949662331046, + "learning_rate": 1.6050780098045126e-06, + "loss": 0.002, + "step": 8107 + }, + { + "epoch": 3.688808007279345, + "grad_norm": 0.40763133301734894, + "learning_rate": 1.6040288269946286e-06, + "loss": 0.0173, + "step": 8108 + }, + { + "epoch": 3.68926296633303, + "grad_norm": 0.2337827993703391, + "learning_rate": 1.6029799216881726e-06, + "loss": 0.003, + "step": 8109 + }, + { + "epoch": 3.689717925386715, + "grad_norm": 0.17871030055580414, + "learning_rate": 1.6019312939708588e-06, + "loss": 0.0039, + "step": 8110 + }, + { + "epoch": 3.6901728844404005, + "grad_norm": 0.09853067750951439, + "learning_rate": 1.6008829439283736e-06, + "loss": 0.0008, + "step": 8111 + }, + { + "epoch": 3.6906278434940853, + "grad_norm": 0.14965381136569647, + "learning_rate": 1.5998348716463834e-06, + "loss": 0.0011, + "step": 8112 + }, + { + "epoch": 3.6910828025477707, + "grad_norm": 0.08220692669256428, + "learning_rate": 1.5987870772105318e-06, + "loss": 0.0008, + "step": 8113 + }, + { + "epoch": 3.691537761601456, + "grad_norm": 0.35664315589602713, + "learning_rate": 1.5977395607064417e-06, + "loss": 0.0073, + "step": 8114 + }, + { + "epoch": 3.691992720655141, + "grad_norm": 0.5066667458416546, + "learning_rate": 1.596692322219709e-06, + "loss": 0.0103, + "step": 8115 + }, + { + "epoch": 3.692447679708826, + "grad_norm": 0.24439120936355235, + "learning_rate": 1.595645361835912e-06, + "loss": 0.0044, + "step": 8116 + }, + { + "epoch": 3.6929026387625115, + "grad_norm": 0.07884822485577705, + "learning_rate": 1.5945986796406016e-06, + "loss": 0.0007, + "step": 8117 + }, + { + "epoch": 3.6933575978161963, + "grad_norm": 0.3541099715146426, + "learning_rate": 1.593552275719309e-06, + "loss": 0.0038, + "step": 8118 + }, + { + "epoch": 3.6938125568698816, + "grad_norm": 0.2727755087120767, + "learning_rate": 1.5925061501575395e-06, + "loss": 0.0038, + "step": 8119 + }, + { + "epoch": 3.694267515923567, + "grad_norm": 0.3138646035146125, + "learning_rate": 1.5914603030407804e-06, + "loss": 0.0037, + "step": 8120 + }, + { + "epoch": 3.694722474977252, + "grad_norm": 0.36335095432959763, + "learning_rate": 1.5904147344544928e-06, + "loss": 0.0032, + "step": 8121 + }, + { + "epoch": 3.695177434030937, + "grad_norm": 0.3252670029539399, + "learning_rate": 1.589369444484114e-06, + "loss": 0.0071, + "step": 8122 + }, + { + "epoch": 3.6956323930846224, + "grad_norm": 0.17722976254242095, + "learning_rate": 1.5883244332150633e-06, + "loss": 0.0016, + "step": 8123 + }, + { + "epoch": 3.6960873521383073, + "grad_norm": 0.21793092151447244, + "learning_rate": 1.5872797007327317e-06, + "loss": 0.0009, + "step": 8124 + }, + { + "epoch": 3.6965423111919926, + "grad_norm": 0.2453560035524618, + "learning_rate": 1.5862352471224924e-06, + "loss": 0.0022, + "step": 8125 + }, + { + "epoch": 3.696997270245678, + "grad_norm": 0.2802691468742985, + "learning_rate": 1.5851910724696928e-06, + "loss": 0.0031, + "step": 8126 + }, + { + "epoch": 3.697452229299363, + "grad_norm": 0.2714291926028238, + "learning_rate": 1.5841471768596572e-06, + "loss": 0.0036, + "step": 8127 + }, + { + "epoch": 3.697907188353048, + "grad_norm": 0.1686808066666587, + "learning_rate": 1.5831035603776868e-06, + "loss": 0.0017, + "step": 8128 + }, + { + "epoch": 3.6983621474067334, + "grad_norm": 0.3231812351426906, + "learning_rate": 1.5820602231090632e-06, + "loss": 0.0055, + "step": 8129 + }, + { + "epoch": 3.6988171064604187, + "grad_norm": 0.1333500586490056, + "learning_rate": 1.5810171651390444e-06, + "loss": 0.0014, + "step": 8130 + }, + { + "epoch": 3.6992720655141036, + "grad_norm": 0.13082552313962106, + "learning_rate": 1.5799743865528628e-06, + "loss": 0.001, + "step": 8131 + }, + { + "epoch": 3.699727024567789, + "grad_norm": 0.3775599061582529, + "learning_rate": 1.5789318874357296e-06, + "loss": 0.0036, + "step": 8132 + }, + { + "epoch": 3.700181983621474, + "grad_norm": 0.30490658121115527, + "learning_rate": 1.5778896678728317e-06, + "loss": 0.0061, + "step": 8133 + }, + { + "epoch": 3.700636942675159, + "grad_norm": 0.47510571798683454, + "learning_rate": 1.576847727949337e-06, + "loss": 0.0066, + "step": 8134 + }, + { + "epoch": 3.7010919017288444, + "grad_norm": 0.6801230806386573, + "learning_rate": 1.5758060677503879e-06, + "loss": 0.0082, + "step": 8135 + }, + { + "epoch": 3.7015468607825297, + "grad_norm": 0.45149534343087655, + "learning_rate": 1.5747646873611016e-06, + "loss": 0.0049, + "step": 8136 + }, + { + "epoch": 3.702001819836215, + "grad_norm": 0.08401680854604955, + "learning_rate": 1.5737235868665785e-06, + "loss": 0.001, + "step": 8137 + }, + { + "epoch": 3.7024567788899, + "grad_norm": 0.15202672484368168, + "learning_rate": 1.5726827663518896e-06, + "loss": 0.0012, + "step": 8138 + }, + { + "epoch": 3.702911737943585, + "grad_norm": 0.18858998473549005, + "learning_rate": 1.5716422259020887e-06, + "loss": 0.0035, + "step": 8139 + }, + { + "epoch": 3.7033666969972705, + "grad_norm": 0.14498326515775617, + "learning_rate": 1.5706019656022026e-06, + "loss": 0.0016, + "step": 8140 + }, + { + "epoch": 3.7038216560509554, + "grad_norm": 0.11211350283108794, + "learning_rate": 1.5695619855372368e-06, + "loss": 0.0015, + "step": 8141 + }, + { + "epoch": 3.7042766151046407, + "grad_norm": 0.1600336110538318, + "learning_rate": 1.5685222857921723e-06, + "loss": 0.0021, + "step": 8142 + }, + { + "epoch": 3.704731574158326, + "grad_norm": 0.14413269398601997, + "learning_rate": 1.5674828664519703e-06, + "loss": 0.0014, + "step": 8143 + }, + { + "epoch": 3.705186533212011, + "grad_norm": 0.02911049526708629, + "learning_rate": 1.5664437276015692e-06, + "loss": 0.0003, + "step": 8144 + }, + { + "epoch": 3.705641492265696, + "grad_norm": 0.21041983313542045, + "learning_rate": 1.5654048693258805e-06, + "loss": 0.0016, + "step": 8145 + }, + { + "epoch": 3.7060964513193815, + "grad_norm": 0.304157616495816, + "learning_rate": 1.5643662917097956e-06, + "loss": 0.0089, + "step": 8146 + }, + { + "epoch": 3.7065514103730663, + "grad_norm": 0.2615352239004681, + "learning_rate": 1.5633279948381802e-06, + "loss": 0.0025, + "step": 8147 + }, + { + "epoch": 3.7070063694267517, + "grad_norm": 0.27691913461101136, + "learning_rate": 1.5622899787958833e-06, + "loss": 0.0049, + "step": 8148 + }, + { + "epoch": 3.707461328480437, + "grad_norm": 0.114621549822485, + "learning_rate": 1.5612522436677246e-06, + "loss": 0.0009, + "step": 8149 + }, + { + "epoch": 3.707916287534122, + "grad_norm": 0.5218349233293614, + "learning_rate": 1.5602147895385017e-06, + "loss": 0.0053, + "step": 8150 + }, + { + "epoch": 3.708371246587807, + "grad_norm": 0.143448234861761, + "learning_rate": 1.5591776164929934e-06, + "loss": 0.0019, + "step": 8151 + }, + { + "epoch": 3.7088262056414925, + "grad_norm": 0.3200096242316086, + "learning_rate": 1.5581407246159508e-06, + "loss": 0.0034, + "step": 8152 + }, + { + "epoch": 3.7092811646951773, + "grad_norm": 0.2091570438104385, + "learning_rate": 1.557104113992106e-06, + "loss": 0.0012, + "step": 8153 + }, + { + "epoch": 3.7097361237488626, + "grad_norm": 0.6784293195389853, + "learning_rate": 1.556067784706165e-06, + "loss": 0.0039, + "step": 8154 + }, + { + "epoch": 3.710191082802548, + "grad_norm": 0.20314360753897395, + "learning_rate": 1.5550317368428125e-06, + "loss": 0.0026, + "step": 8155 + }, + { + "epoch": 3.710646041856233, + "grad_norm": 0.3866123087416582, + "learning_rate": 1.5539959704867086e-06, + "loss": 0.0041, + "step": 8156 + }, + { + "epoch": 3.711101000909918, + "grad_norm": 0.22551460892807806, + "learning_rate": 1.5529604857224906e-06, + "loss": 0.0039, + "step": 8157 + }, + { + "epoch": 3.7115559599636034, + "grad_norm": 0.27405852867762126, + "learning_rate": 1.5519252826347747e-06, + "loss": 0.0025, + "step": 8158 + }, + { + "epoch": 3.7120109190172883, + "grad_norm": 0.3565648505068062, + "learning_rate": 1.5508903613081556e-06, + "loss": 0.0039, + "step": 8159 + }, + { + "epoch": 3.7124658780709736, + "grad_norm": 0.1750687873245234, + "learning_rate": 1.5498557218271992e-06, + "loss": 0.0011, + "step": 8160 + }, + { + "epoch": 3.712920837124659, + "grad_norm": 0.1352421615919305, + "learning_rate": 1.5488213642764532e-06, + "loss": 0.0028, + "step": 8161 + }, + { + "epoch": 3.713375796178344, + "grad_norm": 0.202231039324682, + "learning_rate": 1.5477872887404382e-06, + "loss": 0.0013, + "step": 8162 + }, + { + "epoch": 3.713830755232029, + "grad_norm": 0.24228110286430646, + "learning_rate": 1.5467534953036572e-06, + "loss": 0.0013, + "step": 8163 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.22773854068191546, + "learning_rate": 1.545719984050585e-06, + "loss": 0.004, + "step": 8164 + }, + { + "epoch": 3.7147406733393993, + "grad_norm": 0.2722710264362533, + "learning_rate": 1.544686755065677e-06, + "loss": 0.0043, + "step": 8165 + }, + { + "epoch": 3.7151956323930846, + "grad_norm": 0.21025749108698538, + "learning_rate": 1.5436538084333635e-06, + "loss": 0.0018, + "step": 8166 + }, + { + "epoch": 3.71565059144677, + "grad_norm": 0.3469508797420825, + "learning_rate": 1.5426211442380513e-06, + "loss": 0.0067, + "step": 8167 + }, + { + "epoch": 3.7161055505004548, + "grad_norm": 0.2183605406128026, + "learning_rate": 1.5415887625641264e-06, + "loss": 0.0012, + "step": 8168 + }, + { + "epoch": 3.71656050955414, + "grad_norm": 0.3629561416787955, + "learning_rate": 1.5405566634959502e-06, + "loss": 0.0103, + "step": 8169 + }, + { + "epoch": 3.7170154686078254, + "grad_norm": 0.24427040328021304, + "learning_rate": 1.5395248471178609e-06, + "loss": 0.0079, + "step": 8170 + }, + { + "epoch": 3.7174704276615103, + "grad_norm": 0.576094040455724, + "learning_rate": 1.5384933135141716e-06, + "loss": 0.0046, + "step": 8171 + }, + { + "epoch": 3.7179253867151956, + "grad_norm": 0.40737176015701665, + "learning_rate": 1.5374620627691772e-06, + "loss": 0.006, + "step": 8172 + }, + { + "epoch": 3.718380345768881, + "grad_norm": 0.6436231479428369, + "learning_rate": 1.5364310949671479e-06, + "loss": 0.0053, + "step": 8173 + }, + { + "epoch": 3.7188353048225657, + "grad_norm": 0.20807227823414023, + "learning_rate": 1.535400410192328e-06, + "loss": 0.0023, + "step": 8174 + }, + { + "epoch": 3.719290263876251, + "grad_norm": 0.2543467034198584, + "learning_rate": 1.5343700085289404e-06, + "loss": 0.007, + "step": 8175 + }, + { + "epoch": 3.7197452229299364, + "grad_norm": 0.16510189582639836, + "learning_rate": 1.533339890061184e-06, + "loss": 0.0022, + "step": 8176 + }, + { + "epoch": 3.7202001819836212, + "grad_norm": 0.33333657645841364, + "learning_rate": 1.5323100548732378e-06, + "loss": 0.0025, + "step": 8177 + }, + { + "epoch": 3.7206551410373065, + "grad_norm": 0.11701843279350102, + "learning_rate": 1.5312805030492522e-06, + "loss": 0.0011, + "step": 8178 + }, + { + "epoch": 3.721110100090992, + "grad_norm": 0.2816843841249177, + "learning_rate": 1.530251234673361e-06, + "loss": 0.0048, + "step": 8179 + }, + { + "epoch": 3.7215650591446767, + "grad_norm": 0.31726450797492217, + "learning_rate": 1.5292222498296699e-06, + "loss": 0.0083, + "step": 8180 + }, + { + "epoch": 3.722020018198362, + "grad_norm": 0.3807455373911141, + "learning_rate": 1.5281935486022609e-06, + "loss": 0.008, + "step": 8181 + }, + { + "epoch": 3.7224749772520473, + "grad_norm": 0.3212487310627123, + "learning_rate": 1.5271651310751979e-06, + "loss": 0.0056, + "step": 8182 + }, + { + "epoch": 3.722929936305732, + "grad_norm": 0.1564252903280299, + "learning_rate": 1.5261369973325173e-06, + "loss": 0.0028, + "step": 8183 + }, + { + "epoch": 3.7233848953594175, + "grad_norm": 0.2053746298802703, + "learning_rate": 1.5251091474582337e-06, + "loss": 0.0038, + "step": 8184 + }, + { + "epoch": 3.723839854413103, + "grad_norm": 0.11648992545027957, + "learning_rate": 1.5240815815363363e-06, + "loss": 0.0008, + "step": 8185 + }, + { + "epoch": 3.724294813466788, + "grad_norm": 0.2745476945485481, + "learning_rate": 1.5230542996507951e-06, + "loss": 0.0054, + "step": 8186 + }, + { + "epoch": 3.724749772520473, + "grad_norm": 0.3817400439444805, + "learning_rate": 1.5220273018855565e-06, + "loss": 0.012, + "step": 8187 + }, + { + "epoch": 3.7252047315741583, + "grad_norm": 0.22620017285496435, + "learning_rate": 1.52100058832454e-06, + "loss": 0.0021, + "step": 8188 + }, + { + "epoch": 3.7256596906278436, + "grad_norm": 0.23609135270221862, + "learning_rate": 1.5199741590516449e-06, + "loss": 0.004, + "step": 8189 + }, + { + "epoch": 3.7261146496815285, + "grad_norm": 0.20238060883779085, + "learning_rate": 1.518948014150745e-06, + "loss": 0.0049, + "step": 8190 + }, + { + "epoch": 3.726569608735214, + "grad_norm": 0.26415892273089064, + "learning_rate": 1.517922153705692e-06, + "loss": 0.0058, + "step": 8191 + }, + { + "epoch": 3.727024567788899, + "grad_norm": 0.29755471660063304, + "learning_rate": 1.516896577800316e-06, + "loss": 0.0047, + "step": 8192 + }, + { + "epoch": 3.7274795268425844, + "grad_norm": 0.20014416089509243, + "learning_rate": 1.5158712865184233e-06, + "loss": 0.0015, + "step": 8193 + }, + { + "epoch": 3.7279344858962693, + "grad_norm": 0.17539450605563967, + "learning_rate": 1.5148462799437952e-06, + "loss": 0.0012, + "step": 8194 + }, + { + "epoch": 3.7283894449499546, + "grad_norm": 0.26433059508538664, + "learning_rate": 1.51382155816019e-06, + "loss": 0.0023, + "step": 8195 + }, + { + "epoch": 3.72884440400364, + "grad_norm": 0.3998943484669266, + "learning_rate": 1.512797121251342e-06, + "loss": 0.0092, + "step": 8196 + }, + { + "epoch": 3.729299363057325, + "grad_norm": 0.18197829623673958, + "learning_rate": 1.5117729693009669e-06, + "loss": 0.0017, + "step": 8197 + }, + { + "epoch": 3.72975432211101, + "grad_norm": 0.19680326817115126, + "learning_rate": 1.5107491023927523e-06, + "loss": 0.0049, + "step": 8198 + }, + { + "epoch": 3.7302092811646954, + "grad_norm": 0.30362799043348515, + "learning_rate": 1.5097255206103617e-06, + "loss": 0.0042, + "step": 8199 + }, + { + "epoch": 3.7306642402183803, + "grad_norm": 0.3683103144138177, + "learning_rate": 1.5087022240374417e-06, + "loss": 0.0082, + "step": 8200 + }, + { + "epoch": 3.7311191992720656, + "grad_norm": 0.32816132325601566, + "learning_rate": 1.5076792127576074e-06, + "loss": 0.0031, + "step": 8201 + }, + { + "epoch": 3.731574158325751, + "grad_norm": 0.18096225981957614, + "learning_rate": 1.5066564868544587e-06, + "loss": 0.0055, + "step": 8202 + }, + { + "epoch": 3.7320291173794358, + "grad_norm": 0.3953806872657199, + "learning_rate": 1.5056340464115653e-06, + "loss": 0.0049, + "step": 8203 + }, + { + "epoch": 3.732484076433121, + "grad_norm": 0.2410208030875781, + "learning_rate": 1.504611891512478e-06, + "loss": 0.0039, + "step": 8204 + }, + { + "epoch": 3.7329390354868064, + "grad_norm": 0.3200857663453817, + "learning_rate": 1.5035900222407197e-06, + "loss": 0.0139, + "step": 8205 + }, + { + "epoch": 3.7333939945404913, + "grad_norm": 0.12241225924537646, + "learning_rate": 1.5025684386797957e-06, + "loss": 0.0009, + "step": 8206 + }, + { + "epoch": 3.7338489535941766, + "grad_norm": 0.23738776768358139, + "learning_rate": 1.501547140913186e-06, + "loss": 0.0066, + "step": 8207 + }, + { + "epoch": 3.734303912647862, + "grad_norm": 0.3078124828880467, + "learning_rate": 1.5005261290243445e-06, + "loss": 0.0034, + "step": 8208 + }, + { + "epoch": 3.7347588717015467, + "grad_norm": 0.21207605564515963, + "learning_rate": 1.499505403096705e-06, + "loss": 0.0034, + "step": 8209 + }, + { + "epoch": 3.735213830755232, + "grad_norm": 0.1886399794420435, + "learning_rate": 1.498484963213674e-06, + "loss": 0.0035, + "step": 8210 + }, + { + "epoch": 3.7356687898089174, + "grad_norm": 0.13404938480224354, + "learning_rate": 1.4974648094586408e-06, + "loss": 0.0016, + "step": 8211 + }, + { + "epoch": 3.7361237488626022, + "grad_norm": 0.13570478317238555, + "learning_rate": 1.4964449419149657e-06, + "loss": 0.0011, + "step": 8212 + }, + { + "epoch": 3.7365787079162875, + "grad_norm": 0.2640883114846296, + "learning_rate": 1.4954253606659868e-06, + "loss": 0.0032, + "step": 8213 + }, + { + "epoch": 3.737033666969973, + "grad_norm": 0.12715737080694361, + "learning_rate": 1.4944060657950227e-06, + "loss": 0.0015, + "step": 8214 + }, + { + "epoch": 3.7374886260236577, + "grad_norm": 0.11243982104331475, + "learning_rate": 1.4933870573853616e-06, + "loss": 0.0018, + "step": 8215 + }, + { + "epoch": 3.737943585077343, + "grad_norm": 0.2366247478197491, + "learning_rate": 1.4923683355202761e-06, + "loss": 0.0025, + "step": 8216 + }, + { + "epoch": 3.7383985441310283, + "grad_norm": 0.38383205174148627, + "learning_rate": 1.4913499002830106e-06, + "loss": 0.0043, + "step": 8217 + }, + { + "epoch": 3.738853503184713, + "grad_norm": 0.24298155221940393, + "learning_rate": 1.4903317517567856e-06, + "loss": 0.0037, + "step": 8218 + }, + { + "epoch": 3.7393084622383985, + "grad_norm": 0.1498799121177, + "learning_rate": 1.4893138900247989e-06, + "loss": 0.0038, + "step": 8219 + }, + { + "epoch": 3.739763421292084, + "grad_norm": 0.23918706551676, + "learning_rate": 1.4882963151702272e-06, + "loss": 0.0033, + "step": 8220 + }, + { + "epoch": 3.7402183803457687, + "grad_norm": 0.30941297576676496, + "learning_rate": 1.4872790272762234e-06, + "loss": 0.0045, + "step": 8221 + }, + { + "epoch": 3.740673339399454, + "grad_norm": 0.02228059940016356, + "learning_rate": 1.4862620264259142e-06, + "loss": 0.0002, + "step": 8222 + }, + { + "epoch": 3.7411282984531393, + "grad_norm": 0.09162878481786349, + "learning_rate": 1.4852453127024042e-06, + "loss": 0.0012, + "step": 8223 + }, + { + "epoch": 3.741583257506824, + "grad_norm": 0.17340731818855434, + "learning_rate": 1.4842288861887732e-06, + "loss": 0.0036, + "step": 8224 + }, + { + "epoch": 3.7420382165605095, + "grad_norm": 0.3006127733886893, + "learning_rate": 1.4832127469680823e-06, + "loss": 0.0053, + "step": 8225 + }, + { + "epoch": 3.742493175614195, + "grad_norm": 0.16527196990196308, + "learning_rate": 1.482196895123364e-06, + "loss": 0.0011, + "step": 8226 + }, + { + "epoch": 3.7429481346678797, + "grad_norm": 0.08198894408476844, + "learning_rate": 1.4811813307376271e-06, + "loss": 0.0008, + "step": 8227 + }, + { + "epoch": 3.743403093721565, + "grad_norm": 0.22901257424191804, + "learning_rate": 1.4801660538938633e-06, + "loss": 0.0016, + "step": 8228 + }, + { + "epoch": 3.7438580527752503, + "grad_norm": 0.21571914853572963, + "learning_rate": 1.479151064675034e-06, + "loss": 0.0032, + "step": 8229 + }, + { + "epoch": 3.744313011828935, + "grad_norm": 0.25112107983233956, + "learning_rate": 1.4781363631640777e-06, + "loss": 0.0035, + "step": 8230 + }, + { + "epoch": 3.7447679708826205, + "grad_norm": 0.1768253906755621, + "learning_rate": 1.4771219494439148e-06, + "loss": 0.0014, + "step": 8231 + }, + { + "epoch": 3.745222929936306, + "grad_norm": 0.3420258053192273, + "learning_rate": 1.4761078235974374e-06, + "loss": 0.0044, + "step": 8232 + }, + { + "epoch": 3.7456778889899907, + "grad_norm": 0.3239689734533046, + "learning_rate": 1.4750939857075147e-06, + "loss": 0.0042, + "step": 8233 + }, + { + "epoch": 3.746132848043676, + "grad_norm": 0.43480442220027377, + "learning_rate": 1.4740804358569916e-06, + "loss": 0.0039, + "step": 8234 + }, + { + "epoch": 3.7465878070973613, + "grad_norm": 0.2456495241766836, + "learning_rate": 1.4730671741286923e-06, + "loss": 0.0021, + "step": 8235 + }, + { + "epoch": 3.747042766151046, + "grad_norm": 0.2388725584202595, + "learning_rate": 1.4720542006054178e-06, + "loss": 0.0038, + "step": 8236 + }, + { + "epoch": 3.7474977252047315, + "grad_norm": 0.23301887945847435, + "learning_rate": 1.471041515369942e-06, + "loss": 0.0036, + "step": 8237 + }, + { + "epoch": 3.7479526842584168, + "grad_norm": 0.17543611593853725, + "learning_rate": 1.4700291185050164e-06, + "loss": 0.0025, + "step": 8238 + }, + { + "epoch": 3.7484076433121016, + "grad_norm": 0.26831077593504893, + "learning_rate": 1.4690170100933692e-06, + "loss": 0.0064, + "step": 8239 + }, + { + "epoch": 3.748862602365787, + "grad_norm": 0.1613720374260712, + "learning_rate": 1.4680051902177073e-06, + "loss": 0.0017, + "step": 8240 + }, + { + "epoch": 3.7493175614194723, + "grad_norm": 0.1760288579310544, + "learning_rate": 1.4669936589607092e-06, + "loss": 0.0019, + "step": 8241 + }, + { + "epoch": 3.7497725204731576, + "grad_norm": 0.46488460827072936, + "learning_rate": 1.4659824164050363e-06, + "loss": 0.0042, + "step": 8242 + }, + { + "epoch": 3.7502274795268424, + "grad_norm": 0.1676211883362639, + "learning_rate": 1.4649714626333206e-06, + "loss": 0.0021, + "step": 8243 + }, + { + "epoch": 3.7506824385805277, + "grad_norm": 0.2138238858681873, + "learning_rate": 1.4639607977281716e-06, + "loss": 0.0051, + "step": 8244 + }, + { + "epoch": 3.751137397634213, + "grad_norm": 0.19386391574906334, + "learning_rate": 1.462950421772179e-06, + "loss": 0.0026, + "step": 8245 + }, + { + "epoch": 3.7515923566878984, + "grad_norm": 0.30300347693637725, + "learning_rate": 1.4619403348479045e-06, + "loss": 0.0048, + "step": 8246 + }, + { + "epoch": 3.7520473157415832, + "grad_norm": 0.1297172804999276, + "learning_rate": 1.4609305370378867e-06, + "loss": 0.0011, + "step": 8247 + }, + { + "epoch": 3.7525022747952685, + "grad_norm": 0.09863438238686904, + "learning_rate": 1.4599210284246452e-06, + "loss": 0.0011, + "step": 8248 + }, + { + "epoch": 3.752957233848954, + "grad_norm": 0.03359004436647919, + "learning_rate": 1.4589118090906684e-06, + "loss": 0.0002, + "step": 8249 + }, + { + "epoch": 3.7534121929026387, + "grad_norm": 0.26956004939560535, + "learning_rate": 1.4579028791184286e-06, + "loss": 0.0032, + "step": 8250 + }, + { + "epoch": 3.753867151956324, + "grad_norm": 0.15930204581725518, + "learning_rate": 1.4568942385903695e-06, + "loss": 0.0016, + "step": 8251 + }, + { + "epoch": 3.7543221110100093, + "grad_norm": 0.4694956725111323, + "learning_rate": 1.455885887588913e-06, + "loss": 0.0104, + "step": 8252 + }, + { + "epoch": 3.754777070063694, + "grad_norm": 0.12168435882028572, + "learning_rate": 1.4548778261964552e-06, + "loss": 0.0016, + "step": 8253 + }, + { + "epoch": 3.7552320291173795, + "grad_norm": 0.24991258948964318, + "learning_rate": 1.4538700544953715e-06, + "loss": 0.0035, + "step": 8254 + }, + { + "epoch": 3.755686988171065, + "grad_norm": 0.16153944886396596, + "learning_rate": 1.4528625725680146e-06, + "loss": 0.0024, + "step": 8255 + }, + { + "epoch": 3.7561419472247497, + "grad_norm": 0.2742082198279699, + "learning_rate": 1.4518553804967094e-06, + "loss": 0.003, + "step": 8256 + }, + { + "epoch": 3.756596906278435, + "grad_norm": 0.21943261188891286, + "learning_rate": 1.4508484783637588e-06, + "loss": 0.0039, + "step": 8257 + }, + { + "epoch": 3.7570518653321203, + "grad_norm": 0.40702778771251596, + "learning_rate": 1.4498418662514418e-06, + "loss": 0.0088, + "step": 8258 + }, + { + "epoch": 3.757506824385805, + "grad_norm": 0.32585313264357846, + "learning_rate": 1.4488355442420166e-06, + "loss": 0.0113, + "step": 8259 + }, + { + "epoch": 3.7579617834394905, + "grad_norm": 0.21473201616683416, + "learning_rate": 1.4478295124177133e-06, + "loss": 0.0021, + "step": 8260 + }, + { + "epoch": 3.758416742493176, + "grad_norm": 0.12206682889929317, + "learning_rate": 1.4468237708607397e-06, + "loss": 0.0012, + "step": 8261 + }, + { + "epoch": 3.7588717015468607, + "grad_norm": 0.08287384135932507, + "learning_rate": 1.4458183196532833e-06, + "loss": 0.001, + "step": 8262 + }, + { + "epoch": 3.759326660600546, + "grad_norm": 0.09582947209908133, + "learning_rate": 1.4448131588775026e-06, + "loss": 0.0014, + "step": 8263 + }, + { + "epoch": 3.7597816196542313, + "grad_norm": 0.04708513880623716, + "learning_rate": 1.4438082886155347e-06, + "loss": 0.0003, + "step": 8264 + }, + { + "epoch": 3.760236578707916, + "grad_norm": 0.4133360982095818, + "learning_rate": 1.4428037089494946e-06, + "loss": 0.011, + "step": 8265 + }, + { + "epoch": 3.7606915377616015, + "grad_norm": 0.22983535595942226, + "learning_rate": 1.4417994199614716e-06, + "loss": 0.0024, + "step": 8266 + }, + { + "epoch": 3.761146496815287, + "grad_norm": 0.30595199856839916, + "learning_rate": 1.4407954217335312e-06, + "loss": 0.006, + "step": 8267 + }, + { + "epoch": 3.7616014558689717, + "grad_norm": 0.28199614845423543, + "learning_rate": 1.4397917143477146e-06, + "loss": 0.0108, + "step": 8268 + }, + { + "epoch": 3.762056414922657, + "grad_norm": 0.4448458371879213, + "learning_rate": 1.4387882978860412e-06, + "loss": 0.0047, + "step": 8269 + }, + { + "epoch": 3.7625113739763423, + "grad_norm": 0.09856097507428246, + "learning_rate": 1.437785172430507e-06, + "loss": 0.001, + "step": 8270 + }, + { + "epoch": 3.762966333030027, + "grad_norm": 0.34702751501721035, + "learning_rate": 1.436782338063082e-06, + "loss": 0.0068, + "step": 8271 + }, + { + "epoch": 3.7634212920837125, + "grad_norm": 0.3459473693444633, + "learning_rate": 1.4357797948657126e-06, + "loss": 0.0067, + "step": 8272 + }, + { + "epoch": 3.7638762511373978, + "grad_norm": 0.1000976792686654, + "learning_rate": 1.4347775429203215e-06, + "loss": 0.0013, + "step": 8273 + }, + { + "epoch": 3.7643312101910826, + "grad_norm": 0.12371231083008664, + "learning_rate": 1.43377558230881e-06, + "loss": 0.0011, + "step": 8274 + }, + { + "epoch": 3.764786169244768, + "grad_norm": 0.6908311924322025, + "learning_rate": 1.432773913113052e-06, + "loss": 0.0054, + "step": 8275 + }, + { + "epoch": 3.7652411282984533, + "grad_norm": 0.4250515237535653, + "learning_rate": 1.431772535414902e-06, + "loss": 0.0035, + "step": 8276 + }, + { + "epoch": 3.765696087352138, + "grad_norm": 0.2581985701494445, + "learning_rate": 1.430771449296186e-06, + "loss": 0.0041, + "step": 8277 + }, + { + "epoch": 3.7661510464058234, + "grad_norm": 0.22040733457862796, + "learning_rate": 1.4297706548387074e-06, + "loss": 0.0023, + "step": 8278 + }, + { + "epoch": 3.7666060054595087, + "grad_norm": 0.3236483680786687, + "learning_rate": 1.4287701521242493e-06, + "loss": 0.0068, + "step": 8279 + }, + { + "epoch": 3.7670609645131936, + "grad_norm": 0.24037905617484695, + "learning_rate": 1.427769941234567e-06, + "loss": 0.0042, + "step": 8280 + }, + { + "epoch": 3.767515923566879, + "grad_norm": 0.08354808399660561, + "learning_rate": 1.426770022251393e-06, + "loss": 0.0008, + "step": 8281 + }, + { + "epoch": 3.7679708826205642, + "grad_norm": 0.2610880841770453, + "learning_rate": 1.4257703952564344e-06, + "loss": 0.0041, + "step": 8282 + }, + { + "epoch": 3.768425841674249, + "grad_norm": 0.37604638537049256, + "learning_rate": 1.4247710603313785e-06, + "loss": 0.0071, + "step": 8283 + }, + { + "epoch": 3.7688808007279344, + "grad_norm": 0.21831117309163686, + "learning_rate": 1.4237720175578873e-06, + "loss": 0.0052, + "step": 8284 + }, + { + "epoch": 3.7693357597816197, + "grad_norm": 0.22212900825780707, + "learning_rate": 1.4227732670175963e-06, + "loss": 0.0049, + "step": 8285 + }, + { + "epoch": 3.7697907188353046, + "grad_norm": 0.2692388023971826, + "learning_rate": 1.4217748087921202e-06, + "loss": 0.0071, + "step": 8286 + }, + { + "epoch": 3.77024567788899, + "grad_norm": 0.4182857081945022, + "learning_rate": 1.4207766429630453e-06, + "loss": 0.0016, + "step": 8287 + }, + { + "epoch": 3.770700636942675, + "grad_norm": 0.12810129756546632, + "learning_rate": 1.4197787696119414e-06, + "loss": 0.001, + "step": 8288 + }, + { + "epoch": 3.77115559599636, + "grad_norm": 0.17563388815871478, + "learning_rate": 1.4187811888203468e-06, + "loss": 0.0024, + "step": 8289 + }, + { + "epoch": 3.7716105550500454, + "grad_norm": 0.28458649825509774, + "learning_rate": 1.4177839006697818e-06, + "loss": 0.0059, + "step": 8290 + }, + { + "epoch": 3.7720655141037307, + "grad_norm": 0.16885255227897664, + "learning_rate": 1.41678690524174e-06, + "loss": 0.0022, + "step": 8291 + }, + { + "epoch": 3.7725204731574156, + "grad_norm": 0.3219999041978894, + "learning_rate": 1.415790202617689e-06, + "loss": 0.0022, + "step": 8292 + }, + { + "epoch": 3.772975432211101, + "grad_norm": 0.2659269968266485, + "learning_rate": 1.4147937928790778e-06, + "loss": 0.0064, + "step": 8293 + }, + { + "epoch": 3.773430391264786, + "grad_norm": 0.12091441191771202, + "learning_rate": 1.4137976761073269e-06, + "loss": 0.0007, + "step": 8294 + }, + { + "epoch": 3.7738853503184715, + "grad_norm": 0.10390529647426298, + "learning_rate": 1.4128018523838355e-06, + "loss": 0.0007, + "step": 8295 + }, + { + "epoch": 3.7743403093721564, + "grad_norm": 0.39299677619197315, + "learning_rate": 1.4118063217899746e-06, + "loss": 0.0026, + "step": 8296 + }, + { + "epoch": 3.7747952684258417, + "grad_norm": 0.38015204142968195, + "learning_rate": 1.4108110844070977e-06, + "loss": 0.007, + "step": 8297 + }, + { + "epoch": 3.775250227479527, + "grad_norm": 0.129346704475973, + "learning_rate": 1.4098161403165317e-06, + "loss": 0.0007, + "step": 8298 + }, + { + "epoch": 3.775705186533212, + "grad_norm": 0.2602422039857844, + "learning_rate": 1.4088214895995777e-06, + "loss": 0.0057, + "step": 8299 + }, + { + "epoch": 3.776160145586897, + "grad_norm": 0.27091709573008166, + "learning_rate": 1.4078271323375137e-06, + "loss": 0.0034, + "step": 8300 + }, + { + "epoch": 3.7766151046405825, + "grad_norm": 0.7141580285719477, + "learning_rate": 1.4068330686115943e-06, + "loss": 0.0135, + "step": 8301 + }, + { + "epoch": 3.777070063694268, + "grad_norm": 0.27640944266949646, + "learning_rate": 1.4058392985030488e-06, + "loss": 0.0074, + "step": 8302 + }, + { + "epoch": 3.7775250227479527, + "grad_norm": 0.3385462216465802, + "learning_rate": 1.4048458220930843e-06, + "loss": 0.004, + "step": 8303 + }, + { + "epoch": 3.777979981801638, + "grad_norm": 0.08786583811062756, + "learning_rate": 1.4038526394628854e-06, + "loss": 0.0008, + "step": 8304 + }, + { + "epoch": 3.7784349408553233, + "grad_norm": 0.40261208409509985, + "learning_rate": 1.4028597506936086e-06, + "loss": 0.0104, + "step": 8305 + }, + { + "epoch": 3.778889899909008, + "grad_norm": 0.24271452189170495, + "learning_rate": 1.4018671558663888e-06, + "loss": 0.004, + "step": 8306 + }, + { + "epoch": 3.7793448589626935, + "grad_norm": 0.1524327901050344, + "learning_rate": 1.4008748550623342e-06, + "loss": 0.0024, + "step": 8307 + }, + { + "epoch": 3.7797998180163788, + "grad_norm": 0.22063041266979924, + "learning_rate": 1.3998828483625343e-06, + "loss": 0.0025, + "step": 8308 + }, + { + "epoch": 3.7802547770700636, + "grad_norm": 0.14460719120690937, + "learning_rate": 1.3988911358480506e-06, + "loss": 0.0024, + "step": 8309 + }, + { + "epoch": 3.780709736123749, + "grad_norm": 0.10118189583718977, + "learning_rate": 1.3978997175999186e-06, + "loss": 0.0012, + "step": 8310 + }, + { + "epoch": 3.7811646951774343, + "grad_norm": 0.11685587624071721, + "learning_rate": 1.3969085936991567e-06, + "loss": 0.0014, + "step": 8311 + }, + { + "epoch": 3.781619654231119, + "grad_norm": 0.20108530234702668, + "learning_rate": 1.3959177642267513e-06, + "loss": 0.0033, + "step": 8312 + }, + { + "epoch": 3.7820746132848044, + "grad_norm": 0.10795246525356737, + "learning_rate": 1.3949272292636722e-06, + "loss": 0.0011, + "step": 8313 + }, + { + "epoch": 3.7825295723384897, + "grad_norm": 0.26094487332367433, + "learning_rate": 1.3939369888908593e-06, + "loss": 0.0043, + "step": 8314 + }, + { + "epoch": 3.7829845313921746, + "grad_norm": 0.24836976639729275, + "learning_rate": 1.392947043189231e-06, + "loss": 0.0022, + "step": 8315 + }, + { + "epoch": 3.78343949044586, + "grad_norm": 0.15770979557828438, + "learning_rate": 1.3919573922396796e-06, + "loss": 0.0011, + "step": 8316 + }, + { + "epoch": 3.7838944494995452, + "grad_norm": 0.3270618825843036, + "learning_rate": 1.390968036123076e-06, + "loss": 0.0048, + "step": 8317 + }, + { + "epoch": 3.78434940855323, + "grad_norm": 0.24141696524179324, + "learning_rate": 1.3899789749202674e-06, + "loss": 0.0025, + "step": 8318 + }, + { + "epoch": 3.7848043676069154, + "grad_norm": 0.28267758275750954, + "learning_rate": 1.388990208712075e-06, + "loss": 0.0044, + "step": 8319 + }, + { + "epoch": 3.7852593266606007, + "grad_norm": 0.1399486288759601, + "learning_rate": 1.3880017375792953e-06, + "loss": 0.0007, + "step": 8320 + }, + { + "epoch": 3.7857142857142856, + "grad_norm": 0.22842964329284515, + "learning_rate": 1.3870135616027003e-06, + "loss": 0.0016, + "step": 8321 + }, + { + "epoch": 3.786169244767971, + "grad_norm": 0.35524751768780877, + "learning_rate": 1.3860256808630429e-06, + "loss": 0.0036, + "step": 8322 + }, + { + "epoch": 3.786624203821656, + "grad_norm": 0.3221021611325404, + "learning_rate": 1.385038095441046e-06, + "loss": 0.0019, + "step": 8323 + }, + { + "epoch": 3.787079162875341, + "grad_norm": 0.2024406933552888, + "learning_rate": 1.3840508054174095e-06, + "loss": 0.0016, + "step": 8324 + }, + { + "epoch": 3.7875341219290264, + "grad_norm": 0.20208271857860724, + "learning_rate": 1.3830638108728128e-06, + "loss": 0.0018, + "step": 8325 + }, + { + "epoch": 3.7879890809827117, + "grad_norm": 0.0751246125810503, + "learning_rate": 1.3820771118879067e-06, + "loss": 0.0005, + "step": 8326 + }, + { + "epoch": 3.7884440400363966, + "grad_norm": 0.2707006913872268, + "learning_rate": 1.3810907085433216e-06, + "loss": 0.0027, + "step": 8327 + }, + { + "epoch": 3.788898999090082, + "grad_norm": 0.5023881792721455, + "learning_rate": 1.3801046009196612e-06, + "loss": 0.0112, + "step": 8328 + }, + { + "epoch": 3.789353958143767, + "grad_norm": 0.13838993811751352, + "learning_rate": 1.3791187890975055e-06, + "loss": 0.0017, + "step": 8329 + }, + { + "epoch": 3.789808917197452, + "grad_norm": 0.23351490105407538, + "learning_rate": 1.3781332731574087e-06, + "loss": 0.0043, + "step": 8330 + }, + { + "epoch": 3.7902638762511374, + "grad_norm": 0.22232711243570125, + "learning_rate": 1.3771480531799054e-06, + "loss": 0.0059, + "step": 8331 + }, + { + "epoch": 3.7907188353048227, + "grad_norm": 0.30010517879141546, + "learning_rate": 1.3761631292455036e-06, + "loss": 0.0056, + "step": 8332 + }, + { + "epoch": 3.7911737943585075, + "grad_norm": 0.3390281605040097, + "learning_rate": 1.3751785014346853e-06, + "loss": 0.0043, + "step": 8333 + }, + { + "epoch": 3.791628753412193, + "grad_norm": 0.3379033481004833, + "learning_rate": 1.374194169827911e-06, + "loss": 0.0049, + "step": 8334 + }, + { + "epoch": 3.792083712465878, + "grad_norm": 0.15248759514781857, + "learning_rate": 1.3732101345056149e-06, + "loss": 0.0015, + "step": 8335 + }, + { + "epoch": 3.792538671519563, + "grad_norm": 0.3090673122352973, + "learning_rate": 1.3722263955482068e-06, + "loss": 0.0029, + "step": 8336 + }, + { + "epoch": 3.7929936305732483, + "grad_norm": 0.30032816807117646, + "learning_rate": 1.371242953036076e-06, + "loss": 0.0046, + "step": 8337 + }, + { + "epoch": 3.7934485896269337, + "grad_norm": 0.47663178964047737, + "learning_rate": 1.3702598070495826e-06, + "loss": 0.0093, + "step": 8338 + }, + { + "epoch": 3.7939035486806185, + "grad_norm": 0.17488689932511098, + "learning_rate": 1.3692769576690674e-06, + "loss": 0.0011, + "step": 8339 + }, + { + "epoch": 3.794358507734304, + "grad_norm": 0.22410253015377568, + "learning_rate": 1.3682944049748425e-06, + "loss": 0.0052, + "step": 8340 + }, + { + "epoch": 3.794813466787989, + "grad_norm": 0.1786421527765028, + "learning_rate": 1.3673121490471975e-06, + "loss": 0.0034, + "step": 8341 + }, + { + "epoch": 3.795268425841674, + "grad_norm": 0.2517136326053243, + "learning_rate": 1.3663301899663995e-06, + "loss": 0.0063, + "step": 8342 + }, + { + "epoch": 3.7957233848953593, + "grad_norm": 0.048507668053566086, + "learning_rate": 1.3653485278126894e-06, + "loss": 0.0003, + "step": 8343 + }, + { + "epoch": 3.7961783439490446, + "grad_norm": 0.20757721788522404, + "learning_rate": 1.364367162666283e-06, + "loss": 0.0035, + "step": 8344 + }, + { + "epoch": 3.7966333030027295, + "grad_norm": 0.2612240603399998, + "learning_rate": 1.363386094607373e-06, + "loss": 0.0041, + "step": 8345 + }, + { + "epoch": 3.797088262056415, + "grad_norm": 0.08271865212704826, + "learning_rate": 1.3624053237161278e-06, + "loss": 0.0011, + "step": 8346 + }, + { + "epoch": 3.7975432211101, + "grad_norm": 0.3715588667918637, + "learning_rate": 1.361424850072694e-06, + "loss": 0.0039, + "step": 8347 + }, + { + "epoch": 3.797998180163785, + "grad_norm": 0.20936961372647586, + "learning_rate": 1.3604446737571902e-06, + "loss": 0.0054, + "step": 8348 + }, + { + "epoch": 3.7984531392174703, + "grad_norm": 0.19051753243209604, + "learning_rate": 1.3594647948497113e-06, + "loss": 0.0037, + "step": 8349 + }, + { + "epoch": 3.7989080982711556, + "grad_norm": 0.28101577844919723, + "learning_rate": 1.3584852134303273e-06, + "loss": 0.005, + "step": 8350 + }, + { + "epoch": 3.799363057324841, + "grad_norm": 0.3966610896334549, + "learning_rate": 1.3575059295790882e-06, + "loss": 0.0132, + "step": 8351 + }, + { + "epoch": 3.799818016378526, + "grad_norm": 0.3114847369521246, + "learning_rate": 1.3565269433760137e-06, + "loss": 0.0106, + "step": 8352 + }, + { + "epoch": 3.800272975432211, + "grad_norm": 0.18758962670137064, + "learning_rate": 1.355548254901105e-06, + "loss": 0.002, + "step": 8353 + }, + { + "epoch": 3.8007279344858964, + "grad_norm": 0.24626540682506912, + "learning_rate": 1.3545698642343351e-06, + "loss": 0.0033, + "step": 8354 + }, + { + "epoch": 3.8011828935395813, + "grad_norm": 0.07727662074760829, + "learning_rate": 1.3535917714556512e-06, + "loss": 0.0006, + "step": 8355 + }, + { + "epoch": 3.8016378525932666, + "grad_norm": 0.3433344176285079, + "learning_rate": 1.352613976644983e-06, + "loss": 0.005, + "step": 8356 + }, + { + "epoch": 3.802092811646952, + "grad_norm": 0.1406963305448382, + "learning_rate": 1.3516364798822284e-06, + "loss": 0.0013, + "step": 8357 + }, + { + "epoch": 3.802547770700637, + "grad_norm": 0.15787679477952846, + "learning_rate": 1.3506592812472653e-06, + "loss": 0.0018, + "step": 8358 + }, + { + "epoch": 3.803002729754322, + "grad_norm": 0.12541486370887497, + "learning_rate": 1.3496823808199438e-06, + "loss": 0.0016, + "step": 8359 + }, + { + "epoch": 3.8034576888080074, + "grad_norm": 0.2008632031837807, + "learning_rate": 1.3487057786800932e-06, + "loss": 0.0031, + "step": 8360 + }, + { + "epoch": 3.8039126478616927, + "grad_norm": 0.2754841460669745, + "learning_rate": 1.3477294749075194e-06, + "loss": 0.0078, + "step": 8361 + }, + { + "epoch": 3.8043676069153776, + "grad_norm": 0.22427736264793874, + "learning_rate": 1.346753469581999e-06, + "loss": 0.0046, + "step": 8362 + }, + { + "epoch": 3.804822565969063, + "grad_norm": 0.2092053635397987, + "learning_rate": 1.3457777627832868e-06, + "loss": 0.0028, + "step": 8363 + }, + { + "epoch": 3.805277525022748, + "grad_norm": 0.17228733251915468, + "learning_rate": 1.3448023545911126e-06, + "loss": 0.001, + "step": 8364 + }, + { + "epoch": 3.805732484076433, + "grad_norm": 0.1674441467315862, + "learning_rate": 1.3438272450851846e-06, + "loss": 0.0016, + "step": 8365 + }, + { + "epoch": 3.8061874431301184, + "grad_norm": 0.12685419369546294, + "learning_rate": 1.3428524343451809e-06, + "loss": 0.0007, + "step": 8366 + }, + { + "epoch": 3.8066424021838037, + "grad_norm": 0.20988870320387673, + "learning_rate": 1.3418779224507634e-06, + "loss": 0.0016, + "step": 8367 + }, + { + "epoch": 3.8070973612374885, + "grad_norm": 0.08262350636139937, + "learning_rate": 1.3409037094815613e-06, + "loss": 0.0006, + "step": 8368 + }, + { + "epoch": 3.807552320291174, + "grad_norm": 0.17581154904944118, + "learning_rate": 1.3399297955171825e-06, + "loss": 0.0019, + "step": 8369 + }, + { + "epoch": 3.808007279344859, + "grad_norm": 0.16225362274054209, + "learning_rate": 1.338956180637213e-06, + "loss": 0.0021, + "step": 8370 + }, + { + "epoch": 3.808462238398544, + "grad_norm": 0.2001268120216354, + "learning_rate": 1.3379828649212123e-06, + "loss": 0.0035, + "step": 8371 + }, + { + "epoch": 3.8089171974522293, + "grad_norm": 0.33097145827180485, + "learning_rate": 1.3370098484487138e-06, + "loss": 0.0079, + "step": 8372 + }, + { + "epoch": 3.8093721565059147, + "grad_norm": 0.2258307707906472, + "learning_rate": 1.336037131299227e-06, + "loss": 0.0052, + "step": 8373 + }, + { + "epoch": 3.8098271155595995, + "grad_norm": 0.3399619825879456, + "learning_rate": 1.335064713552241e-06, + "loss": 0.0098, + "step": 8374 + }, + { + "epoch": 3.810282074613285, + "grad_norm": 0.09768137874619251, + "learning_rate": 1.3340925952872147e-06, + "loss": 0.0007, + "step": 8375 + }, + { + "epoch": 3.81073703366697, + "grad_norm": 0.08380580073280516, + "learning_rate": 1.3331207765835875e-06, + "loss": 0.0009, + "step": 8376 + }, + { + "epoch": 3.811191992720655, + "grad_norm": 0.05436590886491927, + "learning_rate": 1.332149257520771e-06, + "loss": 0.0005, + "step": 8377 + }, + { + "epoch": 3.8116469517743403, + "grad_norm": 0.12454516050396389, + "learning_rate": 1.3311780381781537e-06, + "loss": 0.0021, + "step": 8378 + }, + { + "epoch": 3.8121019108280256, + "grad_norm": 0.4294764678194112, + "learning_rate": 1.3302071186350972e-06, + "loss": 0.0053, + "step": 8379 + }, + { + "epoch": 3.8125568698817105, + "grad_norm": 0.20353694606732922, + "learning_rate": 1.3292364989709422e-06, + "loss": 0.0039, + "step": 8380 + }, + { + "epoch": 3.813011828935396, + "grad_norm": 0.25911207089557037, + "learning_rate": 1.3282661792650054e-06, + "loss": 0.0025, + "step": 8381 + }, + { + "epoch": 3.813466787989081, + "grad_norm": 0.18024098799730776, + "learning_rate": 1.3272961595965743e-06, + "loss": 0.0027, + "step": 8382 + }, + { + "epoch": 3.813921747042766, + "grad_norm": 0.18652213195706055, + "learning_rate": 1.3263264400449161e-06, + "loss": 0.0014, + "step": 8383 + }, + { + "epoch": 3.8143767060964513, + "grad_norm": 0.4040370739991496, + "learning_rate": 1.325357020689269e-06, + "loss": 0.0077, + "step": 8384 + }, + { + "epoch": 3.8148316651501366, + "grad_norm": 0.09313504076322941, + "learning_rate": 1.3243879016088534e-06, + "loss": 0.0007, + "step": 8385 + }, + { + "epoch": 3.8152866242038215, + "grad_norm": 0.23092393538599115, + "learning_rate": 1.3234190828828591e-06, + "loss": 0.0036, + "step": 8386 + }, + { + "epoch": 3.815741583257507, + "grad_norm": 0.2571674933170574, + "learning_rate": 1.3224505645904534e-06, + "loss": 0.0035, + "step": 8387 + }, + { + "epoch": 3.816196542311192, + "grad_norm": 0.45823777636308516, + "learning_rate": 1.3214823468107807e-06, + "loss": 0.0106, + "step": 8388 + }, + { + "epoch": 3.816651501364877, + "grad_norm": 0.19575203408620434, + "learning_rate": 1.3205144296229572e-06, + "loss": 0.0025, + "step": 8389 + }, + { + "epoch": 3.8171064604185623, + "grad_norm": 0.14813728090937492, + "learning_rate": 1.3195468131060796e-06, + "loss": 0.0048, + "step": 8390 + }, + { + "epoch": 3.8175614194722476, + "grad_norm": 0.460510606669038, + "learning_rate": 1.3185794973392158e-06, + "loss": 0.0153, + "step": 8391 + }, + { + "epoch": 3.8180163785259325, + "grad_norm": 0.0692284420265699, + "learning_rate": 1.3176124824014102e-06, + "loss": 0.0007, + "step": 8392 + }, + { + "epoch": 3.8184713375796178, + "grad_norm": 0.10343261170357118, + "learning_rate": 1.3166457683716815e-06, + "loss": 0.0007, + "step": 8393 + }, + { + "epoch": 3.818926296633303, + "grad_norm": 0.2958141511157044, + "learning_rate": 1.3156793553290271e-06, + "loss": 0.0056, + "step": 8394 + }, + { + "epoch": 3.819381255686988, + "grad_norm": 0.1438216881899908, + "learning_rate": 1.3147132433524184e-06, + "loss": 0.0025, + "step": 8395 + }, + { + "epoch": 3.8198362147406733, + "grad_norm": 0.10786947358240591, + "learning_rate": 1.313747432520801e-06, + "loss": 0.0009, + "step": 8396 + }, + { + "epoch": 3.8202911737943586, + "grad_norm": 0.25833041543913837, + "learning_rate": 1.3127819229130967e-06, + "loss": 0.006, + "step": 8397 + }, + { + "epoch": 3.8207461328480434, + "grad_norm": 0.1456879020024286, + "learning_rate": 1.3118167146082005e-06, + "loss": 0.0018, + "step": 8398 + }, + { + "epoch": 3.8212010919017287, + "grad_norm": 0.1095285919176108, + "learning_rate": 1.3108518076849886e-06, + "loss": 0.0017, + "step": 8399 + }, + { + "epoch": 3.821656050955414, + "grad_norm": 0.2114624496544202, + "learning_rate": 1.3098872022223069e-06, + "loss": 0.0031, + "step": 8400 + }, + { + "epoch": 3.822111010009099, + "grad_norm": 0.4166027951841225, + "learning_rate": 1.3089228982989771e-06, + "loss": 0.0077, + "step": 8401 + }, + { + "epoch": 3.8225659690627842, + "grad_norm": 0.1341179200124046, + "learning_rate": 1.3079588959938006e-06, + "loss": 0.0012, + "step": 8402 + }, + { + "epoch": 3.8230209281164695, + "grad_norm": 0.2815178784042401, + "learning_rate": 1.3069951953855486e-06, + "loss": 0.0018, + "step": 8403 + }, + { + "epoch": 3.823475887170155, + "grad_norm": 0.3038368919220553, + "learning_rate": 1.3060317965529734e-06, + "loss": 0.004, + "step": 8404 + }, + { + "epoch": 3.8239308462238397, + "grad_norm": 0.12260925707408848, + "learning_rate": 1.305068699574798e-06, + "loss": 0.0015, + "step": 8405 + }, + { + "epoch": 3.824385805277525, + "grad_norm": 0.2630754045078848, + "learning_rate": 1.3041059045297217e-06, + "loss": 0.0016, + "step": 8406 + }, + { + "epoch": 3.8248407643312103, + "grad_norm": 0.41332693376846336, + "learning_rate": 1.303143411496421e-06, + "loss": 0.0051, + "step": 8407 + }, + { + "epoch": 3.825295723384895, + "grad_norm": 0.1368726520759684, + "learning_rate": 1.3021812205535444e-06, + "loss": 0.0007, + "step": 8408 + }, + { + "epoch": 3.8257506824385805, + "grad_norm": 0.09424341000898812, + "learning_rate": 1.3012193317797189e-06, + "loss": 0.001, + "step": 8409 + }, + { + "epoch": 3.826205641492266, + "grad_norm": 0.06180291258421272, + "learning_rate": 1.3002577452535475e-06, + "loss": 0.0004, + "step": 8410 + }, + { + "epoch": 3.826660600545951, + "grad_norm": 0.3137531774726185, + "learning_rate": 1.2992964610536057e-06, + "loss": 0.0044, + "step": 8411 + }, + { + "epoch": 3.827115559599636, + "grad_norm": 0.19275950127678917, + "learning_rate": 1.2983354792584446e-06, + "loss": 0.0035, + "step": 8412 + }, + { + "epoch": 3.8275705186533213, + "grad_norm": 0.3541223724062894, + "learning_rate": 1.2973747999465903e-06, + "loss": 0.0038, + "step": 8413 + }, + { + "epoch": 3.8280254777070066, + "grad_norm": 0.07837287848896765, + "learning_rate": 1.2964144231965475e-06, + "loss": 0.0011, + "step": 8414 + }, + { + "epoch": 3.8284804367606915, + "grad_norm": 0.2019202511765073, + "learning_rate": 1.2954543490867917e-06, + "loss": 0.0025, + "step": 8415 + }, + { + "epoch": 3.828935395814377, + "grad_norm": 0.34803643060160927, + "learning_rate": 1.2944945776957778e-06, + "loss": 0.0035, + "step": 8416 + }, + { + "epoch": 3.829390354868062, + "grad_norm": 0.254793766134315, + "learning_rate": 1.2935351091019338e-06, + "loss": 0.0027, + "step": 8417 + }, + { + "epoch": 3.829845313921747, + "grad_norm": 0.08242862581297755, + "learning_rate": 1.2925759433836604e-06, + "loss": 0.001, + "step": 8418 + }, + { + "epoch": 3.8303002729754323, + "grad_norm": 0.2402643718597356, + "learning_rate": 1.29161708061934e-06, + "loss": 0.0061, + "step": 8419 + }, + { + "epoch": 3.8307552320291176, + "grad_norm": 0.31272566727538187, + "learning_rate": 1.2906585208873251e-06, + "loss": 0.0077, + "step": 8420 + }, + { + "epoch": 3.8312101910828025, + "grad_norm": 0.10375993159218067, + "learning_rate": 1.2897002642659444e-06, + "loss": 0.0009, + "step": 8421 + }, + { + "epoch": 3.831665150136488, + "grad_norm": 0.1822856543304169, + "learning_rate": 1.2887423108335012e-06, + "loss": 0.0029, + "step": 8422 + }, + { + "epoch": 3.832120109190173, + "grad_norm": 0.05261446379952179, + "learning_rate": 1.2877846606682764e-06, + "loss": 0.0003, + "step": 8423 + }, + { + "epoch": 3.832575068243858, + "grad_norm": 0.2633675845918805, + "learning_rate": 1.2868273138485265e-06, + "loss": 0.0069, + "step": 8424 + }, + { + "epoch": 3.8330300272975433, + "grad_norm": 0.38249460070364494, + "learning_rate": 1.2858702704524801e-06, + "loss": 0.0045, + "step": 8425 + }, + { + "epoch": 3.8334849863512286, + "grad_norm": 0.2211986689376906, + "learning_rate": 1.284913530558342e-06, + "loss": 0.0058, + "step": 8426 + }, + { + "epoch": 3.8339399454049135, + "grad_norm": 0.04571734758670077, + "learning_rate": 1.2839570942442918e-06, + "loss": 0.0003, + "step": 8427 + }, + { + "epoch": 3.8343949044585988, + "grad_norm": 0.26529399465953435, + "learning_rate": 1.2830009615884876e-06, + "loss": 0.0057, + "step": 8428 + }, + { + "epoch": 3.834849863512284, + "grad_norm": 0.14497405512497816, + "learning_rate": 1.2820451326690576e-06, + "loss": 0.002, + "step": 8429 + }, + { + "epoch": 3.835304822565969, + "grad_norm": 0.12265915889052215, + "learning_rate": 1.2810896075641106e-06, + "loss": 0.0016, + "step": 8430 + }, + { + "epoch": 3.8357597816196543, + "grad_norm": 0.2923179320068547, + "learning_rate": 1.2801343863517269e-06, + "loss": 0.0051, + "step": 8431 + }, + { + "epoch": 3.8362147406733396, + "grad_norm": 0.3477059185150881, + "learning_rate": 1.2791794691099603e-06, + "loss": 0.0097, + "step": 8432 + }, + { + "epoch": 3.8366696997270244, + "grad_norm": 0.6442459221866703, + "learning_rate": 1.2782248559168458e-06, + "loss": 0.0088, + "step": 8433 + }, + { + "epoch": 3.8371246587807097, + "grad_norm": 0.4303015594796121, + "learning_rate": 1.2772705468503892e-06, + "loss": 0.0042, + "step": 8434 + }, + { + "epoch": 3.837579617834395, + "grad_norm": 0.3089108860974287, + "learning_rate": 1.2763165419885714e-06, + "loss": 0.0053, + "step": 8435 + }, + { + "epoch": 3.83803457688808, + "grad_norm": 0.13206227245372856, + "learning_rate": 1.2753628414093489e-06, + "loss": 0.0007, + "step": 8436 + }, + { + "epoch": 3.8384895359417652, + "grad_norm": 0.1514964065398938, + "learning_rate": 1.274409445190654e-06, + "loss": 0.0016, + "step": 8437 + }, + { + "epoch": 3.8389444949954505, + "grad_norm": 0.18536084132664352, + "learning_rate": 1.2734563534103967e-06, + "loss": 0.0015, + "step": 8438 + }, + { + "epoch": 3.8393994540491354, + "grad_norm": 0.2714714388232563, + "learning_rate": 1.2725035661464568e-06, + "loss": 0.0053, + "step": 8439 + }, + { + "epoch": 3.8398544131028207, + "grad_norm": 0.22143921333332356, + "learning_rate": 1.2715510834766925e-06, + "loss": 0.0035, + "step": 8440 + }, + { + "epoch": 3.840309372156506, + "grad_norm": 0.6099050324716289, + "learning_rate": 1.2705989054789358e-06, + "loss": 0.0103, + "step": 8441 + }, + { + "epoch": 3.840764331210191, + "grad_norm": 0.07760202526070378, + "learning_rate": 1.269647032230994e-06, + "loss": 0.0008, + "step": 8442 + }, + { + "epoch": 3.841219290263876, + "grad_norm": 0.14988794337877911, + "learning_rate": 1.2686954638106497e-06, + "loss": 0.0027, + "step": 8443 + }, + { + "epoch": 3.8416742493175615, + "grad_norm": 0.09512279442014934, + "learning_rate": 1.2677442002956636e-06, + "loss": 0.0009, + "step": 8444 + }, + { + "epoch": 3.8421292083712464, + "grad_norm": 0.21960955604778923, + "learning_rate": 1.2667932417637669e-06, + "loss": 0.0045, + "step": 8445 + }, + { + "epoch": 3.8425841674249317, + "grad_norm": 0.049301179279957294, + "learning_rate": 1.2658425882926672e-06, + "loss": 0.0004, + "step": 8446 + }, + { + "epoch": 3.843039126478617, + "grad_norm": 0.18069259601796572, + "learning_rate": 1.2648922399600467e-06, + "loss": 0.0022, + "step": 8447 + }, + { + "epoch": 3.843494085532302, + "grad_norm": 0.19142766700652458, + "learning_rate": 1.2639421968435655e-06, + "loss": 0.003, + "step": 8448 + }, + { + "epoch": 3.843949044585987, + "grad_norm": 0.1523308102062212, + "learning_rate": 1.262992459020857e-06, + "loss": 0.0016, + "step": 8449 + }, + { + "epoch": 3.8444040036396725, + "grad_norm": 0.2619310242968084, + "learning_rate": 1.2620430265695267e-06, + "loss": 0.0061, + "step": 8450 + }, + { + "epoch": 3.8448589626933574, + "grad_norm": 0.28502517658670345, + "learning_rate": 1.2610938995671606e-06, + "loss": 0.0039, + "step": 8451 + }, + { + "epoch": 3.8453139217470427, + "grad_norm": 0.12442702407848295, + "learning_rate": 1.2601450780913161e-06, + "loss": 0.0015, + "step": 8452 + }, + { + "epoch": 3.845768880800728, + "grad_norm": 0.24018405930013872, + "learning_rate": 1.2591965622195274e-06, + "loss": 0.0046, + "step": 8453 + }, + { + "epoch": 3.846223839854413, + "grad_norm": 0.08288531940869734, + "learning_rate": 1.2582483520293026e-06, + "loss": 0.0006, + "step": 8454 + }, + { + "epoch": 3.846678798908098, + "grad_norm": 0.2280538494024286, + "learning_rate": 1.2573004475981243e-06, + "loss": 0.0026, + "step": 8455 + }, + { + "epoch": 3.8471337579617835, + "grad_norm": 0.12883334774563363, + "learning_rate": 1.256352849003451e-06, + "loss": 0.0013, + "step": 8456 + }, + { + "epoch": 3.8475887170154683, + "grad_norm": 0.22662915829767713, + "learning_rate": 1.2554055563227163e-06, + "loss": 0.0022, + "step": 8457 + }, + { + "epoch": 3.8480436760691537, + "grad_norm": 0.27566415303598973, + "learning_rate": 1.2544585696333305e-06, + "loss": 0.0041, + "step": 8458 + }, + { + "epoch": 3.848498635122839, + "grad_norm": 0.28085231845099684, + "learning_rate": 1.2535118890126758e-06, + "loss": 0.0023, + "step": 8459 + }, + { + "epoch": 3.8489535941765243, + "grad_norm": 0.2588290618037598, + "learning_rate": 1.2525655145381104e-06, + "loss": 0.0046, + "step": 8460 + }, + { + "epoch": 3.849408553230209, + "grad_norm": 0.03194820568526538, + "learning_rate": 1.2516194462869663e-06, + "loss": 0.0003, + "step": 8461 + }, + { + "epoch": 3.8498635122838945, + "grad_norm": 0.1768107842451328, + "learning_rate": 1.2506736843365552e-06, + "loss": 0.0039, + "step": 8462 + }, + { + "epoch": 3.8503184713375798, + "grad_norm": 0.3924196110233749, + "learning_rate": 1.2497282287641588e-06, + "loss": 0.0053, + "step": 8463 + }, + { + "epoch": 3.8507734303912646, + "grad_norm": 0.5109022811396491, + "learning_rate": 1.248783079647034e-06, + "loss": 0.0072, + "step": 8464 + }, + { + "epoch": 3.85122838944495, + "grad_norm": 0.1933094018038453, + "learning_rate": 1.247838237062417e-06, + "loss": 0.0018, + "step": 8465 + }, + { + "epoch": 3.8516833484986353, + "grad_norm": 0.4144346490474305, + "learning_rate": 1.2468937010875131e-06, + "loss": 0.0085, + "step": 8466 + }, + { + "epoch": 3.8521383075523206, + "grad_norm": 0.1355727849874782, + "learning_rate": 1.2459494717995085e-06, + "loss": 0.0012, + "step": 8467 + }, + { + "epoch": 3.8525932666060054, + "grad_norm": 0.16928611993835407, + "learning_rate": 1.2450055492755602e-06, + "loss": 0.0029, + "step": 8468 + }, + { + "epoch": 3.8530482256596907, + "grad_norm": 0.40246200344720284, + "learning_rate": 1.244061933592801e-06, + "loss": 0.0041, + "step": 8469 + }, + { + "epoch": 3.853503184713376, + "grad_norm": 0.13060140981445278, + "learning_rate": 1.2431186248283373e-06, + "loss": 0.0008, + "step": 8470 + }, + { + "epoch": 3.853958143767061, + "grad_norm": 0.2299546387610646, + "learning_rate": 1.2421756230592535e-06, + "loss": 0.002, + "step": 8471 + }, + { + "epoch": 3.8544131028207462, + "grad_norm": 0.1902407967740717, + "learning_rate": 1.2412329283626096e-06, + "loss": 0.0021, + "step": 8472 + }, + { + "epoch": 3.8548680618744315, + "grad_norm": 0.3665417651145956, + "learning_rate": 1.2402905408154359e-06, + "loss": 0.0033, + "step": 8473 + }, + { + "epoch": 3.8553230209281164, + "grad_norm": 0.20410041093409834, + "learning_rate": 1.2393484604947403e-06, + "loss": 0.0028, + "step": 8474 + }, + { + "epoch": 3.8557779799818017, + "grad_norm": 0.16998252128460117, + "learning_rate": 1.2384066874775047e-06, + "loss": 0.0029, + "step": 8475 + }, + { + "epoch": 3.856232939035487, + "grad_norm": 0.254858957722109, + "learning_rate": 1.2374652218406884e-06, + "loss": 0.0041, + "step": 8476 + }, + { + "epoch": 3.856687898089172, + "grad_norm": 0.16426591782009645, + "learning_rate": 1.236524063661223e-06, + "loss": 0.0013, + "step": 8477 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.2147436072461515, + "learning_rate": 1.2355832130160134e-06, + "loss": 0.0017, + "step": 8478 + }, + { + "epoch": 3.8575978161965425, + "grad_norm": 0.12819228530685667, + "learning_rate": 1.234642669981946e-06, + "loss": 0.0014, + "step": 8479 + }, + { + "epoch": 3.8580527752502274, + "grad_norm": 0.17063038965755264, + "learning_rate": 1.2337024346358745e-06, + "loss": 0.0015, + "step": 8480 + }, + { + "epoch": 3.8585077343039127, + "grad_norm": 0.07538577281816423, + "learning_rate": 1.2327625070546306e-06, + "loss": 0.0006, + "step": 8481 + }, + { + "epoch": 3.858962693357598, + "grad_norm": 0.21829555406319018, + "learning_rate": 1.2318228873150233e-06, + "loss": 0.0014, + "step": 8482 + }, + { + "epoch": 3.859417652411283, + "grad_norm": 0.8538086274154935, + "learning_rate": 1.230883575493833e-06, + "loss": 0.0204, + "step": 8483 + }, + { + "epoch": 3.859872611464968, + "grad_norm": 0.2527606116905976, + "learning_rate": 1.2299445716678155e-06, + "loss": 0.0056, + "step": 8484 + }, + { + "epoch": 3.8603275705186535, + "grad_norm": 0.23098909363938505, + "learning_rate": 1.2290058759137008e-06, + "loss": 0.0015, + "step": 8485 + }, + { + "epoch": 3.8607825295723384, + "grad_norm": 0.47281148816795787, + "learning_rate": 1.228067488308196e-06, + "loss": 0.0082, + "step": 8486 + }, + { + "epoch": 3.8612374886260237, + "grad_norm": 0.2939325031824273, + "learning_rate": 1.227129408927984e-06, + "loss": 0.0038, + "step": 8487 + }, + { + "epoch": 3.861692447679709, + "grad_norm": 0.22152083230825947, + "learning_rate": 1.2261916378497185e-06, + "loss": 0.0035, + "step": 8488 + }, + { + "epoch": 3.862147406733394, + "grad_norm": 0.13073537912552963, + "learning_rate": 1.2252541751500297e-06, + "loss": 0.0018, + "step": 8489 + }, + { + "epoch": 3.862602365787079, + "grad_norm": 0.25251644999039125, + "learning_rate": 1.2243170209055216e-06, + "loss": 0.003, + "step": 8490 + }, + { + "epoch": 3.8630573248407645, + "grad_norm": 0.3489997487440649, + "learning_rate": 1.2233801751927777e-06, + "loss": 0.007, + "step": 8491 + }, + { + "epoch": 3.8635122838944493, + "grad_norm": 0.24501311776491316, + "learning_rate": 1.2224436380883492e-06, + "loss": 0.0072, + "step": 8492 + }, + { + "epoch": 3.8639672429481347, + "grad_norm": 0.32105220703151655, + "learning_rate": 1.2215074096687685e-06, + "loss": 0.0099, + "step": 8493 + }, + { + "epoch": 3.86442220200182, + "grad_norm": 0.2453375811247342, + "learning_rate": 1.2205714900105387e-06, + "loss": 0.0081, + "step": 8494 + }, + { + "epoch": 3.864877161055505, + "grad_norm": 0.14215486028298338, + "learning_rate": 1.219635879190138e-06, + "loss": 0.002, + "step": 8495 + }, + { + "epoch": 3.86533212010919, + "grad_norm": 0.1657498998523969, + "learning_rate": 1.2187005772840222e-06, + "loss": 0.0022, + "step": 8496 + }, + { + "epoch": 3.8657870791628755, + "grad_norm": 0.3151440894059005, + "learning_rate": 1.2177655843686193e-06, + "loss": 0.0066, + "step": 8497 + }, + { + "epoch": 3.8662420382165603, + "grad_norm": 0.4361697324762576, + "learning_rate": 1.2168309005203321e-06, + "loss": 0.0035, + "step": 8498 + }, + { + "epoch": 3.8666969972702456, + "grad_norm": 0.18600029231156295, + "learning_rate": 1.215896525815538e-06, + "loss": 0.0028, + "step": 8499 + }, + { + "epoch": 3.867151956323931, + "grad_norm": 0.32574708560126603, + "learning_rate": 1.214962460330591e-06, + "loss": 0.0079, + "step": 8500 + }, + { + "epoch": 3.867606915377616, + "grad_norm": 0.4790888601671271, + "learning_rate": 1.2140287041418203e-06, + "loss": 0.0063, + "step": 8501 + }, + { + "epoch": 3.868061874431301, + "grad_norm": 0.2894038847868456, + "learning_rate": 1.2130952573255261e-06, + "loss": 0.0049, + "step": 8502 + }, + { + "epoch": 3.8685168334849864, + "grad_norm": 0.2575133700033376, + "learning_rate": 1.212162119957986e-06, + "loss": 0.0042, + "step": 8503 + }, + { + "epoch": 3.8689717925386713, + "grad_norm": 0.16055335960243933, + "learning_rate": 1.2112292921154507e-06, + "loss": 0.0015, + "step": 8504 + }, + { + "epoch": 3.8694267515923566, + "grad_norm": 0.5268595075573901, + "learning_rate": 1.210296773874149e-06, + "loss": 0.0055, + "step": 8505 + }, + { + "epoch": 3.869881710646042, + "grad_norm": 0.14380315121523776, + "learning_rate": 1.2093645653102787e-06, + "loss": 0.0015, + "step": 8506 + }, + { + "epoch": 3.870336669699727, + "grad_norm": 0.3030518155517926, + "learning_rate": 1.2084326665000201e-06, + "loss": 0.0039, + "step": 8507 + }, + { + "epoch": 3.870791628753412, + "grad_norm": 0.3005729703259829, + "learning_rate": 1.2075010775195205e-06, + "loss": 0.0047, + "step": 8508 + }, + { + "epoch": 3.8712465878070974, + "grad_norm": 0.19318436861989613, + "learning_rate": 1.2065697984449055e-06, + "loss": 0.0039, + "step": 8509 + }, + { + "epoch": 3.8717015468607823, + "grad_norm": 0.22623508334349976, + "learning_rate": 1.2056388293522768e-06, + "loss": 0.004, + "step": 8510 + }, + { + "epoch": 3.8721565059144676, + "grad_norm": 0.2713342001914328, + "learning_rate": 1.2047081703177077e-06, + "loss": 0.0025, + "step": 8511 + }, + { + "epoch": 3.872611464968153, + "grad_norm": 0.13520350593071404, + "learning_rate": 1.2037778214172475e-06, + "loss": 0.001, + "step": 8512 + }, + { + "epoch": 3.8730664240218378, + "grad_norm": 0.05526102019123737, + "learning_rate": 1.2028477827269186e-06, + "loss": 0.0006, + "step": 8513 + }, + { + "epoch": 3.873521383075523, + "grad_norm": 0.21448112011511056, + "learning_rate": 1.2019180543227216e-06, + "loss": 0.0045, + "step": 8514 + }, + { + "epoch": 3.8739763421292084, + "grad_norm": 0.3353278890425571, + "learning_rate": 1.20098863628063e-06, + "loss": 0.0073, + "step": 8515 + }, + { + "epoch": 3.8744313011828937, + "grad_norm": 0.3159457572771494, + "learning_rate": 1.2000595286765914e-06, + "loss": 0.0081, + "step": 8516 + }, + { + "epoch": 3.8748862602365786, + "grad_norm": 0.09978703021591674, + "learning_rate": 1.1991307315865274e-06, + "loss": 0.001, + "step": 8517 + }, + { + "epoch": 3.875341219290264, + "grad_norm": 0.2664293463872522, + "learning_rate": 1.1982022450863358e-06, + "loss": 0.0054, + "step": 8518 + }, + { + "epoch": 3.875796178343949, + "grad_norm": 0.23266523081990464, + "learning_rate": 1.1972740692518858e-06, + "loss": 0.0059, + "step": 8519 + }, + { + "epoch": 3.876251137397634, + "grad_norm": 0.10827081850758029, + "learning_rate": 1.1963462041590262e-06, + "loss": 0.0012, + "step": 8520 + }, + { + "epoch": 3.8767060964513194, + "grad_norm": 0.15410971891546507, + "learning_rate": 1.1954186498835797e-06, + "loss": 0.0024, + "step": 8521 + }, + { + "epoch": 3.8771610555050047, + "grad_norm": 0.3021095692725338, + "learning_rate": 1.194491406501339e-06, + "loss": 0.0035, + "step": 8522 + }, + { + "epoch": 3.87761601455869, + "grad_norm": 0.045179863353742786, + "learning_rate": 1.193564474088076e-06, + "loss": 0.0003, + "step": 8523 + }, + { + "epoch": 3.878070973612375, + "grad_norm": 0.3901003588986076, + "learning_rate": 1.192637852719532e-06, + "loss": 0.0107, + "step": 8524 + }, + { + "epoch": 3.87852593266606, + "grad_norm": 0.24040628801795585, + "learning_rate": 1.1917115424714305e-06, + "loss": 0.0021, + "step": 8525 + }, + { + "epoch": 3.8789808917197455, + "grad_norm": 0.29613167821824377, + "learning_rate": 1.1907855434194637e-06, + "loss": 0.0056, + "step": 8526 + }, + { + "epoch": 3.8794358507734303, + "grad_norm": 0.25052492781274743, + "learning_rate": 1.1898598556392987e-06, + "loss": 0.0038, + "step": 8527 + }, + { + "epoch": 3.8798908098271156, + "grad_norm": 0.44246555835890966, + "learning_rate": 1.1889344792065816e-06, + "loss": 0.0066, + "step": 8528 + }, + { + "epoch": 3.880345768880801, + "grad_norm": 0.2918796780310605, + "learning_rate": 1.1880094141969262e-06, + "loss": 0.0061, + "step": 8529 + }, + { + "epoch": 3.880800727934486, + "grad_norm": 0.14354874224811812, + "learning_rate": 1.1870846606859288e-06, + "loss": 0.0009, + "step": 8530 + }, + { + "epoch": 3.881255686988171, + "grad_norm": 0.17765159955535334, + "learning_rate": 1.1861602187491533e-06, + "loss": 0.0021, + "step": 8531 + }, + { + "epoch": 3.8817106460418564, + "grad_norm": 0.3821340989360063, + "learning_rate": 1.1852360884621417e-06, + "loss": 0.0093, + "step": 8532 + }, + { + "epoch": 3.8821656050955413, + "grad_norm": 0.1782471975402449, + "learning_rate": 1.1843122699004083e-06, + "loss": 0.0028, + "step": 8533 + }, + { + "epoch": 3.8826205641492266, + "grad_norm": 0.17999876619669206, + "learning_rate": 1.1833887631394447e-06, + "loss": 0.0027, + "step": 8534 + }, + { + "epoch": 3.883075523202912, + "grad_norm": 0.14798802286978838, + "learning_rate": 1.1824655682547176e-06, + "loss": 0.0013, + "step": 8535 + }, + { + "epoch": 3.883530482256597, + "grad_norm": 0.13713911919604432, + "learning_rate": 1.181542685321664e-06, + "loss": 0.0026, + "step": 8536 + }, + { + "epoch": 3.883985441310282, + "grad_norm": 0.28753625801362137, + "learning_rate": 1.180620114415698e-06, + "loss": 0.0065, + "step": 8537 + }, + { + "epoch": 3.8844404003639674, + "grad_norm": 0.24528139972882526, + "learning_rate": 1.1796978556122069e-06, + "loss": 0.0057, + "step": 8538 + }, + { + "epoch": 3.8848953594176523, + "grad_norm": 0.2208511421476422, + "learning_rate": 1.178775908986556e-06, + "loss": 0.0026, + "step": 8539 + }, + { + "epoch": 3.8853503184713376, + "grad_norm": 0.3498990752877721, + "learning_rate": 1.1778542746140814e-06, + "loss": 0.0087, + "step": 8540 + }, + { + "epoch": 3.885805277525023, + "grad_norm": 0.077305454218025, + "learning_rate": 1.1769329525700934e-06, + "loss": 0.0008, + "step": 8541 + }, + { + "epoch": 3.886260236578708, + "grad_norm": 0.13867732645712094, + "learning_rate": 1.176011942929881e-06, + "loss": 0.0026, + "step": 8542 + }, + { + "epoch": 3.886715195632393, + "grad_norm": 2.021104381740242, + "learning_rate": 1.1750912457687024e-06, + "loss": 0.0092, + "step": 8543 + }, + { + "epoch": 3.8871701546860784, + "grad_norm": 0.05688037376082502, + "learning_rate": 1.1741708611617951e-06, + "loss": 0.0004, + "step": 8544 + }, + { + "epoch": 3.8876251137397633, + "grad_norm": 0.11688013987991248, + "learning_rate": 1.1732507891843681e-06, + "loss": 0.0013, + "step": 8545 + }, + { + "epoch": 3.8880800727934486, + "grad_norm": 0.11431852676949443, + "learning_rate": 1.1723310299116052e-06, + "loss": 0.0009, + "step": 8546 + }, + { + "epoch": 3.888535031847134, + "grad_norm": 0.12562883668862718, + "learning_rate": 1.1714115834186646e-06, + "loss": 0.0013, + "step": 8547 + }, + { + "epoch": 3.8889899909008188, + "grad_norm": 0.17462077871472168, + "learning_rate": 1.1704924497806775e-06, + "loss": 0.002, + "step": 8548 + }, + { + "epoch": 3.889444949954504, + "grad_norm": 0.2662032111623803, + "learning_rate": 1.1695736290727554e-06, + "loss": 0.002, + "step": 8549 + }, + { + "epoch": 3.8898999090081894, + "grad_norm": 0.1087899932136938, + "learning_rate": 1.1686551213699788e-06, + "loss": 0.0009, + "step": 8550 + }, + { + "epoch": 3.8903548680618742, + "grad_norm": 0.3749712407882381, + "learning_rate": 1.1677369267474036e-06, + "loss": 0.0037, + "step": 8551 + }, + { + "epoch": 3.8908098271155596, + "grad_norm": 0.35472025867000434, + "learning_rate": 1.1668190452800604e-06, + "loss": 0.0039, + "step": 8552 + }, + { + "epoch": 3.891264786169245, + "grad_norm": 0.1307639121413599, + "learning_rate": 1.1659014770429527e-06, + "loss": 0.001, + "step": 8553 + }, + { + "epoch": 3.8917197452229297, + "grad_norm": 0.15643581691221203, + "learning_rate": 1.164984222111063e-06, + "loss": 0.0015, + "step": 8554 + }, + { + "epoch": 3.892174704276615, + "grad_norm": 0.15113369685521766, + "learning_rate": 1.1640672805593423e-06, + "loss": 0.0032, + "step": 8555 + }, + { + "epoch": 3.8926296633303004, + "grad_norm": 0.2700194885635221, + "learning_rate": 1.1631506524627223e-06, + "loss": 0.0042, + "step": 8556 + }, + { + "epoch": 3.8930846223839852, + "grad_norm": 0.22335646873162818, + "learning_rate": 1.1622343378961037e-06, + "loss": 0.0034, + "step": 8557 + }, + { + "epoch": 3.8935395814376705, + "grad_norm": 0.3530776068168374, + "learning_rate": 1.1613183369343627e-06, + "loss": 0.0031, + "step": 8558 + }, + { + "epoch": 3.893994540491356, + "grad_norm": 0.14898826965234926, + "learning_rate": 1.1604026496523536e-06, + "loss": 0.0015, + "step": 8559 + }, + { + "epoch": 3.8944494995450407, + "grad_norm": 0.0900944793650691, + "learning_rate": 1.1594872761249e-06, + "loss": 0.0007, + "step": 8560 + }, + { + "epoch": 3.894904458598726, + "grad_norm": 0.2546589358428222, + "learning_rate": 1.1585722164268021e-06, + "loss": 0.0038, + "step": 8561 + }, + { + "epoch": 3.8953594176524113, + "grad_norm": 0.20274418218456525, + "learning_rate": 1.1576574706328342e-06, + "loss": 0.0029, + "step": 8562 + }, + { + "epoch": 3.895814376706096, + "grad_norm": 0.382352250268221, + "learning_rate": 1.1567430388177459e-06, + "loss": 0.0058, + "step": 8563 + }, + { + "epoch": 3.8962693357597815, + "grad_norm": 0.3740862719077101, + "learning_rate": 1.1558289210562618e-06, + "loss": 0.0038, + "step": 8564 + }, + { + "epoch": 3.896724294813467, + "grad_norm": 0.1599488372275841, + "learning_rate": 1.1549151174230778e-06, + "loss": 0.0025, + "step": 8565 + }, + { + "epoch": 3.8971792538671517, + "grad_norm": 0.41750466773791584, + "learning_rate": 1.1540016279928668e-06, + "loss": 0.006, + "step": 8566 + }, + { + "epoch": 3.897634212920837, + "grad_norm": 0.12327063563732767, + "learning_rate": 1.1530884528402724e-06, + "loss": 0.0016, + "step": 8567 + }, + { + "epoch": 3.8980891719745223, + "grad_norm": 0.2799985029804694, + "learning_rate": 1.1521755920399191e-06, + "loss": 0.0079, + "step": 8568 + }, + { + "epoch": 3.8985441310282076, + "grad_norm": 0.30444484701947416, + "learning_rate": 1.1512630456663976e-06, + "loss": 0.0022, + "step": 8569 + }, + { + "epoch": 3.8989990900818925, + "grad_norm": 0.12678300534776715, + "learning_rate": 1.1503508137942814e-06, + "loss": 0.0013, + "step": 8570 + }, + { + "epoch": 3.899454049135578, + "grad_norm": 0.3200922001692808, + "learning_rate": 1.1494388964981117e-06, + "loss": 0.009, + "step": 8571 + }, + { + "epoch": 3.899909008189263, + "grad_norm": 0.3025651795182405, + "learning_rate": 1.1485272938524045e-06, + "loss": 0.002, + "step": 8572 + }, + { + "epoch": 3.900363967242948, + "grad_norm": 0.1477858119731619, + "learning_rate": 1.147616005931656e-06, + "loss": 0.0012, + "step": 8573 + }, + { + "epoch": 3.9008189262966333, + "grad_norm": 0.3644421033195017, + "learning_rate": 1.1467050328103295e-06, + "loss": 0.0041, + "step": 8574 + }, + { + "epoch": 3.9012738853503186, + "grad_norm": 0.412765249106771, + "learning_rate": 1.145794374562867e-06, + "loss": 0.0112, + "step": 8575 + }, + { + "epoch": 3.901728844404004, + "grad_norm": 0.09425648281105749, + "learning_rate": 1.1448840312636812e-06, + "loss": 0.0013, + "step": 8576 + }, + { + "epoch": 3.902183803457689, + "grad_norm": 0.11709607267456766, + "learning_rate": 1.1439740029871622e-06, + "loss": 0.001, + "step": 8577 + }, + { + "epoch": 3.902638762511374, + "grad_norm": 0.41047764808237097, + "learning_rate": 1.143064289807676e-06, + "loss": 0.008, + "step": 8578 + }, + { + "epoch": 3.9030937215650594, + "grad_norm": 0.44103632497336703, + "learning_rate": 1.1421548917995584e-06, + "loss": 0.0054, + "step": 8579 + }, + { + "epoch": 3.9035486806187443, + "grad_norm": 0.08228339709205519, + "learning_rate": 1.1412458090371208e-06, + "loss": 0.0012, + "step": 8580 + }, + { + "epoch": 3.9040036396724296, + "grad_norm": 0.11814590257555363, + "learning_rate": 1.1403370415946486e-06, + "loss": 0.0014, + "step": 8581 + }, + { + "epoch": 3.904458598726115, + "grad_norm": 0.2564362949576262, + "learning_rate": 1.1394285895464041e-06, + "loss": 0.0064, + "step": 8582 + }, + { + "epoch": 3.9049135577797998, + "grad_norm": 0.5530443241890499, + "learning_rate": 1.1385204529666205e-06, + "loss": 0.0034, + "step": 8583 + }, + { + "epoch": 3.905368516833485, + "grad_norm": 0.1478147299889684, + "learning_rate": 1.1376126319295078e-06, + "loss": 0.0014, + "step": 8584 + }, + { + "epoch": 3.9058234758871704, + "grad_norm": 0.47332719412921304, + "learning_rate": 1.1367051265092487e-06, + "loss": 0.009, + "step": 8585 + }, + { + "epoch": 3.9062784349408552, + "grad_norm": 0.21666486995780654, + "learning_rate": 1.1357979367800004e-06, + "loss": 0.0023, + "step": 8586 + }, + { + "epoch": 3.9067333939945406, + "grad_norm": 0.27718541865223906, + "learning_rate": 1.1348910628158927e-06, + "loss": 0.0055, + "step": 8587 + }, + { + "epoch": 3.907188353048226, + "grad_norm": 0.3178236785516548, + "learning_rate": 1.1339845046910342e-06, + "loss": 0.0079, + "step": 8588 + }, + { + "epoch": 3.9076433121019107, + "grad_norm": 0.2835566572880014, + "learning_rate": 1.1330782624795027e-06, + "loss": 0.0044, + "step": 8589 + }, + { + "epoch": 3.908098271155596, + "grad_norm": 0.17121068411044227, + "learning_rate": 1.1321723362553516e-06, + "loss": 0.0035, + "step": 8590 + }, + { + "epoch": 3.9085532302092814, + "grad_norm": 0.2633371349136533, + "learning_rate": 1.131266726092612e-06, + "loss": 0.0052, + "step": 8591 + }, + { + "epoch": 3.9090081892629662, + "grad_norm": 0.18550586432732438, + "learning_rate": 1.1303614320652828e-06, + "loss": 0.0012, + "step": 8592 + }, + { + "epoch": 3.9094631483166515, + "grad_norm": 0.14134717139327005, + "learning_rate": 1.1294564542473435e-06, + "loss": 0.0009, + "step": 8593 + }, + { + "epoch": 3.909918107370337, + "grad_norm": 0.14179189989250485, + "learning_rate": 1.1285517927127438e-06, + "loss": 0.0011, + "step": 8594 + }, + { + "epoch": 3.9103730664240217, + "grad_norm": 0.18480139294796458, + "learning_rate": 1.1276474475354077e-06, + "loss": 0.0042, + "step": 8595 + }, + { + "epoch": 3.910828025477707, + "grad_norm": 0.26152965664440003, + "learning_rate": 1.126743418789234e-06, + "loss": 0.0021, + "step": 8596 + }, + { + "epoch": 3.9112829845313923, + "grad_norm": 0.25280587208806443, + "learning_rate": 1.1258397065480963e-06, + "loss": 0.0022, + "step": 8597 + }, + { + "epoch": 3.911737943585077, + "grad_norm": 0.26530461494558766, + "learning_rate": 1.124936310885844e-06, + "loss": 0.0056, + "step": 8598 + }, + { + "epoch": 3.9121929026387625, + "grad_norm": 0.30633216410793296, + "learning_rate": 1.1240332318762964e-06, + "loss": 0.0085, + "step": 8599 + }, + { + "epoch": 3.912647861692448, + "grad_norm": 0.08594344662114259, + "learning_rate": 1.1231304695932494e-06, + "loss": 0.0016, + "step": 8600 + }, + { + "epoch": 3.9131028207461327, + "grad_norm": 0.10880268538920568, + "learning_rate": 1.1222280241104716e-06, + "loss": 0.0012, + "step": 8601 + }, + { + "epoch": 3.913557779799818, + "grad_norm": 0.5411180940024666, + "learning_rate": 1.1213258955017086e-06, + "loss": 0.0097, + "step": 8602 + }, + { + "epoch": 3.9140127388535033, + "grad_norm": 0.2871672628937993, + "learning_rate": 1.1204240838406782e-06, + "loss": 0.0056, + "step": 8603 + }, + { + "epoch": 3.914467697907188, + "grad_norm": 0.3146656867161822, + "learning_rate": 1.1195225892010697e-06, + "loss": 0.0084, + "step": 8604 + }, + { + "epoch": 3.9149226569608735, + "grad_norm": 0.2352121235369682, + "learning_rate": 1.118621411656553e-06, + "loss": 0.002, + "step": 8605 + }, + { + "epoch": 3.915377616014559, + "grad_norm": 0.09586231537241285, + "learning_rate": 1.1177205512807643e-06, + "loss": 0.0007, + "step": 8606 + }, + { + "epoch": 3.9158325750682437, + "grad_norm": 0.21579455644758452, + "learning_rate": 1.1168200081473219e-06, + "loss": 0.0019, + "step": 8607 + }, + { + "epoch": 3.916287534121929, + "grad_norm": 0.2781448682024323, + "learning_rate": 1.1159197823298117e-06, + "loss": 0.0039, + "step": 8608 + }, + { + "epoch": 3.9167424931756143, + "grad_norm": 0.30543288704648824, + "learning_rate": 1.115019873901797e-06, + "loss": 0.0096, + "step": 8609 + }, + { + "epoch": 3.917197452229299, + "grad_norm": 0.0742914789727566, + "learning_rate": 1.1141202829368124e-06, + "loss": 0.0011, + "step": 8610 + }, + { + "epoch": 3.9176524112829845, + "grad_norm": 0.19062290156766876, + "learning_rate": 1.1132210095083696e-06, + "loss": 0.0068, + "step": 8611 + }, + { + "epoch": 3.91810737033667, + "grad_norm": 0.26470427971734123, + "learning_rate": 1.112322053689955e-06, + "loss": 0.0043, + "step": 8612 + }, + { + "epoch": 3.9185623293903546, + "grad_norm": 0.16327070324816795, + "learning_rate": 1.111423415555025e-06, + "loss": 0.0016, + "step": 8613 + }, + { + "epoch": 3.91901728844404, + "grad_norm": 0.2462024846666314, + "learning_rate": 1.110525095177013e-06, + "loss": 0.0019, + "step": 8614 + }, + { + "epoch": 3.9194722474977253, + "grad_norm": 0.18678889734389317, + "learning_rate": 1.1096270926293245e-06, + "loss": 0.0017, + "step": 8615 + }, + { + "epoch": 3.91992720655141, + "grad_norm": 0.2198330200083498, + "learning_rate": 1.1087294079853423e-06, + "loss": 0.0056, + "step": 8616 + }, + { + "epoch": 3.9203821656050954, + "grad_norm": 0.1754214115924083, + "learning_rate": 1.10783204131842e-06, + "loss": 0.0024, + "step": 8617 + }, + { + "epoch": 3.9208371246587808, + "grad_norm": 0.09310448686369165, + "learning_rate": 1.1069349927018858e-06, + "loss": 0.0009, + "step": 8618 + }, + { + "epoch": 3.9212920837124656, + "grad_norm": 0.3972172442181105, + "learning_rate": 1.1060382622090437e-06, + "loss": 0.0047, + "step": 8619 + }, + { + "epoch": 3.921747042766151, + "grad_norm": 0.13143351817243823, + "learning_rate": 1.1051418499131683e-06, + "loss": 0.0011, + "step": 8620 + }, + { + "epoch": 3.9222020018198362, + "grad_norm": 0.10110188430928709, + "learning_rate": 1.1042457558875135e-06, + "loss": 0.0012, + "step": 8621 + }, + { + "epoch": 3.922656960873521, + "grad_norm": 0.16142768137410202, + "learning_rate": 1.1033499802053027e-06, + "loss": 0.0016, + "step": 8622 + }, + { + "epoch": 3.9231119199272064, + "grad_norm": 0.34715387308860346, + "learning_rate": 1.1024545229397344e-06, + "loss": 0.0029, + "step": 8623 + }, + { + "epoch": 3.9235668789808917, + "grad_norm": 0.39642159044255004, + "learning_rate": 1.101559384163981e-06, + "loss": 0.0061, + "step": 8624 + }, + { + "epoch": 3.924021838034577, + "grad_norm": 0.3881289778789691, + "learning_rate": 1.1006645639511881e-06, + "loss": 0.0139, + "step": 8625 + }, + { + "epoch": 3.924476797088262, + "grad_norm": 0.1449078177475703, + "learning_rate": 1.0997700623744784e-06, + "loss": 0.0012, + "step": 8626 + }, + { + "epoch": 3.9249317561419472, + "grad_norm": 0.11366266129974073, + "learning_rate": 1.0988758795069465e-06, + "loss": 0.0006, + "step": 8627 + }, + { + "epoch": 3.9253867151956325, + "grad_norm": 0.07754432110257026, + "learning_rate": 1.0979820154216608e-06, + "loss": 0.0005, + "step": 8628 + }, + { + "epoch": 3.9258416742493174, + "grad_norm": 0.07008705442776036, + "learning_rate": 1.0970884701916634e-06, + "loss": 0.0009, + "step": 8629 + }, + { + "epoch": 3.9262966333030027, + "grad_norm": 0.2680134664292478, + "learning_rate": 1.0961952438899699e-06, + "loss": 0.003, + "step": 8630 + }, + { + "epoch": 3.926751592356688, + "grad_norm": 0.1646752181540853, + "learning_rate": 1.0953023365895721e-06, + "loss": 0.0022, + "step": 8631 + }, + { + "epoch": 3.9272065514103733, + "grad_norm": 0.23522267023706603, + "learning_rate": 1.094409748363433e-06, + "loss": 0.0037, + "step": 8632 + }, + { + "epoch": 3.927661510464058, + "grad_norm": 0.19158777285507161, + "learning_rate": 1.0935174792844934e-06, + "loss": 0.0061, + "step": 8633 + }, + { + "epoch": 3.9281164695177435, + "grad_norm": 0.21099724199086758, + "learning_rate": 1.0926255294256638e-06, + "loss": 0.0031, + "step": 8634 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 0.12153992936245393, + "learning_rate": 1.0917338988598287e-06, + "loss": 0.0008, + "step": 8635 + }, + { + "epoch": 3.9290263876251137, + "grad_norm": 0.47079421823001855, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.0085, + "step": 8636 + }, + { + "epoch": 3.929481346678799, + "grad_norm": 0.30617255225261963, + "learning_rate": 1.0899515958985641e-06, + "loss": 0.0032, + "step": 8637 + }, + { + "epoch": 3.9299363057324843, + "grad_norm": 0.1920210400975485, + "learning_rate": 1.0890609236487748e-06, + "loss": 0.002, + "step": 8638 + }, + { + "epoch": 3.930391264786169, + "grad_norm": 0.4275090553301554, + "learning_rate": 1.088170570983264e-06, + "loss": 0.0043, + "step": 8639 + }, + { + "epoch": 3.9308462238398545, + "grad_norm": 0.16416958893908284, + "learning_rate": 1.0872805379747881e-06, + "loss": 0.0031, + "step": 8640 + }, + { + "epoch": 3.93130118289354, + "grad_norm": 0.40803310219522615, + "learning_rate": 1.0863908246960786e-06, + "loss": 0.0042, + "step": 8641 + }, + { + "epoch": 3.9317561419472247, + "grad_norm": 0.30289168127297866, + "learning_rate": 1.085501431219837e-06, + "loss": 0.0069, + "step": 8642 + }, + { + "epoch": 3.93221110100091, + "grad_norm": 0.19369172892118913, + "learning_rate": 1.0846123576187413e-06, + "loss": 0.0037, + "step": 8643 + }, + { + "epoch": 3.9326660600545953, + "grad_norm": 0.14886037559979806, + "learning_rate": 1.0837236039654397e-06, + "loss": 0.0021, + "step": 8644 + }, + { + "epoch": 3.93312101910828, + "grad_norm": 0.05580514991875835, + "learning_rate": 1.0828351703325612e-06, + "loss": 0.0005, + "step": 8645 + }, + { + "epoch": 3.9335759781619655, + "grad_norm": 0.20148014456043523, + "learning_rate": 1.0819470567927021e-06, + "loss": 0.0014, + "step": 8646 + }, + { + "epoch": 3.934030937215651, + "grad_norm": 0.16647964047422256, + "learning_rate": 1.0810592634184364e-06, + "loss": 0.0006, + "step": 8647 + }, + { + "epoch": 3.9344858962693356, + "grad_norm": 0.061531122052330224, + "learning_rate": 1.08017179028231e-06, + "loss": 0.0005, + "step": 8648 + }, + { + "epoch": 3.934940855323021, + "grad_norm": 0.3815404935382793, + "learning_rate": 1.0792846374568416e-06, + "loss": 0.0102, + "step": 8649 + }, + { + "epoch": 3.9353958143767063, + "grad_norm": 0.11426919162438308, + "learning_rate": 1.0783978050145288e-06, + "loss": 0.0009, + "step": 8650 + }, + { + "epoch": 3.935850773430391, + "grad_norm": 0.49548187169426167, + "learning_rate": 1.077511293027837e-06, + "loss": 0.0069, + "step": 8651 + }, + { + "epoch": 3.9363057324840764, + "grad_norm": 0.2875600916353048, + "learning_rate": 1.0766251015692086e-06, + "loss": 0.006, + "step": 8652 + }, + { + "epoch": 3.9367606915377618, + "grad_norm": 0.23380131484145028, + "learning_rate": 1.075739230711058e-06, + "loss": 0.0074, + "step": 8653 + }, + { + "epoch": 3.9372156505914466, + "grad_norm": 0.2477451242343582, + "learning_rate": 1.0748536805257753e-06, + "loss": 0.0034, + "step": 8654 + }, + { + "epoch": 3.937670609645132, + "grad_norm": 0.1261165654044897, + "learning_rate": 1.0739684510857257e-06, + "loss": 0.0011, + "step": 8655 + }, + { + "epoch": 3.9381255686988172, + "grad_norm": 0.26204275295761664, + "learning_rate": 1.0730835424632446e-06, + "loss": 0.0028, + "step": 8656 + }, + { + "epoch": 3.938580527752502, + "grad_norm": 0.18508101639745386, + "learning_rate": 1.0721989547306423e-06, + "loss": 0.0039, + "step": 8657 + }, + { + "epoch": 3.9390354868061874, + "grad_norm": 0.21769222540241673, + "learning_rate": 1.0713146879602038e-06, + "loss": 0.0025, + "step": 8658 + }, + { + "epoch": 3.9394904458598727, + "grad_norm": 0.10034400825997478, + "learning_rate": 1.0704307422241856e-06, + "loss": 0.0014, + "step": 8659 + }, + { + "epoch": 3.9399454049135576, + "grad_norm": 0.3513842150791386, + "learning_rate": 1.0695471175948213e-06, + "loss": 0.0024, + "step": 8660 + }, + { + "epoch": 3.940400363967243, + "grad_norm": 0.24255234185569405, + "learning_rate": 1.0686638141443184e-06, + "loss": 0.0013, + "step": 8661 + }, + { + "epoch": 3.9408553230209282, + "grad_norm": 0.15771631194738922, + "learning_rate": 1.067780831944855e-06, + "loss": 0.0032, + "step": 8662 + }, + { + "epoch": 3.941310282074613, + "grad_norm": 0.20513929273822182, + "learning_rate": 1.0668981710685844e-06, + "loss": 0.001, + "step": 8663 + }, + { + "epoch": 3.9417652411282984, + "grad_norm": 0.07935303674519927, + "learning_rate": 1.0660158315876318e-06, + "loss": 0.0018, + "step": 8664 + }, + { + "epoch": 3.9422202001819837, + "grad_norm": 0.09263657522623965, + "learning_rate": 1.0651338135741006e-06, + "loss": 0.001, + "step": 8665 + }, + { + "epoch": 3.9426751592356686, + "grad_norm": 0.24340118318335507, + "learning_rate": 1.0642521171000653e-06, + "loss": 0.0023, + "step": 8666 + }, + { + "epoch": 3.943130118289354, + "grad_norm": 0.17367882379298985, + "learning_rate": 1.0633707422375716e-06, + "loss": 0.0018, + "step": 8667 + }, + { + "epoch": 3.943585077343039, + "grad_norm": 0.07771508279610274, + "learning_rate": 1.062489689058645e-06, + "loss": 0.0008, + "step": 8668 + }, + { + "epoch": 3.944040036396724, + "grad_norm": 0.2898127528353958, + "learning_rate": 1.0616089576352774e-06, + "loss": 0.0074, + "step": 8669 + }, + { + "epoch": 3.9444949954504094, + "grad_norm": 0.24703088495388803, + "learning_rate": 1.060728548039442e-06, + "loss": 0.0026, + "step": 8670 + }, + { + "epoch": 3.9449499545040947, + "grad_norm": 0.26944606176110175, + "learning_rate": 1.0598484603430797e-06, + "loss": 0.0032, + "step": 8671 + }, + { + "epoch": 3.9454049135577796, + "grad_norm": 0.29121094145548027, + "learning_rate": 1.0589686946181078e-06, + "loss": 0.0023, + "step": 8672 + }, + { + "epoch": 3.945859872611465, + "grad_norm": 0.20062648290840376, + "learning_rate": 1.0580892509364149e-06, + "loss": 0.0024, + "step": 8673 + }, + { + "epoch": 3.94631483166515, + "grad_norm": 0.3499803264391899, + "learning_rate": 1.0572101293698671e-06, + "loss": 0.0032, + "step": 8674 + }, + { + "epoch": 3.946769790718835, + "grad_norm": 0.21998699471541316, + "learning_rate": 1.056331329990304e-06, + "loss": 0.0021, + "step": 8675 + }, + { + "epoch": 3.9472247497725204, + "grad_norm": 0.20345501118890286, + "learning_rate": 1.0554528528695346e-06, + "loss": 0.0038, + "step": 8676 + }, + { + "epoch": 3.9476797088262057, + "grad_norm": 0.129402120440634, + "learning_rate": 1.0545746980793447e-06, + "loss": 0.0011, + "step": 8677 + }, + { + "epoch": 3.9481346678798905, + "grad_norm": 0.17145867970903111, + "learning_rate": 1.0536968656914914e-06, + "loss": 0.002, + "step": 8678 + }, + { + "epoch": 3.948589626933576, + "grad_norm": 0.2681611999526238, + "learning_rate": 1.052819355777711e-06, + "loss": 0.0064, + "step": 8679 + }, + { + "epoch": 3.949044585987261, + "grad_norm": 0.254791795465932, + "learning_rate": 1.051942168409707e-06, + "loss": 0.0025, + "step": 8680 + }, + { + "epoch": 3.9494995450409465, + "grad_norm": 0.6065577504385965, + "learning_rate": 1.0510653036591583e-06, + "loss": 0.0085, + "step": 8681 + }, + { + "epoch": 3.9499545040946313, + "grad_norm": 0.156835419474489, + "learning_rate": 1.0501887615977214e-06, + "loss": 0.0011, + "step": 8682 + }, + { + "epoch": 3.9504094631483166, + "grad_norm": 0.2636604698049228, + "learning_rate": 1.0493125422970202e-06, + "loss": 0.0054, + "step": 8683 + }, + { + "epoch": 3.950864422202002, + "grad_norm": 0.13970453218294443, + "learning_rate": 1.0484366458286587e-06, + "loss": 0.0015, + "step": 8684 + }, + { + "epoch": 3.951319381255687, + "grad_norm": 0.23198002437201626, + "learning_rate": 1.0475610722642088e-06, + "loss": 0.0046, + "step": 8685 + }, + { + "epoch": 3.951774340309372, + "grad_norm": 0.41613148640674474, + "learning_rate": 1.0466858216752195e-06, + "loss": 0.0098, + "step": 8686 + }, + { + "epoch": 3.9522292993630574, + "grad_norm": 0.11465905064390043, + "learning_rate": 1.04581089413321e-06, + "loss": 0.0011, + "step": 8687 + }, + { + "epoch": 3.9526842584167428, + "grad_norm": 0.4837982363440678, + "learning_rate": 1.0449362897096776e-06, + "loss": 0.0046, + "step": 8688 + }, + { + "epoch": 3.9531392174704276, + "grad_norm": 0.6149581324428018, + "learning_rate": 1.0440620084760922e-06, + "loss": 0.0052, + "step": 8689 + }, + { + "epoch": 3.953594176524113, + "grad_norm": 0.29608630177877293, + "learning_rate": 1.0431880505038945e-06, + "loss": 0.0048, + "step": 8690 + }, + { + "epoch": 3.9540491355777982, + "grad_norm": 0.6426721614861997, + "learning_rate": 1.0423144158644999e-06, + "loss": 0.0147, + "step": 8691 + }, + { + "epoch": 3.954504094631483, + "grad_norm": 0.08263751493435481, + "learning_rate": 1.0414411046292994e-06, + "loss": 0.0013, + "step": 8692 + }, + { + "epoch": 3.9549590536851684, + "grad_norm": 0.11949983173572434, + "learning_rate": 1.040568116869653e-06, + "loss": 0.0008, + "step": 8693 + }, + { + "epoch": 3.9554140127388537, + "grad_norm": 0.14808758997571178, + "learning_rate": 1.0396954526569014e-06, + "loss": 0.001, + "step": 8694 + }, + { + "epoch": 3.9558689717925386, + "grad_norm": 0.4391031935388645, + "learning_rate": 1.038823112062351e-06, + "loss": 0.0062, + "step": 8695 + }, + { + "epoch": 3.956323930846224, + "grad_norm": 0.3449639865614129, + "learning_rate": 1.0379510951572891e-06, + "loss": 0.0058, + "step": 8696 + }, + { + "epoch": 3.9567788898999092, + "grad_norm": 0.22060024698376487, + "learning_rate": 1.037079402012971e-06, + "loss": 0.0033, + "step": 8697 + }, + { + "epoch": 3.957233848953594, + "grad_norm": 0.19863491841459927, + "learning_rate": 1.0362080327006263e-06, + "loss": 0.0014, + "step": 8698 + }, + { + "epoch": 3.9576888080072794, + "grad_norm": 0.6450457156491471, + "learning_rate": 1.0353369872914626e-06, + "loss": 0.0092, + "step": 8699 + }, + { + "epoch": 3.9581437670609647, + "grad_norm": 0.1812422798582844, + "learning_rate": 1.0344662658566562e-06, + "loss": 0.0029, + "step": 8700 + }, + { + "epoch": 3.9585987261146496, + "grad_norm": 0.3499859683753853, + "learning_rate": 1.0335958684673574e-06, + "loss": 0.0048, + "step": 8701 + }, + { + "epoch": 3.959053685168335, + "grad_norm": 0.33754610678884184, + "learning_rate": 1.0327257951946917e-06, + "loss": 0.006, + "step": 8702 + }, + { + "epoch": 3.95950864422202, + "grad_norm": 0.2245844092477358, + "learning_rate": 1.0318560461097577e-06, + "loss": 0.0018, + "step": 8703 + }, + { + "epoch": 3.959963603275705, + "grad_norm": 0.12315408931465745, + "learning_rate": 1.030986621283629e-06, + "loss": 0.0012, + "step": 8704 + }, + { + "epoch": 3.9604185623293904, + "grad_norm": 0.12010861141309559, + "learning_rate": 1.0301175207873492e-06, + "loss": 0.0017, + "step": 8705 + }, + { + "epoch": 3.9608735213830757, + "grad_norm": 1.2467247797502008, + "learning_rate": 1.0292487446919385e-06, + "loss": 0.0094, + "step": 8706 + }, + { + "epoch": 3.9613284804367606, + "grad_norm": 0.1702753689624994, + "learning_rate": 1.0283802930683866e-06, + "loss": 0.0021, + "step": 8707 + }, + { + "epoch": 3.961783439490446, + "grad_norm": 0.19280587099263943, + "learning_rate": 1.0275121659876636e-06, + "loss": 0.0054, + "step": 8708 + }, + { + "epoch": 3.962238398544131, + "grad_norm": 0.12907783862629738, + "learning_rate": 1.0266443635207052e-06, + "loss": 0.0008, + "step": 8709 + }, + { + "epoch": 3.962693357597816, + "grad_norm": 0.0991430962001798, + "learning_rate": 1.0257768857384271e-06, + "loss": 0.0012, + "step": 8710 + }, + { + "epoch": 3.9631483166515014, + "grad_norm": 0.2809748907686127, + "learning_rate": 1.0249097327117142e-06, + "loss": 0.0057, + "step": 8711 + }, + { + "epoch": 3.9636032757051867, + "grad_norm": 0.1621882457127811, + "learning_rate": 1.0240429045114258e-06, + "loss": 0.0016, + "step": 8712 + }, + { + "epoch": 3.9640582347588715, + "grad_norm": 0.13838844775693793, + "learning_rate": 1.0231764012083966e-06, + "loss": 0.0017, + "step": 8713 + }, + { + "epoch": 3.964513193812557, + "grad_norm": 0.2524560728919716, + "learning_rate": 1.0223102228734332e-06, + "loss": 0.0039, + "step": 8714 + }, + { + "epoch": 3.964968152866242, + "grad_norm": 0.1490474408984266, + "learning_rate": 1.0214443695773152e-06, + "loss": 0.0022, + "step": 8715 + }, + { + "epoch": 3.965423111919927, + "grad_norm": 0.15521693607656253, + "learning_rate": 1.0205788413907952e-06, + "loss": 0.0011, + "step": 8716 + }, + { + "epoch": 3.9658780709736123, + "grad_norm": 0.25999108001055726, + "learning_rate": 1.0197136383846013e-06, + "loss": 0.0028, + "step": 8717 + }, + { + "epoch": 3.9663330300272976, + "grad_norm": 0.3463144321518056, + "learning_rate": 1.018848760629435e-06, + "loss": 0.0068, + "step": 8718 + }, + { + "epoch": 3.9667879890809825, + "grad_norm": 0.17594896148628147, + "learning_rate": 1.0179842081959695e-06, + "loss": 0.0028, + "step": 8719 + }, + { + "epoch": 3.967242948134668, + "grad_norm": 0.04295292711472292, + "learning_rate": 1.0171199811548522e-06, + "loss": 0.0005, + "step": 8720 + }, + { + "epoch": 3.967697907188353, + "grad_norm": 0.28654923641368224, + "learning_rate": 1.0162560795767019e-06, + "loss": 0.003, + "step": 8721 + }, + { + "epoch": 3.968152866242038, + "grad_norm": 0.3053671988277895, + "learning_rate": 1.0153925035321155e-06, + "loss": 0.0102, + "step": 8722 + }, + { + "epoch": 3.9686078252957233, + "grad_norm": 0.2715425283881052, + "learning_rate": 1.0145292530916584e-06, + "loss": 0.0025, + "step": 8723 + }, + { + "epoch": 3.9690627843494086, + "grad_norm": 0.07658153294032666, + "learning_rate": 1.0136663283258734e-06, + "loss": 0.0005, + "step": 8724 + }, + { + "epoch": 3.9695177434030935, + "grad_norm": 0.13775655789012115, + "learning_rate": 1.0128037293052744e-06, + "loss": 0.0012, + "step": 8725 + }, + { + "epoch": 3.969972702456779, + "grad_norm": 0.2597407304558315, + "learning_rate": 1.0119414561003472e-06, + "loss": 0.0034, + "step": 8726 + }, + { + "epoch": 3.970427661510464, + "grad_norm": 0.23116775928116218, + "learning_rate": 1.0110795087815555e-06, + "loss": 0.003, + "step": 8727 + }, + { + "epoch": 3.970882620564149, + "grad_norm": 0.21092958003181436, + "learning_rate": 1.0102178874193324e-06, + "loss": 0.0041, + "step": 8728 + }, + { + "epoch": 3.9713375796178343, + "grad_norm": 0.3633862961488543, + "learning_rate": 1.0093565920840863e-06, + "loss": 0.0028, + "step": 8729 + }, + { + "epoch": 3.9717925386715196, + "grad_norm": 0.3691079082696472, + "learning_rate": 1.0084956228461962e-06, + "loss": 0.006, + "step": 8730 + }, + { + "epoch": 3.9722474977252045, + "grad_norm": 0.14908440249447896, + "learning_rate": 1.0076349797760199e-06, + "loss": 0.0025, + "step": 8731 + }, + { + "epoch": 3.97270245677889, + "grad_norm": 0.34556387668714117, + "learning_rate": 1.0067746629438819e-06, + "loss": 0.003, + "step": 8732 + }, + { + "epoch": 3.973157415832575, + "grad_norm": 0.32848764113459716, + "learning_rate": 1.0059146724200869e-06, + "loss": 0.0094, + "step": 8733 + }, + { + "epoch": 3.9736123748862604, + "grad_norm": 0.23966969552121023, + "learning_rate": 1.0050550082749077e-06, + "loss": 0.004, + "step": 8734 + }, + { + "epoch": 3.9740673339399453, + "grad_norm": 0.21141709663998878, + "learning_rate": 1.0041956705785921e-06, + "loss": 0.0033, + "step": 8735 + }, + { + "epoch": 3.9745222929936306, + "grad_norm": 0.34271889423516433, + "learning_rate": 1.0033366594013605e-06, + "loss": 0.006, + "step": 8736 + }, + { + "epoch": 3.974977252047316, + "grad_norm": 0.16438627342977627, + "learning_rate": 1.0024779748134077e-06, + "loss": 0.0023, + "step": 8737 + }, + { + "epoch": 3.9754322111010008, + "grad_norm": 0.21527411005713984, + "learning_rate": 1.001619616884904e-06, + "loss": 0.0033, + "step": 8738 + }, + { + "epoch": 3.975887170154686, + "grad_norm": 0.1608473656316147, + "learning_rate": 1.0007615856859882e-06, + "loss": 0.0009, + "step": 8739 + }, + { + "epoch": 3.9763421292083714, + "grad_norm": 0.3284809627554831, + "learning_rate": 9.999038812867757e-07, + "loss": 0.0074, + "step": 8740 + }, + { + "epoch": 3.9767970882620567, + "grad_norm": 0.1462191760856074, + "learning_rate": 9.990465037573522e-07, + "loss": 0.0015, + "step": 8741 + }, + { + "epoch": 3.9772520473157416, + "grad_norm": 0.32529967232499146, + "learning_rate": 9.981894531677811e-07, + "loss": 0.0019, + "step": 8742 + }, + { + "epoch": 3.977707006369427, + "grad_norm": 0.1776392132258705, + "learning_rate": 9.973327295880962e-07, + "loss": 0.0029, + "step": 8743 + }, + { + "epoch": 3.978161965423112, + "grad_norm": 0.3820563221184206, + "learning_rate": 9.964763330883037e-07, + "loss": 0.0058, + "step": 8744 + }, + { + "epoch": 3.978616924476797, + "grad_norm": 0.30612370402455663, + "learning_rate": 9.956202637383872e-07, + "loss": 0.0051, + "step": 8745 + }, + { + "epoch": 3.9790718835304824, + "grad_norm": 0.3191255868166321, + "learning_rate": 9.947645216082969e-07, + "loss": 0.0094, + "step": 8746 + }, + { + "epoch": 3.9795268425841677, + "grad_norm": 0.4696820632872373, + "learning_rate": 9.93909106767964e-07, + "loss": 0.0103, + "step": 8747 + }, + { + "epoch": 3.9799818016378525, + "grad_norm": 0.042044907583024684, + "learning_rate": 9.930540192872878e-07, + "loss": 0.0004, + "step": 8748 + }, + { + "epoch": 3.980436760691538, + "grad_norm": 0.11795179096727024, + "learning_rate": 9.921992592361417e-07, + "loss": 0.0025, + "step": 8749 + }, + { + "epoch": 3.980891719745223, + "grad_norm": 0.2046039392004054, + "learning_rate": 9.913448266843723e-07, + "loss": 0.0033, + "step": 8750 + }, + { + "epoch": 3.981346678798908, + "grad_norm": 0.5841869736893668, + "learning_rate": 9.904907217018e-07, + "loss": 0.0086, + "step": 8751 + }, + { + "epoch": 3.9818016378525933, + "grad_norm": 0.11895797035409238, + "learning_rate": 9.89636944358221e-07, + "loss": 0.0017, + "step": 8752 + }, + { + "epoch": 3.9822565969062786, + "grad_norm": 0.2358466084469214, + "learning_rate": 9.887834947233998e-07, + "loss": 0.0029, + "step": 8753 + }, + { + "epoch": 3.9827115559599635, + "grad_norm": 0.10090868860955018, + "learning_rate": 9.879303728670769e-07, + "loss": 0.0007, + "step": 8754 + }, + { + "epoch": 3.983166515013649, + "grad_norm": 0.21601646307537486, + "learning_rate": 9.87077578858965e-07, + "loss": 0.0026, + "step": 8755 + }, + { + "epoch": 3.983621474067334, + "grad_norm": 0.4228229652085819, + "learning_rate": 9.862251127687517e-07, + "loss": 0.0072, + "step": 8756 + }, + { + "epoch": 3.984076433121019, + "grad_norm": 0.2028143509325717, + "learning_rate": 9.853729746660967e-07, + "loss": 0.0019, + "step": 8757 + }, + { + "epoch": 3.9845313921747043, + "grad_norm": 0.033332071780349914, + "learning_rate": 9.845211646206303e-07, + "loss": 0.0004, + "step": 8758 + }, + { + "epoch": 3.9849863512283896, + "grad_norm": 0.24264904644566826, + "learning_rate": 9.836696827019626e-07, + "loss": 0.0082, + "step": 8759 + }, + { + "epoch": 3.9854413102820745, + "grad_norm": 0.26325127541901877, + "learning_rate": 9.828185289796694e-07, + "loss": 0.0039, + "step": 8760 + }, + { + "epoch": 3.98589626933576, + "grad_norm": 0.3430827811238145, + "learning_rate": 9.819677035233056e-07, + "loss": 0.0048, + "step": 8761 + }, + { + "epoch": 3.986351228389445, + "grad_norm": 0.3358286935788147, + "learning_rate": 9.81117206402396e-07, + "loss": 0.0034, + "step": 8762 + }, + { + "epoch": 3.98680618744313, + "grad_norm": 0.12137190403464901, + "learning_rate": 9.802670376864388e-07, + "loss": 0.0011, + "step": 8763 + }, + { + "epoch": 3.9872611464968153, + "grad_norm": 0.14336226362938656, + "learning_rate": 9.794171974449067e-07, + "loss": 0.0009, + "step": 8764 + }, + { + "epoch": 3.9877161055505006, + "grad_norm": 0.11599236315923966, + "learning_rate": 9.785676857472421e-07, + "loss": 0.0009, + "step": 8765 + }, + { + "epoch": 3.9881710646041855, + "grad_norm": 0.3456247368378552, + "learning_rate": 9.777185026628676e-07, + "loss": 0.0017, + "step": 8766 + }, + { + "epoch": 3.988626023657871, + "grad_norm": 0.029247647986222928, + "learning_rate": 9.768696482611728e-07, + "loss": 0.0002, + "step": 8767 + }, + { + "epoch": 3.989080982711556, + "grad_norm": 0.10692913485883968, + "learning_rate": 9.760211226115224e-07, + "loss": 0.0015, + "step": 8768 + }, + { + "epoch": 3.989535941765241, + "grad_norm": 0.23407586241030387, + "learning_rate": 9.751729257832532e-07, + "loss": 0.0021, + "step": 8769 + }, + { + "epoch": 3.9899909008189263, + "grad_norm": 0.12988169044897596, + "learning_rate": 9.743250578456752e-07, + "loss": 0.0013, + "step": 8770 + }, + { + "epoch": 3.9904458598726116, + "grad_norm": 0.4082665644860765, + "learning_rate": 9.734775188680756e-07, + "loss": 0.0059, + "step": 8771 + }, + { + "epoch": 3.9909008189262964, + "grad_norm": 0.16520521011067488, + "learning_rate": 9.726303089197082e-07, + "loss": 0.0017, + "step": 8772 + }, + { + "epoch": 3.9913557779799818, + "grad_norm": 0.19565360378040844, + "learning_rate": 9.717834280698052e-07, + "loss": 0.0017, + "step": 8773 + }, + { + "epoch": 3.991810737033667, + "grad_norm": 0.226198635897474, + "learning_rate": 9.709368763875693e-07, + "loss": 0.003, + "step": 8774 + }, + { + "epoch": 3.992265696087352, + "grad_norm": 0.17248274805901515, + "learning_rate": 9.700906539421756e-07, + "loss": 0.0018, + "step": 8775 + }, + { + "epoch": 3.9927206551410372, + "grad_norm": 0.3293312271683953, + "learning_rate": 9.692447608027767e-07, + "loss": 0.0074, + "step": 8776 + }, + { + "epoch": 3.9931756141947226, + "grad_norm": 0.30023329038908825, + "learning_rate": 9.683991970384926e-07, + "loss": 0.0073, + "step": 8777 + }, + { + "epoch": 3.9936305732484074, + "grad_norm": 0.1969723331175485, + "learning_rate": 9.675539627184194e-07, + "loss": 0.0032, + "step": 8778 + }, + { + "epoch": 3.9940855323020927, + "grad_norm": 0.04680802015481922, + "learning_rate": 9.66709057911625e-07, + "loss": 0.0004, + "step": 8779 + }, + { + "epoch": 3.994540491355778, + "grad_norm": 0.29455246796677664, + "learning_rate": 9.658644826871521e-07, + "loss": 0.0079, + "step": 8780 + }, + { + "epoch": 3.994995450409463, + "grad_norm": 0.1850009513454034, + "learning_rate": 9.65020237114017e-07, + "loss": 0.0016, + "step": 8781 + }, + { + "epoch": 3.9954504094631482, + "grad_norm": 0.16219225526352227, + "learning_rate": 9.641763212612065e-07, + "loss": 0.0028, + "step": 8782 + }, + { + "epoch": 3.9959053685168335, + "grad_norm": 0.02569460864892824, + "learning_rate": 9.633327351976812e-07, + "loss": 0.0002, + "step": 8783 + }, + { + "epoch": 3.9963603275705184, + "grad_norm": 0.2568372854164518, + "learning_rate": 9.62489478992374e-07, + "loss": 0.0023, + "step": 8784 + }, + { + "epoch": 3.9968152866242037, + "grad_norm": 0.14499148233375353, + "learning_rate": 9.616465527141944e-07, + "loss": 0.0012, + "step": 8785 + }, + { + "epoch": 3.997270245677889, + "grad_norm": 0.309844440479213, + "learning_rate": 9.60803956432021e-07, + "loss": 0.0029, + "step": 8786 + }, + { + "epoch": 3.997725204731574, + "grad_norm": 0.3518639185091935, + "learning_rate": 9.599616902147079e-07, + "loss": 0.0026, + "step": 8787 + }, + { + "epoch": 3.998180163785259, + "grad_norm": 0.2909013371052998, + "learning_rate": 9.591197541310815e-07, + "loss": 0.0067, + "step": 8788 + }, + { + "epoch": 3.9986351228389445, + "grad_norm": 0.11106972171052319, + "learning_rate": 9.582781482499382e-07, + "loss": 0.0007, + "step": 8789 + }, + { + "epoch": 3.99909008189263, + "grad_norm": 0.12271487168672769, + "learning_rate": 9.574368726400546e-07, + "loss": 0.0022, + "step": 8790 + }, + { + "epoch": 3.9995450409463147, + "grad_norm": 0.1912244248318796, + "learning_rate": 9.565959273701731e-07, + "loss": 0.0026, + "step": 8791 + }, + { + "epoch": 4.0, + "grad_norm": 0.2508029985197783, + "learning_rate": 9.557553125090125e-07, + "loss": 0.003, + "step": 8792 + }, + { + "epoch": 4.000454959053685, + "grad_norm": 0.16629222058490714, + "learning_rate": 9.549150281252633e-07, + "loss": 0.001, + "step": 8793 + }, + { + "epoch": 4.000909918107371, + "grad_norm": 0.05719935228050158, + "learning_rate": 9.540750742875905e-07, + "loss": 0.0004, + "step": 8794 + }, + { + "epoch": 4.0013648771610555, + "grad_norm": 0.06236427885770259, + "learning_rate": 9.532354510646324e-07, + "loss": 0.0003, + "step": 8795 + }, + { + "epoch": 4.00181983621474, + "grad_norm": 0.12152999402084956, + "learning_rate": 9.52396158524998e-07, + "loss": 0.0022, + "step": 8796 + }, + { + "epoch": 4.002274795268426, + "grad_norm": 0.05899456985133848, + "learning_rate": 9.515571967372711e-07, + "loss": 0.0007, + "step": 8797 + }, + { + "epoch": 4.002729754322111, + "grad_norm": 0.08540582284877513, + "learning_rate": 9.507185657700063e-07, + "loss": 0.0008, + "step": 8798 + }, + { + "epoch": 4.003184713375796, + "grad_norm": 0.03038397634794022, + "learning_rate": 9.49880265691735e-07, + "loss": 0.0002, + "step": 8799 + }, + { + "epoch": 4.003639672429482, + "grad_norm": 0.09453161125851471, + "learning_rate": 9.490422965709567e-07, + "loss": 0.0012, + "step": 8800 + }, + { + "epoch": 4.0040946314831665, + "grad_norm": 0.023004927038856763, + "learning_rate": 9.482046584761496e-07, + "loss": 0.0002, + "step": 8801 + }, + { + "epoch": 4.004549590536851, + "grad_norm": 0.05805642327320453, + "learning_rate": 9.473673514757597e-07, + "loss": 0.001, + "step": 8802 + }, + { + "epoch": 4.005004549590537, + "grad_norm": 0.10417321598418562, + "learning_rate": 9.465303756382089e-07, + "loss": 0.0009, + "step": 8803 + }, + { + "epoch": 4.005459508644222, + "grad_norm": 0.060553228852067764, + "learning_rate": 9.456937310318887e-07, + "loss": 0.0008, + "step": 8804 + }, + { + "epoch": 4.005914467697907, + "grad_norm": 0.11718271171656236, + "learning_rate": 9.44857417725169e-07, + "loss": 0.0019, + "step": 8805 + }, + { + "epoch": 4.006369426751593, + "grad_norm": 0.1525982819267108, + "learning_rate": 9.440214357863886e-07, + "loss": 0.0011, + "step": 8806 + }, + { + "epoch": 4.0068243858052774, + "grad_norm": 0.15186714338180338, + "learning_rate": 9.431857852838583e-07, + "loss": 0.0017, + "step": 8807 + }, + { + "epoch": 4.007279344858962, + "grad_norm": 0.056470550793833195, + "learning_rate": 9.423504662858668e-07, + "loss": 0.0008, + "step": 8808 + }, + { + "epoch": 4.007734303912648, + "grad_norm": 0.03213406387937676, + "learning_rate": 9.415154788606695e-07, + "loss": 0.0004, + "step": 8809 + }, + { + "epoch": 4.008189262966333, + "grad_norm": 0.10789827861267955, + "learning_rate": 9.406808230765003e-07, + "loss": 0.0017, + "step": 8810 + }, + { + "epoch": 4.008644222020018, + "grad_norm": 0.03326499372539869, + "learning_rate": 9.398464990015632e-07, + "loss": 0.0003, + "step": 8811 + }, + { + "epoch": 4.0090991810737036, + "grad_norm": 0.13732606415092802, + "learning_rate": 9.39012506704034e-07, + "loss": 0.0009, + "step": 8812 + }, + { + "epoch": 4.009554140127388, + "grad_norm": 0.06052085080997063, + "learning_rate": 9.381788462520625e-07, + "loss": 0.0007, + "step": 8813 + }, + { + "epoch": 4.010009099181073, + "grad_norm": 0.03687619251212416, + "learning_rate": 9.37345517713773e-07, + "loss": 0.0006, + "step": 8814 + }, + { + "epoch": 4.010464058234759, + "grad_norm": 0.08696989433821824, + "learning_rate": 9.365125211572618e-07, + "loss": 0.0006, + "step": 8815 + }, + { + "epoch": 4.010919017288444, + "grad_norm": 0.10067405821078737, + "learning_rate": 9.35679856650597e-07, + "loss": 0.001, + "step": 8816 + }, + { + "epoch": 4.011373976342129, + "grad_norm": 0.29326634481674474, + "learning_rate": 9.3484752426182e-07, + "loss": 0.0031, + "step": 8817 + }, + { + "epoch": 4.0118289353958145, + "grad_norm": 0.02722245843376445, + "learning_rate": 9.340155240589438e-07, + "loss": 0.0004, + "step": 8818 + }, + { + "epoch": 4.012283894449499, + "grad_norm": 0.08483441130339463, + "learning_rate": 9.331838561099588e-07, + "loss": 0.0008, + "step": 8819 + }, + { + "epoch": 4.012738853503185, + "grad_norm": 0.2814341617235241, + "learning_rate": 9.323525204828232e-07, + "loss": 0.0018, + "step": 8820 + }, + { + "epoch": 4.01319381255687, + "grad_norm": 0.03885371775112174, + "learning_rate": 9.315215172454689e-07, + "loss": 0.0005, + "step": 8821 + }, + { + "epoch": 4.013648771610555, + "grad_norm": 0.0801769474773646, + "learning_rate": 9.306908464658049e-07, + "loss": 0.0014, + "step": 8822 + }, + { + "epoch": 4.014103730664241, + "grad_norm": 0.13932865631304017, + "learning_rate": 9.298605082117062e-07, + "loss": 0.0024, + "step": 8823 + }, + { + "epoch": 4.0145586897179255, + "grad_norm": 0.1737163295052175, + "learning_rate": 9.290305025510282e-07, + "loss": 0.0023, + "step": 8824 + }, + { + "epoch": 4.01501364877161, + "grad_norm": 0.059422585767260366, + "learning_rate": 9.282008295515926e-07, + "loss": 0.0006, + "step": 8825 + }, + { + "epoch": 4.015468607825296, + "grad_norm": 0.05645889787498745, + "learning_rate": 9.273714892811975e-07, + "loss": 0.0007, + "step": 8826 + }, + { + "epoch": 4.015923566878981, + "grad_norm": 0.1218436780245377, + "learning_rate": 9.265424818076107e-07, + "loss": 0.0027, + "step": 8827 + }, + { + "epoch": 4.016378525932666, + "grad_norm": 0.06894785952639247, + "learning_rate": 9.257138071985771e-07, + "loss": 0.0005, + "step": 8828 + }, + { + "epoch": 4.016833484986352, + "grad_norm": 0.02303475730372948, + "learning_rate": 9.248854655218131e-07, + "loss": 0.0002, + "step": 8829 + }, + { + "epoch": 4.0172884440400365, + "grad_norm": 0.01997278340019789, + "learning_rate": 9.240574568450056e-07, + "loss": 0.0002, + "step": 8830 + }, + { + "epoch": 4.017743403093721, + "grad_norm": 0.04785533783856743, + "learning_rate": 9.232297812358166e-07, + "loss": 0.0006, + "step": 8831 + }, + { + "epoch": 4.018198362147407, + "grad_norm": 0.043167284016806806, + "learning_rate": 9.224024387618774e-07, + "loss": 0.0004, + "step": 8832 + }, + { + "epoch": 4.018653321201092, + "grad_norm": 0.0715606613607931, + "learning_rate": 9.21575429490798e-07, + "loss": 0.0003, + "step": 8833 + }, + { + "epoch": 4.019108280254777, + "grad_norm": 0.05578037900399803, + "learning_rate": 9.207487534901565e-07, + "loss": 0.0006, + "step": 8834 + }, + { + "epoch": 4.019563239308463, + "grad_norm": 0.055469182131497125, + "learning_rate": 9.19922410827504e-07, + "loss": 0.0003, + "step": 8835 + }, + { + "epoch": 4.0200181983621475, + "grad_norm": 0.18119416154441015, + "learning_rate": 9.190964015703679e-07, + "loss": 0.003, + "step": 8836 + }, + { + "epoch": 4.020473157415832, + "grad_norm": 0.03358839125611996, + "learning_rate": 9.182707257862444e-07, + "loss": 0.0003, + "step": 8837 + }, + { + "epoch": 4.020928116469518, + "grad_norm": 0.09534904427799977, + "learning_rate": 9.174453835426034e-07, + "loss": 0.0023, + "step": 8838 + }, + { + "epoch": 4.021383075523203, + "grad_norm": 0.3361905090875473, + "learning_rate": 9.166203749068897e-07, + "loss": 0.0014, + "step": 8839 + }, + { + "epoch": 4.021838034576888, + "grad_norm": 0.049679718973019385, + "learning_rate": 9.157956999465189e-07, + "loss": 0.0005, + "step": 8840 + }, + { + "epoch": 4.022292993630574, + "grad_norm": 0.02174356134841266, + "learning_rate": 9.149713587288795e-07, + "loss": 0.0002, + "step": 8841 + }, + { + "epoch": 4.022747952684258, + "grad_norm": 0.1900377877472236, + "learning_rate": 9.141473513213317e-07, + "loss": 0.002, + "step": 8842 + }, + { + "epoch": 4.023202911737943, + "grad_norm": 0.07960843514729345, + "learning_rate": 9.133236777912107e-07, + "loss": 0.0006, + "step": 8843 + }, + { + "epoch": 4.023657870791629, + "grad_norm": 0.02187814862049447, + "learning_rate": 9.125003382058245e-07, + "loss": 0.0003, + "step": 8844 + }, + { + "epoch": 4.024112829845314, + "grad_norm": 0.027882909629560287, + "learning_rate": 9.116773326324518e-07, + "loss": 0.0003, + "step": 8845 + }, + { + "epoch": 4.024567788898999, + "grad_norm": 0.07110757169703331, + "learning_rate": 9.10854661138345e-07, + "loss": 0.0006, + "step": 8846 + }, + { + "epoch": 4.0250227479526846, + "grad_norm": 0.022934152761742876, + "learning_rate": 9.100323237907272e-07, + "loss": 0.0002, + "step": 8847 + }, + { + "epoch": 4.025477707006369, + "grad_norm": 0.0357790498890144, + "learning_rate": 9.092103206567993e-07, + "loss": 0.0005, + "step": 8848 + }, + { + "epoch": 4.025932666060054, + "grad_norm": 0.03831240959878874, + "learning_rate": 9.083886518037288e-07, + "loss": 0.0004, + "step": 8849 + }, + { + "epoch": 4.02638762511374, + "grad_norm": 0.03401280425837219, + "learning_rate": 9.075673172986615e-07, + "loss": 0.0004, + "step": 8850 + }, + { + "epoch": 4.026842584167425, + "grad_norm": 0.1535679340541849, + "learning_rate": 9.067463172087115e-07, + "loss": 0.0024, + "step": 8851 + }, + { + "epoch": 4.02729754322111, + "grad_norm": 0.016958990476787006, + "learning_rate": 9.059256516009662e-07, + "loss": 0.0002, + "step": 8852 + }, + { + "epoch": 4.0277525022747955, + "grad_norm": 0.06832549703698518, + "learning_rate": 9.051053205424898e-07, + "loss": 0.0009, + "step": 8853 + }, + { + "epoch": 4.02820746132848, + "grad_norm": 0.034531653057010385, + "learning_rate": 9.042853241003136e-07, + "loss": 0.0003, + "step": 8854 + }, + { + "epoch": 4.028662420382165, + "grad_norm": 0.11969217982314607, + "learning_rate": 9.034656623414451e-07, + "loss": 0.0012, + "step": 8855 + }, + { + "epoch": 4.029117379435851, + "grad_norm": 0.05900885201897496, + "learning_rate": 9.026463353328613e-07, + "loss": 0.0008, + "step": 8856 + }, + { + "epoch": 4.029572338489536, + "grad_norm": 0.012944271170558883, + "learning_rate": 9.018273431415159e-07, + "loss": 0.0001, + "step": 8857 + }, + { + "epoch": 4.030027297543221, + "grad_norm": 0.03252348541151254, + "learning_rate": 9.010086858343337e-07, + "loss": 0.0003, + "step": 8858 + }, + { + "epoch": 4.0304822565969065, + "grad_norm": 0.010421336056305555, + "learning_rate": 9.00190363478211e-07, + "loss": 0.0001, + "step": 8859 + }, + { + "epoch": 4.030937215650591, + "grad_norm": 0.0385043168603255, + "learning_rate": 8.99372376140017e-07, + "loss": 0.0004, + "step": 8860 + }, + { + "epoch": 4.031392174704276, + "grad_norm": 0.021166918386738422, + "learning_rate": 8.985547238865932e-07, + "loss": 0.0001, + "step": 8861 + }, + { + "epoch": 4.031847133757962, + "grad_norm": 0.06351162398142775, + "learning_rate": 8.977374067847566e-07, + "loss": 0.0007, + "step": 8862 + }, + { + "epoch": 4.032302092811647, + "grad_norm": 0.014223236880788444, + "learning_rate": 8.96920424901292e-07, + "loss": 0.0001, + "step": 8863 + }, + { + "epoch": 4.032757051865332, + "grad_norm": 0.2142073923819191, + "learning_rate": 8.961037783029619e-07, + "loss": 0.0018, + "step": 8864 + }, + { + "epoch": 4.0332120109190175, + "grad_norm": 0.03503072632918302, + "learning_rate": 8.952874670564987e-07, + "loss": 0.0005, + "step": 8865 + }, + { + "epoch": 4.033666969972702, + "grad_norm": 0.056155964747450354, + "learning_rate": 8.944714912286051e-07, + "loss": 0.0003, + "step": 8866 + }, + { + "epoch": 4.034121929026387, + "grad_norm": 0.1566431753653592, + "learning_rate": 8.936558508859627e-07, + "loss": 0.0011, + "step": 8867 + }, + { + "epoch": 4.034576888080073, + "grad_norm": 0.0293734711298181, + "learning_rate": 8.9284054609522e-07, + "loss": 0.0002, + "step": 8868 + }, + { + "epoch": 4.035031847133758, + "grad_norm": 0.12108081780178453, + "learning_rate": 8.920255769229996e-07, + "loss": 0.0015, + "step": 8869 + }, + { + "epoch": 4.035486806187443, + "grad_norm": 0.038777185050657005, + "learning_rate": 8.912109434358967e-07, + "loss": 0.0002, + "step": 8870 + }, + { + "epoch": 4.0359417652411285, + "grad_norm": 0.06502099659443437, + "learning_rate": 8.903966457004804e-07, + "loss": 0.0004, + "step": 8871 + }, + { + "epoch": 4.036396724294813, + "grad_norm": 0.10969463517955318, + "learning_rate": 8.895826837832928e-07, + "loss": 0.0004, + "step": 8872 + }, + { + "epoch": 4.036851683348498, + "grad_norm": 0.1924762552943705, + "learning_rate": 8.887690577508451e-07, + "loss": 0.001, + "step": 8873 + }, + { + "epoch": 4.037306642402184, + "grad_norm": 0.022684012443389672, + "learning_rate": 8.879557676696243e-07, + "loss": 0.0002, + "step": 8874 + }, + { + "epoch": 4.037761601455869, + "grad_norm": 0.017218707485729432, + "learning_rate": 8.871428136060883e-07, + "loss": 0.0001, + "step": 8875 + }, + { + "epoch": 4.038216560509555, + "grad_norm": 0.08411827591407593, + "learning_rate": 8.863301956266673e-07, + "loss": 0.0007, + "step": 8876 + }, + { + "epoch": 4.038671519563239, + "grad_norm": 0.0490676564588678, + "learning_rate": 8.855179137977649e-07, + "loss": 0.0005, + "step": 8877 + }, + { + "epoch": 4.039126478616924, + "grad_norm": 0.025097860062216235, + "learning_rate": 8.847059681857595e-07, + "loss": 0.0003, + "step": 8878 + }, + { + "epoch": 4.03958143767061, + "grad_norm": 0.03299646778254466, + "learning_rate": 8.838943588569976e-07, + "loss": 0.0003, + "step": 8879 + }, + { + "epoch": 4.040036396724295, + "grad_norm": 0.044524123147625376, + "learning_rate": 8.830830858778001e-07, + "loss": 0.0003, + "step": 8880 + }, + { + "epoch": 4.04049135577798, + "grad_norm": 0.02142811088835668, + "learning_rate": 8.822721493144603e-07, + "loss": 0.0002, + "step": 8881 + }, + { + "epoch": 4.0409463148316656, + "grad_norm": 0.0543461562792307, + "learning_rate": 8.814615492332462e-07, + "loss": 0.0004, + "step": 8882 + }, + { + "epoch": 4.04140127388535, + "grad_norm": 0.24011374656624696, + "learning_rate": 8.806512857003951e-07, + "loss": 0.0007, + "step": 8883 + }, + { + "epoch": 4.041856232939035, + "grad_norm": 0.0538549023953534, + "learning_rate": 8.798413587821164e-07, + "loss": 0.0003, + "step": 8884 + }, + { + "epoch": 4.042311191992721, + "grad_norm": 0.14560340598590216, + "learning_rate": 8.790317685445971e-07, + "loss": 0.0027, + "step": 8885 + }, + { + "epoch": 4.042766151046406, + "grad_norm": 0.15049845263483105, + "learning_rate": 8.782225150539903e-07, + "loss": 0.0026, + "step": 8886 + }, + { + "epoch": 4.043221110100091, + "grad_norm": 0.045227957975040284, + "learning_rate": 8.77413598376427e-07, + "loss": 0.0003, + "step": 8887 + }, + { + "epoch": 4.0436760691537765, + "grad_norm": 0.02505410553777718, + "learning_rate": 8.766050185780067e-07, + "loss": 0.0002, + "step": 8888 + }, + { + "epoch": 4.044131028207461, + "grad_norm": 0.032392681875533055, + "learning_rate": 8.757967757248037e-07, + "loss": 0.0002, + "step": 8889 + }, + { + "epoch": 4.044585987261146, + "grad_norm": 0.04228755254587372, + "learning_rate": 8.749888698828618e-07, + "loss": 0.0008, + "step": 8890 + }, + { + "epoch": 4.045040946314832, + "grad_norm": 0.06947840080519249, + "learning_rate": 8.741813011182015e-07, + "loss": 0.0007, + "step": 8891 + }, + { + "epoch": 4.045495905368517, + "grad_norm": 0.027067513906722015, + "learning_rate": 8.73374069496814e-07, + "loss": 0.0003, + "step": 8892 + }, + { + "epoch": 4.045950864422202, + "grad_norm": 0.05425715690999819, + "learning_rate": 8.725671750846621e-07, + "loss": 0.0004, + "step": 8893 + }, + { + "epoch": 4.0464058234758875, + "grad_norm": 0.12118559688524527, + "learning_rate": 8.717606179476811e-07, + "loss": 0.001, + "step": 8894 + }, + { + "epoch": 4.046860782529572, + "grad_norm": 0.016646865707413114, + "learning_rate": 8.709543981517787e-07, + "loss": 0.0001, + "step": 8895 + }, + { + "epoch": 4.047315741583257, + "grad_norm": 0.08585894100632072, + "learning_rate": 8.70148515762837e-07, + "loss": 0.0009, + "step": 8896 + }, + { + "epoch": 4.047770700636943, + "grad_norm": 0.09019787343404101, + "learning_rate": 8.693429708467089e-07, + "loss": 0.0013, + "step": 8897 + }, + { + "epoch": 4.048225659690628, + "grad_norm": 0.1757845738174576, + "learning_rate": 8.685377634692177e-07, + "loss": 0.002, + "step": 8898 + }, + { + "epoch": 4.048680618744313, + "grad_norm": 0.2648062681391176, + "learning_rate": 8.677328936961643e-07, + "loss": 0.0018, + "step": 8899 + }, + { + "epoch": 4.0491355777979985, + "grad_norm": 0.04054065784768181, + "learning_rate": 8.669283615933161e-07, + "loss": 0.0002, + "step": 8900 + }, + { + "epoch": 4.049590536851683, + "grad_norm": 0.028120667044778246, + "learning_rate": 8.661241672264193e-07, + "loss": 0.0001, + "step": 8901 + }, + { + "epoch": 4.050045495905368, + "grad_norm": 0.20649495046568908, + "learning_rate": 8.653203106611868e-07, + "loss": 0.0012, + "step": 8902 + }, + { + "epoch": 4.050500454959054, + "grad_norm": 0.07667435440191665, + "learning_rate": 8.645167919633063e-07, + "loss": 0.0004, + "step": 8903 + }, + { + "epoch": 4.050955414012739, + "grad_norm": 0.18083107846473992, + "learning_rate": 8.637136111984368e-07, + "loss": 0.0044, + "step": 8904 + }, + { + "epoch": 4.051410373066424, + "grad_norm": 0.04045093253658398, + "learning_rate": 8.629107684322113e-07, + "loss": 0.0003, + "step": 8905 + }, + { + "epoch": 4.0518653321201095, + "grad_norm": 0.051477488497152786, + "learning_rate": 8.621082637302369e-07, + "loss": 0.0006, + "step": 8906 + }, + { + "epoch": 4.052320291173794, + "grad_norm": 0.09126540470406105, + "learning_rate": 8.613060971580878e-07, + "loss": 0.0005, + "step": 8907 + }, + { + "epoch": 4.052775250227479, + "grad_norm": 0.029071446374448463, + "learning_rate": 8.605042687813148e-07, + "loss": 0.0003, + "step": 8908 + }, + { + "epoch": 4.053230209281165, + "grad_norm": 0.04933030792822622, + "learning_rate": 8.597027786654388e-07, + "loss": 0.0006, + "step": 8909 + }, + { + "epoch": 4.05368516833485, + "grad_norm": 0.06666459880024085, + "learning_rate": 8.589016268759537e-07, + "loss": 0.0006, + "step": 8910 + }, + { + "epoch": 4.054140127388535, + "grad_norm": 0.022547665925337156, + "learning_rate": 8.581008134783275e-07, + "loss": 0.0002, + "step": 8911 + }, + { + "epoch": 4.05459508644222, + "grad_norm": 0.015496997476280678, + "learning_rate": 8.573003385379969e-07, + "loss": 0.0001, + "step": 8912 + }, + { + "epoch": 4.055050045495905, + "grad_norm": 0.05036702898357478, + "learning_rate": 8.565002021203755e-07, + "loss": 0.0007, + "step": 8913 + }, + { + "epoch": 4.05550500454959, + "grad_norm": 0.02838668507755251, + "learning_rate": 8.557004042908457e-07, + "loss": 0.0002, + "step": 8914 + }, + { + "epoch": 4.055959963603276, + "grad_norm": 0.031068534901867645, + "learning_rate": 8.549009451147622e-07, + "loss": 0.0003, + "step": 8915 + }, + { + "epoch": 4.056414922656961, + "grad_norm": 0.020284579951607155, + "learning_rate": 8.541018246574556e-07, + "loss": 0.0002, + "step": 8916 + }, + { + "epoch": 4.056869881710646, + "grad_norm": 0.07834446973243235, + "learning_rate": 8.533030429842254e-07, + "loss": 0.001, + "step": 8917 + }, + { + "epoch": 4.057324840764331, + "grad_norm": 0.016110756599252412, + "learning_rate": 8.525046001603437e-07, + "loss": 0.0002, + "step": 8918 + }, + { + "epoch": 4.057779799818016, + "grad_norm": 0.02754572370681331, + "learning_rate": 8.517064962510552e-07, + "loss": 0.0003, + "step": 8919 + }, + { + "epoch": 4.058234758871701, + "grad_norm": 0.016533551553972476, + "learning_rate": 8.509087313215786e-07, + "loss": 0.0002, + "step": 8920 + }, + { + "epoch": 4.058689717925387, + "grad_norm": 0.013175663004793819, + "learning_rate": 8.501113054371041e-07, + "loss": 0.0001, + "step": 8921 + }, + { + "epoch": 4.059144676979072, + "grad_norm": 0.06306415838667524, + "learning_rate": 8.493142186627934e-07, + "loss": 0.0008, + "step": 8922 + }, + { + "epoch": 4.059599636032757, + "grad_norm": 0.03610607875264149, + "learning_rate": 8.485174710637801e-07, + "loss": 0.0004, + "step": 8923 + }, + { + "epoch": 4.060054595086442, + "grad_norm": 0.06247731857500946, + "learning_rate": 8.477210627051702e-07, + "loss": 0.0008, + "step": 8924 + }, + { + "epoch": 4.060509554140127, + "grad_norm": 0.09650801477478864, + "learning_rate": 8.469249936520446e-07, + "loss": 0.0015, + "step": 8925 + }, + { + "epoch": 4.060964513193812, + "grad_norm": 0.02882791154111998, + "learning_rate": 8.461292639694519e-07, + "loss": 0.0002, + "step": 8926 + }, + { + "epoch": 4.061419472247498, + "grad_norm": 0.08301781116966915, + "learning_rate": 8.453338737224187e-07, + "loss": 0.0009, + "step": 8927 + }, + { + "epoch": 4.061874431301183, + "grad_norm": 0.1050407721071887, + "learning_rate": 8.445388229759388e-07, + "loss": 0.0015, + "step": 8928 + }, + { + "epoch": 4.0623293903548685, + "grad_norm": 0.10116590468987932, + "learning_rate": 8.437441117949791e-07, + "loss": 0.0006, + "step": 8929 + }, + { + "epoch": 4.062784349408553, + "grad_norm": 0.10710108256887711, + "learning_rate": 8.429497402444825e-07, + "loss": 0.0012, + "step": 8930 + }, + { + "epoch": 4.063239308462238, + "grad_norm": 0.14542571030514667, + "learning_rate": 8.4215570838936e-07, + "loss": 0.0028, + "step": 8931 + }, + { + "epoch": 4.063694267515924, + "grad_norm": 0.04838768933729137, + "learning_rate": 8.413620162944963e-07, + "loss": 0.0004, + "step": 8932 + }, + { + "epoch": 4.064149226569609, + "grad_norm": 0.025306242244517784, + "learning_rate": 8.405686640247473e-07, + "loss": 0.0001, + "step": 8933 + }, + { + "epoch": 4.064604185623294, + "grad_norm": 0.06418806936722982, + "learning_rate": 8.397756516449429e-07, + "loss": 0.0007, + "step": 8934 + }, + { + "epoch": 4.0650591446769795, + "grad_norm": 0.023924411571602452, + "learning_rate": 8.389829792198867e-07, + "loss": 0.0002, + "step": 8935 + }, + { + "epoch": 4.065514103730664, + "grad_norm": 0.09422613042342526, + "learning_rate": 8.381906468143497e-07, + "loss": 0.0014, + "step": 8936 + }, + { + "epoch": 4.065969062784349, + "grad_norm": 0.09464187290411909, + "learning_rate": 8.37398654493079e-07, + "loss": 0.0011, + "step": 8937 + }, + { + "epoch": 4.066424021838035, + "grad_norm": 0.023980285812512162, + "learning_rate": 8.366070023207906e-07, + "loss": 0.0002, + "step": 8938 + }, + { + "epoch": 4.06687898089172, + "grad_norm": 0.04353005793073376, + "learning_rate": 8.358156903621778e-07, + "loss": 0.0002, + "step": 8939 + }, + { + "epoch": 4.067333939945405, + "grad_norm": 0.11476803805506165, + "learning_rate": 8.350247186818999e-07, + "loss": 0.0017, + "step": 8940 + }, + { + "epoch": 4.0677888989990905, + "grad_norm": 0.048915106122024446, + "learning_rate": 8.342340873445948e-07, + "loss": 0.0009, + "step": 8941 + }, + { + "epoch": 4.068243858052775, + "grad_norm": 0.06616594207739772, + "learning_rate": 8.334437964148673e-07, + "loss": 0.0005, + "step": 8942 + }, + { + "epoch": 4.06869881710646, + "grad_norm": 0.043982404447476434, + "learning_rate": 8.326538459572953e-07, + "loss": 0.0003, + "step": 8943 + }, + { + "epoch": 4.069153776160146, + "grad_norm": 0.025750021686192535, + "learning_rate": 8.318642360364332e-07, + "loss": 0.0003, + "step": 8944 + }, + { + "epoch": 4.069608735213831, + "grad_norm": 0.16312377108904488, + "learning_rate": 8.310749667168022e-07, + "loss": 0.0026, + "step": 8945 + }, + { + "epoch": 4.070063694267516, + "grad_norm": 0.07553269579476078, + "learning_rate": 8.302860380628985e-07, + "loss": 0.0011, + "step": 8946 + }, + { + "epoch": 4.070518653321201, + "grad_norm": 0.023597852511622334, + "learning_rate": 8.294974501391884e-07, + "loss": 0.0002, + "step": 8947 + }, + { + "epoch": 4.070973612374886, + "grad_norm": 0.019405493109933992, + "learning_rate": 8.287092030101135e-07, + "loss": 0.0001, + "step": 8948 + }, + { + "epoch": 4.071428571428571, + "grad_norm": 0.029065838050355623, + "learning_rate": 8.279212967400846e-07, + "loss": 0.0002, + "step": 8949 + }, + { + "epoch": 4.071883530482257, + "grad_norm": 0.11550024324451541, + "learning_rate": 8.271337313934869e-07, + "loss": 0.0004, + "step": 8950 + }, + { + "epoch": 4.072338489535942, + "grad_norm": 0.08340750659124711, + "learning_rate": 8.263465070346765e-07, + "loss": 0.0009, + "step": 8951 + }, + { + "epoch": 4.072793448589627, + "grad_norm": 0.009318842729009603, + "learning_rate": 8.255596237279817e-07, + "loss": 0.0001, + "step": 8952 + }, + { + "epoch": 4.073248407643312, + "grad_norm": 0.05436080997286999, + "learning_rate": 8.247730815377014e-07, + "loss": 0.0002, + "step": 8953 + }, + { + "epoch": 4.073703366696997, + "grad_norm": 0.12221853508932347, + "learning_rate": 8.239868805281098e-07, + "loss": 0.0006, + "step": 8954 + }, + { + "epoch": 4.074158325750682, + "grad_norm": 0.23702379713841448, + "learning_rate": 8.232010207634527e-07, + "loss": 0.004, + "step": 8955 + }, + { + "epoch": 4.074613284804368, + "grad_norm": 0.03331357686505322, + "learning_rate": 8.224155023079461e-07, + "loss": 0.0003, + "step": 8956 + }, + { + "epoch": 4.075068243858053, + "grad_norm": 0.18943491853875813, + "learning_rate": 8.216303252257791e-07, + "loss": 0.0024, + "step": 8957 + }, + { + "epoch": 4.075523202911738, + "grad_norm": 0.006566208299789381, + "learning_rate": 8.208454895811108e-07, + "loss": 0.0001, + "step": 8958 + }, + { + "epoch": 4.075978161965423, + "grad_norm": 0.12251875752424424, + "learning_rate": 8.200609954380778e-07, + "loss": 0.0013, + "step": 8959 + }, + { + "epoch": 4.076433121019108, + "grad_norm": 0.10692732317604908, + "learning_rate": 8.192768428607839e-07, + "loss": 0.0016, + "step": 8960 + }, + { + "epoch": 4.076888080072793, + "grad_norm": 0.028289276831684338, + "learning_rate": 8.18493031913305e-07, + "loss": 0.0002, + "step": 8961 + }, + { + "epoch": 4.077343039126479, + "grad_norm": 0.019305883017379562, + "learning_rate": 8.177095626596932e-07, + "loss": 0.0001, + "step": 8962 + }, + { + "epoch": 4.077797998180164, + "grad_norm": 0.13520783450211288, + "learning_rate": 8.169264351639672e-07, + "loss": 0.0007, + "step": 8963 + }, + { + "epoch": 4.078252957233849, + "grad_norm": 0.014133256391491447, + "learning_rate": 8.161436494901242e-07, + "loss": 0.0001, + "step": 8964 + }, + { + "epoch": 4.078707916287534, + "grad_norm": 0.03690999317822864, + "learning_rate": 8.153612057021276e-07, + "loss": 0.0003, + "step": 8965 + }, + { + "epoch": 4.079162875341219, + "grad_norm": 0.026264170966425613, + "learning_rate": 8.145791038639161e-07, + "loss": 0.0003, + "step": 8966 + }, + { + "epoch": 4.079617834394904, + "grad_norm": 0.009964548093883467, + "learning_rate": 8.137973440393976e-07, + "loss": 0.0001, + "step": 8967 + }, + { + "epoch": 4.08007279344859, + "grad_norm": 0.1445759254143165, + "learning_rate": 8.130159262924553e-07, + "loss": 0.0048, + "step": 8968 + }, + { + "epoch": 4.080527752502275, + "grad_norm": 0.1467094810700478, + "learning_rate": 8.122348506869448e-07, + "loss": 0.0013, + "step": 8969 + }, + { + "epoch": 4.08098271155596, + "grad_norm": 0.07713309591746362, + "learning_rate": 8.114541172866902e-07, + "loss": 0.0008, + "step": 8970 + }, + { + "epoch": 4.081437670609645, + "grad_norm": 0.1432465944909815, + "learning_rate": 8.106737261554897e-07, + "loss": 0.0006, + "step": 8971 + }, + { + "epoch": 4.08189262966333, + "grad_norm": 0.015942642563481164, + "learning_rate": 8.098936773571126e-07, + "loss": 0.0001, + "step": 8972 + }, + { + "epoch": 4.082347588717015, + "grad_norm": 0.025928907252859866, + "learning_rate": 8.091139709553031e-07, + "loss": 0.0003, + "step": 8973 + }, + { + "epoch": 4.082802547770701, + "grad_norm": 0.027104632299115566, + "learning_rate": 8.083346070137737e-07, + "loss": 0.0002, + "step": 8974 + }, + { + "epoch": 4.083257506824386, + "grad_norm": 0.13595448163868545, + "learning_rate": 8.075555855962097e-07, + "loss": 0.0029, + "step": 8975 + }, + { + "epoch": 4.083712465878071, + "grad_norm": 0.26466784047835923, + "learning_rate": 8.067769067662718e-07, + "loss": 0.0024, + "step": 8976 + }, + { + "epoch": 4.084167424931756, + "grad_norm": 0.007845274885144131, + "learning_rate": 8.059985705875873e-07, + "loss": 0.0001, + "step": 8977 + }, + { + "epoch": 4.084622383985441, + "grad_norm": 0.01942580719732545, + "learning_rate": 8.052205771237603e-07, + "loss": 0.0002, + "step": 8978 + }, + { + "epoch": 4.085077343039126, + "grad_norm": 0.025475048300551154, + "learning_rate": 8.044429264383652e-07, + "loss": 0.0002, + "step": 8979 + }, + { + "epoch": 4.085532302092812, + "grad_norm": 0.09668303521085533, + "learning_rate": 8.036656185949466e-07, + "loss": 0.0008, + "step": 8980 + }, + { + "epoch": 4.085987261146497, + "grad_norm": 0.11432311808470239, + "learning_rate": 8.028886536570235e-07, + "loss": 0.0017, + "step": 8981 + }, + { + "epoch": 4.0864422202001816, + "grad_norm": 0.026269303380395737, + "learning_rate": 8.021120316880843e-07, + "loss": 0.0001, + "step": 8982 + }, + { + "epoch": 4.086897179253867, + "grad_norm": 0.03710699175861286, + "learning_rate": 8.01335752751592e-07, + "loss": 0.0002, + "step": 8983 + }, + { + "epoch": 4.087352138307552, + "grad_norm": 0.010914013634221788, + "learning_rate": 8.005598169109829e-07, + "loss": 0.0001, + "step": 8984 + }, + { + "epoch": 4.087807097361237, + "grad_norm": 0.047489529010998566, + "learning_rate": 7.997842242296605e-07, + "loss": 0.0004, + "step": 8985 + }, + { + "epoch": 4.088262056414923, + "grad_norm": 0.19870774277849554, + "learning_rate": 7.990089747710033e-07, + "loss": 0.0015, + "step": 8986 + }, + { + "epoch": 4.088717015468608, + "grad_norm": 0.14171092159495802, + "learning_rate": 7.982340685983602e-07, + "loss": 0.0021, + "step": 8987 + }, + { + "epoch": 4.089171974522293, + "grad_norm": 0.04923405617189786, + "learning_rate": 7.97459505775055e-07, + "loss": 0.0004, + "step": 8988 + }, + { + "epoch": 4.089626933575978, + "grad_norm": 0.23756487347204336, + "learning_rate": 7.966852863643798e-07, + "loss": 0.0039, + "step": 8989 + }, + { + "epoch": 4.090081892629663, + "grad_norm": 0.0848559727271451, + "learning_rate": 7.959114104296017e-07, + "loss": 0.0004, + "step": 8990 + }, + { + "epoch": 4.090536851683349, + "grad_norm": 0.059109878803031216, + "learning_rate": 7.951378780339581e-07, + "loss": 0.0002, + "step": 8991 + }, + { + "epoch": 4.090991810737034, + "grad_norm": 0.010690299268787687, + "learning_rate": 7.943646892406564e-07, + "loss": 0.0001, + "step": 8992 + }, + { + "epoch": 4.091446769790719, + "grad_norm": 0.016695187742703434, + "learning_rate": 7.93591844112881e-07, + "loss": 0.0002, + "step": 8993 + }, + { + "epoch": 4.091901728844404, + "grad_norm": 0.08820152133944004, + "learning_rate": 7.928193427137848e-07, + "loss": 0.0005, + "step": 8994 + }, + { + "epoch": 4.092356687898089, + "grad_norm": 0.27176668344128196, + "learning_rate": 7.920471851064915e-07, + "loss": 0.0012, + "step": 8995 + }, + { + "epoch": 4.092811646951774, + "grad_norm": 0.2599016186439286, + "learning_rate": 7.912753713540988e-07, + "loss": 0.0016, + "step": 8996 + }, + { + "epoch": 4.09326660600546, + "grad_norm": 0.24508168228113944, + "learning_rate": 7.905039015196764e-07, + "loss": 0.0029, + "step": 8997 + }, + { + "epoch": 4.093721565059145, + "grad_norm": 0.16552595439233694, + "learning_rate": 7.89732775666266e-07, + "loss": 0.0003, + "step": 8998 + }, + { + "epoch": 4.09417652411283, + "grad_norm": 0.03168636056515445, + "learning_rate": 7.889619938568799e-07, + "loss": 0.0003, + "step": 8999 + }, + { + "epoch": 4.094631483166515, + "grad_norm": 0.03534578011139962, + "learning_rate": 7.881915561545028e-07, + "loss": 0.0002, + "step": 9000 + }, + { + "epoch": 4.0950864422202, + "grad_norm": 0.03524834656645179, + "learning_rate": 7.8742146262209e-07, + "loss": 0.0003, + "step": 9001 + }, + { + "epoch": 4.095541401273885, + "grad_norm": 0.033614103644947185, + "learning_rate": 7.866517133225726e-07, + "loss": 0.0003, + "step": 9002 + }, + { + "epoch": 4.095996360327571, + "grad_norm": 0.11298186897731866, + "learning_rate": 7.858823083188494e-07, + "loss": 0.0012, + "step": 9003 + }, + { + "epoch": 4.096451319381256, + "grad_norm": 0.00536461124229854, + "learning_rate": 7.851132476737938e-07, + "loss": 0.0, + "step": 9004 + }, + { + "epoch": 4.096906278434941, + "grad_norm": 0.10682383732275096, + "learning_rate": 7.843445314502491e-07, + "loss": 0.0006, + "step": 9005 + }, + { + "epoch": 4.097361237488626, + "grad_norm": 0.13699461915240788, + "learning_rate": 7.835761597110308e-07, + "loss": 0.001, + "step": 9006 + }, + { + "epoch": 4.097816196542311, + "grad_norm": 0.25655542712667856, + "learning_rate": 7.828081325189285e-07, + "loss": 0.0011, + "step": 9007 + }, + { + "epoch": 4.098271155595996, + "grad_norm": 0.1295219593968848, + "learning_rate": 7.820404499367012e-07, + "loss": 0.0016, + "step": 9008 + }, + { + "epoch": 4.098726114649682, + "grad_norm": 0.010228893357943289, + "learning_rate": 7.8127311202708e-07, + "loss": 0.0001, + "step": 9009 + }, + { + "epoch": 4.099181073703367, + "grad_norm": 0.06387778339624936, + "learning_rate": 7.805061188527674e-07, + "loss": 0.0005, + "step": 9010 + }, + { + "epoch": 4.099636032757052, + "grad_norm": 0.04014668054526879, + "learning_rate": 7.797394704764394e-07, + "loss": 0.0003, + "step": 9011 + }, + { + "epoch": 4.100090991810737, + "grad_norm": 0.15261539973298907, + "learning_rate": 7.789731669607447e-07, + "loss": 0.0026, + "step": 9012 + }, + { + "epoch": 4.100545950864422, + "grad_norm": 0.013924416974846462, + "learning_rate": 7.782072083683012e-07, + "loss": 0.0001, + "step": 9013 + }, + { + "epoch": 4.101000909918107, + "grad_norm": 0.048890312162571815, + "learning_rate": 7.774415947616987e-07, + "loss": 0.0004, + "step": 9014 + }, + { + "epoch": 4.101455868971793, + "grad_norm": 0.1745776853619407, + "learning_rate": 7.766763262035004e-07, + "loss": 0.0006, + "step": 9015 + }, + { + "epoch": 4.101910828025478, + "grad_norm": 0.026931852751956892, + "learning_rate": 7.759114027562387e-07, + "loss": 0.0002, + "step": 9016 + }, + { + "epoch": 4.1023657870791626, + "grad_norm": 0.04054561665703668, + "learning_rate": 7.751468244824217e-07, + "loss": 0.0003, + "step": 9017 + }, + { + "epoch": 4.102820746132848, + "grad_norm": 0.012079983109997021, + "learning_rate": 7.743825914445285e-07, + "loss": 0.0001, + "step": 9018 + }, + { + "epoch": 4.103275705186533, + "grad_norm": 0.009909822756248432, + "learning_rate": 7.736187037050069e-07, + "loss": 0.0001, + "step": 9019 + }, + { + "epoch": 4.103730664240218, + "grad_norm": 0.255041745201386, + "learning_rate": 7.728551613262786e-07, + "loss": 0.0023, + "step": 9020 + }, + { + "epoch": 4.104185623293904, + "grad_norm": 0.07233558674493862, + "learning_rate": 7.720919643707359e-07, + "loss": 0.0009, + "step": 9021 + }, + { + "epoch": 4.104640582347589, + "grad_norm": 0.028764691554044784, + "learning_rate": 7.713291129007455e-07, + "loss": 0.0001, + "step": 9022 + }, + { + "epoch": 4.1050955414012735, + "grad_norm": 0.04014556800112941, + "learning_rate": 7.705666069786438e-07, + "loss": 0.0003, + "step": 9023 + }, + { + "epoch": 4.105550500454959, + "grad_norm": 0.10363523284450694, + "learning_rate": 7.698044466667381e-07, + "loss": 0.0011, + "step": 9024 + }, + { + "epoch": 4.106005459508644, + "grad_norm": 0.03143944711491693, + "learning_rate": 7.690426320273104e-07, + "loss": 0.0003, + "step": 9025 + }, + { + "epoch": 4.106460418562329, + "grad_norm": 0.026968636564706217, + "learning_rate": 7.682811631226112e-07, + "loss": 0.0003, + "step": 9026 + }, + { + "epoch": 4.106915377616015, + "grad_norm": 0.13690237862947646, + "learning_rate": 7.675200400148658e-07, + "loss": 0.003, + "step": 9027 + }, + { + "epoch": 4.1073703366697, + "grad_norm": 0.24160531306052005, + "learning_rate": 7.66759262766269e-07, + "loss": 0.0032, + "step": 9028 + }, + { + "epoch": 4.1078252957233845, + "grad_norm": 0.046244627139069, + "learning_rate": 7.659988314389887e-07, + "loss": 0.0005, + "step": 9029 + }, + { + "epoch": 4.10828025477707, + "grad_norm": 0.06866401152338791, + "learning_rate": 7.652387460951616e-07, + "loss": 0.0008, + "step": 9030 + }, + { + "epoch": 4.108735213830755, + "grad_norm": 0.03827093977408107, + "learning_rate": 7.644790067969005e-07, + "loss": 0.0003, + "step": 9031 + }, + { + "epoch": 4.10919017288444, + "grad_norm": 0.034568775397146846, + "learning_rate": 7.637196136062886e-07, + "loss": 0.0003, + "step": 9032 + }, + { + "epoch": 4.109645131938126, + "grad_norm": 0.7420127393791961, + "learning_rate": 7.62960566585379e-07, + "loss": 0.0003, + "step": 9033 + }, + { + "epoch": 4.110100090991811, + "grad_norm": 0.07435224717976553, + "learning_rate": 7.622018657961972e-07, + "loss": 0.0007, + "step": 9034 + }, + { + "epoch": 4.1105550500454955, + "grad_norm": 0.14420560210490088, + "learning_rate": 7.614435113007406e-07, + "loss": 0.0006, + "step": 9035 + }, + { + "epoch": 4.111010009099181, + "grad_norm": 0.07155717824907191, + "learning_rate": 7.6068550316098e-07, + "loss": 0.0013, + "step": 9036 + }, + { + "epoch": 4.111464968152866, + "grad_norm": 0.17961858540151596, + "learning_rate": 7.599278414388544e-07, + "loss": 0.0024, + "step": 9037 + }, + { + "epoch": 4.111919927206552, + "grad_norm": 0.02222899526199989, + "learning_rate": 7.591705261962784e-07, + "loss": 0.0002, + "step": 9038 + }, + { + "epoch": 4.112374886260237, + "grad_norm": 0.09069728412670236, + "learning_rate": 7.584135574951362e-07, + "loss": 0.0009, + "step": 9039 + }, + { + "epoch": 4.112829845313922, + "grad_norm": 0.036027378337994576, + "learning_rate": 7.576569353972818e-07, + "loss": 0.0003, + "step": 9040 + }, + { + "epoch": 4.113284804367607, + "grad_norm": 0.0427730374010759, + "learning_rate": 7.569006599645456e-07, + "loss": 0.0005, + "step": 9041 + }, + { + "epoch": 4.113739763421292, + "grad_norm": 0.04034244624040654, + "learning_rate": 7.561447312587256e-07, + "loss": 0.0004, + "step": 9042 + }, + { + "epoch": 4.114194722474977, + "grad_norm": 0.1526865906571951, + "learning_rate": 7.553891493415933e-07, + "loss": 0.0011, + "step": 9043 + }, + { + "epoch": 4.114649681528663, + "grad_norm": 0.1703628161788179, + "learning_rate": 7.546339142748899e-07, + "loss": 0.005, + "step": 9044 + }, + { + "epoch": 4.115104640582348, + "grad_norm": 0.009205656991178103, + "learning_rate": 7.53879026120331e-07, + "loss": 0.0001, + "step": 9045 + }, + { + "epoch": 4.115559599636033, + "grad_norm": 0.04278017196341073, + "learning_rate": 7.531244849396041e-07, + "loss": 0.0003, + "step": 9046 + }, + { + "epoch": 4.116014558689718, + "grad_norm": 0.012008839485111932, + "learning_rate": 7.523702907943658e-07, + "loss": 0.0001, + "step": 9047 + }, + { + "epoch": 4.116469517743403, + "grad_norm": 0.0787735252110677, + "learning_rate": 7.516164437462453e-07, + "loss": 0.0004, + "step": 9048 + }, + { + "epoch": 4.116924476797088, + "grad_norm": 0.014533373140239067, + "learning_rate": 7.508629438568415e-07, + "loss": 0.0001, + "step": 9049 + }, + { + "epoch": 4.117379435850774, + "grad_norm": 0.06570845393221407, + "learning_rate": 7.501097911877308e-07, + "loss": 0.0006, + "step": 9050 + }, + { + "epoch": 4.117834394904459, + "grad_norm": 0.10137236537515193, + "learning_rate": 7.493569858004546e-07, + "loss": 0.0013, + "step": 9051 + }, + { + "epoch": 4.1182893539581436, + "grad_norm": 0.07812229924381484, + "learning_rate": 7.486045277565307e-07, + "loss": 0.0006, + "step": 9052 + }, + { + "epoch": 4.118744313011829, + "grad_norm": 0.10653101170381142, + "learning_rate": 7.478524171174456e-07, + "loss": 0.0007, + "step": 9053 + }, + { + "epoch": 4.119199272065514, + "grad_norm": 0.14023018899099193, + "learning_rate": 7.471006539446585e-07, + "loss": 0.0016, + "step": 9054 + }, + { + "epoch": 4.119654231119199, + "grad_norm": 0.09752839565509065, + "learning_rate": 7.46349238299599e-07, + "loss": 0.0013, + "step": 9055 + }, + { + "epoch": 4.120109190172885, + "grad_norm": 0.07994606258243822, + "learning_rate": 7.455981702436715e-07, + "loss": 0.0003, + "step": 9056 + }, + { + "epoch": 4.12056414922657, + "grad_norm": 0.06766490200755171, + "learning_rate": 7.448474498382491e-07, + "loss": 0.0005, + "step": 9057 + }, + { + "epoch": 4.1210191082802545, + "grad_norm": 0.07879223370637874, + "learning_rate": 7.440970771446754e-07, + "loss": 0.0011, + "step": 9058 + }, + { + "epoch": 4.12147406733394, + "grad_norm": 0.05380977953691529, + "learning_rate": 7.433470522242702e-07, + "loss": 0.0012, + "step": 9059 + }, + { + "epoch": 4.121929026387625, + "grad_norm": 0.06098484058756868, + "learning_rate": 7.425973751383203e-07, + "loss": 0.0004, + "step": 9060 + }, + { + "epoch": 4.12238398544131, + "grad_norm": 0.13783283295433033, + "learning_rate": 7.41848045948087e-07, + "loss": 0.0017, + "step": 9061 + }, + { + "epoch": 4.122838944494996, + "grad_norm": 0.0743303095245618, + "learning_rate": 7.410990647148025e-07, + "loss": 0.0009, + "step": 9062 + }, + { + "epoch": 4.123293903548681, + "grad_norm": 0.04413514441662232, + "learning_rate": 7.40350431499669e-07, + "loss": 0.0004, + "step": 9063 + }, + { + "epoch": 4.1237488626023655, + "grad_norm": 0.023929063696521806, + "learning_rate": 7.396021463638608e-07, + "loss": 0.0001, + "step": 9064 + }, + { + "epoch": 4.124203821656051, + "grad_norm": 0.10022273116903364, + "learning_rate": 7.388542093685258e-07, + "loss": 0.0007, + "step": 9065 + }, + { + "epoch": 4.124658780709736, + "grad_norm": 0.12799627777300554, + "learning_rate": 7.381066205747822e-07, + "loss": 0.0013, + "step": 9066 + }, + { + "epoch": 4.125113739763421, + "grad_norm": 0.008836613750157247, + "learning_rate": 7.373593800437196e-07, + "loss": 0.0001, + "step": 9067 + }, + { + "epoch": 4.125568698817107, + "grad_norm": 0.12382721197088449, + "learning_rate": 7.366124878363984e-07, + "loss": 0.0015, + "step": 9068 + }, + { + "epoch": 4.126023657870792, + "grad_norm": 0.023711049079336122, + "learning_rate": 7.358659440138499e-07, + "loss": 0.0002, + "step": 9069 + }, + { + "epoch": 4.1264786169244765, + "grad_norm": 0.056088606069611455, + "learning_rate": 7.351197486370809e-07, + "loss": 0.0004, + "step": 9070 + }, + { + "epoch": 4.126933575978162, + "grad_norm": 0.026745150542072205, + "learning_rate": 7.343739017670665e-07, + "loss": 0.0002, + "step": 9071 + }, + { + "epoch": 4.127388535031847, + "grad_norm": 0.12775574427460795, + "learning_rate": 7.336284034647517e-07, + "loss": 0.0022, + "step": 9072 + }, + { + "epoch": 4.127843494085532, + "grad_norm": 0.029243124692943483, + "learning_rate": 7.328832537910585e-07, + "loss": 0.0002, + "step": 9073 + }, + { + "epoch": 4.128298453139218, + "grad_norm": 0.374559378736072, + "learning_rate": 7.321384528068748e-07, + "loss": 0.0027, + "step": 9074 + }, + { + "epoch": 4.128753412192903, + "grad_norm": 0.12335953680153586, + "learning_rate": 7.313940005730641e-07, + "loss": 0.002, + "step": 9075 + }, + { + "epoch": 4.1292083712465875, + "grad_norm": 0.1683120319833235, + "learning_rate": 7.306498971504589e-07, + "loss": 0.0016, + "step": 9076 + }, + { + "epoch": 4.129663330300273, + "grad_norm": 0.02827564683645294, + "learning_rate": 7.299061425998638e-07, + "loss": 0.0002, + "step": 9077 + }, + { + "epoch": 4.130118289353958, + "grad_norm": 0.013370817807326758, + "learning_rate": 7.291627369820542e-07, + "loss": 0.0001, + "step": 9078 + }, + { + "epoch": 4.130573248407643, + "grad_norm": 0.02059884797082071, + "learning_rate": 7.284196803577787e-07, + "loss": 0.0001, + "step": 9079 + }, + { + "epoch": 4.131028207461329, + "grad_norm": 0.027242690545820625, + "learning_rate": 7.276769727877575e-07, + "loss": 0.0003, + "step": 9080 + }, + { + "epoch": 4.131483166515014, + "grad_norm": 0.03871781328085265, + "learning_rate": 7.269346143326805e-07, + "loss": 0.0003, + "step": 9081 + }, + { + "epoch": 4.131938125568698, + "grad_norm": 0.01139347640688604, + "learning_rate": 7.261926050532103e-07, + "loss": 0.0001, + "step": 9082 + }, + { + "epoch": 4.132393084622384, + "grad_norm": 0.06897621353278155, + "learning_rate": 7.254509450099784e-07, + "loss": 0.0006, + "step": 9083 + }, + { + "epoch": 4.132848043676069, + "grad_norm": 0.015195485397562918, + "learning_rate": 7.247096342635929e-07, + "loss": 0.0002, + "step": 9084 + }, + { + "epoch": 4.133303002729754, + "grad_norm": 0.20469708096473443, + "learning_rate": 7.239686728746292e-07, + "loss": 0.0025, + "step": 9085 + }, + { + "epoch": 4.13375796178344, + "grad_norm": 0.010152895976561375, + "learning_rate": 7.232280609036341e-07, + "loss": 0.0001, + "step": 9086 + }, + { + "epoch": 4.1342129208371245, + "grad_norm": 0.04787282604103635, + "learning_rate": 7.224877984111289e-07, + "loss": 0.0004, + "step": 9087 + }, + { + "epoch": 4.134667879890809, + "grad_norm": 0.06828705294116683, + "learning_rate": 7.217478854576026e-07, + "loss": 0.0009, + "step": 9088 + }, + { + "epoch": 4.135122838944495, + "grad_norm": 0.13268464470767546, + "learning_rate": 7.210083221035202e-07, + "loss": 0.0012, + "step": 9089 + }, + { + "epoch": 4.13557779799818, + "grad_norm": 0.009507070639699456, + "learning_rate": 7.202691084093138e-07, + "loss": 0.0001, + "step": 9090 + }, + { + "epoch": 4.136032757051865, + "grad_norm": 0.07720946676333287, + "learning_rate": 7.195302444353886e-07, + "loss": 0.0004, + "step": 9091 + }, + { + "epoch": 4.136487716105551, + "grad_norm": 0.16378615861974807, + "learning_rate": 7.187917302421216e-07, + "loss": 0.0017, + "step": 9092 + }, + { + "epoch": 4.1369426751592355, + "grad_norm": 0.16474450040440558, + "learning_rate": 7.180535658898596e-07, + "loss": 0.0011, + "step": 9093 + }, + { + "epoch": 4.13739763421292, + "grad_norm": 0.004564027357272073, + "learning_rate": 7.173157514389228e-07, + "loss": 0.0, + "step": 9094 + }, + { + "epoch": 4.137852593266606, + "grad_norm": 0.10569007381464603, + "learning_rate": 7.165782869496035e-07, + "loss": 0.0015, + "step": 9095 + }, + { + "epoch": 4.138307552320291, + "grad_norm": 0.09926510154592078, + "learning_rate": 7.158411724821629e-07, + "loss": 0.0005, + "step": 9096 + }, + { + "epoch": 4.138762511373977, + "grad_norm": 0.0625997135512905, + "learning_rate": 7.151044080968344e-07, + "loss": 0.0013, + "step": 9097 + }, + { + "epoch": 4.139217470427662, + "grad_norm": 0.009962982838196358, + "learning_rate": 7.143679938538228e-07, + "loss": 0.0001, + "step": 9098 + }, + { + "epoch": 4.1396724294813465, + "grad_norm": 0.07108107178098812, + "learning_rate": 7.136319298133054e-07, + "loss": 0.0004, + "step": 9099 + }, + { + "epoch": 4.140127388535032, + "grad_norm": 0.0874769956895481, + "learning_rate": 7.128962160354291e-07, + "loss": 0.0004, + "step": 9100 + }, + { + "epoch": 4.140582347588717, + "grad_norm": 0.01717447621969392, + "learning_rate": 7.121608525803142e-07, + "loss": 0.0001, + "step": 9101 + }, + { + "epoch": 4.141037306642402, + "grad_norm": 0.00924791996682537, + "learning_rate": 7.114258395080509e-07, + "loss": 0.0001, + "step": 9102 + }, + { + "epoch": 4.141492265696088, + "grad_norm": 0.13289027307934165, + "learning_rate": 7.106911768787e-07, + "loss": 0.0014, + "step": 9103 + }, + { + "epoch": 4.141947224749773, + "grad_norm": 0.019391147405749938, + "learning_rate": 7.09956864752297e-07, + "loss": 0.0001, + "step": 9104 + }, + { + "epoch": 4.1424021838034575, + "grad_norm": 0.23201577816908722, + "learning_rate": 7.092229031888448e-07, + "loss": 0.0021, + "step": 9105 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.0814584473978802, + "learning_rate": 7.084892922483205e-07, + "loss": 0.0013, + "step": 9106 + }, + { + "epoch": 4.143312101910828, + "grad_norm": 0.05315791364710135, + "learning_rate": 7.077560319906696e-07, + "loss": 0.0002, + "step": 9107 + }, + { + "epoch": 4.143767060964513, + "grad_norm": 0.07157720774010551, + "learning_rate": 7.070231224758123e-07, + "loss": 0.0007, + "step": 9108 + }, + { + "epoch": 4.144222020018199, + "grad_norm": 0.10267073891209148, + "learning_rate": 7.062905637636397e-07, + "loss": 0.0016, + "step": 9109 + }, + { + "epoch": 4.144676979071884, + "grad_norm": 0.014047374623526305, + "learning_rate": 7.055583559140116e-07, + "loss": 0.0001, + "step": 9110 + }, + { + "epoch": 4.1451319381255685, + "grad_norm": 0.06027531730501482, + "learning_rate": 7.048264989867615e-07, + "loss": 0.0004, + "step": 9111 + }, + { + "epoch": 4.145586897179254, + "grad_norm": 0.10077012797627584, + "learning_rate": 7.040949930416918e-07, + "loss": 0.0019, + "step": 9112 + }, + { + "epoch": 4.146041856232939, + "grad_norm": 0.07578086815231937, + "learning_rate": 7.033638381385804e-07, + "loss": 0.0007, + "step": 9113 + }, + { + "epoch": 4.146496815286624, + "grad_norm": 0.2161013725003285, + "learning_rate": 7.026330343371712e-07, + "loss": 0.0009, + "step": 9114 + }, + { + "epoch": 4.14695177434031, + "grad_norm": 0.0402471231960349, + "learning_rate": 7.019025816971852e-07, + "loss": 0.0002, + "step": 9115 + }, + { + "epoch": 4.147406733393995, + "grad_norm": 0.027199882194507998, + "learning_rate": 7.011724802783104e-07, + "loss": 0.0002, + "step": 9116 + }, + { + "epoch": 4.147861692447679, + "grad_norm": 0.03104854755254246, + "learning_rate": 7.004427301402055e-07, + "loss": 0.0003, + "step": 9117 + }, + { + "epoch": 4.148316651501365, + "grad_norm": 0.009678113573452149, + "learning_rate": 6.997133313425058e-07, + "loss": 0.0001, + "step": 9118 + }, + { + "epoch": 4.14877161055505, + "grad_norm": 0.04210170721353225, + "learning_rate": 6.989842839448125e-07, + "loss": 0.0004, + "step": 9119 + }, + { + "epoch": 4.149226569608735, + "grad_norm": 0.0644528736816396, + "learning_rate": 6.982555880066999e-07, + "loss": 0.0012, + "step": 9120 + }, + { + "epoch": 4.149681528662421, + "grad_norm": 0.1057194319425339, + "learning_rate": 6.975272435877134e-07, + "loss": 0.0014, + "step": 9121 + }, + { + "epoch": 4.1501364877161055, + "grad_norm": 0.006663769364760856, + "learning_rate": 6.967992507473703e-07, + "loss": 0.0001, + "step": 9122 + }, + { + "epoch": 4.15059144676979, + "grad_norm": 0.022638692143098008, + "learning_rate": 6.960716095451608e-07, + "loss": 0.0002, + "step": 9123 + }, + { + "epoch": 4.151046405823476, + "grad_norm": 0.02387221776472979, + "learning_rate": 6.95344320040543e-07, + "loss": 0.0001, + "step": 9124 + }, + { + "epoch": 4.151501364877161, + "grad_norm": 0.03687433742698374, + "learning_rate": 6.946173822929481e-07, + "loss": 0.0004, + "step": 9125 + }, + { + "epoch": 4.151956323930846, + "grad_norm": 0.07900027947464566, + "learning_rate": 6.938907963617775e-07, + "loss": 0.0008, + "step": 9126 + }, + { + "epoch": 4.152411282984532, + "grad_norm": 0.006882607438259228, + "learning_rate": 6.931645623064031e-07, + "loss": 0.0, + "step": 9127 + }, + { + "epoch": 4.1528662420382165, + "grad_norm": 0.04867944724074638, + "learning_rate": 6.924386801861721e-07, + "loss": 0.0006, + "step": 9128 + }, + { + "epoch": 4.153321201091901, + "grad_norm": 0.07776516566746394, + "learning_rate": 6.917131500603996e-07, + "loss": 0.0006, + "step": 9129 + }, + { + "epoch": 4.153776160145587, + "grad_norm": 0.028951684215992028, + "learning_rate": 6.909879719883733e-07, + "loss": 0.0002, + "step": 9130 + }, + { + "epoch": 4.154231119199272, + "grad_norm": 0.08448020514966632, + "learning_rate": 6.902631460293501e-07, + "loss": 0.0007, + "step": 9131 + }, + { + "epoch": 4.154686078252957, + "grad_norm": 0.06194416165232362, + "learning_rate": 6.89538672242559e-07, + "loss": 0.0006, + "step": 9132 + }, + { + "epoch": 4.155141037306643, + "grad_norm": 0.3397159170814614, + "learning_rate": 6.888145506872029e-07, + "loss": 0.0038, + "step": 9133 + }, + { + "epoch": 4.1555959963603275, + "grad_norm": 0.20356016980668376, + "learning_rate": 6.880907814224524e-07, + "loss": 0.0012, + "step": 9134 + }, + { + "epoch": 4.156050955414012, + "grad_norm": 0.07823289180254378, + "learning_rate": 6.873673645074497e-07, + "loss": 0.0006, + "step": 9135 + }, + { + "epoch": 4.156505914467698, + "grad_norm": 0.08199286828782393, + "learning_rate": 6.866443000013117e-07, + "loss": 0.0007, + "step": 9136 + }, + { + "epoch": 4.156960873521383, + "grad_norm": 0.03775878067999496, + "learning_rate": 6.859215879631215e-07, + "loss": 0.0002, + "step": 9137 + }, + { + "epoch": 4.157415832575068, + "grad_norm": 0.25692113151172263, + "learning_rate": 6.851992284519377e-07, + "loss": 0.005, + "step": 9138 + }, + { + "epoch": 4.157870791628754, + "grad_norm": 0.052334533110741516, + "learning_rate": 6.844772215267875e-07, + "loss": 0.0002, + "step": 9139 + }, + { + "epoch": 4.1583257506824385, + "grad_norm": 0.12715905993237583, + "learning_rate": 6.837555672466701e-07, + "loss": 0.0009, + "step": 9140 + }, + { + "epoch": 4.158780709736123, + "grad_norm": 0.07667259298747758, + "learning_rate": 6.830342656705546e-07, + "loss": 0.0007, + "step": 9141 + }, + { + "epoch": 4.159235668789809, + "grad_norm": 0.10471670082195757, + "learning_rate": 6.823133168573836e-07, + "loss": 0.0008, + "step": 9142 + }, + { + "epoch": 4.159690627843494, + "grad_norm": 0.11208945904229069, + "learning_rate": 6.815927208660711e-07, + "loss": 0.001, + "step": 9143 + }, + { + "epoch": 4.160145586897179, + "grad_norm": 0.24994788542120291, + "learning_rate": 6.808724777554998e-07, + "loss": 0.0014, + "step": 9144 + }, + { + "epoch": 4.160600545950865, + "grad_norm": 0.04349589401000913, + "learning_rate": 6.801525875845244e-07, + "loss": 0.0004, + "step": 9145 + }, + { + "epoch": 4.1610555050045495, + "grad_norm": 0.19759787328195044, + "learning_rate": 6.794330504119706e-07, + "loss": 0.004, + "step": 9146 + }, + { + "epoch": 4.161510464058235, + "grad_norm": 0.030857087462157606, + "learning_rate": 6.787138662966369e-07, + "loss": 0.0002, + "step": 9147 + }, + { + "epoch": 4.16196542311192, + "grad_norm": 0.11520096368589824, + "learning_rate": 6.779950352972919e-07, + "loss": 0.0018, + "step": 9148 + }, + { + "epoch": 4.162420382165605, + "grad_norm": 0.1908976447168784, + "learning_rate": 6.77276557472673e-07, + "loss": 0.001, + "step": 9149 + }, + { + "epoch": 4.162875341219291, + "grad_norm": 0.07705742660010427, + "learning_rate": 6.76558432881494e-07, + "loss": 0.0006, + "step": 9150 + }, + { + "epoch": 4.163330300272976, + "grad_norm": 0.011536493961918222, + "learning_rate": 6.758406615824342e-07, + "loss": 0.0001, + "step": 9151 + }, + { + "epoch": 4.16378525932666, + "grad_norm": 0.12534348159449027, + "learning_rate": 6.751232436341487e-07, + "loss": 0.0008, + "step": 9152 + }, + { + "epoch": 4.164240218380346, + "grad_norm": 0.04610340944499061, + "learning_rate": 6.744061790952611e-07, + "loss": 0.0003, + "step": 9153 + }, + { + "epoch": 4.164695177434031, + "grad_norm": 0.016528406575559255, + "learning_rate": 6.736894680243661e-07, + "loss": 0.0001, + "step": 9154 + }, + { + "epoch": 4.165150136487716, + "grad_norm": 0.14485888727291657, + "learning_rate": 6.729731104800292e-07, + "loss": 0.0012, + "step": 9155 + }, + { + "epoch": 4.165605095541402, + "grad_norm": 0.028025914044455785, + "learning_rate": 6.722571065207889e-07, + "loss": 0.0002, + "step": 9156 + }, + { + "epoch": 4.1660600545950865, + "grad_norm": 0.035713522445234486, + "learning_rate": 6.715414562051553e-07, + "loss": 0.0003, + "step": 9157 + }, + { + "epoch": 4.166515013648771, + "grad_norm": 0.12071765878038823, + "learning_rate": 6.708261595916071e-07, + "loss": 0.0016, + "step": 9158 + }, + { + "epoch": 4.166969972702457, + "grad_norm": 0.019864769840983554, + "learning_rate": 6.701112167385943e-07, + "loss": 0.0001, + "step": 9159 + }, + { + "epoch": 4.167424931756142, + "grad_norm": 0.04428397490285644, + "learning_rate": 6.693966277045394e-07, + "loss": 0.0002, + "step": 9160 + }, + { + "epoch": 4.167879890809827, + "grad_norm": 0.041786114304456355, + "learning_rate": 6.686823925478336e-07, + "loss": 0.0001, + "step": 9161 + }, + { + "epoch": 4.168334849863513, + "grad_norm": 0.07412171322175884, + "learning_rate": 6.679685113268447e-07, + "loss": 0.0007, + "step": 9162 + }, + { + "epoch": 4.1687898089171975, + "grad_norm": 0.023537291646692, + "learning_rate": 6.672549840999037e-07, + "loss": 0.0001, + "step": 9163 + }, + { + "epoch": 4.169244767970882, + "grad_norm": 0.17160225817536284, + "learning_rate": 6.665418109253207e-07, + "loss": 0.0048, + "step": 9164 + }, + { + "epoch": 4.169699727024568, + "grad_norm": 0.00893465321613862, + "learning_rate": 6.658289918613709e-07, + "loss": 0.0001, + "step": 9165 + }, + { + "epoch": 4.170154686078253, + "grad_norm": 0.1361831934819655, + "learning_rate": 6.651165269663018e-07, + "loss": 0.0029, + "step": 9166 + }, + { + "epoch": 4.170609645131938, + "grad_norm": 0.031564461284722276, + "learning_rate": 6.644044162983355e-07, + "loss": 0.0002, + "step": 9167 + }, + { + "epoch": 4.171064604185624, + "grad_norm": 0.0770030833103646, + "learning_rate": 6.636926599156601e-07, + "loss": 0.0003, + "step": 9168 + }, + { + "epoch": 4.1715195632393085, + "grad_norm": 0.006766130064054846, + "learning_rate": 6.629812578764389e-07, + "loss": 0.0001, + "step": 9169 + }, + { + "epoch": 4.171974522292993, + "grad_norm": 0.024778419877562463, + "learning_rate": 6.622702102388018e-07, + "loss": 0.0002, + "step": 9170 + }, + { + "epoch": 4.172429481346679, + "grad_norm": 0.05138540127432682, + "learning_rate": 6.615595170608541e-07, + "loss": 0.0003, + "step": 9171 + }, + { + "epoch": 4.172884440400364, + "grad_norm": 0.023476761991421933, + "learning_rate": 6.608491784006715e-07, + "loss": 0.0002, + "step": 9172 + }, + { + "epoch": 4.173339399454049, + "grad_norm": 0.0560661430328377, + "learning_rate": 6.601391943162989e-07, + "loss": 0.0005, + "step": 9173 + }, + { + "epoch": 4.173794358507735, + "grad_norm": 0.059333921481697464, + "learning_rate": 6.594295648657528e-07, + "loss": 0.0005, + "step": 9174 + }, + { + "epoch": 4.1742493175614195, + "grad_norm": 0.02768945967891085, + "learning_rate": 6.587202901070194e-07, + "loss": 0.0003, + "step": 9175 + }, + { + "epoch": 4.174704276615104, + "grad_norm": 0.009875111081740105, + "learning_rate": 6.5801137009806e-07, + "loss": 0.0001, + "step": 9176 + }, + { + "epoch": 4.17515923566879, + "grad_norm": 0.09459231301077736, + "learning_rate": 6.573028048968022e-07, + "loss": 0.0006, + "step": 9177 + }, + { + "epoch": 4.175614194722475, + "grad_norm": 0.23591128626393254, + "learning_rate": 6.565945945611485e-07, + "loss": 0.0077, + "step": 9178 + }, + { + "epoch": 4.17606915377616, + "grad_norm": 0.005969329223529804, + "learning_rate": 6.558867391489703e-07, + "loss": 0.0001, + "step": 9179 + }, + { + "epoch": 4.176524112829846, + "grad_norm": 0.08929087480993467, + "learning_rate": 6.551792387181089e-07, + "loss": 0.0007, + "step": 9180 + }, + { + "epoch": 4.1769790718835305, + "grad_norm": 0.02797661408841475, + "learning_rate": 6.544720933263798e-07, + "loss": 0.0002, + "step": 9181 + }, + { + "epoch": 4.177434030937215, + "grad_norm": 0.013424039313164264, + "learning_rate": 6.537653030315671e-07, + "loss": 0.0001, + "step": 9182 + }, + { + "epoch": 4.177888989990901, + "grad_norm": 0.0034710398803660517, + "learning_rate": 6.530588678914263e-07, + "loss": 0.0, + "step": 9183 + }, + { + "epoch": 4.178343949044586, + "grad_norm": 0.031620278929570436, + "learning_rate": 6.523527879636837e-07, + "loss": 0.0002, + "step": 9184 + }, + { + "epoch": 4.178798908098271, + "grad_norm": 0.029953733539113833, + "learning_rate": 6.516470633060368e-07, + "loss": 0.0003, + "step": 9185 + }, + { + "epoch": 4.179253867151957, + "grad_norm": 0.10342099574404647, + "learning_rate": 6.509416939761565e-07, + "loss": 0.0019, + "step": 9186 + }, + { + "epoch": 4.179708826205641, + "grad_norm": 0.16248306500458215, + "learning_rate": 6.502366800316801e-07, + "loss": 0.0009, + "step": 9187 + }, + { + "epoch": 4.180163785259326, + "grad_norm": 0.013022761017413183, + "learning_rate": 6.495320215302192e-07, + "loss": 0.0001, + "step": 9188 + }, + { + "epoch": 4.180618744313012, + "grad_norm": 0.013148338493735616, + "learning_rate": 6.48827718529354e-07, + "loss": 0.0001, + "step": 9189 + }, + { + "epoch": 4.181073703366697, + "grad_norm": 0.16967024721292967, + "learning_rate": 6.481237710866389e-07, + "loss": 0.002, + "step": 9190 + }, + { + "epoch": 4.181528662420382, + "grad_norm": 0.08451525810806661, + "learning_rate": 6.474201792595958e-07, + "loss": 0.001, + "step": 9191 + }, + { + "epoch": 4.1819836214740675, + "grad_norm": 0.07785399963556024, + "learning_rate": 6.467169431057202e-07, + "loss": 0.0006, + "step": 9192 + }, + { + "epoch": 4.182438580527752, + "grad_norm": 0.010904243117567148, + "learning_rate": 6.460140626824763e-07, + "loss": 0.0001, + "step": 9193 + }, + { + "epoch": 4.182893539581437, + "grad_norm": 0.03257330343874157, + "learning_rate": 6.453115380473001e-07, + "loss": 0.0002, + "step": 9194 + }, + { + "epoch": 4.183348498635123, + "grad_norm": 0.03690040075654703, + "learning_rate": 6.446093692576005e-07, + "loss": 0.0003, + "step": 9195 + }, + { + "epoch": 4.183803457688808, + "grad_norm": 0.029523620598909953, + "learning_rate": 6.439075563707548e-07, + "loss": 0.0001, + "step": 9196 + }, + { + "epoch": 4.184258416742493, + "grad_norm": 0.03577807208194298, + "learning_rate": 6.432060994441114e-07, + "loss": 0.0003, + "step": 9197 + }, + { + "epoch": 4.1847133757961785, + "grad_norm": 0.05449437440305742, + "learning_rate": 6.425049985349891e-07, + "loss": 0.0006, + "step": 9198 + }, + { + "epoch": 4.185168334849863, + "grad_norm": 0.14825086389507425, + "learning_rate": 6.418042537006813e-07, + "loss": 0.0017, + "step": 9199 + }, + { + "epoch": 4.185623293903548, + "grad_norm": 0.027539289611392494, + "learning_rate": 6.411038649984474e-07, + "loss": 0.0002, + "step": 9200 + }, + { + "epoch": 4.186078252957234, + "grad_norm": 0.10060678633737367, + "learning_rate": 6.404038324855222e-07, + "loss": 0.0018, + "step": 9201 + }, + { + "epoch": 4.186533212010919, + "grad_norm": 0.008703302032577048, + "learning_rate": 6.397041562191081e-07, + "loss": 0.0001, + "step": 9202 + }, + { + "epoch": 4.186988171064604, + "grad_norm": 0.1556722438759862, + "learning_rate": 6.390048362563789e-07, + "loss": 0.0022, + "step": 9203 + }, + { + "epoch": 4.1874431301182895, + "grad_norm": 0.010543354781474334, + "learning_rate": 6.383058726544799e-07, + "loss": 0.0001, + "step": 9204 + }, + { + "epoch": 4.187898089171974, + "grad_norm": 0.18037695066916648, + "learning_rate": 6.376072654705274e-07, + "loss": 0.002, + "step": 9205 + }, + { + "epoch": 4.188353048225659, + "grad_norm": 0.16796778212296457, + "learning_rate": 6.369090147616103e-07, + "loss": 0.0021, + "step": 9206 + }, + { + "epoch": 4.188808007279345, + "grad_norm": 0.019603927792413246, + "learning_rate": 6.362111205847843e-07, + "loss": 0.0001, + "step": 9207 + }, + { + "epoch": 4.18926296633303, + "grad_norm": 0.3861315637527699, + "learning_rate": 6.355135829970794e-07, + "loss": 0.0022, + "step": 9208 + }, + { + "epoch": 4.189717925386716, + "grad_norm": 0.16050242908147475, + "learning_rate": 6.348164020554936e-07, + "loss": 0.0043, + "step": 9209 + }, + { + "epoch": 4.1901728844404005, + "grad_norm": 0.14560812987755511, + "learning_rate": 6.341195778169989e-07, + "loss": 0.0016, + "step": 9210 + }, + { + "epoch": 4.190627843494085, + "grad_norm": 0.1310902894596744, + "learning_rate": 6.334231103385369e-07, + "loss": 0.0009, + "step": 9211 + }, + { + "epoch": 4.191082802547771, + "grad_norm": 0.01991136686751464, + "learning_rate": 6.327269996770174e-07, + "loss": 0.0002, + "step": 9212 + }, + { + "epoch": 4.191537761601456, + "grad_norm": 0.042213720963715916, + "learning_rate": 6.320312458893262e-07, + "loss": 0.0004, + "step": 9213 + }, + { + "epoch": 4.191992720655141, + "grad_norm": 0.1363581936034296, + "learning_rate": 6.313358490323152e-07, + "loss": 0.0009, + "step": 9214 + }, + { + "epoch": 4.192447679708827, + "grad_norm": 0.3706466109068231, + "learning_rate": 6.306408091628108e-07, + "loss": 0.001, + "step": 9215 + }, + { + "epoch": 4.1929026387625115, + "grad_norm": 0.016176040948542317, + "learning_rate": 6.299461263376078e-07, + "loss": 0.0001, + "step": 9216 + }, + { + "epoch": 4.193357597816196, + "grad_norm": 0.05161273612807351, + "learning_rate": 6.292518006134723e-07, + "loss": 0.0002, + "step": 9217 + }, + { + "epoch": 4.193812556869882, + "grad_norm": 0.004714421082205538, + "learning_rate": 6.285578320471403e-07, + "loss": 0.0, + "step": 9218 + }, + { + "epoch": 4.194267515923567, + "grad_norm": 0.1783373314823882, + "learning_rate": 6.278642206953212e-07, + "loss": 0.002, + "step": 9219 + }, + { + "epoch": 4.194722474977252, + "grad_norm": 0.26380081822850604, + "learning_rate": 6.271709666146947e-07, + "loss": 0.0023, + "step": 9220 + }, + { + "epoch": 4.195177434030938, + "grad_norm": 0.10525343942278702, + "learning_rate": 6.264780698619094e-07, + "loss": 0.0002, + "step": 9221 + }, + { + "epoch": 4.195632393084622, + "grad_norm": 0.1434293007929281, + "learning_rate": 6.257855304935851e-07, + "loss": 0.0004, + "step": 9222 + }, + { + "epoch": 4.196087352138307, + "grad_norm": 0.05190244122703771, + "learning_rate": 6.250933485663124e-07, + "loss": 0.0003, + "step": 9223 + }, + { + "epoch": 4.196542311191993, + "grad_norm": 0.014246052764307845, + "learning_rate": 6.244015241366558e-07, + "loss": 0.0001, + "step": 9224 + }, + { + "epoch": 4.196997270245678, + "grad_norm": 0.034111973326917, + "learning_rate": 6.237100572611465e-07, + "loss": 0.0003, + "step": 9225 + }, + { + "epoch": 4.197452229299363, + "grad_norm": 0.2155729294220744, + "learning_rate": 6.230189479962873e-07, + "loss": 0.0039, + "step": 9226 + }, + { + "epoch": 4.1979071883530485, + "grad_norm": 0.0065085798646169766, + "learning_rate": 6.223281963985539e-07, + "loss": 0.0001, + "step": 9227 + }, + { + "epoch": 4.198362147406733, + "grad_norm": 0.02088539552658922, + "learning_rate": 6.216378025243902e-07, + "loss": 0.0001, + "step": 9228 + }, + { + "epoch": 4.198817106460418, + "grad_norm": 0.007280283249482536, + "learning_rate": 6.209477664302139e-07, + "loss": 0.0001, + "step": 9229 + }, + { + "epoch": 4.199272065514104, + "grad_norm": 0.019677628734965126, + "learning_rate": 6.202580881724107e-07, + "loss": 0.0001, + "step": 9230 + }, + { + "epoch": 4.199727024567789, + "grad_norm": 0.06795452885858172, + "learning_rate": 6.195687678073376e-07, + "loss": 0.0004, + "step": 9231 + }, + { + "epoch": 4.200181983621474, + "grad_norm": 0.0936993291356914, + "learning_rate": 6.188798053913226e-07, + "loss": 0.0011, + "step": 9232 + }, + { + "epoch": 4.2006369426751595, + "grad_norm": 0.04389398655634868, + "learning_rate": 6.181912009806629e-07, + "loss": 0.0004, + "step": 9233 + }, + { + "epoch": 4.201091901728844, + "grad_norm": 0.019063541393414738, + "learning_rate": 6.175029546316325e-07, + "loss": 0.0002, + "step": 9234 + }, + { + "epoch": 4.201546860782529, + "grad_norm": 0.16280824663918508, + "learning_rate": 6.168150664004696e-07, + "loss": 0.0004, + "step": 9235 + }, + { + "epoch": 4.202001819836215, + "grad_norm": 0.08059903867287413, + "learning_rate": 6.16127536343385e-07, + "loss": 0.0004, + "step": 9236 + }, + { + "epoch": 4.2024567788899, + "grad_norm": 0.12587323985074533, + "learning_rate": 6.154403645165608e-07, + "loss": 0.0009, + "step": 9237 + }, + { + "epoch": 4.202911737943585, + "grad_norm": 0.004735581879985467, + "learning_rate": 6.147535509761487e-07, + "loss": 0.0, + "step": 9238 + }, + { + "epoch": 4.2033666969972705, + "grad_norm": 0.11462942349212701, + "learning_rate": 6.140670957782735e-07, + "loss": 0.0007, + "step": 9239 + }, + { + "epoch": 4.203821656050955, + "grad_norm": 0.6553681836037633, + "learning_rate": 6.133809989790274e-07, + "loss": 0.0047, + "step": 9240 + }, + { + "epoch": 4.20427661510464, + "grad_norm": 0.07555905467646963, + "learning_rate": 6.126952606344777e-07, + "loss": 0.0007, + "step": 9241 + }, + { + "epoch": 4.204731574158326, + "grad_norm": 0.018565144935949913, + "learning_rate": 6.120098808006581e-07, + "loss": 0.0001, + "step": 9242 + }, + { + "epoch": 4.205186533212011, + "grad_norm": 0.02181423444709814, + "learning_rate": 6.113248595335742e-07, + "loss": 0.0002, + "step": 9243 + }, + { + "epoch": 4.205641492265696, + "grad_norm": 0.5092251492851335, + "learning_rate": 6.106401968892045e-07, + "loss": 0.0034, + "step": 9244 + }, + { + "epoch": 4.2060964513193815, + "grad_norm": 0.1322294130743196, + "learning_rate": 6.099558929234961e-07, + "loss": 0.0013, + "step": 9245 + }, + { + "epoch": 4.206551410373066, + "grad_norm": 0.09545367306436295, + "learning_rate": 6.092719476923664e-07, + "loss": 0.0004, + "step": 9246 + }, + { + "epoch": 4.207006369426751, + "grad_norm": 0.033852493799744435, + "learning_rate": 6.085883612517041e-07, + "loss": 0.0002, + "step": 9247 + }, + { + "epoch": 4.207461328480437, + "grad_norm": 0.11487475001328348, + "learning_rate": 6.079051336573694e-07, + "loss": 0.0005, + "step": 9248 + }, + { + "epoch": 4.207916287534122, + "grad_norm": 0.1393337908259671, + "learning_rate": 6.072222649651938e-07, + "loss": 0.0016, + "step": 9249 + }, + { + "epoch": 4.208371246587807, + "grad_norm": 0.086005086579078, + "learning_rate": 6.065397552309765e-07, + "loss": 0.0009, + "step": 9250 + }, + { + "epoch": 4.2088262056414925, + "grad_norm": 0.016164087562687696, + "learning_rate": 6.058576045104903e-07, + "loss": 0.0001, + "step": 9251 + }, + { + "epoch": 4.209281164695177, + "grad_norm": 0.0396690418853614, + "learning_rate": 6.051758128594759e-07, + "loss": 0.0003, + "step": 9252 + }, + { + "epoch": 4.209736123748862, + "grad_norm": 0.17307388864539813, + "learning_rate": 6.044943803336478e-07, + "loss": 0.0025, + "step": 9253 + }, + { + "epoch": 4.210191082802548, + "grad_norm": 0.010909629365686685, + "learning_rate": 6.038133069886887e-07, + "loss": 0.0001, + "step": 9254 + }, + { + "epoch": 4.210646041856233, + "grad_norm": 0.04285829641335229, + "learning_rate": 6.031325928802534e-07, + "loss": 0.0001, + "step": 9255 + }, + { + "epoch": 4.211101000909918, + "grad_norm": 0.03186494509096555, + "learning_rate": 6.024522380639669e-07, + "loss": 0.0003, + "step": 9256 + }, + { + "epoch": 4.211555959963603, + "grad_norm": 0.07519870204749489, + "learning_rate": 6.017722425954231e-07, + "loss": 0.0008, + "step": 9257 + }, + { + "epoch": 4.212010919017288, + "grad_norm": 0.10114821368449645, + "learning_rate": 6.010926065301909e-07, + "loss": 0.001, + "step": 9258 + }, + { + "epoch": 4.212465878070974, + "grad_norm": 0.005373423467783357, + "learning_rate": 6.004133299238052e-07, + "loss": 0.0, + "step": 9259 + }, + { + "epoch": 4.212920837124659, + "grad_norm": 0.06858789496133387, + "learning_rate": 5.997344128317739e-07, + "loss": 0.0005, + "step": 9260 + }, + { + "epoch": 4.213375796178344, + "grad_norm": 0.04407547760768694, + "learning_rate": 5.990558553095743e-07, + "loss": 0.0006, + "step": 9261 + }, + { + "epoch": 4.2138307552320295, + "grad_norm": 0.01451503844421713, + "learning_rate": 5.983776574126554e-07, + "loss": 0.0001, + "step": 9262 + }, + { + "epoch": 4.214285714285714, + "grad_norm": 0.11242653710469215, + "learning_rate": 5.976998191964378e-07, + "loss": 0.001, + "step": 9263 + }, + { + "epoch": 4.214740673339399, + "grad_norm": 0.016192436623986877, + "learning_rate": 5.9702234071631e-07, + "loss": 0.0001, + "step": 9264 + }, + { + "epoch": 4.215195632393085, + "grad_norm": 0.0494932513329367, + "learning_rate": 5.963452220276333e-07, + "loss": 0.0002, + "step": 9265 + }, + { + "epoch": 4.21565059144677, + "grad_norm": 0.1596845483423612, + "learning_rate": 5.956684631857385e-07, + "loss": 0.0009, + "step": 9266 + }, + { + "epoch": 4.216105550500455, + "grad_norm": 0.014596039927809067, + "learning_rate": 5.949920642459256e-07, + "loss": 0.0001, + "step": 9267 + }, + { + "epoch": 4.2165605095541405, + "grad_norm": 0.16087612869616275, + "learning_rate": 5.943160252634688e-07, + "loss": 0.0031, + "step": 9268 + }, + { + "epoch": 4.217015468607825, + "grad_norm": 0.01839831589538077, + "learning_rate": 5.936403462936113e-07, + "loss": 0.0001, + "step": 9269 + }, + { + "epoch": 4.21747042766151, + "grad_norm": 0.0702892304218404, + "learning_rate": 5.92965027391566e-07, + "loss": 0.0005, + "step": 9270 + }, + { + "epoch": 4.217925386715196, + "grad_norm": 0.17384681246707095, + "learning_rate": 5.922900686125166e-07, + "loss": 0.0007, + "step": 9271 + }, + { + "epoch": 4.218380345768881, + "grad_norm": 0.12433413518160925, + "learning_rate": 5.916154700116161e-07, + "loss": 0.0013, + "step": 9272 + }, + { + "epoch": 4.218835304822566, + "grad_norm": 0.04959066437792704, + "learning_rate": 5.909412316439933e-07, + "loss": 0.0007, + "step": 9273 + }, + { + "epoch": 4.2192902638762515, + "grad_norm": 0.03396951723817135, + "learning_rate": 5.902673535647413e-07, + "loss": 0.0002, + "step": 9274 + }, + { + "epoch": 4.219745222929936, + "grad_norm": 0.2632647347248532, + "learning_rate": 5.89593835828926e-07, + "loss": 0.0055, + "step": 9275 + }, + { + "epoch": 4.220200181983621, + "grad_norm": 0.006930218834141626, + "learning_rate": 5.889206784915863e-07, + "loss": 0.0, + "step": 9276 + }, + { + "epoch": 4.220655141037307, + "grad_norm": 0.10977733810756486, + "learning_rate": 5.882478816077275e-07, + "loss": 0.0008, + "step": 9277 + }, + { + "epoch": 4.221110100090992, + "grad_norm": 0.07528702005502176, + "learning_rate": 5.875754452323296e-07, + "loss": 0.0009, + "step": 9278 + }, + { + "epoch": 4.221565059144677, + "grad_norm": 0.026784072640724205, + "learning_rate": 5.869033694203402e-07, + "loss": 0.0002, + "step": 9279 + }, + { + "epoch": 4.2220200181983625, + "grad_norm": 0.05917152904685677, + "learning_rate": 5.862316542266777e-07, + "loss": 0.0005, + "step": 9280 + }, + { + "epoch": 4.222474977252047, + "grad_norm": 0.05384090420509348, + "learning_rate": 5.85560299706231e-07, + "loss": 0.0003, + "step": 9281 + }, + { + "epoch": 4.222929936305732, + "grad_norm": 0.275294963965191, + "learning_rate": 5.848893059138616e-07, + "loss": 0.0009, + "step": 9282 + }, + { + "epoch": 4.223384895359418, + "grad_norm": 0.20394959958015133, + "learning_rate": 5.842186729044003e-07, + "loss": 0.0029, + "step": 9283 + }, + { + "epoch": 4.223839854413103, + "grad_norm": 0.03366684958534347, + "learning_rate": 5.835484007326475e-07, + "loss": 0.0001, + "step": 9284 + }, + { + "epoch": 4.224294813466788, + "grad_norm": 0.34716471218447637, + "learning_rate": 5.828784894533751e-07, + "loss": 0.0039, + "step": 9285 + }, + { + "epoch": 4.2247497725204735, + "grad_norm": 0.09420327678053593, + "learning_rate": 5.822089391213237e-07, + "loss": 0.0007, + "step": 9286 + }, + { + "epoch": 4.225204731574158, + "grad_norm": 0.0740663765362519, + "learning_rate": 5.815397497912084e-07, + "loss": 0.0006, + "step": 9287 + }, + { + "epoch": 4.225659690627843, + "grad_norm": 0.02050749025065289, + "learning_rate": 5.808709215177111e-07, + "loss": 0.0001, + "step": 9288 + }, + { + "epoch": 4.226114649681529, + "grad_norm": 0.04191357099582751, + "learning_rate": 5.802024543554846e-07, + "loss": 0.0009, + "step": 9289 + }, + { + "epoch": 4.226569608735214, + "grad_norm": 0.023610364371886728, + "learning_rate": 5.795343483591548e-07, + "loss": 0.0001, + "step": 9290 + }, + { + "epoch": 4.227024567788899, + "grad_norm": 0.027975160420341295, + "learning_rate": 5.788666035833146e-07, + "loss": 0.0002, + "step": 9291 + }, + { + "epoch": 4.227479526842584, + "grad_norm": 0.14526961551799208, + "learning_rate": 5.781992200825309e-07, + "loss": 0.0015, + "step": 9292 + }, + { + "epoch": 4.227934485896269, + "grad_norm": 0.06404254366889023, + "learning_rate": 5.77532197911338e-07, + "loss": 0.0003, + "step": 9293 + }, + { + "epoch": 4.228389444949954, + "grad_norm": 0.05215656286078959, + "learning_rate": 5.768655371242421e-07, + "loss": 0.0004, + "step": 9294 + }, + { + "epoch": 4.22884440400364, + "grad_norm": 0.18349748769479132, + "learning_rate": 5.761992377757192e-07, + "loss": 0.0012, + "step": 9295 + }, + { + "epoch": 4.229299363057325, + "grad_norm": 0.028345648795295303, + "learning_rate": 5.755332999202168e-07, + "loss": 0.0003, + "step": 9296 + }, + { + "epoch": 4.22975432211101, + "grad_norm": 0.07408040339289708, + "learning_rate": 5.74867723612153e-07, + "loss": 0.0006, + "step": 9297 + }, + { + "epoch": 4.230209281164695, + "grad_norm": 0.07599486844267275, + "learning_rate": 5.742025089059155e-07, + "loss": 0.0002, + "step": 9298 + }, + { + "epoch": 4.23066424021838, + "grad_norm": 0.22352911402439368, + "learning_rate": 5.735376558558625e-07, + "loss": 0.0045, + "step": 9299 + }, + { + "epoch": 4.231119199272065, + "grad_norm": 0.06511784495072674, + "learning_rate": 5.72873164516321e-07, + "loss": 0.0004, + "step": 9300 + }, + { + "epoch": 4.231574158325751, + "grad_norm": 0.09486577965947564, + "learning_rate": 5.722090349415932e-07, + "loss": 0.001, + "step": 9301 + }, + { + "epoch": 4.232029117379436, + "grad_norm": 0.140978365623315, + "learning_rate": 5.715452671859468e-07, + "loss": 0.0023, + "step": 9302 + }, + { + "epoch": 4.232484076433121, + "grad_norm": 0.023789570101093938, + "learning_rate": 5.708818613036221e-07, + "loss": 0.0002, + "step": 9303 + }, + { + "epoch": 4.232939035486806, + "grad_norm": 0.06217827631473419, + "learning_rate": 5.702188173488304e-07, + "loss": 0.0006, + "step": 9304 + }, + { + "epoch": 4.233393994540491, + "grad_norm": 0.029948375685679478, + "learning_rate": 5.695561353757523e-07, + "loss": 0.0001, + "step": 9305 + }, + { + "epoch": 4.233848953594176, + "grad_norm": 0.06481214941934971, + "learning_rate": 5.688938154385382e-07, + "loss": 0.0006, + "step": 9306 + }, + { + "epoch": 4.234303912647862, + "grad_norm": 0.015754880391528853, + "learning_rate": 5.682318575913121e-07, + "loss": 0.0001, + "step": 9307 + }, + { + "epoch": 4.234758871701547, + "grad_norm": 0.1370104022465754, + "learning_rate": 5.675702618881645e-07, + "loss": 0.0032, + "step": 9308 + }, + { + "epoch": 4.235213830755232, + "grad_norm": 0.030985005776223276, + "learning_rate": 5.669090283831585e-07, + "loss": 0.0002, + "step": 9309 + }, + { + "epoch": 4.235668789808917, + "grad_norm": 0.019916934754802586, + "learning_rate": 5.662481571303264e-07, + "loss": 0.0001, + "step": 9310 + }, + { + "epoch": 4.236123748862602, + "grad_norm": 0.33903228752697046, + "learning_rate": 5.655876481836719e-07, + "loss": 0.002, + "step": 9311 + }, + { + "epoch": 4.236578707916287, + "grad_norm": 0.15377719459580733, + "learning_rate": 5.649275015971706e-07, + "loss": 0.0025, + "step": 9312 + }, + { + "epoch": 4.237033666969973, + "grad_norm": 0.12203306522676567, + "learning_rate": 5.642677174247646e-07, + "loss": 0.0014, + "step": 9313 + }, + { + "epoch": 4.237488626023658, + "grad_norm": 0.09511660950678767, + "learning_rate": 5.636082957203698e-07, + "loss": 0.0011, + "step": 9314 + }, + { + "epoch": 4.237943585077343, + "grad_norm": 0.05398236632756204, + "learning_rate": 5.629492365378691e-07, + "loss": 0.0019, + "step": 9315 + }, + { + "epoch": 4.238398544131028, + "grad_norm": 0.029383209840151758, + "learning_rate": 5.622905399311201e-07, + "loss": 0.0002, + "step": 9316 + }, + { + "epoch": 4.238853503184713, + "grad_norm": 0.1474027183066491, + "learning_rate": 5.616322059539469e-07, + "loss": 0.0016, + "step": 9317 + }, + { + "epoch": 4.239308462238399, + "grad_norm": 0.03116306615580843, + "learning_rate": 5.609742346601471e-07, + "loss": 0.0002, + "step": 9318 + }, + { + "epoch": 4.239763421292084, + "grad_norm": 0.06595812418215023, + "learning_rate": 5.603166261034865e-07, + "loss": 0.0007, + "step": 9319 + }, + { + "epoch": 4.240218380345769, + "grad_norm": 0.009383657030537765, + "learning_rate": 5.59659380337701e-07, + "loss": 0.0001, + "step": 9320 + }, + { + "epoch": 4.2406733393994545, + "grad_norm": 0.025587756274128386, + "learning_rate": 5.590024974164993e-07, + "loss": 0.0001, + "step": 9321 + }, + { + "epoch": 4.241128298453139, + "grad_norm": 0.07414089223016224, + "learning_rate": 5.583459773935584e-07, + "loss": 0.0009, + "step": 9322 + }, + { + "epoch": 4.241583257506824, + "grad_norm": 0.2552290100720014, + "learning_rate": 5.576898203225256e-07, + "loss": 0.0014, + "step": 9323 + }, + { + "epoch": 4.24203821656051, + "grad_norm": 0.21993919821261146, + "learning_rate": 5.570340262570184e-07, + "loss": 0.0033, + "step": 9324 + }, + { + "epoch": 4.242493175614195, + "grad_norm": 0.04946897779007293, + "learning_rate": 5.563785952506267e-07, + "loss": 0.0005, + "step": 9325 + }, + { + "epoch": 4.24294813466788, + "grad_norm": 0.1987220738552609, + "learning_rate": 5.557235273569094e-07, + "loss": 0.0036, + "step": 9326 + }, + { + "epoch": 4.243403093721565, + "grad_norm": 0.12016134997718976, + "learning_rate": 5.55068822629396e-07, + "loss": 0.0003, + "step": 9327 + }, + { + "epoch": 4.24385805277525, + "grad_norm": 0.17516747446154912, + "learning_rate": 5.544144811215845e-07, + "loss": 0.0015, + "step": 9328 + }, + { + "epoch": 4.244313011828935, + "grad_norm": 0.053407387325286236, + "learning_rate": 5.537605028869453e-07, + "loss": 0.0006, + "step": 9329 + }, + { + "epoch": 4.244767970882621, + "grad_norm": 0.0790862010567722, + "learning_rate": 5.531068879789192e-07, + "loss": 0.0008, + "step": 9330 + }, + { + "epoch": 4.245222929936306, + "grad_norm": 0.02954864414504281, + "learning_rate": 5.524536364509153e-07, + "loss": 0.0002, + "step": 9331 + }, + { + "epoch": 4.245677888989991, + "grad_norm": 0.05049424345756518, + "learning_rate": 5.518007483563165e-07, + "loss": 0.0004, + "step": 9332 + }, + { + "epoch": 4.246132848043676, + "grad_norm": 0.10101528020228469, + "learning_rate": 5.511482237484722e-07, + "loss": 0.0006, + "step": 9333 + }, + { + "epoch": 4.246587807097361, + "grad_norm": 0.052945242228749474, + "learning_rate": 5.504960626807038e-07, + "loss": 0.0004, + "step": 9334 + }, + { + "epoch": 4.247042766151046, + "grad_norm": 0.03692554971228215, + "learning_rate": 5.49844265206304e-07, + "loss": 0.0001, + "step": 9335 + }, + { + "epoch": 4.247497725204732, + "grad_norm": 0.009485519778121981, + "learning_rate": 5.491928313785344e-07, + "loss": 0.0001, + "step": 9336 + }, + { + "epoch": 4.247952684258417, + "grad_norm": 0.009598598109304232, + "learning_rate": 5.485417612506267e-07, + "loss": 0.0001, + "step": 9337 + }, + { + "epoch": 4.248407643312102, + "grad_norm": 0.02561969510083542, + "learning_rate": 5.478910548757827e-07, + "loss": 0.0001, + "step": 9338 + }, + { + "epoch": 4.248862602365787, + "grad_norm": 0.10708835994655748, + "learning_rate": 5.47240712307176e-07, + "loss": 0.0029, + "step": 9339 + }, + { + "epoch": 4.249317561419472, + "grad_norm": 0.09559653245999736, + "learning_rate": 5.465907335979514e-07, + "loss": 0.001, + "step": 9340 + }, + { + "epoch": 4.249772520473157, + "grad_norm": 0.15467428574057263, + "learning_rate": 5.459411188012198e-07, + "loss": 0.0017, + "step": 9341 + }, + { + "epoch": 4.250227479526843, + "grad_norm": 0.02213983272205055, + "learning_rate": 5.452918679700664e-07, + "loss": 0.0001, + "step": 9342 + }, + { + "epoch": 4.250682438580528, + "grad_norm": 0.09463856127072065, + "learning_rate": 5.446429811575438e-07, + "loss": 0.0006, + "step": 9343 + }, + { + "epoch": 4.251137397634213, + "grad_norm": 0.04324646566908147, + "learning_rate": 5.439944584166756e-07, + "loss": 0.0002, + "step": 9344 + }, + { + "epoch": 4.251592356687898, + "grad_norm": 0.10622226285871803, + "learning_rate": 5.433462998004574e-07, + "loss": 0.0012, + "step": 9345 + }, + { + "epoch": 4.252047315741583, + "grad_norm": 0.005745622212430651, + "learning_rate": 5.426985053618545e-07, + "loss": 0.0, + "step": 9346 + }, + { + "epoch": 4.252502274795268, + "grad_norm": 0.05565024068149903, + "learning_rate": 5.420510751538005e-07, + "loss": 0.0001, + "step": 9347 + }, + { + "epoch": 4.252957233848954, + "grad_norm": 0.12740508911108422, + "learning_rate": 5.414040092292006e-07, + "loss": 0.002, + "step": 9348 + }, + { + "epoch": 4.253412192902639, + "grad_norm": 0.00713767722317854, + "learning_rate": 5.407573076409295e-07, + "loss": 0.0, + "step": 9349 + }, + { + "epoch": 4.253867151956324, + "grad_norm": 0.18049298993801566, + "learning_rate": 5.401109704418339e-07, + "loss": 0.0016, + "step": 9350 + }, + { + "epoch": 4.254322111010009, + "grad_norm": 0.141920957311733, + "learning_rate": 5.3946499768473e-07, + "loss": 0.0024, + "step": 9351 + }, + { + "epoch": 4.254777070063694, + "grad_norm": 0.12093676476602054, + "learning_rate": 5.388193894224014e-07, + "loss": 0.001, + "step": 9352 + }, + { + "epoch": 4.255232029117379, + "grad_norm": 0.02067757886910532, + "learning_rate": 5.381741457076068e-07, + "loss": 0.0001, + "step": 9353 + }, + { + "epoch": 4.255686988171065, + "grad_norm": 0.4797307568601315, + "learning_rate": 5.375292665930703e-07, + "loss": 0.014, + "step": 9354 + }, + { + "epoch": 4.25614194722475, + "grad_norm": 0.006306633377726425, + "learning_rate": 5.368847521314912e-07, + "loss": 0.0, + "step": 9355 + }, + { + "epoch": 4.256596906278435, + "grad_norm": 0.0990989142607219, + "learning_rate": 5.362406023755351e-07, + "loss": 0.001, + "step": 9356 + }, + { + "epoch": 4.25705186533212, + "grad_norm": 0.15494132093358595, + "learning_rate": 5.355968173778386e-07, + "loss": 0.0009, + "step": 9357 + }, + { + "epoch": 4.257506824385805, + "grad_norm": 0.05255021309603676, + "learning_rate": 5.349533971910081e-07, + "loss": 0.0006, + "step": 9358 + }, + { + "epoch": 4.25796178343949, + "grad_norm": 0.16000169484884014, + "learning_rate": 5.343103418676215e-07, + "loss": 0.0018, + "step": 9359 + }, + { + "epoch": 4.258416742493176, + "grad_norm": 0.01837174525114683, + "learning_rate": 5.336676514602285e-07, + "loss": 0.0001, + "step": 9360 + }, + { + "epoch": 4.258871701546861, + "grad_norm": 0.12017248432687933, + "learning_rate": 5.330253260213452e-07, + "loss": 0.0017, + "step": 9361 + }, + { + "epoch": 4.2593266606005455, + "grad_norm": 0.011715700179753999, + "learning_rate": 5.323833656034594e-07, + "loss": 0.0001, + "step": 9362 + }, + { + "epoch": 4.259781619654231, + "grad_norm": 0.040199168037055026, + "learning_rate": 5.317417702590283e-07, + "loss": 0.0004, + "step": 9363 + }, + { + "epoch": 4.260236578707916, + "grad_norm": 0.05866484084764532, + "learning_rate": 5.311005400404828e-07, + "loss": 0.0003, + "step": 9364 + }, + { + "epoch": 4.260691537761602, + "grad_norm": 0.0509454176492725, + "learning_rate": 5.304596750002195e-07, + "loss": 0.0004, + "step": 9365 + }, + { + "epoch": 4.261146496815287, + "grad_norm": 0.005881427861288782, + "learning_rate": 5.298191751906057e-07, + "loss": 0.0, + "step": 9366 + }, + { + "epoch": 4.261601455868972, + "grad_norm": 0.040475060842325425, + "learning_rate": 5.291790406639836e-07, + "loss": 0.0002, + "step": 9367 + }, + { + "epoch": 4.262056414922657, + "grad_norm": 0.03195688298089145, + "learning_rate": 5.285392714726589e-07, + "loss": 0.0002, + "step": 9368 + }, + { + "epoch": 4.262511373976342, + "grad_norm": 0.06749456836920831, + "learning_rate": 5.278998676689129e-07, + "loss": 0.0005, + "step": 9369 + }, + { + "epoch": 4.262966333030027, + "grad_norm": 0.00963296156122002, + "learning_rate": 5.272608293049941e-07, + "loss": 0.0001, + "step": 9370 + }, + { + "epoch": 4.263421292083713, + "grad_norm": 0.13375752970379154, + "learning_rate": 5.266221564331214e-07, + "loss": 0.0013, + "step": 9371 + }, + { + "epoch": 4.263876251137398, + "grad_norm": 0.016385319406306883, + "learning_rate": 5.259838491054836e-07, + "loss": 0.0001, + "step": 9372 + }, + { + "epoch": 4.264331210191083, + "grad_norm": 0.02039640415875035, + "learning_rate": 5.253459073742411e-07, + "loss": 0.0002, + "step": 9373 + }, + { + "epoch": 4.264786169244768, + "grad_norm": 0.15402696514031883, + "learning_rate": 5.247083312915247e-07, + "loss": 0.0006, + "step": 9374 + }, + { + "epoch": 4.265241128298453, + "grad_norm": 0.00758210887343876, + "learning_rate": 5.240711209094335e-07, + "loss": 0.0001, + "step": 9375 + }, + { + "epoch": 4.265696087352138, + "grad_norm": 0.06534292542961342, + "learning_rate": 5.234342762800365e-07, + "loss": 0.0007, + "step": 9376 + }, + { + "epoch": 4.266151046405824, + "grad_norm": 0.08075096057316716, + "learning_rate": 5.227977974553749e-07, + "loss": 0.0006, + "step": 9377 + }, + { + "epoch": 4.266606005459509, + "grad_norm": 0.08130011001507072, + "learning_rate": 5.221616844874577e-07, + "loss": 0.0007, + "step": 9378 + }, + { + "epoch": 4.267060964513194, + "grad_norm": 0.13562851418401212, + "learning_rate": 5.215259374282666e-07, + "loss": 0.0022, + "step": 9379 + }, + { + "epoch": 4.267515923566879, + "grad_norm": 0.11049870894123699, + "learning_rate": 5.2089055632975e-07, + "loss": 0.0004, + "step": 9380 + }, + { + "epoch": 4.267970882620564, + "grad_norm": 0.03203315008822801, + "learning_rate": 5.202555412438309e-07, + "loss": 0.0002, + "step": 9381 + }, + { + "epoch": 4.268425841674249, + "grad_norm": 0.11246568988555525, + "learning_rate": 5.196208922223988e-07, + "loss": 0.0014, + "step": 9382 + }, + { + "epoch": 4.268880800727935, + "grad_norm": 0.0484548824835751, + "learning_rate": 5.189866093173135e-07, + "loss": 0.0004, + "step": 9383 + }, + { + "epoch": 4.26933575978162, + "grad_norm": 0.02306155849820032, + "learning_rate": 5.183526925804067e-07, + "loss": 0.0002, + "step": 9384 + }, + { + "epoch": 4.269790718835305, + "grad_norm": 0.1073189119835409, + "learning_rate": 5.177191420634792e-07, + "loss": 0.0012, + "step": 9385 + }, + { + "epoch": 4.27024567788899, + "grad_norm": 0.02978896684836263, + "learning_rate": 5.170859578183019e-07, + "loss": 0.0002, + "step": 9386 + }, + { + "epoch": 4.270700636942675, + "grad_norm": 0.02872018391760248, + "learning_rate": 5.164531398966138e-07, + "loss": 0.0002, + "step": 9387 + }, + { + "epoch": 4.27115559599636, + "grad_norm": 0.024454689149395675, + "learning_rate": 5.158206883501282e-07, + "loss": 0.0001, + "step": 9388 + }, + { + "epoch": 4.271610555050046, + "grad_norm": 0.309703972484005, + "learning_rate": 5.151886032305265e-07, + "loss": 0.0012, + "step": 9389 + }, + { + "epoch": 4.272065514103731, + "grad_norm": 0.025025046483026826, + "learning_rate": 5.145568845894583e-07, + "loss": 0.0001, + "step": 9390 + }, + { + "epoch": 4.272520473157416, + "grad_norm": 0.160686964743567, + "learning_rate": 5.139255324785458e-07, + "loss": 0.0015, + "step": 9391 + }, + { + "epoch": 4.272975432211101, + "grad_norm": 0.035499169518848406, + "learning_rate": 5.132945469493788e-07, + "loss": 0.0002, + "step": 9392 + }, + { + "epoch": 4.273430391264786, + "grad_norm": 0.20541328876179651, + "learning_rate": 5.126639280535211e-07, + "loss": 0.0016, + "step": 9393 + }, + { + "epoch": 4.273885350318471, + "grad_norm": 0.010777903427906499, + "learning_rate": 5.12033675842501e-07, + "loss": 0.0001, + "step": 9394 + }, + { + "epoch": 4.274340309372157, + "grad_norm": 0.11456487271800445, + "learning_rate": 5.114037903678227e-07, + "loss": 0.0008, + "step": 9395 + }, + { + "epoch": 4.274795268425842, + "grad_norm": 0.037896546187512016, + "learning_rate": 5.107742716809566e-07, + "loss": 0.0002, + "step": 9396 + }, + { + "epoch": 4.2752502274795265, + "grad_norm": 0.03162816545617232, + "learning_rate": 5.101451198333423e-07, + "loss": 0.0002, + "step": 9397 + }, + { + "epoch": 4.275705186533212, + "grad_norm": 0.08485308349316291, + "learning_rate": 5.095163348763943e-07, + "loss": 0.0007, + "step": 9398 + }, + { + "epoch": 4.276160145586897, + "grad_norm": 0.10601461335791529, + "learning_rate": 5.088879168614918e-07, + "loss": 0.0014, + "step": 9399 + }, + { + "epoch": 4.276615104640582, + "grad_norm": 0.04136955354817777, + "learning_rate": 5.082598658399879e-07, + "loss": 0.0002, + "step": 9400 + }, + { + "epoch": 4.277070063694268, + "grad_norm": 0.18810748326277346, + "learning_rate": 5.076321818632018e-07, + "loss": 0.0039, + "step": 9401 + }, + { + "epoch": 4.277525022747953, + "grad_norm": 0.09148530578537371, + "learning_rate": 5.070048649824267e-07, + "loss": 0.0004, + "step": 9402 + }, + { + "epoch": 4.2779799818016375, + "grad_norm": 0.12006026487055753, + "learning_rate": 5.063779152489245e-07, + "loss": 0.0011, + "step": 9403 + }, + { + "epoch": 4.278434940855323, + "grad_norm": 0.011503973504918717, + "learning_rate": 5.057513327139263e-07, + "loss": 0.0001, + "step": 9404 + }, + { + "epoch": 4.278889899909008, + "grad_norm": 0.02451284363597608, + "learning_rate": 5.051251174286331e-07, + "loss": 0.0001, + "step": 9405 + }, + { + "epoch": 4.279344858962693, + "grad_norm": 0.05348024494939322, + "learning_rate": 5.044992694442158e-07, + "loss": 0.0005, + "step": 9406 + }, + { + "epoch": 4.279799818016379, + "grad_norm": 0.02422143268384626, + "learning_rate": 5.038737888118178e-07, + "loss": 0.0002, + "step": 9407 + }, + { + "epoch": 4.280254777070064, + "grad_norm": 0.007810394825914106, + "learning_rate": 5.032486755825484e-07, + "loss": 0.0001, + "step": 9408 + }, + { + "epoch": 4.2807097361237485, + "grad_norm": 0.009611292354112944, + "learning_rate": 5.026239298074909e-07, + "loss": 0.0001, + "step": 9409 + }, + { + "epoch": 4.281164695177434, + "grad_norm": 0.1992134259239629, + "learning_rate": 5.019995515376963e-07, + "loss": 0.0013, + "step": 9410 + }, + { + "epoch": 4.281619654231119, + "grad_norm": 0.03746306288079818, + "learning_rate": 5.01375540824185e-07, + "loss": 0.0002, + "step": 9411 + }, + { + "epoch": 4.282074613284804, + "grad_norm": 0.007442771113152153, + "learning_rate": 5.007518977179482e-07, + "loss": 0.0, + "step": 9412 + }, + { + "epoch": 4.28252957233849, + "grad_norm": 0.07102955563686467, + "learning_rate": 5.001286222699491e-07, + "loss": 0.0004, + "step": 9413 + }, + { + "epoch": 4.282984531392175, + "grad_norm": 0.2658893686714957, + "learning_rate": 4.995057145311172e-07, + "loss": 0.0016, + "step": 9414 + }, + { + "epoch": 4.2834394904458595, + "grad_norm": 0.01212385410953683, + "learning_rate": 4.988831745523537e-07, + "loss": 0.0001, + "step": 9415 + }, + { + "epoch": 4.283894449499545, + "grad_norm": 0.08025179085578654, + "learning_rate": 4.982610023845313e-07, + "loss": 0.0006, + "step": 9416 + }, + { + "epoch": 4.28434940855323, + "grad_norm": 0.03395557345563699, + "learning_rate": 4.976391980784889e-07, + "loss": 0.0002, + "step": 9417 + }, + { + "epoch": 4.284804367606915, + "grad_norm": 0.28009571920375315, + "learning_rate": 4.970177616850397e-07, + "loss": 0.0035, + "step": 9418 + }, + { + "epoch": 4.285259326660601, + "grad_norm": 0.2121134074411285, + "learning_rate": 4.963966932549641e-07, + "loss": 0.0022, + "step": 9419 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.019556036543950005, + "learning_rate": 4.957759928390121e-07, + "loss": 0.0001, + "step": 9420 + }, + { + "epoch": 4.2861692447679705, + "grad_norm": 0.1880748330002612, + "learning_rate": 4.951556604879049e-07, + "loss": 0.0008, + "step": 9421 + }, + { + "epoch": 4.286624203821656, + "grad_norm": 0.11429922284627615, + "learning_rate": 4.945356962523329e-07, + "loss": 0.0013, + "step": 9422 + }, + { + "epoch": 4.287079162875341, + "grad_norm": 0.04548185066711093, + "learning_rate": 4.939161001829579e-07, + "loss": 0.0001, + "step": 9423 + }, + { + "epoch": 4.287534121929026, + "grad_norm": 0.11820784247438874, + "learning_rate": 4.932968723304105e-07, + "loss": 0.0012, + "step": 9424 + }, + { + "epoch": 4.287989080982712, + "grad_norm": 0.12433797446899217, + "learning_rate": 4.926780127452901e-07, + "loss": 0.0016, + "step": 9425 + }, + { + "epoch": 4.288444040036397, + "grad_norm": 0.07678094650993814, + "learning_rate": 4.920595214781671e-07, + "loss": 0.0004, + "step": 9426 + }, + { + "epoch": 4.288898999090081, + "grad_norm": 0.11673110203288319, + "learning_rate": 4.914413985795829e-07, + "loss": 0.0013, + "step": 9427 + }, + { + "epoch": 4.289353958143767, + "grad_norm": 0.15560004974938102, + "learning_rate": 4.908236441000474e-07, + "loss": 0.0024, + "step": 9428 + }, + { + "epoch": 4.289808917197452, + "grad_norm": 0.021967800806110525, + "learning_rate": 4.902062580900396e-07, + "loss": 0.0002, + "step": 9429 + }, + { + "epoch": 4.290263876251138, + "grad_norm": 0.029300951166889445, + "learning_rate": 4.895892406000113e-07, + "loss": 0.0002, + "step": 9430 + }, + { + "epoch": 4.290718835304823, + "grad_norm": 0.26672052180912426, + "learning_rate": 4.889725916803801e-07, + "loss": 0.0013, + "step": 9431 + }, + { + "epoch": 4.2911737943585075, + "grad_norm": 0.14466017685738486, + "learning_rate": 4.883563113815388e-07, + "loss": 0.0014, + "step": 9432 + }, + { + "epoch": 4.291628753412193, + "grad_norm": 0.03234383339019922, + "learning_rate": 4.877403997538443e-07, + "loss": 0.0002, + "step": 9433 + }, + { + "epoch": 4.292083712465878, + "grad_norm": 0.01765892589603697, + "learning_rate": 4.871248568476278e-07, + "loss": 0.0001, + "step": 9434 + }, + { + "epoch": 4.292538671519563, + "grad_norm": 0.030799748763440448, + "learning_rate": 4.865096827131871e-07, + "loss": 0.0003, + "step": 9435 + }, + { + "epoch": 4.292993630573249, + "grad_norm": 0.12289043677660492, + "learning_rate": 4.858948774007921e-07, + "loss": 0.0006, + "step": 9436 + }, + { + "epoch": 4.293448589626934, + "grad_norm": 0.10882075896221208, + "learning_rate": 4.852804409606832e-07, + "loss": 0.0007, + "step": 9437 + }, + { + "epoch": 4.2939035486806185, + "grad_norm": 0.13369377006159008, + "learning_rate": 4.846663734430684e-07, + "loss": 0.0031, + "step": 9438 + }, + { + "epoch": 4.294358507734304, + "grad_norm": 0.2689251459401683, + "learning_rate": 4.840526748981267e-07, + "loss": 0.0026, + "step": 9439 + }, + { + "epoch": 4.294813466787989, + "grad_norm": 0.06324588817919938, + "learning_rate": 4.83439345376005e-07, + "loss": 0.0002, + "step": 9440 + }, + { + "epoch": 4.295268425841674, + "grad_norm": 0.01046168002163692, + "learning_rate": 4.82826384926825e-07, + "loss": 0.0001, + "step": 9441 + }, + { + "epoch": 4.29572338489536, + "grad_norm": 0.3567896872767112, + "learning_rate": 4.822137936006732e-07, + "loss": 0.0043, + "step": 9442 + }, + { + "epoch": 4.296178343949045, + "grad_norm": 0.12613594506668488, + "learning_rate": 4.816015714476074e-07, + "loss": 0.0008, + "step": 9443 + }, + { + "epoch": 4.2966333030027295, + "grad_norm": 0.13948413840303564, + "learning_rate": 4.809897185176566e-07, + "loss": 0.0006, + "step": 9444 + }, + { + "epoch": 4.297088262056415, + "grad_norm": 0.06265948200416833, + "learning_rate": 4.803782348608177e-07, + "loss": 0.0006, + "step": 9445 + }, + { + "epoch": 4.2975432211101, + "grad_norm": 0.010579017652374813, + "learning_rate": 4.797671205270604e-07, + "loss": 0.0001, + "step": 9446 + }, + { + "epoch": 4.297998180163785, + "grad_norm": 0.1152175307054137, + "learning_rate": 4.791563755663203e-07, + "loss": 0.0023, + "step": 9447 + }, + { + "epoch": 4.298453139217471, + "grad_norm": 0.015339585207144988, + "learning_rate": 4.785460000285053e-07, + "loss": 0.0001, + "step": 9448 + }, + { + "epoch": 4.298908098271156, + "grad_norm": 0.005743888658135438, + "learning_rate": 4.779359939634926e-07, + "loss": 0.0, + "step": 9449 + }, + { + "epoch": 4.2993630573248405, + "grad_norm": 0.164605272005598, + "learning_rate": 4.773263574211279e-07, + "loss": 0.0006, + "step": 9450 + }, + { + "epoch": 4.299818016378526, + "grad_norm": 0.05192488266792382, + "learning_rate": 4.7671709045122914e-07, + "loss": 0.0012, + "step": 9451 + }, + { + "epoch": 4.300272975432211, + "grad_norm": 0.34121220992405865, + "learning_rate": 4.761081931035838e-07, + "loss": 0.0059, + "step": 9452 + }, + { + "epoch": 4.300727934485896, + "grad_norm": 0.12996774576180398, + "learning_rate": 4.7549966542794703e-07, + "loss": 0.001, + "step": 9453 + }, + { + "epoch": 4.301182893539582, + "grad_norm": 0.04212948058346271, + "learning_rate": 4.748915074740451e-07, + "loss": 0.0001, + "step": 9454 + }, + { + "epoch": 4.301637852593267, + "grad_norm": 0.06553098037555283, + "learning_rate": 4.7428371929157333e-07, + "loss": 0.0005, + "step": 9455 + }, + { + "epoch": 4.3020928116469515, + "grad_norm": 0.01692617983929504, + "learning_rate": 4.736763009301987e-07, + "loss": 0.0001, + "step": 9456 + }, + { + "epoch": 4.302547770700637, + "grad_norm": 0.0914594123086564, + "learning_rate": 4.730692524395553e-07, + "loss": 0.0009, + "step": 9457 + }, + { + "epoch": 4.303002729754322, + "grad_norm": 0.07401523962304041, + "learning_rate": 4.724625738692501e-07, + "loss": 0.0003, + "step": 9458 + }, + { + "epoch": 4.303457688808007, + "grad_norm": 0.05006230651350633, + "learning_rate": 4.718562652688574e-07, + "loss": 0.0002, + "step": 9459 + }, + { + "epoch": 4.303912647861693, + "grad_norm": 0.12790973820871607, + "learning_rate": 4.7125032668792036e-07, + "loss": 0.001, + "step": 9460 + }, + { + "epoch": 4.304367606915378, + "grad_norm": 0.09872796404226394, + "learning_rate": 4.70644758175956e-07, + "loss": 0.0009, + "step": 9461 + }, + { + "epoch": 4.304822565969062, + "grad_norm": 0.09948775177384377, + "learning_rate": 4.700395597824481e-07, + "loss": 0.0007, + "step": 9462 + }, + { + "epoch": 4.305277525022748, + "grad_norm": 0.19927698689297516, + "learning_rate": 4.6943473155684983e-07, + "loss": 0.0009, + "step": 9463 + }, + { + "epoch": 4.305732484076433, + "grad_norm": 0.03053122579911819, + "learning_rate": 4.6883027354858447e-07, + "loss": 0.0002, + "step": 9464 + }, + { + "epoch": 4.306187443130118, + "grad_norm": 0.3646886361853749, + "learning_rate": 4.6822618580704694e-07, + "loss": 0.0043, + "step": 9465 + }, + { + "epoch": 4.306642402183804, + "grad_norm": 0.04212069404476076, + "learning_rate": 4.676224683816005e-07, + "loss": 0.0005, + "step": 9466 + }, + { + "epoch": 4.3070973612374885, + "grad_norm": 0.1519858996652111, + "learning_rate": 4.6701912132157854e-07, + "loss": 0.0009, + "step": 9467 + }, + { + "epoch": 4.307552320291173, + "grad_norm": 0.020674840563901613, + "learning_rate": 4.664161446762827e-07, + "loss": 0.0002, + "step": 9468 + }, + { + "epoch": 4.308007279344859, + "grad_norm": 0.022163062605841628, + "learning_rate": 4.6581353849498576e-07, + "loss": 0.0001, + "step": 9469 + }, + { + "epoch": 4.308462238398544, + "grad_norm": 0.03481091456227857, + "learning_rate": 4.652113028269306e-07, + "loss": 0.0002, + "step": 9470 + }, + { + "epoch": 4.308917197452229, + "grad_norm": 0.02440523714926293, + "learning_rate": 4.646094377213284e-07, + "loss": 0.0001, + "step": 9471 + }, + { + "epoch": 4.309372156505915, + "grad_norm": 0.18156949652247606, + "learning_rate": 4.640079432273614e-07, + "loss": 0.0013, + "step": 9472 + }, + { + "epoch": 4.3098271155595995, + "grad_norm": 0.014171935112652133, + "learning_rate": 4.6340681939418155e-07, + "loss": 0.0001, + "step": 9473 + }, + { + "epoch": 4.310282074613285, + "grad_norm": 0.10118555549312674, + "learning_rate": 4.628060662709083e-07, + "loss": 0.0005, + "step": 9474 + }, + { + "epoch": 4.31073703366697, + "grad_norm": 0.07053292161657575, + "learning_rate": 4.6220568390663465e-07, + "loss": 0.0006, + "step": 9475 + }, + { + "epoch": 4.311191992720655, + "grad_norm": 0.01780630011510076, + "learning_rate": 4.6160567235041974e-07, + "loss": 0.0001, + "step": 9476 + }, + { + "epoch": 4.311646951774341, + "grad_norm": 0.07682431226079638, + "learning_rate": 4.610060316512943e-07, + "loss": 0.0008, + "step": 9477 + }, + { + "epoch": 4.312101910828026, + "grad_norm": 0.07722225271805593, + "learning_rate": 4.6040676185825696e-07, + "loss": 0.0004, + "step": 9478 + }, + { + "epoch": 4.3125568698817105, + "grad_norm": 0.009418970620694796, + "learning_rate": 4.5980786302027846e-07, + "loss": 0.0001, + "step": 9479 + }, + { + "epoch": 4.313011828935396, + "grad_norm": 0.02565495146216306, + "learning_rate": 4.592093351862992e-07, + "loss": 0.0002, + "step": 9480 + }, + { + "epoch": 4.313466787989081, + "grad_norm": 0.09445978290931722, + "learning_rate": 4.5861117840522664e-07, + "loss": 0.001, + "step": 9481 + }, + { + "epoch": 4.313921747042766, + "grad_norm": 0.012392574447745174, + "learning_rate": 4.5801339272594004e-07, + "loss": 0.0, + "step": 9482 + }, + { + "epoch": 4.314376706096452, + "grad_norm": 0.01611543225774431, + "learning_rate": 4.574159781972876e-07, + "loss": 0.0001, + "step": 9483 + }, + { + "epoch": 4.314831665150137, + "grad_norm": 0.07530056225211533, + "learning_rate": 4.5681893486808625e-07, + "loss": 0.0004, + "step": 9484 + }, + { + "epoch": 4.3152866242038215, + "grad_norm": 0.13969925655438337, + "learning_rate": 4.562222627871249e-07, + "loss": 0.0012, + "step": 9485 + }, + { + "epoch": 4.315741583257507, + "grad_norm": 0.09025074476596753, + "learning_rate": 4.556259620031617e-07, + "loss": 0.0004, + "step": 9486 + }, + { + "epoch": 4.316196542311192, + "grad_norm": 0.03016822732471592, + "learning_rate": 4.550300325649226e-07, + "loss": 0.0003, + "step": 9487 + }, + { + "epoch": 4.316651501364877, + "grad_norm": 0.1167719769355159, + "learning_rate": 4.544344745211038e-07, + "loss": 0.0022, + "step": 9488 + }, + { + "epoch": 4.317106460418563, + "grad_norm": 0.024338125589698348, + "learning_rate": 4.538392879203718e-07, + "loss": 0.0002, + "step": 9489 + }, + { + "epoch": 4.317561419472248, + "grad_norm": 0.023710959654620913, + "learning_rate": 4.5324447281136383e-07, + "loss": 0.0003, + "step": 9490 + }, + { + "epoch": 4.3180163785259325, + "grad_norm": 0.1678662928253239, + "learning_rate": 4.5265002924268443e-07, + "loss": 0.0024, + "step": 9491 + }, + { + "epoch": 4.318471337579618, + "grad_norm": 0.08620100759381095, + "learning_rate": 4.5205595726290795e-07, + "loss": 0.0004, + "step": 9492 + }, + { + "epoch": 4.318926296633303, + "grad_norm": 0.026232198256582, + "learning_rate": 4.5146225692058174e-07, + "loss": 0.0002, + "step": 9493 + }, + { + "epoch": 4.319381255686988, + "grad_norm": 0.05810398380901062, + "learning_rate": 4.5086892826421757e-07, + "loss": 0.0003, + "step": 9494 + }, + { + "epoch": 4.319836214740674, + "grad_norm": 0.030043277667089033, + "learning_rate": 4.502759713423016e-07, + "loss": 0.0001, + "step": 9495 + }, + { + "epoch": 4.320291173794359, + "grad_norm": 0.14353490409139388, + "learning_rate": 4.496833862032873e-07, + "loss": 0.001, + "step": 9496 + }, + { + "epoch": 4.320746132848043, + "grad_norm": 0.2908135803299327, + "learning_rate": 4.4909117289559713e-07, + "loss": 0.002, + "step": 9497 + }, + { + "epoch": 4.321201091901729, + "grad_norm": 0.054794526283025904, + "learning_rate": 4.484993314676239e-07, + "loss": 0.0005, + "step": 9498 + }, + { + "epoch": 4.321656050955414, + "grad_norm": 0.28404728994912926, + "learning_rate": 4.479078619677313e-07, + "loss": 0.0019, + "step": 9499 + }, + { + "epoch": 4.322111010009099, + "grad_norm": 0.0231018922164419, + "learning_rate": 4.4731676444425165e-07, + "loss": 0.0002, + "step": 9500 + }, + { + "epoch": 4.322565969062785, + "grad_norm": 0.002971606315893479, + "learning_rate": 4.467260389454864e-07, + "loss": 0.0, + "step": 9501 + }, + { + "epoch": 4.3230209281164695, + "grad_norm": 0.12189084857917876, + "learning_rate": 4.4613568551970687e-07, + "loss": 0.0012, + "step": 9502 + }, + { + "epoch": 4.323475887170154, + "grad_norm": 0.1379315244153815, + "learning_rate": 4.455457042151529e-07, + "loss": 0.0005, + "step": 9503 + }, + { + "epoch": 4.32393084622384, + "grad_norm": 0.12266870676968793, + "learning_rate": 4.4495609508003747e-07, + "loss": 0.0003, + "step": 9504 + }, + { + "epoch": 4.324385805277525, + "grad_norm": 0.11118893112725846, + "learning_rate": 4.443668581625393e-07, + "loss": 0.0013, + "step": 9505 + }, + { + "epoch": 4.32484076433121, + "grad_norm": 0.10323862197120745, + "learning_rate": 4.4377799351080776e-07, + "loss": 0.001, + "step": 9506 + }, + { + "epoch": 4.325295723384896, + "grad_norm": 0.2674571804087863, + "learning_rate": 4.431895011729637e-07, + "loss": 0.0014, + "step": 9507 + }, + { + "epoch": 4.3257506824385805, + "grad_norm": 0.007617615129750181, + "learning_rate": 4.426013811970942e-07, + "loss": 0.0001, + "step": 9508 + }, + { + "epoch": 4.326205641492265, + "grad_norm": 0.01794300954381809, + "learning_rate": 4.420136336312597e-07, + "loss": 0.0001, + "step": 9509 + }, + { + "epoch": 4.326660600545951, + "grad_norm": 0.18656816292993106, + "learning_rate": 4.414262585234874e-07, + "loss": 0.0035, + "step": 9510 + }, + { + "epoch": 4.327115559599636, + "grad_norm": 0.03864447361427777, + "learning_rate": 4.40839255921775e-07, + "loss": 0.0002, + "step": 9511 + }, + { + "epoch": 4.327570518653321, + "grad_norm": 0.09947989222315545, + "learning_rate": 4.402526258740886e-07, + "loss": 0.0006, + "step": 9512 + }, + { + "epoch": 4.328025477707007, + "grad_norm": 0.017387686072632164, + "learning_rate": 4.396663684283664e-07, + "loss": 0.0001, + "step": 9513 + }, + { + "epoch": 4.3284804367606915, + "grad_norm": 0.041729592064830776, + "learning_rate": 4.3908048363251464e-07, + "loss": 0.0002, + "step": 9514 + }, + { + "epoch": 4.328935395814376, + "grad_norm": 0.017757007796318103, + "learning_rate": 4.384949715344089e-07, + "loss": 0.0001, + "step": 9515 + }, + { + "epoch": 4.329390354868062, + "grad_norm": 0.034981319824325924, + "learning_rate": 4.379098321818948e-07, + "loss": 0.0002, + "step": 9516 + }, + { + "epoch": 4.329845313921747, + "grad_norm": 0.09251019222117542, + "learning_rate": 4.373250656227862e-07, + "loss": 0.0006, + "step": 9517 + }, + { + "epoch": 4.330300272975432, + "grad_norm": 0.046653612926894385, + "learning_rate": 4.367406719048689e-07, + "loss": 0.0005, + "step": 9518 + }, + { + "epoch": 4.330755232029118, + "grad_norm": 0.08693001754714129, + "learning_rate": 4.361566510758963e-07, + "loss": 0.0008, + "step": 9519 + }, + { + "epoch": 4.3312101910828025, + "grad_norm": 0.03530663400392882, + "learning_rate": 4.355730031835914e-07, + "loss": 0.0002, + "step": 9520 + }, + { + "epoch": 4.331665150136487, + "grad_norm": 0.011049055356075959, + "learning_rate": 4.349897282756488e-07, + "loss": 0.0, + "step": 9521 + }, + { + "epoch": 4.332120109190173, + "grad_norm": 0.12178748468935524, + "learning_rate": 4.344068263997303e-07, + "loss": 0.0019, + "step": 9522 + }, + { + "epoch": 4.332575068243858, + "grad_norm": 0.11680242767992842, + "learning_rate": 4.338242976034668e-07, + "loss": 0.0013, + "step": 9523 + }, + { + "epoch": 4.333030027297543, + "grad_norm": 0.07923184615410592, + "learning_rate": 4.3324214193446233e-07, + "loss": 0.0001, + "step": 9524 + }, + { + "epoch": 4.333484986351229, + "grad_norm": 0.1089654569742116, + "learning_rate": 4.326603594402862e-07, + "loss": 0.0012, + "step": 9525 + }, + { + "epoch": 4.3339399454049135, + "grad_norm": 0.05951894803069833, + "learning_rate": 4.3207895016847966e-07, + "loss": 0.0007, + "step": 9526 + }, + { + "epoch": 4.334394904458598, + "grad_norm": 0.23333836884089046, + "learning_rate": 4.3149791416655206e-07, + "loss": 0.0022, + "step": 9527 + }, + { + "epoch": 4.334849863512284, + "grad_norm": 0.009065358724321152, + "learning_rate": 4.309172514819837e-07, + "loss": 0.0, + "step": 9528 + }, + { + "epoch": 4.335304822565969, + "grad_norm": 0.026820224032963576, + "learning_rate": 4.303369621622244e-07, + "loss": 0.0002, + "step": 9529 + }, + { + "epoch": 4.335759781619654, + "grad_norm": 0.003668693652079188, + "learning_rate": 4.297570462546924e-07, + "loss": 0.0, + "step": 9530 + }, + { + "epoch": 4.33621474067334, + "grad_norm": 0.01269806947788099, + "learning_rate": 4.2917750380677583e-07, + "loss": 0.0001, + "step": 9531 + }, + { + "epoch": 4.336669699727024, + "grad_norm": 0.04999190269455574, + "learning_rate": 4.285983348658307e-07, + "loss": 0.0004, + "step": 9532 + }, + { + "epoch": 4.337124658780709, + "grad_norm": 0.015292310527195728, + "learning_rate": 4.280195394791864e-07, + "loss": 0.0001, + "step": 9533 + }, + { + "epoch": 4.337579617834395, + "grad_norm": 0.052062547937813335, + "learning_rate": 4.274411176941373e-07, + "loss": 0.0002, + "step": 9534 + }, + { + "epoch": 4.33803457688808, + "grad_norm": 0.11064949050872513, + "learning_rate": 4.2686306955795173e-07, + "loss": 0.0009, + "step": 9535 + }, + { + "epoch": 4.338489535941765, + "grad_norm": 0.19712257675213457, + "learning_rate": 4.2628539511786417e-07, + "loss": 0.0037, + "step": 9536 + }, + { + "epoch": 4.3389444949954505, + "grad_norm": 0.48844836229017824, + "learning_rate": 4.2570809442107785e-07, + "loss": 0.0026, + "step": 9537 + }, + { + "epoch": 4.339399454049135, + "grad_norm": 0.034426168785234586, + "learning_rate": 4.2513116751477013e-07, + "loss": 0.0002, + "step": 9538 + }, + { + "epoch": 4.339854413102821, + "grad_norm": 0.11093734819230727, + "learning_rate": 4.245546144460838e-07, + "loss": 0.001, + "step": 9539 + }, + { + "epoch": 4.340309372156506, + "grad_norm": 0.011211162420060433, + "learning_rate": 4.2397843526213124e-07, + "loss": 0.0001, + "step": 9540 + }, + { + "epoch": 4.340764331210191, + "grad_norm": 0.036901415893791166, + "learning_rate": 4.2340263000999526e-07, + "loss": 0.0004, + "step": 9541 + }, + { + "epoch": 4.341219290263877, + "grad_norm": 0.14939522086022908, + "learning_rate": 4.228271987367283e-07, + "loss": 0.0018, + "step": 9542 + }, + { + "epoch": 4.3416742493175615, + "grad_norm": 0.010760449335293095, + "learning_rate": 4.222521414893538e-07, + "loss": 0.0001, + "step": 9543 + }, + { + "epoch": 4.342129208371246, + "grad_norm": 0.1323723659924108, + "learning_rate": 4.216774583148608e-07, + "loss": 0.0027, + "step": 9544 + }, + { + "epoch": 4.342584167424932, + "grad_norm": 0.056197498836121025, + "learning_rate": 4.2110314926021024e-07, + "loss": 0.0004, + "step": 9545 + }, + { + "epoch": 4.343039126478617, + "grad_norm": 0.016802289941257622, + "learning_rate": 4.205292143723322e-07, + "loss": 0.0001, + "step": 9546 + }, + { + "epoch": 4.343494085532302, + "grad_norm": 0.013156974205455137, + "learning_rate": 4.199556536981264e-07, + "loss": 0.0001, + "step": 9547 + }, + { + "epoch": 4.343949044585988, + "grad_norm": 0.016014505988141435, + "learning_rate": 4.1938246728445986e-07, + "loss": 0.0002, + "step": 9548 + }, + { + "epoch": 4.3444040036396725, + "grad_norm": 0.031143187808687866, + "learning_rate": 4.1880965517817396e-07, + "loss": 0.0003, + "step": 9549 + }, + { + "epoch": 4.344858962693357, + "grad_norm": 0.04607120566065836, + "learning_rate": 4.18237217426074e-07, + "loss": 0.0003, + "step": 9550 + }, + { + "epoch": 4.345313921747043, + "grad_norm": 0.1270663262246374, + "learning_rate": 4.176651540749371e-07, + "loss": 0.0018, + "step": 9551 + }, + { + "epoch": 4.345768880800728, + "grad_norm": 0.08882594961851505, + "learning_rate": 4.1709346517151084e-07, + "loss": 0.0007, + "step": 9552 + }, + { + "epoch": 4.346223839854413, + "grad_norm": 0.037274770161642, + "learning_rate": 4.165221507625106e-07, + "loss": 0.0004, + "step": 9553 + }, + { + "epoch": 4.346678798908099, + "grad_norm": 0.1558727170497576, + "learning_rate": 4.1595121089462123e-07, + "loss": 0.0033, + "step": 9554 + }, + { + "epoch": 4.3471337579617835, + "grad_norm": 0.1721808385426201, + "learning_rate": 4.1538064561449653e-07, + "loss": 0.0008, + "step": 9555 + }, + { + "epoch": 4.347588717015468, + "grad_norm": 0.12713874106425976, + "learning_rate": 4.148104549687626e-07, + "loss": 0.0006, + "step": 9556 + }, + { + "epoch": 4.348043676069154, + "grad_norm": 0.10572219599084051, + "learning_rate": 4.1424063900401046e-07, + "loss": 0.0016, + "step": 9557 + }, + { + "epoch": 4.348498635122839, + "grad_norm": 0.23264285433577755, + "learning_rate": 4.1367119776680566e-07, + "loss": 0.0037, + "step": 9558 + }, + { + "epoch": 4.348953594176524, + "grad_norm": 0.10998039199353982, + "learning_rate": 4.131021313036787e-07, + "loss": 0.001, + "step": 9559 + }, + { + "epoch": 4.34940855323021, + "grad_norm": 0.046975576352913544, + "learning_rate": 4.1253343966113133e-07, + "loss": 0.0002, + "step": 9560 + }, + { + "epoch": 4.3498635122838945, + "grad_norm": 0.10266679880834646, + "learning_rate": 4.119651228856331e-07, + "loss": 0.0004, + "step": 9561 + }, + { + "epoch": 4.350318471337579, + "grad_norm": 0.026858915138591524, + "learning_rate": 4.113971810236261e-07, + "loss": 0.0003, + "step": 9562 + }, + { + "epoch": 4.350773430391265, + "grad_norm": 0.010133092996935038, + "learning_rate": 4.1082961412152065e-07, + "loss": 0.0001, + "step": 9563 + }, + { + "epoch": 4.35122838944495, + "grad_norm": 0.04774426939198638, + "learning_rate": 4.10262422225694e-07, + "loss": 0.0004, + "step": 9564 + }, + { + "epoch": 4.351683348498635, + "grad_norm": 0.06513242498920833, + "learning_rate": 4.0969560538249574e-07, + "loss": 0.0006, + "step": 9565 + }, + { + "epoch": 4.352138307552321, + "grad_norm": 0.17211349660931294, + "learning_rate": 4.0912916363824165e-07, + "loss": 0.0018, + "step": 9566 + }, + { + "epoch": 4.352593266606005, + "grad_norm": 0.08361707745428784, + "learning_rate": 4.0856309703922124e-07, + "loss": 0.001, + "step": 9567 + }, + { + "epoch": 4.35304822565969, + "grad_norm": 0.08447822346193756, + "learning_rate": 4.0799740563168934e-07, + "loss": 0.0003, + "step": 9568 + }, + { + "epoch": 4.353503184713376, + "grad_norm": 0.03343690160485407, + "learning_rate": 4.074320894618716e-07, + "loss": 0.0002, + "step": 9569 + }, + { + "epoch": 4.353958143767061, + "grad_norm": 0.020594602393960933, + "learning_rate": 4.068671485759651e-07, + "loss": 0.0001, + "step": 9570 + }, + { + "epoch": 4.354413102820746, + "grad_norm": 0.3461345788314648, + "learning_rate": 4.0630258302013115e-07, + "loss": 0.0043, + "step": 9571 + }, + { + "epoch": 4.3548680618744315, + "grad_norm": 0.03684470442623735, + "learning_rate": 4.057383928405062e-07, + "loss": 0.0003, + "step": 9572 + }, + { + "epoch": 4.355323020928116, + "grad_norm": 0.056045711349917225, + "learning_rate": 4.0517457808319225e-07, + "loss": 0.0003, + "step": 9573 + }, + { + "epoch": 4.355777979981801, + "grad_norm": 0.01219136046219526, + "learning_rate": 4.0461113879426197e-07, + "loss": 0.0001, + "step": 9574 + }, + { + "epoch": 4.356232939035487, + "grad_norm": 0.046258575449575595, + "learning_rate": 4.0404807501975617e-07, + "loss": 0.0004, + "step": 9575 + }, + { + "epoch": 4.356687898089172, + "grad_norm": 0.03301448378595439, + "learning_rate": 4.0348538680568595e-07, + "loss": 0.0003, + "step": 9576 + }, + { + "epoch": 4.357142857142857, + "grad_norm": 0.03625878472038226, + "learning_rate": 4.0292307419803333e-07, + "loss": 0.0001, + "step": 9577 + }, + { + "epoch": 4.3575978161965425, + "grad_norm": 0.12908302766485893, + "learning_rate": 4.0236113724274716e-07, + "loss": 0.0014, + "step": 9578 + }, + { + "epoch": 4.358052775250227, + "grad_norm": 0.0314803481001272, + "learning_rate": 4.017995759857457e-07, + "loss": 0.0002, + "step": 9579 + }, + { + "epoch": 4.358507734303912, + "grad_norm": 0.17385132170622852, + "learning_rate": 4.012383904729167e-07, + "loss": 0.0005, + "step": 9580 + }, + { + "epoch": 4.358962693357598, + "grad_norm": 0.17489838147799963, + "learning_rate": 4.0067758075012006e-07, + "loss": 0.0015, + "step": 9581 + }, + { + "epoch": 4.359417652411283, + "grad_norm": 0.04735051412179548, + "learning_rate": 4.001171468631809e-07, + "loss": 0.0002, + "step": 9582 + }, + { + "epoch": 4.359872611464968, + "grad_norm": 0.011593250507184535, + "learning_rate": 3.995570888578942e-07, + "loss": 0.0001, + "step": 9583 + }, + { + "epoch": 4.3603275705186535, + "grad_norm": 0.10384953810570867, + "learning_rate": 3.9899740678002843e-07, + "loss": 0.0014, + "step": 9584 + }, + { + "epoch": 4.360782529572338, + "grad_norm": 0.012969577997066021, + "learning_rate": 3.984381006753152e-07, + "loss": 0.0001, + "step": 9585 + }, + { + "epoch": 4.361237488626024, + "grad_norm": 0.10846099358239839, + "learning_rate": 3.978791705894608e-07, + "loss": 0.0006, + "step": 9586 + }, + { + "epoch": 4.361692447679709, + "grad_norm": 0.2728546015870505, + "learning_rate": 3.9732061656813816e-07, + "loss": 0.0013, + "step": 9587 + }, + { + "epoch": 4.362147406733394, + "grad_norm": 0.04057684681920179, + "learning_rate": 3.9676243865698847e-07, + "loss": 0.0002, + "step": 9588 + }, + { + "epoch": 4.36260236578708, + "grad_norm": 0.053313456372892414, + "learning_rate": 3.962046369016248e-07, + "loss": 0.0003, + "step": 9589 + }, + { + "epoch": 4.3630573248407645, + "grad_norm": 0.08024098869555892, + "learning_rate": 3.956472113476256e-07, + "loss": 0.0007, + "step": 9590 + }, + { + "epoch": 4.363512283894449, + "grad_norm": 0.005280989948720423, + "learning_rate": 3.9509016204054506e-07, + "loss": 0.0, + "step": 9591 + }, + { + "epoch": 4.363967242948135, + "grad_norm": 0.08780626355445023, + "learning_rate": 3.945334890259012e-07, + "loss": 0.0006, + "step": 9592 + }, + { + "epoch": 4.36442220200182, + "grad_norm": 0.0980754369380235, + "learning_rate": 3.93977192349182e-07, + "loss": 0.0007, + "step": 9593 + }, + { + "epoch": 4.364877161055505, + "grad_norm": 0.024347136947112283, + "learning_rate": 3.9342127205584615e-07, + "loss": 0.0002, + "step": 9594 + }, + { + "epoch": 4.365332120109191, + "grad_norm": 0.17287816110303303, + "learning_rate": 3.928657281913201e-07, + "loss": 0.0014, + "step": 9595 + }, + { + "epoch": 4.3657870791628755, + "grad_norm": 0.09864768338881905, + "learning_rate": 3.9231056080100196e-07, + "loss": 0.0006, + "step": 9596 + }, + { + "epoch": 4.36624203821656, + "grad_norm": 0.02510646786256023, + "learning_rate": 3.91755769930256e-07, + "loss": 0.0002, + "step": 9597 + }, + { + "epoch": 4.366696997270246, + "grad_norm": 0.019304989411127894, + "learning_rate": 3.912013556244182e-07, + "loss": 0.0001, + "step": 9598 + }, + { + "epoch": 4.367151956323931, + "grad_norm": 0.017127464710915995, + "learning_rate": 3.9064731792879283e-07, + "loss": 0.0001, + "step": 9599 + }, + { + "epoch": 4.367606915377616, + "grad_norm": 0.08523559660315795, + "learning_rate": 3.9009365688865207e-07, + "loss": 0.0005, + "step": 9600 + }, + { + "epoch": 4.368061874431302, + "grad_norm": 0.019164847643307118, + "learning_rate": 3.8954037254924026e-07, + "loss": 0.0001, + "step": 9601 + }, + { + "epoch": 4.368516833484986, + "grad_norm": 0.11275523122428278, + "learning_rate": 3.88987464955769e-07, + "loss": 0.0016, + "step": 9602 + }, + { + "epoch": 4.368971792538671, + "grad_norm": 0.039825553630519164, + "learning_rate": 3.8843493415341826e-07, + "loss": 0.0001, + "step": 9603 + }, + { + "epoch": 4.369426751592357, + "grad_norm": 0.08636715304684091, + "learning_rate": 3.878827801873386e-07, + "loss": 0.0005, + "step": 9604 + }, + { + "epoch": 4.369881710646042, + "grad_norm": 0.009525632671508556, + "learning_rate": 3.8733100310265e-07, + "loss": 0.0001, + "step": 9605 + }, + { + "epoch": 4.370336669699727, + "grad_norm": 0.05250451534109771, + "learning_rate": 3.8677960294444207e-07, + "loss": 0.0002, + "step": 9606 + }, + { + "epoch": 4.3707916287534125, + "grad_norm": 0.04944772206670779, + "learning_rate": 3.8622857975777195e-07, + "loss": 0.0004, + "step": 9607 + }, + { + "epoch": 4.371246587807097, + "grad_norm": 0.00660304792501937, + "learning_rate": 3.856779335876665e-07, + "loss": 0.0, + "step": 9608 + }, + { + "epoch": 4.371701546860782, + "grad_norm": 0.17920775271069825, + "learning_rate": 3.8512766447912133e-07, + "loss": 0.0009, + "step": 9609 + }, + { + "epoch": 4.372156505914468, + "grad_norm": 0.12288588892681677, + "learning_rate": 3.8457777247710384e-07, + "loss": 0.0008, + "step": 9610 + }, + { + "epoch": 4.372611464968153, + "grad_norm": 0.011675052209256074, + "learning_rate": 3.8402825762654636e-07, + "loss": 0.0001, + "step": 9611 + }, + { + "epoch": 4.373066424021838, + "grad_norm": 0.05153275009335694, + "learning_rate": 3.834791199723559e-07, + "loss": 0.0004, + "step": 9612 + }, + { + "epoch": 4.3735213830755235, + "grad_norm": 0.026925588330581448, + "learning_rate": 3.8293035955940304e-07, + "loss": 0.0002, + "step": 9613 + }, + { + "epoch": 4.373976342129208, + "grad_norm": 0.31727413466743354, + "learning_rate": 3.8238197643252984e-07, + "loss": 0.0009, + "step": 9614 + }, + { + "epoch": 4.374431301182893, + "grad_norm": 0.028318927128566894, + "learning_rate": 3.818339706365498e-07, + "loss": 0.0001, + "step": 9615 + }, + { + "epoch": 4.374886260236579, + "grad_norm": 0.48466827051088585, + "learning_rate": 3.812863422162422e-07, + "loss": 0.004, + "step": 9616 + }, + { + "epoch": 4.375341219290264, + "grad_norm": 0.09230738948824826, + "learning_rate": 3.807390912163561e-07, + "loss": 0.0018, + "step": 9617 + }, + { + "epoch": 4.375796178343949, + "grad_norm": 0.008232451756627379, + "learning_rate": 3.8019221768161087e-07, + "loss": 0.0001, + "step": 9618 + }, + { + "epoch": 4.3762511373976345, + "grad_norm": 0.046883467641014004, + "learning_rate": 3.7964572165669456e-07, + "loss": 0.0004, + "step": 9619 + }, + { + "epoch": 4.376706096451319, + "grad_norm": 0.08010664195395718, + "learning_rate": 3.790996031862654e-07, + "loss": 0.0008, + "step": 9620 + }, + { + "epoch": 4.377161055505004, + "grad_norm": 0.0029841600905793422, + "learning_rate": 3.785538623149493e-07, + "loss": 0.0, + "step": 9621 + }, + { + "epoch": 4.37761601455869, + "grad_norm": 0.14784348278396045, + "learning_rate": 3.7800849908734063e-07, + "loss": 0.0024, + "step": 9622 + }, + { + "epoch": 4.378070973612375, + "grad_norm": 0.19231677265268154, + "learning_rate": 3.7746351354800425e-07, + "loss": 0.0012, + "step": 9623 + }, + { + "epoch": 4.37852593266606, + "grad_norm": 0.11553539412378755, + "learning_rate": 3.769189057414752e-07, + "loss": 0.0012, + "step": 9624 + }, + { + "epoch": 4.3789808917197455, + "grad_norm": 0.10429395319435979, + "learning_rate": 3.763746757122544e-07, + "loss": 0.0003, + "step": 9625 + }, + { + "epoch": 4.37943585077343, + "grad_norm": 0.10200522298621524, + "learning_rate": 3.758308235048158e-07, + "loss": 0.0014, + "step": 9626 + }, + { + "epoch": 4.379890809827115, + "grad_norm": 0.18028201088531204, + "learning_rate": 3.752873491636e-07, + "loss": 0.0019, + "step": 9627 + }, + { + "epoch": 4.380345768880801, + "grad_norm": 0.5178074103618024, + "learning_rate": 3.7474425273301696e-07, + "loss": 0.0028, + "step": 9628 + }, + { + "epoch": 4.380800727934486, + "grad_norm": 0.2714839734938491, + "learning_rate": 3.742015342574451e-07, + "loss": 0.0019, + "step": 9629 + }, + { + "epoch": 4.381255686988171, + "grad_norm": 0.04783280340686421, + "learning_rate": 3.7365919378123507e-07, + "loss": 0.0004, + "step": 9630 + }, + { + "epoch": 4.3817106460418564, + "grad_norm": 0.010833970302822507, + "learning_rate": 3.7311723134870304e-07, + "loss": 0.0001, + "step": 9631 + }, + { + "epoch": 4.382165605095541, + "grad_norm": 0.012692716406864656, + "learning_rate": 3.7257564700413527e-07, + "loss": 0.0001, + "step": 9632 + }, + { + "epoch": 4.382620564149226, + "grad_norm": 0.15486804200847887, + "learning_rate": 3.7203444079178977e-07, + "loss": 0.0008, + "step": 9633 + }, + { + "epoch": 4.383075523202912, + "grad_norm": 0.017057050576178897, + "learning_rate": 3.7149361275588826e-07, + "loss": 0.0001, + "step": 9634 + }, + { + "epoch": 4.383530482256597, + "grad_norm": 0.05739298411511804, + "learning_rate": 3.7095316294062824e-07, + "loss": 0.0003, + "step": 9635 + }, + { + "epoch": 4.383985441310282, + "grad_norm": 0.11215266516873934, + "learning_rate": 3.7041309139017046e-07, + "loss": 0.0016, + "step": 9636 + }, + { + "epoch": 4.384440400363967, + "grad_norm": 0.18286538353234372, + "learning_rate": 3.698733981486485e-07, + "loss": 0.0016, + "step": 9637 + }, + { + "epoch": 4.384895359417652, + "grad_norm": 0.10212499367328176, + "learning_rate": 3.6933408326016164e-07, + "loss": 0.0012, + "step": 9638 + }, + { + "epoch": 4.385350318471337, + "grad_norm": 0.05009482784609614, + "learning_rate": 3.687951467687817e-07, + "loss": 0.0004, + "step": 9639 + }, + { + "epoch": 4.385805277525023, + "grad_norm": 0.15005315396638721, + "learning_rate": 3.6825658871854906e-07, + "loss": 0.0008, + "step": 9640 + }, + { + "epoch": 4.386260236578708, + "grad_norm": 0.048455735934680506, + "learning_rate": 3.677184091534708e-07, + "loss": 0.0004, + "step": 9641 + }, + { + "epoch": 4.386715195632393, + "grad_norm": 0.017641312748139105, + "learning_rate": 3.671806081175255e-07, + "loss": 0.0001, + "step": 9642 + }, + { + "epoch": 4.387170154686078, + "grad_norm": 0.0702345354656347, + "learning_rate": 3.666431856546582e-07, + "loss": 0.0011, + "step": 9643 + }, + { + "epoch": 4.387625113739763, + "grad_norm": 0.043257386507523775, + "learning_rate": 3.6610614180878636e-07, + "loss": 0.0002, + "step": 9644 + }, + { + "epoch": 4.388080072793448, + "grad_norm": 0.024067102843430388, + "learning_rate": 3.6556947662379436e-07, + "loss": 0.0001, + "step": 9645 + }, + { + "epoch": 4.388535031847134, + "grad_norm": 0.013900689815314704, + "learning_rate": 3.65033190143535e-07, + "loss": 0.0001, + "step": 9646 + }, + { + "epoch": 4.388989990900819, + "grad_norm": 0.012122857913547336, + "learning_rate": 3.6449728241183256e-07, + "loss": 0.0001, + "step": 9647 + }, + { + "epoch": 4.389444949954504, + "grad_norm": 0.19466210882837834, + "learning_rate": 3.639617534724782e-07, + "loss": 0.0024, + "step": 9648 + }, + { + "epoch": 4.389899909008189, + "grad_norm": 0.06278658953638261, + "learning_rate": 3.634266033692335e-07, + "loss": 0.0004, + "step": 9649 + }, + { + "epoch": 4.390354868061874, + "grad_norm": 0.006738354979878417, + "learning_rate": 3.6289183214582854e-07, + "loss": 0.0001, + "step": 9650 + }, + { + "epoch": 4.39080982711556, + "grad_norm": 0.1281438172455948, + "learning_rate": 3.623574398459617e-07, + "loss": 0.0007, + "step": 9651 + }, + { + "epoch": 4.391264786169245, + "grad_norm": 0.009725434907386876, + "learning_rate": 3.6182342651330083e-07, + "loss": 0.0, + "step": 9652 + }, + { + "epoch": 4.39171974522293, + "grad_norm": 0.10971134435132887, + "learning_rate": 3.612897921914837e-07, + "loss": 0.0008, + "step": 9653 + }, + { + "epoch": 4.3921747042766155, + "grad_norm": 0.14548375277724887, + "learning_rate": 3.607565369241173e-07, + "loss": 0.0013, + "step": 9654 + }, + { + "epoch": 4.3926296633303, + "grad_norm": 0.05081155951627734, + "learning_rate": 3.60223660754776e-07, + "loss": 0.0001, + "step": 9655 + }, + { + "epoch": 4.393084622383985, + "grad_norm": 0.30777792060884823, + "learning_rate": 3.596911637270045e-07, + "loss": 0.0053, + "step": 9656 + }, + { + "epoch": 4.393539581437671, + "grad_norm": 0.03905229247285689, + "learning_rate": 3.591590458843142e-07, + "loss": 0.0002, + "step": 9657 + }, + { + "epoch": 4.393994540491356, + "grad_norm": 0.23688943423319317, + "learning_rate": 3.586273072701901e-07, + "loss": 0.0031, + "step": 9658 + }, + { + "epoch": 4.394449499545041, + "grad_norm": 0.15961072087549197, + "learning_rate": 3.580959479280821e-07, + "loss": 0.0035, + "step": 9659 + }, + { + "epoch": 4.3949044585987265, + "grad_norm": 0.3084126059174271, + "learning_rate": 3.575649679014098e-07, + "loss": 0.0007, + "step": 9660 + }, + { + "epoch": 4.395359417652411, + "grad_norm": 0.07068959984525587, + "learning_rate": 3.570343672335641e-07, + "loss": 0.0004, + "step": 9661 + }, + { + "epoch": 4.395814376706096, + "grad_norm": 0.01760097716547311, + "learning_rate": 3.5650414596790137e-07, + "loss": 0.0001, + "step": 9662 + }, + { + "epoch": 4.396269335759782, + "grad_norm": 0.017771311226987968, + "learning_rate": 3.559743041477509e-07, + "loss": 0.0001, + "step": 9663 + }, + { + "epoch": 4.396724294813467, + "grad_norm": 0.08812991679779757, + "learning_rate": 3.5544484181640804e-07, + "loss": 0.0008, + "step": 9664 + }, + { + "epoch": 4.397179253867152, + "grad_norm": 0.09367364024309256, + "learning_rate": 3.5491575901713815e-07, + "loss": 0.0008, + "step": 9665 + }, + { + "epoch": 4.3976342129208374, + "grad_norm": 0.039712037928291505, + "learning_rate": 3.5438705579317557e-07, + "loss": 0.0003, + "step": 9666 + }, + { + "epoch": 4.398089171974522, + "grad_norm": 0.1434547622828054, + "learning_rate": 3.538587321877224e-07, + "loss": 0.0014, + "step": 9667 + }, + { + "epoch": 4.398544131028207, + "grad_norm": 0.0647965971418509, + "learning_rate": 3.5333078824395237e-07, + "loss": 0.0007, + "step": 9668 + }, + { + "epoch": 4.398999090081893, + "grad_norm": 0.12102628074073363, + "learning_rate": 3.528032240050061e-07, + "loss": 0.0007, + "step": 9669 + }, + { + "epoch": 4.399454049135578, + "grad_norm": 0.20675964229726468, + "learning_rate": 3.522760395139946e-07, + "loss": 0.0008, + "step": 9670 + }, + { + "epoch": 4.399909008189263, + "grad_norm": 0.014355203381870482, + "learning_rate": 3.517492348139956e-07, + "loss": 0.0001, + "step": 9671 + }, + { + "epoch": 4.400363967242948, + "grad_norm": 0.18179559570037063, + "learning_rate": 3.5122280994805747e-07, + "loss": 0.0011, + "step": 9672 + }, + { + "epoch": 4.400818926296633, + "grad_norm": 0.006988816690587801, + "learning_rate": 3.50696764959198e-07, + "loss": 0.0, + "step": 9673 + }, + { + "epoch": 4.401273885350318, + "grad_norm": 0.06264685905739664, + "learning_rate": 3.5017109989040234e-07, + "loss": 0.0006, + "step": 9674 + }, + { + "epoch": 4.401728844404004, + "grad_norm": 0.07370856029500973, + "learning_rate": 3.4964581478462654e-07, + "loss": 0.0005, + "step": 9675 + }, + { + "epoch": 4.402183803457689, + "grad_norm": 0.15611045884952449, + "learning_rate": 3.4912090968479417e-07, + "loss": 0.0029, + "step": 9676 + }, + { + "epoch": 4.402638762511374, + "grad_norm": 0.06902600631497491, + "learning_rate": 3.4859638463379695e-07, + "loss": 0.0007, + "step": 9677 + }, + { + "epoch": 4.403093721565059, + "grad_norm": 0.2762981361753823, + "learning_rate": 3.480722396744984e-07, + "loss": 0.0023, + "step": 9678 + }, + { + "epoch": 4.403548680618744, + "grad_norm": 0.008827958162164893, + "learning_rate": 3.4754847484972877e-07, + "loss": 0.0001, + "step": 9679 + }, + { + "epoch": 4.404003639672429, + "grad_norm": 0.12605560627352566, + "learning_rate": 3.47025090202287e-07, + "loss": 0.0008, + "step": 9680 + }, + { + "epoch": 4.404458598726115, + "grad_norm": 0.15338898186435398, + "learning_rate": 3.4650208577494185e-07, + "loss": 0.0007, + "step": 9681 + }, + { + "epoch": 4.4049135577798, + "grad_norm": 0.06408781723482458, + "learning_rate": 3.4597946161043063e-07, + "loss": 0.0007, + "step": 9682 + }, + { + "epoch": 4.405368516833485, + "grad_norm": 0.03052030364532293, + "learning_rate": 3.45457217751462e-07, + "loss": 0.0002, + "step": 9683 + }, + { + "epoch": 4.40582347588717, + "grad_norm": 0.15679009916566664, + "learning_rate": 3.4493535424070913e-07, + "loss": 0.0011, + "step": 9684 + }, + { + "epoch": 4.406278434940855, + "grad_norm": 0.031935211938976964, + "learning_rate": 3.4441387112081724e-07, + "loss": 0.0002, + "step": 9685 + }, + { + "epoch": 4.40673339399454, + "grad_norm": 0.04184156567991399, + "learning_rate": 3.43892768434399e-07, + "loss": 0.0004, + "step": 9686 + }, + { + "epoch": 4.407188353048226, + "grad_norm": 0.005955905539000646, + "learning_rate": 3.433720462240375e-07, + "loss": 0.0, + "step": 9687 + }, + { + "epoch": 4.407643312101911, + "grad_norm": 0.051772575472979976, + "learning_rate": 3.4285170453228214e-07, + "loss": 0.0004, + "step": 9688 + }, + { + "epoch": 4.408098271155596, + "grad_norm": 0.05526077577904048, + "learning_rate": 3.4233174340165486e-07, + "loss": 0.0003, + "step": 9689 + }, + { + "epoch": 4.408553230209281, + "grad_norm": 0.12090852165042458, + "learning_rate": 3.41812162874644e-07, + "loss": 0.0013, + "step": 9690 + }, + { + "epoch": 4.409008189262966, + "grad_norm": 0.057057035027325675, + "learning_rate": 3.412929629937062e-07, + "loss": 0.0002, + "step": 9691 + }, + { + "epoch": 4.409463148316651, + "grad_norm": 0.15785618181098338, + "learning_rate": 3.407741438012691e-07, + "loss": 0.0031, + "step": 9692 + }, + { + "epoch": 4.409918107370337, + "grad_norm": 0.06391119199517106, + "learning_rate": 3.4025570533972876e-07, + "loss": 0.0005, + "step": 9693 + }, + { + "epoch": 4.410373066424022, + "grad_norm": 0.03104074143741479, + "learning_rate": 3.397376476514486e-07, + "loss": 0.0002, + "step": 9694 + }, + { + "epoch": 4.4108280254777075, + "grad_norm": 0.038138633398526085, + "learning_rate": 3.392199707787619e-07, + "loss": 0.0002, + "step": 9695 + }, + { + "epoch": 4.411282984531392, + "grad_norm": 0.01833962905184601, + "learning_rate": 3.38702674763971e-07, + "loss": 0.0002, + "step": 9696 + }, + { + "epoch": 4.411737943585077, + "grad_norm": 0.019255456029010715, + "learning_rate": 3.3818575964934764e-07, + "loss": 0.0001, + "step": 9697 + }, + { + "epoch": 4.412192902638763, + "grad_norm": 0.05634132477630591, + "learning_rate": 3.376692254771324e-07, + "loss": 0.0011, + "step": 9698 + }, + { + "epoch": 4.412647861692448, + "grad_norm": 0.17832163922167318, + "learning_rate": 3.371530722895322e-07, + "loss": 0.0025, + "step": 9699 + }, + { + "epoch": 4.413102820746133, + "grad_norm": 0.19601540817641203, + "learning_rate": 3.3663730012872654e-07, + "loss": 0.0028, + "step": 9700 + }, + { + "epoch": 4.4135577797998184, + "grad_norm": 0.03220956884829947, + "learning_rate": 3.3612190903686e-07, + "loss": 0.0002, + "step": 9701 + }, + { + "epoch": 4.414012738853503, + "grad_norm": 0.18995225346002148, + "learning_rate": 3.35606899056049e-07, + "loss": 0.0012, + "step": 9702 + }, + { + "epoch": 4.414467697907188, + "grad_norm": 0.12808194422439712, + "learning_rate": 3.3509227022837876e-07, + "loss": 0.0016, + "step": 9703 + }, + { + "epoch": 4.414922656960874, + "grad_norm": 0.0900081386892947, + "learning_rate": 3.3457802259590165e-07, + "loss": 0.0008, + "step": 9704 + }, + { + "epoch": 4.415377616014559, + "grad_norm": 7.8541099720937915, + "learning_rate": 3.3406415620064024e-07, + "loss": 0.0555, + "step": 9705 + }, + { + "epoch": 4.415832575068244, + "grad_norm": 0.009981937382365271, + "learning_rate": 3.335506710845837e-07, + "loss": 0.0001, + "step": 9706 + }, + { + "epoch": 4.416287534121929, + "grad_norm": 0.11658179895853803, + "learning_rate": 3.330375672896935e-07, + "loss": 0.0005, + "step": 9707 + }, + { + "epoch": 4.416742493175614, + "grad_norm": 0.21280310225032892, + "learning_rate": 3.3252484485789716e-07, + "loss": 0.0014, + "step": 9708 + }, + { + "epoch": 4.417197452229299, + "grad_norm": 0.02765464238733074, + "learning_rate": 3.320125038310923e-07, + "loss": 0.0002, + "step": 9709 + }, + { + "epoch": 4.417652411282985, + "grad_norm": 0.00996669636979424, + "learning_rate": 3.315005442511454e-07, + "loss": 0.0001, + "step": 9710 + }, + { + "epoch": 4.41810737033667, + "grad_norm": 0.05063264672279933, + "learning_rate": 3.3098896615989085e-07, + "loss": 0.0004, + "step": 9711 + }, + { + "epoch": 4.418562329390355, + "grad_norm": 0.07702474173078007, + "learning_rate": 3.304777695991335e-07, + "loss": 0.0002, + "step": 9712 + }, + { + "epoch": 4.41901728844404, + "grad_norm": 0.026721926559433458, + "learning_rate": 3.299669546106454e-07, + "loss": 0.0002, + "step": 9713 + }, + { + "epoch": 4.419472247497725, + "grad_norm": 0.010899089291708826, + "learning_rate": 3.294565212361683e-07, + "loss": 0.0001, + "step": 9714 + }, + { + "epoch": 4.41992720655141, + "grad_norm": 0.03366531450123238, + "learning_rate": 3.289464695174111e-07, + "loss": 0.0002, + "step": 9715 + }, + { + "epoch": 4.420382165605096, + "grad_norm": 0.009923945204701358, + "learning_rate": 3.2843679949605466e-07, + "loss": 0.0, + "step": 9716 + }, + { + "epoch": 4.420837124658781, + "grad_norm": 0.06063481948797327, + "learning_rate": 3.27927511213747e-07, + "loss": 0.0002, + "step": 9717 + }, + { + "epoch": 4.421292083712466, + "grad_norm": 0.024555764257756893, + "learning_rate": 3.2741860471210364e-07, + "loss": 0.0002, + "step": 9718 + }, + { + "epoch": 4.421747042766151, + "grad_norm": 0.4021708913069274, + "learning_rate": 3.269100800327113e-07, + "loss": 0.0048, + "step": 9719 + }, + { + "epoch": 4.422202001819836, + "grad_norm": 0.28517867811205955, + "learning_rate": 3.2640193721712286e-07, + "loss": 0.0027, + "step": 9720 + }, + { + "epoch": 4.422656960873521, + "grad_norm": 0.3135499781605325, + "learning_rate": 3.258941763068635e-07, + "loss": 0.0041, + "step": 9721 + }, + { + "epoch": 4.423111919927207, + "grad_norm": 0.07550189281238037, + "learning_rate": 3.2538679734342327e-07, + "loss": 0.0004, + "step": 9722 + }, + { + "epoch": 4.423566878980892, + "grad_norm": 0.18851424145062828, + "learning_rate": 3.248798003682629e-07, + "loss": 0.0029, + "step": 9723 + }, + { + "epoch": 4.424021838034577, + "grad_norm": 0.07922482042576807, + "learning_rate": 3.243731854228138e-07, + "loss": 0.0008, + "step": 9724 + }, + { + "epoch": 4.424476797088262, + "grad_norm": 0.020516938144610152, + "learning_rate": 3.238669525484722e-07, + "loss": 0.0001, + "step": 9725 + }, + { + "epoch": 4.424931756141947, + "grad_norm": 0.2950157139685972, + "learning_rate": 3.2336110178660676e-07, + "loss": 0.0038, + "step": 9726 + }, + { + "epoch": 4.425386715195632, + "grad_norm": 0.21229129188726475, + "learning_rate": 3.2285563317855207e-07, + "loss": 0.0015, + "step": 9727 + }, + { + "epoch": 4.425841674249318, + "grad_norm": 0.0949715281069857, + "learning_rate": 3.223505467656135e-07, + "loss": 0.0013, + "step": 9728 + }, + { + "epoch": 4.426296633303003, + "grad_norm": 0.009500179798080936, + "learning_rate": 3.218458425890636e-07, + "loss": 0.0, + "step": 9729 + }, + { + "epoch": 4.426751592356688, + "grad_norm": 0.20256127931545861, + "learning_rate": 3.213415206901449e-07, + "loss": 0.0023, + "step": 9730 + }, + { + "epoch": 4.427206551410373, + "grad_norm": 0.09059227155147881, + "learning_rate": 3.2083758111006946e-07, + "loss": 0.0003, + "step": 9731 + }, + { + "epoch": 4.427661510464058, + "grad_norm": 0.043626402178712116, + "learning_rate": 3.20334023890016e-07, + "loss": 0.0004, + "step": 9732 + }, + { + "epoch": 4.428116469517743, + "grad_norm": 0.06265446526543357, + "learning_rate": 3.198308490711327e-07, + "loss": 0.0005, + "step": 9733 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.40477377674568726, + "learning_rate": 3.1932805669453724e-07, + "loss": 0.0027, + "step": 9734 + }, + { + "epoch": 4.429026387625114, + "grad_norm": 0.14063853884218894, + "learning_rate": 3.18825646801314e-07, + "loss": 0.0005, + "step": 9735 + }, + { + "epoch": 4.429481346678799, + "grad_norm": 0.07899892714086733, + "learning_rate": 3.183236194325201e-07, + "loss": 0.0005, + "step": 9736 + }, + { + "epoch": 4.429936305732484, + "grad_norm": 0.006900169526151869, + "learning_rate": 3.1782197462917664e-07, + "loss": 0.0, + "step": 9737 + }, + { + "epoch": 4.430391264786169, + "grad_norm": 0.017784397873982963, + "learning_rate": 3.17320712432278e-07, + "loss": 0.0002, + "step": 9738 + }, + { + "epoch": 4.430846223839854, + "grad_norm": 0.006386438345564333, + "learning_rate": 3.1681983288278375e-07, + "loss": 0.0, + "step": 9739 + }, + { + "epoch": 4.43130118289354, + "grad_norm": 0.06859236760882528, + "learning_rate": 3.1631933602162326e-07, + "loss": 0.0004, + "step": 9740 + }, + { + "epoch": 4.431756141947225, + "grad_norm": 0.05124514038916852, + "learning_rate": 3.1581922188969605e-07, + "loss": 0.0003, + "step": 9741 + }, + { + "epoch": 4.4322111010009095, + "grad_norm": 0.17893493472605435, + "learning_rate": 3.153194905278678e-07, + "loss": 0.0015, + "step": 9742 + }, + { + "epoch": 4.432666060054595, + "grad_norm": 0.012273655532279657, + "learning_rate": 3.1482014197697584e-07, + "loss": 0.0001, + "step": 9743 + }, + { + "epoch": 4.43312101910828, + "grad_norm": 0.008279159542419746, + "learning_rate": 3.143211762778226e-07, + "loss": 0.0, + "step": 9744 + }, + { + "epoch": 4.433575978161965, + "grad_norm": 0.17784125720879143, + "learning_rate": 3.1382259347118195e-07, + "loss": 0.0037, + "step": 9745 + }, + { + "epoch": 4.434030937215651, + "grad_norm": 0.027285914082301757, + "learning_rate": 3.133243935977981e-07, + "loss": 0.0001, + "step": 9746 + }, + { + "epoch": 4.434485896269336, + "grad_norm": 0.3335437766838001, + "learning_rate": 3.1282657669837956e-07, + "loss": 0.0036, + "step": 9747 + }, + { + "epoch": 4.4349408553230205, + "grad_norm": 0.17411214302792816, + "learning_rate": 3.1232914281360607e-07, + "loss": 0.0028, + "step": 9748 + }, + { + "epoch": 4.435395814376706, + "grad_norm": 0.01583613165235244, + "learning_rate": 3.1183209198412446e-07, + "loss": 0.0001, + "step": 9749 + }, + { + "epoch": 4.435850773430391, + "grad_norm": 0.009105500980011135, + "learning_rate": 3.1133542425055394e-07, + "loss": 0.0001, + "step": 9750 + }, + { + "epoch": 4.436305732484076, + "grad_norm": 0.353614607238613, + "learning_rate": 3.1083913965347824e-07, + "loss": 0.0061, + "step": 9751 + }, + { + "epoch": 4.436760691537762, + "grad_norm": 0.008776523309563642, + "learning_rate": 3.1034323823345256e-07, + "loss": 0.0001, + "step": 9752 + }, + { + "epoch": 4.437215650591447, + "grad_norm": 0.016079597583139957, + "learning_rate": 3.0984772003099905e-07, + "loss": 0.0001, + "step": 9753 + }, + { + "epoch": 4.4376706096451315, + "grad_norm": 0.1863709299910358, + "learning_rate": 3.09352585086608e-07, + "loss": 0.0015, + "step": 9754 + }, + { + "epoch": 4.438125568698817, + "grad_norm": 0.037387267554902175, + "learning_rate": 3.088578334407427e-07, + "loss": 0.0003, + "step": 9755 + }, + { + "epoch": 4.438580527752502, + "grad_norm": 0.4293941149466844, + "learning_rate": 3.0836346513382963e-07, + "loss": 0.002, + "step": 9756 + }, + { + "epoch": 4.439035486806187, + "grad_norm": 0.016602409146596518, + "learning_rate": 3.0786948020626706e-07, + "loss": 0.0001, + "step": 9757 + }, + { + "epoch": 4.439490445859873, + "grad_norm": 0.2978140116003094, + "learning_rate": 3.0737587869842045e-07, + "loss": 0.003, + "step": 9758 + }, + { + "epoch": 4.439945404913558, + "grad_norm": 0.04684090690398713, + "learning_rate": 3.0688266065062535e-07, + "loss": 0.0002, + "step": 9759 + }, + { + "epoch": 4.440400363967243, + "grad_norm": 0.09876346914508588, + "learning_rate": 3.063898261031856e-07, + "loss": 0.001, + "step": 9760 + }, + { + "epoch": 4.440855323020928, + "grad_norm": 0.009139626111730102, + "learning_rate": 3.058973750963734e-07, + "loss": 0.0001, + "step": 9761 + }, + { + "epoch": 4.441310282074613, + "grad_norm": 0.16439642644320687, + "learning_rate": 3.0540530767042944e-07, + "loss": 0.0013, + "step": 9762 + }, + { + "epoch": 4.441765241128299, + "grad_norm": 0.022623478061656294, + "learning_rate": 3.0491362386556254e-07, + "loss": 0.0002, + "step": 9763 + }, + { + "epoch": 4.442220200181984, + "grad_norm": 0.09362669926828819, + "learning_rate": 3.044223237219518e-07, + "loss": 0.0005, + "step": 9764 + }, + { + "epoch": 4.442675159235669, + "grad_norm": 0.0885582827009058, + "learning_rate": 3.0393140727974334e-07, + "loss": 0.0004, + "step": 9765 + }, + { + "epoch": 4.443130118289354, + "grad_norm": 0.22615170263271775, + "learning_rate": 3.0344087457905344e-07, + "loss": 0.0013, + "step": 9766 + }, + { + "epoch": 4.443585077343039, + "grad_norm": 0.017767123213202212, + "learning_rate": 3.029507256599662e-07, + "loss": 0.0001, + "step": 9767 + }, + { + "epoch": 4.444040036396724, + "grad_norm": 0.01997416799800844, + "learning_rate": 3.024609605625328e-07, + "loss": 0.0001, + "step": 9768 + }, + { + "epoch": 4.44449499545041, + "grad_norm": 0.11262580844406248, + "learning_rate": 3.019715793267769e-07, + "loss": 0.0013, + "step": 9769 + }, + { + "epoch": 4.444949954504095, + "grad_norm": 0.006793420981161215, + "learning_rate": 3.01482581992687e-07, + "loss": 0.0, + "step": 9770 + }, + { + "epoch": 4.44540491355778, + "grad_norm": 0.07252150471494055, + "learning_rate": 3.009939686002228e-07, + "loss": 0.0007, + "step": 9771 + }, + { + "epoch": 4.445859872611465, + "grad_norm": 0.0687106592733224, + "learning_rate": 3.0050573918930957e-07, + "loss": 0.0006, + "step": 9772 + }, + { + "epoch": 4.44631483166515, + "grad_norm": 0.015071387322499517, + "learning_rate": 3.0001789379984547e-07, + "loss": 0.0001, + "step": 9773 + }, + { + "epoch": 4.446769790718835, + "grad_norm": 0.08161484190804871, + "learning_rate": 2.9953043247169355e-07, + "loss": 0.0006, + "step": 9774 + }, + { + "epoch": 4.447224749772521, + "grad_norm": 0.2128520990957493, + "learning_rate": 2.9904335524468866e-07, + "loss": 0.0039, + "step": 9775 + }, + { + "epoch": 4.447679708826206, + "grad_norm": 0.08737768827246814, + "learning_rate": 2.985566621586311e-07, + "loss": 0.0011, + "step": 9776 + }, + { + "epoch": 4.4481346678798905, + "grad_norm": 0.39062062527920377, + "learning_rate": 2.9807035325329136e-07, + "loss": 0.0004, + "step": 9777 + }, + { + "epoch": 4.448589626933576, + "grad_norm": 0.0054942624799455, + "learning_rate": 2.975844285684082e-07, + "loss": 0.0, + "step": 9778 + }, + { + "epoch": 4.449044585987261, + "grad_norm": 0.1983831108843659, + "learning_rate": 2.9709888814368926e-07, + "loss": 0.0044, + "step": 9779 + }, + { + "epoch": 4.449499545040946, + "grad_norm": 0.012994031604128303, + "learning_rate": 2.966137320188123e-07, + "loss": 0.0001, + "step": 9780 + }, + { + "epoch": 4.449954504094632, + "grad_norm": 0.036988204299509365, + "learning_rate": 2.961289602334205e-07, + "loss": 0.0002, + "step": 9781 + }, + { + "epoch": 4.450409463148317, + "grad_norm": 0.13197265729278673, + "learning_rate": 2.9564457282712787e-07, + "loss": 0.0019, + "step": 9782 + }, + { + "epoch": 4.4508644222020015, + "grad_norm": 0.039774391063334864, + "learning_rate": 2.951605698395149e-07, + "loss": 0.0002, + "step": 9783 + }, + { + "epoch": 4.451319381255687, + "grad_norm": 0.006074030432539295, + "learning_rate": 2.946769513101344e-07, + "loss": 0.0, + "step": 9784 + }, + { + "epoch": 4.451774340309372, + "grad_norm": 0.3046659119210393, + "learning_rate": 2.941937172785042e-07, + "loss": 0.0047, + "step": 9785 + }, + { + "epoch": 4.452229299363057, + "grad_norm": 0.04319090795977366, + "learning_rate": 2.937108677841116e-07, + "loss": 0.0003, + "step": 9786 + }, + { + "epoch": 4.452684258416743, + "grad_norm": 0.10283298670020563, + "learning_rate": 2.932284028664145e-07, + "loss": 0.0019, + "step": 9787 + }, + { + "epoch": 4.453139217470428, + "grad_norm": 0.00943886261722189, + "learning_rate": 2.9274632256483526e-07, + "loss": 0.0001, + "step": 9788 + }, + { + "epoch": 4.4535941765241125, + "grad_norm": 0.08254570558061936, + "learning_rate": 2.922646269187701e-07, + "loss": 0.0005, + "step": 9789 + }, + { + "epoch": 4.454049135577798, + "grad_norm": 0.017799630563506644, + "learning_rate": 2.917833159675792e-07, + "loss": 0.0001, + "step": 9790 + }, + { + "epoch": 4.454504094631483, + "grad_norm": 0.06126000346464943, + "learning_rate": 2.91302389750594e-07, + "loss": 0.0005, + "step": 9791 + }, + { + "epoch": 4.454959053685168, + "grad_norm": 0.004720162083997436, + "learning_rate": 2.908218483071124e-07, + "loss": 0.0, + "step": 9792 + }, + { + "epoch": 4.455414012738854, + "grad_norm": 0.019631397622341947, + "learning_rate": 2.903416916764029e-07, + "loss": 0.0001, + "step": 9793 + }, + { + "epoch": 4.455868971792539, + "grad_norm": 0.01802349236863066, + "learning_rate": 2.898619198977026e-07, + "loss": 0.0002, + "step": 9794 + }, + { + "epoch": 4.4563239308462235, + "grad_norm": 0.43256760719446097, + "learning_rate": 2.893825330102151e-07, + "loss": 0.0211, + "step": 9795 + }, + { + "epoch": 4.456778889899909, + "grad_norm": 0.025298544497720836, + "learning_rate": 2.889035310531141e-07, + "loss": 0.0002, + "step": 9796 + }, + { + "epoch": 4.457233848953594, + "grad_norm": 0.06483438654538458, + "learning_rate": 2.8842491406554094e-07, + "loss": 0.0003, + "step": 9797 + }, + { + "epoch": 4.457688808007279, + "grad_norm": 0.16805962708227917, + "learning_rate": 2.879466820866067e-07, + "loss": 0.0013, + "step": 9798 + }, + { + "epoch": 4.458143767060965, + "grad_norm": 0.02271715470086333, + "learning_rate": 2.874688351553906e-07, + "loss": 0.0002, + "step": 9799 + }, + { + "epoch": 4.45859872611465, + "grad_norm": 0.20073293686823662, + "learning_rate": 2.869913733109386e-07, + "loss": 0.0017, + "step": 9800 + }, + { + "epoch": 4.4590536851683344, + "grad_norm": 0.14380132232382806, + "learning_rate": 2.8651429659226906e-07, + "loss": 0.0012, + "step": 9801 + }, + { + "epoch": 4.45950864422202, + "grad_norm": 0.012534494839139832, + "learning_rate": 2.8603760503836454e-07, + "loss": 0.0001, + "step": 9802 + }, + { + "epoch": 4.459963603275705, + "grad_norm": 0.038842044308688276, + "learning_rate": 2.8556129868817893e-07, + "loss": 0.0001, + "step": 9803 + }, + { + "epoch": 4.460418562329391, + "grad_norm": 1.0275785904236199, + "learning_rate": 2.850853775806345e-07, + "loss": 0.003, + "step": 9804 + }, + { + "epoch": 4.460873521383076, + "grad_norm": 0.30648582761010784, + "learning_rate": 2.846098417546206e-07, + "loss": 0.0019, + "step": 9805 + }, + { + "epoch": 4.461328480436761, + "grad_norm": 0.022603007775520007, + "learning_rate": 2.841346912489956e-07, + "loss": 0.0002, + "step": 9806 + }, + { + "epoch": 4.461783439490446, + "grad_norm": 0.05186014983687546, + "learning_rate": 2.836599261025852e-07, + "loss": 0.0002, + "step": 9807 + }, + { + "epoch": 4.462238398544131, + "grad_norm": 0.03837095601837969, + "learning_rate": 2.831855463541888e-07, + "loss": 0.0001, + "step": 9808 + }, + { + "epoch": 4.462693357597816, + "grad_norm": 0.014959444476450033, + "learning_rate": 2.8271155204256826e-07, + "loss": 0.0001, + "step": 9809 + }, + { + "epoch": 4.463148316651502, + "grad_norm": 0.0661108077314431, + "learning_rate": 2.8223794320645705e-07, + "loss": 0.0003, + "step": 9810 + }, + { + "epoch": 4.463603275705187, + "grad_norm": 0.1025476499277121, + "learning_rate": 2.817647198845558e-07, + "loss": 0.0009, + "step": 9811 + }, + { + "epoch": 4.4640582347588715, + "grad_norm": 0.04266926187999061, + "learning_rate": 2.812918821155336e-07, + "loss": 0.0002, + "step": 9812 + }, + { + "epoch": 4.464513193812557, + "grad_norm": 0.17872743192093948, + "learning_rate": 2.808194299380296e-07, + "loss": 0.0006, + "step": 9813 + }, + { + "epoch": 4.464968152866242, + "grad_norm": 0.14087940767837617, + "learning_rate": 2.803473633906495e-07, + "loss": 0.0009, + "step": 9814 + }, + { + "epoch": 4.465423111919927, + "grad_norm": 0.014106178327896889, + "learning_rate": 2.7987568251197027e-07, + "loss": 0.0001, + "step": 9815 + }, + { + "epoch": 4.465878070973613, + "grad_norm": 0.026955016264711362, + "learning_rate": 2.794043873405339e-07, + "loss": 0.0002, + "step": 9816 + }, + { + "epoch": 4.466333030027298, + "grad_norm": 0.14319096113865132, + "learning_rate": 2.789334779148528e-07, + "loss": 0.0032, + "step": 9817 + }, + { + "epoch": 4.4667879890809825, + "grad_norm": 0.01537631016289191, + "learning_rate": 2.78462954273408e-07, + "loss": 0.0001, + "step": 9818 + }, + { + "epoch": 4.467242948134668, + "grad_norm": 0.10932080023011205, + "learning_rate": 2.77992816454648e-07, + "loss": 0.0017, + "step": 9819 + }, + { + "epoch": 4.467697907188353, + "grad_norm": 0.011293209914633813, + "learning_rate": 2.775230644969906e-07, + "loss": 0.0001, + "step": 9820 + }, + { + "epoch": 4.468152866242038, + "grad_norm": 0.16308582746928127, + "learning_rate": 2.7705369843882223e-07, + "loss": 0.001, + "step": 9821 + }, + { + "epoch": 4.468607825295724, + "grad_norm": 0.0814893870538949, + "learning_rate": 2.7658471831849664e-07, + "loss": 0.0006, + "step": 9822 + }, + { + "epoch": 4.469062784349409, + "grad_norm": 0.13535296181518966, + "learning_rate": 2.761161241743371e-07, + "loss": 0.0006, + "step": 9823 + }, + { + "epoch": 4.4695177434030935, + "grad_norm": 0.057057305749909186, + "learning_rate": 2.756479160446357e-07, + "loss": 0.0008, + "step": 9824 + }, + { + "epoch": 4.469972702456779, + "grad_norm": 0.013109914042717294, + "learning_rate": 2.751800939676513e-07, + "loss": 0.0001, + "step": 9825 + }, + { + "epoch": 4.470427661510464, + "grad_norm": 0.12042985313853603, + "learning_rate": 2.747126579816117e-07, + "loss": 0.0022, + "step": 9826 + }, + { + "epoch": 4.470882620564149, + "grad_norm": 0.010020386345474681, + "learning_rate": 2.7424560812471466e-07, + "loss": 0.0001, + "step": 9827 + }, + { + "epoch": 4.471337579617835, + "grad_norm": 0.1843457913919323, + "learning_rate": 2.737789444351263e-07, + "loss": 0.002, + "step": 9828 + }, + { + "epoch": 4.47179253867152, + "grad_norm": 0.1273467604869257, + "learning_rate": 2.733126669509789e-07, + "loss": 0.0011, + "step": 9829 + }, + { + "epoch": 4.4722474977252045, + "grad_norm": 0.054725157669334536, + "learning_rate": 2.728467757103748e-07, + "loss": 0.0004, + "step": 9830 + }, + { + "epoch": 4.47270245677889, + "grad_norm": 0.19774346462172113, + "learning_rate": 2.7238127075138345e-07, + "loss": 0.0043, + "step": 9831 + }, + { + "epoch": 4.473157415832575, + "grad_norm": 0.012177097946780433, + "learning_rate": 2.7191615211204613e-07, + "loss": 0.0001, + "step": 9832 + }, + { + "epoch": 4.47361237488626, + "grad_norm": 0.023122379653789975, + "learning_rate": 2.714514198303686e-07, + "loss": 0.0001, + "step": 9833 + }, + { + "epoch": 4.474067333939946, + "grad_norm": 0.009683029066415974, + "learning_rate": 2.7098707394432653e-07, + "loss": 0.0001, + "step": 9834 + }, + { + "epoch": 4.474522292993631, + "grad_norm": 0.05642467243687519, + "learning_rate": 2.7052311449186577e-07, + "loss": 0.0005, + "step": 9835 + }, + { + "epoch": 4.4749772520473154, + "grad_norm": 0.08345240618601117, + "learning_rate": 2.70059541510897e-07, + "loss": 0.0005, + "step": 9836 + }, + { + "epoch": 4.475432211101001, + "grad_norm": 0.0031893396602070702, + "learning_rate": 2.6959635503930327e-07, + "loss": 0.0, + "step": 9837 + }, + { + "epoch": 4.475887170154686, + "grad_norm": 0.01898662942622112, + "learning_rate": 2.691335551149327e-07, + "loss": 0.0002, + "step": 9838 + }, + { + "epoch": 4.476342129208371, + "grad_norm": 0.006469085962762504, + "learning_rate": 2.686711417756038e-07, + "loss": 0.0, + "step": 9839 + }, + { + "epoch": 4.476797088262057, + "grad_norm": 0.07524928980180771, + "learning_rate": 2.6820911505910263e-07, + "loss": 0.0003, + "step": 9840 + }, + { + "epoch": 4.477252047315742, + "grad_norm": 0.11950012732445724, + "learning_rate": 2.677474750031822e-07, + "loss": 0.0018, + "step": 9841 + }, + { + "epoch": 4.477707006369426, + "grad_norm": 0.00783898214449213, + "learning_rate": 2.672862216455696e-07, + "loss": 0.0, + "step": 9842 + }, + { + "epoch": 4.478161965423112, + "grad_norm": 0.022946767666007787, + "learning_rate": 2.6682535502395356e-07, + "loss": 0.0001, + "step": 9843 + }, + { + "epoch": 4.478616924476797, + "grad_norm": 0.28792011528851646, + "learning_rate": 2.663648751759951e-07, + "loss": 0.0043, + "step": 9844 + }, + { + "epoch": 4.479071883530482, + "grad_norm": 0.09237528427888451, + "learning_rate": 2.6590478213932236e-07, + "loss": 0.0011, + "step": 9845 + }, + { + "epoch": 4.479526842584168, + "grad_norm": 0.21573638918480934, + "learning_rate": 2.6544507595153036e-07, + "loss": 0.0023, + "step": 9846 + }, + { + "epoch": 4.4799818016378525, + "grad_norm": 0.005402306940496315, + "learning_rate": 2.649857566501873e-07, + "loss": 0.0001, + "step": 9847 + }, + { + "epoch": 4.480436760691537, + "grad_norm": 0.039497324320792275, + "learning_rate": 2.6452682427282383e-07, + "loss": 0.0002, + "step": 9848 + }, + { + "epoch": 4.480891719745223, + "grad_norm": 0.014236832125789911, + "learning_rate": 2.6406827885694377e-07, + "loss": 0.0001, + "step": 9849 + }, + { + "epoch": 4.481346678798908, + "grad_norm": 0.022868857639931274, + "learning_rate": 2.6361012044001654e-07, + "loss": 0.0001, + "step": 9850 + }, + { + "epoch": 4.481801637852593, + "grad_norm": 0.011047213994452935, + "learning_rate": 2.6315234905948063e-07, + "loss": 0.0001, + "step": 9851 + }, + { + "epoch": 4.482256596906279, + "grad_norm": 0.06548485286490154, + "learning_rate": 2.626949647527438e-07, + "loss": 0.0006, + "step": 9852 + }, + { + "epoch": 4.4827115559599635, + "grad_norm": 0.02527050144323225, + "learning_rate": 2.622379675571812e-07, + "loss": 0.0002, + "step": 9853 + }, + { + "epoch": 4.483166515013648, + "grad_norm": 0.06102208484773239, + "learning_rate": 2.6178135751013576e-07, + "loss": 0.0006, + "step": 9854 + }, + { + "epoch": 4.483621474067334, + "grad_norm": 0.02332799082250286, + "learning_rate": 2.613251346489204e-07, + "loss": 0.0002, + "step": 9855 + }, + { + "epoch": 4.484076433121019, + "grad_norm": 0.021532156027894378, + "learning_rate": 2.608692990108147e-07, + "loss": 0.0001, + "step": 9856 + }, + { + "epoch": 4.484531392174704, + "grad_norm": 0.00914003022193629, + "learning_rate": 2.604138506330689e-07, + "loss": 0.0001, + "step": 9857 + }, + { + "epoch": 4.48498635122839, + "grad_norm": 0.02846928576310341, + "learning_rate": 2.5995878955289933e-07, + "loss": 0.0002, + "step": 9858 + }, + { + "epoch": 4.4854413102820745, + "grad_norm": 0.21378335888448957, + "learning_rate": 2.5950411580749235e-07, + "loss": 0.0009, + "step": 9859 + }, + { + "epoch": 4.485896269335759, + "grad_norm": 0.18327109367967684, + "learning_rate": 2.5904982943399993e-07, + "loss": 0.0021, + "step": 9860 + }, + { + "epoch": 4.486351228389445, + "grad_norm": 0.04540874112772403, + "learning_rate": 2.585959304695462e-07, + "loss": 0.0001, + "step": 9861 + }, + { + "epoch": 4.48680618744313, + "grad_norm": 0.1299983675938219, + "learning_rate": 2.581424189512205e-07, + "loss": 0.0012, + "step": 9862 + }, + { + "epoch": 4.487261146496815, + "grad_norm": 0.06896514245733236, + "learning_rate": 2.576892949160825e-07, + "loss": 0.0006, + "step": 9863 + }, + { + "epoch": 4.487716105550501, + "grad_norm": 0.0576733226007471, + "learning_rate": 2.572365584011599e-07, + "loss": 0.0005, + "step": 9864 + }, + { + "epoch": 4.4881710646041855, + "grad_norm": 0.10620583590965074, + "learning_rate": 2.567842094434464e-07, + "loss": 0.0007, + "step": 9865 + }, + { + "epoch": 4.48862602365787, + "grad_norm": 0.013973205863701958, + "learning_rate": 2.56332248079908e-07, + "loss": 0.0001, + "step": 9866 + }, + { + "epoch": 4.489080982711556, + "grad_norm": 0.014089263477686374, + "learning_rate": 2.558806743474762e-07, + "loss": 0.0001, + "step": 9867 + }, + { + "epoch": 4.489535941765241, + "grad_norm": 0.05230955011793734, + "learning_rate": 2.5542948828305104e-07, + "loss": 0.0004, + "step": 9868 + }, + { + "epoch": 4.489990900818927, + "grad_norm": 0.010346874428992351, + "learning_rate": 2.5497868992350184e-07, + "loss": 0.0, + "step": 9869 + }, + { + "epoch": 4.490445859872612, + "grad_norm": 0.02913525149330349, + "learning_rate": 2.545282793056653e-07, + "loss": 0.0002, + "step": 9870 + }, + { + "epoch": 4.4909008189262964, + "grad_norm": 0.16629691698138255, + "learning_rate": 2.54078256466348e-07, + "loss": 0.0017, + "step": 9871 + }, + { + "epoch": 4.491355777979982, + "grad_norm": 0.05124612674058239, + "learning_rate": 2.5362862144232336e-07, + "loss": 0.0003, + "step": 9872 + }, + { + "epoch": 4.491810737033667, + "grad_norm": 0.24363180548823193, + "learning_rate": 2.531793742703331e-07, + "loss": 0.0013, + "step": 9873 + }, + { + "epoch": 4.492265696087352, + "grad_norm": 0.021724911127147375, + "learning_rate": 2.527305149870879e-07, + "loss": 0.0001, + "step": 9874 + }, + { + "epoch": 4.492720655141038, + "grad_norm": 0.2283618288108952, + "learning_rate": 2.522820436292667e-07, + "loss": 0.0034, + "step": 9875 + }, + { + "epoch": 4.4931756141947226, + "grad_norm": 0.15896141386604887, + "learning_rate": 2.5183396023351567e-07, + "loss": 0.0012, + "step": 9876 + }, + { + "epoch": 4.493630573248407, + "grad_norm": 0.12338300992770802, + "learning_rate": 2.513862648364518e-07, + "loss": 0.0012, + "step": 9877 + }, + { + "epoch": 4.494085532302093, + "grad_norm": 0.011596265287238042, + "learning_rate": 2.509389574746579e-07, + "loss": 0.0001, + "step": 9878 + }, + { + "epoch": 4.494540491355778, + "grad_norm": 0.14052493615326575, + "learning_rate": 2.5049203818468537e-07, + "loss": 0.0004, + "step": 9879 + }, + { + "epoch": 4.494995450409463, + "grad_norm": 0.03327082778941423, + "learning_rate": 2.500455070030544e-07, + "loss": 0.0001, + "step": 9880 + }, + { + "epoch": 4.495450409463149, + "grad_norm": 0.054568840509463375, + "learning_rate": 2.495993639662547e-07, + "loss": 0.0004, + "step": 9881 + }, + { + "epoch": 4.4959053685168335, + "grad_norm": 0.1119712940527868, + "learning_rate": 2.491536091107427e-07, + "loss": 0.0008, + "step": 9882 + }, + { + "epoch": 4.496360327570518, + "grad_norm": 0.1320771439856323, + "learning_rate": 2.48708242472942e-07, + "loss": 0.0007, + "step": 9883 + }, + { + "epoch": 4.496815286624204, + "grad_norm": 0.07088333664436038, + "learning_rate": 2.4826326408924795e-07, + "loss": 0.0005, + "step": 9884 + }, + { + "epoch": 4.497270245677889, + "grad_norm": 0.021123830596431616, + "learning_rate": 2.4781867399602033e-07, + "loss": 0.0001, + "step": 9885 + }, + { + "epoch": 4.497725204731574, + "grad_norm": 0.28202501994495616, + "learning_rate": 2.473744722295912e-07, + "loss": 0.0038, + "step": 9886 + }, + { + "epoch": 4.49818016378526, + "grad_norm": 0.027920598785741005, + "learning_rate": 2.469306588262571e-07, + "loss": 0.0002, + "step": 9887 + }, + { + "epoch": 4.4986351228389445, + "grad_norm": 0.011353369280796066, + "learning_rate": 2.4648723382228513e-07, + "loss": 0.0001, + "step": 9888 + }, + { + "epoch": 4.499090081892629, + "grad_norm": 0.012219980387193708, + "learning_rate": 2.4604419725390906e-07, + "loss": 0.0001, + "step": 9889 + }, + { + "epoch": 4.499545040946315, + "grad_norm": 0.04660593775571815, + "learning_rate": 2.456015491573327e-07, + "loss": 0.0002, + "step": 9890 + }, + { + "epoch": 4.5, + "grad_norm": 0.04883669816785238, + "learning_rate": 2.4515928956872716e-07, + "loss": 0.0004, + "step": 9891 + }, + { + "epoch": 4.500454959053685, + "grad_norm": 0.010114742930456203, + "learning_rate": 2.447174185242324e-07, + "loss": 0.0001, + "step": 9892 + }, + { + "epoch": 4.500909918107371, + "grad_norm": 0.4722986074888995, + "learning_rate": 2.4427593605995505e-07, + "loss": 0.0014, + "step": 9893 + }, + { + "epoch": 4.5013648771610555, + "grad_norm": 0.045812815872761495, + "learning_rate": 2.438348422119713e-07, + "loss": 0.0003, + "step": 9894 + }, + { + "epoch": 4.50181983621474, + "grad_norm": 0.004933274382772532, + "learning_rate": 2.4339413701632617e-07, + "loss": 0.0, + "step": 9895 + }, + { + "epoch": 4.502274795268426, + "grad_norm": 0.020359412302883535, + "learning_rate": 2.4295382050903147e-07, + "loss": 0.0001, + "step": 9896 + }, + { + "epoch": 4.502729754322111, + "grad_norm": 0.023678091742378612, + "learning_rate": 2.4251389272606674e-07, + "loss": 0.0002, + "step": 9897 + }, + { + "epoch": 4.503184713375796, + "grad_norm": 0.12961085944872078, + "learning_rate": 2.4207435370338374e-07, + "loss": 0.0012, + "step": 9898 + }, + { + "epoch": 4.503639672429482, + "grad_norm": 0.06681443463339036, + "learning_rate": 2.416352034768965e-07, + "loss": 0.0004, + "step": 9899 + }, + { + "epoch": 4.5040946314831665, + "grad_norm": 0.052942536083503584, + "learning_rate": 2.411964420824925e-07, + "loss": 0.0004, + "step": 9900 + }, + { + "epoch": 4.504549590536851, + "grad_norm": 0.0482369922530543, + "learning_rate": 2.407580695560252e-07, + "loss": 0.0002, + "step": 9901 + }, + { + "epoch": 4.505004549590537, + "grad_norm": 0.09560473705119904, + "learning_rate": 2.4032008593331544e-07, + "loss": 0.0011, + "step": 9902 + }, + { + "epoch": 4.505459508644222, + "grad_norm": 0.2272654642815518, + "learning_rate": 2.398824912501535e-07, + "loss": 0.0012, + "step": 9903 + }, + { + "epoch": 4.505914467697907, + "grad_norm": 0.04665829289119957, + "learning_rate": 2.3944528554229795e-07, + "loss": 0.0002, + "step": 9904 + }, + { + "epoch": 4.506369426751593, + "grad_norm": 0.04837064452101903, + "learning_rate": 2.390084688454752e-07, + "loss": 0.0002, + "step": 9905 + }, + { + "epoch": 4.5068243858052774, + "grad_norm": 0.015412511288154834, + "learning_rate": 2.3857204119538016e-07, + "loss": 0.0001, + "step": 9906 + }, + { + "epoch": 4.507279344858962, + "grad_norm": 0.06909807662246724, + "learning_rate": 2.3813600262767578e-07, + "loss": 0.0005, + "step": 9907 + }, + { + "epoch": 4.507734303912648, + "grad_norm": 0.019903526284543397, + "learning_rate": 2.37700353177992e-07, + "loss": 0.0001, + "step": 9908 + }, + { + "epoch": 4.508189262966333, + "grad_norm": 0.34300063357150967, + "learning_rate": 2.3726509288192977e-07, + "loss": 0.0013, + "step": 9909 + }, + { + "epoch": 4.508644222020019, + "grad_norm": 0.006279392129840297, + "learning_rate": 2.3683022177505565e-07, + "loss": 0.0, + "step": 9910 + }, + { + "epoch": 4.5090991810737036, + "grad_norm": 0.02269171887515086, + "learning_rate": 2.363957398929051e-07, + "loss": 0.0001, + "step": 9911 + }, + { + "epoch": 4.509554140127388, + "grad_norm": 0.04685891906308402, + "learning_rate": 2.3596164727098304e-07, + "loss": 0.0003, + "step": 9912 + }, + { + "epoch": 4.510009099181074, + "grad_norm": 0.005558002100558519, + "learning_rate": 2.3552794394476053e-07, + "loss": 0.0, + "step": 9913 + }, + { + "epoch": 4.510464058234759, + "grad_norm": 0.11995212190840954, + "learning_rate": 2.3509462994967868e-07, + "loss": 0.0018, + "step": 9914 + }, + { + "epoch": 4.510919017288444, + "grad_norm": 0.1817786184983082, + "learning_rate": 2.346617053211453e-07, + "loss": 0.0017, + "step": 9915 + }, + { + "epoch": 4.51137397634213, + "grad_norm": 0.324050688034584, + "learning_rate": 2.342291700945376e-07, + "loss": 0.0062, + "step": 9916 + }, + { + "epoch": 4.5118289353958145, + "grad_norm": 0.04382471818462054, + "learning_rate": 2.3379702430520013e-07, + "loss": 0.0002, + "step": 9917 + }, + { + "epoch": 4.512283894449499, + "grad_norm": 0.11465236073782037, + "learning_rate": 2.3336526798844517e-07, + "loss": 0.0006, + "step": 9918 + }, + { + "epoch": 4.512738853503185, + "grad_norm": 0.05984669697104292, + "learning_rate": 2.3293390117955394e-07, + "loss": 0.0005, + "step": 9919 + }, + { + "epoch": 4.51319381255687, + "grad_norm": 0.2909316393569093, + "learning_rate": 2.325029239137777e-07, + "loss": 0.0031, + "step": 9920 + }, + { + "epoch": 4.513648771610555, + "grad_norm": 0.07713496730362766, + "learning_rate": 2.3207233622633275e-07, + "loss": 0.0003, + "step": 9921 + }, + { + "epoch": 4.514103730664241, + "grad_norm": 0.11585690530414175, + "learning_rate": 2.3164213815240476e-07, + "loss": 0.0018, + "step": 9922 + }, + { + "epoch": 4.5145586897179255, + "grad_norm": 0.300016029472826, + "learning_rate": 2.312123297271468e-07, + "loss": 0.004, + "step": 9923 + }, + { + "epoch": 4.51501364877161, + "grad_norm": 0.06862492293195506, + "learning_rate": 2.3078291098568184e-07, + "loss": 0.0008, + "step": 9924 + }, + { + "epoch": 4.515468607825296, + "grad_norm": 0.20677295281035155, + "learning_rate": 2.303538819630996e-07, + "loss": 0.0042, + "step": 9925 + }, + { + "epoch": 4.515923566878981, + "grad_norm": 0.008484838853610744, + "learning_rate": 2.299252426944587e-07, + "loss": 0.0001, + "step": 9926 + }, + { + "epoch": 4.516378525932666, + "grad_norm": 0.088911079301893, + "learning_rate": 2.2949699321478612e-07, + "loss": 0.0004, + "step": 9927 + }, + { + "epoch": 4.516833484986352, + "grad_norm": 0.3833271543482625, + "learning_rate": 2.29069133559075e-07, + "loss": 0.009, + "step": 9928 + }, + { + "epoch": 4.5172884440400365, + "grad_norm": 0.3084607327507269, + "learning_rate": 2.286416637622896e-07, + "loss": 0.0035, + "step": 9929 + }, + { + "epoch": 4.517743403093721, + "grad_norm": 0.2540997972747094, + "learning_rate": 2.2821458385936025e-07, + "loss": 0.0053, + "step": 9930 + }, + { + "epoch": 4.518198362147407, + "grad_norm": 0.005081740422745523, + "learning_rate": 2.2778789388518573e-07, + "loss": 0.0, + "step": 9931 + }, + { + "epoch": 4.518653321201092, + "grad_norm": 0.008606815220825223, + "learning_rate": 2.2736159387463264e-07, + "loss": 0.0, + "step": 9932 + }, + { + "epoch": 4.519108280254777, + "grad_norm": 0.08896213781823452, + "learning_rate": 2.269356838625375e-07, + "loss": 0.0007, + "step": 9933 + }, + { + "epoch": 4.519563239308463, + "grad_norm": 0.04512275087179489, + "learning_rate": 2.2651016388370361e-07, + "loss": 0.0003, + "step": 9934 + }, + { + "epoch": 4.5200181983621475, + "grad_norm": 0.09403573057466452, + "learning_rate": 2.2608503397290203e-07, + "loss": 0.0009, + "step": 9935 + }, + { + "epoch": 4.520473157415832, + "grad_norm": 0.16105008189387443, + "learning_rate": 2.2566029416487333e-07, + "loss": 0.0004, + "step": 9936 + }, + { + "epoch": 4.520928116469518, + "grad_norm": 0.24235688766509916, + "learning_rate": 2.252359444943236e-07, + "loss": 0.0033, + "step": 9937 + }, + { + "epoch": 4.521383075523203, + "grad_norm": 0.05418808698563803, + "learning_rate": 2.248119849959307e-07, + "loss": 0.0005, + "step": 9938 + }, + { + "epoch": 4.521838034576888, + "grad_norm": 0.10286283469703374, + "learning_rate": 2.243884157043369e-07, + "loss": 0.001, + "step": 9939 + }, + { + "epoch": 4.522292993630574, + "grad_norm": 0.05820316958147569, + "learning_rate": 2.2396523665415615e-07, + "loss": 0.0003, + "step": 9940 + }, + { + "epoch": 4.522747952684258, + "grad_norm": 0.0851026713285884, + "learning_rate": 2.2354244787996748e-07, + "loss": 0.0005, + "step": 9941 + }, + { + "epoch": 4.523202911737943, + "grad_norm": 0.04007618882852361, + "learning_rate": 2.2312004941631936e-07, + "loss": 0.0004, + "step": 9942 + }, + { + "epoch": 4.523657870791629, + "grad_norm": 0.05060604467978839, + "learning_rate": 2.226980412977292e-07, + "loss": 0.0004, + "step": 9943 + }, + { + "epoch": 4.524112829845314, + "grad_norm": 0.6355256071773773, + "learning_rate": 2.2227642355868107e-07, + "loss": 0.0009, + "step": 9944 + }, + { + "epoch": 4.524567788898999, + "grad_norm": 0.2100560207057896, + "learning_rate": 2.218551962336274e-07, + "loss": 0.0018, + "step": 9945 + }, + { + "epoch": 4.5250227479526846, + "grad_norm": 0.03370484451732676, + "learning_rate": 2.2143435935698897e-07, + "loss": 0.0001, + "step": 9946 + }, + { + "epoch": 4.525477707006369, + "grad_norm": 0.024307783656008334, + "learning_rate": 2.2101391296315444e-07, + "loss": 0.0002, + "step": 9947 + }, + { + "epoch": 4.525932666060054, + "grad_norm": 0.2007286369976889, + "learning_rate": 2.2059385708648183e-07, + "loss": 0.0029, + "step": 9948 + }, + { + "epoch": 4.52638762511374, + "grad_norm": 0.016459887056630526, + "learning_rate": 2.2017419176129596e-07, + "loss": 0.0001, + "step": 9949 + }, + { + "epoch": 4.526842584167425, + "grad_norm": 0.019678851060355582, + "learning_rate": 2.197549170218899e-07, + "loss": 0.0001, + "step": 9950 + }, + { + "epoch": 4.52729754322111, + "grad_norm": 0.024405814092407618, + "learning_rate": 2.1933603290252404e-07, + "loss": 0.0002, + "step": 9951 + }, + { + "epoch": 4.5277525022747955, + "grad_norm": 0.033450062320797204, + "learning_rate": 2.1891753943742766e-07, + "loss": 0.0002, + "step": 9952 + }, + { + "epoch": 4.52820746132848, + "grad_norm": 0.060115051084917326, + "learning_rate": 2.1849943666079899e-07, + "loss": 0.0003, + "step": 9953 + }, + { + "epoch": 4.528662420382165, + "grad_norm": 0.15643968431611702, + "learning_rate": 2.1808172460680399e-07, + "loss": 0.0011, + "step": 9954 + }, + { + "epoch": 4.529117379435851, + "grad_norm": 0.35152894563582243, + "learning_rate": 2.176644033095754e-07, + "loss": 0.0014, + "step": 9955 + }, + { + "epoch": 4.529572338489536, + "grad_norm": 0.13707314479892121, + "learning_rate": 2.172474728032148e-07, + "loss": 0.001, + "step": 9956 + }, + { + "epoch": 4.530027297543221, + "grad_norm": 0.17159233844994978, + "learning_rate": 2.168309331217916e-07, + "loss": 0.0014, + "step": 9957 + }, + { + "epoch": 4.5304822565969065, + "grad_norm": 0.19317591511616486, + "learning_rate": 2.1641478429934415e-07, + "loss": 0.0021, + "step": 9958 + }, + { + "epoch": 4.530937215650591, + "grad_norm": 0.1222003441188645, + "learning_rate": 2.1599902636987858e-07, + "loss": 0.0011, + "step": 9959 + }, + { + "epoch": 4.531392174704276, + "grad_norm": 0.3074446990591251, + "learning_rate": 2.1558365936736715e-07, + "loss": 0.0009, + "step": 9960 + }, + { + "epoch": 4.531847133757962, + "grad_norm": 0.11791867665102791, + "learning_rate": 2.151686833257538e-07, + "loss": 0.0015, + "step": 9961 + }, + { + "epoch": 4.532302092811647, + "grad_norm": 0.014327161167464516, + "learning_rate": 2.14754098278947e-07, + "loss": 0.0001, + "step": 9962 + }, + { + "epoch": 4.532757051865332, + "grad_norm": 0.10183073000157901, + "learning_rate": 2.1433990426082574e-07, + "loss": 0.0009, + "step": 9963 + }, + { + "epoch": 4.5332120109190175, + "grad_norm": 0.04708548455085974, + "learning_rate": 2.1392610130523574e-07, + "loss": 0.0001, + "step": 9964 + }, + { + "epoch": 4.533666969972702, + "grad_norm": 0.07008209420271527, + "learning_rate": 2.13512689445991e-07, + "loss": 0.0003, + "step": 9965 + }, + { + "epoch": 4.534121929026387, + "grad_norm": 0.027011181244921068, + "learning_rate": 2.1309966871687292e-07, + "loss": 0.0001, + "step": 9966 + }, + { + "epoch": 4.534576888080073, + "grad_norm": 0.18192988706667365, + "learning_rate": 2.1268703915163225e-07, + "loss": 0.0018, + "step": 9967 + }, + { + "epoch": 4.535031847133758, + "grad_norm": 0.10342716911334705, + "learning_rate": 2.1227480078398866e-07, + "loss": 0.002, + "step": 9968 + }, + { + "epoch": 4.535486806187443, + "grad_norm": 0.010329184257664265, + "learning_rate": 2.1186295364762687e-07, + "loss": 0.0001, + "step": 9969 + }, + { + "epoch": 4.5359417652411285, + "grad_norm": 0.011847811971578165, + "learning_rate": 2.114514977762011e-07, + "loss": 0.0001, + "step": 9970 + }, + { + "epoch": 4.536396724294813, + "grad_norm": 0.028508735206714417, + "learning_rate": 2.1104043320333388e-07, + "loss": 0.0002, + "step": 9971 + }, + { + "epoch": 4.536851683348498, + "grad_norm": 0.02471678864265696, + "learning_rate": 2.1062975996261615e-07, + "loss": 0.0001, + "step": 9972 + }, + { + "epoch": 4.537306642402184, + "grad_norm": 0.15598499056191162, + "learning_rate": 2.1021947808760602e-07, + "loss": 0.001, + "step": 9973 + }, + { + "epoch": 4.537761601455869, + "grad_norm": 0.15395266129932716, + "learning_rate": 2.098095876118289e-07, + "loss": 0.0007, + "step": 9974 + }, + { + "epoch": 4.538216560509554, + "grad_norm": 0.01510795330811687, + "learning_rate": 2.094000885687808e-07, + "loss": 0.0001, + "step": 9975 + }, + { + "epoch": 4.538671519563239, + "grad_norm": 0.16855095827378602, + "learning_rate": 2.0899098099192273e-07, + "loss": 0.0028, + "step": 9976 + }, + { + "epoch": 4.539126478616924, + "grad_norm": 0.003939923343996043, + "learning_rate": 2.085822649146857e-07, + "loss": 0.0, + "step": 9977 + }, + { + "epoch": 4.539581437670609, + "grad_norm": 0.20812141974751644, + "learning_rate": 2.0817394037046856e-07, + "loss": 0.001, + "step": 9978 + }, + { + "epoch": 4.540036396724295, + "grad_norm": 0.007092484874895985, + "learning_rate": 2.0776600739263742e-07, + "loss": 0.0, + "step": 9979 + }, + { + "epoch": 4.54049135577798, + "grad_norm": 0.07559988175948198, + "learning_rate": 2.0735846601452613e-07, + "loss": 0.0008, + "step": 9980 + }, + { + "epoch": 4.540946314831665, + "grad_norm": 0.26008675786349245, + "learning_rate": 2.06951316269437e-07, + "loss": 0.0066, + "step": 9981 + }, + { + "epoch": 4.54140127388535, + "grad_norm": 0.009340358641583231, + "learning_rate": 2.0654455819064222e-07, + "loss": 0.0, + "step": 9982 + }, + { + "epoch": 4.541856232939035, + "grad_norm": 0.07075574860979925, + "learning_rate": 2.0613819181137918e-07, + "loss": 0.0006, + "step": 9983 + }, + { + "epoch": 4.542311191992721, + "grad_norm": 0.0681312860868519, + "learning_rate": 2.0573221716485402e-07, + "loss": 0.0005, + "step": 9984 + }, + { + "epoch": 4.542766151046406, + "grad_norm": 0.1720352875540983, + "learning_rate": 2.0532663428424138e-07, + "loss": 0.003, + "step": 9985 + }, + { + "epoch": 4.543221110100091, + "grad_norm": 0.06982106929868871, + "learning_rate": 2.0492144320268247e-07, + "loss": 0.0006, + "step": 9986 + }, + { + "epoch": 4.5436760691537765, + "grad_norm": 0.013441651889484069, + "learning_rate": 2.045166439532903e-07, + "loss": 0.0001, + "step": 9987 + }, + { + "epoch": 4.544131028207461, + "grad_norm": 0.09723690321988937, + "learning_rate": 2.0411223656914058e-07, + "loss": 0.0007, + "step": 9988 + }, + { + "epoch": 4.544585987261146, + "grad_norm": 0.0589015189687312, + "learning_rate": 2.0370822108328191e-07, + "loss": 0.0001, + "step": 9989 + }, + { + "epoch": 4.545040946314832, + "grad_norm": 0.04474927927366833, + "learning_rate": 2.0330459752872734e-07, + "loss": 0.0002, + "step": 9990 + }, + { + "epoch": 4.545495905368517, + "grad_norm": 0.015049182841729113, + "learning_rate": 2.0290136593845821e-07, + "loss": 0.0001, + "step": 9991 + }, + { + "epoch": 4.545950864422202, + "grad_norm": 0.016189247709023722, + "learning_rate": 2.024985263454271e-07, + "loss": 0.0001, + "step": 9992 + }, + { + "epoch": 4.5464058234758875, + "grad_norm": 0.027049167856260638, + "learning_rate": 2.0209607878255156e-07, + "loss": 0.0002, + "step": 9993 + }, + { + "epoch": 4.546860782529572, + "grad_norm": 0.010416542932749248, + "learning_rate": 2.0169402328271637e-07, + "loss": 0.0, + "step": 9994 + }, + { + "epoch": 4.547315741583257, + "grad_norm": 0.009078294144646514, + "learning_rate": 2.0129235987877694e-07, + "loss": 0.0, + "step": 9995 + }, + { + "epoch": 4.547770700636943, + "grad_norm": 0.07828669015635847, + "learning_rate": 2.0089108860355422e-07, + "loss": 0.0009, + "step": 9996 + }, + { + "epoch": 4.548225659690628, + "grad_norm": 0.04651502968247596, + "learning_rate": 2.0049020948984033e-07, + "loss": 0.0003, + "step": 9997 + }, + { + "epoch": 4.548680618744313, + "grad_norm": 0.03401092455072713, + "learning_rate": 2.0008972257039184e-07, + "loss": 0.0002, + "step": 9998 + }, + { + "epoch": 4.5491355777979985, + "grad_norm": 0.011284210409231964, + "learning_rate": 1.9968962787793534e-07, + "loss": 0.0001, + "step": 9999 + }, + { + "epoch": 4.549590536851683, + "grad_norm": 0.05301564104224029, + "learning_rate": 1.9928992544516356e-07, + "loss": 0.0001, + "step": 10000 + }, + { + "epoch": 4.550045495905368, + "grad_norm": 0.042940566955574654, + "learning_rate": 1.9889061530473986e-07, + "loss": 0.0002, + "step": 10001 + }, + { + "epoch": 4.550500454959054, + "grad_norm": 0.07118417003742626, + "learning_rate": 1.984916974892931e-07, + "loss": 0.0009, + "step": 10002 + }, + { + "epoch": 4.550955414012739, + "grad_norm": 0.01901971376805956, + "learning_rate": 1.9809317203142165e-07, + "loss": 0.0002, + "step": 10003 + }, + { + "epoch": 4.551410373066424, + "grad_norm": 0.08501062045859577, + "learning_rate": 1.9769503896369167e-07, + "loss": 0.0006, + "step": 10004 + }, + { + "epoch": 4.5518653321201095, + "grad_norm": 0.046243475480679676, + "learning_rate": 1.9729729831863497e-07, + "loss": 0.0003, + "step": 10005 + }, + { + "epoch": 4.552320291173794, + "grad_norm": 0.029729060496125403, + "learning_rate": 1.968999501287544e-07, + "loss": 0.0002, + "step": 10006 + }, + { + "epoch": 4.552775250227479, + "grad_norm": 0.020723444242764136, + "learning_rate": 1.965029944265201e-07, + "loss": 0.0001, + "step": 10007 + }, + { + "epoch": 4.553230209281165, + "grad_norm": 0.030507413711084074, + "learning_rate": 1.9610643124436834e-07, + "loss": 0.0002, + "step": 10008 + }, + { + "epoch": 4.55368516833485, + "grad_norm": 0.05761848108370722, + "learning_rate": 1.9571026061470432e-07, + "loss": 0.0002, + "step": 10009 + }, + { + "epoch": 4.554140127388535, + "grad_norm": 0.3637030185561456, + "learning_rate": 1.953144825699016e-07, + "loss": 0.005, + "step": 10010 + }, + { + "epoch": 4.55459508644222, + "grad_norm": 0.1264561631131197, + "learning_rate": 1.9491909714230207e-07, + "loss": 0.0006, + "step": 10011 + }, + { + "epoch": 4.555050045495905, + "grad_norm": 0.02897131742345199, + "learning_rate": 1.9452410436421486e-07, + "loss": 0.0001, + "step": 10012 + }, + { + "epoch": 4.55550500454959, + "grad_norm": 0.0543235407243238, + "learning_rate": 1.9412950426791645e-07, + "loss": 0.0001, + "step": 10013 + }, + { + "epoch": 4.555959963603276, + "grad_norm": 0.0335056341710435, + "learning_rate": 1.9373529688565095e-07, + "loss": 0.0002, + "step": 10014 + }, + { + "epoch": 4.556414922656961, + "grad_norm": 0.09350690681472225, + "learning_rate": 1.9334148224963267e-07, + "loss": 0.001, + "step": 10015 + }, + { + "epoch": 4.556869881710646, + "grad_norm": 0.09716939135450206, + "learning_rate": 1.9294806039204139e-07, + "loss": 0.001, + "step": 10016 + }, + { + "epoch": 4.557324840764331, + "grad_norm": 0.0810259986721641, + "learning_rate": 1.925550313450264e-07, + "loss": 0.0008, + "step": 10017 + }, + { + "epoch": 4.557779799818016, + "grad_norm": 0.010121372960152175, + "learning_rate": 1.9216239514070422e-07, + "loss": 0.0001, + "step": 10018 + }, + { + "epoch": 4.558234758871702, + "grad_norm": 0.004985594013211901, + "learning_rate": 1.9177015181115866e-07, + "loss": 0.0, + "step": 10019 + }, + { + "epoch": 4.558689717925387, + "grad_norm": 0.07959891846905738, + "learning_rate": 1.9137830138844295e-07, + "loss": 0.0007, + "step": 10020 + }, + { + "epoch": 4.559144676979072, + "grad_norm": 0.011407829444275024, + "learning_rate": 1.90986843904577e-07, + "loss": 0.0001, + "step": 10021 + }, + { + "epoch": 4.5595996360327575, + "grad_norm": 0.003986694614751018, + "learning_rate": 1.9059577939154917e-07, + "loss": 0.0, + "step": 10022 + }, + { + "epoch": 4.560054595086442, + "grad_norm": 0.06184289434632973, + "learning_rate": 1.9020510788131385e-07, + "loss": 0.0003, + "step": 10023 + }, + { + "epoch": 4.560509554140127, + "grad_norm": 0.009696954144415907, + "learning_rate": 1.898148294057972e-07, + "loss": 0.0001, + "step": 10024 + }, + { + "epoch": 4.560964513193813, + "grad_norm": 0.014845767753796878, + "learning_rate": 1.8942494399688983e-07, + "loss": 0.0001, + "step": 10025 + }, + { + "epoch": 4.561419472247498, + "grad_norm": 0.030705769926938763, + "learning_rate": 1.8903545168645177e-07, + "loss": 0.0003, + "step": 10026 + }, + { + "epoch": 4.561874431301183, + "grad_norm": 0.005480069322467514, + "learning_rate": 1.8864635250631091e-07, + "loss": 0.0, + "step": 10027 + }, + { + "epoch": 4.5623293903548685, + "grad_norm": 0.2060006847676165, + "learning_rate": 1.8825764648826182e-07, + "loss": 0.0017, + "step": 10028 + }, + { + "epoch": 4.562784349408553, + "grad_norm": 0.02037804494756718, + "learning_rate": 1.87869333664068e-07, + "loss": 0.0001, + "step": 10029 + }, + { + "epoch": 4.563239308462238, + "grad_norm": 0.07736862950887789, + "learning_rate": 1.8748141406546072e-07, + "loss": 0.0004, + "step": 10030 + }, + { + "epoch": 4.563694267515924, + "grad_norm": 0.003152446657269879, + "learning_rate": 1.8709388772413962e-07, + "loss": 0.0, + "step": 10031 + }, + { + "epoch": 4.564149226569609, + "grad_norm": 0.1468126676196871, + "learning_rate": 1.8670675467177102e-07, + "loss": 0.0017, + "step": 10032 + }, + { + "epoch": 4.564604185623294, + "grad_norm": 0.038073502984638075, + "learning_rate": 1.863200149399902e-07, + "loss": 0.0003, + "step": 10033 + }, + { + "epoch": 4.5650591446769795, + "grad_norm": 0.03545595884241631, + "learning_rate": 1.8593366856039852e-07, + "loss": 0.0002, + "step": 10034 + }, + { + "epoch": 4.565514103730664, + "grad_norm": 0.0077038321409879755, + "learning_rate": 1.8554771556456796e-07, + "loss": 0.0001, + "step": 10035 + }, + { + "epoch": 4.565969062784349, + "grad_norm": 0.3464187108032849, + "learning_rate": 1.8516215598403609e-07, + "loss": 0.0076, + "step": 10036 + }, + { + "epoch": 4.566424021838035, + "grad_norm": 0.07887769120835544, + "learning_rate": 1.847769898503088e-07, + "loss": 0.0006, + "step": 10037 + }, + { + "epoch": 4.56687898089172, + "grad_norm": 0.04025740671936166, + "learning_rate": 1.8439221719486088e-07, + "loss": 0.0007, + "step": 10038 + }, + { + "epoch": 4.567333939945405, + "grad_norm": 0.01815608476508528, + "learning_rate": 1.8400783804913335e-07, + "loss": 0.0001, + "step": 10039 + }, + { + "epoch": 4.5677888989990905, + "grad_norm": 0.12191023964046135, + "learning_rate": 1.8362385244453718e-07, + "loss": 0.0011, + "step": 10040 + }, + { + "epoch": 4.568243858052775, + "grad_norm": 0.027642578797082033, + "learning_rate": 1.8324026041244947e-07, + "loss": 0.0003, + "step": 10041 + }, + { + "epoch": 4.56869881710646, + "grad_norm": 0.0029447030448456783, + "learning_rate": 1.8285706198421516e-07, + "loss": 0.0, + "step": 10042 + }, + { + "epoch": 4.569153776160146, + "grad_norm": 0.10491726350368716, + "learning_rate": 1.8247425719114696e-07, + "loss": 0.0008, + "step": 10043 + }, + { + "epoch": 4.569608735213831, + "grad_norm": 0.06091755817156818, + "learning_rate": 1.820918460645271e-07, + "loss": 0.0003, + "step": 10044 + }, + { + "epoch": 4.570063694267516, + "grad_norm": 0.012433108077342896, + "learning_rate": 1.8170982863560449e-07, + "loss": 0.0001, + "step": 10045 + }, + { + "epoch": 4.570518653321201, + "grad_norm": 0.04716933765658577, + "learning_rate": 1.813282049355952e-07, + "loss": 0.0002, + "step": 10046 + }, + { + "epoch": 4.570973612374886, + "grad_norm": 0.03797918315483642, + "learning_rate": 1.8094697499568437e-07, + "loss": 0.0003, + "step": 10047 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.014231168195735184, + "learning_rate": 1.8056613884702313e-07, + "loss": 0.0001, + "step": 10048 + }, + { + "epoch": 4.571883530482257, + "grad_norm": 0.18676111030865297, + "learning_rate": 1.801856965207338e-07, + "loss": 0.0032, + "step": 10049 + }, + { + "epoch": 4.572338489535942, + "grad_norm": 0.07688337790373528, + "learning_rate": 1.798056480479027e-07, + "loss": 0.0007, + "step": 10050 + }, + { + "epoch": 4.572793448589627, + "grad_norm": 0.009727580828325387, + "learning_rate": 1.79425993459586e-07, + "loss": 0.0001, + "step": 10051 + }, + { + "epoch": 4.573248407643312, + "grad_norm": 0.08603696470832439, + "learning_rate": 1.7904673278680838e-07, + "loss": 0.0002, + "step": 10052 + }, + { + "epoch": 4.573703366696997, + "grad_norm": 0.10407459027056128, + "learning_rate": 1.7866786606055953e-07, + "loss": 0.0008, + "step": 10053 + }, + { + "epoch": 4.574158325750682, + "grad_norm": 0.02914426573795828, + "learning_rate": 1.7828939331180073e-07, + "loss": 0.0001, + "step": 10054 + }, + { + "epoch": 4.574613284804368, + "grad_norm": 0.018091803266200208, + "learning_rate": 1.7791131457145727e-07, + "loss": 0.0001, + "step": 10055 + }, + { + "epoch": 4.575068243858053, + "grad_norm": 0.011430570573384806, + "learning_rate": 1.7753362987042555e-07, + "loss": 0.0001, + "step": 10056 + }, + { + "epoch": 4.575523202911738, + "grad_norm": 0.1189297919880683, + "learning_rate": 1.7715633923956753e-07, + "loss": 0.0011, + "step": 10057 + }, + { + "epoch": 4.575978161965423, + "grad_norm": 0.28925825709099545, + "learning_rate": 1.7677944270971193e-07, + "loss": 0.0028, + "step": 10058 + }, + { + "epoch": 4.576433121019108, + "grad_norm": 0.12277351557467386, + "learning_rate": 1.7640294031166073e-07, + "loss": 0.0031, + "step": 10059 + }, + { + "epoch": 4.576888080072793, + "grad_norm": 0.17140840155551917, + "learning_rate": 1.7602683207617766e-07, + "loss": 0.0015, + "step": 10060 + }, + { + "epoch": 4.577343039126479, + "grad_norm": 0.1066701218054566, + "learning_rate": 1.7565111803399704e-07, + "loss": 0.0003, + "step": 10061 + }, + { + "epoch": 4.577797998180164, + "grad_norm": 0.045872613806426064, + "learning_rate": 1.7527579821582042e-07, + "loss": 0.0008, + "step": 10062 + }, + { + "epoch": 4.578252957233849, + "grad_norm": 0.018937886976938538, + "learning_rate": 1.749008726523166e-07, + "loss": 0.0001, + "step": 10063 + }, + { + "epoch": 4.578707916287534, + "grad_norm": 0.06746145909838563, + "learning_rate": 1.745263413741244e-07, + "loss": 0.0002, + "step": 10064 + }, + { + "epoch": 4.579162875341219, + "grad_norm": 0.11869234682558218, + "learning_rate": 1.741522044118471e-07, + "loss": 0.0014, + "step": 10065 + }, + { + "epoch": 4.579617834394904, + "grad_norm": 0.08853164214495346, + "learning_rate": 1.7377846179605918e-07, + "loss": 0.0002, + "step": 10066 + }, + { + "epoch": 4.58007279344859, + "grad_norm": 0.0783136492175814, + "learning_rate": 1.7340511355730006e-07, + "loss": 0.0003, + "step": 10067 + }, + { + "epoch": 4.580527752502275, + "grad_norm": 0.03460472027384699, + "learning_rate": 1.7303215972607813e-07, + "loss": 0.0002, + "step": 10068 + }, + { + "epoch": 4.58098271155596, + "grad_norm": 0.007659397505490161, + "learning_rate": 1.726596003328701e-07, + "loss": 0.0, + "step": 10069 + }, + { + "epoch": 4.581437670609645, + "grad_norm": 0.1955389345567847, + "learning_rate": 1.7228743540811943e-07, + "loss": 0.0004, + "step": 10070 + }, + { + "epoch": 4.58189262966333, + "grad_norm": 0.024683740437104366, + "learning_rate": 1.719156649822379e-07, + "loss": 0.0002, + "step": 10071 + }, + { + "epoch": 4.582347588717015, + "grad_norm": 0.36227605669775986, + "learning_rate": 1.7154428908560394e-07, + "loss": 0.0022, + "step": 10072 + }, + { + "epoch": 4.582802547770701, + "grad_norm": 0.26752782591049856, + "learning_rate": 1.7117330774856555e-07, + "loss": 0.0013, + "step": 10073 + }, + { + "epoch": 4.583257506824386, + "grad_norm": 0.06650524457738367, + "learning_rate": 1.7080272100143847e-07, + "loss": 0.0009, + "step": 10074 + }, + { + "epoch": 4.583712465878071, + "grad_norm": 0.06080457162296043, + "learning_rate": 1.7043252887450456e-07, + "loss": 0.0003, + "step": 10075 + }, + { + "epoch": 4.584167424931756, + "grad_norm": 0.01789797385823625, + "learning_rate": 1.700627313980141e-07, + "loss": 0.0002, + "step": 10076 + }, + { + "epoch": 4.584622383985441, + "grad_norm": 0.006409853048861973, + "learning_rate": 1.6969332860218514e-07, + "loss": 0.0, + "step": 10077 + }, + { + "epoch": 4.585077343039126, + "grad_norm": 0.17817406065578753, + "learning_rate": 1.6932432051720405e-07, + "loss": 0.0027, + "step": 10078 + }, + { + "epoch": 4.585532302092812, + "grad_norm": 0.0927044138884308, + "learning_rate": 1.6895570717322397e-07, + "loss": 0.0011, + "step": 10079 + }, + { + "epoch": 4.585987261146497, + "grad_norm": 0.02084884267294616, + "learning_rate": 1.6858748860036745e-07, + "loss": 0.0001, + "step": 10080 + }, + { + "epoch": 4.5864422202001816, + "grad_norm": 0.14161989033558514, + "learning_rate": 1.6821966482872264e-07, + "loss": 0.001, + "step": 10081 + }, + { + "epoch": 4.586897179253867, + "grad_norm": 0.1833901036683208, + "learning_rate": 1.678522358883461e-07, + "loss": 0.0055, + "step": 10082 + }, + { + "epoch": 4.587352138307552, + "grad_norm": 0.03488514339002122, + "learning_rate": 1.6748520180926376e-07, + "loss": 0.0001, + "step": 10083 + }, + { + "epoch": 4.587807097361237, + "grad_norm": 0.025599110099182264, + "learning_rate": 1.671185626214672e-07, + "loss": 0.0001, + "step": 10084 + }, + { + "epoch": 4.588262056414923, + "grad_norm": 0.09677698649934258, + "learning_rate": 1.667523183549169e-07, + "loss": 0.0009, + "step": 10085 + }, + { + "epoch": 4.588717015468608, + "grad_norm": 0.14701762866160098, + "learning_rate": 1.6638646903953947e-07, + "loss": 0.002, + "step": 10086 + }, + { + "epoch": 4.5891719745222925, + "grad_norm": 0.03505381396798078, + "learning_rate": 1.6602101470523158e-07, + "loss": 0.0003, + "step": 10087 + }, + { + "epoch": 4.589626933575978, + "grad_norm": 0.010135940452738574, + "learning_rate": 1.6565595538185707e-07, + "loss": 0.0001, + "step": 10088 + }, + { + "epoch": 4.590081892629663, + "grad_norm": 0.010896084217685007, + "learning_rate": 1.6529129109924547e-07, + "loss": 0.0001, + "step": 10089 + }, + { + "epoch": 4.590536851683348, + "grad_norm": 0.006505564507405903, + "learning_rate": 1.649270218871968e-07, + "loss": 0.0, + "step": 10090 + }, + { + "epoch": 4.590991810737034, + "grad_norm": 0.2488147576899445, + "learning_rate": 1.645631477754761e-07, + "loss": 0.0019, + "step": 10091 + }, + { + "epoch": 4.591446769790719, + "grad_norm": 0.04038727042596749, + "learning_rate": 1.641996687938191e-07, + "loss": 0.0004, + "step": 10092 + }, + { + "epoch": 4.5919017288444035, + "grad_norm": 0.01763999325373993, + "learning_rate": 1.6383658497192646e-07, + "loss": 0.0002, + "step": 10093 + }, + { + "epoch": 4.592356687898089, + "grad_norm": 0.022453668418020135, + "learning_rate": 1.634738963394683e-07, + "loss": 0.0002, + "step": 10094 + }, + { + "epoch": 4.592811646951774, + "grad_norm": 0.08384841100925537, + "learning_rate": 1.6311160292608208e-07, + "loss": 0.0003, + "step": 10095 + }, + { + "epoch": 4.59326660600546, + "grad_norm": 0.018475629340727648, + "learning_rate": 1.627497047613724e-07, + "loss": 0.0001, + "step": 10096 + }, + { + "epoch": 4.593721565059145, + "grad_norm": 0.08583954796010979, + "learning_rate": 1.623882018749112e-07, + "loss": 0.0009, + "step": 10097 + }, + { + "epoch": 4.59417652411283, + "grad_norm": 0.15376879717192948, + "learning_rate": 1.6202709429624043e-07, + "loss": 0.0016, + "step": 10098 + }, + { + "epoch": 4.594631483166515, + "grad_norm": 0.0964049554766292, + "learning_rate": 1.616663820548675e-07, + "loss": 0.0011, + "step": 10099 + }, + { + "epoch": 4.5950864422202, + "grad_norm": 0.13252709056548337, + "learning_rate": 1.6130606518026725e-07, + "loss": 0.0021, + "step": 10100 + }, + { + "epoch": 4.595541401273885, + "grad_norm": 0.0038090229864312414, + "learning_rate": 1.60946143701885e-07, + "loss": 0.0, + "step": 10101 + }, + { + "epoch": 4.595996360327571, + "grad_norm": 0.07459162772236673, + "learning_rate": 1.6058661764912997e-07, + "loss": 0.0004, + "step": 10102 + }, + { + "epoch": 4.596451319381256, + "grad_norm": 0.26031320818769427, + "learning_rate": 1.6022748705138313e-07, + "loss": 0.0005, + "step": 10103 + }, + { + "epoch": 4.596906278434941, + "grad_norm": 0.16112833984187921, + "learning_rate": 1.5986875193798934e-07, + "loss": 0.0017, + "step": 10104 + }, + { + "epoch": 4.597361237488626, + "grad_norm": 0.10327430322060341, + "learning_rate": 1.5951041233826347e-07, + "loss": 0.0016, + "step": 10105 + }, + { + "epoch": 4.597816196542311, + "grad_norm": 0.0335273949812391, + "learning_rate": 1.5915246828148657e-07, + "loss": 0.0003, + "step": 10106 + }, + { + "epoch": 4.598271155595996, + "grad_norm": 0.02388105255093149, + "learning_rate": 1.5879491979690854e-07, + "loss": 0.0001, + "step": 10107 + }, + { + "epoch": 4.598726114649682, + "grad_norm": 0.21977463164474043, + "learning_rate": 1.5843776691374823e-07, + "loss": 0.0007, + "step": 10108 + }, + { + "epoch": 4.599181073703367, + "grad_norm": 0.016489540639126124, + "learning_rate": 1.5808100966118844e-07, + "loss": 0.0001, + "step": 10109 + }, + { + "epoch": 4.599636032757052, + "grad_norm": 0.00642479285954726, + "learning_rate": 1.577246480683836e-07, + "loss": 0.0, + "step": 10110 + }, + { + "epoch": 4.600090991810737, + "grad_norm": 0.03219413412893527, + "learning_rate": 1.5736868216445155e-07, + "loss": 0.0002, + "step": 10111 + }, + { + "epoch": 4.600545950864422, + "grad_norm": 0.019346077148516046, + "learning_rate": 1.570131119784829e-07, + "loss": 0.0001, + "step": 10112 + }, + { + "epoch": 4.601000909918107, + "grad_norm": 0.006218382792734654, + "learning_rate": 1.5665793753953162e-07, + "loss": 0.0, + "step": 10113 + }, + { + "epoch": 4.601455868971793, + "grad_norm": 0.015622294368870639, + "learning_rate": 1.5630315887662117e-07, + "loss": 0.0001, + "step": 10114 + }, + { + "epoch": 4.601910828025478, + "grad_norm": 0.017345444433933915, + "learning_rate": 1.559487760187428e-07, + "loss": 0.0002, + "step": 10115 + }, + { + "epoch": 4.6023657870791626, + "grad_norm": 0.2160180379150265, + "learning_rate": 1.5559478899485447e-07, + "loss": 0.0016, + "step": 10116 + }, + { + "epoch": 4.602820746132848, + "grad_norm": 0.0026341134473061, + "learning_rate": 1.552411978338836e-07, + "loss": 0.0, + "step": 10117 + }, + { + "epoch": 4.603275705186533, + "grad_norm": 0.01083704962400509, + "learning_rate": 1.5488800256472315e-07, + "loss": 0.0001, + "step": 10118 + }, + { + "epoch": 4.603730664240218, + "grad_norm": 0.15260011786840505, + "learning_rate": 1.545352032162345e-07, + "loss": 0.0028, + "step": 10119 + }, + { + "epoch": 4.604185623293904, + "grad_norm": 0.07914157037190421, + "learning_rate": 1.5418279981724683e-07, + "loss": 0.0005, + "step": 10120 + }, + { + "epoch": 4.604640582347589, + "grad_norm": 0.026299070051616653, + "learning_rate": 1.5383079239655762e-07, + "loss": 0.0002, + "step": 10121 + }, + { + "epoch": 4.6050955414012735, + "grad_norm": 0.11537894084000455, + "learning_rate": 1.5347918098293114e-07, + "loss": 0.0011, + "step": 10122 + }, + { + "epoch": 4.605550500454959, + "grad_norm": 0.12214011477653744, + "learning_rate": 1.5312796560509935e-07, + "loss": 0.0014, + "step": 10123 + }, + { + "epoch": 4.606005459508644, + "grad_norm": 0.034621923594996946, + "learning_rate": 1.5277714629176155e-07, + "loss": 0.0002, + "step": 10124 + }, + { + "epoch": 4.606460418562329, + "grad_norm": 0.006269139712969645, + "learning_rate": 1.5242672307158534e-07, + "loss": 0.0, + "step": 10125 + }, + { + "epoch": 4.606915377616015, + "grad_norm": 0.0763685326895611, + "learning_rate": 1.5207669597320618e-07, + "loss": 0.0004, + "step": 10126 + }, + { + "epoch": 4.6073703366697, + "grad_norm": 0.006566883940682684, + "learning_rate": 1.5172706502522672e-07, + "loss": 0.0, + "step": 10127 + }, + { + "epoch": 4.607825295723385, + "grad_norm": 0.016857567333840084, + "learning_rate": 1.5137783025621634e-07, + "loss": 0.0001, + "step": 10128 + }, + { + "epoch": 4.60828025477707, + "grad_norm": 0.07832595525597381, + "learning_rate": 1.5102899169471386e-07, + "loss": 0.0004, + "step": 10129 + }, + { + "epoch": 4.608735213830755, + "grad_norm": 0.007996000943571764, + "learning_rate": 1.5068054936922426e-07, + "loss": 0.0, + "step": 10130 + }, + { + "epoch": 4.609190172884441, + "grad_norm": 0.13135011514864436, + "learning_rate": 1.5033250330822036e-07, + "loss": 0.0038, + "step": 10131 + }, + { + "epoch": 4.609645131938126, + "grad_norm": 0.008050907150329823, + "learning_rate": 1.4998485354014436e-07, + "loss": 0.0, + "step": 10132 + }, + { + "epoch": 4.610100090991811, + "grad_norm": 0.025811960139408, + "learning_rate": 1.49637600093403e-07, + "loss": 0.0002, + "step": 10133 + }, + { + "epoch": 4.610555050045496, + "grad_norm": 0.13329781420957088, + "learning_rate": 1.492907429963736e-07, + "loss": 0.0005, + "step": 10134 + }, + { + "epoch": 4.611010009099181, + "grad_norm": 0.3968536018348747, + "learning_rate": 1.4894428227739787e-07, + "loss": 0.0021, + "step": 10135 + }, + { + "epoch": 4.611464968152866, + "grad_norm": 0.019323539051422454, + "learning_rate": 1.485982179647888e-07, + "loss": 0.0001, + "step": 10136 + }, + { + "epoch": 4.611919927206552, + "grad_norm": 0.05566346265320087, + "learning_rate": 1.4825255008682483e-07, + "loss": 0.0002, + "step": 10137 + }, + { + "epoch": 4.612374886260237, + "grad_norm": 0.0908920460291126, + "learning_rate": 1.4790727867175224e-07, + "loss": 0.0003, + "step": 10138 + }, + { + "epoch": 4.612829845313922, + "grad_norm": 0.06582488252393662, + "learning_rate": 1.4756240374778463e-07, + "loss": 0.0003, + "step": 10139 + }, + { + "epoch": 4.613284804367607, + "grad_norm": 0.007087593541376637, + "learning_rate": 1.4721792534310386e-07, + "loss": 0.0, + "step": 10140 + }, + { + "epoch": 4.613739763421292, + "grad_norm": 0.041696913325147585, + "learning_rate": 1.4687384348585964e-07, + "loss": 0.0004, + "step": 10141 + }, + { + "epoch": 4.614194722474977, + "grad_norm": 0.022933051631727406, + "learning_rate": 1.465301582041684e-07, + "loss": 0.0001, + "step": 10142 + }, + { + "epoch": 4.614649681528663, + "grad_norm": 1.303792613442042, + "learning_rate": 1.4618686952611428e-07, + "loss": 0.0118, + "step": 10143 + }, + { + "epoch": 4.615104640582348, + "grad_norm": 0.032707504441561695, + "learning_rate": 1.4584397747974987e-07, + "loss": 0.0001, + "step": 10144 + }, + { + "epoch": 4.615559599636033, + "grad_norm": 0.013022017326902556, + "learning_rate": 1.455014820930939e-07, + "loss": 0.0001, + "step": 10145 + }, + { + "epoch": 4.616014558689718, + "grad_norm": 0.07130696505473208, + "learning_rate": 1.4515938339413504e-07, + "loss": 0.0006, + "step": 10146 + }, + { + "epoch": 4.616469517743403, + "grad_norm": 0.07214436881185252, + "learning_rate": 1.4481768141082652e-07, + "loss": 0.0017, + "step": 10147 + }, + { + "epoch": 4.616924476797088, + "grad_norm": 0.1033760209287351, + "learning_rate": 1.4447637617109157e-07, + "loss": 0.0011, + "step": 10148 + }, + { + "epoch": 4.617379435850774, + "grad_norm": 0.010718819245754766, + "learning_rate": 1.4413546770281893e-07, + "loss": 0.0001, + "step": 10149 + }, + { + "epoch": 4.617834394904459, + "grad_norm": 0.1250026119179043, + "learning_rate": 1.437949560338675e-07, + "loss": 0.0004, + "step": 10150 + }, + { + "epoch": 4.6182893539581436, + "grad_norm": 0.013741319867024624, + "learning_rate": 1.4345484119206222e-07, + "loss": 0.0001, + "step": 10151 + }, + { + "epoch": 4.618744313011829, + "grad_norm": 0.09472888366923614, + "learning_rate": 1.4311512320519528e-07, + "loss": 0.0012, + "step": 10152 + }, + { + "epoch": 4.619199272065514, + "grad_norm": 0.08998392552959748, + "learning_rate": 1.427758021010267e-07, + "loss": 0.0003, + "step": 10153 + }, + { + "epoch": 4.619654231119199, + "grad_norm": 0.06160328013824963, + "learning_rate": 1.4243687790728433e-07, + "loss": 0.0005, + "step": 10154 + }, + { + "epoch": 4.620109190172885, + "grad_norm": 0.24342276091398451, + "learning_rate": 1.4209835065166433e-07, + "loss": 0.006, + "step": 10155 + }, + { + "epoch": 4.62056414922657, + "grad_norm": 0.14924390832608933, + "learning_rate": 1.417602203618279e-07, + "loss": 0.0053, + "step": 10156 + }, + { + "epoch": 4.6210191082802545, + "grad_norm": 0.011802511419363603, + "learning_rate": 1.4142248706540796e-07, + "loss": 0.0001, + "step": 10157 + }, + { + "epoch": 4.62147406733394, + "grad_norm": 0.14234054105551489, + "learning_rate": 1.4108515079000075e-07, + "loss": 0.0015, + "step": 10158 + }, + { + "epoch": 4.621929026387625, + "grad_norm": 0.029955309913570694, + "learning_rate": 1.4074821156317197e-07, + "loss": 0.0002, + "step": 10159 + }, + { + "epoch": 4.62238398544131, + "grad_norm": 0.049696155983342795, + "learning_rate": 1.4041166941245578e-07, + "loss": 0.0003, + "step": 10160 + }, + { + "epoch": 4.622838944494996, + "grad_norm": 0.00600299962862912, + "learning_rate": 1.4007552436535176e-07, + "loss": 0.0, + "step": 10161 + }, + { + "epoch": 4.623293903548681, + "grad_norm": 0.0095031794806623, + "learning_rate": 1.3973977644932913e-07, + "loss": 0.0001, + "step": 10162 + }, + { + "epoch": 4.6237488626023655, + "grad_norm": 0.3235915088642451, + "learning_rate": 1.3940442569182255e-07, + "loss": 0.0057, + "step": 10163 + }, + { + "epoch": 4.624203821656051, + "grad_norm": 0.010381106645430217, + "learning_rate": 1.3906947212023625e-07, + "loss": 0.0001, + "step": 10164 + }, + { + "epoch": 4.624658780709736, + "grad_norm": 0.24804601954697872, + "learning_rate": 1.3873491576194165e-07, + "loss": 0.0061, + "step": 10165 + }, + { + "epoch": 4.625113739763421, + "grad_norm": 0.40596109806754843, + "learning_rate": 1.3840075664427577e-07, + "loss": 0.0047, + "step": 10166 + }, + { + "epoch": 4.625568698817107, + "grad_norm": 0.05214450095762541, + "learning_rate": 1.3806699479454567e-07, + "loss": 0.0006, + "step": 10167 + }, + { + "epoch": 4.626023657870792, + "grad_norm": 0.033912337877299234, + "learning_rate": 1.377336302400245e-07, + "loss": 0.0002, + "step": 10168 + }, + { + "epoch": 4.6264786169244765, + "grad_norm": 0.13646254961781484, + "learning_rate": 1.3740066300795274e-07, + "loss": 0.0019, + "step": 10169 + }, + { + "epoch": 4.626933575978162, + "grad_norm": 0.03438743028185538, + "learning_rate": 1.3706809312553914e-07, + "loss": 0.0002, + "step": 10170 + }, + { + "epoch": 4.627388535031847, + "grad_norm": 0.013365391720777744, + "learning_rate": 1.3673592061996087e-07, + "loss": 0.0001, + "step": 10171 + }, + { + "epoch": 4.627843494085532, + "grad_norm": 0.2200304686770983, + "learning_rate": 1.3640414551836122e-07, + "loss": 0.0046, + "step": 10172 + }, + { + "epoch": 4.628298453139218, + "grad_norm": 0.009235076324389713, + "learning_rate": 1.3607276784785074e-07, + "loss": 0.0001, + "step": 10173 + }, + { + "epoch": 4.628753412192903, + "grad_norm": 0.10847971104135985, + "learning_rate": 1.3574178763550772e-07, + "loss": 0.0009, + "step": 10174 + }, + { + "epoch": 4.6292083712465875, + "grad_norm": 0.044590173955268986, + "learning_rate": 1.3541120490837943e-07, + "loss": 0.0002, + "step": 10175 + }, + { + "epoch": 4.629663330300273, + "grad_norm": 0.02157787933266301, + "learning_rate": 1.3508101969347986e-07, + "loss": 0.0001, + "step": 10176 + }, + { + "epoch": 4.630118289353958, + "grad_norm": 0.561859082001676, + "learning_rate": 1.347512320177885e-07, + "loss": 0.0027, + "step": 10177 + }, + { + "epoch": 4.630573248407643, + "grad_norm": 0.008779433087511678, + "learning_rate": 1.3442184190825547e-07, + "loss": 0.0001, + "step": 10178 + }, + { + "epoch": 4.631028207461329, + "grad_norm": 0.034840075704382495, + "learning_rate": 1.34092849391797e-07, + "loss": 0.0002, + "step": 10179 + }, + { + "epoch": 4.631483166515014, + "grad_norm": 0.019750917666495425, + "learning_rate": 1.3376425449529661e-07, + "loss": 0.0001, + "step": 10180 + }, + { + "epoch": 4.631938125568698, + "grad_norm": 0.28808741898602425, + "learning_rate": 1.334360572456056e-07, + "loss": 0.0017, + "step": 10181 + }, + { + "epoch": 4.632393084622384, + "grad_norm": 0.05012937701671048, + "learning_rate": 1.3310825766954305e-07, + "loss": 0.001, + "step": 10182 + }, + { + "epoch": 4.632848043676069, + "grad_norm": 0.010303042243101506, + "learning_rate": 1.327808557938942e-07, + "loss": 0.0001, + "step": 10183 + }, + { + "epoch": 4.633303002729754, + "grad_norm": 0.04690058144319214, + "learning_rate": 1.324538516454138e-07, + "loss": 0.0002, + "step": 10184 + }, + { + "epoch": 4.63375796178344, + "grad_norm": 0.18241458562307544, + "learning_rate": 1.3212724525082376e-07, + "loss": 0.002, + "step": 10185 + }, + { + "epoch": 4.6342129208371245, + "grad_norm": 0.018623052885392427, + "learning_rate": 1.3180103663681165e-07, + "loss": 0.0001, + "step": 10186 + }, + { + "epoch": 4.634667879890809, + "grad_norm": 0.008191792748821817, + "learning_rate": 1.3147522583003448e-07, + "loss": 0.0, + "step": 10187 + }, + { + "epoch": 4.635122838944495, + "grad_norm": 0.10315490954016021, + "learning_rate": 1.3114981285711538e-07, + "loss": 0.0013, + "step": 10188 + }, + { + "epoch": 4.63557779799818, + "grad_norm": 0.03132705029902822, + "learning_rate": 1.308247977446464e-07, + "loss": 0.0002, + "step": 10189 + }, + { + "epoch": 4.636032757051865, + "grad_norm": 0.06235343469483827, + "learning_rate": 1.3050018051918578e-07, + "loss": 0.0006, + "step": 10190 + }, + { + "epoch": 4.636487716105551, + "grad_norm": 0.009128196767433108, + "learning_rate": 1.3017596120725952e-07, + "loss": 0.0, + "step": 10191 + }, + { + "epoch": 4.6369426751592355, + "grad_norm": 0.22046398257934455, + "learning_rate": 1.298521398353625e-07, + "loss": 0.0013, + "step": 10192 + }, + { + "epoch": 4.63739763421292, + "grad_norm": 0.333970918208788, + "learning_rate": 1.295287164299547e-07, + "loss": 0.001, + "step": 10193 + }, + { + "epoch": 4.637852593266606, + "grad_norm": 0.09685445479325029, + "learning_rate": 1.292056910174655e-07, + "loss": 0.0006, + "step": 10194 + }, + { + "epoch": 4.638307552320291, + "grad_norm": 0.01494019227754789, + "learning_rate": 1.288830636242916e-07, + "loss": 0.0001, + "step": 10195 + }, + { + "epoch": 4.638762511373976, + "grad_norm": 0.17916318720598856, + "learning_rate": 1.2856083427679522e-07, + "loss": 0.0009, + "step": 10196 + }, + { + "epoch": 4.639217470427662, + "grad_norm": 0.1609286785384726, + "learning_rate": 1.2823900300130808e-07, + "loss": 0.0012, + "step": 10197 + }, + { + "epoch": 4.6396724294813465, + "grad_norm": 0.10144835259228799, + "learning_rate": 1.2791756982412917e-07, + "loss": 0.0005, + "step": 10198 + }, + { + "epoch": 4.640127388535031, + "grad_norm": 0.060658527836725366, + "learning_rate": 1.2759653477152412e-07, + "loss": 0.0003, + "step": 10199 + }, + { + "epoch": 4.640582347588717, + "grad_norm": 0.0227548642896754, + "learning_rate": 1.272758978697275e-07, + "loss": 0.0002, + "step": 10200 + }, + { + "epoch": 4.641037306642402, + "grad_norm": 0.015299341637384929, + "learning_rate": 1.269556591449389e-07, + "loss": 0.0001, + "step": 10201 + }, + { + "epoch": 4.641492265696087, + "grad_norm": 0.06898131151561372, + "learning_rate": 1.2663581862332741e-07, + "loss": 0.0005, + "step": 10202 + }, + { + "epoch": 4.641947224749773, + "grad_norm": 0.005027214953384058, + "learning_rate": 1.2631637633102878e-07, + "loss": 0.0, + "step": 10203 + }, + { + "epoch": 4.6424021838034575, + "grad_norm": 0.006431894331208697, + "learning_rate": 1.2599733229414656e-07, + "loss": 0.0, + "step": 10204 + }, + { + "epoch": 4.642857142857143, + "grad_norm": 0.012310336207182297, + "learning_rate": 1.256786865387516e-07, + "loss": 0.0, + "step": 10205 + }, + { + "epoch": 4.643312101910828, + "grad_norm": 0.03426431921104071, + "learning_rate": 1.253604390908819e-07, + "loss": 0.0005, + "step": 10206 + }, + { + "epoch": 4.643767060964513, + "grad_norm": 0.033299214981028294, + "learning_rate": 1.2504258997654395e-07, + "loss": 0.0001, + "step": 10207 + }, + { + "epoch": 4.644222020018199, + "grad_norm": 0.11587346777721734, + "learning_rate": 1.247251392217097e-07, + "loss": 0.0007, + "step": 10208 + }, + { + "epoch": 4.644676979071884, + "grad_norm": 0.04997306509282845, + "learning_rate": 1.244080868523212e-07, + "loss": 0.0003, + "step": 10209 + }, + { + "epoch": 4.6451319381255685, + "grad_norm": 0.09826598425246098, + "learning_rate": 1.2409143289428606e-07, + "loss": 0.001, + "step": 10210 + }, + { + "epoch": 4.645586897179254, + "grad_norm": 0.06985262563333702, + "learning_rate": 1.237751773734791e-07, + "loss": 0.0005, + "step": 10211 + }, + { + "epoch": 4.646041856232939, + "grad_norm": 0.001415377924834579, + "learning_rate": 1.234593203157436e-07, + "loss": 0.0, + "step": 10212 + }, + { + "epoch": 4.646496815286624, + "grad_norm": 0.08676187334445322, + "learning_rate": 1.2314386174689052e-07, + "loss": 0.0007, + "step": 10213 + }, + { + "epoch": 4.64695177434031, + "grad_norm": 0.08094879529359549, + "learning_rate": 1.2282880169269707e-07, + "loss": 0.0006, + "step": 10214 + }, + { + "epoch": 4.647406733393995, + "grad_norm": 0.08958867163380105, + "learning_rate": 1.2251414017890928e-07, + "loss": 0.0004, + "step": 10215 + }, + { + "epoch": 4.647861692447679, + "grad_norm": 0.03489213435693581, + "learning_rate": 1.2219987723123939e-07, + "loss": 0.0001, + "step": 10216 + }, + { + "epoch": 4.648316651501365, + "grad_norm": 0.04566919553045858, + "learning_rate": 1.218860128753674e-07, + "loss": 0.0003, + "step": 10217 + }, + { + "epoch": 4.64877161055505, + "grad_norm": 0.225069329146852, + "learning_rate": 1.215725471369411e-07, + "loss": 0.0028, + "step": 10218 + }, + { + "epoch": 4.649226569608735, + "grad_norm": 0.041881852562165846, + "learning_rate": 1.2125948004157506e-07, + "loss": 0.0002, + "step": 10219 + }, + { + "epoch": 4.649681528662421, + "grad_norm": 0.008070887556896545, + "learning_rate": 1.2094681161485267e-07, + "loss": 0.0001, + "step": 10220 + }, + { + "epoch": 4.6501364877161055, + "grad_norm": 0.01901146623798444, + "learning_rate": 1.206345418823235e-07, + "loss": 0.0001, + "step": 10221 + }, + { + "epoch": 4.65059144676979, + "grad_norm": 0.36257880272150783, + "learning_rate": 1.2032267086950378e-07, + "loss": 0.0107, + "step": 10222 + }, + { + "epoch": 4.651046405823476, + "grad_norm": 0.34285106672170484, + "learning_rate": 1.2001119860187928e-07, + "loss": 0.0039, + "step": 10223 + }, + { + "epoch": 4.651501364877161, + "grad_norm": 0.004741663391622923, + "learning_rate": 1.197001251049018e-07, + "loss": 0.0, + "step": 10224 + }, + { + "epoch": 4.651956323930846, + "grad_norm": 0.18874095547053066, + "learning_rate": 1.1938945040399107e-07, + "loss": 0.0013, + "step": 10225 + }, + { + "epoch": 4.652411282984532, + "grad_norm": 0.1467383327015633, + "learning_rate": 1.1907917452453344e-07, + "loss": 0.0024, + "step": 10226 + }, + { + "epoch": 4.6528662420382165, + "grad_norm": 0.0624467326070796, + "learning_rate": 1.187692974918836e-07, + "loss": 0.0004, + "step": 10227 + }, + { + "epoch": 4.653321201091901, + "grad_norm": 0.11143894385287034, + "learning_rate": 1.1845981933136352e-07, + "loss": 0.0003, + "step": 10228 + }, + { + "epoch": 4.653776160145587, + "grad_norm": 0.20219272049674344, + "learning_rate": 1.1815074006826243e-07, + "loss": 0.001, + "step": 10229 + }, + { + "epoch": 4.654231119199272, + "grad_norm": 0.05480471236745072, + "learning_rate": 1.178420597278368e-07, + "loss": 0.0005, + "step": 10230 + }, + { + "epoch": 4.654686078252957, + "grad_norm": 0.4792014577162499, + "learning_rate": 1.1753377833530922e-07, + "loss": 0.0037, + "step": 10231 + }, + { + "epoch": 4.655141037306643, + "grad_norm": 0.03812148473216996, + "learning_rate": 1.1722589591587342e-07, + "loss": 0.0002, + "step": 10232 + }, + { + "epoch": 4.6555959963603275, + "grad_norm": 0.059543807501753955, + "learning_rate": 1.169184124946865e-07, + "loss": 0.0005, + "step": 10233 + }, + { + "epoch": 4.656050955414012, + "grad_norm": 0.05482593952747607, + "learning_rate": 1.1661132809687504e-07, + "loss": 0.0002, + "step": 10234 + }, + { + "epoch": 4.656505914467698, + "grad_norm": 0.0259659490851893, + "learning_rate": 1.1630464274753284e-07, + "loss": 0.0002, + "step": 10235 + }, + { + "epoch": 4.656960873521383, + "grad_norm": 0.0027512639589397808, + "learning_rate": 1.1599835647172042e-07, + "loss": 0.0, + "step": 10236 + }, + { + "epoch": 4.657415832575068, + "grad_norm": 0.05746948692003079, + "learning_rate": 1.1569246929446665e-07, + "loss": 0.0005, + "step": 10237 + }, + { + "epoch": 4.657870791628754, + "grad_norm": 0.09449271092010088, + "learning_rate": 1.1538698124076709e-07, + "loss": 0.0005, + "step": 10238 + }, + { + "epoch": 4.6583257506824385, + "grad_norm": 0.0814308319838571, + "learning_rate": 1.1508189233558453e-07, + "loss": 0.0011, + "step": 10239 + }, + { + "epoch": 4.658780709736124, + "grad_norm": 0.1277713641581946, + "learning_rate": 1.1477720260384962e-07, + "loss": 0.0008, + "step": 10240 + }, + { + "epoch": 4.659235668789809, + "grad_norm": 0.012715468486772056, + "learning_rate": 1.1447291207046019e-07, + "loss": 0.0, + "step": 10241 + }, + { + "epoch": 4.659690627843494, + "grad_norm": 0.015274381293408386, + "learning_rate": 1.1416902076028136e-07, + "loss": 0.0001, + "step": 10242 + }, + { + "epoch": 4.66014558689718, + "grad_norm": 0.11634578067259214, + "learning_rate": 1.1386552869814604e-07, + "loss": 0.0015, + "step": 10243 + }, + { + "epoch": 4.660600545950865, + "grad_norm": 0.03943422738373436, + "learning_rate": 1.1356243590885441e-07, + "loss": 0.0003, + "step": 10244 + }, + { + "epoch": 4.6610555050045495, + "grad_norm": 0.0076796104732176846, + "learning_rate": 1.1325974241717385e-07, + "loss": 0.0, + "step": 10245 + }, + { + "epoch": 4.661510464058235, + "grad_norm": 0.1559827480698999, + "learning_rate": 1.1295744824783794e-07, + "loss": 0.0027, + "step": 10246 + }, + { + "epoch": 4.66196542311192, + "grad_norm": 0.04058144076867448, + "learning_rate": 1.1265555342554968e-07, + "loss": 0.0003, + "step": 10247 + }, + { + "epoch": 4.662420382165605, + "grad_norm": 0.08963031140961897, + "learning_rate": 1.1235405797497933e-07, + "loss": 0.0006, + "step": 10248 + }, + { + "epoch": 4.662875341219291, + "grad_norm": 0.03604198077039828, + "learning_rate": 1.1205296192076275e-07, + "loss": 0.0003, + "step": 10249 + }, + { + "epoch": 4.663330300272976, + "grad_norm": 0.036854331060587636, + "learning_rate": 1.1175226528750416e-07, + "loss": 0.0003, + "step": 10250 + }, + { + "epoch": 4.66378525932666, + "grad_norm": 0.003971254287974712, + "learning_rate": 1.1145196809977499e-07, + "loss": 0.0, + "step": 10251 + }, + { + "epoch": 4.664240218380346, + "grad_norm": 0.12978544118659024, + "learning_rate": 1.1115207038211507e-07, + "loss": 0.0009, + "step": 10252 + }, + { + "epoch": 4.664695177434031, + "grad_norm": 0.15588645365758752, + "learning_rate": 1.1085257215902978e-07, + "loss": 0.0019, + "step": 10253 + }, + { + "epoch": 4.665150136487716, + "grad_norm": 0.004813458787314431, + "learning_rate": 1.105534734549929e-07, + "loss": 0.0, + "step": 10254 + }, + { + "epoch": 4.665605095541402, + "grad_norm": 0.04465227128931435, + "learning_rate": 1.1025477429444598e-07, + "loss": 0.0004, + "step": 10255 + }, + { + "epoch": 4.6660600545950865, + "grad_norm": 0.024452297518496233, + "learning_rate": 1.0995647470179672e-07, + "loss": 0.0002, + "step": 10256 + }, + { + "epoch": 4.666515013648771, + "grad_norm": 0.014451426336303646, + "learning_rate": 1.0965857470142172e-07, + "loss": 0.0001, + "step": 10257 + }, + { + "epoch": 4.666969972702457, + "grad_norm": 0.0044332680286918264, + "learning_rate": 1.0936107431766319e-07, + "loss": 0.0, + "step": 10258 + }, + { + "epoch": 4.667424931756142, + "grad_norm": 0.05971002350169554, + "learning_rate": 1.0906397357483167e-07, + "loss": 0.0008, + "step": 10259 + }, + { + "epoch": 4.667879890809827, + "grad_norm": 0.007070738488226248, + "learning_rate": 1.0876727249720443e-07, + "loss": 0.0001, + "step": 10260 + }, + { + "epoch": 4.668334849863513, + "grad_norm": 0.014440176375710932, + "learning_rate": 1.0847097110902704e-07, + "loss": 0.0001, + "step": 10261 + }, + { + "epoch": 4.6687898089171975, + "grad_norm": 0.029342900994430036, + "learning_rate": 1.0817506943451239e-07, + "loss": 0.0001, + "step": 10262 + }, + { + "epoch": 4.669244767970882, + "grad_norm": 0.032939897933142996, + "learning_rate": 1.0787956749784002e-07, + "loss": 0.0003, + "step": 10263 + }, + { + "epoch": 4.669699727024568, + "grad_norm": 0.08960649802277025, + "learning_rate": 1.075844653231567e-07, + "loss": 0.0007, + "step": 10264 + }, + { + "epoch": 4.670154686078253, + "grad_norm": 0.0073237852704496455, + "learning_rate": 1.0728976293457649e-07, + "loss": 0.0001, + "step": 10265 + }, + { + "epoch": 4.670609645131938, + "grad_norm": 0.05709982276190452, + "learning_rate": 1.069954603561818e-07, + "loss": 0.0006, + "step": 10266 + }, + { + "epoch": 4.671064604185624, + "grad_norm": 0.12135944219803936, + "learning_rate": 1.0670155761202172e-07, + "loss": 0.0004, + "step": 10267 + }, + { + "epoch": 4.6715195632393085, + "grad_norm": 0.05891484746184756, + "learning_rate": 1.0640805472611204e-07, + "loss": 0.0002, + "step": 10268 + }, + { + "epoch": 4.671974522292993, + "grad_norm": 0.1087781571722819, + "learning_rate": 1.061149517224369e-07, + "loss": 0.0013, + "step": 10269 + }, + { + "epoch": 4.672429481346679, + "grad_norm": 0.07540830259997695, + "learning_rate": 1.0582224862494716e-07, + "loss": 0.0004, + "step": 10270 + }, + { + "epoch": 4.672884440400364, + "grad_norm": 0.07659004129922979, + "learning_rate": 1.0552994545756201e-07, + "loss": 0.0003, + "step": 10271 + }, + { + "epoch": 4.673339399454049, + "grad_norm": 0.006056208404833988, + "learning_rate": 1.0523804224416623e-07, + "loss": 0.0, + "step": 10272 + }, + { + "epoch": 4.673794358507735, + "grad_norm": 0.05847826235197548, + "learning_rate": 1.0494653900861296e-07, + "loss": 0.0006, + "step": 10273 + }, + { + "epoch": 4.6742493175614195, + "grad_norm": 0.059442494775477576, + "learning_rate": 1.0465543577472314e-07, + "loss": 0.0005, + "step": 10274 + }, + { + "epoch": 4.674704276615104, + "grad_norm": 0.022137740314037393, + "learning_rate": 1.0436473256628277e-07, + "loss": 0.0001, + "step": 10275 + }, + { + "epoch": 4.67515923566879, + "grad_norm": 0.07105261175453277, + "learning_rate": 1.0407442940704837e-07, + "loss": 0.0006, + "step": 10276 + }, + { + "epoch": 4.675614194722475, + "grad_norm": 0.09515882674468688, + "learning_rate": 1.037845263207421e-07, + "loss": 0.0012, + "step": 10277 + }, + { + "epoch": 4.67606915377616, + "grad_norm": 0.010268336694827337, + "learning_rate": 1.0349502333105333e-07, + "loss": 0.0001, + "step": 10278 + }, + { + "epoch": 4.676524112829846, + "grad_norm": 0.014583361848813897, + "learning_rate": 1.0320592046163925e-07, + "loss": 0.0001, + "step": 10279 + }, + { + "epoch": 4.6769790718835305, + "grad_norm": 0.024517182114238003, + "learning_rate": 1.0291721773612263e-07, + "loss": 0.0001, + "step": 10280 + }, + { + "epoch": 4.677434030937215, + "grad_norm": 0.13133312676613232, + "learning_rate": 1.0262891517809626e-07, + "loss": 0.0016, + "step": 10281 + }, + { + "epoch": 4.677888989990901, + "grad_norm": 0.05639198907159649, + "learning_rate": 1.0234101281111852e-07, + "loss": 0.0003, + "step": 10282 + }, + { + "epoch": 4.678343949044586, + "grad_norm": 0.11769120137609002, + "learning_rate": 1.0205351065871615e-07, + "loss": 0.001, + "step": 10283 + }, + { + "epoch": 4.678798908098271, + "grad_norm": 0.11845125324173367, + "learning_rate": 1.0176640874438148e-07, + "loss": 0.0029, + "step": 10284 + }, + { + "epoch": 4.679253867151957, + "grad_norm": 0.09705450154745682, + "learning_rate": 1.0147970709157573e-07, + "loss": 0.0005, + "step": 10285 + }, + { + "epoch": 4.679708826205641, + "grad_norm": 0.048168920994499494, + "learning_rate": 1.0119340572372683e-07, + "loss": 0.0003, + "step": 10286 + }, + { + "epoch": 4.680163785259326, + "grad_norm": 0.009505128742610303, + "learning_rate": 1.0090750466422994e-07, + "loss": 0.0, + "step": 10287 + }, + { + "epoch": 4.680618744313012, + "grad_norm": 0.2689474288340439, + "learning_rate": 1.0062200393644806e-07, + "loss": 0.0017, + "step": 10288 + }, + { + "epoch": 4.681073703366697, + "grad_norm": 0.43896843966617727, + "learning_rate": 1.0033690356370973e-07, + "loss": 0.0017, + "step": 10289 + }, + { + "epoch": 4.681528662420382, + "grad_norm": 0.012701161599689375, + "learning_rate": 1.0005220356931355e-07, + "loss": 0.0001, + "step": 10290 + }, + { + "epoch": 4.6819836214740675, + "grad_norm": 0.050418455025712544, + "learning_rate": 9.976790397652314e-08, + "loss": 0.0001, + "step": 10291 + }, + { + "epoch": 4.682438580527752, + "grad_norm": 0.017990743852292785, + "learning_rate": 9.948400480857101e-08, + "loss": 0.0001, + "step": 10292 + }, + { + "epoch": 4.682893539581437, + "grad_norm": 0.00863934575465124, + "learning_rate": 9.920050608865473e-08, + "loss": 0.0, + "step": 10293 + }, + { + "epoch": 4.683348498635123, + "grad_norm": 0.006880755402871023, + "learning_rate": 9.89174078399413e-08, + "loss": 0.0, + "step": 10294 + }, + { + "epoch": 4.683803457688808, + "grad_norm": 0.03217173446263907, + "learning_rate": 9.863471008556447e-08, + "loss": 0.0001, + "step": 10295 + }, + { + "epoch": 4.684258416742493, + "grad_norm": 0.011075899015750672, + "learning_rate": 9.835241284862462e-08, + "loss": 0.0001, + "step": 10296 + }, + { + "epoch": 4.6847133757961785, + "grad_norm": 0.0816900350939216, + "learning_rate": 9.807051615218999e-08, + "loss": 0.0017, + "step": 10297 + }, + { + "epoch": 4.685168334849863, + "grad_norm": 0.2035813405958449, + "learning_rate": 9.778902001929602e-08, + "loss": 0.0025, + "step": 10298 + }, + { + "epoch": 4.685623293903548, + "grad_norm": 0.18314774013053128, + "learning_rate": 9.750792447294489e-08, + "loss": 0.0005, + "step": 10299 + }, + { + "epoch": 4.686078252957234, + "grad_norm": 0.009352875103096634, + "learning_rate": 9.722722953610708e-08, + "loss": 0.0001, + "step": 10300 + }, + { + "epoch": 4.686533212010919, + "grad_norm": 0.02418393477665069, + "learning_rate": 9.694693523171927e-08, + "loss": 0.0001, + "step": 10301 + }, + { + "epoch": 4.686988171064604, + "grad_norm": 0.051276069106874204, + "learning_rate": 9.666704158268592e-08, + "loss": 0.0002, + "step": 10302 + }, + { + "epoch": 4.6874431301182895, + "grad_norm": 0.009813481523625078, + "learning_rate": 9.638754861187816e-08, + "loss": 0.0001, + "step": 10303 + }, + { + "epoch": 4.687898089171974, + "grad_norm": 0.03243343470503502, + "learning_rate": 9.610845634213551e-08, + "loss": 0.0001, + "step": 10304 + }, + { + "epoch": 4.688353048225659, + "grad_norm": 0.10791818056928296, + "learning_rate": 9.582976479626471e-08, + "loss": 0.0011, + "step": 10305 + }, + { + "epoch": 4.688808007279345, + "grad_norm": 0.01373580111942201, + "learning_rate": 9.555147399703813e-08, + "loss": 0.0001, + "step": 10306 + }, + { + "epoch": 4.68926296633303, + "grad_norm": 0.01609801415912017, + "learning_rate": 9.527358396719699e-08, + "loss": 0.0001, + "step": 10307 + }, + { + "epoch": 4.689717925386715, + "grad_norm": 0.05467212082069933, + "learning_rate": 9.49960947294487e-08, + "loss": 0.0004, + "step": 10308 + }, + { + "epoch": 4.6901728844404005, + "grad_norm": 0.10471969069327443, + "learning_rate": 9.471900630646847e-08, + "loss": 0.0002, + "step": 10309 + }, + { + "epoch": 4.690627843494085, + "grad_norm": 0.010687898036433899, + "learning_rate": 9.444231872089927e-08, + "loss": 0.0001, + "step": 10310 + }, + { + "epoch": 4.69108280254777, + "grad_norm": 0.016634716213867702, + "learning_rate": 9.416603199535079e-08, + "loss": 0.0001, + "step": 10311 + }, + { + "epoch": 4.691537761601456, + "grad_norm": 0.009502206992849859, + "learning_rate": 9.389014615239944e-08, + "loss": 0.0, + "step": 10312 + }, + { + "epoch": 4.691992720655141, + "grad_norm": 0.12690626687443035, + "learning_rate": 9.36146612145894e-08, + "loss": 0.0013, + "step": 10313 + }, + { + "epoch": 4.692447679708827, + "grad_norm": 0.08252042132018089, + "learning_rate": 9.333957720443209e-08, + "loss": 0.0007, + "step": 10314 + }, + { + "epoch": 4.6929026387625115, + "grad_norm": 0.10958263836203827, + "learning_rate": 9.306489414440678e-08, + "loss": 0.0017, + "step": 10315 + }, + { + "epoch": 4.693357597816196, + "grad_norm": 0.13190084520488968, + "learning_rate": 9.279061205695828e-08, + "loss": 0.0006, + "step": 10316 + }, + { + "epoch": 4.693812556869882, + "grad_norm": 0.033410268197950864, + "learning_rate": 9.251673096450032e-08, + "loss": 0.0003, + "step": 10317 + }, + { + "epoch": 4.694267515923567, + "grad_norm": 0.2418523170102415, + "learning_rate": 9.224325088941332e-08, + "loss": 0.0008, + "step": 10318 + }, + { + "epoch": 4.694722474977252, + "grad_norm": 0.15226320856554007, + "learning_rate": 9.197017185404444e-08, + "loss": 0.0008, + "step": 10319 + }, + { + "epoch": 4.695177434030938, + "grad_norm": 0.032085887105710253, + "learning_rate": 9.169749388070859e-08, + "loss": 0.0001, + "step": 10320 + }, + { + "epoch": 4.695632393084622, + "grad_norm": 0.2769150562142887, + "learning_rate": 9.142521699168794e-08, + "loss": 0.006, + "step": 10321 + }, + { + "epoch": 4.696087352138307, + "grad_norm": 0.14071008323329443, + "learning_rate": 9.115334120923191e-08, + "loss": 0.0003, + "step": 10322 + }, + { + "epoch": 4.696542311191993, + "grad_norm": 0.08100280793718856, + "learning_rate": 9.088186655555608e-08, + "loss": 0.0009, + "step": 10323 + }, + { + "epoch": 4.696997270245678, + "grad_norm": 0.2314546957753362, + "learning_rate": 9.061079305284492e-08, + "loss": 0.0037, + "step": 10324 + }, + { + "epoch": 4.697452229299363, + "grad_norm": 0.010094699283373984, + "learning_rate": 9.03401207232496e-08, + "loss": 0.0, + "step": 10325 + }, + { + "epoch": 4.6979071883530485, + "grad_norm": 0.28837022294452597, + "learning_rate": 9.006984958888742e-08, + "loss": 0.0052, + "step": 10326 + }, + { + "epoch": 4.698362147406733, + "grad_norm": 0.04588712277186774, + "learning_rate": 8.979997967184462e-08, + "loss": 0.0002, + "step": 10327 + }, + { + "epoch": 4.698817106460418, + "grad_norm": 0.05293351730519668, + "learning_rate": 8.953051099417242e-08, + "loss": 0.0004, + "step": 10328 + }, + { + "epoch": 4.699272065514104, + "grad_norm": 0.04172704486171423, + "learning_rate": 8.926144357789158e-08, + "loss": 0.0003, + "step": 10329 + }, + { + "epoch": 4.699727024567789, + "grad_norm": 0.017897674231096176, + "learning_rate": 8.899277744498891e-08, + "loss": 0.0001, + "step": 10330 + }, + { + "epoch": 4.700181983621474, + "grad_norm": 0.058405896279421984, + "learning_rate": 8.872451261741854e-08, + "loss": 0.0006, + "step": 10331 + }, + { + "epoch": 4.7006369426751595, + "grad_norm": 0.09739571362233164, + "learning_rate": 8.845664911710239e-08, + "loss": 0.0004, + "step": 10332 + }, + { + "epoch": 4.701091901728844, + "grad_norm": 0.10881848118936635, + "learning_rate": 8.818918696592737e-08, + "loss": 0.0008, + "step": 10333 + }, + { + "epoch": 4.701546860782529, + "grad_norm": 0.08470917904403007, + "learning_rate": 8.792212618575158e-08, + "loss": 0.0008, + "step": 10334 + }, + { + "epoch": 4.702001819836215, + "grad_norm": 0.03494299040115534, + "learning_rate": 8.765546679839643e-08, + "loss": 0.0001, + "step": 10335 + }, + { + "epoch": 4.7024567788899, + "grad_norm": 0.007312358652135778, + "learning_rate": 8.738920882565283e-08, + "loss": 0.0, + "step": 10336 + }, + { + "epoch": 4.702911737943585, + "grad_norm": 0.029885048881680767, + "learning_rate": 8.712335228927782e-08, + "loss": 0.0002, + "step": 10337 + }, + { + "epoch": 4.7033666969972705, + "grad_norm": 0.056061374928798396, + "learning_rate": 8.685789721099569e-08, + "loss": 0.0005, + "step": 10338 + }, + { + "epoch": 4.703821656050955, + "grad_norm": 0.06543708507156024, + "learning_rate": 8.659284361249909e-08, + "loss": 0.0005, + "step": 10339 + }, + { + "epoch": 4.70427661510464, + "grad_norm": 0.10127908135566865, + "learning_rate": 8.632819151544681e-08, + "loss": 0.0007, + "step": 10340 + }, + { + "epoch": 4.704731574158326, + "grad_norm": 0.018628976793558953, + "learning_rate": 8.606394094146431e-08, + "loss": 0.0001, + "step": 10341 + }, + { + "epoch": 4.705186533212011, + "grad_norm": 0.028999291563235746, + "learning_rate": 8.580009191214544e-08, + "loss": 0.0003, + "step": 10342 + }, + { + "epoch": 4.705641492265696, + "grad_norm": 0.03454009066266616, + "learning_rate": 8.553664444905074e-08, + "loss": 0.0002, + "step": 10343 + }, + { + "epoch": 4.7060964513193815, + "grad_norm": 0.0578044670367911, + "learning_rate": 8.527359857370799e-08, + "loss": 0.0004, + "step": 10344 + }, + { + "epoch": 4.706551410373066, + "grad_norm": 0.1353017322841546, + "learning_rate": 8.501095430761219e-08, + "loss": 0.0015, + "step": 10345 + }, + { + "epoch": 4.707006369426751, + "grad_norm": 0.019850752998643802, + "learning_rate": 8.474871167222509e-08, + "loss": 0.0002, + "step": 10346 + }, + { + "epoch": 4.707461328480437, + "grad_norm": 0.046681506511303344, + "learning_rate": 8.448687068897676e-08, + "loss": 0.0003, + "step": 10347 + }, + { + "epoch": 4.707916287534122, + "grad_norm": 0.0048335019185104236, + "learning_rate": 8.422543137926231e-08, + "loss": 0.0, + "step": 10348 + }, + { + "epoch": 4.708371246587808, + "grad_norm": 0.018965589136176502, + "learning_rate": 8.396439376444631e-08, + "loss": 0.0002, + "step": 10349 + }, + { + "epoch": 4.7088262056414925, + "grad_norm": 0.00948008505034407, + "learning_rate": 8.370375786586005e-08, + "loss": 0.0, + "step": 10350 + }, + { + "epoch": 4.709281164695177, + "grad_norm": 0.06281868868250738, + "learning_rate": 8.344352370480036e-08, + "loss": 0.0006, + "step": 10351 + }, + { + "epoch": 4.709736123748863, + "grad_norm": 0.05066419694732638, + "learning_rate": 8.318369130253301e-08, + "loss": 0.0006, + "step": 10352 + }, + { + "epoch": 4.710191082802548, + "grad_norm": 0.16733808203378805, + "learning_rate": 8.292426068028992e-08, + "loss": 0.0007, + "step": 10353 + }, + { + "epoch": 4.710646041856233, + "grad_norm": 0.19217003206127167, + "learning_rate": 8.266523185927134e-08, + "loss": 0.0032, + "step": 10354 + }, + { + "epoch": 4.711101000909919, + "grad_norm": 0.045897348967784274, + "learning_rate": 8.240660486064367e-08, + "loss": 0.0002, + "step": 10355 + }, + { + "epoch": 4.711555959963603, + "grad_norm": 0.018638677900911078, + "learning_rate": 8.214837970554057e-08, + "loss": 0.0001, + "step": 10356 + }, + { + "epoch": 4.712010919017288, + "grad_norm": 0.06399199253328469, + "learning_rate": 8.189055641506293e-08, + "loss": 0.0002, + "step": 10357 + }, + { + "epoch": 4.712465878070974, + "grad_norm": 0.27939902481595374, + "learning_rate": 8.163313501027892e-08, + "loss": 0.005, + "step": 10358 + }, + { + "epoch": 4.712920837124659, + "grad_norm": 0.2456729056592538, + "learning_rate": 8.137611551222391e-08, + "loss": 0.0019, + "step": 10359 + }, + { + "epoch": 4.713375796178344, + "grad_norm": 0.025587039217796363, + "learning_rate": 8.111949794190055e-08, + "loss": 0.0001, + "step": 10360 + }, + { + "epoch": 4.7138307552320295, + "grad_norm": 0.10163891544422349, + "learning_rate": 8.086328232027874e-08, + "loss": 0.001, + "step": 10361 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.05639120408845683, + "learning_rate": 8.060746866829394e-08, + "loss": 0.0006, + "step": 10362 + }, + { + "epoch": 4.714740673339399, + "grad_norm": 0.011926202335770674, + "learning_rate": 8.035205700685167e-08, + "loss": 0.0001, + "step": 10363 + }, + { + "epoch": 4.715195632393085, + "grad_norm": 0.04077934688115527, + "learning_rate": 8.009704735682244e-08, + "loss": 0.0002, + "step": 10364 + }, + { + "epoch": 4.71565059144677, + "grad_norm": 0.18752014827787267, + "learning_rate": 7.98424397390446e-08, + "loss": 0.0037, + "step": 10365 + }, + { + "epoch": 4.716105550500455, + "grad_norm": 0.007376981776185636, + "learning_rate": 7.95882341743226e-08, + "loss": 0.0, + "step": 10366 + }, + { + "epoch": 4.7165605095541405, + "grad_norm": 0.06875385197181354, + "learning_rate": 7.933443068342983e-08, + "loss": 0.0003, + "step": 10367 + }, + { + "epoch": 4.717015468607825, + "grad_norm": 0.08718207021948843, + "learning_rate": 7.908102928710637e-08, + "loss": 0.0004, + "step": 10368 + }, + { + "epoch": 4.71747042766151, + "grad_norm": 0.11941730741458896, + "learning_rate": 7.882803000605843e-08, + "loss": 0.0007, + "step": 10369 + }, + { + "epoch": 4.717925386715196, + "grad_norm": 0.007507273381700301, + "learning_rate": 7.857543286096003e-08, + "loss": 0.0, + "step": 10370 + }, + { + "epoch": 4.718380345768881, + "grad_norm": 0.006327290062892956, + "learning_rate": 7.832323787245188e-08, + "loss": 0.0, + "step": 10371 + }, + { + "epoch": 4.718835304822566, + "grad_norm": 0.07743546431310214, + "learning_rate": 7.807144506114306e-08, + "loss": 0.0003, + "step": 10372 + }, + { + "epoch": 4.7192902638762515, + "grad_norm": 0.008904934451756781, + "learning_rate": 7.782005444760821e-08, + "loss": 0.0001, + "step": 10373 + }, + { + "epoch": 4.719745222929936, + "grad_norm": 0.08263727860855853, + "learning_rate": 7.756906605239089e-08, + "loss": 0.0006, + "step": 10374 + }, + { + "epoch": 4.720200181983621, + "grad_norm": 0.1537215297408134, + "learning_rate": 7.731847989599916e-08, + "loss": 0.0008, + "step": 10375 + }, + { + "epoch": 4.720655141037307, + "grad_norm": 0.019947675947161292, + "learning_rate": 7.706829599891107e-08, + "loss": 0.0001, + "step": 10376 + }, + { + "epoch": 4.721110100090992, + "grad_norm": 0.04268760277897016, + "learning_rate": 7.681851438156973e-08, + "loss": 0.0005, + "step": 10377 + }, + { + "epoch": 4.721565059144677, + "grad_norm": 0.0710877894113309, + "learning_rate": 7.656913506438712e-08, + "loss": 0.0006, + "step": 10378 + }, + { + "epoch": 4.7220200181983625, + "grad_norm": 0.05359024275467646, + "learning_rate": 7.63201580677403e-08, + "loss": 0.0002, + "step": 10379 + }, + { + "epoch": 4.722474977252047, + "grad_norm": 0.016204789150332372, + "learning_rate": 7.607158341197462e-08, + "loss": 0.0001, + "step": 10380 + }, + { + "epoch": 4.722929936305732, + "grad_norm": 0.022401934156869894, + "learning_rate": 7.582341111740332e-08, + "loss": 0.0002, + "step": 10381 + }, + { + "epoch": 4.723384895359418, + "grad_norm": 0.1992439838520583, + "learning_rate": 7.557564120430572e-08, + "loss": 0.0013, + "step": 10382 + }, + { + "epoch": 4.723839854413103, + "grad_norm": 0.0063882251300269605, + "learning_rate": 7.532827369292783e-08, + "loss": 0.0, + "step": 10383 + }, + { + "epoch": 4.724294813466788, + "grad_norm": 0.0304810265854276, + "learning_rate": 7.508130860348406e-08, + "loss": 0.0001, + "step": 10384 + }, + { + "epoch": 4.7247497725204735, + "grad_norm": 0.04008498112719115, + "learning_rate": 7.483474595615491e-08, + "loss": 0.0002, + "step": 10385 + }, + { + "epoch": 4.725204731574158, + "grad_norm": 0.011124581368161962, + "learning_rate": 7.458858577108818e-08, + "loss": 0.0001, + "step": 10386 + }, + { + "epoch": 4.725659690627843, + "grad_norm": 0.029634245073102704, + "learning_rate": 7.434282806839944e-08, + "loss": 0.0003, + "step": 10387 + }, + { + "epoch": 4.726114649681529, + "grad_norm": 0.022487009086170885, + "learning_rate": 7.409747286817093e-08, + "loss": 0.0005, + "step": 10388 + }, + { + "epoch": 4.726569608735214, + "grad_norm": 0.06382411591487908, + "learning_rate": 7.385252019045164e-08, + "loss": 0.0006, + "step": 10389 + }, + { + "epoch": 4.727024567788899, + "grad_norm": 0.006060846013447366, + "learning_rate": 7.360797005525833e-08, + "loss": 0.0, + "step": 10390 + }, + { + "epoch": 4.727479526842584, + "grad_norm": 0.37412869852178027, + "learning_rate": 7.33638224825739e-08, + "loss": 0.0032, + "step": 10391 + }, + { + "epoch": 4.727934485896269, + "grad_norm": 0.22282554896505724, + "learning_rate": 7.312007749234961e-08, + "loss": 0.0016, + "step": 10392 + }, + { + "epoch": 4.728389444949954, + "grad_norm": 0.025689585202640138, + "learning_rate": 7.287673510450343e-08, + "loss": 0.0002, + "step": 10393 + }, + { + "epoch": 4.72884440400364, + "grad_norm": 0.005680373871094684, + "learning_rate": 7.26337953389189e-08, + "loss": 0.0, + "step": 10394 + }, + { + "epoch": 4.729299363057325, + "grad_norm": 0.020321580200183473, + "learning_rate": 7.239125821544957e-08, + "loss": 0.0001, + "step": 10395 + }, + { + "epoch": 4.72975432211101, + "grad_norm": 0.323477508892141, + "learning_rate": 7.214912375391291e-08, + "loss": 0.0008, + "step": 10396 + }, + { + "epoch": 4.730209281164695, + "grad_norm": 0.0990288296925892, + "learning_rate": 7.190739197409646e-08, + "loss": 0.0009, + "step": 10397 + }, + { + "epoch": 4.73066424021838, + "grad_norm": 0.11962998423555951, + "learning_rate": 7.166606289575274e-08, + "loss": 0.0008, + "step": 10398 + }, + { + "epoch": 4.731119199272065, + "grad_norm": 0.009435083007987862, + "learning_rate": 7.142513653860261e-08, + "loss": 0.0, + "step": 10399 + }, + { + "epoch": 4.731574158325751, + "grad_norm": 0.03480302526753761, + "learning_rate": 7.118461292233258e-08, + "loss": 0.0002, + "step": 10400 + }, + { + "epoch": 4.732029117379436, + "grad_norm": 0.012640705639870676, + "learning_rate": 7.094449206659748e-08, + "loss": 0.0001, + "step": 10401 + }, + { + "epoch": 4.732484076433121, + "grad_norm": 0.008999708535897202, + "learning_rate": 7.070477399101938e-08, + "loss": 0.0001, + "step": 10402 + }, + { + "epoch": 4.732939035486806, + "grad_norm": 0.008499970566807848, + "learning_rate": 7.046545871518651e-08, + "loss": 0.0, + "step": 10403 + }, + { + "epoch": 4.733393994540491, + "grad_norm": 0.0851887875865808, + "learning_rate": 7.022654625865544e-08, + "loss": 0.0003, + "step": 10404 + }, + { + "epoch": 4.733848953594176, + "grad_norm": 0.15044690086974163, + "learning_rate": 6.998803664094723e-08, + "loss": 0.0004, + "step": 10405 + }, + { + "epoch": 4.734303912647862, + "grad_norm": 0.01890569048704366, + "learning_rate": 6.974992988155405e-08, + "loss": 0.0001, + "step": 10406 + }, + { + "epoch": 4.734758871701547, + "grad_norm": 0.013506523684603035, + "learning_rate": 6.951222599993091e-08, + "loss": 0.0001, + "step": 10407 + }, + { + "epoch": 4.735213830755232, + "grad_norm": 0.014481787776529292, + "learning_rate": 6.927492501550282e-08, + "loss": 0.0001, + "step": 10408 + }, + { + "epoch": 4.735668789808917, + "grad_norm": 0.49245972111253755, + "learning_rate": 6.903802694766148e-08, + "loss": 0.0029, + "step": 10409 + }, + { + "epoch": 4.736123748862602, + "grad_norm": 0.020938023767858617, + "learning_rate": 6.880153181576421e-08, + "loss": 0.0001, + "step": 10410 + }, + { + "epoch": 4.736578707916287, + "grad_norm": 0.12405111402772104, + "learning_rate": 6.856543963913665e-08, + "loss": 0.0002, + "step": 10411 + }, + { + "epoch": 4.737033666969973, + "grad_norm": 0.13137879328676527, + "learning_rate": 6.832975043707168e-08, + "loss": 0.0005, + "step": 10412 + }, + { + "epoch": 4.737488626023658, + "grad_norm": 0.11925776144501105, + "learning_rate": 6.809446422882782e-08, + "loss": 0.0014, + "step": 10413 + }, + { + "epoch": 4.737943585077343, + "grad_norm": 0.05406749377476792, + "learning_rate": 6.785958103363244e-08, + "loss": 0.0002, + "step": 10414 + }, + { + "epoch": 4.738398544131028, + "grad_norm": 0.13982304862638056, + "learning_rate": 6.762510087067741e-08, + "loss": 0.0014, + "step": 10415 + }, + { + "epoch": 4.738853503184713, + "grad_norm": 0.04421406450067183, + "learning_rate": 6.739102375912577e-08, + "loss": 0.0004, + "step": 10416 + }, + { + "epoch": 4.739308462238398, + "grad_norm": 0.012554461966809179, + "learning_rate": 6.715734971810439e-08, + "loss": 0.0001, + "step": 10417 + }, + { + "epoch": 4.739763421292084, + "grad_norm": 0.014902518578885133, + "learning_rate": 6.692407876670803e-08, + "loss": 0.0001, + "step": 10418 + }, + { + "epoch": 4.740218380345769, + "grad_norm": 0.0090459033328069, + "learning_rate": 6.669121092399811e-08, + "loss": 0.0001, + "step": 10419 + }, + { + "epoch": 4.740673339399454, + "grad_norm": 0.26291839367654923, + "learning_rate": 6.645874620900328e-08, + "loss": 0.0069, + "step": 10420 + }, + { + "epoch": 4.741128298453139, + "grad_norm": 0.104180717169593, + "learning_rate": 6.622668464072057e-08, + "loss": 0.0004, + "step": 10421 + }, + { + "epoch": 4.741583257506824, + "grad_norm": 0.011277401545510287, + "learning_rate": 6.599502623811204e-08, + "loss": 0.0001, + "step": 10422 + }, + { + "epoch": 4.742038216560509, + "grad_norm": 0.09708480399271728, + "learning_rate": 6.576377102010866e-08, + "loss": 0.0002, + "step": 10423 + }, + { + "epoch": 4.742493175614195, + "grad_norm": 0.030156295580507506, + "learning_rate": 6.5532919005607e-08, + "loss": 0.0001, + "step": 10424 + }, + { + "epoch": 4.74294813466788, + "grad_norm": 0.011097368621538343, + "learning_rate": 6.53024702134708e-08, + "loss": 0.0001, + "step": 10425 + }, + { + "epoch": 4.743403093721565, + "grad_norm": 0.02677174333287603, + "learning_rate": 6.507242466253283e-08, + "loss": 0.0001, + "step": 10426 + }, + { + "epoch": 4.74385805277525, + "grad_norm": 0.014480153932460315, + "learning_rate": 6.48427823715897e-08, + "loss": 0.0001, + "step": 10427 + }, + { + "epoch": 4.744313011828935, + "grad_norm": 0.008721429054318669, + "learning_rate": 6.461354335940807e-08, + "loss": 0.0, + "step": 10428 + }, + { + "epoch": 4.744767970882621, + "grad_norm": 0.013046979546112676, + "learning_rate": 6.438470764471849e-08, + "loss": 0.0001, + "step": 10429 + }, + { + "epoch": 4.745222929936306, + "grad_norm": 0.13028606938694787, + "learning_rate": 6.415627524622214e-08, + "loss": 0.0015, + "step": 10430 + }, + { + "epoch": 4.745677888989991, + "grad_norm": 0.012711252711354327, + "learning_rate": 6.39282461825852e-08, + "loss": 0.0001, + "step": 10431 + }, + { + "epoch": 4.746132848043676, + "grad_norm": 0.05103948090065527, + "learning_rate": 6.370062047244052e-08, + "loss": 0.0004, + "step": 10432 + }, + { + "epoch": 4.746587807097361, + "grad_norm": 0.03951076272118454, + "learning_rate": 6.347339813438935e-08, + "loss": 0.0003, + "step": 10433 + }, + { + "epoch": 4.747042766151046, + "grad_norm": 0.011223772264066753, + "learning_rate": 6.32465791869985e-08, + "loss": 0.0001, + "step": 10434 + }, + { + "epoch": 4.747497725204732, + "grad_norm": 0.028980534885325128, + "learning_rate": 6.30201636488037e-08, + "loss": 0.0001, + "step": 10435 + }, + { + "epoch": 4.747952684258417, + "grad_norm": 0.010274337536098235, + "learning_rate": 6.279415153830515e-08, + "loss": 0.0001, + "step": 10436 + }, + { + "epoch": 4.748407643312102, + "grad_norm": 0.01082462532463599, + "learning_rate": 6.256854287397251e-08, + "loss": 0.0001, + "step": 10437 + }, + { + "epoch": 4.748862602365787, + "grad_norm": 0.00578098534764013, + "learning_rate": 6.234333767424161e-08, + "loss": 0.0, + "step": 10438 + }, + { + "epoch": 4.749317561419472, + "grad_norm": 0.240120978350182, + "learning_rate": 6.211853595751493e-08, + "loss": 0.0036, + "step": 10439 + }, + { + "epoch": 4.749772520473157, + "grad_norm": 0.25957538728202784, + "learning_rate": 6.189413774216168e-08, + "loss": 0.0026, + "step": 10440 + }, + { + "epoch": 4.750227479526843, + "grad_norm": 0.24991561994030595, + "learning_rate": 6.167014304651997e-08, + "loss": 0.0011, + "step": 10441 + }, + { + "epoch": 4.750682438580528, + "grad_norm": 0.003873625455326493, + "learning_rate": 6.144655188889237e-08, + "loss": 0.0, + "step": 10442 + }, + { + "epoch": 4.751137397634213, + "grad_norm": 0.05548854489336238, + "learning_rate": 6.122336428755038e-08, + "loss": 0.0003, + "step": 10443 + }, + { + "epoch": 4.751592356687898, + "grad_norm": 0.1288444402855887, + "learning_rate": 6.100058026073108e-08, + "loss": 0.0019, + "step": 10444 + }, + { + "epoch": 4.752047315741583, + "grad_norm": 0.009789255576044124, + "learning_rate": 6.077819982664101e-08, + "loss": 0.0001, + "step": 10445 + }, + { + "epoch": 4.752502274795268, + "grad_norm": 0.30060800972959284, + "learning_rate": 6.055622300345066e-08, + "loss": 0.0052, + "step": 10446 + }, + { + "epoch": 4.752957233848954, + "grad_norm": 0.021005981817564315, + "learning_rate": 6.033464980929993e-08, + "loss": 0.0001, + "step": 10447 + }, + { + "epoch": 4.753412192902639, + "grad_norm": 0.015513802068342611, + "learning_rate": 6.011348026229324e-08, + "loss": 0.0001, + "step": 10448 + }, + { + "epoch": 4.753867151956324, + "grad_norm": 0.0983825208169601, + "learning_rate": 5.989271438050558e-08, + "loss": 0.0008, + "step": 10449 + }, + { + "epoch": 4.754322111010009, + "grad_norm": 0.10051818823520767, + "learning_rate": 5.967235218197531e-08, + "loss": 0.001, + "step": 10450 + }, + { + "epoch": 4.754777070063694, + "grad_norm": 0.11061422999148696, + "learning_rate": 5.945239368471079e-08, + "loss": 0.0015, + "step": 10451 + }, + { + "epoch": 4.755232029117379, + "grad_norm": 0.050694672602760595, + "learning_rate": 5.923283890668485e-08, + "loss": 0.0003, + "step": 10452 + }, + { + "epoch": 4.755686988171065, + "grad_norm": 0.06085398896416968, + "learning_rate": 5.9013687865839273e-08, + "loss": 0.0002, + "step": 10453 + }, + { + "epoch": 4.75614194722475, + "grad_norm": 0.11032030590496665, + "learning_rate": 5.8794940580081394e-08, + "loss": 0.001, + "step": 10454 + }, + { + "epoch": 4.756596906278435, + "grad_norm": 0.05142475024000202, + "learning_rate": 5.857659706728691e-08, + "loss": 0.0004, + "step": 10455 + }, + { + "epoch": 4.75705186533212, + "grad_norm": 0.2125310134176713, + "learning_rate": 5.835865734529822e-08, + "loss": 0.0015, + "step": 10456 + }, + { + "epoch": 4.757506824385805, + "grad_norm": 0.0184904908420292, + "learning_rate": 5.814112143192274e-08, + "loss": 0.0001, + "step": 10457 + }, + { + "epoch": 4.757961783439491, + "grad_norm": 0.011255046644503164, + "learning_rate": 5.792398934493848e-08, + "loss": 0.0001, + "step": 10458 + }, + { + "epoch": 4.758416742493176, + "grad_norm": 0.20133832984465874, + "learning_rate": 5.7707261102086795e-08, + "loss": 0.0012, + "step": 10459 + }, + { + "epoch": 4.758871701546861, + "grad_norm": 0.09829205116312933, + "learning_rate": 5.749093672107908e-08, + "loss": 0.0014, + "step": 10460 + }, + { + "epoch": 4.759326660600546, + "grad_norm": 0.05813792526776249, + "learning_rate": 5.727501621959175e-08, + "loss": 0.0003, + "step": 10461 + }, + { + "epoch": 4.759781619654231, + "grad_norm": 0.1677711352494823, + "learning_rate": 5.705949961526902e-08, + "loss": 0.0013, + "step": 10462 + }, + { + "epoch": 4.760236578707916, + "grad_norm": 0.11609644692104817, + "learning_rate": 5.6844386925721805e-08, + "loss": 0.0008, + "step": 10463 + }, + { + "epoch": 4.760691537761602, + "grad_norm": 0.24157437545876134, + "learning_rate": 5.6629678168527715e-08, + "loss": 0.0045, + "step": 10464 + }, + { + "epoch": 4.761146496815287, + "grad_norm": 0.2621746892735075, + "learning_rate": 5.641537336123271e-08, + "loss": 0.0026, + "step": 10465 + }, + { + "epoch": 4.761601455868972, + "grad_norm": 0.013228704115087659, + "learning_rate": 5.6201472521348885e-08, + "loss": 0.0001, + "step": 10466 + }, + { + "epoch": 4.762056414922657, + "grad_norm": 0.04136793599242851, + "learning_rate": 5.5987975666353944e-08, + "loss": 0.0003, + "step": 10467 + }, + { + "epoch": 4.762511373976342, + "grad_norm": 0.02573806361551556, + "learning_rate": 5.577488281369503e-08, + "loss": 0.0001, + "step": 10468 + }, + { + "epoch": 4.762966333030027, + "grad_norm": 0.18254481099006487, + "learning_rate": 5.5562193980784886e-08, + "loss": 0.0005, + "step": 10469 + }, + { + "epoch": 4.763421292083713, + "grad_norm": 0.003874586068290597, + "learning_rate": 5.534990918500294e-08, + "loss": 0.0, + "step": 10470 + }, + { + "epoch": 4.763876251137398, + "grad_norm": 0.02404991758021951, + "learning_rate": 5.513802844369642e-08, + "loss": 0.0001, + "step": 10471 + }, + { + "epoch": 4.764331210191083, + "grad_norm": 0.16636884874536972, + "learning_rate": 5.492655177418038e-08, + "loss": 0.0028, + "step": 10472 + }, + { + "epoch": 4.764786169244768, + "grad_norm": 0.008467201396735673, + "learning_rate": 5.471547919373377e-08, + "loss": 0.0, + "step": 10473 + }, + { + "epoch": 4.765241128298453, + "grad_norm": 0.0835722910822301, + "learning_rate": 5.4504810719606114e-08, + "loss": 0.0004, + "step": 10474 + }, + { + "epoch": 4.765696087352138, + "grad_norm": 0.016304320981645384, + "learning_rate": 5.429454636901144e-08, + "loss": 0.0001, + "step": 10475 + }, + { + "epoch": 4.766151046405824, + "grad_norm": 0.06329695406533259, + "learning_rate": 5.4084686159132096e-08, + "loss": 0.0003, + "step": 10476 + }, + { + "epoch": 4.766606005459509, + "grad_norm": 0.1303457832223836, + "learning_rate": 5.3875230107116036e-08, + "loss": 0.0016, + "step": 10477 + }, + { + "epoch": 4.767060964513194, + "grad_norm": 0.035704064059513985, + "learning_rate": 5.366617823007958e-08, + "loss": 0.0002, + "step": 10478 + }, + { + "epoch": 4.767515923566879, + "grad_norm": 0.061168384178163226, + "learning_rate": 5.345753054510627e-08, + "loss": 0.0003, + "step": 10479 + }, + { + "epoch": 4.767970882620564, + "grad_norm": 0.009029478221275073, + "learning_rate": 5.324928706924526e-08, + "loss": 0.0001, + "step": 10480 + }, + { + "epoch": 4.768425841674249, + "grad_norm": 0.2831000742667647, + "learning_rate": 5.3041447819512925e-08, + "loss": 0.0025, + "step": 10481 + }, + { + "epoch": 4.768880800727935, + "grad_norm": 0.15296905017968024, + "learning_rate": 5.283401281289291e-08, + "loss": 0.0005, + "step": 10482 + }, + { + "epoch": 4.76933575978162, + "grad_norm": 0.008065879076170169, + "learning_rate": 5.26269820663361e-08, + "loss": 0.0001, + "step": 10483 + }, + { + "epoch": 4.769790718835305, + "grad_norm": 0.11772679437127716, + "learning_rate": 5.242035559676062e-08, + "loss": 0.0009, + "step": 10484 + }, + { + "epoch": 4.77024567788899, + "grad_norm": 0.031230829898169044, + "learning_rate": 5.22141334210502e-08, + "loss": 0.0003, + "step": 10485 + }, + { + "epoch": 4.770700636942675, + "grad_norm": 0.14587626768329914, + "learning_rate": 5.200831555605745e-08, + "loss": 0.0008, + "step": 10486 + }, + { + "epoch": 4.77115559599636, + "grad_norm": 0.0858717029576134, + "learning_rate": 5.18029020185995e-08, + "loss": 0.0007, + "step": 10487 + }, + { + "epoch": 4.771610555050046, + "grad_norm": 0.033812059761657534, + "learning_rate": 5.1597892825462904e-08, + "loss": 0.0001, + "step": 10488 + }, + { + "epoch": 4.772065514103731, + "grad_norm": 0.006557216341889427, + "learning_rate": 5.139328799339982e-08, + "loss": 0.0, + "step": 10489 + }, + { + "epoch": 4.772520473157416, + "grad_norm": 0.023955501475005192, + "learning_rate": 5.1189087539129656e-08, + "loss": 0.0002, + "step": 10490 + }, + { + "epoch": 4.772975432211101, + "grad_norm": 0.04585853373194461, + "learning_rate": 5.0985291479338506e-08, + "loss": 0.0001, + "step": 10491 + }, + { + "epoch": 4.773430391264786, + "grad_norm": 0.021702837462913984, + "learning_rate": 5.078189983067916e-08, + "loss": 0.0001, + "step": 10492 + }, + { + "epoch": 4.773885350318471, + "grad_norm": 0.17922102132954887, + "learning_rate": 5.057891260977277e-08, + "loss": 0.0007, + "step": 10493 + }, + { + "epoch": 4.774340309372157, + "grad_norm": 0.026902093312953422, + "learning_rate": 5.037632983320662e-08, + "loss": 0.0001, + "step": 10494 + }, + { + "epoch": 4.774795268425842, + "grad_norm": 0.07414970298959292, + "learning_rate": 5.017415151753413e-08, + "loss": 0.0009, + "step": 10495 + }, + { + "epoch": 4.7752502274795265, + "grad_norm": 0.004481096511032117, + "learning_rate": 4.9972377679277096e-08, + "loss": 0.0, + "step": 10496 + }, + { + "epoch": 4.775705186533212, + "grad_norm": 0.07155299636763211, + "learning_rate": 4.9771008334922874e-08, + "loss": 0.0002, + "step": 10497 + }, + { + "epoch": 4.776160145586897, + "grad_norm": 0.043105064651908984, + "learning_rate": 4.95700435009272e-08, + "loss": 0.0004, + "step": 10498 + }, + { + "epoch": 4.776615104640582, + "grad_norm": 0.11334264726492232, + "learning_rate": 4.9369483193711375e-08, + "loss": 0.0008, + "step": 10499 + }, + { + "epoch": 4.777070063694268, + "grad_norm": 0.04372280713250435, + "learning_rate": 4.9169327429664513e-08, + "loss": 0.0003, + "step": 10500 + }, + { + "epoch": 4.777525022747953, + "grad_norm": 0.0567245885903299, + "learning_rate": 4.896957622514298e-08, + "loss": 0.0002, + "step": 10501 + }, + { + "epoch": 4.7779799818016375, + "grad_norm": 0.05539153255073634, + "learning_rate": 4.877022959646816e-08, + "loss": 0.0005, + "step": 10502 + }, + { + "epoch": 4.778434940855323, + "grad_norm": 0.18441064518113084, + "learning_rate": 4.85712875599309e-08, + "loss": 0.003, + "step": 10503 + }, + { + "epoch": 4.778889899909008, + "grad_norm": 0.18221204838936633, + "learning_rate": 4.8372750131788214e-08, + "loss": 0.0046, + "step": 10504 + }, + { + "epoch": 4.779344858962693, + "grad_norm": 0.16788315445887017, + "learning_rate": 4.8174617328262675e-08, + "loss": 0.0004, + "step": 10505 + }, + { + "epoch": 4.779799818016379, + "grad_norm": 0.02893776665167774, + "learning_rate": 4.797688916554466e-08, + "loss": 0.0001, + "step": 10506 + }, + { + "epoch": 4.780254777070064, + "grad_norm": 0.06444160325200556, + "learning_rate": 4.777956565979236e-08, + "loss": 0.0005, + "step": 10507 + }, + { + "epoch": 4.7807097361237485, + "grad_norm": 0.05512069424498918, + "learning_rate": 4.75826468271301e-08, + "loss": 0.0001, + "step": 10508 + }, + { + "epoch": 4.781164695177434, + "grad_norm": 0.06292658594426694, + "learning_rate": 4.738613268364889e-08, + "loss": 0.0009, + "step": 10509 + }, + { + "epoch": 4.781619654231119, + "grad_norm": 0.006702160578139643, + "learning_rate": 4.719002324540756e-08, + "loss": 0.0001, + "step": 10510 + }, + { + "epoch": 4.782074613284804, + "grad_norm": 0.037029417114959204, + "learning_rate": 4.699431852842995e-08, + "loss": 0.001, + "step": 10511 + }, + { + "epoch": 4.78252957233849, + "grad_norm": 0.12414774787413381, + "learning_rate": 4.679901854870994e-08, + "loss": 0.0012, + "step": 10512 + }, + { + "epoch": 4.782984531392175, + "grad_norm": 0.019471087984977813, + "learning_rate": 4.660412332220476e-08, + "loss": 0.0002, + "step": 10513 + }, + { + "epoch": 4.7834394904458595, + "grad_norm": 0.2795646762845415, + "learning_rate": 4.640963286484224e-08, + "loss": 0.0031, + "step": 10514 + }, + { + "epoch": 4.783894449499545, + "grad_norm": 0.02901829717063289, + "learning_rate": 4.6215547192514085e-08, + "loss": 0.0002, + "step": 10515 + }, + { + "epoch": 4.78434940855323, + "grad_norm": 0.08182638069428427, + "learning_rate": 4.602186632107985e-08, + "loss": 0.0007, + "step": 10516 + }, + { + "epoch": 4.784804367606915, + "grad_norm": 0.1631063372427597, + "learning_rate": 4.582859026636688e-08, + "loss": 0.0008, + "step": 10517 + }, + { + "epoch": 4.785259326660601, + "grad_norm": 0.1844271632020055, + "learning_rate": 4.5635719044169194e-08, + "loss": 0.0032, + "step": 10518 + }, + { + "epoch": 4.785714285714286, + "grad_norm": 0.07665528665097694, + "learning_rate": 4.5443252670246404e-08, + "loss": 0.0002, + "step": 10519 + }, + { + "epoch": 4.7861692447679705, + "grad_norm": 0.030584329823764238, + "learning_rate": 4.52511911603265e-08, + "loss": 0.0001, + "step": 10520 + }, + { + "epoch": 4.786624203821656, + "grad_norm": 0.07700336738538648, + "learning_rate": 4.505953453010359e-08, + "loss": 0.0002, + "step": 10521 + }, + { + "epoch": 4.787079162875341, + "grad_norm": 0.07314791019480306, + "learning_rate": 4.4868282795239026e-08, + "loss": 0.001, + "step": 10522 + }, + { + "epoch": 4.787534121929026, + "grad_norm": 0.08562270387581507, + "learning_rate": 4.4677435971361985e-08, + "loss": 0.0006, + "step": 10523 + }, + { + "epoch": 4.787989080982712, + "grad_norm": 0.04221491144114259, + "learning_rate": 4.448699407406665e-08, + "loss": 0.0001, + "step": 10524 + }, + { + "epoch": 4.788444040036397, + "grad_norm": 0.08206179531749871, + "learning_rate": 4.429695711891502e-08, + "loss": 0.0005, + "step": 10525 + }, + { + "epoch": 4.788898999090081, + "grad_norm": 0.006166077996794459, + "learning_rate": 4.410732512143634e-08, + "loss": 0.0, + "step": 10526 + }, + { + "epoch": 4.789353958143767, + "grad_norm": 0.05715006313493763, + "learning_rate": 4.3918098097125994e-08, + "loss": 0.0001, + "step": 10527 + }, + { + "epoch": 4.789808917197452, + "grad_norm": 0.005578505933585062, + "learning_rate": 4.372927606144772e-08, + "loss": 0.0, + "step": 10528 + }, + { + "epoch": 4.790263876251137, + "grad_norm": 0.05571538995352204, + "learning_rate": 4.354085902983085e-08, + "loss": 0.0005, + "step": 10529 + }, + { + "epoch": 4.790718835304823, + "grad_norm": 0.006223425459099669, + "learning_rate": 4.3352847017671396e-08, + "loss": 0.0, + "step": 10530 + }, + { + "epoch": 4.7911737943585075, + "grad_norm": 0.05182659626430289, + "learning_rate": 4.316524004033318e-08, + "loss": 0.0005, + "step": 10531 + }, + { + "epoch": 4.791628753412192, + "grad_norm": 0.02142757918277954, + "learning_rate": 4.297803811314727e-08, + "loss": 0.0001, + "step": 10532 + }, + { + "epoch": 4.792083712465878, + "grad_norm": 0.013843673384548316, + "learning_rate": 4.2791241251409765e-08, + "loss": 0.0, + "step": 10533 + }, + { + "epoch": 4.792538671519563, + "grad_norm": 0.11427663225571885, + "learning_rate": 4.260484947038568e-08, + "loss": 0.0009, + "step": 10534 + }, + { + "epoch": 4.792993630573249, + "grad_norm": 0.17773758151955296, + "learning_rate": 4.2418862785306156e-08, + "loss": 0.0028, + "step": 10535 + }, + { + "epoch": 4.793448589626934, + "grad_norm": 0.027262757429204452, + "learning_rate": 4.2233281211368494e-08, + "loss": 0.0001, + "step": 10536 + }, + { + "epoch": 4.7939035486806185, + "grad_norm": 0.06796770268321947, + "learning_rate": 4.204810476373833e-08, + "loss": 0.0003, + "step": 10537 + }, + { + "epoch": 4.794358507734304, + "grad_norm": 0.10100379583039645, + "learning_rate": 4.1863333457546895e-08, + "loss": 0.0007, + "step": 10538 + }, + { + "epoch": 4.794813466787989, + "grad_norm": 0.017344385957671453, + "learning_rate": 4.1678967307893225e-08, + "loss": 0.0001, + "step": 10539 + }, + { + "epoch": 4.795268425841674, + "grad_norm": 0.0074177376043692254, + "learning_rate": 4.1495006329843044e-08, + "loss": 0.0, + "step": 10540 + }, + { + "epoch": 4.79572338489536, + "grad_norm": 0.034727937445359014, + "learning_rate": 4.1311450538427666e-08, + "loss": 0.0004, + "step": 10541 + }, + { + "epoch": 4.796178343949045, + "grad_norm": 0.021418066558488196, + "learning_rate": 4.112829994864842e-08, + "loss": 0.0001, + "step": 10542 + }, + { + "epoch": 4.7966333030027295, + "grad_norm": 0.18629067898945587, + "learning_rate": 4.094555457547e-08, + "loss": 0.0021, + "step": 10543 + }, + { + "epoch": 4.797088262056415, + "grad_norm": 0.2011310590971207, + "learning_rate": 4.0763214433826024e-08, + "loss": 0.001, + "step": 10544 + }, + { + "epoch": 4.7975432211101, + "grad_norm": 0.2967322944483974, + "learning_rate": 4.058127953861568e-08, + "loss": 0.0065, + "step": 10545 + } + ], + "logging_steps": 1, + "max_steps": 10990, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 555, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 69333290385408.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}