{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5050045495905369,
"eval_steps": 500,
"global_step": 1110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00045495905368516835,
"grad_norm": 9.461428161462043,
"learning_rate": 1e-05,
"loss": 0.1263,
"step": 1
},
{
"epoch": 0.0009099181073703367,
"grad_norm": 5.190780450250769,
"learning_rate": 9.99999979571129e-06,
"loss": 0.1723,
"step": 2
},
{
"epoch": 0.001364877161055505,
"grad_norm": 7.521926017130347,
"learning_rate": 9.999999182845177e-06,
"loss": 0.1327,
"step": 3
},
{
"epoch": 0.0018198362147406734,
"grad_norm": 2.5665810200307217,
"learning_rate": 9.99999816140171e-06,
"loss": 0.1095,
"step": 4
},
{
"epoch": 0.0022747952684258415,
"grad_norm": 2.738508706395883,
"learning_rate": 9.999996731380973e-06,
"loss": 0.1151,
"step": 5
},
{
"epoch": 0.00272975432211101,
"grad_norm": 2.67941899677245,
"learning_rate": 9.999994892783083e-06,
"loss": 0.0821,
"step": 6
},
{
"epoch": 0.0031847133757961785,
"grad_norm": 2.137586234420784,
"learning_rate": 9.99999264560819e-06,
"loss": 0.0729,
"step": 7
},
{
"epoch": 0.003639672429481347,
"grad_norm": 2.8221590420989164,
"learning_rate": 9.999989989856477e-06,
"loss": 0.0929,
"step": 8
},
{
"epoch": 0.004094631483166515,
"grad_norm": 1.6167314639784554,
"learning_rate": 9.999986925528164e-06,
"loss": 0.0466,
"step": 9
},
{
"epoch": 0.004549590536851683,
"grad_norm": 2.1773262431631313,
"learning_rate": 9.999983452623498e-06,
"loss": 0.0709,
"step": 10
},
{
"epoch": 0.005004549590536852,
"grad_norm": 7.6444390817806465,
"learning_rate": 9.999979571142765e-06,
"loss": 0.0809,
"step": 11
},
{
"epoch": 0.00545950864422202,
"grad_norm": 2.034523884241798,
"learning_rate": 9.999975281086278e-06,
"loss": 0.0839,
"step": 12
},
{
"epoch": 0.005914467697907188,
"grad_norm": 3.576108282005355,
"learning_rate": 9.999970582454392e-06,
"loss": 0.0728,
"step": 13
},
{
"epoch": 0.006369426751592357,
"grad_norm": 2.623641566468802,
"learning_rate": 9.999965475247491e-06,
"loss": 0.1052,
"step": 14
},
{
"epoch": 0.006824385805277525,
"grad_norm": 2.1413574998269085,
"learning_rate": 9.99995995946599e-06,
"loss": 0.0885,
"step": 15
},
{
"epoch": 0.007279344858962694,
"grad_norm": 1.4859066724415246,
"learning_rate": 9.999954035110342e-06,
"loss": 0.0644,
"step": 16
},
{
"epoch": 0.0077343039126478615,
"grad_norm": 2.851793157608408,
"learning_rate": 9.999947702181027e-06,
"loss": 0.1057,
"step": 17
},
{
"epoch": 0.00818926296633303,
"grad_norm": 4.693829546662477,
"learning_rate": 9.999940960678568e-06,
"loss": 0.0867,
"step": 18
},
{
"epoch": 0.008644222020018199,
"grad_norm": 2.2728033563417362,
"learning_rate": 9.999933810603513e-06,
"loss": 0.0789,
"step": 19
},
{
"epoch": 0.009099181073703366,
"grad_norm": 1.6705986173507794,
"learning_rate": 9.999926251956447e-06,
"loss": 0.0683,
"step": 20
},
{
"epoch": 0.009554140127388535,
"grad_norm": 2.187579869114393,
"learning_rate": 9.999918284737986e-06,
"loss": 0.0984,
"step": 21
},
{
"epoch": 0.010009099181073703,
"grad_norm": 2.328040268012338,
"learning_rate": 9.999909908948782e-06,
"loss": 0.0699,
"step": 22
},
{
"epoch": 0.010464058234758872,
"grad_norm": 5.572389775693198,
"learning_rate": 9.999901124589519e-06,
"loss": 0.0912,
"step": 23
},
{
"epoch": 0.01091901728844404,
"grad_norm": 1.84796719674859,
"learning_rate": 9.999891931660916e-06,
"loss": 0.1015,
"step": 24
},
{
"epoch": 0.011373976342129208,
"grad_norm": 1.7501762990792236,
"learning_rate": 9.999882330163725e-06,
"loss": 0.0909,
"step": 25
},
{
"epoch": 0.011828935395814377,
"grad_norm": 0.9922115950592263,
"learning_rate": 9.999872320098729e-06,
"loss": 0.0656,
"step": 26
},
{
"epoch": 0.012283894449499545,
"grad_norm": 1.5612370560987539,
"learning_rate": 9.999861901466746e-06,
"loss": 0.0974,
"step": 27
},
{
"epoch": 0.012738853503184714,
"grad_norm": 1.4617271794930395,
"learning_rate": 9.999851074268625e-06,
"loss": 0.0853,
"step": 28
},
{
"epoch": 0.013193812556869881,
"grad_norm": 1.8127085104491556,
"learning_rate": 9.999839838505257e-06,
"loss": 0.1081,
"step": 29
},
{
"epoch": 0.01364877161055505,
"grad_norm": 1.4710105512612208,
"learning_rate": 9.999828194177555e-06,
"loss": 0.0868,
"step": 30
},
{
"epoch": 0.014103730664240218,
"grad_norm": 1.3474487189311888,
"learning_rate": 9.999816141286472e-06,
"loss": 0.0817,
"step": 31
},
{
"epoch": 0.014558689717925387,
"grad_norm": 1.0967596652549403,
"learning_rate": 9.99980367983299e-06,
"loss": 0.0637,
"step": 32
},
{
"epoch": 0.015013648771610554,
"grad_norm": 3.179425671823194,
"learning_rate": 9.999790809818134e-06,
"loss": 0.069,
"step": 33
},
{
"epoch": 0.015468607825295723,
"grad_norm": 4.482257681577152,
"learning_rate": 9.999777531242951e-06,
"loss": 0.0915,
"step": 34
},
{
"epoch": 0.01592356687898089,
"grad_norm": 3.953299040475791,
"learning_rate": 9.999763844108528e-06,
"loss": 0.0562,
"step": 35
},
{
"epoch": 0.01637852593266606,
"grad_norm": 1.1127201050382067,
"learning_rate": 9.999749748415982e-06,
"loss": 0.0556,
"step": 36
},
{
"epoch": 0.01683348498635123,
"grad_norm": 79.45756094624792,
"learning_rate": 9.999735244166464e-06,
"loss": 0.1223,
"step": 37
},
{
"epoch": 0.017288444040036398,
"grad_norm": 2777.9092912017113,
"learning_rate": 9.99972033136116e-06,
"loss": 0.3211,
"step": 38
},
{
"epoch": 0.017743403093721567,
"grad_norm": 2.5204693177238466,
"learning_rate": 9.999705010001291e-06,
"loss": 0.0723,
"step": 39
},
{
"epoch": 0.018198362147406732,
"grad_norm": 2.2975907071135655,
"learning_rate": 9.999689280088105e-06,
"loss": 0.0696,
"step": 40
},
{
"epoch": 0.0186533212010919,
"grad_norm": 2.998434349074003,
"learning_rate": 9.99967314162289e-06,
"loss": 0.083,
"step": 41
},
{
"epoch": 0.01910828025477707,
"grad_norm": 3.882239448575704,
"learning_rate": 9.999656594606966e-06,
"loss": 0.1015,
"step": 42
},
{
"epoch": 0.019563239308462238,
"grad_norm": 3.5286596480512493,
"learning_rate": 9.999639639041681e-06,
"loss": 0.0817,
"step": 43
},
{
"epoch": 0.020018198362147407,
"grad_norm": 1.6933989447443707,
"learning_rate": 9.999622274928424e-06,
"loss": 0.1003,
"step": 44
},
{
"epoch": 0.020473157415832575,
"grad_norm": 1.2483160046323276,
"learning_rate": 9.999604502268614e-06,
"loss": 0.0952,
"step": 45
},
{
"epoch": 0.020928116469517744,
"grad_norm": 0.9417906124383243,
"learning_rate": 9.9995863210637e-06,
"loss": 0.0731,
"step": 46
},
{
"epoch": 0.021383075523202913,
"grad_norm": 2.8195414757816897,
"learning_rate": 9.99956773131517e-06,
"loss": 0.1845,
"step": 47
},
{
"epoch": 0.02183803457688808,
"grad_norm": 2.74390379471345,
"learning_rate": 9.999548733024545e-06,
"loss": 0.1826,
"step": 48
},
{
"epoch": 0.022292993630573247,
"grad_norm": 1.5138494619527987,
"learning_rate": 9.999529326193373e-06,
"loss": 0.0857,
"step": 49
},
{
"epoch": 0.022747952684258416,
"grad_norm": 1.215379974181271,
"learning_rate": 9.999509510823242e-06,
"loss": 0.0686,
"step": 50
},
{
"epoch": 0.023202911737943584,
"grad_norm": 1.292187967807859,
"learning_rate": 9.999489286915773e-06,
"loss": 0.0707,
"step": 51
},
{
"epoch": 0.023657870791628753,
"grad_norm": 1.7888013203563982,
"learning_rate": 9.999468654472614e-06,
"loss": 0.0682,
"step": 52
},
{
"epoch": 0.024112829845313922,
"grad_norm": 0.8979425621703144,
"learning_rate": 9.999447613495457e-06,
"loss": 0.0508,
"step": 53
},
{
"epoch": 0.02456778889899909,
"grad_norm": 1.9123835444775663,
"learning_rate": 9.99942616398602e-06,
"loss": 0.0689,
"step": 54
},
{
"epoch": 0.02502274795268426,
"grad_norm": 0.9393581994096443,
"learning_rate": 9.99940430594605e-06,
"loss": 0.0496,
"step": 55
},
{
"epoch": 0.025477707006369428,
"grad_norm": 1.0234476513644222,
"learning_rate": 9.999382039377339e-06,
"loss": 0.0601,
"step": 56
},
{
"epoch": 0.025932666060054597,
"grad_norm": 0.9291387208138827,
"learning_rate": 9.999359364281704e-06,
"loss": 0.0377,
"step": 57
},
{
"epoch": 0.026387625113739762,
"grad_norm": 1.8209170803663992,
"learning_rate": 9.999336280660999e-06,
"loss": 0.1144,
"step": 58
},
{
"epoch": 0.02684258416742493,
"grad_norm": 1.1214625046464874,
"learning_rate": 9.99931278851711e-06,
"loss": 0.0622,
"step": 59
},
{
"epoch": 0.0272975432211101,
"grad_norm": 1.0331723997917317,
"learning_rate": 9.999288887851956e-06,
"loss": 0.0667,
"step": 60
},
{
"epoch": 0.027752502274795268,
"grad_norm": 1.0412381501406744,
"learning_rate": 9.999264578667493e-06,
"loss": 0.0566,
"step": 61
},
{
"epoch": 0.028207461328480437,
"grad_norm": 1.4510603110658047,
"learning_rate": 9.999239860965703e-06,
"loss": 0.0845,
"step": 62
},
{
"epoch": 0.028662420382165606,
"grad_norm": 1.301162540669183,
"learning_rate": 9.999214734748609e-06,
"loss": 0.0759,
"step": 63
},
{
"epoch": 0.029117379435850774,
"grad_norm": 0.9977688847603402,
"learning_rate": 9.999189200018263e-06,
"loss": 0.0528,
"step": 64
},
{
"epoch": 0.029572338489535943,
"grad_norm": 1.2894688842348854,
"learning_rate": 9.99916325677675e-06,
"loss": 0.0899,
"step": 65
},
{
"epoch": 0.03002729754322111,
"grad_norm": 1.4627871680702638,
"learning_rate": 9.999136905026194e-06,
"loss": 0.1456,
"step": 66
},
{
"epoch": 0.030482256596906277,
"grad_norm": 1.2304385710214434,
"learning_rate": 9.999110144768745e-06,
"loss": 0.079,
"step": 67
},
{
"epoch": 0.030937215650591446,
"grad_norm": 1.085016380732753,
"learning_rate": 9.99908297600659e-06,
"loss": 0.0696,
"step": 68
},
{
"epoch": 0.03139217470427662,
"grad_norm": 0.989450558642297,
"learning_rate": 9.99905539874195e-06,
"loss": 0.069,
"step": 69
},
{
"epoch": 0.03184713375796178,
"grad_norm": 1.0510491151133208,
"learning_rate": 9.99902741297708e-06,
"loss": 0.0555,
"step": 70
},
{
"epoch": 0.03230209281164695,
"grad_norm": 0.8938033562648371,
"learning_rate": 9.998999018714264e-06,
"loss": 0.0783,
"step": 71
},
{
"epoch": 0.03275705186533212,
"grad_norm": 2.902512108322722,
"learning_rate": 9.998970215955824e-06,
"loss": 0.0702,
"step": 72
},
{
"epoch": 0.033212010919017286,
"grad_norm": 0.7661831894133686,
"learning_rate": 9.998941004704113e-06,
"loss": 0.0519,
"step": 73
},
{
"epoch": 0.03366696997270246,
"grad_norm": 1.1047249497744047,
"learning_rate": 9.998911384961518e-06,
"loss": 0.0773,
"step": 74
},
{
"epoch": 0.034121929026387623,
"grad_norm": 0.7750047299312716,
"learning_rate": 9.998881356730458e-06,
"loss": 0.0598,
"step": 75
},
{
"epoch": 0.034576888080072796,
"grad_norm": 0.9815801555720315,
"learning_rate": 9.99885092001339e-06,
"loss": 0.0661,
"step": 76
},
{
"epoch": 0.03503184713375796,
"grad_norm": 1.3090963451351905,
"learning_rate": 9.998820074812799e-06,
"loss": 0.0713,
"step": 77
},
{
"epoch": 0.03548680618744313,
"grad_norm": 1.1489338732270693,
"learning_rate": 9.998788821131207e-06,
"loss": 0.0946,
"step": 78
},
{
"epoch": 0.0359417652411283,
"grad_norm": 0.9040381990998293,
"learning_rate": 9.998757158971164e-06,
"loss": 0.067,
"step": 79
},
{
"epoch": 0.036396724294813464,
"grad_norm": 1.1019926198229115,
"learning_rate": 9.998725088335263e-06,
"loss": 0.0874,
"step": 80
},
{
"epoch": 0.036851683348498636,
"grad_norm": 0.5779852750462403,
"learning_rate": 9.99869260922612e-06,
"loss": 0.0492,
"step": 81
},
{
"epoch": 0.0373066424021838,
"grad_norm": 1.2769852710418472,
"learning_rate": 9.998659721646393e-06,
"loss": 0.0781,
"step": 82
},
{
"epoch": 0.03776160145586897,
"grad_norm": 0.9020624084974485,
"learning_rate": 9.998626425598766e-06,
"loss": 0.0734,
"step": 83
},
{
"epoch": 0.03821656050955414,
"grad_norm": 0.9626764462141776,
"learning_rate": 9.99859272108596e-06,
"loss": 0.0719,
"step": 84
},
{
"epoch": 0.03867151956323931,
"grad_norm": 0.9435885887029873,
"learning_rate": 9.998558608110733e-06,
"loss": 0.0835,
"step": 85
},
{
"epoch": 0.039126478616924476,
"grad_norm": 1.0578725525123687,
"learning_rate": 9.998524086675867e-06,
"loss": 0.0746,
"step": 86
},
{
"epoch": 0.03958143767060965,
"grad_norm": 1.0366588534208079,
"learning_rate": 9.998489156784188e-06,
"loss": 0.0933,
"step": 87
},
{
"epoch": 0.040036396724294813,
"grad_norm": 1.0595948680723846,
"learning_rate": 9.998453818438547e-06,
"loss": 0.0846,
"step": 88
},
{
"epoch": 0.04049135577797998,
"grad_norm": 0.8807515753016749,
"learning_rate": 9.998418071641833e-06,
"loss": 0.0649,
"step": 89
},
{
"epoch": 0.04094631483166515,
"grad_norm": 0.9034225145874141,
"learning_rate": 9.998381916396967e-06,
"loss": 0.0621,
"step": 90
},
{
"epoch": 0.041401273885350316,
"grad_norm": 0.6732889821553815,
"learning_rate": 9.998345352706901e-06,
"loss": 0.0367,
"step": 91
},
{
"epoch": 0.04185623293903549,
"grad_norm": 0.7136967603743426,
"learning_rate": 9.998308380574628e-06,
"loss": 0.0569,
"step": 92
},
{
"epoch": 0.042311191992720654,
"grad_norm": 1.1459385364035048,
"learning_rate": 9.998271000003166e-06,
"loss": 0.1184,
"step": 93
},
{
"epoch": 0.042766151046405826,
"grad_norm": 0.8224906129097734,
"learning_rate": 9.998233210995569e-06,
"loss": 0.0682,
"step": 94
},
{
"epoch": 0.04322111010009099,
"grad_norm": 1.5182946932236698,
"learning_rate": 9.998195013554926e-06,
"loss": 0.0875,
"step": 95
},
{
"epoch": 0.04367606915377616,
"grad_norm": 0.9355855711018981,
"learning_rate": 9.998156407684359e-06,
"loss": 0.0939,
"step": 96
},
{
"epoch": 0.04413102820746133,
"grad_norm": 0.7329840867165283,
"learning_rate": 9.998117393387022e-06,
"loss": 0.0466,
"step": 97
},
{
"epoch": 0.044585987261146494,
"grad_norm": 0.8701001036058451,
"learning_rate": 9.9980779706661e-06,
"loss": 0.0729,
"step": 98
},
{
"epoch": 0.045040946314831666,
"grad_norm": 1.0218896298663185,
"learning_rate": 9.99803813952482e-06,
"loss": 0.0828,
"step": 99
},
{
"epoch": 0.04549590536851683,
"grad_norm": 0.9044995357273884,
"learning_rate": 9.997997899966433e-06,
"loss": 0.0709,
"step": 100
},
{
"epoch": 0.045950864422202004,
"grad_norm": 0.9877796099816964,
"learning_rate": 9.99795725199423e-06,
"loss": 0.0903,
"step": 101
},
{
"epoch": 0.04640582347588717,
"grad_norm": 1.0061501994463906,
"learning_rate": 9.99791619561153e-06,
"loss": 0.0831,
"step": 102
},
{
"epoch": 0.04686078252957234,
"grad_norm": 0.8789173954818107,
"learning_rate": 9.997874730821689e-06,
"loss": 0.0714,
"step": 103
},
{
"epoch": 0.047315741583257506,
"grad_norm": 15.480920098194954,
"learning_rate": 9.997832857628093e-06,
"loss": 0.2603,
"step": 104
},
{
"epoch": 0.04777070063694268,
"grad_norm": 1.3806761301603454,
"learning_rate": 9.99779057603417e-06,
"loss": 0.1227,
"step": 105
},
{
"epoch": 0.048225659690627844,
"grad_norm": 0.8462176607269959,
"learning_rate": 9.997747886043368e-06,
"loss": 0.0605,
"step": 106
},
{
"epoch": 0.04868061874431301,
"grad_norm": 0.7467169847716549,
"learning_rate": 9.997704787659179e-06,
"loss": 0.0618,
"step": 107
},
{
"epoch": 0.04913557779799818,
"grad_norm": 1.5653334818977065,
"learning_rate": 9.997661280885125e-06,
"loss": 0.1253,
"step": 108
},
{
"epoch": 0.049590536851683346,
"grad_norm": 0.871706038604149,
"learning_rate": 9.99761736572476e-06,
"loss": 0.0716,
"step": 109
},
{
"epoch": 0.05004549590536852,
"grad_norm": 1.1398296008355844,
"learning_rate": 9.997573042181672e-06,
"loss": 0.0698,
"step": 110
},
{
"epoch": 0.050500454959053684,
"grad_norm": 1.0487992691419916,
"learning_rate": 9.997528310259485e-06,
"loss": 0.1102,
"step": 111
},
{
"epoch": 0.050955414012738856,
"grad_norm": 0.9112684449646818,
"learning_rate": 9.997483169961852e-06,
"loss": 0.1032,
"step": 112
},
{
"epoch": 0.05141037306642402,
"grad_norm": 0.9418790141923585,
"learning_rate": 9.997437621292463e-06,
"loss": 0.0771,
"step": 113
},
{
"epoch": 0.051865332120109194,
"grad_norm": 0.7796140692842074,
"learning_rate": 9.99739166425504e-06,
"loss": 0.0627,
"step": 114
},
{
"epoch": 0.05232029117379436,
"grad_norm": 1.5434421216734795,
"learning_rate": 9.997345298853339e-06,
"loss": 0.1495,
"step": 115
},
{
"epoch": 0.052775250227479524,
"grad_norm": 0.8898179660551836,
"learning_rate": 9.997298525091148e-06,
"loss": 0.0735,
"step": 116
},
{
"epoch": 0.053230209281164696,
"grad_norm": 0.8585916871524272,
"learning_rate": 9.997251342972288e-06,
"loss": 0.068,
"step": 117
},
{
"epoch": 0.05368516833484986,
"grad_norm": 0.812806800238708,
"learning_rate": 9.997203752500616e-06,
"loss": 0.0689,
"step": 118
},
{
"epoch": 0.054140127388535034,
"grad_norm": 0.9677722064277628,
"learning_rate": 9.997155753680021e-06,
"loss": 0.0795,
"step": 119
},
{
"epoch": 0.0545950864422202,
"grad_norm": 1.621934591654054,
"learning_rate": 9.997107346514425e-06,
"loss": 0.0707,
"step": 120
},
{
"epoch": 0.05505004549590537,
"grad_norm": 0.6750452750311531,
"learning_rate": 9.997058531007782e-06,
"loss": 0.0588,
"step": 121
},
{
"epoch": 0.055505004549590536,
"grad_norm": 0.9583870506818666,
"learning_rate": 9.997009307164083e-06,
"loss": 0.0859,
"step": 122
},
{
"epoch": 0.05595996360327571,
"grad_norm": 1.247483970027119,
"learning_rate": 9.99695967498735e-06,
"loss": 0.0952,
"step": 123
},
{
"epoch": 0.056414922656960874,
"grad_norm": 0.7937903902273558,
"learning_rate": 9.996909634481639e-06,
"loss": 0.0614,
"step": 124
},
{
"epoch": 0.05686988171064604,
"grad_norm": 4.855426128828546,
"learning_rate": 9.996859185651038e-06,
"loss": 0.1629,
"step": 125
},
{
"epoch": 0.05732484076433121,
"grad_norm": 1.0499970639607177,
"learning_rate": 9.99680832849967e-06,
"loss": 0.1031,
"step": 126
},
{
"epoch": 0.05777979981801638,
"grad_norm": 0.8730447821488512,
"learning_rate": 9.99675706303169e-06,
"loss": 0.0606,
"step": 127
},
{
"epoch": 0.05823475887170155,
"grad_norm": 1.2779985416162813,
"learning_rate": 9.99670538925129e-06,
"loss": 0.074,
"step": 128
},
{
"epoch": 0.058689717925386714,
"grad_norm": 0.8606157718419157,
"learning_rate": 9.996653307162687e-06,
"loss": 0.0703,
"step": 129
},
{
"epoch": 0.059144676979071886,
"grad_norm": 0.8920761218762643,
"learning_rate": 9.996600816770144e-06,
"loss": 0.0818,
"step": 130
},
{
"epoch": 0.05959963603275705,
"grad_norm": 1.1603462045917847,
"learning_rate": 9.996547918077944e-06,
"loss": 0.1148,
"step": 131
},
{
"epoch": 0.06005459508644222,
"grad_norm": 0.9108713801214797,
"learning_rate": 9.996494611090414e-06,
"loss": 0.0884,
"step": 132
},
{
"epoch": 0.06050955414012739,
"grad_norm": 0.6523725468628359,
"learning_rate": 9.996440895811907e-06,
"loss": 0.0535,
"step": 133
},
{
"epoch": 0.060964513193812554,
"grad_norm": 0.8812777694752004,
"learning_rate": 9.996386772246816e-06,
"loss": 0.087,
"step": 134
},
{
"epoch": 0.061419472247497726,
"grad_norm": 1.0622191207422995,
"learning_rate": 9.99633224039956e-06,
"loss": 0.0982,
"step": 135
},
{
"epoch": 0.06187443130118289,
"grad_norm": 3.7961077321923025,
"learning_rate": 9.996277300274596e-06,
"loss": 0.1526,
"step": 136
},
{
"epoch": 0.062329390354868064,
"grad_norm": 0.9444433559435487,
"learning_rate": 9.996221951876415e-06,
"loss": 0.0996,
"step": 137
},
{
"epoch": 0.06278434940855324,
"grad_norm": 1.444871481552235,
"learning_rate": 9.996166195209539e-06,
"loss": 0.1075,
"step": 138
},
{
"epoch": 0.0632393084622384,
"grad_norm": 0.7446446480732116,
"learning_rate": 9.996110030278522e-06,
"loss": 0.0561,
"step": 139
},
{
"epoch": 0.06369426751592357,
"grad_norm": 0.8913010543094952,
"learning_rate": 9.996053457087958e-06,
"loss": 0.0715,
"step": 140
},
{
"epoch": 0.06414922656960874,
"grad_norm": 0.7815821404043856,
"learning_rate": 9.995996475642466e-06,
"loss": 0.0796,
"step": 141
},
{
"epoch": 0.0646041856232939,
"grad_norm": 0.74337588448595,
"learning_rate": 9.995939085946704e-06,
"loss": 0.0661,
"step": 142
},
{
"epoch": 0.06505914467697907,
"grad_norm": 0.9974255688753435,
"learning_rate": 9.995881288005363e-06,
"loss": 0.0869,
"step": 143
},
{
"epoch": 0.06551410373066424,
"grad_norm": 1.2260290141946268,
"learning_rate": 9.995823081823162e-06,
"loss": 0.0766,
"step": 144
},
{
"epoch": 0.06596906278434941,
"grad_norm": 0.9751795993584637,
"learning_rate": 9.99576446740486e-06,
"loss": 0.091,
"step": 145
},
{
"epoch": 0.06642402183803457,
"grad_norm": 1.6175476325168967,
"learning_rate": 9.995705444755249e-06,
"loss": 0.1208,
"step": 146
},
{
"epoch": 0.06687898089171974,
"grad_norm": 0.7580083688127299,
"learning_rate": 9.995646013879147e-06,
"loss": 0.0622,
"step": 147
},
{
"epoch": 0.06733393994540492,
"grad_norm": 1.0194887039793072,
"learning_rate": 9.995586174781413e-06,
"loss": 0.0753,
"step": 148
},
{
"epoch": 0.06778889899909009,
"grad_norm": 0.9065646408503975,
"learning_rate": 9.995525927466936e-06,
"loss": 0.0848,
"step": 149
},
{
"epoch": 0.06824385805277525,
"grad_norm": 0.8871078738477127,
"learning_rate": 9.995465271940641e-06,
"loss": 0.0607,
"step": 150
},
{
"epoch": 0.06869881710646042,
"grad_norm": 1.1486707652049646,
"learning_rate": 9.995404208207485e-06,
"loss": 0.0809,
"step": 151
},
{
"epoch": 0.06915377616014559,
"grad_norm": 1.1473150526096232,
"learning_rate": 9.995342736272453e-06,
"loss": 0.1035,
"step": 152
},
{
"epoch": 0.06960873521383075,
"grad_norm": 1.3025683052462544,
"learning_rate": 9.995280856140572e-06,
"loss": 0.1197,
"step": 153
},
{
"epoch": 0.07006369426751592,
"grad_norm": 0.8069596755970996,
"learning_rate": 9.9952185678169e-06,
"loss": 0.0526,
"step": 154
},
{
"epoch": 0.0705186533212011,
"grad_norm": 0.8153700064848134,
"learning_rate": 9.995155871306524e-06,
"loss": 0.0613,
"step": 155
},
{
"epoch": 0.07097361237488627,
"grad_norm": 0.7319023745966868,
"learning_rate": 9.995092766614567e-06,
"loss": 0.0512,
"step": 156
},
{
"epoch": 0.07142857142857142,
"grad_norm": 1.0146656175738817,
"learning_rate": 9.995029253746186e-06,
"loss": 0.0846,
"step": 157
},
{
"epoch": 0.0718835304822566,
"grad_norm": 0.8015254985373994,
"learning_rate": 9.994965332706574e-06,
"loss": 0.0619,
"step": 158
},
{
"epoch": 0.07233848953594177,
"grad_norm": 1.0630207312416284,
"learning_rate": 9.994901003500952e-06,
"loss": 0.0796,
"step": 159
},
{
"epoch": 0.07279344858962693,
"grad_norm": 0.9431304991088505,
"learning_rate": 9.994836266134575e-06,
"loss": 0.0743,
"step": 160
},
{
"epoch": 0.0732484076433121,
"grad_norm": 1.023738915097686,
"learning_rate": 9.994771120612737e-06,
"loss": 0.0888,
"step": 161
},
{
"epoch": 0.07370336669699727,
"grad_norm": 0.9272637744585672,
"learning_rate": 9.994705566940757e-06,
"loss": 0.084,
"step": 162
},
{
"epoch": 0.07415832575068244,
"grad_norm": 1.122378326253592,
"learning_rate": 9.994639605123994e-06,
"loss": 0.0961,
"step": 163
},
{
"epoch": 0.0746132848043676,
"grad_norm": 0.753531768411978,
"learning_rate": 9.994573235167839e-06,
"loss": 0.0736,
"step": 164
},
{
"epoch": 0.07506824385805277,
"grad_norm": 0.9314766958597749,
"learning_rate": 9.994506457077715e-06,
"loss": 0.0838,
"step": 165
},
{
"epoch": 0.07552320291173795,
"grad_norm": 0.996008388557059,
"learning_rate": 9.994439270859077e-06,
"loss": 0.1076,
"step": 166
},
{
"epoch": 0.07597816196542312,
"grad_norm": 0.9199332464612126,
"learning_rate": 9.994371676517418e-06,
"loss": 0.0724,
"step": 167
},
{
"epoch": 0.07643312101910828,
"grad_norm": 0.8652292283168678,
"learning_rate": 9.994303674058259e-06,
"loss": 0.0628,
"step": 168
},
{
"epoch": 0.07688808007279345,
"grad_norm": 0.8176262426438138,
"learning_rate": 9.994235263487158e-06,
"loss": 0.0743,
"step": 169
},
{
"epoch": 0.07734303912647862,
"grad_norm": 0.8147855247941459,
"learning_rate": 9.994166444809705e-06,
"loss": 0.0559,
"step": 170
},
{
"epoch": 0.07779799818016378,
"grad_norm": 0.7853019575635352,
"learning_rate": 9.994097218031524e-06,
"loss": 0.0681,
"step": 171
},
{
"epoch": 0.07825295723384895,
"grad_norm": 0.8445610480134321,
"learning_rate": 9.994027583158272e-06,
"loss": 0.0785,
"step": 172
},
{
"epoch": 0.07870791628753412,
"grad_norm": 0.8555498692388026,
"learning_rate": 9.993957540195638e-06,
"loss": 0.077,
"step": 173
},
{
"epoch": 0.0791628753412193,
"grad_norm": 0.8281270493499452,
"learning_rate": 9.993887089149346e-06,
"loss": 0.0848,
"step": 174
},
{
"epoch": 0.07961783439490445,
"grad_norm": 0.7180425978661062,
"learning_rate": 9.993816230025152e-06,
"loss": 0.0588,
"step": 175
},
{
"epoch": 0.08007279344858963,
"grad_norm": 0.9287545326980071,
"learning_rate": 9.99374496282885e-06,
"loss": 0.0874,
"step": 176
},
{
"epoch": 0.0805277525022748,
"grad_norm": 1.5950603980195528,
"learning_rate": 9.993673287566261e-06,
"loss": 0.1301,
"step": 177
},
{
"epoch": 0.08098271155595996,
"grad_norm": 0.505966633973175,
"learning_rate": 9.99360120424324e-06,
"loss": 0.0459,
"step": 178
},
{
"epoch": 0.08143767060964513,
"grad_norm": 0.6170796905443107,
"learning_rate": 9.993528712865681e-06,
"loss": 0.0666,
"step": 179
},
{
"epoch": 0.0818926296633303,
"grad_norm": 0.8965600572228928,
"learning_rate": 9.993455813439507e-06,
"loss": 0.0648,
"step": 180
},
{
"epoch": 0.08234758871701547,
"grad_norm": 0.7555745664692847,
"learning_rate": 9.993382505970673e-06,
"loss": 0.0479,
"step": 181
},
{
"epoch": 0.08280254777070063,
"grad_norm": 0.7885826993774436,
"learning_rate": 9.99330879046517e-06,
"loss": 0.0605,
"step": 182
},
{
"epoch": 0.0832575068243858,
"grad_norm": 0.6970911126559147,
"learning_rate": 9.993234666929024e-06,
"loss": 0.0545,
"step": 183
},
{
"epoch": 0.08371246587807098,
"grad_norm": 0.8281240642020996,
"learning_rate": 9.99316013536829e-06,
"loss": 0.0651,
"step": 184
},
{
"epoch": 0.08416742493175614,
"grad_norm": 0.8497823551734951,
"learning_rate": 9.993085195789057e-06,
"loss": 0.098,
"step": 185
},
{
"epoch": 0.08462238398544131,
"grad_norm": 0.8425278224044996,
"learning_rate": 9.993009848197452e-06,
"loss": 0.0861,
"step": 186
},
{
"epoch": 0.08507734303912648,
"grad_norm": 0.729342450692031,
"learning_rate": 9.992934092599629e-06,
"loss": 0.0651,
"step": 187
},
{
"epoch": 0.08553230209281165,
"grad_norm": 0.8810253378927329,
"learning_rate": 9.99285792900178e-06,
"loss": 0.0995,
"step": 188
},
{
"epoch": 0.08598726114649681,
"grad_norm": 1.0402457083445067,
"learning_rate": 9.992781357410131e-06,
"loss": 0.1061,
"step": 189
},
{
"epoch": 0.08644222020018198,
"grad_norm": 0.7397036090930822,
"learning_rate": 9.992704377830934e-06,
"loss": 0.0571,
"step": 190
},
{
"epoch": 0.08689717925386715,
"grad_norm": 1.4783630598693296,
"learning_rate": 9.992626990270484e-06,
"loss": 0.1154,
"step": 191
},
{
"epoch": 0.08735213830755233,
"grad_norm": 1.1100322283473036,
"learning_rate": 9.992549194735101e-06,
"loss": 0.1179,
"step": 192
},
{
"epoch": 0.08780709736123748,
"grad_norm": 0.5797984556503705,
"learning_rate": 9.992470991231144e-06,
"loss": 0.0466,
"step": 193
},
{
"epoch": 0.08826205641492266,
"grad_norm": 1.059908713900853,
"learning_rate": 9.992392379765005e-06,
"loss": 0.0994,
"step": 194
},
{
"epoch": 0.08871701546860783,
"grad_norm": 1.1187885391430794,
"learning_rate": 9.992313360343104e-06,
"loss": 0.0986,
"step": 195
},
{
"epoch": 0.08917197452229299,
"grad_norm": 0.7509441330173129,
"learning_rate": 9.992233932971901e-06,
"loss": 0.0634,
"step": 196
},
{
"epoch": 0.08962693357597816,
"grad_norm": 0.9426276516690344,
"learning_rate": 9.992154097657888e-06,
"loss": 0.0857,
"step": 197
},
{
"epoch": 0.09008189262966333,
"grad_norm": 0.8754039034503873,
"learning_rate": 9.992073854407585e-06,
"loss": 0.0881,
"step": 198
},
{
"epoch": 0.0905368516833485,
"grad_norm": 2.8697219156120712,
"learning_rate": 9.99199320322755e-06,
"loss": 0.0851,
"step": 199
},
{
"epoch": 0.09099181073703366,
"grad_norm": 0.7429242681646778,
"learning_rate": 9.991912144124375e-06,
"loss": 0.0729,
"step": 200
},
{
"epoch": 0.09144676979071883,
"grad_norm": 1.0552979449251756,
"learning_rate": 9.991830677104682e-06,
"loss": 0.1066,
"step": 201
},
{
"epoch": 0.09190172884440401,
"grad_norm": 0.8812651371324355,
"learning_rate": 9.99174880217513e-06,
"loss": 0.0732,
"step": 202
},
{
"epoch": 0.09235668789808917,
"grad_norm": 1.0755107845413352,
"learning_rate": 9.991666519342407e-06,
"loss": 0.0977,
"step": 203
},
{
"epoch": 0.09281164695177434,
"grad_norm": 0.8925063431256136,
"learning_rate": 9.99158382861324e-06,
"loss": 0.0904,
"step": 204
},
{
"epoch": 0.09326660600545951,
"grad_norm": 0.8190206986922173,
"learning_rate": 9.991500729994384e-06,
"loss": 0.0729,
"step": 205
},
{
"epoch": 0.09372156505914468,
"grad_norm": 0.6635798147425112,
"learning_rate": 9.991417223492629e-06,
"loss": 0.0631,
"step": 206
},
{
"epoch": 0.09417652411282984,
"grad_norm": 1.0314655306023923,
"learning_rate": 9.991333309114798e-06,
"loss": 0.0852,
"step": 207
},
{
"epoch": 0.09463148316651501,
"grad_norm": 0.8533496857694978,
"learning_rate": 9.991248986867753e-06,
"loss": 0.0868,
"step": 208
},
{
"epoch": 0.09508644222020018,
"grad_norm": 1.039085255997433,
"learning_rate": 9.991164256758378e-06,
"loss": 0.095,
"step": 209
},
{
"epoch": 0.09554140127388536,
"grad_norm": 1.1484522866350177,
"learning_rate": 9.9910791187936e-06,
"loss": 0.1333,
"step": 210
},
{
"epoch": 0.09599636032757052,
"grad_norm": 0.8277820800102422,
"learning_rate": 9.99099357298038e-06,
"loss": 0.0664,
"step": 211
},
{
"epoch": 0.09645131938125569,
"grad_norm": 0.821796111319934,
"learning_rate": 9.9909076193257e-06,
"loss": 0.083,
"step": 212
},
{
"epoch": 0.09690627843494086,
"grad_norm": 0.9448800546720313,
"learning_rate": 9.990821257836589e-06,
"loss": 0.0873,
"step": 213
},
{
"epoch": 0.09736123748862602,
"grad_norm": 0.9002810379340489,
"learning_rate": 9.990734488520103e-06,
"loss": 0.099,
"step": 214
},
{
"epoch": 0.09781619654231119,
"grad_norm": 0.6145149717344348,
"learning_rate": 9.990647311383334e-06,
"loss": 0.0425,
"step": 215
},
{
"epoch": 0.09827115559599636,
"grad_norm": 1.1377497370761045,
"learning_rate": 9.990559726433404e-06,
"loss": 0.0903,
"step": 216
},
{
"epoch": 0.09872611464968153,
"grad_norm": 0.8401357673155365,
"learning_rate": 9.99047173367747e-06,
"loss": 0.0812,
"step": 217
},
{
"epoch": 0.09918107370336669,
"grad_norm": 0.6977882365614015,
"learning_rate": 9.990383333122722e-06,
"loss": 0.0613,
"step": 218
},
{
"epoch": 0.09963603275705187,
"grad_norm": 0.6751056796776193,
"learning_rate": 9.990294524776384e-06,
"loss": 0.0636,
"step": 219
},
{
"epoch": 0.10009099181073704,
"grad_norm": 0.7973250315161167,
"learning_rate": 9.990205308645716e-06,
"loss": 0.0655,
"step": 220
},
{
"epoch": 0.1005459508644222,
"grad_norm": 0.6494979859380491,
"learning_rate": 9.990115684738005e-06,
"loss": 0.0461,
"step": 221
},
{
"epoch": 0.10100090991810737,
"grad_norm": 0.7863907355652456,
"learning_rate": 9.990025653060574e-06,
"loss": 0.0881,
"step": 222
},
{
"epoch": 0.10145586897179254,
"grad_norm": 1.2756737972223395,
"learning_rate": 9.98993521362078e-06,
"loss": 0.1102,
"step": 223
},
{
"epoch": 0.10191082802547771,
"grad_norm": 1.1992554133605928,
"learning_rate": 9.989844366426018e-06,
"loss": 0.1147,
"step": 224
},
{
"epoch": 0.10236578707916287,
"grad_norm": 0.5034605400337953,
"learning_rate": 9.989753111483707e-06,
"loss": 0.0462,
"step": 225
},
{
"epoch": 0.10282074613284804,
"grad_norm": 0.9881921480518578,
"learning_rate": 9.989661448801305e-06,
"loss": 0.0848,
"step": 226
},
{
"epoch": 0.10327570518653321,
"grad_norm": 0.7581777568438945,
"learning_rate": 9.989569378386303e-06,
"loss": 0.079,
"step": 227
},
{
"epoch": 0.10373066424021839,
"grad_norm": 0.6464731162067388,
"learning_rate": 9.989476900246223e-06,
"loss": 0.0617,
"step": 228
},
{
"epoch": 0.10418562329390355,
"grad_norm": 0.8780639185859085,
"learning_rate": 9.989384014388624e-06,
"loss": 0.086,
"step": 229
},
{
"epoch": 0.10464058234758872,
"grad_norm": 0.6623808171307163,
"learning_rate": 9.989290720821095e-06,
"loss": 0.0694,
"step": 230
},
{
"epoch": 0.10509554140127389,
"grad_norm": 0.721054554263859,
"learning_rate": 9.98919701955126e-06,
"loss": 0.0735,
"step": 231
},
{
"epoch": 0.10555050045495905,
"grad_norm": 0.7868134014829404,
"learning_rate": 9.989102910586776e-06,
"loss": 0.0546,
"step": 232
},
{
"epoch": 0.10600545950864422,
"grad_norm": 0.9137158371163484,
"learning_rate": 9.989008393935331e-06,
"loss": 0.0771,
"step": 233
},
{
"epoch": 0.10646041856232939,
"grad_norm": 0.8326009579593463,
"learning_rate": 9.98891346960465e-06,
"loss": 0.0667,
"step": 234
},
{
"epoch": 0.10691537761601456,
"grad_norm": 0.6462724580348628,
"learning_rate": 9.988818137602494e-06,
"loss": 0.0717,
"step": 235
},
{
"epoch": 0.10737033666969972,
"grad_norm": 0.7513725247558808,
"learning_rate": 9.988722397936646e-06,
"loss": 0.0733,
"step": 236
},
{
"epoch": 0.1078252957233849,
"grad_norm": 1.094509848236789,
"learning_rate": 9.988626250614932e-06,
"loss": 0.1009,
"step": 237
},
{
"epoch": 0.10828025477707007,
"grad_norm": 0.8200579138639758,
"learning_rate": 9.98852969564521e-06,
"loss": 0.0844,
"step": 238
},
{
"epoch": 0.10873521383075523,
"grad_norm": 0.7417763562196316,
"learning_rate": 9.988432733035369e-06,
"loss": 0.0611,
"step": 239
},
{
"epoch": 0.1091901728844404,
"grad_norm": 0.8476475869820355,
"learning_rate": 9.988335362793333e-06,
"loss": 0.0863,
"step": 240
},
{
"epoch": 0.10964513193812557,
"grad_norm": 0.9998642783878469,
"learning_rate": 9.988237584927058e-06,
"loss": 0.0909,
"step": 241
},
{
"epoch": 0.11010009099181074,
"grad_norm": 1.1689324698997519,
"learning_rate": 9.988139399444534e-06,
"loss": 0.124,
"step": 242
},
{
"epoch": 0.1105550500454959,
"grad_norm": 0.790901332269412,
"learning_rate": 9.988040806353786e-06,
"loss": 0.0855,
"step": 243
},
{
"epoch": 0.11101000909918107,
"grad_norm": 0.8931785977847209,
"learning_rate": 9.987941805662869e-06,
"loss": 0.1023,
"step": 244
},
{
"epoch": 0.11146496815286625,
"grad_norm": 0.7352781929773609,
"learning_rate": 9.98784239737987e-06,
"loss": 0.0563,
"step": 245
},
{
"epoch": 0.11191992720655142,
"grad_norm": 0.7169092611535308,
"learning_rate": 9.987742581512919e-06,
"loss": 0.0683,
"step": 246
},
{
"epoch": 0.11237488626023658,
"grad_norm": 0.6767560569792272,
"learning_rate": 9.987642358070167e-06,
"loss": 0.0669,
"step": 247
},
{
"epoch": 0.11282984531392175,
"grad_norm": 0.8442319805699996,
"learning_rate": 9.987541727059805e-06,
"loss": 0.0768,
"step": 248
},
{
"epoch": 0.11328480436760692,
"grad_norm": 0.7700876798522618,
"learning_rate": 9.987440688490058e-06,
"loss": 0.0643,
"step": 249
},
{
"epoch": 0.11373976342129208,
"grad_norm": 0.7286087978317647,
"learning_rate": 9.98733924236918e-06,
"loss": 0.0698,
"step": 250
},
{
"epoch": 0.11419472247497725,
"grad_norm": 0.7917355018437868,
"learning_rate": 9.98723738870546e-06,
"loss": 0.0791,
"step": 251
},
{
"epoch": 0.11464968152866242,
"grad_norm": 1.0469499693242315,
"learning_rate": 9.987135127507226e-06,
"loss": 0.0761,
"step": 252
},
{
"epoch": 0.1151046405823476,
"grad_norm": 0.8361714930383379,
"learning_rate": 9.987032458782828e-06,
"loss": 0.0789,
"step": 253
},
{
"epoch": 0.11555959963603275,
"grad_norm": 0.5902853873046482,
"learning_rate": 9.986929382540662e-06,
"loss": 0.0479,
"step": 254
},
{
"epoch": 0.11601455868971793,
"grad_norm": 0.7349436304465384,
"learning_rate": 9.986825898789145e-06,
"loss": 0.0668,
"step": 255
},
{
"epoch": 0.1164695177434031,
"grad_norm": 0.7657107039148755,
"learning_rate": 9.986722007536737e-06,
"loss": 0.0617,
"step": 256
},
{
"epoch": 0.11692447679708826,
"grad_norm": 0.6450631027744769,
"learning_rate": 9.986617708791926e-06,
"loss": 0.0679,
"step": 257
},
{
"epoch": 0.11737943585077343,
"grad_norm": 0.6292930010016882,
"learning_rate": 9.986513002563236e-06,
"loss": 0.0482,
"step": 258
},
{
"epoch": 0.1178343949044586,
"grad_norm": 0.8758541343517451,
"learning_rate": 9.986407888859221e-06,
"loss": 0.0994,
"step": 259
},
{
"epoch": 0.11828935395814377,
"grad_norm": 0.6537445862223847,
"learning_rate": 9.986302367688473e-06,
"loss": 0.07,
"step": 260
},
{
"epoch": 0.11874431301182893,
"grad_norm": 0.8029660816844667,
"learning_rate": 9.986196439059613e-06,
"loss": 0.0623,
"step": 261
},
{
"epoch": 0.1191992720655141,
"grad_norm": 0.7339528606524214,
"learning_rate": 9.986090102981297e-06,
"loss": 0.0791,
"step": 262
},
{
"epoch": 0.11965423111919928,
"grad_norm": 0.7934112522002073,
"learning_rate": 9.985983359462215e-06,
"loss": 0.0672,
"step": 263
},
{
"epoch": 0.12010919017288443,
"grad_norm": 1.0186962263060808,
"learning_rate": 9.98587620851109e-06,
"loss": 0.1213,
"step": 264
},
{
"epoch": 0.1205641492265696,
"grad_norm": 0.6769843647605545,
"learning_rate": 9.985768650136679e-06,
"loss": 0.0685,
"step": 265
},
{
"epoch": 0.12101910828025478,
"grad_norm": 0.7543020935976431,
"learning_rate": 9.985660684347765e-06,
"loss": 0.0861,
"step": 266
},
{
"epoch": 0.12147406733393995,
"grad_norm": 0.9552124731299731,
"learning_rate": 9.985552311153178e-06,
"loss": 0.0922,
"step": 267
},
{
"epoch": 0.12192902638762511,
"grad_norm": 0.7436699167226903,
"learning_rate": 9.985443530561769e-06,
"loss": 0.0885,
"step": 268
},
{
"epoch": 0.12238398544131028,
"grad_norm": 1.329058937551934,
"learning_rate": 9.98533434258243e-06,
"loss": 0.1115,
"step": 269
},
{
"epoch": 0.12283894449499545,
"grad_norm": 0.6835909813818813,
"learning_rate": 9.985224747224083e-06,
"loss": 0.0586,
"step": 270
},
{
"epoch": 0.12329390354868063,
"grad_norm": 1.0733107060854794,
"learning_rate": 9.98511474449568e-06,
"loss": 0.0811,
"step": 271
},
{
"epoch": 0.12374886260236578,
"grad_norm": 0.5916007278667166,
"learning_rate": 9.985004334406215e-06,
"loss": 0.0696,
"step": 272
},
{
"epoch": 0.12420382165605096,
"grad_norm": 0.9149357508392912,
"learning_rate": 9.984893516964707e-06,
"loss": 0.0704,
"step": 273
},
{
"epoch": 0.12465878070973613,
"grad_norm": 1.1634742377762608,
"learning_rate": 9.984782292180212e-06,
"loss": 0.1178,
"step": 274
},
{
"epoch": 0.1251137397634213,
"grad_norm": 0.603957454908005,
"learning_rate": 9.98467066006182e-06,
"loss": 0.0585,
"step": 275
},
{
"epoch": 0.12556869881710647,
"grad_norm": 0.7735087790025026,
"learning_rate": 9.984558620618651e-06,
"loss": 0.0953,
"step": 276
},
{
"epoch": 0.12602365787079162,
"grad_norm": 1.2570182633873541,
"learning_rate": 9.984446173859863e-06,
"loss": 0.1353,
"step": 277
},
{
"epoch": 0.1264786169244768,
"grad_norm": 0.7275895818672663,
"learning_rate": 9.984333319794642e-06,
"loss": 0.0774,
"step": 278
},
{
"epoch": 0.12693357597816196,
"grad_norm": 0.6395006056363333,
"learning_rate": 9.984220058432212e-06,
"loss": 0.0591,
"step": 279
},
{
"epoch": 0.12738853503184713,
"grad_norm": 0.6563921850032347,
"learning_rate": 9.984106389781828e-06,
"loss": 0.0573,
"step": 280
},
{
"epoch": 0.1278434940855323,
"grad_norm": 0.9399157526953884,
"learning_rate": 9.983992313852776e-06,
"loss": 0.0793,
"step": 281
},
{
"epoch": 0.12829845313921748,
"grad_norm": 0.93528061821534,
"learning_rate": 9.983877830654381e-06,
"loss": 0.0807,
"step": 282
},
{
"epoch": 0.12875341219290265,
"grad_norm": 0.7192448233352142,
"learning_rate": 9.983762940195996e-06,
"loss": 0.0773,
"step": 283
},
{
"epoch": 0.1292083712465878,
"grad_norm": 0.7097381072031733,
"learning_rate": 9.98364764248701e-06,
"loss": 0.0698,
"step": 284
},
{
"epoch": 0.12966333030027297,
"grad_norm": 1.1635566012920768,
"learning_rate": 9.983531937536844e-06,
"loss": 0.0893,
"step": 285
},
{
"epoch": 0.13011828935395814,
"grad_norm": 0.8456555685011555,
"learning_rate": 9.983415825354954e-06,
"loss": 0.0628,
"step": 286
},
{
"epoch": 0.1305732484076433,
"grad_norm": 0.7151838393189083,
"learning_rate": 9.983299305950828e-06,
"loss": 0.0557,
"step": 287
},
{
"epoch": 0.13102820746132848,
"grad_norm": 0.7095193783870621,
"learning_rate": 9.983182379333989e-06,
"loss": 0.0604,
"step": 288
},
{
"epoch": 0.13148316651501366,
"grad_norm": 0.8581434444337498,
"learning_rate": 9.983065045513986e-06,
"loss": 0.0781,
"step": 289
},
{
"epoch": 0.13193812556869883,
"grad_norm": 0.5600994934804626,
"learning_rate": 9.982947304500414e-06,
"loss": 0.0498,
"step": 290
},
{
"epoch": 0.13239308462238397,
"grad_norm": 0.7355720212694087,
"learning_rate": 9.98282915630289e-06,
"loss": 0.0692,
"step": 291
},
{
"epoch": 0.13284804367606914,
"grad_norm": 1.6846985851500909,
"learning_rate": 9.98271060093107e-06,
"loss": 0.1687,
"step": 292
},
{
"epoch": 0.13330300272975432,
"grad_norm": 0.7959406174268434,
"learning_rate": 9.98259163839464e-06,
"loss": 0.0718,
"step": 293
},
{
"epoch": 0.1337579617834395,
"grad_norm": 0.6005858848115938,
"learning_rate": 9.982472268703323e-06,
"loss": 0.0465,
"step": 294
},
{
"epoch": 0.13421292083712466,
"grad_norm": 0.7865103977061746,
"learning_rate": 9.982352491866874e-06,
"loss": 0.071,
"step": 295
},
{
"epoch": 0.13466787989080983,
"grad_norm": 0.7167219429964851,
"learning_rate": 9.982232307895077e-06,
"loss": 0.0658,
"step": 296
},
{
"epoch": 0.135122838944495,
"grad_norm": 1.206398567596641,
"learning_rate": 9.982111716797758e-06,
"loss": 0.101,
"step": 297
},
{
"epoch": 0.13557779799818018,
"grad_norm": 1.0085912508470862,
"learning_rate": 9.981990718584768e-06,
"loss": 0.0959,
"step": 298
},
{
"epoch": 0.13603275705186532,
"grad_norm": 0.8594135430057543,
"learning_rate": 9.981869313265995e-06,
"loss": 0.0912,
"step": 299
},
{
"epoch": 0.1364877161055505,
"grad_norm": 0.9903339586980618,
"learning_rate": 9.981747500851357e-06,
"loss": 0.0692,
"step": 300
},
{
"epoch": 0.13694267515923567,
"grad_norm": 0.7623380548666351,
"learning_rate": 9.981625281350812e-06,
"loss": 0.0699,
"step": 301
},
{
"epoch": 0.13739763421292084,
"grad_norm": 0.6267143484055344,
"learning_rate": 9.981502654774349e-06,
"loss": 0.0499,
"step": 302
},
{
"epoch": 0.137852593266606,
"grad_norm": 0.8234150836820757,
"learning_rate": 9.98137962113198e-06,
"loss": 0.0788,
"step": 303
},
{
"epoch": 0.13830755232029118,
"grad_norm": 0.8158733102806115,
"learning_rate": 9.98125618043377e-06,
"loss": 0.089,
"step": 304
},
{
"epoch": 0.13876251137397635,
"grad_norm": 0.6372656549463032,
"learning_rate": 9.981132332689796e-06,
"loss": 0.0517,
"step": 305
},
{
"epoch": 0.1392174704276615,
"grad_norm": 0.7713863813548327,
"learning_rate": 9.981008077910184e-06,
"loss": 0.0769,
"step": 306
},
{
"epoch": 0.13967242948134667,
"grad_norm": 0.8883775702857831,
"learning_rate": 9.980883416105084e-06,
"loss": 0.0828,
"step": 307
},
{
"epoch": 0.14012738853503184,
"grad_norm": 0.6490936355626988,
"learning_rate": 9.980758347284687e-06,
"loss": 0.0618,
"step": 308
},
{
"epoch": 0.14058234758871702,
"grad_norm": 0.8359554084586713,
"learning_rate": 9.980632871459209e-06,
"loss": 0.0714,
"step": 309
},
{
"epoch": 0.1410373066424022,
"grad_norm": 0.7373523328454649,
"learning_rate": 9.980506988638906e-06,
"loss": 0.0836,
"step": 310
},
{
"epoch": 0.14149226569608736,
"grad_norm": 0.6644370731485183,
"learning_rate": 9.980380698834064e-06,
"loss": 0.0777,
"step": 311
},
{
"epoch": 0.14194722474977253,
"grad_norm": 0.870883965477211,
"learning_rate": 9.980254002055003e-06,
"loss": 0.0847,
"step": 312
},
{
"epoch": 0.14240218380345768,
"grad_norm": 0.6021065409531002,
"learning_rate": 9.980126898312074e-06,
"loss": 0.0583,
"step": 313
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.8705461588189498,
"learning_rate": 9.979999387615665e-06,
"loss": 0.0895,
"step": 314
},
{
"epoch": 0.14331210191082802,
"grad_norm": 0.9639410731114018,
"learning_rate": 9.979871469976197e-06,
"loss": 0.0901,
"step": 315
},
{
"epoch": 0.1437670609645132,
"grad_norm": 0.7554126383153169,
"learning_rate": 9.97974314540412e-06,
"loss": 0.0699,
"step": 316
},
{
"epoch": 0.14422202001819837,
"grad_norm": 1.1039648440512544,
"learning_rate": 9.979614413909922e-06,
"loss": 0.1013,
"step": 317
},
{
"epoch": 0.14467697907188354,
"grad_norm": 0.5258831871743486,
"learning_rate": 9.979485275504121e-06,
"loss": 0.0544,
"step": 318
},
{
"epoch": 0.1451319381255687,
"grad_norm": 1.3025897394440575,
"learning_rate": 9.979355730197271e-06,
"loss": 0.1067,
"step": 319
},
{
"epoch": 0.14558689717925385,
"grad_norm": 0.5206132423310033,
"learning_rate": 9.979225777999956e-06,
"loss": 0.0497,
"step": 320
},
{
"epoch": 0.14604185623293903,
"grad_norm": 0.7202189397663867,
"learning_rate": 9.9790954189228e-06,
"loss": 0.0807,
"step": 321
},
{
"epoch": 0.1464968152866242,
"grad_norm": 0.5738667169449175,
"learning_rate": 9.97896465297645e-06,
"loss": 0.0614,
"step": 322
},
{
"epoch": 0.14695177434030937,
"grad_norm": 0.7972440737628133,
"learning_rate": 9.978833480171592e-06,
"loss": 0.0906,
"step": 323
},
{
"epoch": 0.14740673339399454,
"grad_norm": 0.7697423454053598,
"learning_rate": 9.978701900518947e-06,
"loss": 0.0632,
"step": 324
},
{
"epoch": 0.14786169244767972,
"grad_norm": 0.8259885564233931,
"learning_rate": 9.978569914029267e-06,
"loss": 0.0944,
"step": 325
},
{
"epoch": 0.1483166515013649,
"grad_norm": 0.8450006655868962,
"learning_rate": 9.978437520713335e-06,
"loss": 0.0862,
"step": 326
},
{
"epoch": 0.14877161055505003,
"grad_norm": 0.7746078278616594,
"learning_rate": 9.978304720581973e-06,
"loss": 0.088,
"step": 327
},
{
"epoch": 0.1492265696087352,
"grad_norm": 0.9977734940815816,
"learning_rate": 9.97817151364603e-06,
"loss": 0.1036,
"step": 328
},
{
"epoch": 0.14968152866242038,
"grad_norm": 0.7800752301510507,
"learning_rate": 9.978037899916393e-06,
"loss": 0.0778,
"step": 329
},
{
"epoch": 0.15013648771610555,
"grad_norm": 0.7521153273438224,
"learning_rate": 9.97790387940398e-06,
"loss": 0.0532,
"step": 330
},
{
"epoch": 0.15059144676979072,
"grad_norm": 0.8046420256419254,
"learning_rate": 9.977769452119741e-06,
"loss": 0.0708,
"step": 331
},
{
"epoch": 0.1510464058234759,
"grad_norm": 0.9071770528791517,
"learning_rate": 9.97763461807466e-06,
"loss": 0.1006,
"step": 332
},
{
"epoch": 0.15150136487716107,
"grad_norm": 0.8824570234268595,
"learning_rate": 9.97749937727976e-06,
"loss": 0.0855,
"step": 333
},
{
"epoch": 0.15195632393084624,
"grad_norm": 0.8286075823730068,
"learning_rate": 9.977363729746088e-06,
"loss": 0.077,
"step": 334
},
{
"epoch": 0.15241128298453138,
"grad_norm": 0.6791233851472963,
"learning_rate": 9.977227675484729e-06,
"loss": 0.0698,
"step": 335
},
{
"epoch": 0.15286624203821655,
"grad_norm": 0.9813875260679181,
"learning_rate": 9.977091214506803e-06,
"loss": 0.0838,
"step": 336
},
{
"epoch": 0.15332120109190173,
"grad_norm": 0.9986284190120469,
"learning_rate": 9.976954346823456e-06,
"loss": 0.0789,
"step": 337
},
{
"epoch": 0.1537761601455869,
"grad_norm": 0.6456071732838817,
"learning_rate": 9.976817072445878e-06,
"loss": 0.0566,
"step": 338
},
{
"epoch": 0.15423111919927207,
"grad_norm": 0.7707362352402762,
"learning_rate": 9.976679391385283e-06,
"loss": 0.0677,
"step": 339
},
{
"epoch": 0.15468607825295724,
"grad_norm": 0.5804713825378958,
"learning_rate": 9.976541303652923e-06,
"loss": 0.0547,
"step": 340
},
{
"epoch": 0.15514103730664242,
"grad_norm": 0.7705377953828665,
"learning_rate": 9.976402809260083e-06,
"loss": 0.0673,
"step": 341
},
{
"epoch": 0.15559599636032756,
"grad_norm": 0.651002355082985,
"learning_rate": 9.976263908218076e-06,
"loss": 0.066,
"step": 342
},
{
"epoch": 0.15605095541401273,
"grad_norm": 1.0075230687249708,
"learning_rate": 9.976124600538257e-06,
"loss": 0.1151,
"step": 343
},
{
"epoch": 0.1565059144676979,
"grad_norm": 0.7110146200064966,
"learning_rate": 9.975984886232006e-06,
"loss": 0.0693,
"step": 344
},
{
"epoch": 0.15696087352138308,
"grad_norm": 0.782615076662302,
"learning_rate": 9.975844765310743e-06,
"loss": 0.071,
"step": 345
},
{
"epoch": 0.15741583257506825,
"grad_norm": 1.091513822496144,
"learning_rate": 9.975704237785915e-06,
"loss": 0.1277,
"step": 346
},
{
"epoch": 0.15787079162875342,
"grad_norm": 0.8244942271322709,
"learning_rate": 9.975563303669006e-06,
"loss": 0.092,
"step": 347
},
{
"epoch": 0.1583257506824386,
"grad_norm": 1.0997264747524325,
"learning_rate": 9.975421962971536e-06,
"loss": 0.102,
"step": 348
},
{
"epoch": 0.15878070973612374,
"grad_norm": 1.0471722358260585,
"learning_rate": 9.97528021570505e-06,
"loss": 0.1112,
"step": 349
},
{
"epoch": 0.1592356687898089,
"grad_norm": 0.6366013160292697,
"learning_rate": 9.975138061881135e-06,
"loss": 0.0629,
"step": 350
},
{
"epoch": 0.15969062784349408,
"grad_norm": 0.7145502784859615,
"learning_rate": 9.974995501511404e-06,
"loss": 0.0567,
"step": 351
},
{
"epoch": 0.16014558689717925,
"grad_norm": 1.0825694007542435,
"learning_rate": 9.974852534607506e-06,
"loss": 0.0897,
"step": 352
},
{
"epoch": 0.16060054595086443,
"grad_norm": 0.8874195306329471,
"learning_rate": 9.974709161181126e-06,
"loss": 0.0879,
"step": 353
},
{
"epoch": 0.1610555050045496,
"grad_norm": 0.8193025449594961,
"learning_rate": 9.974565381243982e-06,
"loss": 0.0969,
"step": 354
},
{
"epoch": 0.16151046405823477,
"grad_norm": 0.76528422131405,
"learning_rate": 9.974421194807815e-06,
"loss": 0.0786,
"step": 355
},
{
"epoch": 0.16196542311191992,
"grad_norm": 0.8836543328533641,
"learning_rate": 9.974276601884416e-06,
"loss": 0.0744,
"step": 356
},
{
"epoch": 0.1624203821656051,
"grad_norm": 0.7482952108426273,
"learning_rate": 9.974131602485596e-06,
"loss": 0.0772,
"step": 357
},
{
"epoch": 0.16287534121929026,
"grad_norm": 0.9122723647083647,
"learning_rate": 9.973986196623203e-06,
"loss": 0.0851,
"step": 358
},
{
"epoch": 0.16333030027297543,
"grad_norm": 0.8373653902978805,
"learning_rate": 9.973840384309121e-06,
"loss": 0.0865,
"step": 359
},
{
"epoch": 0.1637852593266606,
"grad_norm": 0.6360069343077157,
"learning_rate": 9.973694165555264e-06,
"loss": 0.0618,
"step": 360
},
{
"epoch": 0.16424021838034578,
"grad_norm": 0.7967304456611868,
"learning_rate": 9.973547540373582e-06,
"loss": 0.0865,
"step": 361
},
{
"epoch": 0.16469517743403095,
"grad_norm": 1.1699452577832765,
"learning_rate": 9.973400508776054e-06,
"loss": 0.1144,
"step": 362
},
{
"epoch": 0.1651501364877161,
"grad_norm": 0.6282867599706373,
"learning_rate": 9.973253070774698e-06,
"loss": 0.0633,
"step": 363
},
{
"epoch": 0.16560509554140126,
"grad_norm": 0.79942272506218,
"learning_rate": 9.973105226381559e-06,
"loss": 0.069,
"step": 364
},
{
"epoch": 0.16606005459508644,
"grad_norm": 0.9348674828410355,
"learning_rate": 9.972956975608719e-06,
"loss": 0.1019,
"step": 365
},
{
"epoch": 0.1665150136487716,
"grad_norm": 1.0942665884463076,
"learning_rate": 9.972808318468292e-06,
"loss": 0.0859,
"step": 366
},
{
"epoch": 0.16696997270245678,
"grad_norm": 0.6283579225277517,
"learning_rate": 9.972659254972426e-06,
"loss": 0.0589,
"step": 367
},
{
"epoch": 0.16742493175614195,
"grad_norm": 1.0989677054167046,
"learning_rate": 9.972509785133304e-06,
"loss": 0.1081,
"step": 368
},
{
"epoch": 0.16787989080982713,
"grad_norm": 0.7310198219540203,
"learning_rate": 9.972359908963137e-06,
"loss": 0.0675,
"step": 369
},
{
"epoch": 0.16833484986351227,
"grad_norm": 0.757671629194488,
"learning_rate": 9.972209626474172e-06,
"loss": 0.0734,
"step": 370
},
{
"epoch": 0.16878980891719744,
"grad_norm": 0.7966175159886519,
"learning_rate": 9.972058937678692e-06,
"loss": 0.075,
"step": 371
},
{
"epoch": 0.16924476797088261,
"grad_norm": 0.9805514159267839,
"learning_rate": 9.97190784258901e-06,
"loss": 0.1071,
"step": 372
},
{
"epoch": 0.1696997270245678,
"grad_norm": 0.7000612574442994,
"learning_rate": 9.971756341217471e-06,
"loss": 0.0526,
"step": 373
},
{
"epoch": 0.17015468607825296,
"grad_norm": 0.7917466702374949,
"learning_rate": 9.971604433576456e-06,
"loss": 0.0698,
"step": 374
},
{
"epoch": 0.17060964513193813,
"grad_norm": 0.8412692631182211,
"learning_rate": 9.97145211967838e-06,
"loss": 0.0783,
"step": 375
},
{
"epoch": 0.1710646041856233,
"grad_norm": 0.5615038895232536,
"learning_rate": 9.971299399535685e-06,
"loss": 0.053,
"step": 376
},
{
"epoch": 0.17151956323930848,
"grad_norm": 0.6849745369298482,
"learning_rate": 9.971146273160854e-06,
"loss": 0.0774,
"step": 377
},
{
"epoch": 0.17197452229299362,
"grad_norm": 0.6466596777060115,
"learning_rate": 9.9709927405664e-06,
"loss": 0.0606,
"step": 378
},
{
"epoch": 0.1724294813466788,
"grad_norm": 0.7169884074840761,
"learning_rate": 9.970838801764866e-06,
"loss": 0.0839,
"step": 379
},
{
"epoch": 0.17288444040036396,
"grad_norm": 0.9393396355410675,
"learning_rate": 9.970684456768836e-06,
"loss": 0.1132,
"step": 380
},
{
"epoch": 0.17333939945404914,
"grad_norm": 12.197098173453568,
"learning_rate": 9.970529705590918e-06,
"loss": 0.4858,
"step": 381
},
{
"epoch": 0.1737943585077343,
"grad_norm": 0.7355841274771772,
"learning_rate": 9.97037454824376e-06,
"loss": 0.0714,
"step": 382
},
{
"epoch": 0.17424931756141948,
"grad_norm": 1.050385265783733,
"learning_rate": 9.97021898474004e-06,
"loss": 0.1024,
"step": 383
},
{
"epoch": 0.17470427661510465,
"grad_norm": 0.8612087678995594,
"learning_rate": 9.970063015092469e-06,
"loss": 0.085,
"step": 384
},
{
"epoch": 0.1751592356687898,
"grad_norm": 1.3886472100476919,
"learning_rate": 9.969906639313793e-06,
"loss": 0.1212,
"step": 385
},
{
"epoch": 0.17561419472247497,
"grad_norm": 0.8238176964814595,
"learning_rate": 9.96974985741679e-06,
"loss": 0.0721,
"step": 386
},
{
"epoch": 0.17606915377616014,
"grad_norm": 0.8718897735731601,
"learning_rate": 9.969592669414272e-06,
"loss": 0.0959,
"step": 387
},
{
"epoch": 0.17652411282984531,
"grad_norm": 6.796752422837202,
"learning_rate": 9.969435075319083e-06,
"loss": 0.115,
"step": 388
},
{
"epoch": 0.1769790718835305,
"grad_norm": 0.58176536820322,
"learning_rate": 9.969277075144104e-06,
"loss": 0.0459,
"step": 389
},
{
"epoch": 0.17743403093721566,
"grad_norm": 0.7267253435076165,
"learning_rate": 9.969118668902242e-06,
"loss": 0.07,
"step": 390
},
{
"epoch": 0.17788898999090083,
"grad_norm": 0.7682389367523258,
"learning_rate": 9.968959856606442e-06,
"loss": 0.0542,
"step": 391
},
{
"epoch": 0.17834394904458598,
"grad_norm": 0.7873348185837048,
"learning_rate": 9.968800638269682e-06,
"loss": 0.0598,
"step": 392
},
{
"epoch": 0.17879890809827115,
"grad_norm": 1.287713292390112,
"learning_rate": 9.968641013904974e-06,
"loss": 0.1442,
"step": 393
},
{
"epoch": 0.17925386715195632,
"grad_norm": 1.085650814952146,
"learning_rate": 9.968480983525359e-06,
"loss": 0.0926,
"step": 394
},
{
"epoch": 0.1797088262056415,
"grad_norm": 0.6716676596759695,
"learning_rate": 9.968320547143918e-06,
"loss": 0.0767,
"step": 395
},
{
"epoch": 0.18016378525932666,
"grad_norm": 0.8467396807693714,
"learning_rate": 9.968159704773757e-06,
"loss": 0.0977,
"step": 396
},
{
"epoch": 0.18061874431301184,
"grad_norm": 0.6438855833782786,
"learning_rate": 9.967998456428021e-06,
"loss": 0.0586,
"step": 397
},
{
"epoch": 0.181073703366697,
"grad_norm": 0.7254140122399564,
"learning_rate": 9.967836802119886e-06,
"loss": 0.06,
"step": 398
},
{
"epoch": 0.18152866242038215,
"grad_norm": 0.87517545358881,
"learning_rate": 9.967674741862563e-06,
"loss": 0.1016,
"step": 399
},
{
"epoch": 0.18198362147406733,
"grad_norm": 1.0624206936058178,
"learning_rate": 9.967512275669294e-06,
"loss": 0.1296,
"step": 400
},
{
"epoch": 0.1824385805277525,
"grad_norm": 1.0284720738314184,
"learning_rate": 9.967349403553353e-06,
"loss": 0.0862,
"step": 401
},
{
"epoch": 0.18289353958143767,
"grad_norm": 0.8342932737384292,
"learning_rate": 9.967186125528053e-06,
"loss": 0.0873,
"step": 402
},
{
"epoch": 0.18334849863512284,
"grad_norm": 1.543095569701571,
"learning_rate": 9.967022441606734e-06,
"loss": 0.1209,
"step": 403
},
{
"epoch": 0.18380345768880801,
"grad_norm": 0.70731586616612,
"learning_rate": 9.966858351802773e-06,
"loss": 0.0726,
"step": 404
},
{
"epoch": 0.1842584167424932,
"grad_norm": 0.6660531988680356,
"learning_rate": 9.966693856129576e-06,
"loss": 0.0562,
"step": 405
},
{
"epoch": 0.18471337579617833,
"grad_norm": 0.8503640969928286,
"learning_rate": 9.966528954600587e-06,
"loss": 0.0838,
"step": 406
},
{
"epoch": 0.1851683348498635,
"grad_norm": 0.6021534124846688,
"learning_rate": 9.96636364722928e-06,
"loss": 0.0673,
"step": 407
},
{
"epoch": 0.18562329390354868,
"grad_norm": 0.8782816795828058,
"learning_rate": 9.966197934029165e-06,
"loss": 0.0845,
"step": 408
},
{
"epoch": 0.18607825295723385,
"grad_norm": 0.9030990654346936,
"learning_rate": 9.966031815013781e-06,
"loss": 0.0839,
"step": 409
},
{
"epoch": 0.18653321201091902,
"grad_norm": 0.8567507299712805,
"learning_rate": 9.965865290196703e-06,
"loss": 0.0935,
"step": 410
},
{
"epoch": 0.1869881710646042,
"grad_norm": 0.8099856489670021,
"learning_rate": 9.96569835959154e-06,
"loss": 0.0747,
"step": 411
},
{
"epoch": 0.18744313011828936,
"grad_norm": 0.8938878675243255,
"learning_rate": 9.965531023211931e-06,
"loss": 0.0854,
"step": 412
},
{
"epoch": 0.18789808917197454,
"grad_norm": 0.735313860104022,
"learning_rate": 9.965363281071551e-06,
"loss": 0.0865,
"step": 413
},
{
"epoch": 0.18835304822565968,
"grad_norm": 0.5495229598132649,
"learning_rate": 9.965195133184108e-06,
"loss": 0.0403,
"step": 414
},
{
"epoch": 0.18880800727934485,
"grad_norm": 1.0700416713113117,
"learning_rate": 9.965026579563342e-06,
"loss": 0.1086,
"step": 415
},
{
"epoch": 0.18926296633303002,
"grad_norm": 0.7118653717355078,
"learning_rate": 9.964857620223024e-06,
"loss": 0.0691,
"step": 416
},
{
"epoch": 0.1897179253867152,
"grad_norm": 0.6871481686027417,
"learning_rate": 9.964688255176963e-06,
"loss": 0.0667,
"step": 417
},
{
"epoch": 0.19017288444040037,
"grad_norm": 0.9848841869658392,
"learning_rate": 9.964518484438998e-06,
"loss": 0.0813,
"step": 418
},
{
"epoch": 0.19062784349408554,
"grad_norm": 0.6311750922074311,
"learning_rate": 9.964348308023001e-06,
"loss": 0.0592,
"step": 419
},
{
"epoch": 0.1910828025477707,
"grad_norm": 0.7813168734245782,
"learning_rate": 9.964177725942881e-06,
"loss": 0.0826,
"step": 420
},
{
"epoch": 0.19153776160145586,
"grad_norm": 0.8572110622332836,
"learning_rate": 9.964006738212574e-06,
"loss": 0.0853,
"step": 421
},
{
"epoch": 0.19199272065514103,
"grad_norm": 0.5304433423014596,
"learning_rate": 9.963835344846056e-06,
"loss": 0.048,
"step": 422
},
{
"epoch": 0.1924476797088262,
"grad_norm": 0.7598521228122416,
"learning_rate": 9.963663545857328e-06,
"loss": 0.0757,
"step": 423
},
{
"epoch": 0.19290263876251137,
"grad_norm": 1.1542546683489703,
"learning_rate": 9.963491341260432e-06,
"loss": 0.104,
"step": 424
},
{
"epoch": 0.19335759781619655,
"grad_norm": 0.7766563582253432,
"learning_rate": 9.963318731069437e-06,
"loss": 0.0952,
"step": 425
},
{
"epoch": 0.19381255686988172,
"grad_norm": 1.1319194983916299,
"learning_rate": 9.96314571529845e-06,
"loss": 0.1005,
"step": 426
},
{
"epoch": 0.1942675159235669,
"grad_norm": 0.7230559135257585,
"learning_rate": 9.962972293961608e-06,
"loss": 0.0647,
"step": 427
},
{
"epoch": 0.19472247497725204,
"grad_norm": 0.9863934566369588,
"learning_rate": 9.962798467073083e-06,
"loss": 0.0763,
"step": 428
},
{
"epoch": 0.1951774340309372,
"grad_norm": 0.8259784410005646,
"learning_rate": 9.96262423464708e-06,
"loss": 0.087,
"step": 429
},
{
"epoch": 0.19563239308462238,
"grad_norm": 0.7987139095182185,
"learning_rate": 9.962449596697834e-06,
"loss": 0.0671,
"step": 430
},
{
"epoch": 0.19608735213830755,
"grad_norm": 1.130208173229934,
"learning_rate": 9.962274553239619e-06,
"loss": 0.119,
"step": 431
},
{
"epoch": 0.19654231119199272,
"grad_norm": 0.7399696243677417,
"learning_rate": 9.962099104286735e-06,
"loss": 0.064,
"step": 432
},
{
"epoch": 0.1969972702456779,
"grad_norm": 1.156015767405528,
"learning_rate": 9.961923249853523e-06,
"loss": 0.1102,
"step": 433
},
{
"epoch": 0.19745222929936307,
"grad_norm": 0.972422739757894,
"learning_rate": 9.961746989954349e-06,
"loss": 0.1093,
"step": 434
},
{
"epoch": 0.1979071883530482,
"grad_norm": 0.7766700420403171,
"learning_rate": 9.96157032460362e-06,
"loss": 0.0655,
"step": 435
},
{
"epoch": 0.19836214740673339,
"grad_norm": 0.7460679115751414,
"learning_rate": 9.961393253815767e-06,
"loss": 0.0751,
"step": 436
},
{
"epoch": 0.19881710646041856,
"grad_norm": 1.0684214450487566,
"learning_rate": 9.961215777605266e-06,
"loss": 0.0789,
"step": 437
},
{
"epoch": 0.19927206551410373,
"grad_norm": 0.7683994291392229,
"learning_rate": 9.961037895986615e-06,
"loss": 0.0849,
"step": 438
},
{
"epoch": 0.1997270245677889,
"grad_norm": 0.7270368453251704,
"learning_rate": 9.960859608974352e-06,
"loss": 0.0779,
"step": 439
},
{
"epoch": 0.20018198362147407,
"grad_norm": 0.701460207303568,
"learning_rate": 9.960680916583042e-06,
"loss": 0.0639,
"step": 440
},
{
"epoch": 0.20063694267515925,
"grad_norm": 0.6784619280926262,
"learning_rate": 9.960501818827292e-06,
"loss": 0.077,
"step": 441
},
{
"epoch": 0.2010919017288444,
"grad_norm": 0.8064075868568972,
"learning_rate": 9.960322315721735e-06,
"loss": 0.0827,
"step": 442
},
{
"epoch": 0.20154686078252956,
"grad_norm": 0.9155026735417204,
"learning_rate": 9.960142407281039e-06,
"loss": 0.0841,
"step": 443
},
{
"epoch": 0.20200181983621474,
"grad_norm": 0.6167749294869733,
"learning_rate": 9.959962093519904e-06,
"loss": 0.054,
"step": 444
},
{
"epoch": 0.2024567788898999,
"grad_norm": 0.8127781985331358,
"learning_rate": 9.959781374453066e-06,
"loss": 0.0751,
"step": 445
},
{
"epoch": 0.20291173794358508,
"grad_norm": 0.98306444688532,
"learning_rate": 9.959600250095294e-06,
"loss": 0.075,
"step": 446
},
{
"epoch": 0.20336669699727025,
"grad_norm": 0.7982130269360888,
"learning_rate": 9.959418720461384e-06,
"loss": 0.0834,
"step": 447
},
{
"epoch": 0.20382165605095542,
"grad_norm": 0.7862225023823932,
"learning_rate": 9.959236785566175e-06,
"loss": 0.0704,
"step": 448
},
{
"epoch": 0.20427661510464057,
"grad_norm": 0.562107514296544,
"learning_rate": 9.959054445424532e-06,
"loss": 0.0644,
"step": 449
},
{
"epoch": 0.20473157415832574,
"grad_norm": 0.6089607791855781,
"learning_rate": 9.958871700051353e-06,
"loss": 0.0512,
"step": 450
},
{
"epoch": 0.2051865332120109,
"grad_norm": 0.6962095067981563,
"learning_rate": 9.958688549461573e-06,
"loss": 0.0712,
"step": 451
},
{
"epoch": 0.20564149226569609,
"grad_norm": 1.155217046291275,
"learning_rate": 9.958504993670158e-06,
"loss": 0.1049,
"step": 452
},
{
"epoch": 0.20609645131938126,
"grad_norm": 1.0913314226134752,
"learning_rate": 9.958321032692107e-06,
"loss": 0.1226,
"step": 453
},
{
"epoch": 0.20655141037306643,
"grad_norm": 22.735025633907238,
"learning_rate": 9.958136666542455e-06,
"loss": 0.8419,
"step": 454
},
{
"epoch": 0.2070063694267516,
"grad_norm": 1.184019553325164,
"learning_rate": 9.957951895236262e-06,
"loss": 0.1113,
"step": 455
},
{
"epoch": 0.20746132848043677,
"grad_norm": 0.7664792046331882,
"learning_rate": 9.957766718788632e-06,
"loss": 0.104,
"step": 456
},
{
"epoch": 0.20791628753412192,
"grad_norm": 0.8672883026786035,
"learning_rate": 9.957581137214695e-06,
"loss": 0.074,
"step": 457
},
{
"epoch": 0.2083712465878071,
"grad_norm": 0.8772220264781722,
"learning_rate": 9.957395150529615e-06,
"loss": 0.0986,
"step": 458
},
{
"epoch": 0.20882620564149226,
"grad_norm": 0.7016331971826193,
"learning_rate": 9.95720875874859e-06,
"loss": 0.0752,
"step": 459
},
{
"epoch": 0.20928116469517744,
"grad_norm": 0.6308822051977305,
"learning_rate": 9.957021961886855e-06,
"loss": 0.0608,
"step": 460
},
{
"epoch": 0.2097361237488626,
"grad_norm": 0.9803601042372939,
"learning_rate": 9.956834759959669e-06,
"loss": 0.0908,
"step": 461
},
{
"epoch": 0.21019108280254778,
"grad_norm": 0.7674462109758159,
"learning_rate": 9.95664715298233e-06,
"loss": 0.074,
"step": 462
},
{
"epoch": 0.21064604185623295,
"grad_norm": 0.7450186566335193,
"learning_rate": 9.95645914097017e-06,
"loss": 0.0817,
"step": 463
},
{
"epoch": 0.2111010009099181,
"grad_norm": 0.7225723661612439,
"learning_rate": 9.956270723938553e-06,
"loss": 0.0849,
"step": 464
},
{
"epoch": 0.21155595996360327,
"grad_norm": 0.7190355211871646,
"learning_rate": 9.956081901902875e-06,
"loss": 0.0748,
"step": 465
},
{
"epoch": 0.21201091901728844,
"grad_norm": 1.210684562087392,
"learning_rate": 9.955892674878565e-06,
"loss": 0.1272,
"step": 466
},
{
"epoch": 0.2124658780709736,
"grad_norm": 0.834170476650907,
"learning_rate": 9.955703042881087e-06,
"loss": 0.0992,
"step": 467
},
{
"epoch": 0.21292083712465878,
"grad_norm": 0.874478173291907,
"learning_rate": 9.955513005925934e-06,
"loss": 0.0858,
"step": 468
},
{
"epoch": 0.21337579617834396,
"grad_norm": 0.5510320150423565,
"learning_rate": 9.95532256402864e-06,
"loss": 0.0574,
"step": 469
},
{
"epoch": 0.21383075523202913,
"grad_norm": 0.5657171871822584,
"learning_rate": 9.955131717204762e-06,
"loss": 0.0671,
"step": 470
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.7564664653864259,
"learning_rate": 9.954940465469898e-06,
"loss": 0.085,
"step": 471
},
{
"epoch": 0.21474067333939945,
"grad_norm": 0.7594501005901694,
"learning_rate": 9.954748808839675e-06,
"loss": 0.0733,
"step": 472
},
{
"epoch": 0.21519563239308462,
"grad_norm": 0.6748092428366178,
"learning_rate": 9.954556747329754e-06,
"loss": 0.0707,
"step": 473
},
{
"epoch": 0.2156505914467698,
"grad_norm": 1.715089789819449,
"learning_rate": 9.954364280955832e-06,
"loss": 0.1045,
"step": 474
},
{
"epoch": 0.21610555050045496,
"grad_norm": 0.6668751648778155,
"learning_rate": 9.954171409733634e-06,
"loss": 0.0573,
"step": 475
},
{
"epoch": 0.21656050955414013,
"grad_norm": 0.5963716475430643,
"learning_rate": 9.95397813367892e-06,
"loss": 0.0752,
"step": 476
},
{
"epoch": 0.2170154686078253,
"grad_norm": 0.9917190233932158,
"learning_rate": 9.953784452807487e-06,
"loss": 0.1049,
"step": 477
},
{
"epoch": 0.21747042766151045,
"grad_norm": 0.5638529401686616,
"learning_rate": 9.953590367135159e-06,
"loss": 0.0547,
"step": 478
},
{
"epoch": 0.21792538671519562,
"grad_norm": 0.6477110515460727,
"learning_rate": 9.953395876677796e-06,
"loss": 0.0564,
"step": 479
},
{
"epoch": 0.2183803457688808,
"grad_norm": 0.5492055118574499,
"learning_rate": 9.95320098145129e-06,
"loss": 0.0505,
"step": 480
},
{
"epoch": 0.21883530482256597,
"grad_norm": 0.8954528378372288,
"learning_rate": 9.95300568147157e-06,
"loss": 0.126,
"step": 481
},
{
"epoch": 0.21929026387625114,
"grad_norm": 0.6155736143826033,
"learning_rate": 9.952809976754593e-06,
"loss": 0.0518,
"step": 482
},
{
"epoch": 0.2197452229299363,
"grad_norm": 1.1486004986445648,
"learning_rate": 9.952613867316351e-06,
"loss": 0.1142,
"step": 483
},
{
"epoch": 0.22020018198362148,
"grad_norm": 0.8236924325360948,
"learning_rate": 9.95241735317287e-06,
"loss": 0.1047,
"step": 484
},
{
"epoch": 0.22065514103730663,
"grad_norm": 0.832372102653505,
"learning_rate": 9.952220434340209e-06,
"loss": 0.0729,
"step": 485
},
{
"epoch": 0.2211101000909918,
"grad_norm": 0.7288716722109786,
"learning_rate": 9.952023110834456e-06,
"loss": 0.068,
"step": 486
},
{
"epoch": 0.22156505914467697,
"grad_norm": 0.5327254294033283,
"learning_rate": 9.951825382671739e-06,
"loss": 0.0614,
"step": 487
},
{
"epoch": 0.22202001819836215,
"grad_norm": 0.7204991379763186,
"learning_rate": 9.951627249868213e-06,
"loss": 0.0666,
"step": 488
},
{
"epoch": 0.22247497725204732,
"grad_norm": 0.7485835393026234,
"learning_rate": 9.95142871244007e-06,
"loss": 0.068,
"step": 489
},
{
"epoch": 0.2229299363057325,
"grad_norm": 0.45602532896445397,
"learning_rate": 9.951229770403531e-06,
"loss": 0.0414,
"step": 490
},
{
"epoch": 0.22338489535941766,
"grad_norm": 0.7240661348572547,
"learning_rate": 9.951030423774858e-06,
"loss": 0.0798,
"step": 491
},
{
"epoch": 0.22383985441310283,
"grad_norm": 0.7716352477687572,
"learning_rate": 9.950830672570337e-06,
"loss": 0.071,
"step": 492
},
{
"epoch": 0.22429481346678798,
"grad_norm": 1.22677184750836,
"learning_rate": 9.95063051680629e-06,
"loss": 0.1373,
"step": 493
},
{
"epoch": 0.22474977252047315,
"grad_norm": 0.7365431233953595,
"learning_rate": 9.950429956499074e-06,
"loss": 0.0699,
"step": 494
},
{
"epoch": 0.22520473157415832,
"grad_norm": 0.705654951368504,
"learning_rate": 9.950228991665078e-06,
"loss": 0.0741,
"step": 495
},
{
"epoch": 0.2256596906278435,
"grad_norm": 0.8261497906057415,
"learning_rate": 9.950027622320724e-06,
"loss": 0.0764,
"step": 496
},
{
"epoch": 0.22611464968152867,
"grad_norm": 0.9965395262255518,
"learning_rate": 9.949825848482465e-06,
"loss": 0.0852,
"step": 497
},
{
"epoch": 0.22656960873521384,
"grad_norm": 0.6807161957389707,
"learning_rate": 9.949623670166794e-06,
"loss": 0.074,
"step": 498
},
{
"epoch": 0.227024567788899,
"grad_norm": 1.1216390709095547,
"learning_rate": 9.949421087390228e-06,
"loss": 0.0931,
"step": 499
},
{
"epoch": 0.22747952684258416,
"grad_norm": 1.1278655216416786,
"learning_rate": 9.949218100169322e-06,
"loss": 0.1177,
"step": 500
},
{
"epoch": 0.22793448589626933,
"grad_norm": 0.9160591457448575,
"learning_rate": 9.949014708520664e-06,
"loss": 0.1015,
"step": 501
},
{
"epoch": 0.2283894449499545,
"grad_norm": 0.9377363057118697,
"learning_rate": 9.948810912460872e-06,
"loss": 0.1059,
"step": 502
},
{
"epoch": 0.22884440400363967,
"grad_norm": 0.8760932101779023,
"learning_rate": 9.948606712006601e-06,
"loss": 0.0812,
"step": 503
},
{
"epoch": 0.22929936305732485,
"grad_norm": 0.6962605051289937,
"learning_rate": 9.948402107174537e-06,
"loss": 0.0735,
"step": 504
},
{
"epoch": 0.22975432211101002,
"grad_norm": 0.6501265713488487,
"learning_rate": 9.948197097981401e-06,
"loss": 0.0551,
"step": 505
},
{
"epoch": 0.2302092811646952,
"grad_norm": 1.2156011775652311,
"learning_rate": 9.947991684443942e-06,
"loss": 0.1066,
"step": 506
},
{
"epoch": 0.23066424021838033,
"grad_norm": 0.9679794435610901,
"learning_rate": 9.947785866578951e-06,
"loss": 0.0981,
"step": 507
},
{
"epoch": 0.2311191992720655,
"grad_norm": 0.7195724631231237,
"learning_rate": 9.94757964440324e-06,
"loss": 0.0777,
"step": 508
},
{
"epoch": 0.23157415832575068,
"grad_norm": 0.549427502610929,
"learning_rate": 9.947373017933665e-06,
"loss": 0.0516,
"step": 509
},
{
"epoch": 0.23202911737943585,
"grad_norm": 0.5667212336170355,
"learning_rate": 9.947165987187108e-06,
"loss": 0.0583,
"step": 510
},
{
"epoch": 0.23248407643312102,
"grad_norm": 0.6638127935874616,
"learning_rate": 9.946958552180489e-06,
"loss": 0.0723,
"step": 511
},
{
"epoch": 0.2329390354868062,
"grad_norm": 0.5226768129517959,
"learning_rate": 9.946750712930756e-06,
"loss": 0.0482,
"step": 512
},
{
"epoch": 0.23339399454049137,
"grad_norm": 0.8358986518129136,
"learning_rate": 9.946542469454894e-06,
"loss": 0.1037,
"step": 513
},
{
"epoch": 0.2338489535941765,
"grad_norm": 0.6695809647699968,
"learning_rate": 9.94633382176992e-06,
"loss": 0.0728,
"step": 514
},
{
"epoch": 0.23430391264786168,
"grad_norm": 1.0608546974350634,
"learning_rate": 9.946124769892884e-06,
"loss": 0.1192,
"step": 515
},
{
"epoch": 0.23475887170154686,
"grad_norm": 0.5090717025630993,
"learning_rate": 9.945915313840869e-06,
"loss": 0.0612,
"step": 516
},
{
"epoch": 0.23521383075523203,
"grad_norm": 0.8105130307542814,
"learning_rate": 9.94570545363099e-06,
"loss": 0.0838,
"step": 517
},
{
"epoch": 0.2356687898089172,
"grad_norm": 0.7752986876049957,
"learning_rate": 9.945495189280394e-06,
"loss": 0.092,
"step": 518
},
{
"epoch": 0.23612374886260237,
"grad_norm": 0.869801315379322,
"learning_rate": 9.945284520806267e-06,
"loss": 0.077,
"step": 519
},
{
"epoch": 0.23657870791628755,
"grad_norm": 0.5427153243822386,
"learning_rate": 9.94507344822582e-06,
"loss": 0.0592,
"step": 520
},
{
"epoch": 0.2370336669699727,
"grad_norm": 0.7368670007832758,
"learning_rate": 9.944861971556305e-06,
"loss": 0.0608,
"step": 521
},
{
"epoch": 0.23748862602365786,
"grad_norm": 0.8141430793460733,
"learning_rate": 9.944650090814998e-06,
"loss": 0.0616,
"step": 522
},
{
"epoch": 0.23794358507734303,
"grad_norm": 2.1096588720516425,
"learning_rate": 9.944437806019216e-06,
"loss": 0.0938,
"step": 523
},
{
"epoch": 0.2383985441310282,
"grad_norm": 0.7014907085161215,
"learning_rate": 9.944225117186306e-06,
"loss": 0.0812,
"step": 524
},
{
"epoch": 0.23885350318471338,
"grad_norm": 0.5078467158211916,
"learning_rate": 9.944012024333647e-06,
"loss": 0.0561,
"step": 525
},
{
"epoch": 0.23930846223839855,
"grad_norm": 0.6379031604907951,
"learning_rate": 9.943798527478652e-06,
"loss": 0.0678,
"step": 526
},
{
"epoch": 0.23976342129208372,
"grad_norm": 0.799876019099874,
"learning_rate": 9.943584626638768e-06,
"loss": 0.0914,
"step": 527
},
{
"epoch": 0.24021838034576887,
"grad_norm": 0.6550229607349646,
"learning_rate": 9.943370321831474e-06,
"loss": 0.0668,
"step": 528
},
{
"epoch": 0.24067333939945404,
"grad_norm": 0.767534839542607,
"learning_rate": 9.943155613074279e-06,
"loss": 0.0711,
"step": 529
},
{
"epoch": 0.2411282984531392,
"grad_norm": 0.7571838990000624,
"learning_rate": 9.942940500384733e-06,
"loss": 0.0893,
"step": 530
},
{
"epoch": 0.24158325750682438,
"grad_norm": 17.807000846945513,
"learning_rate": 9.942724983780409e-06,
"loss": 0.3419,
"step": 531
},
{
"epoch": 0.24203821656050956,
"grad_norm": 1.2088422410181228,
"learning_rate": 9.942509063278922e-06,
"loss": 0.1173,
"step": 532
},
{
"epoch": 0.24249317561419473,
"grad_norm": 0.8811842157145667,
"learning_rate": 9.942292738897914e-06,
"loss": 0.1006,
"step": 533
},
{
"epoch": 0.2429481346678799,
"grad_norm": 0.7726281786442553,
"learning_rate": 9.942076010655063e-06,
"loss": 0.0909,
"step": 534
},
{
"epoch": 0.24340309372156507,
"grad_norm": 0.9942256398778268,
"learning_rate": 9.941858878568078e-06,
"loss": 0.134,
"step": 535
},
{
"epoch": 0.24385805277525022,
"grad_norm": 1.001596627292525,
"learning_rate": 9.941641342654702e-06,
"loss": 0.0977,
"step": 536
},
{
"epoch": 0.2443130118289354,
"grad_norm": 0.5064863363900076,
"learning_rate": 9.941423402932713e-06,
"loss": 0.0559,
"step": 537
},
{
"epoch": 0.24476797088262056,
"grad_norm": 0.8589680374278897,
"learning_rate": 9.94120505941992e-06,
"loss": 0.0992,
"step": 538
},
{
"epoch": 0.24522292993630573,
"grad_norm": 0.7830880681851201,
"learning_rate": 9.940986312134162e-06,
"loss": 0.0825,
"step": 539
},
{
"epoch": 0.2456778889899909,
"grad_norm": 0.5778344550660577,
"learning_rate": 9.940767161093316e-06,
"loss": 0.0637,
"step": 540
},
{
"epoch": 0.24613284804367608,
"grad_norm": 0.8661775200374767,
"learning_rate": 9.94054760631529e-06,
"loss": 0.0958,
"step": 541
},
{
"epoch": 0.24658780709736125,
"grad_norm": 0.6976226834296251,
"learning_rate": 9.940327647818026e-06,
"loss": 0.0752,
"step": 542
},
{
"epoch": 0.2470427661510464,
"grad_norm": 0.7530160135685138,
"learning_rate": 9.940107285619495e-06,
"loss": 0.077,
"step": 543
},
{
"epoch": 0.24749772520473157,
"grad_norm": 0.7997106896354084,
"learning_rate": 9.939886519737707e-06,
"loss": 0.0958,
"step": 544
},
{
"epoch": 0.24795268425841674,
"grad_norm": 0.8918061918047896,
"learning_rate": 9.939665350190702e-06,
"loss": 0.0822,
"step": 545
},
{
"epoch": 0.2484076433121019,
"grad_norm": 0.804115756264787,
"learning_rate": 9.93944377699655e-06,
"loss": 0.0915,
"step": 546
},
{
"epoch": 0.24886260236578708,
"grad_norm": 0.6234057941022288,
"learning_rate": 9.93922180017336e-06,
"loss": 0.0672,
"step": 547
},
{
"epoch": 0.24931756141947226,
"grad_norm": 0.8269450754551354,
"learning_rate": 9.93899941973927e-06,
"loss": 0.1102,
"step": 548
},
{
"epoch": 0.24977252047315743,
"grad_norm": 0.9233841316663005,
"learning_rate": 9.93877663571245e-06,
"loss": 0.0963,
"step": 549
},
{
"epoch": 0.2502274795268426,
"grad_norm": 0.9944861568923805,
"learning_rate": 9.938553448111108e-06,
"loss": 0.1127,
"step": 550
},
{
"epoch": 0.25068243858052774,
"grad_norm": 0.8423641298780182,
"learning_rate": 9.938329856953482e-06,
"loss": 0.0788,
"step": 551
},
{
"epoch": 0.25113739763421294,
"grad_norm": 0.8124861649110975,
"learning_rate": 9.938105862257839e-06,
"loss": 0.0831,
"step": 552
},
{
"epoch": 0.2515923566878981,
"grad_norm": 0.6612222253979325,
"learning_rate": 9.937881464042485e-06,
"loss": 0.0703,
"step": 553
},
{
"epoch": 0.25204731574158323,
"grad_norm": 0.854447666921162,
"learning_rate": 9.937656662325759e-06,
"loss": 0.1074,
"step": 554
},
{
"epoch": 0.25250227479526843,
"grad_norm": 0.74521770368624,
"learning_rate": 9.937431457126028e-06,
"loss": 0.0777,
"step": 555
},
{
"epoch": 0.2529572338489536,
"grad_norm": 0.5044600553216889,
"learning_rate": 9.937205848461694e-06,
"loss": 0.0482,
"step": 556
},
{
"epoch": 0.2534121929026388,
"grad_norm": 1.0949051966397356,
"learning_rate": 9.936979836351197e-06,
"loss": 0.0945,
"step": 557
},
{
"epoch": 0.2538671519563239,
"grad_norm": 1.0332199252594778,
"learning_rate": 9.936753420813003e-06,
"loss": 0.092,
"step": 558
},
{
"epoch": 0.2543221110100091,
"grad_norm": 0.7029577630748303,
"learning_rate": 9.936526601865612e-06,
"loss": 0.0612,
"step": 559
},
{
"epoch": 0.25477707006369427,
"grad_norm": 0.5251640812064944,
"learning_rate": 9.936299379527561e-06,
"loss": 0.0569,
"step": 560
},
{
"epoch": 0.2552320291173794,
"grad_norm": 0.6689496924283664,
"learning_rate": 9.936071753817416e-06,
"loss": 0.0831,
"step": 561
},
{
"epoch": 0.2556869881710646,
"grad_norm": 0.8094390650978945,
"learning_rate": 9.935843724753778e-06,
"loss": 0.0897,
"step": 562
},
{
"epoch": 0.25614194722474976,
"grad_norm": 0.9168849457874456,
"learning_rate": 9.935615292355283e-06,
"loss": 0.1002,
"step": 563
},
{
"epoch": 0.25659690627843496,
"grad_norm": 0.8829987760246157,
"learning_rate": 9.935386456640593e-06,
"loss": 0.0997,
"step": 564
},
{
"epoch": 0.2570518653321201,
"grad_norm": 0.9381858557170412,
"learning_rate": 9.93515721762841e-06,
"loss": 0.0926,
"step": 565
},
{
"epoch": 0.2575068243858053,
"grad_norm": 0.6555630906162114,
"learning_rate": 9.934927575337469e-06,
"loss": 0.0805,
"step": 566
},
{
"epoch": 0.25796178343949044,
"grad_norm": 0.49897284031908906,
"learning_rate": 9.93469752978653e-06,
"loss": 0.0545,
"step": 567
},
{
"epoch": 0.2584167424931756,
"grad_norm": 0.8528689809178094,
"learning_rate": 9.934467080994394e-06,
"loss": 0.071,
"step": 568
},
{
"epoch": 0.2588717015468608,
"grad_norm": 0.7999188284583189,
"learning_rate": 9.934236228979893e-06,
"loss": 0.0675,
"step": 569
},
{
"epoch": 0.25932666060054593,
"grad_norm": 0.6603615540899209,
"learning_rate": 9.934004973761888e-06,
"loss": 0.0584,
"step": 570
},
{
"epoch": 0.25978161965423113,
"grad_norm": 0.907545218090885,
"learning_rate": 9.933773315359281e-06,
"loss": 0.0912,
"step": 571
},
{
"epoch": 0.2602365787079163,
"grad_norm": 1.2225854103436529,
"learning_rate": 9.933541253790998e-06,
"loss": 0.0996,
"step": 572
},
{
"epoch": 0.2606915377616015,
"grad_norm": 0.821182112953313,
"learning_rate": 9.933308789076004e-06,
"loss": 0.0886,
"step": 573
},
{
"epoch": 0.2611464968152866,
"grad_norm": 0.5608593716975471,
"learning_rate": 9.933075921233292e-06,
"loss": 0.0597,
"step": 574
},
{
"epoch": 0.26160145586897177,
"grad_norm": 0.977094581221023,
"learning_rate": 9.932842650281897e-06,
"loss": 0.0796,
"step": 575
},
{
"epoch": 0.26205641492265697,
"grad_norm": 1.0086738407073246,
"learning_rate": 9.932608976240875e-06,
"loss": 0.1245,
"step": 576
},
{
"epoch": 0.2625113739763421,
"grad_norm": 0.7841605184531412,
"learning_rate": 9.932374899129323e-06,
"loss": 0.0798,
"step": 577
},
{
"epoch": 0.2629663330300273,
"grad_norm": 0.6360279282536222,
"learning_rate": 9.932140418966369e-06,
"loss": 0.0714,
"step": 578
},
{
"epoch": 0.26342129208371245,
"grad_norm": 0.8673569892639119,
"learning_rate": 9.931905535771174e-06,
"loss": 0.0805,
"step": 579
},
{
"epoch": 0.26387625113739765,
"grad_norm": 1.0489822111787226,
"learning_rate": 9.93167024956293e-06,
"loss": 0.1046,
"step": 580
},
{
"epoch": 0.2643312101910828,
"grad_norm": 0.5670611684906575,
"learning_rate": 9.931434560360864e-06,
"loss": 0.0662,
"step": 581
},
{
"epoch": 0.26478616924476794,
"grad_norm": 0.6786486717931198,
"learning_rate": 9.931198468184236e-06,
"loss": 0.0705,
"step": 582
},
{
"epoch": 0.26524112829845314,
"grad_norm": 0.7580601459978998,
"learning_rate": 9.93096197305234e-06,
"loss": 0.0852,
"step": 583
},
{
"epoch": 0.2656960873521383,
"grad_norm": 0.8802141056853473,
"learning_rate": 9.930725074984498e-06,
"loss": 0.0989,
"step": 584
},
{
"epoch": 0.2661510464058235,
"grad_norm": 0.6365186853726369,
"learning_rate": 9.930487774000071e-06,
"loss": 0.0639,
"step": 585
},
{
"epoch": 0.26660600545950863,
"grad_norm": 0.5301331320559389,
"learning_rate": 9.930250070118448e-06,
"loss": 0.0628,
"step": 586
},
{
"epoch": 0.26706096451319383,
"grad_norm": 0.6982626314754508,
"learning_rate": 9.930011963359055e-06,
"loss": 0.071,
"step": 587
},
{
"epoch": 0.267515923566879,
"grad_norm": 1.0151988128038116,
"learning_rate": 9.929773453741346e-06,
"loss": 0.1074,
"step": 588
},
{
"epoch": 0.2679708826205642,
"grad_norm": 0.809050548171497,
"learning_rate": 9.929534541284814e-06,
"loss": 0.0715,
"step": 589
},
{
"epoch": 0.2684258416742493,
"grad_norm": 0.8254901916718546,
"learning_rate": 9.929295226008981e-06,
"loss": 0.0867,
"step": 590
},
{
"epoch": 0.26888080072793447,
"grad_norm": 0.695875393623419,
"learning_rate": 9.929055507933403e-06,
"loss": 0.0667,
"step": 591
},
{
"epoch": 0.26933575978161967,
"grad_norm": 0.6569370607259161,
"learning_rate": 9.928815387077668e-06,
"loss": 0.0667,
"step": 592
},
{
"epoch": 0.2697907188353048,
"grad_norm": 0.8509989554819866,
"learning_rate": 9.9285748634614e-06,
"loss": 0.0964,
"step": 593
},
{
"epoch": 0.27024567788899,
"grad_norm": 0.7743154017799978,
"learning_rate": 9.928333937104249e-06,
"loss": 0.1008,
"step": 594
},
{
"epoch": 0.27070063694267515,
"grad_norm": 0.6810806452813069,
"learning_rate": 9.928092608025905e-06,
"loss": 0.0623,
"step": 595
},
{
"epoch": 0.27115559599636035,
"grad_norm": 0.6757764847225584,
"learning_rate": 9.927850876246087e-06,
"loss": 0.0621,
"step": 596
},
{
"epoch": 0.2716105550500455,
"grad_norm": 0.7561897396028232,
"learning_rate": 9.927608741784551e-06,
"loss": 0.0769,
"step": 597
},
{
"epoch": 0.27206551410373064,
"grad_norm": 0.9087608421567758,
"learning_rate": 9.927366204661081e-06,
"loss": 0.1064,
"step": 598
},
{
"epoch": 0.27252047315741584,
"grad_norm": 0.6090969825991095,
"learning_rate": 9.927123264895497e-06,
"loss": 0.0596,
"step": 599
},
{
"epoch": 0.272975432211101,
"grad_norm": 0.5838273869575724,
"learning_rate": 9.926879922507651e-06,
"loss": 0.0581,
"step": 600
},
{
"epoch": 0.2734303912647862,
"grad_norm": 41.16319851924577,
"learning_rate": 9.926636177517427e-06,
"loss": 0.7305,
"step": 601
},
{
"epoch": 0.27388535031847133,
"grad_norm": 0.7159907538362364,
"learning_rate": 9.926392029944743e-06,
"loss": 0.0655,
"step": 602
},
{
"epoch": 0.27434030937215653,
"grad_norm": 0.6649118967721417,
"learning_rate": 9.92614747980955e-06,
"loss": 0.0676,
"step": 603
},
{
"epoch": 0.2747952684258417,
"grad_norm": 0.6955588874689645,
"learning_rate": 9.92590252713183e-06,
"loss": 0.0691,
"step": 604
},
{
"epoch": 0.2752502274795268,
"grad_norm": 1.0093833512385355,
"learning_rate": 9.925657171931603e-06,
"loss": 0.0788,
"step": 605
},
{
"epoch": 0.275705186533212,
"grad_norm": 0.7222760734094591,
"learning_rate": 9.925411414228913e-06,
"loss": 0.0765,
"step": 606
},
{
"epoch": 0.27616014558689717,
"grad_norm": 0.7901083190949632,
"learning_rate": 9.925165254043846e-06,
"loss": 0.0899,
"step": 607
},
{
"epoch": 0.27661510464058237,
"grad_norm": 0.9417411536264935,
"learning_rate": 9.924918691396516e-06,
"loss": 0.105,
"step": 608
},
{
"epoch": 0.2770700636942675,
"grad_norm": 0.8531576003982281,
"learning_rate": 9.924671726307073e-06,
"loss": 0.0943,
"step": 609
},
{
"epoch": 0.2775250227479527,
"grad_norm": 0.5771833327707789,
"learning_rate": 9.924424358795694e-06,
"loss": 0.0649,
"step": 610
},
{
"epoch": 0.27797998180163785,
"grad_norm": 0.6804808150530418,
"learning_rate": 9.924176588882597e-06,
"loss": 0.0591,
"step": 611
},
{
"epoch": 0.278434940855323,
"grad_norm": 0.6916110773643345,
"learning_rate": 9.923928416588027e-06,
"loss": 0.082,
"step": 612
},
{
"epoch": 0.2788898999090082,
"grad_norm": 0.7302341341594485,
"learning_rate": 9.923679841932261e-06,
"loss": 0.0858,
"step": 613
},
{
"epoch": 0.27934485896269334,
"grad_norm": 0.7190514572276734,
"learning_rate": 9.923430864935615e-06,
"loss": 0.0658,
"step": 614
},
{
"epoch": 0.27979981801637854,
"grad_norm": 0.6872892360375661,
"learning_rate": 9.923181485618432e-06,
"loss": 0.0639,
"step": 615
},
{
"epoch": 0.2802547770700637,
"grad_norm": 0.6937876338258171,
"learning_rate": 9.92293170400109e-06,
"loss": 0.0759,
"step": 616
},
{
"epoch": 0.2807097361237489,
"grad_norm": 0.8498928251372749,
"learning_rate": 9.922681520104002e-06,
"loss": 0.0777,
"step": 617
},
{
"epoch": 0.28116469517743403,
"grad_norm": 0.7409609990217324,
"learning_rate": 9.922430933947612e-06,
"loss": 0.0665,
"step": 618
},
{
"epoch": 0.2816196542311192,
"grad_norm": 1.2216942184143182,
"learning_rate": 9.922179945552393e-06,
"loss": 0.1405,
"step": 619
},
{
"epoch": 0.2820746132848044,
"grad_norm": 0.6637234254274302,
"learning_rate": 9.921928554938857e-06,
"loss": 0.062,
"step": 620
},
{
"epoch": 0.2825295723384895,
"grad_norm": 0.9463087936758936,
"learning_rate": 9.921676762127548e-06,
"loss": 0.0767,
"step": 621
},
{
"epoch": 0.2829845313921747,
"grad_norm": 1.089309305809361,
"learning_rate": 9.921424567139042e-06,
"loss": 0.1171,
"step": 622
},
{
"epoch": 0.28343949044585987,
"grad_norm": 0.8752119302288704,
"learning_rate": 9.921171969993942e-06,
"loss": 0.0813,
"step": 623
},
{
"epoch": 0.28389444949954507,
"grad_norm": 0.7870883299373892,
"learning_rate": 9.920918970712894e-06,
"loss": 0.0993,
"step": 624
},
{
"epoch": 0.2843494085532302,
"grad_norm": 0.6504873266789636,
"learning_rate": 9.92066556931657e-06,
"loss": 0.073,
"step": 625
},
{
"epoch": 0.28480436760691535,
"grad_norm": 1.1098031698420505,
"learning_rate": 9.920411765825679e-06,
"loss": 0.1218,
"step": 626
},
{
"epoch": 0.28525932666060055,
"grad_norm": 1.217844501512982,
"learning_rate": 9.920157560260957e-06,
"loss": 0.1549,
"step": 627
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.9728161223416268,
"learning_rate": 9.919902952643179e-06,
"loss": 0.0984,
"step": 628
},
{
"epoch": 0.2861692447679709,
"grad_norm": 0.5217007184455262,
"learning_rate": 9.91964794299315e-06,
"loss": 0.0636,
"step": 629
},
{
"epoch": 0.28662420382165604,
"grad_norm": 1.7394407973312302,
"learning_rate": 9.919392531331706e-06,
"loss": 0.1686,
"step": 630
},
{
"epoch": 0.28707916287534124,
"grad_norm": 0.5702940927618096,
"learning_rate": 9.919136717679723e-06,
"loss": 0.0465,
"step": 631
},
{
"epoch": 0.2875341219290264,
"grad_norm": 0.5990973378462472,
"learning_rate": 9.9188805020581e-06,
"loss": 0.0678,
"step": 632
},
{
"epoch": 0.28798908098271153,
"grad_norm": 0.9343816967111115,
"learning_rate": 9.918623884487777e-06,
"loss": 0.1068,
"step": 633
},
{
"epoch": 0.28844404003639673,
"grad_norm": 0.5997939637509836,
"learning_rate": 9.91836686498972e-06,
"loss": 0.0629,
"step": 634
},
{
"epoch": 0.2888989990900819,
"grad_norm": 0.8063617612610782,
"learning_rate": 9.918109443584938e-06,
"loss": 0.0904,
"step": 635
},
{
"epoch": 0.2893539581437671,
"grad_norm": 0.6625405697250593,
"learning_rate": 9.917851620294461e-06,
"loss": 0.0638,
"step": 636
},
{
"epoch": 0.2898089171974522,
"grad_norm": 0.7423789779714624,
"learning_rate": 9.917593395139358e-06,
"loss": 0.0714,
"step": 637
},
{
"epoch": 0.2902638762511374,
"grad_norm": 0.6102576569607258,
"learning_rate": 9.91733476814073e-06,
"loss": 0.0563,
"step": 638
},
{
"epoch": 0.29071883530482256,
"grad_norm": 0.8342620452233175,
"learning_rate": 9.91707573931971e-06,
"loss": 0.0934,
"step": 639
},
{
"epoch": 0.2911737943585077,
"grad_norm": 0.6397583044633867,
"learning_rate": 9.916816308697468e-06,
"loss": 0.0608,
"step": 640
},
{
"epoch": 0.2916287534121929,
"grad_norm": 0.7837909798874247,
"learning_rate": 9.9165564762952e-06,
"loss": 0.0936,
"step": 641
},
{
"epoch": 0.29208371246587805,
"grad_norm": 0.9915309549496408,
"learning_rate": 9.916296242134142e-06,
"loss": 0.1364,
"step": 642
},
{
"epoch": 0.29253867151956325,
"grad_norm": 0.7722166587924495,
"learning_rate": 9.916035606235555e-06,
"loss": 0.1022,
"step": 643
},
{
"epoch": 0.2929936305732484,
"grad_norm": 0.6446192951972597,
"learning_rate": 9.915774568620739e-06,
"loss": 0.0794,
"step": 644
},
{
"epoch": 0.2934485896269336,
"grad_norm": 0.7655996282008942,
"learning_rate": 9.915513129311025e-06,
"loss": 0.083,
"step": 645
},
{
"epoch": 0.29390354868061874,
"grad_norm": 0.7358761993420325,
"learning_rate": 9.915251288327776e-06,
"loss": 0.0927,
"step": 646
},
{
"epoch": 0.2943585077343039,
"grad_norm": 0.8417441236168001,
"learning_rate": 9.914989045692388e-06,
"loss": 0.0791,
"step": 647
},
{
"epoch": 0.2948134667879891,
"grad_norm": 0.8847229450668922,
"learning_rate": 9.914726401426293e-06,
"loss": 0.1114,
"step": 648
},
{
"epoch": 0.29526842584167423,
"grad_norm": 0.6805089048669102,
"learning_rate": 9.91446335555095e-06,
"loss": 0.0645,
"step": 649
},
{
"epoch": 0.29572338489535943,
"grad_norm": 0.9967907781154212,
"learning_rate": 9.914199908087856e-06,
"loss": 0.1125,
"step": 650
},
{
"epoch": 0.2961783439490446,
"grad_norm": 0.7069764233646496,
"learning_rate": 9.913936059058537e-06,
"loss": 0.0961,
"step": 651
},
{
"epoch": 0.2966333030027298,
"grad_norm": 0.8237259808163154,
"learning_rate": 9.913671808484554e-06,
"loss": 0.0863,
"step": 652
},
{
"epoch": 0.2970882620564149,
"grad_norm": 0.5595221349609915,
"learning_rate": 9.913407156387503e-06,
"loss": 0.0477,
"step": 653
},
{
"epoch": 0.29754322111010006,
"grad_norm": 0.8322598543263076,
"learning_rate": 9.913142102789005e-06,
"loss": 0.0785,
"step": 654
},
{
"epoch": 0.29799818016378526,
"grad_norm": 0.9426946452527044,
"learning_rate": 9.912876647710723e-06,
"loss": 0.0993,
"step": 655
},
{
"epoch": 0.2984531392174704,
"grad_norm": 0.8902481236790349,
"learning_rate": 9.912610791174348e-06,
"loss": 0.0981,
"step": 656
},
{
"epoch": 0.2989080982711556,
"grad_norm": 0.6714333609160019,
"learning_rate": 9.912344533201604e-06,
"loss": 0.0716,
"step": 657
},
{
"epoch": 0.29936305732484075,
"grad_norm": 0.6721636461789662,
"learning_rate": 9.91207787381425e-06,
"loss": 0.0675,
"step": 658
},
{
"epoch": 0.29981801637852595,
"grad_norm": 0.628744075340254,
"learning_rate": 9.911810813034073e-06,
"loss": 0.0583,
"step": 659
},
{
"epoch": 0.3002729754322111,
"grad_norm": 0.9172548581720068,
"learning_rate": 9.9115433508829e-06,
"loss": 0.0972,
"step": 660
},
{
"epoch": 0.30072793448589624,
"grad_norm": 0.914462327674233,
"learning_rate": 9.911275487382583e-06,
"loss": 0.089,
"step": 661
},
{
"epoch": 0.30118289353958144,
"grad_norm": 0.7410939383575923,
"learning_rate": 9.911007222555011e-06,
"loss": 0.0744,
"step": 662
},
{
"epoch": 0.3016378525932666,
"grad_norm": 0.6952942958219819,
"learning_rate": 9.91073855642211e-06,
"loss": 0.0627,
"step": 663
},
{
"epoch": 0.3020928116469518,
"grad_norm": 0.8802064643150562,
"learning_rate": 9.910469489005828e-06,
"loss": 0.0836,
"step": 664
},
{
"epoch": 0.30254777070063693,
"grad_norm": 0.9015922573736656,
"learning_rate": 9.910200020328158e-06,
"loss": 0.0934,
"step": 665
},
{
"epoch": 0.30300272975432213,
"grad_norm": 0.6635682732023674,
"learning_rate": 9.909930150411113e-06,
"loss": 0.0623,
"step": 666
},
{
"epoch": 0.3034576888080073,
"grad_norm": 1.928152977107998,
"learning_rate": 9.909659879276751e-06,
"loss": 0.1457,
"step": 667
},
{
"epoch": 0.3039126478616925,
"grad_norm": 0.7754006092902415,
"learning_rate": 9.909389206947156e-06,
"loss": 0.0621,
"step": 668
},
{
"epoch": 0.3043676069153776,
"grad_norm": 1.0461982822616211,
"learning_rate": 9.909118133444444e-06,
"loss": 0.1087,
"step": 669
},
{
"epoch": 0.30482256596906276,
"grad_norm": 0.7981897376851527,
"learning_rate": 9.90884665879077e-06,
"loss": 0.0921,
"step": 670
},
{
"epoch": 0.30527752502274796,
"grad_norm": 0.8941901965354629,
"learning_rate": 9.908574783008313e-06,
"loss": 0.1055,
"step": 671
},
{
"epoch": 0.3057324840764331,
"grad_norm": 1.0219508428898654,
"learning_rate": 9.908302506119291e-06,
"loss": 0.1152,
"step": 672
},
{
"epoch": 0.3061874431301183,
"grad_norm": 0.7623168423299865,
"learning_rate": 9.908029828145956e-06,
"loss": 0.0837,
"step": 673
},
{
"epoch": 0.30664240218380345,
"grad_norm": 0.7026665400337327,
"learning_rate": 9.907756749110587e-06,
"loss": 0.0785,
"step": 674
},
{
"epoch": 0.30709736123748865,
"grad_norm": 1.0861630797383492,
"learning_rate": 9.9074832690355e-06,
"loss": 0.1121,
"step": 675
},
{
"epoch": 0.3075523202911738,
"grad_norm": 0.8171913655631801,
"learning_rate": 9.907209387943042e-06,
"loss": 0.0759,
"step": 676
},
{
"epoch": 0.30800727934485894,
"grad_norm": 0.695009650682766,
"learning_rate": 9.906935105855595e-06,
"loss": 0.0508,
"step": 677
},
{
"epoch": 0.30846223839854414,
"grad_norm": 1.1629680848047237,
"learning_rate": 9.906660422795569e-06,
"loss": 0.1123,
"step": 678
},
{
"epoch": 0.3089171974522293,
"grad_norm": 1.1028006392582481,
"learning_rate": 9.906385338785411e-06,
"loss": 0.1048,
"step": 679
},
{
"epoch": 0.3093721565059145,
"grad_norm": 0.8590661780887954,
"learning_rate": 9.906109853847601e-06,
"loss": 0.0947,
"step": 680
},
{
"epoch": 0.30982711555959963,
"grad_norm": 0.9160314729851723,
"learning_rate": 9.90583396800465e-06,
"loss": 0.0928,
"step": 681
},
{
"epoch": 0.31028207461328483,
"grad_norm": 0.8935511298088069,
"learning_rate": 9.9055576812791e-06,
"loss": 0.0996,
"step": 682
},
{
"epoch": 0.31073703366697,
"grad_norm": 0.7005723015579258,
"learning_rate": 9.905280993693533e-06,
"loss": 0.0863,
"step": 683
},
{
"epoch": 0.3111919927206551,
"grad_norm": 0.6441434987399284,
"learning_rate": 9.905003905270553e-06,
"loss": 0.0682,
"step": 684
},
{
"epoch": 0.3116469517743403,
"grad_norm": 0.9609160991558658,
"learning_rate": 9.904726416032803e-06,
"loss": 0.1095,
"step": 685
},
{
"epoch": 0.31210191082802546,
"grad_norm": 0.723787688745946,
"learning_rate": 9.904448526002963e-06,
"loss": 0.0637,
"step": 686
},
{
"epoch": 0.31255686988171066,
"grad_norm": 0.5250433090776031,
"learning_rate": 9.904170235203737e-06,
"loss": 0.0587,
"step": 687
},
{
"epoch": 0.3130118289353958,
"grad_norm": 0.8819438583914972,
"learning_rate": 9.903891543657866e-06,
"loss": 0.1112,
"step": 688
},
{
"epoch": 0.313466787989081,
"grad_norm": 0.5413774773467063,
"learning_rate": 9.903612451388122e-06,
"loss": 0.0722,
"step": 689
},
{
"epoch": 0.31392174704276615,
"grad_norm": 0.8913097595158456,
"learning_rate": 9.903332958417315e-06,
"loss": 0.0893,
"step": 690
},
{
"epoch": 0.3143767060964513,
"grad_norm": 0.6466979890354269,
"learning_rate": 9.903053064768283e-06,
"loss": 0.0709,
"step": 691
},
{
"epoch": 0.3148316651501365,
"grad_norm": 0.8428101951038133,
"learning_rate": 9.902772770463892e-06,
"loss": 0.0814,
"step": 692
},
{
"epoch": 0.31528662420382164,
"grad_norm": 0.5832299371816577,
"learning_rate": 9.902492075527057e-06,
"loss": 0.0597,
"step": 693
},
{
"epoch": 0.31574158325750684,
"grad_norm": 0.7856263020740725,
"learning_rate": 9.902210979980705e-06,
"loss": 0.074,
"step": 694
},
{
"epoch": 0.316196542311192,
"grad_norm": 0.8507681095680276,
"learning_rate": 9.90192948384781e-06,
"loss": 0.0941,
"step": 695
},
{
"epoch": 0.3166515013648772,
"grad_norm": 0.7777857824270489,
"learning_rate": 9.901647587151376e-06,
"loss": 0.0708,
"step": 696
},
{
"epoch": 0.31710646041856233,
"grad_norm": 1.068022521735614,
"learning_rate": 9.901365289914437e-06,
"loss": 0.108,
"step": 697
},
{
"epoch": 0.3175614194722475,
"grad_norm": 1.1320770025873614,
"learning_rate": 9.901082592160059e-06,
"loss": 0.108,
"step": 698
},
{
"epoch": 0.3180163785259327,
"grad_norm": 0.803518334023751,
"learning_rate": 9.900799493911346e-06,
"loss": 0.0871,
"step": 699
},
{
"epoch": 0.3184713375796178,
"grad_norm": 0.8188444942805464,
"learning_rate": 9.900515995191431e-06,
"loss": 0.0808,
"step": 700
},
{
"epoch": 0.318926296633303,
"grad_norm": 0.8993527964087475,
"learning_rate": 9.900232096023478e-06,
"loss": 0.0821,
"step": 701
},
{
"epoch": 0.31938125568698816,
"grad_norm": 0.5600271316880729,
"learning_rate": 9.899947796430687e-06,
"loss": 0.0478,
"step": 702
},
{
"epoch": 0.31983621474067336,
"grad_norm": 0.8369718087747545,
"learning_rate": 9.899663096436292e-06,
"loss": 0.0871,
"step": 703
},
{
"epoch": 0.3202911737943585,
"grad_norm": 0.8993771893247359,
"learning_rate": 9.899377996063554e-06,
"loss": 0.0858,
"step": 704
},
{
"epoch": 0.32074613284804365,
"grad_norm": 0.6615773523414142,
"learning_rate": 9.899092495335772e-06,
"loss": 0.0601,
"step": 705
},
{
"epoch": 0.32120109190172885,
"grad_norm": 0.8278593900178107,
"learning_rate": 9.898806594276273e-06,
"loss": 0.0769,
"step": 706
},
{
"epoch": 0.321656050955414,
"grad_norm": 0.7866286577186284,
"learning_rate": 9.898520292908425e-06,
"loss": 0.0894,
"step": 707
},
{
"epoch": 0.3221110100090992,
"grad_norm": 0.8050313615570786,
"learning_rate": 9.89823359125562e-06,
"loss": 0.0732,
"step": 708
},
{
"epoch": 0.32256596906278434,
"grad_norm": 1.0243914254387991,
"learning_rate": 9.897946489341286e-06,
"loss": 0.0901,
"step": 709
},
{
"epoch": 0.32302092811646954,
"grad_norm": 0.7036337195424629,
"learning_rate": 9.897658987188882e-06,
"loss": 0.0686,
"step": 710
},
{
"epoch": 0.3234758871701547,
"grad_norm": 0.5593772745397846,
"learning_rate": 9.897371084821905e-06,
"loss": 0.045,
"step": 711
},
{
"epoch": 0.32393084622383983,
"grad_norm": 0.608867956874154,
"learning_rate": 9.897082782263878e-06,
"loss": 0.0692,
"step": 712
},
{
"epoch": 0.32438580527752503,
"grad_norm": 0.6488333561840038,
"learning_rate": 9.896794079538362e-06,
"loss": 0.0513,
"step": 713
},
{
"epoch": 0.3248407643312102,
"grad_norm": 0.5593745607285364,
"learning_rate": 9.896504976668948e-06,
"loss": 0.0437,
"step": 714
},
{
"epoch": 0.3252957233848954,
"grad_norm": 0.5072427035814352,
"learning_rate": 9.896215473679259e-06,
"loss": 0.0566,
"step": 715
},
{
"epoch": 0.3257506824385805,
"grad_norm": 0.7088539736923404,
"learning_rate": 9.895925570592952e-06,
"loss": 0.0878,
"step": 716
},
{
"epoch": 0.3262056414922657,
"grad_norm": 0.9653520712469312,
"learning_rate": 9.895635267433719e-06,
"loss": 0.101,
"step": 717
},
{
"epoch": 0.32666060054595086,
"grad_norm": 1.2323140645024868,
"learning_rate": 9.895344564225277e-06,
"loss": 0.1359,
"step": 718
},
{
"epoch": 0.327115559599636,
"grad_norm": 0.6826807669546061,
"learning_rate": 9.895053460991389e-06,
"loss": 0.0799,
"step": 719
},
{
"epoch": 0.3275705186533212,
"grad_norm": 0.9496304010026827,
"learning_rate": 9.894761957755834e-06,
"loss": 0.0928,
"step": 720
},
{
"epoch": 0.32802547770700635,
"grad_norm": 0.8578622125964999,
"learning_rate": 9.894470054542438e-06,
"loss": 0.1149,
"step": 721
},
{
"epoch": 0.32848043676069155,
"grad_norm": 0.5483719717114235,
"learning_rate": 9.894177751375053e-06,
"loss": 0.0621,
"step": 722
},
{
"epoch": 0.3289353958143767,
"grad_norm": 0.6341198897869947,
"learning_rate": 9.893885048277564e-06,
"loss": 0.0568,
"step": 723
},
{
"epoch": 0.3293903548680619,
"grad_norm": 0.7169738278552924,
"learning_rate": 9.893591945273888e-06,
"loss": 0.0752,
"step": 724
},
{
"epoch": 0.32984531392174704,
"grad_norm": 0.9839905963719277,
"learning_rate": 9.89329844238798e-06,
"loss": 0.1167,
"step": 725
},
{
"epoch": 0.3303002729754322,
"grad_norm": 0.6825969142747964,
"learning_rate": 9.89300453964382e-06,
"loss": 0.0693,
"step": 726
},
{
"epoch": 0.3307552320291174,
"grad_norm": 1.0420794853330364,
"learning_rate": 9.892710237065423e-06,
"loss": 0.1561,
"step": 727
},
{
"epoch": 0.33121019108280253,
"grad_norm": 1.0109988913697336,
"learning_rate": 9.892415534676844e-06,
"loss": 0.0813,
"step": 728
},
{
"epoch": 0.33166515013648773,
"grad_norm": 0.6237179977245606,
"learning_rate": 9.892120432502161e-06,
"loss": 0.063,
"step": 729
},
{
"epoch": 0.3321201091901729,
"grad_norm": 0.7047649578988654,
"learning_rate": 9.891824930565488e-06,
"loss": 0.0757,
"step": 730
},
{
"epoch": 0.3325750682438581,
"grad_norm": 0.8381336709785119,
"learning_rate": 9.891529028890974e-06,
"loss": 0.1137,
"step": 731
},
{
"epoch": 0.3330300272975432,
"grad_norm": 1.108812928457643,
"learning_rate": 9.891232727502797e-06,
"loss": 0.0971,
"step": 732
},
{
"epoch": 0.33348498635122836,
"grad_norm": 0.8911550238765422,
"learning_rate": 9.89093602642517e-06,
"loss": 0.0869,
"step": 733
},
{
"epoch": 0.33393994540491356,
"grad_norm": 0.7527062298816352,
"learning_rate": 9.890638925682339e-06,
"loss": 0.085,
"step": 734
},
{
"epoch": 0.3343949044585987,
"grad_norm": 0.8028637093759472,
"learning_rate": 9.89034142529858e-06,
"loss": 0.0866,
"step": 735
},
{
"epoch": 0.3348498635122839,
"grad_norm": 0.6620365400447171,
"learning_rate": 9.890043525298203e-06,
"loss": 0.053,
"step": 736
},
{
"epoch": 0.33530482256596905,
"grad_norm": 0.6606838089782118,
"learning_rate": 9.889745225705555e-06,
"loss": 0.0783,
"step": 737
},
{
"epoch": 0.33575978161965425,
"grad_norm": 0.6719238881234298,
"learning_rate": 9.889446526545007e-06,
"loss": 0.079,
"step": 738
},
{
"epoch": 0.3362147406733394,
"grad_norm": 0.7379881342173255,
"learning_rate": 9.88914742784097e-06,
"loss": 0.0848,
"step": 739
},
{
"epoch": 0.33666969972702454,
"grad_norm": 1.9725398231448836,
"learning_rate": 9.888847929617887e-06,
"loss": 0.1666,
"step": 740
},
{
"epoch": 0.33712465878070974,
"grad_norm": 0.7800667095330575,
"learning_rate": 9.888548031900226e-06,
"loss": 0.0779,
"step": 741
},
{
"epoch": 0.3375796178343949,
"grad_norm": 0.9725198572426639,
"learning_rate": 9.888247734712497e-06,
"loss": 0.0719,
"step": 742
},
{
"epoch": 0.3380345768880801,
"grad_norm": 0.9547104503470986,
"learning_rate": 9.887947038079238e-06,
"loss": 0.1119,
"step": 743
},
{
"epoch": 0.33848953594176523,
"grad_norm": 0.5879353672489683,
"learning_rate": 9.887645942025022e-06,
"loss": 0.0553,
"step": 744
},
{
"epoch": 0.33894449499545043,
"grad_norm": 0.5485885922626542,
"learning_rate": 9.887344446574452e-06,
"loss": 0.0494,
"step": 745
},
{
"epoch": 0.3393994540491356,
"grad_norm": 0.9640668269863656,
"learning_rate": 9.887042551752163e-06,
"loss": 0.1104,
"step": 746
},
{
"epoch": 0.3398544131028208,
"grad_norm": 0.8639463935480832,
"learning_rate": 9.886740257582827e-06,
"loss": 0.0655,
"step": 747
},
{
"epoch": 0.3403093721565059,
"grad_norm": 0.6489702107287116,
"learning_rate": 9.886437564091148e-06,
"loss": 0.0777,
"step": 748
},
{
"epoch": 0.34076433121019106,
"grad_norm": 0.8236523684362178,
"learning_rate": 9.886134471301854e-06,
"loss": 0.0916,
"step": 749
},
{
"epoch": 0.34121929026387626,
"grad_norm": 0.8459143900125461,
"learning_rate": 9.885830979239718e-06,
"loss": 0.1017,
"step": 750
},
{
"epoch": 0.3416742493175614,
"grad_norm": 0.7496065352262437,
"learning_rate": 9.885527087929541e-06,
"loss": 0.0861,
"step": 751
},
{
"epoch": 0.3421292083712466,
"grad_norm": 0.849292513666517,
"learning_rate": 9.88522279739615e-06,
"loss": 0.0839,
"step": 752
},
{
"epoch": 0.34258416742493175,
"grad_norm": 0.7756671663835698,
"learning_rate": 9.884918107664417e-06,
"loss": 0.0809,
"step": 753
},
{
"epoch": 0.34303912647861695,
"grad_norm": 0.7338987681003677,
"learning_rate": 9.884613018759234e-06,
"loss": 0.0721,
"step": 754
},
{
"epoch": 0.3434940855323021,
"grad_norm": 0.6003946948163056,
"learning_rate": 9.884307530705534e-06,
"loss": 0.0782,
"step": 755
},
{
"epoch": 0.34394904458598724,
"grad_norm": 0.5309561440373582,
"learning_rate": 9.88400164352828e-06,
"loss": 0.0563,
"step": 756
},
{
"epoch": 0.34440400363967244,
"grad_norm": 0.6551261739802692,
"learning_rate": 9.883695357252467e-06,
"loss": 0.061,
"step": 757
},
{
"epoch": 0.3448589626933576,
"grad_norm": 0.6598139820416582,
"learning_rate": 9.883388671903125e-06,
"loss": 0.084,
"step": 758
},
{
"epoch": 0.3453139217470428,
"grad_norm": 0.8678451615084499,
"learning_rate": 9.883081587505315e-06,
"loss": 0.0893,
"step": 759
},
{
"epoch": 0.34576888080072793,
"grad_norm": 0.8849976199871086,
"learning_rate": 9.882774104084127e-06,
"loss": 0.0938,
"step": 760
},
{
"epoch": 0.34622383985441313,
"grad_norm": 0.6157555054475868,
"learning_rate": 9.882466221664691e-06,
"loss": 0.0535,
"step": 761
},
{
"epoch": 0.3466787989080983,
"grad_norm": 0.9555128068667961,
"learning_rate": 9.882157940272165e-06,
"loss": 0.0984,
"step": 762
},
{
"epoch": 0.3471337579617834,
"grad_norm": 0.8431106213501941,
"learning_rate": 9.881849259931738e-06,
"loss": 0.1062,
"step": 763
},
{
"epoch": 0.3475887170154686,
"grad_norm": 0.6608166650909644,
"learning_rate": 9.881540180668637e-06,
"loss": 0.0589,
"step": 764
},
{
"epoch": 0.34804367606915376,
"grad_norm": 0.7177237690901401,
"learning_rate": 9.881230702508118e-06,
"loss": 0.0721,
"step": 765
},
{
"epoch": 0.34849863512283896,
"grad_norm": 0.49396541889218665,
"learning_rate": 9.880920825475468e-06,
"loss": 0.0582,
"step": 766
},
{
"epoch": 0.3489535941765241,
"grad_norm": 0.7008727540015932,
"learning_rate": 9.88061054959601e-06,
"loss": 0.0689,
"step": 767
},
{
"epoch": 0.3494085532302093,
"grad_norm": 0.6417543130209264,
"learning_rate": 9.880299874895098e-06,
"loss": 0.0859,
"step": 768
},
{
"epoch": 0.34986351228389445,
"grad_norm": 0.5325758158155319,
"learning_rate": 9.879988801398121e-06,
"loss": 0.0508,
"step": 769
},
{
"epoch": 0.3503184713375796,
"grad_norm": 0.653129374155715,
"learning_rate": 9.879677329130496e-06,
"loss": 0.0822,
"step": 770
},
{
"epoch": 0.3507734303912648,
"grad_norm": 0.6044703796770591,
"learning_rate": 9.879365458117678e-06,
"loss": 0.0662,
"step": 771
},
{
"epoch": 0.35122838944494994,
"grad_norm": 0.6417796330386928,
"learning_rate": 9.879053188385148e-06,
"loss": 0.0649,
"step": 772
},
{
"epoch": 0.35168334849863514,
"grad_norm": 0.6127493684308597,
"learning_rate": 9.878740519958425e-06,
"loss": 0.0601,
"step": 773
},
{
"epoch": 0.3521383075523203,
"grad_norm": 0.9092296350808027,
"learning_rate": 9.878427452863059e-06,
"loss": 0.1138,
"step": 774
},
{
"epoch": 0.3525932666060055,
"grad_norm": 0.8850379239223551,
"learning_rate": 9.878113987124633e-06,
"loss": 0.1135,
"step": 775
},
{
"epoch": 0.35304822565969063,
"grad_norm": 0.8106864823035035,
"learning_rate": 9.877800122768761e-06,
"loss": 0.084,
"step": 776
},
{
"epoch": 0.3535031847133758,
"grad_norm": 0.6717791100158048,
"learning_rate": 9.877485859821092e-06,
"loss": 0.0764,
"step": 777
},
{
"epoch": 0.353958143767061,
"grad_norm": 0.4266356830653338,
"learning_rate": 9.877171198307304e-06,
"loss": 0.0496,
"step": 778
},
{
"epoch": 0.3544131028207461,
"grad_norm": 0.7839112755574695,
"learning_rate": 9.87685613825311e-06,
"loss": 0.0864,
"step": 779
},
{
"epoch": 0.3548680618744313,
"grad_norm": 0.8928629316475961,
"learning_rate": 9.876540679684257e-06,
"loss": 0.0802,
"step": 780
},
{
"epoch": 0.35532302092811646,
"grad_norm": 0.7427060191976654,
"learning_rate": 9.876224822626522e-06,
"loss": 0.0809,
"step": 781
},
{
"epoch": 0.35577797998180166,
"grad_norm": 0.6618589317208607,
"learning_rate": 9.875908567105716e-06,
"loss": 0.0633,
"step": 782
},
{
"epoch": 0.3562329390354868,
"grad_norm": 0.9168643329932029,
"learning_rate": 9.87559191314768e-06,
"loss": 0.0977,
"step": 783
},
{
"epoch": 0.35668789808917195,
"grad_norm": 1.010661772545197,
"learning_rate": 9.87527486077829e-06,
"loss": 0.112,
"step": 784
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.7355960177801563,
"learning_rate": 9.874957410023458e-06,
"loss": 0.0578,
"step": 785
},
{
"epoch": 0.3575978161965423,
"grad_norm": 0.7012046376593928,
"learning_rate": 9.874639560909118e-06,
"loss": 0.0856,
"step": 786
},
{
"epoch": 0.3580527752502275,
"grad_norm": 0.629856671324697,
"learning_rate": 9.87432131346125e-06,
"loss": 0.079,
"step": 787
},
{
"epoch": 0.35850773430391264,
"grad_norm": 0.6605442679933491,
"learning_rate": 9.874002667705855e-06,
"loss": 0.0713,
"step": 788
},
{
"epoch": 0.35896269335759784,
"grad_norm": 0.6036439966816435,
"learning_rate": 9.873683623668972e-06,
"loss": 0.0734,
"step": 789
},
{
"epoch": 0.359417652411283,
"grad_norm": 0.9098464282834562,
"learning_rate": 9.873364181376674e-06,
"loss": 0.1273,
"step": 790
},
{
"epoch": 0.35987261146496813,
"grad_norm": 0.725232432410699,
"learning_rate": 9.873044340855062e-06,
"loss": 0.0704,
"step": 791
},
{
"epoch": 0.36032757051865333,
"grad_norm": 0.8275864687946802,
"learning_rate": 9.872724102130273e-06,
"loss": 0.0722,
"step": 792
},
{
"epoch": 0.3607825295723385,
"grad_norm": 0.6908762665090429,
"learning_rate": 9.872403465228476e-06,
"loss": 0.068,
"step": 793
},
{
"epoch": 0.3612374886260237,
"grad_norm": 0.8007479624540592,
"learning_rate": 9.872082430175871e-06,
"loss": 0.0792,
"step": 794
},
{
"epoch": 0.3616924476797088,
"grad_norm": 0.7580697654486878,
"learning_rate": 9.871760996998692e-06,
"loss": 0.0662,
"step": 795
},
{
"epoch": 0.362147406733394,
"grad_norm": 1.0378802589927232,
"learning_rate": 9.871439165723207e-06,
"loss": 0.0905,
"step": 796
},
{
"epoch": 0.36260236578707916,
"grad_norm": 0.9366156924362913,
"learning_rate": 9.87111693637571e-06,
"loss": 0.0966,
"step": 797
},
{
"epoch": 0.3630573248407643,
"grad_norm": 0.9568919919938076,
"learning_rate": 9.870794308982536e-06,
"loss": 0.1092,
"step": 798
},
{
"epoch": 0.3635122838944495,
"grad_norm": 1.0303944561108107,
"learning_rate": 9.870471283570046e-06,
"loss": 0.1214,
"step": 799
},
{
"epoch": 0.36396724294813465,
"grad_norm": 0.7123988620535131,
"learning_rate": 9.870147860164639e-06,
"loss": 0.0952,
"step": 800
},
{
"epoch": 0.36442220200181985,
"grad_norm": 0.6461145025804255,
"learning_rate": 9.86982403879274e-06,
"loss": 0.0653,
"step": 801
},
{
"epoch": 0.364877161055505,
"grad_norm": 0.761176238728339,
"learning_rate": 9.869499819480815e-06,
"loss": 0.0911,
"step": 802
},
{
"epoch": 0.3653321201091902,
"grad_norm": 0.6778284620896282,
"learning_rate": 9.869175202255354e-06,
"loss": 0.0726,
"step": 803
},
{
"epoch": 0.36578707916287534,
"grad_norm": 0.6378934869683002,
"learning_rate": 9.868850187142885e-06,
"loss": 0.0721,
"step": 804
},
{
"epoch": 0.3662420382165605,
"grad_norm": 0.725078464245391,
"learning_rate": 9.868524774169968e-06,
"loss": 0.0774,
"step": 805
},
{
"epoch": 0.3666969972702457,
"grad_norm": 0.7707907185217752,
"learning_rate": 9.86819896336319e-06,
"loss": 0.067,
"step": 806
},
{
"epoch": 0.36715195632393083,
"grad_norm": 0.8162851407409059,
"learning_rate": 9.867872754749178e-06,
"loss": 0.0908,
"step": 807
},
{
"epoch": 0.36760691537761603,
"grad_norm": 0.5330499489332517,
"learning_rate": 9.867546148354586e-06,
"loss": 0.066,
"step": 808
},
{
"epoch": 0.3680618744313012,
"grad_norm": 0.6649993383235931,
"learning_rate": 9.867219144206105e-06,
"loss": 0.0672,
"step": 809
},
{
"epoch": 0.3685168334849864,
"grad_norm": 0.9824606570699352,
"learning_rate": 9.866891742330458e-06,
"loss": 0.11,
"step": 810
},
{
"epoch": 0.3689717925386715,
"grad_norm": 0.6507791006697302,
"learning_rate": 9.866563942754394e-06,
"loss": 0.0622,
"step": 811
},
{
"epoch": 0.36942675159235666,
"grad_norm": 0.7455907568930894,
"learning_rate": 9.866235745504705e-06,
"loss": 0.0833,
"step": 812
},
{
"epoch": 0.36988171064604186,
"grad_norm": 0.9927293122267482,
"learning_rate": 9.865907150608203e-06,
"loss": 0.0978,
"step": 813
},
{
"epoch": 0.370336669699727,
"grad_norm": 0.817279180213694,
"learning_rate": 9.865578158091746e-06,
"loss": 0.1036,
"step": 814
},
{
"epoch": 0.3707916287534122,
"grad_norm": 0.9966504261459711,
"learning_rate": 9.865248767982211e-06,
"loss": 0.1027,
"step": 815
},
{
"epoch": 0.37124658780709735,
"grad_norm": 0.9561727776097537,
"learning_rate": 9.864918980306521e-06,
"loss": 0.1136,
"step": 816
},
{
"epoch": 0.37170154686078255,
"grad_norm": 0.6718095123705313,
"learning_rate": 9.86458879509162e-06,
"loss": 0.0762,
"step": 817
},
{
"epoch": 0.3721565059144677,
"grad_norm": 0.9803345299998187,
"learning_rate": 9.864258212364492e-06,
"loss": 0.0791,
"step": 818
},
{
"epoch": 0.37261146496815284,
"grad_norm": 0.8058679812037255,
"learning_rate": 9.86392723215215e-06,
"loss": 0.069,
"step": 819
},
{
"epoch": 0.37306642402183804,
"grad_norm": 0.5836160590759203,
"learning_rate": 9.86359585448164e-06,
"loss": 0.0621,
"step": 820
},
{
"epoch": 0.3735213830755232,
"grad_norm": 0.6511599091669776,
"learning_rate": 9.863264079380039e-06,
"loss": 0.0745,
"step": 821
},
{
"epoch": 0.3739763421292084,
"grad_norm": 0.9308266206126162,
"learning_rate": 9.862931906874461e-06,
"loss": 0.1132,
"step": 822
},
{
"epoch": 0.37443130118289353,
"grad_norm": 0.613775373571284,
"learning_rate": 9.862599336992048e-06,
"loss": 0.0545,
"step": 823
},
{
"epoch": 0.37488626023657873,
"grad_norm": 0.6991388893487894,
"learning_rate": 9.862266369759976e-06,
"loss": 0.0754,
"step": 824
},
{
"epoch": 0.37534121929026387,
"grad_norm": 0.6352968005261165,
"learning_rate": 9.861933005205454e-06,
"loss": 0.0576,
"step": 825
},
{
"epoch": 0.37579617834394907,
"grad_norm": 1.109194467922723,
"learning_rate": 9.861599243355725e-06,
"loss": 0.1281,
"step": 826
},
{
"epoch": 0.3762511373976342,
"grad_norm": 0.9742134289860664,
"learning_rate": 9.86126508423806e-06,
"loss": 0.1067,
"step": 827
},
{
"epoch": 0.37670609645131936,
"grad_norm": 0.6015820455914206,
"learning_rate": 9.860930527879763e-06,
"loss": 0.055,
"step": 828
},
{
"epoch": 0.37716105550500456,
"grad_norm": 1.0894948091440197,
"learning_rate": 9.860595574308179e-06,
"loss": 0.1147,
"step": 829
},
{
"epoch": 0.3776160145586897,
"grad_norm": 0.7023892750192133,
"learning_rate": 9.860260223550672e-06,
"loss": 0.0815,
"step": 830
},
{
"epoch": 0.3780709736123749,
"grad_norm": 0.4943868719085533,
"learning_rate": 9.859924475634649e-06,
"loss": 0.0476,
"step": 831
},
{
"epoch": 0.37852593266606005,
"grad_norm": 0.9974648765413693,
"learning_rate": 9.859588330587545e-06,
"loss": 0.1068,
"step": 832
},
{
"epoch": 0.37898089171974525,
"grad_norm": 0.5960289391531881,
"learning_rate": 9.859251788436829e-06,
"loss": 0.0715,
"step": 833
},
{
"epoch": 0.3794358507734304,
"grad_norm": 0.907079582974149,
"learning_rate": 9.85891484921e-06,
"loss": 0.0905,
"step": 834
},
{
"epoch": 0.37989080982711554,
"grad_norm": 0.8133034306250352,
"learning_rate": 9.858577512934592e-06,
"loss": 0.1012,
"step": 835
},
{
"epoch": 0.38034576888080074,
"grad_norm": 0.7828785203637737,
"learning_rate": 9.858239779638173e-06,
"loss": 0.0726,
"step": 836
},
{
"epoch": 0.3808007279344859,
"grad_norm": 1.3138864597148558,
"learning_rate": 9.857901649348338e-06,
"loss": 0.1307,
"step": 837
},
{
"epoch": 0.3812556869881711,
"grad_norm": 0.7000750227265026,
"learning_rate": 9.857563122092717e-06,
"loss": 0.0777,
"step": 838
},
{
"epoch": 0.3817106460418562,
"grad_norm": 0.757283984575844,
"learning_rate": 9.857224197898975e-06,
"loss": 0.083,
"step": 839
},
{
"epoch": 0.3821656050955414,
"grad_norm": 0.7113754486134378,
"learning_rate": 9.856884876794805e-06,
"loss": 0.0795,
"step": 840
},
{
"epoch": 0.38262056414922657,
"grad_norm": 0.6891370217065743,
"learning_rate": 9.856545158807938e-06,
"loss": 0.0576,
"step": 841
},
{
"epoch": 0.3830755232029117,
"grad_norm": 0.7230826558764609,
"learning_rate": 9.856205043966134e-06,
"loss": 0.0973,
"step": 842
},
{
"epoch": 0.3835304822565969,
"grad_norm": 0.9951638416419379,
"learning_rate": 9.855864532297181e-06,
"loss": 0.1225,
"step": 843
},
{
"epoch": 0.38398544131028206,
"grad_norm": 0.8272776971451865,
"learning_rate": 9.85552362382891e-06,
"loss": 0.0928,
"step": 844
},
{
"epoch": 0.38444040036396726,
"grad_norm": 0.662562460388915,
"learning_rate": 9.855182318589174e-06,
"loss": 0.0711,
"step": 845
},
{
"epoch": 0.3848953594176524,
"grad_norm": 1.185659176011977,
"learning_rate": 9.854840616605866e-06,
"loss": 0.0922,
"step": 846
},
{
"epoch": 0.3853503184713376,
"grad_norm": 0.7002426118833048,
"learning_rate": 9.854498517906908e-06,
"loss": 0.0828,
"step": 847
},
{
"epoch": 0.38580527752502275,
"grad_norm": 0.8957633348930525,
"learning_rate": 9.854156022520252e-06,
"loss": 0.0809,
"step": 848
},
{
"epoch": 0.3862602365787079,
"grad_norm": 1.0593251614278854,
"learning_rate": 9.853813130473887e-06,
"loss": 0.1109,
"step": 849
},
{
"epoch": 0.3867151956323931,
"grad_norm": 0.7751748709357449,
"learning_rate": 9.853469841795832e-06,
"loss": 0.0823,
"step": 850
},
{
"epoch": 0.38717015468607824,
"grad_norm": 0.5943868690351954,
"learning_rate": 9.853126156514142e-06,
"loss": 0.0758,
"step": 851
},
{
"epoch": 0.38762511373976344,
"grad_norm": 0.4901349757557767,
"learning_rate": 9.852782074656897e-06,
"loss": 0.064,
"step": 852
},
{
"epoch": 0.3880800727934486,
"grad_norm": 0.7531191508768753,
"learning_rate": 9.852437596252216e-06,
"loss": 0.0824,
"step": 853
},
{
"epoch": 0.3885350318471338,
"grad_norm": 0.7684236261792305,
"learning_rate": 9.852092721328248e-06,
"loss": 0.0674,
"step": 854
},
{
"epoch": 0.3889899909008189,
"grad_norm": 0.8624513661560378,
"learning_rate": 9.851747449913176e-06,
"loss": 0.09,
"step": 855
},
{
"epoch": 0.38944494995450407,
"grad_norm": 0.9125725996183891,
"learning_rate": 9.851401782035213e-06,
"loss": 0.129,
"step": 856
},
{
"epoch": 0.38989990900818927,
"grad_norm": 0.7630714638300728,
"learning_rate": 9.851055717722604e-06,
"loss": 0.068,
"step": 857
},
{
"epoch": 0.3903548680618744,
"grad_norm": 0.834756070401477,
"learning_rate": 9.850709257003628e-06,
"loss": 0.0831,
"step": 858
},
{
"epoch": 0.3908098271155596,
"grad_norm": 0.9864776662717517,
"learning_rate": 9.850362399906598e-06,
"loss": 0.0904,
"step": 859
},
{
"epoch": 0.39126478616924476,
"grad_norm": 0.6242730295284743,
"learning_rate": 9.850015146459857e-06,
"loss": 0.0754,
"step": 860
},
{
"epoch": 0.39171974522292996,
"grad_norm": 0.838271649072902,
"learning_rate": 9.84966749669178e-06,
"loss": 0.0899,
"step": 861
},
{
"epoch": 0.3921747042766151,
"grad_norm": 0.6826448278617049,
"learning_rate": 9.849319450630777e-06,
"loss": 0.0698,
"step": 862
},
{
"epoch": 0.39262966333030025,
"grad_norm": 0.5533993282250775,
"learning_rate": 9.848971008305288e-06,
"loss": 0.0688,
"step": 863
},
{
"epoch": 0.39308462238398545,
"grad_norm": 0.838673412156409,
"learning_rate": 9.848622169743784e-06,
"loss": 0.0815,
"step": 864
},
{
"epoch": 0.3935395814376706,
"grad_norm": 0.9783580500729582,
"learning_rate": 9.848272934974774e-06,
"loss": 0.0745,
"step": 865
},
{
"epoch": 0.3939945404913558,
"grad_norm": 0.5976030953641746,
"learning_rate": 9.847923304026793e-06,
"loss": 0.0664,
"step": 866
},
{
"epoch": 0.39444949954504094,
"grad_norm": 0.6999143793652887,
"learning_rate": 9.847573276928415e-06,
"loss": 0.0804,
"step": 867
},
{
"epoch": 0.39490445859872614,
"grad_norm": 0.6338725165728231,
"learning_rate": 9.847222853708239e-06,
"loss": 0.0655,
"step": 868
},
{
"epoch": 0.3953594176524113,
"grad_norm": 0.7010627446349382,
"learning_rate": 9.846872034394902e-06,
"loss": 0.0667,
"step": 869
},
{
"epoch": 0.3958143767060964,
"grad_norm": 0.6173227181881447,
"learning_rate": 9.84652081901707e-06,
"loss": 0.0674,
"step": 870
},
{
"epoch": 0.3962693357597816,
"grad_norm": 0.9673042020268607,
"learning_rate": 9.846169207603443e-06,
"loss": 0.1267,
"step": 871
},
{
"epoch": 0.39672429481346677,
"grad_norm": 0.6294912489479282,
"learning_rate": 9.845817200182755e-06,
"loss": 0.0588,
"step": 872
},
{
"epoch": 0.39717925386715197,
"grad_norm": 0.8477152807126976,
"learning_rate": 9.845464796783767e-06,
"loss": 0.1219,
"step": 873
},
{
"epoch": 0.3976342129208371,
"grad_norm": 0.5887483684825674,
"learning_rate": 9.845111997435279e-06,
"loss": 0.0731,
"step": 874
},
{
"epoch": 0.3980891719745223,
"grad_norm": 0.5630369277247907,
"learning_rate": 9.844758802166116e-06,
"loss": 0.0579,
"step": 875
},
{
"epoch": 0.39854413102820746,
"grad_norm": 0.6717541815357567,
"learning_rate": 9.844405211005145e-06,
"loss": 0.0711,
"step": 876
},
{
"epoch": 0.3989990900818926,
"grad_norm": 0.6571828619535791,
"learning_rate": 9.844051223981258e-06,
"loss": 0.0638,
"step": 877
},
{
"epoch": 0.3994540491355778,
"grad_norm": 0.6723710552364174,
"learning_rate": 9.84369684112338e-06,
"loss": 0.0676,
"step": 878
},
{
"epoch": 0.39990900818926295,
"grad_norm": 0.7014173744195523,
"learning_rate": 9.84334206246047e-06,
"loss": 0.0751,
"step": 879
},
{
"epoch": 0.40036396724294815,
"grad_norm": 0.7999660318519703,
"learning_rate": 9.842986888021518e-06,
"loss": 0.0895,
"step": 880
},
{
"epoch": 0.4008189262966333,
"grad_norm": 0.5578605501955606,
"learning_rate": 9.842631317835548e-06,
"loss": 0.0637,
"step": 881
},
{
"epoch": 0.4012738853503185,
"grad_norm": 0.6615256090849237,
"learning_rate": 9.842275351931617e-06,
"loss": 0.0664,
"step": 882
},
{
"epoch": 0.40172884440400364,
"grad_norm": 0.5263094198672195,
"learning_rate": 9.841918990338812e-06,
"loss": 0.0611,
"step": 883
},
{
"epoch": 0.4021838034576888,
"grad_norm": 0.8080883575450535,
"learning_rate": 9.841562233086252e-06,
"loss": 0.0912,
"step": 884
},
{
"epoch": 0.402638762511374,
"grad_norm": 0.6655757939327012,
"learning_rate": 9.841205080203092e-06,
"loss": 0.0601,
"step": 885
},
{
"epoch": 0.4030937215650591,
"grad_norm": 0.8701903481119097,
"learning_rate": 9.840847531718515e-06,
"loss": 0.0914,
"step": 886
},
{
"epoch": 0.4035486806187443,
"grad_norm": 0.7730206436987713,
"learning_rate": 9.840489587661738e-06,
"loss": 0.0747,
"step": 887
},
{
"epoch": 0.40400363967242947,
"grad_norm": 0.7410839527981146,
"learning_rate": 9.840131248062012e-06,
"loss": 0.079,
"step": 888
},
{
"epoch": 0.40445859872611467,
"grad_norm": 0.627620281196765,
"learning_rate": 9.839772512948618e-06,
"loss": 0.0715,
"step": 889
},
{
"epoch": 0.4049135577797998,
"grad_norm": 0.8746014124114054,
"learning_rate": 9.83941338235087e-06,
"loss": 0.0824,
"step": 890
},
{
"epoch": 0.40536851683348496,
"grad_norm": 1.0112737589697485,
"learning_rate": 9.839053856298116e-06,
"loss": 0.1251,
"step": 891
},
{
"epoch": 0.40582347588717016,
"grad_norm": 0.72216805525771,
"learning_rate": 9.838693934819734e-06,
"loss": 0.0893,
"step": 892
},
{
"epoch": 0.4062784349408553,
"grad_norm": 0.7544949830136005,
"learning_rate": 9.838333617945134e-06,
"loss": 0.0968,
"step": 893
},
{
"epoch": 0.4067333939945405,
"grad_norm": 0.9543024355165705,
"learning_rate": 9.837972905703762e-06,
"loss": 0.102,
"step": 894
},
{
"epoch": 0.40718835304822565,
"grad_norm": 1.02061795078975,
"learning_rate": 9.83761179812509e-06,
"loss": 0.0649,
"step": 895
},
{
"epoch": 0.40764331210191085,
"grad_norm": 0.39738812842187227,
"learning_rate": 9.837250295238629e-06,
"loss": 0.0428,
"step": 896
},
{
"epoch": 0.408098271155596,
"grad_norm": 0.8873895570319217,
"learning_rate": 9.836888397073919e-06,
"loss": 0.1068,
"step": 897
},
{
"epoch": 0.40855323020928114,
"grad_norm": 0.7492126364897504,
"learning_rate": 9.836526103660533e-06,
"loss": 0.0953,
"step": 898
},
{
"epoch": 0.40900818926296634,
"grad_norm": 0.821575499525911,
"learning_rate": 9.836163415028075e-06,
"loss": 0.0712,
"step": 899
},
{
"epoch": 0.4094631483166515,
"grad_norm": 1.0052579979241618,
"learning_rate": 9.835800331206183e-06,
"loss": 0.1138,
"step": 900
},
{
"epoch": 0.4099181073703367,
"grad_norm": 0.7848465428804848,
"learning_rate": 9.835436852224525e-06,
"loss": 0.0978,
"step": 901
},
{
"epoch": 0.4103730664240218,
"grad_norm": 0.9719856735481065,
"learning_rate": 9.835072978112804e-06,
"loss": 0.0846,
"step": 902
},
{
"epoch": 0.410828025477707,
"grad_norm": 0.6607308818506346,
"learning_rate": 9.834708708900755e-06,
"loss": 0.0654,
"step": 903
},
{
"epoch": 0.41128298453139217,
"grad_norm": 0.5191597312034261,
"learning_rate": 9.834344044618144e-06,
"loss": 0.0518,
"step": 904
},
{
"epoch": 0.41173794358507737,
"grad_norm": 0.5336391872354229,
"learning_rate": 9.83397898529477e-06,
"loss": 0.0535,
"step": 905
},
{
"epoch": 0.4121929026387625,
"grad_norm": 0.5687342550017563,
"learning_rate": 9.833613530960462e-06,
"loss": 0.0578,
"step": 906
},
{
"epoch": 0.41264786169244766,
"grad_norm": 0.8793783198642894,
"learning_rate": 9.833247681645083e-06,
"loss": 0.1286,
"step": 907
},
{
"epoch": 0.41310282074613286,
"grad_norm": 0.8073005899800644,
"learning_rate": 9.832881437378534e-06,
"loss": 0.0853,
"step": 908
},
{
"epoch": 0.413557779799818,
"grad_norm": 0.511699500000588,
"learning_rate": 9.832514798190738e-06,
"loss": 0.0504,
"step": 909
},
{
"epoch": 0.4140127388535032,
"grad_norm": 0.5082793074725768,
"learning_rate": 9.832147764111655e-06,
"loss": 0.056,
"step": 910
},
{
"epoch": 0.41446769790718835,
"grad_norm": 0.9876041013395295,
"learning_rate": 9.83178033517128e-06,
"loss": 0.0984,
"step": 911
},
{
"epoch": 0.41492265696087355,
"grad_norm": 0.7511273129930924,
"learning_rate": 9.831412511399633e-06,
"loss": 0.0969,
"step": 912
},
{
"epoch": 0.4153776160145587,
"grad_norm": 1.0144870263760433,
"learning_rate": 9.831044292826778e-06,
"loss": 0.1482,
"step": 913
},
{
"epoch": 0.41583257506824384,
"grad_norm": 0.70444400073401,
"learning_rate": 9.830675679482797e-06,
"loss": 0.0802,
"step": 914
},
{
"epoch": 0.41628753412192904,
"grad_norm": 1.0357251397748677,
"learning_rate": 9.830306671397816e-06,
"loss": 0.1061,
"step": 915
},
{
"epoch": 0.4167424931756142,
"grad_norm": 0.895894802940119,
"learning_rate": 9.829937268601988e-06,
"loss": 0.1005,
"step": 916
},
{
"epoch": 0.4171974522292994,
"grad_norm": 0.6004589977630954,
"learning_rate": 9.829567471125497e-06,
"loss": 0.0664,
"step": 917
},
{
"epoch": 0.4176524112829845,
"grad_norm": 0.6058859475834909,
"learning_rate": 9.829197278998562e-06,
"loss": 0.0728,
"step": 918
},
{
"epoch": 0.4181073703366697,
"grad_norm": 0.5886912548442098,
"learning_rate": 9.828826692251435e-06,
"loss": 0.074,
"step": 919
},
{
"epoch": 0.41856232939035487,
"grad_norm": 0.5982473215332103,
"learning_rate": 9.828455710914398e-06,
"loss": 0.0653,
"step": 920
},
{
"epoch": 0.41901728844404,
"grad_norm": 0.8647804622811079,
"learning_rate": 9.828084335017763e-06,
"loss": 0.0741,
"step": 921
},
{
"epoch": 0.4194722474977252,
"grad_norm": 0.653767178815679,
"learning_rate": 9.827712564591883e-06,
"loss": 0.0604,
"step": 922
},
{
"epoch": 0.41992720655141036,
"grad_norm": 0.7812500085225947,
"learning_rate": 9.827340399667132e-06,
"loss": 0.0708,
"step": 923
},
{
"epoch": 0.42038216560509556,
"grad_norm": 0.7314008563711142,
"learning_rate": 9.826967840273921e-06,
"loss": 0.0721,
"step": 924
},
{
"epoch": 0.4208371246587807,
"grad_norm": 0.8727413076803472,
"learning_rate": 9.8265948864427e-06,
"loss": 0.0892,
"step": 925
},
{
"epoch": 0.4212920837124659,
"grad_norm": 0.6051379056710864,
"learning_rate": 9.826221538203942e-06,
"loss": 0.0685,
"step": 926
},
{
"epoch": 0.42174704276615105,
"grad_norm": 0.7279887191787228,
"learning_rate": 9.825847795588154e-06,
"loss": 0.0766,
"step": 927
},
{
"epoch": 0.4222020018198362,
"grad_norm": 0.7126811268305303,
"learning_rate": 9.825473658625876e-06,
"loss": 0.0821,
"step": 928
},
{
"epoch": 0.4226569608735214,
"grad_norm": 0.8812960827967533,
"learning_rate": 9.825099127347684e-06,
"loss": 0.0982,
"step": 929
},
{
"epoch": 0.42311191992720654,
"grad_norm": 0.7462955906438729,
"learning_rate": 9.824724201784182e-06,
"loss": 0.1073,
"step": 930
},
{
"epoch": 0.42356687898089174,
"grad_norm": 0.5448066050338419,
"learning_rate": 9.824348881966004e-06,
"loss": 0.0637,
"step": 931
},
{
"epoch": 0.4240218380345769,
"grad_norm": 0.7750150802923693,
"learning_rate": 9.823973167923823e-06,
"loss": 0.09,
"step": 932
},
{
"epoch": 0.4244767970882621,
"grad_norm": 0.8695175796556455,
"learning_rate": 9.82359705968834e-06,
"loss": 0.0857,
"step": 933
},
{
"epoch": 0.4249317561419472,
"grad_norm": 0.653112477618241,
"learning_rate": 9.823220557290289e-06,
"loss": 0.0722,
"step": 934
},
{
"epoch": 0.42538671519563237,
"grad_norm": 0.7764742726938813,
"learning_rate": 9.822843660760434e-06,
"loss": 0.0582,
"step": 935
},
{
"epoch": 0.42584167424931757,
"grad_norm": 0.8338160462571067,
"learning_rate": 9.822466370129576e-06,
"loss": 0.0993,
"step": 936
},
{
"epoch": 0.4262966333030027,
"grad_norm": 0.7416650975880095,
"learning_rate": 9.822088685428543e-06,
"loss": 0.0782,
"step": 937
},
{
"epoch": 0.4267515923566879,
"grad_norm": 0.5969422348364739,
"learning_rate": 9.821710606688199e-06,
"loss": 0.0546,
"step": 938
},
{
"epoch": 0.42720655141037306,
"grad_norm": 0.6235404067325917,
"learning_rate": 9.82133213393944e-06,
"loss": 0.0638,
"step": 939
},
{
"epoch": 0.42766151046405826,
"grad_norm": 0.7910461101358781,
"learning_rate": 9.820953267213194e-06,
"loss": 0.0775,
"step": 940
},
{
"epoch": 0.4281164695177434,
"grad_norm": 0.692978452923811,
"learning_rate": 9.820574006540415e-06,
"loss": 0.053,
"step": 941
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.7310389759017597,
"learning_rate": 9.820194351952098e-06,
"loss": 0.0716,
"step": 942
},
{
"epoch": 0.42902638762511375,
"grad_norm": 0.6553331509390902,
"learning_rate": 9.819814303479268e-06,
"loss": 0.0612,
"step": 943
},
{
"epoch": 0.4294813466787989,
"grad_norm": 1.1310076957610966,
"learning_rate": 9.819433861152978e-06,
"loss": 0.1116,
"step": 944
},
{
"epoch": 0.4299363057324841,
"grad_norm": 0.6933766894953944,
"learning_rate": 9.819053025004316e-06,
"loss": 0.0932,
"step": 945
},
{
"epoch": 0.43039126478616924,
"grad_norm": 0.7823571557493696,
"learning_rate": 9.818671795064405e-06,
"loss": 0.0847,
"step": 946
},
{
"epoch": 0.43084622383985444,
"grad_norm": 0.8000794358590197,
"learning_rate": 9.818290171364396e-06,
"loss": 0.0916,
"step": 947
},
{
"epoch": 0.4313011828935396,
"grad_norm": 0.6207042654318157,
"learning_rate": 9.817908153935473e-06,
"loss": 0.0568,
"step": 948
},
{
"epoch": 0.4317561419472247,
"grad_norm": 0.7957970680354334,
"learning_rate": 9.817525742808854e-06,
"loss": 0.1203,
"step": 949
},
{
"epoch": 0.4322111010009099,
"grad_norm": 0.6607960765057979,
"learning_rate": 9.817142938015786e-06,
"loss": 0.069,
"step": 950
},
{
"epoch": 0.43266606005459507,
"grad_norm": 0.8132102265727185,
"learning_rate": 9.816759739587552e-06,
"loss": 0.0821,
"step": 951
},
{
"epoch": 0.43312101910828027,
"grad_norm": 0.6410149691778323,
"learning_rate": 9.816376147555464e-06,
"loss": 0.0612,
"step": 952
},
{
"epoch": 0.4335759781619654,
"grad_norm": 1.0196998859089288,
"learning_rate": 9.815992161950867e-06,
"loss": 0.1183,
"step": 953
},
{
"epoch": 0.4340309372156506,
"grad_norm": 0.5899375116434804,
"learning_rate": 9.81560778280514e-06,
"loss": 0.0604,
"step": 954
},
{
"epoch": 0.43448589626933576,
"grad_norm": 1.0046158107797931,
"learning_rate": 9.815223010149693e-06,
"loss": 0.0876,
"step": 955
},
{
"epoch": 0.4349408553230209,
"grad_norm": 0.7980339738331416,
"learning_rate": 9.814837844015966e-06,
"loss": 0.0894,
"step": 956
},
{
"epoch": 0.4353958143767061,
"grad_norm": 0.6974524248281853,
"learning_rate": 9.814452284435433e-06,
"loss": 0.0741,
"step": 957
},
{
"epoch": 0.43585077343039125,
"grad_norm": 0.7679692797858835,
"learning_rate": 9.814066331439603e-06,
"loss": 0.0796,
"step": 958
},
{
"epoch": 0.43630573248407645,
"grad_norm": 0.8183774417740679,
"learning_rate": 9.813679985060012e-06,
"loss": 0.0963,
"step": 959
},
{
"epoch": 0.4367606915377616,
"grad_norm": 0.7950656053104391,
"learning_rate": 9.81329324532823e-06,
"loss": 0.0837,
"step": 960
},
{
"epoch": 0.4372156505914468,
"grad_norm": 0.6056809369995887,
"learning_rate": 9.812906112275862e-06,
"loss": 0.0465,
"step": 961
},
{
"epoch": 0.43767060964513194,
"grad_norm": 1.0980359635620318,
"learning_rate": 9.81251858593454e-06,
"loss": 0.1206,
"step": 962
},
{
"epoch": 0.4381255686988171,
"grad_norm": 0.6123483237764059,
"learning_rate": 9.812130666335933e-06,
"loss": 0.08,
"step": 963
},
{
"epoch": 0.4385805277525023,
"grad_norm": 0.8151730014839008,
"learning_rate": 9.81174235351174e-06,
"loss": 0.0983,
"step": 964
},
{
"epoch": 0.4390354868061874,
"grad_norm": 0.7143828681073273,
"learning_rate": 9.811353647493691e-06,
"loss": 0.0809,
"step": 965
},
{
"epoch": 0.4394904458598726,
"grad_norm": 0.5647036962239634,
"learning_rate": 9.810964548313549e-06,
"loss": 0.0581,
"step": 966
},
{
"epoch": 0.43994540491355777,
"grad_norm": 0.7594400506736699,
"learning_rate": 9.81057505600311e-06,
"loss": 0.078,
"step": 967
},
{
"epoch": 0.44040036396724297,
"grad_norm": 0.6515426202345832,
"learning_rate": 9.810185170594205e-06,
"loss": 0.0688,
"step": 968
},
{
"epoch": 0.4408553230209281,
"grad_norm": 0.8798906332352223,
"learning_rate": 9.809794892118687e-06,
"loss": 0.0915,
"step": 969
},
{
"epoch": 0.44131028207461326,
"grad_norm": 0.7350866900672135,
"learning_rate": 9.809404220608451e-06,
"loss": 0.0671,
"step": 970
},
{
"epoch": 0.44176524112829846,
"grad_norm": 0.7216847217866104,
"learning_rate": 9.809013156095424e-06,
"loss": 0.0726,
"step": 971
},
{
"epoch": 0.4422202001819836,
"grad_norm": 0.8179702740752783,
"learning_rate": 9.808621698611557e-06,
"loss": 0.0758,
"step": 972
},
{
"epoch": 0.4426751592356688,
"grad_norm": 0.5533105745807706,
"learning_rate": 9.808229848188842e-06,
"loss": 0.0528,
"step": 973
},
{
"epoch": 0.44313011828935395,
"grad_norm": 0.7503486538749657,
"learning_rate": 9.807837604859296e-06,
"loss": 0.0878,
"step": 974
},
{
"epoch": 0.44358507734303915,
"grad_norm": 0.40510949005498975,
"learning_rate": 9.807444968654975e-06,
"loss": 0.0424,
"step": 975
},
{
"epoch": 0.4440400363967243,
"grad_norm": 0.8540666353042626,
"learning_rate": 9.807051939607959e-06,
"loss": 0.1108,
"step": 976
},
{
"epoch": 0.44449499545040944,
"grad_norm": 0.7543284179304937,
"learning_rate": 9.806658517750369e-06,
"loss": 0.0719,
"step": 977
},
{
"epoch": 0.44494995450409464,
"grad_norm": 0.6982493359241757,
"learning_rate": 9.80626470311435e-06,
"loss": 0.0777,
"step": 978
},
{
"epoch": 0.4454049135577798,
"grad_norm": 0.7275511253894157,
"learning_rate": 9.805870495732085e-06,
"loss": 0.0693,
"step": 979
},
{
"epoch": 0.445859872611465,
"grad_norm": 0.8647890459895436,
"learning_rate": 9.805475895635787e-06,
"loss": 0.0882,
"step": 980
},
{
"epoch": 0.4463148316651501,
"grad_norm": 0.757804762973183,
"learning_rate": 9.8050809028577e-06,
"loss": 0.0724,
"step": 981
},
{
"epoch": 0.4467697907188353,
"grad_norm": 0.7515219153063712,
"learning_rate": 9.8046855174301e-06,
"loss": 0.0659,
"step": 982
},
{
"epoch": 0.44722474977252047,
"grad_norm": 1.0502681583017184,
"learning_rate": 9.804289739385297e-06,
"loss": 0.1207,
"step": 983
},
{
"epoch": 0.44767970882620567,
"grad_norm": 0.5780062486364612,
"learning_rate": 9.803893568755633e-06,
"loss": 0.0772,
"step": 984
},
{
"epoch": 0.4481346678798908,
"grad_norm": 0.5515644567052078,
"learning_rate": 9.80349700557348e-06,
"loss": 0.0628,
"step": 985
},
{
"epoch": 0.44858962693357596,
"grad_norm": 0.6432677095504179,
"learning_rate": 9.803100049871246e-06,
"loss": 0.0817,
"step": 986
},
{
"epoch": 0.44904458598726116,
"grad_norm": 0.5424958391196154,
"learning_rate": 9.802702701681366e-06,
"loss": 0.0649,
"step": 987
},
{
"epoch": 0.4494995450409463,
"grad_norm": 0.6556126282036931,
"learning_rate": 9.80230496103631e-06,
"loss": 0.0579,
"step": 988
},
{
"epoch": 0.4499545040946315,
"grad_norm": 0.5632646083130022,
"learning_rate": 9.801906827968578e-06,
"loss": 0.0591,
"step": 989
},
{
"epoch": 0.45040946314831665,
"grad_norm": 1.0464719217252296,
"learning_rate": 9.801508302510707e-06,
"loss": 0.124,
"step": 990
},
{
"epoch": 0.45086442220200185,
"grad_norm": 0.7231067459050019,
"learning_rate": 9.801109384695261e-06,
"loss": 0.0631,
"step": 991
},
{
"epoch": 0.451319381255687,
"grad_norm": 0.775594128230074,
"learning_rate": 9.800710074554837e-06,
"loss": 0.0924,
"step": 992
},
{
"epoch": 0.45177434030937214,
"grad_norm": 0.6340180385643369,
"learning_rate": 9.800310372122066e-06,
"loss": 0.068,
"step": 993
},
{
"epoch": 0.45222929936305734,
"grad_norm": 0.9703750136380557,
"learning_rate": 9.799910277429609e-06,
"loss": 0.0902,
"step": 994
},
{
"epoch": 0.4526842584167425,
"grad_norm": 0.5881925827197537,
"learning_rate": 9.79950979051016e-06,
"loss": 0.0662,
"step": 995
},
{
"epoch": 0.4531392174704277,
"grad_norm": 0.7583235380843109,
"learning_rate": 9.799108911396446e-06,
"loss": 0.0755,
"step": 996
},
{
"epoch": 0.4535941765241128,
"grad_norm": 0.6585135755735663,
"learning_rate": 9.798707640121224e-06,
"loss": 0.0669,
"step": 997
},
{
"epoch": 0.454049135577798,
"grad_norm": 0.9344579240939844,
"learning_rate": 9.798305976717286e-06,
"loss": 0.1028,
"step": 998
},
{
"epoch": 0.45450409463148317,
"grad_norm": 0.6238360425747993,
"learning_rate": 9.79790392121745e-06,
"loss": 0.0608,
"step": 999
},
{
"epoch": 0.4549590536851683,
"grad_norm": 0.715680092291253,
"learning_rate": 9.797501473654573e-06,
"loss": 0.0792,
"step": 1000
},
{
"epoch": 0.4554140127388535,
"grad_norm": 0.8167758856821831,
"learning_rate": 9.797098634061543e-06,
"loss": 0.0948,
"step": 1001
},
{
"epoch": 0.45586897179253866,
"grad_norm": 0.8318764431867516,
"learning_rate": 9.796695402471275e-06,
"loss": 0.0967,
"step": 1002
},
{
"epoch": 0.45632393084622386,
"grad_norm": 0.9700547030363569,
"learning_rate": 9.79629177891672e-06,
"loss": 0.1138,
"step": 1003
},
{
"epoch": 0.456778889899909,
"grad_norm": 0.7702596501705347,
"learning_rate": 9.79588776343086e-06,
"loss": 0.0826,
"step": 1004
},
{
"epoch": 0.4572338489535942,
"grad_norm": 0.833778163717652,
"learning_rate": 9.795483356046711e-06,
"loss": 0.0927,
"step": 1005
},
{
"epoch": 0.45768880800727935,
"grad_norm": 0.7006737675801851,
"learning_rate": 9.795078556797318e-06,
"loss": 0.0747,
"step": 1006
},
{
"epoch": 0.4581437670609645,
"grad_norm": 0.8810114143185821,
"learning_rate": 9.794673365715761e-06,
"loss": 0.0921,
"step": 1007
},
{
"epoch": 0.4585987261146497,
"grad_norm": 0.7286145380478113,
"learning_rate": 9.794267782835148e-06,
"loss": 0.0832,
"step": 1008
},
{
"epoch": 0.45905368516833484,
"grad_norm": 0.8181887559127218,
"learning_rate": 9.793861808188622e-06,
"loss": 0.0729,
"step": 1009
},
{
"epoch": 0.45950864422202004,
"grad_norm": 1.0821839097582124,
"learning_rate": 9.793455441809359e-06,
"loss": 0.1025,
"step": 1010
},
{
"epoch": 0.4599636032757052,
"grad_norm": 0.515896949523265,
"learning_rate": 9.793048683730564e-06,
"loss": 0.0512,
"step": 1011
},
{
"epoch": 0.4604185623293904,
"grad_norm": 0.7800604571516774,
"learning_rate": 9.792641533985474e-06,
"loss": 0.1065,
"step": 1012
},
{
"epoch": 0.4608735213830755,
"grad_norm": 0.48365424866268936,
"learning_rate": 9.792233992607365e-06,
"loss": 0.0622,
"step": 1013
},
{
"epoch": 0.46132848043676067,
"grad_norm": 0.8472876133123602,
"learning_rate": 9.791826059629532e-06,
"loss": 0.0713,
"step": 1014
},
{
"epoch": 0.46178343949044587,
"grad_norm": 0.935522534168844,
"learning_rate": 9.791417735085316e-06,
"loss": 0.0853,
"step": 1015
},
{
"epoch": 0.462238398544131,
"grad_norm": 0.8028819334602026,
"learning_rate": 9.791009019008078e-06,
"loss": 0.0795,
"step": 1016
},
{
"epoch": 0.4626933575978162,
"grad_norm": 0.6458928385673616,
"learning_rate": 9.79059991143122e-06,
"loss": 0.0836,
"step": 1017
},
{
"epoch": 0.46314831665150136,
"grad_norm": 0.8309912415690437,
"learning_rate": 9.790190412388173e-06,
"loss": 0.0895,
"step": 1018
},
{
"epoch": 0.46360327570518656,
"grad_norm": 0.6953691809158898,
"learning_rate": 9.789780521912396e-06,
"loss": 0.0686,
"step": 1019
},
{
"epoch": 0.4640582347588717,
"grad_norm": 0.7563151979586233,
"learning_rate": 9.789370240037385e-06,
"loss": 0.0879,
"step": 1020
},
{
"epoch": 0.46451319381255685,
"grad_norm": 0.6646619102460968,
"learning_rate": 9.788959566796667e-06,
"loss": 0.0761,
"step": 1021
},
{
"epoch": 0.46496815286624205,
"grad_norm": 0.8092527562913561,
"learning_rate": 9.788548502223801e-06,
"loss": 0.0863,
"step": 1022
},
{
"epoch": 0.4654231119199272,
"grad_norm": 2.0284506817542396,
"learning_rate": 9.788137046352374e-06,
"loss": 0.2011,
"step": 1023
},
{
"epoch": 0.4658780709736124,
"grad_norm": 0.6524644993097855,
"learning_rate": 9.787725199216011e-06,
"loss": 0.0765,
"step": 1024
},
{
"epoch": 0.46633303002729753,
"grad_norm": 0.48134373932870766,
"learning_rate": 9.787312960848368e-06,
"loss": 0.0505,
"step": 1025
},
{
"epoch": 0.46678798908098273,
"grad_norm": 0.6646547386252114,
"learning_rate": 9.786900331283128e-06,
"loss": 0.0825,
"step": 1026
},
{
"epoch": 0.4672429481346679,
"grad_norm": 0.5655812014606527,
"learning_rate": 9.78648731055401e-06,
"loss": 0.0659,
"step": 1027
},
{
"epoch": 0.467697907188353,
"grad_norm": 0.680196435092224,
"learning_rate": 9.786073898694766e-06,
"loss": 0.0734,
"step": 1028
},
{
"epoch": 0.4681528662420382,
"grad_norm": 0.6198434008496165,
"learning_rate": 9.785660095739176e-06,
"loss": 0.0687,
"step": 1029
},
{
"epoch": 0.46860782529572337,
"grad_norm": 0.5967309034966486,
"learning_rate": 9.785245901721054e-06,
"loss": 0.0443,
"step": 1030
},
{
"epoch": 0.46906278434940857,
"grad_norm": 0.588565790719301,
"learning_rate": 9.784831316674246e-06,
"loss": 0.0741,
"step": 1031
},
{
"epoch": 0.4695177434030937,
"grad_norm": 0.6384508627867143,
"learning_rate": 9.784416340632634e-06,
"loss": 0.0639,
"step": 1032
},
{
"epoch": 0.4699727024567789,
"grad_norm": 0.528980291125106,
"learning_rate": 9.784000973630124e-06,
"loss": 0.0506,
"step": 1033
},
{
"epoch": 0.47042766151046406,
"grad_norm": 0.6297922247581061,
"learning_rate": 9.783585215700656e-06,
"loss": 0.0704,
"step": 1034
},
{
"epoch": 0.4708826205641492,
"grad_norm": 1.1014615381108162,
"learning_rate": 9.783169066878208e-06,
"loss": 0.1063,
"step": 1035
},
{
"epoch": 0.4713375796178344,
"grad_norm": 0.7370811970547196,
"learning_rate": 9.782752527196785e-06,
"loss": 0.0888,
"step": 1036
},
{
"epoch": 0.47179253867151955,
"grad_norm": 0.6272964856361817,
"learning_rate": 9.782335596690425e-06,
"loss": 0.0683,
"step": 1037
},
{
"epoch": 0.47224749772520475,
"grad_norm": 0.9675945822898259,
"learning_rate": 9.781918275393196e-06,
"loss": 0.1031,
"step": 1038
},
{
"epoch": 0.4727024567788899,
"grad_norm": 0.8448129794628584,
"learning_rate": 9.781500563339202e-06,
"loss": 0.0818,
"step": 1039
},
{
"epoch": 0.4731574158325751,
"grad_norm": 0.5148120993988892,
"learning_rate": 9.781082460562574e-06,
"loss": 0.0525,
"step": 1040
},
{
"epoch": 0.47361237488626023,
"grad_norm": 0.7767251927940846,
"learning_rate": 9.780663967097477e-06,
"loss": 0.0869,
"step": 1041
},
{
"epoch": 0.4740673339399454,
"grad_norm": 0.9661754574144388,
"learning_rate": 9.780245082978112e-06,
"loss": 0.0923,
"step": 1042
},
{
"epoch": 0.4745222929936306,
"grad_norm": 0.780061387882855,
"learning_rate": 9.779825808238705e-06,
"loss": 0.095,
"step": 1043
},
{
"epoch": 0.4749772520473157,
"grad_norm": 0.8513172657519864,
"learning_rate": 9.77940614291352e-06,
"loss": 0.0772,
"step": 1044
},
{
"epoch": 0.4754322111010009,
"grad_norm": 0.6199453465731616,
"learning_rate": 9.778986087036846e-06,
"loss": 0.0701,
"step": 1045
},
{
"epoch": 0.47588717015468607,
"grad_norm": 0.5327629714743946,
"learning_rate": 9.778565640643011e-06,
"loss": 0.0447,
"step": 1046
},
{
"epoch": 0.47634212920837127,
"grad_norm": 0.8882337205809296,
"learning_rate": 9.778144803766375e-06,
"loss": 0.0788,
"step": 1047
},
{
"epoch": 0.4767970882620564,
"grad_norm": 0.6023343672839219,
"learning_rate": 9.77772357644132e-06,
"loss": 0.0693,
"step": 1048
},
{
"epoch": 0.47725204731574156,
"grad_norm": 0.8031515985448552,
"learning_rate": 9.777301958702273e-06,
"loss": 0.0911,
"step": 1049
},
{
"epoch": 0.47770700636942676,
"grad_norm": 0.8695877166802147,
"learning_rate": 9.776879950583683e-06,
"loss": 0.12,
"step": 1050
},
{
"epoch": 0.4781619654231119,
"grad_norm": 0.6077253389668626,
"learning_rate": 9.776457552120034e-06,
"loss": 0.0722,
"step": 1051
},
{
"epoch": 0.4786169244767971,
"grad_norm": 0.7976020915977983,
"learning_rate": 9.776034763345845e-06,
"loss": 0.0783,
"step": 1052
},
{
"epoch": 0.47907188353048225,
"grad_norm": 0.7091049596783572,
"learning_rate": 9.775611584295663e-06,
"loss": 0.0739,
"step": 1053
},
{
"epoch": 0.47952684258416745,
"grad_norm": 0.7919907245184465,
"learning_rate": 9.775188015004072e-06,
"loss": 0.0728,
"step": 1054
},
{
"epoch": 0.4799818016378526,
"grad_norm": 0.9227645018819045,
"learning_rate": 9.774764055505676e-06,
"loss": 0.0905,
"step": 1055
},
{
"epoch": 0.48043676069153773,
"grad_norm": 0.7130315690029604,
"learning_rate": 9.774339705835127e-06,
"loss": 0.09,
"step": 1056
},
{
"epoch": 0.48089171974522293,
"grad_norm": 0.7993270676292756,
"learning_rate": 9.773914966027098e-06,
"loss": 0.1011,
"step": 1057
},
{
"epoch": 0.4813466787989081,
"grad_norm": 0.8955668988276211,
"learning_rate": 9.773489836116297e-06,
"loss": 0.0963,
"step": 1058
},
{
"epoch": 0.4818016378525933,
"grad_norm": 0.7582155580680914,
"learning_rate": 9.773064316137464e-06,
"loss": 0.0766,
"step": 1059
},
{
"epoch": 0.4822565969062784,
"grad_norm": 0.6939955066308027,
"learning_rate": 9.772638406125367e-06,
"loss": 0.0687,
"step": 1060
},
{
"epoch": 0.4827115559599636,
"grad_norm": 0.8091635860789653,
"learning_rate": 9.772212106114816e-06,
"loss": 0.0754,
"step": 1061
},
{
"epoch": 0.48316651501364877,
"grad_norm": 0.8236012040739623,
"learning_rate": 9.77178541614064e-06,
"loss": 0.0951,
"step": 1062
},
{
"epoch": 0.48362147406733397,
"grad_norm": 0.6622501946117725,
"learning_rate": 9.77135833623771e-06,
"loss": 0.083,
"step": 1063
},
{
"epoch": 0.4840764331210191,
"grad_norm": 0.8689743387052602,
"learning_rate": 9.770930866440927e-06,
"loss": 0.1074,
"step": 1064
},
{
"epoch": 0.48453139217470426,
"grad_norm": 0.6733750246744147,
"learning_rate": 9.770503006785214e-06,
"loss": 0.0639,
"step": 1065
},
{
"epoch": 0.48498635122838946,
"grad_norm": 0.9485233745498586,
"learning_rate": 9.770074757305541e-06,
"loss": 0.1106,
"step": 1066
},
{
"epoch": 0.4854413102820746,
"grad_norm": 0.8288392949652397,
"learning_rate": 9.769646118036902e-06,
"loss": 0.0661,
"step": 1067
},
{
"epoch": 0.4858962693357598,
"grad_norm": 0.7475423805914638,
"learning_rate": 9.76921708901432e-06,
"loss": 0.0686,
"step": 1068
},
{
"epoch": 0.48635122838944495,
"grad_norm": 0.54120364671088,
"learning_rate": 9.768787670272855e-06,
"loss": 0.0629,
"step": 1069
},
{
"epoch": 0.48680618744313015,
"grad_norm": 0.7281619635509152,
"learning_rate": 9.768357861847598e-06,
"loss": 0.0723,
"step": 1070
},
{
"epoch": 0.4872611464968153,
"grad_norm": 0.8883321717067604,
"learning_rate": 9.767927663773668e-06,
"loss": 0.0832,
"step": 1071
},
{
"epoch": 0.48771610555050043,
"grad_norm": 0.7681469789077073,
"learning_rate": 9.767497076086223e-06,
"loss": 0.0786,
"step": 1072
},
{
"epoch": 0.48817106460418563,
"grad_norm": 0.6590861395931087,
"learning_rate": 9.767066098820446e-06,
"loss": 0.0704,
"step": 1073
},
{
"epoch": 0.4886260236578708,
"grad_norm": 0.7944203702948146,
"learning_rate": 9.766634732011557e-06,
"loss": 0.0867,
"step": 1074
},
{
"epoch": 0.489080982711556,
"grad_norm": 0.7832480468570255,
"learning_rate": 9.766202975694801e-06,
"loss": 0.0873,
"step": 1075
},
{
"epoch": 0.4895359417652411,
"grad_norm": 0.7232266679451883,
"learning_rate": 9.765770829905464e-06,
"loss": 0.0785,
"step": 1076
},
{
"epoch": 0.4899909008189263,
"grad_norm": 0.5406798309730716,
"learning_rate": 9.765338294678856e-06,
"loss": 0.0469,
"step": 1077
},
{
"epoch": 0.49044585987261147,
"grad_norm": 0.5866548164219128,
"learning_rate": 9.764905370050321e-06,
"loss": 0.0524,
"step": 1078
},
{
"epoch": 0.4909008189262966,
"grad_norm": 0.9915720236606885,
"learning_rate": 9.76447205605524e-06,
"loss": 0.1019,
"step": 1079
},
{
"epoch": 0.4913557779799818,
"grad_norm": 0.6838845303274752,
"learning_rate": 9.764038352729018e-06,
"loss": 0.0891,
"step": 1080
},
{
"epoch": 0.49181073703366696,
"grad_norm": 0.9385660559352969,
"learning_rate": 9.763604260107096e-06,
"loss": 0.1058,
"step": 1081
},
{
"epoch": 0.49226569608735216,
"grad_norm": 0.6710872617569944,
"learning_rate": 9.763169778224946e-06,
"loss": 0.0665,
"step": 1082
},
{
"epoch": 0.4927206551410373,
"grad_norm": 0.7878885609137168,
"learning_rate": 9.762734907118072e-06,
"loss": 0.0876,
"step": 1083
},
{
"epoch": 0.4931756141947225,
"grad_norm": 0.6302166766090778,
"learning_rate": 9.76229964682201e-06,
"loss": 0.0507,
"step": 1084
},
{
"epoch": 0.49363057324840764,
"grad_norm": 0.5833462678864086,
"learning_rate": 9.761863997372325e-06,
"loss": 0.0612,
"step": 1085
},
{
"epoch": 0.4940855323020928,
"grad_norm": 1.036522158484448,
"learning_rate": 9.761427958804621e-06,
"loss": 0.1395,
"step": 1086
},
{
"epoch": 0.494540491355778,
"grad_norm": 1.1502320115946314,
"learning_rate": 9.760991531154526e-06,
"loss": 0.1149,
"step": 1087
},
{
"epoch": 0.49499545040946313,
"grad_norm": 0.7616054217825209,
"learning_rate": 9.760554714457704e-06,
"loss": 0.0684,
"step": 1088
},
{
"epoch": 0.49545040946314833,
"grad_norm": 0.5129309167340426,
"learning_rate": 9.760117508749846e-06,
"loss": 0.0614,
"step": 1089
},
{
"epoch": 0.4959053685168335,
"grad_norm": 0.7147170789642256,
"learning_rate": 9.759679914066686e-06,
"loss": 0.0842,
"step": 1090
},
{
"epoch": 0.4963603275705187,
"grad_norm": 0.7513123367978354,
"learning_rate": 9.759241930443975e-06,
"loss": 0.0749,
"step": 1091
},
{
"epoch": 0.4968152866242038,
"grad_norm": 0.5462870672862663,
"learning_rate": 9.75880355791751e-06,
"loss": 0.0588,
"step": 1092
},
{
"epoch": 0.49727024567788897,
"grad_norm": 0.6158644897786469,
"learning_rate": 9.758364796523105e-06,
"loss": 0.0578,
"step": 1093
},
{
"epoch": 0.49772520473157417,
"grad_norm": 0.5248367448810554,
"learning_rate": 9.757925646296617e-06,
"loss": 0.0504,
"step": 1094
},
{
"epoch": 0.4981801637852593,
"grad_norm": 0.7801307646100064,
"learning_rate": 9.757486107273935e-06,
"loss": 0.0819,
"step": 1095
},
{
"epoch": 0.4986351228389445,
"grad_norm": 0.6822936325355138,
"learning_rate": 9.75704617949097e-06,
"loss": 0.0828,
"step": 1096
},
{
"epoch": 0.49909008189262966,
"grad_norm": 0.49379397863131413,
"learning_rate": 9.756605862983675e-06,
"loss": 0.0606,
"step": 1097
},
{
"epoch": 0.49954504094631486,
"grad_norm": 0.5236513133369656,
"learning_rate": 9.756165157788029e-06,
"loss": 0.0493,
"step": 1098
},
{
"epoch": 0.5,
"grad_norm": 0.7323812225903658,
"learning_rate": 9.755724063940047e-06,
"loss": 0.0794,
"step": 1099
},
{
"epoch": 0.5004549590536852,
"grad_norm": 0.853156508842135,
"learning_rate": 9.755282581475769e-06,
"loss": 0.08,
"step": 1100
},
{
"epoch": 0.5009099181073703,
"grad_norm": 0.7117091061791435,
"learning_rate": 9.754840710431274e-06,
"loss": 0.0773,
"step": 1101
},
{
"epoch": 0.5013648771610555,
"grad_norm": 0.9350752111669145,
"learning_rate": 9.754398450842668e-06,
"loss": 0.1046,
"step": 1102
},
{
"epoch": 0.5018198362147407,
"grad_norm": 0.8834833642233855,
"learning_rate": 9.753955802746091e-06,
"loss": 0.1284,
"step": 1103
},
{
"epoch": 0.5022747952684259,
"grad_norm": 0.9022387216275947,
"learning_rate": 9.753512766177717e-06,
"loss": 0.0898,
"step": 1104
},
{
"epoch": 0.502729754322111,
"grad_norm": 0.551248880180483,
"learning_rate": 9.753069341173745e-06,
"loss": 0.0596,
"step": 1105
},
{
"epoch": 0.5031847133757962,
"grad_norm": 0.5970423480352659,
"learning_rate": 9.752625527770409e-06,
"loss": 0.0723,
"step": 1106
},
{
"epoch": 0.5036396724294814,
"grad_norm": 0.7620108531589319,
"learning_rate": 9.75218132600398e-06,
"loss": 0.0856,
"step": 1107
},
{
"epoch": 0.5040946314831665,
"grad_norm": 0.7720887684681512,
"learning_rate": 9.751736735910753e-06,
"loss": 0.0904,
"step": 1108
},
{
"epoch": 0.5045495905368517,
"grad_norm": 0.8672659681858957,
"learning_rate": 9.75129175752706e-06,
"loss": 0.1043,
"step": 1109
},
{
"epoch": 0.5050045495905369,
"grad_norm": 0.7511079874116621,
"learning_rate": 9.75084639088926e-06,
"loss": 0.0719,
"step": 1110
}
],
"logging_steps": 1,
"max_steps": 10990,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 555,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7279902056448.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}