{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25250227479526843,
"eval_steps": 500,
"global_step": 555,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00045495905368516835,
"grad_norm": 9.461428161462043,
"learning_rate": 1e-05,
"loss": 0.1263,
"step": 1
},
{
"epoch": 0.0009099181073703367,
"grad_norm": 5.190780450250769,
"learning_rate": 9.99999979571129e-06,
"loss": 0.1723,
"step": 2
},
{
"epoch": 0.001364877161055505,
"grad_norm": 7.521926017130347,
"learning_rate": 9.999999182845177e-06,
"loss": 0.1327,
"step": 3
},
{
"epoch": 0.0018198362147406734,
"grad_norm": 2.5665810200307217,
"learning_rate": 9.99999816140171e-06,
"loss": 0.1095,
"step": 4
},
{
"epoch": 0.0022747952684258415,
"grad_norm": 2.738508706395883,
"learning_rate": 9.999996731380973e-06,
"loss": 0.1151,
"step": 5
},
{
"epoch": 0.00272975432211101,
"grad_norm": 2.67941899677245,
"learning_rate": 9.999994892783083e-06,
"loss": 0.0821,
"step": 6
},
{
"epoch": 0.0031847133757961785,
"grad_norm": 2.137586234420784,
"learning_rate": 9.99999264560819e-06,
"loss": 0.0729,
"step": 7
},
{
"epoch": 0.003639672429481347,
"grad_norm": 2.8221590420989164,
"learning_rate": 9.999989989856477e-06,
"loss": 0.0929,
"step": 8
},
{
"epoch": 0.004094631483166515,
"grad_norm": 1.6167314639784554,
"learning_rate": 9.999986925528164e-06,
"loss": 0.0466,
"step": 9
},
{
"epoch": 0.004549590536851683,
"grad_norm": 2.1773262431631313,
"learning_rate": 9.999983452623498e-06,
"loss": 0.0709,
"step": 10
},
{
"epoch": 0.005004549590536852,
"grad_norm": 7.6444390817806465,
"learning_rate": 9.999979571142765e-06,
"loss": 0.0809,
"step": 11
},
{
"epoch": 0.00545950864422202,
"grad_norm": 2.034523884241798,
"learning_rate": 9.999975281086278e-06,
"loss": 0.0839,
"step": 12
},
{
"epoch": 0.005914467697907188,
"grad_norm": 3.576108282005355,
"learning_rate": 9.999970582454392e-06,
"loss": 0.0728,
"step": 13
},
{
"epoch": 0.006369426751592357,
"grad_norm": 2.623641566468802,
"learning_rate": 9.999965475247491e-06,
"loss": 0.1052,
"step": 14
},
{
"epoch": 0.006824385805277525,
"grad_norm": 2.1413574998269085,
"learning_rate": 9.99995995946599e-06,
"loss": 0.0885,
"step": 15
},
{
"epoch": 0.007279344858962694,
"grad_norm": 1.4859066724415246,
"learning_rate": 9.999954035110342e-06,
"loss": 0.0644,
"step": 16
},
{
"epoch": 0.0077343039126478615,
"grad_norm": 2.851793157608408,
"learning_rate": 9.999947702181027e-06,
"loss": 0.1057,
"step": 17
},
{
"epoch": 0.00818926296633303,
"grad_norm": 4.693829546662477,
"learning_rate": 9.999940960678568e-06,
"loss": 0.0867,
"step": 18
},
{
"epoch": 0.008644222020018199,
"grad_norm": 2.2728033563417362,
"learning_rate": 9.999933810603513e-06,
"loss": 0.0789,
"step": 19
},
{
"epoch": 0.009099181073703366,
"grad_norm": 1.6705986173507794,
"learning_rate": 9.999926251956447e-06,
"loss": 0.0683,
"step": 20
},
{
"epoch": 0.009554140127388535,
"grad_norm": 2.187579869114393,
"learning_rate": 9.999918284737986e-06,
"loss": 0.0984,
"step": 21
},
{
"epoch": 0.010009099181073703,
"grad_norm": 2.328040268012338,
"learning_rate": 9.999909908948782e-06,
"loss": 0.0699,
"step": 22
},
{
"epoch": 0.010464058234758872,
"grad_norm": 5.572389775693198,
"learning_rate": 9.999901124589519e-06,
"loss": 0.0912,
"step": 23
},
{
"epoch": 0.01091901728844404,
"grad_norm": 1.84796719674859,
"learning_rate": 9.999891931660916e-06,
"loss": 0.1015,
"step": 24
},
{
"epoch": 0.011373976342129208,
"grad_norm": 1.7501762990792236,
"learning_rate": 9.999882330163725e-06,
"loss": 0.0909,
"step": 25
},
{
"epoch": 0.011828935395814377,
"grad_norm": 0.9922115950592263,
"learning_rate": 9.999872320098729e-06,
"loss": 0.0656,
"step": 26
},
{
"epoch": 0.012283894449499545,
"grad_norm": 1.5612370560987539,
"learning_rate": 9.999861901466746e-06,
"loss": 0.0974,
"step": 27
},
{
"epoch": 0.012738853503184714,
"grad_norm": 1.4617271794930395,
"learning_rate": 9.999851074268625e-06,
"loss": 0.0853,
"step": 28
},
{
"epoch": 0.013193812556869881,
"grad_norm": 1.8127085104491556,
"learning_rate": 9.999839838505257e-06,
"loss": 0.1081,
"step": 29
},
{
"epoch": 0.01364877161055505,
"grad_norm": 1.4710105512612208,
"learning_rate": 9.999828194177555e-06,
"loss": 0.0868,
"step": 30
},
{
"epoch": 0.014103730664240218,
"grad_norm": 1.3474487189311888,
"learning_rate": 9.999816141286472e-06,
"loss": 0.0817,
"step": 31
},
{
"epoch": 0.014558689717925387,
"grad_norm": 1.0967596652549403,
"learning_rate": 9.99980367983299e-06,
"loss": 0.0637,
"step": 32
},
{
"epoch": 0.015013648771610554,
"grad_norm": 3.179425671823194,
"learning_rate": 9.999790809818134e-06,
"loss": 0.069,
"step": 33
},
{
"epoch": 0.015468607825295723,
"grad_norm": 4.482257681577152,
"learning_rate": 9.999777531242951e-06,
"loss": 0.0915,
"step": 34
},
{
"epoch": 0.01592356687898089,
"grad_norm": 3.953299040475791,
"learning_rate": 9.999763844108528e-06,
"loss": 0.0562,
"step": 35
},
{
"epoch": 0.01637852593266606,
"grad_norm": 1.1127201050382067,
"learning_rate": 9.999749748415982e-06,
"loss": 0.0556,
"step": 36
},
{
"epoch": 0.01683348498635123,
"grad_norm": 79.45756094624792,
"learning_rate": 9.999735244166464e-06,
"loss": 0.1223,
"step": 37
},
{
"epoch": 0.017288444040036398,
"grad_norm": 2777.9092912017113,
"learning_rate": 9.99972033136116e-06,
"loss": 0.3211,
"step": 38
},
{
"epoch": 0.017743403093721567,
"grad_norm": 2.5204693177238466,
"learning_rate": 9.999705010001291e-06,
"loss": 0.0723,
"step": 39
},
{
"epoch": 0.018198362147406732,
"grad_norm": 2.2975907071135655,
"learning_rate": 9.999689280088105e-06,
"loss": 0.0696,
"step": 40
},
{
"epoch": 0.0186533212010919,
"grad_norm": 2.998434349074003,
"learning_rate": 9.99967314162289e-06,
"loss": 0.083,
"step": 41
},
{
"epoch": 0.01910828025477707,
"grad_norm": 3.882239448575704,
"learning_rate": 9.999656594606966e-06,
"loss": 0.1015,
"step": 42
},
{
"epoch": 0.019563239308462238,
"grad_norm": 3.5286596480512493,
"learning_rate": 9.999639639041681e-06,
"loss": 0.0817,
"step": 43
},
{
"epoch": 0.020018198362147407,
"grad_norm": 1.6933989447443707,
"learning_rate": 9.999622274928424e-06,
"loss": 0.1003,
"step": 44
},
{
"epoch": 0.020473157415832575,
"grad_norm": 1.2483160046323276,
"learning_rate": 9.999604502268614e-06,
"loss": 0.0952,
"step": 45
},
{
"epoch": 0.020928116469517744,
"grad_norm": 0.9417906124383243,
"learning_rate": 9.9995863210637e-06,
"loss": 0.0731,
"step": 46
},
{
"epoch": 0.021383075523202913,
"grad_norm": 2.8195414757816897,
"learning_rate": 9.99956773131517e-06,
"loss": 0.1845,
"step": 47
},
{
"epoch": 0.02183803457688808,
"grad_norm": 2.74390379471345,
"learning_rate": 9.999548733024545e-06,
"loss": 0.1826,
"step": 48
},
{
"epoch": 0.022292993630573247,
"grad_norm": 1.5138494619527987,
"learning_rate": 9.999529326193373e-06,
"loss": 0.0857,
"step": 49
},
{
"epoch": 0.022747952684258416,
"grad_norm": 1.215379974181271,
"learning_rate": 9.999509510823242e-06,
"loss": 0.0686,
"step": 50
},
{
"epoch": 0.023202911737943584,
"grad_norm": 1.292187967807859,
"learning_rate": 9.999489286915773e-06,
"loss": 0.0707,
"step": 51
},
{
"epoch": 0.023657870791628753,
"grad_norm": 1.7888013203563982,
"learning_rate": 9.999468654472614e-06,
"loss": 0.0682,
"step": 52
},
{
"epoch": 0.024112829845313922,
"grad_norm": 0.8979425621703144,
"learning_rate": 9.999447613495457e-06,
"loss": 0.0508,
"step": 53
},
{
"epoch": 0.02456778889899909,
"grad_norm": 1.9123835444775663,
"learning_rate": 9.99942616398602e-06,
"loss": 0.0689,
"step": 54
},
{
"epoch": 0.02502274795268426,
"grad_norm": 0.9393581994096443,
"learning_rate": 9.99940430594605e-06,
"loss": 0.0496,
"step": 55
},
{
"epoch": 0.025477707006369428,
"grad_norm": 1.0234476513644222,
"learning_rate": 9.999382039377339e-06,
"loss": 0.0601,
"step": 56
},
{
"epoch": 0.025932666060054597,
"grad_norm": 0.9291387208138827,
"learning_rate": 9.999359364281704e-06,
"loss": 0.0377,
"step": 57
},
{
"epoch": 0.026387625113739762,
"grad_norm": 1.8209170803663992,
"learning_rate": 9.999336280660999e-06,
"loss": 0.1144,
"step": 58
},
{
"epoch": 0.02684258416742493,
"grad_norm": 1.1214625046464874,
"learning_rate": 9.99931278851711e-06,
"loss": 0.0622,
"step": 59
},
{
"epoch": 0.0272975432211101,
"grad_norm": 1.0331723997917317,
"learning_rate": 9.999288887851956e-06,
"loss": 0.0667,
"step": 60
},
{
"epoch": 0.027752502274795268,
"grad_norm": 1.0412381501406744,
"learning_rate": 9.999264578667493e-06,
"loss": 0.0566,
"step": 61
},
{
"epoch": 0.028207461328480437,
"grad_norm": 1.4510603110658047,
"learning_rate": 9.999239860965703e-06,
"loss": 0.0845,
"step": 62
},
{
"epoch": 0.028662420382165606,
"grad_norm": 1.301162540669183,
"learning_rate": 9.999214734748609e-06,
"loss": 0.0759,
"step": 63
},
{
"epoch": 0.029117379435850774,
"grad_norm": 0.9977688847603402,
"learning_rate": 9.999189200018263e-06,
"loss": 0.0528,
"step": 64
},
{
"epoch": 0.029572338489535943,
"grad_norm": 1.2894688842348854,
"learning_rate": 9.99916325677675e-06,
"loss": 0.0899,
"step": 65
},
{
"epoch": 0.03002729754322111,
"grad_norm": 1.4627871680702638,
"learning_rate": 9.999136905026194e-06,
"loss": 0.1456,
"step": 66
},
{
"epoch": 0.030482256596906277,
"grad_norm": 1.2304385710214434,
"learning_rate": 9.999110144768745e-06,
"loss": 0.079,
"step": 67
},
{
"epoch": 0.030937215650591446,
"grad_norm": 1.085016380732753,
"learning_rate": 9.99908297600659e-06,
"loss": 0.0696,
"step": 68
},
{
"epoch": 0.03139217470427662,
"grad_norm": 0.989450558642297,
"learning_rate": 9.99905539874195e-06,
"loss": 0.069,
"step": 69
},
{
"epoch": 0.03184713375796178,
"grad_norm": 1.0510491151133208,
"learning_rate": 9.99902741297708e-06,
"loss": 0.0555,
"step": 70
},
{
"epoch": 0.03230209281164695,
"grad_norm": 0.8938033562648371,
"learning_rate": 9.998999018714264e-06,
"loss": 0.0783,
"step": 71
},
{
"epoch": 0.03275705186533212,
"grad_norm": 2.902512108322722,
"learning_rate": 9.998970215955824e-06,
"loss": 0.0702,
"step": 72
},
{
"epoch": 0.033212010919017286,
"grad_norm": 0.7661831894133686,
"learning_rate": 9.998941004704113e-06,
"loss": 0.0519,
"step": 73
},
{
"epoch": 0.03366696997270246,
"grad_norm": 1.1047249497744047,
"learning_rate": 9.998911384961518e-06,
"loss": 0.0773,
"step": 74
},
{
"epoch": 0.034121929026387623,
"grad_norm": 0.7750047299312716,
"learning_rate": 9.998881356730458e-06,
"loss": 0.0598,
"step": 75
},
{
"epoch": 0.034576888080072796,
"grad_norm": 0.9815801555720315,
"learning_rate": 9.99885092001339e-06,
"loss": 0.0661,
"step": 76
},
{
"epoch": 0.03503184713375796,
"grad_norm": 1.3090963451351905,
"learning_rate": 9.998820074812799e-06,
"loss": 0.0713,
"step": 77
},
{
"epoch": 0.03548680618744313,
"grad_norm": 1.1489338732270693,
"learning_rate": 9.998788821131207e-06,
"loss": 0.0946,
"step": 78
},
{
"epoch": 0.0359417652411283,
"grad_norm": 0.9040381990998293,
"learning_rate": 9.998757158971164e-06,
"loss": 0.067,
"step": 79
},
{
"epoch": 0.036396724294813464,
"grad_norm": 1.1019926198229115,
"learning_rate": 9.998725088335263e-06,
"loss": 0.0874,
"step": 80
},
{
"epoch": 0.036851683348498636,
"grad_norm": 0.5779852750462403,
"learning_rate": 9.99869260922612e-06,
"loss": 0.0492,
"step": 81
},
{
"epoch": 0.0373066424021838,
"grad_norm": 1.2769852710418472,
"learning_rate": 9.998659721646393e-06,
"loss": 0.0781,
"step": 82
},
{
"epoch": 0.03776160145586897,
"grad_norm": 0.9020624084974485,
"learning_rate": 9.998626425598766e-06,
"loss": 0.0734,
"step": 83
},
{
"epoch": 0.03821656050955414,
"grad_norm": 0.9626764462141776,
"learning_rate": 9.99859272108596e-06,
"loss": 0.0719,
"step": 84
},
{
"epoch": 0.03867151956323931,
"grad_norm": 0.9435885887029873,
"learning_rate": 9.998558608110733e-06,
"loss": 0.0835,
"step": 85
},
{
"epoch": 0.039126478616924476,
"grad_norm": 1.0578725525123687,
"learning_rate": 9.998524086675867e-06,
"loss": 0.0746,
"step": 86
},
{
"epoch": 0.03958143767060965,
"grad_norm": 1.0366588534208079,
"learning_rate": 9.998489156784188e-06,
"loss": 0.0933,
"step": 87
},
{
"epoch": 0.040036396724294813,
"grad_norm": 1.0595948680723846,
"learning_rate": 9.998453818438547e-06,
"loss": 0.0846,
"step": 88
},
{
"epoch": 0.04049135577797998,
"grad_norm": 0.8807515753016749,
"learning_rate": 9.998418071641833e-06,
"loss": 0.0649,
"step": 89
},
{
"epoch": 0.04094631483166515,
"grad_norm": 0.9034225145874141,
"learning_rate": 9.998381916396967e-06,
"loss": 0.0621,
"step": 90
},
{
"epoch": 0.041401273885350316,
"grad_norm": 0.6732889821553815,
"learning_rate": 9.998345352706901e-06,
"loss": 0.0367,
"step": 91
},
{
"epoch": 0.04185623293903549,
"grad_norm": 0.7136967603743426,
"learning_rate": 9.998308380574628e-06,
"loss": 0.0569,
"step": 92
},
{
"epoch": 0.042311191992720654,
"grad_norm": 1.1459385364035048,
"learning_rate": 9.998271000003166e-06,
"loss": 0.1184,
"step": 93
},
{
"epoch": 0.042766151046405826,
"grad_norm": 0.8224906129097734,
"learning_rate": 9.998233210995569e-06,
"loss": 0.0682,
"step": 94
},
{
"epoch": 0.04322111010009099,
"grad_norm": 1.5182946932236698,
"learning_rate": 9.998195013554926e-06,
"loss": 0.0875,
"step": 95
},
{
"epoch": 0.04367606915377616,
"grad_norm": 0.9355855711018981,
"learning_rate": 9.998156407684359e-06,
"loss": 0.0939,
"step": 96
},
{
"epoch": 0.04413102820746133,
"grad_norm": 0.7329840867165283,
"learning_rate": 9.998117393387022e-06,
"loss": 0.0466,
"step": 97
},
{
"epoch": 0.044585987261146494,
"grad_norm": 0.8701001036058451,
"learning_rate": 9.9980779706661e-06,
"loss": 0.0729,
"step": 98
},
{
"epoch": 0.045040946314831666,
"grad_norm": 1.0218896298663185,
"learning_rate": 9.99803813952482e-06,
"loss": 0.0828,
"step": 99
},
{
"epoch": 0.04549590536851683,
"grad_norm": 0.9044995357273884,
"learning_rate": 9.997997899966433e-06,
"loss": 0.0709,
"step": 100
},
{
"epoch": 0.045950864422202004,
"grad_norm": 0.9877796099816964,
"learning_rate": 9.99795725199423e-06,
"loss": 0.0903,
"step": 101
},
{
"epoch": 0.04640582347588717,
"grad_norm": 1.0061501994463906,
"learning_rate": 9.99791619561153e-06,
"loss": 0.0831,
"step": 102
},
{
"epoch": 0.04686078252957234,
"grad_norm": 0.8789173954818107,
"learning_rate": 9.997874730821689e-06,
"loss": 0.0714,
"step": 103
},
{
"epoch": 0.047315741583257506,
"grad_norm": 15.480920098194954,
"learning_rate": 9.997832857628093e-06,
"loss": 0.2603,
"step": 104
},
{
"epoch": 0.04777070063694268,
"grad_norm": 1.3806761301603454,
"learning_rate": 9.99779057603417e-06,
"loss": 0.1227,
"step": 105
},
{
"epoch": 0.048225659690627844,
"grad_norm": 0.8462176607269959,
"learning_rate": 9.997747886043368e-06,
"loss": 0.0605,
"step": 106
},
{
"epoch": 0.04868061874431301,
"grad_norm": 0.7467169847716549,
"learning_rate": 9.997704787659179e-06,
"loss": 0.0618,
"step": 107
},
{
"epoch": 0.04913557779799818,
"grad_norm": 1.5653334818977065,
"learning_rate": 9.997661280885125e-06,
"loss": 0.1253,
"step": 108
},
{
"epoch": 0.049590536851683346,
"grad_norm": 0.871706038604149,
"learning_rate": 9.99761736572476e-06,
"loss": 0.0716,
"step": 109
},
{
"epoch": 0.05004549590536852,
"grad_norm": 1.1398296008355844,
"learning_rate": 9.997573042181672e-06,
"loss": 0.0698,
"step": 110
},
{
"epoch": 0.050500454959053684,
"grad_norm": 1.0487992691419916,
"learning_rate": 9.997528310259485e-06,
"loss": 0.1102,
"step": 111
},
{
"epoch": 0.050955414012738856,
"grad_norm": 0.9112684449646818,
"learning_rate": 9.997483169961852e-06,
"loss": 0.1032,
"step": 112
},
{
"epoch": 0.05141037306642402,
"grad_norm": 0.9418790141923585,
"learning_rate": 9.997437621292463e-06,
"loss": 0.0771,
"step": 113
},
{
"epoch": 0.051865332120109194,
"grad_norm": 0.7796140692842074,
"learning_rate": 9.99739166425504e-06,
"loss": 0.0627,
"step": 114
},
{
"epoch": 0.05232029117379436,
"grad_norm": 1.5434421216734795,
"learning_rate": 9.997345298853339e-06,
"loss": 0.1495,
"step": 115
},
{
"epoch": 0.052775250227479524,
"grad_norm": 0.8898179660551836,
"learning_rate": 9.997298525091148e-06,
"loss": 0.0735,
"step": 116
},
{
"epoch": 0.053230209281164696,
"grad_norm": 0.8585916871524272,
"learning_rate": 9.997251342972288e-06,
"loss": 0.068,
"step": 117
},
{
"epoch": 0.05368516833484986,
"grad_norm": 0.812806800238708,
"learning_rate": 9.997203752500616e-06,
"loss": 0.0689,
"step": 118
},
{
"epoch": 0.054140127388535034,
"grad_norm": 0.9677722064277628,
"learning_rate": 9.997155753680021e-06,
"loss": 0.0795,
"step": 119
},
{
"epoch": 0.0545950864422202,
"grad_norm": 1.621934591654054,
"learning_rate": 9.997107346514425e-06,
"loss": 0.0707,
"step": 120
},
{
"epoch": 0.05505004549590537,
"grad_norm": 0.6750452750311531,
"learning_rate": 9.997058531007782e-06,
"loss": 0.0588,
"step": 121
},
{
"epoch": 0.055505004549590536,
"grad_norm": 0.9583870506818666,
"learning_rate": 9.997009307164083e-06,
"loss": 0.0859,
"step": 122
},
{
"epoch": 0.05595996360327571,
"grad_norm": 1.247483970027119,
"learning_rate": 9.99695967498735e-06,
"loss": 0.0952,
"step": 123
},
{
"epoch": 0.056414922656960874,
"grad_norm": 0.7937903902273558,
"learning_rate": 9.996909634481639e-06,
"loss": 0.0614,
"step": 124
},
{
"epoch": 0.05686988171064604,
"grad_norm": 4.855426128828546,
"learning_rate": 9.996859185651038e-06,
"loss": 0.1629,
"step": 125
},
{
"epoch": 0.05732484076433121,
"grad_norm": 1.0499970639607177,
"learning_rate": 9.99680832849967e-06,
"loss": 0.1031,
"step": 126
},
{
"epoch": 0.05777979981801638,
"grad_norm": 0.8730447821488512,
"learning_rate": 9.99675706303169e-06,
"loss": 0.0606,
"step": 127
},
{
"epoch": 0.05823475887170155,
"grad_norm": 1.2779985416162813,
"learning_rate": 9.99670538925129e-06,
"loss": 0.074,
"step": 128
},
{
"epoch": 0.058689717925386714,
"grad_norm": 0.8606157718419157,
"learning_rate": 9.996653307162687e-06,
"loss": 0.0703,
"step": 129
},
{
"epoch": 0.059144676979071886,
"grad_norm": 0.8920761218762643,
"learning_rate": 9.996600816770144e-06,
"loss": 0.0818,
"step": 130
},
{
"epoch": 0.05959963603275705,
"grad_norm": 1.1603462045917847,
"learning_rate": 9.996547918077944e-06,
"loss": 0.1148,
"step": 131
},
{
"epoch": 0.06005459508644222,
"grad_norm": 0.9108713801214797,
"learning_rate": 9.996494611090414e-06,
"loss": 0.0884,
"step": 132
},
{
"epoch": 0.06050955414012739,
"grad_norm": 0.6523725468628359,
"learning_rate": 9.996440895811907e-06,
"loss": 0.0535,
"step": 133
},
{
"epoch": 0.060964513193812554,
"grad_norm": 0.8812777694752004,
"learning_rate": 9.996386772246816e-06,
"loss": 0.087,
"step": 134
},
{
"epoch": 0.061419472247497726,
"grad_norm": 1.0622191207422995,
"learning_rate": 9.99633224039956e-06,
"loss": 0.0982,
"step": 135
},
{
"epoch": 0.06187443130118289,
"grad_norm": 3.7961077321923025,
"learning_rate": 9.996277300274596e-06,
"loss": 0.1526,
"step": 136
},
{
"epoch": 0.062329390354868064,
"grad_norm": 0.9444433559435487,
"learning_rate": 9.996221951876415e-06,
"loss": 0.0996,
"step": 137
},
{
"epoch": 0.06278434940855324,
"grad_norm": 1.444871481552235,
"learning_rate": 9.996166195209539e-06,
"loss": 0.1075,
"step": 138
},
{
"epoch": 0.0632393084622384,
"grad_norm": 0.7446446480732116,
"learning_rate": 9.996110030278522e-06,
"loss": 0.0561,
"step": 139
},
{
"epoch": 0.06369426751592357,
"grad_norm": 0.8913010543094952,
"learning_rate": 9.996053457087958e-06,
"loss": 0.0715,
"step": 140
},
{
"epoch": 0.06414922656960874,
"grad_norm": 0.7815821404043856,
"learning_rate": 9.995996475642466e-06,
"loss": 0.0796,
"step": 141
},
{
"epoch": 0.0646041856232939,
"grad_norm": 0.74337588448595,
"learning_rate": 9.995939085946704e-06,
"loss": 0.0661,
"step": 142
},
{
"epoch": 0.06505914467697907,
"grad_norm": 0.9974255688753435,
"learning_rate": 9.995881288005363e-06,
"loss": 0.0869,
"step": 143
},
{
"epoch": 0.06551410373066424,
"grad_norm": 1.2260290141946268,
"learning_rate": 9.995823081823162e-06,
"loss": 0.0766,
"step": 144
},
{
"epoch": 0.06596906278434941,
"grad_norm": 0.9751795993584637,
"learning_rate": 9.99576446740486e-06,
"loss": 0.091,
"step": 145
},
{
"epoch": 0.06642402183803457,
"grad_norm": 1.6175476325168967,
"learning_rate": 9.995705444755249e-06,
"loss": 0.1208,
"step": 146
},
{
"epoch": 0.06687898089171974,
"grad_norm": 0.7580083688127299,
"learning_rate": 9.995646013879147e-06,
"loss": 0.0622,
"step": 147
},
{
"epoch": 0.06733393994540492,
"grad_norm": 1.0194887039793072,
"learning_rate": 9.995586174781413e-06,
"loss": 0.0753,
"step": 148
},
{
"epoch": 0.06778889899909009,
"grad_norm": 0.9065646408503975,
"learning_rate": 9.995525927466936e-06,
"loss": 0.0848,
"step": 149
},
{
"epoch": 0.06824385805277525,
"grad_norm": 0.8871078738477127,
"learning_rate": 9.995465271940641e-06,
"loss": 0.0607,
"step": 150
},
{
"epoch": 0.06869881710646042,
"grad_norm": 1.1486707652049646,
"learning_rate": 9.995404208207485e-06,
"loss": 0.0809,
"step": 151
},
{
"epoch": 0.06915377616014559,
"grad_norm": 1.1473150526096232,
"learning_rate": 9.995342736272453e-06,
"loss": 0.1035,
"step": 152
},
{
"epoch": 0.06960873521383075,
"grad_norm": 1.3025683052462544,
"learning_rate": 9.995280856140572e-06,
"loss": 0.1197,
"step": 153
},
{
"epoch": 0.07006369426751592,
"grad_norm": 0.8069596755970996,
"learning_rate": 9.9952185678169e-06,
"loss": 0.0526,
"step": 154
},
{
"epoch": 0.0705186533212011,
"grad_norm": 0.8153700064848134,
"learning_rate": 9.995155871306524e-06,
"loss": 0.0613,
"step": 155
},
{
"epoch": 0.07097361237488627,
"grad_norm": 0.7319023745966868,
"learning_rate": 9.995092766614567e-06,
"loss": 0.0512,
"step": 156
},
{
"epoch": 0.07142857142857142,
"grad_norm": 1.0146656175738817,
"learning_rate": 9.995029253746186e-06,
"loss": 0.0846,
"step": 157
},
{
"epoch": 0.0718835304822566,
"grad_norm": 0.8015254985373994,
"learning_rate": 9.994965332706574e-06,
"loss": 0.0619,
"step": 158
},
{
"epoch": 0.07233848953594177,
"grad_norm": 1.0630207312416284,
"learning_rate": 9.994901003500952e-06,
"loss": 0.0796,
"step": 159
},
{
"epoch": 0.07279344858962693,
"grad_norm": 0.9431304991088505,
"learning_rate": 9.994836266134575e-06,
"loss": 0.0743,
"step": 160
},
{
"epoch": 0.0732484076433121,
"grad_norm": 1.023738915097686,
"learning_rate": 9.994771120612737e-06,
"loss": 0.0888,
"step": 161
},
{
"epoch": 0.07370336669699727,
"grad_norm": 0.9272637744585672,
"learning_rate": 9.994705566940757e-06,
"loss": 0.084,
"step": 162
},
{
"epoch": 0.07415832575068244,
"grad_norm": 1.122378326253592,
"learning_rate": 9.994639605123994e-06,
"loss": 0.0961,
"step": 163
},
{
"epoch": 0.0746132848043676,
"grad_norm": 0.753531768411978,
"learning_rate": 9.994573235167839e-06,
"loss": 0.0736,
"step": 164
},
{
"epoch": 0.07506824385805277,
"grad_norm": 0.9314766958597749,
"learning_rate": 9.994506457077715e-06,
"loss": 0.0838,
"step": 165
},
{
"epoch": 0.07552320291173795,
"grad_norm": 0.996008388557059,
"learning_rate": 9.994439270859077e-06,
"loss": 0.1076,
"step": 166
},
{
"epoch": 0.07597816196542312,
"grad_norm": 0.9199332464612126,
"learning_rate": 9.994371676517418e-06,
"loss": 0.0724,
"step": 167
},
{
"epoch": 0.07643312101910828,
"grad_norm": 0.8652292283168678,
"learning_rate": 9.994303674058259e-06,
"loss": 0.0628,
"step": 168
},
{
"epoch": 0.07688808007279345,
"grad_norm": 0.8176262426438138,
"learning_rate": 9.994235263487158e-06,
"loss": 0.0743,
"step": 169
},
{
"epoch": 0.07734303912647862,
"grad_norm": 0.8147855247941459,
"learning_rate": 9.994166444809705e-06,
"loss": 0.0559,
"step": 170
},
{
"epoch": 0.07779799818016378,
"grad_norm": 0.7853019575635352,
"learning_rate": 9.994097218031524e-06,
"loss": 0.0681,
"step": 171
},
{
"epoch": 0.07825295723384895,
"grad_norm": 0.8445610480134321,
"learning_rate": 9.994027583158272e-06,
"loss": 0.0785,
"step": 172
},
{
"epoch": 0.07870791628753412,
"grad_norm": 0.8555498692388026,
"learning_rate": 9.993957540195638e-06,
"loss": 0.077,
"step": 173
},
{
"epoch": 0.0791628753412193,
"grad_norm": 0.8281270493499452,
"learning_rate": 9.993887089149346e-06,
"loss": 0.0848,
"step": 174
},
{
"epoch": 0.07961783439490445,
"grad_norm": 0.7180425978661062,
"learning_rate": 9.993816230025152e-06,
"loss": 0.0588,
"step": 175
},
{
"epoch": 0.08007279344858963,
"grad_norm": 0.9287545326980071,
"learning_rate": 9.99374496282885e-06,
"loss": 0.0874,
"step": 176
},
{
"epoch": 0.0805277525022748,
"grad_norm": 1.5950603980195528,
"learning_rate": 9.993673287566261e-06,
"loss": 0.1301,
"step": 177
},
{
"epoch": 0.08098271155595996,
"grad_norm": 0.505966633973175,
"learning_rate": 9.99360120424324e-06,
"loss": 0.0459,
"step": 178
},
{
"epoch": 0.08143767060964513,
"grad_norm": 0.6170796905443107,
"learning_rate": 9.993528712865681e-06,
"loss": 0.0666,
"step": 179
},
{
"epoch": 0.0818926296633303,
"grad_norm": 0.8965600572228928,
"learning_rate": 9.993455813439507e-06,
"loss": 0.0648,
"step": 180
},
{
"epoch": 0.08234758871701547,
"grad_norm": 0.7555745664692847,
"learning_rate": 9.993382505970673e-06,
"loss": 0.0479,
"step": 181
},
{
"epoch": 0.08280254777070063,
"grad_norm": 0.7885826993774436,
"learning_rate": 9.99330879046517e-06,
"loss": 0.0605,
"step": 182
},
{
"epoch": 0.0832575068243858,
"grad_norm": 0.6970911126559147,
"learning_rate": 9.993234666929024e-06,
"loss": 0.0545,
"step": 183
},
{
"epoch": 0.08371246587807098,
"grad_norm": 0.8281240642020996,
"learning_rate": 9.99316013536829e-06,
"loss": 0.0651,
"step": 184
},
{
"epoch": 0.08416742493175614,
"grad_norm": 0.8497823551734951,
"learning_rate": 9.993085195789057e-06,
"loss": 0.098,
"step": 185
},
{
"epoch": 0.08462238398544131,
"grad_norm": 0.8425278224044996,
"learning_rate": 9.993009848197452e-06,
"loss": 0.0861,
"step": 186
},
{
"epoch": 0.08507734303912648,
"grad_norm": 0.729342450692031,
"learning_rate": 9.992934092599629e-06,
"loss": 0.0651,
"step": 187
},
{
"epoch": 0.08553230209281165,
"grad_norm": 0.8810253378927329,
"learning_rate": 9.99285792900178e-06,
"loss": 0.0995,
"step": 188
},
{
"epoch": 0.08598726114649681,
"grad_norm": 1.0402457083445067,
"learning_rate": 9.992781357410131e-06,
"loss": 0.1061,
"step": 189
},
{
"epoch": 0.08644222020018198,
"grad_norm": 0.7397036090930822,
"learning_rate": 9.992704377830934e-06,
"loss": 0.0571,
"step": 190
},
{
"epoch": 0.08689717925386715,
"grad_norm": 1.4783630598693296,
"learning_rate": 9.992626990270484e-06,
"loss": 0.1154,
"step": 191
},
{
"epoch": 0.08735213830755233,
"grad_norm": 1.1100322283473036,
"learning_rate": 9.992549194735101e-06,
"loss": 0.1179,
"step": 192
},
{
"epoch": 0.08780709736123748,
"grad_norm": 0.5797984556503705,
"learning_rate": 9.992470991231144e-06,
"loss": 0.0466,
"step": 193
},
{
"epoch": 0.08826205641492266,
"grad_norm": 1.059908713900853,
"learning_rate": 9.992392379765005e-06,
"loss": 0.0994,
"step": 194
},
{
"epoch": 0.08871701546860783,
"grad_norm": 1.1187885391430794,
"learning_rate": 9.992313360343104e-06,
"loss": 0.0986,
"step": 195
},
{
"epoch": 0.08917197452229299,
"grad_norm": 0.7509441330173129,
"learning_rate": 9.992233932971901e-06,
"loss": 0.0634,
"step": 196
},
{
"epoch": 0.08962693357597816,
"grad_norm": 0.9426276516690344,
"learning_rate": 9.992154097657888e-06,
"loss": 0.0857,
"step": 197
},
{
"epoch": 0.09008189262966333,
"grad_norm": 0.8754039034503873,
"learning_rate": 9.992073854407585e-06,
"loss": 0.0881,
"step": 198
},
{
"epoch": 0.0905368516833485,
"grad_norm": 2.8697219156120712,
"learning_rate": 9.99199320322755e-06,
"loss": 0.0851,
"step": 199
},
{
"epoch": 0.09099181073703366,
"grad_norm": 0.7429242681646778,
"learning_rate": 9.991912144124375e-06,
"loss": 0.0729,
"step": 200
},
{
"epoch": 0.09144676979071883,
"grad_norm": 1.0552979449251756,
"learning_rate": 9.991830677104682e-06,
"loss": 0.1066,
"step": 201
},
{
"epoch": 0.09190172884440401,
"grad_norm": 0.8812651371324355,
"learning_rate": 9.99174880217513e-06,
"loss": 0.0732,
"step": 202
},
{
"epoch": 0.09235668789808917,
"grad_norm": 1.0755107845413352,
"learning_rate": 9.991666519342407e-06,
"loss": 0.0977,
"step": 203
},
{
"epoch": 0.09281164695177434,
"grad_norm": 0.8925063431256136,
"learning_rate": 9.99158382861324e-06,
"loss": 0.0904,
"step": 204
},
{
"epoch": 0.09326660600545951,
"grad_norm": 0.8190206986922173,
"learning_rate": 9.991500729994384e-06,
"loss": 0.0729,
"step": 205
},
{
"epoch": 0.09372156505914468,
"grad_norm": 0.6635798147425112,
"learning_rate": 9.991417223492629e-06,
"loss": 0.0631,
"step": 206
},
{
"epoch": 0.09417652411282984,
"grad_norm": 1.0314655306023923,
"learning_rate": 9.991333309114798e-06,
"loss": 0.0852,
"step": 207
},
{
"epoch": 0.09463148316651501,
"grad_norm": 0.8533496857694978,
"learning_rate": 9.991248986867753e-06,
"loss": 0.0868,
"step": 208
},
{
"epoch": 0.09508644222020018,
"grad_norm": 1.039085255997433,
"learning_rate": 9.991164256758378e-06,
"loss": 0.095,
"step": 209
},
{
"epoch": 0.09554140127388536,
"grad_norm": 1.1484522866350177,
"learning_rate": 9.9910791187936e-06,
"loss": 0.1333,
"step": 210
},
{
"epoch": 0.09599636032757052,
"grad_norm": 0.8277820800102422,
"learning_rate": 9.99099357298038e-06,
"loss": 0.0664,
"step": 211
},
{
"epoch": 0.09645131938125569,
"grad_norm": 0.821796111319934,
"learning_rate": 9.9909076193257e-06,
"loss": 0.083,
"step": 212
},
{
"epoch": 0.09690627843494086,
"grad_norm": 0.9448800546720313,
"learning_rate": 9.990821257836589e-06,
"loss": 0.0873,
"step": 213
},
{
"epoch": 0.09736123748862602,
"grad_norm": 0.9002810379340489,
"learning_rate": 9.990734488520103e-06,
"loss": 0.099,
"step": 214
},
{
"epoch": 0.09781619654231119,
"grad_norm": 0.6145149717344348,
"learning_rate": 9.990647311383334e-06,
"loss": 0.0425,
"step": 215
},
{
"epoch": 0.09827115559599636,
"grad_norm": 1.1377497370761045,
"learning_rate": 9.990559726433404e-06,
"loss": 0.0903,
"step": 216
},
{
"epoch": 0.09872611464968153,
"grad_norm": 0.8401357673155365,
"learning_rate": 9.99047173367747e-06,
"loss": 0.0812,
"step": 217
},
{
"epoch": 0.09918107370336669,
"grad_norm": 0.6977882365614015,
"learning_rate": 9.990383333122722e-06,
"loss": 0.0613,
"step": 218
},
{
"epoch": 0.09963603275705187,
"grad_norm": 0.6751056796776193,
"learning_rate": 9.990294524776384e-06,
"loss": 0.0636,
"step": 219
},
{
"epoch": 0.10009099181073704,
"grad_norm": 0.7973250315161167,
"learning_rate": 9.990205308645716e-06,
"loss": 0.0655,
"step": 220
},
{
"epoch": 0.1005459508644222,
"grad_norm": 0.6494979859380491,
"learning_rate": 9.990115684738005e-06,
"loss": 0.0461,
"step": 221
},
{
"epoch": 0.10100090991810737,
"grad_norm": 0.7863907355652456,
"learning_rate": 9.990025653060574e-06,
"loss": 0.0881,
"step": 222
},
{
"epoch": 0.10145586897179254,
"grad_norm": 1.2756737972223395,
"learning_rate": 9.98993521362078e-06,
"loss": 0.1102,
"step": 223
},
{
"epoch": 0.10191082802547771,
"grad_norm": 1.1992554133605928,
"learning_rate": 9.989844366426018e-06,
"loss": 0.1147,
"step": 224
},
{
"epoch": 0.10236578707916287,
"grad_norm": 0.5034605400337953,
"learning_rate": 9.989753111483707e-06,
"loss": 0.0462,
"step": 225
},
{
"epoch": 0.10282074613284804,
"grad_norm": 0.9881921480518578,
"learning_rate": 9.989661448801305e-06,
"loss": 0.0848,
"step": 226
},
{
"epoch": 0.10327570518653321,
"grad_norm": 0.7581777568438945,
"learning_rate": 9.989569378386303e-06,
"loss": 0.079,
"step": 227
},
{
"epoch": 0.10373066424021839,
"grad_norm": 0.6464731162067388,
"learning_rate": 9.989476900246223e-06,
"loss": 0.0617,
"step": 228
},
{
"epoch": 0.10418562329390355,
"grad_norm": 0.8780639185859085,
"learning_rate": 9.989384014388624e-06,
"loss": 0.086,
"step": 229
},
{
"epoch": 0.10464058234758872,
"grad_norm": 0.6623808171307163,
"learning_rate": 9.989290720821095e-06,
"loss": 0.0694,
"step": 230
},
{
"epoch": 0.10509554140127389,
"grad_norm": 0.721054554263859,
"learning_rate": 9.98919701955126e-06,
"loss": 0.0735,
"step": 231
},
{
"epoch": 0.10555050045495905,
"grad_norm": 0.7868134014829404,
"learning_rate": 9.989102910586776e-06,
"loss": 0.0546,
"step": 232
},
{
"epoch": 0.10600545950864422,
"grad_norm": 0.9137158371163484,
"learning_rate": 9.989008393935331e-06,
"loss": 0.0771,
"step": 233
},
{
"epoch": 0.10646041856232939,
"grad_norm": 0.8326009579593463,
"learning_rate": 9.98891346960465e-06,
"loss": 0.0667,
"step": 234
},
{
"epoch": 0.10691537761601456,
"grad_norm": 0.6462724580348628,
"learning_rate": 9.988818137602494e-06,
"loss": 0.0717,
"step": 235
},
{
"epoch": 0.10737033666969972,
"grad_norm": 0.7513725247558808,
"learning_rate": 9.988722397936646e-06,
"loss": 0.0733,
"step": 236
},
{
"epoch": 0.1078252957233849,
"grad_norm": 1.094509848236789,
"learning_rate": 9.988626250614932e-06,
"loss": 0.1009,
"step": 237
},
{
"epoch": 0.10828025477707007,
"grad_norm": 0.8200579138639758,
"learning_rate": 9.98852969564521e-06,
"loss": 0.0844,
"step": 238
},
{
"epoch": 0.10873521383075523,
"grad_norm": 0.7417763562196316,
"learning_rate": 9.988432733035369e-06,
"loss": 0.0611,
"step": 239
},
{
"epoch": 0.1091901728844404,
"grad_norm": 0.8476475869820355,
"learning_rate": 9.988335362793333e-06,
"loss": 0.0863,
"step": 240
},
{
"epoch": 0.10964513193812557,
"grad_norm": 0.9998642783878469,
"learning_rate": 9.988237584927058e-06,
"loss": 0.0909,
"step": 241
},
{
"epoch": 0.11010009099181074,
"grad_norm": 1.1689324698997519,
"learning_rate": 9.988139399444534e-06,
"loss": 0.124,
"step": 242
},
{
"epoch": 0.1105550500454959,
"grad_norm": 0.790901332269412,
"learning_rate": 9.988040806353786e-06,
"loss": 0.0855,
"step": 243
},
{
"epoch": 0.11101000909918107,
"grad_norm": 0.8931785977847209,
"learning_rate": 9.987941805662869e-06,
"loss": 0.1023,
"step": 244
},
{
"epoch": 0.11146496815286625,
"grad_norm": 0.7352781929773609,
"learning_rate": 9.98784239737987e-06,
"loss": 0.0563,
"step": 245
},
{
"epoch": 0.11191992720655142,
"grad_norm": 0.7169092611535308,
"learning_rate": 9.987742581512919e-06,
"loss": 0.0683,
"step": 246
},
{
"epoch": 0.11237488626023658,
"grad_norm": 0.6767560569792272,
"learning_rate": 9.987642358070167e-06,
"loss": 0.0669,
"step": 247
},
{
"epoch": 0.11282984531392175,
"grad_norm": 0.8442319805699996,
"learning_rate": 9.987541727059805e-06,
"loss": 0.0768,
"step": 248
},
{
"epoch": 0.11328480436760692,
"grad_norm": 0.7700876798522618,
"learning_rate": 9.987440688490058e-06,
"loss": 0.0643,
"step": 249
},
{
"epoch": 0.11373976342129208,
"grad_norm": 0.7286087978317647,
"learning_rate": 9.98733924236918e-06,
"loss": 0.0698,
"step": 250
},
{
"epoch": 0.11419472247497725,
"grad_norm": 0.7917355018437868,
"learning_rate": 9.98723738870546e-06,
"loss": 0.0791,
"step": 251
},
{
"epoch": 0.11464968152866242,
"grad_norm": 1.0469499693242315,
"learning_rate": 9.987135127507226e-06,
"loss": 0.0761,
"step": 252
},
{
"epoch": 0.1151046405823476,
"grad_norm": 0.8361714930383379,
"learning_rate": 9.987032458782828e-06,
"loss": 0.0789,
"step": 253
},
{
"epoch": 0.11555959963603275,
"grad_norm": 0.5902853873046482,
"learning_rate": 9.986929382540662e-06,
"loss": 0.0479,
"step": 254
},
{
"epoch": 0.11601455868971793,
"grad_norm": 0.7349436304465384,
"learning_rate": 9.986825898789145e-06,
"loss": 0.0668,
"step": 255
},
{
"epoch": 0.1164695177434031,
"grad_norm": 0.7657107039148755,
"learning_rate": 9.986722007536737e-06,
"loss": 0.0617,
"step": 256
},
{
"epoch": 0.11692447679708826,
"grad_norm": 0.6450631027744769,
"learning_rate": 9.986617708791926e-06,
"loss": 0.0679,
"step": 257
},
{
"epoch": 0.11737943585077343,
"grad_norm": 0.6292930010016882,
"learning_rate": 9.986513002563236e-06,
"loss": 0.0482,
"step": 258
},
{
"epoch": 0.1178343949044586,
"grad_norm": 0.8758541343517451,
"learning_rate": 9.986407888859221e-06,
"loss": 0.0994,
"step": 259
},
{
"epoch": 0.11828935395814377,
"grad_norm": 0.6537445862223847,
"learning_rate": 9.986302367688473e-06,
"loss": 0.07,
"step": 260
},
{
"epoch": 0.11874431301182893,
"grad_norm": 0.8029660816844667,
"learning_rate": 9.986196439059613e-06,
"loss": 0.0623,
"step": 261
},
{
"epoch": 0.1191992720655141,
"grad_norm": 0.7339528606524214,
"learning_rate": 9.986090102981297e-06,
"loss": 0.0791,
"step": 262
},
{
"epoch": 0.11965423111919928,
"grad_norm": 0.7934112522002073,
"learning_rate": 9.985983359462215e-06,
"loss": 0.0672,
"step": 263
},
{
"epoch": 0.12010919017288443,
"grad_norm": 1.0186962263060808,
"learning_rate": 9.98587620851109e-06,
"loss": 0.1213,
"step": 264
},
{
"epoch": 0.1205641492265696,
"grad_norm": 0.6769843647605545,
"learning_rate": 9.985768650136679e-06,
"loss": 0.0685,
"step": 265
},
{
"epoch": 0.12101910828025478,
"grad_norm": 0.7543020935976431,
"learning_rate": 9.985660684347765e-06,
"loss": 0.0861,
"step": 266
},
{
"epoch": 0.12147406733393995,
"grad_norm": 0.9552124731299731,
"learning_rate": 9.985552311153178e-06,
"loss": 0.0922,
"step": 267
},
{
"epoch": 0.12192902638762511,
"grad_norm": 0.7436699167226903,
"learning_rate": 9.985443530561769e-06,
"loss": 0.0885,
"step": 268
},
{
"epoch": 0.12238398544131028,
"grad_norm": 1.329058937551934,
"learning_rate": 9.98533434258243e-06,
"loss": 0.1115,
"step": 269
},
{
"epoch": 0.12283894449499545,
"grad_norm": 0.6835909813818813,
"learning_rate": 9.985224747224083e-06,
"loss": 0.0586,
"step": 270
},
{
"epoch": 0.12329390354868063,
"grad_norm": 1.0733107060854794,
"learning_rate": 9.98511474449568e-06,
"loss": 0.0811,
"step": 271
},
{
"epoch": 0.12374886260236578,
"grad_norm": 0.5916007278667166,
"learning_rate": 9.985004334406215e-06,
"loss": 0.0696,
"step": 272
},
{
"epoch": 0.12420382165605096,
"grad_norm": 0.9149357508392912,
"learning_rate": 9.984893516964707e-06,
"loss": 0.0704,
"step": 273
},
{
"epoch": 0.12465878070973613,
"grad_norm": 1.1634742377762608,
"learning_rate": 9.984782292180212e-06,
"loss": 0.1178,
"step": 274
},
{
"epoch": 0.1251137397634213,
"grad_norm": 0.603957454908005,
"learning_rate": 9.98467066006182e-06,
"loss": 0.0585,
"step": 275
},
{
"epoch": 0.12556869881710647,
"grad_norm": 0.7735087790025026,
"learning_rate": 9.984558620618651e-06,
"loss": 0.0953,
"step": 276
},
{
"epoch": 0.12602365787079162,
"grad_norm": 1.2570182633873541,
"learning_rate": 9.984446173859863e-06,
"loss": 0.1353,
"step": 277
},
{
"epoch": 0.1264786169244768,
"grad_norm": 0.7275895818672663,
"learning_rate": 9.984333319794642e-06,
"loss": 0.0774,
"step": 278
},
{
"epoch": 0.12693357597816196,
"grad_norm": 0.6395006056363333,
"learning_rate": 9.984220058432212e-06,
"loss": 0.0591,
"step": 279
},
{
"epoch": 0.12738853503184713,
"grad_norm": 0.6563921850032347,
"learning_rate": 9.984106389781828e-06,
"loss": 0.0573,
"step": 280
},
{
"epoch": 0.1278434940855323,
"grad_norm": 0.9399157526953884,
"learning_rate": 9.983992313852776e-06,
"loss": 0.0793,
"step": 281
},
{
"epoch": 0.12829845313921748,
"grad_norm": 0.93528061821534,
"learning_rate": 9.983877830654381e-06,
"loss": 0.0807,
"step": 282
},
{
"epoch": 0.12875341219290265,
"grad_norm": 0.7192448233352142,
"learning_rate": 9.983762940195996e-06,
"loss": 0.0773,
"step": 283
},
{
"epoch": 0.1292083712465878,
"grad_norm": 0.7097381072031733,
"learning_rate": 9.98364764248701e-06,
"loss": 0.0698,
"step": 284
},
{
"epoch": 0.12966333030027297,
"grad_norm": 1.1635566012920768,
"learning_rate": 9.983531937536844e-06,
"loss": 0.0893,
"step": 285
},
{
"epoch": 0.13011828935395814,
"grad_norm": 0.8456555685011555,
"learning_rate": 9.983415825354954e-06,
"loss": 0.0628,
"step": 286
},
{
"epoch": 0.1305732484076433,
"grad_norm": 0.7151838393189083,
"learning_rate": 9.983299305950828e-06,
"loss": 0.0557,
"step": 287
},
{
"epoch": 0.13102820746132848,
"grad_norm": 0.7095193783870621,
"learning_rate": 9.983182379333989e-06,
"loss": 0.0604,
"step": 288
},
{
"epoch": 0.13148316651501366,
"grad_norm": 0.8581434444337498,
"learning_rate": 9.983065045513986e-06,
"loss": 0.0781,
"step": 289
},
{
"epoch": 0.13193812556869883,
"grad_norm": 0.5600994934804626,
"learning_rate": 9.982947304500414e-06,
"loss": 0.0498,
"step": 290
},
{
"epoch": 0.13239308462238397,
"grad_norm": 0.7355720212694087,
"learning_rate": 9.98282915630289e-06,
"loss": 0.0692,
"step": 291
},
{
"epoch": 0.13284804367606914,
"grad_norm": 1.6846985851500909,
"learning_rate": 9.98271060093107e-06,
"loss": 0.1687,
"step": 292
},
{
"epoch": 0.13330300272975432,
"grad_norm": 0.7959406174268434,
"learning_rate": 9.98259163839464e-06,
"loss": 0.0718,
"step": 293
},
{
"epoch": 0.1337579617834395,
"grad_norm": 0.6005858848115938,
"learning_rate": 9.982472268703323e-06,
"loss": 0.0465,
"step": 294
},
{
"epoch": 0.13421292083712466,
"grad_norm": 0.7865103977061746,
"learning_rate": 9.982352491866874e-06,
"loss": 0.071,
"step": 295
},
{
"epoch": 0.13466787989080983,
"grad_norm": 0.7167219429964851,
"learning_rate": 9.982232307895077e-06,
"loss": 0.0658,
"step": 296
},
{
"epoch": 0.135122838944495,
"grad_norm": 1.206398567596641,
"learning_rate": 9.982111716797758e-06,
"loss": 0.101,
"step": 297
},
{
"epoch": 0.13557779799818018,
"grad_norm": 1.0085912508470862,
"learning_rate": 9.981990718584768e-06,
"loss": 0.0959,
"step": 298
},
{
"epoch": 0.13603275705186532,
"grad_norm": 0.8594135430057543,
"learning_rate": 9.981869313265995e-06,
"loss": 0.0912,
"step": 299
},
{
"epoch": 0.1364877161055505,
"grad_norm": 0.9903339586980618,
"learning_rate": 9.981747500851357e-06,
"loss": 0.0692,
"step": 300
},
{
"epoch": 0.13694267515923567,
"grad_norm": 0.7623380548666351,
"learning_rate": 9.981625281350812e-06,
"loss": 0.0699,
"step": 301
},
{
"epoch": 0.13739763421292084,
"grad_norm": 0.6267143484055344,
"learning_rate": 9.981502654774349e-06,
"loss": 0.0499,
"step": 302
},
{
"epoch": 0.137852593266606,
"grad_norm": 0.8234150836820757,
"learning_rate": 9.98137962113198e-06,
"loss": 0.0788,
"step": 303
},
{
"epoch": 0.13830755232029118,
"grad_norm": 0.8158733102806115,
"learning_rate": 9.98125618043377e-06,
"loss": 0.089,
"step": 304
},
{
"epoch": 0.13876251137397635,
"grad_norm": 0.6372656549463032,
"learning_rate": 9.981132332689796e-06,
"loss": 0.0517,
"step": 305
},
{
"epoch": 0.1392174704276615,
"grad_norm": 0.7713863813548327,
"learning_rate": 9.981008077910184e-06,
"loss": 0.0769,
"step": 306
},
{
"epoch": 0.13967242948134667,
"grad_norm": 0.8883775702857831,
"learning_rate": 9.980883416105084e-06,
"loss": 0.0828,
"step": 307
},
{
"epoch": 0.14012738853503184,
"grad_norm": 0.6490936355626988,
"learning_rate": 9.980758347284687e-06,
"loss": 0.0618,
"step": 308
},
{
"epoch": 0.14058234758871702,
"grad_norm": 0.8359554084586713,
"learning_rate": 9.980632871459209e-06,
"loss": 0.0714,
"step": 309
},
{
"epoch": 0.1410373066424022,
"grad_norm": 0.7373523328454649,
"learning_rate": 9.980506988638906e-06,
"loss": 0.0836,
"step": 310
},
{
"epoch": 0.14149226569608736,
"grad_norm": 0.6644370731485183,
"learning_rate": 9.980380698834064e-06,
"loss": 0.0777,
"step": 311
},
{
"epoch": 0.14194722474977253,
"grad_norm": 0.870883965477211,
"learning_rate": 9.980254002055003e-06,
"loss": 0.0847,
"step": 312
},
{
"epoch": 0.14240218380345768,
"grad_norm": 0.6021065409531002,
"learning_rate": 9.980126898312074e-06,
"loss": 0.0583,
"step": 313
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.8705461588189498,
"learning_rate": 9.979999387615665e-06,
"loss": 0.0895,
"step": 314
},
{
"epoch": 0.14331210191082802,
"grad_norm": 0.9639410731114018,
"learning_rate": 9.979871469976197e-06,
"loss": 0.0901,
"step": 315
},
{
"epoch": 0.1437670609645132,
"grad_norm": 0.7554126383153169,
"learning_rate": 9.97974314540412e-06,
"loss": 0.0699,
"step": 316
},
{
"epoch": 0.14422202001819837,
"grad_norm": 1.1039648440512544,
"learning_rate": 9.979614413909922e-06,
"loss": 0.1013,
"step": 317
},
{
"epoch": 0.14467697907188354,
"grad_norm": 0.5258831871743486,
"learning_rate": 9.979485275504121e-06,
"loss": 0.0544,
"step": 318
},
{
"epoch": 0.1451319381255687,
"grad_norm": 1.3025897394440575,
"learning_rate": 9.979355730197271e-06,
"loss": 0.1067,
"step": 319
},
{
"epoch": 0.14558689717925385,
"grad_norm": 0.5206132423310033,
"learning_rate": 9.979225777999956e-06,
"loss": 0.0497,
"step": 320
},
{
"epoch": 0.14604185623293903,
"grad_norm": 0.7202189397663867,
"learning_rate": 9.9790954189228e-06,
"loss": 0.0807,
"step": 321
},
{
"epoch": 0.1464968152866242,
"grad_norm": 0.5738667169449175,
"learning_rate": 9.97896465297645e-06,
"loss": 0.0614,
"step": 322
},
{
"epoch": 0.14695177434030937,
"grad_norm": 0.7972440737628133,
"learning_rate": 9.978833480171592e-06,
"loss": 0.0906,
"step": 323
},
{
"epoch": 0.14740673339399454,
"grad_norm": 0.7697423454053598,
"learning_rate": 9.978701900518947e-06,
"loss": 0.0632,
"step": 324
},
{
"epoch": 0.14786169244767972,
"grad_norm": 0.8259885564233931,
"learning_rate": 9.978569914029267e-06,
"loss": 0.0944,
"step": 325
},
{
"epoch": 0.1483166515013649,
"grad_norm": 0.8450006655868962,
"learning_rate": 9.978437520713335e-06,
"loss": 0.0862,
"step": 326
},
{
"epoch": 0.14877161055505003,
"grad_norm": 0.7746078278616594,
"learning_rate": 9.978304720581973e-06,
"loss": 0.088,
"step": 327
},
{
"epoch": 0.1492265696087352,
"grad_norm": 0.9977734940815816,
"learning_rate": 9.97817151364603e-06,
"loss": 0.1036,
"step": 328
},
{
"epoch": 0.14968152866242038,
"grad_norm": 0.7800752301510507,
"learning_rate": 9.978037899916393e-06,
"loss": 0.0778,
"step": 329
},
{
"epoch": 0.15013648771610555,
"grad_norm": 0.7521153273438224,
"learning_rate": 9.97790387940398e-06,
"loss": 0.0532,
"step": 330
},
{
"epoch": 0.15059144676979072,
"grad_norm": 0.8046420256419254,
"learning_rate": 9.977769452119741e-06,
"loss": 0.0708,
"step": 331
},
{
"epoch": 0.1510464058234759,
"grad_norm": 0.9071770528791517,
"learning_rate": 9.97763461807466e-06,
"loss": 0.1006,
"step": 332
},
{
"epoch": 0.15150136487716107,
"grad_norm": 0.8824570234268595,
"learning_rate": 9.97749937727976e-06,
"loss": 0.0855,
"step": 333
},
{
"epoch": 0.15195632393084624,
"grad_norm": 0.8286075823730068,
"learning_rate": 9.977363729746088e-06,
"loss": 0.077,
"step": 334
},
{
"epoch": 0.15241128298453138,
"grad_norm": 0.6791233851472963,
"learning_rate": 9.977227675484729e-06,
"loss": 0.0698,
"step": 335
},
{
"epoch": 0.15286624203821655,
"grad_norm": 0.9813875260679181,
"learning_rate": 9.977091214506803e-06,
"loss": 0.0838,
"step": 336
},
{
"epoch": 0.15332120109190173,
"grad_norm": 0.9986284190120469,
"learning_rate": 9.976954346823456e-06,
"loss": 0.0789,
"step": 337
},
{
"epoch": 0.1537761601455869,
"grad_norm": 0.6456071732838817,
"learning_rate": 9.976817072445878e-06,
"loss": 0.0566,
"step": 338
},
{
"epoch": 0.15423111919927207,
"grad_norm": 0.7707362352402762,
"learning_rate": 9.976679391385283e-06,
"loss": 0.0677,
"step": 339
},
{
"epoch": 0.15468607825295724,
"grad_norm": 0.5804713825378958,
"learning_rate": 9.976541303652923e-06,
"loss": 0.0547,
"step": 340
},
{
"epoch": 0.15514103730664242,
"grad_norm": 0.7705377953828665,
"learning_rate": 9.976402809260083e-06,
"loss": 0.0673,
"step": 341
},
{
"epoch": 0.15559599636032756,
"grad_norm": 0.651002355082985,
"learning_rate": 9.976263908218076e-06,
"loss": 0.066,
"step": 342
},
{
"epoch": 0.15605095541401273,
"grad_norm": 1.0075230687249708,
"learning_rate": 9.976124600538257e-06,
"loss": 0.1151,
"step": 343
},
{
"epoch": 0.1565059144676979,
"grad_norm": 0.7110146200064966,
"learning_rate": 9.975984886232006e-06,
"loss": 0.0693,
"step": 344
},
{
"epoch": 0.15696087352138308,
"grad_norm": 0.782615076662302,
"learning_rate": 9.975844765310743e-06,
"loss": 0.071,
"step": 345
},
{
"epoch": 0.15741583257506825,
"grad_norm": 1.091513822496144,
"learning_rate": 9.975704237785915e-06,
"loss": 0.1277,
"step": 346
},
{
"epoch": 0.15787079162875342,
"grad_norm": 0.8244942271322709,
"learning_rate": 9.975563303669006e-06,
"loss": 0.092,
"step": 347
},
{
"epoch": 0.1583257506824386,
"grad_norm": 1.0997264747524325,
"learning_rate": 9.975421962971536e-06,
"loss": 0.102,
"step": 348
},
{
"epoch": 0.15878070973612374,
"grad_norm": 1.0471722358260585,
"learning_rate": 9.97528021570505e-06,
"loss": 0.1112,
"step": 349
},
{
"epoch": 0.1592356687898089,
"grad_norm": 0.6366013160292697,
"learning_rate": 9.975138061881135e-06,
"loss": 0.0629,
"step": 350
},
{
"epoch": 0.15969062784349408,
"grad_norm": 0.7145502784859615,
"learning_rate": 9.974995501511404e-06,
"loss": 0.0567,
"step": 351
},
{
"epoch": 0.16014558689717925,
"grad_norm": 1.0825694007542435,
"learning_rate": 9.974852534607506e-06,
"loss": 0.0897,
"step": 352
},
{
"epoch": 0.16060054595086443,
"grad_norm": 0.8874195306329471,
"learning_rate": 9.974709161181126e-06,
"loss": 0.0879,
"step": 353
},
{
"epoch": 0.1610555050045496,
"grad_norm": 0.8193025449594961,
"learning_rate": 9.974565381243982e-06,
"loss": 0.0969,
"step": 354
},
{
"epoch": 0.16151046405823477,
"grad_norm": 0.76528422131405,
"learning_rate": 9.974421194807815e-06,
"loss": 0.0786,
"step": 355
},
{
"epoch": 0.16196542311191992,
"grad_norm": 0.8836543328533641,
"learning_rate": 9.974276601884416e-06,
"loss": 0.0744,
"step": 356
},
{
"epoch": 0.1624203821656051,
"grad_norm": 0.7482952108426273,
"learning_rate": 9.974131602485596e-06,
"loss": 0.0772,
"step": 357
},
{
"epoch": 0.16287534121929026,
"grad_norm": 0.9122723647083647,
"learning_rate": 9.973986196623203e-06,
"loss": 0.0851,
"step": 358
},
{
"epoch": 0.16333030027297543,
"grad_norm": 0.8373653902978805,
"learning_rate": 9.973840384309121e-06,
"loss": 0.0865,
"step": 359
},
{
"epoch": 0.1637852593266606,
"grad_norm": 0.6360069343077157,
"learning_rate": 9.973694165555264e-06,
"loss": 0.0618,
"step": 360
},
{
"epoch": 0.16424021838034578,
"grad_norm": 0.7967304456611868,
"learning_rate": 9.973547540373582e-06,
"loss": 0.0865,
"step": 361
},
{
"epoch": 0.16469517743403095,
"grad_norm": 1.1699452577832765,
"learning_rate": 9.973400508776054e-06,
"loss": 0.1144,
"step": 362
},
{
"epoch": 0.1651501364877161,
"grad_norm": 0.6282867599706373,
"learning_rate": 9.973253070774698e-06,
"loss": 0.0633,
"step": 363
},
{
"epoch": 0.16560509554140126,
"grad_norm": 0.79942272506218,
"learning_rate": 9.973105226381559e-06,
"loss": 0.069,
"step": 364
},
{
"epoch": 0.16606005459508644,
"grad_norm": 0.9348674828410355,
"learning_rate": 9.972956975608719e-06,
"loss": 0.1019,
"step": 365
},
{
"epoch": 0.1665150136487716,
"grad_norm": 1.0942665884463076,
"learning_rate": 9.972808318468292e-06,
"loss": 0.0859,
"step": 366
},
{
"epoch": 0.16696997270245678,
"grad_norm": 0.6283579225277517,
"learning_rate": 9.972659254972426e-06,
"loss": 0.0589,
"step": 367
},
{
"epoch": 0.16742493175614195,
"grad_norm": 1.0989677054167046,
"learning_rate": 9.972509785133304e-06,
"loss": 0.1081,
"step": 368
},
{
"epoch": 0.16787989080982713,
"grad_norm": 0.7310198219540203,
"learning_rate": 9.972359908963137e-06,
"loss": 0.0675,
"step": 369
},
{
"epoch": 0.16833484986351227,
"grad_norm": 0.757671629194488,
"learning_rate": 9.972209626474172e-06,
"loss": 0.0734,
"step": 370
},
{
"epoch": 0.16878980891719744,
"grad_norm": 0.7966175159886519,
"learning_rate": 9.972058937678692e-06,
"loss": 0.075,
"step": 371
},
{
"epoch": 0.16924476797088261,
"grad_norm": 0.9805514159267839,
"learning_rate": 9.97190784258901e-06,
"loss": 0.1071,
"step": 372
},
{
"epoch": 0.1696997270245678,
"grad_norm": 0.7000612574442994,
"learning_rate": 9.971756341217471e-06,
"loss": 0.0526,
"step": 373
},
{
"epoch": 0.17015468607825296,
"grad_norm": 0.7917466702374949,
"learning_rate": 9.971604433576456e-06,
"loss": 0.0698,
"step": 374
},
{
"epoch": 0.17060964513193813,
"grad_norm": 0.8412692631182211,
"learning_rate": 9.97145211967838e-06,
"loss": 0.0783,
"step": 375
},
{
"epoch": 0.1710646041856233,
"grad_norm": 0.5615038895232536,
"learning_rate": 9.971299399535685e-06,
"loss": 0.053,
"step": 376
},
{
"epoch": 0.17151956323930848,
"grad_norm": 0.6849745369298482,
"learning_rate": 9.971146273160854e-06,
"loss": 0.0774,
"step": 377
},
{
"epoch": 0.17197452229299362,
"grad_norm": 0.6466596777060115,
"learning_rate": 9.9709927405664e-06,
"loss": 0.0606,
"step": 378
},
{
"epoch": 0.1724294813466788,
"grad_norm": 0.7169884074840761,
"learning_rate": 9.970838801764866e-06,
"loss": 0.0839,
"step": 379
},
{
"epoch": 0.17288444040036396,
"grad_norm": 0.9393396355410675,
"learning_rate": 9.970684456768836e-06,
"loss": 0.1132,
"step": 380
},
{
"epoch": 0.17333939945404914,
"grad_norm": 12.197098173453568,
"learning_rate": 9.970529705590918e-06,
"loss": 0.4858,
"step": 381
},
{
"epoch": 0.1737943585077343,
"grad_norm": 0.7355841274771772,
"learning_rate": 9.97037454824376e-06,
"loss": 0.0714,
"step": 382
},
{
"epoch": 0.17424931756141948,
"grad_norm": 1.050385265783733,
"learning_rate": 9.97021898474004e-06,
"loss": 0.1024,
"step": 383
},
{
"epoch": 0.17470427661510465,
"grad_norm": 0.8612087678995594,
"learning_rate": 9.970063015092469e-06,
"loss": 0.085,
"step": 384
},
{
"epoch": 0.1751592356687898,
"grad_norm": 1.3886472100476919,
"learning_rate": 9.969906639313793e-06,
"loss": 0.1212,
"step": 385
},
{
"epoch": 0.17561419472247497,
"grad_norm": 0.8238176964814595,
"learning_rate": 9.96974985741679e-06,
"loss": 0.0721,
"step": 386
},
{
"epoch": 0.17606915377616014,
"grad_norm": 0.8718897735731601,
"learning_rate": 9.969592669414272e-06,
"loss": 0.0959,
"step": 387
},
{
"epoch": 0.17652411282984531,
"grad_norm": 6.796752422837202,
"learning_rate": 9.969435075319083e-06,
"loss": 0.115,
"step": 388
},
{
"epoch": 0.1769790718835305,
"grad_norm": 0.58176536820322,
"learning_rate": 9.969277075144104e-06,
"loss": 0.0459,
"step": 389
},
{
"epoch": 0.17743403093721566,
"grad_norm": 0.7267253435076165,
"learning_rate": 9.969118668902242e-06,
"loss": 0.07,
"step": 390
},
{
"epoch": 0.17788898999090083,
"grad_norm": 0.7682389367523258,
"learning_rate": 9.968959856606442e-06,
"loss": 0.0542,
"step": 391
},
{
"epoch": 0.17834394904458598,
"grad_norm": 0.7873348185837048,
"learning_rate": 9.968800638269682e-06,
"loss": 0.0598,
"step": 392
},
{
"epoch": 0.17879890809827115,
"grad_norm": 1.287713292390112,
"learning_rate": 9.968641013904974e-06,
"loss": 0.1442,
"step": 393
},
{
"epoch": 0.17925386715195632,
"grad_norm": 1.085650814952146,
"learning_rate": 9.968480983525359e-06,
"loss": 0.0926,
"step": 394
},
{
"epoch": 0.1797088262056415,
"grad_norm": 0.6716676596759695,
"learning_rate": 9.968320547143918e-06,
"loss": 0.0767,
"step": 395
},
{
"epoch": 0.18016378525932666,
"grad_norm": 0.8467396807693714,
"learning_rate": 9.968159704773757e-06,
"loss": 0.0977,
"step": 396
},
{
"epoch": 0.18061874431301184,
"grad_norm": 0.6438855833782786,
"learning_rate": 9.967998456428021e-06,
"loss": 0.0586,
"step": 397
},
{
"epoch": 0.181073703366697,
"grad_norm": 0.7254140122399564,
"learning_rate": 9.967836802119886e-06,
"loss": 0.06,
"step": 398
},
{
"epoch": 0.18152866242038215,
"grad_norm": 0.87517545358881,
"learning_rate": 9.967674741862563e-06,
"loss": 0.1016,
"step": 399
},
{
"epoch": 0.18198362147406733,
"grad_norm": 1.0624206936058178,
"learning_rate": 9.967512275669294e-06,
"loss": 0.1296,
"step": 400
},
{
"epoch": 0.1824385805277525,
"grad_norm": 1.0284720738314184,
"learning_rate": 9.967349403553353e-06,
"loss": 0.0862,
"step": 401
},
{
"epoch": 0.18289353958143767,
"grad_norm": 0.8342932737384292,
"learning_rate": 9.967186125528053e-06,
"loss": 0.0873,
"step": 402
},
{
"epoch": 0.18334849863512284,
"grad_norm": 1.543095569701571,
"learning_rate": 9.967022441606734e-06,
"loss": 0.1209,
"step": 403
},
{
"epoch": 0.18380345768880801,
"grad_norm": 0.70731586616612,
"learning_rate": 9.966858351802773e-06,
"loss": 0.0726,
"step": 404
},
{
"epoch": 0.1842584167424932,
"grad_norm": 0.6660531988680356,
"learning_rate": 9.966693856129576e-06,
"loss": 0.0562,
"step": 405
},
{
"epoch": 0.18471337579617833,
"grad_norm": 0.8503640969928286,
"learning_rate": 9.966528954600587e-06,
"loss": 0.0838,
"step": 406
},
{
"epoch": 0.1851683348498635,
"grad_norm": 0.6021534124846688,
"learning_rate": 9.96636364722928e-06,
"loss": 0.0673,
"step": 407
},
{
"epoch": 0.18562329390354868,
"grad_norm": 0.8782816795828058,
"learning_rate": 9.966197934029165e-06,
"loss": 0.0845,
"step": 408
},
{
"epoch": 0.18607825295723385,
"grad_norm": 0.9030990654346936,
"learning_rate": 9.966031815013781e-06,
"loss": 0.0839,
"step": 409
},
{
"epoch": 0.18653321201091902,
"grad_norm": 0.8567507299712805,
"learning_rate": 9.965865290196703e-06,
"loss": 0.0935,
"step": 410
},
{
"epoch": 0.1869881710646042,
"grad_norm": 0.8099856489670021,
"learning_rate": 9.96569835959154e-06,
"loss": 0.0747,
"step": 411
},
{
"epoch": 0.18744313011828936,
"grad_norm": 0.8938878675243255,
"learning_rate": 9.965531023211931e-06,
"loss": 0.0854,
"step": 412
},
{
"epoch": 0.18789808917197454,
"grad_norm": 0.735313860104022,
"learning_rate": 9.965363281071551e-06,
"loss": 0.0865,
"step": 413
},
{
"epoch": 0.18835304822565968,
"grad_norm": 0.5495229598132649,
"learning_rate": 9.965195133184108e-06,
"loss": 0.0403,
"step": 414
},
{
"epoch": 0.18880800727934485,
"grad_norm": 1.0700416713113117,
"learning_rate": 9.965026579563342e-06,
"loss": 0.1086,
"step": 415
},
{
"epoch": 0.18926296633303002,
"grad_norm": 0.7118653717355078,
"learning_rate": 9.964857620223024e-06,
"loss": 0.0691,
"step": 416
},
{
"epoch": 0.1897179253867152,
"grad_norm": 0.6871481686027417,
"learning_rate": 9.964688255176963e-06,
"loss": 0.0667,
"step": 417
},
{
"epoch": 0.19017288444040037,
"grad_norm": 0.9848841869658392,
"learning_rate": 9.964518484438998e-06,
"loss": 0.0813,
"step": 418
},
{
"epoch": 0.19062784349408554,
"grad_norm": 0.6311750922074311,
"learning_rate": 9.964348308023001e-06,
"loss": 0.0592,
"step": 419
},
{
"epoch": 0.1910828025477707,
"grad_norm": 0.7813168734245782,
"learning_rate": 9.964177725942881e-06,
"loss": 0.0826,
"step": 420
},
{
"epoch": 0.19153776160145586,
"grad_norm": 0.8572110622332836,
"learning_rate": 9.964006738212574e-06,
"loss": 0.0853,
"step": 421
},
{
"epoch": 0.19199272065514103,
"grad_norm": 0.5304433423014596,
"learning_rate": 9.963835344846056e-06,
"loss": 0.048,
"step": 422
},
{
"epoch": 0.1924476797088262,
"grad_norm": 0.7598521228122416,
"learning_rate": 9.963663545857328e-06,
"loss": 0.0757,
"step": 423
},
{
"epoch": 0.19290263876251137,
"grad_norm": 1.1542546683489703,
"learning_rate": 9.963491341260432e-06,
"loss": 0.104,
"step": 424
},
{
"epoch": 0.19335759781619655,
"grad_norm": 0.7766563582253432,
"learning_rate": 9.963318731069437e-06,
"loss": 0.0952,
"step": 425
},
{
"epoch": 0.19381255686988172,
"grad_norm": 1.1319194983916299,
"learning_rate": 9.96314571529845e-06,
"loss": 0.1005,
"step": 426
},
{
"epoch": 0.1942675159235669,
"grad_norm": 0.7230559135257585,
"learning_rate": 9.962972293961608e-06,
"loss": 0.0647,
"step": 427
},
{
"epoch": 0.19472247497725204,
"grad_norm": 0.9863934566369588,
"learning_rate": 9.962798467073083e-06,
"loss": 0.0763,
"step": 428
},
{
"epoch": 0.1951774340309372,
"grad_norm": 0.8259784410005646,
"learning_rate": 9.96262423464708e-06,
"loss": 0.087,
"step": 429
},
{
"epoch": 0.19563239308462238,
"grad_norm": 0.7987139095182185,
"learning_rate": 9.962449596697834e-06,
"loss": 0.0671,
"step": 430
},
{
"epoch": 0.19608735213830755,
"grad_norm": 1.130208173229934,
"learning_rate": 9.962274553239619e-06,
"loss": 0.119,
"step": 431
},
{
"epoch": 0.19654231119199272,
"grad_norm": 0.7399696243677417,
"learning_rate": 9.962099104286735e-06,
"loss": 0.064,
"step": 432
},
{
"epoch": 0.1969972702456779,
"grad_norm": 1.156015767405528,
"learning_rate": 9.961923249853523e-06,
"loss": 0.1102,
"step": 433
},
{
"epoch": 0.19745222929936307,
"grad_norm": 0.972422739757894,
"learning_rate": 9.961746989954349e-06,
"loss": 0.1093,
"step": 434
},
{
"epoch": 0.1979071883530482,
"grad_norm": 0.7766700420403171,
"learning_rate": 9.96157032460362e-06,
"loss": 0.0655,
"step": 435
},
{
"epoch": 0.19836214740673339,
"grad_norm": 0.7460679115751414,
"learning_rate": 9.961393253815767e-06,
"loss": 0.0751,
"step": 436
},
{
"epoch": 0.19881710646041856,
"grad_norm": 1.0684214450487566,
"learning_rate": 9.961215777605266e-06,
"loss": 0.0789,
"step": 437
},
{
"epoch": 0.19927206551410373,
"grad_norm": 0.7683994291392229,
"learning_rate": 9.961037895986615e-06,
"loss": 0.0849,
"step": 438
},
{
"epoch": 0.1997270245677889,
"grad_norm": 0.7270368453251704,
"learning_rate": 9.960859608974352e-06,
"loss": 0.0779,
"step": 439
},
{
"epoch": 0.20018198362147407,
"grad_norm": 0.701460207303568,
"learning_rate": 9.960680916583042e-06,
"loss": 0.0639,
"step": 440
},
{
"epoch": 0.20063694267515925,
"grad_norm": 0.6784619280926262,
"learning_rate": 9.960501818827292e-06,
"loss": 0.077,
"step": 441
},
{
"epoch": 0.2010919017288444,
"grad_norm": 0.8064075868568972,
"learning_rate": 9.960322315721735e-06,
"loss": 0.0827,
"step": 442
},
{
"epoch": 0.20154686078252956,
"grad_norm": 0.9155026735417204,
"learning_rate": 9.960142407281039e-06,
"loss": 0.0841,
"step": 443
},
{
"epoch": 0.20200181983621474,
"grad_norm": 0.6167749294869733,
"learning_rate": 9.959962093519904e-06,
"loss": 0.054,
"step": 444
},
{
"epoch": 0.2024567788898999,
"grad_norm": 0.8127781985331358,
"learning_rate": 9.959781374453066e-06,
"loss": 0.0751,
"step": 445
},
{
"epoch": 0.20291173794358508,
"grad_norm": 0.98306444688532,
"learning_rate": 9.959600250095294e-06,
"loss": 0.075,
"step": 446
},
{
"epoch": 0.20336669699727025,
"grad_norm": 0.7982130269360888,
"learning_rate": 9.959418720461384e-06,
"loss": 0.0834,
"step": 447
},
{
"epoch": 0.20382165605095542,
"grad_norm": 0.7862225023823932,
"learning_rate": 9.959236785566175e-06,
"loss": 0.0704,
"step": 448
},
{
"epoch": 0.20427661510464057,
"grad_norm": 0.562107514296544,
"learning_rate": 9.959054445424532e-06,
"loss": 0.0644,
"step": 449
},
{
"epoch": 0.20473157415832574,
"grad_norm": 0.6089607791855781,
"learning_rate": 9.958871700051353e-06,
"loss": 0.0512,
"step": 450
},
{
"epoch": 0.2051865332120109,
"grad_norm": 0.6962095067981563,
"learning_rate": 9.958688549461573e-06,
"loss": 0.0712,
"step": 451
},
{
"epoch": 0.20564149226569609,
"grad_norm": 1.155217046291275,
"learning_rate": 9.958504993670158e-06,
"loss": 0.1049,
"step": 452
},
{
"epoch": 0.20609645131938126,
"grad_norm": 1.0913314226134752,
"learning_rate": 9.958321032692107e-06,
"loss": 0.1226,
"step": 453
},
{
"epoch": 0.20655141037306643,
"grad_norm": 22.735025633907238,
"learning_rate": 9.958136666542455e-06,
"loss": 0.8419,
"step": 454
},
{
"epoch": 0.2070063694267516,
"grad_norm": 1.184019553325164,
"learning_rate": 9.957951895236262e-06,
"loss": 0.1113,
"step": 455
},
{
"epoch": 0.20746132848043677,
"grad_norm": 0.7664792046331882,
"learning_rate": 9.957766718788632e-06,
"loss": 0.104,
"step": 456
},
{
"epoch": 0.20791628753412192,
"grad_norm": 0.8672883026786035,
"learning_rate": 9.957581137214695e-06,
"loss": 0.074,
"step": 457
},
{
"epoch": 0.2083712465878071,
"grad_norm": 0.8772220264781722,
"learning_rate": 9.957395150529615e-06,
"loss": 0.0986,
"step": 458
},
{
"epoch": 0.20882620564149226,
"grad_norm": 0.7016331971826193,
"learning_rate": 9.95720875874859e-06,
"loss": 0.0752,
"step": 459
},
{
"epoch": 0.20928116469517744,
"grad_norm": 0.6308822051977305,
"learning_rate": 9.957021961886855e-06,
"loss": 0.0608,
"step": 460
},
{
"epoch": 0.2097361237488626,
"grad_norm": 0.9803601042372939,
"learning_rate": 9.956834759959669e-06,
"loss": 0.0908,
"step": 461
},
{
"epoch": 0.21019108280254778,
"grad_norm": 0.7674462109758159,
"learning_rate": 9.95664715298233e-06,
"loss": 0.074,
"step": 462
},
{
"epoch": 0.21064604185623295,
"grad_norm": 0.7450186566335193,
"learning_rate": 9.95645914097017e-06,
"loss": 0.0817,
"step": 463
},
{
"epoch": 0.2111010009099181,
"grad_norm": 0.7225723661612439,
"learning_rate": 9.956270723938553e-06,
"loss": 0.0849,
"step": 464
},
{
"epoch": 0.21155595996360327,
"grad_norm": 0.7190355211871646,
"learning_rate": 9.956081901902875e-06,
"loss": 0.0748,
"step": 465
},
{
"epoch": 0.21201091901728844,
"grad_norm": 1.210684562087392,
"learning_rate": 9.955892674878565e-06,
"loss": 0.1272,
"step": 466
},
{
"epoch": 0.2124658780709736,
"grad_norm": 0.834170476650907,
"learning_rate": 9.955703042881087e-06,
"loss": 0.0992,
"step": 467
},
{
"epoch": 0.21292083712465878,
"grad_norm": 0.874478173291907,
"learning_rate": 9.955513005925934e-06,
"loss": 0.0858,
"step": 468
},
{
"epoch": 0.21337579617834396,
"grad_norm": 0.5510320150423565,
"learning_rate": 9.95532256402864e-06,
"loss": 0.0574,
"step": 469
},
{
"epoch": 0.21383075523202913,
"grad_norm": 0.5657171871822584,
"learning_rate": 9.955131717204762e-06,
"loss": 0.0671,
"step": 470
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.7564664653864259,
"learning_rate": 9.954940465469898e-06,
"loss": 0.085,
"step": 471
},
{
"epoch": 0.21474067333939945,
"grad_norm": 0.7594501005901694,
"learning_rate": 9.954748808839675e-06,
"loss": 0.0733,
"step": 472
},
{
"epoch": 0.21519563239308462,
"grad_norm": 0.6748092428366178,
"learning_rate": 9.954556747329754e-06,
"loss": 0.0707,
"step": 473
},
{
"epoch": 0.2156505914467698,
"grad_norm": 1.715089789819449,
"learning_rate": 9.954364280955832e-06,
"loss": 0.1045,
"step": 474
},
{
"epoch": 0.21610555050045496,
"grad_norm": 0.6668751648778155,
"learning_rate": 9.954171409733634e-06,
"loss": 0.0573,
"step": 475
},
{
"epoch": 0.21656050955414013,
"grad_norm": 0.5963716475430643,
"learning_rate": 9.95397813367892e-06,
"loss": 0.0752,
"step": 476
},
{
"epoch": 0.2170154686078253,
"grad_norm": 0.9917190233932158,
"learning_rate": 9.953784452807487e-06,
"loss": 0.1049,
"step": 477
},
{
"epoch": 0.21747042766151045,
"grad_norm": 0.5638529401686616,
"learning_rate": 9.953590367135159e-06,
"loss": 0.0547,
"step": 478
},
{
"epoch": 0.21792538671519562,
"grad_norm": 0.6477110515460727,
"learning_rate": 9.953395876677796e-06,
"loss": 0.0564,
"step": 479
},
{
"epoch": 0.2183803457688808,
"grad_norm": 0.5492055118574499,
"learning_rate": 9.95320098145129e-06,
"loss": 0.0505,
"step": 480
},
{
"epoch": 0.21883530482256597,
"grad_norm": 0.8954528378372288,
"learning_rate": 9.95300568147157e-06,
"loss": 0.126,
"step": 481
},
{
"epoch": 0.21929026387625114,
"grad_norm": 0.6155736143826033,
"learning_rate": 9.952809976754593e-06,
"loss": 0.0518,
"step": 482
},
{
"epoch": 0.2197452229299363,
"grad_norm": 1.1486004986445648,
"learning_rate": 9.952613867316351e-06,
"loss": 0.1142,
"step": 483
},
{
"epoch": 0.22020018198362148,
"grad_norm": 0.8236924325360948,
"learning_rate": 9.95241735317287e-06,
"loss": 0.1047,
"step": 484
},
{
"epoch": 0.22065514103730663,
"grad_norm": 0.832372102653505,
"learning_rate": 9.952220434340209e-06,
"loss": 0.0729,
"step": 485
},
{
"epoch": 0.2211101000909918,
"grad_norm": 0.7288716722109786,
"learning_rate": 9.952023110834456e-06,
"loss": 0.068,
"step": 486
},
{
"epoch": 0.22156505914467697,
"grad_norm": 0.5327254294033283,
"learning_rate": 9.951825382671739e-06,
"loss": 0.0614,
"step": 487
},
{
"epoch": 0.22202001819836215,
"grad_norm": 0.7204991379763186,
"learning_rate": 9.951627249868213e-06,
"loss": 0.0666,
"step": 488
},
{
"epoch": 0.22247497725204732,
"grad_norm": 0.7485835393026234,
"learning_rate": 9.95142871244007e-06,
"loss": 0.068,
"step": 489
},
{
"epoch": 0.2229299363057325,
"grad_norm": 0.45602532896445397,
"learning_rate": 9.951229770403531e-06,
"loss": 0.0414,
"step": 490
},
{
"epoch": 0.22338489535941766,
"grad_norm": 0.7240661348572547,
"learning_rate": 9.951030423774858e-06,
"loss": 0.0798,
"step": 491
},
{
"epoch": 0.22383985441310283,
"grad_norm": 0.7716352477687572,
"learning_rate": 9.950830672570337e-06,
"loss": 0.071,
"step": 492
},
{
"epoch": 0.22429481346678798,
"grad_norm": 1.22677184750836,
"learning_rate": 9.95063051680629e-06,
"loss": 0.1373,
"step": 493
},
{
"epoch": 0.22474977252047315,
"grad_norm": 0.7365431233953595,
"learning_rate": 9.950429956499074e-06,
"loss": 0.0699,
"step": 494
},
{
"epoch": 0.22520473157415832,
"grad_norm": 0.705654951368504,
"learning_rate": 9.950228991665078e-06,
"loss": 0.0741,
"step": 495
},
{
"epoch": 0.2256596906278435,
"grad_norm": 0.8261497906057415,
"learning_rate": 9.950027622320724e-06,
"loss": 0.0764,
"step": 496
},
{
"epoch": 0.22611464968152867,
"grad_norm": 0.9965395262255518,
"learning_rate": 9.949825848482465e-06,
"loss": 0.0852,
"step": 497
},
{
"epoch": 0.22656960873521384,
"grad_norm": 0.6807161957389707,
"learning_rate": 9.949623670166794e-06,
"loss": 0.074,
"step": 498
},
{
"epoch": 0.227024567788899,
"grad_norm": 1.1216390709095547,
"learning_rate": 9.949421087390228e-06,
"loss": 0.0931,
"step": 499
},
{
"epoch": 0.22747952684258416,
"grad_norm": 1.1278655216416786,
"learning_rate": 9.949218100169322e-06,
"loss": 0.1177,
"step": 500
},
{
"epoch": 0.22793448589626933,
"grad_norm": 0.9160591457448575,
"learning_rate": 9.949014708520664e-06,
"loss": 0.1015,
"step": 501
},
{
"epoch": 0.2283894449499545,
"grad_norm": 0.9377363057118697,
"learning_rate": 9.948810912460872e-06,
"loss": 0.1059,
"step": 502
},
{
"epoch": 0.22884440400363967,
"grad_norm": 0.8760932101779023,
"learning_rate": 9.948606712006601e-06,
"loss": 0.0812,
"step": 503
},
{
"epoch": 0.22929936305732485,
"grad_norm": 0.6962605051289937,
"learning_rate": 9.948402107174537e-06,
"loss": 0.0735,
"step": 504
},
{
"epoch": 0.22975432211101002,
"grad_norm": 0.6501265713488487,
"learning_rate": 9.948197097981401e-06,
"loss": 0.0551,
"step": 505
},
{
"epoch": 0.2302092811646952,
"grad_norm": 1.2156011775652311,
"learning_rate": 9.947991684443942e-06,
"loss": 0.1066,
"step": 506
},
{
"epoch": 0.23066424021838033,
"grad_norm": 0.9679794435610901,
"learning_rate": 9.947785866578951e-06,
"loss": 0.0981,
"step": 507
},
{
"epoch": 0.2311191992720655,
"grad_norm": 0.7195724631231237,
"learning_rate": 9.94757964440324e-06,
"loss": 0.0777,
"step": 508
},
{
"epoch": 0.23157415832575068,
"grad_norm": 0.549427502610929,
"learning_rate": 9.947373017933665e-06,
"loss": 0.0516,
"step": 509
},
{
"epoch": 0.23202911737943585,
"grad_norm": 0.5667212336170355,
"learning_rate": 9.947165987187108e-06,
"loss": 0.0583,
"step": 510
},
{
"epoch": 0.23248407643312102,
"grad_norm": 0.6638127935874616,
"learning_rate": 9.946958552180489e-06,
"loss": 0.0723,
"step": 511
},
{
"epoch": 0.2329390354868062,
"grad_norm": 0.5226768129517959,
"learning_rate": 9.946750712930756e-06,
"loss": 0.0482,
"step": 512
},
{
"epoch": 0.23339399454049137,
"grad_norm": 0.8358986518129136,
"learning_rate": 9.946542469454894e-06,
"loss": 0.1037,
"step": 513
},
{
"epoch": 0.2338489535941765,
"grad_norm": 0.6695809647699968,
"learning_rate": 9.94633382176992e-06,
"loss": 0.0728,
"step": 514
},
{
"epoch": 0.23430391264786168,
"grad_norm": 1.0608546974350634,
"learning_rate": 9.946124769892884e-06,
"loss": 0.1192,
"step": 515
},
{
"epoch": 0.23475887170154686,
"grad_norm": 0.5090717025630993,
"learning_rate": 9.945915313840869e-06,
"loss": 0.0612,
"step": 516
},
{
"epoch": 0.23521383075523203,
"grad_norm": 0.8105130307542814,
"learning_rate": 9.94570545363099e-06,
"loss": 0.0838,
"step": 517
},
{
"epoch": 0.2356687898089172,
"grad_norm": 0.7752986876049957,
"learning_rate": 9.945495189280394e-06,
"loss": 0.092,
"step": 518
},
{
"epoch": 0.23612374886260237,
"grad_norm": 0.869801315379322,
"learning_rate": 9.945284520806267e-06,
"loss": 0.077,
"step": 519
},
{
"epoch": 0.23657870791628755,
"grad_norm": 0.5427153243822386,
"learning_rate": 9.94507344822582e-06,
"loss": 0.0592,
"step": 520
},
{
"epoch": 0.2370336669699727,
"grad_norm": 0.7368670007832758,
"learning_rate": 9.944861971556305e-06,
"loss": 0.0608,
"step": 521
},
{
"epoch": 0.23748862602365786,
"grad_norm": 0.8141430793460733,
"learning_rate": 9.944650090814998e-06,
"loss": 0.0616,
"step": 522
},
{
"epoch": 0.23794358507734303,
"grad_norm": 2.1096588720516425,
"learning_rate": 9.944437806019216e-06,
"loss": 0.0938,
"step": 523
},
{
"epoch": 0.2383985441310282,
"grad_norm": 0.7014907085161215,
"learning_rate": 9.944225117186306e-06,
"loss": 0.0812,
"step": 524
},
{
"epoch": 0.23885350318471338,
"grad_norm": 0.5078467158211916,
"learning_rate": 9.944012024333647e-06,
"loss": 0.0561,
"step": 525
},
{
"epoch": 0.23930846223839855,
"grad_norm": 0.6379031604907951,
"learning_rate": 9.943798527478652e-06,
"loss": 0.0678,
"step": 526
},
{
"epoch": 0.23976342129208372,
"grad_norm": 0.799876019099874,
"learning_rate": 9.943584626638768e-06,
"loss": 0.0914,
"step": 527
},
{
"epoch": 0.24021838034576887,
"grad_norm": 0.6550229607349646,
"learning_rate": 9.943370321831474e-06,
"loss": 0.0668,
"step": 528
},
{
"epoch": 0.24067333939945404,
"grad_norm": 0.767534839542607,
"learning_rate": 9.943155613074279e-06,
"loss": 0.0711,
"step": 529
},
{
"epoch": 0.2411282984531392,
"grad_norm": 0.7571838990000624,
"learning_rate": 9.942940500384733e-06,
"loss": 0.0893,
"step": 530
},
{
"epoch": 0.24158325750682438,
"grad_norm": 17.807000846945513,
"learning_rate": 9.942724983780409e-06,
"loss": 0.3419,
"step": 531
},
{
"epoch": 0.24203821656050956,
"grad_norm": 1.2088422410181228,
"learning_rate": 9.942509063278922e-06,
"loss": 0.1173,
"step": 532
},
{
"epoch": 0.24249317561419473,
"grad_norm": 0.8811842157145667,
"learning_rate": 9.942292738897914e-06,
"loss": 0.1006,
"step": 533
},
{
"epoch": 0.2429481346678799,
"grad_norm": 0.7726281786442553,
"learning_rate": 9.942076010655063e-06,
"loss": 0.0909,
"step": 534
},
{
"epoch": 0.24340309372156507,
"grad_norm": 0.9942256398778268,
"learning_rate": 9.941858878568078e-06,
"loss": 0.134,
"step": 535
},
{
"epoch": 0.24385805277525022,
"grad_norm": 1.001596627292525,
"learning_rate": 9.941641342654702e-06,
"loss": 0.0977,
"step": 536
},
{
"epoch": 0.2443130118289354,
"grad_norm": 0.5064863363900076,
"learning_rate": 9.941423402932713e-06,
"loss": 0.0559,
"step": 537
},
{
"epoch": 0.24476797088262056,
"grad_norm": 0.8589680374278897,
"learning_rate": 9.94120505941992e-06,
"loss": 0.0992,
"step": 538
},
{
"epoch": 0.24522292993630573,
"grad_norm": 0.7830880681851201,
"learning_rate": 9.940986312134162e-06,
"loss": 0.0825,
"step": 539
},
{
"epoch": 0.2456778889899909,
"grad_norm": 0.5778344550660577,
"learning_rate": 9.940767161093316e-06,
"loss": 0.0637,
"step": 540
},
{
"epoch": 0.24613284804367608,
"grad_norm": 0.8661775200374767,
"learning_rate": 9.94054760631529e-06,
"loss": 0.0958,
"step": 541
},
{
"epoch": 0.24658780709736125,
"grad_norm": 0.6976226834296251,
"learning_rate": 9.940327647818026e-06,
"loss": 0.0752,
"step": 542
},
{
"epoch": 0.2470427661510464,
"grad_norm": 0.7530160135685138,
"learning_rate": 9.940107285619495e-06,
"loss": 0.077,
"step": 543
},
{
"epoch": 0.24749772520473157,
"grad_norm": 0.7997106896354084,
"learning_rate": 9.939886519737707e-06,
"loss": 0.0958,
"step": 544
},
{
"epoch": 0.24795268425841674,
"grad_norm": 0.8918061918047896,
"learning_rate": 9.939665350190702e-06,
"loss": 0.0822,
"step": 545
},
{
"epoch": 0.2484076433121019,
"grad_norm": 0.804115756264787,
"learning_rate": 9.93944377699655e-06,
"loss": 0.0915,
"step": 546
},
{
"epoch": 0.24886260236578708,
"grad_norm": 0.6234057941022288,
"learning_rate": 9.93922180017336e-06,
"loss": 0.0672,
"step": 547
},
{
"epoch": 0.24931756141947226,
"grad_norm": 0.8269450754551354,
"learning_rate": 9.93899941973927e-06,
"loss": 0.1102,
"step": 548
},
{
"epoch": 0.24977252047315743,
"grad_norm": 0.9233841316663005,
"learning_rate": 9.93877663571245e-06,
"loss": 0.0963,
"step": 549
},
{
"epoch": 0.2502274795268426,
"grad_norm": 0.9944861568923805,
"learning_rate": 9.938553448111108e-06,
"loss": 0.1127,
"step": 550
},
{
"epoch": 0.25068243858052774,
"grad_norm": 0.8423641298780182,
"learning_rate": 9.938329856953482e-06,
"loss": 0.0788,
"step": 551
},
{
"epoch": 0.25113739763421294,
"grad_norm": 0.8124861649110975,
"learning_rate": 9.938105862257839e-06,
"loss": 0.0831,
"step": 552
},
{
"epoch": 0.2515923566878981,
"grad_norm": 0.6612222253979325,
"learning_rate": 9.937881464042485e-06,
"loss": 0.0703,
"step": 553
},
{
"epoch": 0.25204731574158323,
"grad_norm": 0.854447666921162,
"learning_rate": 9.937656662325759e-06,
"loss": 0.1074,
"step": 554
},
{
"epoch": 0.25250227479526843,
"grad_norm": 0.74521770368624,
"learning_rate": 9.937431457126028e-06,
"loss": 0.0777,
"step": 555
}
],
"logging_steps": 1,
"max_steps": 10990,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 555,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3666645319680.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}