prm_math_only_hf / trainer_state.json
DongfuJiang's picture
Duplicate from DongfuJiang/prm_version3_full_hf
600f251 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996064541519087,
"eval_steps": 500,
"global_step": 1270,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007870916961826052,
"grad_norm": 4.882866791973475,
"learning_rate": 7.8125e-08,
"loss": 0.357,
"step": 1
},
{
"epoch": 0.0015741833923652105,
"grad_norm": 4.89981767485179,
"learning_rate": 1.5625e-07,
"loss": 0.3398,
"step": 2
},
{
"epoch": 0.0023612750885478157,
"grad_norm": 4.908915346462736,
"learning_rate": 2.3437500000000003e-07,
"loss": 0.3326,
"step": 3
},
{
"epoch": 0.003148366784730421,
"grad_norm": 4.731649267914947,
"learning_rate": 3.125e-07,
"loss": 0.342,
"step": 4
},
{
"epoch": 0.003935458480913027,
"grad_norm": 4.769633474207938,
"learning_rate": 3.90625e-07,
"loss": 0.3431,
"step": 5
},
{
"epoch": 0.004722550177095631,
"grad_norm": 4.754569879633701,
"learning_rate": 4.6875000000000006e-07,
"loss": 0.3369,
"step": 6
},
{
"epoch": 0.005509641873278237,
"grad_norm": 4.354074850343827,
"learning_rate": 5.468750000000001e-07,
"loss": 0.3416,
"step": 7
},
{
"epoch": 0.006296733569460842,
"grad_norm": 4.51384309102365,
"learning_rate": 6.25e-07,
"loss": 0.3577,
"step": 8
},
{
"epoch": 0.0070838252656434475,
"grad_norm": 4.127868399899779,
"learning_rate": 7.03125e-07,
"loss": 0.3447,
"step": 9
},
{
"epoch": 0.007870916961826053,
"grad_norm": 4.268670776824985,
"learning_rate": 7.8125e-07,
"loss": 0.318,
"step": 10
},
{
"epoch": 0.008658008658008658,
"grad_norm": 3.7083716156028674,
"learning_rate": 8.59375e-07,
"loss": 0.3106,
"step": 11
},
{
"epoch": 0.009445100354191263,
"grad_norm": 3.1545864445099263,
"learning_rate": 9.375000000000001e-07,
"loss": 0.3131,
"step": 12
},
{
"epoch": 0.01023219205037387,
"grad_norm": 3.020045714405798,
"learning_rate": 1.0156250000000001e-06,
"loss": 0.3215,
"step": 13
},
{
"epoch": 0.011019283746556474,
"grad_norm": 2.748876831681126,
"learning_rate": 1.0937500000000001e-06,
"loss": 0.3072,
"step": 14
},
{
"epoch": 0.011806375442739079,
"grad_norm": 2.2307366833759485,
"learning_rate": 1.1718750000000001e-06,
"loss": 0.2922,
"step": 15
},
{
"epoch": 0.012593467138921684,
"grad_norm": 2.219422516987874,
"learning_rate": 1.25e-06,
"loss": 0.2842,
"step": 16
},
{
"epoch": 0.01338055883510429,
"grad_norm": 2.7073031779339973,
"learning_rate": 1.328125e-06,
"loss": 0.2674,
"step": 17
},
{
"epoch": 0.014167650531286895,
"grad_norm": 2.873035911017537,
"learning_rate": 1.40625e-06,
"loss": 0.294,
"step": 18
},
{
"epoch": 0.0149547422274695,
"grad_norm": 2.119880363778339,
"learning_rate": 1.484375e-06,
"loss": 0.2693,
"step": 19
},
{
"epoch": 0.015741833923652106,
"grad_norm": 1.7740660860958901,
"learning_rate": 1.5625e-06,
"loss": 0.2607,
"step": 20
},
{
"epoch": 0.01652892561983471,
"grad_norm": 1.654838099179133,
"learning_rate": 1.640625e-06,
"loss": 0.2539,
"step": 21
},
{
"epoch": 0.017316017316017316,
"grad_norm": 2.1067372096520884,
"learning_rate": 1.71875e-06,
"loss": 0.274,
"step": 22
},
{
"epoch": 0.01810310901219992,
"grad_norm": 2.227492365997846,
"learning_rate": 1.796875e-06,
"loss": 0.2667,
"step": 23
},
{
"epoch": 0.018890200708382526,
"grad_norm": 1.9818482942437547,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.2507,
"step": 24
},
{
"epoch": 0.01967729240456513,
"grad_norm": 1.9916019664977938,
"learning_rate": 1.953125e-06,
"loss": 0.2302,
"step": 25
},
{
"epoch": 0.02046438410074774,
"grad_norm": 2.0987871479467533,
"learning_rate": 2.0312500000000002e-06,
"loss": 0.2563,
"step": 26
},
{
"epoch": 0.021251475796930343,
"grad_norm": 1.7851505967742112,
"learning_rate": 2.109375e-06,
"loss": 0.2432,
"step": 27
},
{
"epoch": 0.02203856749311295,
"grad_norm": 1.6067598902195293,
"learning_rate": 2.1875000000000002e-06,
"loss": 0.2466,
"step": 28
},
{
"epoch": 0.022825659189295553,
"grad_norm": 1.436243142469347,
"learning_rate": 2.265625e-06,
"loss": 0.2486,
"step": 29
},
{
"epoch": 0.023612750885478158,
"grad_norm": 1.631080710695958,
"learning_rate": 2.3437500000000002e-06,
"loss": 0.2692,
"step": 30
},
{
"epoch": 0.024399842581660763,
"grad_norm": 1.42554302342302,
"learning_rate": 2.421875e-06,
"loss": 0.2374,
"step": 31
},
{
"epoch": 0.025186934277843367,
"grad_norm": 1.479794666013743,
"learning_rate": 2.5e-06,
"loss": 0.238,
"step": 32
},
{
"epoch": 0.025974025974025976,
"grad_norm": 1.3857185652178832,
"learning_rate": 2.5781250000000004e-06,
"loss": 0.2366,
"step": 33
},
{
"epoch": 0.02676111767020858,
"grad_norm": 1.335993998237778,
"learning_rate": 2.65625e-06,
"loss": 0.2251,
"step": 34
},
{
"epoch": 0.027548209366391185,
"grad_norm": 1.5950255189913525,
"learning_rate": 2.7343750000000004e-06,
"loss": 0.2506,
"step": 35
},
{
"epoch": 0.02833530106257379,
"grad_norm": 1.3773024411686232,
"learning_rate": 2.8125e-06,
"loss": 0.2107,
"step": 36
},
{
"epoch": 0.029122392758756395,
"grad_norm": 1.391558709917223,
"learning_rate": 2.8906250000000004e-06,
"loss": 0.236,
"step": 37
},
{
"epoch": 0.029909484454939,
"grad_norm": 1.4317153691023394,
"learning_rate": 2.96875e-06,
"loss": 0.2369,
"step": 38
},
{
"epoch": 0.030696576151121605,
"grad_norm": 1.5982850208202695,
"learning_rate": 3.0468750000000004e-06,
"loss": 0.2135,
"step": 39
},
{
"epoch": 0.03148366784730421,
"grad_norm": 1.2463748947443163,
"learning_rate": 3.125e-06,
"loss": 0.2109,
"step": 40
},
{
"epoch": 0.032270759543486814,
"grad_norm": 1.489195263138514,
"learning_rate": 3.2031250000000004e-06,
"loss": 0.2254,
"step": 41
},
{
"epoch": 0.03305785123966942,
"grad_norm": 1.1952984228039816,
"learning_rate": 3.28125e-06,
"loss": 0.2124,
"step": 42
},
{
"epoch": 0.033844942935852024,
"grad_norm": 1.3331698750786545,
"learning_rate": 3.3593750000000003e-06,
"loss": 0.2192,
"step": 43
},
{
"epoch": 0.03463203463203463,
"grad_norm": 1.3944936006633961,
"learning_rate": 3.4375e-06,
"loss": 0.2024,
"step": 44
},
{
"epoch": 0.03541912632821724,
"grad_norm": 1.3992213437238004,
"learning_rate": 3.5156250000000003e-06,
"loss": 0.2274,
"step": 45
},
{
"epoch": 0.03620621802439984,
"grad_norm": 1.3664016160053327,
"learning_rate": 3.59375e-06,
"loss": 0.2152,
"step": 46
},
{
"epoch": 0.03699330972058245,
"grad_norm": 1.4891884814728509,
"learning_rate": 3.6718750000000003e-06,
"loss": 0.2292,
"step": 47
},
{
"epoch": 0.03778040141676505,
"grad_norm": 1.3270512221194979,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.1997,
"step": 48
},
{
"epoch": 0.03856749311294766,
"grad_norm": 1.4002885202427642,
"learning_rate": 3.828125000000001e-06,
"loss": 0.2049,
"step": 49
},
{
"epoch": 0.03935458480913026,
"grad_norm": 1.3129491965923514,
"learning_rate": 3.90625e-06,
"loss": 0.2108,
"step": 50
},
{
"epoch": 0.04014167650531287,
"grad_norm": 1.5308223134960726,
"learning_rate": 3.984375e-06,
"loss": 0.2185,
"step": 51
},
{
"epoch": 0.04092876820149548,
"grad_norm": 1.4637864330743848,
"learning_rate": 4.0625000000000005e-06,
"loss": 0.1959,
"step": 52
},
{
"epoch": 0.04171585989767808,
"grad_norm": 1.3727993193826824,
"learning_rate": 4.140625000000001e-06,
"loss": 0.2079,
"step": 53
},
{
"epoch": 0.04250295159386069,
"grad_norm": 1.324719262777532,
"learning_rate": 4.21875e-06,
"loss": 0.1986,
"step": 54
},
{
"epoch": 0.04329004329004329,
"grad_norm": 1.5169702547195179,
"learning_rate": 4.296875e-06,
"loss": 0.2233,
"step": 55
},
{
"epoch": 0.0440771349862259,
"grad_norm": 1.2762861570952524,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.1887,
"step": 56
},
{
"epoch": 0.0448642266824085,
"grad_norm": 1.3276694806352698,
"learning_rate": 4.453125000000001e-06,
"loss": 0.2039,
"step": 57
},
{
"epoch": 0.045651318378591106,
"grad_norm": 1.3146250055299598,
"learning_rate": 4.53125e-06,
"loss": 0.2142,
"step": 58
},
{
"epoch": 0.046438410074773714,
"grad_norm": 1.333854988794211,
"learning_rate": 4.609375e-06,
"loss": 0.1999,
"step": 59
},
{
"epoch": 0.047225501770956316,
"grad_norm": 1.3581177171903593,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.1957,
"step": 60
},
{
"epoch": 0.048012593467138924,
"grad_norm": 1.3130565355707189,
"learning_rate": 4.765625000000001e-06,
"loss": 0.2122,
"step": 61
},
{
"epoch": 0.048799685163321525,
"grad_norm": 1.4142816132337854,
"learning_rate": 4.84375e-06,
"loss": 0.2147,
"step": 62
},
{
"epoch": 0.049586776859504134,
"grad_norm": 1.357898232243354,
"learning_rate": 4.921875e-06,
"loss": 0.1877,
"step": 63
},
{
"epoch": 0.050373868555686735,
"grad_norm": 1.4153342064426397,
"learning_rate": 5e-06,
"loss": 0.1945,
"step": 64
},
{
"epoch": 0.05116096025186934,
"grad_norm": 1.4495342715013748,
"learning_rate": 4.999991517675219e-06,
"loss": 0.1939,
"step": 65
},
{
"epoch": 0.05194805194805195,
"grad_norm": 1.1539274129121713,
"learning_rate": 4.999966070758437e-06,
"loss": 0.2003,
"step": 66
},
{
"epoch": 0.05273514364423455,
"grad_norm": 1.3379283904444008,
"learning_rate": 4.999923659422332e-06,
"loss": 0.2007,
"step": 67
},
{
"epoch": 0.05352223534041716,
"grad_norm": 1.3492954613335875,
"learning_rate": 4.999864283954702e-06,
"loss": 0.1989,
"step": 68
},
{
"epoch": 0.05430932703659976,
"grad_norm": 1.1801853129144864,
"learning_rate": 4.99978794475846e-06,
"loss": 0.2114,
"step": 69
},
{
"epoch": 0.05509641873278237,
"grad_norm": 1.2068999367428581,
"learning_rate": 4.999694642351633e-06,
"loss": 0.2033,
"step": 70
},
{
"epoch": 0.05588351042896497,
"grad_norm": 1.2287271472480104,
"learning_rate": 4.999584377367359e-06,
"loss": 0.1895,
"step": 71
},
{
"epoch": 0.05667060212514758,
"grad_norm": 1.3129837217534652,
"learning_rate": 4.99945715055388e-06,
"loss": 0.1905,
"step": 72
},
{
"epoch": 0.05745769382133018,
"grad_norm": 1.1734967025843308,
"learning_rate": 4.99931296277454e-06,
"loss": 0.213,
"step": 73
},
{
"epoch": 0.05824478551751279,
"grad_norm": 1.3738466570011791,
"learning_rate": 4.999151815007776e-06,
"loss": 0.2214,
"step": 74
},
{
"epoch": 0.0590318772136954,
"grad_norm": 1.273179655688277,
"learning_rate": 4.9989737083471165e-06,
"loss": 0.1894,
"step": 75
},
{
"epoch": 0.059818968909878,
"grad_norm": 1.0843431120214646,
"learning_rate": 4.998778644001165e-06,
"loss": 0.1967,
"step": 76
},
{
"epoch": 0.06060606060606061,
"grad_norm": 1.4896402431576707,
"learning_rate": 4.998566623293603e-06,
"loss": 0.1752,
"step": 77
},
{
"epoch": 0.06139315230224321,
"grad_norm": 1.3405458603738243,
"learning_rate": 4.9983376476631725e-06,
"loss": 0.1998,
"step": 78
},
{
"epoch": 0.06218024399842582,
"grad_norm": 1.3641086369593634,
"learning_rate": 4.998091718663671e-06,
"loss": 0.2047,
"step": 79
},
{
"epoch": 0.06296733569460843,
"grad_norm": 1.3391162585136267,
"learning_rate": 4.997828837963937e-06,
"loss": 0.181,
"step": 80
},
{
"epoch": 0.06375442739079103,
"grad_norm": 1.1899411991269295,
"learning_rate": 4.997549007347843e-06,
"loss": 0.1946,
"step": 81
},
{
"epoch": 0.06454151908697363,
"grad_norm": 1.3917818646896112,
"learning_rate": 4.997252228714279e-06,
"loss": 0.1919,
"step": 82
},
{
"epoch": 0.06532861078315624,
"grad_norm": 1.2543099071691322,
"learning_rate": 4.996938504077145e-06,
"loss": 0.1948,
"step": 83
},
{
"epoch": 0.06611570247933884,
"grad_norm": 1.3941008619735185,
"learning_rate": 4.99660783556533e-06,
"loss": 0.1861,
"step": 84
},
{
"epoch": 0.06690279417552145,
"grad_norm": 1.1765528133487257,
"learning_rate": 4.9962602254227075e-06,
"loss": 0.1817,
"step": 85
},
{
"epoch": 0.06768988587170405,
"grad_norm": 1.223066746932356,
"learning_rate": 4.995895676008109e-06,
"loss": 0.1934,
"step": 86
},
{
"epoch": 0.06847697756788666,
"grad_norm": 1.3140944559909808,
"learning_rate": 4.995514189795316e-06,
"loss": 0.197,
"step": 87
},
{
"epoch": 0.06926406926406926,
"grad_norm": 1.1819977914205286,
"learning_rate": 4.99511576937304e-06,
"loss": 0.1972,
"step": 88
},
{
"epoch": 0.07005116096025187,
"grad_norm": 1.3152579578345207,
"learning_rate": 4.994700417444907e-06,
"loss": 0.207,
"step": 89
},
{
"epoch": 0.07083825265643448,
"grad_norm": 1.2064669225701854,
"learning_rate": 4.994268136829438e-06,
"loss": 0.1953,
"step": 90
},
{
"epoch": 0.07162534435261708,
"grad_norm": 1.1619755664518439,
"learning_rate": 4.993818930460026e-06,
"loss": 0.1982,
"step": 91
},
{
"epoch": 0.07241243604879968,
"grad_norm": 1.1792837872493809,
"learning_rate": 4.993352801384924e-06,
"loss": 0.1886,
"step": 92
},
{
"epoch": 0.07319952774498228,
"grad_norm": 1.097328306217708,
"learning_rate": 4.992869752767218e-06,
"loss": 0.1673,
"step": 93
},
{
"epoch": 0.0739866194411649,
"grad_norm": 1.2788239338552108,
"learning_rate": 4.992369787884809e-06,
"loss": 0.1972,
"step": 94
},
{
"epoch": 0.0747737111373475,
"grad_norm": 1.1905278770669998,
"learning_rate": 4.991852910130388e-06,
"loss": 0.1872,
"step": 95
},
{
"epoch": 0.0755608028335301,
"grad_norm": 1.2133270115400816,
"learning_rate": 4.9913191230114154e-06,
"loss": 0.1748,
"step": 96
},
{
"epoch": 0.07634789452971272,
"grad_norm": 1.2840440499091732,
"learning_rate": 4.990768430150096e-06,
"loss": 0.1942,
"step": 97
},
{
"epoch": 0.07713498622589532,
"grad_norm": 1.5346248945491554,
"learning_rate": 4.990200835283353e-06,
"loss": 0.1861,
"step": 98
},
{
"epoch": 0.07792207792207792,
"grad_norm": 1.1936205681426777,
"learning_rate": 4.989616342262807e-06,
"loss": 0.1975,
"step": 99
},
{
"epoch": 0.07870916961826052,
"grad_norm": 1.2662437794316659,
"learning_rate": 4.989014955054746e-06,
"loss": 0.1853,
"step": 100
},
{
"epoch": 0.07949626131444314,
"grad_norm": 1.116915072535967,
"learning_rate": 4.988396677740097e-06,
"loss": 0.1738,
"step": 101
},
{
"epoch": 0.08028335301062574,
"grad_norm": 1.1577366023558335,
"learning_rate": 4.9877615145144055e-06,
"loss": 0.2045,
"step": 102
},
{
"epoch": 0.08107044470680834,
"grad_norm": 1.1022178093801993,
"learning_rate": 4.9871094696878e-06,
"loss": 0.1814,
"step": 103
},
{
"epoch": 0.08185753640299095,
"grad_norm": 1.218327314143879,
"learning_rate": 4.986440547684963e-06,
"loss": 0.1822,
"step": 104
},
{
"epoch": 0.08264462809917356,
"grad_norm": 1.0747362510591434,
"learning_rate": 4.985754753045108e-06,
"loss": 0.1639,
"step": 105
},
{
"epoch": 0.08343171979535616,
"grad_norm": 1.236686976609853,
"learning_rate": 4.9850520904219406e-06,
"loss": 0.1773,
"step": 106
},
{
"epoch": 0.08421881149153876,
"grad_norm": 1.2843110878866029,
"learning_rate": 4.98433256458363e-06,
"loss": 0.1931,
"step": 107
},
{
"epoch": 0.08500590318772137,
"grad_norm": 1.2556201190754803,
"learning_rate": 4.983596180412779e-06,
"loss": 0.1891,
"step": 108
},
{
"epoch": 0.08579299488390398,
"grad_norm": 1.1736861180333642,
"learning_rate": 4.982842942906386e-06,
"loss": 0.1932,
"step": 109
},
{
"epoch": 0.08658008658008658,
"grad_norm": 1.250703274500956,
"learning_rate": 4.982072857175816e-06,
"loss": 0.1979,
"step": 110
},
{
"epoch": 0.08736717827626919,
"grad_norm": 1.111910462348759,
"learning_rate": 4.981285928446762e-06,
"loss": 0.1729,
"step": 111
},
{
"epoch": 0.0881542699724518,
"grad_norm": 1.2267625409230847,
"learning_rate": 4.980482162059214e-06,
"loss": 0.1993,
"step": 112
},
{
"epoch": 0.0889413616686344,
"grad_norm": 1.350342930816002,
"learning_rate": 4.979661563467415e-06,
"loss": 0.1914,
"step": 113
},
{
"epoch": 0.089728453364817,
"grad_norm": 0.9837790085016399,
"learning_rate": 4.978824138239835e-06,
"loss": 0.1852,
"step": 114
},
{
"epoch": 0.09051554506099961,
"grad_norm": 1.1412715811918805,
"learning_rate": 4.977969892059123e-06,
"loss": 0.1791,
"step": 115
},
{
"epoch": 0.09130263675718221,
"grad_norm": 1.091735318231847,
"learning_rate": 4.977098830722074e-06,
"loss": 0.1879,
"step": 116
},
{
"epoch": 0.09208972845336481,
"grad_norm": 1.1356995797773966,
"learning_rate": 4.976210960139587e-06,
"loss": 0.1942,
"step": 117
},
{
"epoch": 0.09287682014954743,
"grad_norm": 1.197221158258512,
"learning_rate": 4.975306286336628e-06,
"loss": 0.1822,
"step": 118
},
{
"epoch": 0.09366391184573003,
"grad_norm": 1.1622435205009634,
"learning_rate": 4.974384815452187e-06,
"loss": 0.1938,
"step": 119
},
{
"epoch": 0.09445100354191263,
"grad_norm": 1.1980574826361372,
"learning_rate": 4.9734465537392365e-06,
"loss": 0.1703,
"step": 120
},
{
"epoch": 0.09523809523809523,
"grad_norm": 1.090793092501407,
"learning_rate": 4.972491507564688e-06,
"loss": 0.1681,
"step": 121
},
{
"epoch": 0.09602518693427785,
"grad_norm": 1.2120296842604672,
"learning_rate": 4.9715196834093525e-06,
"loss": 0.1562,
"step": 122
},
{
"epoch": 0.09681227863046045,
"grad_norm": 1.1420618659168036,
"learning_rate": 4.97053108786789e-06,
"loss": 0.1812,
"step": 123
},
{
"epoch": 0.09759937032664305,
"grad_norm": 1.145370838994205,
"learning_rate": 4.969525727648774e-06,
"loss": 0.1873,
"step": 124
},
{
"epoch": 0.09838646202282567,
"grad_norm": 1.1676600414602372,
"learning_rate": 4.9685036095742365e-06,
"loss": 0.1972,
"step": 125
},
{
"epoch": 0.09917355371900827,
"grad_norm": 1.204479600477317,
"learning_rate": 4.967464740580228e-06,
"loss": 0.1904,
"step": 126
},
{
"epoch": 0.09996064541519087,
"grad_norm": 1.119994012971968,
"learning_rate": 4.9664091277163664e-06,
"loss": 0.1851,
"step": 127
},
{
"epoch": 0.10074773711137347,
"grad_norm": 1.1043684264734095,
"learning_rate": 4.9653367781458946e-06,
"loss": 0.1926,
"step": 128
},
{
"epoch": 0.10153482880755609,
"grad_norm": 1.127680976136701,
"learning_rate": 4.964247699145626e-06,
"loss": 0.1886,
"step": 129
},
{
"epoch": 0.10232192050373869,
"grad_norm": 1.1813875803533898,
"learning_rate": 4.963141898105898e-06,
"loss": 0.1858,
"step": 130
},
{
"epoch": 0.10310901219992129,
"grad_norm": 1.1497128287458092,
"learning_rate": 4.962019382530521e-06,
"loss": 0.1724,
"step": 131
},
{
"epoch": 0.1038961038961039,
"grad_norm": 1.223498886081565,
"learning_rate": 4.960880160036728e-06,
"loss": 0.194,
"step": 132
},
{
"epoch": 0.1046831955922865,
"grad_norm": 1.1861652899170938,
"learning_rate": 4.959724238355124e-06,
"loss": 0.1841,
"step": 133
},
{
"epoch": 0.1054702872884691,
"grad_norm": 1.0805114288365025,
"learning_rate": 4.958551625329631e-06,
"loss": 0.1646,
"step": 134
},
{
"epoch": 0.10625737898465171,
"grad_norm": 1.29808710670669,
"learning_rate": 4.957362328917437e-06,
"loss": 0.1833,
"step": 135
},
{
"epoch": 0.10704447068083432,
"grad_norm": 1.2660501691777906,
"learning_rate": 4.95615635718894e-06,
"loss": 0.1753,
"step": 136
},
{
"epoch": 0.10783156237701692,
"grad_norm": 1.1429230314494303,
"learning_rate": 4.954933718327697e-06,
"loss": 0.1734,
"step": 137
},
{
"epoch": 0.10861865407319952,
"grad_norm": 1.114357361335831,
"learning_rate": 4.953694420630361e-06,
"loss": 0.1925,
"step": 138
},
{
"epoch": 0.10940574576938213,
"grad_norm": 1.1238119767186239,
"learning_rate": 4.952438472506636e-06,
"loss": 0.1805,
"step": 139
},
{
"epoch": 0.11019283746556474,
"grad_norm": 1.1524735878912507,
"learning_rate": 4.951165882479206e-06,
"loss": 0.1783,
"step": 140
},
{
"epoch": 0.11097992916174734,
"grad_norm": 1.0546198047284017,
"learning_rate": 4.949876659183693e-06,
"loss": 0.1745,
"step": 141
},
{
"epoch": 0.11176702085792994,
"grad_norm": 1.0925714956018635,
"learning_rate": 4.94857081136858e-06,
"loss": 0.1763,
"step": 142
},
{
"epoch": 0.11255411255411256,
"grad_norm": 1.1039385653204372,
"learning_rate": 4.947248347895172e-06,
"loss": 0.1777,
"step": 143
},
{
"epoch": 0.11334120425029516,
"grad_norm": 1.145622347104172,
"learning_rate": 4.945909277737519e-06,
"loss": 0.1804,
"step": 144
},
{
"epoch": 0.11412829594647776,
"grad_norm": 1.0810330697861197,
"learning_rate": 4.944553609982363e-06,
"loss": 0.18,
"step": 145
},
{
"epoch": 0.11491538764266036,
"grad_norm": 1.079722871077113,
"learning_rate": 4.943181353829077e-06,
"loss": 0.1805,
"step": 146
},
{
"epoch": 0.11570247933884298,
"grad_norm": 1.2122723148500483,
"learning_rate": 4.941792518589596e-06,
"loss": 0.2113,
"step": 147
},
{
"epoch": 0.11648957103502558,
"grad_norm": 1.1619622709214918,
"learning_rate": 4.940387113688364e-06,
"loss": 0.1714,
"step": 148
},
{
"epoch": 0.11727666273120818,
"grad_norm": 1.0508760593348456,
"learning_rate": 4.93896514866226e-06,
"loss": 0.1625,
"step": 149
},
{
"epoch": 0.1180637544273908,
"grad_norm": 1.0710088382142664,
"learning_rate": 4.93752663316054e-06,
"loss": 0.1778,
"step": 150
},
{
"epoch": 0.1188508461235734,
"grad_norm": 1.0503531295721205,
"learning_rate": 4.936071576944769e-06,
"loss": 0.1726,
"step": 151
},
{
"epoch": 0.119637937819756,
"grad_norm": 1.0686610020146463,
"learning_rate": 4.934599989888753e-06,
"loss": 0.1769,
"step": 152
},
{
"epoch": 0.1204250295159386,
"grad_norm": 1.072378297090023,
"learning_rate": 4.933111881978478e-06,
"loss": 0.1866,
"step": 153
},
{
"epoch": 0.12121212121212122,
"grad_norm": 1.2495883030259693,
"learning_rate": 4.931607263312033e-06,
"loss": 0.1998,
"step": 154
},
{
"epoch": 0.12199921290830382,
"grad_norm": 1.109893027407933,
"learning_rate": 4.93008614409955e-06,
"loss": 0.1805,
"step": 155
},
{
"epoch": 0.12278630460448642,
"grad_norm": 1.1570851370725408,
"learning_rate": 4.928548534663133e-06,
"loss": 0.1725,
"step": 156
},
{
"epoch": 0.12357339630066903,
"grad_norm": 1.1758781032456742,
"learning_rate": 4.9269944454367815e-06,
"loss": 0.176,
"step": 157
},
{
"epoch": 0.12436048799685163,
"grad_norm": 1.1408455648753233,
"learning_rate": 4.925423886966328e-06,
"loss": 0.1848,
"step": 158
},
{
"epoch": 0.12514757969303425,
"grad_norm": 1.1318514267380126,
"learning_rate": 4.923836869909363e-06,
"loss": 0.1764,
"step": 159
},
{
"epoch": 0.12593467138921685,
"grad_norm": 1.1451300788977063,
"learning_rate": 4.9222334050351595e-06,
"loss": 0.1756,
"step": 160
},
{
"epoch": 0.12672176308539945,
"grad_norm": 1.1117305593028235,
"learning_rate": 4.920613503224608e-06,
"loss": 0.1797,
"step": 161
},
{
"epoch": 0.12750885478158205,
"grad_norm": 1.1301581138966732,
"learning_rate": 4.9189771754701335e-06,
"loss": 0.1675,
"step": 162
},
{
"epoch": 0.12829594647776466,
"grad_norm": 1.0326917294149387,
"learning_rate": 4.917324432875627e-06,
"loss": 0.1784,
"step": 163
},
{
"epoch": 0.12908303817394726,
"grad_norm": 1.1983588521884831,
"learning_rate": 4.915655286656368e-06,
"loss": 0.1966,
"step": 164
},
{
"epoch": 0.12987012987012986,
"grad_norm": 1.0140424703790007,
"learning_rate": 4.9139697481389505e-06,
"loss": 0.1744,
"step": 165
},
{
"epoch": 0.1306572215663125,
"grad_norm": 1.223539092779737,
"learning_rate": 4.9122678287612e-06,
"loss": 0.1831,
"step": 166
},
{
"epoch": 0.1314443132624951,
"grad_norm": 1.0918972348910556,
"learning_rate": 4.910549540072104e-06,
"loss": 0.1843,
"step": 167
},
{
"epoch": 0.1322314049586777,
"grad_norm": 1.1292739304249166,
"learning_rate": 4.908814893731728e-06,
"loss": 0.1552,
"step": 168
},
{
"epoch": 0.1330184966548603,
"grad_norm": 1.1923518362383727,
"learning_rate": 4.9070639015111406e-06,
"loss": 0.1895,
"step": 169
},
{
"epoch": 0.1338055883510429,
"grad_norm": 1.083542335588892,
"learning_rate": 4.905296575292329e-06,
"loss": 0.1745,
"step": 170
},
{
"epoch": 0.1345926800472255,
"grad_norm": 1.2673623015109376,
"learning_rate": 4.90351292706812e-06,
"loss": 0.1726,
"step": 171
},
{
"epoch": 0.1353797717434081,
"grad_norm": 1.1129476624507257,
"learning_rate": 4.901712968942101e-06,
"loss": 0.1706,
"step": 172
},
{
"epoch": 0.13616686343959072,
"grad_norm": 1.1735922432656085,
"learning_rate": 4.899896713128536e-06,
"loss": 0.1741,
"step": 173
},
{
"epoch": 0.13695395513577333,
"grad_norm": 1.2331570034422519,
"learning_rate": 4.898064171952281e-06,
"loss": 0.1946,
"step": 174
},
{
"epoch": 0.13774104683195593,
"grad_norm": 1.2376618802061816,
"learning_rate": 4.896215357848706e-06,
"loss": 0.1715,
"step": 175
},
{
"epoch": 0.13852813852813853,
"grad_norm": 1.0860947256302276,
"learning_rate": 4.894350283363603e-06,
"loss": 0.1664,
"step": 176
},
{
"epoch": 0.13931523022432113,
"grad_norm": 1.1284792933006988,
"learning_rate": 4.892468961153105e-06,
"loss": 0.1721,
"step": 177
},
{
"epoch": 0.14010232192050373,
"grad_norm": 1.1811695933066144,
"learning_rate": 4.8905714039836026e-06,
"loss": 0.1768,
"step": 178
},
{
"epoch": 0.14088941361668633,
"grad_norm": 1.1690172197627666,
"learning_rate": 4.888657624731652e-06,
"loss": 0.1784,
"step": 179
},
{
"epoch": 0.14167650531286896,
"grad_norm": 1.2215187765329307,
"learning_rate": 4.88672763638389e-06,
"loss": 0.1762,
"step": 180
},
{
"epoch": 0.14246359700905156,
"grad_norm": 1.1657625904368065,
"learning_rate": 4.884781452036948e-06,
"loss": 0.1754,
"step": 181
},
{
"epoch": 0.14325068870523416,
"grad_norm": 1.0812019740421663,
"learning_rate": 4.88281908489736e-06,
"loss": 0.1745,
"step": 182
},
{
"epoch": 0.14403778040141677,
"grad_norm": 1.1662972444477193,
"learning_rate": 4.880840548281475e-06,
"loss": 0.1844,
"step": 183
},
{
"epoch": 0.14482487209759937,
"grad_norm": 1.1318261660435303,
"learning_rate": 4.878845855615364e-06,
"loss": 0.177,
"step": 184
},
{
"epoch": 0.14561196379378197,
"grad_norm": 1.0454173174935852,
"learning_rate": 4.876835020434733e-06,
"loss": 0.1726,
"step": 185
},
{
"epoch": 0.14639905548996457,
"grad_norm": 1.1649728572384528,
"learning_rate": 4.874808056384826e-06,
"loss": 0.1829,
"step": 186
},
{
"epoch": 0.1471861471861472,
"grad_norm": 0.9809711097737751,
"learning_rate": 4.8727649772203375e-06,
"loss": 0.1626,
"step": 187
},
{
"epoch": 0.1479732388823298,
"grad_norm": 1.0024677570018588,
"learning_rate": 4.8707057968053175e-06,
"loss": 0.1564,
"step": 188
},
{
"epoch": 0.1487603305785124,
"grad_norm": 1.0801740218719516,
"learning_rate": 4.868630529113075e-06,
"loss": 0.1571,
"step": 189
},
{
"epoch": 0.149547422274695,
"grad_norm": 1.0633734918657578,
"learning_rate": 4.866539188226086e-06,
"loss": 0.1558,
"step": 190
},
{
"epoch": 0.1503345139708776,
"grad_norm": 1.1110942685300096,
"learning_rate": 4.864431788335895e-06,
"loss": 0.1739,
"step": 191
},
{
"epoch": 0.1511216056670602,
"grad_norm": 1.088865739839623,
"learning_rate": 4.862308343743024e-06,
"loss": 0.1705,
"step": 192
},
{
"epoch": 0.1519086973632428,
"grad_norm": 1.158763785538179,
"learning_rate": 4.86016886885687e-06,
"loss": 0.1754,
"step": 193
},
{
"epoch": 0.15269578905942544,
"grad_norm": 1.0665033787081621,
"learning_rate": 4.858013378195609e-06,
"loss": 0.1814,
"step": 194
},
{
"epoch": 0.15348288075560804,
"grad_norm": 1.0347506383595513,
"learning_rate": 4.855841886386099e-06,
"loss": 0.1659,
"step": 195
},
{
"epoch": 0.15426997245179064,
"grad_norm": 1.3652087096932124,
"learning_rate": 4.8536544081637785e-06,
"loss": 0.1693,
"step": 196
},
{
"epoch": 0.15505706414797324,
"grad_norm": 1.1571980267809596,
"learning_rate": 4.8514509583725685e-06,
"loss": 0.1735,
"step": 197
},
{
"epoch": 0.15584415584415584,
"grad_norm": 1.1126611625924816,
"learning_rate": 4.849231551964771e-06,
"loss": 0.1878,
"step": 198
},
{
"epoch": 0.15663124754033844,
"grad_norm": 1.0666827506948415,
"learning_rate": 4.846996204000967e-06,
"loss": 0.1686,
"step": 199
},
{
"epoch": 0.15741833923652104,
"grad_norm": 1.1408187983192677,
"learning_rate": 4.844744929649912e-06,
"loss": 0.1785,
"step": 200
},
{
"epoch": 0.15820543093270367,
"grad_norm": 1.1050850982745672,
"learning_rate": 4.842477744188441e-06,
"loss": 0.1663,
"step": 201
},
{
"epoch": 0.15899252262888627,
"grad_norm": 1.0153624350350885,
"learning_rate": 4.840194663001354e-06,
"loss": 0.1755,
"step": 202
},
{
"epoch": 0.15977961432506887,
"grad_norm": 1.0251264155888737,
"learning_rate": 4.837895701581322e-06,
"loss": 0.1537,
"step": 203
},
{
"epoch": 0.16056670602125148,
"grad_norm": 1.0673153393456505,
"learning_rate": 4.835580875528776e-06,
"loss": 0.1633,
"step": 204
},
{
"epoch": 0.16135379771743408,
"grad_norm": 1.0273828987011315,
"learning_rate": 4.833250200551798e-06,
"loss": 0.1746,
"step": 205
},
{
"epoch": 0.16214088941361668,
"grad_norm": 1.0964068866663357,
"learning_rate": 4.830903692466023e-06,
"loss": 0.1674,
"step": 206
},
{
"epoch": 0.16292798110979928,
"grad_norm": 1.1142080493277295,
"learning_rate": 4.828541367194527e-06,
"loss": 0.1828,
"step": 207
},
{
"epoch": 0.1637150728059819,
"grad_norm": 1.0617790409690397,
"learning_rate": 4.826163240767717e-06,
"loss": 0.1676,
"step": 208
},
{
"epoch": 0.1645021645021645,
"grad_norm": 1.2859855971245049,
"learning_rate": 4.8237693293232256e-06,
"loss": 0.1942,
"step": 209
},
{
"epoch": 0.1652892561983471,
"grad_norm": 1.000840540957111,
"learning_rate": 4.821359649105801e-06,
"loss": 0.1686,
"step": 210
},
{
"epoch": 0.1660763478945297,
"grad_norm": 1.049595380158752,
"learning_rate": 4.818934216467195e-06,
"loss": 0.1696,
"step": 211
},
{
"epoch": 0.16686343959071231,
"grad_norm": 1.0218031530162965,
"learning_rate": 4.816493047866053e-06,
"loss": 0.1653,
"step": 212
},
{
"epoch": 0.16765053128689492,
"grad_norm": 1.0715206508098112,
"learning_rate": 4.8140361598678034e-06,
"loss": 0.1735,
"step": 213
},
{
"epoch": 0.16843762298307752,
"grad_norm": 1.093161202120212,
"learning_rate": 4.811563569144544e-06,
"loss": 0.1698,
"step": 214
},
{
"epoch": 0.16922471467926015,
"grad_norm": 1.078958887147992,
"learning_rate": 4.809075292474929e-06,
"loss": 0.1671,
"step": 215
},
{
"epoch": 0.17001180637544275,
"grad_norm": 1.1213364259804648,
"learning_rate": 4.806571346744053e-06,
"loss": 0.1798,
"step": 216
},
{
"epoch": 0.17079889807162535,
"grad_norm": 1.102076724202232,
"learning_rate": 4.804051748943343e-06,
"loss": 0.1845,
"step": 217
},
{
"epoch": 0.17158598976780795,
"grad_norm": 1.1103430873095865,
"learning_rate": 4.801516516170437e-06,
"loss": 0.177,
"step": 218
},
{
"epoch": 0.17237308146399055,
"grad_norm": 1.228711789290585,
"learning_rate": 4.798965665629068e-06,
"loss": 0.1636,
"step": 219
},
{
"epoch": 0.17316017316017315,
"grad_norm": 1.1219855198900837,
"learning_rate": 4.796399214628949e-06,
"loss": 0.1802,
"step": 220
},
{
"epoch": 0.17394726485635575,
"grad_norm": 1.1846418832749555,
"learning_rate": 4.7938171805856596e-06,
"loss": 0.1717,
"step": 221
},
{
"epoch": 0.17473435655253838,
"grad_norm": 1.0672386815907553,
"learning_rate": 4.791219581020518e-06,
"loss": 0.1663,
"step": 222
},
{
"epoch": 0.17552144824872098,
"grad_norm": 1.0398388591323704,
"learning_rate": 4.788606433560473e-06,
"loss": 0.1593,
"step": 223
},
{
"epoch": 0.1763085399449036,
"grad_norm": 1.1402534682960337,
"learning_rate": 4.785977755937977e-06,
"loss": 0.1876,
"step": 224
},
{
"epoch": 0.1770956316410862,
"grad_norm": 1.1260603683997887,
"learning_rate": 4.783333565990865e-06,
"loss": 0.172,
"step": 225
},
{
"epoch": 0.1778827233372688,
"grad_norm": 1.062290554096683,
"learning_rate": 4.780673881662242e-06,
"loss": 0.1709,
"step": 226
},
{
"epoch": 0.1786698150334514,
"grad_norm": 1.0650729387286197,
"learning_rate": 4.777998721000353e-06,
"loss": 0.1614,
"step": 227
},
{
"epoch": 0.179456906729634,
"grad_norm": 1.0365419204779498,
"learning_rate": 4.775308102158461e-06,
"loss": 0.1605,
"step": 228
},
{
"epoch": 0.18024399842581662,
"grad_norm": 1.1444494636007958,
"learning_rate": 4.772602043394731e-06,
"loss": 0.1867,
"step": 229
},
{
"epoch": 0.18103109012199922,
"grad_norm": 1.1053808430839196,
"learning_rate": 4.769880563072097e-06,
"loss": 0.1627,
"step": 230
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.0763207393373317,
"learning_rate": 4.767143679658143e-06,
"loss": 0.1703,
"step": 231
},
{
"epoch": 0.18260527351436442,
"grad_norm": 1.1302336936081483,
"learning_rate": 4.764391411724977e-06,
"loss": 0.1697,
"step": 232
},
{
"epoch": 0.18339236521054703,
"grad_norm": 1.059980991296742,
"learning_rate": 4.7616237779491026e-06,
"loss": 0.1658,
"step": 233
},
{
"epoch": 0.18417945690672963,
"grad_norm": 1.0952807461742509,
"learning_rate": 4.758840797111295e-06,
"loss": 0.1833,
"step": 234
},
{
"epoch": 0.18496654860291223,
"grad_norm": 1.0263555674269131,
"learning_rate": 4.756042488096472e-06,
"loss": 0.1732,
"step": 235
},
{
"epoch": 0.18575364029909486,
"grad_norm": 1.088261327233659,
"learning_rate": 4.753228869893566e-06,
"loss": 0.1646,
"step": 236
},
{
"epoch": 0.18654073199527746,
"grad_norm": 1.0644325115229099,
"learning_rate": 4.750399961595395e-06,
"loss": 0.1576,
"step": 237
},
{
"epoch": 0.18732782369146006,
"grad_norm": 0.9952967090049917,
"learning_rate": 4.747555782398537e-06,
"loss": 0.1598,
"step": 238
},
{
"epoch": 0.18811491538764266,
"grad_norm": 1.0300249714418026,
"learning_rate": 4.7446963516031904e-06,
"loss": 0.1883,
"step": 239
},
{
"epoch": 0.18890200708382526,
"grad_norm": 1.0275382678304879,
"learning_rate": 4.741821688613054e-06,
"loss": 0.1704,
"step": 240
},
{
"epoch": 0.18968909878000786,
"grad_norm": 1.0616733952682182,
"learning_rate": 4.738931812935186e-06,
"loss": 0.1907,
"step": 241
},
{
"epoch": 0.19047619047619047,
"grad_norm": 1.0103628221724312,
"learning_rate": 4.736026744179878e-06,
"loss": 0.1556,
"step": 242
},
{
"epoch": 0.1912632821723731,
"grad_norm": 1.0535669337117792,
"learning_rate": 4.73310650206052e-06,
"loss": 0.1809,
"step": 243
},
{
"epoch": 0.1920503738685557,
"grad_norm": 1.0554553563643476,
"learning_rate": 4.730171106393466e-06,
"loss": 0.1675,
"step": 244
},
{
"epoch": 0.1928374655647383,
"grad_norm": 0.9417424551436594,
"learning_rate": 4.7272205770979e-06,
"loss": 0.1438,
"step": 245
},
{
"epoch": 0.1936245572609209,
"grad_norm": 1.1154888244817747,
"learning_rate": 4.724254934195698e-06,
"loss": 0.1765,
"step": 246
},
{
"epoch": 0.1944116489571035,
"grad_norm": 1.1742188581521773,
"learning_rate": 4.721274197811298e-06,
"loss": 0.1711,
"step": 247
},
{
"epoch": 0.1951987406532861,
"grad_norm": 1.057640390538921,
"learning_rate": 4.71827838817156e-06,
"loss": 0.1678,
"step": 248
},
{
"epoch": 0.1959858323494687,
"grad_norm": 1.022336905613029,
"learning_rate": 4.715267525605627e-06,
"loss": 0.1552,
"step": 249
},
{
"epoch": 0.19677292404565133,
"grad_norm": 1.181830506383501,
"learning_rate": 4.712241630544792e-06,
"loss": 0.1765,
"step": 250
},
{
"epoch": 0.19756001574183393,
"grad_norm": 1.1571296526874602,
"learning_rate": 4.709200723522353e-06,
"loss": 0.1758,
"step": 251
},
{
"epoch": 0.19834710743801653,
"grad_norm": 1.082056647389628,
"learning_rate": 4.706144825173481e-06,
"loss": 0.1638,
"step": 252
},
{
"epoch": 0.19913419913419914,
"grad_norm": 1.0648327864294944,
"learning_rate": 4.703073956235071e-06,
"loss": 0.1747,
"step": 253
},
{
"epoch": 0.19992129083038174,
"grad_norm": 1.1273460773870558,
"learning_rate": 4.6999881375456116e-06,
"loss": 0.1767,
"step": 254
},
{
"epoch": 0.20070838252656434,
"grad_norm": 1.0782376126285664,
"learning_rate": 4.696887390045035e-06,
"loss": 0.169,
"step": 255
},
{
"epoch": 0.20149547422274694,
"grad_norm": 1.043398805036875,
"learning_rate": 4.693771734774578e-06,
"loss": 0.1774,
"step": 256
},
{
"epoch": 0.20228256591892957,
"grad_norm": 1.067320862475683,
"learning_rate": 4.690641192876643e-06,
"loss": 0.1607,
"step": 257
},
{
"epoch": 0.20306965761511217,
"grad_norm": 1.1843944163744937,
"learning_rate": 4.687495785594646e-06,
"loss": 0.1633,
"step": 258
},
{
"epoch": 0.20385674931129477,
"grad_norm": 1.0931562611646284,
"learning_rate": 4.684335534272881e-06,
"loss": 0.1687,
"step": 259
},
{
"epoch": 0.20464384100747737,
"grad_norm": 1.1204870400497637,
"learning_rate": 4.68116046035637e-06,
"loss": 0.1639,
"step": 260
},
{
"epoch": 0.20543093270365997,
"grad_norm": 1.2082791443480092,
"learning_rate": 4.6779705853907205e-06,
"loss": 0.1683,
"step": 261
},
{
"epoch": 0.20621802439984258,
"grad_norm": 1.0646518318192153,
"learning_rate": 4.674765931021976e-06,
"loss": 0.1611,
"step": 262
},
{
"epoch": 0.20700511609602518,
"grad_norm": 1.1268791395123645,
"learning_rate": 4.671546518996473e-06,
"loss": 0.1553,
"step": 263
},
{
"epoch": 0.2077922077922078,
"grad_norm": 1.0048534045343525,
"learning_rate": 4.668312371160688e-06,
"loss": 0.1571,
"step": 264
},
{
"epoch": 0.2085792994883904,
"grad_norm": 1.0052893495164037,
"learning_rate": 4.665063509461098e-06,
"loss": 0.1679,
"step": 265
},
{
"epoch": 0.209366391184573,
"grad_norm": 0.9679422598052939,
"learning_rate": 4.661799955944019e-06,
"loss": 0.1556,
"step": 266
},
{
"epoch": 0.2101534828807556,
"grad_norm": 1.0487292157874373,
"learning_rate": 4.658521732755471e-06,
"loss": 0.183,
"step": 267
},
{
"epoch": 0.2109405745769382,
"grad_norm": 1.0878511570789495,
"learning_rate": 4.655228862141017e-06,
"loss": 0.1762,
"step": 268
},
{
"epoch": 0.2117276662731208,
"grad_norm": 0.9275216638767947,
"learning_rate": 4.651921366445613e-06,
"loss": 0.1483,
"step": 269
},
{
"epoch": 0.21251475796930341,
"grad_norm": 1.0291173856009612,
"learning_rate": 4.648599268113464e-06,
"loss": 0.1657,
"step": 270
},
{
"epoch": 0.21330184966548604,
"grad_norm": 0.9814951923963836,
"learning_rate": 4.645262589687861e-06,
"loss": 0.1737,
"step": 271
},
{
"epoch": 0.21408894136166864,
"grad_norm": 0.9574503772544043,
"learning_rate": 4.641911353811038e-06,
"loss": 0.1638,
"step": 272
},
{
"epoch": 0.21487603305785125,
"grad_norm": 0.9684496500051328,
"learning_rate": 4.638545583224011e-06,
"loss": 0.1649,
"step": 273
},
{
"epoch": 0.21566312475403385,
"grad_norm": 1.0314787067828541,
"learning_rate": 4.635165300766428e-06,
"loss": 0.1699,
"step": 274
},
{
"epoch": 0.21645021645021645,
"grad_norm": 1.0287264097080684,
"learning_rate": 4.63177052937641e-06,
"loss": 0.1602,
"step": 275
},
{
"epoch": 0.21723730814639905,
"grad_norm": 1.1114659065296888,
"learning_rate": 4.628361292090403e-06,
"loss": 0.1783,
"step": 276
},
{
"epoch": 0.21802439984258165,
"grad_norm": 1.0298788844790752,
"learning_rate": 4.6249376120430115e-06,
"loss": 0.1678,
"step": 277
},
{
"epoch": 0.21881149153876425,
"grad_norm": 1.0099420287081406,
"learning_rate": 4.621499512466847e-06,
"loss": 0.1672,
"step": 278
},
{
"epoch": 0.21959858323494688,
"grad_norm": 0.9892117727941296,
"learning_rate": 4.618047016692374e-06,
"loss": 0.1663,
"step": 279
},
{
"epoch": 0.22038567493112948,
"grad_norm": 0.9289360238552057,
"learning_rate": 4.614580148147744e-06,
"loss": 0.1563,
"step": 280
},
{
"epoch": 0.22117276662731208,
"grad_norm": 0.9603340451855991,
"learning_rate": 4.61109893035864e-06,
"loss": 0.1561,
"step": 281
},
{
"epoch": 0.22195985832349469,
"grad_norm": 1.0449269347565262,
"learning_rate": 4.607603386948119e-06,
"loss": 0.165,
"step": 282
},
{
"epoch": 0.2227469500196773,
"grad_norm": 0.990226128298578,
"learning_rate": 4.604093541636448e-06,
"loss": 0.1704,
"step": 283
},
{
"epoch": 0.2235340417158599,
"grad_norm": 1.031797952555019,
"learning_rate": 4.600569418240946e-06,
"loss": 0.1677,
"step": 284
},
{
"epoch": 0.2243211334120425,
"grad_norm": 1.0506428763431659,
"learning_rate": 4.597031040675819e-06,
"loss": 0.1802,
"step": 285
},
{
"epoch": 0.22510822510822512,
"grad_norm": 0.980146123693525,
"learning_rate": 4.593478432952002e-06,
"loss": 0.1656,
"step": 286
},
{
"epoch": 0.22589531680440772,
"grad_norm": 1.0058178922055618,
"learning_rate": 4.589911619176993e-06,
"loss": 0.1601,
"step": 287
},
{
"epoch": 0.22668240850059032,
"grad_norm": 1.1532752501338874,
"learning_rate": 4.586330623554691e-06,
"loss": 0.1707,
"step": 288
},
{
"epoch": 0.22746950019677292,
"grad_norm": 0.9925104519486038,
"learning_rate": 4.582735470385229e-06,
"loss": 0.1712,
"step": 289
},
{
"epoch": 0.22825659189295552,
"grad_norm": 1.1312813134045174,
"learning_rate": 4.579126184064814e-06,
"loss": 0.1607,
"step": 290
},
{
"epoch": 0.22904368358913813,
"grad_norm": 1.2454875330122912,
"learning_rate": 4.575502789085555e-06,
"loss": 0.1656,
"step": 291
},
{
"epoch": 0.22983077528532073,
"grad_norm": 0.9825183210915687,
"learning_rate": 4.571865310035304e-06,
"loss": 0.1589,
"step": 292
},
{
"epoch": 0.23061786698150336,
"grad_norm": 1.0887371255437703,
"learning_rate": 4.568213771597484e-06,
"loss": 0.1585,
"step": 293
},
{
"epoch": 0.23140495867768596,
"grad_norm": 1.0975434488519114,
"learning_rate": 4.564548198550922e-06,
"loss": 0.1435,
"step": 294
},
{
"epoch": 0.23219205037386856,
"grad_norm": 1.0593259383463134,
"learning_rate": 4.5608686157696844e-06,
"loss": 0.167,
"step": 295
},
{
"epoch": 0.23297914207005116,
"grad_norm": 1.1536948102561841,
"learning_rate": 4.557175048222901e-06,
"loss": 0.1621,
"step": 296
},
{
"epoch": 0.23376623376623376,
"grad_norm": 1.1369019291567328,
"learning_rate": 4.5534675209746076e-06,
"loss": 0.1654,
"step": 297
},
{
"epoch": 0.23455332546241636,
"grad_norm": 0.9585590140764199,
"learning_rate": 4.5497460591835615e-06,
"loss": 0.148,
"step": 298
},
{
"epoch": 0.23534041715859896,
"grad_norm": 1.2337420030262027,
"learning_rate": 4.546010688103082e-06,
"loss": 0.1599,
"step": 299
},
{
"epoch": 0.2361275088547816,
"grad_norm": 1.1641848426244756,
"learning_rate": 4.542261433080874e-06,
"loss": 0.1641,
"step": 300
},
{
"epoch": 0.2369146005509642,
"grad_norm": 0.9715264597638171,
"learning_rate": 4.538498319558854e-06,
"loss": 0.1604,
"step": 301
},
{
"epoch": 0.2377016922471468,
"grad_norm": 1.2043568904283137,
"learning_rate": 4.534721373072986e-06,
"loss": 0.1561,
"step": 302
},
{
"epoch": 0.2384887839433294,
"grad_norm": 1.087701432883666,
"learning_rate": 4.530930619253097e-06,
"loss": 0.1573,
"step": 303
},
{
"epoch": 0.239275875639512,
"grad_norm": 1.0432095830081018,
"learning_rate": 4.527126083822713e-06,
"loss": 0.1576,
"step": 304
},
{
"epoch": 0.2400629673356946,
"grad_norm": 1.1515388977241858,
"learning_rate": 4.523307792598877e-06,
"loss": 0.1836,
"step": 305
},
{
"epoch": 0.2408500590318772,
"grad_norm": 1.1236907370811289,
"learning_rate": 4.519475771491978e-06,
"loss": 0.1654,
"step": 306
},
{
"epoch": 0.24163715072805983,
"grad_norm": 1.0492490872684002,
"learning_rate": 4.515630046505575e-06,
"loss": 0.1604,
"step": 307
},
{
"epoch": 0.24242424242424243,
"grad_norm": 1.0414505694174347,
"learning_rate": 4.511770643736217e-06,
"loss": 0.1587,
"step": 308
},
{
"epoch": 0.24321133412042503,
"grad_norm": 0.9963463131455829,
"learning_rate": 4.507897589373272e-06,
"loss": 0.1536,
"step": 309
},
{
"epoch": 0.24399842581660763,
"grad_norm": 0.9437267739253786,
"learning_rate": 4.504010909698744e-06,
"loss": 0.1573,
"step": 310
},
{
"epoch": 0.24478551751279023,
"grad_norm": 0.9915304289222059,
"learning_rate": 4.500110631087095e-06,
"loss": 0.1519,
"step": 311
},
{
"epoch": 0.24557260920897284,
"grad_norm": 0.9782358310573961,
"learning_rate": 4.496196780005069e-06,
"loss": 0.1629,
"step": 312
},
{
"epoch": 0.24635970090515544,
"grad_norm": 1.0770165377269398,
"learning_rate": 4.492269383011512e-06,
"loss": 0.1623,
"step": 313
},
{
"epoch": 0.24714679260133807,
"grad_norm": 1.052396599909024,
"learning_rate": 4.4883284667571894e-06,
"loss": 0.1533,
"step": 314
},
{
"epoch": 0.24793388429752067,
"grad_norm": 1.0084809840218907,
"learning_rate": 4.4843740579846055e-06,
"loss": 0.1512,
"step": 315
},
{
"epoch": 0.24872097599370327,
"grad_norm": 1.0756395659672484,
"learning_rate": 4.480406183527823e-06,
"loss": 0.1682,
"step": 316
},
{
"epoch": 0.24950806768988587,
"grad_norm": 1.095604151904482,
"learning_rate": 4.476424870312286e-06,
"loss": 0.1588,
"step": 317
},
{
"epoch": 0.2502951593860685,
"grad_norm": 1.073871876794014,
"learning_rate": 4.472430145354622e-06,
"loss": 0.1663,
"step": 318
},
{
"epoch": 0.2510822510822511,
"grad_norm": 1.00181438336178,
"learning_rate": 4.46842203576248e-06,
"loss": 0.1668,
"step": 319
},
{
"epoch": 0.2518693427784337,
"grad_norm": 1.0179064844212398,
"learning_rate": 4.464400568734327e-06,
"loss": 0.1618,
"step": 320
},
{
"epoch": 0.2526564344746163,
"grad_norm": 1.1266566093245078,
"learning_rate": 4.460365771559275e-06,
"loss": 0.1726,
"step": 321
},
{
"epoch": 0.2534435261707989,
"grad_norm": 1.0831980755033608,
"learning_rate": 4.456317671616892e-06,
"loss": 0.1674,
"step": 322
},
{
"epoch": 0.2542306178669815,
"grad_norm": 0.9991360442603613,
"learning_rate": 4.452256296377017e-06,
"loss": 0.1534,
"step": 323
},
{
"epoch": 0.2550177095631641,
"grad_norm": 0.9497710360440503,
"learning_rate": 4.448181673399573e-06,
"loss": 0.1562,
"step": 324
},
{
"epoch": 0.25580480125934674,
"grad_norm": 1.1113260986403124,
"learning_rate": 4.444093830334381e-06,
"loss": 0.1639,
"step": 325
},
{
"epoch": 0.2565918929555293,
"grad_norm": 1.1452949830587935,
"learning_rate": 4.4399927949209685e-06,
"loss": 0.1633,
"step": 326
},
{
"epoch": 0.25737898465171194,
"grad_norm": 1.0842379105419755,
"learning_rate": 4.43587859498839e-06,
"loss": 0.1754,
"step": 327
},
{
"epoch": 0.2581660763478945,
"grad_norm": 1.0361570331888057,
"learning_rate": 4.431751258455029e-06,
"loss": 0.1629,
"step": 328
},
{
"epoch": 0.25895316804407714,
"grad_norm": 0.9514704452172565,
"learning_rate": 4.4276108133284115e-06,
"loss": 0.1615,
"step": 329
},
{
"epoch": 0.2597402597402597,
"grad_norm": 1.0051943736689641,
"learning_rate": 4.4234572877050175e-06,
"loss": 0.1635,
"step": 330
},
{
"epoch": 0.26052735143644234,
"grad_norm": 1.061826511574687,
"learning_rate": 4.419290709770091e-06,
"loss": 0.1572,
"step": 331
},
{
"epoch": 0.261314443132625,
"grad_norm": 1.0098180333606226,
"learning_rate": 4.415111107797445e-06,
"loss": 0.1625,
"step": 332
},
{
"epoch": 0.26210153482880755,
"grad_norm": 0.9258158779374888,
"learning_rate": 4.4109185101492735e-06,
"loss": 0.163,
"step": 333
},
{
"epoch": 0.2628886265249902,
"grad_norm": 1.031959410480149,
"learning_rate": 4.406712945275955e-06,
"loss": 0.1601,
"step": 334
},
{
"epoch": 0.26367571822117275,
"grad_norm": 1.098174422684468,
"learning_rate": 4.402494441715864e-06,
"loss": 0.1632,
"step": 335
},
{
"epoch": 0.2644628099173554,
"grad_norm": 0.9325275936138202,
"learning_rate": 4.398263028095175e-06,
"loss": 0.1568,
"step": 336
},
{
"epoch": 0.26524990161353795,
"grad_norm": 0.9452361980478395,
"learning_rate": 4.394018733127667e-06,
"loss": 0.1514,
"step": 337
},
{
"epoch": 0.2660369933097206,
"grad_norm": 0.9440560796701104,
"learning_rate": 4.389761585614531e-06,
"loss": 0.1568,
"step": 338
},
{
"epoch": 0.2668240850059032,
"grad_norm": 0.9825093172685871,
"learning_rate": 4.3854916144441714e-06,
"loss": 0.1513,
"step": 339
},
{
"epoch": 0.2676111767020858,
"grad_norm": 0.9909422001877334,
"learning_rate": 4.381208848592017e-06,
"loss": 0.1607,
"step": 340
},
{
"epoch": 0.2683982683982684,
"grad_norm": 1.026772957857381,
"learning_rate": 4.3769133171203146e-06,
"loss": 0.1579,
"step": 341
},
{
"epoch": 0.269185360094451,
"grad_norm": 0.9727634660522837,
"learning_rate": 4.372605049177939e-06,
"loss": 0.1611,
"step": 342
},
{
"epoch": 0.2699724517906336,
"grad_norm": 0.9991705382361779,
"learning_rate": 4.368284074000193e-06,
"loss": 0.1423,
"step": 343
},
{
"epoch": 0.2707595434868162,
"grad_norm": 1.0413166825567135,
"learning_rate": 4.363950420908608e-06,
"loss": 0.1531,
"step": 344
},
{
"epoch": 0.2715466351829988,
"grad_norm": 1.051399331371367,
"learning_rate": 4.3596041193107475e-06,
"loss": 0.1537,
"step": 345
},
{
"epoch": 0.27233372687918145,
"grad_norm": 1.1268002118202416,
"learning_rate": 4.355245198700003e-06,
"loss": 0.1687,
"step": 346
},
{
"epoch": 0.273120818575364,
"grad_norm": 1.0579162910588005,
"learning_rate": 4.3508736886554e-06,
"loss": 0.1545,
"step": 347
},
{
"epoch": 0.27390791027154665,
"grad_norm": 1.0780531804812832,
"learning_rate": 4.346489618841393e-06,
"loss": 0.1478,
"step": 348
},
{
"epoch": 0.2746950019677292,
"grad_norm": 1.1629336261073622,
"learning_rate": 4.342093019007664e-06,
"loss": 0.1507,
"step": 349
},
{
"epoch": 0.27548209366391185,
"grad_norm": 0.9806357134318359,
"learning_rate": 4.337683918988924e-06,
"loss": 0.1605,
"step": 350
},
{
"epoch": 0.2762691853600944,
"grad_norm": 1.0271547256327147,
"learning_rate": 4.333262348704708e-06,
"loss": 0.1544,
"step": 351
},
{
"epoch": 0.27705627705627706,
"grad_norm": 1.040963108089893,
"learning_rate": 4.328828338159173e-06,
"loss": 0.1505,
"step": 352
},
{
"epoch": 0.2778433687524597,
"grad_norm": 1.036202462349552,
"learning_rate": 4.324381917440891e-06,
"loss": 0.1558,
"step": 353
},
{
"epoch": 0.27863046044864226,
"grad_norm": 0.975994343559266,
"learning_rate": 4.319923116722651e-06,
"loss": 0.1641,
"step": 354
},
{
"epoch": 0.2794175521448249,
"grad_norm": 1.039409188253541,
"learning_rate": 4.315451966261248e-06,
"loss": 0.1549,
"step": 355
},
{
"epoch": 0.28020464384100746,
"grad_norm": 1.047725080130562,
"learning_rate": 4.310968496397284e-06,
"loss": 0.165,
"step": 356
},
{
"epoch": 0.2809917355371901,
"grad_norm": 1.0011313336241248,
"learning_rate": 4.306472737554957e-06,
"loss": 0.1456,
"step": 357
},
{
"epoch": 0.28177882723337266,
"grad_norm": 0.9015679935576075,
"learning_rate": 4.301964720241857e-06,
"loss": 0.1369,
"step": 358
},
{
"epoch": 0.2825659189295553,
"grad_norm": 1.049381444767021,
"learning_rate": 4.297444475048755e-06,
"loss": 0.1563,
"step": 359
},
{
"epoch": 0.2833530106257379,
"grad_norm": 1.0194195709152667,
"learning_rate": 4.292912032649403e-06,
"loss": 0.1649,
"step": 360
},
{
"epoch": 0.2841401023219205,
"grad_norm": 0.957368492301693,
"learning_rate": 4.2883674238003195e-06,
"loss": 0.1515,
"step": 361
},
{
"epoch": 0.2849271940181031,
"grad_norm": 1.1143901057936236,
"learning_rate": 4.2838106793405825e-06,
"loss": 0.1625,
"step": 362
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.0882799366794436,
"learning_rate": 4.2792418301916225e-06,
"loss": 0.153,
"step": 363
},
{
"epoch": 0.2865013774104683,
"grad_norm": 1.0192177035415801,
"learning_rate": 4.274660907357009e-06,
"loss": 0.1645,
"step": 364
},
{
"epoch": 0.2872884691066509,
"grad_norm": 1.0205240256871457,
"learning_rate": 4.2700679419222415e-06,
"loss": 0.1459,
"step": 365
},
{
"epoch": 0.28807556080283353,
"grad_norm": 1.2141057030738465,
"learning_rate": 4.265462965054539e-06,
"loss": 0.1597,
"step": 366
},
{
"epoch": 0.28886265249901616,
"grad_norm": 1.017121088131926,
"learning_rate": 4.260846008002631e-06,
"loss": 0.1619,
"step": 367
},
{
"epoch": 0.28964974419519873,
"grad_norm": 1.0877328731947116,
"learning_rate": 4.25621710209654e-06,
"loss": 0.1716,
"step": 368
},
{
"epoch": 0.29043683589138136,
"grad_norm": 1.1099554764936985,
"learning_rate": 4.251576278747372e-06,
"loss": 0.1599,
"step": 369
},
{
"epoch": 0.29122392758756394,
"grad_norm": 0.962048395782395,
"learning_rate": 4.246923569447105e-06,
"loss": 0.1465,
"step": 370
},
{
"epoch": 0.29201101928374656,
"grad_norm": 1.0782102946203345,
"learning_rate": 4.24225900576837e-06,
"loss": 0.1584,
"step": 371
},
{
"epoch": 0.29279811097992914,
"grad_norm": 1.0600722154446367,
"learning_rate": 4.237582619364244e-06,
"loss": 0.1518,
"step": 372
},
{
"epoch": 0.29358520267611177,
"grad_norm": 1.0154082912245785,
"learning_rate": 4.23289444196803e-06,
"loss": 0.1455,
"step": 373
},
{
"epoch": 0.2943722943722944,
"grad_norm": 1.1254176051245297,
"learning_rate": 4.228194505393041e-06,
"loss": 0.1544,
"step": 374
},
{
"epoch": 0.29515938606847697,
"grad_norm": 1.1003313998342341,
"learning_rate": 4.22348284153239e-06,
"loss": 0.1611,
"step": 375
},
{
"epoch": 0.2959464777646596,
"grad_norm": 0.9110264218620379,
"learning_rate": 4.218759482358765e-06,
"loss": 0.1479,
"step": 376
},
{
"epoch": 0.2967335694608422,
"grad_norm": 1.0433752096490876,
"learning_rate": 4.214024459924221e-06,
"loss": 0.1561,
"step": 377
},
{
"epoch": 0.2975206611570248,
"grad_norm": 0.9985242964251728,
"learning_rate": 4.209277806359956e-06,
"loss": 0.1486,
"step": 378
},
{
"epoch": 0.2983077528532074,
"grad_norm": 0.9830203630270159,
"learning_rate": 4.204519553876095e-06,
"loss": 0.153,
"step": 379
},
{
"epoch": 0.29909484454939,
"grad_norm": 1.0623334389041004,
"learning_rate": 4.199749734761473e-06,
"loss": 0.1584,
"step": 380
},
{
"epoch": 0.29988193624557263,
"grad_norm": 1.007050119697646,
"learning_rate": 4.194968381383414e-06,
"loss": 0.162,
"step": 381
},
{
"epoch": 0.3006690279417552,
"grad_norm": 0.9212276043601202,
"learning_rate": 4.1901755261875116e-06,
"loss": 0.1417,
"step": 382
},
{
"epoch": 0.30145611963793784,
"grad_norm": 1.0195210503773229,
"learning_rate": 4.18537120169741e-06,
"loss": 0.1631,
"step": 383
},
{
"epoch": 0.3022432113341204,
"grad_norm": 0.9791393783618954,
"learning_rate": 4.1805554405145805e-06,
"loss": 0.151,
"step": 384
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.9560471554995319,
"learning_rate": 4.175728275318105e-06,
"loss": 0.1537,
"step": 385
},
{
"epoch": 0.3038173947264856,
"grad_norm": 0.9732207377472094,
"learning_rate": 4.170889738864448e-06,
"loss": 0.1541,
"step": 386
},
{
"epoch": 0.30460448642266824,
"grad_norm": 1.0273971232086052,
"learning_rate": 4.166039863987241e-06,
"loss": 0.1623,
"step": 387
},
{
"epoch": 0.30539157811885087,
"grad_norm": 1.0066781633766182,
"learning_rate": 4.161178683597055e-06,
"loss": 0.1623,
"step": 388
},
{
"epoch": 0.30617866981503344,
"grad_norm": 0.9519906303887405,
"learning_rate": 4.156306230681178e-06,
"loss": 0.1606,
"step": 389
},
{
"epoch": 0.3069657615112161,
"grad_norm": 1.0274010773396909,
"learning_rate": 4.151422538303393e-06,
"loss": 0.1588,
"step": 390
},
{
"epoch": 0.30775285320739865,
"grad_norm": 1.0092975668609663,
"learning_rate": 4.1465276396037516e-06,
"loss": 0.1549,
"step": 391
},
{
"epoch": 0.3085399449035813,
"grad_norm": 0.9952464194936945,
"learning_rate": 4.141621567798351e-06,
"loss": 0.1468,
"step": 392
},
{
"epoch": 0.30932703659976385,
"grad_norm": 0.9809046515355889,
"learning_rate": 4.136704356179105e-06,
"loss": 0.1509,
"step": 393
},
{
"epoch": 0.3101141282959465,
"grad_norm": 1.0624718823483572,
"learning_rate": 4.131776038113524e-06,
"loss": 0.1629,
"step": 394
},
{
"epoch": 0.3109012199921291,
"grad_norm": 0.9361042861540975,
"learning_rate": 4.126836647044484e-06,
"loss": 0.1453,
"step": 395
},
{
"epoch": 0.3116883116883117,
"grad_norm": 1.0675702598561039,
"learning_rate": 4.121886216489999e-06,
"loss": 0.1657,
"step": 396
},
{
"epoch": 0.3124754033844943,
"grad_norm": 1.0221190108601212,
"learning_rate": 4.116924780042997e-06,
"loss": 0.1609,
"step": 397
},
{
"epoch": 0.3132624950806769,
"grad_norm": 0.98812521742716,
"learning_rate": 4.111952371371091e-06,
"loss": 0.1488,
"step": 398
},
{
"epoch": 0.3140495867768595,
"grad_norm": 0.9689235987787954,
"learning_rate": 4.106969024216348e-06,
"loss": 0.1547,
"step": 399
},
{
"epoch": 0.3148366784730421,
"grad_norm": 1.0046393279094348,
"learning_rate": 4.101974772395066e-06,
"loss": 0.1467,
"step": 400
},
{
"epoch": 0.3156237701692247,
"grad_norm": 0.968527185963086,
"learning_rate": 4.096969649797534e-06,
"loss": 0.1432,
"step": 401
},
{
"epoch": 0.31641086186540734,
"grad_norm": 1.0188815460176754,
"learning_rate": 4.091953690387815e-06,
"loss": 0.1521,
"step": 402
},
{
"epoch": 0.3171979535615899,
"grad_norm": 1.035965071382904,
"learning_rate": 4.086926928203506e-06,
"loss": 0.1575,
"step": 403
},
{
"epoch": 0.31798504525777255,
"grad_norm": 1.0400881738148544,
"learning_rate": 4.081889397355509e-06,
"loss": 0.1646,
"step": 404
},
{
"epoch": 0.3187721369539551,
"grad_norm": 1.0353365656909388,
"learning_rate": 4.076841132027805e-06,
"loss": 0.1578,
"step": 405
},
{
"epoch": 0.31955922865013775,
"grad_norm": 0.9785090873779988,
"learning_rate": 4.071782166477213e-06,
"loss": 0.1485,
"step": 406
},
{
"epoch": 0.3203463203463203,
"grad_norm": 1.0365440161437718,
"learning_rate": 4.066712535033164e-06,
"loss": 0.1644,
"step": 407
},
{
"epoch": 0.32113341204250295,
"grad_norm": 0.9337858697268638,
"learning_rate": 4.061632272097467e-06,
"loss": 0.1396,
"step": 408
},
{
"epoch": 0.3219205037386856,
"grad_norm": 0.9930564105014524,
"learning_rate": 4.056541412144073e-06,
"loss": 0.1466,
"step": 409
},
{
"epoch": 0.32270759543486816,
"grad_norm": 1.0123860857315623,
"learning_rate": 4.051439989718845e-06,
"loss": 0.1718,
"step": 410
},
{
"epoch": 0.3234946871310508,
"grad_norm": 0.9886983463565112,
"learning_rate": 4.0463280394393216e-06,
"loss": 0.1465,
"step": 411
},
{
"epoch": 0.32428177882723336,
"grad_norm": 0.9489896550219313,
"learning_rate": 4.041205595994478e-06,
"loss": 0.1553,
"step": 412
},
{
"epoch": 0.325068870523416,
"grad_norm": 0.935055903913981,
"learning_rate": 4.036072694144501e-06,
"loss": 0.1486,
"step": 413
},
{
"epoch": 0.32585596221959856,
"grad_norm": 1.0109287737515016,
"learning_rate": 4.030929368720539e-06,
"loss": 0.1563,
"step": 414
},
{
"epoch": 0.3266430539157812,
"grad_norm": 0.9682667210224672,
"learning_rate": 4.025775654624481e-06,
"loss": 0.154,
"step": 415
},
{
"epoch": 0.3274301456119638,
"grad_norm": 0.9195115794003238,
"learning_rate": 4.020611586828705e-06,
"loss": 0.1433,
"step": 416
},
{
"epoch": 0.3282172373081464,
"grad_norm": 0.886911970381121,
"learning_rate": 4.015437200375855e-06,
"loss": 0.1374,
"step": 417
},
{
"epoch": 0.329004329004329,
"grad_norm": 1.021240159520919,
"learning_rate": 4.01025253037859e-06,
"loss": 0.1567,
"step": 418
},
{
"epoch": 0.3297914207005116,
"grad_norm": 0.9462875663398478,
"learning_rate": 4.005057612019353e-06,
"loss": 0.1516,
"step": 419
},
{
"epoch": 0.3305785123966942,
"grad_norm": 0.9850150964188347,
"learning_rate": 3.9998524805501335e-06,
"loss": 0.149,
"step": 420
},
{
"epoch": 0.3313656040928768,
"grad_norm": 1.0466582919958791,
"learning_rate": 3.994637171292223e-06,
"loss": 0.1504,
"step": 421
},
{
"epoch": 0.3321526957890594,
"grad_norm": 0.9892923577104711,
"learning_rate": 3.989411719635979e-06,
"loss": 0.1465,
"step": 422
},
{
"epoch": 0.33293978748524206,
"grad_norm": 1.0797299646481497,
"learning_rate": 3.984176161040585e-06,
"loss": 0.1655,
"step": 423
},
{
"epoch": 0.33372687918142463,
"grad_norm": 1.0280855278386611,
"learning_rate": 3.978930531033807e-06,
"loss": 0.1614,
"step": 424
},
{
"epoch": 0.33451397087760726,
"grad_norm": 1.0013220452351206,
"learning_rate": 3.973674865211754e-06,
"loss": 0.1529,
"step": 425
},
{
"epoch": 0.33530106257378983,
"grad_norm": 1.0185754306460224,
"learning_rate": 3.968409199238639e-06,
"loss": 0.1535,
"step": 426
},
{
"epoch": 0.33608815426997246,
"grad_norm": 0.9680713968649773,
"learning_rate": 3.963133568846533e-06,
"loss": 0.1532,
"step": 427
},
{
"epoch": 0.33687524596615503,
"grad_norm": 1.0465737054070945,
"learning_rate": 3.957848009835125e-06,
"loss": 0.1557,
"step": 428
},
{
"epoch": 0.33766233766233766,
"grad_norm": 1.0072097384887637,
"learning_rate": 3.952552558071475e-06,
"loss": 0.1686,
"step": 429
},
{
"epoch": 0.3384494293585203,
"grad_norm": 1.0495298691679416,
"learning_rate": 3.947247249489779e-06,
"loss": 0.1487,
"step": 430
},
{
"epoch": 0.33923652105470287,
"grad_norm": 1.0214586461562896,
"learning_rate": 3.941932120091116e-06,
"loss": 0.1621,
"step": 431
},
{
"epoch": 0.3400236127508855,
"grad_norm": 1.0494096714602847,
"learning_rate": 3.93660720594321e-06,
"loss": 0.1598,
"step": 432
},
{
"epoch": 0.34081070444706807,
"grad_norm": 1.0334818385570048,
"learning_rate": 3.93127254318018e-06,
"loss": 0.1577,
"step": 433
},
{
"epoch": 0.3415977961432507,
"grad_norm": 0.9700994625756835,
"learning_rate": 3.925928168002302e-06,
"loss": 0.1526,
"step": 434
},
{
"epoch": 0.34238488783943327,
"grad_norm": 1.047736033995709,
"learning_rate": 3.920574116675756e-06,
"loss": 0.1581,
"step": 435
},
{
"epoch": 0.3431719795356159,
"grad_norm": 1.0493869403649712,
"learning_rate": 3.915210425532383e-06,
"loss": 0.1495,
"step": 436
},
{
"epoch": 0.34395907123179853,
"grad_norm": 1.010254528268069,
"learning_rate": 3.90983713096944e-06,
"loss": 0.1539,
"step": 437
},
{
"epoch": 0.3447461629279811,
"grad_norm": 0.9846398029609658,
"learning_rate": 3.9044542694493515e-06,
"loss": 0.1463,
"step": 438
},
{
"epoch": 0.34553325462416373,
"grad_norm": 1.2083136674514858,
"learning_rate": 3.899061877499461e-06,
"loss": 0.1601,
"step": 439
},
{
"epoch": 0.3463203463203463,
"grad_norm": 0.97978554217786,
"learning_rate": 3.893659991711782e-06,
"loss": 0.139,
"step": 440
},
{
"epoch": 0.34710743801652894,
"grad_norm": 1.1022405018344112,
"learning_rate": 3.888248648742756e-06,
"loss": 0.1617,
"step": 441
},
{
"epoch": 0.3478945297127115,
"grad_norm": 1.0077367730076683,
"learning_rate": 3.882827885312999e-06,
"loss": 0.1488,
"step": 442
},
{
"epoch": 0.34868162140889414,
"grad_norm": 1.0119669193080498,
"learning_rate": 3.877397738207051e-06,
"loss": 0.1433,
"step": 443
},
{
"epoch": 0.34946871310507677,
"grad_norm": 0.9336119872704435,
"learning_rate": 3.8719582442731276e-06,
"loss": 0.1393,
"step": 444
},
{
"epoch": 0.35025580480125934,
"grad_norm": 1.0144372790745282,
"learning_rate": 3.866509440422873e-06,
"loss": 0.1515,
"step": 445
},
{
"epoch": 0.35104289649744197,
"grad_norm": 1.0618851735205919,
"learning_rate": 3.861051363631107e-06,
"loss": 0.1403,
"step": 446
},
{
"epoch": 0.35182998819362454,
"grad_norm": 1.0256940692518137,
"learning_rate": 3.855584050935574e-06,
"loss": 0.1533,
"step": 447
},
{
"epoch": 0.3526170798898072,
"grad_norm": 1.004262449427633,
"learning_rate": 3.85010753943669e-06,
"loss": 0.1437,
"step": 448
},
{
"epoch": 0.35340417158598975,
"grad_norm": 0.9608822952661715,
"learning_rate": 3.844621866297295e-06,
"loss": 0.1374,
"step": 449
},
{
"epoch": 0.3541912632821724,
"grad_norm": 1.032805552257636,
"learning_rate": 3.839127068742399e-06,
"loss": 0.1612,
"step": 450
},
{
"epoch": 0.354978354978355,
"grad_norm": 1.089158815357864,
"learning_rate": 3.833623184058926e-06,
"loss": 0.1564,
"step": 451
},
{
"epoch": 0.3557654466745376,
"grad_norm": 1.0527347217082683,
"learning_rate": 3.8281102495954684e-06,
"loss": 0.1475,
"step": 452
},
{
"epoch": 0.3565525383707202,
"grad_norm": 1.012969201356965,
"learning_rate": 3.8225883027620245e-06,
"loss": 0.1443,
"step": 453
},
{
"epoch": 0.3573396300669028,
"grad_norm": 0.9952397622221426,
"learning_rate": 3.817057381029752e-06,
"loss": 0.1488,
"step": 454
},
{
"epoch": 0.3581267217630854,
"grad_norm": 0.9773911500192811,
"learning_rate": 3.811517521930711e-06,
"loss": 0.1419,
"step": 455
},
{
"epoch": 0.358913813459268,
"grad_norm": 1.04344141144674,
"learning_rate": 3.805968763057609e-06,
"loss": 0.1335,
"step": 456
},
{
"epoch": 0.3597009051554506,
"grad_norm": 0.9127224357677829,
"learning_rate": 3.8004111420635453e-06,
"loss": 0.1421,
"step": 457
},
{
"epoch": 0.36048799685163324,
"grad_norm": 0.948335811441799,
"learning_rate": 3.7948446966617568e-06,
"loss": 0.1545,
"step": 458
},
{
"epoch": 0.3612750885478158,
"grad_norm": 1.054156015531643,
"learning_rate": 3.7892694646253624e-06,
"loss": 0.1462,
"step": 459
},
{
"epoch": 0.36206218024399844,
"grad_norm": 1.0883694334017704,
"learning_rate": 3.783685483787105e-06,
"loss": 0.1469,
"step": 460
},
{
"epoch": 0.362849271940181,
"grad_norm": 1.0265972829923478,
"learning_rate": 3.7780927920390965e-06,
"loss": 0.1572,
"step": 461
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.9351090223515385,
"learning_rate": 3.772491427332557e-06,
"loss": 0.1317,
"step": 462
},
{
"epoch": 0.3644234553325462,
"grad_norm": 0.96672130032329,
"learning_rate": 3.766881427677563e-06,
"loss": 0.1474,
"step": 463
},
{
"epoch": 0.36521054702872885,
"grad_norm": 0.9284227954997755,
"learning_rate": 3.761262831142788e-06,
"loss": 0.144,
"step": 464
},
{
"epoch": 0.3659976387249115,
"grad_norm": 1.02329013434613,
"learning_rate": 3.755635675855238e-06,
"loss": 0.1459,
"step": 465
},
{
"epoch": 0.36678473042109405,
"grad_norm": 0.9548918394606087,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.1431,
"step": 466
},
{
"epoch": 0.3675718221172767,
"grad_norm": 1.0029018534160843,
"learning_rate": 3.744355841819983e-06,
"loss": 0.1551,
"step": 467
},
{
"epoch": 0.36835891381345925,
"grad_norm": 1.0170466682076178,
"learning_rate": 3.7387032396156497e-06,
"loss": 0.1574,
"step": 468
},
{
"epoch": 0.3691460055096419,
"grad_norm": 0.950504547373793,
"learning_rate": 3.7330422317447686e-06,
"loss": 0.1413,
"step": 469
},
{
"epoch": 0.36993309720582446,
"grad_norm": 1.00490564394254,
"learning_rate": 3.7273728566221447e-06,
"loss": 0.1539,
"step": 470
},
{
"epoch": 0.3707201889020071,
"grad_norm": 1.0241073155182219,
"learning_rate": 3.721695152719364e-06,
"loss": 0.1505,
"step": 471
},
{
"epoch": 0.3715072805981897,
"grad_norm": 1.0650129974030413,
"learning_rate": 3.716009158564528e-06,
"loss": 0.1517,
"step": 472
},
{
"epoch": 0.3722943722943723,
"grad_norm": 1.0412500508709561,
"learning_rate": 3.710314912741997e-06,
"loss": 0.1447,
"step": 473
},
{
"epoch": 0.3730814639905549,
"grad_norm": 1.0273490151395026,
"learning_rate": 3.7046124538921237e-06,
"loss": 0.1429,
"step": 474
},
{
"epoch": 0.3738685556867375,
"grad_norm": 0.9952543111661871,
"learning_rate": 3.698901820710995e-06,
"loss": 0.1418,
"step": 475
},
{
"epoch": 0.3746556473829201,
"grad_norm": 1.0824700054534682,
"learning_rate": 3.693183051950168e-06,
"loss": 0.1437,
"step": 476
},
{
"epoch": 0.3754427390791027,
"grad_norm": 1.0142752196453109,
"learning_rate": 3.6874561864164056e-06,
"loss": 0.1435,
"step": 477
},
{
"epoch": 0.3762298307752853,
"grad_norm": 0.9888106276082754,
"learning_rate": 3.6817212629714135e-06,
"loss": 0.1395,
"step": 478
},
{
"epoch": 0.37701692247146795,
"grad_norm": 0.9673698851206235,
"learning_rate": 3.675978320531579e-06,
"loss": 0.1425,
"step": 479
},
{
"epoch": 0.3778040141676505,
"grad_norm": 1.096283920214562,
"learning_rate": 3.670227398067705e-06,
"loss": 0.1515,
"step": 480
},
{
"epoch": 0.37859110586383315,
"grad_norm": 1.0303413811027284,
"learning_rate": 3.664468534604745e-06,
"loss": 0.1462,
"step": 481
},
{
"epoch": 0.37937819756001573,
"grad_norm": 0.9550517801003708,
"learning_rate": 3.6587017692215387e-06,
"loss": 0.1483,
"step": 482
},
{
"epoch": 0.38016528925619836,
"grad_norm": 1.0499765951347195,
"learning_rate": 3.6529271410505483e-06,
"loss": 0.1516,
"step": 483
},
{
"epoch": 0.38095238095238093,
"grad_norm": 1.0612285971687154,
"learning_rate": 3.6471446892775896e-06,
"loss": 0.145,
"step": 484
},
{
"epoch": 0.38173947264856356,
"grad_norm": 0.9976574649139153,
"learning_rate": 3.6413544531415712e-06,
"loss": 0.1493,
"step": 485
},
{
"epoch": 0.3825265643447462,
"grad_norm": 1.011974051278155,
"learning_rate": 3.635556471934224e-06,
"loss": 0.1557,
"step": 486
},
{
"epoch": 0.38331365604092876,
"grad_norm": 1.015959048224715,
"learning_rate": 3.629750784999835e-06,
"loss": 0.152,
"step": 487
},
{
"epoch": 0.3841007477371114,
"grad_norm": 0.9638439392236781,
"learning_rate": 3.623937431734982e-06,
"loss": 0.1464,
"step": 488
},
{
"epoch": 0.38488783943329397,
"grad_norm": 0.9820530085625633,
"learning_rate": 3.6181164515882663e-06,
"loss": 0.1468,
"step": 489
},
{
"epoch": 0.3856749311294766,
"grad_norm": 0.9281524539517508,
"learning_rate": 3.6122878840600417e-06,
"loss": 0.1451,
"step": 490
},
{
"epoch": 0.38646202282565917,
"grad_norm": 1.039305922376239,
"learning_rate": 3.606451768702151e-06,
"loss": 0.1486,
"step": 491
},
{
"epoch": 0.3872491145218418,
"grad_norm": 1.026987888426606,
"learning_rate": 3.600608145117656e-06,
"loss": 0.1381,
"step": 492
},
{
"epoch": 0.3880362062180244,
"grad_norm": 1.058827889093346,
"learning_rate": 3.594757052960566e-06,
"loss": 0.1555,
"step": 493
},
{
"epoch": 0.388823297914207,
"grad_norm": 1.0027016575115129,
"learning_rate": 3.588898531935573e-06,
"loss": 0.1413,
"step": 494
},
{
"epoch": 0.38961038961038963,
"grad_norm": 1.0766471714794614,
"learning_rate": 3.583032621797778e-06,
"loss": 0.1418,
"step": 495
},
{
"epoch": 0.3903974813065722,
"grad_norm": 1.0326313481110534,
"learning_rate": 3.5771593623524263e-06,
"loss": 0.1345,
"step": 496
},
{
"epoch": 0.39118457300275483,
"grad_norm": 0.9649958546178075,
"learning_rate": 3.5712787934546336e-06,
"loss": 0.1397,
"step": 497
},
{
"epoch": 0.3919716646989374,
"grad_norm": 1.0461258832079,
"learning_rate": 3.5653909550091138e-06,
"loss": 0.16,
"step": 498
},
{
"epoch": 0.39275875639512003,
"grad_norm": 0.9741702004779168,
"learning_rate": 3.559495886969916e-06,
"loss": 0.1366,
"step": 499
},
{
"epoch": 0.39354584809130266,
"grad_norm": 0.9875184829637668,
"learning_rate": 3.553593629340144e-06,
"loss": 0.1391,
"step": 500
},
{
"epoch": 0.39354584809130266,
"eval_loss": 0.14773064851760864,
"eval_runtime": 18.0322,
"eval_samples_per_second": 45.585,
"eval_steps_per_second": 5.712,
"step": 500
},
{
"epoch": 0.39433293978748524,
"grad_norm": 0.9563741831859393,
"learning_rate": 3.5476842221716915e-06,
"loss": 0.1453,
"step": 501
},
{
"epoch": 0.39512003148366787,
"grad_norm": 0.9839000041167648,
"learning_rate": 3.541767705564967e-06,
"loss": 0.1509,
"step": 502
},
{
"epoch": 0.39590712317985044,
"grad_norm": 0.9666175985112762,
"learning_rate": 3.535844119668622e-06,
"loss": 0.1436,
"step": 503
},
{
"epoch": 0.39669421487603307,
"grad_norm": 1.0513295542177603,
"learning_rate": 3.5299135046792816e-06,
"loss": 0.1371,
"step": 504
},
{
"epoch": 0.39748130657221564,
"grad_norm": 1.0136623338528887,
"learning_rate": 3.5239759008412666e-06,
"loss": 0.1498,
"step": 505
},
{
"epoch": 0.39826839826839827,
"grad_norm": 0.9764920494655156,
"learning_rate": 3.518031348446324e-06,
"loss": 0.1371,
"step": 506
},
{
"epoch": 0.3990554899645809,
"grad_norm": 1.0113031849627157,
"learning_rate": 3.5120798878333544e-06,
"loss": 0.1453,
"step": 507
},
{
"epoch": 0.3998425816607635,
"grad_norm": 0.9947509560502654,
"learning_rate": 3.506121559388135e-06,
"loss": 0.1233,
"step": 508
},
{
"epoch": 0.4006296733569461,
"grad_norm": 1.1135464243984814,
"learning_rate": 3.500156403543046e-06,
"loss": 0.151,
"step": 509
},
{
"epoch": 0.4014167650531287,
"grad_norm": 1.0687025563863246,
"learning_rate": 3.4941844607768007e-06,
"loss": 0.1384,
"step": 510
},
{
"epoch": 0.4022038567493113,
"grad_norm": 0.9654525860741724,
"learning_rate": 3.488205771614164e-06,
"loss": 0.1348,
"step": 511
},
{
"epoch": 0.4029909484454939,
"grad_norm": 1.07357744190682,
"learning_rate": 3.4822203766256834e-06,
"loss": 0.1412,
"step": 512
},
{
"epoch": 0.4037780401416765,
"grad_norm": 1.2491546536330014,
"learning_rate": 3.4762283164274104e-06,
"loss": 0.1523,
"step": 513
},
{
"epoch": 0.40456513183785914,
"grad_norm": 1.0398955239354635,
"learning_rate": 3.4702296316806243e-06,
"loss": 0.1507,
"step": 514
},
{
"epoch": 0.4053522235340417,
"grad_norm": 0.947562520308943,
"learning_rate": 3.4642243630915606e-06,
"loss": 0.1486,
"step": 515
},
{
"epoch": 0.40613931523022434,
"grad_norm": 0.9405204018759319,
"learning_rate": 3.45821255141113e-06,
"loss": 0.1287,
"step": 516
},
{
"epoch": 0.4069264069264069,
"grad_norm": 1.01025400774114,
"learning_rate": 3.452194237434642e-06,
"loss": 0.1349,
"step": 517
},
{
"epoch": 0.40771349862258954,
"grad_norm": 1.0404932578099988,
"learning_rate": 3.446169462001534e-06,
"loss": 0.1508,
"step": 518
},
{
"epoch": 0.4085005903187721,
"grad_norm": 1.029425420995215,
"learning_rate": 3.4401382659950868e-06,
"loss": 0.1362,
"step": 519
},
{
"epoch": 0.40928768201495475,
"grad_norm": 1.025768159905711,
"learning_rate": 3.4341006903421493e-06,
"loss": 0.1437,
"step": 520
},
{
"epoch": 0.4100747737111374,
"grad_norm": 0.9507044448226175,
"learning_rate": 3.4280567760128658e-06,
"loss": 0.1393,
"step": 521
},
{
"epoch": 0.41086186540731995,
"grad_norm": 1.0374082813027519,
"learning_rate": 3.4220065640203916e-06,
"loss": 0.16,
"step": 522
},
{
"epoch": 0.4116489571035026,
"grad_norm": 0.9378353888939086,
"learning_rate": 3.415950095420616e-06,
"loss": 0.1355,
"step": 523
},
{
"epoch": 0.41243604879968515,
"grad_norm": 0.924561930587711,
"learning_rate": 3.4098874113118863e-06,
"loss": 0.1452,
"step": 524
},
{
"epoch": 0.4132231404958678,
"grad_norm": 0.9505049489489825,
"learning_rate": 3.403818552834727e-06,
"loss": 0.1448,
"step": 525
},
{
"epoch": 0.41401023219205035,
"grad_norm": 0.9701870488491394,
"learning_rate": 3.397743561171562e-06,
"loss": 0.1341,
"step": 526
},
{
"epoch": 0.414797323888233,
"grad_norm": 0.9122288876708122,
"learning_rate": 3.3916624775464318e-06,
"loss": 0.1291,
"step": 527
},
{
"epoch": 0.4155844155844156,
"grad_norm": 0.985150804267496,
"learning_rate": 3.385575343224718e-06,
"loss": 0.141,
"step": 528
},
{
"epoch": 0.4163715072805982,
"grad_norm": 0.9910844190276262,
"learning_rate": 3.3794821995128606e-06,
"loss": 0.1473,
"step": 529
},
{
"epoch": 0.4171585989767808,
"grad_norm": 0.9925292173111532,
"learning_rate": 3.3733830877580796e-06,
"loss": 0.1492,
"step": 530
},
{
"epoch": 0.4179456906729634,
"grad_norm": 0.9483537804421872,
"learning_rate": 3.3672780493480927e-06,
"loss": 0.1476,
"step": 531
},
{
"epoch": 0.418732782369146,
"grad_norm": 0.9716970355806354,
"learning_rate": 3.3611671257108323e-06,
"loss": 0.1288,
"step": 532
},
{
"epoch": 0.4195198740653286,
"grad_norm": 1.0525983321400059,
"learning_rate": 3.3550503583141726e-06,
"loss": 0.1541,
"step": 533
},
{
"epoch": 0.4203069657615112,
"grad_norm": 0.9476841095185634,
"learning_rate": 3.3489277886656373e-06,
"loss": 0.1395,
"step": 534
},
{
"epoch": 0.42109405745769385,
"grad_norm": 0.8883884320293254,
"learning_rate": 3.342799458312127e-06,
"loss": 0.1374,
"step": 535
},
{
"epoch": 0.4218811491538764,
"grad_norm": 1.026818858865084,
"learning_rate": 3.336665408839633e-06,
"loss": 0.1413,
"step": 536
},
{
"epoch": 0.42266824085005905,
"grad_norm": 0.9146805645048051,
"learning_rate": 3.330525681872954e-06,
"loss": 0.1352,
"step": 537
},
{
"epoch": 0.4234553325462416,
"grad_norm": 1.0439955820386841,
"learning_rate": 3.3243803190754166e-06,
"loss": 0.1482,
"step": 538
},
{
"epoch": 0.42424242424242425,
"grad_norm": 0.9964413472110166,
"learning_rate": 3.3182293621485923e-06,
"loss": 0.1524,
"step": 539
},
{
"epoch": 0.42502951593860683,
"grad_norm": 0.9626977177442709,
"learning_rate": 3.312072852832012e-06,
"loss": 0.1427,
"step": 540
},
{
"epoch": 0.42581660763478946,
"grad_norm": 0.8939878261316884,
"learning_rate": 3.3059108329028845e-06,
"loss": 0.1283,
"step": 541
},
{
"epoch": 0.4266036993309721,
"grad_norm": 1.0155176108909485,
"learning_rate": 3.299743344175814e-06,
"loss": 0.1434,
"step": 542
},
{
"epoch": 0.42739079102715466,
"grad_norm": 0.983969699589635,
"learning_rate": 3.293570428502515e-06,
"loss": 0.1479,
"step": 543
},
{
"epoch": 0.4281778827233373,
"grad_norm": 1.006021089515589,
"learning_rate": 3.287392127771526e-06,
"loss": 0.1386,
"step": 544
},
{
"epoch": 0.42896497441951986,
"grad_norm": 0.9594215523929834,
"learning_rate": 3.2812084839079316e-06,
"loss": 0.1326,
"step": 545
},
{
"epoch": 0.4297520661157025,
"grad_norm": 0.958170693889915,
"learning_rate": 3.275019538873071e-06,
"loss": 0.1418,
"step": 546
},
{
"epoch": 0.43053915781188506,
"grad_norm": 1.0256772435691563,
"learning_rate": 3.268825334664259e-06,
"loss": 0.1526,
"step": 547
},
{
"epoch": 0.4313262495080677,
"grad_norm": 1.0552921930006323,
"learning_rate": 3.2626259133144955e-06,
"loss": 0.1441,
"step": 548
},
{
"epoch": 0.43211334120425027,
"grad_norm": 1.0605556822333475,
"learning_rate": 3.2564213168921867e-06,
"loss": 0.1431,
"step": 549
},
{
"epoch": 0.4329004329004329,
"grad_norm": 0.982864648139079,
"learning_rate": 3.2502115875008523e-06,
"loss": 0.149,
"step": 550
},
{
"epoch": 0.4336875245966155,
"grad_norm": 1.0049025924744737,
"learning_rate": 3.2439967672788462e-06,
"loss": 0.1334,
"step": 551
},
{
"epoch": 0.4344746162927981,
"grad_norm": 0.9263761254409442,
"learning_rate": 3.2377768983990677e-06,
"loss": 0.1401,
"step": 552
},
{
"epoch": 0.43526170798898073,
"grad_norm": 1.0390883988019344,
"learning_rate": 3.2315520230686747e-06,
"loss": 0.1493,
"step": 553
},
{
"epoch": 0.4360487996851633,
"grad_norm": 0.9913895430005143,
"learning_rate": 3.2253221835287984e-06,
"loss": 0.1406,
"step": 554
},
{
"epoch": 0.43683589138134593,
"grad_norm": 0.9801664753977715,
"learning_rate": 3.2190874220542577e-06,
"loss": 0.1341,
"step": 555
},
{
"epoch": 0.4376229830775285,
"grad_norm": 0.9519041413125566,
"learning_rate": 3.2128477809532687e-06,
"loss": 0.1469,
"step": 556
},
{
"epoch": 0.43841007477371113,
"grad_norm": 1.0289764585305627,
"learning_rate": 3.2066033025671612e-06,
"loss": 0.1473,
"step": 557
},
{
"epoch": 0.43919716646989376,
"grad_norm": 1.0005689404595521,
"learning_rate": 3.200354029270091e-06,
"loss": 0.1477,
"step": 558
},
{
"epoch": 0.43998425816607634,
"grad_norm": 1.056463362355424,
"learning_rate": 3.1941000034687516e-06,
"loss": 0.1488,
"step": 559
},
{
"epoch": 0.44077134986225897,
"grad_norm": 0.9574144607007496,
"learning_rate": 3.187841267602084e-06,
"loss": 0.1445,
"step": 560
},
{
"epoch": 0.44155844155844154,
"grad_norm": 0.9562953543913302,
"learning_rate": 3.1815778641409924e-06,
"loss": 0.1414,
"step": 561
},
{
"epoch": 0.44234553325462417,
"grad_norm": 0.9444651486667015,
"learning_rate": 3.1753098355880557e-06,
"loss": 0.138,
"step": 562
},
{
"epoch": 0.44313262495080674,
"grad_norm": 0.9465351053953429,
"learning_rate": 3.169037224477236e-06,
"loss": 0.1437,
"step": 563
},
{
"epoch": 0.44391971664698937,
"grad_norm": 1.0206836940486426,
"learning_rate": 3.162760073373594e-06,
"loss": 0.1411,
"step": 564
},
{
"epoch": 0.444706808343172,
"grad_norm": 1.0878905236564318,
"learning_rate": 3.1564784248729965e-06,
"loss": 0.1408,
"step": 565
},
{
"epoch": 0.4454939000393546,
"grad_norm": 1.0130102955883906,
"learning_rate": 3.15019232160183e-06,
"loss": 0.1428,
"step": 566
},
{
"epoch": 0.4462809917355372,
"grad_norm": 0.980086016231054,
"learning_rate": 3.1439018062167092e-06,
"loss": 0.143,
"step": 567
},
{
"epoch": 0.4470680834317198,
"grad_norm": 1.0249915137559014,
"learning_rate": 3.1376069214041917e-06,
"loss": 0.1471,
"step": 568
},
{
"epoch": 0.4478551751279024,
"grad_norm": 1.1016327132095007,
"learning_rate": 3.1313077098804817e-06,
"loss": 0.1606,
"step": 569
},
{
"epoch": 0.448642266824085,
"grad_norm": 1.0411771801722989,
"learning_rate": 3.1250042143911462e-06,
"loss": 0.1499,
"step": 570
},
{
"epoch": 0.4494293585202676,
"grad_norm": 1.0122030093902548,
"learning_rate": 3.118696477710822e-06,
"loss": 0.141,
"step": 571
},
{
"epoch": 0.45021645021645024,
"grad_norm": 1.0708872672849516,
"learning_rate": 3.1123845426429265e-06,
"loss": 0.128,
"step": 572
},
{
"epoch": 0.4510035419126328,
"grad_norm": 1.029737403462412,
"learning_rate": 3.106068452019365e-06,
"loss": 0.1383,
"step": 573
},
{
"epoch": 0.45179063360881544,
"grad_norm": 0.9988296671107193,
"learning_rate": 3.099748248700245e-06,
"loss": 0.1376,
"step": 574
},
{
"epoch": 0.452577725304998,
"grad_norm": 1.0475513726672416,
"learning_rate": 3.0934239755735782e-06,
"loss": 0.1355,
"step": 575
},
{
"epoch": 0.45336481700118064,
"grad_norm": 1.0654745191838768,
"learning_rate": 3.0870956755549973e-06,
"loss": 0.143,
"step": 576
},
{
"epoch": 0.4541519086973632,
"grad_norm": 0.9397526290083124,
"learning_rate": 3.0807633915874585e-06,
"loss": 0.1406,
"step": 577
},
{
"epoch": 0.45493900039354584,
"grad_norm": 1.052837564760308,
"learning_rate": 3.0744271666409526e-06,
"loss": 0.1454,
"step": 578
},
{
"epoch": 0.4557260920897285,
"grad_norm": 1.1289865006459998,
"learning_rate": 3.0680870437122145e-06,
"loss": 0.1554,
"step": 579
},
{
"epoch": 0.45651318378591105,
"grad_norm": 0.9614320131595296,
"learning_rate": 3.0617430658244295e-06,
"loss": 0.1368,
"step": 580
},
{
"epoch": 0.4573002754820937,
"grad_norm": 0.9849943444472453,
"learning_rate": 3.0553952760269427e-06,
"loss": 0.1372,
"step": 581
},
{
"epoch": 0.45808736717827625,
"grad_norm": 0.9938446057985301,
"learning_rate": 3.0490437173949656e-06,
"loss": 0.1397,
"step": 582
},
{
"epoch": 0.4588744588744589,
"grad_norm": 0.9430590894578916,
"learning_rate": 3.0426884330292844e-06,
"loss": 0.1404,
"step": 583
},
{
"epoch": 0.45966155057064145,
"grad_norm": 0.8998337127762756,
"learning_rate": 3.0363294660559685e-06,
"loss": 0.133,
"step": 584
},
{
"epoch": 0.4604486422668241,
"grad_norm": 0.9469777276964015,
"learning_rate": 3.0299668596260755e-06,
"loss": 0.1429,
"step": 585
},
{
"epoch": 0.4612357339630067,
"grad_norm": 0.9961208676961326,
"learning_rate": 3.023600656915362e-06,
"loss": 0.1381,
"step": 586
},
{
"epoch": 0.4620228256591893,
"grad_norm": 0.9726679702119774,
"learning_rate": 3.017230901123985e-06,
"loss": 0.1391,
"step": 587
},
{
"epoch": 0.4628099173553719,
"grad_norm": 1.016233754336966,
"learning_rate": 3.0108576354762176e-06,
"loss": 0.1464,
"step": 588
},
{
"epoch": 0.4635970090515545,
"grad_norm": 0.891890572894692,
"learning_rate": 3.0044809032201448e-06,
"loss": 0.1312,
"step": 589
},
{
"epoch": 0.4643841007477371,
"grad_norm": 0.9300922465018149,
"learning_rate": 2.9981007476273787e-06,
"loss": 0.1272,
"step": 590
},
{
"epoch": 0.4651711924439197,
"grad_norm": 1.0381540629264334,
"learning_rate": 2.9917172119927607e-06,
"loss": 0.1479,
"step": 591
},
{
"epoch": 0.4659582841401023,
"grad_norm": 1.0642195977009175,
"learning_rate": 2.9853303396340695e-06,
"loss": 0.1364,
"step": 592
},
{
"epoch": 0.46674537583628495,
"grad_norm": 0.9295272897205104,
"learning_rate": 2.9789401738917244e-06,
"loss": 0.1249,
"step": 593
},
{
"epoch": 0.4675324675324675,
"grad_norm": 1.0180029223750298,
"learning_rate": 2.9725467581284944e-06,
"loss": 0.1407,
"step": 594
},
{
"epoch": 0.46831955922865015,
"grad_norm": 1.1385262618991847,
"learning_rate": 2.966150135729203e-06,
"loss": 0.1502,
"step": 595
},
{
"epoch": 0.4691066509248327,
"grad_norm": 1.0067715931565462,
"learning_rate": 2.9597503501004345e-06,
"loss": 0.1286,
"step": 596
},
{
"epoch": 0.46989374262101535,
"grad_norm": 0.9465710841629198,
"learning_rate": 2.9533474446702346e-06,
"loss": 0.1358,
"step": 597
},
{
"epoch": 0.4706808343171979,
"grad_norm": 1.04804051578767,
"learning_rate": 2.946941462887824e-06,
"loss": 0.1333,
"step": 598
},
{
"epoch": 0.47146792601338056,
"grad_norm": 1.0917713383450702,
"learning_rate": 2.940532448223296e-06,
"loss": 0.1462,
"step": 599
},
{
"epoch": 0.4722550177095632,
"grad_norm": 0.9580513732250364,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.1321,
"step": 600
},
{
"epoch": 0.47304210940574576,
"grad_norm": 0.9439921102070582,
"learning_rate": 2.927705494230875e-06,
"loss": 0.1441,
"step": 601
},
{
"epoch": 0.4738292011019284,
"grad_norm": 1.0178216949448748,
"learning_rate": 2.9212876419448943e-06,
"loss": 0.1405,
"step": 602
},
{
"epoch": 0.47461629279811096,
"grad_norm": 1.0297426762245179,
"learning_rate": 2.9148669308600298e-06,
"loss": 0.1392,
"step": 603
},
{
"epoch": 0.4754033844942936,
"grad_norm": 0.9415986568330708,
"learning_rate": 2.9084434045463255e-06,
"loss": 0.1282,
"step": 604
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.0337230890115443,
"learning_rate": 2.9020171065929327e-06,
"loss": 0.1394,
"step": 605
},
{
"epoch": 0.4769775678866588,
"grad_norm": 1.0540052550471415,
"learning_rate": 2.895588080607807e-06,
"loss": 0.1472,
"step": 606
},
{
"epoch": 0.4777646595828414,
"grad_norm": 1.0081872244466563,
"learning_rate": 2.8891563702174174e-06,
"loss": 0.1372,
"step": 607
},
{
"epoch": 0.478551751279024,
"grad_norm": 1.0145019904402564,
"learning_rate": 2.8827220190664505e-06,
"loss": 0.1399,
"step": 608
},
{
"epoch": 0.4793388429752066,
"grad_norm": 1.0258604105718838,
"learning_rate": 2.8762850708175098e-06,
"loss": 0.1499,
"step": 609
},
{
"epoch": 0.4801259346713892,
"grad_norm": 1.0836484331180423,
"learning_rate": 2.869845569150825e-06,
"loss": 0.1388,
"step": 610
},
{
"epoch": 0.4809130263675718,
"grad_norm": 0.9946389106293178,
"learning_rate": 2.863403557763951e-06,
"loss": 0.1323,
"step": 611
},
{
"epoch": 0.4817001180637544,
"grad_norm": 0.9968164583365795,
"learning_rate": 2.856959080371474e-06,
"loss": 0.1402,
"step": 612
},
{
"epoch": 0.48248720975993703,
"grad_norm": 1.0526146596249044,
"learning_rate": 2.8505121807047155e-06,
"loss": 0.1342,
"step": 613
},
{
"epoch": 0.48327430145611966,
"grad_norm": 0.9881771003275511,
"learning_rate": 2.8440629025114308e-06,
"loss": 0.1414,
"step": 614
},
{
"epoch": 0.48406139315230223,
"grad_norm": 1.0170639400089367,
"learning_rate": 2.8376112895555184e-06,
"loss": 0.1415,
"step": 615
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.9618458339894986,
"learning_rate": 2.83115738561672e-06,
"loss": 0.125,
"step": 616
},
{
"epoch": 0.48563557654466744,
"grad_norm": 1.166675709546666,
"learning_rate": 2.8247012344903235e-06,
"loss": 0.1537,
"step": 617
},
{
"epoch": 0.48642266824085006,
"grad_norm": 1.0308351089525765,
"learning_rate": 2.8182428799868643e-06,
"loss": 0.1435,
"step": 618
},
{
"epoch": 0.48720975993703264,
"grad_norm": 0.9008466844444718,
"learning_rate": 2.811782365931832e-06,
"loss": 0.1255,
"step": 619
},
{
"epoch": 0.48799685163321527,
"grad_norm": 1.0328591551300574,
"learning_rate": 2.8053197361653684e-06,
"loss": 0.1431,
"step": 620
},
{
"epoch": 0.4887839433293979,
"grad_norm": 1.0223227370370647,
"learning_rate": 2.7988550345419733e-06,
"loss": 0.1302,
"step": 621
},
{
"epoch": 0.48957103502558047,
"grad_norm": 1.0130656273790444,
"learning_rate": 2.792388304930207e-06,
"loss": 0.1413,
"step": 622
},
{
"epoch": 0.4903581267217631,
"grad_norm": 0.9678629630338841,
"learning_rate": 2.7859195912123875e-06,
"loss": 0.1411,
"step": 623
},
{
"epoch": 0.4911452184179457,
"grad_norm": 1.0630235458290422,
"learning_rate": 2.779448937284302e-06,
"loss": 0.144,
"step": 624
},
{
"epoch": 0.4919323101141283,
"grad_norm": 1.1368466359085148,
"learning_rate": 2.772976387054899e-06,
"loss": 0.1603,
"step": 625
},
{
"epoch": 0.4927194018103109,
"grad_norm": 1.0638972206646764,
"learning_rate": 2.766501984445999e-06,
"loss": 0.1469,
"step": 626
},
{
"epoch": 0.4935064935064935,
"grad_norm": 0.9878723437777639,
"learning_rate": 2.7600257733919887e-06,
"loss": 0.1347,
"step": 627
},
{
"epoch": 0.49429358520267613,
"grad_norm": 0.9482438523704221,
"learning_rate": 2.7535477978395295e-06,
"loss": 0.1301,
"step": 628
},
{
"epoch": 0.4950806768988587,
"grad_norm": 1.0213978192147322,
"learning_rate": 2.7470681017472556e-06,
"loss": 0.1442,
"step": 629
},
{
"epoch": 0.49586776859504134,
"grad_norm": 1.0113916573838844,
"learning_rate": 2.740586729085476e-06,
"loss": 0.1477,
"step": 630
},
{
"epoch": 0.4966548602912239,
"grad_norm": 1.0353820062718653,
"learning_rate": 2.7341037238358774e-06,
"loss": 0.1483,
"step": 631
},
{
"epoch": 0.49744195198740654,
"grad_norm": 1.0552352024187672,
"learning_rate": 2.727619129991224e-06,
"loss": 0.1328,
"step": 632
},
{
"epoch": 0.4982290436835891,
"grad_norm": 0.9937705442973395,
"learning_rate": 2.7211329915550615e-06,
"loss": 0.1409,
"step": 633
},
{
"epoch": 0.49901613537977174,
"grad_norm": 1.0486309341654392,
"learning_rate": 2.714645352541415e-06,
"loss": 0.15,
"step": 634
},
{
"epoch": 0.49980322707595437,
"grad_norm": 1.015369060592149,
"learning_rate": 2.7081562569744948e-06,
"loss": 0.1298,
"step": 635
},
{
"epoch": 0.500590318772137,
"grad_norm": 1.014091287328762,
"learning_rate": 2.701665748888393e-06,
"loss": 0.139,
"step": 636
},
{
"epoch": 0.5013774104683195,
"grad_norm": 1.010797057516188,
"learning_rate": 2.695173872326788e-06,
"loss": 0.1306,
"step": 637
},
{
"epoch": 0.5021645021645021,
"grad_norm": 0.9886264059190445,
"learning_rate": 2.6886806713426435e-06,
"loss": 0.1493,
"step": 638
},
{
"epoch": 0.5029515938606848,
"grad_norm": 0.9006497838538798,
"learning_rate": 2.6821861899979116e-06,
"loss": 0.127,
"step": 639
},
{
"epoch": 0.5037386855568674,
"grad_norm": 1.0409028373992908,
"learning_rate": 2.6756904723632325e-06,
"loss": 0.1453,
"step": 640
},
{
"epoch": 0.50452577725305,
"grad_norm": 0.9741943151013064,
"learning_rate": 2.6691935625176357e-06,
"loss": 0.1353,
"step": 641
},
{
"epoch": 0.5053128689492326,
"grad_norm": 0.949636504358609,
"learning_rate": 2.6626955045482405e-06,
"loss": 0.1335,
"step": 642
},
{
"epoch": 0.5060999606454152,
"grad_norm": 0.9249297082390363,
"learning_rate": 2.6561963425499575e-06,
"loss": 0.1338,
"step": 643
},
{
"epoch": 0.5068870523415978,
"grad_norm": 1.0151555535359889,
"learning_rate": 2.649696120625188e-06,
"loss": 0.1515,
"step": 644
},
{
"epoch": 0.5076741440377804,
"grad_norm": 1.039472398997662,
"learning_rate": 2.643194882883528e-06,
"loss": 0.1474,
"step": 645
},
{
"epoch": 0.508461235733963,
"grad_norm": 0.9434610266773801,
"learning_rate": 2.6366926734414648e-06,
"loss": 0.1304,
"step": 646
},
{
"epoch": 0.5092483274301456,
"grad_norm": 0.8865198426440791,
"learning_rate": 2.6301895364220816e-06,
"loss": 0.1202,
"step": 647
},
{
"epoch": 0.5100354191263282,
"grad_norm": 0.9546278944005607,
"learning_rate": 2.6236855159547527e-06,
"loss": 0.1291,
"step": 648
},
{
"epoch": 0.5108225108225108,
"grad_norm": 0.9693013564144493,
"learning_rate": 2.6171806561748503e-06,
"loss": 0.1339,
"step": 649
},
{
"epoch": 0.5116096025186935,
"grad_norm": 1.0027100891356027,
"learning_rate": 2.610675001223441e-06,
"loss": 0.1407,
"step": 650
},
{
"epoch": 0.512396694214876,
"grad_norm": 0.8560729540932264,
"learning_rate": 2.6041685952469877e-06,
"loss": 0.116,
"step": 651
},
{
"epoch": 0.5131837859110586,
"grad_norm": 0.9257606438562741,
"learning_rate": 2.597661482397049e-06,
"loss": 0.1262,
"step": 652
},
{
"epoch": 0.5139708776072412,
"grad_norm": 1.0514657045725575,
"learning_rate": 2.5911537068299803e-06,
"loss": 0.1469,
"step": 653
},
{
"epoch": 0.5147579693034239,
"grad_norm": 0.9545058570137028,
"learning_rate": 2.584645312706634e-06,
"loss": 0.1302,
"step": 654
},
{
"epoch": 0.5155450609996065,
"grad_norm": 0.9392962167917809,
"learning_rate": 2.5781363441920614e-06,
"loss": 0.1335,
"step": 655
},
{
"epoch": 0.516332152695789,
"grad_norm": 0.9496925045032614,
"learning_rate": 2.5716268454552094e-06,
"loss": 0.135,
"step": 656
},
{
"epoch": 0.5171192443919717,
"grad_norm": 1.002838327785164,
"learning_rate": 2.565116860668625e-06,
"loss": 0.1316,
"step": 657
},
{
"epoch": 0.5179063360881543,
"grad_norm": 1.053225279922735,
"learning_rate": 2.5586064340081516e-06,
"loss": 0.1512,
"step": 658
},
{
"epoch": 0.5186934277843369,
"grad_norm": 0.9441130740117648,
"learning_rate": 2.5520956096526323e-06,
"loss": 0.131,
"step": 659
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.9889334534500898,
"learning_rate": 2.5455844317836077e-06,
"loss": 0.1331,
"step": 660
},
{
"epoch": 0.5202676111767021,
"grad_norm": 0.9646380257679634,
"learning_rate": 2.53907294458502e-06,
"loss": 0.1291,
"step": 661
},
{
"epoch": 0.5210547028728847,
"grad_norm": 1.0529655580058879,
"learning_rate": 2.5325611922429074e-06,
"loss": 0.1491,
"step": 662
},
{
"epoch": 0.5218417945690673,
"grad_norm": 0.9926818678117324,
"learning_rate": 2.5260492189451076e-06,
"loss": 0.1443,
"step": 663
},
{
"epoch": 0.52262888626525,
"grad_norm": 0.9144135454846201,
"learning_rate": 2.51953706888096e-06,
"loss": 0.1217,
"step": 664
},
{
"epoch": 0.5234159779614325,
"grad_norm": 0.921592430215234,
"learning_rate": 2.513024786241001e-06,
"loss": 0.1248,
"step": 665
},
{
"epoch": 0.5242030696576151,
"grad_norm": 0.9514782593826102,
"learning_rate": 2.5065124152166692e-06,
"loss": 0.1297,
"step": 666
},
{
"epoch": 0.5249901613537977,
"grad_norm": 1.0117305817250293,
"learning_rate": 2.5e-06,
"loss": 0.1497,
"step": 667
},
{
"epoch": 0.5257772530499804,
"grad_norm": 1.013668842181626,
"learning_rate": 2.4934875847833308e-06,
"loss": 0.1224,
"step": 668
},
{
"epoch": 0.526564344746163,
"grad_norm": 1.000211936689413,
"learning_rate": 2.4869752137589994e-06,
"loss": 0.1419,
"step": 669
},
{
"epoch": 0.5273514364423455,
"grad_norm": 0.9733370358487723,
"learning_rate": 2.48046293111904e-06,
"loss": 0.1245,
"step": 670
},
{
"epoch": 0.5281385281385281,
"grad_norm": 1.0646062724041805,
"learning_rate": 2.473950781054893e-06,
"loss": 0.1383,
"step": 671
},
{
"epoch": 0.5289256198347108,
"grad_norm": 0.930785726380819,
"learning_rate": 2.467438807757094e-06,
"loss": 0.1295,
"step": 672
},
{
"epoch": 0.5297127115308934,
"grad_norm": 0.9786127857256359,
"learning_rate": 2.460927055414981e-06,
"loss": 0.146,
"step": 673
},
{
"epoch": 0.5304998032270759,
"grad_norm": 0.9744140929407867,
"learning_rate": 2.4544155682163922e-06,
"loss": 0.1298,
"step": 674
},
{
"epoch": 0.5312868949232585,
"grad_norm": 0.9484782784554407,
"learning_rate": 2.447904390347369e-06,
"loss": 0.1278,
"step": 675
},
{
"epoch": 0.5320739866194412,
"grad_norm": 1.1172109264151044,
"learning_rate": 2.441393565991849e-06,
"loss": 0.146,
"step": 676
},
{
"epoch": 0.5328610783156238,
"grad_norm": 0.9832334733375834,
"learning_rate": 2.4348831393313763e-06,
"loss": 0.1341,
"step": 677
},
{
"epoch": 0.5336481700118064,
"grad_norm": 0.9993612072993626,
"learning_rate": 2.428373154544791e-06,
"loss": 0.1348,
"step": 678
},
{
"epoch": 0.5344352617079889,
"grad_norm": 0.8743067419696096,
"learning_rate": 2.42186365580794e-06,
"loss": 0.1127,
"step": 679
},
{
"epoch": 0.5352223534041716,
"grad_norm": 0.9878917132746777,
"learning_rate": 2.4153546872933667e-06,
"loss": 0.1289,
"step": 680
},
{
"epoch": 0.5360094451003542,
"grad_norm": 0.9872701224310093,
"learning_rate": 2.4088462931700214e-06,
"loss": 0.1382,
"step": 681
},
{
"epoch": 0.5367965367965368,
"grad_norm": 1.0291331541759994,
"learning_rate": 2.4023385176029516e-06,
"loss": 0.1398,
"step": 682
},
{
"epoch": 0.5375836284927195,
"grad_norm": 1.0314844938730774,
"learning_rate": 2.3958314047530127e-06,
"loss": 0.1407,
"step": 683
},
{
"epoch": 0.538370720188902,
"grad_norm": 0.9922009235690711,
"learning_rate": 2.3893249987765598e-06,
"loss": 0.1375,
"step": 684
},
{
"epoch": 0.5391578118850846,
"grad_norm": 1.0407160996339295,
"learning_rate": 2.3828193438251497e-06,
"loss": 0.1356,
"step": 685
},
{
"epoch": 0.5399449035812672,
"grad_norm": 0.9860703004700557,
"learning_rate": 2.376314484045248e-06,
"loss": 0.132,
"step": 686
},
{
"epoch": 0.5407319952774499,
"grad_norm": 1.0540933767364977,
"learning_rate": 2.369810463577919e-06,
"loss": 0.1467,
"step": 687
},
{
"epoch": 0.5415190869736324,
"grad_norm": 1.0135356185084303,
"learning_rate": 2.3633073265585356e-06,
"loss": 0.1381,
"step": 688
},
{
"epoch": 0.542306178669815,
"grad_norm": 0.9743937278639236,
"learning_rate": 2.3568051171164724e-06,
"loss": 0.1324,
"step": 689
},
{
"epoch": 0.5430932703659976,
"grad_norm": 1.0422560526589146,
"learning_rate": 2.350303879374813e-06,
"loss": 0.136,
"step": 690
},
{
"epoch": 0.5438803620621803,
"grad_norm": 1.0503391352080245,
"learning_rate": 2.3438036574500434e-06,
"loss": 0.147,
"step": 691
},
{
"epoch": 0.5446674537583629,
"grad_norm": 0.9557517793781123,
"learning_rate": 2.3373044954517603e-06,
"loss": 0.1216,
"step": 692
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.9898057468780994,
"learning_rate": 2.330806437482365e-06,
"loss": 0.1342,
"step": 693
},
{
"epoch": 0.546241637150728,
"grad_norm": 0.9685373418602369,
"learning_rate": 2.3243095276367687e-06,
"loss": 0.1294,
"step": 694
},
{
"epoch": 0.5470287288469107,
"grad_norm": 1.0187901801029866,
"learning_rate": 2.317813810002089e-06,
"loss": 0.1366,
"step": 695
},
{
"epoch": 0.5478158205430933,
"grad_norm": 1.036393473441657,
"learning_rate": 2.3113193286573577e-06,
"loss": 0.1384,
"step": 696
},
{
"epoch": 0.5486029122392759,
"grad_norm": 0.9735402694275894,
"learning_rate": 2.3048261276732133e-06,
"loss": 0.1325,
"step": 697
},
{
"epoch": 0.5493900039354584,
"grad_norm": 0.9435211562075637,
"learning_rate": 2.298334251111607e-06,
"loss": 0.1272,
"step": 698
},
{
"epoch": 0.5501770956316411,
"grad_norm": 0.9238771765346788,
"learning_rate": 2.2918437430255056e-06,
"loss": 0.1329,
"step": 699
},
{
"epoch": 0.5509641873278237,
"grad_norm": 0.9732329075427437,
"learning_rate": 2.285354647458585e-06,
"loss": 0.1316,
"step": 700
},
{
"epoch": 0.5517512790240063,
"grad_norm": 1.032698839528823,
"learning_rate": 2.2788670084449393e-06,
"loss": 0.1438,
"step": 701
},
{
"epoch": 0.5525383707201889,
"grad_norm": 0.9453000245373157,
"learning_rate": 2.2723808700087764e-06,
"loss": 0.1349,
"step": 702
},
{
"epoch": 0.5533254624163715,
"grad_norm": 1.017447417352295,
"learning_rate": 2.2658962761641235e-06,
"loss": 0.1346,
"step": 703
},
{
"epoch": 0.5541125541125541,
"grad_norm": 1.0593240948345142,
"learning_rate": 2.2594132709145245e-06,
"loss": 0.1391,
"step": 704
},
{
"epoch": 0.5548996458087367,
"grad_norm": 1.0524825811903469,
"learning_rate": 2.2529318982527453e-06,
"loss": 0.1397,
"step": 705
},
{
"epoch": 0.5556867375049194,
"grad_norm": 0.9994684504324172,
"learning_rate": 2.246452202160471e-06,
"loss": 0.1416,
"step": 706
},
{
"epoch": 0.5564738292011019,
"grad_norm": 1.033406524556106,
"learning_rate": 2.2399742266080126e-06,
"loss": 0.1269,
"step": 707
},
{
"epoch": 0.5572609208972845,
"grad_norm": 0.9246218098662428,
"learning_rate": 2.233498015554002e-06,
"loss": 0.1242,
"step": 708
},
{
"epoch": 0.5580480125934671,
"grad_norm": 0.904021008692359,
"learning_rate": 2.227023612945102e-06,
"loss": 0.1217,
"step": 709
},
{
"epoch": 0.5588351042896498,
"grad_norm": 0.9921447266760961,
"learning_rate": 2.220551062715699e-06,
"loss": 0.1241,
"step": 710
},
{
"epoch": 0.5596221959858324,
"grad_norm": 1.0273952935358304,
"learning_rate": 2.2140804087876134e-06,
"loss": 0.1244,
"step": 711
},
{
"epoch": 0.5604092876820149,
"grad_norm": 0.9958421204937957,
"learning_rate": 2.207611695069794e-06,
"loss": 0.1277,
"step": 712
},
{
"epoch": 0.5611963793781976,
"grad_norm": 1.0226304738126037,
"learning_rate": 2.2011449654580266e-06,
"loss": 0.1319,
"step": 713
},
{
"epoch": 0.5619834710743802,
"grad_norm": 0.9427241568832295,
"learning_rate": 2.1946802638346324e-06,
"loss": 0.1208,
"step": 714
},
{
"epoch": 0.5627705627705628,
"grad_norm": 0.9526667511261941,
"learning_rate": 2.1882176340681682e-06,
"loss": 0.1234,
"step": 715
},
{
"epoch": 0.5635576544667453,
"grad_norm": 0.9726636294262463,
"learning_rate": 2.181757120013136e-06,
"loss": 0.1241,
"step": 716
},
{
"epoch": 0.564344746162928,
"grad_norm": 0.9577642489452165,
"learning_rate": 2.1752987655096765e-06,
"loss": 0.1286,
"step": 717
},
{
"epoch": 0.5651318378591106,
"grad_norm": 0.9119267395234483,
"learning_rate": 2.1688426143832804e-06,
"loss": 0.132,
"step": 718
},
{
"epoch": 0.5659189295552932,
"grad_norm": 0.944139995902989,
"learning_rate": 2.162388710444482e-06,
"loss": 0.1234,
"step": 719
},
{
"epoch": 0.5667060212514758,
"grad_norm": 0.9645692490749199,
"learning_rate": 2.155937097488571e-06,
"loss": 0.1251,
"step": 720
},
{
"epoch": 0.5674931129476584,
"grad_norm": 0.9720255939912888,
"learning_rate": 2.1494878192952857e-06,
"loss": 0.1319,
"step": 721
},
{
"epoch": 0.568280204643841,
"grad_norm": 0.9511775624645177,
"learning_rate": 2.1430409196285268e-06,
"loss": 0.1327,
"step": 722
},
{
"epoch": 0.5690672963400236,
"grad_norm": 0.9008868958605895,
"learning_rate": 2.1365964422360495e-06,
"loss": 0.1257,
"step": 723
},
{
"epoch": 0.5698543880362062,
"grad_norm": 1.0087528087899673,
"learning_rate": 2.1301544308491755e-06,
"loss": 0.1404,
"step": 724
},
{
"epoch": 0.5706414797323889,
"grad_norm": 0.9236247008656706,
"learning_rate": 2.1237149291824906e-06,
"loss": 0.1122,
"step": 725
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.9656011748968637,
"learning_rate": 2.11727798093355e-06,
"loss": 0.1238,
"step": 726
},
{
"epoch": 0.572215663124754,
"grad_norm": 0.9574761410065884,
"learning_rate": 2.110843629782583e-06,
"loss": 0.1205,
"step": 727
},
{
"epoch": 0.5730027548209367,
"grad_norm": 1.0073901156504852,
"learning_rate": 2.1044119193921935e-06,
"loss": 0.141,
"step": 728
},
{
"epoch": 0.5737898465171193,
"grad_norm": 1.0135184994615516,
"learning_rate": 2.097982893407068e-06,
"loss": 0.1391,
"step": 729
},
{
"epoch": 0.5745769382133018,
"grad_norm": 0.9943855979768463,
"learning_rate": 2.0915565954536745e-06,
"loss": 0.1261,
"step": 730
},
{
"epoch": 0.5753640299094844,
"grad_norm": 0.9263403288426786,
"learning_rate": 2.085133069139971e-06,
"loss": 0.1199,
"step": 731
},
{
"epoch": 0.5761511216056671,
"grad_norm": 0.9250174978891127,
"learning_rate": 2.078712358055106e-06,
"loss": 0.1292,
"step": 732
},
{
"epoch": 0.5769382133018497,
"grad_norm": 0.9594966083023022,
"learning_rate": 2.0722945057691253e-06,
"loss": 0.13,
"step": 733
},
{
"epoch": 0.5777253049980323,
"grad_norm": 0.9996158846425939,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.1346,
"step": 734
},
{
"epoch": 0.5785123966942148,
"grad_norm": 0.9512575411801811,
"learning_rate": 2.059467551776705e-06,
"loss": 0.1277,
"step": 735
},
{
"epoch": 0.5792994883903975,
"grad_norm": 0.9956048932258434,
"learning_rate": 2.053058537112177e-06,
"loss": 0.1364,
"step": 736
},
{
"epoch": 0.5800865800865801,
"grad_norm": 0.9822563309180476,
"learning_rate": 2.0466525553297666e-06,
"loss": 0.124,
"step": 737
},
{
"epoch": 0.5808736717827627,
"grad_norm": 0.9983781840041562,
"learning_rate": 2.0402496498995667e-06,
"loss": 0.1347,
"step": 738
},
{
"epoch": 0.5816607634789452,
"grad_norm": 0.9827604253780587,
"learning_rate": 2.0338498642707977e-06,
"loss": 0.1369,
"step": 739
},
{
"epoch": 0.5824478551751279,
"grad_norm": 1.0756801683767687,
"learning_rate": 2.027453241871506e-06,
"loss": 0.1323,
"step": 740
},
{
"epoch": 0.5832349468713105,
"grad_norm": 1.0081971409472221,
"learning_rate": 2.0210598261082764e-06,
"loss": 0.1356,
"step": 741
},
{
"epoch": 0.5840220385674931,
"grad_norm": 0.9898906943423369,
"learning_rate": 2.014669660365931e-06,
"loss": 0.1368,
"step": 742
},
{
"epoch": 0.5848091302636758,
"grad_norm": 0.9690524566063999,
"learning_rate": 2.0082827880072393e-06,
"loss": 0.135,
"step": 743
},
{
"epoch": 0.5855962219598583,
"grad_norm": 0.9708412001010785,
"learning_rate": 2.0018992523726217e-06,
"loss": 0.1252,
"step": 744
},
{
"epoch": 0.5863833136560409,
"grad_norm": 0.9322317029959182,
"learning_rate": 1.995519096779855e-06,
"loss": 0.1205,
"step": 745
},
{
"epoch": 0.5871704053522235,
"grad_norm": 0.934365579766912,
"learning_rate": 1.9891423645237832e-06,
"loss": 0.1194,
"step": 746
},
{
"epoch": 0.5879574970484062,
"grad_norm": 0.9421279165900748,
"learning_rate": 1.982769098876015e-06,
"loss": 0.1319,
"step": 747
},
{
"epoch": 0.5887445887445888,
"grad_norm": 0.9954570155310445,
"learning_rate": 1.9763993430846394e-06,
"loss": 0.1369,
"step": 748
},
{
"epoch": 0.5895316804407713,
"grad_norm": 0.931791564112829,
"learning_rate": 1.970033140373925e-06,
"loss": 0.1315,
"step": 749
},
{
"epoch": 0.5903187721369539,
"grad_norm": 0.9679723780616554,
"learning_rate": 1.9636705339440327e-06,
"loss": 0.1377,
"step": 750
},
{
"epoch": 0.5911058638331366,
"grad_norm": 0.9863750681505877,
"learning_rate": 1.957311566970716e-06,
"loss": 0.1293,
"step": 751
},
{
"epoch": 0.5918929555293192,
"grad_norm": 0.9598202963903522,
"learning_rate": 1.9509562826050353e-06,
"loss": 0.1273,
"step": 752
},
{
"epoch": 0.5926800472255017,
"grad_norm": 0.9800756257622318,
"learning_rate": 1.944604723973058e-06,
"loss": 0.1284,
"step": 753
},
{
"epoch": 0.5934671389216843,
"grad_norm": 1.0001757874575956,
"learning_rate": 1.938256934175571e-06,
"loss": 0.1303,
"step": 754
},
{
"epoch": 0.594254230617867,
"grad_norm": 0.9299561635899479,
"learning_rate": 1.9319129562877863e-06,
"loss": 0.1239,
"step": 755
},
{
"epoch": 0.5950413223140496,
"grad_norm": 1.0345095738407815,
"learning_rate": 1.925572833359048e-06,
"loss": 0.1305,
"step": 756
},
{
"epoch": 0.5958284140102322,
"grad_norm": 1.0520535233317054,
"learning_rate": 1.9192366084125423e-06,
"loss": 0.1373,
"step": 757
},
{
"epoch": 0.5966155057064148,
"grad_norm": 1.029018918955376,
"learning_rate": 1.9129043244450027e-06,
"loss": 0.1382,
"step": 758
},
{
"epoch": 0.5974025974025974,
"grad_norm": 1.0294584813791954,
"learning_rate": 1.906576024426422e-06,
"loss": 0.1368,
"step": 759
},
{
"epoch": 0.59818968909878,
"grad_norm": 0.9330122675132353,
"learning_rate": 1.9002517512997555e-06,
"loss": 0.1145,
"step": 760
},
{
"epoch": 0.5989767807949626,
"grad_norm": 0.9623676868988281,
"learning_rate": 1.8939315479806352e-06,
"loss": 0.1335,
"step": 761
},
{
"epoch": 0.5997638724911453,
"grad_norm": 0.9245436138689049,
"learning_rate": 1.8876154573570744e-06,
"loss": 0.1307,
"step": 762
},
{
"epoch": 0.6005509641873278,
"grad_norm": 0.8942520983106202,
"learning_rate": 1.8813035222891785e-06,
"loss": 0.1272,
"step": 763
},
{
"epoch": 0.6013380558835104,
"grad_norm": 0.9343347217079488,
"learning_rate": 1.8749957856088546e-06,
"loss": 0.1317,
"step": 764
},
{
"epoch": 0.602125147579693,
"grad_norm": 0.9393324326245188,
"learning_rate": 1.8686922901195197e-06,
"loss": 0.1313,
"step": 765
},
{
"epoch": 0.6029122392758757,
"grad_norm": 0.9333264421793994,
"learning_rate": 1.8623930785958092e-06,
"loss": 0.1226,
"step": 766
},
{
"epoch": 0.6036993309720582,
"grad_norm": 0.9718728327996774,
"learning_rate": 1.8560981937832916e-06,
"loss": 0.1314,
"step": 767
},
{
"epoch": 0.6044864226682408,
"grad_norm": 0.9437466891844623,
"learning_rate": 1.849807678398171e-06,
"loss": 0.1271,
"step": 768
},
{
"epoch": 0.6052735143644234,
"grad_norm": 0.9433172532376,
"learning_rate": 1.8435215751270048e-06,
"loss": 0.1083,
"step": 769
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.9486485907428178,
"learning_rate": 1.8372399266264069e-06,
"loss": 0.1245,
"step": 770
},
{
"epoch": 0.6068476977567887,
"grad_norm": 0.9345585045873044,
"learning_rate": 1.8309627755227643e-06,
"loss": 0.1205,
"step": 771
},
{
"epoch": 0.6076347894529712,
"grad_norm": 1.0082946745736912,
"learning_rate": 1.8246901644119447e-06,
"loss": 0.1337,
"step": 772
},
{
"epoch": 0.6084218811491539,
"grad_norm": 0.9640602588467792,
"learning_rate": 1.8184221358590078e-06,
"loss": 0.123,
"step": 773
},
{
"epoch": 0.6092089728453365,
"grad_norm": 1.0256774883323883,
"learning_rate": 1.812158732397917e-06,
"loss": 0.1331,
"step": 774
},
{
"epoch": 0.6099960645415191,
"grad_norm": 0.9485492161002549,
"learning_rate": 1.8058999965312484e-06,
"loss": 0.1328,
"step": 775
},
{
"epoch": 0.6107831562377017,
"grad_norm": 0.9763406590147844,
"learning_rate": 1.799645970729909e-06,
"loss": 0.1309,
"step": 776
},
{
"epoch": 0.6115702479338843,
"grad_norm": 0.9917227985654803,
"learning_rate": 1.793396697432839e-06,
"loss": 0.1349,
"step": 777
},
{
"epoch": 0.6123573396300669,
"grad_norm": 0.9926597353156553,
"learning_rate": 1.7871522190467327e-06,
"loss": 0.1303,
"step": 778
},
{
"epoch": 0.6131444313262495,
"grad_norm": 0.9259479446299848,
"learning_rate": 1.7809125779457432e-06,
"loss": 0.1145,
"step": 779
},
{
"epoch": 0.6139315230224321,
"grad_norm": 0.9291022839595524,
"learning_rate": 1.7746778164712024e-06,
"loss": 0.119,
"step": 780
},
{
"epoch": 0.6147186147186147,
"grad_norm": 0.951539094911597,
"learning_rate": 1.768447976931326e-06,
"loss": 0.1261,
"step": 781
},
{
"epoch": 0.6155057064147973,
"grad_norm": 0.993556323836548,
"learning_rate": 1.7622231016009333e-06,
"loss": 0.1297,
"step": 782
},
{
"epoch": 0.6162927981109799,
"grad_norm": 0.9391833540663885,
"learning_rate": 1.7560032327211546e-06,
"loss": 0.124,
"step": 783
},
{
"epoch": 0.6170798898071626,
"grad_norm": 1.031878772377542,
"learning_rate": 1.7497884124991487e-06,
"loss": 0.1308,
"step": 784
},
{
"epoch": 0.6178669815033452,
"grad_norm": 0.9834671756142636,
"learning_rate": 1.7435786831078144e-06,
"loss": 0.1303,
"step": 785
},
{
"epoch": 0.6186540731995277,
"grad_norm": 0.9859388240495401,
"learning_rate": 1.7373740866855043e-06,
"loss": 0.1326,
"step": 786
},
{
"epoch": 0.6194411648957103,
"grad_norm": 1.0156315373671152,
"learning_rate": 1.731174665335742e-06,
"loss": 0.1333,
"step": 787
},
{
"epoch": 0.620228256591893,
"grad_norm": 0.8457875340285443,
"learning_rate": 1.724980461126929e-06,
"loss": 0.1149,
"step": 788
},
{
"epoch": 0.6210153482880756,
"grad_norm": 0.9812167735229308,
"learning_rate": 1.7187915160920692e-06,
"loss": 0.1341,
"step": 789
},
{
"epoch": 0.6218024399842582,
"grad_norm": 0.9479256338770862,
"learning_rate": 1.7126078722284739e-06,
"loss": 0.1171,
"step": 790
},
{
"epoch": 0.6225895316804407,
"grad_norm": 0.9626147311559159,
"learning_rate": 1.706429571497486e-06,
"loss": 0.1195,
"step": 791
},
{
"epoch": 0.6233766233766234,
"grad_norm": 0.996537388600602,
"learning_rate": 1.7002566558241862e-06,
"loss": 0.1347,
"step": 792
},
{
"epoch": 0.624163715072806,
"grad_norm": 1.0979789094333103,
"learning_rate": 1.694089167097116e-06,
"loss": 0.1442,
"step": 793
},
{
"epoch": 0.6249508067689886,
"grad_norm": 0.9903394042224888,
"learning_rate": 1.6879271471679887e-06,
"loss": 0.1275,
"step": 794
},
{
"epoch": 0.6257378984651711,
"grad_norm": 0.9904940377814807,
"learning_rate": 1.681770637851409e-06,
"loss": 0.139,
"step": 795
},
{
"epoch": 0.6265249901613538,
"grad_norm": 0.9969630818236452,
"learning_rate": 1.675619680924584e-06,
"loss": 0.1325,
"step": 796
},
{
"epoch": 0.6273120818575364,
"grad_norm": 1.0558109930918702,
"learning_rate": 1.6694743181270474e-06,
"loss": 0.1448,
"step": 797
},
{
"epoch": 0.628099173553719,
"grad_norm": 0.956496508451797,
"learning_rate": 1.663334591160368e-06,
"loss": 0.1217,
"step": 798
},
{
"epoch": 0.6288862652499017,
"grad_norm": 0.9677341961932617,
"learning_rate": 1.657200541687874e-06,
"loss": 0.136,
"step": 799
},
{
"epoch": 0.6296733569460842,
"grad_norm": 1.0046971327809577,
"learning_rate": 1.6510722113343633e-06,
"loss": 0.1322,
"step": 800
},
{
"epoch": 0.6304604486422668,
"grad_norm": 0.9254874616921521,
"learning_rate": 1.6449496416858285e-06,
"loss": 0.1227,
"step": 801
},
{
"epoch": 0.6312475403384494,
"grad_norm": 1.128513971443689,
"learning_rate": 1.6388328742891679e-06,
"loss": 0.1357,
"step": 802
},
{
"epoch": 0.6320346320346321,
"grad_norm": 1.0186779597478501,
"learning_rate": 1.6327219506519082e-06,
"loss": 0.1369,
"step": 803
},
{
"epoch": 0.6328217237308147,
"grad_norm": 0.9605839825909683,
"learning_rate": 1.6266169122419208e-06,
"loss": 0.1222,
"step": 804
},
{
"epoch": 0.6336088154269972,
"grad_norm": 1.0294396317293524,
"learning_rate": 1.6205178004871392e-06,
"loss": 0.1265,
"step": 805
},
{
"epoch": 0.6343959071231798,
"grad_norm": 0.9797067002404048,
"learning_rate": 1.6144246567752831e-06,
"loss": 0.1298,
"step": 806
},
{
"epoch": 0.6351829988193625,
"grad_norm": 0.9531255383226177,
"learning_rate": 1.6083375224535689e-06,
"loss": 0.1204,
"step": 807
},
{
"epoch": 0.6359700905155451,
"grad_norm": 0.9228494475342526,
"learning_rate": 1.6022564388284391e-06,
"loss": 0.1122,
"step": 808
},
{
"epoch": 0.6367571822117276,
"grad_norm": 0.9854787445128979,
"learning_rate": 1.596181447165273e-06,
"loss": 0.1287,
"step": 809
},
{
"epoch": 0.6375442739079102,
"grad_norm": 0.9205768495534565,
"learning_rate": 1.5901125886881147e-06,
"loss": 0.1206,
"step": 810
},
{
"epoch": 0.6383313656040929,
"grad_norm": 0.9798975631304712,
"learning_rate": 1.5840499045793845e-06,
"loss": 0.1231,
"step": 811
},
{
"epoch": 0.6391184573002755,
"grad_norm": 0.9296415144186752,
"learning_rate": 1.5779934359796095e-06,
"loss": 0.1202,
"step": 812
},
{
"epoch": 0.6399055489964581,
"grad_norm": 0.9814672778856722,
"learning_rate": 1.5719432239871347e-06,
"loss": 0.1211,
"step": 813
},
{
"epoch": 0.6406926406926406,
"grad_norm": 0.9650667010737961,
"learning_rate": 1.5658993096578512e-06,
"loss": 0.123,
"step": 814
},
{
"epoch": 0.6414797323888233,
"grad_norm": 0.9606428490346777,
"learning_rate": 1.5598617340049145e-06,
"loss": 0.1196,
"step": 815
},
{
"epoch": 0.6422668240850059,
"grad_norm": 0.9865842262641049,
"learning_rate": 1.5538305379984661e-06,
"loss": 0.1414,
"step": 816
},
{
"epoch": 0.6430539157811885,
"grad_norm": 0.9734870716677574,
"learning_rate": 1.547805762565358e-06,
"loss": 0.1286,
"step": 817
},
{
"epoch": 0.6438410074773712,
"grad_norm": 0.9439567875437019,
"learning_rate": 1.5417874485888706e-06,
"loss": 0.1109,
"step": 818
},
{
"epoch": 0.6446280991735537,
"grad_norm": 1.0235998100882107,
"learning_rate": 1.5357756369084398e-06,
"loss": 0.123,
"step": 819
},
{
"epoch": 0.6454151908697363,
"grad_norm": 1.0269524388931728,
"learning_rate": 1.5297703683193755e-06,
"loss": 0.1324,
"step": 820
},
{
"epoch": 0.6462022825659189,
"grad_norm": 0.9493642789135233,
"learning_rate": 1.5237716835725907e-06,
"loss": 0.1125,
"step": 821
},
{
"epoch": 0.6469893742621016,
"grad_norm": 1.162601877497598,
"learning_rate": 1.5177796233743174e-06,
"loss": 0.1249,
"step": 822
},
{
"epoch": 0.6477764659582841,
"grad_norm": 1.034038388079516,
"learning_rate": 1.511794228385837e-06,
"loss": 0.1217,
"step": 823
},
{
"epoch": 0.6485635576544667,
"grad_norm": 0.9672639615152381,
"learning_rate": 1.5058155392232004e-06,
"loss": 0.1208,
"step": 824
},
{
"epoch": 0.6493506493506493,
"grad_norm": 1.080256521732267,
"learning_rate": 1.4998435964569552e-06,
"loss": 0.1279,
"step": 825
},
{
"epoch": 0.650137741046832,
"grad_norm": 0.9417495839242918,
"learning_rate": 1.4938784406118663e-06,
"loss": 0.1249,
"step": 826
},
{
"epoch": 0.6509248327430146,
"grad_norm": 1.006350286001005,
"learning_rate": 1.4879201121666466e-06,
"loss": 0.1251,
"step": 827
},
{
"epoch": 0.6517119244391971,
"grad_norm": 0.968507626389286,
"learning_rate": 1.4819686515536763e-06,
"loss": 0.1203,
"step": 828
},
{
"epoch": 0.6524990161353798,
"grad_norm": 0.979256644659201,
"learning_rate": 1.4760240991587338e-06,
"loss": 0.1309,
"step": 829
},
{
"epoch": 0.6532861078315624,
"grad_norm": 1.041802414674734,
"learning_rate": 1.4700864953207192e-06,
"loss": 0.124,
"step": 830
},
{
"epoch": 0.654073199527745,
"grad_norm": 0.9623673717149763,
"learning_rate": 1.4641558803313783e-06,
"loss": 0.1153,
"step": 831
},
{
"epoch": 0.6548602912239276,
"grad_norm": 1.0049463554640272,
"learning_rate": 1.4582322944350335e-06,
"loss": 0.123,
"step": 832
},
{
"epoch": 0.6556473829201102,
"grad_norm": 0.9822560730942449,
"learning_rate": 1.4523157778283082e-06,
"loss": 0.1253,
"step": 833
},
{
"epoch": 0.6564344746162928,
"grad_norm": 1.0300014906979744,
"learning_rate": 1.4464063706598563e-06,
"loss": 0.121,
"step": 834
},
{
"epoch": 0.6572215663124754,
"grad_norm": 0.9605069437184749,
"learning_rate": 1.440504113030084e-06,
"loss": 0.1303,
"step": 835
},
{
"epoch": 0.658008658008658,
"grad_norm": 1.0062748427154549,
"learning_rate": 1.4346090449908862e-06,
"loss": 0.1254,
"step": 836
},
{
"epoch": 0.6587957497048406,
"grad_norm": 1.003505120930448,
"learning_rate": 1.4287212065453681e-06,
"loss": 0.1293,
"step": 837
},
{
"epoch": 0.6595828414010232,
"grad_norm": 0.9215109848797975,
"learning_rate": 1.4228406376475741e-06,
"loss": 0.1156,
"step": 838
},
{
"epoch": 0.6603699330972058,
"grad_norm": 1.0375359512611602,
"learning_rate": 1.4169673782022232e-06,
"loss": 0.1251,
"step": 839
},
{
"epoch": 0.6611570247933884,
"grad_norm": 1.0075633482471045,
"learning_rate": 1.411101468064429e-06,
"loss": 0.1273,
"step": 840
},
{
"epoch": 0.6619441164895711,
"grad_norm": 1.0079245494150497,
"learning_rate": 1.4052429470394353e-06,
"loss": 0.1302,
"step": 841
},
{
"epoch": 0.6627312081857536,
"grad_norm": 0.9589739631373009,
"learning_rate": 1.3993918548823453e-06,
"loss": 0.1219,
"step": 842
},
{
"epoch": 0.6635182998819362,
"grad_norm": 0.9854619269672102,
"learning_rate": 1.3935482312978494e-06,
"loss": 0.1264,
"step": 843
},
{
"epoch": 0.6643053915781189,
"grad_norm": 1.0139593156707545,
"learning_rate": 1.3877121159399587e-06,
"loss": 0.1352,
"step": 844
},
{
"epoch": 0.6650924832743015,
"grad_norm": 0.9879913850797528,
"learning_rate": 1.381883548411735e-06,
"loss": 0.1252,
"step": 845
},
{
"epoch": 0.6658795749704841,
"grad_norm": 0.9828821822604814,
"learning_rate": 1.376062568265018e-06,
"loss": 0.1262,
"step": 846
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.9902383754663022,
"learning_rate": 1.370249215000166e-06,
"loss": 0.1339,
"step": 847
},
{
"epoch": 0.6674537583628493,
"grad_norm": 1.0169925787410046,
"learning_rate": 1.3644435280657765e-06,
"loss": 0.1325,
"step": 848
},
{
"epoch": 0.6682408500590319,
"grad_norm": 0.9802382914836032,
"learning_rate": 1.3586455468584292e-06,
"loss": 0.1294,
"step": 849
},
{
"epoch": 0.6690279417552145,
"grad_norm": 0.9103087080426163,
"learning_rate": 1.3528553107224108e-06,
"loss": 0.1132,
"step": 850
},
{
"epoch": 0.669815033451397,
"grad_norm": 1.0322697690605673,
"learning_rate": 1.347072858949453e-06,
"loss": 0.1326,
"step": 851
},
{
"epoch": 0.6706021251475797,
"grad_norm": 0.940497609406273,
"learning_rate": 1.3412982307784617e-06,
"loss": 0.1142,
"step": 852
},
{
"epoch": 0.6713892168437623,
"grad_norm": 0.9651333506256994,
"learning_rate": 1.3355314653952555e-06,
"loss": 0.12,
"step": 853
},
{
"epoch": 0.6721763085399449,
"grad_norm": 0.8974492403550183,
"learning_rate": 1.3297726019322948e-06,
"loss": 0.1252,
"step": 854
},
{
"epoch": 0.6729634002361276,
"grad_norm": 0.9779192150286001,
"learning_rate": 1.3240216794684212e-06,
"loss": 0.1265,
"step": 855
},
{
"epoch": 0.6737504919323101,
"grad_norm": 1.0060169889058102,
"learning_rate": 1.3182787370285865e-06,
"loss": 0.1305,
"step": 856
},
{
"epoch": 0.6745375836284927,
"grad_norm": 0.9623311050243877,
"learning_rate": 1.3125438135835955e-06,
"loss": 0.114,
"step": 857
},
{
"epoch": 0.6753246753246753,
"grad_norm": 1.005880860747008,
"learning_rate": 1.3068169480498333e-06,
"loss": 0.1237,
"step": 858
},
{
"epoch": 0.676111767020858,
"grad_norm": 1.0295442665880505,
"learning_rate": 1.3010981792890053e-06,
"loss": 0.141,
"step": 859
},
{
"epoch": 0.6768988587170406,
"grad_norm": 0.9746775819035803,
"learning_rate": 1.2953875461078777e-06,
"loss": 0.1174,
"step": 860
},
{
"epoch": 0.6776859504132231,
"grad_norm": 0.9651023742880912,
"learning_rate": 1.289685087258004e-06,
"loss": 0.1179,
"step": 861
},
{
"epoch": 0.6784730421094057,
"grad_norm": 0.9778504990448126,
"learning_rate": 1.283990841435473e-06,
"loss": 0.1232,
"step": 862
},
{
"epoch": 0.6792601338055884,
"grad_norm": 0.9823411560425596,
"learning_rate": 1.2783048472806364e-06,
"loss": 0.1214,
"step": 863
},
{
"epoch": 0.680047225501771,
"grad_norm": 0.9509119170509043,
"learning_rate": 1.2726271433778559e-06,
"loss": 0.1331,
"step": 864
},
{
"epoch": 0.6808343171979535,
"grad_norm": 0.9637465369074552,
"learning_rate": 1.266957768255232e-06,
"loss": 0.1221,
"step": 865
},
{
"epoch": 0.6816214088941361,
"grad_norm": 1.0309739334485784,
"learning_rate": 1.2612967603843512e-06,
"loss": 0.1337,
"step": 866
},
{
"epoch": 0.6824085005903188,
"grad_norm": 0.9227141127754309,
"learning_rate": 1.2556441581800182e-06,
"loss": 0.1118,
"step": 867
},
{
"epoch": 0.6831955922865014,
"grad_norm": 0.983027599423059,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.1201,
"step": 868
},
{
"epoch": 0.683982683982684,
"grad_norm": 0.969869074022873,
"learning_rate": 1.2443643241447629e-06,
"loss": 0.1205,
"step": 869
},
{
"epoch": 0.6847697756788665,
"grad_norm": 0.9626068462653994,
"learning_rate": 1.2387371688572133e-06,
"loss": 0.1294,
"step": 870
},
{
"epoch": 0.6855568673750492,
"grad_norm": 0.9924688128052054,
"learning_rate": 1.233118572322437e-06,
"loss": 0.1193,
"step": 871
},
{
"epoch": 0.6863439590712318,
"grad_norm": 0.9409212105627156,
"learning_rate": 1.2275085726674442e-06,
"loss": 0.1186,
"step": 872
},
{
"epoch": 0.6871310507674144,
"grad_norm": 0.9321864217317675,
"learning_rate": 1.2219072079609046e-06,
"loss": 0.118,
"step": 873
},
{
"epoch": 0.6879181424635971,
"grad_norm": 0.8802354237634122,
"learning_rate": 1.2163145162128948e-06,
"loss": 0.1092,
"step": 874
},
{
"epoch": 0.6887052341597796,
"grad_norm": 0.9820858832906886,
"learning_rate": 1.2107305353746376e-06,
"loss": 0.1261,
"step": 875
},
{
"epoch": 0.6894923258559622,
"grad_norm": 1.0214787998802317,
"learning_rate": 1.2051553033382426e-06,
"loss": 0.121,
"step": 876
},
{
"epoch": 0.6902794175521448,
"grad_norm": 0.9157258726824631,
"learning_rate": 1.1995888579364551e-06,
"loss": 0.1189,
"step": 877
},
{
"epoch": 0.6910665092483275,
"grad_norm": 0.9531462191249618,
"learning_rate": 1.1940312369423919e-06,
"loss": 0.1184,
"step": 878
},
{
"epoch": 0.69185360094451,
"grad_norm": 0.9748879770068989,
"learning_rate": 1.18848247806929e-06,
"loss": 0.1201,
"step": 879
},
{
"epoch": 0.6926406926406926,
"grad_norm": 0.9952760658770881,
"learning_rate": 1.1829426189702487e-06,
"loss": 0.1211,
"step": 880
},
{
"epoch": 0.6934277843368752,
"grad_norm": 0.9561514586133496,
"learning_rate": 1.177411697237977e-06,
"loss": 0.1208,
"step": 881
},
{
"epoch": 0.6942148760330579,
"grad_norm": 1.0289787958991654,
"learning_rate": 1.1718897504045328e-06,
"loss": 0.1329,
"step": 882
},
{
"epoch": 0.6950019677292405,
"grad_norm": 1.012367533381528,
"learning_rate": 1.1663768159410748e-06,
"loss": 0.1286,
"step": 883
},
{
"epoch": 0.695789059425423,
"grad_norm": 0.9932326189371155,
"learning_rate": 1.160872931257602e-06,
"loss": 0.1207,
"step": 884
},
{
"epoch": 0.6965761511216056,
"grad_norm": 0.9375475650331836,
"learning_rate": 1.1553781337027061e-06,
"loss": 0.1162,
"step": 885
},
{
"epoch": 0.6973632428177883,
"grad_norm": 1.0035582921316957,
"learning_rate": 1.149892460563311e-06,
"loss": 0.1272,
"step": 886
},
{
"epoch": 0.6981503345139709,
"grad_norm": 0.969216495536807,
"learning_rate": 1.1444159490644278e-06,
"loss": 0.1322,
"step": 887
},
{
"epoch": 0.6989374262101535,
"grad_norm": 0.9727140149487835,
"learning_rate": 1.1389486363688935e-06,
"loss": 0.1109,
"step": 888
},
{
"epoch": 0.699724517906336,
"grad_norm": 1.035921852021017,
"learning_rate": 1.1334905595771274e-06,
"loss": 0.125,
"step": 889
},
{
"epoch": 0.7005116096025187,
"grad_norm": 0.9817389469807767,
"learning_rate": 1.1280417557268735e-06,
"loss": 0.1263,
"step": 890
},
{
"epoch": 0.7012987012987013,
"grad_norm": 0.941993125359632,
"learning_rate": 1.12260226179295e-06,
"loss": 0.1204,
"step": 891
},
{
"epoch": 0.7020857929948839,
"grad_norm": 0.9776393790876531,
"learning_rate": 1.1171721146870015e-06,
"loss": 0.1351,
"step": 892
},
{
"epoch": 0.7028728846910665,
"grad_norm": 1.0272253940679958,
"learning_rate": 1.1117513512572436e-06,
"loss": 0.1297,
"step": 893
},
{
"epoch": 0.7036599763872491,
"grad_norm": 0.9825257474446853,
"learning_rate": 1.1063400082882188e-06,
"loss": 0.1089,
"step": 894
},
{
"epoch": 0.7044470680834317,
"grad_norm": 0.9941185616779367,
"learning_rate": 1.10093812250054e-06,
"loss": 0.1182,
"step": 895
},
{
"epoch": 0.7052341597796143,
"grad_norm": 0.9809962090348159,
"learning_rate": 1.095545730550649e-06,
"loss": 0.1221,
"step": 896
},
{
"epoch": 0.706021251475797,
"grad_norm": 0.9359419759382669,
"learning_rate": 1.0901628690305593e-06,
"loss": 0.1175,
"step": 897
},
{
"epoch": 0.7068083431719795,
"grad_norm": 1.0254153011332428,
"learning_rate": 1.0847895744676173e-06,
"loss": 0.1364,
"step": 898
},
{
"epoch": 0.7075954348681621,
"grad_norm": 1.0451822058149052,
"learning_rate": 1.0794258833242452e-06,
"loss": 0.1341,
"step": 899
},
{
"epoch": 0.7083825265643447,
"grad_norm": 1.0267091614696302,
"learning_rate": 1.0740718319976992e-06,
"loss": 0.1284,
"step": 900
},
{
"epoch": 0.7091696182605274,
"grad_norm": 0.8928053655240218,
"learning_rate": 1.0687274568198208e-06,
"loss": 0.1009,
"step": 901
},
{
"epoch": 0.70995670995671,
"grad_norm": 1.057234091878292,
"learning_rate": 1.063392794056792e-06,
"loss": 0.1346,
"step": 902
},
{
"epoch": 0.7107438016528925,
"grad_norm": 0.9612239375437197,
"learning_rate": 1.0580678799088847e-06,
"loss": 0.1158,
"step": 903
},
{
"epoch": 0.7115308933490752,
"grad_norm": 0.9876590104136502,
"learning_rate": 1.0527527505102213e-06,
"loss": 0.1193,
"step": 904
},
{
"epoch": 0.7123179850452578,
"grad_norm": 1.0155629806285287,
"learning_rate": 1.0474474419285255e-06,
"loss": 0.1206,
"step": 905
},
{
"epoch": 0.7131050767414404,
"grad_norm": 1.108914897353474,
"learning_rate": 1.0421519901648759e-06,
"loss": 0.1244,
"step": 906
},
{
"epoch": 0.7138921684376229,
"grad_norm": 0.9624208122062576,
"learning_rate": 1.0368664311534674e-06,
"loss": 0.122,
"step": 907
},
{
"epoch": 0.7146792601338056,
"grad_norm": 0.9051835119610858,
"learning_rate": 1.031590800761361e-06,
"loss": 0.1115,
"step": 908
},
{
"epoch": 0.7154663518299882,
"grad_norm": 1.001641822545354,
"learning_rate": 1.0263251347882467e-06,
"loss": 0.1205,
"step": 909
},
{
"epoch": 0.7162534435261708,
"grad_norm": 0.9587134043689033,
"learning_rate": 1.021069468966194e-06,
"loss": 0.114,
"step": 910
},
{
"epoch": 0.7170405352223534,
"grad_norm": 0.9734138315261187,
"learning_rate": 1.0158238389594164e-06,
"loss": 0.1237,
"step": 911
},
{
"epoch": 0.717827626918536,
"grad_norm": 0.9654730718585164,
"learning_rate": 1.0105882803640215e-06,
"loss": 0.1241,
"step": 912
},
{
"epoch": 0.7186147186147186,
"grad_norm": 1.0069324283880368,
"learning_rate": 1.0053628287077782e-06,
"loss": 0.129,
"step": 913
},
{
"epoch": 0.7194018103109012,
"grad_norm": 0.9724112904663149,
"learning_rate": 1.000147519449867e-06,
"loss": 0.1217,
"step": 914
},
{
"epoch": 0.7201889020070839,
"grad_norm": 0.9009157337976961,
"learning_rate": 9.94942387980648e-07,
"loss": 0.1215,
"step": 915
},
{
"epoch": 0.7209759937032665,
"grad_norm": 0.9903710831464596,
"learning_rate": 9.89747469621411e-07,
"loss": 0.1247,
"step": 916
},
{
"epoch": 0.721763085399449,
"grad_norm": 0.9627081908787005,
"learning_rate": 9.845627996241459e-07,
"loss": 0.1235,
"step": 917
},
{
"epoch": 0.7225501770956316,
"grad_norm": 0.9666668030573422,
"learning_rate": 9.793884131712943e-07,
"loss": 0.123,
"step": 918
},
{
"epoch": 0.7233372687918143,
"grad_norm": 0.9644911943474369,
"learning_rate": 9.742243453755202e-07,
"loss": 0.1142,
"step": 919
},
{
"epoch": 0.7241243604879969,
"grad_norm": 0.9984844200263358,
"learning_rate": 9.690706312794618e-07,
"loss": 0.1251,
"step": 920
},
{
"epoch": 0.7249114521841794,
"grad_norm": 0.988262708447867,
"learning_rate": 9.639273058555004e-07,
"loss": 0.1233,
"step": 921
},
{
"epoch": 0.725698543880362,
"grad_norm": 0.9062607929130434,
"learning_rate": 9.587944040055225e-07,
"loss": 0.1116,
"step": 922
},
{
"epoch": 0.7264856355765447,
"grad_norm": 1.0132516720132552,
"learning_rate": 9.536719605606795e-07,
"loss": 0.1314,
"step": 923
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.9210291352044477,
"learning_rate": 9.485600102811556e-07,
"loss": 0.108,
"step": 924
},
{
"epoch": 0.7280598189689099,
"grad_norm": 1.0099118734494892,
"learning_rate": 9.434585878559277e-07,
"loss": 0.1172,
"step": 925
},
{
"epoch": 0.7288469106650924,
"grad_norm": 1.0237482529235973,
"learning_rate": 9.383677279025347e-07,
"loss": 0.1186,
"step": 926
},
{
"epoch": 0.7296340023612751,
"grad_norm": 0.9855331385764105,
"learning_rate": 9.332874649668369e-07,
"loss": 0.1185,
"step": 927
},
{
"epoch": 0.7304210940574577,
"grad_norm": 0.9369233888911801,
"learning_rate": 9.282178335227885e-07,
"loss": 0.1067,
"step": 928
},
{
"epoch": 0.7312081857536403,
"grad_norm": 1.025834900254658,
"learning_rate": 9.231588679721956e-07,
"loss": 0.1256,
"step": 929
},
{
"epoch": 0.731995277449823,
"grad_norm": 1.0004815551544541,
"learning_rate": 9.181106026444913e-07,
"loss": 0.1171,
"step": 930
},
{
"epoch": 0.7327823691460055,
"grad_norm": 0.9247417584553485,
"learning_rate": 9.130730717964948e-07,
"loss": 0.1132,
"step": 931
},
{
"epoch": 0.7335694608421881,
"grad_norm": 0.9769073592720867,
"learning_rate": 9.08046309612185e-07,
"loss": 0.1242,
"step": 932
},
{
"epoch": 0.7343565525383707,
"grad_norm": 0.96681906386633,
"learning_rate": 9.030303502024662e-07,
"loss": 0.1179,
"step": 933
},
{
"epoch": 0.7351436442345534,
"grad_norm": 1.021595769957744,
"learning_rate": 8.980252276049345e-07,
"loss": 0.1161,
"step": 934
},
{
"epoch": 0.7359307359307359,
"grad_norm": 0.9231065432942811,
"learning_rate": 8.930309757836517e-07,
"loss": 0.1149,
"step": 935
},
{
"epoch": 0.7367178276269185,
"grad_norm": 1.026367432921577,
"learning_rate": 8.880476286289091e-07,
"loss": 0.1284,
"step": 936
},
{
"epoch": 0.7375049193231011,
"grad_norm": 0.981215601065822,
"learning_rate": 8.830752199570033e-07,
"loss": 0.1133,
"step": 937
},
{
"epoch": 0.7382920110192838,
"grad_norm": 0.9212608185738064,
"learning_rate": 8.781137835100021e-07,
"loss": 0.1077,
"step": 938
},
{
"epoch": 0.7390791027154664,
"grad_norm": 0.9833427367903659,
"learning_rate": 8.731633529555167e-07,
"loss": 0.1164,
"step": 939
},
{
"epoch": 0.7398661944116489,
"grad_norm": 0.9854894539977124,
"learning_rate": 8.682239618864763e-07,
"loss": 0.1155,
"step": 940
},
{
"epoch": 0.7406532861078315,
"grad_norm": 0.9551803394241506,
"learning_rate": 8.632956438208962e-07,
"loss": 0.1162,
"step": 941
},
{
"epoch": 0.7414403778040142,
"grad_norm": 0.9042419017178762,
"learning_rate": 8.583784322016503e-07,
"loss": 0.109,
"step": 942
},
{
"epoch": 0.7422274695001968,
"grad_norm": 0.9609816099291726,
"learning_rate": 8.534723603962497e-07,
"loss": 0.1191,
"step": 943
},
{
"epoch": 0.7430145611963794,
"grad_norm": 1.0149972325544658,
"learning_rate": 8.48577461696608e-07,
"loss": 0.1192,
"step": 944
},
{
"epoch": 0.743801652892562,
"grad_norm": 0.9584184891745349,
"learning_rate": 8.436937693188232e-07,
"loss": 0.1267,
"step": 945
},
{
"epoch": 0.7445887445887446,
"grad_norm": 0.9986011121611049,
"learning_rate": 8.38821316402946e-07,
"loss": 0.1177,
"step": 946
},
{
"epoch": 0.7453758362849272,
"grad_norm": 0.962256278467975,
"learning_rate": 8.339601360127592e-07,
"loss": 0.1131,
"step": 947
},
{
"epoch": 0.7461629279811098,
"grad_norm": 0.9419406227649391,
"learning_rate": 8.291102611355526e-07,
"loss": 0.1123,
"step": 948
},
{
"epoch": 0.7469500196772924,
"grad_norm": 0.9728190857016107,
"learning_rate": 8.242717246818957e-07,
"loss": 0.1197,
"step": 949
},
{
"epoch": 0.747737111373475,
"grad_norm": 1.0169044023539633,
"learning_rate": 8.1944455948542e-07,
"loss": 0.1219,
"step": 950
},
{
"epoch": 0.7485242030696576,
"grad_norm": 0.9972018368498321,
"learning_rate": 8.146287983025902e-07,
"loss": 0.1241,
"step": 951
},
{
"epoch": 0.7493112947658402,
"grad_norm": 1.040910663691627,
"learning_rate": 8.098244738124888e-07,
"loss": 0.1138,
"step": 952
},
{
"epoch": 0.7500983864620229,
"grad_norm": 1.0438538265069202,
"learning_rate": 8.050316186165862e-07,
"loss": 0.134,
"step": 953
},
{
"epoch": 0.7508854781582054,
"grad_norm": 0.9793759854817412,
"learning_rate": 8.002502652385278e-07,
"loss": 0.1241,
"step": 954
},
{
"epoch": 0.751672569854388,
"grad_norm": 0.9636283038275181,
"learning_rate": 7.954804461239054e-07,
"loss": 0.1171,
"step": 955
},
{
"epoch": 0.7524596615505706,
"grad_norm": 0.9416057200961391,
"learning_rate": 7.907221936400452e-07,
"loss": 0.1194,
"step": 956
},
{
"epoch": 0.7532467532467533,
"grad_norm": 0.9258555685816136,
"learning_rate": 7.859755400757793e-07,
"loss": 0.1199,
"step": 957
},
{
"epoch": 0.7540338449429359,
"grad_norm": 0.9697084160189383,
"learning_rate": 7.812405176412354e-07,
"loss": 0.1206,
"step": 958
},
{
"epoch": 0.7548209366391184,
"grad_norm": 1.008937777573116,
"learning_rate": 7.76517158467611e-07,
"loss": 0.1238,
"step": 959
},
{
"epoch": 0.755608028335301,
"grad_norm": 0.987888023607684,
"learning_rate": 7.718054946069589e-07,
"loss": 0.1246,
"step": 960
},
{
"epoch": 0.7563951200314837,
"grad_norm": 0.9699505992391279,
"learning_rate": 7.671055580319706e-07,
"loss": 0.1203,
"step": 961
},
{
"epoch": 0.7571822117276663,
"grad_norm": 0.9382257403962697,
"learning_rate": 7.62417380635756e-07,
"loss": 0.1151,
"step": 962
},
{
"epoch": 0.7579693034238488,
"grad_norm": 0.9437354430265479,
"learning_rate": 7.577409942316305e-07,
"loss": 0.1163,
"step": 963
},
{
"epoch": 0.7587563951200315,
"grad_norm": 0.9503047394426882,
"learning_rate": 7.530764305528959e-07,
"loss": 0.1211,
"step": 964
},
{
"epoch": 0.7595434868162141,
"grad_norm": 1.0356071156065598,
"learning_rate": 7.484237212526288e-07,
"loss": 0.1273,
"step": 965
},
{
"epoch": 0.7603305785123967,
"grad_norm": 0.9856511020736725,
"learning_rate": 7.437828979034606e-07,
"loss": 0.1315,
"step": 966
},
{
"epoch": 0.7611176702085793,
"grad_norm": 0.9629717208752256,
"learning_rate": 7.391539919973698e-07,
"loss": 0.1062,
"step": 967
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.9607163301231785,
"learning_rate": 7.345370349454611e-07,
"loss": 0.1189,
"step": 968
},
{
"epoch": 0.7626918536009445,
"grad_norm": 0.9578086152431808,
"learning_rate": 7.2993205807776e-07,
"loss": 0.1183,
"step": 969
},
{
"epoch": 0.7634789452971271,
"grad_norm": 0.9162347277154375,
"learning_rate": 7.253390926429918e-07,
"loss": 0.1104,
"step": 970
},
{
"epoch": 0.7642660369933097,
"grad_norm": 0.9822027407988481,
"learning_rate": 7.207581698083782e-07,
"loss": 0.1304,
"step": 971
},
{
"epoch": 0.7650531286894924,
"grad_norm": 0.940452384125095,
"learning_rate": 7.161893206594175e-07,
"loss": 0.1168,
"step": 972
},
{
"epoch": 0.7658402203856749,
"grad_norm": 0.9588362662800347,
"learning_rate": 7.116325761996818e-07,
"loss": 0.1206,
"step": 973
},
{
"epoch": 0.7666273120818575,
"grad_norm": 1.011535036970359,
"learning_rate": 7.070879673505976e-07,
"loss": 0.1141,
"step": 974
},
{
"epoch": 0.7674144037780402,
"grad_norm": 0.9688050929102817,
"learning_rate": 7.025555249512461e-07,
"loss": 0.1134,
"step": 975
},
{
"epoch": 0.7682014954742228,
"grad_norm": 0.9177610587932681,
"learning_rate": 6.980352797581438e-07,
"loss": 0.1089,
"step": 976
},
{
"epoch": 0.7689885871704053,
"grad_norm": 1.0472410615763514,
"learning_rate": 6.935272624450432e-07,
"loss": 0.1249,
"step": 977
},
{
"epoch": 0.7697756788665879,
"grad_norm": 0.9636506719030409,
"learning_rate": 6.890315036027156e-07,
"loss": 0.1166,
"step": 978
},
{
"epoch": 0.7705627705627706,
"grad_norm": 0.9412495575321557,
"learning_rate": 6.845480337387525e-07,
"loss": 0.1195,
"step": 979
},
{
"epoch": 0.7713498622589532,
"grad_norm": 0.9930839323289444,
"learning_rate": 6.800768832773505e-07,
"loss": 0.1267,
"step": 980
},
{
"epoch": 0.7721369539551358,
"grad_norm": 0.9929049734904327,
"learning_rate": 6.756180825591099e-07,
"loss": 0.1199,
"step": 981
},
{
"epoch": 0.7729240456513183,
"grad_norm": 0.9842816070024283,
"learning_rate": 6.711716618408282e-07,
"loss": 0.1179,
"step": 982
},
{
"epoch": 0.773711137347501,
"grad_norm": 0.9944580491304532,
"learning_rate": 6.66737651295292e-07,
"loss": 0.1198,
"step": 983
},
{
"epoch": 0.7744982290436836,
"grad_norm": 0.9839708888147434,
"learning_rate": 6.623160810110765e-07,
"loss": 0.1193,
"step": 984
},
{
"epoch": 0.7752853207398662,
"grad_norm": 0.9098159637038072,
"learning_rate": 6.579069809923367e-07,
"loss": 0.1123,
"step": 985
},
{
"epoch": 0.7760724124360489,
"grad_norm": 0.9606605502093222,
"learning_rate": 6.535103811586085e-07,
"loss": 0.1174,
"step": 986
},
{
"epoch": 0.7768595041322314,
"grad_norm": 1.0270282467875798,
"learning_rate": 6.491263113446005e-07,
"loss": 0.1287,
"step": 987
},
{
"epoch": 0.777646595828414,
"grad_norm": 0.9366844684145114,
"learning_rate": 6.44754801299998e-07,
"loss": 0.1158,
"step": 988
},
{
"epoch": 0.7784336875245966,
"grad_norm": 0.9037352215899601,
"learning_rate": 6.403958806892535e-07,
"loss": 0.1053,
"step": 989
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.9440225724017625,
"learning_rate": 6.360495790913926e-07,
"loss": 0.114,
"step": 990
},
{
"epoch": 0.7800078709169618,
"grad_norm": 0.9351662026461205,
"learning_rate": 6.317159259998074e-07,
"loss": 0.113,
"step": 991
},
{
"epoch": 0.7807949626131444,
"grad_norm": 0.9872272779542443,
"learning_rate": 6.273949508220612e-07,
"loss": 0.1217,
"step": 992
},
{
"epoch": 0.781582054309327,
"grad_norm": 1.0021043961378415,
"learning_rate": 6.23086682879686e-07,
"loss": 0.1194,
"step": 993
},
{
"epoch": 0.7823691460055097,
"grad_norm": 0.9797645648660196,
"learning_rate": 6.187911514079834e-07,
"loss": 0.1294,
"step": 994
},
{
"epoch": 0.7831562377016923,
"grad_norm": 1.0054784443943467,
"learning_rate": 6.14508385555829e-07,
"loss": 0.1236,
"step": 995
},
{
"epoch": 0.7839433293978748,
"grad_norm": 0.9433076242026539,
"learning_rate": 6.102384143854698e-07,
"loss": 0.1147,
"step": 996
},
{
"epoch": 0.7847304210940574,
"grad_norm": 0.9383907844400864,
"learning_rate": 6.059812668723336e-07,
"loss": 0.115,
"step": 997
},
{
"epoch": 0.7855175127902401,
"grad_norm": 0.9452315722932242,
"learning_rate": 6.017369719048255e-07,
"loss": 0.1154,
"step": 998
},
{
"epoch": 0.7863046044864227,
"grad_norm": 0.9247930090252802,
"learning_rate": 5.975055582841358e-07,
"loss": 0.1127,
"step": 999
},
{
"epoch": 0.7870916961826053,
"grad_norm": 1.0061872579787852,
"learning_rate": 5.932870547240455e-07,
"loss": 0.1183,
"step": 1000
},
{
"epoch": 0.7870916961826053,
"eval_loss": 0.11849173903465271,
"eval_runtime": 18.0453,
"eval_samples_per_second": 45.552,
"eval_steps_per_second": 5.708,
"step": 1000
},
{
"epoch": 0.7878787878787878,
"grad_norm": 0.9623831636196449,
"learning_rate": 5.890814898507277e-07,
"loss": 0.1201,
"step": 1001
},
{
"epoch": 0.7886658795749705,
"grad_norm": 1.0229456536544794,
"learning_rate": 5.848888922025553e-07,
"loss": 0.1223,
"step": 1002
},
{
"epoch": 0.7894529712711531,
"grad_norm": 1.0277580850565635,
"learning_rate": 5.8070929022991e-07,
"loss": 0.1178,
"step": 1003
},
{
"epoch": 0.7902400629673357,
"grad_norm": 1.0160977169162413,
"learning_rate": 5.76542712294983e-07,
"loss": 0.1216,
"step": 1004
},
{
"epoch": 0.7910271546635183,
"grad_norm": 0.9181674404236817,
"learning_rate": 5.723891866715899e-07,
"loss": 0.1118,
"step": 1005
},
{
"epoch": 0.7918142463597009,
"grad_norm": 0.9799718055295829,
"learning_rate": 5.682487415449719e-07,
"loss": 0.1217,
"step": 1006
},
{
"epoch": 0.7926013380558835,
"grad_norm": 0.980465660075739,
"learning_rate": 5.641214050116098e-07,
"loss": 0.1252,
"step": 1007
},
{
"epoch": 0.7933884297520661,
"grad_norm": 0.9234542096536653,
"learning_rate": 5.600072050790317e-07,
"loss": 0.1096,
"step": 1008
},
{
"epoch": 0.7941755214482488,
"grad_norm": 0.8947896913580902,
"learning_rate": 5.559061696656199e-07,
"loss": 0.1075,
"step": 1009
},
{
"epoch": 0.7949626131444313,
"grad_norm": 0.9790572754851533,
"learning_rate": 5.518183266004276e-07,
"loss": 0.1171,
"step": 1010
},
{
"epoch": 0.7957497048406139,
"grad_norm": 0.9667752845159187,
"learning_rate": 5.477437036229832e-07,
"loss": 0.1098,
"step": 1011
},
{
"epoch": 0.7965367965367965,
"grad_norm": 1.1238301396219903,
"learning_rate": 5.436823283831083e-07,
"loss": 0.1373,
"step": 1012
},
{
"epoch": 0.7973238882329792,
"grad_norm": 0.9927017197297952,
"learning_rate": 5.396342284407252e-07,
"loss": 0.1188,
"step": 1013
},
{
"epoch": 0.7981109799291618,
"grad_norm": 0.9934845633471078,
"learning_rate": 5.355994312656734e-07,
"loss": 0.1142,
"step": 1014
},
{
"epoch": 0.7988980716253443,
"grad_norm": 0.9432843712008361,
"learning_rate": 5.315779642375199e-07,
"loss": 0.1158,
"step": 1015
},
{
"epoch": 0.799685163321527,
"grad_norm": 1.0251992909650254,
"learning_rate": 5.275698546453775e-07,
"loss": 0.1175,
"step": 1016
},
{
"epoch": 0.8004722550177096,
"grad_norm": 1.010003247709555,
"learning_rate": 5.235751296877148e-07,
"loss": 0.1223,
"step": 1017
},
{
"epoch": 0.8012593467138922,
"grad_norm": 1.0290265946769084,
"learning_rate": 5.195938164721767e-07,
"loss": 0.1213,
"step": 1018
},
{
"epoch": 0.8020464384100747,
"grad_norm": 1.0065180201235937,
"learning_rate": 5.156259420153962e-07,
"loss": 0.1238,
"step": 1019
},
{
"epoch": 0.8028335301062574,
"grad_norm": 0.9673994358176322,
"learning_rate": 5.116715332428118e-07,
"loss": 0.1106,
"step": 1020
},
{
"epoch": 0.80362062180244,
"grad_norm": 0.9855226309577549,
"learning_rate": 5.077306169884888e-07,
"loss": 0.1107,
"step": 1021
},
{
"epoch": 0.8044077134986226,
"grad_norm": 0.9719491378095487,
"learning_rate": 5.038032199949313e-07,
"loss": 0.1241,
"step": 1022
},
{
"epoch": 0.8051948051948052,
"grad_norm": 0.9767976318784359,
"learning_rate": 4.998893689129061e-07,
"loss": 0.1165,
"step": 1023
},
{
"epoch": 0.8059818968909878,
"grad_norm": 0.9563874747183178,
"learning_rate": 4.959890903012568e-07,
"loss": 0.1084,
"step": 1024
},
{
"epoch": 0.8067689885871704,
"grad_norm": 0.9724824504692731,
"learning_rate": 4.921024106267283e-07,
"loss": 0.1199,
"step": 1025
},
{
"epoch": 0.807556080283353,
"grad_norm": 0.9939320935755256,
"learning_rate": 4.882293562637827e-07,
"loss": 0.1221,
"step": 1026
},
{
"epoch": 0.8083431719795356,
"grad_norm": 0.9407936084194475,
"learning_rate": 4.843699534944258e-07,
"loss": 0.114,
"step": 1027
},
{
"epoch": 0.8091302636757183,
"grad_norm": 1.00154528232707,
"learning_rate": 4.805242285080222e-07,
"loss": 0.116,
"step": 1028
},
{
"epoch": 0.8099173553719008,
"grad_norm": 0.9808095624927836,
"learning_rate": 4.7669220740112376e-07,
"loss": 0.1166,
"step": 1029
},
{
"epoch": 0.8107044470680834,
"grad_norm": 0.9094800308143445,
"learning_rate": 4.728739161772874e-07,
"loss": 0.1091,
"step": 1030
},
{
"epoch": 0.811491538764266,
"grad_norm": 0.8938814168806941,
"learning_rate": 4.690693807469035e-07,
"loss": 0.1032,
"step": 1031
},
{
"epoch": 0.8122786304604487,
"grad_norm": 0.9946966466940443,
"learning_rate": 4.6527862692701487e-07,
"loss": 0.1158,
"step": 1032
},
{
"epoch": 0.8130657221566312,
"grad_norm": 0.9504344369313477,
"learning_rate": 4.615016804411465e-07,
"loss": 0.1114,
"step": 1033
},
{
"epoch": 0.8138528138528138,
"grad_norm": 0.9453878426287087,
"learning_rate": 4.5773856691912726e-07,
"loss": 0.1208,
"step": 1034
},
{
"epoch": 0.8146399055489965,
"grad_norm": 1.0239509589587361,
"learning_rate": 4.53989311896918e-07,
"loss": 0.1186,
"step": 1035
},
{
"epoch": 0.8154269972451791,
"grad_norm": 0.915087834897205,
"learning_rate": 4.502539408164386e-07,
"loss": 0.106,
"step": 1036
},
{
"epoch": 0.8162140889413617,
"grad_norm": 0.936045456280838,
"learning_rate": 4.465324790253922e-07,
"loss": 0.1104,
"step": 1037
},
{
"epoch": 0.8170011806375442,
"grad_norm": 0.986165452237028,
"learning_rate": 4.428249517770986e-07,
"loss": 0.1137,
"step": 1038
},
{
"epoch": 0.8177882723337269,
"grad_norm": 0.9641211589952483,
"learning_rate": 4.391313842303166e-07,
"loss": 0.1164,
"step": 1039
},
{
"epoch": 0.8185753640299095,
"grad_norm": 0.9334697738707801,
"learning_rate": 4.3545180144907857e-07,
"loss": 0.113,
"step": 1040
},
{
"epoch": 0.8193624557260921,
"grad_norm": 0.9871068120631671,
"learning_rate": 4.3178622840251647e-07,
"loss": 0.1241,
"step": 1041
},
{
"epoch": 0.8201495474222747,
"grad_norm": 0.9529229208831895,
"learning_rate": 4.2813468996469654e-07,
"loss": 0.121,
"step": 1042
},
{
"epoch": 0.8209366391184573,
"grad_norm": 0.9336358260352323,
"learning_rate": 4.2449721091444545e-07,
"loss": 0.1134,
"step": 1043
},
{
"epoch": 0.8217237308146399,
"grad_norm": 1.0307131336178375,
"learning_rate": 4.2087381593518716e-07,
"loss": 0.1274,
"step": 1044
},
{
"epoch": 0.8225108225108225,
"grad_norm": 0.9433490812621332,
"learning_rate": 4.1726452961477147e-07,
"loss": 0.116,
"step": 1045
},
{
"epoch": 0.8232979142070052,
"grad_norm": 0.9751019494649681,
"learning_rate": 4.136693764453101e-07,
"loss": 0.1129,
"step": 1046
},
{
"epoch": 0.8240850059031877,
"grad_norm": 1.056328093801445,
"learning_rate": 4.1008838082300743e-07,
"loss": 0.1168,
"step": 1047
},
{
"epoch": 0.8248720975993703,
"grad_norm": 1.0079092402082175,
"learning_rate": 4.065215670479991e-07,
"loss": 0.1258,
"step": 1048
},
{
"epoch": 0.8256591892955529,
"grad_norm": 0.9852819205932097,
"learning_rate": 4.02968959324182e-07,
"loss": 0.1161,
"step": 1049
},
{
"epoch": 0.8264462809917356,
"grad_norm": 0.9840178810234324,
"learning_rate": 3.9943058175905493e-07,
"loss": 0.1184,
"step": 1050
},
{
"epoch": 0.8272333726879182,
"grad_norm": 1.0203308364665442,
"learning_rate": 3.9590645836355275e-07,
"loss": 0.1232,
"step": 1051
},
{
"epoch": 0.8280204643841007,
"grad_norm": 0.967005062015959,
"learning_rate": 3.923966130518814e-07,
"loss": 0.1209,
"step": 1052
},
{
"epoch": 0.8288075560802833,
"grad_norm": 0.9434419280443521,
"learning_rate": 3.889010696413606e-07,
"loss": 0.1211,
"step": 1053
},
{
"epoch": 0.829594647776466,
"grad_norm": 0.9442684717641329,
"learning_rate": 3.8541985185225645e-07,
"loss": 0.1078,
"step": 1054
},
{
"epoch": 0.8303817394726486,
"grad_norm": 0.9927726007062886,
"learning_rate": 3.819529833076263e-07,
"loss": 0.1214,
"step": 1055
},
{
"epoch": 0.8311688311688312,
"grad_norm": 0.9486732370194126,
"learning_rate": 3.7850048753315274e-07,
"loss": 0.1087,
"step": 1056
},
{
"epoch": 0.8319559228650137,
"grad_norm": 0.9803070634979109,
"learning_rate": 3.750623879569895e-07,
"loss": 0.1181,
"step": 1057
},
{
"epoch": 0.8327430145611964,
"grad_norm": 1.0010555667628969,
"learning_rate": 3.716387079095973e-07,
"loss": 0.1172,
"step": 1058
},
{
"epoch": 0.833530106257379,
"grad_norm": 0.9548728126471466,
"learning_rate": 3.6822947062359004e-07,
"loss": 0.1125,
"step": 1059
},
{
"epoch": 0.8343171979535616,
"grad_norm": 1.0333251380116057,
"learning_rate": 3.6483469923357327e-07,
"loss": 0.1119,
"step": 1060
},
{
"epoch": 0.8351042896497441,
"grad_norm": 0.9473784570136893,
"learning_rate": 3.614544167759901e-07,
"loss": 0.1136,
"step": 1061
},
{
"epoch": 0.8358913813459268,
"grad_norm": 0.9283902827668026,
"learning_rate": 3.5808864618896295e-07,
"loss": 0.1004,
"step": 1062
},
{
"epoch": 0.8366784730421094,
"grad_norm": 0.9856546672764643,
"learning_rate": 3.5473741031213983e-07,
"loss": 0.1136,
"step": 1063
},
{
"epoch": 0.837465564738292,
"grad_norm": 0.955024488651013,
"learning_rate": 3.51400731886537e-07,
"loss": 0.1199,
"step": 1064
},
{
"epoch": 0.8382526564344747,
"grad_norm": 0.9527578800496054,
"learning_rate": 3.4807863355438703e-07,
"loss": 0.1178,
"step": 1065
},
{
"epoch": 0.8390397481306572,
"grad_norm": 0.9748866277343534,
"learning_rate": 3.447711378589841e-07,
"loss": 0.1126,
"step": 1066
},
{
"epoch": 0.8398268398268398,
"grad_norm": 0.9722315190803439,
"learning_rate": 3.414782672445291e-07,
"loss": 0.1143,
"step": 1067
},
{
"epoch": 0.8406139315230224,
"grad_norm": 1.0223007557494088,
"learning_rate": 3.3820004405598157e-07,
"loss": 0.1141,
"step": 1068
},
{
"epoch": 0.8414010232192051,
"grad_norm": 0.969999076611352,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.1161,
"step": 1069
},
{
"epoch": 0.8421881149153877,
"grad_norm": 0.9878840165050939,
"learning_rate": 3.3168762883931256e-07,
"loss": 0.1164,
"step": 1070
},
{
"epoch": 0.8429752066115702,
"grad_norm": 1.030579753477139,
"learning_rate": 3.284534810035278e-07,
"loss": 0.1258,
"step": 1071
},
{
"epoch": 0.8437622983077528,
"grad_norm": 0.987395029549749,
"learning_rate": 3.252340689780245e-07,
"loss": 0.1219,
"step": 1072
},
{
"epoch": 0.8445493900039355,
"grad_norm": 0.9750378310364627,
"learning_rate": 3.2202941460927977e-07,
"loss": 0.1275,
"step": 1073
},
{
"epoch": 0.8453364817001181,
"grad_norm": 0.9446071297273908,
"learning_rate": 3.1883953964363057e-07,
"loss": 0.1177,
"step": 1074
},
{
"epoch": 0.8461235733963006,
"grad_norm": 0.9469617356782836,
"learning_rate": 3.156644657271196e-07,
"loss": 0.1128,
"step": 1075
},
{
"epoch": 0.8469106650924833,
"grad_norm": 0.951509014508041,
"learning_rate": 3.12504214405355e-07,
"loss": 0.108,
"step": 1076
},
{
"epoch": 0.8476977567886659,
"grad_norm": 0.9472516964934904,
"learning_rate": 3.093588071233578e-07,
"loss": 0.1141,
"step": 1077
},
{
"epoch": 0.8484848484848485,
"grad_norm": 0.9084748554063148,
"learning_rate": 3.06228265225422e-07,
"loss": 0.1116,
"step": 1078
},
{
"epoch": 0.8492719401810311,
"grad_norm": 0.9337280963011981,
"learning_rate": 3.031126099549653e-07,
"loss": 0.1119,
"step": 1079
},
{
"epoch": 0.8500590318772137,
"grad_norm": 0.9417103223229273,
"learning_rate": 3.000118624543888e-07,
"loss": 0.1117,
"step": 1080
},
{
"epoch": 0.8508461235733963,
"grad_norm": 0.9259525236801444,
"learning_rate": 2.9692604376492935e-07,
"loss": 0.1067,
"step": 1081
},
{
"epoch": 0.8516332152695789,
"grad_norm": 0.98856338356212,
"learning_rate": 2.9385517482651974e-07,
"loss": 0.1218,
"step": 1082
},
{
"epoch": 0.8524203069657615,
"grad_norm": 0.9208981942587281,
"learning_rate": 2.907992764776471e-07,
"loss": 0.1078,
"step": 1083
},
{
"epoch": 0.8532073986619442,
"grad_norm": 1.0070559943311361,
"learning_rate": 2.877583694552083e-07,
"loss": 0.1236,
"step": 1084
},
{
"epoch": 0.8539944903581267,
"grad_norm": 1.0056266955151931,
"learning_rate": 2.847324743943733e-07,
"loss": 0.1168,
"step": 1085
},
{
"epoch": 0.8547815820543093,
"grad_norm": 0.9843297655404193,
"learning_rate": 2.8172161182844076e-07,
"loss": 0.1179,
"step": 1086
},
{
"epoch": 0.855568673750492,
"grad_norm": 0.9794092387444499,
"learning_rate": 2.7872580218870293e-07,
"loss": 0.1143,
"step": 1087
},
{
"epoch": 0.8563557654466746,
"grad_norm": 0.9193678895589191,
"learning_rate": 2.757450658043029e-07,
"loss": 0.1033,
"step": 1088
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.943695053791707,
"learning_rate": 2.7277942290210105e-07,
"loss": 0.1197,
"step": 1089
},
{
"epoch": 0.8579299488390397,
"grad_norm": 0.9996493471626344,
"learning_rate": 2.698288936065338e-07,
"loss": 0.1121,
"step": 1090
},
{
"epoch": 0.8587170405352224,
"grad_norm": 0.9457549280655658,
"learning_rate": 2.6689349793947993e-07,
"loss": 0.1135,
"step": 1091
},
{
"epoch": 0.859504132231405,
"grad_norm": 0.9370356637509019,
"learning_rate": 2.639732558201219e-07,
"loss": 0.1142,
"step": 1092
},
{
"epoch": 0.8602912239275876,
"grad_norm": 0.9355392394517238,
"learning_rate": 2.610681870648149e-07,
"loss": 0.1101,
"step": 1093
},
{
"epoch": 0.8610783156237701,
"grad_norm": 0.9395826896807081,
"learning_rate": 2.5817831138694685e-07,
"loss": 0.1143,
"step": 1094
},
{
"epoch": 0.8618654073199528,
"grad_norm": 0.9169613408255519,
"learning_rate": 2.553036483968094e-07,
"loss": 0.1125,
"step": 1095
},
{
"epoch": 0.8626524990161354,
"grad_norm": 0.9214498686393413,
"learning_rate": 2.5244421760146354e-07,
"loss": 0.1061,
"step": 1096
},
{
"epoch": 0.863439590712318,
"grad_norm": 0.9759019789855492,
"learning_rate": 2.496000384046046e-07,
"loss": 0.1134,
"step": 1097
},
{
"epoch": 0.8642266824085005,
"grad_norm": 1.002942267675624,
"learning_rate": 2.467711301064349e-07,
"loss": 0.1249,
"step": 1098
},
{
"epoch": 0.8650137741046832,
"grad_norm": 0.9774180094397517,
"learning_rate": 2.4395751190352924e-07,
"loss": 0.1192,
"step": 1099
},
{
"epoch": 0.8658008658008658,
"grad_norm": 0.9977028529540362,
"learning_rate": 2.411592028887058e-07,
"loss": 0.1189,
"step": 1100
},
{
"epoch": 0.8665879574970484,
"grad_norm": 0.9904872465257003,
"learning_rate": 2.383762220508984e-07,
"loss": 0.1183,
"step": 1101
},
{
"epoch": 0.867375049193231,
"grad_norm": 0.9148412123541501,
"learning_rate": 2.356085882750242e-07,
"loss": 0.1078,
"step": 1102
},
{
"epoch": 0.8681621408894136,
"grad_norm": 1.0117176122067204,
"learning_rate": 2.328563203418574e-07,
"loss": 0.1217,
"step": 1103
},
{
"epoch": 0.8689492325855962,
"grad_norm": 0.9872081797420905,
"learning_rate": 2.3011943692790389e-07,
"loss": 0.117,
"step": 1104
},
{
"epoch": 0.8697363242817788,
"grad_norm": 0.9476791655485511,
"learning_rate": 2.2739795660526948e-07,
"loss": 0.1157,
"step": 1105
},
{
"epoch": 0.8705234159779615,
"grad_norm": 0.9183530530163464,
"learning_rate": 2.246918978415394e-07,
"loss": 0.1108,
"step": 1106
},
{
"epoch": 0.8713105076741441,
"grad_norm": 0.9622583408924335,
"learning_rate": 2.2200127899964786e-07,
"loss": 0.1188,
"step": 1107
},
{
"epoch": 0.8720975993703266,
"grad_norm": 0.9915067004751748,
"learning_rate": 2.1932611833775846e-07,
"loss": 0.1151,
"step": 1108
},
{
"epoch": 0.8728846910665092,
"grad_norm": 0.9404810815181894,
"learning_rate": 2.1666643400913512e-07,
"loss": 0.1133,
"step": 1109
},
{
"epoch": 0.8736717827626919,
"grad_norm": 0.9750904254975121,
"learning_rate": 2.1402224406202377e-07,
"loss": 0.1187,
"step": 1110
},
{
"epoch": 0.8744588744588745,
"grad_norm": 0.942666742797311,
"learning_rate": 2.1139356643952667e-07,
"loss": 0.1133,
"step": 1111
},
{
"epoch": 0.875245966155057,
"grad_norm": 0.9261400322366565,
"learning_rate": 2.0878041897948121e-07,
"loss": 0.1095,
"step": 1112
},
{
"epoch": 0.8760330578512396,
"grad_norm": 1.0714254257987408,
"learning_rate": 2.0618281941434058e-07,
"loss": 0.1197,
"step": 1113
},
{
"epoch": 0.8768201495474223,
"grad_norm": 0.9322738584358286,
"learning_rate": 2.036007853710503e-07,
"loss": 0.114,
"step": 1114
},
{
"epoch": 0.8776072412436049,
"grad_norm": 0.9346649367642453,
"learning_rate": 2.0103433437093256e-07,
"loss": 0.1027,
"step": 1115
},
{
"epoch": 0.8783943329397875,
"grad_norm": 0.9499461297298013,
"learning_rate": 1.9848348382956294e-07,
"loss": 0.1228,
"step": 1116
},
{
"epoch": 0.87918142463597,
"grad_norm": 0.9811562591520676,
"learning_rate": 1.9594825105665654e-07,
"loss": 0.1168,
"step": 1117
},
{
"epoch": 0.8799685163321527,
"grad_norm": 0.901833893893408,
"learning_rate": 1.934286532559468e-07,
"loss": 0.0992,
"step": 1118
},
{
"epoch": 0.8807556080283353,
"grad_norm": 0.9566664879297264,
"learning_rate": 1.9092470752507225e-07,
"loss": 0.1114,
"step": 1119
},
{
"epoch": 0.8815426997245179,
"grad_norm": 0.8992036132523128,
"learning_rate": 1.8843643085545677e-07,
"loss": 0.1113,
"step": 1120
},
{
"epoch": 0.8823297914207006,
"grad_norm": 1.031563578089281,
"learning_rate": 1.8596384013219726e-07,
"loss": 0.1168,
"step": 1121
},
{
"epoch": 0.8831168831168831,
"grad_norm": 0.942026348911581,
"learning_rate": 1.8350695213394777e-07,
"loss": 0.1206,
"step": 1122
},
{
"epoch": 0.8839039748130657,
"grad_norm": 0.9439190891698341,
"learning_rate": 1.8106578353280585e-07,
"loss": 0.1138,
"step": 1123
},
{
"epoch": 0.8846910665092483,
"grad_norm": 0.9261746330624396,
"learning_rate": 1.7864035089419973e-07,
"loss": 0.1118,
"step": 1124
},
{
"epoch": 0.885478158205431,
"grad_norm": 0.9958265328548447,
"learning_rate": 1.7623067067677467e-07,
"loss": 0.1235,
"step": 1125
},
{
"epoch": 0.8862652499016135,
"grad_norm": 1.0264812939307284,
"learning_rate": 1.7383675923228372e-07,
"loss": 0.1221,
"step": 1126
},
{
"epoch": 0.8870523415977961,
"grad_norm": 1.0457105898882355,
"learning_rate": 1.7145863280547348e-07,
"loss": 0.1183,
"step": 1127
},
{
"epoch": 0.8878394332939787,
"grad_norm": 0.9616200935637597,
"learning_rate": 1.6909630753397716e-07,
"loss": 0.1055,
"step": 1128
},
{
"epoch": 0.8886265249901614,
"grad_norm": 1.0114965743550393,
"learning_rate": 1.6674979944820258e-07,
"loss": 0.1247,
"step": 1129
},
{
"epoch": 0.889413616686344,
"grad_norm": 1.0002918205099012,
"learning_rate": 1.644191244712251e-07,
"loss": 0.1245,
"step": 1130
},
{
"epoch": 0.8902007083825265,
"grad_norm": 1.0106011414793612,
"learning_rate": 1.621042984186777e-07,
"loss": 0.1222,
"step": 1131
},
{
"epoch": 0.8909878000787091,
"grad_norm": 0.9663301334412446,
"learning_rate": 1.598053369986463e-07,
"loss": 0.1194,
"step": 1132
},
{
"epoch": 0.8917748917748918,
"grad_norm": 0.9465191160189073,
"learning_rate": 1.5752225581155995e-07,
"loss": 0.1085,
"step": 1133
},
{
"epoch": 0.8925619834710744,
"grad_norm": 1.0466368501224192,
"learning_rate": 1.5525507035008852e-07,
"loss": 0.1306,
"step": 1134
},
{
"epoch": 0.893349075167257,
"grad_norm": 0.9494535695924311,
"learning_rate": 1.5300379599903408e-07,
"loss": 0.1172,
"step": 1135
},
{
"epoch": 0.8941361668634396,
"grad_norm": 0.9062893885912572,
"learning_rate": 1.507684480352292e-07,
"loss": 0.109,
"step": 1136
},
{
"epoch": 0.8949232585596222,
"grad_norm": 0.9371709714503786,
"learning_rate": 1.4854904162743127e-07,
"loss": 0.1047,
"step": 1137
},
{
"epoch": 0.8957103502558048,
"grad_norm": 1.0110155118454318,
"learning_rate": 1.4634559183622193e-07,
"loss": 0.126,
"step": 1138
},
{
"epoch": 0.8964974419519874,
"grad_norm": 1.0367843431150414,
"learning_rate": 1.4415811361390142e-07,
"loss": 0.1251,
"step": 1139
},
{
"epoch": 0.89728453364817,
"grad_norm": 0.9836901063346848,
"learning_rate": 1.4198662180439166e-07,
"loss": 0.1225,
"step": 1140
},
{
"epoch": 0.8980716253443526,
"grad_norm": 1.0729047805019176,
"learning_rate": 1.3983113114313078e-07,
"loss": 0.1321,
"step": 1141
},
{
"epoch": 0.8988587170405352,
"grad_norm": 0.9172308694693998,
"learning_rate": 1.3769165625697633e-07,
"loss": 0.1094,
"step": 1142
},
{
"epoch": 0.8996458087367178,
"grad_norm": 0.9246235655173746,
"learning_rate": 1.355682116641052e-07,
"loss": 0.1098,
"step": 1143
},
{
"epoch": 0.9004329004329005,
"grad_norm": 0.9671203173685715,
"learning_rate": 1.3346081177391474e-07,
"loss": 0.1084,
"step": 1144
},
{
"epoch": 0.901219992129083,
"grad_norm": 0.9683781341379243,
"learning_rate": 1.3136947088692537e-07,
"loss": 0.119,
"step": 1145
},
{
"epoch": 0.9020070838252656,
"grad_norm": 0.9395787949515245,
"learning_rate": 1.2929420319468254e-07,
"loss": 0.1135,
"step": 1146
},
{
"epoch": 0.9027941755214482,
"grad_norm": 0.9442679409050593,
"learning_rate": 1.272350227796626e-07,
"loss": 0.1215,
"step": 1147
},
{
"epoch": 0.9035812672176309,
"grad_norm": 0.9549776744266577,
"learning_rate": 1.2519194361517468e-07,
"loss": 0.1065,
"step": 1148
},
{
"epoch": 0.9043683589138135,
"grad_norm": 0.867426121426782,
"learning_rate": 1.231649795652684e-07,
"loss": 0.0945,
"step": 1149
},
{
"epoch": 0.905155450609996,
"grad_norm": 0.9448412697453261,
"learning_rate": 1.2115414438463646e-07,
"loss": 0.1101,
"step": 1150
},
{
"epoch": 0.9059425423061787,
"grad_norm": 0.9917169805460462,
"learning_rate": 1.1915945171852572e-07,
"loss": 0.1163,
"step": 1151
},
{
"epoch": 0.9067296340023613,
"grad_norm": 0.943816804931634,
"learning_rate": 1.171809151026404e-07,
"loss": 0.1079,
"step": 1152
},
{
"epoch": 0.9075167256985439,
"grad_norm": 0.9275143855618218,
"learning_rate": 1.1521854796305243e-07,
"loss": 0.1075,
"step": 1153
},
{
"epoch": 0.9083038173947264,
"grad_norm": 1.0135210788160407,
"learning_rate": 1.1327236361611066e-07,
"loss": 0.1267,
"step": 1154
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.918433915733085,
"learning_rate": 1.1134237526834901e-07,
"loss": 0.1134,
"step": 1155
},
{
"epoch": 0.9098780007870917,
"grad_norm": 1.0485642131970048,
"learning_rate": 1.0942859601639793e-07,
"loss": 0.1181,
"step": 1156
},
{
"epoch": 0.9106650924832743,
"grad_norm": 0.9666449795771067,
"learning_rate": 1.0753103884689503e-07,
"loss": 0.1142,
"step": 1157
},
{
"epoch": 0.911452184179457,
"grad_norm": 0.9155262976479461,
"learning_rate": 1.0564971663639761e-07,
"loss": 0.1079,
"step": 1158
},
{
"epoch": 0.9122392758756395,
"grad_norm": 0.9136247354670238,
"learning_rate": 1.0378464215129419e-07,
"loss": 0.1131,
"step": 1159
},
{
"epoch": 0.9130263675718221,
"grad_norm": 0.9434235785738653,
"learning_rate": 1.0193582804771868e-07,
"loss": 0.1126,
"step": 1160
},
{
"epoch": 0.9138134592680047,
"grad_norm": 0.9320439540121583,
"learning_rate": 1.0010328687146464e-07,
"loss": 0.1128,
"step": 1161
},
{
"epoch": 0.9146005509641874,
"grad_norm": 0.9883313764295104,
"learning_rate": 9.828703105789983e-08,
"loss": 0.1189,
"step": 1162
},
{
"epoch": 0.91538764266037,
"grad_norm": 0.962978283773575,
"learning_rate": 9.648707293188092e-08,
"loss": 0.1181,
"step": 1163
},
{
"epoch": 0.9161747343565525,
"grad_norm": 0.9154295867986278,
"learning_rate": 9.470342470767197e-08,
"loss": 0.1077,
"step": 1164
},
{
"epoch": 0.9169618260527351,
"grad_norm": 0.9190343113758656,
"learning_rate": 9.293609848885971e-08,
"loss": 0.1101,
"step": 1165
},
{
"epoch": 0.9177489177489178,
"grad_norm": 0.8935174011429071,
"learning_rate": 9.118510626827198e-08,
"loss": 0.1112,
"step": 1166
},
{
"epoch": 0.9185360094451004,
"grad_norm": 0.9304289493526803,
"learning_rate": 8.945045992789669e-08,
"loss": 0.1037,
"step": 1167
},
{
"epoch": 0.9193231011412829,
"grad_norm": 0.9708158046423621,
"learning_rate": 8.773217123880074e-08,
"loss": 0.1255,
"step": 1168
},
{
"epoch": 0.9201101928374655,
"grad_norm": 0.95525649253936,
"learning_rate": 8.603025186105064e-08,
"loss": 0.1119,
"step": 1169
},
{
"epoch": 0.9208972845336482,
"grad_norm": 1.0094573892269945,
"learning_rate": 8.434471334363204e-08,
"loss": 0.1201,
"step": 1170
},
{
"epoch": 0.9216843762298308,
"grad_norm": 0.9367157782024292,
"learning_rate": 8.267556712437342e-08,
"loss": 0.1057,
"step": 1171
},
{
"epoch": 0.9224714679260134,
"grad_norm": 0.9747763894177717,
"learning_rate": 8.102282452986693e-08,
"loss": 0.1098,
"step": 1172
},
{
"epoch": 0.9232585596221959,
"grad_norm": 0.9372662016679384,
"learning_rate": 7.938649677539268e-08,
"loss": 0.1081,
"step": 1173
},
{
"epoch": 0.9240456513183786,
"grad_norm": 0.9887803764792047,
"learning_rate": 7.77665949648404e-08,
"loss": 0.1199,
"step": 1174
},
{
"epoch": 0.9248327430145612,
"grad_norm": 0.948252366615409,
"learning_rate": 7.616313009063791e-08,
"loss": 0.1064,
"step": 1175
},
{
"epoch": 0.9256198347107438,
"grad_norm": 0.948272373380358,
"learning_rate": 7.457611303367196e-08,
"loss": 0.1153,
"step": 1176
},
{
"epoch": 0.9264069264069265,
"grad_norm": 0.951666041729817,
"learning_rate": 7.300555456321884e-08,
"loss": 0.1175,
"step": 1177
},
{
"epoch": 0.927194018103109,
"grad_norm": 0.982566934759444,
"learning_rate": 7.145146533686725e-08,
"loss": 0.1164,
"step": 1178
},
{
"epoch": 0.9279811097992916,
"grad_norm": 0.9553483812186222,
"learning_rate": 6.991385590044947e-08,
"loss": 0.1169,
"step": 1179
},
{
"epoch": 0.9287682014954742,
"grad_norm": 0.9724385774768447,
"learning_rate": 6.839273668796747e-08,
"loss": 0.1078,
"step": 1180
},
{
"epoch": 0.9295552931916569,
"grad_norm": 0.9324047746069145,
"learning_rate": 6.688811802152279e-08,
"loss": 0.1162,
"step": 1181
},
{
"epoch": 0.9303423848878394,
"grad_norm": 0.9711188611376046,
"learning_rate": 6.540001011124703e-08,
"loss": 0.1089,
"step": 1182
},
{
"epoch": 0.931129476584022,
"grad_norm": 1.0007682860058293,
"learning_rate": 6.392842305523172e-08,
"loss": 0.1225,
"step": 1183
},
{
"epoch": 0.9319165682802046,
"grad_norm": 0.9074164360304593,
"learning_rate": 6.247336683946031e-08,
"loss": 0.1086,
"step": 1184
},
{
"epoch": 0.9327036599763873,
"grad_norm": 0.9132051814101239,
"learning_rate": 6.103485133774039e-08,
"loss": 0.1168,
"step": 1185
},
{
"epoch": 0.9334907516725699,
"grad_norm": 0.9362633318018305,
"learning_rate": 5.961288631163687e-08,
"loss": 0.1162,
"step": 1186
},
{
"epoch": 0.9342778433687524,
"grad_norm": 1.0037784789548483,
"learning_rate": 5.820748141040444e-08,
"loss": 0.1246,
"step": 1187
},
{
"epoch": 0.935064935064935,
"grad_norm": 0.9109713869964553,
"learning_rate": 5.681864617092414e-08,
"loss": 0.1062,
"step": 1188
},
{
"epoch": 0.9358520267611177,
"grad_norm": 0.9353212505070928,
"learning_rate": 5.544639001763719e-08,
"loss": 0.1116,
"step": 1189
},
{
"epoch": 0.9366391184573003,
"grad_norm": 0.9763975248080838,
"learning_rate": 5.4090722262481463e-08,
"loss": 0.1183,
"step": 1190
},
{
"epoch": 0.9374262101534829,
"grad_norm": 0.9389308712439575,
"learning_rate": 5.2751652104828245e-08,
"loss": 0.1125,
"step": 1191
},
{
"epoch": 0.9382133018496654,
"grad_norm": 0.9219409357377748,
"learning_rate": 5.142918863141999e-08,
"loss": 0.1045,
"step": 1192
},
{
"epoch": 0.9390003935458481,
"grad_norm": 0.9112274236581333,
"learning_rate": 5.012334081630821e-08,
"loss": 0.1154,
"step": 1193
},
{
"epoch": 0.9397874852420307,
"grad_norm": 1.0413638466805462,
"learning_rate": 4.8834117520793754e-08,
"loss": 0.1235,
"step": 1194
},
{
"epoch": 0.9405745769382133,
"grad_norm": 0.929473983088696,
"learning_rate": 4.756152749336468e-08,
"loss": 0.1216,
"step": 1195
},
{
"epoch": 0.9413616686343959,
"grad_norm": 0.9499785152690334,
"learning_rate": 4.6305579369638474e-08,
"loss": 0.119,
"step": 1196
},
{
"epoch": 0.9421487603305785,
"grad_norm": 0.9512339302739883,
"learning_rate": 4.506628167230326e-08,
"loss": 0.1128,
"step": 1197
},
{
"epoch": 0.9429358520267611,
"grad_norm": 0.9789643064479855,
"learning_rate": 4.384364281105974e-08,
"loss": 0.1156,
"step": 1198
},
{
"epoch": 0.9437229437229437,
"grad_norm": 0.9338458352272411,
"learning_rate": 4.2637671082563225e-08,
"loss": 0.1097,
"step": 1199
},
{
"epoch": 0.9445100354191264,
"grad_norm": 0.9477658752462017,
"learning_rate": 4.144837467036922e-08,
"loss": 0.1062,
"step": 1200
},
{
"epoch": 0.9452971271153089,
"grad_norm": 0.9108658408708349,
"learning_rate": 4.0275761644876785e-08,
"loss": 0.113,
"step": 1201
},
{
"epoch": 0.9460842188114915,
"grad_norm": 0.9670323890311822,
"learning_rate": 3.911983996327251e-08,
"loss": 0.1159,
"step": 1202
},
{
"epoch": 0.9468713105076741,
"grad_norm": 0.9508320183305409,
"learning_rate": 3.798061746947995e-08,
"loss": 0.1183,
"step": 1203
},
{
"epoch": 0.9476584022038568,
"grad_norm": 0.9446280692518585,
"learning_rate": 3.6858101894102774e-08,
"loss": 0.1039,
"step": 1204
},
{
"epoch": 0.9484454939000394,
"grad_norm": 0.8915603321077954,
"learning_rate": 3.575230085437448e-08,
"loss": 0.1131,
"step": 1205
},
{
"epoch": 0.9492325855962219,
"grad_norm": 1.0049928450920236,
"learning_rate": 3.466322185410542e-08,
"loss": 0.1075,
"step": 1206
},
{
"epoch": 0.9500196772924046,
"grad_norm": 0.9377285133019151,
"learning_rate": 3.3590872283633944e-08,
"loss": 0.1047,
"step": 1207
},
{
"epoch": 0.9508067689885872,
"grad_norm": 1.0283573568735918,
"learning_rate": 3.253525941977309e-08,
"loss": 0.1207,
"step": 1208
},
{
"epoch": 0.9515938606847698,
"grad_norm": 0.8716127646526632,
"learning_rate": 3.1496390425764246e-08,
"loss": 0.1034,
"step": 1209
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.9413176172911034,
"learning_rate": 3.047427235122663e-08,
"loss": 0.1094,
"step": 1210
},
{
"epoch": 0.953168044077135,
"grad_norm": 0.9235158713031617,
"learning_rate": 2.9468912132110117e-08,
"loss": 0.1112,
"step": 1211
},
{
"epoch": 0.9539551357733176,
"grad_norm": 1.0172961607730988,
"learning_rate": 2.8480316590648315e-08,
"loss": 0.1174,
"step": 1212
},
{
"epoch": 0.9547422274695002,
"grad_norm": 0.9633734296197954,
"learning_rate": 2.750849243531223e-08,
"loss": 0.119,
"step": 1213
},
{
"epoch": 0.9555293191656828,
"grad_norm": 0.8730009976362983,
"learning_rate": 2.655344626076417e-08,
"loss": 0.1012,
"step": 1214
},
{
"epoch": 0.9563164108618654,
"grad_norm": 0.9472387317654097,
"learning_rate": 2.5615184547813364e-08,
"loss": 0.1169,
"step": 1215
},
{
"epoch": 0.957103502558048,
"grad_norm": 1.009282211648514,
"learning_rate": 2.4693713663372643e-08,
"loss": 0.1193,
"step": 1216
},
{
"epoch": 0.9578905942542306,
"grad_norm": 0.9320555090575322,
"learning_rate": 2.378903986041403e-08,
"loss": 0.1134,
"step": 1217
},
{
"epoch": 0.9586776859504132,
"grad_norm": 0.9964824551488268,
"learning_rate": 2.2901169277927126e-08,
"loss": 0.123,
"step": 1218
},
{
"epoch": 0.9594647776465959,
"grad_norm": 0.9252548740156445,
"learning_rate": 2.2030107940877733e-08,
"loss": 0.1145,
"step": 1219
},
{
"epoch": 0.9602518693427784,
"grad_norm": 1.0057993388132023,
"learning_rate": 2.117586176016512e-08,
"loss": 0.1246,
"step": 1220
},
{
"epoch": 0.961038961038961,
"grad_norm": 0.953596307625609,
"learning_rate": 2.0338436532584826e-08,
"loss": 0.1113,
"step": 1221
},
{
"epoch": 0.9618260527351437,
"grad_norm": 0.8951377954636907,
"learning_rate": 1.9517837940786767e-08,
"loss": 0.1033,
"step": 1222
},
{
"epoch": 0.9626131444313263,
"grad_norm": 1.0221216536124687,
"learning_rate": 1.8714071553238012e-08,
"loss": 0.125,
"step": 1223
},
{
"epoch": 0.9634002361275088,
"grad_norm": 0.9387870706305922,
"learning_rate": 1.7927142824184784e-08,
"loss": 0.1144,
"step": 1224
},
{
"epoch": 0.9641873278236914,
"grad_norm": 0.9321486745496109,
"learning_rate": 1.7157057093614704e-08,
"loss": 0.1094,
"step": 1225
},
{
"epoch": 0.9649744195198741,
"grad_norm": 1.0128840378759991,
"learning_rate": 1.6403819587221814e-08,
"loss": 0.1217,
"step": 1226
},
{
"epoch": 0.9657615112160567,
"grad_norm": 0.924640460993744,
"learning_rate": 1.5667435416370226e-08,
"loss": 0.1115,
"step": 1227
},
{
"epoch": 0.9665486029122393,
"grad_norm": 0.9932790569946806,
"learning_rate": 1.494790957805997e-08,
"loss": 0.115,
"step": 1228
},
{
"epoch": 0.9673356946084218,
"grad_norm": 0.9381951620042324,
"learning_rate": 1.4245246954892323e-08,
"loss": 0.1096,
"step": 1229
},
{
"epoch": 0.9681227863046045,
"grad_norm": 0.9070754098607924,
"learning_rate": 1.3559452315037025e-08,
"loss": 0.107,
"step": 1230
},
{
"epoch": 0.9689098780007871,
"grad_norm": 0.9138545833001099,
"learning_rate": 1.2890530312200944e-08,
"loss": 0.1027,
"step": 1231
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.959458803400461,
"learning_rate": 1.2238485485594753e-08,
"loss": 0.1163,
"step": 1232
},
{
"epoch": 0.9704840613931524,
"grad_norm": 0.9577614004721761,
"learning_rate": 1.160332225990296e-08,
"loss": 0.1148,
"step": 1233
},
{
"epoch": 0.9712711530893349,
"grad_norm": 0.9713054396060389,
"learning_rate": 1.0985044945254763e-08,
"loss": 0.1249,
"step": 1234
},
{
"epoch": 0.9720582447855175,
"grad_norm": 0.9727071823355634,
"learning_rate": 1.0383657737192964e-08,
"loss": 0.1188,
"step": 1235
},
{
"epoch": 0.9728453364817001,
"grad_norm": 0.9525541993565831,
"learning_rate": 9.79916471664677e-09,
"loss": 0.1129,
"step": 1236
},
{
"epoch": 0.9736324281778828,
"grad_norm": 0.9792192000262094,
"learning_rate": 9.231569849904309e-09,
"loss": 0.124,
"step": 1237
},
{
"epoch": 0.9744195198740653,
"grad_norm": 0.9711922853486228,
"learning_rate": 8.680876988584607e-09,
"loss": 0.1114,
"step": 1238
},
{
"epoch": 0.9752066115702479,
"grad_norm": 0.9623623423156289,
"learning_rate": 8.147089869612045e-09,
"loss": 0.1064,
"step": 1239
},
{
"epoch": 0.9759937032664305,
"grad_norm": 0.9583105118384698,
"learning_rate": 7.630212115191381e-09,
"loss": 0.1128,
"step": 1240
},
{
"epoch": 0.9767807949626132,
"grad_norm": 0.958416495157565,
"learning_rate": 7.130247232782217e-09,
"loss": 0.1124,
"step": 1241
},
{
"epoch": 0.9775678866587958,
"grad_norm": 0.9739671638616083,
"learning_rate": 6.647198615076789e-09,
"loss": 0.1239,
"step": 1242
},
{
"epoch": 0.9783549783549783,
"grad_norm": 0.9831503405039121,
"learning_rate": 6.181069539974716e-09,
"loss": 0.1152,
"step": 1243
},
{
"epoch": 0.9791420700511609,
"grad_norm": 0.9837400272891019,
"learning_rate": 5.7318631705630126e-09,
"loss": 0.1196,
"step": 1244
},
{
"epoch": 0.9799291617473436,
"grad_norm": 0.9663876709057511,
"learning_rate": 5.299582555093052e-09,
"loss": 0.1174,
"step": 1245
},
{
"epoch": 0.9807162534435262,
"grad_norm": 0.9077428753737183,
"learning_rate": 4.884230626960307e-09,
"loss": 0.1046,
"step": 1246
},
{
"epoch": 0.9815033451397088,
"grad_norm": 0.9624717672564354,
"learning_rate": 4.485810204684638e-09,
"loss": 0.1143,
"step": 1247
},
{
"epoch": 0.9822904368358913,
"grad_norm": 0.9768731662603329,
"learning_rate": 4.104323991891424e-09,
"loss": 0.1111,
"step": 1248
},
{
"epoch": 0.983077528532074,
"grad_norm": 0.9918364976987204,
"learning_rate": 3.739774577292688e-09,
"loss": 0.1146,
"step": 1249
},
{
"epoch": 0.9838646202282566,
"grad_norm": 0.9235361594154657,
"learning_rate": 3.392164434669609e-09,
"loss": 0.115,
"step": 1250
},
{
"epoch": 0.9846517119244392,
"grad_norm": 0.9407569711042593,
"learning_rate": 3.0614959228558728e-09,
"loss": 0.1048,
"step": 1251
},
{
"epoch": 0.9854388036206218,
"grad_norm": 0.9503971642912823,
"learning_rate": 2.7477712857215676e-09,
"loss": 0.1118,
"step": 1252
},
{
"epoch": 0.9862258953168044,
"grad_norm": 0.9423931683522886,
"learning_rate": 2.450992652157924e-09,
"loss": 0.1052,
"step": 1253
},
{
"epoch": 0.987012987012987,
"grad_norm": 0.9692730394836648,
"learning_rate": 2.1711620360634344e-09,
"loss": 0.1134,
"step": 1254
},
{
"epoch": 0.9878000787091696,
"grad_norm": 0.8578557269739953,
"learning_rate": 1.9082813363294205e-09,
"loss": 0.1062,
"step": 1255
},
{
"epoch": 0.9885871704053523,
"grad_norm": 0.9417059944579995,
"learning_rate": 1.662352336827544e-09,
"loss": 0.1198,
"step": 1256
},
{
"epoch": 0.9893742621015348,
"grad_norm": 1.0199292836158018,
"learning_rate": 1.4333767063973159e-09,
"loss": 0.1085,
"step": 1257
},
{
"epoch": 0.9901613537977174,
"grad_norm": 0.9283942300248242,
"learning_rate": 1.221355998835272e-09,
"loss": 0.1069,
"step": 1258
},
{
"epoch": 0.9909484454939,
"grad_norm": 0.9097063653552931,
"learning_rate": 1.0262916528841483e-09,
"loss": 0.1074,
"step": 1259
},
{
"epoch": 0.9917355371900827,
"grad_norm": 0.9806238353833303,
"learning_rate": 8.481849922237217e-10,
"loss": 0.1136,
"step": 1260
},
{
"epoch": 0.9925226288862653,
"grad_norm": 0.9354892517698297,
"learning_rate": 6.870372254602631e-10,
"loss": 0.1048,
"step": 1261
},
{
"epoch": 0.9933097205824478,
"grad_norm": 0.9045069809298675,
"learning_rate": 5.428494461201527e-10,
"loss": 0.109,
"step": 1262
},
{
"epoch": 0.9940968122786304,
"grad_norm": 0.9462573748322036,
"learning_rate": 4.156226326415547e-10,
"loss": 0.1156,
"step": 1263
},
{
"epoch": 0.9948839039748131,
"grad_norm": 0.9123815541723352,
"learning_rate": 3.0535764836747696e-10,
"loss": 0.1078,
"step": 1264
},
{
"epoch": 0.9956709956709957,
"grad_norm": 0.960544516493779,
"learning_rate": 2.1205524154105372e-10,
"loss": 0.1088,
"step": 1265
},
{
"epoch": 0.9964580873671782,
"grad_norm": 0.8887242195384208,
"learning_rate": 1.357160452988837e-10,
"loss": 0.1031,
"step": 1266
},
{
"epoch": 0.9972451790633609,
"grad_norm": 0.9502744942086568,
"learning_rate": 7.63405776685322e-11,
"loss": 0.1124,
"step": 1267
},
{
"epoch": 0.9980322707595435,
"grad_norm": 0.8901953170723566,
"learning_rate": 3.3929241563535056e-11,
"loss": 0.1001,
"step": 1268
},
{
"epoch": 0.9988193624557261,
"grad_norm": 0.9602428624879035,
"learning_rate": 8.482324780900718e-12,
"loss": 0.1116,
"step": 1269
},
{
"epoch": 0.9996064541519087,
"grad_norm": 0.9779040145681426,
"learning_rate": 0.0,
"loss": 0.119,
"step": 1270
},
{
"epoch": 0.9996064541519087,
"step": 1270,
"total_flos": 223330201436160.0,
"train_loss": 0.14531472616308316,
"train_runtime": 7100.3282,
"train_samples_per_second": 11.452,
"train_steps_per_second": 0.179
}
],
"logging_steps": 1,
"max_steps": 1270,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 223330201436160.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}