{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 3125,
"global_step": 9375,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 126.07254791259766,
"learning_rate": 5.330490405117271e-09,
"loss": 5.5721,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 50.644527435302734,
"learning_rate": 1.7057569296375268e-07,
"loss": 4.7319,
"step": 32
},
{
"epoch": 0.02,
"grad_norm": 50.18523406982422,
"learning_rate": 3.4115138592750537e-07,
"loss": 4.2388,
"step": 64
},
{
"epoch": 0.03,
"grad_norm": 22.55028533935547,
"learning_rate": 5.11727078891258e-07,
"loss": 4.1004,
"step": 96
},
{
"epoch": 0.04,
"grad_norm": 17.249303817749023,
"learning_rate": 6.823027718550107e-07,
"loss": 3.8056,
"step": 128
},
{
"epoch": 0.05,
"grad_norm": 27.913209915161133,
"learning_rate": 8.528784648187634e-07,
"loss": 3.7559,
"step": 160
},
{
"epoch": 0.06,
"grad_norm": 31.13373374938965,
"learning_rate": 1.023454157782516e-06,
"loss": 3.5802,
"step": 192
},
{
"epoch": 0.07,
"grad_norm": 51.599185943603516,
"learning_rate": 1.1940298507462686e-06,
"loss": 3.4202,
"step": 224
},
{
"epoch": 0.08,
"grad_norm": 26.857385635375977,
"learning_rate": 1.3646055437100215e-06,
"loss": 3.1577,
"step": 256
},
{
"epoch": 0.09,
"grad_norm": 36.01700210571289,
"learning_rate": 1.5351812366737743e-06,
"loss": 2.9155,
"step": 288
},
{
"epoch": 0.1,
"grad_norm": 48.35042953491211,
"learning_rate": 1.7057569296375267e-06,
"loss": 2.4688,
"step": 320
},
{
"epoch": 0.11,
"grad_norm": 32.770713806152344,
"learning_rate": 1.8763326226012796e-06,
"loss": 2.0288,
"step": 352
},
{
"epoch": 0.12,
"grad_norm": 28.91162872314453,
"learning_rate": 2.046908315565032e-06,
"loss": 1.713,
"step": 384
},
{
"epoch": 0.13,
"grad_norm": 28.609853744506836,
"learning_rate": 2.217484008528785e-06,
"loss": 1.2517,
"step": 416
},
{
"epoch": 0.14,
"grad_norm": 23.230613708496094,
"learning_rate": 2.3880597014925373e-06,
"loss": 0.8838,
"step": 448
},
{
"epoch": 0.15,
"grad_norm": 20.77098846435547,
"learning_rate": 2.55863539445629e-06,
"loss": 0.6982,
"step": 480
},
{
"epoch": 0.16,
"grad_norm": 8.70417594909668,
"learning_rate": 2.729211087420043e-06,
"loss": 0.5794,
"step": 512
},
{
"epoch": 0.17,
"grad_norm": 6.534135341644287,
"learning_rate": 2.8997867803837954e-06,
"loss": 0.5319,
"step": 544
},
{
"epoch": 0.18,
"grad_norm": 4.264636039733887,
"learning_rate": 3.0703624733475486e-06,
"loss": 0.5251,
"step": 576
},
{
"epoch": 0.19,
"grad_norm": 4.235575199127197,
"learning_rate": 3.240938166311301e-06,
"loss": 0.5002,
"step": 608
},
{
"epoch": 0.2,
"grad_norm": 3.1073174476623535,
"learning_rate": 3.4115138592750535e-06,
"loss": 0.4929,
"step": 640
},
{
"epoch": 0.22,
"grad_norm": 3.024670362472534,
"learning_rate": 3.582089552238806e-06,
"loss": 0.5191,
"step": 672
},
{
"epoch": 0.23,
"grad_norm": 2.9338278770446777,
"learning_rate": 3.752665245202559e-06,
"loss": 0.4784,
"step": 704
},
{
"epoch": 0.24,
"grad_norm": 2.67437481880188,
"learning_rate": 3.9232409381663116e-06,
"loss": 0.4685,
"step": 736
},
{
"epoch": 0.25,
"grad_norm": 2.5493340492248535,
"learning_rate": 4.093816631130064e-06,
"loss": 0.4812,
"step": 768
},
{
"epoch": 0.26,
"grad_norm": 2.4631426334381104,
"learning_rate": 4.264392324093816e-06,
"loss": 0.4695,
"step": 800
},
{
"epoch": 0.27,
"grad_norm": 2.305046319961548,
"learning_rate": 4.43496801705757e-06,
"loss": 0.4799,
"step": 832
},
{
"epoch": 0.28,
"grad_norm": 2.070432186126709,
"learning_rate": 4.605543710021322e-06,
"loss": 0.4713,
"step": 864
},
{
"epoch": 0.29,
"grad_norm": 2.4328596591949463,
"learning_rate": 4.7761194029850745e-06,
"loss": 0.4581,
"step": 896
},
{
"epoch": 0.3,
"grad_norm": 1.923413872718811,
"learning_rate": 4.946695095948828e-06,
"loss": 0.4723,
"step": 928
},
{
"epoch": 0.31,
"grad_norm": 2.141838312149048,
"learning_rate": 4.999916116490299e-06,
"loss": 0.454,
"step": 960
},
{
"epoch": 0.32,
"grad_norm": 1.8363310098648071,
"learning_rate": 4.999494633386398e-06,
"loss": 0.4541,
"step": 992
},
{
"epoch": 0.33,
"grad_norm": 1.8952796459197998,
"learning_rate": 4.998718279148715e-06,
"loss": 0.4404,
"step": 1024
},
{
"epoch": 0.34,
"grad_norm": 2.261046886444092,
"learning_rate": 4.997587164001815e-06,
"loss": 0.4578,
"step": 1056
},
{
"epoch": 0.35,
"grad_norm": 1.8481628894805908,
"learning_rate": 4.996101448538208e-06,
"loss": 0.4567,
"step": 1088
},
{
"epoch": 0.36,
"grad_norm": 1.8773612976074219,
"learning_rate": 4.994261343695546e-06,
"loss": 0.4448,
"step": 1120
},
{
"epoch": 0.37,
"grad_norm": 2.0779101848602295,
"learning_rate": 4.992067110726676e-06,
"loss": 0.4654,
"step": 1152
},
{
"epoch": 0.38,
"grad_norm": 1.975841999053955,
"learning_rate": 4.989519061162551e-06,
"loss": 0.451,
"step": 1184
},
{
"epoch": 0.39,
"grad_norm": 2.04510498046875,
"learning_rate": 4.986617556767996e-06,
"loss": 0.4494,
"step": 1216
},
{
"epoch": 0.4,
"grad_norm": 2.2312591075897217,
"learning_rate": 4.983363009490345e-06,
"loss": 0.4491,
"step": 1248
},
{
"epoch": 0.41,
"grad_norm": 2.15928053855896,
"learning_rate": 4.979755881400958e-06,
"loss": 0.4469,
"step": 1280
},
{
"epoch": 0.42,
"grad_norm": 1.8935317993164062,
"learning_rate": 4.975796684629615e-06,
"loss": 0.4407,
"step": 1312
},
{
"epoch": 0.43,
"grad_norm": 1.8919423818588257,
"learning_rate": 4.9714859812918025e-06,
"loss": 0.4378,
"step": 1344
},
{
"epoch": 0.44,
"grad_norm": 1.8667327165603638,
"learning_rate": 4.966824383408912e-06,
"loss": 0.4461,
"step": 1376
},
{
"epoch": 0.45,
"grad_norm": 1.7683241367340088,
"learning_rate": 4.961812552821344e-06,
"loss": 0.4673,
"step": 1408
},
{
"epoch": 0.46,
"grad_norm": 2.1038663387298584,
"learning_rate": 4.9564512010945376e-06,
"loss": 0.445,
"step": 1440
},
{
"epoch": 0.47,
"grad_norm": 1.9102988243103027,
"learning_rate": 4.950741089417953e-06,
"loss": 0.4387,
"step": 1472
},
{
"epoch": 0.48,
"grad_norm": 1.906335473060608,
"learning_rate": 4.9446830284969925e-06,
"loss": 0.4451,
"step": 1504
},
{
"epoch": 0.49,
"grad_norm": 1.8324521780014038,
"learning_rate": 4.9382778784379036e-06,
"loss": 0.4239,
"step": 1536
},
{
"epoch": 0.5,
"grad_norm": 1.6944546699523926,
"learning_rate": 4.93152654862566e-06,
"loss": 0.4323,
"step": 1568
},
{
"epoch": 0.51,
"grad_norm": 1.6708574295043945,
"learning_rate": 4.924429997594853e-06,
"loss": 0.4358,
"step": 1600
},
{
"epoch": 0.52,
"grad_norm": 1.7164026498794556,
"learning_rate": 4.916989232893599e-06,
"loss": 0.4464,
"step": 1632
},
{
"epoch": 0.53,
"grad_norm": 1.9400017261505127,
"learning_rate": 4.9092053109404915e-06,
"loss": 0.4439,
"step": 1664
},
{
"epoch": 0.54,
"grad_norm": 2.053682804107666,
"learning_rate": 4.901079336874613e-06,
"loss": 0.4232,
"step": 1696
},
{
"epoch": 0.55,
"grad_norm": 1.9536739587783813,
"learning_rate": 4.892612464398635e-06,
"loss": 0.4548,
"step": 1728
},
{
"epoch": 0.56,
"grad_norm": 1.9369513988494873,
"learning_rate": 4.883805895615012e-06,
"loss": 0.4334,
"step": 1760
},
{
"epoch": 0.57,
"grad_norm": 1.754228115081787,
"learning_rate": 4.874660880855312e-06,
"loss": 0.4525,
"step": 1792
},
{
"epoch": 0.58,
"grad_norm": 2.147170305252075,
"learning_rate": 4.865178718502702e-06,
"loss": 0.4281,
"step": 1824
},
{
"epoch": 0.59,
"grad_norm": 1.8635765314102173,
"learning_rate": 4.855360754807605e-06,
"loss": 0.4354,
"step": 1856
},
{
"epoch": 0.6,
"grad_norm": 1.63584303855896,
"learning_rate": 4.845208383696562e-06,
"loss": 0.4423,
"step": 1888
},
{
"epoch": 0.61,
"grad_norm": 1.874233365058899,
"learning_rate": 4.834723046574325e-06,
"loss": 0.4265,
"step": 1920
},
{
"epoch": 0.62,
"grad_norm": 2.0124189853668213,
"learning_rate": 4.823906232119217e-06,
"loss": 0.4336,
"step": 1952
},
{
"epoch": 0.63,
"grad_norm": 1.8331832885742188,
"learning_rate": 4.812759476071763e-06,
"loss": 0.4129,
"step": 1984
},
{
"epoch": 0.65,
"grad_norm": 2.133513927459717,
"learning_rate": 4.801284361016662e-06,
"loss": 0.4443,
"step": 2016
},
{
"epoch": 0.66,
"grad_norm": 1.4341892004013062,
"learning_rate": 4.7894825161580895e-06,
"loss": 0.4205,
"step": 2048
},
{
"epoch": 0.67,
"grad_norm": 2.1529791355133057,
"learning_rate": 4.777355617088385e-06,
"loss": 0.4233,
"step": 2080
},
{
"epoch": 0.68,
"grad_norm": 1.9409756660461426,
"learning_rate": 4.764905385550162e-06,
"loss": 0.4468,
"step": 2112
},
{
"epoch": 0.69,
"grad_norm": 2.3252675533294678,
"learning_rate": 4.752133589191858e-06,
"loss": 0.439,
"step": 2144
},
{
"epoch": 0.7,
"grad_norm": 2.023167133331299,
"learning_rate": 4.739042041316768e-06,
"loss": 0.4336,
"step": 2176
},
{
"epoch": 0.71,
"grad_norm": 1.8779696226119995,
"learning_rate": 4.725632600625596e-06,
"loss": 0.4262,
"step": 2208
},
{
"epoch": 0.72,
"grad_norm": 1.8494573831558228,
"learning_rate": 4.711907170952566e-06,
"loss": 0.4192,
"step": 2240
},
{
"epoch": 0.73,
"grad_norm": 2.6154637336730957,
"learning_rate": 4.697867700995114e-06,
"loss": 0.4195,
"step": 2272
},
{
"epoch": 0.74,
"grad_norm": 1.9525200128555298,
"learning_rate": 4.6835161840372275e-06,
"loss": 0.4277,
"step": 2304
},
{
"epoch": 0.75,
"grad_norm": 2.0012083053588867,
"learning_rate": 4.668854657666433e-06,
"loss": 0.4279,
"step": 2336
},
{
"epoch": 0.76,
"grad_norm": 1.5427674055099487,
"learning_rate": 4.653885203484516e-06,
"loss": 0.4291,
"step": 2368
},
{
"epoch": 0.77,
"grad_norm": 1.8067190647125244,
"learning_rate": 4.638609946811972e-06,
"loss": 0.4493,
"step": 2400
},
{
"epoch": 0.78,
"grad_norm": 1.7905209064483643,
"learning_rate": 4.623031056386266e-06,
"loss": 0.4218,
"step": 2432
},
{
"epoch": 0.79,
"grad_norm": 2.0493154525756836,
"learning_rate": 4.60715074405392e-06,
"loss": 0.4241,
"step": 2464
},
{
"epoch": 0.8,
"grad_norm": 1.927325963973999,
"learning_rate": 4.5909712644564785e-06,
"loss": 0.4361,
"step": 2496
},
{
"epoch": 0.81,
"grad_norm": 2.2927017211914062,
"learning_rate": 4.574494914710402e-06,
"loss": 0.4257,
"step": 2528
},
{
"epoch": 0.82,
"grad_norm": 1.3847366571426392,
"learning_rate": 4.557724034080933e-06,
"loss": 0.4221,
"step": 2560
},
{
"epoch": 0.83,
"grad_norm": 1.9423835277557373,
"learning_rate": 4.540661003649969e-06,
"loss": 0.4194,
"step": 2592
},
{
"epoch": 0.84,
"grad_norm": 1.7483317852020264,
"learning_rate": 4.523308245978002e-06,
"loss": 0.4231,
"step": 2624
},
{
"epoch": 0.85,
"grad_norm": 1.8706562519073486,
"learning_rate": 4.505668224760177e-06,
"loss": 0.4361,
"step": 2656
},
{
"epoch": 0.86,
"grad_norm": 1.5217187404632568,
"learning_rate": 4.487743444476497e-06,
"loss": 0.4293,
"step": 2688
},
{
"epoch": 0.87,
"grad_norm": 1.5683813095092773,
"learning_rate": 4.4695364500362505e-06,
"loss": 0.4191,
"step": 2720
},
{
"epoch": 0.88,
"grad_norm": 1.9017343521118164,
"learning_rate": 4.451049826416682e-06,
"loss": 0.425,
"step": 2752
},
{
"epoch": 0.89,
"grad_norm": 1.5698572397232056,
"learning_rate": 4.432286198295998e-06,
"loss": 0.4189,
"step": 2784
},
{
"epoch": 0.9,
"grad_norm": 1.894131064414978,
"learning_rate": 4.41324822968071e-06,
"loss": 0.4193,
"step": 2816
},
{
"epoch": 0.91,
"grad_norm": 1.7698094844818115,
"learning_rate": 4.393938623527417e-06,
"loss": 0.4211,
"step": 2848
},
{
"epoch": 0.92,
"grad_norm": 1.4975591897964478,
"learning_rate": 4.374360121359038e-06,
"loss": 0.4104,
"step": 2880
},
{
"epoch": 0.93,
"grad_norm": 1.7062549591064453,
"learning_rate": 4.3545155028755865e-06,
"loss": 0.4363,
"step": 2912
},
{
"epoch": 0.94,
"grad_norm": 1.5995006561279297,
"learning_rate": 4.33440758555951e-06,
"loss": 0.4153,
"step": 2944
},
{
"epoch": 0.95,
"grad_norm": 2.1322031021118164,
"learning_rate": 4.3140392242756776e-06,
"loss": 0.4211,
"step": 2976
},
{
"epoch": 0.96,
"grad_norm": 1.7106413841247559,
"learning_rate": 4.293413310866049e-06,
"loss": 0.4285,
"step": 3008
},
{
"epoch": 0.97,
"grad_norm": 1.458028793334961,
"learning_rate": 4.272532773739104e-06,
"loss": 0.4393,
"step": 3040
},
{
"epoch": 0.98,
"grad_norm": 1.8825474977493286,
"learning_rate": 4.251400577454071e-06,
"loss": 0.4252,
"step": 3072
},
{
"epoch": 0.99,
"grad_norm": 1.8799755573272705,
"learning_rate": 4.230019722300031e-06,
"loss": 0.4077,
"step": 3104
},
{
"epoch": 1.0,
"grad_norm": 2.2173070907592773,
"learning_rate": 4.208393243869944e-06,
"loss": 0.3926,
"step": 3136
},
{
"epoch": 1.01,
"grad_norm": 2.1596415042877197,
"learning_rate": 4.1865242126296595e-06,
"loss": 0.3684,
"step": 3168
},
{
"epoch": 1.02,
"grad_norm": 1.7839455604553223,
"learning_rate": 4.16441573348199e-06,
"loss": 0.3653,
"step": 3200
},
{
"epoch": 1.03,
"grad_norm": 2.041905403137207,
"learning_rate": 4.142070945325877e-06,
"loss": 0.3662,
"step": 3232
},
{
"epoch": 1.04,
"grad_norm": 1.9254001379013062,
"learning_rate": 4.119493020610743e-06,
"loss": 0.3666,
"step": 3264
},
{
"epoch": 1.05,
"grad_norm": 1.974266767501831,
"learning_rate": 4.096685164886077e-06,
"loss": 0.3741,
"step": 3296
},
{
"epoch": 1.06,
"grad_norm": 1.8142021894454956,
"learning_rate": 4.073650616346317e-06,
"loss": 0.3611,
"step": 3328
},
{
"epoch": 1.08,
"grad_norm": 2.006136417388916,
"learning_rate": 4.050392645371101e-06,
"loss": 0.3704,
"step": 3360
},
{
"epoch": 1.09,
"grad_norm": 1.5683622360229492,
"learning_rate": 4.02691455406095e-06,
"loss": 0.3533,
"step": 3392
},
{
"epoch": 1.1,
"grad_norm": 2.074063777923584,
"learning_rate": 4.003219675768442e-06,
"loss": 0.373,
"step": 3424
},
{
"epoch": 1.11,
"grad_norm": 1.7841862440109253,
"learning_rate": 3.9793113746249554e-06,
"loss": 0.3542,
"step": 3456
},
{
"epoch": 1.12,
"grad_norm": 1.8422183990478516,
"learning_rate": 3.955193045063038e-06,
"loss": 0.356,
"step": 3488
},
{
"epoch": 1.13,
"grad_norm": 1.7018710374832153,
"learning_rate": 3.930868111334471e-06,
"loss": 0.3623,
"step": 3520
},
{
"epoch": 1.14,
"grad_norm": 1.9030427932739258,
"learning_rate": 3.9063400270241114e-06,
"loss": 0.375,
"step": 3552
},
{
"epoch": 1.15,
"grad_norm": 2.2064173221588135,
"learning_rate": 3.8816122745595556e-06,
"loss": 0.3621,
"step": 3584
},
{
"epoch": 1.16,
"grad_norm": 1.7391672134399414,
"learning_rate": 3.856688364716715e-06,
"loss": 0.3675,
"step": 3616
},
{
"epoch": 1.17,
"grad_norm": 2.20231032371521,
"learning_rate": 3.8315718361213694e-06,
"loss": 0.3637,
"step": 3648
},
{
"epoch": 1.18,
"grad_norm": 1.6645841598510742,
"learning_rate": 3.8062662547467604e-06,
"loss": 0.355,
"step": 3680
},
{
"epoch": 1.19,
"grad_norm": 1.9098048210144043,
"learning_rate": 3.780775213407305e-06,
"loss": 0.3672,
"step": 3712
},
{
"epoch": 1.2,
"grad_norm": 1.7326850891113281,
"learning_rate": 3.755102331248497e-06,
"loss": 0.3653,
"step": 3744
},
{
"epoch": 1.21,
"grad_norm": 1.9420820474624634,
"learning_rate": 3.729251253233073e-06,
"loss": 0.3704,
"step": 3776
},
{
"epoch": 1.22,
"grad_norm": 1.9405999183654785,
"learning_rate": 3.703225649623511e-06,
"loss": 0.3897,
"step": 3808
},
{
"epoch": 1.23,
"grad_norm": 2.333876371383667,
"learning_rate": 3.677029215460935e-06,
"loss": 0.3575,
"step": 3840
},
{
"epoch": 1.24,
"grad_norm": 2.2306437492370605,
"learning_rate": 3.6506656700405045e-06,
"loss": 0.3566,
"step": 3872
},
{
"epoch": 1.25,
"grad_norm": 2.2080612182617188,
"learning_rate": 3.624138756383361e-06,
"loss": 0.3779,
"step": 3904
},
{
"epoch": 1.26,
"grad_norm": 1.5209364891052246,
"learning_rate": 3.5974522407052013e-06,
"loss": 0.3659,
"step": 3936
},
{
"epoch": 1.27,
"grad_norm": 1.905605435371399,
"learning_rate": 3.570609911881566e-06,
"loss": 0.3704,
"step": 3968
},
{
"epoch": 1.28,
"grad_norm": 1.9363412857055664,
"learning_rate": 3.543615580909898e-06,
"loss": 0.3752,
"step": 4000
},
{
"epoch": 1.29,
"grad_norm": 1.6711021661758423,
"learning_rate": 3.516473080368478e-06,
"loss": 0.3706,
"step": 4032
},
{
"epoch": 1.3,
"grad_norm": 2.128994941711426,
"learning_rate": 3.489186263872275e-06,
"loss": 0.3695,
"step": 4064
},
{
"epoch": 1.31,
"grad_norm": 1.7105140686035156,
"learning_rate": 3.461759005525831e-06,
"loss": 0.3701,
"step": 4096
},
{
"epoch": 1.32,
"grad_norm": 1.8307222127914429,
"learning_rate": 3.43419519937322e-06,
"loss": 0.3577,
"step": 4128
},
{
"epoch": 1.33,
"grad_norm": 1.8442600965499878,
"learning_rate": 3.406498758845184e-06,
"loss": 0.3706,
"step": 4160
},
{
"epoch": 1.34,
"grad_norm": 1.8921831846237183,
"learning_rate": 3.3786736162035187e-06,
"loss": 0.364,
"step": 4192
},
{
"epoch": 1.35,
"grad_norm": 1.7511910200119019,
"learning_rate": 3.3507237219827784e-06,
"loss": 0.3488,
"step": 4224
},
{
"epoch": 1.36,
"grad_norm": 1.3822124004364014,
"learning_rate": 3.3226530444293893e-06,
"loss": 0.3628,
"step": 4256
},
{
"epoch": 1.37,
"grad_norm": 2.302302837371826,
"learning_rate": 3.2944655689382554e-06,
"loss": 0.36,
"step": 4288
},
{
"epoch": 1.38,
"grad_norm": 1.6606963872909546,
"learning_rate": 3.2661652974869164e-06,
"loss": 0.3487,
"step": 4320
},
{
"epoch": 1.39,
"grad_norm": 2.009692668914795,
"learning_rate": 3.2377562480673623e-06,
"loss": 0.3789,
"step": 4352
},
{
"epoch": 1.4,
"grad_norm": 2.011293649673462,
"learning_rate": 3.20924245411557e-06,
"loss": 0.3669,
"step": 4384
},
{
"epoch": 1.41,
"grad_norm": 2.3706395626068115,
"learning_rate": 3.180627963938847e-06,
"loss": 0.3698,
"step": 4416
},
{
"epoch": 1.42,
"grad_norm": 2.0138208866119385,
"learning_rate": 3.1519168401410627e-06,
"loss": 0.355,
"step": 4448
},
{
"epoch": 1.43,
"grad_norm": 1.6110373735427856,
"learning_rate": 3.123113159045854e-06,
"loss": 0.3519,
"step": 4480
},
{
"epoch": 1.44,
"grad_norm": 1.9039591550827026,
"learning_rate": 3.09422101011788e-06,
"loss": 0.3677,
"step": 4512
},
{
"epoch": 1.45,
"grad_norm": 1.9409842491149902,
"learning_rate": 3.0652444953822097e-06,
"loss": 0.3476,
"step": 4544
},
{
"epoch": 1.46,
"grad_norm": 1.5499393939971924,
"learning_rate": 3.0361877288419306e-06,
"loss": 0.3642,
"step": 4576
},
{
"epoch": 1.47,
"grad_norm": 1.5498781204223633,
"learning_rate": 3.0070548358940523e-06,
"loss": 0.3811,
"step": 4608
},
{
"epoch": 1.48,
"grad_norm": 1.8059883117675781,
"learning_rate": 2.9778499527437932e-06,
"loss": 0.3741,
"step": 4640
},
{
"epoch": 1.5,
"grad_norm": 1.4482682943344116,
"learning_rate": 2.9485772258173405e-06,
"loss": 0.3674,
"step": 4672
},
{
"epoch": 1.51,
"grad_norm": 2.086606502532959,
"learning_rate": 2.919240811173143e-06,
"loss": 0.3621,
"step": 4704
},
{
"epoch": 1.52,
"grad_norm": 1.6022703647613525,
"learning_rate": 2.8898448739118533e-06,
"loss": 0.3651,
"step": 4736
},
{
"epoch": 1.53,
"grad_norm": 1.6419217586517334,
"learning_rate": 2.8603935875849744e-06,
"loss": 0.3709,
"step": 4768
},
{
"epoch": 1.54,
"grad_norm": 1.490946888923645,
"learning_rate": 2.830891133602311e-06,
"loss": 0.3408,
"step": 4800
},
{
"epoch": 1.55,
"grad_norm": 1.644801378250122,
"learning_rate": 2.8013417006383078e-06,
"loss": 0.3408,
"step": 4832
},
{
"epoch": 1.56,
"grad_norm": 1.872612714767456,
"learning_rate": 2.771749484037352e-06,
"loss": 0.3633,
"step": 4864
},
{
"epoch": 1.57,
"grad_norm": 2.0236546993255615,
"learning_rate": 2.7421186852181282e-06,
"loss": 0.3504,
"step": 4896
},
{
"epoch": 1.58,
"grad_norm": 1.8779181241989136,
"learning_rate": 2.7124535110771155e-06,
"loss": 0.3757,
"step": 4928
},
{
"epoch": 1.59,
"grad_norm": 1.8941439390182495,
"learning_rate": 2.6827581733913027e-06,
"loss": 0.3581,
"step": 4960
},
{
"epoch": 1.6,
"grad_norm": 1.814450740814209,
"learning_rate": 2.6530368882202127e-06,
"loss": 0.3623,
"step": 4992
},
{
"epoch": 1.61,
"grad_norm": 2.031270980834961,
"learning_rate": 2.623293875307319e-06,
"loss": 0.3712,
"step": 5024
},
{
"epoch": 1.62,
"grad_norm": 1.736070156097412,
"learning_rate": 2.5935333574809385e-06,
"loss": 0.3663,
"step": 5056
},
{
"epoch": 1.63,
"grad_norm": 1.778670072555542,
"learning_rate": 2.5637595600546855e-06,
"loss": 0.3683,
"step": 5088
},
{
"epoch": 1.64,
"grad_norm": 1.6772242784500122,
"learning_rate": 2.533976710227574e-06,
"loss": 0.3687,
"step": 5120
},
{
"epoch": 1.65,
"grad_norm": 1.56125807762146,
"learning_rate": 2.504189036483851e-06,
"loss": 0.3556,
"step": 5152
},
{
"epoch": 1.66,
"grad_norm": 1.861116647720337,
"learning_rate": 2.4744007679926514e-06,
"loss": 0.3567,
"step": 5184
},
{
"epoch": 1.67,
"grad_norm": 2.2606282234191895,
"learning_rate": 2.444616134007549e-06,
"loss": 0.3358,
"step": 5216
},
{
"epoch": 1.68,
"grad_norm": 2.367314100265503,
"learning_rate": 2.4148393632661033e-06,
"loss": 0.3554,
"step": 5248
},
{
"epoch": 1.69,
"grad_norm": 2.0420873165130615,
"learning_rate": 2.385074683389469e-06,
"loss": 0.3627,
"step": 5280
},
{
"epoch": 1.7,
"grad_norm": 2.024829864501953,
"learning_rate": 2.3553263202821775e-06,
"loss": 0.3581,
"step": 5312
},
{
"epoch": 1.71,
"grad_norm": 2.053925037384033,
"learning_rate": 2.3255984975321503e-06,
"loss": 0.3531,
"step": 5344
},
{
"epoch": 1.72,
"grad_norm": 1.7462561130523682,
"learning_rate": 2.2958954358110467e-06,
"loss": 0.3574,
"step": 5376
},
{
"epoch": 1.73,
"grad_norm": 1.6682531833648682,
"learning_rate": 2.266221352275029e-06,
"loss": 0.3675,
"step": 5408
},
{
"epoch": 1.74,
"grad_norm": 1.618286371231079,
"learning_rate": 2.2365804599660147e-06,
"loss": 0.361,
"step": 5440
},
{
"epoch": 1.75,
"grad_norm": 2.0060200691223145,
"learning_rate": 2.2069769672135283e-06,
"loss": 0.3491,
"step": 5472
},
{
"epoch": 1.76,
"grad_norm": 1.9721282720565796,
"learning_rate": 2.1774150770372106e-06,
"loss": 0.3621,
"step": 5504
},
{
"epoch": 1.77,
"grad_norm": 1.5395954847335815,
"learning_rate": 2.147898986550087e-06,
"loss": 0.3621,
"step": 5536
},
{
"epoch": 1.78,
"grad_norm": 1.6971800327301025,
"learning_rate": 2.1184328863626754e-06,
"loss": 0.3637,
"step": 5568
},
{
"epoch": 1.79,
"grad_norm": 1.3535592555999756,
"learning_rate": 2.089020959988009e-06,
"loss": 0.3536,
"step": 5600
},
{
"epoch": 1.8,
"grad_norm": 1.4577397108078003,
"learning_rate": 2.059667383247683e-06,
"loss": 0.3368,
"step": 5632
},
{
"epoch": 1.81,
"grad_norm": 1.9268568754196167,
"learning_rate": 2.0303763236789717e-06,
"loss": 0.3653,
"step": 5664
},
{
"epoch": 1.82,
"grad_norm": 1.6601406335830688,
"learning_rate": 2.001151939943144e-06,
"loss": 0.3661,
"step": 5696
},
{
"epoch": 1.83,
"grad_norm": 1.9766888618469238,
"learning_rate": 1.9719983812350193e-06,
"loss": 0.369,
"step": 5728
},
{
"epoch": 1.84,
"grad_norm": 1.538190484046936,
"learning_rate": 1.942919786693886e-06,
"loss": 0.3582,
"step": 5760
},
{
"epoch": 1.85,
"grad_norm": 2.4176807403564453,
"learning_rate": 1.913920284815831e-06,
"loss": 0.3556,
"step": 5792
},
{
"epoch": 1.86,
"grad_norm": 2.0340559482574463,
"learning_rate": 1.8850039928675898e-06,
"loss": 0.3705,
"step": 5824
},
{
"epoch": 1.87,
"grad_norm": 1.6979023218154907,
"learning_rate": 1.8561750163019896e-06,
"loss": 0.3571,
"step": 5856
},
{
"epoch": 1.88,
"grad_norm": 1.6448854207992554,
"learning_rate": 1.8274374481750662e-06,
"loss": 0.3598,
"step": 5888
},
{
"epoch": 1.89,
"grad_norm": 1.9256216287612915,
"learning_rate": 1.7987953685649485e-06,
"loss": 0.3704,
"step": 5920
},
{
"epoch": 1.9,
"grad_norm": 1.91543710231781,
"learning_rate": 1.7702528439925767e-06,
"loss": 0.3599,
"step": 5952
},
{
"epoch": 1.91,
"grad_norm": 1.8443219661712646,
"learning_rate": 1.7418139268443482e-06,
"loss": 0.3557,
"step": 5984
},
{
"epoch": 1.93,
"grad_norm": 1.9858685731887817,
"learning_rate": 1.7134826547967757e-06,
"loss": 0.3562,
"step": 6016
},
{
"epoch": 1.94,
"grad_norm": 1.5424166917800903,
"learning_rate": 1.6852630502432238e-06,
"loss": 0.3554,
"step": 6048
},
{
"epoch": 1.95,
"grad_norm": 1.9284968376159668,
"learning_rate": 1.6571591197228285e-06,
"loss": 0.3618,
"step": 6080
},
{
"epoch": 1.96,
"grad_norm": 1.8226544857025146,
"learning_rate": 1.629174853351651e-06,
"loss": 0.3528,
"step": 6112
},
{
"epoch": 1.97,
"grad_norm": 1.8396884202957153,
"learning_rate": 1.6013142242561813e-06,
"loss": 0.3507,
"step": 6144
},
{
"epoch": 1.98,
"grad_norm": 1.8388822078704834,
"learning_rate": 1.5735811880092394e-06,
"loss": 0.3611,
"step": 6176
},
{
"epoch": 1.99,
"grad_norm": 1.725791573524475,
"learning_rate": 1.5459796820683737e-06,
"loss": 0.3411,
"step": 6208
},
{
"epoch": 2.0,
"grad_norm": 2.2812933921813965,
"learning_rate": 1.518513625216838e-06,
"loss": 0.3445,
"step": 6240
},
{
"epoch": 2.01,
"grad_norm": 1.6806988716125488,
"learning_rate": 1.491186917007206e-06,
"loss": 0.3137,
"step": 6272
},
{
"epoch": 2.02,
"grad_norm": 1.8829107284545898,
"learning_rate": 1.4640034372077322e-06,
"loss": 0.2856,
"step": 6304
},
{
"epoch": 2.03,
"grad_norm": 1.9766901731491089,
"learning_rate": 1.4369670452515044e-06,
"loss": 0.2604,
"step": 6336
},
{
"epoch": 2.04,
"grad_norm": 2.055413007736206,
"learning_rate": 1.4100815796884998e-06,
"loss": 0.2694,
"step": 6368
},
{
"epoch": 2.05,
"grad_norm": 1.782673954963684,
"learning_rate": 1.3833508576405974e-06,
"loss": 0.2694,
"step": 6400
},
{
"epoch": 2.06,
"grad_norm": 2.202526807785034,
"learning_rate": 1.3567786742596283e-06,
"loss": 0.2621,
"step": 6432
},
{
"epoch": 2.07,
"grad_norm": 2.1951637268066406,
"learning_rate": 1.3303688021885575e-06,
"loss": 0.2789,
"step": 6464
},
{
"epoch": 2.08,
"grad_norm": 1.7499464750289917,
"learning_rate": 1.304124991025852e-06,
"loss": 0.2621,
"step": 6496
},
{
"epoch": 2.09,
"grad_norm": 1.8003720045089722,
"learning_rate": 1.2780509667931217e-06,
"loss": 0.2715,
"step": 6528
},
{
"epoch": 2.1,
"grad_norm": 1.9713287353515625,
"learning_rate": 1.2521504314061173e-06,
"loss": 0.2717,
"step": 6560
},
{
"epoch": 2.11,
"grad_norm": 1.9567084312438965,
"learning_rate": 1.2264270621491286e-06,
"loss": 0.274,
"step": 6592
},
{
"epoch": 2.12,
"grad_norm": 2.075596809387207,
"learning_rate": 1.2008845111529088e-06,
"loss": 0.2612,
"step": 6624
},
{
"epoch": 2.13,
"grad_norm": 2.1604607105255127,
"learning_rate": 1.1755264048761464e-06,
"loss": 0.2704,
"step": 6656
},
{
"epoch": 2.14,
"grad_norm": 2.0567729473114014,
"learning_rate": 1.1503563435905943e-06,
"loss": 0.2711,
"step": 6688
},
{
"epoch": 2.15,
"grad_norm": 2.0198721885681152,
"learning_rate": 1.1253779008699131e-06,
"loss": 0.2699,
"step": 6720
},
{
"epoch": 2.16,
"grad_norm": 2.6587815284729004,
"learning_rate": 1.100594623082303e-06,
"loss": 0.2647,
"step": 6752
},
{
"epoch": 2.17,
"grad_norm": 1.6927788257598877,
"learning_rate": 1.0760100288870077e-06,
"loss": 0.2648,
"step": 6784
},
{
"epoch": 2.18,
"grad_norm": 2.316779613494873,
"learning_rate": 1.051627608734733e-06,
"loss": 0.276,
"step": 6816
},
{
"epoch": 2.19,
"grad_norm": 1.5877448320388794,
"learning_rate": 1.027450824372094e-06,
"loss": 0.2622,
"step": 6848
},
{
"epoch": 2.2,
"grad_norm": 2.0945749282836914,
"learning_rate": 1.0034831083501206e-06,
"loss": 0.2597,
"step": 6880
},
{
"epoch": 2.21,
"grad_norm": 2.4265644550323486,
"learning_rate": 9.797278635369137e-07,
"loss": 0.258,
"step": 6912
},
{
"epoch": 2.22,
"grad_norm": 1.770364761352539,
"learning_rate": 9.561884626345206e-07,
"loss": 0.2624,
"step": 6944
},
{
"epoch": 2.23,
"grad_norm": 2.046229839324951,
"learning_rate": 9.328682477000789e-07,
"loss": 0.2618,
"step": 6976
},
{
"epoch": 2.24,
"grad_norm": 2.9548141956329346,
"learning_rate": 9.097705296713297e-07,
"loss": 0.2697,
"step": 7008
},
{
"epoch": 2.25,
"grad_norm": 2.126011848449707,
"learning_rate": 8.868985878965366e-07,
"loss": 0.2661,
"step": 7040
},
{
"epoch": 2.26,
"grad_norm": 2.0025265216827393,
"learning_rate": 8.642556696688922e-07,
"loss": 0.2533,
"step": 7072
},
{
"epoch": 2.27,
"grad_norm": 1.7288347482681274,
"learning_rate": 8.41844989765479e-07,
"loss": 0.2644,
"step": 7104
},
{
"epoch": 2.28,
"grad_norm": 1.7986013889312744,
"learning_rate": 8.196697299908424e-07,
"loss": 0.2684,
"step": 7136
},
{
"epoch": 2.29,
"grad_norm": 2.160757541656494,
"learning_rate": 7.977330387252477e-07,
"loss": 0.2807,
"step": 7168
},
{
"epoch": 2.3,
"grad_norm": 2.1614770889282227,
"learning_rate": 7.760380304776832e-07,
"loss": 0.2801,
"step": 7200
},
{
"epoch": 2.31,
"grad_norm": 2.361065149307251,
"learning_rate": 7.545877854436698e-07,
"loss": 0.2604,
"step": 7232
},
{
"epoch": 2.32,
"grad_norm": 1.8524858951568604,
"learning_rate": 7.333853490679435e-07,
"loss": 0.2776,
"step": 7264
},
{
"epoch": 2.33,
"grad_norm": 1.9294536113739014,
"learning_rate": 7.124337316120735e-07,
"loss": 0.2795,
"step": 7296
},
{
"epoch": 2.34,
"grad_norm": 1.9421364068984985,
"learning_rate": 6.917359077270716e-07,
"loss": 0.2635,
"step": 7328
},
{
"epoch": 2.36,
"grad_norm": 1.9388601779937744,
"learning_rate": 6.712948160310612e-07,
"loss": 0.2571,
"step": 7360
},
{
"epoch": 2.37,
"grad_norm": 1.906515121459961,
"learning_rate": 6.511133586920601e-07,
"loss": 0.2634,
"step": 7392
},
{
"epoch": 2.38,
"grad_norm": 1.894484043121338,
"learning_rate": 6.311944010159394e-07,
"loss": 0.2725,
"step": 7424
},
{
"epoch": 2.39,
"grad_norm": 2.0004079341888428,
"learning_rate": 6.115407710396145e-07,
"loss": 0.2565,
"step": 7456
},
{
"epoch": 2.4,
"grad_norm": 2.3763046264648438,
"learning_rate": 5.921552591295304e-07,
"loss": 0.2634,
"step": 7488
},
{
"epoch": 2.41,
"grad_norm": 2.0313122272491455,
"learning_rate": 5.730406175854908e-07,
"loss": 0.2668,
"step": 7520
},
{
"epoch": 2.42,
"grad_norm": 2.0953962802886963,
"learning_rate": 5.54199560249897e-07,
"loss": 0.2708,
"step": 7552
},
{
"epoch": 2.43,
"grad_norm": 2.3282768726348877,
"learning_rate": 5.35634762122442e-07,
"loss": 0.2639,
"step": 7584
},
{
"epoch": 2.44,
"grad_norm": 2.0609822273254395,
"learning_rate": 5.173488589803238e-07,
"loss": 0.2691,
"step": 7616
},
{
"epoch": 2.45,
"grad_norm": 1.9901596307754517,
"learning_rate": 4.993444470040234e-07,
"loss": 0.2646,
"step": 7648
},
{
"epoch": 2.46,
"grad_norm": 1.9838128089904785,
"learning_rate": 4.816240824087076e-07,
"loss": 0.2665,
"step": 7680
},
{
"epoch": 2.47,
"grad_norm": 2.1872060298919678,
"learning_rate": 4.6419028108130456e-07,
"loss": 0.2665,
"step": 7712
},
{
"epoch": 2.48,
"grad_norm": 1.5998047590255737,
"learning_rate": 4.470455182233052e-07,
"loss": 0.2539,
"step": 7744
},
{
"epoch": 2.49,
"grad_norm": 1.842804193496704,
"learning_rate": 4.3019222799934117e-07,
"loss": 0.2562,
"step": 7776
},
{
"epoch": 2.5,
"grad_norm": 2.4707839488983154,
"learning_rate": 4.1363280319158925e-07,
"loss": 0.2625,
"step": 7808
},
{
"epoch": 2.51,
"grad_norm": 2.171933174133301,
"learning_rate": 3.973695948600512e-07,
"loss": 0.2663,
"step": 7840
},
{
"epoch": 2.52,
"grad_norm": 2.098424196243286,
"learning_rate": 3.8140491200875567e-07,
"loss": 0.266,
"step": 7872
},
{
"epoch": 2.53,
"grad_norm": 2.3341708183288574,
"learning_rate": 3.6574102125793433e-07,
"loss": 0.2568,
"step": 7904
},
{
"epoch": 2.54,
"grad_norm": 2.1890530586242676,
"learning_rate": 3.50380146522212e-07,
"loss": 0.2735,
"step": 7936
},
{
"epoch": 2.55,
"grad_norm": 1.7215001583099365,
"learning_rate": 3.3532446869486255e-07,
"loss": 0.2553,
"step": 7968
},
{
"epoch": 2.56,
"grad_norm": 2.284882068634033,
"learning_rate": 3.205761253381706e-07,
"loss": 0.2622,
"step": 8000
},
{
"epoch": 2.57,
"grad_norm": 2.0944039821624756,
"learning_rate": 3.061372103799487e-07,
"loss": 0.2687,
"step": 8032
},
{
"epoch": 2.58,
"grad_norm": 1.7873029708862305,
"learning_rate": 2.920097738162453e-07,
"loss": 0.2609,
"step": 8064
},
{
"epoch": 2.59,
"grad_norm": 1.9296663999557495,
"learning_rate": 2.781958214202918e-07,
"loss": 0.2711,
"step": 8096
},
{
"epoch": 2.6,
"grad_norm": 2.2704007625579834,
"learning_rate": 2.646973144577325e-07,
"loss": 0.2591,
"step": 8128
},
{
"epoch": 2.61,
"grad_norm": 2.158012628555298,
"learning_rate": 2.515161694081647e-07,
"loss": 0.2572,
"step": 8160
},
{
"epoch": 2.62,
"grad_norm": 2.1333682537078857,
"learning_rate": 2.386542576930456e-07,
"loss": 0.2707,
"step": 8192
},
{
"epoch": 2.63,
"grad_norm": 2.117429733276367,
"learning_rate": 2.261134054099917e-07,
"loss": 0.2548,
"step": 8224
},
{
"epoch": 2.64,
"grad_norm": 1.9128592014312744,
"learning_rate": 2.1389539307351547e-07,
"loss": 0.2694,
"step": 8256
},
{
"epoch": 2.65,
"grad_norm": 2.2764246463775635,
"learning_rate": 2.0200195536223267e-07,
"loss": 0.2689,
"step": 8288
},
{
"epoch": 2.66,
"grad_norm": 1.794533371925354,
"learning_rate": 1.9043478087257623e-07,
"loss": 0.2625,
"step": 8320
},
{
"epoch": 2.67,
"grad_norm": 1.9093414545059204,
"learning_rate": 1.7919551187905837e-07,
"loss": 0.2612,
"step": 8352
},
{
"epoch": 2.68,
"grad_norm": 1.827714443206787,
"learning_rate": 1.6828574410110016e-07,
"loss": 0.2603,
"step": 8384
},
{
"epoch": 2.69,
"grad_norm": 2.5951650142669678,
"learning_rate": 1.5770702647647823e-07,
"loss": 0.2654,
"step": 8416
},
{
"epoch": 2.7,
"grad_norm": 2.659973621368408,
"learning_rate": 1.474608609414113e-07,
"loss": 0.2644,
"step": 8448
},
{
"epoch": 2.71,
"grad_norm": 2.4667563438415527,
"learning_rate": 1.3754870221731775e-07,
"loss": 0.2583,
"step": 8480
},
{
"epoch": 2.72,
"grad_norm": 1.7934099435806274,
"learning_rate": 1.2797195760428093e-07,
"loss": 0.2624,
"step": 8512
},
{
"epoch": 2.73,
"grad_norm": 2.117466926574707,
"learning_rate": 1.1873198678124309e-07,
"loss": 0.2452,
"step": 8544
},
{
"epoch": 2.74,
"grad_norm": 1.7996597290039062,
"learning_rate": 1.0983010161296215e-07,
"loss": 0.2512,
"step": 8576
},
{
"epoch": 2.75,
"grad_norm": 1.920240879058838,
"learning_rate": 1.0126756596375687e-07,
"loss": 0.2631,
"step": 8608
},
{
"epoch": 2.76,
"grad_norm": 2.1029436588287354,
"learning_rate": 9.304559551806675e-08,
"loss": 0.2703,
"step": 8640
},
{
"epoch": 2.78,
"grad_norm": 2.1418569087982178,
"learning_rate": 8.516535760785455e-08,
"loss": 0.2621,
"step": 8672
},
{
"epoch": 2.79,
"grad_norm": 2.1141951084136963,
"learning_rate": 7.762797104686858e-08,
"loss": 0.248,
"step": 8704
},
{
"epoch": 2.8,
"grad_norm": 2.1870038509368896,
"learning_rate": 7.043450597179979e-08,
"loss": 0.2679,
"step": 8736
},
{
"epoch": 2.81,
"grad_norm": 2.4698402881622314,
"learning_rate": 6.358598369034518e-08,
"loss": 0.2649,
"step": 8768
},
{
"epoch": 2.82,
"grad_norm": 1.8242698907852173,
"learning_rate": 5.7083376536204436e-08,
"loss": 0.268,
"step": 8800
},
{
"epoch": 2.83,
"grad_norm": 2.2052414417266846,
"learning_rate": 5.092760773103417e-08,
"loss": 0.2625,
"step": 8832
},
{
"epoch": 2.84,
"grad_norm": 2.050403118133545,
"learning_rate": 4.511955125336726e-08,
"loss": 0.2668,
"step": 8864
},
{
"epoch": 2.85,
"grad_norm": 1.8949859142303467,
"learning_rate": 3.966003171453181e-08,
"loss": 0.2615,
"step": 8896
},
{
"epoch": 2.86,
"grad_norm": 1.904829740524292,
"learning_rate": 3.4549824241572326e-08,
"loss": 0.2522,
"step": 8928
},
{
"epoch": 2.87,
"grad_norm": 1.9748365879058838,
"learning_rate": 2.9789654367200492e-08,
"loss": 0.2637,
"step": 8960
},
{
"epoch": 2.88,
"grad_norm": 1.9434069395065308,
"learning_rate": 2.538019792678703e-08,
"loss": 0.2555,
"step": 8992
},
{
"epoch": 2.89,
"grad_norm": 1.8021697998046875,
"learning_rate": 2.1322080962405677e-08,
"loss": 0.2479,
"step": 9024
},
{
"epoch": 2.9,
"grad_norm": 1.9932126998901367,
"learning_rate": 1.7615879633953724e-08,
"loss": 0.2724,
"step": 9056
},
{
"epoch": 2.91,
"grad_norm": 2.393543243408203,
"learning_rate": 1.4262120137345791e-08,
"loss": 0.2662,
"step": 9088
},
{
"epoch": 2.92,
"grad_norm": 2.3378348350524902,
"learning_rate": 1.1261278629810246e-08,
"loss": 0.2719,
"step": 9120
},
{
"epoch": 2.93,
"grad_norm": 1.788970947265625,
"learning_rate": 8.613781162282731e-09,
"loss": 0.2573,
"step": 9152
},
{
"epoch": 2.94,
"grad_norm": 1.7528036832809448,
"learning_rate": 6.32000361891788e-09,
"loss": 0.2645,
"step": 9184
},
{
"epoch": 2.95,
"grad_norm": 2.0878617763519287,
"learning_rate": 4.380271663723401e-09,
"loss": 0.2552,
"step": 9216
},
{
"epoch": 2.96,
"grad_norm": 1.8262449502944946,
"learning_rate": 2.794860694320389e-09,
"loss": 0.2565,
"step": 9248
},
{
"epoch": 2.97,
"grad_norm": 1.9307301044464111,
"learning_rate": 1.5639958028462742e-09,
"loss": 0.2743,
"step": 9280
},
{
"epoch": 2.98,
"grad_norm": 1.9267100095748901,
"learning_rate": 6.878517439948274e-10,
"loss": 0.2629,
"step": 9312
},
{
"epoch": 2.99,
"grad_norm": 1.860356330871582,
"learning_rate": 1.6655291020573062e-10,
"loss": 0.2593,
"step": 9344
}
],
"logging_steps": 32,
"max_steps": 9375,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 3125,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}