| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 12699, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.007559650366170565, |
| "grad_norm": 30.625, |
| "learning_rate": 2.4409448818897637e-07, |
| "loss": 0.997, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.01511930073234113, |
| "grad_norm": 22.75, |
| "learning_rate": 4.960629921259843e-07, |
| "loss": 1.0127, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.022678951098511695, |
| "grad_norm": 15.875, |
| "learning_rate": 7.480314960629922e-07, |
| "loss": 0.9785, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.03023860146468226, |
| "grad_norm": 19.125, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.8948, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.03779825183085282, |
| "grad_norm": 13.25, |
| "learning_rate": 1.251968503937008e-06, |
| "loss": 0.9001, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.04535790219702339, |
| "grad_norm": 10.0, |
| "learning_rate": 1.5039370078740158e-06, |
| "loss": 0.9018, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.05291755256319395, |
| "grad_norm": 9.5, |
| "learning_rate": 1.7559055118110239e-06, |
| "loss": 0.8796, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.06047720292936452, |
| "grad_norm": 9.8125, |
| "learning_rate": 2.007874015748032e-06, |
| "loss": 0.7863, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.06803685329553508, |
| "grad_norm": 13.125, |
| "learning_rate": 2.2598425196850397e-06, |
| "loss": 0.7503, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.07559650366170564, |
| "grad_norm": 7.46875, |
| "learning_rate": 2.511811023622047e-06, |
| "loss": 0.7361, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.08315615402787621, |
| "grad_norm": 10.125, |
| "learning_rate": 2.763779527559055e-06, |
| "loss": 0.7236, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.09071580439404678, |
| "grad_norm": 8.6875, |
| "learning_rate": 3.015748031496063e-06, |
| "loss": 0.7074, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.09827545476021735, |
| "grad_norm": 10.0625, |
| "learning_rate": 3.267716535433071e-06, |
| "loss": 0.7418, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.1058351051263879, |
| "grad_norm": 17.25, |
| "learning_rate": 3.519685039370079e-06, |
| "loss": 0.6522, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.11339475549255847, |
| "grad_norm": 4.8125, |
| "learning_rate": 3.7716535433070867e-06, |
| "loss": 0.6596, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.12095440585872903, |
| "grad_norm": 9.375, |
| "learning_rate": 4.0236220472440945e-06, |
| "loss": 0.6762, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.1285140562248996, |
| "grad_norm": 7.6875, |
| "learning_rate": 4.275590551181103e-06, |
| "loss": 0.6657, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.13607370659107015, |
| "grad_norm": 11.4375, |
| "learning_rate": 4.52755905511811e-06, |
| "loss": 0.5974, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.14363335695724072, |
| "grad_norm": 6.3125, |
| "learning_rate": 4.779527559055118e-06, |
| "loss": 0.6177, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.1511930073234113, |
| "grad_norm": 16.75, |
| "learning_rate": 5.031496062992126e-06, |
| "loss": 0.5999, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.15875265768958186, |
| "grad_norm": 11.625, |
| "learning_rate": 5.283464566929134e-06, |
| "loss": 0.5413, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.16631230805575242, |
| "grad_norm": 6.46875, |
| "learning_rate": 5.535433070866142e-06, |
| "loss": 0.5589, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.173871958421923, |
| "grad_norm": 8.6875, |
| "learning_rate": 5.78740157480315e-06, |
| "loss": 0.5491, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.18143160878809356, |
| "grad_norm": 15.375, |
| "learning_rate": 6.039370078740158e-06, |
| "loss": 0.523, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.18899125915426412, |
| "grad_norm": 6.875, |
| "learning_rate": 6.291338582677165e-06, |
| "loss": 0.5444, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.1965509095204347, |
| "grad_norm": 24.375, |
| "learning_rate": 6.543307086614174e-06, |
| "loss": 0.5077, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.20411055988660523, |
| "grad_norm": 7.21875, |
| "learning_rate": 6.795275590551181e-06, |
| "loss": 0.4579, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.2116702102527758, |
| "grad_norm": 11.5625, |
| "learning_rate": 7.047244094488189e-06, |
| "loss": 0.4787, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.21922986061894637, |
| "grad_norm": 7.84375, |
| "learning_rate": 7.2992125984251975e-06, |
| "loss": 0.4627, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.22678951098511693, |
| "grad_norm": 7.53125, |
| "learning_rate": 7.551181102362205e-06, |
| "loss": 0.4694, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.2343491613512875, |
| "grad_norm": 9.6875, |
| "learning_rate": 7.803149606299213e-06, |
| "loss": 0.4959, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.24190881171745807, |
| "grad_norm": 9.4375, |
| "learning_rate": 8.055118110236221e-06, |
| "loss": 0.4708, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.24946846208362863, |
| "grad_norm": 8.875, |
| "learning_rate": 8.307086614173228e-06, |
| "loss": 0.4513, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.2570281124497992, |
| "grad_norm": 6.9375, |
| "learning_rate": 8.559055118110236e-06, |
| "loss": 0.4668, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.26458776281596974, |
| "grad_norm": 10.8125, |
| "learning_rate": 8.811023622047244e-06, |
| "loss": 0.4413, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.2721474131821403, |
| "grad_norm": 6.78125, |
| "learning_rate": 9.062992125984253e-06, |
| "loss": 0.4229, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.2797070635483109, |
| "grad_norm": 8.3125, |
| "learning_rate": 9.314960629921261e-06, |
| "loss": 0.4487, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.28726671391448144, |
| "grad_norm": 7.28125, |
| "learning_rate": 9.566929133858268e-06, |
| "loss": 0.4379, |
| "step": 1216 |
| }, |
| { |
| "epoch": 0.294826364280652, |
| "grad_norm": 7.375, |
| "learning_rate": 9.818897637795276e-06, |
| "loss": 0.4577, |
| "step": 1248 |
| }, |
| { |
| "epoch": 0.3023860146468226, |
| "grad_norm": 7.71875, |
| "learning_rate": 9.999984699413235e-06, |
| "loss": 0.4738, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.30994566501299314, |
| "grad_norm": 4.875, |
| "learning_rate": 9.999682468799545e-06, |
| "loss": 0.4452, |
| "step": 1312 |
| }, |
| { |
| "epoch": 0.3175053153791637, |
| "grad_norm": 8.3125, |
| "learning_rate": 9.998993405770503e-06, |
| "loss": 0.4617, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.3250649657453343, |
| "grad_norm": 7.21875, |
| "learning_rate": 9.997917563639873e-06, |
| "loss": 0.4511, |
| "step": 1376 |
| }, |
| { |
| "epoch": 0.33262461611150484, |
| "grad_norm": 5.78125, |
| "learning_rate": 9.99645502564707e-06, |
| "loss": 0.4646, |
| "step": 1408 |
| }, |
| { |
| "epoch": 0.3401842664776754, |
| "grad_norm": 7.21875, |
| "learning_rate": 9.994605904950693e-06, |
| "loss": 0.4219, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.347743916843846, |
| "grad_norm": 8.75, |
| "learning_rate": 9.992370344619799e-06, |
| "loss": 0.4755, |
| "step": 1472 |
| }, |
| { |
| "epoch": 0.35530356721001655, |
| "grad_norm": 5.59375, |
| "learning_rate": 9.98974851762281e-06, |
| "loss": 0.4415, |
| "step": 1504 |
| }, |
| { |
| "epoch": 0.3628632175761871, |
| "grad_norm": 10.8125, |
| "learning_rate": 9.986740626814144e-06, |
| "loss": 0.4628, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.3704228679423577, |
| "grad_norm": 5.71875, |
| "learning_rate": 9.983346904918514e-06, |
| "loss": 0.4241, |
| "step": 1568 |
| }, |
| { |
| "epoch": 0.37798251830852825, |
| "grad_norm": 7.40625, |
| "learning_rate": 9.97956761451292e-06, |
| "loss": 0.4183, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3855421686746988, |
| "grad_norm": 4.71875, |
| "learning_rate": 9.975403048006342e-06, |
| "loss": 0.46, |
| "step": 1632 |
| }, |
| { |
| "epoch": 0.3931018190408694, |
| "grad_norm": 6.625, |
| "learning_rate": 9.970853527617105e-06, |
| "loss": 0.4349, |
| "step": 1664 |
| }, |
| { |
| "epoch": 0.40066146940703995, |
| "grad_norm": 11.3125, |
| "learning_rate": 9.965919405347958e-06, |
| "loss": 0.4359, |
| "step": 1696 |
| }, |
| { |
| "epoch": 0.40822111977321046, |
| "grad_norm": 6.5625, |
| "learning_rate": 9.960601062958833e-06, |
| "loss": 0.4228, |
| "step": 1728 |
| }, |
| { |
| "epoch": 0.41578077013938103, |
| "grad_norm": 6.75, |
| "learning_rate": 9.954898911937302e-06, |
| "loss": 0.4545, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.4233404205055516, |
| "grad_norm": 7.375, |
| "learning_rate": 9.94881339346676e-06, |
| "loss": 0.4468, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.43090007087172216, |
| "grad_norm": 7.0625, |
| "learning_rate": 9.942344978392267e-06, |
| "loss": 0.4355, |
| "step": 1824 |
| }, |
| { |
| "epoch": 0.43845972123789273, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.935494167184133e-06, |
| "loss": 0.4563, |
| "step": 1856 |
| }, |
| { |
| "epoch": 0.4460193716040633, |
| "grad_norm": 7.34375, |
| "learning_rate": 9.928261489899187e-06, |
| "loss": 0.4273, |
| "step": 1888 |
| }, |
| { |
| "epoch": 0.45357902197023386, |
| "grad_norm": 6.40625, |
| "learning_rate": 9.920647506139774e-06, |
| "loss": 0.4072, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.46113867233640443, |
| "grad_norm": 5.25, |
| "learning_rate": 9.912652805010451e-06, |
| "loss": 0.4216, |
| "step": 1952 |
| }, |
| { |
| "epoch": 0.468698322702575, |
| "grad_norm": 5.9375, |
| "learning_rate": 9.90427800507241e-06, |
| "loss": 0.4853, |
| "step": 1984 |
| }, |
| { |
| "epoch": 0.47625797306874557, |
| "grad_norm": 6.625, |
| "learning_rate": 9.89552375429562e-06, |
| "loss": 0.4452, |
| "step": 2016 |
| }, |
| { |
| "epoch": 0.48381762343491613, |
| "grad_norm": 4.875, |
| "learning_rate": 9.886390730008688e-06, |
| "loss": 0.4505, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.4913772738010867, |
| "grad_norm": 8.9375, |
| "learning_rate": 9.87687963884646e-06, |
| "loss": 0.4279, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.49893692416725727, |
| "grad_norm": 4.46875, |
| "learning_rate": 9.86699121669534e-06, |
| "loss": 0.4225, |
| "step": 2112 |
| }, |
| { |
| "epoch": 0.5064965745334278, |
| "grad_norm": 4.75, |
| "learning_rate": 9.856726228636364e-06, |
| "loss": 0.4333, |
| "step": 2144 |
| }, |
| { |
| "epoch": 0.5140562248995983, |
| "grad_norm": 7.78125, |
| "learning_rate": 9.846085468885994e-06, |
| "loss": 0.4262, |
| "step": 2176 |
| }, |
| { |
| "epoch": 0.5216158752657689, |
| "grad_norm": 6.78125, |
| "learning_rate": 9.835069760734673e-06, |
| "loss": 0.4474, |
| "step": 2208 |
| }, |
| { |
| "epoch": 0.5291755256319395, |
| "grad_norm": 7.25, |
| "learning_rate": 9.823679956483122e-06, |
| "loss": 0.436, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.53673517599811, |
| "grad_norm": 5.71875, |
| "learning_rate": 9.81191693737641e-06, |
| "loss": 0.4293, |
| "step": 2272 |
| }, |
| { |
| "epoch": 0.5442948263642806, |
| "grad_norm": 7.84375, |
| "learning_rate": 9.799781613535747e-06, |
| "loss": 0.4253, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.5518544767304512, |
| "grad_norm": 5.875, |
| "learning_rate": 9.787274923888093e-06, |
| "loss": 0.4385, |
| "step": 2336 |
| }, |
| { |
| "epoch": 0.5594141270966217, |
| "grad_norm": 7.5625, |
| "learning_rate": 9.774397836093487e-06, |
| "loss": 0.4349, |
| "step": 2368 |
| }, |
| { |
| "epoch": 0.5669737774627923, |
| "grad_norm": 5.09375, |
| "learning_rate": 9.761151346470204e-06, |
| "loss": 0.3955, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5745334278289629, |
| "grad_norm": 8.6875, |
| "learning_rate": 9.747536479917643e-06, |
| "loss": 0.4272, |
| "step": 2432 |
| }, |
| { |
| "epoch": 0.5820930781951335, |
| "grad_norm": 5.5625, |
| "learning_rate": 9.733554289837041e-06, |
| "loss": 0.4269, |
| "step": 2464 |
| }, |
| { |
| "epoch": 0.589652728561304, |
| "grad_norm": 6.40625, |
| "learning_rate": 9.719205858049978e-06, |
| "loss": 0.4676, |
| "step": 2496 |
| }, |
| { |
| "epoch": 0.5972123789274746, |
| "grad_norm": 3.578125, |
| "learning_rate": 9.704492294714658e-06, |
| "loss": 0.4336, |
| "step": 2528 |
| }, |
| { |
| "epoch": 0.6047720292936452, |
| "grad_norm": 8.4375, |
| "learning_rate": 9.689414738240026e-06, |
| "loss": 0.4546, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.6123316796598157, |
| "grad_norm": 6.65625, |
| "learning_rate": 9.673974355197684e-06, |
| "loss": 0.3893, |
| "step": 2592 |
| }, |
| { |
| "epoch": 0.6198913300259863, |
| "grad_norm": 6.78125, |
| "learning_rate": 9.658172340231636e-06, |
| "loss": 0.4525, |
| "step": 2624 |
| }, |
| { |
| "epoch": 0.6274509803921569, |
| "grad_norm": 5.78125, |
| "learning_rate": 9.642009915965844e-06, |
| "loss": 0.4303, |
| "step": 2656 |
| }, |
| { |
| "epoch": 0.6350106307583274, |
| "grad_norm": 6.125, |
| "learning_rate": 9.625488332909649e-06, |
| "loss": 0.4332, |
| "step": 2688 |
| }, |
| { |
| "epoch": 0.642570281124498, |
| "grad_norm": 20.375, |
| "learning_rate": 9.608608869361008e-06, |
| "loss": 0.4497, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.6501299314906686, |
| "grad_norm": 7.09375, |
| "learning_rate": 9.59137283130759e-06, |
| "loss": 0.4172, |
| "step": 2752 |
| }, |
| { |
| "epoch": 0.6576895818568391, |
| "grad_norm": 5.09375, |
| "learning_rate": 9.573781552325732e-06, |
| "loss": 0.4705, |
| "step": 2784 |
| }, |
| { |
| "epoch": 0.6652492322230097, |
| "grad_norm": 6.625, |
| "learning_rate": 9.555836393477254e-06, |
| "loss": 0.4326, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.6728088825891803, |
| "grad_norm": 5.0, |
| "learning_rate": 9.537538743204164e-06, |
| "loss": 0.4291, |
| "step": 2848 |
| }, |
| { |
| "epoch": 0.6803685329553508, |
| "grad_norm": 6.75, |
| "learning_rate": 9.518890017221214e-06, |
| "loss": 0.4252, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.6879281833215214, |
| "grad_norm": 7.125, |
| "learning_rate": 9.499891658406381e-06, |
| "loss": 0.4019, |
| "step": 2912 |
| }, |
| { |
| "epoch": 0.695487833687692, |
| "grad_norm": 6.46875, |
| "learning_rate": 9.480545136689222e-06, |
| "loss": 0.4161, |
| "step": 2944 |
| }, |
| { |
| "epoch": 0.7030474840538625, |
| "grad_norm": 4.0, |
| "learning_rate": 9.46085194893714e-06, |
| "loss": 0.4163, |
| "step": 2976 |
| }, |
| { |
| "epoch": 0.7106071344200331, |
| "grad_norm": 6.65625, |
| "learning_rate": 9.440813618839574e-06, |
| "loss": 0.4621, |
| "step": 3008 |
| }, |
| { |
| "epoch": 0.7181667847862037, |
| "grad_norm": 6.90625, |
| "learning_rate": 9.420431696790116e-06, |
| "loss": 0.4271, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.7257264351523742, |
| "grad_norm": 14.1875, |
| "learning_rate": 9.399707759766536e-06, |
| "loss": 0.4592, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.7332860855185448, |
| "grad_norm": 4.75, |
| "learning_rate": 9.378643411208785e-06, |
| "loss": 0.4204, |
| "step": 3104 |
| }, |
| { |
| "epoch": 0.7408457358847154, |
| "grad_norm": 6.40625, |
| "learning_rate": 9.357240280894935e-06, |
| "loss": 0.4407, |
| "step": 3136 |
| }, |
| { |
| "epoch": 0.7484053862508859, |
| "grad_norm": 7.84375, |
| "learning_rate": 9.335500024815066e-06, |
| "loss": 0.4412, |
| "step": 3168 |
| }, |
| { |
| "epoch": 0.7559650366170565, |
| "grad_norm": 6.15625, |
| "learning_rate": 9.313424325043156e-06, |
| "loss": 0.4509, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7635246869832271, |
| "grad_norm": 6.71875, |
| "learning_rate": 9.291014889606927e-06, |
| "loss": 0.4084, |
| "step": 3232 |
| }, |
| { |
| "epoch": 0.7710843373493976, |
| "grad_norm": 7.8125, |
| "learning_rate": 9.268273452355698e-06, |
| "loss": 0.3744, |
| "step": 3264 |
| }, |
| { |
| "epoch": 0.7786439877155682, |
| "grad_norm": 5.5, |
| "learning_rate": 9.24520177282623e-06, |
| "loss": 0.4226, |
| "step": 3296 |
| }, |
| { |
| "epoch": 0.7862036380817388, |
| "grad_norm": 7.78125, |
| "learning_rate": 9.221801636106591e-06, |
| "loss": 0.4654, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.7937632884479093, |
| "grad_norm": 5.75, |
| "learning_rate": 9.198074852698042e-06, |
| "loss": 0.4746, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.8013229388140799, |
| "grad_norm": 10.0, |
| "learning_rate": 9.174023258374951e-06, |
| "loss": 0.4012, |
| "step": 3392 |
| }, |
| { |
| "epoch": 0.8088825891802505, |
| "grad_norm": 8.375, |
| "learning_rate": 9.149648714042763e-06, |
| "loss": 0.4434, |
| "step": 3424 |
| }, |
| { |
| "epoch": 0.8164422395464209, |
| "grad_norm": 5.9375, |
| "learning_rate": 9.124953105594014e-06, |
| "loss": 0.4421, |
| "step": 3456 |
| }, |
| { |
| "epoch": 0.8240018899125915, |
| "grad_norm": 4.75, |
| "learning_rate": 9.099938343762423e-06, |
| "loss": 0.4318, |
| "step": 3488 |
| }, |
| { |
| "epoch": 0.8315615402787621, |
| "grad_norm": 5.90625, |
| "learning_rate": 9.074606363975047e-06, |
| "loss": 0.4141, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.8391211906449326, |
| "grad_norm": 21.375, |
| "learning_rate": 9.048959126202543e-06, |
| "loss": 0.4459, |
| "step": 3552 |
| }, |
| { |
| "epoch": 0.8466808410111032, |
| "grad_norm": 13.0625, |
| "learning_rate": 9.022998614807519e-06, |
| "loss": 0.4449, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.8542404913772738, |
| "grad_norm": 6.59375, |
| "learning_rate": 8.996726838390995e-06, |
| "loss": 0.4375, |
| "step": 3616 |
| }, |
| { |
| "epoch": 0.8618001417434443, |
| "grad_norm": 5.625, |
| "learning_rate": 8.970145829637007e-06, |
| "loss": 0.4051, |
| "step": 3648 |
| }, |
| { |
| "epoch": 0.8693597921096149, |
| "grad_norm": 4.53125, |
| "learning_rate": 8.943257645155328e-06, |
| "loss": 0.4162, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.8769194424757855, |
| "grad_norm": 7.03125, |
| "learning_rate": 8.91606436532234e-06, |
| "loss": 0.4302, |
| "step": 3712 |
| }, |
| { |
| "epoch": 0.884479092841956, |
| "grad_norm": 6.0, |
| "learning_rate": 8.888568094120088e-06, |
| "loss": 0.4083, |
| "step": 3744 |
| }, |
| { |
| "epoch": 0.8920387432081266, |
| "grad_norm": 4.96875, |
| "learning_rate": 8.860770958973472e-06, |
| "loss": 0.3913, |
| "step": 3776 |
| }, |
| { |
| "epoch": 0.8995983935742972, |
| "grad_norm": 15.875, |
| "learning_rate": 8.832675110585663e-06, |
| "loss": 0.4226, |
| "step": 3808 |
| }, |
| { |
| "epoch": 0.9071580439404677, |
| "grad_norm": 6.8125, |
| "learning_rate": 8.804282722771691e-06, |
| "loss": 0.4122, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.9147176943066383, |
| "grad_norm": 5.625, |
| "learning_rate": 8.775595992290257e-06, |
| "loss": 0.4548, |
| "step": 3872 |
| }, |
| { |
| "epoch": 0.9222773446728089, |
| "grad_norm": 7.875, |
| "learning_rate": 8.746617138673761e-06, |
| "loss": 0.4418, |
| "step": 3904 |
| }, |
| { |
| "epoch": 0.9298369950389794, |
| "grad_norm": 12.375, |
| "learning_rate": 8.717348404056578e-06, |
| "loss": 0.4121, |
| "step": 3936 |
| }, |
| { |
| "epoch": 0.93739664540515, |
| "grad_norm": 6.3125, |
| "learning_rate": 8.687792053001587e-06, |
| "loss": 0.4491, |
| "step": 3968 |
| }, |
| { |
| "epoch": 0.9449562957713206, |
| "grad_norm": 5.625, |
| "learning_rate": 8.657950372324947e-06, |
| "loss": 0.4384, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9525159461374911, |
| "grad_norm": 6.8125, |
| "learning_rate": 8.62782567091917e-06, |
| "loss": 0.4195, |
| "step": 4032 |
| }, |
| { |
| "epoch": 0.9600755965036617, |
| "grad_norm": 8.5625, |
| "learning_rate": 8.597420279574478e-06, |
| "loss": 0.4195, |
| "step": 4064 |
| }, |
| { |
| "epoch": 0.9676352468698323, |
| "grad_norm": 5.0625, |
| "learning_rate": 8.566736550798462e-06, |
| "loss": 0.4166, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.9751948972360028, |
| "grad_norm": 5.9375, |
| "learning_rate": 8.535776858634076e-06, |
| "loss": 0.4142, |
| "step": 4128 |
| }, |
| { |
| "epoch": 0.9827545476021734, |
| "grad_norm": 7.71875, |
| "learning_rate": 8.504543598475939e-06, |
| "loss": 0.4273, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.990314197968344, |
| "grad_norm": 5.59375, |
| "learning_rate": 8.47303918688501e-06, |
| "loss": 0.3669, |
| "step": 4192 |
| }, |
| { |
| "epoch": 0.9978738483345145, |
| "grad_norm": 6.09375, |
| "learning_rate": 8.441266061401613e-06, |
| "loss": 0.4211, |
| "step": 4224 |
| }, |
| { |
| "epoch": 1.005433498700685, |
| "grad_norm": 6.0, |
| "learning_rate": 8.40922668035685e-06, |
| "loss": 0.4052, |
| "step": 4256 |
| }, |
| { |
| "epoch": 1.0129931490668556, |
| "grad_norm": 5.5625, |
| "learning_rate": 8.376923522682372e-06, |
| "loss": 0.3879, |
| "step": 4288 |
| }, |
| { |
| "epoch": 1.0205527994330261, |
| "grad_norm": 7.46875, |
| "learning_rate": 8.344359087718607e-06, |
| "loss": 0.3971, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.0281124497991967, |
| "grad_norm": 6.375, |
| "learning_rate": 8.311535895021368e-06, |
| "loss": 0.3401, |
| "step": 4352 |
| }, |
| { |
| "epoch": 1.0356721001653673, |
| "grad_norm": 6.46875, |
| "learning_rate": 8.278456484166916e-06, |
| "loss": 0.3999, |
| "step": 4384 |
| }, |
| { |
| "epoch": 1.0432317505315378, |
| "grad_norm": 6.28125, |
| "learning_rate": 8.24512341455547e-06, |
| "loss": 0.349, |
| "step": 4416 |
| }, |
| { |
| "epoch": 1.0507914008977084, |
| "grad_norm": 13.375, |
| "learning_rate": 8.21153926521318e-06, |
| "loss": 0.4037, |
| "step": 4448 |
| }, |
| { |
| "epoch": 1.058351051263879, |
| "grad_norm": 7.125, |
| "learning_rate": 8.177706634592584e-06, |
| "loss": 0.3845, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.0659107016300495, |
| "grad_norm": 7.875, |
| "learning_rate": 8.143628140371565e-06, |
| "loss": 0.3731, |
| "step": 4512 |
| }, |
| { |
| "epoch": 1.07347035199622, |
| "grad_norm": 8.5625, |
| "learning_rate": 8.109306419250818e-06, |
| "loss": 0.3845, |
| "step": 4544 |
| }, |
| { |
| "epoch": 1.0810300023623907, |
| "grad_norm": 9.5, |
| "learning_rate": 8.074744126749839e-06, |
| "loss": 0.4437, |
| "step": 4576 |
| }, |
| { |
| "epoch": 1.0885896527285612, |
| "grad_norm": 9.625, |
| "learning_rate": 8.039943937001472e-06, |
| "loss": 0.3819, |
| "step": 4608 |
| }, |
| { |
| "epoch": 1.0961493030947318, |
| "grad_norm": 5.46875, |
| "learning_rate": 8.004908542545e-06, |
| "loss": 0.394, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.1037089534609024, |
| "grad_norm": 5.5625, |
| "learning_rate": 7.969640654117822e-06, |
| "loss": 0.4315, |
| "step": 4672 |
| }, |
| { |
| "epoch": 1.111268603827073, |
| "grad_norm": 8.5, |
| "learning_rate": 7.934143000445726e-06, |
| "loss": 0.4429, |
| "step": 4704 |
| }, |
| { |
| "epoch": 1.1188282541932435, |
| "grad_norm": 6.34375, |
| "learning_rate": 7.898418328031752e-06, |
| "loss": 0.4198, |
| "step": 4736 |
| }, |
| { |
| "epoch": 1.126387904559414, |
| "grad_norm": 7.21875, |
| "learning_rate": 7.8624694009437e-06, |
| "loss": 0.3899, |
| "step": 4768 |
| }, |
| { |
| "epoch": 1.1339475549255846, |
| "grad_norm": 4.65625, |
| "learning_rate": 7.826299000600262e-06, |
| "loss": 0.3963, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.1415072052917552, |
| "grad_norm": 4.375, |
| "learning_rate": 7.789909925555835e-06, |
| "loss": 0.3925, |
| "step": 4832 |
| }, |
| { |
| "epoch": 1.1490668556579258, |
| "grad_norm": 7.96875, |
| "learning_rate": 7.753304991283975e-06, |
| "loss": 0.4087, |
| "step": 4864 |
| }, |
| { |
| "epoch": 1.1566265060240963, |
| "grad_norm": 6.625, |
| "learning_rate": 7.71648702995957e-06, |
| "loss": 0.4089, |
| "step": 4896 |
| }, |
| { |
| "epoch": 1.164186156390267, |
| "grad_norm": 3.71875, |
| "learning_rate": 7.67945889023971e-06, |
| "loss": 0.4062, |
| "step": 4928 |
| }, |
| { |
| "epoch": 1.1717458067564375, |
| "grad_norm": 4.8125, |
| "learning_rate": 7.642223437043279e-06, |
| "loss": 0.41, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.179305457122608, |
| "grad_norm": 8.25, |
| "learning_rate": 7.604783551329298e-06, |
| "loss": 0.3862, |
| "step": 4992 |
| }, |
| { |
| "epoch": 1.1868651074887786, |
| "grad_norm": 5.09375, |
| "learning_rate": 7.567142129874012e-06, |
| "loss": 0.383, |
| "step": 5024 |
| }, |
| { |
| "epoch": 1.1944247578549492, |
| "grad_norm": 4.84375, |
| "learning_rate": 7.5293020850467705e-06, |
| "loss": 0.4054, |
| "step": 5056 |
| }, |
| { |
| "epoch": 1.2019844082211197, |
| "grad_norm": 6.3125, |
| "learning_rate": 7.491266344584691e-06, |
| "loss": 0.4077, |
| "step": 5088 |
| }, |
| { |
| "epoch": 1.2095440585872903, |
| "grad_norm": 6.8125, |
| "learning_rate": 7.453037851366136e-06, |
| "loss": 0.4097, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.2171037089534609, |
| "grad_norm": 4.59375, |
| "learning_rate": 7.414619563183017e-06, |
| "loss": 0.3759, |
| "step": 5152 |
| }, |
| { |
| "epoch": 1.2246633593196314, |
| "grad_norm": 4.5, |
| "learning_rate": 7.376014452511947e-06, |
| "loss": 0.3878, |
| "step": 5184 |
| }, |
| { |
| "epoch": 1.232223009685802, |
| "grad_norm": 6.25, |
| "learning_rate": 7.337225506284255e-06, |
| "loss": 0.3696, |
| "step": 5216 |
| }, |
| { |
| "epoch": 1.2397826600519726, |
| "grad_norm": 5.9375, |
| "learning_rate": 7.298255725654884e-06, |
| "loss": 0.4199, |
| "step": 5248 |
| }, |
| { |
| "epoch": 1.2473423104181431, |
| "grad_norm": 4.8125, |
| "learning_rate": 7.25910812577019e-06, |
| "loss": 0.4098, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.2549019607843137, |
| "grad_norm": 6.03125, |
| "learning_rate": 7.219785735534653e-06, |
| "loss": 0.3707, |
| "step": 5312 |
| }, |
| { |
| "epoch": 1.2624616111504843, |
| "grad_norm": 4.8125, |
| "learning_rate": 7.1802915973765184e-06, |
| "loss": 0.4299, |
| "step": 5344 |
| }, |
| { |
| "epoch": 1.2700212615166548, |
| "grad_norm": 4.03125, |
| "learning_rate": 7.14062876701242e-06, |
| "loss": 0.3897, |
| "step": 5376 |
| }, |
| { |
| "epoch": 1.2775809118828254, |
| "grad_norm": 5.375, |
| "learning_rate": 7.1008003132109345e-06, |
| "loss": 0.3932, |
| "step": 5408 |
| }, |
| { |
| "epoch": 1.285140562248996, |
| "grad_norm": 8.1875, |
| "learning_rate": 7.0608093175551615e-06, |
| "loss": 0.3614, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.2927002126151665, |
| "grad_norm": 5.71875, |
| "learning_rate": 7.020658874204286e-06, |
| "loss": 0.3998, |
| "step": 5472 |
| }, |
| { |
| "epoch": 1.3002598629813371, |
| "grad_norm": 4.5, |
| "learning_rate": 6.980352089654184e-06, |
| "loss": 0.3811, |
| "step": 5504 |
| }, |
| { |
| "epoch": 1.3078195133475077, |
| "grad_norm": 6.40625, |
| "learning_rate": 6.939892082497073e-06, |
| "loss": 0.379, |
| "step": 5536 |
| }, |
| { |
| "epoch": 1.3153791637136782, |
| "grad_norm": 7.21875, |
| "learning_rate": 6.899281983180213e-06, |
| "loss": 0.4307, |
| "step": 5568 |
| }, |
| { |
| "epoch": 1.3229388140798488, |
| "grad_norm": 6.21875, |
| "learning_rate": 6.858524933763706e-06, |
| "loss": 0.4209, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.3304984644460194, |
| "grad_norm": 5.5625, |
| "learning_rate": 6.817624087677388e-06, |
| "loss": 0.4009, |
| "step": 5632 |
| }, |
| { |
| "epoch": 1.33805811481219, |
| "grad_norm": 6.4375, |
| "learning_rate": 6.776582609476845e-06, |
| "loss": 0.3359, |
| "step": 5664 |
| }, |
| { |
| "epoch": 1.3456177651783605, |
| "grad_norm": 5.96875, |
| "learning_rate": 6.735403674598566e-06, |
| "loss": 0.414, |
| "step": 5696 |
| }, |
| { |
| "epoch": 1.353177415544531, |
| "grad_norm": 11.5, |
| "learning_rate": 6.694090469114254e-06, |
| "loss": 0.3847, |
| "step": 5728 |
| }, |
| { |
| "epoch": 1.3607370659107016, |
| "grad_norm": 3.78125, |
| "learning_rate": 6.652646189484317e-06, |
| "loss": 0.4152, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.3682967162768722, |
| "grad_norm": 5.25, |
| "learning_rate": 6.61107404231055e-06, |
| "loss": 0.3921, |
| "step": 5792 |
| }, |
| { |
| "epoch": 1.3758563666430428, |
| "grad_norm": 9.625, |
| "learning_rate": 6.569377244088044e-06, |
| "loss": 0.3857, |
| "step": 5824 |
| }, |
| { |
| "epoch": 1.3834160170092134, |
| "grad_norm": 5.375, |
| "learning_rate": 6.527559020956312e-06, |
| "loss": 0.3879, |
| "step": 5856 |
| }, |
| { |
| "epoch": 1.390975667375384, |
| "grad_norm": 5.40625, |
| "learning_rate": 6.485622608449684e-06, |
| "loss": 0.3835, |
| "step": 5888 |
| }, |
| { |
| "epoch": 1.3985353177415545, |
| "grad_norm": 6.6875, |
| "learning_rate": 6.443571251246964e-06, |
| "loss": 0.3753, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.406094968107725, |
| "grad_norm": 4.625, |
| "learning_rate": 6.401408202920387e-06, |
| "loss": 0.4092, |
| "step": 5952 |
| }, |
| { |
| "epoch": 1.4136546184738956, |
| "grad_norm": 6.78125, |
| "learning_rate": 6.3591367256838935e-06, |
| "loss": 0.4141, |
| "step": 5984 |
| }, |
| { |
| "epoch": 1.4212142688400662, |
| "grad_norm": 5.5, |
| "learning_rate": 6.316760090140713e-06, |
| "loss": 0.401, |
| "step": 6016 |
| }, |
| { |
| "epoch": 1.4287739192062368, |
| "grad_norm": 6.125, |
| "learning_rate": 6.274281575030321e-06, |
| "loss": 0.3655, |
| "step": 6048 |
| }, |
| { |
| "epoch": 1.4363335695724073, |
| "grad_norm": 6.9375, |
| "learning_rate": 6.2317044669747595e-06, |
| "loss": 0.4077, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.4438932199385779, |
| "grad_norm": 4.90625, |
| "learning_rate": 6.189032060224344e-06, |
| "loss": 0.3987, |
| "step": 6112 |
| }, |
| { |
| "epoch": 1.4514528703047485, |
| "grad_norm": 5.71875, |
| "learning_rate": 6.1462676564027814e-06, |
| "loss": 0.3943, |
| "step": 6144 |
| }, |
| { |
| "epoch": 1.459012520670919, |
| "grad_norm": 5.78125, |
| "learning_rate": 6.1034145642517236e-06, |
| "loss": 0.3839, |
| "step": 6176 |
| }, |
| { |
| "epoch": 1.4665721710370896, |
| "grad_norm": 19.25, |
| "learning_rate": 6.06047609937476e-06, |
| "loss": 0.412, |
| "step": 6208 |
| }, |
| { |
| "epoch": 1.4741318214032602, |
| "grad_norm": 7.375, |
| "learning_rate": 6.017455583980887e-06, |
| "loss": 0.3862, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.4816914717694307, |
| "grad_norm": 12.0625, |
| "learning_rate": 5.974356346627463e-06, |
| "loss": 0.4054, |
| "step": 6272 |
| }, |
| { |
| "epoch": 1.4892511221356013, |
| "grad_norm": 7.5, |
| "learning_rate": 5.931181721962682e-06, |
| "loss": 0.3772, |
| "step": 6304 |
| }, |
| { |
| "epoch": 1.4968107725017719, |
| "grad_norm": 7.46875, |
| "learning_rate": 5.887935050467547e-06, |
| "loss": 0.4123, |
| "step": 6336 |
| }, |
| { |
| "epoch": 1.5043704228679422, |
| "grad_norm": 8.4375, |
| "learning_rate": 5.844619678197434e-06, |
| "loss": 0.3815, |
| "step": 6368 |
| }, |
| { |
| "epoch": 1.511930073234113, |
| "grad_norm": 5.875, |
| "learning_rate": 5.801238956523192e-06, |
| "loss": 0.4348, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.5194897236002833, |
| "grad_norm": 6.375, |
| "learning_rate": 5.757796241871844e-06, |
| "loss": 0.3863, |
| "step": 6432 |
| }, |
| { |
| "epoch": 1.5270493739664541, |
| "grad_norm": 7.15625, |
| "learning_rate": 5.714294895466897e-06, |
| "loss": 0.4001, |
| "step": 6464 |
| }, |
| { |
| "epoch": 1.5346090243326245, |
| "grad_norm": 5.0, |
| "learning_rate": 5.670738283068273e-06, |
| "loss": 0.3793, |
| "step": 6496 |
| }, |
| { |
| "epoch": 1.5421686746987953, |
| "grad_norm": 6.65625, |
| "learning_rate": 5.627129774711912e-06, |
| "loss": 0.3899, |
| "step": 6528 |
| }, |
| { |
| "epoch": 1.5497283250649656, |
| "grad_norm": 5.75, |
| "learning_rate": 5.58347274444901e-06, |
| "loss": 0.3811, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.5572879754311364, |
| "grad_norm": 5.09375, |
| "learning_rate": 5.539770570084968e-06, |
| "loss": 0.3998, |
| "step": 6592 |
| }, |
| { |
| "epoch": 1.5648476257973067, |
| "grad_norm": 6.40625, |
| "learning_rate": 5.496026632918055e-06, |
| "loss": 0.3703, |
| "step": 6624 |
| }, |
| { |
| "epoch": 1.5724072761634775, |
| "grad_norm": 7.25, |
| "learning_rate": 5.452244317477785e-06, |
| "loss": 0.3892, |
| "step": 6656 |
| }, |
| { |
| "epoch": 1.5799669265296479, |
| "grad_norm": 7.46875, |
| "learning_rate": 5.40842701126305e-06, |
| "loss": 0.4057, |
| "step": 6688 |
| }, |
| { |
| "epoch": 1.5875265768958187, |
| "grad_norm": 5.625, |
| "learning_rate": 5.364578104480029e-06, |
| "loss": 0.4182, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.595086227261989, |
| "grad_norm": 5.25, |
| "learning_rate": 5.3207009897798825e-06, |
| "loss": 0.3668, |
| "step": 6752 |
| }, |
| { |
| "epoch": 1.6026458776281598, |
| "grad_norm": 7.28125, |
| "learning_rate": 5.276799061996252e-06, |
| "loss": 0.3976, |
| "step": 6784 |
| }, |
| { |
| "epoch": 1.6102055279943301, |
| "grad_norm": 5.625, |
| "learning_rate": 5.232875717882606e-06, |
| "loss": 0.4012, |
| "step": 6816 |
| }, |
| { |
| "epoch": 1.617765178360501, |
| "grad_norm": 8.5, |
| "learning_rate": 5.1889343558494264e-06, |
| "loss": 0.413, |
| "step": 6848 |
| }, |
| { |
| "epoch": 1.6253248287266713, |
| "grad_norm": 7.9375, |
| "learning_rate": 5.144978375701261e-06, |
| "loss": 0.435, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.632884479092842, |
| "grad_norm": 6.34375, |
| "learning_rate": 5.101011178373687e-06, |
| "loss": 0.3851, |
| "step": 6912 |
| }, |
| { |
| "epoch": 1.6404441294590124, |
| "grad_norm": 5.53125, |
| "learning_rate": 5.057036165670165e-06, |
| "loss": 0.3728, |
| "step": 6944 |
| }, |
| { |
| "epoch": 1.6480037798251832, |
| "grad_norm": 6.0625, |
| "learning_rate": 5.013056739998845e-06, |
| "loss": 0.4223, |
| "step": 6976 |
| }, |
| { |
| "epoch": 1.6555634301913535, |
| "grad_norm": 5.34375, |
| "learning_rate": 4.969076304109316e-06, |
| "loss": 0.4188, |
| "step": 7008 |
| }, |
| { |
| "epoch": 1.6631230805575243, |
| "grad_norm": 7.28125, |
| "learning_rate": 4.92509826082933e-06, |
| "loss": 0.3932, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.6706827309236947, |
| "grad_norm": 4.625, |
| "learning_rate": 4.881126012801512e-06, |
| "loss": 0.3873, |
| "step": 7072 |
| }, |
| { |
| "epoch": 1.6782423812898655, |
| "grad_norm": 5.8125, |
| "learning_rate": 4.8371629622201084e-06, |
| "loss": 0.3801, |
| "step": 7104 |
| }, |
| { |
| "epoch": 1.6858020316560358, |
| "grad_norm": 7.5, |
| "learning_rate": 4.793212510567741e-06, |
| "loss": 0.381, |
| "step": 7136 |
| }, |
| { |
| "epoch": 1.6933616820222066, |
| "grad_norm": 5.40625, |
| "learning_rate": 4.749278058352238e-06, |
| "loss": 0.4149, |
| "step": 7168 |
| }, |
| { |
| "epoch": 1.700921332388377, |
| "grad_norm": 4.0, |
| "learning_rate": 4.7053630048435295e-06, |
| "loss": 0.3805, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.7084809827545477, |
| "grad_norm": 5.6875, |
| "learning_rate": 4.661470747810635e-06, |
| "loss": 0.3912, |
| "step": 7232 |
| }, |
| { |
| "epoch": 1.716040633120718, |
| "grad_norm": 6.03125, |
| "learning_rate": 4.617604683258787e-06, |
| "loss": 0.4288, |
| "step": 7264 |
| }, |
| { |
| "epoch": 1.7236002834868889, |
| "grad_norm": 6.9375, |
| "learning_rate": 4.573768205166663e-06, |
| "loss": 0.3611, |
| "step": 7296 |
| }, |
| { |
| "epoch": 1.7311599338530592, |
| "grad_norm": 7.125, |
| "learning_rate": 4.529964705223796e-06, |
| "loss": 0.3765, |
| "step": 7328 |
| }, |
| { |
| "epoch": 1.73871958421923, |
| "grad_norm": 7.0625, |
| "learning_rate": 4.486197572568154e-06, |
| "loss": 0.4224, |
| "step": 7360 |
| }, |
| { |
| "epoch": 1.7462792345854004, |
| "grad_norm": 4.78125, |
| "learning_rate": 4.442470193523919e-06, |
| "loss": 0.4066, |
| "step": 7392 |
| }, |
| { |
| "epoch": 1.7538388849515711, |
| "grad_norm": 7.28125, |
| "learning_rate": 4.398785951339476e-06, |
| "loss": 0.4138, |
| "step": 7424 |
| }, |
| { |
| "epoch": 1.7613985353177415, |
| "grad_norm": 7.0, |
| "learning_rate": 4.355148225925658e-06, |
| "loss": 0.4131, |
| "step": 7456 |
| }, |
| { |
| "epoch": 1.7689581856839123, |
| "grad_norm": 6.125, |
| "learning_rate": 4.3115603935942226e-06, |
| "loss": 0.3907, |
| "step": 7488 |
| }, |
| { |
| "epoch": 1.7765178360500826, |
| "grad_norm": 6.1875, |
| "learning_rate": 4.268025826796636e-06, |
| "loss": 0.4282, |
| "step": 7520 |
| }, |
| { |
| "epoch": 1.7840774864162534, |
| "grad_norm": 6.15625, |
| "learning_rate": 4.224547893863133e-06, |
| "loss": 0.3837, |
| "step": 7552 |
| }, |
| { |
| "epoch": 1.7916371367824238, |
| "grad_norm": 7.46875, |
| "learning_rate": 4.181129958742107e-06, |
| "loss": 0.3634, |
| "step": 7584 |
| }, |
| { |
| "epoch": 1.7991967871485943, |
| "grad_norm": 6.0625, |
| "learning_rate": 4.137775380739839e-06, |
| "loss": 0.3652, |
| "step": 7616 |
| }, |
| { |
| "epoch": 1.806756437514765, |
| "grad_norm": 8.6875, |
| "learning_rate": 4.094487514260575e-06, |
| "loss": 0.4116, |
| "step": 7648 |
| }, |
| { |
| "epoch": 1.8143160878809355, |
| "grad_norm": 7.75, |
| "learning_rate": 4.051269708547008e-06, |
| "loss": 0.4062, |
| "step": 7680 |
| }, |
| { |
| "epoch": 1.821875738247106, |
| "grad_norm": 15.0, |
| "learning_rate": 4.0081253074211204e-06, |
| "loss": 0.3903, |
| "step": 7712 |
| }, |
| { |
| "epoch": 1.8294353886132766, |
| "grad_norm": 6.21875, |
| "learning_rate": 3.965057649025489e-06, |
| "loss": 0.3491, |
| "step": 7744 |
| }, |
| { |
| "epoch": 1.8369950389794472, |
| "grad_norm": 6.375, |
| "learning_rate": 3.9220700655649916e-06, |
| "loss": 0.3592, |
| "step": 7776 |
| }, |
| { |
| "epoch": 1.8445546893456177, |
| "grad_norm": 6.28125, |
| "learning_rate": 3.879165883048997e-06, |
| "loss": 0.3775, |
| "step": 7808 |
| }, |
| { |
| "epoch": 1.8521143397117883, |
| "grad_norm": 5.875, |
| "learning_rate": 3.8363484210340315e-06, |
| "loss": 0.4107, |
| "step": 7840 |
| }, |
| { |
| "epoch": 1.8596739900779589, |
| "grad_norm": 5.21875, |
| "learning_rate": 3.7936209923669286e-06, |
| "loss": 0.3756, |
| "step": 7872 |
| }, |
| { |
| "epoch": 1.8672336404441294, |
| "grad_norm": 6.25, |
| "learning_rate": 3.7509869029285216e-06, |
| "loss": 0.3981, |
| "step": 7904 |
| }, |
| { |
| "epoch": 1.8747932908103, |
| "grad_norm": 6.625, |
| "learning_rate": 3.708449451377851e-06, |
| "loss": 0.3983, |
| "step": 7936 |
| }, |
| { |
| "epoch": 1.8823529411764706, |
| "grad_norm": 5.9375, |
| "learning_rate": 3.6660119288969577e-06, |
| "loss": 0.4131, |
| "step": 7968 |
| }, |
| { |
| "epoch": 1.8899125915426411, |
| "grad_norm": 7.375, |
| "learning_rate": 3.623677618936221e-06, |
| "loss": 0.3686, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.8974722419088117, |
| "grad_norm": 6.3125, |
| "learning_rate": 3.5814497969603324e-06, |
| "loss": 0.3864, |
| "step": 8032 |
| }, |
| { |
| "epoch": 1.9050318922749823, |
| "grad_norm": 4.84375, |
| "learning_rate": 3.5393317301948517e-06, |
| "loss": 0.4053, |
| "step": 8064 |
| }, |
| { |
| "epoch": 1.9125915426411528, |
| "grad_norm": 5.125, |
| "learning_rate": 3.497326677373431e-06, |
| "loss": 0.3841, |
| "step": 8096 |
| }, |
| { |
| "epoch": 1.9201511930073234, |
| "grad_norm": 5.34375, |
| "learning_rate": 3.4554378884856694e-06, |
| "loss": 0.3439, |
| "step": 8128 |
| }, |
| { |
| "epoch": 1.927710843373494, |
| "grad_norm": 5.53125, |
| "learning_rate": 3.41366860452567e-06, |
| "loss": 0.4156, |
| "step": 8160 |
| }, |
| { |
| "epoch": 1.9352704937396645, |
| "grad_norm": 4.5, |
| "learning_rate": 3.372022057241269e-06, |
| "loss": 0.4258, |
| "step": 8192 |
| }, |
| { |
| "epoch": 1.942830144105835, |
| "grad_norm": 6.3125, |
| "learning_rate": 3.330501468883992e-06, |
| "loss": 0.419, |
| "step": 8224 |
| }, |
| { |
| "epoch": 1.9503897944720057, |
| "grad_norm": 9.4375, |
| "learning_rate": 3.289110051959754e-06, |
| "loss": 0.4285, |
| "step": 8256 |
| }, |
| { |
| "epoch": 1.9579494448381762, |
| "grad_norm": 6.1875, |
| "learning_rate": 3.247851008980286e-06, |
| "loss": 0.4105, |
| "step": 8288 |
| }, |
| { |
| "epoch": 1.9655090952043468, |
| "grad_norm": 8.375, |
| "learning_rate": 3.206727532215372e-06, |
| "loss": 0.4125, |
| "step": 8320 |
| }, |
| { |
| "epoch": 1.9730687455705174, |
| "grad_norm": 8.1875, |
| "learning_rate": 3.165742803445841e-06, |
| "loss": 0.3931, |
| "step": 8352 |
| }, |
| { |
| "epoch": 1.980628395936688, |
| "grad_norm": 6.03125, |
| "learning_rate": 3.1248999937174007e-06, |
| "loss": 0.3599, |
| "step": 8384 |
| }, |
| { |
| "epoch": 1.9881880463028585, |
| "grad_norm": 7.09375, |
| "learning_rate": 3.0842022630952784e-06, |
| "loss": 0.3852, |
| "step": 8416 |
| }, |
| { |
| "epoch": 1.995747696669029, |
| "grad_norm": 3.78125, |
| "learning_rate": 3.0436527604197374e-06, |
| "loss": 0.3744, |
| "step": 8448 |
| }, |
| { |
| "epoch": 2.0033073470351996, |
| "grad_norm": 6.34375, |
| "learning_rate": 3.00325462306243e-06, |
| "loss": 0.3665, |
| "step": 8480 |
| }, |
| { |
| "epoch": 2.01086699740137, |
| "grad_norm": 4.125, |
| "learning_rate": 2.9630109766836634e-06, |
| "loss": 0.3574, |
| "step": 8512 |
| }, |
| { |
| "epoch": 2.0184266477675408, |
| "grad_norm": 4.78125, |
| "learning_rate": 2.9229249349905686e-06, |
| "loss": 0.3651, |
| "step": 8544 |
| }, |
| { |
| "epoch": 2.025986298133711, |
| "grad_norm": 6.4375, |
| "learning_rate": 2.8829995994961725e-06, |
| "loss": 0.3792, |
| "step": 8576 |
| }, |
| { |
| "epoch": 2.033545948499882, |
| "grad_norm": 4.75, |
| "learning_rate": 2.8432380592794466e-06, |
| "loss": 0.4099, |
| "step": 8608 |
| }, |
| { |
| "epoch": 2.0411055988660523, |
| "grad_norm": 5.375, |
| "learning_rate": 2.8036433907462866e-06, |
| "loss": 0.3701, |
| "step": 8640 |
| }, |
| { |
| "epoch": 2.048665249232223, |
| "grad_norm": 6.46875, |
| "learning_rate": 2.7642186573915e-06, |
| "loss": 0.3454, |
| "step": 8672 |
| }, |
| { |
| "epoch": 2.0562248995983934, |
| "grad_norm": 8.1875, |
| "learning_rate": 2.724966909561765e-06, |
| "loss": 0.3676, |
| "step": 8704 |
| }, |
| { |
| "epoch": 2.063784549964564, |
| "grad_norm": 4.03125, |
| "learning_rate": 2.685891184219635e-06, |
| "loss": 0.3714, |
| "step": 8736 |
| }, |
| { |
| "epoch": 2.0713442003307345, |
| "grad_norm": 9.375, |
| "learning_rate": 2.646994504708551e-06, |
| "loss": 0.4112, |
| "step": 8768 |
| }, |
| { |
| "epoch": 2.0789038506969053, |
| "grad_norm": 5.0, |
| "learning_rate": 2.6082798805189347e-06, |
| "loss": 0.3587, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.0864635010630757, |
| "grad_norm": 11.25, |
| "learning_rate": 2.569750307055329e-06, |
| "loss": 0.3693, |
| "step": 8832 |
| }, |
| { |
| "epoch": 2.0940231514292464, |
| "grad_norm": 7.4375, |
| "learning_rate": 2.5314087654046403e-06, |
| "loss": 0.3768, |
| "step": 8864 |
| }, |
| { |
| "epoch": 2.101582801795417, |
| "grad_norm": 5.65625, |
| "learning_rate": 2.4932582221055024e-06, |
| "loss": 0.3525, |
| "step": 8896 |
| }, |
| { |
| "epoch": 2.1091424521615876, |
| "grad_norm": 7.59375, |
| "learning_rate": 2.455301628918727e-06, |
| "loss": 0.3835, |
| "step": 8928 |
| }, |
| { |
| "epoch": 2.116702102527758, |
| "grad_norm": 4.96875, |
| "learning_rate": 2.417541922598945e-06, |
| "loss": 0.3802, |
| "step": 8960 |
| }, |
| { |
| "epoch": 2.1242617528939287, |
| "grad_norm": 5.90625, |
| "learning_rate": 2.379982024667367e-06, |
| "loss": 0.4005, |
| "step": 8992 |
| }, |
| { |
| "epoch": 2.131821403260099, |
| "grad_norm": 10.1875, |
| "learning_rate": 2.3426248411857573e-06, |
| "loss": 0.3815, |
| "step": 9024 |
| }, |
| { |
| "epoch": 2.13938105362627, |
| "grad_norm": 6.09375, |
| "learning_rate": 2.3054732625315725e-06, |
| "loss": 0.3349, |
| "step": 9056 |
| }, |
| { |
| "epoch": 2.14694070399244, |
| "grad_norm": 7.28125, |
| "learning_rate": 2.2685301631743437e-06, |
| "loss": 0.3962, |
| "step": 9088 |
| }, |
| { |
| "epoch": 2.154500354358611, |
| "grad_norm": 4.0625, |
| "learning_rate": 2.2317984014532634e-06, |
| "loss": 0.3707, |
| "step": 9120 |
| }, |
| { |
| "epoch": 2.1620600047247813, |
| "grad_norm": 5.0625, |
| "learning_rate": 2.1952808193560367e-06, |
| "loss": 0.4251, |
| "step": 9152 |
| }, |
| { |
| "epoch": 2.169619655090952, |
| "grad_norm": 6.75, |
| "learning_rate": 2.158980242298989e-06, |
| "loss": 0.3916, |
| "step": 9184 |
| }, |
| { |
| "epoch": 2.1771793054571225, |
| "grad_norm": 9.9375, |
| "learning_rate": 2.12289947890847e-06, |
| "loss": 0.3959, |
| "step": 9216 |
| }, |
| { |
| "epoch": 2.1847389558232932, |
| "grad_norm": 5.5, |
| "learning_rate": 2.087041320803535e-06, |
| "loss": 0.3586, |
| "step": 9248 |
| }, |
| { |
| "epoch": 2.1922986061894636, |
| "grad_norm": 5.78125, |
| "learning_rate": 2.051408542379955e-06, |
| "loss": 0.3692, |
| "step": 9280 |
| }, |
| { |
| "epoch": 2.1998582565556344, |
| "grad_norm": 6.40625, |
| "learning_rate": 2.016003900595566e-06, |
| "loss": 0.3709, |
| "step": 9312 |
| }, |
| { |
| "epoch": 2.2074179069218047, |
| "grad_norm": 4.28125, |
| "learning_rate": 1.980830134756946e-06, |
| "loss": 0.3729, |
| "step": 9344 |
| }, |
| { |
| "epoch": 2.2149775572879755, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.945889966307487e-06, |
| "loss": 0.3655, |
| "step": 9376 |
| }, |
| { |
| "epoch": 2.222537207654146, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.911186098616819e-06, |
| "loss": 0.3661, |
| "step": 9408 |
| }, |
| { |
| "epoch": 2.2300968580203167, |
| "grad_norm": 8.875, |
| "learning_rate": 1.8767212167716536e-06, |
| "loss": 0.3685, |
| "step": 9440 |
| }, |
| { |
| "epoch": 2.237656508386487, |
| "grad_norm": 5.8125, |
| "learning_rate": 1.8424979873680332e-06, |
| "loss": 0.3623, |
| "step": 9472 |
| }, |
| { |
| "epoch": 2.245216158752658, |
| "grad_norm": 3.78125, |
| "learning_rate": 1.8085190583050166e-06, |
| "loss": 0.378, |
| "step": 9504 |
| }, |
| { |
| "epoch": 2.252775809118828, |
| "grad_norm": 6.375, |
| "learning_rate": 1.774787058579799e-06, |
| "loss": 0.3688, |
| "step": 9536 |
| }, |
| { |
| "epoch": 2.260335459484999, |
| "grad_norm": 7.625, |
| "learning_rate": 1.7413045980843119e-06, |
| "loss": 0.3841, |
| "step": 9568 |
| }, |
| { |
| "epoch": 2.2678951098511693, |
| "grad_norm": 5.34375, |
| "learning_rate": 1.7080742674032886e-06, |
| "loss": 0.3937, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.27545476021734, |
| "grad_norm": 6.75, |
| "learning_rate": 1.6750986376138207e-06, |
| "loss": 0.3752, |
| "step": 9632 |
| }, |
| { |
| "epoch": 2.2830144105835104, |
| "grad_norm": 6.40625, |
| "learning_rate": 1.6423802600864436e-06, |
| "loss": 0.394, |
| "step": 9664 |
| }, |
| { |
| "epoch": 2.290574060949681, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.6099216662877204e-06, |
| "loss": 0.3868, |
| "step": 9696 |
| }, |
| { |
| "epoch": 2.2981337113158515, |
| "grad_norm": 6.5, |
| "learning_rate": 1.5777253675843873e-06, |
| "loss": 0.4061, |
| "step": 9728 |
| }, |
| { |
| "epoch": 2.3056933616820223, |
| "grad_norm": 12.25, |
| "learning_rate": 1.5457938550490387e-06, |
| "loss": 0.364, |
| "step": 9760 |
| }, |
| { |
| "epoch": 2.3132530120481927, |
| "grad_norm": 4.59375, |
| "learning_rate": 1.5141295992674e-06, |
| "loss": 0.3626, |
| "step": 9792 |
| }, |
| { |
| "epoch": 2.3208126624143635, |
| "grad_norm": 6.125, |
| "learning_rate": 1.482735050147161e-06, |
| "loss": 0.4113, |
| "step": 9824 |
| }, |
| { |
| "epoch": 2.328372312780534, |
| "grad_norm": 6.15625, |
| "learning_rate": 1.4516126367284355e-06, |
| "loss": 0.4139, |
| "step": 9856 |
| }, |
| { |
| "epoch": 2.3359319631467046, |
| "grad_norm": 4.875, |
| "learning_rate": 1.4207647669958186e-06, |
| "loss": 0.3712, |
| "step": 9888 |
| }, |
| { |
| "epoch": 2.343491613512875, |
| "grad_norm": 6.15625, |
| "learning_rate": 1.3901938276920712e-06, |
| "loss": 0.3864, |
| "step": 9920 |
| }, |
| { |
| "epoch": 2.3510512638790457, |
| "grad_norm": 4.6875, |
| "learning_rate": 1.359902184133467e-06, |
| "loss": 0.3999, |
| "step": 9952 |
| }, |
| { |
| "epoch": 2.358610914245216, |
| "grad_norm": 6.25, |
| "learning_rate": 1.3298921800267728e-06, |
| "loss": 0.3774, |
| "step": 9984 |
| }, |
| { |
| "epoch": 2.366170564611387, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.3001661372879194e-06, |
| "loss": 0.4082, |
| "step": 10016 |
| }, |
| { |
| "epoch": 2.373730214977557, |
| "grad_norm": 9.1875, |
| "learning_rate": 1.2707263558623483e-06, |
| "loss": 0.3886, |
| "step": 10048 |
| }, |
| { |
| "epoch": 2.381289865343728, |
| "grad_norm": 5.875, |
| "learning_rate": 1.2415751135470693e-06, |
| "loss": 0.418, |
| "step": 10080 |
| }, |
| { |
| "epoch": 2.3888495157098983, |
| "grad_norm": 6.5625, |
| "learning_rate": 1.2127146658144095e-06, |
| "loss": 0.3886, |
| "step": 10112 |
| }, |
| { |
| "epoch": 2.396409166076069, |
| "grad_norm": 6.15625, |
| "learning_rate": 1.1841472456375219e-06, |
| "loss": 0.3816, |
| "step": 10144 |
| }, |
| { |
| "epoch": 2.4039688164422395, |
| "grad_norm": 4.46875, |
| "learning_rate": 1.1558750633175998e-06, |
| "loss": 0.3795, |
| "step": 10176 |
| }, |
| { |
| "epoch": 2.4115284668084103, |
| "grad_norm": 4.78125, |
| "learning_rate": 1.1279003063128773e-06, |
| "loss": 0.3967, |
| "step": 10208 |
| }, |
| { |
| "epoch": 2.4190881171745806, |
| "grad_norm": 4.25, |
| "learning_rate": 1.1002251390693763e-06, |
| "loss": 0.3719, |
| "step": 10240 |
| }, |
| { |
| "epoch": 2.4266477675407514, |
| "grad_norm": 4.09375, |
| "learning_rate": 1.0728517028534364e-06, |
| "loss": 0.3794, |
| "step": 10272 |
| }, |
| { |
| "epoch": 2.4342074179069217, |
| "grad_norm": 7.84375, |
| "learning_rate": 1.0457821155860488e-06, |
| "loss": 0.3876, |
| "step": 10304 |
| }, |
| { |
| "epoch": 2.4417670682730925, |
| "grad_norm": 4.40625, |
| "learning_rate": 1.0190184716789853e-06, |
| "loss": 0.4292, |
| "step": 10336 |
| }, |
| { |
| "epoch": 2.449326718639263, |
| "grad_norm": 6.78125, |
| "learning_rate": 9.925628418727563e-07, |
| "loss": 0.3976, |
| "step": 10368 |
| }, |
| { |
| "epoch": 2.4568863690054337, |
| "grad_norm": 4.5, |
| "learning_rate": 9.664172730763872e-07, |
| "loss": 0.4061, |
| "step": 10400 |
| }, |
| { |
| "epoch": 2.464446019371604, |
| "grad_norm": 5.90625, |
| "learning_rate": 9.405837882090534e-07, |
| "loss": 0.3729, |
| "step": 10432 |
| }, |
| { |
| "epoch": 2.472005669737775, |
| "grad_norm": 5.65625, |
| "learning_rate": 9.150643860435571e-07, |
| "loss": 0.3747, |
| "step": 10464 |
| }, |
| { |
| "epoch": 2.479565320103945, |
| "grad_norm": 7.34375, |
| "learning_rate": 8.898610410516873e-07, |
| "loss": 0.3779, |
| "step": 10496 |
| }, |
| { |
| "epoch": 2.487124970470116, |
| "grad_norm": 5.875, |
| "learning_rate": 8.649757032514439e-07, |
| "loss": 0.3851, |
| "step": 10528 |
| }, |
| { |
| "epoch": 2.4946846208362863, |
| "grad_norm": 11.0, |
| "learning_rate": 8.404102980561702e-07, |
| "loss": 0.3837, |
| "step": 10560 |
| }, |
| { |
| "epoch": 2.502244271202457, |
| "grad_norm": 5.3125, |
| "learning_rate": 8.161667261255746e-07, |
| "loss": 0.3362, |
| "step": 10592 |
| }, |
| { |
| "epoch": 2.5098039215686274, |
| "grad_norm": 7.34375, |
| "learning_rate": 7.922468632186748e-07, |
| "loss": 0.3638, |
| "step": 10624 |
| }, |
| { |
| "epoch": 2.5173635719347978, |
| "grad_norm": 4.34375, |
| "learning_rate": 7.686525600486743e-07, |
| "loss": 0.3874, |
| "step": 10656 |
| }, |
| { |
| "epoch": 2.5249232223009686, |
| "grad_norm": 7.53125, |
| "learning_rate": 7.453856421397598e-07, |
| "loss": 0.372, |
| "step": 10688 |
| }, |
| { |
| "epoch": 2.5324828726671393, |
| "grad_norm": 6.0625, |
| "learning_rate": 7.224479096858672e-07, |
| "loss": 0.4139, |
| "step": 10720 |
| }, |
| { |
| "epoch": 2.5400425230333097, |
| "grad_norm": 5.125, |
| "learning_rate": 6.9984113741139e-07, |
| "loss": 0.3876, |
| "step": 10752 |
| }, |
| { |
| "epoch": 2.54760217339948, |
| "grad_norm": 5.40625, |
| "learning_rate": 6.775670744338747e-07, |
| "loss": 0.3811, |
| "step": 10784 |
| }, |
| { |
| "epoch": 2.555161823765651, |
| "grad_norm": 5.9375, |
| "learning_rate": 6.556274441286809e-07, |
| "loss": 0.3698, |
| "step": 10816 |
| }, |
| { |
| "epoch": 2.5627214741318216, |
| "grad_norm": 4.09375, |
| "learning_rate": 6.340239439956486e-07, |
| "loss": 0.3724, |
| "step": 10848 |
| }, |
| { |
| "epoch": 2.570281124497992, |
| "grad_norm": 7.3125, |
| "learning_rate": 6.127582455277547e-07, |
| "loss": 0.4228, |
| "step": 10880 |
| }, |
| { |
| "epoch": 2.5778407748641623, |
| "grad_norm": 5.8125, |
| "learning_rate": 5.918319940817884e-07, |
| "loss": 0.3607, |
| "step": 10912 |
| }, |
| { |
| "epoch": 2.585400425230333, |
| "grad_norm": 10.625, |
| "learning_rate": 5.712468087510536e-07, |
| "loss": 0.3846, |
| "step": 10944 |
| }, |
| { |
| "epoch": 2.592960075596504, |
| "grad_norm": 8.375, |
| "learning_rate": 5.510042822400868e-07, |
| "loss": 0.4124, |
| "step": 10976 |
| }, |
| { |
| "epoch": 2.6005197259626742, |
| "grad_norm": 6.0, |
| "learning_rate": 5.311059807414376e-07, |
| "loss": 0.3939, |
| "step": 11008 |
| }, |
| { |
| "epoch": 2.6080793763288446, |
| "grad_norm": 7.96875, |
| "learning_rate": 5.11553443814482e-07, |
| "loss": 0.4022, |
| "step": 11040 |
| }, |
| { |
| "epoch": 2.6156390266950154, |
| "grad_norm": 9.1875, |
| "learning_rate": 4.923481842663114e-07, |
| "loss": 0.3872, |
| "step": 11072 |
| }, |
| { |
| "epoch": 2.623198677061186, |
| "grad_norm": 5.65625, |
| "learning_rate": 4.734916880346774e-07, |
| "loss": 0.4234, |
| "step": 11104 |
| }, |
| { |
| "epoch": 2.6307583274273565, |
| "grad_norm": 12.0, |
| "learning_rate": 4.549854140730325e-07, |
| "loss": 0.3591, |
| "step": 11136 |
| }, |
| { |
| "epoch": 2.638317977793527, |
| "grad_norm": 4.78125, |
| "learning_rate": 4.368307942376371e-07, |
| "loss": 0.4013, |
| "step": 11168 |
| }, |
| { |
| "epoch": 2.6458776281596976, |
| "grad_norm": 8.8125, |
| "learning_rate": 4.190292331767848e-07, |
| "loss": 0.3419, |
| "step": 11200 |
| }, |
| { |
| "epoch": 2.6534372785258684, |
| "grad_norm": 5.4375, |
| "learning_rate": 4.0158210822211496e-07, |
| "loss": 0.384, |
| "step": 11232 |
| }, |
| { |
| "epoch": 2.6609969288920388, |
| "grad_norm": 7.28125, |
| "learning_rate": 3.844907692820543e-07, |
| "loss": 0.3919, |
| "step": 11264 |
| }, |
| { |
| "epoch": 2.668556579258209, |
| "grad_norm": 4.625, |
| "learning_rate": 3.6775653873736774e-07, |
| "loss": 0.3707, |
| "step": 11296 |
| }, |
| { |
| "epoch": 2.67611622962438, |
| "grad_norm": 5.96875, |
| "learning_rate": 3.5138071133884235e-07, |
| "loss": 0.372, |
| "step": 11328 |
| }, |
| { |
| "epoch": 2.6836758799905507, |
| "grad_norm": 5.96875, |
| "learning_rate": 3.3536455410711654e-07, |
| "loss": 0.3835, |
| "step": 11360 |
| }, |
| { |
| "epoch": 2.691235530356721, |
| "grad_norm": 7.28125, |
| "learning_rate": 3.1970930623464403e-07, |
| "loss": 0.3677, |
| "step": 11392 |
| }, |
| { |
| "epoch": 2.6987951807228914, |
| "grad_norm": 5.3125, |
| "learning_rate": 3.044161789898159e-07, |
| "loss": 0.3694, |
| "step": 11424 |
| }, |
| { |
| "epoch": 2.706354831089062, |
| "grad_norm": 7.46875, |
| "learning_rate": 2.8948635562324747e-07, |
| "loss": 0.3782, |
| "step": 11456 |
| }, |
| { |
| "epoch": 2.713914481455233, |
| "grad_norm": 6.0625, |
| "learning_rate": 2.749209912762241e-07, |
| "loss": 0.3956, |
| "step": 11488 |
| }, |
| { |
| "epoch": 2.7214741318214033, |
| "grad_norm": 6.09375, |
| "learning_rate": 2.607212128913267e-07, |
| "loss": 0.406, |
| "step": 11520 |
| }, |
| { |
| "epoch": 2.7290337821875736, |
| "grad_norm": 8.75, |
| "learning_rate": 2.4688811912524204e-07, |
| "loss": 0.3621, |
| "step": 11552 |
| }, |
| { |
| "epoch": 2.7365934325537444, |
| "grad_norm": 4.65625, |
| "learning_rate": 2.3342278026375397e-07, |
| "loss": 0.3859, |
| "step": 11584 |
| }, |
| { |
| "epoch": 2.744153082919915, |
| "grad_norm": 5.9375, |
| "learning_rate": 2.203262381389365e-07, |
| "loss": 0.3772, |
| "step": 11616 |
| }, |
| { |
| "epoch": 2.7517127332860856, |
| "grad_norm": 5.21875, |
| "learning_rate": 2.0759950604854583e-07, |
| "loss": 0.3757, |
| "step": 11648 |
| }, |
| { |
| "epoch": 2.759272383652256, |
| "grad_norm": 6.09375, |
| "learning_rate": 1.9524356867761772e-07, |
| "loss": 0.4159, |
| "step": 11680 |
| }, |
| { |
| "epoch": 2.7668320340184267, |
| "grad_norm": 7.15625, |
| "learning_rate": 1.8325938202228276e-07, |
| "loss": 0.374, |
| "step": 11712 |
| }, |
| { |
| "epoch": 2.7743916843845975, |
| "grad_norm": 5.5, |
| "learning_rate": 1.7164787331580058e-07, |
| "loss": 0.3731, |
| "step": 11744 |
| }, |
| { |
| "epoch": 2.781951334750768, |
| "grad_norm": 6.375, |
| "learning_rate": 1.6040994095681716e-07, |
| "loss": 0.3725, |
| "step": 11776 |
| }, |
| { |
| "epoch": 2.789510985116938, |
| "grad_norm": 5.125, |
| "learning_rate": 1.4954645443985337e-07, |
| "loss": 0.4074, |
| "step": 11808 |
| }, |
| { |
| "epoch": 2.797070635483109, |
| "grad_norm": 5.3125, |
| "learning_rate": 1.3905825428803477e-07, |
| "loss": 0.3863, |
| "step": 11840 |
| }, |
| { |
| "epoch": 2.8046302858492793, |
| "grad_norm": 3.828125, |
| "learning_rate": 1.2894615198805415e-07, |
| "loss": 0.3922, |
| "step": 11872 |
| }, |
| { |
| "epoch": 2.81218993621545, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.1921092992739025e-07, |
| "loss": 0.3889, |
| "step": 11904 |
| }, |
| { |
| "epoch": 2.8197495865816204, |
| "grad_norm": 7.75, |
| "learning_rate": 1.0985334133376991e-07, |
| "loss": 0.3989, |
| "step": 11936 |
| }, |
| { |
| "epoch": 2.8273092369477912, |
| "grad_norm": 8.0, |
| "learning_rate": 1.0087411021689252e-07, |
| "loss": 0.3941, |
| "step": 11968 |
| }, |
| { |
| "epoch": 2.8348688873139616, |
| "grad_norm": 5.5, |
| "learning_rate": 9.227393131240992e-08, |
| "loss": 0.3675, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.8424285376801324, |
| "grad_norm": 5.09375, |
| "learning_rate": 8.405347002817421e-08, |
| "loss": 0.3858, |
| "step": 12032 |
| }, |
| { |
| "epoch": 2.8499881880463027, |
| "grad_norm": 6.09375, |
| "learning_rate": 7.6213362392758e-08, |
| "loss": 0.4184, |
| "step": 12064 |
| }, |
| { |
| "epoch": 2.8575478384124735, |
| "grad_norm": 8.9375, |
| "learning_rate": 6.875421500623703e-08, |
| "loss": 0.3729, |
| "step": 12096 |
| }, |
| { |
| "epoch": 2.865107488778644, |
| "grad_norm": 7.1875, |
| "learning_rate": 6.167660499326322e-08, |
| "loss": 0.3836, |
| "step": 12128 |
| }, |
| { |
| "epoch": 2.8726671391448146, |
| "grad_norm": 4.59375, |
| "learning_rate": 5.498107995840774e-08, |
| "loss": 0.3789, |
| "step": 12160 |
| }, |
| { |
| "epoch": 2.880226789510985, |
| "grad_norm": 10.125, |
| "learning_rate": 4.86681579437942e-08, |
| "loss": 0.4012, |
| "step": 12192 |
| }, |
| { |
| "epoch": 2.8877864398771558, |
| "grad_norm": 4.21875, |
| "learning_rate": 4.273832738901529e-08, |
| "loss": 0.365, |
| "step": 12224 |
| }, |
| { |
| "epoch": 2.895346090243326, |
| "grad_norm": 6.0625, |
| "learning_rate": 3.719204709334345e-08, |
| "loss": 0.3903, |
| "step": 12256 |
| }, |
| { |
| "epoch": 2.902905740609497, |
| "grad_norm": 5.5625, |
| "learning_rate": 3.2029746180231023e-08, |
| "loss": 0.4063, |
| "step": 12288 |
| }, |
| { |
| "epoch": 2.9104653909756673, |
| "grad_norm": 6.8125, |
| "learning_rate": 2.725182406410842e-08, |
| "loss": 0.3909, |
| "step": 12320 |
| }, |
| { |
| "epoch": 2.918025041341838, |
| "grad_norm": 4.0625, |
| "learning_rate": 2.2858650419484985e-08, |
| "loss": 0.3711, |
| "step": 12352 |
| }, |
| { |
| "epoch": 2.9255846917080084, |
| "grad_norm": 5.71875, |
| "learning_rate": 1.8850565152339072e-08, |
| "loss": 0.3463, |
| "step": 12384 |
| }, |
| { |
| "epoch": 2.933144342074179, |
| "grad_norm": 5.0, |
| "learning_rate": 1.5227878373827443e-08, |
| "loss": 0.443, |
| "step": 12416 |
| }, |
| { |
| "epoch": 2.9407039924403495, |
| "grad_norm": 4.78125, |
| "learning_rate": 1.1990870376284435e-08, |
| "loss": 0.405, |
| "step": 12448 |
| }, |
| { |
| "epoch": 2.9482636428065203, |
| "grad_norm": 6.25, |
| "learning_rate": 9.139791611540438e-09, |
| "loss": 0.3871, |
| "step": 12480 |
| }, |
| { |
| "epoch": 2.9558232931726907, |
| "grad_norm": 5.28125, |
| "learning_rate": 6.6748626715407165e-09, |
| "loss": 0.4238, |
| "step": 12512 |
| }, |
| { |
| "epoch": 2.9633829435388614, |
| "grad_norm": 6.46875, |
| "learning_rate": 4.596274271280732e-09, |
| "loss": 0.3852, |
| "step": 12544 |
| }, |
| { |
| "epoch": 2.970942593905032, |
| "grad_norm": 5.8125, |
| "learning_rate": 2.9041872340479505e-09, |
| "loss": 0.3845, |
| "step": 12576 |
| }, |
| { |
| "epoch": 2.9785022442712026, |
| "grad_norm": 4.21875, |
| "learning_rate": 1.59873247897957e-09, |
| "loss": 0.4169, |
| "step": 12608 |
| }, |
| { |
| "epoch": 2.986061894637373, |
| "grad_norm": 5.96875, |
| "learning_rate": 6.800110109328462e-10, |
| "loss": 0.3872, |
| "step": 12640 |
| }, |
| { |
| "epoch": 2.9936215450035437, |
| "grad_norm": 11.125, |
| "learning_rate": 1.480939126713432e-10, |
| "loss": 0.3957, |
| "step": 12672 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 12699, |
| "total_flos": 2.247355191985367e+17, |
| "train_loss": 0.4236195460108568, |
| "train_runtime": 3683.6752, |
| "train_samples_per_second": 13.789, |
| "train_steps_per_second": 3.447 |
| } |
| ], |
| "logging_steps": 32, |
| "max_steps": 12699, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.247355191985367e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|