| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 21165, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.007559650366170565, |
| "grad_norm": 32.75, |
| "learning_rate": 1.464336324988191e-07, |
| "loss": 0.6933, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.01511930073234113, |
| "grad_norm": 14.8125, |
| "learning_rate": 2.975909305621162e-07, |
| "loss": 0.6983, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.022678951098511695, |
| "grad_norm": 28.25, |
| "learning_rate": 4.4874822862541336e-07, |
| "loss": 0.667, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.03023860146468226, |
| "grad_norm": 24.125, |
| "learning_rate": 5.999055266887105e-07, |
| "loss": 0.6409, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.03779825183085282, |
| "grad_norm": 26.25, |
| "learning_rate": 7.510628247520075e-07, |
| "loss": 0.6695, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.04535790219702339, |
| "grad_norm": 16.75, |
| "learning_rate": 9.022201228153047e-07, |
| "loss": 0.6807, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.05291755256319395, |
| "grad_norm": 16.625, |
| "learning_rate": 1.0533774208786019e-06, |
| "loss": 0.672, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.06047720292936452, |
| "grad_norm": 11.9375, |
| "learning_rate": 1.204534718941899e-06, |
| "loss": 0.6028, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.06803685329553508, |
| "grad_norm": 17.125, |
| "learning_rate": 1.3556920170051963e-06, |
| "loss": 0.6022, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.07559650366170564, |
| "grad_norm": 13.625, |
| "learning_rate": 1.5068493150684932e-06, |
| "loss": 0.5719, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.08315615402787621, |
| "grad_norm": 13.5625, |
| "learning_rate": 1.6580066131317905e-06, |
| "loss": 0.5594, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.09071580439404678, |
| "grad_norm": 9.1875, |
| "learning_rate": 1.8091639111950876e-06, |
| "loss": 0.5216, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.09827545476021735, |
| "grad_norm": 13.6875, |
| "learning_rate": 1.9603212092583847e-06, |
| "loss": 0.5795, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.1058351051263879, |
| "grad_norm": 20.0, |
| "learning_rate": 2.1114785073216816e-06, |
| "loss": 0.4809, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.11339475549255847, |
| "grad_norm": 5.65625, |
| "learning_rate": 2.262635805384979e-06, |
| "loss": 0.5054, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.12095440585872903, |
| "grad_norm": 12.125, |
| "learning_rate": 2.4137931034482762e-06, |
| "loss": 0.512, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.1285140562248996, |
| "grad_norm": 8.0625, |
| "learning_rate": 2.5649504015115736e-06, |
| "loss": 0.5196, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.13607370659107015, |
| "grad_norm": 6.625, |
| "learning_rate": 2.7161076995748705e-06, |
| "loss": 0.4724, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.14363335695724072, |
| "grad_norm": 6.75, |
| "learning_rate": 2.8672649976381674e-06, |
| "loss": 0.4971, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.1511930073234113, |
| "grad_norm": 15.4375, |
| "learning_rate": 3.0184222957014647e-06, |
| "loss": 0.4902, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.15875265768958186, |
| "grad_norm": 9.8125, |
| "learning_rate": 3.1695795937647616e-06, |
| "loss": 0.4461, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.16631230805575242, |
| "grad_norm": 5.78125, |
| "learning_rate": 3.3207368918280585e-06, |
| "loss": 0.4681, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.173871958421923, |
| "grad_norm": 5.84375, |
| "learning_rate": 3.4718941898913562e-06, |
| "loss": 0.4747, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.18143160878809356, |
| "grad_norm": 15.875, |
| "learning_rate": 3.623051487954653e-06, |
| "loss": 0.4603, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.18899125915426412, |
| "grad_norm": 6.59375, |
| "learning_rate": 3.7742087860179504e-06, |
| "loss": 0.5077, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.1965509095204347, |
| "grad_norm": 14.3125, |
| "learning_rate": 3.925366084081247e-06, |
| "loss": 0.4793, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.20411055988660523, |
| "grad_norm": 8.6875, |
| "learning_rate": 4.076523382144545e-06, |
| "loss": 0.4397, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.2116702102527758, |
| "grad_norm": 6.96875, |
| "learning_rate": 4.227680680207842e-06, |
| "loss": 0.4632, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.21922986061894637, |
| "grad_norm": 7.6875, |
| "learning_rate": 4.3788379782711384e-06, |
| "loss": 0.4525, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.22678951098511693, |
| "grad_norm": 8.5, |
| "learning_rate": 4.529995276334436e-06, |
| "loss": 0.4575, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.2343491613512875, |
| "grad_norm": 9.75, |
| "learning_rate": 4.681152574397733e-06, |
| "loss": 0.4867, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.24190881171745807, |
| "grad_norm": 8.0625, |
| "learning_rate": 4.83230987246103e-06, |
| "loss": 0.4624, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.24946846208362863, |
| "grad_norm": 7.59375, |
| "learning_rate": 4.983467170524327e-06, |
| "loss": 0.4408, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.2570281124497992, |
| "grad_norm": 7.5625, |
| "learning_rate": 5.134624468587624e-06, |
| "loss": 0.46, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.26458776281596974, |
| "grad_norm": 8.1875, |
| "learning_rate": 5.2857817666509215e-06, |
| "loss": 0.4347, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.2721474131821403, |
| "grad_norm": 8.8125, |
| "learning_rate": 5.436939064714218e-06, |
| "loss": 0.4165, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.2797070635483109, |
| "grad_norm": 8.625, |
| "learning_rate": 5.588096362777515e-06, |
| "loss": 0.4455, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.28726671391448144, |
| "grad_norm": 8.4375, |
| "learning_rate": 5.7392536608408135e-06, |
| "loss": 0.4309, |
| "step": 1216 |
| }, |
| { |
| "epoch": 0.294826364280652, |
| "grad_norm": 7.84375, |
| "learning_rate": 5.89041095890411e-06, |
| "loss": 0.4558, |
| "step": 1248 |
| }, |
| { |
| "epoch": 0.3023860146468226, |
| "grad_norm": 13.0, |
| "learning_rate": 6.041568256967407e-06, |
| "loss": 0.4711, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.30994566501299314, |
| "grad_norm": 5.90625, |
| "learning_rate": 6.192725555030704e-06, |
| "loss": 0.4422, |
| "step": 1312 |
| }, |
| { |
| "epoch": 0.3175053153791637, |
| "grad_norm": 8.375, |
| "learning_rate": 6.343882853094001e-06, |
| "loss": 0.4616, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.3250649657453343, |
| "grad_norm": 7.28125, |
| "learning_rate": 6.495040151157299e-06, |
| "loss": 0.4529, |
| "step": 1376 |
| }, |
| { |
| "epoch": 0.33262461611150484, |
| "grad_norm": 6.375, |
| "learning_rate": 6.646197449220596e-06, |
| "loss": 0.4655, |
| "step": 1408 |
| }, |
| { |
| "epoch": 0.3401842664776754, |
| "grad_norm": 7.5625, |
| "learning_rate": 6.797354747283893e-06, |
| "loss": 0.4238, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.347743916843846, |
| "grad_norm": 10.25, |
| "learning_rate": 6.94851204534719e-06, |
| "loss": 0.4786, |
| "step": 1472 |
| }, |
| { |
| "epoch": 0.35530356721001655, |
| "grad_norm": 5.25, |
| "learning_rate": 7.099669343410487e-06, |
| "loss": 0.4443, |
| "step": 1504 |
| }, |
| { |
| "epoch": 0.3628632175761871, |
| "grad_norm": 14.0625, |
| "learning_rate": 7.250826641473784e-06, |
| "loss": 0.467, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.3704228679423577, |
| "grad_norm": 5.84375, |
| "learning_rate": 7.4019839395370815e-06, |
| "loss": 0.4281, |
| "step": 1568 |
| }, |
| { |
| "epoch": 0.37798251830852825, |
| "grad_norm": 7.4375, |
| "learning_rate": 7.553141237600379e-06, |
| "loss": 0.4198, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3855421686746988, |
| "grad_norm": 4.78125, |
| "learning_rate": 7.704298535663676e-06, |
| "loss": 0.4646, |
| "step": 1632 |
| }, |
| { |
| "epoch": 0.3931018190408694, |
| "grad_norm": 7.09375, |
| "learning_rate": 7.855455833726973e-06, |
| "loss": 0.4396, |
| "step": 1664 |
| }, |
| { |
| "epoch": 0.40066146940703995, |
| "grad_norm": 7.65625, |
| "learning_rate": 8.006613131790269e-06, |
| "loss": 0.4395, |
| "step": 1696 |
| }, |
| { |
| "epoch": 0.40822111977321046, |
| "grad_norm": 10.1875, |
| "learning_rate": 8.157770429853567e-06, |
| "loss": 0.4247, |
| "step": 1728 |
| }, |
| { |
| "epoch": 0.41578077013938103, |
| "grad_norm": 7.53125, |
| "learning_rate": 8.308927727916864e-06, |
| "loss": 0.4591, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.4233404205055516, |
| "grad_norm": 7.25, |
| "learning_rate": 8.460085025980162e-06, |
| "loss": 0.451, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.43090007087172216, |
| "grad_norm": 7.21875, |
| "learning_rate": 8.611242324043458e-06, |
| "loss": 0.4388, |
| "step": 1824 |
| }, |
| { |
| "epoch": 0.43845972123789273, |
| "grad_norm": 6.09375, |
| "learning_rate": 8.762399622106755e-06, |
| "loss": 0.4619, |
| "step": 1856 |
| }, |
| { |
| "epoch": 0.4460193716040633, |
| "grad_norm": 7.6875, |
| "learning_rate": 8.913556920170053e-06, |
| "loss": 0.4335, |
| "step": 1888 |
| }, |
| { |
| "epoch": 0.45357902197023386, |
| "grad_norm": 7.0625, |
| "learning_rate": 9.064714218233351e-06, |
| "loss": 0.4103, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.46113867233640443, |
| "grad_norm": 5.75, |
| "learning_rate": 9.215871516296648e-06, |
| "loss": 0.4254, |
| "step": 1952 |
| }, |
| { |
| "epoch": 0.468698322702575, |
| "grad_norm": 6.53125, |
| "learning_rate": 9.367028814359944e-06, |
| "loss": 0.4907, |
| "step": 1984 |
| }, |
| { |
| "epoch": 0.47625797306874557, |
| "grad_norm": 7.15625, |
| "learning_rate": 9.51818611242324e-06, |
| "loss": 0.449, |
| "step": 2016 |
| }, |
| { |
| "epoch": 0.48381762343491613, |
| "grad_norm": 5.71875, |
| "learning_rate": 9.669343410486539e-06, |
| "loss": 0.4554, |
| "step": 2048 |
| }, |
| { |
| "epoch": 0.4913772738010867, |
| "grad_norm": 9.125, |
| "learning_rate": 9.820500708549835e-06, |
| "loss": 0.43, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.49893692416725727, |
| "grad_norm": 4.65625, |
| "learning_rate": 9.971658006613133e-06, |
| "loss": 0.4262, |
| "step": 2112 |
| }, |
| { |
| "epoch": 0.5064965745334278, |
| "grad_norm": 9.375, |
| "learning_rate": 9.999954028675169e-06, |
| "loss": 0.437, |
| "step": 2144 |
| }, |
| { |
| "epoch": 0.5140562248995983, |
| "grad_norm": 7.71875, |
| "learning_rate": 9.999771232848482e-06, |
| "loss": 0.4294, |
| "step": 2176 |
| }, |
| { |
| "epoch": 0.5216158752657689, |
| "grad_norm": 6.84375, |
| "learning_rate": 9.999449169431064e-06, |
| "loss": 0.4498, |
| "step": 2208 |
| }, |
| { |
| "epoch": 0.5291755256319395, |
| "grad_norm": 8.0625, |
| "learning_rate": 9.998987847393924e-06, |
| "loss": 0.4379, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.53673517599811, |
| "grad_norm": 5.6875, |
| "learning_rate": 9.998387279587092e-06, |
| "loss": 0.4315, |
| "step": 2272 |
| }, |
| { |
| "epoch": 0.5442948263642806, |
| "grad_norm": 8.5, |
| "learning_rate": 9.99764748273926e-06, |
| "loss": 0.4309, |
| "step": 2304 |
| }, |
| { |
| "epoch": 0.5518544767304512, |
| "grad_norm": 5.71875, |
| "learning_rate": 9.996768477457317e-06, |
| "loss": 0.4405, |
| "step": 2336 |
| }, |
| { |
| "epoch": 0.5594141270966217, |
| "grad_norm": 7.65625, |
| "learning_rate": 9.99575028822577e-06, |
| "loss": 0.4381, |
| "step": 2368 |
| }, |
| { |
| "epoch": 0.5669737774627923, |
| "grad_norm": 4.59375, |
| "learning_rate": 9.994592943406071e-06, |
| "loss": 0.3963, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5745334278289629, |
| "grad_norm": 9.0, |
| "learning_rate": 9.993296475235821e-06, |
| "loss": 0.4307, |
| "step": 2432 |
| }, |
| { |
| "epoch": 0.5820930781951335, |
| "grad_norm": 6.15625, |
| "learning_rate": 9.991860919827869e-06, |
| "loss": 0.4298, |
| "step": 2464 |
| }, |
| { |
| "epoch": 0.589652728561304, |
| "grad_norm": 7.0625, |
| "learning_rate": 9.990286317169315e-06, |
| "loss": 0.4717, |
| "step": 2496 |
| }, |
| { |
| "epoch": 0.5972123789274746, |
| "grad_norm": 3.53125, |
| "learning_rate": 9.988572711120388e-06, |
| "loss": 0.4357, |
| "step": 2528 |
| }, |
| { |
| "epoch": 0.6047720292936452, |
| "grad_norm": 8.125, |
| "learning_rate": 9.986720149413232e-06, |
| "loss": 0.4583, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.6123316796598157, |
| "grad_norm": 6.6875, |
| "learning_rate": 9.984728683650566e-06, |
| "loss": 0.3911, |
| "step": 2592 |
| }, |
| { |
| "epoch": 0.6198913300259863, |
| "grad_norm": 6.8125, |
| "learning_rate": 9.982598369304259e-06, |
| "loss": 0.455, |
| "step": 2624 |
| }, |
| { |
| "epoch": 0.6274509803921569, |
| "grad_norm": 5.375, |
| "learning_rate": 9.980329265713772e-06, |
| "loss": 0.4316, |
| "step": 2656 |
| }, |
| { |
| "epoch": 0.6350106307583274, |
| "grad_norm": 6.59375, |
| "learning_rate": 9.977921436084517e-06, |
| "loss": 0.4341, |
| "step": 2688 |
| }, |
| { |
| "epoch": 0.642570281124498, |
| "grad_norm": 18.5, |
| "learning_rate": 9.975374947486086e-06, |
| "loss": 0.4523, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.6501299314906686, |
| "grad_norm": 4.59375, |
| "learning_rate": 9.972689870850396e-06, |
| "loss": 0.4188, |
| "step": 2752 |
| }, |
| { |
| "epoch": 0.6576895818568391, |
| "grad_norm": 5.75, |
| "learning_rate": 9.969866280969693e-06, |
| "loss": 0.4731, |
| "step": 2784 |
| }, |
| { |
| "epoch": 0.6652492322230097, |
| "grad_norm": 6.5, |
| "learning_rate": 9.966904256494494e-06, |
| "loss": 0.4347, |
| "step": 2816 |
| }, |
| { |
| "epoch": 0.6728088825891803, |
| "grad_norm": 5.125, |
| "learning_rate": 9.963803879931372e-06, |
| "loss": 0.4309, |
| "step": 2848 |
| }, |
| { |
| "epoch": 0.6803685329553508, |
| "grad_norm": 6.25, |
| "learning_rate": 9.960565237640679e-06, |
| "loss": 0.4262, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.6879281833215214, |
| "grad_norm": 7.03125, |
| "learning_rate": 9.957188419834115e-06, |
| "loss": 0.4044, |
| "step": 2912 |
| }, |
| { |
| "epoch": 0.695487833687692, |
| "grad_norm": 6.0625, |
| "learning_rate": 9.953673520572248e-06, |
| "loss": 0.418, |
| "step": 2944 |
| }, |
| { |
| "epoch": 0.7030474840538625, |
| "grad_norm": 3.953125, |
| "learning_rate": 9.950020637761863e-06, |
| "loss": 0.4171, |
| "step": 2976 |
| }, |
| { |
| "epoch": 0.7106071344200331, |
| "grad_norm": 7.3125, |
| "learning_rate": 9.946229873153257e-06, |
| "loss": 0.4638, |
| "step": 3008 |
| }, |
| { |
| "epoch": 0.7181667847862037, |
| "grad_norm": 7.375, |
| "learning_rate": 9.942301332337387e-06, |
| "loss": 0.4293, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.7257264351523742, |
| "grad_norm": 14.8125, |
| "learning_rate": 9.938235124742947e-06, |
| "loss": 0.4617, |
| "step": 3072 |
| }, |
| { |
| "epoch": 0.7332860855185448, |
| "grad_norm": 4.875, |
| "learning_rate": 9.934031363633306e-06, |
| "loss": 0.4218, |
| "step": 3104 |
| }, |
| { |
| "epoch": 0.7408457358847154, |
| "grad_norm": 6.875, |
| "learning_rate": 9.929690166103354e-06, |
| "loss": 0.4406, |
| "step": 3136 |
| }, |
| { |
| "epoch": 0.7484053862508859, |
| "grad_norm": 7.8125, |
| "learning_rate": 9.925211653076251e-06, |
| "loss": 0.4416, |
| "step": 3168 |
| }, |
| { |
| "epoch": 0.7559650366170565, |
| "grad_norm": 5.84375, |
| "learning_rate": 9.920595949300049e-06, |
| "loss": 0.454, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7635246869832271, |
| "grad_norm": 7.5, |
| "learning_rate": 9.915843183344215e-06, |
| "loss": 0.4099, |
| "step": 3232 |
| }, |
| { |
| "epoch": 0.7710843373493976, |
| "grad_norm": 6.8125, |
| "learning_rate": 9.910953487596066e-06, |
| "loss": 0.3762, |
| "step": 3264 |
| }, |
| { |
| "epoch": 0.7786439877155682, |
| "grad_norm": 5.40625, |
| "learning_rate": 9.905926998257057e-06, |
| "loss": 0.424, |
| "step": 3296 |
| }, |
| { |
| "epoch": 0.7862036380817388, |
| "grad_norm": 7.78125, |
| "learning_rate": 9.900763855339009e-06, |
| "loss": 0.4663, |
| "step": 3328 |
| }, |
| { |
| "epoch": 0.7937632884479093, |
| "grad_norm": 5.5, |
| "learning_rate": 9.895464202660195e-06, |
| "loss": 0.4758, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.8013229388140799, |
| "grad_norm": 10.0625, |
| "learning_rate": 9.890028187841343e-06, |
| "loss": 0.4024, |
| "step": 3392 |
| }, |
| { |
| "epoch": 0.8088825891802505, |
| "grad_norm": 8.5625, |
| "learning_rate": 9.88445596230152e-06, |
| "loss": 0.4461, |
| "step": 3424 |
| }, |
| { |
| "epoch": 0.8164422395464209, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.878747681253908e-06, |
| "loss": 0.4429, |
| "step": 3456 |
| }, |
| { |
| "epoch": 0.8240018899125915, |
| "grad_norm": 4.71875, |
| "learning_rate": 9.872903503701495e-06, |
| "loss": 0.4333, |
| "step": 3488 |
| }, |
| { |
| "epoch": 0.8315615402787621, |
| "grad_norm": 6.375, |
| "learning_rate": 9.866923592432633e-06, |
| "loss": 0.4168, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.8391211906449326, |
| "grad_norm": 23.25, |
| "learning_rate": 9.860808114016512e-06, |
| "loss": 0.4475, |
| "step": 3552 |
| }, |
| { |
| "epoch": 0.8466808410111032, |
| "grad_norm": 11.9375, |
| "learning_rate": 9.854557238798515e-06, |
| "loss": 0.4458, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.8542404913772738, |
| "grad_norm": 6.84375, |
| "learning_rate": 9.848171140895471e-06, |
| "loss": 0.4405, |
| "step": 3616 |
| }, |
| { |
| "epoch": 0.8618001417434443, |
| "grad_norm": 5.625, |
| "learning_rate": 9.841649998190818e-06, |
| "loss": 0.4059, |
| "step": 3648 |
| }, |
| { |
| "epoch": 0.8693597921096149, |
| "grad_norm": 4.34375, |
| "learning_rate": 9.834993992329629e-06, |
| "loss": 0.4179, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.8769194424757855, |
| "grad_norm": 7.34375, |
| "learning_rate": 9.828203308713568e-06, |
| "loss": 0.4298, |
| "step": 3712 |
| }, |
| { |
| "epoch": 0.884479092841956, |
| "grad_norm": 5.9375, |
| "learning_rate": 9.821278136495722e-06, |
| "loss": 0.4098, |
| "step": 3744 |
| }, |
| { |
| "epoch": 0.8920387432081266, |
| "grad_norm": 5.0625, |
| "learning_rate": 9.814218668575322e-06, |
| "loss": 0.3931, |
| "step": 3776 |
| }, |
| { |
| "epoch": 0.8995983935742972, |
| "grad_norm": 7.65625, |
| "learning_rate": 9.807025101592388e-06, |
| "loss": 0.4239, |
| "step": 3808 |
| }, |
| { |
| "epoch": 0.9071580439404677, |
| "grad_norm": 6.78125, |
| "learning_rate": 9.79969763592223e-06, |
| "loss": 0.4136, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.9147176943066383, |
| "grad_norm": 5.875, |
| "learning_rate": 9.792236475669889e-06, |
| "loss": 0.4549, |
| "step": 3872 |
| }, |
| { |
| "epoch": 0.9222773446728089, |
| "grad_norm": 7.75, |
| "learning_rate": 9.784641828664435e-06, |
| "loss": 0.4433, |
| "step": 3904 |
| }, |
| { |
| "epoch": 0.9298369950389794, |
| "grad_norm": 8.125, |
| "learning_rate": 9.776913906453184e-06, |
| "loss": 0.4139, |
| "step": 3936 |
| }, |
| { |
| "epoch": 0.93739664540515, |
| "grad_norm": 6.75, |
| "learning_rate": 9.76905292429581e-06, |
| "loss": 0.4514, |
| "step": 3968 |
| }, |
| { |
| "epoch": 0.9449562957713206, |
| "grad_norm": 5.65625, |
| "learning_rate": 9.76105910115834e-06, |
| "loss": 0.4389, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9525159461374911, |
| "grad_norm": 6.875, |
| "learning_rate": 9.752932659707054e-06, |
| "loss": 0.4219, |
| "step": 4032 |
| }, |
| { |
| "epoch": 0.9600755965036617, |
| "grad_norm": 8.75, |
| "learning_rate": 9.7446738263023e-06, |
| "loss": 0.4214, |
| "step": 4064 |
| }, |
| { |
| "epoch": 0.9676352468698323, |
| "grad_norm": 5.1875, |
| "learning_rate": 9.736282830992165e-06, |
| "loss": 0.4179, |
| "step": 4096 |
| }, |
| { |
| "epoch": 0.9751948972360028, |
| "grad_norm": 5.90625, |
| "learning_rate": 9.727759907506085e-06, |
| "loss": 0.4156, |
| "step": 4128 |
| }, |
| { |
| "epoch": 0.9827545476021734, |
| "grad_norm": 7.3125, |
| "learning_rate": 9.719105293248327e-06, |
| "loss": 0.4292, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.990314197968344, |
| "grad_norm": 5.375, |
| "learning_rate": 9.710319229291374e-06, |
| "loss": 0.3681, |
| "step": 4192 |
| }, |
| { |
| "epoch": 0.9978738483345145, |
| "grad_norm": 5.5, |
| "learning_rate": 9.701401960369218e-06, |
| "loss": 0.4233, |
| "step": 4224 |
| }, |
| { |
| "epoch": 1.005433498700685, |
| "grad_norm": 6.59375, |
| "learning_rate": 9.692353734870532e-06, |
| "loss": 0.41, |
| "step": 4256 |
| }, |
| { |
| "epoch": 1.0129931490668556, |
| "grad_norm": 8.25, |
| "learning_rate": 9.683174804831763e-06, |
| "loss": 0.396, |
| "step": 4288 |
| }, |
| { |
| "epoch": 1.0205527994330261, |
| "grad_norm": 7.09375, |
| "learning_rate": 9.673865425930104e-06, |
| "loss": 0.4012, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.0281124497991967, |
| "grad_norm": 6.53125, |
| "learning_rate": 9.66442585747637e-06, |
| "loss": 0.348, |
| "step": 4352 |
| }, |
| { |
| "epoch": 1.0356721001653673, |
| "grad_norm": 6.75, |
| "learning_rate": 9.654856362407787e-06, |
| "loss": 0.4068, |
| "step": 4384 |
| }, |
| { |
| "epoch": 1.0432317505315378, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.645157207280652e-06, |
| "loss": 0.3564, |
| "step": 4416 |
| }, |
| { |
| "epoch": 1.0507914008977084, |
| "grad_norm": 6.65625, |
| "learning_rate": 9.635328662262922e-06, |
| "loss": 0.4081, |
| "step": 4448 |
| }, |
| { |
| "epoch": 1.058351051263879, |
| "grad_norm": 6.84375, |
| "learning_rate": 9.625371001126678e-06, |
| "loss": 0.388, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.0659107016300495, |
| "grad_norm": 7.6875, |
| "learning_rate": 9.615284501240505e-06, |
| "loss": 0.3767, |
| "step": 4512 |
| }, |
| { |
| "epoch": 1.07347035199622, |
| "grad_norm": 9.4375, |
| "learning_rate": 9.605069443561768e-06, |
| "loss": 0.3913, |
| "step": 4544 |
| }, |
| { |
| "epoch": 1.0810300023623907, |
| "grad_norm": 9.25, |
| "learning_rate": 9.594726112628781e-06, |
| "loss": 0.4482, |
| "step": 4576 |
| }, |
| { |
| "epoch": 1.0885896527285612, |
| "grad_norm": 7.34375, |
| "learning_rate": 9.584254796552877e-06, |
| "loss": 0.3863, |
| "step": 4608 |
| }, |
| { |
| "epoch": 1.0961493030947318, |
| "grad_norm": 5.4375, |
| "learning_rate": 9.573655787010397e-06, |
| "loss": 0.3993, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.1037089534609024, |
| "grad_norm": 5.5, |
| "learning_rate": 9.562929379234554e-06, |
| "loss": 0.4405, |
| "step": 4672 |
| }, |
| { |
| "epoch": 1.111268603827073, |
| "grad_norm": 9.5, |
| "learning_rate": 9.55207587200721e-06, |
| "loss": 0.4459, |
| "step": 4704 |
| }, |
| { |
| "epoch": 1.1188282541932435, |
| "grad_norm": 6.25, |
| "learning_rate": 9.541095567650558e-06, |
| "loss": 0.4266, |
| "step": 4736 |
| }, |
| { |
| "epoch": 1.126387904559414, |
| "grad_norm": 7.3125, |
| "learning_rate": 9.529988772018699e-06, |
| "loss": 0.3956, |
| "step": 4768 |
| }, |
| { |
| "epoch": 1.1339475549255846, |
| "grad_norm": 4.46875, |
| "learning_rate": 9.518755794489123e-06, |
| "loss": 0.4003, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.1415072052917552, |
| "grad_norm": 4.5, |
| "learning_rate": 9.507396947954086e-06, |
| "loss": 0.3983, |
| "step": 4832 |
| }, |
| { |
| "epoch": 1.1490668556579258, |
| "grad_norm": 7.84375, |
| "learning_rate": 9.495912548811908e-06, |
| "loss": 0.4201, |
| "step": 4864 |
| }, |
| { |
| "epoch": 1.1566265060240963, |
| "grad_norm": 6.84375, |
| "learning_rate": 9.48430291695814e-06, |
| "loss": 0.4136, |
| "step": 4896 |
| }, |
| { |
| "epoch": 1.164186156390267, |
| "grad_norm": 5.15625, |
| "learning_rate": 9.472568375776669e-06, |
| "loss": 0.4123, |
| "step": 4928 |
| }, |
| { |
| "epoch": 1.1717458067564375, |
| "grad_norm": 4.96875, |
| "learning_rate": 9.46070925213071e-06, |
| "loss": 0.4157, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.179305457122608, |
| "grad_norm": 8.5, |
| "learning_rate": 9.448725876353692e-06, |
| "loss": 0.3889, |
| "step": 4992 |
| }, |
| { |
| "epoch": 1.1868651074887786, |
| "grad_norm": 5.34375, |
| "learning_rate": 9.43661858224006e-06, |
| "loss": 0.3872, |
| "step": 5024 |
| }, |
| { |
| "epoch": 1.1944247578549492, |
| "grad_norm": 4.84375, |
| "learning_rate": 9.42438770703598e-06, |
| "loss": 0.4089, |
| "step": 5056 |
| }, |
| { |
| "epoch": 1.2019844082211197, |
| "grad_norm": 5.96875, |
| "learning_rate": 9.412033591429947e-06, |
| "loss": 0.4128, |
| "step": 5088 |
| }, |
| { |
| "epoch": 1.2095440585872903, |
| "grad_norm": 6.5, |
| "learning_rate": 9.399556579543285e-06, |
| "loss": 0.4154, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.2171037089534609, |
| "grad_norm": 4.5625, |
| "learning_rate": 9.386957018920576e-06, |
| "loss": 0.3826, |
| "step": 5152 |
| }, |
| { |
| "epoch": 1.2246633593196314, |
| "grad_norm": 5.15625, |
| "learning_rate": 9.374235260519967e-06, |
| "loss": 0.3957, |
| "step": 5184 |
| }, |
| { |
| "epoch": 1.232223009685802, |
| "grad_norm": 6.1875, |
| "learning_rate": 9.361391658703396e-06, |
| "loss": 0.3757, |
| "step": 5216 |
| }, |
| { |
| "epoch": 1.2397826600519726, |
| "grad_norm": 5.75, |
| "learning_rate": 9.348426571226732e-06, |
| "loss": 0.4287, |
| "step": 5248 |
| }, |
| { |
| "epoch": 1.2473423104181431, |
| "grad_norm": 4.65625, |
| "learning_rate": 9.335340359229798e-06, |
| "loss": 0.4172, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.2549019607843137, |
| "grad_norm": 6.40625, |
| "learning_rate": 9.322133387226313e-06, |
| "loss": 0.3773, |
| "step": 5312 |
| }, |
| { |
| "epoch": 1.2624616111504843, |
| "grad_norm": 4.6875, |
| "learning_rate": 9.308806023093745e-06, |
| "loss": 0.4368, |
| "step": 5344 |
| }, |
| { |
| "epoch": 1.2700212615166548, |
| "grad_norm": 4.125, |
| "learning_rate": 9.295358638063054e-06, |
| "loss": 0.393, |
| "step": 5376 |
| }, |
| { |
| "epoch": 1.2775809118828254, |
| "grad_norm": 4.875, |
| "learning_rate": 9.281791606708365e-06, |
| "loss": 0.3973, |
| "step": 5408 |
| }, |
| { |
| "epoch": 1.285140562248996, |
| "grad_norm": 8.75, |
| "learning_rate": 9.268105306936521e-06, |
| "loss": 0.3701, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.2927002126151665, |
| "grad_norm": 6.0, |
| "learning_rate": 9.254300119976564e-06, |
| "loss": 0.4084, |
| "step": 5472 |
| }, |
| { |
| "epoch": 1.3002598629813371, |
| "grad_norm": 4.5625, |
| "learning_rate": 9.240376430369114e-06, |
| "loss": 0.3885, |
| "step": 5504 |
| }, |
| { |
| "epoch": 1.3078195133475077, |
| "grad_norm": 6.375, |
| "learning_rate": 9.226334625955655e-06, |
| "loss": 0.3864, |
| "step": 5536 |
| }, |
| { |
| "epoch": 1.3153791637136782, |
| "grad_norm": 8.3125, |
| "learning_rate": 9.212175097867738e-06, |
| "loss": 0.4363, |
| "step": 5568 |
| }, |
| { |
| "epoch": 1.3229388140798488, |
| "grad_norm": 6.15625, |
| "learning_rate": 9.197898240516083e-06, |
| "loss": 0.424, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.3304984644460194, |
| "grad_norm": 6.03125, |
| "learning_rate": 9.183504451579587e-06, |
| "loss": 0.4084, |
| "step": 5632 |
| }, |
| { |
| "epoch": 1.33805811481219, |
| "grad_norm": 6.28125, |
| "learning_rate": 9.168994131994257e-06, |
| "loss": 0.3426, |
| "step": 5664 |
| }, |
| { |
| "epoch": 1.3456177651783605, |
| "grad_norm": 6.0625, |
| "learning_rate": 9.154367685942039e-06, |
| "loss": 0.422, |
| "step": 5696 |
| }, |
| { |
| "epoch": 1.353177415544531, |
| "grad_norm": 4.3125, |
| "learning_rate": 9.139625520839548e-06, |
| "loss": 0.388, |
| "step": 5728 |
| }, |
| { |
| "epoch": 1.3607370659107016, |
| "grad_norm": 3.65625, |
| "learning_rate": 9.12476804732674e-06, |
| "loss": 0.4255, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.3682967162768722, |
| "grad_norm": 5.1875, |
| "learning_rate": 9.109795679255455e-06, |
| "loss": 0.3975, |
| "step": 5792 |
| }, |
| { |
| "epoch": 1.3758563666430428, |
| "grad_norm": 9.5, |
| "learning_rate": 9.094708833677904e-06, |
| "loss": 0.3914, |
| "step": 5824 |
| }, |
| { |
| "epoch": 1.3834160170092134, |
| "grad_norm": 5.28125, |
| "learning_rate": 9.079507930835039e-06, |
| "loss": 0.3943, |
| "step": 5856 |
| }, |
| { |
| "epoch": 1.390975667375384, |
| "grad_norm": 6.09375, |
| "learning_rate": 9.064193394144857e-06, |
| "loss": 0.3909, |
| "step": 5888 |
| }, |
| { |
| "epoch": 1.3985353177415545, |
| "grad_norm": 6.84375, |
| "learning_rate": 9.048765650190601e-06, |
| "loss": 0.3835, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.406094968107725, |
| "grad_norm": 4.59375, |
| "learning_rate": 9.033225128708877e-06, |
| "loss": 0.4175, |
| "step": 5952 |
| }, |
| { |
| "epoch": 1.4136546184738956, |
| "grad_norm": 6.65625, |
| "learning_rate": 9.017572262577691e-06, |
| "loss": 0.4212, |
| "step": 5984 |
| }, |
| { |
| "epoch": 1.4212142688400662, |
| "grad_norm": 5.4375, |
| "learning_rate": 9.001807487804384e-06, |
| "loss": 0.4079, |
| "step": 6016 |
| }, |
| { |
| "epoch": 1.4287739192062368, |
| "grad_norm": 6.40625, |
| "learning_rate": 8.985931243513481e-06, |
| "loss": 0.373, |
| "step": 6048 |
| }, |
| { |
| "epoch": 1.4363335695724073, |
| "grad_norm": 7.40625, |
| "learning_rate": 8.96994397193448e-06, |
| "loss": 0.4122, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.4438932199385779, |
| "grad_norm": 4.96875, |
| "learning_rate": 8.953846118389514e-06, |
| "loss": 0.4034, |
| "step": 6112 |
| }, |
| { |
| "epoch": 1.4514528703047485, |
| "grad_norm": 5.53125, |
| "learning_rate": 8.937638131280952e-06, |
| "loss": 0.4034, |
| "step": 6144 |
| }, |
| { |
| "epoch": 1.459012520670919, |
| "grad_norm": 5.625, |
| "learning_rate": 8.921320462078916e-06, |
| "loss": 0.3862, |
| "step": 6176 |
| }, |
| { |
| "epoch": 1.4665721710370896, |
| "grad_norm": 14.25, |
| "learning_rate": 8.904893565308697e-06, |
| "loss": 0.4192, |
| "step": 6208 |
| }, |
| { |
| "epoch": 1.4741318214032602, |
| "grad_norm": 6.9375, |
| "learning_rate": 8.888357898538095e-06, |
| "loss": 0.3923, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.4816914717694307, |
| "grad_norm": 9.25, |
| "learning_rate": 8.871713922364684e-06, |
| "loss": 0.4096, |
| "step": 6272 |
| }, |
| { |
| "epoch": 1.4892511221356013, |
| "grad_norm": 7.65625, |
| "learning_rate": 8.854962100402962e-06, |
| "loss": 0.3838, |
| "step": 6304 |
| }, |
| { |
| "epoch": 1.4968107725017719, |
| "grad_norm": 7.9375, |
| "learning_rate": 8.83810289927146e-06, |
| "loss": 0.4177, |
| "step": 6336 |
| }, |
| { |
| "epoch": 1.5043704228679422, |
| "grad_norm": 7.9375, |
| "learning_rate": 8.821136788579725e-06, |
| "loss": 0.3896, |
| "step": 6368 |
| }, |
| { |
| "epoch": 1.511930073234113, |
| "grad_norm": 7.0625, |
| "learning_rate": 8.804064240915253e-06, |
| "loss": 0.4424, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.5194897236002833, |
| "grad_norm": 6.21875, |
| "learning_rate": 8.786885731830322e-06, |
| "loss": 0.3924, |
| "step": 6432 |
| }, |
| { |
| "epoch": 1.5270493739664541, |
| "grad_norm": 8.625, |
| "learning_rate": 8.769601739828735e-06, |
| "loss": 0.4056, |
| "step": 6464 |
| }, |
| { |
| "epoch": 1.5346090243326245, |
| "grad_norm": 4.9375, |
| "learning_rate": 8.752212746352506e-06, |
| "loss": 0.3844, |
| "step": 6496 |
| }, |
| { |
| "epoch": 1.5421686746987953, |
| "grad_norm": 6.78125, |
| "learning_rate": 8.734719235768441e-06, |
| "loss": 0.3959, |
| "step": 6528 |
| }, |
| { |
| "epoch": 1.5497283250649656, |
| "grad_norm": 5.5, |
| "learning_rate": 8.717121695354651e-06, |
| "loss": 0.3885, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.5572879754311364, |
| "grad_norm": 4.8125, |
| "learning_rate": 8.699420615286974e-06, |
| "loss": 0.4049, |
| "step": 6592 |
| }, |
| { |
| "epoch": 1.5648476257973067, |
| "grad_norm": 6.21875, |
| "learning_rate": 8.681616488625323e-06, |
| "loss": 0.3783, |
| "step": 6624 |
| }, |
| { |
| "epoch": 1.5724072761634775, |
| "grad_norm": 6.90625, |
| "learning_rate": 8.663709811299954e-06, |
| "loss": 0.397, |
| "step": 6656 |
| }, |
| { |
| "epoch": 1.5799669265296479, |
| "grad_norm": 7.1875, |
| "learning_rate": 8.64570108209765e-06, |
| "loss": 0.415, |
| "step": 6688 |
| }, |
| { |
| "epoch": 1.5875265768958187, |
| "grad_norm": 5.4375, |
| "learning_rate": 8.627590802647829e-06, |
| "loss": 0.4219, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.595086227261989, |
| "grad_norm": 5.1875, |
| "learning_rate": 8.609379477408569e-06, |
| "loss": 0.3725, |
| "step": 6752 |
| }, |
| { |
| "epoch": 1.6026458776281598, |
| "grad_norm": 7.25, |
| "learning_rate": 8.591067613652552e-06, |
| "loss": 0.4042, |
| "step": 6784 |
| }, |
| { |
| "epoch": 1.6102055279943301, |
| "grad_norm": 5.5, |
| "learning_rate": 8.572655721452954e-06, |
| "loss": 0.4092, |
| "step": 6816 |
| }, |
| { |
| "epoch": 1.617765178360501, |
| "grad_norm": 7.4375, |
| "learning_rate": 8.554144313669208e-06, |
| "loss": 0.4191, |
| "step": 6848 |
| }, |
| { |
| "epoch": 1.6253248287266713, |
| "grad_norm": 7.0625, |
| "learning_rate": 8.535533905932739e-06, |
| "loss": 0.4391, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.632884479092842, |
| "grad_norm": 6.625, |
| "learning_rate": 8.516825016632594e-06, |
| "loss": 0.3896, |
| "step": 6912 |
| }, |
| { |
| "epoch": 1.6404441294590124, |
| "grad_norm": 5.28125, |
| "learning_rate": 8.498018166901008e-06, |
| "loss": 0.3774, |
| "step": 6944 |
| }, |
| { |
| "epoch": 1.6480037798251832, |
| "grad_norm": 6.1875, |
| "learning_rate": 8.479113880598875e-06, |
| "loss": 0.4245, |
| "step": 6976 |
| }, |
| { |
| "epoch": 1.6555634301913535, |
| "grad_norm": 5.5625, |
| "learning_rate": 8.460112684301172e-06, |
| "loss": 0.4239, |
| "step": 7008 |
| }, |
| { |
| "epoch": 1.6631230805575243, |
| "grad_norm": 6.71875, |
| "learning_rate": 8.441015107282281e-06, |
| "loss": 0.4013, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.6706827309236947, |
| "grad_norm": 3.65625, |
| "learning_rate": 8.421821681501248e-06, |
| "loss": 0.3926, |
| "step": 7072 |
| }, |
| { |
| "epoch": 1.6782423812898655, |
| "grad_norm": 5.40625, |
| "learning_rate": 8.402532941586968e-06, |
| "loss": 0.3848, |
| "step": 7104 |
| }, |
| { |
| "epoch": 1.6858020316560358, |
| "grad_norm": 7.125, |
| "learning_rate": 8.38314942482329e-06, |
| "loss": 0.3846, |
| "step": 7136 |
| }, |
| { |
| "epoch": 1.6933616820222066, |
| "grad_norm": 5.6875, |
| "learning_rate": 8.363671671134053e-06, |
| "loss": 0.4196, |
| "step": 7168 |
| }, |
| { |
| "epoch": 1.700921332388377, |
| "grad_norm": 4.03125, |
| "learning_rate": 8.344100223068048e-06, |
| "loss": 0.3903, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.7084809827545477, |
| "grad_norm": 5.78125, |
| "learning_rate": 8.324435625783901e-06, |
| "loss": 0.3984, |
| "step": 7232 |
| }, |
| { |
| "epoch": 1.716040633120718, |
| "grad_norm": 5.40625, |
| "learning_rate": 8.304678427034891e-06, |
| "loss": 0.4324, |
| "step": 7264 |
| }, |
| { |
| "epoch": 1.7236002834868889, |
| "grad_norm": 7.09375, |
| "learning_rate": 8.28482917715369e-06, |
| "loss": 0.3662, |
| "step": 7296 |
| }, |
| { |
| "epoch": 1.7311599338530592, |
| "grad_norm": 7.0, |
| "learning_rate": 8.26488842903704e-06, |
| "loss": 0.3834, |
| "step": 7328 |
| }, |
| { |
| "epoch": 1.73871958421923, |
| "grad_norm": 7.25, |
| "learning_rate": 8.244856738130339e-06, |
| "loss": 0.4279, |
| "step": 7360 |
| }, |
| { |
| "epoch": 1.7462792345854004, |
| "grad_norm": 4.5625, |
| "learning_rate": 8.224734662412187e-06, |
| "loss": 0.405, |
| "step": 7392 |
| }, |
| { |
| "epoch": 1.7538388849515711, |
| "grad_norm": 7.1875, |
| "learning_rate": 8.204522762378829e-06, |
| "loss": 0.4181, |
| "step": 7424 |
| }, |
| { |
| "epoch": 1.7613985353177415, |
| "grad_norm": 7.15625, |
| "learning_rate": 8.184221601028546e-06, |
| "loss": 0.4179, |
| "step": 7456 |
| }, |
| { |
| "epoch": 1.7689581856839123, |
| "grad_norm": 6.3125, |
| "learning_rate": 8.16383174384598e-06, |
| "loss": 0.3963, |
| "step": 7488 |
| }, |
| { |
| "epoch": 1.7765178360500826, |
| "grad_norm": 6.28125, |
| "learning_rate": 8.143353758786372e-06, |
| "loss": 0.4346, |
| "step": 7520 |
| }, |
| { |
| "epoch": 1.7840774864162534, |
| "grad_norm": 6.0625, |
| "learning_rate": 8.12278821625975e-06, |
| "loss": 0.3891, |
| "step": 7552 |
| }, |
| { |
| "epoch": 1.7916371367824238, |
| "grad_norm": 7.21875, |
| "learning_rate": 8.102135689115036e-06, |
| "loss": 0.3684, |
| "step": 7584 |
| }, |
| { |
| "epoch": 1.7991967871485943, |
| "grad_norm": 5.84375, |
| "learning_rate": 8.081396752624087e-06, |
| "loss": 0.3718, |
| "step": 7616 |
| }, |
| { |
| "epoch": 1.806756437514765, |
| "grad_norm": 7.90625, |
| "learning_rate": 8.060571984465679e-06, |
| "loss": 0.4179, |
| "step": 7648 |
| }, |
| { |
| "epoch": 1.8143160878809355, |
| "grad_norm": 11.875, |
| "learning_rate": 8.039661964709414e-06, |
| "loss": 0.4095, |
| "step": 7680 |
| }, |
| { |
| "epoch": 1.821875738247106, |
| "grad_norm": 15.0, |
| "learning_rate": 8.018667275799552e-06, |
| "loss": 0.3964, |
| "step": 7712 |
| }, |
| { |
| "epoch": 1.8294353886132766, |
| "grad_norm": 6.34375, |
| "learning_rate": 7.997588502538796e-06, |
| "loss": 0.3525, |
| "step": 7744 |
| }, |
| { |
| "epoch": 1.8369950389794472, |
| "grad_norm": 6.3125, |
| "learning_rate": 7.976426232072008e-06, |
| "loss": 0.3667, |
| "step": 7776 |
| }, |
| { |
| "epoch": 1.8445546893456177, |
| "grad_norm": 6.6875, |
| "learning_rate": 7.955181053869841e-06, |
| "loss": 0.3845, |
| "step": 7808 |
| }, |
| { |
| "epoch": 1.8521143397117883, |
| "grad_norm": 6.59375, |
| "learning_rate": 7.933853559712328e-06, |
| "loss": 0.416, |
| "step": 7840 |
| }, |
| { |
| "epoch": 1.8596739900779589, |
| "grad_norm": 5.40625, |
| "learning_rate": 7.912444343672395e-06, |
| "loss": 0.3795, |
| "step": 7872 |
| }, |
| { |
| "epoch": 1.8672336404441294, |
| "grad_norm": 5.96875, |
| "learning_rate": 7.890954002099312e-06, |
| "loss": 0.4025, |
| "step": 7904 |
| }, |
| { |
| "epoch": 1.8747932908103, |
| "grad_norm": 6.5, |
| "learning_rate": 7.869383133602091e-06, |
| "loss": 0.4047, |
| "step": 7936 |
| }, |
| { |
| "epoch": 1.8823529411764706, |
| "grad_norm": 5.875, |
| "learning_rate": 7.847732339032796e-06, |
| "loss": 0.4168, |
| "step": 7968 |
| }, |
| { |
| "epoch": 1.8899125915426411, |
| "grad_norm": 7.46875, |
| "learning_rate": 7.826002221469822e-06, |
| "loss": 0.374, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.8974722419088117, |
| "grad_norm": 6.0625, |
| "learning_rate": 7.804193386201086e-06, |
| "loss": 0.3925, |
| "step": 8032 |
| }, |
| { |
| "epoch": 1.9050318922749823, |
| "grad_norm": 5.21875, |
| "learning_rate": 7.782306440707173e-06, |
| "loss": 0.4083, |
| "step": 8064 |
| }, |
| { |
| "epoch": 1.9125915426411528, |
| "grad_norm": 5.0, |
| "learning_rate": 7.760341994644406e-06, |
| "loss": 0.3894, |
| "step": 8096 |
| }, |
| { |
| "epoch": 1.9201511930073234, |
| "grad_norm": 5.34375, |
| "learning_rate": 7.738300659827878e-06, |
| "loss": 0.3491, |
| "step": 8128 |
| }, |
| { |
| "epoch": 1.927710843373494, |
| "grad_norm": 5.8125, |
| "learning_rate": 7.7161830502144e-06, |
| "loss": 0.4238, |
| "step": 8160 |
| }, |
| { |
| "epoch": 1.9352704937396645, |
| "grad_norm": 4.84375, |
| "learning_rate": 7.693989781885396e-06, |
| "loss": 0.4313, |
| "step": 8192 |
| }, |
| { |
| "epoch": 1.942830144105835, |
| "grad_norm": 6.25, |
| "learning_rate": 7.671721473029756e-06, |
| "loss": 0.4254, |
| "step": 8224 |
| }, |
| { |
| "epoch": 1.9503897944720057, |
| "grad_norm": 9.0, |
| "learning_rate": 7.649378743926603e-06, |
| "loss": 0.4327, |
| "step": 8256 |
| }, |
| { |
| "epoch": 1.9579494448381762, |
| "grad_norm": 6.21875, |
| "learning_rate": 7.626962216928025e-06, |
| "loss": 0.4143, |
| "step": 8288 |
| }, |
| { |
| "epoch": 1.9655090952043468, |
| "grad_norm": 9.1875, |
| "learning_rate": 7.60447251644173e-06, |
| "loss": 0.417, |
| "step": 8320 |
| }, |
| { |
| "epoch": 1.9730687455705174, |
| "grad_norm": 8.1875, |
| "learning_rate": 7.58191026891366e-06, |
| "loss": 0.399, |
| "step": 8352 |
| }, |
| { |
| "epoch": 1.980628395936688, |
| "grad_norm": 5.90625, |
| "learning_rate": 7.559276102810541e-06, |
| "loss": 0.3637, |
| "step": 8384 |
| }, |
| { |
| "epoch": 1.9881880463028585, |
| "grad_norm": 7.28125, |
| "learning_rate": 7.536570648602377e-06, |
| "loss": 0.3907, |
| "step": 8416 |
| }, |
| { |
| "epoch": 1.995747696669029, |
| "grad_norm": 3.78125, |
| "learning_rate": 7.513794538744885e-06, |
| "loss": 0.3775, |
| "step": 8448 |
| }, |
| { |
| "epoch": 2.0033073470351996, |
| "grad_norm": 6.0625, |
| "learning_rate": 7.49094840766188e-06, |
| "loss": 0.3695, |
| "step": 8480 |
| }, |
| { |
| "epoch": 2.01086699740137, |
| "grad_norm": 4.0625, |
| "learning_rate": 7.468032891727606e-06, |
| "loss": 0.3548, |
| "step": 8512 |
| }, |
| { |
| "epoch": 2.0184266477675408, |
| "grad_norm": 4.8125, |
| "learning_rate": 7.445048629249007e-06, |
| "loss": 0.3596, |
| "step": 8544 |
| }, |
| { |
| "epoch": 2.025986298133711, |
| "grad_norm": 5.21875, |
| "learning_rate": 7.421996260447948e-06, |
| "loss": 0.3741, |
| "step": 8576 |
| }, |
| { |
| "epoch": 2.033545948499882, |
| "grad_norm": 4.625, |
| "learning_rate": 7.398876427443379e-06, |
| "loss": 0.4047, |
| "step": 8608 |
| }, |
| { |
| "epoch": 2.0411055988660523, |
| "grad_norm": 5.34375, |
| "learning_rate": 7.375689774233453e-06, |
| "loss": 0.3667, |
| "step": 8640 |
| }, |
| { |
| "epoch": 2.048665249232223, |
| "grad_norm": 6.40625, |
| "learning_rate": 7.352436946677589e-06, |
| "loss": 0.3425, |
| "step": 8672 |
| }, |
| { |
| "epoch": 2.0562248995983934, |
| "grad_norm": 8.25, |
| "learning_rate": 7.329118592478473e-06, |
| "loss": 0.3651, |
| "step": 8704 |
| }, |
| { |
| "epoch": 2.063784549964564, |
| "grad_norm": 4.1875, |
| "learning_rate": 7.305735361164028e-06, |
| "loss": 0.3707, |
| "step": 8736 |
| }, |
| { |
| "epoch": 2.0713442003307345, |
| "grad_norm": 9.375, |
| "learning_rate": 7.282287904069308e-06, |
| "loss": 0.406, |
| "step": 8768 |
| }, |
| { |
| "epoch": 2.0789038506969053, |
| "grad_norm": 4.71875, |
| "learning_rate": 7.258776874318371e-06, |
| "loss": 0.3535, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.0864635010630757, |
| "grad_norm": 9.3125, |
| "learning_rate": 7.235202926806075e-06, |
| "loss": 0.3671, |
| "step": 8832 |
| }, |
| { |
| "epoch": 2.0940231514292464, |
| "grad_norm": 7.53125, |
| "learning_rate": 7.211566718179837e-06, |
| "loss": 0.372, |
| "step": 8864 |
| }, |
| { |
| "epoch": 2.101582801795417, |
| "grad_norm": 5.625, |
| "learning_rate": 7.1878689068213466e-06, |
| "loss": 0.3503, |
| "step": 8896 |
| }, |
| { |
| "epoch": 2.1091424521615876, |
| "grad_norm": 7.59375, |
| "learning_rate": 7.164110152828223e-06, |
| "loss": 0.3796, |
| "step": 8928 |
| }, |
| { |
| "epoch": 2.116702102527758, |
| "grad_norm": 4.9375, |
| "learning_rate": 7.140291117995632e-06, |
| "loss": 0.3778, |
| "step": 8960 |
| }, |
| { |
| "epoch": 2.1242617528939287, |
| "grad_norm": 5.78125, |
| "learning_rate": 7.116412465797849e-06, |
| "loss": 0.3986, |
| "step": 8992 |
| }, |
| { |
| "epoch": 2.131821403260099, |
| "grad_norm": 10.3125, |
| "learning_rate": 7.092474861369778e-06, |
| "loss": 0.3781, |
| "step": 9024 |
| }, |
| { |
| "epoch": 2.13938105362627, |
| "grad_norm": 5.71875, |
| "learning_rate": 7.068478971488427e-06, |
| "loss": 0.3316, |
| "step": 9056 |
| }, |
| { |
| "epoch": 2.14694070399244, |
| "grad_norm": 6.96875, |
| "learning_rate": 7.04442546455433e-06, |
| "loss": 0.3911, |
| "step": 9088 |
| }, |
| { |
| "epoch": 2.154500354358611, |
| "grad_norm": 4.03125, |
| "learning_rate": 7.020315010572936e-06, |
| "loss": 0.3651, |
| "step": 9120 |
| }, |
| { |
| "epoch": 2.1620600047247813, |
| "grad_norm": 5.125, |
| "learning_rate": 6.996148281135936e-06, |
| "loss": 0.4247, |
| "step": 9152 |
| }, |
| { |
| "epoch": 2.169619655090952, |
| "grad_norm": 6.71875, |
| "learning_rate": 6.971925949402571e-06, |
| "loss": 0.3893, |
| "step": 9184 |
| }, |
| { |
| "epoch": 2.1771793054571225, |
| "grad_norm": 7.5, |
| "learning_rate": 6.947648690080866e-06, |
| "loss": 0.3916, |
| "step": 9216 |
| }, |
| { |
| "epoch": 2.1847389558232932, |
| "grad_norm": 5.5, |
| "learning_rate": 6.923317179408844e-06, |
| "loss": 0.3539, |
| "step": 9248 |
| }, |
| { |
| "epoch": 2.1922986061894636, |
| "grad_norm": 5.84375, |
| "learning_rate": 6.898932095135686e-06, |
| "loss": 0.3643, |
| "step": 9280 |
| }, |
| { |
| "epoch": 2.1998582565556344, |
| "grad_norm": 6.90625, |
| "learning_rate": 6.8744941165028625e-06, |
| "loss": 0.3667, |
| "step": 9312 |
| }, |
| { |
| "epoch": 2.2074179069218047, |
| "grad_norm": 4.65625, |
| "learning_rate": 6.850003924225196e-06, |
| "loss": 0.3679, |
| "step": 9344 |
| }, |
| { |
| "epoch": 2.2149775572879755, |
| "grad_norm": 6.1875, |
| "learning_rate": 6.825462200471913e-06, |
| "loss": 0.3646, |
| "step": 9376 |
| }, |
| { |
| "epoch": 2.222537207654146, |
| "grad_norm": 6.4375, |
| "learning_rate": 6.800869628847639e-06, |
| "loss": 0.3672, |
| "step": 9408 |
| }, |
| { |
| "epoch": 2.2300968580203167, |
| "grad_norm": 8.6875, |
| "learning_rate": 6.776226894373358e-06, |
| "loss": 0.3661, |
| "step": 9440 |
| }, |
| { |
| "epoch": 2.237656508386487, |
| "grad_norm": 5.59375, |
| "learning_rate": 6.751534683467326e-06, |
| "loss": 0.3592, |
| "step": 9472 |
| }, |
| { |
| "epoch": 2.245216158752658, |
| "grad_norm": 3.828125, |
| "learning_rate": 6.726793683925956e-06, |
| "loss": 0.3756, |
| "step": 9504 |
| }, |
| { |
| "epoch": 2.252775809118828, |
| "grad_norm": 6.53125, |
| "learning_rate": 6.70200458490466e-06, |
| "loss": 0.3646, |
| "step": 9536 |
| }, |
| { |
| "epoch": 2.260335459484999, |
| "grad_norm": 7.96875, |
| "learning_rate": 6.67716807689865e-06, |
| "loss": 0.3805, |
| "step": 9568 |
| }, |
| { |
| "epoch": 2.2678951098511693, |
| "grad_norm": 6.0, |
| "learning_rate": 6.652284851723706e-06, |
| "loss": 0.3929, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.27545476021734, |
| "grad_norm": 6.6875, |
| "learning_rate": 6.627355602496903e-06, |
| "loss": 0.3732, |
| "step": 9632 |
| }, |
| { |
| "epoch": 2.2830144105835104, |
| "grad_norm": 6.15625, |
| "learning_rate": 6.602381023617308e-06, |
| "loss": 0.392, |
| "step": 9664 |
| }, |
| { |
| "epoch": 2.290574060949681, |
| "grad_norm": 6.78125, |
| "learning_rate": 6.577361810746638e-06, |
| "loss": 0.3814, |
| "step": 9696 |
| }, |
| { |
| "epoch": 2.2981337113158515, |
| "grad_norm": 6.375, |
| "learning_rate": 6.552298660789875e-06, |
| "loss": 0.4029, |
| "step": 9728 |
| }, |
| { |
| "epoch": 2.3056933616820223, |
| "grad_norm": 4.96875, |
| "learning_rate": 6.5271922718758655e-06, |
| "loss": 0.3594, |
| "step": 9760 |
| }, |
| { |
| "epoch": 2.3132530120481927, |
| "grad_norm": 4.75, |
| "learning_rate": 6.502043343337864e-06, |
| "loss": 0.3599, |
| "step": 9792 |
| }, |
| { |
| "epoch": 2.3208126624143635, |
| "grad_norm": 7.875, |
| "learning_rate": 6.476852575694061e-06, |
| "loss": 0.4065, |
| "step": 9824 |
| }, |
| { |
| "epoch": 2.328372312780534, |
| "grad_norm": 6.0625, |
| "learning_rate": 6.451620670628062e-06, |
| "loss": 0.4125, |
| "step": 9856 |
| }, |
| { |
| "epoch": 2.3359319631467046, |
| "grad_norm": 4.9375, |
| "learning_rate": 6.426348330969353e-06, |
| "loss": 0.3699, |
| "step": 9888 |
| }, |
| { |
| "epoch": 2.343491613512875, |
| "grad_norm": 6.1875, |
| "learning_rate": 6.40103626067371e-06, |
| "loss": 0.3839, |
| "step": 9920 |
| }, |
| { |
| "epoch": 2.3510512638790457, |
| "grad_norm": 4.65625, |
| "learning_rate": 6.375685164803604e-06, |
| "loss": 0.4004, |
| "step": 9952 |
| }, |
| { |
| "epoch": 2.358610914245216, |
| "grad_norm": 12.0625, |
| "learning_rate": 6.350295749508551e-06, |
| "loss": 0.3754, |
| "step": 9984 |
| }, |
| { |
| "epoch": 2.366170564611387, |
| "grad_norm": 5.40625, |
| "learning_rate": 6.324868722005448e-06, |
| "loss": 0.4067, |
| "step": 10016 |
| }, |
| { |
| "epoch": 2.373730214977557, |
| "grad_norm": 9.0, |
| "learning_rate": 6.299404790558874e-06, |
| "loss": 0.3891, |
| "step": 10048 |
| }, |
| { |
| "epoch": 2.381289865343728, |
| "grad_norm": 6.15625, |
| "learning_rate": 6.273904664461358e-06, |
| "loss": 0.4121, |
| "step": 10080 |
| }, |
| { |
| "epoch": 2.3888495157098983, |
| "grad_norm": 6.0625, |
| "learning_rate": 6.248369054013622e-06, |
| "loss": 0.385, |
| "step": 10112 |
| }, |
| { |
| "epoch": 2.396409166076069, |
| "grad_norm": 5.96875, |
| "learning_rate": 6.2227986705048016e-06, |
| "loss": 0.3822, |
| "step": 10144 |
| }, |
| { |
| "epoch": 2.4039688164422395, |
| "grad_norm": 4.25, |
| "learning_rate": 6.1971942261926235e-06, |
| "loss": 0.3776, |
| "step": 10176 |
| }, |
| { |
| "epoch": 2.4115284668084103, |
| "grad_norm": 4.65625, |
| "learning_rate": 6.171556434283574e-06, |
| "loss": 0.3936, |
| "step": 10208 |
| }, |
| { |
| "epoch": 2.4190881171745806, |
| "grad_norm": 4.25, |
| "learning_rate": 6.145886008913031e-06, |
| "loss": 0.3732, |
| "step": 10240 |
| }, |
| { |
| "epoch": 2.4266477675407514, |
| "grad_norm": 4.0, |
| "learning_rate": 6.120183665125369e-06, |
| "loss": 0.3761, |
| "step": 10272 |
| }, |
| { |
| "epoch": 2.4342074179069217, |
| "grad_norm": 7.40625, |
| "learning_rate": 6.094450118854042e-06, |
| "loss": 0.3833, |
| "step": 10304 |
| }, |
| { |
| "epoch": 2.4417670682730925, |
| "grad_norm": 4.5625, |
| "learning_rate": 6.068686086901644e-06, |
| "loss": 0.4269, |
| "step": 10336 |
| }, |
| { |
| "epoch": 2.449326718639263, |
| "grad_norm": 6.28125, |
| "learning_rate": 6.042892286919943e-06, |
| "loss": 0.3946, |
| "step": 10368 |
| }, |
| { |
| "epoch": 2.4568863690054337, |
| "grad_norm": 4.1875, |
| "learning_rate": 6.017069437389889e-06, |
| "loss": 0.4035, |
| "step": 10400 |
| }, |
| { |
| "epoch": 2.464446019371604, |
| "grad_norm": 5.875, |
| "learning_rate": 5.991218257601597e-06, |
| "loss": 0.372, |
| "step": 10432 |
| }, |
| { |
| "epoch": 2.472005669737775, |
| "grad_norm": 5.75, |
| "learning_rate": 5.965339467634319e-06, |
| "loss": 0.3695, |
| "step": 10464 |
| }, |
| { |
| "epoch": 2.479565320103945, |
| "grad_norm": 6.75, |
| "learning_rate": 5.939433788336384e-06, |
| "loss": 0.3745, |
| "step": 10496 |
| }, |
| { |
| "epoch": 2.487124970470116, |
| "grad_norm": 5.65625, |
| "learning_rate": 5.913501941305114e-06, |
| "loss": 0.3821, |
| "step": 10528 |
| }, |
| { |
| "epoch": 2.4946846208362863, |
| "grad_norm": 6.4375, |
| "learning_rate": 5.88754464886673e-06, |
| "loss": 0.3796, |
| "step": 10560 |
| }, |
| { |
| "epoch": 2.502244271202457, |
| "grad_norm": 5.125, |
| "learning_rate": 5.861562634056232e-06, |
| "loss": 0.337, |
| "step": 10592 |
| }, |
| { |
| "epoch": 2.5098039215686274, |
| "grad_norm": 14.6875, |
| "learning_rate": 5.83555662059725e-06, |
| "loss": 0.361, |
| "step": 10624 |
| }, |
| { |
| "epoch": 2.5173635719347978, |
| "grad_norm": 4.25, |
| "learning_rate": 5.8095273328818965e-06, |
| "loss": 0.3816, |
| "step": 10656 |
| }, |
| { |
| "epoch": 2.5249232223009686, |
| "grad_norm": 6.3125, |
| "learning_rate": 5.783475495950583e-06, |
| "loss": 0.3682, |
| "step": 10688 |
| }, |
| { |
| "epoch": 2.5324828726671393, |
| "grad_norm": 6.125, |
| "learning_rate": 5.7574018354718254e-06, |
| "loss": 0.41, |
| "step": 10720 |
| }, |
| { |
| "epoch": 2.5400425230333097, |
| "grad_norm": 4.5625, |
| "learning_rate": 5.731307077722026e-06, |
| "loss": 0.3869, |
| "step": 10752 |
| }, |
| { |
| "epoch": 2.54760217339948, |
| "grad_norm": 5.21875, |
| "learning_rate": 5.705191949565252e-06, |
| "loss": 0.3776, |
| "step": 10784 |
| }, |
| { |
| "epoch": 2.555161823765651, |
| "grad_norm": 5.625, |
| "learning_rate": 5.6790571784329785e-06, |
| "loss": 0.3676, |
| "step": 10816 |
| }, |
| { |
| "epoch": 2.5627214741318216, |
| "grad_norm": 4.125, |
| "learning_rate": 5.6529034923038384e-06, |
| "loss": 0.3715, |
| "step": 10848 |
| }, |
| { |
| "epoch": 2.570281124497992, |
| "grad_norm": 5.71875, |
| "learning_rate": 5.626731619683333e-06, |
| "loss": 0.4199, |
| "step": 10880 |
| }, |
| { |
| "epoch": 2.5778407748641623, |
| "grad_norm": 5.65625, |
| "learning_rate": 5.600542289583547e-06, |
| "loss": 0.3604, |
| "step": 10912 |
| }, |
| { |
| "epoch": 2.585400425230333, |
| "grad_norm": 11.125, |
| "learning_rate": 5.574336231502837e-06, |
| "loss": 0.381, |
| "step": 10944 |
| }, |
| { |
| "epoch": 2.592960075596504, |
| "grad_norm": 7.40625, |
| "learning_rate": 5.548114175405518e-06, |
| "loss": 0.4102, |
| "step": 10976 |
| }, |
| { |
| "epoch": 2.6005197259626742, |
| "grad_norm": 5.71875, |
| "learning_rate": 5.521876851701522e-06, |
| "loss": 0.3902, |
| "step": 11008 |
| }, |
| { |
| "epoch": 2.6080793763288446, |
| "grad_norm": 7.6875, |
| "learning_rate": 5.49562499122606e-06, |
| "loss": 0.3992, |
| "step": 11040 |
| }, |
| { |
| "epoch": 2.6156390266950154, |
| "grad_norm": 8.6875, |
| "learning_rate": 5.46935932521926e-06, |
| "loss": 0.3845, |
| "step": 11072 |
| }, |
| { |
| "epoch": 2.623198677061186, |
| "grad_norm": 5.5, |
| "learning_rate": 5.443080585305802e-06, |
| "loss": 0.4265, |
| "step": 11104 |
| }, |
| { |
| "epoch": 2.6307583274273565, |
| "grad_norm": 10.875, |
| "learning_rate": 5.416789503474538e-06, |
| "loss": 0.3552, |
| "step": 11136 |
| }, |
| { |
| "epoch": 2.638317977793527, |
| "grad_norm": 5.40625, |
| "learning_rate": 5.390486812058096e-06, |
| "loss": 0.3974, |
| "step": 11168 |
| }, |
| { |
| "epoch": 2.6458776281596976, |
| "grad_norm": 5.75, |
| "learning_rate": 5.364173243712492e-06, |
| "loss": 0.3389, |
| "step": 11200 |
| }, |
| { |
| "epoch": 2.6534372785258684, |
| "grad_norm": 5.75, |
| "learning_rate": 5.337849531396714e-06, |
| "loss": 0.3822, |
| "step": 11232 |
| }, |
| { |
| "epoch": 2.6609969288920388, |
| "grad_norm": 7.25, |
| "learning_rate": 5.31151640835231e-06, |
| "loss": 0.3903, |
| "step": 11264 |
| }, |
| { |
| "epoch": 2.668556579258209, |
| "grad_norm": 4.375, |
| "learning_rate": 5.28517460808296e-06, |
| "loss": 0.3687, |
| "step": 11296 |
| }, |
| { |
| "epoch": 2.67611622962438, |
| "grad_norm": 5.75, |
| "learning_rate": 5.258824864334047e-06, |
| "loss": 0.3677, |
| "step": 11328 |
| }, |
| { |
| "epoch": 2.6836758799905507, |
| "grad_norm": 6.0, |
| "learning_rate": 5.2324679110722185e-06, |
| "loss": 0.3796, |
| "step": 11360 |
| }, |
| { |
| "epoch": 2.691235530356721, |
| "grad_norm": 7.125, |
| "learning_rate": 5.206104482464942e-06, |
| "loss": 0.3653, |
| "step": 11392 |
| }, |
| { |
| "epoch": 2.6987951807228914, |
| "grad_norm": 5.125, |
| "learning_rate": 5.179735312860053e-06, |
| "loss": 0.3673, |
| "step": 11424 |
| }, |
| { |
| "epoch": 2.706354831089062, |
| "grad_norm": 7.8125, |
| "learning_rate": 5.153361136765301e-06, |
| "loss": 0.3779, |
| "step": 11456 |
| }, |
| { |
| "epoch": 2.713914481455233, |
| "grad_norm": 7.40625, |
| "learning_rate": 5.126982688827892e-06, |
| "loss": 0.3924, |
| "step": 11488 |
| }, |
| { |
| "epoch": 2.7214741318214033, |
| "grad_norm": 6.4375, |
| "learning_rate": 5.100600703814021e-06, |
| "loss": 0.4042, |
| "step": 11520 |
| }, |
| { |
| "epoch": 2.7290337821875736, |
| "grad_norm": 8.6875, |
| "learning_rate": 5.074215916588412e-06, |
| "loss": 0.3592, |
| "step": 11552 |
| }, |
| { |
| "epoch": 2.7365934325537444, |
| "grad_norm": 4.375, |
| "learning_rate": 5.0478290620938385e-06, |
| "loss": 0.3844, |
| "step": 11584 |
| }, |
| { |
| "epoch": 2.744153082919915, |
| "grad_norm": 6.5625, |
| "learning_rate": 5.021440875330659e-06, |
| "loss": 0.3744, |
| "step": 11616 |
| }, |
| { |
| "epoch": 2.7517127332860856, |
| "grad_norm": 4.96875, |
| "learning_rate": 4.995052091336344e-06, |
| "loss": 0.3721, |
| "step": 11648 |
| }, |
| { |
| "epoch": 2.759272383652256, |
| "grad_norm": 5.96875, |
| "learning_rate": 4.968663445164999e-06, |
| "loss": 0.4132, |
| "step": 11680 |
| }, |
| { |
| "epoch": 2.7668320340184267, |
| "grad_norm": 7.03125, |
| "learning_rate": 4.942275671866891e-06, |
| "loss": 0.3718, |
| "step": 11712 |
| }, |
| { |
| "epoch": 2.7743916843845975, |
| "grad_norm": 5.78125, |
| "learning_rate": 4.915889506467969e-06, |
| "loss": 0.3718, |
| "step": 11744 |
| }, |
| { |
| "epoch": 2.781951334750768, |
| "grad_norm": 6.03125, |
| "learning_rate": 4.889505683949403e-06, |
| "loss": 0.3687, |
| "step": 11776 |
| }, |
| { |
| "epoch": 2.789510985116938, |
| "grad_norm": 5.25, |
| "learning_rate": 4.86312493922709e-06, |
| "loss": 0.4039, |
| "step": 11808 |
| }, |
| { |
| "epoch": 2.797070635483109, |
| "grad_norm": 5.21875, |
| "learning_rate": 4.836748007131208e-06, |
| "loss": 0.3829, |
| "step": 11840 |
| }, |
| { |
| "epoch": 2.8046302858492793, |
| "grad_norm": 3.890625, |
| "learning_rate": 4.81037562238573e-06, |
| "loss": 0.3893, |
| "step": 11872 |
| }, |
| { |
| "epoch": 2.81218993621545, |
| "grad_norm": 5.46875, |
| "learning_rate": 4.784008519587961e-06, |
| "loss": 0.3847, |
| "step": 11904 |
| }, |
| { |
| "epoch": 2.8197495865816204, |
| "grad_norm": 8.0625, |
| "learning_rate": 4.7576474331880815e-06, |
| "loss": 0.3949, |
| "step": 11936 |
| }, |
| { |
| "epoch": 2.8273092369477912, |
| "grad_norm": 7.71875, |
| "learning_rate": 4.731293097468688e-06, |
| "loss": 0.3903, |
| "step": 11968 |
| }, |
| { |
| "epoch": 2.8348688873139616, |
| "grad_norm": 5.4375, |
| "learning_rate": 4.704946246524333e-06, |
| "loss": 0.367, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.8424285376801324, |
| "grad_norm": 5.03125, |
| "learning_rate": 4.678607614241086e-06, |
| "loss": 0.3865, |
| "step": 12032 |
| }, |
| { |
| "epoch": 2.8499881880463027, |
| "grad_norm": 5.3125, |
| "learning_rate": 4.652277934276088e-06, |
| "loss": 0.4144, |
| "step": 12064 |
| }, |
| { |
| "epoch": 2.8575478384124735, |
| "grad_norm": 10.125, |
| "learning_rate": 4.625957940037112e-06, |
| "loss": 0.3697, |
| "step": 12096 |
| }, |
| { |
| "epoch": 2.865107488778644, |
| "grad_norm": 7.375, |
| "learning_rate": 4.5996483646621406e-06, |
| "loss": 0.3794, |
| "step": 12128 |
| }, |
| { |
| "epoch": 2.8726671391448146, |
| "grad_norm": 4.5625, |
| "learning_rate": 4.573349940998937e-06, |
| "loss": 0.3742, |
| "step": 12160 |
| }, |
| { |
| "epoch": 2.880226789510985, |
| "grad_norm": 5.5, |
| "learning_rate": 4.547063401584638e-06, |
| "loss": 0.3967, |
| "step": 12192 |
| }, |
| { |
| "epoch": 2.8877864398771558, |
| "grad_norm": 4.28125, |
| "learning_rate": 4.52078947862535e-06, |
| "loss": 0.363, |
| "step": 12224 |
| }, |
| { |
| "epoch": 2.895346090243326, |
| "grad_norm": 6.09375, |
| "learning_rate": 4.494528903975744e-06, |
| "loss": 0.3876, |
| "step": 12256 |
| }, |
| { |
| "epoch": 2.902905740609497, |
| "grad_norm": 5.5625, |
| "learning_rate": 4.4682824091186855e-06, |
| "loss": 0.4017, |
| "step": 12288 |
| }, |
| { |
| "epoch": 2.9104653909756673, |
| "grad_norm": 7.71875, |
| "learning_rate": 4.4420507251448385e-06, |
| "loss": 0.3884, |
| "step": 12320 |
| }, |
| { |
| "epoch": 2.918025041341838, |
| "grad_norm": 3.953125, |
| "learning_rate": 4.415834582732324e-06, |
| "loss": 0.3687, |
| "step": 12352 |
| }, |
| { |
| "epoch": 2.9255846917080084, |
| "grad_norm": 5.4375, |
| "learning_rate": 4.389634712126353e-06, |
| "loss": 0.3445, |
| "step": 12384 |
| }, |
| { |
| "epoch": 2.933144342074179, |
| "grad_norm": 4.875, |
| "learning_rate": 4.3634518431188825e-06, |
| "loss": 0.4402, |
| "step": 12416 |
| }, |
| { |
| "epoch": 2.9407039924403495, |
| "grad_norm": 4.875, |
| "learning_rate": 4.3372867050283005e-06, |
| "loss": 0.4026, |
| "step": 12448 |
| }, |
| { |
| "epoch": 2.9482636428065203, |
| "grad_norm": 6.0, |
| "learning_rate": 4.311140026679104e-06, |
| "loss": 0.3846, |
| "step": 12480 |
| }, |
| { |
| "epoch": 2.9558232931726907, |
| "grad_norm": 5.40625, |
| "learning_rate": 4.285012536381593e-06, |
| "loss": 0.4215, |
| "step": 12512 |
| }, |
| { |
| "epoch": 2.9633829435388614, |
| "grad_norm": 6.1875, |
| "learning_rate": 4.258904961911593e-06, |
| "loss": 0.3835, |
| "step": 12544 |
| }, |
| { |
| "epoch": 2.970942593905032, |
| "grad_norm": 5.78125, |
| "learning_rate": 4.232818030490172e-06, |
| "loss": 0.381, |
| "step": 12576 |
| }, |
| { |
| "epoch": 2.9785022442712026, |
| "grad_norm": 4.46875, |
| "learning_rate": 4.206752468763398e-06, |
| "loss": 0.4174, |
| "step": 12608 |
| }, |
| { |
| "epoch": 2.986061894637373, |
| "grad_norm": 5.375, |
| "learning_rate": 4.1807090027820874e-06, |
| "loss": 0.3842, |
| "step": 12640 |
| }, |
| { |
| "epoch": 2.9936215450035437, |
| "grad_norm": 10.4375, |
| "learning_rate": 4.15468835798158e-06, |
| "loss": 0.3946, |
| "step": 12672 |
| }, |
| { |
| "epoch": 3.001181195369714, |
| "grad_norm": 6.625, |
| "learning_rate": 4.128691259161543e-06, |
| "loss": 0.377, |
| "step": 12704 |
| }, |
| { |
| "epoch": 3.008740845735885, |
| "grad_norm": 6.90625, |
| "learning_rate": 4.102718430465772e-06, |
| "loss": 0.3613, |
| "step": 12736 |
| }, |
| { |
| "epoch": 3.016300496102055, |
| "grad_norm": 4.65625, |
| "learning_rate": 4.0767705953620226e-06, |
| "loss": 0.3953, |
| "step": 12768 |
| }, |
| { |
| "epoch": 3.023860146468226, |
| "grad_norm": 4.53125, |
| "learning_rate": 4.050848476621861e-06, |
| "loss": 0.3763, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.0314197968343963, |
| "grad_norm": 7.9375, |
| "learning_rate": 4.024952796300526e-06, |
| "loss": 0.3671, |
| "step": 12832 |
| }, |
| { |
| "epoch": 3.038979447200567, |
| "grad_norm": 3.765625, |
| "learning_rate": 3.999084275716824e-06, |
| "loss": 0.3639, |
| "step": 12864 |
| }, |
| { |
| "epoch": 3.0465390975667375, |
| "grad_norm": 4.59375, |
| "learning_rate": 3.973243635433033e-06, |
| "loss": 0.3926, |
| "step": 12896 |
| }, |
| { |
| "epoch": 3.0540987479329083, |
| "grad_norm": 5.40625, |
| "learning_rate": 3.947431595234823e-06, |
| "loss": 0.3694, |
| "step": 12928 |
| }, |
| { |
| "epoch": 3.0616583982990786, |
| "grad_norm": 4.8125, |
| "learning_rate": 3.921648874111224e-06, |
| "loss": 0.3668, |
| "step": 12960 |
| }, |
| { |
| "epoch": 3.0692180486652494, |
| "grad_norm": 7.375, |
| "learning_rate": 3.895896190234587e-06, |
| "loss": 0.3968, |
| "step": 12992 |
| }, |
| { |
| "epoch": 3.0767776990314197, |
| "grad_norm": 5.5625, |
| "learning_rate": 3.870174260940576e-06, |
| "loss": 0.3777, |
| "step": 13024 |
| }, |
| { |
| "epoch": 3.0843373493975905, |
| "grad_norm": 6.28125, |
| "learning_rate": 3.844483802708201e-06, |
| "loss": 0.3611, |
| "step": 13056 |
| }, |
| { |
| "epoch": 3.091896999763761, |
| "grad_norm": 7.0, |
| "learning_rate": 3.818825531139844e-06, |
| "loss": 0.3831, |
| "step": 13088 |
| }, |
| { |
| "epoch": 3.0994566501299317, |
| "grad_norm": 3.21875, |
| "learning_rate": 3.7932001609413387e-06, |
| "loss": 0.3749, |
| "step": 13120 |
| }, |
| { |
| "epoch": 3.107016300496102, |
| "grad_norm": 5.34375, |
| "learning_rate": 3.7676084059020613e-06, |
| "loss": 0.3775, |
| "step": 13152 |
| }, |
| { |
| "epoch": 3.114575950862273, |
| "grad_norm": 6.03125, |
| "learning_rate": 3.742050978875036e-06, |
| "loss": 0.3637, |
| "step": 13184 |
| }, |
| { |
| "epoch": 3.122135601228443, |
| "grad_norm": 6.625, |
| "learning_rate": 3.7165285917570924e-06, |
| "loss": 0.3696, |
| "step": 13216 |
| }, |
| { |
| "epoch": 3.129695251594614, |
| "grad_norm": 5.0, |
| "learning_rate": 3.6910419554690345e-06, |
| "loss": 0.3533, |
| "step": 13248 |
| }, |
| { |
| "epoch": 3.1372549019607843, |
| "grad_norm": 6.09375, |
| "learning_rate": 3.665591779935825e-06, |
| "loss": 0.3846, |
| "step": 13280 |
| }, |
| { |
| "epoch": 3.144814552326955, |
| "grad_norm": 5.03125, |
| "learning_rate": 3.6401787740668294e-06, |
| "loss": 0.3692, |
| "step": 13312 |
| }, |
| { |
| "epoch": 3.1523742026931254, |
| "grad_norm": 6.0625, |
| "learning_rate": 3.61480364573605e-06, |
| "loss": 0.4014, |
| "step": 13344 |
| }, |
| { |
| "epoch": 3.159933853059296, |
| "grad_norm": 5.75, |
| "learning_rate": 3.5894671017624284e-06, |
| "loss": 0.3938, |
| "step": 13376 |
| }, |
| { |
| "epoch": 3.1674935034254665, |
| "grad_norm": 4.21875, |
| "learning_rate": 3.5641698478901415e-06, |
| "loss": 0.3689, |
| "step": 13408 |
| }, |
| { |
| "epoch": 3.1750531537916373, |
| "grad_norm": 6.28125, |
| "learning_rate": 3.5389125887689467e-06, |
| "loss": 0.3838, |
| "step": 13440 |
| }, |
| { |
| "epoch": 3.1826128041578077, |
| "grad_norm": 7.59375, |
| "learning_rate": 3.513696027934561e-06, |
| "loss": 0.3836, |
| "step": 13472 |
| }, |
| { |
| "epoch": 3.1901724545239785, |
| "grad_norm": 5.90625, |
| "learning_rate": 3.488520867789056e-06, |
| "loss": 0.3615, |
| "step": 13504 |
| }, |
| { |
| "epoch": 3.197732104890149, |
| "grad_norm": 9.5625, |
| "learning_rate": 3.4633878095812945e-06, |
| "loss": 0.3421, |
| "step": 13536 |
| }, |
| { |
| "epoch": 3.2052917552563196, |
| "grad_norm": 5.875, |
| "learning_rate": 3.4382975533874025e-06, |
| "loss": 0.3784, |
| "step": 13568 |
| }, |
| { |
| "epoch": 3.21285140562249, |
| "grad_norm": 6.03125, |
| "learning_rate": 3.413250798091261e-06, |
| "loss": 0.3552, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.2204110559886607, |
| "grad_norm": 5.625, |
| "learning_rate": 3.3882482413650437e-06, |
| "loss": 0.3831, |
| "step": 13632 |
| }, |
| { |
| "epoch": 3.227970706354831, |
| "grad_norm": 6.9375, |
| "learning_rate": 3.363290579649785e-06, |
| "loss": 0.4101, |
| "step": 13664 |
| }, |
| { |
| "epoch": 3.235530356721002, |
| "grad_norm": 7.1875, |
| "learning_rate": 3.3383785081359734e-06, |
| "loss": 0.3955, |
| "step": 13696 |
| }, |
| { |
| "epoch": 3.243090007087172, |
| "grad_norm": 5.3125, |
| "learning_rate": 3.3135127207441935e-06, |
| "loss": 0.3705, |
| "step": 13728 |
| }, |
| { |
| "epoch": 3.2506496574533426, |
| "grad_norm": 5.375, |
| "learning_rate": 3.2886939101058e-06, |
| "loss": 0.3674, |
| "step": 13760 |
| }, |
| { |
| "epoch": 3.2582093078195133, |
| "grad_norm": 6.625, |
| "learning_rate": 3.263922767543611e-06, |
| "loss": 0.3454, |
| "step": 13792 |
| }, |
| { |
| "epoch": 3.265768958185684, |
| "grad_norm": 4.3125, |
| "learning_rate": 3.239199983052669e-06, |
| "loss": 0.3621, |
| "step": 13824 |
| }, |
| { |
| "epoch": 3.2733286085518545, |
| "grad_norm": 6.15625, |
| "learning_rate": 3.2145262452810046e-06, |
| "loss": 0.3794, |
| "step": 13856 |
| }, |
| { |
| "epoch": 3.280888258918025, |
| "grad_norm": 5.625, |
| "learning_rate": 3.1899022415104675e-06, |
| "loss": 0.3956, |
| "step": 13888 |
| }, |
| { |
| "epoch": 3.2884479092841956, |
| "grad_norm": 4.0, |
| "learning_rate": 3.1653286576375787e-06, |
| "loss": 0.3491, |
| "step": 13920 |
| }, |
| { |
| "epoch": 3.2960075596503664, |
| "grad_norm": 4.375, |
| "learning_rate": 3.140806178154415e-06, |
| "loss": 0.3698, |
| "step": 13952 |
| }, |
| { |
| "epoch": 3.3035672100165367, |
| "grad_norm": 8.0625, |
| "learning_rate": 3.1163354861295604e-06, |
| "loss": 0.3903, |
| "step": 13984 |
| }, |
| { |
| "epoch": 3.311126860382707, |
| "grad_norm": 7.46875, |
| "learning_rate": 3.091917263189066e-06, |
| "loss": 0.3397, |
| "step": 14016 |
| }, |
| { |
| "epoch": 3.318686510748878, |
| "grad_norm": 4.6875, |
| "learning_rate": 3.0675521894974647e-06, |
| "loss": 0.3895, |
| "step": 14048 |
| }, |
| { |
| "epoch": 3.3262461611150487, |
| "grad_norm": 4.96875, |
| "learning_rate": 3.0432409437388346e-06, |
| "loss": 0.3847, |
| "step": 14080 |
| }, |
| { |
| "epoch": 3.333805811481219, |
| "grad_norm": 7.96875, |
| "learning_rate": 3.0189842030978795e-06, |
| "loss": 0.3942, |
| "step": 14112 |
| }, |
| { |
| "epoch": 3.3413654618473894, |
| "grad_norm": 4.9375, |
| "learning_rate": 2.9947826432410816e-06, |
| "loss": 0.3959, |
| "step": 14144 |
| }, |
| { |
| "epoch": 3.34892511221356, |
| "grad_norm": 6.40625, |
| "learning_rate": 2.9706369382978726e-06, |
| "loss": 0.392, |
| "step": 14176 |
| }, |
| { |
| "epoch": 3.3564847625797305, |
| "grad_norm": 7.90625, |
| "learning_rate": 2.946547760841853e-06, |
| "loss": 0.3895, |
| "step": 14208 |
| }, |
| { |
| "epoch": 3.3640444129459013, |
| "grad_norm": 7.53125, |
| "learning_rate": 2.9225157818720674e-06, |
| "loss": 0.4072, |
| "step": 14240 |
| }, |
| { |
| "epoch": 3.3716040633120716, |
| "grad_norm": 6.625, |
| "learning_rate": 2.898541670794304e-06, |
| "loss": 0.3749, |
| "step": 14272 |
| }, |
| { |
| "epoch": 3.3791637136782424, |
| "grad_norm": 15.0625, |
| "learning_rate": 2.8746260954024544e-06, |
| "loss": 0.4098, |
| "step": 14304 |
| }, |
| { |
| "epoch": 3.3867233640444128, |
| "grad_norm": 5.15625, |
| "learning_rate": 2.850769721859913e-06, |
| "loss": 0.3795, |
| "step": 14336 |
| }, |
| { |
| "epoch": 3.3942830144105836, |
| "grad_norm": 7.625, |
| "learning_rate": 2.8269732146810147e-06, |
| "loss": 0.3841, |
| "step": 14368 |
| }, |
| { |
| "epoch": 3.401842664776754, |
| "grad_norm": 5.21875, |
| "learning_rate": 2.8032372367125306e-06, |
| "loss": 0.3812, |
| "step": 14400 |
| }, |
| { |
| "epoch": 3.4094023151429247, |
| "grad_norm": 4.40625, |
| "learning_rate": 2.7795624491152097e-06, |
| "loss": 0.3795, |
| "step": 14432 |
| }, |
| { |
| "epoch": 3.416961965509095, |
| "grad_norm": 5.21875, |
| "learning_rate": 2.755949511345343e-06, |
| "loss": 0.3996, |
| "step": 14464 |
| }, |
| { |
| "epoch": 3.424521615875266, |
| "grad_norm": 7.875, |
| "learning_rate": 2.73239908113642e-06, |
| "loss": 0.3947, |
| "step": 14496 |
| }, |
| { |
| "epoch": 3.432081266241436, |
| "grad_norm": 5.9375, |
| "learning_rate": 2.7089118144807885e-06, |
| "loss": 0.3866, |
| "step": 14528 |
| }, |
| { |
| "epoch": 3.439640916607607, |
| "grad_norm": 5.40625, |
| "learning_rate": 2.6854883656113896e-06, |
| "loss": 0.354, |
| "step": 14560 |
| }, |
| { |
| "epoch": 3.4472005669737773, |
| "grad_norm": 5.15625, |
| "learning_rate": 2.662129386983533e-06, |
| "loss": 0.3723, |
| "step": 14592 |
| }, |
| { |
| "epoch": 3.454760217339948, |
| "grad_norm": 9.375, |
| "learning_rate": 2.6388355292567247e-06, |
| "loss": 0.3359, |
| "step": 14624 |
| }, |
| { |
| "epoch": 3.4623198677061184, |
| "grad_norm": 4.34375, |
| "learning_rate": 2.61560744127654e-06, |
| "loss": 0.4005, |
| "step": 14656 |
| }, |
| { |
| "epoch": 3.4698795180722892, |
| "grad_norm": 5.65625, |
| "learning_rate": 2.592445770056551e-06, |
| "loss": 0.3485, |
| "step": 14688 |
| }, |
| { |
| "epoch": 3.4774391684384596, |
| "grad_norm": 6.21875, |
| "learning_rate": 2.569351160760307e-06, |
| "loss": 0.3756, |
| "step": 14720 |
| }, |
| { |
| "epoch": 3.4849988188046304, |
| "grad_norm": 8.375, |
| "learning_rate": 2.546324256683359e-06, |
| "loss": 0.3885, |
| "step": 14752 |
| }, |
| { |
| "epoch": 3.4925584691708007, |
| "grad_norm": 6.71875, |
| "learning_rate": 2.523365699235346e-06, |
| "loss": 0.374, |
| "step": 14784 |
| }, |
| { |
| "epoch": 3.5001181195369715, |
| "grad_norm": 5.25, |
| "learning_rate": 2.5004761279221236e-06, |
| "loss": 0.3993, |
| "step": 14816 |
| }, |
| { |
| "epoch": 3.507677769903142, |
| "grad_norm": 5.65625, |
| "learning_rate": 2.4776561803279524e-06, |
| "loss": 0.3895, |
| "step": 14848 |
| }, |
| { |
| "epoch": 3.5152374202693126, |
| "grad_norm": 7.375, |
| "learning_rate": 2.4549064920977407e-06, |
| "loss": 0.3596, |
| "step": 14880 |
| }, |
| { |
| "epoch": 3.522797070635483, |
| "grad_norm": 9.25, |
| "learning_rate": 2.4322276969193347e-06, |
| "loss": 0.3495, |
| "step": 14912 |
| }, |
| { |
| "epoch": 3.5303567210016538, |
| "grad_norm": 4.46875, |
| "learning_rate": 2.409620426505872e-06, |
| "loss": 0.3972, |
| "step": 14944 |
| }, |
| { |
| "epoch": 3.537916371367824, |
| "grad_norm": 6.625, |
| "learning_rate": 2.3870853105781803e-06, |
| "loss": 0.3702, |
| "step": 14976 |
| }, |
| { |
| "epoch": 3.545476021733995, |
| "grad_norm": 7.40625, |
| "learning_rate": 2.364622976847238e-06, |
| "loss": 0.3541, |
| "step": 15008 |
| }, |
| { |
| "epoch": 3.5530356721001652, |
| "grad_norm": 8.5, |
| "learning_rate": 2.3422340509966984e-06, |
| "loss": 0.4204, |
| "step": 15040 |
| }, |
| { |
| "epoch": 3.560595322466336, |
| "grad_norm": 5.28125, |
| "learning_rate": 2.3199191566654393e-06, |
| "loss": 0.3944, |
| "step": 15072 |
| }, |
| { |
| "epoch": 3.5681549728325064, |
| "grad_norm": 6.3125, |
| "learning_rate": 2.297678915430223e-06, |
| "loss": 0.3695, |
| "step": 15104 |
| }, |
| { |
| "epoch": 3.575714623198677, |
| "grad_norm": 6.1875, |
| "learning_rate": 2.275513946788348e-06, |
| "loss": 0.3747, |
| "step": 15136 |
| }, |
| { |
| "epoch": 3.5832742735648475, |
| "grad_norm": 5.34375, |
| "learning_rate": 2.253424868140425e-06, |
| "loss": 0.3704, |
| "step": 15168 |
| }, |
| { |
| "epoch": 3.5908339239310183, |
| "grad_norm": 4.71875, |
| "learning_rate": 2.2314122947731554e-06, |
| "loss": 0.3951, |
| "step": 15200 |
| }, |
| { |
| "epoch": 3.5983935742971886, |
| "grad_norm": 7.8125, |
| "learning_rate": 2.2094768398422063e-06, |
| "loss": 0.3807, |
| "step": 15232 |
| }, |
| { |
| "epoch": 3.6059532246633594, |
| "grad_norm": 5.625, |
| "learning_rate": 2.1876191143551225e-06, |
| "loss": 0.3907, |
| "step": 15264 |
| }, |
| { |
| "epoch": 3.61351287502953, |
| "grad_norm": 7.5, |
| "learning_rate": 2.1658397271543195e-06, |
| "loss": 0.3534, |
| "step": 15296 |
| }, |
| { |
| "epoch": 3.6210725253957006, |
| "grad_norm": 5.25, |
| "learning_rate": 2.1441392849001048e-06, |
| "loss": 0.3922, |
| "step": 15328 |
| }, |
| { |
| "epoch": 3.628632175761871, |
| "grad_norm": 7.1875, |
| "learning_rate": 2.122518392053803e-06, |
| "loss": 0.3487, |
| "step": 15360 |
| }, |
| { |
| "epoch": 3.6361918261280417, |
| "grad_norm": 8.25, |
| "learning_rate": 2.1009776508608924e-06, |
| "loss": 0.356, |
| "step": 15392 |
| }, |
| { |
| "epoch": 3.643751476494212, |
| "grad_norm": 7.46875, |
| "learning_rate": 2.0795176613342576e-06, |
| "loss": 0.3652, |
| "step": 15424 |
| }, |
| { |
| "epoch": 3.651311126860383, |
| "grad_norm": 6.34375, |
| "learning_rate": 2.058139021237454e-06, |
| "loss": 0.3784, |
| "step": 15456 |
| }, |
| { |
| "epoch": 3.658870777226553, |
| "grad_norm": 8.75, |
| "learning_rate": 2.0368423260680677e-06, |
| "loss": 0.3724, |
| "step": 15488 |
| }, |
| { |
| "epoch": 3.666430427592724, |
| "grad_norm": 6.6875, |
| "learning_rate": 2.015628169041125e-06, |
| "loss": 0.3554, |
| "step": 15520 |
| }, |
| { |
| "epoch": 3.6739900779588943, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.9944971410725706e-06, |
| "loss": 0.3807, |
| "step": 15552 |
| }, |
| { |
| "epoch": 3.681549728325065, |
| "grad_norm": 6.3125, |
| "learning_rate": 1.973449830762806e-06, |
| "loss": 0.375, |
| "step": 15584 |
| }, |
| { |
| "epoch": 3.6891093786912355, |
| "grad_norm": 5.6875, |
| "learning_rate": 1.952486824380294e-06, |
| "loss": 0.3849, |
| "step": 15616 |
| }, |
| { |
| "epoch": 3.6966690290574062, |
| "grad_norm": 4.28125, |
| "learning_rate": 1.9316087058452304e-06, |
| "loss": 0.3551, |
| "step": 15648 |
| }, |
| { |
| "epoch": 3.7042286794235766, |
| "grad_norm": 5.125, |
| "learning_rate": 1.910816056713275e-06, |
| "loss": 0.35, |
| "step": 15680 |
| }, |
| { |
| "epoch": 3.7117883297897474, |
| "grad_norm": 6.1875, |
| "learning_rate": 1.890109456159362e-06, |
| "loss": 0.3814, |
| "step": 15712 |
| }, |
| { |
| "epoch": 3.7193479801559177, |
| "grad_norm": 4.875, |
| "learning_rate": 1.8694894809615478e-06, |
| "loss": 0.3751, |
| "step": 15744 |
| }, |
| { |
| "epoch": 3.7269076305220885, |
| "grad_norm": 5.40625, |
| "learning_rate": 1.848956705484971e-06, |
| "loss": 0.3796, |
| "step": 15776 |
| }, |
| { |
| "epoch": 3.734467280888259, |
| "grad_norm": 8.0, |
| "learning_rate": 1.8285117016658316e-06, |
| "loss": 0.3878, |
| "step": 15808 |
| }, |
| { |
| "epoch": 3.7420269312544296, |
| "grad_norm": 3.40625, |
| "learning_rate": 1.808155038995471e-06, |
| "loss": 0.3749, |
| "step": 15840 |
| }, |
| { |
| "epoch": 3.7495865816206, |
| "grad_norm": 4.96875, |
| "learning_rate": 1.7878872845045058e-06, |
| "loss": 0.3878, |
| "step": 15872 |
| }, |
| { |
| "epoch": 3.757146231986771, |
| "grad_norm": 4.9375, |
| "learning_rate": 1.767709002747034e-06, |
| "loss": 0.3523, |
| "step": 15904 |
| }, |
| { |
| "epoch": 3.764705882352941, |
| "grad_norm": 4.96875, |
| "learning_rate": 1.7476207557849067e-06, |
| "loss": 0.3773, |
| "step": 15936 |
| }, |
| { |
| "epoch": 3.7722655327191115, |
| "grad_norm": 6.09375, |
| "learning_rate": 1.727623103172082e-06, |
| "loss": 0.3723, |
| "step": 15968 |
| }, |
| { |
| "epoch": 3.7798251830852823, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.707716601939019e-06, |
| "loss": 0.3705, |
| "step": 16000 |
| }, |
| { |
| "epoch": 3.787384833451453, |
| "grad_norm": 5.90625, |
| "learning_rate": 1.6879018065771885e-06, |
| "loss": 0.4124, |
| "step": 16032 |
| }, |
| { |
| "epoch": 3.7949444838176234, |
| "grad_norm": 5.46875, |
| "learning_rate": 1.6681792690235975e-06, |
| "loss": 0.3389, |
| "step": 16064 |
| }, |
| { |
| "epoch": 3.8025041341837937, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.6485495386454458e-06, |
| "loss": 0.4017, |
| "step": 16096 |
| }, |
| { |
| "epoch": 3.8100637845499645, |
| "grad_norm": 6.40625, |
| "learning_rate": 1.629013162224799e-06, |
| "loss": 0.3876, |
| "step": 16128 |
| }, |
| { |
| "epoch": 3.8176234349161353, |
| "grad_norm": 4.625, |
| "learning_rate": 1.6095706839433705e-06, |
| "loss": 0.3619, |
| "step": 16160 |
| }, |
| { |
| "epoch": 3.8251830852823057, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.5902226453673609e-06, |
| "loss": 0.3741, |
| "step": 16192 |
| }, |
| { |
| "epoch": 3.832742735648476, |
| "grad_norm": 5.0, |
| "learning_rate": 1.5709695854323715e-06, |
| "loss": 0.3624, |
| "step": 16224 |
| }, |
| { |
| "epoch": 3.840302386014647, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.5518120404283922e-06, |
| "loss": 0.3819, |
| "step": 16256 |
| }, |
| { |
| "epoch": 3.8478620363808176, |
| "grad_norm": 6.5, |
| "learning_rate": 1.5327505439848706e-06, |
| "loss": 0.3872, |
| "step": 16288 |
| }, |
| { |
| "epoch": 3.855421686746988, |
| "grad_norm": 5.78125, |
| "learning_rate": 1.513785627055831e-06, |
| "loss": 0.3408, |
| "step": 16320 |
| }, |
| { |
| "epoch": 3.8629813371131583, |
| "grad_norm": 4.59375, |
| "learning_rate": 1.4949178179051043e-06, |
| "loss": 0.3919, |
| "step": 16352 |
| }, |
| { |
| "epoch": 3.870540987479329, |
| "grad_norm": 5.6875, |
| "learning_rate": 1.4761476420916015e-06, |
| "loss": 0.3582, |
| "step": 16384 |
| }, |
| { |
| "epoch": 3.8781006378455, |
| "grad_norm": 7.28125, |
| "learning_rate": 1.4574756224546755e-06, |
| "loss": 0.3809, |
| "step": 16416 |
| }, |
| { |
| "epoch": 3.88566028821167, |
| "grad_norm": 7.875, |
| "learning_rate": 1.4389022790995611e-06, |
| "loss": 0.4036, |
| "step": 16448 |
| }, |
| { |
| "epoch": 3.8932199385778405, |
| "grad_norm": 6.0, |
| "learning_rate": 1.4204281293828858e-06, |
| "loss": 0.3885, |
| "step": 16480 |
| }, |
| { |
| "epoch": 3.9007795889440113, |
| "grad_norm": 6.40625, |
| "learning_rate": 1.4020536878982576e-06, |
| "loss": 0.3713, |
| "step": 16512 |
| }, |
| { |
| "epoch": 3.908339239310182, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.3837794664619337e-06, |
| "loss": 0.3758, |
| "step": 16544 |
| }, |
| { |
| "epoch": 3.9158988896763525, |
| "grad_norm": 7.9375, |
| "learning_rate": 1.3656059740985622e-06, |
| "loss": 0.3725, |
| "step": 16576 |
| }, |
| { |
| "epoch": 3.923458540042523, |
| "grad_norm": 5.34375, |
| "learning_rate": 1.3475337170270013e-06, |
| "loss": 0.3931, |
| "step": 16608 |
| }, |
| { |
| "epoch": 3.9310181904086936, |
| "grad_norm": 7.9375, |
| "learning_rate": 1.3295631986462292e-06, |
| "loss": 0.3809, |
| "step": 16640 |
| }, |
| { |
| "epoch": 3.9385778407748644, |
| "grad_norm": 7.3125, |
| "learning_rate": 1.311694919521302e-06, |
| "loss": 0.3828, |
| "step": 16672 |
| }, |
| { |
| "epoch": 3.9461374911410347, |
| "grad_norm": 6.90625, |
| "learning_rate": 1.2939293773694323e-06, |
| "loss": 0.3723, |
| "step": 16704 |
| }, |
| { |
| "epoch": 3.953697141507205, |
| "grad_norm": 5.65625, |
| "learning_rate": 1.2762670670461119e-06, |
| "loss": 0.3594, |
| "step": 16736 |
| }, |
| { |
| "epoch": 3.961256791873376, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.258708480531331e-06, |
| "loss": 0.3808, |
| "step": 16768 |
| }, |
| { |
| "epoch": 3.9688164422395467, |
| "grad_norm": 6.375, |
| "learning_rate": 1.2412541069158752e-06, |
| "loss": 0.3977, |
| "step": 16800 |
| }, |
| { |
| "epoch": 3.976376092605717, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.223904432387702e-06, |
| "loss": 0.3656, |
| "step": 16832 |
| }, |
| { |
| "epoch": 3.9839357429718874, |
| "grad_norm": 7.6875, |
| "learning_rate": 1.2066599402183953e-06, |
| "loss": 0.3721, |
| "step": 16864 |
| }, |
| { |
| "epoch": 3.991495393338058, |
| "grad_norm": 5.75, |
| "learning_rate": 1.1895211107497124e-06, |
| "loss": 0.392, |
| "step": 16896 |
| }, |
| { |
| "epoch": 3.999055043704229, |
| "grad_norm": 7.0, |
| "learning_rate": 1.1724884213801874e-06, |
| "loss": 0.3781, |
| "step": 16928 |
| }, |
| { |
| "epoch": 4.006614694070399, |
| "grad_norm": 5.75, |
| "learning_rate": 1.155562346551855e-06, |
| "loss": 0.3662, |
| "step": 16960 |
| }, |
| { |
| "epoch": 4.01417434443657, |
| "grad_norm": 8.6875, |
| "learning_rate": 1.1387433577370172e-06, |
| "loss": 0.3938, |
| "step": 16992 |
| }, |
| { |
| "epoch": 4.02173399480274, |
| "grad_norm": 6.5, |
| "learning_rate": 1.1220319234251191e-06, |
| "loss": 0.4218, |
| "step": 17024 |
| }, |
| { |
| "epoch": 4.029293645168911, |
| "grad_norm": 6.875, |
| "learning_rate": 1.1054285091096978e-06, |
| "loss": 0.3709, |
| "step": 17056 |
| }, |
| { |
| "epoch": 4.0368532955350815, |
| "grad_norm": 5.84375, |
| "learning_rate": 1.088933577275415e-06, |
| "loss": 0.354, |
| "step": 17088 |
| }, |
| { |
| "epoch": 4.044412945901252, |
| "grad_norm": 6.21875, |
| "learning_rate": 1.0725475873851764e-06, |
| "loss": 0.3689, |
| "step": 17120 |
| }, |
| { |
| "epoch": 4.051972596267422, |
| "grad_norm": 6.5, |
| "learning_rate": 1.0562709958673318e-06, |
| "loss": 0.3731, |
| "step": 17152 |
| }, |
| { |
| "epoch": 4.0595322466335935, |
| "grad_norm": 5.75, |
| "learning_rate": 1.0401042561029617e-06, |
| "loss": 0.3691, |
| "step": 17184 |
| }, |
| { |
| "epoch": 4.067091896999764, |
| "grad_norm": 4.71875, |
| "learning_rate": 1.0240478184132486e-06, |
| "loss": 0.3861, |
| "step": 17216 |
| }, |
| { |
| "epoch": 4.074651547365934, |
| "grad_norm": 5.90625, |
| "learning_rate": 1.008102130046938e-06, |
| "loss": 0.3691, |
| "step": 17248 |
| }, |
| { |
| "epoch": 4.0822111977321045, |
| "grad_norm": 5.625, |
| "learning_rate": 9.92267635167866e-07, |
| "loss": 0.3865, |
| "step": 17280 |
| }, |
| { |
| "epoch": 4.089770848098276, |
| "grad_norm": 7.1875, |
| "learning_rate": 9.765447748426098e-07, |
| "loss": 0.3743, |
| "step": 17312 |
| }, |
| { |
| "epoch": 4.097330498464446, |
| "grad_norm": 10.4375, |
| "learning_rate": 9.60933987028177e-07, |
| "loss": 0.3831, |
| "step": 17344 |
| }, |
| { |
| "epoch": 4.104890148830616, |
| "grad_norm": 7.0, |
| "learning_rate": 9.454357065598285e-07, |
| "loss": 0.4306, |
| "step": 17376 |
| }, |
| { |
| "epoch": 4.112449799196787, |
| "grad_norm": 9.8125, |
| "learning_rate": 9.300503651389515e-07, |
| "loss": 0.3735, |
| "step": 17408 |
| }, |
| { |
| "epoch": 4.120009449562958, |
| "grad_norm": 9.6875, |
| "learning_rate": 9.147783913210395e-07, |
| "loss": 0.4346, |
| "step": 17440 |
| }, |
| { |
| "epoch": 4.127569099929128, |
| "grad_norm": 6.3125, |
| "learning_rate": 8.996202105037549e-07, |
| "loss": 0.4109, |
| "step": 17472 |
| }, |
| { |
| "epoch": 4.135128750295299, |
| "grad_norm": 7.59375, |
| "learning_rate": 8.845762449150846e-07, |
| "loss": 0.3595, |
| "step": 17504 |
| }, |
| { |
| "epoch": 4.142688400661469, |
| "grad_norm": 4.59375, |
| "learning_rate": 8.696469136015645e-07, |
| "loss": 0.3627, |
| "step": 17536 |
| }, |
| { |
| "epoch": 4.15024805102764, |
| "grad_norm": 6.4375, |
| "learning_rate": 8.548326324166268e-07, |
| "loss": 0.4108, |
| "step": 17568 |
| }, |
| { |
| "epoch": 4.157807701393811, |
| "grad_norm": 6.34375, |
| "learning_rate": 8.40133814008997e-07, |
| "loss": 0.3924, |
| "step": 17600 |
| }, |
| { |
| "epoch": 4.165367351759981, |
| "grad_norm": 7.1875, |
| "learning_rate": 8.255508678112167e-07, |
| "loss": 0.3692, |
| "step": 17632 |
| }, |
| { |
| "epoch": 4.172927002126151, |
| "grad_norm": 6.21875, |
| "learning_rate": 8.110842000282271e-07, |
| "loss": 0.3766, |
| "step": 17664 |
| }, |
| { |
| "epoch": 4.1804866524923225, |
| "grad_norm": 6.28125, |
| "learning_rate": 7.967342136260576e-07, |
| "loss": 0.3871, |
| "step": 17696 |
| }, |
| { |
| "epoch": 4.188046302858493, |
| "grad_norm": 4.4375, |
| "learning_rate": 7.825013083206029e-07, |
| "loss": 0.3621, |
| "step": 17728 |
| }, |
| { |
| "epoch": 4.195605953224663, |
| "grad_norm": 5.625, |
| "learning_rate": 7.683858805664923e-07, |
| "loss": 0.3834, |
| "step": 17760 |
| }, |
| { |
| "epoch": 4.203165603590834, |
| "grad_norm": 6.6875, |
| "learning_rate": 7.543883235460325e-07, |
| "loss": 0.3615, |
| "step": 17792 |
| }, |
| { |
| "epoch": 4.210725253957005, |
| "grad_norm": 4.5625, |
| "learning_rate": 7.405090271582765e-07, |
| "loss": 0.3525, |
| "step": 17824 |
| }, |
| { |
| "epoch": 4.218284904323175, |
| "grad_norm": 3.9375, |
| "learning_rate": 7.267483780081419e-07, |
| "loss": 0.3559, |
| "step": 17856 |
| }, |
| { |
| "epoch": 4.2258445546893455, |
| "grad_norm": 5.25, |
| "learning_rate": 7.131067593956609e-07, |
| "loss": 0.3724, |
| "step": 17888 |
| }, |
| { |
| "epoch": 4.233404205055516, |
| "grad_norm": 5.15625, |
| "learning_rate": 6.995845513052879e-07, |
| "loss": 0.401, |
| "step": 17920 |
| }, |
| { |
| "epoch": 4.240963855421687, |
| "grad_norm": 4.34375, |
| "learning_rate": 6.861821303953264e-07, |
| "loss": 0.389, |
| "step": 17952 |
| }, |
| { |
| "epoch": 4.248523505787857, |
| "grad_norm": 6.25, |
| "learning_rate": 6.7289986998743e-07, |
| "loss": 0.4079, |
| "step": 17984 |
| }, |
| { |
| "epoch": 4.256083156154028, |
| "grad_norm": 7.78125, |
| "learning_rate": 6.597381400562087e-07, |
| "loss": 0.3613, |
| "step": 18016 |
| }, |
| { |
| "epoch": 4.263642806520198, |
| "grad_norm": 5.5625, |
| "learning_rate": 6.466973072189187e-07, |
| "loss": 0.3457, |
| "step": 18048 |
| }, |
| { |
| "epoch": 4.271202456886369, |
| "grad_norm": 6.21875, |
| "learning_rate": 6.337777347252549e-07, |
| "loss": 0.3685, |
| "step": 18080 |
| }, |
| { |
| "epoch": 4.27876210725254, |
| "grad_norm": 6.0, |
| "learning_rate": 6.209797824472292e-07, |
| "loss": 0.3891, |
| "step": 18112 |
| }, |
| { |
| "epoch": 4.28632175761871, |
| "grad_norm": 5.90625, |
| "learning_rate": 6.083038068691472e-07, |
| "loss": 0.3649, |
| "step": 18144 |
| }, |
| { |
| "epoch": 4.29388140798488, |
| "grad_norm": 5.09375, |
| "learning_rate": 5.957501610776828e-07, |
| "loss": 0.3546, |
| "step": 18176 |
| }, |
| { |
| "epoch": 4.301441058351052, |
| "grad_norm": 6.5, |
| "learning_rate": 5.833191947520312e-07, |
| "loss": 0.3714, |
| "step": 18208 |
| }, |
| { |
| "epoch": 4.309000708717222, |
| "grad_norm": 5.34375, |
| "learning_rate": 5.710112541541845e-07, |
| "loss": 0.3742, |
| "step": 18240 |
| }, |
| { |
| "epoch": 4.316560359083392, |
| "grad_norm": 5.875, |
| "learning_rate": 5.588266821192745e-07, |
| "loss": 0.3735, |
| "step": 18272 |
| }, |
| { |
| "epoch": 4.324120009449563, |
| "grad_norm": 4.5, |
| "learning_rate": 5.467658180460284e-07, |
| "loss": 0.3636, |
| "step": 18304 |
| }, |
| { |
| "epoch": 4.331679659815734, |
| "grad_norm": 6.03125, |
| "learning_rate": 5.348289978873127e-07, |
| "loss": 0.3822, |
| "step": 18336 |
| }, |
| { |
| "epoch": 4.339239310181904, |
| "grad_norm": 7.21875, |
| "learning_rate": 5.230165541407784e-07, |
| "loss": 0.3793, |
| "step": 18368 |
| }, |
| { |
| "epoch": 4.346798960548075, |
| "grad_norm": 7.53125, |
| "learning_rate": 5.11328815839594e-07, |
| "loss": 0.321, |
| "step": 18400 |
| }, |
| { |
| "epoch": 4.354358610914245, |
| "grad_norm": 4.71875, |
| "learning_rate": 4.997661085432892e-07, |
| "loss": 0.3217, |
| "step": 18432 |
| }, |
| { |
| "epoch": 4.361918261280416, |
| "grad_norm": 6.125, |
| "learning_rate": 4.883287543286742e-07, |
| "loss": 0.385, |
| "step": 18464 |
| }, |
| { |
| "epoch": 4.3694779116465865, |
| "grad_norm": 4.9375, |
| "learning_rate": 4.770170717808803e-07, |
| "loss": 0.3809, |
| "step": 18496 |
| }, |
| { |
| "epoch": 4.377037562012757, |
| "grad_norm": 8.25, |
| "learning_rate": 4.65831375984479e-07, |
| "loss": 0.3516, |
| "step": 18528 |
| }, |
| { |
| "epoch": 4.384597212378927, |
| "grad_norm": 6.21875, |
| "learning_rate": 4.5477197851470647e-07, |
| "loss": 0.4025, |
| "step": 18560 |
| }, |
| { |
| "epoch": 4.392156862745098, |
| "grad_norm": 8.25, |
| "learning_rate": 4.4383918742878507e-07, |
| "loss": 0.3838, |
| "step": 18592 |
| }, |
| { |
| "epoch": 4.399716513111269, |
| "grad_norm": 5.375, |
| "learning_rate": 4.3303330725734284e-07, |
| "loss": 0.3612, |
| "step": 18624 |
| }, |
| { |
| "epoch": 4.407276163477439, |
| "grad_norm": 9.125, |
| "learning_rate": 4.223546389959321e-07, |
| "loss": 0.3869, |
| "step": 18656 |
| }, |
| { |
| "epoch": 4.4148358138436095, |
| "grad_norm": 10.4375, |
| "learning_rate": 4.1180348009664084e-07, |
| "loss": 0.3787, |
| "step": 18688 |
| }, |
| { |
| "epoch": 4.422395464209781, |
| "grad_norm": 4.71875, |
| "learning_rate": 4.013801244598131e-07, |
| "loss": 0.3816, |
| "step": 18720 |
| }, |
| { |
| "epoch": 4.429955114575951, |
| "grad_norm": 11.3125, |
| "learning_rate": 3.910848624258573e-07, |
| "loss": 0.38, |
| "step": 18752 |
| }, |
| { |
| "epoch": 4.437514764942121, |
| "grad_norm": 6.125, |
| "learning_rate": 3.809179807671637e-07, |
| "loss": 0.3407, |
| "step": 18784 |
| }, |
| { |
| "epoch": 4.445074415308292, |
| "grad_norm": 6.15625, |
| "learning_rate": 3.7087976268011026e-07, |
| "loss": 0.348, |
| "step": 18816 |
| }, |
| { |
| "epoch": 4.452634065674463, |
| "grad_norm": 7.875, |
| "learning_rate": 3.609704877771825e-07, |
| "loss": 0.3691, |
| "step": 18848 |
| }, |
| { |
| "epoch": 4.460193716040633, |
| "grad_norm": 5.625, |
| "learning_rate": 3.511904320791742e-07, |
| "loss": 0.3718, |
| "step": 18880 |
| }, |
| { |
| "epoch": 4.467753366406804, |
| "grad_norm": 5.5625, |
| "learning_rate": 3.4153986800751104e-07, |
| "loss": 0.3875, |
| "step": 18912 |
| }, |
| { |
| "epoch": 4.475313016772974, |
| "grad_norm": 8.9375, |
| "learning_rate": 3.3201906437665355e-07, |
| "loss": 0.3819, |
| "step": 18944 |
| }, |
| { |
| "epoch": 4.482872667139145, |
| "grad_norm": 5.875, |
| "learning_rate": 3.2262828638661093e-07, |
| "loss": 0.3796, |
| "step": 18976 |
| }, |
| { |
| "epoch": 4.490432317505316, |
| "grad_norm": 6.28125, |
| "learning_rate": 3.1336779561555674e-07, |
| "loss": 0.3351, |
| "step": 19008 |
| }, |
| { |
| "epoch": 4.497991967871486, |
| "grad_norm": 8.75, |
| "learning_rate": 3.0423785001254256e-07, |
| "loss": 0.3565, |
| "step": 19040 |
| }, |
| { |
| "epoch": 4.505551618237656, |
| "grad_norm": 5.84375, |
| "learning_rate": 2.9523870389030653e-07, |
| "loss": 0.357, |
| "step": 19072 |
| }, |
| { |
| "epoch": 4.5131112686038275, |
| "grad_norm": 4.375, |
| "learning_rate": 2.8637060791820105e-07, |
| "loss": 0.3654, |
| "step": 19104 |
| }, |
| { |
| "epoch": 4.520670918969998, |
| "grad_norm": 6.59375, |
| "learning_rate": 2.7763380911519646e-07, |
| "loss": 0.3916, |
| "step": 19136 |
| }, |
| { |
| "epoch": 4.528230569336168, |
| "grad_norm": 5.5, |
| "learning_rate": 2.690285508430135e-07, |
| "loss": 0.3595, |
| "step": 19168 |
| }, |
| { |
| "epoch": 4.5357902197023385, |
| "grad_norm": 6.96875, |
| "learning_rate": 2.605550727993367e-07, |
| "loss": 0.3779, |
| "step": 19200 |
| }, |
| { |
| "epoch": 4.543349870068509, |
| "grad_norm": 6.6875, |
| "learning_rate": 2.522136110111395e-07, |
| "loss": 0.3547, |
| "step": 19232 |
| }, |
| { |
| "epoch": 4.55090952043468, |
| "grad_norm": 6.125, |
| "learning_rate": 2.4400439782810814e-07, |
| "loss": 0.3585, |
| "step": 19264 |
| }, |
| { |
| "epoch": 4.5584691708008505, |
| "grad_norm": 6.0625, |
| "learning_rate": 2.3592766191617655e-07, |
| "loss": 0.3724, |
| "step": 19296 |
| }, |
| { |
| "epoch": 4.566028821167021, |
| "grad_norm": 11.3125, |
| "learning_rate": 2.2798362825114496e-07, |
| "loss": 0.3584, |
| "step": 19328 |
| }, |
| { |
| "epoch": 4.573588471533192, |
| "grad_norm": 7.96875, |
| "learning_rate": 2.2017251811242702e-07, |
| "loss": 0.3591, |
| "step": 19360 |
| }, |
| { |
| "epoch": 4.581148121899362, |
| "grad_norm": 6.46875, |
| "learning_rate": 2.124945490768715e-07, |
| "loss": 0.3554, |
| "step": 19392 |
| }, |
| { |
| "epoch": 4.588707772265533, |
| "grad_norm": 4.21875, |
| "learning_rate": 2.0494993501271708e-07, |
| "loss": 0.3778, |
| "step": 19424 |
| }, |
| { |
| "epoch": 4.596267422631703, |
| "grad_norm": 4.34375, |
| "learning_rate": 1.975388860736216e-07, |
| "loss": 0.3931, |
| "step": 19456 |
| }, |
| { |
| "epoch": 4.603827072997873, |
| "grad_norm": 5.71875, |
| "learning_rate": 1.9026160869281773e-07, |
| "loss": 0.3732, |
| "step": 19488 |
| }, |
| { |
| "epoch": 4.611386723364045, |
| "grad_norm": 6.75, |
| "learning_rate": 1.831183055773561e-07, |
| "loss": 0.379, |
| "step": 19520 |
| }, |
| { |
| "epoch": 4.618946373730215, |
| "grad_norm": 5.46875, |
| "learning_rate": 1.7610917570246465e-07, |
| "loss": 0.3433, |
| "step": 19552 |
| }, |
| { |
| "epoch": 4.626506024096385, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.6923441430600152e-07, |
| "loss": 0.4048, |
| "step": 19584 |
| }, |
| { |
| "epoch": 4.634065674462557, |
| "grad_norm": 7.46875, |
| "learning_rate": 1.624942128830198e-07, |
| "loss": 0.3748, |
| "step": 19616 |
| }, |
| { |
| "epoch": 4.641625324828727, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.5588875918043255e-07, |
| "loss": 0.3976, |
| "step": 19648 |
| }, |
| { |
| "epoch": 4.649184975194897, |
| "grad_norm": 4.3125, |
| "learning_rate": 1.4941823719178185e-07, |
| "loss": 0.3547, |
| "step": 19680 |
| }, |
| { |
| "epoch": 4.656744625561068, |
| "grad_norm": 7.3125, |
| "learning_rate": 1.430828271521173e-07, |
| "loss": 0.3879, |
| "step": 19712 |
| }, |
| { |
| "epoch": 4.664304275927238, |
| "grad_norm": 5.625, |
| "learning_rate": 1.3688270553296968e-07, |
| "loss": 0.3835, |
| "step": 19744 |
| }, |
| { |
| "epoch": 4.671863926293409, |
| "grad_norm": 5.125, |
| "learning_rate": 1.3081804503744188e-07, |
| "loss": 0.4104, |
| "step": 19776 |
| }, |
| { |
| "epoch": 4.6794235766595795, |
| "grad_norm": 5.71875, |
| "learning_rate": 1.2488901459539404e-07, |
| "loss": 0.3825, |
| "step": 19808 |
| }, |
| { |
| "epoch": 4.68698322702575, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.1909577935873939e-07, |
| "loss": 0.3976, |
| "step": 19840 |
| }, |
| { |
| "epoch": 4.694542877391921, |
| "grad_norm": 6.15625, |
| "learning_rate": 1.1343850069684415e-07, |
| "loss": 0.3609, |
| "step": 19872 |
| }, |
| { |
| "epoch": 4.7021025277580915, |
| "grad_norm": 4.65625, |
| "learning_rate": 1.079173361920316e-07, |
| "loss": 0.4363, |
| "step": 19904 |
| }, |
| { |
| "epoch": 4.709662178124262, |
| "grad_norm": 7.15625, |
| "learning_rate": 1.0253243963519343e-07, |
| "loss": 0.3626, |
| "step": 19936 |
| }, |
| { |
| "epoch": 4.717221828490432, |
| "grad_norm": 7.9375, |
| "learning_rate": 9.728396102150872e-08, |
| "loss": 0.3629, |
| "step": 19968 |
| }, |
| { |
| "epoch": 4.7247814788566025, |
| "grad_norm": 6.875, |
| "learning_rate": 9.217204654625778e-08, |
| "loss": 0.4077, |
| "step": 20000 |
| }, |
| { |
| "epoch": 4.732341129222774, |
| "grad_norm": 7.03125, |
| "learning_rate": 8.71968386007599e-08, |
| "loss": 0.3592, |
| "step": 20032 |
| }, |
| { |
| "epoch": 4.739900779588944, |
| "grad_norm": 8.5625, |
| "learning_rate": 8.235847576839984e-08, |
| "loss": 0.3695, |
| "step": 20064 |
| }, |
| { |
| "epoch": 4.747460429955114, |
| "grad_norm": 6.9375, |
| "learning_rate": 7.765709282077149e-08, |
| "loss": 0.3835, |
| "step": 20096 |
| }, |
| { |
| "epoch": 4.755020080321285, |
| "grad_norm": 6.75, |
| "learning_rate": 7.309282071392087e-08, |
| "loss": 0.3384, |
| "step": 20128 |
| }, |
| { |
| "epoch": 4.762579730687456, |
| "grad_norm": 7.25, |
| "learning_rate": 6.866578658470179e-08, |
| "loss": 0.3575, |
| "step": 20160 |
| }, |
| { |
| "epoch": 4.770139381053626, |
| "grad_norm": 5.78125, |
| "learning_rate": 6.437611374723152e-08, |
| "loss": 0.3799, |
| "step": 20192 |
| }, |
| { |
| "epoch": 4.777699031419797, |
| "grad_norm": 6.9375, |
| "learning_rate": 6.022392168945623e-08, |
| "loss": 0.3736, |
| "step": 20224 |
| }, |
| { |
| "epoch": 4.785258681785967, |
| "grad_norm": 3.953125, |
| "learning_rate": 5.620932606982599e-08, |
| "loss": 0.3855, |
| "step": 20256 |
| }, |
| { |
| "epoch": 4.792818332152138, |
| "grad_norm": 6.28125, |
| "learning_rate": 5.233243871406779e-08, |
| "loss": 0.3909, |
| "step": 20288 |
| }, |
| { |
| "epoch": 4.800377982518309, |
| "grad_norm": 5.3125, |
| "learning_rate": 4.859336761207645e-08, |
| "loss": 0.3379, |
| "step": 20320 |
| }, |
| { |
| "epoch": 4.807937632884479, |
| "grad_norm": 8.0, |
| "learning_rate": 4.499221691490085e-08, |
| "loss": 0.3821, |
| "step": 20352 |
| }, |
| { |
| "epoch": 4.815497283250649, |
| "grad_norm": 3.84375, |
| "learning_rate": 4.152908693184743e-08, |
| "loss": 0.3674, |
| "step": 20384 |
| }, |
| { |
| "epoch": 4.8230569336168205, |
| "grad_norm": 5.875, |
| "learning_rate": 3.820407412768234e-08, |
| "loss": 0.406, |
| "step": 20416 |
| }, |
| { |
| "epoch": 4.830616583982991, |
| "grad_norm": 7.96875, |
| "learning_rate": 3.5017271119949234e-08, |
| "loss": 0.4036, |
| "step": 20448 |
| }, |
| { |
| "epoch": 4.838176234349161, |
| "grad_norm": 6.21875, |
| "learning_rate": 3.196876667638404e-08, |
| "loss": 0.3762, |
| "step": 20480 |
| }, |
| { |
| "epoch": 4.845735884715332, |
| "grad_norm": 6.40625, |
| "learning_rate": 2.9058645712445876e-08, |
| "loss": 0.3783, |
| "step": 20512 |
| }, |
| { |
| "epoch": 4.853295535081503, |
| "grad_norm": 3.984375, |
| "learning_rate": 2.628698928895057e-08, |
| "loss": 0.3324, |
| "step": 20544 |
| }, |
| { |
| "epoch": 4.860855185447673, |
| "grad_norm": 5.9375, |
| "learning_rate": 2.365387460981361e-08, |
| "loss": 0.3826, |
| "step": 20576 |
| }, |
| { |
| "epoch": 4.8684148358138435, |
| "grad_norm": 4.28125, |
| "learning_rate": 2.1159375019897398e-08, |
| "loss": 0.3714, |
| "step": 20608 |
| }, |
| { |
| "epoch": 4.875974486180014, |
| "grad_norm": 7.875, |
| "learning_rate": 1.8803560002971232e-08, |
| "loss": 0.3905, |
| "step": 20640 |
| }, |
| { |
| "epoch": 4.883534136546185, |
| "grad_norm": 10.75, |
| "learning_rate": 1.65864951797734e-08, |
| "loss": 0.3769, |
| "step": 20672 |
| }, |
| { |
| "epoch": 4.891093786912355, |
| "grad_norm": 8.875, |
| "learning_rate": 1.450824230618486e-08, |
| "loss": 0.3892, |
| "step": 20704 |
| }, |
| { |
| "epoch": 4.898653437278526, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.2568859271508971e-08, |
| "loss": 0.3954, |
| "step": 20736 |
| }, |
| { |
| "epoch": 4.906213087644696, |
| "grad_norm": 7.28125, |
| "learning_rate": 1.0768400096856645e-08, |
| "loss": 0.3848, |
| "step": 20768 |
| }, |
| { |
| "epoch": 4.913772738010867, |
| "grad_norm": 5.40625, |
| "learning_rate": 9.106914933646461e-09, |
| "loss": 0.3892, |
| "step": 20800 |
| }, |
| { |
| "epoch": 4.921332388377038, |
| "grad_norm": 6.9375, |
| "learning_rate": 7.58445006220132e-09, |
| "loss": 0.3719, |
| "step": 20832 |
| }, |
| { |
| "epoch": 4.928892038743208, |
| "grad_norm": 5.625, |
| "learning_rate": 6.201047890465606e-09, |
| "loss": 0.389, |
| "step": 20864 |
| }, |
| { |
| "epoch": 4.936451689109378, |
| "grad_norm": 8.875, |
| "learning_rate": 4.9567469528194465e-09, |
| "loss": 0.3416, |
| "step": 20896 |
| }, |
| { |
| "epoch": 4.94401133947555, |
| "grad_norm": 6.46875, |
| "learning_rate": 3.851581909007363e-09, |
| "loss": 0.396, |
| "step": 20928 |
| }, |
| { |
| "epoch": 4.95157098984172, |
| "grad_norm": 8.875, |
| "learning_rate": 2.885583543172921e-09, |
| "loss": 0.3682, |
| "step": 20960 |
| }, |
| { |
| "epoch": 4.95913064020789, |
| "grad_norm": 7.875, |
| "learning_rate": 2.0587787629994248e-09, |
| "loss": 0.3538, |
| "step": 20992 |
| }, |
| { |
| "epoch": 4.966690290574061, |
| "grad_norm": 21.5, |
| "learning_rate": 1.37119059896329e-09, |
| "loss": 0.398, |
| "step": 21024 |
| }, |
| { |
| "epoch": 4.974249940940232, |
| "grad_norm": 5.71875, |
| "learning_rate": 8.228382036901128e-10, |
| "loss": 0.3672, |
| "step": 21056 |
| }, |
| { |
| "epoch": 4.981809591306402, |
| "grad_norm": 4.84375, |
| "learning_rate": 4.1373685142176433e-10, |
| "loss": 0.3525, |
| "step": 21088 |
| }, |
| { |
| "epoch": 4.989369241672573, |
| "grad_norm": 6.96875, |
| "learning_rate": 1.4389793759173043e-10, |
| "loss": 0.34, |
| "step": 21120 |
| }, |
| { |
| "epoch": 4.996928892038743, |
| "grad_norm": 3.75, |
| "learning_rate": 1.3328978507032298e-11, |
| "loss": 0.3755, |
| "step": 21152 |
| }, |
| { |
| "epoch": 5.0, |
| "step": 21165, |
| "total_flos": 3.745021438134743e+17, |
| "train_loss": 0.3988672580218839, |
| "train_runtime": 7931.2769, |
| "train_samples_per_second": 10.674, |
| "train_steps_per_second": 2.669 |
| } |
| ], |
| "logging_steps": 32, |
| "max_steps": 21165, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.745021438134743e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|