multitracks-lp / trainer_state.json
deepaksamuel-cuk's picture
Upload 13 files
f8214ed verified
Invalid JSON:Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 29.12,
"eval_steps": 100,
"global_step": 182000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016,
"grad_norm": 1.1145988702774048,
"learning_rate": 5.94e-05,
"loss": 129.2138,
"step": 100
},
{
"epoch": 0.032,
"grad_norm": 0.3314463794231415,
"learning_rate": 0.0001194,
"loss": 147.1265,
"step": 200
},
{
"epoch": 0.048,
"grad_norm": 0.30200499296188354,
"learning_rate": 0.00017939999999999997,
"loss": 147.1375,
"step": 300
},
{
"epoch": 0.064,
"grad_norm": 0.20890414714813232,
"learning_rate": 0.0002394,
"loss": 141.107,
"step": 400
},
{
"epoch": 0.08,
"grad_norm": 0.19977182149887085,
"learning_rate": 0.00029939999999999996,
"loss": 130.2311,
"step": 500
},
{
"epoch": 0.096,
"grad_norm": 0.1718936711549759,
"learning_rate": 0.00029999762390495616,
"loss": 116.9488,
"step": 600
},
{
"epoch": 0.112,
"grad_norm": 0.21659506857395172,
"learning_rate": 0.00029999522380895233,
"loss": 106.3702,
"step": 700
},
{
"epoch": 0.128,
"grad_norm": 0.19612713158130646,
"learning_rate": 0.0002999928237129485,
"loss": 98.8033,
"step": 800
},
{
"epoch": 0.144,
"grad_norm": 0.18958421051502228,
"learning_rate": 0.00029999042361694467,
"loss": 94.6761,
"step": 900
},
{
"epoch": 0.16,
"grad_norm": 0.25341877341270447,
"learning_rate": 0.00029998802352094084,
"loss": 88.2629,
"step": 1000
},
{
"epoch": 0.176,
"grad_norm": 0.1762186735868454,
"learning_rate": 0.000299985623424937,
"loss": 87.4362,
"step": 1100
},
{
"epoch": 0.192,
"grad_norm": 0.23407000303268433,
"learning_rate": 0.0002999832233289331,
"loss": 85.7211,
"step": 1200
},
{
"epoch": 0.208,
"grad_norm": 0.23202084004878998,
"learning_rate": 0.0002999808232329293,
"loss": 81.4749,
"step": 1300
},
{
"epoch": 0.224,
"grad_norm": 0.1819111853837967,
"learning_rate": 0.00029997842313692546,
"loss": 80.3999,
"step": 1400
},
{
"epoch": 0.24,
"grad_norm": 0.16154050827026367,
"learning_rate": 0.00029997602304092163,
"loss": 80.5113,
"step": 1500
},
{
"epoch": 0.256,
"grad_norm": 0.20147816836833954,
"learning_rate": 0.0002999736229449178,
"loss": 77.4306,
"step": 1600
},
{
"epoch": 0.272,
"grad_norm": 0.2032860815525055,
"learning_rate": 0.0002999712228489139,
"loss": 76.3299,
"step": 1700
},
{
"epoch": 0.288,
"grad_norm": 0.20103086531162262,
"learning_rate": 0.0002999688227529101,
"loss": 77.0755,
"step": 1800
},
{
"epoch": 0.304,
"grad_norm": 0.1930929720401764,
"learning_rate": 0.00029996642265690625,
"loss": 74.2643,
"step": 1900
},
{
"epoch": 0.32,
"grad_norm": 0.21013671159744263,
"learning_rate": 0.0002999640225609024,
"loss": 75.9168,
"step": 2000
},
{
"epoch": 0.336,
"grad_norm": 0.2554585635662079,
"learning_rate": 0.0002999616224648986,
"loss": 75.2005,
"step": 2100
},
{
"epoch": 0.352,
"grad_norm": 0.21000510454177856,
"learning_rate": 0.00029995922236889476,
"loss": 74.1565,
"step": 2200
},
{
"epoch": 0.368,
"grad_norm": 0.2096049040555954,
"learning_rate": 0.0002999568222728909,
"loss": 73.3684,
"step": 2300
},
{
"epoch": 0.384,
"grad_norm": 0.2806188464164734,
"learning_rate": 0.00029995442217688705,
"loss": 73.9772,
"step": 2400
},
{
"epoch": 0.4,
"grad_norm": 0.17476481199264526,
"learning_rate": 0.0002999520220808832,
"loss": 73.7125,
"step": 2500
},
{
"epoch": 0.416,
"grad_norm": 0.26867198944091797,
"learning_rate": 0.0002999496219848794,
"loss": 72.5119,
"step": 2600
},
{
"epoch": 0.432,
"grad_norm": 0.1896703690290451,
"learning_rate": 0.00029994722188887555,
"loss": 72.6918,
"step": 2700
},
{
"epoch": 0.448,
"grad_norm": 0.2521280348300934,
"learning_rate": 0.00029994482179287167,
"loss": 72.1229,
"step": 2800
},
{
"epoch": 0.464,
"grad_norm": 0.20409554243087769,
"learning_rate": 0.00029994242169686784,
"loss": 72.3524,
"step": 2900
},
{
"epoch": 0.48,
"grad_norm": 0.1911861002445221,
"learning_rate": 0.000299940021600864,
"loss": 70.9714,
"step": 3000
},
{
"epoch": 0.496,
"grad_norm": 0.21338903903961182,
"learning_rate": 0.0002999376215048602,
"loss": 69.5716,
"step": 3100
},
{
"epoch": 0.512,
"grad_norm": 0.20922720432281494,
"learning_rate": 0.00029993522140885634,
"loss": 70.1812,
"step": 3200
},
{
"epoch": 0.528,
"grad_norm": 0.2678331434726715,
"learning_rate": 0.0002999328213128525,
"loss": 68.8041,
"step": 3300
},
{
"epoch": 0.544,
"grad_norm": 0.25610026717185974,
"learning_rate": 0.00029993042121684863,
"loss": 71.186,
"step": 3400
},
{
"epoch": 0.56,
"grad_norm": 0.23267875611782074,
"learning_rate": 0.0002999280211208448,
"loss": 68.9921,
"step": 3500
},
{
"epoch": 0.576,
"grad_norm": 0.23876765370368958,
"learning_rate": 0.00029992562102484097,
"loss": 69.738,
"step": 3600
},
{
"epoch": 0.592,
"grad_norm": 0.1865028291940689,
"learning_rate": 0.00029992322092883714,
"loss": 68.9813,
"step": 3700
},
{
"epoch": 0.608,
"grad_norm": 0.21735595166683197,
"learning_rate": 0.0002999208208328333,
"loss": 67.5755,
"step": 3800
},
{
"epoch": 0.624,
"grad_norm": 0.16909943521022797,
"learning_rate": 0.0002999184207368294,
"loss": 66.3015,
"step": 3900
},
{
"epoch": 0.64,
"grad_norm": 0.19918648898601532,
"learning_rate": 0.0002999160206408256,
"loss": 67.3844,
"step": 4000
},
{
"epoch": 0.656,
"grad_norm": 0.22282840311527252,
"learning_rate": 0.00029991362054482176,
"loss": 66.0008,
"step": 4100
},
{
"epoch": 0.672,
"grad_norm": 0.19900047779083252,
"learning_rate": 0.00029991122044881793,
"loss": 66.029,
"step": 4200
},
{
"epoch": 0.688,
"grad_norm": 0.2067142128944397,
"learning_rate": 0.0002999088203528141,
"loss": 65.7196,
"step": 4300
},
{
"epoch": 0.704,
"grad_norm": 0.24062038958072662,
"learning_rate": 0.00029990642025681027,
"loss": 66.7571,
"step": 4400
},
{
"epoch": 0.72,
"grad_norm": 0.2454902082681656,
"learning_rate": 0.0002999040201608064,
"loss": 65.7736,
"step": 4500
},
{
"epoch": 0.736,
"grad_norm": 0.24499955773353577,
"learning_rate": 0.00029990162006480255,
"loss": 65.498,
"step": 4600
},
{
"epoch": 0.752,
"grad_norm": 0.2421354055404663,
"learning_rate": 0.0002998992199687987,
"loss": 65.9207,
"step": 4700
},
{
"epoch": 0.768,
"grad_norm": 0.1900254338979721,
"learning_rate": 0.0002998968198727949,
"loss": 63.4017,
"step": 4800
},
{
"epoch": 0.784,
"grad_norm": 0.21995197236537933,
"learning_rate": 0.00029989441977679106,
"loss": 65.4319,
"step": 4900
},
{
"epoch": 0.8,
"grad_norm": 0.2170778065919876,
"learning_rate": 0.00029989201968078717,
"loss": 64.1503,
"step": 5000
},
{
"epoch": 0.816,
"grad_norm": 0.29141783714294434,
"learning_rate": 0.00029988961958478334,
"loss": 63.4509,
"step": 5100
},
{
"epoch": 0.832,
"grad_norm": 0.2149534821510315,
"learning_rate": 0.0002998872194887795,
"loss": 63.8549,
"step": 5200
},
{
"epoch": 0.848,
"grad_norm": 0.2090325504541397,
"learning_rate": 0.0002998848193927757,
"loss": 62.5135,
"step": 5300
},
{
"epoch": 0.864,
"grad_norm": 0.19093327224254608,
"learning_rate": 0.00029988241929677185,
"loss": 64.1856,
"step": 5400
},
{
"epoch": 0.88,
"grad_norm": 0.24676312506198883,
"learning_rate": 0.000299880019200768,
"loss": 62.8992,
"step": 5500
},
{
"epoch": 0.896,
"grad_norm": 0.2047237902879715,
"learning_rate": 0.00029987761910476413,
"loss": 63.5,
"step": 5600
},
{
"epoch": 0.912,
"grad_norm": 0.2169736623764038,
"learning_rate": 0.0002998752190087603,
"loss": 63.2706,
"step": 5700
},
{
"epoch": 0.928,
"grad_norm": 0.2212333083152771,
"learning_rate": 0.00029987281891275647,
"loss": 62.8563,
"step": 5800
},
{
"epoch": 0.944,
"grad_norm": 0.22105100750923157,
"learning_rate": 0.00029987041881675264,
"loss": 61.4049,
"step": 5900
},
{
"epoch": 0.96,
"grad_norm": 0.21934692561626434,
"learning_rate": 0.0002998680187207488,
"loss": 61.2102,
"step": 6000
},
{
"epoch": 0.976,
"grad_norm": 0.231471449136734,
"learning_rate": 0.0002998656186247449,
"loss": 61.161,
"step": 6100
},
{
"epoch": 0.992,
"grad_norm": 0.20244845747947693,
"learning_rate": 0.0002998632185287411,
"loss": 61.5284,
"step": 6200
},
{
"epoch": 1.008,
"grad_norm": 0.31659385561943054,
"learning_rate": 0.00029986081843273726,
"loss": 59.6197,
"step": 6300
},
{
"epoch": 1.024,
"grad_norm": 0.22351042926311493,
"learning_rate": 0.00029985841833673343,
"loss": 60.8731,
"step": 6400
},
{
"epoch": 1.04,
"grad_norm": 0.20470276474952698,
"learning_rate": 0.0002998560182407296,
"loss": 60.5648,
"step": 6500
},
{
"epoch": 1.056,
"grad_norm": 0.17768125236034393,
"learning_rate": 0.00029985361814472577,
"loss": 59.2689,
"step": 6600
},
{
"epoch": 1.072,
"grad_norm": 0.20775848627090454,
"learning_rate": 0.0002998512180487219,
"loss": 58.2776,
"step": 6700
},
{
"epoch": 1.088,
"grad_norm": 0.2682810127735138,
"learning_rate": 0.00029984881795271806,
"loss": 60.5164,
"step": 6800
},
{
"epoch": 1.104,
"grad_norm": 0.22458679974079132,
"learning_rate": 0.0002998464178567142,
"loss": 60.1217,
"step": 6900
},
{
"epoch": 1.12,
"grad_norm": 0.22781415283679962,
"learning_rate": 0.0002998440177607104,
"loss": 58.191,
"step": 7000
},
{
"epoch": 1.1360000000000001,
"grad_norm": 0.2532273232936859,
"learning_rate": 0.00029984161766470656,
"loss": 58.8972,
"step": 7100
},
{
"epoch": 1.152,
"grad_norm": 0.2014983743429184,
"learning_rate": 0.00029983921756870273,
"loss": 58.7748,
"step": 7200
},
{
"epoch": 1.168,
"grad_norm": 0.19773030281066895,
"learning_rate": 0.0002998368174726989,
"loss": 57.9689,
"step": 7300
},
{
"epoch": 1.184,
"grad_norm": 0.245356023311615,
"learning_rate": 0.00029983441737669507,
"loss": 57.855,
"step": 7400
},
{
"epoch": 1.2,
"grad_norm": 0.2565186023712158,
"learning_rate": 0.00029983201728069124,
"loss": 56.8152,
"step": 7500
},
{
"epoch": 1.216,
"grad_norm": 0.17781591415405273,
"learning_rate": 0.00029982961718468735,
"loss": 55.2139,
"step": 7600
},
{
"epoch": 1.232,
"grad_norm": 0.21849973499774933,
"learning_rate": 0.0002998272170886835,
"loss": 55.9843,
"step": 7700
},
{
"epoch": 1.248,
"grad_norm": 0.17623578011989594,
"learning_rate": 0.0002998248169926797,
"loss": 57.3084,
"step": 7800
},
{
"epoch": 1.264,
"grad_norm": 0.22286267578601837,
"learning_rate": 0.00029982241689667586,
"loss": 56.4191,
"step": 7900
},
{
"epoch": 1.28,
"grad_norm": 0.20891787111759186,
"learning_rate": 0.00029982001680067203,
"loss": 56.4775,
"step": 8000
},
{
"epoch": 1.296,
"grad_norm": 0.19925983250141144,
"learning_rate": 0.00029981761670466815,
"loss": 55.0521,
"step": 8100
},
{
"epoch": 1.312,
"grad_norm": 0.22015956044197083,
"learning_rate": 0.0002998152166086643,
"loss": 55.6771,
"step": 8200
},
{
"epoch": 1.328,
"grad_norm": 0.24997876584529877,
"learning_rate": 0.0002998128165126605,
"loss": 53.8931,
"step": 8300
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.2933981418609619,
"learning_rate": 0.00029981041641665665,
"loss": 56.6028,
"step": 8400
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.1963578313589096,
"learning_rate": 0.0002998080163206528,
"loss": 54.5404,
"step": 8500
},
{
"epoch": 1.376,
"grad_norm": 0.21487855911254883,
"learning_rate": 0.000299805616224649,
"loss": 54.2586,
"step": 8600
},
{
"epoch": 1.392,
"grad_norm": 0.21776583790779114,
"learning_rate": 0.0002998032161286451,
"loss": 53.9896,
"step": 8700
},
{
"epoch": 1.408,
"grad_norm": 0.2172229140996933,
"learning_rate": 0.0002998008160326413,
"loss": 53.8424,
"step": 8800
},
{
"epoch": 1.424,
"grad_norm": 0.23105138540267944,
"learning_rate": 0.00029979841593663745,
"loss": 54.1874,
"step": 8900
},
{
"epoch": 1.44,
"grad_norm": 0.18797878921031952,
"learning_rate": 0.0002997960158406336,
"loss": 53.3869,
"step": 9000
},
{
"epoch": 1.456,
"grad_norm": 0.20597319304943085,
"learning_rate": 0.0002997936157446298,
"loss": 53.7132,
"step": 9100
},
{
"epoch": 1.472,
"grad_norm": 0.21674391627311707,
"learning_rate": 0.00029979121564862595,
"loss": 52.2728,
"step": 9200
},
{
"epoch": 1.488,
"grad_norm": 0.2250959277153015,
"learning_rate": 0.00029978881555262207,
"loss": 53.3457,
"step": 9300
},
{
"epoch": 1.504,
"grad_norm": 0.19289842247962952,
"learning_rate": 0.00029978641545661824,
"loss": 52.898,
"step": 9400
},
{
"epoch": 1.52,
"grad_norm": 0.2215307652950287,
"learning_rate": 0.0002997840153606144,
"loss": 52.8446,
"step": 9500
},
{
"epoch": 1.536,
"grad_norm": 0.19949446618556976,
"learning_rate": 0.0002997816152646106,
"loss": 51.9649,
"step": 9600
},
{
"epoch": 1.552,
"grad_norm": 0.1753661036491394,
"learning_rate": 0.00029977921516860675,
"loss": 51.5562,
"step": 9700
},
{
"epoch": 1.568,
"grad_norm": 0.22938130795955658,
"learning_rate": 0.00029977681507260286,
"loss": 52.4538,
"step": 9800
},
{
"epoch": 1.584,
"grad_norm": 0.255227655172348,
"learning_rate": 0.00029977441497659903,
"loss": 50.8902,
"step": 9900
},
{
"epoch": 1.6,
"grad_norm": 0.24369871616363525,
"learning_rate": 0.0002997720148805952,
"loss": 50.8092,
"step": 10000
},
{
"epoch": 1.616,
"grad_norm": 0.22126376628875732,
"learning_rate": 0.0002997696387855514,
"loss": 51.0513,
"step": 10100
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.199215367436409,
"learning_rate": 0.00029976723868954756,
"loss": 49.6234,
"step": 10200
},
{
"epoch": 1.6480000000000001,
"grad_norm": 0.22058773040771484,
"learning_rate": 0.0002997648385935437,
"loss": 51.2333,
"step": 10300
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.26106688380241394,
"learning_rate": 0.0002997624384975399,
"loss": 49.6582,
"step": 10400
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.23437049984931946,
"learning_rate": 0.00029976003840153606,
"loss": 49.6097,
"step": 10500
},
{
"epoch": 1.696,
"grad_norm": 0.1709340363740921,
"learning_rate": 0.00029975763830553223,
"loss": 49.9149,
"step": 10600
},
{
"epoch": 1.712,
"grad_norm": 0.2278878539800644,
"learning_rate": 0.00029975523820952835,
"loss": 50.2495,
"step": 10700
},
{
"epoch": 1.728,
"grad_norm": 0.25324809551239014,
"learning_rate": 0.0002997528381135245,
"loss": 48.3701,
"step": 10800
},
{
"epoch": 1.744,
"grad_norm": 0.21413564682006836,
"learning_rate": 0.0002997504380175207,
"loss": 48.8447,
"step": 10900
},
{
"epoch": 1.76,
"grad_norm": 0.2975509464740753,
"learning_rate": 0.00029974803792151686,
"loss": 50.0095,
"step": 11000
},
{
"epoch": 1.776,
"grad_norm": 0.19792191684246063,
"learning_rate": 0.00029974566182647304,
"loss": 49.2986,
"step": 11100
},
{
"epoch": 1.792,
"grad_norm": 0.2350345253944397,
"learning_rate": 0.0002997432617304692,
"loss": 48.7027,
"step": 11200
},
{
"epoch": 1.808,
"grad_norm": 0.19396322965621948,
"learning_rate": 0.00029974086163446533,
"loss": 47.9713,
"step": 11300
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.2414630949497223,
"learning_rate": 0.0002997384615384615,
"loss": 48.7363,
"step": 11400
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.2678147554397583,
"learning_rate": 0.00029973606144245767,
"loss": 48.4818,
"step": 11500
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.19563674926757812,
"learning_rate": 0.00029973366134645384,
"loss": 48.2693,
"step": 11600
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.22531713545322418,
"learning_rate": 0.00029973126125045,
"loss": 47.758,
"step": 11700
},
{
"epoch": 1.888,
"grad_norm": 0.22199738025665283,
"learning_rate": 0.0002997288611544461,
"loss": 46.9644,
"step": 11800
},
{
"epoch": 1.904,
"grad_norm": 0.253896027803421,
"learning_rate": 0.0002997264610584423,
"loss": 46.5968,
"step": 11900
},
{
"epoch": 1.92,
"grad_norm": 0.18806882202625275,
"learning_rate": 0.00029972406096243846,
"loss": 48.2712,
"step": 12000
},
{
"epoch": 1.936,
"grad_norm": 0.22023610770702362,
"learning_rate": 0.00029972166086643463,
"loss": 47.2612,
"step": 12100
},
{
"epoch": 1.952,
"grad_norm": 0.213795468211174,
"learning_rate": 0.0002997192607704308,
"loss": 45.9592,
"step": 12200
},
{
"epoch": 1.968,
"grad_norm": 0.19787845015525818,
"learning_rate": 0.00029971686067442697,
"loss": 47.5647,
"step": 12300
},
{
"epoch": 1.984,
"grad_norm": 0.19648146629333496,
"learning_rate": 0.0002997144605784231,
"loss": 46.8397,
"step": 12400
},
{
"epoch": 2.0,
"grad_norm": 0.1904546618461609,
"learning_rate": 0.00029971206048241925,
"loss": 46.2783,
"step": 12500
},
{
"epoch": 2.016,
"grad_norm": 0.23515231907367706,
"learning_rate": 0.0002997096603864154,
"loss": 46.5475,
"step": 12600
},
{
"epoch": 2.032,
"grad_norm": 0.21483579277992249,
"learning_rate": 0.0002997072602904116,
"loss": 44.2442,
"step": 12700
},
{
"epoch": 2.048,
"grad_norm": 0.2563657760620117,
"learning_rate": 0.00029970486019440776,
"loss": 46.1955,
"step": 12800
},
{
"epoch": 2.064,
"grad_norm": 0.20812326669692993,
"learning_rate": 0.00029970246009840387,
"loss": 45.5704,
"step": 12900
},
{
"epoch": 2.08,
"grad_norm": 0.2190365344285965,
"learning_rate": 0.00029970006000240004,
"loss": 45.7909,
"step": 13000
},
{
"epoch": 2.096,
"grad_norm": 0.2379041463136673,
"learning_rate": 0.0002996976599063962,
"loss": 46.2324,
"step": 13100
},
{
"epoch": 2.112,
"grad_norm": 0.2170909345149994,
"learning_rate": 0.0002996952598103924,
"loss": 44.766,
"step": 13200
},
{
"epoch": 2.128,
"grad_norm": 0.15927261114120483,
"learning_rate": 0.00029969285971438855,
"loss": 43.669,
"step": 13300
},
{
"epoch": 2.144,
"grad_norm": 0.22271278500556946,
"learning_rate": 0.0002996904596183847,
"loss": 45.0739,
"step": 13400
},
{
"epoch": 2.16,
"grad_norm": 0.17792785167694092,
"learning_rate": 0.0002996880595223809,
"loss": 43.8963,
"step": 13500
},
{
"epoch": 2.176,
"grad_norm": 0.28457048535346985,
"learning_rate": 0.00029968565942637706,
"loss": 44.6317,
"step": 13600
},
{
"epoch": 2.192,
"grad_norm": 0.19491800665855408,
"learning_rate": 0.0002996832593303732,
"loss": 43.8541,
"step": 13700
},
{
"epoch": 2.208,
"grad_norm": 0.21633195877075195,
"learning_rate": 0.00029968085923436934,
"loss": 43.2844,
"step": 13800
},
{
"epoch": 2.224,
"grad_norm": 0.2146127074956894,
"learning_rate": 0.0002996784591383655,
"loss": 45.0415,
"step": 13900
},
{
"epoch": 2.24,
"grad_norm": 0.2204289436340332,
"learning_rate": 0.0002996760590423617,
"loss": 44.2757,
"step": 14000
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.3051868677139282,
"learning_rate": 0.00029967365894635785,
"loss": 42.7227,
"step": 14100
},
{
"epoch": 2.2720000000000002,
"grad_norm": 0.23641665279865265,
"learning_rate": 0.000299671258850354,
"loss": 44.0578,
"step": 14200
},
{
"epoch": 2.288,
"grad_norm": 0.18554934859275818,
"learning_rate": 0.0002996688587543502,
"loss": 42.5159,
"step": 14300
},
{
"epoch": 2.304,
"grad_norm": 0.24741467833518982,
"learning_rate": 0.0002996664586583463,
"loss": 42.9106,
"step": 14400
},
{
"epoch": 2.32,
"grad_norm": 0.18483412265777588,
"learning_rate": 0.00029966405856234247,
"loss": 42.2459,
"step": 14500
},
{
"epoch": 2.336,
"grad_norm": 0.24359823763370514,
"learning_rate": 0.00029966165846633864,
"loss": 42.6733,
"step": 14600
},
{
"epoch": 2.352,
"grad_norm": 0.20456752181053162,
"learning_rate": 0.0002996592583703348,
"loss": 41.5754,
"step": 14700
},
{
"epoch": 2.368,
"grad_norm": 0.24165822565555573,
"learning_rate": 0.000299656858274331,
"loss": 43.6988,
"step": 14800
},
{
"epoch": 2.384,
"grad_norm": 0.20422741770744324,
"learning_rate": 0.0002996544581783271,
"loss": 41.9116,
"step": 14900
},
{
"epoch": 2.4,
"grad_norm": 0.2413185089826584,
"learning_rate": 0.00029965205808232326,
"loss": 41.8573,
"step": 15000
},
{
"epoch": 2.416,
"grad_norm": 0.20443005859851837,
"learning_rate": 0.00029964968198727945,
"loss": 42.3368,
"step": 15100
},
{
"epoch": 2.432,
"grad_norm": 0.21270470321178436,
"learning_rate": 0.0002996472818912756,
"loss": 40.336,
"step": 15200
},
{
"epoch": 2.448,
"grad_norm": 0.21689313650131226,
"learning_rate": 0.0002996448817952718,
"loss": 40.5125,
"step": 15300
},
{
"epoch": 2.464,
"grad_norm": 0.25577059388160706,
"learning_rate": 0.00029964248169926796,
"loss": 40.5761,
"step": 15400
},
{
"epoch": 2.48,
"grad_norm": 0.2624509930610657,
"learning_rate": 0.0002996400816032641,
"loss": 40.3047,
"step": 15500
},
{
"epoch": 2.496,
"grad_norm": 0.225455641746521,
"learning_rate": 0.00029963768150726024,
"loss": 40.3576,
"step": 15600
},
{
"epoch": 2.512,
"grad_norm": 0.18313691020011902,
"learning_rate": 0.0002996352814112564,
"loss": 41.113,
"step": 15700
},
{
"epoch": 2.528,
"grad_norm": 0.21272344887256622,
"learning_rate": 0.0002996328813152526,
"loss": 41.2563,
"step": 15800
},
{
"epoch": 2.544,
"grad_norm": 0.23525486886501312,
"learning_rate": 0.00029963048121924875,
"loss": 41.2227,
"step": 15900
},
{
"epoch": 2.56,
"grad_norm": 0.226985365152359,
"learning_rate": 0.00029962808112324487,
"loss": 40.6251,
"step": 16000
},
{
"epoch": 2.576,
"grad_norm": 0.20422585308551788,
"learning_rate": 0.00029962568102724103,
"loss": 40.6449,
"step": 16100
},
{
"epoch": 2.592,
"grad_norm": 0.18906068801879883,
"learning_rate": 0.0002996232809312372,
"loss": 39.5927,
"step": 16200
},
{
"epoch": 2.608,
"grad_norm": 0.21180450916290283,
"learning_rate": 0.0002996208808352334,
"loss": 39.7467,
"step": 16300
},
{
"epoch": 2.624,
"grad_norm": 0.2399897575378418,
"learning_rate": 0.00029961848073922954,
"loss": 38.9522,
"step": 16400
},
{
"epoch": 2.64,
"grad_norm": 0.1941596120595932,
"learning_rate": 0.0002996160806432257,
"loss": 39.5798,
"step": 16500
},
{
"epoch": 2.656,
"grad_norm": 0.19715790450572968,
"learning_rate": 0.0002996136805472218,
"loss": 39.9061,
"step": 16600
},
{
"epoch": 2.672,
"grad_norm": 0.22090336680412292,
"learning_rate": 0.00029961128045121805,
"loss": 39.6083,
"step": 16700
},
{
"epoch": 2.6879999999999997,
"grad_norm": 0.26035964488983154,
"learning_rate": 0.00029960890435617424,
"loss": 39.3414,
"step": 16800
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.21888568997383118,
"learning_rate": 0.00029960650426017035,
"loss": 38.3817,
"step": 16900
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.29924601316452026,
"learning_rate": 0.0002996041041641665,
"loss": 38.3896,
"step": 17000
},
{
"epoch": 2.7359999999999998,
"grad_norm": 0.20395514369010925,
"learning_rate": 0.0002996017040681627,
"loss": 38.8915,
"step": 17100
},
{
"epoch": 2.752,
"grad_norm": 0.20730023086071014,
"learning_rate": 0.00029959930397215886,
"loss": 38.9281,
"step": 17200
},
{
"epoch": 2.768,
"grad_norm": 0.23472309112548828,
"learning_rate": 0.00029959690387615503,
"loss": 39.371,
"step": 17300
},
{
"epoch": 2.784,
"grad_norm": 0.2272721529006958,
"learning_rate": 0.0002995945037801512,
"loss": 38.7238,
"step": 17400
},
{
"epoch": 2.8,
"grad_norm": 0.20280113816261292,
"learning_rate": 0.0002995921036841473,
"loss": 38.1639,
"step": 17500
},
{
"epoch": 2.816,
"grad_norm": 0.21985846757888794,
"learning_rate": 0.0002995897035881435,
"loss": 38.2459,
"step": 17600
},
{
"epoch": 2.832,
"grad_norm": 0.22791948914527893,
"learning_rate": 0.00029958730349213965,
"loss": 38.365,
"step": 17700
},
{
"epoch": 2.848,
"grad_norm": 0.218161940574646,
"learning_rate": 0.0002995849033961358,
"loss": 37.7998,
"step": 17800
},
{
"epoch": 2.864,
"grad_norm": 0.23389916121959686,
"learning_rate": 0.000299582503300132,
"loss": 38.0078,
"step": 17900
},
{
"epoch": 2.88,
"grad_norm": 0.20153094828128815,
"learning_rate": 0.0002995801032041281,
"loss": 37.1053,
"step": 18000
},
{
"epoch": 2.896,
"grad_norm": 0.231399804353714,
"learning_rate": 0.0002995777031081243,
"loss": 37.6589,
"step": 18100
},
{
"epoch": 2.912,
"grad_norm": 0.19814245402812958,
"learning_rate": 0.00029957530301212044,
"loss": 36.8171,
"step": 18200
},
{
"epoch": 2.928,
"grad_norm": 0.22390811145305634,
"learning_rate": 0.0002995729029161166,
"loss": 36.6616,
"step": 18300
},
{
"epoch": 2.944,
"grad_norm": 0.19958479702472687,
"learning_rate": 0.0002995705028201128,
"loss": 36.0232,
"step": 18400
},
{
"epoch": 2.96,
"grad_norm": 0.1972126066684723,
"learning_rate": 0.00029956810272410895,
"loss": 36.5331,
"step": 18500
},
{
"epoch": 2.976,
"grad_norm": 0.18196193873882294,
"learning_rate": 0.00029956570262810507,
"loss": 36.8888,
"step": 18600
},
{
"epoch": 2.992,
"grad_norm": 0.17047256231307983,
"learning_rate": 0.00029956330253210124,
"loss": 36.5987,
"step": 18700
},
{
"epoch": 3.008,
"grad_norm": 0.22138766944408417,
"learning_rate": 0.0002995609024360974,
"loss": 36.2777,
"step": 18800
},
{
"epoch": 3.024,
"grad_norm": 0.22713051736354828,
"learning_rate": 0.0002995585023400936,
"loss": 35.768,
"step": 18900
},
{
"epoch": 3.04,
"grad_norm": 0.1997511237859726,
"learning_rate": 0.00029955610224408974,
"loss": 35.872,
"step": 19000
},
{
"epoch": 3.056,
"grad_norm": 0.19796296954154968,
"learning_rate": 0.00029955370214808586,
"loss": 34.8971,
"step": 19100
},
{
"epoch": 3.072,
"grad_norm": 0.1922471821308136,
"learning_rate": 0.00029955130205208203,
"loss": 35.4181,
"step": 19200
},
{
"epoch": 3.088,
"grad_norm": 0.18493038415908813,
"learning_rate": 0.0002995489019560782,
"loss": 36.3712,
"step": 19300
},
{
"epoch": 3.104,
"grad_norm": 0.22148194909095764,
"learning_rate": 0.00029954650186007437,
"loss": 34.5266,
"step": 19400
},
{
"epoch": 3.12,
"grad_norm": 0.19701820611953735,
"learning_rate": 0.00029954410176407054,
"loss": 35.2642,
"step": 19500
},
{
"epoch": 3.136,
"grad_norm": 0.1763058602809906,
"learning_rate": 0.0002995417016680667,
"loss": 36.1582,
"step": 19600
},
{
"epoch": 3.152,
"grad_norm": 0.2792583107948303,
"learning_rate": 0.0002995393015720628,
"loss": 34.755,
"step": 19700
},
{
"epoch": 3.168,
"grad_norm": 0.20418234169483185,
"learning_rate": 0.00029953690147605904,
"loss": 34.5373,
"step": 19800
},
{
"epoch": 3.184,
"grad_norm": 0.24839259684085846,
"learning_rate": 0.0002995345013800552,
"loss": 34.5007,
"step": 19900
},
{
"epoch": 3.2,
"grad_norm": 0.22200001776218414,
"learning_rate": 0.00029953210128405133,
"loss": 34.8183,
"step": 20000
},
{
"epoch": 3.216,
"grad_norm": 0.2371726781129837,
"learning_rate": 0.0002995297011880475,
"loss": 34.0164,
"step": 20100
},
{
"epoch": 3.232,
"grad_norm": 0.21370230615139008,
"learning_rate": 0.00029952730109204367,
"loss": 34.8268,
"step": 20200
},
{
"epoch": 3.248,
"grad_norm": 0.20940592885017395,
"learning_rate": 0.00029952490099603983,
"loss": 33.8475,
"step": 20300
},
{
"epoch": 3.2640000000000002,
"grad_norm": 0.18580414354801178,
"learning_rate": 0.000299522500900036,
"loss": 33.8718,
"step": 20400
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.2200319468975067,
"learning_rate": 0.0002995201008040322,
"loss": 33.9083,
"step": 20500
},
{
"epoch": 3.296,
"grad_norm": 0.18141067028045654,
"learning_rate": 0.0002995177007080283,
"loss": 33.2878,
"step": 20600
},
{
"epoch": 3.312,
"grad_norm": 0.24104055762290955,
"learning_rate": 0.00029951530061202446,
"loss": 34.4549,
"step": 20700
},
{
"epoch": 3.328,
"grad_norm": 0.22455894947052002,
"learning_rate": 0.0002995129005160206,
"loss": 33.2184,
"step": 20800
},
{
"epoch": 3.344,
"grad_norm": 0.19662746787071228,
"learning_rate": 0.0002995105244209768,
"loss": 33.836,
"step": 20900
},
{
"epoch": 3.36,
"grad_norm": 0.2322922796010971,
"learning_rate": 0.000299508124324973,
"loss": 33.1089,
"step": 21000
},
{
"epoch": 3.376,
"grad_norm": 0.2140241116285324,
"learning_rate": 0.0002995057482299292,
"loss": 32.8205,
"step": 21100
},
{
"epoch": 3.392,
"grad_norm": 0.19320878386497498,
"learning_rate": 0.00029950334813392534,
"loss": 32.8251,
"step": 21200
},
{
"epoch": 3.408,
"grad_norm": 0.18298691511154175,
"learning_rate": 0.0002995009480379215,
"loss": 33.2469,
"step": 21300
},
{
"epoch": 3.424,
"grad_norm": 0.22385163605213165,
"learning_rate": 0.0002994985479419177,
"loss": 32.4997,
"step": 21400
},
{
"epoch": 3.44,
"grad_norm": 0.2047736793756485,
"learning_rate": 0.0002994961478459138,
"loss": 33.5516,
"step": 21500
},
{
"epoch": 3.456,
"grad_norm": 0.242600679397583,
"learning_rate": 0.00029949374774990996,
"loss": 33.4754,
"step": 21600
},
{
"epoch": 3.472,
"grad_norm": 0.21438950300216675,
"learning_rate": 0.00029949134765390613,
"loss": 33.2636,
"step": 21700
},
{
"epoch": 3.488,
"grad_norm": 0.16991284489631653,
"learning_rate": 0.0002994889475579023,
"loss": 32.2435,
"step": 21800
},
{
"epoch": 3.504,
"grad_norm": 0.21854659914970398,
"learning_rate": 0.00029948654746189847,
"loss": 32.986,
"step": 21900
},
{
"epoch": 3.52,
"grad_norm": 0.22860901057720184,
"learning_rate": 0.0002994841473658946,
"loss": 32.1887,
"step": 22000
},
{
"epoch": 3.536,
"grad_norm": 0.20433278381824493,
"learning_rate": 0.00029948174726989076,
"loss": 32.1502,
"step": 22100
},
{
"epoch": 3.552,
"grad_norm": 0.19475246965885162,
"learning_rate": 0.0002994793471738869,
"loss": 32.0844,
"step": 22200
},
{
"epoch": 3.568,
"grad_norm": 0.20006608963012695,
"learning_rate": 0.0002994769470778831,
"loss": 32.5956,
"step": 22300
},
{
"epoch": 3.584,
"grad_norm": 0.17535006999969482,
"learning_rate": 0.00029947454698187926,
"loss": 32.1812,
"step": 22400
},
{
"epoch": 3.6,
"grad_norm": 0.22252418100833893,
"learning_rate": 0.00029947214688587543,
"loss": 30.6041,
"step": 22500
},
{
"epoch": 3.616,
"grad_norm": 0.18110983073711395,
"learning_rate": 0.00029946974678987155,
"loss": 31.7236,
"step": 22600
},
{
"epoch": 3.632,
"grad_norm": 0.227754145860672,
"learning_rate": 0.0002994673466938677,
"loss": 31.2323,
"step": 22700
},
{
"epoch": 3.648,
"grad_norm": 0.19320198893547058,
"learning_rate": 0.0002994649465978639,
"loss": 31.4608,
"step": 22800
},
{
"epoch": 3.664,
"grad_norm": 0.17932754755020142,
"learning_rate": 0.00029946254650186006,
"loss": 31.9613,
"step": 22900
},
{
"epoch": 3.68,
"grad_norm": 0.19677236676216125,
"learning_rate": 0.0002994601464058562,
"loss": 30.9284,
"step": 23000
},
{
"epoch": 3.6959999999999997,
"grad_norm": 0.22562915086746216,
"learning_rate": 0.00029945774630985234,
"loss": 30.7692,
"step": 23100
},
{
"epoch": 3.7119999999999997,
"grad_norm": 0.19202880561351776,
"learning_rate": 0.0002994553462138485,
"loss": 31.2991,
"step": 23200
},
{
"epoch": 3.7279999999999998,
"grad_norm": 0.22251880168914795,
"learning_rate": 0.0002994529461178447,
"loss": 29.574,
"step": 23300
},
{
"epoch": 3.7439999999999998,
"grad_norm": 0.18705110251903534,
"learning_rate": 0.00029945054602184085,
"loss": 30.2693,
"step": 23400
},
{
"epoch": 3.76,
"grad_norm": 0.18061533570289612,
"learning_rate": 0.000299448145925837,
"loss": 30.0086,
"step": 23500
},
{
"epoch": 3.776,
"grad_norm": 0.23449186980724335,
"learning_rate": 0.0002994457458298332,
"loss": 29.9262,
"step": 23600
},
{
"epoch": 3.792,
"grad_norm": 0.20259559154510498,
"learning_rate": 0.0002994433457338293,
"loss": 30.0139,
"step": 23700
},
{
"epoch": 3.808,
"grad_norm": 0.21019335091114044,
"learning_rate": 0.00029944094563782547,
"loss": 30.853,
"step": 23800
},
{
"epoch": 3.824,
"grad_norm": 0.17927643656730652,
"learning_rate": 0.00029943854554182164,
"loss": 30.7392,
"step": 23900
},
{
"epoch": 3.84,
"grad_norm": 0.18862564861774445,
"learning_rate": 0.0002994361454458178,
"loss": 29.3096,
"step": 24000
},
{
"epoch": 3.856,
"grad_norm": 0.22294782102108002,
"learning_rate": 0.000299433745349814,
"loss": 30.2642,
"step": 24100
},
{
"epoch": 3.872,
"grad_norm": 0.20843671262264252,
"learning_rate": 0.0002994313452538101,
"loss": 29.4115,
"step": 24200
},
{
"epoch": 3.888,
"grad_norm": 0.19081708788871765,
"learning_rate": 0.00029942894515780626,
"loss": 30.0382,
"step": 24300
},
{
"epoch": 3.904,
"grad_norm": 0.18849343061447144,
"learning_rate": 0.00029942654506180243,
"loss": 29.6371,
"step": 24400
},
{
"epoch": 3.92,
"grad_norm": 0.2084178924560547,
"learning_rate": 0.0002994241449657986,
"loss": 29.5353,
"step": 24500
},
{
"epoch": 3.936,
"grad_norm": 0.179380401968956,
"learning_rate": 0.00029942174486979477,
"loss": 29.1119,
"step": 24600
},
{
"epoch": 3.952,
"grad_norm": 0.2312467098236084,
"learning_rate": 0.00029941934477379094,
"loss": 29.3352,
"step": 24700
},
{
"epoch": 3.968,
"grad_norm": 0.19268761575222015,
"learning_rate": 0.00029941694467778705,
"loss": 29.1584,
"step": 24800
},
{
"epoch": 3.984,
"grad_norm": 0.19523601233959198,
"learning_rate": 0.0002994145445817832,
"loss": 29.3122,
"step": 24900
},
{
"epoch": 4.0,
"grad_norm": 0.18007320165634155,
"learning_rate": 0.0002994121444857794,
"loss": 29.1468,
"step": 25000
},
{
"epoch": 4.016,
"grad_norm": 0.19717352092266083,
"learning_rate": 0.00029940974438977556,
"loss": 29.2291,
"step": 25100
},
{
"epoch": 4.032,
"grad_norm": 0.18931248784065247,
"learning_rate": 0.00029940736829473175,
"loss": 28.4476,
"step": 25200
},
{
"epoch": 4.048,
"grad_norm": 0.17574016749858856,
"learning_rate": 0.0002994049681987279,
"loss": 27.6189,
"step": 25300
},
{
"epoch": 4.064,
"grad_norm": 0.19395378232002258,
"learning_rate": 0.0002994025681027241,
"loss": 28.3701,
"step": 25400
},
{
"epoch": 4.08,
"grad_norm": 0.1916889250278473,
"learning_rate": 0.00029940016800672026,
"loss": 28.3605,
"step": 25500
},
{
"epoch": 4.096,
"grad_norm": 0.229524627327919,
"learning_rate": 0.0002993977679107164,
"loss": 27.7045,
"step": 25600
},
{
"epoch": 4.112,
"grad_norm": 0.191976860165596,
"learning_rate": 0.00029939536781471254,
"loss": 27.6015,
"step": 25700
},
{
"epoch": 4.128,
"grad_norm": 0.20611730217933655,
"learning_rate": 0.0002993929917196688,
"loss": 27.3844,
"step": 25800
},
{
"epoch": 4.144,
"grad_norm": 0.21954050660133362,
"learning_rate": 0.00029939059162366495,
"loss": 27.6474,
"step": 25900
},
{
"epoch": 4.16,
"grad_norm": 0.23369371891021729,
"learning_rate": 0.00029938819152766107,
"loss": 27.0846,
"step": 26000
},
{
"epoch": 4.176,
"grad_norm": 0.19088931381702423,
"learning_rate": 0.00029938579143165724,
"loss": 27.0919,
"step": 26100
},
{
"epoch": 4.192,
"grad_norm": 0.16385389864444733,
"learning_rate": 0.0002993833913356534,
"loss": 26.7928,
"step": 26200
},
{
"epoch": 4.208,
"grad_norm": 0.22816230356693268,
"learning_rate": 0.0002993809912396496,
"loss": 26.597,
"step": 26300
},
{
"epoch": 4.224,
"grad_norm": 0.22640523314476013,
"learning_rate": 0.00029937859114364574,
"loss": 26.6011,
"step": 26400
},
{
"epoch": 4.24,
"grad_norm": 0.18119996786117554,
"learning_rate": 0.0002993761910476419,
"loss": 26.8414,
"step": 26500
},
{
"epoch": 4.256,
"grad_norm": 0.2026926428079605,
"learning_rate": 0.00029937379095163803,
"loss": 26.9172,
"step": 26600
},
{
"epoch": 4.272,
"grad_norm": 0.20275373756885529,
"learning_rate": 0.0002993713908556342,
"loss": 26.6568,
"step": 26700
},
{
"epoch": 4.288,
"grad_norm": 0.2261670082807541,
"learning_rate": 0.00029936899075963037,
"loss": 27.1839,
"step": 26800
},
{
"epoch": 4.304,
"grad_norm": 0.18411505222320557,
"learning_rate": 0.00029936659066362654,
"loss": 26.4785,
"step": 26900
},
{
"epoch": 4.32,
"grad_norm": 0.2916317582130432,
"learning_rate": 0.0002993641905676227,
"loss": 26.5309,
"step": 27000
},
{
"epoch": 4.336,
"grad_norm": 0.18537244200706482,
"learning_rate": 0.0002993617904716188,
"loss": 27.1665,
"step": 27100
},
{
"epoch": 4.352,
"grad_norm": 0.16285920143127441,
"learning_rate": 0.000299359390375615,
"loss": 27.2424,
"step": 27200
},
{
"epoch": 4.368,
"grad_norm": 0.15773992240428925,
"learning_rate": 0.00029935699027961116,
"loss": 26.5359,
"step": 27300
},
{
"epoch": 4.384,
"grad_norm": 0.18703384697437286,
"learning_rate": 0.00029935459018360733,
"loss": 27.342,
"step": 27400
},
{
"epoch": 4.4,
"grad_norm": 0.18335498869419098,
"learning_rate": 0.0002993521900876035,
"loss": 27.0257,
"step": 27500
},
{
"epoch": 4.416,
"grad_norm": 0.19414934515953064,
"learning_rate": 0.00029934978999159967,
"loss": 26.2998,
"step": 27600
},
{
"epoch": 4.432,
"grad_norm": 0.20599210262298584,
"learning_rate": 0.0002993473898955958,
"loss": 25.9369,
"step": 27700
},
{
"epoch": 4.448,
"grad_norm": 0.27044299244880676,
"learning_rate": 0.00029934498979959195,
"loss": 26.4132,
"step": 27800
},
{
"epoch": 4.464,
"grad_norm": 0.22304300963878632,
"learning_rate": 0.0002993425897035881,
"loss": 26.2685,
"step": 27900
},
{
"epoch": 4.48,
"grad_norm": 0.20784711837768555,
"learning_rate": 0.0002993401896075843,
"loss": 25.336,
"step": 28000
},
{
"epoch": 4.496,
"grad_norm": 0.2017608880996704,
"learning_rate": 0.00029933778951158046,
"loss": 26.1331,
"step": 28100
},
{
"epoch": 4.5120000000000005,
"grad_norm": 0.18563418090343475,
"learning_rate": 0.0002993353894155766,
"loss": 25.6813,
"step": 28200
},
{
"epoch": 4.5280000000000005,
"grad_norm": 0.21515151858329773,
"learning_rate": 0.00029933298931957274,
"loss": 26.2951,
"step": 28300
},
{
"epoch": 4.5440000000000005,
"grad_norm": 0.20512834191322327,
"learning_rate": 0.0002993305892235689,
"loss": 25.2256,
"step": 28400
},
{
"epoch": 4.5600000000000005,
"grad_norm": 0.23129431903362274,
"learning_rate": 0.0002993281891275651,
"loss": 25.7071,
"step": 28500
},
{
"epoch": 4.576,
"grad_norm": 0.18308007717132568,
"learning_rate": 0.00029932578903156125,
"loss": 25.5192,
"step": 28600
},
{
"epoch": 4.592,
"grad_norm": 0.217178076505661,
"learning_rate": 0.0002993233889355574,
"loss": 25.349,
"step": 28700
},
{
"epoch": 4.608,
"grad_norm": 0.18590569496154785,
"learning_rate": 0.00029932098883955353,
"loss": 25.2593,
"step": 28800
},
{
"epoch": 4.624,
"grad_norm": 0.20052315294742584,
"learning_rate": 0.0002993185887435497,
"loss": 24.8334,
"step": 28900
},
{
"epoch": 4.64,
"grad_norm": 0.21725590527057648,
"learning_rate": 0.00029931621264850595,
"loss": 24.6134,
"step": 29000
},
{
"epoch": 4.656,
"grad_norm": 0.23973499238491058,
"learning_rate": 0.00029931381255250206,
"loss": 24.8209,
"step": 29100
},
{
"epoch": 4.672,
"grad_norm": 0.20804470777511597,
"learning_rate": 0.00029931141245649823,
"loss": 25.0912,
"step": 29200
},
{
"epoch": 4.688,
"grad_norm": 0.17555804550647736,
"learning_rate": 0.0002993090363614544,
"loss": 25.1723,
"step": 29300
},
{
"epoch": 4.704,
"grad_norm": 0.17459039390087128,
"learning_rate": 0.0002993066362654506,
"loss": 24.5282,
"step": 29400
},
{
"epoch": 4.72,
"grad_norm": 0.211078941822052,
"learning_rate": 0.00029930423616944676,
"loss": 24.6043,
"step": 29500
},
{
"epoch": 4.736,
"grad_norm": 0.16957704722881317,
"learning_rate": 0.0002993018360734429,
"loss": 24.7947,
"step": 29600
},
{
"epoch": 4.752,
"grad_norm": 0.2855212092399597,
"learning_rate": 0.00029929943597743904,
"loss": 24.5785,
"step": 29700
},
{
"epoch": 4.768,
"grad_norm": 0.19777260720729828,
"learning_rate": 0.0002992970358814352,
"loss": 24.4989,
"step": 29800
},
{
"epoch": 4.784,
"grad_norm": 0.17237554490566254,
"learning_rate": 0.0002992946357854314,
"loss": 24.6684,
"step": 29900
},
{
"epoch": 4.8,
"grad_norm": 0.1824658066034317,
"learning_rate": 0.00029929223568942755,
"loss": 24.934,
"step": 30000
},
{
"epoch": 4.816,
"grad_norm": 0.19774967432022095,
"learning_rate": 0.0002992898355934237,
"loss": 24.4343,
"step": 30100
},
{
"epoch": 4.832,
"grad_norm": 0.2127138376235962,
"learning_rate": 0.00029928743549741983,
"loss": 24.7444,
"step": 30200
},
{
"epoch": 4.848,
"grad_norm": 0.21794643998146057,
"learning_rate": 0.000299285035401416,
"loss": 25.2811,
"step": 30300
},
{
"epoch": 4.864,
"grad_norm": 0.178062304854393,
"learning_rate": 0.00029928263530541217,
"loss": 24.9453,
"step": 30400
},
{
"epoch": 4.88,
"grad_norm": 0.22796912491321564,
"learning_rate": 0.00029928023520940834,
"loss": 23.9367,
"step": 30500
},
{
"epoch": 4.896,
"grad_norm": 0.18951456248760223,
"learning_rate": 0.0002992778351134045,
"loss": 23.7658,
"step": 30600
},
{
"epoch": 4.912,
"grad_norm": 0.24202126264572144,
"learning_rate": 0.0002992754350174007,
"loss": 23.9004,
"step": 30700
},
{
"epoch": 4.928,
"grad_norm": 0.19269002974033356,
"learning_rate": 0.0002992730349213968,
"loss": 23.2493,
"step": 30800
},
{
"epoch": 4.944,
"grad_norm": 0.1657482087612152,
"learning_rate": 0.00029927063482539296,
"loss": 23.8883,
"step": 30900
},
{
"epoch": 4.96,
"grad_norm": 0.151734858751297,
"learning_rate": 0.00029926823472938913,
"loss": 23.7884,
"step": 31000
},
{
"epoch": 4.976,
"grad_norm": 0.2854020595550537,
"learning_rate": 0.0002992658346333853,
"loss": 24.1054,
"step": 31100
},
{
"epoch": 4.992,
"grad_norm": 0.17750577628612518,
"learning_rate": 0.00029926343453738147,
"loss": 23.6583,
"step": 31200
},
{
"epoch": 5.008,
"grad_norm": 0.17882367968559265,
"learning_rate": 0.00029926103444137764,
"loss": 23.4828,
"step": 31300
},
{
"epoch": 5.024,
"grad_norm": 0.17182889580726624,
"learning_rate": 0.0002992586343453738,
"loss": 22.8774,
"step": 31400
},
{
"epoch": 5.04,
"grad_norm": 0.20355378091335297,
"learning_rate": 0.00029925623424937,
"loss": 23.3064,
"step": 31500
},
{
"epoch": 5.056,
"grad_norm": 0.21614141762256622,
"learning_rate": 0.00029925383415336615,
"loss": 22.8978,
"step": 31600
},
{
"epoch": 5.072,
"grad_norm": 0.20654118061065674,
"learning_rate": 0.00029925143405736226,
"loss": 24.0182,
"step": 31700
},
{
"epoch": 5.088,
"grad_norm": 0.17882691323757172,
"learning_rate": 0.00029924903396135843,
"loss": 22.8556,
"step": 31800
},
{
"epoch": 5.104,
"grad_norm": 0.16477125883102417,
"learning_rate": 0.0002992466338653546,
"loss": 22.63,
"step": 31900
},
{
"epoch": 5.12,
"grad_norm": 0.15241862833499908,
"learning_rate": 0.00029924423376935077,
"loss": 22.9513,
"step": 32000
},
{
"epoch": 5.136,
"grad_norm": 0.17560409009456635,
"learning_rate": 0.00029924183367334694,
"loss": 22.808,
"step": 32100
},
{
"epoch": 5.152,
"grad_norm": 0.18167634308338165,
"learning_rate": 0.00029923943357734305,
"loss": 23.0177,
"step": 32200
},
{
"epoch": 5.168,
"grad_norm": 0.18328386545181274,
"learning_rate": 0.0002992370334813392,
"loss": 22.5144,
"step": 32300
},
{
"epoch": 5.184,
"grad_norm": 0.20202048122882843,
"learning_rate": 0.0002992346333853354,
"loss": 23.1037,
"step": 32400
},
{
"epoch": 5.2,
"grad_norm": 0.20026326179504395,
"learning_rate": 0.00029923223328933156,
"loss": 22.3593,
"step": 32500
},
{
"epoch": 5.216,
"grad_norm": 0.1727285534143448,
"learning_rate": 0.00029922983319332773,
"loss": 22.214,
"step": 32600
},
{
"epoch": 5.232,
"grad_norm": 0.1824960708618164,
"learning_rate": 0.0002992274330973239,
"loss": 22.2179,
"step": 32700
},
{
"epoch": 5.248,
"grad_norm": 0.19371069967746735,
"learning_rate": 0.00029922503300132,
"loss": 22.453,
"step": 32800
},
{
"epoch": 5.264,
"grad_norm": 0.22930407524108887,
"learning_rate": 0.0002992226329053162,
"loss": 22.1665,
"step": 32900
},
{
"epoch": 5.28,
"grad_norm": 0.20372043550014496,
"learning_rate": 0.00029922023280931235,
"loss": 22.1181,
"step": 33000
},
{
"epoch": 5.296,
"grad_norm": 0.20339564979076385,
"learning_rate": 0.0002992178327133085,
"loss": 22.5446,
"step": 33100
},
{
"epoch": 5.312,
"grad_norm": 0.2182660847902298,
"learning_rate": 0.0002992154326173047,
"loss": 22.3062,
"step": 33200
},
{
"epoch": 5.328,
"grad_norm": 0.18666419386863708,
"learning_rate": 0.0002992130325213008,
"loss": 22.0127,
"step": 33300
},
{
"epoch": 5.344,
"grad_norm": 0.2193373292684555,
"learning_rate": 0.000299210632425297,
"loss": 22.1167,
"step": 33400
},
{
"epoch": 5.36,
"grad_norm": 0.19642606377601624,
"learning_rate": 0.00029920823232929315,
"loss": 21.8393,
"step": 33500
},
{
"epoch": 5.376,
"grad_norm": 0.24106252193450928,
"learning_rate": 0.0002992058322332893,
"loss": 21.7386,
"step": 33600
},
{
"epoch": 5.392,
"grad_norm": 0.17611666023731232,
"learning_rate": 0.0002992034321372855,
"loss": 22.1787,
"step": 33700
},
{
"epoch": 5.408,
"grad_norm": 0.23640978336334229,
"learning_rate": 0.00029920103204128165,
"loss": 21.5912,
"step": 33800
},
{
"epoch": 5.424,
"grad_norm": 0.19579695165157318,
"learning_rate": 0.00029919863194527777,
"loss": 22.1147,
"step": 33900
},
{
"epoch": 5.44,
"grad_norm": 0.18251273036003113,
"learning_rate": 0.00029919623184927394,
"loss": 21.8284,
"step": 34000
},
{
"epoch": 5.456,
"grad_norm": 0.2099759876728058,
"learning_rate": 0.0002991938317532701,
"loss": 21.5234,
"step": 34100
},
{
"epoch": 5.4719999999999995,
"grad_norm": 0.21391774713993073,
"learning_rate": 0.0002991914316572663,
"loss": 21.1876,
"step": 34200
},
{
"epoch": 5.4879999999999995,
"grad_norm": 0.17656175792217255,
"learning_rate": 0.00029918903156126244,
"loss": 21.7905,
"step": 34300
},
{
"epoch": 5.504,
"grad_norm": 0.1752483993768692,
"learning_rate": 0.00029918663146525856,
"loss": 20.9481,
"step": 34400
},
{
"epoch": 5.52,
"grad_norm": 0.29879820346832275,
"learning_rate": 0.00029918423136925473,
"loss": 21.2073,
"step": 34500
},
{
"epoch": 5.536,
"grad_norm": 0.1947035789489746,
"learning_rate": 0.0002991818312732509,
"loss": 21.0199,
"step": 34600
},
{
"epoch": 5.552,
"grad_norm": 0.15402550995349884,
"learning_rate": 0.00029917943117724707,
"loss": 21.4862,
"step": 34700
},
{
"epoch": 5.568,
"grad_norm": 0.21479055285453796,
"learning_rate": 0.00029917703108124324,
"loss": 20.3479,
"step": 34800
},
{
"epoch": 5.584,
"grad_norm": 0.15968792140483856,
"learning_rate": 0.0002991746309852394,
"loss": 20.8151,
"step": 34900
},
{
"epoch": 5.6,
"grad_norm": 0.16876402497291565,
"learning_rate": 0.0002991722308892355,
"loss": 21.8482,
"step": 35000
},
{
"epoch": 5.616,
"grad_norm": 0.16191044449806213,
"learning_rate": 0.0002991698307932317,
"loss": 21.4486,
"step": 35100
},
{
"epoch": 5.632,
"grad_norm": 0.20595960319042206,
"learning_rate": 0.00029916743069722786,
"loss": 21.7225,
"step": 35200
},
{
"epoch": 5.648,
"grad_norm": 0.1939288079738617,
"learning_rate": 0.00029916503060122403,
"loss": 21.0107,
"step": 35300
},
{
"epoch": 5.664,
"grad_norm": 0.20212168991565704,
"learning_rate": 0.0002991626305052202,
"loss": 20.4026,
"step": 35400
},
{
"epoch": 5.68,
"grad_norm": 0.1956707388162613,
"learning_rate": 0.0002991602544101764,
"loss": 20.9491,
"step": 35500
},
{
"epoch": 5.696,
"grad_norm": 0.22702528536319733,
"learning_rate": 0.00029915785431417256,
"loss": 21.12,
"step": 35600
},
{
"epoch": 5.712,
"grad_norm": 0.19706673920154572,
"learning_rate": 0.00029915547821912874,
"loss": 21.5166,
"step": 35700
},
{
"epoch": 5.728,
"grad_norm": 0.18108151853084564,
"learning_rate": 0.0002991530781231249,
"loss": 20.4059,
"step": 35800
},
{
"epoch": 5.744,
"grad_norm": 0.1714268922805786,
"learning_rate": 0.00029915067802712103,
"loss": 20.2456,
"step": 35900
},
{
"epoch": 5.76,
"grad_norm": 0.1415804773569107,
"learning_rate": 0.0002991482779311172,
"loss": 20.3176,
"step": 36000
},
{
"epoch": 5.776,
"grad_norm": 0.1928543597459793,
"learning_rate": 0.00029914587783511337,
"loss": 20.797,
"step": 36100
},
{
"epoch": 5.792,
"grad_norm": 0.17042042315006256,
"learning_rate": 0.00029914347773910954,
"loss": 20.2684,
"step": 36200
},
{
"epoch": 5.808,
"grad_norm": 0.1929057389497757,
"learning_rate": 0.0002991410776431057,
"loss": 19.7169,
"step": 36300
},
{
"epoch": 5.824,
"grad_norm": 0.19770380854606628,
"learning_rate": 0.0002991386775471018,
"loss": 20.3972,
"step": 36400
},
{
"epoch": 5.84,
"grad_norm": 0.19927264750003815,
"learning_rate": 0.000299136277451098,
"loss": 20.3105,
"step": 36500
},
{
"epoch": 5.856,
"grad_norm": 0.2222350686788559,
"learning_rate": 0.00029913387735509416,
"loss": 20.3396,
"step": 36600
},
{
"epoch": 5.872,
"grad_norm": 0.15629681944847107,
"learning_rate": 0.00029913147725909033,
"loss": 19.7281,
"step": 36700
},
{
"epoch": 5.888,
"grad_norm": 0.1714082509279251,
"learning_rate": 0.0002991290771630865,
"loss": 20.2121,
"step": 36800
},
{
"epoch": 5.904,
"grad_norm": 0.19152860343456268,
"learning_rate": 0.00029912667706708267,
"loss": 20.3316,
"step": 36900
},
{
"epoch": 5.92,
"grad_norm": 0.18097779154777527,
"learning_rate": 0.0002991242769710788,
"loss": 19.9225,
"step": 37000
},
{
"epoch": 5.936,
"grad_norm": 0.21503089368343353,
"learning_rate": 0.00029912187687507495,
"loss": 20.3151,
"step": 37100
},
{
"epoch": 5.952,
"grad_norm": 0.16976934671401978,
"learning_rate": 0.0002991194767790711,
"loss": 20.4782,
"step": 37200
},
{
"epoch": 5.968,
"grad_norm": 0.1788826435804367,
"learning_rate": 0.0002991170766830673,
"loss": 19.616,
"step": 37300
},
{
"epoch": 5.984,
"grad_norm": 0.17762643098831177,
"learning_rate": 0.00029911467658706346,
"loss": 19.4074,
"step": 37400
},
{
"epoch": 6.0,
"grad_norm": 0.19231481850147247,
"learning_rate": 0.0002991122764910596,
"loss": 19.3966,
"step": 37500
},
{
"epoch": 6.016,
"grad_norm": 0.2067825198173523,
"learning_rate": 0.0002991098763950558,
"loss": 19.6924,
"step": 37600
},
{
"epoch": 6.032,
"grad_norm": 0.1930302083492279,
"learning_rate": 0.00029910747629905196,
"loss": 19.765,
"step": 37700
},
{
"epoch": 6.048,
"grad_norm": 0.2076890915632248,
"learning_rate": 0.00029910507620304813,
"loss": 19.0516,
"step": 37800
},
{
"epoch": 6.064,
"grad_norm": 0.2006111741065979,
"learning_rate": 0.00029910267610704425,
"loss": 19.1025,
"step": 37900
},
{
"epoch": 6.08,
"grad_norm": 0.1836411952972412,
"learning_rate": 0.0002991002760110404,
"loss": 19.3714,
"step": 38000
},
{
"epoch": 6.096,
"grad_norm": 0.1817934662103653,
"learning_rate": 0.0002990978759150366,
"loss": 19.1752,
"step": 38100
},
{
"epoch": 6.112,
"grad_norm": 0.18150608241558075,
"learning_rate": 0.00029909547581903276,
"loss": 19.5865,
"step": 38200
},
{
"epoch": 6.128,
"grad_norm": 0.3108033835887909,
"learning_rate": 0.0002990930757230289,
"loss": 19.3632,
"step": 38300
},
{
"epoch": 6.144,
"grad_norm": 0.18861189484596252,
"learning_rate": 0.00029909067562702504,
"loss": 19.9617,
"step": 38400
},
{
"epoch": 6.16,
"grad_norm": 0.16909874975681305,
"learning_rate": 0.0002990882755310212,
"loss": 19.8722,
"step": 38500
},
{
"epoch": 6.176,
"grad_norm": 0.16401100158691406,
"learning_rate": 0.0002990858754350174,
"loss": 19.3652,
"step": 38600
},
{
"epoch": 6.192,
"grad_norm": 0.17053301632404327,
"learning_rate": 0.00029908347533901355,
"loss": 19.4264,
"step": 38700
},
{
"epoch": 6.208,
"grad_norm": 0.18607936799526215,
"learning_rate": 0.0002990810752430097,
"loss": 19.3128,
"step": 38800
},
{
"epoch": 6.224,
"grad_norm": 0.2513495087623596,
"learning_rate": 0.0002990786751470059,
"loss": 20.1134,
"step": 38900
},
{
"epoch": 6.24,
"grad_norm": 0.21938976645469666,
"learning_rate": 0.000299076275051002,
"loss": 19.5682,
"step": 39000
},
{
"epoch": 6.256,
"grad_norm": 0.21253296732902527,
"learning_rate": 0.00029907387495499817,
"loss": 18.7325,
"step": 39100
},
{
"epoch": 6.272,
"grad_norm": 0.21298116445541382,
"learning_rate": 0.00029907147485899434,
"loss": 19.0698,
"step": 39200
},
{
"epoch": 6.288,
"grad_norm": 0.17804065346717834,
"learning_rate": 0.0002990690747629905,
"loss": 18.3022,
"step": 39300
},
{
"epoch": 6.304,
"grad_norm": 0.31990084052085876,
"learning_rate": 0.0002990666986679467,
"loss": 18.9093,
"step": 39400
},
{
"epoch": 6.32,
"grad_norm": 0.17742526531219482,
"learning_rate": 0.0002990642985719428,
"loss": 18.6614,
"step": 39500
},
{
"epoch": 6.336,
"grad_norm": 0.20601534843444824,
"learning_rate": 0.000299061898475939,
"loss": 19.6871,
"step": 39600
},
{
"epoch": 6.352,
"grad_norm": 0.16021846234798431,
"learning_rate": 0.00029905949837993515,
"loss": 18.6417,
"step": 39700
},
{
"epoch": 6.368,
"grad_norm": 0.1588086634874344,
"learning_rate": 0.0002990570982839313,
"loss": 18.3146,
"step": 39800
},
{
"epoch": 6.384,
"grad_norm": 0.21372877061367035,
"learning_rate": 0.0002990546981879275,
"loss": 19.0519,
"step": 39900
},
{
"epoch": 6.4,
"grad_norm": 0.18066450953483582,
"learning_rate": 0.00029905229809192366,
"loss": 19.2848,
"step": 40000
},
{
"epoch": 6.416,
"grad_norm": 0.23790153861045837,
"learning_rate": 0.0002990498979959198,
"loss": 18.7495,
"step": 40100
},
{
"epoch": 6.432,
"grad_norm": 0.21764115989208221,
"learning_rate": 0.00029904749789991594,
"loss": 18.5835,
"step": 40200
},
{
"epoch": 6.448,
"grad_norm": 0.18615952134132385,
"learning_rate": 0.0002990450978039121,
"loss": 17.9751,
"step": 40300
},
{
"epoch": 6.464,
"grad_norm": 0.1657874882221222,
"learning_rate": 0.0002990426977079083,
"loss": 18.5635,
"step": 40400
},
{
"epoch": 6.48,
"grad_norm": 0.3158019185066223,
"learning_rate": 0.00029904029761190445,
"loss": 18.6618,
"step": 40500
},
{
"epoch": 6.496,
"grad_norm": 0.2320430427789688,
"learning_rate": 0.0002990378975159006,
"loss": 18.2968,
"step": 40600
},
{
"epoch": 6.5120000000000005,
"grad_norm": 0.20868684351444244,
"learning_rate": 0.0002990354974198968,
"loss": 18.595,
"step": 40700
},
{
"epoch": 6.5280000000000005,
"grad_norm": 0.2185734063386917,
"learning_rate": 0.00029903309732389296,
"loss": 17.9672,
"step": 40800
},
{
"epoch": 6.5440000000000005,
"grad_norm": 0.22871826589107513,
"learning_rate": 0.0002990306972278891,
"loss": 18.0843,
"step": 40900
},
{
"epoch": 6.5600000000000005,
"grad_norm": 0.16801375150680542,
"learning_rate": 0.00029902829713188524,
"loss": 18.138,
"step": 41000
},
{
"epoch": 6.576,
"grad_norm": 0.17401717603206635,
"learning_rate": 0.0002990258970358814,
"loss": 18.7431,
"step": 41100
},
{
"epoch": 6.592,
"grad_norm": 0.17664673924446106,
"learning_rate": 0.0002990234969398776,
"loss": 17.966,
"step": 41200
},
{
"epoch": 6.608,
"grad_norm": 0.2024875283241272,
"learning_rate": 0.00029902109684387375,
"loss": 17.9339,
"step": 41300
},
{
"epoch": 6.624,
"grad_norm": 0.19322896003723145,
"learning_rate": 0.0002990186967478699,
"loss": 18.5554,
"step": 41400
},
{
"epoch": 6.64,
"grad_norm": 0.2797154188156128,
"learning_rate": 0.00029901629665186603,
"loss": 17.5192,
"step": 41500
},
{
"epoch": 6.656,
"grad_norm": 0.2197944074869156,
"learning_rate": 0.0002990138965558622,
"loss": 18.4582,
"step": 41600
},
{
"epoch": 6.672,
"grad_norm": 0.18805234134197235,
"learning_rate": 0.00029901149645985837,
"loss": 17.9245,
"step": 41700
},
{
"epoch": 6.688,
"grad_norm": 0.14986388385295868,
"learning_rate": 0.00029900909636385454,
"loss": 17.7746,
"step": 41800
},
{
"epoch": 6.704,
"grad_norm": 0.26323381066322327,
"learning_rate": 0.0002990066962678507,
"loss": 17.6134,
"step": 41900
},
{
"epoch": 6.72,
"grad_norm": 0.1791141778230667,
"learning_rate": 0.0002990042961718469,
"loss": 17.7648,
"step": 42000
},
{
"epoch": 6.736,
"grad_norm": 0.22629794478416443,
"learning_rate": 0.000299001920076803,
"loss": 18.2337,
"step": 42100
},
{
"epoch": 6.752,
"grad_norm": 0.17983581125736237,
"learning_rate": 0.0002989995199807992,
"loss": 17.4193,
"step": 42200
},
{
"epoch": 6.768,
"grad_norm": 0.17379482090473175,
"learning_rate": 0.00029899711988479535,
"loss": 17.9815,
"step": 42300
},
{
"epoch": 6.784,
"grad_norm": 0.2074684351682663,
"learning_rate": 0.0002989947197887915,
"loss": 17.898,
"step": 42400
},
{
"epoch": 6.8,
"grad_norm": 0.16909289360046387,
"learning_rate": 0.0002989923196927877,
"loss": 17.7292,
"step": 42500
},
{
"epoch": 6.816,
"grad_norm": 0.184371218085289,
"learning_rate": 0.00029898991959678386,
"loss": 18.0706,
"step": 42600
},
{
"epoch": 6.832,
"grad_norm": 0.17724382877349854,
"learning_rate": 0.00029898751950078,
"loss": 17.9871,
"step": 42700
},
{
"epoch": 6.848,
"grad_norm": 0.2286718785762787,
"learning_rate": 0.00029898511940477614,
"loss": 17.5911,
"step": 42800
},
{
"epoch": 6.864,
"grad_norm": 0.2002006471157074,
"learning_rate": 0.0002989827193087723,
"loss": 17.4336,
"step": 42900
},
{
"epoch": 6.88,
"grad_norm": 0.20236457884311676,
"learning_rate": 0.0002989803192127685,
"loss": 17.0849,
"step": 43000
},
{
"epoch": 6.896,
"grad_norm": 0.23483681678771973,
"learning_rate": 0.00029897791911676465,
"loss": 17.7893,
"step": 43100
},
{
"epoch": 6.912,
"grad_norm": 0.18751464784145355,
"learning_rate": 0.00029897551902076077,
"loss": 17.4798,
"step": 43200
},
{
"epoch": 6.928,
"grad_norm": 0.17341011762619019,
"learning_rate": 0.00029897311892475694,
"loss": 17.7278,
"step": 43300
},
{
"epoch": 6.944,
"grad_norm": 0.15160439908504486,
"learning_rate": 0.0002989707188287531,
"loss": 17.4948,
"step": 43400
},
{
"epoch": 6.96,
"grad_norm": 0.19316324591636658,
"learning_rate": 0.0002989683187327493,
"loss": 17.3409,
"step": 43500
},
{
"epoch": 6.976,
"grad_norm": 0.1800646036863327,
"learning_rate": 0.00029896591863674544,
"loss": 17.5152,
"step": 43600
},
{
"epoch": 6.992,
"grad_norm": 0.19359643757343292,
"learning_rate": 0.0002989635185407416,
"loss": 17.2701,
"step": 43700
},
{
"epoch": 7.008,
"grad_norm": 0.21103709936141968,
"learning_rate": 0.0002989611184447378,
"loss": 17.0028,
"step": 43800
},
{
"epoch": 7.024,
"grad_norm": 0.18972234427928925,
"learning_rate": 0.00029895871834873395,
"loss": 16.8714,
"step": 43900
},
{
"epoch": 7.04,
"grad_norm": 0.16335220634937286,
"learning_rate": 0.0002989563182527301,
"loss": 17.1409,
"step": 44000
},
{
"epoch": 7.056,
"grad_norm": 0.16595561802387238,
"learning_rate": 0.00029895391815672624,
"loss": 17.1677,
"step": 44100
},
{
"epoch": 7.072,
"grad_norm": 0.1885690540075302,
"learning_rate": 0.0002989515180607224,
"loss": 17.1327,
"step": 44200
},
{
"epoch": 7.088,
"grad_norm": 0.16525697708129883,
"learning_rate": 0.0002989491179647186,
"loss": 17.0265,
"step": 44300
},
{
"epoch": 7.104,
"grad_norm": 0.17798613011837006,
"learning_rate": 0.00029894671786871474,
"loss": 16.5858,
"step": 44400
},
{
"epoch": 7.12,
"grad_norm": 0.17442761361598969,
"learning_rate": 0.0002989443177727109,
"loss": 16.7029,
"step": 44500
},
{
"epoch": 7.136,
"grad_norm": 0.17014281451702118,
"learning_rate": 0.0002989419176767071,
"loss": 16.3283,
"step": 44600
},
{
"epoch": 7.152,
"grad_norm": 0.21125547587871552,
"learning_rate": 0.0002989395175807032,
"loss": 17.0964,
"step": 44700
},
{
"epoch": 7.168,
"grad_norm": 0.15473531186580658,
"learning_rate": 0.00029893711748469937,
"loss": 17.2634,
"step": 44800
},
{
"epoch": 7.184,
"grad_norm": 0.22423428297042847,
"learning_rate": 0.00029893471738869553,
"loss": 16.6492,
"step": 44900
},
{
"epoch": 7.2,
"grad_norm": 0.23651999235153198,
"learning_rate": 0.0002989323172926917,
"loss": 17.2672,
"step": 45000
},
{
"epoch": 7.216,
"grad_norm": 0.18389280140399933,
"learning_rate": 0.00029892991719668787,
"loss": 16.3061,
"step": 45100
},
{
"epoch": 7.232,
"grad_norm": 0.19786329567432404,
"learning_rate": 0.000298927517100684,
"loss": 16.7178,
"step": 45200
},
{
"epoch": 7.248,
"grad_norm": 0.1748264580965042,
"learning_rate": 0.00029892511700468016,
"loss": 16.8728,
"step": 45300
},
{
"epoch": 7.264,
"grad_norm": 0.17337900400161743,
"learning_rate": 0.0002989227169086763,
"loss": 16.143,
"step": 45400
},
{
"epoch": 7.28,
"grad_norm": 0.1627172827720642,
"learning_rate": 0.0002989203168126725,
"loss": 16.677,
"step": 45500
},
{
"epoch": 7.296,
"grad_norm": 0.18607047200202942,
"learning_rate": 0.00029891791671666866,
"loss": 16.6493,
"step": 45600
},
{
"epoch": 7.312,
"grad_norm": 0.17733363807201385,
"learning_rate": 0.00029891551662066483,
"loss": 16.8518,
"step": 45700
},
{
"epoch": 7.328,
"grad_norm": 0.17257067561149597,
"learning_rate": 0.00029891311652466095,
"loss": 16.7963,
"step": 45800
},
{
"epoch": 7.344,
"grad_norm": 0.22989864647388458,
"learning_rate": 0.0002989107164286571,
"loss": 16.6846,
"step": 45900
},
{
"epoch": 7.36,
"grad_norm": 0.1924850195646286,
"learning_rate": 0.0002989083163326533,
"loss": 16.7258,
"step": 46000
},
{
"epoch": 7.376,
"grad_norm": 0.15162524580955505,
"learning_rate": 0.00029890591623664946,
"loss": 16.0529,
"step": 46100
},
{
"epoch": 7.392,
"grad_norm": 0.19990018010139465,
"learning_rate": 0.00029890354014160564,
"loss": 16.3768,
"step": 46200
},
{
"epoch": 7.408,
"grad_norm": 0.1724652647972107,
"learning_rate": 0.00029890114004560176,
"loss": 17.0495,
"step": 46300
},
{
"epoch": 7.424,
"grad_norm": 0.1920676976442337,
"learning_rate": 0.00029889873994959793,
"loss": 16.1202,
"step": 46400
},
{
"epoch": 7.44,
"grad_norm": 0.1957552433013916,
"learning_rate": 0.00029889636385455417,
"loss": 16.413,
"step": 46500
},
{
"epoch": 7.456,
"grad_norm": 0.14071592688560486,
"learning_rate": 0.00029889396375855034,
"loss": 15.732,
"step": 46600
},
{
"epoch": 7.4719999999999995,
"grad_norm": 0.1833236664533615,
"learning_rate": 0.00029889156366254646,
"loss": 16.7192,
"step": 46700
},
{
"epoch": 7.4879999999999995,
"grad_norm": 0.2189483791589737,
"learning_rate": 0.0002988891635665426,
"loss": 16.0979,
"step": 46800
},
{
"epoch": 7.504,
"grad_norm": 0.17360301315784454,
"learning_rate": 0.0002988867634705388,
"loss": 15.8968,
"step": 46900
},
{
"epoch": 7.52,
"grad_norm": 0.1952562779188156,
"learning_rate": 0.00029888436337453496,
"loss": 15.9731,
"step": 47000
},
{
"epoch": 7.536,
"grad_norm": 0.1601036638021469,
"learning_rate": 0.00029888196327853113,
"loss": 16.392,
"step": 47100
},
{
"epoch": 7.552,
"grad_norm": 0.17277076840400696,
"learning_rate": 0.00029887956318252725,
"loss": 15.9779,
"step": 47200
},
{
"epoch": 7.568,
"grad_norm": 0.1868811696767807,
"learning_rate": 0.0002988771630865234,
"loss": 15.5355,
"step": 47300
},
{
"epoch": 7.584,
"grad_norm": 0.2078930139541626,
"learning_rate": 0.00029887478699147966,
"loss": 15.8833,
"step": 47400
},
{
"epoch": 7.6,
"grad_norm": 0.17647911608219147,
"learning_rate": 0.0002988723868954758,
"loss": 16.0442,
"step": 47500
},
{
"epoch": 7.616,
"grad_norm": 0.20268210768699646,
"learning_rate": 0.00029886998679947194,
"loss": 16.1957,
"step": 47600
},
{
"epoch": 7.632,
"grad_norm": 0.1820913553237915,
"learning_rate": 0.0002988675867034681,
"loss": 15.8208,
"step": 47700
},
{
"epoch": 7.648,
"grad_norm": 0.2001231610774994,
"learning_rate": 0.0002988651866074643,
"loss": 16.1706,
"step": 47800
},
{
"epoch": 7.664,
"grad_norm": 0.18558456003665924,
"learning_rate": 0.00029886278651146045,
"loss": 15.9747,
"step": 47900
},
{
"epoch": 7.68,
"grad_norm": 0.17034992575645447,
"learning_rate": 0.0002988603864154566,
"loss": 16.4537,
"step": 48000
},
{
"epoch": 7.696,
"grad_norm": 0.16974206268787384,
"learning_rate": 0.00029885798631945274,
"loss": 15.5116,
"step": 48100
},
{
"epoch": 7.712,
"grad_norm": 0.1771545112133026,
"learning_rate": 0.0002988555862234489,
"loss": 15.8605,
"step": 48200
},
{
"epoch": 7.728,
"grad_norm": 0.17756806313991547,
"learning_rate": 0.0002988531861274451,
"loss": 15.8965,
"step": 48300
},
{
"epoch": 7.744,
"grad_norm": 0.20773237943649292,
"learning_rate": 0.00029885078603144124,
"loss": 15.1184,
"step": 48400
},
{
"epoch": 7.76,
"grad_norm": 0.18383237719535828,
"learning_rate": 0.0002988483859354374,
"loss": 16.0467,
"step": 48500
},
{
"epoch": 7.776,
"grad_norm": 0.18748898804187775,
"learning_rate": 0.0002988459858394336,
"loss": 15.3286,
"step": 48600
},
{
"epoch": 7.792,
"grad_norm": 0.2877133786678314,
"learning_rate": 0.0002988435857434297,
"loss": 15.8562,
"step": 48700
},
{
"epoch": 7.808,
"grad_norm": 0.168177530169487,
"learning_rate": 0.00029884118564742587,
"loss": 15.8613,
"step": 48800
},
{
"epoch": 7.824,
"grad_norm": 0.18536759912967682,
"learning_rate": 0.00029883878555142203,
"loss": 15.8204,
"step": 48900
},
{
"epoch": 7.84,
"grad_norm": 0.15699341893196106,
"learning_rate": 0.0002988363854554182,
"loss": 15.6026,
"step": 49000
},
{
"epoch": 7.856,
"grad_norm": 0.17730812728405,
"learning_rate": 0.0002988339853594144,
"loss": 15.5268,
"step": 49100
},
{
"epoch": 7.872,
"grad_norm": 0.16140446066856384,
"learning_rate": 0.0002988315852634105,
"loss": 15.3766,
"step": 49200
},
{
"epoch": 7.888,
"grad_norm": 0.16114762425422668,
"learning_rate": 0.00029882918516740666,
"loss": 15.8614,
"step": 49300
},
{
"epoch": 7.904,
"grad_norm": 0.19132892787456512,
"learning_rate": 0.0002988267850714028,
"loss": 15.4026,
"step": 49400
},
{
"epoch": 7.92,
"grad_norm": 0.190206840634346,
"learning_rate": 0.000298824384975399,
"loss": 15.42,
"step": 49500
},
{
"epoch": 7.936,
"grad_norm": 0.18264752626419067,
"learning_rate": 0.00029882198487939516,
"loss": 15.5455,
"step": 49600
},
{
"epoch": 7.952,
"grad_norm": 0.1774350255727768,
"learning_rate": 0.00029881958478339133,
"loss": 15.7328,
"step": 49700
},
{
"epoch": 7.968,
"grad_norm": 0.1655503213405609,
"learning_rate": 0.00029881718468738745,
"loss": 15.5836,
"step": 49800
},
{
"epoch": 7.984,
"grad_norm": 0.18890833854675293,
"learning_rate": 0.0002988147845913836,
"loss": 15.4838,
"step": 49900
},
{
"epoch": 8.0,
"grad_norm": 0.1880652904510498,
"learning_rate": 0.0002988123844953798,
"loss": 15.2114,
"step": 50000
},
{
"epoch": 8.016,
"grad_norm": 0.18285752832889557,
"learning_rate": 0.00029880998439937596,
"loss": 14.9511,
"step": 50100
},
{
"epoch": 8.032,
"grad_norm": 0.19436243176460266,
"learning_rate": 0.0002988075843033721,
"loss": 15.4968,
"step": 50200
},
{
"epoch": 8.048,
"grad_norm": 0.1822815239429474,
"learning_rate": 0.00029880518420736824,
"loss": 14.7632,
"step": 50300
},
{
"epoch": 8.064,
"grad_norm": 0.16189494729042053,
"learning_rate": 0.0002988027841113644,
"loss": 14.937,
"step": 50400
},
{
"epoch": 8.08,
"grad_norm": 0.152993842959404,
"learning_rate": 0.0002988003840153606,
"loss": 14.676,
"step": 50500
},
{
"epoch": 8.096,
"grad_norm": 0.2119678407907486,
"learning_rate": 0.00029879798391935675,
"loss": 15.725,
"step": 50600
},
{
"epoch": 8.112,
"grad_norm": 0.22487041354179382,
"learning_rate": 0.0002987955838233529,
"loss": 15.0505,
"step": 50700
},
{
"epoch": 8.128,
"grad_norm": 0.16072215139865875,
"learning_rate": 0.0002987931837273491,
"loss": 15.4103,
"step": 50800
},
{
"epoch": 8.144,
"grad_norm": 0.16657765209674835,
"learning_rate": 0.0002987907836313452,
"loss": 14.7139,
"step": 50900
},
{
"epoch": 8.16,
"grad_norm": 0.15327660739421844,
"learning_rate": 0.00029878838353534137,
"loss": 14.6325,
"step": 51000
},
{
"epoch": 8.176,
"grad_norm": 0.20472773909568787,
"learning_rate": 0.00029878598343933754,
"loss": 14.7217,
"step": 51100
},
{
"epoch": 8.192,
"grad_norm": 0.214088574051857,
"learning_rate": 0.0002987835833433337,
"loss": 14.121,
"step": 51200
},
{
"epoch": 8.208,
"grad_norm": 0.20903360843658447,
"learning_rate": 0.0002987811832473299,
"loss": 15.1448,
"step": 51300
},
{
"epoch": 8.224,
"grad_norm": 0.20621182024478912,
"learning_rate": 0.000298778783151326,
"loss": 14.7588,
"step": 51400
},
{
"epoch": 8.24,
"grad_norm": 0.18515250086784363,
"learning_rate": 0.00029877638305532216,
"loss": 15.3639,
"step": 51500
},
{
"epoch": 8.256,
"grad_norm": 0.17146657407283783,
"learning_rate": 0.00029877398295931833,
"loss": 14.4964,
"step": 51600
},
{
"epoch": 8.272,
"grad_norm": 0.18953190743923187,
"learning_rate": 0.0002987715828633145,
"loss": 14.5639,
"step": 51700
},
{
"epoch": 8.288,
"grad_norm": 0.17434297502040863,
"learning_rate": 0.0002987692067682707,
"loss": 15.2875,
"step": 51800
},
{
"epoch": 8.304,
"grad_norm": 0.16686853766441345,
"learning_rate": 0.00029876680667226686,
"loss": 14.4679,
"step": 51900
},
{
"epoch": 8.32,
"grad_norm": 0.14394892752170563,
"learning_rate": 0.00029876440657626303,
"loss": 14.5162,
"step": 52000
},
{
"epoch": 8.336,
"grad_norm": 0.20816083252429962,
"learning_rate": 0.0002987620064802592,
"loss": 15.2646,
"step": 52100
},
{
"epoch": 8.352,
"grad_norm": 0.16660048067569733,
"learning_rate": 0.00029875960638425537,
"loss": 15.0214,
"step": 52200
},
{
"epoch": 8.368,
"grad_norm": 0.16948403418064117,
"learning_rate": 0.0002987572062882515,
"loss": 14.7227,
"step": 52300
},
{
"epoch": 8.384,
"grad_norm": 0.15360529720783234,
"learning_rate": 0.00029875480619224765,
"loss": 14.8453,
"step": 52400
},
{
"epoch": 8.4,
"grad_norm": 0.1730951964855194,
"learning_rate": 0.0002987524060962438,
"loss": 14.6784,
"step": 52500
},
{
"epoch": 8.416,
"grad_norm": 0.1714763641357422,
"learning_rate": 0.00029875000600024,
"loss": 14.3347,
"step": 52600
},
{
"epoch": 8.432,
"grad_norm": 0.21991823613643646,
"learning_rate": 0.00029874760590423616,
"loss": 14.7373,
"step": 52700
},
{
"epoch": 8.448,
"grad_norm": 0.26085495948791504,
"learning_rate": 0.00029874520580823233,
"loss": 14.4799,
"step": 52800
},
{
"epoch": 8.464,
"grad_norm": 0.15623599290847778,
"learning_rate": 0.00029874280571222844,
"loss": 14.9737,
"step": 52900
},
{
"epoch": 8.48,
"grad_norm": 0.14685533940792084,
"learning_rate": 0.0002987404056162246,
"loss": 14.4126,
"step": 53000
},
{
"epoch": 8.496,
"grad_norm": 0.19048573076725006,
"learning_rate": 0.0002987380055202208,
"loss": 14.6049,
"step": 53100
},
{
"epoch": 8.512,
"grad_norm": 0.15729829668998718,
"learning_rate": 0.00029873560542421695,
"loss": 14.8894,
"step": 53200
},
{
"epoch": 8.528,
"grad_norm": 0.18257932364940643,
"learning_rate": 0.0002987332053282131,
"loss": 14.3249,
"step": 53300
},
{
"epoch": 8.544,
"grad_norm": 0.20492464303970337,
"learning_rate": 0.00029873080523220923,
"loss": 15.0053,
"step": 53400
},
{
"epoch": 8.56,
"grad_norm": 0.22026245296001434,
"learning_rate": 0.0002987284051362054,
"loss": 14.1141,
"step": 53500
},
{
"epoch": 8.576,
"grad_norm": 0.16078276932239532,
"learning_rate": 0.00029872600504020157,
"loss": 14.3822,
"step": 53600
},
{
"epoch": 8.592,
"grad_norm": 0.19619469344615936,
"learning_rate": 0.00029872360494419774,
"loss": 14.3099,
"step": 53700
},
{
"epoch": 8.608,
"grad_norm": 0.15051692724227905,
"learning_rate": 0.0002987212048481939,
"loss": 13.7999,
"step": 53800
},
{
"epoch": 8.624,
"grad_norm": 0.19525863230228424,
"learning_rate": 0.0002987188047521901,
"loss": 14.3567,
"step": 53900
},
{
"epoch": 8.64,
"grad_norm": 0.16883693635463715,
"learning_rate": 0.0002987164046561862,
"loss": 13.3731,
"step": 54000
},
{
"epoch": 8.656,
"grad_norm": 0.1703290492296219,
"learning_rate": 0.00029871400456018236,
"loss": 13.8462,
"step": 54100
},
{
"epoch": 8.672,
"grad_norm": 0.18907932937145233,
"learning_rate": 0.00029871160446417853,
"loss": 14.5297,
"step": 54200
},
{
"epoch": 8.688,
"grad_norm": 0.16260308027267456,
"learning_rate": 0.0002987092043681747,
"loss": 14.0573,
"step": 54300
},
{
"epoch": 8.704,
"grad_norm": 0.1732938140630722,
"learning_rate": 0.0002987068282731309,
"loss": 14.1114,
"step": 54400
},
{
"epoch": 8.72,
"grad_norm": 0.20591895282268524,
"learning_rate": 0.00029870442817712706,
"loss": 13.7101,
"step": 54500
},
{
"epoch": 8.736,
"grad_norm": 0.1871296912431717,
"learning_rate": 0.00029870202808112323,
"loss": 14.539,
"step": 54600
},
{
"epoch": 8.752,
"grad_norm": 0.15711694955825806,
"learning_rate": 0.0002986996279851194,
"loss": 14.4353,
"step": 54700
},
{
"epoch": 8.768,
"grad_norm": 0.1790015697479248,
"learning_rate": 0.00029869722788911557,
"loss": 14.4861,
"step": 54800
},
{
"epoch": 8.784,
"grad_norm": 0.1903577744960785,
"learning_rate": 0.0002986948277931117,
"loss": 14.2582,
"step": 54900
},
{
"epoch": 8.8,
"grad_norm": 0.18150964379310608,
"learning_rate": 0.00029869242769710785,
"loss": 13.9522,
"step": 55000
},
{
"epoch": 8.816,
"grad_norm": 0.17604489624500275,
"learning_rate": 0.000298690027601104,
"loss": 14.4482,
"step": 55100
},
{
"epoch": 8.832,
"grad_norm": 0.18487071990966797,
"learning_rate": 0.0002986876275051002,
"loss": 13.9656,
"step": 55200
},
{
"epoch": 8.848,
"grad_norm": 0.15276212990283966,
"learning_rate": 0.00029868522740909636,
"loss": 14.2513,
"step": 55300
},
{
"epoch": 8.864,
"grad_norm": 0.19339829683303833,
"learning_rate": 0.0002986828273130925,
"loss": 13.7151,
"step": 55400
},
{
"epoch": 8.88,
"grad_norm": 0.14462265372276306,
"learning_rate": 0.00029868042721708864,
"loss": 13.8859,
"step": 55500
},
{
"epoch": 8.896,
"grad_norm": 0.16163522005081177,
"learning_rate": 0.0002986780271210848,
"loss": 13.7567,
"step": 55600
},
{
"epoch": 8.912,
"grad_norm": 0.15859289467334747,
"learning_rate": 0.000298675627025081,
"loss": 14.4693,
"step": 55700
},
{
"epoch": 8.928,
"grad_norm": 0.1641652137041092,
"learning_rate": 0.00029867322692907715,
"loss": 13.6118,
"step": 55800
},
{
"epoch": 8.943999999999999,
"grad_norm": 0.18410654366016388,
"learning_rate": 0.0002986708268330733,
"loss": 14.3033,
"step": 55900
},
{
"epoch": 8.96,
"grad_norm": 0.18847694993019104,
"learning_rate": 0.00029866842673706944,
"loss": 13.2935,
"step": 56000
},
{
"epoch": 8.975999999999999,
"grad_norm": 0.15224353969097137,
"learning_rate": 0.0002986660266410656,
"loss": 13.6185,
"step": 56100
},
{
"epoch": 8.992,
"grad_norm": 0.15307171642780304,
"learning_rate": 0.0002986636265450618,
"loss": 13.9229,
"step": 56200
},
{
"epoch": 9.008,
"grad_norm": 0.1455143541097641,
"learning_rate": 0.00029866122644905794,
"loss": 13.9716,
"step": 56300
},
{
"epoch": 9.024,
"grad_norm": 0.18889980018138885,
"learning_rate": 0.0002986588263530541,
"loss": 13.8509,
"step": 56400
},
{
"epoch": 9.04,
"grad_norm": 0.19757011532783508,
"learning_rate": 0.0002986564262570502,
"loss": 14.0519,
"step": 56500
},
{
"epoch": 9.056,
"grad_norm": 0.18008406460285187,
"learning_rate": 0.00029865405016200647,
"loss": 13.1833,
"step": 56600
},
{
"epoch": 9.072,
"grad_norm": 0.1602972447872162,
"learning_rate": 0.00029865165006600264,
"loss": 13.2838,
"step": 56700
},
{
"epoch": 9.088,
"grad_norm": 0.17582525312900543,
"learning_rate": 0.0002986492499699988,
"loss": 13.898,
"step": 56800
},
{
"epoch": 9.104,
"grad_norm": 0.15762995183467865,
"learning_rate": 0.0002986468498739949,
"loss": 13.5733,
"step": 56900
},
{
"epoch": 9.12,
"grad_norm": 0.1670118272304535,
"learning_rate": 0.0002986444497779911,
"loss": 13.5845,
"step": 57000
},
{
"epoch": 9.136,
"grad_norm": 0.18542303144931793,
"learning_rate": 0.00029864204968198726,
"loss": 13.9615,
"step": 57100
},
{
"epoch": 9.152,
"grad_norm": 0.18144281208515167,
"learning_rate": 0.00029863964958598343,
"loss": 13.0945,
"step": 57200
},
{
"epoch": 9.168,
"grad_norm": 0.18359419703483582,
"learning_rate": 0.0002986372494899796,
"loss": 13.4529,
"step": 57300
},
{
"epoch": 9.184,
"grad_norm": 0.2034582495689392,
"learning_rate": 0.0002986348493939757,
"loss": 13.2086,
"step": 57400
},
{
"epoch": 9.2,
"grad_norm": 0.1561286300420761,
"learning_rate": 0.0002986324492979719,
"loss": 13.5699,
"step": 57500
},
{
"epoch": 9.216,
"grad_norm": 0.2128494530916214,
"learning_rate": 0.00029863004920196805,
"loss": 13.7906,
"step": 57600
},
{
"epoch": 9.232,
"grad_norm": 0.18951255083084106,
"learning_rate": 0.0002986276491059642,
"loss": 13.4684,
"step": 57700
},
{
"epoch": 9.248,
"grad_norm": 0.14849476516246796,
"learning_rate": 0.0002986252490099604,
"loss": 13.6832,
"step": 57800
},
{
"epoch": 9.264,
"grad_norm": 0.19169315695762634,
"learning_rate": 0.00029862284891395656,
"loss": 12.9751,
"step": 57900
},
{
"epoch": 9.28,
"grad_norm": 0.219793900847435,
"learning_rate": 0.0002986204488179527,
"loss": 13.4069,
"step": 58000
},
{
"epoch": 9.296,
"grad_norm": 0.2139630764722824,
"learning_rate": 0.00029861804872194884,
"loss": 12.9185,
"step": 58100
},
{
"epoch": 9.312,
"grad_norm": 0.1722664088010788,
"learning_rate": 0.000298615648625945,
"loss": 13.4876,
"step": 58200
},
{
"epoch": 9.328,
"grad_norm": 0.15841473639011383,
"learning_rate": 0.0002986132485299412,
"loss": 13.481,
"step": 58300
},
{
"epoch": 9.344,
"grad_norm": 0.17484904825687408,
"learning_rate": 0.00029861084843393735,
"loss": 13.5925,
"step": 58400
},
{
"epoch": 9.36,
"grad_norm": 0.20388108491897583,
"learning_rate": 0.00029860844833793347,
"loss": 13.2549,
"step": 58500
},
{
"epoch": 9.376,
"grad_norm": 0.17959387600421906,
"learning_rate": 0.00029860604824192964,
"loss": 13.571,
"step": 58600
},
{
"epoch": 9.392,
"grad_norm": 0.1830485612154007,
"learning_rate": 0.0002986036481459258,
"loss": 13.0808,
"step": 58700
},
{
"epoch": 9.408,
"grad_norm": 0.1935325413942337,
"learning_rate": 0.000298601248049922,
"loss": 12.9193,
"step": 58800
},
{
"epoch": 9.424,
"grad_norm": 0.22928985953330994,
"learning_rate": 0.00029859884795391814,
"loss": 12.9233,
"step": 58900
},
{
"epoch": 9.44,
"grad_norm": 0.17562927305698395,
"learning_rate": 0.0002985964478579143,
"loss": 13.0933,
"step": 59000
},
{
"epoch": 9.456,
"grad_norm": 0.21014900505542755,
"learning_rate": 0.00029859404776191043,
"loss": 12.9421,
"step": 59100
},
{
"epoch": 9.472,
"grad_norm": 0.16698358952999115,
"learning_rate": 0.0002985916476659066,
"loss": 13.6465,
"step": 59200
},
{
"epoch": 9.488,
"grad_norm": 0.15990376472473145,
"learning_rate": 0.00029858924756990277,
"loss": 12.9832,
"step": 59300
},
{
"epoch": 9.504,
"grad_norm": 0.21185587346553802,
"learning_rate": 0.00029858684747389894,
"loss": 13.3695,
"step": 59400
},
{
"epoch": 9.52,
"grad_norm": 0.16105149686336517,
"learning_rate": 0.0002985844473778951,
"loss": 13.0733,
"step": 59500
},
{
"epoch": 9.536,
"grad_norm": 0.22624213993549347,
"learning_rate": 0.0002985820472818912,
"loss": 13.2586,
"step": 59600
},
{
"epoch": 9.552,
"grad_norm": 0.1732643097639084,
"learning_rate": 0.0002985796471858874,
"loss": 12.9246,
"step": 59700
},
{
"epoch": 9.568,
"grad_norm": 0.18406638503074646,
"learning_rate": 0.00029857724708988356,
"loss": 13.4556,
"step": 59800
},
{
"epoch": 9.584,
"grad_norm": 0.18207241594791412,
"learning_rate": 0.0002985748709948398,
"loss": 12.8405,
"step": 59900
},
{
"epoch": 9.6,
"grad_norm": 0.14808227121829987,
"learning_rate": 0.0002985724708988359,
"loss": 13.0075,
"step": 60000
},
{
"epoch": 9.616,
"grad_norm": 0.1976134330034256,
"learning_rate": 0.0002985700708028321,
"loss": 12.687,
"step": 60100
},
{
"epoch": 9.632,
"grad_norm": 0.1712380349636078,
"learning_rate": 0.00029856767070682825,
"loss": 13.003,
"step": 60200
},
{
"epoch": 9.648,
"grad_norm": 0.1509382426738739,
"learning_rate": 0.0002985652706108244,
"loss": 13.0863,
"step": 60300
},
{
"epoch": 9.664,
"grad_norm": 0.1992410570383072,
"learning_rate": 0.0002985628705148206,
"loss": 13.1396,
"step": 60400
},
{
"epoch": 9.68,
"grad_norm": 0.19914288818836212,
"learning_rate": 0.0002985604704188167,
"loss": 13.0716,
"step": 60500
},
{
"epoch": 9.696,
"grad_norm": 0.17157557606697083,
"learning_rate": 0.0002985580703228129,
"loss": 12.5376,
"step": 60600
},
{
"epoch": 9.712,
"grad_norm": 0.14820295572280884,
"learning_rate": 0.00029855567022680905,
"loss": 12.9209,
"step": 60700
},
{
"epoch": 9.728,
"grad_norm": 0.17262442409992218,
"learning_rate": 0.0002985532701308052,
"loss": 13.3595,
"step": 60800
},
{
"epoch": 9.744,
"grad_norm": 0.1804870218038559,
"learning_rate": 0.0002985508700348014,
"loss": 13.0037,
"step": 60900
},
{
"epoch": 9.76,
"grad_norm": 0.1507444977760315,
"learning_rate": 0.00029854846993879755,
"loss": 12.5568,
"step": 61000
},
{
"epoch": 9.776,
"grad_norm": 0.17809054255485535,
"learning_rate": 0.00029854606984279367,
"loss": 12.9826,
"step": 61100
},
{
"epoch": 9.792,
"grad_norm": 0.25455987453460693,
"learning_rate": 0.00029854366974678984,
"loss": 12.5432,
"step": 61200
},
{
"epoch": 9.808,
"grad_norm": 0.15175747871398926,
"learning_rate": 0.000298541269650786,
"loss": 12.9513,
"step": 61300
},
{
"epoch": 9.824,
"grad_norm": 0.22233819961547852,
"learning_rate": 0.0002985388695547822,
"loss": 13.2744,
"step": 61400
},
{
"epoch": 9.84,
"grad_norm": 0.1534196138381958,
"learning_rate": 0.00029853646945877835,
"loss": 12.4878,
"step": 61500
},
{
"epoch": 9.856,
"grad_norm": 0.17612405121326447,
"learning_rate": 0.00029853406936277446,
"loss": 12.6281,
"step": 61600
},
{
"epoch": 9.872,
"grad_norm": 0.14971201121807098,
"learning_rate": 0.00029853166926677063,
"loss": 12.4393,
"step": 61700
},
{
"epoch": 9.888,
"grad_norm": 0.15717633068561554,
"learning_rate": 0.0002985292691707668,
"loss": 12.6903,
"step": 61800
},
{
"epoch": 9.904,
"grad_norm": 0.1695670634508133,
"learning_rate": 0.00029852686907476297,
"loss": 12.9557,
"step": 61900
},
{
"epoch": 9.92,
"grad_norm": 0.16429013013839722,
"learning_rate": 0.00029852446897875914,
"loss": 12.9804,
"step": 62000
},
{
"epoch": 9.936,
"grad_norm": 0.1919148713350296,
"learning_rate": 0.0002985220688827553,
"loss": 12.8735,
"step": 62100
},
{
"epoch": 9.952,
"grad_norm": 0.1977461278438568,
"learning_rate": 0.0002985196687867514,
"loss": 12.6665,
"step": 62200
},
{
"epoch": 9.968,
"grad_norm": 0.3409396708011627,
"learning_rate": 0.0002985172686907476,
"loss": 11.9422,
"step": 62300
},
{
"epoch": 9.984,
"grad_norm": 0.1977001428604126,
"learning_rate": 0.00029851486859474376,
"loss": 13.392,
"step": 62400
},
{
"epoch": 10.0,
"grad_norm": 0.19805894792079926,
"learning_rate": 0.00029851246849873993,
"loss": 12.3432,
"step": 62500
},
{
"epoch": 10.016,
"grad_norm": 0.1851508915424347,
"learning_rate": 0.0002985100684027361,
"loss": 12.8953,
"step": 62600
},
{
"epoch": 10.032,
"grad_norm": 0.15137746930122375,
"learning_rate": 0.0002985076683067322,
"loss": 12.8256,
"step": 62700
},
{
"epoch": 10.048,
"grad_norm": 0.1815025508403778,
"learning_rate": 0.00029850529221168846,
"loss": 12.2427,
"step": 62800
},
{
"epoch": 10.064,
"grad_norm": 0.282045841217041,
"learning_rate": 0.0002985028921156846,
"loss": 12.5777,
"step": 62900
},
{
"epoch": 10.08,
"grad_norm": 0.19669105112552643,
"learning_rate": 0.0002985004920196808,
"loss": 12.85,
"step": 63000
},
{
"epoch": 10.096,
"grad_norm": 0.1557861566543579,
"learning_rate": 0.0002984980919236769,
"loss": 12.6325,
"step": 63100
},
{
"epoch": 10.112,
"grad_norm": 0.16353458166122437,
"learning_rate": 0.0002984956918276731,
"loss": 12.5578,
"step": 63200
},
{
"epoch": 10.128,
"grad_norm": 0.19124484062194824,
"learning_rate": 0.00029849329173166925,
"loss": 12.8784,
"step": 63300
},
{
"epoch": 10.144,
"grad_norm": 0.16097944974899292,
"learning_rate": 0.0002984908916356654,
"loss": 11.7994,
"step": 63400
},
{
"epoch": 10.16,
"grad_norm": 0.155614972114563,
"learning_rate": 0.0002984884915396616,
"loss": 11.9617,
"step": 63500
},
{
"epoch": 10.176,
"grad_norm": 0.19013510644435883,
"learning_rate": 0.0002984860914436577,
"loss": 12.1663,
"step": 63600
},
{
"epoch": 10.192,
"grad_norm": 0.21610714495182037,
"learning_rate": 0.00029848369134765387,
"loss": 12.2304,
"step": 63700
},
{
"epoch": 10.208,
"grad_norm": 0.15554966032505035,
"learning_rate": 0.00029848129125165004,
"loss": 11.9337,
"step": 63800
},
{
"epoch": 10.224,
"grad_norm": 0.14373019337654114,
"learning_rate": 0.0002984788911556462,
"loss": 12.5049,
"step": 63900
},
{
"epoch": 10.24,
"grad_norm": 0.197763592004776,
"learning_rate": 0.0002984764910596424,
"loss": 12.2087,
"step": 64000
},
{
"epoch": 10.256,
"grad_norm": 0.1522061824798584,
"learning_rate": 0.00029847409096363855,
"loss": 12.475,
"step": 64100
},
{
"epoch": 10.272,
"grad_norm": 0.15849411487579346,
"learning_rate": 0.00029847169086763466,
"loss": 12.1301,
"step": 64200
},
{
"epoch": 10.288,
"grad_norm": 0.1680125594139099,
"learning_rate": 0.00029846929077163083,
"loss": 12.2041,
"step": 64300
},
{
"epoch": 10.304,
"grad_norm": 0.17618972063064575,
"learning_rate": 0.000298466890675627,
"loss": 12.1634,
"step": 64400
},
{
"epoch": 10.32,
"grad_norm": 0.19345271587371826,
"learning_rate": 0.00029846449057962317,
"loss": 12.0509,
"step": 64500
},
{
"epoch": 10.336,
"grad_norm": 0.15981802344322205,
"learning_rate": 0.00029846209048361934,
"loss": 11.879,
"step": 64600
},
{
"epoch": 10.352,
"grad_norm": 0.1640341877937317,
"learning_rate": 0.00029845969038761545,
"loss": 12.3471,
"step": 64700
},
{
"epoch": 10.368,
"grad_norm": 0.1751720905303955,
"learning_rate": 0.0002984572902916116,
"loss": 11.7085,
"step": 64800
},
{
"epoch": 10.384,
"grad_norm": 0.15203487873077393,
"learning_rate": 0.00029845491419656787,
"loss": 11.9901,
"step": 64900
},
{
"epoch": 10.4,
"grad_norm": 0.1836910843849182,
"learning_rate": 0.00029845251410056403,
"loss": 11.5864,
"step": 65000
},
{
"epoch": 10.416,
"grad_norm": 0.2329769879579544,
"learning_rate": 0.00029845011400456015,
"loss": 11.8386,
"step": 65100
},
{
"epoch": 10.432,
"grad_norm": 0.25904643535614014,
"learning_rate": 0.0002984477139085563,
"loss": 11.6842,
"step": 65200
},
{
"epoch": 10.448,
"grad_norm": 0.16373856365680695,
"learning_rate": 0.0002984453138125525,
"loss": 11.9861,
"step": 65300
},
{
"epoch": 10.464,
"grad_norm": 0.1684304028749466,
"learning_rate": 0.00029844291371654866,
"loss": 12.1751,
"step": 65400
},
{
"epoch": 10.48,
"grad_norm": 0.1975129395723343,
"learning_rate": 0.0002984405136205448,
"loss": 11.9744,
"step": 65500
},
{
"epoch": 10.496,
"grad_norm": 0.144730344414711,
"learning_rate": 0.00029843811352454094,
"loss": 11.7554,
"step": 65600
},
{
"epoch": 10.512,
"grad_norm": 0.21416126191616058,
"learning_rate": 0.0002984357134285371,
"loss": 11.7885,
"step": 65700
},
{
"epoch": 10.528,
"grad_norm": 0.1401461511850357,
"learning_rate": 0.0002984333133325333,
"loss": 12.2278,
"step": 65800
},
{
"epoch": 10.544,
"grad_norm": 0.15199688076972961,
"learning_rate": 0.00029843091323652945,
"loss": 12.0611,
"step": 65900
},
{
"epoch": 10.56,
"grad_norm": 0.16079574823379517,
"learning_rate": 0.0002984285131405256,
"loss": 11.3473,
"step": 66000
},
{
"epoch": 10.576,
"grad_norm": 0.14441320300102234,
"learning_rate": 0.0002984261130445218,
"loss": 11.5284,
"step": 66100
},
{
"epoch": 10.592,
"grad_norm": 0.1676328480243683,
"learning_rate": 0.0002984237129485179,
"loss": 11.6487,
"step": 66200
},
{
"epoch": 10.608,
"grad_norm": 0.13956011831760406,
"learning_rate": 0.00029842131285251407,
"loss": 11.772,
"step": 66300
},
{
"epoch": 10.624,
"grad_norm": 0.17723798751831055,
"learning_rate": 0.00029841891275651024,
"loss": 11.7424,
"step": 66400
},
{
"epoch": 10.64,
"grad_norm": 0.18211066722869873,
"learning_rate": 0.0002984165126605064,
"loss": 11.9263,
"step": 66500
},
{
"epoch": 10.656,
"grad_norm": 0.18465609848499298,
"learning_rate": 0.0002984141125645026,
"loss": 12.1533,
"step": 66600
},
{
"epoch": 10.672,
"grad_norm": 0.15032535791397095,
"learning_rate": 0.0002984117124684987,
"loss": 11.8711,
"step": 66700
},
{
"epoch": 10.688,
"grad_norm": 0.25048136711120605,
"learning_rate": 0.00029840931237249486,
"loss": 12.1925,
"step": 66800
},
{
"epoch": 10.704,
"grad_norm": 0.17632503807544708,
"learning_rate": 0.00029840691227649103,
"loss": 12.0652,
"step": 66900
},
{
"epoch": 10.72,
"grad_norm": 0.17492571473121643,
"learning_rate": 0.0002984045121804872,
"loss": 12.3961,
"step": 67000
},
{
"epoch": 10.736,
"grad_norm": 0.17848367989063263,
"learning_rate": 0.00029840211208448337,
"loss": 12.0021,
"step": 67100
},
{
"epoch": 10.752,
"grad_norm": 0.23175941407680511,
"learning_rate": 0.00029839971198847954,
"loss": 11.4583,
"step": 67200
},
{
"epoch": 10.768,
"grad_norm": 0.24281519651412964,
"learning_rate": 0.0002983973358934357,
"loss": 12.0376,
"step": 67300
},
{
"epoch": 10.784,
"grad_norm": 0.18129272758960724,
"learning_rate": 0.00029839493579743184,
"loss": 12.1892,
"step": 67400
},
{
"epoch": 10.8,
"grad_norm": 0.1454136222600937,
"learning_rate": 0.000298392535701428,
"loss": 11.9333,
"step": 67500
},
{
"epoch": 10.816,
"grad_norm": 0.12412439286708832,
"learning_rate": 0.0002983901356054242,
"loss": 11.0441,
"step": 67600
},
{
"epoch": 10.832,
"grad_norm": 0.19814914464950562,
"learning_rate": 0.00029838773550942035,
"loss": 11.4348,
"step": 67700
},
{
"epoch": 10.848,
"grad_norm": 0.2250308245420456,
"learning_rate": 0.0002983853354134165,
"loss": 11.723,
"step": 67800
},
{
"epoch": 10.864,
"grad_norm": 0.1328551471233368,
"learning_rate": 0.0002983829353174127,
"loss": 11.4324,
"step": 67900
},
{
"epoch": 10.88,
"grad_norm": 0.2366170883178711,
"learning_rate": 0.00029838053522140886,
"loss": 12.1462,
"step": 68000
},
{
"epoch": 10.896,
"grad_norm": 0.20911742746829987,
"learning_rate": 0.00029837813512540503,
"loss": 11.6067,
"step": 68100
},
{
"epoch": 10.912,
"grad_norm": 0.1770290583372116,
"learning_rate": 0.00029837573502940114,
"loss": 11.9299,
"step": 68200
},
{
"epoch": 10.928,
"grad_norm": 0.21429571509361267,
"learning_rate": 0.0002983733349333973,
"loss": 11.3683,
"step": 68300
},
{
"epoch": 10.943999999999999,
"grad_norm": 0.1542270928621292,
"learning_rate": 0.0002983709348373935,
"loss": 11.3472,
"step": 68400
},
{
"epoch": 10.96,
"grad_norm": 0.2420985847711563,
"learning_rate": 0.00029836853474138965,
"loss": 11.5805,
"step": 68500
},
{
"epoch": 10.975999999999999,
"grad_norm": 0.17665143311023712,
"learning_rate": 0.0002983661346453858,
"loss": 11.7406,
"step": 68600
},
{
"epoch": 10.992,
"grad_norm": 0.26210835576057434,
"learning_rate": 0.00029836373454938193,
"loss": 11.7457,
"step": 68700
},
{
"epoch": 11.008,
"grad_norm": 0.14472606778144836,
"learning_rate": 0.0002983613344533781,
"loss": 11.4662,
"step": 68800
},
{
"epoch": 11.024,
"grad_norm": 0.17449091374874115,
"learning_rate": 0.0002983589343573743,
"loss": 11.0297,
"step": 68900
},
{
"epoch": 11.04,
"grad_norm": 0.15488724410533905,
"learning_rate": 0.00029835653426137044,
"loss": 11.792,
"step": 69000
},
{
"epoch": 11.056,
"grad_norm": 0.1447325348854065,
"learning_rate": 0.0002983541341653666,
"loss": 11.4483,
"step": 69100
},
{
"epoch": 11.072,
"grad_norm": 0.17111489176750183,
"learning_rate": 0.0002983517340693628,
"loss": 11.1499,
"step": 69200
},
{
"epoch": 11.088,
"grad_norm": 0.17446951568126678,
"learning_rate": 0.0002983493339733589,
"loss": 10.6961,
"step": 69300
},
{
"epoch": 11.104,
"grad_norm": 0.1421278566122055,
"learning_rate": 0.00029834693387735506,
"loss": 11.4794,
"step": 69400
},
{
"epoch": 11.12,
"grad_norm": 0.17439322173595428,
"learning_rate": 0.00029834455778231125,
"loss": 11.0965,
"step": 69500
},
{
"epoch": 11.136,
"grad_norm": 0.16200323402881622,
"learning_rate": 0.0002983421576863074,
"loss": 11.1367,
"step": 69600
},
{
"epoch": 11.152,
"grad_norm": 0.3391527831554413,
"learning_rate": 0.0002983397575903036,
"loss": 10.7709,
"step": 69700
},
{
"epoch": 11.168,
"grad_norm": 0.18793489038944244,
"learning_rate": 0.0002983373574942997,
"loss": 11.1479,
"step": 69800
},
{
"epoch": 11.184,
"grad_norm": 0.1996636688709259,
"learning_rate": 0.0002983349573982959,
"loss": 11.8347,
"step": 69900
},
{
"epoch": 11.2,
"grad_norm": 0.166090190410614,
"learning_rate": 0.00029833255730229205,
"loss": 10.9514,
"step": 70000
},
{
"epoch": 11.216,
"grad_norm": 0.17243006825447083,
"learning_rate": 0.0002983301572062882,
"loss": 11.2505,
"step": 70100
},
{
"epoch": 11.232,
"grad_norm": 0.17860250174999237,
"learning_rate": 0.0002983277571102844,
"loss": 11.023,
"step": 70200
},
{
"epoch": 11.248,
"grad_norm": 0.13896320760250092,
"learning_rate": 0.00029832535701428055,
"loss": 11.092,
"step": 70300
},
{
"epoch": 11.264,
"grad_norm": 0.20008546113967896,
"learning_rate": 0.00029832295691827667,
"loss": 11.2161,
"step": 70400
},
{
"epoch": 11.28,
"grad_norm": 0.14014984667301178,
"learning_rate": 0.00029832055682227284,
"loss": 11.315,
"step": 70500
},
{
"epoch": 11.296,
"grad_norm": 0.16158168017864227,
"learning_rate": 0.000298318156726269,
"loss": 11.3935,
"step": 70600
},
{
"epoch": 11.312,
"grad_norm": 0.15444719791412354,
"learning_rate": 0.0002983157566302652,
"loss": 10.9662,
"step": 70700
},
{
"epoch": 11.328,
"grad_norm": 0.21788270771503448,
"learning_rate": 0.00029831335653426134,
"loss": 11.4848,
"step": 70800
},
{
"epoch": 11.344,
"grad_norm": 0.17685194313526154,
"learning_rate": 0.0002983109564382575,
"loss": 11.3436,
"step": 70900
},
{
"epoch": 11.36,
"grad_norm": 0.15553423762321472,
"learning_rate": 0.0002983085563422537,
"loss": 11.1136,
"step": 71000
},
{
"epoch": 11.376,
"grad_norm": 0.1547129899263382,
"learning_rate": 0.00029830615624624985,
"loss": 10.7924,
"step": 71100
},
{
"epoch": 11.392,
"grad_norm": 0.1907842457294464,
"learning_rate": 0.000298303756150246,
"loss": 10.9726,
"step": 71200
},
{
"epoch": 11.408,
"grad_norm": 0.15053051710128784,
"learning_rate": 0.00029830135605424214,
"loss": 12.0626,
"step": 71300
},
{
"epoch": 11.424,
"grad_norm": 0.14403216540813446,
"learning_rate": 0.0002982989559582383,
"loss": 11.428,
"step": 71400
},
{
"epoch": 11.44,
"grad_norm": 0.15850169956684113,
"learning_rate": 0.0002982965558622345,
"loss": 11.1033,
"step": 71500
},
{
"epoch": 11.456,
"grad_norm": 0.18223829567432404,
"learning_rate": 0.00029829417976719066,
"loss": 11.5088,
"step": 71600
},
{
"epoch": 11.472,
"grad_norm": 0.18121246993541718,
"learning_rate": 0.00029829177967118683,
"loss": 11.0869,
"step": 71700
},
{
"epoch": 11.488,
"grad_norm": 0.1591707020998001,
"learning_rate": 0.00029828937957518295,
"loss": 10.5898,
"step": 71800
},
{
"epoch": 11.504,
"grad_norm": 0.1652923971414566,
"learning_rate": 0.0002982869794791791,
"loss": 11.3647,
"step": 71900
},
{
"epoch": 11.52,
"grad_norm": 0.1930815577507019,
"learning_rate": 0.0002982845793831753,
"loss": 11.4873,
"step": 72000
},
{
"epoch": 11.536,
"grad_norm": 0.1646055281162262,
"learning_rate": 0.00029828217928717145,
"loss": 11.3799,
"step": 72100
},
{
"epoch": 11.552,
"grad_norm": 0.19326475262641907,
"learning_rate": 0.0002982797791911676,
"loss": 10.8387,
"step": 72200
},
{
"epoch": 11.568,
"grad_norm": 0.23909342288970947,
"learning_rate": 0.0002982773790951638,
"loss": 10.757,
"step": 72300
},
{
"epoch": 11.584,
"grad_norm": 0.1616702377796173,
"learning_rate": 0.0002982749789991599,
"loss": 10.7907,
"step": 72400
},
{
"epoch": 11.6,
"grad_norm": 0.16581912338733673,
"learning_rate": 0.0002982725789031561,
"loss": 10.8977,
"step": 72500
},
{
"epoch": 11.616,
"grad_norm": 0.1478215605020523,
"learning_rate": 0.00029827017880715225,
"loss": 10.9325,
"step": 72600
},
{
"epoch": 11.632,
"grad_norm": 0.2693212628364563,
"learning_rate": 0.0002982677787111484,
"loss": 11.2731,
"step": 72700
},
{
"epoch": 11.648,
"grad_norm": 0.15163065493106842,
"learning_rate": 0.0002982653786151446,
"loss": 11.0141,
"step": 72800
},
{
"epoch": 11.664,
"grad_norm": 0.15364685654640198,
"learning_rate": 0.00029826297851914075,
"loss": 10.6781,
"step": 72900
},
{
"epoch": 11.68,
"grad_norm": 0.1410771906375885,
"learning_rate": 0.00029826057842313687,
"loss": 11.0262,
"step": 73000
},
{
"epoch": 11.696,
"grad_norm": 0.2245720773935318,
"learning_rate": 0.00029825817832713304,
"loss": 11.51,
"step": 73100
},
{
"epoch": 11.712,
"grad_norm": 0.17434003949165344,
"learning_rate": 0.0002982557782311292,
"loss": 10.7819,
"step": 73200
},
{
"epoch": 11.728,
"grad_norm": 0.13878166675567627,
"learning_rate": 0.0002982534021360854,
"loss": 10.8833,
"step": 73300
},
{
"epoch": 11.744,
"grad_norm": 0.13650259375572205,
"learning_rate": 0.00029825100204008157,
"loss": 11.0158,
"step": 73400
},
{
"epoch": 11.76,
"grad_norm": 0.22818398475646973,
"learning_rate": 0.00029824860194407773,
"loss": 10.8819,
"step": 73500
},
{
"epoch": 11.776,
"grad_norm": 0.14601178467273712,
"learning_rate": 0.0002982462018480739,
"loss": 10.0593,
"step": 73600
},
{
"epoch": 11.792,
"grad_norm": 0.2245131880044937,
"learning_rate": 0.00029824380175207007,
"loss": 10.6634,
"step": 73700
},
{
"epoch": 11.808,
"grad_norm": 1.000320553779602,
"learning_rate": 0.0002982414016560662,
"loss": 10.961,
"step": 73800
},
{
"epoch": 11.824,
"grad_norm": 0.18026384711265564,
"learning_rate": 0.00029823900156006236,
"loss": 11.1536,
"step": 73900
},
{
"epoch": 11.84,
"grad_norm": 0.15758727490901947,
"learning_rate": 0.0002982366014640585,
"loss": 10.6586,
"step": 74000
},
{
"epoch": 11.856,
"grad_norm": 0.19163353741168976,
"learning_rate": 0.0002982342013680547,
"loss": 11.0334,
"step": 74100
},
{
"epoch": 11.872,
"grad_norm": 0.11467296630144119,
"learning_rate": 0.00029823180127205086,
"loss": 10.8224,
"step": 74200
},
{
"epoch": 11.888,
"grad_norm": 0.15869416296482086,
"learning_rate": 0.00029822940117604703,
"loss": 10.4906,
"step": 74300
},
{
"epoch": 11.904,
"grad_norm": 0.1966274380683899,
"learning_rate": 0.00029822700108004315,
"loss": 10.4152,
"step": 74400
},
{
"epoch": 11.92,
"grad_norm": 0.16446225345134735,
"learning_rate": 0.0002982246009840393,
"loss": 10.4887,
"step": 74500
},
{
"epoch": 11.936,
"grad_norm": 0.16940893232822418,
"learning_rate": 0.0002982222008880355,
"loss": 10.39,
"step": 74600
},
{
"epoch": 11.952,
"grad_norm": 0.1838199496269226,
"learning_rate": 0.00029821980079203166,
"loss": 10.384,
"step": 74700
},
{
"epoch": 11.968,
"grad_norm": 0.17523860931396484,
"learning_rate": 0.0002982174006960278,
"loss": 10.8568,
"step": 74800
},
{
"epoch": 11.984,
"grad_norm": 0.1432792991399765,
"learning_rate": 0.000298215000600024,
"loss": 10.3596,
"step": 74900
},
{
"epoch": 12.0,
"grad_norm": 0.20020250976085663,
"learning_rate": 0.0002982126005040201,
"loss": 10.14,
"step": 75000
},
{
"epoch": 12.016,
"grad_norm": 0.19777518510818481,
"learning_rate": 0.0002982102004080163,
"loss": 10.9224,
"step": 75100
},
{
"epoch": 12.032,
"grad_norm": 0.17126210033893585,
"learning_rate": 0.00029820780031201245,
"loss": 10.5306,
"step": 75200
},
{
"epoch": 12.048,
"grad_norm": 0.16797253489494324,
"learning_rate": 0.0002982054002160086,
"loss": 10.8089,
"step": 75300
},
{
"epoch": 12.064,
"grad_norm": 0.20862014591693878,
"learning_rate": 0.0002982030001200048,
"loss": 10.4757,
"step": 75400
},
{
"epoch": 12.08,
"grad_norm": 0.18397895991802216,
"learning_rate": 0.0002982006000240009,
"loss": 9.9135,
"step": 75500
},
{
"epoch": 12.096,
"grad_norm": 0.16641663014888763,
"learning_rate": 0.00029819819992799707,
"loss": 10.6077,
"step": 75600
},
{
"epoch": 12.112,
"grad_norm": 0.16870319843292236,
"learning_rate": 0.00029819579983199324,
"loss": 10.5788,
"step": 75700
},
{
"epoch": 12.128,
"grad_norm": 0.16674315929412842,
"learning_rate": 0.0002981933997359894,
"loss": 10.7791,
"step": 75800
},
{
"epoch": 12.144,
"grad_norm": 0.1637590378522873,
"learning_rate": 0.0002981909996399856,
"loss": 10.0084,
"step": 75900
},
{
"epoch": 12.16,
"grad_norm": 0.16165070235729218,
"learning_rate": 0.00029818859954398175,
"loss": 10.7957,
"step": 76000
},
{
"epoch": 12.176,
"grad_norm": 0.1414174884557724,
"learning_rate": 0.00029818619944797786,
"loss": 9.8668,
"step": 76100
},
{
"epoch": 12.192,
"grad_norm": 0.1490393877029419,
"learning_rate": 0.00029818379935197403,
"loss": 10.5844,
"step": 76200
},
{
"epoch": 12.208,
"grad_norm": 0.15608841180801392,
"learning_rate": 0.0002981813992559702,
"loss": 10.7121,
"step": 76300
},
{
"epoch": 12.224,
"grad_norm": 0.1658240258693695,
"learning_rate": 0.00029817899915996637,
"loss": 10.4018,
"step": 76400
},
{
"epoch": 12.24,
"grad_norm": 0.1533997803926468,
"learning_rate": 0.00029817659906396254,
"loss": 10.0445,
"step": 76500
},
{
"epoch": 12.256,
"grad_norm": 0.14606164395809174,
"learning_rate": 0.00029817419896795865,
"loss": 10.8624,
"step": 76600
},
{
"epoch": 12.272,
"grad_norm": 0.1926526576280594,
"learning_rate": 0.0002981717988719548,
"loss": 9.9639,
"step": 76700
},
{
"epoch": 12.288,
"grad_norm": 0.16846922039985657,
"learning_rate": 0.000298169398775951,
"loss": 10.4076,
"step": 76800
},
{
"epoch": 12.304,
"grad_norm": 0.1497686505317688,
"learning_rate": 0.00029816699867994716,
"loss": 10.3741,
"step": 76900
},
{
"epoch": 12.32,
"grad_norm": 0.17146418988704681,
"learning_rate": 0.00029816459858394333,
"loss": 10.6163,
"step": 77000
},
{
"epoch": 12.336,
"grad_norm": 0.169904425740242,
"learning_rate": 0.0002981621984879395,
"loss": 10.0631,
"step": 77100
},
{
"epoch": 12.352,
"grad_norm": 0.15850874781608582,
"learning_rate": 0.00029815979839193567,
"loss": 10.0799,
"step": 77200
},
{
"epoch": 12.368,
"grad_norm": 0.15920597314834595,
"learning_rate": 0.00029815739829593184,
"loss": 9.6119,
"step": 77300
},
{
"epoch": 12.384,
"grad_norm": 0.2246374636888504,
"learning_rate": 0.000298154998199928,
"loss": 10.3029,
"step": 77400
},
{
"epoch": 12.4,
"grad_norm": 0.168796569108963,
"learning_rate": 0.0002981525981039241,
"loss": 10.3374,
"step": 77500
},
{
"epoch": 12.416,
"grad_norm": 0.1864066869020462,
"learning_rate": 0.0002981501980079203,
"loss": 10.0087,
"step": 77600
},
{
"epoch": 12.432,
"grad_norm": 0.14401012659072876,
"learning_rate": 0.0002981478219128765,
"loss": 10.1803,
"step": 77700
},
{
"epoch": 12.448,
"grad_norm": 0.1375201791524887,
"learning_rate": 0.00029814542181687265,
"loss": 9.911,
"step": 77800
},
{
"epoch": 12.464,
"grad_norm": 0.1398741900920868,
"learning_rate": 0.0002981430217208688,
"loss": 10.261,
"step": 77900
},
{
"epoch": 12.48,
"grad_norm": 0.15873165428638458,
"learning_rate": 0.000298140621624865,
"loss": 10.7101,
"step": 78000
},
{
"epoch": 12.496,
"grad_norm": 0.1714644730091095,
"learning_rate": 0.0002981382215288611,
"loss": 10.1714,
"step": 78100
},
{
"epoch": 12.512,
"grad_norm": 0.1591562181711197,
"learning_rate": 0.00029813582143285727,
"loss": 10.1645,
"step": 78200
},
{
"epoch": 12.528,
"grad_norm": 0.18264716863632202,
"learning_rate": 0.00029813342133685344,
"loss": 10.3564,
"step": 78300
},
{
"epoch": 12.544,
"grad_norm": 0.1514509618282318,
"learning_rate": 0.0002981310212408496,
"loss": 10.0476,
"step": 78400
},
{
"epoch": 12.56,
"grad_norm": 0.19021818041801453,
"learning_rate": 0.0002981286211448458,
"loss": 10.2492,
"step": 78500
},
{
"epoch": 12.576,
"grad_norm": 0.21221980452537537,
"learning_rate": 0.0002981262210488419,
"loss": 9.7379,
"step": 78600
},
{
"epoch": 12.592,
"grad_norm": 0.16575005650520325,
"learning_rate": 0.00029812382095283806,
"loss": 10.237,
"step": 78700
},
{
"epoch": 12.608,
"grad_norm": 0.12602052092552185,
"learning_rate": 0.00029812142085683423,
"loss": 10.0729,
"step": 78800
},
{
"epoch": 12.624,
"grad_norm": 0.23105710744857788,
"learning_rate": 0.0002981190207608304,
"loss": 9.8609,
"step": 78900
},
{
"epoch": 12.64,
"grad_norm": 0.29600638151168823,
"learning_rate": 0.00029811662066482657,
"loss": 9.8653,
"step": 79000
},
{
"epoch": 12.656,
"grad_norm": 0.19172607362270355,
"learning_rate": 0.00029811422056882274,
"loss": 9.8614,
"step": 79100
},
{
"epoch": 12.672,
"grad_norm": 0.1930418759584427,
"learning_rate": 0.00029811182047281886,
"loss": 10.0208,
"step": 79200
},
{
"epoch": 12.688,
"grad_norm": 0.12393278628587723,
"learning_rate": 0.000298109420376815,
"loss": 10.349,
"step": 79300
},
{
"epoch": 12.704,
"grad_norm": 0.1565830409526825,
"learning_rate": 0.0002981070202808112,
"loss": 10.5402,
"step": 79400
},
{
"epoch": 12.72,
"grad_norm": 0.13968247175216675,
"learning_rate": 0.00029810462018480736,
"loss": 9.9296,
"step": 79500
},
{
"epoch": 12.736,
"grad_norm": 0.17765802145004272,
"learning_rate": 0.00029810222008880353,
"loss": 9.8002,
"step": 79600
},
{
"epoch": 12.752,
"grad_norm": 0.23838719725608826,
"learning_rate": 0.00029809981999279965,
"loss": 9.8636,
"step": 79700
},
{
"epoch": 12.768,
"grad_norm": 0.23086270689964294,
"learning_rate": 0.0002980974438977559,
"loss": 9.9585,
"step": 79800
},
{
"epoch": 12.784,
"grad_norm": 0.14923255145549774,
"learning_rate": 0.00029809504380175206,
"loss": 9.5379,
"step": 79900
},
{
"epoch": 12.8,
"grad_norm": 0.1599462628364563,
"learning_rate": 0.00029809264370574823,
"loss": 9.641,
"step": 80000
},
{
"epoch": 12.816,
"grad_norm": 0.1716078370809555,
"learning_rate": 0.00029809024360974434,
"loss": 9.8697,
"step": 80100
},
{
"epoch": 12.832,
"grad_norm": 0.19052661955356598,
"learning_rate": 0.0002980878435137405,
"loss": 9.6785,
"step": 80200
},
{
"epoch": 12.848,
"grad_norm": 0.15575654804706573,
"learning_rate": 0.0002980854434177367,
"loss": 9.9394,
"step": 80300
},
{
"epoch": 12.864,
"grad_norm": 0.19439518451690674,
"learning_rate": 0.00029808304332173285,
"loss": 9.5522,
"step": 80400
},
{
"epoch": 12.88,
"grad_norm": 0.17798827588558197,
"learning_rate": 0.000298080643225729,
"loss": 9.9453,
"step": 80500
},
{
"epoch": 12.896,
"grad_norm": 0.16586044430732727,
"learning_rate": 0.00029807824312972513,
"loss": 9.8505,
"step": 80600
},
{
"epoch": 12.912,
"grad_norm": 0.15794214606285095,
"learning_rate": 0.0002980758430337213,
"loss": 10.0497,
"step": 80700
},
{
"epoch": 12.928,
"grad_norm": 0.1685098111629486,
"learning_rate": 0.0002980734429377175,
"loss": 10.2658,
"step": 80800
},
{
"epoch": 12.943999999999999,
"grad_norm": 0.16599301993846893,
"learning_rate": 0.00029807104284171364,
"loss": 9.837,
"step": 80900
},
{
"epoch": 12.96,
"grad_norm": 0.14692434668540955,
"learning_rate": 0.0002980686427457098,
"loss": 10.1817,
"step": 81000
},
{
"epoch": 12.975999999999999,
"grad_norm": 0.15374502539634705,
"learning_rate": 0.000298066242649706,
"loss": 10.1231,
"step": 81100
},
{
"epoch": 12.992,
"grad_norm": 0.1369294375181198,
"learning_rate": 0.0002980638425537021,
"loss": 9.8245,
"step": 81200
},
{
"epoch": 13.008,
"grad_norm": 0.20259645581245422,
"learning_rate": 0.00029806144245769826,
"loss": 9.7027,
"step": 81300
},
{
"epoch": 13.024,
"grad_norm": 0.1258879452943802,
"learning_rate": 0.00029805904236169443,
"loss": 9.8863,
"step": 81400
},
{
"epoch": 13.04,
"grad_norm": 0.14773085713386536,
"learning_rate": 0.0002980566422656906,
"loss": 9.4255,
"step": 81500
},
{
"epoch": 13.056,
"grad_norm": 0.17212265729904175,
"learning_rate": 0.00029805424216968677,
"loss": 10.0506,
"step": 81600
},
{
"epoch": 13.072,
"grad_norm": 0.179426372051239,
"learning_rate": 0.0002980518420736829,
"loss": 9.5137,
"step": 81700
},
{
"epoch": 13.088,
"grad_norm": 0.15935377776622772,
"learning_rate": 0.00029804944197767906,
"loss": 9.3141,
"step": 81800
},
{
"epoch": 13.104,
"grad_norm": 0.17460429668426514,
"learning_rate": 0.0002980470418816752,
"loss": 9.8005,
"step": 81900
},
{
"epoch": 13.12,
"grad_norm": 0.20005491375923157,
"learning_rate": 0.0002980446417856714,
"loss": 9.7239,
"step": 82000
},
{
"epoch": 13.136,
"grad_norm": 0.15051016211509705,
"learning_rate": 0.00029804224168966756,
"loss": 10.214,
"step": 82100
},
{
"epoch": 13.152,
"grad_norm": 0.16659046709537506,
"learning_rate": 0.00029803984159366373,
"loss": 9.4695,
"step": 82200
},
{
"epoch": 13.168,
"grad_norm": 0.16346730291843414,
"learning_rate": 0.00029803744149765985,
"loss": 9.5839,
"step": 82300
},
{
"epoch": 13.184,
"grad_norm": 0.16145597398281097,
"learning_rate": 0.000298035041401656,
"loss": 9.2663,
"step": 82400
},
{
"epoch": 13.2,
"grad_norm": 0.13834603130817413,
"learning_rate": 0.00029803266530661226,
"loss": 9.6926,
"step": 82500
},
{
"epoch": 13.216,
"grad_norm": 0.17841538786888123,
"learning_rate": 0.0002980302652106084,
"loss": 9.4752,
"step": 82600
},
{
"epoch": 13.232,
"grad_norm": 0.14639347791671753,
"learning_rate": 0.00029802786511460454,
"loss": 9.9606,
"step": 82700
},
{
"epoch": 13.248,
"grad_norm": 0.15291540324687958,
"learning_rate": 0.0002980254650186007,
"loss": 9.9284,
"step": 82800
},
{
"epoch": 13.264,
"grad_norm": 0.15908333659172058,
"learning_rate": 0.0002980230649225969,
"loss": 9.5464,
"step": 82900
},
{
"epoch": 13.28,
"grad_norm": 0.16768860816955566,
"learning_rate": 0.00029802066482659305,
"loss": 10.2164,
"step": 83000
},
{
"epoch": 13.296,
"grad_norm": 0.18221326172351837,
"learning_rate": 0.0002980182647305892,
"loss": 9.6566,
"step": 83100
},
{
"epoch": 13.312,
"grad_norm": 0.13944192230701447,
"learning_rate": 0.00029801586463458534,
"loss": 9.4149,
"step": 83200
},
{
"epoch": 13.328,
"grad_norm": 0.20090098679065704,
"learning_rate": 0.0002980134645385815,
"loss": 9.1968,
"step": 83300
},
{
"epoch": 13.344,
"grad_norm": 0.17636704444885254,
"learning_rate": 0.0002980110644425777,
"loss": 9.4497,
"step": 83400
},
{
"epoch": 13.36,
"grad_norm": 0.19672048091888428,
"learning_rate": 0.00029800866434657384,
"loss": 9.3083,
"step": 83500
},
{
"epoch": 13.376,
"grad_norm": 0.1991618573665619,
"learning_rate": 0.00029800626425057,
"loss": 9.59,
"step": 83600
},
{
"epoch": 13.392,
"grad_norm": 0.17260773479938507,
"learning_rate": 0.00029800386415456613,
"loss": 9.9553,
"step": 83700
},
{
"epoch": 13.408,
"grad_norm": 0.13101576268672943,
"learning_rate": 0.0002980014640585623,
"loss": 10.0732,
"step": 83800
},
{
"epoch": 13.424,
"grad_norm": 0.16349157691001892,
"learning_rate": 0.00029799906396255847,
"loss": 9.8363,
"step": 83900
},
{
"epoch": 13.44,
"grad_norm": 0.1792200654745102,
"learning_rate": 0.00029799666386655464,
"loss": 9.9456,
"step": 84000
},
{
"epoch": 13.456,
"grad_norm": 0.13476693630218506,
"learning_rate": 0.0002979942637705508,
"loss": 9.4642,
"step": 84100
},
{
"epoch": 13.472,
"grad_norm": 0.17343075573444366,
"learning_rate": 0.000297991863674547,
"loss": 9.4041,
"step": 84200
},
{
"epoch": 13.488,
"grad_norm": 0.16127794981002808,
"learning_rate": 0.0002979894635785431,
"loss": 9.2465,
"step": 84300
},
{
"epoch": 13.504,
"grad_norm": 0.14993996918201447,
"learning_rate": 0.00029798706348253926,
"loss": 9.5946,
"step": 84400
},
{
"epoch": 13.52,
"grad_norm": 0.21931160986423492,
"learning_rate": 0.0002979846873874955,
"loss": 9.5796,
"step": 84500
},
{
"epoch": 13.536,
"grad_norm": 0.15303994715213776,
"learning_rate": 0.0002979822872914916,
"loss": 9.4222,
"step": 84600
},
{
"epoch": 13.552,
"grad_norm": 0.1905248612165451,
"learning_rate": 0.0002979798871954878,
"loss": 9.4192,
"step": 84700
},
{
"epoch": 13.568,
"grad_norm": 0.17656217515468597,
"learning_rate": 0.00029797748709948395,
"loss": 9.685,
"step": 84800
},
{
"epoch": 13.584,
"grad_norm": 0.31464865803718567,
"learning_rate": 0.0002979750870034801,
"loss": 9.4839,
"step": 84900
},
{
"epoch": 13.6,
"grad_norm": 0.20140250027179718,
"learning_rate": 0.0002979726869074763,
"loss": 9.4393,
"step": 85000
},
{
"epoch": 13.616,
"grad_norm": 0.1453031599521637,
"learning_rate": 0.00029797028681147246,
"loss": 9.4777,
"step": 85100
},
{
"epoch": 13.632,
"grad_norm": 0.15559718012809753,
"learning_rate": 0.0002979678867154686,
"loss": 9.7772,
"step": 85200
},
{
"epoch": 13.648,
"grad_norm": 0.16849826276302338,
"learning_rate": 0.00029796548661946475,
"loss": 9.0954,
"step": 85300
},
{
"epoch": 13.664,
"grad_norm": 0.15798023343086243,
"learning_rate": 0.0002979630865234609,
"loss": 9.7756,
"step": 85400
},
{
"epoch": 13.68,
"grad_norm": 0.0940115824341774,
"learning_rate": 0.0002979606864274571,
"loss": 9.9294,
"step": 85500
},
{
"epoch": 13.696,
"grad_norm": 0.18608032166957855,
"learning_rate": 0.00029795828633145325,
"loss": 9.4524,
"step": 85600
},
{
"epoch": 13.712,
"grad_norm": 0.16172797977924347,
"learning_rate": 0.00029795588623544937,
"loss": 9.6146,
"step": 85700
},
{
"epoch": 13.728,
"grad_norm": 0.1493913233280182,
"learning_rate": 0.00029795348613944554,
"loss": 8.8783,
"step": 85800
},
{
"epoch": 13.744,
"grad_norm": 0.1365765631198883,
"learning_rate": 0.0002979510860434417,
"loss": 9.4707,
"step": 85900
},
{
"epoch": 13.76,
"grad_norm": 0.17814397811889648,
"learning_rate": 0.0002979486859474379,
"loss": 9.4121,
"step": 86000
},
{
"epoch": 13.776,
"grad_norm": 0.16484831273555756,
"learning_rate": 0.00029794628585143405,
"loss": 9.0902,
"step": 86100
},
{
"epoch": 13.792,
"grad_norm": 0.1435382217168808,
"learning_rate": 0.0002979438857554302,
"loss": 9.4565,
"step": 86200
},
{
"epoch": 13.808,
"grad_norm": 0.1451929211616516,
"learning_rate": 0.00029794148565942633,
"loss": 9.6377,
"step": 86300
},
{
"epoch": 13.824,
"grad_norm": 0.1439056396484375,
"learning_rate": 0.0002979390855634225,
"loss": 9.2624,
"step": 86400
},
{
"epoch": 13.84,
"grad_norm": 0.1712324023246765,
"learning_rate": 0.00029793668546741867,
"loss": 9.2021,
"step": 86500
},
{
"epoch": 13.856,
"grad_norm": 0.15382009744644165,
"learning_rate": 0.00029793428537141484,
"loss": 8.8688,
"step": 86600
},
{
"epoch": 13.872,
"grad_norm": 0.14327426254749298,
"learning_rate": 0.000297931885275411,
"loss": 9.2336,
"step": 86700
},
{
"epoch": 13.888,
"grad_norm": 0.21682417392730713,
"learning_rate": 0.0002979294851794071,
"loss": 8.9508,
"step": 86800
},
{
"epoch": 13.904,
"grad_norm": 0.18012550473213196,
"learning_rate": 0.0002979270850834033,
"loss": 8.8259,
"step": 86900
},
{
"epoch": 13.92,
"grad_norm": 0.19224317371845245,
"learning_rate": 0.00029792468498739946,
"loss": 9.0594,
"step": 87000
},
{
"epoch": 13.936,
"grad_norm": 0.14684438705444336,
"learning_rate": 0.00029792228489139563,
"loss": 8.6664,
"step": 87100
},
{
"epoch": 13.952,
"grad_norm": 0.15808767080307007,
"learning_rate": 0.0002979198847953918,
"loss": 8.8133,
"step": 87200
},
{
"epoch": 13.968,
"grad_norm": 0.1466471403837204,
"learning_rate": 0.00029791748469938797,
"loss": 9.2512,
"step": 87300
},
{
"epoch": 13.984,
"grad_norm": 0.13929226994514465,
"learning_rate": 0.0002979150846033841,
"loss": 9.0263,
"step": 87400
},
{
"epoch": 14.0,
"grad_norm": 0.1410779356956482,
"learning_rate": 0.00029791268450738025,
"loss": 9.0906,
"step": 87500
},
{
"epoch": 14.016,
"grad_norm": 0.16633394360542297,
"learning_rate": 0.0002979102844113764,
"loss": 8.8764,
"step": 87600
},
{
"epoch": 14.032,
"grad_norm": 0.19240239262580872,
"learning_rate": 0.0002979078843153726,
"loss": 8.6873,
"step": 87700
},
{
"epoch": 14.048,
"grad_norm": 0.2285340428352356,
"learning_rate": 0.00029790548421936876,
"loss": 8.7636,
"step": 87800
},
{
"epoch": 14.064,
"grad_norm": 0.16399361193180084,
"learning_rate": 0.0002979030841233649,
"loss": 9.3241,
"step": 87900
},
{
"epoch": 14.08,
"grad_norm": 0.14966578781604767,
"learning_rate": 0.00029790068402736104,
"loss": 9.0301,
"step": 88000
},
{
"epoch": 14.096,
"grad_norm": 0.17241202294826508,
"learning_rate": 0.0002978982839313572,
"loss": 8.9678,
"step": 88100
},
{
"epoch": 14.112,
"grad_norm": 0.13520659506320953,
"learning_rate": 0.0002978958838353534,
"loss": 9.0678,
"step": 88200
},
{
"epoch": 14.128,
"grad_norm": 0.15996631979942322,
"learning_rate": 0.00029789348373934955,
"loss": 8.7807,
"step": 88300
},
{
"epoch": 14.144,
"grad_norm": 0.14483466744422913,
"learning_rate": 0.0002978910836433457,
"loss": 8.6088,
"step": 88400
},
{
"epoch": 14.16,
"grad_norm": 0.15150679647922516,
"learning_rate": 0.00029788868354734183,
"loss": 9.2128,
"step": 88500
},
{
"epoch": 14.176,
"grad_norm": 0.1668185293674469,
"learning_rate": 0.0002978863074522981,
"loss": 9.518,
"step": 88600
},
{
"epoch": 14.192,
"grad_norm": 0.17209367454051971,
"learning_rate": 0.00029788393135725427,
"loss": 8.5952,
"step": 88700
},
{
"epoch": 14.208,
"grad_norm": 0.15907296538352966,
"learning_rate": 0.00029788155526221045,
"loss": 8.7632,
"step": 88800
},
{
"epoch": 14.224,
"grad_norm": 0.18298570811748505,
"learning_rate": 0.0002978791551662066,
"loss": 8.8021,
"step": 88900
},
{
"epoch": 14.24,
"grad_norm": 0.19813942909240723,
"learning_rate": 0.0002978767550702028,
"loss": 9.1381,
"step": 89000
},
{
"epoch": 14.256,
"grad_norm": 0.1819518506526947,
"learning_rate": 0.00029787435497419896,
"loss": 9.3086,
"step": 89100
},
{
"epoch": 14.272,
"grad_norm": 0.1506895273923874,
"learning_rate": 0.0002978719548781951,
"loss": 8.7471,
"step": 89200
},
{
"epoch": 14.288,
"grad_norm": 0.1686287224292755,
"learning_rate": 0.00029786955478219125,
"loss": 8.8441,
"step": 89300
},
{
"epoch": 14.304,
"grad_norm": 0.1486745923757553,
"learning_rate": 0.0002978671546861874,
"loss": 9.1216,
"step": 89400
},
{
"epoch": 14.32,
"grad_norm": 0.18762429058551788,
"learning_rate": 0.0002978647545901836,
"loss": 9.402,
"step": 89500
},
{
"epoch": 14.336,
"grad_norm": 0.13964596390724182,
"learning_rate": 0.00029786235449417975,
"loss": 9.2773,
"step": 89600
},
{
"epoch": 14.352,
"grad_norm": 0.2629782557487488,
"learning_rate": 0.00029785995439817587,
"loss": 9.05,
"step": 89700
},
{
"epoch": 14.368,
"grad_norm": 0.12668898701667786,
"learning_rate": 0.00029785755430217204,
"loss": 8.8949,
"step": 89800
},
{
"epoch": 14.384,
"grad_norm": 0.14362965524196625,
"learning_rate": 0.0002978551542061682,
"loss": 8.6261,
"step": 89900
},
{
"epoch": 14.4,
"grad_norm": 0.16700971126556396,
"learning_rate": 0.0002978527541101644,
"loss": 8.8621,
"step": 90000
},
{
"epoch": 14.416,
"grad_norm": 0.1597680300474167,
"learning_rate": 0.00029785035401416055,
"loss": 9.1614,
"step": 90100
},
{
"epoch": 14.432,
"grad_norm": 0.16268526017665863,
"learning_rate": 0.0002978479539181567,
"loss": 9.2429,
"step": 90200
},
{
"epoch": 14.448,
"grad_norm": 0.19829140603542328,
"learning_rate": 0.00029784555382215283,
"loss": 8.6337,
"step": 90300
},
{
"epoch": 14.464,
"grad_norm": 0.1362706571817398,
"learning_rate": 0.000297843153726149,
"loss": 8.5578,
"step": 90400
},
{
"epoch": 14.48,
"grad_norm": 0.17475652694702148,
"learning_rate": 0.00029784075363014517,
"loss": 9.3407,
"step": 90500
},
{
"epoch": 14.496,
"grad_norm": 0.139988973736763,
"learning_rate": 0.00029783835353414134,
"loss": 8.9509,
"step": 90600
},
{
"epoch": 14.512,
"grad_norm": 0.15270425379276276,
"learning_rate": 0.0002978359534381375,
"loss": 8.6833,
"step": 90700
},
{
"epoch": 14.528,
"grad_norm": 0.12172385305166245,
"learning_rate": 0.0002978335533421336,
"loss": 8.1913,
"step": 90800
},
{
"epoch": 14.544,
"grad_norm": 0.18453091382980347,
"learning_rate": 0.0002978311532461298,
"loss": 9.0573,
"step": 90900
},
{
"epoch": 14.56,
"grad_norm": 0.12650534510612488,
"learning_rate": 0.00029782875315012596,
"loss": 8.8951,
"step": 91000
},
{
"epoch": 14.576,
"grad_norm": 0.19508056342601776,
"learning_rate": 0.00029782635305412213,
"loss": 8.8831,
"step": 91100
},
{
"epoch": 14.592,
"grad_norm": 0.12826193869113922,
"learning_rate": 0.0002978239529581183,
"loss": 8.7428,
"step": 91200
},
{
"epoch": 14.608,
"grad_norm": 0.16497032344341278,
"learning_rate": 0.00029782155286211447,
"loss": 9.226,
"step": 91300
},
{
"epoch": 14.624,
"grad_norm": 0.1467789113521576,
"learning_rate": 0.0002978191527661106,
"loss": 8.56,
"step": 91400
},
{
"epoch": 14.64,
"grad_norm": 0.13535846769809723,
"learning_rate": 0.00029781675267010675,
"loss": 9.2005,
"step": 91500
},
{
"epoch": 14.656,
"grad_norm": 0.2261963039636612,
"learning_rate": 0.000297814352574103,
"loss": 8.9913,
"step": 91600
},
{
"epoch": 14.672,
"grad_norm": 0.16329319775104523,
"learning_rate": 0.0002978119524780991,
"loss": 8.8455,
"step": 91700
},
{
"epoch": 14.688,
"grad_norm": 0.14644941687583923,
"learning_rate": 0.00029780955238209526,
"loss": 8.8035,
"step": 91800
},
{
"epoch": 14.704,
"grad_norm": 0.17719560861587524,
"learning_rate": 0.00029780715228609143,
"loss": 8.9548,
"step": 91900
},
{
"epoch": 14.72,
"grad_norm": 0.17204242944717407,
"learning_rate": 0.0002978047521900876,
"loss": 8.9065,
"step": 92000
},
{
"epoch": 14.736,
"grad_norm": 0.15323054790496826,
"learning_rate": 0.00029780235209408377,
"loss": 8.642,
"step": 92100
},
{
"epoch": 14.752,
"grad_norm": 0.12264496088027954,
"learning_rate": 0.00029779995199807994,
"loss": 8.7372,
"step": 92200
},
{
"epoch": 14.768,
"grad_norm": 0.13607698678970337,
"learning_rate": 0.00029779755190207605,
"loss": 8.649,
"step": 92300
},
{
"epoch": 14.784,
"grad_norm": 0.1529749035835266,
"learning_rate": 0.0002977951518060722,
"loss": 8.6928,
"step": 92400
},
{
"epoch": 14.8,
"grad_norm": 0.14829668402671814,
"learning_rate": 0.0002977927517100684,
"loss": 8.2178,
"step": 92500
},
{
"epoch": 14.816,
"grad_norm": 0.15614420175552368,
"learning_rate": 0.00029779035161406456,
"loss": 8.4939,
"step": 92600
},
{
"epoch": 14.832,
"grad_norm": 0.18708457052707672,
"learning_rate": 0.00029778795151806073,
"loss": 8.4044,
"step": 92700
},
{
"epoch": 14.848,
"grad_norm": 0.1700950413942337,
"learning_rate": 0.00029778555142205684,
"loss": 9.142,
"step": 92800
},
{
"epoch": 14.864,
"grad_norm": 0.17176997661590576,
"learning_rate": 0.000297783151326053,
"loss": 8.3459,
"step": 92900
},
{
"epoch": 14.88,
"grad_norm": 0.17668530344963074,
"learning_rate": 0.0002977807512300492,
"loss": 8.4129,
"step": 93000
},
{
"epoch": 14.896,
"grad_norm": 0.13708771765232086,
"learning_rate": 0.00029777835113404535,
"loss": 8.6625,
"step": 93100
},
{
"epoch": 14.912,
"grad_norm": 0.2073189914226532,
"learning_rate": 0.00029777597503900154,
"loss": 8.8295,
"step": 93200
},
{
"epoch": 14.928,
"grad_norm": 0.1584160029888153,
"learning_rate": 0.0002977735749429977,
"loss": 8.2892,
"step": 93300
},
{
"epoch": 14.943999999999999,
"grad_norm": 0.13419002294540405,
"learning_rate": 0.0002977711748469938,
"loss": 8.6564,
"step": 93400
},
{
"epoch": 14.96,
"grad_norm": 0.12294425070285797,
"learning_rate": 0.00029776877475099,
"loss": 8.6937,
"step": 93500
},
{
"epoch": 14.975999999999999,
"grad_norm": 0.12022320926189423,
"learning_rate": 0.00029776637465498616,
"loss": 8.6577,
"step": 93600
},
{
"epoch": 14.992,
"grad_norm": 0.1635560393333435,
"learning_rate": 0.00029776397455898233,
"loss": 8.4075,
"step": 93700
},
{
"epoch": 15.008,
"grad_norm": 0.12280473113059998,
"learning_rate": 0.0002977615744629785,
"loss": 8.3065,
"step": 93800
},
{
"epoch": 15.024,
"grad_norm": 0.14091894030570984,
"learning_rate": 0.0002977591743669746,
"loss": 8.3845,
"step": 93900
},
{
"epoch": 15.04,
"grad_norm": 0.16942408680915833,
"learning_rate": 0.0002977567742709708,
"loss": 8.2751,
"step": 94000
},
{
"epoch": 15.056,
"grad_norm": 0.1858222782611847,
"learning_rate": 0.00029775437417496695,
"loss": 8.5152,
"step": 94100
},
{
"epoch": 15.072,
"grad_norm": 0.15426284074783325,
"learning_rate": 0.0002977519740789631,
"loss": 8.2321,
"step": 94200
},
{
"epoch": 15.088,
"grad_norm": 0.13960111141204834,
"learning_rate": 0.0002977495739829593,
"loss": 8.4343,
"step": 94300
},
{
"epoch": 15.104,
"grad_norm": 0.1927483230829239,
"learning_rate": 0.00029774717388695546,
"loss": 8.26,
"step": 94400
},
{
"epoch": 15.12,
"grad_norm": 0.15174433588981628,
"learning_rate": 0.0002977447737909516,
"loss": 8.665,
"step": 94500
},
{
"epoch": 15.136,
"grad_norm": 0.14686360955238342,
"learning_rate": 0.00029774237369494774,
"loss": 8.0608,
"step": 94600
},
{
"epoch": 15.152,
"grad_norm": 0.15865716338157654,
"learning_rate": 0.00029773997359894397,
"loss": 8.4204,
"step": 94700
},
{
"epoch": 15.168,
"grad_norm": 0.14913444221019745,
"learning_rate": 0.0002977375735029401,
"loss": 8.5544,
"step": 94800
},
{
"epoch": 15.184,
"grad_norm": 0.12727545201778412,
"learning_rate": 0.00029773517340693625,
"loss": 7.9671,
"step": 94900
},
{
"epoch": 15.2,
"grad_norm": 0.18612131476402283,
"learning_rate": 0.0002977327733109324,
"loss": 8.5797,
"step": 95000
},
{
"epoch": 15.216,
"grad_norm": 0.1876545250415802,
"learning_rate": 0.0002977303732149286,
"loss": 8.3126,
"step": 95100
},
{
"epoch": 15.232,
"grad_norm": 0.45961084961891174,
"learning_rate": 0.00029772797311892476,
"loss": 8.772,
"step": 95200
},
{
"epoch": 15.248,
"grad_norm": 0.16763293743133545,
"learning_rate": 0.00029772557302292093,
"loss": 8.6089,
"step": 95300
},
{
"epoch": 15.264,
"grad_norm": 0.17058174312114716,
"learning_rate": 0.00029772317292691704,
"loss": 8.5425,
"step": 95400
},
{
"epoch": 15.28,
"grad_norm": 0.17006829380989075,
"learning_rate": 0.0002977207728309132,
"loss": 8.8057,
"step": 95500
},
{
"epoch": 15.296,
"grad_norm": 0.09077399969100952,
"learning_rate": 0.0002977183727349094,
"loss": 8.343,
"step": 95600
},
{
"epoch": 15.312,
"grad_norm": 0.0950964093208313,
"learning_rate": 0.00029771599663986557,
"loss": 8.3518,
"step": 95700
},
{
"epoch": 15.328,
"grad_norm": 0.14622962474822998,
"learning_rate": 0.00029771359654386174,
"loss": 8.1654,
"step": 95800
},
{
"epoch": 15.344,
"grad_norm": 0.16222132742404938,
"learning_rate": 0.00029771119644785785,
"loss": 8.6123,
"step": 95900
},
{
"epoch": 15.36,
"grad_norm": 0.13185660541057587,
"learning_rate": 0.000297708796351854,
"loss": 8.6665,
"step": 96000
},
{
"epoch": 15.376,
"grad_norm": 0.1910812258720398,
"learning_rate": 0.0002977063962558502,
"loss": 8.2323,
"step": 96100
},
{
"epoch": 15.392,
"grad_norm": 0.18493321537971497,
"learning_rate": 0.00029770399615984636,
"loss": 8.2076,
"step": 96200
},
{
"epoch": 15.408,
"grad_norm": 0.15737323462963104,
"learning_rate": 0.00029770159606384253,
"loss": 8.4031,
"step": 96300
},
{
"epoch": 15.424,
"grad_norm": 0.1808168590068817,
"learning_rate": 0.0002976991959678387,
"loss": 8.0816,
"step": 96400
},
{
"epoch": 15.44,
"grad_norm": 0.12530648708343506,
"learning_rate": 0.0002976967958718348,
"loss": 8.0609,
"step": 96500
},
{
"epoch": 15.456,
"grad_norm": 0.12963543832302094,
"learning_rate": 0.000297694395775831,
"loss": 8.092,
"step": 96600
},
{
"epoch": 15.472,
"grad_norm": 0.1329260617494583,
"learning_rate": 0.00029769199567982715,
"loss": 8.4219,
"step": 96700
},
{
"epoch": 15.488,
"grad_norm": 0.1603865921497345,
"learning_rate": 0.0002976895955838233,
"loss": 7.8878,
"step": 96800
},
{
"epoch": 15.504,
"grad_norm": 0.16902674734592438,
"learning_rate": 0.0002976871954878195,
"loss": 8.2197,
"step": 96900
},
{
"epoch": 15.52,
"grad_norm": 0.15807543694972992,
"learning_rate": 0.0002976847953918156,
"loss": 7.937,
"step": 97000
},
{
"epoch": 15.536,
"grad_norm": 0.15132875740528107,
"learning_rate": 0.0002976823952958118,
"loss": 8.6177,
"step": 97100
},
{
"epoch": 15.552,
"grad_norm": 0.1347590982913971,
"learning_rate": 0.00029767999519980795,
"loss": 8.7107,
"step": 97200
},
{
"epoch": 15.568,
"grad_norm": 0.16151072084903717,
"learning_rate": 0.0002976775951038041,
"loss": 8.4782,
"step": 97300
},
{
"epoch": 15.584,
"grad_norm": 0.194889098405838,
"learning_rate": 0.0002976751950078003,
"loss": 8.128,
"step": 97400
},
{
"epoch": 15.6,
"grad_norm": 0.18148979544639587,
"learning_rate": 0.00029767279491179645,
"loss": 8.3591,
"step": 97500
},
{
"epoch": 15.616,
"grad_norm": 0.1610337197780609,
"learning_rate": 0.00029767039481579257,
"loss": 8.8492,
"step": 97600
},
{
"epoch": 15.632,
"grad_norm": 0.15079425275325775,
"learning_rate": 0.00029766799471978874,
"loss": 8.2512,
"step": 97700
},
{
"epoch": 15.648,
"grad_norm": 0.1274147629737854,
"learning_rate": 0.0002976655946237849,
"loss": 8.2239,
"step": 97800
},
{
"epoch": 15.664,
"grad_norm": 0.14330662786960602,
"learning_rate": 0.0002976631945277811,
"loss": 8.3046,
"step": 97900
},
{
"epoch": 15.68,
"grad_norm": 0.17394746840000153,
"learning_rate": 0.00029766079443177725,
"loss": 8.2542,
"step": 98000
},
{
"epoch": 15.696,
"grad_norm": 0.15639960765838623,
"learning_rate": 0.0002976583943357734,
"loss": 8.3993,
"step": 98100
},
{
"epoch": 15.712,
"grad_norm": 0.12845559418201447,
"learning_rate": 0.0002976559942397696,
"loss": 8.2055,
"step": 98200
},
{
"epoch": 15.728,
"grad_norm": 0.1673252284526825,
"learning_rate": 0.00029765359414376575,
"loss": 8.2969,
"step": 98300
},
{
"epoch": 15.744,
"grad_norm": 0.12345835566520691,
"learning_rate": 0.0002976511940477619,
"loss": 8.4381,
"step": 98400
},
{
"epoch": 15.76,
"grad_norm": 0.19648896157741547,
"learning_rate": 0.00029764879395175804,
"loss": 8.0932,
"step": 98500
},
{
"epoch": 15.776,
"grad_norm": 0.14960013329982758,
"learning_rate": 0.0002976463938557542,
"loss": 8.4303,
"step": 98600
},
{
"epoch": 15.792,
"grad_norm": 0.19554351270198822,
"learning_rate": 0.0002976439937597504,
"loss": 8.0159,
"step": 98700
},
{
"epoch": 15.808,
"grad_norm": 0.1545807123184204,
"learning_rate": 0.00029764159366374654,
"loss": 8.0277,
"step": 98800
},
{
"epoch": 15.824,
"grad_norm": 0.11705837398767471,
"learning_rate": 0.0002976391935677427,
"loss": 8.2474,
"step": 98900
},
{
"epoch": 15.84,
"grad_norm": 0.16222915053367615,
"learning_rate": 0.00029763679347173883,
"loss": 7.8129,
"step": 99000
},
{
"epoch": 15.856,
"grad_norm": 0.18901053071022034,
"learning_rate": 0.000297634393375735,
"loss": 8.3068,
"step": 99100
},
{
"epoch": 15.872,
"grad_norm": 0.13031688332557678,
"learning_rate": 0.00029763199327973117,
"loss": 8.1526,
"step": 99200
},
{
"epoch": 15.888,
"grad_norm": 0.17539045214653015,
"learning_rate": 0.00029762959318372734,
"loss": 7.7545,
"step": 99300
},
{
"epoch": 15.904,
"grad_norm": null,
"learning_rate": 0.0002976271930877235,
"loss": 7.8745,
"step": 99400
},
{
"epoch": 15.92,
"grad_norm": 0.17992717027664185,
"learning_rate": 0.0002976248169926797,
"loss": 7.9663,
"step": 99500
},
{
"epoch": 15.936,
"grad_norm": 0.40667879581451416,
"learning_rate": 0.0002976224168966758,
"loss": 8.1505,
"step": 99600
},
{
"epoch": 15.952,
"grad_norm": 0.15805494785308838,
"learning_rate": 0.000297620016800672,
"loss": 8.4417,
"step": 99700
},
{
"epoch": 15.968,
"grad_norm": 0.16626039147377014,
"learning_rate": 0.00029761761670466815,
"loss": 8.2951,
"step": 99800
},
{
"epoch": 15.984,
"grad_norm": 0.14239948987960815,
"learning_rate": 0.0002976152166086643,
"loss": 8.3205,
"step": 99900
},
{
"epoch": 16.0,
"grad_norm": 0.24553033709526062,
"learning_rate": 0.0002976128165126605,
"loss": 8.2056,
"step": 100000
},
{
"epoch": 16.016,
"grad_norm": 0.18159309029579163,
"learning_rate": 0.0002976104164166566,
"loss": 7.9151,
"step": 100100
},
{
"epoch": 16.032,
"grad_norm": 0.16968666017055511,
"learning_rate": 0.00029760801632065277,
"loss": 7.8903,
"step": 100200
},
{
"epoch": 16.048,
"grad_norm": 0.1661410927772522,
"learning_rate": 0.00029760561622464894,
"loss": 8.3051,
"step": 100300
},
{
"epoch": 16.064,
"grad_norm": 0.1526879370212555,
"learning_rate": 0.0002976032161286451,
"loss": 7.8435,
"step": 100400
},
{
"epoch": 16.08,
"grad_norm": 0.14917099475860596,
"learning_rate": 0.0002976008160326413,
"loss": 8.0571,
"step": 100500
},
{
"epoch": 16.096,
"grad_norm": 0.15157845616340637,
"learning_rate": 0.00029759841593663745,
"loss": 8.0002,
"step": 100600
},
{
"epoch": 16.112,
"grad_norm": 0.1487221121788025,
"learning_rate": 0.00029759601584063356,
"loss": 7.864,
"step": 100700
},
{
"epoch": 16.128,
"grad_norm": 0.1397908627986908,
"learning_rate": 0.00029759361574462973,
"loss": 8.0639,
"step": 100800
},
{
"epoch": 16.144,
"grad_norm": 0.1495772898197174,
"learning_rate": 0.0002975912156486259,
"loss": 7.8346,
"step": 100900
},
{
"epoch": 16.16,
"grad_norm": 0.17440412938594818,
"learning_rate": 0.00029758881555262207,
"loss": 8.1732,
"step": 101000
},
{
"epoch": 16.176,
"grad_norm": 0.15802791714668274,
"learning_rate": 0.00029758641545661824,
"loss": 7.9528,
"step": 101100
},
{
"epoch": 16.192,
"grad_norm": 0.15488143265247345,
"learning_rate": 0.0002975840153606144,
"loss": 7.8414,
"step": 101200
},
{
"epoch": 16.208,
"grad_norm": 0.1365291178226471,
"learning_rate": 0.0002975816152646106,
"loss": 7.9363,
"step": 101300
},
{
"epoch": 16.224,
"grad_norm": 0.13933680951595306,
"learning_rate": 0.00029757921516860675,
"loss": 7.5429,
"step": 101400
},
{
"epoch": 16.24,
"grad_norm": 0.19280196726322174,
"learning_rate": 0.0002975768150726029,
"loss": 7.913,
"step": 101500
},
{
"epoch": 16.256,
"grad_norm": 0.11700501292943954,
"learning_rate": 0.00029757441497659903,
"loss": 8.0237,
"step": 101600
},
{
"epoch": 16.272,
"grad_norm": 0.16518530249595642,
"learning_rate": 0.0002975720388815552,
"loss": 7.8771,
"step": 101700
},
{
"epoch": 16.288,
"grad_norm": 0.14215916395187378,
"learning_rate": 0.0002975696387855514,
"loss": 8.2513,
"step": 101800
},
{
"epoch": 16.304,
"grad_norm": 0.15119720995426178,
"learning_rate": 0.00029756723868954756,
"loss": 8.0416,
"step": 101900
},
{
"epoch": 16.32,
"grad_norm": 0.17267923057079315,
"learning_rate": 0.0002975648385935437,
"loss": 7.7183,
"step": 102000
},
{
"epoch": 16.336,
"grad_norm": 0.13659106194972992,
"learning_rate": 0.00029756243849753984,
"loss": 7.6539,
"step": 102100
},
{
"epoch": 16.352,
"grad_norm": 0.13859499990940094,
"learning_rate": 0.000297560038401536,
"loss": 7.9309,
"step": 102200
},
{
"epoch": 16.368,
"grad_norm": 0.16713272035121918,
"learning_rate": 0.0002975576383055322,
"loss": 7.7884,
"step": 102300
},
{
"epoch": 16.384,
"grad_norm": 0.19469381868839264,
"learning_rate": 0.00029755523820952835,
"loss": 7.6944,
"step": 102400
},
{
"epoch": 16.4,
"grad_norm": 0.14082291722297668,
"learning_rate": 0.0002975528381135245,
"loss": 7.5828,
"step": 102500
},
{
"epoch": 16.416,
"grad_norm": 0.12121783196926117,
"learning_rate": 0.0002975504380175207,
"loss": 7.813,
"step": 102600
},
{
"epoch": 16.432,
"grad_norm": 0.22072196006774902,
"learning_rate": 0.0002975480379215168,
"loss": 8.2315,
"step": 102700
},
{
"epoch": 16.448,
"grad_norm": 0.1469603329896927,
"learning_rate": 0.00029754563782551297,
"loss": 8.0137,
"step": 102800
},
{
"epoch": 16.464,
"grad_norm": 0.11437113583087921,
"learning_rate": 0.00029754323772950914,
"loss": 7.3291,
"step": 102900
},
{
"epoch": 16.48,
"grad_norm": 0.17373935878276825,
"learning_rate": 0.0002975408376335053,
"loss": 8.0078,
"step": 103000
},
{
"epoch": 16.496,
"grad_norm": 0.12379905581474304,
"learning_rate": 0.0002975384375375015,
"loss": 8.0724,
"step": 103100
},
{
"epoch": 16.512,
"grad_norm": 0.1540013700723648,
"learning_rate": 0.00029753603744149765,
"loss": 7.6953,
"step": 103200
},
{
"epoch": 16.528,
"grad_norm": 0.21880146861076355,
"learning_rate": 0.00029753363734549376,
"loss": 8.0522,
"step": 103300
},
{
"epoch": 16.544,
"grad_norm": 0.14410023391246796,
"learning_rate": 0.00029753123724948993,
"loss": 8.191,
"step": 103400
},
{
"epoch": 16.56,
"grad_norm": 0.13037148118019104,
"learning_rate": 0.0002975288371534861,
"loss": 7.6117,
"step": 103500
},
{
"epoch": 16.576,
"grad_norm": 0.16236849129199982,
"learning_rate": 0.00029752643705748227,
"loss": 7.9894,
"step": 103600
},
{
"epoch": 16.592,
"grad_norm": 0.1502009928226471,
"learning_rate": 0.00029752403696147844,
"loss": 7.7302,
"step": 103700
},
{
"epoch": 16.608,
"grad_norm": 0.18485447764396667,
"learning_rate": 0.00029752163686547455,
"loss": 7.8743,
"step": 103800
},
{
"epoch": 16.624,
"grad_norm": 0.12873640656471252,
"learning_rate": 0.0002975192367694707,
"loss": 7.6197,
"step": 103900
},
{
"epoch": 16.64,
"grad_norm": 0.11517874896526337,
"learning_rate": 0.0002975168366734669,
"loss": 7.4887,
"step": 104000
},
{
"epoch": 16.656,
"grad_norm": 0.11515144258737564,
"learning_rate": 0.00029751443657746306,
"loss": 7.706,
"step": 104100
},
{
"epoch": 16.672,
"grad_norm": 0.15465959906578064,
"learning_rate": 0.00029751203648145923,
"loss": 7.3052,
"step": 104200
},
{
"epoch": 16.688,
"grad_norm": 0.12962587177753448,
"learning_rate": 0.0002975096603864154,
"loss": 7.8117,
"step": 104300
},
{
"epoch": 16.704,
"grad_norm": 0.18321260809898376,
"learning_rate": 0.0002975072602904116,
"loss": 7.4464,
"step": 104400
},
{
"epoch": 16.72,
"grad_norm": 0.1769808679819107,
"learning_rate": 0.00029750486019440776,
"loss": 7.8639,
"step": 104500
},
{
"epoch": 16.736,
"grad_norm": 0.15869227051734924,
"learning_rate": 0.00029750246009840393,
"loss": 7.7956,
"step": 104600
},
{
"epoch": 16.752,
"grad_norm": 0.12134505808353424,
"learning_rate": 0.00029750006000240004,
"loss": 7.5809,
"step": 104700
},
{
"epoch": 16.768,
"grad_norm": 0.13986830413341522,
"learning_rate": 0.0002974976599063962,
"loss": 7.4372,
"step": 104800
},
{
"epoch": 16.784,
"grad_norm": 0.1761140078306198,
"learning_rate": 0.0002974952598103924,
"loss": 7.7486,
"step": 104900
},
{
"epoch": 16.8,
"grad_norm": 0.13163812458515167,
"learning_rate": 0.00029749285971438855,
"loss": 7.834,
"step": 105000
},
{
"epoch": 16.816,
"grad_norm": 0.1813841462135315,
"learning_rate": 0.0002974904596183847,
"loss": 7.5974,
"step": 105100
},
{
"epoch": 16.832,
"grad_norm": 0.15655750036239624,
"learning_rate": 0.0002974880595223809,
"loss": 7.4437,
"step": 105200
},
{
"epoch": 16.848,
"grad_norm": 0.16123917698860168,
"learning_rate": 0.000297485659426377,
"loss": 7.347,
"step": 105300
},
{
"epoch": 16.864,
"grad_norm": 0.18692290782928467,
"learning_rate": 0.00029748325933037317,
"loss": 7.8658,
"step": 105400
},
{
"epoch": 16.88,
"grad_norm": 0.15913629531860352,
"learning_rate": 0.00029748085923436934,
"loss": 7.9134,
"step": 105500
},
{
"epoch": 16.896,
"grad_norm": 0.1343807876110077,
"learning_rate": 0.0002974784591383655,
"loss": 7.5983,
"step": 105600
},
{
"epoch": 16.912,
"grad_norm": 0.2009182572364807,
"learning_rate": 0.0002974760590423617,
"loss": 7.3442,
"step": 105700
},
{
"epoch": 16.928,
"grad_norm": 0.1569000780582428,
"learning_rate": 0.0002974736589463578,
"loss": 7.5953,
"step": 105800
},
{
"epoch": 16.944,
"grad_norm": 0.1601628214120865,
"learning_rate": 0.00029747125885035396,
"loss": 7.5624,
"step": 105900
},
{
"epoch": 16.96,
"grad_norm": 0.14143775403499603,
"learning_rate": 0.00029746885875435013,
"loss": 7.579,
"step": 106000
},
{
"epoch": 16.976,
"grad_norm": 0.2106146216392517,
"learning_rate": 0.0002974664586583463,
"loss": 7.5958,
"step": 106100
},
{
"epoch": 16.992,
"grad_norm": 0.17329080402851105,
"learning_rate": 0.00029746405856234247,
"loss": 8.0935,
"step": 106200
},
{
"epoch": 17.008,
"grad_norm": 0.19225256145000458,
"learning_rate": 0.00029746165846633864,
"loss": 6.8958,
"step": 106300
},
{
"epoch": 17.024,
"grad_norm": 0.17550058662891388,
"learning_rate": 0.00029745925837033476,
"loss": 7.4002,
"step": 106400
},
{
"epoch": 17.04,
"grad_norm": 0.16778625547885895,
"learning_rate": 0.0002974568582743309,
"loss": 7.698,
"step": 106500
},
{
"epoch": 17.056,
"grad_norm": 0.14647962152957916,
"learning_rate": 0.0002974544581783271,
"loss": 7.5615,
"step": 106600
},
{
"epoch": 17.072,
"grad_norm": 0.15024389326572418,
"learning_rate": 0.00029745205808232326,
"loss": 7.6671,
"step": 106700
},
{
"epoch": 17.088,
"grad_norm": 0.11949127167463303,
"learning_rate": 0.00029744965798631943,
"loss": 7.6843,
"step": 106800
},
{
"epoch": 17.104,
"grad_norm": 0.15480674803256989,
"learning_rate": 0.00029744725789031555,
"loss": 7.9465,
"step": 106900
},
{
"epoch": 17.12,
"grad_norm": 0.14191922545433044,
"learning_rate": 0.0002974448577943117,
"loss": 7.7372,
"step": 107000
},
{
"epoch": 17.136,
"grad_norm": 0.19336700439453125,
"learning_rate": 0.0002974424576983079,
"loss": 7.6904,
"step": 107100
},
{
"epoch": 17.152,
"grad_norm": 0.17240415513515472,
"learning_rate": 0.0002974400576023041,
"loss": 7.4487,
"step": 107200
},
{
"epoch": 17.168,
"grad_norm": 0.135718435049057,
"learning_rate": 0.0002974376575063002,
"loss": 7.5844,
"step": 107300
},
{
"epoch": 17.184,
"grad_norm": 0.13594204187393188,
"learning_rate": 0.0002974352574102964,
"loss": 7.1186,
"step": 107400
},
{
"epoch": 17.2,
"grad_norm": 0.14997251331806183,
"learning_rate": 0.00029743285731429256,
"loss": 7.3525,
"step": 107500
},
{
"epoch": 17.216,
"grad_norm": 0.1264813244342804,
"learning_rate": 0.00029743045721828873,
"loss": 7.8519,
"step": 107600
},
{
"epoch": 17.232,
"grad_norm": 0.16751745343208313,
"learning_rate": 0.0002974280571222849,
"loss": 7.346,
"step": 107700
},
{
"epoch": 17.248,
"grad_norm": 0.196015402674675,
"learning_rate": 0.000297425657026281,
"loss": 7.5401,
"step": 107800
},
{
"epoch": 17.264,
"grad_norm": 0.14854785799980164,
"learning_rate": 0.0002974232569302772,
"loss": 7.3802,
"step": 107900
},
{
"epoch": 17.28,
"grad_norm": 0.1462150365114212,
"learning_rate": 0.00029742085683427335,
"loss": 7.56,
"step": 108000
},
{
"epoch": 17.296,
"grad_norm": 0.18656545877456665,
"learning_rate": 0.0002974184567382695,
"loss": 7.4044,
"step": 108100
},
{
"epoch": 17.312,
"grad_norm": 0.15170492231845856,
"learning_rate": 0.0002974160566422657,
"loss": 7.1246,
"step": 108200
},
{
"epoch": 17.328,
"grad_norm": 0.13659091293811798,
"learning_rate": 0.00029741365654626186,
"loss": 7.5455,
"step": 108300
},
{
"epoch": 17.344,
"grad_norm": 0.1527138650417328,
"learning_rate": 0.000297411256450258,
"loss": 7.5807,
"step": 108400
},
{
"epoch": 17.36,
"grad_norm": 0.15352298319339752,
"learning_rate": 0.00029740885635425415,
"loss": 7.3586,
"step": 108500
},
{
"epoch": 17.376,
"grad_norm": 0.16372795403003693,
"learning_rate": 0.0002974065042601704,
"loss": 7.5309,
"step": 108600
},
{
"epoch": 17.392,
"grad_norm": 0.14718171954154968,
"learning_rate": 0.0002974041041641665,
"loss": 7.7871,
"step": 108700
},
{
"epoch": 17.408,
"grad_norm": 0.13745012879371643,
"learning_rate": 0.0002974017040681627,
"loss": 7.4228,
"step": 108800
},
{
"epoch": 17.424,
"grad_norm": 0.1310426890850067,
"learning_rate": 0.00029739930397215886,
"loss": 6.914,
"step": 108900
},
{
"epoch": 17.44,
"grad_norm": 0.1291857808828354,
"learning_rate": 0.00029739690387615503,
"loss": 7.5163,
"step": 109000
},
{
"epoch": 17.456,
"grad_norm": 0.1615869104862213,
"learning_rate": 0.0002973945037801512,
"loss": 6.9051,
"step": 109100
},
{
"epoch": 17.472,
"grad_norm": 0.11409099400043488,
"learning_rate": 0.00029739210368414737,
"loss": 7.4919,
"step": 109200
},
{
"epoch": 17.488,
"grad_norm": 0.12527474761009216,
"learning_rate": 0.0002973897035881435,
"loss": 7.5104,
"step": 109300
},
{
"epoch": 17.504,
"grad_norm": 0.1936863362789154,
"learning_rate": 0.00029738730349213965,
"loss": 7.1046,
"step": 109400
},
{
"epoch": 17.52,
"grad_norm": 0.12854978442192078,
"learning_rate": 0.0002973849033961358,
"loss": 7.4067,
"step": 109500
},
{
"epoch": 17.536,
"grad_norm": 0.13116727769374847,
"learning_rate": 0.000297382503300132,
"loss": 7.2106,
"step": 109600
},
{
"epoch": 17.552,
"grad_norm": 0.16138528287410736,
"learning_rate": 0.00029738010320412816,
"loss": 7.263,
"step": 109700
},
{
"epoch": 17.568,
"grad_norm": 0.14999186992645264,
"learning_rate": 0.0002973777031081243,
"loss": 7.428,
"step": 109800
},
{
"epoch": 17.584,
"grad_norm": 0.13564202189445496,
"learning_rate": 0.00029737530301212045,
"loss": 7.6592,
"step": 109900
},
{
"epoch": 17.6,
"grad_norm": 0.14535826444625854,
"learning_rate": 0.0002973729029161166,
"loss": 7.2886,
"step": 110000
},
{
"epoch": 17.616,
"grad_norm": 0.13466519117355347,
"learning_rate": 0.0002973705028201128,
"loss": 7.4852,
"step": 110100
},
{
"epoch": 17.632,
"grad_norm": 0.1622999757528305,
"learning_rate": 0.00029736810272410895,
"loss": 7.6437,
"step": 110200
},
{
"epoch": 17.648,
"grad_norm": 0.15417474508285522,
"learning_rate": 0.0002973657026281051,
"loss": 7.4305,
"step": 110300
},
{
"epoch": 17.664,
"grad_norm": 0.1484052836894989,
"learning_rate": 0.00029736330253210124,
"loss": 7.5558,
"step": 110400
},
{
"epoch": 17.68,
"grad_norm": 0.15688396990299225,
"learning_rate": 0.0002973609024360974,
"loss": 7.4349,
"step": 110500
},
{
"epoch": 17.696,
"grad_norm": 0.15338055789470673,
"learning_rate": 0.0002973585023400936,
"loss": 7.2818,
"step": 110600
},
{
"epoch": 17.712,
"grad_norm": 0.1761266142129898,
"learning_rate": 0.00029735610224408974,
"loss": 7.2618,
"step": 110700
},
{
"epoch": 17.728,
"grad_norm": 0.17337530851364136,
"learning_rate": 0.0002973537021480859,
"loss": 7.0263,
"step": 110800
},
{
"epoch": 17.744,
"grad_norm": 0.14693669974803925,
"learning_rate": 0.00029735130205208203,
"loss": 6.9075,
"step": 110900
},
{
"epoch": 17.76,
"grad_norm": 0.14184145629405975,
"learning_rate": 0.00029734892595703827,
"loss": 7.1306,
"step": 111000
},
{
"epoch": 17.776,
"grad_norm": 0.15281623601913452,
"learning_rate": 0.00029734652586103444,
"loss": 6.9965,
"step": 111100
},
{
"epoch": 17.792,
"grad_norm": 0.30168259143829346,
"learning_rate": 0.0002973441257650306,
"loss": 7.3388,
"step": 111200
},
{
"epoch": 17.808,
"grad_norm": 0.15365231037139893,
"learning_rate": 0.0002973417256690267,
"loss": 7.2799,
"step": 111300
},
{
"epoch": 17.824,
"grad_norm": 0.1704150289297104,
"learning_rate": 0.0002973393255730229,
"loss": 7.3031,
"step": 111400
},
{
"epoch": 17.84,
"grad_norm": 0.16025039553642273,
"learning_rate": 0.00029733692547701906,
"loss": 6.9446,
"step": 111500
},
{
"epoch": 17.856,
"grad_norm": 0.14661014080047607,
"learning_rate": 0.00029733452538101523,
"loss": 7.4911,
"step": 111600
},
{
"epoch": 17.872,
"grad_norm": 0.18997499346733093,
"learning_rate": 0.0002973321252850114,
"loss": 7.2489,
"step": 111700
},
{
"epoch": 17.888,
"grad_norm": 0.16025018692016602,
"learning_rate": 0.0002973297251890075,
"loss": 7.4835,
"step": 111800
},
{
"epoch": 17.904,
"grad_norm": 0.19556750357151031,
"learning_rate": 0.0002973273250930037,
"loss": 7.5087,
"step": 111900
},
{
"epoch": 17.92,
"grad_norm": 0.14444762468338013,
"learning_rate": 0.00029732492499699986,
"loss": 7.3942,
"step": 112000
},
{
"epoch": 17.936,
"grad_norm": 0.12939786911010742,
"learning_rate": 0.000297322524900996,
"loss": 7.0694,
"step": 112100
},
{
"epoch": 17.951999999999998,
"grad_norm": 0.1845860481262207,
"learning_rate": 0.0002973201248049922,
"loss": 7.3517,
"step": 112200
},
{
"epoch": 17.968,
"grad_norm": 0.1611936390399933,
"learning_rate": 0.00029731772470898836,
"loss": 7.3119,
"step": 112300
},
{
"epoch": 17.984,
"grad_norm": 0.1410474330186844,
"learning_rate": 0.0002973153246129845,
"loss": 7.1857,
"step": 112400
},
{
"epoch": 18.0,
"grad_norm": 0.14935807883739471,
"learning_rate": 0.00029731292451698065,
"loss": 7.2314,
"step": 112500
},
{
"epoch": 18.016,
"grad_norm": 0.11792614310979843,
"learning_rate": 0.0002973105244209768,
"loss": 7.0182,
"step": 112600
},
{
"epoch": 18.032,
"grad_norm": 0.19907847046852112,
"learning_rate": 0.000297308124324973,
"loss": 7.036,
"step": 112700
},
{
"epoch": 18.048,
"grad_norm": 0.11814866214990616,
"learning_rate": 0.00029730572422896915,
"loss": 7.2484,
"step": 112800
},
{
"epoch": 18.064,
"grad_norm": 0.16914184391498566,
"learning_rate": 0.00029730332413296527,
"loss": 7.0729,
"step": 112900
},
{
"epoch": 18.08,
"grad_norm": 0.11930215358734131,
"learning_rate": 0.00029730092403696144,
"loss": 6.9642,
"step": 113000
},
{
"epoch": 18.096,
"grad_norm": 0.14744411408901215,
"learning_rate": 0.0002972985239409576,
"loss": 7.1132,
"step": 113100
},
{
"epoch": 18.112,
"grad_norm": 0.1400415003299713,
"learning_rate": 0.0002972961238449538,
"loss": 7.1415,
"step": 113200
},
{
"epoch": 18.128,
"grad_norm": 0.1671387106180191,
"learning_rate": 0.00029729374774990997,
"loss": 7.2558,
"step": 113300
},
{
"epoch": 18.144,
"grad_norm": 0.16554495692253113,
"learning_rate": 0.00029729134765390613,
"loss": 6.9987,
"step": 113400
},
{
"epoch": 18.16,
"grad_norm": 0.1383550763130188,
"learning_rate": 0.0002972889475579023,
"loss": 7.0975,
"step": 113500
},
{
"epoch": 18.176,
"grad_norm": 0.1566449999809265,
"learning_rate": 0.0002972865474618985,
"loss": 7.0562,
"step": 113600
},
{
"epoch": 18.192,
"grad_norm": 0.19498635828495026,
"learning_rate": 0.00029728414736589464,
"loss": 6.6165,
"step": 113700
},
{
"epoch": 18.208,
"grad_norm": 0.1640356481075287,
"learning_rate": 0.00029728174726989076,
"loss": 7.1794,
"step": 113800
},
{
"epoch": 18.224,
"grad_norm": 0.11614058166742325,
"learning_rate": 0.0002972793471738869,
"loss": 7.285,
"step": 113900
},
{
"epoch": 18.24,
"grad_norm": 0.15918317437171936,
"learning_rate": 0.0002972769470778831,
"loss": 7.163,
"step": 114000
},
{
"epoch": 18.256,
"grad_norm": 0.1565544754266739,
"learning_rate": 0.00029727454698187926,
"loss": 7.225,
"step": 114100
},
{
"epoch": 18.272,
"grad_norm": 0.17850929498672485,
"learning_rate": 0.00029727214688587543,
"loss": 6.801,
"step": 114200
},
{
"epoch": 18.288,
"grad_norm": 0.11589377373456955,
"learning_rate": 0.0002972697467898716,
"loss": 6.8754,
"step": 114300
},
{
"epoch": 18.304,
"grad_norm": 0.13528980314731598,
"learning_rate": 0.0002972673466938677,
"loss": 7.1785,
"step": 114400
},
{
"epoch": 18.32,
"grad_norm": 0.14462067186832428,
"learning_rate": 0.0002972649465978639,
"loss": 6.7743,
"step": 114500
},
{
"epoch": 18.336,
"grad_norm": 0.11352884024381638,
"learning_rate": 0.0002972625705028201,
"loss": 7.195,
"step": 114600
},
{
"epoch": 18.352,
"grad_norm": 0.15487293899059296,
"learning_rate": 0.00029726017040681624,
"loss": 6.9974,
"step": 114700
},
{
"epoch": 18.368,
"grad_norm": 0.18302305042743683,
"learning_rate": 0.0002972577703108124,
"loss": 7.3688,
"step": 114800
},
{
"epoch": 18.384,
"grad_norm": 0.13732467591762543,
"learning_rate": 0.00029725537021480853,
"loss": 7.1072,
"step": 114900
},
{
"epoch": 18.4,
"grad_norm": 0.16661597788333893,
"learning_rate": 0.0002972529701188047,
"loss": 6.9747,
"step": 115000
},
{
"epoch": 18.416,
"grad_norm": 0.13797527551651,
"learning_rate": 0.00029725057002280087,
"loss": 6.9419,
"step": 115100
},
{
"epoch": 18.432,
"grad_norm": 0.12859782576560974,
"learning_rate": 0.00029724816992679704,
"loss": 6.7853,
"step": 115200
},
{
"epoch": 18.448,
"grad_norm": 0.14815713465213776,
"learning_rate": 0.0002972457698307932,
"loss": 7.2451,
"step": 115300
},
{
"epoch": 18.464,
"grad_norm": 0.17937737703323364,
"learning_rate": 0.0002972433697347894,
"loss": 6.9378,
"step": 115400
},
{
"epoch": 18.48,
"grad_norm": 0.1678260713815689,
"learning_rate": 0.0002972409696387855,
"loss": 7.324,
"step": 115500
},
{
"epoch": 18.496,
"grad_norm": 0.1482672095298767,
"learning_rate": 0.0002972385695427817,
"loss": 6.7464,
"step": 115600
},
{
"epoch": 18.512,
"grad_norm": 0.13717281818389893,
"learning_rate": 0.0002972361694467779,
"loss": 6.9728,
"step": 115700
},
{
"epoch": 18.528,
"grad_norm": 0.16356568038463593,
"learning_rate": 0.000297233769350774,
"loss": 6.4269,
"step": 115800
},
{
"epoch": 18.544,
"grad_norm": 0.11255384981632233,
"learning_rate": 0.00029723136925477017,
"loss": 6.8938,
"step": 115900
},
{
"epoch": 18.56,
"grad_norm": 0.18403998017311096,
"learning_rate": 0.00029722896915876634,
"loss": 7.5852,
"step": 116000
},
{
"epoch": 18.576,
"grad_norm": 0.16399045288562775,
"learning_rate": 0.0002972265690627625,
"loss": 6.8499,
"step": 116100
},
{
"epoch": 18.592,
"grad_norm": 0.1565336287021637,
"learning_rate": 0.0002972241689667587,
"loss": 6.7727,
"step": 116200
},
{
"epoch": 18.608,
"grad_norm": 0.19689014554023743,
"learning_rate": 0.00029722176887075484,
"loss": 7.1385,
"step": 116300
},
{
"epoch": 18.624,
"grad_norm": 0.13252195715904236,
"learning_rate": 0.00029721936877475096,
"loss": 6.6291,
"step": 116400
},
{
"epoch": 18.64,
"grad_norm": 0.12019433081150055,
"learning_rate": 0.00029721696867874713,
"loss": 6.8913,
"step": 116500
},
{
"epoch": 18.656,
"grad_norm": 0.16386528313159943,
"learning_rate": 0.0002972145685827433,
"loss": 6.7989,
"step": 116600
},
{
"epoch": 18.672,
"grad_norm": 0.13716477155685425,
"learning_rate": 0.00029721216848673947,
"loss": 6.6763,
"step": 116700
},
{
"epoch": 18.688,
"grad_norm": 0.13785770535469055,
"learning_rate": 0.00029720976839073564,
"loss": 6.6476,
"step": 116800
},
{
"epoch": 18.704,
"grad_norm": 0.1605842560529709,
"learning_rate": 0.00029720736829473175,
"loss": 6.6566,
"step": 116900
},
{
"epoch": 18.72,
"grad_norm": 0.19339755177497864,
"learning_rate": 0.0002972049681987279,
"loss": 6.9454,
"step": 117000
},
{
"epoch": 18.736,
"grad_norm": 0.14963068068027496,
"learning_rate": 0.0002972025681027241,
"loss": 7.0718,
"step": 117100
},
{
"epoch": 18.752,
"grad_norm": 0.1378934234380722,
"learning_rate": 0.00029720016800672026,
"loss": 6.7582,
"step": 117200
},
{
"epoch": 18.768,
"grad_norm": 0.1546606719493866,
"learning_rate": 0.0002971977679107164,
"loss": 6.9278,
"step": 117300
},
{
"epoch": 18.784,
"grad_norm": 0.13777601718902588,
"learning_rate": 0.0002971953678147126,
"loss": 6.821,
"step": 117400
},
{
"epoch": 18.8,
"grad_norm": 0.1833031326532364,
"learning_rate": 0.0002971929677187087,
"loss": 7.345,
"step": 117500
},
{
"epoch": 18.816,
"grad_norm": 0.13752517104148865,
"learning_rate": 0.0002971905676227049,
"loss": 7.0435,
"step": 117600
},
{
"epoch": 18.832,
"grad_norm": 0.14740273356437683,
"learning_rate": 0.00029718816752670105,
"loss": 7.0617,
"step": 117700
},
{
"epoch": 18.848,
"grad_norm": 0.13207408785820007,
"learning_rate": 0.0002971857674306972,
"loss": 6.9374,
"step": 117800
},
{
"epoch": 18.864,
"grad_norm": 0.14092418551445007,
"learning_rate": 0.0002971833673346934,
"loss": 6.5626,
"step": 117900
},
{
"epoch": 18.88,
"grad_norm": 0.19631852209568024,
"learning_rate": 0.0002971809672386895,
"loss": 7.162,
"step": 118000
},
{
"epoch": 18.896,
"grad_norm": 0.12741628289222717,
"learning_rate": 0.00029717856714268567,
"loss": 6.8316,
"step": 118100
},
{
"epoch": 18.912,
"grad_norm": 0.17144246399402618,
"learning_rate": 0.00029717616704668184,
"loss": 6.5714,
"step": 118200
},
{
"epoch": 18.928,
"grad_norm": 0.1456017643213272,
"learning_rate": 0.000297173766950678,
"loss": 7.1563,
"step": 118300
},
{
"epoch": 18.944,
"grad_norm": 0.17816682159900665,
"learning_rate": 0.0002971713668546742,
"loss": 7.1767,
"step": 118400
},
{
"epoch": 18.96,
"grad_norm": 0.274588942527771,
"learning_rate": 0.00029716896675867035,
"loss": 6.9244,
"step": 118500
},
{
"epoch": 18.976,
"grad_norm": 0.14686717092990875,
"learning_rate": 0.00029716656666266646,
"loss": 6.9108,
"step": 118600
},
{
"epoch": 18.992,
"grad_norm": 0.1549716740846634,
"learning_rate": 0.00029716416656666263,
"loss": 7.1166,
"step": 118700
},
{
"epoch": 19.008,
"grad_norm": 0.24241045117378235,
"learning_rate": 0.0002971617664706588,
"loss": 6.7128,
"step": 118800
},
{
"epoch": 19.024,
"grad_norm": 0.14365893602371216,
"learning_rate": 0.00029715936637465497,
"loss": 6.5973,
"step": 118900
},
{
"epoch": 19.04,
"grad_norm": 0.1771174818277359,
"learning_rate": 0.00029715696627865114,
"loss": 6.8558,
"step": 119000
},
{
"epoch": 19.056,
"grad_norm": 0.1703067272901535,
"learning_rate": 0.00029715456618264726,
"loss": 6.748,
"step": 119100
},
{
"epoch": 19.072,
"grad_norm": 0.1466696858406067,
"learning_rate": 0.0002971521660866434,
"loss": 6.6093,
"step": 119200
},
{
"epoch": 19.088,
"grad_norm": 0.16070063412189484,
"learning_rate": 0.0002971497659906396,
"loss": 6.7417,
"step": 119300
},
{
"epoch": 19.104,
"grad_norm": 0.2056402564048767,
"learning_rate": 0.00029714738989559584,
"loss": 6.4175,
"step": 119400
},
{
"epoch": 19.12,
"grad_norm": 0.207046240568161,
"learning_rate": 0.00029714498979959195,
"loss": 6.9465,
"step": 119500
},
{
"epoch": 19.136,
"grad_norm": 0.12638603150844574,
"learning_rate": 0.0002971425897035881,
"loss": 6.882,
"step": 119600
},
{
"epoch": 19.152,
"grad_norm": 0.17709197103977203,
"learning_rate": 0.0002971401896075843,
"loss": 6.5151,
"step": 119700
},
{
"epoch": 19.168,
"grad_norm": 0.14313985407352448,
"learning_rate": 0.00029713778951158046,
"loss": 6.6897,
"step": 119800
},
{
"epoch": 19.184,
"grad_norm": 0.14212185144424438,
"learning_rate": 0.00029713538941557663,
"loss": 7.0293,
"step": 119900
},
{
"epoch": 19.2,
"grad_norm": 0.14830344915390015,
"learning_rate": 0.00029713298931957274,
"loss": 6.8398,
"step": 120000
},
{
"epoch": 19.216,
"grad_norm": 0.24165965616703033,
"learning_rate": 0.0002971305892235689,
"loss": 6.715,
"step": 120100
},
{
"epoch": 19.232,
"grad_norm": 0.13292773067951202,
"learning_rate": 0.0002971281891275651,
"loss": 6.8165,
"step": 120200
},
{
"epoch": 19.248,
"grad_norm": 0.1639406383037567,
"learning_rate": 0.00029712578903156125,
"loss": 6.9099,
"step": 120300
},
{
"epoch": 19.264,
"grad_norm": 0.18321408331394196,
"learning_rate": 0.0002971233889355574,
"loss": 6.4805,
"step": 120400
},
{
"epoch": 19.28,
"grad_norm": 0.18382756412029266,
"learning_rate": 0.0002971209888395536,
"loss": 6.8172,
"step": 120500
},
{
"epoch": 19.296,
"grad_norm": 0.15303823351860046,
"learning_rate": 0.0002971185887435497,
"loss": 6.2661,
"step": 120600
},
{
"epoch": 19.312,
"grad_norm": 0.1740507036447525,
"learning_rate": 0.0002971161886475459,
"loss": 6.6127,
"step": 120700
},
{
"epoch": 19.328,
"grad_norm": 0.14414259791374207,
"learning_rate": 0.00029711378855154204,
"loss": 6.4442,
"step": 120800
},
{
"epoch": 19.344,
"grad_norm": 0.14647360146045685,
"learning_rate": 0.0002971113884555382,
"loss": 6.6076,
"step": 120900
},
{
"epoch": 19.36,
"grad_norm": 0.15991808474063873,
"learning_rate": 0.0002971089883595344,
"loss": 6.787,
"step": 121000
},
{
"epoch": 19.376,
"grad_norm": 0.1332535594701767,
"learning_rate": 0.0002971065882635305,
"loss": 6.7092,
"step": 121100
},
{
"epoch": 19.392,
"grad_norm": 0.14746126532554626,
"learning_rate": 0.00029710418816752667,
"loss": 6.7574,
"step": 121200
},
{
"epoch": 19.408,
"grad_norm": 0.13268060982227325,
"learning_rate": 0.00029710178807152283,
"loss": 6.4729,
"step": 121300
},
{
"epoch": 19.424,
"grad_norm": 0.18852052092552185,
"learning_rate": 0.000297099387975519,
"loss": 6.7246,
"step": 121400
},
{
"epoch": 19.44,
"grad_norm": 0.20590665936470032,
"learning_rate": 0.00029709698787951517,
"loss": 6.7032,
"step": 121500
},
{
"epoch": 19.456,
"grad_norm": 0.18409046530723572,
"learning_rate": 0.00029709458778351134,
"loss": 6.9088,
"step": 121600
},
{
"epoch": 19.472,
"grad_norm": 0.1330518126487732,
"learning_rate": 0.00029709218768750746,
"loss": 6.7912,
"step": 121700
},
{
"epoch": 19.488,
"grad_norm": 0.17881762981414795,
"learning_rate": 0.0002970897875915036,
"loss": 6.6976,
"step": 121800
},
{
"epoch": 19.504,
"grad_norm": 0.1952984780073166,
"learning_rate": 0.0002970873874954998,
"loss": 6.6684,
"step": 121900
},
{
"epoch": 19.52,
"grad_norm": 0.10283193737268448,
"learning_rate": 0.00029708498739949596,
"loss": 6.8239,
"step": 122000
},
{
"epoch": 19.536,
"grad_norm": 0.14318746328353882,
"learning_rate": 0.00029708258730349213,
"loss": 6.3829,
"step": 122100
},
{
"epoch": 19.552,
"grad_norm": 0.27563196420669556,
"learning_rate": 0.00029708018720748825,
"loss": 6.5011,
"step": 122200
},
{
"epoch": 19.568,
"grad_norm": 0.22338111698627472,
"learning_rate": 0.0002970777871114844,
"loss": 6.5485,
"step": 122300
},
{
"epoch": 19.584,
"grad_norm": 0.12649616599082947,
"learning_rate": 0.0002970753870154806,
"loss": 6.7374,
"step": 122400
},
{
"epoch": 19.6,
"grad_norm": 0.15860269963741302,
"learning_rate": 0.00029707298691947676,
"loss": 6.3596,
"step": 122500
},
{
"epoch": 19.616,
"grad_norm": 0.12358345836400986,
"learning_rate": 0.00029707061082443294,
"loss": 6.3242,
"step": 122600
},
{
"epoch": 19.632,
"grad_norm": 0.16506068408489227,
"learning_rate": 0.0002970682107284291,
"loss": 6.5935,
"step": 122700
},
{
"epoch": 19.648,
"grad_norm": 0.19951657950878143,
"learning_rate": 0.0002970658106324253,
"loss": 6.4781,
"step": 122800
},
{
"epoch": 19.664,
"grad_norm": 0.16879688203334808,
"learning_rate": 0.00029706341053642145,
"loss": 6.4468,
"step": 122900
},
{
"epoch": 19.68,
"grad_norm": 0.14565648138523102,
"learning_rate": 0.0002970610104404176,
"loss": 6.635,
"step": 123000
},
{
"epoch": 19.696,
"grad_norm": 0.12739145755767822,
"learning_rate": 0.00029705861034441374,
"loss": 6.7823,
"step": 123100
},
{
"epoch": 19.712,
"grad_norm": 0.1428256332874298,
"learning_rate": 0.0002970562102484099,
"loss": 6.3011,
"step": 123200
},
{
"epoch": 19.728,
"grad_norm": 0.1541672646999359,
"learning_rate": 0.0002970538101524061,
"loss": 6.93,
"step": 123300
},
{
"epoch": 19.744,
"grad_norm": 0.14009244740009308,
"learning_rate": 0.00029705141005640224,
"loss": 6.4553,
"step": 123400
},
{
"epoch": 19.76,
"grad_norm": 0.1925840973854065,
"learning_rate": 0.0002970490099603984,
"loss": 6.812,
"step": 123500
},
{
"epoch": 19.776,
"grad_norm": 0.1624009907245636,
"learning_rate": 0.0002970466098643946,
"loss": 6.644,
"step": 123600
},
{
"epoch": 19.792,
"grad_norm": 0.12902632355690002,
"learning_rate": 0.0002970442097683907,
"loss": 6.8444,
"step": 123700
},
{
"epoch": 19.808,
"grad_norm": 0.1572074443101883,
"learning_rate": 0.00029704180967238687,
"loss": 6.8285,
"step": 123800
},
{
"epoch": 19.824,
"grad_norm": 0.17196834087371826,
"learning_rate": 0.00029703940957638304,
"loss": 6.318,
"step": 123900
},
{
"epoch": 19.84,
"grad_norm": 0.14329147338867188,
"learning_rate": 0.0002970370094803792,
"loss": 6.5197,
"step": 124000
},
{
"epoch": 19.856,
"grad_norm": 0.12039805948734283,
"learning_rate": 0.0002970346093843754,
"loss": 6.3033,
"step": 124100
},
{
"epoch": 19.872,
"grad_norm": 0.1786791980266571,
"learning_rate": 0.0002970322092883715,
"loss": 6.669,
"step": 124200
},
{
"epoch": 19.888,
"grad_norm": 0.12987840175628662,
"learning_rate": 0.00029702980919236766,
"loss": 6.2543,
"step": 124300
},
{
"epoch": 19.904,
"grad_norm": 0.12259730696678162,
"learning_rate": 0.00029702740909636383,
"loss": 6.4946,
"step": 124400
},
{
"epoch": 19.92,
"grad_norm": 0.10069935768842697,
"learning_rate": 0.00029702500900036,
"loss": 6.7976,
"step": 124500
},
{
"epoch": 19.936,
"grad_norm": 0.14555324614048004,
"learning_rate": 0.00029702260890435617,
"loss": 6.3994,
"step": 124600
},
{
"epoch": 19.951999999999998,
"grad_norm": 0.15070566534996033,
"learning_rate": 0.00029702020880835234,
"loss": 6.3558,
"step": 124700
},
{
"epoch": 19.968,
"grad_norm": 0.13936389982700348,
"learning_rate": 0.00029701780871234845,
"loss": 6.369,
"step": 124800
},
{
"epoch": 19.984,
"grad_norm": 0.20414897799491882,
"learning_rate": 0.0002970154086163446,
"loss": 6.4591,
"step": 124900
},
{
"epoch": 20.0,
"grad_norm": 0.17090056836605072,
"learning_rate": 0.0002970130085203408,
"loss": 6.6428,
"step": 125000
},
{
"epoch": 20.016,
"grad_norm": 0.13628321886062622,
"learning_rate": 0.00029701060842433696,
"loss": 6.6142,
"step": 125100
},
{
"epoch": 20.032,
"grad_norm": 0.1602114588022232,
"learning_rate": 0.0002970082083283331,
"loss": 6.2906,
"step": 125200
},
{
"epoch": 20.048,
"grad_norm": 0.16529148817062378,
"learning_rate": 0.00029700580823232924,
"loss": 6.32,
"step": 125300
},
{
"epoch": 20.064,
"grad_norm": 0.09591558575630188,
"learning_rate": 0.0002970034081363254,
"loss": 6.5236,
"step": 125400
},
{
"epoch": 20.08,
"grad_norm": 0.16209086775779724,
"learning_rate": 0.0002970010080403216,
"loss": 6.0982,
"step": 125500
},
{
"epoch": 20.096,
"grad_norm": 0.14823907613754272,
"learning_rate": 0.00029699860794431775,
"loss": 6.5177,
"step": 125600
},
{
"epoch": 20.112,
"grad_norm": 0.14667312800884247,
"learning_rate": 0.0002969962078483139,
"loss": 6.2496,
"step": 125700
},
{
"epoch": 20.128,
"grad_norm": 0.14101973176002502,
"learning_rate": 0.0002969938077523101,
"loss": 6.4982,
"step": 125800
},
{
"epoch": 20.144,
"grad_norm": 0.15947328507900238,
"learning_rate": 0.0002969914076563062,
"loss": 6.2799,
"step": 125900
},
{
"epoch": 20.16,
"grad_norm": 0.1501172035932541,
"learning_rate": 0.00029698900756030237,
"loss": 6.3317,
"step": 126000
},
{
"epoch": 20.176,
"grad_norm": 0.15825922787189484,
"learning_rate": 0.00029698660746429854,
"loss": 6.2838,
"step": 126100
},
{
"epoch": 20.192,
"grad_norm": 0.14270856976509094,
"learning_rate": 0.00029698423136925473,
"loss": 6.2077,
"step": 126200
},
{
"epoch": 20.208,
"grad_norm": 0.1994931846857071,
"learning_rate": 0.0002969818312732509,
"loss": 6.3276,
"step": 126300
},
{
"epoch": 20.224,
"grad_norm": 0.2308851182460785,
"learning_rate": 0.00029697943117724707,
"loss": 6.3211,
"step": 126400
},
{
"epoch": 20.24,
"grad_norm": 0.21615839004516602,
"learning_rate": 0.00029697703108124324,
"loss": 6.2481,
"step": 126500
},
{
"epoch": 20.256,
"grad_norm": 0.14972296357154846,
"learning_rate": 0.0002969746309852394,
"loss": 6.3543,
"step": 126600
},
{
"epoch": 20.272,
"grad_norm": 0.164517343044281,
"learning_rate": 0.0002969722308892356,
"loss": 6.3991,
"step": 126700
},
{
"epoch": 20.288,
"grad_norm": 0.15623216331005096,
"learning_rate": 0.0002969698307932317,
"loss": 6.6786,
"step": 126800
},
{
"epoch": 20.304,
"grad_norm": 0.1451660692691803,
"learning_rate": 0.00029696743069722786,
"loss": 6.2966,
"step": 126900
},
{
"epoch": 20.32,
"grad_norm": 0.17200326919555664,
"learning_rate": 0.00029696503060122403,
"loss": 6.4685,
"step": 127000
},
{
"epoch": 20.336,
"grad_norm": 0.15096783638000488,
"learning_rate": 0.0002969626305052202,
"loss": 6.2486,
"step": 127100
},
{
"epoch": 20.352,
"grad_norm": 0.14257729053497314,
"learning_rate": 0.00029696023040921637,
"loss": 6.2078,
"step": 127200
},
{
"epoch": 20.368,
"grad_norm": 0.21399612724781036,
"learning_rate": 0.0002969578303132125,
"loss": 6.0766,
"step": 127300
},
{
"epoch": 20.384,
"grad_norm": 0.11737848818302155,
"learning_rate": 0.00029695543021720865,
"loss": 6.3663,
"step": 127400
},
{
"epoch": 20.4,
"grad_norm": 0.13575823605060577,
"learning_rate": 0.0002969530301212048,
"loss": 6.202,
"step": 127500
},
{
"epoch": 20.416,
"grad_norm": 0.15899422764778137,
"learning_rate": 0.000296950630025201,
"loss": 6.0727,
"step": 127600
},
{
"epoch": 20.432,
"grad_norm": 0.18363483250141144,
"learning_rate": 0.00029694822992919716,
"loss": 6.594,
"step": 127700
},
{
"epoch": 20.448,
"grad_norm": 0.1325751096010208,
"learning_rate": 0.00029694582983319333,
"loss": 6.532,
"step": 127800
},
{
"epoch": 20.464,
"grad_norm": 0.13950107991695404,
"learning_rate": 0.00029694342973718944,
"loss": 5.9695,
"step": 127900
},
{
"epoch": 20.48,
"grad_norm": 0.09819541126489639,
"learning_rate": 0.0002969410296411856,
"loss": 6.3775,
"step": 128000
},
{
"epoch": 20.496,
"grad_norm": 0.15788622200489044,
"learning_rate": 0.0002969386295451818,
"loss": 6.5626,
"step": 128100
},
{
"epoch": 20.512,
"grad_norm": 0.1338583081960678,
"learning_rate": 0.00029693622944917795,
"loss": 6.3808,
"step": 128200
},
{
"epoch": 20.528,
"grad_norm": 0.1711709052324295,
"learning_rate": 0.0002969338293531741,
"loss": 6.3297,
"step": 128300
},
{
"epoch": 20.544,
"grad_norm": 0.10356644541025162,
"learning_rate": 0.00029693142925717023,
"loss": 6.2275,
"step": 128400
},
{
"epoch": 20.56,
"grad_norm": 0.17266201972961426,
"learning_rate": 0.0002969290291611664,
"loss": 6.399,
"step": 128500
},
{
"epoch": 20.576,
"grad_norm": 0.1582164466381073,
"learning_rate": 0.0002969266290651626,
"loss": 6.186,
"step": 128600
},
{
"epoch": 20.592,
"grad_norm": 0.15661326050758362,
"learning_rate": 0.00029692422896915874,
"loss": 6.3988,
"step": 128700
},
{
"epoch": 20.608,
"grad_norm": 0.12148367613554001,
"learning_rate": 0.00029692185287411493,
"loss": 6.4026,
"step": 128800
},
{
"epoch": 20.624,
"grad_norm": 0.15861108899116516,
"learning_rate": 0.0002969194527781111,
"loss": 6.1632,
"step": 128900
},
{
"epoch": 20.64,
"grad_norm": 0.21511606872081757,
"learning_rate": 0.00029691705268210727,
"loss": 6.1254,
"step": 129000
},
{
"epoch": 20.656,
"grad_norm": 0.17380183935165405,
"learning_rate": 0.00029691465258610344,
"loss": 5.8979,
"step": 129100
},
{
"epoch": 20.672,
"grad_norm": 0.15295742452144623,
"learning_rate": 0.0002969122524900996,
"loss": 6.1504,
"step": 129200
},
{
"epoch": 20.688,
"grad_norm": 0.14123979210853577,
"learning_rate": 0.0002969098523940957,
"loss": 6.3968,
"step": 129300
},
{
"epoch": 20.704,
"grad_norm": 0.11941767483949661,
"learning_rate": 0.0002969074522980919,
"loss": 6.2761,
"step": 129400
},
{
"epoch": 20.72,
"grad_norm": 0.1716291755437851,
"learning_rate": 0.00029690505220208806,
"loss": 6.1725,
"step": 129500
},
{
"epoch": 20.736,
"grad_norm": 0.10485927015542984,
"learning_rate": 0.00029690265210608423,
"loss": 6.3992,
"step": 129600
},
{
"epoch": 20.752,
"grad_norm": 0.14606288075447083,
"learning_rate": 0.0002969002520100804,
"loss": 6.3221,
"step": 129700
},
{
"epoch": 20.768,
"grad_norm": 0.1599857658147812,
"learning_rate": 0.00029689785191407657,
"loss": 6.4159,
"step": 129800
},
{
"epoch": 20.784,
"grad_norm": 0.1607884019613266,
"learning_rate": 0.0002968954518180727,
"loss": 6.2899,
"step": 129900
},
{
"epoch": 20.8,
"grad_norm": 0.17046970129013062,
"learning_rate": 0.00029689305172206885,
"loss": 6.195,
"step": 130000
},
{
"epoch": 20.816,
"grad_norm": 0.17893536388874054,
"learning_rate": 0.000296890651626065,
"loss": 6.3987,
"step": 130100
},
{
"epoch": 20.832,
"grad_norm": 0.15878397226333618,
"learning_rate": 0.0002968882515300612,
"loss": 6.8826,
"step": 130200
},
{
"epoch": 20.848,
"grad_norm": 0.17702220380306244,
"learning_rate": 0.00029688585143405736,
"loss": 6.4912,
"step": 130300
},
{
"epoch": 20.864,
"grad_norm": 0.1281166672706604,
"learning_rate": 0.0002968834513380535,
"loss": 6.5531,
"step": 130400
},
{
"epoch": 20.88,
"grad_norm": 0.16799704730510712,
"learning_rate": 0.00029688105124204964,
"loss": 5.9929,
"step": 130500
},
{
"epoch": 20.896,
"grad_norm": 0.1236133724451065,
"learning_rate": 0.0002968786511460458,
"loss": 6.0232,
"step": 130600
},
{
"epoch": 20.912,
"grad_norm": 0.1369544267654419,
"learning_rate": 0.000296876251050042,
"loss": 6.5761,
"step": 130700
},
{
"epoch": 20.928,
"grad_norm": 0.13266846537590027,
"learning_rate": 0.00029687385095403815,
"loss": 6.1677,
"step": 130800
},
{
"epoch": 20.944,
"grad_norm": 0.11849372833967209,
"learning_rate": 0.0002968714508580343,
"loss": 6.0787,
"step": 130900
},
{
"epoch": 20.96,
"grad_norm": 0.11395172029733658,
"learning_rate": 0.00029686905076203044,
"loss": 6.2634,
"step": 131000
},
{
"epoch": 20.976,
"grad_norm": 0.11821906268596649,
"learning_rate": 0.0002968666746669866,
"loss": 6.388,
"step": 131100
},
{
"epoch": 20.992,
"grad_norm": 0.12622199952602386,
"learning_rate": 0.00029686427457098285,
"loss": 6.0103,
"step": 131200
},
{
"epoch": 21.008,
"grad_norm": 0.16676801443099976,
"learning_rate": 0.00029686187447497896,
"loss": 5.865,
"step": 131300
},
{
"epoch": 21.024,
"grad_norm": 0.15502384305000305,
"learning_rate": 0.00029685947437897513,
"loss": 6.165,
"step": 131400
},
{
"epoch": 21.04,
"grad_norm": 0.24440471827983856,
"learning_rate": 0.0002968570742829713,
"loss": 5.9314,
"step": 131500
},
{
"epoch": 21.056,
"grad_norm": 0.1315223127603531,
"learning_rate": 0.00029685467418696747,
"loss": 6.0678,
"step": 131600
},
{
"epoch": 21.072,
"grad_norm": 0.1865660399198532,
"learning_rate": 0.00029685227409096364,
"loss": 5.9805,
"step": 131700
},
{
"epoch": 21.088,
"grad_norm": 0.2066924124956131,
"learning_rate": 0.0002968498739949598,
"loss": 6.1499,
"step": 131800
},
{
"epoch": 21.104,
"grad_norm": 0.14284636080265045,
"learning_rate": 0.0002968474738989559,
"loss": 5.7731,
"step": 131900
},
{
"epoch": 21.12,
"grad_norm": 0.15058225393295288,
"learning_rate": 0.0002968450738029521,
"loss": 6.1113,
"step": 132000
},
{
"epoch": 21.136,
"grad_norm": 0.12619538605213165,
"learning_rate": 0.00029684267370694826,
"loss": 5.9437,
"step": 132100
},
{
"epoch": 21.152,
"grad_norm": 0.15766064822673798,
"learning_rate": 0.00029684027361094443,
"loss": 6.2503,
"step": 132200
},
{
"epoch": 21.168,
"grad_norm": 0.14563268423080444,
"learning_rate": 0.0002968378735149406,
"loss": 5.96,
"step": 132300
},
{
"epoch": 21.184,
"grad_norm": 0.14157824218273163,
"learning_rate": 0.0002968354734189367,
"loss": 6.1794,
"step": 132400
},
{
"epoch": 21.2,
"grad_norm": 0.18574143946170807,
"learning_rate": 0.0002968330733229329,
"loss": 6.3155,
"step": 132500
},
{
"epoch": 21.216,
"grad_norm": 0.11855421960353851,
"learning_rate": 0.00029683067322692905,
"loss": 6.4108,
"step": 132600
},
{
"epoch": 21.232,
"grad_norm": 0.12140708416700363,
"learning_rate": 0.0002968282731309252,
"loss": 6.0888,
"step": 132700
},
{
"epoch": 21.248,
"grad_norm": 0.17192867398262024,
"learning_rate": 0.0002968258730349214,
"loss": 6.2884,
"step": 132800
},
{
"epoch": 21.264,
"grad_norm": 0.13360394537448883,
"learning_rate": 0.00029682347293891756,
"loss": 6.1993,
"step": 132900
},
{
"epoch": 21.28,
"grad_norm": 0.16163136065006256,
"learning_rate": 0.0002968210968438737,
"loss": 6.2262,
"step": 133000
},
{
"epoch": 21.296,
"grad_norm": 0.12919676303863525,
"learning_rate": 0.00029681869674786987,
"loss": 5.8,
"step": 133100
},
{
"epoch": 21.312,
"grad_norm": 0.1594499945640564,
"learning_rate": 0.00029681629665186603,
"loss": 5.8055,
"step": 133200
},
{
"epoch": 21.328,
"grad_norm": 0.12262352555990219,
"learning_rate": 0.0002968138965558622,
"loss": 5.6412,
"step": 133300
},
{
"epoch": 21.344,
"grad_norm": 0.16952601075172424,
"learning_rate": 0.0002968114964598584,
"loss": 6.0173,
"step": 133400
},
{
"epoch": 21.36,
"grad_norm": 0.17378447949886322,
"learning_rate": 0.0002968090963638545,
"loss": 5.5105,
"step": 133500
},
{
"epoch": 21.376,
"grad_norm": 0.12117540836334229,
"learning_rate": 0.00029680669626785066,
"loss": 6.5432,
"step": 133600
},
{
"epoch": 21.392,
"grad_norm": 0.15760718286037445,
"learning_rate": 0.0002968042961718468,
"loss": 5.6998,
"step": 133700
},
{
"epoch": 21.408,
"grad_norm": 0.20163291692733765,
"learning_rate": 0.000296801896075843,
"loss": 5.9457,
"step": 133800
},
{
"epoch": 21.424,
"grad_norm": 0.1601804941892624,
"learning_rate": 0.00029679949597983916,
"loss": 5.7331,
"step": 133900
},
{
"epoch": 21.44,
"grad_norm": 0.147283673286438,
"learning_rate": 0.00029679709588383533,
"loss": 6.034,
"step": 134000
},
{
"epoch": 21.456,
"grad_norm": 0.1677253395318985,
"learning_rate": 0.00029679469578783145,
"loss": 6.4454,
"step": 134100
},
{
"epoch": 21.472,
"grad_norm": 0.1402285099029541,
"learning_rate": 0.0002967922956918276,
"loss": 5.9842,
"step": 134200
},
{
"epoch": 21.488,
"grad_norm": 0.185127392411232,
"learning_rate": 0.00029678989559582384,
"loss": 6.0976,
"step": 134300
},
{
"epoch": 21.504,
"grad_norm": 0.17136482894420624,
"learning_rate": 0.00029678749549981996,
"loss": 6.3848,
"step": 134400
},
{
"epoch": 21.52,
"grad_norm": 0.14343611896038055,
"learning_rate": 0.0002967850954038161,
"loss": 6.1087,
"step": 134500
},
{
"epoch": 21.536,
"grad_norm": 0.13721515238285065,
"learning_rate": 0.0002967826953078123,
"loss": 6.0383,
"step": 134600
},
{
"epoch": 21.552,
"grad_norm": 0.13419759273529053,
"learning_rate": 0.00029678029521180846,
"loss": 5.8767,
"step": 134700
},
{
"epoch": 21.568,
"grad_norm": 0.18504373729228973,
"learning_rate": 0.00029677789511580463,
"loss": 6.0607,
"step": 134800
},
{
"epoch": 21.584,
"grad_norm": 0.14880910515785217,
"learning_rate": 0.0002967754950198008,
"loss": 5.9108,
"step": 134900
},
{
"epoch": 21.6,
"grad_norm": 0.13054971396923065,
"learning_rate": 0.0002967730949237969,
"loss": 6.0197,
"step": 135000
},
{
"epoch": 21.616,
"grad_norm": 0.16096660494804382,
"learning_rate": 0.0002967706948277931,
"loss": 5.8114,
"step": 135100
},
{
"epoch": 21.632,
"grad_norm": 0.16552191972732544,
"learning_rate": 0.00029676829473178926,
"loss": 6.2389,
"step": 135200
},
{
"epoch": 21.648,
"grad_norm": 0.13705958425998688,
"learning_rate": 0.0002967658946357854,
"loss": 6.2474,
"step": 135300
},
{
"epoch": 21.664,
"grad_norm": 0.17535176873207092,
"learning_rate": 0.0002967634945397816,
"loss": 6.0806,
"step": 135400
},
{
"epoch": 21.68,
"grad_norm": 0.15185397863388062,
"learning_rate": 0.0002967610944437777,
"loss": 6.2673,
"step": 135500
},
{
"epoch": 21.696,
"grad_norm": 0.1459989696741104,
"learning_rate": 0.0002967586943477739,
"loss": 6.1566,
"step": 135600
},
{
"epoch": 21.712,
"grad_norm": 0.1216706857085228,
"learning_rate": 0.00029675629425177005,
"loss": 5.9801,
"step": 135700
},
{
"epoch": 21.728,
"grad_norm": 0.1349131315946579,
"learning_rate": 0.0002967538941557662,
"loss": 5.8902,
"step": 135800
},
{
"epoch": 21.744,
"grad_norm": 0.14793895184993744,
"learning_rate": 0.0002967514940597624,
"loss": 5.7143,
"step": 135900
},
{
"epoch": 21.76,
"grad_norm": 0.171220600605011,
"learning_rate": 0.00029674909396375855,
"loss": 5.7715,
"step": 136000
},
{
"epoch": 21.776,
"grad_norm": 0.18677209317684174,
"learning_rate": 0.00029674669386775467,
"loss": 5.9996,
"step": 136100
},
{
"epoch": 21.792,
"grad_norm": 0.153004989027977,
"learning_rate": 0.00029674429377175084,
"loss": 6.1678,
"step": 136200
},
{
"epoch": 21.808,
"grad_norm": 0.12716227769851685,
"learning_rate": 0.000296741893675747,
"loss": 5.8525,
"step": 136300
},
{
"epoch": 21.824,
"grad_norm": 0.15531957149505615,
"learning_rate": 0.0002967394935797432,
"loss": 5.703,
"step": 136400
},
{
"epoch": 21.84,
"grad_norm": 0.16813132166862488,
"learning_rate": 0.00029673709348373935,
"loss": 5.7367,
"step": 136500
},
{
"epoch": 21.856,
"grad_norm": 0.1366407722234726,
"learning_rate": 0.0002967346933877355,
"loss": 6.4011,
"step": 136600
},
{
"epoch": 21.872,
"grad_norm": 0.1486620455980301,
"learning_rate": 0.00029673229329173163,
"loss": 6.0592,
"step": 136700
},
{
"epoch": 21.888,
"grad_norm": 0.1474551409482956,
"learning_rate": 0.0002967298931957278,
"loss": 6.1269,
"step": 136800
},
{
"epoch": 21.904,
"grad_norm": 0.1317261904478073,
"learning_rate": 0.00029672749309972397,
"loss": 6.2704,
"step": 136900
},
{
"epoch": 21.92,
"grad_norm": 0.12736591696739197,
"learning_rate": 0.00029672511700468016,
"loss": 5.9018,
"step": 137000
},
{
"epoch": 21.936,
"grad_norm": 0.17512458562850952,
"learning_rate": 0.0002967227169086763,
"loss": 6.1423,
"step": 137100
},
{
"epoch": 21.951999999999998,
"grad_norm": 0.2035478949546814,
"learning_rate": 0.0002967203408136325,
"loss": 5.8421,
"step": 137200
},
{
"epoch": 21.968,
"grad_norm": 0.15790584683418274,
"learning_rate": 0.0002967179407176287,
"loss": 5.6449,
"step": 137300
},
{
"epoch": 21.984,
"grad_norm": 0.13050822913646698,
"learning_rate": 0.00029671554062162485,
"loss": 6.0866,
"step": 137400
},
{
"epoch": 22.0,
"grad_norm": 0.1332990825176239,
"learning_rate": 0.00029671314052562097,
"loss": 5.8362,
"step": 137500
},
{
"epoch": 22.016,
"grad_norm": 0.14409734308719635,
"learning_rate": 0.00029671074042961714,
"loss": 5.7401,
"step": 137600
},
{
"epoch": 22.032,
"grad_norm": 0.1513838768005371,
"learning_rate": 0.0002967083403336133,
"loss": 5.8022,
"step": 137700
},
{
"epoch": 22.048,
"grad_norm": 0.14416912198066711,
"learning_rate": 0.0002967059402376095,
"loss": 5.7687,
"step": 137800
},
{
"epoch": 22.064,
"grad_norm": 0.13069897890090942,
"learning_rate": 0.00029670354014160565,
"loss": 5.7314,
"step": 137900
},
{
"epoch": 22.08,
"grad_norm": 0.15089532732963562,
"learning_rate": 0.0002967011400456018,
"loss": 5.6511,
"step": 138000
},
{
"epoch": 22.096,
"grad_norm": 0.1493406444787979,
"learning_rate": 0.00029669873994959793,
"loss": 5.7553,
"step": 138100
},
{
"epoch": 22.112,
"grad_norm": 0.11403771489858627,
"learning_rate": 0.0002966963398535941,
"loss": 5.8785,
"step": 138200
},
{
"epoch": 22.128,
"grad_norm": 0.1418454647064209,
"learning_rate": 0.00029669393975759027,
"loss": 5.906,
"step": 138300
},
{
"epoch": 22.144,
"grad_norm": 0.14632883667945862,
"learning_rate": 0.00029669153966158644,
"loss": 5.7911,
"step": 138400
},
{
"epoch": 22.16,
"grad_norm": 0.18317896127700806,
"learning_rate": 0.0002966891395655826,
"loss": 5.6022,
"step": 138500
},
{
"epoch": 22.176,
"grad_norm": 0.14640462398529053,
"learning_rate": 0.0002966867394695788,
"loss": 5.6879,
"step": 138600
},
{
"epoch": 22.192,
"grad_norm": 0.11322261393070221,
"learning_rate": 0.0002966843393735749,
"loss": 5.679,
"step": 138700
},
{
"epoch": 22.208,
"grad_norm": 0.14412596821784973,
"learning_rate": 0.00029668193927757106,
"loss": 5.6202,
"step": 138800
},
{
"epoch": 22.224,
"grad_norm": 0.14023444056510925,
"learning_rate": 0.00029667953918156723,
"loss": 6.0133,
"step": 138900
},
{
"epoch": 22.24,
"grad_norm": 0.18092051148414612,
"learning_rate": 0.0002966771390855634,
"loss": 5.6881,
"step": 139000
},
{
"epoch": 22.256,
"grad_norm": 0.13267236948013306,
"learning_rate": 0.00029667473898955957,
"loss": 5.742,
"step": 139100
},
{
"epoch": 22.272,
"grad_norm": 0.1066688597202301,
"learning_rate": 0.0002966723388935557,
"loss": 5.9524,
"step": 139200
},
{
"epoch": 22.288,
"grad_norm": 0.17234094440937042,
"learning_rate": 0.00029666993879755185,
"loss": 6.0385,
"step": 139300
},
{
"epoch": 22.304,
"grad_norm": 0.1593136042356491,
"learning_rate": 0.000296667538701548,
"loss": 5.7894,
"step": 139400
},
{
"epoch": 22.32,
"grad_norm": 0.1161966621875763,
"learning_rate": 0.0002966651386055442,
"loss": 5.6333,
"step": 139500
},
{
"epoch": 22.336,
"grad_norm": 0.16088221967220306,
"learning_rate": 0.00029666273850954036,
"loss": 5.3016,
"step": 139600
},
{
"epoch": 22.352,
"grad_norm": 0.195027694106102,
"learning_rate": 0.00029666033841353653,
"loss": 5.8886,
"step": 139700
},
{
"epoch": 22.368,
"grad_norm": 0.17010509967803955,
"learning_rate": 0.00029665793831753264,
"loss": 5.7462,
"step": 139800
},
{
"epoch": 22.384,
"grad_norm": 0.15900500118732452,
"learning_rate": 0.0002966555382215288,
"loss": 6.1951,
"step": 139900
},
{
"epoch": 22.4,
"grad_norm": 0.20321440696716309,
"learning_rate": 0.000296653138125525,
"loss": 5.8264,
"step": 140000
},
{
"epoch": 22.416,
"grad_norm": 0.21823586523532867,
"learning_rate": 0.00029665073802952115,
"loss": 5.7779,
"step": 140100
},
{
"epoch": 22.432,
"grad_norm": 0.12739881873130798,
"learning_rate": 0.0002966483379335173,
"loss": 5.6477,
"step": 140200
},
{
"epoch": 22.448,
"grad_norm": 0.1288122534751892,
"learning_rate": 0.00029664593783751344,
"loss": 5.5937,
"step": 140300
},
{
"epoch": 22.464,
"grad_norm": 0.12690824270248413,
"learning_rate": 0.0002966435377415096,
"loss": 6.0249,
"step": 140400
},
{
"epoch": 22.48,
"grad_norm": 0.16361913084983826,
"learning_rate": 0.00029664113764550583,
"loss": 5.8957,
"step": 140500
},
{
"epoch": 22.496,
"grad_norm": 0.13729694485664368,
"learning_rate": 0.000296638737549502,
"loss": 5.8405,
"step": 140600
},
{
"epoch": 22.512,
"grad_norm": 0.19917264580726624,
"learning_rate": 0.0002966363374534981,
"loss": 5.9084,
"step": 140700
},
{
"epoch": 22.528,
"grad_norm": 0.15145164728164673,
"learning_rate": 0.0002966339373574943,
"loss": 5.4631,
"step": 140800
},
{
"epoch": 22.544,
"grad_norm": 0.11967241019010544,
"learning_rate": 0.00029663153726149045,
"loss": 5.9098,
"step": 140900
},
{
"epoch": 22.56,
"grad_norm": 0.15000027418136597,
"learning_rate": 0.0002966291371654866,
"loss": 5.7238,
"step": 141000
},
{
"epoch": 22.576,
"grad_norm": 0.16883157193660736,
"learning_rate": 0.0002966267370694828,
"loss": 5.738,
"step": 141100
},
{
"epoch": 22.592,
"grad_norm": 0.13367842137813568,
"learning_rate": 0.0002966243369734789,
"loss": 5.5043,
"step": 141200
},
{
"epoch": 22.608,
"grad_norm": 0.15113677084445953,
"learning_rate": 0.00029662193687747507,
"loss": 5.6651,
"step": 141300
},
{
"epoch": 22.624,
"grad_norm": 0.13519582152366638,
"learning_rate": 0.00029661953678147124,
"loss": 5.9082,
"step": 141400
},
{
"epoch": 22.64,
"grad_norm": 0.15879906713962555,
"learning_rate": 0.0002966171366854674,
"loss": 6.094,
"step": 141500
},
{
"epoch": 22.656,
"grad_norm": 0.16288715600967407,
"learning_rate": 0.0002966147365894636,
"loss": 5.5707,
"step": 141600
},
{
"epoch": 22.672,
"grad_norm": 0.14412395656108856,
"learning_rate": 0.00029661233649345975,
"loss": 5.6827,
"step": 141700
},
{
"epoch": 22.688,
"grad_norm": 0.14847436547279358,
"learning_rate": 0.00029660993639745586,
"loss": 5.4179,
"step": 141800
},
{
"epoch": 22.704,
"grad_norm": 0.13256803154945374,
"learning_rate": 0.00029660753630145203,
"loss": 5.6927,
"step": 141900
},
{
"epoch": 22.72,
"grad_norm": 0.13526926934719086,
"learning_rate": 0.0002966051362054482,
"loss": 5.7505,
"step": 142000
},
{
"epoch": 22.736,
"grad_norm": 0.2226150929927826,
"learning_rate": 0.00029660273610944437,
"loss": 5.6683,
"step": 142100
},
{
"epoch": 22.752,
"grad_norm": 0.12251828610897064,
"learning_rate": 0.00029660033601344054,
"loss": 5.4908,
"step": 142200
},
{
"epoch": 22.768,
"grad_norm": 0.15432491898536682,
"learning_rate": 0.00029659793591743666,
"loss": 5.5662,
"step": 142300
},
{
"epoch": 22.784,
"grad_norm": 0.13890361785888672,
"learning_rate": 0.0002965955358214328,
"loss": 5.6202,
"step": 142400
},
{
"epoch": 22.8,
"grad_norm": 0.10568337142467499,
"learning_rate": 0.000296593135725429,
"loss": 5.7232,
"step": 142500
},
{
"epoch": 22.816,
"grad_norm": 0.14877153933048248,
"learning_rate": 0.00029659073562942516,
"loss": 5.4585,
"step": 142600
},
{
"epoch": 22.832,
"grad_norm": 0.1703936904668808,
"learning_rate": 0.00029658833553342133,
"loss": 5.8294,
"step": 142700
},
{
"epoch": 22.848,
"grad_norm": 0.12574242055416107,
"learning_rate": 0.0002965859594383775,
"loss": 6.0963,
"step": 142800
},
{
"epoch": 22.864,
"grad_norm": 0.1556757390499115,
"learning_rate": 0.00029658355934237364,
"loss": 5.6681,
"step": 142900
},
{
"epoch": 22.88,
"grad_norm": 0.14058822393417358,
"learning_rate": 0.0002965811592463698,
"loss": 5.6148,
"step": 143000
},
{
"epoch": 22.896,
"grad_norm": 0.1746063232421875,
"learning_rate": 0.000296578759150366,
"loss": 5.698,
"step": 143100
},
{
"epoch": 22.912,
"grad_norm": 0.14458870887756348,
"learning_rate": 0.00029657635905436214,
"loss": 5.439,
"step": 143200
},
{
"epoch": 22.928,
"grad_norm": 0.1708308756351471,
"learning_rate": 0.0002965739589583583,
"loss": 5.8077,
"step": 143300
},
{
"epoch": 22.944,
"grad_norm": 0.1382734328508377,
"learning_rate": 0.00029657155886235443,
"loss": 5.603,
"step": 143400
},
{
"epoch": 22.96,
"grad_norm": 0.15728691220283508,
"learning_rate": 0.0002965691587663506,
"loss": 5.8985,
"step": 143500
},
{
"epoch": 22.976,
"grad_norm": 0.12880076467990875,
"learning_rate": 0.00029656675867034677,
"loss": 5.7958,
"step": 143600
},
{
"epoch": 22.992,
"grad_norm": 0.130670964717865,
"learning_rate": 0.000296564358574343,
"loss": 5.6226,
"step": 143700
},
{
"epoch": 23.008,
"grad_norm": 0.1519329994916916,
"learning_rate": 0.0002965619584783391,
"loss": 5.5619,
"step": 143800
},
{
"epoch": 23.024,
"grad_norm": 0.11900737136602402,
"learning_rate": 0.0002965595583823353,
"loss": 5.5148,
"step": 143900
},
{
"epoch": 23.04,
"grad_norm": 0.13805437088012695,
"learning_rate": 0.00029655715828633144,
"loss": 5.1992,
"step": 144000
},
{
"epoch": 23.056,
"grad_norm": 0.15381775796413422,
"learning_rate": 0.0002965547581903276,
"loss": 5.6994,
"step": 144100
},
{
"epoch": 23.072,
"grad_norm": 0.17571000754833221,
"learning_rate": 0.0002965523580943238,
"loss": 5.4076,
"step": 144200
},
{
"epoch": 23.088,
"grad_norm": 0.1299617439508438,
"learning_rate": 0.0002965499579983199,
"loss": 5.5817,
"step": 144300
},
{
"epoch": 23.104,
"grad_norm": 0.1709066480398178,
"learning_rate": 0.00029654755790231607,
"loss": 5.6442,
"step": 144400
},
{
"epoch": 23.12,
"grad_norm": 0.11673315614461899,
"learning_rate": 0.00029654515780631224,
"loss": 5.4461,
"step": 144500
},
{
"epoch": 23.136,
"grad_norm": 0.17694547772407532,
"learning_rate": 0.0002965427577103084,
"loss": 5.4203,
"step": 144600
},
{
"epoch": 23.152,
"grad_norm": 0.1397058516740799,
"learning_rate": 0.0002965403576143046,
"loss": 5.6535,
"step": 144700
},
{
"epoch": 23.168,
"grad_norm": 0.14913706481456757,
"learning_rate": 0.00029653795751830074,
"loss": 5.327,
"step": 144800
},
{
"epoch": 23.184,
"grad_norm": 0.0980440080165863,
"learning_rate": 0.0002965355814232569,
"loss": 5.6265,
"step": 144900
},
{
"epoch": 23.2,
"grad_norm": 0.14519555866718292,
"learning_rate": 0.00029653318132725305,
"loss": 5.5968,
"step": 145000
},
{
"epoch": 23.216,
"grad_norm": 0.14121969044208527,
"learning_rate": 0.0002965307812312492,
"loss": 5.3419,
"step": 145100
},
{
"epoch": 23.232,
"grad_norm": 0.14867204427719116,
"learning_rate": 0.0002965283811352454,
"loss": 5.5432,
"step": 145200
},
{
"epoch": 23.248,
"grad_norm": 0.14526410400867462,
"learning_rate": 0.00029652598103924155,
"loss": 5.4119,
"step": 145300
},
{
"epoch": 23.264,
"grad_norm": 0.16068951785564423,
"learning_rate": 0.00029652358094323767,
"loss": 5.6084,
"step": 145400
},
{
"epoch": 23.28,
"grad_norm": 0.1540200263261795,
"learning_rate": 0.00029652118084723384,
"loss": 5.3346,
"step": 145500
},
{
"epoch": 23.296,
"grad_norm": 0.1306939572095871,
"learning_rate": 0.00029651878075123,
"loss": 5.4401,
"step": 145600
},
{
"epoch": 23.312,
"grad_norm": 0.19503143429756165,
"learning_rate": 0.0002965163806552262,
"loss": 5.5145,
"step": 145700
},
{
"epoch": 23.328,
"grad_norm": 0.16698400676250458,
"learning_rate": 0.00029651398055922235,
"loss": 5.4459,
"step": 145800
},
{
"epoch": 23.344,
"grad_norm": 0.14990036189556122,
"learning_rate": 0.0002965115804632185,
"loss": 5.9844,
"step": 145900
},
{
"epoch": 23.36,
"grad_norm": 0.12152257561683655,
"learning_rate": 0.00029650918036721463,
"loss": 5.4034,
"step": 146000
},
{
"epoch": 23.376,
"grad_norm": 0.12588883936405182,
"learning_rate": 0.0002965067802712108,
"loss": 5.6587,
"step": 146100
},
{
"epoch": 23.392,
"grad_norm": 0.13769680261611938,
"learning_rate": 0.00029650438017520697,
"loss": 5.6661,
"step": 146200
},
{
"epoch": 23.408,
"grad_norm": 0.18270593881607056,
"learning_rate": 0.00029650198007920314,
"loss": 5.4772,
"step": 146300
},
{
"epoch": 23.424,
"grad_norm": 0.16988155245780945,
"learning_rate": 0.0002964995799831993,
"loss": 5.861,
"step": 146400
},
{
"epoch": 23.44,
"grad_norm": 0.15813444554805756,
"learning_rate": 0.0002964971798871954,
"loss": 5.5742,
"step": 146500
},
{
"epoch": 23.456,
"grad_norm": 0.20319218933582306,
"learning_rate": 0.0002964947797911916,
"loss": 5.5046,
"step": 146600
},
{
"epoch": 23.472,
"grad_norm": 0.1794954091310501,
"learning_rate": 0.00029649237969518776,
"loss": 5.4266,
"step": 146700
},
{
"epoch": 23.488,
"grad_norm": 0.18233439326286316,
"learning_rate": 0.000296489979599184,
"loss": 5.7988,
"step": 146800
},
{
"epoch": 23.504,
"grad_norm": 0.24476204812526703,
"learning_rate": 0.0002964875795031801,
"loss": 5.5573,
"step": 146900
},
{
"epoch": 23.52,
"grad_norm": 0.12210160493850708,
"learning_rate": 0.00029648517940717627,
"loss": 5.3991,
"step": 147000
},
{
"epoch": 23.536,
"grad_norm": 0.18380597233772278,
"learning_rate": 0.00029648277931117244,
"loss": 5.7061,
"step": 147100
},
{
"epoch": 23.552,
"grad_norm": 0.14776001870632172,
"learning_rate": 0.0002964803792151686,
"loss": 5.6827,
"step": 147200
},
{
"epoch": 23.568,
"grad_norm": 0.13290056586265564,
"learning_rate": 0.0002964779791191648,
"loss": 5.6598,
"step": 147300
},
{
"epoch": 23.584,
"grad_norm": 0.12878666818141937,
"learning_rate": 0.0002964755790231609,
"loss": 5.4732,
"step": 147400
},
{
"epoch": 23.6,
"grad_norm": 0.11875222623348236,
"learning_rate": 0.00029647317892715706,
"loss": 5.9345,
"step": 147500
},
{
"epoch": 23.616,
"grad_norm": 0.1489972323179245,
"learning_rate": 0.00029647077883115323,
"loss": 5.5631,
"step": 147600
},
{
"epoch": 23.632,
"grad_norm": 0.22594046592712402,
"learning_rate": 0.0002964683787351494,
"loss": 5.2854,
"step": 147700
},
{
"epoch": 23.648,
"grad_norm": 0.14621250331401825,
"learning_rate": 0.00029646597863914557,
"loss": 5.2938,
"step": 147800
},
{
"epoch": 23.664,
"grad_norm": 0.14641734957695007,
"learning_rate": 0.00029646357854314174,
"loss": 5.7265,
"step": 147900
},
{
"epoch": 23.68,
"grad_norm": 0.14452804625034332,
"learning_rate": 0.00029646117844713785,
"loss": 5.3081,
"step": 148000
},
{
"epoch": 23.696,
"grad_norm": 0.1696479767560959,
"learning_rate": 0.000296458778351134,
"loss": 5.7359,
"step": 148100
},
{
"epoch": 23.712,
"grad_norm": 0.1629931777715683,
"learning_rate": 0.0002964563782551302,
"loss": 5.8091,
"step": 148200
},
{
"epoch": 23.728,
"grad_norm": 0.1588413119316101,
"learning_rate": 0.00029645397815912636,
"loss": 5.8185,
"step": 148300
},
{
"epoch": 23.744,
"grad_norm": 0.1528206616640091,
"learning_rate": 0.00029645157806312253,
"loss": 5.6945,
"step": 148400
},
{
"epoch": 23.76,
"grad_norm": 0.16446250677108765,
"learning_rate": 0.00029644917796711864,
"loss": 5.1739,
"step": 148500
},
{
"epoch": 23.776,
"grad_norm": 0.14487922191619873,
"learning_rate": 0.00029644680187207483,
"loss": 5.5836,
"step": 148600
},
{
"epoch": 23.792,
"grad_norm": 0.297879159450531,
"learning_rate": 0.0002964444257770311,
"loss": 5.5247,
"step": 148700
},
{
"epoch": 23.808,
"grad_norm": 0.1171737089753151,
"learning_rate": 0.00029644202568102724,
"loss": 5.3085,
"step": 148800
},
{
"epoch": 23.824,
"grad_norm": 0.1464715600013733,
"learning_rate": 0.00029643962558502336,
"loss": 5.3029,
"step": 148900
},
{
"epoch": 23.84,
"grad_norm": 0.16126649081707,
"learning_rate": 0.0002964372254890195,
"loss": 5.7273,
"step": 149000
},
{
"epoch": 23.856,
"grad_norm": 0.10824692994356155,
"learning_rate": 0.0002964348253930157,
"loss": 5.3296,
"step": 149100
},
{
"epoch": 23.872,
"grad_norm": 0.14661309123039246,
"learning_rate": 0.00029643242529701187,
"loss": 5.828,
"step": 149200
},
{
"epoch": 23.888,
"grad_norm": 0.16918961703777313,
"learning_rate": 0.00029643002520100803,
"loss": 5.359,
"step": 149300
},
{
"epoch": 23.904,
"grad_norm": 0.14028948545455933,
"learning_rate": 0.00029642762510500415,
"loss": 5.5027,
"step": 149400
},
{
"epoch": 23.92,
"grad_norm": 0.15497733652591705,
"learning_rate": 0.0002964252250090003,
"loss": 5.7539,
"step": 149500
},
{
"epoch": 23.936,
"grad_norm": 0.12349986284971237,
"learning_rate": 0.0002964228249129965,
"loss": 5.1582,
"step": 149600
},
{
"epoch": 23.951999999999998,
"grad_norm": 0.1359599381685257,
"learning_rate": 0.00029642042481699266,
"loss": 5.4394,
"step": 149700
},
{
"epoch": 23.968,
"grad_norm": 0.18629401922225952,
"learning_rate": 0.0002964180247209888,
"loss": 5.4743,
"step": 149800
},
{
"epoch": 23.984,
"grad_norm": 0.1438770890235901,
"learning_rate": 0.000296415624624985,
"loss": 5.4707,
"step": 149900
},
{
"epoch": 24.0,
"grad_norm": 0.11876608431339264,
"learning_rate": 0.0002964132245289811,
"loss": 5.2108,
"step": 150000
},
{
"epoch": 24.016,
"grad_norm": 0.1379069685935974,
"learning_rate": 0.0002964108244329773,
"loss": 5.5858,
"step": 150100
},
{
"epoch": 24.032,
"grad_norm": 0.15197959542274475,
"learning_rate": 0.00029640842433697345,
"loss": 5.3452,
"step": 150200
},
{
"epoch": 24.048,
"grad_norm": 0.16093584895133972,
"learning_rate": 0.0002964060242409696,
"loss": 5.1725,
"step": 150300
},
{
"epoch": 24.064,
"grad_norm": 0.14459937810897827,
"learning_rate": 0.0002964036241449658,
"loss": 5.529,
"step": 150400
},
{
"epoch": 24.08,
"grad_norm": 0.15908825397491455,
"learning_rate": 0.0002964012240489619,
"loss": 5.0667,
"step": 150500
},
{
"epoch": 24.096,
"grad_norm": 0.14320479333400726,
"learning_rate": 0.00029639882395295807,
"loss": 5.4541,
"step": 150600
},
{
"epoch": 24.112,
"grad_norm": 0.1382274329662323,
"learning_rate": 0.00029639642385695424,
"loss": 5.4337,
"step": 150700
},
{
"epoch": 24.128,
"grad_norm": 0.09485090523958206,
"learning_rate": 0.0002963940237609504,
"loss": 5.5169,
"step": 150800
},
{
"epoch": 24.144,
"grad_norm": 0.1434488147497177,
"learning_rate": 0.0002963916236649466,
"loss": 5.1838,
"step": 150900
},
{
"epoch": 24.16,
"grad_norm": 0.172550767660141,
"learning_rate": 0.00029638922356894275,
"loss": 5.4995,
"step": 151000
},
{
"epoch": 24.176,
"grad_norm": 0.17296665906906128,
"learning_rate": 0.00029638682347293886,
"loss": 5.3814,
"step": 151100
},
{
"epoch": 24.192,
"grad_norm": 0.13183431327342987,
"learning_rate": 0.00029638442337693503,
"loss": 5.4961,
"step": 151200
},
{
"epoch": 24.208,
"grad_norm": 0.11805009096860886,
"learning_rate": 0.0002963820472818913,
"loss": 5.3575,
"step": 151300
},
{
"epoch": 24.224,
"grad_norm": 0.1694483608007431,
"learning_rate": 0.0002963796471858874,
"loss": 5.4198,
"step": 151400
},
{
"epoch": 24.24,
"grad_norm": 0.14694049954414368,
"learning_rate": 0.00029637724708988356,
"loss": 5.2369,
"step": 151500
},
{
"epoch": 24.256,
"grad_norm": 0.14818693697452545,
"learning_rate": 0.00029637484699387973,
"loss": 5.5989,
"step": 151600
},
{
"epoch": 24.272,
"grad_norm": 0.12142101675271988,
"learning_rate": 0.0002963724468978759,
"loss": 5.5808,
"step": 151700
},
{
"epoch": 24.288,
"grad_norm": 0.1072693020105362,
"learning_rate": 0.00029637004680187207,
"loss": 5.2257,
"step": 151800
},
{
"epoch": 24.304,
"grad_norm": 0.20452247560024261,
"learning_rate": 0.00029636764670586824,
"loss": 4.9512,
"step": 151900
},
{
"epoch": 24.32,
"grad_norm": 0.13785667717456818,
"learning_rate": 0.00029636524660986435,
"loss": 5.3486,
"step": 152000
},
{
"epoch": 24.336,
"grad_norm": 0.16348302364349365,
"learning_rate": 0.0002963628465138605,
"loss": 5.416,
"step": 152100
},
{
"epoch": 24.352,
"grad_norm": 0.12873555719852448,
"learning_rate": 0.0002963604464178567,
"loss": 5.4854,
"step": 152200
},
{
"epoch": 24.368,
"grad_norm": 0.14430370926856995,
"learning_rate": 0.00029635804632185286,
"loss": 5.083,
"step": 152300
},
{
"epoch": 24.384,
"grad_norm": 0.14247077703475952,
"learning_rate": 0.00029635564622584903,
"loss": 5.2926,
"step": 152400
},
{
"epoch": 24.4,
"grad_norm": 0.12942449748516083,
"learning_rate": 0.00029635324612984514,
"loss": 5.2287,
"step": 152500
},
{
"epoch": 24.416,
"grad_norm": 0.1290571689605713,
"learning_rate": 0.0002963508460338413,
"loss": 5.1295,
"step": 152600
},
{
"epoch": 24.432,
"grad_norm": 0.14392858743667603,
"learning_rate": 0.0002963484459378375,
"loss": 5.2795,
"step": 152700
},
{
"epoch": 24.448,
"grad_norm": 0.10403969883918762,
"learning_rate": 0.00029634604584183365,
"loss": 5.4616,
"step": 152800
},
{
"epoch": 24.464,
"grad_norm": 0.1357210874557495,
"learning_rate": 0.0002963436457458298,
"loss": 5.0671,
"step": 152900
},
{
"epoch": 24.48,
"grad_norm": 0.162188321352005,
"learning_rate": 0.000296341245649826,
"loss": 5.1244,
"step": 153000
},
{
"epoch": 24.496,
"grad_norm": 0.1423524171113968,
"learning_rate": 0.0002963388455538221,
"loss": 5.2658,
"step": 153100
},
{
"epoch": 24.512,
"grad_norm": 0.15725597739219666,
"learning_rate": 0.00029633644545781827,
"loss": 5.4486,
"step": 153200
},
{
"epoch": 24.528,
"grad_norm": 0.10184895247220993,
"learning_rate": 0.00029633404536181444,
"loss": 5.1975,
"step": 153300
},
{
"epoch": 24.544,
"grad_norm": 0.11968593299388885,
"learning_rate": 0.0002963316452658106,
"loss": 5.0282,
"step": 153400
},
{
"epoch": 24.56,
"grad_norm": 0.15125450491905212,
"learning_rate": 0.0002963292451698068,
"loss": 5.0548,
"step": 153500
},
{
"epoch": 24.576,
"grad_norm": 0.1498018205165863,
"learning_rate": 0.0002963268450738029,
"loss": 5.2235,
"step": 153600
},
{
"epoch": 24.592,
"grad_norm": 0.14961381256580353,
"learning_rate": 0.00029632444497779906,
"loss": 5.282,
"step": 153700
},
{
"epoch": 24.608,
"grad_norm": 0.10805343836545944,
"learning_rate": 0.00029632204488179523,
"loss": 5.2164,
"step": 153800
},
{
"epoch": 24.624,
"grad_norm": 0.1407497674226761,
"learning_rate": 0.0002963196447857914,
"loss": 5.8793,
"step": 153900
},
{
"epoch": 24.64,
"grad_norm": 0.15589803457260132,
"learning_rate": 0.00029631724468978757,
"loss": 5.2803,
"step": 154000
},
{
"epoch": 24.656,
"grad_norm": 0.15549539029598236,
"learning_rate": 0.00029631484459378374,
"loss": 5.5255,
"step": 154100
},
{
"epoch": 24.672,
"grad_norm": 0.14697429537773132,
"learning_rate": 0.00029631244449777986,
"loss": 5.2088,
"step": 154200
},
{
"epoch": 24.688,
"grad_norm": 0.14445632696151733,
"learning_rate": 0.000296310044401776,
"loss": 5.314,
"step": 154300
},
{
"epoch": 24.704,
"grad_norm": 0.13264203071594238,
"learning_rate": 0.0002963076443057722,
"loss": 5.1363,
"step": 154400
},
{
"epoch": 24.72,
"grad_norm": 0.14595112204551697,
"learning_rate": 0.00029630524420976836,
"loss": 5.1834,
"step": 154500
},
{
"epoch": 24.736,
"grad_norm": 0.15063650906085968,
"learning_rate": 0.00029630284411376453,
"loss": 5.2409,
"step": 154600
},
{
"epoch": 24.752,
"grad_norm": 0.1531144678592682,
"learning_rate": 0.00029630044401776065,
"loss": 5.3414,
"step": 154700
},
{
"epoch": 24.768,
"grad_norm": 0.15418265759944916,
"learning_rate": 0.0002962980439217568,
"loss": 5.3579,
"step": 154800
},
{
"epoch": 24.784,
"grad_norm": 0.13664741814136505,
"learning_rate": 0.000296295643825753,
"loss": 5.4855,
"step": 154900
},
{
"epoch": 24.8,
"grad_norm": 0.15261198580265045,
"learning_rate": 0.00029629324372974916,
"loss": 5.5078,
"step": 155000
},
{
"epoch": 24.816,
"grad_norm": 0.1436208039522171,
"learning_rate": 0.0002962908436337453,
"loss": 5.2359,
"step": 155100
},
{
"epoch": 24.832,
"grad_norm": 0.1557721495628357,
"learning_rate": 0.0002962884435377415,
"loss": 5.1472,
"step": 155200
},
{
"epoch": 24.848,
"grad_norm": 0.1639142483472824,
"learning_rate": 0.0002962860434417376,
"loss": 5.1701,
"step": 155300
},
{
"epoch": 24.864,
"grad_norm": 0.1857120245695114,
"learning_rate": 0.0002962836433457338,
"loss": 5.3149,
"step": 155400
},
{
"epoch": 24.88,
"grad_norm": 0.1384589672088623,
"learning_rate": 0.00029628124324972995,
"loss": 5.1655,
"step": 155500
},
{
"epoch": 24.896,
"grad_norm": 0.16934780776500702,
"learning_rate": 0.0002962788431537261,
"loss": 5.0212,
"step": 155600
},
{
"epoch": 24.912,
"grad_norm": 0.14011263847351074,
"learning_rate": 0.0002962764430577223,
"loss": 5.3506,
"step": 155700
},
{
"epoch": 24.928,
"grad_norm": 0.12232084572315216,
"learning_rate": 0.0002962740429617184,
"loss": 4.9836,
"step": 155800
},
{
"epoch": 24.944,
"grad_norm": 0.1219339519739151,
"learning_rate": 0.00029627164286571457,
"loss": 5.337,
"step": 155900
},
{
"epoch": 24.96,
"grad_norm": 0.13951101899147034,
"learning_rate": 0.0002962692667706708,
"loss": 5.6947,
"step": 156000
},
{
"epoch": 24.976,
"grad_norm": 0.15717874467372894,
"learning_rate": 0.000296266866674667,
"loss": 5.0598,
"step": 156100
},
{
"epoch": 24.992,
"grad_norm": 0.16753438115119934,
"learning_rate": 0.0002962644665786631,
"loss": 5.1918,
"step": 156200
},
{
"epoch": 25.008,
"grad_norm": 0.11955256760120392,
"learning_rate": 0.00029626206648265927,
"loss": 5.1705,
"step": 156300
},
{
"epoch": 25.024,
"grad_norm": 0.11964499950408936,
"learning_rate": 0.00029625966638665544,
"loss": 5.3443,
"step": 156400
},
{
"epoch": 25.04,
"grad_norm": 0.123370461165905,
"learning_rate": 0.0002962572662906516,
"loss": 4.9845,
"step": 156500
},
{
"epoch": 25.056,
"grad_norm": 0.12556427717208862,
"learning_rate": 0.0002962548661946478,
"loss": 4.9369,
"step": 156600
},
{
"epoch": 25.072,
"grad_norm": 0.15033285319805145,
"learning_rate": 0.0002962524660986439,
"loss": 5.1891,
"step": 156700
},
{
"epoch": 25.088,
"grad_norm": 0.157626673579216,
"learning_rate": 0.00029625006600264006,
"loss": 5.0871,
"step": 156800
},
{
"epoch": 25.104,
"grad_norm": 0.12489177286624908,
"learning_rate": 0.0002962476659066362,
"loss": 4.9887,
"step": 156900
},
{
"epoch": 25.12,
"grad_norm": 0.17784586548805237,
"learning_rate": 0.0002962452658106324,
"loss": 4.9263,
"step": 157000
},
{
"epoch": 25.136,
"grad_norm": 0.26584434509277344,
"learning_rate": 0.00029624286571462857,
"loss": 5.1268,
"step": 157100
},
{
"epoch": 25.152,
"grad_norm": 0.14168865978717804,
"learning_rate": 0.00029624046561862473,
"loss": 5.4578,
"step": 157200
},
{
"epoch": 25.168,
"grad_norm": 0.1289631426334381,
"learning_rate": 0.00029623806552262085,
"loss": 5.2466,
"step": 157300
},
{
"epoch": 25.184,
"grad_norm": 0.12273957580327988,
"learning_rate": 0.000296235665426617,
"loss": 4.7845,
"step": 157400
},
{
"epoch": 25.2,
"grad_norm": 0.24651670455932617,
"learning_rate": 0.0002962332653306132,
"loss": 5.0988,
"step": 157500
},
{
"epoch": 25.216,
"grad_norm": 0.1415649801492691,
"learning_rate": 0.00029623086523460936,
"loss": 5.0998,
"step": 157600
},
{
"epoch": 25.232,
"grad_norm": 0.1132798045873642,
"learning_rate": 0.0002962284651386055,
"loss": 5.2229,
"step": 157700
},
{
"epoch": 25.248,
"grad_norm": 0.10961470752954483,
"learning_rate": 0.00029622606504260164,
"loss": 4.9959,
"step": 157800
},
{
"epoch": 25.264,
"grad_norm": 0.16054928302764893,
"learning_rate": 0.0002962236649465978,
"loss": 4.989,
"step": 157900
},
{
"epoch": 25.28,
"grad_norm": 0.16918180882930756,
"learning_rate": 0.000296221264850594,
"loss": 5.0824,
"step": 158000
},
{
"epoch": 25.296,
"grad_norm": 0.12880262732505798,
"learning_rate": 0.00029621886475459015,
"loss": 4.6069,
"step": 158100
},
{
"epoch": 25.312,
"grad_norm": 0.16930246353149414,
"learning_rate": 0.0002962164646585863,
"loss": 5.0421,
"step": 158200
},
{
"epoch": 25.328,
"grad_norm": 0.15791450440883636,
"learning_rate": 0.0002962140645625825,
"loss": 5.1324,
"step": 158300
},
{
"epoch": 25.344,
"grad_norm": 0.12896622717380524,
"learning_rate": 0.0002962116644665786,
"loss": 4.8697,
"step": 158400
},
{
"epoch": 25.36,
"grad_norm": 0.15522588789463043,
"learning_rate": 0.00029620926437057477,
"loss": 5.112,
"step": 158500
},
{
"epoch": 25.376,
"grad_norm": 0.15994909405708313,
"learning_rate": 0.00029620686427457094,
"loss": 5.1186,
"step": 158600
},
{
"epoch": 25.392,
"grad_norm": 0.16203735768795013,
"learning_rate": 0.0002962044641785671,
"loss": 5.2136,
"step": 158700
},
{
"epoch": 25.408,
"grad_norm": 0.14830628037452698,
"learning_rate": 0.0002962020640825633,
"loss": 4.8028,
"step": 158800
},
{
"epoch": 25.424,
"grad_norm": 0.17855019867420197,
"learning_rate": 0.00029619966398655945,
"loss": 5.2293,
"step": 158900
},
{
"epoch": 25.44,
"grad_norm": 0.13485394418239594,
"learning_rate": 0.00029619728789151564,
"loss": 5.1688,
"step": 159000
},
{
"epoch": 25.456,
"grad_norm": 0.15001603960990906,
"learning_rate": 0.0002961948877955118,
"loss": 5.2429,
"step": 159100
},
{
"epoch": 25.472,
"grad_norm": 0.15747343003749847,
"learning_rate": 0.000296192487699508,
"loss": 5.0648,
"step": 159200
},
{
"epoch": 25.488,
"grad_norm": 0.11709601432085037,
"learning_rate": 0.0002961900876035041,
"loss": 4.9424,
"step": 159300
},
{
"epoch": 25.504,
"grad_norm": 0.14115624129772186,
"learning_rate": 0.00029618768750750026,
"loss": 5.2824,
"step": 159400
},
{
"epoch": 25.52,
"grad_norm": 0.13271014392375946,
"learning_rate": 0.00029618528741149643,
"loss": 5.2082,
"step": 159500
},
{
"epoch": 25.536,
"grad_norm": 0.13927429914474487,
"learning_rate": 0.0002961828873154926,
"loss": 5.0302,
"step": 159600
},
{
"epoch": 25.552,
"grad_norm": 0.1625901609659195,
"learning_rate": 0.00029618048721948877,
"loss": 5.2649,
"step": 159700
},
{
"epoch": 25.568,
"grad_norm": 0.1242537572979927,
"learning_rate": 0.0002961780871234849,
"loss": 5.3638,
"step": 159800
},
{
"epoch": 25.584,
"grad_norm": 0.22442211210727692,
"learning_rate": 0.00029617568702748105,
"loss": 4.7374,
"step": 159900
},
{
"epoch": 25.6,
"grad_norm": 0.1424286961555481,
"learning_rate": 0.0002961732869314772,
"loss": 5.0878,
"step": 160000
},
{
"epoch": 25.616,
"grad_norm": 0.16174399852752686,
"learning_rate": 0.0002961708868354734,
"loss": 5.4059,
"step": 160100
},
{
"epoch": 25.632,
"grad_norm": 0.12529495358467102,
"learning_rate": 0.00029616848673946956,
"loss": 5.1528,
"step": 160200
},
{
"epoch": 25.648,
"grad_norm": 0.14766289293766022,
"learning_rate": 0.00029616608664346573,
"loss": 5.2453,
"step": 160300
},
{
"epoch": 25.664,
"grad_norm": 0.12722782790660858,
"learning_rate": 0.00029616368654746184,
"loss": 5.1237,
"step": 160400
},
{
"epoch": 25.68,
"grad_norm": 0.1653498262166977,
"learning_rate": 0.000296161286451458,
"loss": 5.2606,
"step": 160500
},
{
"epoch": 25.696,
"grad_norm": 0.15743720531463623,
"learning_rate": 0.0002961588863554542,
"loss": 5.3842,
"step": 160600
},
{
"epoch": 25.712,
"grad_norm": 0.11641506105661392,
"learning_rate": 0.00029615648625945035,
"loss": 5.0112,
"step": 160700
},
{
"epoch": 25.728,
"grad_norm": 0.1600313037633896,
"learning_rate": 0.0002961540861634465,
"loss": 5.1207,
"step": 160800
},
{
"epoch": 25.744,
"grad_norm": 0.1792784333229065,
"learning_rate": 0.0002961516860674427,
"loss": 5.1801,
"step": 160900
},
{
"epoch": 25.76,
"grad_norm": 0.12263203412294388,
"learning_rate": 0.0002961492859714388,
"loss": 5.1875,
"step": 161000
},
{
"epoch": 25.776,
"grad_norm": 0.1638142168521881,
"learning_rate": 0.00029614690987639505,
"loss": 5.5503,
"step": 161100
},
{
"epoch": 25.792,
"grad_norm": 0.12107832729816437,
"learning_rate": 0.0002961445097803912,
"loss": 5.312,
"step": 161200
},
{
"epoch": 25.808,
"grad_norm": 0.1593557745218277,
"learning_rate": 0.00029614210968438733,
"loss": 5.0444,
"step": 161300
},
{
"epoch": 25.824,
"grad_norm": 0.14629554748535156,
"learning_rate": 0.0002961397095883835,
"loss": 5.2007,
"step": 161400
},
{
"epoch": 25.84,
"grad_norm": 0.14022816717624664,
"learning_rate": 0.00029613730949237967,
"loss": 5.1234,
"step": 161500
},
{
"epoch": 25.856,
"grad_norm": 0.15026092529296875,
"learning_rate": 0.00029613490939637584,
"loss": 5.1459,
"step": 161600
},
{
"epoch": 25.872,
"grad_norm": 0.16642487049102783,
"learning_rate": 0.000296132509300372,
"loss": 5.074,
"step": 161700
},
{
"epoch": 25.888,
"grad_norm": 0.16100358963012695,
"learning_rate": 0.0002961301092043681,
"loss": 4.8445,
"step": 161800
},
{
"epoch": 25.904,
"grad_norm": 0.14411258697509766,
"learning_rate": 0.0002961277091083643,
"loss": 4.7157,
"step": 161900
},
{
"epoch": 25.92,
"grad_norm": 0.10813727974891663,
"learning_rate": 0.00029612530901236046,
"loss": 5.0682,
"step": 162000
},
{
"epoch": 25.936,
"grad_norm": 0.14450779557228088,
"learning_rate": 0.00029612290891635663,
"loss": 5.241,
"step": 162100
},
{
"epoch": 25.951999999999998,
"grad_norm": 0.16171583533287048,
"learning_rate": 0.0002961205088203528,
"loss": 5.1133,
"step": 162200
},
{
"epoch": 25.968,
"grad_norm": 0.12712721526622772,
"learning_rate": 0.00029611810872434897,
"loss": 5.0328,
"step": 162300
},
{
"epoch": 25.984,
"grad_norm": 0.12672489881515503,
"learning_rate": 0.0002961157086283451,
"loss": 4.8169,
"step": 162400
},
{
"epoch": 26.0,
"grad_norm": 0.15172095596790314,
"learning_rate": 0.00029611330853234125,
"loss": 5.092,
"step": 162500
},
{
"epoch": 26.016,
"grad_norm": 0.18036304414272308,
"learning_rate": 0.0002961109084363374,
"loss": 4.7511,
"step": 162600
},
{
"epoch": 26.032,
"grad_norm": 0.16676302254199982,
"learning_rate": 0.0002961085083403336,
"loss": 4.9628,
"step": 162700
},
{
"epoch": 26.048,
"grad_norm": 0.1724889576435089,
"learning_rate": 0.00029610610824432976,
"loss": 4.8742,
"step": 162800
},
{
"epoch": 26.064,
"grad_norm": 0.1280188113451004,
"learning_rate": 0.00029610370814832593,
"loss": 5.3059,
"step": 162900
},
{
"epoch": 26.08,
"grad_norm": 0.15785780549049377,
"learning_rate": 0.00029610130805232204,
"loss": 4.8671,
"step": 163000
},
{
"epoch": 26.096,
"grad_norm": 0.14080898463726044,
"learning_rate": 0.0002960989079563182,
"loss": 5.1418,
"step": 163100
},
{
"epoch": 26.112,
"grad_norm": 0.13095679879188538,
"learning_rate": 0.0002960965078603144,
"loss": 4.7194,
"step": 163200
},
{
"epoch": 26.128,
"grad_norm": 0.1574213057756424,
"learning_rate": 0.00029609410776431055,
"loss": 4.9184,
"step": 163300
},
{
"epoch": 26.144,
"grad_norm": 0.13669663667678833,
"learning_rate": 0.0002960917076683067,
"loss": 5.0563,
"step": 163400
},
{
"epoch": 26.16,
"grad_norm": 0.15946930646896362,
"learning_rate": 0.00029608930757230284,
"loss": 4.7656,
"step": 163500
},
{
"epoch": 26.176,
"grad_norm": 0.1457744687795639,
"learning_rate": 0.000296086907476299,
"loss": 4.894,
"step": 163600
},
{
"epoch": 26.192,
"grad_norm": 0.10747674852609634,
"learning_rate": 0.0002960845073802952,
"loss": 5.1462,
"step": 163700
},
{
"epoch": 26.208,
"grad_norm": 0.22094644606113434,
"learning_rate": 0.00029608210728429134,
"loss": 5.3243,
"step": 163800
},
{
"epoch": 26.224,
"grad_norm": 0.12370151281356812,
"learning_rate": 0.0002960797071882875,
"loss": 4.8294,
"step": 163900
},
{
"epoch": 26.24,
"grad_norm": 0.1479647010564804,
"learning_rate": 0.0002960773070922837,
"loss": 5.0416,
"step": 164000
},
{
"epoch": 26.256,
"grad_norm": 0.15605013072490692,
"learning_rate": 0.0002960749069962798,
"loss": 5.2773,
"step": 164100
},
{
"epoch": 26.272,
"grad_norm": 0.1911146342754364,
"learning_rate": 0.00029607250690027597,
"loss": 4.939,
"step": 164200
},
{
"epoch": 26.288,
"grad_norm": 0.12012562155723572,
"learning_rate": 0.0002960701308052322,
"loss": 4.8719,
"step": 164300
},
{
"epoch": 26.304,
"grad_norm": 0.12493129819631577,
"learning_rate": 0.0002960677307092283,
"loss": 4.7802,
"step": 164400
},
{
"epoch": 26.32,
"grad_norm": 0.12632489204406738,
"learning_rate": 0.0002960653306132245,
"loss": 4.8725,
"step": 164500
},
{
"epoch": 26.336,
"grad_norm": 0.15591692924499512,
"learning_rate": 0.00029606293051722066,
"loss": 5.2183,
"step": 164600
},
{
"epoch": 26.352,
"grad_norm": 0.12113320082426071,
"learning_rate": 0.00029606053042121683,
"loss": 4.981,
"step": 164700
},
{
"epoch": 26.368,
"grad_norm": 0.12973067164421082,
"learning_rate": 0.000296058130325213,
"loss": 5.1433,
"step": 164800
},
{
"epoch": 26.384,
"grad_norm": 0.15297859907150269,
"learning_rate": 0.00029605573022920917,
"loss": 4.9628,
"step": 164900
},
{
"epoch": 26.4,
"grad_norm": 0.13537169992923737,
"learning_rate": 0.0002960533301332053,
"loss": 4.6621,
"step": 165000
},
{
"epoch": 26.416,
"grad_norm": 0.12161804735660553,
"learning_rate": 0.00029605093003720145,
"loss": 4.9027,
"step": 165100
},
{
"epoch": 26.432,
"grad_norm": 0.14561276137828827,
"learning_rate": 0.0002960485299411976,
"loss": 4.7497,
"step": 165200
},
{
"epoch": 26.448,
"grad_norm": 0.1523263305425644,
"learning_rate": 0.0002960461298451938,
"loss": 4.7575,
"step": 165300
},
{
"epoch": 26.464,
"grad_norm": 0.13894937932491302,
"learning_rate": 0.00029604372974918996,
"loss": 5.1487,
"step": 165400
},
{
"epoch": 26.48,
"grad_norm": 0.1122347041964531,
"learning_rate": 0.0002960413296531861,
"loss": 4.8517,
"step": 165500
},
{
"epoch": 26.496,
"grad_norm": 0.12737123668193817,
"learning_rate": 0.00029603892955718225,
"loss": 4.8187,
"step": 165600
},
{
"epoch": 26.512,
"grad_norm": 0.1302328109741211,
"learning_rate": 0.0002960365294611784,
"loss": 4.6812,
"step": 165700
},
{
"epoch": 26.528,
"grad_norm": 0.14844807982444763,
"learning_rate": 0.0002960341293651746,
"loss": 4.9271,
"step": 165800
},
{
"epoch": 26.544,
"grad_norm": 0.17675945162773132,
"learning_rate": 0.00029603172926917075,
"loss": 4.7797,
"step": 165900
},
{
"epoch": 26.56,
"grad_norm": 0.18416370451450348,
"learning_rate": 0.0002960293291731669,
"loss": 5.1626,
"step": 166000
},
{
"epoch": 26.576,
"grad_norm": 0.12005133926868439,
"learning_rate": 0.00029602692907716304,
"loss": 4.7074,
"step": 166100
},
{
"epoch": 26.592,
"grad_norm": 0.185636967420578,
"learning_rate": 0.0002960245289811592,
"loss": 5.175,
"step": 166200
},
{
"epoch": 26.608,
"grad_norm": 0.11722932010889053,
"learning_rate": 0.0002960221288851554,
"loss": 4.9977,
"step": 166300
},
{
"epoch": 26.624,
"grad_norm": 0.13763803243637085,
"learning_rate": 0.00029601972878915154,
"loss": 4.732,
"step": 166400
},
{
"epoch": 26.64,
"grad_norm": 0.13912682235240936,
"learning_rate": 0.0002960173286931477,
"loss": 4.877,
"step": 166500
},
{
"epoch": 26.656,
"grad_norm": 0.10087449103593826,
"learning_rate": 0.00029601492859714383,
"loss": 4.7994,
"step": 166600
},
{
"epoch": 26.672,
"grad_norm": 0.1845891773700714,
"learning_rate": 0.00029601252850114,
"loss": 5.4515,
"step": 166700
},
{
"epoch": 26.688,
"grad_norm": 0.14900504052639008,
"learning_rate": 0.00029601012840513617,
"loss": 5.0709,
"step": 166800
},
{
"epoch": 26.704,
"grad_norm": 0.19447046518325806,
"learning_rate": 0.00029600772830913234,
"loss": 4.8345,
"step": 166900
},
{
"epoch": 26.72,
"grad_norm": 0.15507912635803223,
"learning_rate": 0.0002960053282131285,
"loss": 4.909,
"step": 167000
},
{
"epoch": 26.736,
"grad_norm": 0.12142092734575272,
"learning_rate": 0.0002960029281171247,
"loss": 4.8017,
"step": 167100
},
{
"epoch": 26.752,
"grad_norm": 0.12530605494976044,
"learning_rate": 0.0002960005280211208,
"loss": 5.1347,
"step": 167200
},
{
"epoch": 26.768,
"grad_norm": 0.14327798783779144,
"learning_rate": 0.00029599812792511696,
"loss": 4.7235,
"step": 167300
},
{
"epoch": 26.784,
"grad_norm": 0.14647874236106873,
"learning_rate": 0.00029599572782911313,
"loss": 4.9018,
"step": 167400
},
{
"epoch": 26.8,
"grad_norm": 0.13197900354862213,
"learning_rate": 0.0002959933277331093,
"loss": 5.1885,
"step": 167500
},
{
"epoch": 26.816,
"grad_norm": 0.13953787088394165,
"learning_rate": 0.00029599092763710547,
"loss": 4.8121,
"step": 167600
},
{
"epoch": 26.832,
"grad_norm": 0.16823934018611908,
"learning_rate": 0.0002959885275411016,
"loss": 4.7129,
"step": 167700
},
{
"epoch": 26.848,
"grad_norm": 0.1557362824678421,
"learning_rate": 0.00029598612744509775,
"loss": 5.2257,
"step": 167800
},
{
"epoch": 26.864,
"grad_norm": 0.16123229265213013,
"learning_rate": 0.000295983751350054,
"loss": 4.8921,
"step": 167900
},
{
"epoch": 26.88,
"grad_norm": 0.1613980084657669,
"learning_rate": 0.00029598135125405016,
"loss": 5.0361,
"step": 168000
},
{
"epoch": 26.896,
"grad_norm": 0.1302555948495865,
"learning_rate": 0.0002959789511580463,
"loss": 5.0077,
"step": 168100
},
{
"epoch": 26.912,
"grad_norm": 0.15182837843894958,
"learning_rate": 0.00029597655106204245,
"loss": 5.0202,
"step": 168200
},
{
"epoch": 26.928,
"grad_norm": 0.13955193758010864,
"learning_rate": 0.0002959741509660386,
"loss": 4.9305,
"step": 168300
},
{
"epoch": 26.944,
"grad_norm": 0.1417885273694992,
"learning_rate": 0.0002959717508700348,
"loss": 5.0889,
"step": 168400
},
{
"epoch": 26.96,
"grad_norm": 0.14792856574058533,
"learning_rate": 0.00029596935077403095,
"loss": 4.8685,
"step": 168500
},
{
"epoch": 26.976,
"grad_norm": 0.14266085624694824,
"learning_rate": 0.00029596695067802707,
"loss": 5.1578,
"step": 168600
},
{
"epoch": 26.992,
"grad_norm": 0.11925966292619705,
"learning_rate": 0.00029596455058202324,
"loss": 4.6746,
"step": 168700
},
{
"epoch": 27.008,
"grad_norm": 0.13332228362560272,
"learning_rate": 0.0002959621504860194,
"loss": 5.1295,
"step": 168800
},
{
"epoch": 27.024,
"grad_norm": 0.13257551193237305,
"learning_rate": 0.0002959597503900156,
"loss": 5.0958,
"step": 168900
},
{
"epoch": 27.04,
"grad_norm": 0.11077175289392471,
"learning_rate": 0.00029595735029401175,
"loss": 4.6509,
"step": 169000
},
{
"epoch": 27.056,
"grad_norm": 0.1581268608570099,
"learning_rate": 0.0002959549501980079,
"loss": 4.7619,
"step": 169100
},
{
"epoch": 27.072,
"grad_norm": 0.15108828246593475,
"learning_rate": 0.00029595255010200403,
"loss": 4.7792,
"step": 169200
},
{
"epoch": 27.088,
"grad_norm": 0.15362246334552765,
"learning_rate": 0.0002959501500060002,
"loss": 5.189,
"step": 169300
},
{
"epoch": 27.104,
"grad_norm": 0.1353999823331833,
"learning_rate": 0.00029594774990999637,
"loss": 4.7698,
"step": 169400
},
{
"epoch": 27.12,
"grad_norm": 0.15684208273887634,
"learning_rate": 0.00029594534981399254,
"loss": 4.8111,
"step": 169500
},
{
"epoch": 27.136,
"grad_norm": 0.17176128923892975,
"learning_rate": 0.0002959429497179887,
"loss": 4.8735,
"step": 169600
},
{
"epoch": 27.152,
"grad_norm": 0.12857766449451447,
"learning_rate": 0.0002959405496219848,
"loss": 4.5602,
"step": 169700
},
{
"epoch": 27.168,
"grad_norm": 0.2216508835554123,
"learning_rate": 0.000295938149525981,
"loss": 4.6848,
"step": 169800
},
{
"epoch": 27.184,
"grad_norm": 0.18342281877994537,
"learning_rate": 0.00029593577343093723,
"loss": 4.9973,
"step": 169900
},
{
"epoch": 27.2,
"grad_norm": 0.2726237177848816,
"learning_rate": 0.0002959333733349334,
"loss": 4.8341,
"step": 170000
},
{
"epoch": 27.216,
"grad_norm": 0.1373586356639862,
"learning_rate": 0.0002959309732389295,
"loss": 4.914,
"step": 170100
},
{
"epoch": 27.232,
"grad_norm": 0.13454484939575195,
"learning_rate": 0.0002959285731429257,
"loss": 5.0239,
"step": 170200
},
{
"epoch": 27.248,
"grad_norm": 0.146050363779068,
"learning_rate": 0.00029592617304692186,
"loss": 4.7314,
"step": 170300
},
{
"epoch": 27.264,
"grad_norm": 0.14222508668899536,
"learning_rate": 0.000295923772950918,
"loss": 4.6159,
"step": 170400
},
{
"epoch": 27.28,
"grad_norm": 0.14632238447666168,
"learning_rate": 0.0002959213728549142,
"loss": 4.4062,
"step": 170500
},
{
"epoch": 27.296,
"grad_norm": 0.16428226232528687,
"learning_rate": 0.0002959189727589103,
"loss": 5.1747,
"step": 170600
},
{
"epoch": 27.312,
"grad_norm": 0.1323370337486267,
"learning_rate": 0.0002959165726629065,
"loss": 4.5199,
"step": 170700
},
{
"epoch": 27.328,
"grad_norm": 0.14235830307006836,
"learning_rate": 0.00029591417256690265,
"loss": 4.9103,
"step": 170800
},
{
"epoch": 27.344,
"grad_norm": 0.13216975331306458,
"learning_rate": 0.0002959117724708988,
"loss": 4.8293,
"step": 170900
},
{
"epoch": 27.36,
"grad_norm": 0.15071095526218414,
"learning_rate": 0.000295909372374895,
"loss": 4.9801,
"step": 171000
},
{
"epoch": 27.376,
"grad_norm": 0.1272030919790268,
"learning_rate": 0.00029590697227889116,
"loss": 4.9456,
"step": 171100
},
{
"epoch": 27.392,
"grad_norm": 0.13579507172107697,
"learning_rate": 0.00029590457218288727,
"loss": 4.8712,
"step": 171200
},
{
"epoch": 27.408,
"grad_norm": 0.12844951450824738,
"learning_rate": 0.00029590217208688344,
"loss": 4.679,
"step": 171300
},
{
"epoch": 27.424,
"grad_norm": 0.10488644242286682,
"learning_rate": 0.0002958997719908796,
"loss": 4.8333,
"step": 171400
},
{
"epoch": 27.44,
"grad_norm": 0.1397544890642166,
"learning_rate": 0.0002958973718948758,
"loss": 4.9637,
"step": 171500
},
{
"epoch": 27.456,
"grad_norm": 0.17122800648212433,
"learning_rate": 0.00029589497179887195,
"loss": 4.5042,
"step": 171600
},
{
"epoch": 27.472,
"grad_norm": 0.1432805061340332,
"learning_rate": 0.00029589257170286806,
"loss": 4.9236,
"step": 171700
},
{
"epoch": 27.488,
"grad_norm": 0.2430882304906845,
"learning_rate": 0.00029589017160686423,
"loss": 4.6134,
"step": 171800
},
{
"epoch": 27.504,
"grad_norm": 0.12965236604213715,
"learning_rate": 0.0002958877715108604,
"loss": 4.8867,
"step": 171900
},
{
"epoch": 27.52,
"grad_norm": 0.13079382479190826,
"learning_rate": 0.00029588537141485657,
"loss": 4.7196,
"step": 172000
},
{
"epoch": 27.536,
"grad_norm": 0.16515448689460754,
"learning_rate": 0.00029588299531981276,
"loss": 4.6995,
"step": 172100
},
{
"epoch": 27.552,
"grad_norm": 0.12594960629940033,
"learning_rate": 0.00029588059522380893,
"loss": 4.8708,
"step": 172200
},
{
"epoch": 27.568,
"grad_norm": 0.1570487916469574,
"learning_rate": 0.0002958781951278051,
"loss": 4.8169,
"step": 172300
},
{
"epoch": 27.584,
"grad_norm": 0.13092289865016937,
"learning_rate": 0.00029587579503180127,
"loss": 4.695,
"step": 172400
},
{
"epoch": 27.6,
"grad_norm": 0.14942535758018494,
"learning_rate": 0.00029587339493579744,
"loss": 4.7415,
"step": 172500
},
{
"epoch": 27.616,
"grad_norm": 0.12075886875391006,
"learning_rate": 0.00029587099483979355,
"loss": 4.4839,
"step": 172600
},
{
"epoch": 27.632,
"grad_norm": 0.11725221574306488,
"learning_rate": 0.0002958685947437897,
"loss": 4.8162,
"step": 172700
},
{
"epoch": 27.648,
"grad_norm": 0.20893152058124542,
"learning_rate": 0.0002958661946477859,
"loss": 4.78,
"step": 172800
},
{
"epoch": 27.664,
"grad_norm": 0.14231526851654053,
"learning_rate": 0.00029586379455178206,
"loss": 4.7212,
"step": 172900
},
{
"epoch": 27.68,
"grad_norm": 0.1261710226535797,
"learning_rate": 0.0002958613944557782,
"loss": 4.96,
"step": 173000
},
{
"epoch": 27.696,
"grad_norm": 0.1408015638589859,
"learning_rate": 0.0002958589943597744,
"loss": 4.7388,
"step": 173100
},
{
"epoch": 27.712,
"grad_norm": 0.14422334730625153,
"learning_rate": 0.0002958565942637705,
"loss": 4.5018,
"step": 173200
},
{
"epoch": 27.728,
"grad_norm": 0.17371025681495667,
"learning_rate": 0.0002958541941677667,
"loss": 4.792,
"step": 173300
},
{
"epoch": 27.744,
"grad_norm": 0.21515819430351257,
"learning_rate": 0.00029585179407176285,
"loss": 4.8225,
"step": 173400
},
{
"epoch": 27.76,
"grad_norm": 0.1557329297065735,
"learning_rate": 0.000295849393975759,
"loss": 4.6305,
"step": 173500
},
{
"epoch": 27.776,
"grad_norm": 0.13870660960674286,
"learning_rate": 0.0002958469938797552,
"loss": 4.5486,
"step": 173600
},
{
"epoch": 27.792,
"grad_norm": 0.13383133709430695,
"learning_rate": 0.0002958445937837513,
"loss": 4.6136,
"step": 173700
},
{
"epoch": 27.808,
"grad_norm": 0.1399243175983429,
"learning_rate": 0.00029584219368774747,
"loss": 4.9352,
"step": 173800
},
{
"epoch": 27.824,
"grad_norm": 0.11231095343828201,
"learning_rate": 0.00029583979359174364,
"loss": 4.9996,
"step": 173900
},
{
"epoch": 27.84,
"grad_norm": 0.16128210723400116,
"learning_rate": 0.0002958373934957398,
"loss": 4.7546,
"step": 174000
},
{
"epoch": 27.856,
"grad_norm": 0.15589210391044617,
"learning_rate": 0.000295834993399736,
"loss": 4.8234,
"step": 174100
},
{
"epoch": 27.872,
"grad_norm": 0.22979894280433655,
"learning_rate": 0.00029583259330373215,
"loss": 4.8117,
"step": 174200
},
{
"epoch": 27.888,
"grad_norm": 0.14024117588996887,
"learning_rate": 0.00029583019320772826,
"loss": 4.5712,
"step": 174300
},
{
"epoch": 27.904,
"grad_norm": 0.16881561279296875,
"learning_rate": 0.00029582779311172443,
"loss": 4.8696,
"step": 174400
},
{
"epoch": 27.92,
"grad_norm": 0.14194153249263763,
"learning_rate": 0.0002958253930157206,
"loss": 4.7792,
"step": 174500
},
{
"epoch": 27.936,
"grad_norm": 0.16409501433372498,
"learning_rate": 0.00029582299291971677,
"loss": 4.862,
"step": 174600
},
{
"epoch": 27.951999999999998,
"grad_norm": 0.21548931300640106,
"learning_rate": 0.00029582059282371294,
"loss": 4.6556,
"step": 174700
},
{
"epoch": 27.968,
"grad_norm": 0.15370036661624908,
"learning_rate": 0.00029581819272770906,
"loss": 4.7855,
"step": 174800
},
{
"epoch": 27.984,
"grad_norm": 0.1505698263645172,
"learning_rate": 0.0002958157926317052,
"loss": 4.5333,
"step": 174900
},
{
"epoch": 28.0,
"grad_norm": 0.13952812552452087,
"learning_rate": 0.0002958133925357014,
"loss": 5.0827,
"step": 175000
},
{
"epoch": 28.016,
"grad_norm": 0.14113423228263855,
"learning_rate": 0.00029581099243969756,
"loss": 4.4652,
"step": 175100
},
{
"epoch": 28.032,
"grad_norm": 0.13563218712806702,
"learning_rate": 0.00029580859234369373,
"loss": 4.4769,
"step": 175200
},
{
"epoch": 28.048,
"grad_norm": 0.16485312581062317,
"learning_rate": 0.0002958061922476899,
"loss": 4.7196,
"step": 175300
},
{
"epoch": 28.064,
"grad_norm": 0.1928679645061493,
"learning_rate": 0.000295803792151686,
"loss": 4.5181,
"step": 175400
},
{
"epoch": 28.08,
"grad_norm": 0.16406244039535522,
"learning_rate": 0.00029580141605664226,
"loss": 4.5547,
"step": 175500
},
{
"epoch": 28.096,
"grad_norm": 0.12744209170341492,
"learning_rate": 0.00029579901596063843,
"loss": 4.6802,
"step": 175600
},
{
"epoch": 28.112,
"grad_norm": 0.15242663025856018,
"learning_rate": 0.00029579661586463454,
"loss": 4.7076,
"step": 175700
},
{
"epoch": 28.128,
"grad_norm": 0.1231980100274086,
"learning_rate": 0.0002957942157686307,
"loss": 4.7097,
"step": 175800
},
{
"epoch": 28.144,
"grad_norm": 0.1742876172065735,
"learning_rate": 0.0002957918156726269,
"loss": 4.8166,
"step": 175900
},
{
"epoch": 28.16,
"grad_norm": 0.15425816178321838,
"learning_rate": 0.00029578941557662305,
"loss": 4.6306,
"step": 176000
},
{
"epoch": 28.176,
"grad_norm": 0.1423932909965515,
"learning_rate": 0.0002957870154806192,
"loss": 4.7671,
"step": 176100
},
{
"epoch": 28.192,
"grad_norm": 0.13283143937587738,
"learning_rate": 0.0002957846153846154,
"loss": 4.5074,
"step": 176200
},
{
"epoch": 28.208,
"grad_norm": 0.1560533046722412,
"learning_rate": 0.0002957822152886115,
"loss": 4.8514,
"step": 176300
},
{
"epoch": 28.224,
"grad_norm": 0.12814775109291077,
"learning_rate": 0.0002957798151926077,
"loss": 4.7173,
"step": 176400
},
{
"epoch": 28.24,
"grad_norm": 0.1441114842891693,
"learning_rate": 0.00029577741509660384,
"loss": 4.7003,
"step": 176500
},
{
"epoch": 28.256,
"grad_norm": 0.13554996252059937,
"learning_rate": 0.0002957750150006,
"loss": 4.6206,
"step": 176600
},
{
"epoch": 28.272,
"grad_norm": 0.21647945046424866,
"learning_rate": 0.0002957726149045962,
"loss": 4.9289,
"step": 176700
},
{
"epoch": 28.288,
"grad_norm": 0.1216735765337944,
"learning_rate": 0.0002957702148085923,
"loss": 4.7441,
"step": 176800
},
{
"epoch": 28.304,
"grad_norm": 0.12911395728588104,
"learning_rate": 0.00029576781471258847,
"loss": 4.6493,
"step": 176900
},
{
"epoch": 28.32,
"grad_norm": 0.12240692973136902,
"learning_rate": 0.00029576541461658463,
"loss": 4.7305,
"step": 177000
},
{
"epoch": 28.336,
"grad_norm": 0.17344659566879272,
"learning_rate": 0.0002957630145205808,
"loss": 4.5246,
"step": 177100
},
{
"epoch": 28.352,
"grad_norm": 0.12759949266910553,
"learning_rate": 0.00029576061442457697,
"loss": 4.6852,
"step": 177200
},
{
"epoch": 28.368,
"grad_norm": 0.12402662634849548,
"learning_rate": 0.00029575821432857314,
"loss": 4.5194,
"step": 177300
},
{
"epoch": 28.384,
"grad_norm": 0.19976910948753357,
"learning_rate": 0.00029575581423256926,
"loss": 4.5166,
"step": 177400
},
{
"epoch": 28.4,
"grad_norm": 0.14362084865570068,
"learning_rate": 0.0002957534141365654,
"loss": 4.5147,
"step": 177500
},
{
"epoch": 28.416,
"grad_norm": 0.13851560652256012,
"learning_rate": 0.0002957510140405616,
"loss": 4.5473,
"step": 177600
},
{
"epoch": 28.432,
"grad_norm": 0.13696688413619995,
"learning_rate": 0.00029574861394455776,
"loss": 4.7163,
"step": 177700
},
{
"epoch": 28.448,
"grad_norm": 0.1331932544708252,
"learning_rate": 0.00029574621384855393,
"loss": 5.0066,
"step": 177800
},
{
"epoch": 28.464,
"grad_norm": 0.13118359446525574,
"learning_rate": 0.00029574381375255005,
"loss": 4.7009,
"step": 177900
},
{
"epoch": 28.48,
"grad_norm": 0.11460904031991959,
"learning_rate": 0.0002957414136565462,
"loss": 4.5525,
"step": 178000
},
{
"epoch": 28.496,
"grad_norm": 0.11112211644649506,
"learning_rate": 0.0002957390135605424,
"loss": 4.8012,
"step": 178100
},
{
"epoch": 28.512,
"grad_norm": 0.1618378460407257,
"learning_rate": 0.00029573661346453856,
"loss": 4.8419,
"step": 178200
},
{
"epoch": 28.528,
"grad_norm": 0.13665986061096191,
"learning_rate": 0.0002957342133685347,
"loss": 4.6129,
"step": 178300
},
{
"epoch": 28.544,
"grad_norm": 0.10059978067874908,
"learning_rate": 0.0002957318132725309,
"loss": 4.7326,
"step": 178400
},
{
"epoch": 28.56,
"grad_norm": 0.1575680524110794,
"learning_rate": 0.000295729413176527,
"loss": 5.0102,
"step": 178500
},
{
"epoch": 28.576,
"grad_norm": 0.10887812077999115,
"learning_rate": 0.0002957270130805232,
"loss": 4.7228,
"step": 178600
},
{
"epoch": 28.592,
"grad_norm": 0.08943487703800201,
"learning_rate": 0.0002957246369854794,
"loss": 4.4294,
"step": 178700
},
{
"epoch": 28.608,
"grad_norm": 0.14149336516857147,
"learning_rate": 0.00029572223688947554,
"loss": 4.6056,
"step": 178800
},
{
"epoch": 28.624,
"grad_norm": 0.12872636318206787,
"learning_rate": 0.0002957198367934717,
"loss": 4.8457,
"step": 178900
},
{
"epoch": 28.64,
"grad_norm": 0.15382656455039978,
"learning_rate": 0.0002957174366974679,
"loss": 4.7641,
"step": 179000
},
{
"epoch": 28.656,
"grad_norm": 0.15484744310379028,
"learning_rate": 0.00029571503660146404,
"loss": 4.7261,
"step": 179100
},
{
"epoch": 28.672,
"grad_norm": 0.1385447382926941,
"learning_rate": 0.0002957126365054602,
"loss": 4.8178,
"step": 179200
},
{
"epoch": 28.688,
"grad_norm": 0.09416704624891281,
"learning_rate": 0.0002957102364094564,
"loss": 4.462,
"step": 179300
},
{
"epoch": 28.704,
"grad_norm": 0.11756269633769989,
"learning_rate": 0.0002957078363134525,
"loss": 4.9817,
"step": 179400
},
{
"epoch": 28.72,
"grad_norm": 0.16298645734786987,
"learning_rate": 0.00029570543621744867,
"loss": 4.7884,
"step": 179500
},
{
"epoch": 28.736,
"grad_norm": 0.1666107177734375,
"learning_rate": 0.00029570303612144484,
"loss": 4.5478,
"step": 179600
},
{
"epoch": 28.752,
"grad_norm": 0.14432166516780853,
"learning_rate": 0.000295700636025441,
"loss": 4.5671,
"step": 179700
},
{
"epoch": 28.768,
"grad_norm": 0.14455050230026245,
"learning_rate": 0.0002956982359294372,
"loss": 4.4565,
"step": 179800
},
{
"epoch": 28.784,
"grad_norm": 0.11911621689796448,
"learning_rate": 0.0002956958358334333,
"loss": 4.8298,
"step": 179900
},
{
"epoch": 28.8,
"grad_norm": 0.11492261290550232,
"learning_rate": 0.00029569343573742946,
"loss": 4.8744,
"step": 180000
},
{
"epoch": 28.816,
"grad_norm": 0.11532367020845413,
"learning_rate": 0.00029569103564142563,
"loss": 4.9461,
"step": 180100
},
{
"epoch": 28.832,
"grad_norm": 0.11335845291614532,
"learning_rate": 0.0002956886355454218,
"loss": 4.6438,
"step": 180200
},
{
"epoch": 28.848,
"grad_norm": 0.13290923833847046,
"learning_rate": 0.00029568623544941797,
"loss": 4.5029,
"step": 180300
},
{
"epoch": 28.864,
"grad_norm": 0.12123245000839233,
"learning_rate": 0.00029568383535341414,
"loss": 5.002,
"step": 180400
},
{
"epoch": 28.88,
"grad_norm": 0.1688774973154068,
"learning_rate": 0.00029568143525741025,
"loss": 4.5888,
"step": 180500
},
{
"epoch": 28.896,
"grad_norm": 0.12593814730644226,
"learning_rate": 0.0002956790351614064,
"loss": 4.5949,
"step": 180600
},
{
"epoch": 28.912,
"grad_norm": 0.13134326040744781,
"learning_rate": 0.0002956766350654026,
"loss": 4.3431,
"step": 180700
},
{
"epoch": 28.928,
"grad_norm": 0.14252367615699768,
"learning_rate": 0.00029567423496939876,
"loss": 4.1599,
"step": 180800
},
{
"epoch": 28.944,
"grad_norm": 0.13371191918849945,
"learning_rate": 0.0002956718348733949,
"loss": 4.4618,
"step": 180900
},
{
"epoch": 28.96,
"grad_norm": 0.2305118888616562,
"learning_rate": 0.00029566943477739104,
"loss": 4.7324,
"step": 181000
},
{
"epoch": 28.976,
"grad_norm": 0.17778520286083221,
"learning_rate": 0.0002956670346813872,
"loss": 4.5895,
"step": 181100
},
{
"epoch": 28.992,
"grad_norm": 0.16209328174591064,
"learning_rate": 0.0002956646345853834,
"loss": 4.5924,
"step": 181200
},
{
"epoch": 29.008,
"grad_norm": 0.13874457776546478,
"learning_rate": 0.0002956622584903396,
"loss": 4.5032,
"step": 181300
},
{
"epoch": 29.024,
"grad_norm": 0.13318394124507904,
"learning_rate": 0.00029565985839433574,
"loss": 4.3979,
"step": 181400
},
{
"epoch": 29.04,
"grad_norm": 0.1424497812986374,
"learning_rate": 0.0002956574582983319,
"loss": 4.6121,
"step": 181500
},
{
"epoch": 29.056,
"grad_norm": 0.1274562031030655,
"learning_rate": 0.0002956550582023281,
"loss": 4.6716,
"step": 181600
},
{
"epoch": 29.072,
"grad_norm": 0.15418770909309387,
"learning_rate": 0.00029565265810632425,
"loss": 4.4586,
"step": 181700
},
{
"epoch": 29.088,
"grad_norm": 0.1679641753435135,
"learning_rate": 0.0002956502580103204,
"loss": 4.4676,
"step": 181800
},
{
"epoch": 29.104,
"grad_norm": 0.10988187789916992,
"learning_rate": 0.00029564788191527655,
"loss": 4.4074,
"step": 181900
},
{
"epoch": 29.12,
"grad_norm": 0.13705100119113922,
"learning_rate": 0.0002956454818192727,
"loss": 4.5681,
"step": 182000
}
],
"logging_steps": 100,
"max_steps": 12500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2000,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.600452986732544e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}