cleantracks / trainer_state.json
deepaksamuel-cuk's picture
Upload 13 files
d9aad75 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 35.5,
"eval_steps": 100,
"global_step": 142000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025,
"grad_norm": 0.7927406430244446,
"learning_rate": 5.82e-05,
"loss": 203.8328,
"step": 100
},
{
"epoch": 0.05,
"grad_norm": 0.901040256023407,
"learning_rate": 0.0001182,
"loss": 181.6551,
"step": 200
},
{
"epoch": 0.075,
"grad_norm": 0.14473982155323029,
"learning_rate": 0.00017819999999999997,
"loss": 174.6394,
"step": 300
},
{
"epoch": 0.1,
"grad_norm": 0.13423211872577667,
"learning_rate": 0.0002382,
"loss": 171.818,
"step": 400
},
{
"epoch": 0.125,
"grad_norm": 0.13924159109592438,
"learning_rate": 0.0002982,
"loss": 168.1486,
"step": 500
},
{
"epoch": 0.15,
"grad_norm": 0.11850500851869583,
"learning_rate": 0.000299996362272642,
"loss": 162.8829,
"step": 600
},
{
"epoch": 0.175,
"grad_norm": 0.15106040239334106,
"learning_rate": 0.0002999926120382524,
"loss": 158.5516,
"step": 700
},
{
"epoch": 0.2,
"grad_norm": 0.11745048314332962,
"learning_rate": 0.0002999888618038627,
"loss": 154.1395,
"step": 800
},
{
"epoch": 0.225,
"grad_norm": 0.22588345408439636,
"learning_rate": 0.00029998511156947307,
"loss": 150.8583,
"step": 900
},
{
"epoch": 0.25,
"grad_norm": 0.1475830227136612,
"learning_rate": 0.0002999813613350834,
"loss": 148.7021,
"step": 1000
},
{
"epoch": 0.275,
"grad_norm": 0.14757394790649414,
"learning_rate": 0.00029997761110069375,
"loss": 145.111,
"step": 1100
},
{
"epoch": 0.3,
"grad_norm": 0.13360479474067688,
"learning_rate": 0.0002999738608663041,
"loss": 142.679,
"step": 1200
},
{
"epoch": 0.325,
"grad_norm": 0.11122659593820572,
"learning_rate": 0.0002999701106319145,
"loss": 140.4614,
"step": 1300
},
{
"epoch": 0.35,
"grad_norm": 0.10133378952741623,
"learning_rate": 0.0002999663603975248,
"loss": 137.84,
"step": 1400
},
{
"epoch": 0.375,
"grad_norm": 0.12196547538042068,
"learning_rate": 0.00029996261016313516,
"loss": 136.1062,
"step": 1500
},
{
"epoch": 0.4,
"grad_norm": 0.09694620966911316,
"learning_rate": 0.0002999588599287455,
"loss": 134.5708,
"step": 1600
},
{
"epoch": 0.425,
"grad_norm": 0.14449502527713776,
"learning_rate": 0.0002999551096943559,
"loss": 131.672,
"step": 1700
},
{
"epoch": 0.45,
"grad_norm": 0.10163229703903198,
"learning_rate": 0.0002999513594599662,
"loss": 128.9171,
"step": 1800
},
{
"epoch": 0.475,
"grad_norm": 0.09789746254682541,
"learning_rate": 0.00029994760922557657,
"loss": 127.3757,
"step": 1900
},
{
"epoch": 0.5,
"grad_norm": 0.0996888279914856,
"learning_rate": 0.00029994385899118693,
"loss": 124.4876,
"step": 2000
},
{
"epoch": 0.525,
"grad_norm": 0.08484259247779846,
"learning_rate": 0.0002999401087567973,
"loss": 122.1805,
"step": 2100
},
{
"epoch": 0.55,
"grad_norm": 0.11729967594146729,
"learning_rate": 0.0002999363585224076,
"loss": 117.8535,
"step": 2200
},
{
"epoch": 0.575,
"grad_norm": 0.1445324867963791,
"learning_rate": 0.000299932608288018,
"loss": 116.244,
"step": 2300
},
{
"epoch": 0.6,
"grad_norm": 0.11317744106054306,
"learning_rate": 0.0002999288580536283,
"loss": 113.5543,
"step": 2400
},
{
"epoch": 0.625,
"grad_norm": 0.09375651925802231,
"learning_rate": 0.0002999251078192387,
"loss": 110.8541,
"step": 2500
},
{
"epoch": 0.65,
"grad_norm": 0.0896710455417633,
"learning_rate": 0.000299921357584849,
"loss": 110.1387,
"step": 2600
},
{
"epoch": 0.675,
"grad_norm": 0.09820675849914551,
"learning_rate": 0.0002999176073504594,
"loss": 107.0062,
"step": 2700
},
{
"epoch": 0.7,
"grad_norm": 0.09842734783887863,
"learning_rate": 0.0002999138571160697,
"loss": 105.1786,
"step": 2800
},
{
"epoch": 0.725,
"grad_norm": 0.09370853751897812,
"learning_rate": 0.00029991010688168007,
"loss": 103.8245,
"step": 2900
},
{
"epoch": 0.75,
"grad_norm": 0.12121213972568512,
"learning_rate": 0.00029990635664729043,
"loss": 101.6897,
"step": 3000
},
{
"epoch": 0.775,
"grad_norm": 0.09974240511655807,
"learning_rate": 0.0002999026064129008,
"loss": 100.3376,
"step": 3100
},
{
"epoch": 0.8,
"grad_norm": 0.09277965873479843,
"learning_rate": 0.0002998988561785111,
"loss": 99.2098,
"step": 3200
},
{
"epoch": 0.825,
"grad_norm": 0.12521271407604218,
"learning_rate": 0.0002998951059441215,
"loss": 98.4138,
"step": 3300
},
{
"epoch": 0.85,
"grad_norm": 0.1051282286643982,
"learning_rate": 0.00029989135570973184,
"loss": 99.5873,
"step": 3400
},
{
"epoch": 0.875,
"grad_norm": 0.13997547328472137,
"learning_rate": 0.0002998876054753422,
"loss": 97.4617,
"step": 3500
},
{
"epoch": 0.9,
"grad_norm": 0.1003558561205864,
"learning_rate": 0.0002998838552409525,
"loss": 96.093,
"step": 3600
},
{
"epoch": 0.925,
"grad_norm": 0.09967362880706787,
"learning_rate": 0.0002998801050065629,
"loss": 93.6796,
"step": 3700
},
{
"epoch": 0.95,
"grad_norm": 0.13389019668102264,
"learning_rate": 0.00029987635477217325,
"loss": 92.9668,
"step": 3800
},
{
"epoch": 0.975,
"grad_norm": 0.10552455484867096,
"learning_rate": 0.0002998726045377836,
"loss": 91.9125,
"step": 3900
},
{
"epoch": 1.0,
"grad_norm": 0.10877016931772232,
"learning_rate": 0.00029986885430339393,
"loss": 91.2492,
"step": 4000
},
{
"epoch": 1.025,
"grad_norm": 0.09188541024923325,
"learning_rate": 0.0002998651040690043,
"loss": 88.3832,
"step": 4100
},
{
"epoch": 1.05,
"grad_norm": 0.10517989099025726,
"learning_rate": 0.0002998613538346146,
"loss": 87.4386,
"step": 4200
},
{
"epoch": 1.075,
"grad_norm": 0.08605173230171204,
"learning_rate": 0.000299857603600225,
"loss": 86.7098,
"step": 4300
},
{
"epoch": 1.1,
"grad_norm": 0.13910797238349915,
"learning_rate": 0.00029985385336583534,
"loss": 85.1566,
"step": 4400
},
{
"epoch": 1.125,
"grad_norm": 0.08505425602197647,
"learning_rate": 0.00029985010313144565,
"loss": 86.1376,
"step": 4500
},
{
"epoch": 1.15,
"grad_norm": 0.10330720990896225,
"learning_rate": 0.000299846352897056,
"loss": 84.9761,
"step": 4600
},
{
"epoch": 1.175,
"grad_norm": 0.1150883138179779,
"learning_rate": 0.0002998426026626664,
"loss": 83.4733,
"step": 4700
},
{
"epoch": 1.2,
"grad_norm": 0.08464270830154419,
"learning_rate": 0.00029983885242827675,
"loss": 84.0231,
"step": 4800
},
{
"epoch": 1.225,
"grad_norm": 0.11479545384645462,
"learning_rate": 0.00029983510219388707,
"loss": 82.2074,
"step": 4900
},
{
"epoch": 1.25,
"grad_norm": 0.10978193581104279,
"learning_rate": 0.00029983135195949743,
"loss": 81.2586,
"step": 5000
},
{
"epoch": 1.275,
"grad_norm": 0.10087323933839798,
"learning_rate": 0.0002998276017251078,
"loss": 80.0028,
"step": 5100
},
{
"epoch": 1.3,
"grad_norm": 0.0992458313703537,
"learning_rate": 0.00029982385149071816,
"loss": 81.4542,
"step": 5200
},
{
"epoch": 1.325,
"grad_norm": 0.08898110687732697,
"learning_rate": 0.0002998201012563285,
"loss": 80.3485,
"step": 5300
},
{
"epoch": 1.35,
"grad_norm": 0.11424868553876877,
"learning_rate": 0.00029981635102193884,
"loss": 79.4734,
"step": 5400
},
{
"epoch": 1.375,
"grad_norm": 0.09483993798494339,
"learning_rate": 0.0002998126007875492,
"loss": 78.8044,
"step": 5500
},
{
"epoch": 1.4,
"grad_norm": 0.08650317788124084,
"learning_rate": 0.0002998088505531596,
"loss": 78.476,
"step": 5600
},
{
"epoch": 1.425,
"grad_norm": 0.08040408045053482,
"learning_rate": 0.0002998051003187699,
"loss": 77.8633,
"step": 5700
},
{
"epoch": 1.45,
"grad_norm": 0.08953177183866501,
"learning_rate": 0.00029980135008438025,
"loss": 76.5257,
"step": 5800
},
{
"epoch": 1.475,
"grad_norm": 0.10908912867307663,
"learning_rate": 0.00029979759984999056,
"loss": 76.2689,
"step": 5900
},
{
"epoch": 1.5,
"grad_norm": 0.12598766386508942,
"learning_rate": 0.00029979384961560093,
"loss": 76.7776,
"step": 6000
},
{
"epoch": 1.525,
"grad_norm": 0.0955086201429367,
"learning_rate": 0.0002997900993812113,
"loss": 76.5905,
"step": 6100
},
{
"epoch": 1.55,
"grad_norm": 0.08597240597009659,
"learning_rate": 0.00029978634914682166,
"loss": 74.2009,
"step": 6200
},
{
"epoch": 1.575,
"grad_norm": 0.08754386007785797,
"learning_rate": 0.000299782598912432,
"loss": 74.1175,
"step": 6300
},
{
"epoch": 1.6,
"grad_norm": 0.12214329093694687,
"learning_rate": 0.00029977884867804234,
"loss": 73.2265,
"step": 6400
},
{
"epoch": 1.625,
"grad_norm": 0.08221092820167542,
"learning_rate": 0.0002997750984436527,
"loss": 72.1494,
"step": 6500
},
{
"epoch": 1.65,
"grad_norm": 0.1369631290435791,
"learning_rate": 0.0002997713482092631,
"loss": 73.5853,
"step": 6600
},
{
"epoch": 1.675,
"grad_norm": 0.0787581205368042,
"learning_rate": 0.0002997675979748734,
"loss": 72.0935,
"step": 6700
},
{
"epoch": 1.7,
"grad_norm": 0.07737889885902405,
"learning_rate": 0.00029976384774048375,
"loss": 71.3515,
"step": 6800
},
{
"epoch": 1.725,
"grad_norm": 0.11298476159572601,
"learning_rate": 0.0002997600975060941,
"loss": 71.5356,
"step": 6900
},
{
"epoch": 1.75,
"grad_norm": 0.07955294102430344,
"learning_rate": 0.0002997563472717045,
"loss": 71.9312,
"step": 7000
},
{
"epoch": 1.775,
"grad_norm": 0.11449731886386871,
"learning_rate": 0.0002997525970373148,
"loss": 70.1805,
"step": 7100
},
{
"epoch": 1.8,
"grad_norm": 0.07159914076328278,
"learning_rate": 0.00029974884680292516,
"loss": 70.1074,
"step": 7200
},
{
"epoch": 1.825,
"grad_norm": 0.07785623520612717,
"learning_rate": 0.00029974509656853553,
"loss": 70.5433,
"step": 7300
},
{
"epoch": 1.85,
"grad_norm": 0.0750761404633522,
"learning_rate": 0.0002997413463341459,
"loss": 68.6654,
"step": 7400
},
{
"epoch": 1.875,
"grad_norm": 0.0909292995929718,
"learning_rate": 0.0002997375960997562,
"loss": 69.5312,
"step": 7500
},
{
"epoch": 1.9,
"grad_norm": 0.1320108026266098,
"learning_rate": 0.00029973384586536657,
"loss": 67.3222,
"step": 7600
},
{
"epoch": 1.925,
"grad_norm": 0.12221457809209824,
"learning_rate": 0.0002997300956309769,
"loss": 66.3137,
"step": 7700
},
{
"epoch": 1.95,
"grad_norm": 0.11239924281835556,
"learning_rate": 0.00029972634539658725,
"loss": 67.8054,
"step": 7800
},
{
"epoch": 1.975,
"grad_norm": 0.0858956053853035,
"learning_rate": 0.0002997225951621976,
"loss": 67.9956,
"step": 7900
},
{
"epoch": 2.0,
"grad_norm": 0.10778280347585678,
"learning_rate": 0.000299718844927808,
"loss": 66.5141,
"step": 8000
},
{
"epoch": 2.025,
"grad_norm": 0.10166219621896744,
"learning_rate": 0.0002997150946934183,
"loss": 65.9891,
"step": 8100
},
{
"epoch": 2.05,
"grad_norm": 0.09062575548887253,
"learning_rate": 0.00029971134445902866,
"loss": 67.5705,
"step": 8200
},
{
"epoch": 2.075,
"grad_norm": 0.0936209186911583,
"learning_rate": 0.000299707594224639,
"loss": 65.6743,
"step": 8300
},
{
"epoch": 2.1,
"grad_norm": 0.08781470358371735,
"learning_rate": 0.00029970384399024934,
"loss": 66.3408,
"step": 8400
},
{
"epoch": 2.125,
"grad_norm": 0.18813404440879822,
"learning_rate": 0.0002997000937558597,
"loss": 65.7238,
"step": 8500
},
{
"epoch": 2.15,
"grad_norm": 0.09089367091655731,
"learning_rate": 0.00029969634352147007,
"loss": 64.8326,
"step": 8600
},
{
"epoch": 2.175,
"grad_norm": 0.09775424748659134,
"learning_rate": 0.00029969259328708044,
"loss": 64.9571,
"step": 8700
},
{
"epoch": 2.2,
"grad_norm": 0.07110758870840073,
"learning_rate": 0.00029968888055503464,
"loss": 64.1227,
"step": 8800
},
{
"epoch": 2.225,
"grad_norm": 0.08944450318813324,
"learning_rate": 0.000299685130320645,
"loss": 63.0563,
"step": 8900
},
{
"epoch": 2.25,
"grad_norm": 0.0880662053823471,
"learning_rate": 0.0002996813800862554,
"loss": 63.5158,
"step": 9000
},
{
"epoch": 2.275,
"grad_norm": 0.08363056182861328,
"learning_rate": 0.00029967762985186574,
"loss": 63.1458,
"step": 9100
},
{
"epoch": 2.3,
"grad_norm": 0.0970577672123909,
"learning_rate": 0.00029967387961747605,
"loss": 63.6672,
"step": 9200
},
{
"epoch": 2.325,
"grad_norm": 0.07709024846553802,
"learning_rate": 0.0002996701293830864,
"loss": 62.5691,
"step": 9300
},
{
"epoch": 2.35,
"grad_norm": 0.09662684798240662,
"learning_rate": 0.00029966637914869673,
"loss": 63.201,
"step": 9400
},
{
"epoch": 2.375,
"grad_norm": 0.09886329621076584,
"learning_rate": 0.0002996626289143071,
"loss": 61.905,
"step": 9500
},
{
"epoch": 2.4,
"grad_norm": 0.09152296930551529,
"learning_rate": 0.00029965887867991746,
"loss": 62.0162,
"step": 9600
},
{
"epoch": 2.425,
"grad_norm": 0.08669120818376541,
"learning_rate": 0.00029965512844552783,
"loss": 61.177,
"step": 9700
},
{
"epoch": 2.45,
"grad_norm": 0.08084509521722794,
"learning_rate": 0.00029965137821113814,
"loss": 60.4171,
"step": 9800
},
{
"epoch": 2.475,
"grad_norm": 0.07486914098262787,
"learning_rate": 0.0002996476279767485,
"loss": 60.7016,
"step": 9900
},
{
"epoch": 2.5,
"grad_norm": 0.09742671251296997,
"learning_rate": 0.0002996438777423589,
"loss": 60.1792,
"step": 10000
},
{
"epoch": 2.525,
"grad_norm": 0.0987100750207901,
"learning_rate": 0.00029964012750796924,
"loss": 61.4537,
"step": 10100
},
{
"epoch": 2.55,
"grad_norm": 0.06886423379182816,
"learning_rate": 0.00029963637727357955,
"loss": 61.8643,
"step": 10200
},
{
"epoch": 2.575,
"grad_norm": 0.082525834441185,
"learning_rate": 0.0002996326270391899,
"loss": 60.4919,
"step": 10300
},
{
"epoch": 2.6,
"grad_norm": 0.08272566646337509,
"learning_rate": 0.0002996288768048003,
"loss": 60.0661,
"step": 10400
},
{
"epoch": 2.625,
"grad_norm": 0.09038376808166504,
"learning_rate": 0.00029962512657041065,
"loss": 60.936,
"step": 10500
},
{
"epoch": 2.65,
"grad_norm": 0.07726665586233139,
"learning_rate": 0.00029962137633602096,
"loss": 59.5663,
"step": 10600
},
{
"epoch": 2.675,
"grad_norm": 0.07424433529376984,
"learning_rate": 0.00029961762610163133,
"loss": 59.158,
"step": 10700
},
{
"epoch": 2.7,
"grad_norm": 0.07766600698232651,
"learning_rate": 0.0002996138758672417,
"loss": 60.6268,
"step": 10800
},
{
"epoch": 2.725,
"grad_norm": 0.06614714115858078,
"learning_rate": 0.00029961012563285206,
"loss": 59.6028,
"step": 10900
},
{
"epoch": 2.75,
"grad_norm": 0.10867344588041306,
"learning_rate": 0.0002996063753984624,
"loss": 58.8979,
"step": 11000
},
{
"epoch": 2.775,
"grad_norm": 0.08278031647205353,
"learning_rate": 0.00029960262516407274,
"loss": 58.4585,
"step": 11100
},
{
"epoch": 2.8,
"grad_norm": 0.0777415856719017,
"learning_rate": 0.00029959887492968305,
"loss": 58.2955,
"step": 11200
},
{
"epoch": 2.825,
"grad_norm": 0.08938944339752197,
"learning_rate": 0.0002995951246952934,
"loss": 58.4243,
"step": 11300
},
{
"epoch": 2.85,
"grad_norm": 0.07335088402032852,
"learning_rate": 0.0002995913744609038,
"loss": 58.3433,
"step": 11400
},
{
"epoch": 2.875,
"grad_norm": 0.08737402409315109,
"learning_rate": 0.00029958762422651415,
"loss": 58.083,
"step": 11500
},
{
"epoch": 2.9,
"grad_norm": 0.08511873334646225,
"learning_rate": 0.00029958387399212446,
"loss": 57.179,
"step": 11600
},
{
"epoch": 2.925,
"grad_norm": 0.10887938737869263,
"learning_rate": 0.00029958012375773483,
"loss": 56.4871,
"step": 11700
},
{
"epoch": 2.95,
"grad_norm": 0.06436943262815475,
"learning_rate": 0.0002995763735233452,
"loss": 56.647,
"step": 11800
},
{
"epoch": 2.975,
"grad_norm": 0.0767776370048523,
"learning_rate": 0.00029957262328895556,
"loss": 56.8327,
"step": 11900
},
{
"epoch": 3.0,
"grad_norm": 0.07136838138103485,
"learning_rate": 0.0002995688730545659,
"loss": 56.1021,
"step": 12000
},
{
"epoch": 3.025,
"grad_norm": 0.07126389443874359,
"learning_rate": 0.00029956512282017624,
"loss": 54.9375,
"step": 12100
},
{
"epoch": 3.05,
"grad_norm": 0.08064913004636765,
"learning_rate": 0.0002995613725857866,
"loss": 55.8513,
"step": 12200
},
{
"epoch": 3.075,
"grad_norm": 0.09110742062330246,
"learning_rate": 0.0002995576223513969,
"loss": 55.3327,
"step": 12300
},
{
"epoch": 3.1,
"grad_norm": 0.0769059956073761,
"learning_rate": 0.0002995538721170073,
"loss": 54.0639,
"step": 12400
},
{
"epoch": 3.125,
"grad_norm": 0.06642630696296692,
"learning_rate": 0.0002995501218826176,
"loss": 53.5245,
"step": 12500
},
{
"epoch": 3.15,
"grad_norm": 0.07648100703954697,
"learning_rate": 0.000299546371648228,
"loss": 53.7525,
"step": 12600
},
{
"epoch": 3.175,
"grad_norm": 0.07088977843523026,
"learning_rate": 0.00029954262141383833,
"loss": 52.302,
"step": 12700
},
{
"epoch": 3.2,
"grad_norm": 0.07282839715480804,
"learning_rate": 0.0002995388711794487,
"loss": 52.6612,
"step": 12800
},
{
"epoch": 3.225,
"grad_norm": 0.07733161747455597,
"learning_rate": 0.000299535120945059,
"loss": 51.6131,
"step": 12900
},
{
"epoch": 3.25,
"grad_norm": 0.06774196773767471,
"learning_rate": 0.00029953137071066937,
"loss": 51.9959,
"step": 13000
},
{
"epoch": 3.275,
"grad_norm": 0.08115985989570618,
"learning_rate": 0.00029952762047627974,
"loss": 49.8227,
"step": 13100
},
{
"epoch": 3.3,
"grad_norm": 0.0886857658624649,
"learning_rate": 0.0002995238702418901,
"loss": 50.5718,
"step": 13200
},
{
"epoch": 3.325,
"grad_norm": 0.07071532309055328,
"learning_rate": 0.0002995201200075004,
"loss": 51.6469,
"step": 13300
},
{
"epoch": 3.35,
"grad_norm": 0.09553579241037369,
"learning_rate": 0.0002995163697731108,
"loss": 50.2462,
"step": 13400
},
{
"epoch": 3.375,
"grad_norm": 0.07065360993146896,
"learning_rate": 0.00029951261953872115,
"loss": 49.4932,
"step": 13500
},
{
"epoch": 3.4,
"grad_norm": 0.07770080119371414,
"learning_rate": 0.0002995088693043315,
"loss": 49.8068,
"step": 13600
},
{
"epoch": 3.425,
"grad_norm": 0.08060113340616226,
"learning_rate": 0.0002995051190699418,
"loss": 48.4129,
"step": 13700
},
{
"epoch": 3.45,
"grad_norm": 0.07022694498300552,
"learning_rate": 0.0002995013688355522,
"loss": 48.5766,
"step": 13800
},
{
"epoch": 3.475,
"grad_norm": 0.08857674151659012,
"learning_rate": 0.00029949761860116256,
"loss": 47.6903,
"step": 13900
},
{
"epoch": 3.5,
"grad_norm": 0.069500632584095,
"learning_rate": 0.0002994938683667729,
"loss": 48.2677,
"step": 14000
},
{
"epoch": 3.525,
"grad_norm": 0.08871123939752579,
"learning_rate": 0.00029949011813238324,
"loss": 46.9917,
"step": 14100
},
{
"epoch": 3.55,
"grad_norm": 0.08282507210969925,
"learning_rate": 0.0002994863678979936,
"loss": 47.6174,
"step": 14200
},
{
"epoch": 3.575,
"grad_norm": 0.07892107963562012,
"learning_rate": 0.0002994826176636039,
"loss": 47.7429,
"step": 14300
},
{
"epoch": 3.6,
"grad_norm": 0.08358065783977509,
"learning_rate": 0.00029947886742921434,
"loss": 46.8444,
"step": 14400
},
{
"epoch": 3.625,
"grad_norm": 0.08042451739311218,
"learning_rate": 0.00029947511719482465,
"loss": 47.1196,
"step": 14500
},
{
"epoch": 3.65,
"grad_norm": 0.07715913653373718,
"learning_rate": 0.000299471366960435,
"loss": 46.1787,
"step": 14600
},
{
"epoch": 3.675,
"grad_norm": 0.07201175391674042,
"learning_rate": 0.0002994676167260453,
"loss": 44.82,
"step": 14700
},
{
"epoch": 3.7,
"grad_norm": 0.07503117620944977,
"learning_rate": 0.0002994638664916557,
"loss": 45.3985,
"step": 14800
},
{
"epoch": 3.725,
"grad_norm": 0.08126576244831085,
"learning_rate": 0.00029946011625726606,
"loss": 44.4742,
"step": 14900
},
{
"epoch": 3.75,
"grad_norm": 0.07859744131565094,
"learning_rate": 0.0002994563660228764,
"loss": 44.9098,
"step": 15000
},
{
"epoch": 3.775,
"grad_norm": 0.09183020889759064,
"learning_rate": 0.00029945261578848674,
"loss": 44.9649,
"step": 15100
},
{
"epoch": 3.8,
"grad_norm": 0.07173748314380646,
"learning_rate": 0.0002994488655540971,
"loss": 44.2067,
"step": 15200
},
{
"epoch": 3.825,
"grad_norm": 0.07911107689142227,
"learning_rate": 0.00029944511531970747,
"loss": 43.3721,
"step": 15300
},
{
"epoch": 3.85,
"grad_norm": 0.0707039088010788,
"learning_rate": 0.00029944136508531783,
"loss": 43.5256,
"step": 15400
},
{
"epoch": 3.875,
"grad_norm": 0.08927769958972931,
"learning_rate": 0.00029943761485092815,
"loss": 42.8865,
"step": 15500
},
{
"epoch": 3.9,
"grad_norm": 0.0942542776465416,
"learning_rate": 0.0002994338646165385,
"loss": 43.4099,
"step": 15600
},
{
"epoch": 3.925,
"grad_norm": 0.07037200033664703,
"learning_rate": 0.0002994301143821489,
"loss": 43.2838,
"step": 15700
},
{
"epoch": 3.95,
"grad_norm": 0.07836440950632095,
"learning_rate": 0.00029942636414775924,
"loss": 42.6156,
"step": 15800
},
{
"epoch": 3.975,
"grad_norm": 0.1048571839928627,
"learning_rate": 0.00029942261391336956,
"loss": 41.1921,
"step": 15900
},
{
"epoch": 4.0,
"grad_norm": 0.07439113408327103,
"learning_rate": 0.0002994188636789799,
"loss": 40.3632,
"step": 16000
},
{
"epoch": 4.025,
"grad_norm": 0.07776340842247009,
"learning_rate": 0.00029941511344459023,
"loss": 41.4027,
"step": 16100
},
{
"epoch": 4.05,
"grad_norm": 0.08847617357969284,
"learning_rate": 0.0002994113632102006,
"loss": 39.8482,
"step": 16200
},
{
"epoch": 4.075,
"grad_norm": 0.07630669325590134,
"learning_rate": 0.00029940761297581097,
"loss": 39.8514,
"step": 16300
},
{
"epoch": 4.1,
"grad_norm": 0.09090664237737656,
"learning_rate": 0.0002994038627414213,
"loss": 39.827,
"step": 16400
},
{
"epoch": 4.125,
"grad_norm": 0.07954572886228561,
"learning_rate": 0.00029940011250703164,
"loss": 39.1342,
"step": 16500
},
{
"epoch": 4.15,
"grad_norm": 0.09102310240268707,
"learning_rate": 0.000299396362272642,
"loss": 39.2371,
"step": 16600
},
{
"epoch": 4.175,
"grad_norm": 0.08122776448726654,
"learning_rate": 0.0002993926120382524,
"loss": 38.2627,
"step": 16700
},
{
"epoch": 4.2,
"grad_norm": 0.0793018564581871,
"learning_rate": 0.0002993888618038627,
"loss": 37.7778,
"step": 16800
},
{
"epoch": 4.225,
"grad_norm": 0.08967263251543045,
"learning_rate": 0.00029938511156947306,
"loss": 37.3333,
"step": 16900
},
{
"epoch": 4.25,
"grad_norm": 0.08178253471851349,
"learning_rate": 0.0002993813613350834,
"loss": 37.0271,
"step": 17000
},
{
"epoch": 4.275,
"grad_norm": 0.07139851152896881,
"learning_rate": 0.0002993776111006938,
"loss": 36.2547,
"step": 17100
},
{
"epoch": 4.3,
"grad_norm": 0.0816299095749855,
"learning_rate": 0.0002993738608663041,
"loss": 35.7427,
"step": 17200
},
{
"epoch": 4.325,
"grad_norm": 0.08794036507606506,
"learning_rate": 0.00029937011063191447,
"loss": 36.1878,
"step": 17300
},
{
"epoch": 4.35,
"grad_norm": 0.07489024847745895,
"learning_rate": 0.00029936636039752483,
"loss": 35.8839,
"step": 17400
},
{
"epoch": 4.375,
"grad_norm": 0.07704652100801468,
"learning_rate": 0.0002993626101631352,
"loss": 34.6569,
"step": 17500
},
{
"epoch": 4.4,
"grad_norm": 0.08644381910562515,
"learning_rate": 0.0002993588974310894,
"loss": 36.0711,
"step": 17600
},
{
"epoch": 4.425,
"grad_norm": 0.0718245580792427,
"learning_rate": 0.00029935514719669977,
"loss": 34.2787,
"step": 17700
},
{
"epoch": 4.45,
"grad_norm": 0.06881660968065262,
"learning_rate": 0.0002993513969623101,
"loss": 34.3262,
"step": 17800
},
{
"epoch": 4.475,
"grad_norm": 0.09241487085819244,
"learning_rate": 0.00029934764672792045,
"loss": 32.8671,
"step": 17900
},
{
"epoch": 4.5,
"grad_norm": 0.10901615768671036,
"learning_rate": 0.0002993438964935308,
"loss": 32.8513,
"step": 18000
},
{
"epoch": 4.525,
"grad_norm": 0.10043422877788544,
"learning_rate": 0.0002993401462591412,
"loss": 33.2156,
"step": 18100
},
{
"epoch": 4.55,
"grad_norm": 0.0931539386510849,
"learning_rate": 0.0002993363960247515,
"loss": 32.9817,
"step": 18200
},
{
"epoch": 4.575,
"grad_norm": 0.07910791784524918,
"learning_rate": 0.00029933264579036186,
"loss": 32.266,
"step": 18300
},
{
"epoch": 4.6,
"grad_norm": 0.07403460144996643,
"learning_rate": 0.0002993288955559722,
"loss": 32.3611,
"step": 18400
},
{
"epoch": 4.625,
"grad_norm": 0.0901438444852829,
"learning_rate": 0.0002993251453215826,
"loss": 31.6647,
"step": 18500
},
{
"epoch": 4.65,
"grad_norm": 0.08572247624397278,
"learning_rate": 0.0002993213950871929,
"loss": 31.4374,
"step": 18600
},
{
"epoch": 4.675,
"grad_norm": 0.10135528445243835,
"learning_rate": 0.00029931764485280327,
"loss": 30.899,
"step": 18700
},
{
"epoch": 4.7,
"grad_norm": 0.07215873152017593,
"learning_rate": 0.00029931389461841364,
"loss": 30.9789,
"step": 18800
},
{
"epoch": 4.725,
"grad_norm": 0.08922874182462692,
"learning_rate": 0.000299310144384024,
"loss": 30.7143,
"step": 18900
},
{
"epoch": 4.75,
"grad_norm": 0.08180548250675201,
"learning_rate": 0.0002993063941496343,
"loss": 30.1035,
"step": 19000
},
{
"epoch": 4.775,
"grad_norm": 0.07757364213466644,
"learning_rate": 0.0002993026439152447,
"loss": 29.8003,
"step": 19100
},
{
"epoch": 4.8,
"grad_norm": 0.09399455040693283,
"learning_rate": 0.00029929889368085505,
"loss": 29.8595,
"step": 19200
},
{
"epoch": 4.825,
"grad_norm": 0.08426772803068161,
"learning_rate": 0.0002992951434464654,
"loss": 29.8153,
"step": 19300
},
{
"epoch": 4.85,
"grad_norm": 0.08488670736551285,
"learning_rate": 0.0002992913932120757,
"loss": 29.5577,
"step": 19400
},
{
"epoch": 4.875,
"grad_norm": 0.06904991716146469,
"learning_rate": 0.0002992876429776861,
"loss": 28.5755,
"step": 19500
},
{
"epoch": 4.9,
"grad_norm": 0.11179706454277039,
"learning_rate": 0.0002992838927432964,
"loss": 28.8428,
"step": 19600
},
{
"epoch": 4.925,
"grad_norm": 0.0724404975771904,
"learning_rate": 0.00029928014250890677,
"loss": 28.2313,
"step": 19700
},
{
"epoch": 4.95,
"grad_norm": 0.08049552142620087,
"learning_rate": 0.00029927639227451714,
"loss": 27.1596,
"step": 19800
},
{
"epoch": 4.975,
"grad_norm": 0.07410436868667603,
"learning_rate": 0.0002992726420401275,
"loss": 26.9374,
"step": 19900
},
{
"epoch": 5.0,
"grad_norm": 0.0729108527302742,
"learning_rate": 0.0002992688918057378,
"loss": 27.3767,
"step": 20000
},
{
"epoch": 5.025,
"grad_norm": 0.0834740698337555,
"learning_rate": 0.0002992651790736921,
"loss": 26.5892,
"step": 20100
},
{
"epoch": 5.05,
"grad_norm": 0.07734266668558121,
"learning_rate": 0.00029926142883930244,
"loss": 26.4578,
"step": 20200
},
{
"epoch": 5.075,
"grad_norm": 0.07236121594905853,
"learning_rate": 0.00029925767860491275,
"loss": 27.4309,
"step": 20300
},
{
"epoch": 5.1,
"grad_norm": 0.07896186411380768,
"learning_rate": 0.0002992539283705231,
"loss": 26.7645,
"step": 20400
},
{
"epoch": 5.125,
"grad_norm": 0.09544118493795395,
"learning_rate": 0.0002992501781361335,
"loss": 26.149,
"step": 20500
},
{
"epoch": 5.15,
"grad_norm": 0.07782524079084396,
"learning_rate": 0.00029924642790174385,
"loss": 25.7688,
"step": 20600
},
{
"epoch": 5.175,
"grad_norm": 0.07927709072828293,
"learning_rate": 0.00029924267766735416,
"loss": 25.8487,
"step": 20700
},
{
"epoch": 5.2,
"grad_norm": 0.07417237758636475,
"learning_rate": 0.00029923892743296453,
"loss": 25.6094,
"step": 20800
},
{
"epoch": 5.225,
"grad_norm": 0.09987534582614899,
"learning_rate": 0.0002992352147009188,
"loss": 25.3336,
"step": 20900
},
{
"epoch": 5.25,
"grad_norm": 0.08160518109798431,
"learning_rate": 0.00029923146446652916,
"loss": 25.2813,
"step": 21000
},
{
"epoch": 5.275,
"grad_norm": 0.07650009542703629,
"learning_rate": 0.00029922771423213947,
"loss": 25.0793,
"step": 21100
},
{
"epoch": 5.3,
"grad_norm": 0.07089775055646896,
"learning_rate": 0.00029922396399774983,
"loss": 24.9184,
"step": 21200
},
{
"epoch": 5.325,
"grad_norm": 0.10953019559383392,
"learning_rate": 0.00029922021376336015,
"loss": 24.5976,
"step": 21300
},
{
"epoch": 5.35,
"grad_norm": 0.07163265347480774,
"learning_rate": 0.00029921646352897057,
"loss": 24.3399,
"step": 21400
},
{
"epoch": 5.375,
"grad_norm": 0.08414668589830399,
"learning_rate": 0.0002992127132945809,
"loss": 23.6757,
"step": 21500
},
{
"epoch": 5.4,
"grad_norm": 0.07715445011854172,
"learning_rate": 0.00029920896306019125,
"loss": 24.2548,
"step": 21600
},
{
"epoch": 5.425,
"grad_norm": 0.1033063754439354,
"learning_rate": 0.00029920521282580156,
"loss": 23.3908,
"step": 21700
},
{
"epoch": 5.45,
"grad_norm": 0.0769144669175148,
"learning_rate": 0.0002992014625914119,
"loss": 23.693,
"step": 21800
},
{
"epoch": 5.475,
"grad_norm": 0.07799799740314484,
"learning_rate": 0.0002991977123570223,
"loss": 23.9314,
"step": 21900
},
{
"epoch": 5.5,
"grad_norm": 0.07105720043182373,
"learning_rate": 0.00029919396212263266,
"loss": 23.2387,
"step": 22000
},
{
"epoch": 5.525,
"grad_norm": 0.0878797098994255,
"learning_rate": 0.00029919021188824297,
"loss": 22.7268,
"step": 22100
},
{
"epoch": 5.55,
"grad_norm": 0.0924353376030922,
"learning_rate": 0.00029918646165385333,
"loss": 23.1994,
"step": 22200
},
{
"epoch": 5.575,
"grad_norm": 0.09924343973398209,
"learning_rate": 0.0002991827114194637,
"loss": 22.7976,
"step": 22300
},
{
"epoch": 5.6,
"grad_norm": 0.0845380574464798,
"learning_rate": 0.00029917896118507407,
"loss": 22.6053,
"step": 22400
},
{
"epoch": 5.625,
"grad_norm": 0.09131123870611191,
"learning_rate": 0.0002991752109506844,
"loss": 22.813,
"step": 22500
},
{
"epoch": 5.65,
"grad_norm": 0.08501371741294861,
"learning_rate": 0.00029917146071629474,
"loss": 22.3981,
"step": 22600
},
{
"epoch": 5.675,
"grad_norm": 0.10916517674922943,
"learning_rate": 0.0002991677104819051,
"loss": 21.6828,
"step": 22700
},
{
"epoch": 5.7,
"grad_norm": 0.08462018519639969,
"learning_rate": 0.0002991639602475155,
"loss": 22.0131,
"step": 22800
},
{
"epoch": 5.725,
"grad_norm": 0.09394313395023346,
"learning_rate": 0.0002991602100131258,
"loss": 21.7932,
"step": 22900
},
{
"epoch": 5.75,
"grad_norm": 0.08408233523368835,
"learning_rate": 0.00029915645977873615,
"loss": 21.8634,
"step": 23000
},
{
"epoch": 5.775,
"grad_norm": 0.0706961527466774,
"learning_rate": 0.00029915270954434647,
"loss": 21.6353,
"step": 23100
},
{
"epoch": 5.8,
"grad_norm": 0.08162959665060043,
"learning_rate": 0.00029914895930995683,
"loss": 21.356,
"step": 23200
},
{
"epoch": 5.825,
"grad_norm": 0.08196116983890533,
"learning_rate": 0.0002991452090755672,
"loss": 21.3074,
"step": 23300
},
{
"epoch": 5.85,
"grad_norm": 0.07449360191822052,
"learning_rate": 0.00029914145884117756,
"loss": 21.2129,
"step": 23400
},
{
"epoch": 5.875,
"grad_norm": 0.08260208368301392,
"learning_rate": 0.0002991377086067879,
"loss": 20.7806,
"step": 23500
},
{
"epoch": 5.9,
"grad_norm": 0.07383255660533905,
"learning_rate": 0.00029913395837239824,
"loss": 20.9318,
"step": 23600
},
{
"epoch": 5.925,
"grad_norm": 0.08240984380245209,
"learning_rate": 0.0002991302081380086,
"loss": 20.5751,
"step": 23700
},
{
"epoch": 5.95,
"grad_norm": 0.06921262294054031,
"learning_rate": 0.000299126457903619,
"loss": 20.9214,
"step": 23800
},
{
"epoch": 5.975,
"grad_norm": 0.07990318536758423,
"learning_rate": 0.0002991227076692293,
"loss": 20.6422,
"step": 23900
},
{
"epoch": 6.0,
"grad_norm": 0.083002008497715,
"learning_rate": 0.00029911895743483965,
"loss": 19.8315,
"step": 24000
},
{
"epoch": 6.025,
"grad_norm": 0.08495783805847168,
"learning_rate": 0.00029911520720045,
"loss": 20.1271,
"step": 24100
},
{
"epoch": 6.05,
"grad_norm": 0.1061740592122078,
"learning_rate": 0.00029911145696606033,
"loss": 20.1241,
"step": 24200
},
{
"epoch": 6.075,
"grad_norm": 0.08326783776283264,
"learning_rate": 0.0002991077067316707,
"loss": 19.5344,
"step": 24300
},
{
"epoch": 6.1,
"grad_norm": 0.08668112009763718,
"learning_rate": 0.00029910395649728106,
"loss": 19.8691,
"step": 24400
},
{
"epoch": 6.125,
"grad_norm": 0.07595008611679077,
"learning_rate": 0.00029910020626289143,
"loss": 19.5726,
"step": 24500
},
{
"epoch": 6.15,
"grad_norm": 0.09996142983436584,
"learning_rate": 0.00029909645602850174,
"loss": 19.3215,
"step": 24600
},
{
"epoch": 6.175,
"grad_norm": 0.07515228539705276,
"learning_rate": 0.0002990927057941121,
"loss": 19.5642,
"step": 24700
},
{
"epoch": 6.2,
"grad_norm": 0.06983605772256851,
"learning_rate": 0.0002990889555597224,
"loss": 19.1783,
"step": 24800
},
{
"epoch": 6.225,
"grad_norm": 0.07114838808774948,
"learning_rate": 0.0002990852053253328,
"loss": 19.0791,
"step": 24900
},
{
"epoch": 6.25,
"grad_norm": 0.08623602986335754,
"learning_rate": 0.00029908145509094315,
"loss": 19.5374,
"step": 25000
},
{
"epoch": 6.275,
"grad_norm": 0.09096742421388626,
"learning_rate": 0.0002990777048565535,
"loss": 18.8189,
"step": 25100
},
{
"epoch": 6.3,
"grad_norm": 0.08167672157287598,
"learning_rate": 0.00029907395462216383,
"loss": 18.4164,
"step": 25200
},
{
"epoch": 6.325,
"grad_norm": 0.08562010526657104,
"learning_rate": 0.0002990702043877742,
"loss": 18.3827,
"step": 25300
},
{
"epoch": 6.35,
"grad_norm": 0.08020398765802383,
"learning_rate": 0.00029906645415338456,
"loss": 18.8151,
"step": 25400
},
{
"epoch": 6.375,
"grad_norm": 0.08050194382667542,
"learning_rate": 0.00029906270391899493,
"loss": 17.9696,
"step": 25500
},
{
"epoch": 6.4,
"grad_norm": 0.09030721336603165,
"learning_rate": 0.00029905895368460524,
"loss": 17.9795,
"step": 25600
},
{
"epoch": 6.425,
"grad_norm": 0.09238829463720322,
"learning_rate": 0.0002990552034502156,
"loss": 17.8095,
"step": 25700
},
{
"epoch": 6.45,
"grad_norm": 0.08500493317842484,
"learning_rate": 0.000299051453215826,
"loss": 18.3223,
"step": 25800
},
{
"epoch": 6.475,
"grad_norm": 0.08180621266365051,
"learning_rate": 0.00029904770298143634,
"loss": 17.7836,
"step": 25900
},
{
"epoch": 6.5,
"grad_norm": 0.09796881675720215,
"learning_rate": 0.00029904395274704665,
"loss": 17.7483,
"step": 26000
},
{
"epoch": 6.525,
"grad_norm": 0.08432163298130035,
"learning_rate": 0.000299040202512657,
"loss": 18.2479,
"step": 26100
},
{
"epoch": 6.55,
"grad_norm": 0.08197837322950363,
"learning_rate": 0.0002990364897806113,
"loss": 17.7703,
"step": 26200
},
{
"epoch": 6.575,
"grad_norm": 0.07721620053052902,
"learning_rate": 0.00029903273954622164,
"loss": 17.1537,
"step": 26300
},
{
"epoch": 6.6,
"grad_norm": 0.0785108208656311,
"learning_rate": 0.00029902898931183196,
"loss": 17.5139,
"step": 26400
},
{
"epoch": 6.625,
"grad_norm": 0.08640828728675842,
"learning_rate": 0.0002990252390774423,
"loss": 16.7445,
"step": 26500
},
{
"epoch": 6.65,
"grad_norm": 0.09119407832622528,
"learning_rate": 0.00029902148884305263,
"loss": 17.1573,
"step": 26600
},
{
"epoch": 6.675,
"grad_norm": 0.07212173193693161,
"learning_rate": 0.000299017738608663,
"loss": 17.0759,
"step": 26700
},
{
"epoch": 6.7,
"grad_norm": 0.08220189809799194,
"learning_rate": 0.00029901398837427337,
"loss": 16.9119,
"step": 26800
},
{
"epoch": 6.725,
"grad_norm": 0.10024359822273254,
"learning_rate": 0.00029901023813988373,
"loss": 16.7596,
"step": 26900
},
{
"epoch": 6.75,
"grad_norm": 0.0850207731127739,
"learning_rate": 0.00029900648790549405,
"loss": 16.9184,
"step": 27000
},
{
"epoch": 6.775,
"grad_norm": 0.07585939019918442,
"learning_rate": 0.0002990027376711044,
"loss": 16.4899,
"step": 27100
},
{
"epoch": 6.8,
"grad_norm": 0.08519823104143143,
"learning_rate": 0.0002989989874367148,
"loss": 16.8922,
"step": 27200
},
{
"epoch": 6.825,
"grad_norm": 0.08368838578462601,
"learning_rate": 0.00029899523720232514,
"loss": 16.8136,
"step": 27300
},
{
"epoch": 6.85,
"grad_norm": 0.08928319811820984,
"learning_rate": 0.00029899148696793546,
"loss": 16.2412,
"step": 27400
},
{
"epoch": 6.875,
"grad_norm": 0.08436159044504166,
"learning_rate": 0.0002989877367335458,
"loss": 16.6282,
"step": 27500
},
{
"epoch": 6.9,
"grad_norm": 0.0907684713602066,
"learning_rate": 0.0002989839864991562,
"loss": 16.3234,
"step": 27600
},
{
"epoch": 6.925,
"grad_norm": 0.08816706389188766,
"learning_rate": 0.00029898023626476655,
"loss": 16.164,
"step": 27700
},
{
"epoch": 6.95,
"grad_norm": 0.08335541933774948,
"learning_rate": 0.00029897648603037687,
"loss": 16.1988,
"step": 27800
},
{
"epoch": 6.975,
"grad_norm": 0.07165244221687317,
"learning_rate": 0.00029897273579598723,
"loss": 16.1657,
"step": 27900
},
{
"epoch": 7.0,
"grad_norm": 0.0803430899977684,
"learning_rate": 0.0002989689855615976,
"loss": 15.7038,
"step": 28000
},
{
"epoch": 7.025,
"grad_norm": 0.0674068033695221,
"learning_rate": 0.0002989652353272079,
"loss": 15.5932,
"step": 28100
},
{
"epoch": 7.05,
"grad_norm": 0.07914315909147263,
"learning_rate": 0.0002989614850928183,
"loss": 16.1827,
"step": 28200
},
{
"epoch": 7.075,
"grad_norm": 0.0919245108962059,
"learning_rate": 0.0002989577348584286,
"loss": 15.7686,
"step": 28300
},
{
"epoch": 7.1,
"grad_norm": 0.09044385701417923,
"learning_rate": 0.00029895398462403895,
"loss": 15.6737,
"step": 28400
},
{
"epoch": 7.125,
"grad_norm": 0.08890822529792786,
"learning_rate": 0.0002989502343896493,
"loss": 15.8661,
"step": 28500
},
{
"epoch": 7.15,
"grad_norm": 0.08436182141304016,
"learning_rate": 0.0002989464841552597,
"loss": 15.5255,
"step": 28600
},
{
"epoch": 7.175,
"grad_norm": 0.08775323629379272,
"learning_rate": 0.00029894273392087,
"loss": 15.4992,
"step": 28700
},
{
"epoch": 7.2,
"grad_norm": 0.09018935263156891,
"learning_rate": 0.00029893898368648036,
"loss": 15.3418,
"step": 28800
},
{
"epoch": 7.225,
"grad_norm": 0.08356596529483795,
"learning_rate": 0.00029893523345209073,
"loss": 15.0965,
"step": 28900
},
{
"epoch": 7.25,
"grad_norm": 0.09058874845504761,
"learning_rate": 0.0002989314832177011,
"loss": 15.0762,
"step": 29000
},
{
"epoch": 7.275,
"grad_norm": 0.07803665101528168,
"learning_rate": 0.0002989277329833114,
"loss": 14.6331,
"step": 29100
},
{
"epoch": 7.3,
"grad_norm": 0.08148869127035141,
"learning_rate": 0.0002989239827489218,
"loss": 14.8405,
"step": 29200
},
{
"epoch": 7.325,
"grad_norm": 0.08294442296028137,
"learning_rate": 0.00029892023251453214,
"loss": 15.2037,
"step": 29300
},
{
"epoch": 7.35,
"grad_norm": 0.0803549587726593,
"learning_rate": 0.0002989164822801425,
"loss": 14.8633,
"step": 29400
},
{
"epoch": 7.375,
"grad_norm": 0.08180885016918182,
"learning_rate": 0.0002989127320457528,
"loss": 14.8036,
"step": 29500
},
{
"epoch": 7.4,
"grad_norm": 0.08756575733423233,
"learning_rate": 0.0002989089818113632,
"loss": 14.2077,
"step": 29600
},
{
"epoch": 7.425,
"grad_norm": 0.0851132944226265,
"learning_rate": 0.00029890523157697355,
"loss": 14.569,
"step": 29700
},
{
"epoch": 7.45,
"grad_norm": 0.08879829198122025,
"learning_rate": 0.0002989014813425839,
"loss": 14.5104,
"step": 29800
},
{
"epoch": 7.475,
"grad_norm": 0.0918511152267456,
"learning_rate": 0.00029889773110819423,
"loss": 14.3482,
"step": 29900
},
{
"epoch": 7.5,
"grad_norm": 0.07251127064228058,
"learning_rate": 0.0002988939808738046,
"loss": 14.2309,
"step": 30000
},
{
"epoch": 7.525,
"grad_norm": 0.07517971098423004,
"learning_rate": 0.0002988902306394149,
"loss": 14.0291,
"step": 30100
},
{
"epoch": 7.55,
"grad_norm": 0.08854610472917557,
"learning_rate": 0.00029888651790736917,
"loss": 14.1938,
"step": 30200
},
{
"epoch": 7.575,
"grad_norm": 0.0849192887544632,
"learning_rate": 0.00029888276767297954,
"loss": 14.4531,
"step": 30300
},
{
"epoch": 7.6,
"grad_norm": 0.08010224252939224,
"learning_rate": 0.0002988790174385899,
"loss": 14.2434,
"step": 30400
},
{
"epoch": 7.625,
"grad_norm": 0.09017332643270493,
"learning_rate": 0.0002988752672042002,
"loss": 14.2892,
"step": 30500
},
{
"epoch": 7.65,
"grad_norm": 0.08440462499856949,
"learning_rate": 0.0002988715544721545,
"loss": 13.8386,
"step": 30600
},
{
"epoch": 7.675,
"grad_norm": 0.08667606860399246,
"learning_rate": 0.00029886780423776484,
"loss": 13.9581,
"step": 30700
},
{
"epoch": 7.7,
"grad_norm": 0.08237945288419724,
"learning_rate": 0.0002988640540033752,
"loss": 13.8813,
"step": 30800
},
{
"epoch": 7.725,
"grad_norm": 0.09895262122154236,
"learning_rate": 0.0002988603037689855,
"loss": 13.7951,
"step": 30900
},
{
"epoch": 7.75,
"grad_norm": 0.07596876472234726,
"learning_rate": 0.0002988565535345959,
"loss": 13.7703,
"step": 31000
},
{
"epoch": 7.775,
"grad_norm": 0.07925312221050262,
"learning_rate": 0.00029885280330020625,
"loss": 13.4507,
"step": 31100
},
{
"epoch": 7.8,
"grad_norm": 0.06997061520814896,
"learning_rate": 0.0002988490530658166,
"loss": 13.2481,
"step": 31200
},
{
"epoch": 7.825,
"grad_norm": 0.07986485958099365,
"learning_rate": 0.00029884530283142693,
"loss": 13.3403,
"step": 31300
},
{
"epoch": 7.85,
"grad_norm": 0.0819752886891365,
"learning_rate": 0.0002988415525970373,
"loss": 13.5279,
"step": 31400
},
{
"epoch": 7.875,
"grad_norm": 0.08534371107816696,
"learning_rate": 0.00029883780236264766,
"loss": 13.528,
"step": 31500
},
{
"epoch": 7.9,
"grad_norm": 0.06895570456981659,
"learning_rate": 0.00029883405212825803,
"loss": 13.0555,
"step": 31600
},
{
"epoch": 7.925,
"grad_norm": 0.07396534085273743,
"learning_rate": 0.00029883030189386834,
"loss": 13.1404,
"step": 31700
},
{
"epoch": 7.95,
"grad_norm": 0.0788232609629631,
"learning_rate": 0.00029882655165947865,
"loss": 13.1032,
"step": 31800
},
{
"epoch": 7.975,
"grad_norm": 0.0716477558016777,
"learning_rate": 0.000298822801425089,
"loss": 13.4664,
"step": 31900
},
{
"epoch": 8.0,
"grad_norm": 0.07852466404438019,
"learning_rate": 0.0002988190511906994,
"loss": 13.006,
"step": 32000
},
{
"epoch": 8.025,
"grad_norm": 0.1100274920463562,
"learning_rate": 0.00029881530095630975,
"loss": 13.0427,
"step": 32100
},
{
"epoch": 8.05,
"grad_norm": 0.07130661606788635,
"learning_rate": 0.00029881155072192006,
"loss": 12.7575,
"step": 32200
},
{
"epoch": 8.075,
"grad_norm": 0.0846419557929039,
"learning_rate": 0.00029880780048753043,
"loss": 12.788,
"step": 32300
},
{
"epoch": 8.1,
"grad_norm": 0.07769067585468292,
"learning_rate": 0.0002988040502531408,
"loss": 12.8833,
"step": 32400
},
{
"epoch": 8.125,
"grad_norm": 0.06623586267232895,
"learning_rate": 0.00029880030001875116,
"loss": 12.5255,
"step": 32500
},
{
"epoch": 8.15,
"grad_norm": 0.0744013637304306,
"learning_rate": 0.00029879654978436147,
"loss": 12.7006,
"step": 32600
},
{
"epoch": 8.175,
"grad_norm": 0.07793931663036346,
"learning_rate": 0.00029879279954997184,
"loss": 12.2209,
"step": 32700
},
{
"epoch": 8.2,
"grad_norm": 0.07592390477657318,
"learning_rate": 0.0002987890493155822,
"loss": 12.2655,
"step": 32800
},
{
"epoch": 8.225,
"grad_norm": 0.07824064791202545,
"learning_rate": 0.00029878529908119257,
"loss": 12.3666,
"step": 32900
},
{
"epoch": 8.25,
"grad_norm": 0.06895022094249725,
"learning_rate": 0.0002987815488468029,
"loss": 12.3957,
"step": 33000
},
{
"epoch": 8.275,
"grad_norm": 0.08005383610725403,
"learning_rate": 0.00029877779861241325,
"loss": 12.3892,
"step": 33100
},
{
"epoch": 8.3,
"grad_norm": 0.0835549384355545,
"learning_rate": 0.0002987740483780236,
"loss": 12.1796,
"step": 33200
},
{
"epoch": 8.325,
"grad_norm": 0.08501383662223816,
"learning_rate": 0.000298770298143634,
"loss": 11.9921,
"step": 33300
},
{
"epoch": 8.35,
"grad_norm": 0.08822602778673172,
"learning_rate": 0.0002987665479092443,
"loss": 12.4392,
"step": 33400
},
{
"epoch": 8.375,
"grad_norm": 0.07659414410591125,
"learning_rate": 0.00029876279767485466,
"loss": 12.0612,
"step": 33500
},
{
"epoch": 8.4,
"grad_norm": 0.08337811380624771,
"learning_rate": 0.00029875904744046497,
"loss": 12.0035,
"step": 33600
},
{
"epoch": 8.425,
"grad_norm": 0.07944267988204956,
"learning_rate": 0.00029875529720607534,
"loss": 11.8415,
"step": 33700
},
{
"epoch": 8.45,
"grad_norm": 0.0773790031671524,
"learning_rate": 0.0002987515469716857,
"loss": 12.1775,
"step": 33800
},
{
"epoch": 8.475,
"grad_norm": 0.08871705085039139,
"learning_rate": 0.00029874779673729607,
"loss": 12.208,
"step": 33900
},
{
"epoch": 8.5,
"grad_norm": 0.07573138922452927,
"learning_rate": 0.0002987440465029064,
"loss": 11.6756,
"step": 34000
},
{
"epoch": 8.525,
"grad_norm": 0.07265728712081909,
"learning_rate": 0.00029874029626851675,
"loss": 11.4454,
"step": 34100
},
{
"epoch": 8.55,
"grad_norm": 0.0791819617152214,
"learning_rate": 0.0002987365460341271,
"loss": 11.9128,
"step": 34200
},
{
"epoch": 8.575,
"grad_norm": 0.07876613736152649,
"learning_rate": 0.0002987327957997375,
"loss": 11.7746,
"step": 34300
},
{
"epoch": 8.6,
"grad_norm": 0.08273490518331528,
"learning_rate": 0.0002987290455653478,
"loss": 11.6367,
"step": 34400
},
{
"epoch": 8.625,
"grad_norm": 0.07402598857879639,
"learning_rate": 0.00029872529533095816,
"loss": 11.6052,
"step": 34500
},
{
"epoch": 8.65,
"grad_norm": 0.06618580222129822,
"learning_rate": 0.0002987215825989124,
"loss": 11.7364,
"step": 34600
},
{
"epoch": 8.675,
"grad_norm": 0.07777924090623856,
"learning_rate": 0.0002987178323645228,
"loss": 11.3839,
"step": 34700
},
{
"epoch": 8.7,
"grad_norm": 0.09256916493177414,
"learning_rate": 0.0002987140821301331,
"loss": 11.4444,
"step": 34800
},
{
"epoch": 8.725,
"grad_norm": 0.08080556988716125,
"learning_rate": 0.00029871033189574346,
"loss": 11.5891,
"step": 34900
},
{
"epoch": 8.75,
"grad_norm": 0.08270179480314255,
"learning_rate": 0.00029870658166135383,
"loss": 11.3784,
"step": 35000
},
{
"epoch": 8.775,
"grad_norm": 0.08168449997901917,
"learning_rate": 0.0002987028314269642,
"loss": 11.1576,
"step": 35100
},
{
"epoch": 8.8,
"grad_norm": 0.07069560140371323,
"learning_rate": 0.0002986990811925745,
"loss": 11.2748,
"step": 35200
},
{
"epoch": 8.825,
"grad_norm": 0.07771777361631393,
"learning_rate": 0.0002986953309581849,
"loss": 11.2124,
"step": 35300
},
{
"epoch": 8.85,
"grad_norm": 0.0844758003950119,
"learning_rate": 0.0002986915807237952,
"loss": 10.9886,
"step": 35400
},
{
"epoch": 8.875,
"grad_norm": 0.07531385868787766,
"learning_rate": 0.00029868783048940555,
"loss": 11.4722,
"step": 35500
},
{
"epoch": 8.9,
"grad_norm": 0.08248105645179749,
"learning_rate": 0.0002986840802550159,
"loss": 11.1052,
"step": 35600
},
{
"epoch": 8.925,
"grad_norm": 0.08126658946275711,
"learning_rate": 0.0002986803300206263,
"loss": 11.0637,
"step": 35700
},
{
"epoch": 8.95,
"grad_norm": 0.07933900505304337,
"learning_rate": 0.0002986765797862366,
"loss": 10.6369,
"step": 35800
},
{
"epoch": 8.975,
"grad_norm": 0.07628486305475235,
"learning_rate": 0.00029867282955184696,
"loss": 10.8511,
"step": 35900
},
{
"epoch": 9.0,
"grad_norm": 0.07509356737136841,
"learning_rate": 0.00029866907931745733,
"loss": 10.9576,
"step": 36000
},
{
"epoch": 9.025,
"grad_norm": 0.085249163210392,
"learning_rate": 0.00029866532908306764,
"loss": 10.9181,
"step": 36100
},
{
"epoch": 9.05,
"grad_norm": 0.08377708494663239,
"learning_rate": 0.000298661578848678,
"loss": 10.7095,
"step": 36200
},
{
"epoch": 9.075,
"grad_norm": 0.06539880484342575,
"learning_rate": 0.00029865786611663227,
"loss": 10.4937,
"step": 36300
},
{
"epoch": 9.1,
"grad_norm": 0.08634931594133377,
"learning_rate": 0.00029865411588224263,
"loss": 11.043,
"step": 36400
},
{
"epoch": 9.125,
"grad_norm": 0.06905148923397064,
"learning_rate": 0.00029865036564785295,
"loss": 11.0456,
"step": 36500
},
{
"epoch": 9.15,
"grad_norm": 0.07896845042705536,
"learning_rate": 0.0002986466154134633,
"loss": 10.5105,
"step": 36600
},
{
"epoch": 9.175,
"grad_norm": 0.07206033915281296,
"learning_rate": 0.0002986428651790737,
"loss": 10.7025,
"step": 36700
},
{
"epoch": 9.2,
"grad_norm": 0.06719633936882019,
"learning_rate": 0.00029863911494468405,
"loss": 10.3498,
"step": 36800
},
{
"epoch": 9.225,
"grad_norm": 0.07648395001888275,
"learning_rate": 0.00029863536471029436,
"loss": 10.4292,
"step": 36900
},
{
"epoch": 9.25,
"grad_norm": 0.08475750684738159,
"learning_rate": 0.0002986316144759047,
"loss": 10.5922,
"step": 37000
},
{
"epoch": 9.275,
"grad_norm": 0.09004350751638412,
"learning_rate": 0.00029862786424151504,
"loss": 10.3239,
"step": 37100
},
{
"epoch": 9.3,
"grad_norm": 0.06373389810323715,
"learning_rate": 0.0002986241140071254,
"loss": 10.2006,
"step": 37200
},
{
"epoch": 9.325,
"grad_norm": 0.07837036997079849,
"learning_rate": 0.00029862036377273577,
"loss": 10.193,
"step": 37300
},
{
"epoch": 9.35,
"grad_norm": 0.07210332155227661,
"learning_rate": 0.00029861661353834613,
"loss": 10.2084,
"step": 37400
},
{
"epoch": 9.375,
"grad_norm": 0.07254429906606674,
"learning_rate": 0.00029861286330395645,
"loss": 10.2551,
"step": 37500
},
{
"epoch": 9.4,
"grad_norm": 0.06640215963125229,
"learning_rate": 0.0002986091130695668,
"loss": 10.2847,
"step": 37600
},
{
"epoch": 9.425,
"grad_norm": 0.07777173817157745,
"learning_rate": 0.0002986053628351772,
"loss": 10.2434,
"step": 37700
},
{
"epoch": 9.45,
"grad_norm": 0.07829392701387405,
"learning_rate": 0.00029860161260078754,
"loss": 10.0319,
"step": 37800
},
{
"epoch": 9.475,
"grad_norm": 0.07961380481719971,
"learning_rate": 0.00029859786236639786,
"loss": 10.1739,
"step": 37900
},
{
"epoch": 9.5,
"grad_norm": 0.07749368995428085,
"learning_rate": 0.0002985941121320082,
"loss": 9.6391,
"step": 38000
},
{
"epoch": 9.525,
"grad_norm": 0.0826738029718399,
"learning_rate": 0.0002985903618976186,
"loss": 10.4704,
"step": 38100
},
{
"epoch": 9.55,
"grad_norm": 0.06573819369077682,
"learning_rate": 0.00029858661166322895,
"loss": 9.7767,
"step": 38200
},
{
"epoch": 9.575,
"grad_norm": 0.08020669966936111,
"learning_rate": 0.00029858286142883927,
"loss": 9.7305,
"step": 38300
},
{
"epoch": 9.6,
"grad_norm": 0.06815823167562485,
"learning_rate": 0.00029857911119444963,
"loss": 9.597,
"step": 38400
},
{
"epoch": 9.625,
"grad_norm": 0.07290255278348923,
"learning_rate": 0.0002985753984624039,
"loss": 9.8638,
"step": 38500
},
{
"epoch": 9.65,
"grad_norm": 0.06887535005807877,
"learning_rate": 0.00029857164822801426,
"loss": 9.6939,
"step": 38600
},
{
"epoch": 9.675,
"grad_norm": 0.08159805834293365,
"learning_rate": 0.00029856789799362457,
"loss": 9.8011,
"step": 38700
},
{
"epoch": 9.7,
"grad_norm": 0.08071273565292358,
"learning_rate": 0.00029856414775923494,
"loss": 9.5514,
"step": 38800
},
{
"epoch": 9.725,
"grad_norm": 0.07089462131261826,
"learning_rate": 0.00029856039752484525,
"loss": 9.8858,
"step": 38900
},
{
"epoch": 9.75,
"grad_norm": 0.08935658633708954,
"learning_rate": 0.0002985566472904556,
"loss": 9.6155,
"step": 39000
},
{
"epoch": 9.775,
"grad_norm": 0.08028286695480347,
"learning_rate": 0.000298552897056066,
"loss": 9.6638,
"step": 39100
},
{
"epoch": 9.8,
"grad_norm": 0.07186749577522278,
"learning_rate": 0.00029854914682167635,
"loss": 9.3091,
"step": 39200
},
{
"epoch": 9.825,
"grad_norm": 0.06545951217412949,
"learning_rate": 0.00029854539658728666,
"loss": 9.5374,
"step": 39300
},
{
"epoch": 9.85,
"grad_norm": 0.0787624716758728,
"learning_rate": 0.000298541646352897,
"loss": 9.4178,
"step": 39400
},
{
"epoch": 9.875,
"grad_norm": 0.07585486769676208,
"learning_rate": 0.0002985378961185074,
"loss": 9.2153,
"step": 39500
},
{
"epoch": 9.9,
"grad_norm": 0.07809693366289139,
"learning_rate": 0.0002985341458841177,
"loss": 9.2729,
"step": 39600
},
{
"epoch": 9.925,
"grad_norm": 0.12963560223579407,
"learning_rate": 0.00029853039564972807,
"loss": 9.1279,
"step": 39700
},
{
"epoch": 9.95,
"grad_norm": 0.06803625822067261,
"learning_rate": 0.00029852664541533844,
"loss": 9.3529,
"step": 39800
},
{
"epoch": 9.975,
"grad_norm": 0.07478567957878113,
"learning_rate": 0.0002985228951809488,
"loss": 9.1627,
"step": 39900
},
{
"epoch": 10.0,
"grad_norm": 0.07844047993421555,
"learning_rate": 0.0002985191449465591,
"loss": 9.0775,
"step": 40000
},
{
"epoch": 10.025,
"grad_norm": 0.07982715219259262,
"learning_rate": 0.0002985153947121695,
"loss": 9.4258,
"step": 40100
},
{
"epoch": 10.05,
"grad_norm": 0.0806502029299736,
"learning_rate": 0.00029851164447777985,
"loss": 9.3455,
"step": 40200
},
{
"epoch": 10.075,
"grad_norm": 0.06514900177717209,
"learning_rate": 0.0002985078942433902,
"loss": 8.9195,
"step": 40300
},
{
"epoch": 10.1,
"grad_norm": 0.08182831853628159,
"learning_rate": 0.0002985041440090005,
"loss": 8.9772,
"step": 40400
},
{
"epoch": 10.125,
"grad_norm": 0.07242997735738754,
"learning_rate": 0.0002985003937746109,
"loss": 9.3286,
"step": 40500
},
{
"epoch": 10.15,
"grad_norm": 0.07168876379728317,
"learning_rate": 0.0002984966435402212,
"loss": 8.8118,
"step": 40600
},
{
"epoch": 10.175,
"grad_norm": 0.07878579944372177,
"learning_rate": 0.00029849289330583157,
"loss": 9.0127,
"step": 40700
},
{
"epoch": 10.2,
"grad_norm": 0.06614303588867188,
"learning_rate": 0.00029848914307144194,
"loss": 8.8964,
"step": 40800
},
{
"epoch": 10.225,
"grad_norm": 0.07991635799407959,
"learning_rate": 0.0002984853928370523,
"loss": 8.7963,
"step": 40900
},
{
"epoch": 10.25,
"grad_norm": 0.07721689343452454,
"learning_rate": 0.0002984816426026626,
"loss": 8.797,
"step": 41000
},
{
"epoch": 10.275,
"grad_norm": 0.07666311413049698,
"learning_rate": 0.000298477892368273,
"loss": 8.6722,
"step": 41100
},
{
"epoch": 10.3,
"grad_norm": 0.0791340246796608,
"learning_rate": 0.00029847414213388335,
"loss": 8.6547,
"step": 41200
},
{
"epoch": 10.325,
"grad_norm": 0.0760653093457222,
"learning_rate": 0.0002984703918994937,
"loss": 8.696,
"step": 41300
},
{
"epoch": 10.35,
"grad_norm": 0.06864143908023834,
"learning_rate": 0.000298466641665104,
"loss": 8.8221,
"step": 41400
},
{
"epoch": 10.375,
"grad_norm": 0.07417836040258408,
"learning_rate": 0.0002984628914307144,
"loss": 8.5974,
"step": 41500
},
{
"epoch": 10.4,
"grad_norm": 0.073348268866539,
"learning_rate": 0.00029845914119632476,
"loss": 8.309,
"step": 41600
},
{
"epoch": 10.425,
"grad_norm": 0.0775461494922638,
"learning_rate": 0.0002984553909619351,
"loss": 8.6313,
"step": 41700
},
{
"epoch": 10.45,
"grad_norm": 0.07109999656677246,
"learning_rate": 0.00029845164072754543,
"loss": 8.3238,
"step": 41800
},
{
"epoch": 10.475,
"grad_norm": 0.06957342475652695,
"learning_rate": 0.0002984478904931558,
"loss": 8.3179,
"step": 41900
},
{
"epoch": 10.5,
"grad_norm": 0.07247728109359741,
"learning_rate": 0.00029844414025876617,
"loss": 8.3806,
"step": 42000
},
{
"epoch": 10.525,
"grad_norm": 0.08276287466287613,
"learning_rate": 0.00029844039002437653,
"loss": 8.495,
"step": 42100
},
{
"epoch": 10.55,
"grad_norm": 0.07794822007417679,
"learning_rate": 0.00029843663978998685,
"loss": 8.0454,
"step": 42200
},
{
"epoch": 10.575,
"grad_norm": 0.07254128903150558,
"learning_rate": 0.0002984328895555972,
"loss": 8.5174,
"step": 42300
},
{
"epoch": 10.6,
"grad_norm": 0.08386515080928802,
"learning_rate": 0.0002984291393212075,
"loss": 8.5586,
"step": 42400
},
{
"epoch": 10.625,
"grad_norm": 0.0731733962893486,
"learning_rate": 0.0002984254265891618,
"loss": 8.1163,
"step": 42500
},
{
"epoch": 10.65,
"grad_norm": 0.07960132509469986,
"learning_rate": 0.00029842167635477215,
"loss": 8.0072,
"step": 42600
},
{
"epoch": 10.675,
"grad_norm": 0.07048605382442474,
"learning_rate": 0.0002984179261203825,
"loss": 8.3243,
"step": 42700
},
{
"epoch": 10.7,
"grad_norm": 0.07215945422649384,
"learning_rate": 0.00029841417588599283,
"loss": 8.2795,
"step": 42800
},
{
"epoch": 10.725,
"grad_norm": 0.07723450660705566,
"learning_rate": 0.0002984104256516032,
"loss": 8.261,
"step": 42900
},
{
"epoch": 10.75,
"grad_norm": 0.06688930839300156,
"learning_rate": 0.00029840667541721356,
"loss": 8.1896,
"step": 43000
},
{
"epoch": 10.775,
"grad_norm": 0.07152280956506729,
"learning_rate": 0.00029840292518282393,
"loss": 7.8468,
"step": 43100
},
{
"epoch": 10.8,
"grad_norm": 0.0700908899307251,
"learning_rate": 0.00029839917494843424,
"loss": 8.2157,
"step": 43200
},
{
"epoch": 10.825,
"grad_norm": 0.08827432245016098,
"learning_rate": 0.0002983954247140446,
"loss": 8.2091,
"step": 43300
},
{
"epoch": 10.85,
"grad_norm": 0.07007287442684174,
"learning_rate": 0.00029839167447965497,
"loss": 8.2475,
"step": 43400
},
{
"epoch": 10.875,
"grad_norm": 0.07239579409360886,
"learning_rate": 0.0002983879242452653,
"loss": 7.9446,
"step": 43500
},
{
"epoch": 10.9,
"grad_norm": 0.06851651519536972,
"learning_rate": 0.00029838417401087565,
"loss": 7.9521,
"step": 43600
},
{
"epoch": 10.925,
"grad_norm": 0.07283764332532883,
"learning_rate": 0.00029838042377648596,
"loss": 7.9522,
"step": 43700
},
{
"epoch": 10.95,
"grad_norm": 0.06353294104337692,
"learning_rate": 0.0002983766735420964,
"loss": 7.9084,
"step": 43800
},
{
"epoch": 10.975,
"grad_norm": 0.07374967634677887,
"learning_rate": 0.0002983729608100506,
"loss": 7.6851,
"step": 43900
},
{
"epoch": 11.0,
"grad_norm": 0.08643588423728943,
"learning_rate": 0.00029836921057566096,
"loss": 7.7639,
"step": 44000
},
{
"epoch": 11.025,
"grad_norm": 0.06952405720949173,
"learning_rate": 0.00029836546034127127,
"loss": 7.8923,
"step": 44100
},
{
"epoch": 11.05,
"grad_norm": 0.0842747688293457,
"learning_rate": 0.00029836171010688163,
"loss": 7.7411,
"step": 44200
},
{
"epoch": 11.075,
"grad_norm": 0.07051684707403183,
"learning_rate": 0.000298357959872492,
"loss": 7.7914,
"step": 44300
},
{
"epoch": 11.1,
"grad_norm": 0.07264287769794464,
"learning_rate": 0.00029835420963810237,
"loss": 7.7216,
"step": 44400
},
{
"epoch": 11.125,
"grad_norm": 0.07382502406835556,
"learning_rate": 0.0002983504594037127,
"loss": 7.8505,
"step": 44500
},
{
"epoch": 11.15,
"grad_norm": 0.07358778268098831,
"learning_rate": 0.00029834670916932304,
"loss": 7.7822,
"step": 44600
},
{
"epoch": 11.175,
"grad_norm": 0.07758370041847229,
"learning_rate": 0.0002983429589349334,
"loss": 8.0006,
"step": 44700
},
{
"epoch": 11.2,
"grad_norm": 0.07674399763345718,
"learning_rate": 0.0002983392087005438,
"loss": 7.2497,
"step": 44800
},
{
"epoch": 11.225,
"grad_norm": 0.06659264862537384,
"learning_rate": 0.0002983354584661541,
"loss": 7.5115,
"step": 44900
},
{
"epoch": 11.25,
"grad_norm": 0.0640081837773323,
"learning_rate": 0.00029833170823176445,
"loss": 7.4374,
"step": 45000
},
{
"epoch": 11.275,
"grad_norm": 0.07784521579742432,
"learning_rate": 0.0002983279579973748,
"loss": 7.6097,
"step": 45100
},
{
"epoch": 11.3,
"grad_norm": 0.08755332231521606,
"learning_rate": 0.0002983242077629852,
"loss": 7.5832,
"step": 45200
},
{
"epoch": 11.325,
"grad_norm": 0.06300461292266846,
"learning_rate": 0.0002983204575285955,
"loss": 7.281,
"step": 45300
},
{
"epoch": 11.35,
"grad_norm": 0.06807196140289307,
"learning_rate": 0.00029831670729420586,
"loss": 7.2347,
"step": 45400
},
{
"epoch": 11.375,
"grad_norm": 0.07403436303138733,
"learning_rate": 0.00029831295705981623,
"loss": 7.0346,
"step": 45500
},
{
"epoch": 11.4,
"grad_norm": 0.07038521021604538,
"learning_rate": 0.0002983092068254266,
"loss": 7.6505,
"step": 45600
},
{
"epoch": 11.425,
"grad_norm": 0.08596746623516083,
"learning_rate": 0.0002983054565910369,
"loss": 7.2829,
"step": 45700
},
{
"epoch": 11.45,
"grad_norm": 0.06901860982179642,
"learning_rate": 0.0002983017063566473,
"loss": 7.4822,
"step": 45800
},
{
"epoch": 11.475,
"grad_norm": 0.07062174379825592,
"learning_rate": 0.0002982979561222576,
"loss": 7.2426,
"step": 45900
},
{
"epoch": 11.5,
"grad_norm": 0.06718676537275314,
"learning_rate": 0.00029829420588786795,
"loss": 7.2257,
"step": 46000
},
{
"epoch": 11.525,
"grad_norm": 0.10105819255113602,
"learning_rate": 0.0002982904556534783,
"loss": 7.1366,
"step": 46100
},
{
"epoch": 11.55,
"grad_norm": 0.06286392360925674,
"learning_rate": 0.0002982867054190887,
"loss": 7.4181,
"step": 46200
},
{
"epoch": 11.575,
"grad_norm": 0.09307048469781876,
"learning_rate": 0.000298282955184699,
"loss": 7.4101,
"step": 46300
},
{
"epoch": 11.6,
"grad_norm": 0.06440640985965729,
"learning_rate": 0.00029827920495030936,
"loss": 7.3866,
"step": 46400
},
{
"epoch": 11.625,
"grad_norm": 0.06852256506681442,
"learning_rate": 0.00029827545471591973,
"loss": 7.084,
"step": 46500
},
{
"epoch": 11.65,
"grad_norm": 0.06919901072978973,
"learning_rate": 0.0002982717044815301,
"loss": 6.9507,
"step": 46600
},
{
"epoch": 11.675,
"grad_norm": 0.0683809369802475,
"learning_rate": 0.0002982679542471404,
"loss": 7.1805,
"step": 46700
},
{
"epoch": 11.7,
"grad_norm": 0.06878841668367386,
"learning_rate": 0.0002982642040127508,
"loss": 7.2514,
"step": 46800
},
{
"epoch": 11.725,
"grad_norm": 0.06913451850414276,
"learning_rate": 0.00029826045377836114,
"loss": 6.9969,
"step": 46900
},
{
"epoch": 11.75,
"grad_norm": 0.06999741494655609,
"learning_rate": 0.0002982567035439715,
"loss": 6.8401,
"step": 47000
},
{
"epoch": 11.775,
"grad_norm": 0.07473236322402954,
"learning_rate": 0.0002982529533095818,
"loss": 6.8587,
"step": 47100
},
{
"epoch": 11.8,
"grad_norm": 0.07786587625741959,
"learning_rate": 0.0002982492030751922,
"loss": 7.0751,
"step": 47200
},
{
"epoch": 11.825,
"grad_norm": 0.0667233094573021,
"learning_rate": 0.00029824545284080255,
"loss": 6.9344,
"step": 47300
},
{
"epoch": 11.85,
"grad_norm": 0.07131955772638321,
"learning_rate": 0.0002982417026064129,
"loss": 7.0165,
"step": 47400
},
{
"epoch": 11.875,
"grad_norm": 0.08371793478727341,
"learning_rate": 0.00029823795237202323,
"loss": 6.7392,
"step": 47500
},
{
"epoch": 11.9,
"grad_norm": 0.07992976158857346,
"learning_rate": 0.00029823420213763354,
"loss": 6.7678,
"step": 47600
},
{
"epoch": 11.925,
"grad_norm": 0.07361280173063278,
"learning_rate": 0.0002982304519032439,
"loss": 6.5933,
"step": 47700
},
{
"epoch": 11.95,
"grad_norm": 0.0853012353181839,
"learning_rate": 0.00029822670166885427,
"loss": 6.6292,
"step": 47800
},
{
"epoch": 11.975,
"grad_norm": 0.07077699154615402,
"learning_rate": 0.00029822298893680853,
"loss": 7.045,
"step": 47900
},
{
"epoch": 12.0,
"grad_norm": 0.06884802132844925,
"learning_rate": 0.00029821923870241885,
"loss": 6.7302,
"step": 48000
},
{
"epoch": 12.025,
"grad_norm": 0.07187984138727188,
"learning_rate": 0.0002982154884680292,
"loss": 6.7884,
"step": 48100
},
{
"epoch": 12.05,
"grad_norm": 0.06950085610151291,
"learning_rate": 0.0002982117382336396,
"loss": 6.6858,
"step": 48200
},
{
"epoch": 12.075,
"grad_norm": 0.06879769265651703,
"learning_rate": 0.00029820798799924994,
"loss": 6.4815,
"step": 48300
},
{
"epoch": 12.1,
"grad_norm": 0.07400238513946533,
"learning_rate": 0.00029820423776486026,
"loss": 6.7837,
"step": 48400
},
{
"epoch": 12.125,
"grad_norm": 0.0689275860786438,
"learning_rate": 0.0002982004875304706,
"loss": 6.3745,
"step": 48500
},
{
"epoch": 12.15,
"grad_norm": 0.07304348796606064,
"learning_rate": 0.000298196737296081,
"loss": 6.7639,
"step": 48600
},
{
"epoch": 12.175,
"grad_norm": 0.07872481644153595,
"learning_rate": 0.00029819298706169135,
"loss": 6.5761,
"step": 48700
},
{
"epoch": 12.2,
"grad_norm": 0.06597219407558441,
"learning_rate": 0.00029818923682730167,
"loss": 6.6663,
"step": 48800
},
{
"epoch": 12.225,
"grad_norm": 0.060123708099126816,
"learning_rate": 0.00029818548659291203,
"loss": 6.5317,
"step": 48900
},
{
"epoch": 12.25,
"grad_norm": 0.07376055419445038,
"learning_rate": 0.00029818173635852234,
"loss": 6.4394,
"step": 49000
},
{
"epoch": 12.275,
"grad_norm": 0.06217016279697418,
"learning_rate": 0.00029817798612413277,
"loss": 6.4522,
"step": 49100
},
{
"epoch": 12.3,
"grad_norm": 0.06492452323436737,
"learning_rate": 0.0002981742358897431,
"loss": 6.5623,
"step": 49200
},
{
"epoch": 12.325,
"grad_norm": 0.08026625216007233,
"learning_rate": 0.00029817048565535344,
"loss": 6.3981,
"step": 49300
},
{
"epoch": 12.35,
"grad_norm": 0.07046521455049515,
"learning_rate": 0.00029816673542096376,
"loss": 6.4173,
"step": 49400
},
{
"epoch": 12.375,
"grad_norm": 0.07843586057424545,
"learning_rate": 0.0002981629851865741,
"loss": 6.499,
"step": 49500
},
{
"epoch": 12.4,
"grad_norm": 0.06976750493049622,
"learning_rate": 0.0002981592349521845,
"loss": 6.4019,
"step": 49600
},
{
"epoch": 12.425,
"grad_norm": 0.06601151078939438,
"learning_rate": 0.00029815548471779485,
"loss": 6.3474,
"step": 49700
},
{
"epoch": 12.45,
"grad_norm": 0.07471803575754166,
"learning_rate": 0.00029815173448340517,
"loss": 6.1884,
"step": 49800
},
{
"epoch": 12.475,
"grad_norm": 0.06310160458087921,
"learning_rate": 0.0002981480217513594,
"loss": 6.2996,
"step": 49900
},
{
"epoch": 12.5,
"grad_norm": 0.060027483850717545,
"learning_rate": 0.0002981442715169698,
"loss": 6.2398,
"step": 50000
},
{
"epoch": 12.525,
"grad_norm": 0.07511355727910995,
"learning_rate": 0.00029814052128258016,
"loss": 6.0126,
"step": 50100
},
{
"epoch": 12.55,
"grad_norm": 0.09251129627227783,
"learning_rate": 0.00029813677104819047,
"loss": 6.1201,
"step": 50200
},
{
"epoch": 12.575,
"grad_norm": 0.06512793153524399,
"learning_rate": 0.00029813302081380084,
"loss": 6.2464,
"step": 50300
},
{
"epoch": 12.6,
"grad_norm": 0.06275767832994461,
"learning_rate": 0.0002981292705794112,
"loss": 6.215,
"step": 50400
},
{
"epoch": 12.625,
"grad_norm": 0.07693471014499664,
"learning_rate": 0.00029812552034502157,
"loss": 6.1931,
"step": 50500
},
{
"epoch": 12.65,
"grad_norm": 0.06782624125480652,
"learning_rate": 0.0002981217701106319,
"loss": 6.3334,
"step": 50600
},
{
"epoch": 12.675,
"grad_norm": 0.06484679132699966,
"learning_rate": 0.00029811801987624225,
"loss": 5.9756,
"step": 50700
},
{
"epoch": 12.7,
"grad_norm": 0.07431244850158691,
"learning_rate": 0.0002981142696418526,
"loss": 6.2173,
"step": 50800
},
{
"epoch": 12.725,
"grad_norm": 0.07316889613866806,
"learning_rate": 0.000298110519407463,
"loss": 6.0987,
"step": 50900
},
{
"epoch": 12.75,
"grad_norm": 0.06565624475479126,
"learning_rate": 0.0002981067691730733,
"loss": 6.0928,
"step": 51000
},
{
"epoch": 12.775,
"grad_norm": 0.07335751503705978,
"learning_rate": 0.00029810301893868366,
"loss": 6.1505,
"step": 51100
},
{
"epoch": 12.8,
"grad_norm": 0.0684492215514183,
"learning_rate": 0.00029809926870429397,
"loss": 5.9197,
"step": 51200
},
{
"epoch": 12.825,
"grad_norm": 0.06604496389627457,
"learning_rate": 0.00029809551846990434,
"loss": 6.2255,
"step": 51300
},
{
"epoch": 12.85,
"grad_norm": 0.06465475261211395,
"learning_rate": 0.0002980917682355147,
"loss": 5.8412,
"step": 51400
},
{
"epoch": 12.875,
"grad_norm": 0.06663598865270615,
"learning_rate": 0.000298088018001125,
"loss": 5.7792,
"step": 51500
},
{
"epoch": 12.9,
"grad_norm": 0.06258101016283035,
"learning_rate": 0.0002980842677667354,
"loss": 5.7024,
"step": 51600
},
{
"epoch": 12.925,
"grad_norm": 0.06694167107343674,
"learning_rate": 0.00029808051753234575,
"loss": 5.9832,
"step": 51700
},
{
"epoch": 12.95,
"grad_norm": 0.06682337820529938,
"learning_rate": 0.0002980767672979561,
"loss": 5.8905,
"step": 51800
},
{
"epoch": 12.975,
"grad_norm": 0.07507793605327606,
"learning_rate": 0.0002980730545659103,
"loss": 5.8869,
"step": 51900
},
{
"epoch": 13.0,
"grad_norm": 0.0638195350766182,
"learning_rate": 0.0002980693043315207,
"loss": 5.9508,
"step": 52000
},
{
"epoch": 13.025,
"grad_norm": 0.089790940284729,
"learning_rate": 0.00029806555409713105,
"loss": 5.807,
"step": 52100
},
{
"epoch": 13.05,
"grad_norm": 0.06941410899162292,
"learning_rate": 0.0002980618038627414,
"loss": 5.8974,
"step": 52200
},
{
"epoch": 13.075,
"grad_norm": 0.06374108046293259,
"learning_rate": 0.00029805805362835173,
"loss": 6.02,
"step": 52300
},
{
"epoch": 13.1,
"grad_norm": 0.06581106036901474,
"learning_rate": 0.0002980543033939621,
"loss": 5.8285,
"step": 52400
},
{
"epoch": 13.125,
"grad_norm": 0.062402479350566864,
"learning_rate": 0.00029805055315957246,
"loss": 5.9327,
"step": 52500
},
{
"epoch": 13.15,
"grad_norm": 0.0768311470746994,
"learning_rate": 0.00029804680292518283,
"loss": 5.7586,
"step": 52600
},
{
"epoch": 13.175,
"grad_norm": 0.09206507354974747,
"learning_rate": 0.00029804305269079314,
"loss": 5.7239,
"step": 52700
},
{
"epoch": 13.2,
"grad_norm": 0.09109029918909073,
"learning_rate": 0.0002980393024564035,
"loss": 5.8506,
"step": 52800
},
{
"epoch": 13.225,
"grad_norm": 0.06463731825351715,
"learning_rate": 0.0002980355522220138,
"loss": 5.8716,
"step": 52900
},
{
"epoch": 13.25,
"grad_norm": 0.07239048928022385,
"learning_rate": 0.0002980318019876242,
"loss": 5.515,
"step": 53000
},
{
"epoch": 13.275,
"grad_norm": 0.06180089712142944,
"learning_rate": 0.00029802805175323455,
"loss": 5.4248,
"step": 53100
},
{
"epoch": 13.3,
"grad_norm": 0.05961550027132034,
"learning_rate": 0.0002980243015188449,
"loss": 5.8408,
"step": 53200
},
{
"epoch": 13.325,
"grad_norm": 0.06609106063842773,
"learning_rate": 0.00029802055128445523,
"loss": 5.5214,
"step": 53300
},
{
"epoch": 13.35,
"grad_norm": 0.07037625461816788,
"learning_rate": 0.0002980168010500656,
"loss": 5.6422,
"step": 53400
},
{
"epoch": 13.375,
"grad_norm": 0.05968979373574257,
"learning_rate": 0.00029801305081567596,
"loss": 5.4027,
"step": 53500
},
{
"epoch": 13.4,
"grad_norm": 0.06201528012752533,
"learning_rate": 0.00029800930058128633,
"loss": 5.5331,
"step": 53600
},
{
"epoch": 13.425,
"grad_norm": 0.07820463925600052,
"learning_rate": 0.00029800555034689664,
"loss": 5.6112,
"step": 53700
},
{
"epoch": 13.45,
"grad_norm": 0.07531889528036118,
"learning_rate": 0.000298001800112507,
"loss": 5.5128,
"step": 53800
},
{
"epoch": 13.475,
"grad_norm": 0.06690291315317154,
"learning_rate": 0.00029799808738046127,
"loss": 5.443,
"step": 53900
},
{
"epoch": 13.5,
"grad_norm": 0.08288581669330597,
"learning_rate": 0.00029799433714607163,
"loss": 5.5471,
"step": 54000
},
{
"epoch": 13.525,
"grad_norm": 0.06512220948934555,
"learning_rate": 0.00029799058691168195,
"loss": 5.475,
"step": 54100
},
{
"epoch": 13.55,
"grad_norm": 0.07862843573093414,
"learning_rate": 0.0002979868366772923,
"loss": 5.6017,
"step": 54200
},
{
"epoch": 13.575,
"grad_norm": 0.06599980592727661,
"learning_rate": 0.0002979830864429027,
"loss": 5.4367,
"step": 54300
},
{
"epoch": 13.6,
"grad_norm": 0.07014311850070953,
"learning_rate": 0.00029797933620851304,
"loss": 5.3765,
"step": 54400
},
{
"epoch": 13.625,
"grad_norm": 0.09498297423124313,
"learning_rate": 0.00029797558597412336,
"loss": 5.3329,
"step": 54500
},
{
"epoch": 13.65,
"grad_norm": 0.06557220965623856,
"learning_rate": 0.0002979718357397337,
"loss": 5.4082,
"step": 54600
},
{
"epoch": 13.675,
"grad_norm": 0.06320352107286453,
"learning_rate": 0.00029796808550534403,
"loss": 5.3671,
"step": 54700
},
{
"epoch": 13.7,
"grad_norm": 0.07630398869514465,
"learning_rate": 0.0002979643352709544,
"loss": 5.4613,
"step": 54800
},
{
"epoch": 13.725,
"grad_norm": 0.07285916805267334,
"learning_rate": 0.00029796058503656477,
"loss": 5.0222,
"step": 54900
},
{
"epoch": 13.75,
"grad_norm": 0.07314100861549377,
"learning_rate": 0.00029795683480217513,
"loss": 5.1593,
"step": 55000
},
{
"epoch": 13.775,
"grad_norm": 0.0632672905921936,
"learning_rate": 0.00029795308456778544,
"loss": 5.2524,
"step": 55100
},
{
"epoch": 13.8,
"grad_norm": 0.06146818399429321,
"learning_rate": 0.0002979493343333958,
"loss": 5.2068,
"step": 55200
},
{
"epoch": 13.825,
"grad_norm": 0.08438315987586975,
"learning_rate": 0.0002979455840990062,
"loss": 5.1854,
"step": 55300
},
{
"epoch": 13.85,
"grad_norm": 0.06263713538646698,
"learning_rate": 0.0002979418338646165,
"loss": 5.1888,
"step": 55400
},
{
"epoch": 13.875,
"grad_norm": 0.06485722959041595,
"learning_rate": 0.00029793808363022685,
"loss": 5.3774,
"step": 55500
},
{
"epoch": 13.9,
"grad_norm": 0.09563236683607101,
"learning_rate": 0.0002979343333958372,
"loss": 5.201,
"step": 55600
},
{
"epoch": 13.925,
"grad_norm": 0.06357564777135849,
"learning_rate": 0.0002979305831614476,
"loss": 5.1221,
"step": 55700
},
{
"epoch": 13.95,
"grad_norm": 0.06070085987448692,
"learning_rate": 0.0002979268329270579,
"loss": 5.1584,
"step": 55800
},
{
"epoch": 13.975,
"grad_norm": 0.0757615715265274,
"learning_rate": 0.00029792312019501216,
"loss": 5.0797,
"step": 55900
},
{
"epoch": 14.0,
"grad_norm": 0.07182688266038895,
"learning_rate": 0.0002979193699606225,
"loss": 5.2988,
"step": 56000
},
{
"epoch": 14.025,
"grad_norm": 0.06348109245300293,
"learning_rate": 0.0002979156197262329,
"loss": 4.992,
"step": 56100
},
{
"epoch": 14.05,
"grad_norm": 0.07352128624916077,
"learning_rate": 0.0002979118694918432,
"loss": 4.9483,
"step": 56200
},
{
"epoch": 14.075,
"grad_norm": 0.0681919977068901,
"learning_rate": 0.00029790811925745357,
"loss": 5.1792,
"step": 56300
},
{
"epoch": 14.1,
"grad_norm": 0.06682088226079941,
"learning_rate": 0.0002979043690230639,
"loss": 4.8559,
"step": 56400
},
{
"epoch": 14.125,
"grad_norm": 0.06291857361793518,
"learning_rate": 0.00029790061878867425,
"loss": 4.9382,
"step": 56500
},
{
"epoch": 14.15,
"grad_norm": 0.07243198156356812,
"learning_rate": 0.0002978968685542846,
"loss": 5.0399,
"step": 56600
},
{
"epoch": 14.175,
"grad_norm": 0.06961022317409515,
"learning_rate": 0.000297893118319895,
"loss": 5.0745,
"step": 56700
},
{
"epoch": 14.2,
"grad_norm": 0.06203046441078186,
"learning_rate": 0.0002978893680855053,
"loss": 5.1403,
"step": 56800
},
{
"epoch": 14.225,
"grad_norm": 0.06188129261136055,
"learning_rate": 0.00029788561785111566,
"loss": 4.9122,
"step": 56900
},
{
"epoch": 14.25,
"grad_norm": 0.05759645998477936,
"learning_rate": 0.000297881867616726,
"loss": 5.0696,
"step": 57000
},
{
"epoch": 14.275,
"grad_norm": 0.0592036135494709,
"learning_rate": 0.0002978781173823364,
"loss": 5.1164,
"step": 57100
},
{
"epoch": 14.3,
"grad_norm": 0.06267797201871872,
"learning_rate": 0.0002978743671479467,
"loss": 5.0722,
"step": 57200
},
{
"epoch": 14.325,
"grad_norm": 0.07611776143312454,
"learning_rate": 0.00029787061691355707,
"loss": 4.9118,
"step": 57300
},
{
"epoch": 14.35,
"grad_norm": 0.061794403940439224,
"learning_rate": 0.00029786686667916744,
"loss": 5.013,
"step": 57400
},
{
"epoch": 14.375,
"grad_norm": 0.2047680765390396,
"learning_rate": 0.0002978631164447778,
"loss": 4.7667,
"step": 57500
},
{
"epoch": 14.4,
"grad_norm": 0.0633254125714302,
"learning_rate": 0.0002978593662103881,
"loss": 4.8633,
"step": 57600
},
{
"epoch": 14.425,
"grad_norm": 0.06651504337787628,
"learning_rate": 0.0002978556159759985,
"loss": 4.9452,
"step": 57700
},
{
"epoch": 14.45,
"grad_norm": 0.07252359390258789,
"learning_rate": 0.00029785186574160885,
"loss": 4.8268,
"step": 57800
},
{
"epoch": 14.475,
"grad_norm": 0.07088153064250946,
"learning_rate": 0.0002978481530095631,
"loss": 4.7381,
"step": 57900
},
{
"epoch": 14.5,
"grad_norm": 0.06644707918167114,
"learning_rate": 0.0002978444027751734,
"loss": 4.6716,
"step": 58000
},
{
"epoch": 14.525,
"grad_norm": 0.06577486544847488,
"learning_rate": 0.0002978406525407838,
"loss": 4.8125,
"step": 58100
},
{
"epoch": 14.55,
"grad_norm": 0.06577962636947632,
"learning_rate": 0.0002978369023063941,
"loss": 4.6842,
"step": 58200
},
{
"epoch": 14.575,
"grad_norm": 0.060136351734399796,
"learning_rate": 0.00029783315207200446,
"loss": 4.6219,
"step": 58300
},
{
"epoch": 14.6,
"grad_norm": 0.06826278567314148,
"learning_rate": 0.00029782940183761483,
"loss": 4.7876,
"step": 58400
},
{
"epoch": 14.625,
"grad_norm": 0.06896788626909256,
"learning_rate": 0.0002978256516032252,
"loss": 4.8651,
"step": 58500
},
{
"epoch": 14.65,
"grad_norm": 0.06548253446817398,
"learning_rate": 0.0002978219013688355,
"loss": 4.9228,
"step": 58600
},
{
"epoch": 14.675,
"grad_norm": 0.08236391097307205,
"learning_rate": 0.0002978181511344459,
"loss": 4.7074,
"step": 58700
},
{
"epoch": 14.7,
"grad_norm": 0.06781431287527084,
"learning_rate": 0.00029781440090005624,
"loss": 5.0659,
"step": 58800
},
{
"epoch": 14.725,
"grad_norm": 0.06290601193904877,
"learning_rate": 0.0002978106506656666,
"loss": 4.8844,
"step": 58900
},
{
"epoch": 14.75,
"grad_norm": 0.0578296072781086,
"learning_rate": 0.0002978069004312769,
"loss": 4.7095,
"step": 59000
},
{
"epoch": 14.775,
"grad_norm": 0.05320196598768234,
"learning_rate": 0.0002978031501968873,
"loss": 4.6838,
"step": 59100
},
{
"epoch": 14.8,
"grad_norm": 0.07847319543361664,
"learning_rate": 0.00029779939996249765,
"loss": 4.7263,
"step": 59200
},
{
"epoch": 14.825,
"grad_norm": 0.07580792158842087,
"learning_rate": 0.00029779564972810796,
"loss": 4.5927,
"step": 59300
},
{
"epoch": 14.85,
"grad_norm": 0.06336116045713425,
"learning_rate": 0.00029779189949371833,
"loss": 4.6524,
"step": 59400
},
{
"epoch": 14.875,
"grad_norm": 0.0706322193145752,
"learning_rate": 0.0002977881492593287,
"loss": 4.5591,
"step": 59500
},
{
"epoch": 14.9,
"grad_norm": 0.09078390896320343,
"learning_rate": 0.00029778439902493906,
"loss": 4.6377,
"step": 59600
},
{
"epoch": 14.925,
"grad_norm": 0.07508181035518646,
"learning_rate": 0.00029778064879054937,
"loss": 4.4043,
"step": 59700
},
{
"epoch": 14.95,
"grad_norm": 0.06288613379001617,
"learning_rate": 0.00029777689855615974,
"loss": 4.5363,
"step": 59800
},
{
"epoch": 14.975,
"grad_norm": 0.0686824843287468,
"learning_rate": 0.00029777318582411395,
"loss": 4.6031,
"step": 59900
},
{
"epoch": 15.0,
"grad_norm": 0.0657496452331543,
"learning_rate": 0.0002977694355897243,
"loss": 4.4645,
"step": 60000
},
{
"epoch": 15.025,
"grad_norm": 0.0680643618106842,
"learning_rate": 0.0002977656853553347,
"loss": 4.6015,
"step": 60100
},
{
"epoch": 15.05,
"grad_norm": 0.06540867686271667,
"learning_rate": 0.00029776193512094504,
"loss": 4.4411,
"step": 60200
},
{
"epoch": 15.075,
"grad_norm": 0.060959845781326294,
"learning_rate": 0.00029775818488655536,
"loss": 4.2446,
"step": 60300
},
{
"epoch": 15.1,
"grad_norm": 0.07395045459270477,
"learning_rate": 0.0002977544346521657,
"loss": 4.4593,
"step": 60400
},
{
"epoch": 15.125,
"grad_norm": 0.0660228282213211,
"learning_rate": 0.0002977506844177761,
"loss": 4.2359,
"step": 60500
},
{
"epoch": 15.15,
"grad_norm": 0.06423047930002213,
"learning_rate": 0.00029774693418338645,
"loss": 4.4333,
"step": 60600
},
{
"epoch": 15.175,
"grad_norm": 0.07680130749940872,
"learning_rate": 0.00029774318394899677,
"loss": 4.4737,
"step": 60700
},
{
"epoch": 15.2,
"grad_norm": 0.0686013400554657,
"learning_rate": 0.00029773943371460713,
"loss": 4.301,
"step": 60800
},
{
"epoch": 15.225,
"grad_norm": 0.0519595630466938,
"learning_rate": 0.0002977356834802175,
"loss": 4.4112,
"step": 60900
},
{
"epoch": 15.25,
"grad_norm": 0.06710193306207657,
"learning_rate": 0.00029773193324582787,
"loss": 4.3652,
"step": 61000
},
{
"epoch": 15.275,
"grad_norm": 0.07808689773082733,
"learning_rate": 0.0002977281830114382,
"loss": 4.3473,
"step": 61100
},
{
"epoch": 15.3,
"grad_norm": 0.0767969936132431,
"learning_rate": 0.00029772443277704854,
"loss": 4.3302,
"step": 61200
},
{
"epoch": 15.325,
"grad_norm": 0.06145559623837471,
"learning_rate": 0.0002977206825426589,
"loss": 4.2091,
"step": 61300
},
{
"epoch": 15.35,
"grad_norm": 0.09096598625183105,
"learning_rate": 0.0002977169323082693,
"loss": 4.4397,
"step": 61400
},
{
"epoch": 15.375,
"grad_norm": 0.06596633046865463,
"learning_rate": 0.0002977131820738796,
"loss": 4.1544,
"step": 61500
},
{
"epoch": 15.4,
"grad_norm": 0.0632476657629013,
"learning_rate": 0.00029770943183948995,
"loss": 4.1507,
"step": 61600
},
{
"epoch": 15.425,
"grad_norm": 0.05707848072052002,
"learning_rate": 0.00029770568160510027,
"loss": 4.5147,
"step": 61700
},
{
"epoch": 15.45,
"grad_norm": 0.06603705137968063,
"learning_rate": 0.00029770193137071063,
"loss": 4.3091,
"step": 61800
},
{
"epoch": 15.475,
"grad_norm": 0.08647535741329193,
"learning_rate": 0.000297698181136321,
"loss": 4.4759,
"step": 61900
},
{
"epoch": 15.5,
"grad_norm": 0.0747227743268013,
"learning_rate": 0.00029769443090193136,
"loss": 4.3265,
"step": 62000
},
{
"epoch": 15.525,
"grad_norm": 0.06563801318407059,
"learning_rate": 0.0002976906806675417,
"loss": 4.5796,
"step": 62100
},
{
"epoch": 15.55,
"grad_norm": 0.06297031790018082,
"learning_rate": 0.00029768693043315204,
"loss": 4.2309,
"step": 62200
},
{
"epoch": 15.575,
"grad_norm": 0.05998208001255989,
"learning_rate": 0.0002976831801987624,
"loss": 4.244,
"step": 62300
},
{
"epoch": 15.6,
"grad_norm": 0.057426031678915024,
"learning_rate": 0.0002976794299643728,
"loss": 4.4331,
"step": 62400
},
{
"epoch": 15.625,
"grad_norm": 0.06295296549797058,
"learning_rate": 0.0002976756797299831,
"loss": 3.9931,
"step": 62500
},
{
"epoch": 15.65,
"grad_norm": 0.07305531948804855,
"learning_rate": 0.00029767192949559345,
"loss": 4.1127,
"step": 62600
},
{
"epoch": 15.675,
"grad_norm": 0.057404179126024246,
"learning_rate": 0.0002976681792612038,
"loss": 4.1888,
"step": 62700
},
{
"epoch": 15.7,
"grad_norm": 0.05540831759572029,
"learning_rate": 0.0002976644290268142,
"loss": 4.3955,
"step": 62800
},
{
"epoch": 15.725,
"grad_norm": 0.05315635725855827,
"learning_rate": 0.0002976606787924245,
"loss": 4.4189,
"step": 62900
},
{
"epoch": 15.75,
"grad_norm": 0.06974928081035614,
"learning_rate": 0.0002976569285580348,
"loss": 4.2076,
"step": 63000
},
{
"epoch": 15.775,
"grad_norm": 0.06797333806753159,
"learning_rate": 0.00029765317832364523,
"loss": 4.0685,
"step": 63100
},
{
"epoch": 15.8,
"grad_norm": 0.07094912976026535,
"learning_rate": 0.00029764942808925554,
"loss": 4.0277,
"step": 63200
},
{
"epoch": 15.825,
"grad_norm": 0.0728229507803917,
"learning_rate": 0.0002976456778548659,
"loss": 4.2609,
"step": 63300
},
{
"epoch": 15.85,
"grad_norm": 0.05918316915631294,
"learning_rate": 0.0002976419276204762,
"loss": 4.2609,
"step": 63400
},
{
"epoch": 15.875,
"grad_norm": 0.06454843282699585,
"learning_rate": 0.0002976381773860866,
"loss": 4.0982,
"step": 63500
},
{
"epoch": 15.9,
"grad_norm": 0.07737816870212555,
"learning_rate": 0.00029763442715169695,
"loss": 4.0363,
"step": 63600
},
{
"epoch": 15.925,
"grad_norm": 0.06324774026870728,
"learning_rate": 0.0002976306769173073,
"loss": 3.8008,
"step": 63700
},
{
"epoch": 15.95,
"grad_norm": 0.05786865949630737,
"learning_rate": 0.00029762692668291763,
"loss": 3.8747,
"step": 63800
},
{
"epoch": 15.975,
"grad_norm": 0.06020934507250786,
"learning_rate": 0.0002976232139508719,
"loss": 3.9662,
"step": 63900
},
{
"epoch": 16.0,
"grad_norm": 0.06533800065517426,
"learning_rate": 0.00029761946371648226,
"loss": 4.0263,
"step": 64000
},
{
"epoch": 16.025,
"grad_norm": 0.05861624330282211,
"learning_rate": 0.0002976157134820926,
"loss": 4.0456,
"step": 64100
},
{
"epoch": 16.05,
"grad_norm": 0.06453926116228104,
"learning_rate": 0.00029761196324770294,
"loss": 3.9041,
"step": 64200
},
{
"epoch": 16.075,
"grad_norm": 0.06458089500665665,
"learning_rate": 0.0002976082130133133,
"loss": 3.7986,
"step": 64300
},
{
"epoch": 16.1,
"grad_norm": 0.05067475885152817,
"learning_rate": 0.00029760446277892367,
"loss": 3.9836,
"step": 64400
},
{
"epoch": 16.125,
"grad_norm": 0.0557921938598156,
"learning_rate": 0.00029760071254453403,
"loss": 3.958,
"step": 64500
},
{
"epoch": 16.15,
"grad_norm": 0.05821559205651283,
"learning_rate": 0.00029759696231014435,
"loss": 4.0563,
"step": 64600
},
{
"epoch": 16.175,
"grad_norm": 0.06078817695379257,
"learning_rate": 0.0002975932120757547,
"loss": 4.0017,
"step": 64700
},
{
"epoch": 16.2,
"grad_norm": 0.07187299430370331,
"learning_rate": 0.0002975894618413651,
"loss": 3.7798,
"step": 64800
},
{
"epoch": 16.225,
"grad_norm": 0.05477326363325119,
"learning_rate": 0.00029758571160697544,
"loss": 3.7864,
"step": 64900
},
{
"epoch": 16.25,
"grad_norm": 0.06654859334230423,
"learning_rate": 0.00029758196137258576,
"loss": 3.9514,
"step": 65000
},
{
"epoch": 16.275,
"grad_norm": 0.0737365186214447,
"learning_rate": 0.0002975782111381961,
"loss": 3.9058,
"step": 65100
},
{
"epoch": 16.3,
"grad_norm": 0.06597916781902313,
"learning_rate": 0.00029757446090380643,
"loss": 3.9946,
"step": 65200
},
{
"epoch": 16.325,
"grad_norm": 0.05861925333738327,
"learning_rate": 0.0002975707106694168,
"loss": 3.9009,
"step": 65300
},
{
"epoch": 16.35,
"grad_norm": 0.06207166984677315,
"learning_rate": 0.00029756696043502717,
"loss": 3.9892,
"step": 65400
},
{
"epoch": 16.375,
"grad_norm": 0.07432432472705841,
"learning_rate": 0.00029756321020063753,
"loss": 3.7083,
"step": 65500
},
{
"epoch": 16.4,
"grad_norm": 0.05656394734978676,
"learning_rate": 0.00029755945996624784,
"loss": 3.8139,
"step": 65600
},
{
"epoch": 16.425,
"grad_norm": 0.07284687459468842,
"learning_rate": 0.0002975557097318582,
"loss": 3.8091,
"step": 65700
},
{
"epoch": 16.45,
"grad_norm": 0.06415148079395294,
"learning_rate": 0.0002975519594974686,
"loss": 3.8954,
"step": 65800
},
{
"epoch": 16.475,
"grad_norm": 0.06300424784421921,
"learning_rate": 0.00029754824676542284,
"loss": 3.5919,
"step": 65900
},
{
"epoch": 16.5,
"grad_norm": 0.06578180938959122,
"learning_rate": 0.00029754449653103315,
"loss": 3.7936,
"step": 66000
},
{
"epoch": 16.525,
"grad_norm": 0.07465810328722,
"learning_rate": 0.0002975407462966435,
"loss": 3.6781,
"step": 66100
},
{
"epoch": 16.55,
"grad_norm": 0.05531006306409836,
"learning_rate": 0.0002975369960622539,
"loss": 3.8176,
"step": 66200
},
{
"epoch": 16.575,
"grad_norm": 0.057088643312454224,
"learning_rate": 0.00029753324582786425,
"loss": 3.8375,
"step": 66300
},
{
"epoch": 16.6,
"grad_norm": 0.06409061700105667,
"learning_rate": 0.00029752949559347456,
"loss": 3.6946,
"step": 66400
},
{
"epoch": 16.625,
"grad_norm": 0.06034286320209503,
"learning_rate": 0.0002975257453590849,
"loss": 3.7127,
"step": 66500
},
{
"epoch": 16.65,
"grad_norm": 0.06990322470664978,
"learning_rate": 0.0002975219951246953,
"loss": 3.7908,
"step": 66600
},
{
"epoch": 16.675,
"grad_norm": 0.07301350682973862,
"learning_rate": 0.0002975182448903056,
"loss": 3.6067,
"step": 66700
},
{
"epoch": 16.7,
"grad_norm": 0.06309019029140472,
"learning_rate": 0.00029751449465591597,
"loss": 3.6552,
"step": 66800
},
{
"epoch": 16.725,
"grad_norm": 0.07269258797168732,
"learning_rate": 0.0002975107444215263,
"loss": 3.6489,
"step": 66900
},
{
"epoch": 16.75,
"grad_norm": 0.07549503445625305,
"learning_rate": 0.00029750699418713665,
"loss": 3.6146,
"step": 67000
},
{
"epoch": 16.775,
"grad_norm": 0.06944973766803741,
"learning_rate": 0.000297503243952747,
"loss": 3.592,
"step": 67100
},
{
"epoch": 16.8,
"grad_norm": 0.05656867474317551,
"learning_rate": 0.0002974994937183574,
"loss": 3.7087,
"step": 67200
},
{
"epoch": 16.825,
"grad_norm": 0.06444111466407776,
"learning_rate": 0.0002974957434839677,
"loss": 3.5458,
"step": 67300
},
{
"epoch": 16.85,
"grad_norm": 0.05399918928742409,
"learning_rate": 0.00029749199324957806,
"loss": 3.6962,
"step": 67400
},
{
"epoch": 16.875,
"grad_norm": 0.06424950808286667,
"learning_rate": 0.0002974882430151884,
"loss": 3.5515,
"step": 67500
},
{
"epoch": 16.9,
"grad_norm": 0.05898202210664749,
"learning_rate": 0.0002974844927807988,
"loss": 3.6593,
"step": 67600
},
{
"epoch": 16.925,
"grad_norm": 0.06607525050640106,
"learning_rate": 0.0002974807425464091,
"loss": 3.5478,
"step": 67700
},
{
"epoch": 16.95,
"grad_norm": 0.06299087405204773,
"learning_rate": 0.00029747699231201947,
"loss": 3.7256,
"step": 67800
},
{
"epoch": 16.975,
"grad_norm": 0.063835009932518,
"learning_rate": 0.00029747327957997373,
"loss": 3.538,
"step": 67900
},
{
"epoch": 17.0,
"grad_norm": 0.05786048248410225,
"learning_rate": 0.0002974695293455841,
"loss": 3.7246,
"step": 68000
},
{
"epoch": 17.025,
"grad_norm": 0.05804240703582764,
"learning_rate": 0.0002974657791111944,
"loss": 3.5207,
"step": 68100
},
{
"epoch": 17.05,
"grad_norm": 0.06179894134402275,
"learning_rate": 0.0002974620288768048,
"loss": 3.5634,
"step": 68200
},
{
"epoch": 17.075,
"grad_norm": 0.05166739225387573,
"learning_rate": 0.00029745827864241514,
"loss": 3.594,
"step": 68300
},
{
"epoch": 17.1,
"grad_norm": 0.05808790773153305,
"learning_rate": 0.0002974545284080255,
"loss": 3.4721,
"step": 68400
},
{
"epoch": 17.125,
"grad_norm": 0.058479100465774536,
"learning_rate": 0.0002974507781736358,
"loss": 3.4991,
"step": 68500
},
{
"epoch": 17.15,
"grad_norm": 0.06585648655891418,
"learning_rate": 0.0002974470279392462,
"loss": 3.4487,
"step": 68600
},
{
"epoch": 17.175,
"grad_norm": 0.07367991656064987,
"learning_rate": 0.0002974432777048565,
"loss": 3.715,
"step": 68700
},
{
"epoch": 17.2,
"grad_norm": 0.06693430244922638,
"learning_rate": 0.00029743952747046686,
"loss": 3.4574,
"step": 68800
},
{
"epoch": 17.225,
"grad_norm": 0.06379226595163345,
"learning_rate": 0.00029743577723607723,
"loss": 3.6117,
"step": 68900
},
{
"epoch": 17.25,
"grad_norm": 0.0511956624686718,
"learning_rate": 0.0002974320270016876,
"loss": 3.7448,
"step": 69000
},
{
"epoch": 17.275,
"grad_norm": 0.07336433976888657,
"learning_rate": 0.0002974282767672979,
"loss": 3.3539,
"step": 69100
},
{
"epoch": 17.3,
"grad_norm": 0.0531037300825119,
"learning_rate": 0.0002974245265329083,
"loss": 3.4722,
"step": 69200
},
{
"epoch": 17.325,
"grad_norm": 0.0836392492055893,
"learning_rate": 0.00029742077629851864,
"loss": 3.4829,
"step": 69300
},
{
"epoch": 17.35,
"grad_norm": 0.0543275885283947,
"learning_rate": 0.000297417026064129,
"loss": 3.3048,
"step": 69400
},
{
"epoch": 17.375,
"grad_norm": 0.05712301284074783,
"learning_rate": 0.0002974132758297393,
"loss": 3.3524,
"step": 69500
},
{
"epoch": 17.4,
"grad_norm": 0.07685862481594086,
"learning_rate": 0.0002974095255953497,
"loss": 3.4212,
"step": 69600
},
{
"epoch": 17.425,
"grad_norm": 0.06631585955619812,
"learning_rate": 0.00029740577536096005,
"loss": 3.3931,
"step": 69700
},
{
"epoch": 17.45,
"grad_norm": 0.05916072428226471,
"learning_rate": 0.0002974020251265704,
"loss": 3.4396,
"step": 69800
},
{
"epoch": 17.475,
"grad_norm": 0.06266429275274277,
"learning_rate": 0.0002973983123945246,
"loss": 3.618,
"step": 69900
},
{
"epoch": 17.5,
"grad_norm": 0.07458827644586563,
"learning_rate": 0.000297394562160135,
"loss": 3.3892,
"step": 70000
},
{
"epoch": 17.525,
"grad_norm": 0.05758730694651604,
"learning_rate": 0.00029739081192574536,
"loss": 3.3696,
"step": 70100
},
{
"epoch": 17.55,
"grad_norm": 0.061953071504831314,
"learning_rate": 0.0002973870616913557,
"loss": 3.2163,
"step": 70200
},
{
"epoch": 17.575,
"grad_norm": 0.06715140491724014,
"learning_rate": 0.00029738331145696603,
"loss": 3.5115,
"step": 70300
},
{
"epoch": 17.6,
"grad_norm": 0.06628040969371796,
"learning_rate": 0.0002973795612225764,
"loss": 3.4019,
"step": 70400
},
{
"epoch": 17.625,
"grad_norm": 0.06109810248017311,
"learning_rate": 0.0002973758109881867,
"loss": 3.327,
"step": 70500
},
{
"epoch": 17.65,
"grad_norm": 0.05486061051487923,
"learning_rate": 0.0002973720607537971,
"loss": 3.4603,
"step": 70600
},
{
"epoch": 17.675,
"grad_norm": 0.058648984879255295,
"learning_rate": 0.00029736831051940744,
"loss": 3.5709,
"step": 70700
},
{
"epoch": 17.7,
"grad_norm": 0.06253077834844589,
"learning_rate": 0.00029736456028501776,
"loss": 3.1556,
"step": 70800
},
{
"epoch": 17.725,
"grad_norm": 0.05633246898651123,
"learning_rate": 0.0002973608100506281,
"loss": 3.2396,
"step": 70900
},
{
"epoch": 17.75,
"grad_norm": 0.07230902463197708,
"learning_rate": 0.0002973570598162385,
"loss": 3.2617,
"step": 71000
},
{
"epoch": 17.775,
"grad_norm": 0.06703296303749084,
"learning_rate": 0.00029735330958184886,
"loss": 3.3253,
"step": 71100
},
{
"epoch": 17.8,
"grad_norm": 0.05392139405012131,
"learning_rate": 0.00029734955934745917,
"loss": 3.1445,
"step": 71200
},
{
"epoch": 17.825,
"grad_norm": 0.059445902705192566,
"learning_rate": 0.00029734580911306953,
"loss": 3.2005,
"step": 71300
},
{
"epoch": 17.85,
"grad_norm": 0.05022546648979187,
"learning_rate": 0.0002973420588786799,
"loss": 3.2086,
"step": 71400
},
{
"epoch": 17.875,
"grad_norm": 0.05383516103029251,
"learning_rate": 0.00029733830864429027,
"loss": 3.0877,
"step": 71500
},
{
"epoch": 17.9,
"grad_norm": 0.055024441331624985,
"learning_rate": 0.0002973345584099006,
"loss": 3.27,
"step": 71600
},
{
"epoch": 17.925,
"grad_norm": 0.0565604642033577,
"learning_rate": 0.00029733080817551094,
"loss": 3.4085,
"step": 71700
},
{
"epoch": 17.95,
"grad_norm": 0.056899093091487885,
"learning_rate": 0.0002973270579411213,
"loss": 3.3568,
"step": 71800
},
{
"epoch": 17.975,
"grad_norm": 0.06129912659525871,
"learning_rate": 0.00029732334520907557,
"loss": 3.1591,
"step": 71900
},
{
"epoch": 18.0,
"grad_norm": 0.06037045270204544,
"learning_rate": 0.0002973195949746859,
"loss": 3.3884,
"step": 72000
},
{
"epoch": 18.025,
"grad_norm": 0.059694815427064896,
"learning_rate": 0.00029731584474029625,
"loss": 3.0768,
"step": 72100
},
{
"epoch": 18.05,
"grad_norm": 0.06282085925340652,
"learning_rate": 0.00029731209450590656,
"loss": 3.3816,
"step": 72200
},
{
"epoch": 18.075,
"grad_norm": 0.05453978106379509,
"learning_rate": 0.00029730834427151693,
"loss": 3.1041,
"step": 72300
},
{
"epoch": 18.1,
"grad_norm": 0.0587979331612587,
"learning_rate": 0.0002973045940371273,
"loss": 3.1357,
"step": 72400
},
{
"epoch": 18.125,
"grad_norm": 0.05731925368309021,
"learning_rate": 0.00029730084380273766,
"loss": 3.0224,
"step": 72500
},
{
"epoch": 18.15,
"grad_norm": 0.05748147889971733,
"learning_rate": 0.00029729709356834797,
"loss": 3.1868,
"step": 72600
},
{
"epoch": 18.175,
"grad_norm": 0.2291877716779709,
"learning_rate": 0.00029729334333395834,
"loss": 3.5219,
"step": 72700
},
{
"epoch": 18.2,
"grad_norm": 0.05291415750980377,
"learning_rate": 0.0002972895930995687,
"loss": 3.0433,
"step": 72800
},
{
"epoch": 18.225,
"grad_norm": 0.05900726094841957,
"learning_rate": 0.00029728584286517907,
"loss": 3.2394,
"step": 72900
},
{
"epoch": 18.25,
"grad_norm": 0.05879193916916847,
"learning_rate": 0.0002972820926307894,
"loss": 3.2482,
"step": 73000
},
{
"epoch": 18.275,
"grad_norm": 0.061925821006298065,
"learning_rate": 0.00029727834239639975,
"loss": 3.1974,
"step": 73100
},
{
"epoch": 18.3,
"grad_norm": 0.07049068808555603,
"learning_rate": 0.0002972745921620101,
"loss": 3.2512,
"step": 73200
},
{
"epoch": 18.325,
"grad_norm": 0.06102385371923447,
"learning_rate": 0.0002972708419276205,
"loss": 3.1982,
"step": 73300
},
{
"epoch": 18.35,
"grad_norm": 0.05520262196660042,
"learning_rate": 0.0002972670916932308,
"loss": 3.054,
"step": 73400
},
{
"epoch": 18.375,
"grad_norm": 0.05517415702342987,
"learning_rate": 0.00029726334145884116,
"loss": 3.0914,
"step": 73500
},
{
"epoch": 18.4,
"grad_norm": 0.06400242447853088,
"learning_rate": 0.0002972595912244515,
"loss": 3.1063,
"step": 73600
},
{
"epoch": 18.425,
"grad_norm": 0.061084117740392685,
"learning_rate": 0.0002972558409900619,
"loss": 3.1149,
"step": 73700
},
{
"epoch": 18.45,
"grad_norm": 0.09352370351552963,
"learning_rate": 0.0002972520907556722,
"loss": 3.0725,
"step": 73800
},
{
"epoch": 18.475,
"grad_norm": 0.059218719601631165,
"learning_rate": 0.00029724837802362646,
"loss": 3.0702,
"step": 73900
},
{
"epoch": 18.5,
"grad_norm": 0.06091728433966637,
"learning_rate": 0.0002972446277892368,
"loss": 2.8734,
"step": 74000
},
{
"epoch": 18.525,
"grad_norm": 0.056753043085336685,
"learning_rate": 0.00029724087755484714,
"loss": 3.0829,
"step": 74100
},
{
"epoch": 18.55,
"grad_norm": 0.053419552743434906,
"learning_rate": 0.0002972371273204575,
"loss": 3.1694,
"step": 74200
},
{
"epoch": 18.575,
"grad_norm": 0.054798588156700134,
"learning_rate": 0.0002972333770860679,
"loss": 3.1102,
"step": 74300
},
{
"epoch": 18.6,
"grad_norm": 0.058476317673921585,
"learning_rate": 0.0002972296268516782,
"loss": 3.1152,
"step": 74400
},
{
"epoch": 18.625,
"grad_norm": 0.059114113450050354,
"learning_rate": 0.00029722587661728855,
"loss": 3.1364,
"step": 74500
},
{
"epoch": 18.65,
"grad_norm": 0.06834947317838669,
"learning_rate": 0.0002972221263828989,
"loss": 3.126,
"step": 74600
},
{
"epoch": 18.675,
"grad_norm": 0.05191313102841377,
"learning_rate": 0.00029721837614850923,
"loss": 3.0284,
"step": 74700
},
{
"epoch": 18.7,
"grad_norm": 0.07164154201745987,
"learning_rate": 0.0002972146259141196,
"loss": 2.9633,
"step": 74800
},
{
"epoch": 18.725,
"grad_norm": 0.05095268040895462,
"learning_rate": 0.00029721087567972996,
"loss": 3.0032,
"step": 74900
},
{
"epoch": 18.75,
"grad_norm": 0.05199890211224556,
"learning_rate": 0.00029720712544534033,
"loss": 3.0957,
"step": 75000
},
{
"epoch": 18.775,
"grad_norm": 0.08117477595806122,
"learning_rate": 0.00029720337521095064,
"loss": 3.0001,
"step": 75100
},
{
"epoch": 18.8,
"grad_norm": 0.05241430178284645,
"learning_rate": 0.000297199624976561,
"loss": 2.9402,
"step": 75200
},
{
"epoch": 18.825,
"grad_norm": 0.05886770412325859,
"learning_rate": 0.0002971958747421714,
"loss": 3.095,
"step": 75300
},
{
"epoch": 18.85,
"grad_norm": 0.05727067589759827,
"learning_rate": 0.00029719212450778174,
"loss": 2.9662,
"step": 75400
},
{
"epoch": 18.875,
"grad_norm": 0.0689665749669075,
"learning_rate": 0.00029718837427339205,
"loss": 2.7757,
"step": 75500
},
{
"epoch": 18.9,
"grad_norm": 0.05945652350783348,
"learning_rate": 0.0002971846240390024,
"loss": 2.822,
"step": 75600
},
{
"epoch": 18.925,
"grad_norm": 0.05478528141975403,
"learning_rate": 0.00029718087380461273,
"loss": 3.0564,
"step": 75700
},
{
"epoch": 18.95,
"grad_norm": 0.05541827157139778,
"learning_rate": 0.0002971771235702231,
"loss": 2.9453,
"step": 75800
},
{
"epoch": 18.975,
"grad_norm": 0.05722896754741669,
"learning_rate": 0.00029717341083817736,
"loss": 2.9093,
"step": 75900
},
{
"epoch": 19.0,
"grad_norm": 0.055735573172569275,
"learning_rate": 0.0002971696606037877,
"loss": 2.9792,
"step": 76000
},
{
"epoch": 19.025,
"grad_norm": 0.05422914773225784,
"learning_rate": 0.00029716591036939804,
"loss": 2.847,
"step": 76100
},
{
"epoch": 19.05,
"grad_norm": 0.059790875762701035,
"learning_rate": 0.0002971621601350084,
"loss": 2.9788,
"step": 76200
},
{
"epoch": 19.075,
"grad_norm": 0.07695723325014114,
"learning_rate": 0.00029715840990061877,
"loss": 2.9956,
"step": 76300
},
{
"epoch": 19.1,
"grad_norm": 0.0579293929040432,
"learning_rate": 0.00029715465966622913,
"loss": 3.2976,
"step": 76400
},
{
"epoch": 19.125,
"grad_norm": 0.05396733060479164,
"learning_rate": 0.00029715090943183945,
"loss": 2.9022,
"step": 76500
},
{
"epoch": 19.15,
"grad_norm": 0.056989822536706924,
"learning_rate": 0.0002971471591974498,
"loss": 2.7142,
"step": 76600
},
{
"epoch": 19.175,
"grad_norm": 0.05296149477362633,
"learning_rate": 0.0002971434089630602,
"loss": 2.8858,
"step": 76700
},
{
"epoch": 19.2,
"grad_norm": 0.061122532933950424,
"learning_rate": 0.00029713965872867054,
"loss": 2.7446,
"step": 76800
},
{
"epoch": 19.225,
"grad_norm": 0.05955662950873375,
"learning_rate": 0.00029713590849428086,
"loss": 2.6995,
"step": 76900
},
{
"epoch": 19.25,
"grad_norm": 0.0610017292201519,
"learning_rate": 0.0002971321582598912,
"loss": 2.7667,
"step": 77000
},
{
"epoch": 19.275,
"grad_norm": 0.05846131220459938,
"learning_rate": 0.0002971284080255016,
"loss": 2.7781,
"step": 77100
},
{
"epoch": 19.3,
"grad_norm": 0.05651117116212845,
"learning_rate": 0.00029712465779111195,
"loss": 2.814,
"step": 77200
},
{
"epoch": 19.325,
"grad_norm": 0.05765095725655556,
"learning_rate": 0.00029712090755672227,
"loss": 2.7334,
"step": 77300
},
{
"epoch": 19.35,
"grad_norm": 0.0659993514418602,
"learning_rate": 0.00029711715732233263,
"loss": 2.8981,
"step": 77400
},
{
"epoch": 19.375,
"grad_norm": 0.0573100671172142,
"learning_rate": 0.00029711340708794294,
"loss": 2.8433,
"step": 77500
},
{
"epoch": 19.4,
"grad_norm": 0.06855395436286926,
"learning_rate": 0.0002971096568535533,
"loss": 2.9283,
"step": 77600
},
{
"epoch": 19.425,
"grad_norm": 0.05601441487669945,
"learning_rate": 0.0002971059066191637,
"loss": 2.8565,
"step": 77700
},
{
"epoch": 19.45,
"grad_norm": 0.07347328960895538,
"learning_rate": 0.00029710215638477404,
"loss": 2.7694,
"step": 77800
},
{
"epoch": 19.475,
"grad_norm": 0.05399454012513161,
"learning_rate": 0.00029709844365272825,
"loss": 2.8265,
"step": 77900
},
{
"epoch": 19.5,
"grad_norm": 0.05960391089320183,
"learning_rate": 0.0002970946934183386,
"loss": 2.86,
"step": 78000
},
{
"epoch": 19.525,
"grad_norm": 0.050205573439598083,
"learning_rate": 0.000297090943183949,
"loss": 2.7896,
"step": 78100
},
{
"epoch": 19.55,
"grad_norm": 0.061351437121629715,
"learning_rate": 0.0002970871929495593,
"loss": 2.7925,
"step": 78200
},
{
"epoch": 19.575,
"grad_norm": 0.05008727312088013,
"learning_rate": 0.00029708344271516966,
"loss": 2.745,
"step": 78300
},
{
"epoch": 19.6,
"grad_norm": 0.05771077796816826,
"learning_rate": 0.0002970797299831239,
"loss": 2.8652,
"step": 78400
},
{
"epoch": 19.625,
"grad_norm": 0.053159620612859726,
"learning_rate": 0.0002970759797487343,
"loss": 2.8612,
"step": 78500
},
{
"epoch": 19.65,
"grad_norm": 0.05607482045888901,
"learning_rate": 0.0002970722295143446,
"loss": 2.8825,
"step": 78600
},
{
"epoch": 19.675,
"grad_norm": 0.05175361409783363,
"learning_rate": 0.00029706847927995497,
"loss": 2.9037,
"step": 78700
},
{
"epoch": 19.7,
"grad_norm": 0.059691160917282104,
"learning_rate": 0.00029706472904556533,
"loss": 2.745,
"step": 78800
},
{
"epoch": 19.725,
"grad_norm": 0.062432222068309784,
"learning_rate": 0.0002970609788111757,
"loss": 2.6383,
"step": 78900
},
{
"epoch": 19.75,
"grad_norm": 0.06708359718322754,
"learning_rate": 0.000297057228576786,
"loss": 2.5807,
"step": 79000
},
{
"epoch": 19.775,
"grad_norm": 0.060443244874477386,
"learning_rate": 0.0002970534783423964,
"loss": 2.7167,
"step": 79100
},
{
"epoch": 19.8,
"grad_norm": 0.060145530849695206,
"learning_rate": 0.0002970497281080067,
"loss": 2.7283,
"step": 79200
},
{
"epoch": 19.825,
"grad_norm": 0.06600401550531387,
"learning_rate": 0.00029704597787361705,
"loss": 2.8868,
"step": 79300
},
{
"epoch": 19.85,
"grad_norm": 0.0514482781291008,
"learning_rate": 0.0002970422276392274,
"loss": 2.59,
"step": 79400
},
{
"epoch": 19.875,
"grad_norm": 0.07618112862110138,
"learning_rate": 0.0002970384774048378,
"loss": 2.5086,
"step": 79500
},
{
"epoch": 19.9,
"grad_norm": 0.05636357143521309,
"learning_rate": 0.0002970347271704481,
"loss": 2.7034,
"step": 79600
},
{
"epoch": 19.925,
"grad_norm": 0.056812651455402374,
"learning_rate": 0.00029703097693605847,
"loss": 2.6996,
"step": 79700
},
{
"epoch": 19.95,
"grad_norm": 0.07078476995229721,
"learning_rate": 0.00029702722670166883,
"loss": 2.8029,
"step": 79800
},
{
"epoch": 19.975,
"grad_norm": 0.055067744106054306,
"learning_rate": 0.0002970234764672792,
"loss": 2.8455,
"step": 79900
},
{
"epoch": 20.0,
"grad_norm": 0.054148148745298386,
"learning_rate": 0.0002970197262328895,
"loss": 2.7438,
"step": 80000
},
{
"epoch": 20.025,
"grad_norm": 0.0576615035533905,
"learning_rate": 0.0002970159759984999,
"loss": 2.7244,
"step": 80100
},
{
"epoch": 20.05,
"grad_norm": 0.05849044770002365,
"learning_rate": 0.00029701222576411024,
"loss": 2.6015,
"step": 80200
},
{
"epoch": 20.075,
"grad_norm": 0.05542527511715889,
"learning_rate": 0.0002970084755297206,
"loss": 2.6276,
"step": 80300
},
{
"epoch": 20.1,
"grad_norm": 0.06275394558906555,
"learning_rate": 0.0002970047252953309,
"loss": 2.601,
"step": 80400
},
{
"epoch": 20.125,
"grad_norm": 0.05756799504160881,
"learning_rate": 0.0002970009750609413,
"loss": 2.6095,
"step": 80500
},
{
"epoch": 20.15,
"grad_norm": 0.05315446853637695,
"learning_rate": 0.00029699722482655165,
"loss": 2.8117,
"step": 80600
},
{
"epoch": 20.175,
"grad_norm": 0.06292139738798141,
"learning_rate": 0.000296993474592162,
"loss": 2.5364,
"step": 80700
},
{
"epoch": 20.2,
"grad_norm": 0.05451088026165962,
"learning_rate": 0.00029698972435777233,
"loss": 2.6838,
"step": 80800
},
{
"epoch": 20.225,
"grad_norm": 0.05063945800065994,
"learning_rate": 0.0002969859741233827,
"loss": 2.573,
"step": 80900
},
{
"epoch": 20.25,
"grad_norm": 0.058889806270599365,
"learning_rate": 0.000296982223888993,
"loss": 2.4947,
"step": 81000
},
{
"epoch": 20.275,
"grad_norm": 0.07975181192159653,
"learning_rate": 0.0002969784736546034,
"loss": 2.5364,
"step": 81100
},
{
"epoch": 20.3,
"grad_norm": 0.05763572081923485,
"learning_rate": 0.00029697472342021374,
"loss": 2.4907,
"step": 81200
},
{
"epoch": 20.325,
"grad_norm": 0.05867898836731911,
"learning_rate": 0.0002969709731858241,
"loss": 2.5361,
"step": 81300
},
{
"epoch": 20.35,
"grad_norm": 0.0528886653482914,
"learning_rate": 0.0002969672229514344,
"loss": 2.6669,
"step": 81400
},
{
"epoch": 20.375,
"grad_norm": 0.060931917279958725,
"learning_rate": 0.0002969634727170448,
"loss": 2.4697,
"step": 81500
},
{
"epoch": 20.4,
"grad_norm": 0.05871622636914253,
"learning_rate": 0.00029695972248265515,
"loss": 2.4717,
"step": 81600
},
{
"epoch": 20.425,
"grad_norm": 0.060853052884340286,
"learning_rate": 0.0002969559722482655,
"loss": 2.5891,
"step": 81700
},
{
"epoch": 20.45,
"grad_norm": 0.052957359701395035,
"learning_rate": 0.00029695222201387583,
"loss": 2.5919,
"step": 81800
},
{
"epoch": 20.475,
"grad_norm": 0.054768215864896774,
"learning_rate": 0.0002969484717794862,
"loss": 2.5348,
"step": 81900
},
{
"epoch": 20.5,
"grad_norm": 0.049939971417188644,
"learning_rate": 0.00029694472154509656,
"loss": 2.6501,
"step": 82000
},
{
"epoch": 20.525,
"grad_norm": 0.056562915444374084,
"learning_rate": 0.0002969409713107069,
"loss": 2.6031,
"step": 82100
},
{
"epoch": 20.55,
"grad_norm": 0.05061310529708862,
"learning_rate": 0.00029693722107631724,
"loss": 2.5924,
"step": 82200
},
{
"epoch": 20.575,
"grad_norm": 0.05474073067307472,
"learning_rate": 0.0002969334708419276,
"loss": 2.7109,
"step": 82300
},
{
"epoch": 20.6,
"grad_norm": 0.062750443816185,
"learning_rate": 0.00029692975810988187,
"loss": 2.5636,
"step": 82400
},
{
"epoch": 20.625,
"grad_norm": 0.05921516939997673,
"learning_rate": 0.0002969260078754922,
"loss": 2.4478,
"step": 82500
},
{
"epoch": 20.65,
"grad_norm": 0.06074066460132599,
"learning_rate": 0.00029692225764110254,
"loss": 2.5207,
"step": 82600
},
{
"epoch": 20.675,
"grad_norm": 0.06394727528095245,
"learning_rate": 0.00029691850740671286,
"loss": 2.7291,
"step": 82700
},
{
"epoch": 20.7,
"grad_norm": 0.06293661147356033,
"learning_rate": 0.0002969147571723232,
"loss": 2.5454,
"step": 82800
},
{
"epoch": 20.725,
"grad_norm": 0.049685824662446976,
"learning_rate": 0.0002969110069379336,
"loss": 2.7017,
"step": 82900
},
{
"epoch": 20.75,
"grad_norm": 0.0517297200858593,
"learning_rate": 0.00029690725670354396,
"loss": 2.5524,
"step": 83000
},
{
"epoch": 20.775,
"grad_norm": 0.061634670943021774,
"learning_rate": 0.00029690350646915427,
"loss": 2.4389,
"step": 83100
},
{
"epoch": 20.8,
"grad_norm": 0.06085900962352753,
"learning_rate": 0.00029689975623476463,
"loss": 2.4254,
"step": 83200
},
{
"epoch": 20.825,
"grad_norm": 0.05363364890217781,
"learning_rate": 0.000296896006000375,
"loss": 2.3591,
"step": 83300
},
{
"epoch": 20.85,
"grad_norm": 0.051609691232442856,
"learning_rate": 0.00029689225576598537,
"loss": 2.5282,
"step": 83400
},
{
"epoch": 20.875,
"grad_norm": 0.04989041015505791,
"learning_rate": 0.0002968885055315957,
"loss": 2.537,
"step": 83500
},
{
"epoch": 20.9,
"grad_norm": 0.053229689598083496,
"learning_rate": 0.00029688475529720604,
"loss": 2.5949,
"step": 83600
},
{
"epoch": 20.925,
"grad_norm": 0.05230165645480156,
"learning_rate": 0.0002968810050628164,
"loss": 2.4183,
"step": 83700
},
{
"epoch": 20.95,
"grad_norm": 0.05094073340296745,
"learning_rate": 0.0002968772548284268,
"loss": 2.6191,
"step": 83800
},
{
"epoch": 20.975,
"grad_norm": 0.05941576883196831,
"learning_rate": 0.0002968735045940371,
"loss": 2.3788,
"step": 83900
},
{
"epoch": 21.0,
"grad_norm": 0.05283214896917343,
"learning_rate": 0.00029686975435964745,
"loss": 2.5303,
"step": 84000
},
{
"epoch": 21.025,
"grad_norm": 0.06153716892004013,
"learning_rate": 0.0002968660041252578,
"loss": 2.4201,
"step": 84100
},
{
"epoch": 21.05,
"grad_norm": 0.05074555054306984,
"learning_rate": 0.0002968622538908682,
"loss": 2.4179,
"step": 84200
},
{
"epoch": 21.075,
"grad_norm": 0.05797216296195984,
"learning_rate": 0.0002968585036564785,
"loss": 2.3018,
"step": 84300
},
{
"epoch": 21.1,
"grad_norm": 0.053176261484622955,
"learning_rate": 0.00029685475342208886,
"loss": 2.4499,
"step": 84400
},
{
"epoch": 21.125,
"grad_norm": 0.0612250491976738,
"learning_rate": 0.00029685104069004307,
"loss": 2.5186,
"step": 84500
},
{
"epoch": 21.15,
"grad_norm": 0.055981434881687164,
"learning_rate": 0.00029684729045565344,
"loss": 2.3994,
"step": 84600
},
{
"epoch": 21.175,
"grad_norm": 0.07191935181617737,
"learning_rate": 0.0002968435402212638,
"loss": 2.4054,
"step": 84700
},
{
"epoch": 21.2,
"grad_norm": 0.05557156354188919,
"learning_rate": 0.00029683978998687417,
"loss": 2.3924,
"step": 84800
},
{
"epoch": 21.225,
"grad_norm": 0.06246166303753853,
"learning_rate": 0.0002968360397524845,
"loss": 2.4453,
"step": 84900
},
{
"epoch": 21.25,
"grad_norm": 0.061136774718761444,
"learning_rate": 0.00029683228951809485,
"loss": 2.3465,
"step": 85000
},
{
"epoch": 21.275,
"grad_norm": 0.06496226042509079,
"learning_rate": 0.0002968285392837052,
"loss": 2.356,
"step": 85100
},
{
"epoch": 21.3,
"grad_norm": 0.10879474133253098,
"learning_rate": 0.0002968247890493156,
"loss": 2.3113,
"step": 85200
},
{
"epoch": 21.325,
"grad_norm": 0.07896184921264648,
"learning_rate": 0.0002968210388149259,
"loss": 2.3167,
"step": 85300
},
{
"epoch": 21.35,
"grad_norm": 0.05807124823331833,
"learning_rate": 0.00029681728858053626,
"loss": 2.464,
"step": 85400
},
{
"epoch": 21.375,
"grad_norm": 0.05621746554970741,
"learning_rate": 0.0002968135383461466,
"loss": 2.4666,
"step": 85500
},
{
"epoch": 21.4,
"grad_norm": 0.06423439085483551,
"learning_rate": 0.000296809788111757,
"loss": 2.4151,
"step": 85600
},
{
"epoch": 21.425,
"grad_norm": 0.053314123302698135,
"learning_rate": 0.0002968060378773673,
"loss": 2.5222,
"step": 85700
},
{
"epoch": 21.45,
"grad_norm": 0.060538969933986664,
"learning_rate": 0.00029680228764297767,
"loss": 2.2422,
"step": 85800
},
{
"epoch": 21.475,
"grad_norm": 0.05905874818563461,
"learning_rate": 0.00029679853740858804,
"loss": 2.2856,
"step": 85900
},
{
"epoch": 21.5,
"grad_norm": 0.05516530200839043,
"learning_rate": 0.00029679478717419835,
"loss": 2.3191,
"step": 86000
},
{
"epoch": 21.525,
"grad_norm": 0.06160394474864006,
"learning_rate": 0.0002967910369398087,
"loss": 2.3382,
"step": 86100
},
{
"epoch": 21.55,
"grad_norm": 0.05599430948495865,
"learning_rate": 0.000296787286705419,
"loss": 2.4985,
"step": 86200
},
{
"epoch": 21.575,
"grad_norm": 0.06205850839614868,
"learning_rate": 0.0002967835364710294,
"loss": 2.4363,
"step": 86300
},
{
"epoch": 21.6,
"grad_norm": 0.05747246369719505,
"learning_rate": 0.00029677978623663976,
"loss": 2.3009,
"step": 86400
},
{
"epoch": 21.625,
"grad_norm": 0.05334313213825226,
"learning_rate": 0.000296776073504594,
"loss": 2.213,
"step": 86500
},
{
"epoch": 21.65,
"grad_norm": 0.05755939334630966,
"learning_rate": 0.00029677232327020433,
"loss": 2.3473,
"step": 86600
},
{
"epoch": 21.675,
"grad_norm": 0.06077682599425316,
"learning_rate": 0.0002967685730358147,
"loss": 2.3133,
"step": 86700
},
{
"epoch": 21.7,
"grad_norm": 0.04741760715842247,
"learning_rate": 0.00029676482280142506,
"loss": 2.2298,
"step": 86800
},
{
"epoch": 21.725,
"grad_norm": 0.05226515606045723,
"learning_rate": 0.00029676107256703543,
"loss": 2.3709,
"step": 86900
},
{
"epoch": 21.75,
"grad_norm": 0.05925588309764862,
"learning_rate": 0.00029675732233264574,
"loss": 2.3128,
"step": 87000
},
{
"epoch": 21.775,
"grad_norm": 0.05521254613995552,
"learning_rate": 0.0002967535720982561,
"loss": 2.1846,
"step": 87100
},
{
"epoch": 21.8,
"grad_norm": 0.058398790657520294,
"learning_rate": 0.0002967498218638665,
"loss": 2.2529,
"step": 87200
},
{
"epoch": 21.825,
"grad_norm": 0.051581237465143204,
"learning_rate": 0.00029674607162947684,
"loss": 2.3331,
"step": 87300
},
{
"epoch": 21.85,
"grad_norm": 0.046482495963573456,
"learning_rate": 0.00029674232139508715,
"loss": 2.3946,
"step": 87400
},
{
"epoch": 21.875,
"grad_norm": 0.053977347910404205,
"learning_rate": 0.0002967385711606975,
"loss": 2.3074,
"step": 87500
},
{
"epoch": 21.9,
"grad_norm": 0.0516643263399601,
"learning_rate": 0.0002967348209263079,
"loss": 2.3192,
"step": 87600
},
{
"epoch": 21.925,
"grad_norm": 0.04839833453297615,
"learning_rate": 0.00029673107069191825,
"loss": 2.2164,
"step": 87700
},
{
"epoch": 21.95,
"grad_norm": 0.05504479259252548,
"learning_rate": 0.00029672732045752856,
"loss": 2.3114,
"step": 87800
},
{
"epoch": 21.975,
"grad_norm": 0.05117473378777504,
"learning_rate": 0.00029672357022313893,
"loss": 2.2976,
"step": 87900
},
{
"epoch": 22.0,
"grad_norm": 0.052601177245378494,
"learning_rate": 0.00029671981998874924,
"loss": 2.4827,
"step": 88000
},
{
"epoch": 22.025,
"grad_norm": 0.04800357297062874,
"learning_rate": 0.0002967160697543596,
"loss": 2.2798,
"step": 88100
},
{
"epoch": 22.05,
"grad_norm": 0.06387566775083542,
"learning_rate": 0.00029671231951996997,
"loss": 2.2325,
"step": 88200
},
{
"epoch": 22.075,
"grad_norm": 0.05719434469938278,
"learning_rate": 0.00029670856928558034,
"loss": 2.2685,
"step": 88300
},
{
"epoch": 22.1,
"grad_norm": 0.05765566602349281,
"learning_rate": 0.00029670481905119065,
"loss": 2.1859,
"step": 88400
},
{
"epoch": 22.125,
"grad_norm": 0.06396758556365967,
"learning_rate": 0.000296701068816801,
"loss": 2.4629,
"step": 88500
},
{
"epoch": 22.15,
"grad_norm": 0.04949299618601799,
"learning_rate": 0.0002966973185824114,
"loss": 2.2405,
"step": 88600
},
{
"epoch": 22.175,
"grad_norm": 0.04977158457040787,
"learning_rate": 0.00029669356834802175,
"loss": 2.137,
"step": 88700
},
{
"epoch": 22.2,
"grad_norm": 0.06776726990938187,
"learning_rate": 0.00029668981811363206,
"loss": 2.1948,
"step": 88800
},
{
"epoch": 22.225,
"grad_norm": 0.05846365541219711,
"learning_rate": 0.0002966860678792424,
"loss": 2.0921,
"step": 88900
},
{
"epoch": 22.25,
"grad_norm": 0.05889894440770149,
"learning_rate": 0.0002966823176448528,
"loss": 2.3352,
"step": 89000
},
{
"epoch": 22.275,
"grad_norm": 0.04690111055970192,
"learning_rate": 0.00029667856741046316,
"loss": 2.3157,
"step": 89100
},
{
"epoch": 22.3,
"grad_norm": 0.05615220591425896,
"learning_rate": 0.00029667481717607347,
"loss": 2.1161,
"step": 89200
},
{
"epoch": 22.325,
"grad_norm": 0.0551600381731987,
"learning_rate": 0.00029667106694168384,
"loss": 2.125,
"step": 89300
},
{
"epoch": 22.35,
"grad_norm": 0.050111789256334305,
"learning_rate": 0.0002966673167072942,
"loss": 2.1135,
"step": 89400
},
{
"epoch": 22.375,
"grad_norm": 0.05537761375308037,
"learning_rate": 0.00029666356647290457,
"loss": 2.1623,
"step": 89500
},
{
"epoch": 22.4,
"grad_norm": 0.0577760748565197,
"learning_rate": 0.0002966598162385149,
"loss": 2.1871,
"step": 89600
},
{
"epoch": 22.425,
"grad_norm": 0.05141003802418709,
"learning_rate": 0.00029665606600412525,
"loss": 2.1437,
"step": 89700
},
{
"epoch": 22.45,
"grad_norm": 0.05164093151688576,
"learning_rate": 0.00029665231576973556,
"loss": 2.2704,
"step": 89800
},
{
"epoch": 22.475,
"grad_norm": 0.051070958375930786,
"learning_rate": 0.0002966485655353459,
"loss": 2.2791,
"step": 89900
},
{
"epoch": 22.5,
"grad_norm": 0.054080720990896225,
"learning_rate": 0.0002966448153009563,
"loss": 2.1997,
"step": 90000
},
{
"epoch": 22.525,
"grad_norm": 0.057264506816864014,
"learning_rate": 0.0002966410650665666,
"loss": 2.1997,
"step": 90100
},
{
"epoch": 22.55,
"grad_norm": 0.0729178935289383,
"learning_rate": 0.00029663731483217697,
"loss": 2.1692,
"step": 90200
},
{
"epoch": 22.575,
"grad_norm": 0.05248183757066727,
"learning_rate": 0.00029663356459778734,
"loss": 2.1341,
"step": 90300
},
{
"epoch": 22.6,
"grad_norm": 0.05090828239917755,
"learning_rate": 0.0002966298143633977,
"loss": 2.2374,
"step": 90400
},
{
"epoch": 22.625,
"grad_norm": 0.12061487883329391,
"learning_rate": 0.0002966261016313519,
"loss": 2.1671,
"step": 90500
},
{
"epoch": 22.65,
"grad_norm": 0.06009404733777046,
"learning_rate": 0.0002966223513969623,
"loss": 2.2945,
"step": 90600
},
{
"epoch": 22.675,
"grad_norm": 0.06756783276796341,
"learning_rate": 0.00029661860116257264,
"loss": 2.2064,
"step": 90700
},
{
"epoch": 22.7,
"grad_norm": 0.04783422127366066,
"learning_rate": 0.000296614850928183,
"loss": 2.1548,
"step": 90800
},
{
"epoch": 22.725,
"grad_norm": 0.06468702852725983,
"learning_rate": 0.0002966111006937933,
"loss": 2.0389,
"step": 90900
},
{
"epoch": 22.75,
"grad_norm": 0.05485010892152786,
"learning_rate": 0.0002966073504594037,
"loss": 2.1214,
"step": 91000
},
{
"epoch": 22.775,
"grad_norm": 0.05827448144555092,
"learning_rate": 0.00029660360022501405,
"loss": 2.2367,
"step": 91100
},
{
"epoch": 22.8,
"grad_norm": 0.054152172058820724,
"learning_rate": 0.0002965998499906244,
"loss": 2.1022,
"step": 91200
},
{
"epoch": 22.825,
"grad_norm": 0.04739788547158241,
"learning_rate": 0.00029659609975623473,
"loss": 2.1672,
"step": 91300
},
{
"epoch": 22.85,
"grad_norm": 0.05551367625594139,
"learning_rate": 0.0002965923495218451,
"loss": 2.05,
"step": 91400
},
{
"epoch": 22.875,
"grad_norm": 0.05317440256476402,
"learning_rate": 0.0002965885992874554,
"loss": 2.012,
"step": 91500
},
{
"epoch": 22.9,
"grad_norm": 0.053941987454891205,
"learning_rate": 0.00029658488655540967,
"loss": 2.1268,
"step": 91600
},
{
"epoch": 22.925,
"grad_norm": 0.05108709633350372,
"learning_rate": 0.00029658113632102004,
"loss": 2.1342,
"step": 91700
},
{
"epoch": 22.95,
"grad_norm": 0.052761614322662354,
"learning_rate": 0.0002965773860866304,
"loss": 2.08,
"step": 91800
},
{
"epoch": 22.975,
"grad_norm": 0.05674518644809723,
"learning_rate": 0.0002965736358522407,
"loss": 2.1533,
"step": 91900
},
{
"epoch": 23.0,
"grad_norm": 0.06261865794658661,
"learning_rate": 0.0002965698856178511,
"loss": 2.0382,
"step": 92000
},
{
"epoch": 23.025,
"grad_norm": 0.04918836057186127,
"learning_rate": 0.00029656613538346145,
"loss": 2.0315,
"step": 92100
},
{
"epoch": 23.05,
"grad_norm": 0.04982222989201546,
"learning_rate": 0.0002965623851490718,
"loss": 2.1285,
"step": 92200
},
{
"epoch": 23.075,
"grad_norm": 0.051534924656152725,
"learning_rate": 0.0002965586349146821,
"loss": 2.1746,
"step": 92300
},
{
"epoch": 23.1,
"grad_norm": 0.059025805443525314,
"learning_rate": 0.0002965548846802925,
"loss": 2.1339,
"step": 92400
},
{
"epoch": 23.125,
"grad_norm": 0.05158498138189316,
"learning_rate": 0.00029655113444590286,
"loss": 2.049,
"step": 92500
},
{
"epoch": 23.15,
"grad_norm": 0.049751464277505875,
"learning_rate": 0.0002965473842115132,
"loss": 2.0587,
"step": 92600
},
{
"epoch": 23.175,
"grad_norm": 0.05357548967003822,
"learning_rate": 0.00029654363397712353,
"loss": 2.1765,
"step": 92700
},
{
"epoch": 23.2,
"grad_norm": 0.05639924481511116,
"learning_rate": 0.0002965398837427339,
"loss": 2.0229,
"step": 92800
},
{
"epoch": 23.225,
"grad_norm": 0.057067710906267166,
"learning_rate": 0.00029653613350834427,
"loss": 2.1208,
"step": 92900
},
{
"epoch": 23.25,
"grad_norm": 0.056406810879707336,
"learning_rate": 0.00029653238327395463,
"loss": 2.1044,
"step": 93000
},
{
"epoch": 23.275,
"grad_norm": 0.05794864147901535,
"learning_rate": 0.00029652863303956495,
"loss": 1.9575,
"step": 93100
},
{
"epoch": 23.3,
"grad_norm": 0.059239715337753296,
"learning_rate": 0.0002965248828051753,
"loss": 2.1206,
"step": 93200
},
{
"epoch": 23.325,
"grad_norm": 0.05163438990712166,
"learning_rate": 0.0002965211325707856,
"loss": 1.9799,
"step": 93300
},
{
"epoch": 23.35,
"grad_norm": 0.05853526294231415,
"learning_rate": 0.000296517382336396,
"loss": 2.0314,
"step": 93400
},
{
"epoch": 23.375,
"grad_norm": 0.04642421007156372,
"learning_rate": 0.00029651363210200636,
"loss": 2.0252,
"step": 93500
},
{
"epoch": 23.4,
"grad_norm": 0.05934316664934158,
"learning_rate": 0.0002965098818676167,
"loss": 2.0889,
"step": 93600
},
{
"epoch": 23.425,
"grad_norm": 0.05159417912364006,
"learning_rate": 0.00029650613163322703,
"loss": 2.0017,
"step": 93700
},
{
"epoch": 23.45,
"grad_norm": 0.04541020095348358,
"learning_rate": 0.0002965023813988374,
"loss": 2.0592,
"step": 93800
},
{
"epoch": 23.475,
"grad_norm": 0.05421976000070572,
"learning_rate": 0.00029649863116444777,
"loss": 1.9184,
"step": 93900
},
{
"epoch": 23.5,
"grad_norm": 0.05134705454111099,
"learning_rate": 0.0002964948809300581,
"loss": 2.2841,
"step": 94000
},
{
"epoch": 23.525,
"grad_norm": 0.050796929746866226,
"learning_rate": 0.00029649113069566844,
"loss": 1.9773,
"step": 94100
},
{
"epoch": 23.55,
"grad_norm": 0.062260136008262634,
"learning_rate": 0.0002964873804612788,
"loss": 2.1259,
"step": 94200
},
{
"epoch": 23.575,
"grad_norm": 0.051263660192489624,
"learning_rate": 0.0002964836302268892,
"loss": 1.996,
"step": 94300
},
{
"epoch": 23.6,
"grad_norm": 0.052974916994571686,
"learning_rate": 0.0002964798799924995,
"loss": 2.0231,
"step": 94400
},
{
"epoch": 23.625,
"grad_norm": 0.06232937052845955,
"learning_rate": 0.00029647612975810985,
"loss": 1.9196,
"step": 94500
},
{
"epoch": 23.65,
"grad_norm": 0.05306218937039375,
"learning_rate": 0.0002964723795237202,
"loss": 1.9388,
"step": 94600
},
{
"epoch": 23.675,
"grad_norm": 0.05512924864888191,
"learning_rate": 0.0002964686292893306,
"loss": 2.1401,
"step": 94700
},
{
"epoch": 23.7,
"grad_norm": 0.056388285011053085,
"learning_rate": 0.0002964648790549409,
"loss": 2.0013,
"step": 94800
},
{
"epoch": 23.725,
"grad_norm": 0.05032140389084816,
"learning_rate": 0.00029646112882055126,
"loss": 1.9568,
"step": 94900
},
{
"epoch": 23.75,
"grad_norm": 0.04757603630423546,
"learning_rate": 0.0002964573785861616,
"loss": 1.8944,
"step": 95000
},
{
"epoch": 23.775,
"grad_norm": 0.05020546913146973,
"learning_rate": 0.00029645362835177194,
"loss": 2.0146,
"step": 95100
},
{
"epoch": 23.8,
"grad_norm": 0.056530579924583435,
"learning_rate": 0.0002964498781173823,
"loss": 1.9345,
"step": 95200
},
{
"epoch": 23.825,
"grad_norm": 0.07894182950258255,
"learning_rate": 0.0002964461278829927,
"loss": 2.1116,
"step": 95300
},
{
"epoch": 23.85,
"grad_norm": 0.05175475776195526,
"learning_rate": 0.000296442377648603,
"loss": 2.1331,
"step": 95400
},
{
"epoch": 23.875,
"grad_norm": 0.05405741557478905,
"learning_rate": 0.00029643862741421335,
"loss": 1.8724,
"step": 95500
},
{
"epoch": 23.9,
"grad_norm": 0.06405475735664368,
"learning_rate": 0.0002964349146821676,
"loss": 1.8652,
"step": 95600
},
{
"epoch": 23.925,
"grad_norm": 0.0548410564661026,
"learning_rate": 0.000296431164447778,
"loss": 1.9177,
"step": 95700
},
{
"epoch": 23.95,
"grad_norm": 0.04941118508577347,
"learning_rate": 0.0002964274142133883,
"loss": 1.981,
"step": 95800
},
{
"epoch": 23.975,
"grad_norm": 0.06233079358935356,
"learning_rate": 0.00029642366397899866,
"loss": 1.886,
"step": 95900
},
{
"epoch": 24.0,
"grad_norm": 0.06110682711005211,
"learning_rate": 0.000296419913744609,
"loss": 1.906,
"step": 96000
},
{
"epoch": 24.025,
"grad_norm": 0.056876040995121,
"learning_rate": 0.0002964161635102194,
"loss": 1.9632,
"step": 96100
},
{
"epoch": 24.05,
"grad_norm": 0.056007348001003265,
"learning_rate": 0.0002964124132758297,
"loss": 1.8518,
"step": 96200
},
{
"epoch": 24.075,
"grad_norm": 0.052707262337207794,
"learning_rate": 0.00029640866304144007,
"loss": 2.1039,
"step": 96300
},
{
"epoch": 24.1,
"grad_norm": 0.05575592815876007,
"learning_rate": 0.00029640491280705044,
"loss": 1.8103,
"step": 96400
},
{
"epoch": 24.125,
"grad_norm": 0.05587482079863548,
"learning_rate": 0.0002964011625726608,
"loss": 1.9645,
"step": 96500
},
{
"epoch": 24.15,
"grad_norm": 0.08619283139705658,
"learning_rate": 0.0002963974123382711,
"loss": 1.9429,
"step": 96600
},
{
"epoch": 24.175,
"grad_norm": 0.09571905434131622,
"learning_rate": 0.0002963936621038815,
"loss": 1.902,
"step": 96700
},
{
"epoch": 24.2,
"grad_norm": 0.050410255789756775,
"learning_rate": 0.0002963899118694918,
"loss": 2.0446,
"step": 96800
},
{
"epoch": 24.225,
"grad_norm": 0.060695916414260864,
"learning_rate": 0.00029638616163510216,
"loss": 1.9231,
"step": 96900
},
{
"epoch": 24.25,
"grad_norm": 0.05033661425113678,
"learning_rate": 0.0002963824114007125,
"loss": 1.9065,
"step": 97000
},
{
"epoch": 24.275,
"grad_norm": 0.05458163470029831,
"learning_rate": 0.0002963786611663229,
"loss": 1.858,
"step": 97100
},
{
"epoch": 24.3,
"grad_norm": 0.05258990451693535,
"learning_rate": 0.0002963749109319332,
"loss": 2.0328,
"step": 97200
},
{
"epoch": 24.325,
"grad_norm": 0.04619702324271202,
"learning_rate": 0.00029637116069754357,
"loss": 1.8548,
"step": 97300
},
{
"epoch": 24.35,
"grad_norm": 0.06743716448545456,
"learning_rate": 0.00029636741046315393,
"loss": 1.9381,
"step": 97400
},
{
"epoch": 24.375,
"grad_norm": 0.049068696796894073,
"learning_rate": 0.0002963636602287643,
"loss": 1.9359,
"step": 97500
},
{
"epoch": 24.4,
"grad_norm": 0.061207227408885956,
"learning_rate": 0.0002963599474967185,
"loss": 1.8927,
"step": 97600
},
{
"epoch": 24.425,
"grad_norm": 0.05484483018517494,
"learning_rate": 0.0002963561972623289,
"loss": 1.88,
"step": 97700
},
{
"epoch": 24.45,
"grad_norm": 0.057467181235551834,
"learning_rate": 0.00029635244702793924,
"loss": 1.856,
"step": 97800
},
{
"epoch": 24.475,
"grad_norm": 0.049861736595630646,
"learning_rate": 0.00029634869679354955,
"loss": 2.0343,
"step": 97900
},
{
"epoch": 24.5,
"grad_norm": 0.049673888832330704,
"learning_rate": 0.0002963449465591599,
"loss": 1.8138,
"step": 98000
},
{
"epoch": 24.525,
"grad_norm": 0.06320221722126007,
"learning_rate": 0.0002963411963247703,
"loss": 1.9389,
"step": 98100
},
{
"epoch": 24.55,
"grad_norm": 0.0863277018070221,
"learning_rate": 0.00029633744609038065,
"loss": 1.9127,
"step": 98200
},
{
"epoch": 24.575,
"grad_norm": 0.04973394796252251,
"learning_rate": 0.00029633369585599096,
"loss": 1.8468,
"step": 98300
},
{
"epoch": 24.6,
"grad_norm": 0.061264049261808395,
"learning_rate": 0.00029632994562160133,
"loss": 1.9194,
"step": 98400
},
{
"epoch": 24.625,
"grad_norm": 0.05264371261000633,
"learning_rate": 0.00029632619538721164,
"loss": 1.8896,
"step": 98500
},
{
"epoch": 24.65,
"grad_norm": 0.054599445313215256,
"learning_rate": 0.000296322445152822,
"loss": 1.9001,
"step": 98600
},
{
"epoch": 24.675,
"grad_norm": 0.05259576812386513,
"learning_rate": 0.00029631869491843237,
"loss": 1.8258,
"step": 98700
},
{
"epoch": 24.7,
"grad_norm": 0.05342064052820206,
"learning_rate": 0.00029631494468404274,
"loss": 1.926,
"step": 98800
},
{
"epoch": 24.725,
"grad_norm": 0.04714656248688698,
"learning_rate": 0.00029631119444965305,
"loss": 1.8823,
"step": 98900
},
{
"epoch": 24.75,
"grad_norm": 0.050276800990104675,
"learning_rate": 0.0002963074442152634,
"loss": 1.823,
"step": 99000
},
{
"epoch": 24.775,
"grad_norm": 0.051686566323041916,
"learning_rate": 0.0002963036939808738,
"loss": 1.8796,
"step": 99100
},
{
"epoch": 24.8,
"grad_norm": 0.051118552684783936,
"learning_rate": 0.00029629994374648415,
"loss": 1.9002,
"step": 99200
},
{
"epoch": 24.825,
"grad_norm": 0.05065715312957764,
"learning_rate": 0.00029629619351209446,
"loss": 1.868,
"step": 99300
},
{
"epoch": 24.85,
"grad_norm": 0.043341364711523056,
"learning_rate": 0.00029629244327770483,
"loss": 1.9614,
"step": 99400
},
{
"epoch": 24.875,
"grad_norm": 0.052784670144319534,
"learning_rate": 0.0002962886930433152,
"loss": 1.9323,
"step": 99500
},
{
"epoch": 24.9,
"grad_norm": 0.055045951157808304,
"learning_rate": 0.00029628494280892556,
"loss": 1.8218,
"step": 99600
},
{
"epoch": 24.925,
"grad_norm": 0.058140724897384644,
"learning_rate": 0.00029628123007687977,
"loss": 1.8894,
"step": 99700
},
{
"epoch": 24.95,
"grad_norm": 0.058738358318805695,
"learning_rate": 0.00029627747984249013,
"loss": 1.7708,
"step": 99800
},
{
"epoch": 24.975,
"grad_norm": 0.05485925078392029,
"learning_rate": 0.0002962737296081005,
"loss": 1.9136,
"step": 99900
},
{
"epoch": 25.0,
"grad_norm": 0.05562080442905426,
"learning_rate": 0.00029626997937371087,
"loss": 1.9072,
"step": 100000
},
{
"epoch": 25.025,
"grad_norm": 0.04997032880783081,
"learning_rate": 0.0002962662291393212,
"loss": 1.7119,
"step": 100100
},
{
"epoch": 25.05,
"grad_norm": 0.05290250480175018,
"learning_rate": 0.00029626247890493154,
"loss": 1.706,
"step": 100200
},
{
"epoch": 25.075,
"grad_norm": 0.04861506074666977,
"learning_rate": 0.00029625872867054186,
"loss": 1.7061,
"step": 100300
},
{
"epoch": 25.1,
"grad_norm": 0.05706246569752693,
"learning_rate": 0.0002962549784361522,
"loss": 1.9067,
"step": 100400
},
{
"epoch": 25.125,
"grad_norm": 0.055538617074489594,
"learning_rate": 0.0002962512282017626,
"loss": 1.8622,
"step": 100500
},
{
"epoch": 25.15,
"grad_norm": 0.06384219229221344,
"learning_rate": 0.00029624747796737295,
"loss": 1.7935,
"step": 100600
},
{
"epoch": 25.175,
"grad_norm": 0.057620443403720856,
"learning_rate": 0.00029624372773298327,
"loss": 1.8746,
"step": 100700
},
{
"epoch": 25.2,
"grad_norm": 0.05917825549840927,
"learning_rate": 0.00029623997749859363,
"loss": 1.7152,
"step": 100800
},
{
"epoch": 25.225,
"grad_norm": 0.061573103070259094,
"learning_rate": 0.000296236227264204,
"loss": 1.8928,
"step": 100900
},
{
"epoch": 25.25,
"grad_norm": 0.04456368088722229,
"learning_rate": 0.00029623247702981436,
"loss": 1.798,
"step": 101000
},
{
"epoch": 25.275,
"grad_norm": 0.06028895452618599,
"learning_rate": 0.0002962287267954247,
"loss": 1.8044,
"step": 101100
},
{
"epoch": 25.3,
"grad_norm": 0.0548817440867424,
"learning_rate": 0.00029622497656103504,
"loss": 1.9204,
"step": 101200
},
{
"epoch": 25.325,
"grad_norm": 0.045852452516555786,
"learning_rate": 0.0002962212263266454,
"loss": 1.924,
"step": 101300
},
{
"epoch": 25.35,
"grad_norm": 0.04782922565937042,
"learning_rate": 0.0002962174760922558,
"loss": 1.7096,
"step": 101400
},
{
"epoch": 25.375,
"grad_norm": 0.049990586936473846,
"learning_rate": 0.0002962137258578661,
"loss": 1.9654,
"step": 101500
},
{
"epoch": 25.4,
"grad_norm": 0.04626760631799698,
"learning_rate": 0.0002962099756234764,
"loss": 1.7223,
"step": 101600
},
{
"epoch": 25.425,
"grad_norm": 0.054343245923519135,
"learning_rate": 0.0002962062253890868,
"loss": 1.85,
"step": 101700
},
{
"epoch": 25.45,
"grad_norm": 0.04563869535923004,
"learning_rate": 0.000296202512657041,
"loss": 1.8011,
"step": 101800
},
{
"epoch": 25.475,
"grad_norm": 0.05334710702300072,
"learning_rate": 0.0002961987624226514,
"loss": 1.7863,
"step": 101900
},
{
"epoch": 25.5,
"grad_norm": 0.05533549562096596,
"learning_rate": 0.0002961950121882617,
"loss": 1.7575,
"step": 102000
},
{
"epoch": 25.525,
"grad_norm": 0.05645955726504326,
"learning_rate": 0.00029619126195387207,
"loss": 1.6948,
"step": 102100
},
{
"epoch": 25.55,
"grad_norm": 0.05024164915084839,
"learning_rate": 0.00029618751171948244,
"loss": 1.6452,
"step": 102200
},
{
"epoch": 25.575,
"grad_norm": 0.051269952207803726,
"learning_rate": 0.0002961837614850928,
"loss": 1.7991,
"step": 102300
},
{
"epoch": 25.6,
"grad_norm": 0.05763736367225647,
"learning_rate": 0.0002961800112507031,
"loss": 1.7634,
"step": 102400
},
{
"epoch": 25.625,
"grad_norm": 0.05718966946005821,
"learning_rate": 0.0002961762610163135,
"loss": 1.7013,
"step": 102500
},
{
"epoch": 25.65,
"grad_norm": 0.05326114594936371,
"learning_rate": 0.00029617251078192385,
"loss": 1.6578,
"step": 102600
},
{
"epoch": 25.675,
"grad_norm": 0.05004553496837616,
"learning_rate": 0.0002961687605475342,
"loss": 1.6707,
"step": 102700
},
{
"epoch": 25.7,
"grad_norm": 0.047597501426935196,
"learning_rate": 0.0002961650103131445,
"loss": 1.8098,
"step": 102800
},
{
"epoch": 25.725,
"grad_norm": 0.05360327288508415,
"learning_rate": 0.0002961612600787549,
"loss": 1.8259,
"step": 102900
},
{
"epoch": 25.75,
"grad_norm": 0.04639869183301926,
"learning_rate": 0.00029615750984436526,
"loss": 1.8487,
"step": 103000
},
{
"epoch": 25.775,
"grad_norm": 0.048653990030288696,
"learning_rate": 0.0002961537596099756,
"loss": 1.6956,
"step": 103100
},
{
"epoch": 25.8,
"grad_norm": 0.043963368982076645,
"learning_rate": 0.00029615000937558594,
"loss": 1.6178,
"step": 103200
},
{
"epoch": 25.825,
"grad_norm": 0.05706685408949852,
"learning_rate": 0.0002961462591411963,
"loss": 1.6809,
"step": 103300
},
{
"epoch": 25.85,
"grad_norm": 0.05852410942316055,
"learning_rate": 0.00029614250890680667,
"loss": 1.6511,
"step": 103400
},
{
"epoch": 25.875,
"grad_norm": 0.054208237677812576,
"learning_rate": 0.00029613875867241703,
"loss": 1.8168,
"step": 103500
},
{
"epoch": 25.9,
"grad_norm": 0.05457128956913948,
"learning_rate": 0.00029613500843802735,
"loss": 1.7456,
"step": 103600
},
{
"epoch": 25.925,
"grad_norm": 0.047613076865673065,
"learning_rate": 0.0002961312582036377,
"loss": 1.629,
"step": 103700
},
{
"epoch": 25.95,
"grad_norm": 0.05182652920484543,
"learning_rate": 0.0002961275454715919,
"loss": 1.6386,
"step": 103800
},
{
"epoch": 25.975,
"grad_norm": 0.046905118972063065,
"learning_rate": 0.0002961237952372023,
"loss": 1.8368,
"step": 103900
},
{
"epoch": 26.0,
"grad_norm": 0.04973314702510834,
"learning_rate": 0.00029612004500281265,
"loss": 1.8125,
"step": 104000
},
{
"epoch": 26.025,
"grad_norm": 0.048138804733753204,
"learning_rate": 0.000296116294768423,
"loss": 1.6797,
"step": 104100
},
{
"epoch": 26.05,
"grad_norm": 0.0547357015311718,
"learning_rate": 0.00029611254453403333,
"loss": 1.67,
"step": 104200
},
{
"epoch": 26.075,
"grad_norm": 0.05443267896771431,
"learning_rate": 0.0002961087942996437,
"loss": 1.6682,
"step": 104300
},
{
"epoch": 26.1,
"grad_norm": 0.06275078654289246,
"learning_rate": 0.00029610504406525406,
"loss": 1.7022,
"step": 104400
},
{
"epoch": 26.125,
"grad_norm": 0.05464591458439827,
"learning_rate": 0.00029610129383086443,
"loss": 1.8136,
"step": 104500
},
{
"epoch": 26.15,
"grad_norm": 0.05352524295449257,
"learning_rate": 0.00029609754359647474,
"loss": 1.7319,
"step": 104600
},
{
"epoch": 26.175,
"grad_norm": 0.05525488778948784,
"learning_rate": 0.0002960937933620851,
"loss": 1.766,
"step": 104700
},
{
"epoch": 26.2,
"grad_norm": 0.05569114536046982,
"learning_rate": 0.00029609004312769547,
"loss": 1.7767,
"step": 104800
},
{
"epoch": 26.225,
"grad_norm": 0.0440787635743618,
"learning_rate": 0.00029608629289330584,
"loss": 1.6786,
"step": 104900
},
{
"epoch": 26.25,
"grad_norm": 0.05321473628282547,
"learning_rate": 0.00029608254265891615,
"loss": 1.6904,
"step": 105000
},
{
"epoch": 26.275,
"grad_norm": 0.047589514404535294,
"learning_rate": 0.0002960787924245265,
"loss": 1.5513,
"step": 105100
},
{
"epoch": 26.3,
"grad_norm": 0.0542590469121933,
"learning_rate": 0.0002960750421901369,
"loss": 1.8018,
"step": 105200
},
{
"epoch": 26.325,
"grad_norm": 0.052015386521816254,
"learning_rate": 0.0002960712919557472,
"loss": 1.6334,
"step": 105300
},
{
"epoch": 26.35,
"grad_norm": 0.16159088909626007,
"learning_rate": 0.00029606754172135756,
"loss": 1.5818,
"step": 105400
},
{
"epoch": 26.375,
"grad_norm": 0.04810553416609764,
"learning_rate": 0.00029606379148696787,
"loss": 1.6274,
"step": 105500
},
{
"epoch": 26.4,
"grad_norm": 0.053879667073488235,
"learning_rate": 0.00029606004125257824,
"loss": 1.8122,
"step": 105600
},
{
"epoch": 26.425,
"grad_norm": 0.04980600252747536,
"learning_rate": 0.0002960562910181886,
"loss": 1.7187,
"step": 105700
},
{
"epoch": 26.45,
"grad_norm": 0.059906307607889175,
"learning_rate": 0.00029605257828614287,
"loss": 1.7223,
"step": 105800
},
{
"epoch": 26.475,
"grad_norm": 0.04634363576769829,
"learning_rate": 0.0002960488280517532,
"loss": 1.6282,
"step": 105900
},
{
"epoch": 26.5,
"grad_norm": 0.052842844277620316,
"learning_rate": 0.00029604507781736354,
"loss": 1.6203,
"step": 106000
},
{
"epoch": 26.525,
"grad_norm": 0.05409262329339981,
"learning_rate": 0.0002960413275829739,
"loss": 1.7725,
"step": 106100
},
{
"epoch": 26.55,
"grad_norm": 0.04745221883058548,
"learning_rate": 0.0002960375773485843,
"loss": 1.6498,
"step": 106200
},
{
"epoch": 26.575,
"grad_norm": 0.050988294184207916,
"learning_rate": 0.0002960338271141946,
"loss": 1.6534,
"step": 106300
},
{
"epoch": 26.6,
"grad_norm": 0.046150580048561096,
"learning_rate": 0.00029603007687980495,
"loss": 1.7042,
"step": 106400
},
{
"epoch": 26.625,
"grad_norm": 0.05468379706144333,
"learning_rate": 0.0002960263266454153,
"loss": 1.6467,
"step": 106500
},
{
"epoch": 26.65,
"grad_norm": 0.05112981051206589,
"learning_rate": 0.0002960225764110257,
"loss": 1.5898,
"step": 106600
},
{
"epoch": 26.675,
"grad_norm": 0.050162170082330704,
"learning_rate": 0.000296018826176636,
"loss": 1.7128,
"step": 106700
},
{
"epoch": 26.7,
"grad_norm": 0.05202512443065643,
"learning_rate": 0.00029601507594224637,
"loss": 1.6162,
"step": 106800
},
{
"epoch": 26.725,
"grad_norm": 0.05049065127968788,
"learning_rate": 0.00029601132570785673,
"loss": 1.7741,
"step": 106900
},
{
"epoch": 26.75,
"grad_norm": 0.05425161495804787,
"learning_rate": 0.000296007612975811,
"loss": 1.5715,
"step": 107000
},
{
"epoch": 26.775,
"grad_norm": 0.04676578938961029,
"learning_rate": 0.0002960038627414213,
"loss": 1.4396,
"step": 107100
},
{
"epoch": 26.8,
"grad_norm": 0.04315830394625664,
"learning_rate": 0.00029600011250703167,
"loss": 1.648,
"step": 107200
},
{
"epoch": 26.825,
"grad_norm": 0.052309952676296234,
"learning_rate": 0.000295996362272642,
"loss": 1.5737,
"step": 107300
},
{
"epoch": 26.85,
"grad_norm": 0.05186279118061066,
"learning_rate": 0.00029599261203825235,
"loss": 1.5913,
"step": 107400
},
{
"epoch": 26.875,
"grad_norm": 0.05266883224248886,
"learning_rate": 0.0002959888618038627,
"loss": 1.567,
"step": 107500
},
{
"epoch": 26.9,
"grad_norm": 0.04454510286450386,
"learning_rate": 0.0002959851115694731,
"loss": 1.5123,
"step": 107600
},
{
"epoch": 26.925,
"grad_norm": 0.05315356329083443,
"learning_rate": 0.0002959813613350834,
"loss": 1.6372,
"step": 107700
},
{
"epoch": 26.95,
"grad_norm": 0.04607756808400154,
"learning_rate": 0.00029597761110069376,
"loss": 1.6074,
"step": 107800
},
{
"epoch": 26.975,
"grad_norm": 0.04452488571405411,
"learning_rate": 0.0002959738608663041,
"loss": 1.5927,
"step": 107900
},
{
"epoch": 27.0,
"grad_norm": 0.05356653034687042,
"learning_rate": 0.0002959701106319145,
"loss": 1.6214,
"step": 108000
},
{
"epoch": 27.025,
"grad_norm": 0.04785982891917229,
"learning_rate": 0.0002959663603975248,
"loss": 1.6273,
"step": 108100
},
{
"epoch": 27.05,
"grad_norm": 0.04626493901014328,
"learning_rate": 0.00029596261016313517,
"loss": 1.6494,
"step": 108200
},
{
"epoch": 27.075,
"grad_norm": 0.04791727289557457,
"learning_rate": 0.00029595885992874554,
"loss": 1.5452,
"step": 108300
},
{
"epoch": 27.1,
"grad_norm": 0.06166384369134903,
"learning_rate": 0.0002959551096943559,
"loss": 1.5749,
"step": 108400
},
{
"epoch": 27.125,
"grad_norm": 0.05195313319563866,
"learning_rate": 0.0002959513594599662,
"loss": 1.536,
"step": 108500
},
{
"epoch": 27.15,
"grad_norm": 0.0505547821521759,
"learning_rate": 0.0002959476092255766,
"loss": 1.6606,
"step": 108600
},
{
"epoch": 27.175,
"grad_norm": 0.04837740212678909,
"learning_rate": 0.00029594385899118695,
"loss": 1.5617,
"step": 108700
},
{
"epoch": 27.2,
"grad_norm": 0.04828809201717377,
"learning_rate": 0.0002959401087567973,
"loss": 1.7326,
"step": 108800
},
{
"epoch": 27.225,
"grad_norm": 0.06565222144126892,
"learning_rate": 0.0002959363585224076,
"loss": 1.5621,
"step": 108900
},
{
"epoch": 27.25,
"grad_norm": 0.05221616104245186,
"learning_rate": 0.000295932608288018,
"loss": 1.7385,
"step": 109000
},
{
"epoch": 27.275,
"grad_norm": 0.05376584827899933,
"learning_rate": 0.0002959288580536283,
"loss": 1.5078,
"step": 109100
},
{
"epoch": 27.3,
"grad_norm": 0.04505067691206932,
"learning_rate": 0.00029592510781923867,
"loss": 1.6082,
"step": 109200
},
{
"epoch": 27.325,
"grad_norm": 0.047202132642269135,
"learning_rate": 0.00029592135758484903,
"loss": 1.5304,
"step": 109300
},
{
"epoch": 27.35,
"grad_norm": 0.06032031401991844,
"learning_rate": 0.00029591760735045935,
"loss": 1.6035,
"step": 109400
},
{
"epoch": 27.375,
"grad_norm": 0.044648509472608566,
"learning_rate": 0.0002959138571160697,
"loss": 1.5581,
"step": 109500
},
{
"epoch": 27.4,
"grad_norm": 0.05649425461888313,
"learning_rate": 0.0002959101068816801,
"loss": 1.5482,
"step": 109600
},
{
"epoch": 27.425,
"grad_norm": 0.05527213215827942,
"learning_rate": 0.00029590635664729044,
"loss": 1.6155,
"step": 109700
},
{
"epoch": 27.45,
"grad_norm": 0.050836507230997086,
"learning_rate": 0.00029590260641290076,
"loss": 1.4239,
"step": 109800
},
{
"epoch": 27.475,
"grad_norm": 0.06156973913311958,
"learning_rate": 0.0002958988561785111,
"loss": 1.4574,
"step": 109900
},
{
"epoch": 27.5,
"grad_norm": 0.04659149423241615,
"learning_rate": 0.0002958951059441215,
"loss": 1.6488,
"step": 110000
},
{
"epoch": 27.525,
"grad_norm": 0.05683763325214386,
"learning_rate": 0.00029589135570973186,
"loss": 1.6128,
"step": 110100
},
{
"epoch": 27.55,
"grad_norm": 0.0504351444542408,
"learning_rate": 0.00029588760547534217,
"loss": 1.6495,
"step": 110200
},
{
"epoch": 27.575,
"grad_norm": 0.04385405406355858,
"learning_rate": 0.00029588385524095253,
"loss": 1.5644,
"step": 110300
},
{
"epoch": 27.6,
"grad_norm": 0.056605253368616104,
"learning_rate": 0.0002958801050065629,
"loss": 1.4853,
"step": 110400
},
{
"epoch": 27.625,
"grad_norm": 0.061634745448827744,
"learning_rate": 0.00029587635477217327,
"loss": 1.7518,
"step": 110500
},
{
"epoch": 27.65,
"grad_norm": 0.05308396369218826,
"learning_rate": 0.0002958726045377836,
"loss": 1.4906,
"step": 110600
},
{
"epoch": 27.675,
"grad_norm": 0.05271327123045921,
"learning_rate": 0.00029586885430339394,
"loss": 1.591,
"step": 110700
},
{
"epoch": 27.7,
"grad_norm": 0.04924798756837845,
"learning_rate": 0.00029586510406900426,
"loss": 1.5645,
"step": 110800
},
{
"epoch": 27.725,
"grad_norm": 0.05398215353488922,
"learning_rate": 0.0002958613538346146,
"loss": 1.5635,
"step": 110900
},
{
"epoch": 27.75,
"grad_norm": 0.04747261479496956,
"learning_rate": 0.000295857603600225,
"loss": 1.501,
"step": 111000
},
{
"epoch": 27.775,
"grad_norm": 0.048297274857759476,
"learning_rate": 0.00029585389086817925,
"loss": 1.4673,
"step": 111100
},
{
"epoch": 27.8,
"grad_norm": 0.047769028693437576,
"learning_rate": 0.00029585014063378956,
"loss": 1.5335,
"step": 111200
},
{
"epoch": 27.825,
"grad_norm": 0.05535224825143814,
"learning_rate": 0.00029584639039939993,
"loss": 1.5235,
"step": 111300
},
{
"epoch": 27.85,
"grad_norm": 0.04392020031809807,
"learning_rate": 0.0002958426401650103,
"loss": 1.5657,
"step": 111400
},
{
"epoch": 27.875,
"grad_norm": 0.052205685526132584,
"learning_rate": 0.00029583888993062066,
"loss": 1.5018,
"step": 111500
},
{
"epoch": 27.9,
"grad_norm": 0.0470951683819294,
"learning_rate": 0.00029583513969623097,
"loss": 1.3486,
"step": 111600
},
{
"epoch": 27.925,
"grad_norm": 0.045637097209692,
"learning_rate": 0.00029583138946184134,
"loss": 1.5814,
"step": 111700
},
{
"epoch": 27.95,
"grad_norm": 0.050197433680295944,
"learning_rate": 0.0002958276392274517,
"loss": 1.6106,
"step": 111800
},
{
"epoch": 27.975,
"grad_norm": 0.047528669238090515,
"learning_rate": 0.00029582388899306207,
"loss": 1.5872,
"step": 111900
},
{
"epoch": 28.0,
"grad_norm": 0.052580513060092926,
"learning_rate": 0.0002958201387586724,
"loss": 1.4037,
"step": 112000
},
{
"epoch": 28.025,
"grad_norm": 0.05215739831328392,
"learning_rate": 0.00029581638852428275,
"loss": 1.5155,
"step": 112100
},
{
"epoch": 28.05,
"grad_norm": 0.0481177382171154,
"learning_rate": 0.0002958126382898931,
"loss": 1.5689,
"step": 112200
},
{
"epoch": 28.075,
"grad_norm": 0.06459362804889679,
"learning_rate": 0.0002958088880555035,
"loss": 1.4518,
"step": 112300
},
{
"epoch": 28.1,
"grad_norm": 0.0489063635468483,
"learning_rate": 0.0002958051378211138,
"loss": 1.5451,
"step": 112400
},
{
"epoch": 28.125,
"grad_norm": 0.05155845358967781,
"learning_rate": 0.00029580138758672416,
"loss": 1.4813,
"step": 112500
},
{
"epoch": 28.15,
"grad_norm": 0.05029693618416786,
"learning_rate": 0.00029579763735233447,
"loss": 1.4739,
"step": 112600
},
{
"epoch": 28.175,
"grad_norm": 0.06580676138401031,
"learning_rate": 0.00029579388711794484,
"loss": 1.5699,
"step": 112700
},
{
"epoch": 28.2,
"grad_norm": 0.04858999699354172,
"learning_rate": 0.0002957901368835552,
"loss": 1.4865,
"step": 112800
},
{
"epoch": 28.225,
"grad_norm": 0.048569995909929276,
"learning_rate": 0.00029578638664916557,
"loss": 1.466,
"step": 112900
},
{
"epoch": 28.25,
"grad_norm": 0.05034118890762329,
"learning_rate": 0.0002957826364147759,
"loss": 1.5571,
"step": 113000
},
{
"epoch": 28.275,
"grad_norm": 0.05421663448214531,
"learning_rate": 0.00029577888618038625,
"loss": 1.5187,
"step": 113100
},
{
"epoch": 28.3,
"grad_norm": 0.04554268717765808,
"learning_rate": 0.0002957751359459966,
"loss": 1.4526,
"step": 113200
},
{
"epoch": 28.325,
"grad_norm": 0.04670153930783272,
"learning_rate": 0.0002957713857116069,
"loss": 1.4785,
"step": 113300
},
{
"epoch": 28.35,
"grad_norm": 0.05041331797838211,
"learning_rate": 0.0002957676354772173,
"loss": 1.4533,
"step": 113400
},
{
"epoch": 28.375,
"grad_norm": 0.042034462094306946,
"learning_rate": 0.00029576388524282766,
"loss": 1.4947,
"step": 113500
},
{
"epoch": 28.4,
"grad_norm": 0.050760041922330856,
"learning_rate": 0.000295760135008438,
"loss": 1.5469,
"step": 113600
},
{
"epoch": 28.425,
"grad_norm": 0.04767528921365738,
"learning_rate": 0.00029575638477404834,
"loss": 1.4801,
"step": 113700
},
{
"epoch": 28.45,
"grad_norm": 0.05914180353283882,
"learning_rate": 0.0002957526720420026,
"loss": 1.5372,
"step": 113800
},
{
"epoch": 28.475,
"grad_norm": 0.05601555109024048,
"learning_rate": 0.00029574892180761296,
"loss": 1.4325,
"step": 113900
},
{
"epoch": 28.5,
"grad_norm": 0.056612931191921234,
"learning_rate": 0.00029574517157322333,
"loss": 1.4873,
"step": 114000
},
{
"epoch": 28.525,
"grad_norm": 0.04357181489467621,
"learning_rate": 0.00029574142133883364,
"loss": 1.4405,
"step": 114100
},
{
"epoch": 28.55,
"grad_norm": 0.05303529277443886,
"learning_rate": 0.000295737671104444,
"loss": 1.4365,
"step": 114200
},
{
"epoch": 28.575,
"grad_norm": 0.048596885055303574,
"learning_rate": 0.0002957339208700543,
"loss": 1.4425,
"step": 114300
},
{
"epoch": 28.6,
"grad_norm": 0.05361025035381317,
"learning_rate": 0.0002957301706356647,
"loss": 1.4063,
"step": 114400
},
{
"epoch": 28.625,
"grad_norm": 0.05975283682346344,
"learning_rate": 0.00029572642040127505,
"loss": 1.4549,
"step": 114500
},
{
"epoch": 28.65,
"grad_norm": 0.04482881724834442,
"learning_rate": 0.0002957226701668854,
"loss": 1.3836,
"step": 114600
},
{
"epoch": 28.675,
"grad_norm": 0.05114329233765602,
"learning_rate": 0.00029571891993249573,
"loss": 1.5901,
"step": 114700
},
{
"epoch": 28.7,
"grad_norm": 0.04038051888346672,
"learning_rate": 0.0002957151696981061,
"loss": 1.5117,
"step": 114800
},
{
"epoch": 28.725,
"grad_norm": 0.052758511155843735,
"learning_rate": 0.00029571141946371646,
"loss": 1.4111,
"step": 114900
},
{
"epoch": 28.75,
"grad_norm": 0.049384575337171555,
"learning_rate": 0.00029570766922932683,
"loss": 1.4381,
"step": 115000
},
{
"epoch": 28.775,
"grad_norm": 0.047072507441043854,
"learning_rate": 0.00029570391899493714,
"loss": 1.4444,
"step": 115100
},
{
"epoch": 28.8,
"grad_norm": 0.05382237955927849,
"learning_rate": 0.0002957001687605475,
"loss": 1.4174,
"step": 115200
},
{
"epoch": 28.825,
"grad_norm": 0.04967265948653221,
"learning_rate": 0.00029569641852615787,
"loss": 1.4709,
"step": 115300
},
{
"epoch": 28.85,
"grad_norm": 0.045560047030448914,
"learning_rate": 0.00029569266829176824,
"loss": 1.5302,
"step": 115400
},
{
"epoch": 28.875,
"grad_norm": 0.058798883110284805,
"learning_rate": 0.00029568891805737855,
"loss": 1.4022,
"step": 115500
},
{
"epoch": 28.9,
"grad_norm": 0.04776821285486221,
"learning_rate": 0.0002956851678229889,
"loss": 1.3512,
"step": 115600
},
{
"epoch": 28.925,
"grad_norm": 0.05173936486244202,
"learning_rate": 0.0002956814175885993,
"loss": 1.5405,
"step": 115700
},
{
"epoch": 28.95,
"grad_norm": 0.04927581176161766,
"learning_rate": 0.00029567766735420965,
"loss": 1.435,
"step": 115800
},
{
"epoch": 28.975,
"grad_norm": 0.04748755320906639,
"learning_rate": 0.00029567391711981996,
"loss": 1.4073,
"step": 115900
},
{
"epoch": 29.0,
"grad_norm": 0.04827181622385979,
"learning_rate": 0.0002956701668854303,
"loss": 1.4046,
"step": 116000
},
{
"epoch": 29.025,
"grad_norm": 0.05039271339774132,
"learning_rate": 0.00029566645415338453,
"loss": 1.3616,
"step": 116100
},
{
"epoch": 29.05,
"grad_norm": 0.046831537038087845,
"learning_rate": 0.0002956627039189949,
"loss": 1.3991,
"step": 116200
},
{
"epoch": 29.075,
"grad_norm": 0.056436687707901,
"learning_rate": 0.00029565895368460527,
"loss": 1.448,
"step": 116300
},
{
"epoch": 29.1,
"grad_norm": 0.04817488044500351,
"learning_rate": 0.00029565520345021563,
"loss": 1.363,
"step": 116400
},
{
"epoch": 29.125,
"grad_norm": 0.05330492928624153,
"learning_rate": 0.00029565145321582594,
"loss": 1.4313,
"step": 116500
},
{
"epoch": 29.15,
"grad_norm": 0.05745427682995796,
"learning_rate": 0.0002956477029814363,
"loss": 1.5579,
"step": 116600
},
{
"epoch": 29.175,
"grad_norm": 0.05263765901327133,
"learning_rate": 0.0002956439527470467,
"loss": 1.5836,
"step": 116700
},
{
"epoch": 29.2,
"grad_norm": 0.044311635196208954,
"learning_rate": 0.00029564020251265704,
"loss": 1.4367,
"step": 116800
},
{
"epoch": 29.225,
"grad_norm": 0.053102701902389526,
"learning_rate": 0.00029563645227826735,
"loss": 1.4936,
"step": 116900
},
{
"epoch": 29.25,
"grad_norm": 0.04289867728948593,
"learning_rate": 0.0002956327020438777,
"loss": 1.438,
"step": 117000
},
{
"epoch": 29.275,
"grad_norm": 0.05283905565738678,
"learning_rate": 0.0002956289518094881,
"loss": 1.5341,
"step": 117100
},
{
"epoch": 29.3,
"grad_norm": 0.0411902479827404,
"learning_rate": 0.0002956252015750984,
"loss": 1.3774,
"step": 117200
},
{
"epoch": 29.325,
"grad_norm": 0.0581793412566185,
"learning_rate": 0.00029562145134070877,
"loss": 1.4712,
"step": 117300
},
{
"epoch": 29.35,
"grad_norm": 0.04655259847640991,
"learning_rate": 0.00029561770110631913,
"loss": 1.2906,
"step": 117400
},
{
"epoch": 29.375,
"grad_norm": 0.05028205364942551,
"learning_rate": 0.0002956139508719295,
"loss": 1.3921,
"step": 117500
},
{
"epoch": 29.4,
"grad_norm": 0.049044106155633926,
"learning_rate": 0.0002956102006375398,
"loss": 1.4684,
"step": 117600
},
{
"epoch": 29.425,
"grad_norm": 0.05344530567526817,
"learning_rate": 0.0002956064504031502,
"loss": 1.399,
"step": 117700
},
{
"epoch": 29.45,
"grad_norm": 0.05248359963297844,
"learning_rate": 0.0002956027001687605,
"loss": 1.3738,
"step": 117800
},
{
"epoch": 29.475,
"grad_norm": 0.053722232580184937,
"learning_rate": 0.00029559894993437085,
"loss": 1.27,
"step": 117900
},
{
"epoch": 29.5,
"grad_norm": 0.05581889674067497,
"learning_rate": 0.0002955951996999812,
"loss": 1.4523,
"step": 118000
},
{
"epoch": 29.525,
"grad_norm": 0.04724375531077385,
"learning_rate": 0.0002955914494655916,
"loss": 1.2637,
"step": 118100
},
{
"epoch": 29.55,
"grad_norm": 0.04487941041588783,
"learning_rate": 0.0002955877367335458,
"loss": 1.3064,
"step": 118200
},
{
"epoch": 29.575,
"grad_norm": 0.04799391329288483,
"learning_rate": 0.00029558398649915616,
"loss": 1.4433,
"step": 118300
},
{
"epoch": 29.6,
"grad_norm": 0.04437430948019028,
"learning_rate": 0.0002955802362647665,
"loss": 1.3427,
"step": 118400
},
{
"epoch": 29.625,
"grad_norm": 0.04969744756817818,
"learning_rate": 0.0002955764860303769,
"loss": 1.3415,
"step": 118500
},
{
"epoch": 29.65,
"grad_norm": 0.05268990993499756,
"learning_rate": 0.0002955727357959872,
"loss": 1.37,
"step": 118600
},
{
"epoch": 29.675,
"grad_norm": 0.05563261732459068,
"learning_rate": 0.00029556898556159757,
"loss": 1.3404,
"step": 118700
},
{
"epoch": 29.7,
"grad_norm": 0.045039862394332886,
"learning_rate": 0.00029556523532720794,
"loss": 1.2967,
"step": 118800
},
{
"epoch": 29.725,
"grad_norm": 0.06740451604127884,
"learning_rate": 0.0002955614850928183,
"loss": 1.4316,
"step": 118900
},
{
"epoch": 29.75,
"grad_norm": 0.046530742198228836,
"learning_rate": 0.0002955577348584286,
"loss": 1.3871,
"step": 119000
},
{
"epoch": 29.775,
"grad_norm": 0.04662451893091202,
"learning_rate": 0.000295553984624039,
"loss": 1.3832,
"step": 119100
},
{
"epoch": 29.8,
"grad_norm": 0.05180426687002182,
"learning_rate": 0.00029555023438964935,
"loss": 1.3783,
"step": 119200
},
{
"epoch": 29.825,
"grad_norm": 0.04919251427054405,
"learning_rate": 0.0002955464841552597,
"loss": 1.3789,
"step": 119300
},
{
"epoch": 29.85,
"grad_norm": 0.04741760343313217,
"learning_rate": 0.00029554273392087,
"loss": 1.392,
"step": 119400
},
{
"epoch": 29.875,
"grad_norm": 0.05151817202568054,
"learning_rate": 0.0002955389836864804,
"loss": 1.3472,
"step": 119500
},
{
"epoch": 29.9,
"grad_norm": 0.05211416259407997,
"learning_rate": 0.0002955352334520907,
"loss": 1.4448,
"step": 119600
},
{
"epoch": 29.925,
"grad_norm": 0.04866619408130646,
"learning_rate": 0.00029553148321770107,
"loss": 1.3788,
"step": 119700
},
{
"epoch": 29.95,
"grad_norm": 0.056409094482660294,
"learning_rate": 0.00029552773298331143,
"loss": 1.4182,
"step": 119800
},
{
"epoch": 29.975,
"grad_norm": 0.045399557799100876,
"learning_rate": 0.0002955239827489218,
"loss": 1.3579,
"step": 119900
},
{
"epoch": 30.0,
"grad_norm": 0.05333389341831207,
"learning_rate": 0.0002955202325145321,
"loss": 1.4833,
"step": 120000
},
{
"epoch": 30.025,
"grad_norm": 0.047169484198093414,
"learning_rate": 0.0002955164822801425,
"loss": 1.3531,
"step": 120100
},
{
"epoch": 30.05,
"grad_norm": 0.04647146537899971,
"learning_rate": 0.00029551273204575285,
"loss": 1.3722,
"step": 120200
},
{
"epoch": 30.075,
"grad_norm": 0.05528531223535538,
"learning_rate": 0.0002955089818113632,
"loss": 1.268,
"step": 120300
},
{
"epoch": 30.1,
"grad_norm": 0.050155188888311386,
"learning_rate": 0.0002955052315769735,
"loss": 1.3659,
"step": 120400
},
{
"epoch": 30.125,
"grad_norm": 0.047319624572992325,
"learning_rate": 0.0002955014813425839,
"loss": 1.4225,
"step": 120500
},
{
"epoch": 30.15,
"grad_norm": 0.04249805584549904,
"learning_rate": 0.00029549773110819426,
"loss": 1.4412,
"step": 120600
},
{
"epoch": 30.175,
"grad_norm": 0.05880492925643921,
"learning_rate": 0.0002954939808738046,
"loss": 1.5054,
"step": 120700
},
{
"epoch": 30.2,
"grad_norm": 0.047143761068582535,
"learning_rate": 0.00029549023063941493,
"loss": 1.3931,
"step": 120800
},
{
"epoch": 30.225,
"grad_norm": 0.04481210932135582,
"learning_rate": 0.00029548648040502525,
"loss": 1.2962,
"step": 120900
},
{
"epoch": 30.25,
"grad_norm": 0.044143520295619965,
"learning_rate": 0.00029548273017063567,
"loss": 1.2338,
"step": 121000
},
{
"epoch": 30.275,
"grad_norm": 0.06169132515788078,
"learning_rate": 0.000295478979936246,
"loss": 1.3578,
"step": 121100
},
{
"epoch": 30.3,
"grad_norm": 0.061004914343357086,
"learning_rate": 0.00029547522970185634,
"loss": 1.334,
"step": 121200
},
{
"epoch": 30.325,
"grad_norm": 0.04402782768011093,
"learning_rate": 0.00029547147946746666,
"loss": 1.404,
"step": 121300
},
{
"epoch": 30.35,
"grad_norm": 0.05749357491731644,
"learning_rate": 0.000295467729233077,
"loss": 1.2942,
"step": 121400
},
{
"epoch": 30.375,
"grad_norm": 0.052716564387083054,
"learning_rate": 0.0002954639789986874,
"loss": 1.2753,
"step": 121500
},
{
"epoch": 30.4,
"grad_norm": 0.04735216125845909,
"learning_rate": 0.00029546022876429775,
"loss": 1.3316,
"step": 121600
},
{
"epoch": 30.425,
"grad_norm": 0.05518503487110138,
"learning_rate": 0.00029545651603225196,
"loss": 1.3901,
"step": 121700
},
{
"epoch": 30.45,
"grad_norm": 0.04617263004183769,
"learning_rate": 0.00029545276579786233,
"loss": 1.3542,
"step": 121800
},
{
"epoch": 30.475,
"grad_norm": 0.04624765366315842,
"learning_rate": 0.0002954490155634727,
"loss": 1.3594,
"step": 121900
},
{
"epoch": 30.5,
"grad_norm": 0.05599815025925636,
"learning_rate": 0.00029544526532908306,
"loss": 1.3957,
"step": 122000
},
{
"epoch": 30.525,
"grad_norm": 0.047623343765735626,
"learning_rate": 0.00029544151509469337,
"loss": 1.3099,
"step": 122100
},
{
"epoch": 30.55,
"grad_norm": 0.04954765364527702,
"learning_rate": 0.00029543776486030374,
"loss": 1.4809,
"step": 122200
},
{
"epoch": 30.575,
"grad_norm": 0.057207658886909485,
"learning_rate": 0.0002954340146259141,
"loss": 1.3149,
"step": 122300
},
{
"epoch": 30.6,
"grad_norm": 0.04670143872499466,
"learning_rate": 0.00029543026439152447,
"loss": 1.3461,
"step": 122400
},
{
"epoch": 30.625,
"grad_norm": 0.04433277249336243,
"learning_rate": 0.0002954265141571348,
"loss": 1.1924,
"step": 122500
},
{
"epoch": 30.65,
"grad_norm": 0.045901257544755936,
"learning_rate": 0.00029542276392274515,
"loss": 1.3508,
"step": 122600
},
{
"epoch": 30.675,
"grad_norm": 0.048084866255521774,
"learning_rate": 0.0002954190136883555,
"loss": 1.3341,
"step": 122700
},
{
"epoch": 30.7,
"grad_norm": 0.04639054462313652,
"learning_rate": 0.0002954152634539659,
"loss": 1.2832,
"step": 122800
},
{
"epoch": 30.725,
"grad_norm": 0.05224520340561867,
"learning_rate": 0.0002954115132195762,
"loss": 1.2682,
"step": 122900
},
{
"epoch": 30.75,
"grad_norm": 0.05258006602525711,
"learning_rate": 0.00029540776298518656,
"loss": 1.3085,
"step": 123000
},
{
"epoch": 30.775,
"grad_norm": 0.0506523959338665,
"learning_rate": 0.00029540401275079687,
"loss": 1.3224,
"step": 123100
},
{
"epoch": 30.8,
"grad_norm": 0.046581752598285675,
"learning_rate": 0.00029540026251640724,
"loss": 1.2794,
"step": 123200
},
{
"epoch": 30.825,
"grad_norm": 0.04979027807712555,
"learning_rate": 0.0002953965122820176,
"loss": 1.1661,
"step": 123300
},
{
"epoch": 30.85,
"grad_norm": 0.07573187351226807,
"learning_rate": 0.00029539276204762797,
"loss": 1.3565,
"step": 123400
},
{
"epoch": 30.875,
"grad_norm": 0.05088147893548012,
"learning_rate": 0.0002953890118132383,
"loss": 1.3488,
"step": 123500
},
{
"epoch": 30.9,
"grad_norm": 0.05240534245967865,
"learning_rate": 0.00029538526157884865,
"loss": 1.336,
"step": 123600
},
{
"epoch": 30.925,
"grad_norm": 0.04134645685553551,
"learning_rate": 0.000295381511344459,
"loss": 1.2747,
"step": 123700
},
{
"epoch": 30.95,
"grad_norm": 0.05094057694077492,
"learning_rate": 0.0002953777611100694,
"loss": 1.3445,
"step": 123800
},
{
"epoch": 30.975,
"grad_norm": 0.045938342809677124,
"learning_rate": 0.0002953740108756797,
"loss": 1.2555,
"step": 123900
},
{
"epoch": 31.0,
"grad_norm": 0.04664922505617142,
"learning_rate": 0.00029537026064129006,
"loss": 1.3741,
"step": 124000
},
{
"epoch": 31.025,
"grad_norm": 0.04887442663311958,
"learning_rate": 0.0002953665104069004,
"loss": 1.2055,
"step": 124100
},
{
"epoch": 31.05,
"grad_norm": 0.04919900372624397,
"learning_rate": 0.0002953627601725108,
"loss": 1.1721,
"step": 124200
},
{
"epoch": 31.075,
"grad_norm": 0.048029493540525436,
"learning_rate": 0.0002953590099381211,
"loss": 1.3029,
"step": 124300
},
{
"epoch": 31.1,
"grad_norm": 0.053546350449323654,
"learning_rate": 0.00029535525970373147,
"loss": 1.3137,
"step": 124400
},
{
"epoch": 31.125,
"grad_norm": 0.04450497403740883,
"learning_rate": 0.0002953515094693418,
"loss": 1.3236,
"step": 124500
},
{
"epoch": 31.15,
"grad_norm": 0.04896382614970207,
"learning_rate": 0.0002953477592349522,
"loss": 1.2933,
"step": 124600
},
{
"epoch": 31.175,
"grad_norm": 0.04476182907819748,
"learning_rate": 0.0002953440465029064,
"loss": 1.3332,
"step": 124700
},
{
"epoch": 31.2,
"grad_norm": 0.054897475987672806,
"learning_rate": 0.0002953402962685167,
"loss": 1.3213,
"step": 124800
},
{
"epoch": 31.225,
"grad_norm": 0.04679589346051216,
"learning_rate": 0.0002953365460341271,
"loss": 1.3065,
"step": 124900
},
{
"epoch": 31.25,
"grad_norm": 0.04921596497297287,
"learning_rate": 0.00029533279579973745,
"loss": 1.1591,
"step": 125000
},
{
"epoch": 31.275,
"grad_norm": 0.0433526448905468,
"learning_rate": 0.0002953290455653478,
"loss": 1.3262,
"step": 125100
},
{
"epoch": 31.3,
"grad_norm": 0.043862484395504,
"learning_rate": 0.00029532529533095813,
"loss": 1.2693,
"step": 125200
},
{
"epoch": 31.325,
"grad_norm": 0.06467683613300323,
"learning_rate": 0.0002953215450965685,
"loss": 1.3879,
"step": 125300
},
{
"epoch": 31.35,
"grad_norm": 0.05398791283369064,
"learning_rate": 0.00029531779486217886,
"loss": 1.2593,
"step": 125400
},
{
"epoch": 31.375,
"grad_norm": 0.06727266311645508,
"learning_rate": 0.00029531404462778923,
"loss": 1.3277,
"step": 125500
},
{
"epoch": 31.4,
"grad_norm": 0.0463390052318573,
"learning_rate": 0.00029531029439339954,
"loss": 1.3013,
"step": 125600
},
{
"epoch": 31.425,
"grad_norm": 0.04781678318977356,
"learning_rate": 0.0002953065441590099,
"loss": 1.2572,
"step": 125700
},
{
"epoch": 31.45,
"grad_norm": 0.0504741370677948,
"learning_rate": 0.00029530279392462027,
"loss": 1.276,
"step": 125800
},
{
"epoch": 31.475,
"grad_norm": 0.08227650821208954,
"learning_rate": 0.00029529904369023064,
"loss": 1.3546,
"step": 125900
},
{
"epoch": 31.5,
"grad_norm": 0.04831939563155174,
"learning_rate": 0.00029529529345584095,
"loss": 1.2622,
"step": 126000
},
{
"epoch": 31.525,
"grad_norm": 0.04759907349944115,
"learning_rate": 0.0002952915432214513,
"loss": 1.3973,
"step": 126100
},
{
"epoch": 31.55,
"grad_norm": 0.0501595176756382,
"learning_rate": 0.00029528779298706163,
"loss": 1.309,
"step": 126200
},
{
"epoch": 31.575,
"grad_norm": 0.04236988723278046,
"learning_rate": 0.00029528404275267205,
"loss": 1.2076,
"step": 126300
},
{
"epoch": 31.6,
"grad_norm": 0.045248087495565414,
"learning_rate": 0.00029528029251828236,
"loss": 1.1881,
"step": 126400
},
{
"epoch": 31.625,
"grad_norm": 0.05358180031180382,
"learning_rate": 0.00029527654228389273,
"loss": 1.242,
"step": 126500
},
{
"epoch": 31.65,
"grad_norm": 0.06812089681625366,
"learning_rate": 0.00029527279204950304,
"loss": 1.3071,
"step": 126600
},
{
"epoch": 31.675,
"grad_norm": 0.0523652583360672,
"learning_rate": 0.0002952690418151134,
"loss": 1.2635,
"step": 126700
},
{
"epoch": 31.7,
"grad_norm": 0.054195646196603775,
"learning_rate": 0.00029526529158072377,
"loss": 1.3601,
"step": 126800
},
{
"epoch": 31.725,
"grad_norm": 0.05106286332011223,
"learning_rate": 0.00029526154134633414,
"loss": 1.2716,
"step": 126900
},
{
"epoch": 31.75,
"grad_norm": 0.04490172490477562,
"learning_rate": 0.00029525779111194445,
"loss": 1.1354,
"step": 127000
},
{
"epoch": 31.775,
"grad_norm": 0.04846130311489105,
"learning_rate": 0.0002952540408775548,
"loss": 1.3259,
"step": 127100
},
{
"epoch": 31.8,
"grad_norm": 0.050297126173973083,
"learning_rate": 0.0002952502906431652,
"loss": 1.1898,
"step": 127200
},
{
"epoch": 31.825,
"grad_norm": 0.0532267764210701,
"learning_rate": 0.00029524654040877555,
"loss": 1.1544,
"step": 127300
},
{
"epoch": 31.85,
"grad_norm": 0.03898947685956955,
"learning_rate": 0.00029524282767672976,
"loss": 1.3027,
"step": 127400
},
{
"epoch": 31.875,
"grad_norm": 0.055518005043268204,
"learning_rate": 0.0002952390774423401,
"loss": 1.1795,
"step": 127500
},
{
"epoch": 31.9,
"grad_norm": 0.045770760625600815,
"learning_rate": 0.0002952353272079505,
"loss": 1.2203,
"step": 127600
},
{
"epoch": 31.925,
"grad_norm": 0.04108942300081253,
"learning_rate": 0.00029523157697356085,
"loss": 1.2737,
"step": 127700
},
{
"epoch": 31.95,
"grad_norm": 0.04591604694724083,
"learning_rate": 0.00029522782673917117,
"loss": 1.2465,
"step": 127800
},
{
"epoch": 31.975,
"grad_norm": 0.04735784977674484,
"learning_rate": 0.00029522407650478153,
"loss": 1.3007,
"step": 127900
},
{
"epoch": 32.0,
"grad_norm": 0.04895665496587753,
"learning_rate": 0.0002952203262703919,
"loss": 1.3006,
"step": 128000
},
{
"epoch": 32.025,
"grad_norm": 0.05351528897881508,
"learning_rate": 0.00029521657603600226,
"loss": 1.2599,
"step": 128100
},
{
"epoch": 32.05,
"grad_norm": 0.04478209838271141,
"learning_rate": 0.0002952128258016126,
"loss": 1.2839,
"step": 128200
},
{
"epoch": 32.075,
"grad_norm": 0.05886415019631386,
"learning_rate": 0.00029520907556722294,
"loss": 1.2412,
"step": 128300
},
{
"epoch": 32.1,
"grad_norm": 0.04743971303105354,
"learning_rate": 0.00029520532533283325,
"loss": 1.2031,
"step": 128400
},
{
"epoch": 32.125,
"grad_norm": 0.046698570251464844,
"learning_rate": 0.0002952015750984436,
"loss": 1.2691,
"step": 128500
},
{
"epoch": 32.15,
"grad_norm": 0.04950440675020218,
"learning_rate": 0.000295197824864054,
"loss": 1.2178,
"step": 128600
},
{
"epoch": 32.175,
"grad_norm": 0.047533079981803894,
"learning_rate": 0.0002951940746296643,
"loss": 1.1742,
"step": 128700
},
{
"epoch": 32.2,
"grad_norm": 0.1709842085838318,
"learning_rate": 0.00029519032439527466,
"loss": 1.2904,
"step": 128800
},
{
"epoch": 32.225,
"grad_norm": 0.053603630512952805,
"learning_rate": 0.00029518657416088503,
"loss": 1.2806,
"step": 128900
},
{
"epoch": 32.25,
"grad_norm": 0.05528594180941582,
"learning_rate": 0.0002951828239264954,
"loss": 1.2891,
"step": 129000
},
{
"epoch": 32.275,
"grad_norm": 0.051689211279153824,
"learning_rate": 0.0002951790736921057,
"loss": 1.3107,
"step": 129100
},
{
"epoch": 32.3,
"grad_norm": 0.0504557229578495,
"learning_rate": 0.0002951753234577161,
"loss": 1.2528,
"step": 129200
},
{
"epoch": 32.325,
"grad_norm": 0.048762448132038116,
"learning_rate": 0.00029517157322332644,
"loss": 1.1503,
"step": 129300
},
{
"epoch": 32.35,
"grad_norm": 0.05114434286952019,
"learning_rate": 0.0002951678229889368,
"loss": 1.1685,
"step": 129400
},
{
"epoch": 32.375,
"grad_norm": 0.04877127707004547,
"learning_rate": 0.0002951640727545471,
"loss": 1.1642,
"step": 129500
},
{
"epoch": 32.4,
"grad_norm": 0.04645070433616638,
"learning_rate": 0.0002951603225201575,
"loss": 1.2363,
"step": 129600
},
{
"epoch": 32.425,
"grad_norm": 0.049255430698394775,
"learning_rate": 0.0002951565722857678,
"loss": 1.286,
"step": 129700
},
{
"epoch": 32.45,
"grad_norm": 0.05051419138908386,
"learning_rate": 0.00029515282205137816,
"loss": 1.2311,
"step": 129800
},
{
"epoch": 32.475,
"grad_norm": 0.05819782614707947,
"learning_rate": 0.00029514907181698853,
"loss": 1.2218,
"step": 129900
},
{
"epoch": 32.5,
"grad_norm": 0.04523173347115517,
"learning_rate": 0.0002951453215825989,
"loss": 1.17,
"step": 130000
},
{
"epoch": 32.525,
"grad_norm": 0.047802697867155075,
"learning_rate": 0.0002951415713482092,
"loss": 1.2679,
"step": 130100
},
{
"epoch": 32.55,
"grad_norm": 0.04578109085559845,
"learning_rate": 0.0002951378211138196,
"loss": 1.134,
"step": 130200
},
{
"epoch": 32.575,
"grad_norm": 0.040033962577581406,
"learning_rate": 0.00029513407087942994,
"loss": 1.222,
"step": 130300
},
{
"epoch": 32.6,
"grad_norm": 0.04128117114305496,
"learning_rate": 0.0002951303206450403,
"loss": 1.2106,
"step": 130400
},
{
"epoch": 32.625,
"grad_norm": 0.04531345143914223,
"learning_rate": 0.0002951265704106506,
"loss": 1.186,
"step": 130500
},
{
"epoch": 32.65,
"grad_norm": 0.043665412813425064,
"learning_rate": 0.000295122820176261,
"loss": 1.2078,
"step": 130600
},
{
"epoch": 32.675,
"grad_norm": 0.04887350648641586,
"learning_rate": 0.00029511906994187135,
"loss": 1.2482,
"step": 130700
},
{
"epoch": 32.7,
"grad_norm": 0.05151134356856346,
"learning_rate": 0.0002951153197074817,
"loss": 1.2568,
"step": 130800
},
{
"epoch": 32.725,
"grad_norm": 0.042473357170820236,
"learning_rate": 0.00029511156947309203,
"loss": 1.1829,
"step": 130900
},
{
"epoch": 32.75,
"grad_norm": 0.05092649906873703,
"learning_rate": 0.0002951078192387024,
"loss": 1.1481,
"step": 131000
},
{
"epoch": 32.775,
"grad_norm": 0.044292863458395004,
"learning_rate": 0.00029510406900431276,
"loss": 1.1682,
"step": 131100
},
{
"epoch": 32.8,
"grad_norm": 0.054200585931539536,
"learning_rate": 0.0002951003187699231,
"loss": 1.2387,
"step": 131200
},
{
"epoch": 32.825,
"grad_norm": 0.04644659161567688,
"learning_rate": 0.00029509656853553344,
"loss": 1.2118,
"step": 131300
},
{
"epoch": 32.85,
"grad_norm": 0.06080161780118942,
"learning_rate": 0.0002950928558034877,
"loss": 1.1483,
"step": 131400
},
{
"epoch": 32.875,
"grad_norm": 0.07698054611682892,
"learning_rate": 0.000295089105569098,
"loss": 1.1887,
"step": 131500
},
{
"epoch": 32.9,
"grad_norm": 0.038868315517902374,
"learning_rate": 0.00029508535533470843,
"loss": 1.1528,
"step": 131600
},
{
"epoch": 32.925,
"grad_norm": 0.05261719226837158,
"learning_rate": 0.00029508160510031874,
"loss": 1.085,
"step": 131700
},
{
"epoch": 32.95,
"grad_norm": 0.043816640973091125,
"learning_rate": 0.0002950778548659291,
"loss": 1.2063,
"step": 131800
},
{
"epoch": 32.975,
"grad_norm": 0.042075928300619125,
"learning_rate": 0.0002950741046315394,
"loss": 1.1792,
"step": 131900
},
{
"epoch": 33.0,
"grad_norm": 0.04904596507549286,
"learning_rate": 0.0002950703543971498,
"loss": 1.2376,
"step": 132000
},
{
"epoch": 33.025,
"grad_norm": 0.051781512796878815,
"learning_rate": 0.00029506660416276015,
"loss": 1.181,
"step": 132100
},
{
"epoch": 33.05,
"grad_norm": 0.055431291460990906,
"learning_rate": 0.0002950628539283705,
"loss": 1.1771,
"step": 132200
},
{
"epoch": 33.075,
"grad_norm": 0.04665238782763481,
"learning_rate": 0.00029505910369398083,
"loss": 1.1322,
"step": 132300
},
{
"epoch": 33.1,
"grad_norm": 0.04755477234721184,
"learning_rate": 0.0002950553534595912,
"loss": 1.2262,
"step": 132400
},
{
"epoch": 33.125,
"grad_norm": 0.0748729407787323,
"learning_rate": 0.00029505164072754546,
"loss": 1.0936,
"step": 132500
},
{
"epoch": 33.15,
"grad_norm": 0.05131325498223305,
"learning_rate": 0.00029504789049315577,
"loss": 1.2296,
"step": 132600
},
{
"epoch": 33.175,
"grad_norm": 0.051855139434337616,
"learning_rate": 0.00029504414025876614,
"loss": 1.2527,
"step": 132700
},
{
"epoch": 33.2,
"grad_norm": 0.04259216785430908,
"learning_rate": 0.0002950403900243765,
"loss": 1.1978,
"step": 132800
},
{
"epoch": 33.225,
"grad_norm": 0.0451393760740757,
"learning_rate": 0.00029503663978998687,
"loss": 1.1695,
"step": 132900
},
{
"epoch": 33.25,
"grad_norm": 0.0477844700217247,
"learning_rate": 0.0002950328895555972,
"loss": 1.1885,
"step": 133000
},
{
"epoch": 33.275,
"grad_norm": 0.04242611676454544,
"learning_rate": 0.00029502913932120755,
"loss": 1.1393,
"step": 133100
},
{
"epoch": 33.3,
"grad_norm": 0.046090077608823776,
"learning_rate": 0.00029502538908681786,
"loss": 1.1158,
"step": 133200
},
{
"epoch": 33.325,
"grad_norm": 0.04372167959809303,
"learning_rate": 0.0002950216388524283,
"loss": 1.1583,
"step": 133300
},
{
"epoch": 33.35,
"grad_norm": 0.044858288019895554,
"learning_rate": 0.0002950178886180386,
"loss": 1.1877,
"step": 133400
},
{
"epoch": 33.375,
"grad_norm": 0.042134176939725876,
"learning_rate": 0.00029501413838364896,
"loss": 1.1365,
"step": 133500
},
{
"epoch": 33.4,
"grad_norm": 0.05012949928641319,
"learning_rate": 0.00029501038814925927,
"loss": 1.2341,
"step": 133600
},
{
"epoch": 33.425,
"grad_norm": 0.04589414969086647,
"learning_rate": 0.00029500663791486964,
"loss": 1.1346,
"step": 133700
},
{
"epoch": 33.45,
"grad_norm": 0.059703532606363297,
"learning_rate": 0.00029500288768048,
"loss": 1.2177,
"step": 133800
},
{
"epoch": 33.475,
"grad_norm": 0.04715392366051674,
"learning_rate": 0.00029499913744609037,
"loss": 1.2044,
"step": 133900
},
{
"epoch": 33.5,
"grad_norm": 0.04391086474061012,
"learning_rate": 0.0002949953872117007,
"loss": 1.1846,
"step": 134000
},
{
"epoch": 33.525,
"grad_norm": 0.04045191779732704,
"learning_rate": 0.00029499163697731105,
"loss": 1.2048,
"step": 134100
},
{
"epoch": 33.55,
"grad_norm": 0.04283670708537102,
"learning_rate": 0.0002949878867429214,
"loss": 1.2246,
"step": 134200
},
{
"epoch": 33.575,
"grad_norm": 0.04338289797306061,
"learning_rate": 0.0002949841365085318,
"loss": 1.2334,
"step": 134300
},
{
"epoch": 33.6,
"grad_norm": 0.05026433989405632,
"learning_rate": 0.0002949803862741421,
"loss": 1.1017,
"step": 134400
},
{
"epoch": 33.625,
"grad_norm": 0.04827344790101051,
"learning_rate": 0.00029497663603975246,
"loss": 1.1765,
"step": 134500
},
{
"epoch": 33.65,
"grad_norm": 0.055267006158828735,
"learning_rate": 0.0002949728858053628,
"loss": 1.0555,
"step": 134600
},
{
"epoch": 33.675,
"grad_norm": 0.05551549047231674,
"learning_rate": 0.0002949691730733171,
"loss": 1.1171,
"step": 134700
},
{
"epoch": 33.7,
"grad_norm": 0.04356600344181061,
"learning_rate": 0.0002949654228389274,
"loss": 1.2224,
"step": 134800
},
{
"epoch": 33.725,
"grad_norm": 0.049372829496860504,
"learning_rate": 0.00029496167260453776,
"loss": 1.0843,
"step": 134900
},
{
"epoch": 33.75,
"grad_norm": 0.04735811799764633,
"learning_rate": 0.00029495792237014813,
"loss": 1.2027,
"step": 135000
},
{
"epoch": 33.775,
"grad_norm": 0.048068366944789886,
"learning_rate": 0.0002949541721357585,
"loss": 1.182,
"step": 135100
},
{
"epoch": 33.8,
"grad_norm": 0.05330264940857887,
"learning_rate": 0.0002949504219013688,
"loss": 1.1519,
"step": 135200
},
{
"epoch": 33.825,
"grad_norm": 0.04151195287704468,
"learning_rate": 0.0002949466716669792,
"loss": 1.0107,
"step": 135300
},
{
"epoch": 33.85,
"grad_norm": 0.04683278128504753,
"learning_rate": 0.0002949429214325895,
"loss": 1.2629,
"step": 135400
},
{
"epoch": 33.875,
"grad_norm": 0.04796934127807617,
"learning_rate": 0.00029493917119819985,
"loss": 1.0715,
"step": 135500
},
{
"epoch": 33.9,
"grad_norm": 0.048207636922597885,
"learning_rate": 0.0002949354209638102,
"loss": 1.1114,
"step": 135600
},
{
"epoch": 33.925,
"grad_norm": 0.0472245067358017,
"learning_rate": 0.0002949316707294206,
"loss": 1.1557,
"step": 135700
},
{
"epoch": 33.95,
"grad_norm": 0.051259011030197144,
"learning_rate": 0.0002949279204950309,
"loss": 1.1246,
"step": 135800
},
{
"epoch": 33.975,
"grad_norm": 0.054303720593452454,
"learning_rate": 0.00029492417026064126,
"loss": 1.0731,
"step": 135900
},
{
"epoch": 34.0,
"grad_norm": 0.06228245794773102,
"learning_rate": 0.00029492042002625163,
"loss": 1.1498,
"step": 136000
},
{
"epoch": 34.025,
"grad_norm": 0.04442556947469711,
"learning_rate": 0.000294916669791862,
"loss": 1.1424,
"step": 136100
},
{
"epoch": 34.05,
"grad_norm": 0.05475945398211479,
"learning_rate": 0.0002949129195574723,
"loss": 1.1854,
"step": 136200
},
{
"epoch": 34.075,
"grad_norm": 0.058647606521844864,
"learning_rate": 0.0002949091693230827,
"loss": 1.2086,
"step": 136300
},
{
"epoch": 34.1,
"grad_norm": 0.04777631536126137,
"learning_rate": 0.00029490541908869304,
"loss": 1.175,
"step": 136400
},
{
"epoch": 34.125,
"grad_norm": 0.04744923487305641,
"learning_rate": 0.00029490166885430335,
"loss": 1.0887,
"step": 136500
},
{
"epoch": 34.15,
"grad_norm": 0.04286637902259827,
"learning_rate": 0.0002948979186199137,
"loss": 1.1652,
"step": 136600
},
{
"epoch": 34.175,
"grad_norm": 0.0456664115190506,
"learning_rate": 0.00029489416838552403,
"loss": 1.0565,
"step": 136700
},
{
"epoch": 34.2,
"grad_norm": 0.06168069317936897,
"learning_rate": 0.0002948904181511344,
"loss": 1.2153,
"step": 136800
},
{
"epoch": 34.225,
"grad_norm": 0.04141145944595337,
"learning_rate": 0.00029488666791674476,
"loss": 1.1138,
"step": 136900
},
{
"epoch": 34.25,
"grad_norm": 0.04432584345340729,
"learning_rate": 0.00029488291768235513,
"loss": 1.1477,
"step": 137000
},
{
"epoch": 34.275,
"grad_norm": 0.04956555366516113,
"learning_rate": 0.00029487916744796544,
"loss": 1.0743,
"step": 137100
},
{
"epoch": 34.3,
"grad_norm": 0.04936617240309715,
"learning_rate": 0.0002948754172135758,
"loss": 0.988,
"step": 137200
},
{
"epoch": 34.325,
"grad_norm": 0.04362035542726517,
"learning_rate": 0.00029487166697918617,
"loss": 1.0981,
"step": 137300
},
{
"epoch": 34.35,
"grad_norm": 0.051287226378917694,
"learning_rate": 0.00029486791674479654,
"loss": 1.088,
"step": 137400
},
{
"epoch": 34.375,
"grad_norm": 0.03998219966888428,
"learning_rate": 0.00029486416651040685,
"loss": 1.1762,
"step": 137500
},
{
"epoch": 34.4,
"grad_norm": 0.048108555376529694,
"learning_rate": 0.0002948604162760172,
"loss": 1.084,
"step": 137600
},
{
"epoch": 34.425,
"grad_norm": 0.04450273886322975,
"learning_rate": 0.0002948566660416276,
"loss": 1.0954,
"step": 137700
},
{
"epoch": 34.45,
"grad_norm": 0.04805700480937958,
"learning_rate": 0.00029485291580723795,
"loss": 0.9584,
"step": 137800
},
{
"epoch": 34.475,
"grad_norm": 0.05516688898205757,
"learning_rate": 0.00029484916557284826,
"loss": 1.2255,
"step": 137900
},
{
"epoch": 34.5,
"grad_norm": 0.04300745949149132,
"learning_rate": 0.0002948454153384586,
"loss": 1.113,
"step": 138000
},
{
"epoch": 34.525,
"grad_norm": 0.04395318031311035,
"learning_rate": 0.000294841665104069,
"loss": 1.0804,
"step": 138100
},
{
"epoch": 34.55,
"grad_norm": 0.0548313707113266,
"learning_rate": 0.00029483791486967936,
"loss": 1.1407,
"step": 138200
},
{
"epoch": 34.575,
"grad_norm": 0.04328515753149986,
"learning_rate": 0.00029483416463528967,
"loss": 1.1493,
"step": 138300
},
{
"epoch": 34.6,
"grad_norm": 0.0498124323785305,
"learning_rate": 0.00029483041440090004,
"loss": 1.1091,
"step": 138400
},
{
"epoch": 34.625,
"grad_norm": 0.0529802069067955,
"learning_rate": 0.00029482666416651035,
"loss": 1.1526,
"step": 138500
},
{
"epoch": 34.65,
"grad_norm": 0.0480722077190876,
"learning_rate": 0.0002948229139321207,
"loss": 1.1961,
"step": 138600
},
{
"epoch": 34.675,
"grad_norm": 0.03908173367381096,
"learning_rate": 0.000294819201200075,
"loss": 1.0955,
"step": 138700
},
{
"epoch": 34.7,
"grad_norm": 0.04808943718671799,
"learning_rate": 0.00029481545096568534,
"loss": 1.1239,
"step": 138800
},
{
"epoch": 34.725,
"grad_norm": 0.046047843992710114,
"learning_rate": 0.00029481170073129565,
"loss": 1.0062,
"step": 138900
},
{
"epoch": 34.75,
"grad_norm": 0.041441336274147034,
"learning_rate": 0.000294807950496906,
"loss": 1.1386,
"step": 139000
},
{
"epoch": 34.775,
"grad_norm": 0.044936537742614746,
"learning_rate": 0.0002948042002625164,
"loss": 1.0692,
"step": 139100
},
{
"epoch": 34.8,
"grad_norm": 0.04202251508831978,
"learning_rate": 0.00029480045002812675,
"loss": 1.1048,
"step": 139200
},
{
"epoch": 34.825,
"grad_norm": 0.06056401878595352,
"learning_rate": 0.00029479669979373706,
"loss": 1.0427,
"step": 139300
},
{
"epoch": 34.85,
"grad_norm": 0.047068677842617035,
"learning_rate": 0.00029479294955934743,
"loss": 1.0166,
"step": 139400
},
{
"epoch": 34.875,
"grad_norm": 0.0437459833920002,
"learning_rate": 0.0002947891993249578,
"loss": 1.1336,
"step": 139500
},
{
"epoch": 34.9,
"grad_norm": 0.04363924637436867,
"learning_rate": 0.00029478544909056816,
"loss": 1.0419,
"step": 139600
},
{
"epoch": 34.925,
"grad_norm": 0.04847422614693642,
"learning_rate": 0.0002947816988561785,
"loss": 1.1885,
"step": 139700
},
{
"epoch": 34.95,
"grad_norm": 0.04593125358223915,
"learning_rate": 0.00029477794862178884,
"loss": 1.1173,
"step": 139800
},
{
"epoch": 34.975,
"grad_norm": 0.04662812873721123,
"learning_rate": 0.0002947741983873992,
"loss": 1.0086,
"step": 139900
},
{
"epoch": 35.0,
"grad_norm": 0.04696165770292282,
"learning_rate": 0.0002947704481530096,
"loss": 0.9674,
"step": 140000
},
{
"epoch": 35.025,
"grad_norm": 0.04659904167056084,
"learning_rate": 0.0002947666979186199,
"loss": 1.0319,
"step": 140100
},
{
"epoch": 35.05,
"grad_norm": 0.0433788076043129,
"learning_rate": 0.00029476294768423025,
"loss": 1.0989,
"step": 140200
},
{
"epoch": 35.075,
"grad_norm": 0.04491908475756645,
"learning_rate": 0.00029475919744984056,
"loss": 1.0623,
"step": 140300
},
{
"epoch": 35.1,
"grad_norm": 0.045701559633016586,
"learning_rate": 0.00029475544721545093,
"loss": 1.1146,
"step": 140400
},
{
"epoch": 35.125,
"grad_norm": 0.04654062166810036,
"learning_rate": 0.0002947517344834052,
"loss": 1.0735,
"step": 140500
},
{
"epoch": 35.15,
"grad_norm": 0.05366494506597519,
"learning_rate": 0.0002947479842490155,
"loss": 1.1706,
"step": 140600
},
{
"epoch": 35.175,
"grad_norm": 0.047658320516347885,
"learning_rate": 0.00029474423401462587,
"loss": 1.1263,
"step": 140700
},
{
"epoch": 35.2,
"grad_norm": 0.04554996266961098,
"learning_rate": 0.00029474048378023624,
"loss": 1.1135,
"step": 140800
},
{
"epoch": 35.225,
"grad_norm": 0.04832541570067406,
"learning_rate": 0.0002947367335458466,
"loss": 1.0375,
"step": 140900
},
{
"epoch": 35.25,
"grad_norm": 0.0434059239923954,
"learning_rate": 0.0002947329833114569,
"loss": 1.0696,
"step": 141000
},
{
"epoch": 35.275,
"grad_norm": 0.04571983963251114,
"learning_rate": 0.0002947292330770673,
"loss": 1.1276,
"step": 141100
},
{
"epoch": 35.3,
"grad_norm": 0.04176199808716774,
"learning_rate": 0.00029472548284267765,
"loss": 0.957,
"step": 141200
},
{
"epoch": 35.325,
"grad_norm": 0.06178323179483414,
"learning_rate": 0.000294721732608288,
"loss": 1.0451,
"step": 141300
},
{
"epoch": 35.35,
"grad_norm": 0.05882290005683899,
"learning_rate": 0.0002947179823738983,
"loss": 1.1542,
"step": 141400
},
{
"epoch": 35.375,
"grad_norm": 0.04132578894495964,
"learning_rate": 0.0002947142321395087,
"loss": 0.9828,
"step": 141500
},
{
"epoch": 35.4,
"grad_norm": 0.04464949667453766,
"learning_rate": 0.00029471048190511906,
"loss": 1.0171,
"step": 141600
},
{
"epoch": 35.425,
"grad_norm": 0.04540353640913963,
"learning_rate": 0.0002947067316707294,
"loss": 1.1018,
"step": 141700
},
{
"epoch": 35.45,
"grad_norm": 0.04491226375102997,
"learning_rate": 0.00029470298143633973,
"loss": 1.1166,
"step": 141800
},
{
"epoch": 35.475,
"grad_norm": 0.0440848246216774,
"learning_rate": 0.0002946992312019501,
"loss": 1.039,
"step": 141900
},
{
"epoch": 35.5,
"grad_norm": 0.04919476807117462,
"learning_rate": 0.0002946954809675604,
"loss": 0.9442,
"step": 142000
}
],
"logging_steps": 100,
"max_steps": 8000000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2000,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.68516799709184e+17,
"train_batch_size": 125,
"trial_name": null,
"trial_params": null
}