MorganGen / trainer_state.json
lamthuy's picture
Upload folder using huggingface_hub
f0a96d3 verified
{
"best_metric": 0.035888671875,
"best_model_checkpoint": "./results_morgangen/checkpoint-100000",
"epoch": 0.001,
"eval_steps": 20000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1e-06,
"grad_norm": 2.3610475063323975,
"learning_rate": 4.9328196339992464e-06,
"loss": 3.6864,
"step": 100
},
{
"epoch": 2e-06,
"grad_norm": 1.656525731086731,
"learning_rate": 5.719504324825564e-06,
"loss": 1.718,
"step": 200
},
{
"epoch": 3e-06,
"grad_norm": 1.8576802015304565,
"learning_rate": 6.1708683260303926e-06,
"loss": 1.4701,
"step": 300
},
{
"epoch": 4e-06,
"grad_norm": 1.383483648300171,
"learning_rate": 6.488740554563935e-06,
"loss": 1.3455,
"step": 400
},
{
"epoch": 5e-06,
"grad_norm": 1.4719737768173218,
"learning_rate": 6.734317372309117e-06,
"loss": 1.261,
"step": 500
},
{
"epoch": 6e-06,
"grad_norm": 1.5180269479751587,
"learning_rate": 6.934466112452983e-06,
"loss": 1.1993,
"step": 600
},
{
"epoch": 7e-06,
"grad_norm": 1.4714821577072144,
"learning_rate": 7.103398676137137e-06,
"loss": 1.1509,
"step": 700
},
{
"epoch": 8e-06,
"grad_norm": 1.4220333099365234,
"learning_rate": 7.249551256067741e-06,
"loss": 1.104,
"step": 800
},
{
"epoch": 9e-06,
"grad_norm": 2.056689977645874,
"learning_rate": 7.378343796989793e-06,
"loss": 1.0759,
"step": 900
},
{
"epoch": 1e-05,
"grad_norm": 1.5754709243774414,
"learning_rate": 7.493465960993282e-06,
"loss": 1.0397,
"step": 1000
},
{
"epoch": 1.1e-05,
"grad_norm": 1.4028830528259277,
"learning_rate": 7.596550404874257e-06,
"loss": 1.0055,
"step": 1100
},
{
"epoch": 1.2e-05,
"grad_norm": 1.4989030361175537,
"learning_rate": 7.691601109175854e-06,
"loss": 0.9769,
"step": 1200
},
{
"epoch": 1.3e-05,
"grad_norm": 1.9516693353652954,
"learning_rate": 7.778996312200985e-06,
"loss": 0.9499,
"step": 1300
},
{
"epoch": 1.4e-05,
"grad_norm": 1.7118744850158691,
"learning_rate": 7.859877791059908e-06,
"loss": 0.9245,
"step": 1400
},
{
"epoch": 1.5e-05,
"grad_norm": 1.9349831342697144,
"learning_rate": 7.935149519312563e-06,
"loss": 0.9002,
"step": 1500
},
{
"epoch": 1.6e-05,
"grad_norm": 1.3496636152267456,
"learning_rate": 8.005539439502828e-06,
"loss": 0.881,
"step": 1600
},
{
"epoch": 1.7e-05,
"grad_norm": 1.6026203632354736,
"learning_rate": 8.071642395272339e-06,
"loss": 0.8592,
"step": 1700
},
{
"epoch": 1.8e-05,
"grad_norm": 1.9675606489181519,
"learning_rate": 8.133950723905457e-06,
"loss": 0.841,
"step": 1800
},
{
"epoch": 1.9e-05,
"grad_norm": 2.057551145553589,
"learning_rate": 8.19287653490949e-06,
"loss": 0.8228,
"step": 1900
},
{
"epoch": 2e-05,
"grad_norm": 1.8387638330459595,
"learning_rate": 8.248223335219199e-06,
"loss": 0.8037,
"step": 2000
},
{
"epoch": 2.1e-05,
"grad_norm": 1.7323527336120605,
"learning_rate": 8.30140420048809e-06,
"loss": 0.7874,
"step": 2100
},
{
"epoch": 2.2e-05,
"grad_norm": 2.6023099422454834,
"learning_rate": 8.352101374530827e-06,
"loss": 0.7677,
"step": 2200
},
{
"epoch": 2.3e-05,
"grad_norm": 1.9935753345489502,
"learning_rate": 8.400536533238381e-06,
"loss": 0.7554,
"step": 2300
},
{
"epoch": 2.4e-05,
"grad_norm": 1.632101058959961,
"learning_rate": 8.446902938290931e-06,
"loss": 0.7376,
"step": 2400
},
{
"epoch": 2.5e-05,
"grad_norm": 2.2343554496765137,
"learning_rate": 8.491370094967829e-06,
"loss": 0.7202,
"step": 2500
},
{
"epoch": 2.6e-05,
"grad_norm": 2.182626962661743,
"learning_rate": 8.534087492996389e-06,
"loss": 0.706,
"step": 2600
},
{
"epoch": 2.7e-05,
"grad_norm": 1.9008549451828003,
"learning_rate": 8.575187638879847e-06,
"loss": 0.694,
"step": 2700
},
{
"epoch": 2.8e-05,
"grad_norm": 1.8491170406341553,
"learning_rate": 8.614788534877808e-06,
"loss": 0.6765,
"step": 2800
},
{
"epoch": 2.9e-05,
"grad_norm": 2.2471179962158203,
"learning_rate": 8.652995721556234e-06,
"loss": 0.6633,
"step": 2900
},
{
"epoch": 3e-05,
"grad_norm": 1.9031002521514893,
"learning_rate": 8.689903972981059e-06,
"loss": 0.6455,
"step": 3000
},
{
"epoch": 3.1e-05,
"grad_norm": 1.8448904752731323,
"learning_rate": 8.725598713115716e-06,
"loss": 0.6356,
"step": 3100
},
{
"epoch": 3.2e-05,
"grad_norm": 1.7731763124465942,
"learning_rate": 8.760157206696729e-06,
"loss": 0.6215,
"step": 3200
},
{
"epoch": 3.3e-05,
"grad_norm": 1.9336998462677002,
"learning_rate": 8.79364956635058e-06,
"loss": 0.6082,
"step": 3300
},
{
"epoch": 3.4e-05,
"grad_norm": 1.5020197629928589,
"learning_rate": 8.82613960896169e-06,
"loss": 0.5918,
"step": 3400
},
{
"epoch": 3.5e-05,
"grad_norm": 1.8765689134597778,
"learning_rate": 8.85768558758383e-06,
"loss": 0.5799,
"step": 3500
},
{
"epoch": 3.6e-05,
"grad_norm": 1.8304609060287476,
"learning_rate": 8.888340819988166e-06,
"loss": 0.5658,
"step": 3600
},
{
"epoch": 3.7e-05,
"grad_norm": 1.663233757019043,
"learning_rate": 8.918154230884686e-06,
"loss": 0.5574,
"step": 3700
},
{
"epoch": 3.8e-05,
"grad_norm": 1.8172345161437988,
"learning_rate": 8.947170821665072e-06,
"loss": 0.5465,
"step": 3800
},
{
"epoch": 3.9e-05,
"grad_norm": 1.7093075513839722,
"learning_rate": 8.975432078990786e-06,
"loss": 0.5315,
"step": 3900
},
{
"epoch": 4e-05,
"grad_norm": 1.7914036512374878,
"learning_rate": 9.002976331538332e-06,
"loss": 0.5227,
"step": 4000
},
{
"epoch": 4.1e-05,
"grad_norm": 1.441435694694519,
"learning_rate": 9.029839062600307e-06,
"loss": 0.5131,
"step": 4100
},
{
"epoch": 4.2e-05,
"grad_norm": 1.3954449892044067,
"learning_rate": 9.056053184939176e-06,
"loss": 0.4998,
"step": 4200
},
{
"epoch": 4.3e-05,
"grad_norm": 1.7888164520263672,
"learning_rate": 9.081649283234784e-06,
"loss": 0.4961,
"step": 4300
},
{
"epoch": 4.4e-05,
"grad_norm": 1.7587261199951172,
"learning_rate": 9.106655828605087e-06,
"loss": 0.4875,
"step": 4400
},
{
"epoch": 4.5e-05,
"grad_norm": 1.9113582372665405,
"learning_rate": 9.13109936897355e-06,
"loss": 0.4794,
"step": 4500
},
{
"epoch": 4.6e-05,
"grad_norm": 1.6648356914520264,
"learning_rate": 9.155004698474792e-06,
"loss": 0.4697,
"step": 4600
},
{
"epoch": 4.7e-05,
"grad_norm": 1.5259454250335693,
"learning_rate": 9.17839500860873e-06,
"loss": 0.4622,
"step": 4700
},
{
"epoch": 4.8e-05,
"grad_norm": 1.8361080884933472,
"learning_rate": 9.201292023453135e-06,
"loss": 0.453,
"step": 4800
},
{
"epoch": 4.9e-05,
"grad_norm": 1.6309137344360352,
"learning_rate": 9.22371612091062e-06,
"loss": 0.4429,
"step": 4900
},
{
"epoch": 5e-05,
"grad_norm": 1.7207796573638916,
"learning_rate": 9.245686441685918e-06,
"loss": 0.4382,
"step": 5000
},
{
"epoch": 5.1e-05,
"grad_norm": 1.552103042602539,
"learning_rate": 9.267220987454044e-06,
"loss": 0.4315,
"step": 5100
},
{
"epoch": 5.2e-05,
"grad_norm": 1.6008425951004028,
"learning_rate": 9.28833670948078e-06,
"loss": 0.4244,
"step": 5200
},
{
"epoch": 5.3e-05,
"grad_norm": 1.8220570087432861,
"learning_rate": 9.309049588788657e-06,
"loss": 0.4162,
"step": 5300
},
{
"epoch": 5.4e-05,
"grad_norm": 1.5230612754821777,
"learning_rate": 9.329374708818158e-06,
"loss": 0.4112,
"step": 5400
},
{
"epoch": 5.5e-05,
"grad_norm": 1.7809470891952515,
"learning_rate": 9.349326321411793e-06,
"loss": 0.4052,
"step": 5500
},
{
"epoch": 5.6e-05,
"grad_norm": 1.5959115028381348,
"learning_rate": 9.368917906844062e-06,
"loss": 0.4009,
"step": 5600
},
{
"epoch": 5.7e-05,
"grad_norm": 1.563692331314087,
"learning_rate": 9.388162228530614e-06,
"loss": 0.394,
"step": 5700
},
{
"epoch": 5.8e-05,
"grad_norm": 1.4869149923324585,
"learning_rate": 9.407071382972726e-06,
"loss": 0.3879,
"step": 5800
},
{
"epoch": 5.9e-05,
"grad_norm": 1.5701963901519775,
"learning_rate": 9.425656845426483e-06,
"loss": 0.3784,
"step": 5900
},
{
"epoch": 6e-05,
"grad_norm": 1.496894359588623,
"learning_rate": 9.443929511728523e-06,
"loss": 0.3746,
"step": 6000
},
{
"epoch": 6.1e-05,
"grad_norm": 1.4307634830474854,
"learning_rate": 9.461899736660011e-06,
"loss": 0.372,
"step": 6100
},
{
"epoch": 6.2e-05,
"grad_norm": 1.4771479368209839,
"learning_rate": 9.479577369187091e-06,
"loss": 0.3661,
"step": 6200
},
{
"epoch": 6.3e-05,
"grad_norm": 1.2904491424560547,
"learning_rate": 9.496971784878123e-06,
"loss": 0.3625,
"step": 6300
},
{
"epoch": 6.4e-05,
"grad_norm": 1.5488417148590088,
"learning_rate": 9.514091915764837e-06,
"loss": 0.3547,
"step": 6400
},
{
"epoch": 6.5e-05,
"grad_norm": 1.4266217947006226,
"learning_rate": 9.530946277885485e-06,
"loss": 0.3491,
"step": 6500
},
{
"epoch": 6.6e-05,
"grad_norm": 1.5423930883407593,
"learning_rate": 9.547542996722649e-06,
"loss": 0.3442,
"step": 6600
},
{
"epoch": 6.7e-05,
"grad_norm": 1.3324171304702759,
"learning_rate": 9.563889830725893e-06,
"loss": 0.3427,
"step": 6700
},
{
"epoch": 6.8e-05,
"grad_norm": 1.3407986164093018,
"learning_rate": 9.57999419308974e-06,
"loss": 0.3376,
"step": 6800
},
{
"epoch": 6.9e-05,
"grad_norm": 1.303074598312378,
"learning_rate": 9.595863171939976e-06,
"loss": 0.3346,
"step": 6900
},
{
"epoch": 7e-05,
"grad_norm": 1.3955286741256714,
"learning_rate": 9.611192939364202e-06,
"loss": 0.3283,
"step": 7000
},
{
"epoch": 7.1e-05,
"grad_norm": 1.3460999727249146,
"learning_rate": 9.626462440880078e-06,
"loss": 0.3269,
"step": 7100
},
{
"epoch": 7.2e-05,
"grad_norm": 1.5643832683563232,
"learning_rate": 9.641671209028838e-06,
"loss": 0.3235,
"step": 7200
},
{
"epoch": 7.3e-05,
"grad_norm": 1.503298044204712,
"learning_rate": 9.65666987557147e-06,
"loss": 0.3184,
"step": 7300
},
{
"epoch": 7.4e-05,
"grad_norm": 1.4040926694869995,
"learning_rate": 9.671464166396914e-06,
"loss": 0.3173,
"step": 7400
},
{
"epoch": 7.5e-05,
"grad_norm": 1.5793565511703491,
"learning_rate": 9.686059576466255e-06,
"loss": 0.3118,
"step": 7500
},
{
"epoch": 7.6e-05,
"grad_norm": 1.2530806064605713,
"learning_rate": 9.700461382066083e-06,
"loss": 0.3073,
"step": 7600
},
{
"epoch": 7.7e-05,
"grad_norm": 1.6009125709533691,
"learning_rate": 9.714674652259765e-06,
"loss": 0.3058,
"step": 7700
},
{
"epoch": 7.8e-05,
"grad_norm": 1.52604341506958,
"learning_rate": 9.7287042595988e-06,
"loss": 0.299,
"step": 7800
},
{
"epoch": 7.9e-05,
"grad_norm": 1.512654185295105,
"learning_rate": 9.742554890150908e-06,
"loss": 0.2997,
"step": 7900
},
{
"epoch": 8e-05,
"grad_norm": 1.3372293710708618,
"learning_rate": 9.75623105289651e-06,
"loss": 0.2959,
"step": 8000
},
{
"epoch": 8.1e-05,
"grad_norm": 1.3194124698638916,
"learning_rate": 9.769737088540707e-06,
"loss": 0.2915,
"step": 8100
},
{
"epoch": 8.2e-05,
"grad_norm": 1.3267931938171387,
"learning_rate": 9.783077177783901e-06,
"loss": 0.2883,
"step": 8200
},
{
"epoch": 8.3e-05,
"grad_norm": 1.4453672170639038,
"learning_rate": 9.796255349090433e-06,
"loss": 0.2857,
"step": 8300
},
{
"epoch": 8.4e-05,
"grad_norm": 1.2656625509262085,
"learning_rate": 9.809275485991406e-06,
"loss": 0.2824,
"step": 8400
},
{
"epoch": 8.5e-05,
"grad_norm": 1.347659707069397,
"learning_rate": 9.822141333954775e-06,
"loss": 0.2805,
"step": 8500
},
{
"epoch": 8.6e-05,
"grad_norm": 1.3958872556686401,
"learning_rate": 9.834856506853153e-06,
"loss": 0.2777,
"step": 8600
},
{
"epoch": 8.7e-05,
"grad_norm": 1.4277667999267578,
"learning_rate": 9.847424493057225e-06,
"loss": 0.2734,
"step": 8700
},
{
"epoch": 8.8e-05,
"grad_norm": 1.233550786972046,
"learning_rate": 9.85984866118054e-06,
"loss": 0.2727,
"step": 8800
},
{
"epoch": 8.9e-05,
"grad_norm": 1.499273657798767,
"learning_rate": 9.872132265499283e-06,
"loss": 0.2712,
"step": 8900
},
{
"epoch": 9e-05,
"grad_norm": 1.4485379457473755,
"learning_rate": 9.884278451068888e-06,
"loss": 0.2669,
"step": 9000
},
{
"epoch": 9.1e-05,
"grad_norm": 1.2728357315063477,
"learning_rate": 9.896051320131294e-06,
"loss": 0.2657,
"step": 9100
},
{
"epoch": 9.2e-05,
"grad_norm": 1.2725111246109009,
"learning_rate": 9.90793429093813e-06,
"loss": 0.2635,
"step": 9200
},
{
"epoch": 9.3e-05,
"grad_norm": 1.3907318115234375,
"learning_rate": 9.919688613870083e-06,
"loss": 0.2581,
"step": 9300
},
{
"epoch": 9.4e-05,
"grad_norm": 1.3836479187011719,
"learning_rate": 9.93131704466464e-06,
"loss": 0.2588,
"step": 9400
},
{
"epoch": 9.5e-05,
"grad_norm": 1.2773383855819702,
"learning_rate": 9.942822251451706e-06,
"loss": 0.2536,
"step": 9500
},
{
"epoch": 9.6e-05,
"grad_norm": 1.2910404205322266,
"learning_rate": 9.954206818428214e-06,
"loss": 0.2513,
"step": 9600
},
{
"epoch": 9.7e-05,
"grad_norm": 1.234729528427124,
"learning_rate": 9.96547324934206e-06,
"loss": 0.2476,
"step": 9700
},
{
"epoch": 9.8e-05,
"grad_norm": 1.1756150722503662,
"learning_rate": 9.976623970797134e-06,
"loss": 0.2471,
"step": 9800
},
{
"epoch": 9.9e-05,
"grad_norm": 1.261687159538269,
"learning_rate": 9.987661335390354e-06,
"loss": 0.2489,
"step": 9900
},
{
"epoch": 0.0001,
"grad_norm": 1.3017668724060059,
"learning_rate": 9.998587624690824e-06,
"loss": 0.2435,
"step": 10000
},
{
"epoch": 0.000101,
"grad_norm": 1.232535719871521,
"learning_rate": 1e-05,
"loss": 0.2425,
"step": 10100
},
{
"epoch": 0.000102,
"grad_norm": 1.306433081626892,
"learning_rate": 1e-05,
"loss": 0.2406,
"step": 10200
},
{
"epoch": 0.000103,
"grad_norm": 1.3659272193908691,
"learning_rate": 1e-05,
"loss": 0.2389,
"step": 10300
},
{
"epoch": 0.000104,
"grad_norm": 1.1521058082580566,
"learning_rate": 1e-05,
"loss": 0.2338,
"step": 10400
},
{
"epoch": 0.000105,
"grad_norm": 1.1397546529769897,
"learning_rate": 1e-05,
"loss": 0.2342,
"step": 10500
},
{
"epoch": 0.000106,
"grad_norm": 1.3130905628204346,
"learning_rate": 1e-05,
"loss": 0.2313,
"step": 10600
},
{
"epoch": 0.000107,
"grad_norm": 1.1320550441741943,
"learning_rate": 1e-05,
"loss": 0.2288,
"step": 10700
},
{
"epoch": 0.000108,
"grad_norm": 1.2157635688781738,
"learning_rate": 1e-05,
"loss": 0.2296,
"step": 10800
},
{
"epoch": 0.000109,
"grad_norm": 1.2038499116897583,
"learning_rate": 1e-05,
"loss": 0.2249,
"step": 10900
},
{
"epoch": 0.00011,
"grad_norm": 1.3213508129119873,
"learning_rate": 1e-05,
"loss": 0.2243,
"step": 11000
},
{
"epoch": 0.000111,
"grad_norm": 1.1428966522216797,
"learning_rate": 1e-05,
"loss": 0.2213,
"step": 11100
},
{
"epoch": 0.000112,
"grad_norm": 1.2259374856948853,
"learning_rate": 1e-05,
"loss": 0.2202,
"step": 11200
},
{
"epoch": 0.000113,
"grad_norm": 1.1567683219909668,
"learning_rate": 1e-05,
"loss": 0.2175,
"step": 11300
},
{
"epoch": 0.000114,
"grad_norm": 1.2655612230300903,
"learning_rate": 1e-05,
"loss": 0.216,
"step": 11400
},
{
"epoch": 0.000115,
"grad_norm": 1.1602586507797241,
"learning_rate": 1e-05,
"loss": 0.2146,
"step": 11500
},
{
"epoch": 0.000116,
"grad_norm": 1.1369308233261108,
"learning_rate": 1e-05,
"loss": 0.2126,
"step": 11600
},
{
"epoch": 0.000117,
"grad_norm": 1.1988592147827148,
"learning_rate": 1e-05,
"loss": 0.2121,
"step": 11700
},
{
"epoch": 0.000118,
"grad_norm": 1.087939977645874,
"learning_rate": 1e-05,
"loss": 0.2101,
"step": 11800
},
{
"epoch": 0.000119,
"grad_norm": 1.2805454730987549,
"learning_rate": 1e-05,
"loss": 0.2094,
"step": 11900
},
{
"epoch": 0.00012,
"grad_norm": 1.4006527662277222,
"learning_rate": 1e-05,
"loss": 0.2043,
"step": 12000
},
{
"epoch": 0.000121,
"grad_norm": 1.2651677131652832,
"learning_rate": 1e-05,
"loss": 0.205,
"step": 12100
},
{
"epoch": 0.000122,
"grad_norm": 1.3023113012313843,
"learning_rate": 1e-05,
"loss": 0.2066,
"step": 12200
},
{
"epoch": 0.000123,
"grad_norm": 1.0964651107788086,
"learning_rate": 1e-05,
"loss": 0.2019,
"step": 12300
},
{
"epoch": 0.000124,
"grad_norm": 1.1747757196426392,
"learning_rate": 1e-05,
"loss": 0.201,
"step": 12400
},
{
"epoch": 0.000125,
"grad_norm": 1.0360560417175293,
"learning_rate": 1e-05,
"loss": 0.1995,
"step": 12500
},
{
"epoch": 0.000126,
"grad_norm": 1.0915257930755615,
"learning_rate": 1e-05,
"loss": 0.1979,
"step": 12600
},
{
"epoch": 0.000127,
"grad_norm": 1.1433717012405396,
"learning_rate": 1e-05,
"loss": 0.2003,
"step": 12700
},
{
"epoch": 0.000128,
"grad_norm": 1.1049145460128784,
"learning_rate": 1e-05,
"loss": 0.1956,
"step": 12800
},
{
"epoch": 0.000129,
"grad_norm": 1.11701238155365,
"learning_rate": 1e-05,
"loss": 0.1951,
"step": 12900
},
{
"epoch": 0.00013,
"grad_norm": 1.1755869388580322,
"learning_rate": 1e-05,
"loss": 0.1936,
"step": 13000
},
{
"epoch": 0.000131,
"grad_norm": 1.0519227981567383,
"learning_rate": 1e-05,
"loss": 0.1914,
"step": 13100
},
{
"epoch": 0.000132,
"grad_norm": 1.1982672214508057,
"learning_rate": 1e-05,
"loss": 0.1895,
"step": 13200
},
{
"epoch": 0.000133,
"grad_norm": 1.135452389717102,
"learning_rate": 1e-05,
"loss": 0.1899,
"step": 13300
},
{
"epoch": 0.000134,
"grad_norm": 1.0130894184112549,
"learning_rate": 1e-05,
"loss": 0.1858,
"step": 13400
},
{
"epoch": 0.000135,
"grad_norm": 1.1471365690231323,
"learning_rate": 1e-05,
"loss": 0.1872,
"step": 13500
},
{
"epoch": 0.000136,
"grad_norm": 1.1107739210128784,
"learning_rate": 1e-05,
"loss": 0.1864,
"step": 13600
},
{
"epoch": 0.000137,
"grad_norm": 1.1473486423492432,
"learning_rate": 1e-05,
"loss": 0.1854,
"step": 13700
},
{
"epoch": 0.000138,
"grad_norm": 1.0697531700134277,
"learning_rate": 1e-05,
"loss": 0.1813,
"step": 13800
},
{
"epoch": 0.000139,
"grad_norm": 0.9683561325073242,
"learning_rate": 1e-05,
"loss": 0.1801,
"step": 13900
},
{
"epoch": 0.00014,
"grad_norm": 1.1696103811264038,
"learning_rate": 1e-05,
"loss": 0.1802,
"step": 14000
},
{
"epoch": 0.000141,
"grad_norm": 1.2879928350448608,
"learning_rate": 1e-05,
"loss": 0.1808,
"step": 14100
},
{
"epoch": 0.000142,
"grad_norm": 1.0318293571472168,
"learning_rate": 1e-05,
"loss": 0.1792,
"step": 14200
},
{
"epoch": 0.000143,
"grad_norm": 1.0072672367095947,
"learning_rate": 1e-05,
"loss": 0.1784,
"step": 14300
},
{
"epoch": 0.000144,
"grad_norm": 1.0204075574874878,
"learning_rate": 1e-05,
"loss": 0.1756,
"step": 14400
},
{
"epoch": 0.000145,
"grad_norm": 1.1072639226913452,
"learning_rate": 1e-05,
"loss": 0.174,
"step": 14500
},
{
"epoch": 0.000146,
"grad_norm": 1.1650497913360596,
"learning_rate": 1e-05,
"loss": 0.1699,
"step": 14600
},
{
"epoch": 0.000147,
"grad_norm": 1.1133906841278076,
"learning_rate": 1e-05,
"loss": 0.1712,
"step": 14700
},
{
"epoch": 0.000148,
"grad_norm": 1.2355847358703613,
"learning_rate": 1e-05,
"loss": 0.1712,
"step": 14800
},
{
"epoch": 0.000149,
"grad_norm": 1.0743693113327026,
"learning_rate": 1e-05,
"loss": 0.1701,
"step": 14900
},
{
"epoch": 0.00015,
"grad_norm": 1.1882842779159546,
"learning_rate": 1e-05,
"loss": 0.1703,
"step": 15000
},
{
"epoch": 0.000151,
"grad_norm": 1.0762616395950317,
"learning_rate": 1e-05,
"loss": 0.1692,
"step": 15100
},
{
"epoch": 0.000152,
"grad_norm": 1.0435552597045898,
"learning_rate": 1e-05,
"loss": 0.1675,
"step": 15200
},
{
"epoch": 0.000153,
"grad_norm": 1.0835367441177368,
"learning_rate": 1e-05,
"loss": 0.1668,
"step": 15300
},
{
"epoch": 0.000154,
"grad_norm": 1.0594781637191772,
"learning_rate": 1e-05,
"loss": 0.1638,
"step": 15400
},
{
"epoch": 0.000155,
"grad_norm": 1.0666881799697876,
"learning_rate": 1e-05,
"loss": 0.1636,
"step": 15500
},
{
"epoch": 0.000156,
"grad_norm": 0.9173826575279236,
"learning_rate": 1e-05,
"loss": 0.1632,
"step": 15600
},
{
"epoch": 0.000157,
"grad_norm": 1.1107499599456787,
"learning_rate": 1e-05,
"loss": 0.163,
"step": 15700
},
{
"epoch": 0.000158,
"grad_norm": 1.0352386236190796,
"learning_rate": 1e-05,
"loss": 0.1602,
"step": 15800
},
{
"epoch": 0.000159,
"grad_norm": 0.9977409839630127,
"learning_rate": 1e-05,
"loss": 0.1623,
"step": 15900
},
{
"epoch": 0.00016,
"grad_norm": 1.0943259000778198,
"learning_rate": 1e-05,
"loss": 0.1603,
"step": 16000
},
{
"epoch": 0.000161,
"grad_norm": 1.0809710025787354,
"learning_rate": 1e-05,
"loss": 0.1582,
"step": 16100
},
{
"epoch": 0.000162,
"grad_norm": 1.1283208131790161,
"learning_rate": 1e-05,
"loss": 0.1583,
"step": 16200
},
{
"epoch": 0.000163,
"grad_norm": 1.0325435400009155,
"learning_rate": 1e-05,
"loss": 0.158,
"step": 16300
},
{
"epoch": 0.000164,
"grad_norm": 1.0305627584457397,
"learning_rate": 1e-05,
"loss": 0.1573,
"step": 16400
},
{
"epoch": 0.000165,
"grad_norm": 1.0640127658843994,
"learning_rate": 1e-05,
"loss": 0.1551,
"step": 16500
},
{
"epoch": 0.000166,
"grad_norm": 0.9327529668807983,
"learning_rate": 1e-05,
"loss": 0.1562,
"step": 16600
},
{
"epoch": 0.000167,
"grad_norm": 1.0069410800933838,
"learning_rate": 1e-05,
"loss": 0.1533,
"step": 16700
},
{
"epoch": 0.000168,
"grad_norm": 1.040076494216919,
"learning_rate": 1e-05,
"loss": 0.1527,
"step": 16800
},
{
"epoch": 0.000169,
"grad_norm": 1.008461356163025,
"learning_rate": 1e-05,
"loss": 0.1525,
"step": 16900
},
{
"epoch": 0.00017,
"grad_norm": 1.0036898851394653,
"learning_rate": 1e-05,
"loss": 0.1517,
"step": 17000
},
{
"epoch": 0.000171,
"grad_norm": 0.9357483386993408,
"learning_rate": 1e-05,
"loss": 0.1511,
"step": 17100
},
{
"epoch": 0.000172,
"grad_norm": 1.0033488273620605,
"learning_rate": 1e-05,
"loss": 0.1468,
"step": 17200
},
{
"epoch": 0.000173,
"grad_norm": 1.0451477766036987,
"learning_rate": 1e-05,
"loss": 0.15,
"step": 17300
},
{
"epoch": 0.000174,
"grad_norm": 0.971612274646759,
"learning_rate": 1e-05,
"loss": 0.1476,
"step": 17400
},
{
"epoch": 0.000175,
"grad_norm": 1.079099416732788,
"learning_rate": 1e-05,
"loss": 0.1479,
"step": 17500
},
{
"epoch": 0.000176,
"grad_norm": 1.0661680698394775,
"learning_rate": 1e-05,
"loss": 0.1476,
"step": 17600
},
{
"epoch": 0.000177,
"grad_norm": 1.0154145956039429,
"learning_rate": 1e-05,
"loss": 0.1467,
"step": 17700
},
{
"epoch": 0.000178,
"grad_norm": 1.0474337339401245,
"learning_rate": 1e-05,
"loss": 0.1441,
"step": 17800
},
{
"epoch": 0.000179,
"grad_norm": 1.0646860599517822,
"learning_rate": 1e-05,
"loss": 0.1459,
"step": 17900
},
{
"epoch": 0.00018,
"grad_norm": 1.0854105949401855,
"learning_rate": 1e-05,
"loss": 0.1437,
"step": 18000
},
{
"epoch": 0.000181,
"grad_norm": 0.9846110939979553,
"learning_rate": 1e-05,
"loss": 0.1425,
"step": 18100
},
{
"epoch": 0.000182,
"grad_norm": 1.0286470651626587,
"learning_rate": 1e-05,
"loss": 0.1432,
"step": 18200
},
{
"epoch": 0.000183,
"grad_norm": 1.0388602018356323,
"learning_rate": 1e-05,
"loss": 0.1403,
"step": 18300
},
{
"epoch": 0.000184,
"grad_norm": 0.9657048583030701,
"learning_rate": 1e-05,
"loss": 0.1417,
"step": 18400
},
{
"epoch": 0.000185,
"grad_norm": 0.8501772880554199,
"learning_rate": 1e-05,
"loss": 0.1424,
"step": 18500
},
{
"epoch": 0.000186,
"grad_norm": 0.9153370261192322,
"learning_rate": 1e-05,
"loss": 0.1376,
"step": 18600
},
{
"epoch": 0.000187,
"grad_norm": 0.9047082662582397,
"learning_rate": 1e-05,
"loss": 0.1413,
"step": 18700
},
{
"epoch": 0.000188,
"grad_norm": 0.9566175937652588,
"learning_rate": 1e-05,
"loss": 0.1387,
"step": 18800
},
{
"epoch": 0.000189,
"grad_norm": 1.069942831993103,
"learning_rate": 1e-05,
"loss": 0.1355,
"step": 18900
},
{
"epoch": 0.00019,
"grad_norm": 1.019620656967163,
"learning_rate": 1e-05,
"loss": 0.1357,
"step": 19000
},
{
"epoch": 0.000191,
"grad_norm": 0.9842545390129089,
"learning_rate": 1e-05,
"loss": 0.1366,
"step": 19100
},
{
"epoch": 0.000192,
"grad_norm": 0.972135603427887,
"learning_rate": 1e-05,
"loss": 0.1357,
"step": 19200
},
{
"epoch": 0.000193,
"grad_norm": 0.9025226831436157,
"learning_rate": 1e-05,
"loss": 0.1347,
"step": 19300
},
{
"epoch": 0.000194,
"grad_norm": 0.9164988398551941,
"learning_rate": 1e-05,
"loss": 0.1338,
"step": 19400
},
{
"epoch": 0.000195,
"grad_norm": 0.8067638874053955,
"learning_rate": 1e-05,
"loss": 0.133,
"step": 19500
},
{
"epoch": 0.000196,
"grad_norm": 0.8477145433425903,
"learning_rate": 1e-05,
"loss": 0.1334,
"step": 19600
},
{
"epoch": 0.000197,
"grad_norm": 0.860883891582489,
"learning_rate": 1e-05,
"loss": 0.1327,
"step": 19700
},
{
"epoch": 0.000198,
"grad_norm": 0.9660979509353638,
"learning_rate": 1e-05,
"loss": 0.1332,
"step": 19800
},
{
"epoch": 0.000199,
"grad_norm": 0.8979732394218445,
"learning_rate": 1e-05,
"loss": 0.1317,
"step": 19900
},
{
"epoch": 0.0002,
"grad_norm": 0.8831902146339417,
"learning_rate": 1e-05,
"loss": 0.1313,
"step": 20000
},
{
"epoch": 0.0002,
"eval_loss": 0.0992431640625,
"eval_runtime": 152.8076,
"eval_samples_per_second": 327.209,
"eval_steps_per_second": 20.451,
"step": 20000
},
{
"epoch": 0.000201,
"grad_norm": 0.9081249833106995,
"learning_rate": 1e-05,
"loss": 0.1296,
"step": 20100
},
{
"epoch": 0.000202,
"grad_norm": 1.0295116901397705,
"learning_rate": 1e-05,
"loss": 0.1296,
"step": 20200
},
{
"epoch": 0.000203,
"grad_norm": 0.8534417152404785,
"learning_rate": 1e-05,
"loss": 0.1271,
"step": 20300
},
{
"epoch": 0.000204,
"grad_norm": 0.8878474235534668,
"learning_rate": 1e-05,
"loss": 0.1276,
"step": 20400
},
{
"epoch": 0.000205,
"grad_norm": 0.9492274522781372,
"learning_rate": 1e-05,
"loss": 0.1296,
"step": 20500
},
{
"epoch": 0.000206,
"grad_norm": 0.9542170166969299,
"learning_rate": 1e-05,
"loss": 0.1284,
"step": 20600
},
{
"epoch": 0.000207,
"grad_norm": 0.8887580633163452,
"learning_rate": 1e-05,
"loss": 0.1257,
"step": 20700
},
{
"epoch": 0.000208,
"grad_norm": 0.8237319588661194,
"learning_rate": 1e-05,
"loss": 0.1253,
"step": 20800
},
{
"epoch": 0.000209,
"grad_norm": 0.8409337401390076,
"learning_rate": 1e-05,
"loss": 0.1241,
"step": 20900
},
{
"epoch": 0.00021,
"grad_norm": 0.8566481471061707,
"learning_rate": 1e-05,
"loss": 0.1252,
"step": 21000
},
{
"epoch": 0.000211,
"grad_norm": 0.8407108783721924,
"learning_rate": 1e-05,
"loss": 0.1242,
"step": 21100
},
{
"epoch": 0.000212,
"grad_norm": 0.853947639465332,
"learning_rate": 1e-05,
"loss": 0.1246,
"step": 21200
},
{
"epoch": 0.000213,
"grad_norm": 0.8899252414703369,
"learning_rate": 1e-05,
"loss": 0.1237,
"step": 21300
},
{
"epoch": 0.000214,
"grad_norm": 0.8689791560173035,
"learning_rate": 1e-05,
"loss": 0.1225,
"step": 21400
},
{
"epoch": 0.000215,
"grad_norm": 0.9782620668411255,
"learning_rate": 1e-05,
"loss": 0.1229,
"step": 21500
},
{
"epoch": 0.000216,
"grad_norm": 0.9015646576881409,
"learning_rate": 1e-05,
"loss": 0.1241,
"step": 21600
},
{
"epoch": 0.000217,
"grad_norm": 0.9284467697143555,
"learning_rate": 1e-05,
"loss": 0.1216,
"step": 21700
},
{
"epoch": 0.000218,
"grad_norm": 0.8393162488937378,
"learning_rate": 1e-05,
"loss": 0.1219,
"step": 21800
},
{
"epoch": 0.000219,
"grad_norm": 0.9249029159545898,
"learning_rate": 1e-05,
"loss": 0.1222,
"step": 21900
},
{
"epoch": 0.00022,
"grad_norm": 0.931483805179596,
"learning_rate": 1e-05,
"loss": 0.1208,
"step": 22000
},
{
"epoch": 0.000221,
"grad_norm": 0.9092661142349243,
"learning_rate": 1e-05,
"loss": 0.1214,
"step": 22100
},
{
"epoch": 0.000222,
"grad_norm": 0.9886374473571777,
"learning_rate": 1e-05,
"loss": 0.1189,
"step": 22200
},
{
"epoch": 0.000223,
"grad_norm": 0.8833937644958496,
"learning_rate": 1e-05,
"loss": 0.1175,
"step": 22300
},
{
"epoch": 0.000224,
"grad_norm": 0.9673048257827759,
"learning_rate": 1e-05,
"loss": 0.1168,
"step": 22400
},
{
"epoch": 0.000225,
"grad_norm": 0.872240424156189,
"learning_rate": 1e-05,
"loss": 0.1177,
"step": 22500
},
{
"epoch": 0.000226,
"grad_norm": 0.849644660949707,
"learning_rate": 1e-05,
"loss": 0.1177,
"step": 22600
},
{
"epoch": 0.000227,
"grad_norm": 0.9396729469299316,
"learning_rate": 1e-05,
"loss": 0.1174,
"step": 22700
},
{
"epoch": 0.000228,
"grad_norm": 0.9100921750068665,
"learning_rate": 1e-05,
"loss": 0.1161,
"step": 22800
},
{
"epoch": 0.000229,
"grad_norm": 0.8232945203781128,
"learning_rate": 1e-05,
"loss": 0.1149,
"step": 22900
},
{
"epoch": 0.00023,
"grad_norm": 0.8654581904411316,
"learning_rate": 1e-05,
"loss": 0.116,
"step": 23000
},
{
"epoch": 0.000231,
"grad_norm": 0.8864552974700928,
"learning_rate": 1e-05,
"loss": 0.1161,
"step": 23100
},
{
"epoch": 0.000232,
"grad_norm": 0.9292982816696167,
"learning_rate": 1e-05,
"loss": 0.1126,
"step": 23200
},
{
"epoch": 0.000233,
"grad_norm": 0.8095874786376953,
"learning_rate": 1e-05,
"loss": 0.1141,
"step": 23300
},
{
"epoch": 0.000234,
"grad_norm": 1.1662276983261108,
"learning_rate": 1e-05,
"loss": 0.113,
"step": 23400
},
{
"epoch": 0.000235,
"grad_norm": 0.8531011343002319,
"learning_rate": 1e-05,
"loss": 0.1147,
"step": 23500
},
{
"epoch": 0.000236,
"grad_norm": 0.895802915096283,
"learning_rate": 1e-05,
"loss": 0.114,
"step": 23600
},
{
"epoch": 0.000237,
"grad_norm": 0.8489896655082703,
"learning_rate": 1e-05,
"loss": 0.1142,
"step": 23700
},
{
"epoch": 0.000238,
"grad_norm": 0.8372708559036255,
"learning_rate": 1e-05,
"loss": 0.1123,
"step": 23800
},
{
"epoch": 0.000239,
"grad_norm": 0.8919999003410339,
"learning_rate": 1e-05,
"loss": 0.1134,
"step": 23900
},
{
"epoch": 0.00024,
"grad_norm": 0.8561524152755737,
"learning_rate": 1e-05,
"loss": 0.112,
"step": 24000
},
{
"epoch": 0.000241,
"grad_norm": 0.8549727201461792,
"learning_rate": 1e-05,
"loss": 0.1123,
"step": 24100
},
{
"epoch": 0.000242,
"grad_norm": 0.8339006900787354,
"learning_rate": 1e-05,
"loss": 0.1116,
"step": 24200
},
{
"epoch": 0.000243,
"grad_norm": 0.8727480173110962,
"learning_rate": 1e-05,
"loss": 0.1113,
"step": 24300
},
{
"epoch": 0.000244,
"grad_norm": 0.881377637386322,
"learning_rate": 1e-05,
"loss": 0.1098,
"step": 24400
},
{
"epoch": 0.000245,
"grad_norm": 0.8690173029899597,
"learning_rate": 1e-05,
"loss": 0.1109,
"step": 24500
},
{
"epoch": 0.000246,
"grad_norm": 0.833027720451355,
"learning_rate": 1e-05,
"loss": 0.1094,
"step": 24600
},
{
"epoch": 0.000247,
"grad_norm": 0.8230149745941162,
"learning_rate": 1e-05,
"loss": 0.1094,
"step": 24700
},
{
"epoch": 0.000248,
"grad_norm": 0.8857430219650269,
"learning_rate": 1e-05,
"loss": 0.1077,
"step": 24800
},
{
"epoch": 0.000249,
"grad_norm": 0.9106509685516357,
"learning_rate": 1e-05,
"loss": 0.1081,
"step": 24900
},
{
"epoch": 0.00025,
"grad_norm": 0.9534709453582764,
"learning_rate": 1e-05,
"loss": 0.1084,
"step": 25000
},
{
"epoch": 0.000251,
"grad_norm": 0.8446188569068909,
"learning_rate": 1e-05,
"loss": 0.1069,
"step": 25100
},
{
"epoch": 0.000252,
"grad_norm": 0.8347111344337463,
"learning_rate": 1e-05,
"loss": 0.1077,
"step": 25200
},
{
"epoch": 0.000253,
"grad_norm": 0.8703511357307434,
"learning_rate": 1e-05,
"loss": 0.1069,
"step": 25300
},
{
"epoch": 0.000254,
"grad_norm": 0.8182582259178162,
"learning_rate": 1e-05,
"loss": 0.1058,
"step": 25400
},
{
"epoch": 0.000255,
"grad_norm": 0.8704941868782043,
"learning_rate": 1e-05,
"loss": 0.1063,
"step": 25500
},
{
"epoch": 0.000256,
"grad_norm": 0.8137685656547546,
"learning_rate": 1e-05,
"loss": 0.1041,
"step": 25600
},
{
"epoch": 0.000257,
"grad_norm": 0.7531348466873169,
"learning_rate": 1e-05,
"loss": 0.106,
"step": 25700
},
{
"epoch": 0.000258,
"grad_norm": 0.886814534664154,
"learning_rate": 1e-05,
"loss": 0.1051,
"step": 25800
},
{
"epoch": 0.000259,
"grad_norm": 0.8390068411827087,
"learning_rate": 1e-05,
"loss": 0.105,
"step": 25900
},
{
"epoch": 0.00026,
"grad_norm": 0.7962291836738586,
"learning_rate": 1e-05,
"loss": 0.1046,
"step": 26000
},
{
"epoch": 0.000261,
"grad_norm": 0.9102724194526672,
"learning_rate": 1e-05,
"loss": 0.1044,
"step": 26100
},
{
"epoch": 0.000262,
"grad_norm": 0.8715778589248657,
"learning_rate": 1e-05,
"loss": 0.1031,
"step": 26200
},
{
"epoch": 0.000263,
"grad_norm": 0.8876039385795593,
"learning_rate": 1e-05,
"loss": 0.103,
"step": 26300
},
{
"epoch": 0.000264,
"grad_norm": 0.7934551239013672,
"learning_rate": 1e-05,
"loss": 0.1017,
"step": 26400
},
{
"epoch": 0.000265,
"grad_norm": 0.9847850799560547,
"learning_rate": 1e-05,
"loss": 0.1032,
"step": 26500
},
{
"epoch": 0.000266,
"grad_norm": 0.8920612335205078,
"learning_rate": 1e-05,
"loss": 0.1032,
"step": 26600
},
{
"epoch": 0.000267,
"grad_norm": 0.9092204570770264,
"learning_rate": 1e-05,
"loss": 0.102,
"step": 26700
},
{
"epoch": 0.000268,
"grad_norm": 0.7922365069389343,
"learning_rate": 1e-05,
"loss": 0.1024,
"step": 26800
},
{
"epoch": 0.000269,
"grad_norm": 0.8614472150802612,
"learning_rate": 1e-05,
"loss": 0.1022,
"step": 26900
},
{
"epoch": 0.00027,
"grad_norm": 0.7870116829872131,
"learning_rate": 1e-05,
"loss": 0.1004,
"step": 27000
},
{
"epoch": 0.000271,
"grad_norm": 0.6980022192001343,
"learning_rate": 1e-05,
"loss": 0.1006,
"step": 27100
},
{
"epoch": 0.000272,
"grad_norm": 0.7720369100570679,
"learning_rate": 1e-05,
"loss": 0.1012,
"step": 27200
},
{
"epoch": 0.000273,
"grad_norm": 0.8154132962226868,
"learning_rate": 1e-05,
"loss": 0.1005,
"step": 27300
},
{
"epoch": 0.000274,
"grad_norm": 0.8288457989692688,
"learning_rate": 1e-05,
"loss": 0.0985,
"step": 27400
},
{
"epoch": 0.000275,
"grad_norm": 0.8117573261260986,
"learning_rate": 1e-05,
"loss": 0.1008,
"step": 27500
},
{
"epoch": 0.000276,
"grad_norm": 0.7800782918930054,
"learning_rate": 1e-05,
"loss": 0.0988,
"step": 27600
},
{
"epoch": 0.000277,
"grad_norm": 0.9139901399612427,
"learning_rate": 1e-05,
"loss": 0.0994,
"step": 27700
},
{
"epoch": 0.000278,
"grad_norm": 0.745152473449707,
"learning_rate": 1e-05,
"loss": 0.0982,
"step": 27800
},
{
"epoch": 0.000279,
"grad_norm": 0.7476614117622375,
"learning_rate": 1e-05,
"loss": 0.0965,
"step": 27900
},
{
"epoch": 0.00028,
"grad_norm": 0.7490776777267456,
"learning_rate": 1e-05,
"loss": 0.0972,
"step": 28000
},
{
"epoch": 0.000281,
"grad_norm": 0.7730040550231934,
"learning_rate": 1e-05,
"loss": 0.0976,
"step": 28100
},
{
"epoch": 0.000282,
"grad_norm": 0.7657092213630676,
"learning_rate": 1e-05,
"loss": 0.0982,
"step": 28200
},
{
"epoch": 0.000283,
"grad_norm": 0.9147765040397644,
"learning_rate": 1e-05,
"loss": 0.0978,
"step": 28300
},
{
"epoch": 0.000284,
"grad_norm": 0.7426789999008179,
"learning_rate": 1e-05,
"loss": 0.0968,
"step": 28400
},
{
"epoch": 0.000285,
"grad_norm": 0.8652293086051941,
"learning_rate": 1e-05,
"loss": 0.0981,
"step": 28500
},
{
"epoch": 0.000286,
"grad_norm": 0.6864128112792969,
"learning_rate": 1e-05,
"loss": 0.0963,
"step": 28600
},
{
"epoch": 0.000287,
"grad_norm": 0.7807822227478027,
"learning_rate": 1e-05,
"loss": 0.0962,
"step": 28700
},
{
"epoch": 0.000288,
"grad_norm": 0.8013282418251038,
"learning_rate": 1e-05,
"loss": 0.0964,
"step": 28800
},
{
"epoch": 0.000289,
"grad_norm": 0.7287372350692749,
"learning_rate": 1e-05,
"loss": 0.0966,
"step": 28900
},
{
"epoch": 0.00029,
"grad_norm": 0.7577667832374573,
"learning_rate": 1e-05,
"loss": 0.0958,
"step": 29000
},
{
"epoch": 0.000291,
"grad_norm": 0.7510080933570862,
"learning_rate": 1e-05,
"loss": 0.0947,
"step": 29100
},
{
"epoch": 0.000292,
"grad_norm": 0.8355770707130432,
"learning_rate": 1e-05,
"loss": 0.0946,
"step": 29200
},
{
"epoch": 0.000293,
"grad_norm": 0.8899005651473999,
"learning_rate": 1e-05,
"loss": 0.0948,
"step": 29300
},
{
"epoch": 0.000294,
"grad_norm": 0.8526831865310669,
"learning_rate": 1e-05,
"loss": 0.0947,
"step": 29400
},
{
"epoch": 0.000295,
"grad_norm": 0.740943968296051,
"learning_rate": 1e-05,
"loss": 0.0928,
"step": 29500
},
{
"epoch": 0.000296,
"grad_norm": 0.8096754550933838,
"learning_rate": 1e-05,
"loss": 0.0948,
"step": 29600
},
{
"epoch": 0.000297,
"grad_norm": 0.8890173435211182,
"learning_rate": 1e-05,
"loss": 0.0934,
"step": 29700
},
{
"epoch": 0.000298,
"grad_norm": 0.8200284838676453,
"learning_rate": 1e-05,
"loss": 0.0931,
"step": 29800
},
{
"epoch": 0.000299,
"grad_norm": 0.70655757188797,
"learning_rate": 1e-05,
"loss": 0.0946,
"step": 29900
},
{
"epoch": 0.0003,
"grad_norm": 0.7843393087387085,
"learning_rate": 1e-05,
"loss": 0.0924,
"step": 30000
},
{
"epoch": 0.000301,
"grad_norm": 0.6674346923828125,
"learning_rate": 1e-05,
"loss": 0.0925,
"step": 30100
},
{
"epoch": 0.000302,
"grad_norm": 0.7955383062362671,
"learning_rate": 1e-05,
"loss": 0.0927,
"step": 30200
},
{
"epoch": 0.000303,
"grad_norm": 0.7410333752632141,
"learning_rate": 1e-05,
"loss": 0.0923,
"step": 30300
},
{
"epoch": 0.000304,
"grad_norm": 0.716390073299408,
"learning_rate": 1e-05,
"loss": 0.0924,
"step": 30400
},
{
"epoch": 0.000305,
"grad_norm": 0.7392554879188538,
"learning_rate": 1e-05,
"loss": 0.0921,
"step": 30500
},
{
"epoch": 0.000306,
"grad_norm": 0.9256471991539001,
"learning_rate": 1e-05,
"loss": 0.091,
"step": 30600
},
{
"epoch": 0.000307,
"grad_norm": 0.7692530751228333,
"learning_rate": 1e-05,
"loss": 0.0928,
"step": 30700
},
{
"epoch": 0.000308,
"grad_norm": 0.7785292863845825,
"learning_rate": 1e-05,
"loss": 0.0906,
"step": 30800
},
{
"epoch": 0.000309,
"grad_norm": 0.8413007259368896,
"learning_rate": 1e-05,
"loss": 0.09,
"step": 30900
},
{
"epoch": 0.00031,
"grad_norm": 0.9082907438278198,
"learning_rate": 1e-05,
"loss": 0.0896,
"step": 31000
},
{
"epoch": 0.000311,
"grad_norm": 0.7937412261962891,
"learning_rate": 1e-05,
"loss": 0.0892,
"step": 31100
},
{
"epoch": 0.000312,
"grad_norm": 0.7778225541114807,
"learning_rate": 1e-05,
"loss": 0.088,
"step": 31200
},
{
"epoch": 0.000313,
"grad_norm": 0.7651337385177612,
"learning_rate": 1e-05,
"loss": 0.0897,
"step": 31300
},
{
"epoch": 0.000314,
"grad_norm": 0.7604988217353821,
"learning_rate": 1e-05,
"loss": 0.0901,
"step": 31400
},
{
"epoch": 0.000315,
"grad_norm": 0.779761016368866,
"learning_rate": 1e-05,
"loss": 0.0903,
"step": 31500
},
{
"epoch": 0.000316,
"grad_norm": 0.7517678737640381,
"learning_rate": 1e-05,
"loss": 0.0885,
"step": 31600
},
{
"epoch": 0.000317,
"grad_norm": 0.8016210794448853,
"learning_rate": 1e-05,
"loss": 0.0893,
"step": 31700
},
{
"epoch": 0.000318,
"grad_norm": 0.678521990776062,
"learning_rate": 1e-05,
"loss": 0.0886,
"step": 31800
},
{
"epoch": 0.000319,
"grad_norm": 0.7407852411270142,
"learning_rate": 1e-05,
"loss": 0.0899,
"step": 31900
},
{
"epoch": 0.00032,
"grad_norm": 0.8720430135726929,
"learning_rate": 1e-05,
"loss": 0.0876,
"step": 32000
},
{
"epoch": 0.000321,
"grad_norm": 0.7622641324996948,
"learning_rate": 1e-05,
"loss": 0.0881,
"step": 32100
},
{
"epoch": 0.000322,
"grad_norm": 0.6715940237045288,
"learning_rate": 1e-05,
"loss": 0.0867,
"step": 32200
},
{
"epoch": 0.000323,
"grad_norm": 0.8118298053741455,
"learning_rate": 1e-05,
"loss": 0.0887,
"step": 32300
},
{
"epoch": 0.000324,
"grad_norm": 0.7427231073379517,
"learning_rate": 1e-05,
"loss": 0.0878,
"step": 32400
},
{
"epoch": 0.000325,
"grad_norm": 0.7627066969871521,
"learning_rate": 1e-05,
"loss": 0.0879,
"step": 32500
},
{
"epoch": 0.000326,
"grad_norm": 0.7354280948638916,
"learning_rate": 1e-05,
"loss": 0.088,
"step": 32600
},
{
"epoch": 0.000327,
"grad_norm": 0.6953477263450623,
"learning_rate": 1e-05,
"loss": 0.0867,
"step": 32700
},
{
"epoch": 0.000328,
"grad_norm": 0.7861385345458984,
"learning_rate": 1e-05,
"loss": 0.0858,
"step": 32800
},
{
"epoch": 0.000329,
"grad_norm": 0.7112125158309937,
"learning_rate": 1e-05,
"loss": 0.0859,
"step": 32900
},
{
"epoch": 0.00033,
"grad_norm": 0.7531374096870422,
"learning_rate": 1e-05,
"loss": 0.0862,
"step": 33000
},
{
"epoch": 0.000331,
"grad_norm": 0.7147675156593323,
"learning_rate": 1e-05,
"loss": 0.0851,
"step": 33100
},
{
"epoch": 0.000332,
"grad_norm": 0.8516043424606323,
"learning_rate": 1e-05,
"loss": 0.0858,
"step": 33200
},
{
"epoch": 0.000333,
"grad_norm": 0.7007201313972473,
"learning_rate": 1e-05,
"loss": 0.0856,
"step": 33300
},
{
"epoch": 0.000334,
"grad_norm": 0.7700639963150024,
"learning_rate": 1e-05,
"loss": 0.085,
"step": 33400
},
{
"epoch": 0.000335,
"grad_norm": 0.7579879760742188,
"learning_rate": 1e-05,
"loss": 0.0844,
"step": 33500
},
{
"epoch": 0.000336,
"grad_norm": 0.7982689738273621,
"learning_rate": 1e-05,
"loss": 0.0849,
"step": 33600
},
{
"epoch": 0.000337,
"grad_norm": 0.69140625,
"learning_rate": 1e-05,
"loss": 0.0864,
"step": 33700
},
{
"epoch": 0.000338,
"grad_norm": 0.723205029964447,
"learning_rate": 1e-05,
"loss": 0.0858,
"step": 33800
},
{
"epoch": 0.000339,
"grad_norm": 0.7827596664428711,
"learning_rate": 1e-05,
"loss": 0.084,
"step": 33900
},
{
"epoch": 0.00034,
"grad_norm": 0.8219903111457825,
"learning_rate": 1e-05,
"loss": 0.0852,
"step": 34000
},
{
"epoch": 0.000341,
"grad_norm": 0.8129620552062988,
"learning_rate": 1e-05,
"loss": 0.0848,
"step": 34100
},
{
"epoch": 0.000342,
"grad_norm": 0.6510952115058899,
"learning_rate": 1e-05,
"loss": 0.0827,
"step": 34200
},
{
"epoch": 0.000343,
"grad_norm": 0.7110053896903992,
"learning_rate": 1e-05,
"loss": 0.0836,
"step": 34300
},
{
"epoch": 0.000344,
"grad_norm": 0.7686619162559509,
"learning_rate": 1e-05,
"loss": 0.0835,
"step": 34400
},
{
"epoch": 0.000345,
"grad_norm": 0.829767107963562,
"learning_rate": 1e-05,
"loss": 0.0827,
"step": 34500
},
{
"epoch": 0.000346,
"grad_norm": 0.7650629281997681,
"learning_rate": 1e-05,
"loss": 0.0826,
"step": 34600
},
{
"epoch": 0.000347,
"grad_norm": 0.6766960024833679,
"learning_rate": 1e-05,
"loss": 0.0831,
"step": 34700
},
{
"epoch": 0.000348,
"grad_norm": 0.7824012637138367,
"learning_rate": 1e-05,
"loss": 0.0831,
"step": 34800
},
{
"epoch": 0.000349,
"grad_norm": 0.697309136390686,
"learning_rate": 1e-05,
"loss": 0.0826,
"step": 34900
},
{
"epoch": 0.00035,
"grad_norm": 0.6359274387359619,
"learning_rate": 1e-05,
"loss": 0.0821,
"step": 35000
},
{
"epoch": 0.000351,
"grad_norm": 0.7838051915168762,
"learning_rate": 1e-05,
"loss": 0.0828,
"step": 35100
},
{
"epoch": 0.000352,
"grad_norm": 0.8149462938308716,
"learning_rate": 1e-05,
"loss": 0.0819,
"step": 35200
},
{
"epoch": 0.000353,
"grad_norm": 0.7315548062324524,
"learning_rate": 1e-05,
"loss": 0.081,
"step": 35300
},
{
"epoch": 0.000354,
"grad_norm": 0.6927749514579773,
"learning_rate": 1e-05,
"loss": 0.0802,
"step": 35400
},
{
"epoch": 0.000355,
"grad_norm": 0.7449594736099243,
"learning_rate": 1e-05,
"loss": 0.0822,
"step": 35500
},
{
"epoch": 0.000356,
"grad_norm": 0.6572420597076416,
"learning_rate": 1e-05,
"loss": 0.0809,
"step": 35600
},
{
"epoch": 0.000357,
"grad_norm": 0.7096725702285767,
"learning_rate": 1e-05,
"loss": 0.0805,
"step": 35700
},
{
"epoch": 0.000358,
"grad_norm": 0.8065080046653748,
"learning_rate": 1e-05,
"loss": 0.08,
"step": 35800
},
{
"epoch": 0.000359,
"grad_norm": 0.5750519633293152,
"learning_rate": 1e-05,
"loss": 0.0796,
"step": 35900
},
{
"epoch": 0.00036,
"grad_norm": 0.7987583875656128,
"learning_rate": 1e-05,
"loss": 0.0795,
"step": 36000
},
{
"epoch": 0.000361,
"grad_norm": 0.7741938233375549,
"learning_rate": 1e-05,
"loss": 0.0795,
"step": 36100
},
{
"epoch": 0.000362,
"grad_norm": 0.7459242343902588,
"learning_rate": 1e-05,
"loss": 0.0804,
"step": 36200
},
{
"epoch": 0.000363,
"grad_norm": 0.6847333312034607,
"learning_rate": 1e-05,
"loss": 0.0809,
"step": 36300
},
{
"epoch": 0.000364,
"grad_norm": 0.7405627369880676,
"learning_rate": 1e-05,
"loss": 0.0782,
"step": 36400
},
{
"epoch": 0.000365,
"grad_norm": 0.6119332909584045,
"learning_rate": 1e-05,
"loss": 0.0806,
"step": 36500
},
{
"epoch": 0.000366,
"grad_norm": 0.7295922636985779,
"learning_rate": 1e-05,
"loss": 0.0791,
"step": 36600
},
{
"epoch": 0.000367,
"grad_norm": 0.7362000346183777,
"learning_rate": 1e-05,
"loss": 0.0793,
"step": 36700
},
{
"epoch": 0.000368,
"grad_norm": 0.650321900844574,
"learning_rate": 1e-05,
"loss": 0.0787,
"step": 36800
},
{
"epoch": 0.000369,
"grad_norm": 0.6487528681755066,
"learning_rate": 1e-05,
"loss": 0.0788,
"step": 36900
},
{
"epoch": 0.00037,
"grad_norm": 0.6908884644508362,
"learning_rate": 1e-05,
"loss": 0.078,
"step": 37000
},
{
"epoch": 0.000371,
"grad_norm": 0.7823421359062195,
"learning_rate": 1e-05,
"loss": 0.0773,
"step": 37100
},
{
"epoch": 0.000372,
"grad_norm": 0.7242419719696045,
"learning_rate": 1e-05,
"loss": 0.0789,
"step": 37200
},
{
"epoch": 0.000373,
"grad_norm": 0.7191994786262512,
"learning_rate": 1e-05,
"loss": 0.0786,
"step": 37300
},
{
"epoch": 0.000374,
"grad_norm": 0.6352174282073975,
"learning_rate": 1e-05,
"loss": 0.0782,
"step": 37400
},
{
"epoch": 0.000375,
"grad_norm": 0.6456391215324402,
"learning_rate": 1e-05,
"loss": 0.0801,
"step": 37500
},
{
"epoch": 0.000376,
"grad_norm": 0.7176135182380676,
"learning_rate": 1e-05,
"loss": 0.0788,
"step": 37600
},
{
"epoch": 0.000377,
"grad_norm": 0.7592889666557312,
"learning_rate": 1e-05,
"loss": 0.0782,
"step": 37700
},
{
"epoch": 0.000378,
"grad_norm": 0.7405545115470886,
"learning_rate": 1e-05,
"loss": 0.0772,
"step": 37800
},
{
"epoch": 0.000379,
"grad_norm": 0.6966970562934875,
"learning_rate": 1e-05,
"loss": 0.0761,
"step": 37900
},
{
"epoch": 0.00038,
"grad_norm": 0.7346359491348267,
"learning_rate": 1e-05,
"loss": 0.0775,
"step": 38000
},
{
"epoch": 0.000381,
"grad_norm": 0.729246199131012,
"learning_rate": 1e-05,
"loss": 0.0767,
"step": 38100
},
{
"epoch": 0.000382,
"grad_norm": 0.8081512451171875,
"learning_rate": 1e-05,
"loss": 0.078,
"step": 38200
},
{
"epoch": 0.000383,
"grad_norm": 0.6851301193237305,
"learning_rate": 1e-05,
"loss": 0.0757,
"step": 38300
},
{
"epoch": 0.000384,
"grad_norm": 0.6699986457824707,
"learning_rate": 1e-05,
"loss": 0.0767,
"step": 38400
},
{
"epoch": 0.000385,
"grad_norm": 0.7026481032371521,
"learning_rate": 1e-05,
"loss": 0.0776,
"step": 38500
},
{
"epoch": 0.000386,
"grad_norm": 0.7267670035362244,
"learning_rate": 1e-05,
"loss": 0.0761,
"step": 38600
},
{
"epoch": 0.000387,
"grad_norm": 0.648714005947113,
"learning_rate": 1e-05,
"loss": 0.0749,
"step": 38700
},
{
"epoch": 0.000388,
"grad_norm": 0.7160006165504456,
"learning_rate": 1e-05,
"loss": 0.0756,
"step": 38800
},
{
"epoch": 0.000389,
"grad_norm": 0.7773024439811707,
"learning_rate": 1e-05,
"loss": 0.0759,
"step": 38900
},
{
"epoch": 0.00039,
"grad_norm": 0.7162371277809143,
"learning_rate": 1e-05,
"loss": 0.0749,
"step": 39000
},
{
"epoch": 0.000391,
"grad_norm": 0.7529783844947815,
"learning_rate": 1e-05,
"loss": 0.0746,
"step": 39100
},
{
"epoch": 0.000392,
"grad_norm": 0.866392195224762,
"learning_rate": 1e-05,
"loss": 0.0755,
"step": 39200
},
{
"epoch": 0.000393,
"grad_norm": 0.751728355884552,
"learning_rate": 1e-05,
"loss": 0.0752,
"step": 39300
},
{
"epoch": 0.000394,
"grad_norm": 0.6856648325920105,
"learning_rate": 1e-05,
"loss": 0.0753,
"step": 39400
},
{
"epoch": 0.000395,
"grad_norm": 0.683175265789032,
"learning_rate": 1e-05,
"loss": 0.0739,
"step": 39500
},
{
"epoch": 0.000396,
"grad_norm": 0.7458997368812561,
"learning_rate": 1e-05,
"loss": 0.0752,
"step": 39600
},
{
"epoch": 0.000397,
"grad_norm": 0.7095280885696411,
"learning_rate": 1e-05,
"loss": 0.0753,
"step": 39700
},
{
"epoch": 0.000398,
"grad_norm": 0.6352033019065857,
"learning_rate": 1e-05,
"loss": 0.0737,
"step": 39800
},
{
"epoch": 0.000399,
"grad_norm": 0.695184588432312,
"learning_rate": 1e-05,
"loss": 0.0738,
"step": 39900
},
{
"epoch": 0.0004,
"grad_norm": 0.6518137454986572,
"learning_rate": 1e-05,
"loss": 0.074,
"step": 40000
},
{
"epoch": 0.0004,
"eval_loss": 0.057464599609375,
"eval_runtime": 146.7084,
"eval_samples_per_second": 340.812,
"eval_steps_per_second": 21.301,
"step": 40000
},
{
"epoch": 0.000401,
"grad_norm": 0.7782549858093262,
"learning_rate": 1e-05,
"loss": 0.074,
"step": 40100
},
{
"epoch": 0.000402,
"grad_norm": 0.6919134855270386,
"learning_rate": 1e-05,
"loss": 0.0739,
"step": 40200
},
{
"epoch": 0.000403,
"grad_norm": 0.661824643611908,
"learning_rate": 1e-05,
"loss": 0.0744,
"step": 40300
},
{
"epoch": 0.000404,
"grad_norm": 0.6964775919914246,
"learning_rate": 1e-05,
"loss": 0.0732,
"step": 40400
},
{
"epoch": 0.000405,
"grad_norm": 0.860140860080719,
"learning_rate": 1e-05,
"loss": 0.0736,
"step": 40500
},
{
"epoch": 0.000406,
"grad_norm": 0.6227797865867615,
"learning_rate": 1e-05,
"loss": 0.0734,
"step": 40600
},
{
"epoch": 0.000407,
"grad_norm": 0.5687974095344543,
"learning_rate": 1e-05,
"loss": 0.0734,
"step": 40700
},
{
"epoch": 0.000408,
"grad_norm": 0.6930891871452332,
"learning_rate": 1e-05,
"loss": 0.074,
"step": 40800
},
{
"epoch": 0.000409,
"grad_norm": 0.6303442716598511,
"learning_rate": 1e-05,
"loss": 0.0728,
"step": 40900
},
{
"epoch": 0.00041,
"grad_norm": 0.6731743812561035,
"learning_rate": 1e-05,
"loss": 0.0742,
"step": 41000
},
{
"epoch": 0.000411,
"grad_norm": 0.6712822318077087,
"learning_rate": 1e-05,
"loss": 0.0737,
"step": 41100
},
{
"epoch": 0.000412,
"grad_norm": 0.6134166717529297,
"learning_rate": 1e-05,
"loss": 0.0728,
"step": 41200
},
{
"epoch": 0.000413,
"grad_norm": 0.6910662651062012,
"learning_rate": 1e-05,
"loss": 0.0726,
"step": 41300
},
{
"epoch": 0.000414,
"grad_norm": 0.6266744136810303,
"learning_rate": 1e-05,
"loss": 0.0719,
"step": 41400
},
{
"epoch": 0.000415,
"grad_norm": 0.600907027721405,
"learning_rate": 1e-05,
"loss": 0.0737,
"step": 41500
},
{
"epoch": 0.000416,
"grad_norm": 0.6139588356018066,
"learning_rate": 1e-05,
"loss": 0.0722,
"step": 41600
},
{
"epoch": 0.000417,
"grad_norm": 0.6445550918579102,
"learning_rate": 1e-05,
"loss": 0.0721,
"step": 41700
},
{
"epoch": 0.000418,
"grad_norm": 0.7176617980003357,
"learning_rate": 1e-05,
"loss": 0.0718,
"step": 41800
},
{
"epoch": 0.000419,
"grad_norm": 0.7564845085144043,
"learning_rate": 1e-05,
"loss": 0.0724,
"step": 41900
},
{
"epoch": 0.00042,
"grad_norm": 0.7683578133583069,
"learning_rate": 1e-05,
"loss": 0.0714,
"step": 42000
},
{
"epoch": 0.000421,
"grad_norm": 0.731192946434021,
"learning_rate": 1e-05,
"loss": 0.0707,
"step": 42100
},
{
"epoch": 0.000422,
"grad_norm": 0.6390314102172852,
"learning_rate": 1e-05,
"loss": 0.0706,
"step": 42200
},
{
"epoch": 0.000423,
"grad_norm": 0.6024550199508667,
"learning_rate": 1e-05,
"loss": 0.0714,
"step": 42300
},
{
"epoch": 0.000424,
"grad_norm": 0.6974002718925476,
"learning_rate": 1e-05,
"loss": 0.0712,
"step": 42400
},
{
"epoch": 0.000425,
"grad_norm": 0.6231324672698975,
"learning_rate": 1e-05,
"loss": 0.0719,
"step": 42500
},
{
"epoch": 0.000426,
"grad_norm": 0.6329951882362366,
"learning_rate": 1e-05,
"loss": 0.0708,
"step": 42600
},
{
"epoch": 0.000427,
"grad_norm": 0.7328572869300842,
"learning_rate": 1e-05,
"loss": 0.0713,
"step": 42700
},
{
"epoch": 0.000428,
"grad_norm": 0.6142467856407166,
"learning_rate": 1e-05,
"loss": 0.0721,
"step": 42800
},
{
"epoch": 0.000429,
"grad_norm": 0.7287197113037109,
"learning_rate": 1e-05,
"loss": 0.0706,
"step": 42900
},
{
"epoch": 0.00043,
"grad_norm": 0.6606420278549194,
"learning_rate": 1e-05,
"loss": 0.0699,
"step": 43000
},
{
"epoch": 0.000431,
"grad_norm": 0.7667610049247742,
"learning_rate": 1e-05,
"loss": 0.07,
"step": 43100
},
{
"epoch": 0.000432,
"grad_norm": 0.5734269618988037,
"learning_rate": 1e-05,
"loss": 0.0699,
"step": 43200
},
{
"epoch": 0.000433,
"grad_norm": 0.5326073169708252,
"learning_rate": 1e-05,
"loss": 0.07,
"step": 43300
},
{
"epoch": 0.000434,
"grad_norm": 0.7028875946998596,
"learning_rate": 1e-05,
"loss": 0.0696,
"step": 43400
},
{
"epoch": 0.000435,
"grad_norm": 0.6137057542800903,
"learning_rate": 1e-05,
"loss": 0.0691,
"step": 43500
},
{
"epoch": 0.000436,
"grad_norm": 0.5539369583129883,
"learning_rate": 1e-05,
"loss": 0.0688,
"step": 43600
},
{
"epoch": 0.000437,
"grad_norm": 0.7035527229309082,
"learning_rate": 1e-05,
"loss": 0.071,
"step": 43700
},
{
"epoch": 0.000438,
"grad_norm": 0.7055030465126038,
"learning_rate": 1e-05,
"loss": 0.0699,
"step": 43800
},
{
"epoch": 0.000439,
"grad_norm": 0.536948025226593,
"learning_rate": 1e-05,
"loss": 0.0697,
"step": 43900
},
{
"epoch": 0.00044,
"grad_norm": 0.6797453165054321,
"learning_rate": 1e-05,
"loss": 0.0677,
"step": 44000
},
{
"epoch": 0.000441,
"grad_norm": 0.6475409865379333,
"learning_rate": 1e-05,
"loss": 0.0696,
"step": 44100
},
{
"epoch": 0.000442,
"grad_norm": 0.5951113700866699,
"learning_rate": 1e-05,
"loss": 0.0683,
"step": 44200
},
{
"epoch": 0.000443,
"grad_norm": 0.7197650671005249,
"learning_rate": 1e-05,
"loss": 0.0696,
"step": 44300
},
{
"epoch": 0.000444,
"grad_norm": 0.6708860397338867,
"learning_rate": 1e-05,
"loss": 0.0692,
"step": 44400
},
{
"epoch": 0.000445,
"grad_norm": 0.6833498477935791,
"learning_rate": 1e-05,
"loss": 0.0694,
"step": 44500
},
{
"epoch": 0.000446,
"grad_norm": 0.6520599722862244,
"learning_rate": 1e-05,
"loss": 0.0694,
"step": 44600
},
{
"epoch": 0.000447,
"grad_norm": 0.7471343278884888,
"learning_rate": 1e-05,
"loss": 0.0679,
"step": 44700
},
{
"epoch": 0.000448,
"grad_norm": 0.6124304533004761,
"learning_rate": 1e-05,
"loss": 0.0685,
"step": 44800
},
{
"epoch": 0.000449,
"grad_norm": 0.6457110643386841,
"learning_rate": 1e-05,
"loss": 0.0683,
"step": 44900
},
{
"epoch": 0.00045,
"grad_norm": 0.8282802104949951,
"learning_rate": 1e-05,
"loss": 0.0675,
"step": 45000
},
{
"epoch": 0.000451,
"grad_norm": 0.7290102243423462,
"learning_rate": 1e-05,
"loss": 0.067,
"step": 45100
},
{
"epoch": 0.000452,
"grad_norm": 0.6666006445884705,
"learning_rate": 1e-05,
"loss": 0.0672,
"step": 45200
},
{
"epoch": 0.000453,
"grad_norm": 0.5930759906768799,
"learning_rate": 1e-05,
"loss": 0.0687,
"step": 45300
},
{
"epoch": 0.000454,
"grad_norm": 0.7391034960746765,
"learning_rate": 1e-05,
"loss": 0.0681,
"step": 45400
},
{
"epoch": 0.000455,
"grad_norm": 0.6331747770309448,
"learning_rate": 1e-05,
"loss": 0.0686,
"step": 45500
},
{
"epoch": 0.000456,
"grad_norm": 0.7175407409667969,
"learning_rate": 1e-05,
"loss": 0.0682,
"step": 45600
},
{
"epoch": 0.000457,
"grad_norm": 0.6839337348937988,
"learning_rate": 1e-05,
"loss": 0.068,
"step": 45700
},
{
"epoch": 0.000458,
"grad_norm": 0.7204523682594299,
"learning_rate": 1e-05,
"loss": 0.0674,
"step": 45800
},
{
"epoch": 0.000459,
"grad_norm": 0.6172782778739929,
"learning_rate": 1e-05,
"loss": 0.0672,
"step": 45900
},
{
"epoch": 0.00046,
"grad_norm": 0.6801437735557556,
"learning_rate": 1e-05,
"loss": 0.068,
"step": 46000
},
{
"epoch": 0.000461,
"grad_norm": 0.6950106620788574,
"learning_rate": 1e-05,
"loss": 0.0667,
"step": 46100
},
{
"epoch": 0.000462,
"grad_norm": 0.7430393099784851,
"learning_rate": 1e-05,
"loss": 0.0661,
"step": 46200
},
{
"epoch": 0.000463,
"grad_norm": 0.7335778474807739,
"learning_rate": 1e-05,
"loss": 0.0664,
"step": 46300
},
{
"epoch": 0.000464,
"grad_norm": 0.6109582185745239,
"learning_rate": 1e-05,
"loss": 0.0678,
"step": 46400
},
{
"epoch": 0.000465,
"grad_norm": 0.747843325138092,
"learning_rate": 1e-05,
"loss": 0.0666,
"step": 46500
},
{
"epoch": 0.000466,
"grad_norm": 0.5541141033172607,
"learning_rate": 1e-05,
"loss": 0.066,
"step": 46600
},
{
"epoch": 0.000467,
"grad_norm": 0.7821163535118103,
"learning_rate": 1e-05,
"loss": 0.0663,
"step": 46700
},
{
"epoch": 0.000468,
"grad_norm": 0.6927903294563293,
"learning_rate": 1e-05,
"loss": 0.0668,
"step": 46800
},
{
"epoch": 0.000469,
"grad_norm": 0.6270934343338013,
"learning_rate": 1e-05,
"loss": 0.0674,
"step": 46900
},
{
"epoch": 0.00047,
"grad_norm": 0.7509257197380066,
"learning_rate": 1e-05,
"loss": 0.0661,
"step": 47000
},
{
"epoch": 0.000471,
"grad_norm": 0.6083252429962158,
"learning_rate": 1e-05,
"loss": 0.0655,
"step": 47100
},
{
"epoch": 0.000472,
"grad_norm": 0.5622929334640503,
"learning_rate": 1e-05,
"loss": 0.065,
"step": 47200
},
{
"epoch": 0.000473,
"grad_norm": 0.5768439173698425,
"learning_rate": 1e-05,
"loss": 0.0663,
"step": 47300
},
{
"epoch": 0.000474,
"grad_norm": 0.7420287728309631,
"learning_rate": 1e-05,
"loss": 0.0647,
"step": 47400
},
{
"epoch": 0.000475,
"grad_norm": 0.6630219221115112,
"learning_rate": 1e-05,
"loss": 0.066,
"step": 47500
},
{
"epoch": 0.000476,
"grad_norm": 0.5590940713882446,
"learning_rate": 1e-05,
"loss": 0.0662,
"step": 47600
},
{
"epoch": 0.000477,
"grad_norm": 0.5448912382125854,
"learning_rate": 1e-05,
"loss": 0.0648,
"step": 47700
},
{
"epoch": 0.000478,
"grad_norm": 0.6090975999832153,
"learning_rate": 1e-05,
"loss": 0.0653,
"step": 47800
},
{
"epoch": 0.000479,
"grad_norm": 0.7398414611816406,
"learning_rate": 1e-05,
"loss": 0.0653,
"step": 47900
},
{
"epoch": 0.00048,
"grad_norm": 0.6005905270576477,
"learning_rate": 1e-05,
"loss": 0.0654,
"step": 48000
},
{
"epoch": 0.000481,
"grad_norm": 0.6361467838287354,
"learning_rate": 1e-05,
"loss": 0.0653,
"step": 48100
},
{
"epoch": 0.000482,
"grad_norm": 0.6767069101333618,
"learning_rate": 1e-05,
"loss": 0.0652,
"step": 48200
},
{
"epoch": 0.000483,
"grad_norm": 0.6184808015823364,
"learning_rate": 1e-05,
"loss": 0.0654,
"step": 48300
},
{
"epoch": 0.000484,
"grad_norm": 0.7021101117134094,
"learning_rate": 1e-05,
"loss": 0.0637,
"step": 48400
},
{
"epoch": 0.000485,
"grad_norm": 0.6103231310844421,
"learning_rate": 1e-05,
"loss": 0.0653,
"step": 48500
},
{
"epoch": 0.000486,
"grad_norm": 0.5976945161819458,
"learning_rate": 1e-05,
"loss": 0.0647,
"step": 48600
},
{
"epoch": 0.000487,
"grad_norm": 0.6222690343856812,
"learning_rate": 1e-05,
"loss": 0.0647,
"step": 48700
},
{
"epoch": 0.000488,
"grad_norm": 0.5408068299293518,
"learning_rate": 1e-05,
"loss": 0.0641,
"step": 48800
},
{
"epoch": 0.000489,
"grad_norm": 0.628935694694519,
"learning_rate": 1e-05,
"loss": 0.0642,
"step": 48900
},
{
"epoch": 0.00049,
"grad_norm": 0.6062678694725037,
"learning_rate": 1e-05,
"loss": 0.0645,
"step": 49000
},
{
"epoch": 0.000491,
"grad_norm": 0.6533873677253723,
"learning_rate": 1e-05,
"loss": 0.0648,
"step": 49100
},
{
"epoch": 0.000492,
"grad_norm": 0.6818357706069946,
"learning_rate": 1e-05,
"loss": 0.0642,
"step": 49200
},
{
"epoch": 0.000493,
"grad_norm": 0.5615854859352112,
"learning_rate": 1e-05,
"loss": 0.0649,
"step": 49300
},
{
"epoch": 0.000494,
"grad_norm": 0.5262526273727417,
"learning_rate": 1e-05,
"loss": 0.0645,
"step": 49400
},
{
"epoch": 0.000495,
"grad_norm": 0.5227097868919373,
"learning_rate": 1e-05,
"loss": 0.0634,
"step": 49500
},
{
"epoch": 0.000496,
"grad_norm": 0.5794950723648071,
"learning_rate": 1e-05,
"loss": 0.0632,
"step": 49600
},
{
"epoch": 0.000497,
"grad_norm": 0.5515991449356079,
"learning_rate": 1e-05,
"loss": 0.0639,
"step": 49700
},
{
"epoch": 0.000498,
"grad_norm": 0.5834317803382874,
"learning_rate": 1e-05,
"loss": 0.0633,
"step": 49800
},
{
"epoch": 0.000499,
"grad_norm": 0.6389098763465881,
"learning_rate": 1e-05,
"loss": 0.0637,
"step": 49900
},
{
"epoch": 0.0005,
"grad_norm": 0.6473069787025452,
"learning_rate": 1e-05,
"loss": 0.0634,
"step": 50000
},
{
"epoch": 0.000501,
"grad_norm": 0.5156600475311279,
"learning_rate": 1e-05,
"loss": 0.0638,
"step": 50100
},
{
"epoch": 0.000502,
"grad_norm": 0.6542375683784485,
"learning_rate": 1e-05,
"loss": 0.0635,
"step": 50200
},
{
"epoch": 0.000503,
"grad_norm": 0.8224967122077942,
"learning_rate": 1e-05,
"loss": 0.0631,
"step": 50300
},
{
"epoch": 0.000504,
"grad_norm": 0.6293924450874329,
"learning_rate": 1e-05,
"loss": 0.0619,
"step": 50400
},
{
"epoch": 0.000505,
"grad_norm": 0.7436028718948364,
"learning_rate": 1e-05,
"loss": 0.064,
"step": 50500
},
{
"epoch": 0.000506,
"grad_norm": 0.660367488861084,
"learning_rate": 1e-05,
"loss": 0.0639,
"step": 50600
},
{
"epoch": 0.000507,
"grad_norm": 0.5511479377746582,
"learning_rate": 1e-05,
"loss": 0.0625,
"step": 50700
},
{
"epoch": 0.000508,
"grad_norm": 0.5846619009971619,
"learning_rate": 1e-05,
"loss": 0.0634,
"step": 50800
},
{
"epoch": 0.000509,
"grad_norm": 0.5902076959609985,
"learning_rate": 1e-05,
"loss": 0.0637,
"step": 50900
},
{
"epoch": 0.00051,
"grad_norm": 0.5104527473449707,
"learning_rate": 1e-05,
"loss": 0.0627,
"step": 51000
},
{
"epoch": 0.000511,
"grad_norm": 0.592365026473999,
"learning_rate": 1e-05,
"loss": 0.0624,
"step": 51100
},
{
"epoch": 0.000512,
"grad_norm": 0.7283549904823303,
"learning_rate": 1e-05,
"loss": 0.0618,
"step": 51200
},
{
"epoch": 0.000513,
"grad_norm": 0.6117008328437805,
"learning_rate": 1e-05,
"loss": 0.0621,
"step": 51300
},
{
"epoch": 0.000514,
"grad_norm": 0.6155059933662415,
"learning_rate": 1e-05,
"loss": 0.0627,
"step": 51400
},
{
"epoch": 0.000515,
"grad_norm": 0.6605076789855957,
"learning_rate": 1e-05,
"loss": 0.0626,
"step": 51500
},
{
"epoch": 0.000516,
"grad_norm": 0.7391318082809448,
"learning_rate": 1e-05,
"loss": 0.0609,
"step": 51600
},
{
"epoch": 0.000517,
"grad_norm": 0.5673928260803223,
"learning_rate": 1e-05,
"loss": 0.0626,
"step": 51700
},
{
"epoch": 0.000518,
"grad_norm": 0.7229452729225159,
"learning_rate": 1e-05,
"loss": 0.0613,
"step": 51800
},
{
"epoch": 0.000519,
"grad_norm": 0.6015135049819946,
"learning_rate": 1e-05,
"loss": 0.0614,
"step": 51900
},
{
"epoch": 0.00052,
"grad_norm": 3.3136706352233887,
"learning_rate": 1e-05,
"loss": 0.0607,
"step": 52000
},
{
"epoch": 0.000521,
"grad_norm": 0.5922873616218567,
"learning_rate": 1e-05,
"loss": 0.0627,
"step": 52100
},
{
"epoch": 0.000522,
"grad_norm": 0.6967010498046875,
"learning_rate": 1e-05,
"loss": 0.0611,
"step": 52200
},
{
"epoch": 0.000523,
"grad_norm": 0.5986941456794739,
"learning_rate": 1e-05,
"loss": 0.0618,
"step": 52300
},
{
"epoch": 0.000524,
"grad_norm": 0.5476034879684448,
"learning_rate": 1e-05,
"loss": 0.0614,
"step": 52400
},
{
"epoch": 0.000525,
"grad_norm": 0.5859378576278687,
"learning_rate": 1e-05,
"loss": 0.0614,
"step": 52500
},
{
"epoch": 0.000526,
"grad_norm": 0.601116955280304,
"learning_rate": 1e-05,
"loss": 0.0618,
"step": 52600
},
{
"epoch": 0.000527,
"grad_norm": 0.5084663033485413,
"learning_rate": 1e-05,
"loss": 0.0622,
"step": 52700
},
{
"epoch": 0.000528,
"grad_norm": 0.5654129385948181,
"learning_rate": 1e-05,
"loss": 0.0625,
"step": 52800
},
{
"epoch": 0.000529,
"grad_norm": 0.5403587222099304,
"learning_rate": 1e-05,
"loss": 0.0605,
"step": 52900
},
{
"epoch": 0.00053,
"grad_norm": 0.5523150563240051,
"learning_rate": 1e-05,
"loss": 0.0615,
"step": 53000
},
{
"epoch": 0.000531,
"grad_norm": 0.6014654636383057,
"learning_rate": 1e-05,
"loss": 0.0613,
"step": 53100
},
{
"epoch": 0.000532,
"grad_norm": 0.6389763355255127,
"learning_rate": 1e-05,
"loss": 0.0618,
"step": 53200
},
{
"epoch": 0.000533,
"grad_norm": 0.6326813697814941,
"learning_rate": 1e-05,
"loss": 0.0621,
"step": 53300
},
{
"epoch": 0.000534,
"grad_norm": 0.5675824284553528,
"learning_rate": 1e-05,
"loss": 0.0603,
"step": 53400
},
{
"epoch": 0.000535,
"grad_norm": 0.6056302189826965,
"learning_rate": 1e-05,
"loss": 0.0604,
"step": 53500
},
{
"epoch": 0.000536,
"grad_norm": 0.7404552698135376,
"learning_rate": 1e-05,
"loss": 0.0617,
"step": 53600
},
{
"epoch": 0.000537,
"grad_norm": 0.5762139558792114,
"learning_rate": 1e-05,
"loss": 0.061,
"step": 53700
},
{
"epoch": 0.000538,
"grad_norm": 0.6377224922180176,
"learning_rate": 1e-05,
"loss": 0.0606,
"step": 53800
},
{
"epoch": 0.000539,
"grad_norm": 0.6007105708122253,
"learning_rate": 1e-05,
"loss": 0.0617,
"step": 53900
},
{
"epoch": 0.00054,
"grad_norm": 0.679589033126831,
"learning_rate": 1e-05,
"loss": 0.0609,
"step": 54000
},
{
"epoch": 0.000541,
"grad_norm": 0.6322323679924011,
"learning_rate": 1e-05,
"loss": 0.0611,
"step": 54100
},
{
"epoch": 0.000542,
"grad_norm": 0.7151752710342407,
"learning_rate": 1e-05,
"loss": 0.0594,
"step": 54200
},
{
"epoch": 0.000543,
"grad_norm": 0.5888739228248596,
"learning_rate": 1e-05,
"loss": 0.0608,
"step": 54300
},
{
"epoch": 0.000544,
"grad_norm": 0.5529482364654541,
"learning_rate": 1e-05,
"loss": 0.0616,
"step": 54400
},
{
"epoch": 0.000545,
"grad_norm": 0.5086714625358582,
"learning_rate": 1e-05,
"loss": 0.0599,
"step": 54500
},
{
"epoch": 0.000546,
"grad_norm": 0.5248231887817383,
"learning_rate": 1e-05,
"loss": 0.0611,
"step": 54600
},
{
"epoch": 0.000547,
"grad_norm": 0.48391416668891907,
"learning_rate": 1e-05,
"loss": 0.0603,
"step": 54700
},
{
"epoch": 0.000548,
"grad_norm": 0.6535386443138123,
"learning_rate": 1e-05,
"loss": 0.0599,
"step": 54800
},
{
"epoch": 0.000549,
"grad_norm": 0.6315100193023682,
"learning_rate": 1e-05,
"loss": 0.0599,
"step": 54900
},
{
"epoch": 0.00055,
"grad_norm": 0.5279924273490906,
"learning_rate": 1e-05,
"loss": 0.0601,
"step": 55000
},
{
"epoch": 0.000551,
"grad_norm": 0.5455300807952881,
"learning_rate": 1e-05,
"loss": 0.0601,
"step": 55100
},
{
"epoch": 0.000552,
"grad_norm": 0.556695282459259,
"learning_rate": 1e-05,
"loss": 0.06,
"step": 55200
},
{
"epoch": 0.000553,
"grad_norm": 0.5867908000946045,
"learning_rate": 1e-05,
"loss": 0.0584,
"step": 55300
},
{
"epoch": 0.000554,
"grad_norm": 0.6211426258087158,
"learning_rate": 1e-05,
"loss": 0.0594,
"step": 55400
},
{
"epoch": 0.000555,
"grad_norm": 0.6962873339653015,
"learning_rate": 1e-05,
"loss": 0.0588,
"step": 55500
},
{
"epoch": 0.000556,
"grad_norm": 0.5341864228248596,
"learning_rate": 1e-05,
"loss": 0.0594,
"step": 55600
},
{
"epoch": 0.000557,
"grad_norm": 0.5630548000335693,
"learning_rate": 1e-05,
"loss": 0.0596,
"step": 55700
},
{
"epoch": 0.000558,
"grad_norm": 0.6993235349655151,
"learning_rate": 1e-05,
"loss": 0.0592,
"step": 55800
},
{
"epoch": 0.000559,
"grad_norm": 0.5936434268951416,
"learning_rate": 1e-05,
"loss": 0.0589,
"step": 55900
},
{
"epoch": 0.00056,
"grad_norm": 0.6682338714599609,
"learning_rate": 1e-05,
"loss": 0.0592,
"step": 56000
},
{
"epoch": 0.000561,
"grad_norm": 0.5741124749183655,
"learning_rate": 1e-05,
"loss": 0.0586,
"step": 56100
},
{
"epoch": 0.000562,
"grad_norm": 0.5639105439186096,
"learning_rate": 1e-05,
"loss": 0.0596,
"step": 56200
},
{
"epoch": 0.000563,
"grad_norm": 0.6496306657791138,
"learning_rate": 1e-05,
"loss": 0.0588,
"step": 56300
},
{
"epoch": 0.000564,
"grad_norm": 0.6160135865211487,
"learning_rate": 1e-05,
"loss": 0.0593,
"step": 56400
},
{
"epoch": 0.000565,
"grad_norm": 0.6027793288230896,
"learning_rate": 1e-05,
"loss": 0.0579,
"step": 56500
},
{
"epoch": 0.000566,
"grad_norm": 0.6365297436714172,
"learning_rate": 1e-05,
"loss": 0.0592,
"step": 56600
},
{
"epoch": 0.000567,
"grad_norm": 0.6124427914619446,
"learning_rate": 1e-05,
"loss": 0.0584,
"step": 56700
},
{
"epoch": 0.000568,
"grad_norm": 0.5500183701515198,
"learning_rate": 1e-05,
"loss": 0.0603,
"step": 56800
},
{
"epoch": 0.000569,
"grad_norm": 0.6076985597610474,
"learning_rate": 1e-05,
"loss": 0.0586,
"step": 56900
},
{
"epoch": 0.00057,
"grad_norm": 0.5683192610740662,
"learning_rate": 1e-05,
"loss": 0.0577,
"step": 57000
},
{
"epoch": 0.000571,
"grad_norm": 0.6625038385391235,
"learning_rate": 1e-05,
"loss": 0.0581,
"step": 57100
},
{
"epoch": 0.000572,
"grad_norm": 0.40177464485168457,
"learning_rate": 1e-05,
"loss": 0.0586,
"step": 57200
},
{
"epoch": 0.000573,
"grad_norm": 0.6952741742134094,
"learning_rate": 1e-05,
"loss": 0.0584,
"step": 57300
},
{
"epoch": 0.000574,
"grad_norm": 0.6179869771003723,
"learning_rate": 1e-05,
"loss": 0.0589,
"step": 57400
},
{
"epoch": 0.000575,
"grad_norm": 0.5745118260383606,
"learning_rate": 1e-05,
"loss": 0.0578,
"step": 57500
},
{
"epoch": 0.000576,
"grad_norm": 0.4852728843688965,
"learning_rate": 1e-05,
"loss": 0.0584,
"step": 57600
},
{
"epoch": 0.000577,
"grad_norm": 0.6206620335578918,
"learning_rate": 1e-05,
"loss": 0.0583,
"step": 57700
},
{
"epoch": 0.000578,
"grad_norm": 0.6402736306190491,
"learning_rate": 1e-05,
"loss": 0.0574,
"step": 57800
},
{
"epoch": 0.000579,
"grad_norm": 0.5858712792396545,
"learning_rate": 1e-05,
"loss": 0.0582,
"step": 57900
},
{
"epoch": 0.00058,
"grad_norm": 0.5614802837371826,
"learning_rate": 1e-05,
"loss": 0.0586,
"step": 58000
},
{
"epoch": 0.000581,
"grad_norm": 0.6376156210899353,
"learning_rate": 1e-05,
"loss": 0.0574,
"step": 58100
},
{
"epoch": 0.000582,
"grad_norm": 0.5398702621459961,
"learning_rate": 1e-05,
"loss": 0.0567,
"step": 58200
},
{
"epoch": 0.000583,
"grad_norm": 0.6560328602790833,
"learning_rate": 1e-05,
"loss": 0.0586,
"step": 58300
},
{
"epoch": 0.000584,
"grad_norm": 0.48175305128097534,
"learning_rate": 1e-05,
"loss": 0.0579,
"step": 58400
},
{
"epoch": 0.000585,
"grad_norm": 0.47494786977767944,
"learning_rate": 1e-05,
"loss": 0.0565,
"step": 58500
},
{
"epoch": 0.000586,
"grad_norm": 0.6271668672561646,
"learning_rate": 1e-05,
"loss": 0.0572,
"step": 58600
},
{
"epoch": 0.000587,
"grad_norm": 0.5039101243019104,
"learning_rate": 1e-05,
"loss": 0.0578,
"step": 58700
},
{
"epoch": 0.000588,
"grad_norm": 0.5363636612892151,
"learning_rate": 1e-05,
"loss": 0.0578,
"step": 58800
},
{
"epoch": 0.000589,
"grad_norm": 0.6029368042945862,
"learning_rate": 1e-05,
"loss": 0.0567,
"step": 58900
},
{
"epoch": 0.00059,
"grad_norm": 0.5582793354988098,
"learning_rate": 1e-05,
"loss": 0.0579,
"step": 59000
},
{
"epoch": 0.000591,
"grad_norm": 0.5290389657020569,
"learning_rate": 1e-05,
"loss": 0.0575,
"step": 59100
},
{
"epoch": 0.000592,
"grad_norm": 0.5864163041114807,
"learning_rate": 1e-05,
"loss": 0.0565,
"step": 59200
},
{
"epoch": 0.000593,
"grad_norm": 0.49124574661254883,
"learning_rate": 1e-05,
"loss": 0.0584,
"step": 59300
},
{
"epoch": 0.000594,
"grad_norm": 0.5180615782737732,
"learning_rate": 1e-05,
"loss": 0.0555,
"step": 59400
},
{
"epoch": 0.000595,
"grad_norm": 0.5236871838569641,
"learning_rate": 1e-05,
"loss": 0.0574,
"step": 59500
},
{
"epoch": 0.000596,
"grad_norm": 0.7328921556472778,
"learning_rate": 1e-05,
"loss": 0.057,
"step": 59600
},
{
"epoch": 0.000597,
"grad_norm": 0.5635091662406921,
"learning_rate": 1e-05,
"loss": 0.0559,
"step": 59700
},
{
"epoch": 0.000598,
"grad_norm": 0.5094209313392639,
"learning_rate": 1e-05,
"loss": 0.057,
"step": 59800
},
{
"epoch": 0.000599,
"grad_norm": 0.5855716466903687,
"learning_rate": 1e-05,
"loss": 0.0566,
"step": 59900
},
{
"epoch": 0.0006,
"grad_norm": 0.6821003556251526,
"learning_rate": 1e-05,
"loss": 0.0559,
"step": 60000
},
{
"epoch": 0.0006,
"eval_loss": 0.045257568359375,
"eval_runtime": 142.735,
"eval_samples_per_second": 350.3,
"eval_steps_per_second": 21.894,
"step": 60000
},
{
"epoch": 0.000601,
"grad_norm": 0.5633527040481567,
"learning_rate": 1e-05,
"loss": 0.0562,
"step": 60100
},
{
"epoch": 0.000602,
"grad_norm": 0.5337314009666443,
"learning_rate": 1e-05,
"loss": 0.0562,
"step": 60200
},
{
"epoch": 0.000603,
"grad_norm": 0.5282440185546875,
"learning_rate": 1e-05,
"loss": 0.0549,
"step": 60300
},
{
"epoch": 0.000604,
"grad_norm": 0.5766568779945374,
"learning_rate": 1e-05,
"loss": 0.0576,
"step": 60400
},
{
"epoch": 0.000605,
"grad_norm": 0.5904074311256409,
"learning_rate": 1e-05,
"loss": 0.0563,
"step": 60500
},
{
"epoch": 0.000606,
"grad_norm": 0.6538689136505127,
"learning_rate": 1e-05,
"loss": 0.0566,
"step": 60600
},
{
"epoch": 0.000607,
"grad_norm": 0.45561283826828003,
"learning_rate": 1e-05,
"loss": 0.0561,
"step": 60700
},
{
"epoch": 0.000608,
"grad_norm": 0.47445598244667053,
"learning_rate": 1e-05,
"loss": 0.0561,
"step": 60800
},
{
"epoch": 0.000609,
"grad_norm": 0.7631045579910278,
"learning_rate": 1e-05,
"loss": 0.0556,
"step": 60900
},
{
"epoch": 0.00061,
"grad_norm": 0.5754849910736084,
"learning_rate": 1e-05,
"loss": 0.0553,
"step": 61000
},
{
"epoch": 0.000611,
"grad_norm": 0.6670407652854919,
"learning_rate": 1e-05,
"loss": 0.057,
"step": 61100
},
{
"epoch": 0.000612,
"grad_norm": 0.5728887319564819,
"learning_rate": 1e-05,
"loss": 0.0566,
"step": 61200
},
{
"epoch": 0.000613,
"grad_norm": 0.5342495441436768,
"learning_rate": 1e-05,
"loss": 0.0552,
"step": 61300
},
{
"epoch": 0.000614,
"grad_norm": 0.5812315344810486,
"learning_rate": 1e-05,
"loss": 0.0556,
"step": 61400
},
{
"epoch": 0.000615,
"grad_norm": 0.5818805694580078,
"learning_rate": 1e-05,
"loss": 0.0551,
"step": 61500
},
{
"epoch": 0.000616,
"grad_norm": 0.6204677224159241,
"learning_rate": 1e-05,
"loss": 0.0556,
"step": 61600
},
{
"epoch": 0.000617,
"grad_norm": 0.5443527102470398,
"learning_rate": 1e-05,
"loss": 0.0552,
"step": 61700
},
{
"epoch": 0.000618,
"grad_norm": 0.49102166295051575,
"learning_rate": 1e-05,
"loss": 0.0549,
"step": 61800
},
{
"epoch": 0.000619,
"grad_norm": 0.557538628578186,
"learning_rate": 1e-05,
"loss": 0.0543,
"step": 61900
},
{
"epoch": 0.00062,
"grad_norm": 0.620365560054779,
"learning_rate": 1e-05,
"loss": 0.0561,
"step": 62000
},
{
"epoch": 0.000621,
"grad_norm": 0.6253044009208679,
"learning_rate": 1e-05,
"loss": 0.0557,
"step": 62100
},
{
"epoch": 0.000622,
"grad_norm": 0.7837327122688293,
"learning_rate": 1e-05,
"loss": 0.055,
"step": 62200
},
{
"epoch": 0.000623,
"grad_norm": 0.5085681676864624,
"learning_rate": 1e-05,
"loss": 0.0552,
"step": 62300
},
{
"epoch": 0.000624,
"grad_norm": 0.608761191368103,
"learning_rate": 1e-05,
"loss": 0.0532,
"step": 62400
},
{
"epoch": 0.000625,
"grad_norm": 0.7588841915130615,
"learning_rate": 1e-05,
"loss": 0.0552,
"step": 62500
},
{
"epoch": 0.000626,
"grad_norm": 0.5510600209236145,
"learning_rate": 1e-05,
"loss": 0.0551,
"step": 62600
},
{
"epoch": 0.000627,
"grad_norm": 0.5801370739936829,
"learning_rate": 1e-05,
"loss": 0.0542,
"step": 62700
},
{
"epoch": 0.000628,
"grad_norm": 0.6703765988349915,
"learning_rate": 1e-05,
"loss": 0.0569,
"step": 62800
},
{
"epoch": 0.000629,
"grad_norm": 0.4344656467437744,
"learning_rate": 1e-05,
"loss": 0.0549,
"step": 62900
},
{
"epoch": 0.00063,
"grad_norm": 0.5678920745849609,
"learning_rate": 1e-05,
"loss": 0.0555,
"step": 63000
},
{
"epoch": 0.000631,
"grad_norm": 0.5048655271530151,
"learning_rate": 1e-05,
"loss": 0.0547,
"step": 63100
},
{
"epoch": 0.000632,
"grad_norm": 0.5324554443359375,
"learning_rate": 1e-05,
"loss": 0.0551,
"step": 63200
},
{
"epoch": 0.000633,
"grad_norm": 0.5735768675804138,
"learning_rate": 1e-05,
"loss": 0.0551,
"step": 63300
},
{
"epoch": 0.000634,
"grad_norm": 0.5694500803947449,
"learning_rate": 1e-05,
"loss": 0.0542,
"step": 63400
},
{
"epoch": 0.000635,
"grad_norm": 0.5009059906005859,
"learning_rate": 1e-05,
"loss": 0.0538,
"step": 63500
},
{
"epoch": 0.000636,
"grad_norm": 0.5886440277099609,
"learning_rate": 1e-05,
"loss": 0.0545,
"step": 63600
},
{
"epoch": 0.000637,
"grad_norm": 0.5673546195030212,
"learning_rate": 1e-05,
"loss": 0.0548,
"step": 63700
},
{
"epoch": 0.000638,
"grad_norm": 0.5466011762619019,
"learning_rate": 1e-05,
"loss": 0.054,
"step": 63800
},
{
"epoch": 0.000639,
"grad_norm": 0.5927892923355103,
"learning_rate": 1e-05,
"loss": 0.0548,
"step": 63900
},
{
"epoch": 0.00064,
"grad_norm": 0.7305207252502441,
"learning_rate": 1e-05,
"loss": 0.0536,
"step": 64000
},
{
"epoch": 0.000641,
"grad_norm": 0.5603034496307373,
"learning_rate": 1e-05,
"loss": 0.0536,
"step": 64100
},
{
"epoch": 0.000642,
"grad_norm": 0.6965247988700867,
"learning_rate": 1e-05,
"loss": 0.0546,
"step": 64200
},
{
"epoch": 0.000643,
"grad_norm": 0.57351154088974,
"learning_rate": 1e-05,
"loss": 0.0535,
"step": 64300
},
{
"epoch": 0.000644,
"grad_norm": 0.511005163192749,
"learning_rate": 1e-05,
"loss": 0.0548,
"step": 64400
},
{
"epoch": 0.000645,
"grad_norm": 0.5340495705604553,
"learning_rate": 1e-05,
"loss": 0.0543,
"step": 64500
},
{
"epoch": 0.000646,
"grad_norm": 0.6858961582183838,
"learning_rate": 1e-05,
"loss": 0.0538,
"step": 64600
},
{
"epoch": 0.000647,
"grad_norm": 0.6375705599784851,
"learning_rate": 1e-05,
"loss": 0.0536,
"step": 64700
},
{
"epoch": 0.000648,
"grad_norm": 0.48544806241989136,
"learning_rate": 1e-05,
"loss": 0.0529,
"step": 64800
},
{
"epoch": 0.000649,
"grad_norm": 0.49595892429351807,
"learning_rate": 1e-05,
"loss": 0.0529,
"step": 64900
},
{
"epoch": 0.00065,
"grad_norm": 0.4976153075695038,
"learning_rate": 1e-05,
"loss": 0.0538,
"step": 65000
},
{
"epoch": 0.000651,
"grad_norm": 0.5489813089370728,
"learning_rate": 1e-05,
"loss": 0.0528,
"step": 65100
},
{
"epoch": 0.000652,
"grad_norm": 0.4820660650730133,
"learning_rate": 1e-05,
"loss": 0.0536,
"step": 65200
},
{
"epoch": 0.000653,
"grad_norm": 0.5546014308929443,
"learning_rate": 1e-05,
"loss": 0.0529,
"step": 65300
},
{
"epoch": 0.000654,
"grad_norm": 0.4900113344192505,
"learning_rate": 1e-05,
"loss": 0.0535,
"step": 65400
},
{
"epoch": 0.000655,
"grad_norm": 0.6061577796936035,
"learning_rate": 1e-05,
"loss": 0.0533,
"step": 65500
},
{
"epoch": 0.000656,
"grad_norm": 0.6450973749160767,
"learning_rate": 1e-05,
"loss": 0.0542,
"step": 65600
},
{
"epoch": 0.000657,
"grad_norm": 0.677505612373352,
"learning_rate": 1e-05,
"loss": 0.0539,
"step": 65700
},
{
"epoch": 0.000658,
"grad_norm": 0.48482370376586914,
"learning_rate": 1e-05,
"loss": 0.0533,
"step": 65800
},
{
"epoch": 0.000659,
"grad_norm": 0.49198102951049805,
"learning_rate": 1e-05,
"loss": 0.0527,
"step": 65900
},
{
"epoch": 0.00066,
"grad_norm": 0.47996985912323,
"learning_rate": 1e-05,
"loss": 0.0544,
"step": 66000
},
{
"epoch": 0.000661,
"grad_norm": 0.548791229724884,
"learning_rate": 1e-05,
"loss": 0.0534,
"step": 66100
},
{
"epoch": 0.000662,
"grad_norm": 0.6156114935874939,
"learning_rate": 1e-05,
"loss": 0.0538,
"step": 66200
},
{
"epoch": 0.000663,
"grad_norm": 0.5212823748588562,
"learning_rate": 1e-05,
"loss": 0.0534,
"step": 66300
},
{
"epoch": 0.000664,
"grad_norm": 0.5812687873840332,
"learning_rate": 1e-05,
"loss": 0.0527,
"step": 66400
},
{
"epoch": 0.000665,
"grad_norm": 0.4992978572845459,
"learning_rate": 1e-05,
"loss": 0.0532,
"step": 66500
},
{
"epoch": 0.000666,
"grad_norm": 0.5525248050689697,
"learning_rate": 1e-05,
"loss": 0.0533,
"step": 66600
},
{
"epoch": 0.000667,
"grad_norm": 0.6456683874130249,
"learning_rate": 1e-05,
"loss": 0.0535,
"step": 66700
},
{
"epoch": 0.000668,
"grad_norm": 0.6112907528877258,
"learning_rate": 1e-05,
"loss": 0.0532,
"step": 66800
},
{
"epoch": 0.000669,
"grad_norm": 0.543624222278595,
"learning_rate": 1e-05,
"loss": 0.0539,
"step": 66900
},
{
"epoch": 0.00067,
"grad_norm": 0.5512799024581909,
"learning_rate": 1e-05,
"loss": 0.0513,
"step": 67000
},
{
"epoch": 0.000671,
"grad_norm": 0.631289005279541,
"learning_rate": 1e-05,
"loss": 0.0519,
"step": 67100
},
{
"epoch": 0.000672,
"grad_norm": 0.47048887610435486,
"learning_rate": 1e-05,
"loss": 0.0532,
"step": 67200
},
{
"epoch": 0.000673,
"grad_norm": 0.5930091142654419,
"learning_rate": 1e-05,
"loss": 0.0528,
"step": 67300
},
{
"epoch": 0.000674,
"grad_norm": 0.7611256837844849,
"learning_rate": 1e-05,
"loss": 0.0527,
"step": 67400
},
{
"epoch": 0.000675,
"grad_norm": 0.49624642729759216,
"learning_rate": 1e-05,
"loss": 0.0528,
"step": 67500
},
{
"epoch": 0.000676,
"grad_norm": 0.6547495126724243,
"learning_rate": 1e-05,
"loss": 0.0523,
"step": 67600
},
{
"epoch": 0.000677,
"grad_norm": 0.635519802570343,
"learning_rate": 1e-05,
"loss": 0.0521,
"step": 67700
},
{
"epoch": 0.000678,
"grad_norm": 0.606388509273529,
"learning_rate": 1e-05,
"loss": 0.052,
"step": 67800
},
{
"epoch": 0.000679,
"grad_norm": 0.4945245385169983,
"learning_rate": 1e-05,
"loss": 0.0522,
"step": 67900
},
{
"epoch": 0.00068,
"grad_norm": 0.4815261662006378,
"learning_rate": 1e-05,
"loss": 0.0533,
"step": 68000
},
{
"epoch": 0.000681,
"grad_norm": 0.47382187843322754,
"learning_rate": 1e-05,
"loss": 0.0519,
"step": 68100
},
{
"epoch": 0.000682,
"grad_norm": 0.549886167049408,
"learning_rate": 1e-05,
"loss": 0.0518,
"step": 68200
},
{
"epoch": 0.000683,
"grad_norm": 0.5204160213470459,
"learning_rate": 1e-05,
"loss": 0.0519,
"step": 68300
},
{
"epoch": 0.000684,
"grad_norm": 0.5802004933357239,
"learning_rate": 1e-05,
"loss": 0.0517,
"step": 68400
},
{
"epoch": 0.000685,
"grad_norm": 0.5576998591423035,
"learning_rate": 1e-05,
"loss": 0.0519,
"step": 68500
},
{
"epoch": 0.000686,
"grad_norm": 0.5708860158920288,
"learning_rate": 1e-05,
"loss": 0.0523,
"step": 68600
},
{
"epoch": 0.000687,
"grad_norm": 0.6270045042037964,
"learning_rate": 1e-05,
"loss": 0.0502,
"step": 68700
},
{
"epoch": 0.000688,
"grad_norm": 0.462593138217926,
"learning_rate": 1e-05,
"loss": 0.0517,
"step": 68800
},
{
"epoch": 0.000689,
"grad_norm": 0.4807493984699249,
"learning_rate": 1e-05,
"loss": 0.0524,
"step": 68900
},
{
"epoch": 0.00069,
"grad_norm": 0.5798048973083496,
"learning_rate": 1e-05,
"loss": 0.0527,
"step": 69000
},
{
"epoch": 0.000691,
"grad_norm": 0.44622689485549927,
"learning_rate": 1e-05,
"loss": 0.0528,
"step": 69100
},
{
"epoch": 0.000692,
"grad_norm": 0.5129225254058838,
"learning_rate": 1e-05,
"loss": 0.0528,
"step": 69200
},
{
"epoch": 0.000693,
"grad_norm": 0.5368632674217224,
"learning_rate": 1e-05,
"loss": 0.0524,
"step": 69300
},
{
"epoch": 0.000694,
"grad_norm": 0.559655487537384,
"learning_rate": 1e-05,
"loss": 0.0525,
"step": 69400
},
{
"epoch": 0.000695,
"grad_norm": 0.6121320128440857,
"learning_rate": 1e-05,
"loss": 0.0507,
"step": 69500
},
{
"epoch": 0.000696,
"grad_norm": 0.5470311045646667,
"learning_rate": 1e-05,
"loss": 0.0511,
"step": 69600
},
{
"epoch": 0.000697,
"grad_norm": 0.5142286419868469,
"learning_rate": 1e-05,
"loss": 0.0516,
"step": 69700
},
{
"epoch": 0.000698,
"grad_norm": 0.6724265217781067,
"learning_rate": 1e-05,
"loss": 0.0517,
"step": 69800
},
{
"epoch": 0.000699,
"grad_norm": 0.4707196354866028,
"learning_rate": 1e-05,
"loss": 0.0511,
"step": 69900
},
{
"epoch": 0.0007,
"grad_norm": 0.616026759147644,
"learning_rate": 1e-05,
"loss": 0.0517,
"step": 70000
},
{
"epoch": 0.000701,
"grad_norm": 0.5991165041923523,
"learning_rate": 1e-05,
"loss": 0.0512,
"step": 70100
},
{
"epoch": 0.000702,
"grad_norm": 0.5611563324928284,
"learning_rate": 1e-05,
"loss": 0.0509,
"step": 70200
},
{
"epoch": 0.000703,
"grad_norm": 0.46492424607276917,
"learning_rate": 1e-05,
"loss": 0.0511,
"step": 70300
},
{
"epoch": 0.000704,
"grad_norm": 0.5256513357162476,
"learning_rate": 1e-05,
"loss": 0.0518,
"step": 70400
},
{
"epoch": 0.000705,
"grad_norm": 0.499254435300827,
"learning_rate": 1e-05,
"loss": 0.0501,
"step": 70500
},
{
"epoch": 0.000706,
"grad_norm": 0.5403403043746948,
"learning_rate": 1e-05,
"loss": 0.0509,
"step": 70600
},
{
"epoch": 0.000707,
"grad_norm": 0.6283129453659058,
"learning_rate": 1e-05,
"loss": 0.0519,
"step": 70700
},
{
"epoch": 0.000708,
"grad_norm": 0.5229069590568542,
"learning_rate": 1e-05,
"loss": 0.051,
"step": 70800
},
{
"epoch": 0.000709,
"grad_norm": 0.48306700587272644,
"learning_rate": 1e-05,
"loss": 0.0504,
"step": 70900
},
{
"epoch": 0.00071,
"grad_norm": 0.5926072597503662,
"learning_rate": 1e-05,
"loss": 0.0506,
"step": 71000
},
{
"epoch": 0.000711,
"grad_norm": 0.5640701651573181,
"learning_rate": 1e-05,
"loss": 0.0506,
"step": 71100
},
{
"epoch": 0.000712,
"grad_norm": 0.49134358763694763,
"learning_rate": 1e-05,
"loss": 0.0512,
"step": 71200
},
{
"epoch": 0.000713,
"grad_norm": 0.4878164231777191,
"learning_rate": 1e-05,
"loss": 0.0512,
"step": 71300
},
{
"epoch": 0.000714,
"grad_norm": 0.6183532476425171,
"learning_rate": 1e-05,
"loss": 0.0508,
"step": 71400
},
{
"epoch": 0.000715,
"grad_norm": 0.5065814852714539,
"learning_rate": 1e-05,
"loss": 0.0505,
"step": 71500
},
{
"epoch": 0.000716,
"grad_norm": 0.548599898815155,
"learning_rate": 1e-05,
"loss": 0.0502,
"step": 71600
},
{
"epoch": 0.000717,
"grad_norm": 0.4534250795841217,
"learning_rate": 1e-05,
"loss": 0.05,
"step": 71700
},
{
"epoch": 0.000718,
"grad_norm": 0.5044461488723755,
"learning_rate": 1e-05,
"loss": 0.0502,
"step": 71800
},
{
"epoch": 0.000719,
"grad_norm": 0.5321183204650879,
"learning_rate": 1e-05,
"loss": 0.0498,
"step": 71900
},
{
"epoch": 0.00072,
"grad_norm": 0.4777474105358124,
"learning_rate": 1e-05,
"loss": 0.0503,
"step": 72000
},
{
"epoch": 0.000721,
"grad_norm": 0.6466835141181946,
"learning_rate": 1e-05,
"loss": 0.0507,
"step": 72100
},
{
"epoch": 0.000722,
"grad_norm": 0.5359812378883362,
"learning_rate": 1e-05,
"loss": 0.0506,
"step": 72200
},
{
"epoch": 0.000723,
"grad_norm": 0.4923792779445648,
"learning_rate": 1e-05,
"loss": 0.05,
"step": 72300
},
{
"epoch": 0.000724,
"grad_norm": 0.5708417892456055,
"learning_rate": 1e-05,
"loss": 0.0511,
"step": 72400
},
{
"epoch": 0.000725,
"grad_norm": 0.5016763806343079,
"learning_rate": 1e-05,
"loss": 0.0509,
"step": 72500
},
{
"epoch": 0.000726,
"grad_norm": 0.4299620985984802,
"learning_rate": 1e-05,
"loss": 0.0504,
"step": 72600
},
{
"epoch": 0.000727,
"grad_norm": 0.387928307056427,
"learning_rate": 1e-05,
"loss": 0.0493,
"step": 72700
},
{
"epoch": 0.000728,
"grad_norm": 0.5286259651184082,
"learning_rate": 1e-05,
"loss": 0.0508,
"step": 72800
},
{
"epoch": 0.000729,
"grad_norm": 0.511677622795105,
"learning_rate": 1e-05,
"loss": 0.0503,
"step": 72900
},
{
"epoch": 0.00073,
"grad_norm": 0.4648519456386566,
"learning_rate": 1e-05,
"loss": 0.0494,
"step": 73000
},
{
"epoch": 0.000731,
"grad_norm": 0.4918229877948761,
"learning_rate": 1e-05,
"loss": 0.0496,
"step": 73100
},
{
"epoch": 0.000732,
"grad_norm": 0.49148622155189514,
"learning_rate": 1e-05,
"loss": 0.0494,
"step": 73200
},
{
"epoch": 0.000733,
"grad_norm": 0.5078290104866028,
"learning_rate": 1e-05,
"loss": 0.0495,
"step": 73300
},
{
"epoch": 0.000734,
"grad_norm": 0.591152012348175,
"learning_rate": 1e-05,
"loss": 0.0506,
"step": 73400
},
{
"epoch": 0.000735,
"grad_norm": 0.5350937843322754,
"learning_rate": 1e-05,
"loss": 0.0499,
"step": 73500
},
{
"epoch": 0.000736,
"grad_norm": 0.4960618019104004,
"learning_rate": 1e-05,
"loss": 0.0495,
"step": 73600
},
{
"epoch": 0.000737,
"grad_norm": 0.46348682045936584,
"learning_rate": 1e-05,
"loss": 0.0493,
"step": 73700
},
{
"epoch": 0.000738,
"grad_norm": 0.6859008073806763,
"learning_rate": 1e-05,
"loss": 0.0506,
"step": 73800
},
{
"epoch": 0.000739,
"grad_norm": 0.5936481952667236,
"learning_rate": 1e-05,
"loss": 0.0504,
"step": 73900
},
{
"epoch": 0.00074,
"grad_norm": 0.6398313045501709,
"learning_rate": 1e-05,
"loss": 0.0498,
"step": 74000
},
{
"epoch": 0.000741,
"grad_norm": 0.6062189936637878,
"learning_rate": 1e-05,
"loss": 0.05,
"step": 74100
},
{
"epoch": 0.000742,
"grad_norm": 0.5730705261230469,
"learning_rate": 1e-05,
"loss": 0.0498,
"step": 74200
},
{
"epoch": 0.000743,
"grad_norm": 0.5183285474777222,
"learning_rate": 1e-05,
"loss": 0.05,
"step": 74300
},
{
"epoch": 0.000744,
"grad_norm": 0.4582626521587372,
"learning_rate": 1e-05,
"loss": 0.0493,
"step": 74400
},
{
"epoch": 0.000745,
"grad_norm": 0.4545513987541199,
"learning_rate": 1e-05,
"loss": 0.0497,
"step": 74500
},
{
"epoch": 0.000746,
"grad_norm": 0.6823522448539734,
"learning_rate": 1e-05,
"loss": 0.0494,
"step": 74600
},
{
"epoch": 0.000747,
"grad_norm": 0.5017057061195374,
"learning_rate": 1e-05,
"loss": 0.0498,
"step": 74700
},
{
"epoch": 0.000748,
"grad_norm": 0.4436599910259247,
"learning_rate": 1e-05,
"loss": 0.0507,
"step": 74800
},
{
"epoch": 0.000749,
"grad_norm": 0.5471747517585754,
"learning_rate": 1e-05,
"loss": 0.0491,
"step": 74900
},
{
"epoch": 0.00075,
"grad_norm": 0.4700005352497101,
"learning_rate": 1e-05,
"loss": 0.0493,
"step": 75000
},
{
"epoch": 0.000751,
"grad_norm": 0.5744854211807251,
"learning_rate": 1e-05,
"loss": 0.0494,
"step": 75100
},
{
"epoch": 0.000752,
"grad_norm": 0.4908376634120941,
"learning_rate": 1e-05,
"loss": 0.0493,
"step": 75200
},
{
"epoch": 0.000753,
"grad_norm": 0.5889230966567993,
"learning_rate": 1e-05,
"loss": 0.0497,
"step": 75300
},
{
"epoch": 0.000754,
"grad_norm": 0.5542328953742981,
"learning_rate": 1e-05,
"loss": 0.049,
"step": 75400
},
{
"epoch": 0.000755,
"grad_norm": 0.567498505115509,
"learning_rate": 1e-05,
"loss": 0.0487,
"step": 75500
},
{
"epoch": 0.000756,
"grad_norm": 0.4234246611595154,
"learning_rate": 1e-05,
"loss": 0.0494,
"step": 75600
},
{
"epoch": 0.000757,
"grad_norm": 0.7256674766540527,
"learning_rate": 1e-05,
"loss": 0.0496,
"step": 75700
},
{
"epoch": 0.000758,
"grad_norm": 0.6111962795257568,
"learning_rate": 1e-05,
"loss": 0.0494,
"step": 75800
},
{
"epoch": 0.000759,
"grad_norm": 0.5681432485580444,
"learning_rate": 1e-05,
"loss": 0.0483,
"step": 75900
},
{
"epoch": 0.00076,
"grad_norm": 0.44954606890678406,
"learning_rate": 1e-05,
"loss": 0.049,
"step": 76000
},
{
"epoch": 0.000761,
"grad_norm": 0.5693077445030212,
"learning_rate": 1e-05,
"loss": 0.0485,
"step": 76100
},
{
"epoch": 0.000762,
"grad_norm": 0.47221890091896057,
"learning_rate": 1e-05,
"loss": 0.0485,
"step": 76200
},
{
"epoch": 0.000763,
"grad_norm": 0.5012596249580383,
"learning_rate": 1e-05,
"loss": 0.0488,
"step": 76300
},
{
"epoch": 0.000764,
"grad_norm": 0.5051250457763672,
"learning_rate": 1e-05,
"loss": 0.0492,
"step": 76400
},
{
"epoch": 0.000765,
"grad_norm": 0.45128434896469116,
"learning_rate": 1e-05,
"loss": 0.0483,
"step": 76500
},
{
"epoch": 0.000766,
"grad_norm": 0.48324739933013916,
"learning_rate": 1e-05,
"loss": 0.0482,
"step": 76600
},
{
"epoch": 0.000767,
"grad_norm": 0.6752970814704895,
"learning_rate": 1e-05,
"loss": 0.0487,
"step": 76700
},
{
"epoch": 0.000768,
"grad_norm": 0.4630663990974426,
"learning_rate": 1e-05,
"loss": 0.0501,
"step": 76800
},
{
"epoch": 0.000769,
"grad_norm": 0.4887773394584656,
"learning_rate": 1e-05,
"loss": 0.0481,
"step": 76900
},
{
"epoch": 0.00077,
"grad_norm": 0.4609774947166443,
"learning_rate": 1e-05,
"loss": 0.0486,
"step": 77000
},
{
"epoch": 0.000771,
"grad_norm": 0.6502612233161926,
"learning_rate": 1e-05,
"loss": 0.0495,
"step": 77100
},
{
"epoch": 0.000772,
"grad_norm": 0.563583254814148,
"learning_rate": 1e-05,
"loss": 0.0493,
"step": 77200
},
{
"epoch": 0.000773,
"grad_norm": 0.5242981314659119,
"learning_rate": 1e-05,
"loss": 0.0485,
"step": 77300
},
{
"epoch": 0.000774,
"grad_norm": 0.5238550901412964,
"learning_rate": 1e-05,
"loss": 0.0482,
"step": 77400
},
{
"epoch": 0.000775,
"grad_norm": 0.38637349009513855,
"learning_rate": 1e-05,
"loss": 0.0482,
"step": 77500
},
{
"epoch": 0.000776,
"grad_norm": 0.5395223498344421,
"learning_rate": 1e-05,
"loss": 0.0482,
"step": 77600
},
{
"epoch": 0.000777,
"grad_norm": 0.5965639352798462,
"learning_rate": 1e-05,
"loss": 0.0482,
"step": 77700
},
{
"epoch": 0.000778,
"grad_norm": 0.4685559868812561,
"learning_rate": 1e-05,
"loss": 0.0474,
"step": 77800
},
{
"epoch": 0.000779,
"grad_norm": 0.46465954184532166,
"learning_rate": 1e-05,
"loss": 0.049,
"step": 77900
},
{
"epoch": 0.00078,
"grad_norm": 0.5408352017402649,
"learning_rate": 1e-05,
"loss": 0.0487,
"step": 78000
},
{
"epoch": 0.000781,
"grad_norm": 0.3893685042858124,
"learning_rate": 1e-05,
"loss": 0.0479,
"step": 78100
},
{
"epoch": 0.000782,
"grad_norm": 0.6658462285995483,
"learning_rate": 1e-05,
"loss": 0.048,
"step": 78200
},
{
"epoch": 0.000783,
"grad_norm": 0.6283921003341675,
"learning_rate": 1e-05,
"loss": 0.0488,
"step": 78300
},
{
"epoch": 0.000784,
"grad_norm": 0.4658546447753906,
"learning_rate": 1e-05,
"loss": 0.0486,
"step": 78400
},
{
"epoch": 0.000785,
"grad_norm": 0.5362129807472229,
"learning_rate": 1e-05,
"loss": 0.0488,
"step": 78500
},
{
"epoch": 0.000786,
"grad_norm": 0.5157918334007263,
"learning_rate": 1e-05,
"loss": 0.0482,
"step": 78600
},
{
"epoch": 0.000787,
"grad_norm": 0.5089668035507202,
"learning_rate": 1e-05,
"loss": 0.0485,
"step": 78700
},
{
"epoch": 0.000788,
"grad_norm": 0.49590611457824707,
"learning_rate": 1e-05,
"loss": 0.0476,
"step": 78800
},
{
"epoch": 0.000789,
"grad_norm": 0.4500684440135956,
"learning_rate": 1e-05,
"loss": 0.0482,
"step": 78900
},
{
"epoch": 0.00079,
"grad_norm": 0.4456005096435547,
"learning_rate": 1e-05,
"loss": 0.0479,
"step": 79000
},
{
"epoch": 0.000791,
"grad_norm": 0.502184271812439,
"learning_rate": 1e-05,
"loss": 0.0483,
"step": 79100
},
{
"epoch": 0.000792,
"grad_norm": 0.4004657566547394,
"learning_rate": 1e-05,
"loss": 0.0483,
"step": 79200
},
{
"epoch": 0.000793,
"grad_norm": 0.6616214513778687,
"learning_rate": 1e-05,
"loss": 0.048,
"step": 79300
},
{
"epoch": 0.000794,
"grad_norm": 0.5488511323928833,
"learning_rate": 1e-05,
"loss": 0.0473,
"step": 79400
},
{
"epoch": 0.000795,
"grad_norm": 0.5251606702804565,
"learning_rate": 1e-05,
"loss": 0.0489,
"step": 79500
},
{
"epoch": 0.000796,
"grad_norm": 0.43220826983451843,
"learning_rate": 1e-05,
"loss": 0.0469,
"step": 79600
},
{
"epoch": 0.000797,
"grad_norm": 0.5535863041877747,
"learning_rate": 1e-05,
"loss": 0.0487,
"step": 79700
},
{
"epoch": 0.000798,
"grad_norm": 0.4892144799232483,
"learning_rate": 1e-05,
"loss": 0.048,
"step": 79800
},
{
"epoch": 0.000799,
"grad_norm": 0.443042516708374,
"learning_rate": 1e-05,
"loss": 0.0484,
"step": 79900
},
{
"epoch": 0.0008,
"grad_norm": 0.4258803725242615,
"learning_rate": 1e-05,
"loss": 0.0476,
"step": 80000
},
{
"epoch": 0.0008,
"eval_loss": 0.0394287109375,
"eval_runtime": 147.8199,
"eval_samples_per_second": 338.249,
"eval_steps_per_second": 21.141,
"step": 80000
},
{
"epoch": 0.000801,
"grad_norm": 0.5370935201644897,
"learning_rate": 1e-05,
"loss": 0.0478,
"step": 80100
},
{
"epoch": 0.000802,
"grad_norm": 0.5561772584915161,
"learning_rate": 1e-05,
"loss": 0.0487,
"step": 80200
},
{
"epoch": 0.000803,
"grad_norm": 0.5092744827270508,
"learning_rate": 1e-05,
"loss": 0.0476,
"step": 80300
},
{
"epoch": 0.000804,
"grad_norm": 0.4691084623336792,
"learning_rate": 1e-05,
"loss": 0.0473,
"step": 80400
},
{
"epoch": 0.000805,
"grad_norm": 0.5660099387168884,
"learning_rate": 1e-05,
"loss": 0.0475,
"step": 80500
},
{
"epoch": 0.000806,
"grad_norm": 0.5250957012176514,
"learning_rate": 1e-05,
"loss": 0.0471,
"step": 80600
},
{
"epoch": 0.000807,
"grad_norm": 0.5492421388626099,
"learning_rate": 1e-05,
"loss": 0.0481,
"step": 80700
},
{
"epoch": 0.000808,
"grad_norm": 0.7874831557273865,
"learning_rate": 1e-05,
"loss": 0.0475,
"step": 80800
},
{
"epoch": 0.000809,
"grad_norm": 0.6476261615753174,
"learning_rate": 1e-05,
"loss": 0.0477,
"step": 80900
},
{
"epoch": 0.00081,
"grad_norm": 0.557145357131958,
"learning_rate": 1e-05,
"loss": 0.0477,
"step": 81000
},
{
"epoch": 0.000811,
"grad_norm": 0.5536689758300781,
"learning_rate": 1e-05,
"loss": 0.0475,
"step": 81100
},
{
"epoch": 0.000812,
"grad_norm": 0.5005760788917542,
"learning_rate": 1e-05,
"loss": 0.0472,
"step": 81200
},
{
"epoch": 0.000813,
"grad_norm": 0.43560323119163513,
"learning_rate": 1e-05,
"loss": 0.0473,
"step": 81300
},
{
"epoch": 0.000814,
"grad_norm": 0.49981963634490967,
"learning_rate": 1e-05,
"loss": 0.0468,
"step": 81400
},
{
"epoch": 0.000815,
"grad_norm": 0.5209627151489258,
"learning_rate": 1e-05,
"loss": 0.0476,
"step": 81500
},
{
"epoch": 0.000816,
"grad_norm": 0.7528536319732666,
"learning_rate": 1e-05,
"loss": 0.0471,
"step": 81600
},
{
"epoch": 0.000817,
"grad_norm": 0.6212517023086548,
"learning_rate": 1e-05,
"loss": 0.0476,
"step": 81700
},
{
"epoch": 0.000818,
"grad_norm": 0.45106619596481323,
"learning_rate": 1e-05,
"loss": 0.0475,
"step": 81800
},
{
"epoch": 0.000819,
"grad_norm": 0.5259119868278503,
"learning_rate": 1e-05,
"loss": 0.0473,
"step": 81900
},
{
"epoch": 0.00082,
"grad_norm": 0.4737171232700348,
"learning_rate": 1e-05,
"loss": 0.0478,
"step": 82000
},
{
"epoch": 0.000821,
"grad_norm": 0.5119843482971191,
"learning_rate": 1e-05,
"loss": 0.0467,
"step": 82100
},
{
"epoch": 0.000822,
"grad_norm": 0.3932953178882599,
"learning_rate": 1e-05,
"loss": 0.0465,
"step": 82200
},
{
"epoch": 0.000823,
"grad_norm": 0.43303382396698,
"learning_rate": 1e-05,
"loss": 0.047,
"step": 82300
},
{
"epoch": 0.000824,
"grad_norm": 0.5500777363777161,
"learning_rate": 1e-05,
"loss": 0.0461,
"step": 82400
},
{
"epoch": 0.000825,
"grad_norm": 0.5227336883544922,
"learning_rate": 1e-05,
"loss": 0.0477,
"step": 82500
},
{
"epoch": 0.000826,
"grad_norm": 0.5672751665115356,
"learning_rate": 1e-05,
"loss": 0.0476,
"step": 82600
},
{
"epoch": 0.000827,
"grad_norm": 0.5093204975128174,
"learning_rate": 1e-05,
"loss": 0.0468,
"step": 82700
},
{
"epoch": 0.000828,
"grad_norm": 0.47309496998786926,
"learning_rate": 1e-05,
"loss": 0.0464,
"step": 82800
},
{
"epoch": 0.000829,
"grad_norm": 0.4092000722885132,
"learning_rate": 1e-05,
"loss": 0.0467,
"step": 82900
},
{
"epoch": 0.00083,
"grad_norm": 0.42544227838516235,
"learning_rate": 1e-05,
"loss": 0.0455,
"step": 83000
},
{
"epoch": 0.000831,
"grad_norm": 0.5713441371917725,
"learning_rate": 1e-05,
"loss": 0.0457,
"step": 83100
},
{
"epoch": 0.000832,
"grad_norm": 0.5193179845809937,
"learning_rate": 1e-05,
"loss": 0.0463,
"step": 83200
},
{
"epoch": 0.000833,
"grad_norm": 0.43209248781204224,
"learning_rate": 1e-05,
"loss": 0.047,
"step": 83300
},
{
"epoch": 0.000834,
"grad_norm": 0.5342600345611572,
"learning_rate": 1e-05,
"loss": 0.0456,
"step": 83400
},
{
"epoch": 0.000835,
"grad_norm": 0.592204213142395,
"learning_rate": 1e-05,
"loss": 0.0472,
"step": 83500
},
{
"epoch": 0.000836,
"grad_norm": 0.5118575692176819,
"learning_rate": 1e-05,
"loss": 0.0467,
"step": 83600
},
{
"epoch": 0.000837,
"grad_norm": 0.4781627058982849,
"learning_rate": 1e-05,
"loss": 0.0463,
"step": 83700
},
{
"epoch": 0.000838,
"grad_norm": 0.4500192403793335,
"learning_rate": 1e-05,
"loss": 0.0468,
"step": 83800
},
{
"epoch": 0.000839,
"grad_norm": 0.49369123578071594,
"learning_rate": 1e-05,
"loss": 0.0463,
"step": 83900
},
{
"epoch": 0.00084,
"grad_norm": 0.48518478870391846,
"learning_rate": 1e-05,
"loss": 0.0466,
"step": 84000
},
{
"epoch": 0.000841,
"grad_norm": 0.4960392117500305,
"learning_rate": 1e-05,
"loss": 0.0464,
"step": 84100
},
{
"epoch": 0.000842,
"grad_norm": 0.4881882667541504,
"learning_rate": 1e-05,
"loss": 0.0461,
"step": 84200
},
{
"epoch": 0.000843,
"grad_norm": 0.45837706327438354,
"learning_rate": 1e-05,
"loss": 0.0462,
"step": 84300
},
{
"epoch": 0.000844,
"grad_norm": 0.4866684675216675,
"learning_rate": 1e-05,
"loss": 0.0456,
"step": 84400
},
{
"epoch": 0.000845,
"grad_norm": 0.5094208121299744,
"learning_rate": 1e-05,
"loss": 0.0466,
"step": 84500
},
{
"epoch": 0.000846,
"grad_norm": 0.45124098658561707,
"learning_rate": 1e-05,
"loss": 0.0474,
"step": 84600
},
{
"epoch": 0.000847,
"grad_norm": 0.5730771422386169,
"learning_rate": 1e-05,
"loss": 0.0465,
"step": 84700
},
{
"epoch": 0.000848,
"grad_norm": 0.48597007989883423,
"learning_rate": 1e-05,
"loss": 0.0463,
"step": 84800
},
{
"epoch": 0.000849,
"grad_norm": 0.46603092551231384,
"learning_rate": 1e-05,
"loss": 0.0465,
"step": 84900
},
{
"epoch": 0.00085,
"grad_norm": 0.5534038543701172,
"learning_rate": 1e-05,
"loss": 0.0469,
"step": 85000
},
{
"epoch": 0.000851,
"grad_norm": 0.42876607179641724,
"learning_rate": 1e-05,
"loss": 0.0465,
"step": 85100
},
{
"epoch": 0.000852,
"grad_norm": 0.39502009749412537,
"learning_rate": 1e-05,
"loss": 0.0466,
"step": 85200
},
{
"epoch": 0.000853,
"grad_norm": 0.44408953189849854,
"learning_rate": 1e-05,
"loss": 0.0468,
"step": 85300
},
{
"epoch": 0.000854,
"grad_norm": 0.444979190826416,
"learning_rate": 1e-05,
"loss": 0.0463,
"step": 85400
},
{
"epoch": 0.000855,
"grad_norm": 0.4805260896682739,
"learning_rate": 1e-05,
"loss": 0.0465,
"step": 85500
},
{
"epoch": 0.000856,
"grad_norm": 0.552291750907898,
"learning_rate": 1e-05,
"loss": 0.0462,
"step": 85600
},
{
"epoch": 0.000857,
"grad_norm": 0.5068393349647522,
"learning_rate": 1e-05,
"loss": 0.0461,
"step": 85700
},
{
"epoch": 0.000858,
"grad_norm": 0.41845035552978516,
"learning_rate": 1e-05,
"loss": 0.0461,
"step": 85800
},
{
"epoch": 0.000859,
"grad_norm": 0.4751891493797302,
"learning_rate": 1e-05,
"loss": 0.0458,
"step": 85900
},
{
"epoch": 0.00086,
"grad_norm": 0.5280572175979614,
"learning_rate": 1e-05,
"loss": 0.0458,
"step": 86000
},
{
"epoch": 0.000861,
"grad_norm": 0.68556147813797,
"learning_rate": 1e-05,
"loss": 0.046,
"step": 86100
},
{
"epoch": 0.000862,
"grad_norm": 0.5463889241218567,
"learning_rate": 1e-05,
"loss": 0.0457,
"step": 86200
},
{
"epoch": 0.000863,
"grad_norm": 0.44014325737953186,
"learning_rate": 1e-05,
"loss": 0.0459,
"step": 86300
},
{
"epoch": 0.000864,
"grad_norm": 0.5454211235046387,
"learning_rate": 1e-05,
"loss": 0.0457,
"step": 86400
},
{
"epoch": 0.000865,
"grad_norm": 0.5828255414962769,
"learning_rate": 1e-05,
"loss": 0.0453,
"step": 86500
},
{
"epoch": 0.000866,
"grad_norm": 0.4621482789516449,
"learning_rate": 1e-05,
"loss": 0.0456,
"step": 86600
},
{
"epoch": 0.000867,
"grad_norm": 0.4085827171802521,
"learning_rate": 1e-05,
"loss": 0.0458,
"step": 86700
},
{
"epoch": 0.000868,
"grad_norm": 0.504058301448822,
"learning_rate": 1e-05,
"loss": 0.0459,
"step": 86800
},
{
"epoch": 0.000869,
"grad_norm": 0.48852622509002686,
"learning_rate": 1e-05,
"loss": 0.0454,
"step": 86900
},
{
"epoch": 0.00087,
"grad_norm": 0.4814854860305786,
"learning_rate": 1e-05,
"loss": 0.0457,
"step": 87000
},
{
"epoch": 0.000871,
"grad_norm": 0.40433430671691895,
"learning_rate": 1e-05,
"loss": 0.0463,
"step": 87100
},
{
"epoch": 0.000872,
"grad_norm": 0.40531593561172485,
"learning_rate": 1e-05,
"loss": 0.0452,
"step": 87200
},
{
"epoch": 0.000873,
"grad_norm": 0.5245575308799744,
"learning_rate": 1e-05,
"loss": 0.0449,
"step": 87300
},
{
"epoch": 0.000874,
"grad_norm": 0.39926889538764954,
"learning_rate": 1e-05,
"loss": 0.0459,
"step": 87400
},
{
"epoch": 0.000875,
"grad_norm": 0.4549976587295532,
"learning_rate": 1e-05,
"loss": 0.0464,
"step": 87500
},
{
"epoch": 0.000876,
"grad_norm": 0.4379943013191223,
"learning_rate": 1e-05,
"loss": 0.0458,
"step": 87600
},
{
"epoch": 0.000877,
"grad_norm": 0.5028941035270691,
"learning_rate": 1e-05,
"loss": 0.0462,
"step": 87700
},
{
"epoch": 0.000878,
"grad_norm": 0.43268847465515137,
"learning_rate": 1e-05,
"loss": 0.0459,
"step": 87800
},
{
"epoch": 0.000879,
"grad_norm": 0.5015890002250671,
"learning_rate": 1e-05,
"loss": 0.0449,
"step": 87900
},
{
"epoch": 0.00088,
"grad_norm": 0.445121705532074,
"learning_rate": 1e-05,
"loss": 0.0457,
"step": 88000
},
{
"epoch": 0.000881,
"grad_norm": 0.49214833974838257,
"learning_rate": 1e-05,
"loss": 0.0459,
"step": 88100
},
{
"epoch": 0.000882,
"grad_norm": 0.4444495141506195,
"learning_rate": 1e-05,
"loss": 0.0455,
"step": 88200
},
{
"epoch": 0.000883,
"grad_norm": 0.49876669049263,
"learning_rate": 1e-05,
"loss": 0.0459,
"step": 88300
},
{
"epoch": 0.000884,
"grad_norm": 0.5114990472793579,
"learning_rate": 1e-05,
"loss": 0.045,
"step": 88400
},
{
"epoch": 0.000885,
"grad_norm": 0.48783600330352783,
"learning_rate": 1e-05,
"loss": 0.0461,
"step": 88500
},
{
"epoch": 0.000886,
"grad_norm": 0.45137009024620056,
"learning_rate": 1e-05,
"loss": 0.0451,
"step": 88600
},
{
"epoch": 0.000887,
"grad_norm": 0.5109623074531555,
"learning_rate": 1e-05,
"loss": 0.0453,
"step": 88700
},
{
"epoch": 0.000888,
"grad_norm": 0.57321697473526,
"learning_rate": 1e-05,
"loss": 0.0458,
"step": 88800
},
{
"epoch": 0.000889,
"grad_norm": 0.4072723686695099,
"learning_rate": 1e-05,
"loss": 0.0446,
"step": 88900
},
{
"epoch": 0.00089,
"grad_norm": 0.5093070268630981,
"learning_rate": 1e-05,
"loss": 0.045,
"step": 89000
},
{
"epoch": 0.000891,
"grad_norm": 0.5923020839691162,
"learning_rate": 1e-05,
"loss": 0.0444,
"step": 89100
},
{
"epoch": 0.000892,
"grad_norm": 0.4343903958797455,
"learning_rate": 1e-05,
"loss": 0.0445,
"step": 89200
},
{
"epoch": 0.000893,
"grad_norm": 0.6024598479270935,
"learning_rate": 1e-05,
"loss": 0.045,
"step": 89300
},
{
"epoch": 0.000894,
"grad_norm": 0.5708175301551819,
"learning_rate": 1e-05,
"loss": 0.0445,
"step": 89400
},
{
"epoch": 0.000895,
"grad_norm": 0.42085763812065125,
"learning_rate": 1e-05,
"loss": 0.0448,
"step": 89500
},
{
"epoch": 0.000896,
"grad_norm": 0.4565168023109436,
"learning_rate": 1e-05,
"loss": 0.0448,
"step": 89600
},
{
"epoch": 0.000897,
"grad_norm": 0.4638221561908722,
"learning_rate": 1e-05,
"loss": 0.0455,
"step": 89700
},
{
"epoch": 0.000898,
"grad_norm": 0.3921230435371399,
"learning_rate": 1e-05,
"loss": 0.0452,
"step": 89800
},
{
"epoch": 0.000899,
"grad_norm": 0.5701455473899841,
"learning_rate": 1e-05,
"loss": 0.045,
"step": 89900
},
{
"epoch": 0.0009,
"grad_norm": 0.5132615566253662,
"learning_rate": 1e-05,
"loss": 0.0458,
"step": 90000
},
{
"epoch": 0.000901,
"grad_norm": 0.43130597472190857,
"learning_rate": 1e-05,
"loss": 0.0449,
"step": 90100
},
{
"epoch": 0.000902,
"grad_norm": 0.4558640718460083,
"learning_rate": 1e-05,
"loss": 0.0446,
"step": 90200
},
{
"epoch": 0.000903,
"grad_norm": 0.4325823485851288,
"learning_rate": 1e-05,
"loss": 0.0449,
"step": 90300
},
{
"epoch": 0.000904,
"grad_norm": 0.5899006724357605,
"learning_rate": 1e-05,
"loss": 0.0446,
"step": 90400
},
{
"epoch": 0.000905,
"grad_norm": 0.6101588010787964,
"learning_rate": 1e-05,
"loss": 0.0455,
"step": 90500
},
{
"epoch": 0.000906,
"grad_norm": 0.5354421138763428,
"learning_rate": 1e-05,
"loss": 0.0447,
"step": 90600
},
{
"epoch": 0.000907,
"grad_norm": 0.4496416449546814,
"learning_rate": 1e-05,
"loss": 0.0444,
"step": 90700
},
{
"epoch": 0.000908,
"grad_norm": 0.40793660283088684,
"learning_rate": 1e-05,
"loss": 0.0447,
"step": 90800
},
{
"epoch": 0.000909,
"grad_norm": 0.5534836053848267,
"learning_rate": 1e-05,
"loss": 0.0444,
"step": 90900
},
{
"epoch": 0.00091,
"grad_norm": 0.4275030493736267,
"learning_rate": 1e-05,
"loss": 0.0448,
"step": 91000
},
{
"epoch": 0.000911,
"grad_norm": 0.5632148385047913,
"learning_rate": 1e-05,
"loss": 0.0443,
"step": 91100
},
{
"epoch": 0.000912,
"grad_norm": 0.43501216173171997,
"learning_rate": 1e-05,
"loss": 0.0439,
"step": 91200
},
{
"epoch": 0.000913,
"grad_norm": 0.54071444272995,
"learning_rate": 1e-05,
"loss": 0.0449,
"step": 91300
},
{
"epoch": 0.000914,
"grad_norm": 0.40895435214042664,
"learning_rate": 1e-05,
"loss": 0.0451,
"step": 91400
},
{
"epoch": 0.000915,
"grad_norm": 0.495510995388031,
"learning_rate": 1e-05,
"loss": 0.0441,
"step": 91500
},
{
"epoch": 0.000916,
"grad_norm": 0.3936554789543152,
"learning_rate": 1e-05,
"loss": 0.0437,
"step": 91600
},
{
"epoch": 0.000917,
"grad_norm": 0.4443312883377075,
"learning_rate": 1e-05,
"loss": 0.0443,
"step": 91700
},
{
"epoch": 0.000918,
"grad_norm": 0.5269384384155273,
"learning_rate": 1e-05,
"loss": 0.0442,
"step": 91800
},
{
"epoch": 0.000919,
"grad_norm": 0.43092164397239685,
"learning_rate": 1e-05,
"loss": 0.0442,
"step": 91900
},
{
"epoch": 0.00092,
"grad_norm": 0.498935729265213,
"learning_rate": 1e-05,
"loss": 0.044,
"step": 92000
},
{
"epoch": 0.000921,
"grad_norm": 0.4460262656211853,
"learning_rate": 1e-05,
"loss": 0.0448,
"step": 92100
},
{
"epoch": 0.000922,
"grad_norm": 0.4452255964279175,
"learning_rate": 1e-05,
"loss": 0.0441,
"step": 92200
},
{
"epoch": 0.000923,
"grad_norm": 0.5646675229072571,
"learning_rate": 1e-05,
"loss": 0.044,
"step": 92300
},
{
"epoch": 0.000924,
"grad_norm": 0.5320536494255066,
"learning_rate": 1e-05,
"loss": 0.0439,
"step": 92400
},
{
"epoch": 0.000925,
"grad_norm": 0.4475862681865692,
"learning_rate": 1e-05,
"loss": 0.0432,
"step": 92500
},
{
"epoch": 0.000926,
"grad_norm": 0.42607611417770386,
"learning_rate": 1e-05,
"loss": 0.0448,
"step": 92600
},
{
"epoch": 0.000927,
"grad_norm": 0.465669721364975,
"learning_rate": 1e-05,
"loss": 0.0447,
"step": 92700
},
{
"epoch": 0.000928,
"grad_norm": 0.47202736139297485,
"learning_rate": 1e-05,
"loss": 0.0449,
"step": 92800
},
{
"epoch": 0.000929,
"grad_norm": 0.45119792222976685,
"learning_rate": 1e-05,
"loss": 0.0443,
"step": 92900
},
{
"epoch": 0.00093,
"grad_norm": 0.4515833258628845,
"learning_rate": 1e-05,
"loss": 0.0446,
"step": 93000
},
{
"epoch": 0.000931,
"grad_norm": 0.43587127327919006,
"learning_rate": 1e-05,
"loss": 0.0436,
"step": 93100
},
{
"epoch": 0.000932,
"grad_norm": 0.4407802224159241,
"learning_rate": 1e-05,
"loss": 0.0446,
"step": 93200
},
{
"epoch": 0.000933,
"grad_norm": 0.4792422950267792,
"learning_rate": 1e-05,
"loss": 0.0439,
"step": 93300
},
{
"epoch": 0.000934,
"grad_norm": 0.5214342474937439,
"learning_rate": 1e-05,
"loss": 0.0438,
"step": 93400
},
{
"epoch": 0.000935,
"grad_norm": 0.5573062300682068,
"learning_rate": 1e-05,
"loss": 0.044,
"step": 93500
},
{
"epoch": 0.000936,
"grad_norm": 0.5918563008308411,
"learning_rate": 1e-05,
"loss": 0.0436,
"step": 93600
},
{
"epoch": 0.000937,
"grad_norm": 0.48166489601135254,
"learning_rate": 1e-05,
"loss": 0.0442,
"step": 93700
},
{
"epoch": 0.000938,
"grad_norm": 0.4840247631072998,
"learning_rate": 1e-05,
"loss": 0.0438,
"step": 93800
},
{
"epoch": 0.000939,
"grad_norm": 0.44477516412734985,
"learning_rate": 1e-05,
"loss": 0.044,
"step": 93900
},
{
"epoch": 0.00094,
"grad_norm": 0.5108721256256104,
"learning_rate": 1e-05,
"loss": 0.0434,
"step": 94000
},
{
"epoch": 0.000941,
"grad_norm": 0.5947906970977783,
"learning_rate": 1e-05,
"loss": 0.0441,
"step": 94100
},
{
"epoch": 0.000942,
"grad_norm": 0.4325408637523651,
"learning_rate": 1e-05,
"loss": 0.0434,
"step": 94200
},
{
"epoch": 0.000943,
"grad_norm": 0.5207073092460632,
"learning_rate": 1e-05,
"loss": 0.0432,
"step": 94300
},
{
"epoch": 0.000944,
"grad_norm": 0.4852275848388672,
"learning_rate": 1e-05,
"loss": 0.0439,
"step": 94400
},
{
"epoch": 0.000945,
"grad_norm": 0.5342420339584351,
"learning_rate": 1e-05,
"loss": 0.0436,
"step": 94500
},
{
"epoch": 0.000946,
"grad_norm": 0.6544240713119507,
"learning_rate": 1e-05,
"loss": 0.0436,
"step": 94600
},
{
"epoch": 0.000947,
"grad_norm": 0.456338107585907,
"learning_rate": 1e-05,
"loss": 0.0437,
"step": 94700
},
{
"epoch": 0.000948,
"grad_norm": 0.51591956615448,
"learning_rate": 1e-05,
"loss": 0.0429,
"step": 94800
},
{
"epoch": 0.000949,
"grad_norm": 0.5521871447563171,
"learning_rate": 1e-05,
"loss": 0.0437,
"step": 94900
},
{
"epoch": 0.00095,
"grad_norm": 0.46055886149406433,
"learning_rate": 1e-05,
"loss": 0.0433,
"step": 95000
},
{
"epoch": 0.000951,
"grad_norm": 0.5128651261329651,
"learning_rate": 1e-05,
"loss": 0.0441,
"step": 95100
},
{
"epoch": 0.000952,
"grad_norm": 0.5421969294548035,
"learning_rate": 1e-05,
"loss": 0.044,
"step": 95200
},
{
"epoch": 0.000953,
"grad_norm": 0.4281409680843353,
"learning_rate": 1e-05,
"loss": 0.0439,
"step": 95300
},
{
"epoch": 0.000954,
"grad_norm": 0.3867093622684479,
"learning_rate": 1e-05,
"loss": 0.0439,
"step": 95400
},
{
"epoch": 0.000955,
"grad_norm": 0.39425021409988403,
"learning_rate": 1e-05,
"loss": 0.0434,
"step": 95500
},
{
"epoch": 0.000956,
"grad_norm": 0.45868080854415894,
"learning_rate": 1e-05,
"loss": 0.0435,
"step": 95600
},
{
"epoch": 0.000957,
"grad_norm": 0.38381725549697876,
"learning_rate": 1e-05,
"loss": 0.0434,
"step": 95700
},
{
"epoch": 0.000958,
"grad_norm": 0.5100952386856079,
"learning_rate": 1e-05,
"loss": 0.043,
"step": 95800
},
{
"epoch": 0.000959,
"grad_norm": 0.45941147208213806,
"learning_rate": 1e-05,
"loss": 0.0442,
"step": 95900
},
{
"epoch": 0.00096,
"grad_norm": 0.3832944929599762,
"learning_rate": 1e-05,
"loss": 0.044,
"step": 96000
},
{
"epoch": 0.000961,
"grad_norm": 0.3378923535346985,
"learning_rate": 1e-05,
"loss": 0.0436,
"step": 96100
},
{
"epoch": 0.000962,
"grad_norm": 0.41457870602607727,
"learning_rate": 1e-05,
"loss": 0.0435,
"step": 96200
},
{
"epoch": 0.000963,
"grad_norm": 0.49303749203681946,
"learning_rate": 1e-05,
"loss": 0.0426,
"step": 96300
},
{
"epoch": 0.000964,
"grad_norm": 0.3703688383102417,
"learning_rate": 1e-05,
"loss": 0.0436,
"step": 96400
},
{
"epoch": 0.000965,
"grad_norm": 0.3742707371711731,
"learning_rate": 1e-05,
"loss": 0.0431,
"step": 96500
},
{
"epoch": 0.000966,
"grad_norm": 0.4352927505970001,
"learning_rate": 1e-05,
"loss": 0.0427,
"step": 96600
},
{
"epoch": 0.000967,
"grad_norm": 0.4979144334793091,
"learning_rate": 1e-05,
"loss": 0.044,
"step": 96700
},
{
"epoch": 0.000968,
"grad_norm": 0.38628560304641724,
"learning_rate": 1e-05,
"loss": 0.0434,
"step": 96800
},
{
"epoch": 0.000969,
"grad_norm": 0.5488578677177429,
"learning_rate": 1e-05,
"loss": 0.0442,
"step": 96900
},
{
"epoch": 0.00097,
"grad_norm": 0.3385869264602661,
"learning_rate": 1e-05,
"loss": 0.043,
"step": 97000
},
{
"epoch": 0.000971,
"grad_norm": 0.3328537046909332,
"learning_rate": 1e-05,
"loss": 0.0434,
"step": 97100
},
{
"epoch": 0.000972,
"grad_norm": 0.3595049977302551,
"learning_rate": 1e-05,
"loss": 0.0435,
"step": 97200
},
{
"epoch": 0.000973,
"grad_norm": 0.4202601909637451,
"learning_rate": 1e-05,
"loss": 0.0433,
"step": 97300
},
{
"epoch": 0.000974,
"grad_norm": 0.47522690892219543,
"learning_rate": 1e-05,
"loss": 0.0428,
"step": 97400
},
{
"epoch": 0.000975,
"grad_norm": 0.4936007857322693,
"learning_rate": 1e-05,
"loss": 0.0429,
"step": 97500
},
{
"epoch": 0.000976,
"grad_norm": 0.40649285912513733,
"learning_rate": 1e-05,
"loss": 0.0433,
"step": 97600
},
{
"epoch": 0.000977,
"grad_norm": 0.4392286241054535,
"learning_rate": 1e-05,
"loss": 0.0429,
"step": 97700
},
{
"epoch": 0.000978,
"grad_norm": 0.38572990894317627,
"learning_rate": 1e-05,
"loss": 0.0435,
"step": 97800
},
{
"epoch": 0.000979,
"grad_norm": 0.5374602675437927,
"learning_rate": 1e-05,
"loss": 0.0428,
"step": 97900
},
{
"epoch": 0.00098,
"grad_norm": 0.4686330258846283,
"learning_rate": 1e-05,
"loss": 0.0431,
"step": 98000
},
{
"epoch": 0.000981,
"grad_norm": 0.44734638929367065,
"learning_rate": 1e-05,
"loss": 0.0424,
"step": 98100
},
{
"epoch": 0.000982,
"grad_norm": 0.47658222913742065,
"learning_rate": 1e-05,
"loss": 0.0439,
"step": 98200
},
{
"epoch": 0.000983,
"grad_norm": 0.73811274766922,
"learning_rate": 1e-05,
"loss": 0.0428,
"step": 98300
},
{
"epoch": 0.000984,
"grad_norm": 0.4593341648578644,
"learning_rate": 1e-05,
"loss": 0.0429,
"step": 98400
},
{
"epoch": 0.000985,
"grad_norm": 0.4732546806335449,
"learning_rate": 1e-05,
"loss": 0.0433,
"step": 98500
},
{
"epoch": 0.000986,
"grad_norm": 0.37035250663757324,
"learning_rate": 1e-05,
"loss": 0.0424,
"step": 98600
},
{
"epoch": 0.000987,
"grad_norm": 0.47103026509284973,
"learning_rate": 1e-05,
"loss": 0.0435,
"step": 98700
},
{
"epoch": 0.000988,
"grad_norm": 0.47766396403312683,
"learning_rate": 1e-05,
"loss": 0.0431,
"step": 98800
},
{
"epoch": 0.000989,
"grad_norm": 0.44070738554000854,
"learning_rate": 1e-05,
"loss": 0.0431,
"step": 98900
},
{
"epoch": 0.00099,
"grad_norm": 0.44191232323646545,
"learning_rate": 1e-05,
"loss": 0.0429,
"step": 99000
},
{
"epoch": 0.000991,
"grad_norm": 0.4926696717739105,
"learning_rate": 1e-05,
"loss": 0.0426,
"step": 99100
},
{
"epoch": 0.000992,
"grad_norm": 0.3758436143398285,
"learning_rate": 1e-05,
"loss": 0.042,
"step": 99200
},
{
"epoch": 0.000993,
"grad_norm": 0.4165551960468292,
"learning_rate": 1e-05,
"loss": 0.0435,
"step": 99300
},
{
"epoch": 0.000994,
"grad_norm": 0.4664058983325958,
"learning_rate": 1e-05,
"loss": 0.0436,
"step": 99400
},
{
"epoch": 0.000995,
"grad_norm": 0.5242469906806946,
"learning_rate": 1e-05,
"loss": 0.0431,
"step": 99500
},
{
"epoch": 0.000996,
"grad_norm": 0.5722303986549377,
"learning_rate": 1e-05,
"loss": 0.0433,
"step": 99600
},
{
"epoch": 0.000997,
"grad_norm": 0.4828585684299469,
"learning_rate": 1e-05,
"loss": 0.0425,
"step": 99700
},
{
"epoch": 0.000998,
"grad_norm": 0.46811702847480774,
"learning_rate": 1e-05,
"loss": 0.0429,
"step": 99800
},
{
"epoch": 0.000999,
"grad_norm": 0.379393070936203,
"learning_rate": 1e-05,
"loss": 0.0432,
"step": 99900
},
{
"epoch": 0.001,
"grad_norm": 0.5672951340675354,
"learning_rate": 1e-05,
"loss": 0.0435,
"step": 100000
},
{
"epoch": 0.001,
"eval_loss": 0.035888671875,
"eval_runtime": 147.9294,
"eval_samples_per_second": 337.999,
"eval_steps_per_second": 21.125,
"step": 100000
}
],
"logging_steps": 100,
"max_steps": 100000000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 20000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 200,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.951156666368e+18,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}