GTECorrector_32_nq / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.2814258911819887,
"eval_steps": 800,
"global_step": 30400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004803001876172608,
"grad_norm": 3.330345630645752,
"learning_rate": 0.0001113525,
"loss": 4.3152,
"step": 64
},
{
"epoch": 0.009606003752345216,
"grad_norm": 2.5987207889556885,
"learning_rate": 0.00022447249999999998,
"loss": 3.5908,
"step": 128
},
{
"epoch": 0.014409005628517824,
"grad_norm": 10.176867485046387,
"learning_rate": 0.00033759249999999996,
"loss": 3.3927,
"step": 192
},
{
"epoch": 0.01921200750469043,
"grad_norm": 6.534875869750977,
"learning_rate": 0.00045071249999999993,
"loss": 3.3333,
"step": 256
},
{
"epoch": 0.02401500938086304,
"grad_norm": 6.088456630706787,
"learning_rate": 0.0005638325,
"loss": 3.2928,
"step": 320
},
{
"epoch": 0.028818011257035647,
"grad_norm": 6.937580108642578,
"learning_rate": 0.0006769524999999999,
"loss": 3.2901,
"step": 384
},
{
"epoch": 0.033621013133208255,
"grad_norm": 6.744969844818115,
"learning_rate": 0.0007900724999999999,
"loss": 3.2589,
"step": 448
},
{
"epoch": 0.03842401500938086,
"grad_norm": 2.2261719703674316,
"learning_rate": 0.0009031925,
"loss": 3.231,
"step": 512
},
{
"epoch": 0.04322701688555347,
"grad_norm": 1.030404806137085,
"learning_rate": 0.0010163124999999999,
"loss": 3.2278,
"step": 576
},
{
"epoch": 0.04803001876172608,
"grad_norm": 1.036293387413025,
"learning_rate": 0.0011294324999999998,
"loss": 3.272,
"step": 640
},
{
"epoch": 0.05283302063789869,
"grad_norm": 1.1835274696350098,
"learning_rate": 0.0012425525,
"loss": 3.256,
"step": 704
},
{
"epoch": 0.057636022514071295,
"grad_norm": 0.8378634452819824,
"learning_rate": 0.0013556724999999998,
"loss": 3.27,
"step": 768
},
{
"epoch": 0.0624390243902439,
"grad_norm": 0.7602612972259521,
"learning_rate": 0.0014687925,
"loss": 3.2261,
"step": 832
},
{
"epoch": 0.06724202626641651,
"grad_norm": 0.6387987732887268,
"learning_rate": 0.0015819124999999997,
"loss": 3.2153,
"step": 896
},
{
"epoch": 0.07204502814258912,
"grad_norm": 0.4422095715999603,
"learning_rate": 0.0016950325,
"loss": 3.1975,
"step": 960
},
{
"epoch": 0.07684803001876173,
"grad_norm": 0.39002183079719543,
"learning_rate": 0.0018081524999999999,
"loss": 3.1983,
"step": 1024
},
{
"epoch": 0.08165103189493433,
"grad_norm": 5.926162242889404,
"learning_rate": 0.0019212724999999996,
"loss": 3.1763,
"step": 1088
},
{
"epoch": 0.08645403377110694,
"grad_norm": 0.4173193871974945,
"learning_rate": 0.0020343924999999996,
"loss": 3.1833,
"step": 1152
},
{
"epoch": 0.09125703564727955,
"grad_norm": 0.4136042594909668,
"learning_rate": 0.0021475125,
"loss": 3.1846,
"step": 1216
},
{
"epoch": 0.09606003752345216,
"grad_norm": 0.39301183819770813,
"learning_rate": 0.0022606324999999996,
"loss": 3.1739,
"step": 1280
},
{
"epoch": 0.10086303939962477,
"grad_norm": 0.4910842776298523,
"learning_rate": 0.0023737525,
"loss": 3.1614,
"step": 1344
},
{
"epoch": 0.10566604127579737,
"grad_norm": 0.4039038121700287,
"learning_rate": 0.0024868725,
"loss": 3.1577,
"step": 1408
},
{
"epoch": 0.11046904315196998,
"grad_norm": 0.3286585211753845,
"learning_rate": 0.0025999925,
"loss": 3.1458,
"step": 1472
},
{
"epoch": 0.11527204502814259,
"grad_norm": 0.44095373153686523,
"learning_rate": 0.0027131125,
"loss": 3.155,
"step": 1536
},
{
"epoch": 0.1200750469043152,
"grad_norm": 0.40613290667533875,
"learning_rate": 0.0028262325,
"loss": 3.1469,
"step": 1600
},
{
"epoch": 0.1248780487804878,
"grad_norm": 0.4613141417503357,
"learning_rate": 0.002828,
"loss": 3.1392,
"step": 1664
},
{
"epoch": 0.1296810506566604,
"grad_norm": 0.3758493661880493,
"learning_rate": 0.002828,
"loss": 3.1298,
"step": 1728
},
{
"epoch": 0.13448405253283302,
"grad_norm": 0.32609787583351135,
"learning_rate": 0.002828,
"loss": 3.123,
"step": 1792
},
{
"epoch": 0.13928705440900563,
"grad_norm": 0.4221761226654053,
"learning_rate": 0.002828,
"loss": 3.1076,
"step": 1856
},
{
"epoch": 0.14409005628517824,
"grad_norm": 0.4372267425060272,
"learning_rate": 0.002828,
"loss": 3.098,
"step": 1920
},
{
"epoch": 0.14889305816135084,
"grad_norm": 0.36804404854774475,
"learning_rate": 0.002828,
"loss": 3.0952,
"step": 1984
},
{
"epoch": 0.15369606003752345,
"grad_norm": 0.314120888710022,
"learning_rate": 0.002828,
"loss": 3.0751,
"step": 2048
},
{
"epoch": 0.15849906191369606,
"grad_norm": 0.3158409297466278,
"learning_rate": 0.002828,
"loss": 3.0574,
"step": 2112
},
{
"epoch": 0.16330206378986867,
"grad_norm": 0.35668376088142395,
"learning_rate": 0.002828,
"loss": 3.0598,
"step": 2176
},
{
"epoch": 0.16810506566604128,
"grad_norm": 0.3429064452648163,
"learning_rate": 0.002828,
"loss": 3.0554,
"step": 2240
},
{
"epoch": 0.17290806754221388,
"grad_norm": 0.37981563806533813,
"learning_rate": 0.002828,
"loss": 3.0439,
"step": 2304
},
{
"epoch": 0.1777110694183865,
"grad_norm": 0.45046043395996094,
"learning_rate": 0.002828,
"loss": 3.034,
"step": 2368
},
{
"epoch": 0.1825140712945591,
"grad_norm": 0.30424681305885315,
"learning_rate": 0.002828,
"loss": 3.0408,
"step": 2432
},
{
"epoch": 0.1873170731707317,
"grad_norm": 0.4374525845050812,
"learning_rate": 0.002828,
"loss": 3.0289,
"step": 2496
},
{
"epoch": 0.19212007504690432,
"grad_norm": 0.4312361776828766,
"learning_rate": 0.002828,
"loss": 3.0252,
"step": 2560
},
{
"epoch": 0.19692307692307692,
"grad_norm": 0.33109021186828613,
"learning_rate": 0.002828,
"loss": 3.0094,
"step": 2624
},
{
"epoch": 0.20172607879924953,
"grad_norm": 0.4393901228904724,
"learning_rate": 0.002828,
"loss": 3.0021,
"step": 2688
},
{
"epoch": 0.20652908067542214,
"grad_norm": 0.44241341948509216,
"learning_rate": 0.002828,
"loss": 3.0005,
"step": 2752
},
{
"epoch": 0.21133208255159475,
"grad_norm": 0.36241745948791504,
"learning_rate": 0.002828,
"loss": 2.9939,
"step": 2816
},
{
"epoch": 0.21613508442776735,
"grad_norm": 0.40780672430992126,
"learning_rate": 0.002828,
"loss": 2.9788,
"step": 2880
},
{
"epoch": 0.22093808630393996,
"grad_norm": 0.3944590389728546,
"learning_rate": 0.002828,
"loss": 2.9854,
"step": 2944
},
{
"epoch": 0.22574108818011257,
"grad_norm": 0.40449267625808716,
"learning_rate": 0.002828,
"loss": 2.9819,
"step": 3008
},
{
"epoch": 0.23054409005628518,
"grad_norm": 0.37247487902641296,
"learning_rate": 0.002828,
"loss": 2.9827,
"step": 3072
},
{
"epoch": 0.23534709193245779,
"grad_norm": 0.3732891082763672,
"learning_rate": 0.002828,
"loss": 2.9714,
"step": 3136
},
{
"epoch": 0.2401500938086304,
"grad_norm": 0.3168690800666809,
"learning_rate": 0.002828,
"loss": 2.9649,
"step": 3200
},
{
"epoch": 0.244953095684803,
"grad_norm": 0.32185083627700806,
"learning_rate": 0.002828,
"loss": 2.9607,
"step": 3264
},
{
"epoch": 0.2497560975609756,
"grad_norm": 0.3293335437774658,
"learning_rate": 0.002828,
"loss": 2.9464,
"step": 3328
},
{
"epoch": 0.2545590994371482,
"grad_norm": 0.39153945446014404,
"learning_rate": 0.002828,
"loss": 2.9513,
"step": 3392
},
{
"epoch": 0.2593621013133208,
"grad_norm": 0.36884990334510803,
"learning_rate": 0.002828,
"loss": 2.9418,
"step": 3456
},
{
"epoch": 0.26416510318949343,
"grad_norm": 0.39196011424064636,
"learning_rate": 0.002828,
"loss": 2.9407,
"step": 3520
},
{
"epoch": 0.26896810506566604,
"grad_norm": 0.36011603474617004,
"learning_rate": 0.002828,
"loss": 2.9461,
"step": 3584
},
{
"epoch": 0.27377110694183865,
"grad_norm": 0.3608081638813019,
"learning_rate": 0.002828,
"loss": 2.937,
"step": 3648
},
{
"epoch": 0.27857410881801126,
"grad_norm": 0.3833774924278259,
"learning_rate": 0.002828,
"loss": 2.9254,
"step": 3712
},
{
"epoch": 0.28337711069418386,
"grad_norm": 0.35225459933280945,
"learning_rate": 0.002828,
"loss": 2.9165,
"step": 3776
},
{
"epoch": 0.2881801125703565,
"grad_norm": 0.39832860231399536,
"learning_rate": 0.002828,
"loss": 2.9259,
"step": 3840
},
{
"epoch": 0.2929831144465291,
"grad_norm": 0.36834558844566345,
"learning_rate": 0.002828,
"loss": 2.9186,
"step": 3904
},
{
"epoch": 0.2977861163227017,
"grad_norm": 0.3877101540565491,
"learning_rate": 0.002828,
"loss": 2.9107,
"step": 3968
},
{
"epoch": 0.3025891181988743,
"grad_norm": 0.40037983655929565,
"learning_rate": 0.002828,
"loss": 2.9086,
"step": 4032
},
{
"epoch": 0.3073921200750469,
"grad_norm": 0.35432353615760803,
"learning_rate": 0.002828,
"loss": 2.9039,
"step": 4096
},
{
"epoch": 0.3121951219512195,
"grad_norm": 0.3740752935409546,
"learning_rate": 0.002828,
"loss": 2.8973,
"step": 4160
},
{
"epoch": 0.3169981238273921,
"grad_norm": 0.3972289264202118,
"learning_rate": 0.002828,
"loss": 2.8868,
"step": 4224
},
{
"epoch": 0.3218011257035647,
"grad_norm": 0.3818065822124481,
"learning_rate": 0.002828,
"loss": 2.8916,
"step": 4288
},
{
"epoch": 0.32660412757973734,
"grad_norm": 0.31802886724472046,
"learning_rate": 0.002828,
"loss": 2.895,
"step": 4352
},
{
"epoch": 0.33140712945590994,
"grad_norm": 0.3920498192310333,
"learning_rate": 0.002828,
"loss": 2.896,
"step": 4416
},
{
"epoch": 0.33621013133208255,
"grad_norm": 0.42001602053642273,
"learning_rate": 0.002828,
"loss": 2.8757,
"step": 4480
},
{
"epoch": 0.34101313320825516,
"grad_norm": 0.38037222623825073,
"learning_rate": 0.002828,
"loss": 2.8812,
"step": 4544
},
{
"epoch": 0.34581613508442777,
"grad_norm": 0.6402748823165894,
"learning_rate": 0.002828,
"loss": 2.8741,
"step": 4608
},
{
"epoch": 0.3506191369606004,
"grad_norm": 0.3265625536441803,
"learning_rate": 0.002828,
"loss": 2.8659,
"step": 4672
},
{
"epoch": 0.355422138836773,
"grad_norm": 0.3389698565006256,
"learning_rate": 0.002828,
"loss": 2.863,
"step": 4736
},
{
"epoch": 0.3602251407129456,
"grad_norm": 0.34922096133232117,
"learning_rate": 0.002828,
"loss": 2.8555,
"step": 4800
},
{
"epoch": 0.3650281425891182,
"grad_norm": 0.370980441570282,
"learning_rate": 0.002828,
"loss": 2.8624,
"step": 4864
},
{
"epoch": 0.3698311444652908,
"grad_norm": 0.3553221821784973,
"learning_rate": 0.002828,
"loss": 2.8573,
"step": 4928
},
{
"epoch": 0.3746341463414634,
"grad_norm": 0.36796537041664124,
"learning_rate": 0.002828,
"loss": 2.8567,
"step": 4992
},
{
"epoch": 0.379437148217636,
"grad_norm": 0.3615240752696991,
"learning_rate": 0.002828,
"loss": 2.8444,
"step": 5056
},
{
"epoch": 0.38424015009380863,
"grad_norm": 0.4196101427078247,
"learning_rate": 0.002828,
"loss": 2.845,
"step": 5120
},
{
"epoch": 0.38904315196998124,
"grad_norm": 0.334185928106308,
"learning_rate": 0.002828,
"loss": 2.8376,
"step": 5184
},
{
"epoch": 0.39384615384615385,
"grad_norm": 0.30093756318092346,
"learning_rate": 0.002828,
"loss": 2.8302,
"step": 5248
},
{
"epoch": 0.39864915572232645,
"grad_norm": 0.41615140438079834,
"learning_rate": 0.002828,
"loss": 2.8365,
"step": 5312
},
{
"epoch": 0.40345215759849906,
"grad_norm": 0.38547712564468384,
"learning_rate": 0.002828,
"loss": 2.833,
"step": 5376
},
{
"epoch": 0.40825515947467167,
"grad_norm": 0.336453378200531,
"learning_rate": 0.002828,
"loss": 2.8289,
"step": 5440
},
{
"epoch": 0.4130581613508443,
"grad_norm": 0.33043336868286133,
"learning_rate": 0.002828,
"loss": 2.8154,
"step": 5504
},
{
"epoch": 0.4178611632270169,
"grad_norm": 0.33151519298553467,
"learning_rate": 0.002828,
"loss": 2.8267,
"step": 5568
},
{
"epoch": 0.4226641651031895,
"grad_norm": 0.29418498277664185,
"learning_rate": 0.002828,
"loss": 2.8167,
"step": 5632
},
{
"epoch": 0.4274671669793621,
"grad_norm": 0.3507523536682129,
"learning_rate": 0.002828,
"loss": 2.8227,
"step": 5696
},
{
"epoch": 0.4322701688555347,
"grad_norm": 0.36976736783981323,
"learning_rate": 0.002828,
"loss": 2.8087,
"step": 5760
},
{
"epoch": 0.4370731707317073,
"grad_norm": 0.4142448604106903,
"learning_rate": 0.002828,
"loss": 2.8191,
"step": 5824
},
{
"epoch": 0.4418761726078799,
"grad_norm": 0.3893688917160034,
"learning_rate": 0.002828,
"loss": 2.8032,
"step": 5888
},
{
"epoch": 0.44667917448405253,
"grad_norm": 0.3025995194911957,
"learning_rate": 0.002828,
"loss": 2.8049,
"step": 5952
},
{
"epoch": 0.45148217636022514,
"grad_norm": 0.3676198422908783,
"learning_rate": 0.002828,
"loss": 2.7976,
"step": 6016
},
{
"epoch": 0.45628517823639775,
"grad_norm": 0.39022454619407654,
"learning_rate": 0.002828,
"loss": 2.796,
"step": 6080
},
{
"epoch": 0.46108818011257036,
"grad_norm": 0.38986560702323914,
"learning_rate": 0.002828,
"loss": 2.791,
"step": 6144
},
{
"epoch": 0.46589118198874296,
"grad_norm": 0.35879769921302795,
"learning_rate": 0.002828,
"loss": 2.7949,
"step": 6208
},
{
"epoch": 0.47069418386491557,
"grad_norm": 0.44419315457344055,
"learning_rate": 0.002828,
"loss": 2.7862,
"step": 6272
},
{
"epoch": 0.4754971857410882,
"grad_norm": 0.30884304642677307,
"learning_rate": 0.002828,
"loss": 2.7864,
"step": 6336
},
{
"epoch": 0.4803001876172608,
"grad_norm": 0.542960524559021,
"learning_rate": 0.002828,
"loss": 2.7842,
"step": 6400
},
{
"epoch": 0.4851031894934334,
"grad_norm": 0.39032405614852905,
"learning_rate": 0.002828,
"loss": 2.7798,
"step": 6464
},
{
"epoch": 0.489906191369606,
"grad_norm": 0.3760650157928467,
"learning_rate": 0.002828,
"loss": 2.78,
"step": 6528
},
{
"epoch": 0.4947091932457786,
"grad_norm": 0.33309632539749146,
"learning_rate": 0.002828,
"loss": 2.7741,
"step": 6592
},
{
"epoch": 0.4995121951219512,
"grad_norm": 0.37640711665153503,
"learning_rate": 0.002828,
"loss": 2.7795,
"step": 6656
},
{
"epoch": 0.5043151969981239,
"grad_norm": 0.36830273270606995,
"learning_rate": 0.002828,
"loss": 2.7596,
"step": 6720
},
{
"epoch": 0.5091181988742964,
"grad_norm": 0.3751394748687744,
"learning_rate": 0.002828,
"loss": 2.761,
"step": 6784
},
{
"epoch": 0.5139212007504691,
"grad_norm": 0.3472868800163269,
"learning_rate": 0.002828,
"loss": 2.7567,
"step": 6848
},
{
"epoch": 0.5187242026266417,
"grad_norm": 0.3749905526638031,
"learning_rate": 0.002828,
"loss": 2.7654,
"step": 6912
},
{
"epoch": 0.5235272045028143,
"grad_norm": 0.4672335982322693,
"learning_rate": 0.002828,
"loss": 2.7467,
"step": 6976
},
{
"epoch": 0.5283302063789869,
"grad_norm": 0.30083194375038147,
"learning_rate": 0.002828,
"loss": 2.7596,
"step": 7040
},
{
"epoch": 0.5331332082551595,
"grad_norm": 0.34232673048973083,
"learning_rate": 0.002828,
"loss": 2.7425,
"step": 7104
},
{
"epoch": 0.5379362101313321,
"grad_norm": 0.42222973704338074,
"learning_rate": 0.002828,
"loss": 2.7486,
"step": 7168
},
{
"epoch": 0.5427392120075047,
"grad_norm": 0.36008650064468384,
"learning_rate": 0.002828,
"loss": 2.7451,
"step": 7232
},
{
"epoch": 0.5475422138836773,
"grad_norm": 0.34359127283096313,
"learning_rate": 0.002828,
"loss": 2.734,
"step": 7296
},
{
"epoch": 0.55234521575985,
"grad_norm": 0.3953745663166046,
"learning_rate": 0.002828,
"loss": 2.7397,
"step": 7360
},
{
"epoch": 0.5571482176360225,
"grad_norm": 0.36703094840049744,
"learning_rate": 0.002828,
"loss": 2.7313,
"step": 7424
},
{
"epoch": 0.5619512195121952,
"grad_norm": 0.31787919998168945,
"learning_rate": 0.002828,
"loss": 2.7363,
"step": 7488
},
{
"epoch": 0.5667542213883677,
"grad_norm": 0.31179967522621155,
"learning_rate": 0.002828,
"loss": 2.7236,
"step": 7552
},
{
"epoch": 0.5715572232645404,
"grad_norm": 0.3990299105644226,
"learning_rate": 0.002828,
"loss": 2.7191,
"step": 7616
},
{
"epoch": 0.576360225140713,
"grad_norm": 0.3776848018169403,
"learning_rate": 0.002828,
"loss": 2.7244,
"step": 7680
},
{
"epoch": 0.5811632270168856,
"grad_norm": 0.36117562651634216,
"learning_rate": 0.002828,
"loss": 2.7131,
"step": 7744
},
{
"epoch": 0.5859662288930582,
"grad_norm": 0.3219313323497772,
"learning_rate": 0.002828,
"loss": 2.7202,
"step": 7808
},
{
"epoch": 0.5907692307692308,
"grad_norm": 0.4501495957374573,
"learning_rate": 0.002828,
"loss": 2.7115,
"step": 7872
},
{
"epoch": 0.5955722326454034,
"grad_norm": 0.3939913809299469,
"learning_rate": 0.002828,
"loss": 2.7076,
"step": 7936
},
{
"epoch": 0.600375234521576,
"grad_norm": 0.3244933485984802,
"learning_rate": 0.002828,
"loss": 2.7047,
"step": 8000
},
{
"epoch": 0.6051782363977486,
"grad_norm": 0.3094891607761383,
"learning_rate": 0.002828,
"loss": 2.698,
"step": 8064
},
{
"epoch": 0.6099812382739213,
"grad_norm": 0.3525580167770386,
"learning_rate": 0.002828,
"loss": 2.7056,
"step": 8128
},
{
"epoch": 0.6147842401500938,
"grad_norm": 0.3058718144893646,
"learning_rate": 0.002828,
"loss": 2.6937,
"step": 8192
},
{
"epoch": 0.6195872420262665,
"grad_norm": 0.31864726543426514,
"learning_rate": 0.002828,
"loss": 2.6935,
"step": 8256
},
{
"epoch": 0.624390243902439,
"grad_norm": 0.3197256028652191,
"learning_rate": 0.002828,
"loss": 2.6981,
"step": 8320
},
{
"epoch": 0.6291932457786117,
"grad_norm": 0.30954182147979736,
"learning_rate": 0.002828,
"loss": 2.705,
"step": 8384
},
{
"epoch": 0.6339962476547842,
"grad_norm": 0.4144911468029022,
"learning_rate": 0.002828,
"loss": 2.6832,
"step": 8448
},
{
"epoch": 0.6387992495309569,
"grad_norm": 0.34720951318740845,
"learning_rate": 0.002828,
"loss": 2.6858,
"step": 8512
},
{
"epoch": 0.6436022514071295,
"grad_norm": 0.30545172095298767,
"learning_rate": 0.002828,
"loss": 2.6758,
"step": 8576
},
{
"epoch": 0.6484052532833021,
"grad_norm": 0.3341416120529175,
"learning_rate": 0.002828,
"loss": 2.6673,
"step": 8640
},
{
"epoch": 0.6532082551594747,
"grad_norm": 0.5191973447799683,
"learning_rate": 0.002828,
"loss": 2.6798,
"step": 8704
},
{
"epoch": 0.6580112570356473,
"grad_norm": 0.44382575154304504,
"learning_rate": 0.002828,
"loss": 2.683,
"step": 8768
},
{
"epoch": 0.6628142589118199,
"grad_norm": 0.45676809549331665,
"learning_rate": 0.002828,
"loss": 2.6731,
"step": 8832
},
{
"epoch": 0.6676172607879926,
"grad_norm": 0.3542475700378418,
"learning_rate": 0.002828,
"loss": 2.6813,
"step": 8896
},
{
"epoch": 0.6724202626641651,
"grad_norm": 0.3976110517978668,
"learning_rate": 0.002828,
"loss": 2.6714,
"step": 8960
},
{
"epoch": 0.6772232645403378,
"grad_norm": 0.37194061279296875,
"learning_rate": 0.002828,
"loss": 2.6646,
"step": 9024
},
{
"epoch": 0.6820262664165103,
"grad_norm": 0.4080849289894104,
"learning_rate": 0.002828,
"loss": 2.6638,
"step": 9088
},
{
"epoch": 0.686829268292683,
"grad_norm": 0.3275296986103058,
"learning_rate": 0.002828,
"loss": 2.6643,
"step": 9152
},
{
"epoch": 0.6916322701688555,
"grad_norm": 0.4300732910633087,
"learning_rate": 0.002828,
"loss": 2.6545,
"step": 9216
},
{
"epoch": 0.6964352720450282,
"grad_norm": 0.528816282749176,
"learning_rate": 0.002828,
"loss": 2.6639,
"step": 9280
},
{
"epoch": 0.7012382739212007,
"grad_norm": 0.39729437232017517,
"learning_rate": 0.002828,
"loss": 2.6669,
"step": 9344
},
{
"epoch": 0.7060412757973734,
"grad_norm": 0.36177024245262146,
"learning_rate": 0.002828,
"loss": 2.6429,
"step": 9408
},
{
"epoch": 0.710844277673546,
"grad_norm": 0.3488599956035614,
"learning_rate": 0.002828,
"loss": 2.6409,
"step": 9472
},
{
"epoch": 0.7156472795497186,
"grad_norm": 0.361208438873291,
"learning_rate": 0.002828,
"loss": 2.6354,
"step": 9536
},
{
"epoch": 0.7204502814258912,
"grad_norm": 0.3307696282863617,
"learning_rate": 0.002828,
"loss": 2.6398,
"step": 9600
},
{
"epoch": 0.7252532833020638,
"grad_norm": 0.47409588098526,
"learning_rate": 0.002828,
"loss": 2.6899,
"step": 9664
},
{
"epoch": 0.7300562851782364,
"grad_norm": 0.43482983112335205,
"learning_rate": 0.002828,
"loss": 2.6675,
"step": 9728
},
{
"epoch": 0.7348592870544091,
"grad_norm": 0.43177512288093567,
"learning_rate": 0.002828,
"loss": 2.6703,
"step": 9792
},
{
"epoch": 0.7396622889305816,
"grad_norm": 0.5830815434455872,
"learning_rate": 0.002828,
"loss": 2.6698,
"step": 9856
},
{
"epoch": 0.7444652908067543,
"grad_norm": 0.42559024691581726,
"learning_rate": 0.002828,
"loss": 2.6687,
"step": 9920
},
{
"epoch": 0.7492682926829268,
"grad_norm": 0.36572182178497314,
"learning_rate": 0.002828,
"loss": 2.6602,
"step": 9984
},
{
"epoch": 0.7540712945590995,
"grad_norm": 0.42863738536834717,
"learning_rate": 0.002828,
"loss": 2.6684,
"step": 10048
},
{
"epoch": 0.758874296435272,
"grad_norm": 0.34681934118270874,
"learning_rate": 0.002828,
"loss": 2.6618,
"step": 10112
},
{
"epoch": 0.7636772983114447,
"grad_norm": 0.40332967042922974,
"learning_rate": 0.002828,
"loss": 2.6523,
"step": 10176
},
{
"epoch": 0.7684803001876173,
"grad_norm": 0.47137463092803955,
"learning_rate": 0.002828,
"loss": 2.6543,
"step": 10240
},
{
"epoch": 0.7732833020637899,
"grad_norm": 0.3324384093284607,
"learning_rate": 0.002828,
"loss": 2.6444,
"step": 10304
},
{
"epoch": 0.7780863039399625,
"grad_norm": 0.3714103698730469,
"learning_rate": 0.002828,
"loss": 2.6466,
"step": 10368
},
{
"epoch": 0.7828893058161351,
"grad_norm": 0.3684547543525696,
"learning_rate": 0.002828,
"loss": 2.6497,
"step": 10432
},
{
"epoch": 0.7876923076923077,
"grad_norm": 0.3580617606639862,
"learning_rate": 0.002828,
"loss": 2.6428,
"step": 10496
},
{
"epoch": 0.7924953095684804,
"grad_norm": 0.4132176339626312,
"learning_rate": 0.002828,
"loss": 2.6407,
"step": 10560
},
{
"epoch": 0.7972983114446529,
"grad_norm": 0.4079800546169281,
"learning_rate": 0.002828,
"loss": 2.6374,
"step": 10624
},
{
"epoch": 0.8021013133208256,
"grad_norm": 0.40170854330062866,
"learning_rate": 0.002828,
"loss": 2.6319,
"step": 10688
},
{
"epoch": 0.8069043151969981,
"grad_norm": 0.4748755097389221,
"learning_rate": 0.002828,
"loss": 2.6489,
"step": 10752
},
{
"epoch": 0.8117073170731708,
"grad_norm": 0.3806183338165283,
"learning_rate": 0.002828,
"loss": 2.6363,
"step": 10816
},
{
"epoch": 0.8165103189493433,
"grad_norm": 0.32777532935142517,
"learning_rate": 0.002828,
"loss": 2.6386,
"step": 10880
},
{
"epoch": 0.821313320825516,
"grad_norm": 0.4884773790836334,
"learning_rate": 0.002828,
"loss": 2.6293,
"step": 10944
},
{
"epoch": 0.8261163227016886,
"grad_norm": 0.43175649642944336,
"learning_rate": 0.002828,
"loss": 2.6351,
"step": 11008
},
{
"epoch": 0.8309193245778612,
"grad_norm": 0.44375500082969666,
"learning_rate": 0.002828,
"loss": 2.6272,
"step": 11072
},
{
"epoch": 0.8357223264540338,
"grad_norm": 0.36503469944000244,
"learning_rate": 0.002828,
"loss": 2.628,
"step": 11136
},
{
"epoch": 0.8405253283302064,
"grad_norm": 0.3493196368217468,
"learning_rate": 0.002828,
"loss": 2.6238,
"step": 11200
},
{
"epoch": 0.845328330206379,
"grad_norm": 0.3593812584877014,
"learning_rate": 0.002828,
"loss": 2.6161,
"step": 11264
},
{
"epoch": 0.8501313320825516,
"grad_norm": 0.4043927788734436,
"learning_rate": 0.002828,
"loss": 2.6248,
"step": 11328
},
{
"epoch": 0.8549343339587242,
"grad_norm": 0.3805730938911438,
"learning_rate": 0.002828,
"loss": 2.619,
"step": 11392
},
{
"epoch": 0.8597373358348969,
"grad_norm": 0.40822461247444153,
"learning_rate": 0.002828,
"loss": 2.619,
"step": 11456
},
{
"epoch": 0.8645403377110694,
"grad_norm": 0.3430253565311432,
"learning_rate": 0.002828,
"loss": 2.6162,
"step": 11520
},
{
"epoch": 0.8693433395872421,
"grad_norm": 0.3665921688079834,
"learning_rate": 0.002828,
"loss": 2.6083,
"step": 11584
},
{
"epoch": 0.8741463414634146,
"grad_norm": 0.3768637776374817,
"learning_rate": 0.002828,
"loss": 2.6085,
"step": 11648
},
{
"epoch": 0.8789493433395873,
"grad_norm": 0.6709098219871521,
"learning_rate": 0.002828,
"loss": 2.6067,
"step": 11712
},
{
"epoch": 0.8837523452157598,
"grad_norm": 0.37109729647636414,
"learning_rate": 0.002828,
"loss": 2.5975,
"step": 11776
},
{
"epoch": 0.8885553470919325,
"grad_norm": 0.35545358061790466,
"learning_rate": 0.002828,
"loss": 2.6086,
"step": 11840
},
{
"epoch": 0.8933583489681051,
"grad_norm": 0.34493309259414673,
"learning_rate": 0.002828,
"loss": 2.6009,
"step": 11904
},
{
"epoch": 0.8981613508442777,
"grad_norm": 0.35226738452911377,
"learning_rate": 0.002828,
"loss": 2.5909,
"step": 11968
},
{
"epoch": 0.9029643527204503,
"grad_norm": 0.3626823425292969,
"learning_rate": 0.002828,
"loss": 2.5954,
"step": 12032
},
{
"epoch": 0.9077673545966229,
"grad_norm": 0.4639281630516052,
"learning_rate": 0.002828,
"loss": 2.5976,
"step": 12096
},
{
"epoch": 0.9125703564727955,
"grad_norm": 0.425073504447937,
"learning_rate": 0.002828,
"loss": 2.5846,
"step": 12160
},
{
"epoch": 0.9173733583489682,
"grad_norm": 0.4849206507205963,
"learning_rate": 0.002828,
"loss": 2.5851,
"step": 12224
},
{
"epoch": 0.9221763602251407,
"grad_norm": 0.3517647385597229,
"learning_rate": 0.002828,
"loss": 2.5832,
"step": 12288
},
{
"epoch": 0.9269793621013134,
"grad_norm": 0.4217440187931061,
"learning_rate": 0.002828,
"loss": 2.5777,
"step": 12352
},
{
"epoch": 0.9317823639774859,
"grad_norm": 0.3862438499927521,
"learning_rate": 0.002828,
"loss": 2.5769,
"step": 12416
},
{
"epoch": 0.9365853658536586,
"grad_norm": 0.4026007056236267,
"learning_rate": 0.002828,
"loss": 2.5802,
"step": 12480
},
{
"epoch": 0.9413883677298311,
"grad_norm": 0.3353049159049988,
"learning_rate": 0.002828,
"loss": 2.5741,
"step": 12544
},
{
"epoch": 0.9461913696060038,
"grad_norm": 0.35357797145843506,
"learning_rate": 0.002828,
"loss": 2.5723,
"step": 12608
},
{
"epoch": 0.9509943714821764,
"grad_norm": 0.35685861110687256,
"learning_rate": 0.002828,
"loss": 2.5801,
"step": 12672
},
{
"epoch": 0.955797373358349,
"grad_norm": 0.36265361309051514,
"learning_rate": 0.002828,
"loss": 2.5784,
"step": 12736
},
{
"epoch": 0.9606003752345216,
"grad_norm": 0.4119773805141449,
"learning_rate": 0.002828,
"loss": 2.5646,
"step": 12800
},
{
"epoch": 0.9654033771106942,
"grad_norm": 0.3662680387496948,
"learning_rate": 0.002828,
"loss": 2.5693,
"step": 12864
},
{
"epoch": 0.9702063789868668,
"grad_norm": 0.3822716772556305,
"learning_rate": 0.002828,
"loss": 2.5643,
"step": 12928
},
{
"epoch": 0.9750093808630395,
"grad_norm": 0.3412950038909912,
"learning_rate": 0.002828,
"loss": 2.5646,
"step": 12992
},
{
"epoch": 0.979812382739212,
"grad_norm": 0.373353511095047,
"learning_rate": 0.002828,
"loss": 2.5614,
"step": 13056
},
{
"epoch": 0.9846153846153847,
"grad_norm": 10.112526893615723,
"learning_rate": 0.002828,
"loss": 2.5578,
"step": 13120
},
{
"epoch": 0.9894183864915572,
"grad_norm": 0.36393383145332336,
"learning_rate": 0.002828,
"loss": 2.5696,
"step": 13184
},
{
"epoch": 0.9942213883677299,
"grad_norm": 0.4176023006439209,
"learning_rate": 0.002828,
"loss": 2.5533,
"step": 13248
},
{
"epoch": 0.9990243902439024,
"grad_norm": 0.4248984456062317,
"learning_rate": 0.002828,
"loss": 2.5569,
"step": 13312
},
{
"epoch": 1.003827392120075,
"grad_norm": 0.3931824564933777,
"learning_rate": 0.002828,
"loss": 2.5246,
"step": 13376
},
{
"epoch": 1.0086303939962478,
"grad_norm": 0.3742982745170593,
"learning_rate": 0.002828,
"loss": 2.5104,
"step": 13440
},
{
"epoch": 1.0134333958724202,
"grad_norm": 0.4388613998889923,
"learning_rate": 0.002828,
"loss": 2.519,
"step": 13504
},
{
"epoch": 1.0182363977485929,
"grad_norm": 0.41458427906036377,
"learning_rate": 0.002828,
"loss": 2.5162,
"step": 13568
},
{
"epoch": 1.0230393996247655,
"grad_norm": 0.3841855227947235,
"learning_rate": 0.002828,
"loss": 2.5129,
"step": 13632
},
{
"epoch": 1.027842401500938,
"grad_norm": 0.43930500745773315,
"learning_rate": 0.002828,
"loss": 2.5179,
"step": 13696
},
{
"epoch": 1.0326454033771106,
"grad_norm": 0.3687760531902313,
"learning_rate": 0.002828,
"loss": 2.5006,
"step": 13760
},
{
"epoch": 1.0374484052532833,
"grad_norm": 0.3823833465576172,
"learning_rate": 0.002828,
"loss": 2.5039,
"step": 13824
},
{
"epoch": 1.042251407129456,
"grad_norm": 0.40025222301483154,
"learning_rate": 0.002828,
"loss": 2.5155,
"step": 13888
},
{
"epoch": 1.0470544090056286,
"grad_norm": 0.40790122747421265,
"learning_rate": 0.002828,
"loss": 2.5064,
"step": 13952
},
{
"epoch": 1.051857410881801,
"grad_norm": 0.42718634009361267,
"learning_rate": 0.002828,
"loss": 2.5095,
"step": 14016
},
{
"epoch": 1.0566604127579737,
"grad_norm": 0.3305782079696655,
"learning_rate": 0.002828,
"loss": 2.5,
"step": 14080
},
{
"epoch": 1.0614634146341464,
"grad_norm": 0.37126559019088745,
"learning_rate": 0.002828,
"loss": 2.5099,
"step": 14144
},
{
"epoch": 1.0662664165103188,
"grad_norm": 0.414987176656723,
"learning_rate": 0.002828,
"loss": 2.501,
"step": 14208
},
{
"epoch": 1.0710694183864915,
"grad_norm": 0.45917075872421265,
"learning_rate": 0.002828,
"loss": 2.5062,
"step": 14272
},
{
"epoch": 1.0758724202626642,
"grad_norm": 0.4362465739250183,
"learning_rate": 0.002828,
"loss": 2.4949,
"step": 14336
},
{
"epoch": 1.0806754221388368,
"grad_norm": 0.40015289187431335,
"learning_rate": 0.002828,
"loss": 2.4957,
"step": 14400
},
{
"epoch": 1.0854784240150095,
"grad_norm": 0.3781159818172455,
"learning_rate": 0.002828,
"loss": 2.4979,
"step": 14464
},
{
"epoch": 1.090281425891182,
"grad_norm": 0.4165579676628113,
"learning_rate": 0.002828,
"loss": 2.4913,
"step": 14528
},
{
"epoch": 1.0950844277673546,
"grad_norm": 0.4100767970085144,
"learning_rate": 0.002828,
"loss": 2.4978,
"step": 14592
},
{
"epoch": 1.0998874296435273,
"grad_norm": 0.4211256504058838,
"learning_rate": 0.002828,
"loss": 2.4871,
"step": 14656
},
{
"epoch": 1.1046904315196997,
"grad_norm": 0.390396386384964,
"learning_rate": 0.002828,
"loss": 2.4933,
"step": 14720
},
{
"epoch": 1.1094934333958724,
"grad_norm": 0.3585355281829834,
"learning_rate": 0.002828,
"loss": 2.4811,
"step": 14784
},
{
"epoch": 1.114296435272045,
"grad_norm": 0.5148431062698364,
"learning_rate": 0.002828,
"loss": 2.4888,
"step": 14848
},
{
"epoch": 1.1190994371482177,
"grad_norm": 0.44254639744758606,
"learning_rate": 0.002828,
"loss": 2.4821,
"step": 14912
},
{
"epoch": 1.1239024390243904,
"grad_norm": 0.3710468113422394,
"learning_rate": 0.002828,
"loss": 2.4819,
"step": 14976
},
{
"epoch": 1.1287054409005628,
"grad_norm": 0.41197285056114197,
"learning_rate": 0.002828,
"loss": 2.4842,
"step": 15040
},
{
"epoch": 1.1335084427767355,
"grad_norm": 0.37512508034706116,
"learning_rate": 0.002828,
"loss": 2.4776,
"step": 15104
},
{
"epoch": 1.1383114446529081,
"grad_norm": 0.4286038279533386,
"learning_rate": 0.002828,
"loss": 2.4748,
"step": 15168
},
{
"epoch": 1.1431144465290806,
"grad_norm": 0.37446776032447815,
"learning_rate": 0.002828,
"loss": 2.4727,
"step": 15232
},
{
"epoch": 1.1479174484052532,
"grad_norm": 0.4537597894668579,
"learning_rate": 0.002828,
"loss": 2.4663,
"step": 15296
},
{
"epoch": 1.152720450281426,
"grad_norm": 0.36247050762176514,
"learning_rate": 0.002828,
"loss": 2.4699,
"step": 15360
},
{
"epoch": 1.1575234521575986,
"grad_norm": 0.3772297501564026,
"learning_rate": 0.002828,
"loss": 2.4734,
"step": 15424
},
{
"epoch": 1.1623264540337712,
"grad_norm": 0.3789200186729431,
"learning_rate": 0.002828,
"loss": 2.4696,
"step": 15488
},
{
"epoch": 1.1671294559099437,
"grad_norm": 0.36870113015174866,
"learning_rate": 0.002828,
"loss": 2.4671,
"step": 15552
},
{
"epoch": 1.1719324577861163,
"grad_norm": 0.37448298931121826,
"learning_rate": 0.002828,
"loss": 2.462,
"step": 15616
},
{
"epoch": 1.176735459662289,
"grad_norm": 0.4384878873825073,
"learning_rate": 0.002828,
"loss": 2.4648,
"step": 15680
},
{
"epoch": 1.1815384615384614,
"grad_norm": 0.37811148166656494,
"learning_rate": 0.002828,
"loss": 2.4598,
"step": 15744
},
{
"epoch": 1.186341463414634,
"grad_norm": 0.4190385341644287,
"learning_rate": 0.002828,
"loss": 2.4643,
"step": 15808
},
{
"epoch": 1.1911444652908068,
"grad_norm": 0.48885485529899597,
"learning_rate": 0.002828,
"loss": 2.4564,
"step": 15872
},
{
"epoch": 1.1959474671669794,
"grad_norm": 0.42267611622810364,
"learning_rate": 0.002828,
"loss": 2.4671,
"step": 15936
},
{
"epoch": 1.200750469043152,
"grad_norm": 0.3886626064777374,
"learning_rate": 0.002828,
"loss": 2.4715,
"step": 16000
},
{
"epoch": 1.2055534709193245,
"grad_norm": 0.40871456265449524,
"learning_rate": 0.002828,
"loss": 2.4558,
"step": 16064
},
{
"epoch": 1.2103564727954972,
"grad_norm": 0.46952739357948303,
"learning_rate": 0.002828,
"loss": 2.4497,
"step": 16128
},
{
"epoch": 1.2151594746716698,
"grad_norm": 0.41340023279190063,
"learning_rate": 0.002828,
"loss": 2.4402,
"step": 16192
},
{
"epoch": 1.2199624765478423,
"grad_norm": 0.36176440119743347,
"learning_rate": 0.002828,
"loss": 2.4473,
"step": 16256
},
{
"epoch": 1.224765478424015,
"grad_norm": 0.4117899239063263,
"learning_rate": 0.002828,
"loss": 2.443,
"step": 16320
},
{
"epoch": 1.2295684803001876,
"grad_norm": 0.5039286613464355,
"learning_rate": 0.002828,
"loss": 2.4557,
"step": 16384
},
{
"epoch": 1.2343714821763603,
"grad_norm": 0.3716677129268646,
"learning_rate": 0.002828,
"loss": 2.4522,
"step": 16448
},
{
"epoch": 1.239174484052533,
"grad_norm": 0.42316168546676636,
"learning_rate": 0.002828,
"loss": 2.4424,
"step": 16512
},
{
"epoch": 1.2439774859287054,
"grad_norm": 0.5081620216369629,
"learning_rate": 0.002828,
"loss": 2.4325,
"step": 16576
},
{
"epoch": 1.248780487804878,
"grad_norm": 0.39409589767456055,
"learning_rate": 0.002828,
"loss": 2.435,
"step": 16640
},
{
"epoch": 1.2535834896810507,
"grad_norm": 0.38638824224472046,
"learning_rate": 0.002828,
"loss": 2.4363,
"step": 16704
},
{
"epoch": 1.2583864915572232,
"grad_norm": 0.41918718814849854,
"learning_rate": 0.002828,
"loss": 2.4404,
"step": 16768
},
{
"epoch": 1.2631894934333958,
"grad_norm": 0.3932395279407501,
"learning_rate": 0.002828,
"loss": 2.4403,
"step": 16832
},
{
"epoch": 1.2679924953095685,
"grad_norm": 0.3787371814250946,
"learning_rate": 0.002828,
"loss": 2.4386,
"step": 16896
},
{
"epoch": 1.2727954971857411,
"grad_norm": 0.40612953901290894,
"learning_rate": 0.002828,
"loss": 2.4219,
"step": 16960
},
{
"epoch": 1.2775984990619138,
"grad_norm": 0.4243071675300598,
"learning_rate": 0.002828,
"loss": 2.4261,
"step": 17024
},
{
"epoch": 1.2824015009380862,
"grad_norm": 0.4240303039550781,
"learning_rate": 0.002828,
"loss": 2.444,
"step": 17088
},
{
"epoch": 1.287204502814259,
"grad_norm": 0.4888259470462799,
"learning_rate": 0.002828,
"loss": 2.4344,
"step": 17152
},
{
"epoch": 1.2920075046904316,
"grad_norm": 0.4678399860858917,
"learning_rate": 0.002828,
"loss": 2.4306,
"step": 17216
},
{
"epoch": 1.296810506566604,
"grad_norm": 0.38733649253845215,
"learning_rate": 0.002828,
"loss": 2.431,
"step": 17280
},
{
"epoch": 1.3016135084427767,
"grad_norm": 0.38587358593940735,
"learning_rate": 0.002828,
"loss": 2.4205,
"step": 17344
},
{
"epoch": 1.3064165103189493,
"grad_norm": 0.39998751878738403,
"learning_rate": 0.002828,
"loss": 2.4336,
"step": 17408
},
{
"epoch": 1.311219512195122,
"grad_norm": 0.36294978857040405,
"learning_rate": 0.002828,
"loss": 2.4238,
"step": 17472
},
{
"epoch": 1.3160225140712947,
"grad_norm": 0.3924562633037567,
"learning_rate": 0.002828,
"loss": 2.4197,
"step": 17536
},
{
"epoch": 1.320825515947467,
"grad_norm": 0.3837553560733795,
"learning_rate": 0.002828,
"loss": 2.4243,
"step": 17600
},
{
"epoch": 1.3256285178236398,
"grad_norm": 0.38875913619995117,
"learning_rate": 0.002828,
"loss": 2.4172,
"step": 17664
},
{
"epoch": 1.3304315196998124,
"grad_norm": 0.41738125681877136,
"learning_rate": 0.002828,
"loss": 2.4225,
"step": 17728
},
{
"epoch": 1.3352345215759849,
"grad_norm": 0.3645491898059845,
"learning_rate": 0.002828,
"loss": 2.4151,
"step": 17792
},
{
"epoch": 1.3400375234521575,
"grad_norm": 0.43829870223999023,
"learning_rate": 0.002828,
"loss": 2.4099,
"step": 17856
},
{
"epoch": 1.3448405253283302,
"grad_norm": 0.3851640820503235,
"learning_rate": 0.002828,
"loss": 2.4168,
"step": 17920
},
{
"epoch": 1.3496435272045029,
"grad_norm": 0.36147060990333557,
"learning_rate": 0.002828,
"loss": 2.4085,
"step": 17984
},
{
"epoch": 1.3544465290806755,
"grad_norm": 0.42050638794898987,
"learning_rate": 0.002828,
"loss": 2.4121,
"step": 18048
},
{
"epoch": 1.359249530956848,
"grad_norm": 0.3830699920654297,
"learning_rate": 0.002828,
"loss": 2.4095,
"step": 18112
},
{
"epoch": 1.3640525328330206,
"grad_norm": 0.3830968737602234,
"learning_rate": 0.002828,
"loss": 2.4077,
"step": 18176
},
{
"epoch": 1.3688555347091933,
"grad_norm": 0.3880060017108917,
"learning_rate": 0.002828,
"loss": 2.4124,
"step": 18240
},
{
"epoch": 1.3736585365853657,
"grad_norm": 0.45445796847343445,
"learning_rate": 0.002828,
"loss": 2.4014,
"step": 18304
},
{
"epoch": 1.3784615384615384,
"grad_norm": 0.3750540316104889,
"learning_rate": 0.002828,
"loss": 2.4003,
"step": 18368
},
{
"epoch": 1.383264540337711,
"grad_norm": 0.3783455193042755,
"learning_rate": 0.002828,
"loss": 2.3983,
"step": 18432
},
{
"epoch": 1.3880675422138837,
"grad_norm": 0.40336528420448303,
"learning_rate": 0.002828,
"loss": 2.4105,
"step": 18496
},
{
"epoch": 1.3928705440900564,
"grad_norm": 0.43220385909080505,
"learning_rate": 0.002828,
"loss": 2.4018,
"step": 18560
},
{
"epoch": 1.3976735459662288,
"grad_norm": 0.4069630205631256,
"learning_rate": 0.002828,
"loss": 2.4049,
"step": 18624
},
{
"epoch": 1.4024765478424015,
"grad_norm": 0.3866819441318512,
"learning_rate": 0.002828,
"loss": 2.3917,
"step": 18688
},
{
"epoch": 1.4072795497185742,
"grad_norm": 0.3699668347835541,
"learning_rate": 0.002828,
"loss": 2.3908,
"step": 18752
},
{
"epoch": 1.4120825515947466,
"grad_norm": 0.377645879983902,
"learning_rate": 0.002828,
"loss": 2.3957,
"step": 18816
},
{
"epoch": 1.4168855534709193,
"grad_norm": 0.36612892150878906,
"learning_rate": 0.002828,
"loss": 2.3973,
"step": 18880
},
{
"epoch": 1.421688555347092,
"grad_norm": 0.385735422372818,
"learning_rate": 0.002828,
"loss": 2.3952,
"step": 18944
},
{
"epoch": 1.4264915572232646,
"grad_norm": 0.4026818871498108,
"learning_rate": 0.002828,
"loss": 2.3908,
"step": 19008
},
{
"epoch": 1.4312945590994373,
"grad_norm": 0.39212891459465027,
"learning_rate": 0.002828,
"loss": 2.3923,
"step": 19072
},
{
"epoch": 1.4360975609756097,
"grad_norm": 0.43533411622047424,
"learning_rate": 0.002828,
"loss": 2.3918,
"step": 19136
},
{
"epoch": 1.4409005628517824,
"grad_norm": 0.4136466383934021,
"learning_rate": 0.002828,
"loss": 2.3885,
"step": 19200
},
{
"epoch": 1.445703564727955,
"grad_norm": 0.38349345326423645,
"learning_rate": 0.002828,
"loss": 2.3891,
"step": 19264
},
{
"epoch": 1.4505065666041275,
"grad_norm": 0.42666760087013245,
"learning_rate": 0.002828,
"loss": 2.3725,
"step": 19328
},
{
"epoch": 1.4553095684803001,
"grad_norm": 0.3926577866077423,
"learning_rate": 0.002828,
"loss": 2.3885,
"step": 19392
},
{
"epoch": 1.4601125703564728,
"grad_norm": 0.3736414611339569,
"learning_rate": 0.002828,
"loss": 2.3855,
"step": 19456
},
{
"epoch": 1.4649155722326455,
"grad_norm": 0.36343908309936523,
"learning_rate": 0.002828,
"loss": 2.3773,
"step": 19520
},
{
"epoch": 1.4697185741088181,
"grad_norm": 0.380211740732193,
"learning_rate": 0.002828,
"loss": 2.3809,
"step": 19584
},
{
"epoch": 1.4745215759849906,
"grad_norm": 0.40481454133987427,
"learning_rate": 0.002828,
"loss": 2.375,
"step": 19648
},
{
"epoch": 1.4793245778611632,
"grad_norm": 0.45368635654449463,
"learning_rate": 0.002828,
"loss": 2.3707,
"step": 19712
},
{
"epoch": 1.484127579737336,
"grad_norm": 0.4029395580291748,
"learning_rate": 0.002828,
"loss": 2.3733,
"step": 19776
},
{
"epoch": 1.4889305816135083,
"grad_norm": 0.3748946785926819,
"learning_rate": 0.002828,
"loss": 2.3739,
"step": 19840
},
{
"epoch": 1.493733583489681,
"grad_norm": 0.36640551686286926,
"learning_rate": 0.002828,
"loss": 2.3652,
"step": 19904
},
{
"epoch": 1.4985365853658537,
"grad_norm": 0.4150533676147461,
"learning_rate": 0.002828,
"loss": 2.3709,
"step": 19968
},
{
"epoch": 1.5033395872420263,
"grad_norm": 0.49730879068374634,
"learning_rate": 0.002828,
"loss": 2.3668,
"step": 20032
},
{
"epoch": 1.508142589118199,
"grad_norm": 0.37675461173057556,
"learning_rate": 0.002828,
"loss": 2.3695,
"step": 20096
},
{
"epoch": 1.5129455909943714,
"grad_norm": 0.3647516965866089,
"learning_rate": 0.002828,
"loss": 2.3733,
"step": 20160
},
{
"epoch": 1.517748592870544,
"grad_norm": 1.2981253862380981,
"learning_rate": 0.002828,
"loss": 2.369,
"step": 20224
},
{
"epoch": 1.5225515947467168,
"grad_norm": 0.5044511556625366,
"learning_rate": 0.002828,
"loss": 2.3578,
"step": 20288
},
{
"epoch": 1.5273545966228892,
"grad_norm": 0.3651883006095886,
"learning_rate": 0.002828,
"loss": 2.3601,
"step": 20352
},
{
"epoch": 1.532157598499062,
"grad_norm": 0.4419403076171875,
"learning_rate": 0.002828,
"loss": 2.3607,
"step": 20416
},
{
"epoch": 1.5369606003752345,
"grad_norm": 0.38631224632263184,
"learning_rate": 0.002828,
"loss": 2.3619,
"step": 20480
},
{
"epoch": 1.5417636022514072,
"grad_norm": 0.34725359082221985,
"learning_rate": 0.002828,
"loss": 2.3573,
"step": 20544
},
{
"epoch": 1.5465666041275798,
"grad_norm": 0.3991786241531372,
"learning_rate": 0.002828,
"loss": 2.357,
"step": 20608
},
{
"epoch": 1.5513696060037523,
"grad_norm": 0.3595084846019745,
"learning_rate": 0.002828,
"loss": 2.357,
"step": 20672
},
{
"epoch": 1.556172607879925,
"grad_norm": 0.4021853804588318,
"learning_rate": 0.002828,
"loss": 2.3537,
"step": 20736
},
{
"epoch": 1.5609756097560976,
"grad_norm": 0.3939075767993927,
"learning_rate": 0.002828,
"loss": 2.3594,
"step": 20800
},
{
"epoch": 1.56577861163227,
"grad_norm": 0.3889540135860443,
"learning_rate": 0.002828,
"loss": 2.3573,
"step": 20864
},
{
"epoch": 1.570581613508443,
"grad_norm": 0.41366517543792725,
"learning_rate": 0.002828,
"loss": 2.3442,
"step": 20928
},
{
"epoch": 1.5753846153846154,
"grad_norm": 0.37127187848091125,
"learning_rate": 0.002828,
"loss": 2.3457,
"step": 20992
},
{
"epoch": 1.580187617260788,
"grad_norm": 0.4014946520328522,
"learning_rate": 0.002828,
"loss": 2.3457,
"step": 21056
},
{
"epoch": 1.5849906191369607,
"grad_norm": 0.35794708132743835,
"learning_rate": 0.002828,
"loss": 2.3508,
"step": 21120
},
{
"epoch": 1.5897936210131332,
"grad_norm": 0.3924767076969147,
"learning_rate": 0.002828,
"loss": 2.3451,
"step": 21184
},
{
"epoch": 1.5945966228893058,
"grad_norm": 0.34789031744003296,
"learning_rate": 0.002828,
"loss": 2.3444,
"step": 21248
},
{
"epoch": 1.5993996247654785,
"grad_norm": 0.37461933493614197,
"learning_rate": 0.002828,
"loss": 2.3385,
"step": 21312
},
{
"epoch": 1.604202626641651,
"grad_norm": 0.40146076679229736,
"learning_rate": 0.002828,
"loss": 2.3406,
"step": 21376
},
{
"epoch": 1.6090056285178238,
"grad_norm": 0.4080921411514282,
"learning_rate": 0.002828,
"loss": 2.3423,
"step": 21440
},
{
"epoch": 1.6138086303939962,
"grad_norm": 0.40802744030952454,
"learning_rate": 0.002828,
"loss": 2.3377,
"step": 21504
},
{
"epoch": 1.618611632270169,
"grad_norm": 0.420188307762146,
"learning_rate": 0.002828,
"loss": 2.3399,
"step": 21568
},
{
"epoch": 1.6234146341463416,
"grad_norm": 0.40739214420318604,
"learning_rate": 0.002828,
"loss": 2.34,
"step": 21632
},
{
"epoch": 1.628217636022514,
"grad_norm": 0.41674676537513733,
"learning_rate": 0.002828,
"loss": 2.3326,
"step": 21696
},
{
"epoch": 1.6330206378986867,
"grad_norm": 0.41856762766838074,
"learning_rate": 0.002828,
"loss": 2.3366,
"step": 21760
},
{
"epoch": 1.6378236397748593,
"grad_norm": 0.39763346314430237,
"learning_rate": 0.002828,
"loss": 2.3339,
"step": 21824
},
{
"epoch": 1.6426266416510318,
"grad_norm": 0.3777034282684326,
"learning_rate": 0.002828,
"loss": 2.343,
"step": 21888
},
{
"epoch": 1.6474296435272047,
"grad_norm": 0.3617188036441803,
"learning_rate": 0.002828,
"loss": 2.3341,
"step": 21952
},
{
"epoch": 1.652232645403377,
"grad_norm": 0.4504718482494354,
"learning_rate": 0.002828,
"loss": 2.3295,
"step": 22016
},
{
"epoch": 1.6570356472795498,
"grad_norm": 0.37388357520103455,
"learning_rate": 0.002828,
"loss": 2.3408,
"step": 22080
},
{
"epoch": 1.6618386491557224,
"grad_norm": 0.3807313144207001,
"learning_rate": 0.002828,
"loss": 2.3249,
"step": 22144
},
{
"epoch": 1.6666416510318949,
"grad_norm": 0.4428509771823883,
"learning_rate": 0.002828,
"loss": 2.3347,
"step": 22208
},
{
"epoch": 1.6714446529080675,
"grad_norm": 0.39028382301330566,
"learning_rate": 0.002828,
"loss": 2.336,
"step": 22272
},
{
"epoch": 1.6762476547842402,
"grad_norm": 0.482424259185791,
"learning_rate": 0.002828,
"loss": 2.3212,
"step": 22336
},
{
"epoch": 1.6810506566604126,
"grad_norm": 0.39801299571990967,
"learning_rate": 0.002828,
"loss": 2.314,
"step": 22400
},
{
"epoch": 1.6858536585365855,
"grad_norm": 0.4351527988910675,
"learning_rate": 0.002828,
"loss": 2.3223,
"step": 22464
},
{
"epoch": 1.690656660412758,
"grad_norm": 0.4509490430355072,
"learning_rate": 0.002828,
"loss": 2.3246,
"step": 22528
},
{
"epoch": 1.6954596622889306,
"grad_norm": 0.35885152220726013,
"learning_rate": 0.002828,
"loss": 2.319,
"step": 22592
},
{
"epoch": 1.7002626641651033,
"grad_norm": 0.4146900177001953,
"learning_rate": 0.002828,
"loss": 2.3214,
"step": 22656
},
{
"epoch": 1.7050656660412757,
"grad_norm": 0.40194573998451233,
"learning_rate": 0.002828,
"loss": 2.322,
"step": 22720
},
{
"epoch": 1.7098686679174484,
"grad_norm": 0.43570390343666077,
"learning_rate": 0.002828,
"loss": 2.3241,
"step": 22784
},
{
"epoch": 1.714671669793621,
"grad_norm": 0.35558512806892395,
"learning_rate": 0.002828,
"loss": 2.3193,
"step": 22848
},
{
"epoch": 1.7194746716697935,
"grad_norm": 0.3700902760028839,
"learning_rate": 0.002828,
"loss": 2.3219,
"step": 22912
},
{
"epoch": 1.7242776735459664,
"grad_norm": 0.4287453591823578,
"learning_rate": 0.002828,
"loss": 2.3078,
"step": 22976
},
{
"epoch": 1.7290806754221388,
"grad_norm": 0.41843536496162415,
"learning_rate": 0.002828,
"loss": 2.3103,
"step": 23040
},
{
"epoch": 1.7338836772983115,
"grad_norm": 0.3938317596912384,
"learning_rate": 0.002828,
"loss": 2.3176,
"step": 23104
},
{
"epoch": 1.7386866791744842,
"grad_norm": 0.44625958800315857,
"learning_rate": 0.002828,
"loss": 2.307,
"step": 23168
},
{
"epoch": 1.7434896810506566,
"grad_norm": 0.4598078727722168,
"learning_rate": 0.002828,
"loss": 2.2958,
"step": 23232
},
{
"epoch": 1.7482926829268293,
"grad_norm": 0.4126788377761841,
"learning_rate": 0.002828,
"loss": 2.3094,
"step": 23296
},
{
"epoch": 1.753095684803002,
"grad_norm": 0.3801914155483246,
"learning_rate": 0.002828,
"loss": 2.3048,
"step": 23360
},
{
"epoch": 1.7578986866791744,
"grad_norm": 0.4619985818862915,
"learning_rate": 0.002828,
"loss": 2.3069,
"step": 23424
},
{
"epoch": 1.7627016885553473,
"grad_norm": 0.4068593680858612,
"learning_rate": 0.002828,
"loss": 2.299,
"step": 23488
},
{
"epoch": 1.7675046904315197,
"grad_norm": 0.36146870255470276,
"learning_rate": 0.002828,
"loss": 2.3042,
"step": 23552
},
{
"epoch": 1.7723076923076924,
"grad_norm": 0.3995908200740814,
"learning_rate": 0.002828,
"loss": 2.3006,
"step": 23616
},
{
"epoch": 1.777110694183865,
"grad_norm": 0.3970596492290497,
"learning_rate": 0.002828,
"loss": 2.313,
"step": 23680
},
{
"epoch": 1.7819136960600375,
"grad_norm": 0.4287073612213135,
"learning_rate": 0.002828,
"loss": 2.2974,
"step": 23744
},
{
"epoch": 1.7867166979362101,
"grad_norm": 0.41250482201576233,
"learning_rate": 0.002828,
"loss": 2.2937,
"step": 23808
},
{
"epoch": 1.7915196998123828,
"grad_norm": 0.411668062210083,
"learning_rate": 0.002828,
"loss": 2.2994,
"step": 23872
},
{
"epoch": 1.7963227016885552,
"grad_norm": 0.4834740459918976,
"learning_rate": 0.002828,
"loss": 2.2895,
"step": 23936
},
{
"epoch": 1.8011257035647281,
"grad_norm": 0.3624022603034973,
"learning_rate": 0.002828,
"loss": 2.2888,
"step": 24000
},
{
"epoch": 1.8059287054409006,
"grad_norm": 0.36700454354286194,
"learning_rate": 0.002828,
"loss": 2.2917,
"step": 24064
},
{
"epoch": 1.8107317073170732,
"grad_norm": 0.3666454255580902,
"learning_rate": 0.002828,
"loss": 2.2896,
"step": 24128
},
{
"epoch": 1.8155347091932459,
"grad_norm": 0.4110506474971771,
"learning_rate": 0.002828,
"loss": 2.2947,
"step": 24192
},
{
"epoch": 1.8203377110694183,
"grad_norm": 0.3604464530944824,
"learning_rate": 0.002828,
"loss": 2.289,
"step": 24256
},
{
"epoch": 1.825140712945591,
"grad_norm": 0.40807706117630005,
"learning_rate": 0.002828,
"loss": 2.288,
"step": 24320
},
{
"epoch": 1.8299437148217637,
"grad_norm": 0.3632533848285675,
"learning_rate": 0.002828,
"loss": 2.29,
"step": 24384
},
{
"epoch": 1.834746716697936,
"grad_norm": 0.38520562648773193,
"learning_rate": 0.002828,
"loss": 2.2805,
"step": 24448
},
{
"epoch": 1.839549718574109,
"grad_norm": 0.4228810966014862,
"learning_rate": 0.002828,
"loss": 2.2842,
"step": 24512
},
{
"epoch": 1.8443527204502814,
"grad_norm": 0.4542325735092163,
"learning_rate": 0.002828,
"loss": 2.2781,
"step": 24576
},
{
"epoch": 1.849155722326454,
"grad_norm": 0.37316882610321045,
"learning_rate": 0.002828,
"loss": 2.2829,
"step": 24640
},
{
"epoch": 1.8539587242026268,
"grad_norm": 0.5505624413490295,
"learning_rate": 0.002828,
"loss": 2.2942,
"step": 24704
},
{
"epoch": 1.8587617260787992,
"grad_norm": 0.4269484281539917,
"learning_rate": 0.002828,
"loss": 2.2793,
"step": 24768
},
{
"epoch": 1.8635647279549719,
"grad_norm": 0.407760888338089,
"learning_rate": 0.002828,
"loss": 2.2803,
"step": 24832
},
{
"epoch": 1.8683677298311445,
"grad_norm": 0.4192192554473877,
"learning_rate": 0.002828,
"loss": 2.2818,
"step": 24896
},
{
"epoch": 1.873170731707317,
"grad_norm": 0.3924838602542877,
"learning_rate": 0.002828,
"loss": 2.2757,
"step": 24960
},
{
"epoch": 1.8779737335834898,
"grad_norm": 0.3799656629562378,
"learning_rate": 0.002828,
"loss": 2.2695,
"step": 25024
},
{
"epoch": 1.8827767354596623,
"grad_norm": 0.40570494532585144,
"learning_rate": 0.002828,
"loss": 2.2801,
"step": 25088
},
{
"epoch": 1.887579737335835,
"grad_norm": 0.3898228704929352,
"learning_rate": 0.002828,
"loss": 2.2709,
"step": 25152
},
{
"epoch": 1.8923827392120076,
"grad_norm": 0.393216073513031,
"learning_rate": 0.002828,
"loss": 2.282,
"step": 25216
},
{
"epoch": 1.89718574108818,
"grad_norm": 0.4247749149799347,
"learning_rate": 0.002828,
"loss": 2.2776,
"step": 25280
},
{
"epoch": 1.9019887429643527,
"grad_norm": 0.4670035243034363,
"learning_rate": 0.002828,
"loss": 2.2754,
"step": 25344
},
{
"epoch": 1.9067917448405254,
"grad_norm": 0.40336644649505615,
"learning_rate": 0.002828,
"loss": 2.2768,
"step": 25408
},
{
"epoch": 1.9115947467166978,
"grad_norm": 0.48462921380996704,
"learning_rate": 0.002828,
"loss": 2.2634,
"step": 25472
},
{
"epoch": 1.9163977485928707,
"grad_norm": 0.44047805666923523,
"learning_rate": 0.002828,
"loss": 2.2674,
"step": 25536
},
{
"epoch": 1.9212007504690432,
"grad_norm": 0.4221409261226654,
"learning_rate": 0.002828,
"loss": 2.2724,
"step": 25600
},
{
"epoch": 1.9260037523452158,
"grad_norm": 0.4272362291812897,
"learning_rate": 0.002828,
"loss": 2.2683,
"step": 25664
},
{
"epoch": 1.9308067542213885,
"grad_norm": 0.4309645891189575,
"learning_rate": 0.002828,
"loss": 2.2612,
"step": 25728
},
{
"epoch": 1.935609756097561,
"grad_norm": 0.4220867156982422,
"learning_rate": 0.002828,
"loss": 2.2665,
"step": 25792
},
{
"epoch": 1.9404127579737336,
"grad_norm": 0.3765920102596283,
"learning_rate": 0.002828,
"loss": 2.2652,
"step": 25856
},
{
"epoch": 1.9452157598499062,
"grad_norm": 0.44643986225128174,
"learning_rate": 0.002828,
"loss": 2.2627,
"step": 25920
},
{
"epoch": 1.9500187617260787,
"grad_norm": 0.4022061824798584,
"learning_rate": 0.002828,
"loss": 2.2665,
"step": 25984
},
{
"epoch": 1.9548217636022516,
"grad_norm": 0.3935778737068176,
"learning_rate": 0.002828,
"loss": 2.2585,
"step": 26048
},
{
"epoch": 1.959624765478424,
"grad_norm": 0.3877500295639038,
"learning_rate": 0.002828,
"loss": 2.2629,
"step": 26112
},
{
"epoch": 1.9644277673545967,
"grad_norm": 0.3891729712486267,
"learning_rate": 0.002828,
"loss": 2.2594,
"step": 26176
},
{
"epoch": 1.9692307692307693,
"grad_norm": 0.3616099953651428,
"learning_rate": 0.002828,
"loss": 2.2601,
"step": 26240
},
{
"epoch": 1.9740337711069418,
"grad_norm": 0.3855280578136444,
"learning_rate": 0.002828,
"loss": 2.2677,
"step": 26304
},
{
"epoch": 1.9788367729831144,
"grad_norm": 0.44039493799209595,
"learning_rate": 0.002828,
"loss": 2.2641,
"step": 26368
},
{
"epoch": 1.983639774859287,
"grad_norm": 0.37217262387275696,
"learning_rate": 0.002828,
"loss": 2.2603,
"step": 26432
},
{
"epoch": 1.9884427767354595,
"grad_norm": 0.3942553997039795,
"learning_rate": 0.002828,
"loss": 2.2508,
"step": 26496
},
{
"epoch": 1.9932457786116324,
"grad_norm": 0.3975297808647156,
"learning_rate": 0.002828,
"loss": 2.2466,
"step": 26560
},
{
"epoch": 1.9980487804878049,
"grad_norm": 0.39197394251823425,
"learning_rate": 0.002828,
"loss": 2.2515,
"step": 26624
},
{
"epoch": 2.0028517823639773,
"grad_norm": 0.38722801208496094,
"learning_rate": 0.002828,
"loss": 2.2354,
"step": 26688
},
{
"epoch": 2.00765478424015,
"grad_norm": 0.38619640469551086,
"learning_rate": 0.002828,
"loss": 2.2152,
"step": 26752
},
{
"epoch": 2.0124577861163226,
"grad_norm": 0.49529945850372314,
"learning_rate": 0.002828,
"loss": 2.2167,
"step": 26816
},
{
"epoch": 2.0172607879924955,
"grad_norm": 0.4199656844139099,
"learning_rate": 0.002828,
"loss": 2.2092,
"step": 26880
},
{
"epoch": 2.022063789868668,
"grad_norm": 0.45820868015289307,
"learning_rate": 0.002828,
"loss": 2.2249,
"step": 26944
},
{
"epoch": 2.0268667917448404,
"grad_norm": 0.4006725251674652,
"learning_rate": 0.002828,
"loss": 2.2165,
"step": 27008
},
{
"epoch": 2.0316697936210133,
"grad_norm": 0.4596467614173889,
"learning_rate": 0.002828,
"loss": 2.2154,
"step": 27072
},
{
"epoch": 2.0364727954971857,
"grad_norm": 0.38660213351249695,
"learning_rate": 0.002828,
"loss": 2.2062,
"step": 27136
},
{
"epoch": 2.041275797373358,
"grad_norm": 0.44082361459732056,
"learning_rate": 0.002828,
"loss": 2.2124,
"step": 27200
},
{
"epoch": 2.046078799249531,
"grad_norm": 0.3886605203151703,
"learning_rate": 0.002828,
"loss": 2.2182,
"step": 27264
},
{
"epoch": 2.0508818011257035,
"grad_norm": 0.41386017203330994,
"learning_rate": 0.002828,
"loss": 2.2168,
"step": 27328
},
{
"epoch": 2.055684803001876,
"grad_norm": 0.411478191614151,
"learning_rate": 0.002828,
"loss": 2.2092,
"step": 27392
},
{
"epoch": 2.060487804878049,
"grad_norm": 0.47288912534713745,
"learning_rate": 0.002828,
"loss": 2.21,
"step": 27456
},
{
"epoch": 2.0652908067542213,
"grad_norm": 0.36384883522987366,
"learning_rate": 0.002828,
"loss": 2.2095,
"step": 27520
},
{
"epoch": 2.070093808630394,
"grad_norm": 0.40636852383613586,
"learning_rate": 0.002828,
"loss": 2.2092,
"step": 27584
},
{
"epoch": 2.0748968105065666,
"grad_norm": 0.4425170421600342,
"learning_rate": 0.002828,
"loss": 2.212,
"step": 27648
},
{
"epoch": 2.079699812382739,
"grad_norm": 0.48468896746635437,
"learning_rate": 0.002828,
"loss": 2.2078,
"step": 27712
},
{
"epoch": 2.084502814258912,
"grad_norm": 0.40420570969581604,
"learning_rate": 0.002828,
"loss": 2.2158,
"step": 27776
},
{
"epoch": 2.0893058161350844,
"grad_norm": 0.44314709305763245,
"learning_rate": 0.002828,
"loss": 2.2121,
"step": 27840
},
{
"epoch": 2.0941088180112573,
"grad_norm": 0.5187743306159973,
"learning_rate": 0.002828,
"loss": 2.2138,
"step": 27904
},
{
"epoch": 2.0989118198874297,
"grad_norm": 0.4796048104763031,
"learning_rate": 0.002828,
"loss": 2.2027,
"step": 27968
},
{
"epoch": 2.103714821763602,
"grad_norm": 0.43605130910873413,
"learning_rate": 0.002828,
"loss": 2.2029,
"step": 28032
},
{
"epoch": 2.108517823639775,
"grad_norm": 0.4523628056049347,
"learning_rate": 0.002828,
"loss": 2.2038,
"step": 28096
},
{
"epoch": 2.1133208255159475,
"grad_norm": 0.4183247983455658,
"learning_rate": 0.002828,
"loss": 2.2026,
"step": 28160
},
{
"epoch": 2.11812382739212,
"grad_norm": 0.5113268494606018,
"learning_rate": 0.002828,
"loss": 2.1998,
"step": 28224
},
{
"epoch": 2.122926829268293,
"grad_norm": 0.40837016701698303,
"learning_rate": 0.002828,
"loss": 2.2054,
"step": 28288
},
{
"epoch": 2.1277298311444652,
"grad_norm": 0.40093889832496643,
"learning_rate": 0.002828,
"loss": 2.208,
"step": 28352
},
{
"epoch": 2.1325328330206377,
"grad_norm": 0.3988894820213318,
"learning_rate": 0.002828,
"loss": 2.2028,
"step": 28416
},
{
"epoch": 2.1373358348968106,
"grad_norm": 0.42024731636047363,
"learning_rate": 0.002828,
"loss": 2.1952,
"step": 28480
},
{
"epoch": 2.142138836772983,
"grad_norm": 0.38691264390945435,
"learning_rate": 0.002828,
"loss": 2.2035,
"step": 28544
},
{
"epoch": 2.146941838649156,
"grad_norm": 0.41956332325935364,
"learning_rate": 0.002828,
"loss": 2.196,
"step": 28608
},
{
"epoch": 2.1517448405253283,
"grad_norm": 0.4035188555717468,
"learning_rate": 0.002828,
"loss": 2.2038,
"step": 28672
},
{
"epoch": 2.1565478424015008,
"grad_norm": 0.35282230377197266,
"learning_rate": 0.002828,
"loss": 2.1835,
"step": 28736
},
{
"epoch": 2.1613508442776737,
"grad_norm": 0.43618568778038025,
"learning_rate": 0.002828,
"loss": 2.1946,
"step": 28800
},
{
"epoch": 2.166153846153846,
"grad_norm": 0.4310976564884186,
"learning_rate": 0.002828,
"loss": 2.1873,
"step": 28864
},
{
"epoch": 2.170956848030019,
"grad_norm": 0.4475420415401459,
"learning_rate": 0.002828,
"loss": 2.1946,
"step": 28928
},
{
"epoch": 2.1757598499061914,
"grad_norm": 0.4384845197200775,
"learning_rate": 0.002828,
"loss": 2.1935,
"step": 28992
},
{
"epoch": 2.180562851782364,
"grad_norm": 0.40141811966896057,
"learning_rate": 0.002828,
"loss": 2.1925,
"step": 29056
},
{
"epoch": 2.1853658536585368,
"grad_norm": 0.3754780888557434,
"learning_rate": 0.002828,
"loss": 2.19,
"step": 29120
},
{
"epoch": 2.190168855534709,
"grad_norm": 0.40471306443214417,
"learning_rate": 0.002828,
"loss": 2.1915,
"step": 29184
},
{
"epoch": 2.1949718574108816,
"grad_norm": 1.464024543762207,
"learning_rate": 0.002828,
"loss": 2.1909,
"step": 29248
},
{
"epoch": 2.1997748592870545,
"grad_norm": 0.3818819522857666,
"learning_rate": 0.002828,
"loss": 2.1915,
"step": 29312
},
{
"epoch": 2.204577861163227,
"grad_norm": 0.3688436448574066,
"learning_rate": 0.002828,
"loss": 2.184,
"step": 29376
},
{
"epoch": 2.2093808630393994,
"grad_norm": 0.4367921054363251,
"learning_rate": 0.002828,
"loss": 2.185,
"step": 29440
},
{
"epoch": 2.2141838649155723,
"grad_norm": 0.3566763401031494,
"learning_rate": 0.002828,
"loss": 2.1871,
"step": 29504
},
{
"epoch": 2.2189868667917447,
"grad_norm": 0.4481133222579956,
"learning_rate": 0.002828,
"loss": 2.1826,
"step": 29568
},
{
"epoch": 2.2237898686679176,
"grad_norm": 0.44622039794921875,
"learning_rate": 0.002828,
"loss": 2.1885,
"step": 29632
},
{
"epoch": 2.22859287054409,
"grad_norm": 0.4857657253742218,
"learning_rate": 0.002828,
"loss": 2.1784,
"step": 29696
},
{
"epoch": 2.2333958724202625,
"grad_norm": 0.41923773288726807,
"learning_rate": 0.002828,
"loss": 2.1807,
"step": 29760
},
{
"epoch": 2.2381988742964354,
"grad_norm": 0.4176802933216095,
"learning_rate": 0.002828,
"loss": 2.1798,
"step": 29824
},
{
"epoch": 2.243001876172608,
"grad_norm": 0.4086935520172119,
"learning_rate": 0.002828,
"loss": 2.1739,
"step": 29888
},
{
"epoch": 2.2478048780487807,
"grad_norm": 0.40138566493988037,
"learning_rate": 0.002828,
"loss": 2.1857,
"step": 29952
},
{
"epoch": 2.252607879924953,
"grad_norm": 0.393996000289917,
"learning_rate": 0.002828,
"loss": 2.1818,
"step": 30016
},
{
"epoch": 2.2574108818011256,
"grad_norm": 0.3962005078792572,
"learning_rate": 0.002828,
"loss": 2.1724,
"step": 30080
},
{
"epoch": 2.2622138836772985,
"grad_norm": 0.41648438572883606,
"learning_rate": 0.002828,
"loss": 2.1835,
"step": 30144
},
{
"epoch": 2.267016885553471,
"grad_norm": 0.3810112774372101,
"learning_rate": 0.002828,
"loss": 2.1736,
"step": 30208
},
{
"epoch": 2.2718198874296434,
"grad_norm": 0.4520975649356842,
"learning_rate": 0.002828,
"loss": 2.1793,
"step": 30272
},
{
"epoch": 2.2766228893058162,
"grad_norm": 0.4406943917274475,
"learning_rate": 0.002828,
"loss": 2.1732,
"step": 30336
},
{
"epoch": 2.2814258911819887,
"grad_norm": 0.4186633825302124,
"learning_rate": 0.002828,
"loss": 2.1774,
"step": 30400
}
],
"logging_steps": 64,
"max_steps": 333125,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 320,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.993193875177472e+17,
"train_batch_size": 200,
"trial_name": null,
"trial_params": null
}
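
The JSON above is the standard Hugging Face Trainer state dump: "log_history" holds one entry per logging step (every 64 steps here, per "logging_steps"), each with "step", "epoch", "loss", "grad_norm", and "learning_rate". A minimal sketch of how one might inspect it, assuming Python with matplotlib installed and the file saved locally as trainer_state.json (the path is a placeholder, not part of the original upload):

# Sketch (assumption, not part of the repo): load trainer_state.json
# and plot the logged training loss against the optimizer step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries would not).
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("GTECorrector_32_nq training loss")
plt.show()

Run against this file, the curve falls from about 4.32 at step 64 to about 2.18 at step 30400, with the learning rate warming up over the first ~1600 steps and then holding at 0.002828.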