{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 894,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011185682326621924,
"grad_norm": 2.231250286102295,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.0507,
"step": 1
},
{
"epoch": 0.0022371364653243847,
"grad_norm": 2.1123249530792236,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.04,
"step": 2
},
{
"epoch": 0.003355704697986577,
"grad_norm": 2.0946707725524902,
"learning_rate": 1.5000000000000002e-07,
"loss": 1.0307,
"step": 3
},
{
"epoch": 0.0044742729306487695,
"grad_norm": 2.0837416648864746,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.0484,
"step": 4
},
{
"epoch": 0.005592841163310962,
"grad_norm": 1.9843275547027588,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.0012,
"step": 5
},
{
"epoch": 0.006711409395973154,
"grad_norm": 2.121988296508789,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.0603,
"step": 6
},
{
"epoch": 0.007829977628635347,
"grad_norm": 2.029029369354248,
"learning_rate": 3.5000000000000004e-07,
"loss": 1.0323,
"step": 7
},
{
"epoch": 0.008948545861297539,
"grad_norm": 1.9815905094146729,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.0273,
"step": 8
},
{
"epoch": 0.010067114093959731,
"grad_norm": 2.3339314460754395,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.0805,
"step": 9
},
{
"epoch": 0.011185682326621925,
"grad_norm": 2.1078243255615234,
"learning_rate": 5.000000000000001e-07,
"loss": 1.0382,
"step": 10
},
{
"epoch": 0.012304250559284116,
"grad_norm": 1.8874777555465698,
"learning_rate": 5.5e-07,
"loss": 1.008,
"step": 11
},
{
"epoch": 0.013422818791946308,
"grad_norm": 1.9720211029052734,
"learning_rate": 6.000000000000001e-07,
"loss": 1.0065,
"step": 12
},
{
"epoch": 0.0145413870246085,
"grad_norm": 2.0002245903015137,
"learning_rate": 6.5e-07,
"loss": 1.0379,
"step": 13
},
{
"epoch": 0.015659955257270694,
"grad_norm": 1.983207106590271,
"learning_rate": 7.000000000000001e-07,
"loss": 1.0271,
"step": 14
},
{
"epoch": 0.016778523489932886,
"grad_norm": 1.886121153831482,
"learning_rate": 7.5e-07,
"loss": 1.0019,
"step": 15
},
{
"epoch": 0.017897091722595078,
"grad_norm": 1.9403958320617676,
"learning_rate": 8.000000000000001e-07,
"loss": 0.9885,
"step": 16
},
{
"epoch": 0.01901565995525727,
"grad_norm": 1.9739996194839478,
"learning_rate": 8.500000000000001e-07,
"loss": 0.9904,
"step": 17
},
{
"epoch": 0.020134228187919462,
"grad_norm": 1.7419469356536865,
"learning_rate": 9.000000000000001e-07,
"loss": 0.9709,
"step": 18
},
{
"epoch": 0.021252796420581657,
"grad_norm": 1.7856152057647705,
"learning_rate": 9.500000000000001e-07,
"loss": 0.9859,
"step": 19
},
{
"epoch": 0.02237136465324385,
"grad_norm": 1.6159933805465698,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.9895,
"step": 20
},
{
"epoch": 0.02348993288590604,
"grad_norm": 1.7010679244995117,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.0115,
"step": 21
},
{
"epoch": 0.024608501118568233,
"grad_norm": 1.7860039472579956,
"learning_rate": 1.1e-06,
"loss": 0.9917,
"step": 22
},
{
"epoch": 0.025727069351230425,
"grad_norm": 1.3735058307647705,
"learning_rate": 1.1500000000000002e-06,
"loss": 0.9441,
"step": 23
},
{
"epoch": 0.026845637583892617,
"grad_norm": 1.439109206199646,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.9304,
"step": 24
},
{
"epoch": 0.02796420581655481,
"grad_norm": 1.380369782447815,
"learning_rate": 1.25e-06,
"loss": 0.9469,
"step": 25
},
{
"epoch": 0.029082774049217,
"grad_norm": 1.2287472486495972,
"learning_rate": 1.3e-06,
"loss": 0.8808,
"step": 26
},
{
"epoch": 0.030201342281879196,
"grad_norm": 1.0899194478988647,
"learning_rate": 1.3500000000000002e-06,
"loss": 0.8912,
"step": 27
},
{
"epoch": 0.03131991051454139,
"grad_norm": 1.0445002317428589,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.876,
"step": 28
},
{
"epoch": 0.03243847874720358,
"grad_norm": 1.0201383829116821,
"learning_rate": 1.45e-06,
"loss": 0.9003,
"step": 29
},
{
"epoch": 0.03355704697986577,
"grad_norm": 0.9528365731239319,
"learning_rate": 1.5e-06,
"loss": 0.8537,
"step": 30
},
{
"epoch": 0.03467561521252797,
"grad_norm": 0.9615768194198608,
"learning_rate": 1.5500000000000002e-06,
"loss": 0.8819,
"step": 31
},
{
"epoch": 0.035794183445190156,
"grad_norm": 0.9578896760940552,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.8859,
"step": 32
},
{
"epoch": 0.03691275167785235,
"grad_norm": 0.977853536605835,
"learning_rate": 1.6500000000000003e-06,
"loss": 0.8835,
"step": 33
},
{
"epoch": 0.03803131991051454,
"grad_norm": 0.8976068496704102,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.8599,
"step": 34
},
{
"epoch": 0.039149888143176735,
"grad_norm": 0.8779590725898743,
"learning_rate": 1.75e-06,
"loss": 0.8708,
"step": 35
},
{
"epoch": 0.040268456375838924,
"grad_norm": 0.853705644607544,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.8465,
"step": 36
},
{
"epoch": 0.04138702460850112,
"grad_norm": 0.8480839729309082,
"learning_rate": 1.85e-06,
"loss": 0.8292,
"step": 37
},
{
"epoch": 0.042505592841163314,
"grad_norm": 0.8372538089752197,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.8026,
"step": 38
},
{
"epoch": 0.0436241610738255,
"grad_norm": 0.8592961430549622,
"learning_rate": 1.9500000000000004e-06,
"loss": 0.8153,
"step": 39
},
{
"epoch": 0.0447427293064877,
"grad_norm": 0.8222276568412781,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.814,
"step": 40
},
{
"epoch": 0.04586129753914989,
"grad_norm": 0.825672447681427,
"learning_rate": 2.05e-06,
"loss": 0.7793,
"step": 41
},
{
"epoch": 0.04697986577181208,
"grad_norm": 0.8016732335090637,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.771,
"step": 42
},
{
"epoch": 0.04809843400447427,
"grad_norm": 0.7026550769805908,
"learning_rate": 2.15e-06,
"loss": 0.7664,
"step": 43
},
{
"epoch": 0.049217002237136466,
"grad_norm": 0.6678670644760132,
"learning_rate": 2.2e-06,
"loss": 0.7774,
"step": 44
},
{
"epoch": 0.050335570469798654,
"grad_norm": 0.6766750812530518,
"learning_rate": 2.25e-06,
"loss": 0.7832,
"step": 45
},
{
"epoch": 0.05145413870246085,
"grad_norm": 0.7094117999076843,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.7861,
"step": 46
},
{
"epoch": 0.052572706935123045,
"grad_norm": 0.6871191263198853,
"learning_rate": 2.35e-06,
"loss": 0.7848,
"step": 47
},
{
"epoch": 0.053691275167785234,
"grad_norm": 0.6089867353439331,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.7658,
"step": 48
},
{
"epoch": 0.05480984340044743,
"grad_norm": 0.5112010836601257,
"learning_rate": 2.4500000000000003e-06,
"loss": 0.7921,
"step": 49
},
{
"epoch": 0.05592841163310962,
"grad_norm": 0.5008496046066284,
"learning_rate": 2.5e-06,
"loss": 0.7105,
"step": 50
},
{
"epoch": 0.05704697986577181,
"grad_norm": 0.5599631071090698,
"learning_rate": 2.55e-06,
"loss": 0.7526,
"step": 51
},
{
"epoch": 0.058165548098434,
"grad_norm": 0.6905913352966309,
"learning_rate": 2.6e-06,
"loss": 0.7496,
"step": 52
},
{
"epoch": 0.0592841163310962,
"grad_norm": 0.6198621392250061,
"learning_rate": 2.6500000000000005e-06,
"loss": 0.7297,
"step": 53
},
{
"epoch": 0.06040268456375839,
"grad_norm": 0.6158658862113953,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.7309,
"step": 54
},
{
"epoch": 0.06152125279642058,
"grad_norm": 0.5798735618591309,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.7102,
"step": 55
},
{
"epoch": 0.06263982102908278,
"grad_norm": 0.5550254583358765,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.7488,
"step": 56
},
{
"epoch": 0.06375838926174497,
"grad_norm": 0.4888734221458435,
"learning_rate": 2.85e-06,
"loss": 0.75,
"step": 57
},
{
"epoch": 0.06487695749440715,
"grad_norm": 0.4579496383666992,
"learning_rate": 2.9e-06,
"loss": 0.7108,
"step": 58
},
{
"epoch": 0.06599552572706935,
"grad_norm": 0.5775673389434814,
"learning_rate": 2.95e-06,
"loss": 0.7337,
"step": 59
},
{
"epoch": 0.06711409395973154,
"grad_norm": 0.5035051703453064,
"learning_rate": 3e-06,
"loss": 0.7677,
"step": 60
},
{
"epoch": 0.06823266219239374,
"grad_norm": 0.4771614074707031,
"learning_rate": 3.05e-06,
"loss": 0.724,
"step": 61
},
{
"epoch": 0.06935123042505593,
"grad_norm": 0.45495525002479553,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.7393,
"step": 62
},
{
"epoch": 0.07046979865771812,
"grad_norm": 0.36385607719421387,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.7029,
"step": 63
},
{
"epoch": 0.07158836689038031,
"grad_norm": 0.3554967939853668,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.6991,
"step": 64
},
{
"epoch": 0.07270693512304251,
"grad_norm": 0.36548176407814026,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.7292,
"step": 65
},
{
"epoch": 0.0738255033557047,
"grad_norm": 0.35280168056488037,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.7295,
"step": 66
},
{
"epoch": 0.07494407158836688,
"grad_norm": 0.3599022924900055,
"learning_rate": 3.3500000000000005e-06,
"loss": 0.6956,
"step": 67
},
{
"epoch": 0.07606263982102908,
"grad_norm": 0.3802206516265869,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.6796,
"step": 68
},
{
"epoch": 0.07718120805369127,
"grad_norm": 0.3787902891635895,
"learning_rate": 3.45e-06,
"loss": 0.7141,
"step": 69
},
{
"epoch": 0.07829977628635347,
"grad_norm": 0.374461829662323,
"learning_rate": 3.5e-06,
"loss": 0.7043,
"step": 70
},
{
"epoch": 0.07941834451901567,
"grad_norm": 0.34469330310821533,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.7037,
"step": 71
},
{
"epoch": 0.08053691275167785,
"grad_norm": 0.346836119890213,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.7246,
"step": 72
},
{
"epoch": 0.08165548098434004,
"grad_norm": 0.34163376688957214,
"learning_rate": 3.65e-06,
"loss": 0.6977,
"step": 73
},
{
"epoch": 0.08277404921700224,
"grad_norm": 0.3481418788433075,
"learning_rate": 3.7e-06,
"loss": 0.7356,
"step": 74
},
{
"epoch": 0.08389261744966443,
"grad_norm": 0.3230934739112854,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.6869,
"step": 75
},
{
"epoch": 0.08501118568232663,
"grad_norm": 0.319917231798172,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.6722,
"step": 76
},
{
"epoch": 0.08612975391498881,
"grad_norm": 0.3535120487213135,
"learning_rate": 3.85e-06,
"loss": 0.6951,
"step": 77
},
{
"epoch": 0.087248322147651,
"grad_norm": 0.3229662775993347,
"learning_rate": 3.900000000000001e-06,
"loss": 0.69,
"step": 78
},
{
"epoch": 0.0883668903803132,
"grad_norm": 0.33365264534950256,
"learning_rate": 3.95e-06,
"loss": 0.701,
"step": 79
},
{
"epoch": 0.0894854586129754,
"grad_norm": 0.3302946984767914,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6733,
"step": 80
},
{
"epoch": 0.09060402684563758,
"grad_norm": 0.3478582799434662,
"learning_rate": 4.05e-06,
"loss": 0.7022,
"step": 81
},
{
"epoch": 0.09172259507829977,
"grad_norm": 0.33355170488357544,
"learning_rate": 4.1e-06,
"loss": 0.7141,
"step": 82
},
{
"epoch": 0.09284116331096197,
"grad_norm": 0.3217330574989319,
"learning_rate": 4.15e-06,
"loss": 0.6799,
"step": 83
},
{
"epoch": 0.09395973154362416,
"grad_norm": 0.328838050365448,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.6943,
"step": 84
},
{
"epoch": 0.09507829977628636,
"grad_norm": 0.3279136121273041,
"learning_rate": 4.25e-06,
"loss": 0.6699,
"step": 85
},
{
"epoch": 0.09619686800894854,
"grad_norm": 0.333351194858551,
"learning_rate": 4.3e-06,
"loss": 0.712,
"step": 86
},
{
"epoch": 0.09731543624161074,
"grad_norm": 0.33052128553390503,
"learning_rate": 4.350000000000001e-06,
"loss": 0.7169,
"step": 87
},
{
"epoch": 0.09843400447427293,
"grad_norm": 0.31631597876548767,
"learning_rate": 4.4e-06,
"loss": 0.6772,
"step": 88
},
{
"epoch": 0.09955257270693513,
"grad_norm": 0.327311635017395,
"learning_rate": 4.450000000000001e-06,
"loss": 0.6873,
"step": 89
},
{
"epoch": 0.10067114093959731,
"grad_norm": 0.32048892974853516,
"learning_rate": 4.5e-06,
"loss": 0.6614,
"step": 90
},
{
"epoch": 0.1017897091722595,
"grad_norm": 0.32614201307296753,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.7197,
"step": 91
},
{
"epoch": 0.1029082774049217,
"grad_norm": 0.31145355105400085,
"learning_rate": 4.600000000000001e-06,
"loss": 0.6567,
"step": 92
},
{
"epoch": 0.1040268456375839,
"grad_norm": 0.31379351019859314,
"learning_rate": 4.65e-06,
"loss": 0.7013,
"step": 93
},
{
"epoch": 0.10514541387024609,
"grad_norm": 0.32741424441337585,
"learning_rate": 4.7e-06,
"loss": 0.6737,
"step": 94
},
{
"epoch": 0.10626398210290827,
"grad_norm": 0.325630247592926,
"learning_rate": 4.75e-06,
"loss": 0.6673,
"step": 95
},
{
"epoch": 0.10738255033557047,
"grad_norm": 0.3153480291366577,
"learning_rate": 4.800000000000001e-06,
"loss": 0.6943,
"step": 96
},
{
"epoch": 0.10850111856823266,
"grad_norm": 0.3244793117046356,
"learning_rate": 4.85e-06,
"loss": 0.6896,
"step": 97
},
{
"epoch": 0.10961968680089486,
"grad_norm": 0.3078743517398834,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.6837,
"step": 98
},
{
"epoch": 0.11073825503355705,
"grad_norm": 0.3314874470233917,
"learning_rate": 4.95e-06,
"loss": 0.7416,
"step": 99
},
{
"epoch": 0.11185682326621924,
"grad_norm": 0.31931284070014954,
"learning_rate": 5e-06,
"loss": 0.6903,
"step": 100
},
{
"epoch": 0.11297539149888143,
"grad_norm": 0.3176276981830597,
"learning_rate": 4.999999554776598e-06,
"loss": 0.6761,
"step": 101
},
{
"epoch": 0.11409395973154363,
"grad_norm": 0.3285365700721741,
"learning_rate": 4.999998219106549e-06,
"loss": 0.6892,
"step": 102
},
{
"epoch": 0.11521252796420582,
"grad_norm": 0.31144094467163086,
"learning_rate": 4.99999599299033e-06,
"loss": 0.6586,
"step": 103
},
{
"epoch": 0.116331096196868,
"grad_norm": 0.313289076089859,
"learning_rate": 4.999992876428732e-06,
"loss": 0.708,
"step": 104
},
{
"epoch": 0.1174496644295302,
"grad_norm": 0.3252837061882019,
"learning_rate": 4.999988869422867e-06,
"loss": 0.7083,
"step": 105
},
{
"epoch": 0.1185682326621924,
"grad_norm": 0.3168275058269501,
"learning_rate": 4.9999839719741615e-06,
"loss": 0.6806,
"step": 106
},
{
"epoch": 0.11968680089485459,
"grad_norm": 0.31589415669441223,
"learning_rate": 4.9999781840843594e-06,
"loss": 0.6702,
"step": 107
},
{
"epoch": 0.12080536912751678,
"grad_norm": 0.318037748336792,
"learning_rate": 4.999971505755523e-06,
"loss": 0.6601,
"step": 108
},
{
"epoch": 0.12192393736017897,
"grad_norm": 0.33259475231170654,
"learning_rate": 4.999963936990031e-06,
"loss": 0.7001,
"step": 109
},
{
"epoch": 0.12304250559284116,
"grad_norm": 0.33322346210479736,
"learning_rate": 4.999955477790579e-06,
"loss": 0.6731,
"step": 110
},
{
"epoch": 0.12416107382550336,
"grad_norm": 0.31344881653785706,
"learning_rate": 4.999946128160179e-06,
"loss": 0.6667,
"step": 111
},
{
"epoch": 0.12527964205816555,
"grad_norm": 0.32769575715065,
"learning_rate": 4.999935888102162e-06,
"loss": 0.7123,
"step": 112
},
{
"epoch": 0.12639821029082773,
"grad_norm": 0.314619243144989,
"learning_rate": 4.9999247576201765e-06,
"loss": 0.683,
"step": 113
},
{
"epoch": 0.12751677852348994,
"grad_norm": 0.3301268219947815,
"learning_rate": 4.999912736718185e-06,
"loss": 0.6761,
"step": 114
},
{
"epoch": 0.12863534675615212,
"grad_norm": 0.31477460265159607,
"learning_rate": 4.99989982540047e-06,
"loss": 0.6722,
"step": 115
},
{
"epoch": 0.1297539149888143,
"grad_norm": 0.31430870294570923,
"learning_rate": 4.999886023671629e-06,
"loss": 0.6693,
"step": 116
},
{
"epoch": 0.13087248322147652,
"grad_norm": 0.31705909967422485,
"learning_rate": 4.999871331536581e-06,
"loss": 0.6567,
"step": 117
},
{
"epoch": 0.1319910514541387,
"grad_norm": 0.3331652879714966,
"learning_rate": 4.999855749000555e-06,
"loss": 0.6895,
"step": 118
},
{
"epoch": 0.1331096196868009,
"grad_norm": 0.32147714495658875,
"learning_rate": 4.999839276069105e-06,
"loss": 0.6693,
"step": 119
},
{
"epoch": 0.1342281879194631,
"grad_norm": 0.3312559127807617,
"learning_rate": 4.999821912748095e-06,
"loss": 0.6843,
"step": 120
},
{
"epoch": 0.13534675615212527,
"grad_norm": 0.34178397059440613,
"learning_rate": 4.999803659043712e-06,
"loss": 0.6774,
"step": 121
},
{
"epoch": 0.13646532438478748,
"grad_norm": 0.3154846727848053,
"learning_rate": 4.999784514962456e-06,
"loss": 0.6638,
"step": 122
},
{
"epoch": 0.13758389261744966,
"grad_norm": 0.31137940287590027,
"learning_rate": 4.999764480511145e-06,
"loss": 0.6467,
"step": 123
},
{
"epoch": 0.13870246085011187,
"grad_norm": 0.3188192546367645,
"learning_rate": 4.999743555696918e-06,
"loss": 0.6511,
"step": 124
},
{
"epoch": 0.13982102908277405,
"grad_norm": 0.30495911836624146,
"learning_rate": 4.999721740527225e-06,
"loss": 0.6637,
"step": 125
},
{
"epoch": 0.14093959731543623,
"grad_norm": 0.3152139186859131,
"learning_rate": 4.999699035009837e-06,
"loss": 0.6631,
"step": 126
},
{
"epoch": 0.14205816554809844,
"grad_norm": 0.32285481691360474,
"learning_rate": 4.999675439152842e-06,
"loss": 0.6621,
"step": 127
},
{
"epoch": 0.14317673378076062,
"grad_norm": 0.3176666796207428,
"learning_rate": 4.999650952964643e-06,
"loss": 0.6654,
"step": 128
},
{
"epoch": 0.14429530201342283,
"grad_norm": 0.314035028219223,
"learning_rate": 4.999625576453962e-06,
"loss": 0.6927,
"step": 129
},
{
"epoch": 0.14541387024608501,
"grad_norm": 0.3227815628051758,
"learning_rate": 4.999599309629839e-06,
"loss": 0.6865,
"step": 130
},
{
"epoch": 0.1465324384787472,
"grad_norm": 0.3137218952178955,
"learning_rate": 4.9995721525016275e-06,
"loss": 0.6499,
"step": 131
},
{
"epoch": 0.1476510067114094,
"grad_norm": 0.32401353120803833,
"learning_rate": 4.999544105079001e-06,
"loss": 0.64,
"step": 132
},
{
"epoch": 0.1487695749440716,
"grad_norm": 0.3110584020614624,
"learning_rate": 4.99951516737195e-06,
"loss": 0.6747,
"step": 133
},
{
"epoch": 0.14988814317673377,
"grad_norm": 0.3246876895427704,
"learning_rate": 4.999485339390781e-06,
"loss": 0.6943,
"step": 134
},
{
"epoch": 0.15100671140939598,
"grad_norm": 0.3346574008464813,
"learning_rate": 4.999454621146117e-06,
"loss": 0.6675,
"step": 135
},
{
"epoch": 0.15212527964205816,
"grad_norm": 0.3305971920490265,
"learning_rate": 4.999423012648902e-06,
"loss": 0.7065,
"step": 136
},
{
"epoch": 0.15324384787472037,
"grad_norm": 0.31732234358787537,
"learning_rate": 4.9993905139103924e-06,
"loss": 0.7038,
"step": 137
},
{
"epoch": 0.15436241610738255,
"grad_norm": 0.3233291208744049,
"learning_rate": 4.999357124942163e-06,
"loss": 0.6856,
"step": 138
},
{
"epoch": 0.15548098434004473,
"grad_norm": 0.31733304262161255,
"learning_rate": 4.999322845756107e-06,
"loss": 0.702,
"step": 139
},
{
"epoch": 0.15659955257270694,
"grad_norm": 0.33124351501464844,
"learning_rate": 4.9992876763644346e-06,
"loss": 0.6616,
"step": 140
},
{
"epoch": 0.15771812080536912,
"grad_norm": 0.3264501094818115,
"learning_rate": 4.999251616779671e-06,
"loss": 0.6773,
"step": 141
},
{
"epoch": 0.15883668903803133,
"grad_norm": 0.34606418013572693,
"learning_rate": 4.999214667014662e-06,
"loss": 0.6765,
"step": 142
},
{
"epoch": 0.1599552572706935,
"grad_norm": 0.3292436897754669,
"learning_rate": 4.999176827082566e-06,
"loss": 0.6692,
"step": 143
},
{
"epoch": 0.1610738255033557,
"grad_norm": 0.31322377920150757,
"learning_rate": 4.9991380969968615e-06,
"loss": 0.6811,
"step": 144
},
{
"epoch": 0.1621923937360179,
"grad_norm": 0.32053160667419434,
"learning_rate": 4.999098476771344e-06,
"loss": 0.6544,
"step": 145
},
{
"epoch": 0.16331096196868009,
"grad_norm": 0.34363314509391785,
"learning_rate": 4.9990579664201244e-06,
"loss": 0.6839,
"step": 146
},
{
"epoch": 0.1644295302013423,
"grad_norm": 0.3260481357574463,
"learning_rate": 4.999016565957633e-06,
"loss": 0.7048,
"step": 147
},
{
"epoch": 0.16554809843400448,
"grad_norm": 0.3410928547382355,
"learning_rate": 4.998974275398614e-06,
"loss": 0.6846,
"step": 148
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.33661726117134094,
"learning_rate": 4.998931094758132e-06,
"loss": 0.6468,
"step": 149
},
{
"epoch": 0.16778523489932887,
"grad_norm": 0.32493966817855835,
"learning_rate": 4.998887024051565e-06,
"loss": 0.6741,
"step": 150
},
{
"epoch": 0.16890380313199105,
"grad_norm": 0.3396671414375305,
"learning_rate": 4.998842063294613e-06,
"loss": 0.6703,
"step": 151
},
{
"epoch": 0.17002237136465326,
"grad_norm": 0.32851850986480713,
"learning_rate": 4.998796212503287e-06,
"loss": 0.6589,
"step": 152
},
{
"epoch": 0.17114093959731544,
"grad_norm": 0.33433711528778076,
"learning_rate": 4.99874947169392e-06,
"loss": 0.6594,
"step": 153
},
{
"epoch": 0.17225950782997762,
"grad_norm": 0.3388006091117859,
"learning_rate": 4.99870184088316e-06,
"loss": 0.6604,
"step": 154
},
{
"epoch": 0.17337807606263983,
"grad_norm": 0.31802693009376526,
"learning_rate": 4.998653320087971e-06,
"loss": 0.6467,
"step": 155
},
{
"epoch": 0.174496644295302,
"grad_norm": 0.33016613125801086,
"learning_rate": 4.998603909325636e-06,
"loss": 0.6599,
"step": 156
},
{
"epoch": 0.1756152125279642,
"grad_norm": 0.32546237111091614,
"learning_rate": 4.998553608613755e-06,
"loss": 0.6519,
"step": 157
},
{
"epoch": 0.1767337807606264,
"grad_norm": 0.3362942337989807,
"learning_rate": 4.998502417970242e-06,
"loss": 0.671,
"step": 158
},
{
"epoch": 0.17785234899328858,
"grad_norm": 0.33070167899131775,
"learning_rate": 4.998450337413331e-06,
"loss": 0.6624,
"step": 159
},
{
"epoch": 0.1789709172259508,
"grad_norm": 0.32430973649024963,
"learning_rate": 4.998397366961571e-06,
"loss": 0.6263,
"step": 160
},
{
"epoch": 0.18008948545861297,
"grad_norm": 0.32481464743614197,
"learning_rate": 4.998343506633831e-06,
"loss": 0.6683,
"step": 161
},
{
"epoch": 0.18120805369127516,
"grad_norm": 0.33035483956336975,
"learning_rate": 4.998288756449292e-06,
"loss": 0.6816,
"step": 162
},
{
"epoch": 0.18232662192393737,
"grad_norm": 0.33188679814338684,
"learning_rate": 4.998233116427458e-06,
"loss": 0.6693,
"step": 163
},
{
"epoch": 0.18344519015659955,
"grad_norm": 0.33667680621147156,
"learning_rate": 4.998176586588145e-06,
"loss": 0.6619,
"step": 164
},
{
"epoch": 0.18456375838926176,
"grad_norm": 0.33836281299591064,
"learning_rate": 4.998119166951488e-06,
"loss": 0.6697,
"step": 165
},
{
"epoch": 0.18568232662192394,
"grad_norm": 0.31710004806518555,
"learning_rate": 4.998060857537938e-06,
"loss": 0.6386,
"step": 166
},
{
"epoch": 0.18680089485458612,
"grad_norm": 0.3220674395561218,
"learning_rate": 4.9980016583682655e-06,
"loss": 0.6477,
"step": 167
},
{
"epoch": 0.18791946308724833,
"grad_norm": 0.31624945998191833,
"learning_rate": 4.997941569463554e-06,
"loss": 0.6771,
"step": 168
},
{
"epoch": 0.1890380313199105,
"grad_norm": 0.33520039916038513,
"learning_rate": 4.997880590845208e-06,
"loss": 0.6777,
"step": 169
},
{
"epoch": 0.19015659955257272,
"grad_norm": 0.33738774061203003,
"learning_rate": 4.997818722534944e-06,
"loss": 0.6603,
"step": 170
},
{
"epoch": 0.1912751677852349,
"grad_norm": 0.33408045768737793,
"learning_rate": 4.9977559645548e-06,
"loss": 0.6581,
"step": 171
},
{
"epoch": 0.19239373601789708,
"grad_norm": 0.3269501328468323,
"learning_rate": 4.997692316927129e-06,
"loss": 0.6623,
"step": 172
},
{
"epoch": 0.1935123042505593,
"grad_norm": 0.33073386549949646,
"learning_rate": 4.997627779674601e-06,
"loss": 0.6465,
"step": 173
},
{
"epoch": 0.19463087248322147,
"grad_norm": 0.33027294278144836,
"learning_rate": 4.997562352820201e-06,
"loss": 0.6795,
"step": 174
},
{
"epoch": 0.19574944071588368,
"grad_norm": 0.3329165577888489,
"learning_rate": 4.997496036387235e-06,
"loss": 0.6717,
"step": 175
},
{
"epoch": 0.19686800894854586,
"grad_norm": 0.3321872353553772,
"learning_rate": 4.997428830399322e-06,
"loss": 0.6415,
"step": 176
},
{
"epoch": 0.19798657718120805,
"grad_norm": 0.3222750723361969,
"learning_rate": 4.997360734880401e-06,
"loss": 0.657,
"step": 177
},
{
"epoch": 0.19910514541387025,
"grad_norm": 0.33294835686683655,
"learning_rate": 4.997291749854725e-06,
"loss": 0.6931,
"step": 178
},
{
"epoch": 0.20022371364653244,
"grad_norm": 0.3413322865962982,
"learning_rate": 4.997221875346863e-06,
"loss": 0.6761,
"step": 179
},
{
"epoch": 0.20134228187919462,
"grad_norm": 0.3300095796585083,
"learning_rate": 4.997151111381707e-06,
"loss": 0.6626,
"step": 180
},
{
"epoch": 0.20246085011185683,
"grad_norm": 0.337289035320282,
"learning_rate": 4.997079457984459e-06,
"loss": 0.6861,
"step": 181
},
{
"epoch": 0.203579418344519,
"grad_norm": 0.3266119658946991,
"learning_rate": 4.997006915180642e-06,
"loss": 0.6687,
"step": 182
},
{
"epoch": 0.20469798657718122,
"grad_norm": 0.33044853806495667,
"learning_rate": 4.996933482996092e-06,
"loss": 0.6637,
"step": 183
},
{
"epoch": 0.2058165548098434,
"grad_norm": 0.33716171979904175,
"learning_rate": 4.996859161456965e-06,
"loss": 0.6644,
"step": 184
},
{
"epoch": 0.20693512304250558,
"grad_norm": 0.32554203271865845,
"learning_rate": 4.996783950589733e-06,
"loss": 0.6524,
"step": 185
},
{
"epoch": 0.2080536912751678,
"grad_norm": 0.3271404504776001,
"learning_rate": 4.996707850421184e-06,
"loss": 0.6581,
"step": 186
},
{
"epoch": 0.20917225950782997,
"grad_norm": 0.34464138746261597,
"learning_rate": 4.996630860978424e-06,
"loss": 0.6768,
"step": 187
},
{
"epoch": 0.21029082774049218,
"grad_norm": 0.3408767282962799,
"learning_rate": 4.996552982288875e-06,
"loss": 0.6556,
"step": 188
},
{
"epoch": 0.21140939597315436,
"grad_norm": 0.3375307023525238,
"learning_rate": 4.996474214380276e-06,
"loss": 0.6819,
"step": 189
},
{
"epoch": 0.21252796420581654,
"grad_norm": 0.3313542902469635,
"learning_rate": 4.99639455728068e-06,
"loss": 0.6483,
"step": 190
},
{
"epoch": 0.21364653243847875,
"grad_norm": 0.3327822685241699,
"learning_rate": 4.996314011018462e-06,
"loss": 0.6669,
"step": 191
},
{
"epoch": 0.21476510067114093,
"grad_norm": 0.33021339774131775,
"learning_rate": 4.99623257562231e-06,
"loss": 0.6734,
"step": 192
},
{
"epoch": 0.21588366890380314,
"grad_norm": 0.32687169313430786,
"learning_rate": 4.996150251121229e-06,
"loss": 0.6387,
"step": 193
},
{
"epoch": 0.21700223713646533,
"grad_norm": 0.3394392728805542,
"learning_rate": 4.996067037544542e-06,
"loss": 0.6623,
"step": 194
},
{
"epoch": 0.2181208053691275,
"grad_norm": 0.33284223079681396,
"learning_rate": 4.995982934921887e-06,
"loss": 0.6405,
"step": 195
},
{
"epoch": 0.21923937360178972,
"grad_norm": 0.344235360622406,
"learning_rate": 4.995897943283221e-06,
"loss": 0.6741,
"step": 196
},
{
"epoch": 0.2203579418344519,
"grad_norm": 0.33437255024909973,
"learning_rate": 4.995812062658815e-06,
"loss": 0.6718,
"step": 197
},
{
"epoch": 0.2214765100671141,
"grad_norm": 0.3216111361980438,
"learning_rate": 4.995725293079257e-06,
"loss": 0.6709,
"step": 198
},
{
"epoch": 0.2225950782997763,
"grad_norm": 0.3448997139930725,
"learning_rate": 4.9956376345754556e-06,
"loss": 0.6458,
"step": 199
},
{
"epoch": 0.22371364653243847,
"grad_norm": 0.34354478120803833,
"learning_rate": 4.99554908717863e-06,
"loss": 0.6615,
"step": 200
},
{
"epoch": 0.22483221476510068,
"grad_norm": 0.3417740762233734,
"learning_rate": 4.99545965092032e-06,
"loss": 0.6847,
"step": 201
},
{
"epoch": 0.22595078299776286,
"grad_norm": 0.3366676867008209,
"learning_rate": 4.99536932583238e-06,
"loss": 0.649,
"step": 202
},
{
"epoch": 0.22706935123042504,
"grad_norm": 0.3610089421272278,
"learning_rate": 4.995278111946983e-06,
"loss": 0.6616,
"step": 203
},
{
"epoch": 0.22818791946308725,
"grad_norm": 0.3359774053096771,
"learning_rate": 4.995186009296618e-06,
"loss": 0.6519,
"step": 204
},
{
"epoch": 0.22930648769574943,
"grad_norm": 0.34075963497161865,
"learning_rate": 4.9950930179140885e-06,
"loss": 0.6762,
"step": 205
},
{
"epoch": 0.23042505592841164,
"grad_norm": 0.32416507601737976,
"learning_rate": 4.994999137832517e-06,
"loss": 0.6499,
"step": 206
},
{
"epoch": 0.23154362416107382,
"grad_norm": 0.32749176025390625,
"learning_rate": 4.99490436908534e-06,
"loss": 0.645,
"step": 207
},
{
"epoch": 0.232662192393736,
"grad_norm": 0.3349708318710327,
"learning_rate": 4.994808711706314e-06,
"loss": 0.6676,
"step": 208
},
{
"epoch": 0.23378076062639822,
"grad_norm": 0.3491227328777313,
"learning_rate": 4.9947121657295094e-06,
"loss": 0.6287,
"step": 209
},
{
"epoch": 0.2348993288590604,
"grad_norm": 0.3400874733924866,
"learning_rate": 4.994614731189314e-06,
"loss": 0.6473,
"step": 210
},
{
"epoch": 0.2360178970917226,
"grad_norm": 0.3388952612876892,
"learning_rate": 4.994516408120432e-06,
"loss": 0.6821,
"step": 211
},
{
"epoch": 0.2371364653243848,
"grad_norm": 0.33224812150001526,
"learning_rate": 4.994417196557884e-06,
"loss": 0.649,
"step": 212
},
{
"epoch": 0.23825503355704697,
"grad_norm": 0.3307199478149414,
"learning_rate": 4.994317096537006e-06,
"loss": 0.6581,
"step": 213
},
{
"epoch": 0.23937360178970918,
"grad_norm": 0.3505164682865143,
"learning_rate": 4.994216108093452e-06,
"loss": 0.6498,
"step": 214
},
{
"epoch": 0.24049217002237136,
"grad_norm": 0.3284938335418701,
"learning_rate": 4.994114231263193e-06,
"loss": 0.6503,
"step": 215
},
{
"epoch": 0.24161073825503357,
"grad_norm": 0.3475011885166168,
"learning_rate": 4.994011466082514e-06,
"loss": 0.6724,
"step": 216
},
{
"epoch": 0.24272930648769575,
"grad_norm": 0.34667858481407166,
"learning_rate": 4.993907812588019e-06,
"loss": 0.6373,
"step": 217
},
{
"epoch": 0.24384787472035793,
"grad_norm": 0.34276899695396423,
"learning_rate": 4.993803270816627e-06,
"loss": 0.6513,
"step": 218
},
{
"epoch": 0.24496644295302014,
"grad_norm": 0.34308409690856934,
"learning_rate": 4.993697840805572e-06,
"loss": 0.6596,
"step": 219
},
{
"epoch": 0.24608501118568232,
"grad_norm": 0.33108261227607727,
"learning_rate": 4.9935915225924075e-06,
"loss": 0.6623,
"step": 220
},
{
"epoch": 0.24720357941834453,
"grad_norm": 0.3529888093471527,
"learning_rate": 4.9934843162150015e-06,
"loss": 0.658,
"step": 221
},
{
"epoch": 0.2483221476510067,
"grad_norm": 0.3457166850566864,
"learning_rate": 4.993376221711538e-06,
"loss": 0.6342,
"step": 222
},
{
"epoch": 0.2494407158836689,
"grad_norm": 0.35108813643455505,
"learning_rate": 4.993267239120519e-06,
"loss": 0.6325,
"step": 223
},
{
"epoch": 0.2505592841163311,
"grad_norm": 0.3448682129383087,
"learning_rate": 4.993157368480761e-06,
"loss": 0.6746,
"step": 224
},
{
"epoch": 0.2516778523489933,
"grad_norm": 0.34094589948654175,
"learning_rate": 4.993046609831397e-06,
"loss": 0.6313,
"step": 225
},
{
"epoch": 0.25279642058165547,
"grad_norm": 0.33934327960014343,
"learning_rate": 4.9929349632118785e-06,
"loss": 0.6371,
"step": 226
},
{
"epoch": 0.2539149888143177,
"grad_norm": 0.3517382740974426,
"learning_rate": 4.99282242866197e-06,
"loss": 0.6411,
"step": 227
},
{
"epoch": 0.2550335570469799,
"grad_norm": 0.34098172187805176,
"learning_rate": 4.992709006221755e-06,
"loss": 0.6648,
"step": 228
},
{
"epoch": 0.25615212527964204,
"grad_norm": 0.3397183120250702,
"learning_rate": 4.992594695931632e-06,
"loss": 0.6038,
"step": 229
},
{
"epoch": 0.25727069351230425,
"grad_norm": 0.35994404554367065,
"learning_rate": 4.992479497832316e-06,
"loss": 0.6832,
"step": 230
},
{
"epoch": 0.25838926174496646,
"grad_norm": 0.3505656123161316,
"learning_rate": 4.992363411964838e-06,
"loss": 0.682,
"step": 231
},
{
"epoch": 0.2595078299776286,
"grad_norm": 0.343159556388855,
"learning_rate": 4.992246438370545e-06,
"loss": 0.6597,
"step": 232
},
{
"epoch": 0.2606263982102908,
"grad_norm": 0.36491280794143677,
"learning_rate": 4.9921285770911e-06,
"loss": 0.6422,
"step": 233
},
{
"epoch": 0.26174496644295303,
"grad_norm": 0.3656606078147888,
"learning_rate": 4.992009828168484e-06,
"loss": 0.6988,
"step": 234
},
{
"epoch": 0.26286353467561524,
"grad_norm": 0.3348519206047058,
"learning_rate": 4.991890191644993e-06,
"loss": 0.6281,
"step": 235
},
{
"epoch": 0.2639821029082774,
"grad_norm": 0.3407367467880249,
"learning_rate": 4.991769667563237e-06,
"loss": 0.6487,
"step": 236
},
{
"epoch": 0.2651006711409396,
"grad_norm": 0.3644556999206543,
"learning_rate": 4.991648255966145e-06,
"loss": 0.6443,
"step": 237
},
{
"epoch": 0.2662192393736018,
"grad_norm": 0.3346659243106842,
"learning_rate": 4.991525956896962e-06,
"loss": 0.632,
"step": 238
},
{
"epoch": 0.26733780760626397,
"grad_norm": 0.36535120010375977,
"learning_rate": 4.991402770399249e-06,
"loss": 0.6347,
"step": 239
},
{
"epoch": 0.2684563758389262,
"grad_norm": 0.34253549575805664,
"learning_rate": 4.991278696516879e-06,
"loss": 0.6946,
"step": 240
},
{
"epoch": 0.2695749440715884,
"grad_norm": 0.35125476121902466,
"learning_rate": 4.9911537352940485e-06,
"loss": 0.6669,
"step": 241
},
{
"epoch": 0.27069351230425054,
"grad_norm": 0.35836276412010193,
"learning_rate": 4.991027886775264e-06,
"loss": 0.6534,
"step": 242
},
{
"epoch": 0.27181208053691275,
"grad_norm": 0.34344252943992615,
"learning_rate": 4.990901151005349e-06,
"loss": 0.6595,
"step": 243
},
{
"epoch": 0.27293064876957496,
"grad_norm": 0.35524797439575195,
"learning_rate": 4.9907735280294465e-06,
"loss": 0.6612,
"step": 244
},
{
"epoch": 0.2740492170022371,
"grad_norm": 0.3483973741531372,
"learning_rate": 4.990645017893013e-06,
"loss": 0.6694,
"step": 245
},
{
"epoch": 0.2751677852348993,
"grad_norm": 0.34605199098587036,
"learning_rate": 4.990515620641819e-06,
"loss": 0.6453,
"step": 246
},
{
"epoch": 0.27628635346756153,
"grad_norm": 0.3441944122314453,
"learning_rate": 4.990385336321954e-06,
"loss": 0.6356,
"step": 247
},
{
"epoch": 0.27740492170022374,
"grad_norm": 0.36233291029930115,
"learning_rate": 4.990254164979823e-06,
"loss": 0.673,
"step": 248
},
{
"epoch": 0.2785234899328859,
"grad_norm": 0.3624320328235626,
"learning_rate": 4.990122106662145e-06,
"loss": 0.6459,
"step": 249
},
{
"epoch": 0.2796420581655481,
"grad_norm": 0.3635922372341156,
"learning_rate": 4.989989161415959e-06,
"loss": 0.6552,
"step": 250
},
{
"epoch": 0.2807606263982103,
"grad_norm": 0.3370678424835205,
"learning_rate": 4.989855329288615e-06,
"loss": 0.6098,
"step": 251
},
{
"epoch": 0.28187919463087246,
"grad_norm": 0.35488393902778625,
"learning_rate": 4.989720610327782e-06,
"loss": 0.6554,
"step": 252
},
{
"epoch": 0.2829977628635347,
"grad_norm": 0.3495752513408661,
"learning_rate": 4.989585004581444e-06,
"loss": 0.6339,
"step": 253
},
{
"epoch": 0.2841163310961969,
"grad_norm": 0.3451905846595764,
"learning_rate": 4.989448512097901e-06,
"loss": 0.6954,
"step": 254
},
{
"epoch": 0.28523489932885904,
"grad_norm": 0.3532280921936035,
"learning_rate": 4.989311132925768e-06,
"loss": 0.6198,
"step": 255
},
{
"epoch": 0.28635346756152125,
"grad_norm": 0.3570882976055145,
"learning_rate": 4.989172867113976e-06,
"loss": 0.6492,
"step": 256
},
{
"epoch": 0.28747203579418346,
"grad_norm": 0.33201339840888977,
"learning_rate": 4.9890337147117755e-06,
"loss": 0.6324,
"step": 257
},
{
"epoch": 0.28859060402684567,
"grad_norm": 0.3354071080684662,
"learning_rate": 4.988893675768726e-06,
"loss": 0.628,
"step": 258
},
{
"epoch": 0.2897091722595078,
"grad_norm": 0.341133713722229,
"learning_rate": 4.988752750334708e-06,
"loss": 0.6316,
"step": 259
},
{
"epoch": 0.29082774049217003,
"grad_norm": 0.33613571524620056,
"learning_rate": 4.9886109384599165e-06,
"loss": 0.6401,
"step": 260
},
{
"epoch": 0.29194630872483224,
"grad_norm": 0.3657302260398865,
"learning_rate": 4.988468240194861e-06,
"loss": 0.6743,
"step": 261
},
{
"epoch": 0.2930648769574944,
"grad_norm": 0.3349529504776001,
"learning_rate": 4.988324655590369e-06,
"loss": 0.6121,
"step": 262
},
{
"epoch": 0.2941834451901566,
"grad_norm": 0.34364601969718933,
"learning_rate": 4.98818018469758e-06,
"loss": 0.6427,
"step": 263
},
{
"epoch": 0.2953020134228188,
"grad_norm": 0.35895341634750366,
"learning_rate": 4.988034827567953e-06,
"loss": 0.6913,
"step": 264
},
{
"epoch": 0.29642058165548096,
"grad_norm": 0.3571792244911194,
"learning_rate": 4.987888584253262e-06,
"loss": 0.6286,
"step": 265
},
{
"epoch": 0.2975391498881432,
"grad_norm": 0.3684253394603729,
"learning_rate": 4.987741454805594e-06,
"loss": 0.6365,
"step": 266
},
{
"epoch": 0.2986577181208054,
"grad_norm": 0.3521265983581543,
"learning_rate": 4.987593439277353e-06,
"loss": 0.6172,
"step": 267
},
{
"epoch": 0.29977628635346754,
"grad_norm": 0.34943872690200806,
"learning_rate": 4.98744453772126e-06,
"loss": 0.6382,
"step": 268
},
{
"epoch": 0.30089485458612975,
"grad_norm": 0.35075268149375916,
"learning_rate": 4.9872947501903515e-06,
"loss": 0.6497,
"step": 269
},
{
"epoch": 0.30201342281879195,
"grad_norm": 0.3558429479598999,
"learning_rate": 4.987144076737978e-06,
"loss": 0.6561,
"step": 270
},
{
"epoch": 0.30313199105145416,
"grad_norm": 0.35898861289024353,
"learning_rate": 4.986992517417805e-06,
"loss": 0.6613,
"step": 271
},
{
"epoch": 0.3042505592841163,
"grad_norm": 0.35135966539382935,
"learning_rate": 4.986840072283815e-06,
"loss": 0.6507,
"step": 272
},
{
"epoch": 0.3053691275167785,
"grad_norm": 0.3530326783657074,
"learning_rate": 4.986686741390308e-06,
"loss": 0.6459,
"step": 273
},
{
"epoch": 0.30648769574944074,
"grad_norm": 0.34631597995758057,
"learning_rate": 4.986532524791894e-06,
"loss": 0.6074,
"step": 274
},
{
"epoch": 0.3076062639821029,
"grad_norm": 0.37036123871803284,
"learning_rate": 4.986377422543503e-06,
"loss": 0.6416,
"step": 275
},
{
"epoch": 0.3087248322147651,
"grad_norm": 0.3576701283454895,
"learning_rate": 4.98622143470038e-06,
"loss": 0.5939,
"step": 276
},
{
"epoch": 0.3098434004474273,
"grad_norm": 0.35285863280296326,
"learning_rate": 4.986064561318083e-06,
"loss": 0.6405,
"step": 277
},
{
"epoch": 0.31096196868008946,
"grad_norm": 0.35576099157333374,
"learning_rate": 4.985906802452488e-06,
"loss": 0.6348,
"step": 278
},
{
"epoch": 0.31208053691275167,
"grad_norm": 0.35739952325820923,
"learning_rate": 4.985748158159785e-06,
"loss": 0.65,
"step": 279
},
{
"epoch": 0.3131991051454139,
"grad_norm": 0.35885411500930786,
"learning_rate": 4.985588628496481e-06,
"loss": 0.6575,
"step": 280
},
{
"epoch": 0.3143176733780761,
"grad_norm": 0.36760038137435913,
"learning_rate": 4.985428213519396e-06,
"loss": 0.6606,
"step": 281
},
{
"epoch": 0.31543624161073824,
"grad_norm": 0.35371875762939453,
"learning_rate": 4.9852669132856645e-06,
"loss": 0.6495,
"step": 282
},
{
"epoch": 0.31655480984340045,
"grad_norm": 0.3489198684692383,
"learning_rate": 4.985104727852741e-06,
"loss": 0.6402,
"step": 283
},
{
"epoch": 0.31767337807606266,
"grad_norm": 0.3595716953277588,
"learning_rate": 4.984941657278392e-06,
"loss": 0.6495,
"step": 284
},
{
"epoch": 0.3187919463087248,
"grad_norm": 0.3551527261734009,
"learning_rate": 4.984777701620698e-06,
"loss": 0.6555,
"step": 285
},
{
"epoch": 0.319910514541387,
"grad_norm": 0.35428524017333984,
"learning_rate": 4.984612860938059e-06,
"loss": 0.6435,
"step": 286
},
{
"epoch": 0.32102908277404923,
"grad_norm": 0.36495980620384216,
"learning_rate": 4.984447135289185e-06,
"loss": 0.6375,
"step": 287
},
{
"epoch": 0.3221476510067114,
"grad_norm": 0.36956459283828735,
"learning_rate": 4.984280524733107e-06,
"loss": 0.654,
"step": 288
},
{
"epoch": 0.3232662192393736,
"grad_norm": 0.34770330786705017,
"learning_rate": 4.984113029329166e-06,
"loss": 0.6313,
"step": 289
},
{
"epoch": 0.3243847874720358,
"grad_norm": 0.3676060140132904,
"learning_rate": 4.9839446491370215e-06,
"loss": 0.6697,
"step": 290
},
{
"epoch": 0.32550335570469796,
"grad_norm": 0.3490992486476898,
"learning_rate": 4.983775384216646e-06,
"loss": 0.6343,
"step": 291
},
{
"epoch": 0.32662192393736017,
"grad_norm": 0.36720773577690125,
"learning_rate": 4.983605234628328e-06,
"loss": 0.6609,
"step": 292
},
{
"epoch": 0.3277404921700224,
"grad_norm": 0.3619595766067505,
"learning_rate": 4.983434200432672e-06,
"loss": 0.6635,
"step": 293
},
{
"epoch": 0.3288590604026846,
"grad_norm": 0.35261741280555725,
"learning_rate": 4.983262281690596e-06,
"loss": 0.6273,
"step": 294
},
{
"epoch": 0.32997762863534674,
"grad_norm": 0.34182801842689514,
"learning_rate": 4.983089478463335e-06,
"loss": 0.6271,
"step": 295
},
{
"epoch": 0.33109619686800895,
"grad_norm": 0.3623373806476593,
"learning_rate": 4.982915790812436e-06,
"loss": 0.6491,
"step": 296
},
{
"epoch": 0.33221476510067116,
"grad_norm": 0.3656613826751709,
"learning_rate": 4.982741218799763e-06,
"loss": 0.6672,
"step": 297
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.3533206880092621,
"learning_rate": 4.982565762487498e-06,
"loss": 0.6257,
"step": 298
},
{
"epoch": 0.3344519015659955,
"grad_norm": 0.35345301032066345,
"learning_rate": 4.982389421938131e-06,
"loss": 0.6486,
"step": 299
},
{
"epoch": 0.33557046979865773,
"grad_norm": 0.36566492915153503,
"learning_rate": 4.982212197214472e-06,
"loss": 0.6411,
"step": 300
},
{
"epoch": 0.3366890380313199,
"grad_norm": 0.3681536018848419,
"learning_rate": 4.982034088379646e-06,
"loss": 0.6208,
"step": 301
},
{
"epoch": 0.3378076062639821,
"grad_norm": 0.3649962246417999,
"learning_rate": 4.98185509549709e-06,
"loss": 0.6557,
"step": 302
},
{
"epoch": 0.3389261744966443,
"grad_norm": 0.350563108921051,
"learning_rate": 4.981675218630557e-06,
"loss": 0.6361,
"step": 303
},
{
"epoch": 0.3400447427293065,
"grad_norm": 0.35432034730911255,
"learning_rate": 4.981494457844117e-06,
"loss": 0.6294,
"step": 304
},
{
"epoch": 0.34116331096196867,
"grad_norm": 0.3590239882469177,
"learning_rate": 4.981312813202153e-06,
"loss": 0.6318,
"step": 305
},
{
"epoch": 0.3422818791946309,
"grad_norm": 0.3664059042930603,
"learning_rate": 4.981130284769361e-06,
"loss": 0.648,
"step": 306
},
{
"epoch": 0.3434004474272931,
"grad_norm": 0.35878944396972656,
"learning_rate": 4.9809468726107555e-06,
"loss": 0.619,
"step": 307
},
{
"epoch": 0.34451901565995524,
"grad_norm": 0.35763007402420044,
"learning_rate": 4.980762576791664e-06,
"loss": 0.6567,
"step": 308
},
{
"epoch": 0.34563758389261745,
"grad_norm": 0.35878288745880127,
"learning_rate": 4.980577397377728e-06,
"loss": 0.6421,
"step": 309
},
{
"epoch": 0.34675615212527966,
"grad_norm": 0.3597519099712372,
"learning_rate": 4.980391334434906e-06,
"loss": 0.6352,
"step": 310
},
{
"epoch": 0.3478747203579418,
"grad_norm": 0.3498169481754303,
"learning_rate": 4.980204388029466e-06,
"loss": 0.626,
"step": 311
},
{
"epoch": 0.348993288590604,
"grad_norm": 0.36353105306625366,
"learning_rate": 4.980016558227998e-06,
"loss": 0.6513,
"step": 312
},
{
"epoch": 0.35011185682326623,
"grad_norm": 0.355794221162796,
"learning_rate": 4.979827845097402e-06,
"loss": 0.6385,
"step": 313
},
{
"epoch": 0.3512304250559284,
"grad_norm": 0.3594406545162201,
"learning_rate": 4.979638248704894e-06,
"loss": 0.6134,
"step": 314
},
{
"epoch": 0.3523489932885906,
"grad_norm": 0.3639025092124939,
"learning_rate": 4.979447769118002e-06,
"loss": 0.6386,
"step": 315
},
{
"epoch": 0.3534675615212528,
"grad_norm": 0.3681359887123108,
"learning_rate": 4.979256406404574e-06,
"loss": 0.6213,
"step": 316
},
{
"epoch": 0.354586129753915,
"grad_norm": 0.35021546483039856,
"learning_rate": 4.979064160632766e-06,
"loss": 0.5933,
"step": 317
},
{
"epoch": 0.35570469798657717,
"grad_norm": 0.35783469676971436,
"learning_rate": 4.978871031871054e-06,
"loss": 0.6054,
"step": 318
},
{
"epoch": 0.3568232662192394,
"grad_norm": 0.3768575191497803,
"learning_rate": 4.978677020188226e-06,
"loss": 0.651,
"step": 319
},
{
"epoch": 0.3579418344519016,
"grad_norm": 0.3841581642627716,
"learning_rate": 4.978482125653385e-06,
"loss": 0.6447,
"step": 320
},
{
"epoch": 0.35906040268456374,
"grad_norm": 0.3749678134918213,
"learning_rate": 4.978286348335949e-06,
"loss": 0.6403,
"step": 321
},
{
"epoch": 0.36017897091722595,
"grad_norm": 0.35757121443748474,
"learning_rate": 4.978089688305647e-06,
"loss": 0.6297,
"step": 322
},
{
"epoch": 0.36129753914988816,
"grad_norm": 0.36303797364234924,
"learning_rate": 4.977892145632528e-06,
"loss": 0.6438,
"step": 323
},
{
"epoch": 0.3624161073825503,
"grad_norm": 0.3670295476913452,
"learning_rate": 4.977693720386951e-06,
"loss": 0.6055,
"step": 324
},
{
"epoch": 0.3635346756152125,
"grad_norm": 0.3521486818790436,
"learning_rate": 4.977494412639591e-06,
"loss": 0.6072,
"step": 325
},
{
"epoch": 0.36465324384787473,
"grad_norm": 0.35688498616218567,
"learning_rate": 4.9772942224614375e-06,
"loss": 0.6252,
"step": 326
},
{
"epoch": 0.36577181208053694,
"grad_norm": 0.3542022109031677,
"learning_rate": 4.9770931499237925e-06,
"loss": 0.6407,
"step": 327
},
{
"epoch": 0.3668903803131991,
"grad_norm": 0.3773319125175476,
"learning_rate": 4.976891195098277e-06,
"loss": 0.6524,
"step": 328
},
{
"epoch": 0.3680089485458613,
"grad_norm": 0.3697628080844879,
"learning_rate": 4.97668835805682e-06,
"loss": 0.6503,
"step": 329
},
{
"epoch": 0.3691275167785235,
"grad_norm": 0.36458712816238403,
"learning_rate": 4.976484638871669e-06,
"loss": 0.6722,
"step": 330
},
{
"epoch": 0.37024608501118567,
"grad_norm": 0.3523670434951782,
"learning_rate": 4.976280037615385e-06,
"loss": 0.6273,
"step": 331
},
{
"epoch": 0.3713646532438479,
"grad_norm": 0.35473543405532837,
"learning_rate": 4.9760745543608414e-06,
"loss": 0.6243,
"step": 332
},
{
"epoch": 0.3724832214765101,
"grad_norm": 0.36719077825546265,
"learning_rate": 4.9758681891812276e-06,
"loss": 0.6476,
"step": 333
},
{
"epoch": 0.37360178970917224,
"grad_norm": 0.3712293207645416,
"learning_rate": 4.9756609421500464e-06,
"loss": 0.6475,
"step": 334
},
{
"epoch": 0.37472035794183445,
"grad_norm": 0.36749109625816345,
"learning_rate": 4.9754528133411144e-06,
"loss": 0.6428,
"step": 335
},
{
"epoch": 0.37583892617449666,
"grad_norm": 0.35749316215515137,
"learning_rate": 4.975243802828563e-06,
"loss": 0.6123,
"step": 336
},
{
"epoch": 0.3769574944071588,
"grad_norm": 0.37466734647750854,
"learning_rate": 4.975033910686837e-06,
"loss": 0.6393,
"step": 337
},
{
"epoch": 0.378076062639821,
"grad_norm": 0.36750441789627075,
"learning_rate": 4.974823136990697e-06,
"loss": 0.6405,
"step": 338
},
{
"epoch": 0.37919463087248323,
"grad_norm": 0.3831922113895416,
"learning_rate": 4.9746114818152135e-06,
"loss": 0.6633,
"step": 339
},
{
"epoch": 0.38031319910514544,
"grad_norm": 0.35679274797439575,
"learning_rate": 4.974398945235776e-06,
"loss": 0.6431,
"step": 340
},
{
"epoch": 0.3814317673378076,
"grad_norm": 0.36524152755737305,
"learning_rate": 4.974185527328084e-06,
"loss": 0.6419,
"step": 341
},
{
"epoch": 0.3825503355704698,
"grad_norm": 0.3668903410434723,
"learning_rate": 4.9739712281681525e-06,
"loss": 0.6418,
"step": 342
},
{
"epoch": 0.383668903803132,
"grad_norm": 0.37841862440109253,
"learning_rate": 4.973756047832312e-06,
"loss": 0.6585,
"step": 343
},
{
"epoch": 0.38478747203579416,
"grad_norm": 0.37758868932724,
"learning_rate": 4.9735399863972024e-06,
"loss": 0.6493,
"step": 344
},
{
"epoch": 0.3859060402684564,
"grad_norm": 0.3663494288921356,
"learning_rate": 4.973323043939783e-06,
"loss": 0.6728,
"step": 345
},
{
"epoch": 0.3870246085011186,
"grad_norm": 0.3930216431617737,
"learning_rate": 4.973105220537322e-06,
"loss": 0.6608,
"step": 346
},
{
"epoch": 0.38814317673378074,
"grad_norm": 0.390828400850296,
"learning_rate": 4.972886516267404e-06,
"loss": 0.6497,
"step": 347
},
{
"epoch": 0.38926174496644295,
"grad_norm": 0.37600764632225037,
"learning_rate": 4.972666931207927e-06,
"loss": 0.6426,
"step": 348
},
{
"epoch": 0.39038031319910516,
"grad_norm": 0.36520275473594666,
"learning_rate": 4.972446465437103e-06,
"loss": 0.645,
"step": 349
},
{
"epoch": 0.39149888143176736,
"grad_norm": 0.3984422981739044,
"learning_rate": 4.972225119033457e-06,
"loss": 0.6368,
"step": 350
},
{
"epoch": 0.3926174496644295,
"grad_norm": 0.3725559711456299,
"learning_rate": 4.972002892075827e-06,
"loss": 0.625,
"step": 351
},
{
"epoch": 0.39373601789709173,
"grad_norm": 0.3889387547969818,
"learning_rate": 4.9717797846433655e-06,
"loss": 0.6258,
"step": 352
},
{
"epoch": 0.39485458612975394,
"grad_norm": 0.37537676095962524,
"learning_rate": 4.97155579681554e-06,
"loss": 0.64,
"step": 353
},
{
"epoch": 0.3959731543624161,
"grad_norm": 0.3795606791973114,
"learning_rate": 4.97133092867213e-06,
"loss": 0.6849,
"step": 354
},
{
"epoch": 0.3970917225950783,
"grad_norm": 0.38519197702407837,
"learning_rate": 4.971105180293228e-06,
"loss": 0.6493,
"step": 355
},
{
"epoch": 0.3982102908277405,
"grad_norm": 0.3789883553981781,
"learning_rate": 4.97087855175924e-06,
"loss": 0.6281,
"step": 356
},
{
"epoch": 0.39932885906040266,
"grad_norm": 0.3687867820262909,
"learning_rate": 4.970651043150887e-06,
"loss": 0.6278,
"step": 357
},
{
"epoch": 0.4004474272930649,
"grad_norm": 0.365581214427948,
"learning_rate": 4.970422654549204e-06,
"loss": 0.647,
"step": 358
},
{
"epoch": 0.4015659955257271,
"grad_norm": 0.37871256470680237,
"learning_rate": 4.970193386035537e-06,
"loss": 0.6349,
"step": 359
},
{
"epoch": 0.40268456375838924,
"grad_norm": 0.37639445066452026,
"learning_rate": 4.969963237691547e-06,
"loss": 0.6544,
"step": 360
},
{
"epoch": 0.40380313199105144,
"grad_norm": 0.38033226132392883,
"learning_rate": 4.9697322095992075e-06,
"loss": 0.6216,
"step": 361
},
{
"epoch": 0.40492170022371365,
"grad_norm": 0.3785533010959625,
"learning_rate": 4.969500301840805e-06,
"loss": 0.6379,
"step": 362
},
{
"epoch": 0.40604026845637586,
"grad_norm": 0.36831173300743103,
"learning_rate": 4.969267514498942e-06,
"loss": 0.6305,
"step": 363
},
{
"epoch": 0.407158836689038,
"grad_norm": 0.3860856592655182,
"learning_rate": 4.969033847656531e-06,
"loss": 0.6428,
"step": 364
},
{
"epoch": 0.4082774049217002,
"grad_norm": 0.37992534041404724,
"learning_rate": 4.9687993013968e-06,
"loss": 0.629,
"step": 365
},
{
"epoch": 0.40939597315436244,
"grad_norm": 0.37179872393608093,
"learning_rate": 4.9685638758032885e-06,
"loss": 0.6146,
"step": 366
},
{
"epoch": 0.4105145413870246,
"grad_norm": 0.3762771487236023,
"learning_rate": 4.96832757095985e-06,
"loss": 0.6294,
"step": 367
},
{
"epoch": 0.4116331096196868,
"grad_norm": 0.37078356742858887,
"learning_rate": 4.968090386950653e-06,
"loss": 0.6438,
"step": 368
},
{
"epoch": 0.412751677852349,
"grad_norm": 0.3619535565376282,
"learning_rate": 4.967852323860176e-06,
"loss": 0.6229,
"step": 369
},
{
"epoch": 0.41387024608501116,
"grad_norm": 0.3610592782497406,
"learning_rate": 4.967613381773211e-06,
"loss": 0.6332,
"step": 370
},
{
"epoch": 0.41498881431767337,
"grad_norm": 0.36372244358062744,
"learning_rate": 4.9673735607748665e-06,
"loss": 0.6379,
"step": 371
},
{
"epoch": 0.4161073825503356,
"grad_norm": 0.3713506758213043,
"learning_rate": 4.96713286095056e-06,
"loss": 0.6051,
"step": 372
},
{
"epoch": 0.4172259507829978,
"grad_norm": 0.37290191650390625,
"learning_rate": 4.9668912823860244e-06,
"loss": 0.6431,
"step": 373
},
{
"epoch": 0.41834451901565994,
"grad_norm": 0.3736407458782196,
"learning_rate": 4.966648825167305e-06,
"loss": 0.6296,
"step": 374
},
{
"epoch": 0.41946308724832215,
"grad_norm": 0.38261404633522034,
"learning_rate": 4.9664054893807586e-06,
"loss": 0.6559,
"step": 375
},
{
"epoch": 0.42058165548098436,
"grad_norm": 0.36865612864494324,
"learning_rate": 4.966161275113057e-06,
"loss": 0.6372,
"step": 376
},
{
"epoch": 0.4217002237136465,
"grad_norm": 0.3745094835758209,
"learning_rate": 4.965916182451185e-06,
"loss": 0.6526,
"step": 377
},
{
"epoch": 0.4228187919463087,
"grad_norm": 0.3758225440979004,
"learning_rate": 4.965670211482437e-06,
"loss": 0.6423,
"step": 378
},
{
"epoch": 0.42393736017897093,
"grad_norm": 0.37716934084892273,
"learning_rate": 4.965423362294426e-06,
"loss": 0.6431,
"step": 379
},
{
"epoch": 0.4250559284116331,
"grad_norm": 0.3757461905479431,
"learning_rate": 4.965175634975072e-06,
"loss": 0.6335,
"step": 380
},
{
"epoch": 0.4261744966442953,
"grad_norm": 0.3701077401638031,
"learning_rate": 4.964927029612611e-06,
"loss": 0.6182,
"step": 381
},
{
"epoch": 0.4272930648769575,
"grad_norm": 0.38263121247291565,
"learning_rate": 4.96467754629559e-06,
"loss": 0.6371,
"step": 382
},
{
"epoch": 0.42841163310961966,
"grad_norm": 0.3740926682949066,
"learning_rate": 4.9644271851128715e-06,
"loss": 0.6272,
"step": 383
},
{
"epoch": 0.42953020134228187,
"grad_norm": 0.39056089520454407,
"learning_rate": 4.964175946153627e-06,
"loss": 0.624,
"step": 384
},
{
"epoch": 0.4306487695749441,
"grad_norm": 0.3867873549461365,
"learning_rate": 4.963923829507343e-06,
"loss": 0.6714,
"step": 385
},
{
"epoch": 0.4317673378076063,
"grad_norm": 0.3808860182762146,
"learning_rate": 4.963670835263819e-06,
"loss": 0.6412,
"step": 386
},
{
"epoch": 0.43288590604026844,
"grad_norm": 0.3839844763278961,
"learning_rate": 4.963416963513166e-06,
"loss": 0.6288,
"step": 387
},
{
"epoch": 0.43400447427293065,
"grad_norm": 0.37187114357948303,
"learning_rate": 4.963162214345806e-06,
"loss": 0.6307,
"step": 388
},
{
"epoch": 0.43512304250559286,
"grad_norm": 0.36723873019218445,
"learning_rate": 4.962906587852477e-06,
"loss": 0.6285,
"step": 389
},
{
"epoch": 0.436241610738255,
"grad_norm": 0.3754160404205322,
"learning_rate": 4.962650084124226e-06,
"loss": 0.6227,
"step": 390
},
{
"epoch": 0.4373601789709172,
"grad_norm": 0.374239981174469,
"learning_rate": 4.962392703252417e-06,
"loss": 0.6612,
"step": 391
},
{
"epoch": 0.43847874720357943,
"grad_norm": 0.3758632242679596,
"learning_rate": 4.9621344453287214e-06,
"loss": 0.6408,
"step": 392
},
{
"epoch": 0.4395973154362416,
"grad_norm": 0.3839190602302551,
"learning_rate": 4.9618753104451254e-06,
"loss": 0.6524,
"step": 393
},
{
"epoch": 0.4407158836689038,
"grad_norm": 0.36766374111175537,
"learning_rate": 4.961615298693928e-06,
"loss": 0.6232,
"step": 394
},
{
"epoch": 0.441834451901566,
"grad_norm": 0.3692423701286316,
"learning_rate": 4.961354410167739e-06,
"loss": 0.6436,
"step": 395
},
{
"epoch": 0.4429530201342282,
"grad_norm": 0.3720521926879883,
"learning_rate": 4.961092644959482e-06,
"loss": 0.6346,
"step": 396
},
{
"epoch": 0.44407158836689037,
"grad_norm": 0.373910129070282,
"learning_rate": 4.960830003162392e-06,
"loss": 0.6211,
"step": 397
},
{
"epoch": 0.4451901565995526,
"grad_norm": 0.37455853819847107,
"learning_rate": 4.960566484870017e-06,
"loss": 0.6366,
"step": 398
},
{
"epoch": 0.4463087248322148,
"grad_norm": 0.387390673160553,
"learning_rate": 4.960302090176215e-06,
"loss": 0.6543,
"step": 399
},
{
"epoch": 0.44742729306487694,
"grad_norm": 0.3862502872943878,
"learning_rate": 4.960036819175159e-06,
"loss": 0.6351,
"step": 400
},
{
"epoch": 0.44854586129753915,
"grad_norm": 0.38686901330947876,
"learning_rate": 4.959770671961334e-06,
"loss": 0.6247,
"step": 401
},
{
"epoch": 0.44966442953020136,
"grad_norm": 0.38111770153045654,
"learning_rate": 4.959503648629534e-06,
"loss": 0.6624,
"step": 402
},
{
"epoch": 0.4507829977628635,
"grad_norm": 0.3962753713130951,
"learning_rate": 4.959235749274866e-06,
"loss": 0.6224,
"step": 403
},
{
"epoch": 0.4519015659955257,
"grad_norm": 0.36403393745422363,
"learning_rate": 4.958966973992754e-06,
"loss": 0.6215,
"step": 404
},
{
"epoch": 0.45302013422818793,
"grad_norm": 0.3858584463596344,
"learning_rate": 4.958697322878926e-06,
"loss": 0.6473,
"step": 405
},
{
"epoch": 0.4541387024608501,
"grad_norm": 0.39325979351997375,
"learning_rate": 4.958426796029429e-06,
"loss": 0.6664,
"step": 406
},
{
"epoch": 0.4552572706935123,
"grad_norm": 0.37423112988471985,
"learning_rate": 4.958155393540618e-06,
"loss": 0.6416,
"step": 407
},
{
"epoch": 0.4563758389261745,
"grad_norm": 0.3979191482067108,
"learning_rate": 4.9578831155091585e-06,
"loss": 0.6493,
"step": 408
},
{
"epoch": 0.4574944071588367,
"grad_norm": 0.375473290681839,
"learning_rate": 4.957609962032034e-06,
"loss": 0.6246,
"step": 409
},
{
"epoch": 0.45861297539149887,
"grad_norm": 0.37951260805130005,
"learning_rate": 4.957335933206533e-06,
"loss": 0.6374,
"step": 410
},
{
"epoch": 0.4597315436241611,
"grad_norm": 0.384162575006485,
"learning_rate": 4.9570610291302605e-06,
"loss": 0.6411,
"step": 411
},
{
"epoch": 0.4608501118568233,
"grad_norm": 0.37713801860809326,
"learning_rate": 4.95678524990113e-06,
"loss": 0.6384,
"step": 412
},
{
"epoch": 0.46196868008948544,
"grad_norm": 0.3779420554637909,
"learning_rate": 4.95650859561737e-06,
"loss": 0.6238,
"step": 413
},
{
"epoch": 0.46308724832214765,
"grad_norm": 0.3826324939727783,
"learning_rate": 4.956231066377517e-06,
"loss": 0.6373,
"step": 414
},
{
"epoch": 0.46420581655480986,
"grad_norm": 0.3693124055862427,
"learning_rate": 4.955952662280422e-06,
"loss": 0.6264,
"step": 415
},
{
"epoch": 0.465324384787472,
"grad_norm": 0.3891177177429199,
"learning_rate": 4.9556733834252465e-06,
"loss": 0.6755,
"step": 416
},
{
"epoch": 0.4664429530201342,
"grad_norm": 0.3732079863548279,
"learning_rate": 4.955393229911465e-06,
"loss": 0.6163,
"step": 417
},
{
"epoch": 0.46756152125279643,
"grad_norm": 0.39267081022262573,
"learning_rate": 4.955112201838859e-06,
"loss": 0.653,
"step": 418
},
{
"epoch": 0.46868008948545864,
"grad_norm": 0.37127041816711426,
"learning_rate": 4.9548302993075275e-06,
"loss": 0.6024,
"step": 419
},
{
"epoch": 0.4697986577181208,
"grad_norm": 0.38274380564689636,
"learning_rate": 4.954547522417878e-06,
"loss": 0.6103,
"step": 420
},
{
"epoch": 0.470917225950783,
"grad_norm": 0.39440205693244934,
"learning_rate": 4.954263871270627e-06,
"loss": 0.6388,
"step": 421
},
{
"epoch": 0.4720357941834452,
"grad_norm": 0.38207298517227173,
"learning_rate": 4.953979345966808e-06,
"loss": 0.6157,
"step": 422
},
{
"epoch": 0.47315436241610737,
"grad_norm": 0.37390536069869995,
"learning_rate": 4.953693946607762e-06,
"loss": 0.612,
"step": 423
},
{
"epoch": 0.4742729306487696,
"grad_norm": 0.3679952621459961,
"learning_rate": 4.953407673295141e-06,
"loss": 0.5962,
"step": 424
},
{
"epoch": 0.4753914988814318,
"grad_norm": 0.36741313338279724,
"learning_rate": 4.953120526130911e-06,
"loss": 0.5802,
"step": 425
},
{
"epoch": 0.47651006711409394,
"grad_norm": 0.40101951360702515,
"learning_rate": 4.952832505217347e-06,
"loss": 0.631,
"step": 426
},
{
"epoch": 0.47762863534675615,
"grad_norm": 0.37646785378456116,
"learning_rate": 4.952543610657036e-06,
"loss": 0.6192,
"step": 427
},
{
"epoch": 0.47874720357941836,
"grad_norm": 0.3909439444541931,
"learning_rate": 4.952253842552876e-06,
"loss": 0.6288,
"step": 428
},
{
"epoch": 0.4798657718120805,
"grad_norm": 0.379685640335083,
"learning_rate": 4.9519632010080765e-06,
"loss": 0.6296,
"step": 429
},
{
"epoch": 0.4809843400447427,
"grad_norm": 0.3872782588005066,
"learning_rate": 4.9516716861261575e-06,
"loss": 0.6307,
"step": 430
},
{
"epoch": 0.48210290827740493,
"grad_norm": 0.4066009223461151,
"learning_rate": 4.951379298010951e-06,
"loss": 0.6454,
"step": 431
},
{
"epoch": 0.48322147651006714,
"grad_norm": 0.38412487506866455,
"learning_rate": 4.951086036766599e-06,
"loss": 0.6254,
"step": 432
},
{
"epoch": 0.4843400447427293,
"grad_norm": 0.37819865345954895,
"learning_rate": 4.9507919024975545e-06,
"loss": 0.629,
"step": 433
},
{
"epoch": 0.4854586129753915,
"grad_norm": 0.38674691319465637,
"learning_rate": 4.950496895308582e-06,
"loss": 0.6357,
"step": 434
},
{
"epoch": 0.4865771812080537,
"grad_norm": 0.39304593205451965,
"learning_rate": 4.950201015304758e-06,
"loss": 0.6475,
"step": 435
},
{
"epoch": 0.48769574944071586,
"grad_norm": 0.381124347448349,
"learning_rate": 4.949904262591467e-06,
"loss": 0.6523,
"step": 436
},
{
"epoch": 0.4888143176733781,
"grad_norm": 0.4084749221801758,
"learning_rate": 4.949606637274408e-06,
"loss": 0.6773,
"step": 437
},
{
"epoch": 0.4899328859060403,
"grad_norm": 0.3967250883579254,
"learning_rate": 4.949308139459586e-06,
"loss": 0.6263,
"step": 438
},
{
"epoch": 0.49105145413870244,
"grad_norm": 0.39761948585510254,
"learning_rate": 4.949008769253322e-06,
"loss": 0.6273,
"step": 439
},
{
"epoch": 0.49217002237136465,
"grad_norm": 0.3865715265274048,
"learning_rate": 4.948708526762244e-06,
"loss": 0.6464,
"step": 440
},
{
"epoch": 0.49328859060402686,
"grad_norm": 0.3970697820186615,
"learning_rate": 4.948407412093292e-06,
"loss": 0.6229,
"step": 441
},
{
"epoch": 0.49440715883668906,
"grad_norm": 0.3817065954208374,
"learning_rate": 4.948105425353718e-06,
"loss": 0.6375,
"step": 442
},
{
"epoch": 0.4955257270693512,
"grad_norm": 0.3877985179424286,
"learning_rate": 4.947802566651082e-06,
"loss": 0.6389,
"step": 443
},
{
"epoch": 0.4966442953020134,
"grad_norm": 0.40800127387046814,
"learning_rate": 4.947498836093257e-06,
"loss": 0.6627,
"step": 444
},
{
"epoch": 0.49776286353467564,
"grad_norm": 0.40732380747795105,
"learning_rate": 4.947194233788423e-06,
"loss": 0.6156,
"step": 445
},
{
"epoch": 0.4988814317673378,
"grad_norm": 0.3948177695274353,
"learning_rate": 4.946888759845074e-06,
"loss": 0.6481,
"step": 446
},
{
"epoch": 0.5,
"grad_norm": 0.38517189025878906,
"learning_rate": 4.9465824143720145e-06,
"loss": 0.6224,
"step": 447
},
{
"epoch": 0.5011185682326622,
"grad_norm": 0.3713424503803253,
"learning_rate": 4.946275197478358e-06,
"loss": 0.626,
"step": 448
},
{
"epoch": 0.5022371364653244,
"grad_norm": 0.4172223210334778,
"learning_rate": 4.945967109273527e-06,
"loss": 0.6405,
"step": 449
},
{
"epoch": 0.5033557046979866,
"grad_norm": 0.4550599157810211,
"learning_rate": 4.945658149867257e-06,
"loss": 0.6103,
"step": 450
},
{
"epoch": 0.5044742729306487,
"grad_norm": 0.3938581347465515,
"learning_rate": 4.945348319369593e-06,
"loss": 0.6304,
"step": 451
},
{
"epoch": 0.5055928411633109,
"grad_norm": 0.3923262059688568,
"learning_rate": 4.94503761789089e-06,
"loss": 0.6603,
"step": 452
},
{
"epoch": 0.5067114093959731,
"grad_norm": 0.3978983163833618,
"learning_rate": 4.944726045541814e-06,
"loss": 0.6445,
"step": 453
},
{
"epoch": 0.5078299776286354,
"grad_norm": 0.4101882576942444,
"learning_rate": 4.9444136024333374e-06,
"loss": 0.6223,
"step": 454
},
{
"epoch": 0.5089485458612976,
"grad_norm": 0.4056575298309326,
"learning_rate": 4.944100288676749e-06,
"loss": 0.6343,
"step": 455
},
{
"epoch": 0.5100671140939598,
"grad_norm": 0.39720436930656433,
"learning_rate": 4.943786104383644e-06,
"loss": 0.6246,
"step": 456
},
{
"epoch": 0.5111856823266219,
"grad_norm": 0.3909725248813629,
"learning_rate": 4.943471049665925e-06,
"loss": 0.6339,
"step": 457
},
{
"epoch": 0.5123042505592841,
"grad_norm": 0.3773731291294098,
"learning_rate": 4.943155124635812e-06,
"loss": 0.6215,
"step": 458
},
{
"epoch": 0.5134228187919463,
"grad_norm": 0.4020001292228699,
"learning_rate": 4.9428383294058295e-06,
"loss": 0.6269,
"step": 459
},
{
"epoch": 0.5145413870246085,
"grad_norm": 0.3916706144809723,
"learning_rate": 4.942520664088812e-06,
"loss": 0.6233,
"step": 460
},
{
"epoch": 0.5156599552572707,
"grad_norm": 0.38717713952064514,
"learning_rate": 4.9422021287979076e-06,
"loss": 0.6216,
"step": 461
},
{
"epoch": 0.5167785234899329,
"grad_norm": 0.38645485043525696,
"learning_rate": 4.941882723646568e-06,
"loss": 0.6092,
"step": 462
},
{
"epoch": 0.5178970917225951,
"grad_norm": 0.38496407866477966,
"learning_rate": 4.9415624487485615e-06,
"loss": 0.6368,
"step": 463
},
{
"epoch": 0.5190156599552572,
"grad_norm": 0.3946744501590729,
"learning_rate": 4.941241304217962e-06,
"loss": 0.6525,
"step": 464
},
{
"epoch": 0.5201342281879194,
"grad_norm": 0.3994438648223877,
"learning_rate": 4.940919290169155e-06,
"loss": 0.6314,
"step": 465
},
{
"epoch": 0.5212527964205816,
"grad_norm": 0.3929794728755951,
"learning_rate": 4.940596406716834e-06,
"loss": 0.6148,
"step": 466
},
{
"epoch": 0.5223713646532439,
"grad_norm": 0.42620542645454407,
"learning_rate": 4.940272653976005e-06,
"loss": 0.6468,
"step": 467
},
{
"epoch": 0.5234899328859061,
"grad_norm": 0.4014374613761902,
"learning_rate": 4.9399480320619805e-06,
"loss": 0.6451,
"step": 468
},
{
"epoch": 0.5246085011185683,
"grad_norm": 0.39342424273490906,
"learning_rate": 4.939622541090384e-06,
"loss": 0.6696,
"step": 469
},
{
"epoch": 0.5257270693512305,
"grad_norm": 0.3870956301689148,
"learning_rate": 4.939296181177149e-06,
"loss": 0.6451,
"step": 470
},
{
"epoch": 0.5268456375838926,
"grad_norm": 0.39973214268684387,
"learning_rate": 4.938968952438518e-06,
"loss": 0.6254,
"step": 471
},
{
"epoch": 0.5279642058165548,
"grad_norm": 0.3956799805164337,
"learning_rate": 4.938640854991041e-06,
"loss": 0.6169,
"step": 472
},
{
"epoch": 0.529082774049217,
"grad_norm": 0.38881829380989075,
"learning_rate": 4.938311888951583e-06,
"loss": 0.5989,
"step": 473
},
{
"epoch": 0.5302013422818792,
"grad_norm": 0.392107218503952,
"learning_rate": 4.93798205443731e-06,
"loss": 0.6284,
"step": 474
},
{
"epoch": 0.5313199105145414,
"grad_norm": 0.4042797088623047,
"learning_rate": 4.937651351565707e-06,
"loss": 0.6235,
"step": 475
},
{
"epoch": 0.5324384787472036,
"grad_norm": 0.380206435918808,
"learning_rate": 4.937319780454559e-06,
"loss": 0.5894,
"step": 476
},
{
"epoch": 0.5335570469798657,
"grad_norm": 0.3989536166191101,
"learning_rate": 4.936987341221968e-06,
"loss": 0.6522,
"step": 477
},
{
"epoch": 0.5346756152125279,
"grad_norm": 0.38699498772621155,
"learning_rate": 4.9366540339863395e-06,
"loss": 0.6202,
"step": 478
},
{
"epoch": 0.5357941834451901,
"grad_norm": 0.4171985387802124,
"learning_rate": 4.936319858866391e-06,
"loss": 0.624,
"step": 479
},
{
"epoch": 0.5369127516778524,
"grad_norm": 0.3932148218154907,
"learning_rate": 4.93598481598115e-06,
"loss": 0.6215,
"step": 480
},
{
"epoch": 0.5380313199105146,
"grad_norm": 0.3934101462364197,
"learning_rate": 4.935648905449949e-06,
"loss": 0.6402,
"step": 481
},
{
"epoch": 0.5391498881431768,
"grad_norm": 0.3917444348335266,
"learning_rate": 4.935312127392434e-06,
"loss": 0.641,
"step": 482
},
{
"epoch": 0.540268456375839,
"grad_norm": 0.4036387503147125,
"learning_rate": 4.9349744819285584e-06,
"loss": 0.6405,
"step": 483
},
{
"epoch": 0.5413870246085011,
"grad_norm": 0.38611260056495667,
"learning_rate": 4.934635969178584e-06,
"loss": 0.6231,
"step": 484
},
{
"epoch": 0.5425055928411633,
"grad_norm": 0.39185649156570435,
"learning_rate": 4.9342965892630805e-06,
"loss": 0.6214,
"step": 485
},
{
"epoch": 0.5436241610738255,
"grad_norm": 0.3736090362071991,
"learning_rate": 4.933956342302929e-06,
"loss": 0.6053,
"step": 486
},
{
"epoch": 0.5447427293064877,
"grad_norm": 0.39649662375450134,
"learning_rate": 4.93361522841932e-06,
"loss": 0.6408,
"step": 487
},
{
"epoch": 0.5458612975391499,
"grad_norm": 0.3990592658519745,
"learning_rate": 4.933273247733746e-06,
"loss": 0.6081,
"step": 488
},
{
"epoch": 0.5469798657718121,
"grad_norm": 0.39177680015563965,
"learning_rate": 4.932930400368019e-06,
"loss": 0.6114,
"step": 489
},
{
"epoch": 0.5480984340044742,
"grad_norm": 0.3953116536140442,
"learning_rate": 4.9325866864442495e-06,
"loss": 0.6339,
"step": 490
},
{
"epoch": 0.5492170022371364,
"grad_norm": 0.38563409447669983,
"learning_rate": 4.932242106084864e-06,
"loss": 0.6331,
"step": 491
},
{
"epoch": 0.5503355704697986,
"grad_norm": 0.40618443489074707,
"learning_rate": 4.931896659412593e-06,
"loss": 0.6441,
"step": 492
},
{
"epoch": 0.5514541387024608,
"grad_norm": 0.4008066654205322,
"learning_rate": 4.931550346550479e-06,
"loss": 0.6243,
"step": 493
},
{
"epoch": 0.5525727069351231,
"grad_norm": 0.39776620268821716,
"learning_rate": 4.931203167621868e-06,
"loss": 0.6152,
"step": 494
},
{
"epoch": 0.5536912751677853,
"grad_norm": 0.38687410950660706,
"learning_rate": 4.930855122750421e-06,
"loss": 0.5969,
"step": 495
},
{
"epoch": 0.5548098434004475,
"grad_norm": 0.3877246081829071,
"learning_rate": 4.9305062120601035e-06,
"loss": 0.6016,
"step": 496
},
{
"epoch": 0.5559284116331096,
"grad_norm": 0.40948086977005005,
"learning_rate": 4.930156435675189e-06,
"loss": 0.6168,
"step": 497
},
{
"epoch": 0.5570469798657718,
"grad_norm": 0.404021292924881,
"learning_rate": 4.929805793720262e-06,
"loss": 0.6092,
"step": 498
},
{
"epoch": 0.558165548098434,
"grad_norm": 0.39509114623069763,
"learning_rate": 4.929454286320211e-06,
"loss": 0.6346,
"step": 499
},
{
"epoch": 0.5592841163310962,
"grad_norm": 0.39687812328338623,
"learning_rate": 4.9291019136002385e-06,
"loss": 0.639,
"step": 500
},
{
"epoch": 0.5604026845637584,
"grad_norm": 0.39210405945777893,
"learning_rate": 4.92874867568585e-06,
"loss": 0.6083,
"step": 501
},
{
"epoch": 0.5615212527964206,
"grad_norm": 0.4022452235221863,
"learning_rate": 4.928394572702862e-06,
"loss": 0.6252,
"step": 502
},
{
"epoch": 0.5626398210290827,
"grad_norm": 0.40317103266716003,
"learning_rate": 4.928039604777399e-06,
"loss": 0.614,
"step": 503
},
{
"epoch": 0.5637583892617449,
"grad_norm": 0.4097250998020172,
"learning_rate": 4.9276837720358924e-06,
"loss": 0.6218,
"step": 504
},
{
"epoch": 0.5648769574944071,
"grad_norm": 0.3927348554134369,
"learning_rate": 4.927327074605083e-06,
"loss": 0.6079,
"step": 505
},
{
"epoch": 0.5659955257270693,
"grad_norm": 0.3961605131626129,
"learning_rate": 4.9269695126120185e-06,
"loss": 0.612,
"step": 506
},
{
"epoch": 0.5671140939597316,
"grad_norm": 0.3945924639701843,
"learning_rate": 4.926611086184054e-06,
"loss": 0.6268,
"step": 507
},
{
"epoch": 0.5682326621923938,
"grad_norm": 0.39072591066360474,
"learning_rate": 4.926251795448854e-06,
"loss": 0.6176,
"step": 508
},
{
"epoch": 0.569351230425056,
"grad_norm": 0.39760643243789673,
"learning_rate": 4.9258916405343904e-06,
"loss": 0.6437,
"step": 509
},
{
"epoch": 0.5704697986577181,
"grad_norm": 0.39866000413894653,
"learning_rate": 4.925530621568942e-06,
"loss": 0.6383,
"step": 510
},
{
"epoch": 0.5715883668903803,
"grad_norm": 0.3932257294654846,
"learning_rate": 4.925168738681097e-06,
"loss": 0.6156,
"step": 511
},
{
"epoch": 0.5727069351230425,
"grad_norm": 0.39929500222206116,
"learning_rate": 4.924805991999751e-06,
"loss": 0.6069,
"step": 512
},
{
"epoch": 0.5738255033557047,
"grad_norm": 0.41192054748535156,
"learning_rate": 4.924442381654105e-06,
"loss": 0.6451,
"step": 513
},
{
"epoch": 0.5749440715883669,
"grad_norm": 0.41147273778915405,
"learning_rate": 4.92407790777367e-06,
"loss": 0.647,
"step": 514
},
{
"epoch": 0.5760626398210291,
"grad_norm": 0.4128178358078003,
"learning_rate": 4.923712570488264e-06,
"loss": 0.5909,
"step": 515
},
{
"epoch": 0.5771812080536913,
"grad_norm": 0.4081036150455475,
"learning_rate": 4.923346369928012e-06,
"loss": 0.6248,
"step": 516
},
{
"epoch": 0.5782997762863534,
"grad_norm": 0.3965778350830078,
"learning_rate": 4.922979306223347e-06,
"loss": 0.6019,
"step": 517
},
{
"epoch": 0.5794183445190156,
"grad_norm": 0.3979526162147522,
"learning_rate": 4.922611379505009e-06,
"loss": 0.6368,
"step": 518
},
{
"epoch": 0.5805369127516778,
"grad_norm": 0.38306665420532227,
"learning_rate": 4.922242589904046e-06,
"loss": 0.62,
"step": 519
},
{
"epoch": 0.5816554809843401,
"grad_norm": 0.3833399713039398,
"learning_rate": 4.921872937551814e-06,
"loss": 0.6064,
"step": 520
},
{
"epoch": 0.5827740492170023,
"grad_norm": 0.39361608028411865,
"learning_rate": 4.921502422579973e-06,
"loss": 0.6236,
"step": 521
},
{
"epoch": 0.5838926174496645,
"grad_norm": 0.39250272512435913,
"learning_rate": 4.921131045120494e-06,
"loss": 0.624,
"step": 522
},
{
"epoch": 0.5850111856823266,
"grad_norm": 0.40747684240341187,
"learning_rate": 4.920758805305654e-06,
"loss": 0.6096,
"step": 523
},
{
"epoch": 0.5861297539149888,
"grad_norm": 0.39987003803253174,
"learning_rate": 4.920385703268037e-06,
"loss": 0.6282,
"step": 524
},
{
"epoch": 0.587248322147651,
"grad_norm": 0.39122274518013,
"learning_rate": 4.920011739140532e-06,
"loss": 0.6479,
"step": 525
},
{
"epoch": 0.5883668903803132,
"grad_norm": 0.39809542894363403,
"learning_rate": 4.919636913056339e-06,
"loss": 0.6213,
"step": 526
},
{
"epoch": 0.5894854586129754,
"grad_norm": 0.39921343326568604,
"learning_rate": 4.919261225148963e-06,
"loss": 0.6118,
"step": 527
},
{
"epoch": 0.5906040268456376,
"grad_norm": 0.4086368680000305,
"learning_rate": 4.9188846755522155e-06,
"loss": 0.6214,
"step": 528
},
{
"epoch": 0.5917225950782998,
"grad_norm": 0.4066048264503479,
"learning_rate": 4.918507264400216e-06,
"loss": 0.6316,
"step": 529
},
{
"epoch": 0.5928411633109619,
"grad_norm": 0.41961807012557983,
"learning_rate": 4.91812899182739e-06,
"loss": 0.5948,
"step": 530
},
{
"epoch": 0.5939597315436241,
"grad_norm": 0.39992618560791016,
"learning_rate": 4.917749857968469e-06,
"loss": 0.6113,
"step": 531
},
{
"epoch": 0.5950782997762863,
"grad_norm": 0.41020235419273376,
"learning_rate": 4.917369862958494e-06,
"loss": 0.622,
"step": 532
},
{
"epoch": 0.5961968680089486,
"grad_norm": 0.40504705905914307,
"learning_rate": 4.916989006932811e-06,
"loss": 0.621,
"step": 533
},
{
"epoch": 0.5973154362416108,
"grad_norm": 0.3829837441444397,
"learning_rate": 4.9166072900270725e-06,
"loss": 0.5942,
"step": 534
},
{
"epoch": 0.598434004474273,
"grad_norm": 0.4082834720611572,
"learning_rate": 4.9162247123772375e-06,
"loss": 0.5923,
"step": 535
},
{
"epoch": 0.5995525727069351,
"grad_norm": 0.40038296580314636,
"learning_rate": 4.915841274119572e-06,
"loss": 0.6057,
"step": 536
},
{
"epoch": 0.6006711409395973,
"grad_norm": 0.40687569975852966,
"learning_rate": 4.91545697539065e-06,
"loss": 0.6343,
"step": 537
},
{
"epoch": 0.6017897091722595,
"grad_norm": 0.386262983083725,
"learning_rate": 4.9150718163273494e-06,
"loss": 0.6372,
"step": 538
},
{
"epoch": 0.6029082774049217,
"grad_norm": 0.39570850133895874,
"learning_rate": 4.914685797066855e-06,
"loss": 0.6157,
"step": 539
},
{
"epoch": 0.6040268456375839,
"grad_norm": 0.40055716037750244,
"learning_rate": 4.9142989177466594e-06,
"loss": 0.6141,
"step": 540
},
{
"epoch": 0.6051454138702461,
"grad_norm": 0.4038466811180115,
"learning_rate": 4.913911178504562e-06,
"loss": 0.6286,
"step": 541
},
{
"epoch": 0.6062639821029083,
"grad_norm": 0.38774847984313965,
"learning_rate": 4.913522579478664e-06,
"loss": 0.6343,
"step": 542
},
{
"epoch": 0.6073825503355704,
"grad_norm": 0.39426755905151367,
"learning_rate": 4.913133120807379e-06,
"loss": 0.6121,
"step": 543
},
{
"epoch": 0.6085011185682326,
"grad_norm": 0.4076898396015167,
"learning_rate": 4.912742802629423e-06,
"loss": 0.6273,
"step": 544
},
{
"epoch": 0.6096196868008948,
"grad_norm": 0.3859540820121765,
"learning_rate": 4.91235162508382e-06,
"loss": 0.6314,
"step": 545
},
{
"epoch": 0.610738255033557,
"grad_norm": 0.3914327621459961,
"learning_rate": 4.911959588309897e-06,
"loss": 0.6027,
"step": 546
},
{
"epoch": 0.6118568232662193,
"grad_norm": 0.3892766833305359,
"learning_rate": 4.9115666924472906e-06,
"loss": 0.5922,
"step": 547
},
{
"epoch": 0.6129753914988815,
"grad_norm": 0.3921322226524353,
"learning_rate": 4.911172937635942e-06,
"loss": 0.6066,
"step": 548
},
{
"epoch": 0.6140939597315436,
"grad_norm": 0.3972843885421753,
"learning_rate": 4.910778324016098e-06,
"loss": 0.614,
"step": 549
},
{
"epoch": 0.6152125279642058,
"grad_norm": 0.4027954638004303,
"learning_rate": 4.9103828517283105e-06,
"loss": 0.6174,
"step": 550
},
{
"epoch": 0.616331096196868,
"grad_norm": 0.40479257702827454,
"learning_rate": 4.909986520913441e-06,
"loss": 0.6114,
"step": 551
},
{
"epoch": 0.6174496644295302,
"grad_norm": 0.4246085584163666,
"learning_rate": 4.909589331712651e-06,
"loss": 0.6145,
"step": 552
},
{
"epoch": 0.6185682326621924,
"grad_norm": 0.4173775017261505,
"learning_rate": 4.909191284267413e-06,
"loss": 0.6375,
"step": 553
},
{
"epoch": 0.6196868008948546,
"grad_norm": 0.4135677218437195,
"learning_rate": 4.908792378719502e-06,
"loss": 0.6444,
"step": 554
},
{
"epoch": 0.6208053691275168,
"grad_norm": 0.40163061022758484,
"learning_rate": 4.9083926152110004e-06,
"loss": 0.6128,
"step": 555
},
{
"epoch": 0.6219239373601789,
"grad_norm": 0.41246625781059265,
"learning_rate": 4.907991993884295e-06,
"loss": 0.6229,
"step": 556
},
{
"epoch": 0.6230425055928411,
"grad_norm": 0.4114304780960083,
"learning_rate": 4.907590514882079e-06,
"loss": 0.6028,
"step": 557
},
{
"epoch": 0.6241610738255033,
"grad_norm": 0.40224742889404297,
"learning_rate": 4.90718817834735e-06,
"loss": 0.5896,
"step": 558
},
{
"epoch": 0.6252796420581656,
"grad_norm": 0.397650808095932,
"learning_rate": 4.906784984423411e-06,
"loss": 0.6309,
"step": 559
},
{
"epoch": 0.6263982102908278,
"grad_norm": 0.4087318480014801,
"learning_rate": 4.906380933253874e-06,
"loss": 0.6002,
"step": 560
},
{
"epoch": 0.62751677852349,
"grad_norm": 0.3988543450832367,
"learning_rate": 4.90597602498265e-06,
"loss": 0.6415,
"step": 561
},
{
"epoch": 0.6286353467561522,
"grad_norm": 0.38457274436950684,
"learning_rate": 4.905570259753961e-06,
"loss": 0.6105,
"step": 562
},
{
"epoch": 0.6297539149888143,
"grad_norm": 0.38756313920021057,
"learning_rate": 4.905163637712331e-06,
"loss": 0.5953,
"step": 563
},
{
"epoch": 0.6308724832214765,
"grad_norm": 0.4071662127971649,
"learning_rate": 4.90475615900259e-06,
"loss": 0.635,
"step": 564
},
{
"epoch": 0.6319910514541387,
"grad_norm": 0.4213521182537079,
"learning_rate": 4.904347823769875e-06,
"loss": 0.6141,
"step": 565
},
{
"epoch": 0.6331096196868009,
"grad_norm": 0.4104982018470764,
"learning_rate": 4.9039386321596235e-06,
"loss": 0.6235,
"step": 566
},
{
"epoch": 0.6342281879194631,
"grad_norm": 0.41318175196647644,
"learning_rate": 4.903528584317583e-06,
"loss": 0.6315,
"step": 567
},
{
"epoch": 0.6353467561521253,
"grad_norm": 0.39332863688468933,
"learning_rate": 4.903117680389802e-06,
"loss": 0.5807,
"step": 568
},
{
"epoch": 0.6364653243847874,
"grad_norm": 0.4188497066497803,
"learning_rate": 4.902705920522638e-06,
"loss": 0.6176,
"step": 569
},
{
"epoch": 0.6375838926174496,
"grad_norm": 0.41399574279785156,
"learning_rate": 4.9022933048627496e-06,
"loss": 0.6067,
"step": 570
},
{
"epoch": 0.6387024608501118,
"grad_norm": 0.4197136461734772,
"learning_rate": 4.901879833557102e-06,
"loss": 0.6182,
"step": 571
},
{
"epoch": 0.639821029082774,
"grad_norm": 0.41715213656425476,
"learning_rate": 4.9014655067529645e-06,
"loss": 0.6088,
"step": 572
},
{
"epoch": 0.6409395973154363,
"grad_norm": 0.40957003831863403,
"learning_rate": 4.901050324597912e-06,
"loss": 0.5942,
"step": 573
},
{
"epoch": 0.6420581655480985,
"grad_norm": 0.4082324206829071,
"learning_rate": 4.9006342872398235e-06,
"loss": 0.6389,
"step": 574
},
{
"epoch": 0.6431767337807607,
"grad_norm": 0.41336655616760254,
"learning_rate": 4.900217394826882e-06,
"loss": 0.6122,
"step": 575
},
{
"epoch": 0.6442953020134228,
"grad_norm": 0.40878477692604065,
"learning_rate": 4.899799647507577e-06,
"loss": 0.6372,
"step": 576
},
{
"epoch": 0.645413870246085,
"grad_norm": 0.4028140604496002,
"learning_rate": 4.899381045430701e-06,
"loss": 0.5949,
"step": 577
},
{
"epoch": 0.6465324384787472,
"grad_norm": 0.42413756251335144,
"learning_rate": 4.89896158874535e-06,
"loss": 0.6217,
"step": 578
},
{
"epoch": 0.6476510067114094,
"grad_norm": 0.4108542501926422,
"learning_rate": 4.898541277600927e-06,
"loss": 0.6283,
"step": 579
},
{
"epoch": 0.6487695749440716,
"grad_norm": 0.4101778566837311,
"learning_rate": 4.898120112147135e-06,
"loss": 0.6028,
"step": 580
},
{
"epoch": 0.6498881431767338,
"grad_norm": 0.4104720652103424,
"learning_rate": 4.897698092533988e-06,
"loss": 0.6481,
"step": 581
},
{
"epoch": 0.6510067114093959,
"grad_norm": 0.4004007577896118,
"learning_rate": 4.897275218911799e-06,
"loss": 0.6042,
"step": 582
},
{
"epoch": 0.6521252796420581,
"grad_norm": 0.4042847752571106,
"learning_rate": 4.896851491431185e-06,
"loss": 0.6076,
"step": 583
},
{
"epoch": 0.6532438478747203,
"grad_norm": 0.40685543417930603,
"learning_rate": 4.89642691024307e-06,
"loss": 0.6066,
"step": 584
},
{
"epoch": 0.6543624161073825,
"grad_norm": 0.41699886322021484,
"learning_rate": 4.896001475498682e-06,
"loss": 0.6091,
"step": 585
},
{
"epoch": 0.6554809843400448,
"grad_norm": 0.3905144929885864,
"learning_rate": 4.89557518734955e-06,
"loss": 0.6272,
"step": 586
},
{
"epoch": 0.656599552572707,
"grad_norm": 0.40033814311027527,
"learning_rate": 4.895148045947509e-06,
"loss": 0.6183,
"step": 587
},
{
"epoch": 0.6577181208053692,
"grad_norm": 0.39996397495269775,
"learning_rate": 4.894720051444698e-06,
"loss": 0.5996,
"step": 588
},
{
"epoch": 0.6588366890380313,
"grad_norm": 0.42592981457710266,
"learning_rate": 4.894291203993561e-06,
"loss": 0.6506,
"step": 589
},
{
"epoch": 0.6599552572706935,
"grad_norm": 0.40710797905921936,
"learning_rate": 4.8938615037468405e-06,
"loss": 0.6044,
"step": 590
},
{
"epoch": 0.6610738255033557,
"grad_norm": 0.427405446767807,
"learning_rate": 4.893430950857591e-06,
"loss": 0.6236,
"step": 591
},
{
"epoch": 0.6621923937360179,
"grad_norm": 0.40190666913986206,
"learning_rate": 4.892999545479163e-06,
"loss": 0.6031,
"step": 592
},
{
"epoch": 0.6633109619686801,
"grad_norm": 0.4019568860530853,
"learning_rate": 4.8925672877652155e-06,
"loss": 0.6232,
"step": 593
},
{
"epoch": 0.6644295302013423,
"grad_norm": 0.40001606941223145,
"learning_rate": 4.892134177869709e-06,
"loss": 0.6141,
"step": 594
},
{
"epoch": 0.6655480984340044,
"grad_norm": 0.40650853514671326,
"learning_rate": 4.891700215946909e-06,
"loss": 0.6011,
"step": 595
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.41019386053085327,
"learning_rate": 4.8912654021513815e-06,
"loss": 0.6262,
"step": 596
},
{
"epoch": 0.6677852348993288,
"grad_norm": 0.4064581096172333,
"learning_rate": 4.890829736638e-06,
"loss": 0.6329,
"step": 597
},
{
"epoch": 0.668903803131991,
"grad_norm": 0.4091740846633911,
"learning_rate": 4.890393219561938e-06,
"loss": 0.6193,
"step": 598
},
{
"epoch": 0.6700223713646533,
"grad_norm": 0.407552570104599,
"learning_rate": 4.889955851078674e-06,
"loss": 0.6535,
"step": 599
},
{
"epoch": 0.6711409395973155,
"grad_norm": 0.3950585126876831,
"learning_rate": 4.889517631343988e-06,
"loss": 0.6033,
"step": 600
},
{
"epoch": 0.6722595078299777,
"grad_norm": 0.39515334367752075,
"learning_rate": 4.889078560513968e-06,
"loss": 0.6006,
"step": 601
},
{
"epoch": 0.6733780760626398,
"grad_norm": 0.4188336431980133,
"learning_rate": 4.888638638744999e-06,
"loss": 0.6333,
"step": 602
},
{
"epoch": 0.674496644295302,
"grad_norm": 0.4050522446632385,
"learning_rate": 4.888197866193772e-06,
"loss": 0.6329,
"step": 603
},
{
"epoch": 0.6756152125279642,
"grad_norm": 0.39428868889808655,
"learning_rate": 4.887756243017282e-06,
"loss": 0.6007,
"step": 604
},
{
"epoch": 0.6767337807606264,
"grad_norm": 0.402410626411438,
"learning_rate": 4.887313769372823e-06,
"loss": 0.5885,
"step": 605
},
{
"epoch": 0.6778523489932886,
"grad_norm": 0.4062318205833435,
"learning_rate": 4.886870445417998e-06,
"loss": 0.6312,
"step": 606
},
{
"epoch": 0.6789709172259508,
"grad_norm": 0.4123631715774536,
"learning_rate": 4.886426271310708e-06,
"loss": 0.619,
"step": 607
},
{
"epoch": 0.680089485458613,
"grad_norm": 0.40353336930274963,
"learning_rate": 4.885981247209159e-06,
"loss": 0.6324,
"step": 608
},
{
"epoch": 0.6812080536912751,
"grad_norm": 0.40726402401924133,
"learning_rate": 4.885535373271858e-06,
"loss": 0.5819,
"step": 609
},
{
"epoch": 0.6823266219239373,
"grad_norm": 0.4142349660396576,
"learning_rate": 4.885088649657618e-06,
"loss": 0.6175,
"step": 610
},
{
"epoch": 0.6834451901565995,
"grad_norm": 0.4165160059928894,
"learning_rate": 4.884641076525549e-06,
"loss": 0.597,
"step": 611
},
{
"epoch": 0.6845637583892618,
"grad_norm": 0.4037843942642212,
"learning_rate": 4.884192654035069e-06,
"loss": 0.6183,
"step": 612
},
{
"epoch": 0.685682326621924,
"grad_norm": 0.41777777671813965,
"learning_rate": 4.883743382345898e-06,
"loss": 0.6063,
"step": 613
},
{
"epoch": 0.6868008948545862,
"grad_norm": 0.41021421551704407,
"learning_rate": 4.883293261618054e-06,
"loss": 0.6134,
"step": 614
},
{
"epoch": 0.6879194630872483,
"grad_norm": 0.4215847849845886,
"learning_rate": 4.882842292011863e-06,
"loss": 0.6458,
"step": 615
},
{
"epoch": 0.6890380313199105,
"grad_norm": 0.42801633477211,
"learning_rate": 4.882390473687949e-06,
"loss": 0.6259,
"step": 616
},
{
"epoch": 0.6901565995525727,
"grad_norm": 0.40879589319229126,
"learning_rate": 4.881937806807241e-06,
"loss": 0.6208,
"step": 617
},
{
"epoch": 0.6912751677852349,
"grad_norm": 0.39453622698783875,
"learning_rate": 4.881484291530969e-06,
"loss": 0.5966,
"step": 618
},
{
"epoch": 0.6923937360178971,
"grad_norm": 0.3992539048194885,
"learning_rate": 4.881029928020666e-06,
"loss": 0.5976,
"step": 619
},
{
"epoch": 0.6935123042505593,
"grad_norm": 0.4175397753715515,
"learning_rate": 4.880574716438166e-06,
"loss": 0.6261,
"step": 620
},
{
"epoch": 0.6946308724832215,
"grad_norm": 0.40533408522605896,
"learning_rate": 4.880118656945606e-06,
"loss": 0.5945,
"step": 621
},
{
"epoch": 0.6957494407158836,
"grad_norm": 0.4089728593826294,
"learning_rate": 4.879661749705424e-06,
"loss": 0.6226,
"step": 622
},
{
"epoch": 0.6968680089485458,
"grad_norm": 0.4341566562652588,
"learning_rate": 4.879203994880362e-06,
"loss": 0.6463,
"step": 623
},
{
"epoch": 0.697986577181208,
"grad_norm": 0.44256189465522766,
"learning_rate": 4.878745392633462e-06,
"loss": 0.653,
"step": 624
},
{
"epoch": 0.6991051454138703,
"grad_norm": 0.4098159372806549,
"learning_rate": 4.878285943128067e-06,
"loss": 0.5808,
"step": 625
},
{
"epoch": 0.7002237136465325,
"grad_norm": 0.43130752444267273,
"learning_rate": 4.8778256465278245e-06,
"loss": 0.6261,
"step": 626
},
{
"epoch": 0.7013422818791947,
"grad_norm": 0.4110218286514282,
"learning_rate": 4.877364502996682e-06,
"loss": 0.5954,
"step": 627
},
{
"epoch": 0.7024608501118568,
"grad_norm": 0.42106136679649353,
"learning_rate": 4.87690251269889e-06,
"loss": 0.6026,
"step": 628
},
{
"epoch": 0.703579418344519,
"grad_norm": 0.4233524203300476,
"learning_rate": 4.876439675798997e-06,
"loss": 0.6432,
"step": 629
},
{
"epoch": 0.7046979865771812,
"grad_norm": 0.42107197642326355,
"learning_rate": 4.87597599246186e-06,
"loss": 0.6198,
"step": 630
},
{
"epoch": 0.7058165548098434,
"grad_norm": 0.43851831555366516,
"learning_rate": 4.875511462852628e-06,
"loss": 0.6293,
"step": 631
},
{
"epoch": 0.7069351230425056,
"grad_norm": 0.41345685720443726,
"learning_rate": 4.87504608713676e-06,
"loss": 0.6178,
"step": 632
},
{
"epoch": 0.7080536912751678,
"grad_norm": 0.41011178493499756,
"learning_rate": 4.874579865480013e-06,
"loss": 0.6441,
"step": 633
},
{
"epoch": 0.70917225950783,
"grad_norm": 0.41372135281562805,
"learning_rate": 4.874112798048442e-06,
"loss": 0.6142,
"step": 634
},
{
"epoch": 0.7102908277404921,
"grad_norm": 0.41231900453567505,
"learning_rate": 4.8736448850084105e-06,
"loss": 0.6277,
"step": 635
},
{
"epoch": 0.7114093959731543,
"grad_norm": 0.4147928059101105,
"learning_rate": 4.873176126526578e-06,
"loss": 0.6197,
"step": 636
},
{
"epoch": 0.7125279642058165,
"grad_norm": 0.4046717882156372,
"learning_rate": 4.8727065227699035e-06,
"loss": 0.6138,
"step": 637
},
{
"epoch": 0.7136465324384788,
"grad_norm": 0.4150887727737427,
"learning_rate": 4.872236073905654e-06,
"loss": 0.616,
"step": 638
},
{
"epoch": 0.714765100671141,
"grad_norm": 0.41429632902145386,
"learning_rate": 4.87176478010139e-06,
"loss": 0.6153,
"step": 639
},
{
"epoch": 0.7158836689038032,
"grad_norm": 0.41153407096862793,
"learning_rate": 4.8712926415249785e-06,
"loss": 0.6171,
"step": 640
},
{
"epoch": 0.7170022371364653,
"grad_norm": 0.4178698658943176,
"learning_rate": 4.870819658344584e-06,
"loss": 0.6417,
"step": 641
},
{
"epoch": 0.7181208053691275,
"grad_norm": 0.40587952733039856,
"learning_rate": 4.870345830728675e-06,
"loss": 0.6206,
"step": 642
},
{
"epoch": 0.7192393736017897,
"grad_norm": 0.42633864283561707,
"learning_rate": 4.869871158846016e-06,
"loss": 0.6246,
"step": 643
},
{
"epoch": 0.7203579418344519,
"grad_norm": 0.41023534536361694,
"learning_rate": 4.8693956428656766e-06,
"loss": 0.601,
"step": 644
},
{
"epoch": 0.7214765100671141,
"grad_norm": 0.40645042061805725,
"learning_rate": 4.868919282957024e-06,
"loss": 0.6193,
"step": 645
},
{
"epoch": 0.7225950782997763,
"grad_norm": 0.40088531374931335,
"learning_rate": 4.86844207928973e-06,
"loss": 0.5869,
"step": 646
},
{
"epoch": 0.7237136465324385,
"grad_norm": 0.4136696755886078,
"learning_rate": 4.8679640320337625e-06,
"loss": 0.6413,
"step": 647
},
{
"epoch": 0.7248322147651006,
"grad_norm": 0.40026187896728516,
"learning_rate": 4.867485141359394e-06,
"loss": 0.6075,
"step": 648
},
{
"epoch": 0.7259507829977628,
"grad_norm": 0.40911242365837097,
"learning_rate": 4.867005407437192e-06,
"loss": 0.6411,
"step": 649
},
{
"epoch": 0.727069351230425,
"grad_norm": 0.42306697368621826,
"learning_rate": 4.866524830438029e-06,
"loss": 0.6376,
"step": 650
},
{
"epoch": 0.7281879194630873,
"grad_norm": 0.40857061743736267,
"learning_rate": 4.866043410533077e-06,
"loss": 0.6071,
"step": 651
},
{
"epoch": 0.7293064876957495,
"grad_norm": 0.41601142287254333,
"learning_rate": 4.8655611478938055e-06,
"loss": 0.6079,
"step": 652
},
{
"epoch": 0.7304250559284117,
"grad_norm": 0.40857282280921936,
"learning_rate": 4.8650780426919895e-06,
"loss": 0.6246,
"step": 653
},
{
"epoch": 0.7315436241610739,
"grad_norm": 0.4063502252101898,
"learning_rate": 4.864594095099697e-06,
"loss": 0.6105,
"step": 654
},
{
"epoch": 0.732662192393736,
"grad_norm": 0.40278729796409607,
"learning_rate": 4.864109305289303e-06,
"loss": 0.5936,
"step": 655
},
{
"epoch": 0.7337807606263982,
"grad_norm": 0.4201098382472992,
"learning_rate": 4.863623673433478e-06,
"loss": 0.6081,
"step": 656
},
{
"epoch": 0.7348993288590604,
"grad_norm": 0.40003877878189087,
"learning_rate": 4.863137199705192e-06,
"loss": 0.6085,
"step": 657
},
{
"epoch": 0.7360178970917226,
"grad_norm": 0.41234898567199707,
"learning_rate": 4.86264988427772e-06,
"loss": 0.6252,
"step": 658
},
{
"epoch": 0.7371364653243848,
"grad_norm": 0.4233507513999939,
"learning_rate": 4.862161727324632e-06,
"loss": 0.5987,
"step": 659
},
{
"epoch": 0.738255033557047,
"grad_norm": 0.4099391996860504,
"learning_rate": 4.861672729019798e-06,
"loss": 0.6293,
"step": 660
},
{
"epoch": 0.7393736017897091,
"grad_norm": 0.4255772829055786,
"learning_rate": 4.861182889537389e-06,
"loss": 0.6268,
"step": 661
},
{
"epoch": 0.7404921700223713,
"grad_norm": 0.4317517578601837,
"learning_rate": 4.860692209051877e-06,
"loss": 0.6444,
"step": 662
},
{
"epoch": 0.7416107382550335,
"grad_norm": 0.4352816939353943,
"learning_rate": 4.86020068773803e-06,
"loss": 0.6304,
"step": 663
},
{
"epoch": 0.7427293064876958,
"grad_norm": 0.3987254500389099,
"learning_rate": 4.859708325770919e-06,
"loss": 0.611,
"step": 664
},
{
"epoch": 0.743847874720358,
"grad_norm": 0.4213384985923767,
"learning_rate": 4.859215123325912e-06,
"loss": 0.6292,
"step": 665
},
{
"epoch": 0.7449664429530202,
"grad_norm": 0.4062172472476959,
"learning_rate": 4.8587210805786785e-06,
"loss": 0.6197,
"step": 666
},
{
"epoch": 0.7460850111856824,
"grad_norm": 0.41443362832069397,
"learning_rate": 4.858226197705183e-06,
"loss": 0.6414,
"step": 667
},
{
"epoch": 0.7472035794183445,
"grad_norm": 0.4183506667613983,
"learning_rate": 4.857730474881696e-06,
"loss": 0.6294,
"step": 668
},
{
"epoch": 0.7483221476510067,
"grad_norm": 0.42685073614120483,
"learning_rate": 4.857233912284781e-06,
"loss": 0.6264,
"step": 669
},
{
"epoch": 0.7494407158836689,
"grad_norm": 0.4143792390823364,
"learning_rate": 4.856736510091304e-06,
"loss": 0.6575,
"step": 670
},
{
"epoch": 0.7505592841163311,
"grad_norm": 0.4124217629432678,
"learning_rate": 4.8562382684784284e-06,
"loss": 0.6295,
"step": 671
},
{
"epoch": 0.7516778523489933,
"grad_norm": 0.4060792624950409,
"learning_rate": 4.855739187623619e-06,
"loss": 0.5983,
"step": 672
},
{
"epoch": 0.7527964205816555,
"grad_norm": 0.4100533723831177,
"learning_rate": 4.855239267704635e-06,
"loss": 0.6271,
"step": 673
},
{
"epoch": 0.7539149888143176,
"grad_norm": 0.4047471582889557,
"learning_rate": 4.854738508899538e-06,
"loss": 0.5843,
"step": 674
},
{
"epoch": 0.7550335570469798,
"grad_norm": 0.41550201177597046,
"learning_rate": 4.854236911386689e-06,
"loss": 0.6015,
"step": 675
},
{
"epoch": 0.756152125279642,
"grad_norm": 0.4035356044769287,
"learning_rate": 4.853734475344745e-06,
"loss": 0.6085,
"step": 676
},
{
"epoch": 0.7572706935123042,
"grad_norm": 0.4054676294326782,
"learning_rate": 4.853231200952665e-06,
"loss": 0.5879,
"step": 677
},
{
"epoch": 0.7583892617449665,
"grad_norm": 0.4165349304676056,
"learning_rate": 4.852727088389702e-06,
"loss": 0.6065,
"step": 678
},
{
"epoch": 0.7595078299776287,
"grad_norm": 0.41854768991470337,
"learning_rate": 4.8522221378354125e-06,
"loss": 0.6115,
"step": 679
},
{
"epoch": 0.7606263982102909,
"grad_norm": 0.4189227223396301,
"learning_rate": 4.851716349469647e-06,
"loss": 0.6174,
"step": 680
},
{
"epoch": 0.761744966442953,
"grad_norm": 0.44432833790779114,
"learning_rate": 4.851209723472559e-06,
"loss": 0.6382,
"step": 681
},
{
"epoch": 0.7628635346756152,
"grad_norm": 0.4199828803539276,
"learning_rate": 4.8507022600245954e-06,
"loss": 0.6125,
"step": 682
},
{
"epoch": 0.7639821029082774,
"grad_norm": 0.44079893827438354,
"learning_rate": 4.850193959306506e-06,
"loss": 0.6263,
"step": 683
},
{
"epoch": 0.7651006711409396,
"grad_norm": 0.41406047344207764,
"learning_rate": 4.8496848214993355e-06,
"loss": 0.5979,
"step": 684
},
{
"epoch": 0.7662192393736018,
"grad_norm": 0.43209850788116455,
"learning_rate": 4.849174846784428e-06,
"loss": 0.6451,
"step": 685
},
{
"epoch": 0.767337807606264,
"grad_norm": 0.4180072844028473,
"learning_rate": 4.848664035343425e-06,
"loss": 0.6009,
"step": 686
},
{
"epoch": 0.7684563758389261,
"grad_norm": 0.4092356860637665,
"learning_rate": 4.8481523873582685e-06,
"loss": 0.6431,
"step": 687
},
{
"epoch": 0.7695749440715883,
"grad_norm": 0.41440829634666443,
"learning_rate": 4.847639903011196e-06,
"loss": 0.6001,
"step": 688
},
{
"epoch": 0.7706935123042505,
"grad_norm": 0.4246008098125458,
"learning_rate": 4.8471265824847415e-06,
"loss": 0.6137,
"step": 689
},
{
"epoch": 0.7718120805369127,
"grad_norm": 0.4177666902542114,
"learning_rate": 4.846612425961742e-06,
"loss": 0.6026,
"step": 690
},
{
"epoch": 0.772930648769575,
"grad_norm": 0.4130840003490448,
"learning_rate": 4.846097433625327e-06,
"loss": 0.6183,
"step": 691
},
{
"epoch": 0.7740492170022372,
"grad_norm": 0.406780868768692,
"learning_rate": 4.845581605658926e-06,
"loss": 0.5992,
"step": 692
},
{
"epoch": 0.7751677852348994,
"grad_norm": 0.42086103558540344,
"learning_rate": 4.845064942246267e-06,
"loss": 0.6057,
"step": 693
},
{
"epoch": 0.7762863534675615,
"grad_norm": 0.4122505486011505,
"learning_rate": 4.844547443571374e-06,
"loss": 0.6134,
"step": 694
},
{
"epoch": 0.7774049217002237,
"grad_norm": 0.43634387850761414,
"learning_rate": 4.8440291098185686e-06,
"loss": 0.6044,
"step": 695
},
{
"epoch": 0.7785234899328859,
"grad_norm": 0.4160690903663635,
"learning_rate": 4.843509941172471e-06,
"loss": 0.6046,
"step": 696
},
{
"epoch": 0.7796420581655481,
"grad_norm": 0.41897231340408325,
"learning_rate": 4.842989937817997e-06,
"loss": 0.6186,
"step": 697
},
{
"epoch": 0.7807606263982103,
"grad_norm": 0.4187341034412384,
"learning_rate": 4.842469099940361e-06,
"loss": 0.6266,
"step": 698
},
{
"epoch": 0.7818791946308725,
"grad_norm": 0.4075968563556671,
"learning_rate": 4.841947427725076e-06,
"loss": 0.5772,
"step": 699
},
{
"epoch": 0.7829977628635347,
"grad_norm": 0.4157114028930664,
"learning_rate": 4.841424921357948e-06,
"loss": 0.5999,
"step": 700
},
{
"epoch": 0.7841163310961968,
"grad_norm": 0.4198933243751526,
"learning_rate": 4.840901581025083e-06,
"loss": 0.6273,
"step": 701
},
{
"epoch": 0.785234899328859,
"grad_norm": 0.42646607756614685,
"learning_rate": 4.840377406912887e-06,
"loss": 0.6074,
"step": 702
},
{
"epoch": 0.7863534675615212,
"grad_norm": 0.42644554376602173,
"learning_rate": 4.839852399208056e-06,
"loss": 0.5872,
"step": 703
},
{
"epoch": 0.7874720357941835,
"grad_norm": 0.43172845244407654,
"learning_rate": 4.839326558097587e-06,
"loss": 0.633,
"step": 704
},
{
"epoch": 0.7885906040268457,
"grad_norm": 0.4165332317352295,
"learning_rate": 4.838799883768775e-06,
"loss": 0.6206,
"step": 705
},
{
"epoch": 0.7897091722595079,
"grad_norm": 0.4209877550601959,
"learning_rate": 4.83827237640921e-06,
"loss": 0.6015,
"step": 706
},
{
"epoch": 0.79082774049217,
"grad_norm": 0.4267021715641022,
"learning_rate": 4.837744036206777e-06,
"loss": 0.5975,
"step": 707
},
{
"epoch": 0.7919463087248322,
"grad_norm": 0.4415457546710968,
"learning_rate": 4.837214863349662e-06,
"loss": 0.6251,
"step": 708
},
{
"epoch": 0.7930648769574944,
"grad_norm": 0.43104031682014465,
"learning_rate": 4.836684858026343e-06,
"loss": 0.6048,
"step": 709
},
{
"epoch": 0.7941834451901566,
"grad_norm": 0.41736820340156555,
"learning_rate": 4.8361540204255985e-06,
"loss": 0.5948,
"step": 710
},
{
"epoch": 0.7953020134228188,
"grad_norm": 0.4202009439468384,
"learning_rate": 4.835622350736499e-06,
"loss": 0.6099,
"step": 711
},
{
"epoch": 0.796420581655481,
"grad_norm": 0.42279568314552307,
"learning_rate": 4.8350898491484175e-06,
"loss": 0.6247,
"step": 712
},
{
"epoch": 0.7975391498881432,
"grad_norm": 0.4266239404678345,
"learning_rate": 4.8345565158510176e-06,
"loss": 0.6136,
"step": 713
},
{
"epoch": 0.7986577181208053,
"grad_norm": 0.42605841159820557,
"learning_rate": 4.83402235103426e-06,
"loss": 0.6001,
"step": 714
},
{
"epoch": 0.7997762863534675,
"grad_norm": 0.42846307158470154,
"learning_rate": 4.8334873548884055e-06,
"loss": 0.5941,
"step": 715
},
{
"epoch": 0.8008948545861297,
"grad_norm": 0.44009047746658325,
"learning_rate": 4.832951527604007e-06,
"loss": 0.622,
"step": 716
},
{
"epoch": 0.802013422818792,
"grad_norm": 0.44512951374053955,
"learning_rate": 4.8324148693719145e-06,
"loss": 0.6507,
"step": 717
},
{
"epoch": 0.8031319910514542,
"grad_norm": 0.455010324716568,
"learning_rate": 4.831877380383276e-06,
"loss": 0.6201,
"step": 718
},
{
"epoch": 0.8042505592841164,
"grad_norm": 0.43456459045410156,
"learning_rate": 4.83133906082953e-06,
"loss": 0.623,
"step": 719
},
{
"epoch": 0.8053691275167785,
"grad_norm": 0.42063653469085693,
"learning_rate": 4.830799910902418e-06,
"loss": 0.5841,
"step": 720
},
{
"epoch": 0.8064876957494407,
"grad_norm": 0.41323843598365784,
"learning_rate": 4.8302599307939725e-06,
"loss": 0.6127,
"step": 721
},
{
"epoch": 0.8076062639821029,
"grad_norm": 0.41982001066207886,
"learning_rate": 4.829719120696523e-06,
"loss": 0.6274,
"step": 722
},
{
"epoch": 0.8087248322147651,
"grad_norm": 0.43330860137939453,
"learning_rate": 4.829177480802694e-06,
"loss": 0.6416,
"step": 723
},
{
"epoch": 0.8098434004474273,
"grad_norm": 0.4351330101490021,
"learning_rate": 4.828635011305407e-06,
"loss": 0.6399,
"step": 724
},
{
"epoch": 0.8109619686800895,
"grad_norm": 0.4017598032951355,
"learning_rate": 4.828091712397878e-06,
"loss": 0.5817,
"step": 725
},
{
"epoch": 0.8120805369127517,
"grad_norm": 0.42594751715660095,
"learning_rate": 4.827547584273618e-06,
"loss": 0.6438,
"step": 726
},
{
"epoch": 0.8131991051454138,
"grad_norm": 0.409135639667511,
"learning_rate": 4.827002627126433e-06,
"loss": 0.5797,
"step": 727
},
{
"epoch": 0.814317673378076,
"grad_norm": 0.4304857850074768,
"learning_rate": 4.826456841150428e-06,
"loss": 0.6173,
"step": 728
},
{
"epoch": 0.8154362416107382,
"grad_norm": 0.446872740983963,
"learning_rate": 4.825910226539997e-06,
"loss": 0.6059,
"step": 729
},
{
"epoch": 0.8165548098434005,
"grad_norm": 0.42625290155410767,
"learning_rate": 4.8253627834898355e-06,
"loss": 0.5994,
"step": 730
},
{
"epoch": 0.8176733780760627,
"grad_norm": 0.42183029651641846,
"learning_rate": 4.824814512194929e-06,
"loss": 0.6202,
"step": 731
},
{
"epoch": 0.8187919463087249,
"grad_norm": 0.4235664904117584,
"learning_rate": 4.824265412850559e-06,
"loss": 0.6263,
"step": 732
},
{
"epoch": 0.819910514541387,
"grad_norm": 0.4118615686893463,
"learning_rate": 4.823715485652307e-06,
"loss": 0.6058,
"step": 733
},
{
"epoch": 0.8210290827740492,
"grad_norm": 0.43514224886894226,
"learning_rate": 4.823164730796042e-06,
"loss": 0.6092,
"step": 734
},
{
"epoch": 0.8221476510067114,
"grad_norm": 0.41756734251976013,
"learning_rate": 4.8226131484779325e-06,
"loss": 0.6281,
"step": 735
},
{
"epoch": 0.8232662192393736,
"grad_norm": 0.438475638628006,
"learning_rate": 4.822060738894439e-06,
"loss": 0.6122,
"step": 736
},
{
"epoch": 0.8243847874720358,
"grad_norm": 0.426792174577713,
"learning_rate": 4.821507502242321e-06,
"loss": 0.6407,
"step": 737
},
{
"epoch": 0.825503355704698,
"grad_norm": 0.42697012424468994,
"learning_rate": 4.820953438718626e-06,
"loss": 0.5996,
"step": 738
},
{
"epoch": 0.8266219239373602,
"grad_norm": 0.42373016476631165,
"learning_rate": 4.820398548520702e-06,
"loss": 0.6075,
"step": 739
},
{
"epoch": 0.8277404921700223,
"grad_norm": 0.42235615849494934,
"learning_rate": 4.81984283184619e-06,
"loss": 0.608,
"step": 740
},
{
"epoch": 0.8288590604026845,
"grad_norm": 0.41180866956710815,
"learning_rate": 4.819286288893022e-06,
"loss": 0.6127,
"step": 741
},
{
"epoch": 0.8299776286353467,
"grad_norm": 0.4207548499107361,
"learning_rate": 4.818728919859426e-06,
"loss": 0.6131,
"step": 742
},
{
"epoch": 0.831096196868009,
"grad_norm": 0.4295390546321869,
"learning_rate": 4.818170724943928e-06,
"loss": 0.629,
"step": 743
},
{
"epoch": 0.8322147651006712,
"grad_norm": 0.4099291265010834,
"learning_rate": 4.817611704345344e-06,
"loss": 0.6055,
"step": 744
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.4142029583454132,
"learning_rate": 4.817051858262785e-06,
"loss": 0.6127,
"step": 745
},
{
"epoch": 0.8344519015659956,
"grad_norm": 0.41662877798080444,
"learning_rate": 4.816491186895656e-06,
"loss": 0.6171,
"step": 746
},
{
"epoch": 0.8355704697986577,
"grad_norm": 0.4345078766345978,
"learning_rate": 4.815929690443657e-06,
"loss": 0.6091,
"step": 747
},
{
"epoch": 0.8366890380313199,
"grad_norm": 0.43010810017585754,
"learning_rate": 4.8153673691067806e-06,
"loss": 0.626,
"step": 748
},
{
"epoch": 0.8378076062639821,
"grad_norm": 0.4256346821784973,
"learning_rate": 4.814804223085313e-06,
"loss": 0.6216,
"step": 749
},
{
"epoch": 0.8389261744966443,
"grad_norm": 0.42812031507492065,
"learning_rate": 4.814240252579836e-06,
"loss": 0.6138,
"step": 750
},
{
"epoch": 0.8400447427293065,
"grad_norm": 0.41682377457618713,
"learning_rate": 4.813675457791224e-06,
"loss": 0.5783,
"step": 751
},
{
"epoch": 0.8411633109619687,
"grad_norm": 0.4257197380065918,
"learning_rate": 4.8131098389206435e-06,
"loss": 0.6006,
"step": 752
},
{
"epoch": 0.8422818791946308,
"grad_norm": 0.42872053384780884,
"learning_rate": 4.812543396169557e-06,
"loss": 0.6272,
"step": 753
},
{
"epoch": 0.843400447427293,
"grad_norm": 0.4263060986995697,
"learning_rate": 4.81197612973972e-06,
"loss": 0.6093,
"step": 754
},
{
"epoch": 0.8445190156599552,
"grad_norm": 0.4151028096675873,
"learning_rate": 4.811408039833178e-06,
"loss": 0.5773,
"step": 755
},
{
"epoch": 0.8456375838926175,
"grad_norm": 0.4382091164588928,
"learning_rate": 4.810839126652275e-06,
"loss": 0.596,
"step": 756
},
{
"epoch": 0.8467561521252797,
"grad_norm": 0.4290934205055237,
"learning_rate": 4.810269390399646e-06,
"loss": 0.5904,
"step": 757
},
{
"epoch": 0.8478747203579419,
"grad_norm": 0.4299798011779785,
"learning_rate": 4.809698831278217e-06,
"loss": 0.6449,
"step": 758
},
{
"epoch": 0.8489932885906041,
"grad_norm": 0.4188365042209625,
"learning_rate": 4.809127449491211e-06,
"loss": 0.6007,
"step": 759
},
{
"epoch": 0.8501118568232662,
"grad_norm": 0.41594186425209045,
"learning_rate": 4.808555245242141e-06,
"loss": 0.5888,
"step": 760
},
{
"epoch": 0.8512304250559284,
"grad_norm": 0.4184630513191223,
"learning_rate": 4.807982218734814e-06,
"loss": 0.6495,
"step": 761
},
{
"epoch": 0.8523489932885906,
"grad_norm": 0.418849378824234,
"learning_rate": 4.80740837017333e-06,
"loss": 0.6128,
"step": 762
},
{
"epoch": 0.8534675615212528,
"grad_norm": 0.42030590772628784,
"learning_rate": 4.8068336997620804e-06,
"loss": 0.6294,
"step": 763
},
{
"epoch": 0.854586129753915,
"grad_norm": 0.4204208552837372,
"learning_rate": 4.806258207705753e-06,
"loss": 0.6279,
"step": 764
},
{
"epoch": 0.8557046979865772,
"grad_norm": 0.41544729471206665,
"learning_rate": 4.805681894209324e-06,
"loss": 0.6235,
"step": 765
},
{
"epoch": 0.8568232662192393,
"grad_norm": 0.430301308631897,
"learning_rate": 4.805104759478065e-06,
"loss": 0.5876,
"step": 766
},
{
"epoch": 0.8579418344519015,
"grad_norm": 0.4640876054763794,
"learning_rate": 4.804526803717539e-06,
"loss": 0.6264,
"step": 767
},
{
"epoch": 0.8590604026845637,
"grad_norm": 0.43245360255241394,
"learning_rate": 4.8039480271336005e-06,
"loss": 0.5871,
"step": 768
},
{
"epoch": 0.860178970917226,
"grad_norm": 0.41617804765701294,
"learning_rate": 4.803368429932399e-06,
"loss": 0.6218,
"step": 769
},
{
"epoch": 0.8612975391498882,
"grad_norm": 0.4325237274169922,
"learning_rate": 4.8027880123203726e-06,
"loss": 0.5874,
"step": 770
},
{
"epoch": 0.8624161073825504,
"grad_norm": 0.4480580687522888,
"learning_rate": 4.802206774504255e-06,
"loss": 0.6093,
"step": 771
},
{
"epoch": 0.8635346756152126,
"grad_norm": 0.44137299060821533,
"learning_rate": 4.801624716691072e-06,
"loss": 0.6031,
"step": 772
},
{
"epoch": 0.8646532438478747,
"grad_norm": 0.4269614815711975,
"learning_rate": 4.801041839088139e-06,
"loss": 0.5963,
"step": 773
},
{
"epoch": 0.8657718120805369,
"grad_norm": 0.4120911657810211,
"learning_rate": 4.800458141903064e-06,
"loss": 0.5959,
"step": 774
},
{
"epoch": 0.8668903803131991,
"grad_norm": 0.4332381784915924,
"learning_rate": 4.799873625343747e-06,
"loss": 0.6007,
"step": 775
},
{
"epoch": 0.8680089485458613,
"grad_norm": 0.4361085295677185,
"learning_rate": 4.7992882896183825e-06,
"loss": 0.6012,
"step": 776
},
{
"epoch": 0.8691275167785235,
"grad_norm": 0.43251463770866394,
"learning_rate": 4.798702134935454e-06,
"loss": 0.5799,
"step": 777
},
{
"epoch": 0.8702460850111857,
"grad_norm": 0.4305305778980255,
"learning_rate": 4.798115161503735e-06,
"loss": 0.6068,
"step": 778
},
{
"epoch": 0.8713646532438478,
"grad_norm": 0.4410499930381775,
"learning_rate": 4.797527369532296e-06,
"loss": 0.6486,
"step": 779
},
{
"epoch": 0.87248322147651,
"grad_norm": 0.43271416425704956,
"learning_rate": 4.796938759230494e-06,
"loss": 0.6367,
"step": 780
},
{
"epoch": 0.8736017897091722,
"grad_norm": 0.430905282497406,
"learning_rate": 4.7963493308079815e-06,
"loss": 0.5753,
"step": 781
},
{
"epoch": 0.8747203579418344,
"grad_norm": 0.4054125249385834,
"learning_rate": 4.7957590844746986e-06,
"loss": 0.5806,
"step": 782
},
{
"epoch": 0.8758389261744967,
"grad_norm": 0.4322145879268646,
"learning_rate": 4.795168020440878e-06,
"loss": 0.5989,
"step": 783
},
{
"epoch": 0.8769574944071589,
"grad_norm": 0.433716744184494,
"learning_rate": 4.7945761389170464e-06,
"loss": 0.6284,
"step": 784
},
{
"epoch": 0.8780760626398211,
"grad_norm": 0.4301673471927643,
"learning_rate": 4.793983440114018e-06,
"loss": 0.6469,
"step": 785
},
{
"epoch": 0.8791946308724832,
"grad_norm": 0.4279182553291321,
"learning_rate": 4.7933899242428986e-06,
"loss": 0.6032,
"step": 786
},
{
"epoch": 0.8803131991051454,
"grad_norm": 0.4133903682231903,
"learning_rate": 4.792795591515087e-06,
"loss": 0.5745,
"step": 787
},
{
"epoch": 0.8814317673378076,
"grad_norm": 0.45094674825668335,
"learning_rate": 4.792200442142273e-06,
"loss": 0.6212,
"step": 788
},
{
"epoch": 0.8825503355704698,
"grad_norm": 0.4304371774196625,
"learning_rate": 4.7916044763364344e-06,
"loss": 0.61,
"step": 789
},
{
"epoch": 0.883668903803132,
"grad_norm": 0.4232103228569031,
"learning_rate": 4.791007694309842e-06,
"loss": 0.5942,
"step": 790
},
{
"epoch": 0.8847874720357942,
"grad_norm": 0.41822549700737,
"learning_rate": 4.790410096275057e-06,
"loss": 0.5829,
"step": 791
},
{
"epoch": 0.8859060402684564,
"grad_norm": 0.44473201036453247,
"learning_rate": 4.789811682444931e-06,
"loss": 0.6359,
"step": 792
},
{
"epoch": 0.8870246085011185,
"grad_norm": 0.42226001620292664,
"learning_rate": 4.7892124530326065e-06,
"loss": 0.5966,
"step": 793
},
{
"epoch": 0.8881431767337807,
"grad_norm": 0.4254050552845001,
"learning_rate": 4.788612408251517e-06,
"loss": 0.6211,
"step": 794
},
{
"epoch": 0.889261744966443,
"grad_norm": 0.42909836769104004,
"learning_rate": 4.788011548315383e-06,
"loss": 0.6039,
"step": 795
},
{
"epoch": 0.8903803131991052,
"grad_norm": 0.4296148419380188,
"learning_rate": 4.78740987343822e-06,
"loss": 0.6055,
"step": 796
},
{
"epoch": 0.8914988814317674,
"grad_norm": 0.4225505292415619,
"learning_rate": 4.786807383834332e-06,
"loss": 0.5947,
"step": 797
},
{
"epoch": 0.8926174496644296,
"grad_norm": 0.4271566867828369,
"learning_rate": 4.786204079718314e-06,
"loss": 0.6002,
"step": 798
},
{
"epoch": 0.8937360178970917,
"grad_norm": 0.42522063851356506,
"learning_rate": 4.785599961305048e-06,
"loss": 0.6231,
"step": 799
},
{
"epoch": 0.8948545861297539,
"grad_norm": 0.4384607970714569,
"learning_rate": 4.784995028809707e-06,
"loss": 0.6072,
"step": 800
},
{
"epoch": 0.8959731543624161,
"grad_norm": 0.4194418787956238,
"learning_rate": 4.784389282447759e-06,
"loss": 0.5979,
"step": 801
},
{
"epoch": 0.8970917225950783,
"grad_norm": 0.43825826048851013,
"learning_rate": 4.7837827224349544e-06,
"loss": 0.6256,
"step": 802
},
{
"epoch": 0.8982102908277405,
"grad_norm": 0.43381622433662415,
"learning_rate": 4.783175348987339e-06,
"loss": 0.5932,
"step": 803
},
{
"epoch": 0.8993288590604027,
"grad_norm": 0.4468502700328827,
"learning_rate": 4.7825671623212456e-06,
"loss": 0.618,
"step": 804
},
{
"epoch": 0.9004474272930649,
"grad_norm": 0.4352877140045166,
"learning_rate": 4.781958162653298e-06,
"loss": 0.5898,
"step": 805
},
{
"epoch": 0.901565995525727,
"grad_norm": 0.4242689907550812,
"learning_rate": 4.781348350200408e-06,
"loss": 0.5856,
"step": 806
},
{
"epoch": 0.9026845637583892,
"grad_norm": 0.4262087941169739,
"learning_rate": 4.780737725179778e-06,
"loss": 0.5994,
"step": 807
},
{
"epoch": 0.9038031319910514,
"grad_norm": 0.42303264141082764,
"learning_rate": 4.780126287808899e-06,
"loss": 0.6106,
"step": 808
},
{
"epoch": 0.9049217002237137,
"grad_norm": 0.43589121103286743,
"learning_rate": 4.779514038305555e-06,
"loss": 0.6251,
"step": 809
},
{
"epoch": 0.9060402684563759,
"grad_norm": 0.43768516182899475,
"learning_rate": 4.778900976887813e-06,
"loss": 0.6124,
"step": 810
},
{
"epoch": 0.9071588366890381,
"grad_norm": 0.4439849257469177,
"learning_rate": 4.778287103774033e-06,
"loss": 0.6397,
"step": 811
},
{
"epoch": 0.9082774049217002,
"grad_norm": 0.44813254475593567,
"learning_rate": 4.777672419182863e-06,
"loss": 0.6213,
"step": 812
},
{
"epoch": 0.9093959731543624,
"grad_norm": 0.4133831858634949,
"learning_rate": 4.777056923333244e-06,
"loss": 0.6138,
"step": 813
},
{
"epoch": 0.9105145413870246,
"grad_norm": 0.4255264699459076,
"learning_rate": 4.7764406164444e-06,
"loss": 0.6143,
"step": 814
},
{
"epoch": 0.9116331096196868,
"grad_norm": 0.42810630798339844,
"learning_rate": 4.775823498735845e-06,
"loss": 0.6253,
"step": 815
},
{
"epoch": 0.912751677852349,
"grad_norm": 0.42162856459617615,
"learning_rate": 4.775205570427386e-06,
"loss": 0.602,
"step": 816
},
{
"epoch": 0.9138702460850112,
"grad_norm": 0.4342280328273773,
"learning_rate": 4.7745868317391135e-06,
"loss": 0.6088,
"step": 817
},
{
"epoch": 0.9149888143176734,
"grad_norm": 0.42438629269599915,
"learning_rate": 4.773967282891411e-06,
"loss": 0.5788,
"step": 818
},
{
"epoch": 0.9161073825503355,
"grad_norm": 0.437950074672699,
"learning_rate": 4.7733469241049475e-06,
"loss": 0.6277,
"step": 819
},
{
"epoch": 0.9172259507829977,
"grad_norm": 0.4286377429962158,
"learning_rate": 4.772725755600682e-06,
"loss": 0.6024,
"step": 820
},
{
"epoch": 0.9183445190156599,
"grad_norm": 0.4317566156387329,
"learning_rate": 4.772103777599861e-06,
"loss": 0.6048,
"step": 821
},
{
"epoch": 0.9194630872483222,
"grad_norm": 0.4509202837944031,
"learning_rate": 4.771480990324021e-06,
"loss": 0.6219,
"step": 822
},
{
"epoch": 0.9205816554809844,
"grad_norm": 0.4387308955192566,
"learning_rate": 4.7708573939949845e-06,
"loss": 0.6082,
"step": 823
},
{
"epoch": 0.9217002237136466,
"grad_norm": 0.457883358001709,
"learning_rate": 4.770232988834864e-06,
"loss": 0.6112,
"step": 824
},
{
"epoch": 0.9228187919463087,
"grad_norm": 0.44200408458709717,
"learning_rate": 4.769607775066058e-06,
"loss": 0.6146,
"step": 825
},
{
"epoch": 0.9239373601789709,
"grad_norm": 0.44704675674438477,
"learning_rate": 4.768981752911256e-06,
"loss": 0.5921,
"step": 826
},
{
"epoch": 0.9250559284116331,
"grad_norm": 0.4367467164993286,
"learning_rate": 4.768354922593433e-06,
"loss": 0.6075,
"step": 827
},
{
"epoch": 0.9261744966442953,
"grad_norm": 0.4321734309196472,
"learning_rate": 4.767727284335852e-06,
"loss": 0.6041,
"step": 828
},
{
"epoch": 0.9272930648769575,
"grad_norm": 0.42991605401039124,
"learning_rate": 4.767098838362065e-06,
"loss": 0.5804,
"step": 829
},
{
"epoch": 0.9284116331096197,
"grad_norm": 0.43791651725769043,
"learning_rate": 4.766469584895912e-06,
"loss": 0.6005,
"step": 830
},
{
"epoch": 0.9295302013422819,
"grad_norm": 0.41972237825393677,
"learning_rate": 4.765839524161518e-06,
"loss": 0.582,
"step": 831
},
{
"epoch": 0.930648769574944,
"grad_norm": 0.4424271881580353,
"learning_rate": 4.765208656383299e-06,
"loss": 0.5978,
"step": 832
},
{
"epoch": 0.9317673378076062,
"grad_norm": 0.45667219161987305,
"learning_rate": 4.7645769817859554e-06,
"loss": 0.6208,
"step": 833
},
{
"epoch": 0.9328859060402684,
"grad_norm": 0.4423377811908722,
"learning_rate": 4.763944500594476e-06,
"loss": 0.6061,
"step": 834
},
{
"epoch": 0.9340044742729307,
"grad_norm": 0.4316536784172058,
"learning_rate": 4.7633112130341385e-06,
"loss": 0.6116,
"step": 835
},
{
"epoch": 0.9351230425055929,
"grad_norm": 0.4591672718524933,
"learning_rate": 4.762677119330505e-06,
"loss": 0.5729,
"step": 836
},
{
"epoch": 0.9362416107382551,
"grad_norm": 0.4469880759716034,
"learning_rate": 4.762042219709427e-06,
"loss": 0.6025,
"step": 837
},
{
"epoch": 0.9373601789709173,
"grad_norm": 0.4560692012310028,
"learning_rate": 4.761406514397042e-06,
"loss": 0.6103,
"step": 838
},
{
"epoch": 0.9384787472035794,
"grad_norm": 0.4428820013999939,
"learning_rate": 4.760770003619775e-06,
"loss": 0.6258,
"step": 839
},
{
"epoch": 0.9395973154362416,
"grad_norm": 0.44238874316215515,
"learning_rate": 4.760132687604338e-06,
"loss": 0.6032,
"step": 840
},
{
"epoch": 0.9407158836689038,
"grad_norm": 0.46432724595069885,
"learning_rate": 4.759494566577727e-06,
"loss": 0.6266,
"step": 841
},
{
"epoch": 0.941834451901566,
"grad_norm": 0.42941901087760925,
"learning_rate": 4.75885564076723e-06,
"loss": 0.5927,
"step": 842
},
{
"epoch": 0.9429530201342282,
"grad_norm": 0.43781232833862305,
"learning_rate": 4.758215910400418e-06,
"loss": 0.5967,
"step": 843
},
{
"epoch": 0.9440715883668904,
"grad_norm": 0.45641666650772095,
"learning_rate": 4.757575375705149e-06,
"loss": 0.6423,
"step": 844
},
{
"epoch": 0.9451901565995525,
"grad_norm": 0.43784114718437195,
"learning_rate": 4.756934036909567e-06,
"loss": 0.606,
"step": 845
},
{
"epoch": 0.9463087248322147,
"grad_norm": 0.4379528760910034,
"learning_rate": 4.756291894242106e-06,
"loss": 0.6201,
"step": 846
},
{
"epoch": 0.9474272930648769,
"grad_norm": 0.42831459641456604,
"learning_rate": 4.755648947931479e-06,
"loss": 0.6121,
"step": 847
},
{
"epoch": 0.9485458612975392,
"grad_norm": 0.43790462613105774,
"learning_rate": 4.7550051982066945e-06,
"loss": 0.5785,
"step": 848
},
{
"epoch": 0.9496644295302014,
"grad_norm": 0.4407269358634949,
"learning_rate": 4.75436064529704e-06,
"loss": 0.6127,
"step": 849
},
{
"epoch": 0.9507829977628636,
"grad_norm": 0.4252265393733978,
"learning_rate": 4.753715289432092e-06,
"loss": 0.6129,
"step": 850
},
{
"epoch": 0.9519015659955258,
"grad_norm": 0.4376990795135498,
"learning_rate": 4.753069130841712e-06,
"loss": 0.614,
"step": 851
},
{
"epoch": 0.9530201342281879,
"grad_norm": 0.43123552203178406,
"learning_rate": 4.752422169756048e-06,
"loss": 0.6169,
"step": 852
},
{
"epoch": 0.9541387024608501,
"grad_norm": 0.4513196349143982,
"learning_rate": 4.7517744064055345e-06,
"loss": 0.6381,
"step": 853
},
{
"epoch": 0.9552572706935123,
"grad_norm": 0.44663751125335693,
"learning_rate": 4.751125841020891e-06,
"loss": 0.605,
"step": 854
},
{
"epoch": 0.9563758389261745,
"grad_norm": 0.44196903705596924,
"learning_rate": 4.750476473833123e-06,
"loss": 0.6163,
"step": 855
},
{
"epoch": 0.9574944071588367,
"grad_norm": 0.40786847472190857,
"learning_rate": 4.74982630507352e-06,
"loss": 0.5624,
"step": 856
},
{
"epoch": 0.9586129753914989,
"grad_norm": 0.43438002467155457,
"learning_rate": 4.749175334973659e-06,
"loss": 0.6183,
"step": 857
},
{
"epoch": 0.959731543624161,
"grad_norm": 0.43120619654655457,
"learning_rate": 4.748523563765401e-06,
"loss": 0.6097,
"step": 858
},
{
"epoch": 0.9608501118568232,
"grad_norm": 0.4761989414691925,
"learning_rate": 4.747870991680895e-06,
"loss": 0.6029,
"step": 859
},
{
"epoch": 0.9619686800894854,
"grad_norm": 0.44484785199165344,
"learning_rate": 4.747217618952571e-06,
"loss": 0.5955,
"step": 860
},
{
"epoch": 0.9630872483221476,
"grad_norm": 0.4473284184932709,
"learning_rate": 4.746563445813148e-06,
"loss": 0.6367,
"step": 861
},
{
"epoch": 0.9642058165548099,
"grad_norm": 0.4486042857170105,
"learning_rate": 4.745908472495628e-06,
"loss": 0.5917,
"step": 862
},
{
"epoch": 0.9653243847874721,
"grad_norm": 0.45661088824272156,
"learning_rate": 4.745252699233298e-06,
"loss": 0.61,
"step": 863
},
{
"epoch": 0.9664429530201343,
"grad_norm": 0.4235589802265167,
"learning_rate": 4.744596126259731e-06,
"loss": 0.5887,
"step": 864
},
{
"epoch": 0.9675615212527964,
"grad_norm": 0.45463138818740845,
"learning_rate": 4.743938753808785e-06,
"loss": 0.6295,
"step": 865
},
{
"epoch": 0.9686800894854586,
"grad_norm": 0.4576405882835388,
"learning_rate": 4.743280582114601e-06,
"loss": 0.6301,
"step": 866
},
{
"epoch": 0.9697986577181208,
"grad_norm": 0.4380008280277252,
"learning_rate": 4.742621611411606e-06,
"loss": 0.619,
"step": 867
},
{
"epoch": 0.970917225950783,
"grad_norm": 0.4485333263874054,
"learning_rate": 4.7419618419345124e-06,
"loss": 0.6311,
"step": 868
},
{
"epoch": 0.9720357941834452,
"grad_norm": 0.44375064969062805,
"learning_rate": 4.741301273918314e-06,
"loss": 0.6095,
"step": 869
},
{
"epoch": 0.9731543624161074,
"grad_norm": 0.4440104365348816,
"learning_rate": 4.740639907598293e-06,
"loss": 0.6173,
"step": 870
},
{
"epoch": 0.9742729306487695,
"grad_norm": 0.4545641243457794,
"learning_rate": 4.739977743210014e-06,
"loss": 0.6046,
"step": 871
},
{
"epoch": 0.9753914988814317,
"grad_norm": 0.41214534640312195,
"learning_rate": 4.739314780989324e-06,
"loss": 0.6072,
"step": 872
},
{
"epoch": 0.9765100671140939,
"grad_norm": 0.4223095178604126,
"learning_rate": 4.738651021172357e-06,
"loss": 0.5878,
"step": 873
},
{
"epoch": 0.9776286353467561,
"grad_norm": 0.42885127663612366,
"learning_rate": 4.7379864639955304e-06,
"loss": 0.577,
"step": 874
},
{
"epoch": 0.9787472035794184,
"grad_norm": 0.42921069264411926,
"learning_rate": 4.737321109695546e-06,
"loss": 0.5844,
"step": 875
},
{
"epoch": 0.9798657718120806,
"grad_norm": 0.43462637066841125,
"learning_rate": 4.736654958509387e-06,
"loss": 0.6135,
"step": 876
},
{
"epoch": 0.9809843400447428,
"grad_norm": 0.43558555841445923,
"learning_rate": 4.735988010674324e-06,
"loss": 0.6255,
"step": 877
},
{
"epoch": 0.9821029082774049,
"grad_norm": 0.44332823157310486,
"learning_rate": 4.735320266427909e-06,
"loss": 0.6266,
"step": 878
},
{
"epoch": 0.9832214765100671,
"grad_norm": 0.4158824682235718,
"learning_rate": 4.734651726007978e-06,
"loss": 0.585,
"step": 879
},
{
"epoch": 0.9843400447427293,
"grad_norm": 0.4264974296092987,
"learning_rate": 4.733982389652652e-06,
"loss": 0.5871,
"step": 880
},
{
"epoch": 0.9854586129753915,
"grad_norm": 0.44846397638320923,
"learning_rate": 4.733312257600332e-06,
"loss": 0.6441,
"step": 881
},
{
"epoch": 0.9865771812080537,
"grad_norm": 0.46592265367507935,
"learning_rate": 4.732641330089707e-06,
"loss": 0.6326,
"step": 882
},
{
"epoch": 0.9876957494407159,
"grad_norm": 0.423447847366333,
"learning_rate": 4.731969607359747e-06,
"loss": 0.5922,
"step": 883
},
{
"epoch": 0.9888143176733781,
"grad_norm": 0.43406492471694946,
"learning_rate": 4.731297089649704e-06,
"loss": 0.6234,
"step": 884
},
{
"epoch": 0.9899328859060402,
"grad_norm": 0.443352073431015,
"learning_rate": 4.730623777199115e-06,
"loss": 0.6397,
"step": 885
},
{
"epoch": 0.9910514541387024,
"grad_norm": 0.4311400353908539,
"learning_rate": 4.7299496702478e-06,
"loss": 0.6073,
"step": 886
},
{
"epoch": 0.9921700223713646,
"grad_norm": 0.4217356741428375,
"learning_rate": 4.729274769035861e-06,
"loss": 0.6177,
"step": 887
},
{
"epoch": 0.9932885906040269,
"grad_norm": 0.45120054483413696,
"learning_rate": 4.728599073803685e-06,
"loss": 0.6181,
"step": 888
},
{
"epoch": 0.9944071588366891,
"grad_norm": 0.4567187428474426,
"learning_rate": 4.7279225847919375e-06,
"loss": 0.5839,
"step": 889
},
{
"epoch": 0.9955257270693513,
"grad_norm": 0.4429774880409241,
"learning_rate": 4.727245302241572e-06,
"loss": 0.6033,
"step": 890
},
{
"epoch": 0.9966442953020134,
"grad_norm": 0.44755569100379944,
"learning_rate": 4.726567226393821e-06,
"loss": 0.5877,
"step": 891
},
{
"epoch": 0.9977628635346756,
"grad_norm": 0.45908474922180176,
"learning_rate": 4.725888357490201e-06,
"loss": 0.6017,
"step": 892
},
{
"epoch": 0.9988814317673378,
"grad_norm": 0.4454707205295563,
"learning_rate": 4.725208695772511e-06,
"loss": 0.6007,
"step": 893
},
{
"epoch": 1.0,
"grad_norm": 0.43688732385635376,
"learning_rate": 4.7245282414828305e-06,
"loss": 0.6202,
"step": 894
}
],
"logging_steps": 1,
"max_steps": 5364,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 894,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0026878455835525e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}