{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1901,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005260389268805891,
"grad_norm": 4.7434234619140625,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.7896,
"step": 1
},
{
"epoch": 0.0010520778537611783,
"grad_norm": 4.893940448760986,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.8423,
"step": 2
},
{
"epoch": 0.0015781167806417674,
"grad_norm": 5.008203029632568,
"learning_rate": 1.5000000000000002e-07,
"loss": 1.7775,
"step": 3
},
{
"epoch": 0.0021041557075223566,
"grad_norm": 4.682094097137451,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.7132,
"step": 4
},
{
"epoch": 0.0026301946344029457,
"grad_norm": 5.076476097106934,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.7946,
"step": 5
},
{
"epoch": 0.003156233561283535,
"grad_norm": 5.164911270141602,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.7562,
"step": 6
},
{
"epoch": 0.003682272488164124,
"grad_norm": 5.532482624053955,
"learning_rate": 3.5000000000000004e-07,
"loss": 1.9173,
"step": 7
},
{
"epoch": 0.004208311415044713,
"grad_norm": 4.994466304779053,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.8048,
"step": 8
},
{
"epoch": 0.004734350341925302,
"grad_norm": 4.728099822998047,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.8313,
"step": 9
},
{
"epoch": 0.0052603892688058915,
"grad_norm": 4.757445335388184,
"learning_rate": 5.000000000000001e-07,
"loss": 1.7745,
"step": 10
},
{
"epoch": 0.005786428195686481,
"grad_norm": 4.926065444946289,
"learning_rate": 5.5e-07,
"loss": 1.8448,
"step": 11
},
{
"epoch": 0.00631246712256707,
"grad_norm": 4.987133979797363,
"learning_rate": 6.000000000000001e-07,
"loss": 1.7755,
"step": 12
},
{
"epoch": 0.006838506049447659,
"grad_norm": 4.783141613006592,
"learning_rate": 6.5e-07,
"loss": 1.7815,
"step": 13
},
{
"epoch": 0.007364544976328248,
"grad_norm": 4.668217182159424,
"learning_rate": 7.000000000000001e-07,
"loss": 1.754,
"step": 14
},
{
"epoch": 0.007890583903208837,
"grad_norm": 4.673665523529053,
"learning_rate": 7.5e-07,
"loss": 1.7608,
"step": 15
},
{
"epoch": 0.008416622830089426,
"grad_norm": 4.452486991882324,
"learning_rate": 8.000000000000001e-07,
"loss": 1.7222,
"step": 16
},
{
"epoch": 0.008942661756970016,
"grad_norm": 4.257665157318115,
"learning_rate": 8.500000000000001e-07,
"loss": 1.7556,
"step": 17
},
{
"epoch": 0.009468700683850605,
"grad_norm": 4.1270432472229,
"learning_rate": 9.000000000000001e-07,
"loss": 1.7121,
"step": 18
},
{
"epoch": 0.009994739610731194,
"grad_norm": 4.321215629577637,
"learning_rate": 9.500000000000001e-07,
"loss": 1.7584,
"step": 19
},
{
"epoch": 0.010520778537611783,
"grad_norm": 3.8703970909118652,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.6611,
"step": 20
},
{
"epoch": 0.011046817464492372,
"grad_norm": 4.07947301864624,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.7914,
"step": 21
},
{
"epoch": 0.011572856391372961,
"grad_norm": 3.9068686962127686,
"learning_rate": 1.1e-06,
"loss": 1.7848,
"step": 22
},
{
"epoch": 0.01209889531825355,
"grad_norm": 3.7697386741638184,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.6694,
"step": 23
},
{
"epoch": 0.01262493424513414,
"grad_norm": 3.795276641845703,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.759,
"step": 24
},
{
"epoch": 0.013150973172014729,
"grad_norm": 3.331472396850586,
"learning_rate": 1.25e-06,
"loss": 1.7053,
"step": 25
},
{
"epoch": 0.013677012098895318,
"grad_norm": 3.381592035293579,
"learning_rate": 1.3e-06,
"loss": 1.683,
"step": 26
},
{
"epoch": 0.014203051025775907,
"grad_norm": 3.2494184970855713,
"learning_rate": 1.3500000000000002e-06,
"loss": 1.5756,
"step": 27
},
{
"epoch": 0.014729089952656496,
"grad_norm": 3.124213695526123,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.7102,
"step": 28
},
{
"epoch": 0.015255128879537085,
"grad_norm": 2.9148762226104736,
"learning_rate": 1.45e-06,
"loss": 1.6007,
"step": 29
},
{
"epoch": 0.015781167806417674,
"grad_norm": 2.886734962463379,
"learning_rate": 1.5e-06,
"loss": 1.7086,
"step": 30
},
{
"epoch": 0.016307206733298264,
"grad_norm": 2.6898605823516846,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.5655,
"step": 31
},
{
"epoch": 0.016833245660178853,
"grad_norm": 2.6458981037139893,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.4881,
"step": 32
},
{
"epoch": 0.017359284587059442,
"grad_norm": 2.481387138366699,
"learning_rate": 1.6500000000000003e-06,
"loss": 1.5608,
"step": 33
},
{
"epoch": 0.01788532351394003,
"grad_norm": 2.743023633956909,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.5705,
"step": 34
},
{
"epoch": 0.01841136244082062,
"grad_norm": 2.7273406982421875,
"learning_rate": 1.75e-06,
"loss": 1.5819,
"step": 35
},
{
"epoch": 0.01893740136770121,
"grad_norm": 2.7253308296203613,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.5201,
"step": 36
},
{
"epoch": 0.0194634402945818,
"grad_norm": 2.8794732093811035,
"learning_rate": 1.85e-06,
"loss": 1.4743,
"step": 37
},
{
"epoch": 0.019989479221462388,
"grad_norm": 2.767172336578369,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.5366,
"step": 38
},
{
"epoch": 0.020515518148342977,
"grad_norm": 2.84169864654541,
"learning_rate": 1.9500000000000004e-06,
"loss": 1.5635,
"step": 39
},
{
"epoch": 0.021041557075223566,
"grad_norm": 2.6982147693634033,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.49,
"step": 40
},
{
"epoch": 0.021567596002104155,
"grad_norm": 2.597731590270996,
"learning_rate": 2.05e-06,
"loss": 1.5189,
"step": 41
},
{
"epoch": 0.022093634928984744,
"grad_norm": 2.4286556243896484,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.4439,
"step": 42
},
{
"epoch": 0.022619673855865333,
"grad_norm": 2.6267499923706055,
"learning_rate": 2.15e-06,
"loss": 1.3522,
"step": 43
},
{
"epoch": 0.023145712782745922,
"grad_norm": 2.2576816082000732,
"learning_rate": 2.2e-06,
"loss": 1.4713,
"step": 44
},
{
"epoch": 0.02367175170962651,
"grad_norm": 2.406381368637085,
"learning_rate": 2.25e-06,
"loss": 1.47,
"step": 45
},
{
"epoch": 0.0241977906365071,
"grad_norm": 2.2341415882110596,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.4041,
"step": 46
},
{
"epoch": 0.02472382956338769,
"grad_norm": 2.5055644512176514,
"learning_rate": 2.35e-06,
"loss": 1.4321,
"step": 47
},
{
"epoch": 0.02524986849026828,
"grad_norm": 2.2131927013397217,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.3631,
"step": 48
},
{
"epoch": 0.025775907417148868,
"grad_norm": 2.3399457931518555,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.4055,
"step": 49
},
{
"epoch": 0.026301946344029457,
"grad_norm": 2.2194554805755615,
"learning_rate": 2.5e-06,
"loss": 1.3722,
"step": 50
},
{
"epoch": 0.026827985270910047,
"grad_norm": 2.196530342102051,
"learning_rate": 2.55e-06,
"loss": 1.4126,
"step": 51
},
{
"epoch": 0.027354024197790636,
"grad_norm": 2.401376485824585,
"learning_rate": 2.6e-06,
"loss": 1.4174,
"step": 52
},
{
"epoch": 0.027880063124671225,
"grad_norm": 2.2509777545928955,
"learning_rate": 2.6500000000000005e-06,
"loss": 1.3725,
"step": 53
},
{
"epoch": 0.028406102051551814,
"grad_norm": 2.2538340091705322,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.4274,
"step": 54
},
{
"epoch": 0.028932140978432403,
"grad_norm": 2.218494176864624,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.4518,
"step": 55
},
{
"epoch": 0.029458179905312992,
"grad_norm": 2.06544828414917,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.3547,
"step": 56
},
{
"epoch": 0.02998421883219358,
"grad_norm": 2.014075994491577,
"learning_rate": 2.85e-06,
"loss": 1.2274,
"step": 57
},
{
"epoch": 0.03051025775907417,
"grad_norm": 2.187418222427368,
"learning_rate": 2.9e-06,
"loss": 1.3663,
"step": 58
},
{
"epoch": 0.03103629668595476,
"grad_norm": 1.993913173675537,
"learning_rate": 2.95e-06,
"loss": 1.3357,
"step": 59
},
{
"epoch": 0.03156233561283535,
"grad_norm": 2.1067426204681396,
"learning_rate": 3e-06,
"loss": 1.3627,
"step": 60
},
{
"epoch": 0.03208837453971594,
"grad_norm": 2.0144565105438232,
"learning_rate": 3.05e-06,
"loss": 1.394,
"step": 61
},
{
"epoch": 0.03261441346659653,
"grad_norm": 2.2240288257598877,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.3657,
"step": 62
},
{
"epoch": 0.03314045239347712,
"grad_norm": 2.0080718994140625,
"learning_rate": 3.1500000000000003e-06,
"loss": 1.2954,
"step": 63
},
{
"epoch": 0.033666491320357705,
"grad_norm": 2.1592211723327637,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.363,
"step": 64
},
{
"epoch": 0.0341925302472383,
"grad_norm": 2.1390435695648193,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.3329,
"step": 65
},
{
"epoch": 0.034718569174118884,
"grad_norm": 2.309795379638672,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.3378,
"step": 66
},
{
"epoch": 0.035244608100999476,
"grad_norm": 2.0283970832824707,
"learning_rate": 3.3500000000000005e-06,
"loss": 1.2707,
"step": 67
},
{
"epoch": 0.03577064702788006,
"grad_norm": 2.3350703716278076,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.3149,
"step": 68
},
{
"epoch": 0.036296685954760655,
"grad_norm": 2.1374268531799316,
"learning_rate": 3.45e-06,
"loss": 1.3181,
"step": 69
},
{
"epoch": 0.03682272488164124,
"grad_norm": 2.1340744495391846,
"learning_rate": 3.5e-06,
"loss": 1.2968,
"step": 70
},
{
"epoch": 0.03734876380852183,
"grad_norm": 2.212939500808716,
"learning_rate": 3.5500000000000003e-06,
"loss": 1.3285,
"step": 71
},
{
"epoch": 0.03787480273540242,
"grad_norm": 2.0891077518463135,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.3142,
"step": 72
},
{
"epoch": 0.03840084166228301,
"grad_norm": 2.0146496295928955,
"learning_rate": 3.65e-06,
"loss": 1.2932,
"step": 73
},
{
"epoch": 0.0389268805891636,
"grad_norm": 2.2315266132354736,
"learning_rate": 3.7e-06,
"loss": 1.3515,
"step": 74
},
{
"epoch": 0.03945291951604419,
"grad_norm": 2.0311717987060547,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.2601,
"step": 75
},
{
"epoch": 0.039978958442924775,
"grad_norm": 1.9522899389266968,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.3521,
"step": 76
},
{
"epoch": 0.04050499736980537,
"grad_norm": 2.0501742362976074,
"learning_rate": 3.85e-06,
"loss": 1.3243,
"step": 77
},
{
"epoch": 0.041031036296685953,
"grad_norm": 2.136033535003662,
"learning_rate": 3.900000000000001e-06,
"loss": 1.3373,
"step": 78
},
{
"epoch": 0.041557075223566546,
"grad_norm": 2.328866958618164,
"learning_rate": 3.95e-06,
"loss": 1.2864,
"step": 79
},
{
"epoch": 0.04208311415044713,
"grad_norm": 2.0889344215393066,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2692,
"step": 80
},
{
"epoch": 0.042609153077327724,
"grad_norm": 2.088667631149292,
"learning_rate": 4.05e-06,
"loss": 1.2232,
"step": 81
},
{
"epoch": 0.04313519200420831,
"grad_norm": 2.0293898582458496,
"learning_rate": 4.1e-06,
"loss": 1.2505,
"step": 82
},
{
"epoch": 0.0436612309310889,
"grad_norm": 2.240025281906128,
"learning_rate": 4.15e-06,
"loss": 1.3107,
"step": 83
},
{
"epoch": 0.04418726985796949,
"grad_norm": 2.123445987701416,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.1674,
"step": 84
},
{
"epoch": 0.04471330878485008,
"grad_norm": 2.1865620613098145,
"learning_rate": 4.25e-06,
"loss": 1.3257,
"step": 85
},
{
"epoch": 0.04523934771173067,
"grad_norm": 2.1336405277252197,
"learning_rate": 4.3e-06,
"loss": 1.2968,
"step": 86
},
{
"epoch": 0.04576538663861126,
"grad_norm": 2.117763042449951,
"learning_rate": 4.350000000000001e-06,
"loss": 1.2294,
"step": 87
},
{
"epoch": 0.046291425565491845,
"grad_norm": 1.9969348907470703,
"learning_rate": 4.4e-06,
"loss": 1.2621,
"step": 88
},
{
"epoch": 0.04681746449237244,
"grad_norm": 2.24861741065979,
"learning_rate": 4.450000000000001e-06,
"loss": 1.2909,
"step": 89
},
{
"epoch": 0.04734350341925302,
"grad_norm": 2.08335542678833,
"learning_rate": 4.5e-06,
"loss": 1.2691,
"step": 90
},
{
"epoch": 0.047869542346133616,
"grad_norm": 2.1306045055389404,
"learning_rate": 4.5500000000000005e-06,
"loss": 1.3248,
"step": 91
},
{
"epoch": 0.0483955812730142,
"grad_norm": 2.2251298427581787,
"learning_rate": 4.600000000000001e-06,
"loss": 1.2391,
"step": 92
},
{
"epoch": 0.048921620199894794,
"grad_norm": 2.1604959964752197,
"learning_rate": 4.65e-06,
"loss": 1.2169,
"step": 93
},
{
"epoch": 0.04944765912677538,
"grad_norm": 2.0155038833618164,
"learning_rate": 4.7e-06,
"loss": 1.2533,
"step": 94
},
{
"epoch": 0.04997369805365597,
"grad_norm": 1.9579726457595825,
"learning_rate": 4.75e-06,
"loss": 1.2228,
"step": 95
},
{
"epoch": 0.05049973698053656,
"grad_norm": 2.129992961883545,
"learning_rate": 4.800000000000001e-06,
"loss": 1.2573,
"step": 96
},
{
"epoch": 0.05102577590741715,
"grad_norm": 2.0832459926605225,
"learning_rate": 4.85e-06,
"loss": 1.241,
"step": 97
},
{
"epoch": 0.051551814834297736,
"grad_norm": 2.278550148010254,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.2565,
"step": 98
},
{
"epoch": 0.05207785376117833,
"grad_norm": 2.0997259616851807,
"learning_rate": 4.95e-06,
"loss": 1.2445,
"step": 99
},
{
"epoch": 0.052603892688058915,
"grad_norm": 2.127976417541504,
"learning_rate": 5e-06,
"loss": 1.2605,
"step": 100
},
{
"epoch": 0.05312993161493951,
"grad_norm": 2.1200127601623535,
"learning_rate": 4.9999999034856715e-06,
"loss": 1.3057,
"step": 101
},
{
"epoch": 0.05365597054182009,
"grad_norm": 2.456881046295166,
"learning_rate": 4.999999613942694e-06,
"loss": 1.2741,
"step": 102
},
{
"epoch": 0.054182009468700686,
"grad_norm": 2.189507484436035,
"learning_rate": 4.9999991313710884e-06,
"loss": 1.2399,
"step": 103
},
{
"epoch": 0.05470804839558127,
"grad_norm": 2.258619785308838,
"learning_rate": 4.9999984557708936e-06,
"loss": 1.2161,
"step": 104
},
{
"epoch": 0.055234087322461864,
"grad_norm": 1.983225703239441,
"learning_rate": 4.999997587142161e-06,
"loss": 1.2027,
"step": 105
},
{
"epoch": 0.05576012624934245,
"grad_norm": 2.1400973796844482,
"learning_rate": 4.999996525484957e-06,
"loss": 1.2685,
"step": 106
},
{
"epoch": 0.05628616517622304,
"grad_norm": 1.9494950771331787,
"learning_rate": 4.999995270799365e-06,
"loss": 1.2604,
"step": 107
},
{
"epoch": 0.05681220410310363,
"grad_norm": 2.1203386783599854,
"learning_rate": 4.9999938230854814e-06,
"loss": 1.2345,
"step": 108
},
{
"epoch": 0.05733824302998422,
"grad_norm": 2.131884813308716,
"learning_rate": 4.999992182343417e-06,
"loss": 1.2097,
"step": 109
},
{
"epoch": 0.057864281956864806,
"grad_norm": 2.136289119720459,
"learning_rate": 4.9999903485732996e-06,
"loss": 1.2617,
"step": 110
},
{
"epoch": 0.0583903208837454,
"grad_norm": 2.025071144104004,
"learning_rate": 4.9999883217752705e-06,
"loss": 1.2004,
"step": 111
},
{
"epoch": 0.058916359810625984,
"grad_norm": 2.513960838317871,
"learning_rate": 4.999986101949486e-06,
"loss": 1.2399,
"step": 112
},
{
"epoch": 0.05944239873750658,
"grad_norm": 2.2483277320861816,
"learning_rate": 4.999983689096117e-06,
"loss": 1.2265,
"step": 113
},
{
"epoch": 0.05996843766438716,
"grad_norm": 2.0863187313079834,
"learning_rate": 4.999981083215352e-06,
"loss": 1.1969,
"step": 114
},
{
"epoch": 0.060494476591267755,
"grad_norm": 2.1240596771240234,
"learning_rate": 4.99997828430739e-06,
"loss": 1.275,
"step": 115
},
{
"epoch": 0.06102051551814834,
"grad_norm": 2.3810060024261475,
"learning_rate": 4.9999752923724465e-06,
"loss": 1.3054,
"step": 116
},
{
"epoch": 0.061546554445028934,
"grad_norm": 2.1266205310821533,
"learning_rate": 4.999972107410754e-06,
"loss": 1.1933,
"step": 117
},
{
"epoch": 0.06207259337190952,
"grad_norm": 2.039619207382202,
"learning_rate": 4.999968729422559e-06,
"loss": 1.1886,
"step": 118
},
{
"epoch": 0.0625986322987901,
"grad_norm": 2.024503707885742,
"learning_rate": 4.999965158408122e-06,
"loss": 1.2008,
"step": 119
},
{
"epoch": 0.0631246712256707,
"grad_norm": 2.058926582336426,
"learning_rate": 4.999961394367717e-06,
"loss": 1.1772,
"step": 120
},
{
"epoch": 0.06365071015255129,
"grad_norm": 1.989399790763855,
"learning_rate": 4.999957437301637e-06,
"loss": 1.1869,
"step": 121
},
{
"epoch": 0.06417674907943188,
"grad_norm": 2.0462567806243896,
"learning_rate": 4.999953287210185e-06,
"loss": 1.1944,
"step": 122
},
{
"epoch": 0.06470278800631246,
"grad_norm": 2.258549213409424,
"learning_rate": 4.999948944093683e-06,
"loss": 1.2304,
"step": 123
},
{
"epoch": 0.06522882693319305,
"grad_norm": 2.115344285964966,
"learning_rate": 4.999944407952467e-06,
"loss": 1.1901,
"step": 124
},
{
"epoch": 0.06575486586007365,
"grad_norm": 2.082406997680664,
"learning_rate": 4.999939678786886e-06,
"loss": 1.2481,
"step": 125
},
{
"epoch": 0.06628090478695424,
"grad_norm": 2.5095906257629395,
"learning_rate": 4.999934756597305e-06,
"loss": 1.2526,
"step": 126
},
{
"epoch": 0.06680694371383482,
"grad_norm": 1.989524483680725,
"learning_rate": 4.999929641384105e-06,
"loss": 1.2298,
"step": 127
},
{
"epoch": 0.06733298264071541,
"grad_norm": 2.3429722785949707,
"learning_rate": 4.999924333147681e-06,
"loss": 1.2511,
"step": 128
},
{
"epoch": 0.067859021567596,
"grad_norm": 2.064497232437134,
"learning_rate": 4.999918831888441e-06,
"loss": 1.2041,
"step": 129
},
{
"epoch": 0.0683850604944766,
"grad_norm": 2.099992513656616,
"learning_rate": 4.999913137606813e-06,
"loss": 1.2256,
"step": 130
},
{
"epoch": 0.06891109942135717,
"grad_norm": 2.188778877258301,
"learning_rate": 4.999907250303234e-06,
"loss": 1.2009,
"step": 131
},
{
"epoch": 0.06943713834823777,
"grad_norm": 2.154895067214966,
"learning_rate": 4.999901169978158e-06,
"loss": 1.273,
"step": 132
},
{
"epoch": 0.06996317727511836,
"grad_norm": 2.457084894180298,
"learning_rate": 4.999894896632058e-06,
"loss": 1.2003,
"step": 133
},
{
"epoch": 0.07048921620199895,
"grad_norm": 2.0455472469329834,
"learning_rate": 4.999888430265415e-06,
"loss": 1.1909,
"step": 134
},
{
"epoch": 0.07101525512887953,
"grad_norm": 2.3690097332000732,
"learning_rate": 4.99988177087873e-06,
"loss": 1.2414,
"step": 135
},
{
"epoch": 0.07154129405576012,
"grad_norm": 2.0194432735443115,
"learning_rate": 4.999874918472516e-06,
"loss": 1.2072,
"step": 136
},
{
"epoch": 0.07206733298264072,
"grad_norm": 2.0639989376068115,
"learning_rate": 4.999867873047303e-06,
"loss": 1.1853,
"step": 137
},
{
"epoch": 0.07259337190952131,
"grad_norm": 2.1263129711151123,
"learning_rate": 4.999860634603635e-06,
"loss": 1.1915,
"step": 138
},
{
"epoch": 0.07311941083640189,
"grad_norm": 1.9768770933151245,
"learning_rate": 4.99985320314207e-06,
"loss": 1.1623,
"step": 139
},
{
"epoch": 0.07364544976328248,
"grad_norm": 2.4466986656188965,
"learning_rate": 4.9998455786631835e-06,
"loss": 1.2549,
"step": 140
},
{
"epoch": 0.07417148869016307,
"grad_norm": 2.482954263687134,
"learning_rate": 4.999837761167563e-06,
"loss": 1.1503,
"step": 141
},
{
"epoch": 0.07469752761704367,
"grad_norm": 2.1949164867401123,
"learning_rate": 4.9998297506558116e-06,
"loss": 1.2515,
"step": 142
},
{
"epoch": 0.07522356654392424,
"grad_norm": 2.3435401916503906,
"learning_rate": 4.9998215471285486e-06,
"loss": 1.2231,
"step": 143
},
{
"epoch": 0.07574960547080484,
"grad_norm": 2.2442994117736816,
"learning_rate": 4.9998131505864064e-06,
"loss": 1.2472,
"step": 144
},
{
"epoch": 0.07627564439768543,
"grad_norm": 2.4117157459259033,
"learning_rate": 4.999804561030036e-06,
"loss": 1.2303,
"step": 145
},
{
"epoch": 0.07680168332456602,
"grad_norm": 2.263303279876709,
"learning_rate": 4.999795778460097e-06,
"loss": 1.2435,
"step": 146
},
{
"epoch": 0.0773277222514466,
"grad_norm": 2.174962282180786,
"learning_rate": 4.99978680287727e-06,
"loss": 1.2074,
"step": 147
},
{
"epoch": 0.0778537611783272,
"grad_norm": 2.1498875617980957,
"learning_rate": 4.999777634282248e-06,
"loss": 1.1665,
"step": 148
},
{
"epoch": 0.07837980010520779,
"grad_norm": 2.0245747566223145,
"learning_rate": 4.999768272675737e-06,
"loss": 1.169,
"step": 149
},
{
"epoch": 0.07890583903208838,
"grad_norm": 2.03243350982666,
"learning_rate": 4.999758718058462e-06,
"loss": 1.2113,
"step": 150
},
{
"epoch": 0.07943187795896896,
"grad_norm": 2.104052782058716,
"learning_rate": 4.9997489704311586e-06,
"loss": 1.1792,
"step": 151
},
{
"epoch": 0.07995791688584955,
"grad_norm": 2.16056752204895,
"learning_rate": 4.999739029794581e-06,
"loss": 1.2183,
"step": 152
},
{
"epoch": 0.08048395581273014,
"grad_norm": 2.1418581008911133,
"learning_rate": 4.9997288961494975e-06,
"loss": 1.2024,
"step": 153
},
{
"epoch": 0.08100999473961074,
"grad_norm": 2.235917329788208,
"learning_rate": 4.999718569496688e-06,
"loss": 1.2234,
"step": 154
},
{
"epoch": 0.08153603366649131,
"grad_norm": 2.0039474964141846,
"learning_rate": 4.999708049836952e-06,
"loss": 1.1164,
"step": 155
},
{
"epoch": 0.08206207259337191,
"grad_norm": 2.0888242721557617,
"learning_rate": 4.9996973371710995e-06,
"loss": 1.1935,
"step": 156
},
{
"epoch": 0.0825881115202525,
"grad_norm": 2.245558500289917,
"learning_rate": 4.999686431499961e-06,
"loss": 1.1438,
"step": 157
},
{
"epoch": 0.08311415044713309,
"grad_norm": 2.351905345916748,
"learning_rate": 4.999675332824376e-06,
"loss": 1.2208,
"step": 158
},
{
"epoch": 0.08364018937401367,
"grad_norm": 2.0418808460235596,
"learning_rate": 4.999664041145201e-06,
"loss": 1.1537,
"step": 159
},
{
"epoch": 0.08416622830089426,
"grad_norm": 2.194399118423462,
"learning_rate": 4.99965255646331e-06,
"loss": 1.1602,
"step": 160
},
{
"epoch": 0.08469226722777486,
"grad_norm": 2.4853098392486572,
"learning_rate": 4.999640878779588e-06,
"loss": 1.1981,
"step": 161
},
{
"epoch": 0.08521830615465545,
"grad_norm": 2.1702558994293213,
"learning_rate": 4.9996290080949386e-06,
"loss": 1.1682,
"step": 162
},
{
"epoch": 0.08574434508153603,
"grad_norm": 2.150707960128784,
"learning_rate": 4.999616944410276e-06,
"loss": 1.2123,
"step": 163
},
{
"epoch": 0.08627038400841662,
"grad_norm": 2.166897773742676,
"learning_rate": 4.9996046877265325e-06,
"loss": 1.1855,
"step": 164
},
{
"epoch": 0.08679642293529721,
"grad_norm": 2.1538188457489014,
"learning_rate": 4.999592238044655e-06,
"loss": 1.1797,
"step": 165
},
{
"epoch": 0.0873224618621778,
"grad_norm": 2.222170114517212,
"learning_rate": 4.999579595365604e-06,
"loss": 1.1606,
"step": 166
},
{
"epoch": 0.08784850078905838,
"grad_norm": 2.264437437057495,
"learning_rate": 4.999566759690356e-06,
"loss": 1.1662,
"step": 167
},
{
"epoch": 0.08837453971593898,
"grad_norm": 2.2306337356567383,
"learning_rate": 4.999553731019903e-06,
"loss": 1.1933,
"step": 168
},
{
"epoch": 0.08890057864281957,
"grad_norm": 2.2025609016418457,
"learning_rate": 4.9995405093552495e-06,
"loss": 1.2241,
"step": 169
},
{
"epoch": 0.08942661756970016,
"grad_norm": 2.3908772468566895,
"learning_rate": 4.999527094697418e-06,
"loss": 1.1954,
"step": 170
},
{
"epoch": 0.08995265649658074,
"grad_norm": 2.1161653995513916,
"learning_rate": 4.999513487047442e-06,
"loss": 1.2315,
"step": 171
},
{
"epoch": 0.09047869542346133,
"grad_norm": 2.0984017848968506,
"learning_rate": 4.9994996864063735e-06,
"loss": 1.2413,
"step": 172
},
{
"epoch": 0.09100473435034193,
"grad_norm": 2.205087900161743,
"learning_rate": 4.999485692775279e-06,
"loss": 1.2267,
"step": 173
},
{
"epoch": 0.09153077327722252,
"grad_norm": 2.224553108215332,
"learning_rate": 4.9994715061552365e-06,
"loss": 1.1613,
"step": 174
},
{
"epoch": 0.0920568122041031,
"grad_norm": 2.191676139831543,
"learning_rate": 4.999457126547344e-06,
"loss": 1.168,
"step": 175
},
{
"epoch": 0.09258285113098369,
"grad_norm": 2.2432751655578613,
"learning_rate": 4.99944255395271e-06,
"loss": 1.218,
"step": 176
},
{
"epoch": 0.09310889005786428,
"grad_norm": 2.1327083110809326,
"learning_rate": 4.999427788372461e-06,
"loss": 1.1994,
"step": 177
},
{
"epoch": 0.09363492898474488,
"grad_norm": 2.146256923675537,
"learning_rate": 4.999412829807735e-06,
"loss": 1.1387,
"step": 178
},
{
"epoch": 0.09416096791162545,
"grad_norm": 2.377356767654419,
"learning_rate": 4.999397678259689e-06,
"loss": 1.1901,
"step": 179
},
{
"epoch": 0.09468700683850605,
"grad_norm": 2.192535638809204,
"learning_rate": 4.999382333729492e-06,
"loss": 1.2079,
"step": 180
},
{
"epoch": 0.09521304576538664,
"grad_norm": 2.0958621501922607,
"learning_rate": 4.999366796218329e-06,
"loss": 1.1663,
"step": 181
},
{
"epoch": 0.09573908469226723,
"grad_norm": 2.1492772102355957,
"learning_rate": 4.9993510657274e-06,
"loss": 1.1877,
"step": 182
},
{
"epoch": 0.09626512361914781,
"grad_norm": 2.366111993789673,
"learning_rate": 4.999335142257919e-06,
"loss": 1.1849,
"step": 183
},
{
"epoch": 0.0967911625460284,
"grad_norm": 2.144526243209839,
"learning_rate": 4.999319025811116e-06,
"loss": 1.1739,
"step": 184
},
{
"epoch": 0.097317201472909,
"grad_norm": 2.3407647609710693,
"learning_rate": 4.999302716388234e-06,
"loss": 1.1987,
"step": 185
},
{
"epoch": 0.09784324039978959,
"grad_norm": 2.3771328926086426,
"learning_rate": 4.999286213990534e-06,
"loss": 1.2024,
"step": 186
},
{
"epoch": 0.09836927932667017,
"grad_norm": 2.2484753131866455,
"learning_rate": 4.99926951861929e-06,
"loss": 1.2087,
"step": 187
},
{
"epoch": 0.09889531825355076,
"grad_norm": 2.276099681854248,
"learning_rate": 4.99925263027579e-06,
"loss": 1.1696,
"step": 188
},
{
"epoch": 0.09942135718043135,
"grad_norm": 2.1576876640319824,
"learning_rate": 4.999235548961338e-06,
"loss": 1.1404,
"step": 189
},
{
"epoch": 0.09994739610731194,
"grad_norm": 2.1412558555603027,
"learning_rate": 4.999218274677254e-06,
"loss": 1.1279,
"step": 190
},
{
"epoch": 0.10047343503419252,
"grad_norm": 2.1507153511047363,
"learning_rate": 4.999200807424871e-06,
"loss": 1.1841,
"step": 191
},
{
"epoch": 0.10099947396107312,
"grad_norm": 2.236116886138916,
"learning_rate": 4.999183147205538e-06,
"loss": 1.208,
"step": 192
},
{
"epoch": 0.10152551288795371,
"grad_norm": 2.1643691062927246,
"learning_rate": 4.9991652940206185e-06,
"loss": 1.1325,
"step": 193
},
{
"epoch": 0.1020515518148343,
"grad_norm": 2.11639142036438,
"learning_rate": 4.999147247871491e-06,
"loss": 1.2073,
"step": 194
},
{
"epoch": 0.10257759074171488,
"grad_norm": 1.9682193994522095,
"learning_rate": 4.9991290087595475e-06,
"loss": 1.1447,
"step": 195
},
{
"epoch": 0.10310362966859547,
"grad_norm": 1.9927830696105957,
"learning_rate": 4.9991105766861996e-06,
"loss": 1.1694,
"step": 196
},
{
"epoch": 0.10362966859547607,
"grad_norm": 2.0124592781066895,
"learning_rate": 4.999091951652867e-06,
"loss": 1.152,
"step": 197
},
{
"epoch": 0.10415570752235666,
"grad_norm": 2.1793248653411865,
"learning_rate": 4.99907313366099e-06,
"loss": 1.228,
"step": 198
},
{
"epoch": 0.10468174644923724,
"grad_norm": 2.1615028381347656,
"learning_rate": 4.99905412271202e-06,
"loss": 1.2106,
"step": 199
},
{
"epoch": 0.10520778537611783,
"grad_norm": 1.9827650785446167,
"learning_rate": 4.999034918807425e-06,
"loss": 1.1829,
"step": 200
},
{
"epoch": 0.10573382430299842,
"grad_norm": 2.1772680282592773,
"learning_rate": 4.999015521948689e-06,
"loss": 1.13,
"step": 201
},
{
"epoch": 0.10625986322987901,
"grad_norm": 2.257385492324829,
"learning_rate": 4.99899593213731e-06,
"loss": 1.2144,
"step": 202
},
{
"epoch": 0.1067859021567596,
"grad_norm": 2.104809045791626,
"learning_rate": 4.998976149374799e-06,
"loss": 1.1715,
"step": 203
},
{
"epoch": 0.10731194108364019,
"grad_norm": 2.116504430770874,
"learning_rate": 4.998956173662683e-06,
"loss": 1.1442,
"step": 204
},
{
"epoch": 0.10783798001052078,
"grad_norm": 2.2018845081329346,
"learning_rate": 4.998936005002507e-06,
"loss": 1.1327,
"step": 205
},
{
"epoch": 0.10836401893740137,
"grad_norm": 2.2733311653137207,
"learning_rate": 4.998915643395826e-06,
"loss": 1.1821,
"step": 206
},
{
"epoch": 0.10889005786428195,
"grad_norm": 2.0005805492401123,
"learning_rate": 4.998895088844212e-06,
"loss": 1.0955,
"step": 207
},
{
"epoch": 0.10941609679116254,
"grad_norm": 2.0851638317108154,
"learning_rate": 4.998874341349253e-06,
"loss": 1.1851,
"step": 208
},
{
"epoch": 0.10994213571804314,
"grad_norm": 2.032989501953125,
"learning_rate": 4.998853400912552e-06,
"loss": 1.1069,
"step": 209
},
{
"epoch": 0.11046817464492373,
"grad_norm": 2.295994520187378,
"learning_rate": 4.9988322675357235e-06,
"loss": 1.1511,
"step": 210
},
{
"epoch": 0.1109942135718043,
"grad_norm": 1.9963881969451904,
"learning_rate": 4.9988109412204015e-06,
"loss": 1.1497,
"step": 211
},
{
"epoch": 0.1115202524986849,
"grad_norm": 2.6223835945129395,
"learning_rate": 4.998789421968231e-06,
"loss": 1.1692,
"step": 212
},
{
"epoch": 0.11204629142556549,
"grad_norm": 2.1924188137054443,
"learning_rate": 4.998767709780873e-06,
"loss": 1.1659,
"step": 213
},
{
"epoch": 0.11257233035244608,
"grad_norm": 2.4124836921691895,
"learning_rate": 4.998745804660005e-06,
"loss": 1.1965,
"step": 214
},
{
"epoch": 0.11309836927932668,
"grad_norm": 2.15348482131958,
"learning_rate": 4.99872370660732e-06,
"loss": 1.1337,
"step": 215
},
{
"epoch": 0.11362440820620726,
"grad_norm": 2.3462562561035156,
"learning_rate": 4.9987014156245215e-06,
"loss": 1.1793,
"step": 216
},
{
"epoch": 0.11415044713308785,
"grad_norm": 2.1864969730377197,
"learning_rate": 4.998678931713331e-06,
"loss": 1.1139,
"step": 217
},
{
"epoch": 0.11467648605996844,
"grad_norm": 2.1411378383636475,
"learning_rate": 4.998656254875486e-06,
"loss": 1.1582,
"step": 218
},
{
"epoch": 0.11520252498684903,
"grad_norm": 2.2826247215270996,
"learning_rate": 4.998633385112737e-06,
"loss": 1.1779,
"step": 219
},
{
"epoch": 0.11572856391372961,
"grad_norm": 2.0697169303894043,
"learning_rate": 4.998610322426848e-06,
"loss": 1.1775,
"step": 220
},
{
"epoch": 0.1162546028406102,
"grad_norm": 2.153381824493408,
"learning_rate": 4.998587066819602e-06,
"loss": 1.2244,
"step": 221
},
{
"epoch": 0.1167806417674908,
"grad_norm": 2.151595115661621,
"learning_rate": 4.998563618292793e-06,
"loss": 1.1562,
"step": 222
},
{
"epoch": 0.11730668069437139,
"grad_norm": 2.1102607250213623,
"learning_rate": 4.998539976848233e-06,
"loss": 1.1326,
"step": 223
},
{
"epoch": 0.11783271962125197,
"grad_norm": 2.3099205493927,
"learning_rate": 4.998516142487746e-06,
"loss": 1.1934,
"step": 224
},
{
"epoch": 0.11835875854813256,
"grad_norm": 2.0830485820770264,
"learning_rate": 4.998492115213173e-06,
"loss": 1.105,
"step": 225
},
{
"epoch": 0.11888479747501315,
"grad_norm": 1.965256929397583,
"learning_rate": 4.998467895026369e-06,
"loss": 1.1496,
"step": 226
},
{
"epoch": 0.11941083640189375,
"grad_norm": 2.060734272003174,
"learning_rate": 4.9984434819292036e-06,
"loss": 1.1256,
"step": 227
},
{
"epoch": 0.11993687532877433,
"grad_norm": 2.278106927871704,
"learning_rate": 4.998418875923563e-06,
"loss": 1.1557,
"step": 228
},
{
"epoch": 0.12046291425565492,
"grad_norm": 2.562490463256836,
"learning_rate": 4.998394077011346e-06,
"loss": 1.1579,
"step": 229
},
{
"epoch": 0.12098895318253551,
"grad_norm": 2.20798921585083,
"learning_rate": 4.998369085194468e-06,
"loss": 1.181,
"step": 230
},
{
"epoch": 0.1215149921094161,
"grad_norm": 2.3529961109161377,
"learning_rate": 4.998343900474858e-06,
"loss": 1.1514,
"step": 231
},
{
"epoch": 0.12204103103629668,
"grad_norm": 2.2413651943206787,
"learning_rate": 4.998318522854461e-06,
"loss": 1.1317,
"step": 232
},
{
"epoch": 0.12256706996317727,
"grad_norm": 2.2179031372070312,
"learning_rate": 4.998292952335236e-06,
"loss": 1.1784,
"step": 233
},
{
"epoch": 0.12309310889005787,
"grad_norm": 2.2591211795806885,
"learning_rate": 4.998267188919158e-06,
"loss": 1.1587,
"step": 234
},
{
"epoch": 0.12361914781693846,
"grad_norm": 2.4820573329925537,
"learning_rate": 4.998241232608216e-06,
"loss": 1.1448,
"step": 235
},
{
"epoch": 0.12414518674381904,
"grad_norm": 2.202066659927368,
"learning_rate": 4.998215083404414e-06,
"loss": 1.1859,
"step": 236
},
{
"epoch": 0.12467122567069963,
"grad_norm": 2.246918201446533,
"learning_rate": 4.9981887413097705e-06,
"loss": 1.1778,
"step": 237
},
{
"epoch": 0.1251972645975802,
"grad_norm": 2.166926145553589,
"learning_rate": 4.9981622063263205e-06,
"loss": 1.16,
"step": 238
},
{
"epoch": 0.12572330352446082,
"grad_norm": 2.2850661277770996,
"learning_rate": 4.998135478456112e-06,
"loss": 1.1522,
"step": 239
},
{
"epoch": 0.1262493424513414,
"grad_norm": 2.1694653034210205,
"learning_rate": 4.9981085577012095e-06,
"loss": 1.1394,
"step": 240
},
{
"epoch": 0.126775381378222,
"grad_norm": 2.061791181564331,
"learning_rate": 4.998081444063691e-06,
"loss": 1.1551,
"step": 241
},
{
"epoch": 0.12730142030510258,
"grad_norm": 2.1517114639282227,
"learning_rate": 4.998054137545649e-06,
"loss": 1.1487,
"step": 242
},
{
"epoch": 0.12782745923198316,
"grad_norm": 2.118903398513794,
"learning_rate": 4.9980266381491935e-06,
"loss": 1.1871,
"step": 243
},
{
"epoch": 0.12835349815886377,
"grad_norm": 2.271512508392334,
"learning_rate": 4.997998945876448e-06,
"loss": 1.21,
"step": 244
},
{
"epoch": 0.12887953708574434,
"grad_norm": 2.199542760848999,
"learning_rate": 4.997971060729549e-06,
"loss": 1.17,
"step": 245
},
{
"epoch": 0.12940557601262492,
"grad_norm": 2.213566303253174,
"learning_rate": 4.997942982710651e-06,
"loss": 1.1521,
"step": 246
},
{
"epoch": 0.12993161493950553,
"grad_norm": 2.291456699371338,
"learning_rate": 4.997914711821921e-06,
"loss": 1.1671,
"step": 247
},
{
"epoch": 0.1304576538663861,
"grad_norm": 2.017871856689453,
"learning_rate": 4.997886248065542e-06,
"loss": 1.1522,
"step": 248
},
{
"epoch": 0.13098369279326671,
"grad_norm": 2.1125521659851074,
"learning_rate": 4.9978575914437115e-06,
"loss": 1.1335,
"step": 249
},
{
"epoch": 0.1315097317201473,
"grad_norm": 2.262874126434326,
"learning_rate": 4.997828741958643e-06,
"loss": 1.1697,
"step": 250
},
{
"epoch": 0.13203577064702787,
"grad_norm": 2.450192451477051,
"learning_rate": 4.997799699612563e-06,
"loss": 1.1329,
"step": 251
},
{
"epoch": 0.13256180957390848,
"grad_norm": 2.0831351280212402,
"learning_rate": 4.997770464407715e-06,
"loss": 1.1711,
"step": 252
},
{
"epoch": 0.13308784850078906,
"grad_norm": 2.2078895568847656,
"learning_rate": 4.997741036346357e-06,
"loss": 1.1998,
"step": 253
},
{
"epoch": 0.13361388742766964,
"grad_norm": 2.175858497619629,
"learning_rate": 4.997711415430759e-06,
"loss": 1.1083,
"step": 254
},
{
"epoch": 0.13413992635455024,
"grad_norm": 2.203817129135132,
"learning_rate": 4.997681601663207e-06,
"loss": 1.088,
"step": 255
},
{
"epoch": 0.13466596528143082,
"grad_norm": 2.0065557956695557,
"learning_rate": 4.997651595046007e-06,
"loss": 1.1584,
"step": 256
},
{
"epoch": 0.13519200420831143,
"grad_norm": 2.299633264541626,
"learning_rate": 4.997621395581474e-06,
"loss": 1.2102,
"step": 257
},
{
"epoch": 0.135718043135192,
"grad_norm": 2.2972707748413086,
"learning_rate": 4.997591003271938e-06,
"loss": 1.1821,
"step": 258
},
{
"epoch": 0.13624408206207259,
"grad_norm": 2.399705171585083,
"learning_rate": 4.997560418119749e-06,
"loss": 1.1325,
"step": 259
},
{
"epoch": 0.1367701209889532,
"grad_norm": 2.2461678981781006,
"learning_rate": 4.997529640127266e-06,
"loss": 1.2361,
"step": 260
},
{
"epoch": 0.13729615991583377,
"grad_norm": 2.236917495727539,
"learning_rate": 4.997498669296865e-06,
"loss": 1.1159,
"step": 261
},
{
"epoch": 0.13782219884271435,
"grad_norm": 2.2851338386535645,
"learning_rate": 4.99746750563094e-06,
"loss": 1.1688,
"step": 262
},
{
"epoch": 0.13834823776959496,
"grad_norm": 2.1499626636505127,
"learning_rate": 4.997436149131894e-06,
"loss": 1.1478,
"step": 263
},
{
"epoch": 0.13887427669647553,
"grad_norm": 2.0969858169555664,
"learning_rate": 4.997404599802151e-06,
"loss": 1.1102,
"step": 264
},
{
"epoch": 0.13940031562335614,
"grad_norm": 2.5635933876037598,
"learning_rate": 4.997372857644146e-06,
"loss": 1.1173,
"step": 265
},
{
"epoch": 0.13992635455023672,
"grad_norm": 2.1076197624206543,
"learning_rate": 4.997340922660329e-06,
"loss": 1.1321,
"step": 266
},
{
"epoch": 0.1404523934771173,
"grad_norm": 2.179189443588257,
"learning_rate": 4.997308794853165e-06,
"loss": 1.1325,
"step": 267
},
{
"epoch": 0.1409784324039979,
"grad_norm": 2.0838067531585693,
"learning_rate": 4.9972764742251375e-06,
"loss": 1.1243,
"step": 268
},
{
"epoch": 0.14150447133087848,
"grad_norm": 2.1462979316711426,
"learning_rate": 4.9972439607787405e-06,
"loss": 1.1251,
"step": 269
},
{
"epoch": 0.14203051025775906,
"grad_norm": 2.144658088684082,
"learning_rate": 4.997211254516484e-06,
"loss": 1.1879,
"step": 270
},
{
"epoch": 0.14255654918463967,
"grad_norm": 2.118098020553589,
"learning_rate": 4.997178355440892e-06,
"loss": 1.1635,
"step": 271
},
{
"epoch": 0.14308258811152025,
"grad_norm": 2.284640312194824,
"learning_rate": 4.99714526355451e-06,
"loss": 1.1181,
"step": 272
},
{
"epoch": 0.14360862703840085,
"grad_norm": 2.2020652294158936,
"learning_rate": 4.997111978859886e-06,
"loss": 1.1234,
"step": 273
},
{
"epoch": 0.14413466596528143,
"grad_norm": 2.164998769760132,
"learning_rate": 4.997078501359595e-06,
"loss": 1.1723,
"step": 274
},
{
"epoch": 0.144660704892162,
"grad_norm": 2.1917877197265625,
"learning_rate": 4.9970448310562196e-06,
"loss": 1.1222,
"step": 275
},
{
"epoch": 0.14518674381904262,
"grad_norm": 2.314770221710205,
"learning_rate": 4.99701096795236e-06,
"loss": 1.183,
"step": 276
},
{
"epoch": 0.1457127827459232,
"grad_norm": 2.217176675796509,
"learning_rate": 4.996976912050632e-06,
"loss": 1.1509,
"step": 277
},
{
"epoch": 0.14623882167280378,
"grad_norm": 2.253232002258301,
"learning_rate": 4.996942663353663e-06,
"loss": 1.1733,
"step": 278
},
{
"epoch": 0.14676486059968438,
"grad_norm": 2.091414213180542,
"learning_rate": 4.996908221864099e-06,
"loss": 1.1479,
"step": 279
},
{
"epoch": 0.14729089952656496,
"grad_norm": 2.391035556793213,
"learning_rate": 4.996873587584599e-06,
"loss": 1.1646,
"step": 280
},
{
"epoch": 0.14781693845344557,
"grad_norm": 1.941179871559143,
"learning_rate": 4.996838760517836e-06,
"loss": 1.1362,
"step": 281
},
{
"epoch": 0.14834297738032615,
"grad_norm": 2.3869614601135254,
"learning_rate": 4.9968037406665e-06,
"loss": 1.1455,
"step": 282
},
{
"epoch": 0.14886901630720673,
"grad_norm": 2.2253477573394775,
"learning_rate": 4.9967685280332955e-06,
"loss": 1.1934,
"step": 283
},
{
"epoch": 0.14939505523408733,
"grad_norm": 2.235481023788452,
"learning_rate": 4.99673312262094e-06,
"loss": 1.1457,
"step": 284
},
{
"epoch": 0.1499210941609679,
"grad_norm": 2.1756770610809326,
"learning_rate": 4.996697524432169e-06,
"loss": 1.1874,
"step": 285
},
{
"epoch": 0.1504471330878485,
"grad_norm": 1.9890838861465454,
"learning_rate": 4.99666173346973e-06,
"loss": 1.1381,
"step": 286
},
{
"epoch": 0.1509731720147291,
"grad_norm": 2.032940149307251,
"learning_rate": 4.996625749736386e-06,
"loss": 1.1408,
"step": 287
},
{
"epoch": 0.15149921094160967,
"grad_norm": 2.38653564453125,
"learning_rate": 4.996589573234915e-06,
"loss": 1.1137,
"step": 288
},
{
"epoch": 0.15202524986849028,
"grad_norm": 2.5009000301361084,
"learning_rate": 4.9965532039681116e-06,
"loss": 1.1404,
"step": 289
},
{
"epoch": 0.15255128879537086,
"grad_norm": 2.113600969314575,
"learning_rate": 4.996516641938784e-06,
"loss": 1.0764,
"step": 290
},
{
"epoch": 0.15307732772225144,
"grad_norm": 2.2645368576049805,
"learning_rate": 4.996479887149754e-06,
"loss": 1.1499,
"step": 291
},
{
"epoch": 0.15360336664913204,
"grad_norm": 2.015124559402466,
"learning_rate": 4.99644293960386e-06,
"loss": 1.0487,
"step": 292
},
{
"epoch": 0.15412940557601262,
"grad_norm": 2.121588706970215,
"learning_rate": 4.996405799303955e-06,
"loss": 1.1119,
"step": 293
},
{
"epoch": 0.1546554445028932,
"grad_norm": 2.3707003593444824,
"learning_rate": 4.996368466252907e-06,
"loss": 1.1797,
"step": 294
},
{
"epoch": 0.1551814834297738,
"grad_norm": 2.3027000427246094,
"learning_rate": 4.996330940453598e-06,
"loss": 1.1228,
"step": 295
},
{
"epoch": 0.1557075223566544,
"grad_norm": 2.0909178256988525,
"learning_rate": 4.996293221908925e-06,
"loss": 1.0932,
"step": 296
},
{
"epoch": 0.156233561283535,
"grad_norm": 2.362823486328125,
"learning_rate": 4.996255310621801e-06,
"loss": 1.1507,
"step": 297
},
{
"epoch": 0.15675960021041557,
"grad_norm": 2.080667495727539,
"learning_rate": 4.996217206595153e-06,
"loss": 1.1158,
"step": 298
},
{
"epoch": 0.15728563913729615,
"grad_norm": 2.0508742332458496,
"learning_rate": 4.996178909831922e-06,
"loss": 1.1326,
"step": 299
},
{
"epoch": 0.15781167806417676,
"grad_norm": 2.1632707118988037,
"learning_rate": 4.996140420335068e-06,
"loss": 1.0946,
"step": 300
},
{
"epoch": 0.15833771699105734,
"grad_norm": 1.9084789752960205,
"learning_rate": 4.996101738107559e-06,
"loss": 1.0939,
"step": 301
},
{
"epoch": 0.15886375591793792,
"grad_norm": 1.9817906618118286,
"learning_rate": 4.996062863152385e-06,
"loss": 1.1013,
"step": 302
},
{
"epoch": 0.15938979484481852,
"grad_norm": 1.9947365522384644,
"learning_rate": 4.9960237954725446e-06,
"loss": 1.0635,
"step": 303
},
{
"epoch": 0.1599158337716991,
"grad_norm": 2.0908870697021484,
"learning_rate": 4.995984535071056e-06,
"loss": 1.0914,
"step": 304
},
{
"epoch": 0.1604418726985797,
"grad_norm": 2.1920530796051025,
"learning_rate": 4.995945081950952e-06,
"loss": 1.1816,
"step": 305
},
{
"epoch": 0.16096791162546029,
"grad_norm": 2.250007152557373,
"learning_rate": 4.995905436115276e-06,
"loss": 1.1543,
"step": 306
},
{
"epoch": 0.16149395055234086,
"grad_norm": 2.3157906532287598,
"learning_rate": 4.995865597567091e-06,
"loss": 1.1349,
"step": 307
},
{
"epoch": 0.16201998947922147,
"grad_norm": 2.816443681716919,
"learning_rate": 4.995825566309471e-06,
"loss": 1.1154,
"step": 308
},
{
"epoch": 0.16254602840610205,
"grad_norm": 2.3194282054901123,
"learning_rate": 4.995785342345509e-06,
"loss": 1.1547,
"step": 309
},
{
"epoch": 0.16307206733298263,
"grad_norm": 2.1249098777770996,
"learning_rate": 4.99574492567831e-06,
"loss": 1.0995,
"step": 310
},
{
"epoch": 0.16359810625986324,
"grad_norm": 2.100315809249878,
"learning_rate": 4.995704316310994e-06,
"loss": 1.1662,
"step": 311
},
{
"epoch": 0.16412414518674381,
"grad_norm": 2.1664323806762695,
"learning_rate": 4.995663514246697e-06,
"loss": 1.1466,
"step": 312
},
{
"epoch": 0.16465018411362442,
"grad_norm": 2.217438220977783,
"learning_rate": 4.9956225194885704e-06,
"loss": 1.1908,
"step": 313
},
{
"epoch": 0.165176223040505,
"grad_norm": 2.3328514099121094,
"learning_rate": 4.995581332039778e-06,
"loss": 1.0809,
"step": 314
},
{
"epoch": 0.16570226196738558,
"grad_norm": 2.088467836380005,
"learning_rate": 4.9955399519035e-06,
"loss": 1.0908,
"step": 315
},
{
"epoch": 0.16622830089426618,
"grad_norm": 2.2554612159729004,
"learning_rate": 4.995498379082932e-06,
"loss": 1.1702,
"step": 316
},
{
"epoch": 0.16675433982114676,
"grad_norm": 2.2798142433166504,
"learning_rate": 4.995456613581284e-06,
"loss": 1.107,
"step": 317
},
{
"epoch": 0.16728037874802734,
"grad_norm": 2.4394755363464355,
"learning_rate": 4.9954146554017816e-06,
"loss": 1.0881,
"step": 318
},
{
"epoch": 0.16780641767490795,
"grad_norm": 2.1176295280456543,
"learning_rate": 4.995372504547662e-06,
"loss": 1.1177,
"step": 319
},
{
"epoch": 0.16833245660178853,
"grad_norm": 2.141923189163208,
"learning_rate": 4.995330161022181e-06,
"loss": 1.1321,
"step": 320
},
{
"epoch": 0.16885849552866913,
"grad_norm": 2.273068428039551,
"learning_rate": 4.9952876248286086e-06,
"loss": 1.1832,
"step": 321
},
{
"epoch": 0.1693845344555497,
"grad_norm": 2.267636299133301,
"learning_rate": 4.995244895970228e-06,
"loss": 1.1058,
"step": 322
},
{
"epoch": 0.1699105733824303,
"grad_norm": 2.133772850036621,
"learning_rate": 4.99520197445034e-06,
"loss": 1.1478,
"step": 323
},
{
"epoch": 0.1704366123093109,
"grad_norm": 2.2782862186431885,
"learning_rate": 4.995158860272257e-06,
"loss": 1.1074,
"step": 324
},
{
"epoch": 0.17096265123619148,
"grad_norm": 2.544316053390503,
"learning_rate": 4.995115553439308e-06,
"loss": 1.0583,
"step": 325
},
{
"epoch": 0.17148869016307206,
"grad_norm": 2.2900187969207764,
"learning_rate": 4.995072053954838e-06,
"loss": 1.1933,
"step": 326
},
{
"epoch": 0.17201472908995266,
"grad_norm": 2.190380811691284,
"learning_rate": 4.995028361822206e-06,
"loss": 1.135,
"step": 327
},
{
"epoch": 0.17254076801683324,
"grad_norm": 2.4495794773101807,
"learning_rate": 4.9949844770447834e-06,
"loss": 1.1214,
"step": 328
},
{
"epoch": 0.17306680694371385,
"grad_norm": 2.332644462585449,
"learning_rate": 4.994940399625959e-06,
"loss": 1.1017,
"step": 329
},
{
"epoch": 0.17359284587059443,
"grad_norm": 2.0709457397460938,
"learning_rate": 4.994896129569138e-06,
"loss": 1.1073,
"step": 330
},
{
"epoch": 0.174118884797475,
"grad_norm": 2.8817923069000244,
"learning_rate": 4.994851666877736e-06,
"loss": 1.0758,
"step": 331
},
{
"epoch": 0.1746449237243556,
"grad_norm": 2.2557790279388428,
"learning_rate": 4.994807011555189e-06,
"loss": 1.173,
"step": 332
},
{
"epoch": 0.1751709626512362,
"grad_norm": 2.2412662506103516,
"learning_rate": 4.994762163604942e-06,
"loss": 1.1357,
"step": 333
},
{
"epoch": 0.17569700157811677,
"grad_norm": 2.1749277114868164,
"learning_rate": 4.9947171230304595e-06,
"loss": 1.0988,
"step": 334
},
{
"epoch": 0.17622304050499737,
"grad_norm": 2.4530062675476074,
"learning_rate": 4.994671889835218e-06,
"loss": 1.1377,
"step": 335
},
{
"epoch": 0.17674907943187795,
"grad_norm": 2.2602410316467285,
"learning_rate": 4.994626464022711e-06,
"loss": 1.0799,
"step": 336
},
{
"epoch": 0.17727511835875856,
"grad_norm": 2.0797061920166016,
"learning_rate": 4.994580845596446e-06,
"loss": 1.1214,
"step": 337
},
{
"epoch": 0.17780115728563914,
"grad_norm": 2.1437630653381348,
"learning_rate": 4.994535034559945e-06,
"loss": 1.1794,
"step": 338
},
{
"epoch": 0.17832719621251972,
"grad_norm": 2.0809285640716553,
"learning_rate": 4.994489030916745e-06,
"loss": 1.1331,
"step": 339
},
{
"epoch": 0.17885323513940032,
"grad_norm": 2.31193208694458,
"learning_rate": 4.994442834670397e-06,
"loss": 1.1425,
"step": 340
},
{
"epoch": 0.1793792740662809,
"grad_norm": 2.0348451137542725,
"learning_rate": 4.99439644582447e-06,
"loss": 1.1149,
"step": 341
},
{
"epoch": 0.17990531299316148,
"grad_norm": 2.2816810607910156,
"learning_rate": 4.994349864382544e-06,
"loss": 1.1509,
"step": 342
},
{
"epoch": 0.1804313519200421,
"grad_norm": 2.08492374420166,
"learning_rate": 4.994303090348217e-06,
"loss": 1.0854,
"step": 343
},
{
"epoch": 0.18095739084692267,
"grad_norm": 2.0389866828918457,
"learning_rate": 4.994256123725098e-06,
"loss": 1.1195,
"step": 344
},
{
"epoch": 0.18148342977380327,
"grad_norm": 2.2040510177612305,
"learning_rate": 4.9942089645168175e-06,
"loss": 1.1112,
"step": 345
},
{
"epoch": 0.18200946870068385,
"grad_norm": 2.058849811553955,
"learning_rate": 4.994161612727013e-06,
"loss": 1.1462,
"step": 346
},
{
"epoch": 0.18253550762756443,
"grad_norm": 2.2940948009490967,
"learning_rate": 4.994114068359343e-06,
"loss": 1.2183,
"step": 347
},
{
"epoch": 0.18306154655444504,
"grad_norm": 2.0303874015808105,
"learning_rate": 4.9940663314174756e-06,
"loss": 1.1136,
"step": 348
},
{
"epoch": 0.18358758548132562,
"grad_norm": 2.208289861679077,
"learning_rate": 4.9940184019051e-06,
"loss": 1.1507,
"step": 349
},
{
"epoch": 0.1841136244082062,
"grad_norm": 2.438228130340576,
"learning_rate": 4.993970279825915e-06,
"loss": 1.1619,
"step": 350
},
{
"epoch": 0.1846396633350868,
"grad_norm": 2.1701645851135254,
"learning_rate": 4.993921965183636e-06,
"loss": 1.1057,
"step": 351
},
{
"epoch": 0.18516570226196738,
"grad_norm": 2.345054864883423,
"learning_rate": 4.9938734579819944e-06,
"loss": 1.1758,
"step": 352
},
{
"epoch": 0.185691741188848,
"grad_norm": 2.3761768341064453,
"learning_rate": 4.9938247582247345e-06,
"loss": 1.1093,
"step": 353
},
{
"epoch": 0.18621778011572857,
"grad_norm": 2.2209126949310303,
"learning_rate": 4.993775865915618e-06,
"loss": 1.0882,
"step": 354
},
{
"epoch": 0.18674381904260914,
"grad_norm": 2.093406915664673,
"learning_rate": 4.993726781058419e-06,
"loss": 1.1621,
"step": 355
},
{
"epoch": 0.18726985796948975,
"grad_norm": 2.509725332260132,
"learning_rate": 4.993677503656927e-06,
"loss": 1.1411,
"step": 356
},
{
"epoch": 0.18779589689637033,
"grad_norm": 2.2245242595672607,
"learning_rate": 4.993628033714947e-06,
"loss": 1.1042,
"step": 357
},
{
"epoch": 0.1883219358232509,
"grad_norm": 1.838408350944519,
"learning_rate": 4.9935783712363e-06,
"loss": 1.0204,
"step": 358
},
{
"epoch": 0.18884797475013151,
"grad_norm": 2.0559537410736084,
"learning_rate": 4.993528516224818e-06,
"loss": 1.0681,
"step": 359
},
{
"epoch": 0.1893740136770121,
"grad_norm": 2.084890604019165,
"learning_rate": 4.993478468684352e-06,
"loss": 1.1149,
"step": 360
},
{
"epoch": 0.1899000526038927,
"grad_norm": 2.179478168487549,
"learning_rate": 4.993428228618767e-06,
"loss": 1.1342,
"step": 361
},
{
"epoch": 0.19042609153077328,
"grad_norm": 2.082578182220459,
"learning_rate": 4.99337779603194e-06,
"loss": 1.1293,
"step": 362
},
{
"epoch": 0.19095213045765386,
"grad_norm": 2.031831979751587,
"learning_rate": 4.993327170927766e-06,
"loss": 1.0728,
"step": 363
},
{
"epoch": 0.19147816938453446,
"grad_norm": 2.1939597129821777,
"learning_rate": 4.993276353310155e-06,
"loss": 1.1252,
"step": 364
},
{
"epoch": 0.19200420831141504,
"grad_norm": 2.031350612640381,
"learning_rate": 4.9932253431830295e-06,
"loss": 1.1039,
"step": 365
},
{
"epoch": 0.19253024723829562,
"grad_norm": 2.3367671966552734,
"learning_rate": 4.993174140550327e-06,
"loss": 1.1211,
"step": 366
},
{
"epoch": 0.19305628616517623,
"grad_norm": 2.2768945693969727,
"learning_rate": 4.993122745416003e-06,
"loss": 1.1119,
"step": 367
},
{
"epoch": 0.1935823250920568,
"grad_norm": 2.220766544342041,
"learning_rate": 4.993071157784025e-06,
"loss": 1.1451,
"step": 368
},
{
"epoch": 0.1941083640189374,
"grad_norm": 2.3694369792938232,
"learning_rate": 4.993019377658376e-06,
"loss": 1.1156,
"step": 369
},
{
"epoch": 0.194634402945818,
"grad_norm": 2.245237350463867,
"learning_rate": 4.9929674050430535e-06,
"loss": 1.1316,
"step": 370
},
{
"epoch": 0.19516044187269857,
"grad_norm": 2.720625400543213,
"learning_rate": 4.992915239942071e-06,
"loss": 1.1092,
"step": 371
},
{
"epoch": 0.19568648079957918,
"grad_norm": 2.115727424621582,
"learning_rate": 4.992862882359457e-06,
"loss": 1.1769,
"step": 372
},
{
"epoch": 0.19621251972645976,
"grad_norm": 2.235677480697632,
"learning_rate": 4.992810332299253e-06,
"loss": 1.1786,
"step": 373
},
{
"epoch": 0.19673855865334033,
"grad_norm": 2.539433002471924,
"learning_rate": 4.992757589765516e-06,
"loss": 1.1251,
"step": 374
},
{
"epoch": 0.19726459758022094,
"grad_norm": 5.042508602142334,
"learning_rate": 4.99270465476232e-06,
"loss": 1.0706,
"step": 375
},
{
"epoch": 0.19779063650710152,
"grad_norm": 2.1171703338623047,
"learning_rate": 4.9926515272937516e-06,
"loss": 1.1287,
"step": 376
},
{
"epoch": 0.19831667543398213,
"grad_norm": 2.4587223529815674,
"learning_rate": 4.992598207363912e-06,
"loss": 1.053,
"step": 377
},
{
"epoch": 0.1988427143608627,
"grad_norm": 2.1502695083618164,
"learning_rate": 4.9925446949769184e-06,
"loss": 1.0837,
"step": 378
},
{
"epoch": 0.19936875328774328,
"grad_norm": 2.139822483062744,
"learning_rate": 4.992490990136903e-06,
"loss": 1.1358,
"step": 379
},
{
"epoch": 0.1998947922146239,
"grad_norm": 2.4914610385894775,
"learning_rate": 4.992437092848012e-06,
"loss": 1.1053,
"step": 380
},
{
"epoch": 0.20042083114150447,
"grad_norm": 2.24576735496521,
"learning_rate": 4.992383003114408e-06,
"loss": 1.1034,
"step": 381
},
{
"epoch": 0.20094687006838505,
"grad_norm": 2.1979477405548096,
"learning_rate": 4.992328720940266e-06,
"loss": 1.0839,
"step": 382
},
{
"epoch": 0.20147290899526565,
"grad_norm": 2.1680850982666016,
"learning_rate": 4.992274246329778e-06,
"loss": 1.1011,
"step": 383
},
{
"epoch": 0.20199894792214623,
"grad_norm": 2.3214027881622314,
"learning_rate": 4.9922195792871495e-06,
"loss": 1.03,
"step": 384
},
{
"epoch": 0.20252498684902684,
"grad_norm": 2.162393808364868,
"learning_rate": 4.9921647198166014e-06,
"loss": 1.0466,
"step": 385
},
{
"epoch": 0.20305102577590742,
"grad_norm": 2.184163808822632,
"learning_rate": 4.99210966792237e-06,
"loss": 1.1379,
"step": 386
},
{
"epoch": 0.203577064702788,
"grad_norm": 2.3308913707733154,
"learning_rate": 4.992054423608706e-06,
"loss": 1.1751,
"step": 387
},
{
"epoch": 0.2041031036296686,
"grad_norm": 2.123298168182373,
"learning_rate": 4.991998986879874e-06,
"loss": 1.1079,
"step": 388
},
{
"epoch": 0.20462914255654918,
"grad_norm": 2.229844331741333,
"learning_rate": 4.991943357740155e-06,
"loss": 1.1242,
"step": 389
},
{
"epoch": 0.20515518148342976,
"grad_norm": 2.1815683841705322,
"learning_rate": 4.991887536193845e-06,
"loss": 1.0949,
"step": 390
},
{
"epoch": 0.20568122041031037,
"grad_norm": 2.4636261463165283,
"learning_rate": 4.991831522245253e-06,
"loss": 1.1118,
"step": 391
},
{
"epoch": 0.20620725933719095,
"grad_norm": 2.0095014572143555,
"learning_rate": 4.991775315898703e-06,
"loss": 1.0197,
"step": 392
},
{
"epoch": 0.20673329826407155,
"grad_norm": 2.1244406700134277,
"learning_rate": 4.991718917158538e-06,
"loss": 1.1081,
"step": 393
},
{
"epoch": 0.20725933719095213,
"grad_norm": 1.9773920774459839,
"learning_rate": 4.991662326029109e-06,
"loss": 1.0657,
"step": 394
},
{
"epoch": 0.2077853761178327,
"grad_norm": 2.204554796218872,
"learning_rate": 4.9916055425147874e-06,
"loss": 1.1434,
"step": 395
},
{
"epoch": 0.20831141504471332,
"grad_norm": 2.068147659301758,
"learning_rate": 4.991548566619957e-06,
"loss": 1.1281,
"step": 396
},
{
"epoch": 0.2088374539715939,
"grad_norm": 2.1518101692199707,
"learning_rate": 4.991491398349017e-06,
"loss": 1.0977,
"step": 397
},
{
"epoch": 0.20936349289847447,
"grad_norm": 2.091654062271118,
"learning_rate": 4.991434037706382e-06,
"loss": 1.1033,
"step": 398
},
{
"epoch": 0.20988953182535508,
"grad_norm": 2.8754067420959473,
"learning_rate": 4.9913764846964805e-06,
"loss": 1.1237,
"step": 399
},
{
"epoch": 0.21041557075223566,
"grad_norm": 2.2165675163269043,
"learning_rate": 4.991318739323757e-06,
"loss": 1.1298,
"step": 400
},
{
"epoch": 0.21094160967911627,
"grad_norm": 2.1219065189361572,
"learning_rate": 4.991260801592668e-06,
"loss": 1.0795,
"step": 401
},
{
"epoch": 0.21146764860599684,
"grad_norm": 2.132737159729004,
"learning_rate": 4.9912026715076885e-06,
"loss": 1.0546,
"step": 402
},
{
"epoch": 0.21199368753287742,
"grad_norm": 2.228076457977295,
"learning_rate": 4.9911443490733075e-06,
"loss": 1.1759,
"step": 403
},
{
"epoch": 0.21251972645975803,
"grad_norm": 2.1305177211761475,
"learning_rate": 4.991085834294027e-06,
"loss": 1.0865,
"step": 404
},
{
"epoch": 0.2130457653866386,
"grad_norm": 2.1550936698913574,
"learning_rate": 4.991027127174365e-06,
"loss": 1.1027,
"step": 405
},
{
"epoch": 0.2135718043135192,
"grad_norm": 2.3489346504211426,
"learning_rate": 4.990968227718854e-06,
"loss": 1.184,
"step": 406
},
{
"epoch": 0.2140978432403998,
"grad_norm": 2.2208189964294434,
"learning_rate": 4.9909091359320434e-06,
"loss": 1.1476,
"step": 407
},
{
"epoch": 0.21462388216728037,
"grad_norm": 2.230978012084961,
"learning_rate": 4.990849851818494e-06,
"loss": 1.1125,
"step": 408
},
{
"epoch": 0.21514992109416098,
"grad_norm": 2.294647216796875,
"learning_rate": 4.990790375382784e-06,
"loss": 1.1526,
"step": 409
},
{
"epoch": 0.21567596002104156,
"grad_norm": 2.160446882247925,
"learning_rate": 4.990730706629507e-06,
"loss": 1.1569,
"step": 410
},
{
"epoch": 0.21620199894792214,
"grad_norm": 2.1352434158325195,
"learning_rate": 4.990670845563268e-06,
"loss": 1.049,
"step": 411
},
{
"epoch": 0.21672803787480274,
"grad_norm": 2.0740866661071777,
"learning_rate": 4.99061079218869e-06,
"loss": 1.104,
"step": 412
},
{
"epoch": 0.21725407680168332,
"grad_norm": 2.302877426147461,
"learning_rate": 4.990550546510408e-06,
"loss": 1.0942,
"step": 413
},
{
"epoch": 0.2177801157285639,
"grad_norm": 2.270836353302002,
"learning_rate": 4.990490108533076e-06,
"loss": 1.107,
"step": 414
},
{
"epoch": 0.2183061546554445,
"grad_norm": 2.05703067779541,
"learning_rate": 4.99042947826136e-06,
"loss": 1.1284,
"step": 415
},
{
"epoch": 0.21883219358232509,
"grad_norm": 2.3524155616760254,
"learning_rate": 4.990368655699941e-06,
"loss": 1.068,
"step": 416
},
{
"epoch": 0.2193582325092057,
"grad_norm": 2.5300350189208984,
"learning_rate": 4.9903076408535145e-06,
"loss": 1.0993,
"step": 417
},
{
"epoch": 0.21988427143608627,
"grad_norm": 2.1858162879943848,
"learning_rate": 4.990246433726793e-06,
"loss": 1.1398,
"step": 418
},
{
"epoch": 0.22041031036296685,
"grad_norm": 1.9856489896774292,
"learning_rate": 4.990185034324501e-06,
"loss": 1.0671,
"step": 419
},
{
"epoch": 0.22093634928984746,
"grad_norm": 2.177152156829834,
"learning_rate": 4.99012344265138e-06,
"loss": 1.1673,
"step": 420
},
{
"epoch": 0.22146238821672803,
"grad_norm": 2.128787040710449,
"learning_rate": 4.990061658712186e-06,
"loss": 1.1629,
"step": 421
},
{
"epoch": 0.2219884271436086,
"grad_norm": 2.1840457916259766,
"learning_rate": 4.989999682511688e-06,
"loss": 1.0739,
"step": 422
},
{
"epoch": 0.22251446607048922,
"grad_norm": 2.37825608253479,
"learning_rate": 4.989937514054673e-06,
"loss": 1.1179,
"step": 423
},
{
"epoch": 0.2230405049973698,
"grad_norm": 2.2746498584747314,
"learning_rate": 4.98987515334594e-06,
"loss": 1.1117,
"step": 424
},
{
"epoch": 0.2235665439242504,
"grad_norm": 2.441087007522583,
"learning_rate": 4.989812600390304e-06,
"loss": 1.134,
"step": 425
},
{
"epoch": 0.22409258285113098,
"grad_norm": 1.9548932313919067,
"learning_rate": 4.989749855192596e-06,
"loss": 1.0962,
"step": 426
},
{
"epoch": 0.22461862177801156,
"grad_norm": 2.382025957107544,
"learning_rate": 4.989686917757659e-06,
"loss": 1.1233,
"step": 427
},
{
"epoch": 0.22514466070489217,
"grad_norm": 2.1739771366119385,
"learning_rate": 4.989623788090353e-06,
"loss": 1.0665,
"step": 428
},
{
"epoch": 0.22567069963177275,
"grad_norm": 2.3246262073516846,
"learning_rate": 4.989560466195553e-06,
"loss": 1.0834,
"step": 429
},
{
"epoch": 0.22619673855865335,
"grad_norm": 2.1649882793426514,
"learning_rate": 4.9894969520781475e-06,
"loss": 1.1144,
"step": 430
},
{
"epoch": 0.22672277748553393,
"grad_norm": 2.307199001312256,
"learning_rate": 4.98943324574304e-06,
"loss": 1.2195,
"step": 431
},
{
"epoch": 0.2272488164124145,
"grad_norm": 2.2414958477020264,
"learning_rate": 4.989369347195151e-06,
"loss": 1.0549,
"step": 432
},
{
"epoch": 0.22777485533929512,
"grad_norm": 2.12762713432312,
"learning_rate": 4.989305256439413e-06,
"loss": 1.1185,
"step": 433
},
{
"epoch": 0.2283008942661757,
"grad_norm": 2.1503520011901855,
"learning_rate": 4.989240973480774e-06,
"loss": 1.1294,
"step": 434
},
{
"epoch": 0.22882693319305628,
"grad_norm": 2.1283833980560303,
"learning_rate": 4.9891764983242e-06,
"loss": 1.1154,
"step": 435
},
{
"epoch": 0.22935297211993688,
"grad_norm": 2.239828109741211,
"learning_rate": 4.9891118309746666e-06,
"loss": 1.073,
"step": 436
},
{
"epoch": 0.22987901104681746,
"grad_norm": 2.396672248840332,
"learning_rate": 4.989046971437167e-06,
"loss": 1.0916,
"step": 437
},
{
"epoch": 0.23040504997369807,
"grad_norm": 2.1172304153442383,
"learning_rate": 4.98898191971671e-06,
"loss": 1.1001,
"step": 438
},
{
"epoch": 0.23093108890057865,
"grad_norm": 2.1714346408843994,
"learning_rate": 4.98891667581832e-06,
"loss": 1.1672,
"step": 439
},
{
"epoch": 0.23145712782745922,
"grad_norm": 2.058523178100586,
"learning_rate": 4.98885123974703e-06,
"loss": 1.0842,
"step": 440
},
{
"epoch": 0.23198316675433983,
"grad_norm": 2.4147160053253174,
"learning_rate": 4.988785611507896e-06,
"loss": 1.0755,
"step": 441
},
{
"epoch": 0.2325092056812204,
"grad_norm": 2.274296283721924,
"learning_rate": 4.988719791105985e-06,
"loss": 1.1141,
"step": 442
},
{
"epoch": 0.233035244608101,
"grad_norm": 2.178182363510132,
"learning_rate": 4.988653778546379e-06,
"loss": 1.212,
"step": 443
},
{
"epoch": 0.2335612835349816,
"grad_norm": 2.200793743133545,
"learning_rate": 4.988587573834173e-06,
"loss": 1.0992,
"step": 444
},
{
"epoch": 0.23408732246186217,
"grad_norm": 1.9726881980895996,
"learning_rate": 4.98852117697448e-06,
"loss": 1.1165,
"step": 445
},
{
"epoch": 0.23461336138874278,
"grad_norm": 2.1173300743103027,
"learning_rate": 4.988454587972428e-06,
"loss": 1.1162,
"step": 446
},
{
"epoch": 0.23513940031562336,
"grad_norm": 2.1428768634796143,
"learning_rate": 4.9883878068331556e-06,
"loss": 1.1343,
"step": 447
},
{
"epoch": 0.23566543924250394,
"grad_norm": 2.00190806388855,
"learning_rate": 4.988320833561822e-06,
"loss": 1.0873,
"step": 448
},
{
"epoch": 0.23619147816938454,
"grad_norm": 2.2472777366638184,
"learning_rate": 4.988253668163596e-06,
"loss": 1.1209,
"step": 449
},
{
"epoch": 0.23671751709626512,
"grad_norm": 2.0522475242614746,
"learning_rate": 4.988186310643666e-06,
"loss": 1.0912,
"step": 450
},
{
"epoch": 0.2372435560231457,
"grad_norm": 2.1521215438842773,
"learning_rate": 4.98811876100723e-06,
"loss": 1.0971,
"step": 451
},
{
"epoch": 0.2377695949500263,
"grad_norm": 2.1117734909057617,
"learning_rate": 4.988051019259505e-06,
"loss": 1.1247,
"step": 452
},
{
"epoch": 0.2382956338769069,
"grad_norm": 2.1884706020355225,
"learning_rate": 4.987983085405722e-06,
"loss": 1.1255,
"step": 453
},
{
"epoch": 0.2388216728037875,
"grad_norm": 2.138962984085083,
"learning_rate": 4.9879149594511245e-06,
"loss": 1.0787,
"step": 454
},
{
"epoch": 0.23934771173066807,
"grad_norm": 2.553452730178833,
"learning_rate": 4.987846641400974e-06,
"loss": 1.1178,
"step": 455
},
{
"epoch": 0.23987375065754865,
"grad_norm": 2.5340464115142822,
"learning_rate": 4.987778131260546e-06,
"loss": 1.1577,
"step": 456
},
{
"epoch": 0.24039978958442926,
"grad_norm": 2.2375919818878174,
"learning_rate": 4.987709429035128e-06,
"loss": 1.0711,
"step": 457
},
{
"epoch": 0.24092582851130984,
"grad_norm": 2.35756254196167,
"learning_rate": 4.987640534730027e-06,
"loss": 1.1031,
"step": 458
},
{
"epoch": 0.24145186743819042,
"grad_norm": 2.03385591506958,
"learning_rate": 4.987571448350561e-06,
"loss": 1.0869,
"step": 459
},
{
"epoch": 0.24197790636507102,
"grad_norm": 2.662584066390991,
"learning_rate": 4.987502169902065e-06,
"loss": 1.0909,
"step": 460
},
{
"epoch": 0.2425039452919516,
"grad_norm": 2.2569165229797363,
"learning_rate": 4.987432699389888e-06,
"loss": 1.1576,
"step": 461
},
{
"epoch": 0.2430299842188322,
"grad_norm": 1.9718097448349,
"learning_rate": 4.987363036819393e-06,
"loss": 1.0577,
"step": 462
},
{
"epoch": 0.24355602314571279,
"grad_norm": 2.2083537578582764,
"learning_rate": 4.987293182195959e-06,
"loss": 1.1328,
"step": 463
},
{
"epoch": 0.24408206207259336,
"grad_norm": 2.2045726776123047,
"learning_rate": 4.987223135524981e-06,
"loss": 1.0908,
"step": 464
},
{
"epoch": 0.24460810099947397,
"grad_norm": 2.213714122772217,
"learning_rate": 4.987152896811866e-06,
"loss": 1.124,
"step": 465
},
{
"epoch": 0.24513413992635455,
"grad_norm": 4.030746936798096,
"learning_rate": 4.987082466062038e-06,
"loss": 1.0855,
"step": 466
},
{
"epoch": 0.24566017885323513,
"grad_norm": 2.1142022609710693,
"learning_rate": 4.987011843280934e-06,
"loss": 1.1305,
"step": 467
},
{
"epoch": 0.24618621778011573,
"grad_norm": 2.1746232509613037,
"learning_rate": 4.986941028474009e-06,
"loss": 1.0846,
"step": 468
},
{
"epoch": 0.2467122567069963,
"grad_norm": 2.038947820663452,
"learning_rate": 4.986870021646728e-06,
"loss": 1.0907,
"step": 469
},
{
"epoch": 0.24723829563387692,
"grad_norm": 12.261099815368652,
"learning_rate": 4.986798822804576e-06,
"loss": 1.1012,
"step": 470
},
{
"epoch": 0.2477643345607575,
"grad_norm": 2.020077705383301,
"learning_rate": 4.986727431953048e-06,
"loss": 1.097,
"step": 471
},
{
"epoch": 0.24829037348763808,
"grad_norm": 2.070114850997925,
"learning_rate": 4.986655849097658e-06,
"loss": 1.175,
"step": 472
},
{
"epoch": 0.24881641241451868,
"grad_norm": 2.0364394187927246,
"learning_rate": 4.986584074243932e-06,
"loss": 1.0892,
"step": 473
},
{
"epoch": 0.24934245134139926,
"grad_norm": 2.1961004734039307,
"learning_rate": 4.986512107397413e-06,
"loss": 1.0867,
"step": 474
},
{
"epoch": 0.24986849026827984,
"grad_norm": 3.1488072872161865,
"learning_rate": 4.986439948563656e-06,
"loss": 1.0276,
"step": 475
},
{
"epoch": 0.2503945291951604,
"grad_norm": 2.3070068359375,
"learning_rate": 4.986367597748235e-06,
"loss": 1.0897,
"step": 476
},
{
"epoch": 0.25092056812204105,
"grad_norm": 2.0328757762908936,
"learning_rate": 4.986295054956733e-06,
"loss": 1.0573,
"step": 477
},
{
"epoch": 0.25144660704892163,
"grad_norm": 2.4608747959136963,
"learning_rate": 4.986222320194754e-06,
"loss": 1.1343,
"step": 478
},
{
"epoch": 0.2519726459758022,
"grad_norm": 2.249994993209839,
"learning_rate": 4.986149393467913e-06,
"loss": 1.0771,
"step": 479
},
{
"epoch": 0.2524986849026828,
"grad_norm": 2.1573803424835205,
"learning_rate": 4.98607627478184e-06,
"loss": 1.0795,
"step": 480
},
{
"epoch": 0.25302472382956337,
"grad_norm": 2.6239383220672607,
"learning_rate": 4.986002964142182e-06,
"loss": 1.0874,
"step": 481
},
{
"epoch": 0.253550762756444,
"grad_norm": 2.0815794467926025,
"learning_rate": 4.985929461554597e-06,
"loss": 1.0729,
"step": 482
},
{
"epoch": 0.2540768016833246,
"grad_norm": 2.156259059906006,
"learning_rate": 4.985855767024763e-06,
"loss": 1.0912,
"step": 483
},
{
"epoch": 0.25460284061020516,
"grad_norm": 2.4136252403259277,
"learning_rate": 4.985781880558369e-06,
"loss": 1.1365,
"step": 484
},
{
"epoch": 0.25512887953708574,
"grad_norm": 2.265622854232788,
"learning_rate": 4.98570780216112e-06,
"loss": 1.1218,
"step": 485
},
{
"epoch": 0.2556549184639663,
"grad_norm": 2.1097841262817383,
"learning_rate": 4.985633531838735e-06,
"loss": 1.1238,
"step": 486
},
{
"epoch": 0.2561809573908469,
"grad_norm": 2.205012083053589,
"learning_rate": 4.985559069596949e-06,
"loss": 1.0664,
"step": 487
},
{
"epoch": 0.25670699631772753,
"grad_norm": 2.1896169185638428,
"learning_rate": 4.9854844154415115e-06,
"loss": 1.0374,
"step": 488
},
{
"epoch": 0.2572330352446081,
"grad_norm": 2.0652949810028076,
"learning_rate": 4.985409569378187e-06,
"loss": 1.1016,
"step": 489
},
{
"epoch": 0.2577590741714887,
"grad_norm": 2.1278676986694336,
"learning_rate": 4.985334531412754e-06,
"loss": 1.147,
"step": 490
},
{
"epoch": 0.25828511309836927,
"grad_norm": 2.2769057750701904,
"learning_rate": 4.985259301551005e-06,
"loss": 1.1389,
"step": 491
},
{
"epoch": 0.25881115202524985,
"grad_norm": 2.0440104007720947,
"learning_rate": 4.985183879798751e-06,
"loss": 1.0826,
"step": 492
},
{
"epoch": 0.2593371909521305,
"grad_norm": 2.4153213500976562,
"learning_rate": 4.985108266161815e-06,
"loss": 1.105,
"step": 493
},
{
"epoch": 0.25986322987901106,
"grad_norm": 2.3863043785095215,
"learning_rate": 4.985032460646033e-06,
"loss": 1.1023,
"step": 494
},
{
"epoch": 0.26038926880589164,
"grad_norm": 2.2597336769104004,
"learning_rate": 4.98495646325726e-06,
"loss": 1.1046,
"step": 495
},
{
"epoch": 0.2609153077327722,
"grad_norm": 2.541444778442383,
"learning_rate": 4.984880274001364e-06,
"loss": 1.1149,
"step": 496
},
{
"epoch": 0.2614413466596528,
"grad_norm": 2.3011064529418945,
"learning_rate": 4.984803892884227e-06,
"loss": 1.0757,
"step": 497
},
{
"epoch": 0.26196738558653343,
"grad_norm": 2.116774797439575,
"learning_rate": 4.9847273199117475e-06,
"loss": 1.1151,
"step": 498
},
{
"epoch": 0.262493424513414,
"grad_norm": 2.2372357845306396,
"learning_rate": 4.984650555089836e-06,
"loss": 1.1107,
"step": 499
},
{
"epoch": 0.2630194634402946,
"grad_norm": 2.0782155990600586,
"learning_rate": 4.984573598424421e-06,
"loss": 1.1174,
"step": 500
},
{
"epoch": 0.26354550236717517,
"grad_norm": 2.0625476837158203,
"learning_rate": 4.984496449921444e-06,
"loss": 1.0965,
"step": 501
},
{
"epoch": 0.26407154129405574,
"grad_norm": 2.142184019088745,
"learning_rate": 4.9844191095868615e-06,
"loss": 1.0678,
"step": 502
},
{
"epoch": 0.2645975802209363,
"grad_norm": 2.1218082904815674,
"learning_rate": 4.984341577426646e-06,
"loss": 1.0661,
"step": 503
},
{
"epoch": 0.26512361914781696,
"grad_norm": 2.2910757064819336,
"learning_rate": 4.984263853446783e-06,
"loss": 1.1111,
"step": 504
},
{
"epoch": 0.26564965807469754,
"grad_norm": 2.0604546070098877,
"learning_rate": 4.984185937653274e-06,
"loss": 1.0614,
"step": 505
},
{
"epoch": 0.2661756970015781,
"grad_norm": 2.1210556030273438,
"learning_rate": 4.984107830052134e-06,
"loss": 1.0925,
"step": 506
},
{
"epoch": 0.2667017359284587,
"grad_norm": 2.535501003265381,
"learning_rate": 4.984029530649396e-06,
"loss": 1.1238,
"step": 507
},
{
"epoch": 0.2672277748553393,
"grad_norm": 2.2978546619415283,
"learning_rate": 4.9839510394511035e-06,
"loss": 1.1615,
"step": 508
},
{
"epoch": 0.2677538137822199,
"grad_norm": 2.0443382263183594,
"learning_rate": 4.983872356463318e-06,
"loss": 1.1087,
"step": 509
},
{
"epoch": 0.2682798527091005,
"grad_norm": 2.216139316558838,
"learning_rate": 4.983793481692114e-06,
"loss": 1.1431,
"step": 510
},
{
"epoch": 0.26880589163598106,
"grad_norm": 1.9255571365356445,
"learning_rate": 4.983714415143583e-06,
"loss": 1.0204,
"step": 511
},
{
"epoch": 0.26933193056286164,
"grad_norm": 2.103969097137451,
"learning_rate": 4.9836351568238286e-06,
"loss": 1.0855,
"step": 512
},
{
"epoch": 0.2698579694897422,
"grad_norm": 2.5458972454071045,
"learning_rate": 4.98355570673897e-06,
"loss": 1.0747,
"step": 513
},
{
"epoch": 0.27038400841662286,
"grad_norm": 2.023601531982422,
"learning_rate": 4.983476064895143e-06,
"loss": 1.0471,
"step": 514
},
{
"epoch": 0.27091004734350344,
"grad_norm": 2.0976908206939697,
"learning_rate": 4.983396231298496e-06,
"loss": 1.0658,
"step": 515
},
{
"epoch": 0.271436086270384,
"grad_norm": 2.4051074981689453,
"learning_rate": 4.9833162059551936e-06,
"loss": 1.0624,
"step": 516
},
{
"epoch": 0.2719621251972646,
"grad_norm": 2.0524230003356934,
"learning_rate": 4.983235988871414e-06,
"loss": 1.1261,
"step": 517
},
{
"epoch": 0.27248816412414517,
"grad_norm": 2.1440162658691406,
"learning_rate": 4.983155580053351e-06,
"loss": 0.9893,
"step": 518
},
{
"epoch": 0.27301420305102575,
"grad_norm": 2.1923670768737793,
"learning_rate": 4.983074979507213e-06,
"loss": 1.1066,
"step": 519
},
{
"epoch": 0.2735402419779064,
"grad_norm": 2.2967565059661865,
"learning_rate": 4.982994187239225e-06,
"loss": 1.1256,
"step": 520
},
{
"epoch": 0.27406628090478696,
"grad_norm": 2.0392587184906006,
"learning_rate": 4.982913203255623e-06,
"loss": 1.1026,
"step": 521
},
{
"epoch": 0.27459231983166754,
"grad_norm": 2.371121644973755,
"learning_rate": 4.9828320275626605e-06,
"loss": 1.0607,
"step": 522
},
{
"epoch": 0.2751183587585481,
"grad_norm": 2.082239866256714,
"learning_rate": 4.982750660166606e-06,
"loss": 1.0749,
"step": 523
},
{
"epoch": 0.2756443976854287,
"grad_norm": 2.2039687633514404,
"learning_rate": 4.98266910107374e-06,
"loss": 1.0769,
"step": 524
},
{
"epoch": 0.27617043661230933,
"grad_norm": 2.087859869003296,
"learning_rate": 4.9825873502903625e-06,
"loss": 1.1575,
"step": 525
},
{
"epoch": 0.2766964755391899,
"grad_norm": 2.1991021633148193,
"learning_rate": 4.982505407822783e-06,
"loss": 1.1149,
"step": 526
},
{
"epoch": 0.2772225144660705,
"grad_norm": 2.2656140327453613,
"learning_rate": 4.98242327367733e-06,
"loss": 1.0948,
"step": 527
},
{
"epoch": 0.27774855339295107,
"grad_norm": 2.1107430458068848,
"learning_rate": 4.982340947860344e-06,
"loss": 1.0289,
"step": 528
},
{
"epoch": 0.27827459231983165,
"grad_norm": 2.2510344982147217,
"learning_rate": 4.982258430378184e-06,
"loss": 1.0694,
"step": 529
},
{
"epoch": 0.2788006312467123,
"grad_norm": 2.252258062362671,
"learning_rate": 4.982175721237218e-06,
"loss": 1.0435,
"step": 530
},
{
"epoch": 0.27932667017359286,
"grad_norm": 2.12455677986145,
"learning_rate": 4.982092820443834e-06,
"loss": 1.0202,
"step": 531
},
{
"epoch": 0.27985270910047344,
"grad_norm": 2.3654651641845703,
"learning_rate": 4.982009728004433e-06,
"loss": 1.1282,
"step": 532
},
{
"epoch": 0.280378748027354,
"grad_norm": 2.3759138584136963,
"learning_rate": 4.981926443925431e-06,
"loss": 1.1557,
"step": 533
},
{
"epoch": 0.2809047869542346,
"grad_norm": 1.9874821901321411,
"learning_rate": 4.981842968213256e-06,
"loss": 1.0723,
"step": 534
},
{
"epoch": 0.2814308258811152,
"grad_norm": 2.154383897781372,
"learning_rate": 4.981759300874356e-06,
"loss": 1.0786,
"step": 535
},
{
"epoch": 0.2819568648079958,
"grad_norm": 2.1774797439575195,
"learning_rate": 4.9816754419151906e-06,
"loss": 1.0457,
"step": 536
},
{
"epoch": 0.2824829037348764,
"grad_norm": 2.206082820892334,
"learning_rate": 4.981591391342233e-06,
"loss": 1.0216,
"step": 537
},
{
"epoch": 0.28300894266175697,
"grad_norm": 2.008676528930664,
"learning_rate": 4.981507149161975e-06,
"loss": 1.0297,
"step": 538
},
{
"epoch": 0.28353498158863755,
"grad_norm": 2.0553462505340576,
"learning_rate": 4.981422715380919e-06,
"loss": 1.0967,
"step": 539
},
{
"epoch": 0.2840610205155181,
"grad_norm": 2.047567844390869,
"learning_rate": 4.981338090005586e-06,
"loss": 1.0524,
"step": 540
},
{
"epoch": 0.28458705944239876,
"grad_norm": 2.2144312858581543,
"learning_rate": 4.981253273042509e-06,
"loss": 1.1178,
"step": 541
},
{
"epoch": 0.28511309836927934,
"grad_norm": 2.388124465942383,
"learning_rate": 4.981168264498238e-06,
"loss": 1.0728,
"step": 542
},
{
"epoch": 0.2856391372961599,
"grad_norm": 2.152280807495117,
"learning_rate": 4.981083064379335e-06,
"loss": 1.1146,
"step": 543
},
{
"epoch": 0.2861651762230405,
"grad_norm": 2.1481564044952393,
"learning_rate": 4.98099767269238e-06,
"loss": 1.1376,
"step": 544
},
{
"epoch": 0.2866912151499211,
"grad_norm": 2.060664415359497,
"learning_rate": 4.980912089443966e-06,
"loss": 1.0961,
"step": 545
},
{
"epoch": 0.2872172540768017,
"grad_norm": 2.032557964324951,
"learning_rate": 4.9808263146406985e-06,
"loss": 1.1055,
"step": 546
},
{
"epoch": 0.2877432930036823,
"grad_norm": 2.0957093238830566,
"learning_rate": 4.980740348289204e-06,
"loss": 1.0444,
"step": 547
},
{
"epoch": 0.28826933193056287,
"grad_norm": 2.0774853229522705,
"learning_rate": 4.980654190396118e-06,
"loss": 1.0963,
"step": 548
},
{
"epoch": 0.28879537085744345,
"grad_norm": 2.0808207988739014,
"learning_rate": 4.980567840968094e-06,
"loss": 1.0634,
"step": 549
},
{
"epoch": 0.289321409784324,
"grad_norm": 2.2924559116363525,
"learning_rate": 4.980481300011797e-06,
"loss": 1.0805,
"step": 550
},
{
"epoch": 0.2898474487112046,
"grad_norm": 2.041088104248047,
"learning_rate": 4.980394567533911e-06,
"loss": 1.0983,
"step": 551
},
{
"epoch": 0.29037348763808524,
"grad_norm": 2.030073881149292,
"learning_rate": 4.980307643541132e-06,
"loss": 1.1334,
"step": 552
},
{
"epoch": 0.2908995265649658,
"grad_norm": 2.15849232673645,
"learning_rate": 4.980220528040172e-06,
"loss": 1.0906,
"step": 553
},
{
"epoch": 0.2914255654918464,
"grad_norm": 2.094135284423828,
"learning_rate": 4.9801332210377574e-06,
"loss": 1.0644,
"step": 554
},
{
"epoch": 0.291951604418727,
"grad_norm": 2.193941354751587,
"learning_rate": 4.980045722540628e-06,
"loss": 1.0819,
"step": 555
},
{
"epoch": 0.29247764334560755,
"grad_norm": 2.2015504837036133,
"learning_rate": 4.979958032555542e-06,
"loss": 1.0759,
"step": 556
},
{
"epoch": 0.2930036822724882,
"grad_norm": 2.1240222454071045,
"learning_rate": 4.979870151089267e-06,
"loss": 1.1268,
"step": 557
},
{
"epoch": 0.29352972119936876,
"grad_norm": 2.0243959426879883,
"learning_rate": 4.9797820781485905e-06,
"loss": 1.0449,
"step": 558
},
{
"epoch": 0.29405576012624934,
"grad_norm": 2.2300705909729004,
"learning_rate": 4.979693813740313e-06,
"loss": 1.0493,
"step": 559
},
{
"epoch": 0.2945817990531299,
"grad_norm": 2.1185836791992188,
"learning_rate": 4.979605357871249e-06,
"loss": 1.0921,
"step": 560
},
{
"epoch": 0.2951078379800105,
"grad_norm": 2.091691732406616,
"learning_rate": 4.979516710548227e-06,
"loss": 1.1025,
"step": 561
},
{
"epoch": 0.29563387690689114,
"grad_norm": 2.1666178703308105,
"learning_rate": 4.979427871778094e-06,
"loss": 1.1245,
"step": 562
},
{
"epoch": 0.2961599158337717,
"grad_norm": 2.6985056400299072,
"learning_rate": 4.9793388415677066e-06,
"loss": 1.1398,
"step": 563
},
{
"epoch": 0.2966859547606523,
"grad_norm": 2.118074655532837,
"learning_rate": 4.979249619923942e-06,
"loss": 1.0897,
"step": 564
},
{
"epoch": 0.29721199368753287,
"grad_norm": 2.246856927871704,
"learning_rate": 4.979160206853687e-06,
"loss": 1.0714,
"step": 565
},
{
"epoch": 0.29773803261441345,
"grad_norm": 2.201953887939453,
"learning_rate": 4.979070602363846e-06,
"loss": 1.1466,
"step": 566
},
{
"epoch": 0.29826407154129403,
"grad_norm": 2.048617362976074,
"learning_rate": 4.9789808064613375e-06,
"loss": 1.1368,
"step": 567
},
{
"epoch": 0.29879011046817466,
"grad_norm": 2.1507785320281982,
"learning_rate": 4.978890819153095e-06,
"loss": 1.1499,
"step": 568
},
{
"epoch": 0.29931614939505524,
"grad_norm": 1.9633440971374512,
"learning_rate": 4.978800640446066e-06,
"loss": 1.0667,
"step": 569
},
{
"epoch": 0.2998421883219358,
"grad_norm": 2.1089606285095215,
"learning_rate": 4.978710270347214e-06,
"loss": 1.0611,
"step": 570
},
{
"epoch": 0.3003682272488164,
"grad_norm": 2.170901298522949,
"learning_rate": 4.9786197088635145e-06,
"loss": 1.1524,
"step": 571
},
{
"epoch": 0.300894266175697,
"grad_norm": 2.165510892868042,
"learning_rate": 4.978528956001964e-06,
"loss": 1.0987,
"step": 572
},
{
"epoch": 0.3014203051025776,
"grad_norm": 2.0415878295898438,
"learning_rate": 4.978438011769565e-06,
"loss": 1.1582,
"step": 573
},
{
"epoch": 0.3019463440294582,
"grad_norm": 2.110260248184204,
"learning_rate": 4.978346876173342e-06,
"loss": 1.0587,
"step": 574
},
{
"epoch": 0.30247238295633877,
"grad_norm": 2.253488063812256,
"learning_rate": 4.9782555492203334e-06,
"loss": 1.1038,
"step": 575
},
{
"epoch": 0.30299842188321935,
"grad_norm": 2.0166091918945312,
"learning_rate": 4.978164030917587e-06,
"loss": 1.0367,
"step": 576
},
{
"epoch": 0.3035244608100999,
"grad_norm": 2.2842600345611572,
"learning_rate": 4.978072321272171e-06,
"loss": 1.0996,
"step": 577
},
{
"epoch": 0.30405049973698056,
"grad_norm": 2.0563907623291016,
"learning_rate": 4.977980420291166e-06,
"loss": 1.1219,
"step": 578
},
{
"epoch": 0.30457653866386114,
"grad_norm": 2.059800863265991,
"learning_rate": 4.977888327981668e-06,
"loss": 1.1193,
"step": 579
},
{
"epoch": 0.3051025775907417,
"grad_norm": 2.242919921875,
"learning_rate": 4.977796044350788e-06,
"loss": 1.0701,
"step": 580
},
{
"epoch": 0.3056286165176223,
"grad_norm": 1.9749282598495483,
"learning_rate": 4.977703569405651e-06,
"loss": 1.0771,
"step": 581
},
{
"epoch": 0.3061546554445029,
"grad_norm": 2.2251386642456055,
"learning_rate": 4.977610903153397e-06,
"loss": 1.084,
"step": 582
},
{
"epoch": 0.30668069437138346,
"grad_norm": 2.0289855003356934,
"learning_rate": 4.97751804560118e-06,
"loss": 1.0732,
"step": 583
},
{
"epoch": 0.3072067332982641,
"grad_norm": 2.152841806411743,
"learning_rate": 4.977424996756171e-06,
"loss": 1.0712,
"step": 584
},
{
"epoch": 0.30773277222514467,
"grad_norm": 2.3243937492370605,
"learning_rate": 4.977331756625555e-06,
"loss": 1.0197,
"step": 585
},
{
"epoch": 0.30825881115202525,
"grad_norm": 2.293274402618408,
"learning_rate": 4.97723832521653e-06,
"loss": 1.1121,
"step": 586
},
{
"epoch": 0.3087848500789058,
"grad_norm": 2.139958143234253,
"learning_rate": 4.97714470253631e-06,
"loss": 1.0799,
"step": 587
},
{
"epoch": 0.3093108890057864,
"grad_norm": 2.269357442855835,
"learning_rate": 4.977050888592123e-06,
"loss": 1.0872,
"step": 588
},
{
"epoch": 0.30983692793266704,
"grad_norm": 2.268691301345825,
"learning_rate": 4.976956883391215e-06,
"loss": 1.1079,
"step": 589
},
{
"epoch": 0.3103629668595476,
"grad_norm": 2.127131223678589,
"learning_rate": 4.976862686940842e-06,
"loss": 1.1217,
"step": 590
},
{
"epoch": 0.3108890057864282,
"grad_norm": 2.0126006603240967,
"learning_rate": 4.976768299248278e-06,
"loss": 1.0719,
"step": 591
},
{
"epoch": 0.3114150447133088,
"grad_norm": 1.965903639793396,
"learning_rate": 4.97667372032081e-06,
"loss": 1.0843,
"step": 592
},
{
"epoch": 0.31194108364018935,
"grad_norm": 2.1280322074890137,
"learning_rate": 4.976578950165742e-06,
"loss": 1.0676,
"step": 593
},
{
"epoch": 0.31246712256707,
"grad_norm": 2.2355756759643555,
"learning_rate": 4.976483988790391e-06,
"loss": 1.0855,
"step": 594
},
{
"epoch": 0.31299316149395057,
"grad_norm": 2.153095245361328,
"learning_rate": 4.976388836202088e-06,
"loss": 1.0357,
"step": 595
},
{
"epoch": 0.31351920042083115,
"grad_norm": 2.023137092590332,
"learning_rate": 4.97629349240818e-06,
"loss": 1.0381,
"step": 596
},
{
"epoch": 0.3140452393477117,
"grad_norm": 2.2524759769439697,
"learning_rate": 4.97619795741603e-06,
"loss": 1.0911,
"step": 597
},
{
"epoch": 0.3145712782745923,
"grad_norm": 2.1904008388519287,
"learning_rate": 4.9761022312330135e-06,
"loss": 1.047,
"step": 598
},
{
"epoch": 0.3150973172014729,
"grad_norm": 2.3166565895080566,
"learning_rate": 4.976006313866521e-06,
"loss": 1.0663,
"step": 599
},
{
"epoch": 0.3156233561283535,
"grad_norm": 2.11413836479187,
"learning_rate": 4.975910205323959e-06,
"loss": 1.0843,
"step": 600
},
{
"epoch": 0.3161493950552341,
"grad_norm": 2.1609344482421875,
"learning_rate": 4.975813905612749e-06,
"loss": 1.1344,
"step": 601
},
{
"epoch": 0.3166754339821147,
"grad_norm": 2.055330276489258,
"learning_rate": 4.975717414740326e-06,
"loss": 1.0663,
"step": 602
},
{
"epoch": 0.31720147290899525,
"grad_norm": 2.2735755443573,
"learning_rate": 4.975620732714139e-06,
"loss": 1.1061,
"step": 603
},
{
"epoch": 0.31772751183587583,
"grad_norm": 2.1966300010681152,
"learning_rate": 4.975523859541654e-06,
"loss": 1.1498,
"step": 604
},
{
"epoch": 0.31825355076275647,
"grad_norm": 2.20951247215271,
"learning_rate": 4.975426795230351e-06,
"loss": 1.1057,
"step": 605
},
{
"epoch": 0.31877958968963704,
"grad_norm": 2.0706050395965576,
"learning_rate": 4.975329539787725e-06,
"loss": 1.0906,
"step": 606
},
{
"epoch": 0.3193056286165176,
"grad_norm": 2.0394089221954346,
"learning_rate": 4.975232093221284e-06,
"loss": 1.0514,
"step": 607
},
{
"epoch": 0.3198316675433982,
"grad_norm": 2.1639111042022705,
"learning_rate": 4.975134455538551e-06,
"loss": 1.0787,
"step": 608
},
{
"epoch": 0.3203577064702788,
"grad_norm": 2.025575876235962,
"learning_rate": 4.975036626747067e-06,
"loss": 1.0451,
"step": 609
},
{
"epoch": 0.3208837453971594,
"grad_norm": 2.060215950012207,
"learning_rate": 4.974938606854384e-06,
"loss": 1.0821,
"step": 610
},
{
"epoch": 0.32140978432404,
"grad_norm": 2.265155792236328,
"learning_rate": 4.974840395868073e-06,
"loss": 1.1341,
"step": 611
},
{
"epoch": 0.32193582325092057,
"grad_norm": 2.22503924369812,
"learning_rate": 4.974741993795712e-06,
"loss": 1.1643,
"step": 612
},
{
"epoch": 0.32246186217780115,
"grad_norm": 2.11155104637146,
"learning_rate": 4.9746434006449034e-06,
"loss": 1.0548,
"step": 613
},
{
"epoch": 0.32298790110468173,
"grad_norm": 2.0055696964263916,
"learning_rate": 4.974544616423258e-06,
"loss": 1.0769,
"step": 614
},
{
"epoch": 0.3235139400315623,
"grad_norm": 2.0843770503997803,
"learning_rate": 4.974445641138403e-06,
"loss": 1.0701,
"step": 615
},
{
"epoch": 0.32403997895844294,
"grad_norm": 2.0580337047576904,
"learning_rate": 4.9743464747979785e-06,
"loss": 1.0465,
"step": 616
},
{
"epoch": 0.3245660178853235,
"grad_norm": 2.3719844818115234,
"learning_rate": 4.974247117409645e-06,
"loss": 1.1498,
"step": 617
},
{
"epoch": 0.3250920568122041,
"grad_norm": 1.9926241636276245,
"learning_rate": 4.974147568981072e-06,
"loss": 1.081,
"step": 618
},
{
"epoch": 0.3256180957390847,
"grad_norm": 2.029318332672119,
"learning_rate": 4.974047829519946e-06,
"loss": 1.139,
"step": 619
},
{
"epoch": 0.32614413466596526,
"grad_norm": 2.0171804428100586,
"learning_rate": 4.973947899033969e-06,
"loss": 1.0887,
"step": 620
},
{
"epoch": 0.3266701735928459,
"grad_norm": 2.3209071159362793,
"learning_rate": 4.973847777530854e-06,
"loss": 1.1156,
"step": 621
},
{
"epoch": 0.32719621251972647,
"grad_norm": 2.360849142074585,
"learning_rate": 4.973747465018334e-06,
"loss": 1.1305,
"step": 622
},
{
"epoch": 0.32772225144660705,
"grad_norm": 2.1828086376190186,
"learning_rate": 4.973646961504154e-06,
"loss": 1.091,
"step": 623
},
{
"epoch": 0.32824829037348763,
"grad_norm": 1.9628446102142334,
"learning_rate": 4.973546266996074e-06,
"loss": 1.0932,
"step": 624
},
{
"epoch": 0.3287743293003682,
"grad_norm": 2.0040283203125,
"learning_rate": 4.973445381501868e-06,
"loss": 1.0723,
"step": 625
},
{
"epoch": 0.32930036822724884,
"grad_norm": 2.289292097091675,
"learning_rate": 4.973344305029326e-06,
"loss": 1.1526,
"step": 626
},
{
"epoch": 0.3298264071541294,
"grad_norm": 2.1106910705566406,
"learning_rate": 4.973243037586252e-06,
"loss": 1.1327,
"step": 627
},
{
"epoch": 0.33035244608101,
"grad_norm": 2.326677083969116,
"learning_rate": 4.9731415791804655e-06,
"loss": 1.0898,
"step": 628
},
{
"epoch": 0.3308784850078906,
"grad_norm": 2.086299180984497,
"learning_rate": 4.9730399298198e-06,
"loss": 1.0842,
"step": 629
},
{
"epoch": 0.33140452393477116,
"grad_norm": 2.045738935470581,
"learning_rate": 4.972938089512104e-06,
"loss": 1.0156,
"step": 630
},
{
"epoch": 0.33193056286165173,
"grad_norm": 2.038058280944824,
"learning_rate": 4.97283605826524e-06,
"loss": 1.0545,
"step": 631
},
{
"epoch": 0.33245660178853237,
"grad_norm": 2.0892717838287354,
"learning_rate": 4.972733836087088e-06,
"loss": 1.099,
"step": 632
},
{
"epoch": 0.33298264071541295,
"grad_norm": 2.2152934074401855,
"learning_rate": 4.972631422985538e-06,
"loss": 1.0775,
"step": 633
},
{
"epoch": 0.3335086796422935,
"grad_norm": 2.3605494499206543,
"learning_rate": 4.9725288189685e-06,
"loss": 1.0682,
"step": 634
},
{
"epoch": 0.3340347185691741,
"grad_norm": 2.076491117477417,
"learning_rate": 4.9724260240438945e-06,
"loss": 1.063,
"step": 635
},
{
"epoch": 0.3345607574960547,
"grad_norm": 3.2677767276763916,
"learning_rate": 4.97232303821966e-06,
"loss": 1.1173,
"step": 636
},
{
"epoch": 0.3350867964229353,
"grad_norm": 2.110320568084717,
"learning_rate": 4.972219861503746e-06,
"loss": 1.0264,
"step": 637
},
{
"epoch": 0.3356128353498159,
"grad_norm": 2.101353406906128,
"learning_rate": 4.972116493904121e-06,
"loss": 1.0806,
"step": 638
},
{
"epoch": 0.3361388742766965,
"grad_norm": 2.247091293334961,
"learning_rate": 4.972012935428765e-06,
"loss": 1.1178,
"step": 639
},
{
"epoch": 0.33666491320357705,
"grad_norm": 2.183757781982422,
"learning_rate": 4.971909186085675e-06,
"loss": 1.0615,
"step": 640
},
{
"epoch": 0.33719095213045763,
"grad_norm": 2.0801236629486084,
"learning_rate": 4.97180524588286e-06,
"loss": 1.0441,
"step": 641
},
{
"epoch": 0.33771699105733827,
"grad_norm": 1.9939873218536377,
"learning_rate": 4.9717011148283455e-06,
"loss": 1.0853,
"step": 642
},
{
"epoch": 0.33824302998421885,
"grad_norm": 2.13399338722229,
"learning_rate": 4.971596792930174e-06,
"loss": 0.9943,
"step": 643
},
{
"epoch": 0.3387690689110994,
"grad_norm": 2.1221766471862793,
"learning_rate": 4.971492280196397e-06,
"loss": 1.0088,
"step": 644
},
{
"epoch": 0.33929510783798,
"grad_norm": 2.023320436477661,
"learning_rate": 4.971387576635087e-06,
"loss": 1.0449,
"step": 645
},
{
"epoch": 0.3398211467648606,
"grad_norm": 2.1422126293182373,
"learning_rate": 4.971282682254327e-06,
"loss": 1.0987,
"step": 646
},
{
"epoch": 0.3403471856917412,
"grad_norm": 2.136868715286255,
"learning_rate": 4.971177597062215e-06,
"loss": 1.0983,
"step": 647
},
{
"epoch": 0.3408732246186218,
"grad_norm": 2.1036930084228516,
"learning_rate": 4.971072321066868e-06,
"loss": 1.1284,
"step": 648
},
{
"epoch": 0.3413992635455024,
"grad_norm": 2.147191286087036,
"learning_rate": 4.970966854276411e-06,
"loss": 1.1165,
"step": 649
},
{
"epoch": 0.34192530247238295,
"grad_norm": 2.1734893321990967,
"learning_rate": 4.970861196698988e-06,
"loss": 1.0834,
"step": 650
},
{
"epoch": 0.34245134139926353,
"grad_norm": 2.038435459136963,
"learning_rate": 4.97075534834276e-06,
"loss": 1.0193,
"step": 651
},
{
"epoch": 0.3429773803261441,
"grad_norm": 2.077822208404541,
"learning_rate": 4.970649309215895e-06,
"loss": 1.0697,
"step": 652
},
{
"epoch": 0.34350341925302474,
"grad_norm": 2.056907892227173,
"learning_rate": 4.970543079326584e-06,
"loss": 1.0593,
"step": 653
},
{
"epoch": 0.3440294581799053,
"grad_norm": 2.7795369625091553,
"learning_rate": 4.9704366586830275e-06,
"loss": 1.122,
"step": 654
},
{
"epoch": 0.3445554971067859,
"grad_norm": 2.0807559490203857,
"learning_rate": 4.970330047293443e-06,
"loss": 1.0225,
"step": 655
},
{
"epoch": 0.3450815360336665,
"grad_norm": 2.219024658203125,
"learning_rate": 4.970223245166062e-06,
"loss": 1.1506,
"step": 656
},
{
"epoch": 0.34560757496054706,
"grad_norm": 2.1809475421905518,
"learning_rate": 4.970116252309131e-06,
"loss": 1.1094,
"step": 657
},
{
"epoch": 0.3461336138874277,
"grad_norm": 2.243777275085449,
"learning_rate": 4.970009068730911e-06,
"loss": 1.0942,
"step": 658
},
{
"epoch": 0.3466596528143083,
"grad_norm": 2.106391191482544,
"learning_rate": 4.969901694439677e-06,
"loss": 1.0899,
"step": 659
},
{
"epoch": 0.34718569174118885,
"grad_norm": 2.1109979152679443,
"learning_rate": 4.96979412944372e-06,
"loss": 1.0622,
"step": 660
},
{
"epoch": 0.34771173066806943,
"grad_norm": 2.292466163635254,
"learning_rate": 4.969686373751347e-06,
"loss": 1.1081,
"step": 661
},
{
"epoch": 0.34823776959495,
"grad_norm": 1.9919096231460571,
"learning_rate": 4.9695784273708755e-06,
"loss": 1.0774,
"step": 662
},
{
"epoch": 0.34876380852183064,
"grad_norm": 2.2421789169311523,
"learning_rate": 4.969470290310641e-06,
"loss": 1.0958,
"step": 663
},
{
"epoch": 0.3492898474487112,
"grad_norm": 2.069939613342285,
"learning_rate": 4.969361962578994e-06,
"loss": 1.0758,
"step": 664
},
{
"epoch": 0.3498158863755918,
"grad_norm": 2.0892951488494873,
"learning_rate": 4.969253444184297e-06,
"loss": 1.105,
"step": 665
},
{
"epoch": 0.3503419253024724,
"grad_norm": 2.1536753177642822,
"learning_rate": 4.969144735134929e-06,
"loss": 1.0655,
"step": 666
},
{
"epoch": 0.35086796422935296,
"grad_norm": 2.031996250152588,
"learning_rate": 4.969035835439284e-06,
"loss": 1.1107,
"step": 667
},
{
"epoch": 0.35139400315623354,
"grad_norm": 2.068693161010742,
"learning_rate": 4.9689267451057714e-06,
"loss": 1.0293,
"step": 668
},
{
"epoch": 0.35192004208311417,
"grad_norm": 2.1489906311035156,
"learning_rate": 4.9688174641428136e-06,
"loss": 1.0656,
"step": 669
},
{
"epoch": 0.35244608100999475,
"grad_norm": 2.5132720470428467,
"learning_rate": 4.9687079925588475e-06,
"loss": 1.0558,
"step": 670
},
{
"epoch": 0.35297211993687533,
"grad_norm": 1.9639642238616943,
"learning_rate": 4.968598330362326e-06,
"loss": 1.0498,
"step": 671
},
{
"epoch": 0.3534981588637559,
"grad_norm": 2.2413175106048584,
"learning_rate": 4.968488477561716e-06,
"loss": 0.986,
"step": 672
},
{
"epoch": 0.3540241977906365,
"grad_norm": 2.0109381675720215,
"learning_rate": 4.968378434165501e-06,
"loss": 1.1112,
"step": 673
},
{
"epoch": 0.3545502367175171,
"grad_norm": 2.1863934993743896,
"learning_rate": 4.968268200182175e-06,
"loss": 1.0843,
"step": 674
},
{
"epoch": 0.3550762756443977,
"grad_norm": 2.262173652648926,
"learning_rate": 4.968157775620252e-06,
"loss": 1.0938,
"step": 675
},
{
"epoch": 0.3556023145712783,
"grad_norm": 2.261918067932129,
"learning_rate": 4.968047160488256e-06,
"loss": 1.1004,
"step": 676
},
{
"epoch": 0.35612835349815886,
"grad_norm": 2.13324236869812,
"learning_rate": 4.967936354794728e-06,
"loss": 1.0881,
"step": 677
},
{
"epoch": 0.35665439242503943,
"grad_norm": 2.271207809448242,
"learning_rate": 4.967825358548225e-06,
"loss": 1.0967,
"step": 678
},
{
"epoch": 0.35718043135192007,
"grad_norm": 2.177339553833008,
"learning_rate": 4.967714171757315e-06,
"loss": 1.1131,
"step": 679
},
{
"epoch": 0.35770647027880065,
"grad_norm": 2.1329848766326904,
"learning_rate": 4.967602794430585e-06,
"loss": 1.112,
"step": 680
},
{
"epoch": 0.3582325092056812,
"grad_norm": 2.0018250942230225,
"learning_rate": 4.967491226576634e-06,
"loss": 1.0853,
"step": 681
},
{
"epoch": 0.3587585481325618,
"grad_norm": 2.06925106048584,
"learning_rate": 4.967379468204075e-06,
"loss": 1.1405,
"step": 682
},
{
"epoch": 0.3592845870594424,
"grad_norm": 2.0437614917755127,
"learning_rate": 4.967267519321538e-06,
"loss": 1.1165,
"step": 683
},
{
"epoch": 0.35981062598632296,
"grad_norm": 2.043297290802002,
"learning_rate": 4.9671553799376685e-06,
"loss": 1.0438,
"step": 684
},
{
"epoch": 0.3603366649132036,
"grad_norm": 2.060760259628296,
"learning_rate": 4.967043050061121e-06,
"loss": 1.0401,
"step": 685
},
{
"epoch": 0.3608627038400842,
"grad_norm": 2.3929009437561035,
"learning_rate": 4.966930529700572e-06,
"loss": 1.0812,
"step": 686
},
{
"epoch": 0.36138874276696475,
"grad_norm": 2.2057461738586426,
"learning_rate": 4.966817818864708e-06,
"loss": 1.0499,
"step": 687
},
{
"epoch": 0.36191478169384533,
"grad_norm": 2.0358550548553467,
"learning_rate": 4.966704917562231e-06,
"loss": 1.1603,
"step": 688
},
{
"epoch": 0.3624408206207259,
"grad_norm": 2.0840682983398438,
"learning_rate": 4.966591825801859e-06,
"loss": 1.0967,
"step": 689
},
{
"epoch": 0.36296685954760655,
"grad_norm": 2.0170061588287354,
"learning_rate": 4.9664785435923255e-06,
"loss": 1.0573,
"step": 690
},
{
"epoch": 0.3634928984744871,
"grad_norm": 2.1349408626556396,
"learning_rate": 4.966365070942375e-06,
"loss": 1.0665,
"step": 691
},
{
"epoch": 0.3640189374013677,
"grad_norm": 2.1616368293762207,
"learning_rate": 4.966251407860769e-06,
"loss": 1.0306,
"step": 692
},
{
"epoch": 0.3645449763282483,
"grad_norm": 2.2529335021972656,
"learning_rate": 4.966137554356285e-06,
"loss": 1.0445,
"step": 693
},
{
"epoch": 0.36507101525512886,
"grad_norm": 2.041102170944214,
"learning_rate": 4.966023510437713e-06,
"loss": 1.0395,
"step": 694
},
{
"epoch": 0.3655970541820095,
"grad_norm": 2.0450620651245117,
"learning_rate": 4.9659092761138585e-06,
"loss": 1.064,
"step": 695
},
{
"epoch": 0.3661230931088901,
"grad_norm": 2.163081407546997,
"learning_rate": 4.965794851393541e-06,
"loss": 1.0729,
"step": 696
},
{
"epoch": 0.36664913203577065,
"grad_norm": 2.1602089405059814,
"learning_rate": 4.965680236285596e-06,
"loss": 1.0707,
"step": 697
},
{
"epoch": 0.36717517096265123,
"grad_norm": 2.3263938426971436,
"learning_rate": 4.965565430798875e-06,
"loss": 1.0146,
"step": 698
},
{
"epoch": 0.3677012098895318,
"grad_norm": 2.0192365646362305,
"learning_rate": 4.965450434942238e-06,
"loss": 1.0751,
"step": 699
},
{
"epoch": 0.3682272488164124,
"grad_norm": 2.0557174682617188,
"learning_rate": 4.965335248724568e-06,
"loss": 1.0749,
"step": 700
},
{
"epoch": 0.368753287743293,
"grad_norm": 2.29679799079895,
"learning_rate": 4.965219872154757e-06,
"loss": 1.0516,
"step": 701
},
{
"epoch": 0.3692793266701736,
"grad_norm": 2.2303829193115234,
"learning_rate": 4.965104305241713e-06,
"loss": 1.1586,
"step": 702
},
{
"epoch": 0.3698053655970542,
"grad_norm": 2.112283706665039,
"learning_rate": 4.964988547994361e-06,
"loss": 1.0833,
"step": 703
},
{
"epoch": 0.37033140452393476,
"grad_norm": 2.1807613372802734,
"learning_rate": 4.9648726004216354e-06,
"loss": 1.0786,
"step": 704
},
{
"epoch": 0.37085744345081534,
"grad_norm": 2.0990889072418213,
"learning_rate": 4.964756462532492e-06,
"loss": 1.0555,
"step": 705
},
{
"epoch": 0.371383482377696,
"grad_norm": 2.2034318447113037,
"learning_rate": 4.964640134335896e-06,
"loss": 1.0696,
"step": 706
},
{
"epoch": 0.37190952130457655,
"grad_norm": 2.207235813140869,
"learning_rate": 4.964523615840831e-06,
"loss": 1.0897,
"step": 707
},
{
"epoch": 0.37243556023145713,
"grad_norm": 1.8820483684539795,
"learning_rate": 4.964406907056291e-06,
"loss": 1.0822,
"step": 708
},
{
"epoch": 0.3729615991583377,
"grad_norm": 2.2243785858154297,
"learning_rate": 4.964290007991291e-06,
"loss": 1.0958,
"step": 709
},
{
"epoch": 0.3734876380852183,
"grad_norm": 2.208770990371704,
"learning_rate": 4.964172918654854e-06,
"loss": 1.0803,
"step": 710
},
{
"epoch": 0.3740136770120989,
"grad_norm": 2.1083521842956543,
"learning_rate": 4.96405563905602e-06,
"loss": 1.0513,
"step": 711
},
{
"epoch": 0.3745397159389795,
"grad_norm": 2.0161774158477783,
"learning_rate": 4.963938169203847e-06,
"loss": 1.0775,
"step": 712
},
{
"epoch": 0.3750657548658601,
"grad_norm": 2.1578962802886963,
"learning_rate": 4.963820509107403e-06,
"loss": 1.0695,
"step": 713
},
{
"epoch": 0.37559179379274066,
"grad_norm": 2.1972339153289795,
"learning_rate": 4.963702658775774e-06,
"loss": 1.0703,
"step": 714
},
{
"epoch": 0.37611783271962124,
"grad_norm": 2.338205575942993,
"learning_rate": 4.9635846182180594e-06,
"loss": 1.0756,
"step": 715
},
{
"epoch": 0.3766438716465018,
"grad_norm": 2.281242847442627,
"learning_rate": 4.963466387443372e-06,
"loss": 1.1177,
"step": 716
},
{
"epoch": 0.37716991057338245,
"grad_norm": 2.092036724090576,
"learning_rate": 4.963347966460841e-06,
"loss": 1.1004,
"step": 717
},
{
"epoch": 0.37769594950026303,
"grad_norm": 2.148244857788086,
"learning_rate": 4.963229355279611e-06,
"loss": 1.1157,
"step": 718
},
{
"epoch": 0.3782219884271436,
"grad_norm": 1.9961777925491333,
"learning_rate": 4.963110553908838e-06,
"loss": 1.0703,
"step": 719
},
{
"epoch": 0.3787480273540242,
"grad_norm": 2.299091339111328,
"learning_rate": 4.962991562357697e-06,
"loss": 1.1265,
"step": 720
},
{
"epoch": 0.37927406628090476,
"grad_norm": 2.1055006980895996,
"learning_rate": 4.962872380635374e-06,
"loss": 1.0361,
"step": 721
},
{
"epoch": 0.3798001052077854,
"grad_norm": 2.1554667949676514,
"learning_rate": 4.9627530087510725e-06,
"loss": 1.0603,
"step": 722
},
{
"epoch": 0.380326144134666,
"grad_norm": 2.1003949642181396,
"learning_rate": 4.962633446714009e-06,
"loss": 1.0714,
"step": 723
},
{
"epoch": 0.38085218306154656,
"grad_norm": 2.1850736141204834,
"learning_rate": 4.962513694533414e-06,
"loss": 1.0795,
"step": 724
},
{
"epoch": 0.38137822198842714,
"grad_norm": 2.0440175533294678,
"learning_rate": 4.962393752218535e-06,
"loss": 1.0882,
"step": 725
},
{
"epoch": 0.3819042609153077,
"grad_norm": 2.2579755783081055,
"learning_rate": 4.962273619778632e-06,
"loss": 1.1066,
"step": 726
},
{
"epoch": 0.38243029984218835,
"grad_norm": 2.0210318565368652,
"learning_rate": 4.962153297222981e-06,
"loss": 1.0843,
"step": 727
},
{
"epoch": 0.3829563387690689,
"grad_norm": 2.1218135356903076,
"learning_rate": 4.962032784560873e-06,
"loss": 1.1039,
"step": 728
},
{
"epoch": 0.3834823776959495,
"grad_norm": 2.2498831748962402,
"learning_rate": 4.961912081801612e-06,
"loss": 1.0389,
"step": 729
},
{
"epoch": 0.3840084166228301,
"grad_norm": 2.6789276599884033,
"learning_rate": 4.9617911889545175e-06,
"loss": 1.0772,
"step": 730
},
{
"epoch": 0.38453445554971066,
"grad_norm": 1.9847339391708374,
"learning_rate": 4.961670106028924e-06,
"loss": 1.0804,
"step": 731
},
{
"epoch": 0.38506049447659124,
"grad_norm": 2.048737049102783,
"learning_rate": 4.9615488330341814e-06,
"loss": 1.1089,
"step": 732
},
{
"epoch": 0.3855865334034719,
"grad_norm": 2.2241313457489014,
"learning_rate": 4.961427369979652e-06,
"loss": 1.0618,
"step": 733
},
{
"epoch": 0.38611257233035245,
"grad_norm": 1.9084025621414185,
"learning_rate": 4.961305716874716e-06,
"loss": 1.0316,
"step": 734
},
{
"epoch": 0.38663861125723303,
"grad_norm": 2.0064773559570312,
"learning_rate": 4.9611838737287646e-06,
"loss": 1.0289,
"step": 735
},
{
"epoch": 0.3871646501841136,
"grad_norm": 2.386962652206421,
"learning_rate": 4.961061840551205e-06,
"loss": 1.1488,
"step": 736
},
{
"epoch": 0.3876906891109942,
"grad_norm": 2.0626862049102783,
"learning_rate": 4.960939617351462e-06,
"loss": 1.0793,
"step": 737
},
{
"epoch": 0.3882167280378748,
"grad_norm": 2.1622767448425293,
"learning_rate": 4.960817204138971e-06,
"loss": 1.0923,
"step": 738
},
{
"epoch": 0.3887427669647554,
"grad_norm": 2.049163818359375,
"learning_rate": 4.9606946009231834e-06,
"loss": 1.0423,
"step": 739
},
{
"epoch": 0.389268805891636,
"grad_norm": 2.0196399688720703,
"learning_rate": 4.960571807713568e-06,
"loss": 0.9832,
"step": 740
},
{
"epoch": 0.38979484481851656,
"grad_norm": 1.982647180557251,
"learning_rate": 4.960448824519602e-06,
"loss": 1.0424,
"step": 741
},
{
"epoch": 0.39032088374539714,
"grad_norm": 2.0468926429748535,
"learning_rate": 4.960325651350784e-06,
"loss": 1.074,
"step": 742
},
{
"epoch": 0.3908469226722778,
"grad_norm": 2.402381181716919,
"learning_rate": 4.960202288216624e-06,
"loss": 1.058,
"step": 743
},
{
"epoch": 0.39137296159915835,
"grad_norm": 2.065232753753662,
"learning_rate": 4.960078735126646e-06,
"loss": 1.0985,
"step": 744
},
{
"epoch": 0.39189900052603893,
"grad_norm": 2.1949756145477295,
"learning_rate": 4.95995499209039e-06,
"loss": 1.0791,
"step": 745
},
{
"epoch": 0.3924250394529195,
"grad_norm": 2.121232271194458,
"learning_rate": 4.959831059117411e-06,
"loss": 1.0606,
"step": 746
},
{
"epoch": 0.3929510783798001,
"grad_norm": 2.247145652770996,
"learning_rate": 4.959706936217278e-06,
"loss": 1.0991,
"step": 747
},
{
"epoch": 0.39347711730668067,
"grad_norm": 2.0540339946746826,
"learning_rate": 4.9595826233995735e-06,
"loss": 1.0835,
"step": 748
},
{
"epoch": 0.3940031562335613,
"grad_norm": 2.173257350921631,
"learning_rate": 4.959458120673898e-06,
"loss": 1.0588,
"step": 749
},
{
"epoch": 0.3945291951604419,
"grad_norm": 2.1530778408050537,
"learning_rate": 4.959333428049862e-06,
"loss": 1.0395,
"step": 750
},
{
"epoch": 0.39505523408732246,
"grad_norm": 2.0705490112304688,
"learning_rate": 4.959208545537095e-06,
"loss": 1.071,
"step": 751
},
{
"epoch": 0.39558127301420304,
"grad_norm": 1.9439338445663452,
"learning_rate": 4.95908347314524e-06,
"loss": 1.0224,
"step": 752
},
{
"epoch": 0.3961073119410836,
"grad_norm": 2.1683454513549805,
"learning_rate": 4.958958210883952e-06,
"loss": 1.0745,
"step": 753
},
{
"epoch": 0.39663335086796425,
"grad_norm": 2.2809042930603027,
"learning_rate": 4.958832758762903e-06,
"loss": 1.0887,
"step": 754
},
{
"epoch": 0.39715938979484483,
"grad_norm": 2.161447048187256,
"learning_rate": 4.9587071167917814e-06,
"loss": 1.1447,
"step": 755
},
{
"epoch": 0.3976854287217254,
"grad_norm": 2.1375932693481445,
"learning_rate": 4.958581284980285e-06,
"loss": 1.0295,
"step": 756
},
{
"epoch": 0.398211467648606,
"grad_norm": 2.0431041717529297,
"learning_rate": 4.958455263338133e-06,
"loss": 1.0567,
"step": 757
},
{
"epoch": 0.39873750657548657,
"grad_norm": 2.0288238525390625,
"learning_rate": 4.958329051875053e-06,
"loss": 1.0736,
"step": 758
},
{
"epoch": 0.3992635455023672,
"grad_norm": 2.146132230758667,
"learning_rate": 4.958202650600791e-06,
"loss": 1.0744,
"step": 759
},
{
"epoch": 0.3997895844292478,
"grad_norm": 2.1740963459014893,
"learning_rate": 4.958076059525107e-06,
"loss": 1.0263,
"step": 760
},
{
"epoch": 0.40031562335612836,
"grad_norm": 2.1219875812530518,
"learning_rate": 4.957949278657773e-06,
"loss": 1.0508,
"step": 761
},
{
"epoch": 0.40084166228300894,
"grad_norm": 2.0742340087890625,
"learning_rate": 4.9578223080085815e-06,
"loss": 1.0455,
"step": 762
},
{
"epoch": 0.4013677012098895,
"grad_norm": 2.1779415607452393,
"learning_rate": 4.957695147587334e-06,
"loss": 1.1079,
"step": 763
},
{
"epoch": 0.4018937401367701,
"grad_norm": 2.151047706604004,
"learning_rate": 4.957567797403848e-06,
"loss": 1.0893,
"step": 764
},
{
"epoch": 0.40241977906365073,
"grad_norm": 2.1728570461273193,
"learning_rate": 4.9574402574679594e-06,
"loss": 1.0726,
"step": 765
},
{
"epoch": 0.4029458179905313,
"grad_norm": 1.982230305671692,
"learning_rate": 4.957312527789512e-06,
"loss": 1.0629,
"step": 766
},
{
"epoch": 0.4034718569174119,
"grad_norm": 1.953464150428772,
"learning_rate": 4.95718460837837e-06,
"loss": 1.1093,
"step": 767
},
{
"epoch": 0.40399789584429247,
"grad_norm": 1.9718215465545654,
"learning_rate": 4.9570564992444116e-06,
"loss": 1.1018,
"step": 768
},
{
"epoch": 0.40452393477117304,
"grad_norm": 2.067629337310791,
"learning_rate": 4.956928200397526e-06,
"loss": 1.0364,
"step": 769
},
{
"epoch": 0.4050499736980537,
"grad_norm": 2.1172022819519043,
"learning_rate": 4.956799711847619e-06,
"loss": 1.0693,
"step": 770
},
{
"epoch": 0.40557601262493426,
"grad_norm": 2.0539615154266357,
"learning_rate": 4.956671033604613e-06,
"loss": 1.0034,
"step": 771
},
{
"epoch": 0.40610205155181484,
"grad_norm": 1.9780375957489014,
"learning_rate": 4.956542165678443e-06,
"loss": 1.0515,
"step": 772
},
{
"epoch": 0.4066280904786954,
"grad_norm": 2.0974819660186768,
"learning_rate": 4.95641310807906e-06,
"loss": 1.0754,
"step": 773
},
{
"epoch": 0.407154129405576,
"grad_norm": 2.1018221378326416,
"learning_rate": 4.956283860816427e-06,
"loss": 1.1102,
"step": 774
},
{
"epoch": 0.4076801683324566,
"grad_norm": 2.3969085216522217,
"learning_rate": 4.9561544239005235e-06,
"loss": 1.0455,
"step": 775
},
{
"epoch": 0.4082062072593372,
"grad_norm": 2.2645649909973145,
"learning_rate": 4.956024797341345e-06,
"loss": 0.9724,
"step": 776
},
{
"epoch": 0.4087322461862178,
"grad_norm": 2.3406150341033936,
"learning_rate": 4.955894981148898e-06,
"loss": 1.1341,
"step": 777
},
{
"epoch": 0.40925828511309836,
"grad_norm": 2.0782880783081055,
"learning_rate": 4.955764975333208e-06,
"loss": 1.0116,
"step": 778
},
{
"epoch": 0.40978432403997894,
"grad_norm": 2.1269314289093018,
"learning_rate": 4.955634779904312e-06,
"loss": 1.0967,
"step": 779
},
{
"epoch": 0.4103103629668595,
"grad_norm": 2.198559522628784,
"learning_rate": 4.9555043948722625e-06,
"loss": 1.0815,
"step": 780
},
{
"epoch": 0.41083640189374016,
"grad_norm": 2.2189719676971436,
"learning_rate": 4.9553738202471264e-06,
"loss": 1.0559,
"step": 781
},
{
"epoch": 0.41136244082062073,
"grad_norm": 2.2313179969787598,
"learning_rate": 4.955243056038986e-06,
"loss": 1.046,
"step": 782
},
{
"epoch": 0.4118884797475013,
"grad_norm": 1.9563003778457642,
"learning_rate": 4.955112102257939e-06,
"loss": 1.0735,
"step": 783
},
{
"epoch": 0.4124145186743819,
"grad_norm": 1.99479341506958,
"learning_rate": 4.954980958914093e-06,
"loss": 1.0657,
"step": 784
},
{
"epoch": 0.41294055760126247,
"grad_norm": 2.029634714126587,
"learning_rate": 4.954849626017577e-06,
"loss": 1.0811,
"step": 785
},
{
"epoch": 0.4134665965281431,
"grad_norm": 2.2947723865509033,
"learning_rate": 4.9547181035785314e-06,
"loss": 1.0807,
"step": 786
},
{
"epoch": 0.4139926354550237,
"grad_norm": 2.0323445796966553,
"learning_rate": 4.9545863916071094e-06,
"loss": 1.0715,
"step": 787
},
{
"epoch": 0.41451867438190426,
"grad_norm": 2.0068464279174805,
"learning_rate": 4.954454490113482e-06,
"loss": 1.0447,
"step": 788
},
{
"epoch": 0.41504471330878484,
"grad_norm": 2.132549285888672,
"learning_rate": 4.954322399107833e-06,
"loss": 1.0454,
"step": 789
},
{
"epoch": 0.4155707522356654,
"grad_norm": 2.0086755752563477,
"learning_rate": 4.954190118600361e-06,
"loss": 1.0724,
"step": 790
},
{
"epoch": 0.41609679116254605,
"grad_norm": 2.1461241245269775,
"learning_rate": 4.95405764860128e-06,
"loss": 1.0391,
"step": 791
},
{
"epoch": 0.41662283008942663,
"grad_norm": 2.1352107524871826,
"learning_rate": 4.953924989120818e-06,
"loss": 0.9898,
"step": 792
},
{
"epoch": 0.4171488690163072,
"grad_norm": 2.0694406032562256,
"learning_rate": 4.953792140169219e-06,
"loss": 1.0819,
"step": 793
},
{
"epoch": 0.4176749079431878,
"grad_norm": 2.088433027267456,
"learning_rate": 4.953659101756739e-06,
"loss": 1.0833,
"step": 794
},
{
"epoch": 0.41820094687006837,
"grad_norm": 2.1760306358337402,
"learning_rate": 4.95352587389365e-06,
"loss": 1.0535,
"step": 795
},
{
"epoch": 0.41872698579694895,
"grad_norm": 2.2031099796295166,
"learning_rate": 4.95339245659024e-06,
"loss": 1.0389,
"step": 796
},
{
"epoch": 0.4192530247238296,
"grad_norm": 2.247276782989502,
"learning_rate": 4.953258849856809e-06,
"loss": 1.0839,
"step": 797
},
{
"epoch": 0.41977906365071016,
"grad_norm": 2.24357271194458,
"learning_rate": 4.953125053703674e-06,
"loss": 1.0666,
"step": 798
},
{
"epoch": 0.42030510257759074,
"grad_norm": 2.240151882171631,
"learning_rate": 4.952991068141165e-06,
"loss": 1.1009,
"step": 799
},
{
"epoch": 0.4208311415044713,
"grad_norm": 2.2172327041625977,
"learning_rate": 4.952856893179628e-06,
"loss": 1.0928,
"step": 800
},
{
"epoch": 0.4213571804313519,
"grad_norm": 2.377336025238037,
"learning_rate": 4.952722528829422e-06,
"loss": 1.0968,
"step": 801
},
{
"epoch": 0.42188321935823253,
"grad_norm": 2.466841459274292,
"learning_rate": 4.9525879751009205e-06,
"loss": 1.0631,
"step": 802
},
{
"epoch": 0.4224092582851131,
"grad_norm": 2.035644054412842,
"learning_rate": 4.952453232004516e-06,
"loss": 1.0609,
"step": 803
},
{
"epoch": 0.4229352972119937,
"grad_norm": 2.2472054958343506,
"learning_rate": 4.952318299550608e-06,
"loss": 1.0613,
"step": 804
},
{
"epoch": 0.42346133613887427,
"grad_norm": 2.175999879837036,
"learning_rate": 4.952183177749618e-06,
"loss": 1.0954,
"step": 805
},
{
"epoch": 0.42398737506575485,
"grad_norm": 2.206052303314209,
"learning_rate": 4.952047866611978e-06,
"loss": 1.0965,
"step": 806
},
{
"epoch": 0.4245134139926355,
"grad_norm": 1.9550546407699585,
"learning_rate": 4.951912366148135e-06,
"loss": 1.0835,
"step": 807
},
{
"epoch": 0.42503945291951606,
"grad_norm": 2.194734811782837,
"learning_rate": 4.951776676368552e-06,
"loss": 1.1179,
"step": 808
},
{
"epoch": 0.42556549184639664,
"grad_norm": 2.094862222671509,
"learning_rate": 4.951640797283704e-06,
"loss": 1.0634,
"step": 809
},
{
"epoch": 0.4260915307732772,
"grad_norm": 1.980043888092041,
"learning_rate": 4.951504728904085e-06,
"loss": 1.0874,
"step": 810
},
{
"epoch": 0.4266175697001578,
"grad_norm": 2.2654919624328613,
"learning_rate": 4.9513684712402e-06,
"loss": 1.057,
"step": 811
},
{
"epoch": 0.4271436086270384,
"grad_norm": 2.197120189666748,
"learning_rate": 4.951232024302569e-06,
"loss": 1.1114,
"step": 812
},
{
"epoch": 0.427669647553919,
"grad_norm": 2.143324375152588,
"learning_rate": 4.9510953881017275e-06,
"loss": 1.07,
"step": 813
},
{
"epoch": 0.4281956864807996,
"grad_norm": 2.1920077800750732,
"learning_rate": 4.950958562648226e-06,
"loss": 1.0373,
"step": 814
},
{
"epoch": 0.42872172540768017,
"grad_norm": 2.0401923656463623,
"learning_rate": 4.950821547952629e-06,
"loss": 1.1111,
"step": 815
},
{
"epoch": 0.42924776433456074,
"grad_norm": 1.9541674852371216,
"learning_rate": 4.950684344025515e-06,
"loss": 1.0153,
"step": 816
},
{
"epoch": 0.4297738032614413,
"grad_norm": 3.5096704959869385,
"learning_rate": 4.9505469508774776e-06,
"loss": 1.0435,
"step": 817
},
{
"epoch": 0.43029984218832196,
"grad_norm": 2.0304462909698486,
"learning_rate": 4.9504093685191255e-06,
"loss": 1.0786,
"step": 818
},
{
"epoch": 0.43082588111520254,
"grad_norm": 2.115224599838257,
"learning_rate": 4.950271596961082e-06,
"loss": 1.0854,
"step": 819
},
{
"epoch": 0.4313519200420831,
"grad_norm": 2.176621913909912,
"learning_rate": 4.950133636213984e-06,
"loss": 0.9909,
"step": 820
},
{
"epoch": 0.4318779589689637,
"grad_norm": 2.2046449184417725,
"learning_rate": 4.949995486288484e-06,
"loss": 1.0688,
"step": 821
},
{
"epoch": 0.43240399789584427,
"grad_norm": 2.1462888717651367,
"learning_rate": 4.949857147195249e-06,
"loss": 1.0644,
"step": 822
},
{
"epoch": 0.4329300368227249,
"grad_norm": 2.0735347270965576,
"learning_rate": 4.94971861894496e-06,
"loss": 1.022,
"step": 823
},
{
"epoch": 0.4334560757496055,
"grad_norm": 2.086724042892456,
"learning_rate": 4.949579901548312e-06,
"loss": 1.02,
"step": 824
},
{
"epoch": 0.43398211467648606,
"grad_norm": 2.078622341156006,
"learning_rate": 4.949440995016018e-06,
"loss": 1.0653,
"step": 825
},
{
"epoch": 0.43450815360336664,
"grad_norm": 2.1504440307617188,
"learning_rate": 4.949301899358801e-06,
"loss": 1.0708,
"step": 826
},
{
"epoch": 0.4350341925302472,
"grad_norm": 2.2340216636657715,
"learning_rate": 4.949162614587401e-06,
"loss": 1.0688,
"step": 827
},
{
"epoch": 0.4355602314571278,
"grad_norm": 2.2017569541931152,
"learning_rate": 4.949023140712574e-06,
"loss": 1.0935,
"step": 828
},
{
"epoch": 0.43608627038400843,
"grad_norm": 2.117745876312256,
"learning_rate": 4.948883477745088e-06,
"loss": 1.0868,
"step": 829
},
{
"epoch": 0.436612309310889,
"grad_norm": 2.0983524322509766,
"learning_rate": 4.948743625695726e-06,
"loss": 1.0695,
"step": 830
},
{
"epoch": 0.4371383482377696,
"grad_norm": 2.205693244934082,
"learning_rate": 4.948603584575287e-06,
"loss": 1.0541,
"step": 831
},
{
"epoch": 0.43766438716465017,
"grad_norm": 1.9967527389526367,
"learning_rate": 4.948463354394583e-06,
"loss": 0.9933,
"step": 832
},
{
"epoch": 0.43819042609153075,
"grad_norm": 2.113577127456665,
"learning_rate": 4.948322935164442e-06,
"loss": 1.0199,
"step": 833
},
{
"epoch": 0.4387164650184114,
"grad_norm": 2.0825533866882324,
"learning_rate": 4.948182326895705e-06,
"loss": 1.0446,
"step": 834
},
{
"epoch": 0.43924250394529196,
"grad_norm": 2.0186421871185303,
"learning_rate": 4.94804152959923e-06,
"loss": 1.0798,
"step": 835
},
{
"epoch": 0.43976854287217254,
"grad_norm": 2.3025147914886475,
"learning_rate": 4.947900543285888e-06,
"loss": 0.9977,
"step": 836
},
{
"epoch": 0.4402945817990531,
"grad_norm": 2.1662867069244385,
"learning_rate": 4.947759367966564e-06,
"loss": 1.048,
"step": 837
},
{
"epoch": 0.4408206207259337,
"grad_norm": 2.0708656311035156,
"learning_rate": 4.947618003652158e-06,
"loss": 1.0715,
"step": 838
},
{
"epoch": 0.44134665965281433,
"grad_norm": 2.2494263648986816,
"learning_rate": 4.947476450353586e-06,
"loss": 1.0901,
"step": 839
},
{
"epoch": 0.4418726985796949,
"grad_norm": 2.3319430351257324,
"learning_rate": 4.947334708081777e-06,
"loss": 1.0308,
"step": 840
},
{
"epoch": 0.4423987375065755,
"grad_norm": 2.134620428085327,
"learning_rate": 4.947192776847676e-06,
"loss": 1.0459,
"step": 841
},
{
"epoch": 0.44292477643345607,
"grad_norm": 2.075429916381836,
"learning_rate": 4.94705065666224e-06,
"loss": 1.0733,
"step": 842
},
{
"epoch": 0.44345081536033665,
"grad_norm": 2.173069953918457,
"learning_rate": 4.946908347536444e-06,
"loss": 1.1092,
"step": 843
},
{
"epoch": 0.4439768542872172,
"grad_norm": 2.1481893062591553,
"learning_rate": 4.946765849481274e-06,
"loss": 1.0822,
"step": 844
},
{
"epoch": 0.44450289321409786,
"grad_norm": 2.247277021408081,
"learning_rate": 4.9466231625077354e-06,
"loss": 1.0777,
"step": 845
},
{
"epoch": 0.44502893214097844,
"grad_norm": 2.1181042194366455,
"learning_rate": 4.946480286626842e-06,
"loss": 1.1139,
"step": 846
},
{
"epoch": 0.445554971067859,
"grad_norm": 2.05195951461792,
"learning_rate": 4.946337221849628e-06,
"loss": 1.0738,
"step": 847
},
{
"epoch": 0.4460810099947396,
"grad_norm": 2.122732639312744,
"learning_rate": 4.946193968187139e-06,
"loss": 1.061,
"step": 848
},
{
"epoch": 0.4466070489216202,
"grad_norm": 1.8827515840530396,
"learning_rate": 4.946050525650434e-06,
"loss": 1.061,
"step": 849
},
{
"epoch": 0.4471330878485008,
"grad_norm": 2.3874471187591553,
"learning_rate": 4.945906894250591e-06,
"loss": 1.0667,
"step": 850
},
{
"epoch": 0.4476591267753814,
"grad_norm": 2.274724006652832,
"learning_rate": 4.945763073998699e-06,
"loss": 1.0559,
"step": 851
},
{
"epoch": 0.44818516570226197,
"grad_norm": 2.2730906009674072,
"learning_rate": 4.945619064905861e-06,
"loss": 1.0952,
"step": 852
},
{
"epoch": 0.44871120462914255,
"grad_norm": 2.190969944000244,
"learning_rate": 4.945474866983199e-06,
"loss": 1.0816,
"step": 853
},
{
"epoch": 0.4492372435560231,
"grad_norm": 3.6214282512664795,
"learning_rate": 4.945330480241844e-06,
"loss": 1.09,
"step": 854
},
{
"epoch": 0.44976328248290376,
"grad_norm": 2.0487356185913086,
"learning_rate": 4.945185904692946e-06,
"loss": 1.0279,
"step": 855
},
{
"epoch": 0.45028932140978434,
"grad_norm": 2.074282646179199,
"learning_rate": 4.945041140347669e-06,
"loss": 1.0514,
"step": 856
},
{
"epoch": 0.4508153603366649,
"grad_norm": 2.126495838165283,
"learning_rate": 4.944896187217187e-06,
"loss": 1.0819,
"step": 857
},
{
"epoch": 0.4513413992635455,
"grad_norm": 2.0265605449676514,
"learning_rate": 4.944751045312695e-06,
"loss": 1.0282,
"step": 858
},
{
"epoch": 0.4518674381904261,
"grad_norm": 2.0557355880737305,
"learning_rate": 4.944605714645399e-06,
"loss": 1.1052,
"step": 859
},
{
"epoch": 0.4523934771173067,
"grad_norm": 2.026393175125122,
"learning_rate": 4.944460195226519e-06,
"loss": 0.982,
"step": 860
},
{
"epoch": 0.4529195160441873,
"grad_norm": 2.1781463623046875,
"learning_rate": 4.9443144870672925e-06,
"loss": 1.1251,
"step": 861
},
{
"epoch": 0.45344555497106787,
"grad_norm": 2.053683042526245,
"learning_rate": 4.944168590178968e-06,
"loss": 1.0766,
"step": 862
},
{
"epoch": 0.45397159389794844,
"grad_norm": 2.1147496700286865,
"learning_rate": 4.944022504572811e-06,
"loss": 1.0174,
"step": 863
},
{
"epoch": 0.454497632824829,
"grad_norm": 2.06046199798584,
"learning_rate": 4.943876230260102e-06,
"loss": 1.0836,
"step": 864
},
{
"epoch": 0.4550236717517096,
"grad_norm": 2.171419382095337,
"learning_rate": 4.9437297672521345e-06,
"loss": 1.0695,
"step": 865
},
{
"epoch": 0.45554971067859024,
"grad_norm": 2.064301013946533,
"learning_rate": 4.943583115560217e-06,
"loss": 1.0147,
"step": 866
},
{
"epoch": 0.4560757496054708,
"grad_norm": 2.6638195514678955,
"learning_rate": 4.943436275195673e-06,
"loss": 1.0565,
"step": 867
},
{
"epoch": 0.4566017885323514,
"grad_norm": 3.9418976306915283,
"learning_rate": 4.943289246169839e-06,
"loss": 1.0768,
"step": 868
},
{
"epoch": 0.457127827459232,
"grad_norm": 2.114297389984131,
"learning_rate": 4.943142028494069e-06,
"loss": 1.0687,
"step": 869
},
{
"epoch": 0.45765386638611255,
"grad_norm": 2.139803171157837,
"learning_rate": 4.942994622179729e-06,
"loss": 1.0464,
"step": 870
},
{
"epoch": 0.4581799053129932,
"grad_norm": 2.011474370956421,
"learning_rate": 4.942847027238201e-06,
"loss": 1.0181,
"step": 871
},
{
"epoch": 0.45870594423987376,
"grad_norm": 2.1592113971710205,
"learning_rate": 4.94269924368088e-06,
"loss": 1.0699,
"step": 872
},
{
"epoch": 0.45923198316675434,
"grad_norm": 2.0230283737182617,
"learning_rate": 4.942551271519178e-06,
"loss": 1.075,
"step": 873
},
{
"epoch": 0.4597580220936349,
"grad_norm": 2.286768913269043,
"learning_rate": 4.942403110764518e-06,
"loss": 1.0604,
"step": 874
},
{
"epoch": 0.4602840610205155,
"grad_norm": 2.305375337600708,
"learning_rate": 4.942254761428343e-06,
"loss": 1.0067,
"step": 875
},
{
"epoch": 0.46081009994739613,
"grad_norm": 2.416245698928833,
"learning_rate": 4.942106223522104e-06,
"loss": 1.1109,
"step": 876
},
{
"epoch": 0.4613361388742767,
"grad_norm": 2.1339962482452393,
"learning_rate": 4.941957497057272e-06,
"loss": 1.0708,
"step": 877
},
{
"epoch": 0.4618621778011573,
"grad_norm": 1.9983795881271362,
"learning_rate": 4.941808582045329e-06,
"loss": 1.0032,
"step": 878
},
{
"epoch": 0.46238821672803787,
"grad_norm": 2.1115024089813232,
"learning_rate": 4.9416594784977735e-06,
"loss": 1.0272,
"step": 879
},
{
"epoch": 0.46291425565491845,
"grad_norm": 2.2785818576812744,
"learning_rate": 4.941510186426118e-06,
"loss": 1.0538,
"step": 880
},
{
"epoch": 0.46344029458179903,
"grad_norm": 2.009938955307007,
"learning_rate": 4.94136070584189e-06,
"loss": 1.0432,
"step": 881
},
{
"epoch": 0.46396633350867966,
"grad_norm": 2.119264841079712,
"learning_rate": 4.94121103675663e-06,
"loss": 1.063,
"step": 882
},
{
"epoch": 0.46449237243556024,
"grad_norm": 2.267575979232788,
"learning_rate": 4.941061179181896e-06,
"loss": 1.0698,
"step": 883
},
{
"epoch": 0.4650184113624408,
"grad_norm": 2.2345592975616455,
"learning_rate": 4.940911133129257e-06,
"loss": 1.0898,
"step": 884
},
{
"epoch": 0.4655444502893214,
"grad_norm": 2.175180673599243,
"learning_rate": 4.940760898610299e-06,
"loss": 1.0915,
"step": 885
},
{
"epoch": 0.466070489216202,
"grad_norm": 2.036628246307373,
"learning_rate": 4.940610475636621e-06,
"loss": 1.0981,
"step": 886
},
{
"epoch": 0.4665965281430826,
"grad_norm": 2.193129539489746,
"learning_rate": 4.9404598642198386e-06,
"loss": 1.1237,
"step": 887
},
{
"epoch": 0.4671225670699632,
"grad_norm": 1.920074462890625,
"learning_rate": 4.9403090643715804e-06,
"loss": 1.0358,
"step": 888
},
{
"epoch": 0.46764860599684377,
"grad_norm": 2.0745346546173096,
"learning_rate": 4.940158076103489e-06,
"loss": 1.0487,
"step": 889
},
{
"epoch": 0.46817464492372435,
"grad_norm": 1.9645469188690186,
"learning_rate": 4.940006899427225e-06,
"loss": 1.0256,
"step": 890
},
{
"epoch": 0.4687006838506049,
"grad_norm": 1.9696778059005737,
"learning_rate": 4.939855534354458e-06,
"loss": 1.0302,
"step": 891
},
{
"epoch": 0.46922672277748556,
"grad_norm": 2.1893057823181152,
"learning_rate": 4.939703980896875e-06,
"loss": 1.0391,
"step": 892
},
{
"epoch": 0.46975276170436614,
"grad_norm": 2.0537021160125732,
"learning_rate": 4.93955223906618e-06,
"loss": 1.0498,
"step": 893
},
{
"epoch": 0.4702788006312467,
"grad_norm": 2.4528138637542725,
"learning_rate": 4.9394003088740875e-06,
"loss": 1.0393,
"step": 894
},
{
"epoch": 0.4708048395581273,
"grad_norm": 2.2085723876953125,
"learning_rate": 4.93924819033233e-06,
"loss": 1.0789,
"step": 895
},
{
"epoch": 0.4713308784850079,
"grad_norm": 2.0029642581939697,
"learning_rate": 4.9390958834526504e-06,
"loss": 1.0621,
"step": 896
},
{
"epoch": 0.47185691741188845,
"grad_norm": 2.0400004386901855,
"learning_rate": 4.93894338824681e-06,
"loss": 1.0426,
"step": 897
},
{
"epoch": 0.4723829563387691,
"grad_norm": 2.3174595832824707,
"learning_rate": 4.9387907047265825e-06,
"loss": 1.0273,
"step": 898
},
{
"epoch": 0.47290899526564967,
"grad_norm": 1.998889446258545,
"learning_rate": 4.938637832903758e-06,
"loss": 1.0401,
"step": 899
},
{
"epoch": 0.47343503419253025,
"grad_norm": 2.0847246646881104,
"learning_rate": 4.93848477279014e-06,
"loss": 1.0677,
"step": 900
},
{
"epoch": 0.4739610731194108,
"grad_norm": 2.086249351501465,
"learning_rate": 4.938331524397544e-06,
"loss": 1.043,
"step": 901
},
{
"epoch": 0.4744871120462914,
"grad_norm": 2.1909382343292236,
"learning_rate": 4.938178087737805e-06,
"loss": 0.9977,
"step": 902
},
{
"epoch": 0.47501315097317204,
"grad_norm": 2.066394567489624,
"learning_rate": 4.938024462822769e-06,
"loss": 1.044,
"step": 903
},
{
"epoch": 0.4755391899000526,
"grad_norm": 2.1768858432769775,
"learning_rate": 4.937870649664299e-06,
"loss": 0.9886,
"step": 904
},
{
"epoch": 0.4760652288269332,
"grad_norm": 2.0450236797332764,
"learning_rate": 4.937716648274269e-06,
"loss": 1.0471,
"step": 905
},
{
"epoch": 0.4765912677538138,
"grad_norm": 2.218719720840454,
"learning_rate": 4.937562458664571e-06,
"loss": 1.0324,
"step": 906
},
{
"epoch": 0.47711730668069435,
"grad_norm": 2.2519423961639404,
"learning_rate": 4.937408080847109e-06,
"loss": 1.0899,
"step": 907
},
{
"epoch": 0.477643345607575,
"grad_norm": 2.045959234237671,
"learning_rate": 4.9372535148338055e-06,
"loss": 1.0383,
"step": 908
},
{
"epoch": 0.47816938453445557,
"grad_norm": 2.1137306690216064,
"learning_rate": 4.937098760636591e-06,
"loss": 1.0223,
"step": 909
},
{
"epoch": 0.47869542346133614,
"grad_norm": 2.2585835456848145,
"learning_rate": 4.936943818267418e-06,
"loss": 1.027,
"step": 910
},
{
"epoch": 0.4792214623882167,
"grad_norm": 2.161625862121582,
"learning_rate": 4.936788687738247e-06,
"loss": 1.0318,
"step": 911
},
{
"epoch": 0.4797475013150973,
"grad_norm": 2.0743277072906494,
"learning_rate": 4.936633369061057e-06,
"loss": 1.1014,
"step": 912
},
{
"epoch": 0.4802735402419779,
"grad_norm": 2.1271307468414307,
"learning_rate": 4.936477862247841e-06,
"loss": 1.0403,
"step": 913
},
{
"epoch": 0.4807995791688585,
"grad_norm": 2.0820491313934326,
"learning_rate": 4.9363221673106046e-06,
"loss": 1.069,
"step": 914
},
{
"epoch": 0.4813256180957391,
"grad_norm": 2.0069093704223633,
"learning_rate": 4.936166284261369e-06,
"loss": 1.0752,
"step": 915
},
{
"epoch": 0.4818516570226197,
"grad_norm": 2.2541720867156982,
"learning_rate": 4.936010213112172e-06,
"loss": 1.0309,
"step": 916
},
{
"epoch": 0.48237769594950025,
"grad_norm": 2.155980110168457,
"learning_rate": 4.9358539538750636e-06,
"loss": 1.0078,
"step": 917
},
{
"epoch": 0.48290373487638083,
"grad_norm": 2.217339038848877,
"learning_rate": 4.935697506562107e-06,
"loss": 1.0522,
"step": 918
},
{
"epoch": 0.48342977380326146,
"grad_norm": 1.963270902633667,
"learning_rate": 4.935540871185384e-06,
"loss": 1.0692,
"step": 919
},
{
"epoch": 0.48395581273014204,
"grad_norm": 1.9923917055130005,
"learning_rate": 4.935384047756987e-06,
"loss": 1.0926,
"step": 920
},
{
"epoch": 0.4844818516570226,
"grad_norm": 2.177624464035034,
"learning_rate": 4.935227036289026e-06,
"loss": 1.0727,
"step": 921
},
{
"epoch": 0.4850078905839032,
"grad_norm": 2.022496461868286,
"learning_rate": 4.935069836793622e-06,
"loss": 1.0267,
"step": 922
},
{
"epoch": 0.4855339295107838,
"grad_norm": 2.0110666751861572,
"learning_rate": 4.9349124492829155e-06,
"loss": 1.0911,
"step": 923
},
{
"epoch": 0.4860599684376644,
"grad_norm": 2.1780877113342285,
"learning_rate": 4.934754873769057e-06,
"loss": 1.0494,
"step": 924
},
{
"epoch": 0.486586007364545,
"grad_norm": 2.0291390419006348,
"learning_rate": 4.934597110264212e-06,
"loss": 1.0485,
"step": 925
},
{
"epoch": 0.48711204629142557,
"grad_norm": 1.947896957397461,
"learning_rate": 4.9344391587805626e-06,
"loss": 1.0789,
"step": 926
},
{
"epoch": 0.48763808521830615,
"grad_norm": 1.9520971775054932,
"learning_rate": 4.934281019330305e-06,
"loss": 1.0644,
"step": 927
},
{
"epoch": 0.48816412414518673,
"grad_norm": 2.0348432064056396,
"learning_rate": 4.93412269192565e-06,
"loss": 1.0471,
"step": 928
},
{
"epoch": 0.4886901630720673,
"grad_norm": 2.214876651763916,
"learning_rate": 4.93396417657882e-06,
"loss": 1.0921,
"step": 929
},
{
"epoch": 0.48921620199894794,
"grad_norm": 1.9910991191864014,
"learning_rate": 4.933805473302057e-06,
"loss": 1.0962,
"step": 930
},
{
"epoch": 0.4897422409258285,
"grad_norm": 2.0497536659240723,
"learning_rate": 4.933646582107612e-06,
"loss": 1.0502,
"step": 931
},
{
"epoch": 0.4902682798527091,
"grad_norm": 2.102994203567505,
"learning_rate": 4.933487503007756e-06,
"loss": 1.0676,
"step": 932
},
{
"epoch": 0.4907943187795897,
"grad_norm": 1.885666012763977,
"learning_rate": 4.933328236014768e-06,
"loss": 1.0005,
"step": 933
},
{
"epoch": 0.49132035770647026,
"grad_norm": 2.1525766849517822,
"learning_rate": 4.933168781140949e-06,
"loss": 1.0997,
"step": 934
},
{
"epoch": 0.4918463966333509,
"grad_norm": 2.0346620082855225,
"learning_rate": 4.9330091383986086e-06,
"loss": 1.0651,
"step": 935
},
{
"epoch": 0.49237243556023147,
"grad_norm": 2.0436878204345703,
"learning_rate": 4.932849307800074e-06,
"loss": 1.0539,
"step": 936
},
{
"epoch": 0.49289847448711205,
"grad_norm": 2.1023032665252686,
"learning_rate": 4.932689289357686e-06,
"loss": 1.0583,
"step": 937
},
{
"epoch": 0.4934245134139926,
"grad_norm": 2.0781443119049072,
"learning_rate": 4.932529083083798e-06,
"loss": 1.0753,
"step": 938
},
{
"epoch": 0.4939505523408732,
"grad_norm": 2.0385992527008057,
"learning_rate": 4.932368688990783e-06,
"loss": 1.0165,
"step": 939
},
{
"epoch": 0.49447659126775384,
"grad_norm": 2.350186586380005,
"learning_rate": 4.932208107091022e-06,
"loss": 1.0834,
"step": 940
},
{
"epoch": 0.4950026301946344,
"grad_norm": 2.2009286880493164,
"learning_rate": 4.932047337396917e-06,
"loss": 1.0975,
"step": 941
},
{
"epoch": 0.495528669121515,
"grad_norm": 2.389380931854248,
"learning_rate": 4.931886379920878e-06,
"loss": 1.0853,
"step": 942
},
{
"epoch": 0.4960547080483956,
"grad_norm": 2.016162157058716,
"learning_rate": 4.931725234675334e-06,
"loss": 1.039,
"step": 943
},
{
"epoch": 0.49658074697527615,
"grad_norm": 2.116718292236328,
"learning_rate": 4.9315639016727286e-06,
"loss": 1.0182,
"step": 944
},
{
"epoch": 0.49710678590215673,
"grad_norm": 2.1381125450134277,
"learning_rate": 4.931402380925517e-06,
"loss": 1.1051,
"step": 945
},
{
"epoch": 0.49763282482903737,
"grad_norm": 2.0954737663269043,
"learning_rate": 4.931240672446171e-06,
"loss": 1.038,
"step": 946
},
{
"epoch": 0.49815886375591795,
"grad_norm": 2.167865037918091,
"learning_rate": 4.931078776247176e-06,
"loss": 1.0998,
"step": 947
},
{
"epoch": 0.4986849026827985,
"grad_norm": 2.1278021335601807,
"learning_rate": 4.930916692341034e-06,
"loss": 1.0374,
"step": 948
},
{
"epoch": 0.4992109416096791,
"grad_norm": 2.088512420654297,
"learning_rate": 4.9307544207402565e-06,
"loss": 1.0954,
"step": 949
},
{
"epoch": 0.4997369805365597,
"grad_norm": 2.015916109085083,
"learning_rate": 4.930591961457375e-06,
"loss": 1.0163,
"step": 950
},
{
"epoch": 0.5002630194634403,
"grad_norm": 2.0662143230438232,
"learning_rate": 4.930429314504933e-06,
"loss": 1.0968,
"step": 951
},
{
"epoch": 0.5007890583903208,
"grad_norm": 2.0692410469055176,
"learning_rate": 4.930266479895488e-06,
"loss": 1.0772,
"step": 952
},
{
"epoch": 0.5013150973172015,
"grad_norm": 2.0734803676605225,
"learning_rate": 4.930103457641613e-06,
"loss": 1.1096,
"step": 953
},
{
"epoch": 0.5018411362440821,
"grad_norm": 2.167228937149048,
"learning_rate": 4.929940247755896e-06,
"loss": 1.0608,
"step": 954
},
{
"epoch": 0.5023671751709626,
"grad_norm": 2.272087574005127,
"learning_rate": 4.929776850250937e-06,
"loss": 1.0825,
"step": 955
},
{
"epoch": 0.5028932140978433,
"grad_norm": 2.0937726497650146,
"learning_rate": 4.929613265139354e-06,
"loss": 1.0651,
"step": 956
},
{
"epoch": 0.5034192530247238,
"grad_norm": 2.168090343475342,
"learning_rate": 4.929449492433777e-06,
"loss": 1.0821,
"step": 957
},
{
"epoch": 0.5039452919516044,
"grad_norm": 2.0708675384521484,
"learning_rate": 4.92928553214685e-06,
"loss": 1.0655,
"step": 958
},
{
"epoch": 0.5044713308784851,
"grad_norm": 2.067678689956665,
"learning_rate": 4.929121384291234e-06,
"loss": 1.05,
"step": 959
},
{
"epoch": 0.5049973698053656,
"grad_norm": 1.9181219339370728,
"learning_rate": 4.928957048879602e-06,
"loss": 0.9935,
"step": 960
},
{
"epoch": 0.5055234087322462,
"grad_norm": 2.217785358428955,
"learning_rate": 4.928792525924644e-06,
"loss": 0.97,
"step": 961
},
{
"epoch": 0.5060494476591267,
"grad_norm": 2.084656238555908,
"learning_rate": 4.928627815439062e-06,
"loss": 1.0541,
"step": 962
},
{
"epoch": 0.5065754865860074,
"grad_norm": 2.035367727279663,
"learning_rate": 4.928462917435574e-06,
"loss": 1.0694,
"step": 963
},
{
"epoch": 0.507101525512888,
"grad_norm": 2.001654624938965,
"learning_rate": 4.928297831926912e-06,
"loss": 1.0232,
"step": 964
},
{
"epoch": 0.5076275644397685,
"grad_norm": 2.57733154296875,
"learning_rate": 4.928132558925822e-06,
"loss": 1.0664,
"step": 965
},
{
"epoch": 0.5081536033666492,
"grad_norm": 2.1757423877716064,
"learning_rate": 4.927967098445066e-06,
"loss": 1.1119,
"step": 966
},
{
"epoch": 0.5086796422935297,
"grad_norm": 2.089594602584839,
"learning_rate": 4.927801450497417e-06,
"loss": 1.0212,
"step": 967
},
{
"epoch": 0.5092056812204103,
"grad_norm": 2.078519821166992,
"learning_rate": 4.927635615095668e-06,
"loss": 1.0381,
"step": 968
},
{
"epoch": 0.5097317201472908,
"grad_norm": 2.0807132720947266,
"learning_rate": 4.927469592252621e-06,
"loss": 1.0272,
"step": 969
},
{
"epoch": 0.5102577590741715,
"grad_norm": 2.1806020736694336,
"learning_rate": 4.927303381981098e-06,
"loss": 1.0846,
"step": 970
},
{
"epoch": 0.5107837980010521,
"grad_norm": 2.1434948444366455,
"learning_rate": 4.927136984293928e-06,
"loss": 1.0775,
"step": 971
},
{
"epoch": 0.5113098369279326,
"grad_norm": 2.000924825668335,
"learning_rate": 4.926970399203962e-06,
"loss": 1.0272,
"step": 972
},
{
"epoch": 0.5118358758548133,
"grad_norm": 2.1742711067199707,
"learning_rate": 4.926803626724062e-06,
"loss": 1.0253,
"step": 973
},
{
"epoch": 0.5123619147816938,
"grad_norm": 2.1074674129486084,
"learning_rate": 4.926636666867103e-06,
"loss": 1.0146,
"step": 974
},
{
"epoch": 0.5128879537085744,
"grad_norm": 2.1562392711639404,
"learning_rate": 4.926469519645976e-06,
"loss": 1.0364,
"step": 975
},
{
"epoch": 0.5134139926354551,
"grad_norm": 2.4177775382995605,
"learning_rate": 4.926302185073591e-06,
"loss": 1.0658,
"step": 976
},
{
"epoch": 0.5139400315623356,
"grad_norm": 2.322571277618408,
"learning_rate": 4.9261346631628635e-06,
"loss": 1.0489,
"step": 977
},
{
"epoch": 0.5144660704892162,
"grad_norm": 2.0937836170196533,
"learning_rate": 4.925966953926729e-06,
"loss": 1.0535,
"step": 978
},
{
"epoch": 0.5149921094160967,
"grad_norm": 2.065213680267334,
"learning_rate": 4.925799057378139e-06,
"loss": 1.0097,
"step": 979
},
{
"epoch": 0.5155181483429774,
"grad_norm": 2.0844249725341797,
"learning_rate": 4.925630973530054e-06,
"loss": 1.0719,
"step": 980
},
{
"epoch": 0.516044187269858,
"grad_norm": 2.4148666858673096,
"learning_rate": 4.925462702395454e-06,
"loss": 1.0906,
"step": 981
},
{
"epoch": 0.5165702261967385,
"grad_norm": 2.071423053741455,
"learning_rate": 4.925294243987331e-06,
"loss": 1.03,
"step": 982
},
{
"epoch": 0.5170962651236192,
"grad_norm": 2.2397513389587402,
"learning_rate": 4.9251255983186915e-06,
"loss": 1.0412,
"step": 983
},
{
"epoch": 0.5176223040504997,
"grad_norm": 2.171269655227661,
"learning_rate": 4.924956765402557e-06,
"loss": 1.1,
"step": 984
},
{
"epoch": 0.5181483429773803,
"grad_norm": 2.1705877780914307,
"learning_rate": 4.924787745251963e-06,
"loss": 1.0534,
"step": 985
},
{
"epoch": 0.518674381904261,
"grad_norm": 2.178514003753662,
"learning_rate": 4.924618537879961e-06,
"loss": 1.0759,
"step": 986
},
{
"epoch": 0.5192004208311415,
"grad_norm": 2.072097063064575,
"learning_rate": 4.924449143299614e-06,
"loss": 1.0321,
"step": 987
},
{
"epoch": 0.5197264597580221,
"grad_norm": 2.124030351638794,
"learning_rate": 4.924279561524004e-06,
"loss": 1.0465,
"step": 988
},
{
"epoch": 0.5202524986849026,
"grad_norm": 2.0927019119262695,
"learning_rate": 4.924109792566222e-06,
"loss": 1.0716,
"step": 989
},
{
"epoch": 0.5207785376117833,
"grad_norm": 2.0673232078552246,
"learning_rate": 4.923939836439377e-06,
"loss": 1.0628,
"step": 990
},
{
"epoch": 0.5213045765386639,
"grad_norm": 2.2538528442382812,
"learning_rate": 4.92376969315659e-06,
"loss": 1.0687,
"step": 991
},
{
"epoch": 0.5218306154655444,
"grad_norm": 2.120530366897583,
"learning_rate": 4.923599362731001e-06,
"loss": 1.0893,
"step": 992
},
{
"epoch": 0.5223566543924251,
"grad_norm": 2.0750699043273926,
"learning_rate": 4.92342884517576e-06,
"loss": 1.0016,
"step": 993
},
{
"epoch": 0.5228826933193056,
"grad_norm": 1.9984569549560547,
"learning_rate": 4.923258140504032e-06,
"loss": 1.0326,
"step": 994
},
{
"epoch": 0.5234087322461862,
"grad_norm": 2.201758861541748,
"learning_rate": 4.923087248729e-06,
"loss": 1.0413,
"step": 995
},
{
"epoch": 0.5239347711730669,
"grad_norm": 2.1322648525238037,
"learning_rate": 4.922916169863855e-06,
"loss": 1.0505,
"step": 996
},
{
"epoch": 0.5244608100999474,
"grad_norm": 2.0557119846343994,
"learning_rate": 4.922744903921809e-06,
"loss": 0.9761,
"step": 997
},
{
"epoch": 0.524986849026828,
"grad_norm": 2.0989720821380615,
"learning_rate": 4.922573450916086e-06,
"loss": 1.0436,
"step": 998
},
{
"epoch": 0.5255128879537085,
"grad_norm": 2.152665138244629,
"learning_rate": 4.922401810859922e-06,
"loss": 1.0567,
"step": 999
},
{
"epoch": 0.5260389268805892,
"grad_norm": 1.959796667098999,
"learning_rate": 4.922229983766571e-06,
"loss": 1.0694,
"step": 1000
},
{
"epoch": 0.5265649658074697,
"grad_norm": 2.121493101119995,
"learning_rate": 4.9220579696493e-06,
"loss": 1.1024,
"step": 1001
},
{
"epoch": 0.5270910047343503,
"grad_norm": 1.9629384279251099,
"learning_rate": 4.92188576852139e-06,
"loss": 1.0538,
"step": 1002
},
{
"epoch": 0.527617043661231,
"grad_norm": 2.396224021911621,
"learning_rate": 4.921713380396137e-06,
"loss": 1.0711,
"step": 1003
},
{
"epoch": 0.5281430825881115,
"grad_norm": 2.1571781635284424,
"learning_rate": 4.921540805286852e-06,
"loss": 1.0663,
"step": 1004
},
{
"epoch": 0.5286691215149921,
"grad_norm": 2.032282590866089,
"learning_rate": 4.921368043206858e-06,
"loss": 1.0658,
"step": 1005
},
{
"epoch": 0.5291951604418726,
"grad_norm": 1.9589232206344604,
"learning_rate": 4.921195094169496e-06,
"loss": 0.9755,
"step": 1006
},
{
"epoch": 0.5297211993687533,
"grad_norm": 1.9304051399230957,
"learning_rate": 4.92102195818812e-06,
"loss": 1.011,
"step": 1007
},
{
"epoch": 0.5302472382956339,
"grad_norm": 2.306674003601074,
"learning_rate": 4.920848635276096e-06,
"loss": 1.0626,
"step": 1008
},
{
"epoch": 0.5307732772225144,
"grad_norm": 2.156906843185425,
"learning_rate": 4.920675125446809e-06,
"loss": 1.0107,
"step": 1009
},
{
"epoch": 0.5312993161493951,
"grad_norm": 2.2959272861480713,
"learning_rate": 4.9205014287136535e-06,
"loss": 1.0527,
"step": 1010
},
{
"epoch": 0.5318253550762756,
"grad_norm": 2.046900510787964,
"learning_rate": 4.9203275450900426e-06,
"loss": 1.0154,
"step": 1011
},
{
"epoch": 0.5323513940031562,
"grad_norm": 1.9947476387023926,
"learning_rate": 4.920153474589401e-06,
"loss": 1.0456,
"step": 1012
},
{
"epoch": 0.5328774329300369,
"grad_norm": 2.3516438007354736,
"learning_rate": 4.919979217225169e-06,
"loss": 1.0982,
"step": 1013
},
{
"epoch": 0.5334034718569174,
"grad_norm": 2.5909998416900635,
"learning_rate": 4.919804773010802e-06,
"loss": 1.0436,
"step": 1014
},
{
"epoch": 0.533929510783798,
"grad_norm": 2.5206117630004883,
"learning_rate": 4.91963014195977e-06,
"loss": 1.0405,
"step": 1015
},
{
"epoch": 0.5344555497106785,
"grad_norm": 2.21992826461792,
"learning_rate": 4.919455324085554e-06,
"loss": 1.0594,
"step": 1016
},
{
"epoch": 0.5349815886375592,
"grad_norm": 2.2773711681365967,
"learning_rate": 4.919280319401654e-06,
"loss": 1.0501,
"step": 1017
},
{
"epoch": 0.5355076275644398,
"grad_norm": 2.075962543487549,
"learning_rate": 4.919105127921582e-06,
"loss": 1.0052,
"step": 1018
},
{
"epoch": 0.5360336664913203,
"grad_norm": 2.108670473098755,
"learning_rate": 4.9189297496588636e-06,
"loss": 1.0675,
"step": 1019
},
{
"epoch": 0.536559705418201,
"grad_norm": 2.125927209854126,
"learning_rate": 4.918754184627041e-06,
"loss": 1.0912,
"step": 1020
},
{
"epoch": 0.5370857443450815,
"grad_norm": 2.1099467277526855,
"learning_rate": 4.91857843283967e-06,
"loss": 1.0424,
"step": 1021
},
{
"epoch": 0.5376117832719621,
"grad_norm": 2.0880467891693115,
"learning_rate": 4.918402494310319e-06,
"loss": 1.061,
"step": 1022
},
{
"epoch": 0.5381378221988428,
"grad_norm": 2.1544101238250732,
"learning_rate": 4.918226369052575e-06,
"loss": 1.0608,
"step": 1023
},
{
"epoch": 0.5386638611257233,
"grad_norm": 2.213214635848999,
"learning_rate": 4.918050057080036e-06,
"loss": 1.1368,
"step": 1024
},
{
"epoch": 0.5391899000526039,
"grad_norm": 2.062903642654419,
"learning_rate": 4.917873558406315e-06,
"loss": 1.0861,
"step": 1025
},
{
"epoch": 0.5397159389794844,
"grad_norm": 1.9643436670303345,
"learning_rate": 4.917696873045039e-06,
"loss": 1.0008,
"step": 1026
},
{
"epoch": 0.5402419779063651,
"grad_norm": 2.276639699935913,
"learning_rate": 4.917520001009851e-06,
"loss": 0.9812,
"step": 1027
},
{
"epoch": 0.5407680168332457,
"grad_norm": 2.1487631797790527,
"learning_rate": 4.917342942314407e-06,
"loss": 1.0603,
"step": 1028
},
{
"epoch": 0.5412940557601262,
"grad_norm": 2.1040542125701904,
"learning_rate": 4.917165696972379e-06,
"loss": 1.0425,
"step": 1029
},
{
"epoch": 0.5418200946870069,
"grad_norm": 2.214475154876709,
"learning_rate": 4.916988264997452e-06,
"loss": 1.032,
"step": 1030
},
{
"epoch": 0.5423461336138874,
"grad_norm": 2.154320001602173,
"learning_rate": 4.916810646403325e-06,
"loss": 1.0371,
"step": 1031
},
{
"epoch": 0.542872172540768,
"grad_norm": 2.1565327644348145,
"learning_rate": 4.916632841203714e-06,
"loss": 1.0866,
"step": 1032
},
{
"epoch": 0.5433982114676486,
"grad_norm": 2.197402238845825,
"learning_rate": 4.916454849412344e-06,
"loss": 1.0531,
"step": 1033
},
{
"epoch": 0.5439242503945292,
"grad_norm": 2.0249993801116943,
"learning_rate": 4.916276671042962e-06,
"loss": 1.0485,
"step": 1034
},
{
"epoch": 0.5444502893214098,
"grad_norm": 2.077765703201294,
"learning_rate": 4.916098306109323e-06,
"loss": 1.0731,
"step": 1035
},
{
"epoch": 0.5449763282482903,
"grad_norm": 2.0669186115264893,
"learning_rate": 4.915919754625199e-06,
"loss": 1.0912,
"step": 1036
},
{
"epoch": 0.545502367175171,
"grad_norm": 2.160076379776001,
"learning_rate": 4.915741016604378e-06,
"loss": 1.0523,
"step": 1037
},
{
"epoch": 0.5460284061020515,
"grad_norm": 1.8992373943328857,
"learning_rate": 4.915562092060659e-06,
"loss": 1.0185,
"step": 1038
},
{
"epoch": 0.5465544450289321,
"grad_norm": 2.0712900161743164,
"learning_rate": 4.915382981007857e-06,
"loss": 1.0581,
"step": 1039
},
{
"epoch": 0.5470804839558128,
"grad_norm": 2.2600317001342773,
"learning_rate": 4.915203683459802e-06,
"loss": 1.0154,
"step": 1040
},
{
"epoch": 0.5476065228826933,
"grad_norm": 2.050366163253784,
"learning_rate": 4.915024199430338e-06,
"loss": 1.0371,
"step": 1041
},
{
"epoch": 0.5481325618095739,
"grad_norm": 2.208393096923828,
"learning_rate": 4.914844528933322e-06,
"loss": 1.0767,
"step": 1042
},
{
"epoch": 0.5486586007364544,
"grad_norm": 2.1388466358184814,
"learning_rate": 4.914664671982629e-06,
"loss": 1.1074,
"step": 1043
},
{
"epoch": 0.5491846396633351,
"grad_norm": 2.253007411956787,
"learning_rate": 4.914484628592144e-06,
"loss": 1.0455,
"step": 1044
},
{
"epoch": 0.5497106785902157,
"grad_norm": 2.2380669116973877,
"learning_rate": 4.9143043987757684e-06,
"loss": 1.0581,
"step": 1045
},
{
"epoch": 0.5502367175170962,
"grad_norm": 2.136256456375122,
"learning_rate": 4.914123982547419e-06,
"loss": 1.0588,
"step": 1046
},
{
"epoch": 0.5507627564439769,
"grad_norm": 2.0044257640838623,
"learning_rate": 4.913943379921025e-06,
"loss": 0.9918,
"step": 1047
},
{
"epoch": 0.5512887953708574,
"grad_norm": 2.089315414428711,
"learning_rate": 4.913762590910533e-06,
"loss": 1.0675,
"step": 1048
},
{
"epoch": 0.551814834297738,
"grad_norm": 2.048976182937622,
"learning_rate": 4.9135816155298985e-06,
"loss": 1.0259,
"step": 1049
},
{
"epoch": 0.5523408732246187,
"grad_norm": 2.273501396179199,
"learning_rate": 4.913400453793098e-06,
"loss": 1.0743,
"step": 1050
},
{
"epoch": 0.5528669121514992,
"grad_norm": 2.0761802196502686,
"learning_rate": 4.913219105714117e-06,
"loss": 1.0199,
"step": 1051
},
{
"epoch": 0.5533929510783798,
"grad_norm": 1.9552183151245117,
"learning_rate": 4.913037571306961e-06,
"loss": 1.0582,
"step": 1052
},
{
"epoch": 0.5539189900052603,
"grad_norm": 2.0593061447143555,
"learning_rate": 4.9128558505856425e-06,
"loss": 0.9626,
"step": 1053
},
{
"epoch": 0.554445028932141,
"grad_norm": 2.026820659637451,
"learning_rate": 4.9126739435641955e-06,
"loss": 1.0253,
"step": 1054
},
{
"epoch": 0.5549710678590216,
"grad_norm": 2.22835111618042,
"learning_rate": 4.9124918502566635e-06,
"loss": 1.0176,
"step": 1055
},
{
"epoch": 0.5554971067859021,
"grad_norm": 1.9653559923171997,
"learning_rate": 4.9123095706771064e-06,
"loss": 0.9886,
"step": 1056
},
{
"epoch": 0.5560231457127828,
"grad_norm": 2.083310842514038,
"learning_rate": 4.912127104839599e-06,
"loss": 1.0105,
"step": 1057
},
{
"epoch": 0.5565491846396633,
"grad_norm": 2.1681482791900635,
"learning_rate": 4.91194445275823e-06,
"loss": 1.0359,
"step": 1058
},
{
"epoch": 0.5570752235665439,
"grad_norm": 1.990717887878418,
"learning_rate": 4.911761614447101e-06,
"loss": 1.0369,
"step": 1059
},
{
"epoch": 0.5576012624934246,
"grad_norm": 2.159813642501831,
"learning_rate": 4.91157858992033e-06,
"loss": 1.0341,
"step": 1060
},
{
"epoch": 0.5581273014203051,
"grad_norm": 1.9474655389785767,
"learning_rate": 4.911395379192048e-06,
"loss": 1.0432,
"step": 1061
},
{
"epoch": 0.5586533403471857,
"grad_norm": 2.140634536743164,
"learning_rate": 4.911211982276402e-06,
"loss": 1.0485,
"step": 1062
},
{
"epoch": 0.5591793792740662,
"grad_norm": 2.2925636768341064,
"learning_rate": 4.911028399187552e-06,
"loss": 1.0648,
"step": 1063
},
{
"epoch": 0.5597054182009469,
"grad_norm": 2.037755250930786,
"learning_rate": 4.910844629939672e-06,
"loss": 1.0568,
"step": 1064
},
{
"epoch": 0.5602314571278275,
"grad_norm": 1.997471809387207,
"learning_rate": 4.910660674546951e-06,
"loss": 1.0109,
"step": 1065
},
{
"epoch": 0.560757496054708,
"grad_norm": 2.109219551086426,
"learning_rate": 4.910476533023593e-06,
"loss": 1.0658,
"step": 1066
},
{
"epoch": 0.5612835349815887,
"grad_norm": 2.10469388961792,
"learning_rate": 4.9102922053838175e-06,
"loss": 1.0612,
"step": 1067
},
{
"epoch": 0.5618095739084692,
"grad_norm": 2.2748658657073975,
"learning_rate": 4.9101076916418535e-06,
"loss": 1.0422,
"step": 1068
},
{
"epoch": 0.5623356128353498,
"grad_norm": 2.0472326278686523,
"learning_rate": 4.90992299181195e-06,
"loss": 1.0339,
"step": 1069
},
{
"epoch": 0.5628616517622304,
"grad_norm": 2.0694494247436523,
"learning_rate": 4.909738105908367e-06,
"loss": 1.0083,
"step": 1070
},
{
"epoch": 0.563387690689111,
"grad_norm": 2.0032498836517334,
"learning_rate": 4.909553033945379e-06,
"loss": 0.9889,
"step": 1071
},
{
"epoch": 0.5639137296159916,
"grad_norm": 2.091393232345581,
"learning_rate": 4.909367775937278e-06,
"loss": 1.0856,
"step": 1072
},
{
"epoch": 0.5644397685428721,
"grad_norm": 2.0542173385620117,
"learning_rate": 4.909182331898366e-06,
"loss": 1.0422,
"step": 1073
},
{
"epoch": 0.5649658074697528,
"grad_norm": 2.009228467941284,
"learning_rate": 4.908996701842962e-06,
"loss": 1.0594,
"step": 1074
},
{
"epoch": 0.5654918463966333,
"grad_norm": 1.9546911716461182,
"learning_rate": 4.9088108857853985e-06,
"loss": 1.0691,
"step": 1075
},
{
"epoch": 0.5660178853235139,
"grad_norm": 2.1657440662384033,
"learning_rate": 4.908624883740023e-06,
"loss": 1.0252,
"step": 1076
},
{
"epoch": 0.5665439242503946,
"grad_norm": 2.151035785675049,
"learning_rate": 4.9084386957211975e-06,
"loss": 1.0587,
"step": 1077
},
{
"epoch": 0.5670699631772751,
"grad_norm": 2.299673557281494,
"learning_rate": 4.908252321743296e-06,
"loss": 1.0221,
"step": 1078
},
{
"epoch": 0.5675960021041557,
"grad_norm": 2.0144848823547363,
"learning_rate": 4.908065761820711e-06,
"loss": 1.0256,
"step": 1079
},
{
"epoch": 0.5681220410310363,
"grad_norm": 2.172971725463867,
"learning_rate": 4.907879015967846e-06,
"loss": 1.0231,
"step": 1080
},
{
"epoch": 0.5686480799579169,
"grad_norm": 2.0427041053771973,
"learning_rate": 4.907692084199119e-06,
"loss": 1.0433,
"step": 1081
},
{
"epoch": 0.5691741188847975,
"grad_norm": 2.1561834812164307,
"learning_rate": 4.907504966528966e-06,
"loss": 1.0478,
"step": 1082
},
{
"epoch": 0.569700157811678,
"grad_norm": 2.012385606765747,
"learning_rate": 4.907317662971831e-06,
"loss": 1.0703,
"step": 1083
},
{
"epoch": 0.5702261967385587,
"grad_norm": 2.137075424194336,
"learning_rate": 4.907130173542179e-06,
"loss": 1.0527,
"step": 1084
},
{
"epoch": 0.5707522356654392,
"grad_norm": 2.039424180984497,
"learning_rate": 4.906942498254485e-06,
"loss": 0.9969,
"step": 1085
},
{
"epoch": 0.5712782745923198,
"grad_norm": 2.0207748413085938,
"learning_rate": 4.90675463712324e-06,
"loss": 1.0157,
"step": 1086
},
{
"epoch": 0.5718043135192005,
"grad_norm": 2.024454116821289,
"learning_rate": 4.906566590162949e-06,
"loss": 1.0699,
"step": 1087
},
{
"epoch": 0.572330352446081,
"grad_norm": 2.256537675857544,
"learning_rate": 4.90637835738813e-06,
"loss": 1.1083,
"step": 1088
},
{
"epoch": 0.5728563913729616,
"grad_norm": 2.099698543548584,
"learning_rate": 4.90618993881332e-06,
"loss": 1.0242,
"step": 1089
},
{
"epoch": 0.5733824302998421,
"grad_norm": 2.0367214679718018,
"learning_rate": 4.906001334453064e-06,
"loss": 1.0088,
"step": 1090
},
{
"epoch": 0.5739084692267228,
"grad_norm": 1.9988690614700317,
"learning_rate": 4.9058125443219245e-06,
"loss": 1.044,
"step": 1091
},
{
"epoch": 0.5744345081536034,
"grad_norm": 1.9970273971557617,
"learning_rate": 4.9056235684344805e-06,
"loss": 1.0847,
"step": 1092
},
{
"epoch": 0.5749605470804839,
"grad_norm": 2.152602434158325,
"learning_rate": 4.905434406805322e-06,
"loss": 1.0931,
"step": 1093
},
{
"epoch": 0.5754865860073646,
"grad_norm": 2.0728707313537598,
"learning_rate": 4.905245059449053e-06,
"loss": 1.0401,
"step": 1094
},
{
"epoch": 0.5760126249342451,
"grad_norm": 1.94095778465271,
"learning_rate": 4.9050555263802954e-06,
"loss": 1.0262,
"step": 1095
},
{
"epoch": 0.5765386638611257,
"grad_norm": 2.126347780227661,
"learning_rate": 4.904865807613683e-06,
"loss": 1.0678,
"step": 1096
},
{
"epoch": 0.5770647027880064,
"grad_norm": 2.085378646850586,
"learning_rate": 4.904675903163864e-06,
"loss": 1.0665,
"step": 1097
},
{
"epoch": 0.5775907417148869,
"grad_norm": 2.2276804447174072,
"learning_rate": 4.9044858130454995e-06,
"loss": 1.0718,
"step": 1098
},
{
"epoch": 0.5781167806417675,
"grad_norm": 2.2318899631500244,
"learning_rate": 4.904295537273269e-06,
"loss": 1.0663,
"step": 1099
},
{
"epoch": 0.578642819568648,
"grad_norm": 2.0555522441864014,
"learning_rate": 4.904105075861864e-06,
"loss": 0.9989,
"step": 1100
},
{
"epoch": 0.5791688584955287,
"grad_norm": 2.094501256942749,
"learning_rate": 4.9039144288259876e-06,
"loss": 1.0802,
"step": 1101
},
{
"epoch": 0.5796948974224092,
"grad_norm": 2.7403769493103027,
"learning_rate": 4.903723596180363e-06,
"loss": 1.0024,
"step": 1102
},
{
"epoch": 0.5802209363492898,
"grad_norm": 2.1775436401367188,
"learning_rate": 4.9035325779397225e-06,
"loss": 1.0234,
"step": 1103
},
{
"epoch": 0.5807469752761705,
"grad_norm": 2.2489676475524902,
"learning_rate": 4.903341374118816e-06,
"loss": 1.0188,
"step": 1104
},
{
"epoch": 0.581273014203051,
"grad_norm": 2.2214367389678955,
"learning_rate": 4.903149984732407e-06,
"loss": 1.0835,
"step": 1105
},
{
"epoch": 0.5817990531299316,
"grad_norm": 2.203273296356201,
"learning_rate": 4.902958409795272e-06,
"loss": 1.0547,
"step": 1106
},
{
"epoch": 0.5823250920568122,
"grad_norm": 2.1076622009277344,
"learning_rate": 4.902766649322204e-06,
"loss": 1.0571,
"step": 1107
},
{
"epoch": 0.5828511309836928,
"grad_norm": 2.1270394325256348,
"learning_rate": 4.902574703328007e-06,
"loss": 0.9863,
"step": 1108
},
{
"epoch": 0.5833771699105734,
"grad_norm": 2.1030006408691406,
"learning_rate": 4.902382571827503e-06,
"loss": 1.0404,
"step": 1109
},
{
"epoch": 0.583903208837454,
"grad_norm": 2.1046831607818604,
"learning_rate": 4.9021902548355275e-06,
"loss": 1.018,
"step": 1110
},
{
"epoch": 0.5844292477643346,
"grad_norm": 2.0193376541137695,
"learning_rate": 4.901997752366927e-06,
"loss": 1.0035,
"step": 1111
},
{
"epoch": 0.5849552866912151,
"grad_norm": 2.0812923908233643,
"learning_rate": 4.9018050644365675e-06,
"loss": 0.9928,
"step": 1112
},
{
"epoch": 0.5854813256180957,
"grad_norm": 2.035750150680542,
"learning_rate": 4.901612191059325e-06,
"loss": 1.0658,
"step": 1113
},
{
"epoch": 0.5860073645449764,
"grad_norm": 2.093606948852539,
"learning_rate": 4.901419132250093e-06,
"loss": 1.0019,
"step": 1114
},
{
"epoch": 0.5865334034718569,
"grad_norm": 2.4018402099609375,
"learning_rate": 4.901225888023776e-06,
"loss": 1.0785,
"step": 1115
},
{
"epoch": 0.5870594423987375,
"grad_norm": 2.1731529235839844,
"learning_rate": 4.901032458395296e-06,
"loss": 1.0437,
"step": 1116
},
{
"epoch": 0.587585481325618,
"grad_norm": 2.085692882537842,
"learning_rate": 4.900838843379588e-06,
"loss": 1.0122,
"step": 1117
},
{
"epoch": 0.5881115202524987,
"grad_norm": 2.272787094116211,
"learning_rate": 4.900645042991601e-06,
"loss": 1.0708,
"step": 1118
},
{
"epoch": 0.5886375591793793,
"grad_norm": 2.197758913040161,
"learning_rate": 4.900451057246298e-06,
"loss": 1.037,
"step": 1119
},
{
"epoch": 0.5891635981062598,
"grad_norm": 2.228980779647827,
"learning_rate": 4.900256886158658e-06,
"loss": 1.0306,
"step": 1120
},
{
"epoch": 0.5896896370331405,
"grad_norm": 2.010698080062866,
"learning_rate": 4.900062529743672e-06,
"loss": 1.0777,
"step": 1121
},
{
"epoch": 0.590215675960021,
"grad_norm": 2.0015103816986084,
"learning_rate": 4.899867988016348e-06,
"loss": 0.9991,
"step": 1122
},
{
"epoch": 0.5907417148869016,
"grad_norm": 1.9307256937026978,
"learning_rate": 4.899673260991706e-06,
"loss": 1.0655,
"step": 1123
},
{
"epoch": 0.5912677538137823,
"grad_norm": 2.339930295944214,
"learning_rate": 4.899478348684782e-06,
"loss": 1.0177,
"step": 1124
},
{
"epoch": 0.5917937927406628,
"grad_norm": 2.000337839126587,
"learning_rate": 4.899283251110624e-06,
"loss": 1.036,
"step": 1125
},
{
"epoch": 0.5923198316675434,
"grad_norm": 2.0116374492645264,
"learning_rate": 4.899087968284297e-06,
"loss": 0.9666,
"step": 1126
},
{
"epoch": 0.592845870594424,
"grad_norm": 2.27270245552063,
"learning_rate": 4.898892500220878e-06,
"loss": 1.0526,
"step": 1127
},
{
"epoch": 0.5933719095213046,
"grad_norm": 2.1844749450683594,
"learning_rate": 4.89869684693546e-06,
"loss": 1.0606,
"step": 1128
},
{
"epoch": 0.5938979484481852,
"grad_norm": 2.112031936645508,
"learning_rate": 4.898501008443151e-06,
"loss": 1.0846,
"step": 1129
},
{
"epoch": 0.5944239873750657,
"grad_norm": 2.251878499984741,
"learning_rate": 4.898304984759069e-06,
"loss": 1.023,
"step": 1130
},
{
"epoch": 0.5949500263019464,
"grad_norm": 2.064732074737549,
"learning_rate": 4.898108775898351e-06,
"loss": 1.066,
"step": 1131
},
{
"epoch": 0.5954760652288269,
"grad_norm": 2.10412335395813,
"learning_rate": 4.897912381876147e-06,
"loss": 1.0476,
"step": 1132
},
{
"epoch": 0.5960021041557075,
"grad_norm": 2.1343259811401367,
"learning_rate": 4.897715802707621e-06,
"loss": 1.0264,
"step": 1133
},
{
"epoch": 0.5965281430825881,
"grad_norm": 2.3453173637390137,
"learning_rate": 4.89751903840795e-06,
"loss": 1.076,
"step": 1134
},
{
"epoch": 0.5970541820094687,
"grad_norm": 2.040123462677002,
"learning_rate": 4.897322088992326e-06,
"loss": 1.0494,
"step": 1135
},
{
"epoch": 0.5975802209363493,
"grad_norm": 2.070585012435913,
"learning_rate": 4.897124954475958e-06,
"loss": 1.0904,
"step": 1136
},
{
"epoch": 0.5981062598632298,
"grad_norm": 2.048081159591675,
"learning_rate": 4.896927634874065e-06,
"loss": 0.9855,
"step": 1137
},
{
"epoch": 0.5986322987901105,
"grad_norm": 2.07633113861084,
"learning_rate": 4.896730130201883e-06,
"loss": 1.0848,
"step": 1138
},
{
"epoch": 0.599158337716991,
"grad_norm": 2.233821153640747,
"learning_rate": 4.8965324404746624e-06,
"loss": 1.0419,
"step": 1139
},
{
"epoch": 0.5996843766438716,
"grad_norm": 2.1806929111480713,
"learning_rate": 4.896334565707666e-06,
"loss": 1.0377,
"step": 1140
},
{
"epoch": 0.6002104155707523,
"grad_norm": 2.056483268737793,
"learning_rate": 4.896136505916174e-06,
"loss": 1.0269,
"step": 1141
},
{
"epoch": 0.6007364544976328,
"grad_norm": 1.9446007013320923,
"learning_rate": 4.895938261115476e-06,
"loss": 0.9958,
"step": 1142
},
{
"epoch": 0.6012624934245134,
"grad_norm": 1.9170737266540527,
"learning_rate": 4.8957398313208795e-06,
"loss": 1.0083,
"step": 1143
},
{
"epoch": 0.601788532351394,
"grad_norm": 2.0455801486968994,
"learning_rate": 4.895541216547707e-06,
"loss": 1.0819,
"step": 1144
},
{
"epoch": 0.6023145712782746,
"grad_norm": 2.410231828689575,
"learning_rate": 4.8953424168112925e-06,
"loss": 1.0265,
"step": 1145
},
{
"epoch": 0.6028406102051552,
"grad_norm": 2.0946412086486816,
"learning_rate": 4.895143432126986e-06,
"loss": 1.014,
"step": 1146
},
{
"epoch": 0.6033666491320357,
"grad_norm": 1.9825836420059204,
"learning_rate": 4.894944262510152e-06,
"loss": 0.9721,
"step": 1147
},
{
"epoch": 0.6038926880589164,
"grad_norm": 2.1228606700897217,
"learning_rate": 4.8947449079761685e-06,
"loss": 1.0971,
"step": 1148
},
{
"epoch": 0.6044187269857969,
"grad_norm": 2.1443943977355957,
"learning_rate": 4.894545368540427e-06,
"loss": 0.9956,
"step": 1149
},
{
"epoch": 0.6049447659126775,
"grad_norm": 1.9651165008544922,
"learning_rate": 4.894345644218335e-06,
"loss": 1.0103,
"step": 1150
},
{
"epoch": 0.6054708048395582,
"grad_norm": 1.9829816818237305,
"learning_rate": 4.8941457350253134e-06,
"loss": 1.0425,
"step": 1151
},
{
"epoch": 0.6059968437664387,
"grad_norm": 2.122873067855835,
"learning_rate": 4.893945640976798e-06,
"loss": 1.0532,
"step": 1152
},
{
"epoch": 0.6065228826933193,
"grad_norm": 2.0714738368988037,
"learning_rate": 4.8937453620882365e-06,
"loss": 1.0307,
"step": 1153
},
{
"epoch": 0.6070489216201999,
"grad_norm": 1.9049363136291504,
"learning_rate": 4.893544898375096e-06,
"loss": 0.9805,
"step": 1154
},
{
"epoch": 0.6075749605470805,
"grad_norm": 2.432041645050049,
"learning_rate": 4.893344249852851e-06,
"loss": 1.0833,
"step": 1155
},
{
"epoch": 0.6081009994739611,
"grad_norm": 2.055748224258423,
"learning_rate": 4.893143416536997e-06,
"loss": 1.0315,
"step": 1156
},
{
"epoch": 0.6086270384008416,
"grad_norm": 1.9813153743743896,
"learning_rate": 4.892942398443037e-06,
"loss": 1.0786,
"step": 1157
},
{
"epoch": 0.6091530773277223,
"grad_norm": 2.2038941383361816,
"learning_rate": 4.892741195586496e-06,
"loss": 1.0604,
"step": 1158
},
{
"epoch": 0.6096791162546028,
"grad_norm": 2.0015673637390137,
"learning_rate": 4.892539807982906e-06,
"loss": 0.9863,
"step": 1159
},
{
"epoch": 0.6102051551814834,
"grad_norm": 2.0392401218414307,
"learning_rate": 4.892338235647818e-06,
"loss": 1.0218,
"step": 1160
},
{
"epoch": 0.6107311941083641,
"grad_norm": 2.0060133934020996,
"learning_rate": 4.892136478596796e-06,
"loss": 1.0134,
"step": 1161
},
{
"epoch": 0.6112572330352446,
"grad_norm": 1.9645148515701294,
"learning_rate": 4.8919345368454164e-06,
"loss": 1.0206,
"step": 1162
},
{
"epoch": 0.6117832719621252,
"grad_norm": 1.9299581050872803,
"learning_rate": 4.8917324104092725e-06,
"loss": 1.0243,
"step": 1163
},
{
"epoch": 0.6123093108890058,
"grad_norm": 2.071143388748169,
"learning_rate": 4.891530099303971e-06,
"loss": 1.0466,
"step": 1164
},
{
"epoch": 0.6128353498158864,
"grad_norm": 2.122020959854126,
"learning_rate": 4.891327603545132e-06,
"loss": 1.0886,
"step": 1165
},
{
"epoch": 0.6133613887427669,
"grad_norm": 2.0861775875091553,
"learning_rate": 4.891124923148391e-06,
"loss": 1.0481,
"step": 1166
},
{
"epoch": 0.6138874276696475,
"grad_norm": 2.053553581237793,
"learning_rate": 4.890922058129396e-06,
"loss": 1.0332,
"step": 1167
},
{
"epoch": 0.6144134665965282,
"grad_norm": 2.0698556900024414,
"learning_rate": 4.890719008503813e-06,
"loss": 0.9913,
"step": 1168
},
{
"epoch": 0.6149395055234087,
"grad_norm": 2.0626866817474365,
"learning_rate": 4.890515774287317e-06,
"loss": 1.0383,
"step": 1169
},
{
"epoch": 0.6154655444502893,
"grad_norm": 2.001122236251831,
"learning_rate": 4.890312355495602e-06,
"loss": 0.997,
"step": 1170
},
{
"epoch": 0.6159915833771699,
"grad_norm": 2.141261577606201,
"learning_rate": 4.890108752144373e-06,
"loss": 1.0139,
"step": 1171
},
{
"epoch": 0.6165176223040505,
"grad_norm": 2.0430335998535156,
"learning_rate": 4.8899049642493514e-06,
"loss": 1.0177,
"step": 1172
},
{
"epoch": 0.6170436612309311,
"grad_norm": 2.0376110076904297,
"learning_rate": 4.889700991826271e-06,
"loss": 1.0306,
"step": 1173
},
{
"epoch": 0.6175697001578117,
"grad_norm": 2.0546419620513916,
"learning_rate": 4.889496834890882e-06,
"loss": 1.0379,
"step": 1174
},
{
"epoch": 0.6180957390846923,
"grad_norm": 2.004117012023926,
"learning_rate": 4.889292493458947e-06,
"loss": 1.1014,
"step": 1175
},
{
"epoch": 0.6186217780115728,
"grad_norm": 2.1904101371765137,
"learning_rate": 4.889087967546243e-06,
"loss": 1.0252,
"step": 1176
},
{
"epoch": 0.6191478169384534,
"grad_norm": 2.2026965618133545,
"learning_rate": 4.8888832571685626e-06,
"loss": 1.0309,
"step": 1177
},
{
"epoch": 0.6196738558653341,
"grad_norm": 1.9925811290740967,
"learning_rate": 4.888678362341711e-06,
"loss": 1.0157,
"step": 1178
},
{
"epoch": 0.6201998947922146,
"grad_norm": 2.4098422527313232,
"learning_rate": 4.88847328308151e-06,
"loss": 0.9825,
"step": 1179
},
{
"epoch": 0.6207259337190952,
"grad_norm": 1.9352220296859741,
"learning_rate": 4.888268019403792e-06,
"loss": 1.0235,
"step": 1180
},
{
"epoch": 0.6212519726459758,
"grad_norm": 1.9798966646194458,
"learning_rate": 4.888062571324407e-06,
"loss": 1.0124,
"step": 1181
},
{
"epoch": 0.6217780115728564,
"grad_norm": 1.9737377166748047,
"learning_rate": 4.887856938859218e-06,
"loss": 1.005,
"step": 1182
},
{
"epoch": 0.622304050499737,
"grad_norm": 2.2528250217437744,
"learning_rate": 4.887651122024102e-06,
"loss": 1.0207,
"step": 1183
},
{
"epoch": 0.6228300894266176,
"grad_norm": 2.01436185836792,
"learning_rate": 4.887445120834949e-06,
"loss": 1.0368,
"step": 1184
},
{
"epoch": 0.6233561283534982,
"grad_norm": 2.0212924480438232,
"learning_rate": 4.887238935307667e-06,
"loss": 1.0136,
"step": 1185
},
{
"epoch": 0.6238821672803787,
"grad_norm": 2.080514669418335,
"learning_rate": 4.887032565458174e-06,
"loss": 1.0012,
"step": 1186
},
{
"epoch": 0.6244082062072593,
"grad_norm": 2.220168113708496,
"learning_rate": 4.886826011302406e-06,
"loss": 1.0055,
"step": 1187
},
{
"epoch": 0.62493424513414,
"grad_norm": 2.042325258255005,
"learning_rate": 4.886619272856309e-06,
"loss": 1.0793,
"step": 1188
},
{
"epoch": 0.6254602840610205,
"grad_norm": 2.0139427185058594,
"learning_rate": 4.886412350135848e-06,
"loss": 1.0853,
"step": 1189
},
{
"epoch": 0.6259863229879011,
"grad_norm": 2.072531223297119,
"learning_rate": 4.886205243156998e-06,
"loss": 1.0611,
"step": 1190
},
{
"epoch": 0.6265123619147817,
"grad_norm": 2.1070992946624756,
"learning_rate": 4.8859979519357505e-06,
"loss": 1.0171,
"step": 1191
},
{
"epoch": 0.6270384008416623,
"grad_norm": 1.9750585556030273,
"learning_rate": 4.885790476488111e-06,
"loss": 1.01,
"step": 1192
},
{
"epoch": 0.6275644397685429,
"grad_norm": 1.9221036434173584,
"learning_rate": 4.885582816830099e-06,
"loss": 1.0173,
"step": 1193
},
{
"epoch": 0.6280904786954234,
"grad_norm": 2.0700929164886475,
"learning_rate": 4.885374972977748e-06,
"loss": 1.0469,
"step": 1194
},
{
"epoch": 0.6286165176223041,
"grad_norm": 2.1358914375305176,
"learning_rate": 4.885166944947106e-06,
"loss": 1.0144,
"step": 1195
},
{
"epoch": 0.6291425565491846,
"grad_norm": 2.0657570362091064,
"learning_rate": 4.884958732754236e-06,
"loss": 1.0278,
"step": 1196
},
{
"epoch": 0.6296685954760652,
"grad_norm": 2.050619125366211,
"learning_rate": 4.884750336415213e-06,
"loss": 1.0401,
"step": 1197
},
{
"epoch": 0.6301946344029458,
"grad_norm": 2.029069423675537,
"learning_rate": 4.884541755946127e-06,
"loss": 1.0265,
"step": 1198
},
{
"epoch": 0.6307206733298264,
"grad_norm": 2.2242050170898438,
"learning_rate": 4.884332991363086e-06,
"loss": 1.043,
"step": 1199
},
{
"epoch": 0.631246712256707,
"grad_norm": 1.9235576391220093,
"learning_rate": 4.8841240426822056e-06,
"loss": 1.0323,
"step": 1200
},
{
"epoch": 0.6317727511835876,
"grad_norm": 2.0110039710998535,
"learning_rate": 4.88391490991962e-06,
"loss": 0.9861,
"step": 1201
},
{
"epoch": 0.6322987901104682,
"grad_norm": 1.9583542346954346,
"learning_rate": 4.883705593091478e-06,
"loss": 1.0907,
"step": 1202
},
{
"epoch": 0.6328248290373487,
"grad_norm": 2.046147346496582,
"learning_rate": 4.88349609221394e-06,
"loss": 1.0264,
"step": 1203
},
{
"epoch": 0.6333508679642293,
"grad_norm": 2.072329521179199,
"learning_rate": 4.8832864073031826e-06,
"loss": 1.0273,
"step": 1204
},
{
"epoch": 0.63387690689111,
"grad_norm": 2.163562774658203,
"learning_rate": 4.883076538375395e-06,
"loss": 0.9729,
"step": 1205
},
{
"epoch": 0.6344029458179905,
"grad_norm": 2.018745183944702,
"learning_rate": 4.8828664854467825e-06,
"loss": 1.0349,
"step": 1206
},
{
"epoch": 0.6349289847448711,
"grad_norm": 1.9641830921173096,
"learning_rate": 4.882656248533562e-06,
"loss": 1.0254,
"step": 1207
},
{
"epoch": 0.6354550236717517,
"grad_norm": 2.189903736114502,
"learning_rate": 4.8824458276519676e-06,
"loss": 1.0347,
"step": 1208
},
{
"epoch": 0.6359810625986323,
"grad_norm": 1.9000815153121948,
"learning_rate": 4.882235222818245e-06,
"loss": 1.0068,
"step": 1209
},
{
"epoch": 0.6365071015255129,
"grad_norm": 2.008253335952759,
"learning_rate": 4.882024434048658e-06,
"loss": 0.9951,
"step": 1210
},
{
"epoch": 0.6370331404523935,
"grad_norm": 2.254880905151367,
"learning_rate": 4.881813461359479e-06,
"loss": 1.0254,
"step": 1211
},
{
"epoch": 0.6375591793792741,
"grad_norm": 2.079281806945801,
"learning_rate": 4.881602304766999e-06,
"loss": 1.0138,
"step": 1212
},
{
"epoch": 0.6380852183061546,
"grad_norm": 1.9515445232391357,
"learning_rate": 4.881390964287521e-06,
"loss": 0.9896,
"step": 1213
},
{
"epoch": 0.6386112572330352,
"grad_norm": 2.118746757507324,
"learning_rate": 4.881179439937363e-06,
"loss": 1.0554,
"step": 1214
},
{
"epoch": 0.6391372961599159,
"grad_norm": 1.9809492826461792,
"learning_rate": 4.8809677317328574e-06,
"loss": 1.0327,
"step": 1215
},
{
"epoch": 0.6396633350867964,
"grad_norm": 2.0196714401245117,
"learning_rate": 4.88075583969035e-06,
"loss": 1.0072,
"step": 1216
},
{
"epoch": 0.640189374013677,
"grad_norm": 2.075596570968628,
"learning_rate": 4.8805437638262024e-06,
"loss": 1.0088,
"step": 1217
},
{
"epoch": 0.6407154129405576,
"grad_norm": 1.919331431388855,
"learning_rate": 4.880331504156788e-06,
"loss": 0.9561,
"step": 1218
},
{
"epoch": 0.6412414518674382,
"grad_norm": 2.1209754943847656,
"learning_rate": 4.8801190606984974e-06,
"loss": 1.0436,
"step": 1219
},
{
"epoch": 0.6417674907943188,
"grad_norm": 2.1692416667938232,
"learning_rate": 4.879906433467731e-06,
"loss": 1.0596,
"step": 1220
},
{
"epoch": 0.6422935297211994,
"grad_norm": 2.127383232116699,
"learning_rate": 4.879693622480908e-06,
"loss": 1.0527,
"step": 1221
},
{
"epoch": 0.64281956864808,
"grad_norm": 2.0686752796173096,
"learning_rate": 4.87948062775446e-06,
"loss": 1.0161,
"step": 1222
},
{
"epoch": 0.6433456075749605,
"grad_norm": 1.9912559986114502,
"learning_rate": 4.879267449304831e-06,
"loss": 1.0246,
"step": 1223
},
{
"epoch": 0.6438716465018411,
"grad_norm": 1.9714523553848267,
"learning_rate": 4.879054087148483e-06,
"loss": 1.0669,
"step": 1224
},
{
"epoch": 0.6443976854287218,
"grad_norm": 2.0122146606445312,
"learning_rate": 4.878840541301888e-06,
"loss": 1.0383,
"step": 1225
},
{
"epoch": 0.6449237243556023,
"grad_norm": 2.191110134124756,
"learning_rate": 4.878626811781536e-06,
"loss": 1.0832,
"step": 1226
},
{
"epoch": 0.6454497632824829,
"grad_norm": 2.018800735473633,
"learning_rate": 4.8784128986039274e-06,
"loss": 1.0588,
"step": 1227
},
{
"epoch": 0.6459758022093635,
"grad_norm": 2.0812923908233643,
"learning_rate": 4.87819880178558e-06,
"loss": 1.0221,
"step": 1228
},
{
"epoch": 0.6465018411362441,
"grad_norm": 2.110596179962158,
"learning_rate": 4.877984521343025e-06,
"loss": 1.0252,
"step": 1229
},
{
"epoch": 0.6470278800631246,
"grad_norm": 2.2176296710968018,
"learning_rate": 4.877770057292806e-06,
"loss": 1.0575,
"step": 1230
},
{
"epoch": 0.6475539189900053,
"grad_norm": 2.0294981002807617,
"learning_rate": 4.8775554096514836e-06,
"loss": 0.9862,
"step": 1231
},
{
"epoch": 0.6480799579168859,
"grad_norm": 2.03635573387146,
"learning_rate": 4.8773405784356285e-06,
"loss": 1.0229,
"step": 1232
},
{
"epoch": 0.6486059968437664,
"grad_norm": 2.2391481399536133,
"learning_rate": 4.877125563661831e-06,
"loss": 1.1258,
"step": 1233
},
{
"epoch": 0.649132035770647,
"grad_norm": 2.1449427604675293,
"learning_rate": 4.876910365346691e-06,
"loss": 1.039,
"step": 1234
},
{
"epoch": 0.6496580746975276,
"grad_norm": 2.075510025024414,
"learning_rate": 4.876694983506826e-06,
"loss": 1.047,
"step": 1235
},
{
"epoch": 0.6501841136244082,
"grad_norm": 1.9154462814331055,
"learning_rate": 4.876479418158862e-06,
"loss": 0.9906,
"step": 1236
},
{
"epoch": 0.6507101525512888,
"grad_norm": 2.2096331119537354,
"learning_rate": 4.876263669319449e-06,
"loss": 1.0843,
"step": 1237
},
{
"epoch": 0.6512361914781694,
"grad_norm": 2.0682895183563232,
"learning_rate": 4.87604773700524e-06,
"loss": 1.0262,
"step": 1238
},
{
"epoch": 0.65176223040505,
"grad_norm": 2.0859344005584717,
"learning_rate": 4.8758316212329106e-06,
"loss": 1.02,
"step": 1239
},
{
"epoch": 0.6522882693319305,
"grad_norm": 2.060521364212036,
"learning_rate": 4.875615322019146e-06,
"loss": 1.0455,
"step": 1240
},
{
"epoch": 0.6528143082588111,
"grad_norm": 2.049457311630249,
"learning_rate": 4.875398839380647e-06,
"loss": 1.0763,
"step": 1241
},
{
"epoch": 0.6533403471856918,
"grad_norm": 2.2475039958953857,
"learning_rate": 4.875182173334129e-06,
"loss": 1.0599,
"step": 1242
},
{
"epoch": 0.6538663861125723,
"grad_norm": 1.9375535249710083,
"learning_rate": 4.874965323896321e-06,
"loss": 0.9758,
"step": 1243
},
{
"epoch": 0.6543924250394529,
"grad_norm": 2.0157570838928223,
"learning_rate": 4.874748291083967e-06,
"loss": 1.0491,
"step": 1244
},
{
"epoch": 0.6549184639663335,
"grad_norm": 2.1339237689971924,
"learning_rate": 4.874531074913823e-06,
"loss": 0.9634,
"step": 1245
},
{
"epoch": 0.6554445028932141,
"grad_norm": 1.946191430091858,
"learning_rate": 4.874313675402662e-06,
"loss": 1.0407,
"step": 1246
},
{
"epoch": 0.6559705418200947,
"grad_norm": 1.9623258113861084,
"learning_rate": 4.874096092567268e-06,
"loss": 1.0662,
"step": 1247
},
{
"epoch": 0.6564965807469753,
"grad_norm": 2.092224359512329,
"learning_rate": 4.873878326424443e-06,
"loss": 1.0802,
"step": 1248
},
{
"epoch": 0.6570226196738559,
"grad_norm": 1.863853931427002,
"learning_rate": 4.873660376990999e-06,
"loss": 1.0789,
"step": 1249
},
{
"epoch": 0.6575486586007364,
"grad_norm": 2.146857976913452,
"learning_rate": 4.8734422442837655e-06,
"loss": 1.0132,
"step": 1250
},
{
"epoch": 0.658074697527617,
"grad_norm": 2.022573232650757,
"learning_rate": 4.8732239283195844e-06,
"loss": 1.0252,
"step": 1251
},
{
"epoch": 0.6586007364544977,
"grad_norm": 2.160632848739624,
"learning_rate": 4.873005429115312e-06,
"loss": 1.0235,
"step": 1252
},
{
"epoch": 0.6591267753813782,
"grad_norm": 2.0909252166748047,
"learning_rate": 4.87278674668782e-06,
"loss": 1.0671,
"step": 1253
},
{
"epoch": 0.6596528143082588,
"grad_norm": 1.9689445495605469,
"learning_rate": 4.872567881053991e-06,
"loss": 1.0323,
"step": 1254
},
{
"epoch": 0.6601788532351394,
"grad_norm": 2.141439914703369,
"learning_rate": 4.872348832230727e-06,
"loss": 1.0019,
"step": 1255
},
{
"epoch": 0.66070489216202,
"grad_norm": 1.9927963018417358,
"learning_rate": 4.872129600234938e-06,
"loss": 1.0262,
"step": 1256
},
{
"epoch": 0.6612309310889006,
"grad_norm": 2.1227667331695557,
"learning_rate": 4.871910185083554e-06,
"loss": 1.0341,
"step": 1257
},
{
"epoch": 0.6617569700157812,
"grad_norm": 2.0554583072662354,
"learning_rate": 4.871690586793514e-06,
"loss": 1.0458,
"step": 1258
},
{
"epoch": 0.6622830089426618,
"grad_norm": 1.9936654567718506,
"learning_rate": 4.871470805381775e-06,
"loss": 1.0125,
"step": 1259
},
{
"epoch": 0.6628090478695423,
"grad_norm": 2.0953080654144287,
"learning_rate": 4.871250840865306e-06,
"loss": 1.0518,
"step": 1260
},
{
"epoch": 0.663335086796423,
"grad_norm": 1.9445053339004517,
"learning_rate": 4.871030693261091e-06,
"loss": 0.9892,
"step": 1261
},
{
"epoch": 0.6638611257233035,
"grad_norm": 2.054898500442505,
"learning_rate": 4.870810362586127e-06,
"loss": 1.0712,
"step": 1262
},
{
"epoch": 0.6643871646501841,
"grad_norm": 2.158090114593506,
"learning_rate": 4.870589848857428e-06,
"loss": 0.9874,
"step": 1263
},
{
"epoch": 0.6649132035770647,
"grad_norm": 2.081550121307373,
"learning_rate": 4.870369152092019e-06,
"loss": 1.0299,
"step": 1264
},
{
"epoch": 0.6654392425039453,
"grad_norm": 1.9839400053024292,
"learning_rate": 4.87014827230694e-06,
"loss": 0.9997,
"step": 1265
},
{
"epoch": 0.6659652814308259,
"grad_norm": 2.0596096515655518,
"learning_rate": 4.869927209519246e-06,
"loss": 1.0655,
"step": 1266
},
{
"epoch": 0.6664913203577064,
"grad_norm": 2.3403422832489014,
"learning_rate": 4.8697059637460055e-06,
"loss": 1.0551,
"step": 1267
},
{
"epoch": 0.667017359284587,
"grad_norm": 2.072814702987671,
"learning_rate": 4.8694845350043004e-06,
"loss": 1.0454,
"step": 1268
},
{
"epoch": 0.6675433982114677,
"grad_norm": 2.2819271087646484,
"learning_rate": 4.86926292331123e-06,
"loss": 1.0076,
"step": 1269
},
{
"epoch": 0.6680694371383482,
"grad_norm": 2.162179708480835,
"learning_rate": 4.8690411286839024e-06,
"loss": 1.0145,
"step": 1270
},
{
"epoch": 0.6685954760652288,
"grad_norm": 2.1072568893432617,
"learning_rate": 4.868819151139443e-06,
"loss": 1.0936,
"step": 1271
},
{
"epoch": 0.6691215149921094,
"grad_norm": 2.113056182861328,
"learning_rate": 4.868596990694994e-06,
"loss": 1.044,
"step": 1272
},
{
"epoch": 0.66964755391899,
"grad_norm": 1.9856184720993042,
"learning_rate": 4.868374647367705e-06,
"loss": 1.0119,
"step": 1273
},
{
"epoch": 0.6701735928458706,
"grad_norm": 2.013106346130371,
"learning_rate": 4.868152121174746e-06,
"loss": 1.0913,
"step": 1274
},
{
"epoch": 0.6706996317727512,
"grad_norm": 1.8831686973571777,
"learning_rate": 4.867929412133297e-06,
"loss": 1.0077,
"step": 1275
},
{
"epoch": 0.6712256706996318,
"grad_norm": 2.035214424133301,
"learning_rate": 4.867706520260554e-06,
"loss": 0.9683,
"step": 1276
},
{
"epoch": 0.6717517096265123,
"grad_norm": 2.0336945056915283,
"learning_rate": 4.867483445573727e-06,
"loss": 1.0583,
"step": 1277
},
{
"epoch": 0.672277748553393,
"grad_norm": 1.9241890907287598,
"learning_rate": 4.867260188090041e-06,
"loss": 1.0162,
"step": 1278
},
{
"epoch": 0.6728037874802736,
"grad_norm": 2.122288942337036,
"learning_rate": 4.8670367478267335e-06,
"loss": 1.0633,
"step": 1279
},
{
"epoch": 0.6733298264071541,
"grad_norm": 1.964282512664795,
"learning_rate": 4.8668131248010555e-06,
"loss": 1.0009,
"step": 1280
},
{
"epoch": 0.6738558653340347,
"grad_norm": 2.075181722640991,
"learning_rate": 4.866589319030273e-06,
"loss": 1.0535,
"step": 1281
},
{
"epoch": 0.6743819042609153,
"grad_norm": 2.086574077606201,
"learning_rate": 4.866365330531668e-06,
"loss": 1.0125,
"step": 1282
},
{
"epoch": 0.6749079431877959,
"grad_norm": 2.176712989807129,
"learning_rate": 4.866141159322535e-06,
"loss": 1.0883,
"step": 1283
},
{
"epoch": 0.6754339821146765,
"grad_norm": 2.4133596420288086,
"learning_rate": 4.865916805420181e-06,
"loss": 1.1115,
"step": 1284
},
{
"epoch": 0.6759600210415571,
"grad_norm": 1.9632985591888428,
"learning_rate": 4.865692268841931e-06,
"loss": 0.9837,
"step": 1285
},
{
"epoch": 0.6764860599684377,
"grad_norm": 2.320810556411743,
"learning_rate": 4.865467549605119e-06,
"loss": 1.0307,
"step": 1286
},
{
"epoch": 0.6770120988953182,
"grad_norm": 2.259291172027588,
"learning_rate": 4.865242647727097e-06,
"loss": 1.0125,
"step": 1287
},
{
"epoch": 0.6775381378221988,
"grad_norm": 2.069227695465088,
"learning_rate": 4.8650175632252314e-06,
"loss": 1.0348,
"step": 1288
},
{
"epoch": 0.6780641767490795,
"grad_norm": 2.093912363052368,
"learning_rate": 4.8647922961169e-06,
"loss": 1.0628,
"step": 1289
},
{
"epoch": 0.67859021567596,
"grad_norm": 2.0842857360839844,
"learning_rate": 4.864566846419497e-06,
"loss": 1.0296,
"step": 1290
},
{
"epoch": 0.6791162546028406,
"grad_norm": 2.1448631286621094,
"learning_rate": 4.864341214150428e-06,
"loss": 1.0344,
"step": 1291
},
{
"epoch": 0.6796422935297212,
"grad_norm": 2.173478841781616,
"learning_rate": 4.864115399327115e-06,
"loss": 1.0662,
"step": 1292
},
{
"epoch": 0.6801683324566018,
"grad_norm": 2.1156740188598633,
"learning_rate": 4.863889401966995e-06,
"loss": 1.0568,
"step": 1293
},
{
"epoch": 0.6806943713834824,
"grad_norm": 2.0641050338745117,
"learning_rate": 4.863663222087515e-06,
"loss": 1.0508,
"step": 1294
},
{
"epoch": 0.681220410310363,
"grad_norm": 2.050645112991333,
"learning_rate": 4.863436859706141e-06,
"loss": 1.0198,
"step": 1295
},
{
"epoch": 0.6817464492372436,
"grad_norm": 1.9624086618423462,
"learning_rate": 4.86321031484035e-06,
"loss": 1.012,
"step": 1296
},
{
"epoch": 0.6822724881641241,
"grad_norm": 2.2763307094573975,
"learning_rate": 4.8629835875076325e-06,
"loss": 1.0208,
"step": 1297
},
{
"epoch": 0.6827985270910047,
"grad_norm": 1.952094316482544,
"learning_rate": 4.862756677725496e-06,
"loss": 0.9912,
"step": 1298
},
{
"epoch": 0.6833245660178853,
"grad_norm": 1.9964386224746704,
"learning_rate": 4.862529585511461e-06,
"loss": 1.0216,
"step": 1299
},
{
"epoch": 0.6838506049447659,
"grad_norm": 2.0915441513061523,
"learning_rate": 4.862302310883061e-06,
"loss": 1.028,
"step": 1300
},
{
"epoch": 0.6843766438716465,
"grad_norm": 2.239182233810425,
"learning_rate": 4.862074853857843e-06,
"loss": 1.1119,
"step": 1301
},
{
"epoch": 0.6849026827985271,
"grad_norm": 2.120128870010376,
"learning_rate": 4.861847214453371e-06,
"loss": 1.0811,
"step": 1302
},
{
"epoch": 0.6854287217254077,
"grad_norm": 1.8495033979415894,
"learning_rate": 4.86161939268722e-06,
"loss": 0.9559,
"step": 1303
},
{
"epoch": 0.6859547606522882,
"grad_norm": 1.9767253398895264,
"learning_rate": 4.861391388576982e-06,
"loss": 0.9942,
"step": 1304
},
{
"epoch": 0.6864807995791689,
"grad_norm": 1.9148463010787964,
"learning_rate": 4.8611632021402605e-06,
"loss": 1.0152,
"step": 1305
},
{
"epoch": 0.6870068385060495,
"grad_norm": 2.036726474761963,
"learning_rate": 4.860934833394674e-06,
"loss": 1.0692,
"step": 1306
},
{
"epoch": 0.68753287743293,
"grad_norm": 2.03383731842041,
"learning_rate": 4.860706282357856e-06,
"loss": 1.0429,
"step": 1307
},
{
"epoch": 0.6880589163598106,
"grad_norm": 1.986863374710083,
"learning_rate": 4.860477549047452e-06,
"loss": 0.9737,
"step": 1308
},
{
"epoch": 0.6885849552866912,
"grad_norm": 1.9917157888412476,
"learning_rate": 4.860248633481124e-06,
"loss": 0.9808,
"step": 1309
},
{
"epoch": 0.6891109942135718,
"grad_norm": 1.9868308305740356,
"learning_rate": 4.860019535676546e-06,
"loss": 1.0001,
"step": 1310
},
{
"epoch": 0.6896370331404524,
"grad_norm": 1.9900240898132324,
"learning_rate": 4.859790255651408e-06,
"loss": 1.0561,
"step": 1311
},
{
"epoch": 0.690163072067333,
"grad_norm": 1.987703800201416,
"learning_rate": 4.859560793423412e-06,
"loss": 1.013,
"step": 1312
},
{
"epoch": 0.6906891109942136,
"grad_norm": 1.9851711988449097,
"learning_rate": 4.859331149010276e-06,
"loss": 1.0727,
"step": 1313
},
{
"epoch": 0.6912151499210941,
"grad_norm": 1.9733060598373413,
"learning_rate": 4.8591013224297304e-06,
"loss": 0.9924,
"step": 1314
},
{
"epoch": 0.6917411888479748,
"grad_norm": 1.9737035036087036,
"learning_rate": 4.85887131369952e-06,
"loss": 1.0131,
"step": 1315
},
{
"epoch": 0.6922672277748554,
"grad_norm": 2.176969528198242,
"learning_rate": 4.858641122837407e-06,
"loss": 1.0382,
"step": 1316
},
{
"epoch": 0.6927932667017359,
"grad_norm": 1.951177716255188,
"learning_rate": 4.858410749861161e-06,
"loss": 1.011,
"step": 1317
},
{
"epoch": 0.6933193056286165,
"grad_norm": 2.009986639022827,
"learning_rate": 4.858180194788572e-06,
"loss": 1.0999,
"step": 1318
},
{
"epoch": 0.6938453445554971,
"grad_norm": 2.0470845699310303,
"learning_rate": 4.857949457637441e-06,
"loss": 1.0477,
"step": 1319
},
{
"epoch": 0.6943713834823777,
"grad_norm": 2.163547992706299,
"learning_rate": 4.857718538425582e-06,
"loss": 1.0229,
"step": 1320
},
{
"epoch": 0.6948974224092583,
"grad_norm": 2.0979368686676025,
"learning_rate": 4.857487437170827e-06,
"loss": 1.0686,
"step": 1321
},
{
"epoch": 0.6954234613361389,
"grad_norm": 2.0388388633728027,
"learning_rate": 4.857256153891017e-06,
"loss": 0.991,
"step": 1322
},
{
"epoch": 0.6959495002630195,
"grad_norm": 2.136115312576294,
"learning_rate": 4.8570246886040124e-06,
"loss": 1.0249,
"step": 1323
},
{
"epoch": 0.6964755391899,
"grad_norm": 2.0932974815368652,
"learning_rate": 4.8567930413276835e-06,
"loss": 1.0649,
"step": 1324
},
{
"epoch": 0.6970015781167807,
"grad_norm": 2.0559682846069336,
"learning_rate": 4.856561212079916e-06,
"loss": 0.9931,
"step": 1325
},
{
"epoch": 0.6975276170436613,
"grad_norm": 1.9723689556121826,
"learning_rate": 4.856329200878611e-06,
"loss": 0.9628,
"step": 1326
},
{
"epoch": 0.6980536559705418,
"grad_norm": 2.054049253463745,
"learning_rate": 4.8560970077416805e-06,
"loss": 1.0322,
"step": 1327
},
{
"epoch": 0.6985796948974224,
"grad_norm": 2.100574254989624,
"learning_rate": 4.855864632687055e-06,
"loss": 1.0941,
"step": 1328
},
{
"epoch": 0.699105733824303,
"grad_norm": 2.1415367126464844,
"learning_rate": 4.8556320757326735e-06,
"loss": 1.0341,
"step": 1329
},
{
"epoch": 0.6996317727511836,
"grad_norm": 1.988004207611084,
"learning_rate": 4.855399336896495e-06,
"loss": 1.0357,
"step": 1330
},
{
"epoch": 0.7001578116780641,
"grad_norm": 2.0249714851379395,
"learning_rate": 4.855166416196487e-06,
"loss": 1.0489,
"step": 1331
},
{
"epoch": 0.7006838506049448,
"grad_norm": 1.9197039604187012,
"learning_rate": 4.8549333136506356e-06,
"loss": 1.0094,
"step": 1332
},
{
"epoch": 0.7012098895318254,
"grad_norm": 2.153716564178467,
"learning_rate": 4.854700029276938e-06,
"loss": 1.0613,
"step": 1333
},
{
"epoch": 0.7017359284587059,
"grad_norm": 1.9626339673995972,
"learning_rate": 4.854466563093407e-06,
"loss": 1.024,
"step": 1334
},
{
"epoch": 0.7022619673855865,
"grad_norm": 2.0288281440734863,
"learning_rate": 4.854232915118068e-06,
"loss": 0.9778,
"step": 1335
},
{
"epoch": 0.7027880063124671,
"grad_norm": 1.9677989482879639,
"learning_rate": 4.853999085368963e-06,
"loss": 0.9802,
"step": 1336
},
{
"epoch": 0.7033140452393477,
"grad_norm": 2.054617404937744,
"learning_rate": 4.853765073864144e-06,
"loss": 0.9523,
"step": 1337
},
{
"epoch": 0.7038400841662283,
"grad_norm": 2.0509955883026123,
"learning_rate": 4.853530880621681e-06,
"loss": 1.0324,
"step": 1338
},
{
"epoch": 0.7043661230931089,
"grad_norm": 2.224724054336548,
"learning_rate": 4.853296505659657e-06,
"loss": 1.0965,
"step": 1339
},
{
"epoch": 0.7048921620199895,
"grad_norm": 1.9698208570480347,
"learning_rate": 4.8530619489961664e-06,
"loss": 1.0486,
"step": 1340
},
{
"epoch": 0.70541820094687,
"grad_norm": 2.129383087158203,
"learning_rate": 4.85282721064932e-06,
"loss": 1.0857,
"step": 1341
},
{
"epoch": 0.7059442398737507,
"grad_norm": 2.2943053245544434,
"learning_rate": 4.852592290637244e-06,
"loss": 1.0628,
"step": 1342
},
{
"epoch": 0.7064702788006313,
"grad_norm": 2.0792641639709473,
"learning_rate": 4.852357188978075e-06,
"loss": 1.0604,
"step": 1343
},
{
"epoch": 0.7069963177275118,
"grad_norm": 2.0224812030792236,
"learning_rate": 4.852121905689968e-06,
"loss": 1.0687,
"step": 1344
},
{
"epoch": 0.7075223566543924,
"grad_norm": 2.4030919075012207,
"learning_rate": 4.851886440791087e-06,
"loss": 1.0942,
"step": 1345
},
{
"epoch": 0.708048395581273,
"grad_norm": 2.190215826034546,
"learning_rate": 4.851650794299614e-06,
"loss": 1.0393,
"step": 1346
},
{
"epoch": 0.7085744345081536,
"grad_norm": 2.1099565029144287,
"learning_rate": 4.851414966233743e-06,
"loss": 1.0452,
"step": 1347
},
{
"epoch": 0.7091004734350342,
"grad_norm": 2.156395673751831,
"learning_rate": 4.851178956611682e-06,
"loss": 1.0625,
"step": 1348
},
{
"epoch": 0.7096265123619148,
"grad_norm": 2.1840314865112305,
"learning_rate": 4.850942765451655e-06,
"loss": 1.0467,
"step": 1349
},
{
"epoch": 0.7101525512887954,
"grad_norm": 2.0080723762512207,
"learning_rate": 4.850706392771899e-06,
"loss": 1.0187,
"step": 1350
},
{
"epoch": 0.7106785902156759,
"grad_norm": 2.1242828369140625,
"learning_rate": 4.850469838590664e-06,
"loss": 1.0459,
"step": 1351
},
{
"epoch": 0.7112046291425566,
"grad_norm": 1.9652162790298462,
"learning_rate": 4.8502331029262125e-06,
"loss": 1.0404,
"step": 1352
},
{
"epoch": 0.7117306680694372,
"grad_norm": 2.2363545894622803,
"learning_rate": 4.849996185796827e-06,
"loss": 1.0182,
"step": 1353
},
{
"epoch": 0.7122567069963177,
"grad_norm": 2.028017044067383,
"learning_rate": 4.849759087220798e-06,
"loss": 1.0213,
"step": 1354
},
{
"epoch": 0.7127827459231983,
"grad_norm": 2.265037775039673,
"learning_rate": 4.849521807216432e-06,
"loss": 1.0316,
"step": 1355
},
{
"epoch": 0.7133087848500789,
"grad_norm": 2.083799362182617,
"learning_rate": 4.849284345802051e-06,
"loss": 1.0133,
"step": 1356
},
{
"epoch": 0.7138348237769595,
"grad_norm": 1.9307647943496704,
"learning_rate": 4.8490467029959895e-06,
"loss": 1.0023,
"step": 1357
},
{
"epoch": 0.7143608627038401,
"grad_norm": 2.1079766750335693,
"learning_rate": 4.848808878816595e-06,
"loss": 1.0208,
"step": 1358
},
{
"epoch": 0.7148869016307207,
"grad_norm": 2.0214877128601074,
"learning_rate": 4.8485708732822315e-06,
"loss": 0.9904,
"step": 1359
},
{
"epoch": 0.7154129405576013,
"grad_norm": 2.150768756866455,
"learning_rate": 4.848332686411276e-06,
"loss": 0.9969,
"step": 1360
},
{
"epoch": 0.7159389794844818,
"grad_norm": 2.0330607891082764,
"learning_rate": 4.8480943182221184e-06,
"loss": 0.9865,
"step": 1361
},
{
"epoch": 0.7164650184113625,
"grad_norm": 1.973970651626587,
"learning_rate": 4.847855768733163e-06,
"loss": 0.9815,
"step": 1362
},
{
"epoch": 0.716991057338243,
"grad_norm": 2.074868679046631,
"learning_rate": 4.84761703796283e-06,
"loss": 1.0499,
"step": 1363
},
{
"epoch": 0.7175170962651236,
"grad_norm": 1.9750478267669678,
"learning_rate": 4.8473781259295514e-06,
"loss": 0.9797,
"step": 1364
},
{
"epoch": 0.7180431351920042,
"grad_norm": 1.971375823020935,
"learning_rate": 4.847139032651774e-06,
"loss": 0.9805,
"step": 1365
},
{
"epoch": 0.7185691741188848,
"grad_norm": 2.0710880756378174,
"learning_rate": 4.846899758147958e-06,
"loss": 1.0143,
"step": 1366
},
{
"epoch": 0.7190952130457654,
"grad_norm": 1.9696688652038574,
"learning_rate": 4.8466603024365785e-06,
"loss": 0.9869,
"step": 1367
},
{
"epoch": 0.7196212519726459,
"grad_norm": 2.1022462844848633,
"learning_rate": 4.846420665536126e-06,
"loss": 1.0048,
"step": 1368
},
{
"epoch": 0.7201472908995266,
"grad_norm": 2.164783000946045,
"learning_rate": 4.8461808474651e-06,
"loss": 1.0114,
"step": 1369
},
{
"epoch": 0.7206733298264072,
"grad_norm": 2.0148744583129883,
"learning_rate": 4.845940848242019e-06,
"loss": 1.0232,
"step": 1370
},
{
"epoch": 0.7211993687532877,
"grad_norm": 2.0193605422973633,
"learning_rate": 4.845700667885414e-06,
"loss": 0.9764,
"step": 1371
},
{
"epoch": 0.7217254076801684,
"grad_norm": 2.005157232284546,
"learning_rate": 4.845460306413829e-06,
"loss": 1.0242,
"step": 1372
},
{
"epoch": 0.7222514466070489,
"grad_norm": 2.128805637359619,
"learning_rate": 4.845219763845823e-06,
"loss": 0.9964,
"step": 1373
},
{
"epoch": 0.7227774855339295,
"grad_norm": 1.9924060106277466,
"learning_rate": 4.844979040199968e-06,
"loss": 1.0185,
"step": 1374
},
{
"epoch": 0.7233035244608101,
"grad_norm": 2.2126121520996094,
"learning_rate": 4.844738135494851e-06,
"loss": 1.0013,
"step": 1375
},
{
"epoch": 0.7238295633876907,
"grad_norm": 2.0494630336761475,
"learning_rate": 4.844497049749073e-06,
"loss": 1.0628,
"step": 1376
},
{
"epoch": 0.7243556023145713,
"grad_norm": 2.4115402698516846,
"learning_rate": 4.844255782981249e-06,
"loss": 1.0623,
"step": 1377
},
{
"epoch": 0.7248816412414518,
"grad_norm": 2.062485933303833,
"learning_rate": 4.8440143352100054e-06,
"loss": 1.0115,
"step": 1378
},
{
"epoch": 0.7254076801683325,
"grad_norm": 2.995894432067871,
"learning_rate": 4.843772706453988e-06,
"loss": 1.0805,
"step": 1379
},
{
"epoch": 0.7259337190952131,
"grad_norm": 1.9974204301834106,
"learning_rate": 4.84353089673185e-06,
"loss": 1.0221,
"step": 1380
},
{
"epoch": 0.7264597580220936,
"grad_norm": 2.1927318572998047,
"learning_rate": 4.843288906062264e-06,
"loss": 1.0273,
"step": 1381
},
{
"epoch": 0.7269857969489742,
"grad_norm": 2.0213675498962402,
"learning_rate": 4.8430467344639136e-06,
"loss": 0.968,
"step": 1382
},
{
"epoch": 0.7275118358758548,
"grad_norm": 2.2534306049346924,
"learning_rate": 4.842804381955497e-06,
"loss": 1.0457,
"step": 1383
},
{
"epoch": 0.7280378748027354,
"grad_norm": 2.003638505935669,
"learning_rate": 4.842561848555728e-06,
"loss": 1.0471,
"step": 1384
},
{
"epoch": 0.728563913729616,
"grad_norm": 2.217237949371338,
"learning_rate": 4.842319134283331e-06,
"loss": 1.0348,
"step": 1385
},
{
"epoch": 0.7290899526564966,
"grad_norm": 2.1162800788879395,
"learning_rate": 4.842076239157047e-06,
"loss": 1.0548,
"step": 1386
},
{
"epoch": 0.7296159915833772,
"grad_norm": 2.043252944946289,
"learning_rate": 4.8418331631956325e-06,
"loss": 1.0931,
"step": 1387
},
{
"epoch": 0.7301420305102577,
"grad_norm": 2.099283218383789,
"learning_rate": 4.841589906417853e-06,
"loss": 1.0059,
"step": 1388
},
{
"epoch": 0.7306680694371384,
"grad_norm": 1.9934890270233154,
"learning_rate": 4.8413464688424904e-06,
"loss": 1.0327,
"step": 1389
},
{
"epoch": 0.731194108364019,
"grad_norm": 1.868202567100525,
"learning_rate": 4.841102850488343e-06,
"loss": 0.9622,
"step": 1390
},
{
"epoch": 0.7317201472908995,
"grad_norm": 1.9592076539993286,
"learning_rate": 4.84085905137422e-06,
"loss": 1.0413,
"step": 1391
},
{
"epoch": 0.7322461862177801,
"grad_norm": 2.0478546619415283,
"learning_rate": 4.840615071518946e-06,
"loss": 1.0343,
"step": 1392
},
{
"epoch": 0.7327722251446607,
"grad_norm": 2.4996554851531982,
"learning_rate": 4.840370910941358e-06,
"loss": 1.1106,
"step": 1393
},
{
"epoch": 0.7332982640715413,
"grad_norm": 2.0023233890533447,
"learning_rate": 4.8401265696603085e-06,
"loss": 1.0273,
"step": 1394
},
{
"epoch": 0.7338243029984218,
"grad_norm": 2.0366029739379883,
"learning_rate": 4.8398820476946625e-06,
"loss": 1.0092,
"step": 1395
},
{
"epoch": 0.7343503419253025,
"grad_norm": 2.2142248153686523,
"learning_rate": 4.839637345063302e-06,
"loss": 0.9884,
"step": 1396
},
{
"epoch": 0.7348763808521831,
"grad_norm": 1.9955226182937622,
"learning_rate": 4.839392461785119e-06,
"loss": 1.054,
"step": 1397
},
{
"epoch": 0.7354024197790636,
"grad_norm": 2.0607223510742188,
"learning_rate": 4.839147397879023e-06,
"loss": 0.9826,
"step": 1398
},
{
"epoch": 0.7359284587059443,
"grad_norm": 2.054483652114868,
"learning_rate": 4.8389021533639345e-06,
"loss": 1.0738,
"step": 1399
},
{
"epoch": 0.7364544976328248,
"grad_norm": 2.1066908836364746,
"learning_rate": 4.8386567282587886e-06,
"loss": 1.0937,
"step": 1400
},
{
"epoch": 0.7369805365597054,
"grad_norm": 2.018155097961426,
"learning_rate": 4.8384111225825355e-06,
"loss": 0.9767,
"step": 1401
},
{
"epoch": 0.737506575486586,
"grad_norm": 2.152189016342163,
"learning_rate": 4.83816533635414e-06,
"loss": 1.0062,
"step": 1402
},
{
"epoch": 0.7380326144134666,
"grad_norm": 1.9946335554122925,
"learning_rate": 4.8379193695925785e-06,
"loss": 1.0724,
"step": 1403
},
{
"epoch": 0.7385586533403472,
"grad_norm": 2.077017307281494,
"learning_rate": 4.837673222316843e-06,
"loss": 1.0991,
"step": 1404
},
{
"epoch": 0.7390846922672277,
"grad_norm": 2.0850563049316406,
"learning_rate": 4.837426894545938e-06,
"loss": 1.0527,
"step": 1405
},
{
"epoch": 0.7396107311941084,
"grad_norm": 1.9786406755447388,
"learning_rate": 4.837180386298883e-06,
"loss": 0.9666,
"step": 1406
},
{
"epoch": 0.740136770120989,
"grad_norm": 2.0060155391693115,
"learning_rate": 4.836933697594711e-06,
"loss": 1.0795,
"step": 1407
},
{
"epoch": 0.7406628090478695,
"grad_norm": 2.086906909942627,
"learning_rate": 4.836686828452471e-06,
"loss": 0.9925,
"step": 1408
},
{
"epoch": 0.7411888479747502,
"grad_norm": 2.0125632286071777,
"learning_rate": 4.836439778891223e-06,
"loss": 0.9706,
"step": 1409
},
{
"epoch": 0.7417148869016307,
"grad_norm": 1.8921434879302979,
"learning_rate": 4.836192548930041e-06,
"loss": 1.0237,
"step": 1410
},
{
"epoch": 0.7422409258285113,
"grad_norm": 1.9400858879089355,
"learning_rate": 4.835945138588015e-06,
"loss": 1.0444,
"step": 1411
},
{
"epoch": 0.742766964755392,
"grad_norm": 2.083749294281006,
"learning_rate": 4.835697547884248e-06,
"loss": 1.0136,
"step": 1412
},
{
"epoch": 0.7432930036822725,
"grad_norm": 2.0750844478607178,
"learning_rate": 4.8354497768378575e-06,
"loss": 1.0863,
"step": 1413
},
{
"epoch": 0.7438190426091531,
"grad_norm": 2.137214183807373,
"learning_rate": 4.835201825467973e-06,
"loss": 1.0095,
"step": 1414
},
{
"epoch": 0.7443450815360336,
"grad_norm": 2.06549072265625,
"learning_rate": 4.834953693793739e-06,
"loss": 1.0449,
"step": 1415
},
{
"epoch": 0.7448711204629143,
"grad_norm": 2.0396728515625,
"learning_rate": 4.834705381834315e-06,
"loss": 1.0093,
"step": 1416
},
{
"epoch": 0.7453971593897949,
"grad_norm": 1.993697166442871,
"learning_rate": 4.834456889608874e-06,
"loss": 1.0075,
"step": 1417
},
{
"epoch": 0.7459231983166754,
"grad_norm": 2.1017816066741943,
"learning_rate": 4.834208217136601e-06,
"loss": 1.0687,
"step": 1418
},
{
"epoch": 0.746449237243556,
"grad_norm": 2.0740413665771484,
"learning_rate": 4.833959364436698e-06,
"loss": 0.9777,
"step": 1419
},
{
"epoch": 0.7469752761704366,
"grad_norm": 2.0858206748962402,
"learning_rate": 4.833710331528377e-06,
"loss": 1.044,
"step": 1420
},
{
"epoch": 0.7475013150973172,
"grad_norm": 2.33298921585083,
"learning_rate": 4.833461118430869e-06,
"loss": 1.0602,
"step": 1421
},
{
"epoch": 0.7480273540241978,
"grad_norm": 2.1458897590637207,
"learning_rate": 4.833211725163414e-06,
"loss": 0.9903,
"step": 1422
},
{
"epoch": 0.7485533929510784,
"grad_norm": 2.15071177482605,
"learning_rate": 4.8329621517452685e-06,
"loss": 1.011,
"step": 1423
},
{
"epoch": 0.749079431877959,
"grad_norm": 2.0375895500183105,
"learning_rate": 4.8327123981957025e-06,
"loss": 1.0021,
"step": 1424
},
{
"epoch": 0.7496054708048395,
"grad_norm": 1.9808685779571533,
"learning_rate": 4.832462464534e-06,
"loss": 1.025,
"step": 1425
},
{
"epoch": 0.7501315097317202,
"grad_norm": 2.046558380126953,
"learning_rate": 4.832212350779459e-06,
"loss": 1.0435,
"step": 1426
},
{
"epoch": 0.7506575486586007,
"grad_norm": 2.0020248889923096,
"learning_rate": 4.831962056951392e-06,
"loss": 1.0207,
"step": 1427
},
{
"epoch": 0.7511835875854813,
"grad_norm": 1.9901740550994873,
"learning_rate": 4.831711583069122e-06,
"loss": 1.0505,
"step": 1428
},
{
"epoch": 0.751709626512362,
"grad_norm": 2.112236738204956,
"learning_rate": 4.83146092915199e-06,
"loss": 1.0353,
"step": 1429
},
{
"epoch": 0.7522356654392425,
"grad_norm": 2.0244028568267822,
"learning_rate": 4.831210095219349e-06,
"loss": 1.0169,
"step": 1430
},
{
"epoch": 0.7527617043661231,
"grad_norm": 2.298645257949829,
"learning_rate": 4.830959081290567e-06,
"loss": 1.0498,
"step": 1431
},
{
"epoch": 0.7532877432930036,
"grad_norm": 2.1593234539031982,
"learning_rate": 4.8307078873850244e-06,
"loss": 1.0954,
"step": 1432
},
{
"epoch": 0.7538137822198843,
"grad_norm": 1.9387123584747314,
"learning_rate": 4.830456513522117e-06,
"loss": 0.9784,
"step": 1433
},
{
"epoch": 0.7543398211467649,
"grad_norm": 2.1634531021118164,
"learning_rate": 4.830204959721253e-06,
"loss": 1.0516,
"step": 1434
},
{
"epoch": 0.7548658600736454,
"grad_norm": 1.9310704469680786,
"learning_rate": 4.829953226001855e-06,
"loss": 0.9648,
"step": 1435
},
{
"epoch": 0.7553918990005261,
"grad_norm": 2.0547149181365967,
"learning_rate": 4.8297013123833605e-06,
"loss": 1.0734,
"step": 1436
},
{
"epoch": 0.7559179379274066,
"grad_norm": 2.222872734069824,
"learning_rate": 4.829449218885219e-06,
"loss": 0.9645,
"step": 1437
},
{
"epoch": 0.7564439768542872,
"grad_norm": 2.128120183944702,
"learning_rate": 4.829196945526897e-06,
"loss": 1.046,
"step": 1438
},
{
"epoch": 0.7569700157811678,
"grad_norm": 2.0309526920318604,
"learning_rate": 4.828944492327872e-06,
"loss": 1.0596,
"step": 1439
},
{
"epoch": 0.7574960547080484,
"grad_norm": 2.0946176052093506,
"learning_rate": 4.828691859307635e-06,
"loss": 1.0134,
"step": 1440
},
{
"epoch": 0.758022093634929,
"grad_norm": 1.9159823656082153,
"learning_rate": 4.828439046485693e-06,
"loss": 1.0081,
"step": 1441
},
{
"epoch": 0.7585481325618095,
"grad_norm": 2.203627586364746,
"learning_rate": 4.828186053881566e-06,
"loss": 1.0451,
"step": 1442
},
{
"epoch": 0.7590741714886902,
"grad_norm": 2.065521240234375,
"learning_rate": 4.8279328815147895e-06,
"loss": 1.0289,
"step": 1443
},
{
"epoch": 0.7596002104155708,
"grad_norm": 2.1597719192504883,
"learning_rate": 4.827679529404909e-06,
"loss": 1.0373,
"step": 1444
},
{
"epoch": 0.7601262493424513,
"grad_norm": 2.6100237369537354,
"learning_rate": 4.827425997571488e-06,
"loss": 1.0254,
"step": 1445
},
{
"epoch": 0.760652288269332,
"grad_norm": 2.1975550651550293,
"learning_rate": 4.8271722860341e-06,
"loss": 1.0254,
"step": 1446
},
{
"epoch": 0.7611783271962125,
"grad_norm": 2.019261360168457,
"learning_rate": 4.826918394812336e-06,
"loss": 1.0823,
"step": 1447
},
{
"epoch": 0.7617043661230931,
"grad_norm": 1.9351961612701416,
"learning_rate": 4.8266643239257996e-06,
"loss": 1.0248,
"step": 1448
},
{
"epoch": 0.7622304050499737,
"grad_norm": 1.9437129497528076,
"learning_rate": 4.826410073394106e-06,
"loss": 0.984,
"step": 1449
},
{
"epoch": 0.7627564439768543,
"grad_norm": 2.277479887008667,
"learning_rate": 4.826155643236889e-06,
"loss": 1.0264,
"step": 1450
},
{
"epoch": 0.7632824829037349,
"grad_norm": 2.2033772468566895,
"learning_rate": 4.825901033473791e-06,
"loss": 1.0249,
"step": 1451
},
{
"epoch": 0.7638085218306154,
"grad_norm": 2.1912593841552734,
"learning_rate": 4.825646244124472e-06,
"loss": 1.0366,
"step": 1452
},
{
"epoch": 0.7643345607574961,
"grad_norm": 2.0046746730804443,
"learning_rate": 4.825391275208606e-06,
"loss": 1.0411,
"step": 1453
},
{
"epoch": 0.7648605996843767,
"grad_norm": 2.0601322650909424,
"learning_rate": 4.825136126745877e-06,
"loss": 1.052,
"step": 1454
},
{
"epoch": 0.7653866386112572,
"grad_norm": 2.148794651031494,
"learning_rate": 4.824880798755986e-06,
"loss": 1.04,
"step": 1455
},
{
"epoch": 0.7659126775381379,
"grad_norm": 2.027374505996704,
"learning_rate": 4.824625291258649e-06,
"loss": 1.005,
"step": 1456
},
{
"epoch": 0.7664387164650184,
"grad_norm": 2.0703351497650146,
"learning_rate": 4.824369604273592e-06,
"loss": 1.0157,
"step": 1457
},
{
"epoch": 0.766964755391899,
"grad_norm": 2.1002986431121826,
"learning_rate": 4.8241137378205575e-06,
"loss": 1.0355,
"step": 1458
},
{
"epoch": 0.7674907943187795,
"grad_norm": 1.9970546960830688,
"learning_rate": 4.823857691919302e-06,
"loss": 0.9833,
"step": 1459
},
{
"epoch": 0.7680168332456602,
"grad_norm": 2.0489771366119385,
"learning_rate": 4.823601466589595e-06,
"loss": 1.0351,
"step": 1460
},
{
"epoch": 0.7685428721725408,
"grad_norm": 2.0190834999084473,
"learning_rate": 4.823345061851219e-06,
"loss": 1.0406,
"step": 1461
},
{
"epoch": 0.7690689110994213,
"grad_norm": 2.0567877292633057,
"learning_rate": 4.823088477723973e-06,
"loss": 1.0593,
"step": 1462
},
{
"epoch": 0.769594950026302,
"grad_norm": 1.883132815361023,
"learning_rate": 4.822831714227667e-06,
"loss": 1.0055,
"step": 1463
},
{
"epoch": 0.7701209889531825,
"grad_norm": 1.9520277976989746,
"learning_rate": 4.822574771382127e-06,
"loss": 0.9831,
"step": 1464
},
{
"epoch": 0.7706470278800631,
"grad_norm": 2.0123813152313232,
"learning_rate": 4.822317649207191e-06,
"loss": 0.9841,
"step": 1465
},
{
"epoch": 0.7711730668069438,
"grad_norm": 2.089940309524536,
"learning_rate": 4.8220603477227124e-06,
"loss": 1.0121,
"step": 1466
},
{
"epoch": 0.7716991057338243,
"grad_norm": 1.9485499858856201,
"learning_rate": 4.8218028669485585e-06,
"loss": 0.9744,
"step": 1467
},
{
"epoch": 0.7722251446607049,
"grad_norm": 2.2764859199523926,
"learning_rate": 4.821545206904608e-06,
"loss": 1.0018,
"step": 1468
},
{
"epoch": 0.7727511835875854,
"grad_norm": 2.039769411087036,
"learning_rate": 4.821287367610756e-06,
"loss": 1.0256,
"step": 1469
},
{
"epoch": 0.7732772225144661,
"grad_norm": 2.0036065578460693,
"learning_rate": 4.821029349086911e-06,
"loss": 1.0399,
"step": 1470
},
{
"epoch": 0.7738032614413467,
"grad_norm": 2.056286573410034,
"learning_rate": 4.820771151352996e-06,
"loss": 1.0077,
"step": 1471
},
{
"epoch": 0.7743293003682272,
"grad_norm": 2.0001938343048096,
"learning_rate": 4.820512774428944e-06,
"loss": 1.0109,
"step": 1472
},
{
"epoch": 0.7748553392951079,
"grad_norm": 2.007289409637451,
"learning_rate": 4.820254218334707e-06,
"loss": 1.0223,
"step": 1473
},
{
"epoch": 0.7753813782219884,
"grad_norm": 2.079768657684326,
"learning_rate": 4.8199954830902465e-06,
"loss": 1.0565,
"step": 1474
},
{
"epoch": 0.775907417148869,
"grad_norm": 2.030198097229004,
"learning_rate": 4.819736568715543e-06,
"loss": 1.033,
"step": 1475
},
{
"epoch": 0.7764334560757497,
"grad_norm": 2.6482961177825928,
"learning_rate": 4.819477475230584e-06,
"loss": 1.0595,
"step": 1476
},
{
"epoch": 0.7769594950026302,
"grad_norm": 2.160472869873047,
"learning_rate": 4.8192182026553775e-06,
"loss": 1.0214,
"step": 1477
},
{
"epoch": 0.7774855339295108,
"grad_norm": 2.1956963539123535,
"learning_rate": 4.818958751009941e-06,
"loss": 1.0647,
"step": 1478
},
{
"epoch": 0.7780115728563913,
"grad_norm": 2.346040725708008,
"learning_rate": 4.818699120314306e-06,
"loss": 1.0289,
"step": 1479
},
{
"epoch": 0.778537611783272,
"grad_norm": 2.049593448638916,
"learning_rate": 4.818439310588521e-06,
"loss": 1.0188,
"step": 1480
},
{
"epoch": 0.7790636507101526,
"grad_norm": 1.9567065238952637,
"learning_rate": 4.818179321852646e-06,
"loss": 1.0645,
"step": 1481
},
{
"epoch": 0.7795896896370331,
"grad_norm": 2.0995101928710938,
"learning_rate": 4.817919154126753e-06,
"loss": 1.0283,
"step": 1482
},
{
"epoch": 0.7801157285639138,
"grad_norm": 2.117649555206299,
"learning_rate": 4.817658807430933e-06,
"loss": 0.9973,
"step": 1483
},
{
"epoch": 0.7806417674907943,
"grad_norm": 2.058525800704956,
"learning_rate": 4.817398281785286e-06,
"loss": 1.0278,
"step": 1484
},
{
"epoch": 0.7811678064176749,
"grad_norm": 1.9914313554763794,
"learning_rate": 4.817137577209927e-06,
"loss": 0.9591,
"step": 1485
},
{
"epoch": 0.7816938453445555,
"grad_norm": 1.9432276487350464,
"learning_rate": 4.816876693724987e-06,
"loss": 0.9964,
"step": 1486
},
{
"epoch": 0.7822198842714361,
"grad_norm": 2.011399507522583,
"learning_rate": 4.816615631350608e-06,
"loss": 0.9963,
"step": 1487
},
{
"epoch": 0.7827459231983167,
"grad_norm": 1.9606966972351074,
"learning_rate": 4.816354390106947e-06,
"loss": 0.9756,
"step": 1488
},
{
"epoch": 0.7832719621251972,
"grad_norm": 2.011887788772583,
"learning_rate": 4.816092970014176e-06,
"loss": 1.0194,
"step": 1489
},
{
"epoch": 0.7837980010520779,
"grad_norm": 2.0520918369293213,
"learning_rate": 4.815831371092478e-06,
"loss": 1.02,
"step": 1490
},
{
"epoch": 0.7843240399789585,
"grad_norm": 2.018293619155884,
"learning_rate": 4.815569593362053e-06,
"loss": 1.0289,
"step": 1491
},
{
"epoch": 0.784850078905839,
"grad_norm": 2.016738176345825,
"learning_rate": 4.815307636843112e-06,
"loss": 1.0523,
"step": 1492
},
{
"epoch": 0.7853761178327197,
"grad_norm": 2.063619375228882,
"learning_rate": 4.815045501555882e-06,
"loss": 1.0099,
"step": 1493
},
{
"epoch": 0.7859021567596002,
"grad_norm": 2.122360944747925,
"learning_rate": 4.814783187520602e-06,
"loss": 1.0346,
"step": 1494
},
{
"epoch": 0.7864281956864808,
"grad_norm": 2.040095329284668,
"learning_rate": 4.814520694757526e-06,
"loss": 1.0017,
"step": 1495
},
{
"epoch": 0.7869542346133613,
"grad_norm": 2.003471612930298,
"learning_rate": 4.814258023286922e-06,
"loss": 0.975,
"step": 1496
},
{
"epoch": 0.787480273540242,
"grad_norm": 1.905517816543579,
"learning_rate": 4.81399517312907e-06,
"loss": 0.9899,
"step": 1497
},
{
"epoch": 0.7880063124671226,
"grad_norm": 2.047112226486206,
"learning_rate": 4.813732144304266e-06,
"loss": 0.9558,
"step": 1498
},
{
"epoch": 0.7885323513940031,
"grad_norm": 1.9621355533599854,
"learning_rate": 4.8134689368328194e-06,
"loss": 1.0668,
"step": 1499
},
{
"epoch": 0.7890583903208838,
"grad_norm": 1.9221957921981812,
"learning_rate": 4.813205550735052e-06,
"loss": 1.0082,
"step": 1500
},
{
"epoch": 0.7895844292477643,
"grad_norm": 2.002659797668457,
"learning_rate": 4.812941986031299e-06,
"loss": 1.0192,
"step": 1501
},
{
"epoch": 0.7901104681746449,
"grad_norm": 2.1077136993408203,
"learning_rate": 4.812678242741913e-06,
"loss": 1.0316,
"step": 1502
},
{
"epoch": 0.7906365071015256,
"grad_norm": 2.0782320499420166,
"learning_rate": 4.812414320887256e-06,
"loss": 1.058,
"step": 1503
},
{
"epoch": 0.7911625460284061,
"grad_norm": 2.049888849258423,
"learning_rate": 4.812150220487708e-06,
"loss": 1.0033,
"step": 1504
},
{
"epoch": 0.7916885849552867,
"grad_norm": 2.025468587875366,
"learning_rate": 4.811885941563659e-06,
"loss": 1.0066,
"step": 1505
},
{
"epoch": 0.7922146238821672,
"grad_norm": 2.0612878799438477,
"learning_rate": 4.8116214841355145e-06,
"loss": 0.9783,
"step": 1506
},
{
"epoch": 0.7927406628090479,
"grad_norm": 1.9370075464248657,
"learning_rate": 4.811356848223693e-06,
"loss": 1.0171,
"step": 1507
},
{
"epoch": 0.7932667017359285,
"grad_norm": 2.069326877593994,
"learning_rate": 4.8110920338486285e-06,
"loss": 1.0283,
"step": 1508
},
{
"epoch": 0.793792740662809,
"grad_norm": 2.076786518096924,
"learning_rate": 4.810827041030768e-06,
"loss": 0.9942,
"step": 1509
},
{
"epoch": 0.7943187795896897,
"grad_norm": 1.8861708641052246,
"learning_rate": 4.810561869790571e-06,
"loss": 0.9909,
"step": 1510
},
{
"epoch": 0.7948448185165702,
"grad_norm": 2.064493417739868,
"learning_rate": 4.810296520148513e-06,
"loss": 1.0302,
"step": 1511
},
{
"epoch": 0.7953708574434508,
"grad_norm": 2.0212459564208984,
"learning_rate": 4.810030992125081e-06,
"loss": 0.9912,
"step": 1512
},
{
"epoch": 0.7958968963703315,
"grad_norm": 2.047384023666382,
"learning_rate": 4.809765285740776e-06,
"loss": 1.0165,
"step": 1513
},
{
"epoch": 0.796422935297212,
"grad_norm": 2.2222740650177,
"learning_rate": 4.809499401016115e-06,
"loss": 1.0295,
"step": 1514
},
{
"epoch": 0.7969489742240926,
"grad_norm": 1.9516112804412842,
"learning_rate": 4.809233337971627e-06,
"loss": 0.9562,
"step": 1515
},
{
"epoch": 0.7974750131509731,
"grad_norm": 2.0002121925354004,
"learning_rate": 4.808967096627855e-06,
"loss": 1.0076,
"step": 1516
},
{
"epoch": 0.7980010520778538,
"grad_norm": 2.182039260864258,
"learning_rate": 4.808700677005357e-06,
"loss": 0.9925,
"step": 1517
},
{
"epoch": 0.7985270910047344,
"grad_norm": 2.0578761100769043,
"learning_rate": 4.808434079124701e-06,
"loss": 0.9831,
"step": 1518
},
{
"epoch": 0.7990531299316149,
"grad_norm": 1.8856642246246338,
"learning_rate": 4.8081673030064735e-06,
"loss": 1.0309,
"step": 1519
},
{
"epoch": 0.7995791688584956,
"grad_norm": 2.1273880004882812,
"learning_rate": 4.807900348671272e-06,
"loss": 1.0581,
"step": 1520
},
{
"epoch": 0.8001052077853761,
"grad_norm": 2.0696675777435303,
"learning_rate": 4.8076332161397085e-06,
"loss": 1.0402,
"step": 1521
},
{
"epoch": 0.8006312467122567,
"grad_norm": 2.034176826477051,
"learning_rate": 4.80736590543241e-06,
"loss": 1.018,
"step": 1522
},
{
"epoch": 0.8011572856391374,
"grad_norm": 1.9405510425567627,
"learning_rate": 4.807098416570014e-06,
"loss": 1.037,
"step": 1523
},
{
"epoch": 0.8016833245660179,
"grad_norm": 2.0185844898223877,
"learning_rate": 4.806830749573174e-06,
"loss": 1.0817,
"step": 1524
},
{
"epoch": 0.8022093634928985,
"grad_norm": 2.0617692470550537,
"learning_rate": 4.806562904462559e-06,
"loss": 0.989,
"step": 1525
},
{
"epoch": 0.802735402419779,
"grad_norm": 2.022000789642334,
"learning_rate": 4.806294881258846e-06,
"loss": 1.0245,
"step": 1526
},
{
"epoch": 0.8032614413466597,
"grad_norm": 2.189361572265625,
"learning_rate": 4.806026679982733e-06,
"loss": 1.0537,
"step": 1527
},
{
"epoch": 0.8037874802735402,
"grad_norm": 2.0094563961029053,
"learning_rate": 4.805758300654926e-06,
"loss": 1.0437,
"step": 1528
},
{
"epoch": 0.8043135192004208,
"grad_norm": 1.8940585851669312,
"learning_rate": 4.805489743296148e-06,
"loss": 0.9811,
"step": 1529
},
{
"epoch": 0.8048395581273015,
"grad_norm": 2.0169241428375244,
"learning_rate": 4.805221007927134e-06,
"loss": 1.0354,
"step": 1530
},
{
"epoch": 0.805365597054182,
"grad_norm": 2.1269545555114746,
"learning_rate": 4.804952094568635e-06,
"loss": 1.0439,
"step": 1531
},
{
"epoch": 0.8058916359810626,
"grad_norm": 1.99850332736969,
"learning_rate": 4.804683003241413e-06,
"loss": 1.0313,
"step": 1532
},
{
"epoch": 0.8064176749079431,
"grad_norm": 2.0577683448791504,
"learning_rate": 4.804413733966244e-06,
"loss": 1.0319,
"step": 1533
},
{
"epoch": 0.8069437138348238,
"grad_norm": 1.993945837020874,
"learning_rate": 4.804144286763921e-06,
"loss": 1.0571,
"step": 1534
},
{
"epoch": 0.8074697527617044,
"grad_norm": 2.00144624710083,
"learning_rate": 4.803874661655246e-06,
"loss": 1.0136,
"step": 1535
},
{
"epoch": 0.8079957916885849,
"grad_norm": 2.114583969116211,
"learning_rate": 4.8036048586610394e-06,
"loss": 0.9996,
"step": 1536
},
{
"epoch": 0.8085218306154656,
"grad_norm": 2.019767999649048,
"learning_rate": 4.803334877802131e-06,
"loss": 0.9812,
"step": 1537
},
{
"epoch": 0.8090478695423461,
"grad_norm": 2.1253437995910645,
"learning_rate": 4.803064719099368e-06,
"loss": 1.041,
"step": 1538
},
{
"epoch": 0.8095739084692267,
"grad_norm": 2.055514335632324,
"learning_rate": 4.802794382573609e-06,
"loss": 0.9733,
"step": 1539
},
{
"epoch": 0.8100999473961074,
"grad_norm": 2.0274434089660645,
"learning_rate": 4.802523868245727e-06,
"loss": 1.0222,
"step": 1540
},
{
"epoch": 0.8106259863229879,
"grad_norm": 2.1663291454315186,
"learning_rate": 4.80225317613661e-06,
"loss": 1.0308,
"step": 1541
},
{
"epoch": 0.8111520252498685,
"grad_norm": 1.8864918947219849,
"learning_rate": 4.801982306267156e-06,
"loss": 0.9551,
"step": 1542
},
{
"epoch": 0.811678064176749,
"grad_norm": 2.1302011013031006,
"learning_rate": 4.801711258658281e-06,
"loss": 1.0188,
"step": 1543
},
{
"epoch": 0.8122041031036297,
"grad_norm": 1.9002829790115356,
"learning_rate": 4.801440033330914e-06,
"loss": 1.0278,
"step": 1544
},
{
"epoch": 0.8127301420305103,
"grad_norm": 2.1114113330841064,
"learning_rate": 4.801168630305995e-06,
"loss": 1.0616,
"step": 1545
},
{
"epoch": 0.8132561809573908,
"grad_norm": 1.9383304119110107,
"learning_rate": 4.800897049604479e-06,
"loss": 0.9977,
"step": 1546
},
{
"epoch": 0.8137822198842715,
"grad_norm": 1.9206221103668213,
"learning_rate": 4.800625291247338e-06,
"loss": 0.9758,
"step": 1547
},
{
"epoch": 0.814308258811152,
"grad_norm": 1.9258513450622559,
"learning_rate": 4.800353355255552e-06,
"loss": 0.985,
"step": 1548
},
{
"epoch": 0.8148342977380326,
"grad_norm": 1.9767898321151733,
"learning_rate": 4.800081241650117e-06,
"loss": 0.9802,
"step": 1549
},
{
"epoch": 0.8153603366649133,
"grad_norm": 1.9899487495422363,
"learning_rate": 4.799808950452047e-06,
"loss": 1.0104,
"step": 1550
},
{
"epoch": 0.8158863755917938,
"grad_norm": 1.9970616102218628,
"learning_rate": 4.799536481682362e-06,
"loss": 1.0125,
"step": 1551
},
{
"epoch": 0.8164124145186744,
"grad_norm": 1.9914542436599731,
"learning_rate": 4.799263835362103e-06,
"loss": 1.0458,
"step": 1552
},
{
"epoch": 0.8169384534455549,
"grad_norm": 2.072939157485962,
"learning_rate": 4.798991011512319e-06,
"loss": 1.0663,
"step": 1553
},
{
"epoch": 0.8174644923724356,
"grad_norm": 1.9783833026885986,
"learning_rate": 4.798718010154076e-06,
"loss": 1.0281,
"step": 1554
},
{
"epoch": 0.8179905312993162,
"grad_norm": 2.4431405067443848,
"learning_rate": 4.798444831308454e-06,
"loss": 1.0667,
"step": 1555
},
{
"epoch": 0.8185165702261967,
"grad_norm": 2.1270408630371094,
"learning_rate": 4.798171474996543e-06,
"loss": 1.0217,
"step": 1556
},
{
"epoch": 0.8190426091530774,
"grad_norm": 2.091042995452881,
"learning_rate": 4.797897941239452e-06,
"loss": 1.0126,
"step": 1557
},
{
"epoch": 0.8195686480799579,
"grad_norm": 2.016575336456299,
"learning_rate": 4.797624230058299e-06,
"loss": 1.0269,
"step": 1558
},
{
"epoch": 0.8200946870068385,
"grad_norm": 2.1780738830566406,
"learning_rate": 4.797350341474218e-06,
"loss": 1.0405,
"step": 1559
},
{
"epoch": 0.820620725933719,
"grad_norm": 2.0331525802612305,
"learning_rate": 4.797076275508358e-06,
"loss": 1.0452,
"step": 1560
},
{
"epoch": 0.8211467648605997,
"grad_norm": 2.0023865699768066,
"learning_rate": 4.796802032181877e-06,
"loss": 0.9752,
"step": 1561
},
{
"epoch": 0.8216728037874803,
"grad_norm": 2.11030912399292,
"learning_rate": 4.796527611515952e-06,
"loss": 1.0675,
"step": 1562
},
{
"epoch": 0.8221988427143608,
"grad_norm": 2.0733113288879395,
"learning_rate": 4.7962530135317705e-06,
"loss": 1.0511,
"step": 1563
},
{
"epoch": 0.8227248816412415,
"grad_norm": 2.0920655727386475,
"learning_rate": 4.795978238250535e-06,
"loss": 1.0797,
"step": 1564
},
{
"epoch": 0.823250920568122,
"grad_norm": 2.218693256378174,
"learning_rate": 4.795703285693461e-06,
"loss": 1.0385,
"step": 1565
},
{
"epoch": 0.8237769594950026,
"grad_norm": 1.9661623239517212,
"learning_rate": 4.795428155881779e-06,
"loss": 1.001,
"step": 1566
},
{
"epoch": 0.8243029984218833,
"grad_norm": 2.1669209003448486,
"learning_rate": 4.795152848836731e-06,
"loss": 1.0317,
"step": 1567
},
{
"epoch": 0.8248290373487638,
"grad_norm": 1.9323532581329346,
"learning_rate": 4.794877364579573e-06,
"loss": 1.0182,
"step": 1568
},
{
"epoch": 0.8253550762756444,
"grad_norm": 1.9551295042037964,
"learning_rate": 4.794601703131579e-06,
"loss": 1.0048,
"step": 1569
},
{
"epoch": 0.8258811152025249,
"grad_norm": 1.9809366464614868,
"learning_rate": 4.7943258645140285e-06,
"loss": 1.0377,
"step": 1570
},
{
"epoch": 0.8264071541294056,
"grad_norm": 2.0074756145477295,
"learning_rate": 4.794049848748224e-06,
"loss": 1.0218,
"step": 1571
},
{
"epoch": 0.8269331930562862,
"grad_norm": 2.0177736282348633,
"learning_rate": 4.793773655855474e-06,
"loss": 1.0402,
"step": 1572
},
{
"epoch": 0.8274592319831667,
"grad_norm": 2.0348360538482666,
"learning_rate": 4.7934972858571035e-06,
"loss": 1.0312,
"step": 1573
},
{
"epoch": 0.8279852709100474,
"grad_norm": 2.097808599472046,
"learning_rate": 4.793220738774455e-06,
"loss": 1.0618,
"step": 1574
},
{
"epoch": 0.8285113098369279,
"grad_norm": 2.061023473739624,
"learning_rate": 4.792944014628877e-06,
"loss": 1.0464,
"step": 1575
},
{
"epoch": 0.8290373487638085,
"grad_norm": 2.1510798931121826,
"learning_rate": 4.792667113441738e-06,
"loss": 1.0102,
"step": 1576
},
{
"epoch": 0.8295633876906892,
"grad_norm": 2.1446409225463867,
"learning_rate": 4.7923900352344185e-06,
"loss": 1.0577,
"step": 1577
},
{
"epoch": 0.8300894266175697,
"grad_norm": 2.2582831382751465,
"learning_rate": 4.79211278002831e-06,
"loss": 1.1042,
"step": 1578
},
{
"epoch": 0.8306154655444503,
"grad_norm": 2.0069401264190674,
"learning_rate": 4.791835347844821e-06,
"loss": 0.9835,
"step": 1579
},
{
"epoch": 0.8311415044713308,
"grad_norm": 2.0074360370635986,
"learning_rate": 4.791557738705372e-06,
"loss": 1.0596,
"step": 1580
},
{
"epoch": 0.8316675433982115,
"grad_norm": 2.2237892150878906,
"learning_rate": 4.791279952631399e-06,
"loss": 1.0162,
"step": 1581
},
{
"epoch": 0.8321935823250921,
"grad_norm": 2.0037453174591064,
"learning_rate": 4.791001989644349e-06,
"loss": 0.9879,
"step": 1582
},
{
"epoch": 0.8327196212519726,
"grad_norm": 1.994869351387024,
"learning_rate": 4.790723849765684e-06,
"loss": 0.9908,
"step": 1583
},
{
"epoch": 0.8332456601788533,
"grad_norm": 2.1808955669403076,
"learning_rate": 4.790445533016879e-06,
"loss": 0.9896,
"step": 1584
},
{
"epoch": 0.8337716991057338,
"grad_norm": 1.9274131059646606,
"learning_rate": 4.790167039419424e-06,
"loss": 0.9383,
"step": 1585
},
{
"epoch": 0.8342977380326144,
"grad_norm": 2.0095322132110596,
"learning_rate": 4.789888368994823e-06,
"loss": 1.0282,
"step": 1586
},
{
"epoch": 0.8348237769594951,
"grad_norm": 1.957546353340149,
"learning_rate": 4.7896095217645895e-06,
"loss": 0.9559,
"step": 1587
},
{
"epoch": 0.8353498158863756,
"grad_norm": 2.1231918334960938,
"learning_rate": 4.789330497750258e-06,
"loss": 1.0414,
"step": 1588
},
{
"epoch": 0.8358758548132562,
"grad_norm": 2.0618984699249268,
"learning_rate": 4.789051296973368e-06,
"loss": 0.9931,
"step": 1589
},
{
"epoch": 0.8364018937401367,
"grad_norm": 2.023416042327881,
"learning_rate": 4.78877191945548e-06,
"loss": 0.963,
"step": 1590
},
{
"epoch": 0.8369279326670174,
"grad_norm": 2.0902810096740723,
"learning_rate": 4.788492365218164e-06,
"loss": 1.076,
"step": 1591
},
{
"epoch": 0.8374539715938979,
"grad_norm": 1.9094164371490479,
"learning_rate": 4.788212634283005e-06,
"loss": 0.9444,
"step": 1592
},
{
"epoch": 0.8379800105207785,
"grad_norm": 1.9887592792510986,
"learning_rate": 4.7879327266716e-06,
"loss": 1.0364,
"step": 1593
},
{
"epoch": 0.8385060494476592,
"grad_norm": 2.0019707679748535,
"learning_rate": 4.787652642405564e-06,
"loss": 1.0544,
"step": 1594
},
{
"epoch": 0.8390320883745397,
"grad_norm": 2.0776329040527344,
"learning_rate": 4.787372381506521e-06,
"loss": 0.9949,
"step": 1595
},
{
"epoch": 0.8395581273014203,
"grad_norm": 2.0091662406921387,
"learning_rate": 4.7870919439961094e-06,
"loss": 1.0165,
"step": 1596
},
{
"epoch": 0.8400841662283008,
"grad_norm": 2.0458288192749023,
"learning_rate": 4.786811329895984e-06,
"loss": 1.0341,
"step": 1597
},
{
"epoch": 0.8406102051551815,
"grad_norm": 2.0741751194000244,
"learning_rate": 4.78653053922781e-06,
"loss": 1.0509,
"step": 1598
},
{
"epoch": 0.8411362440820621,
"grad_norm": 2.141406774520874,
"learning_rate": 4.7862495720132695e-06,
"loss": 1.0665,
"step": 1599
},
{
"epoch": 0.8416622830089426,
"grad_norm": 2.2400975227355957,
"learning_rate": 4.785968428274055e-06,
"loss": 0.93,
"step": 1600
},
{
"epoch": 0.8421883219358233,
"grad_norm": 1.929742455482483,
"learning_rate": 4.785687108031875e-06,
"loss": 1.0339,
"step": 1601
},
{
"epoch": 0.8427143608627038,
"grad_norm": 2.012728452682495,
"learning_rate": 4.785405611308448e-06,
"loss": 0.9945,
"step": 1602
},
{
"epoch": 0.8432403997895844,
"grad_norm": 2.0826306343078613,
"learning_rate": 4.785123938125511e-06,
"loss": 1.0322,
"step": 1603
},
{
"epoch": 0.8437664387164651,
"grad_norm": 2.0303595066070557,
"learning_rate": 4.784842088504813e-06,
"loss": 1.0304,
"step": 1604
},
{
"epoch": 0.8442924776433456,
"grad_norm": 2.0710513591766357,
"learning_rate": 4.7845600624681145e-06,
"loss": 1.0358,
"step": 1605
},
{
"epoch": 0.8448185165702262,
"grad_norm": 2.052515983581543,
"learning_rate": 4.784277860037192e-06,
"loss": 1.0316,
"step": 1606
},
{
"epoch": 0.8453445554971067,
"grad_norm": 2.1331636905670166,
"learning_rate": 4.783995481233835e-06,
"loss": 1.0139,
"step": 1607
},
{
"epoch": 0.8458705944239874,
"grad_norm": 1.9738709926605225,
"learning_rate": 4.783712926079846e-06,
"loss": 1.034,
"step": 1608
},
{
"epoch": 0.846396633350868,
"grad_norm": 2.059412956237793,
"learning_rate": 4.78343019459704e-06,
"loss": 1.0468,
"step": 1609
},
{
"epoch": 0.8469226722777485,
"grad_norm": 2.027773141860962,
"learning_rate": 4.783147286807249e-06,
"loss": 1.0028,
"step": 1610
},
{
"epoch": 0.8474487112046292,
"grad_norm": 2.1288933753967285,
"learning_rate": 4.782864202732317e-06,
"loss": 1.0177,
"step": 1611
},
{
"epoch": 0.8479747501315097,
"grad_norm": 2.160947322845459,
"learning_rate": 4.7825809423941e-06,
"loss": 0.9814,
"step": 1612
},
{
"epoch": 0.8485007890583903,
"grad_norm": 2.021970272064209,
"learning_rate": 4.782297505814469e-06,
"loss": 1.0198,
"step": 1613
},
{
"epoch": 0.849026827985271,
"grad_norm": 1.9154043197631836,
"learning_rate": 4.7820138930153106e-06,
"loss": 1.0044,
"step": 1614
},
{
"epoch": 0.8495528669121515,
"grad_norm": 2.0858964920043945,
"learning_rate": 4.781730104018521e-06,
"loss": 0.9932,
"step": 1615
},
{
"epoch": 0.8500789058390321,
"grad_norm": 2.236711025238037,
"learning_rate": 4.7814461388460105e-06,
"loss": 1.0495,
"step": 1616
},
{
"epoch": 0.8506049447659126,
"grad_norm": 2.0810344219207764,
"learning_rate": 4.781161997519707e-06,
"loss": 1.0617,
"step": 1617
},
{
"epoch": 0.8511309836927933,
"grad_norm": 2.224187135696411,
"learning_rate": 4.780877680061551e-06,
"loss": 0.9911,
"step": 1618
},
{
"epoch": 0.8516570226196739,
"grad_norm": 1.8846218585968018,
"learning_rate": 4.780593186493491e-06,
"loss": 1.0185,
"step": 1619
},
{
"epoch": 0.8521830615465544,
"grad_norm": 2.0876333713531494,
"learning_rate": 4.780308516837495e-06,
"loss": 1.0173,
"step": 1620
},
{
"epoch": 0.8527091004734351,
"grad_norm": 1.942492961883545,
"learning_rate": 4.780023671115544e-06,
"loss": 1.0154,
"step": 1621
},
{
"epoch": 0.8532351394003156,
"grad_norm": 1.9483400583267212,
"learning_rate": 4.779738649349629e-06,
"loss": 1.0492,
"step": 1622
},
{
"epoch": 0.8537611783271962,
"grad_norm": 1.8866205215454102,
"learning_rate": 4.7794534515617586e-06,
"loss": 0.9896,
"step": 1623
},
{
"epoch": 0.8542872172540767,
"grad_norm": 2.146117687225342,
"learning_rate": 4.779168077773953e-06,
"loss": 1.0391,
"step": 1624
},
{
"epoch": 0.8548132561809574,
"grad_norm": 2.099858283996582,
"learning_rate": 4.778882528008245e-06,
"loss": 1.0185,
"step": 1625
},
{
"epoch": 0.855339295107838,
"grad_norm": 2.0597662925720215,
"learning_rate": 4.7785968022866846e-06,
"loss": 1.0373,
"step": 1626
},
{
"epoch": 0.8558653340347185,
"grad_norm": 2.0234663486480713,
"learning_rate": 4.7783109006313316e-06,
"loss": 1.0471,
"step": 1627
},
{
"epoch": 0.8563913729615992,
"grad_norm": 1.9113049507141113,
"learning_rate": 4.778024823064261e-06,
"loss": 1.01,
"step": 1628
},
{
"epoch": 0.8569174118884797,
"grad_norm": 2.4924910068511963,
"learning_rate": 4.777738569607562e-06,
"loss": 1.0267,
"step": 1629
},
{
"epoch": 0.8574434508153603,
"grad_norm": 1.9605613946914673,
"learning_rate": 4.777452140283336e-06,
"loss": 1.0237,
"step": 1630
},
{
"epoch": 0.857969489742241,
"grad_norm": 2.1404225826263428,
"learning_rate": 4.7771655351136996e-06,
"loss": 1.0353,
"step": 1631
},
{
"epoch": 0.8584955286691215,
"grad_norm": 2.1174509525299072,
"learning_rate": 4.776878754120781e-06,
"loss": 1.0517,
"step": 1632
},
{
"epoch": 0.8590215675960021,
"grad_norm": 1.895843267440796,
"learning_rate": 4.7765917973267226e-06,
"loss": 0.9479,
"step": 1633
},
{
"epoch": 0.8595476065228826,
"grad_norm": 2.080152988433838,
"learning_rate": 4.776304664753682e-06,
"loss": 1.0642,
"step": 1634
},
{
"epoch": 0.8600736454497633,
"grad_norm": 1.9730490446090698,
"learning_rate": 4.776017356423827e-06,
"loss": 1.0059,
"step": 1635
},
{
"epoch": 0.8605996843766439,
"grad_norm": 2.19085693359375,
"learning_rate": 4.775729872359343e-06,
"loss": 1.0368,
"step": 1636
},
{
"epoch": 0.8611257233035244,
"grad_norm": 2.14911150932312,
"learning_rate": 4.775442212582428e-06,
"loss": 1.0583,
"step": 1637
},
{
"epoch": 0.8616517622304051,
"grad_norm": 1.9603419303894043,
"learning_rate": 4.775154377115291e-06,
"loss": 1.0336,
"step": 1638
},
{
"epoch": 0.8621778011572856,
"grad_norm": 1.9417442083358765,
"learning_rate": 4.774866365980156e-06,
"loss": 0.9885,
"step": 1639
},
{
"epoch": 0.8627038400841662,
"grad_norm": 2.092170000076294,
"learning_rate": 4.774578179199261e-06,
"loss": 1.0496,
"step": 1640
},
{
"epoch": 0.8632298790110469,
"grad_norm": 2.0614163875579834,
"learning_rate": 4.774289816794858e-06,
"loss": 1.0011,
"step": 1641
},
{
"epoch": 0.8637559179379274,
"grad_norm": 2.168977975845337,
"learning_rate": 4.774001278789211e-06,
"loss": 1.0342,
"step": 1642
},
{
"epoch": 0.864281956864808,
"grad_norm": 2.0560708045959473,
"learning_rate": 4.773712565204599e-06,
"loss": 1.0239,
"step": 1643
},
{
"epoch": 0.8648079957916885,
"grad_norm": 1.9980727434158325,
"learning_rate": 4.773423676063314e-06,
"loss": 1.0312,
"step": 1644
},
{
"epoch": 0.8653340347185692,
"grad_norm": 2.0650413036346436,
"learning_rate": 4.773134611387661e-06,
"loss": 1.0468,
"step": 1645
},
{
"epoch": 0.8658600736454498,
"grad_norm": 1.954148530960083,
"learning_rate": 4.77284537119996e-06,
"loss": 1.0138,
"step": 1646
},
{
"epoch": 0.8663861125723303,
"grad_norm": 2.092515468597412,
"learning_rate": 4.772555955522543e-06,
"loss": 0.987,
"step": 1647
},
{
"epoch": 0.866912151499211,
"grad_norm": 2.007941246032715,
"learning_rate": 4.772266364377757e-06,
"loss": 0.9918,
"step": 1648
},
{
"epoch": 0.8674381904260915,
"grad_norm": 1.9608757495880127,
"learning_rate": 4.77197659778796e-06,
"loss": 1.0502,
"step": 1649
},
{
"epoch": 0.8679642293529721,
"grad_norm": 2.0067436695098877,
"learning_rate": 4.771686655775527e-06,
"loss": 1.0335,
"step": 1650
},
{
"epoch": 0.8684902682798528,
"grad_norm": 2.079745292663574,
"learning_rate": 4.771396538362845e-06,
"loss": 1.043,
"step": 1651
},
{
"epoch": 0.8690163072067333,
"grad_norm": 1.9542405605316162,
"learning_rate": 4.771106245572313e-06,
"loss": 0.984,
"step": 1652
},
{
"epoch": 0.8695423461336139,
"grad_norm": 2.028416872024536,
"learning_rate": 4.770815777426346e-06,
"loss": 0.9933,
"step": 1653
},
{
"epoch": 0.8700683850604944,
"grad_norm": 1.9436818361282349,
"learning_rate": 4.77052513394737e-06,
"loss": 1.0118,
"step": 1654
},
{
"epoch": 0.8705944239873751,
"grad_norm": 2.028409004211426,
"learning_rate": 4.770234315157828e-06,
"loss": 1.0494,
"step": 1655
},
{
"epoch": 0.8711204629142556,
"grad_norm": 2.0709540843963623,
"learning_rate": 4.769943321080174e-06,
"loss": 1.0542,
"step": 1656
},
{
"epoch": 0.8716465018411362,
"grad_norm": 2.0256619453430176,
"learning_rate": 4.7696521517368755e-06,
"loss": 1.0011,
"step": 1657
},
{
"epoch": 0.8721725407680169,
"grad_norm": 2.0937297344207764,
"learning_rate": 4.769360807150414e-06,
"loss": 0.9974,
"step": 1658
},
{
"epoch": 0.8726985796948974,
"grad_norm": 2.2346062660217285,
"learning_rate": 4.769069287343285e-06,
"loss": 1.0128,
"step": 1659
},
{
"epoch": 0.873224618621778,
"grad_norm": 2.1082491874694824,
"learning_rate": 4.7687775923379975e-06,
"loss": 1.0321,
"step": 1660
},
{
"epoch": 0.8737506575486585,
"grad_norm": 2.0769453048706055,
"learning_rate": 4.768485722157074e-06,
"loss": 0.973,
"step": 1661
},
{
"epoch": 0.8742766964755392,
"grad_norm": 2.0329558849334717,
"learning_rate": 4.768193676823048e-06,
"loss": 1.0102,
"step": 1662
},
{
"epoch": 0.8748027354024198,
"grad_norm": 2.0758261680603027,
"learning_rate": 4.767901456358471e-06,
"loss": 1.0125,
"step": 1663
},
{
"epoch": 0.8753287743293003,
"grad_norm": 2.12320613861084,
"learning_rate": 4.767609060785905e-06,
"loss": 1.0294,
"step": 1664
},
{
"epoch": 0.875854813256181,
"grad_norm": 1.9771841764450073,
"learning_rate": 4.767316490127927e-06,
"loss": 0.9886,
"step": 1665
},
{
"epoch": 0.8763808521830615,
"grad_norm": 1.9373329877853394,
"learning_rate": 4.7670237444071255e-06,
"loss": 0.994,
"step": 1666
},
{
"epoch": 0.8769068911099421,
"grad_norm": 2.0343801975250244,
"learning_rate": 4.766730823646105e-06,
"loss": 1.0352,
"step": 1667
},
{
"epoch": 0.8774329300368228,
"grad_norm": 2.020343542098999,
"learning_rate": 4.766437727867481e-06,
"loss": 0.979,
"step": 1668
},
{
"epoch": 0.8779589689637033,
"grad_norm": 2.107820510864258,
"learning_rate": 4.766144457093886e-06,
"loss": 1.0296,
"step": 1669
},
{
"epoch": 0.8784850078905839,
"grad_norm": 2.1452198028564453,
"learning_rate": 4.765851011347962e-06,
"loss": 1.0438,
"step": 1670
},
{
"epoch": 0.8790110468174644,
"grad_norm": 2.087686777114868,
"learning_rate": 4.7655573906523665e-06,
"loss": 0.9788,
"step": 1671
},
{
"epoch": 0.8795370857443451,
"grad_norm": 2.083097457885742,
"learning_rate": 4.765263595029771e-06,
"loss": 0.9921,
"step": 1672
},
{
"epoch": 0.8800631246712257,
"grad_norm": 2.0001168251037598,
"learning_rate": 4.76496962450286e-06,
"loss": 0.9784,
"step": 1673
},
{
"epoch": 0.8805891635981062,
"grad_norm": 1.9493898153305054,
"learning_rate": 4.7646754790943315e-06,
"loss": 1.0145,
"step": 1674
},
{
"epoch": 0.8811152025249869,
"grad_norm": 2.140746831893921,
"learning_rate": 4.764381158826896e-06,
"loss": 1.0286,
"step": 1675
},
{
"epoch": 0.8816412414518674,
"grad_norm": 2.0411407947540283,
"learning_rate": 4.764086663723278e-06,
"loss": 1.0297,
"step": 1676
},
{
"epoch": 0.882167280378748,
"grad_norm": 2.164043664932251,
"learning_rate": 4.763791993806218e-06,
"loss": 1.0246,
"step": 1677
},
{
"epoch": 0.8826933193056287,
"grad_norm": 2.0231616497039795,
"learning_rate": 4.7634971490984675e-06,
"loss": 0.9692,
"step": 1678
},
{
"epoch": 0.8832193582325092,
"grad_norm": 2.0884130001068115,
"learning_rate": 4.763202129622789e-06,
"loss": 1.0441,
"step": 1679
},
{
"epoch": 0.8837453971593898,
"grad_norm": 1.959078311920166,
"learning_rate": 4.7629069354019654e-06,
"loss": 1.0166,
"step": 1680
},
{
"epoch": 0.8842714360862703,
"grad_norm": 1.836121916770935,
"learning_rate": 4.762611566458786e-06,
"loss": 1.0347,
"step": 1681
},
{
"epoch": 0.884797475013151,
"grad_norm": 2.099907398223877,
"learning_rate": 4.762316022816058e-06,
"loss": 1.0309,
"step": 1682
},
{
"epoch": 0.8853235139400316,
"grad_norm": 1.941465139389038,
"learning_rate": 4.7620203044966004e-06,
"loss": 1.0203,
"step": 1683
},
{
"epoch": 0.8858495528669121,
"grad_norm": 1.893522024154663,
"learning_rate": 4.761724411523247e-06,
"loss": 0.9769,
"step": 1684
},
{
"epoch": 0.8863755917937928,
"grad_norm": 1.9919662475585938,
"learning_rate": 4.7614283439188426e-06,
"loss": 1.0116,
"step": 1685
},
{
"epoch": 0.8869016307206733,
"grad_norm": 1.9670614004135132,
"learning_rate": 4.761132101706249e-06,
"loss": 0.9719,
"step": 1686
},
{
"epoch": 0.8874276696475539,
"grad_norm": 1.9545384645462036,
"learning_rate": 4.760835684908337e-06,
"loss": 0.9986,
"step": 1687
},
{
"epoch": 0.8879537085744345,
"grad_norm": 1.9402283430099487,
"learning_rate": 4.7605390935479946e-06,
"loss": 0.9911,
"step": 1688
},
{
"epoch": 0.8884797475013151,
"grad_norm": 1.954526424407959,
"learning_rate": 4.760242327648122e-06,
"loss": 1.0021,
"step": 1689
},
{
"epoch": 0.8890057864281957,
"grad_norm": 1.9458253383636475,
"learning_rate": 4.759945387231633e-06,
"loss": 1.0346,
"step": 1690
},
{
"epoch": 0.8895318253550762,
"grad_norm": 1.9583990573883057,
"learning_rate": 4.7596482723214565e-06,
"loss": 1.0509,
"step": 1691
},
{
"epoch": 0.8900578642819569,
"grad_norm": 2.0227482318878174,
"learning_rate": 4.75935098294053e-06,
"loss": 1.0651,
"step": 1692
},
{
"epoch": 0.8905839032088374,
"grad_norm": 1.977971076965332,
"learning_rate": 4.7590535191118096e-06,
"loss": 1.0609,
"step": 1693
},
{
"epoch": 0.891109942135718,
"grad_norm": 2.0564186573028564,
"learning_rate": 4.758755880858262e-06,
"loss": 1.0125,
"step": 1694
},
{
"epoch": 0.8916359810625987,
"grad_norm": 1.9081783294677734,
"learning_rate": 4.75845806820287e-06,
"loss": 1.007,
"step": 1695
},
{
"epoch": 0.8921620199894792,
"grad_norm": 2.0456745624542236,
"learning_rate": 4.758160081168626e-06,
"loss": 1.0116,
"step": 1696
},
{
"epoch": 0.8926880589163598,
"grad_norm": 1.9237746000289917,
"learning_rate": 4.757861919778539e-06,
"loss": 1.0023,
"step": 1697
},
{
"epoch": 0.8932140978432404,
"grad_norm": 1.9402356147766113,
"learning_rate": 4.75756358405563e-06,
"loss": 1.0264,
"step": 1698
},
{
"epoch": 0.893740136770121,
"grad_norm": 1.9538573026657104,
"learning_rate": 4.757265074022935e-06,
"loss": 0.9582,
"step": 1699
},
{
"epoch": 0.8942661756970016,
"grad_norm": 2.09053897857666,
"learning_rate": 4.756966389703501e-06,
"loss": 1.0245,
"step": 1700
},
{
"epoch": 0.8947922146238821,
"grad_norm": 2.071685552597046,
"learning_rate": 4.756667531120391e-06,
"loss": 1.0124,
"step": 1701
},
{
"epoch": 0.8953182535507628,
"grad_norm": 2.0141103267669678,
"learning_rate": 4.75636849829668e-06,
"loss": 0.9852,
"step": 1702
},
{
"epoch": 0.8958442924776433,
"grad_norm": 1.9167203903198242,
"learning_rate": 4.756069291255456e-06,
"loss": 1.0194,
"step": 1703
},
{
"epoch": 0.8963703314045239,
"grad_norm": 2.011918067932129,
"learning_rate": 4.755769910019823e-06,
"loss": 1.0029,
"step": 1704
},
{
"epoch": 0.8968963703314046,
"grad_norm": 2.1252031326293945,
"learning_rate": 4.755470354612895e-06,
"loss": 1.0071,
"step": 1705
},
{
"epoch": 0.8974224092582851,
"grad_norm": 2.0214016437530518,
"learning_rate": 4.755170625057801e-06,
"loss": 1.0371,
"step": 1706
},
{
"epoch": 0.8979484481851657,
"grad_norm": 2.4289193153381348,
"learning_rate": 4.754870721377685e-06,
"loss": 1.0581,
"step": 1707
},
{
"epoch": 0.8984744871120462,
"grad_norm": 2.1093404293060303,
"learning_rate": 4.754570643595702e-06,
"loss": 1.0017,
"step": 1708
},
{
"epoch": 0.8990005260389269,
"grad_norm": 2.0420546531677246,
"learning_rate": 4.7542703917350215e-06,
"loss": 1.0642,
"step": 1709
},
{
"epoch": 0.8995265649658075,
"grad_norm": 1.9818446636199951,
"learning_rate": 4.753969965818827e-06,
"loss": 1.0313,
"step": 1710
},
{
"epoch": 0.900052603892688,
"grad_norm": 1.897628664970398,
"learning_rate": 4.753669365870313e-06,
"loss": 0.9875,
"step": 1711
},
{
"epoch": 0.9005786428195687,
"grad_norm": 2.0208487510681152,
"learning_rate": 4.753368591912693e-06,
"loss": 1.0271,
"step": 1712
},
{
"epoch": 0.9011046817464492,
"grad_norm": 1.9346519708633423,
"learning_rate": 4.753067643969186e-06,
"loss": 1.0352,
"step": 1713
},
{
"epoch": 0.9016307206733298,
"grad_norm": 2.0617661476135254,
"learning_rate": 4.75276652206303e-06,
"loss": 0.9806,
"step": 1714
},
{
"epoch": 0.9021567596002105,
"grad_norm": 1.8809938430786133,
"learning_rate": 4.752465226217477e-06,
"loss": 1.0333,
"step": 1715
},
{
"epoch": 0.902682798527091,
"grad_norm": 2.047309398651123,
"learning_rate": 4.752163756455789e-06,
"loss": 1.0614,
"step": 1716
},
{
"epoch": 0.9032088374539716,
"grad_norm": 2.1308083534240723,
"learning_rate": 4.751862112801242e-06,
"loss": 1.0229,
"step": 1717
},
{
"epoch": 0.9037348763808521,
"grad_norm": 2.0333852767944336,
"learning_rate": 4.751560295277127e-06,
"loss": 1.0077,
"step": 1718
},
{
"epoch": 0.9042609153077328,
"grad_norm": 1.9486128091812134,
"learning_rate": 4.7512583039067485e-06,
"loss": 1.0026,
"step": 1719
},
{
"epoch": 0.9047869542346134,
"grad_norm": 2.004258394241333,
"learning_rate": 4.750956138713424e-06,
"loss": 0.986,
"step": 1720
},
{
"epoch": 0.9053129931614939,
"grad_norm": 2.5763192176818848,
"learning_rate": 4.750653799720483e-06,
"loss": 0.979,
"step": 1721
},
{
"epoch": 0.9058390320883746,
"grad_norm": 2.1086039543151855,
"learning_rate": 4.750351286951269e-06,
"loss": 1.0368,
"step": 1722
},
{
"epoch": 0.9063650710152551,
"grad_norm": 2.0445361137390137,
"learning_rate": 4.750048600429141e-06,
"loss": 0.9756,
"step": 1723
},
{
"epoch": 0.9068911099421357,
"grad_norm": 1.8900635242462158,
"learning_rate": 4.7497457401774694e-06,
"loss": 0.8947,
"step": 1724
},
{
"epoch": 0.9074171488690163,
"grad_norm": 2.116900682449341,
"learning_rate": 4.749442706219638e-06,
"loss": 1.0502,
"step": 1725
},
{
"epoch": 0.9079431877958969,
"grad_norm": 2.1096391677856445,
"learning_rate": 4.749139498579044e-06,
"loss": 1.0089,
"step": 1726
},
{
"epoch": 0.9084692267227775,
"grad_norm": 2.2117018699645996,
"learning_rate": 4.7488361172791005e-06,
"loss": 1.056,
"step": 1727
},
{
"epoch": 0.908995265649658,
"grad_norm": 2.0012335777282715,
"learning_rate": 4.748532562343231e-06,
"loss": 0.916,
"step": 1728
},
{
"epoch": 0.9095213045765387,
"grad_norm": 1.8673421144485474,
"learning_rate": 4.748228833794872e-06,
"loss": 0.9844,
"step": 1729
},
{
"epoch": 0.9100473435034192,
"grad_norm": 1.9152559041976929,
"learning_rate": 4.747924931657477e-06,
"loss": 0.9619,
"step": 1730
},
{
"epoch": 0.9105733824302998,
"grad_norm": 2.107985496520996,
"learning_rate": 4.7476208559545104e-06,
"loss": 1.017,
"step": 1731
},
{
"epoch": 0.9110994213571805,
"grad_norm": 2.162464141845703,
"learning_rate": 4.7473166067094474e-06,
"loss": 1.0197,
"step": 1732
},
{
"epoch": 0.911625460284061,
"grad_norm": 2.085958480834961,
"learning_rate": 4.747012183945784e-06,
"loss": 1.0166,
"step": 1733
},
{
"epoch": 0.9121514992109416,
"grad_norm": 2.0198309421539307,
"learning_rate": 4.746707587687022e-06,
"loss": 0.9883,
"step": 1734
},
{
"epoch": 0.9126775381378222,
"grad_norm": 2.013784646987915,
"learning_rate": 4.746402817956681e-06,
"loss": 0.9775,
"step": 1735
},
{
"epoch": 0.9132035770647028,
"grad_norm": 2.1442627906799316,
"learning_rate": 4.746097874778293e-06,
"loss": 1.0358,
"step": 1736
},
{
"epoch": 0.9137296159915834,
"grad_norm": 2.143627643585205,
"learning_rate": 4.745792758175402e-06,
"loss": 0.9537,
"step": 1737
},
{
"epoch": 0.914255654918464,
"grad_norm": 1.9581515789031982,
"learning_rate": 4.745487468171566e-06,
"loss": 0.9756,
"step": 1738
},
{
"epoch": 0.9147816938453446,
"grad_norm": 1.9869537353515625,
"learning_rate": 4.74518200479036e-06,
"loss": 0.995,
"step": 1739
},
{
"epoch": 0.9153077327722251,
"grad_norm": 1.9129465818405151,
"learning_rate": 4.744876368055365e-06,
"loss": 1.0088,
"step": 1740
},
{
"epoch": 0.9158337716991057,
"grad_norm": 1.957229733467102,
"learning_rate": 4.744570557990183e-06,
"loss": 0.9832,
"step": 1741
},
{
"epoch": 0.9163598106259864,
"grad_norm": 2.061002492904663,
"learning_rate": 4.744264574618425e-06,
"loss": 1.0338,
"step": 1742
},
{
"epoch": 0.9168858495528669,
"grad_norm": 2.0439558029174805,
"learning_rate": 4.743958417963715e-06,
"loss": 1.0678,
"step": 1743
},
{
"epoch": 0.9174118884797475,
"grad_norm": 2.0407450199127197,
"learning_rate": 4.743652088049695e-06,
"loss": 1.0219,
"step": 1744
},
{
"epoch": 0.917937927406628,
"grad_norm": 2.2696166038513184,
"learning_rate": 4.743345584900014e-06,
"loss": 0.9909,
"step": 1745
},
{
"epoch": 0.9184639663335087,
"grad_norm": 1.9783145189285278,
"learning_rate": 4.74303890853834e-06,
"loss": 0.9423,
"step": 1746
},
{
"epoch": 0.9189900052603893,
"grad_norm": 2.019179344177246,
"learning_rate": 4.74273205898835e-06,
"loss": 0.9985,
"step": 1747
},
{
"epoch": 0.9195160441872698,
"grad_norm": 1.966417670249939,
"learning_rate": 4.742425036273737e-06,
"loss": 1.0605,
"step": 1748
},
{
"epoch": 0.9200420831141505,
"grad_norm": 1.9425163269042969,
"learning_rate": 4.742117840418207e-06,
"loss": 0.9855,
"step": 1749
},
{
"epoch": 0.920568122041031,
"grad_norm": 1.9825159311294556,
"learning_rate": 4.741810471445478e-06,
"loss": 1.0214,
"step": 1750
},
{
"epoch": 0.9210941609679116,
"grad_norm": 1.9764158725738525,
"learning_rate": 4.741502929379284e-06,
"loss": 1.0249,
"step": 1751
},
{
"epoch": 0.9216201998947923,
"grad_norm": 2.0177724361419678,
"learning_rate": 4.74119521424337e-06,
"loss": 1.0434,
"step": 1752
},
{
"epoch": 0.9221462388216728,
"grad_norm": 2.0949506759643555,
"learning_rate": 4.740887326061495e-06,
"loss": 1.0331,
"step": 1753
},
{
"epoch": 0.9226722777485534,
"grad_norm": 1.9468920230865479,
"learning_rate": 4.740579264857431e-06,
"loss": 0.9212,
"step": 1754
},
{
"epoch": 0.923198316675434,
"grad_norm": 2.2116925716400146,
"learning_rate": 4.740271030654965e-06,
"loss": 1.0241,
"step": 1755
},
{
"epoch": 0.9237243556023146,
"grad_norm": 1.9227603673934937,
"learning_rate": 4.739962623477896e-06,
"loss": 0.98,
"step": 1756
},
{
"epoch": 0.9242503945291951,
"grad_norm": 2.013141632080078,
"learning_rate": 4.739654043350036e-06,
"loss": 1.0321,
"step": 1757
},
{
"epoch": 0.9247764334560757,
"grad_norm": 2.1053218841552734,
"learning_rate": 4.739345290295211e-06,
"loss": 1.0359,
"step": 1758
},
{
"epoch": 0.9253024723829564,
"grad_norm": 2.072932243347168,
"learning_rate": 4.739036364337261e-06,
"loss": 0.9826,
"step": 1759
},
{
"epoch": 0.9258285113098369,
"grad_norm": 2.104072093963623,
"learning_rate": 4.738727265500037e-06,
"loss": 1.0239,
"step": 1760
},
{
"epoch": 0.9263545502367175,
"grad_norm": 2.0704009532928467,
"learning_rate": 4.738417993807407e-06,
"loss": 1.0235,
"step": 1761
},
{
"epoch": 0.9268805891635981,
"grad_norm": 1.9992990493774414,
"learning_rate": 4.738108549283249e-06,
"loss": 0.988,
"step": 1762
},
{
"epoch": 0.9274066280904787,
"grad_norm": 2.150501251220703,
"learning_rate": 4.737798931951456e-06,
"loss": 1.0574,
"step": 1763
},
{
"epoch": 0.9279326670173593,
"grad_norm": 1.906421184539795,
"learning_rate": 4.7374891418359345e-06,
"loss": 1.0479,
"step": 1764
},
{
"epoch": 0.9284587059442398,
"grad_norm": 1.8720351457595825,
"learning_rate": 4.737179178960603e-06,
"loss": 1.038,
"step": 1765
},
{
"epoch": 0.9289847448711205,
"grad_norm": 1.9185991287231445,
"learning_rate": 4.736869043349394e-06,
"loss": 1.0632,
"step": 1766
},
{
"epoch": 0.929510783798001,
"grad_norm": 2.040290594100952,
"learning_rate": 4.736558735026255e-06,
"loss": 0.9857,
"step": 1767
},
{
"epoch": 0.9300368227248816,
"grad_norm": 1.9188529253005981,
"learning_rate": 4.7362482540151445e-06,
"loss": 1.0115,
"step": 1768
},
{
"epoch": 0.9305628616517623,
"grad_norm": 2.092855215072632,
"learning_rate": 4.7359376003400345e-06,
"loss": 1.0318,
"step": 1769
},
{
"epoch": 0.9310889005786428,
"grad_norm": 1.9537826776504517,
"learning_rate": 4.735626774024912e-06,
"loss": 1.0005,
"step": 1770
},
{
"epoch": 0.9316149395055234,
"grad_norm": 1.8022964000701904,
"learning_rate": 4.735315775093775e-06,
"loss": 0.9696,
"step": 1771
},
{
"epoch": 0.932140978432404,
"grad_norm": 2.0534324645996094,
"learning_rate": 4.735004603570639e-06,
"loss": 1.0647,
"step": 1772
},
{
"epoch": 0.9326670173592846,
"grad_norm": 2.082421064376831,
"learning_rate": 4.734693259479527e-06,
"loss": 1.0168,
"step": 1773
},
{
"epoch": 0.9331930562861652,
"grad_norm": 2.2331955432891846,
"learning_rate": 4.734381742844481e-06,
"loss": 1.0288,
"step": 1774
},
{
"epoch": 0.9337190952130457,
"grad_norm": 1.9978649616241455,
"learning_rate": 4.73407005368955e-06,
"loss": 0.9542,
"step": 1775
},
{
"epoch": 0.9342451341399264,
"grad_norm": 2.054856061935425,
"learning_rate": 4.733758192038804e-06,
"loss": 1.0457,
"step": 1776
},
{
"epoch": 0.9347711730668069,
"grad_norm": 2.1446175575256348,
"learning_rate": 4.733446157916319e-06,
"loss": 1.0767,
"step": 1777
},
{
"epoch": 0.9352972119936875,
"grad_norm": 2.149594783782959,
"learning_rate": 4.7331339513461905e-06,
"loss": 0.9975,
"step": 1778
},
{
"epoch": 0.9358232509205682,
"grad_norm": 2.0066800117492676,
"learning_rate": 4.732821572352522e-06,
"loss": 1.0296,
"step": 1779
},
{
"epoch": 0.9363492898474487,
"grad_norm": 2.4036574363708496,
"learning_rate": 4.732509020959434e-06,
"loss": 0.9726,
"step": 1780
},
{
"epoch": 0.9368753287743293,
"grad_norm": 2.0901482105255127,
"learning_rate": 4.73219629719106e-06,
"loss": 1.0748,
"step": 1781
},
{
"epoch": 0.9374013677012099,
"grad_norm": 2.093503713607788,
"learning_rate": 4.731883401071543e-06,
"loss": 1.0413,
"step": 1782
},
{
"epoch": 0.9379274066280905,
"grad_norm": 2.1437647342681885,
"learning_rate": 4.731570332625044e-06,
"loss": 1.0624,
"step": 1783
},
{
"epoch": 0.9384534455549711,
"grad_norm": 2.141866445541382,
"learning_rate": 4.731257091875736e-06,
"loss": 0.9547,
"step": 1784
},
{
"epoch": 0.9389794844818516,
"grad_norm": 2.138530731201172,
"learning_rate": 4.730943678847804e-06,
"loss": 1.0498,
"step": 1785
},
{
"epoch": 0.9395055234087323,
"grad_norm": 2.192941188812256,
"learning_rate": 4.730630093565447e-06,
"loss": 1.0426,
"step": 1786
},
{
"epoch": 0.9400315623356128,
"grad_norm": 1.9256808757781982,
"learning_rate": 4.730316336052877e-06,
"loss": 0.9864,
"step": 1787
},
{
"epoch": 0.9405576012624934,
"grad_norm": 2.1694893836975098,
"learning_rate": 4.730002406334321e-06,
"loss": 0.9926,
"step": 1788
},
{
"epoch": 0.941083640189374,
"grad_norm": 1.9891979694366455,
"learning_rate": 4.729688304434017e-06,
"loss": 0.9835,
"step": 1789
},
{
"epoch": 0.9416096791162546,
"grad_norm": 2.112396240234375,
"learning_rate": 4.729374030376217e-06,
"loss": 1.0131,
"step": 1790
},
{
"epoch": 0.9421357180431352,
"grad_norm": 2.049139976501465,
"learning_rate": 4.729059584185187e-06,
"loss": 1.0176,
"step": 1791
},
{
"epoch": 0.9426617569700158,
"grad_norm": 2.259706497192383,
"learning_rate": 4.728744965885207e-06,
"loss": 1.0566,
"step": 1792
},
{
"epoch": 0.9431877958968964,
"grad_norm": 1.9924520254135132,
"learning_rate": 4.728430175500567e-06,
"loss": 0.9912,
"step": 1793
},
{
"epoch": 0.9437138348237769,
"grad_norm": 2.1724114418029785,
"learning_rate": 4.728115213055573e-06,
"loss": 0.9919,
"step": 1794
},
{
"epoch": 0.9442398737506575,
"grad_norm": 2.083853244781494,
"learning_rate": 4.7278000785745445e-06,
"loss": 1.0368,
"step": 1795
},
{
"epoch": 0.9447659126775382,
"grad_norm": 2.089245080947876,
"learning_rate": 4.727484772081814e-06,
"loss": 1.0471,
"step": 1796
},
{
"epoch": 0.9452919516044187,
"grad_norm": 1.9880348443984985,
"learning_rate": 4.727169293601725e-06,
"loss": 0.9752,
"step": 1797
},
{
"epoch": 0.9458179905312993,
"grad_norm": 2.0518887042999268,
"learning_rate": 4.7268536431586375e-06,
"loss": 0.977,
"step": 1798
},
{
"epoch": 0.9463440294581799,
"grad_norm": 2.3292527198791504,
"learning_rate": 4.726537820776922e-06,
"loss": 0.9696,
"step": 1799
},
{
"epoch": 0.9468700683850605,
"grad_norm": 2.093759775161743,
"learning_rate": 4.7262218264809656e-06,
"loss": 1.028,
"step": 1800
},
{
"epoch": 0.9473961073119411,
"grad_norm": 1.9579375982284546,
"learning_rate": 4.7259056602951644e-06,
"loss": 0.9797,
"step": 1801
},
{
"epoch": 0.9479221462388217,
"grad_norm": 2.1174583435058594,
"learning_rate": 4.725589322243932e-06,
"loss": 0.9993,
"step": 1802
},
{
"epoch": 0.9484481851657023,
"grad_norm": 2.167732000350952,
"learning_rate": 4.725272812351692e-06,
"loss": 1.0031,
"step": 1803
},
{
"epoch": 0.9489742240925828,
"grad_norm": 2.1166253089904785,
"learning_rate": 4.724956130642883e-06,
"loss": 1.0029,
"step": 1804
},
{
"epoch": 0.9495002630194634,
"grad_norm": 2.0212886333465576,
"learning_rate": 4.724639277141957e-06,
"loss": 1.0202,
"step": 1805
},
{
"epoch": 0.9500263019463441,
"grad_norm": 2.1849446296691895,
"learning_rate": 4.7243222518733775e-06,
"loss": 0.9847,
"step": 1806
},
{
"epoch": 0.9505523408732246,
"grad_norm": 2.019671678543091,
"learning_rate": 4.724005054861623e-06,
"loss": 1.0141,
"step": 1807
},
{
"epoch": 0.9510783798001052,
"grad_norm": 2.0654826164245605,
"learning_rate": 4.723687686131186e-06,
"loss": 1.0266,
"step": 1808
},
{
"epoch": 0.9516044187269858,
"grad_norm": 2.0668342113494873,
"learning_rate": 4.7233701457065694e-06,
"loss": 1.0249,
"step": 1809
},
{
"epoch": 0.9521304576538664,
"grad_norm": 1.9022929668426514,
"learning_rate": 4.723052433612292e-06,
"loss": 1.0092,
"step": 1810
},
{
"epoch": 0.952656496580747,
"grad_norm": 2.0411059856414795,
"learning_rate": 4.722734549872884e-06,
"loss": 0.9896,
"step": 1811
},
{
"epoch": 0.9531825355076275,
"grad_norm": 2.0354626178741455,
"learning_rate": 4.722416494512889e-06,
"loss": 0.9529,
"step": 1812
},
{
"epoch": 0.9537085744345082,
"grad_norm": 1.866688847541809,
"learning_rate": 4.722098267556867e-06,
"loss": 0.971,
"step": 1813
},
{
"epoch": 0.9542346133613887,
"grad_norm": 1.9963386058807373,
"learning_rate": 4.721779869029387e-06,
"loss": 0.9931,
"step": 1814
},
{
"epoch": 0.9547606522882693,
"grad_norm": 1.9810550212860107,
"learning_rate": 4.721461298955033e-06,
"loss": 1.0335,
"step": 1815
},
{
"epoch": 0.95528669121515,
"grad_norm": 2.0094194412231445,
"learning_rate": 4.721142557358402e-06,
"loss": 1.0248,
"step": 1816
},
{
"epoch": 0.9558127301420305,
"grad_norm": 2.110318183898926,
"learning_rate": 4.720823644264106e-06,
"loss": 0.9726,
"step": 1817
},
{
"epoch": 0.9563387690689111,
"grad_norm": 2.051914691925049,
"learning_rate": 4.720504559696768e-06,
"loss": 1.0205,
"step": 1818
},
{
"epoch": 0.9568648079957917,
"grad_norm": 2.0969302654266357,
"learning_rate": 4.7201853036810245e-06,
"loss": 1.0313,
"step": 1819
},
{
"epoch": 0.9573908469226723,
"grad_norm": 2.098721742630005,
"learning_rate": 4.719865876241525e-06,
"loss": 1.0276,
"step": 1820
},
{
"epoch": 0.9579168858495528,
"grad_norm": 1.9741021394729614,
"learning_rate": 4.719546277402936e-06,
"loss": 1.0142,
"step": 1821
},
{
"epoch": 0.9584429247764334,
"grad_norm": 2.1097187995910645,
"learning_rate": 4.71922650718993e-06,
"loss": 0.9812,
"step": 1822
},
{
"epoch": 0.9589689637033141,
"grad_norm": 2.1343348026275635,
"learning_rate": 4.718906565627201e-06,
"loss": 1.0126,
"step": 1823
},
{
"epoch": 0.9594950026301946,
"grad_norm": 2.089698553085327,
"learning_rate": 4.71858645273945e-06,
"loss": 0.9982,
"step": 1824
},
{
"epoch": 0.9600210415570752,
"grad_norm": 2.1942148208618164,
"learning_rate": 4.7182661685513925e-06,
"loss": 1.0781,
"step": 1825
},
{
"epoch": 0.9605470804839558,
"grad_norm": 1.92880380153656,
"learning_rate": 4.7179457130877605e-06,
"loss": 1.0214,
"step": 1826
},
{
"epoch": 0.9610731194108364,
"grad_norm": 2.093219518661499,
"learning_rate": 4.717625086373295e-06,
"loss": 1.0411,
"step": 1827
},
{
"epoch": 0.961599158337717,
"grad_norm": 1.9406787157058716,
"learning_rate": 4.7173042884327525e-06,
"loss": 1.0296,
"step": 1828
},
{
"epoch": 0.9621251972645976,
"grad_norm": 1.9737564325332642,
"learning_rate": 4.7169833192909025e-06,
"loss": 1.0119,
"step": 1829
},
{
"epoch": 0.9626512361914782,
"grad_norm": 1.9281796216964722,
"learning_rate": 4.7166621789725276e-06,
"loss": 1.0203,
"step": 1830
},
{
"epoch": 0.9631772751183587,
"grad_norm": 2.128120183944702,
"learning_rate": 4.716340867502424e-06,
"loss": 1.087,
"step": 1831
},
{
"epoch": 0.9637033140452393,
"grad_norm": 2.1313352584838867,
"learning_rate": 4.716019384905399e-06,
"loss": 1.0049,
"step": 1832
},
{
"epoch": 0.96422935297212,
"grad_norm": 1.882323980331421,
"learning_rate": 4.715697731206275e-06,
"loss": 1.052,
"step": 1833
},
{
"epoch": 0.9647553918990005,
"grad_norm": 1.902729868888855,
"learning_rate": 4.71537590642989e-06,
"loss": 1.013,
"step": 1834
},
{
"epoch": 0.9652814308258811,
"grad_norm": 1.9752705097198486,
"learning_rate": 4.715053910601089e-06,
"loss": 0.9964,
"step": 1835
},
{
"epoch": 0.9658074697527617,
"grad_norm": 2.2092044353485107,
"learning_rate": 4.714731743744736e-06,
"loss": 1.0142,
"step": 1836
},
{
"epoch": 0.9663335086796423,
"grad_norm": 1.9738699197769165,
"learning_rate": 4.714409405885706e-06,
"loss": 1.0431,
"step": 1837
},
{
"epoch": 0.9668595476065229,
"grad_norm": 1.94752836227417,
"learning_rate": 4.714086897048886e-06,
"loss": 0.9776,
"step": 1838
},
{
"epoch": 0.9673855865334035,
"grad_norm": 2.044384717941284,
"learning_rate": 4.713764217259178e-06,
"loss": 0.9428,
"step": 1839
},
{
"epoch": 0.9679116254602841,
"grad_norm": 2.067378520965576,
"learning_rate": 4.713441366541497e-06,
"loss": 1.0222,
"step": 1840
},
{
"epoch": 0.9684376643871646,
"grad_norm": 2.0729427337646484,
"learning_rate": 4.71311834492077e-06,
"loss": 1.0244,
"step": 1841
},
{
"epoch": 0.9689637033140452,
"grad_norm": 1.9986896514892578,
"learning_rate": 4.712795152421938e-06,
"loss": 1.0246,
"step": 1842
},
{
"epoch": 0.9694897422409259,
"grad_norm": 2.134274482727051,
"learning_rate": 4.712471789069956e-06,
"loss": 1.0317,
"step": 1843
},
{
"epoch": 0.9700157811678064,
"grad_norm": 2.116116762161255,
"learning_rate": 4.7121482548897896e-06,
"loss": 1.0431,
"step": 1844
},
{
"epoch": 0.970541820094687,
"grad_norm": 2.146329164505005,
"learning_rate": 4.7118245499064205e-06,
"loss": 1.0185,
"step": 1845
},
{
"epoch": 0.9710678590215676,
"grad_norm": 2.2587080001831055,
"learning_rate": 4.711500674144844e-06,
"loss": 1.0172,
"step": 1846
},
{
"epoch": 0.9715938979484482,
"grad_norm": 2.133565902709961,
"learning_rate": 4.7111766276300645e-06,
"loss": 1.0887,
"step": 1847
},
{
"epoch": 0.9721199368753288,
"grad_norm": 2.4180047512054443,
"learning_rate": 4.710852410387103e-06,
"loss": 1.0686,
"step": 1848
},
{
"epoch": 0.9726459758022094,
"grad_norm": 1.9758679866790771,
"learning_rate": 4.7105280224409936e-06,
"loss": 0.9851,
"step": 1849
},
{
"epoch": 0.97317201472909,
"grad_norm": 2.0190632343292236,
"learning_rate": 4.710203463816782e-06,
"loss": 0.9967,
"step": 1850
},
{
"epoch": 0.9736980536559705,
"grad_norm": 2.0636117458343506,
"learning_rate": 4.709878734539527e-06,
"loss": 1.0209,
"step": 1851
},
{
"epoch": 0.9742240925828511,
"grad_norm": 2.0756478309631348,
"learning_rate": 4.709553834634303e-06,
"loss": 0.9793,
"step": 1852
},
{
"epoch": 0.9747501315097317,
"grad_norm": 1.94191312789917,
"learning_rate": 4.709228764126195e-06,
"loss": 0.9697,
"step": 1853
},
{
"epoch": 0.9752761704366123,
"grad_norm": 2.057345390319824,
"learning_rate": 4.708903523040303e-06,
"loss": 0.938,
"step": 1854
},
{
"epoch": 0.9758022093634929,
"grad_norm": 2.1611337661743164,
"learning_rate": 4.7085781114017384e-06,
"loss": 1.0464,
"step": 1855
},
{
"epoch": 0.9763282482903735,
"grad_norm": 1.9461411237716675,
"learning_rate": 4.708252529235627e-06,
"loss": 0.9934,
"step": 1856
},
{
"epoch": 0.9768542872172541,
"grad_norm": 1.9107236862182617,
"learning_rate": 4.707926776567108e-06,
"loss": 0.9895,
"step": 1857
},
{
"epoch": 0.9773803261441346,
"grad_norm": 2.0953640937805176,
"learning_rate": 4.707600853421332e-06,
"loss": 1.0009,
"step": 1858
},
{
"epoch": 0.9779063650710152,
"grad_norm": 2.126648187637329,
"learning_rate": 4.707274759823466e-06,
"loss": 0.9801,
"step": 1859
},
{
"epoch": 0.9784324039978959,
"grad_norm": 2.0868916511535645,
"learning_rate": 4.706948495798687e-06,
"loss": 0.9765,
"step": 1860
},
{
"epoch": 0.9789584429247764,
"grad_norm": 2.0332181453704834,
"learning_rate": 4.706622061372185e-06,
"loss": 1.0216,
"step": 1861
},
{
"epoch": 0.979484481851657,
"grad_norm": 2.05155348777771,
"learning_rate": 4.706295456569167e-06,
"loss": 1.0594,
"step": 1862
},
{
"epoch": 0.9800105207785376,
"grad_norm": 2.1178739070892334,
"learning_rate": 4.7059686814148485e-06,
"loss": 1.0463,
"step": 1863
},
{
"epoch": 0.9805365597054182,
"grad_norm": 1.9961886405944824,
"learning_rate": 4.705641735934462e-06,
"loss": 0.9658,
"step": 1864
},
{
"epoch": 0.9810625986322988,
"grad_norm": 1.9905188083648682,
"learning_rate": 4.705314620153251e-06,
"loss": 0.9677,
"step": 1865
},
{
"epoch": 0.9815886375591794,
"grad_norm": 1.9200838804244995,
"learning_rate": 4.704987334096471e-06,
"loss": 1.0011,
"step": 1866
},
{
"epoch": 0.98211467648606,
"grad_norm": 2.069359302520752,
"learning_rate": 4.704659877789395e-06,
"loss": 1.01,
"step": 1867
},
{
"epoch": 0.9826407154129405,
"grad_norm": 1.8069074153900146,
"learning_rate": 4.704332251257304e-06,
"loss": 1.037,
"step": 1868
},
{
"epoch": 0.9831667543398211,
"grad_norm": 1.9900349378585815,
"learning_rate": 4.704004454525496e-06,
"loss": 1.0035,
"step": 1869
},
{
"epoch": 0.9836927932667018,
"grad_norm": 1.902032494544983,
"learning_rate": 4.70367648761928e-06,
"loss": 1.0001,
"step": 1870
},
{
"epoch": 0.9842188321935823,
"grad_norm": 2.5718839168548584,
"learning_rate": 4.703348350563978e-06,
"loss": 1.002,
"step": 1871
},
{
"epoch": 0.9847448711204629,
"grad_norm": 1.90852952003479,
"learning_rate": 4.703020043384927e-06,
"loss": 1.0338,
"step": 1872
},
{
"epoch": 0.9852709100473435,
"grad_norm": 2.0179872512817383,
"learning_rate": 4.702691566107477e-06,
"loss": 0.9724,
"step": 1873
},
{
"epoch": 0.9857969489742241,
"grad_norm": 2.0315425395965576,
"learning_rate": 4.702362918756988e-06,
"loss": 1.0256,
"step": 1874
},
{
"epoch": 0.9863229879011047,
"grad_norm": 1.898896336555481,
"learning_rate": 4.702034101358837e-06,
"loss": 0.9695,
"step": 1875
},
{
"epoch": 0.9868490268279853,
"grad_norm": 2.1176962852478027,
"learning_rate": 4.701705113938411e-06,
"loss": 1.0217,
"step": 1876
},
{
"epoch": 0.9873750657548659,
"grad_norm": 1.94914972782135,
"learning_rate": 4.701375956521113e-06,
"loss": 1.0081,
"step": 1877
},
{
"epoch": 0.9879011046817464,
"grad_norm": 1.9665032625198364,
"learning_rate": 4.701046629132358e-06,
"loss": 1.0174,
"step": 1878
},
{
"epoch": 0.988427143608627,
"grad_norm": 2.005793571472168,
"learning_rate": 4.700717131797573e-06,
"loss": 0.9653,
"step": 1879
},
{
"epoch": 0.9889531825355077,
"grad_norm": 2.0769705772399902,
"learning_rate": 4.700387464542199e-06,
"loss": 1.0142,
"step": 1880
},
{
"epoch": 0.9894792214623882,
"grad_norm": 1.9945422410964966,
"learning_rate": 4.700057627391689e-06,
"loss": 1.0225,
"step": 1881
},
{
"epoch": 0.9900052603892688,
"grad_norm": 2.1121349334716797,
"learning_rate": 4.699727620371513e-06,
"loss": 1.0056,
"step": 1882
},
{
"epoch": 0.9905312993161494,
"grad_norm": 2.156942844390869,
"learning_rate": 4.699397443507148e-06,
"loss": 1.0049,
"step": 1883
},
{
"epoch": 0.99105733824303,
"grad_norm": 2.065075159072876,
"learning_rate": 4.699067096824091e-06,
"loss": 0.9694,
"step": 1884
},
{
"epoch": 0.9915833771699105,
"grad_norm": 2.12490177154541,
"learning_rate": 4.698736580347845e-06,
"loss": 1.0268,
"step": 1885
},
{
"epoch": 0.9921094160967912,
"grad_norm": 2.039874792098999,
"learning_rate": 4.698405894103932e-06,
"loss": 1.0122,
"step": 1886
},
{
"epoch": 0.9926354550236718,
"grad_norm": 2.0004734992980957,
"learning_rate": 4.698075038117884e-06,
"loss": 0.9996,
"step": 1887
},
{
"epoch": 0.9931614939505523,
"grad_norm": 1.996697187423706,
"learning_rate": 4.697744012415248e-06,
"loss": 1.0658,
"step": 1888
},
{
"epoch": 0.9936875328774329,
"grad_norm": 1.9783189296722412,
"learning_rate": 4.69741281702158e-06,
"loss": 0.9799,
"step": 1889
},
{
"epoch": 0.9942135718043135,
"grad_norm": 2.054898738861084,
"learning_rate": 4.697081451962456e-06,
"loss": 1.0302,
"step": 1890
},
{
"epoch": 0.9947396107311941,
"grad_norm": 1.953337550163269,
"learning_rate": 4.696749917263458e-06,
"loss": 0.9634,
"step": 1891
},
{
"epoch": 0.9952656496580747,
"grad_norm": 2.6126086711883545,
"learning_rate": 4.6964182129501855e-06,
"loss": 0.9659,
"step": 1892
},
{
"epoch": 0.9957916885849553,
"grad_norm": 1.931026816368103,
"learning_rate": 4.69608633904825e-06,
"loss": 1.0456,
"step": 1893
},
{
"epoch": 0.9963177275118359,
"grad_norm": 1.9246487617492676,
"learning_rate": 4.695754295583276e-06,
"loss": 1.0057,
"step": 1894
},
{
"epoch": 0.9968437664387164,
"grad_norm": 1.9731547832489014,
"learning_rate": 4.695422082580901e-06,
"loss": 0.9619,
"step": 1895
},
{
"epoch": 0.997369805365597,
"grad_norm": 2.1975600719451904,
"learning_rate": 4.695089700066776e-06,
"loss": 0.9667,
"step": 1896
},
{
"epoch": 0.9978958442924777,
"grad_norm": 1.9038164615631104,
"learning_rate": 4.6947571480665636e-06,
"loss": 0.9564,
"step": 1897
},
{
"epoch": 0.9984218832193582,
"grad_norm": 1.9997332096099854,
"learning_rate": 4.694424426605942e-06,
"loss": 0.9717,
"step": 1898
},
{
"epoch": 0.9989479221462388,
"grad_norm": 2.0790839195251465,
"learning_rate": 4.6940915357106e-06,
"loss": 1.044,
"step": 1899
},
{
"epoch": 0.9994739610731194,
"grad_norm": 2.0779690742492676,
"learning_rate": 4.693758475406241e-06,
"loss": 1.052,
"step": 1900
},
{
"epoch": 1.0,
"grad_norm": 2.3423078060150146,
"learning_rate": 4.693425245718581e-06,
"loss": 0.9887,
"step": 1901
}
],
"logging_steps": 1,
"max_steps": 11406,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1901,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.801364251367178e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}