{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5959253603486163,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002979626801743082,
"grad_norm": 3.1244261264801025,
"learning_rate": 0.0,
"loss": 1.0791,
"step": 1
},
{
"epoch": 0.0005959253603486164,
"grad_norm": 3.44380259513855,
"learning_rate": 2.9761904761904764e-08,
"loss": 1.1166,
"step": 2
},
{
"epoch": 0.0008938880405229246,
"grad_norm": 3.0464375019073486,
"learning_rate": 5.952380952380953e-08,
"loss": 1.0773,
"step": 3
},
{
"epoch": 0.0011918507206972327,
"grad_norm": 3.4371163845062256,
"learning_rate": 8.928571428571429e-08,
"loss": 1.1197,
"step": 4
},
{
"epoch": 0.0014898134008715408,
"grad_norm": 3.463773727416992,
"learning_rate": 1.1904761904761906e-07,
"loss": 1.0671,
"step": 5
},
{
"epoch": 0.001787776081045849,
"grad_norm": 3.2587881088256836,
"learning_rate": 1.4880952380952382e-07,
"loss": 1.044,
"step": 6
},
{
"epoch": 0.002085738761220157,
"grad_norm": 3.1563422679901123,
"learning_rate": 1.7857142857142858e-07,
"loss": 1.067,
"step": 7
},
{
"epoch": 0.0023837014413944655,
"grad_norm": 3.1642303466796875,
"learning_rate": 2.0833333333333333e-07,
"loss": 1.1304,
"step": 8
},
{
"epoch": 0.0026816641215687735,
"grad_norm": 3.3693394660949707,
"learning_rate": 2.3809523809523811e-07,
"loss": 1.1305,
"step": 9
},
{
"epoch": 0.0029796268017430816,
"grad_norm": 3.046869993209839,
"learning_rate": 2.6785714285714284e-07,
"loss": 1.0495,
"step": 10
},
{
"epoch": 0.0032775894819173897,
"grad_norm": 3.0591135025024414,
"learning_rate": 2.9761904761904765e-07,
"loss": 1.079,
"step": 11
},
{
"epoch": 0.003575552162091698,
"grad_norm": 3.081220865249634,
"learning_rate": 3.273809523809524e-07,
"loss": 1.0133,
"step": 12
},
{
"epoch": 0.0038735148422660063,
"grad_norm": 2.966681718826294,
"learning_rate": 3.5714285714285716e-07,
"loss": 1.0237,
"step": 13
},
{
"epoch": 0.004171477522440314,
"grad_norm": 3.099379777908325,
"learning_rate": 3.8690476190476196e-07,
"loss": 1.1378,
"step": 14
},
{
"epoch": 0.004469440202614623,
"grad_norm": 3.090646505355835,
"learning_rate": 4.1666666666666667e-07,
"loss": 1.1272,
"step": 15
},
{
"epoch": 0.004767402882788931,
"grad_norm": 2.8938851356506348,
"learning_rate": 4.4642857142857147e-07,
"loss": 1.0334,
"step": 16
},
{
"epoch": 0.005065365562963239,
"grad_norm": 3.3159046173095703,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.1159,
"step": 17
},
{
"epoch": 0.005363328243137547,
"grad_norm": 2.672910451889038,
"learning_rate": 5.05952380952381e-07,
"loss": 1.044,
"step": 18
},
{
"epoch": 0.005661290923311855,
"grad_norm": 2.416795015335083,
"learning_rate": 5.357142857142857e-07,
"loss": 1.0055,
"step": 19
},
{
"epoch": 0.005959253603486163,
"grad_norm": 2.769660472869873,
"learning_rate": 5.654761904761905e-07,
"loss": 1.0578,
"step": 20
},
{
"epoch": 0.006257216283660471,
"grad_norm": 3.1771910190582275,
"learning_rate": 5.952380952380953e-07,
"loss": 1.0242,
"step": 21
},
{
"epoch": 0.006555178963834779,
"grad_norm": 2.6926465034484863,
"learning_rate": 6.25e-07,
"loss": 1.0106,
"step": 22
},
{
"epoch": 0.0068531416440090875,
"grad_norm": 2.533829689025879,
"learning_rate": 6.547619047619048e-07,
"loss": 1.0589,
"step": 23
},
{
"epoch": 0.007151104324183396,
"grad_norm": 2.9328370094299316,
"learning_rate": 6.845238095238097e-07,
"loss": 1.0738,
"step": 24
},
{
"epoch": 0.0074490670043577045,
"grad_norm": 2.1759138107299805,
"learning_rate": 7.142857142857143e-07,
"loss": 1.0317,
"step": 25
},
{
"epoch": 0.0077470296845320126,
"grad_norm": 1.8581618070602417,
"learning_rate": 7.440476190476191e-07,
"loss": 1.0521,
"step": 26
},
{
"epoch": 0.00804499236470632,
"grad_norm": 1.785060167312622,
"learning_rate": 7.738095238095239e-07,
"loss": 1.0483,
"step": 27
},
{
"epoch": 0.008342955044880628,
"grad_norm": 1.8366209268569946,
"learning_rate": 8.035714285714287e-07,
"loss": 1.0467,
"step": 28
},
{
"epoch": 0.008640917725054938,
"grad_norm": 1.7094985246658325,
"learning_rate": 8.333333333333333e-07,
"loss": 1.08,
"step": 29
},
{
"epoch": 0.008938880405229246,
"grad_norm": 1.4586992263793945,
"learning_rate": 8.630952380952382e-07,
"loss": 0.9621,
"step": 30
},
{
"epoch": 0.009236843085403554,
"grad_norm": 1.654887080192566,
"learning_rate": 8.928571428571429e-07,
"loss": 1.0169,
"step": 31
},
{
"epoch": 0.009534805765577862,
"grad_norm": 1.4168285131454468,
"learning_rate": 9.226190476190477e-07,
"loss": 0.9862,
"step": 32
},
{
"epoch": 0.00983276844575217,
"grad_norm": 1.4141634702682495,
"learning_rate": 9.523809523809525e-07,
"loss": 0.9741,
"step": 33
},
{
"epoch": 0.010130731125926478,
"grad_norm": 1.186896562576294,
"learning_rate": 9.821428571428572e-07,
"loss": 0.9686,
"step": 34
},
{
"epoch": 0.010428693806100786,
"grad_norm": 1.0775161981582642,
"learning_rate": 1.011904761904762e-06,
"loss": 0.9944,
"step": 35
},
{
"epoch": 0.010726656486275094,
"grad_norm": 1.0778316259384155,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.9942,
"step": 36
},
{
"epoch": 0.011024619166449402,
"grad_norm": 1.0128413438796997,
"learning_rate": 1.0714285714285714e-06,
"loss": 0.9727,
"step": 37
},
{
"epoch": 0.01132258184662371,
"grad_norm": 1.043727159500122,
"learning_rate": 1.1011904761904762e-06,
"loss": 1.0207,
"step": 38
},
{
"epoch": 0.011620544526798018,
"grad_norm": 1.0442272424697876,
"learning_rate": 1.130952380952381e-06,
"loss": 1.0429,
"step": 39
},
{
"epoch": 0.011918507206972326,
"grad_norm": 0.8829795718193054,
"learning_rate": 1.160714285714286e-06,
"loss": 0.938,
"step": 40
},
{
"epoch": 0.012216469887146635,
"grad_norm": 0.8920165300369263,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.9499,
"step": 41
},
{
"epoch": 0.012514432567320943,
"grad_norm": 0.8303807973861694,
"learning_rate": 1.2202380952380952e-06,
"loss": 1.0088,
"step": 42
},
{
"epoch": 0.01281239524749525,
"grad_norm": 0.7994042038917542,
"learning_rate": 1.25e-06,
"loss": 0.9187,
"step": 43
},
{
"epoch": 0.013110357927669559,
"grad_norm": 0.7477161884307861,
"learning_rate": 1.2797619047619048e-06,
"loss": 0.9482,
"step": 44
},
{
"epoch": 0.013408320607843867,
"grad_norm": 0.75196373462677,
"learning_rate": 1.3095238095238096e-06,
"loss": 1.0033,
"step": 45
},
{
"epoch": 0.013706283288018175,
"grad_norm": 0.6549458503723145,
"learning_rate": 1.3392857142857143e-06,
"loss": 0.9267,
"step": 46
},
{
"epoch": 0.014004245968192485,
"grad_norm": 0.6505201458930969,
"learning_rate": 1.3690476190476193e-06,
"loss": 0.9653,
"step": 47
},
{
"epoch": 0.014302208648366793,
"grad_norm": 0.6912044286727905,
"learning_rate": 1.398809523809524e-06,
"loss": 0.9638,
"step": 48
},
{
"epoch": 0.014600171328541101,
"grad_norm": 0.7041192650794983,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.95,
"step": 49
},
{
"epoch": 0.014898134008715409,
"grad_norm": 0.7128690481185913,
"learning_rate": 1.4583333333333335e-06,
"loss": 0.9374,
"step": 50
},
{
"epoch": 0.015196096688889717,
"grad_norm": 0.7611543536186218,
"learning_rate": 1.4880952380952381e-06,
"loss": 0.9481,
"step": 51
},
{
"epoch": 0.015494059369064025,
"grad_norm": 0.7015216946601868,
"learning_rate": 1.5178571428571428e-06,
"loss": 0.9783,
"step": 52
},
{
"epoch": 0.01579202204923833,
"grad_norm": 0.6188274621963501,
"learning_rate": 1.5476190476190479e-06,
"loss": 0.9205,
"step": 53
},
{
"epoch": 0.01608998472941264,
"grad_norm": 0.5838772058486938,
"learning_rate": 1.5773809523809525e-06,
"loss": 0.8994,
"step": 54
},
{
"epoch": 0.016387947409586948,
"grad_norm": 0.5457233786582947,
"learning_rate": 1.6071428571428574e-06,
"loss": 0.9036,
"step": 55
},
{
"epoch": 0.016685910089761256,
"grad_norm": 0.5556498765945435,
"learning_rate": 1.636904761904762e-06,
"loss": 0.9701,
"step": 56
},
{
"epoch": 0.016983872769935567,
"grad_norm": 0.5316476821899414,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.9581,
"step": 57
},
{
"epoch": 0.017281835450109875,
"grad_norm": 0.5063617825508118,
"learning_rate": 1.6964285714285717e-06,
"loss": 0.925,
"step": 58
},
{
"epoch": 0.017579798130284183,
"grad_norm": 0.5297480225563049,
"learning_rate": 1.7261904761904764e-06,
"loss": 0.9422,
"step": 59
},
{
"epoch": 0.01787776081045849,
"grad_norm": 0.6286386251449585,
"learning_rate": 1.755952380952381e-06,
"loss": 0.9976,
"step": 60
},
{
"epoch": 0.0181757234906328,
"grad_norm": 0.6573434472084045,
"learning_rate": 1.7857142857142859e-06,
"loss": 0.9702,
"step": 61
},
{
"epoch": 0.018473686170807108,
"grad_norm": 0.5362038016319275,
"learning_rate": 1.8154761904761905e-06,
"loss": 0.9038,
"step": 62
},
{
"epoch": 0.018771648850981416,
"grad_norm": 0.5523419976234436,
"learning_rate": 1.8452380952380954e-06,
"loss": 0.9804,
"step": 63
},
{
"epoch": 0.019069611531155724,
"grad_norm": 0.5619953870773315,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.8545,
"step": 64
},
{
"epoch": 0.019367574211330032,
"grad_norm": 0.5033801794052124,
"learning_rate": 1.904761904761905e-06,
"loss": 0.9159,
"step": 65
},
{
"epoch": 0.01966553689150434,
"grad_norm": 0.512747585773468,
"learning_rate": 1.9345238095238096e-06,
"loss": 0.9277,
"step": 66
},
{
"epoch": 0.019963499571678648,
"grad_norm": 0.4727635681629181,
"learning_rate": 1.9642857142857144e-06,
"loss": 0.9122,
"step": 67
},
{
"epoch": 0.020261462251852956,
"grad_norm": 0.4830302596092224,
"learning_rate": 1.9940476190476193e-06,
"loss": 0.898,
"step": 68
},
{
"epoch": 0.020559424932027264,
"grad_norm": 0.487798273563385,
"learning_rate": 2.023809523809524e-06,
"loss": 0.9453,
"step": 69
},
{
"epoch": 0.020857387612201572,
"grad_norm": 0.4499029219150543,
"learning_rate": 2.0535714285714286e-06,
"loss": 0.8866,
"step": 70
},
{
"epoch": 0.02115535029237588,
"grad_norm": 0.4831499755382538,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.9475,
"step": 71
},
{
"epoch": 0.02145331297255019,
"grad_norm": 0.47613057494163513,
"learning_rate": 2.1130952380952383e-06,
"loss": 0.9084,
"step": 72
},
{
"epoch": 0.021751275652724496,
"grad_norm": 0.4876677393913269,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.9322,
"step": 73
},
{
"epoch": 0.022049238332898805,
"grad_norm": 0.46994251012802124,
"learning_rate": 2.172619047619048e-06,
"loss": 0.8968,
"step": 74
},
{
"epoch": 0.022347201013073113,
"grad_norm": 0.470572292804718,
"learning_rate": 2.2023809523809525e-06,
"loss": 0.934,
"step": 75
},
{
"epoch": 0.02264516369324742,
"grad_norm": 0.4928075969219208,
"learning_rate": 2.2321428571428573e-06,
"loss": 0.9155,
"step": 76
},
{
"epoch": 0.02294312637342173,
"grad_norm": 0.47621583938598633,
"learning_rate": 2.261904761904762e-06,
"loss": 0.9037,
"step": 77
},
{
"epoch": 0.023241089053596037,
"grad_norm": 0.47395530343055725,
"learning_rate": 2.2916666666666666e-06,
"loss": 0.9576,
"step": 78
},
{
"epoch": 0.023539051733770345,
"grad_norm": 0.47508686780929565,
"learning_rate": 2.321428571428572e-06,
"loss": 0.9182,
"step": 79
},
{
"epoch": 0.023837014413944653,
"grad_norm": 0.45137453079223633,
"learning_rate": 2.3511904761904763e-06,
"loss": 0.8932,
"step": 80
},
{
"epoch": 0.02413497709411896,
"grad_norm": 0.43199121952056885,
"learning_rate": 2.380952380952381e-06,
"loss": 0.8907,
"step": 81
},
{
"epoch": 0.02443293977429327,
"grad_norm": 0.4940590262413025,
"learning_rate": 2.410714285714286e-06,
"loss": 0.9687,
"step": 82
},
{
"epoch": 0.024730902454467577,
"grad_norm": 0.4669753611087799,
"learning_rate": 2.4404761904761905e-06,
"loss": 0.9435,
"step": 83
},
{
"epoch": 0.025028865134641885,
"grad_norm": 0.45019495487213135,
"learning_rate": 2.4702380952380953e-06,
"loss": 0.8945,
"step": 84
},
{
"epoch": 0.025326827814816193,
"grad_norm": 0.45232030749320984,
"learning_rate": 2.5e-06,
"loss": 0.9446,
"step": 85
},
{
"epoch": 0.0256247904949905,
"grad_norm": 0.42479407787323,
"learning_rate": 2.529761904761905e-06,
"loss": 0.832,
"step": 86
},
{
"epoch": 0.02592275317516481,
"grad_norm": 0.47908833622932434,
"learning_rate": 2.5595238095238095e-06,
"loss": 0.9384,
"step": 87
},
{
"epoch": 0.026220715855339118,
"grad_norm": 0.49192699790000916,
"learning_rate": 2.5892857142857148e-06,
"loss": 0.9669,
"step": 88
},
{
"epoch": 0.026518678535513426,
"grad_norm": 0.43540194630622864,
"learning_rate": 2.6190476190476192e-06,
"loss": 0.9107,
"step": 89
},
{
"epoch": 0.026816641215687734,
"grad_norm": 0.4820593595504761,
"learning_rate": 2.648809523809524e-06,
"loss": 0.9847,
"step": 90
},
{
"epoch": 0.027114603895862042,
"grad_norm": 0.46626460552215576,
"learning_rate": 2.6785714285714285e-06,
"loss": 0.9219,
"step": 91
},
{
"epoch": 0.02741256657603635,
"grad_norm": 0.47896644473075867,
"learning_rate": 2.7083333333333334e-06,
"loss": 0.9815,
"step": 92
},
{
"epoch": 0.027710529256210658,
"grad_norm": 0.46742215752601624,
"learning_rate": 2.7380952380952387e-06,
"loss": 0.8771,
"step": 93
},
{
"epoch": 0.02800849193638497,
"grad_norm": 0.44341200590133667,
"learning_rate": 2.767857142857143e-06,
"loss": 0.9431,
"step": 94
},
{
"epoch": 0.028306454616559278,
"grad_norm": 0.463361918926239,
"learning_rate": 2.797619047619048e-06,
"loss": 0.9401,
"step": 95
},
{
"epoch": 0.028604417296733586,
"grad_norm": 0.453514039516449,
"learning_rate": 2.8273809523809524e-06,
"loss": 0.8765,
"step": 96
},
{
"epoch": 0.028902379976907894,
"grad_norm": 0.4397393465042114,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.8968,
"step": 97
},
{
"epoch": 0.029200342657082202,
"grad_norm": 0.4268377423286438,
"learning_rate": 2.8869047619047617e-06,
"loss": 0.8797,
"step": 98
},
{
"epoch": 0.02949830533725651,
"grad_norm": 0.47780126333236694,
"learning_rate": 2.916666666666667e-06,
"loss": 0.9006,
"step": 99
},
{
"epoch": 0.029796268017430818,
"grad_norm": 0.4212696850299835,
"learning_rate": 2.946428571428572e-06,
"loss": 0.9118,
"step": 100
},
{
"epoch": 0.030094230697605126,
"grad_norm": 0.4879840016365051,
"learning_rate": 2.9761904761904763e-06,
"loss": 0.9467,
"step": 101
},
{
"epoch": 0.030392193377779434,
"grad_norm": 0.4617227613925934,
"learning_rate": 3.005952380952381e-06,
"loss": 0.8902,
"step": 102
},
{
"epoch": 0.030690156057953742,
"grad_norm": 0.44039371609687805,
"learning_rate": 3.0357142857142856e-06,
"loss": 0.9228,
"step": 103
},
{
"epoch": 0.03098811873812805,
"grad_norm": 0.4614720344543457,
"learning_rate": 3.065476190476191e-06,
"loss": 0.971,
"step": 104
},
{
"epoch": 0.03128608141830236,
"grad_norm": 0.4567113220691681,
"learning_rate": 3.0952380952380957e-06,
"loss": 0.9025,
"step": 105
},
{
"epoch": 0.03158404409847666,
"grad_norm": 0.4666142761707306,
"learning_rate": 3.125e-06,
"loss": 0.9523,
"step": 106
},
{
"epoch": 0.031882006778650974,
"grad_norm": 0.429106742143631,
"learning_rate": 3.154761904761905e-06,
"loss": 0.9029,
"step": 107
},
{
"epoch": 0.03217996945882528,
"grad_norm": 0.421153724193573,
"learning_rate": 3.1845238095238094e-06,
"loss": 0.8562,
"step": 108
},
{
"epoch": 0.03247793213899959,
"grad_norm": 0.428710013628006,
"learning_rate": 3.2142857142857147e-06,
"loss": 0.8686,
"step": 109
},
{
"epoch": 0.032775894819173895,
"grad_norm": 0.4266265630722046,
"learning_rate": 3.2440476190476196e-06,
"loss": 0.9008,
"step": 110
},
{
"epoch": 0.03307385749934821,
"grad_norm": 0.4408479928970337,
"learning_rate": 3.273809523809524e-06,
"loss": 0.8632,
"step": 111
},
{
"epoch": 0.03337182017952251,
"grad_norm": 0.43926018476486206,
"learning_rate": 3.303571428571429e-06,
"loss": 0.8777,
"step": 112
},
{
"epoch": 0.03366978285969682,
"grad_norm": 0.44635215401649475,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.9039,
"step": 113
},
{
"epoch": 0.033967745539871134,
"grad_norm": 0.4410278797149658,
"learning_rate": 3.3630952380952386e-06,
"loss": 0.9095,
"step": 114
},
{
"epoch": 0.03426570822004544,
"grad_norm": 0.4349464476108551,
"learning_rate": 3.3928571428571435e-06,
"loss": 0.9068,
"step": 115
},
{
"epoch": 0.03456367090021975,
"grad_norm": 0.4464796185493469,
"learning_rate": 3.422619047619048e-06,
"loss": 0.8984,
"step": 116
},
{
"epoch": 0.034861633580394055,
"grad_norm": 0.440498024225235,
"learning_rate": 3.4523809523809528e-06,
"loss": 0.8895,
"step": 117
},
{
"epoch": 0.03515959626056837,
"grad_norm": 0.4387143552303314,
"learning_rate": 3.482142857142857e-06,
"loss": 0.8553,
"step": 118
},
{
"epoch": 0.03545755894074267,
"grad_norm": 0.4271283745765686,
"learning_rate": 3.511904761904762e-06,
"loss": 0.9052,
"step": 119
},
{
"epoch": 0.03575552162091698,
"grad_norm": 0.45256221294403076,
"learning_rate": 3.5416666666666673e-06,
"loss": 0.9058,
"step": 120
},
{
"epoch": 0.03605348430109129,
"grad_norm": 0.47053638100624084,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.9059,
"step": 121
},
{
"epoch": 0.0363514469812656,
"grad_norm": 0.45863860845565796,
"learning_rate": 3.6011904761904766e-06,
"loss": 0.9446,
"step": 122
},
{
"epoch": 0.036649409661439904,
"grad_norm": 0.41932588815689087,
"learning_rate": 3.630952380952381e-06,
"loss": 0.8577,
"step": 123
},
{
"epoch": 0.036947372341614215,
"grad_norm": 0.45671793818473816,
"learning_rate": 3.660714285714286e-06,
"loss": 0.8999,
"step": 124
},
{
"epoch": 0.03724533502178852,
"grad_norm": 0.46893569827079773,
"learning_rate": 3.690476190476191e-06,
"loss": 0.9324,
"step": 125
},
{
"epoch": 0.03754329770196283,
"grad_norm": 0.4448380470275879,
"learning_rate": 3.7202380952380957e-06,
"loss": 0.9114,
"step": 126
},
{
"epoch": 0.037841260382137136,
"grad_norm": 0.4434012174606323,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.8897,
"step": 127
},
{
"epoch": 0.03813922306231145,
"grad_norm": 0.4325112998485565,
"learning_rate": 3.779761904761905e-06,
"loss": 0.8819,
"step": 128
},
{
"epoch": 0.03843718574248575,
"grad_norm": 0.43405723571777344,
"learning_rate": 3.80952380952381e-06,
"loss": 0.8718,
"step": 129
},
{
"epoch": 0.038735148422660064,
"grad_norm": 0.4202880859375,
"learning_rate": 3.839285714285715e-06,
"loss": 0.9105,
"step": 130
},
{
"epoch": 0.03903311110283437,
"grad_norm": 0.4502437710762024,
"learning_rate": 3.869047619047619e-06,
"loss": 0.8891,
"step": 131
},
{
"epoch": 0.03933107378300868,
"grad_norm": 0.45120131969451904,
"learning_rate": 3.898809523809524e-06,
"loss": 0.8806,
"step": 132
},
{
"epoch": 0.039629036463182984,
"grad_norm": 0.45555320382118225,
"learning_rate": 3.928571428571429e-06,
"loss": 0.9092,
"step": 133
},
{
"epoch": 0.039926999143357296,
"grad_norm": 0.47341638803482056,
"learning_rate": 3.958333333333333e-06,
"loss": 0.89,
"step": 134
},
{
"epoch": 0.0402249618235316,
"grad_norm": 0.44647476077079773,
"learning_rate": 3.9880952380952386e-06,
"loss": 0.8783,
"step": 135
},
{
"epoch": 0.04052292450370591,
"grad_norm": 0.44241106510162354,
"learning_rate": 4.017857142857143e-06,
"loss": 0.9057,
"step": 136
},
{
"epoch": 0.04082088718388022,
"grad_norm": 0.44842639565467834,
"learning_rate": 4.047619047619048e-06,
"loss": 0.8812,
"step": 137
},
{
"epoch": 0.04111884986405453,
"grad_norm": 0.4141370952129364,
"learning_rate": 4.077380952380953e-06,
"loss": 0.8659,
"step": 138
},
{
"epoch": 0.04141681254422883,
"grad_norm": 0.4200192093849182,
"learning_rate": 4.107142857142857e-06,
"loss": 0.8981,
"step": 139
},
{
"epoch": 0.041714775224403144,
"grad_norm": 0.42679327726364136,
"learning_rate": 4.136904761904762e-06,
"loss": 0.8819,
"step": 140
},
{
"epoch": 0.04201273790457745,
"grad_norm": 0.4665552079677582,
"learning_rate": 4.166666666666667e-06,
"loss": 0.905,
"step": 141
},
{
"epoch": 0.04231070058475176,
"grad_norm": 0.44266387820243835,
"learning_rate": 4.196428571428572e-06,
"loss": 0.9064,
"step": 142
},
{
"epoch": 0.042608663264926065,
"grad_norm": 0.47972339391708374,
"learning_rate": 4.226190476190477e-06,
"loss": 0.9095,
"step": 143
},
{
"epoch": 0.04290662594510038,
"grad_norm": 0.45169147849082947,
"learning_rate": 4.255952380952381e-06,
"loss": 0.9141,
"step": 144
},
{
"epoch": 0.04320458862527468,
"grad_norm": 0.42670682072639465,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.8293,
"step": 145
},
{
"epoch": 0.04350255130544899,
"grad_norm": 0.44024792313575745,
"learning_rate": 4.315476190476191e-06,
"loss": 0.8798,
"step": 146
},
{
"epoch": 0.0438005139856233,
"grad_norm": 0.4199204444885254,
"learning_rate": 4.345238095238096e-06,
"loss": 0.8516,
"step": 147
},
{
"epoch": 0.04409847666579761,
"grad_norm": 0.4429978132247925,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.909,
"step": 148
},
{
"epoch": 0.044396439345971914,
"grad_norm": 0.44615527987480164,
"learning_rate": 4.404761904761905e-06,
"loss": 0.8753,
"step": 149
},
{
"epoch": 0.044694402026146225,
"grad_norm": 0.4358450472354889,
"learning_rate": 4.434523809523809e-06,
"loss": 0.8705,
"step": 150
},
{
"epoch": 0.04499236470632054,
"grad_norm": 0.4348714351654053,
"learning_rate": 4.464285714285715e-06,
"loss": 0.9207,
"step": 151
},
{
"epoch": 0.04529032738649484,
"grad_norm": 0.4443501830101013,
"learning_rate": 4.49404761904762e-06,
"loss": 0.8558,
"step": 152
},
{
"epoch": 0.04558829006666915,
"grad_norm": 0.4495562016963959,
"learning_rate": 4.523809523809524e-06,
"loss": 0.9077,
"step": 153
},
{
"epoch": 0.04588625274684346,
"grad_norm": 0.4733486771583557,
"learning_rate": 4.553571428571429e-06,
"loss": 0.9129,
"step": 154
},
{
"epoch": 0.04618421542701777,
"grad_norm": 0.44653400778770447,
"learning_rate": 4.583333333333333e-06,
"loss": 0.9177,
"step": 155
},
{
"epoch": 0.046482178107192074,
"grad_norm": 0.4165500998497009,
"learning_rate": 4.6130952380952385e-06,
"loss": 0.8254,
"step": 156
},
{
"epoch": 0.046780140787366385,
"grad_norm": 0.43224039673805237,
"learning_rate": 4.642857142857144e-06,
"loss": 0.8474,
"step": 157
},
{
"epoch": 0.04707810346754069,
"grad_norm": 0.41379058361053467,
"learning_rate": 4.672619047619048e-06,
"loss": 0.8491,
"step": 158
},
{
"epoch": 0.047376066147715,
"grad_norm": 0.46250954270362854,
"learning_rate": 4.702380952380953e-06,
"loss": 0.8944,
"step": 159
},
{
"epoch": 0.047674028827889306,
"grad_norm": 0.4260331392288208,
"learning_rate": 4.732142857142857e-06,
"loss": 0.8812,
"step": 160
},
{
"epoch": 0.04797199150806362,
"grad_norm": 0.7473785281181335,
"learning_rate": 4.761904761904762e-06,
"loss": 0.8824,
"step": 161
},
{
"epoch": 0.04826995418823792,
"grad_norm": 0.4553568959236145,
"learning_rate": 4.791666666666668e-06,
"loss": 0.8871,
"step": 162
},
{
"epoch": 0.048567916868412234,
"grad_norm": 0.44239333271980286,
"learning_rate": 4.821428571428572e-06,
"loss": 0.8757,
"step": 163
},
{
"epoch": 0.04886587954858654,
"grad_norm": 0.5126583576202393,
"learning_rate": 4.8511904761904765e-06,
"loss": 0.9648,
"step": 164
},
{
"epoch": 0.04916384222876085,
"grad_norm": 0.4421823024749756,
"learning_rate": 4.880952380952381e-06,
"loss": 0.8868,
"step": 165
},
{
"epoch": 0.049461804908935154,
"grad_norm": 0.4590631425380707,
"learning_rate": 4.910714285714286e-06,
"loss": 0.8645,
"step": 166
},
{
"epoch": 0.049759767589109466,
"grad_norm": 0.43286260962486267,
"learning_rate": 4.940476190476191e-06,
"loss": 0.9016,
"step": 167
},
{
"epoch": 0.05005773026928377,
"grad_norm": 0.44442832469940186,
"learning_rate": 4.970238095238096e-06,
"loss": 0.884,
"step": 168
},
{
"epoch": 0.05035569294945808,
"grad_norm": 0.4276241958141327,
"learning_rate": 5e-06,
"loss": 0.8364,
"step": 169
},
{
"epoch": 0.05065365562963239,
"grad_norm": 0.41683584451675415,
"learning_rate": 5.029761904761905e-06,
"loss": 0.8192,
"step": 170
},
{
"epoch": 0.0509516183098067,
"grad_norm": 0.4456952214241028,
"learning_rate": 5.05952380952381e-06,
"loss": 0.9499,
"step": 171
},
{
"epoch": 0.051249580989981,
"grad_norm": 0.44173622131347656,
"learning_rate": 5.0892857142857146e-06,
"loss": 0.8524,
"step": 172
},
{
"epoch": 0.051547543670155314,
"grad_norm": 0.4558558762073517,
"learning_rate": 5.119047619047619e-06,
"loss": 0.9077,
"step": 173
},
{
"epoch": 0.05184550635032962,
"grad_norm": 0.42778849601745605,
"learning_rate": 5.1488095238095234e-06,
"loss": 0.8791,
"step": 174
},
{
"epoch": 0.05214346903050393,
"grad_norm": 0.44972923398017883,
"learning_rate": 5.1785714285714296e-06,
"loss": 0.8228,
"step": 175
},
{
"epoch": 0.052441431710678235,
"grad_norm": 0.4365461766719818,
"learning_rate": 5.208333333333334e-06,
"loss": 0.8482,
"step": 176
},
{
"epoch": 0.05273939439085255,
"grad_norm": 0.44673120975494385,
"learning_rate": 5.2380952380952384e-06,
"loss": 0.8984,
"step": 177
},
{
"epoch": 0.05303735707102685,
"grad_norm": 0.4192390441894531,
"learning_rate": 5.267857142857144e-06,
"loss": 0.8376,
"step": 178
},
{
"epoch": 0.05333531975120116,
"grad_norm": 0.4381980895996094,
"learning_rate": 5.297619047619048e-06,
"loss": 0.8781,
"step": 179
},
{
"epoch": 0.05363328243137547,
"grad_norm": 0.4356313645839691,
"learning_rate": 5.327380952380953e-06,
"loss": 0.8816,
"step": 180
},
{
"epoch": 0.05393124511154978,
"grad_norm": 0.44520509243011475,
"learning_rate": 5.357142857142857e-06,
"loss": 0.8691,
"step": 181
},
{
"epoch": 0.054229207791724084,
"grad_norm": 0.44160404801368713,
"learning_rate": 5.386904761904762e-06,
"loss": 0.8315,
"step": 182
},
{
"epoch": 0.054527170471898395,
"grad_norm": 0.4544958472251892,
"learning_rate": 5.416666666666667e-06,
"loss": 0.8707,
"step": 183
},
{
"epoch": 0.0548251331520727,
"grad_norm": 0.44830402731895447,
"learning_rate": 5.446428571428571e-06,
"loss": 0.8906,
"step": 184
},
{
"epoch": 0.05512309583224701,
"grad_norm": 0.4348909258842468,
"learning_rate": 5.476190476190477e-06,
"loss": 0.8344,
"step": 185
},
{
"epoch": 0.055421058512421316,
"grad_norm": 0.43994957208633423,
"learning_rate": 5.505952380952382e-06,
"loss": 0.8617,
"step": 186
},
{
"epoch": 0.05571902119259563,
"grad_norm": 0.42906102538108826,
"learning_rate": 5.535714285714286e-06,
"loss": 0.8581,
"step": 187
},
{
"epoch": 0.05601698387276994,
"grad_norm": 0.43206915259361267,
"learning_rate": 5.5654761904761915e-06,
"loss": 0.8104,
"step": 188
},
{
"epoch": 0.056314946552944244,
"grad_norm": 0.4183652698993683,
"learning_rate": 5.595238095238096e-06,
"loss": 0.845,
"step": 189
},
{
"epoch": 0.056612909233118555,
"grad_norm": 0.4581888020038605,
"learning_rate": 5.625e-06,
"loss": 0.8133,
"step": 190
},
{
"epoch": 0.05691087191329286,
"grad_norm": 0.4345604479312897,
"learning_rate": 5.654761904761905e-06,
"loss": 0.8912,
"step": 191
},
{
"epoch": 0.05720883459346717,
"grad_norm": 0.4140756130218506,
"learning_rate": 5.68452380952381e-06,
"loss": 0.8372,
"step": 192
},
{
"epoch": 0.057506797273641476,
"grad_norm": 0.4593667685985565,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.9017,
"step": 193
},
{
"epoch": 0.05780475995381579,
"grad_norm": 0.45131516456604004,
"learning_rate": 5.744047619047619e-06,
"loss": 0.8617,
"step": 194
},
{
"epoch": 0.05810272263399009,
"grad_norm": 0.46563607454299927,
"learning_rate": 5.773809523809523e-06,
"loss": 0.8698,
"step": 195
},
{
"epoch": 0.058400685314164404,
"grad_norm": 0.4295847415924072,
"learning_rate": 5.8035714285714295e-06,
"loss": 0.8934,
"step": 196
},
{
"epoch": 0.05869864799433871,
"grad_norm": 0.4323939085006714,
"learning_rate": 5.833333333333334e-06,
"loss": 0.8464,
"step": 197
},
{
"epoch": 0.05899661067451302,
"grad_norm": 0.43190956115722656,
"learning_rate": 5.863095238095239e-06,
"loss": 0.8173,
"step": 198
},
{
"epoch": 0.059294573354687324,
"grad_norm": 0.4357495605945587,
"learning_rate": 5.892857142857144e-06,
"loss": 0.8204,
"step": 199
},
{
"epoch": 0.059592536034861636,
"grad_norm": 0.4913332164287567,
"learning_rate": 5.922619047619048e-06,
"loss": 0.8507,
"step": 200
},
{
"epoch": 0.05989049871503594,
"grad_norm": 0.44041842222213745,
"learning_rate": 5.9523809523809525e-06,
"loss": 0.8634,
"step": 201
},
{
"epoch": 0.06018846139521025,
"grad_norm": 0.4505321681499481,
"learning_rate": 5.982142857142858e-06,
"loss": 0.8607,
"step": 202
},
{
"epoch": 0.06048642407538456,
"grad_norm": 0.43640005588531494,
"learning_rate": 6.011904761904762e-06,
"loss": 0.8371,
"step": 203
},
{
"epoch": 0.06078438675555887,
"grad_norm": 0.4439584016799927,
"learning_rate": 6.041666666666667e-06,
"loss": 0.8969,
"step": 204
},
{
"epoch": 0.06108234943573317,
"grad_norm": 0.4225601553916931,
"learning_rate": 6.071428571428571e-06,
"loss": 0.8679,
"step": 205
},
{
"epoch": 0.061380312115907484,
"grad_norm": 0.4633278250694275,
"learning_rate": 6.101190476190477e-06,
"loss": 0.8758,
"step": 206
},
{
"epoch": 0.06167827479608179,
"grad_norm": 0.45114865899086,
"learning_rate": 6.130952380952382e-06,
"loss": 0.8672,
"step": 207
},
{
"epoch": 0.0619762374762561,
"grad_norm": 0.43823346495628357,
"learning_rate": 6.160714285714286e-06,
"loss": 0.8154,
"step": 208
},
{
"epoch": 0.062274200156430405,
"grad_norm": 0.43499240279197693,
"learning_rate": 6.1904761904761914e-06,
"loss": 0.8114,
"step": 209
},
{
"epoch": 0.06257216283660472,
"grad_norm": 0.4600598216056824,
"learning_rate": 6.220238095238096e-06,
"loss": 0.8233,
"step": 210
},
{
"epoch": 0.06287012551677902,
"grad_norm": 0.45372816920280457,
"learning_rate": 6.25e-06,
"loss": 0.892,
"step": 211
},
{
"epoch": 0.06316808819695333,
"grad_norm": 0.4123630225658417,
"learning_rate": 6.279761904761906e-06,
"loss": 0.8195,
"step": 212
},
{
"epoch": 0.06346605087712764,
"grad_norm": 0.425900936126709,
"learning_rate": 6.30952380952381e-06,
"loss": 0.8401,
"step": 213
},
{
"epoch": 0.06376401355730195,
"grad_norm": 0.4495628774166107,
"learning_rate": 6.3392857142857145e-06,
"loss": 0.8707,
"step": 214
},
{
"epoch": 0.06406197623747625,
"grad_norm": 0.45770758390426636,
"learning_rate": 6.369047619047619e-06,
"loss": 0.884,
"step": 215
},
{
"epoch": 0.06435993891765056,
"grad_norm": 0.43433791399002075,
"learning_rate": 6.398809523809524e-06,
"loss": 0.8851,
"step": 216
},
{
"epoch": 0.06465790159782488,
"grad_norm": 0.45469143986701965,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.8428,
"step": 217
},
{
"epoch": 0.06495586427799918,
"grad_norm": 0.4268864095211029,
"learning_rate": 6.458333333333334e-06,
"loss": 0.8159,
"step": 218
},
{
"epoch": 0.06525382695817349,
"grad_norm": 0.45345690846443176,
"learning_rate": 6.488095238095239e-06,
"loss": 0.897,
"step": 219
},
{
"epoch": 0.06555178963834779,
"grad_norm": 0.4494103193283081,
"learning_rate": 6.517857142857144e-06,
"loss": 0.8642,
"step": 220
},
{
"epoch": 0.06584975231852211,
"grad_norm": 0.44220754504203796,
"learning_rate": 6.547619047619048e-06,
"loss": 0.8561,
"step": 221
},
{
"epoch": 0.06614771499869641,
"grad_norm": 0.4213166832923889,
"learning_rate": 6.5773809523809525e-06,
"loss": 0.8176,
"step": 222
},
{
"epoch": 0.06644567767887072,
"grad_norm": 0.41464102268218994,
"learning_rate": 6.607142857142858e-06,
"loss": 0.7852,
"step": 223
},
{
"epoch": 0.06674364035904502,
"grad_norm": 0.4477230906486511,
"learning_rate": 6.636904761904762e-06,
"loss": 0.8522,
"step": 224
},
{
"epoch": 0.06704160303921934,
"grad_norm": 0.4465818405151367,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8233,
"step": 225
},
{
"epoch": 0.06733956571939365,
"grad_norm": 0.44877326488494873,
"learning_rate": 6.696428571428571e-06,
"loss": 0.8299,
"step": 226
},
{
"epoch": 0.06763752839956795,
"grad_norm": 0.4402409791946411,
"learning_rate": 6.726190476190477e-06,
"loss": 0.8232,
"step": 227
},
{
"epoch": 0.06793549107974227,
"grad_norm": 0.42664897441864014,
"learning_rate": 6.755952380952382e-06,
"loss": 0.8381,
"step": 228
},
{
"epoch": 0.06823345375991657,
"grad_norm": 0.4469190835952759,
"learning_rate": 6.785714285714287e-06,
"loss": 0.8382,
"step": 229
},
{
"epoch": 0.06853141644009088,
"grad_norm": 0.4511435329914093,
"learning_rate": 6.815476190476191e-06,
"loss": 0.8829,
"step": 230
},
{
"epoch": 0.06882937912026518,
"grad_norm": 0.43204864859580994,
"learning_rate": 6.845238095238096e-06,
"loss": 0.8522,
"step": 231
},
{
"epoch": 0.0691273418004395,
"grad_norm": 0.4274302124977112,
"learning_rate": 6.875e-06,
"loss": 0.8728,
"step": 232
},
{
"epoch": 0.0694253044806138,
"grad_norm": 0.44673213362693787,
"learning_rate": 6.9047619047619055e-06,
"loss": 0.8657,
"step": 233
},
{
"epoch": 0.06972326716078811,
"grad_norm": 0.4676547944545746,
"learning_rate": 6.93452380952381e-06,
"loss": 0.9529,
"step": 234
},
{
"epoch": 0.07002122984096242,
"grad_norm": 0.4478471279144287,
"learning_rate": 6.964285714285714e-06,
"loss": 0.8192,
"step": 235
},
{
"epoch": 0.07031919252113673,
"grad_norm": 0.4769634008407593,
"learning_rate": 6.994047619047619e-06,
"loss": 0.8546,
"step": 236
},
{
"epoch": 0.07061715520131104,
"grad_norm": 0.4771283268928528,
"learning_rate": 7.023809523809524e-06,
"loss": 0.9309,
"step": 237
},
{
"epoch": 0.07091511788148534,
"grad_norm": 0.4384443163871765,
"learning_rate": 7.053571428571429e-06,
"loss": 0.8646,
"step": 238
},
{
"epoch": 0.07121308056165965,
"grad_norm": 0.44072097539901733,
"learning_rate": 7.083333333333335e-06,
"loss": 0.8803,
"step": 239
},
{
"epoch": 0.07151104324183397,
"grad_norm": 0.45785021781921387,
"learning_rate": 7.113095238095239e-06,
"loss": 0.8373,
"step": 240
},
{
"epoch": 0.07180900592200827,
"grad_norm": 0.431471586227417,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.8369,
"step": 241
},
{
"epoch": 0.07210696860218258,
"grad_norm": 0.41161608695983887,
"learning_rate": 7.172619047619048e-06,
"loss": 0.8022,
"step": 242
},
{
"epoch": 0.07240493128235688,
"grad_norm": 0.46845054626464844,
"learning_rate": 7.202380952380953e-06,
"loss": 0.8721,
"step": 243
},
{
"epoch": 0.0727028939625312,
"grad_norm": 0.43750834465026855,
"learning_rate": 7.232142857142858e-06,
"loss": 0.8554,
"step": 244
},
{
"epoch": 0.0730008566427055,
"grad_norm": 0.4418376088142395,
"learning_rate": 7.261904761904762e-06,
"loss": 0.7971,
"step": 245
},
{
"epoch": 0.07329881932287981,
"grad_norm": 0.467026948928833,
"learning_rate": 7.291666666666667e-06,
"loss": 0.8905,
"step": 246
},
{
"epoch": 0.07359678200305411,
"grad_norm": 0.4285699129104614,
"learning_rate": 7.321428571428572e-06,
"loss": 0.8548,
"step": 247
},
{
"epoch": 0.07389474468322843,
"grad_norm": 0.4519418478012085,
"learning_rate": 7.351190476190477e-06,
"loss": 0.8449,
"step": 248
},
{
"epoch": 0.07419270736340274,
"grad_norm": 0.4394064247608185,
"learning_rate": 7.380952380952382e-06,
"loss": 0.8235,
"step": 249
},
{
"epoch": 0.07449067004357704,
"grad_norm": 0.4810203015804291,
"learning_rate": 7.410714285714287e-06,
"loss": 0.901,
"step": 250
},
{
"epoch": 0.07478863272375134,
"grad_norm": 0.43417516350746155,
"learning_rate": 7.440476190476191e-06,
"loss": 0.8469,
"step": 251
},
{
"epoch": 0.07508659540392566,
"grad_norm": 0.44011834263801575,
"learning_rate": 7.470238095238096e-06,
"loss": 0.8403,
"step": 252
},
{
"epoch": 0.07538455808409997,
"grad_norm": 0.4219614863395691,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7862,
"step": 253
},
{
"epoch": 0.07568252076427427,
"grad_norm": 0.4581497311592102,
"learning_rate": 7.5297619047619055e-06,
"loss": 0.8715,
"step": 254
},
{
"epoch": 0.07598048344444858,
"grad_norm": 0.42541468143463135,
"learning_rate": 7.55952380952381e-06,
"loss": 0.8086,
"step": 255
},
{
"epoch": 0.0762784461246229,
"grad_norm": 0.4385511875152588,
"learning_rate": 7.589285714285714e-06,
"loss": 0.8344,
"step": 256
},
{
"epoch": 0.0765764088047972,
"grad_norm": 0.4721318185329437,
"learning_rate": 7.61904761904762e-06,
"loss": 0.9129,
"step": 257
},
{
"epoch": 0.0768743714849715,
"grad_norm": 0.4345463514328003,
"learning_rate": 7.648809523809523e-06,
"loss": 0.8777,
"step": 258
},
{
"epoch": 0.07717233416514581,
"grad_norm": 0.47514408826828003,
"learning_rate": 7.67857142857143e-06,
"loss": 0.8465,
"step": 259
},
{
"epoch": 0.07747029684532013,
"grad_norm": 0.4220348596572876,
"learning_rate": 7.708333333333334e-06,
"loss": 0.8236,
"step": 260
},
{
"epoch": 0.07776825952549443,
"grad_norm": 0.4870074689388275,
"learning_rate": 7.738095238095238e-06,
"loss": 0.8636,
"step": 261
},
{
"epoch": 0.07806622220566874,
"grad_norm": 0.44031956791877747,
"learning_rate": 7.767857142857144e-06,
"loss": 0.8036,
"step": 262
},
{
"epoch": 0.07836418488584306,
"grad_norm": 0.46993595361709595,
"learning_rate": 7.797619047619049e-06,
"loss": 0.8744,
"step": 263
},
{
"epoch": 0.07866214756601736,
"grad_norm": 0.4338163137435913,
"learning_rate": 7.827380952380953e-06,
"loss": 0.8596,
"step": 264
},
{
"epoch": 0.07896011024619166,
"grad_norm": 0.4179791510105133,
"learning_rate": 7.857142857142858e-06,
"loss": 0.8182,
"step": 265
},
{
"epoch": 0.07925807292636597,
"grad_norm": 0.434553861618042,
"learning_rate": 7.886904761904762e-06,
"loss": 0.8126,
"step": 266
},
{
"epoch": 0.07955603560654029,
"grad_norm": 0.44742393493652344,
"learning_rate": 7.916666666666667e-06,
"loss": 0.9022,
"step": 267
},
{
"epoch": 0.07985399828671459,
"grad_norm": 0.4458036720752716,
"learning_rate": 7.946428571428571e-06,
"loss": 0.8876,
"step": 268
},
{
"epoch": 0.0801519609668889,
"grad_norm": 0.43985581398010254,
"learning_rate": 7.976190476190477e-06,
"loss": 0.8213,
"step": 269
},
{
"epoch": 0.0804499236470632,
"grad_norm": 0.46211549639701843,
"learning_rate": 8.005952380952382e-06,
"loss": 0.8251,
"step": 270
},
{
"epoch": 0.08074788632723752,
"grad_norm": 0.44748368859291077,
"learning_rate": 8.035714285714286e-06,
"loss": 0.8527,
"step": 271
},
{
"epoch": 0.08104584900741182,
"grad_norm": 0.4547558128833771,
"learning_rate": 8.065476190476192e-06,
"loss": 0.8539,
"step": 272
},
{
"epoch": 0.08134381168758613,
"grad_norm": 0.4298979938030243,
"learning_rate": 8.095238095238097e-06,
"loss": 0.8308,
"step": 273
},
{
"epoch": 0.08164177436776043,
"grad_norm": 0.4201894700527191,
"learning_rate": 8.125000000000001e-06,
"loss": 0.7705,
"step": 274
},
{
"epoch": 0.08193973704793475,
"grad_norm": 0.4482576549053192,
"learning_rate": 8.154761904761905e-06,
"loss": 0.7786,
"step": 275
},
{
"epoch": 0.08223769972810906,
"grad_norm": 0.4452459216117859,
"learning_rate": 8.18452380952381e-06,
"loss": 0.8253,
"step": 276
},
{
"epoch": 0.08253566240828336,
"grad_norm": 0.46517354249954224,
"learning_rate": 8.214285714285714e-06,
"loss": 0.8713,
"step": 277
},
{
"epoch": 0.08283362508845767,
"grad_norm": 0.43335846066474915,
"learning_rate": 8.244047619047619e-06,
"loss": 0.8498,
"step": 278
},
{
"epoch": 0.08313158776863198,
"grad_norm": 0.44680821895599365,
"learning_rate": 8.273809523809523e-06,
"loss": 0.8586,
"step": 279
},
{
"epoch": 0.08342955044880629,
"grad_norm": 0.4653940796852112,
"learning_rate": 8.30357142857143e-06,
"loss": 0.8745,
"step": 280
},
{
"epoch": 0.0837275131289806,
"grad_norm": 0.4642374515533447,
"learning_rate": 8.333333333333334e-06,
"loss": 0.8634,
"step": 281
},
{
"epoch": 0.0840254758091549,
"grad_norm": 0.41972967982292175,
"learning_rate": 8.36309523809524e-06,
"loss": 0.8017,
"step": 282
},
{
"epoch": 0.08432343848932922,
"grad_norm": 0.4606432020664215,
"learning_rate": 8.392857142857144e-06,
"loss": 0.8669,
"step": 283
},
{
"epoch": 0.08462140116950352,
"grad_norm": 0.4177170693874359,
"learning_rate": 8.422619047619049e-06,
"loss": 0.7702,
"step": 284
},
{
"epoch": 0.08491936384967783,
"grad_norm": 0.4431699812412262,
"learning_rate": 8.452380952380953e-06,
"loss": 0.8088,
"step": 285
},
{
"epoch": 0.08521732652985213,
"grad_norm": 0.42295727133750916,
"learning_rate": 8.482142857142858e-06,
"loss": 0.7878,
"step": 286
},
{
"epoch": 0.08551528921002645,
"grad_norm": 0.47301793098449707,
"learning_rate": 8.511904761904762e-06,
"loss": 0.9065,
"step": 287
},
{
"epoch": 0.08581325189020075,
"grad_norm": 0.45198720693588257,
"learning_rate": 8.541666666666666e-06,
"loss": 0.8367,
"step": 288
},
{
"epoch": 0.08611121457037506,
"grad_norm": 0.4279087483882904,
"learning_rate": 8.571428571428571e-06,
"loss": 0.8425,
"step": 289
},
{
"epoch": 0.08640917725054936,
"grad_norm": 0.46242713928222656,
"learning_rate": 8.601190476190477e-06,
"loss": 0.863,
"step": 290
},
{
"epoch": 0.08670713993072368,
"grad_norm": 0.4571658968925476,
"learning_rate": 8.630952380952381e-06,
"loss": 0.8399,
"step": 291
},
{
"epoch": 0.08700510261089799,
"grad_norm": 0.4324505627155304,
"learning_rate": 8.660714285714286e-06,
"loss": 0.8386,
"step": 292
},
{
"epoch": 0.08730306529107229,
"grad_norm": 0.4449789822101593,
"learning_rate": 8.690476190476192e-06,
"loss": 0.8195,
"step": 293
},
{
"epoch": 0.0876010279712466,
"grad_norm": 0.44687506556510925,
"learning_rate": 8.720238095238096e-06,
"loss": 0.8626,
"step": 294
},
{
"epoch": 0.08789899065142091,
"grad_norm": 0.4689542055130005,
"learning_rate": 8.750000000000001e-06,
"loss": 0.8919,
"step": 295
},
{
"epoch": 0.08819695333159522,
"grad_norm": 0.45117467641830444,
"learning_rate": 8.779761904761905e-06,
"loss": 0.8111,
"step": 296
},
{
"epoch": 0.08849491601176952,
"grad_norm": 0.4455942213535309,
"learning_rate": 8.80952380952381e-06,
"loss": 0.8325,
"step": 297
},
{
"epoch": 0.08879287869194383,
"grad_norm": 0.4475480318069458,
"learning_rate": 8.839285714285714e-06,
"loss": 0.8813,
"step": 298
},
{
"epoch": 0.08909084137211815,
"grad_norm": 0.4561273455619812,
"learning_rate": 8.869047619047619e-06,
"loss": 0.8432,
"step": 299
},
{
"epoch": 0.08938880405229245,
"grad_norm": 0.4637228548526764,
"learning_rate": 8.898809523809525e-06,
"loss": 0.8244,
"step": 300
},
{
"epoch": 0.08968676673246676,
"grad_norm": 0.4699828624725342,
"learning_rate": 8.92857142857143e-06,
"loss": 0.8929,
"step": 301
},
{
"epoch": 0.08998472941264107,
"grad_norm": 0.41704750061035156,
"learning_rate": 8.958333333333334e-06,
"loss": 0.8017,
"step": 302
},
{
"epoch": 0.09028269209281538,
"grad_norm": 0.46975040435791016,
"learning_rate": 8.98809523809524e-06,
"loss": 0.8518,
"step": 303
},
{
"epoch": 0.09058065477298968,
"grad_norm": 0.4363666772842407,
"learning_rate": 9.017857142857144e-06,
"loss": 0.7656,
"step": 304
},
{
"epoch": 0.09087861745316399,
"grad_norm": 0.4625988006591797,
"learning_rate": 9.047619047619049e-06,
"loss": 0.9311,
"step": 305
},
{
"epoch": 0.0911765801333383,
"grad_norm": 0.45612746477127075,
"learning_rate": 9.077380952380953e-06,
"loss": 0.8161,
"step": 306
},
{
"epoch": 0.09147454281351261,
"grad_norm": 0.44814276695251465,
"learning_rate": 9.107142857142858e-06,
"loss": 0.808,
"step": 307
},
{
"epoch": 0.09177250549368692,
"grad_norm": 0.45561328530311584,
"learning_rate": 9.136904761904762e-06,
"loss": 0.8347,
"step": 308
},
{
"epoch": 0.09207046817386122,
"grad_norm": 0.4581974148750305,
"learning_rate": 9.166666666666666e-06,
"loss": 0.8393,
"step": 309
},
{
"epoch": 0.09236843085403554,
"grad_norm": 0.4595828354358673,
"learning_rate": 9.196428571428571e-06,
"loss": 0.8555,
"step": 310
},
{
"epoch": 0.09266639353420984,
"grad_norm": 0.4495246708393097,
"learning_rate": 9.226190476190477e-06,
"loss": 0.8571,
"step": 311
},
{
"epoch": 0.09296435621438415,
"grad_norm": 0.430372416973114,
"learning_rate": 9.255952380952381e-06,
"loss": 0.811,
"step": 312
},
{
"epoch": 0.09326231889455845,
"grad_norm": 0.4460920989513397,
"learning_rate": 9.285714285714288e-06,
"loss": 0.8714,
"step": 313
},
{
"epoch": 0.09356028157473277,
"grad_norm": 0.4535922706127167,
"learning_rate": 9.315476190476192e-06,
"loss": 0.8481,
"step": 314
},
{
"epoch": 0.09385824425490708,
"grad_norm": 0.4503169655799866,
"learning_rate": 9.345238095238096e-06,
"loss": 0.8073,
"step": 315
},
{
"epoch": 0.09415620693508138,
"grad_norm": 0.4372020363807678,
"learning_rate": 9.375000000000001e-06,
"loss": 0.8383,
"step": 316
},
{
"epoch": 0.09445416961525568,
"grad_norm": 0.4314536154270172,
"learning_rate": 9.404761904761905e-06,
"loss": 0.8458,
"step": 317
},
{
"epoch": 0.09475213229543,
"grad_norm": 0.46163350343704224,
"learning_rate": 9.43452380952381e-06,
"loss": 0.7861,
"step": 318
},
{
"epoch": 0.09505009497560431,
"grad_norm": 0.4365135729312897,
"learning_rate": 9.464285714285714e-06,
"loss": 0.8201,
"step": 319
},
{
"epoch": 0.09534805765577861,
"grad_norm": 0.44700679183006287,
"learning_rate": 9.494047619047619e-06,
"loss": 0.8428,
"step": 320
},
{
"epoch": 0.09564602033595292,
"grad_norm": 0.4734896719455719,
"learning_rate": 9.523809523809525e-06,
"loss": 0.843,
"step": 321
},
{
"epoch": 0.09594398301612724,
"grad_norm": 0.4385888874530792,
"learning_rate": 9.55357142857143e-06,
"loss": 0.8571,
"step": 322
},
{
"epoch": 0.09624194569630154,
"grad_norm": 0.435771644115448,
"learning_rate": 9.583333333333335e-06,
"loss": 0.8199,
"step": 323
},
{
"epoch": 0.09653990837647584,
"grad_norm": 0.4613741636276245,
"learning_rate": 9.61309523809524e-06,
"loss": 0.8007,
"step": 324
},
{
"epoch": 0.09683787105665015,
"grad_norm": 0.44617676734924316,
"learning_rate": 9.642857142857144e-06,
"loss": 0.8375,
"step": 325
},
{
"epoch": 0.09713583373682447,
"grad_norm": 0.47633469104766846,
"learning_rate": 9.672619047619049e-06,
"loss": 0.8671,
"step": 326
},
{
"epoch": 0.09743379641699877,
"grad_norm": 0.4520438313484192,
"learning_rate": 9.702380952380953e-06,
"loss": 0.8374,
"step": 327
},
{
"epoch": 0.09773175909717308,
"grad_norm": 0.45184773206710815,
"learning_rate": 9.732142857142858e-06,
"loss": 0.8357,
"step": 328
},
{
"epoch": 0.09802972177734738,
"grad_norm": 0.44382697343826294,
"learning_rate": 9.761904761904762e-06,
"loss": 0.8052,
"step": 329
},
{
"epoch": 0.0983276844575217,
"grad_norm": 0.4614221453666687,
"learning_rate": 9.791666666666666e-06,
"loss": 0.8192,
"step": 330
},
{
"epoch": 0.098625647137696,
"grad_norm": 0.4119229316711426,
"learning_rate": 9.821428571428573e-06,
"loss": 0.8091,
"step": 331
},
{
"epoch": 0.09892360981787031,
"grad_norm": 0.435222327709198,
"learning_rate": 9.851190476190477e-06,
"loss": 0.8437,
"step": 332
},
{
"epoch": 0.09922157249804461,
"grad_norm": 0.43353399634361267,
"learning_rate": 9.880952380952381e-06,
"loss": 0.8329,
"step": 333
},
{
"epoch": 0.09951953517821893,
"grad_norm": 0.462616890668869,
"learning_rate": 9.910714285714288e-06,
"loss": 0.8742,
"step": 334
},
{
"epoch": 0.09981749785839324,
"grad_norm": 0.4411180019378662,
"learning_rate": 9.940476190476192e-06,
"loss": 0.7696,
"step": 335
},
{
"epoch": 0.10011546053856754,
"grad_norm": 0.461515873670578,
"learning_rate": 9.970238095238096e-06,
"loss": 0.8183,
"step": 336
},
{
"epoch": 0.10041342321874185,
"grad_norm": 0.4494532644748688,
"learning_rate": 1e-05,
"loss": 0.8568,
"step": 337
},
{
"epoch": 0.10071138589891616,
"grad_norm": 0.4770655632019043,
"learning_rate": 9.99999729642598e-06,
"loss": 0.8497,
"step": 338
},
{
"epoch": 0.10100934857909047,
"grad_norm": 0.464408278465271,
"learning_rate": 9.999989185706846e-06,
"loss": 0.8591,
"step": 339
},
{
"epoch": 0.10130731125926477,
"grad_norm": 0.45243039727211,
"learning_rate": 9.999975667851366e-06,
"loss": 0.8139,
"step": 340
},
{
"epoch": 0.10160527393943909,
"grad_norm": 0.4484204947948456,
"learning_rate": 9.999956742874162e-06,
"loss": 0.8169,
"step": 341
},
{
"epoch": 0.1019032366196134,
"grad_norm": 0.44056135416030884,
"learning_rate": 9.999932410795697e-06,
"loss": 0.8453,
"step": 342
},
{
"epoch": 0.1022011992997877,
"grad_norm": 0.4422775208950043,
"learning_rate": 9.999902671642285e-06,
"loss": 0.8621,
"step": 343
},
{
"epoch": 0.102499161979962,
"grad_norm": 0.43026235699653625,
"learning_rate": 9.99986752544609e-06,
"loss": 0.7996,
"step": 344
},
{
"epoch": 0.10279712466013632,
"grad_norm": 0.4360259473323822,
"learning_rate": 9.999826972245115e-06,
"loss": 0.8296,
"step": 345
},
{
"epoch": 0.10309508734031063,
"grad_norm": 0.46168243885040283,
"learning_rate": 9.99978101208322e-06,
"loss": 0.875,
"step": 346
},
{
"epoch": 0.10339305002048493,
"grad_norm": 0.44053173065185547,
"learning_rate": 9.999729645010105e-06,
"loss": 0.8376,
"step": 347
},
{
"epoch": 0.10369101270065924,
"grad_norm": 0.4546683728694916,
"learning_rate": 9.99967287108132e-06,
"loss": 0.8286,
"step": 348
},
{
"epoch": 0.10398897538083356,
"grad_norm": 0.43043702840805054,
"learning_rate": 9.999610690358263e-06,
"loss": 0.8071,
"step": 349
},
{
"epoch": 0.10428693806100786,
"grad_norm": 0.42279356718063354,
"learning_rate": 9.999543102908178e-06,
"loss": 0.8177,
"step": 350
},
{
"epoch": 0.10458490074118217,
"grad_norm": 0.4474245607852936,
"learning_rate": 9.999470108804156e-06,
"loss": 0.806,
"step": 351
},
{
"epoch": 0.10488286342135647,
"grad_norm": 0.4298347532749176,
"learning_rate": 9.999391708125134e-06,
"loss": 0.8269,
"step": 352
},
{
"epoch": 0.10518082610153079,
"grad_norm": 0.45362961292266846,
"learning_rate": 9.999307900955898e-06,
"loss": 0.8196,
"step": 353
},
{
"epoch": 0.1054787887817051,
"grad_norm": 0.47332948446273804,
"learning_rate": 9.999218687387081e-06,
"loss": 0.8233,
"step": 354
},
{
"epoch": 0.1057767514618794,
"grad_norm": 0.4430951774120331,
"learning_rate": 9.999124067515158e-06,
"loss": 0.7823,
"step": 355
},
{
"epoch": 0.1060747141420537,
"grad_norm": 0.45825430750846863,
"learning_rate": 9.999024041442455e-06,
"loss": 0.806,
"step": 356
},
{
"epoch": 0.10637267682222802,
"grad_norm": 0.44608044624328613,
"learning_rate": 9.998918609277144e-06,
"loss": 0.7771,
"step": 357
},
{
"epoch": 0.10667063950240233,
"grad_norm": 0.4534968435764313,
"learning_rate": 9.998807771133241e-06,
"loss": 0.844,
"step": 358
},
{
"epoch": 0.10696860218257663,
"grad_norm": 0.49156180024147034,
"learning_rate": 9.998691527130609e-06,
"loss": 0.8696,
"step": 359
},
{
"epoch": 0.10726656486275093,
"grad_norm": 0.4962518513202667,
"learning_rate": 9.998569877394961e-06,
"loss": 0.8413,
"step": 360
},
{
"epoch": 0.10756452754292525,
"grad_norm": 0.4482788145542145,
"learning_rate": 9.99844282205785e-06,
"loss": 0.7665,
"step": 361
},
{
"epoch": 0.10786249022309956,
"grad_norm": 0.4612559378147125,
"learning_rate": 9.998310361256678e-06,
"loss": 0.856,
"step": 362
},
{
"epoch": 0.10816045290327386,
"grad_norm": 0.4336289167404175,
"learning_rate": 9.998172495134692e-06,
"loss": 0.7627,
"step": 363
},
{
"epoch": 0.10845841558344817,
"grad_norm": 0.46035903692245483,
"learning_rate": 9.998029223840986e-06,
"loss": 0.7778,
"step": 364
},
{
"epoch": 0.10875637826362249,
"grad_norm": 0.4648987948894501,
"learning_rate": 9.997880547530494e-06,
"loss": 0.88,
"step": 365
},
{
"epoch": 0.10905434094379679,
"grad_norm": 0.4816872775554657,
"learning_rate": 9.997726466364003e-06,
"loss": 0.8894,
"step": 366
},
{
"epoch": 0.1093523036239711,
"grad_norm": 0.445965051651001,
"learning_rate": 9.99756698050814e-06,
"loss": 0.7752,
"step": 367
},
{
"epoch": 0.1096502663041454,
"grad_norm": 0.4469074308872223,
"learning_rate": 9.997402090135377e-06,
"loss": 0.7908,
"step": 368
},
{
"epoch": 0.10994822898431972,
"grad_norm": 0.45743241906166077,
"learning_rate": 9.99723179542403e-06,
"loss": 0.8262,
"step": 369
},
{
"epoch": 0.11024619166449402,
"grad_norm": 0.4579828679561615,
"learning_rate": 9.997056096558264e-06,
"loss": 0.8464,
"step": 370
},
{
"epoch": 0.11054415434466833,
"grad_norm": 0.42541900277137756,
"learning_rate": 9.996874993728083e-06,
"loss": 0.773,
"step": 371
},
{
"epoch": 0.11084211702484263,
"grad_norm": 0.47497400641441345,
"learning_rate": 9.996688487129335e-06,
"loss": 0.8227,
"step": 372
},
{
"epoch": 0.11114007970501695,
"grad_norm": 0.4370492994785309,
"learning_rate": 9.996496576963716e-06,
"loss": 0.8387,
"step": 373
},
{
"epoch": 0.11143804238519125,
"grad_norm": 0.43119439482688904,
"learning_rate": 9.996299263438765e-06,
"loss": 0.7698,
"step": 374
},
{
"epoch": 0.11173600506536556,
"grad_norm": 0.43310031294822693,
"learning_rate": 9.99609654676786e-06,
"loss": 0.7699,
"step": 375
},
{
"epoch": 0.11203396774553988,
"grad_norm": 0.47822192311286926,
"learning_rate": 9.995888427170226e-06,
"loss": 0.8429,
"step": 376
},
{
"epoch": 0.11233193042571418,
"grad_norm": 0.45406052470207214,
"learning_rate": 9.995674904870929e-06,
"loss": 0.8128,
"step": 377
},
{
"epoch": 0.11262989310588849,
"grad_norm": 0.4138409495353699,
"learning_rate": 9.99545598010088e-06,
"loss": 0.7859,
"step": 378
},
{
"epoch": 0.11292785578606279,
"grad_norm": 0.43571943044662476,
"learning_rate": 9.995231653096826e-06,
"loss": 0.7812,
"step": 379
},
{
"epoch": 0.11322581846623711,
"grad_norm": 0.44058656692504883,
"learning_rate": 9.995001924101368e-06,
"loss": 0.7777,
"step": 380
},
{
"epoch": 0.11352378114641141,
"grad_norm": 0.4526267349720001,
"learning_rate": 9.994766793362936e-06,
"loss": 0.8221,
"step": 381
},
{
"epoch": 0.11382174382658572,
"grad_norm": 0.4531581997871399,
"learning_rate": 9.99452626113581e-06,
"loss": 0.8714,
"step": 382
},
{
"epoch": 0.11411970650676002,
"grad_norm": 0.4537730813026428,
"learning_rate": 9.994280327680109e-06,
"loss": 0.8046,
"step": 383
},
{
"epoch": 0.11441766918693434,
"grad_norm": 0.4391308128833771,
"learning_rate": 9.994028993261789e-06,
"loss": 0.8068,
"step": 384
},
{
"epoch": 0.11471563186710865,
"grad_norm": 0.47409799695014954,
"learning_rate": 9.993772258152656e-06,
"loss": 0.818,
"step": 385
},
{
"epoch": 0.11501359454728295,
"grad_norm": 0.4408448338508606,
"learning_rate": 9.993510122630346e-06,
"loss": 0.8178,
"step": 386
},
{
"epoch": 0.11531155722745726,
"grad_norm": 0.44910043478012085,
"learning_rate": 9.993242586978345e-06,
"loss": 0.8454,
"step": 387
},
{
"epoch": 0.11560951990763157,
"grad_norm": 0.4543640911579132,
"learning_rate": 9.992969651485968e-06,
"loss": 0.8315,
"step": 388
},
{
"epoch": 0.11590748258780588,
"grad_norm": 0.41494789719581604,
"learning_rate": 9.992691316448382e-06,
"loss": 0.7609,
"step": 389
},
{
"epoch": 0.11620544526798018,
"grad_norm": 0.48728737235069275,
"learning_rate": 9.992407582166582e-06,
"loss": 0.8313,
"step": 390
},
{
"epoch": 0.11650340794815449,
"grad_norm": 0.4488702416419983,
"learning_rate": 9.992118448947408e-06,
"loss": 0.8095,
"step": 391
},
{
"epoch": 0.11680137062832881,
"grad_norm": 0.42593449354171753,
"learning_rate": 9.991823917103539e-06,
"loss": 0.83,
"step": 392
},
{
"epoch": 0.11709933330850311,
"grad_norm": 0.45822906494140625,
"learning_rate": 9.991523986953487e-06,
"loss": 0.815,
"step": 393
},
{
"epoch": 0.11739729598867742,
"grad_norm": 0.42860767245292664,
"learning_rate": 9.991218658821609e-06,
"loss": 0.7676,
"step": 394
},
{
"epoch": 0.11769525866885172,
"grad_norm": 0.4358139634132385,
"learning_rate": 9.990907933038091e-06,
"loss": 0.7755,
"step": 395
},
{
"epoch": 0.11799322134902604,
"grad_norm": 0.4387064278125763,
"learning_rate": 9.990591809938968e-06,
"loss": 0.8135,
"step": 396
},
{
"epoch": 0.11829118402920034,
"grad_norm": 0.442857563495636,
"learning_rate": 9.990270289866099e-06,
"loss": 0.8091,
"step": 397
},
{
"epoch": 0.11858914670937465,
"grad_norm": 0.419209748506546,
"learning_rate": 9.989943373167189e-06,
"loss": 0.8249,
"step": 398
},
{
"epoch": 0.11888710938954895,
"grad_norm": 0.4570047855377197,
"learning_rate": 9.98961106019577e-06,
"loss": 0.8933,
"step": 399
},
{
"epoch": 0.11918507206972327,
"grad_norm": 0.4366416037082672,
"learning_rate": 9.989273351311222e-06,
"loss": 0.8162,
"step": 400
},
{
"epoch": 0.11948303474989758,
"grad_norm": 0.4550735354423523,
"learning_rate": 9.98893024687875e-06,
"loss": 0.8217,
"step": 401
},
{
"epoch": 0.11978099743007188,
"grad_norm": 0.43284013867378235,
"learning_rate": 9.988581747269397e-06,
"loss": 0.8131,
"step": 402
},
{
"epoch": 0.12007896011024619,
"grad_norm": 0.4315564036369324,
"learning_rate": 9.988227852860042e-06,
"loss": 0.8065,
"step": 403
},
{
"epoch": 0.1203769227904205,
"grad_norm": 0.4316180646419525,
"learning_rate": 9.987868564033396e-06,
"loss": 0.7969,
"step": 404
},
{
"epoch": 0.12067488547059481,
"grad_norm": 0.45488640666007996,
"learning_rate": 9.987503881178004e-06,
"loss": 0.8231,
"step": 405
},
{
"epoch": 0.12097284815076911,
"grad_norm": 0.4185773432254791,
"learning_rate": 9.987133804688247e-06,
"loss": 0.7916,
"step": 406
},
{
"epoch": 0.12127081083094342,
"grad_norm": 0.4486062824726105,
"learning_rate": 9.986758334964333e-06,
"loss": 0.8391,
"step": 407
},
{
"epoch": 0.12156877351111774,
"grad_norm": 0.4506734311580658,
"learning_rate": 9.986377472412311e-06,
"loss": 0.8327,
"step": 408
},
{
"epoch": 0.12186673619129204,
"grad_norm": 0.43692904710769653,
"learning_rate": 9.985991217444053e-06,
"loss": 0.7939,
"step": 409
},
{
"epoch": 0.12216469887146635,
"grad_norm": 0.4649176001548767,
"learning_rate": 9.98559957047727e-06,
"loss": 0.8608,
"step": 410
},
{
"epoch": 0.12246266155164065,
"grad_norm": 0.43806663155555725,
"learning_rate": 9.985202531935496e-06,
"loss": 0.7629,
"step": 411
},
{
"epoch": 0.12276062423181497,
"grad_norm": 0.4385770559310913,
"learning_rate": 9.984800102248105e-06,
"loss": 0.8069,
"step": 412
},
{
"epoch": 0.12305858691198927,
"grad_norm": 0.41685423254966736,
"learning_rate": 9.984392281850293e-06,
"loss": 0.8103,
"step": 413
},
{
"epoch": 0.12335654959216358,
"grad_norm": 0.43887874484062195,
"learning_rate": 9.98397907118309e-06,
"loss": 0.7902,
"step": 414
},
{
"epoch": 0.1236545122723379,
"grad_norm": 0.4662448465824127,
"learning_rate": 9.983560470693354e-06,
"loss": 0.8686,
"step": 415
},
{
"epoch": 0.1239524749525122,
"grad_norm": 0.4399791359901428,
"learning_rate": 9.983136480833773e-06,
"loss": 0.8166,
"step": 416
},
{
"epoch": 0.1242504376326865,
"grad_norm": 0.41617995500564575,
"learning_rate": 9.982707102062863e-06,
"loss": 0.8155,
"step": 417
},
{
"epoch": 0.12454840031286081,
"grad_norm": 0.4192359447479248,
"learning_rate": 9.982272334844964e-06,
"loss": 0.795,
"step": 418
},
{
"epoch": 0.12484636299303513,
"grad_norm": 0.4392685294151306,
"learning_rate": 9.981832179650251e-06,
"loss": 0.8034,
"step": 419
},
{
"epoch": 0.12514432567320943,
"grad_norm": 0.4487253427505493,
"learning_rate": 9.981386636954713e-06,
"loss": 0.837,
"step": 420
},
{
"epoch": 0.12544228835338375,
"grad_norm": 0.4271675646305084,
"learning_rate": 9.98093570724018e-06,
"loss": 0.7799,
"step": 421
},
{
"epoch": 0.12574025103355804,
"grad_norm": 0.44783201813697815,
"learning_rate": 9.9804793909943e-06,
"loss": 0.8396,
"step": 422
},
{
"epoch": 0.12603821371373236,
"grad_norm": 0.4500182271003723,
"learning_rate": 9.980017688710542e-06,
"loss": 0.8214,
"step": 423
},
{
"epoch": 0.12633617639390665,
"grad_norm": 0.45347052812576294,
"learning_rate": 9.97955060088821e-06,
"loss": 0.8384,
"step": 424
},
{
"epoch": 0.12663413907408097,
"grad_norm": 0.4413428008556366,
"learning_rate": 9.979078128032424e-06,
"loss": 0.8109,
"step": 425
},
{
"epoch": 0.1269321017542553,
"grad_norm": 0.4482523798942566,
"learning_rate": 9.978600270654131e-06,
"loss": 0.8163,
"step": 426
},
{
"epoch": 0.12723006443442958,
"grad_norm": 0.4338665306568146,
"learning_rate": 9.978117029270098e-06,
"loss": 0.8161,
"step": 427
},
{
"epoch": 0.1275280271146039,
"grad_norm": 0.43441489338874817,
"learning_rate": 9.977628404402918e-06,
"loss": 0.8336,
"step": 428
},
{
"epoch": 0.12782598979477822,
"grad_norm": 0.4508003890514374,
"learning_rate": 9.977134396581008e-06,
"loss": 0.7987,
"step": 429
},
{
"epoch": 0.1281239524749525,
"grad_norm": 0.42367222905158997,
"learning_rate": 9.976635006338598e-06,
"loss": 0.7722,
"step": 430
},
{
"epoch": 0.12842191515512683,
"grad_norm": 0.42868027091026306,
"learning_rate": 9.976130234215743e-06,
"loss": 0.7623,
"step": 431
},
{
"epoch": 0.12871987783530112,
"grad_norm": 0.4461642801761627,
"learning_rate": 9.975620080758321e-06,
"loss": 0.815,
"step": 432
},
{
"epoch": 0.12901784051547543,
"grad_norm": 0.4450134038925171,
"learning_rate": 9.975104546518026e-06,
"loss": 0.7492,
"step": 433
},
{
"epoch": 0.12931580319564975,
"grad_norm": 0.44301509857177734,
"learning_rate": 9.974583632052373e-06,
"loss": 0.8026,
"step": 434
},
{
"epoch": 0.12961376587582404,
"grad_norm": 0.4301476776599884,
"learning_rate": 9.974057337924695e-06,
"loss": 0.8118,
"step": 435
},
{
"epoch": 0.12991172855599836,
"grad_norm": 0.46609169244766235,
"learning_rate": 9.973525664704137e-06,
"loss": 0.8148,
"step": 436
},
{
"epoch": 0.13020969123617268,
"grad_norm": 0.4507199823856354,
"learning_rate": 9.972988612965673e-06,
"loss": 0.8282,
"step": 437
},
{
"epoch": 0.13050765391634697,
"grad_norm": 0.4439927935600281,
"learning_rate": 9.972446183290082e-06,
"loss": 0.8319,
"step": 438
},
{
"epoch": 0.1308056165965213,
"grad_norm": 0.4421440064907074,
"learning_rate": 9.971898376263966e-06,
"loss": 0.7802,
"step": 439
},
{
"epoch": 0.13110357927669558,
"grad_norm": 0.43711912631988525,
"learning_rate": 9.971345192479738e-06,
"loss": 0.7945,
"step": 440
},
{
"epoch": 0.1314015419568699,
"grad_norm": 0.43256676197052,
"learning_rate": 9.970786632535627e-06,
"loss": 0.8015,
"step": 441
},
{
"epoch": 0.13169950463704422,
"grad_norm": 0.44375714659690857,
"learning_rate": 9.970222697035679e-06,
"loss": 0.8725,
"step": 442
},
{
"epoch": 0.1319974673172185,
"grad_norm": 0.4452434182167053,
"learning_rate": 9.969653386589749e-06,
"loss": 0.813,
"step": 443
},
{
"epoch": 0.13229542999739283,
"grad_norm": 0.4254850447177887,
"learning_rate": 9.969078701813505e-06,
"loss": 0.7976,
"step": 444
},
{
"epoch": 0.13259339267756715,
"grad_norm": 0.434270441532135,
"learning_rate": 9.968498643328427e-06,
"loss": 0.881,
"step": 445
},
{
"epoch": 0.13289135535774144,
"grad_norm": 0.4523266851902008,
"learning_rate": 9.967913211761813e-06,
"loss": 0.8292,
"step": 446
},
{
"epoch": 0.13318931803791575,
"grad_norm": 0.48667654395103455,
"learning_rate": 9.967322407746762e-06,
"loss": 0.8422,
"step": 447
},
{
"epoch": 0.13348728071809005,
"grad_norm": 0.42540228366851807,
"learning_rate": 9.966726231922188e-06,
"loss": 0.7824,
"step": 448
},
{
"epoch": 0.13378524339826436,
"grad_norm": 0.4267347455024719,
"learning_rate": 9.966124684932811e-06,
"loss": 0.8036,
"step": 449
},
{
"epoch": 0.13408320607843868,
"grad_norm": 0.41667813062667847,
"learning_rate": 9.965517767429165e-06,
"loss": 0.7872,
"step": 450
},
{
"epoch": 0.13438116875861297,
"grad_norm": 0.46928688883781433,
"learning_rate": 9.964905480067585e-06,
"loss": 0.8074,
"step": 451
},
{
"epoch": 0.1346791314387873,
"grad_norm": 0.46049174666404724,
"learning_rate": 9.964287823510222e-06,
"loss": 0.8378,
"step": 452
},
{
"epoch": 0.1349770941189616,
"grad_norm": 0.43071913719177246,
"learning_rate": 9.96366479842502e-06,
"loss": 0.8032,
"step": 453
},
{
"epoch": 0.1352750567991359,
"grad_norm": 0.43020790815353394,
"learning_rate": 9.963036405485747e-06,
"loss": 0.8458,
"step": 454
},
{
"epoch": 0.13557301947931022,
"grad_norm": 0.42616328597068787,
"learning_rate": 9.962402645371957e-06,
"loss": 0.8036,
"step": 455
},
{
"epoch": 0.13587098215948454,
"grad_norm": 0.4324115514755249,
"learning_rate": 9.96176351876902e-06,
"loss": 0.7969,
"step": 456
},
{
"epoch": 0.13616894483965883,
"grad_norm": 0.4453105032444,
"learning_rate": 9.961119026368107e-06,
"loss": 0.8329,
"step": 457
},
{
"epoch": 0.13646690751983315,
"grad_norm": 0.46170443296432495,
"learning_rate": 9.960469168866192e-06,
"loss": 0.8129,
"step": 458
},
{
"epoch": 0.13676487020000744,
"grad_norm": 0.448223739862442,
"learning_rate": 9.959813946966048e-06,
"loss": 0.8331,
"step": 459
},
{
"epoch": 0.13706283288018176,
"grad_norm": 0.421773761510849,
"learning_rate": 9.959153361376254e-06,
"loss": 0.7798,
"step": 460
},
{
"epoch": 0.13736079556035607,
"grad_norm": 0.4486232399940491,
"learning_rate": 9.958487412811184e-06,
"loss": 0.8891,
"step": 461
},
{
"epoch": 0.13765875824053037,
"grad_norm": 0.42223700881004333,
"learning_rate": 9.957816101991015e-06,
"loss": 0.7495,
"step": 462
},
{
"epoch": 0.13795672092070468,
"grad_norm": 0.4491265118122101,
"learning_rate": 9.957139429641723e-06,
"loss": 0.8457,
"step": 463
},
{
"epoch": 0.138254683600879,
"grad_norm": 0.44002753496170044,
"learning_rate": 9.956457396495083e-06,
"loss": 0.8126,
"step": 464
},
{
"epoch": 0.1385526462810533,
"grad_norm": 0.4285680651664734,
"learning_rate": 9.955770003288663e-06,
"loss": 0.7692,
"step": 465
},
{
"epoch": 0.1388506089612276,
"grad_norm": 0.4280463755130768,
"learning_rate": 9.955077250765833e-06,
"loss": 0.8255,
"step": 466
},
{
"epoch": 0.1391485716414019,
"grad_norm": 0.4269079864025116,
"learning_rate": 9.954379139675753e-06,
"loss": 0.782,
"step": 467
},
{
"epoch": 0.13944653432157622,
"grad_norm": 0.4445587992668152,
"learning_rate": 9.953675670773384e-06,
"loss": 0.8237,
"step": 468
},
{
"epoch": 0.13974449700175054,
"grad_norm": 0.43859168887138367,
"learning_rate": 9.952966844819479e-06,
"loss": 0.8206,
"step": 469
},
{
"epoch": 0.14004245968192483,
"grad_norm": 0.42063581943511963,
"learning_rate": 9.95225266258058e-06,
"loss": 0.7909,
"step": 470
},
{
"epoch": 0.14034042236209915,
"grad_norm": 0.4376140236854553,
"learning_rate": 9.951533124829024e-06,
"loss": 0.7956,
"step": 471
},
{
"epoch": 0.14063838504227347,
"grad_norm": 0.4519414007663727,
"learning_rate": 9.950808232342945e-06,
"loss": 0.8011,
"step": 472
},
{
"epoch": 0.14093634772244776,
"grad_norm": 0.47149237990379333,
"learning_rate": 9.950077985906259e-06,
"loss": 0.8267,
"step": 473
},
{
"epoch": 0.14123431040262208,
"grad_norm": 0.418052613735199,
"learning_rate": 9.949342386308679e-06,
"loss": 0.786,
"step": 474
},
{
"epoch": 0.14153227308279637,
"grad_norm": 0.4265899360179901,
"learning_rate": 9.948601434345704e-06,
"loss": 0.8061,
"step": 475
},
{
"epoch": 0.14183023576297069,
"grad_norm": 0.4369049668312073,
"learning_rate": 9.947855130818618e-06,
"loss": 0.7718,
"step": 476
},
{
"epoch": 0.142128198443145,
"grad_norm": 0.4536924958229065,
"learning_rate": 9.9471034765345e-06,
"loss": 0.8304,
"step": 477
},
{
"epoch": 0.1424261611233193,
"grad_norm": 0.44105738401412964,
"learning_rate": 9.94634647230621e-06,
"loss": 0.8509,
"step": 478
},
{
"epoch": 0.1427241238034936,
"grad_norm": 0.44205665588378906,
"learning_rate": 9.945584118952392e-06,
"loss": 0.8142,
"step": 479
},
{
"epoch": 0.14302208648366793,
"grad_norm": 0.4375988841056824,
"learning_rate": 9.944816417297482e-06,
"loss": 0.7874,
"step": 480
},
{
"epoch": 0.14332004916384222,
"grad_norm": 0.44625169038772583,
"learning_rate": 9.944043368171692e-06,
"loss": 0.7945,
"step": 481
},
{
"epoch": 0.14361801184401654,
"grad_norm": 0.43976661562919617,
"learning_rate": 9.94326497241102e-06,
"loss": 0.8225,
"step": 482
},
{
"epoch": 0.14391597452419083,
"grad_norm": 0.4386625587940216,
"learning_rate": 9.942481230857249e-06,
"loss": 0.7961,
"step": 483
},
{
"epoch": 0.14421393720436515,
"grad_norm": 0.4405531883239746,
"learning_rate": 9.941692144357938e-06,
"loss": 0.7938,
"step": 484
},
{
"epoch": 0.14451189988453947,
"grad_norm": 0.44884464144706726,
"learning_rate": 9.940897713766428e-06,
"loss": 0.8093,
"step": 485
},
{
"epoch": 0.14480986256471376,
"grad_norm": 0.4623242914676666,
"learning_rate": 9.940097939941843e-06,
"loss": 0.8567,
"step": 486
},
{
"epoch": 0.14510782524488808,
"grad_norm": 0.47886964678764343,
"learning_rate": 9.93929282374908e-06,
"loss": 0.832,
"step": 487
},
{
"epoch": 0.1454057879250624,
"grad_norm": 0.43132802844047546,
"learning_rate": 9.938482366058814e-06,
"loss": 0.7834,
"step": 488
},
{
"epoch": 0.1457037506052367,
"grad_norm": 0.46928003430366516,
"learning_rate": 9.9376665677475e-06,
"loss": 0.8418,
"step": 489
},
{
"epoch": 0.146001713285411,
"grad_norm": 0.4581755995750427,
"learning_rate": 9.936845429697369e-06,
"loss": 0.8114,
"step": 490
},
{
"epoch": 0.14629967596558532,
"grad_norm": 0.4166138172149658,
"learning_rate": 9.936018952796417e-06,
"loss": 0.7967,
"step": 491
},
{
"epoch": 0.14659763864575961,
"grad_norm": 0.43533584475517273,
"learning_rate": 9.935187137938427e-06,
"loss": 0.8353,
"step": 492
},
{
"epoch": 0.14689560132593393,
"grad_norm": 0.44667741656303406,
"learning_rate": 9.934349986022946e-06,
"loss": 0.8317,
"step": 493
},
{
"epoch": 0.14719356400610822,
"grad_norm": 0.44888830184936523,
"learning_rate": 9.933507497955292e-06,
"loss": 0.8174,
"step": 494
},
{
"epoch": 0.14749152668628254,
"grad_norm": 0.46241915225982666,
"learning_rate": 9.93265967464656e-06,
"loss": 0.8514,
"step": 495
},
{
"epoch": 0.14778948936645686,
"grad_norm": 0.4498129189014435,
"learning_rate": 9.931806517013612e-06,
"loss": 0.7824,
"step": 496
},
{
"epoch": 0.14808745204663115,
"grad_norm": 0.43418675661087036,
"learning_rate": 9.930948025979076e-06,
"loss": 0.8287,
"step": 497
},
{
"epoch": 0.14838541472680547,
"grad_norm": 0.4767434298992157,
"learning_rate": 9.93008420247135e-06,
"loss": 0.7997,
"step": 498
},
{
"epoch": 0.1486833774069798,
"grad_norm": 0.4500294029712677,
"learning_rate": 9.929215047424598e-06,
"loss": 0.8001,
"step": 499
},
{
"epoch": 0.14898134008715408,
"grad_norm": 0.47594842314720154,
"learning_rate": 9.928340561778748e-06,
"loss": 0.8219,
"step": 500
},
{
"epoch": 0.1492793027673284,
"grad_norm": 0.43770483136177063,
"learning_rate": 9.927460746479501e-06,
"loss": 0.7888,
"step": 501
},
{
"epoch": 0.1495772654475027,
"grad_norm": 0.42728111147880554,
"learning_rate": 9.926575602478309e-06,
"loss": 0.8404,
"step": 502
},
{
"epoch": 0.149875228127677,
"grad_norm": 0.4243437647819519,
"learning_rate": 9.925685130732396e-06,
"loss": 0.7973,
"step": 503
},
{
"epoch": 0.15017319080785133,
"grad_norm": 0.42418187856674194,
"learning_rate": 9.924789332204743e-06,
"loss": 0.7519,
"step": 504
},
{
"epoch": 0.15047115348802562,
"grad_norm": 0.45427075028419495,
"learning_rate": 9.923888207864093e-06,
"loss": 0.8269,
"step": 505
},
{
"epoch": 0.15076911616819993,
"grad_norm": 0.4241939187049866,
"learning_rate": 9.92298175868495e-06,
"loss": 0.7208,
"step": 506
},
{
"epoch": 0.15106707884837425,
"grad_norm": 0.4593588411808014,
"learning_rate": 9.922069985647576e-06,
"loss": 0.7882,
"step": 507
},
{
"epoch": 0.15136504152854854,
"grad_norm": 0.449123352766037,
"learning_rate": 9.921152889737985e-06,
"loss": 0.8168,
"step": 508
},
{
"epoch": 0.15166300420872286,
"grad_norm": 0.4293762743473053,
"learning_rate": 9.920230471947957e-06,
"loss": 0.7966,
"step": 509
},
{
"epoch": 0.15196096688889715,
"grad_norm": 0.43696144223213196,
"learning_rate": 9.919302733275015e-06,
"loss": 0.8051,
"step": 510
},
{
"epoch": 0.15225892956907147,
"grad_norm": 0.46048763394355774,
"learning_rate": 9.91836967472245e-06,
"loss": 0.8789,
"step": 511
},
{
"epoch": 0.1525568922492458,
"grad_norm": 0.45033976435661316,
"learning_rate": 9.917431297299297e-06,
"loss": 0.8209,
"step": 512
},
{
"epoch": 0.15285485492942008,
"grad_norm": 0.45003750920295715,
"learning_rate": 9.916487602020344e-06,
"loss": 0.7816,
"step": 513
},
{
"epoch": 0.1531528176095944,
"grad_norm": 0.44182878732681274,
"learning_rate": 9.91553858990613e-06,
"loss": 0.8211,
"step": 514
},
{
"epoch": 0.15345078028976872,
"grad_norm": 0.44483861327171326,
"learning_rate": 9.91458426198295e-06,
"loss": 0.7971,
"step": 515
},
{
"epoch": 0.153748742969943,
"grad_norm": 0.43113207817077637,
"learning_rate": 9.913624619282835e-06,
"loss": 0.7627,
"step": 516
},
{
"epoch": 0.15404670565011733,
"grad_norm": 0.4577217698097229,
"learning_rate": 9.912659662843578e-06,
"loss": 0.8413,
"step": 517
},
{
"epoch": 0.15434466833029162,
"grad_norm": 0.4342012107372284,
"learning_rate": 9.911689393708707e-06,
"loss": 0.7875,
"step": 518
},
{
"epoch": 0.15464263101046594,
"grad_norm": 0.4530002176761627,
"learning_rate": 9.9107138129275e-06,
"loss": 0.8166,
"step": 519
},
{
"epoch": 0.15494059369064025,
"grad_norm": 0.4360707998275757,
"learning_rate": 9.909732921554982e-06,
"loss": 0.8236,
"step": 520
},
{
"epoch": 0.15523855637081455,
"grad_norm": 0.44726189970970154,
"learning_rate": 9.908746720651914e-06,
"loss": 0.7925,
"step": 521
},
{
"epoch": 0.15553651905098886,
"grad_norm": 0.43377047777175903,
"learning_rate": 9.907755211284807e-06,
"loss": 0.8127,
"step": 522
},
{
"epoch": 0.15583448173116318,
"grad_norm": 0.41775742173194885,
"learning_rate": 9.906758394525905e-06,
"loss": 0.7722,
"step": 523
},
{
"epoch": 0.15613244441133747,
"grad_norm": 0.4309692978858948,
"learning_rate": 9.905756271453198e-06,
"loss": 0.799,
"step": 524
},
{
"epoch": 0.1564304070915118,
"grad_norm": 0.4536021053791046,
"learning_rate": 9.904748843150407e-06,
"loss": 0.7884,
"step": 525
},
{
"epoch": 0.1567283697716861,
"grad_norm": 0.4355040490627289,
"learning_rate": 9.903736110707001e-06,
"loss": 0.7667,
"step": 526
},
{
"epoch": 0.1570263324518604,
"grad_norm": 0.427120566368103,
"learning_rate": 9.902718075218176e-06,
"loss": 0.8023,
"step": 527
},
{
"epoch": 0.15732429513203472,
"grad_norm": 0.4440107047557831,
"learning_rate": 9.901694737784864e-06,
"loss": 0.8138,
"step": 528
},
{
"epoch": 0.157622257812209,
"grad_norm": 0.43493953347206116,
"learning_rate": 9.900666099513734e-06,
"loss": 0.8013,
"step": 529
},
{
"epoch": 0.15792022049238333,
"grad_norm": 0.4667569398880005,
"learning_rate": 9.899632161517187e-06,
"loss": 0.7905,
"step": 530
},
{
"epoch": 0.15821818317255765,
"grad_norm": 0.4225075840950012,
"learning_rate": 9.898592924913353e-06,
"loss": 0.7552,
"step": 531
},
{
"epoch": 0.15851614585273194,
"grad_norm": 0.4328601062297821,
"learning_rate": 9.897548390826092e-06,
"loss": 0.7987,
"step": 532
},
{
"epoch": 0.15881410853290626,
"grad_norm": 0.4600571393966675,
"learning_rate": 9.896498560384996e-06,
"loss": 0.8294,
"step": 533
},
{
"epoch": 0.15911207121308057,
"grad_norm": 0.4488067328929901,
"learning_rate": 9.895443434725382e-06,
"loss": 0.7989,
"step": 534
},
{
"epoch": 0.15941003389325487,
"grad_norm": 0.42931830883026123,
"learning_rate": 9.894383014988294e-06,
"loss": 0.809,
"step": 535
},
{
"epoch": 0.15970799657342918,
"grad_norm": 0.4342135787010193,
"learning_rate": 9.893317302320501e-06,
"loss": 0.7469,
"step": 536
},
{
"epoch": 0.16000595925360347,
"grad_norm": 0.46612685918807983,
"learning_rate": 9.892246297874497e-06,
"loss": 0.8434,
"step": 537
},
{
"epoch": 0.1603039219337778,
"grad_norm": 0.4532231092453003,
"learning_rate": 9.891170002808498e-06,
"loss": 0.8516,
"step": 538
},
{
"epoch": 0.1606018846139521,
"grad_norm": 0.43740570545196533,
"learning_rate": 9.89008841828644e-06,
"loss": 0.8111,
"step": 539
},
{
"epoch": 0.1608998472941264,
"grad_norm": 0.43567419052124023,
"learning_rate": 9.889001545477984e-06,
"loss": 0.7746,
"step": 540
},
{
"epoch": 0.16119780997430072,
"grad_norm": 0.4601244032382965,
"learning_rate": 9.8879093855585e-06,
"loss": 0.8171,
"step": 541
},
{
"epoch": 0.16149577265447504,
"grad_norm": 0.4863916039466858,
"learning_rate": 9.886811939709089e-06,
"loss": 0.7941,
"step": 542
},
{
"epoch": 0.16179373533464933,
"grad_norm": 0.4400995969772339,
"learning_rate": 9.885709209116557e-06,
"loss": 0.8239,
"step": 543
},
{
"epoch": 0.16209169801482365,
"grad_norm": 0.4158054292201996,
"learning_rate": 9.884601194973432e-06,
"loss": 0.6939,
"step": 544
},
{
"epoch": 0.16238966069499794,
"grad_norm": 0.4609817862510681,
"learning_rate": 9.883487898477951e-06,
"loss": 0.8264,
"step": 545
},
{
"epoch": 0.16268762337517226,
"grad_norm": 0.4383307099342346,
"learning_rate": 9.882369320834068e-06,
"loss": 0.7792,
"step": 546
},
{
"epoch": 0.16298558605534658,
"grad_norm": 0.45086583495140076,
"learning_rate": 9.881245463251446e-06,
"loss": 0.8195,
"step": 547
},
{
"epoch": 0.16328354873552087,
"grad_norm": 0.4327777326107025,
"learning_rate": 9.880116326945455e-06,
"loss": 0.7916,
"step": 548
},
{
"epoch": 0.16358151141569519,
"grad_norm": 0.48214995861053467,
"learning_rate": 9.878981913137178e-06,
"loss": 0.8232,
"step": 549
},
{
"epoch": 0.1638794740958695,
"grad_norm": 0.4498960077762604,
"learning_rate": 9.877842223053406e-06,
"loss": 0.8134,
"step": 550
},
{
"epoch": 0.1641774367760438,
"grad_norm": 0.44083136320114136,
"learning_rate": 9.876697257926632e-06,
"loss": 0.7982,
"step": 551
},
{
"epoch": 0.1644753994562181,
"grad_norm": 0.44026824831962585,
"learning_rate": 9.875547018995052e-06,
"loss": 0.7627,
"step": 552
},
{
"epoch": 0.1647733621363924,
"grad_norm": 0.44473356008529663,
"learning_rate": 9.874391507502572e-06,
"loss": 0.8151,
"step": 553
},
{
"epoch": 0.16507132481656672,
"grad_norm": 0.4847549498081207,
"learning_rate": 9.873230724698797e-06,
"loss": 0.8662,
"step": 554
},
{
"epoch": 0.16536928749674104,
"grad_norm": 0.4360688626766205,
"learning_rate": 9.872064671839029e-06,
"loss": 0.8291,
"step": 555
},
{
"epoch": 0.16566725017691533,
"grad_norm": 0.43601882457733154,
"learning_rate": 9.870893350184274e-06,
"loss": 0.8048,
"step": 556
},
{
"epoch": 0.16596521285708965,
"grad_norm": 0.43026798963546753,
"learning_rate": 9.869716761001234e-06,
"loss": 0.7519,
"step": 557
},
{
"epoch": 0.16626317553726397,
"grad_norm": 0.4556662440299988,
"learning_rate": 9.868534905562306e-06,
"loss": 0.8546,
"step": 558
},
{
"epoch": 0.16656113821743826,
"grad_norm": 0.4316236674785614,
"learning_rate": 9.867347785145584e-06,
"loss": 0.8059,
"step": 559
},
{
"epoch": 0.16685910089761258,
"grad_norm": 0.4355992078781128,
"learning_rate": 9.866155401034856e-06,
"loss": 0.8053,
"step": 560
},
{
"epoch": 0.16715706357778687,
"grad_norm": 0.43576356768608093,
"learning_rate": 9.864957754519602e-06,
"loss": 0.7883,
"step": 561
},
{
"epoch": 0.1674550262579612,
"grad_norm": 0.42826154828071594,
"learning_rate": 9.86375484689499e-06,
"loss": 0.7288,
"step": 562
},
{
"epoch": 0.1677529889381355,
"grad_norm": 0.43339434266090393,
"learning_rate": 9.862546679461882e-06,
"loss": 0.7906,
"step": 563
},
{
"epoch": 0.1680509516183098,
"grad_norm": 0.4407076835632324,
"learning_rate": 9.861333253526826e-06,
"loss": 0.8304,
"step": 564
},
{
"epoch": 0.16834891429848411,
"grad_norm": 0.44356203079223633,
"learning_rate": 9.860114570402055e-06,
"loss": 0.7864,
"step": 565
},
{
"epoch": 0.16864687697865843,
"grad_norm": 0.40894874930381775,
"learning_rate": 9.85889063140549e-06,
"loss": 0.7819,
"step": 566
},
{
"epoch": 0.16894483965883272,
"grad_norm": 0.41408395767211914,
"learning_rate": 9.857661437860735e-06,
"loss": 0.7753,
"step": 567
},
{
"epoch": 0.16924280233900704,
"grad_norm": 0.4399091303348541,
"learning_rate": 9.856426991097077e-06,
"loss": 0.81,
"step": 568
},
{
"epoch": 0.16954076501918136,
"grad_norm": 0.44112658500671387,
"learning_rate": 9.85518729244948e-06,
"loss": 0.7672,
"step": 569
},
{
"epoch": 0.16983872769935565,
"grad_norm": 0.4537188708782196,
"learning_rate": 9.853942343258596e-06,
"loss": 0.8056,
"step": 570
},
{
"epoch": 0.17013669037952997,
"grad_norm": 0.43311551213264465,
"learning_rate": 9.852692144870746e-06,
"loss": 0.7554,
"step": 571
},
{
"epoch": 0.17043465305970426,
"grad_norm": 0.43048954010009766,
"learning_rate": 9.851436698637932e-06,
"loss": 0.7763,
"step": 572
},
{
"epoch": 0.17073261573987858,
"grad_norm": 0.41111141443252563,
"learning_rate": 9.850176005917835e-06,
"loss": 0.7883,
"step": 573
},
{
"epoch": 0.1710305784200529,
"grad_norm": 0.4324644207954407,
"learning_rate": 9.848910068073799e-06,
"loss": 0.8108,
"step": 574
},
{
"epoch": 0.1713285411002272,
"grad_norm": 0.44743847846984863,
"learning_rate": 9.84763888647485e-06,
"loss": 0.7932,
"step": 575
},
{
"epoch": 0.1716265037804015,
"grad_norm": 0.4240942895412445,
"learning_rate": 9.846362462495682e-06,
"loss": 0.7787,
"step": 576
},
{
"epoch": 0.17192446646057583,
"grad_norm": 0.4266510605812073,
"learning_rate": 9.845080797516655e-06,
"loss": 0.7735,
"step": 577
},
{
"epoch": 0.17222242914075012,
"grad_norm": 0.4296509623527527,
"learning_rate": 9.843793892923801e-06,
"loss": 0.8095,
"step": 578
},
{
"epoch": 0.17252039182092443,
"grad_norm": 0.4529971480369568,
"learning_rate": 9.84250175010882e-06,
"loss": 0.7629,
"step": 579
},
{
"epoch": 0.17281835450109873,
"grad_norm": 0.4344656765460968,
"learning_rate": 9.841204370469066e-06,
"loss": 0.7899,
"step": 580
},
{
"epoch": 0.17311631718127304,
"grad_norm": 0.42121410369873047,
"learning_rate": 9.839901755407572e-06,
"loss": 0.7554,
"step": 581
},
{
"epoch": 0.17341427986144736,
"grad_norm": 0.43178629875183105,
"learning_rate": 9.838593906333018e-06,
"loss": 0.8102,
"step": 582
},
{
"epoch": 0.17371224254162165,
"grad_norm": 0.4340752959251404,
"learning_rate": 9.837280824659755e-06,
"loss": 0.7796,
"step": 583
},
{
"epoch": 0.17401020522179597,
"grad_norm": 0.4354706108570099,
"learning_rate": 9.835962511807786e-06,
"loss": 0.7993,
"step": 584
},
{
"epoch": 0.1743081679019703,
"grad_norm": 0.4145393967628479,
"learning_rate": 9.834638969202774e-06,
"loss": 0.7862,
"step": 585
},
{
"epoch": 0.17460613058214458,
"grad_norm": 0.4493045508861542,
"learning_rate": 9.833310198276037e-06,
"loss": 0.7941,
"step": 586
},
{
"epoch": 0.1749040932623189,
"grad_norm": 0.42931944131851196,
"learning_rate": 9.831976200464551e-06,
"loss": 0.8223,
"step": 587
},
{
"epoch": 0.1752020559424932,
"grad_norm": 0.4153611660003662,
"learning_rate": 9.830636977210934e-06,
"loss": 0.8077,
"step": 588
},
{
"epoch": 0.1755000186226675,
"grad_norm": 0.4432365894317627,
"learning_rate": 9.829292529963467e-06,
"loss": 0.8329,
"step": 589
},
{
"epoch": 0.17579798130284183,
"grad_norm": 0.4226314425468445,
"learning_rate": 9.827942860176072e-06,
"loss": 0.7617,
"step": 590
},
{
"epoch": 0.17609594398301612,
"grad_norm": 0.4454766809940338,
"learning_rate": 9.826587969308322e-06,
"loss": 0.8462,
"step": 591
},
{
"epoch": 0.17639390666319044,
"grad_norm": 0.4389556348323822,
"learning_rate": 9.825227858825439e-06,
"loss": 0.7955,
"step": 592
},
{
"epoch": 0.17669186934336475,
"grad_norm": 0.4516342878341675,
"learning_rate": 9.823862530198285e-06,
"loss": 0.7961,
"step": 593
},
{
"epoch": 0.17698983202353905,
"grad_norm": 0.43754610419273376,
"learning_rate": 9.822491984903367e-06,
"loss": 0.8458,
"step": 594
},
{
"epoch": 0.17728779470371336,
"grad_norm": 0.42383718490600586,
"learning_rate": 9.821116224422832e-06,
"loss": 0.817,
"step": 595
},
{
"epoch": 0.17758575738388765,
"grad_norm": 0.40285003185272217,
"learning_rate": 9.819735250244469e-06,
"loss": 0.8049,
"step": 596
},
{
"epoch": 0.17788372006406197,
"grad_norm": 0.43438956141471863,
"learning_rate": 9.818349063861703e-06,
"loss": 0.7974,
"step": 597
},
{
"epoch": 0.1781816827442363,
"grad_norm": 0.46335262060165405,
"learning_rate": 9.816957666773601e-06,
"loss": 0.7877,
"step": 598
},
{
"epoch": 0.17847964542441058,
"grad_norm": 0.42925113439559937,
"learning_rate": 9.815561060484857e-06,
"loss": 0.7703,
"step": 599
},
{
"epoch": 0.1787776081045849,
"grad_norm": 0.4416888952255249,
"learning_rate": 9.814159246505803e-06,
"loss": 0.7857,
"step": 600
},
{
"epoch": 0.17907557078475922,
"grad_norm": 0.42968836426734924,
"learning_rate": 9.812752226352405e-06,
"loss": 0.8382,
"step": 601
},
{
"epoch": 0.1793735334649335,
"grad_norm": 0.4462573230266571,
"learning_rate": 9.811340001546252e-06,
"loss": 0.7949,
"step": 602
},
{
"epoch": 0.17967149614510783,
"grad_norm": 0.4179610311985016,
"learning_rate": 9.80992257361457e-06,
"loss": 0.7564,
"step": 603
},
{
"epoch": 0.17996945882528215,
"grad_norm": 0.42330700159072876,
"learning_rate": 9.808499944090204e-06,
"loss": 0.7669,
"step": 604
},
{
"epoch": 0.18026742150545644,
"grad_norm": 0.42339032888412476,
"learning_rate": 9.80707211451163e-06,
"loss": 0.7717,
"step": 605
},
{
"epoch": 0.18056538418563076,
"grad_norm": 0.44835034012794495,
"learning_rate": 9.805639086422944e-06,
"loss": 0.8127,
"step": 606
},
{
"epoch": 0.18086334686580505,
"grad_norm": 0.4259271025657654,
"learning_rate": 9.804200861373866e-06,
"loss": 0.7793,
"step": 607
},
{
"epoch": 0.18116130954597937,
"grad_norm": 0.4456773102283478,
"learning_rate": 9.802757440919734e-06,
"loss": 0.7896,
"step": 608
},
{
"epoch": 0.18145927222615368,
"grad_norm": 0.4379430413246155,
"learning_rate": 9.801308826621505e-06,
"loss": 0.7806,
"step": 609
},
{
"epoch": 0.18175723490632797,
"grad_norm": 0.42454832792282104,
"learning_rate": 9.799855020045756e-06,
"loss": 0.8095,
"step": 610
},
{
"epoch": 0.1820551975865023,
"grad_norm": 0.41606152057647705,
"learning_rate": 9.798396022764673e-06,
"loss": 0.7679,
"step": 611
},
{
"epoch": 0.1823531602666766,
"grad_norm": 0.44875243306159973,
"learning_rate": 9.796931836356062e-06,
"loss": 0.8114,
"step": 612
},
{
"epoch": 0.1826511229468509,
"grad_norm": 0.43045029044151306,
"learning_rate": 9.795462462403339e-06,
"loss": 0.8003,
"step": 613
},
{
"epoch": 0.18294908562702522,
"grad_norm": 0.4321276843547821,
"learning_rate": 9.793987902495522e-06,
"loss": 0.8126,
"step": 614
},
{
"epoch": 0.1832470483071995,
"grad_norm": 0.43846407532691956,
"learning_rate": 9.79250815822725e-06,
"loss": 0.8485,
"step": 615
},
{
"epoch": 0.18354501098737383,
"grad_norm": 0.41620397567749023,
"learning_rate": 9.791023231198757e-06,
"loss": 0.7452,
"step": 616
},
{
"epoch": 0.18384297366754815,
"grad_norm": 0.42242833971977234,
"learning_rate": 9.789533123015893e-06,
"loss": 0.7881,
"step": 617
},
{
"epoch": 0.18414093634772244,
"grad_norm": 0.4209827482700348,
"learning_rate": 9.7880378352901e-06,
"loss": 0.8115,
"step": 618
},
{
"epoch": 0.18443889902789676,
"grad_norm": 0.44570618867874146,
"learning_rate": 9.786537369638429e-06,
"loss": 0.7989,
"step": 619
},
{
"epoch": 0.18473686170807108,
"grad_norm": 0.43412548303604126,
"learning_rate": 9.785031727683528e-06,
"loss": 0.7623,
"step": 620
},
{
"epoch": 0.18503482438824537,
"grad_norm": 0.43142980337142944,
"learning_rate": 9.783520911053642e-06,
"loss": 0.7922,
"step": 621
},
{
"epoch": 0.18533278706841969,
"grad_norm": 0.4264300763607025,
"learning_rate": 9.782004921382612e-06,
"loss": 0.8033,
"step": 622
},
{
"epoch": 0.18563074974859398,
"grad_norm": 0.4410349726676941,
"learning_rate": 9.780483760309876e-06,
"loss": 0.7679,
"step": 623
},
{
"epoch": 0.1859287124287683,
"grad_norm": 0.4315160810947418,
"learning_rate": 9.778957429480463e-06,
"loss": 0.7944,
"step": 624
},
{
"epoch": 0.1862266751089426,
"grad_norm": 0.4013029634952545,
"learning_rate": 9.77742593054499e-06,
"loss": 0.7517,
"step": 625
},
{
"epoch": 0.1865246377891169,
"grad_norm": 0.42481091618537903,
"learning_rate": 9.775889265159667e-06,
"loss": 0.7834,
"step": 626
},
{
"epoch": 0.18682260046929122,
"grad_norm": 0.41402578353881836,
"learning_rate": 9.774347434986287e-06,
"loss": 0.805,
"step": 627
},
{
"epoch": 0.18712056314946554,
"grad_norm": 0.42417481541633606,
"learning_rate": 9.772800441692234e-06,
"loss": 0.7971,
"step": 628
},
{
"epoch": 0.18741852582963983,
"grad_norm": 0.435710072517395,
"learning_rate": 9.771248286950472e-06,
"loss": 0.7968,
"step": 629
},
{
"epoch": 0.18771648850981415,
"grad_norm": 0.43094027042388916,
"learning_rate": 9.769690972439545e-06,
"loss": 0.8031,
"step": 630
},
{
"epoch": 0.18801445118998844,
"grad_norm": 0.4466800391674042,
"learning_rate": 9.768128499843579e-06,
"loss": 0.7671,
"step": 631
},
{
"epoch": 0.18831241387016276,
"grad_norm": 0.46381083130836487,
"learning_rate": 9.76656087085228e-06,
"loss": 0.8268,
"step": 632
},
{
"epoch": 0.18861037655033708,
"grad_norm": 0.4458133578300476,
"learning_rate": 9.76498808716093e-06,
"loss": 0.8519,
"step": 633
},
{
"epoch": 0.18890833923051137,
"grad_norm": 0.4384130537509918,
"learning_rate": 9.763410150470378e-06,
"loss": 0.8272,
"step": 634
},
{
"epoch": 0.1892063019106857,
"grad_norm": 0.4333811104297638,
"learning_rate": 9.761827062487056e-06,
"loss": 0.7305,
"step": 635
},
{
"epoch": 0.18950426459086,
"grad_norm": 0.4350837171077728,
"learning_rate": 9.760238824922962e-06,
"loss": 0.7394,
"step": 636
},
{
"epoch": 0.1898022272710343,
"grad_norm": 0.4497371315956116,
"learning_rate": 9.758645439495662e-06,
"loss": 0.8362,
"step": 637
},
{
"epoch": 0.19010018995120861,
"grad_norm": 0.4223061501979828,
"learning_rate": 9.757046907928291e-06,
"loss": 0.7831,
"step": 638
},
{
"epoch": 0.19039815263138293,
"grad_norm": 0.42122185230255127,
"learning_rate": 9.755443231949548e-06,
"loss": 0.7959,
"step": 639
},
{
"epoch": 0.19069611531155722,
"grad_norm": 0.44597384333610535,
"learning_rate": 9.753834413293695e-06,
"loss": 0.8639,
"step": 640
},
{
"epoch": 0.19099407799173154,
"grad_norm": 0.4516172409057617,
"learning_rate": 9.752220453700556e-06,
"loss": 0.7922,
"step": 641
},
{
"epoch": 0.19129204067190583,
"grad_norm": 0.42263251543045044,
"learning_rate": 9.750601354915516e-06,
"loss": 0.7685,
"step": 642
},
{
"epoch": 0.19159000335208015,
"grad_norm": 0.4246070384979248,
"learning_rate": 9.748977118689516e-06,
"loss": 0.7938,
"step": 643
},
{
"epoch": 0.19188796603225447,
"grad_norm": 0.4520643949508667,
"learning_rate": 9.747347746779052e-06,
"loss": 0.8233,
"step": 644
},
{
"epoch": 0.19218592871242876,
"grad_norm": 0.4654163718223572,
"learning_rate": 9.745713240946177e-06,
"loss": 0.8011,
"step": 645
},
{
"epoch": 0.19248389139260308,
"grad_norm": 0.43705472350120544,
"learning_rate": 9.744073602958493e-06,
"loss": 0.8239,
"step": 646
},
{
"epoch": 0.1927818540727774,
"grad_norm": 0.4432106614112854,
"learning_rate": 9.742428834589152e-06,
"loss": 0.824,
"step": 647
},
{
"epoch": 0.1930798167529517,
"grad_norm": 0.43141478300094604,
"learning_rate": 9.740778937616858e-06,
"loss": 0.7822,
"step": 648
},
{
"epoch": 0.193377779433126,
"grad_norm": 0.41723155975341797,
"learning_rate": 9.739123913825855e-06,
"loss": 0.7457,
"step": 649
},
{
"epoch": 0.1936757421133003,
"grad_norm": 0.44804713129997253,
"learning_rate": 9.737463765005934e-06,
"loss": 0.8064,
"step": 650
},
{
"epoch": 0.19397370479347462,
"grad_norm": 0.4133176803588867,
"learning_rate": 9.735798492952435e-06,
"loss": 0.8116,
"step": 651
},
{
"epoch": 0.19427166747364893,
"grad_norm": 0.4027285575866699,
"learning_rate": 9.734128099466227e-06,
"loss": 0.7512,
"step": 652
},
{
"epoch": 0.19456963015382323,
"grad_norm": 0.43247896432876587,
"learning_rate": 9.732452586353727e-06,
"loss": 0.7555,
"step": 653
},
{
"epoch": 0.19486759283399754,
"grad_norm": 0.4446199834346771,
"learning_rate": 9.73077195542688e-06,
"loss": 0.7942,
"step": 654
},
{
"epoch": 0.19516555551417186,
"grad_norm": 0.45987439155578613,
"learning_rate": 9.729086208503174e-06,
"loss": 0.8086,
"step": 655
},
{
"epoch": 0.19546351819434615,
"grad_norm": 0.4186185896396637,
"learning_rate": 9.727395347405624e-06,
"loss": 0.7624,
"step": 656
},
{
"epoch": 0.19576148087452047,
"grad_norm": 0.4368334114551544,
"learning_rate": 9.725699373962778e-06,
"loss": 0.7926,
"step": 657
},
{
"epoch": 0.19605944355469476,
"grad_norm": 0.42097970843315125,
"learning_rate": 9.723998290008709e-06,
"loss": 0.7777,
"step": 658
},
{
"epoch": 0.19635740623486908,
"grad_norm": 0.41991859674453735,
"learning_rate": 9.722292097383024e-06,
"loss": 0.7675,
"step": 659
},
{
"epoch": 0.1966553689150434,
"grad_norm": 0.40364593267440796,
"learning_rate": 9.720580797930845e-06,
"loss": 0.7554,
"step": 660
},
{
"epoch": 0.1969533315952177,
"grad_norm": 0.44092264771461487,
"learning_rate": 9.718864393502828e-06,
"loss": 0.7927,
"step": 661
},
{
"epoch": 0.197251294275392,
"grad_norm": 0.44391563534736633,
"learning_rate": 9.71714288595514e-06,
"loss": 0.7809,
"step": 662
},
{
"epoch": 0.19754925695556633,
"grad_norm": 0.42969176173210144,
"learning_rate": 9.715416277149469e-06,
"loss": 0.7927,
"step": 663
},
{
"epoch": 0.19784721963574062,
"grad_norm": 0.42373907566070557,
"learning_rate": 9.713684568953023e-06,
"loss": 0.8096,
"step": 664
},
{
"epoch": 0.19814518231591494,
"grad_norm": 0.4161315858364105,
"learning_rate": 9.711947763238523e-06,
"loss": 0.7868,
"step": 665
},
{
"epoch": 0.19844314499608923,
"grad_norm": 0.41807207465171814,
"learning_rate": 9.7102058618842e-06,
"loss": 0.7613,
"step": 666
},
{
"epoch": 0.19874110767626355,
"grad_norm": 0.4127216339111328,
"learning_rate": 9.708458866773803e-06,
"loss": 0.7741,
"step": 667
},
{
"epoch": 0.19903907035643786,
"grad_norm": 0.4264618158340454,
"learning_rate": 9.706706779796576e-06,
"loss": 0.7858,
"step": 668
},
{
"epoch": 0.19933703303661215,
"grad_norm": 0.42385661602020264,
"learning_rate": 9.704949602847282e-06,
"loss": 0.7293,
"step": 669
},
{
"epoch": 0.19963499571678647,
"grad_norm": 0.39673057198524475,
"learning_rate": 9.703187337826186e-06,
"loss": 0.7404,
"step": 670
},
{
"epoch": 0.1999329583969608,
"grad_norm": 0.44886425137519836,
"learning_rate": 9.70141998663905e-06,
"loss": 0.7941,
"step": 671
},
{
"epoch": 0.20023092107713508,
"grad_norm": 0.42239850759506226,
"learning_rate": 9.699647551197142e-06,
"loss": 0.7402,
"step": 672
},
{
"epoch": 0.2005288837573094,
"grad_norm": 0.4410516321659088,
"learning_rate": 9.697870033417226e-06,
"loss": 0.811,
"step": 673
},
{
"epoch": 0.2008268464374837,
"grad_norm": 0.4216253161430359,
"learning_rate": 9.696087435221562e-06,
"loss": 0.7885,
"step": 674
},
{
"epoch": 0.201124809117658,
"grad_norm": 0.4702225625514984,
"learning_rate": 9.694299758537905e-06,
"loss": 0.7736,
"step": 675
},
{
"epoch": 0.20142277179783233,
"grad_norm": 0.42728155851364136,
"learning_rate": 9.692507005299499e-06,
"loss": 0.7845,
"step": 676
},
{
"epoch": 0.20172073447800662,
"grad_norm": 0.42187970876693726,
"learning_rate": 9.690709177445084e-06,
"loss": 0.7835,
"step": 677
},
{
"epoch": 0.20201869715818094,
"grad_norm": 0.43492591381073,
"learning_rate": 9.688906276918883e-06,
"loss": 0.8551,
"step": 678
},
{
"epoch": 0.20231665983835526,
"grad_norm": 0.4242953062057495,
"learning_rate": 9.687098305670606e-06,
"loss": 0.7807,
"step": 679
},
{
"epoch": 0.20261462251852955,
"grad_norm": 0.432669073343277,
"learning_rate": 9.685285265655444e-06,
"loss": 0.7859,
"step": 680
},
{
"epoch": 0.20291258519870387,
"grad_norm": 0.4527738094329834,
"learning_rate": 9.683467158834076e-06,
"loss": 0.8505,
"step": 681
},
{
"epoch": 0.20321054787887818,
"grad_norm": 0.4490962624549866,
"learning_rate": 9.681643987172656e-06,
"loss": 0.7705,
"step": 682
},
{
"epoch": 0.20350851055905247,
"grad_norm": 0.45207998156547546,
"learning_rate": 9.679815752642814e-06,
"loss": 0.8056,
"step": 683
},
{
"epoch": 0.2038064732392268,
"grad_norm": 0.41610947251319885,
"learning_rate": 9.677982457221658e-06,
"loss": 0.7969,
"step": 684
},
{
"epoch": 0.20410443591940108,
"grad_norm": 0.43091249465942383,
"learning_rate": 9.67614410289177e-06,
"loss": 0.7579,
"step": 685
},
{
"epoch": 0.2044023985995754,
"grad_norm": 0.4388624429702759,
"learning_rate": 9.674300691641194e-06,
"loss": 0.8206,
"step": 686
},
{
"epoch": 0.20470036127974972,
"grad_norm": 0.4264225661754608,
"learning_rate": 9.672452225463458e-06,
"loss": 0.7646,
"step": 687
},
{
"epoch": 0.204998323959924,
"grad_norm": 0.4286390542984009,
"learning_rate": 9.67059870635754e-06,
"loss": 0.7895,
"step": 688
},
{
"epoch": 0.20529628664009833,
"grad_norm": 0.4425601363182068,
"learning_rate": 9.668740136327898e-06,
"loss": 0.8329,
"step": 689
},
{
"epoch": 0.20559424932027265,
"grad_norm": 0.43018388748168945,
"learning_rate": 9.666876517384441e-06,
"loss": 0.8013,
"step": 690
},
{
"epoch": 0.20589221200044694,
"grad_norm": 0.43569415807724,
"learning_rate": 9.665007851542541e-06,
"loss": 0.7902,
"step": 691
},
{
"epoch": 0.20619017468062126,
"grad_norm": 0.4561285376548767,
"learning_rate": 9.663134140823031e-06,
"loss": 0.7929,
"step": 692
},
{
"epoch": 0.20648813736079555,
"grad_norm": 0.4289737045764923,
"learning_rate": 9.661255387252195e-06,
"loss": 0.7618,
"step": 693
},
{
"epoch": 0.20678610004096987,
"grad_norm": 0.42727822065353394,
"learning_rate": 9.659371592861772e-06,
"loss": 0.78,
"step": 694
},
{
"epoch": 0.20708406272114419,
"grad_norm": 0.4296926259994507,
"learning_rate": 9.657482759688957e-06,
"loss": 0.7892,
"step": 695
},
{
"epoch": 0.20738202540131848,
"grad_norm": 0.41573357582092285,
"learning_rate": 9.655588889776385e-06,
"loss": 0.782,
"step": 696
},
{
"epoch": 0.2076799880814928,
"grad_norm": 0.4244844913482666,
"learning_rate": 9.653689985172148e-06,
"loss": 0.7801,
"step": 697
},
{
"epoch": 0.2079779507616671,
"grad_norm": 0.4274427890777588,
"learning_rate": 9.651786047929772e-06,
"loss": 0.8083,
"step": 698
},
{
"epoch": 0.2082759134418414,
"grad_norm": 0.4298088848590851,
"learning_rate": 9.649877080108239e-06,
"loss": 0.7653,
"step": 699
},
{
"epoch": 0.20857387612201572,
"grad_norm": 0.42512091994285583,
"learning_rate": 9.647963083771957e-06,
"loss": 0.7663,
"step": 700
},
{
"epoch": 0.20887183880219,
"grad_norm": 0.43867695331573486,
"learning_rate": 9.646044060990778e-06,
"loss": 0.7577,
"step": 701
},
{
"epoch": 0.20916980148236433,
"grad_norm": 0.41578346490859985,
"learning_rate": 9.644120013839993e-06,
"loss": 0.751,
"step": 702
},
{
"epoch": 0.20946776416253865,
"grad_norm": 0.43413275480270386,
"learning_rate": 9.642190944400323e-06,
"loss": 0.7961,
"step": 703
},
{
"epoch": 0.20976572684271294,
"grad_norm": 0.42895805835723877,
"learning_rate": 9.640256854757921e-06,
"loss": 0.7646,
"step": 704
},
{
"epoch": 0.21006368952288726,
"grad_norm": 0.435161292552948,
"learning_rate": 9.638317747004369e-06,
"loss": 0.8106,
"step": 705
},
{
"epoch": 0.21036165220306158,
"grad_norm": 0.44427958130836487,
"learning_rate": 9.636373623236672e-06,
"loss": 0.8242,
"step": 706
},
{
"epoch": 0.21065961488323587,
"grad_norm": 0.4361114203929901,
"learning_rate": 9.634424485557267e-06,
"loss": 0.778,
"step": 707
},
{
"epoch": 0.2109575775634102,
"grad_norm": 0.4281226396560669,
"learning_rate": 9.632470336074009e-06,
"loss": 0.8027,
"step": 708
},
{
"epoch": 0.21125554024358448,
"grad_norm": 0.46293097734451294,
"learning_rate": 9.630511176900172e-06,
"loss": 0.776,
"step": 709
},
{
"epoch": 0.2115535029237588,
"grad_norm": 0.4355347156524658,
"learning_rate": 9.628547010154449e-06,
"loss": 0.7818,
"step": 710
},
{
"epoch": 0.21185146560393311,
"grad_norm": 0.4164382517337799,
"learning_rate": 9.626577837960947e-06,
"loss": 0.7261,
"step": 711
},
{
"epoch": 0.2121494282841074,
"grad_norm": 0.41936761140823364,
"learning_rate": 9.624603662449188e-06,
"loss": 0.773,
"step": 712
},
{
"epoch": 0.21244739096428172,
"grad_norm": 0.4399654269218445,
"learning_rate": 9.622624485754104e-06,
"loss": 0.8033,
"step": 713
},
{
"epoch": 0.21274535364445604,
"grad_norm": 0.4517837464809418,
"learning_rate": 9.620640310016036e-06,
"loss": 0.7878,
"step": 714
},
{
"epoch": 0.21304331632463033,
"grad_norm": 0.43587207794189453,
"learning_rate": 9.618651137380729e-06,
"loss": 0.7804,
"step": 715
},
{
"epoch": 0.21334127900480465,
"grad_norm": 0.43398943543434143,
"learning_rate": 9.616656969999334e-06,
"loss": 0.8006,
"step": 716
},
{
"epoch": 0.21363924168497897,
"grad_norm": 0.4055356979370117,
"learning_rate": 9.614657810028402e-06,
"loss": 0.7497,
"step": 717
},
{
"epoch": 0.21393720436515326,
"grad_norm": 0.4483594000339508,
"learning_rate": 9.612653659629884e-06,
"loss": 0.767,
"step": 718
},
{
"epoch": 0.21423516704532758,
"grad_norm": 0.4102337956428528,
"learning_rate": 9.610644520971129e-06,
"loss": 0.7865,
"step": 719
},
{
"epoch": 0.21453312972550187,
"grad_norm": 0.4471098482608795,
"learning_rate": 9.608630396224876e-06,
"loss": 0.8297,
"step": 720
},
{
"epoch": 0.2148310924056762,
"grad_norm": 0.43259644508361816,
"learning_rate": 9.60661128756926e-06,
"loss": 0.7794,
"step": 721
},
{
"epoch": 0.2151290550858505,
"grad_norm": 0.42484912276268005,
"learning_rate": 9.604587197187809e-06,
"loss": 0.776,
"step": 722
},
{
"epoch": 0.2154270177660248,
"grad_norm": 0.4235680401325226,
"learning_rate": 9.60255812726943e-06,
"loss": 0.7792,
"step": 723
},
{
"epoch": 0.21572498044619912,
"grad_norm": 0.42460060119628906,
"learning_rate": 9.60052408000842e-06,
"loss": 0.803,
"step": 724
},
{
"epoch": 0.21602294312637343,
"grad_norm": 0.4313143789768219,
"learning_rate": 9.598485057604458e-06,
"loss": 0.8066,
"step": 725
},
{
"epoch": 0.21632090580654773,
"grad_norm": 0.4411279857158661,
"learning_rate": 9.596441062262602e-06,
"loss": 0.8148,
"step": 726
},
{
"epoch": 0.21661886848672204,
"grad_norm": 0.4420432150363922,
"learning_rate": 9.594392096193294e-06,
"loss": 0.8216,
"step": 727
},
{
"epoch": 0.21691683116689633,
"grad_norm": 0.416064977645874,
"learning_rate": 9.59233816161234e-06,
"loss": 0.7754,
"step": 728
},
{
"epoch": 0.21721479384707065,
"grad_norm": 0.4199596345424652,
"learning_rate": 9.590279260740932e-06,
"loss": 0.7731,
"step": 729
},
{
"epoch": 0.21751275652724497,
"grad_norm": 0.45451900362968445,
"learning_rate": 9.58821539580562e-06,
"loss": 0.8086,
"step": 730
},
{
"epoch": 0.21781071920741926,
"grad_norm": 0.4234671890735626,
"learning_rate": 9.586146569038332e-06,
"loss": 0.7796,
"step": 731
},
{
"epoch": 0.21810868188759358,
"grad_norm": 0.4232145845890045,
"learning_rate": 9.58407278267636e-06,
"loss": 0.7848,
"step": 732
},
{
"epoch": 0.2184066445677679,
"grad_norm": 0.4316883683204651,
"learning_rate": 9.581994038962356e-06,
"loss": 0.856,
"step": 733
},
{
"epoch": 0.2187046072479422,
"grad_norm": 0.418620228767395,
"learning_rate": 9.579910340144335e-06,
"loss": 0.7897,
"step": 734
},
{
"epoch": 0.2190025699281165,
"grad_norm": 0.4266875386238098,
"learning_rate": 9.57782168847567e-06,
"loss": 0.8091,
"step": 735
},
{
"epoch": 0.2193005326082908,
"grad_norm": 0.4251216650009155,
"learning_rate": 9.575728086215093e-06,
"loss": 0.7564,
"step": 736
},
{
"epoch": 0.21959849528846512,
"grad_norm": 0.4177185893058777,
"learning_rate": 9.573629535626685e-06,
"loss": 0.7581,
"step": 737
},
{
"epoch": 0.21989645796863944,
"grad_norm": 0.4171562194824219,
"learning_rate": 9.571526038979883e-06,
"loss": 0.7498,
"step": 738
},
{
"epoch": 0.22019442064881373,
"grad_norm": 0.4368170201778412,
"learning_rate": 9.56941759854947e-06,
"loss": 0.7649,
"step": 739
},
{
"epoch": 0.22049238332898805,
"grad_norm": 0.4155232310295105,
"learning_rate": 9.567304216615574e-06,
"loss": 0.7696,
"step": 740
},
{
"epoch": 0.22079034600916236,
"grad_norm": 0.43209031224250793,
"learning_rate": 9.565185895463669e-06,
"loss": 0.7839,
"step": 741
},
{
"epoch": 0.22108830868933665,
"grad_norm": 0.4308395981788635,
"learning_rate": 9.563062637384574e-06,
"loss": 0.8109,
"step": 742
},
{
"epoch": 0.22138627136951097,
"grad_norm": 0.4467731714248657,
"learning_rate": 9.560934444674438e-06,
"loss": 0.762,
"step": 743
},
{
"epoch": 0.22168423404968526,
"grad_norm": 0.4020759165287018,
"learning_rate": 9.558801319634756e-06,
"loss": 0.7837,
"step": 744
},
{
"epoch": 0.22198219672985958,
"grad_norm": 0.44077128171920776,
"learning_rate": 9.55666326457235e-06,
"loss": 0.7307,
"step": 745
},
{
"epoch": 0.2222801594100339,
"grad_norm": 0.41470280289649963,
"learning_rate": 9.554520281799377e-06,
"loss": 0.7745,
"step": 746
},
{
"epoch": 0.2225781220902082,
"grad_norm": 0.45578980445861816,
"learning_rate": 9.552372373633321e-06,
"loss": 0.7452,
"step": 747
},
{
"epoch": 0.2228760847703825,
"grad_norm": 0.4778141975402832,
"learning_rate": 9.550219542396995e-06,
"loss": 0.8343,
"step": 748
},
{
"epoch": 0.22317404745055683,
"grad_norm": 0.4479955732822418,
"learning_rate": 9.548061790418533e-06,
"loss": 0.7938,
"step": 749
},
{
"epoch": 0.22347201013073112,
"grad_norm": 0.4472423493862152,
"learning_rate": 9.545899120031392e-06,
"loss": 0.8185,
"step": 750
},
{
"epoch": 0.22376997281090544,
"grad_norm": 0.43098342418670654,
"learning_rate": 9.543731533574349e-06,
"loss": 0.7369,
"step": 751
},
{
"epoch": 0.22406793549107976,
"grad_norm": 0.4441945254802704,
"learning_rate": 9.541559033391497e-06,
"loss": 0.8144,
"step": 752
},
{
"epoch": 0.22436589817125405,
"grad_norm": 0.4668506681919098,
"learning_rate": 9.539381621832238e-06,
"loss": 0.7816,
"step": 753
},
{
"epoch": 0.22466386085142837,
"grad_norm": 0.4563528001308441,
"learning_rate": 9.537199301251292e-06,
"loss": 0.7993,
"step": 754
},
{
"epoch": 0.22496182353160266,
"grad_norm": 0.46585187315940857,
"learning_rate": 9.535012074008688e-06,
"loss": 0.8185,
"step": 755
},
{
"epoch": 0.22525978621177697,
"grad_norm": 0.4397961497306824,
"learning_rate": 9.532819942469752e-06,
"loss": 0.78,
"step": 756
},
{
"epoch": 0.2255577488919513,
"grad_norm": 0.4402436912059784,
"learning_rate": 9.530622909005125e-06,
"loss": 0.7768,
"step": 757
},
{
"epoch": 0.22585571157212558,
"grad_norm": 0.42605629563331604,
"learning_rate": 9.52842097599074e-06,
"loss": 0.7831,
"step": 758
},
{
"epoch": 0.2261536742522999,
"grad_norm": 0.4295814335346222,
"learning_rate": 9.526214145807837e-06,
"loss": 0.7579,
"step": 759
},
{
"epoch": 0.22645163693247422,
"grad_norm": 0.4337891936302185,
"learning_rate": 9.524002420842944e-06,
"loss": 0.7983,
"step": 760
},
{
"epoch": 0.2267495996126485,
"grad_norm": 0.4422137141227722,
"learning_rate": 9.521785803487888e-06,
"loss": 0.7739,
"step": 761
},
{
"epoch": 0.22704756229282283,
"grad_norm": 0.4624215364456177,
"learning_rate": 9.519564296139784e-06,
"loss": 0.7989,
"step": 762
},
{
"epoch": 0.22734552497299712,
"grad_norm": 0.43990784883499146,
"learning_rate": 9.517337901201035e-06,
"loss": 0.7734,
"step": 763
},
{
"epoch": 0.22764348765317144,
"grad_norm": 0.4662509262561798,
"learning_rate": 9.51510662107933e-06,
"loss": 0.7337,
"step": 764
},
{
"epoch": 0.22794145033334576,
"grad_norm": 0.432779461145401,
"learning_rate": 9.512870458187644e-06,
"loss": 0.7488,
"step": 765
},
{
"epoch": 0.22823941301352005,
"grad_norm": 0.43612611293792725,
"learning_rate": 9.510629414944229e-06,
"loss": 0.7623,
"step": 766
},
{
"epoch": 0.22853737569369437,
"grad_norm": 0.43317270278930664,
"learning_rate": 9.508383493772612e-06,
"loss": 0.7859,
"step": 767
},
{
"epoch": 0.22883533837386869,
"grad_norm": 0.4157412052154541,
"learning_rate": 9.506132697101601e-06,
"loss": 0.7705,
"step": 768
},
{
"epoch": 0.22913330105404298,
"grad_norm": 0.4522346258163452,
"learning_rate": 9.503877027365277e-06,
"loss": 0.7669,
"step": 769
},
{
"epoch": 0.2294312637342173,
"grad_norm": 0.421061247587204,
"learning_rate": 9.501616487002985e-06,
"loss": 0.7731,
"step": 770
},
{
"epoch": 0.22972922641439159,
"grad_norm": 0.4325638711452484,
"learning_rate": 9.49935107845934e-06,
"loss": 0.7923,
"step": 771
},
{
"epoch": 0.2300271890945659,
"grad_norm": 0.4344866871833801,
"learning_rate": 9.497080804184225e-06,
"loss": 0.7785,
"step": 772
},
{
"epoch": 0.23032515177474022,
"grad_norm": 0.43878617882728577,
"learning_rate": 9.494805666632776e-06,
"loss": 0.8048,
"step": 773
},
{
"epoch": 0.2306231144549145,
"grad_norm": 0.4464431703090668,
"learning_rate": 9.4925256682654e-06,
"loss": 0.8073,
"step": 774
},
{
"epoch": 0.23092107713508883,
"grad_norm": 0.4230099022388458,
"learning_rate": 9.490240811547751e-06,
"loss": 0.7575,
"step": 775
},
{
"epoch": 0.23121903981526315,
"grad_norm": 0.427174836397171,
"learning_rate": 9.487951098950744e-06,
"loss": 0.7523,
"step": 776
},
{
"epoch": 0.23151700249543744,
"grad_norm": 0.4299585819244385,
"learning_rate": 9.485656532950536e-06,
"loss": 0.7831,
"step": 777
},
{
"epoch": 0.23181496517561176,
"grad_norm": 0.4389987587928772,
"learning_rate": 9.483357116028547e-06,
"loss": 0.7697,
"step": 778
},
{
"epoch": 0.23211292785578605,
"grad_norm": 0.42435556650161743,
"learning_rate": 9.481052850671427e-06,
"loss": 0.7929,
"step": 779
},
{
"epoch": 0.23241089053596037,
"grad_norm": 0.4137531518936157,
"learning_rate": 9.47874373937108e-06,
"loss": 0.8013,
"step": 780
},
{
"epoch": 0.2327088532161347,
"grad_norm": 0.44368651509284973,
"learning_rate": 9.47642978462465e-06,
"loss": 0.7884,
"step": 781
},
{
"epoch": 0.23300681589630898,
"grad_norm": 0.44244545698165894,
"learning_rate": 9.474110988934512e-06,
"loss": 0.8247,
"step": 782
},
{
"epoch": 0.2333047785764833,
"grad_norm": 0.44734129309654236,
"learning_rate": 9.471787354808282e-06,
"loss": 0.7893,
"step": 783
},
{
"epoch": 0.23360274125665761,
"grad_norm": 0.4227079153060913,
"learning_rate": 9.469458884758807e-06,
"loss": 0.7755,
"step": 784
},
{
"epoch": 0.2339007039368319,
"grad_norm": 0.41927456855773926,
"learning_rate": 9.467125581304163e-06,
"loss": 0.7707,
"step": 785
},
{
"epoch": 0.23419866661700622,
"grad_norm": 0.425749272108078,
"learning_rate": 9.464787446967652e-06,
"loss": 0.8195,
"step": 786
},
{
"epoch": 0.23449662929718051,
"grad_norm": 0.41660913825035095,
"learning_rate": 9.462444484277804e-06,
"loss": 0.7799,
"step": 787
},
{
"epoch": 0.23479459197735483,
"grad_norm": 0.4416177570819855,
"learning_rate": 9.460096695768367e-06,
"loss": 0.8226,
"step": 788
},
{
"epoch": 0.23509255465752915,
"grad_norm": 0.43158313632011414,
"learning_rate": 9.45774408397831e-06,
"loss": 0.805,
"step": 789
},
{
"epoch": 0.23539051733770344,
"grad_norm": 0.4583197832107544,
"learning_rate": 9.455386651451816e-06,
"loss": 0.8215,
"step": 790
},
{
"epoch": 0.23568848001787776,
"grad_norm": 0.45189374685287476,
"learning_rate": 9.453024400738282e-06,
"loss": 0.8024,
"step": 791
},
{
"epoch": 0.23598644269805208,
"grad_norm": 0.44654136896133423,
"learning_rate": 9.450657334392317e-06,
"loss": 0.7515,
"step": 792
},
{
"epoch": 0.23628440537822637,
"grad_norm": 0.43402862548828125,
"learning_rate": 9.448285454973739e-06,
"loss": 0.7531,
"step": 793
},
{
"epoch": 0.2365823680584007,
"grad_norm": 0.4220675528049469,
"learning_rate": 9.445908765047562e-06,
"loss": 0.7749,
"step": 794
},
{
"epoch": 0.236880330738575,
"grad_norm": 0.43355607986450195,
"learning_rate": 9.443527267184015e-06,
"loss": 0.8116,
"step": 795
},
{
"epoch": 0.2371782934187493,
"grad_norm": 0.4237934648990631,
"learning_rate": 9.441140963958515e-06,
"loss": 0.7694,
"step": 796
},
{
"epoch": 0.23747625609892362,
"grad_norm": 0.43690553307533264,
"learning_rate": 9.438749857951687e-06,
"loss": 0.8207,
"step": 797
},
{
"epoch": 0.2377742187790979,
"grad_norm": 0.41167691349983215,
"learning_rate": 9.43635395174934e-06,
"loss": 0.7473,
"step": 798
},
{
"epoch": 0.23807218145927223,
"grad_norm": 0.4328611493110657,
"learning_rate": 9.433953247942478e-06,
"loss": 0.8034,
"step": 799
},
{
"epoch": 0.23837014413944654,
"grad_norm": 0.4350714087486267,
"learning_rate": 9.431547749127295e-06,
"loss": 0.7804,
"step": 800
},
{
"epoch": 0.23866810681962083,
"grad_norm": 0.4180876314640045,
"learning_rate": 9.429137457905166e-06,
"loss": 0.7845,
"step": 801
},
{
"epoch": 0.23896606949979515,
"grad_norm": 0.4411112368106842,
"learning_rate": 9.426722376882654e-06,
"loss": 0.7987,
"step": 802
},
{
"epoch": 0.23926403217996947,
"grad_norm": 0.44115111231803894,
"learning_rate": 9.424302508671497e-06,
"loss": 0.7925,
"step": 803
},
{
"epoch": 0.23956199486014376,
"grad_norm": 0.42536935210227966,
"learning_rate": 9.421877855888615e-06,
"loss": 0.838,
"step": 804
},
{
"epoch": 0.23985995754031808,
"grad_norm": 0.4546675682067871,
"learning_rate": 9.419448421156096e-06,
"loss": 0.799,
"step": 805
},
{
"epoch": 0.24015792022049237,
"grad_norm": 0.4298256039619446,
"learning_rate": 9.417014207101202e-06,
"loss": 0.7974,
"step": 806
},
{
"epoch": 0.2404558829006667,
"grad_norm": 0.40863409638404846,
"learning_rate": 9.41457521635637e-06,
"loss": 0.7688,
"step": 807
},
{
"epoch": 0.240753845580841,
"grad_norm": 0.412500262260437,
"learning_rate": 9.41213145155919e-06,
"loss": 0.7326,
"step": 808
},
{
"epoch": 0.2410518082610153,
"grad_norm": 0.4555049538612366,
"learning_rate": 9.409682915352427e-06,
"loss": 0.8555,
"step": 809
},
{
"epoch": 0.24134977094118962,
"grad_norm": 0.4487118422985077,
"learning_rate": 9.407229610383996e-06,
"loss": 0.7483,
"step": 810
},
{
"epoch": 0.24164773362136394,
"grad_norm": 0.42939889430999756,
"learning_rate": 9.404771539306978e-06,
"loss": 0.7861,
"step": 811
},
{
"epoch": 0.24194569630153823,
"grad_norm": 0.4451942443847656,
"learning_rate": 9.4023087047796e-06,
"loss": 0.7841,
"step": 812
},
{
"epoch": 0.24224365898171255,
"grad_norm": 0.4249064326286316,
"learning_rate": 9.399841109465246e-06,
"loss": 0.7772,
"step": 813
},
{
"epoch": 0.24254162166188684,
"grad_norm": 0.4324464797973633,
"learning_rate": 9.397368756032445e-06,
"loss": 0.7329,
"step": 814
},
{
"epoch": 0.24283958434206115,
"grad_norm": 0.43788713216781616,
"learning_rate": 9.394891647154879e-06,
"loss": 0.7951,
"step": 815
},
{
"epoch": 0.24313754702223547,
"grad_norm": 0.43921443819999695,
"learning_rate": 9.392409785511358e-06,
"loss": 0.7611,
"step": 816
},
{
"epoch": 0.24343550970240976,
"grad_norm": 0.43168798089027405,
"learning_rate": 9.389923173785847e-06,
"loss": 0.7818,
"step": 817
},
{
"epoch": 0.24373347238258408,
"grad_norm": 0.4548880159854889,
"learning_rate": 9.38743181466744e-06,
"loss": 0.7766,
"step": 818
},
{
"epoch": 0.2440314350627584,
"grad_norm": 0.41462671756744385,
"learning_rate": 9.384935710850364e-06,
"loss": 0.7705,
"step": 819
},
{
"epoch": 0.2443293977429327,
"grad_norm": 0.43719160556793213,
"learning_rate": 9.382434865033985e-06,
"loss": 0.8107,
"step": 820
},
{
"epoch": 0.244627360423107,
"grad_norm": 0.4376414716243744,
"learning_rate": 9.379929279922785e-06,
"loss": 0.8114,
"step": 821
},
{
"epoch": 0.2449253231032813,
"grad_norm": 0.4218818247318268,
"learning_rate": 9.377418958226385e-06,
"loss": 0.8041,
"step": 822
},
{
"epoch": 0.24522328578345562,
"grad_norm": 0.4311424791812897,
"learning_rate": 9.374903902659516e-06,
"loss": 0.8029,
"step": 823
},
{
"epoch": 0.24552124846362994,
"grad_norm": 0.4447738528251648,
"learning_rate": 9.372384115942034e-06,
"loss": 0.7575,
"step": 824
},
{
"epoch": 0.24581921114380423,
"grad_norm": 0.4226698577404022,
"learning_rate": 9.369859600798914e-06,
"loss": 0.7984,
"step": 825
},
{
"epoch": 0.24611717382397855,
"grad_norm": 0.4459143280982971,
"learning_rate": 9.367330359960239e-06,
"loss": 0.7921,
"step": 826
},
{
"epoch": 0.24641513650415287,
"grad_norm": 0.4613502621650696,
"learning_rate": 9.364796396161207e-06,
"loss": 0.7864,
"step": 827
},
{
"epoch": 0.24671309918432716,
"grad_norm": 0.4165995121002197,
"learning_rate": 9.362257712142118e-06,
"loss": 0.7657,
"step": 828
},
{
"epoch": 0.24701106186450147,
"grad_norm": 0.43035662174224854,
"learning_rate": 9.359714310648383e-06,
"loss": 0.7977,
"step": 829
},
{
"epoch": 0.2473090245446758,
"grad_norm": 0.4222116768360138,
"learning_rate": 9.357166194430509e-06,
"loss": 0.7441,
"step": 830
},
{
"epoch": 0.24760698722485008,
"grad_norm": 0.4269767999649048,
"learning_rate": 9.354613366244108e-06,
"loss": 0.7722,
"step": 831
},
{
"epoch": 0.2479049499050244,
"grad_norm": 0.42692193388938904,
"learning_rate": 9.352055828849879e-06,
"loss": 0.7902,
"step": 832
},
{
"epoch": 0.2482029125851987,
"grad_norm": 0.4122476577758789,
"learning_rate": 9.349493585013625e-06,
"loss": 0.7554,
"step": 833
},
{
"epoch": 0.248500875265373,
"grad_norm": 0.4117843210697174,
"learning_rate": 9.346926637506229e-06,
"loss": 0.7574,
"step": 834
},
{
"epoch": 0.24879883794554733,
"grad_norm": 0.45276376605033875,
"learning_rate": 9.344354989103662e-06,
"loss": 0.8091,
"step": 835
},
{
"epoch": 0.24909680062572162,
"grad_norm": 0.43268904089927673,
"learning_rate": 9.341778642586984e-06,
"loss": 0.7779,
"step": 836
},
{
"epoch": 0.24939476330589594,
"grad_norm": 0.43194571137428284,
"learning_rate": 9.339197600742331e-06,
"loss": 0.7645,
"step": 837
},
{
"epoch": 0.24969272598607026,
"grad_norm": 0.4139356017112732,
"learning_rate": 9.33661186636092e-06,
"loss": 0.7613,
"step": 838
},
{
"epoch": 0.24999068866624455,
"grad_norm": 0.43096089363098145,
"learning_rate": 9.334021442239036e-06,
"loss": 0.7913,
"step": 839
},
{
"epoch": 0.25028865134641887,
"grad_norm": 0.43842166662216187,
"learning_rate": 9.331426331178044e-06,
"loss": 0.8234,
"step": 840
},
{
"epoch": 0.25058661402659316,
"grad_norm": 0.4145195186138153,
"learning_rate": 9.328826535984374e-06,
"loss": 0.7251,
"step": 841
},
{
"epoch": 0.2508845767067675,
"grad_norm": 0.4230673611164093,
"learning_rate": 9.32622205946952e-06,
"loss": 0.776,
"step": 842
},
{
"epoch": 0.2511825393869418,
"grad_norm": 0.43353500962257385,
"learning_rate": 9.32361290445004e-06,
"loss": 0.7712,
"step": 843
},
{
"epoch": 0.2514805020671161,
"grad_norm": 0.43357017636299133,
"learning_rate": 9.320999073747557e-06,
"loss": 0.7598,
"step": 844
},
{
"epoch": 0.2517784647472904,
"grad_norm": 0.4321240186691284,
"learning_rate": 9.318380570188735e-06,
"loss": 0.7898,
"step": 845
},
{
"epoch": 0.2520764274274647,
"grad_norm": 0.4369968771934509,
"learning_rate": 9.315757396605309e-06,
"loss": 0.7819,
"step": 846
},
{
"epoch": 0.252374390107639,
"grad_norm": 0.45207393169403076,
"learning_rate": 9.313129555834053e-06,
"loss": 0.8038,
"step": 847
},
{
"epoch": 0.2526723527878133,
"grad_norm": 0.43222910165786743,
"learning_rate": 9.310497050716794e-06,
"loss": 0.7921,
"step": 848
},
{
"epoch": 0.25297031546798765,
"grad_norm": 0.41990602016448975,
"learning_rate": 9.307859884100399e-06,
"loss": 0.7962,
"step": 849
},
{
"epoch": 0.25326827814816194,
"grad_norm": 0.42566487193107605,
"learning_rate": 9.305218058836778e-06,
"loss": 0.7696,
"step": 850
},
{
"epoch": 0.25356624082833623,
"grad_norm": 0.4148566424846649,
"learning_rate": 9.302571577782881e-06,
"loss": 0.7435,
"step": 851
},
{
"epoch": 0.2538642035085106,
"grad_norm": 0.4349095821380615,
"learning_rate": 9.29992044380069e-06,
"loss": 0.7844,
"step": 852
},
{
"epoch": 0.25416216618868487,
"grad_norm": 0.43746015429496765,
"learning_rate": 9.297264659757218e-06,
"loss": 0.7827,
"step": 853
},
{
"epoch": 0.25446012886885916,
"grad_norm": 0.42558470368385315,
"learning_rate": 9.294604228524514e-06,
"loss": 0.7282,
"step": 854
},
{
"epoch": 0.2547580915490335,
"grad_norm": 0.40940043330192566,
"learning_rate": 9.29193915297964e-06,
"loss": 0.7429,
"step": 855
},
{
"epoch": 0.2550560542292078,
"grad_norm": 0.43153926730155945,
"learning_rate": 9.289269436004692e-06,
"loss": 0.7809,
"step": 856
},
{
"epoch": 0.2553540169093821,
"grad_norm": 0.44316428899765015,
"learning_rate": 9.28659508048678e-06,
"loss": 0.7876,
"step": 857
},
{
"epoch": 0.25565197958955643,
"grad_norm": 0.435019314289093,
"learning_rate": 9.28391608931803e-06,
"loss": 0.7773,
"step": 858
},
{
"epoch": 0.2559499422697307,
"grad_norm": 0.45403623580932617,
"learning_rate": 9.281232465395584e-06,
"loss": 0.7903,
"step": 859
},
{
"epoch": 0.256247904949905,
"grad_norm": 0.40584635734558105,
"learning_rate": 9.278544211621593e-06,
"loss": 0.7183,
"step": 860
},
{
"epoch": 0.2565458676300793,
"grad_norm": 0.40424615144729614,
"learning_rate": 9.275851330903212e-06,
"loss": 0.7044,
"step": 861
},
{
"epoch": 0.25684383031025365,
"grad_norm": 0.4259452223777771,
"learning_rate": 9.273153826152604e-06,
"loss": 0.8228,
"step": 862
},
{
"epoch": 0.25714179299042794,
"grad_norm": 0.43459808826446533,
"learning_rate": 9.270451700286928e-06,
"loss": 0.7089,
"step": 863
},
{
"epoch": 0.25743975567060223,
"grad_norm": 0.4281349182128906,
"learning_rate": 9.267744956228347e-06,
"loss": 0.7967,
"step": 864
},
{
"epoch": 0.2577377183507766,
"grad_norm": 0.45212414860725403,
"learning_rate": 9.26503359690401e-06,
"loss": 0.8149,
"step": 865
},
{
"epoch": 0.25803568103095087,
"grad_norm": 0.4262511730194092,
"learning_rate": 9.262317625246061e-06,
"loss": 0.7652,
"step": 866
},
{
"epoch": 0.25833364371112516,
"grad_norm": 0.4217842221260071,
"learning_rate": 9.259597044191635e-06,
"loss": 0.7651,
"step": 867
},
{
"epoch": 0.2586316063912995,
"grad_norm": 0.43619224429130554,
"learning_rate": 9.25687185668285e-06,
"loss": 0.7734,
"step": 868
},
{
"epoch": 0.2589295690714738,
"grad_norm": 0.4308563768863678,
"learning_rate": 9.254142065666802e-06,
"loss": 0.8017,
"step": 869
},
{
"epoch": 0.2592275317516481,
"grad_norm": 0.4390491545200348,
"learning_rate": 9.251407674095565e-06,
"loss": 0.7897,
"step": 870
},
{
"epoch": 0.25952549443182243,
"grad_norm": 0.4305284023284912,
"learning_rate": 9.248668684926199e-06,
"loss": 0.7986,
"step": 871
},
{
"epoch": 0.2598234571119967,
"grad_norm": 0.4587078392505646,
"learning_rate": 9.24592510112072e-06,
"loss": 0.7804,
"step": 872
},
{
"epoch": 0.260121419792171,
"grad_norm": 0.3946491777896881,
"learning_rate": 9.243176925646125e-06,
"loss": 0.7586,
"step": 873
},
{
"epoch": 0.26041938247234536,
"grad_norm": 0.42059147357940674,
"learning_rate": 9.24042416147437e-06,
"loss": 0.7567,
"step": 874
},
{
"epoch": 0.26071734515251965,
"grad_norm": 0.43899062275886536,
"learning_rate": 9.237666811582377e-06,
"loss": 0.7887,
"step": 875
},
{
"epoch": 0.26101530783269394,
"grad_norm": 0.4542399048805237,
"learning_rate": 9.234904878952026e-06,
"loss": 0.7744,
"step": 876
},
{
"epoch": 0.2613132705128683,
"grad_norm": 0.4268796741962433,
"learning_rate": 9.232138366570154e-06,
"loss": 0.7336,
"step": 877
},
{
"epoch": 0.2616112331930426,
"grad_norm": 0.4323793351650238,
"learning_rate": 9.229367277428547e-06,
"loss": 0.7321,
"step": 878
},
{
"epoch": 0.26190919587321687,
"grad_norm": 0.424514502286911,
"learning_rate": 9.226591614523944e-06,
"loss": 0.8011,
"step": 879
},
{
"epoch": 0.26220715855339116,
"grad_norm": 0.42081788182258606,
"learning_rate": 9.223811380858029e-06,
"loss": 0.8084,
"step": 880
},
{
"epoch": 0.2625051212335655,
"grad_norm": 0.4073812663555145,
"learning_rate": 9.22102657943743e-06,
"loss": 0.7337,
"step": 881
},
{
"epoch": 0.2628030839137398,
"grad_norm": 0.4527837932109833,
"learning_rate": 9.218237213273708e-06,
"loss": 0.7865,
"step": 882
},
{
"epoch": 0.2631010465939141,
"grad_norm": 0.4368094801902771,
"learning_rate": 9.215443285383375e-06,
"loss": 0.79,
"step": 883
},
{
"epoch": 0.26339900927408844,
"grad_norm": 0.4208792746067047,
"learning_rate": 9.21264479878786e-06,
"loss": 0.7982,
"step": 884
},
{
"epoch": 0.2636969719542627,
"grad_norm": 0.4094206988811493,
"learning_rate": 9.209841756513535e-06,
"loss": 0.7542,
"step": 885
},
{
"epoch": 0.263994934634437,
"grad_norm": 0.44005078077316284,
"learning_rate": 9.207034161591689e-06,
"loss": 0.7995,
"step": 886
},
{
"epoch": 0.26429289731461136,
"grad_norm": 0.41605010628700256,
"learning_rate": 9.20422201705854e-06,
"loss": 0.7476,
"step": 887
},
{
"epoch": 0.26459085999478565,
"grad_norm": 0.43873661756515503,
"learning_rate": 9.201405325955222e-06,
"loss": 0.7842,
"step": 888
},
{
"epoch": 0.26488882267495995,
"grad_norm": 0.4268430769443512,
"learning_rate": 9.198584091327792e-06,
"loss": 0.8257,
"step": 889
},
{
"epoch": 0.2651867853551343,
"grad_norm": 0.41748034954071045,
"learning_rate": 9.195758316227212e-06,
"loss": 0.7608,
"step": 890
},
{
"epoch": 0.2654847480353086,
"grad_norm": 0.42829155921936035,
"learning_rate": 9.192928003709365e-06,
"loss": 0.7829,
"step": 891
},
{
"epoch": 0.2657827107154829,
"grad_norm": 0.42252838611602783,
"learning_rate": 9.19009315683503e-06,
"loss": 0.7527,
"step": 892
},
{
"epoch": 0.2660806733956572,
"grad_norm": 0.4455977976322174,
"learning_rate": 9.187253778669893e-06,
"loss": 0.8133,
"step": 893
},
{
"epoch": 0.2663786360758315,
"grad_norm": 0.41617825627326965,
"learning_rate": 9.184409872284547e-06,
"loss": 0.8074,
"step": 894
},
{
"epoch": 0.2666765987560058,
"grad_norm": 0.42229339480400085,
"learning_rate": 9.181561440754474e-06,
"loss": 0.759,
"step": 895
},
{
"epoch": 0.2669745614361801,
"grad_norm": 0.4231303334236145,
"learning_rate": 9.17870848716005e-06,
"loss": 0.761,
"step": 896
},
{
"epoch": 0.26727252411635444,
"grad_norm": 0.4557526111602783,
"learning_rate": 9.175851014586545e-06,
"loss": 0.7976,
"step": 897
},
{
"epoch": 0.26757048679652873,
"grad_norm": 0.4390769302845001,
"learning_rate": 9.172989026124117e-06,
"loss": 0.7469,
"step": 898
},
{
"epoch": 0.267868449476703,
"grad_norm": 0.4452618956565857,
"learning_rate": 9.170122524867802e-06,
"loss": 0.7984,
"step": 899
},
{
"epoch": 0.26816641215687737,
"grad_norm": 0.43199318647384644,
"learning_rate": 9.16725151391752e-06,
"loss": 0.8091,
"step": 900
},
{
"epoch": 0.26846437483705166,
"grad_norm": 0.43193402886390686,
"learning_rate": 9.16437599637807e-06,
"loss": 0.7346,
"step": 901
},
{
"epoch": 0.26876233751722595,
"grad_norm": 0.4428356885910034,
"learning_rate": 9.161495975359116e-06,
"loss": 0.8055,
"step": 902
},
{
"epoch": 0.2690603001974003,
"grad_norm": 0.41652411222457886,
"learning_rate": 9.158611453975203e-06,
"loss": 0.7706,
"step": 903
},
{
"epoch": 0.2693582628775746,
"grad_norm": 0.398971825838089,
"learning_rate": 9.155722435345736e-06,
"loss": 0.7612,
"step": 904
},
{
"epoch": 0.2696562255577489,
"grad_norm": 0.41312122344970703,
"learning_rate": 9.152828922594984e-06,
"loss": 0.7395,
"step": 905
},
{
"epoch": 0.2699541882379232,
"grad_norm": 0.42090773582458496,
"learning_rate": 9.149930918852079e-06,
"loss": 0.7502,
"step": 906
},
{
"epoch": 0.2702521509180975,
"grad_norm": 0.4145689010620117,
"learning_rate": 9.14702842725101e-06,
"loss": 0.7173,
"step": 907
},
{
"epoch": 0.2705501135982718,
"grad_norm": 0.39954444766044617,
"learning_rate": 9.144121450930614e-06,
"loss": 0.7615,
"step": 908
},
{
"epoch": 0.27084807627844615,
"grad_norm": 0.4271208941936493,
"learning_rate": 9.141209993034583e-06,
"loss": 0.8106,
"step": 909
},
{
"epoch": 0.27114603895862044,
"grad_norm": 0.4281978905200958,
"learning_rate": 9.138294056711452e-06,
"loss": 0.822,
"step": 910
},
{
"epoch": 0.27144400163879473,
"grad_norm": 0.424578458070755,
"learning_rate": 9.135373645114603e-06,
"loss": 0.7777,
"step": 911
},
{
"epoch": 0.2717419643189691,
"grad_norm": 0.4227428436279297,
"learning_rate": 9.132448761402254e-06,
"loss": 0.7059,
"step": 912
},
{
"epoch": 0.27203992699914337,
"grad_norm": 0.4126034379005432,
"learning_rate": 9.129519408737461e-06,
"loss": 0.7988,
"step": 913
},
{
"epoch": 0.27233788967931766,
"grad_norm": 0.4289577603340149,
"learning_rate": 9.126585590288115e-06,
"loss": 0.7926,
"step": 914
},
{
"epoch": 0.27263585235949195,
"grad_norm": 0.4278191924095154,
"learning_rate": 9.123647309226932e-06,
"loss": 0.7608,
"step": 915
},
{
"epoch": 0.2729338150396663,
"grad_norm": 0.42958348989486694,
"learning_rate": 9.120704568731455e-06,
"loss": 0.7819,
"step": 916
},
{
"epoch": 0.2732317777198406,
"grad_norm": 0.43810129165649414,
"learning_rate": 9.117757371984053e-06,
"loss": 0.7723,
"step": 917
},
{
"epoch": 0.2735297404000149,
"grad_norm": 0.43591946363449097,
"learning_rate": 9.114805722171912e-06,
"loss": 0.786,
"step": 918
},
{
"epoch": 0.2738277030801892,
"grad_norm": 0.41927027702331543,
"learning_rate": 9.111849622487032e-06,
"loss": 0.7662,
"step": 919
},
{
"epoch": 0.2741256657603635,
"grad_norm": 0.412334680557251,
"learning_rate": 9.108889076126226e-06,
"loss": 0.7426,
"step": 920
},
{
"epoch": 0.2744236284405378,
"grad_norm": 0.43852120637893677,
"learning_rate": 9.105924086291118e-06,
"loss": 0.8548,
"step": 921
},
{
"epoch": 0.27472159112071215,
"grad_norm": 0.4445139467716217,
"learning_rate": 9.102954656188138e-06,
"loss": 0.7758,
"step": 922
},
{
"epoch": 0.27501955380088644,
"grad_norm": 0.46849504113197327,
"learning_rate": 9.09998078902851e-06,
"loss": 0.8096,
"step": 923
},
{
"epoch": 0.27531751648106073,
"grad_norm": 0.4167250096797943,
"learning_rate": 9.097002488028268e-06,
"loss": 0.8019,
"step": 924
},
{
"epoch": 0.2756154791612351,
"grad_norm": 0.43559810519218445,
"learning_rate": 9.09401975640823e-06,
"loss": 0.7536,
"step": 925
},
{
"epoch": 0.27591344184140937,
"grad_norm": 0.4368812143802643,
"learning_rate": 9.091032597394012e-06,
"loss": 0.7998,
"step": 926
},
{
"epoch": 0.27621140452158366,
"grad_norm": 0.424889475107193,
"learning_rate": 9.088041014216019e-06,
"loss": 0.8002,
"step": 927
},
{
"epoch": 0.276509367201758,
"grad_norm": 0.436631441116333,
"learning_rate": 9.085045010109433e-06,
"loss": 0.7772,
"step": 928
},
{
"epoch": 0.2768073298819323,
"grad_norm": 0.4361373484134674,
"learning_rate": 9.082044588314224e-06,
"loss": 0.777,
"step": 929
},
{
"epoch": 0.2771052925621066,
"grad_norm": 0.43156692385673523,
"learning_rate": 9.079039752075137e-06,
"loss": 0.7728,
"step": 930
},
{
"epoch": 0.2774032552422809,
"grad_norm": 0.42506295442581177,
"learning_rate": 9.07603050464169e-06,
"loss": 0.7588,
"step": 931
},
{
"epoch": 0.2777012179224552,
"grad_norm": 0.42249932885169983,
"learning_rate": 9.073016849268172e-06,
"loss": 0.7871,
"step": 932
},
{
"epoch": 0.2779991806026295,
"grad_norm": 0.43025729060173035,
"learning_rate": 9.069998789213644e-06,
"loss": 0.7543,
"step": 933
},
{
"epoch": 0.2782971432828038,
"grad_norm": 0.41047433018684387,
"learning_rate": 9.066976327741917e-06,
"loss": 0.7721,
"step": 934
},
{
"epoch": 0.27859510596297815,
"grad_norm": 0.43525633215904236,
"learning_rate": 9.063949468121576e-06,
"loss": 0.8204,
"step": 935
},
{
"epoch": 0.27889306864315244,
"grad_norm": 0.41752514243125916,
"learning_rate": 9.060918213625957e-06,
"loss": 0.7804,
"step": 936
},
{
"epoch": 0.27919103132332673,
"grad_norm": 0.41439223289489746,
"learning_rate": 9.057882567533145e-06,
"loss": 0.8042,
"step": 937
},
{
"epoch": 0.2794889940035011,
"grad_norm": 0.4178325831890106,
"learning_rate": 9.054842533125981e-06,
"loss": 0.7772,
"step": 938
},
{
"epoch": 0.27978695668367537,
"grad_norm": 0.42388367652893066,
"learning_rate": 9.051798113692043e-06,
"loss": 0.7701,
"step": 939
},
{
"epoch": 0.28008491936384966,
"grad_norm": 0.4107518792152405,
"learning_rate": 9.048749312523664e-06,
"loss": 0.7838,
"step": 940
},
{
"epoch": 0.280382882044024,
"grad_norm": 0.4476031959056854,
"learning_rate": 9.0456961329179e-06,
"loss": 0.8054,
"step": 941
},
{
"epoch": 0.2806808447241983,
"grad_norm": 0.412369042634964,
"learning_rate": 9.042638578176558e-06,
"loss": 0.7573,
"step": 942
},
{
"epoch": 0.2809788074043726,
"grad_norm": 0.4082777500152588,
"learning_rate": 9.03957665160616e-06,
"loss": 0.7895,
"step": 943
},
{
"epoch": 0.28127677008454693,
"grad_norm": 0.44257837533950806,
"learning_rate": 9.03651035651797e-06,
"loss": 0.7842,
"step": 944
},
{
"epoch": 0.2815747327647212,
"grad_norm": 0.43410831689834595,
"learning_rate": 9.033439696227966e-06,
"loss": 0.7251,
"step": 945
},
{
"epoch": 0.2818726954448955,
"grad_norm": 0.4205782413482666,
"learning_rate": 9.030364674056853e-06,
"loss": 0.7386,
"step": 946
},
{
"epoch": 0.28217065812506986,
"grad_norm": 0.42461472749710083,
"learning_rate": 9.027285293330052e-06,
"loss": 0.7763,
"step": 947
},
{
"epoch": 0.28246862080524415,
"grad_norm": 0.4243285655975342,
"learning_rate": 9.024201557377697e-06,
"loss": 0.7627,
"step": 948
},
{
"epoch": 0.28276658348541844,
"grad_norm": 0.4105435907840729,
"learning_rate": 9.021113469534628e-06,
"loss": 0.74,
"step": 949
},
{
"epoch": 0.28306454616559273,
"grad_norm": 0.41979560256004333,
"learning_rate": 9.018021033140398e-06,
"loss": 0.7814,
"step": 950
},
{
"epoch": 0.2833625088457671,
"grad_norm": 0.4249080717563629,
"learning_rate": 9.014924251539256e-06,
"loss": 0.7588,
"step": 951
},
{
"epoch": 0.28366047152594137,
"grad_norm": 0.40093064308166504,
"learning_rate": 9.011823128080157e-06,
"loss": 0.7548,
"step": 952
},
{
"epoch": 0.28395843420611566,
"grad_norm": 0.42459458112716675,
"learning_rate": 9.008717666116744e-06,
"loss": 0.8018,
"step": 953
},
{
"epoch": 0.28425639688629,
"grad_norm": 0.425436407327652,
"learning_rate": 9.005607869007358e-06,
"loss": 0.7903,
"step": 954
},
{
"epoch": 0.2845543595664643,
"grad_norm": 0.44178900122642517,
"learning_rate": 9.002493740115026e-06,
"loss": 0.8043,
"step": 955
},
{
"epoch": 0.2848523222466386,
"grad_norm": 0.4691467583179474,
"learning_rate": 8.99937528280746e-06,
"loss": 0.8411,
"step": 956
},
{
"epoch": 0.28515028492681294,
"grad_norm": 0.43360698223114014,
"learning_rate": 8.996252500457046e-06,
"loss": 0.7763,
"step": 957
},
{
"epoch": 0.2854482476069872,
"grad_norm": 0.43464329838752747,
"learning_rate": 8.99312539644086e-06,
"loss": 0.8087,
"step": 958
},
{
"epoch": 0.2857462102871615,
"grad_norm": 0.47156545519828796,
"learning_rate": 8.98999397414064e-06,
"loss": 0.863,
"step": 959
},
{
"epoch": 0.28604417296733586,
"grad_norm": 0.4277881681919098,
"learning_rate": 8.986858236942804e-06,
"loss": 0.8069,
"step": 960
},
{
"epoch": 0.28634213564751015,
"grad_norm": 0.4219008684158325,
"learning_rate": 8.983718188238428e-06,
"loss": 0.7834,
"step": 961
},
{
"epoch": 0.28664009832768444,
"grad_norm": 0.42039385437965393,
"learning_rate": 8.980573831423253e-06,
"loss": 0.7816,
"step": 962
},
{
"epoch": 0.2869380610078588,
"grad_norm": 0.4653097987174988,
"learning_rate": 8.97742516989768e-06,
"loss": 0.7487,
"step": 963
},
{
"epoch": 0.2872360236880331,
"grad_norm": 0.4438186585903168,
"learning_rate": 8.974272207066767e-06,
"loss": 0.7729,
"step": 964
},
{
"epoch": 0.2875339863682074,
"grad_norm": 0.42774060368537903,
"learning_rate": 8.97111494634022e-06,
"loss": 0.8071,
"step": 965
},
{
"epoch": 0.28783194904838166,
"grad_norm": 0.40935760736465454,
"learning_rate": 8.96795339113239e-06,
"loss": 0.7585,
"step": 966
},
{
"epoch": 0.288129911728556,
"grad_norm": 0.44249454140663147,
"learning_rate": 8.964787544862285e-06,
"loss": 0.7787,
"step": 967
},
{
"epoch": 0.2884278744087303,
"grad_norm": 0.42188560962677,
"learning_rate": 8.961617410953537e-06,
"loss": 0.7275,
"step": 968
},
{
"epoch": 0.2887258370889046,
"grad_norm": 0.42883557081222534,
"learning_rate": 8.958442992834428e-06,
"loss": 0.7552,
"step": 969
},
{
"epoch": 0.28902379976907894,
"grad_norm": 0.42095375061035156,
"learning_rate": 8.955264293937865e-06,
"loss": 0.7141,
"step": 970
},
{
"epoch": 0.28932176244925323,
"grad_norm": 0.42243123054504395,
"learning_rate": 8.952081317701386e-06,
"loss": 0.7697,
"step": 971
},
{
"epoch": 0.2896197251294275,
"grad_norm": 0.4220430552959442,
"learning_rate": 8.94889406756716e-06,
"loss": 0.7733,
"step": 972
},
{
"epoch": 0.28991768780960187,
"grad_norm": 0.41057878732681274,
"learning_rate": 8.94570254698197e-06,
"loss": 0.771,
"step": 973
},
{
"epoch": 0.29021565048977616,
"grad_norm": 0.41766127943992615,
"learning_rate": 8.94250675939722e-06,
"loss": 0.7669,
"step": 974
},
{
"epoch": 0.29051361316995045,
"grad_norm": 0.43557193875312805,
"learning_rate": 8.939306708268934e-06,
"loss": 0.7552,
"step": 975
},
{
"epoch": 0.2908115758501248,
"grad_norm": 0.4357999265193939,
"learning_rate": 8.936102397057737e-06,
"loss": 0.7948,
"step": 976
},
{
"epoch": 0.2911095385302991,
"grad_norm": 0.4122772812843323,
"learning_rate": 8.93289382922887e-06,
"loss": 0.7368,
"step": 977
},
{
"epoch": 0.2914075012104734,
"grad_norm": 0.4376829266548157,
"learning_rate": 8.929681008252171e-06,
"loss": 0.7996,
"step": 978
},
{
"epoch": 0.2917054638906477,
"grad_norm": 0.4416571259498596,
"learning_rate": 8.926463937602081e-06,
"loss": 0.8093,
"step": 979
},
{
"epoch": 0.292003426570822,
"grad_norm": 0.44324541091918945,
"learning_rate": 8.923242620757634e-06,
"loss": 0.758,
"step": 980
},
{
"epoch": 0.2923013892509963,
"grad_norm": 0.4390997886657715,
"learning_rate": 8.920017061202458e-06,
"loss": 0.7548,
"step": 981
},
{
"epoch": 0.29259935193117065,
"grad_norm": 0.4270910918712616,
"learning_rate": 8.916787262424768e-06,
"loss": 0.7457,
"step": 982
},
{
"epoch": 0.29289731461134494,
"grad_norm": 0.436987966299057,
"learning_rate": 8.913553227917366e-06,
"loss": 0.8104,
"step": 983
},
{
"epoch": 0.29319527729151923,
"grad_norm": 0.4338602125644684,
"learning_rate": 8.910314961177633e-06,
"loss": 0.7614,
"step": 984
},
{
"epoch": 0.2934932399716935,
"grad_norm": 0.457612544298172,
"learning_rate": 8.907072465707522e-06,
"loss": 0.7978,
"step": 985
},
{
"epoch": 0.29379120265186787,
"grad_norm": 0.42325571179389954,
"learning_rate": 8.90382574501357e-06,
"loss": 0.7897,
"step": 986
},
{
"epoch": 0.29408916533204216,
"grad_norm": 0.4008309543132782,
"learning_rate": 8.90057480260687e-06,
"loss": 0.762,
"step": 987
},
{
"epoch": 0.29438712801221645,
"grad_norm": 0.4499809145927429,
"learning_rate": 8.897319642003092e-06,
"loss": 0.7651,
"step": 988
},
{
"epoch": 0.2946850906923908,
"grad_norm": 0.41584426164627075,
"learning_rate": 8.894060266722461e-06,
"loss": 0.7373,
"step": 989
},
{
"epoch": 0.2949830533725651,
"grad_norm": 0.4230383336544037,
"learning_rate": 8.890796680289767e-06,
"loss": 0.7966,
"step": 990
},
{
"epoch": 0.2952810160527394,
"grad_norm": 0.40172526240348816,
"learning_rate": 8.88752888623434e-06,
"loss": 0.7453,
"step": 991
},
{
"epoch": 0.2955789787329137,
"grad_norm": 0.4164963960647583,
"learning_rate": 8.884256888090076e-06,
"loss": 0.7575,
"step": 992
},
{
"epoch": 0.295876941413088,
"grad_norm": 0.43570059537887573,
"learning_rate": 8.880980689395408e-06,
"loss": 0.7848,
"step": 993
},
{
"epoch": 0.2961749040932623,
"grad_norm": 0.4313972592353821,
"learning_rate": 8.877700293693316e-06,
"loss": 0.7565,
"step": 994
},
{
"epoch": 0.29647286677343665,
"grad_norm": 0.43945807218551636,
"learning_rate": 8.874415704531316e-06,
"loss": 0.7986,
"step": 995
},
{
"epoch": 0.29677082945361094,
"grad_norm": 0.4161829948425293,
"learning_rate": 8.871126925461459e-06,
"loss": 0.795,
"step": 996
},
{
"epoch": 0.29706879213378523,
"grad_norm": 0.4219336211681366,
"learning_rate": 8.867833960040331e-06,
"loss": 0.8043,
"step": 997
},
{
"epoch": 0.2973667548139596,
"grad_norm": 0.42536213994026184,
"learning_rate": 8.864536811829038e-06,
"loss": 0.7994,
"step": 998
},
{
"epoch": 0.29766471749413387,
"grad_norm": 0.4447495639324188,
"learning_rate": 8.861235484393218e-06,
"loss": 0.7554,
"step": 999
},
{
"epoch": 0.29796268017430816,
"grad_norm": 0.43489551544189453,
"learning_rate": 8.857929981303022e-06,
"loss": 0.8059,
"step": 1000
},
{
"epoch": 0.29826064285448245,
"grad_norm": 0.43018990755081177,
"learning_rate": 8.854620306133118e-06,
"loss": 0.774,
"step": 1001
},
{
"epoch": 0.2985586055346568,
"grad_norm": 0.4343620836734772,
"learning_rate": 8.851306462462689e-06,
"loss": 0.7731,
"step": 1002
},
{
"epoch": 0.2988565682148311,
"grad_norm": 0.4382462203502655,
"learning_rate": 8.847988453875423e-06,
"loss": 0.7903,
"step": 1003
},
{
"epoch": 0.2991545308950054,
"grad_norm": 0.4331133961677551,
"learning_rate": 8.84466628395951e-06,
"loss": 0.8021,
"step": 1004
},
{
"epoch": 0.2994524935751797,
"grad_norm": 0.4116860330104828,
"learning_rate": 8.841339956307647e-06,
"loss": 0.7148,
"step": 1005
},
{
"epoch": 0.299750456255354,
"grad_norm": 0.4209303557872772,
"learning_rate": 8.838009474517022e-06,
"loss": 0.7646,
"step": 1006
},
{
"epoch": 0.3000484189355283,
"grad_norm": 0.4389602243900299,
"learning_rate": 8.834674842189314e-06,
"loss": 0.7725,
"step": 1007
},
{
"epoch": 0.30034638161570265,
"grad_norm": 0.4069165587425232,
"learning_rate": 8.831336062930697e-06,
"loss": 0.7569,
"step": 1008
},
{
"epoch": 0.30064434429587694,
"grad_norm": 0.4373205602169037,
"learning_rate": 8.827993140351825e-06,
"loss": 0.7413,
"step": 1009
},
{
"epoch": 0.30094230697605123,
"grad_norm": 0.4289036989212036,
"learning_rate": 8.824646078067831e-06,
"loss": 0.7664,
"step": 1010
},
{
"epoch": 0.3012402696562256,
"grad_norm": 0.43298622965812683,
"learning_rate": 8.821294879698327e-06,
"loss": 0.7754,
"step": 1011
},
{
"epoch": 0.30153823233639987,
"grad_norm": 0.41397884488105774,
"learning_rate": 8.817939548867403e-06,
"loss": 0.7657,
"step": 1012
},
{
"epoch": 0.30183619501657416,
"grad_norm": 0.43379777669906616,
"learning_rate": 8.814580089203608e-06,
"loss": 0.7813,
"step": 1013
},
{
"epoch": 0.3021341576967485,
"grad_norm": 0.4450131356716156,
"learning_rate": 8.811216504339963e-06,
"loss": 0.7755,
"step": 1014
},
{
"epoch": 0.3024321203769228,
"grad_norm": 0.43981751799583435,
"learning_rate": 8.807848797913949e-06,
"loss": 0.78,
"step": 1015
},
{
"epoch": 0.3027300830570971,
"grad_norm": 0.41197219491004944,
"learning_rate": 8.804476973567502e-06,
"loss": 0.7442,
"step": 1016
},
{
"epoch": 0.30302804573727143,
"grad_norm": 0.42295488715171814,
"learning_rate": 8.801101034947015e-06,
"loss": 0.7764,
"step": 1017
},
{
"epoch": 0.3033260084174457,
"grad_norm": 0.44448795914649963,
"learning_rate": 8.797720985703323e-06,
"loss": 0.7724,
"step": 1018
},
{
"epoch": 0.30362397109762,
"grad_norm": 0.4202626347541809,
"learning_rate": 8.794336829491718e-06,
"loss": 0.7451,
"step": 1019
},
{
"epoch": 0.3039219337777943,
"grad_norm": 0.4201606214046478,
"learning_rate": 8.790948569971921e-06,
"loss": 0.785,
"step": 1020
},
{
"epoch": 0.30421989645796865,
"grad_norm": 0.4151608347892761,
"learning_rate": 8.787556210808101e-06,
"loss": 0.762,
"step": 1021
},
{
"epoch": 0.30451785913814294,
"grad_norm": 0.4044322371482849,
"learning_rate": 8.784159755668852e-06,
"loss": 0.7488,
"step": 1022
},
{
"epoch": 0.30481582181831723,
"grad_norm": 0.41898661851882935,
"learning_rate": 8.780759208227202e-06,
"loss": 0.7408,
"step": 1023
},
{
"epoch": 0.3051137844984916,
"grad_norm": 0.42571327090263367,
"learning_rate": 8.777354572160606e-06,
"loss": 0.7869,
"step": 1024
},
{
"epoch": 0.30541174717866587,
"grad_norm": 0.44300898909568787,
"learning_rate": 8.773945851150934e-06,
"loss": 0.8209,
"step": 1025
},
{
"epoch": 0.30570970985884016,
"grad_norm": 0.4257313013076782,
"learning_rate": 8.770533048884483e-06,
"loss": 0.778,
"step": 1026
},
{
"epoch": 0.3060076725390145,
"grad_norm": 0.4134843647480011,
"learning_rate": 8.767116169051952e-06,
"loss": 0.7673,
"step": 1027
},
{
"epoch": 0.3063056352191888,
"grad_norm": 0.43889355659484863,
"learning_rate": 8.763695215348462e-06,
"loss": 0.7609,
"step": 1028
},
{
"epoch": 0.3066035978993631,
"grad_norm": 0.4281556308269501,
"learning_rate": 8.760270191473532e-06,
"loss": 0.7589,
"step": 1029
},
{
"epoch": 0.30690156057953744,
"grad_norm": 0.4477680027484894,
"learning_rate": 8.756841101131081e-06,
"loss": 0.8458,
"step": 1030
},
{
"epoch": 0.3071995232597117,
"grad_norm": 0.4279250204563141,
"learning_rate": 8.753407948029433e-06,
"loss": 0.8087,
"step": 1031
},
{
"epoch": 0.307497485939886,
"grad_norm": 0.42505520582199097,
"learning_rate": 8.749970735881298e-06,
"loss": 0.7525,
"step": 1032
},
{
"epoch": 0.30779544862006036,
"grad_norm": 0.42256543040275574,
"learning_rate": 8.746529468403781e-06,
"loss": 0.7384,
"step": 1033
},
{
"epoch": 0.30809341130023465,
"grad_norm": 0.4340267777442932,
"learning_rate": 8.743084149318372e-06,
"loss": 0.7983,
"step": 1034
},
{
"epoch": 0.30839137398040894,
"grad_norm": 0.4179762303829193,
"learning_rate": 8.739634782350938e-06,
"loss": 0.7531,
"step": 1035
},
{
"epoch": 0.30868933666058324,
"grad_norm": 0.4406876266002655,
"learning_rate": 8.736181371231728e-06,
"loss": 0.7502,
"step": 1036
},
{
"epoch": 0.3089872993407576,
"grad_norm": 0.4257606565952301,
"learning_rate": 8.732723919695364e-06,
"loss": 0.7323,
"step": 1037
},
{
"epoch": 0.3092852620209319,
"grad_norm": 0.42342373728752136,
"learning_rate": 8.729262431480832e-06,
"loss": 0.7413,
"step": 1038
},
{
"epoch": 0.30958322470110616,
"grad_norm": 0.41246771812438965,
"learning_rate": 8.725796910331494e-06,
"loss": 0.7008,
"step": 1039
},
{
"epoch": 0.3098811873812805,
"grad_norm": 0.4482112526893616,
"learning_rate": 8.722327359995064e-06,
"loss": 0.7964,
"step": 1040
},
{
"epoch": 0.3101791500614548,
"grad_norm": 0.42225462198257446,
"learning_rate": 8.718853784223618e-06,
"loss": 0.7816,
"step": 1041
},
{
"epoch": 0.3104771127416291,
"grad_norm": 0.4308784306049347,
"learning_rate": 8.71537618677358e-06,
"loss": 0.7556,
"step": 1042
},
{
"epoch": 0.31077507542180344,
"grad_norm": 0.42901504039764404,
"learning_rate": 8.71189457140573e-06,
"loss": 0.7398,
"step": 1043
},
{
"epoch": 0.31107303810197773,
"grad_norm": 0.42239463329315186,
"learning_rate": 8.708408941885189e-06,
"loss": 0.7544,
"step": 1044
},
{
"epoch": 0.311371000782152,
"grad_norm": 0.4180073142051697,
"learning_rate": 8.704919301981422e-06,
"loss": 0.7368,
"step": 1045
},
{
"epoch": 0.31166896346232636,
"grad_norm": 0.4252670109272003,
"learning_rate": 8.701425655468226e-06,
"loss": 0.7761,
"step": 1046
},
{
"epoch": 0.31196692614250066,
"grad_norm": 0.4541017711162567,
"learning_rate": 8.697928006123735e-06,
"loss": 0.8097,
"step": 1047
},
{
"epoch": 0.31226488882267495,
"grad_norm": 0.42537379264831543,
"learning_rate": 8.69442635773041e-06,
"loss": 0.7039,
"step": 1048
},
{
"epoch": 0.3125628515028493,
"grad_norm": 0.41526198387145996,
"learning_rate": 8.690920714075039e-06,
"loss": 0.7425,
"step": 1049
},
{
"epoch": 0.3128608141830236,
"grad_norm": 0.4391304552555084,
"learning_rate": 8.687411078948727e-06,
"loss": 0.7795,
"step": 1050
},
{
"epoch": 0.3131587768631979,
"grad_norm": 0.45744815468788147,
"learning_rate": 8.683897456146897e-06,
"loss": 0.7992,
"step": 1051
},
{
"epoch": 0.3134567395433722,
"grad_norm": 0.4174099564552307,
"learning_rate": 8.680379849469287e-06,
"loss": 0.8134,
"step": 1052
},
{
"epoch": 0.3137547022235465,
"grad_norm": 0.4404802620410919,
"learning_rate": 8.676858262719939e-06,
"loss": 0.8169,
"step": 1053
},
{
"epoch": 0.3140526649037208,
"grad_norm": 0.44465020298957825,
"learning_rate": 8.673332699707202e-06,
"loss": 0.7445,
"step": 1054
},
{
"epoch": 0.3143506275838951,
"grad_norm": 0.39243802428245544,
"learning_rate": 8.669803164243725e-06,
"loss": 0.7439,
"step": 1055
},
{
"epoch": 0.31464859026406944,
"grad_norm": 0.42216596007347107,
"learning_rate": 8.66626966014645e-06,
"loss": 0.7361,
"step": 1056
},
{
"epoch": 0.31494655294424373,
"grad_norm": 0.44119539856910706,
"learning_rate": 8.662732191236614e-06,
"loss": 0.7504,
"step": 1057
},
{
"epoch": 0.315244515624418,
"grad_norm": 0.42714157700538635,
"learning_rate": 8.659190761339741e-06,
"loss": 0.7716,
"step": 1058
},
{
"epoch": 0.31554247830459237,
"grad_norm": 0.425434947013855,
"learning_rate": 8.655645374285637e-06,
"loss": 0.8043,
"step": 1059
},
{
"epoch": 0.31584044098476666,
"grad_norm": 0.4378584921360016,
"learning_rate": 8.652096033908391e-06,
"loss": 0.7834,
"step": 1060
},
{
"epoch": 0.31613840366494095,
"grad_norm": 0.40135619044303894,
"learning_rate": 8.648542744046364e-06,
"loss": 0.736,
"step": 1061
},
{
"epoch": 0.3164363663451153,
"grad_norm": 0.43657106161117554,
"learning_rate": 8.644985508542186e-06,
"loss": 0.7844,
"step": 1062
},
{
"epoch": 0.3167343290252896,
"grad_norm": 0.4240058660507202,
"learning_rate": 8.64142433124276e-06,
"loss": 0.752,
"step": 1063
},
{
"epoch": 0.3170322917054639,
"grad_norm": 0.4469079077243805,
"learning_rate": 8.637859215999246e-06,
"loss": 0.8066,
"step": 1064
},
{
"epoch": 0.3173302543856382,
"grad_norm": 0.4204435348510742,
"learning_rate": 8.63429016666707e-06,
"loss": 0.8185,
"step": 1065
},
{
"epoch": 0.3176282170658125,
"grad_norm": 0.42589515447616577,
"learning_rate": 8.630717187105902e-06,
"loss": 0.7787,
"step": 1066
},
{
"epoch": 0.3179261797459868,
"grad_norm": 0.43811801075935364,
"learning_rate": 8.62714028117967e-06,
"loss": 0.7667,
"step": 1067
},
{
"epoch": 0.31822414242616115,
"grad_norm": 0.4212261140346527,
"learning_rate": 8.623559452756547e-06,
"loss": 0.7984,
"step": 1068
},
{
"epoch": 0.31852210510633544,
"grad_norm": 0.4261661767959595,
"learning_rate": 8.619974705708945e-06,
"loss": 0.7834,
"step": 1069
},
{
"epoch": 0.31882006778650973,
"grad_norm": 0.4385144114494324,
"learning_rate": 8.616386043913516e-06,
"loss": 0.7839,
"step": 1070
},
{
"epoch": 0.319118030466684,
"grad_norm": 0.427738755941391,
"learning_rate": 8.612793471251148e-06,
"loss": 0.7846,
"step": 1071
},
{
"epoch": 0.31941599314685837,
"grad_norm": 0.426027774810791,
"learning_rate": 8.609196991606951e-06,
"loss": 0.8066,
"step": 1072
},
{
"epoch": 0.31971395582703266,
"grad_norm": 0.41970694065093994,
"learning_rate": 8.605596608870268e-06,
"loss": 0.7786,
"step": 1073
},
{
"epoch": 0.32001191850720695,
"grad_norm": 0.4090143144130707,
"learning_rate": 8.601992326934658e-06,
"loss": 0.7421,
"step": 1074
},
{
"epoch": 0.3203098811873813,
"grad_norm": 0.4413098692893982,
"learning_rate": 8.5983841496979e-06,
"loss": 0.8006,
"step": 1075
},
{
"epoch": 0.3206078438675556,
"grad_norm": 0.44104504585266113,
"learning_rate": 8.59477208106198e-06,
"loss": 0.78,
"step": 1076
},
{
"epoch": 0.3209058065477299,
"grad_norm": 0.43282023072242737,
"learning_rate": 8.591156124933097e-06,
"loss": 0.7596,
"step": 1077
},
{
"epoch": 0.3212037692279042,
"grad_norm": 0.42781636118888855,
"learning_rate": 8.587536285221656e-06,
"loss": 0.7919,
"step": 1078
},
{
"epoch": 0.3215017319080785,
"grad_norm": 0.41227588057518005,
"learning_rate": 8.583912565842258e-06,
"loss": 0.7646,
"step": 1079
},
{
"epoch": 0.3217996945882528,
"grad_norm": 0.41265419125556946,
"learning_rate": 8.580284970713697e-06,
"loss": 0.7113,
"step": 1080
},
{
"epoch": 0.32209765726842715,
"grad_norm": 0.43087223172187805,
"learning_rate": 8.576653503758964e-06,
"loss": 0.7746,
"step": 1081
},
{
"epoch": 0.32239561994860144,
"grad_norm": 0.4424741268157959,
"learning_rate": 8.573018168905237e-06,
"loss": 0.7646,
"step": 1082
},
{
"epoch": 0.32269358262877573,
"grad_norm": 0.41031649708747864,
"learning_rate": 8.569378970083873e-06,
"loss": 0.7532,
"step": 1083
},
{
"epoch": 0.3229915453089501,
"grad_norm": 0.4124182164669037,
"learning_rate": 8.565735911230407e-06,
"loss": 0.769,
"step": 1084
},
{
"epoch": 0.32328950798912437,
"grad_norm": 0.436513751745224,
"learning_rate": 8.562088996284555e-06,
"loss": 0.7941,
"step": 1085
},
{
"epoch": 0.32358747066929866,
"grad_norm": 0.4268746078014374,
"learning_rate": 8.558438229190195e-06,
"loss": 0.793,
"step": 1086
},
{
"epoch": 0.32388543334947295,
"grad_norm": 0.430209219455719,
"learning_rate": 8.554783613895377e-06,
"loss": 0.7172,
"step": 1087
},
{
"epoch": 0.3241833960296473,
"grad_norm": 0.44001534581184387,
"learning_rate": 8.551125154352309e-06,
"loss": 0.7422,
"step": 1088
},
{
"epoch": 0.3244813587098216,
"grad_norm": 0.41053760051727295,
"learning_rate": 8.54746285451736e-06,
"loss": 0.7834,
"step": 1089
},
{
"epoch": 0.3247793213899959,
"grad_norm": 0.42692187428474426,
"learning_rate": 8.543796718351043e-06,
"loss": 0.737,
"step": 1090
},
{
"epoch": 0.3250772840701702,
"grad_norm": 0.4233838617801666,
"learning_rate": 8.540126749818033e-06,
"loss": 0.7313,
"step": 1091
},
{
"epoch": 0.3253752467503445,
"grad_norm": 0.4325284957885742,
"learning_rate": 8.536452952887142e-06,
"loss": 0.8187,
"step": 1092
},
{
"epoch": 0.3256732094305188,
"grad_norm": 0.42090630531311035,
"learning_rate": 8.532775331531317e-06,
"loss": 0.7263,
"step": 1093
},
{
"epoch": 0.32597117211069315,
"grad_norm": 0.43220120668411255,
"learning_rate": 8.529093889727655e-06,
"loss": 0.7538,
"step": 1094
},
{
"epoch": 0.32626913479086744,
"grad_norm": 0.42502284049987793,
"learning_rate": 8.52540863145737e-06,
"loss": 0.7794,
"step": 1095
},
{
"epoch": 0.32656709747104173,
"grad_norm": 0.4070689380168915,
"learning_rate": 8.52171956070581e-06,
"loss": 0.7249,
"step": 1096
},
{
"epoch": 0.3268650601512161,
"grad_norm": 0.42295122146606445,
"learning_rate": 8.518026681462448e-06,
"loss": 0.7516,
"step": 1097
},
{
"epoch": 0.32716302283139037,
"grad_norm": 0.418568879365921,
"learning_rate": 8.514329997720871e-06,
"loss": 0.7951,
"step": 1098
},
{
"epoch": 0.32746098551156466,
"grad_norm": 0.47218871116638184,
"learning_rate": 8.510629513478783e-06,
"loss": 0.7837,
"step": 1099
},
{
"epoch": 0.327758948191739,
"grad_norm": 0.42859897017478943,
"learning_rate": 8.506925232737998e-06,
"loss": 0.7544,
"step": 1100
},
{
"epoch": 0.3280569108719133,
"grad_norm": 0.4329354763031006,
"learning_rate": 8.50321715950443e-06,
"loss": 0.805,
"step": 1101
},
{
"epoch": 0.3283548735520876,
"grad_norm": 0.43533289432525635,
"learning_rate": 8.499505297788106e-06,
"loss": 0.776,
"step": 1102
},
{
"epoch": 0.32865283623226194,
"grad_norm": 0.4444495737552643,
"learning_rate": 8.49578965160314e-06,
"loss": 0.7492,
"step": 1103
},
{
"epoch": 0.3289507989124362,
"grad_norm": 0.4216909408569336,
"learning_rate": 8.492070224967742e-06,
"loss": 0.7342,
"step": 1104
},
{
"epoch": 0.3292487615926105,
"grad_norm": 0.4235077202320099,
"learning_rate": 8.48834702190421e-06,
"loss": 0.7462,
"step": 1105
},
{
"epoch": 0.3295467242727848,
"grad_norm": 0.43822556734085083,
"learning_rate": 8.484620046438925e-06,
"loss": 0.8223,
"step": 1106
},
{
"epoch": 0.32984468695295915,
"grad_norm": 0.4066564738750458,
"learning_rate": 8.480889302602351e-06,
"loss": 0.7321,
"step": 1107
},
{
"epoch": 0.33014264963313344,
"grad_norm": 0.4117765426635742,
"learning_rate": 8.477154794429021e-06,
"loss": 0.7348,
"step": 1108
},
{
"epoch": 0.33044061231330774,
"grad_norm": 0.4176870882511139,
"learning_rate": 8.47341652595755e-06,
"loss": 0.8148,
"step": 1109
},
{
"epoch": 0.3307385749934821,
"grad_norm": 0.41546934843063354,
"learning_rate": 8.469674501230603e-06,
"loss": 0.7704,
"step": 1110
},
{
"epoch": 0.3310365376736564,
"grad_norm": 0.44431930780410767,
"learning_rate": 8.465928724294923e-06,
"loss": 0.8094,
"step": 1111
},
{
"epoch": 0.33133450035383066,
"grad_norm": 0.42732754349708557,
"learning_rate": 8.462179199201301e-06,
"loss": 0.7129,
"step": 1112
},
{
"epoch": 0.331632463034005,
"grad_norm": 0.4333447515964508,
"learning_rate": 8.458425930004585e-06,
"loss": 0.8335,
"step": 1113
},
{
"epoch": 0.3319304257141793,
"grad_norm": 0.4211162030696869,
"learning_rate": 8.454668920763672e-06,
"loss": 0.7637,
"step": 1114
},
{
"epoch": 0.3322283883943536,
"grad_norm": 0.44972220063209534,
"learning_rate": 8.450908175541503e-06,
"loss": 0.8342,
"step": 1115
},
{
"epoch": 0.33252635107452794,
"grad_norm": 0.4448044002056122,
"learning_rate": 8.44714369840506e-06,
"loss": 0.8266,
"step": 1116
},
{
"epoch": 0.33282431375470223,
"grad_norm": 0.4465561509132385,
"learning_rate": 8.443375493425358e-06,
"loss": 0.7993,
"step": 1117
},
{
"epoch": 0.3331222764348765,
"grad_norm": 0.4390435516834259,
"learning_rate": 8.439603564677448e-06,
"loss": 0.7782,
"step": 1118
},
{
"epoch": 0.33342023911505086,
"grad_norm": 0.42651331424713135,
"learning_rate": 8.435827916240403e-06,
"loss": 0.7605,
"step": 1119
},
{
"epoch": 0.33371820179522516,
"grad_norm": 0.4195694327354431,
"learning_rate": 8.43204855219732e-06,
"loss": 0.7726,
"step": 1120
},
{
"epoch": 0.33401616447539945,
"grad_norm": 0.42806294560432434,
"learning_rate": 8.42826547663532e-06,
"loss": 0.735,
"step": 1121
},
{
"epoch": 0.33431412715557374,
"grad_norm": 0.42807263135910034,
"learning_rate": 8.424478693645528e-06,
"loss": 0.7883,
"step": 1122
},
{
"epoch": 0.3346120898357481,
"grad_norm": 0.43336206674575806,
"learning_rate": 8.420688207323085e-06,
"loss": 0.806,
"step": 1123
},
{
"epoch": 0.3349100525159224,
"grad_norm": 0.4646878242492676,
"learning_rate": 8.416894021767137e-06,
"loss": 0.7781,
"step": 1124
},
{
"epoch": 0.33520801519609666,
"grad_norm": 0.4356582462787628,
"learning_rate": 8.413096141080827e-06,
"loss": 0.7731,
"step": 1125
},
{
"epoch": 0.335505977876271,
"grad_norm": 0.4233420491218567,
"learning_rate": 8.409294569371293e-06,
"loss": 0.7666,
"step": 1126
},
{
"epoch": 0.3358039405564453,
"grad_norm": 0.4309924244880676,
"learning_rate": 8.405489310749672e-06,
"loss": 0.7458,
"step": 1127
},
{
"epoch": 0.3361019032366196,
"grad_norm": 0.4494045078754425,
"learning_rate": 8.401680369331083e-06,
"loss": 0.7762,
"step": 1128
},
{
"epoch": 0.33639986591679394,
"grad_norm": 0.4436827600002289,
"learning_rate": 8.397867749234623e-06,
"loss": 0.7755,
"step": 1129
},
{
"epoch": 0.33669782859696823,
"grad_norm": 0.4168068468570709,
"learning_rate": 8.394051454583376e-06,
"loss": 0.7506,
"step": 1130
},
{
"epoch": 0.3369957912771425,
"grad_norm": 0.42762142419815063,
"learning_rate": 8.390231489504397e-06,
"loss": 0.7819,
"step": 1131
},
{
"epoch": 0.33729375395731687,
"grad_norm": 0.4372676908969879,
"learning_rate": 8.386407858128707e-06,
"loss": 0.8063,
"step": 1132
},
{
"epoch": 0.33759171663749116,
"grad_norm": 0.45513224601745605,
"learning_rate": 8.382580564591294e-06,
"loss": 0.8075,
"step": 1133
},
{
"epoch": 0.33788967931766545,
"grad_norm": 0.4320538640022278,
"learning_rate": 8.378749613031108e-06,
"loss": 0.7857,
"step": 1134
},
{
"epoch": 0.3381876419978398,
"grad_norm": 0.41906487941741943,
"learning_rate": 8.374915007591053e-06,
"loss": 0.7429,
"step": 1135
},
{
"epoch": 0.3384856046780141,
"grad_norm": 0.41150668263435364,
"learning_rate": 8.371076752417986e-06,
"loss": 0.713,
"step": 1136
},
{
"epoch": 0.3387835673581884,
"grad_norm": 0.45930665731430054,
"learning_rate": 8.367234851662707e-06,
"loss": 0.7644,
"step": 1137
},
{
"epoch": 0.3390815300383627,
"grad_norm": 0.43832212686538696,
"learning_rate": 8.363389309479964e-06,
"loss": 0.7738,
"step": 1138
},
{
"epoch": 0.339379492718537,
"grad_norm": 0.42082133889198303,
"learning_rate": 8.359540130028439e-06,
"loss": 0.7319,
"step": 1139
},
{
"epoch": 0.3396774553987113,
"grad_norm": 0.4095704257488251,
"learning_rate": 8.355687317470749e-06,
"loss": 0.7524,
"step": 1140
},
{
"epoch": 0.3399754180788856,
"grad_norm": 0.41502609848976135,
"learning_rate": 8.351830875973436e-06,
"loss": 0.7406,
"step": 1141
},
{
"epoch": 0.34027338075905994,
"grad_norm": 0.43125033378601074,
"learning_rate": 8.347970809706977e-06,
"loss": 0.7803,
"step": 1142
},
{
"epoch": 0.34057134343923423,
"grad_norm": 0.45613449811935425,
"learning_rate": 8.344107122845757e-06,
"loss": 0.7607,
"step": 1143
},
{
"epoch": 0.3408693061194085,
"grad_norm": 0.4124937951564789,
"learning_rate": 8.340239819568082e-06,
"loss": 0.7234,
"step": 1144
},
{
"epoch": 0.34116726879958287,
"grad_norm": 0.426287442445755,
"learning_rate": 8.336368904056169e-06,
"loss": 0.756,
"step": 1145
},
{
"epoch": 0.34146523147975716,
"grad_norm": 0.44698894023895264,
"learning_rate": 8.332494380496142e-06,
"loss": 0.7876,
"step": 1146
},
{
"epoch": 0.34176319415993145,
"grad_norm": 0.41564255952835083,
"learning_rate": 8.32861625307802e-06,
"loss": 0.7673,
"step": 1147
},
{
"epoch": 0.3420611568401058,
"grad_norm": 0.44098252058029175,
"learning_rate": 8.324734525995732e-06,
"loss": 0.7602,
"step": 1148
},
{
"epoch": 0.3423591195202801,
"grad_norm": 0.41699904203414917,
"learning_rate": 8.32084920344709e-06,
"loss": 0.7877,
"step": 1149
},
{
"epoch": 0.3426570822004544,
"grad_norm": 0.41518479585647583,
"learning_rate": 8.316960289633795e-06,
"loss": 0.7653,
"step": 1150
},
{
"epoch": 0.3429550448806287,
"grad_norm": 0.3980076014995575,
"learning_rate": 8.313067788761436e-06,
"loss": 0.7658,
"step": 1151
},
{
"epoch": 0.343253007560803,
"grad_norm": 0.4039032757282257,
"learning_rate": 8.309171705039474e-06,
"loss": 0.7153,
"step": 1152
},
{
"epoch": 0.3435509702409773,
"grad_norm": 0.40779900550842285,
"learning_rate": 8.305272042681257e-06,
"loss": 0.7334,
"step": 1153
},
{
"epoch": 0.34384893292115165,
"grad_norm": 0.4208314120769501,
"learning_rate": 8.301368805903988e-06,
"loss": 0.75,
"step": 1154
},
{
"epoch": 0.34414689560132594,
"grad_norm": 0.4231463372707367,
"learning_rate": 8.297461998928746e-06,
"loss": 0.7798,
"step": 1155
},
{
"epoch": 0.34444485828150023,
"grad_norm": 0.42949163913726807,
"learning_rate": 8.293551625980468e-06,
"loss": 0.7663,
"step": 1156
},
{
"epoch": 0.3447428209616745,
"grad_norm": 0.4180258810520172,
"learning_rate": 8.289637691287948e-06,
"loss": 0.7535,
"step": 1157
},
{
"epoch": 0.34504078364184887,
"grad_norm": 0.4052999019622803,
"learning_rate": 8.28572019908383e-06,
"loss": 0.7316,
"step": 1158
},
{
"epoch": 0.34533874632202316,
"grad_norm": 0.4323941469192505,
"learning_rate": 8.281799153604603e-06,
"loss": 0.7624,
"step": 1159
},
{
"epoch": 0.34563670900219745,
"grad_norm": 0.4146830439567566,
"learning_rate": 8.277874559090605e-06,
"loss": 0.7593,
"step": 1160
},
{
"epoch": 0.3459346716823718,
"grad_norm": 0.420483261346817,
"learning_rate": 8.273946419786008e-06,
"loss": 0.7781,
"step": 1161
},
{
"epoch": 0.3462326343625461,
"grad_norm": 0.41429367661476135,
"learning_rate": 8.27001473993882e-06,
"loss": 0.7429,
"step": 1162
},
{
"epoch": 0.3465305970427204,
"grad_norm": 0.42921391129493713,
"learning_rate": 8.266079523800873e-06,
"loss": 0.7878,
"step": 1163
},
{
"epoch": 0.3468285597228947,
"grad_norm": 0.4069218337535858,
"learning_rate": 8.262140775627827e-06,
"loss": 0.7468,
"step": 1164
},
{
"epoch": 0.347126522403069,
"grad_norm": 0.42255890369415283,
"learning_rate": 8.258198499679162e-06,
"loss": 0.7485,
"step": 1165
},
{
"epoch": 0.3474244850832433,
"grad_norm": 0.4330653250217438,
"learning_rate": 8.25425270021817e-06,
"loss": 0.7907,
"step": 1166
},
{
"epoch": 0.34772244776341765,
"grad_norm": 0.4273219704627991,
"learning_rate": 8.250303381511957e-06,
"loss": 0.7894,
"step": 1167
},
{
"epoch": 0.34802041044359194,
"grad_norm": 0.41967499256134033,
"learning_rate": 8.246350547831433e-06,
"loss": 0.7366,
"step": 1168
},
{
"epoch": 0.34831837312376623,
"grad_norm": 0.4150393009185791,
"learning_rate": 8.24239420345131e-06,
"loss": 0.7823,
"step": 1169
},
{
"epoch": 0.3486163358039406,
"grad_norm": 0.43301746249198914,
"learning_rate": 8.238434352650094e-06,
"loss": 0.7735,
"step": 1170
},
{
"epoch": 0.34891429848411487,
"grad_norm": 0.4161169230937958,
"learning_rate": 8.234470999710086e-06,
"loss": 0.7563,
"step": 1171
},
{
"epoch": 0.34921226116428916,
"grad_norm": 0.4297851324081421,
"learning_rate": 8.230504148917374e-06,
"loss": 0.7164,
"step": 1172
},
{
"epoch": 0.3495102238444635,
"grad_norm": 0.42612648010253906,
"learning_rate": 8.226533804561828e-06,
"loss": 0.786,
"step": 1173
},
{
"epoch": 0.3498081865246378,
"grad_norm": 0.4089588522911072,
"learning_rate": 8.222559970937092e-06,
"loss": 0.7427,
"step": 1174
},
{
"epoch": 0.3501061492048121,
"grad_norm": 0.42445510625839233,
"learning_rate": 8.218582652340592e-06,
"loss": 0.7663,
"step": 1175
},
{
"epoch": 0.3504041118849864,
"grad_norm": 0.41788461804389954,
"learning_rate": 8.214601853073516e-06,
"loss": 0.7726,
"step": 1176
},
{
"epoch": 0.3507020745651607,
"grad_norm": 0.4169197380542755,
"learning_rate": 8.21061757744082e-06,
"loss": 0.774,
"step": 1177
},
{
"epoch": 0.351000037245335,
"grad_norm": 0.40096545219421387,
"learning_rate": 8.206629829751215e-06,
"loss": 0.7321,
"step": 1178
},
{
"epoch": 0.3512979999255093,
"grad_norm": 0.41959595680236816,
"learning_rate": 8.202638614317171e-06,
"loss": 0.7679,
"step": 1179
},
{
"epoch": 0.35159596260568365,
"grad_norm": 0.41927823424339294,
"learning_rate": 8.198643935454907e-06,
"loss": 0.7578,
"step": 1180
},
{
"epoch": 0.35189392528585794,
"grad_norm": 0.45546793937683105,
"learning_rate": 8.194645797484385e-06,
"loss": 0.7964,
"step": 1181
},
{
"epoch": 0.35219188796603224,
"grad_norm": 0.4366207420825958,
"learning_rate": 8.190644204729313e-06,
"loss": 0.7855,
"step": 1182
},
{
"epoch": 0.3524898506462066,
"grad_norm": 0.41425538063049316,
"learning_rate": 8.186639161517127e-06,
"loss": 0.7442,
"step": 1183
},
{
"epoch": 0.3527878133263809,
"grad_norm": 0.4379921853542328,
"learning_rate": 8.182630672179003e-06,
"loss": 0.8418,
"step": 1184
},
{
"epoch": 0.35308577600655516,
"grad_norm": 0.4329274296760559,
"learning_rate": 8.178618741049841e-06,
"loss": 0.7885,
"step": 1185
},
{
"epoch": 0.3533837386867295,
"grad_norm": 0.4258681833744049,
"learning_rate": 8.174603372468259e-06,
"loss": 0.7588,
"step": 1186
},
{
"epoch": 0.3536817013669038,
"grad_norm": 0.41804084181785583,
"learning_rate": 8.170584570776598e-06,
"loss": 0.8098,
"step": 1187
},
{
"epoch": 0.3539796640470781,
"grad_norm": 0.4102548062801361,
"learning_rate": 8.166562340320908e-06,
"loss": 0.7863,
"step": 1188
},
{
"epoch": 0.35427762672725244,
"grad_norm": 0.4187453091144562,
"learning_rate": 8.162536685450945e-06,
"loss": 0.7904,
"step": 1189
},
{
"epoch": 0.35457558940742673,
"grad_norm": 0.43551990389823914,
"learning_rate": 8.158507610520177e-06,
"loss": 0.8015,
"step": 1190
},
{
"epoch": 0.354873552087601,
"grad_norm": 0.41999849677085876,
"learning_rate": 8.154475119885763e-06,
"loss": 0.7717,
"step": 1191
},
{
"epoch": 0.3551715147677753,
"grad_norm": 0.43046441674232483,
"learning_rate": 8.150439217908557e-06,
"loss": 0.7526,
"step": 1192
},
{
"epoch": 0.35546947744794966,
"grad_norm": 0.4299757778644562,
"learning_rate": 8.146399908953102e-06,
"loss": 0.7774,
"step": 1193
},
{
"epoch": 0.35576744012812395,
"grad_norm": 0.43132033944129944,
"learning_rate": 8.142357197387627e-06,
"loss": 0.7387,
"step": 1194
},
{
"epoch": 0.35606540280829824,
"grad_norm": 0.43697425723075867,
"learning_rate": 8.138311087584042e-06,
"loss": 0.783,
"step": 1195
},
{
"epoch": 0.3563633654884726,
"grad_norm": 0.42331957817077637,
"learning_rate": 8.134261583917927e-06,
"loss": 0.7536,
"step": 1196
},
{
"epoch": 0.3566613281686469,
"grad_norm": 0.4190444350242615,
"learning_rate": 8.130208690768536e-06,
"loss": 0.7617,
"step": 1197
},
{
"epoch": 0.35695929084882116,
"grad_norm": 0.4350127875804901,
"learning_rate": 8.126152412518788e-06,
"loss": 0.7954,
"step": 1198
},
{
"epoch": 0.3572572535289955,
"grad_norm": 0.4178895652294159,
"learning_rate": 8.122092753555265e-06,
"loss": 0.7746,
"step": 1199
},
{
"epoch": 0.3575552162091698,
"grad_norm": 0.4325284957885742,
"learning_rate": 8.118029718268197e-06,
"loss": 0.7596,
"step": 1200
},
{
"epoch": 0.3578531788893441,
"grad_norm": 0.4090530574321747,
"learning_rate": 8.113963311051474e-06,
"loss": 0.754,
"step": 1201
},
{
"epoch": 0.35815114156951844,
"grad_norm": 0.39271825551986694,
"learning_rate": 8.10989353630263e-06,
"loss": 0.739,
"step": 1202
},
{
"epoch": 0.35844910424969273,
"grad_norm": 0.4179271459579468,
"learning_rate": 8.105820398422837e-06,
"loss": 0.767,
"step": 1203
},
{
"epoch": 0.358747066929867,
"grad_norm": 0.43544018268585205,
"learning_rate": 8.10174390181691e-06,
"loss": 0.768,
"step": 1204
},
{
"epoch": 0.35904502961004137,
"grad_norm": 0.41055986285209656,
"learning_rate": 8.09766405089329e-06,
"loss": 0.7455,
"step": 1205
},
{
"epoch": 0.35934299229021566,
"grad_norm": 0.43460318446159363,
"learning_rate": 8.093580850064053e-06,
"loss": 0.7631,
"step": 1206
},
{
"epoch": 0.35964095497038995,
"grad_norm": 0.4326554238796234,
"learning_rate": 8.08949430374489e-06,
"loss": 0.7593,
"step": 1207
},
{
"epoch": 0.3599389176505643,
"grad_norm": 0.4217560291290283,
"learning_rate": 8.085404416355111e-06,
"loss": 0.7579,
"step": 1208
},
{
"epoch": 0.3602368803307386,
"grad_norm": 0.44069936871528625,
"learning_rate": 8.081311192317645e-06,
"loss": 0.7789,
"step": 1209
},
{
"epoch": 0.3605348430109129,
"grad_norm": 0.4214688539505005,
"learning_rate": 8.077214636059025e-06,
"loss": 0.7705,
"step": 1210
},
{
"epoch": 0.36083280569108717,
"grad_norm": 0.41780489683151245,
"learning_rate": 8.073114752009388e-06,
"loss": 0.8051,
"step": 1211
},
{
"epoch": 0.3611307683712615,
"grad_norm": 0.4128536581993103,
"learning_rate": 8.06901154460247e-06,
"loss": 0.7839,
"step": 1212
},
{
"epoch": 0.3614287310514358,
"grad_norm": 0.4101253151893616,
"learning_rate": 8.0649050182756e-06,
"loss": 0.7666,
"step": 1213
},
{
"epoch": 0.3617266937316101,
"grad_norm": 0.4235800504684448,
"learning_rate": 8.060795177469698e-06,
"loss": 0.7479,
"step": 1214
},
{
"epoch": 0.36202465641178444,
"grad_norm": 0.4299834966659546,
"learning_rate": 8.056682026629269e-06,
"loss": 0.7552,
"step": 1215
},
{
"epoch": 0.36232261909195873,
"grad_norm": 0.43004798889160156,
"learning_rate": 8.052565570202394e-06,
"loss": 0.7419,
"step": 1216
},
{
"epoch": 0.362620581772133,
"grad_norm": 0.4240386486053467,
"learning_rate": 8.04844581264073e-06,
"loss": 0.7907,
"step": 1217
},
{
"epoch": 0.36291854445230737,
"grad_norm": 0.4139268398284912,
"learning_rate": 8.044322758399508e-06,
"loss": 0.7389,
"step": 1218
},
{
"epoch": 0.36321650713248166,
"grad_norm": 0.45361292362213135,
"learning_rate": 8.04019641193752e-06,
"loss": 0.8171,
"step": 1219
},
{
"epoch": 0.36351446981265595,
"grad_norm": 0.42017659544944763,
"learning_rate": 8.036066777717117e-06,
"loss": 0.7835,
"step": 1220
},
{
"epoch": 0.3638124324928303,
"grad_norm": 0.43891727924346924,
"learning_rate": 8.031933860204208e-06,
"loss": 0.8214,
"step": 1221
},
{
"epoch": 0.3641103951730046,
"grad_norm": 0.41762715578079224,
"learning_rate": 8.027797663868255e-06,
"loss": 0.7513,
"step": 1222
},
{
"epoch": 0.3644083578531789,
"grad_norm": 0.4542342722415924,
"learning_rate": 8.023658193182261e-06,
"loss": 0.767,
"step": 1223
},
{
"epoch": 0.3647063205333532,
"grad_norm": 0.4249032139778137,
"learning_rate": 8.019515452622775e-06,
"loss": 0.7792,
"step": 1224
},
{
"epoch": 0.3650042832135275,
"grad_norm": 0.4247143268585205,
"learning_rate": 8.015369446669877e-06,
"loss": 0.7667,
"step": 1225
},
{
"epoch": 0.3653022458937018,
"grad_norm": 0.4281252920627594,
"learning_rate": 8.011220179807178e-06,
"loss": 0.7489,
"step": 1226
},
{
"epoch": 0.3656002085738761,
"grad_norm": 0.411519855260849,
"learning_rate": 8.007067656521823e-06,
"loss": 0.7618,
"step": 1227
},
{
"epoch": 0.36589817125405044,
"grad_norm": 0.4394058585166931,
"learning_rate": 8.00291188130447e-06,
"loss": 0.7529,
"step": 1228
},
{
"epoch": 0.36619613393422473,
"grad_norm": 0.4067392945289612,
"learning_rate": 7.9987528586493e-06,
"loss": 0.7288,
"step": 1229
},
{
"epoch": 0.366494096614399,
"grad_norm": 0.4320877194404602,
"learning_rate": 7.994590593054001e-06,
"loss": 0.7867,
"step": 1230
},
{
"epoch": 0.36679205929457337,
"grad_norm": 0.42730119824409485,
"learning_rate": 7.990425089019774e-06,
"loss": 0.755,
"step": 1231
},
{
"epoch": 0.36709002197474766,
"grad_norm": 0.4184194505214691,
"learning_rate": 7.98625635105131e-06,
"loss": 0.7375,
"step": 1232
},
{
"epoch": 0.36738798465492195,
"grad_norm": 0.4290657341480255,
"learning_rate": 7.982084383656818e-06,
"loss": 0.773,
"step": 1233
},
{
"epoch": 0.3676859473350963,
"grad_norm": 0.43041571974754333,
"learning_rate": 7.977909191347977e-06,
"loss": 0.8132,
"step": 1234
},
{
"epoch": 0.3679839100152706,
"grad_norm": 0.40691784024238586,
"learning_rate": 7.973730778639968e-06,
"loss": 0.7787,
"step": 1235
},
{
"epoch": 0.3682818726954449,
"grad_norm": 0.4414646625518799,
"learning_rate": 7.969549150051447e-06,
"loss": 0.7625,
"step": 1236
},
{
"epoch": 0.3685798353756192,
"grad_norm": 0.4310557544231415,
"learning_rate": 7.965364310104556e-06,
"loss": 0.781,
"step": 1237
},
{
"epoch": 0.3688777980557935,
"grad_norm": 0.42236313223838806,
"learning_rate": 7.961176263324902e-06,
"loss": 0.7593,
"step": 1238
},
{
"epoch": 0.3691757607359678,
"grad_norm": 0.4452737867832184,
"learning_rate": 7.95698501424156e-06,
"loss": 0.7807,
"step": 1239
},
{
"epoch": 0.36947372341614215,
"grad_norm": 0.4227520525455475,
"learning_rate": 7.952790567387077e-06,
"loss": 0.7445,
"step": 1240
},
{
"epoch": 0.36977168609631644,
"grad_norm": 0.43495917320251465,
"learning_rate": 7.948592927297446e-06,
"loss": 0.7486,
"step": 1241
},
{
"epoch": 0.37006964877649073,
"grad_norm": 0.4254031777381897,
"learning_rate": 7.944392098512123e-06,
"loss": 0.7607,
"step": 1242
},
{
"epoch": 0.3703676114566651,
"grad_norm": 0.4352802634239197,
"learning_rate": 7.940188085574007e-06,
"loss": 0.7986,
"step": 1243
},
{
"epoch": 0.37066557413683937,
"grad_norm": 0.43163731694221497,
"learning_rate": 7.935980893029442e-06,
"loss": 0.7769,
"step": 1244
},
{
"epoch": 0.37096353681701366,
"grad_norm": 0.4114803969860077,
"learning_rate": 7.931770525428212e-06,
"loss": 0.7559,
"step": 1245
},
{
"epoch": 0.37126149949718795,
"grad_norm": 0.4310123920440674,
"learning_rate": 7.927556987323534e-06,
"loss": 0.7836,
"step": 1246
},
{
"epoch": 0.3715594621773623,
"grad_norm": 0.4124079644680023,
"learning_rate": 7.92334028327205e-06,
"loss": 0.7224,
"step": 1247
},
{
"epoch": 0.3718574248575366,
"grad_norm": 0.4300309419631958,
"learning_rate": 7.91912041783383e-06,
"loss": 0.7505,
"step": 1248
},
{
"epoch": 0.3721553875377109,
"grad_norm": 0.4144781827926636,
"learning_rate": 7.914897395572362e-06,
"loss": 0.7697,
"step": 1249
},
{
"epoch": 0.3724533502178852,
"grad_norm": 0.4345705509185791,
"learning_rate": 7.910671221054545e-06,
"loss": 0.7951,
"step": 1250
},
{
"epoch": 0.3727513128980595,
"grad_norm": 0.421006977558136,
"learning_rate": 7.906441898850693e-06,
"loss": 0.7166,
"step": 1251
},
{
"epoch": 0.3730492755782338,
"grad_norm": 0.4450724422931671,
"learning_rate": 7.902209433534515e-06,
"loss": 0.7898,
"step": 1252
},
{
"epoch": 0.37334723825840815,
"grad_norm": 0.41672611236572266,
"learning_rate": 7.89797382968313e-06,
"loss": 0.7618,
"step": 1253
},
{
"epoch": 0.37364520093858244,
"grad_norm": 0.42059627175331116,
"learning_rate": 7.893735091877041e-06,
"loss": 0.7821,
"step": 1254
},
{
"epoch": 0.37394316361875674,
"grad_norm": 0.4378127455711365,
"learning_rate": 7.889493224700147e-06,
"loss": 0.7896,
"step": 1255
},
{
"epoch": 0.3742411262989311,
"grad_norm": 0.4272556006908417,
"learning_rate": 7.885248232739729e-06,
"loss": 0.7865,
"step": 1256
},
{
"epoch": 0.3745390889791054,
"grad_norm": 0.443332701921463,
"learning_rate": 7.881000120586446e-06,
"loss": 0.7354,
"step": 1257
},
{
"epoch": 0.37483705165927966,
"grad_norm": 0.42500534653663635,
"learning_rate": 7.876748892834331e-06,
"loss": 0.6991,
"step": 1258
},
{
"epoch": 0.375135014339454,
"grad_norm": 0.42817673087120056,
"learning_rate": 7.87249455408079e-06,
"loss": 0.7993,
"step": 1259
},
{
"epoch": 0.3754329770196283,
"grad_norm": 0.4016212224960327,
"learning_rate": 7.86823710892659e-06,
"loss": 0.7421,
"step": 1260
},
{
"epoch": 0.3757309396998026,
"grad_norm": 0.41569772362709045,
"learning_rate": 7.86397656197586e-06,
"loss": 0.7628,
"step": 1261
},
{
"epoch": 0.3760289023799769,
"grad_norm": 0.4199194014072418,
"learning_rate": 7.859712917836075e-06,
"loss": 0.7591,
"step": 1262
},
{
"epoch": 0.3763268650601512,
"grad_norm": 0.413968026638031,
"learning_rate": 7.855446181118074e-06,
"loss": 0.7582,
"step": 1263
},
{
"epoch": 0.3766248277403255,
"grad_norm": 0.4255465865135193,
"learning_rate": 7.851176356436028e-06,
"loss": 0.8063,
"step": 1264
},
{
"epoch": 0.3769227904204998,
"grad_norm": 0.43615594506263733,
"learning_rate": 7.846903448407454e-06,
"loss": 0.7209,
"step": 1265
},
{
"epoch": 0.37722075310067416,
"grad_norm": 0.4406788945198059,
"learning_rate": 7.842627461653198e-06,
"loss": 0.7863,
"step": 1266
},
{
"epoch": 0.37751871578084845,
"grad_norm": 0.4182646870613098,
"learning_rate": 7.838348400797443e-06,
"loss": 0.7392,
"step": 1267
},
{
"epoch": 0.37781667846102274,
"grad_norm": 0.4302336871623993,
"learning_rate": 7.83406627046769e-06,
"loss": 0.7663,
"step": 1268
},
{
"epoch": 0.3781146411411971,
"grad_norm": 0.43965375423431396,
"learning_rate": 7.829781075294762e-06,
"loss": 0.801,
"step": 1269
},
{
"epoch": 0.3784126038213714,
"grad_norm": 0.4432234466075897,
"learning_rate": 7.825492819912792e-06,
"loss": 0.7936,
"step": 1270
},
{
"epoch": 0.37871056650154566,
"grad_norm": 0.4336239993572235,
"learning_rate": 7.821201508959233e-06,
"loss": 0.7912,
"step": 1271
},
{
"epoch": 0.37900852918172,
"grad_norm": 0.43910470604896545,
"learning_rate": 7.816907147074832e-06,
"loss": 0.762,
"step": 1272
},
{
"epoch": 0.3793064918618943,
"grad_norm": 0.41593313217163086,
"learning_rate": 7.81260973890364e-06,
"loss": 0.7793,
"step": 1273
},
{
"epoch": 0.3796044545420686,
"grad_norm": 0.4045245051383972,
"learning_rate": 7.808309289093e-06,
"loss": 0.7362,
"step": 1274
},
{
"epoch": 0.37990241722224294,
"grad_norm": 0.43610960245132446,
"learning_rate": 7.804005802293547e-06,
"loss": 0.7572,
"step": 1275
},
{
"epoch": 0.38020037990241723,
"grad_norm": 0.4366176426410675,
"learning_rate": 7.799699283159199e-06,
"loss": 0.7504,
"step": 1276
},
{
"epoch": 0.3804983425825915,
"grad_norm": 0.4121038019657135,
"learning_rate": 7.795389736347152e-06,
"loss": 0.7285,
"step": 1277
},
{
"epoch": 0.38079630526276587,
"grad_norm": 0.42790961265563965,
"learning_rate": 7.791077166517881e-06,
"loss": 0.7538,
"step": 1278
},
{
"epoch": 0.38109426794294016,
"grad_norm": 0.4392932057380676,
"learning_rate": 7.786761578335123e-06,
"loss": 0.7744,
"step": 1279
},
{
"epoch": 0.38139223062311445,
"grad_norm": 0.41220176219940186,
"learning_rate": 7.782442976465885e-06,
"loss": 0.7064,
"step": 1280
},
{
"epoch": 0.38169019330328874,
"grad_norm": 0.4391211271286011,
"learning_rate": 7.778121365580428e-06,
"loss": 0.7975,
"step": 1281
},
{
"epoch": 0.3819881559834631,
"grad_norm": 0.4090668559074402,
"learning_rate": 7.773796750352274e-06,
"loss": 0.7432,
"step": 1282
},
{
"epoch": 0.3822861186636374,
"grad_norm": 0.4221076965332031,
"learning_rate": 7.769469135458187e-06,
"loss": 0.7434,
"step": 1283
},
{
"epoch": 0.38258408134381167,
"grad_norm": 0.4447765052318573,
"learning_rate": 7.765138525578179e-06,
"loss": 0.7629,
"step": 1284
},
{
"epoch": 0.382882044023986,
"grad_norm": 0.4213871657848358,
"learning_rate": 7.760804925395502e-06,
"loss": 0.7572,
"step": 1285
},
{
"epoch": 0.3831800067041603,
"grad_norm": 0.4042508602142334,
"learning_rate": 7.756468339596634e-06,
"loss": 0.771,
"step": 1286
},
{
"epoch": 0.3834779693843346,
"grad_norm": 0.413955956697464,
"learning_rate": 7.752128772871292e-06,
"loss": 0.7879,
"step": 1287
},
{
"epoch": 0.38377593206450894,
"grad_norm": 0.41381534934043884,
"learning_rate": 7.74778622991241e-06,
"loss": 0.81,
"step": 1288
},
{
"epoch": 0.38407389474468323,
"grad_norm": 0.44178399443626404,
"learning_rate": 7.743440715416144e-06,
"loss": 0.7526,
"step": 1289
},
{
"epoch": 0.3843718574248575,
"grad_norm": 0.4154306948184967,
"learning_rate": 7.73909223408186e-06,
"loss": 0.7251,
"step": 1290
},
{
"epoch": 0.38466982010503187,
"grad_norm": 0.440555602312088,
"learning_rate": 7.734740790612137e-06,
"loss": 0.7891,
"step": 1291
},
{
"epoch": 0.38496778278520616,
"grad_norm": 0.4190932810306549,
"learning_rate": 7.730386389712749e-06,
"loss": 0.751,
"step": 1292
},
{
"epoch": 0.38526574546538045,
"grad_norm": 0.4299873411655426,
"learning_rate": 7.726029036092682e-06,
"loss": 0.7627,
"step": 1293
},
{
"epoch": 0.3855637081455548,
"grad_norm": 0.42644360661506653,
"learning_rate": 7.721668734464103e-06,
"loss": 0.775,
"step": 1294
},
{
"epoch": 0.3858616708257291,
"grad_norm": 0.42981818318367004,
"learning_rate": 7.71730548954237e-06,
"loss": 0.736,
"step": 1295
},
{
"epoch": 0.3861596335059034,
"grad_norm": 0.435674250125885,
"learning_rate": 7.71293930604603e-06,
"loss": 0.7854,
"step": 1296
},
{
"epoch": 0.38645759618607767,
"grad_norm": 0.4008498191833496,
"learning_rate": 7.708570188696798e-06,
"loss": 0.7294,
"step": 1297
},
{
"epoch": 0.386755558866252,
"grad_norm": 0.4344206154346466,
"learning_rate": 7.70419814221957e-06,
"loss": 0.7457,
"step": 1298
},
{
"epoch": 0.3870535215464263,
"grad_norm": 0.4119266867637634,
"learning_rate": 7.699823171342404e-06,
"loss": 0.7669,
"step": 1299
},
{
"epoch": 0.3873514842266006,
"grad_norm": 0.42819297313690186,
"learning_rate": 7.695445280796527e-06,
"loss": 0.7921,
"step": 1300
},
{
"epoch": 0.38764944690677494,
"grad_norm": 0.4224448502063751,
"learning_rate": 7.691064475316314e-06,
"loss": 0.7711,
"step": 1301
},
{
"epoch": 0.38794740958694923,
"grad_norm": 0.43162328004837036,
"learning_rate": 7.686680759639304e-06,
"loss": 0.7536,
"step": 1302
},
{
"epoch": 0.3882453722671235,
"grad_norm": 0.4317812919616699,
"learning_rate": 7.682294138506171e-06,
"loss": 0.7053,
"step": 1303
},
{
"epoch": 0.38854333494729787,
"grad_norm": 0.42532142996788025,
"learning_rate": 7.677904616660742e-06,
"loss": 0.7753,
"step": 1304
},
{
"epoch": 0.38884129762747216,
"grad_norm": 0.40439435839653015,
"learning_rate": 7.673512198849973e-06,
"loss": 0.7299,
"step": 1305
},
{
"epoch": 0.38913926030764645,
"grad_norm": 0.4271869659423828,
"learning_rate": 7.669116889823955e-06,
"loss": 0.7583,
"step": 1306
},
{
"epoch": 0.3894372229878208,
"grad_norm": 0.4402204751968384,
"learning_rate": 7.664718694335904e-06,
"loss": 0.8032,
"step": 1307
},
{
"epoch": 0.3897351856679951,
"grad_norm": 0.4161050617694855,
"learning_rate": 7.660317617142163e-06,
"loss": 0.7019,
"step": 1308
},
{
"epoch": 0.3900331483481694,
"grad_norm": 0.4266067445278168,
"learning_rate": 7.655913663002181e-06,
"loss": 0.7739,
"step": 1309
},
{
"epoch": 0.3903311110283437,
"grad_norm": 0.4075363874435425,
"learning_rate": 7.651506836678531e-06,
"loss": 0.736,
"step": 1310
},
{
"epoch": 0.390629073708518,
"grad_norm": 0.4212097227573395,
"learning_rate": 7.647097142936881e-06,
"loss": 0.7503,
"step": 1311
},
{
"epoch": 0.3909270363886923,
"grad_norm": 0.41902589797973633,
"learning_rate": 7.642684586546008e-06,
"loss": 0.7603,
"step": 1312
},
{
"epoch": 0.3912249990688666,
"grad_norm": 0.4070022404193878,
"learning_rate": 7.638269172277777e-06,
"loss": 0.7606,
"step": 1313
},
{
"epoch": 0.39152296174904094,
"grad_norm": 0.4232437014579773,
"learning_rate": 7.633850904907149e-06,
"loss": 0.7716,
"step": 1314
},
{
"epoch": 0.39182092442921523,
"grad_norm": 0.44182655215263367,
"learning_rate": 7.62942978921217e-06,
"loss": 0.8091,
"step": 1315
},
{
"epoch": 0.3921188871093895,
"grad_norm": 0.4218367040157318,
"learning_rate": 7.625005829973966e-06,
"loss": 0.7416,
"step": 1316
},
{
"epoch": 0.39241684978956387,
"grad_norm": 0.41140156984329224,
"learning_rate": 7.6205790319767385e-06,
"loss": 0.7577,
"step": 1317
},
{
"epoch": 0.39271481246973816,
"grad_norm": 0.42180532217025757,
"learning_rate": 7.616149400007753e-06,
"loss": 0.7348,
"step": 1318
},
{
"epoch": 0.39301277514991245,
"grad_norm": 0.3995833396911621,
"learning_rate": 7.611716938857349e-06,
"loss": 0.748,
"step": 1319
},
{
"epoch": 0.3933107378300868,
"grad_norm": 0.42666950821876526,
"learning_rate": 7.60728165331892e-06,
"loss": 0.7483,
"step": 1320
},
{
"epoch": 0.3936087005102611,
"grad_norm": 0.43507495522499084,
"learning_rate": 7.602843548188915e-06,
"loss": 0.8089,
"step": 1321
},
{
"epoch": 0.3939066631904354,
"grad_norm": 0.4356527328491211,
"learning_rate": 7.598402628266832e-06,
"loss": 0.7452,
"step": 1322
},
{
"epoch": 0.3942046258706097,
"grad_norm": 0.41399919986724854,
"learning_rate": 7.5939588983552145e-06,
"loss": 0.7603,
"step": 1323
},
{
"epoch": 0.394502588550784,
"grad_norm": 0.4259139597415924,
"learning_rate": 7.589512363259643e-06,
"loss": 0.7756,
"step": 1324
},
{
"epoch": 0.3948005512309583,
"grad_norm": 0.42768535017967224,
"learning_rate": 7.58506302778873e-06,
"loss": 0.7465,
"step": 1325
},
{
"epoch": 0.39509851391113265,
"grad_norm": 0.42649754881858826,
"learning_rate": 7.580610896754122e-06,
"loss": 0.7372,
"step": 1326
},
{
"epoch": 0.39539647659130694,
"grad_norm": 0.4322483539581299,
"learning_rate": 7.576155974970485e-06,
"loss": 0.7693,
"step": 1327
},
{
"epoch": 0.39569443927148124,
"grad_norm": 0.41652461886405945,
"learning_rate": 7.5716982672555e-06,
"loss": 0.7764,
"step": 1328
},
{
"epoch": 0.3959924019516556,
"grad_norm": 0.4124981164932251,
"learning_rate": 7.567237778429868e-06,
"loss": 0.7475,
"step": 1329
},
{
"epoch": 0.3962903646318299,
"grad_norm": 0.3989923298358917,
"learning_rate": 7.562774513317293e-06,
"loss": 0.7229,
"step": 1330
},
{
"epoch": 0.39658832731200416,
"grad_norm": 0.41557246446609497,
"learning_rate": 7.558308476744478e-06,
"loss": 0.7503,
"step": 1331
},
{
"epoch": 0.39688628999217845,
"grad_norm": 0.4041295051574707,
"learning_rate": 7.553839673541133e-06,
"loss": 0.7339,
"step": 1332
},
{
"epoch": 0.3971842526723528,
"grad_norm": 0.4372117221355438,
"learning_rate": 7.54936810853995e-06,
"loss": 0.7584,
"step": 1333
},
{
"epoch": 0.3974822153525271,
"grad_norm": 0.4198533594608307,
"learning_rate": 7.544893786576612e-06,
"loss": 0.7624,
"step": 1334
},
{
"epoch": 0.3977801780327014,
"grad_norm": 0.4439017176628113,
"learning_rate": 7.540416712489786e-06,
"loss": 0.8231,
"step": 1335
},
{
"epoch": 0.3980781407128757,
"grad_norm": 0.4390065371990204,
"learning_rate": 7.5359368911211115e-06,
"loss": 0.7142,
"step": 1336
},
{
"epoch": 0.39837610339305,
"grad_norm": 0.4036790430545807,
"learning_rate": 7.5314543273151986e-06,
"loss": 0.7458,
"step": 1337
},
{
"epoch": 0.3986740660732243,
"grad_norm": 0.43398842215538025,
"learning_rate": 7.5269690259196235e-06,
"loss": 0.7881,
"step": 1338
},
{
"epoch": 0.39897202875339866,
"grad_norm": 0.4433099031448364,
"learning_rate": 7.522480991784928e-06,
"loss": 0.7741,
"step": 1339
},
{
"epoch": 0.39926999143357295,
"grad_norm": 0.41631844639778137,
"learning_rate": 7.517990229764602e-06,
"loss": 0.7323,
"step": 1340
},
{
"epoch": 0.39956795411374724,
"grad_norm": 0.43654537200927734,
"learning_rate": 7.51349674471509e-06,
"loss": 0.7332,
"step": 1341
},
{
"epoch": 0.3998659167939216,
"grad_norm": 0.42507266998291016,
"learning_rate": 7.509000541495777e-06,
"loss": 0.7597,
"step": 1342
},
{
"epoch": 0.4001638794740959,
"grad_norm": 0.4104708135128021,
"learning_rate": 7.504501624968995e-06,
"loss": 0.7441,
"step": 1343
},
{
"epoch": 0.40046184215427016,
"grad_norm": 0.415763258934021,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7459,
"step": 1344
},
{
"epoch": 0.4007598048344445,
"grad_norm": 0.41063833236694336,
"learning_rate": 7.495495671456987e-06,
"loss": 0.742,
"step": 1345
},
{
"epoch": 0.4010577675146188,
"grad_norm": 0.4284031093120575,
"learning_rate": 7.4909886442110694e-06,
"loss": 0.7591,
"step": 1346
},
{
"epoch": 0.4013557301947931,
"grad_norm": 0.4156349003314972,
"learning_rate": 7.4864789231362776e-06,
"loss": 0.7683,
"step": 1347
},
{
"epoch": 0.4016536928749674,
"grad_norm": 0.422329843044281,
"learning_rate": 7.481966513109561e-06,
"loss": 0.7323,
"step": 1348
},
{
"epoch": 0.40195165555514173,
"grad_norm": 0.4287894070148468,
"learning_rate": 7.477451419010768e-06,
"loss": 0.7342,
"step": 1349
},
{
"epoch": 0.402249618235316,
"grad_norm": 0.40589234232902527,
"learning_rate": 7.472933645722662e-06,
"loss": 0.7642,
"step": 1350
},
{
"epoch": 0.4025475809154903,
"grad_norm": 0.42224130034446716,
"learning_rate": 7.468413198130891e-06,
"loss": 0.7288,
"step": 1351
},
{
"epoch": 0.40284554359566466,
"grad_norm": 0.433326780796051,
"learning_rate": 7.463890081124005e-06,
"loss": 0.7924,
"step": 1352
},
{
"epoch": 0.40314350627583895,
"grad_norm": 0.39268532395362854,
"learning_rate": 7.459364299593433e-06,
"loss": 0.6931,
"step": 1353
},
{
"epoch": 0.40344146895601324,
"grad_norm": 0.41443511843681335,
"learning_rate": 7.4548358584334924e-06,
"loss": 0.7505,
"step": 1354
},
{
"epoch": 0.4037394316361876,
"grad_norm": 0.4311977028846741,
"learning_rate": 7.4503047625413715e-06,
"loss": 0.7849,
"step": 1355
},
{
"epoch": 0.4040373943163619,
"grad_norm": 0.42603302001953125,
"learning_rate": 7.445771016817132e-06,
"loss": 0.7755,
"step": 1356
},
{
"epoch": 0.40433535699653617,
"grad_norm": 0.419454962015152,
"learning_rate": 7.4412346261637e-06,
"loss": 0.7299,
"step": 1357
},
{
"epoch": 0.4046333196767105,
"grad_norm": 0.4183696508407593,
"learning_rate": 7.436695595486865e-06,
"loss": 0.7846,
"step": 1358
},
{
"epoch": 0.4049312823568848,
"grad_norm": 0.4117453992366791,
"learning_rate": 7.432153929695268e-06,
"loss": 0.7586,
"step": 1359
},
{
"epoch": 0.4052292450370591,
"grad_norm": 0.42735040187835693,
"learning_rate": 7.427609633700399e-06,
"loss": 0.7615,
"step": 1360
},
{
"epoch": 0.40552720771723344,
"grad_norm": 0.4226491451263428,
"learning_rate": 7.4230627124165975e-06,
"loss": 0.7847,
"step": 1361
},
{
"epoch": 0.40582517039740773,
"grad_norm": 0.40541863441467285,
"learning_rate": 7.418513170761036e-06,
"loss": 0.7348,
"step": 1362
},
{
"epoch": 0.406123133077582,
"grad_norm": 0.5737308263778687,
"learning_rate": 7.413961013653725e-06,
"loss": 0.7987,
"step": 1363
},
{
"epoch": 0.40642109575775637,
"grad_norm": 0.4181864559650421,
"learning_rate": 7.409406246017501e-06,
"loss": 0.7746,
"step": 1364
},
{
"epoch": 0.40671905843793066,
"grad_norm": 0.42666494846343994,
"learning_rate": 7.404848872778028e-06,
"loss": 0.7393,
"step": 1365
},
{
"epoch": 0.40701702111810495,
"grad_norm": 0.4203423857688904,
"learning_rate": 7.400288898863779e-06,
"loss": 0.7602,
"step": 1366
},
{
"epoch": 0.40731498379827924,
"grad_norm": 0.4137178063392639,
"learning_rate": 7.395726329206048e-06,
"loss": 0.7705,
"step": 1367
},
{
"epoch": 0.4076129464784536,
"grad_norm": 0.44370102882385254,
"learning_rate": 7.3911611687389314e-06,
"loss": 0.7377,
"step": 1368
},
{
"epoch": 0.4079109091586279,
"grad_norm": 0.4083855152130127,
"learning_rate": 7.386593422399331e-06,
"loss": 0.7214,
"step": 1369
},
{
"epoch": 0.40820887183880217,
"grad_norm": 0.4298425316810608,
"learning_rate": 7.382023095126941e-06,
"loss": 0.7707,
"step": 1370
},
{
"epoch": 0.4085068345189765,
"grad_norm": 0.44515231251716614,
"learning_rate": 7.377450191864249e-06,
"loss": 0.7834,
"step": 1371
},
{
"epoch": 0.4088047971991508,
"grad_norm": 0.4382359981536865,
"learning_rate": 7.372874717556529e-06,
"loss": 0.7679,
"step": 1372
},
{
"epoch": 0.4091027598793251,
"grad_norm": 0.417392373085022,
"learning_rate": 7.368296677151834e-06,
"loss": 0.7495,
"step": 1373
},
{
"epoch": 0.40940072255949944,
"grad_norm": 0.4449048340320587,
"learning_rate": 7.363716075600993e-06,
"loss": 0.7477,
"step": 1374
},
{
"epoch": 0.40969868523967373,
"grad_norm": 0.42793670296669006,
"learning_rate": 7.359132917857601e-06,
"loss": 0.8068,
"step": 1375
},
{
"epoch": 0.409996647919848,
"grad_norm": 0.43676117062568665,
"learning_rate": 7.354547208878025e-06,
"loss": 0.7482,
"step": 1376
},
{
"epoch": 0.41029461060002237,
"grad_norm": 0.41651538014411926,
"learning_rate": 7.349958953621383e-06,
"loss": 0.693,
"step": 1377
},
{
"epoch": 0.41059257328019666,
"grad_norm": 0.42439186573028564,
"learning_rate": 7.34536815704955e-06,
"loss": 0.7363,
"step": 1378
},
{
"epoch": 0.41089053596037095,
"grad_norm": 0.4144623577594757,
"learning_rate": 7.340774824127153e-06,
"loss": 0.7194,
"step": 1379
},
{
"epoch": 0.4111884986405453,
"grad_norm": 0.42826560139656067,
"learning_rate": 7.336178959821555e-06,
"loss": 0.7529,
"step": 1380
},
{
"epoch": 0.4114864613207196,
"grad_norm": 0.4292297661304474,
"learning_rate": 7.3315805691028615e-06,
"loss": 0.7688,
"step": 1381
},
{
"epoch": 0.4117844240008939,
"grad_norm": 0.4229586124420166,
"learning_rate": 7.326979656943907e-06,
"loss": 0.7452,
"step": 1382
},
{
"epoch": 0.41208238668106817,
"grad_norm": 0.4257849156856537,
"learning_rate": 7.322376228320254e-06,
"loss": 0.7808,
"step": 1383
},
{
"epoch": 0.4123803493612425,
"grad_norm": 0.45161929726600647,
"learning_rate": 7.317770288210187e-06,
"loss": 0.7898,
"step": 1384
},
{
"epoch": 0.4126783120414168,
"grad_norm": 0.4150608479976654,
"learning_rate": 7.313161841594708e-06,
"loss": 0.7411,
"step": 1385
},
{
"epoch": 0.4129762747215911,
"grad_norm": 0.4321781098842621,
"learning_rate": 7.308550893457524e-06,
"loss": 0.8008,
"step": 1386
},
{
"epoch": 0.41327423740176544,
"grad_norm": 0.4279387891292572,
"learning_rate": 7.303937448785052e-06,
"loss": 0.7508,
"step": 1387
},
{
"epoch": 0.41357220008193973,
"grad_norm": 0.4056704640388489,
"learning_rate": 7.29932151256641e-06,
"loss": 0.7161,
"step": 1388
},
{
"epoch": 0.413870162762114,
"grad_norm": 0.4097346067428589,
"learning_rate": 7.294703089793406e-06,
"loss": 0.7178,
"step": 1389
},
{
"epoch": 0.41416812544228837,
"grad_norm": 0.4417518973350525,
"learning_rate": 7.290082185460539e-06,
"loss": 0.7807,
"step": 1390
},
{
"epoch": 0.41446608812246266,
"grad_norm": 0.4345901608467102,
"learning_rate": 7.285458804564991e-06,
"loss": 0.7812,
"step": 1391
},
{
"epoch": 0.41476405080263695,
"grad_norm": 0.4186284840106964,
"learning_rate": 7.280832952106627e-06,
"loss": 0.7779,
"step": 1392
},
{
"epoch": 0.4150620134828113,
"grad_norm": 0.4079751968383789,
"learning_rate": 7.276204633087976e-06,
"loss": 0.7309,
"step": 1393
},
{
"epoch": 0.4153599761629856,
"grad_norm": 0.4316011369228363,
"learning_rate": 7.271573852514242e-06,
"loss": 0.7751,
"step": 1394
},
{
"epoch": 0.4156579388431599,
"grad_norm": 0.4291500747203827,
"learning_rate": 7.266940615393288e-06,
"loss": 0.8154,
"step": 1395
},
{
"epoch": 0.4159559015233342,
"grad_norm": 0.4209052324295044,
"learning_rate": 7.262304926735633e-06,
"loss": 0.7949,
"step": 1396
},
{
"epoch": 0.4162538642035085,
"grad_norm": 0.42992156744003296,
"learning_rate": 7.257666791554448e-06,
"loss": 0.7878,
"step": 1397
},
{
"epoch": 0.4165518268836828,
"grad_norm": 0.42956528067588806,
"learning_rate": 7.253026214865549e-06,
"loss": 0.7604,
"step": 1398
},
{
"epoch": 0.41684978956385715,
"grad_norm": 0.41761088371276855,
"learning_rate": 7.2483832016873955e-06,
"loss": 0.7618,
"step": 1399
},
{
"epoch": 0.41714775224403144,
"grad_norm": 0.4143986701965332,
"learning_rate": 7.243737757041077e-06,
"loss": 0.7209,
"step": 1400
},
{
"epoch": 0.41744571492420574,
"grad_norm": 0.4181540012359619,
"learning_rate": 7.239089885950317e-06,
"loss": 0.7822,
"step": 1401
},
{
"epoch": 0.41774367760438,
"grad_norm": 0.42317330837249756,
"learning_rate": 7.234439593441458e-06,
"loss": 0.7581,
"step": 1402
},
{
"epoch": 0.41804164028455437,
"grad_norm": 0.4166615903377533,
"learning_rate": 7.2297868845434674e-06,
"loss": 0.7416,
"step": 1403
},
{
"epoch": 0.41833960296472866,
"grad_norm": 0.4076935648918152,
"learning_rate": 7.225131764287919e-06,
"loss": 0.7423,
"step": 1404
},
{
"epoch": 0.41863756564490295,
"grad_norm": 0.4171837270259857,
"learning_rate": 7.220474237709001e-06,
"loss": 0.7502,
"step": 1405
},
{
"epoch": 0.4189355283250773,
"grad_norm": 0.44579628109931946,
"learning_rate": 7.215814309843496e-06,
"loss": 0.8137,
"step": 1406
},
{
"epoch": 0.4192334910052516,
"grad_norm": 0.4234749376773834,
"learning_rate": 7.211151985730794e-06,
"loss": 0.725,
"step": 1407
},
{
"epoch": 0.4195314536854259,
"grad_norm": 0.42978689074516296,
"learning_rate": 7.206487270412866e-06,
"loss": 0.7727,
"step": 1408
},
{
"epoch": 0.4198294163656002,
"grad_norm": 0.4192066788673401,
"learning_rate": 7.2018201689342745e-06,
"loss": 0.7638,
"step": 1409
},
{
"epoch": 0.4201273790457745,
"grad_norm": 0.4143064022064209,
"learning_rate": 7.197150686342161e-06,
"loss": 0.7527,
"step": 1410
},
{
"epoch": 0.4204253417259488,
"grad_norm": 0.43450862169265747,
"learning_rate": 7.192478827686242e-06,
"loss": 0.7769,
"step": 1411
},
{
"epoch": 0.42072330440612316,
"grad_norm": 0.4161491394042969,
"learning_rate": 7.187804598018806e-06,
"loss": 0.7545,
"step": 1412
},
{
"epoch": 0.42102126708629745,
"grad_norm": 0.4148748219013214,
"learning_rate": 7.183128002394699e-06,
"loss": 0.767,
"step": 1413
},
{
"epoch": 0.42131922976647174,
"grad_norm": 0.42876750230789185,
"learning_rate": 7.178449045871335e-06,
"loss": 0.7673,
"step": 1414
},
{
"epoch": 0.4216171924466461,
"grad_norm": 0.41839900612831116,
"learning_rate": 7.173767733508672e-06,
"loss": 0.7258,
"step": 1415
},
{
"epoch": 0.4219151551268204,
"grad_norm": 0.4298432469367981,
"learning_rate": 7.169084070369223e-06,
"loss": 0.7544,
"step": 1416
},
{
"epoch": 0.42221311780699466,
"grad_norm": 0.4193957448005676,
"learning_rate": 7.164398061518036e-06,
"loss": 0.7454,
"step": 1417
},
{
"epoch": 0.42251108048716896,
"grad_norm": 0.40669599175453186,
"learning_rate": 7.159709712022705e-06,
"loss": 0.7237,
"step": 1418
},
{
"epoch": 0.4228090431673433,
"grad_norm": 0.4243616461753845,
"learning_rate": 7.1550190269533435e-06,
"loss": 0.7765,
"step": 1419
},
{
"epoch": 0.4231070058475176,
"grad_norm": 0.4437214136123657,
"learning_rate": 7.1503260113826035e-06,
"loss": 0.7835,
"step": 1420
},
{
"epoch": 0.4234049685276919,
"grad_norm": 0.4286574721336365,
"learning_rate": 7.145630670385647e-06,
"loss": 0.7892,
"step": 1421
},
{
"epoch": 0.42370293120786623,
"grad_norm": 0.4140666127204895,
"learning_rate": 7.1409330090401564e-06,
"loss": 0.7386,
"step": 1422
},
{
"epoch": 0.4240008938880405,
"grad_norm": 0.4383534789085388,
"learning_rate": 7.136233032426322e-06,
"loss": 0.7703,
"step": 1423
},
{
"epoch": 0.4242988565682148,
"grad_norm": 0.406421422958374,
"learning_rate": 7.131530745626836e-06,
"loss": 0.7656,
"step": 1424
},
{
"epoch": 0.42459681924838916,
"grad_norm": 0.4072284698486328,
"learning_rate": 7.126826153726893e-06,
"loss": 0.7291,
"step": 1425
},
{
"epoch": 0.42489478192856345,
"grad_norm": 0.42674052715301514,
"learning_rate": 7.122119261814175e-06,
"loss": 0.7584,
"step": 1426
},
{
"epoch": 0.42519274460873774,
"grad_norm": 0.4246309995651245,
"learning_rate": 7.117410074978858e-06,
"loss": 0.7668,
"step": 1427
},
{
"epoch": 0.4254907072889121,
"grad_norm": 0.4239397346973419,
"learning_rate": 7.112698598313591e-06,
"loss": 0.7371,
"step": 1428
},
{
"epoch": 0.4257886699690864,
"grad_norm": 0.43355709314346313,
"learning_rate": 7.10798483691351e-06,
"loss": 0.769,
"step": 1429
},
{
"epoch": 0.42608663264926067,
"grad_norm": 0.42314520478248596,
"learning_rate": 7.103268795876212e-06,
"loss": 0.7634,
"step": 1430
},
{
"epoch": 0.426384595329435,
"grad_norm": 0.42701730132102966,
"learning_rate": 7.098550480301765e-06,
"loss": 0.7857,
"step": 1431
},
{
"epoch": 0.4266825580096093,
"grad_norm": 0.4175165295600891,
"learning_rate": 7.093829895292695e-06,
"loss": 0.7779,
"step": 1432
},
{
"epoch": 0.4269805206897836,
"grad_norm": 0.41399118304252625,
"learning_rate": 7.089107045953983e-06,
"loss": 0.7646,
"step": 1433
},
{
"epoch": 0.42727848336995794,
"grad_norm": 0.4138357937335968,
"learning_rate": 7.084381937393059e-06,
"loss": 0.7591,
"step": 1434
},
{
"epoch": 0.42757644605013223,
"grad_norm": 0.41874101758003235,
"learning_rate": 7.0796545747197924e-06,
"loss": 0.7773,
"step": 1435
},
{
"epoch": 0.4278744087303065,
"grad_norm": 0.43565741181373596,
"learning_rate": 7.0749249630464935e-06,
"loss": 0.7483,
"step": 1436
},
{
"epoch": 0.4281723714104808,
"grad_norm": 0.4324802756309509,
"learning_rate": 7.070193107487906e-06,
"loss": 0.8143,
"step": 1437
},
{
"epoch": 0.42847033409065516,
"grad_norm": 0.43232670426368713,
"learning_rate": 7.0654590131612e-06,
"loss": 0.7796,
"step": 1438
},
{
"epoch": 0.42876829677082945,
"grad_norm": 0.40568041801452637,
"learning_rate": 7.060722685185961e-06,
"loss": 0.7366,
"step": 1439
},
{
"epoch": 0.42906625945100374,
"grad_norm": 0.4250433146953583,
"learning_rate": 7.0559841286841975e-06,
"loss": 0.7601,
"step": 1440
},
{
"epoch": 0.4293642221311781,
"grad_norm": 0.439263254404068,
"learning_rate": 7.0512433487803245e-06,
"loss": 0.7444,
"step": 1441
},
{
"epoch": 0.4296621848113524,
"grad_norm": 0.42873579263687134,
"learning_rate": 7.04650035060116e-06,
"loss": 0.784,
"step": 1442
},
{
"epoch": 0.42996014749152667,
"grad_norm": 0.4230228066444397,
"learning_rate": 7.041755139275925e-06,
"loss": 0.7377,
"step": 1443
},
{
"epoch": 0.430258110171701,
"grad_norm": 0.4337446391582489,
"learning_rate": 7.03700771993623e-06,
"loss": 0.7417,
"step": 1444
},
{
"epoch": 0.4305560728518753,
"grad_norm": 0.4101364314556122,
"learning_rate": 7.032258097716076e-06,
"loss": 0.7559,
"step": 1445
},
{
"epoch": 0.4308540355320496,
"grad_norm": 0.44110795855522156,
"learning_rate": 7.027506277751843e-06,
"loss": 0.8001,
"step": 1446
},
{
"epoch": 0.43115199821222394,
"grad_norm": 0.42024466395378113,
"learning_rate": 7.022752265182292e-06,
"loss": 0.7348,
"step": 1447
},
{
"epoch": 0.43144996089239823,
"grad_norm": 0.41125401854515076,
"learning_rate": 7.017996065148553e-06,
"loss": 0.8091,
"step": 1448
},
{
"epoch": 0.4317479235725725,
"grad_norm": 0.4192337989807129,
"learning_rate": 7.01323768279412e-06,
"loss": 0.7416,
"step": 1449
},
{
"epoch": 0.43204588625274687,
"grad_norm": 0.4032820761203766,
"learning_rate": 7.008477123264849e-06,
"loss": 0.7004,
"step": 1450
},
{
"epoch": 0.43234384893292116,
"grad_norm": 0.4296363294124603,
"learning_rate": 7.0037143917089485e-06,
"loss": 0.7787,
"step": 1451
},
{
"epoch": 0.43264181161309545,
"grad_norm": 0.41232022643089294,
"learning_rate": 6.9989494932769805e-06,
"loss": 0.7424,
"step": 1452
},
{
"epoch": 0.43293977429326974,
"grad_norm": 0.4449349641799927,
"learning_rate": 6.9941824331218465e-06,
"loss": 0.787,
"step": 1453
},
{
"epoch": 0.4332377369734441,
"grad_norm": 0.41432490944862366,
"learning_rate": 6.989413216398786e-06,
"loss": 0.7426,
"step": 1454
},
{
"epoch": 0.4335356996536184,
"grad_norm": 0.4493269622325897,
"learning_rate": 6.98464184826537e-06,
"loss": 0.8144,
"step": 1455
},
{
"epoch": 0.43383366233379267,
"grad_norm": 0.3982318937778473,
"learning_rate": 6.979868333881499e-06,
"loss": 0.7263,
"step": 1456
},
{
"epoch": 0.434131625013967,
"grad_norm": 0.4245948791503906,
"learning_rate": 6.975092678409392e-06,
"loss": 0.7877,
"step": 1457
},
{
"epoch": 0.4344295876941413,
"grad_norm": 0.4272557497024536,
"learning_rate": 6.970314887013585e-06,
"loss": 0.7497,
"step": 1458
},
{
"epoch": 0.4347275503743156,
"grad_norm": 0.432786226272583,
"learning_rate": 6.965534964860921e-06,
"loss": 0.731,
"step": 1459
},
{
"epoch": 0.43502551305448994,
"grad_norm": 0.42844119668006897,
"learning_rate": 6.960752917120552e-06,
"loss": 0.7579,
"step": 1460
},
{
"epoch": 0.43532347573466423,
"grad_norm": 0.42431870102882385,
"learning_rate": 6.955968748963924e-06,
"loss": 0.7786,
"step": 1461
},
{
"epoch": 0.4356214384148385,
"grad_norm": 0.40737029910087585,
"learning_rate": 6.9511824655647786e-06,
"loss": 0.7403,
"step": 1462
},
{
"epoch": 0.43591940109501287,
"grad_norm": 0.4208509027957916,
"learning_rate": 6.946394072099145e-06,
"loss": 0.7551,
"step": 1463
},
{
"epoch": 0.43621736377518716,
"grad_norm": 0.4238913953304291,
"learning_rate": 6.941603573745334e-06,
"loss": 0.7633,
"step": 1464
},
{
"epoch": 0.43651532645536145,
"grad_norm": 0.42684584856033325,
"learning_rate": 6.936810975683931e-06,
"loss": 0.7652,
"step": 1465
},
{
"epoch": 0.4368132891355358,
"grad_norm": 0.4150695204734802,
"learning_rate": 6.932016283097793e-06,
"loss": 0.7425,
"step": 1466
},
{
"epoch": 0.4371112518157101,
"grad_norm": 0.4127320945262909,
"learning_rate": 6.927219501172046e-06,
"loss": 0.7914,
"step": 1467
},
{
"epoch": 0.4374092144958844,
"grad_norm": 0.41298285126686096,
"learning_rate": 6.922420635094067e-06,
"loss": 0.7591,
"step": 1468
},
{
"epoch": 0.4377071771760587,
"grad_norm": 0.4197395145893097,
"learning_rate": 6.9176196900534975e-06,
"loss": 0.7628,
"step": 1469
},
{
"epoch": 0.438005139856233,
"grad_norm": 0.4192826449871063,
"learning_rate": 6.912816671242215e-06,
"loss": 0.7607,
"step": 1470
},
{
"epoch": 0.4383031025364073,
"grad_norm": 0.4042337238788605,
"learning_rate": 6.908011583854353e-06,
"loss": 0.6951,
"step": 1471
},
{
"epoch": 0.4386010652165816,
"grad_norm": 0.4131634533405304,
"learning_rate": 6.90320443308627e-06,
"loss": 0.766,
"step": 1472
},
{
"epoch": 0.43889902789675594,
"grad_norm": 0.4269515573978424,
"learning_rate": 6.898395224136565e-06,
"loss": 0.7805,
"step": 1473
},
{
"epoch": 0.43919699057693024,
"grad_norm": 0.4271021783351898,
"learning_rate": 6.8935839622060564e-06,
"loss": 0.8168,
"step": 1474
},
{
"epoch": 0.4394949532571045,
"grad_norm": 0.4018744230270386,
"learning_rate": 6.888770652497785e-06,
"loss": 0.755,
"step": 1475
},
{
"epoch": 0.43979291593727887,
"grad_norm": 0.4159916043281555,
"learning_rate": 6.88395530021701e-06,
"loss": 0.74,
"step": 1476
},
{
"epoch": 0.44009087861745316,
"grad_norm": 0.42056897282600403,
"learning_rate": 6.879137910571191e-06,
"loss": 0.7153,
"step": 1477
},
{
"epoch": 0.44038884129762745,
"grad_norm": 0.4232238233089447,
"learning_rate": 6.87431848877e-06,
"loss": 0.7604,
"step": 1478
},
{
"epoch": 0.4406868039778018,
"grad_norm": 0.40519949793815613,
"learning_rate": 6.8694970400253e-06,
"loss": 0.752,
"step": 1479
},
{
"epoch": 0.4409847666579761,
"grad_norm": 0.42621520161628723,
"learning_rate": 6.86467356955115e-06,
"loss": 0.7631,
"step": 1480
},
{
"epoch": 0.4412827293381504,
"grad_norm": 0.40618953108787537,
"learning_rate": 6.8598480825637916e-06,
"loss": 0.7715,
"step": 1481
},
{
"epoch": 0.4415806920183247,
"grad_norm": 0.4289984405040741,
"learning_rate": 6.855020584281651e-06,
"loss": 0.7577,
"step": 1482
},
{
"epoch": 0.441878654698499,
"grad_norm": 0.4101161062717438,
"learning_rate": 6.850191079925328e-06,
"loss": 0.7559,
"step": 1483
},
{
"epoch": 0.4421766173786733,
"grad_norm": 0.4357437789440155,
"learning_rate": 6.845359574717591e-06,
"loss": 0.7396,
"step": 1484
},
{
"epoch": 0.44247458005884766,
"grad_norm": 0.4237337112426758,
"learning_rate": 6.8405260738833715e-06,
"loss": 0.7199,
"step": 1485
},
{
"epoch": 0.44277254273902195,
"grad_norm": 0.42737624049186707,
"learning_rate": 6.835690582649762e-06,
"loss": 0.7638,
"step": 1486
},
{
"epoch": 0.44307050541919624,
"grad_norm": 0.42396315932273865,
"learning_rate": 6.830853106246007e-06,
"loss": 0.7432,
"step": 1487
},
{
"epoch": 0.4433684680993705,
"grad_norm": 0.4084084928035736,
"learning_rate": 6.826013649903495e-06,
"loss": 0.7566,
"step": 1488
},
{
"epoch": 0.4436664307795449,
"grad_norm": 0.42464542388916016,
"learning_rate": 6.821172218855756e-06,
"loss": 0.7647,
"step": 1489
},
{
"epoch": 0.44396439345971916,
"grad_norm": 0.4080066978931427,
"learning_rate": 6.81632881833846e-06,
"loss": 0.742,
"step": 1490
},
{
"epoch": 0.44426235613989346,
"grad_norm": 0.4154869616031647,
"learning_rate": 6.811483453589403e-06,
"loss": 0.7579,
"step": 1491
},
{
"epoch": 0.4445603188200678,
"grad_norm": 0.41644084453582764,
"learning_rate": 6.806636129848504e-06,
"loss": 0.7504,
"step": 1492
},
{
"epoch": 0.4448582815002421,
"grad_norm": 0.413764089345932,
"learning_rate": 6.801786852357804e-06,
"loss": 0.7259,
"step": 1493
},
{
"epoch": 0.4451562441804164,
"grad_norm": 0.4061482846736908,
"learning_rate": 6.796935626361454e-06,
"loss": 0.7619,
"step": 1494
},
{
"epoch": 0.44545420686059073,
"grad_norm": 0.4003064036369324,
"learning_rate": 6.792082457105714e-06,
"loss": 0.7358,
"step": 1495
},
{
"epoch": 0.445752169540765,
"grad_norm": 0.45082637667655945,
"learning_rate": 6.787227349838946e-06,
"loss": 0.7915,
"step": 1496
},
{
"epoch": 0.4460501322209393,
"grad_norm": 0.4260353147983551,
"learning_rate": 6.782370309811605e-06,
"loss": 0.7218,
"step": 1497
},
{
"epoch": 0.44634809490111366,
"grad_norm": 0.4147459864616394,
"learning_rate": 6.777511342276242e-06,
"loss": 0.7893,
"step": 1498
},
{
"epoch": 0.44664605758128795,
"grad_norm": 0.43019166588783264,
"learning_rate": 6.772650452487482e-06,
"loss": 0.7851,
"step": 1499
},
{
"epoch": 0.44694402026146224,
"grad_norm": 0.40980443358421326,
"learning_rate": 6.767787645702039e-06,
"loss": 0.7539,
"step": 1500
},
{
"epoch": 0.4472419829416366,
"grad_norm": 0.41804051399230957,
"learning_rate": 6.762922927178696e-06,
"loss": 0.7229,
"step": 1501
},
{
"epoch": 0.4475399456218109,
"grad_norm": 0.41674575209617615,
"learning_rate": 6.7580563021783045e-06,
"loss": 0.7914,
"step": 1502
},
{
"epoch": 0.44783790830198517,
"grad_norm": 0.4301901161670685,
"learning_rate": 6.753187775963773e-06,
"loss": 0.7459,
"step": 1503
},
{
"epoch": 0.4481358709821595,
"grad_norm": 0.4142482876777649,
"learning_rate": 6.7483173538000734e-06,
"loss": 0.7596,
"step": 1504
},
{
"epoch": 0.4484338336623338,
"grad_norm": 0.42101266980171204,
"learning_rate": 6.743445040954223e-06,
"loss": 0.7949,
"step": 1505
},
{
"epoch": 0.4487317963425081,
"grad_norm": 0.42256978154182434,
"learning_rate": 6.738570842695287e-06,
"loss": 0.7771,
"step": 1506
},
{
"epoch": 0.4490297590226824,
"grad_norm": 0.40780338644981384,
"learning_rate": 6.7336947642943665e-06,
"loss": 0.7704,
"step": 1507
},
{
"epoch": 0.44932772170285673,
"grad_norm": 0.4173721671104431,
"learning_rate": 6.728816811024594e-06,
"loss": 0.7672,
"step": 1508
},
{
"epoch": 0.449625684383031,
"grad_norm": 0.4159814119338989,
"learning_rate": 6.723936988161138e-06,
"loss": 0.7421,
"step": 1509
},
{
"epoch": 0.4499236470632053,
"grad_norm": 0.4115670621395111,
"learning_rate": 6.719055300981181e-06,
"loss": 0.7692,
"step": 1510
},
{
"epoch": 0.45022160974337966,
"grad_norm": 0.4083748161792755,
"learning_rate": 6.714171754763923e-06,
"loss": 0.7779,
"step": 1511
},
{
"epoch": 0.45051957242355395,
"grad_norm": 0.42115432024002075,
"learning_rate": 6.709286354790577e-06,
"loss": 0.7554,
"step": 1512
},
{
"epoch": 0.45081753510372824,
"grad_norm": 0.40441468358039856,
"learning_rate": 6.704399106344359e-06,
"loss": 0.7255,
"step": 1513
},
{
"epoch": 0.4511154977839026,
"grad_norm": 0.40472057461738586,
"learning_rate": 6.699510014710484e-06,
"loss": 0.6787,
"step": 1514
},
{
"epoch": 0.4514134604640769,
"grad_norm": 0.42571139335632324,
"learning_rate": 6.694619085176159e-06,
"loss": 0.7412,
"step": 1515
},
{
"epoch": 0.45171142314425117,
"grad_norm": 0.4025956690311432,
"learning_rate": 6.689726323030582e-06,
"loss": 0.726,
"step": 1516
},
{
"epoch": 0.4520093858244255,
"grad_norm": 0.41559621691703796,
"learning_rate": 6.684831733564929e-06,
"loss": 0.7741,
"step": 1517
},
{
"epoch": 0.4523073485045998,
"grad_norm": 0.4175313413143158,
"learning_rate": 6.679935322072358e-06,
"loss": 0.7438,
"step": 1518
},
{
"epoch": 0.4526053111847741,
"grad_norm": 0.4239237606525421,
"learning_rate": 6.6750370938479895e-06,
"loss": 0.7551,
"step": 1519
},
{
"epoch": 0.45290327386494844,
"grad_norm": 0.41169679164886475,
"learning_rate": 6.670137054188912e-06,
"loss": 0.7535,
"step": 1520
},
{
"epoch": 0.45320123654512273,
"grad_norm": 0.4318566620349884,
"learning_rate": 6.665235208394175e-06,
"loss": 0.7596,
"step": 1521
},
{
"epoch": 0.453499199225297,
"grad_norm": 0.4227028787136078,
"learning_rate": 6.660331561764781e-06,
"loss": 0.7899,
"step": 1522
},
{
"epoch": 0.4537971619054713,
"grad_norm": 0.4185028076171875,
"learning_rate": 6.6554261196036755e-06,
"loss": 0.7579,
"step": 1523
},
{
"epoch": 0.45409512458564566,
"grad_norm": 0.451593279838562,
"learning_rate": 6.6505188872157525e-06,
"loss": 0.7607,
"step": 1524
},
{
"epoch": 0.45439308726581995,
"grad_norm": 0.4055825173854828,
"learning_rate": 6.645609869907835e-06,
"loss": 0.775,
"step": 1525
},
{
"epoch": 0.45469104994599424,
"grad_norm": 0.4261016547679901,
"learning_rate": 6.640699072988681e-06,
"loss": 0.7472,
"step": 1526
},
{
"epoch": 0.4549890126261686,
"grad_norm": 0.43738311529159546,
"learning_rate": 6.635786501768973e-06,
"loss": 0.7921,
"step": 1527
},
{
"epoch": 0.4552869753063429,
"grad_norm": 0.41713571548461914,
"learning_rate": 6.6308721615613106e-06,
"loss": 0.7361,
"step": 1528
},
{
"epoch": 0.45558493798651717,
"grad_norm": 0.41634663939476013,
"learning_rate": 6.6259560576802055e-06,
"loss": 0.7177,
"step": 1529
},
{
"epoch": 0.4558829006666915,
"grad_norm": 0.4361664652824402,
"learning_rate": 6.621038195442078e-06,
"loss": 0.7898,
"step": 1530
},
{
"epoch": 0.4561808633468658,
"grad_norm": 0.43086275458335876,
"learning_rate": 6.6161185801652495e-06,
"loss": 0.7399,
"step": 1531
},
{
"epoch": 0.4564788260270401,
"grad_norm": 0.4243473410606384,
"learning_rate": 6.61119721716994e-06,
"loss": 0.7602,
"step": 1532
},
{
"epoch": 0.45677678870721444,
"grad_norm": 0.40631866455078125,
"learning_rate": 6.606274111778257e-06,
"loss": 0.7682,
"step": 1533
},
{
"epoch": 0.45707475138738873,
"grad_norm": 0.42651253938674927,
"learning_rate": 6.601349269314188e-06,
"loss": 0.7426,
"step": 1534
},
{
"epoch": 0.457372714067563,
"grad_norm": 0.4395078718662262,
"learning_rate": 6.596422695103609e-06,
"loss": 0.8164,
"step": 1535
},
{
"epoch": 0.45767067674773737,
"grad_norm": 0.417476624250412,
"learning_rate": 6.591494394474261e-06,
"loss": 0.7173,
"step": 1536
},
{
"epoch": 0.45796863942791166,
"grad_norm": 0.4121398627758026,
"learning_rate": 6.586564372755754e-06,
"loss": 0.7461,
"step": 1537
},
{
"epoch": 0.45826660210808595,
"grad_norm": 0.4245747923851013,
"learning_rate": 6.581632635279558e-06,
"loss": 0.7696,
"step": 1538
},
{
"epoch": 0.45856456478826024,
"grad_norm": 0.4338565468788147,
"learning_rate": 6.576699187379003e-06,
"loss": 0.7906,
"step": 1539
},
{
"epoch": 0.4588625274684346,
"grad_norm": 0.44849637150764465,
"learning_rate": 6.571764034389263e-06,
"loss": 0.7914,
"step": 1540
},
{
"epoch": 0.4591604901486089,
"grad_norm": 0.42695313692092896,
"learning_rate": 6.566827181647361e-06,
"loss": 0.7365,
"step": 1541
},
{
"epoch": 0.45945845282878317,
"grad_norm": 0.43655329942703247,
"learning_rate": 6.561888634492153e-06,
"loss": 0.7474,
"step": 1542
},
{
"epoch": 0.4597564155089575,
"grad_norm": 0.4254245460033417,
"learning_rate": 6.556948398264332e-06,
"loss": 0.7564,
"step": 1543
},
{
"epoch": 0.4600543781891318,
"grad_norm": 0.41453203558921814,
"learning_rate": 6.552006478306416e-06,
"loss": 0.7308,
"step": 1544
},
{
"epoch": 0.4603523408693061,
"grad_norm": 0.43700653314590454,
"learning_rate": 6.547062879962742e-06,
"loss": 0.7549,
"step": 1545
},
{
"epoch": 0.46065030354948044,
"grad_norm": 0.42417776584625244,
"learning_rate": 6.5421176085794645e-06,
"loss": 0.7371,
"step": 1546
},
{
"epoch": 0.46094826622965474,
"grad_norm": 0.4269201159477234,
"learning_rate": 6.537170669504547e-06,
"loss": 0.7504,
"step": 1547
},
{
"epoch": 0.461246228909829,
"grad_norm": 0.4198196232318878,
"learning_rate": 6.532222068087754e-06,
"loss": 0.716,
"step": 1548
},
{
"epoch": 0.46154419159000337,
"grad_norm": 0.4342532455921173,
"learning_rate": 6.527271809680651e-06,
"loss": 0.7595,
"step": 1549
},
{
"epoch": 0.46184215427017766,
"grad_norm": 0.4008735716342926,
"learning_rate": 6.522319899636594e-06,
"loss": 0.7101,
"step": 1550
},
{
"epoch": 0.46214011695035195,
"grad_norm": 0.4136100113391876,
"learning_rate": 6.517366343310726e-06,
"loss": 0.7564,
"step": 1551
},
{
"epoch": 0.4624380796305263,
"grad_norm": 0.4406328797340393,
"learning_rate": 6.512411146059967e-06,
"loss": 0.7355,
"step": 1552
},
{
"epoch": 0.4627360423107006,
"grad_norm": 0.40765517950057983,
"learning_rate": 6.507454313243016e-06,
"loss": 0.7443,
"step": 1553
},
{
"epoch": 0.4630340049908749,
"grad_norm": 0.418593168258667,
"learning_rate": 6.502495850220337e-06,
"loss": 0.7818,
"step": 1554
},
{
"epoch": 0.4633319676710492,
"grad_norm": 0.4101206958293915,
"learning_rate": 6.497535762354162e-06,
"loss": 0.7134,
"step": 1555
},
{
"epoch": 0.4636299303512235,
"grad_norm": 0.4457070827484131,
"learning_rate": 6.492574055008474e-06,
"loss": 0.756,
"step": 1556
},
{
"epoch": 0.4639278930313978,
"grad_norm": 0.4158073961734772,
"learning_rate": 6.4876107335490106e-06,
"loss": 0.7487,
"step": 1557
},
{
"epoch": 0.4642258557115721,
"grad_norm": 0.4253988564014435,
"learning_rate": 6.482645803343255e-06,
"loss": 0.7218,
"step": 1558
},
{
"epoch": 0.46452381839174645,
"grad_norm": 0.4238702654838562,
"learning_rate": 6.4776792697604305e-06,
"loss": 0.7543,
"step": 1559
},
{
"epoch": 0.46482178107192074,
"grad_norm": 0.4382781386375427,
"learning_rate": 6.472711138171492e-06,
"loss": 0.7234,
"step": 1560
},
{
"epoch": 0.465119743752095,
"grad_norm": 0.41743335127830505,
"learning_rate": 6.467741413949124e-06,
"loss": 0.7032,
"step": 1561
},
{
"epoch": 0.4654177064322694,
"grad_norm": 0.4467609226703644,
"learning_rate": 6.462770102467736e-06,
"loss": 0.7761,
"step": 1562
},
{
"epoch": 0.46571566911244366,
"grad_norm": 0.41901764273643494,
"learning_rate": 6.457797209103449e-06,
"loss": 0.7398,
"step": 1563
},
{
"epoch": 0.46601363179261795,
"grad_norm": 0.45777711272239685,
"learning_rate": 6.452822739234097e-06,
"loss": 0.7641,
"step": 1564
},
{
"epoch": 0.4663115944727923,
"grad_norm": 0.4191468358039856,
"learning_rate": 6.447846698239221e-06,
"loss": 0.7425,
"step": 1565
},
{
"epoch": 0.4666095571529666,
"grad_norm": 0.42025595903396606,
"learning_rate": 6.442869091500058e-06,
"loss": 0.7796,
"step": 1566
},
{
"epoch": 0.4669075198331409,
"grad_norm": 0.41758546233177185,
"learning_rate": 6.437889924399539e-06,
"loss": 0.7638,
"step": 1567
},
{
"epoch": 0.46720548251331523,
"grad_norm": 0.41302821040153503,
"learning_rate": 6.4329092023222825e-06,
"loss": 0.7336,
"step": 1568
},
{
"epoch": 0.4675034451934895,
"grad_norm": 0.42900779843330383,
"learning_rate": 6.427926930654589e-06,
"loss": 0.7841,
"step": 1569
},
{
"epoch": 0.4678014078736638,
"grad_norm": 0.42111918330192566,
"learning_rate": 6.422943114784437e-06,
"loss": 0.7613,
"step": 1570
},
{
"epoch": 0.46809937055383816,
"grad_norm": 0.417671799659729,
"learning_rate": 6.417957760101467e-06,
"loss": 0.7484,
"step": 1571
},
{
"epoch": 0.46839733323401245,
"grad_norm": 0.4312800168991089,
"learning_rate": 6.412970871996995e-06,
"loss": 0.7391,
"step": 1572
},
{
"epoch": 0.46869529591418674,
"grad_norm": 0.40557584166526794,
"learning_rate": 6.407982455863986e-06,
"loss": 0.6973,
"step": 1573
},
{
"epoch": 0.46899325859436103,
"grad_norm": 0.4075579345226288,
"learning_rate": 6.402992517097062e-06,
"loss": 0.7604,
"step": 1574
},
{
"epoch": 0.4692912212745354,
"grad_norm": 0.41097551584243774,
"learning_rate": 6.398001061092492e-06,
"loss": 0.7373,
"step": 1575
},
{
"epoch": 0.46958918395470967,
"grad_norm": 0.4276979863643646,
"learning_rate": 6.39300809324818e-06,
"loss": 0.7527,
"step": 1576
},
{
"epoch": 0.46988714663488396,
"grad_norm": 0.40917807817459106,
"learning_rate": 6.388013618963674e-06,
"loss": 0.7059,
"step": 1577
},
{
"epoch": 0.4701851093150583,
"grad_norm": 0.41270890831947327,
"learning_rate": 6.383017643640144e-06,
"loss": 0.7242,
"step": 1578
},
{
"epoch": 0.4704830719952326,
"grad_norm": 0.4280890226364136,
"learning_rate": 6.378020172680386e-06,
"loss": 0.7686,
"step": 1579
},
{
"epoch": 0.4707810346754069,
"grad_norm": 0.430791437625885,
"learning_rate": 6.373021211488812e-06,
"loss": 0.7696,
"step": 1580
},
{
"epoch": 0.47107899735558123,
"grad_norm": 0.4117489457130432,
"learning_rate": 6.36802076547145e-06,
"loss": 0.7613,
"step": 1581
},
{
"epoch": 0.4713769600357555,
"grad_norm": 0.43257904052734375,
"learning_rate": 6.363018840035926e-06,
"loss": 0.7356,
"step": 1582
},
{
"epoch": 0.4716749227159298,
"grad_norm": 0.4123290181159973,
"learning_rate": 6.358015440591472e-06,
"loss": 0.7342,
"step": 1583
},
{
"epoch": 0.47197288539610416,
"grad_norm": 0.44685089588165283,
"learning_rate": 6.3530105725489136e-06,
"loss": 0.7003,
"step": 1584
},
{
"epoch": 0.47227084807627845,
"grad_norm": 0.40533214807510376,
"learning_rate": 6.348004241320662e-06,
"loss": 0.7865,
"step": 1585
},
{
"epoch": 0.47256881075645274,
"grad_norm": 0.4086616039276123,
"learning_rate": 6.342996452320713e-06,
"loss": 0.7439,
"step": 1586
},
{
"epoch": 0.4728667734366271,
"grad_norm": 0.42993593215942383,
"learning_rate": 6.337987210964636e-06,
"loss": 0.7582,
"step": 1587
},
{
"epoch": 0.4731647361168014,
"grad_norm": 0.423460990190506,
"learning_rate": 6.332976522669576e-06,
"loss": 0.7413,
"step": 1588
},
{
"epoch": 0.47346269879697567,
"grad_norm": 0.43371692299842834,
"learning_rate": 6.327964392854237e-06,
"loss": 0.7746,
"step": 1589
},
{
"epoch": 0.47376066147715,
"grad_norm": 0.42651933431625366,
"learning_rate": 6.322950826938885e-06,
"loss": 0.7536,
"step": 1590
},
{
"epoch": 0.4740586241573243,
"grad_norm": 0.4330197870731354,
"learning_rate": 6.3179358303453386e-06,
"loss": 0.8055,
"step": 1591
},
{
"epoch": 0.4743565868374986,
"grad_norm": 0.4229655861854553,
"learning_rate": 6.3129194084969655e-06,
"loss": 0.7512,
"step": 1592
},
{
"epoch": 0.4746545495176729,
"grad_norm": 0.42545217275619507,
"learning_rate": 6.30790156681867e-06,
"loss": 0.7412,
"step": 1593
},
{
"epoch": 0.47495251219784723,
"grad_norm": 0.42011532187461853,
"learning_rate": 6.3028823107368965e-06,
"loss": 0.7265,
"step": 1594
},
{
"epoch": 0.4752504748780215,
"grad_norm": 0.4164109528064728,
"learning_rate": 6.297861645679616e-06,
"loss": 0.7349,
"step": 1595
},
{
"epoch": 0.4755484375581958,
"grad_norm": 0.4242698550224304,
"learning_rate": 6.292839577076326e-06,
"loss": 0.7233,
"step": 1596
},
{
"epoch": 0.47584640023837016,
"grad_norm": 0.4278213083744049,
"learning_rate": 6.2878161103580395e-06,
"loss": 0.7834,
"step": 1597
},
{
"epoch": 0.47614436291854445,
"grad_norm": 0.43776217103004456,
"learning_rate": 6.28279125095728e-06,
"loss": 0.7799,
"step": 1598
},
{
"epoch": 0.47644232559871874,
"grad_norm": 0.4515795409679413,
"learning_rate": 6.277765004308083e-06,
"loss": 0.8182,
"step": 1599
},
{
"epoch": 0.4767402882788931,
"grad_norm": 0.39705896377563477,
"learning_rate": 6.2727373758459765e-06,
"loss": 0.7643,
"step": 1600
},
{
"epoch": 0.4770382509590674,
"grad_norm": 0.42140883207321167,
"learning_rate": 6.267708371007991e-06,
"loss": 0.7589,
"step": 1601
},
{
"epoch": 0.47733621363924167,
"grad_norm": 0.41087716817855835,
"learning_rate": 6.262677995232637e-06,
"loss": 0.7324,
"step": 1602
},
{
"epoch": 0.477634176319416,
"grad_norm": 0.40221107006073,
"learning_rate": 6.2576462539599145e-06,
"loss": 0.7054,
"step": 1603
},
{
"epoch": 0.4779321389995903,
"grad_norm": 0.4330417513847351,
"learning_rate": 6.252613152631297e-06,
"loss": 0.7367,
"step": 1604
},
{
"epoch": 0.4782301016797646,
"grad_norm": 0.40460506081581116,
"learning_rate": 6.247578696689729e-06,
"loss": 0.7369,
"step": 1605
},
{
"epoch": 0.47852806435993894,
"grad_norm": 0.42739930748939514,
"learning_rate": 6.242542891579619e-06,
"loss": 0.7868,
"step": 1606
},
{
"epoch": 0.47882602704011323,
"grad_norm": 0.4340622127056122,
"learning_rate": 6.237505742746839e-06,
"loss": 0.7502,
"step": 1607
},
{
"epoch": 0.4791239897202875,
"grad_norm": 0.41937345266342163,
"learning_rate": 6.232467255638709e-06,
"loss": 0.751,
"step": 1608
},
{
"epoch": 0.4794219524004618,
"grad_norm": 0.4325072765350342,
"learning_rate": 6.227427435703997e-06,
"loss": 0.7515,
"step": 1609
},
{
"epoch": 0.47971991508063616,
"grad_norm": 0.409756600856781,
"learning_rate": 6.222386288392914e-06,
"loss": 0.745,
"step": 1610
},
{
"epoch": 0.48001787776081045,
"grad_norm": 0.4101891815662384,
"learning_rate": 6.217343819157106e-06,
"loss": 0.7049,
"step": 1611
},
{
"epoch": 0.48031584044098474,
"grad_norm": 0.4222071170806885,
"learning_rate": 6.212300033449652e-06,
"loss": 0.7959,
"step": 1612
},
{
"epoch": 0.4806138031211591,
"grad_norm": 0.4142032265663147,
"learning_rate": 6.2072549367250465e-06,
"loss": 0.7132,
"step": 1613
},
{
"epoch": 0.4809117658013334,
"grad_norm": 0.4161210358142853,
"learning_rate": 6.202208534439208e-06,
"loss": 0.7321,
"step": 1614
},
{
"epoch": 0.48120972848150767,
"grad_norm": 0.4110502302646637,
"learning_rate": 6.197160832049466e-06,
"loss": 0.7407,
"step": 1615
},
{
"epoch": 0.481507691161682,
"grad_norm": 0.4301973879337311,
"learning_rate": 6.192111835014554e-06,
"loss": 0.7345,
"step": 1616
},
{
"epoch": 0.4818056538418563,
"grad_norm": 0.41788768768310547,
"learning_rate": 6.187061548794609e-06,
"loss": 0.7467,
"step": 1617
},
{
"epoch": 0.4821036165220306,
"grad_norm": 0.41017264127731323,
"learning_rate": 6.182009978851158e-06,
"loss": 0.7098,
"step": 1618
},
{
"epoch": 0.48240157920220494,
"grad_norm": 0.4400678873062134,
"learning_rate": 6.17695713064712e-06,
"loss": 0.7861,
"step": 1619
},
{
"epoch": 0.48269954188237923,
"grad_norm": 0.407147616147995,
"learning_rate": 6.171903009646792e-06,
"loss": 0.7835,
"step": 1620
},
{
"epoch": 0.4829975045625535,
"grad_norm": 0.41501620411872864,
"learning_rate": 6.1668476213158525e-06,
"loss": 0.7324,
"step": 1621
},
{
"epoch": 0.48329546724272787,
"grad_norm": 0.4054969549179077,
"learning_rate": 6.161790971121349e-06,
"loss": 0.7487,
"step": 1622
},
{
"epoch": 0.48359342992290216,
"grad_norm": 0.4138328433036804,
"learning_rate": 6.1567330645316906e-06,
"loss": 0.7081,
"step": 1623
},
{
"epoch": 0.48389139260307645,
"grad_norm": 0.42272600531578064,
"learning_rate": 6.151673907016646e-06,
"loss": 0.7654,
"step": 1624
},
{
"epoch": 0.4841893552832508,
"grad_norm": 0.4159351587295532,
"learning_rate": 6.146613504047342e-06,
"loss": 0.7813,
"step": 1625
},
{
"epoch": 0.4844873179634251,
"grad_norm": 0.42444831132888794,
"learning_rate": 6.1415518610962445e-06,
"loss": 0.7335,
"step": 1626
},
{
"epoch": 0.4847852806435994,
"grad_norm": 0.4191901683807373,
"learning_rate": 6.136488983637165e-06,
"loss": 0.7595,
"step": 1627
},
{
"epoch": 0.48508324332377367,
"grad_norm": 0.4251089096069336,
"learning_rate": 6.131424877145252e-06,
"loss": 0.7984,
"step": 1628
},
{
"epoch": 0.485381206003948,
"grad_norm": 0.4156152904033661,
"learning_rate": 6.126359547096975e-06,
"loss": 0.783,
"step": 1629
},
{
"epoch": 0.4856791686841223,
"grad_norm": 0.40349531173706055,
"learning_rate": 6.121292998970138e-06,
"loss": 0.7275,
"step": 1630
},
{
"epoch": 0.4859771313642966,
"grad_norm": 0.44632115960121155,
"learning_rate": 6.11622523824385e-06,
"loss": 0.8035,
"step": 1631
},
{
"epoch": 0.48627509404447095,
"grad_norm": 0.42304009199142456,
"learning_rate": 6.111156270398542e-06,
"loss": 0.7565,
"step": 1632
},
{
"epoch": 0.48657305672464524,
"grad_norm": 0.4415636658668518,
"learning_rate": 6.106086100915942e-06,
"loss": 0.7514,
"step": 1633
},
{
"epoch": 0.4868710194048195,
"grad_norm": 0.442160427570343,
"learning_rate": 6.1010147352790875e-06,
"loss": 0.722,
"step": 1634
},
{
"epoch": 0.4871689820849939,
"grad_norm": 0.4188246428966522,
"learning_rate": 6.095942178972296e-06,
"loss": 0.7349,
"step": 1635
},
{
"epoch": 0.48746694476516816,
"grad_norm": 0.41654735803604126,
"learning_rate": 6.090868437481185e-06,
"loss": 0.732,
"step": 1636
},
{
"epoch": 0.48776490744534245,
"grad_norm": 0.43010467290878296,
"learning_rate": 6.085793516292647e-06,
"loss": 0.7667,
"step": 1637
},
{
"epoch": 0.4880628701255168,
"grad_norm": 0.4185599088668823,
"learning_rate": 6.080717420894852e-06,
"loss": 0.7515,
"step": 1638
},
{
"epoch": 0.4883608328056911,
"grad_norm": 0.423115998506546,
"learning_rate": 6.075640156777243e-06,
"loss": 0.7805,
"step": 1639
},
{
"epoch": 0.4886587954858654,
"grad_norm": 0.42163342237472534,
"learning_rate": 6.070561729430518e-06,
"loss": 0.7515,
"step": 1640
},
{
"epoch": 0.48895675816603973,
"grad_norm": 0.4198808968067169,
"learning_rate": 6.065482144346644e-06,
"loss": 0.7094,
"step": 1641
},
{
"epoch": 0.489254720846214,
"grad_norm": 0.43005016446113586,
"learning_rate": 6.060401407018832e-06,
"loss": 0.7835,
"step": 1642
},
{
"epoch": 0.4895526835263883,
"grad_norm": 0.431854784488678,
"learning_rate": 6.055319522941543e-06,
"loss": 0.7756,
"step": 1643
},
{
"epoch": 0.4898506462065626,
"grad_norm": 0.4222675561904907,
"learning_rate": 6.0502364976104734e-06,
"loss": 0.7308,
"step": 1644
},
{
"epoch": 0.49014860888673695,
"grad_norm": 0.40350571274757385,
"learning_rate": 6.045152336522562e-06,
"loss": 0.7208,
"step": 1645
},
{
"epoch": 0.49044657156691124,
"grad_norm": 0.4296424388885498,
"learning_rate": 6.040067045175969e-06,
"loss": 0.7707,
"step": 1646
},
{
"epoch": 0.49074453424708553,
"grad_norm": 0.42658230662345886,
"learning_rate": 6.034980629070078e-06,
"loss": 0.7956,
"step": 1647
},
{
"epoch": 0.4910424969272599,
"grad_norm": 0.4275640547275543,
"learning_rate": 6.029893093705492e-06,
"loss": 0.746,
"step": 1648
},
{
"epoch": 0.49134045960743417,
"grad_norm": 0.4200149476528168,
"learning_rate": 6.0248044445840215e-06,
"loss": 0.7644,
"step": 1649
},
{
"epoch": 0.49163842228760846,
"grad_norm": 0.40972739458084106,
"learning_rate": 6.019714687208684e-06,
"loss": 0.7393,
"step": 1650
},
{
"epoch": 0.4919363849677828,
"grad_norm": 0.4072873294353485,
"learning_rate": 6.0146238270836895e-06,
"loss": 0.6877,
"step": 1651
},
{
"epoch": 0.4922343476479571,
"grad_norm": 0.43277212977409363,
"learning_rate": 6.00953186971445e-06,
"loss": 0.7561,
"step": 1652
},
{
"epoch": 0.4925323103281314,
"grad_norm": 0.4115188419818878,
"learning_rate": 6.004438820607554e-06,
"loss": 0.7191,
"step": 1653
},
{
"epoch": 0.49283027300830573,
"grad_norm": 0.4096651077270508,
"learning_rate": 5.999344685270782e-06,
"loss": 0.7819,
"step": 1654
},
{
"epoch": 0.49312823568848,
"grad_norm": 0.4046769440174103,
"learning_rate": 5.9942494692130744e-06,
"loss": 0.6998,
"step": 1655
},
{
"epoch": 0.4934261983686543,
"grad_norm": 0.4084392488002777,
"learning_rate": 5.989153177944555e-06,
"loss": 0.7341,
"step": 1656
},
{
"epoch": 0.49372416104882866,
"grad_norm": 0.4243430495262146,
"learning_rate": 5.984055816976504e-06,
"loss": 0.71,
"step": 1657
},
{
"epoch": 0.49402212372900295,
"grad_norm": 0.4338114857673645,
"learning_rate": 5.978957391821354e-06,
"loss": 0.7579,
"step": 1658
},
{
"epoch": 0.49432008640917724,
"grad_norm": 0.41517770290374756,
"learning_rate": 5.973857907992698e-06,
"loss": 0.7255,
"step": 1659
},
{
"epoch": 0.4946180490893516,
"grad_norm": 0.40864789485931396,
"learning_rate": 5.968757371005265e-06,
"loss": 0.7707,
"step": 1660
},
{
"epoch": 0.4949160117695259,
"grad_norm": 0.41645941138267517,
"learning_rate": 5.963655786374929e-06,
"loss": 0.7655,
"step": 1661
},
{
"epoch": 0.49521397444970017,
"grad_norm": 0.4150625169277191,
"learning_rate": 5.958553159618693e-06,
"loss": 0.7697,
"step": 1662
},
{
"epoch": 0.49551193712987446,
"grad_norm": 0.41089800000190735,
"learning_rate": 5.95344949625469e-06,
"loss": 0.738,
"step": 1663
},
{
"epoch": 0.4958098998100488,
"grad_norm": 0.41153329610824585,
"learning_rate": 5.948344801802172e-06,
"loss": 0.7249,
"step": 1664
},
{
"epoch": 0.4961078624902231,
"grad_norm": 0.4157087206840515,
"learning_rate": 5.943239081781508e-06,
"loss": 0.7291,
"step": 1665
},
{
"epoch": 0.4964058251703974,
"grad_norm": 0.4415079355239868,
"learning_rate": 5.938132341714173e-06,
"loss": 0.7922,
"step": 1666
},
{
"epoch": 0.49670378785057173,
"grad_norm": 0.4293401837348938,
"learning_rate": 5.933024587122745e-06,
"loss": 0.7878,
"step": 1667
},
{
"epoch": 0.497001750530746,
"grad_norm": 0.41637614369392395,
"learning_rate": 5.927915823530907e-06,
"loss": 0.7944,
"step": 1668
},
{
"epoch": 0.4972997132109203,
"grad_norm": 0.41283151507377625,
"learning_rate": 5.922806056463421e-06,
"loss": 0.7635,
"step": 1669
},
{
"epoch": 0.49759767589109466,
"grad_norm": 0.4281843304634094,
"learning_rate": 5.917695291446146e-06,
"loss": 0.7463,
"step": 1670
},
{
"epoch": 0.49789563857126895,
"grad_norm": 0.416308730840683,
"learning_rate": 5.91258353400601e-06,
"loss": 0.7406,
"step": 1671
},
{
"epoch": 0.49819360125144324,
"grad_norm": 0.43074557185173035,
"learning_rate": 5.9074707896710225e-06,
"loss": 0.7725,
"step": 1672
},
{
"epoch": 0.4984915639316176,
"grad_norm": 0.41739097237586975,
"learning_rate": 5.9023570639702544e-06,
"loss": 0.7566,
"step": 1673
},
{
"epoch": 0.4987895266117919,
"grad_norm": 0.40861406922340393,
"learning_rate": 5.8972423624338395e-06,
"loss": 0.7582,
"step": 1674
},
{
"epoch": 0.49908748929196617,
"grad_norm": 0.4410512447357178,
"learning_rate": 5.892126690592969e-06,
"loss": 0.7696,
"step": 1675
},
{
"epoch": 0.4993854519721405,
"grad_norm": 0.41618219017982483,
"learning_rate": 5.887010053979881e-06,
"loss": 0.7333,
"step": 1676
},
{
"epoch": 0.4996834146523148,
"grad_norm": 0.4145171046257019,
"learning_rate": 5.881892458127858e-06,
"loss": 0.759,
"step": 1677
},
{
"epoch": 0.4999813773324891,
"grad_norm": 0.42233115434646606,
"learning_rate": 5.87677390857122e-06,
"loss": 0.7641,
"step": 1678
},
{
"epoch": 0.5002793400126634,
"grad_norm": 0.4378186762332916,
"learning_rate": 5.871654410845317e-06,
"loss": 0.7472,
"step": 1679
},
{
"epoch": 0.5005773026928377,
"grad_norm": 0.40987905859947205,
"learning_rate": 5.866533970486529e-06,
"loss": 0.724,
"step": 1680
},
{
"epoch": 0.5008752653730121,
"grad_norm": 0.4189298152923584,
"learning_rate": 5.861412593032247e-06,
"loss": 0.7603,
"step": 1681
},
{
"epoch": 0.5011732280531863,
"grad_norm": 0.4255460798740387,
"learning_rate": 5.856290284020883e-06,
"loss": 0.7548,
"step": 1682
},
{
"epoch": 0.5014711907333607,
"grad_norm": 0.4227476716041565,
"learning_rate": 5.851167048991853e-06,
"loss": 0.7465,
"step": 1683
},
{
"epoch": 0.501769153413535,
"grad_norm": 0.3998330235481262,
"learning_rate": 5.846042893485575e-06,
"loss": 0.7286,
"step": 1684
},
{
"epoch": 0.5020671160937092,
"grad_norm": 0.42391958832740784,
"learning_rate": 5.8409178230434615e-06,
"loss": 0.7546,
"step": 1685
},
{
"epoch": 0.5023650787738836,
"grad_norm": 0.4327545762062073,
"learning_rate": 5.835791843207916e-06,
"loss": 0.7452,
"step": 1686
},
{
"epoch": 0.5026630414540578,
"grad_norm": 0.440319299697876,
"learning_rate": 5.830664959522328e-06,
"loss": 0.7819,
"step": 1687
},
{
"epoch": 0.5029610041342322,
"grad_norm": 0.4276452958583832,
"learning_rate": 5.825537177531057e-06,
"loss": 0.7612,
"step": 1688
},
{
"epoch": 0.5032589668144065,
"grad_norm": 0.42737245559692383,
"learning_rate": 5.82040850277944e-06,
"loss": 0.7507,
"step": 1689
},
{
"epoch": 0.5035569294945808,
"grad_norm": 0.4190289080142975,
"learning_rate": 5.815278940813777e-06,
"loss": 0.729,
"step": 1690
},
{
"epoch": 0.5038548921747551,
"grad_norm": 0.4288443326950073,
"learning_rate": 5.810148497181328e-06,
"loss": 0.7807,
"step": 1691
},
{
"epoch": 0.5041528548549294,
"grad_norm": 0.41055840253829956,
"learning_rate": 5.80501717743031e-06,
"loss": 0.7483,
"step": 1692
},
{
"epoch": 0.5044508175351037,
"grad_norm": 0.42178279161453247,
"learning_rate": 5.799884987109878e-06,
"loss": 0.7567,
"step": 1693
},
{
"epoch": 0.504748780215278,
"grad_norm": 0.4185313284397125,
"learning_rate": 5.794751931770142e-06,
"loss": 0.7636,
"step": 1694
},
{
"epoch": 0.5050467428954524,
"grad_norm": 0.4304468631744385,
"learning_rate": 5.789618016962134e-06,
"loss": 0.7748,
"step": 1695
},
{
"epoch": 0.5053447055756266,
"grad_norm": 0.4272032380104065,
"learning_rate": 5.7844832482378245e-06,
"loss": 0.7829,
"step": 1696
},
{
"epoch": 0.505642668255801,
"grad_norm": 0.4228638708591461,
"learning_rate": 5.779347631150101e-06,
"loss": 0.7745,
"step": 1697
},
{
"epoch": 0.5059406309359753,
"grad_norm": 0.4103485643863678,
"learning_rate": 5.774211171252777e-06,
"loss": 0.739,
"step": 1698
},
{
"epoch": 0.5062385936161495,
"grad_norm": 0.4145384430885315,
"learning_rate": 5.769073874100569e-06,
"loss": 0.7347,
"step": 1699
},
{
"epoch": 0.5065365562963239,
"grad_norm": 0.4036468267440796,
"learning_rate": 5.763935745249103e-06,
"loss": 0.7609,
"step": 1700
},
{
"epoch": 0.5068345189764982,
"grad_norm": 0.426704466342926,
"learning_rate": 5.758796790254902e-06,
"loss": 0.7553,
"step": 1701
},
{
"epoch": 0.5071324816566725,
"grad_norm": 0.4068341851234436,
"learning_rate": 5.7536570146753874e-06,
"loss": 0.7423,
"step": 1702
},
{
"epoch": 0.5074304443368468,
"grad_norm": 0.4228654205799103,
"learning_rate": 5.748516424068864e-06,
"loss": 0.72,
"step": 1703
},
{
"epoch": 0.5077284070170212,
"grad_norm": 0.41845396161079407,
"learning_rate": 5.743375023994514e-06,
"loss": 0.7157,
"step": 1704
},
{
"epoch": 0.5080263696971954,
"grad_norm": 0.39111408591270447,
"learning_rate": 5.738232820012407e-06,
"loss": 0.7146,
"step": 1705
},
{
"epoch": 0.5083243323773697,
"grad_norm": 0.4307633936405182,
"learning_rate": 5.733089817683469e-06,
"loss": 0.7701,
"step": 1706
},
{
"epoch": 0.5086222950575441,
"grad_norm": 0.4309898316860199,
"learning_rate": 5.7279460225694985e-06,
"loss": 0.7344,
"step": 1707
},
{
"epoch": 0.5089202577377183,
"grad_norm": 0.42719435691833496,
"learning_rate": 5.722801440233145e-06,
"loss": 0.7531,
"step": 1708
},
{
"epoch": 0.5092182204178927,
"grad_norm": 0.4361584484577179,
"learning_rate": 5.7176560762379144e-06,
"loss": 0.756,
"step": 1709
},
{
"epoch": 0.509516183098067,
"grad_norm": 0.40878501534461975,
"learning_rate": 5.712509936148153e-06,
"loss": 0.7514,
"step": 1710
},
{
"epoch": 0.5098141457782412,
"grad_norm": 0.4180915653705597,
"learning_rate": 5.7073630255290515e-06,
"loss": 0.7179,
"step": 1711
},
{
"epoch": 0.5101121084584156,
"grad_norm": 0.41091781854629517,
"learning_rate": 5.70221534994663e-06,
"loss": 0.744,
"step": 1712
},
{
"epoch": 0.5104100711385899,
"grad_norm": 0.41649574041366577,
"learning_rate": 5.6970669149677395e-06,
"loss": 0.7387,
"step": 1713
},
{
"epoch": 0.5107080338187642,
"grad_norm": 0.4178697466850281,
"learning_rate": 5.691917726160049e-06,
"loss": 0.7765,
"step": 1714
},
{
"epoch": 0.5110059964989385,
"grad_norm": 0.423127681016922,
"learning_rate": 5.686767789092041e-06,
"loss": 0.7307,
"step": 1715
},
{
"epoch": 0.5113039591791129,
"grad_norm": 0.4184946119785309,
"learning_rate": 5.6816171093330145e-06,
"loss": 0.7752,
"step": 1716
},
{
"epoch": 0.5116019218592871,
"grad_norm": 0.4245437979698181,
"learning_rate": 5.676465692453063e-06,
"loss": 0.7716,
"step": 1717
},
{
"epoch": 0.5118998845394614,
"grad_norm": 0.42974284291267395,
"learning_rate": 5.671313544023084e-06,
"loss": 0.7343,
"step": 1718
},
{
"epoch": 0.5121978472196358,
"grad_norm": 0.415322870016098,
"learning_rate": 5.666160669614761e-06,
"loss": 0.7643,
"step": 1719
},
{
"epoch": 0.51249580989981,
"grad_norm": 0.41199418902397156,
"learning_rate": 5.661007074800569e-06,
"loss": 0.7272,
"step": 1720
},
{
"epoch": 0.5127937725799844,
"grad_norm": 0.4084899425506592,
"learning_rate": 5.655852765153752e-06,
"loss": 0.7091,
"step": 1721
},
{
"epoch": 0.5130917352601586,
"grad_norm": 0.42653077840805054,
"learning_rate": 5.650697746248338e-06,
"loss": 0.7605,
"step": 1722
},
{
"epoch": 0.513389697940333,
"grad_norm": 0.41063791513442993,
"learning_rate": 5.645542023659115e-06,
"loss": 0.7312,
"step": 1723
},
{
"epoch": 0.5136876606205073,
"grad_norm": 0.40550047159194946,
"learning_rate": 5.640385602961634e-06,
"loss": 0.7468,
"step": 1724
},
{
"epoch": 0.5139856233006815,
"grad_norm": 0.4107825756072998,
"learning_rate": 5.635228489732204e-06,
"loss": 0.7817,
"step": 1725
},
{
"epoch": 0.5142835859808559,
"grad_norm": 0.42436328530311584,
"learning_rate": 5.630070689547875e-06,
"loss": 0.7305,
"step": 1726
},
{
"epoch": 0.5145815486610302,
"grad_norm": 0.4121907949447632,
"learning_rate": 5.624912207986448e-06,
"loss": 0.7238,
"step": 1727
},
{
"epoch": 0.5148795113412045,
"grad_norm": 0.41771507263183594,
"learning_rate": 5.619753050626458e-06,
"loss": 0.7572,
"step": 1728
},
{
"epoch": 0.5151774740213788,
"grad_norm": 0.3964030146598816,
"learning_rate": 5.614593223047169e-06,
"loss": 0.7157,
"step": 1729
},
{
"epoch": 0.5154754367015532,
"grad_norm": 0.425343781709671,
"learning_rate": 5.609432730828571e-06,
"loss": 0.7681,
"step": 1730
},
{
"epoch": 0.5157733993817274,
"grad_norm": 0.4268225133419037,
"learning_rate": 5.604271579551375e-06,
"loss": 0.7906,
"step": 1731
},
{
"epoch": 0.5160713620619017,
"grad_norm": 0.41018426418304443,
"learning_rate": 5.599109774797e-06,
"loss": 0.7768,
"step": 1732
},
{
"epoch": 0.5163693247420761,
"grad_norm": 0.42251598834991455,
"learning_rate": 5.593947322147577e-06,
"loss": 0.7095,
"step": 1733
},
{
"epoch": 0.5166672874222503,
"grad_norm": 0.40326982736587524,
"learning_rate": 5.588784227185936e-06,
"loss": 0.7339,
"step": 1734
},
{
"epoch": 0.5169652501024247,
"grad_norm": 0.41833508014678955,
"learning_rate": 5.583620495495596e-06,
"loss": 0.7188,
"step": 1735
},
{
"epoch": 0.517263212782599,
"grad_norm": 0.41312336921691895,
"learning_rate": 5.578456132660774e-06,
"loss": 0.7073,
"step": 1736
},
{
"epoch": 0.5175611754627732,
"grad_norm": 0.4036800265312195,
"learning_rate": 5.573291144266364e-06,
"loss": 0.7246,
"step": 1737
},
{
"epoch": 0.5178591381429476,
"grad_norm": 0.4259328842163086,
"learning_rate": 5.5681255358979355e-06,
"loss": 0.7764,
"step": 1738
},
{
"epoch": 0.5181571008231219,
"grad_norm": 0.40341201424598694,
"learning_rate": 5.562959313141732e-06,
"loss": 0.7309,
"step": 1739
},
{
"epoch": 0.5184550635032962,
"grad_norm": 0.4262118935585022,
"learning_rate": 5.557792481584661e-06,
"loss": 0.7878,
"step": 1740
},
{
"epoch": 0.5187530261834705,
"grad_norm": 0.4327569007873535,
"learning_rate": 5.552625046814283e-06,
"loss": 0.7517,
"step": 1741
},
{
"epoch": 0.5190509888636449,
"grad_norm": 0.41983869671821594,
"learning_rate": 5.547457014418818e-06,
"loss": 0.8,
"step": 1742
},
{
"epoch": 0.5193489515438191,
"grad_norm": 0.4388618767261505,
"learning_rate": 5.542288389987128e-06,
"loss": 0.7637,
"step": 1743
},
{
"epoch": 0.5196469142239935,
"grad_norm": 0.41374140977859497,
"learning_rate": 5.5371191791087185e-06,
"loss": 0.7614,
"step": 1744
},
{
"epoch": 0.5199448769041678,
"grad_norm": 0.4400937557220459,
"learning_rate": 5.531949387373725e-06,
"loss": 0.7939,
"step": 1745
},
{
"epoch": 0.520242839584342,
"grad_norm": 0.4292343556880951,
"learning_rate": 5.526779020372913e-06,
"loss": 0.7514,
"step": 1746
},
{
"epoch": 0.5205408022645164,
"grad_norm": 0.4206315577030182,
"learning_rate": 5.521608083697673e-06,
"loss": 0.7334,
"step": 1747
},
{
"epoch": 0.5208387649446907,
"grad_norm": 0.41921359300613403,
"learning_rate": 5.516436582940007e-06,
"loss": 0.7557,
"step": 1748
},
{
"epoch": 0.521136727624865,
"grad_norm": 0.42275798320770264,
"learning_rate": 5.511264523692531e-06,
"loss": 0.7549,
"step": 1749
},
{
"epoch": 0.5214346903050393,
"grad_norm": 0.4251478314399719,
"learning_rate": 5.5060919115484594e-06,
"loss": 0.7387,
"step": 1750
},
{
"epoch": 0.5217326529852137,
"grad_norm": 0.41317999362945557,
"learning_rate": 5.500918752101611e-06,
"loss": 0.7472,
"step": 1751
},
{
"epoch": 0.5220306156653879,
"grad_norm": 0.4308857023715973,
"learning_rate": 5.495745050946394e-06,
"loss": 0.7731,
"step": 1752
},
{
"epoch": 0.5223285783455622,
"grad_norm": 0.42264774441719055,
"learning_rate": 5.4905708136778e-06,
"loss": 0.7117,
"step": 1753
},
{
"epoch": 0.5226265410257366,
"grad_norm": 0.44053274393081665,
"learning_rate": 5.485396045891404e-06,
"loss": 0.8017,
"step": 1754
},
{
"epoch": 0.5229245037059108,
"grad_norm": 0.41573989391326904,
"learning_rate": 5.480220753183353e-06,
"loss": 0.7292,
"step": 1755
},
{
"epoch": 0.5232224663860852,
"grad_norm": 0.42802026867866516,
"learning_rate": 5.475044941150361e-06,
"loss": 0.7523,
"step": 1756
},
{
"epoch": 0.5235204290662594,
"grad_norm": 0.4277799129486084,
"learning_rate": 5.469868615389703e-06,
"loss": 0.7641,
"step": 1757
},
{
"epoch": 0.5238183917464337,
"grad_norm": 0.4125003218650818,
"learning_rate": 5.4646917814992125e-06,
"loss": 0.781,
"step": 1758
},
{
"epoch": 0.5241163544266081,
"grad_norm": 0.4128820300102234,
"learning_rate": 5.459514445077272e-06,
"loss": 0.7664,
"step": 1759
},
{
"epoch": 0.5244143171067823,
"grad_norm": 0.4197518527507782,
"learning_rate": 5.454336611722807e-06,
"loss": 0.7541,
"step": 1760
},
{
"epoch": 0.5247122797869567,
"grad_norm": 0.4234422445297241,
"learning_rate": 5.449158287035274e-06,
"loss": 0.757,
"step": 1761
},
{
"epoch": 0.525010242467131,
"grad_norm": 0.4273323118686676,
"learning_rate": 5.443979476614674e-06,
"loss": 0.7315,
"step": 1762
},
{
"epoch": 0.5253082051473053,
"grad_norm": 0.43033096194267273,
"learning_rate": 5.4388001860615225e-06,
"loss": 0.7632,
"step": 1763
},
{
"epoch": 0.5256061678274796,
"grad_norm": 0.43124809861183167,
"learning_rate": 5.4336204209768584e-06,
"loss": 0.7752,
"step": 1764
},
{
"epoch": 0.5259041305076539,
"grad_norm": 0.40140900015830994,
"learning_rate": 5.4284401869622306e-06,
"loss": 0.7451,
"step": 1765
},
{
"epoch": 0.5262020931878282,
"grad_norm": 0.4155518114566803,
"learning_rate": 5.423259489619701e-06,
"loss": 0.7886,
"step": 1766
},
{
"epoch": 0.5265000558680025,
"grad_norm": 0.4166567027568817,
"learning_rate": 5.418078334551826e-06,
"loss": 0.7376,
"step": 1767
},
{
"epoch": 0.5267980185481769,
"grad_norm": 0.42524638772010803,
"learning_rate": 5.412896727361663e-06,
"loss": 0.7756,
"step": 1768
},
{
"epoch": 0.5270959812283511,
"grad_norm": 0.4090453088283539,
"learning_rate": 5.407714673652753e-06,
"loss": 0.7191,
"step": 1769
},
{
"epoch": 0.5273939439085255,
"grad_norm": 0.39983639121055603,
"learning_rate": 5.402532179029123e-06,
"loss": 0.7204,
"step": 1770
},
{
"epoch": 0.5276919065886998,
"grad_norm": 0.4308376610279083,
"learning_rate": 5.397349249095279e-06,
"loss": 0.7643,
"step": 1771
},
{
"epoch": 0.527989869268874,
"grad_norm": 0.41261640191078186,
"learning_rate": 5.392165889456189e-06,
"loss": 0.7758,
"step": 1772
},
{
"epoch": 0.5282878319490484,
"grad_norm": 0.4529683589935303,
"learning_rate": 5.386982105717298e-06,
"loss": 0.8002,
"step": 1773
},
{
"epoch": 0.5285857946292227,
"grad_norm": 0.41529229283332825,
"learning_rate": 5.381797903484498e-06,
"loss": 0.76,
"step": 1774
},
{
"epoch": 0.528883757309397,
"grad_norm": 0.4131626486778259,
"learning_rate": 5.376613288364142e-06,
"loss": 0.7323,
"step": 1775
},
{
"epoch": 0.5291817199895713,
"grad_norm": 0.40386584401130676,
"learning_rate": 5.371428265963024e-06,
"loss": 0.6704,
"step": 1776
},
{
"epoch": 0.5294796826697457,
"grad_norm": 0.42746710777282715,
"learning_rate": 5.366242841888384e-06,
"loss": 0.7703,
"step": 1777
},
{
"epoch": 0.5297776453499199,
"grad_norm": 0.4215255677700043,
"learning_rate": 5.3610570217478895e-06,
"loss": 0.7314,
"step": 1778
},
{
"epoch": 0.5300756080300942,
"grad_norm": 0.42057353258132935,
"learning_rate": 5.355870811149643e-06,
"loss": 0.7438,
"step": 1779
},
{
"epoch": 0.5303735707102686,
"grad_norm": 0.4245503544807434,
"learning_rate": 5.3506842157021635e-06,
"loss": 0.7708,
"step": 1780
},
{
"epoch": 0.5306715333904428,
"grad_norm": 0.4192019999027252,
"learning_rate": 5.34549724101439e-06,
"loss": 0.7375,
"step": 1781
},
{
"epoch": 0.5309694960706172,
"grad_norm": 0.39441004395484924,
"learning_rate": 5.340309892695672e-06,
"loss": 0.7373,
"step": 1782
},
{
"epoch": 0.5312674587507915,
"grad_norm": 0.4011537432670593,
"learning_rate": 5.335122176355759e-06,
"loss": 0.7192,
"step": 1783
},
{
"epoch": 0.5315654214309657,
"grad_norm": 0.4111559987068176,
"learning_rate": 5.3299340976048035e-06,
"loss": 0.7763,
"step": 1784
},
{
"epoch": 0.5318633841111401,
"grad_norm": 0.4052737355232239,
"learning_rate": 5.324745662053344e-06,
"loss": 0.7252,
"step": 1785
},
{
"epoch": 0.5321613467913144,
"grad_norm": 0.434222012758255,
"learning_rate": 5.319556875312313e-06,
"loss": 0.7802,
"step": 1786
},
{
"epoch": 0.5324593094714887,
"grad_norm": 0.43211445212364197,
"learning_rate": 5.314367742993014e-06,
"loss": 0.7522,
"step": 1787
},
{
"epoch": 0.532757272151663,
"grad_norm": 0.429675817489624,
"learning_rate": 5.30917827070713e-06,
"loss": 0.7566,
"step": 1788
},
{
"epoch": 0.5330552348318374,
"grad_norm": 0.42022308707237244,
"learning_rate": 5.3039884640667115e-06,
"loss": 0.7878,
"step": 1789
},
{
"epoch": 0.5333531975120116,
"grad_norm": 0.42533957958221436,
"learning_rate": 5.298798328684166e-06,
"loss": 0.7506,
"step": 1790
},
{
"epoch": 0.533651160192186,
"grad_norm": 0.4366130232810974,
"learning_rate": 5.2936078701722615e-06,
"loss": 0.7773,
"step": 1791
},
{
"epoch": 0.5339491228723602,
"grad_norm": 0.43641233444213867,
"learning_rate": 5.288417094144113e-06,
"loss": 0.791,
"step": 1792
},
{
"epoch": 0.5342470855525345,
"grad_norm": 0.40789636969566345,
"learning_rate": 5.28322600621318e-06,
"loss": 0.728,
"step": 1793
},
{
"epoch": 0.5345450482327089,
"grad_norm": 0.4139827489852905,
"learning_rate": 5.278034611993258e-06,
"loss": 0.7806,
"step": 1794
},
{
"epoch": 0.5348430109128831,
"grad_norm": 0.4079539179801941,
"learning_rate": 5.272842917098474e-06,
"loss": 0.7228,
"step": 1795
},
{
"epoch": 0.5351409735930575,
"grad_norm": 0.42347198724746704,
"learning_rate": 5.2676509271432815e-06,
"loss": 0.7461,
"step": 1796
},
{
"epoch": 0.5354389362732318,
"grad_norm": 0.4191540777683258,
"learning_rate": 5.262458647742454e-06,
"loss": 0.7454,
"step": 1797
},
{
"epoch": 0.535736898953406,
"grad_norm": 0.4120540916919708,
"learning_rate": 5.25726608451107e-06,
"loss": 0.7311,
"step": 1798
},
{
"epoch": 0.5360348616335804,
"grad_norm": 0.4422626495361328,
"learning_rate": 5.2520732430645275e-06,
"loss": 0.7563,
"step": 1799
},
{
"epoch": 0.5363328243137547,
"grad_norm": 0.4033385217189789,
"learning_rate": 5.246880129018515e-06,
"loss": 0.6934,
"step": 1800
},
{
"epoch": 0.536630786993929,
"grad_norm": 0.43625926971435547,
"learning_rate": 5.241686747989023e-06,
"loss": 0.7616,
"step": 1801
},
{
"epoch": 0.5369287496741033,
"grad_norm": 0.43050000071525574,
"learning_rate": 5.236493105592326e-06,
"loss": 0.8261,
"step": 1802
},
{
"epoch": 0.5372267123542777,
"grad_norm": 0.4144790768623352,
"learning_rate": 5.231299207444981e-06,
"loss": 0.7389,
"step": 1803
},
{
"epoch": 0.5375246750344519,
"grad_norm": 0.41443172097206116,
"learning_rate": 5.226105059163826e-06,
"loss": 0.763,
"step": 1804
},
{
"epoch": 0.5378226377146262,
"grad_norm": 0.4017443358898163,
"learning_rate": 5.220910666365966e-06,
"loss": 0.751,
"step": 1805
},
{
"epoch": 0.5381206003948006,
"grad_norm": 0.4197418987751007,
"learning_rate": 5.21571603466877e-06,
"loss": 0.7079,
"step": 1806
},
{
"epoch": 0.5384185630749748,
"grad_norm": 0.41983088850975037,
"learning_rate": 5.210521169689866e-06,
"loss": 0.732,
"step": 1807
},
{
"epoch": 0.5387165257551492,
"grad_norm": 0.4189915955066681,
"learning_rate": 5.205326077047138e-06,
"loss": 0.7227,
"step": 1808
},
{
"epoch": 0.5390144884353235,
"grad_norm": 0.4161134958267212,
"learning_rate": 5.200130762358711e-06,
"loss": 0.7473,
"step": 1809
},
{
"epoch": 0.5393124511154977,
"grad_norm": 0.41901764273643494,
"learning_rate": 5.1949352312429515e-06,
"loss": 0.7724,
"step": 1810
},
{
"epoch": 0.5396104137956721,
"grad_norm": 0.4030422270298004,
"learning_rate": 5.189739489318461e-06,
"loss": 0.7334,
"step": 1811
},
{
"epoch": 0.5399083764758464,
"grad_norm": 0.3990989327430725,
"learning_rate": 5.184543542204068e-06,
"loss": 0.7149,
"step": 1812
},
{
"epoch": 0.5402063391560207,
"grad_norm": 0.3978815972805023,
"learning_rate": 5.179347395518827e-06,
"loss": 0.7204,
"step": 1813
},
{
"epoch": 0.540504301836195,
"grad_norm": 0.4164794087409973,
"learning_rate": 5.174151054881999e-06,
"loss": 0.6979,
"step": 1814
},
{
"epoch": 0.5408022645163694,
"grad_norm": 0.41425642371177673,
"learning_rate": 5.168954525913068e-06,
"loss": 0.7426,
"step": 1815
},
{
"epoch": 0.5411002271965436,
"grad_norm": 0.40607571601867676,
"learning_rate": 5.163757814231708e-06,
"loss": 0.7328,
"step": 1816
},
{
"epoch": 0.541398189876718,
"grad_norm": 0.4419354498386383,
"learning_rate": 5.158560925457801e-06,
"loss": 0.7914,
"step": 1817
},
{
"epoch": 0.5416961525568923,
"grad_norm": 0.432871013879776,
"learning_rate": 5.153363865211411e-06,
"loss": 0.7788,
"step": 1818
},
{
"epoch": 0.5419941152370665,
"grad_norm": 0.4156447947025299,
"learning_rate": 5.148166639112799e-06,
"loss": 0.7013,
"step": 1819
},
{
"epoch": 0.5422920779172409,
"grad_norm": 0.4269291162490845,
"learning_rate": 5.142969252782397e-06,
"loss": 0.765,
"step": 1820
},
{
"epoch": 0.5425900405974152,
"grad_norm": 0.41579586267471313,
"learning_rate": 5.137771711840811e-06,
"loss": 0.7588,
"step": 1821
},
{
"epoch": 0.5428880032775895,
"grad_norm": 0.41332152485847473,
"learning_rate": 5.132574021908816e-06,
"loss": 0.7532,
"step": 1822
},
{
"epoch": 0.5431859659577638,
"grad_norm": 0.41672608256340027,
"learning_rate": 5.1273761886073496e-06,
"loss": 0.746,
"step": 1823
},
{
"epoch": 0.5434839286379382,
"grad_norm": 0.41137874126434326,
"learning_rate": 5.122178217557502e-06,
"loss": 0.7141,
"step": 1824
},
{
"epoch": 0.5437818913181124,
"grad_norm": 0.4233442544937134,
"learning_rate": 5.116980114380511e-06,
"loss": 0.7442,
"step": 1825
},
{
"epoch": 0.5440798539982867,
"grad_norm": 0.4179309904575348,
"learning_rate": 5.111781884697762e-06,
"loss": 0.7486,
"step": 1826
},
{
"epoch": 0.544377816678461,
"grad_norm": 0.4062498211860657,
"learning_rate": 5.106583534130773e-06,
"loss": 0.7475,
"step": 1827
},
{
"epoch": 0.5446757793586353,
"grad_norm": 0.42390862107276917,
"learning_rate": 5.101385068301194e-06,
"loss": 0.7607,
"step": 1828
},
{
"epoch": 0.5449737420388097,
"grad_norm": 0.40766555070877075,
"learning_rate": 5.0961864928308005e-06,
"loss": 0.7231,
"step": 1829
},
{
"epoch": 0.5452717047189839,
"grad_norm": 0.43810999393463135,
"learning_rate": 5.090987813341486e-06,
"loss": 0.7514,
"step": 1830
},
{
"epoch": 0.5455696673991582,
"grad_norm": 0.42822471261024475,
"learning_rate": 5.085789035455256e-06,
"loss": 0.7263,
"step": 1831
},
{
"epoch": 0.5458676300793326,
"grad_norm": 0.40355852246284485,
"learning_rate": 5.0805901647942226e-06,
"loss": 0.7258,
"step": 1832
},
{
"epoch": 0.5461655927595068,
"grad_norm": 0.4210696220397949,
"learning_rate": 5.0753912069806e-06,
"loss": 0.7308,
"step": 1833
},
{
"epoch": 0.5464635554396812,
"grad_norm": 0.41088956594467163,
"learning_rate": 5.070192167636693e-06,
"loss": 0.7468,
"step": 1834
},
{
"epoch": 0.5467615181198555,
"grad_norm": 0.4268404245376587,
"learning_rate": 5.064993052384899e-06,
"loss": 0.7003,
"step": 1835
},
{
"epoch": 0.5470594808000298,
"grad_norm": 0.4162386953830719,
"learning_rate": 5.059793866847692e-06,
"loss": 0.7529,
"step": 1836
},
{
"epoch": 0.5473574434802041,
"grad_norm": 0.4230744242668152,
"learning_rate": 5.054594616647628e-06,
"loss": 0.7394,
"step": 1837
},
{
"epoch": 0.5476554061603784,
"grad_norm": 0.4292016327381134,
"learning_rate": 5.049395307407329e-06,
"loss": 0.7795,
"step": 1838
},
{
"epoch": 0.5479533688405527,
"grad_norm": 0.4010887145996094,
"learning_rate": 5.044195944749482e-06,
"loss": 0.7457,
"step": 1839
},
{
"epoch": 0.548251331520727,
"grad_norm": 0.40281057357788086,
"learning_rate": 5.0389965342968316e-06,
"loss": 0.7384,
"step": 1840
},
{
"epoch": 0.5485492942009014,
"grad_norm": 0.42106863856315613,
"learning_rate": 5.033797081672176e-06,
"loss": 0.7607,
"step": 1841
},
{
"epoch": 0.5488472568810756,
"grad_norm": 0.4178571403026581,
"learning_rate": 5.0285975924983546e-06,
"loss": 0.746,
"step": 1842
},
{
"epoch": 0.54914521956125,
"grad_norm": 0.4276307225227356,
"learning_rate": 5.023398072398249e-06,
"loss": 0.7679,
"step": 1843
},
{
"epoch": 0.5494431822414243,
"grad_norm": 0.4141428470611572,
"learning_rate": 5.0181985269947754e-06,
"loss": 0.7667,
"step": 1844
},
{
"epoch": 0.5497411449215985,
"grad_norm": 0.41870513558387756,
"learning_rate": 5.012998961910876e-06,
"loss": 0.7816,
"step": 1845
},
{
"epoch": 0.5500391076017729,
"grad_norm": 0.41629573702812195,
"learning_rate": 5.007799382769516e-06,
"loss": 0.7634,
"step": 1846
},
{
"epoch": 0.5503370702819472,
"grad_norm": 0.42141273617744446,
"learning_rate": 5.002599795193671e-06,
"loss": 0.7559,
"step": 1847
},
{
"epoch": 0.5506350329621215,
"grad_norm": 0.4244276285171509,
"learning_rate": 4.9974002048063314e-06,
"loss": 0.762,
"step": 1848
},
{
"epoch": 0.5509329956422958,
"grad_norm": 0.40807273983955383,
"learning_rate": 4.9922006172304855e-06,
"loss": 0.7242,
"step": 1849
},
{
"epoch": 0.5512309583224702,
"grad_norm": 0.41207119822502136,
"learning_rate": 4.987001038089124e-06,
"loss": 0.7411,
"step": 1850
},
{
"epoch": 0.5515289210026444,
"grad_norm": 0.42448246479034424,
"learning_rate": 4.981801473005226e-06,
"loss": 0.7246,
"step": 1851
},
{
"epoch": 0.5518268836828187,
"grad_norm": 0.4166666269302368,
"learning_rate": 4.976601927601752e-06,
"loss": 0.706,
"step": 1852
},
{
"epoch": 0.5521248463629931,
"grad_norm": 0.42458873987197876,
"learning_rate": 4.971402407501649e-06,
"loss": 0.756,
"step": 1853
},
{
"epoch": 0.5524228090431673,
"grad_norm": 0.40440189838409424,
"learning_rate": 4.966202918327826e-06,
"loss": 0.7532,
"step": 1854
},
{
"epoch": 0.5527207717233417,
"grad_norm": 0.4150710701942444,
"learning_rate": 4.961003465703168e-06,
"loss": 0.74,
"step": 1855
},
{
"epoch": 0.553018734403516,
"grad_norm": 0.42995816469192505,
"learning_rate": 4.955804055250519e-06,
"loss": 0.7332,
"step": 1856
},
{
"epoch": 0.5533166970836902,
"grad_norm": 0.41072604060173035,
"learning_rate": 4.9506046925926725e-06,
"loss": 0.7239,
"step": 1857
},
{
"epoch": 0.5536146597638646,
"grad_norm": 0.40858787298202515,
"learning_rate": 4.945405383352372e-06,
"loss": 0.7475,
"step": 1858
},
{
"epoch": 0.5539126224440389,
"grad_norm": 0.39616289734840393,
"learning_rate": 4.94020613315231e-06,
"loss": 0.7375,
"step": 1859
},
{
"epoch": 0.5542105851242132,
"grad_norm": 0.4078272879123688,
"learning_rate": 4.935006947615103e-06,
"loss": 0.7545,
"step": 1860
},
{
"epoch": 0.5545085478043875,
"grad_norm": 0.4218798279762268,
"learning_rate": 4.929807832363308e-06,
"loss": 0.7587,
"step": 1861
},
{
"epoch": 0.5548065104845618,
"grad_norm": 0.4236418902873993,
"learning_rate": 4.9246087930194016e-06,
"loss": 0.7854,
"step": 1862
},
{
"epoch": 0.5551044731647361,
"grad_norm": 0.42399102449417114,
"learning_rate": 4.919409835205778e-06,
"loss": 0.709,
"step": 1863
},
{
"epoch": 0.5554024358449104,
"grad_norm": 0.43494337797164917,
"learning_rate": 4.914210964544747e-06,
"loss": 0.7599,
"step": 1864
},
{
"epoch": 0.5557003985250847,
"grad_norm": 0.431159645318985,
"learning_rate": 4.9090121866585155e-06,
"loss": 0.7465,
"step": 1865
},
{
"epoch": 0.555998361205259,
"grad_norm": 0.4185139536857605,
"learning_rate": 4.9038135071692e-06,
"loss": 0.7772,
"step": 1866
},
{
"epoch": 0.5562963238854334,
"grad_norm": 0.41682150959968567,
"learning_rate": 4.898614931698808e-06,
"loss": 0.727,
"step": 1867
},
{
"epoch": 0.5565942865656076,
"grad_norm": 0.40183204412460327,
"learning_rate": 4.893416465869229e-06,
"loss": 0.7562,
"step": 1868
},
{
"epoch": 0.556892249245782,
"grad_norm": 0.41225665807724,
"learning_rate": 4.888218115302238e-06,
"loss": 0.7261,
"step": 1869
},
{
"epoch": 0.5571902119259563,
"grad_norm": 0.4158567488193512,
"learning_rate": 4.883019885619491e-06,
"loss": 0.7621,
"step": 1870
},
{
"epoch": 0.5574881746061305,
"grad_norm": 0.4105948805809021,
"learning_rate": 4.8778217824425e-06,
"loss": 0.7156,
"step": 1871
},
{
"epoch": 0.5577861372863049,
"grad_norm": 0.4392630159854889,
"learning_rate": 4.872623811392652e-06,
"loss": 0.7595,
"step": 1872
},
{
"epoch": 0.5580840999664792,
"grad_norm": 0.4194263517856598,
"learning_rate": 4.867425978091185e-06,
"loss": 0.7488,
"step": 1873
},
{
"epoch": 0.5583820626466535,
"grad_norm": 0.4097282588481903,
"learning_rate": 4.862228288159191e-06,
"loss": 0.7466,
"step": 1874
},
{
"epoch": 0.5586800253268278,
"grad_norm": 0.42875537276268005,
"learning_rate": 4.857030747217606e-06,
"loss": 0.8227,
"step": 1875
},
{
"epoch": 0.5589779880070022,
"grad_norm": 0.4271450638771057,
"learning_rate": 4.8518333608872015e-06,
"loss": 0.7602,
"step": 1876
},
{
"epoch": 0.5592759506871764,
"grad_norm": 0.4113244116306305,
"learning_rate": 4.846636134788589e-06,
"loss": 0.7145,
"step": 1877
},
{
"epoch": 0.5595739133673507,
"grad_norm": 0.40590837597846985,
"learning_rate": 4.841439074542202e-06,
"loss": 0.712,
"step": 1878
},
{
"epoch": 0.5598718760475251,
"grad_norm": 0.39654046297073364,
"learning_rate": 4.836242185768293e-06,
"loss": 0.716,
"step": 1879
},
{
"epoch": 0.5601698387276993,
"grad_norm": 0.41102418303489685,
"learning_rate": 4.831045474086932e-06,
"loss": 0.7426,
"step": 1880
},
{
"epoch": 0.5604678014078737,
"grad_norm": 0.4282558560371399,
"learning_rate": 4.8258489451180014e-06,
"loss": 0.7371,
"step": 1881
},
{
"epoch": 0.560765764088048,
"grad_norm": 0.3982420563697815,
"learning_rate": 4.820652604481175e-06,
"loss": 0.71,
"step": 1882
},
{
"epoch": 0.5610637267682222,
"grad_norm": 0.4026240408420563,
"learning_rate": 4.815456457795933e-06,
"loss": 0.7056,
"step": 1883
},
{
"epoch": 0.5613616894483966,
"grad_norm": 0.4201667010784149,
"learning_rate": 4.810260510681541e-06,
"loss": 0.7408,
"step": 1884
},
{
"epoch": 0.5616596521285709,
"grad_norm": 0.42020440101623535,
"learning_rate": 4.805064768757051e-06,
"loss": 0.7764,
"step": 1885
},
{
"epoch": 0.5619576148087452,
"grad_norm": 0.4300926625728607,
"learning_rate": 4.799869237641292e-06,
"loss": 0.7987,
"step": 1886
},
{
"epoch": 0.5622555774889195,
"grad_norm": 0.41482165455818176,
"learning_rate": 4.794673922952863e-06,
"loss": 0.7313,
"step": 1887
},
{
"epoch": 0.5625535401690939,
"grad_norm": 0.40758150815963745,
"learning_rate": 4.789478830310134e-06,
"loss": 0.7154,
"step": 1888
},
{
"epoch": 0.5628515028492681,
"grad_norm": 0.40623360872268677,
"learning_rate": 4.784283965331232e-06,
"loss": 0.7059,
"step": 1889
},
{
"epoch": 0.5631494655294425,
"grad_norm": 0.43937063217163086,
"learning_rate": 4.779089333634036e-06,
"loss": 0.7701,
"step": 1890
},
{
"epoch": 0.5634474282096168,
"grad_norm": 0.41792380809783936,
"learning_rate": 4.773894940836174e-06,
"loss": 0.7407,
"step": 1891
},
{
"epoch": 0.563745390889791,
"grad_norm": 0.4050884544849396,
"learning_rate": 4.76870079255502e-06,
"loss": 0.7353,
"step": 1892
},
{
"epoch": 0.5640433535699654,
"grad_norm": 0.4137275815010071,
"learning_rate": 4.763506894407675e-06,
"loss": 0.7236,
"step": 1893
},
{
"epoch": 0.5643413162501397,
"grad_norm": 0.4158051013946533,
"learning_rate": 4.7583132520109784e-06,
"loss": 0.7762,
"step": 1894
},
{
"epoch": 0.564639278930314,
"grad_norm": 0.4179161787033081,
"learning_rate": 4.753119870981486e-06,
"loss": 0.707,
"step": 1895
},
{
"epoch": 0.5649372416104883,
"grad_norm": 0.40873193740844727,
"learning_rate": 4.747926756935474e-06,
"loss": 0.721,
"step": 1896
},
{
"epoch": 0.5652352042906625,
"grad_norm": 0.4177382290363312,
"learning_rate": 4.742733915488932e-06,
"loss": 0.7517,
"step": 1897
},
{
"epoch": 0.5655331669708369,
"grad_norm": 0.4085400700569153,
"learning_rate": 4.737541352257549e-06,
"loss": 0.6908,
"step": 1898
},
{
"epoch": 0.5658311296510112,
"grad_norm": 0.4144119620323181,
"learning_rate": 4.732349072856719e-06,
"loss": 0.7344,
"step": 1899
},
{
"epoch": 0.5661290923311855,
"grad_norm": 0.4040220379829407,
"learning_rate": 4.727157082901527e-06,
"loss": 0.7104,
"step": 1900
},
{
"epoch": 0.5664270550113598,
"grad_norm": 0.41264522075653076,
"learning_rate": 4.721965388006743e-06,
"loss": 0.7355,
"step": 1901
},
{
"epoch": 0.5667250176915342,
"grad_norm": 0.45604878664016724,
"learning_rate": 4.716773993786822e-06,
"loss": 0.7708,
"step": 1902
},
{
"epoch": 0.5670229803717084,
"grad_norm": 0.4117855131626129,
"learning_rate": 4.711582905855889e-06,
"loss": 0.7366,
"step": 1903
},
{
"epoch": 0.5673209430518827,
"grad_norm": 0.41091251373291016,
"learning_rate": 4.706392129827739e-06,
"loss": 0.7461,
"step": 1904
},
{
"epoch": 0.5676189057320571,
"grad_norm": 0.4039057195186615,
"learning_rate": 4.7012016713158355e-06,
"loss": 0.6907,
"step": 1905
},
{
"epoch": 0.5679168684122313,
"grad_norm": 0.4205576777458191,
"learning_rate": 4.69601153593329e-06,
"loss": 0.7653,
"step": 1906
},
{
"epoch": 0.5682148310924057,
"grad_norm": 0.44450855255126953,
"learning_rate": 4.6908217292928705e-06,
"loss": 0.8305,
"step": 1907
},
{
"epoch": 0.56851279377258,
"grad_norm": 0.4235401451587677,
"learning_rate": 4.685632257006988e-06,
"loss": 0.7603,
"step": 1908
},
{
"epoch": 0.5688107564527543,
"grad_norm": 0.4110579192638397,
"learning_rate": 4.680443124687688e-06,
"loss": 0.707,
"step": 1909
},
{
"epoch": 0.5691087191329286,
"grad_norm": 0.4077126085758209,
"learning_rate": 4.675254337946656e-06,
"loss": 0.7189,
"step": 1910
},
{
"epoch": 0.5694066818131029,
"grad_norm": 0.40424802899360657,
"learning_rate": 4.670065902395199e-06,
"loss": 0.7206,
"step": 1911
},
{
"epoch": 0.5697046444932772,
"grad_norm": 0.42563310265541077,
"learning_rate": 4.664877823644242e-06,
"loss": 0.7752,
"step": 1912
},
{
"epoch": 0.5700026071734515,
"grad_norm": 0.4315684735774994,
"learning_rate": 4.659690107304331e-06,
"loss": 0.7838,
"step": 1913
},
{
"epoch": 0.5703005698536259,
"grad_norm": 0.41235530376434326,
"learning_rate": 4.654502758985611e-06,
"loss": 0.7051,
"step": 1914
},
{
"epoch": 0.5705985325338001,
"grad_norm": 0.4013887345790863,
"learning_rate": 4.649315784297837e-06,
"loss": 0.7212,
"step": 1915
},
{
"epoch": 0.5708964952139745,
"grad_norm": 0.43051424622535706,
"learning_rate": 4.644129188850359e-06,
"loss": 0.7512,
"step": 1916
},
{
"epoch": 0.5711944578941488,
"grad_norm": 0.4274933636188507,
"learning_rate": 4.638942978252111e-06,
"loss": 0.7321,
"step": 1917
},
{
"epoch": 0.571492420574323,
"grad_norm": 0.41361722350120544,
"learning_rate": 4.633757158111617e-06,
"loss": 0.7293,
"step": 1918
},
{
"epoch": 0.5717903832544974,
"grad_norm": 0.40967097878456116,
"learning_rate": 4.6285717340369774e-06,
"loss": 0.7139,
"step": 1919
},
{
"epoch": 0.5720883459346717,
"grad_norm": 0.441385954618454,
"learning_rate": 4.6233867116358586e-06,
"loss": 0.7658,
"step": 1920
},
{
"epoch": 0.572386308614846,
"grad_norm": 0.4277268946170807,
"learning_rate": 4.618202096515505e-06,
"loss": 0.7178,
"step": 1921
},
{
"epoch": 0.5726842712950203,
"grad_norm": 0.41798168420791626,
"learning_rate": 4.6130178942827045e-06,
"loss": 0.7251,
"step": 1922
},
{
"epoch": 0.5729822339751947,
"grad_norm": 0.43137815594673157,
"learning_rate": 4.607834110543812e-06,
"loss": 0.7746,
"step": 1923
},
{
"epoch": 0.5732801966553689,
"grad_norm": 0.40609511733055115,
"learning_rate": 4.602650750904724e-06,
"loss": 0.7358,
"step": 1924
},
{
"epoch": 0.5735781593355432,
"grad_norm": 0.41860562562942505,
"learning_rate": 4.597467820970879e-06,
"loss": 0.7486,
"step": 1925
},
{
"epoch": 0.5738761220157176,
"grad_norm": 0.41176602244377136,
"learning_rate": 4.5922853263472475e-06,
"loss": 0.7587,
"step": 1926
},
{
"epoch": 0.5741740846958918,
"grad_norm": 0.4014931619167328,
"learning_rate": 4.587103272638339e-06,
"loss": 0.7296,
"step": 1927
},
{
"epoch": 0.5744720473760662,
"grad_norm": 0.4227864444255829,
"learning_rate": 4.5819216654481756e-06,
"loss": 0.7648,
"step": 1928
},
{
"epoch": 0.5747700100562405,
"grad_norm": 0.39504462480545044,
"learning_rate": 4.576740510380301e-06,
"loss": 0.6974,
"step": 1929
},
{
"epoch": 0.5750679727364147,
"grad_norm": 0.4270729720592499,
"learning_rate": 4.571559813037771e-06,
"loss": 0.7364,
"step": 1930
},
{
"epoch": 0.5753659354165891,
"grad_norm": 0.42046594619750977,
"learning_rate": 4.566379579023143e-06,
"loss": 0.7719,
"step": 1931
},
{
"epoch": 0.5756638980967633,
"grad_norm": 0.4060092270374298,
"learning_rate": 4.56119981393848e-06,
"loss": 0.7261,
"step": 1932
},
{
"epoch": 0.5759618607769377,
"grad_norm": 0.41615304350852966,
"learning_rate": 4.556020523385326e-06,
"loss": 0.7051,
"step": 1933
},
{
"epoch": 0.576259823457112,
"grad_norm": 0.41677922010421753,
"learning_rate": 4.550841712964725e-06,
"loss": 0.7293,
"step": 1934
},
{
"epoch": 0.5765577861372863,
"grad_norm": 0.4129596948623657,
"learning_rate": 4.545663388277196e-06,
"loss": 0.7013,
"step": 1935
},
{
"epoch": 0.5768557488174606,
"grad_norm": 0.4105396866798401,
"learning_rate": 4.540485554922729e-06,
"loss": 0.7264,
"step": 1936
},
{
"epoch": 0.577153711497635,
"grad_norm": 0.39716649055480957,
"learning_rate": 4.535308218500787e-06,
"loss": 0.7288,
"step": 1937
},
{
"epoch": 0.5774516741778092,
"grad_norm": 0.41149744391441345,
"learning_rate": 4.530131384610299e-06,
"loss": 0.7837,
"step": 1938
},
{
"epoch": 0.5777496368579835,
"grad_norm": 0.4145060181617737,
"learning_rate": 4.524955058849641e-06,
"loss": 0.7403,
"step": 1939
},
{
"epoch": 0.5780475995381579,
"grad_norm": 0.41767367720603943,
"learning_rate": 4.51977924681665e-06,
"loss": 0.7836,
"step": 1940
},
{
"epoch": 0.5783455622183321,
"grad_norm": 0.40490424633026123,
"learning_rate": 4.514603954108597e-06,
"loss": 0.7535,
"step": 1941
},
{
"epoch": 0.5786435248985065,
"grad_norm": 0.41554492712020874,
"learning_rate": 4.5094291863222e-06,
"loss": 0.7461,
"step": 1942
},
{
"epoch": 0.5789414875786808,
"grad_norm": 0.42319101095199585,
"learning_rate": 4.504254949053608e-06,
"loss": 0.7393,
"step": 1943
},
{
"epoch": 0.579239450258855,
"grad_norm": 0.4117283225059509,
"learning_rate": 4.4990812478983895e-06,
"loss": 0.7383,
"step": 1944
},
{
"epoch": 0.5795374129390294,
"grad_norm": 0.41115203499794006,
"learning_rate": 4.493908088451541e-06,
"loss": 0.7583,
"step": 1945
},
{
"epoch": 0.5798353756192037,
"grad_norm": 0.41332003474235535,
"learning_rate": 4.488735476307472e-06,
"loss": 0.7868,
"step": 1946
},
{
"epoch": 0.580133338299378,
"grad_norm": 0.42866283655166626,
"learning_rate": 4.483563417059995e-06,
"loss": 0.7707,
"step": 1947
},
{
"epoch": 0.5804313009795523,
"grad_norm": 0.4217878580093384,
"learning_rate": 4.478391916302327e-06,
"loss": 0.751,
"step": 1948
},
{
"epoch": 0.5807292636597267,
"grad_norm": 0.4109710454940796,
"learning_rate": 4.473220979627088e-06,
"loss": 0.7387,
"step": 1949
},
{
"epoch": 0.5810272263399009,
"grad_norm": 0.41321882605552673,
"learning_rate": 4.468050612626277e-06,
"loss": 0.7451,
"step": 1950
},
{
"epoch": 0.5813251890200752,
"grad_norm": 0.4039456248283386,
"learning_rate": 4.462880820891284e-06,
"loss": 0.7271,
"step": 1951
},
{
"epoch": 0.5816231517002496,
"grad_norm": 0.41406941413879395,
"learning_rate": 4.457711610012873e-06,
"loss": 0.755,
"step": 1952
},
{
"epoch": 0.5819211143804238,
"grad_norm": 0.42365142703056335,
"learning_rate": 4.452542985581184e-06,
"loss": 0.7512,
"step": 1953
},
{
"epoch": 0.5822190770605982,
"grad_norm": 0.43334272503852844,
"learning_rate": 4.44737495318572e-06,
"loss": 0.7667,
"step": 1954
},
{
"epoch": 0.5825170397407725,
"grad_norm": 0.41110551357269287,
"learning_rate": 4.442207518415341e-06,
"loss": 0.7271,
"step": 1955
},
{
"epoch": 0.5828150024209467,
"grad_norm": 0.4264329969882965,
"learning_rate": 4.4370406868582684e-06,
"loss": 0.7621,
"step": 1956
},
{
"epoch": 0.5831129651011211,
"grad_norm": 0.4122719168663025,
"learning_rate": 4.431874464102065e-06,
"loss": 0.7343,
"step": 1957
},
{
"epoch": 0.5834109277812954,
"grad_norm": 0.4079119861125946,
"learning_rate": 4.426708855733637e-06,
"loss": 0.7283,
"step": 1958
},
{
"epoch": 0.5837088904614697,
"grad_norm": 0.422280490398407,
"learning_rate": 4.421543867339227e-06,
"loss": 0.7529,
"step": 1959
},
{
"epoch": 0.584006853141644,
"grad_norm": 0.4280681312084198,
"learning_rate": 4.4163795045044055e-06,
"loss": 0.7469,
"step": 1960
},
{
"epoch": 0.5843048158218184,
"grad_norm": 0.4085499048233032,
"learning_rate": 4.411215772814066e-06,
"loss": 0.7161,
"step": 1961
},
{
"epoch": 0.5846027785019926,
"grad_norm": 0.43589451909065247,
"learning_rate": 4.4060526778524245e-06,
"loss": 0.779,
"step": 1962
},
{
"epoch": 0.584900741182167,
"grad_norm": 0.42634084820747375,
"learning_rate": 4.400890225203001e-06,
"loss": 0.7485,
"step": 1963
},
{
"epoch": 0.5851987038623413,
"grad_norm": 0.41059428453445435,
"learning_rate": 4.395728420448627e-06,
"loss": 0.755,
"step": 1964
},
{
"epoch": 0.5854966665425155,
"grad_norm": 0.40567854046821594,
"learning_rate": 4.3905672691714315e-06,
"loss": 0.7407,
"step": 1965
},
{
"epoch": 0.5857946292226899,
"grad_norm": 0.4206496477127075,
"learning_rate": 4.385406776952833e-06,
"loss": 0.7505,
"step": 1966
},
{
"epoch": 0.5860925919028641,
"grad_norm": 0.4188379943370819,
"learning_rate": 4.380246949373543e-06,
"loss": 0.7567,
"step": 1967
},
{
"epoch": 0.5863905545830385,
"grad_norm": 0.4289083182811737,
"learning_rate": 4.375087792013553e-06,
"loss": 0.7583,
"step": 1968
},
{
"epoch": 0.5866885172632128,
"grad_norm": 0.41483402252197266,
"learning_rate": 4.369929310452126e-06,
"loss": 0.766,
"step": 1969
},
{
"epoch": 0.586986479943387,
"grad_norm": 0.4211726486682892,
"learning_rate": 4.364771510267798e-06,
"loss": 0.7582,
"step": 1970
},
{
"epoch": 0.5872844426235614,
"grad_norm": 0.4256436824798584,
"learning_rate": 4.3596143970383665e-06,
"loss": 0.7636,
"step": 1971
},
{
"epoch": 0.5875824053037357,
"grad_norm": 0.4167037606239319,
"learning_rate": 4.3544579763408855e-06,
"loss": 0.7531,
"step": 1972
},
{
"epoch": 0.58788036798391,
"grad_norm": 0.40595850348472595,
"learning_rate": 4.3493022537516634e-06,
"loss": 0.6923,
"step": 1973
},
{
"epoch": 0.5881783306640843,
"grad_norm": 0.4079906642436981,
"learning_rate": 4.344147234846249e-06,
"loss": 0.7281,
"step": 1974
},
{
"epoch": 0.5884762933442587,
"grad_norm": 0.41655340790748596,
"learning_rate": 4.338992925199433e-06,
"loss": 0.7389,
"step": 1975
},
{
"epoch": 0.5887742560244329,
"grad_norm": 0.42848271131515503,
"learning_rate": 4.333839330385241e-06,
"loss": 0.7105,
"step": 1976
},
{
"epoch": 0.5890722187046072,
"grad_norm": 0.4202434718608856,
"learning_rate": 4.328686455976917e-06,
"loss": 0.7297,
"step": 1977
},
{
"epoch": 0.5893701813847816,
"grad_norm": 0.40047043561935425,
"learning_rate": 4.323534307546938e-06,
"loss": 0.7358,
"step": 1978
},
{
"epoch": 0.5896681440649558,
"grad_norm": 0.4223696291446686,
"learning_rate": 4.318382890666988e-06,
"loss": 0.7616,
"step": 1979
},
{
"epoch": 0.5899661067451302,
"grad_norm": 0.414849191904068,
"learning_rate": 4.313232210907959e-06,
"loss": 0.7623,
"step": 1980
},
{
"epoch": 0.5902640694253045,
"grad_norm": 0.4242858290672302,
"learning_rate": 4.308082273839953e-06,
"loss": 0.7302,
"step": 1981
},
{
"epoch": 0.5905620321054788,
"grad_norm": 0.41924822330474854,
"learning_rate": 4.302933085032262e-06,
"loss": 0.7421,
"step": 1982
},
{
"epoch": 0.5908599947856531,
"grad_norm": 0.42674970626831055,
"learning_rate": 4.29778465005337e-06,
"loss": 0.7565,
"step": 1983
},
{
"epoch": 0.5911579574658274,
"grad_norm": 0.4174593687057495,
"learning_rate": 4.29263697447095e-06,
"loss": 0.7441,
"step": 1984
},
{
"epoch": 0.5914559201460017,
"grad_norm": 0.40316957235336304,
"learning_rate": 4.287490063851848e-06,
"loss": 0.6995,
"step": 1985
},
{
"epoch": 0.591753882826176,
"grad_norm": 0.40519094467163086,
"learning_rate": 4.282343923762088e-06,
"loss": 0.7523,
"step": 1986
},
{
"epoch": 0.5920518455063504,
"grad_norm": 0.4047006368637085,
"learning_rate": 4.277198559766858e-06,
"loss": 0.7322,
"step": 1987
},
{
"epoch": 0.5923498081865246,
"grad_norm": 0.4245583713054657,
"learning_rate": 4.272053977430503e-06,
"loss": 0.7703,
"step": 1988
},
{
"epoch": 0.592647770866699,
"grad_norm": 0.4122294485569,
"learning_rate": 4.266910182316533e-06,
"loss": 0.7066,
"step": 1989
},
{
"epoch": 0.5929457335468733,
"grad_norm": 0.4069257080554962,
"learning_rate": 4.261767179987595e-06,
"loss": 0.7085,
"step": 1990
},
{
"epoch": 0.5932436962270475,
"grad_norm": 0.41777312755584717,
"learning_rate": 4.256624976005485e-06,
"loss": 0.7569,
"step": 1991
},
{
"epoch": 0.5935416589072219,
"grad_norm": 0.41323649883270264,
"learning_rate": 4.251483575931139e-06,
"loss": 0.7342,
"step": 1992
},
{
"epoch": 0.5938396215873962,
"grad_norm": 0.42368587851524353,
"learning_rate": 4.246342985324614e-06,
"loss": 0.7261,
"step": 1993
},
{
"epoch": 0.5941375842675705,
"grad_norm": 0.42891785502433777,
"learning_rate": 4.241203209745098e-06,
"loss": 0.7437,
"step": 1994
},
{
"epoch": 0.5944355469477448,
"grad_norm": 0.4087558388710022,
"learning_rate": 4.236064254750899e-06,
"loss": 0.7473,
"step": 1995
},
{
"epoch": 0.5947335096279192,
"grad_norm": 0.41087907552719116,
"learning_rate": 4.230926125899432e-06,
"loss": 0.7118,
"step": 1996
},
{
"epoch": 0.5950314723080934,
"grad_norm": 0.4258415400981903,
"learning_rate": 4.225788828747224e-06,
"loss": 0.7338,
"step": 1997
},
{
"epoch": 0.5953294349882677,
"grad_norm": 0.42956599593162537,
"learning_rate": 4.2206523688499e-06,
"loss": 0.7546,
"step": 1998
},
{
"epoch": 0.5956273976684421,
"grad_norm": 0.4405895173549652,
"learning_rate": 4.215516751762177e-06,
"loss": 0.8171,
"step": 1999
},
{
"epoch": 0.5959253603486163,
"grad_norm": 0.4291638433933258,
"learning_rate": 4.210381983037869e-06,
"loss": 0.7791,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 3357,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1083765476491264e+19,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}