{
"best_global_step": 1000,
"best_metric": 2.9839203357696533,
"best_model_checkpoint": "outputs/checkpoint-1000",
"epoch": 0.6430868167202572,
"eval_steps": 1000,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006430868167202572,
"grad_norm": 1.678249716758728,
"learning_rate": 0.0,
"loss": 10.412,
"step": 1
},
{
"epoch": 0.0012861736334405145,
"grad_norm": 1.7936455011367798,
"learning_rate": 4e-05,
"loss": 10.4163,
"step": 2
},
{
"epoch": 0.0019292604501607716,
"grad_norm": 1.7285282611846924,
"learning_rate": 8e-05,
"loss": 10.3998,
"step": 3
},
{
"epoch": 0.002572347266881029,
"grad_norm": 1.6896724700927734,
"learning_rate": 0.00012,
"loss": 10.3641,
"step": 4
},
{
"epoch": 0.003215434083601286,
"grad_norm": 1.5910422801971436,
"learning_rate": 0.00016,
"loss": 10.3105,
"step": 5
},
{
"epoch": 0.0038585209003215433,
"grad_norm": 1.5040550231933594,
"learning_rate": 0.0002,
"loss": 10.2539,
"step": 6
},
{
"epoch": 0.0045016077170418,
"grad_norm": 1.4353411197662354,
"learning_rate": 0.00019998713412672887,
"loss": 10.1944,
"step": 7
},
{
"epoch": 0.005144694533762058,
"grad_norm": 1.4172208309173584,
"learning_rate": 0.0001999742682534577,
"loss": 10.1482,
"step": 8
},
{
"epoch": 0.005787781350482315,
"grad_norm": 1.4051563739776611,
"learning_rate": 0.00019996140238018656,
"loss": 10.0827,
"step": 9
},
{
"epoch": 0.006430868167202572,
"grad_norm": 1.3599131107330322,
"learning_rate": 0.0001999485365069154,
"loss": 10.0401,
"step": 10
},
{
"epoch": 0.00707395498392283,
"grad_norm": 1.3481664657592773,
"learning_rate": 0.00019993567063364426,
"loss": 9.9904,
"step": 11
},
{
"epoch": 0.0077170418006430866,
"grad_norm": 1.328471302986145,
"learning_rate": 0.00019992280476037312,
"loss": 9.9554,
"step": 12
},
{
"epoch": 0.008360128617363344,
"grad_norm": 1.3357861042022705,
"learning_rate": 0.00019990993888710198,
"loss": 9.9119,
"step": 13
},
{
"epoch": 0.0090032154340836,
"grad_norm": 1.3247236013412476,
"learning_rate": 0.00019989707301383084,
"loss": 9.8818,
"step": 14
},
{
"epoch": 0.00964630225080386,
"grad_norm": 1.3218022584915161,
"learning_rate": 0.00019988420714055967,
"loss": 9.842,
"step": 15
},
{
"epoch": 0.010289389067524116,
"grad_norm": 1.3450143337249756,
"learning_rate": 0.00019987134126728853,
"loss": 9.7933,
"step": 16
},
{
"epoch": 0.010932475884244373,
"grad_norm": 1.352043628692627,
"learning_rate": 0.0001998584753940174,
"loss": 9.7582,
"step": 17
},
{
"epoch": 0.01157556270096463,
"grad_norm": 1.3545559644699097,
"learning_rate": 0.00019984560952074622,
"loss": 9.713,
"step": 18
},
{
"epoch": 0.012218649517684888,
"grad_norm": 1.3423742055892944,
"learning_rate": 0.00019983274364747508,
"loss": 9.6714,
"step": 19
},
{
"epoch": 0.012861736334405145,
"grad_norm": 1.295168399810791,
"learning_rate": 0.00019981987777420391,
"loss": 9.6478,
"step": 20
},
{
"epoch": 0.013504823151125401,
"grad_norm": 1.3305379152297974,
"learning_rate": 0.00019980701190093277,
"loss": 9.603,
"step": 21
},
{
"epoch": 0.01414790996784566,
"grad_norm": 1.33871591091156,
"learning_rate": 0.00019979414602766163,
"loss": 9.5493,
"step": 22
},
{
"epoch": 0.014790996784565916,
"grad_norm": 1.335689902305603,
"learning_rate": 0.0001997812801543905,
"loss": 9.517,
"step": 23
},
{
"epoch": 0.015434083601286173,
"grad_norm": 1.3390552997589111,
"learning_rate": 0.00019976841428111935,
"loss": 9.4742,
"step": 24
},
{
"epoch": 0.01607717041800643,
"grad_norm": 1.337459683418274,
"learning_rate": 0.00019975554840784821,
"loss": 9.4299,
"step": 25
},
{
"epoch": 0.016720257234726688,
"grad_norm": 1.3750091791152954,
"learning_rate": 0.00019974268253457705,
"loss": 9.3735,
"step": 26
},
{
"epoch": 0.017363344051446947,
"grad_norm": 1.34926438331604,
"learning_rate": 0.0001997298166613059,
"loss": 9.3448,
"step": 27
},
{
"epoch": 0.0180064308681672,
"grad_norm": 1.3303914070129395,
"learning_rate": 0.00019971695078803474,
"loss": 9.3069,
"step": 28
},
{
"epoch": 0.01864951768488746,
"grad_norm": 1.3551392555236816,
"learning_rate": 0.0001997040849147636,
"loss": 9.2533,
"step": 29
},
{
"epoch": 0.01929260450160772,
"grad_norm": 1.3162682056427002,
"learning_rate": 0.00019969121904149246,
"loss": 9.2248,
"step": 30
},
{
"epoch": 0.019935691318327974,
"grad_norm": 1.3329894542694092,
"learning_rate": 0.0001996783531682213,
"loss": 9.1831,
"step": 31
},
{
"epoch": 0.020578778135048232,
"grad_norm": 1.373375654220581,
"learning_rate": 0.00019966548729495015,
"loss": 9.1197,
"step": 32
},
{
"epoch": 0.02122186495176849,
"grad_norm": 1.3253285884857178,
"learning_rate": 0.00019965262142167899,
"loss": 9.0978,
"step": 33
},
{
"epoch": 0.021864951768488745,
"grad_norm": 1.3255876302719116,
"learning_rate": 0.00019963975554840785,
"loss": 9.0456,
"step": 34
},
{
"epoch": 0.022508038585209004,
"grad_norm": 1.3354089260101318,
"learning_rate": 0.0001996268896751367,
"loss": 8.9979,
"step": 35
},
{
"epoch": 0.02315112540192926,
"grad_norm": 1.3058476448059082,
"learning_rate": 0.00019961402380186557,
"loss": 8.9718,
"step": 36
},
{
"epoch": 0.023794212218649517,
"grad_norm": 1.3565034866333008,
"learning_rate": 0.00019960115792859443,
"loss": 8.8975,
"step": 37
},
{
"epoch": 0.024437299035369776,
"grad_norm": 1.3475068807601929,
"learning_rate": 0.00019958829205532326,
"loss": 8.8667,
"step": 38
},
{
"epoch": 0.02508038585209003,
"grad_norm": 1.3298248052597046,
"learning_rate": 0.00019957542618205212,
"loss": 8.845,
"step": 39
},
{
"epoch": 0.02572347266881029,
"grad_norm": 1.3165632486343384,
"learning_rate": 0.00019956256030878098,
"loss": 8.8057,
"step": 40
},
{
"epoch": 0.026366559485530548,
"grad_norm": 1.3182189464569092,
"learning_rate": 0.0001995496944355098,
"loss": 8.7485,
"step": 41
},
{
"epoch": 0.027009646302250803,
"grad_norm": 1.3431370258331299,
"learning_rate": 0.00019953682856223867,
"loss": 8.6918,
"step": 42
},
{
"epoch": 0.02765273311897106,
"grad_norm": 1.317123532295227,
"learning_rate": 0.0001995239626889675,
"loss": 8.6704,
"step": 43
},
{
"epoch": 0.02829581993569132,
"grad_norm": 1.3226360082626343,
"learning_rate": 0.00019951109681569636,
"loss": 8.6149,
"step": 44
},
{
"epoch": 0.028938906752411574,
"grad_norm": 1.330268144607544,
"learning_rate": 0.00019949823094242522,
"loss": 8.5877,
"step": 45
},
{
"epoch": 0.029581993569131833,
"grad_norm": 1.3029155731201172,
"learning_rate": 0.00019948536506915408,
"loss": 8.552,
"step": 46
},
{
"epoch": 0.03022508038585209,
"grad_norm": 1.3699071407318115,
"learning_rate": 0.00019947249919588294,
"loss": 8.4862,
"step": 47
},
{
"epoch": 0.030868167202572346,
"grad_norm": 1.3225411176681519,
"learning_rate": 0.00019945963332261178,
"loss": 8.4505,
"step": 48
},
{
"epoch": 0.031511254019292605,
"grad_norm": 1.3253546953201294,
"learning_rate": 0.00019944676744934064,
"loss": 8.4268,
"step": 49
},
{
"epoch": 0.03215434083601286,
"grad_norm": 1.3012568950653076,
"learning_rate": 0.0001994339015760695,
"loss": 8.373,
"step": 50
},
{
"epoch": 0.03279742765273312,
"grad_norm": 1.329947829246521,
"learning_rate": 0.00019942103570279833,
"loss": 8.3321,
"step": 51
},
{
"epoch": 0.033440514469453377,
"grad_norm": 1.3405938148498535,
"learning_rate": 0.0001994081698295272,
"loss": 8.3065,
"step": 52
},
{
"epoch": 0.03408360128617363,
"grad_norm": 1.3572288751602173,
"learning_rate": 0.00019939530395625602,
"loss": 8.2457,
"step": 53
},
{
"epoch": 0.03472668810289389,
"grad_norm": 1.332276701927185,
"learning_rate": 0.00019938243808298488,
"loss": 8.2034,
"step": 54
},
{
"epoch": 0.03536977491961415,
"grad_norm": 1.297584891319275,
"learning_rate": 0.00019936957220971374,
"loss": 8.1932,
"step": 55
},
{
"epoch": 0.0360128617363344,
"grad_norm": 1.3370877504348755,
"learning_rate": 0.00019935670633644258,
"loss": 8.1234,
"step": 56
},
{
"epoch": 0.036655948553054665,
"grad_norm": 1.3226819038391113,
"learning_rate": 0.00019934384046317146,
"loss": 8.0947,
"step": 57
},
{
"epoch": 0.03729903536977492,
"grad_norm": 1.3129390478134155,
"learning_rate": 0.00019933097458990032,
"loss": 8.0517,
"step": 58
},
{
"epoch": 0.037942122186495175,
"grad_norm": 1.305672287940979,
"learning_rate": 0.00019931810871662916,
"loss": 8.0226,
"step": 59
},
{
"epoch": 0.03858520900321544,
"grad_norm": 1.3064535856246948,
"learning_rate": 0.00019930524284335802,
"loss": 7.992,
"step": 60
},
{
"epoch": 0.03922829581993569,
"grad_norm": 1.3159136772155762,
"learning_rate": 0.00019929237697008685,
"loss": 7.9149,
"step": 61
},
{
"epoch": 0.03987138263665595,
"grad_norm": 1.3587119579315186,
"learning_rate": 0.0001992795110968157,
"loss": 7.897,
"step": 62
},
{
"epoch": 0.04051446945337621,
"grad_norm": 1.2870018482208252,
"learning_rate": 0.00019926664522354457,
"loss": 7.8889,
"step": 63
},
{
"epoch": 0.041157556270096464,
"grad_norm": 1.3369699716567993,
"learning_rate": 0.0001992537793502734,
"loss": 7.8203,
"step": 64
},
{
"epoch": 0.04180064308681672,
"grad_norm": 1.330676794052124,
"learning_rate": 0.00019924091347700226,
"loss": 7.7895,
"step": 65
},
{
"epoch": 0.04244372990353698,
"grad_norm": 1.2833577394485474,
"learning_rate": 0.0001992280476037311,
"loss": 7.7855,
"step": 66
},
{
"epoch": 0.043086816720257236,
"grad_norm": 1.3233436346054077,
"learning_rate": 0.00019921518173045995,
"loss": 7.7267,
"step": 67
},
{
"epoch": 0.04372990353697749,
"grad_norm": 1.3114689588546753,
"learning_rate": 0.00019920231585718881,
"loss": 7.7057,
"step": 68
},
{
"epoch": 0.044372990353697746,
"grad_norm": 1.3106704950332642,
"learning_rate": 0.00019918944998391767,
"loss": 7.6647,
"step": 69
},
{
"epoch": 0.04501607717041801,
"grad_norm": 1.281671404838562,
"learning_rate": 0.00019917658411064653,
"loss": 7.6598,
"step": 70
},
{
"epoch": 0.04565916398713826,
"grad_norm": 1.2698378562927246,
"learning_rate": 0.00019916371823737537,
"loss": 7.5991,
"step": 71
},
{
"epoch": 0.04630225080385852,
"grad_norm": 1.2966727018356323,
"learning_rate": 0.00019915085236410423,
"loss": 7.5675,
"step": 72
},
{
"epoch": 0.04694533762057878,
"grad_norm": 1.3163456916809082,
"learning_rate": 0.0001991379864908331,
"loss": 7.5388,
"step": 73
},
{
"epoch": 0.047588424437299034,
"grad_norm": 1.3120678663253784,
"learning_rate": 0.00019912512061756192,
"loss": 7.5088,
"step": 74
},
{
"epoch": 0.04823151125401929,
"grad_norm": 1.3121012449264526,
"learning_rate": 0.00019911225474429078,
"loss": 7.426,
"step": 75
},
{
"epoch": 0.04887459807073955,
"grad_norm": 1.2783628702163696,
"learning_rate": 0.0001990993888710196,
"loss": 7.4365,
"step": 76
},
{
"epoch": 0.049517684887459806,
"grad_norm": 1.275612235069275,
"learning_rate": 0.00019908652299774847,
"loss": 7.3942,
"step": 77
},
{
"epoch": 0.05016077170418006,
"grad_norm": 1.352935552597046,
"learning_rate": 0.00019907365712447733,
"loss": 7.3319,
"step": 78
},
{
"epoch": 0.05080385852090032,
"grad_norm": 1.299548625946045,
"learning_rate": 0.0001990607912512062,
"loss": 7.3322,
"step": 79
},
{
"epoch": 0.05144694533762058,
"grad_norm": 1.29573392868042,
"learning_rate": 0.00019904792537793505,
"loss": 7.315,
"step": 80
},
{
"epoch": 0.05209003215434083,
"grad_norm": 1.2807579040527344,
"learning_rate": 0.00019903505950466389,
"loss": 7.2971,
"step": 81
},
{
"epoch": 0.052733118971061095,
"grad_norm": 1.2788913249969482,
"learning_rate": 0.00019902219363139275,
"loss": 7.217,
"step": 82
},
{
"epoch": 0.05337620578778135,
"grad_norm": 1.2539947032928467,
"learning_rate": 0.0001990093277581216,
"loss": 7.2079,
"step": 83
},
{
"epoch": 0.054019292604501605,
"grad_norm": 1.2816716432571411,
"learning_rate": 0.00019899646188485044,
"loss": 7.1968,
"step": 84
},
{
"epoch": 0.05466237942122187,
"grad_norm": 1.2161004543304443,
"learning_rate": 0.0001989835960115793,
"loss": 7.2596,
"step": 85
},
{
"epoch": 0.05530546623794212,
"grad_norm": 1.250944972038269,
"learning_rate": 0.00019897073013830813,
"loss": 7.1328,
"step": 86
},
{
"epoch": 0.05594855305466238,
"grad_norm": 1.2679494619369507,
"learning_rate": 0.000198957864265037,
"loss": 7.1424,
"step": 87
},
{
"epoch": 0.05659163987138264,
"grad_norm": 1.2727211713790894,
"learning_rate": 0.00019894499839176585,
"loss": 7.1108,
"step": 88
},
{
"epoch": 0.057234726688102894,
"grad_norm": 1.250409722328186,
"learning_rate": 0.00019893213251849468,
"loss": 7.043,
"step": 89
},
{
"epoch": 0.05787781350482315,
"grad_norm": 1.2838069200515747,
"learning_rate": 0.00019891926664522354,
"loss": 7.0079,
"step": 90
},
{
"epoch": 0.05852090032154341,
"grad_norm": 1.269007682800293,
"learning_rate": 0.0001989064007719524,
"loss": 7.0067,
"step": 91
},
{
"epoch": 0.059163987138263666,
"grad_norm": 1.2593508958816528,
"learning_rate": 0.00019889353489868126,
"loss": 6.9798,
"step": 92
},
{
"epoch": 0.05980707395498392,
"grad_norm": 1.237831950187683,
"learning_rate": 0.00019888066902541012,
"loss": 6.9631,
"step": 93
},
{
"epoch": 0.06045016077170418,
"grad_norm": 1.259639024734497,
"learning_rate": 0.00019886780315213896,
"loss": 6.9282,
"step": 94
},
{
"epoch": 0.06109324758842444,
"grad_norm": 1.245284080505371,
"learning_rate": 0.00019885493727886782,
"loss": 6.9056,
"step": 95
},
{
"epoch": 0.06173633440514469,
"grad_norm": 1.20187246799469,
"learning_rate": 0.00019884207140559668,
"loss": 6.8874,
"step": 96
},
{
"epoch": 0.062379421221864954,
"grad_norm": 1.2145123481750488,
"learning_rate": 0.0001988292055323255,
"loss": 6.8898,
"step": 97
},
{
"epoch": 0.06302250803858521,
"grad_norm": 1.2658990621566772,
"learning_rate": 0.00019881633965905437,
"loss": 6.831,
"step": 98
},
{
"epoch": 0.06366559485530547,
"grad_norm": 1.2510586977005005,
"learning_rate": 0.0001988034737857832,
"loss": 6.8096,
"step": 99
},
{
"epoch": 0.06430868167202572,
"grad_norm": 1.2315446138381958,
"learning_rate": 0.00019879060791251206,
"loss": 6.8044,
"step": 100
},
{
"epoch": 0.06495176848874598,
"grad_norm": 1.2362980842590332,
"learning_rate": 0.00019877774203924092,
"loss": 6.7773,
"step": 101
},
{
"epoch": 0.06559485530546624,
"grad_norm": 1.2314499616622925,
"learning_rate": 0.00019876487616596978,
"loss": 6.7239,
"step": 102
},
{
"epoch": 0.06623794212218649,
"grad_norm": 1.2099567651748657,
"learning_rate": 0.00019875201029269864,
"loss": 6.7143,
"step": 103
},
{
"epoch": 0.06688102893890675,
"grad_norm": 1.2021255493164062,
"learning_rate": 0.00019873914441942748,
"loss": 6.7148,
"step": 104
},
{
"epoch": 0.06752411575562701,
"grad_norm": 1.2055983543395996,
"learning_rate": 0.00019872627854615634,
"loss": 6.6475,
"step": 105
},
{
"epoch": 0.06816720257234726,
"grad_norm": 1.1841413974761963,
"learning_rate": 0.0001987134126728852,
"loss": 6.6964,
"step": 106
},
{
"epoch": 0.06881028938906752,
"grad_norm": 1.2152525186538696,
"learning_rate": 0.00019870054679961403,
"loss": 6.6206,
"step": 107
},
{
"epoch": 0.06945337620578779,
"grad_norm": 1.2249363660812378,
"learning_rate": 0.0001986876809263429,
"loss": 6.5891,
"step": 108
},
{
"epoch": 0.07009646302250803,
"grad_norm": 1.2074142694473267,
"learning_rate": 0.00019867481505307172,
"loss": 6.6226,
"step": 109
},
{
"epoch": 0.0707395498392283,
"grad_norm": 1.206978678703308,
"learning_rate": 0.00019866194917980058,
"loss": 6.5834,
"step": 110
},
{
"epoch": 0.07138263665594856,
"grad_norm": 1.2047126293182373,
"learning_rate": 0.00019864908330652944,
"loss": 6.5454,
"step": 111
},
{
"epoch": 0.0720257234726688,
"grad_norm": 1.1759907007217407,
"learning_rate": 0.00019863621743325827,
"loss": 6.4939,
"step": 112
},
{
"epoch": 0.07266881028938907,
"grad_norm": 1.20724356174469,
"learning_rate": 0.00019862335155998713,
"loss": 6.5207,
"step": 113
},
{
"epoch": 0.07331189710610933,
"grad_norm": 1.1968728303909302,
"learning_rate": 0.000198610485686716,
"loss": 6.4886,
"step": 114
},
{
"epoch": 0.07395498392282958,
"grad_norm": 1.17826247215271,
"learning_rate": 0.00019859761981344485,
"loss": 6.4474,
"step": 115
},
{
"epoch": 0.07459807073954984,
"grad_norm": 1.1574997901916504,
"learning_rate": 0.00019858475394017371,
"loss": 6.4592,
"step": 116
},
{
"epoch": 0.0752411575562701,
"grad_norm": 1.1507346630096436,
"learning_rate": 0.00019857188806690255,
"loss": 6.4508,
"step": 117
},
{
"epoch": 0.07588424437299035,
"grad_norm": 1.1639803647994995,
"learning_rate": 0.0001985590221936314,
"loss": 6.453,
"step": 118
},
{
"epoch": 0.07652733118971061,
"grad_norm": 1.1529334783554077,
"learning_rate": 0.00019854615632036024,
"loss": 6.4111,
"step": 119
},
{
"epoch": 0.07717041800643087,
"grad_norm": 1.1437855958938599,
"learning_rate": 0.0001985332904470891,
"loss": 6.3518,
"step": 120
},
{
"epoch": 0.07781350482315112,
"grad_norm": 1.1764041185379028,
"learning_rate": 0.00019852042457381796,
"loss": 6.3225,
"step": 121
},
{
"epoch": 0.07845659163987138,
"grad_norm": 1.1735254526138306,
"learning_rate": 0.0001985075587005468,
"loss": 6.3173,
"step": 122
},
{
"epoch": 0.07909967845659165,
"grad_norm": 1.1502288579940796,
"learning_rate": 0.00019849469282727565,
"loss": 6.3184,
"step": 123
},
{
"epoch": 0.0797427652733119,
"grad_norm": 1.1381540298461914,
"learning_rate": 0.0001984818269540045,
"loss": 6.2804,
"step": 124
},
{
"epoch": 0.08038585209003216,
"grad_norm": 1.1680670976638794,
"learning_rate": 0.00019846896108073337,
"loss": 6.2858,
"step": 125
},
{
"epoch": 0.08102893890675242,
"grad_norm": 1.1194247007369995,
"learning_rate": 0.00019845609520746223,
"loss": 6.2869,
"step": 126
},
{
"epoch": 0.08167202572347267,
"grad_norm": 1.1161319017410278,
"learning_rate": 0.00019844322933419107,
"loss": 6.2556,
"step": 127
},
{
"epoch": 0.08231511254019293,
"grad_norm": 1.1190017461776733,
"learning_rate": 0.00019843036346091993,
"loss": 6.1889,
"step": 128
},
{
"epoch": 0.08295819935691319,
"grad_norm": 1.1169451475143433,
"learning_rate": 0.00019841749758764879,
"loss": 6.1969,
"step": 129
},
{
"epoch": 0.08360128617363344,
"grad_norm": 1.1366708278656006,
"learning_rate": 0.00019840463171437762,
"loss": 6.2215,
"step": 130
},
{
"epoch": 0.0842443729903537,
"grad_norm": 1.1058257818222046,
"learning_rate": 0.00019839176584110648,
"loss": 6.1801,
"step": 131
},
{
"epoch": 0.08488745980707396,
"grad_norm": 1.1087491512298584,
"learning_rate": 0.0001983788999678353,
"loss": 6.1201,
"step": 132
},
{
"epoch": 0.08553054662379421,
"grad_norm": 1.0842581987380981,
"learning_rate": 0.00019836603409456417,
"loss": 6.1497,
"step": 133
},
{
"epoch": 0.08617363344051447,
"grad_norm": 1.12769615650177,
"learning_rate": 0.00019835316822129303,
"loss": 6.111,
"step": 134
},
{
"epoch": 0.08681672025723473,
"grad_norm": 1.0909464359283447,
"learning_rate": 0.00019834030234802186,
"loss": 6.0962,
"step": 135
},
{
"epoch": 0.08745980707395498,
"grad_norm": 1.0879305601119995,
"learning_rate": 0.00019832743647475075,
"loss": 6.0805,
"step": 136
},
{
"epoch": 0.08810289389067524,
"grad_norm": 1.0964096784591675,
"learning_rate": 0.00019831457060147958,
"loss": 6.0659,
"step": 137
},
{
"epoch": 0.08874598070739549,
"grad_norm": 1.059978723526001,
"learning_rate": 0.00019830170472820844,
"loss": 6.0938,
"step": 138
},
{
"epoch": 0.08938906752411575,
"grad_norm": 1.0322396755218506,
"learning_rate": 0.0001982888388549373,
"loss": 6.0523,
"step": 139
},
{
"epoch": 0.09003215434083602,
"grad_norm": 1.0639983415603638,
"learning_rate": 0.00019827597298166614,
"loss": 6.0221,
"step": 140
},
{
"epoch": 0.09067524115755626,
"grad_norm": 1.0435878038406372,
"learning_rate": 0.000198263107108395,
"loss": 6.0895,
"step": 141
},
{
"epoch": 0.09131832797427653,
"grad_norm": 1.070421814918518,
"learning_rate": 0.00019825024123512383,
"loss": 6.0178,
"step": 142
},
{
"epoch": 0.09196141479099679,
"grad_norm": 1.0622031688690186,
"learning_rate": 0.0001982373753618527,
"loss": 6.043,
"step": 143
},
{
"epoch": 0.09260450160771704,
"grad_norm": 1.0540876388549805,
"learning_rate": 0.00019822450948858155,
"loss": 6.014,
"step": 144
},
{
"epoch": 0.0932475884244373,
"grad_norm": 1.0724049806594849,
"learning_rate": 0.00019821164361531038,
"loss": 5.9128,
"step": 145
},
{
"epoch": 0.09389067524115756,
"grad_norm": 1.066551685333252,
"learning_rate": 0.00019819877774203924,
"loss": 5.951,
"step": 146
},
{
"epoch": 0.09453376205787781,
"grad_norm": 1.0557595491409302,
"learning_rate": 0.0001981859118687681,
"loss": 5.9189,
"step": 147
},
{
"epoch": 0.09517684887459807,
"grad_norm": 1.0220601558685303,
"learning_rate": 0.00019817304599549696,
"loss": 5.9411,
"step": 148
},
{
"epoch": 0.09581993569131833,
"grad_norm": 1.0399051904678345,
"learning_rate": 0.00019816018012222582,
"loss": 5.9042,
"step": 149
},
{
"epoch": 0.09646302250803858,
"grad_norm": 1.0221998691558838,
"learning_rate": 0.00019814731424895466,
"loss": 5.92,
"step": 150
},
{
"epoch": 0.09710610932475884,
"grad_norm": 1.019439935684204,
"learning_rate": 0.00019813444837568352,
"loss": 5.8743,
"step": 151
},
{
"epoch": 0.0977491961414791,
"grad_norm": 1.0221492052078247,
"learning_rate": 0.00019812158250241235,
"loss": 5.8733,
"step": 152
},
{
"epoch": 0.09839228295819935,
"grad_norm": 0.9593134522438049,
"learning_rate": 0.0001981087166291412,
"loss": 5.8632,
"step": 153
},
{
"epoch": 0.09903536977491961,
"grad_norm": 1.0003180503845215,
"learning_rate": 0.00019809585075587007,
"loss": 5.8589,
"step": 154
},
{
"epoch": 0.09967845659163987,
"grad_norm": 1.0050652027130127,
"learning_rate": 0.0001980829848825989,
"loss": 5.808,
"step": 155
},
{
"epoch": 0.10032154340836012,
"grad_norm": 1.0235966444015503,
"learning_rate": 0.00019807011900932776,
"loss": 5.7926,
"step": 156
},
{
"epoch": 0.10096463022508038,
"grad_norm": 1.0320799350738525,
"learning_rate": 0.00019805725313605662,
"loss": 5.7693,
"step": 157
},
{
"epoch": 0.10160771704180065,
"grad_norm": 0.9429930448532104,
"learning_rate": 0.00019804438726278548,
"loss": 5.8678,
"step": 158
},
{
"epoch": 0.1022508038585209,
"grad_norm": 0.9937098622322083,
"learning_rate": 0.00019803152138951434,
"loss": 5.7777,
"step": 159
},
{
"epoch": 0.10289389067524116,
"grad_norm": 0.9316997528076172,
"learning_rate": 0.00019801865551624317,
"loss": 5.8055,
"step": 160
},
{
"epoch": 0.10353697749196142,
"grad_norm": 0.9722780585289001,
"learning_rate": 0.00019800578964297203,
"loss": 5.7881,
"step": 161
},
{
"epoch": 0.10418006430868167,
"grad_norm": 0.9671345353126526,
"learning_rate": 0.0001979929237697009,
"loss": 5.7469,
"step": 162
},
{
"epoch": 0.10482315112540193,
"grad_norm": 0.9491237998008728,
"learning_rate": 0.00019798005789642973,
"loss": 5.7068,
"step": 163
},
{
"epoch": 0.10546623794212219,
"grad_norm": 0.912465512752533,
"learning_rate": 0.0001979671920231586,
"loss": 5.7732,
"step": 164
},
{
"epoch": 0.10610932475884244,
"grad_norm": 0.9776575565338135,
"learning_rate": 0.00019795432614988742,
"loss": 5.6843,
"step": 165
},
{
"epoch": 0.1067524115755627,
"grad_norm": 0.9189504384994507,
"learning_rate": 0.00019794146027661628,
"loss": 5.6807,
"step": 166
},
{
"epoch": 0.10739549839228296,
"grad_norm": 0.9223593473434448,
"learning_rate": 0.00019792859440334514,
"loss": 5.6291,
"step": 167
},
{
"epoch": 0.10803858520900321,
"grad_norm": 0.9045426249504089,
"learning_rate": 0.00019791572853007397,
"loss": 5.6484,
"step": 168
},
{
"epoch": 0.10868167202572347,
"grad_norm": 0.9032799601554871,
"learning_rate": 0.00019790286265680283,
"loss": 5.6995,
"step": 169
},
{
"epoch": 0.10932475884244373,
"grad_norm": 0.9077114462852478,
"learning_rate": 0.0001978899967835317,
"loss": 5.5522,
"step": 170
},
{
"epoch": 0.10996784565916398,
"grad_norm": 0.9258122444152832,
"learning_rate": 0.00019787713091026055,
"loss": 5.5694,
"step": 171
},
{
"epoch": 0.11061093247588424,
"grad_norm": 0.9175561666488647,
"learning_rate": 0.00019786426503698941,
"loss": 5.5755,
"step": 172
},
{
"epoch": 0.1112540192926045,
"grad_norm": 0.9215130805969238,
"learning_rate": 0.00019785139916371825,
"loss": 5.5379,
"step": 173
},
{
"epoch": 0.11189710610932475,
"grad_norm": 0.9266327023506165,
"learning_rate": 0.0001978385332904471,
"loss": 5.582,
"step": 174
},
{
"epoch": 0.11254019292604502,
"grad_norm": 0.9182630777359009,
"learning_rate": 0.00019782566741717594,
"loss": 5.6012,
"step": 175
},
{
"epoch": 0.11318327974276528,
"grad_norm": 0.8782434463500977,
"learning_rate": 0.0001978128015439048,
"loss": 5.5496,
"step": 176
},
{
"epoch": 0.11382636655948553,
"grad_norm": 0.8495333194732666,
"learning_rate": 0.00019779993567063366,
"loss": 5.6246,
"step": 177
},
{
"epoch": 0.11446945337620579,
"grad_norm": 0.8858991861343384,
"learning_rate": 0.0001977870697973625,
"loss": 5.4775,
"step": 178
},
{
"epoch": 0.11511254019292605,
"grad_norm": 0.9661223292350769,
"learning_rate": 0.00019777420392409135,
"loss": 5.5474,
"step": 179
},
{
"epoch": 0.1157556270096463,
"grad_norm": 0.9988970160484314,
"learning_rate": 0.0001977613380508202,
"loss": 5.596,
"step": 180
},
{
"epoch": 0.11639871382636656,
"grad_norm": 0.8391322493553162,
"learning_rate": 0.00019774847217754907,
"loss": 5.5846,
"step": 181
},
{
"epoch": 0.11704180064308682,
"grad_norm": 1.1805757284164429,
"learning_rate": 0.00019773560630427793,
"loss": 5.5573,
"step": 182
},
{
"epoch": 0.11768488745980707,
"grad_norm": 1.0904446840286255,
"learning_rate": 0.00019772274043100677,
"loss": 5.4494,
"step": 183
},
{
"epoch": 0.11832797427652733,
"grad_norm": 0.8829322457313538,
"learning_rate": 0.00019770987455773563,
"loss": 5.4855,
"step": 184
},
{
"epoch": 0.1189710610932476,
"grad_norm": 1.4423890113830566,
"learning_rate": 0.00019769700868446446,
"loss": 5.5052,
"step": 185
},
{
"epoch": 0.11961414790996784,
"grad_norm": 0.8625758290290833,
"learning_rate": 0.00019768414281119332,
"loss": 5.4188,
"step": 186
},
{
"epoch": 0.1202572347266881,
"grad_norm": 1.0383747816085815,
"learning_rate": 0.00019767127693792218,
"loss": 5.4561,
"step": 187
},
{
"epoch": 0.12090032154340837,
"grad_norm": 0.8543033003807068,
"learning_rate": 0.000197658411064651,
"loss": 5.427,
"step": 188
},
{
"epoch": 0.12154340836012861,
"grad_norm": 1.1611664295196533,
"learning_rate": 0.00019764554519137987,
"loss": 5.4432,
"step": 189
},
{
"epoch": 0.12218649517684887,
"grad_norm": 0.8035333156585693,
"learning_rate": 0.0001976326793181087,
"loss": 5.4058,
"step": 190
},
{
"epoch": 0.12282958199356914,
"grad_norm": 1.01143217086792,
"learning_rate": 0.00019761981344483756,
"loss": 5.4268,
"step": 191
},
{
"epoch": 0.12347266881028938,
"grad_norm": 0.8253054022789001,
"learning_rate": 0.00019760694757156642,
"loss": 5.3515,
"step": 192
},
{
"epoch": 0.12411575562700965,
"grad_norm": 1.043700098991394,
"learning_rate": 0.00019759408169829528,
"loss": 5.3555,
"step": 193
},
{
"epoch": 0.12475884244372991,
"grad_norm": 0.8708436489105225,
"learning_rate": 0.00019758121582502414,
"loss": 5.4721,
"step": 194
},
{
"epoch": 0.12540192926045016,
"grad_norm": 0.80093914270401,
"learning_rate": 0.000197568349951753,
"loss": 5.4312,
"step": 195
},
{
"epoch": 0.12604501607717042,
"grad_norm": 0.8914348483085632,
"learning_rate": 0.00019755548407848184,
"loss": 5.2986,
"step": 196
},
{
"epoch": 0.12668810289389068,
"grad_norm": 0.8047232627868652,
"learning_rate": 0.0001975426182052107,
"loss": 5.3595,
"step": 197
},
{
"epoch": 0.12733118971061094,
"grad_norm": 0.8185480833053589,
"learning_rate": 0.00019752975233193953,
"loss": 5.3846,
"step": 198
},
{
"epoch": 0.12797427652733118,
"grad_norm": 0.8160484433174133,
"learning_rate": 0.0001975168864586684,
"loss": 5.2739,
"step": 199
},
{
"epoch": 0.12861736334405144,
"grad_norm": 0.8484665155410767,
"learning_rate": 0.00019750402058539725,
"loss": 5.4102,
"step": 200
},
{
"epoch": 0.1292604501607717,
"grad_norm": 0.8212493062019348,
"learning_rate": 0.00019749115471212608,
"loss": 5.338,
"step": 201
},
{
"epoch": 0.12990353697749196,
"grad_norm": 0.8622007966041565,
"learning_rate": 0.00019747828883885494,
"loss": 5.2163,
"step": 202
},
{
"epoch": 0.13054662379421222,
"grad_norm": 0.7989214062690735,
"learning_rate": 0.0001974654229655838,
"loss": 5.3385,
"step": 203
},
{
"epoch": 0.1311897106109325,
"grad_norm": 0.971836268901825,
"learning_rate": 0.00019745255709231266,
"loss": 5.3164,
"step": 204
},
{
"epoch": 0.13183279742765272,
"grad_norm": 0.7155956029891968,
"learning_rate": 0.00019743969121904152,
"loss": 5.333,
"step": 205
},
{
"epoch": 0.13247588424437298,
"grad_norm": 0.9477896690368652,
"learning_rate": 0.00019742682534577036,
"loss": 5.2987,
"step": 206
},
{
"epoch": 0.13311897106109324,
"grad_norm": 0.7635351419448853,
"learning_rate": 0.00019741395947249922,
"loss": 5.2301,
"step": 207
},
{
"epoch": 0.1337620578778135,
"grad_norm": 0.8150351643562317,
"learning_rate": 0.00019740109359922805,
"loss": 5.3204,
"step": 208
},
{
"epoch": 0.13440514469453377,
"grad_norm": 0.7588405013084412,
"learning_rate": 0.0001973882277259569,
"loss": 5.2901,
"step": 209
},
{
"epoch": 0.13504823151125403,
"grad_norm": 0.7870814800262451,
"learning_rate": 0.00019737536185268577,
"loss": 5.1423,
"step": 210
},
{
"epoch": 0.13569131832797426,
"grad_norm": 0.8244409561157227,
"learning_rate": 0.0001973624959794146,
"loss": 5.1665,
"step": 211
},
{
"epoch": 0.13633440514469453,
"grad_norm": 0.7376208901405334,
"learning_rate": 0.00019734963010614346,
"loss": 5.248,
"step": 212
},
{
"epoch": 0.1369774919614148,
"grad_norm": 0.7543022632598877,
"learning_rate": 0.0001973367642328723,
"loss": 5.2257,
"step": 213
},
{
"epoch": 0.13762057877813505,
"grad_norm": 0.7643454074859619,
"learning_rate": 0.00019732389835960115,
"loss": 5.1449,
"step": 214
},
{
"epoch": 0.1382636655948553,
"grad_norm": 0.8774954676628113,
"learning_rate": 0.00019731103248633001,
"loss": 5.168,
"step": 215
},
{
"epoch": 0.13890675241157557,
"grad_norm": 0.7466834187507629,
"learning_rate": 0.00019729816661305887,
"loss": 5.162,
"step": 216
},
{
"epoch": 0.1395498392282958,
"grad_norm": 0.7911695837974548,
"learning_rate": 0.00019728530073978773,
"loss": 5.1449,
"step": 217
},
{
"epoch": 0.14019292604501607,
"grad_norm": 0.7217370271682739,
"learning_rate": 0.00019727243486651657,
"loss": 5.2318,
"step": 218
},
{
"epoch": 0.14083601286173633,
"grad_norm": 0.8801223039627075,
"learning_rate": 0.00019725956899324543,
"loss": 5.1067,
"step": 219
},
{
"epoch": 0.1414790996784566,
"grad_norm": 0.7971013188362122,
"learning_rate": 0.0001972467031199743,
"loss": 5.1875,
"step": 220
},
{
"epoch": 0.14212218649517686,
"grad_norm": 0.8382704257965088,
"learning_rate": 0.00019723383724670312,
"loss": 5.092,
"step": 221
},
{
"epoch": 0.14276527331189712,
"grad_norm": 0.8966850638389587,
"learning_rate": 0.00019722097137343198,
"loss": 5.1178,
"step": 222
},
{
"epoch": 0.14340836012861735,
"grad_norm": 0.7162383794784546,
"learning_rate": 0.0001972081055001608,
"loss": 5.1642,
"step": 223
},
{
"epoch": 0.1440514469453376,
"grad_norm": 0.7657078504562378,
"learning_rate": 0.00019719523962688967,
"loss": 5.102,
"step": 224
},
{
"epoch": 0.14469453376205788,
"grad_norm": 0.8197327256202698,
"learning_rate": 0.00019718237375361853,
"loss": 5.0262,
"step": 225
},
{
"epoch": 0.14533762057877814,
"grad_norm": 0.7781340479850769,
"learning_rate": 0.0001971695078803474,
"loss": 5.039,
"step": 226
},
{
"epoch": 0.1459807073954984,
"grad_norm": 0.6628583073616028,
"learning_rate": 0.00019715664200707625,
"loss": 5.0771,
"step": 227
},
{
"epoch": 0.14662379421221866,
"grad_norm": 0.7722617983818054,
"learning_rate": 0.00019714377613380509,
"loss": 4.9389,
"step": 228
},
{
"epoch": 0.1472668810289389,
"grad_norm": 0.7367834448814392,
"learning_rate": 0.00019713091026053395,
"loss": 5.0164,
"step": 229
},
{
"epoch": 0.14790996784565916,
"grad_norm": 0.6781874895095825,
"learning_rate": 0.0001971180443872628,
"loss": 5.0505,
"step": 230
},
{
"epoch": 0.14855305466237942,
"grad_norm": 0.7370553016662598,
"learning_rate": 0.00019710517851399164,
"loss": 5.027,
"step": 231
},
{
"epoch": 0.14919614147909968,
"grad_norm": 0.6999610662460327,
"learning_rate": 0.0001970923126407205,
"loss": 4.9715,
"step": 232
},
{
"epoch": 0.14983922829581994,
"grad_norm": 0.7564293146133423,
"learning_rate": 0.00019707944676744936,
"loss": 4.9666,
"step": 233
},
{
"epoch": 0.1504823151125402,
"grad_norm": 0.6723111271858215,
"learning_rate": 0.0001970665808941782,
"loss": 4.986,
"step": 234
},
{
"epoch": 0.15112540192926044,
"grad_norm": 0.6594771146774292,
"learning_rate": 0.00019705371502090705,
"loss": 5.0235,
"step": 235
},
{
"epoch": 0.1517684887459807,
"grad_norm": 0.681059718132019,
"learning_rate": 0.00019704084914763588,
"loss": 5.014,
"step": 236
},
{
"epoch": 0.15241157556270096,
"grad_norm": 0.7105944156646729,
"learning_rate": 0.00019702798327436477,
"loss": 4.9888,
"step": 237
},
{
"epoch": 0.15305466237942122,
"grad_norm": 0.676173210144043,
"learning_rate": 0.00019701511740109363,
"loss": 4.963,
"step": 238
},
{
"epoch": 0.1536977491961415,
"grad_norm": 0.6689148545265198,
"learning_rate": 0.00019700225152782246,
"loss": 5.0294,
"step": 239
},
{
"epoch": 0.15434083601286175,
"grad_norm": 0.6836123466491699,
"learning_rate": 0.00019698938565455132,
"loss": 4.8988,
"step": 240
},
{
"epoch": 0.15498392282958198,
"grad_norm": 0.7466531991958618,
"learning_rate": 0.00019697651978128016,
"loss": 4.7996,
"step": 241
},
{
"epoch": 0.15562700964630224,
"grad_norm": 0.6778351664543152,
"learning_rate": 0.00019696365390800902,
"loss": 4.9041,
"step": 242
},
{
"epoch": 0.1562700964630225,
"grad_norm": 0.6739835739135742,
"learning_rate": 0.00019695078803473788,
"loss": 4.9133,
"step": 243
},
{
"epoch": 0.15691318327974277,
"grad_norm": 0.7548195123672485,
"learning_rate": 0.0001969379221614667,
"loss": 4.8105,
"step": 244
},
{
"epoch": 0.15755627009646303,
"grad_norm": 0.753843367099762,
"learning_rate": 0.00019692505628819557,
"loss": 4.8267,
"step": 245
},
{
"epoch": 0.1581993569131833,
"grad_norm": 0.7508904933929443,
"learning_rate": 0.0001969121904149244,
"loss": 4.94,
"step": 246
},
{
"epoch": 0.15884244372990353,
"grad_norm": 0.6692425012588501,
"learning_rate": 0.00019689932454165326,
"loss": 4.8711,
"step": 247
},
{
"epoch": 0.1594855305466238,
"grad_norm": 0.7251532077789307,
"learning_rate": 0.00019688645866838212,
"loss": 4.9862,
"step": 248
},
{
"epoch": 0.16012861736334405,
"grad_norm": 0.8498165011405945,
"learning_rate": 0.00019687359279511098,
"loss": 4.8864,
"step": 249
},
{
"epoch": 0.1607717041800643,
"grad_norm": 0.6454371809959412,
"learning_rate": 0.00019686072692183984,
"loss": 4.7845,
"step": 250
},
{
"epoch": 0.16141479099678457,
"grad_norm": 0.7783510088920593,
"learning_rate": 0.00019684786104856868,
"loss": 4.8436,
"step": 251
},
{
"epoch": 0.16205787781350484,
"grad_norm": 0.754112720489502,
"learning_rate": 0.00019683499517529754,
"loss": 4.8208,
"step": 252
},
{
"epoch": 0.16270096463022507,
"grad_norm": 0.6148940324783325,
"learning_rate": 0.0001968221293020264,
"loss": 4.8404,
"step": 253
},
{
"epoch": 0.16334405144694533,
"grad_norm": 0.754871129989624,
"learning_rate": 0.00019680926342875523,
"loss": 4.8534,
"step": 254
},
{
"epoch": 0.1639871382636656,
"grad_norm": 0.7887524962425232,
"learning_rate": 0.0001967963975554841,
"loss": 4.84,
"step": 255
},
{
"epoch": 0.16463022508038586,
"grad_norm": 0.6893947124481201,
"learning_rate": 0.00019678353168221292,
"loss": 4.729,
"step": 256
},
{
"epoch": 0.16527331189710612,
"grad_norm": 0.6434347629547119,
"learning_rate": 0.00019677066580894178,
"loss": 4.8239,
"step": 257
},
{
"epoch": 0.16591639871382638,
"grad_norm": 0.650383472442627,
"learning_rate": 0.00019675779993567064,
"loss": 4.8329,
"step": 258
},
{
"epoch": 0.1665594855305466,
"grad_norm": 0.6655612587928772,
"learning_rate": 0.0001967449340623995,
"loss": 4.8266,
"step": 259
},
{
"epoch": 0.16720257234726688,
"grad_norm": 0.6401621103286743,
"learning_rate": 0.00019673206818912836,
"loss": 4.7178,
"step": 260
},
{
"epoch": 0.16784565916398714,
"grad_norm": 0.7202311754226685,
"learning_rate": 0.0001967192023158572,
"loss": 4.7191,
"step": 261
},
{
"epoch": 0.1684887459807074,
"grad_norm": 0.6817168593406677,
"learning_rate": 0.00019670633644258605,
"loss": 4.7444,
"step": 262
},
{
"epoch": 0.16913183279742766,
"grad_norm": 0.7872809767723083,
"learning_rate": 0.00019669347056931491,
"loss": 4.7297,
"step": 263
},
{
"epoch": 0.16977491961414792,
"grad_norm": 0.9827927947044373,
"learning_rate": 0.00019668060469604375,
"loss": 4.6757,
"step": 264
},
{
"epoch": 0.17041800643086816,
"grad_norm": 0.743180513381958,
"learning_rate": 0.0001966677388227726,
"loss": 4.7413,
"step": 265
},
{
"epoch": 0.17106109324758842,
"grad_norm": 0.9192097783088684,
"learning_rate": 0.00019665487294950147,
"loss": 4.7543,
"step": 266
},
{
"epoch": 0.17170418006430868,
"grad_norm": 1.0367995500564575,
"learning_rate": 0.0001966420070762303,
"loss": 4.7055,
"step": 267
},
{
"epoch": 0.17234726688102894,
"grad_norm": 0.6918021440505981,
"learning_rate": 0.00019662914120295916,
"loss": 4.7372,
"step": 268
},
{
"epoch": 0.1729903536977492,
"grad_norm": 1.148046851158142,
"learning_rate": 0.000196616275329688,
"loss": 4.7336,
"step": 269
},
{
"epoch": 0.17363344051446947,
"grad_norm": 0.8208276629447937,
"learning_rate": 0.00019660340945641685,
"loss": 4.7057,
"step": 270
},
{
"epoch": 0.1742765273311897,
"grad_norm": 0.8050063848495483,
"learning_rate": 0.0001965905435831457,
"loss": 4.6625,
"step": 271
},
{
"epoch": 0.17491961414790996,
"grad_norm": 0.9354422092437744,
"learning_rate": 0.00019657767770987457,
"loss": 4.6884,
"step": 272
},
{
"epoch": 0.17556270096463023,
"grad_norm": 0.6692061424255371,
"learning_rate": 0.00019656481183660343,
"loss": 4.6549,
"step": 273
},
{
"epoch": 0.1762057877813505,
"grad_norm": 1.0897949934005737,
"learning_rate": 0.00019655194596333227,
"loss": 4.6261,
"step": 274
},
{
"epoch": 0.17684887459807075,
"grad_norm": 0.7366625666618347,
"learning_rate": 0.00019653908009006113,
"loss": 4.6679,
"step": 275
},
{
"epoch": 0.17749196141479098,
"grad_norm": 0.8077712655067444,
"learning_rate": 0.00019652621421678999,
"loss": 4.6497,
"step": 276
},
{
"epoch": 0.17813504823151124,
"grad_norm": 0.833954930305481,
"learning_rate": 0.00019651334834351882,
"loss": 4.6921,
"step": 277
},
{
"epoch": 0.1787781350482315,
"grad_norm": 0.7255340218544006,
"learning_rate": 0.00019650048247024768,
"loss": 4.5728,
"step": 278
},
{
"epoch": 0.17942122186495177,
"grad_norm": 0.9826524257659912,
"learning_rate": 0.0001964876165969765,
"loss": 4.6316,
"step": 279
},
{
"epoch": 0.18006430868167203,
"grad_norm": 0.6553508043289185,
"learning_rate": 0.00019647475072370537,
"loss": 4.6898,
"step": 280
},
{
"epoch": 0.1807073954983923,
"grad_norm": 0.7640435099601746,
"learning_rate": 0.00019646188485043423,
"loss": 4.67,
"step": 281
},
{
"epoch": 0.18135048231511253,
"grad_norm": 0.8134769201278687,
"learning_rate": 0.0001964490189771631,
"loss": 4.5907,
"step": 282
},
{
"epoch": 0.1819935691318328,
"grad_norm": 0.8734142780303955,
"learning_rate": 0.00019643615310389195,
"loss": 4.5712,
"step": 283
},
{
"epoch": 0.18263665594855305,
"grad_norm": 0.75421541929245,
"learning_rate": 0.00019642328723062078,
"loss": 4.5915,
"step": 284
},
{
"epoch": 0.1832797427652733,
"grad_norm": 0.6870218515396118,
"learning_rate": 0.00019641042135734964,
"loss": 4.6099,
"step": 285
},
{
"epoch": 0.18392282958199357,
"grad_norm": 0.754561722278595,
"learning_rate": 0.0001963975554840785,
"loss": 4.5375,
"step": 286
},
{
"epoch": 0.18456591639871384,
"grad_norm": 0.6426610350608826,
"learning_rate": 0.00019638468961080734,
"loss": 4.5184,
"step": 287
},
{
"epoch": 0.18520900321543407,
"grad_norm": 0.7033764123916626,
"learning_rate": 0.0001963718237375362,
"loss": 4.5226,
"step": 288
},
{
"epoch": 0.18585209003215433,
"grad_norm": 0.683521032333374,
"learning_rate": 0.00019635895786426503,
"loss": 4.6466,
"step": 289
},
{
"epoch": 0.1864951768488746,
"grad_norm": 0.7819412350654602,
"learning_rate": 0.0001963460919909939,
"loss": 4.5982,
"step": 290
},
{
"epoch": 0.18713826366559486,
"grad_norm": 0.7259661555290222,
"learning_rate": 0.00019633322611772275,
"loss": 4.6051,
"step": 291
},
{
"epoch": 0.18778135048231512,
"grad_norm": 0.7699325084686279,
"learning_rate": 0.00019632036024445158,
"loss": 4.5585,
"step": 292
},
{
"epoch": 0.18842443729903538,
"grad_norm": 0.7889232039451599,
"learning_rate": 0.00019630749437118044,
"loss": 4.5264,
"step": 293
},
{
"epoch": 0.18906752411575561,
"grad_norm": 0.6296926736831665,
"learning_rate": 0.0001962946284979093,
"loss": 4.4552,
"step": 294
},
{
"epoch": 0.18971061093247588,
"grad_norm": 0.8028717041015625,
"learning_rate": 0.00019628176262463816,
"loss": 4.5493,
"step": 295
},
{
"epoch": 0.19035369774919614,
"grad_norm": 0.655351996421814,
"learning_rate": 0.00019626889675136702,
"loss": 4.5392,
"step": 296
},
{
"epoch": 0.1909967845659164,
"grad_norm": 0.7414562702178955,
"learning_rate": 0.00019625603087809586,
"loss": 4.4959,
"step": 297
},
{
"epoch": 0.19163987138263666,
"grad_norm": 0.7951900362968445,
"learning_rate": 0.00019624316500482472,
"loss": 4.4467,
"step": 298
},
{
"epoch": 0.19228295819935692,
"grad_norm": 0.6390551328659058,
"learning_rate": 0.00019623029913155358,
"loss": 4.5044,
"step": 299
},
{
"epoch": 0.19292604501607716,
"grad_norm": 0.9853672385215759,
"learning_rate": 0.0001962174332582824,
"loss": 4.4453,
"step": 300
},
{
"epoch": 0.19356913183279742,
"grad_norm": 0.7224156260490417,
"learning_rate": 0.00019620456738501127,
"loss": 4.5398,
"step": 301
},
{
"epoch": 0.19421221864951768,
"grad_norm": 0.6318510174751282,
"learning_rate": 0.0001961917015117401,
"loss": 4.5053,
"step": 302
},
{
"epoch": 0.19485530546623794,
"grad_norm": 0.6725063920021057,
"learning_rate": 0.00019617883563846896,
"loss": 4.3985,
"step": 303
},
{
"epoch": 0.1954983922829582,
"grad_norm": 0.5671389102935791,
"learning_rate": 0.00019616596976519782,
"loss": 4.4917,
"step": 304
},
{
"epoch": 0.19614147909967847,
"grad_norm": 0.7262303233146667,
"learning_rate": 0.00019615310389192668,
"loss": 4.4999,
"step": 305
},
{
"epoch": 0.1967845659163987,
"grad_norm": 0.7654133439064026,
"learning_rate": 0.00019614023801865554,
"loss": 4.3863,
"step": 306
},
{
"epoch": 0.19742765273311896,
"grad_norm": 0.6135159134864807,
"learning_rate": 0.00019612737214538437,
"loss": 4.4282,
"step": 307
},
{
"epoch": 0.19807073954983923,
"grad_norm": 0.818827748298645,
"learning_rate": 0.00019611450627211323,
"loss": 4.3878,
"step": 308
},
{
"epoch": 0.1987138263665595,
"grad_norm": 0.6961670517921448,
"learning_rate": 0.0001961016403988421,
"loss": 4.316,
"step": 309
},
{
"epoch": 0.19935691318327975,
"grad_norm": 0.7855870127677917,
"learning_rate": 0.00019608877452557093,
"loss": 4.3459,
"step": 310
},
{
"epoch": 0.2,
"grad_norm": 0.7561706304550171,
"learning_rate": 0.0001960759086522998,
"loss": 4.4741,
"step": 311
},
{
"epoch": 0.20064308681672025,
"grad_norm": 0.6723310351371765,
"learning_rate": 0.00019606304277902862,
"loss": 4.3894,
"step": 312
},
{
"epoch": 0.2012861736334405,
"grad_norm": 0.8107967376708984,
"learning_rate": 0.00019605017690575748,
"loss": 4.4692,
"step": 313
},
{
"epoch": 0.20192926045016077,
"grad_norm": 0.634990930557251,
"learning_rate": 0.00019603731103248634,
"loss": 4.4678,
"step": 314
},
{
"epoch": 0.20257234726688103,
"grad_norm": 0.759669303894043,
"learning_rate": 0.00019602444515921517,
"loss": 4.358,
"step": 315
},
{
"epoch": 0.2032154340836013,
"grad_norm": 0.651257336139679,
"learning_rate": 0.00019601157928594406,
"loss": 4.2959,
"step": 316
},
{
"epoch": 0.20385852090032155,
"grad_norm": 0.9010285139083862,
"learning_rate": 0.0001959987134126729,
"loss": 4.4263,
"step": 317
},
{
"epoch": 0.2045016077170418,
"grad_norm": 0.6414353251457214,
"learning_rate": 0.00019598584753940175,
"loss": 4.3515,
"step": 318
},
{
"epoch": 0.20514469453376205,
"grad_norm": 1.069684624671936,
"learning_rate": 0.0001959729816661306,
"loss": 4.3952,
"step": 319
},
{
"epoch": 0.2057877813504823,
"grad_norm": 0.9279118776321411,
"learning_rate": 0.00019596011579285945,
"loss": 4.3126,
"step": 320
},
{
"epoch": 0.20643086816720257,
"grad_norm": 0.763932466506958,
"learning_rate": 0.0001959472499195883,
"loss": 4.3882,
"step": 321
},
{
"epoch": 0.20707395498392284,
"grad_norm": 1.1243666410446167,
"learning_rate": 0.00019593438404631714,
"loss": 4.3228,
"step": 322
},
{
"epoch": 0.2077170418006431,
"grad_norm": 0.7078732252120972,
"learning_rate": 0.000195921518173046,
"loss": 4.3726,
"step": 323
},
{
"epoch": 0.20836012861736333,
"grad_norm": 0.9525623917579651,
"learning_rate": 0.00019590865229977486,
"loss": 4.3597,
"step": 324
},
{
"epoch": 0.2090032154340836,
"grad_norm": 0.7362571954727173,
"learning_rate": 0.0001958957864265037,
"loss": 4.3049,
"step": 325
},
{
"epoch": 0.20964630225080386,
"grad_norm": 0.8940659165382385,
"learning_rate": 0.00019588292055323255,
"loss": 4.2689,
"step": 326
},
{
"epoch": 0.21028938906752412,
"grad_norm": 0.6892198324203491,
"learning_rate": 0.0001958700546799614,
"loss": 4.2664,
"step": 327
},
{
"epoch": 0.21093247588424438,
"grad_norm": 0.8762657046318054,
"learning_rate": 0.00019585718880669027,
"loss": 4.3218,
"step": 328
},
{
"epoch": 0.21157556270096464,
"grad_norm": 0.7590584754943848,
"learning_rate": 0.00019584432293341913,
"loss": 4.3964,
"step": 329
},
{
"epoch": 0.21221864951768488,
"grad_norm": 0.7163234949111938,
"learning_rate": 0.00019583145706014796,
"loss": 4.3316,
"step": 330
},
{
"epoch": 0.21286173633440514,
"grad_norm": 0.7926210761070251,
"learning_rate": 0.00019581859118687682,
"loss": 4.2691,
"step": 331
},
{
"epoch": 0.2135048231511254,
"grad_norm": 0.5673405528068542,
"learning_rate": 0.00019580572531360568,
"loss": 4.3685,
"step": 332
},
{
"epoch": 0.21414790996784566,
"grad_norm": 0.690924882888794,
"learning_rate": 0.00019579285944033452,
"loss": 4.3116,
"step": 333
},
{
"epoch": 0.21479099678456592,
"grad_norm": 0.5892926454544067,
"learning_rate": 0.00019577999356706338,
"loss": 4.4015,
"step": 334
},
{
"epoch": 0.21543408360128619,
"grad_norm": 0.640165388584137,
"learning_rate": 0.0001957671276937922,
"loss": 4.3069,
"step": 335
},
{
"epoch": 0.21607717041800642,
"grad_norm": 0.6696978807449341,
"learning_rate": 0.00019575426182052107,
"loss": 4.28,
"step": 336
},
{
"epoch": 0.21672025723472668,
"grad_norm": 0.5902023911476135,
"learning_rate": 0.00019574139594724993,
"loss": 4.2424,
"step": 337
},
{
"epoch": 0.21736334405144694,
"grad_norm": 0.8078274726867676,
"learning_rate": 0.0001957285300739788,
"loss": 4.198,
"step": 338
},
{
"epoch": 0.2180064308681672,
"grad_norm": 0.6141580939292908,
"learning_rate": 0.00019571566420070765,
"loss": 4.2204,
"step": 339
},
{
"epoch": 0.21864951768488747,
"grad_norm": 0.8612291812896729,
"learning_rate": 0.00019570279832743648,
"loss": 4.2663,
"step": 340
},
{
"epoch": 0.21929260450160773,
"grad_norm": 0.6782243251800537,
"learning_rate": 0.00019568993245416534,
"loss": 4.2699,
"step": 341
},
{
"epoch": 0.21993569131832796,
"grad_norm": 0.6207177042961121,
"learning_rate": 0.0001956770665808942,
"loss": 4.2526,
"step": 342
},
{
"epoch": 0.22057877813504823,
"grad_norm": 0.8165475726127625,
"learning_rate": 0.00019566420070762304,
"loss": 4.2142,
"step": 343
},
{
"epoch": 0.2212218649517685,
"grad_norm": 0.6984496116638184,
"learning_rate": 0.0001956513348343519,
"loss": 4.3011,
"step": 344
},
{
"epoch": 0.22186495176848875,
"grad_norm": 0.5956530570983887,
"learning_rate": 0.00019563846896108073,
"loss": 4.3658,
"step": 345
},
{
"epoch": 0.222508038585209,
"grad_norm": 0.5854830145835876,
"learning_rate": 0.0001956256030878096,
"loss": 4.2409,
"step": 346
},
{
"epoch": 0.22315112540192927,
"grad_norm": 0.6498653888702393,
"learning_rate": 0.00019561273721453845,
"loss": 4.3254,
"step": 347
},
{
"epoch": 0.2237942122186495,
"grad_norm": 0.7944484949111938,
"learning_rate": 0.00019559987134126728,
"loss": 4.2179,
"step": 348
},
{
"epoch": 0.22443729903536977,
"grad_norm": 0.7387772798538208,
"learning_rate": 0.00019558700546799614,
"loss": 4.1393,
"step": 349
},
{
"epoch": 0.22508038585209003,
"grad_norm": 0.8723940253257751,
"learning_rate": 0.000195574139594725,
"loss": 4.0767,
"step": 350
},
{
"epoch": 0.2257234726688103,
"grad_norm": 0.7291579842567444,
"learning_rate": 0.00019556127372145386,
"loss": 4.1707,
"step": 351
},
{
"epoch": 0.22636655948553056,
"grad_norm": 0.688996434211731,
"learning_rate": 0.00019554840784818272,
"loss": 4.3018,
"step": 352
},
{
"epoch": 0.22700964630225082,
"grad_norm": 0.7762923240661621,
"learning_rate": 0.00019553554197491155,
"loss": 4.1497,
"step": 353
},
{
"epoch": 0.22765273311897105,
"grad_norm": 0.6835098266601562,
"learning_rate": 0.00019552267610164041,
"loss": 4.1906,
"step": 354
},
{
"epoch": 0.2282958199356913,
"grad_norm": 0.8440699577331543,
"learning_rate": 0.00019550981022836925,
"loss": 4.2099,
"step": 355
},
{
"epoch": 0.22893890675241158,
"grad_norm": 0.8429433107376099,
"learning_rate": 0.0001954969443550981,
"loss": 4.2266,
"step": 356
},
{
"epoch": 0.22958199356913184,
"grad_norm": 0.6459960341453552,
"learning_rate": 0.00019548407848182697,
"loss": 4.2333,
"step": 357
},
{
"epoch": 0.2302250803858521,
"grad_norm": 0.9635197520256042,
"learning_rate": 0.0001954712126085558,
"loss": 4.1193,
"step": 358
},
{
"epoch": 0.23086816720257236,
"grad_norm": 0.6429690718650818,
"learning_rate": 0.00019545834673528466,
"loss": 4.1826,
"step": 359
},
{
"epoch": 0.2315112540192926,
"grad_norm": 0.8262260556221008,
"learning_rate": 0.00019544548086201352,
"loss": 4.1671,
"step": 360
},
{
"epoch": 0.23215434083601286,
"grad_norm": 0.6780098676681519,
"learning_rate": 0.00019543261498874238,
"loss": 4.2259,
"step": 361
},
{
"epoch": 0.23279742765273312,
"grad_norm": 0.7936576008796692,
"learning_rate": 0.00019541974911547124,
"loss": 4.0532,
"step": 362
},
{
"epoch": 0.23344051446945338,
"grad_norm": 0.8148082494735718,
"learning_rate": 0.00019540688324220007,
"loss": 4.2186,
"step": 363
},
{
"epoch": 0.23408360128617364,
"grad_norm": 0.9641815423965454,
"learning_rate": 0.00019539401736892893,
"loss": 4.1883,
"step": 364
},
{
"epoch": 0.2347266881028939,
"grad_norm": 0.8979797959327698,
"learning_rate": 0.0001953811514956578,
"loss": 4.0844,
"step": 365
},
{
"epoch": 0.23536977491961414,
"grad_norm": 0.9056326746940613,
"learning_rate": 0.00019536828562238663,
"loss": 4.1042,
"step": 366
},
{
"epoch": 0.2360128617363344,
"grad_norm": 1.0736181735992432,
"learning_rate": 0.00019535541974911549,
"loss": 4.1356,
"step": 367
},
{
"epoch": 0.23665594855305466,
"grad_norm": 0.8122031688690186,
"learning_rate": 0.00019534255387584432,
"loss": 4.096,
"step": 368
},
{
"epoch": 0.23729903536977492,
"grad_norm": 1.0321471691131592,
"learning_rate": 0.00019532968800257318,
"loss": 4.0537,
"step": 369
},
{
"epoch": 0.2379421221864952,
"grad_norm": 0.6910458207130432,
"learning_rate": 0.00019531682212930204,
"loss": 3.9984,
"step": 370
},
{
"epoch": 0.23858520900321542,
"grad_norm": 0.9629825949668884,
"learning_rate": 0.00019530395625603087,
"loss": 4.1706,
"step": 371
},
{
"epoch": 0.23922829581993568,
"grad_norm": 0.7162978649139404,
"learning_rate": 0.00019529109038275973,
"loss": 4.2802,
"step": 372
},
{
"epoch": 0.23987138263665594,
"grad_norm": 0.7020505666732788,
"learning_rate": 0.0001952782245094886,
"loss": 4.1238,
"step": 373
},
{
"epoch": 0.2405144694533762,
"grad_norm": 1.0391113758087158,
"learning_rate": 0.00019526535863621745,
"loss": 4.1262,
"step": 374
},
{
"epoch": 0.24115755627009647,
"grad_norm": 0.6830191016197205,
"learning_rate": 0.0001952524927629463,
"loss": 4.0751,
"step": 375
},
{
"epoch": 0.24180064308681673,
"grad_norm": 0.9518135786056519,
"learning_rate": 0.00019523962688967515,
"loss": 4.0223,
"step": 376
},
{
"epoch": 0.24244372990353696,
"grad_norm": 0.9503171443939209,
"learning_rate": 0.000195226761016404,
"loss": 4.1063,
"step": 377
},
{
"epoch": 0.24308681672025723,
"grad_norm": 0.7507718205451965,
"learning_rate": 0.00019521389514313284,
"loss": 4.0348,
"step": 378
},
{
"epoch": 0.2437299035369775,
"grad_norm": 0.8583810329437256,
"learning_rate": 0.0001952010292698617,
"loss": 4.0744,
"step": 379
},
{
"epoch": 0.24437299035369775,
"grad_norm": 0.9669693112373352,
"learning_rate": 0.00019518816339659056,
"loss": 4.0018,
"step": 380
},
{
"epoch": 0.245016077170418,
"grad_norm": 0.8933892846107483,
"learning_rate": 0.0001951752975233194,
"loss": 3.9325,
"step": 381
},
{
"epoch": 0.24565916398713827,
"grad_norm": 0.9541077613830566,
"learning_rate": 0.00019516243165004825,
"loss": 3.9337,
"step": 382
},
{
"epoch": 0.2463022508038585,
"grad_norm": 0.9491611123085022,
"learning_rate": 0.0001951495657767771,
"loss": 3.9845,
"step": 383
},
{
"epoch": 0.24694533762057877,
"grad_norm": 0.8005610108375549,
"learning_rate": 0.00019513669990350597,
"loss": 4.0319,
"step": 384
},
{
"epoch": 0.24758842443729903,
"grad_norm": 0.8354764580726624,
"learning_rate": 0.00019512383403023483,
"loss": 4.0569,
"step": 385
},
{
"epoch": 0.2482315112540193,
"grad_norm": 0.6670393943786621,
"learning_rate": 0.00019511096815696366,
"loss": 4.1072,
"step": 386
},
{
"epoch": 0.24887459807073956,
"grad_norm": 0.7420167922973633,
"learning_rate": 0.00019509810228369252,
"loss": 3.9871,
"step": 387
},
{
"epoch": 0.24951768488745982,
"grad_norm": 0.8710854649543762,
"learning_rate": 0.00019508523641042136,
"loss": 4.0891,
"step": 388
},
{
"epoch": 0.25016077170418005,
"grad_norm": 0.7787466049194336,
"learning_rate": 0.00019507237053715022,
"loss": 3.9738,
"step": 389
},
{
"epoch": 0.2508038585209003,
"grad_norm": 0.8813666105270386,
"learning_rate": 0.00019505950466387908,
"loss": 4.0161,
"step": 390
},
{
"epoch": 0.2514469453376206,
"grad_norm": 0.7082841992378235,
"learning_rate": 0.0001950466387906079,
"loss": 4.0657,
"step": 391
},
{
"epoch": 0.25209003215434084,
"grad_norm": 0.9131672382354736,
"learning_rate": 0.00019503377291733677,
"loss": 3.8768,
"step": 392
},
{
"epoch": 0.2527331189710611,
"grad_norm": 0.7093556523323059,
"learning_rate": 0.0001950209070440656,
"loss": 3.9413,
"step": 393
},
{
"epoch": 0.25337620578778136,
"grad_norm": 0.6677314639091492,
"learning_rate": 0.00019500804117079446,
"loss": 4.06,
"step": 394
},
{
"epoch": 0.2540192926045016,
"grad_norm": 0.7473803758621216,
"learning_rate": 0.00019499517529752332,
"loss": 4.0565,
"step": 395
},
{
"epoch": 0.2546623794212219,
"grad_norm": 0.7619471549987793,
"learning_rate": 0.00019498230942425218,
"loss": 3.993,
"step": 396
},
{
"epoch": 0.25530546623794215,
"grad_norm": 0.6803014278411865,
"learning_rate": 0.00019496944355098104,
"loss": 4.0404,
"step": 397
},
{
"epoch": 0.25594855305466235,
"grad_norm": 0.7883164882659912,
"learning_rate": 0.00019495657767770988,
"loss": 3.9816,
"step": 398
},
{
"epoch": 0.2565916398713826,
"grad_norm": 0.6208122968673706,
"learning_rate": 0.00019494371180443874,
"loss": 3.9774,
"step": 399
},
{
"epoch": 0.2572347266881029,
"grad_norm": 0.7383295893669128,
"learning_rate": 0.0001949308459311676,
"loss": 3.9462,
"step": 400
},
{
"epoch": 0.25787781350482314,
"grad_norm": 0.7095021605491638,
"learning_rate": 0.00019491798005789643,
"loss": 4.0581,
"step": 401
},
{
"epoch": 0.2585209003215434,
"grad_norm": 0.7832754254341125,
"learning_rate": 0.0001949051141846253,
"loss": 3.828,
"step": 402
},
{
"epoch": 0.25916398713826366,
"grad_norm": 0.7597861289978027,
"learning_rate": 0.00019489224831135415,
"loss": 4.018,
"step": 403
},
{
"epoch": 0.2598070739549839,
"grad_norm": 0.9110789895057678,
"learning_rate": 0.00019487938243808298,
"loss": 3.8421,
"step": 404
},
{
"epoch": 0.2604501607717042,
"grad_norm": 0.6640086770057678,
"learning_rate": 0.00019486651656481184,
"loss": 3.9569,
"step": 405
},
{
"epoch": 0.26109324758842445,
"grad_norm": 0.6850053668022156,
"learning_rate": 0.0001948536506915407,
"loss": 3.9652,
"step": 406
},
{
"epoch": 0.2617363344051447,
"grad_norm": 0.7008517980575562,
"learning_rate": 0.00019484078481826956,
"loss": 3.9146,
"step": 407
},
{
"epoch": 0.262379421221865,
"grad_norm": 0.719175398349762,
"learning_rate": 0.00019482791894499842,
"loss": 3.9555,
"step": 408
},
{
"epoch": 0.26302250803858523,
"grad_norm": 0.6538800001144409,
"learning_rate": 0.00019481505307172725,
"loss": 3.9122,
"step": 409
},
{
"epoch": 0.26366559485530544,
"grad_norm": 0.7621102929115295,
"learning_rate": 0.00019480218719845611,
"loss": 4.0086,
"step": 410
},
{
"epoch": 0.2643086816720257,
"grad_norm": 0.7248497605323792,
"learning_rate": 0.00019478932132518495,
"loss": 3.9794,
"step": 411
},
{
"epoch": 0.26495176848874596,
"grad_norm": 0.8252344131469727,
"learning_rate": 0.0001947764554519138,
"loss": 3.9337,
"step": 412
},
{
"epoch": 0.2655948553054662,
"grad_norm": 0.8331144452095032,
"learning_rate": 0.00019476358957864267,
"loss": 3.9903,
"step": 413
},
{
"epoch": 0.2662379421221865,
"grad_norm": 0.8301029801368713,
"learning_rate": 0.0001947507237053715,
"loss": 3.944,
"step": 414
},
{
"epoch": 0.26688102893890675,
"grad_norm": 0.739193856716156,
"learning_rate": 0.00019473785783210036,
"loss": 3.9245,
"step": 415
},
{
"epoch": 0.267524115755627,
"grad_norm": 0.7821977734565735,
"learning_rate": 0.0001947249919588292,
"loss": 3.9548,
"step": 416
},
{
"epoch": 0.2681672025723473,
"grad_norm": 0.8310061097145081,
"learning_rate": 0.00019471212608555808,
"loss": 3.9038,
"step": 417
},
{
"epoch": 0.26881028938906754,
"grad_norm": 0.6713045835494995,
"learning_rate": 0.00019469926021228694,
"loss": 3.9723,
"step": 418
},
{
"epoch": 0.2694533762057878,
"grad_norm": 1.0380114316940308,
"learning_rate": 0.00019468639433901577,
"loss": 3.8221,
"step": 419
},
{
"epoch": 0.27009646302250806,
"grad_norm": 0.8434408903121948,
"learning_rate": 0.00019467352846574463,
"loss": 4.0023,
"step": 420
},
{
"epoch": 0.2707395498392283,
"grad_norm": 0.9480482339859009,
"learning_rate": 0.00019466066259247347,
"loss": 3.9496,
"step": 421
},
{
"epoch": 0.27138263665594853,
"grad_norm": 0.869701623916626,
"learning_rate": 0.00019464779671920233,
"loss": 3.9507,
"step": 422
},
{
"epoch": 0.2720257234726688,
"grad_norm": 0.9808540940284729,
"learning_rate": 0.00019463493084593119,
"loss": 3.8506,
"step": 423
},
{
"epoch": 0.27266881028938905,
"grad_norm": 0.9635753035545349,
"learning_rate": 0.00019462206497266002,
"loss": 3.9128,
"step": 424
},
{
"epoch": 0.2733118971061093,
"grad_norm": 0.8993518352508545,
"learning_rate": 0.00019460919909938888,
"loss": 3.8746,
"step": 425
},
{
"epoch": 0.2739549839228296,
"grad_norm": 1.05186927318573,
"learning_rate": 0.0001945963332261177,
"loss": 4.0274,
"step": 426
},
{
"epoch": 0.27459807073954984,
"grad_norm": 1.055508017539978,
"learning_rate": 0.00019458346735284657,
"loss": 3.9418,
"step": 427
},
{
"epoch": 0.2752411575562701,
"grad_norm": 1.0845950841903687,
"learning_rate": 0.00019457060147957543,
"loss": 3.8926,
"step": 428
},
{
"epoch": 0.27588424437299036,
"grad_norm": 0.9033150672912598,
"learning_rate": 0.0001945577356063043,
"loss": 3.8776,
"step": 429
},
{
"epoch": 0.2765273311897106,
"grad_norm": 0.9415053129196167,
"learning_rate": 0.00019454486973303315,
"loss": 3.9336,
"step": 430
},
{
"epoch": 0.2771704180064309,
"grad_norm": 1.02390456199646,
"learning_rate": 0.00019453200385976198,
"loss": 3.7793,
"step": 431
},
{
"epoch": 0.27781350482315115,
"grad_norm": 0.9444904327392578,
"learning_rate": 0.00019451913798649084,
"loss": 3.9075,
"step": 432
},
{
"epoch": 0.2784565916398714,
"grad_norm": 1.0796852111816406,
"learning_rate": 0.0001945062721132197,
"loss": 3.8586,
"step": 433
},
{
"epoch": 0.2790996784565916,
"grad_norm": 0.8367241621017456,
"learning_rate": 0.00019449340623994854,
"loss": 3.93,
"step": 434
},
{
"epoch": 0.2797427652733119,
"grad_norm": 0.9733452796936035,
"learning_rate": 0.0001944805403666774,
"loss": 3.7786,
"step": 435
},
{
"epoch": 0.28038585209003214,
"grad_norm": 0.7164201736450195,
"learning_rate": 0.00019446767449340626,
"loss": 3.8682,
"step": 436
},
{
"epoch": 0.2810289389067524,
"grad_norm": 0.959409236907959,
"learning_rate": 0.0001944548086201351,
"loss": 3.8415,
"step": 437
},
{
"epoch": 0.28167202572347266,
"grad_norm": 0.9369955658912659,
"learning_rate": 0.00019444194274686395,
"loss": 3.8418,
"step": 438
},
{
"epoch": 0.2823151125401929,
"grad_norm": 1.112101435661316,
"learning_rate": 0.0001944290768735928,
"loss": 3.8607,
"step": 439
},
{
"epoch": 0.2829581993569132,
"grad_norm": 0.9493166208267212,
"learning_rate": 0.00019441621100032167,
"loss": 3.9228,
"step": 440
},
{
"epoch": 0.28360128617363345,
"grad_norm": 0.9076783061027527,
"learning_rate": 0.00019440334512705053,
"loss": 3.8624,
"step": 441
},
{
"epoch": 0.2842443729903537,
"grad_norm": 0.9614561200141907,
"learning_rate": 0.00019439047925377936,
"loss": 3.9938,
"step": 442
},
{
"epoch": 0.284887459807074,
"grad_norm": 0.9035642743110657,
"learning_rate": 0.00019437761338050822,
"loss": 3.8205,
"step": 443
},
{
"epoch": 0.28553054662379423,
"grad_norm": 0.8883315324783325,
"learning_rate": 0.00019436474750723706,
"loss": 3.8716,
"step": 444
},
{
"epoch": 0.2861736334405145,
"grad_norm": 0.7665774822235107,
"learning_rate": 0.00019435188163396592,
"loss": 3.8546,
"step": 445
},
{
"epoch": 0.2868167202572347,
"grad_norm": 0.8760365843772888,
"learning_rate": 0.00019433901576069478,
"loss": 3.8498,
"step": 446
},
{
"epoch": 0.28745980707395496,
"grad_norm": 0.9582746624946594,
"learning_rate": 0.0001943261498874236,
"loss": 3.7724,
"step": 447
},
{
"epoch": 0.2881028938906752,
"grad_norm": 0.7356158494949341,
"learning_rate": 0.00019431328401415247,
"loss": 3.8751,
"step": 448
},
{
"epoch": 0.2887459807073955,
"grad_norm": 0.8500118851661682,
"learning_rate": 0.0001943004181408813,
"loss": 3.8485,
"step": 449
},
{
"epoch": 0.28938906752411575,
"grad_norm": 1.0538520812988281,
"learning_rate": 0.00019428755226761016,
"loss": 3.8594,
"step": 450
},
{
"epoch": 0.290032154340836,
"grad_norm": 0.7139402031898499,
"learning_rate": 0.00019427468639433902,
"loss": 3.6391,
"step": 451
},
{
"epoch": 0.2906752411575563,
"grad_norm": 1.1346955299377441,
"learning_rate": 0.00019426182052106788,
"loss": 3.8601,
"step": 452
},
{
"epoch": 0.29131832797427654,
"grad_norm": 0.8908848762512207,
"learning_rate": 0.00019424895464779674,
"loss": 3.7756,
"step": 453
},
{
"epoch": 0.2919614147909968,
"grad_norm": 0.9202871918678284,
"learning_rate": 0.00019423608877452557,
"loss": 3.7043,
"step": 454
},
{
"epoch": 0.29260450160771706,
"grad_norm": 1.1041529178619385,
"learning_rate": 0.00019422322290125443,
"loss": 3.8593,
"step": 455
},
{
"epoch": 0.2932475884244373,
"grad_norm": 0.8448892831802368,
"learning_rate": 0.0001942103570279833,
"loss": 3.7343,
"step": 456
},
{
"epoch": 0.2938906752411576,
"grad_norm": 1.2526445388793945,
"learning_rate": 0.00019419749115471213,
"loss": 3.7036,
"step": 457
},
{
"epoch": 0.2945337620578778,
"grad_norm": 0.9721910953521729,
"learning_rate": 0.000194184625281441,
"loss": 3.8348,
"step": 458
},
{
"epoch": 0.29517684887459805,
"grad_norm": 1.0412633419036865,
"learning_rate": 0.00019417175940816982,
"loss": 3.7733,
"step": 459
},
{
"epoch": 0.2958199356913183,
"grad_norm": 1.1480042934417725,
"learning_rate": 0.00019415889353489868,
"loss": 3.8188,
"step": 460
},
{
"epoch": 0.2964630225080386,
"grad_norm": 0.8680059909820557,
"learning_rate": 0.00019414602766162754,
"loss": 3.8395,
"step": 461
},
{
"epoch": 0.29710610932475884,
"grad_norm": 1.0635613203048706,
"learning_rate": 0.0001941331617883564,
"loss": 3.7715,
"step": 462
},
{
"epoch": 0.2977491961414791,
"grad_norm": 1.0242252349853516,
"learning_rate": 0.00019412029591508526,
"loss": 3.7702,
"step": 463
},
{
"epoch": 0.29839228295819936,
"grad_norm": 0.841439962387085,
"learning_rate": 0.0001941074300418141,
"loss": 3.7487,
"step": 464
},
{
"epoch": 0.2990353697749196,
"grad_norm": 1.0437833070755005,
"learning_rate": 0.00019409456416854295,
"loss": 3.8626,
"step": 465
},
{
"epoch": 0.2996784565916399,
"grad_norm": 0.9704720377922058,
"learning_rate": 0.0001940816982952718,
"loss": 3.7011,
"step": 466
},
{
"epoch": 0.30032154340836015,
"grad_norm": 1.080511212348938,
"learning_rate": 0.00019406883242200065,
"loss": 3.7024,
"step": 467
},
{
"epoch": 0.3009646302250804,
"grad_norm": 1.121840000152588,
"learning_rate": 0.0001940559665487295,
"loss": 3.7532,
"step": 468
},
{
"epoch": 0.3016077170418006,
"grad_norm": 0.9904775023460388,
"learning_rate": 0.00019404310067545837,
"loss": 3.7648,
"step": 469
},
{
"epoch": 0.3022508038585209,
"grad_norm": 1.0320771932601929,
"learning_rate": 0.0001940302348021872,
"loss": 3.6929,
"step": 470
},
{
"epoch": 0.30289389067524114,
"grad_norm": 1.1518800258636475,
"learning_rate": 0.00019401736892891606,
"loss": 3.8161,
"step": 471
},
{
"epoch": 0.3035369774919614,
"grad_norm": 0.9655705094337463,
"learning_rate": 0.0001940045030556449,
"loss": 3.7359,
"step": 472
},
{
"epoch": 0.30418006430868166,
"grad_norm": 0.9549182653427124,
"learning_rate": 0.00019399163718237375,
"loss": 3.7,
"step": 473
},
{
"epoch": 0.3048231511254019,
"grad_norm": 0.914478063583374,
"learning_rate": 0.0001939787713091026,
"loss": 3.7204,
"step": 474
},
{
"epoch": 0.3054662379421222,
"grad_norm": 0.8414446711540222,
"learning_rate": 0.00019396590543583147,
"loss": 3.8886,
"step": 475
},
{
"epoch": 0.30610932475884245,
"grad_norm": 1.0151910781860352,
"learning_rate": 0.00019395303956256033,
"loss": 3.7341,
"step": 476
},
{
"epoch": 0.3067524115755627,
"grad_norm": 0.9511162042617798,
"learning_rate": 0.00019394017368928916,
"loss": 3.6728,
"step": 477
},
{
"epoch": 0.307395498392283,
"grad_norm": 0.8905565142631531,
"learning_rate": 0.00019392730781601802,
"loss": 3.7416,
"step": 478
},
{
"epoch": 0.30803858520900324,
"grad_norm": 0.9920605421066284,
"learning_rate": 0.00019391444194274688,
"loss": 3.7493,
"step": 479
},
{
"epoch": 0.3086816720257235,
"grad_norm": 0.7968128323554993,
"learning_rate": 0.00019390157606947572,
"loss": 3.7592,
"step": 480
},
{
"epoch": 0.3093247588424437,
"grad_norm": 1.2561795711517334,
"learning_rate": 0.00019388871019620458,
"loss": 3.7363,
"step": 481
},
{
"epoch": 0.30996784565916397,
"grad_norm": 0.8600074648857117,
"learning_rate": 0.0001938758443229334,
"loss": 3.7102,
"step": 482
},
{
"epoch": 0.3106109324758842,
"grad_norm": 0.9748154282569885,
"learning_rate": 0.00019386297844966227,
"loss": 3.8,
"step": 483
},
{
"epoch": 0.3112540192926045,
"grad_norm": 1.0293912887573242,
"learning_rate": 0.00019385011257639113,
"loss": 3.6432,
"step": 484
},
{
"epoch": 0.31189710610932475,
"grad_norm": 0.7619652152061462,
"learning_rate": 0.00019383724670312,
"loss": 3.6905,
"step": 485
},
{
"epoch": 0.312540192926045,
"grad_norm": 1.2902241945266724,
"learning_rate": 0.00019382438082984885,
"loss": 3.7061,
"step": 486
},
{
"epoch": 0.3131832797427653,
"grad_norm": 0.8420130610466003,
"learning_rate": 0.00019381151495657768,
"loss": 3.7971,
"step": 487
},
{
"epoch": 0.31382636655948554,
"grad_norm": 1.0442159175872803,
"learning_rate": 0.00019379864908330654,
"loss": 3.6643,
"step": 488
},
{
"epoch": 0.3144694533762058,
"grad_norm": 0.7481032609939575,
"learning_rate": 0.0001937857832100354,
"loss": 3.6,
"step": 489
},
{
"epoch": 0.31511254019292606,
"grad_norm": 0.8739355802536011,
"learning_rate": 0.00019377291733676424,
"loss": 3.6363,
"step": 490
},
{
"epoch": 0.3157556270096463,
"grad_norm": 0.7196866273880005,
"learning_rate": 0.0001937600514634931,
"loss": 3.7621,
"step": 491
},
{
"epoch": 0.3163987138263666,
"grad_norm": 0.7693118453025818,
"learning_rate": 0.00019374718559022193,
"loss": 3.7453,
"step": 492
},
{
"epoch": 0.3170418006430868,
"grad_norm": 0.8000239729881287,
"learning_rate": 0.0001937343197169508,
"loss": 3.697,
"step": 493
},
{
"epoch": 0.31768488745980705,
"grad_norm": 0.6940252780914307,
"learning_rate": 0.00019372145384367965,
"loss": 3.6958,
"step": 494
},
{
"epoch": 0.3183279742765273,
"grad_norm": 0.8711187243461609,
"learning_rate": 0.00019370858797040848,
"loss": 3.8204,
"step": 495
},
{
"epoch": 0.3189710610932476,
"grad_norm": 0.8630790114402771,
"learning_rate": 0.00019369572209713737,
"loss": 3.6836,
"step": 496
},
{
"epoch": 0.31961414790996784,
"grad_norm": 0.8443520665168762,
"learning_rate": 0.0001936828562238662,
"loss": 3.7011,
"step": 497
},
{
"epoch": 0.3202572347266881,
"grad_norm": 0.9406055808067322,
"learning_rate": 0.00019366999035059506,
"loss": 3.6641,
"step": 498
},
{
"epoch": 0.32090032154340836,
"grad_norm": 0.7495781779289246,
"learning_rate": 0.00019365712447732392,
"loss": 3.6973,
"step": 499
},
{
"epoch": 0.3215434083601286,
"grad_norm": 0.7676818370819092,
"learning_rate": 0.00019364425860405275,
"loss": 3.7072,
"step": 500
},
{
"epoch": 0.3221864951768489,
"grad_norm": 0.9088597893714905,
"learning_rate": 0.00019363139273078161,
"loss": 3.7242,
"step": 501
},
{
"epoch": 0.32282958199356915,
"grad_norm": 0.8267994523048401,
"learning_rate": 0.00019361852685751047,
"loss": 3.5553,
"step": 502
},
{
"epoch": 0.3234726688102894,
"grad_norm": 0.7666113376617432,
"learning_rate": 0.0001936056609842393,
"loss": 3.5928,
"step": 503
},
{
"epoch": 0.32411575562700967,
"grad_norm": 0.7497265338897705,
"learning_rate": 0.00019359279511096817,
"loss": 3.673,
"step": 504
},
{
"epoch": 0.3247588424437299,
"grad_norm": 0.925579845905304,
"learning_rate": 0.000193579929237697,
"loss": 3.6482,
"step": 505
},
{
"epoch": 0.32540192926045014,
"grad_norm": 1.0170692205429077,
"learning_rate": 0.00019356706336442586,
"loss": 3.6582,
"step": 506
},
{
"epoch": 0.3260450160771704,
"grad_norm": 0.8777898550033569,
"learning_rate": 0.00019355419749115472,
"loss": 3.6206,
"step": 507
},
{
"epoch": 0.32668810289389066,
"grad_norm": 0.907271683216095,
"learning_rate": 0.00019354133161788358,
"loss": 3.6647,
"step": 508
},
{
"epoch": 0.3273311897106109,
"grad_norm": 1.1781278848648071,
"learning_rate": 0.00019352846574461244,
"loss": 3.6539,
"step": 509
},
{
"epoch": 0.3279742765273312,
"grad_norm": 0.8342487215995789,
"learning_rate": 0.00019351559987134127,
"loss": 3.6976,
"step": 510
},
{
"epoch": 0.32861736334405145,
"grad_norm": 1.1345239877700806,
"learning_rate": 0.00019350273399807013,
"loss": 3.7446,
"step": 511
},
{
"epoch": 0.3292604501607717,
"grad_norm": 1.1331250667572021,
"learning_rate": 0.000193489868124799,
"loss": 3.6652,
"step": 512
},
{
"epoch": 0.329903536977492,
"grad_norm": 0.9640575647354126,
"learning_rate": 0.00019347700225152783,
"loss": 3.7036,
"step": 513
},
{
"epoch": 0.33054662379421224,
"grad_norm": 0.947372555732727,
"learning_rate": 0.00019346413637825669,
"loss": 3.7328,
"step": 514
},
{
"epoch": 0.3311897106109325,
"grad_norm": 0.9356391429901123,
"learning_rate": 0.00019345127050498552,
"loss": 3.6027,
"step": 515
},
{
"epoch": 0.33183279742765276,
"grad_norm": 0.8815325498580933,
"learning_rate": 0.00019343840463171438,
"loss": 3.69,
"step": 516
},
{
"epoch": 0.33247588424437297,
"grad_norm": 0.9109652042388916,
"learning_rate": 0.00019342553875844324,
"loss": 3.7513,
"step": 517
},
{
"epoch": 0.3331189710610932,
"grad_norm": 0.9772228002548218,
"learning_rate": 0.0001934126728851721,
"loss": 3.6968,
"step": 518
},
{
"epoch": 0.3337620578778135,
"grad_norm": 0.992035448551178,
"learning_rate": 0.00019339980701190096,
"loss": 3.604,
"step": 519
},
{
"epoch": 0.33440514469453375,
"grad_norm": 0.9157900810241699,
"learning_rate": 0.0001933869411386298,
"loss": 3.6083,
"step": 520
},
{
"epoch": 0.335048231511254,
"grad_norm": 1.0214614868164062,
"learning_rate": 0.00019337407526535865,
"loss": 3.62,
"step": 521
},
{
"epoch": 0.3356913183279743,
"grad_norm": 0.9241607785224915,
"learning_rate": 0.0001933612093920875,
"loss": 3.6061,
"step": 522
},
{
"epoch": 0.33633440514469454,
"grad_norm": 1.1566587686538696,
"learning_rate": 0.00019334834351881634,
"loss": 3.6324,
"step": 523
},
{
"epoch": 0.3369774919614148,
"grad_norm": 0.8433347940444946,
"learning_rate": 0.0001933354776455452,
"loss": 3.548,
"step": 524
},
{
"epoch": 0.33762057877813506,
"grad_norm": 0.9520887136459351,
"learning_rate": 0.00019332261177227404,
"loss": 3.6148,
"step": 525
},
{
"epoch": 0.3382636655948553,
"grad_norm": 0.974737823009491,
"learning_rate": 0.0001933097458990029,
"loss": 3.6365,
"step": 526
},
{
"epoch": 0.3389067524115756,
"grad_norm": 0.8349432945251465,
"learning_rate": 0.00019329688002573176,
"loss": 3.6434,
"step": 527
},
{
"epoch": 0.33954983922829585,
"grad_norm": 1.0445458889007568,
"learning_rate": 0.0001932840141524606,
"loss": 3.4959,
"step": 528
},
{
"epoch": 0.34019292604501605,
"grad_norm": 0.7769454121589661,
"learning_rate": 0.00019327114827918945,
"loss": 3.5066,
"step": 529
},
{
"epoch": 0.3408360128617363,
"grad_norm": 0.8699477910995483,
"learning_rate": 0.0001932582824059183,
"loss": 3.683,
"step": 530
},
{
"epoch": 0.3414790996784566,
"grad_norm": 0.834698498249054,
"learning_rate": 0.00019324541653264717,
"loss": 3.6224,
"step": 531
},
{
"epoch": 0.34212218649517684,
"grad_norm": 0.8166964054107666,
"learning_rate": 0.00019323255065937603,
"loss": 3.7364,
"step": 532
},
{
"epoch": 0.3427652733118971,
"grad_norm": 0.8357145190238953,
"learning_rate": 0.00019321968478610486,
"loss": 3.5562,
"step": 533
},
{
"epoch": 0.34340836012861736,
"grad_norm": 0.9249911904335022,
"learning_rate": 0.00019320681891283372,
"loss": 3.5898,
"step": 534
},
{
"epoch": 0.3440514469453376,
"grad_norm": 0.7656811475753784,
"learning_rate": 0.00019319395303956256,
"loss": 3.5595,
"step": 535
},
{
"epoch": 0.3446945337620579,
"grad_norm": 0.892476499080658,
"learning_rate": 0.00019318108716629142,
"loss": 3.681,
"step": 536
},
{
"epoch": 0.34533762057877815,
"grad_norm": 0.8413558006286621,
"learning_rate": 0.00019316822129302028,
"loss": 3.7022,
"step": 537
},
{
"epoch": 0.3459807073954984,
"grad_norm": 0.7751324772834778,
"learning_rate": 0.0001931553554197491,
"loss": 3.5837,
"step": 538
},
{
"epoch": 0.34662379421221867,
"grad_norm": 0.8419694304466248,
"learning_rate": 0.00019314248954647797,
"loss": 3.6111,
"step": 539
},
{
"epoch": 0.34726688102893893,
"grad_norm": 1.005031943321228,
"learning_rate": 0.00019312962367320683,
"loss": 3.5626,
"step": 540
},
{
"epoch": 0.34790996784565914,
"grad_norm": 0.9872586131095886,
"learning_rate": 0.0001931167577999357,
"loss": 3.5682,
"step": 541
},
{
"epoch": 0.3485530546623794,
"grad_norm": 0.8768367767333984,
"learning_rate": 0.00019310389192666455,
"loss": 3.6085,
"step": 542
},
{
"epoch": 0.34919614147909966,
"grad_norm": 0.9605547189712524,
"learning_rate": 0.00019309102605339338,
"loss": 3.6829,
"step": 543
},
{
"epoch": 0.3498392282958199,
"grad_norm": 0.9464700222015381,
"learning_rate": 0.00019307816018012224,
"loss": 3.6298,
"step": 544
},
{
"epoch": 0.3504823151125402,
"grad_norm": 0.863048791885376,
"learning_rate": 0.0001930652943068511,
"loss": 3.4854,
"step": 545
},
{
"epoch": 0.35112540192926045,
"grad_norm": 1.1198543310165405,
"learning_rate": 0.00019305242843357993,
"loss": 3.4904,
"step": 546
},
{
"epoch": 0.3517684887459807,
"grad_norm": 0.8883192539215088,
"learning_rate": 0.0001930395625603088,
"loss": 3.555,
"step": 547
},
{
"epoch": 0.352411575562701,
"grad_norm": 0.9818452596664429,
"learning_rate": 0.00019302669668703763,
"loss": 3.5089,
"step": 548
},
{
"epoch": 0.35305466237942124,
"grad_norm": 0.9535132646560669,
"learning_rate": 0.0001930138308137665,
"loss": 3.4332,
"step": 549
},
{
"epoch": 0.3536977491961415,
"grad_norm": 1.1151210069656372,
"learning_rate": 0.00019300096494049535,
"loss": 3.5434,
"step": 550
},
{
"epoch": 0.35434083601286176,
"grad_norm": 1.3266632556915283,
"learning_rate": 0.00019298809906722418,
"loss": 3.5069,
"step": 551
},
{
"epoch": 0.35498392282958197,
"grad_norm": 1.041902780532837,
"learning_rate": 0.00019297523319395304,
"loss": 3.5696,
"step": 552
},
{
"epoch": 0.35562700964630223,
"grad_norm": 1.1225471496582031,
"learning_rate": 0.0001929623673206819,
"loss": 3.6051,
"step": 553
},
{
"epoch": 0.3562700964630225,
"grad_norm": 1.0280983448028564,
"learning_rate": 0.00019294950144741076,
"loss": 3.5618,
"step": 554
},
{
"epoch": 0.35691318327974275,
"grad_norm": 1.1007379293441772,
"learning_rate": 0.00019293663557413962,
"loss": 3.6334,
"step": 555
},
{
"epoch": 0.357556270096463,
"grad_norm": 1.260547161102295,
"learning_rate": 0.00019292376970086845,
"loss": 3.5079,
"step": 556
},
{
"epoch": 0.3581993569131833,
"grad_norm": 0.7994619607925415,
"learning_rate": 0.0001929109038275973,
"loss": 3.4662,
"step": 557
},
{
"epoch": 0.35884244372990354,
"grad_norm": 1.1786779165267944,
"learning_rate": 0.00019289803795432615,
"loss": 3.5494,
"step": 558
},
{
"epoch": 0.3594855305466238,
"grad_norm": 1.115881085395813,
"learning_rate": 0.000192885172081055,
"loss": 3.58,
"step": 559
},
{
"epoch": 0.36012861736334406,
"grad_norm": 1.2896467447280884,
"learning_rate": 0.00019287230620778387,
"loss": 3.5807,
"step": 560
},
{
"epoch": 0.3607717041800643,
"grad_norm": 1.0027967691421509,
"learning_rate": 0.0001928594403345127,
"loss": 3.56,
"step": 561
},
{
"epoch": 0.3614147909967846,
"grad_norm": 1.3239651918411255,
"learning_rate": 0.00019284657446124156,
"loss": 3.4984,
"step": 562
},
{
"epoch": 0.36205787781350485,
"grad_norm": 1.21293044090271,
"learning_rate": 0.00019283370858797042,
"loss": 3.5481,
"step": 563
},
{
"epoch": 0.36270096463022505,
"grad_norm": 0.9017782807350159,
"learning_rate": 0.00019282084271469928,
"loss": 3.5069,
"step": 564
},
{
"epoch": 0.3633440514469453,
"grad_norm": 1.3459876775741577,
"learning_rate": 0.00019280797684142814,
"loss": 3.5389,
"step": 565
},
{
"epoch": 0.3639871382636656,
"grad_norm": 1.2981561422348022,
"learning_rate": 0.00019279511096815697,
"loss": 3.4336,
"step": 566
},
{
"epoch": 0.36463022508038584,
"grad_norm": 1.0287895202636719,
"learning_rate": 0.00019278224509488583,
"loss": 3.5067,
"step": 567
},
{
"epoch": 0.3652733118971061,
"grad_norm": 1.131381630897522,
"learning_rate": 0.00019276937922161467,
"loss": 3.5079,
"step": 568
},
{
"epoch": 0.36591639871382636,
"grad_norm": 0.9513139724731445,
"learning_rate": 0.00019275651334834353,
"loss": 3.5221,
"step": 569
},
{
"epoch": 0.3665594855305466,
"grad_norm": 0.9313328266143799,
"learning_rate": 0.00019274364747507239,
"loss": 3.5944,
"step": 570
},
{
"epoch": 0.3672025723472669,
"grad_norm": 0.8763944506645203,
"learning_rate": 0.00019273078160180122,
"loss": 3.535,
"step": 571
},
{
"epoch": 0.36784565916398715,
"grad_norm": 1.1983128786087036,
"learning_rate": 0.00019271791572853008,
"loss": 3.6055,
"step": 572
},
{
"epoch": 0.3684887459807074,
"grad_norm": 0.8382171392440796,
"learning_rate": 0.00019270504985525894,
"loss": 3.4601,
"step": 573
},
{
"epoch": 0.3691318327974277,
"grad_norm": 0.8634599447250366,
"learning_rate": 0.00019269218398198777,
"loss": 3.3993,
"step": 574
},
{
"epoch": 0.36977491961414793,
"grad_norm": 1.220293402671814,
"learning_rate": 0.00019267931810871666,
"loss": 3.4932,
"step": 575
},
{
"epoch": 0.37041800643086814,
"grad_norm": 1.0614768266677856,
"learning_rate": 0.0001926664522354455,
"loss": 3.5723,
"step": 576
},
{
"epoch": 0.3710610932475884,
"grad_norm": 1.0492634773254395,
"learning_rate": 0.00019265358636217435,
"loss": 3.5106,
"step": 577
},
{
"epoch": 0.37170418006430866,
"grad_norm": 1.4759819507598877,
"learning_rate": 0.0001926407204889032,
"loss": 3.4732,
"step": 578
},
{
"epoch": 0.3723472668810289,
"grad_norm": 1.150651454925537,
"learning_rate": 0.00019262785461563204,
"loss": 3.5596,
"step": 579
},
{
"epoch": 0.3729903536977492,
"grad_norm": 1.0370066165924072,
"learning_rate": 0.0001926149887423609,
"loss": 3.4118,
"step": 580
},
{
"epoch": 0.37363344051446945,
"grad_norm": 1.4111131429672241,
"learning_rate": 0.00019260212286908974,
"loss": 3.3447,
"step": 581
},
{
"epoch": 0.3742765273311897,
"grad_norm": 0.9858524203300476,
"learning_rate": 0.0001925892569958186,
"loss": 3.4701,
"step": 582
},
{
"epoch": 0.37491961414791,
"grad_norm": 1.0511631965637207,
"learning_rate": 0.00019257639112254746,
"loss": 3.5845,
"step": 583
},
{
"epoch": 0.37556270096463024,
"grad_norm": 1.1130648851394653,
"learning_rate": 0.0001925635252492763,
"loss": 3.3556,
"step": 584
},
{
"epoch": 0.3762057877813505,
"grad_norm": 1.1107404232025146,
"learning_rate": 0.00019255065937600515,
"loss": 3.4739,
"step": 585
},
{
"epoch": 0.37684887459807076,
"grad_norm": 1.00481116771698,
"learning_rate": 0.000192537793502734,
"loss": 3.5032,
"step": 586
},
{
"epoch": 0.377491961414791,
"grad_norm": 1.0136293172836304,
"learning_rate": 0.00019252492762946287,
"loss": 3.4813,
"step": 587
},
{
"epoch": 0.37813504823151123,
"grad_norm": 0.810249388217926,
"learning_rate": 0.00019251206175619173,
"loss": 3.4907,
"step": 588
},
{
"epoch": 0.3787781350482315,
"grad_norm": 1.0708262920379639,
"learning_rate": 0.00019249919588292056,
"loss": 3.492,
"step": 589
},
{
"epoch": 0.37942122186495175,
"grad_norm": 1.0980275869369507,
"learning_rate": 0.00019248633000964942,
"loss": 3.45,
"step": 590
},
{
"epoch": 0.380064308681672,
"grad_norm": 1.0108336210250854,
"learning_rate": 0.00019247346413637826,
"loss": 3.4736,
"step": 591
},
{
"epoch": 0.3807073954983923,
"grad_norm": 1.0177563428878784,
"learning_rate": 0.00019246059826310712,
"loss": 3.4808,
"step": 592
},
{
"epoch": 0.38135048231511254,
"grad_norm": 1.3187386989593506,
"learning_rate": 0.00019244773238983598,
"loss": 3.4965,
"step": 593
},
{
"epoch": 0.3819935691318328,
"grad_norm": 0.9032576084136963,
"learning_rate": 0.0001924348665165648,
"loss": 3.4265,
"step": 594
},
{
"epoch": 0.38263665594855306,
"grad_norm": 0.9487648606300354,
"learning_rate": 0.00019242200064329367,
"loss": 3.4562,
"step": 595
},
{
"epoch": 0.3832797427652733,
"grad_norm": 1.0431288480758667,
"learning_rate": 0.0001924091347700225,
"loss": 3.4572,
"step": 596
},
{
"epoch": 0.3839228295819936,
"grad_norm": 1.07583487033844,
"learning_rate": 0.0001923962688967514,
"loss": 3.5218,
"step": 597
},
{
"epoch": 0.38456591639871385,
"grad_norm": 0.9846882820129395,
"learning_rate": 0.00019238340302348025,
"loss": 3.4248,
"step": 598
},
{
"epoch": 0.3852090032154341,
"grad_norm": 1.018010139465332,
"learning_rate": 0.00019237053715020908,
"loss": 3.4861,
"step": 599
},
{
"epoch": 0.3858520900321543,
"grad_norm": 1.1103070974349976,
"learning_rate": 0.00019235767127693794,
"loss": 3.4394,
"step": 600
},
{
"epoch": 0.3864951768488746,
"grad_norm": 1.005889654159546,
"learning_rate": 0.00019234480540366677,
"loss": 3.503,
"step": 601
},
{
"epoch": 0.38713826366559484,
"grad_norm": 1.321091651916504,
"learning_rate": 0.00019233193953039563,
"loss": 3.5028,
"step": 602
},
{
"epoch": 0.3877813504823151,
"grad_norm": 1.2860043048858643,
"learning_rate": 0.0001923190736571245,
"loss": 3.5034,
"step": 603
},
{
"epoch": 0.38842443729903536,
"grad_norm": 1.118884801864624,
"learning_rate": 0.00019230620778385333,
"loss": 3.456,
"step": 604
},
{
"epoch": 0.3890675241157556,
"grad_norm": 1.0882989168167114,
"learning_rate": 0.0001922933419105822,
"loss": 3.4183,
"step": 605
},
{
"epoch": 0.3897106109324759,
"grad_norm": 0.8584060072898865,
"learning_rate": 0.00019228047603731105,
"loss": 3.5375,
"step": 606
},
{
"epoch": 0.39035369774919615,
"grad_norm": 0.9511725902557373,
"learning_rate": 0.00019226761016403988,
"loss": 3.4394,
"step": 607
},
{
"epoch": 0.3909967845659164,
"grad_norm": 1.0036194324493408,
"learning_rate": 0.00019225474429076874,
"loss": 3.4764,
"step": 608
},
{
"epoch": 0.3916398713826367,
"grad_norm": 1.0174047946929932,
"learning_rate": 0.0001922418784174976,
"loss": 3.3898,
"step": 609
},
{
"epoch": 0.39228295819935693,
"grad_norm": 1.1916203498840332,
"learning_rate": 0.00019222901254422646,
"loss": 3.2736,
"step": 610
},
{
"epoch": 0.3929260450160772,
"grad_norm": 1.096472144126892,
"learning_rate": 0.00019221614667095532,
"loss": 3.4986,
"step": 611
},
{
"epoch": 0.3935691318327974,
"grad_norm": 1.0956705808639526,
"learning_rate": 0.00019220328079768415,
"loss": 3.3589,
"step": 612
},
{
"epoch": 0.39421221864951767,
"grad_norm": 1.2517657279968262,
"learning_rate": 0.000192190414924413,
"loss": 3.4869,
"step": 613
},
{
"epoch": 0.3948553054662379,
"grad_norm": 1.0856648683547974,
"learning_rate": 0.00019217754905114185,
"loss": 3.5259,
"step": 614
},
{
"epoch": 0.3954983922829582,
"grad_norm": 0.9672216773033142,
"learning_rate": 0.0001921646831778707,
"loss": 3.3918,
"step": 615
},
{
"epoch": 0.39614147909967845,
"grad_norm": 1.1243929862976074,
"learning_rate": 0.00019215181730459957,
"loss": 3.3397,
"step": 616
},
{
"epoch": 0.3967845659163987,
"grad_norm": 1.1399009227752686,
"learning_rate": 0.0001921389514313284,
"loss": 3.4099,
"step": 617
},
{
"epoch": 0.397427652733119,
"grad_norm": 1.0743727684020996,
"learning_rate": 0.00019212608555805726,
"loss": 3.3701,
"step": 618
},
{
"epoch": 0.39807073954983924,
"grad_norm": 0.9858459830284119,
"learning_rate": 0.00019211321968478612,
"loss": 3.4865,
"step": 619
},
{
"epoch": 0.3987138263665595,
"grad_norm": 1.0459102392196655,
"learning_rate": 0.00019210035381151498,
"loss": 3.4858,
"step": 620
},
{
"epoch": 0.39935691318327976,
"grad_norm": 1.106679081916809,
"learning_rate": 0.00019208748793824384,
"loss": 3.4588,
"step": 621
},
{
"epoch": 0.4,
"grad_norm": 1.1907827854156494,
"learning_rate": 0.00019207462206497267,
"loss": 3.41,
"step": 622
},
{
"epoch": 0.4006430868167203,
"grad_norm": 1.1031006574630737,
"learning_rate": 0.00019206175619170153,
"loss": 3.4571,
"step": 623
},
{
"epoch": 0.4012861736334405,
"grad_norm": 1.2004433870315552,
"learning_rate": 0.00019204889031843036,
"loss": 3.3627,
"step": 624
},
{
"epoch": 0.40192926045016075,
"grad_norm": 1.3993326425552368,
"learning_rate": 0.00019203602444515922,
"loss": 3.5063,
"step": 625
},
{
"epoch": 0.402572347266881,
"grad_norm": 1.4372771978378296,
"learning_rate": 0.00019202315857188808,
"loss": 3.3901,
"step": 626
},
{
"epoch": 0.4032154340836013,
"grad_norm": 1.3181085586547852,
"learning_rate": 0.00019201029269861692,
"loss": 3.4757,
"step": 627
},
{
"epoch": 0.40385852090032154,
"grad_norm": 1.3685520887374878,
"learning_rate": 0.00019199742682534578,
"loss": 3.3122,
"step": 628
},
{
"epoch": 0.4045016077170418,
"grad_norm": 1.124450445175171,
"learning_rate": 0.0001919845609520746,
"loss": 3.3933,
"step": 629
},
{
"epoch": 0.40514469453376206,
"grad_norm": 1.247326135635376,
"learning_rate": 0.00019197169507880347,
"loss": 3.434,
"step": 630
},
{
"epoch": 0.4057877813504823,
"grad_norm": 1.3969745635986328,
"learning_rate": 0.00019195882920553233,
"loss": 3.3692,
"step": 631
},
{
"epoch": 0.4064308681672026,
"grad_norm": 1.0851151943206787,
"learning_rate": 0.0001919459633322612,
"loss": 3.3945,
"step": 632
},
{
"epoch": 0.40707395498392285,
"grad_norm": 1.0287741422653198,
"learning_rate": 0.00019193309745899005,
"loss": 3.4111,
"step": 633
},
{
"epoch": 0.4077170418006431,
"grad_norm": 1.1267411708831787,
"learning_rate": 0.00019192023158571888,
"loss": 3.395,
"step": 634
},
{
"epoch": 0.40836012861736337,
"grad_norm": 0.9802685379981995,
"learning_rate": 0.00019190736571244774,
"loss": 3.3637,
"step": 635
},
{
"epoch": 0.4090032154340836,
"grad_norm": 1.075246810913086,
"learning_rate": 0.0001918944998391766,
"loss": 3.3833,
"step": 636
},
{
"epoch": 0.40964630225080384,
"grad_norm": 0.8767859935760498,
"learning_rate": 0.00019188163396590544,
"loss": 3.2365,
"step": 637
},
{
"epoch": 0.4102893890675241,
"grad_norm": 1.2082061767578125,
"learning_rate": 0.0001918687680926343,
"loss": 3.4539,
"step": 638
},
{
"epoch": 0.41093247588424436,
"grad_norm": 1.1632206439971924,
"learning_rate": 0.00019185590221936316,
"loss": 3.4249,
"step": 639
},
{
"epoch": 0.4115755627009646,
"grad_norm": 0.8956990242004395,
"learning_rate": 0.000191843036346092,
"loss": 3.4036,
"step": 640
},
{
"epoch": 0.4122186495176849,
"grad_norm": 1.136662244796753,
"learning_rate": 0.00019183017047282085,
"loss": 3.2916,
"step": 641
},
{
"epoch": 0.41286173633440515,
"grad_norm": 1.1098051071166992,
"learning_rate": 0.0001918173045995497,
"loss": 3.3685,
"step": 642
},
{
"epoch": 0.4135048231511254,
"grad_norm": 1.0665825605392456,
"learning_rate": 0.00019180443872627857,
"loss": 3.3959,
"step": 643
},
{
"epoch": 0.4141479099678457,
"grad_norm": 1.1620614528656006,
"learning_rate": 0.00019179157285300743,
"loss": 3.4566,
"step": 644
},
{
"epoch": 0.41479099678456594,
"grad_norm": 0.905841588973999,
"learning_rate": 0.00019177870697973626,
"loss": 3.3252,
"step": 645
},
{
"epoch": 0.4154340836012862,
"grad_norm": 1.4927425384521484,
"learning_rate": 0.00019176584110646512,
"loss": 3.5348,
"step": 646
},
{
"epoch": 0.4160771704180064,
"grad_norm": 1.1315077543258667,
"learning_rate": 0.00019175297523319395,
"loss": 3.3393,
"step": 647
},
{
"epoch": 0.41672025723472667,
"grad_norm": 0.9733510613441467,
"learning_rate": 0.00019174010935992281,
"loss": 3.3689,
"step": 648
},
{
"epoch": 0.4173633440514469,
"grad_norm": 1.0324153900146484,
"learning_rate": 0.00019172724348665167,
"loss": 3.3592,
"step": 649
},
{
"epoch": 0.4180064308681672,
"grad_norm": 1.057694435119629,
"learning_rate": 0.0001917143776133805,
"loss": 3.4237,
"step": 650
},
{
"epoch": 0.41864951768488745,
"grad_norm": 1.2631813287734985,
"learning_rate": 0.00019170151174010937,
"loss": 3.2269,
"step": 651
},
{
"epoch": 0.4192926045016077,
"grad_norm": 0.9592750668525696,
"learning_rate": 0.0001916886458668382,
"loss": 3.2824,
"step": 652
},
{
"epoch": 0.419935691318328,
"grad_norm": 1.085555911064148,
"learning_rate": 0.00019167577999356706,
"loss": 3.3289,
"step": 653
},
{
"epoch": 0.42057877813504824,
"grad_norm": 1.005272388458252,
"learning_rate": 0.00019166291412029592,
"loss": 3.3593,
"step": 654
},
{
"epoch": 0.4212218649517685,
"grad_norm": 0.8905820846557617,
"learning_rate": 0.00019165004824702478,
"loss": 3.4101,
"step": 655
},
{
"epoch": 0.42186495176848876,
"grad_norm": 1.3383442163467407,
"learning_rate": 0.00019163718237375364,
"loss": 3.4679,
"step": 656
},
{
"epoch": 0.422508038585209,
"grad_norm": 1.0254385471343994,
"learning_rate": 0.00019162431650048247,
"loss": 3.411,
"step": 657
},
{
"epoch": 0.4231511254019293,
"grad_norm": 1.010944128036499,
"learning_rate": 0.00019161145062721133,
"loss": 3.2327,
"step": 658
},
{
"epoch": 0.4237942122186495,
"grad_norm": 1.0854414701461792,
"learning_rate": 0.0001915985847539402,
"loss": 3.4239,
"step": 659
},
{
"epoch": 0.42443729903536975,
"grad_norm": 1.0793399810791016,
"learning_rate": 0.00019158571888066903,
"loss": 3.2859,
"step": 660
},
{
"epoch": 0.42508038585209,
"grad_norm": 1.158903956413269,
"learning_rate": 0.00019157285300739789,
"loss": 3.388,
"step": 661
},
{
"epoch": 0.4257234726688103,
"grad_norm": 1.126036524772644,
"learning_rate": 0.00019155998713412672,
"loss": 3.3337,
"step": 662
},
{
"epoch": 0.42636655948553054,
"grad_norm": 0.9668582677841187,
"learning_rate": 0.00019154712126085558,
"loss": 3.3274,
"step": 663
},
{
"epoch": 0.4270096463022508,
"grad_norm": 1.0506800413131714,
"learning_rate": 0.00019153425538758444,
"loss": 3.374,
"step": 664
},
{
"epoch": 0.42765273311897106,
"grad_norm": 1.2126219272613525,
"learning_rate": 0.0001915213895143133,
"loss": 3.4406,
"step": 665
},
{
"epoch": 0.4282958199356913,
"grad_norm": 1.0302507877349854,
"learning_rate": 0.00019150852364104216,
"loss": 3.3207,
"step": 666
},
{
"epoch": 0.4289389067524116,
"grad_norm": 1.025179386138916,
"learning_rate": 0.000191495657767771,
"loss": 3.4822,
"step": 667
},
{
"epoch": 0.42958199356913185,
"grad_norm": 1.0189132690429688,
"learning_rate": 0.00019148279189449985,
"loss": 3.3718,
"step": 668
},
{
"epoch": 0.4302250803858521,
"grad_norm": 1.157392144203186,
"learning_rate": 0.0001914699260212287,
"loss": 3.3795,
"step": 669
},
{
"epoch": 0.43086816720257237,
"grad_norm": 1.1980301141738892,
"learning_rate": 0.00019145706014795754,
"loss": 3.324,
"step": 670
},
{
"epoch": 0.4315112540192926,
"grad_norm": 1.1774390935897827,
"learning_rate": 0.0001914441942746864,
"loss": 3.2682,
"step": 671
},
{
"epoch": 0.43215434083601284,
"grad_norm": 1.0564755201339722,
"learning_rate": 0.00019143132840141524,
"loss": 3.3632,
"step": 672
},
{
"epoch": 0.4327974276527331,
"grad_norm": 1.2775535583496094,
"learning_rate": 0.0001914184625281441,
"loss": 3.2401,
"step": 673
},
{
"epoch": 0.43344051446945336,
"grad_norm": 1.172121524810791,
"learning_rate": 0.00019140559665487296,
"loss": 3.3599,
"step": 674
},
{
"epoch": 0.4340836012861736,
"grad_norm": 1.014290690422058,
"learning_rate": 0.0001913927307816018,
"loss": 3.3045,
"step": 675
},
{
"epoch": 0.4347266881028939,
"grad_norm": 1.083747386932373,
"learning_rate": 0.00019137986490833068,
"loss": 3.206,
"step": 676
},
{
"epoch": 0.43536977491961415,
"grad_norm": 1.0449104309082031,
"learning_rate": 0.00019136699903505954,
"loss": 3.2687,
"step": 677
},
{
"epoch": 0.4360128617363344,
"grad_norm": 1.0497163534164429,
"learning_rate": 0.00019135413316178837,
"loss": 3.2697,
"step": 678
},
{
"epoch": 0.4366559485530547,
"grad_norm": 1.0219502449035645,
"learning_rate": 0.00019134126728851723,
"loss": 3.3335,
"step": 679
},
{
"epoch": 0.43729903536977494,
"grad_norm": 1.150263786315918,
"learning_rate": 0.00019132840141524606,
"loss": 3.3292,
"step": 680
},
{
"epoch": 0.4379421221864952,
"grad_norm": 1.0708996057510376,
"learning_rate": 0.00019131553554197492,
"loss": 3.3125,
"step": 681
},
{
"epoch": 0.43858520900321546,
"grad_norm": 1.1313010454177856,
"learning_rate": 0.00019130266966870378,
"loss": 3.2571,
"step": 682
},
{
"epoch": 0.43922829581993567,
"grad_norm": 1.1228160858154297,
"learning_rate": 0.00019128980379543262,
"loss": 3.3591,
"step": 683
},
{
"epoch": 0.4398713826366559,
"grad_norm": 1.2336113452911377,
"learning_rate": 0.00019127693792216148,
"loss": 3.3048,
"step": 684
},
{
"epoch": 0.4405144694533762,
"grad_norm": 1.0371979475021362,
"learning_rate": 0.0001912640720488903,
"loss": 3.4398,
"step": 685
},
{
"epoch": 0.44115755627009645,
"grad_norm": 1.0723671913146973,
"learning_rate": 0.00019125120617561917,
"loss": 3.3608,
"step": 686
},
{
"epoch": 0.4418006430868167,
"grad_norm": 1.1435720920562744,
"learning_rate": 0.00019123834030234803,
"loss": 3.3603,
"step": 687
},
{
"epoch": 0.442443729903537,
"grad_norm": 1.118101954460144,
"learning_rate": 0.0001912254744290769,
"loss": 3.3674,
"step": 688
},
{
"epoch": 0.44308681672025724,
"grad_norm": 1.160780668258667,
"learning_rate": 0.00019121260855580575,
"loss": 3.2998,
"step": 689
},
{
"epoch": 0.4437299035369775,
"grad_norm": 1.065692663192749,
"learning_rate": 0.00019119974268253458,
"loss": 3.3548,
"step": 690
},
{
"epoch": 0.44437299035369776,
"grad_norm": 1.0315111875534058,
"learning_rate": 0.00019118687680926344,
"loss": 3.3054,
"step": 691
},
{
"epoch": 0.445016077170418,
"grad_norm": 1.1071641445159912,
"learning_rate": 0.0001911740109359923,
"loss": 3.3202,
"step": 692
},
{
"epoch": 0.4456591639871383,
"grad_norm": 1.1420172452926636,
"learning_rate": 0.00019116114506272113,
"loss": 3.2928,
"step": 693
},
{
"epoch": 0.44630225080385855,
"grad_norm": 1.2857017517089844,
"learning_rate": 0.00019114827918945,
"loss": 3.229,
"step": 694
},
{
"epoch": 0.44694533762057875,
"grad_norm": 1.4348443746566772,
"learning_rate": 0.00019113541331617883,
"loss": 3.4344,
"step": 695
},
{
"epoch": 0.447588424437299,
"grad_norm": 1.149869441986084,
"learning_rate": 0.0001911225474429077,
"loss": 3.3527,
"step": 696
},
{
"epoch": 0.4482315112540193,
"grad_norm": 1.4944316148757935,
"learning_rate": 0.00019110968156963655,
"loss": 3.3647,
"step": 697
},
{
"epoch": 0.44887459807073954,
"grad_norm": 1.3225072622299194,
"learning_rate": 0.0001910968156963654,
"loss": 3.3557,
"step": 698
},
{
"epoch": 0.4495176848874598,
"grad_norm": 1.4342169761657715,
"learning_rate": 0.00019108394982309427,
"loss": 3.3011,
"step": 699
},
{
"epoch": 0.45016077170418006,
"grad_norm": 1.2693394422531128,
"learning_rate": 0.0001910710839498231,
"loss": 3.4043,
"step": 700
},
{
"epoch": 0.4508038585209003,
"grad_norm": 1.1735270023345947,
"learning_rate": 0.00019105821807655196,
"loss": 3.2249,
"step": 701
},
{
"epoch": 0.4514469453376206,
"grad_norm": 1.238094449043274,
"learning_rate": 0.00019104535220328082,
"loss": 3.3155,
"step": 702
},
{
"epoch": 0.45209003215434085,
"grad_norm": 1.1719659566879272,
"learning_rate": 0.00019103248633000965,
"loss": 3.3839,
"step": 703
},
{
"epoch": 0.4527331189710611,
"grad_norm": 0.9133745431900024,
"learning_rate": 0.0001910196204567385,
"loss": 3.3059,
"step": 704
},
{
"epoch": 0.4533762057877814,
"grad_norm": 1.281079649925232,
"learning_rate": 0.00019100675458346735,
"loss": 3.2738,
"step": 705
},
{
"epoch": 0.45401929260450163,
"grad_norm": 1.1078438758850098,
"learning_rate": 0.0001909938887101962,
"loss": 3.3192,
"step": 706
},
{
"epoch": 0.45466237942122184,
"grad_norm": 1.1194933652877808,
"learning_rate": 0.00019098102283692507,
"loss": 3.3174,
"step": 707
},
{
"epoch": 0.4553054662379421,
"grad_norm": 1.1717486381530762,
"learning_rate": 0.0001909681569636539,
"loss": 3.2079,
"step": 708
},
{
"epoch": 0.45594855305466236,
"grad_norm": 1.0228320360183716,
"learning_rate": 0.00019095529109038276,
"loss": 3.2985,
"step": 709
},
{
"epoch": 0.4565916398713826,
"grad_norm": 1.071147084236145,
"learning_rate": 0.00019094242521711162,
"loss": 3.2786,
"step": 710
},
{
"epoch": 0.4572347266881029,
"grad_norm": 1.0266625881195068,
"learning_rate": 0.00019092955934384048,
"loss": 3.3508,
"step": 711
},
{
"epoch": 0.45787781350482315,
"grad_norm": 1.0354539155960083,
"learning_rate": 0.00019091669347056934,
"loss": 3.2937,
"step": 712
},
{
"epoch": 0.4585209003215434,
"grad_norm": 1.1274703741073608,
"learning_rate": 0.00019090382759729817,
"loss": 3.3525,
"step": 713
},
{
"epoch": 0.4591639871382637,
"grad_norm": 1.1178417205810547,
"learning_rate": 0.00019089096172402703,
"loss": 3.3228,
"step": 714
},
{
"epoch": 0.45980707395498394,
"grad_norm": 1.040401816368103,
"learning_rate": 0.0001908780958507559,
"loss": 3.2619,
"step": 715
},
{
"epoch": 0.4604501607717042,
"grad_norm": 1.083000659942627,
"learning_rate": 0.00019086522997748472,
"loss": 3.2896,
"step": 716
},
{
"epoch": 0.46109324758842446,
"grad_norm": 1.0114821195602417,
"learning_rate": 0.00019085236410421358,
"loss": 3.1989,
"step": 717
},
{
"epoch": 0.4617363344051447,
"grad_norm": 1.0420598983764648,
"learning_rate": 0.00019083949823094242,
"loss": 3.3671,
"step": 718
},
{
"epoch": 0.46237942122186493,
"grad_norm": 1.2424354553222656,
"learning_rate": 0.00019082663235767128,
"loss": 3.2971,
"step": 719
},
{
"epoch": 0.4630225080385852,
"grad_norm": 0.9966109395027161,
"learning_rate": 0.00019081376648440014,
"loss": 3.3125,
"step": 720
},
{
"epoch": 0.46366559485530545,
"grad_norm": 1.2035448551177979,
"learning_rate": 0.000190800900611129,
"loss": 3.352,
"step": 721
},
{
"epoch": 0.4643086816720257,
"grad_norm": 1.3619177341461182,
"learning_rate": 0.00019078803473785786,
"loss": 3.2357,
"step": 722
},
{
"epoch": 0.464951768488746,
"grad_norm": 1.0742976665496826,
"learning_rate": 0.0001907751688645867,
"loss": 3.2926,
"step": 723
},
{
"epoch": 0.46559485530546624,
"grad_norm": 1.2764792442321777,
"learning_rate": 0.00019076230299131555,
"loss": 3.2803,
"step": 724
},
{
"epoch": 0.4662379421221865,
"grad_norm": 0.9125346541404724,
"learning_rate": 0.0001907494371180444,
"loss": 3.2938,
"step": 725
},
{
"epoch": 0.46688102893890676,
"grad_norm": 1.1562446355819702,
"learning_rate": 0.00019073657124477324,
"loss": 3.4009,
"step": 726
},
{
"epoch": 0.467524115755627,
"grad_norm": 1.0354193449020386,
"learning_rate": 0.0001907237053715021,
"loss": 3.2119,
"step": 727
},
{
"epoch": 0.4681672025723473,
"grad_norm": 1.2904752492904663,
"learning_rate": 0.00019071083949823094,
"loss": 3.2445,
"step": 728
},
{
"epoch": 0.46881028938906755,
"grad_norm": 1.183132290840149,
"learning_rate": 0.0001906979736249598,
"loss": 3.2195,
"step": 729
},
{
"epoch": 0.4694533762057878,
"grad_norm": 0.9867958426475525,
"learning_rate": 0.00019068510775168866,
"loss": 3.2712,
"step": 730
},
{
"epoch": 0.470096463022508,
"grad_norm": 1.1032123565673828,
"learning_rate": 0.0001906722418784175,
"loss": 3.2709,
"step": 731
},
{
"epoch": 0.4707395498392283,
"grad_norm": 1.199021577835083,
"learning_rate": 0.00019065937600514635,
"loss": 3.2594,
"step": 732
},
{
"epoch": 0.47138263665594854,
"grad_norm": 1.191476583480835,
"learning_rate": 0.0001906465101318752,
"loss": 3.293,
"step": 733
},
{
"epoch": 0.4720257234726688,
"grad_norm": 1.2461153268814087,
"learning_rate": 0.00019063364425860407,
"loss": 3.2375,
"step": 734
},
{
"epoch": 0.47266881028938906,
"grad_norm": 1.1664519309997559,
"learning_rate": 0.00019062077838533293,
"loss": 3.2176,
"step": 735
},
{
"epoch": 0.4733118971061093,
"grad_norm": 1.1912034749984741,
"learning_rate": 0.00019060791251206176,
"loss": 3.2735,
"step": 736
},
{
"epoch": 0.4739549839228296,
"grad_norm": 1.369511604309082,
"learning_rate": 0.00019059504663879062,
"loss": 3.1804,
"step": 737
},
{
"epoch": 0.47459807073954985,
"grad_norm": 1.230069875717163,
"learning_rate": 0.00019058218076551945,
"loss": 3.328,
"step": 738
},
{
"epoch": 0.4752411575562701,
"grad_norm": 1.3258293867111206,
"learning_rate": 0.00019056931489224831,
"loss": 3.2911,
"step": 739
},
{
"epoch": 0.4758842443729904,
"grad_norm": 1.2006981372833252,
"learning_rate": 0.00019055644901897717,
"loss": 3.391,
"step": 740
},
{
"epoch": 0.47652733118971063,
"grad_norm": 1.0535584688186646,
"learning_rate": 0.000190543583145706,
"loss": 3.2206,
"step": 741
},
{
"epoch": 0.47717041800643084,
"grad_norm": 1.3122670650482178,
"learning_rate": 0.00019053071727243487,
"loss": 3.4187,
"step": 742
},
{
"epoch": 0.4778135048231511,
"grad_norm": 1.1901317834854126,
"learning_rate": 0.00019051785139916373,
"loss": 3.2673,
"step": 743
},
{
"epoch": 0.47845659163987136,
"grad_norm": 1.2190645933151245,
"learning_rate": 0.0001905049855258926,
"loss": 3.2902,
"step": 744
},
{
"epoch": 0.4790996784565916,
"grad_norm": 1.4672080278396606,
"learning_rate": 0.00019049211965262145,
"loss": 3.1481,
"step": 745
},
{
"epoch": 0.4797427652733119,
"grad_norm": 1.1337164640426636,
"learning_rate": 0.00019047925377935028,
"loss": 3.2713,
"step": 746
},
{
"epoch": 0.48038585209003215,
"grad_norm": 1.2864869832992554,
"learning_rate": 0.00019046638790607914,
"loss": 3.2631,
"step": 747
},
{
"epoch": 0.4810289389067524,
"grad_norm": 1.163984775543213,
"learning_rate": 0.000190453522032808,
"loss": 3.2007,
"step": 748
},
{
"epoch": 0.4816720257234727,
"grad_norm": 1.3512396812438965,
"learning_rate": 0.00019044065615953683,
"loss": 3.1754,
"step": 749
},
{
"epoch": 0.48231511254019294,
"grad_norm": 1.2742228507995605,
"learning_rate": 0.0001904277902862657,
"loss": 3.2099,
"step": 750
},
{
"epoch": 0.4829581993569132,
"grad_norm": 1.090262532234192,
"learning_rate": 0.00019041492441299453,
"loss": 3.2087,
"step": 751
},
{
"epoch": 0.48360128617363346,
"grad_norm": 1.0582116842269897,
"learning_rate": 0.00019040205853972339,
"loss": 3.2742,
"step": 752
},
{
"epoch": 0.4842443729903537,
"grad_norm": 1.299381971359253,
"learning_rate": 0.00019038919266645225,
"loss": 3.3179,
"step": 753
},
{
"epoch": 0.48488745980707393,
"grad_norm": 1.3819348812103271,
"learning_rate": 0.00019037632679318108,
"loss": 3.3102,
"step": 754
},
{
"epoch": 0.4855305466237942,
"grad_norm": 1.135838270187378,
"learning_rate": 0.00019036346091990997,
"loss": 3.2429,
"step": 755
},
{
"epoch": 0.48617363344051445,
"grad_norm": 1.0232223272323608,
"learning_rate": 0.0001903505950466388,
"loss": 3.1738,
"step": 756
},
{
"epoch": 0.4868167202572347,
"grad_norm": 1.0841803550720215,
"learning_rate": 0.00019033772917336766,
"loss": 3.2514,
"step": 757
},
{
"epoch": 0.487459807073955,
"grad_norm": 1.2285308837890625,
"learning_rate": 0.00019032486330009652,
"loss": 3.1662,
"step": 758
},
{
"epoch": 0.48810289389067524,
"grad_norm": 1.1364984512329102,
"learning_rate": 0.00019031199742682535,
"loss": 3.2686,
"step": 759
},
{
"epoch": 0.4887459807073955,
"grad_norm": 1.167251467704773,
"learning_rate": 0.0001902991315535542,
"loss": 3.2791,
"step": 760
},
{
"epoch": 0.48938906752411576,
"grad_norm": 1.1167079210281372,
"learning_rate": 0.00019028626568028304,
"loss": 3.1558,
"step": 761
},
{
"epoch": 0.490032154340836,
"grad_norm": 1.1595637798309326,
"learning_rate": 0.0001902733998070119,
"loss": 3.207,
"step": 762
},
{
"epoch": 0.4906752411575563,
"grad_norm": 1.3299120664596558,
"learning_rate": 0.00019026053393374076,
"loss": 3.2207,
"step": 763
},
{
"epoch": 0.49131832797427655,
"grad_norm": 1.2679451704025269,
"learning_rate": 0.0001902476680604696,
"loss": 3.2342,
"step": 764
},
{
"epoch": 0.4919614147909968,
"grad_norm": 1.289936900138855,
"learning_rate": 0.00019023480218719846,
"loss": 3.3722,
"step": 765
},
{
"epoch": 0.492604501607717,
"grad_norm": 1.2382489442825317,
"learning_rate": 0.00019022193631392732,
"loss": 3.2567,
"step": 766
},
{
"epoch": 0.4932475884244373,
"grad_norm": 1.2640892267227173,
"learning_rate": 0.00019020907044065618,
"loss": 3.1951,
"step": 767
},
{
"epoch": 0.49389067524115754,
"grad_norm": 1.2752370834350586,
"learning_rate": 0.00019019620456738504,
"loss": 3.2232,
"step": 768
},
{
"epoch": 0.4945337620578778,
"grad_norm": 1.3053966760635376,
"learning_rate": 0.00019018333869411387,
"loss": 3.1693,
"step": 769
},
{
"epoch": 0.49517684887459806,
"grad_norm": 1.3304706811904907,
"learning_rate": 0.00019017047282084273,
"loss": 3.1699,
"step": 770
},
{
"epoch": 0.4958199356913183,
"grad_norm": 1.0017753839492798,
"learning_rate": 0.00019015760694757156,
"loss": 3.1091,
"step": 771
},
{
"epoch": 0.4964630225080386,
"grad_norm": 1.1159201860427856,
"learning_rate": 0.00019014474107430042,
"loss": 3.222,
"step": 772
},
{
"epoch": 0.49710610932475885,
"grad_norm": 1.049239158630371,
"learning_rate": 0.00019013187520102928,
"loss": 3.2,
"step": 773
},
{
"epoch": 0.4977491961414791,
"grad_norm": 1.3383749723434448,
"learning_rate": 0.00019011900932775812,
"loss": 3.077,
"step": 774
},
{
"epoch": 0.4983922829581994,
"grad_norm": 1.0949004888534546,
"learning_rate": 0.00019010614345448698,
"loss": 3.3314,
"step": 775
},
{
"epoch": 0.49903536977491963,
"grad_norm": 1.262426733970642,
"learning_rate": 0.00019009327758121584,
"loss": 3.1419,
"step": 776
},
{
"epoch": 0.4996784565916399,
"grad_norm": 1.2074456214904785,
"learning_rate": 0.0001900804117079447,
"loss": 3.1357,
"step": 777
},
{
"epoch": 0.5003215434083601,
"grad_norm": 1.0343276262283325,
"learning_rate": 0.00019006754583467356,
"loss": 3.2589,
"step": 778
},
{
"epoch": 0.5009646302250804,
"grad_norm": 1.3059816360473633,
"learning_rate": 0.0001900546799614024,
"loss": 3.2158,
"step": 779
},
{
"epoch": 0.5016077170418006,
"grad_norm": 1.1477670669555664,
"learning_rate": 0.00019004181408813125,
"loss": 3.2286,
"step": 780
},
{
"epoch": 0.5022508038585209,
"grad_norm": 0.9345281720161438,
"learning_rate": 0.0001900289482148601,
"loss": 3.1478,
"step": 781
},
{
"epoch": 0.5028938906752412,
"grad_norm": 1.545401930809021,
"learning_rate": 0.00019001608234158894,
"loss": 3.2269,
"step": 782
},
{
"epoch": 0.5035369774919615,
"grad_norm": 1.1383672952651978,
"learning_rate": 0.0001900032164683178,
"loss": 3.2749,
"step": 783
},
{
"epoch": 0.5041800643086817,
"grad_norm": 1.1127848625183105,
"learning_rate": 0.00018999035059504664,
"loss": 3.1765,
"step": 784
},
{
"epoch": 0.5048231511254019,
"grad_norm": 1.338855504989624,
"learning_rate": 0.0001899774847217755,
"loss": 3.1888,
"step": 785
},
{
"epoch": 0.5054662379421222,
"grad_norm": 1.183184266090393,
"learning_rate": 0.00018996461884850436,
"loss": 3.2694,
"step": 786
},
{
"epoch": 0.5061093247588424,
"grad_norm": 1.0847487449645996,
"learning_rate": 0.0001899517529752332,
"loss": 3.2269,
"step": 787
},
{
"epoch": 0.5067524115755627,
"grad_norm": 1.4246736764907837,
"learning_rate": 0.00018993888710196205,
"loss": 3.2068,
"step": 788
},
{
"epoch": 0.5073954983922829,
"grad_norm": 1.0470788478851318,
"learning_rate": 0.0001899260212286909,
"loss": 3.1605,
"step": 789
},
{
"epoch": 0.5080385852090032,
"grad_norm": 1.329217791557312,
"learning_rate": 0.00018991315535541977,
"loss": 3.2064,
"step": 790
},
{
"epoch": 0.5086816720257235,
"grad_norm": 1.4735416173934937,
"learning_rate": 0.00018990028948214863,
"loss": 3.289,
"step": 791
},
{
"epoch": 0.5093247588424438,
"grad_norm": 1.110742449760437,
"learning_rate": 0.00018988742360887746,
"loss": 3.2186,
"step": 792
},
{
"epoch": 0.509967845659164,
"grad_norm": 1.3622022867202759,
"learning_rate": 0.00018987455773560632,
"loss": 3.2241,
"step": 793
},
{
"epoch": 0.5106109324758843,
"grad_norm": 1.1964266300201416,
"learning_rate": 0.00018986169186233515,
"loss": 3.1842,
"step": 794
},
{
"epoch": 0.5112540192926045,
"grad_norm": 1.2105697393417358,
"learning_rate": 0.00018984882598906401,
"loss": 3.159,
"step": 795
},
{
"epoch": 0.5118971061093247,
"grad_norm": 1.2597590684890747,
"learning_rate": 0.00018983596011579287,
"loss": 3.184,
"step": 796
},
{
"epoch": 0.512540192926045,
"grad_norm": 1.1780807971954346,
"learning_rate": 0.0001898230942425217,
"loss": 3.1621,
"step": 797
},
{
"epoch": 0.5131832797427652,
"grad_norm": 1.048012137413025,
"learning_rate": 0.00018981022836925057,
"loss": 3.2248,
"step": 798
},
{
"epoch": 0.5138263665594855,
"grad_norm": 1.4612892866134644,
"learning_rate": 0.00018979736249597943,
"loss": 3.1989,
"step": 799
},
{
"epoch": 0.5144694533762058,
"grad_norm": 1.5027645826339722,
"learning_rate": 0.0001897844966227083,
"loss": 3.1642,
"step": 800
},
{
"epoch": 0.5151125401929261,
"grad_norm": 1.2052314281463623,
"learning_rate": 0.00018977163074943715,
"loss": 3.1384,
"step": 801
},
{
"epoch": 0.5157556270096463,
"grad_norm": 1.327204704284668,
"learning_rate": 0.00018975876487616598,
"loss": 3.2679,
"step": 802
},
{
"epoch": 0.5163987138263666,
"grad_norm": 1.7540024518966675,
"learning_rate": 0.00018974589900289484,
"loss": 3.2228,
"step": 803
},
{
"epoch": 0.5170418006430868,
"grad_norm": 1.187525987625122,
"learning_rate": 0.00018973303312962367,
"loss": 3.1897,
"step": 804
},
{
"epoch": 0.5176848874598071,
"grad_norm": 1.3195263147354126,
"learning_rate": 0.00018972016725635253,
"loss": 3.1717,
"step": 805
},
{
"epoch": 0.5183279742765273,
"grad_norm": 1.3006914854049683,
"learning_rate": 0.0001897073013830814,
"loss": 3.0372,
"step": 806
},
{
"epoch": 0.5189710610932476,
"grad_norm": 1.3825714588165283,
"learning_rate": 0.00018969443550981023,
"loss": 3.2407,
"step": 807
},
{
"epoch": 0.5196141479099678,
"grad_norm": 1.3513221740722656,
"learning_rate": 0.00018968156963653909,
"loss": 3.1957,
"step": 808
},
{
"epoch": 0.5202572347266881,
"grad_norm": 1.1686780452728271,
"learning_rate": 0.00018966870376326795,
"loss": 3.1793,
"step": 809
},
{
"epoch": 0.5209003215434084,
"grad_norm": 1.485041856765747,
"learning_rate": 0.00018965583788999678,
"loss": 3.1323,
"step": 810
},
{
"epoch": 0.5215434083601286,
"grad_norm": 1.2755258083343506,
"learning_rate": 0.00018964297201672564,
"loss": 3.0376,
"step": 811
},
{
"epoch": 0.5221864951768489,
"grad_norm": 1.2364903688430786,
"learning_rate": 0.0001896301061434545,
"loss": 3.1489,
"step": 812
},
{
"epoch": 0.5228295819935691,
"grad_norm": 1.2964539527893066,
"learning_rate": 0.00018961724027018336,
"loss": 3.1077,
"step": 813
},
{
"epoch": 0.5234726688102894,
"grad_norm": 1.259230375289917,
"learning_rate": 0.00018960437439691222,
"loss": 3.2041,
"step": 814
},
{
"epoch": 0.5241157556270096,
"grad_norm": 1.2545006275177002,
"learning_rate": 0.00018959150852364105,
"loss": 3.329,
"step": 815
},
{
"epoch": 0.52475884244373,
"grad_norm": 1.2107785940170288,
"learning_rate": 0.0001895786426503699,
"loss": 3.1635,
"step": 816
},
{
"epoch": 0.5254019292604502,
"grad_norm": 1.251050591468811,
"learning_rate": 0.00018956577677709874,
"loss": 3.1996,
"step": 817
},
{
"epoch": 0.5260450160771705,
"grad_norm": 1.2920563220977783,
"learning_rate": 0.0001895529109038276,
"loss": 3.1743,
"step": 818
},
{
"epoch": 0.5266881028938907,
"grad_norm": 1.373834490776062,
"learning_rate": 0.00018954004503055646,
"loss": 3.1707,
"step": 819
},
{
"epoch": 0.5273311897106109,
"grad_norm": 1.1601290702819824,
"learning_rate": 0.0001895271791572853,
"loss": 3.1825,
"step": 820
},
{
"epoch": 0.5279742765273312,
"grad_norm": 1.1515223979949951,
"learning_rate": 0.00018951431328401416,
"loss": 3.1254,
"step": 821
},
{
"epoch": 0.5286173633440514,
"grad_norm": 1.3081713914871216,
"learning_rate": 0.00018950144741074302,
"loss": 3.2704,
"step": 822
},
{
"epoch": 0.5292604501607717,
"grad_norm": 1.4287598133087158,
"learning_rate": 0.00018948858153747188,
"loss": 3.1797,
"step": 823
},
{
"epoch": 0.5299035369774919,
"grad_norm": 1.2460300922393799,
"learning_rate": 0.00018947571566420074,
"loss": 3.1673,
"step": 824
},
{
"epoch": 0.5305466237942122,
"grad_norm": 1.2695204019546509,
"learning_rate": 0.00018946284979092957,
"loss": 3.1102,
"step": 825
},
{
"epoch": 0.5311897106109325,
"grad_norm": 1.490421175956726,
"learning_rate": 0.00018944998391765843,
"loss": 3.067,
"step": 826
},
{
"epoch": 0.5318327974276528,
"grad_norm": 1.4937459230422974,
"learning_rate": 0.00018943711804438726,
"loss": 3.3027,
"step": 827
},
{
"epoch": 0.532475884244373,
"grad_norm": 1.1960488557815552,
"learning_rate": 0.00018942425217111612,
"loss": 3.1005,
"step": 828
},
{
"epoch": 0.5331189710610933,
"grad_norm": 1.1672232151031494,
"learning_rate": 0.00018941138629784498,
"loss": 3.1973,
"step": 829
},
{
"epoch": 0.5337620578778135,
"grad_norm": 1.3269081115722656,
"learning_rate": 0.00018939852042457382,
"loss": 3.0847,
"step": 830
},
{
"epoch": 0.5344051446945337,
"grad_norm": 1.2900030612945557,
"learning_rate": 0.00018938565455130268,
"loss": 3.0921,
"step": 831
},
{
"epoch": 0.535048231511254,
"grad_norm": 1.076888084411621,
"learning_rate": 0.0001893727886780315,
"loss": 3.2275,
"step": 832
},
{
"epoch": 0.5356913183279742,
"grad_norm": 1.106076717376709,
"learning_rate": 0.00018935992280476037,
"loss": 3.1484,
"step": 833
},
{
"epoch": 0.5363344051446945,
"grad_norm": 1.191945195198059,
"learning_rate": 0.00018934705693148923,
"loss": 3.2304,
"step": 834
},
{
"epoch": 0.5369774919614148,
"grad_norm": 1.2519569396972656,
"learning_rate": 0.0001893341910582181,
"loss": 3.1809,
"step": 835
},
{
"epoch": 0.5376205787781351,
"grad_norm": 1.303782343864441,
"learning_rate": 0.00018932132518494695,
"loss": 3.0748,
"step": 836
},
{
"epoch": 0.5382636655948553,
"grad_norm": 1.1886576414108276,
"learning_rate": 0.00018930845931167578,
"loss": 3.1439,
"step": 837
},
{
"epoch": 0.5389067524115756,
"grad_norm": 1.5747839212417603,
"learning_rate": 0.00018929559343840464,
"loss": 3.2817,
"step": 838
},
{
"epoch": 0.5395498392282958,
"grad_norm": 1.1393136978149414,
"learning_rate": 0.0001892827275651335,
"loss": 3.1519,
"step": 839
},
{
"epoch": 0.5401929260450161,
"grad_norm": 1.1961064338684082,
"learning_rate": 0.00018926986169186233,
"loss": 3.1477,
"step": 840
},
{
"epoch": 0.5408360128617363,
"grad_norm": 1.2031160593032837,
"learning_rate": 0.0001892569958185912,
"loss": 3.1408,
"step": 841
},
{
"epoch": 0.5414790996784566,
"grad_norm": 1.2001953125,
"learning_rate": 0.00018924412994532003,
"loss": 3.1901,
"step": 842
},
{
"epoch": 0.5421221864951768,
"grad_norm": 1.0908831357955933,
"learning_rate": 0.0001892312640720489,
"loss": 3.2398,
"step": 843
},
{
"epoch": 0.5427652733118971,
"grad_norm": 1.09163498878479,
"learning_rate": 0.00018921839819877775,
"loss": 3.2032,
"step": 844
},
{
"epoch": 0.5434083601286174,
"grad_norm": 1.162726879119873,
"learning_rate": 0.0001892055323255066,
"loss": 3.1095,
"step": 845
},
{
"epoch": 0.5440514469453376,
"grad_norm": 1.0796010494232178,
"learning_rate": 0.00018919266645223547,
"loss": 3.1118,
"step": 846
},
{
"epoch": 0.5446945337620579,
"grad_norm": 1.1972142457962036,
"learning_rate": 0.00018917980057896433,
"loss": 3.1192,
"step": 847
},
{
"epoch": 0.5453376205787781,
"grad_norm": 0.996917724609375,
"learning_rate": 0.00018916693470569316,
"loss": 3.1339,
"step": 848
},
{
"epoch": 0.5459807073954984,
"grad_norm": 1.2327890396118164,
"learning_rate": 0.00018915406883242202,
"loss": 3.1469,
"step": 849
},
{
"epoch": 0.5466237942122186,
"grad_norm": 1.1841199398040771,
"learning_rate": 0.00018914120295915085,
"loss": 3.1122,
"step": 850
},
{
"epoch": 0.547266881028939,
"grad_norm": 1.1988370418548584,
"learning_rate": 0.0001891283370858797,
"loss": 3.1059,
"step": 851
},
{
"epoch": 0.5479099678456592,
"grad_norm": 1.2205297946929932,
"learning_rate": 0.00018911547121260857,
"loss": 3.109,
"step": 852
},
{
"epoch": 0.5485530546623795,
"grad_norm": 1.1680948734283447,
"learning_rate": 0.0001891026053393374,
"loss": 3.1783,
"step": 853
},
{
"epoch": 0.5491961414790997,
"grad_norm": 1.4123858213424683,
"learning_rate": 0.00018908973946606627,
"loss": 3.093,
"step": 854
},
{
"epoch": 0.5498392282958199,
"grad_norm": 1.3545359373092651,
"learning_rate": 0.0001890768735927951,
"loss": 3.1441,
"step": 855
},
{
"epoch": 0.5504823151125402,
"grad_norm": 1.0951796770095825,
"learning_rate": 0.00018906400771952399,
"loss": 3.092,
"step": 856
},
{
"epoch": 0.5511254019292604,
"grad_norm": 1.2619158029556274,
"learning_rate": 0.00018905114184625285,
"loss": 3.1159,
"step": 857
},
{
"epoch": 0.5517684887459807,
"grad_norm": 1.3777707815170288,
"learning_rate": 0.00018903827597298168,
"loss": 3.1173,
"step": 858
},
{
"epoch": 0.5524115755627009,
"grad_norm": 1.1288398504257202,
"learning_rate": 0.00018902541009971054,
"loss": 3.1351,
"step": 859
},
{
"epoch": 0.5530546623794212,
"grad_norm": 1.0961090326309204,
"learning_rate": 0.00018901254422643937,
"loss": 3.1003,
"step": 860
},
{
"epoch": 0.5536977491961415,
"grad_norm": 1.383499026298523,
"learning_rate": 0.00018899967835316823,
"loss": 3.1626,
"step": 861
},
{
"epoch": 0.5543408360128618,
"grad_norm": 1.2608115673065186,
"learning_rate": 0.0001889868124798971,
"loss": 3.1251,
"step": 862
},
{
"epoch": 0.554983922829582,
"grad_norm": 1.0015844106674194,
"learning_rate": 0.00018897394660662592,
"loss": 3.0936,
"step": 863
},
{
"epoch": 0.5556270096463023,
"grad_norm": 1.2425543069839478,
"learning_rate": 0.00018896108073335478,
"loss": 2.9914,
"step": 864
},
{
"epoch": 0.5562700964630225,
"grad_norm": 1.169270634651184,
"learning_rate": 0.00018894821486008362,
"loss": 3.0521,
"step": 865
},
{
"epoch": 0.5569131832797428,
"grad_norm": 1.1069300174713135,
"learning_rate": 0.00018893534898681248,
"loss": 3.1724,
"step": 866
},
{
"epoch": 0.557556270096463,
"grad_norm": 1.0531796216964722,
"learning_rate": 0.00018892248311354134,
"loss": 3.1483,
"step": 867
},
{
"epoch": 0.5581993569131832,
"grad_norm": 1.192090630531311,
"learning_rate": 0.0001889096172402702,
"loss": 3.1941,
"step": 868
},
{
"epoch": 0.5588424437299035,
"grad_norm": 1.1669304370880127,
"learning_rate": 0.00018889675136699906,
"loss": 3.0172,
"step": 869
},
{
"epoch": 0.5594855305466238,
"grad_norm": 1.3154963254928589,
"learning_rate": 0.0001888838854937279,
"loss": 3.0505,
"step": 870
},
{
"epoch": 0.5601286173633441,
"grad_norm": 1.1110012531280518,
"learning_rate": 0.00018887101962045675,
"loss": 3.057,
"step": 871
},
{
"epoch": 0.5607717041800643,
"grad_norm": 1.350762128829956,
"learning_rate": 0.0001888581537471856,
"loss": 3.1815,
"step": 872
},
{
"epoch": 0.5614147909967846,
"grad_norm": 1.2955763339996338,
"learning_rate": 0.00018884528787391444,
"loss": 3.153,
"step": 873
},
{
"epoch": 0.5620578778135048,
"grad_norm": 1.4547818899154663,
"learning_rate": 0.0001888324220006433,
"loss": 3.0896,
"step": 874
},
{
"epoch": 0.5627009646302251,
"grad_norm": 1.323948621749878,
"learning_rate": 0.00018881955612737214,
"loss": 3.1684,
"step": 875
},
{
"epoch": 0.5633440514469453,
"grad_norm": 0.9974737167358398,
"learning_rate": 0.000188806690254101,
"loss": 3.1328,
"step": 876
},
{
"epoch": 0.5639871382636656,
"grad_norm": 1.2835307121276855,
"learning_rate": 0.00018879382438082986,
"loss": 3.1713,
"step": 877
},
{
"epoch": 0.5646302250803859,
"grad_norm": 1.4593397378921509,
"learning_rate": 0.00018878095850755872,
"loss": 3.0995,
"step": 878
},
{
"epoch": 0.5652733118971061,
"grad_norm": 1.3302654027938843,
"learning_rate": 0.00018876809263428758,
"loss": 3.0794,
"step": 879
},
{
"epoch": 0.5659163987138264,
"grad_norm": 1.3593260049819946,
"learning_rate": 0.00018875522676101644,
"loss": 3.0851,
"step": 880
},
{
"epoch": 0.5665594855305466,
"grad_norm": 1.515762209892273,
"learning_rate": 0.00018874236088774527,
"loss": 3.1116,
"step": 881
},
{
"epoch": 0.5672025723472669,
"grad_norm": 1.3611632585525513,
"learning_rate": 0.00018872949501447413,
"loss": 3.1126,
"step": 882
},
{
"epoch": 0.5678456591639871,
"grad_norm": 1.281817078590393,
"learning_rate": 0.00018871662914120296,
"loss": 3.0904,
"step": 883
},
{
"epoch": 0.5684887459807074,
"grad_norm": 1.5134152173995972,
"learning_rate": 0.00018870376326793182,
"loss": 3.1503,
"step": 884
},
{
"epoch": 0.5691318327974276,
"grad_norm": 1.3870372772216797,
"learning_rate": 0.00018869089739466068,
"loss": 3.1883,
"step": 885
},
{
"epoch": 0.569774919614148,
"grad_norm": 1.3908785581588745,
"learning_rate": 0.00018867803152138951,
"loss": 3.0711,
"step": 886
},
{
"epoch": 0.5704180064308682,
"grad_norm": 1.5557795763015747,
"learning_rate": 0.00018866516564811837,
"loss": 3.2031,
"step": 887
},
{
"epoch": 0.5710610932475885,
"grad_norm": 1.3231662511825562,
"learning_rate": 0.0001886522997748472,
"loss": 3.0175,
"step": 888
},
{
"epoch": 0.5717041800643087,
"grad_norm": 1.1312745809555054,
"learning_rate": 0.00018863943390157607,
"loss": 3.1328,
"step": 889
},
{
"epoch": 0.572347266881029,
"grad_norm": 1.6406269073486328,
"learning_rate": 0.00018862656802830493,
"loss": 3.0261,
"step": 890
},
{
"epoch": 0.5729903536977492,
"grad_norm": 1.5973625183105469,
"learning_rate": 0.0001886137021550338,
"loss": 3.0951,
"step": 891
},
{
"epoch": 0.5736334405144694,
"grad_norm": 1.237862229347229,
"learning_rate": 0.00018860083628176265,
"loss": 3.0748,
"step": 892
},
{
"epoch": 0.5742765273311897,
"grad_norm": 1.2180136442184448,
"learning_rate": 0.00018858797040849148,
"loss": 3.1109,
"step": 893
},
{
"epoch": 0.5749196141479099,
"grad_norm": 1.3336851596832275,
"learning_rate": 0.00018857510453522034,
"loss": 3.0763,
"step": 894
},
{
"epoch": 0.5755627009646302,
"grad_norm": 1.3121765851974487,
"learning_rate": 0.0001885622386619492,
"loss": 3.1751,
"step": 895
},
{
"epoch": 0.5762057877813505,
"grad_norm": 1.292459487915039,
"learning_rate": 0.00018854937278867803,
"loss": 3.118,
"step": 896
},
{
"epoch": 0.5768488745980708,
"grad_norm": 1.2852542400360107,
"learning_rate": 0.0001885365069154069,
"loss": 3.1566,
"step": 897
},
{
"epoch": 0.577491961414791,
"grad_norm": 1.2023179531097412,
"learning_rate": 0.00018852364104213573,
"loss": 3.2088,
"step": 898
},
{
"epoch": 0.5781350482315113,
"grad_norm": 1.2750616073608398,
"learning_rate": 0.00018851077516886459,
"loss": 3.0904,
"step": 899
},
{
"epoch": 0.5787781350482315,
"grad_norm": 1.2595165967941284,
"learning_rate": 0.00018849790929559345,
"loss": 3.0601,
"step": 900
},
{
"epoch": 0.5794212218649518,
"grad_norm": 1.2735553979873657,
"learning_rate": 0.0001884850434223223,
"loss": 3.0751,
"step": 901
},
{
"epoch": 0.580064308681672,
"grad_norm": 1.3524245023727417,
"learning_rate": 0.00018847217754905117,
"loss": 3.0462,
"step": 902
},
{
"epoch": 0.5807073954983922,
"grad_norm": 1.335268259048462,
"learning_rate": 0.00018845931167578,
"loss": 3.0297,
"step": 903
},
{
"epoch": 0.5813504823151125,
"grad_norm": 1.3870205879211426,
"learning_rate": 0.00018844644580250886,
"loss": 3.2041,
"step": 904
},
{
"epoch": 0.5819935691318328,
"grad_norm": 1.4512618780136108,
"learning_rate": 0.00018843357992923772,
"loss": 3.278,
"step": 905
},
{
"epoch": 0.5826366559485531,
"grad_norm": 1.4124186038970947,
"learning_rate": 0.00018842071405596655,
"loss": 3.1195,
"step": 906
},
{
"epoch": 0.5832797427652733,
"grad_norm": 1.5869147777557373,
"learning_rate": 0.0001884078481826954,
"loss": 3.0581,
"step": 907
},
{
"epoch": 0.5839228295819936,
"grad_norm": 1.358933687210083,
"learning_rate": 0.00018839498230942424,
"loss": 3.0677,
"step": 908
},
{
"epoch": 0.5845659163987138,
"grad_norm": 1.1760841608047485,
"learning_rate": 0.0001883821164361531,
"loss": 2.995,
"step": 909
},
{
"epoch": 0.5852090032154341,
"grad_norm": 1.4777272939682007,
"learning_rate": 0.00018836925056288196,
"loss": 2.9668,
"step": 910
},
{
"epoch": 0.5858520900321543,
"grad_norm": 1.1686232089996338,
"learning_rate": 0.0001883563846896108,
"loss": 3.1695,
"step": 911
},
{
"epoch": 0.5864951768488746,
"grad_norm": 1.0929896831512451,
"learning_rate": 0.00018834351881633966,
"loss": 3.1391,
"step": 912
},
{
"epoch": 0.5871382636655949,
"grad_norm": 1.4318856000900269,
"learning_rate": 0.00018833065294306852,
"loss": 3.0627,
"step": 913
},
{
"epoch": 0.5877813504823152,
"grad_norm": 1.3018511533737183,
"learning_rate": 0.00018831778706979738,
"loss": 3.1305,
"step": 914
},
{
"epoch": 0.5884244372990354,
"grad_norm": 1.395560622215271,
"learning_rate": 0.00018830492119652624,
"loss": 3.1234,
"step": 915
},
{
"epoch": 0.5890675241157556,
"grad_norm": 1.3363614082336426,
"learning_rate": 0.00018829205532325507,
"loss": 3.0128,
"step": 916
},
{
"epoch": 0.5897106109324759,
"grad_norm": 1.264737606048584,
"learning_rate": 0.00018827918944998393,
"loss": 3.1762,
"step": 917
},
{
"epoch": 0.5903536977491961,
"grad_norm": 1.3853096961975098,
"learning_rate": 0.0001882663235767128,
"loss": 3.1491,
"step": 918
},
{
"epoch": 0.5909967845659164,
"grad_norm": 1.5895792245864868,
"learning_rate": 0.00018825345770344162,
"loss": 3.1872,
"step": 919
},
{
"epoch": 0.5916398713826366,
"grad_norm": 1.3258469104766846,
"learning_rate": 0.00018824059183017048,
"loss": 3.0809,
"step": 920
},
{
"epoch": 0.592282958199357,
"grad_norm": 1.3434430360794067,
"learning_rate": 0.00018822772595689932,
"loss": 3.0111,
"step": 921
},
{
"epoch": 0.5929260450160772,
"grad_norm": 1.3564894199371338,
"learning_rate": 0.00018821486008362818,
"loss": 3.0968,
"step": 922
},
{
"epoch": 0.5935691318327975,
"grad_norm": 1.4929382801055908,
"learning_rate": 0.00018820199421035704,
"loss": 3.1024,
"step": 923
},
{
"epoch": 0.5942122186495177,
"grad_norm": 1.380781888961792,
"learning_rate": 0.0001881891283370859,
"loss": 3.0029,
"step": 924
},
{
"epoch": 0.594855305466238,
"grad_norm": 1.2817083597183228,
"learning_rate": 0.00018817626246381476,
"loss": 3.0043,
"step": 925
},
{
"epoch": 0.5954983922829582,
"grad_norm": 1.182271957397461,
"learning_rate": 0.0001881633965905436,
"loss": 3.0496,
"step": 926
},
{
"epoch": 0.5961414790996784,
"grad_norm": 1.124681830406189,
"learning_rate": 0.00018815053071727245,
"loss": 3.0583,
"step": 927
},
{
"epoch": 0.5967845659163987,
"grad_norm": 1.037346601486206,
"learning_rate": 0.0001881376648440013,
"loss": 3.0009,
"step": 928
},
{
"epoch": 0.5974276527331189,
"grad_norm": 1.2296048402786255,
"learning_rate": 0.00018812479897073014,
"loss": 3.0378,
"step": 929
},
{
"epoch": 0.5980707395498392,
"grad_norm": 1.1884113550186157,
"learning_rate": 0.000188111933097459,
"loss": 3.0192,
"step": 930
},
{
"epoch": 0.5987138263665595,
"grad_norm": 1.3827930688858032,
"learning_rate": 0.00018809906722418783,
"loss": 3.0462,
"step": 931
},
{
"epoch": 0.5993569131832798,
"grad_norm": 1.4823634624481201,
"learning_rate": 0.0001880862013509167,
"loss": 3.1284,
"step": 932
},
{
"epoch": 0.6,
"grad_norm": 1.3943361043930054,
"learning_rate": 0.00018807333547764555,
"loss": 3.0216,
"step": 933
},
{
"epoch": 0.6006430868167203,
"grad_norm": 1.2865196466445923,
"learning_rate": 0.0001880604696043744,
"loss": 3.0187,
"step": 934
},
{
"epoch": 0.6012861736334405,
"grad_norm": 1.4083055257797241,
"learning_rate": 0.00018804760373110327,
"loss": 3.1085,
"step": 935
},
{
"epoch": 0.6019292604501608,
"grad_norm": 1.317826509475708,
"learning_rate": 0.0001880347378578321,
"loss": 3.0967,
"step": 936
},
{
"epoch": 0.602572347266881,
"grad_norm": 1.337525486946106,
"learning_rate": 0.00018802187198456097,
"loss": 3.072,
"step": 937
},
{
"epoch": 0.6032154340836012,
"grad_norm": 1.328406810760498,
"learning_rate": 0.00018800900611128983,
"loss": 2.93,
"step": 938
},
{
"epoch": 0.6038585209003215,
"grad_norm": 1.381536841392517,
"learning_rate": 0.00018799614023801866,
"loss": 2.9831,
"step": 939
},
{
"epoch": 0.6045016077170418,
"grad_norm": 1.3098235130310059,
"learning_rate": 0.00018798327436474752,
"loss": 3.1046,
"step": 940
},
{
"epoch": 0.6051446945337621,
"grad_norm": 1.2065914869308472,
"learning_rate": 0.00018797040849147635,
"loss": 2.9651,
"step": 941
},
{
"epoch": 0.6057877813504823,
"grad_norm": 1.3447399139404297,
"learning_rate": 0.0001879575426182052,
"loss": 3.1248,
"step": 942
},
{
"epoch": 0.6064308681672026,
"grad_norm": 1.2985327243804932,
"learning_rate": 0.00018794467674493407,
"loss": 3.1018,
"step": 943
},
{
"epoch": 0.6070739549839228,
"grad_norm": 1.3033169507980347,
"learning_rate": 0.0001879318108716629,
"loss": 2.9986,
"step": 944
},
{
"epoch": 0.6077170418006431,
"grad_norm": 1.3129827976226807,
"learning_rate": 0.00018791894499839177,
"loss": 3.0661,
"step": 945
},
{
"epoch": 0.6083601286173633,
"grad_norm": 1.1535412073135376,
"learning_rate": 0.00018790607912512063,
"loss": 3.0727,
"step": 946
},
{
"epoch": 0.6090032154340836,
"grad_norm": 1.2947921752929688,
"learning_rate": 0.00018789321325184949,
"loss": 2.997,
"step": 947
},
{
"epoch": 0.6096463022508039,
"grad_norm": 1.1528445482254028,
"learning_rate": 0.00018788034737857835,
"loss": 3.0687,
"step": 948
},
{
"epoch": 0.6102893890675242,
"grad_norm": 1.1423406600952148,
"learning_rate": 0.00018786748150530718,
"loss": 3.0846,
"step": 949
},
{
"epoch": 0.6109324758842444,
"grad_norm": 1.1380009651184082,
"learning_rate": 0.00018785461563203604,
"loss": 3.0174,
"step": 950
},
{
"epoch": 0.6115755627009646,
"grad_norm": 1.1382924318313599,
"learning_rate": 0.0001878417497587649,
"loss": 3.0143,
"step": 951
},
{
"epoch": 0.6122186495176849,
"grad_norm": 1.2278555631637573,
"learning_rate": 0.00018782888388549373,
"loss": 3.0719,
"step": 952
},
{
"epoch": 0.6128617363344051,
"grad_norm": 1.331308126449585,
"learning_rate": 0.0001878160180122226,
"loss": 3.0604,
"step": 953
},
{
"epoch": 0.6135048231511254,
"grad_norm": 1.3055458068847656,
"learning_rate": 0.00018780315213895142,
"loss": 3.2065,
"step": 954
},
{
"epoch": 0.6141479099678456,
"grad_norm": 1.0813393592834473,
"learning_rate": 0.00018779028626568028,
"loss": 3.0412,
"step": 955
},
{
"epoch": 0.614790996784566,
"grad_norm": 1.3665434122085571,
"learning_rate": 0.00018777742039240914,
"loss": 3.1002,
"step": 956
},
{
"epoch": 0.6154340836012862,
"grad_norm": 1.0831019878387451,
"learning_rate": 0.000187764554519138,
"loss": 2.9718,
"step": 957
},
{
"epoch": 0.6160771704180065,
"grad_norm": 1.2719820737838745,
"learning_rate": 0.00018775168864586686,
"loss": 3.0464,
"step": 958
},
{
"epoch": 0.6167202572347267,
"grad_norm": 1.290090560913086,
"learning_rate": 0.0001877388227725957,
"loss": 3.083,
"step": 959
},
{
"epoch": 0.617363344051447,
"grad_norm": 1.1994895935058594,
"learning_rate": 0.00018772595689932456,
"loss": 2.9636,
"step": 960
},
{
"epoch": 0.6180064308681672,
"grad_norm": 1.330077886581421,
"learning_rate": 0.00018771309102605342,
"loss": 3.0226,
"step": 961
},
{
"epoch": 0.6186495176848874,
"grad_norm": 1.1761059761047363,
"learning_rate": 0.00018770022515278225,
"loss": 3.0349,
"step": 962
},
{
"epoch": 0.6192926045016077,
"grad_norm": 1.1384966373443604,
"learning_rate": 0.0001876873592795111,
"loss": 3.0769,
"step": 963
},
{
"epoch": 0.6199356913183279,
"grad_norm": 1.1724135875701904,
"learning_rate": 0.00018767449340623994,
"loss": 3.0703,
"step": 964
},
{
"epoch": 0.6205787781350482,
"grad_norm": 1.1943421363830566,
"learning_rate": 0.0001876616275329688,
"loss": 3.0188,
"step": 965
},
{
"epoch": 0.6212218649517685,
"grad_norm": 1.1089471578598022,
"learning_rate": 0.00018764876165969766,
"loss": 3.088,
"step": 966
},
{
"epoch": 0.6218649517684888,
"grad_norm": 1.1156290769577026,
"learning_rate": 0.0001876358957864265,
"loss": 3.0223,
"step": 967
},
{
"epoch": 0.622508038585209,
"grad_norm": 1.3356914520263672,
"learning_rate": 0.00018762302991315536,
"loss": 3.0365,
"step": 968
},
{
"epoch": 0.6231511254019293,
"grad_norm": 1.2664483785629272,
"learning_rate": 0.00018761016403988422,
"loss": 3.1077,
"step": 969
},
{
"epoch": 0.6237942122186495,
"grad_norm": 1.211126685142517,
"learning_rate": 0.00018759729816661308,
"loss": 3.1502,
"step": 970
},
{
"epoch": 0.6244372990353698,
"grad_norm": 1.1281132698059082,
"learning_rate": 0.00018758443229334194,
"loss": 2.9797,
"step": 971
},
{
"epoch": 0.62508038585209,
"grad_norm": 1.2974032163619995,
"learning_rate": 0.00018757156642007077,
"loss": 3.0456,
"step": 972
},
{
"epoch": 0.6257234726688103,
"grad_norm": 1.1352269649505615,
"learning_rate": 0.00018755870054679963,
"loss": 3.0097,
"step": 973
},
{
"epoch": 0.6263665594855305,
"grad_norm": 1.256424903869629,
"learning_rate": 0.00018754583467352846,
"loss": 2.9262,
"step": 974
},
{
"epoch": 0.6270096463022508,
"grad_norm": 1.1779004335403442,
"learning_rate": 0.00018753296880025732,
"loss": 2.931,
"step": 975
},
{
"epoch": 0.6276527331189711,
"grad_norm": 1.3205631971359253,
"learning_rate": 0.00018752010292698618,
"loss": 2.996,
"step": 976
},
{
"epoch": 0.6282958199356913,
"grad_norm": 1.3165156841278076,
"learning_rate": 0.00018750723705371502,
"loss": 2.9635,
"step": 977
},
{
"epoch": 0.6289389067524116,
"grad_norm": 1.3659495115280151,
"learning_rate": 0.00018749437118044388,
"loss": 2.9997,
"step": 978
},
{
"epoch": 0.6295819935691318,
"grad_norm": 1.2277815341949463,
"learning_rate": 0.00018748150530717274,
"loss": 3.0684,
"step": 979
},
{
"epoch": 0.6302250803858521,
"grad_norm": 1.3968592882156372,
"learning_rate": 0.0001874686394339016,
"loss": 3.018,
"step": 980
},
{
"epoch": 0.6308681672025723,
"grad_norm": 1.1932144165039062,
"learning_rate": 0.00018745577356063046,
"loss": 2.9842,
"step": 981
},
{
"epoch": 0.6315112540192926,
"grad_norm": 1.1522760391235352,
"learning_rate": 0.0001874429076873593,
"loss": 3.0659,
"step": 982
},
{
"epoch": 0.6321543408360129,
"grad_norm": 1.3020713329315186,
"learning_rate": 0.00018743004181408815,
"loss": 2.915,
"step": 983
},
{
"epoch": 0.6327974276527332,
"grad_norm": 1.2788695096969604,
"learning_rate": 0.000187417175940817,
"loss": 2.9683,
"step": 984
},
{
"epoch": 0.6334405144694534,
"grad_norm": 1.1550734043121338,
"learning_rate": 0.00018740431006754584,
"loss": 3.1207,
"step": 985
},
{
"epoch": 0.6340836012861736,
"grad_norm": 1.3394757509231567,
"learning_rate": 0.0001873914441942747,
"loss": 3.0282,
"step": 986
},
{
"epoch": 0.6347266881028939,
"grad_norm": 1.2279311418533325,
"learning_rate": 0.00018737857832100353,
"loss": 2.9223,
"step": 987
},
{
"epoch": 0.6353697749196141,
"grad_norm": 1.3444873094558716,
"learning_rate": 0.0001873657124477324,
"loss": 2.9301,
"step": 988
},
{
"epoch": 0.6360128617363344,
"grad_norm": 1.389906883239746,
"learning_rate": 0.00018735284657446125,
"loss": 3.0233,
"step": 989
},
{
"epoch": 0.6366559485530546,
"grad_norm": 1.3237117528915405,
"learning_rate": 0.0001873399807011901,
"loss": 3.0986,
"step": 990
},
{
"epoch": 0.637299035369775,
"grad_norm": 1.294018268585205,
"learning_rate": 0.00018732711482791895,
"loss": 2.9491,
"step": 991
},
{
"epoch": 0.6379421221864952,
"grad_norm": 1.852734923362732,
"learning_rate": 0.0001873142489546478,
"loss": 3.0225,
"step": 992
},
{
"epoch": 0.6385852090032155,
"grad_norm": 1.240017294883728,
"learning_rate": 0.00018730138308137667,
"loss": 2.9521,
"step": 993
},
{
"epoch": 0.6392282958199357,
"grad_norm": 1.3555036783218384,
"learning_rate": 0.00018728851720810553,
"loss": 3.013,
"step": 994
},
{
"epoch": 0.639871382636656,
"grad_norm": 1.4047218561172485,
"learning_rate": 0.00018727565133483436,
"loss": 3.0727,
"step": 995
},
{
"epoch": 0.6405144694533762,
"grad_norm": 1.4910492897033691,
"learning_rate": 0.00018726278546156322,
"loss": 3.012,
"step": 996
},
{
"epoch": 0.6411575562700965,
"grad_norm": 1.2978317737579346,
"learning_rate": 0.00018724991958829205,
"loss": 3.0042,
"step": 997
},
{
"epoch": 0.6418006430868167,
"grad_norm": 1.492448329925537,
"learning_rate": 0.0001872370537150209,
"loss": 2.9069,
"step": 998
},
{
"epoch": 0.6424437299035369,
"grad_norm": 1.340957522392273,
"learning_rate": 0.00018722418784174977,
"loss": 2.9449,
"step": 999
},
{
"epoch": 0.6430868167202572,
"grad_norm": 1.2786818742752075,
"learning_rate": 0.0001872113219684786,
"loss": 2.9224,
"step": 1000
},
{
"epoch": 0.6430868167202572,
"eval_loss": 2.9839203357696533,
"eval_runtime": 5.4642,
"eval_samples_per_second": 91.505,
"eval_steps_per_second": 45.753,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 15550,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1051904803307520.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}