intervl-2B-full-sft / trainer_state.json
pltops's picture
Upload folder using huggingface_hub
6b204d3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 816,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004920049200492005,
"grad_norm": 483.4133605957031,
"learning_rate": 0.0,
"loss": 0.95,
"step": 1
},
{
"epoch": 0.00984009840098401,
"grad_norm": 414.1131286621094,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7919,
"step": 2
},
{
"epoch": 0.014760147601476014,
"grad_norm": 20.664552688598633,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.2639,
"step": 3
},
{
"epoch": 0.01968019680196802,
"grad_norm": 19.735389709472656,
"learning_rate": 2.4e-05,
"loss": 0.3098,
"step": 4
},
{
"epoch": 0.024600246002460024,
"grad_norm": 630.93115234375,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.3057,
"step": 5
},
{
"epoch": 0.02952029520295203,
"grad_norm": 108.70830535888672,
"learning_rate": 4e-05,
"loss": 0.784,
"step": 6
},
{
"epoch": 0.03444034440344403,
"grad_norm": 25.684120178222656,
"learning_rate": 4.8e-05,
"loss": 0.6977,
"step": 7
},
{
"epoch": 0.03936039360393604,
"grad_norm": 28.066545486450195,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.6476,
"step": 8
},
{
"epoch": 0.04428044280442804,
"grad_norm": 76.08965301513672,
"learning_rate": 6.400000000000001e-05,
"loss": 0.5767,
"step": 9
},
{
"epoch": 0.04920049200492005,
"grad_norm": 14.550333023071289,
"learning_rate": 7.2e-05,
"loss": 0.4199,
"step": 10
},
{
"epoch": 0.05412054120541206,
"grad_norm": 66.70437622070312,
"learning_rate": 8e-05,
"loss": 0.6942,
"step": 11
},
{
"epoch": 0.05904059040590406,
"grad_norm": 73.81010437011719,
"learning_rate": 8.800000000000001e-05,
"loss": 1.517,
"step": 12
},
{
"epoch": 0.06396063960639606,
"grad_norm": 6.138183116912842,
"learning_rate": 9.6e-05,
"loss": 0.3904,
"step": 13
},
{
"epoch": 0.06888068880688807,
"grad_norm": 11.106021881103516,
"learning_rate": 0.00010400000000000001,
"loss": 0.4124,
"step": 14
},
{
"epoch": 0.07380073800738007,
"grad_norm": 12.445630073547363,
"learning_rate": 0.00011200000000000001,
"loss": 0.4418,
"step": 15
},
{
"epoch": 0.07872078720787208,
"grad_norm": 7.307021141052246,
"learning_rate": 0.00012,
"loss": 0.4751,
"step": 16
},
{
"epoch": 0.08364083640836409,
"grad_norm": 19.08457374572754,
"learning_rate": 0.00012800000000000002,
"loss": 0.5671,
"step": 17
},
{
"epoch": 0.08856088560885608,
"grad_norm": 11.03348159790039,
"learning_rate": 0.00013600000000000003,
"loss": 0.4441,
"step": 18
},
{
"epoch": 0.09348093480934809,
"grad_norm": 155.23716735839844,
"learning_rate": 0.000144,
"loss": 0.5707,
"step": 19
},
{
"epoch": 0.0984009840098401,
"grad_norm": 7.583343029022217,
"learning_rate": 0.000152,
"loss": 0.4662,
"step": 20
},
{
"epoch": 0.1033210332103321,
"grad_norm": 6.289183139801025,
"learning_rate": 0.00016,
"loss": 0.4997,
"step": 21
},
{
"epoch": 0.10824108241082411,
"grad_norm": 24.76793098449707,
"learning_rate": 0.000168,
"loss": 0.491,
"step": 22
},
{
"epoch": 0.11316113161131611,
"grad_norm": 17.512184143066406,
"learning_rate": 0.00017600000000000002,
"loss": 0.4472,
"step": 23
},
{
"epoch": 0.11808118081180811,
"grad_norm": 16.47793960571289,
"learning_rate": 0.00018400000000000003,
"loss": 0.5235,
"step": 24
},
{
"epoch": 0.12300123001230012,
"grad_norm": 8.312434196472168,
"learning_rate": 0.000192,
"loss": 0.4818,
"step": 25
},
{
"epoch": 0.12792127921279212,
"grad_norm": 83.81122589111328,
"learning_rate": 0.0002,
"loss": 0.5259,
"step": 26
},
{
"epoch": 0.13284132841328414,
"grad_norm": 8.704654693603516,
"learning_rate": 0.000199999211292062,
"loss": 0.4744,
"step": 27
},
{
"epoch": 0.13776137761377613,
"grad_norm": 2.8565006256103516,
"learning_rate": 0.00019999684518068916,
"loss": 0.4066,
"step": 28
},
{
"epoch": 0.14268142681426815,
"grad_norm": 5.916236877441406,
"learning_rate": 0.00019999290170320485,
"loss": 0.4154,
"step": 29
},
{
"epoch": 0.14760147601476015,
"grad_norm": 5.697567462921143,
"learning_rate": 0.00019998738092181421,
"loss": 0.4639,
"step": 30
},
{
"epoch": 0.15252152521525214,
"grad_norm": 1.615671157836914,
"learning_rate": 0.00019998028292360286,
"loss": 0.4108,
"step": 31
},
{
"epoch": 0.15744157441574416,
"grad_norm": 11.121039390563965,
"learning_rate": 0.00019997160782053578,
"loss": 0.449,
"step": 32
},
{
"epoch": 0.16236162361623616,
"grad_norm": 7.386440277099609,
"learning_rate": 0.00019996135574945544,
"loss": 0.4216,
"step": 33
},
{
"epoch": 0.16728167281672818,
"grad_norm": 84.96491241455078,
"learning_rate": 0.00019994952687207954,
"loss": 0.5282,
"step": 34
},
{
"epoch": 0.17220172201722017,
"grad_norm": 6.692220211029053,
"learning_rate": 0.00019993612137499876,
"loss": 0.5036,
"step": 35
},
{
"epoch": 0.17712177121771217,
"grad_norm": 5.1402363777160645,
"learning_rate": 0.00019992113946967353,
"loss": 0.4041,
"step": 36
},
{
"epoch": 0.1820418204182042,
"grad_norm": 3.2179603576660156,
"learning_rate": 0.00019990458139243077,
"loss": 0.398,
"step": 37
},
{
"epoch": 0.18696186961869618,
"grad_norm": 5.34651517868042,
"learning_rate": 0.00019988644740446022,
"loss": 0.4233,
"step": 38
},
{
"epoch": 0.1918819188191882,
"grad_norm": 4.085568428039551,
"learning_rate": 0.00019986673779181033,
"loss": 0.3935,
"step": 39
},
{
"epoch": 0.1968019680196802,
"grad_norm": 1.318534255027771,
"learning_rate": 0.0001998454528653836,
"loss": 0.3458,
"step": 40
},
{
"epoch": 0.2017220172201722,
"grad_norm": 3.834606409072876,
"learning_rate": 0.0001998225929609319,
"loss": 0.3819,
"step": 41
},
{
"epoch": 0.2066420664206642,
"grad_norm": 4.840269088745117,
"learning_rate": 0.00019979815843905097,
"loss": 0.473,
"step": 42
},
{
"epoch": 0.2115621156211562,
"grad_norm": 498.64990234375,
"learning_rate": 0.0001997721496851748,
"loss": 3.6745,
"step": 43
},
{
"epoch": 0.21648216482164823,
"grad_norm": 4.956181526184082,
"learning_rate": 0.00019974456710956964,
"loss": 0.3385,
"step": 44
},
{
"epoch": 0.22140221402214022,
"grad_norm": 6.580547332763672,
"learning_rate": 0.00019971541114732741,
"loss": 0.4277,
"step": 45
},
{
"epoch": 0.22632226322263221,
"grad_norm": 37.05827713012695,
"learning_rate": 0.0001996846822583589,
"loss": 0.8045,
"step": 46
},
{
"epoch": 0.23124231242312424,
"grad_norm": 5.152987480163574,
"learning_rate": 0.00019965238092738643,
"loss": 0.4173,
"step": 47
},
{
"epoch": 0.23616236162361623,
"grad_norm": 277.78857421875,
"learning_rate": 0.0001996185076639364,
"loss": 0.8626,
"step": 48
},
{
"epoch": 0.24108241082410825,
"grad_norm": 2.9399607181549072,
"learning_rate": 0.00019958306300233098,
"loss": 0.3167,
"step": 49
},
{
"epoch": 0.24600246002460024,
"grad_norm": 3.668168306350708,
"learning_rate": 0.00019954604750167993,
"loss": 0.4422,
"step": 50
},
{
"epoch": 0.25092250922509224,
"grad_norm": 4.103700637817383,
"learning_rate": 0.00019950746174587163,
"loss": 0.3683,
"step": 51
},
{
"epoch": 0.25584255842558423,
"grad_norm": 0.7908763885498047,
"learning_rate": 0.0001994673063435639,
"loss": 0.3834,
"step": 52
},
{
"epoch": 0.2607626076260763,
"grad_norm": 1.0205233097076416,
"learning_rate": 0.0001994255819281744,
"loss": 0.375,
"step": 53
},
{
"epoch": 0.2656826568265683,
"grad_norm": 3.6355137825012207,
"learning_rate": 0.0001993822891578708,
"loss": 0.4557,
"step": 54
},
{
"epoch": 0.27060270602706027,
"grad_norm": 6.3725409507751465,
"learning_rate": 0.00019933742871556,
"loss": 0.4183,
"step": 55
},
{
"epoch": 0.27552275522755226,
"grad_norm": 6.519746780395508,
"learning_rate": 0.00019929100130887782,
"loss": 0.4416,
"step": 56
},
{
"epoch": 0.28044280442804426,
"grad_norm": 3.950495719909668,
"learning_rate": 0.0001992430076701775,
"loss": 0.4255,
"step": 57
},
{
"epoch": 0.2853628536285363,
"grad_norm": 2.0773677825927734,
"learning_rate": 0.00019919344855651833,
"loss": 0.3393,
"step": 58
},
{
"epoch": 0.2902829028290283,
"grad_norm": 8.755096435546875,
"learning_rate": 0.00019914232474965365,
"loss": 0.5312,
"step": 59
},
{
"epoch": 0.2952029520295203,
"grad_norm": 4.103138446807861,
"learning_rate": 0.00019908963705601846,
"loss": 0.4104,
"step": 60
},
{
"epoch": 0.3001230012300123,
"grad_norm": 3.1862292289733887,
"learning_rate": 0.0001990353863067169,
"loss": 0.4211,
"step": 61
},
{
"epoch": 0.3050430504305043,
"grad_norm": 3.5777299404144287,
"learning_rate": 0.00019897957335750878,
"loss": 0.38,
"step": 62
},
{
"epoch": 0.30996309963099633,
"grad_norm": 3.0276339054107666,
"learning_rate": 0.00019892219908879653,
"loss": 0.4561,
"step": 63
},
{
"epoch": 0.3148831488314883,
"grad_norm": 1.853022575378418,
"learning_rate": 0.00019886326440561093,
"loss": 0.3874,
"step": 64
},
{
"epoch": 0.3198031980319803,
"grad_norm": 5.521330833435059,
"learning_rate": 0.00019880277023759702,
"loss": 0.459,
"step": 65
},
{
"epoch": 0.3247232472324723,
"grad_norm": 8.374741554260254,
"learning_rate": 0.0001987407175389994,
"loss": 0.4025,
"step": 66
},
{
"epoch": 0.3296432964329643,
"grad_norm": 3.8265085220336914,
"learning_rate": 0.0001986771072886472,
"loss": 0.4654,
"step": 67
},
{
"epoch": 0.33456334563345635,
"grad_norm": 2.002042770385742,
"learning_rate": 0.00019861194048993863,
"loss": 0.312,
"step": 68
},
{
"epoch": 0.33948339483394835,
"grad_norm": 6.2479634284973145,
"learning_rate": 0.0001985452181708251,
"loss": 0.4739,
"step": 69
},
{
"epoch": 0.34440344403444034,
"grad_norm": 3.072579860687256,
"learning_rate": 0.00019847694138379506,
"loss": 0.4282,
"step": 70
},
{
"epoch": 0.34932349323493234,
"grad_norm": 1.4464001655578613,
"learning_rate": 0.0001984071112058574,
"loss": 0.4165,
"step": 71
},
{
"epoch": 0.35424354243542433,
"grad_norm": 1.2664532661437988,
"learning_rate": 0.00019833572873852444,
"loss": 0.4575,
"step": 72
},
{
"epoch": 0.3591635916359164,
"grad_norm": 9.184704780578613,
"learning_rate": 0.00019826279510779454,
"loss": 0.4957,
"step": 73
},
{
"epoch": 0.3640836408364084,
"grad_norm": 8.479774475097656,
"learning_rate": 0.00019818831146413434,
"loss": 0.5062,
"step": 74
},
{
"epoch": 0.36900369003690037,
"grad_norm": 6.585694789886475,
"learning_rate": 0.0001981122789824607,
"loss": 0.4461,
"step": 75
},
{
"epoch": 0.37392373923739236,
"grad_norm": 2.46947979927063,
"learning_rate": 0.0001980346988621221,
"loss": 0.4175,
"step": 76
},
{
"epoch": 0.37884378843788435,
"grad_norm": 5.467379570007324,
"learning_rate": 0.00019795557232687956,
"loss": 0.4634,
"step": 77
},
{
"epoch": 0.3837638376383764,
"grad_norm": 3.7511723041534424,
"learning_rate": 0.0001978749006248877,
"loss": 0.466,
"step": 78
},
{
"epoch": 0.3886838868388684,
"grad_norm": 4.01120138168335,
"learning_rate": 0.00019779268502867473,
"loss": 0.5087,
"step": 79
},
{
"epoch": 0.3936039360393604,
"grad_norm": 3.0289227962493896,
"learning_rate": 0.0001977089268351225,
"loss": 0.4315,
"step": 80
},
{
"epoch": 0.3985239852398524,
"grad_norm": 2.6868069171905518,
"learning_rate": 0.00019762362736544607,
"loss": 0.3795,
"step": 81
},
{
"epoch": 0.4034440344403444,
"grad_norm": 3.252519130706787,
"learning_rate": 0.00019753678796517282,
"loss": 0.3436,
"step": 82
},
{
"epoch": 0.40836408364083643,
"grad_norm": 3.5098648071289062,
"learning_rate": 0.00019744841000412123,
"loss": 0.3921,
"step": 83
},
{
"epoch": 0.4132841328413284,
"grad_norm": 4.654256820678711,
"learning_rate": 0.00019735849487637929,
"loss": 0.4444,
"step": 84
},
{
"epoch": 0.4182041820418204,
"grad_norm": 1.7671858072280884,
"learning_rate": 0.0001972670440002825,
"loss": 0.3749,
"step": 85
},
{
"epoch": 0.4231242312423124,
"grad_norm": 2.725391387939453,
"learning_rate": 0.00019717405881839145,
"loss": 0.4183,
"step": 86
},
{
"epoch": 0.4280442804428044,
"grad_norm": 1.985857605934143,
"learning_rate": 0.00019707954079746927,
"loss": 0.3992,
"step": 87
},
{
"epoch": 0.43296432964329645,
"grad_norm": 4.41717004776001,
"learning_rate": 0.00019698349142845814,
"loss": 0.4746,
"step": 88
},
{
"epoch": 0.43788437884378845,
"grad_norm": 4.98541784286499,
"learning_rate": 0.00019688591222645607,
"loss": 0.3828,
"step": 89
},
{
"epoch": 0.44280442804428044,
"grad_norm": 5.29671573638916,
"learning_rate": 0.00019678680473069293,
"loss": 0.3513,
"step": 90
},
{
"epoch": 0.44772447724477243,
"grad_norm": 2.3669795989990234,
"learning_rate": 0.00019668617050450603,
"loss": 0.3433,
"step": 91
},
{
"epoch": 0.45264452644526443,
"grad_norm": 3.3942222595214844,
"learning_rate": 0.00019658401113531565,
"loss": 0.4033,
"step": 92
},
{
"epoch": 0.4575645756457565,
"grad_norm": 2.1776537895202637,
"learning_rate": 0.00019648032823459994,
"loss": 0.2924,
"step": 93
},
{
"epoch": 0.46248462484624847,
"grad_norm": 3.5817902088165283,
"learning_rate": 0.00019637512343786937,
"loss": 0.3886,
"step": 94
},
{
"epoch": 0.46740467404674046,
"grad_norm": 5.881927490234375,
"learning_rate": 0.00019626839840464119,
"loss": 0.5516,
"step": 95
},
{
"epoch": 0.47232472324723246,
"grad_norm": 3.89084529876709,
"learning_rate": 0.0001961601548184129,
"loss": 0.5291,
"step": 96
},
{
"epoch": 0.47724477244772445,
"grad_norm": 1.7908971309661865,
"learning_rate": 0.00019605039438663614,
"loss": 0.4671,
"step": 97
},
{
"epoch": 0.4821648216482165,
"grad_norm": 3.8980045318603516,
"learning_rate": 0.0001959391188406893,
"loss": 0.4105,
"step": 98
},
{
"epoch": 0.4870848708487085,
"grad_norm": 5.375885486602783,
"learning_rate": 0.00019582632993585052,
"loss": 0.4652,
"step": 99
},
{
"epoch": 0.4920049200492005,
"grad_norm": 4.329046249389648,
"learning_rate": 0.00019571202945126994,
"loss": 0.4507,
"step": 100
},
{
"epoch": 0.4969249692496925,
"grad_norm": 3.3222126960754395,
"learning_rate": 0.0001955962191899415,
"loss": 0.43,
"step": 101
},
{
"epoch": 0.5018450184501845,
"grad_norm": 5.076950550079346,
"learning_rate": 0.00019547890097867468,
"loss": 0.3756,
"step": 102
},
{
"epoch": 0.5067650676506765,
"grad_norm": 4.1895294189453125,
"learning_rate": 0.00019536007666806556,
"loss": 0.4545,
"step": 103
},
{
"epoch": 0.5116851168511685,
"grad_norm": 2.738429069519043,
"learning_rate": 0.00019523974813246767,
"loss": 0.3971,
"step": 104
},
{
"epoch": 0.5166051660516605,
"grad_norm": 1.7775121927261353,
"learning_rate": 0.00019511791726996243,
"loss": 0.3815,
"step": 105
},
{
"epoch": 0.5215252152521526,
"grad_norm": 0.5655261874198914,
"learning_rate": 0.0001949945860023292,
"loss": 0.308,
"step": 106
},
{
"epoch": 0.5264452644526445,
"grad_norm": 2.861567258834839,
"learning_rate": 0.00019486975627501502,
"loss": 0.349,
"step": 107
},
{
"epoch": 0.5313653136531366,
"grad_norm": 0.9508899450302124,
"learning_rate": 0.0001947434300571038,
"loss": 0.3095,
"step": 108
},
{
"epoch": 0.5362853628536285,
"grad_norm": 4.669578552246094,
"learning_rate": 0.00019461560934128533,
"loss": 0.5094,
"step": 109
},
{
"epoch": 0.5412054120541205,
"grad_norm": 0.9468059539794922,
"learning_rate": 0.0001944862961438239,
"loss": 0.2996,
"step": 110
},
{
"epoch": 0.5461254612546126,
"grad_norm": 3.7661190032958984,
"learning_rate": 0.00019435549250452645,
"loss": 0.3556,
"step": 111
},
{
"epoch": 0.5510455104551045,
"grad_norm": 3.7507622241973877,
"learning_rate": 0.0001942232004867103,
"loss": 0.4196,
"step": 112
},
{
"epoch": 0.5559655596555966,
"grad_norm": 1.455446481704712,
"learning_rate": 0.0001940894221771708,
"loss": 0.3751,
"step": 113
},
{
"epoch": 0.5608856088560885,
"grad_norm": 5.634677886962891,
"learning_rate": 0.00019395415968614813,
"loss": 0.4659,
"step": 114
},
{
"epoch": 0.5658056580565806,
"grad_norm": 1.9559741020202637,
"learning_rate": 0.00019381741514729443,
"loss": 0.4113,
"step": 115
},
{
"epoch": 0.5707257072570726,
"grad_norm": 2.967988967895508,
"learning_rate": 0.0001936791907176397,
"loss": 0.4565,
"step": 116
},
{
"epoch": 0.5756457564575646,
"grad_norm": 6.420986175537109,
"learning_rate": 0.00019353948857755803,
"loss": 0.4465,
"step": 117
},
{
"epoch": 0.5805658056580566,
"grad_norm": 5.503588676452637,
"learning_rate": 0.00019339831093073318,
"loss": 0.4705,
"step": 118
},
{
"epoch": 0.5854858548585485,
"grad_norm": 5.966702461242676,
"learning_rate": 0.00019325566000412376,
"loss": 0.4205,
"step": 119
},
{
"epoch": 0.5904059040590406,
"grad_norm": 2.865349054336548,
"learning_rate": 0.0001931115380479281,
"loss": 0.3988,
"step": 120
},
{
"epoch": 0.5953259532595326,
"grad_norm": 1.7353636026382446,
"learning_rate": 0.00019296594733554892,
"loss": 0.4364,
"step": 121
},
{
"epoch": 0.6002460024600246,
"grad_norm": 3.236239194869995,
"learning_rate": 0.0001928188901635571,
"loss": 0.4553,
"step": 122
},
{
"epoch": 0.6051660516605166,
"grad_norm": 2.1501266956329346,
"learning_rate": 0.00019267036885165588,
"loss": 0.4121,
"step": 123
},
{
"epoch": 0.6100861008610086,
"grad_norm": 2.416868209838867,
"learning_rate": 0.00019252038574264405,
"loss": 0.3616,
"step": 124
},
{
"epoch": 0.6150061500615006,
"grad_norm": 2.7004334926605225,
"learning_rate": 0.00019236894320237894,
"loss": 0.3443,
"step": 125
},
{
"epoch": 0.6199261992619927,
"grad_norm": 1.992761492729187,
"learning_rate": 0.00019221604361973919,
"loss": 0.4068,
"step": 126
},
{
"epoch": 0.6248462484624846,
"grad_norm": 0.9761249423027039,
"learning_rate": 0.00019206168940658712,
"loss": 0.3952,
"step": 127
},
{
"epoch": 0.6297662976629766,
"grad_norm": 1.5893077850341797,
"learning_rate": 0.00019190588299773062,
"loss": 0.383,
"step": 128
},
{
"epoch": 0.6346863468634686,
"grad_norm": 1.1404695510864258,
"learning_rate": 0.00019174862685088472,
"loss": 0.4274,
"step": 129
},
{
"epoch": 0.6396063960639606,
"grad_norm": 1.63871431350708,
"learning_rate": 0.0001915899234466328,
"loss": 0.3883,
"step": 130
},
{
"epoch": 0.6445264452644527,
"grad_norm": 1.9504516124725342,
"learning_rate": 0.00019142977528838762,
"loss": 0.3932,
"step": 131
},
{
"epoch": 0.6494464944649446,
"grad_norm": 1.6097129583358765,
"learning_rate": 0.0001912681849023516,
"loss": 0.4028,
"step": 132
},
{
"epoch": 0.6543665436654367,
"grad_norm": 2.071103572845459,
"learning_rate": 0.00019110515483747716,
"loss": 0.4016,
"step": 133
},
{
"epoch": 0.6592865928659286,
"grad_norm": 2.177647352218628,
"learning_rate": 0.0001909406876654264,
"loss": 0.3975,
"step": 134
},
{
"epoch": 0.6642066420664207,
"grad_norm": 1.2018887996673584,
"learning_rate": 0.00019077478598053063,
"loss": 0.3846,
"step": 135
},
{
"epoch": 0.6691266912669127,
"grad_norm": 3.688076972961426,
"learning_rate": 0.00019060745239974936,
"loss": 0.4432,
"step": 136
},
{
"epoch": 0.6740467404674046,
"grad_norm": 2.5613861083984375,
"learning_rate": 0.0001904386895626291,
"loss": 0.3704,
"step": 137
},
{
"epoch": 0.6789667896678967,
"grad_norm": 2.255295753479004,
"learning_rate": 0.00019026850013126157,
"loss": 0.3267,
"step": 138
},
{
"epoch": 0.6838868388683886,
"grad_norm": 3.4777557849884033,
"learning_rate": 0.0001900968867902419,
"loss": 0.5534,
"step": 139
},
{
"epoch": 0.6888068880688807,
"grad_norm": 2.0409767627716064,
"learning_rate": 0.00018992385224662623,
"loss": 0.4607,
"step": 140
},
{
"epoch": 0.6937269372693727,
"grad_norm": 2.408515691757202,
"learning_rate": 0.00018974939922988883,
"loss": 0.3969,
"step": 141
},
{
"epoch": 0.6986469864698647,
"grad_norm": 1.8554408550262451,
"learning_rate": 0.00018957353049187936,
"loss": 0.3385,
"step": 142
},
{
"epoch": 0.7035670356703567,
"grad_norm": 3.487424612045288,
"learning_rate": 0.00018939624880677918,
"loss": 0.4132,
"step": 143
},
{
"epoch": 0.7084870848708487,
"grad_norm": 3.606100559234619,
"learning_rate": 0.0001892175569710577,
"loss": 0.4644,
"step": 144
},
{
"epoch": 0.7134071340713407,
"grad_norm": 3.1930618286132812,
"learning_rate": 0.00018903745780342839,
"loss": 0.4235,
"step": 145
},
{
"epoch": 0.7183271832718328,
"grad_norm": 3.3350257873535156,
"learning_rate": 0.00018885595414480405,
"loss": 0.4837,
"step": 146
},
{
"epoch": 0.7232472324723247,
"grad_norm": 2.0124611854553223,
"learning_rate": 0.0001886730488582522,
"loss": 0.4149,
"step": 147
},
{
"epoch": 0.7281672816728167,
"grad_norm": 3.699632167816162,
"learning_rate": 0.00018848874482894993,
"loss": 0.376,
"step": 148
},
{
"epoch": 0.7330873308733087,
"grad_norm": 2.4049108028411865,
"learning_rate": 0.00018830304496413822,
"loss": 0.4215,
"step": 149
},
{
"epoch": 0.7380073800738007,
"grad_norm": 2.440385341644287,
"learning_rate": 0.00018811595219307622,
"loss": 0.4041,
"step": 150
},
{
"epoch": 0.7429274292742928,
"grad_norm": 2.6796436309814453,
"learning_rate": 0.000187927469466995,
"loss": 0.3949,
"step": 151
},
{
"epoch": 0.7478474784747847,
"grad_norm": 1.940114974975586,
"learning_rate": 0.00018773759975905098,
"loss": 0.4411,
"step": 152
},
{
"epoch": 0.7527675276752768,
"grad_norm": 3.338021755218506,
"learning_rate": 0.00018754634606427914,
"loss": 0.4607,
"step": 153
},
{
"epoch": 0.7576875768757687,
"grad_norm": 2.3407375812530518,
"learning_rate": 0.00018735371139954558,
"loss": 0.416,
"step": 154
},
{
"epoch": 0.7626076260762608,
"grad_norm": 1.1078053712844849,
"learning_rate": 0.0001871596988035001,
"loss": 0.34,
"step": 155
},
{
"epoch": 0.7675276752767528,
"grad_norm": 1.750227928161621,
"learning_rate": 0.00018696431133652817,
"loss": 0.3084,
"step": 156
},
{
"epoch": 0.7724477244772447,
"grad_norm": 2.9180145263671875,
"learning_rate": 0.00018676755208070275,
"loss": 0.4109,
"step": 157
},
{
"epoch": 0.7773677736777368,
"grad_norm": 2.675165891647339,
"learning_rate": 0.00018656942413973555,
"loss": 0.4438,
"step": 158
},
{
"epoch": 0.7822878228782287,
"grad_norm": 3.3854095935821533,
"learning_rate": 0.0001863699306389282,
"loss": 0.4418,
"step": 159
},
{
"epoch": 0.7872078720787208,
"grad_norm": 0.5620162487030029,
"learning_rate": 0.0001861690747251228,
"loss": 0.3806,
"step": 160
},
{
"epoch": 0.7921279212792128,
"grad_norm": 1.223493218421936,
"learning_rate": 0.00018596685956665245,
"loss": 0.3758,
"step": 161
},
{
"epoch": 0.7970479704797048,
"grad_norm": 1.9586799144744873,
"learning_rate": 0.00018576328835329117,
"loss": 0.3354,
"step": 162
},
{
"epoch": 0.8019680196801968,
"grad_norm": 3.2813546657562256,
"learning_rate": 0.00018555836429620358,
"loss": 0.4063,
"step": 163
},
{
"epoch": 0.8068880688806888,
"grad_norm": 2.182837963104248,
"learning_rate": 0.00018535209062789433,
"loss": 0.3697,
"step": 164
},
{
"epoch": 0.8118081180811808,
"grad_norm": 1.3659495115280151,
"learning_rate": 0.00018514447060215698,
"loss": 0.3351,
"step": 165
},
{
"epoch": 0.8167281672816729,
"grad_norm": 1.170257329940796,
"learning_rate": 0.00018493550749402278,
"loss": 0.3225,
"step": 166
},
{
"epoch": 0.8216482164821648,
"grad_norm": 4.230517387390137,
"learning_rate": 0.00018472520459970898,
"loss": 0.4448,
"step": 167
},
{
"epoch": 0.8265682656826568,
"grad_norm": 2.8111300468444824,
"learning_rate": 0.0001845135652365668,
"loss": 0.3761,
"step": 168
},
{
"epoch": 0.8314883148831488,
"grad_norm": 5.860655307769775,
"learning_rate": 0.00018430059274302917,
"loss": 0.4974,
"step": 169
},
{
"epoch": 0.8364083640836408,
"grad_norm": 3.6116364002227783,
"learning_rate": 0.00018408629047855804,
"loss": 0.4327,
"step": 170
},
{
"epoch": 0.8413284132841329,
"grad_norm": 2.6450071334838867,
"learning_rate": 0.00018387066182359133,
"loss": 0.3813,
"step": 171
},
{
"epoch": 0.8462484624846248,
"grad_norm": 2.9791674613952637,
"learning_rate": 0.00018365371017948964,
"loss": 0.4184,
"step": 172
},
{
"epoch": 0.8511685116851169,
"grad_norm": 1.7529772520065308,
"learning_rate": 0.00018343543896848273,
"loss": 0.3489,
"step": 173
},
{
"epoch": 0.8560885608856088,
"grad_norm": 3.5216493606567383,
"learning_rate": 0.00018321585163361527,
"loss": 0.3988,
"step": 174
},
{
"epoch": 0.8610086100861009,
"grad_norm": 2.470106840133667,
"learning_rate": 0.00018299495163869275,
"loss": 0.3919,
"step": 175
},
{
"epoch": 0.8659286592865929,
"grad_norm": 3.1759798526763916,
"learning_rate": 0.0001827727424682268,
"loss": 0.3853,
"step": 176
},
{
"epoch": 0.8708487084870848,
"grad_norm": 3.581413745880127,
"learning_rate": 0.00018254922762738008,
"loss": 0.4041,
"step": 177
},
{
"epoch": 0.8757687576875769,
"grad_norm": 1.35221266746521,
"learning_rate": 0.00018232441064191125,
"loss": 0.3564,
"step": 178
},
{
"epoch": 0.8806888068880688,
"grad_norm": 2.2829418182373047,
"learning_rate": 0.0001820982950581191,
"loss": 0.443,
"step": 179
},
{
"epoch": 0.8856088560885609,
"grad_norm": 2.931074619293213,
"learning_rate": 0.00018187088444278674,
"loss": 0.4088,
"step": 180
},
{
"epoch": 0.8905289052890529,
"grad_norm": 3.7436723709106445,
"learning_rate": 0.00018164218238312535,
"loss": 0.4888,
"step": 181
},
{
"epoch": 0.8954489544895449,
"grad_norm": 2.7169697284698486,
"learning_rate": 0.00018141219248671745,
"loss": 0.4432,
"step": 182
},
{
"epoch": 0.9003690036900369,
"grad_norm": 1.1118288040161133,
"learning_rate": 0.00018118091838146029,
"loss": 0.3677,
"step": 183
},
{
"epoch": 0.9052890528905289,
"grad_norm": 3.0052273273468018,
"learning_rate": 0.00018094836371550824,
"loss": 0.3169,
"step": 184
},
{
"epoch": 0.9102091020910209,
"grad_norm": 2.911255121231079,
"learning_rate": 0.00018071453215721554,
"loss": 0.4721,
"step": 185
},
{
"epoch": 0.915129151291513,
"grad_norm": 2.483900547027588,
"learning_rate": 0.00018047942739507836,
"loss": 0.3812,
"step": 186
},
{
"epoch": 0.9200492004920049,
"grad_norm": 1.8842488527297974,
"learning_rate": 0.00018024305313767646,
"loss": 0.436,
"step": 187
},
{
"epoch": 0.9249692496924969,
"grad_norm": 1.5961415767669678,
"learning_rate": 0.000180005413113615,
"loss": 0.3808,
"step": 188
},
{
"epoch": 0.9298892988929889,
"grad_norm": 2.0334715843200684,
"learning_rate": 0.00017976651107146533,
"loss": 0.4548,
"step": 189
},
{
"epoch": 0.9348093480934809,
"grad_norm": 1.6422673463821411,
"learning_rate": 0.0001795263507797063,
"loss": 0.406,
"step": 190
},
{
"epoch": 0.939729397293973,
"grad_norm": 2.5241055488586426,
"learning_rate": 0.00017928493602666445,
"loss": 0.3661,
"step": 191
},
{
"epoch": 0.9446494464944649,
"grad_norm": 2.3822920322418213,
"learning_rate": 0.00017904227062045437,
"loss": 0.4581,
"step": 192
},
{
"epoch": 0.949569495694957,
"grad_norm": 3.649919271469116,
"learning_rate": 0.00017879835838891875,
"loss": 0.4743,
"step": 193
},
{
"epoch": 0.9544895448954489,
"grad_norm": 1.9197454452514648,
"learning_rate": 0.00017855320317956784,
"loss": 0.3857,
"step": 194
},
{
"epoch": 0.959409594095941,
"grad_norm": 1.4304083585739136,
"learning_rate": 0.00017830680885951887,
"loss": 0.3935,
"step": 195
},
{
"epoch": 0.964329643296433,
"grad_norm": 0.4576971232891083,
"learning_rate": 0.00017805917931543492,
"loss": 0.4147,
"step": 196
},
{
"epoch": 0.9692496924969249,
"grad_norm": 1.4386779069900513,
"learning_rate": 0.00017781031845346375,
"loss": 0.3927,
"step": 197
},
{
"epoch": 0.974169741697417,
"grad_norm": 1.496974229812622,
"learning_rate": 0.00017756023019917607,
"loss": 0.3666,
"step": 198
},
{
"epoch": 0.9790897908979089,
"grad_norm": 1.221921682357788,
"learning_rate": 0.00017730891849750377,
"loss": 0.3938,
"step": 199
},
{
"epoch": 0.984009840098401,
"grad_norm": 1.7949525117874146,
"learning_rate": 0.0001770563873126775,
"loss": 0.4118,
"step": 200
},
{
"epoch": 0.988929889298893,
"grad_norm": 1.1061089038848877,
"learning_rate": 0.0001768026406281642,
"loss": 0.4086,
"step": 201
},
{
"epoch": 0.993849938499385,
"grad_norm": 3.217977523803711,
"learning_rate": 0.00017654768244660448,
"loss": 0.4018,
"step": 202
},
{
"epoch": 0.998769987699877,
"grad_norm": 0.9173564314842224,
"learning_rate": 0.00017629151678974907,
"loss": 0.3952,
"step": 203
},
{
"epoch": 1.0,
"grad_norm": 5.783997058868408,
"learning_rate": 0.00017603414769839577,
"loss": 0.375,
"step": 204
},
{
"epoch": 1.004920049200492,
"grad_norm": 3.309582471847534,
"learning_rate": 0.00017577557923232546,
"loss": 0.4257,
"step": 205
},
{
"epoch": 1.009840098400984,
"grad_norm": 1.2689415216445923,
"learning_rate": 0.00017551581547023819,
"loss": 0.4078,
"step": 206
},
{
"epoch": 1.014760147601476,
"grad_norm": 1.5618160963058472,
"learning_rate": 0.00017525486050968875,
"loss": 0.3948,
"step": 207
},
{
"epoch": 1.019680196801968,
"grad_norm": 2.377791166305542,
"learning_rate": 0.00017499271846702213,
"loss": 0.3407,
"step": 208
},
{
"epoch": 1.0246002460024601,
"grad_norm": 1.7102715969085693,
"learning_rate": 0.00017472939347730856,
"loss": 0.3997,
"step": 209
},
{
"epoch": 1.029520295202952,
"grad_norm": 1.2720469236373901,
"learning_rate": 0.0001744648896942782,
"loss": 0.339,
"step": 210
},
{
"epoch": 1.034440344403444,
"grad_norm": 2.7468247413635254,
"learning_rate": 0.00017419921129025576,
"loss": 0.3818,
"step": 211
},
{
"epoch": 1.039360393603936,
"grad_norm": 3.501011371612549,
"learning_rate": 0.0001739323624560945,
"loss": 0.451,
"step": 212
},
{
"epoch": 1.044280442804428,
"grad_norm": 1.6358418464660645,
"learning_rate": 0.00017366434740111037,
"loss": 0.3493,
"step": 213
},
{
"epoch": 1.04920049200492,
"grad_norm": 3.540642023086548,
"learning_rate": 0.00017339517035301532,
"loss": 0.455,
"step": 214
},
{
"epoch": 1.054120541205412,
"grad_norm": 2.093965530395508,
"learning_rate": 0.00017312483555785086,
"loss": 0.3673,
"step": 215
},
{
"epoch": 1.0590405904059041,
"grad_norm": 4.747845649719238,
"learning_rate": 0.000172853347279921,
"loss": 0.5216,
"step": 216
},
{
"epoch": 1.063960639606396,
"grad_norm": 2.5414655208587646,
"learning_rate": 0.00017258070980172494,
"loss": 0.4571,
"step": 217
},
{
"epoch": 1.068880688806888,
"grad_norm": 1.3232766389846802,
"learning_rate": 0.0001723069274238895,
"loss": 0.4014,
"step": 218
},
{
"epoch": 1.07380073800738,
"grad_norm": 2.045196771621704,
"learning_rate": 0.0001720320044651014,
"loss": 0.4119,
"step": 219
},
{
"epoch": 1.0787207872078721,
"grad_norm": 5.2392096519470215,
"learning_rate": 0.00017175594526203905,
"loss": 0.3691,
"step": 220
},
{
"epoch": 1.083640836408364,
"grad_norm": 2.589878797531128,
"learning_rate": 0.00017147875416930416,
"loss": 0.4317,
"step": 221
},
{
"epoch": 1.088560885608856,
"grad_norm": 1.5000386238098145,
"learning_rate": 0.00017120043555935298,
"loss": 0.4135,
"step": 222
},
{
"epoch": 1.0934809348093482,
"grad_norm": 0.8919417262077332,
"learning_rate": 0.00017092099382242748,
"loss": 0.4183,
"step": 223
},
{
"epoch": 1.09840098400984,
"grad_norm": 1.059650182723999,
"learning_rate": 0.00017064043336648599,
"loss": 0.3791,
"step": 224
},
{
"epoch": 1.103321033210332,
"grad_norm": 1.8085955381393433,
"learning_rate": 0.0001703587586171337,
"loss": 0.3893,
"step": 225
},
{
"epoch": 1.1082410824108242,
"grad_norm": 2.2094881534576416,
"learning_rate": 0.00017007597401755276,
"loss": 0.3871,
"step": 226
},
{
"epoch": 1.1131611316113161,
"grad_norm": 3.2818965911865234,
"learning_rate": 0.00016979208402843237,
"loss": 0.4412,
"step": 227
},
{
"epoch": 1.118081180811808,
"grad_norm": 1.4197732210159302,
"learning_rate": 0.00016950709312789833,
"loss": 0.3248,
"step": 228
},
{
"epoch": 1.1230012300123002,
"grad_norm": 3.690911054611206,
"learning_rate": 0.00016922100581144228,
"loss": 0.4552,
"step": 229
},
{
"epoch": 1.1279212792127922,
"grad_norm": 4.1127424240112305,
"learning_rate": 0.00016893382659185105,
"loss": 0.4887,
"step": 230
},
{
"epoch": 1.132841328413284,
"grad_norm": 2.658750295639038,
"learning_rate": 0.00016864555999913518,
"loss": 0.4037,
"step": 231
},
{
"epoch": 1.137761377613776,
"grad_norm": 1.4189069271087646,
"learning_rate": 0.0001683562105804577,
"loss": 0.3705,
"step": 232
},
{
"epoch": 1.1426814268142682,
"grad_norm": 2.599860191345215,
"learning_rate": 0.00016806578290006225,
"loss": 0.3975,
"step": 233
},
{
"epoch": 1.1476014760147601,
"grad_norm": 3.787053108215332,
"learning_rate": 0.0001677742815392012,
"loss": 0.4294,
"step": 234
},
{
"epoch": 1.152521525215252,
"grad_norm": 3.4738941192626953,
"learning_rate": 0.00016748171109606328,
"loss": 0.3847,
"step": 235
},
{
"epoch": 1.1574415744157442,
"grad_norm": 2.0890064239501953,
"learning_rate": 0.00016718807618570106,
"loss": 0.4156,
"step": 236
},
{
"epoch": 1.1623616236162362,
"grad_norm": 2.275296211242676,
"learning_rate": 0.00016689338143995833,
"loss": 0.4598,
"step": 237
},
{
"epoch": 1.1672816728167281,
"grad_norm": 0.8225153088569641,
"learning_rate": 0.00016659763150739677,
"loss": 0.3495,
"step": 238
},
{
"epoch": 1.17220172201722,
"grad_norm": 0.9762566685676575,
"learning_rate": 0.00016630083105322266,
"loss": 0.3705,
"step": 239
},
{
"epoch": 1.1771217712177122,
"grad_norm": 3.3081791400909424,
"learning_rate": 0.00016600298475921365,
"loss": 0.4167,
"step": 240
},
{
"epoch": 1.1820418204182042,
"grad_norm": 4.026612281799316,
"learning_rate": 0.00016570409732364437,
"loss": 0.4859,
"step": 241
},
{
"epoch": 1.186961869618696,
"grad_norm": 2.193952798843384,
"learning_rate": 0.0001654041734612127,
"loss": 0.4207,
"step": 242
},
{
"epoch": 1.1918819188191883,
"grad_norm": 2.2682714462280273,
"learning_rate": 0.00016510321790296525,
"loss": 0.4344,
"step": 243
},
{
"epoch": 1.1968019680196802,
"grad_norm": 3.3938522338867188,
"learning_rate": 0.00016480123539622281,
"loss": 0.4628,
"step": 244
},
{
"epoch": 1.2017220172201721,
"grad_norm": 4.911561489105225,
"learning_rate": 0.00016449823070450531,
"loss": 0.3449,
"step": 245
},
{
"epoch": 1.2066420664206643,
"grad_norm": 2.2653610706329346,
"learning_rate": 0.00016419420860745699,
"loss": 0.3965,
"step": 246
},
{
"epoch": 1.2115621156211562,
"grad_norm": 1.626495361328125,
"learning_rate": 0.00016388917390077054,
"loss": 0.3818,
"step": 247
},
{
"epoch": 1.2164821648216482,
"grad_norm": 2.9067697525024414,
"learning_rate": 0.00016358313139611195,
"loss": 0.4184,
"step": 248
},
{
"epoch": 1.2214022140221403,
"grad_norm": 1.6488162279129028,
"learning_rate": 0.0001632760859210442,
"loss": 0.3561,
"step": 249
},
{
"epoch": 1.2263222632226323,
"grad_norm": 1.5693081617355347,
"learning_rate": 0.00016296804231895142,
"loss": 0.404,
"step": 250
},
{
"epoch": 1.2312423124231242,
"grad_norm": 2.674132823944092,
"learning_rate": 0.00016265900544896225,
"loss": 0.4402,
"step": 251
},
{
"epoch": 1.2361623616236161,
"grad_norm": 1.432892918586731,
"learning_rate": 0.00016234898018587337,
"loss": 0.3073,
"step": 252
},
{
"epoch": 1.2410824108241083,
"grad_norm": 1.0736567974090576,
"learning_rate": 0.0001620379714200725,
"loss": 0.3551,
"step": 253
},
{
"epoch": 1.2460024600246002,
"grad_norm": 2.692246675491333,
"learning_rate": 0.00016172598405746124,
"loss": 0.4585,
"step": 254
},
{
"epoch": 1.2509225092250922,
"grad_norm": 1.1363232135772705,
"learning_rate": 0.00016141302301937786,
"loss": 0.3566,
"step": 255
},
{
"epoch": 1.2558425584255843,
"grad_norm": 2.9427497386932373,
"learning_rate": 0.0001610990932425194,
"loss": 0.4541,
"step": 256
},
{
"epoch": 1.2607626076260763,
"grad_norm": 1.8412046432495117,
"learning_rate": 0.00016078419967886402,
"loss": 0.4018,
"step": 257
},
{
"epoch": 1.2656826568265682,
"grad_norm": 1.6179234981536865,
"learning_rate": 0.0001604683472955928,
"loss": 0.4115,
"step": 258
},
{
"epoch": 1.2706027060270602,
"grad_norm": 1.2234046459197998,
"learning_rate": 0.00016015154107501133,
"loss": 0.4339,
"step": 259
},
{
"epoch": 1.2755227552275523,
"grad_norm": 1.4952470064163208,
"learning_rate": 0.00015983378601447127,
"loss": 0.4079,
"step": 260
},
{
"epoch": 1.2804428044280443,
"grad_norm": 0.6399968266487122,
"learning_rate": 0.0001595150871262914,
"loss": 0.4262,
"step": 261
},
{
"epoch": 1.2853628536285364,
"grad_norm": 2.328315258026123,
"learning_rate": 0.00015919544943767856,
"loss": 0.4236,
"step": 262
},
{
"epoch": 1.2902829028290284,
"grad_norm": 2.3608176708221436,
"learning_rate": 0.00015887487799064838,
"loss": 0.3888,
"step": 263
},
{
"epoch": 1.2952029520295203,
"grad_norm": 1.258406639099121,
"learning_rate": 0.00015855337784194577,
"loss": 0.405,
"step": 264
},
{
"epoch": 1.3001230012300122,
"grad_norm": 1.4067128896713257,
"learning_rate": 0.00015823095406296514,
"loss": 0.426,
"step": 265
},
{
"epoch": 1.3050430504305042,
"grad_norm": 2.433593988418579,
"learning_rate": 0.00015790761173967036,
"loss": 0.404,
"step": 266
},
{
"epoch": 1.3099630996309963,
"grad_norm": 1.216810703277588,
"learning_rate": 0.00015758335597251458,
"loss": 0.3607,
"step": 267
},
{
"epoch": 1.3148831488314883,
"grad_norm": 3.8141870498657227,
"learning_rate": 0.00015725819187635968,
"loss": 0.487,
"step": 268
},
{
"epoch": 1.3198031980319804,
"grad_norm": 0.3127140700817108,
"learning_rate": 0.00015693212458039584,
"loss": 0.3555,
"step": 269
},
{
"epoch": 1.3247232472324724,
"grad_norm": 3.0095064640045166,
"learning_rate": 0.00015660515922806027,
"loss": 0.4525,
"step": 270
},
{
"epoch": 1.3296432964329643,
"grad_norm": 1.490798830986023,
"learning_rate": 0.00015627730097695638,
"loss": 0.3951,
"step": 271
},
{
"epoch": 1.3345633456334562,
"grad_norm": 2.4473958015441895,
"learning_rate": 0.0001559485549987723,
"loss": 0.3184,
"step": 272
},
{
"epoch": 1.3394833948339484,
"grad_norm": 1.3399827480316162,
"learning_rate": 0.0001556189264791992,
"loss": 0.401,
"step": 273
},
{
"epoch": 1.3444034440344403,
"grad_norm": 1.9885616302490234,
"learning_rate": 0.0001552884206178498,
"loss": 0.4482,
"step": 274
},
{
"epoch": 1.3493234932349323,
"grad_norm": 1.2298444509506226,
"learning_rate": 0.00015495704262817597,
"loss": 0.4295,
"step": 275
},
{
"epoch": 1.3542435424354244,
"grad_norm": 1.3658753633499146,
"learning_rate": 0.0001546247977373867,
"loss": 0.3741,
"step": 276
},
{
"epoch": 1.3591635916359164,
"grad_norm": 3.8412437438964844,
"learning_rate": 0.00015429169118636566,
"loss": 0.356,
"step": 277
},
{
"epoch": 1.3640836408364083,
"grad_norm": 2.24770188331604,
"learning_rate": 0.00015395772822958845,
"loss": 0.3911,
"step": 278
},
{
"epoch": 1.3690036900369003,
"grad_norm": 1.0389429330825806,
"learning_rate": 0.00015362291413503984,
"loss": 0.4239,
"step": 279
},
{
"epoch": 1.3739237392373924,
"grad_norm": 2.6337220668792725,
"learning_rate": 0.00015328725418413045,
"loss": 0.3546,
"step": 280
},
{
"epoch": 1.3788437884378844,
"grad_norm": 1.609165906906128,
"learning_rate": 0.00015295075367161367,
"loss": 0.4083,
"step": 281
},
{
"epoch": 1.3837638376383765,
"grad_norm": 2.580286741256714,
"learning_rate": 0.00015261341790550196,
"loss": 0.3493,
"step": 282
},
{
"epoch": 1.3886838868388685,
"grad_norm": 2.396101474761963,
"learning_rate": 0.0001522752522069833,
"loss": 0.4164,
"step": 283
},
{
"epoch": 1.3936039360393604,
"grad_norm": 1.4685685634613037,
"learning_rate": 0.00015193626191033712,
"loss": 0.3765,
"step": 284
},
{
"epoch": 1.3985239852398523,
"grad_norm": 2.472041368484497,
"learning_rate": 0.0001515964523628501,
"loss": 0.4015,
"step": 285
},
{
"epoch": 1.4034440344403443,
"grad_norm": 2.8179895877838135,
"learning_rate": 0.00015125582892473204,
"loss": 0.4108,
"step": 286
},
{
"epoch": 1.4083640836408364,
"grad_norm": 2.4579968452453613,
"learning_rate": 0.00015091439696903115,
"loss": 0.4333,
"step": 287
},
{
"epoch": 1.4132841328413284,
"grad_norm": 2.46209716796875,
"learning_rate": 0.00015057216188154928,
"loss": 0.468,
"step": 288
},
{
"epoch": 1.4182041820418205,
"grad_norm": 1.7040590047836304,
"learning_rate": 0.00015022912906075702,
"loss": 0.421,
"step": 289
},
{
"epoch": 1.4231242312423125,
"grad_norm": 4.77639102935791,
"learning_rate": 0.00014988530391770856,
"loss": 0.4105,
"step": 290
},
{
"epoch": 1.4280442804428044,
"grad_norm": 1.7009060382843018,
"learning_rate": 0.00014954069187595633,
"loss": 0.4034,
"step": 291
},
{
"epoch": 1.4329643296432963,
"grad_norm": 1.7801786661148071,
"learning_rate": 0.00014919529837146528,
"loss": 0.3962,
"step": 292
},
{
"epoch": 1.4378843788437885,
"grad_norm": 0.9307105541229248,
"learning_rate": 0.0001488491288525275,
"loss": 0.3895,
"step": 293
},
{
"epoch": 1.4428044280442804,
"grad_norm": 1.3841267824172974,
"learning_rate": 0.0001485021887796759,
"loss": 0.3871,
"step": 294
},
{
"epoch": 1.4477244772447724,
"grad_norm": 1.3614524602890015,
"learning_rate": 0.00014815448362559826,
"loss": 0.4055,
"step": 295
},
{
"epoch": 1.4526445264452645,
"grad_norm": 3.51263165473938,
"learning_rate": 0.00014780601887505088,
"loss": 0.2708,
"step": 296
},
{
"epoch": 1.4575645756457565,
"grad_norm": 2.4436159133911133,
"learning_rate": 0.00014745680002477203,
"loss": 0.388,
"step": 297
},
{
"epoch": 1.4624846248462484,
"grad_norm": 0.775227427482605,
"learning_rate": 0.00014710683258339536,
"loss": 0.3506,
"step": 298
},
{
"epoch": 1.4674046740467404,
"grad_norm": 1.1680070161819458,
"learning_rate": 0.0001467561220713628,
"loss": 0.3227,
"step": 299
},
{
"epoch": 1.4723247232472325,
"grad_norm": 2.8542237281799316,
"learning_rate": 0.0001464046740208377,
"loss": 0.3588,
"step": 300
},
{
"epoch": 1.4772447724477245,
"grad_norm": 2.2465827465057373,
"learning_rate": 0.00014605249397561736,
"loss": 0.4161,
"step": 301
},
{
"epoch": 1.4821648216482166,
"grad_norm": 3.5913736820220947,
"learning_rate": 0.00014569958749104575,
"loss": 0.4758,
"step": 302
},
{
"epoch": 1.4870848708487086,
"grad_norm": 0.5437675714492798,
"learning_rate": 0.00014534596013392575,
"loss": 0.3388,
"step": 303
},
{
"epoch": 1.4920049200492005,
"grad_norm": 2.386204242706299,
"learning_rate": 0.00014499161748243147,
"loss": 0.4425,
"step": 304
},
{
"epoch": 1.4969249692496924,
"grad_norm": 1.160514235496521,
"learning_rate": 0.0001446365651260201,
"loss": 0.3747,
"step": 305
},
{
"epoch": 1.5018450184501844,
"grad_norm": 0.49445146322250366,
"learning_rate": 0.00014428080866534396,
"loss": 0.3707,
"step": 306
},
{
"epoch": 1.5067650676506765,
"grad_norm": 1.3350694179534912,
"learning_rate": 0.00014392435371216185,
"loss": 0.3455,
"step": 307
},
{
"epoch": 1.5116851168511685,
"grad_norm": 1.1016676425933838,
"learning_rate": 0.0001435672058892509,
"loss": 0.4095,
"step": 308
},
{
"epoch": 1.5166051660516606,
"grad_norm": 2.0227558612823486,
"learning_rate": 0.00014320937083031748,
"loss": 0.3706,
"step": 309
},
{
"epoch": 1.5215252152521526,
"grad_norm": 3.2734158039093018,
"learning_rate": 0.0001428508541799086,
"loss": 0.3384,
"step": 310
},
{
"epoch": 1.5264452644526445,
"grad_norm": 1.967950701713562,
"learning_rate": 0.0001424916615933229,
"loss": 0.4321,
"step": 311
},
{
"epoch": 1.5313653136531364,
"grad_norm": 1.8679777383804321,
"learning_rate": 0.00014213179873652127,
"loss": 0.3597,
"step": 312
},
{
"epoch": 1.5362853628536284,
"grad_norm": 1.111864447593689,
"learning_rate": 0.00014177127128603745,
"loss": 0.3639,
"step": 313
},
{
"epoch": 1.5412054120541205,
"grad_norm": 1.1539496183395386,
"learning_rate": 0.0001414100849288888,
"loss": 0.3735,
"step": 314
},
{
"epoch": 1.5461254612546127,
"grad_norm": 2.5453989505767822,
"learning_rate": 0.00014104824536248614,
"loss": 0.4241,
"step": 315
},
{
"epoch": 1.5510455104551046,
"grad_norm": 1.5490731000900269,
"learning_rate": 0.00014068575829454436,
"loss": 0.38,
"step": 316
},
{
"epoch": 1.5559655596555966,
"grad_norm": 2.0369129180908203,
"learning_rate": 0.00014032262944299194,
"loss": 0.432,
"step": 317
},
{
"epoch": 1.5608856088560885,
"grad_norm": 1.938671588897705,
"learning_rate": 0.00013995886453588104,
"loss": 0.4407,
"step": 318
},
{
"epoch": 1.5658056580565805,
"grad_norm": 1.5802247524261475,
"learning_rate": 0.00013959446931129704,
"loss": 0.4174,
"step": 319
},
{
"epoch": 1.5707257072570726,
"grad_norm": 1.7823857069015503,
"learning_rate": 0.0001392294495172681,
"loss": 0.3608,
"step": 320
},
{
"epoch": 1.5756457564575646,
"grad_norm": 1.5793462991714478,
"learning_rate": 0.0001388638109116744,
"loss": 0.4049,
"step": 321
},
{
"epoch": 1.5805658056580567,
"grad_norm": 2.478447437286377,
"learning_rate": 0.00013849755926215735,
"loss": 0.3822,
"step": 322
},
{
"epoch": 1.5854858548585486,
"grad_norm": 3.0512235164642334,
"learning_rate": 0.00013813070034602863,
"loss": 0.3729,
"step": 323
},
{
"epoch": 1.5904059040590406,
"grad_norm": 2.298110008239746,
"learning_rate": 0.00013776323995017898,
"loss": 0.3757,
"step": 324
},
{
"epoch": 1.5953259532595325,
"grad_norm": 1.1335664987564087,
"learning_rate": 0.00013739518387098705,
"loss": 0.3436,
"step": 325
},
{
"epoch": 1.6002460024600245,
"grad_norm": 1.9639568328857422,
"learning_rate": 0.0001370265379142279,
"loss": 0.4321,
"step": 326
},
{
"epoch": 1.6051660516605166,
"grad_norm": 2.0375776290893555,
"learning_rate": 0.0001366573078949813,
"loss": 0.3924,
"step": 327
},
{
"epoch": 1.6100861008610086,
"grad_norm": 2.925692558288574,
"learning_rate": 0.00013628749963754026,
"loss": 0.447,
"step": 328
},
{
"epoch": 1.6150061500615007,
"grad_norm": 2.7469842433929443,
"learning_rate": 0.0001359171189753189,
"loss": 0.4045,
"step": 329
},
{
"epoch": 1.6199261992619927,
"grad_norm": 1.8784551620483398,
"learning_rate": 0.00013554617175076062,
"loss": 0.398,
"step": 330
},
{
"epoch": 1.6248462484624846,
"grad_norm": 3.961890459060669,
"learning_rate": 0.0001351746638152458,
"loss": 0.4837,
"step": 331
},
{
"epoch": 1.6297662976629765,
"grad_norm": 1.2118688821792603,
"learning_rate": 0.00013480260102899966,
"loss": 0.3792,
"step": 332
},
{
"epoch": 1.6346863468634685,
"grad_norm": 0.8447842597961426,
"learning_rate": 0.0001344299892609996,
"loss": 0.3939,
"step": 333
},
{
"epoch": 1.6396063960639606,
"grad_norm": 0.8459701538085938,
"learning_rate": 0.00013405683438888282,
"loss": 0.4246,
"step": 334
},
{
"epoch": 1.6445264452644528,
"grad_norm": 2.4549758434295654,
"learning_rate": 0.00013368314229885347,
"loss": 0.3642,
"step": 335
},
{
"epoch": 1.6494464944649447,
"grad_norm": 3.813248872756958,
"learning_rate": 0.00013330891888559002,
"loss": 0.4009,
"step": 336
},
{
"epoch": 1.6543665436654367,
"grad_norm": 3.946821689605713,
"learning_rate": 0.00013293417005215188,
"loss": 0.3961,
"step": 337
},
{
"epoch": 1.6592865928659286,
"grad_norm": 1.7004120349884033,
"learning_rate": 0.0001325589017098867,
"loss": 0.4145,
"step": 338
},
{
"epoch": 1.6642066420664205,
"grad_norm": 2.824493169784546,
"learning_rate": 0.00013218311977833687,
"loss": 0.403,
"step": 339
},
{
"epoch": 1.6691266912669127,
"grad_norm": 2.5144972801208496,
"learning_rate": 0.0001318068301851463,
"loss": 0.4236,
"step": 340
},
{
"epoch": 1.6740467404674046,
"grad_norm": 2.8602144718170166,
"learning_rate": 0.00013143003886596669,
"loss": 0.4267,
"step": 341
},
{
"epoch": 1.6789667896678968,
"grad_norm": 1.5796253681182861,
"learning_rate": 0.0001310527517643642,
"loss": 0.4181,
"step": 342
},
{
"epoch": 1.6838868388683887,
"grad_norm": 1.748310923576355,
"learning_rate": 0.00013067497483172538,
"loss": 0.3817,
"step": 343
},
{
"epoch": 1.6888068880688807,
"grad_norm": 1.1796998977661133,
"learning_rate": 0.00013029671402716366,
"loss": 0.3891,
"step": 344
},
{
"epoch": 1.6937269372693726,
"grad_norm": 0.8031755089759827,
"learning_rate": 0.00012991797531742492,
"loss": 0.3746,
"step": 345
},
{
"epoch": 1.6986469864698646,
"grad_norm": 2.449571132659912,
"learning_rate": 0.00012953876467679373,
"loss": 0.3759,
"step": 346
},
{
"epoch": 1.7035670356703567,
"grad_norm": 2.167459726333618,
"learning_rate": 0.00012915908808699893,
"loss": 0.4026,
"step": 347
},
{
"epoch": 1.7084870848708487,
"grad_norm": 0.8750459551811218,
"learning_rate": 0.00012877895153711935,
"loss": 0.3612,
"step": 348
},
{
"epoch": 1.7134071340713408,
"grad_norm": 0.9922705292701721,
"learning_rate": 0.00012839836102348926,
"loss": 0.3894,
"step": 349
},
{
"epoch": 1.7183271832718328,
"grad_norm": 2.778425693511963,
"learning_rate": 0.00012801732254960388,
"loss": 0.5034,
"step": 350
},
{
"epoch": 1.7232472324723247,
"grad_norm": 2.2978157997131348,
"learning_rate": 0.00012763584212602453,
"loss": 0.3919,
"step": 351
},
{
"epoch": 1.7281672816728166,
"grad_norm": 0.7636315822601318,
"learning_rate": 0.00012725392577028402,
"loss": 0.3465,
"step": 352
},
{
"epoch": 1.7330873308733086,
"grad_norm": 0.9501156210899353,
"learning_rate": 0.0001268715795067916,
"loss": 0.353,
"step": 353
},
{
"epoch": 1.7380073800738007,
"grad_norm": 1.9191248416900635,
"learning_rate": 0.00012648880936673787,
"loss": 0.4535,
"step": 354
},
{
"epoch": 1.742927429274293,
"grad_norm": 0.5128054618835449,
"learning_rate": 0.00012610562138799978,
"loss": 0.3761,
"step": 355
},
{
"epoch": 1.7478474784747848,
"grad_norm": 1.427462100982666,
"learning_rate": 0.00012572202161504543,
"loss": 0.3871,
"step": 356
},
{
"epoch": 1.7527675276752768,
"grad_norm": 0.44678959250450134,
"learning_rate": 0.00012533801609883842,
"loss": 0.3858,
"step": 357
},
{
"epoch": 1.7576875768757687,
"grad_norm": 1.5311493873596191,
"learning_rate": 0.00012495361089674285,
"loss": 0.3638,
"step": 358
},
{
"epoch": 1.7626076260762606,
"grad_norm": 1.7714836597442627,
"learning_rate": 0.00012456881207242732,
"loss": 0.324,
"step": 359
},
{
"epoch": 1.7675276752767528,
"grad_norm": 0.6259622573852539,
"learning_rate": 0.00012418362569576965,
"loss": 0.3832,
"step": 360
},
{
"epoch": 1.7724477244772447,
"grad_norm": 1.4297990798950195,
"learning_rate": 0.00012379805784276082,
"loss": 0.3464,
"step": 361
},
{
"epoch": 1.777367773677737,
"grad_norm": 0.9419127702713013,
"learning_rate": 0.0001234121145954094,
"loss": 0.3605,
"step": 362
},
{
"epoch": 1.7822878228782288,
"grad_norm": 1.7905269861221313,
"learning_rate": 0.00012302580204164541,
"loss": 0.4042,
"step": 363
},
{
"epoch": 1.7872078720787208,
"grad_norm": 2.3646910190582275,
"learning_rate": 0.0001226391262752245,
"loss": 0.4208,
"step": 364
},
{
"epoch": 1.7921279212792127,
"grad_norm": 1.26406991481781,
"learning_rate": 0.00012225209339563145,
"loss": 0.3653,
"step": 365
},
{
"epoch": 1.7970479704797047,
"grad_norm": 2.371533155441284,
"learning_rate": 0.00012186470950798445,
"loss": 0.4039,
"step": 366
},
{
"epoch": 1.8019680196801968,
"grad_norm": 3.1603784561157227,
"learning_rate": 0.00012147698072293842,
"loss": 0.4911,
"step": 367
},
{
"epoch": 1.8068880688806888,
"grad_norm": 2.687168836593628,
"learning_rate": 0.00012108891315658879,
"loss": 0.4356,
"step": 368
},
{
"epoch": 1.811808118081181,
"grad_norm": 3.9243521690368652,
"learning_rate": 0.00012070051293037492,
"loss": 0.434,
"step": 369
},
{
"epoch": 1.8167281672816729,
"grad_norm": 2.8489391803741455,
"learning_rate": 0.00012031178617098371,
"loss": 0.3572,
"step": 370
},
{
"epoch": 1.8216482164821648,
"grad_norm": 2.8946075439453125,
"learning_rate": 0.00011992273901025269,
"loss": 0.3993,
"step": 371
},
{
"epoch": 1.8265682656826567,
"grad_norm": 1.3082534074783325,
"learning_rate": 0.0001195333775850736,
"loss": 0.4137,
"step": 372
},
{
"epoch": 1.8314883148831487,
"grad_norm": 1.9355298280715942,
"learning_rate": 0.00011914370803729533,
"loss": 0.3746,
"step": 373
},
{
"epoch": 1.8364083640836408,
"grad_norm": 2.1702141761779785,
"learning_rate": 0.00011875373651362727,
"loss": 0.3622,
"step": 374
},
{
"epoch": 1.841328413284133,
"grad_norm": 1.4988595247268677,
"learning_rate": 0.00011836346916554205,
"loss": 0.3619,
"step": 375
},
{
"epoch": 1.846248462484625,
"grad_norm": 1.761991262435913,
"learning_rate": 0.00011797291214917881,
"loss": 0.4358,
"step": 376
},
{
"epoch": 1.8511685116851169,
"grad_norm": 0.745695173740387,
"learning_rate": 0.00011758207162524598,
"loss": 0.3995,
"step": 377
},
{
"epoch": 1.8560885608856088,
"grad_norm": 1.9512763023376465,
"learning_rate": 0.00011719095375892396,
"loss": 0.4432,
"step": 378
},
{
"epoch": 1.8610086100861007,
"grad_norm": 2.6219289302825928,
"learning_rate": 0.00011679956471976814,
"loss": 0.4677,
"step": 379
},
{
"epoch": 1.865928659286593,
"grad_norm": 3.995495080947876,
"learning_rate": 0.0001164079106816113,
"loss": 0.2968,
"step": 380
},
{
"epoch": 1.8708487084870848,
"grad_norm": 0.8375853300094604,
"learning_rate": 0.00011601599782246646,
"loss": 0.4035,
"step": 381
},
{
"epoch": 1.875768757687577,
"grad_norm": 1.5365298986434937,
"learning_rate": 0.00011562383232442926,
"loss": 0.417,
"step": 382
},
{
"epoch": 1.880688806888069,
"grad_norm": 0.9506827592849731,
"learning_rate": 0.0001152314203735805,
"loss": 0.3772,
"step": 383
},
{
"epoch": 1.8856088560885609,
"grad_norm": 0.900571346282959,
"learning_rate": 0.00011483876815988867,
"loss": 0.3805,
"step": 384
},
{
"epoch": 1.8905289052890528,
"grad_norm": 1.0722607374191284,
"learning_rate": 0.00011444588187711205,
"loss": 0.4088,
"step": 385
},
{
"epoch": 1.8954489544895448,
"grad_norm": 1.7054160833358765,
"learning_rate": 0.00011405276772270126,
"loss": 0.3956,
"step": 386
},
{
"epoch": 1.900369003690037,
"grad_norm": 0.986569881439209,
"learning_rate": 0.0001136594318977014,
"loss": 0.4079,
"step": 387
},
{
"epoch": 1.9052890528905289,
"grad_norm": 0.8886733651161194,
"learning_rate": 0.0001132658806066542,
"loss": 0.3842,
"step": 388
},
{
"epoch": 1.910209102091021,
"grad_norm": 0.8961542248725891,
"learning_rate": 0.00011287212005750024,
"loss": 0.3881,
"step": 389
},
{
"epoch": 1.915129151291513,
"grad_norm": 2.585698127746582,
"learning_rate": 0.00011247815646148087,
"loss": 0.4671,
"step": 390
},
{
"epoch": 1.920049200492005,
"grad_norm": 1.6980706453323364,
"learning_rate": 0.00011208399603304047,
"loss": 0.3499,
"step": 391
},
{
"epoch": 1.9249692496924968,
"grad_norm": 0.8087127804756165,
"learning_rate": 0.00011168964498972818,
"loss": 0.3786,
"step": 392
},
{
"epoch": 1.9298892988929888,
"grad_norm": 1.2001378536224365,
"learning_rate": 0.00011129510955209996,
"loss": 0.3502,
"step": 393
},
{
"epoch": 1.934809348093481,
"grad_norm": 1.2342605590820312,
"learning_rate": 0.00011090039594362045,
"loss": 0.3924,
"step": 394
},
{
"epoch": 1.939729397293973,
"grad_norm": 1.3440324068069458,
"learning_rate": 0.00011050551039056479,
"loss": 0.4143,
"step": 395
},
{
"epoch": 1.944649446494465,
"grad_norm": 0.7715713977813721,
"learning_rate": 0.00011011045912192035,
"loss": 0.3371,
"step": 396
},
{
"epoch": 1.949569495694957,
"grad_norm": 0.9439634084701538,
"learning_rate": 0.0001097152483692886,
"loss": 0.3916,
"step": 397
},
{
"epoch": 1.954489544895449,
"grad_norm": 1.3055254220962524,
"learning_rate": 0.00010931988436678666,
"loss": 0.4016,
"step": 398
},
{
"epoch": 1.9594095940959408,
"grad_norm": 0.8916832208633423,
"learning_rate": 0.00010892437335094912,
"loss": 0.3525,
"step": 399
},
{
"epoch": 1.964329643296433,
"grad_norm": 1.3914546966552734,
"learning_rate": 0.00010852872156062946,
"loss": 0.3771,
"step": 400
},
{
"epoch": 1.969249692496925,
"grad_norm": 0.9857692122459412,
"learning_rate": 0.00010813293523690191,
"loss": 0.3393,
"step": 401
},
{
"epoch": 1.974169741697417,
"grad_norm": 2.0917956829071045,
"learning_rate": 0.00010773702062296273,
"loss": 0.4354,
"step": 402
},
{
"epoch": 1.979089790897909,
"grad_norm": 1.275038719177246,
"learning_rate": 0.00010734098396403192,
"loss": 0.3398,
"step": 403
},
{
"epoch": 1.984009840098401,
"grad_norm": 2.0591840744018555,
"learning_rate": 0.00010694483150725458,
"loss": 0.4068,
"step": 404
},
{
"epoch": 1.988929889298893,
"grad_norm": 4.258203506469727,
"learning_rate": 0.00010654856950160253,
"loss": 0.5179,
"step": 405
},
{
"epoch": 1.9938499384993849,
"grad_norm": 1.638200283050537,
"learning_rate": 0.00010615220419777548,
"loss": 0.3814,
"step": 406
},
{
"epoch": 1.998769987699877,
"grad_norm": 1.5387883186340332,
"learning_rate": 0.00010575574184810269,
"loss": 0.3818,
"step": 407
},
{
"epoch": 2.0,
"grad_norm": 4.10122013092041,
"learning_rate": 0.0001053591887064442,
"loss": 0.3577,
"step": 408
},
{
"epoch": 2.004920049200492,
"grad_norm": 1.2864503860473633,
"learning_rate": 0.00010496255102809223,
"loss": 0.3394,
"step": 409
},
{
"epoch": 2.009840098400984,
"grad_norm": 0.9785577654838562,
"learning_rate": 0.00010456583506967248,
"loss": 0.3734,
"step": 410
},
{
"epoch": 2.014760147601476,
"grad_norm": 0.43325719237327576,
"learning_rate": 0.00010416904708904548,
"loss": 0.3805,
"step": 411
},
{
"epoch": 2.019680196801968,
"grad_norm": 3.109909772872925,
"learning_rate": 0.00010377219334520783,
"loss": 0.4594,
"step": 412
},
{
"epoch": 2.02460024600246,
"grad_norm": 1.8757784366607666,
"learning_rate": 0.00010337528009819344,
"loss": 0.4087,
"step": 413
},
{
"epoch": 2.029520295202952,
"grad_norm": 3.7887425422668457,
"learning_rate": 0.00010297831360897492,
"loss": 0.436,
"step": 414
},
{
"epoch": 2.034440344403444,
"grad_norm": 0.3496626019477844,
"learning_rate": 0.00010258130013936474,
"loss": 0.3854,
"step": 415
},
{
"epoch": 2.039360393603936,
"grad_norm": 1.3036730289459229,
"learning_rate": 0.00010218424595191631,
"loss": 0.3803,
"step": 416
},
{
"epoch": 2.044280442804428,
"grad_norm": 4.245285987854004,
"learning_rate": 0.00010178715730982549,
"loss": 0.4952,
"step": 417
},
{
"epoch": 2.0492004920049203,
"grad_norm": 3.09157133102417,
"learning_rate": 0.00010139004047683151,
"loss": 0.3944,
"step": 418
},
{
"epoch": 2.054120541205412,
"grad_norm": 1.5304118394851685,
"learning_rate": 0.00010099290171711841,
"loss": 0.4174,
"step": 419
},
{
"epoch": 2.059040590405904,
"grad_norm": 2.2359232902526855,
"learning_rate": 0.00010059574729521595,
"loss": 0.3358,
"step": 420
},
{
"epoch": 2.063960639606396,
"grad_norm": 0.6024315357208252,
"learning_rate": 0.0001001985834759011,
"loss": 0.3981,
"step": 421
},
{
"epoch": 2.068880688806888,
"grad_norm": 1.2679041624069214,
"learning_rate": 9.980141652409895e-05,
"loss": 0.4225,
"step": 422
},
{
"epoch": 2.07380073800738,
"grad_norm": 0.895416796207428,
"learning_rate": 9.940425270478407e-05,
"loss": 0.388,
"step": 423
},
{
"epoch": 2.078720787207872,
"grad_norm": 1.567826747894287,
"learning_rate": 9.900709828288164e-05,
"loss": 0.3704,
"step": 424
},
{
"epoch": 2.0836408364083643,
"grad_norm": 1.9329123497009277,
"learning_rate": 9.860995952316851e-05,
"loss": 0.4234,
"step": 425
},
{
"epoch": 2.088560885608856,
"grad_norm": 0.44675880670547485,
"learning_rate": 9.821284269017455e-05,
"loss": 0.3876,
"step": 426
},
{
"epoch": 2.093480934809348,
"grad_norm": 1.7258495092391968,
"learning_rate": 9.781575404808371e-05,
"loss": 0.4297,
"step": 427
},
{
"epoch": 2.09840098400984,
"grad_norm": 1.092556118965149,
"learning_rate": 9.741869986063526e-05,
"loss": 0.4026,
"step": 428
},
{
"epoch": 2.103321033210332,
"grad_norm": 1.4725236892700195,
"learning_rate": 9.702168639102509e-05,
"loss": 0.4385,
"step": 429
},
{
"epoch": 2.108241082410824,
"grad_norm": 2.7385778427124023,
"learning_rate": 9.662471990180657e-05,
"loss": 0.4424,
"step": 430
},
{
"epoch": 2.113161131611316,
"grad_norm": 2.480210542678833,
"learning_rate": 9.622780665479222e-05,
"loss": 0.4206,
"step": 431
},
{
"epoch": 2.1180811808118083,
"grad_norm": 0.8362523913383484,
"learning_rate": 9.583095291095453e-05,
"loss": 0.3986,
"step": 432
},
{
"epoch": 2.1230012300123002,
"grad_norm": 2.0622987747192383,
"learning_rate": 9.543416493032757e-05,
"loss": 0.3485,
"step": 433
},
{
"epoch": 2.127921279212792,
"grad_norm": 2.7538793087005615,
"learning_rate": 9.503744897190778e-05,
"loss": 0.3756,
"step": 434
},
{
"epoch": 2.132841328413284,
"grad_norm": 0.8746367692947388,
"learning_rate": 9.464081129355586e-05,
"loss": 0.3751,
"step": 435
},
{
"epoch": 2.137761377613776,
"grad_norm": 1.8478419780731201,
"learning_rate": 9.424425815189733e-05,
"loss": 0.4292,
"step": 436
},
{
"epoch": 2.142681426814268,
"grad_norm": 1.6647083759307861,
"learning_rate": 9.384779580222453e-05,
"loss": 0.339,
"step": 437
},
{
"epoch": 2.14760147601476,
"grad_norm": 4.109962463378906,
"learning_rate": 9.345143049839749e-05,
"loss": 0.4544,
"step": 438
},
{
"epoch": 2.1525215252152523,
"grad_norm": 1.978119969367981,
"learning_rate": 9.305516849274541e-05,
"loss": 0.3702,
"step": 439
},
{
"epoch": 2.1574415744157442,
"grad_norm": 1.917183518409729,
"learning_rate": 9.265901603596811e-05,
"loss": 0.4077,
"step": 440
},
{
"epoch": 2.162361623616236,
"grad_norm": 0.47567835450172424,
"learning_rate": 9.226297937703728e-05,
"loss": 0.3356,
"step": 441
},
{
"epoch": 2.167281672816728,
"grad_norm": 2.546321392059326,
"learning_rate": 9.186706476309812e-05,
"loss": 0.4337,
"step": 442
},
{
"epoch": 2.17220172201722,
"grad_norm": 2.111480236053467,
"learning_rate": 9.147127843937055e-05,
"loss": 0.4024,
"step": 443
},
{
"epoch": 2.177121771217712,
"grad_norm": 1.1858526468276978,
"learning_rate": 9.107562664905093e-05,
"loss": 0.3637,
"step": 444
},
{
"epoch": 2.1820418204182044,
"grad_norm": 1.404078722000122,
"learning_rate": 9.068011563321336e-05,
"loss": 0.4173,
"step": 445
},
{
"epoch": 2.1869618696186963,
"grad_norm": 1.1295206546783447,
"learning_rate": 9.028475163071141e-05,
"loss": 0.3856,
"step": 446
},
{
"epoch": 2.1918819188191883,
"grad_norm": 1.2605645656585693,
"learning_rate": 8.988954087807968e-05,
"loss": 0.4193,
"step": 447
},
{
"epoch": 2.19680196801968,
"grad_norm": 1.1261564493179321,
"learning_rate": 8.949448960943524e-05,
"loss": 0.407,
"step": 448
},
{
"epoch": 2.201722017220172,
"grad_norm": 2.366487979888916,
"learning_rate": 8.909960405637958e-05,
"loss": 0.3946,
"step": 449
},
{
"epoch": 2.206642066420664,
"grad_norm": 2.1479427814483643,
"learning_rate": 8.870489044790006e-05,
"loss": 0.3728,
"step": 450
},
{
"epoch": 2.211562115621156,
"grad_norm": 2.990525245666504,
"learning_rate": 8.831035501027186e-05,
"loss": 0.3367,
"step": 451
},
{
"epoch": 2.2164821648216484,
"grad_norm": 1.812566876411438,
"learning_rate": 8.791600396695954e-05,
"loss": 0.3689,
"step": 452
},
{
"epoch": 2.2214022140221403,
"grad_norm": 0.5948531031608582,
"learning_rate": 8.752184353851916e-05,
"loss": 0.4018,
"step": 453
},
{
"epoch": 2.2263222632226323,
"grad_norm": 1.8020761013031006,
"learning_rate": 8.712787994249979e-05,
"loss": 0.3965,
"step": 454
},
{
"epoch": 2.231242312423124,
"grad_norm": 1.5464495420455933,
"learning_rate": 8.673411939334581e-05,
"loss": 0.3353,
"step": 455
},
{
"epoch": 2.236162361623616,
"grad_norm": 1.8382320404052734,
"learning_rate": 8.634056810229862e-05,
"loss": 0.3916,
"step": 456
},
{
"epoch": 2.241082410824108,
"grad_norm": 1.5499740839004517,
"learning_rate": 8.594723227729875e-05,
"loss": 0.3895,
"step": 457
},
{
"epoch": 2.2460024600246005,
"grad_norm": 4.030876636505127,
"learning_rate": 8.555411812288798e-05,
"loss": 0.4616,
"step": 458
},
{
"epoch": 2.2509225092250924,
"grad_norm": 1.1098424196243286,
"learning_rate": 8.516123184011135e-05,
"loss": 0.2977,
"step": 459
},
{
"epoch": 2.2558425584255843,
"grad_norm": 1.2961804866790771,
"learning_rate": 8.47685796264195e-05,
"loss": 0.36,
"step": 460
},
{
"epoch": 2.2607626076260763,
"grad_norm": 1.140372633934021,
"learning_rate": 8.437616767557077e-05,
"loss": 0.351,
"step": 461
},
{
"epoch": 2.265682656826568,
"grad_norm": 3.4962611198425293,
"learning_rate": 8.398400217753357e-05,
"loss": 0.4724,
"step": 462
},
{
"epoch": 2.27060270602706,
"grad_norm": 2.6000497341156006,
"learning_rate": 8.359208931838871e-05,
"loss": 0.4618,
"step": 463
},
{
"epoch": 2.275522755227552,
"grad_norm": 2.955470323562622,
"learning_rate": 8.320043528023188e-05,
"loss": 0.461,
"step": 464
},
{
"epoch": 2.280442804428044,
"grad_norm": 0.9662995934486389,
"learning_rate": 8.280904624107606e-05,
"loss": 0.3457,
"step": 465
},
{
"epoch": 2.2853628536285364,
"grad_norm": 0.8392460346221924,
"learning_rate": 8.241792837475405e-05,
"loss": 0.354,
"step": 466
},
{
"epoch": 2.2902829028290284,
"grad_norm": 2.8896520137786865,
"learning_rate": 8.202708785082121e-05,
"loss": 0.416,
"step": 467
},
{
"epoch": 2.2952029520295203,
"grad_norm": 1.6311709880828857,
"learning_rate": 8.163653083445799e-05,
"loss": 0.399,
"step": 468
},
{
"epoch": 2.3001230012300122,
"grad_norm": 0.4800054132938385,
"learning_rate": 8.124626348637279e-05,
"loss": 0.3758,
"step": 469
},
{
"epoch": 2.305043050430504,
"grad_norm": 1.0817440748214722,
"learning_rate": 8.085629196270469e-05,
"loss": 0.3764,
"step": 470
},
{
"epoch": 2.3099630996309966,
"grad_norm": 1.6088804006576538,
"learning_rate": 8.046662241492645e-05,
"loss": 0.4473,
"step": 471
},
{
"epoch": 2.3148831488314885,
"grad_norm": 1.0749715566635132,
"learning_rate": 8.007726098974734e-05,
"loss": 0.3703,
"step": 472
},
{
"epoch": 2.3198031980319804,
"grad_norm": 1.5354204177856445,
"learning_rate": 7.96882138290163e-05,
"loss": 0.4164,
"step": 473
},
{
"epoch": 2.3247232472324724,
"grad_norm": 1.117240309715271,
"learning_rate": 7.929948706962508e-05,
"loss": 0.4144,
"step": 474
},
{
"epoch": 2.3296432964329643,
"grad_norm": 0.9542057514190674,
"learning_rate": 7.891108684341121e-05,
"loss": 0.4162,
"step": 475
},
{
"epoch": 2.3345633456334562,
"grad_norm": 1.411424994468689,
"learning_rate": 7.852301927706159e-05,
"loss": 0.4402,
"step": 476
},
{
"epoch": 2.339483394833948,
"grad_norm": 1.8303946256637573,
"learning_rate": 7.813529049201556e-05,
"loss": 0.423,
"step": 477
},
{
"epoch": 2.34440344403444,
"grad_norm": 1.6640418767929077,
"learning_rate": 7.774790660436858e-05,
"loss": 0.3943,
"step": 478
},
{
"epoch": 2.3493234932349325,
"grad_norm": 0.5208873152732849,
"learning_rate": 7.736087372477554e-05,
"loss": 0.4215,
"step": 479
},
{
"epoch": 2.3542435424354244,
"grad_norm": 2.1671223640441895,
"learning_rate": 7.69741979583546e-05,
"loss": 0.3839,
"step": 480
},
{
"epoch": 2.3591635916359164,
"grad_norm": 2.075159788131714,
"learning_rate": 7.658788540459062e-05,
"loss": 0.3851,
"step": 481
},
{
"epoch": 2.3640836408364083,
"grad_norm": 1.8642665147781372,
"learning_rate": 7.620194215723919e-05,
"loss": 0.3669,
"step": 482
},
{
"epoch": 2.3690036900369003,
"grad_norm": 2.8715755939483643,
"learning_rate": 7.581637430423037e-05,
"loss": 0.4352,
"step": 483
},
{
"epoch": 2.373923739237392,
"grad_norm": 0.8020451664924622,
"learning_rate": 7.543118792757266e-05,
"loss": 0.3657,
"step": 484
},
{
"epoch": 2.3788437884378846,
"grad_norm": 2.100980758666992,
"learning_rate": 7.504638910325717e-05,
"loss": 0.3141,
"step": 485
},
{
"epoch": 2.3837638376383765,
"grad_norm": 3.8309755325317383,
"learning_rate": 7.466198390116158e-05,
"loss": 0.494,
"step": 486
},
{
"epoch": 2.3886838868388685,
"grad_norm": 1.7863093614578247,
"learning_rate": 7.427797838495463e-05,
"loss": 0.3792,
"step": 487
},
{
"epoch": 2.3936039360393604,
"grad_norm": 1.1884002685546875,
"learning_rate": 7.389437861200024e-05,
"loss": 0.3928,
"step": 488
},
{
"epoch": 2.3985239852398523,
"grad_norm": 1.9756462574005127,
"learning_rate": 7.35111906332622e-05,
"loss": 0.4218,
"step": 489
},
{
"epoch": 2.4034440344403443,
"grad_norm": 3.6889054775238037,
"learning_rate": 7.312842049320844e-05,
"loss": 0.4441,
"step": 490
},
{
"epoch": 2.408364083640836,
"grad_norm": 2.106717109680176,
"learning_rate": 7.2746074229716e-05,
"loss": 0.3783,
"step": 491
},
{
"epoch": 2.4132841328413286,
"grad_norm": 1.312242865562439,
"learning_rate": 7.236415787397548e-05,
"loss": 0.3342,
"step": 492
},
{
"epoch": 2.4182041820418205,
"grad_norm": 0.7120693325996399,
"learning_rate": 7.198267745039612e-05,
"loss": 0.3846,
"step": 493
},
{
"epoch": 2.4231242312423125,
"grad_norm": 1.6067770719528198,
"learning_rate": 7.160163897651075e-05,
"loss": 0.4396,
"step": 494
},
{
"epoch": 2.4280442804428044,
"grad_norm": 4.197781562805176,
"learning_rate": 7.122104846288064e-05,
"loss": 0.2713,
"step": 495
},
{
"epoch": 2.4329643296432963,
"grad_norm": 1.1666693687438965,
"learning_rate": 7.08409119130011e-05,
"loss": 0.3647,
"step": 496
},
{
"epoch": 2.4378843788437883,
"grad_norm": 1.3826804161071777,
"learning_rate": 7.04612353232063e-05,
"loss": 0.3739,
"step": 497
},
{
"epoch": 2.4428044280442807,
"grad_norm": 1.140659213066101,
"learning_rate": 7.008202468257514e-05,
"loss": 0.4207,
"step": 498
},
{
"epoch": 2.4477244772447726,
"grad_norm": 2.2047266960144043,
"learning_rate": 6.970328597283637e-05,
"loss": 0.3767,
"step": 499
},
{
"epoch": 2.4526445264452645,
"grad_norm": 2.385573148727417,
"learning_rate": 6.932502516827461e-05,
"loss": 0.3369,
"step": 500
},
{
"epoch": 2.4575645756457565,
"grad_norm": 1.869011402130127,
"learning_rate": 6.894724823563583e-05,
"loss": 0.3521,
"step": 501
},
{
"epoch": 2.4624846248462484,
"grad_norm": 1.2904314994812012,
"learning_rate": 6.85699611340333e-05,
"loss": 0.3519,
"step": 502
},
{
"epoch": 2.4674046740467404,
"grad_norm": 1.8398619890213013,
"learning_rate": 6.819316981485372e-05,
"loss": 0.3123,
"step": 503
},
{
"epoch": 2.4723247232472323,
"grad_norm": 2.567601442337036,
"learning_rate": 6.781688022166311e-05,
"loss": 0.4435,
"step": 504
},
{
"epoch": 2.4772447724477242,
"grad_norm": 1.0559594631195068,
"learning_rate": 6.744109829011332e-05,
"loss": 0.3921,
"step": 505
},
{
"epoch": 2.4821648216482166,
"grad_norm": 1.4271594285964966,
"learning_rate": 6.706582994784814e-05,
"loss": 0.382,
"step": 506
},
{
"epoch": 2.4870848708487086,
"grad_norm": 2.100080728530884,
"learning_rate": 6.669108111441003e-05,
"loss": 0.4241,
"step": 507
},
{
"epoch": 2.4920049200492005,
"grad_norm": 2.3189799785614014,
"learning_rate": 6.631685770114654e-05,
"loss": 0.4492,
"step": 508
},
{
"epoch": 2.4969249692496924,
"grad_norm": 1.2089158296585083,
"learning_rate": 6.594316561111724e-05,
"loss": 0.3763,
"step": 509
},
{
"epoch": 2.5018450184501844,
"grad_norm": 2.086798906326294,
"learning_rate": 6.557001073900044e-05,
"loss": 0.4291,
"step": 510
},
{
"epoch": 2.5067650676506767,
"grad_norm": 1.3246550559997559,
"learning_rate": 6.519739897100034e-05,
"loss": 0.4328,
"step": 511
},
{
"epoch": 2.5116851168511687,
"grad_norm": 3.522636890411377,
"learning_rate": 6.482533618475422e-05,
"loss": 0.3572,
"step": 512
},
{
"epoch": 2.5166051660516606,
"grad_norm": 2.3924167156219482,
"learning_rate": 6.445382824923938e-05,
"loss": 0.4262,
"step": 513
},
{
"epoch": 2.5215252152521526,
"grad_norm": 3.661113739013672,
"learning_rate": 6.408288102468113e-05,
"loss": 0.3796,
"step": 514
},
{
"epoch": 2.5264452644526445,
"grad_norm": 1.2376595735549927,
"learning_rate": 6.371250036245976e-05,
"loss": 0.3796,
"step": 515
},
{
"epoch": 2.5313653136531364,
"grad_norm": 1.8098406791687012,
"learning_rate": 6.334269210501875e-05,
"loss": 0.3707,
"step": 516
},
{
"epoch": 2.5362853628536284,
"grad_norm": 1.7512861490249634,
"learning_rate": 6.297346208577213e-05,
"loss": 0.3753,
"step": 517
},
{
"epoch": 2.5412054120541203,
"grad_norm": 2.3691437244415283,
"learning_rate": 6.260481612901299e-05,
"loss": 0.3678,
"step": 518
},
{
"epoch": 2.5461254612546127,
"grad_norm": 2.599379539489746,
"learning_rate": 6.223676004982105e-05,
"loss": 0.4462,
"step": 519
},
{
"epoch": 2.5510455104551046,
"grad_norm": 1.492092251777649,
"learning_rate": 6.18692996539714e-05,
"loss": 0.4379,
"step": 520
},
{
"epoch": 2.5559655596555966,
"grad_norm": 1.2708606719970703,
"learning_rate": 6.150244073784266e-05,
"loss": 0.3573,
"step": 521
},
{
"epoch": 2.5608856088560885,
"grad_norm": 2.399810552597046,
"learning_rate": 6.113618908832561e-05,
"loss": 0.4584,
"step": 522
},
{
"epoch": 2.5658056580565805,
"grad_norm": 3.6795196533203125,
"learning_rate": 6.0770550482731924e-05,
"loss": 0.247,
"step": 523
},
{
"epoch": 2.570725707257073,
"grad_norm": 1.5731709003448486,
"learning_rate": 6.0405530688702986e-05,
"loss": 0.4207,
"step": 524
},
{
"epoch": 2.5756457564575648,
"grad_norm": 2.2327213287353516,
"learning_rate": 6.0041135464119024e-05,
"loss": 0.389,
"step": 525
},
{
"epoch": 2.5805658056580567,
"grad_norm": 2.482600688934326,
"learning_rate": 5.9677370557008104e-05,
"loss": 0.4297,
"step": 526
},
{
"epoch": 2.5854858548585486,
"grad_norm": 0.5008729696273804,
"learning_rate": 5.9314241705455674e-05,
"loss": 0.37,
"step": 527
},
{
"epoch": 2.5904059040590406,
"grad_norm": 1.346571683883667,
"learning_rate": 5.895175463751385e-05,
"loss": 0.398,
"step": 528
},
{
"epoch": 2.5953259532595325,
"grad_norm": 1.3295096158981323,
"learning_rate": 5.858991507111122e-05,
"loss": 0.4046,
"step": 529
},
{
"epoch": 2.6002460024600245,
"grad_norm": 2.531033515930176,
"learning_rate": 5.8228728713962543e-05,
"loss": 0.441,
"step": 530
},
{
"epoch": 2.6051660516605164,
"grad_norm": 1.4059702157974243,
"learning_rate": 5.786820126347876e-05,
"loss": 0.3887,
"step": 531
},
{
"epoch": 2.6100861008610083,
"grad_norm": 0.8365688323974609,
"learning_rate": 5.750833840667711e-05,
"loss": 0.3926,
"step": 532
},
{
"epoch": 2.6150061500615007,
"grad_norm": 1.1072005033493042,
"learning_rate": 5.7149145820091385e-05,
"loss": 0.4331,
"step": 533
},
{
"epoch": 2.6199261992619927,
"grad_norm": 4.232044219970703,
"learning_rate": 5.6790629169682564e-05,
"loss": 0.313,
"step": 534
},
{
"epoch": 2.6248462484624846,
"grad_norm": 1.9496935606002808,
"learning_rate": 5.6432794110749134e-05,
"loss": 0.388,
"step": 535
},
{
"epoch": 2.6297662976629765,
"grad_norm": 2.281867265701294,
"learning_rate": 5.607564628783817e-05,
"loss": 0.3739,
"step": 536
},
{
"epoch": 2.6346863468634685,
"grad_norm": 1.5007566213607788,
"learning_rate": 5.571919133465605e-05,
"loss": 0.4018,
"step": 537
},
{
"epoch": 2.639606396063961,
"grad_norm": 1.5338659286499023,
"learning_rate": 5.5363434873979903e-05,
"loss": 0.3782,
"step": 538
},
{
"epoch": 2.644526445264453,
"grad_norm": 1.8886133432388306,
"learning_rate": 5.500838251756857e-05,
"loss": 0.4441,
"step": 539
},
{
"epoch": 2.6494464944649447,
"grad_norm": 3.30102801322937,
"learning_rate": 5.465403986607426e-05,
"loss": 0.3137,
"step": 540
},
{
"epoch": 2.6543665436654367,
"grad_norm": 1.8262077569961548,
"learning_rate": 5.430041250895428e-05,
"loss": 0.4104,
"step": 541
},
{
"epoch": 2.6592865928659286,
"grad_norm": 1.551676869392395,
"learning_rate": 5.3947506024382665e-05,
"loss": 0.3337,
"step": 542
},
{
"epoch": 2.6642066420664205,
"grad_norm": 2.0609912872314453,
"learning_rate": 5.359532597916233e-05,
"loss": 0.3059,
"step": 543
},
{
"epoch": 2.6691266912669125,
"grad_norm": 2.948434829711914,
"learning_rate": 5.324387792863719e-05,
"loss": 0.4629,
"step": 544
},
{
"epoch": 2.6740467404674044,
"grad_norm": 1.7256718873977661,
"learning_rate": 5.289316741660466e-05,
"loss": 0.3752,
"step": 545
},
{
"epoch": 2.678966789667897,
"grad_norm": 3.3157119750976562,
"learning_rate": 5.254319997522796e-05,
"loss": 0.4715,
"step": 546
},
{
"epoch": 2.6838868388683887,
"grad_norm": 2.951591730117798,
"learning_rate": 5.21939811249492e-05,
"loss": 0.4372,
"step": 547
},
{
"epoch": 2.6888068880688807,
"grad_norm": 1.9655730724334717,
"learning_rate": 5.1845516374401784e-05,
"loss": 0.3728,
"step": 548
},
{
"epoch": 2.6937269372693726,
"grad_norm": 2.9351847171783447,
"learning_rate": 5.14978112203241e-05,
"loss": 0.4404,
"step": 549
},
{
"epoch": 2.6986469864698646,
"grad_norm": 1.8943357467651367,
"learning_rate": 5.11508711474725e-05,
"loss": 0.2844,
"step": 550
},
{
"epoch": 2.703567035670357,
"grad_norm": 0.9512324333190918,
"learning_rate": 5.080470162853472e-05,
"loss": 0.3566,
"step": 551
},
{
"epoch": 2.708487084870849,
"grad_norm": 1.0358315706253052,
"learning_rate": 5.0459308124043715e-05,
"loss": 0.3267,
"step": 552
},
{
"epoch": 2.713407134071341,
"grad_norm": 0.828611433506012,
"learning_rate": 5.0114696082291425e-05,
"loss": 0.3766,
"step": 553
},
{
"epoch": 2.7183271832718328,
"grad_norm": 1.0593851804733276,
"learning_rate": 4.9770870939242986e-05,
"loss": 0.3895,
"step": 554
},
{
"epoch": 2.7232472324723247,
"grad_norm": 1.6970057487487793,
"learning_rate": 4.942783811845074e-05,
"loss": 0.3349,
"step": 555
},
{
"epoch": 2.7281672816728166,
"grad_norm": 0.8009957671165466,
"learning_rate": 4.908560303096887e-05,
"loss": 0.3741,
"step": 556
},
{
"epoch": 2.7330873308733086,
"grad_norm": 1.5965189933776855,
"learning_rate": 4.874417107526795e-05,
"loss": 0.326,
"step": 557
},
{
"epoch": 2.7380073800738005,
"grad_norm": 1.6400642395019531,
"learning_rate": 4.840354763714991e-05,
"loss": 0.3416,
"step": 558
},
{
"epoch": 2.742927429274293,
"grad_norm": 2.1281847953796387,
"learning_rate": 4.8063738089662926e-05,
"loss": 0.3142,
"step": 559
},
{
"epoch": 2.747847478474785,
"grad_norm": 1.0202291011810303,
"learning_rate": 4.772474779301669e-05,
"loss": 0.389,
"step": 560
},
{
"epoch": 2.7527675276752768,
"grad_norm": 1.6404527425765991,
"learning_rate": 4.738658209449805e-05,
"loss": 0.3891,
"step": 561
},
{
"epoch": 2.7576875768757687,
"grad_norm": 0.9586972594261169,
"learning_rate": 4.704924632838636e-05,
"loss": 0.3888,
"step": 562
},
{
"epoch": 2.7626076260762606,
"grad_norm": 3.16829776763916,
"learning_rate": 4.671274581586958e-05,
"loss": 0.4402,
"step": 563
},
{
"epoch": 2.767527675276753,
"grad_norm": 1.2215882539749146,
"learning_rate": 4.637708586496018e-05,
"loss": 0.3508,
"step": 564
},
{
"epoch": 2.772447724477245,
"grad_norm": 1.4616819620132446,
"learning_rate": 4.604227177041156e-05,
"loss": 0.4277,
"step": 565
},
{
"epoch": 2.777367773677737,
"grad_norm": 1.0973330736160278,
"learning_rate": 4.570830881363439e-05,
"loss": 0.4127,
"step": 566
},
{
"epoch": 2.782287822878229,
"grad_norm": 1.676638126373291,
"learning_rate": 4.537520226261333e-05,
"loss": 0.4243,
"step": 567
},
{
"epoch": 2.787207872078721,
"grad_norm": 2.636601448059082,
"learning_rate": 4.5042957371824057e-05,
"loss": 0.3116,
"step": 568
},
{
"epoch": 2.7921279212792127,
"grad_norm": 1.0604605674743652,
"learning_rate": 4.471157938215017e-05,
"loss": 0.4186,
"step": 569
},
{
"epoch": 2.7970479704797047,
"grad_norm": 1.1565591096878052,
"learning_rate": 4.438107352080076e-05,
"loss": 0.4068,
"step": 570
},
{
"epoch": 2.8019680196801966,
"grad_norm": 0.5449007153511047,
"learning_rate": 4.405144500122772e-05,
"loss": 0.3739,
"step": 571
},
{
"epoch": 2.8068880688806885,
"grad_norm": 1.7176798582077026,
"learning_rate": 4.372269902304363e-05,
"loss": 0.4036,
"step": 572
},
{
"epoch": 2.811808118081181,
"grad_norm": 1.9100306034088135,
"learning_rate": 4.339484077193974e-05,
"loss": 0.3436,
"step": 573
},
{
"epoch": 2.816728167281673,
"grad_norm": 0.950062096118927,
"learning_rate": 4.3067875419604184e-05,
"loss": 0.4206,
"step": 574
},
{
"epoch": 2.821648216482165,
"grad_norm": 1.950170636177063,
"learning_rate": 4.2741808123640335e-05,
"loss": 0.4187,
"step": 575
},
{
"epoch": 2.8265682656826567,
"grad_norm": 1.7998218536376953,
"learning_rate": 4.241664402748544e-05,
"loss": 0.3643,
"step": 576
},
{
"epoch": 2.8314883148831487,
"grad_norm": 0.5359982252120972,
"learning_rate": 4.209238826032965e-05,
"loss": 0.4071,
"step": 577
},
{
"epoch": 2.836408364083641,
"grad_norm": 2.177288055419922,
"learning_rate": 4.1769045937034876e-05,
"loss": 0.4656,
"step": 578
},
{
"epoch": 2.841328413284133,
"grad_norm": 1.8691096305847168,
"learning_rate": 4.144662215805426e-05,
"loss": 0.4428,
"step": 579
},
{
"epoch": 2.846248462484625,
"grad_norm": 1.3971619606018066,
"learning_rate": 4.1125122009351634e-05,
"loss": 0.3774,
"step": 580
},
{
"epoch": 2.851168511685117,
"grad_norm": 1.863781213760376,
"learning_rate": 4.080455056232147e-05,
"loss": 0.3686,
"step": 581
},
{
"epoch": 2.856088560885609,
"grad_norm": 2.2776503562927246,
"learning_rate": 4.048491287370863e-05,
"loss": 0.4009,
"step": 582
},
{
"epoch": 2.8610086100861007,
"grad_norm": 0.3682532012462616,
"learning_rate": 4.016621398552877e-05,
"loss": 0.4106,
"step": 583
},
{
"epoch": 2.8659286592865927,
"grad_norm": 1.6590131521224976,
"learning_rate": 3.9848458924988684e-05,
"loss": 0.4113,
"step": 584
},
{
"epoch": 2.8708487084870846,
"grad_norm": 0.8245828747749329,
"learning_rate": 3.953165270440721e-05,
"loss": 0.3874,
"step": 585
},
{
"epoch": 2.875768757687577,
"grad_norm": 1.1494457721710205,
"learning_rate": 3.921580032113602e-05,
"loss": 0.4041,
"step": 586
},
{
"epoch": 2.880688806888069,
"grad_norm": 0.3291958272457123,
"learning_rate": 3.8900906757480614e-05,
"loss": 0.4009,
"step": 587
},
{
"epoch": 2.885608856088561,
"grad_norm": 3.5113492012023926,
"learning_rate": 3.858697698062217e-05,
"loss": 0.4783,
"step": 588
},
{
"epoch": 2.890528905289053,
"grad_norm": 0.7835597991943359,
"learning_rate": 3.8274015942538745e-05,
"loss": 0.3928,
"step": 589
},
{
"epoch": 2.8954489544895448,
"grad_norm": 1.4036983251571655,
"learning_rate": 3.7962028579927555e-05,
"loss": 0.3694,
"step": 590
},
{
"epoch": 2.900369003690037,
"grad_norm": 1.1807712316513062,
"learning_rate": 3.7651019814126654e-05,
"loss": 0.385,
"step": 591
},
{
"epoch": 2.905289052890529,
"grad_norm": 2.0742995738983154,
"learning_rate": 3.734099455103779e-05,
"loss": 0.4164,
"step": 592
},
{
"epoch": 2.910209102091021,
"grad_norm": 2.6549105644226074,
"learning_rate": 3.7031957681048604e-05,
"loss": 0.347,
"step": 593
},
{
"epoch": 2.915129151291513,
"grad_norm": 1.3094247579574585,
"learning_rate": 3.6723914078955825e-05,
"loss": 0.4112,
"step": 594
},
{
"epoch": 2.920049200492005,
"grad_norm": 0.5627428293228149,
"learning_rate": 3.64168686038881e-05,
"loss": 0.3947,
"step": 595
},
{
"epoch": 2.924969249692497,
"grad_norm": 1.4705300331115723,
"learning_rate": 3.6110826099229453e-05,
"loss": 0.3828,
"step": 596
},
{
"epoch": 2.9298892988929888,
"grad_norm": 0.9498153924942017,
"learning_rate": 3.580579139254303e-05,
"loss": 0.3829,
"step": 597
},
{
"epoch": 2.9348093480934807,
"grad_norm": 2.052823781967163,
"learning_rate": 3.550176929549468e-05,
"loss": 0.3334,
"step": 598
},
{
"epoch": 2.939729397293973,
"grad_norm": 0.9632225036621094,
"learning_rate": 3.5198764603777235e-05,
"loss": 0.3681,
"step": 599
},
{
"epoch": 2.944649446494465,
"grad_norm": 1.2577297687530518,
"learning_rate": 3.489678209703475e-05,
"loss": 0.3469,
"step": 600
},
{
"epoch": 2.949569495694957,
"grad_norm": 1.42790949344635,
"learning_rate": 3.459582653878731e-05,
"loss": 0.4072,
"step": 601
},
{
"epoch": 2.954489544895449,
"grad_norm": 2.504870653152466,
"learning_rate": 3.429590267635565e-05,
"loss": 0.4232,
"step": 602
},
{
"epoch": 2.959409594095941,
"grad_norm": 2.3047032356262207,
"learning_rate": 3.399701524078635e-05,
"loss": 0.3763,
"step": 603
},
{
"epoch": 2.9643296432964332,
"grad_norm": 1.7464078664779663,
"learning_rate": 3.369916894677733e-05,
"loss": 0.3354,
"step": 604
},
{
"epoch": 2.969249692496925,
"grad_norm": 1.6479971408843994,
"learning_rate": 3.340236849260324e-05,
"loss": 0.3798,
"step": 605
},
{
"epoch": 2.974169741697417,
"grad_norm": 1.558695673942566,
"learning_rate": 3.31066185600417e-05,
"loss": 0.3488,
"step": 606
},
{
"epoch": 2.979089790897909,
"grad_norm": 3.189610719680786,
"learning_rate": 3.281192381429894e-05,
"loss": 0.441,
"step": 607
},
{
"epoch": 2.984009840098401,
"grad_norm": 0.9114331603050232,
"learning_rate": 3.251828890393677e-05,
"loss": 0.3922,
"step": 608
},
{
"epoch": 2.988929889298893,
"grad_norm": 0.84954833984375,
"learning_rate": 3.222571846079881e-05,
"loss": 0.3682,
"step": 609
},
{
"epoch": 2.993849938499385,
"grad_norm": 2.6202147006988525,
"learning_rate": 3.193421709993779e-05,
"loss": 0.453,
"step": 610
},
{
"epoch": 2.998769987699877,
"grad_norm": 1.6845208406448364,
"learning_rate": 3.1643789419542324e-05,
"loss": 0.3606,
"step": 611
},
{
"epoch": 3.0,
"grad_norm": 4.893674850463867,
"learning_rate": 3.135444000086485e-05,
"loss": 0.5199,
"step": 612
},
{
"epoch": 3.004920049200492,
"grad_norm": 1.350771427154541,
"learning_rate": 3.1066173408148955e-05,
"loss": 0.3319,
"step": 613
},
{
"epoch": 3.009840098400984,
"grad_norm": 2.239192247390747,
"learning_rate": 3.077899418855772e-05,
"loss": 0.4358,
"step": 614
},
{
"epoch": 3.014760147601476,
"grad_norm": 2.0310704708099365,
"learning_rate": 3.04929068721017e-05,
"loss": 0.4024,
"step": 615
},
{
"epoch": 3.019680196801968,
"grad_norm": 0.5520709156990051,
"learning_rate": 3.0207915971567624e-05,
"loss": 0.3869,
"step": 616
},
{
"epoch": 3.02460024600246,
"grad_norm": 1.409179925918579,
"learning_rate": 2.992402598244727e-05,
"loss": 0.343,
"step": 617
},
{
"epoch": 3.029520295202952,
"grad_norm": 3.2636709213256836,
"learning_rate": 2.9641241382866348e-05,
"loss": 0.3208,
"step": 618
},
{
"epoch": 3.034440344403444,
"grad_norm": 1.3331984281539917,
"learning_rate": 2.9359566633514037e-05,
"loss": 0.4065,
"step": 619
},
{
"epoch": 3.039360393603936,
"grad_norm": 1.49379563331604,
"learning_rate": 2.907900617757252e-05,
"loss": 0.3844,
"step": 620
},
{
"epoch": 3.044280442804428,
"grad_norm": 1.0063300132751465,
"learning_rate": 2.879956444064703e-05,
"loss": 0.4103,
"step": 621
},
{
"epoch": 3.0492004920049203,
"grad_norm": 1.5763076543807983,
"learning_rate": 2.8521245830695864e-05,
"loss": 0.4199,
"step": 622
},
{
"epoch": 3.054120541205412,
"grad_norm": 1.9557186365127563,
"learning_rate": 2.8244054737960935e-05,
"loss": 0.3928,
"step": 623
},
{
"epoch": 3.059040590405904,
"grad_norm": 1.7936758995056152,
"learning_rate": 2.7967995534898596e-05,
"loss": 0.3503,
"step": 624
},
{
"epoch": 3.063960639606396,
"grad_norm": 2.0918500423431396,
"learning_rate": 2.7693072576110514e-05,
"loss": 0.3772,
"step": 625
},
{
"epoch": 3.068880688806888,
"grad_norm": 1.531785249710083,
"learning_rate": 2.7419290198275095e-05,
"loss": 0.413,
"step": 626
},
{
"epoch": 3.07380073800738,
"grad_norm": 0.7834340929985046,
"learning_rate": 2.7146652720079003e-05,
"loss": 0.3919,
"step": 627
},
{
"epoch": 3.078720787207872,
"grad_norm": 1.8467501401901245,
"learning_rate": 2.6875164442149147e-05,
"loss": 0.368,
"step": 628
},
{
"epoch": 3.0836408364083643,
"grad_norm": 1.6197096109390259,
"learning_rate": 2.6604829646984686e-05,
"loss": 0.3476,
"step": 629
},
{
"epoch": 3.088560885608856,
"grad_norm": 2.2266929149627686,
"learning_rate": 2.6335652598889683e-05,
"loss": 0.3692,
"step": 630
},
{
"epoch": 3.093480934809348,
"grad_norm": 1.0801973342895508,
"learning_rate": 2.60676375439055e-05,
"loss": 0.4145,
"step": 631
},
{
"epoch": 3.09840098400984,
"grad_norm": 0.6759971976280212,
"learning_rate": 2.5800788709744227e-05,
"loss": 0.3621,
"step": 632
},
{
"epoch": 3.103321033210332,
"grad_norm": 1.5428274869918823,
"learning_rate": 2.5535110305721776e-05,
"loss": 0.3946,
"step": 633
},
{
"epoch": 3.108241082410824,
"grad_norm": 0.4800112843513489,
"learning_rate": 2.5270606522691443e-05,
"loss": 0.3695,
"step": 634
},
{
"epoch": 3.113161131611316,
"grad_norm": 1.418677568435669,
"learning_rate": 2.500728153297788e-05,
"loss": 0.3413,
"step": 635
},
{
"epoch": 3.1180811808118083,
"grad_norm": 1.384252667427063,
"learning_rate": 2.4745139490311254e-05,
"loss": 0.3376,
"step": 636
},
{
"epoch": 3.1230012300123002,
"grad_norm": 0.7807061672210693,
"learning_rate": 2.4484184529761834e-05,
"loss": 0.4,
"step": 637
},
{
"epoch": 3.127921279212792,
"grad_norm": 1.9366016387939453,
"learning_rate": 2.4224420767674562e-05,
"loss": 0.3731,
"step": 638
},
{
"epoch": 3.132841328413284,
"grad_norm": 2.5923564434051514,
"learning_rate": 2.3965852301604254e-05,
"loss": 0.4395,
"step": 639
},
{
"epoch": 3.137761377613776,
"grad_norm": 0.9284645318984985,
"learning_rate": 2.370848321025093e-05,
"loss": 0.3901,
"step": 640
},
{
"epoch": 3.142681426814268,
"grad_norm": 1.9988764524459839,
"learning_rate": 2.345231755339554e-05,
"loss": 0.4379,
"step": 641
},
{
"epoch": 3.14760147601476,
"grad_norm": 1.626031517982483,
"learning_rate": 2.3197359371835802e-05,
"loss": 0.4256,
"step": 642
},
{
"epoch": 3.1525215252152523,
"grad_norm": 2.1211905479431152,
"learning_rate": 2.2943612687322525e-05,
"loss": 0.3934,
"step": 643
},
{
"epoch": 3.1574415744157442,
"grad_norm": 1.0140880346298218,
"learning_rate": 2.2691081502496246e-05,
"loss": 0.3604,
"step": 644
},
{
"epoch": 3.162361623616236,
"grad_norm": 2.3775453567504883,
"learning_rate": 2.243976980082394e-05,
"loss": 0.4068,
"step": 645
},
{
"epoch": 3.167281672816728,
"grad_norm": 2.1912922859191895,
"learning_rate": 2.218968154653629e-05,
"loss": 0.3614,
"step": 646
},
{
"epoch": 3.17220172201722,
"grad_norm": 1.8802082538604736,
"learning_rate": 2.194082068456509e-05,
"loss": 0.3843,
"step": 647
},
{
"epoch": 3.177121771217712,
"grad_norm": 1.67764151096344,
"learning_rate": 2.169319114048114e-05,
"loss": 0.3707,
"step": 648
},
{
"epoch": 3.1820418204182044,
"grad_norm": 1.9697654247283936,
"learning_rate": 2.1446796820432167e-05,
"loss": 0.3357,
"step": 649
},
{
"epoch": 3.1869618696186963,
"grad_norm": 1.7767447233200073,
"learning_rate": 2.1201641611081246e-05,
"loss": 0.3937,
"step": 650
},
{
"epoch": 3.1918819188191883,
"grad_norm": 1.3625164031982422,
"learning_rate": 2.0957729379545655e-05,
"loss": 0.3593,
"step": 651
},
{
"epoch": 3.19680196801968,
"grad_norm": 1.0841906070709229,
"learning_rate": 2.0715063973335568e-05,
"loss": 0.393,
"step": 652
},
{
"epoch": 3.201722017220172,
"grad_norm": 3.0648295879364014,
"learning_rate": 2.04736492202937e-05,
"loss": 0.3615,
"step": 653
},
{
"epoch": 3.206642066420664,
"grad_norm": 1.1780354976654053,
"learning_rate": 2.0233488928534673e-05,
"loss": 0.3733,
"step": 654
},
{
"epoch": 3.211562115621156,
"grad_norm": 2.0348012447357178,
"learning_rate": 1.9994586886385046e-05,
"loss": 0.3895,
"step": 655
},
{
"epoch": 3.2164821648216484,
"grad_norm": 0.7234269380569458,
"learning_rate": 1.9756946862323535e-05,
"loss": 0.3621,
"step": 656
},
{
"epoch": 3.2214022140221403,
"grad_norm": 2.2290384769439697,
"learning_rate": 1.9520572604921672e-05,
"loss": 0.4369,
"step": 657
},
{
"epoch": 3.2263222632226323,
"grad_norm": 0.7513899803161621,
"learning_rate": 1.9285467842784467e-05,
"loss": 0.3614,
"step": 658
},
{
"epoch": 3.231242312423124,
"grad_norm": 2.5259876251220703,
"learning_rate": 1.9051636284491757e-05,
"loss": 0.3877,
"step": 659
},
{
"epoch": 3.236162361623616,
"grad_norm": 2.885737180709839,
"learning_rate": 1.8819081618539723e-05,
"loss": 0.4691,
"step": 660
},
{
"epoch": 3.241082410824108,
"grad_norm": 1.888336181640625,
"learning_rate": 1.858780751328255e-05,
"loss": 0.433,
"step": 661
},
{
"epoch": 3.2460024600246005,
"grad_norm": 0.801278293132782,
"learning_rate": 1.8357817616874694e-05,
"loss": 0.3704,
"step": 662
},
{
"epoch": 3.2509225092250924,
"grad_norm": 1.2432537078857422,
"learning_rate": 1.8129115557213262e-05,
"loss": 0.3552,
"step": 663
},
{
"epoch": 3.2558425584255843,
"grad_norm": 1.9892895221710205,
"learning_rate": 1.7901704941880914e-05,
"loss": 0.3551,
"step": 664
},
{
"epoch": 3.2607626076260763,
"grad_norm": 1.448431372642517,
"learning_rate": 1.7675589358088763e-05,
"loss": 0.4053,
"step": 665
},
{
"epoch": 3.265682656826568,
"grad_norm": 2.4297046661376953,
"learning_rate": 1.745077237261994e-05,
"loss": 0.4334,
"step": 666
},
{
"epoch": 3.27060270602706,
"grad_norm": 1.624751329421997,
"learning_rate": 1.7227257531773223e-05,
"loss": 0.4296,
"step": 667
},
{
"epoch": 3.275522755227552,
"grad_norm": 1.1023207902908325,
"learning_rate": 1.7005048361307262e-05,
"loss": 0.375,
"step": 668
},
{
"epoch": 3.280442804428044,
"grad_norm": 1.6138256788253784,
"learning_rate": 1.6784148366384754e-05,
"loss": 0.3394,
"step": 669
},
{
"epoch": 3.2853628536285364,
"grad_norm": 0.9887522459030151,
"learning_rate": 1.656456103151728e-05,
"loss": 0.3597,
"step": 670
},
{
"epoch": 3.2902829028290284,
"grad_norm": 1.7043898105621338,
"learning_rate": 1.6346289820510363e-05,
"loss": 0.3417,
"step": 671
},
{
"epoch": 3.2952029520295203,
"grad_norm": 1.6882188320159912,
"learning_rate": 1.612933817640868e-05,
"loss": 0.436,
"step": 672
},
{
"epoch": 3.3001230012300122,
"grad_norm": 0.7217171788215637,
"learning_rate": 1.5913709521441988e-05,
"loss": 0.3997,
"step": 673
},
{
"epoch": 3.305043050430504,
"grad_norm": 2.6820271015167236,
"learning_rate": 1.5699407256970833e-05,
"loss": 0.3115,
"step": 674
},
{
"epoch": 3.3099630996309966,
"grad_norm": 1.4860421419143677,
"learning_rate": 1.5486434763433222e-05,
"loss": 0.3516,
"step": 675
},
{
"epoch": 3.3148831488314885,
"grad_norm": 1.136051893234253,
"learning_rate": 1.527479540029104e-05,
"loss": 0.4023,
"step": 676
},
{
"epoch": 3.3198031980319804,
"grad_norm": 2.500821828842163,
"learning_rate": 1.5064492505977234e-05,
"loss": 0.4225,
"step": 677
},
{
"epoch": 3.3247232472324724,
"grad_norm": 0.5306374430656433,
"learning_rate": 1.4855529397843038e-05,
"loss": 0.3675,
"step": 678
},
{
"epoch": 3.3296432964329643,
"grad_norm": 1.5522453784942627,
"learning_rate": 1.4647909372105672e-05,
"loss": 0.3182,
"step": 679
},
{
"epoch": 3.3345633456334562,
"grad_norm": 1.6273597478866577,
"learning_rate": 1.4441635703796408e-05,
"loss": 0.3548,
"step": 680
},
{
"epoch": 3.339483394833948,
"grad_norm": 1.7513864040374756,
"learning_rate": 1.4236711646708844e-05,
"loss": 0.3177,
"step": 681
},
{
"epoch": 3.34440344403444,
"grad_norm": 1.033565878868103,
"learning_rate": 1.4033140433347569e-05,
"loss": 0.3639,
"step": 682
},
{
"epoch": 3.3493234932349325,
"grad_norm": 1.3103158473968506,
"learning_rate": 1.3830925274877216e-05,
"loss": 0.4256,
"step": 683
},
{
"epoch": 3.3542435424354244,
"grad_norm": 2.1008458137512207,
"learning_rate": 1.363006936107183e-05,
"loss": 0.4194,
"step": 684
},
{
"epoch": 3.3591635916359164,
"grad_norm": 1.350831151008606,
"learning_rate": 1.343057586026446e-05,
"loss": 0.3792,
"step": 685
},
{
"epoch": 3.3640836408364083,
"grad_norm": 3.0984957218170166,
"learning_rate": 1.3232447919297274e-05,
"loss": 0.4341,
"step": 686
},
{
"epoch": 3.3690036900369003,
"grad_norm": 0.47078070044517517,
"learning_rate": 1.3035688663471834e-05,
"loss": 0.3664,
"step": 687
},
{
"epoch": 3.373923739237392,
"grad_norm": 1.277298927307129,
"learning_rate": 1.2840301196499893e-05,
"loss": 0.3714,
"step": 688
},
{
"epoch": 3.3788437884378846,
"grad_norm": 2.4945287704467773,
"learning_rate": 1.2646288600454448e-05,
"loss": 0.3517,
"step": 689
},
{
"epoch": 3.3837638376383765,
"grad_norm": 0.9373493194580078,
"learning_rate": 1.2453653935720867e-05,
"loss": 0.3881,
"step": 690
},
{
"epoch": 3.3886838868388685,
"grad_norm": 4.251840591430664,
"learning_rate": 1.2262400240949023e-05,
"loss": 0.305,
"step": 691
},
{
"epoch": 3.3936039360393604,
"grad_norm": 2.382617950439453,
"learning_rate": 1.2072530533005012e-05,
"loss": 0.4376,
"step": 692
},
{
"epoch": 3.3985239852398523,
"grad_norm": 1.3531382083892822,
"learning_rate": 1.1884047806923815e-05,
"loss": 0.4127,
"step": 693
},
{
"epoch": 3.4034440344403443,
"grad_norm": 0.8284920454025269,
"learning_rate": 1.169695503586179e-05,
"loss": 0.406,
"step": 694
},
{
"epoch": 3.408364083640836,
"grad_norm": 0.6216104030609131,
"learning_rate": 1.1511255171050084e-05,
"loss": 0.3963,
"step": 695
},
{
"epoch": 3.4132841328413286,
"grad_norm": 2.1421051025390625,
"learning_rate": 1.1326951141747788e-05,
"loss": 0.449,
"step": 696
},
{
"epoch": 3.4182041820418205,
"grad_norm": 1.2773298025131226,
"learning_rate": 1.1144045855195973e-05,
"loss": 0.3583,
"step": 697
},
{
"epoch": 3.4231242312423125,
"grad_norm": 1.9336838722229004,
"learning_rate": 1.0962542196571634e-05,
"loss": 0.363,
"step": 698
},
{
"epoch": 3.4280442804428044,
"grad_norm": 2.467573881149292,
"learning_rate": 1.078244302894229e-05,
"loss": 0.4245,
"step": 699
},
{
"epoch": 3.4329643296432963,
"grad_norm": 2.337416648864746,
"learning_rate": 1.0603751193220846e-05,
"loss": 0.4083,
"step": 700
},
{
"epoch": 3.4378843788437883,
"grad_norm": 2.5366225242614746,
"learning_rate": 1.0426469508120662e-05,
"loss": 0.353,
"step": 701
},
{
"epoch": 3.4428044280442807,
"grad_norm": 1.9000239372253418,
"learning_rate": 1.0250600770111185e-05,
"loss": 0.4028,
"step": 702
},
{
"epoch": 3.4477244772447726,
"grad_norm": 1.7372283935546875,
"learning_rate": 1.0076147753373789e-05,
"loss": 0.4029,
"step": 703
},
{
"epoch": 3.4526445264452645,
"grad_norm": 1.1029900312423706,
"learning_rate": 9.903113209758096e-06,
"loss": 0.3817,
"step": 704
},
{
"epoch": 3.4575645756457565,
"grad_norm": 1.5212130546569824,
"learning_rate": 9.731499868738447e-06,
"loss": 0.3745,
"step": 705
},
{
"epoch": 3.4624846248462484,
"grad_norm": 1.2530347108840942,
"learning_rate": 9.561310437370907e-06,
"loss": 0.4198,
"step": 706
},
{
"epoch": 3.4674046740467404,
"grad_norm": 1.090973138809204,
"learning_rate": 9.392547600250634e-06,
"loss": 0.3743,
"step": 707
},
{
"epoch": 3.4723247232472323,
"grad_norm": 0.8587853312492371,
"learning_rate": 9.225214019469385e-06,
"loss": 0.3928,
"step": 708
},
{
"epoch": 3.4772447724477242,
"grad_norm": 1.6450562477111816,
"learning_rate": 9.059312334573633e-06,
"loss": 0.3529,
"step": 709
},
{
"epoch": 3.4821648216482166,
"grad_norm": 1.3053218126296997,
"learning_rate": 8.89484516252287e-06,
"loss": 0.3634,
"step": 710
},
{
"epoch": 3.4870848708487086,
"grad_norm": 2.639911413192749,
"learning_rate": 8.731815097648433e-06,
"loss": 0.4159,
"step": 711
},
{
"epoch": 3.4920049200492005,
"grad_norm": 0.9935341477394104,
"learning_rate": 8.570224711612385e-06,
"loss": 0.3803,
"step": 712
},
{
"epoch": 3.4969249692496924,
"grad_norm": 1.752165675163269,
"learning_rate": 8.410076553367208e-06,
"loss": 0.4104,
"step": 713
},
{
"epoch": 3.5018450184501844,
"grad_norm": 1.270850419998169,
"learning_rate": 8.251373149115293e-06,
"loss": 0.4122,
"step": 714
},
{
"epoch": 3.5067650676506767,
"grad_norm": 2.370002508163452,
"learning_rate": 8.094117002269363e-06,
"loss": 0.4529,
"step": 715
},
{
"epoch": 3.5116851168511687,
"grad_norm": 2.229987382888794,
"learning_rate": 7.938310593412879e-06,
"loss": 0.4117,
"step": 716
},
{
"epoch": 3.5166051660516606,
"grad_norm": 1.700907588005066,
"learning_rate": 7.783956380260837e-06,
"loss": 0.3801,
"step": 717
},
{
"epoch": 3.5215252152521526,
"grad_norm": 1.5140172243118286,
"learning_rate": 7.631056797621106e-06,
"loss": 0.3708,
"step": 718
},
{
"epoch": 3.5264452644526445,
"grad_norm": 1.4080220460891724,
"learning_rate": 7.479614257355971e-06,
"loss": 0.3763,
"step": 719
},
{
"epoch": 3.5313653136531364,
"grad_norm": 1.585070252418518,
"learning_rate": 7.329631148344118e-06,
"loss": 0.358,
"step": 720
},
{
"epoch": 3.5362853628536284,
"grad_norm": 2.044015645980835,
"learning_rate": 7.181109836442912e-06,
"loss": 0.3774,
"step": 721
},
{
"epoch": 3.5412054120541203,
"grad_norm": 0.8359534740447998,
"learning_rate": 7.034052664451118e-06,
"loss": 0.3663,
"step": 722
},
{
"epoch": 3.5461254612546127,
"grad_norm": 2.3022444248199463,
"learning_rate": 6.88846195207189e-06,
"loss": 0.3065,
"step": 723
},
{
"epoch": 3.5510455104551046,
"grad_norm": 1.8175033330917358,
"learning_rate": 6.7443399958762584e-06,
"loss": 0.4242,
"step": 724
},
{
"epoch": 3.5559655596555966,
"grad_norm": 1.7454516887664795,
"learning_rate": 6.6016890692668364e-06,
"loss": 0.3996,
"step": 725
},
{
"epoch": 3.5608856088560885,
"grad_norm": 2.403921604156494,
"learning_rate": 6.460511422441984e-06,
"loss": 0.4444,
"step": 726
},
{
"epoch": 3.5658056580565805,
"grad_norm": 1.0997297763824463,
"learning_rate": 6.320809282360319e-06,
"loss": 0.4124,
"step": 727
},
{
"epoch": 3.570725707257073,
"grad_norm": 3.04303240776062,
"learning_rate": 6.1825848527055865e-06,
"loss": 0.4291,
"step": 728
},
{
"epoch": 3.5756457564575648,
"grad_norm": 0.9251189827919006,
"learning_rate": 6.04584031385188e-06,
"loss": 0.3733,
"step": 729
},
{
"epoch": 3.5805658056580567,
"grad_norm": 1.9034310579299927,
"learning_rate": 5.910577822829233e-06,
"loss": 0.3884,
"step": 730
},
{
"epoch": 3.5854858548585486,
"grad_norm": 1.187487244606018,
"learning_rate": 5.77679951328971e-06,
"loss": 0.4108,
"step": 731
},
{
"epoch": 3.5904059040590406,
"grad_norm": 1.513329267501831,
"learning_rate": 5.644507495473572e-06,
"loss": 0.4008,
"step": 732
},
{
"epoch": 3.5953259532595325,
"grad_norm": 2.4123191833496094,
"learning_rate": 5.5137038561761115e-06,
"loss": 0.4162,
"step": 733
},
{
"epoch": 3.6002460024600245,
"grad_norm": 1.3358474969863892,
"learning_rate": 5.3843906587146886e-06,
"loss": 0.4287,
"step": 734
},
{
"epoch": 3.6051660516605164,
"grad_norm": 1.746752142906189,
"learning_rate": 5.256569942896217e-06,
"loss": 0.341,
"step": 735
},
{
"epoch": 3.6100861008610083,
"grad_norm": 1.716902732849121,
"learning_rate": 5.130243724984995e-06,
"loss": 0.4344,
"step": 736
},
{
"epoch": 3.6150061500615007,
"grad_norm": 0.44636377692222595,
"learning_rate": 5.005413997670816e-06,
"loss": 0.3995,
"step": 737
},
{
"epoch": 3.6199261992619927,
"grad_norm": 0.6673928499221802,
"learning_rate": 4.8820827300376075e-06,
"loss": 0.3771,
"step": 738
},
{
"epoch": 3.6248462484624846,
"grad_norm": 1.8165249824523926,
"learning_rate": 4.760251867532362e-06,
"loss": 0.4214,
"step": 739
},
{
"epoch": 3.6297662976629765,
"grad_norm": 1.8206608295440674,
"learning_rate": 4.639923331934471e-06,
"loss": 0.3361,
"step": 740
},
{
"epoch": 3.6346863468634685,
"grad_norm": 1.2049740552902222,
"learning_rate": 4.521099021325336e-06,
"loss": 0.4241,
"step": 741
},
{
"epoch": 3.639606396063961,
"grad_norm": 2.151357650756836,
"learning_rate": 4.403780810058511e-06,
"loss": 0.3934,
"step": 742
},
{
"epoch": 3.644526445264453,
"grad_norm": 2.024153470993042,
"learning_rate": 4.287970548730069e-06,
"loss": 0.4109,
"step": 743
},
{
"epoch": 3.6494464944649447,
"grad_norm": 0.612326979637146,
"learning_rate": 4.173670064149482e-06,
"loss": 0.4119,
"step": 744
},
{
"epoch": 3.6543665436654367,
"grad_norm": 1.2650341987609863,
"learning_rate": 4.060881159310725e-06,
"loss": 0.4048,
"step": 745
},
{
"epoch": 3.6592865928659286,
"grad_norm": 1.5588371753692627,
"learning_rate": 3.949605613363882e-06,
"loss": 0.3616,
"step": 746
},
{
"epoch": 3.6642066420664205,
"grad_norm": 0.8163132667541504,
"learning_rate": 3.839845181587098e-06,
"loss": 0.4051,
"step": 747
},
{
"epoch": 3.6691266912669125,
"grad_norm": 2.6811370849609375,
"learning_rate": 3.7316015953588467e-06,
"loss": 0.4446,
"step": 748
},
{
"epoch": 3.6740467404674044,
"grad_norm": 3.0077154636383057,
"learning_rate": 3.6248765621306414e-06,
"loss": 0.3562,
"step": 749
},
{
"epoch": 3.678966789667897,
"grad_norm": 1.8142826557159424,
"learning_rate": 3.519671765400079e-06,
"loss": 0.3967,
"step": 750
},
{
"epoch": 3.6838868388683887,
"grad_norm": 4.520020008087158,
"learning_rate": 3.4159888646843495e-06,
"loss": 0.4737,
"step": 751
},
{
"epoch": 3.6888068880688807,
"grad_norm": 2.5950474739074707,
"learning_rate": 3.313829495493992e-06,
"loss": 0.3269,
"step": 752
},
{
"epoch": 3.6937269372693726,
"grad_norm": 0.9162222146987915,
"learning_rate": 3.2131952693070898e-06,
"loss": 0.4284,
"step": 753
},
{
"epoch": 3.6986469864698646,
"grad_norm": 2.3598175048828125,
"learning_rate": 3.1140877735439387e-06,
"loss": 0.4268,
"step": 754
},
{
"epoch": 3.703567035670357,
"grad_norm": 2.1901378631591797,
"learning_rate": 3.0165085715418763e-06,
"loss": 0.3514,
"step": 755
},
{
"epoch": 3.708487084870849,
"grad_norm": 1.2730752229690552,
"learning_rate": 2.9204592025307566e-06,
"loss": 0.3697,
"step": 756
},
{
"epoch": 3.713407134071341,
"grad_norm": 1.7523503303527832,
"learning_rate": 2.8259411816085492e-06,
"loss": 0.3626,
"step": 757
},
{
"epoch": 3.7183271832718328,
"grad_norm": 0.7201489806175232,
"learning_rate": 2.732955999717546e-06,
"loss": 0.4082,
"step": 758
},
{
"epoch": 3.7232472324723247,
"grad_norm": 2.6464169025421143,
"learning_rate": 2.6415051236207355e-06,
"loss": 0.3311,
"step": 759
},
{
"epoch": 3.7281672816728166,
"grad_norm": 1.9799178838729858,
"learning_rate": 2.551589995878789e-06,
"loss": 0.392,
"step": 760
},
{
"epoch": 3.7330873308733086,
"grad_norm": 1.5155545473098755,
"learning_rate": 2.4632120348272003e-06,
"loss": 0.3762,
"step": 761
},
{
"epoch": 3.7380073800738005,
"grad_norm": 1.5089105367660522,
"learning_rate": 2.376372634553936e-06,
"loss": 0.3995,
"step": 762
},
{
"epoch": 3.742927429274293,
"grad_norm": 1.772503137588501,
"learning_rate": 2.291073164877511e-06,
"loss": 0.3853,
"step": 763
},
{
"epoch": 3.747847478474785,
"grad_norm": 2.189436435699463,
"learning_rate": 2.207314971325292e-06,
"loss": 0.3494,
"step": 764
},
{
"epoch": 3.7527675276752768,
"grad_norm": 1.9785796403884888,
"learning_rate": 2.125099375112316e-06,
"loss": 0.3675,
"step": 765
},
{
"epoch": 3.7576875768757687,
"grad_norm": 2.732494831085205,
"learning_rate": 2.0444276731204415e-06,
"loss": 0.4188,
"step": 766
},
{
"epoch": 3.7626076260762606,
"grad_norm": 1.5634301900863647,
"learning_rate": 1.9653011378779283e-06,
"loss": 0.4186,
"step": 767
},
{
"epoch": 3.767527675276753,
"grad_norm": 0.6259942650794983,
"learning_rate": 1.88772101753929e-06,
"loss": 0.3834,
"step": 768
},
{
"epoch": 3.772447724477245,
"grad_norm": 1.3457146883010864,
"learning_rate": 1.8116885358656744e-06,
"loss": 0.3696,
"step": 769
},
{
"epoch": 3.777367773677737,
"grad_norm": 1.3714008331298828,
"learning_rate": 1.7372048922054906e-06,
"loss": 0.3921,
"step": 770
},
{
"epoch": 3.782287822878229,
"grad_norm": 1.4138679504394531,
"learning_rate": 1.6642712614755695e-06,
"loss": 0.4379,
"step": 771
},
{
"epoch": 3.787207872078721,
"grad_norm": 0.921842634677887,
"learning_rate": 1.5928887941426107e-06,
"loss": 0.3714,
"step": 772
},
{
"epoch": 3.7921279212792127,
"grad_norm": 2.7711589336395264,
"learning_rate": 1.523058616204942e-06,
"loss": 0.3689,
"step": 773
},
{
"epoch": 3.7970479704797047,
"grad_norm": 2.5462987422943115,
"learning_rate": 1.4547818291749115e-06,
"loss": 0.4578,
"step": 774
},
{
"epoch": 3.8019680196801966,
"grad_norm": 2.8806490898132324,
"learning_rate": 1.3880595100613792e-06,
"loss": 0.3297,
"step": 775
},
{
"epoch": 3.8068880688806885,
"grad_norm": 1.5188145637512207,
"learning_rate": 1.3228927113528189e-06,
"loss": 0.3871,
"step": 776
},
{
"epoch": 3.811808118081181,
"grad_norm": 0.9707936644554138,
"learning_rate": 1.2592824610006215e-06,
"loss": 0.3656,
"step": 777
},
{
"epoch": 3.816728167281673,
"grad_norm": 1.8770543336868286,
"learning_rate": 1.1972297624030072e-06,
"loss": 0.3981,
"step": 778
},
{
"epoch": 3.821648216482165,
"grad_norm": 2.3081560134887695,
"learning_rate": 1.1367355943890823e-06,
"loss": 0.341,
"step": 779
},
{
"epoch": 3.8265682656826567,
"grad_norm": 1.113144040107727,
"learning_rate": 1.0778009112034748e-06,
"loss": 0.3586,
"step": 780
},
{
"epoch": 3.8314883148831487,
"grad_norm": 0.5980240702629089,
"learning_rate": 1.0204266424912123e-06,
"loss": 0.376,
"step": 781
},
{
"epoch": 3.836408364083641,
"grad_norm": 0.6723970174789429,
"learning_rate": 9.64613693283123e-07,
"loss": 0.4038,
"step": 782
},
{
"epoch": 3.841328413284133,
"grad_norm": 2.4948697090148926,
"learning_rate": 9.103629439815354e-07,
"loss": 0.3738,
"step": 783
},
{
"epoch": 3.846248462484625,
"grad_norm": 1.11293625831604,
"learning_rate": 8.57675250346368e-07,
"loss": 0.3866,
"step": 784
},
{
"epoch": 3.851168511685117,
"grad_norm": 2.0996763706207275,
"learning_rate": 8.065514434816845e-07,
"loss": 0.4064,
"step": 785
},
{
"epoch": 3.856088560885609,
"grad_norm": 1.6557263135910034,
"learning_rate": 7.569923298225146e-07,
"loss": 0.3567,
"step": 786
},
{
"epoch": 3.8610086100861007,
"grad_norm": 1.717772364616394,
"learning_rate": 7.08998691122198e-07,
"loss": 0.3856,
"step": 787
},
{
"epoch": 3.8659286592865927,
"grad_norm": 1.4299819469451904,
"learning_rate": 6.625712844400056e-07,
"loss": 0.3652,
"step": 788
},
{
"epoch": 3.8708487084870846,
"grad_norm": 2.8910887241363525,
"learning_rate": 6.177108421292266e-07,
"loss": 0.4677,
"step": 789
},
{
"epoch": 3.875768757687577,
"grad_norm": 1.175137996673584,
"learning_rate": 5.744180718255776e-07,
"loss": 0.4193,
"step": 790
},
{
"epoch": 3.880688806888069,
"grad_norm": 1.1175763607025146,
"learning_rate": 5.326936564361118e-07,
"loss": 0.3875,
"step": 791
},
{
"epoch": 3.885608856088561,
"grad_norm": 0.9984952211380005,
"learning_rate": 4.92538254128383e-07,
"loss": 0.3799,
"step": 792
},
{
"epoch": 3.890528905289053,
"grad_norm": 1.142543077468872,
"learning_rate": 4.5395249832007604e-07,
"loss": 0.4194,
"step": 793
},
{
"epoch": 3.8954489544895448,
"grad_norm": 1.1013692617416382,
"learning_rate": 4.1693699766902626e-07,
"loss": 0.3853,
"step": 794
},
{
"epoch": 3.900369003690037,
"grad_norm": 1.5713825225830078,
"learning_rate": 3.814923360636158e-07,
"loss": 0.4418,
"step": 795
},
{
"epoch": 3.905289052890529,
"grad_norm": 3.3740017414093018,
"learning_rate": 3.4761907261356976e-07,
"loss": 0.3226,
"step": 796
},
{
"epoch": 3.910209102091021,
"grad_norm": 2.347411870956421,
"learning_rate": 3.1531774164111903e-07,
"loss": 0.4269,
"step": 797
},
{
"epoch": 3.915129151291513,
"grad_norm": 0.46610283851623535,
"learning_rate": 2.8458885267260705e-07,
"loss": 0.3861,
"step": 798
},
{
"epoch": 3.920049200492005,
"grad_norm": 2.183335304260254,
"learning_rate": 2.554328904303738e-07,
"loss": 0.4076,
"step": 799
},
{
"epoch": 3.924969249692497,
"grad_norm": 0.9739826321601868,
"learning_rate": 2.2785031482521758e-07,
"loss": 0.366,
"step": 800
},
{
"epoch": 3.9298892988929888,
"grad_norm": 1.9975255727767944,
"learning_rate": 2.0184156094905648e-07,
"loss": 0.4491,
"step": 801
},
{
"epoch": 3.9348093480934807,
"grad_norm": 2.732900619506836,
"learning_rate": 1.7740703906810042e-07,
"loss": 0.3248,
"step": 802
},
{
"epoch": 3.939729397293973,
"grad_norm": 0.8809100389480591,
"learning_rate": 1.545471346164007e-07,
"loss": 0.3633,
"step": 803
},
{
"epoch": 3.944649446494465,
"grad_norm": 0.5867434740066528,
"learning_rate": 1.3326220818968838e-07,
"loss": 0.3881,
"step": 804
},
{
"epoch": 3.949569495694957,
"grad_norm": 0.8650780320167542,
"learning_rate": 1.1355259553978981e-07,
"loss": 0.3669,
"step": 805
},
{
"epoch": 3.954489544895449,
"grad_norm": 1.4509629011154175,
"learning_rate": 9.541860756925314e-08,
"loss": 0.3649,
"step": 806
},
{
"epoch": 3.959409594095941,
"grad_norm": 2.9854180812835693,
"learning_rate": 7.886053032649665e-08,
"loss": 0.3379,
"step": 807
},
{
"epoch": 3.9643296432964332,
"grad_norm": 3.3452847003936768,
"learning_rate": 6.387862500125685e-08,
"loss": 0.3104,
"step": 808
},
{
"epoch": 3.969249692496925,
"grad_norm": 1.342034935951233,
"learning_rate": 5.047312792046954e-08,
"loss": 0.3895,
"step": 809
},
{
"epoch": 3.974169741697417,
"grad_norm": 1.3684653043746948,
"learning_rate": 3.8644250544594975e-08,
"loss": 0.3729,
"step": 810
},
{
"epoch": 3.979089790897909,
"grad_norm": 2.351048231124878,
"learning_rate": 2.839217946422057e-08,
"loss": 0.4621,
"step": 811
},
{
"epoch": 3.984009840098401,
"grad_norm": 0.49089106917381287,
"learning_rate": 1.971707639712994e-08,
"loss": 0.3819,
"step": 812
},
{
"epoch": 3.988929889298893,
"grad_norm": 1.8144298791885376,
"learning_rate": 1.2619078185793776e-08,
"loss": 0.4157,
"step": 813
},
{
"epoch": 3.993849938499385,
"grad_norm": 1.8721059560775757,
"learning_rate": 7.098296795138293e-09,
"loss": 0.3468,
"step": 814
},
{
"epoch": 3.998769987699877,
"grad_norm": 1.0250661373138428,
"learning_rate": 3.154819310868806e-09,
"loss": 0.401,
"step": 815
},
{
"epoch": 4.0,
"grad_norm": 1.8312103748321533,
"learning_rate": 7.887079380153317e-10,
"loss": 0.3332,
"step": 816
},
{
"epoch": 4.0,
"step": 816,
"total_flos": 1.3456927249947034e+17,
"train_loss": 0.40810306406780783,
"train_runtime": 2344.5136,
"train_samples_per_second": 11.093,
"train_steps_per_second": 0.348
}
],
"logging_steps": 1,
"max_steps": 816,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 2400000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3456927249947034e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}