1476 / trainer_state.json
iamPi's picture
Add files using upload-large-folder tool
6329f4a verified
Raw
History Blame Contribute Delete
253 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8972644376899696,
"eval_steps": 500,
"global_step": 1476,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006079027355623101,
"grad_norm": 44.818572998046875,
"learning_rate": 0.0,
"loss": 7.186539173126221,
"step": 1
},
{
"epoch": 0.0012158054711246201,
"grad_norm": 47.259071350097656,
"learning_rate": 1e-05,
"loss": 7.313593864440918,
"step": 2
},
{
"epoch": 0.00182370820668693,
"grad_norm": 23.298837661743164,
"learning_rate": 2e-05,
"loss": 7.087122917175293,
"step": 3
},
{
"epoch": 0.0024316109422492403,
"grad_norm": 13.535771369934082,
"learning_rate": 3e-05,
"loss": 6.942234992980957,
"step": 4
},
{
"epoch": 0.00303951367781155,
"grad_norm": 11.997403144836426,
"learning_rate": 4e-05,
"loss": 6.6411614418029785,
"step": 5
},
{
"epoch": 0.00364741641337386,
"grad_norm": 13.242263793945312,
"learning_rate": 5e-05,
"loss": 6.319230079650879,
"step": 6
},
{
"epoch": 0.00425531914893617,
"grad_norm": 10.080074310302734,
"learning_rate": 6e-05,
"loss": 6.251328468322754,
"step": 7
},
{
"epoch": 0.004863221884498481,
"grad_norm": 14.386478424072266,
"learning_rate": 7.000000000000001e-05,
"loss": 6.372805595397949,
"step": 8
},
{
"epoch": 0.00547112462006079,
"grad_norm": 6.731114387512207,
"learning_rate": 8e-05,
"loss": 6.32672119140625,
"step": 9
},
{
"epoch": 0.0060790273556231,
"grad_norm": 7.430361747741699,
"learning_rate": 8.999999999999999e-05,
"loss": 5.981637954711914,
"step": 10
},
{
"epoch": 0.006686930091185411,
"grad_norm": 6.817004680633545,
"learning_rate": 0.0001,
"loss": 6.182029724121094,
"step": 11
},
{
"epoch": 0.00729483282674772,
"grad_norm": 6.540442943572998,
"learning_rate": 0.00011,
"loss": 6.224725723266602,
"step": 12
},
{
"epoch": 0.007902735562310031,
"grad_norm": 6.224416255950928,
"learning_rate": 0.00012,
"loss": 6.106351852416992,
"step": 13
},
{
"epoch": 0.00851063829787234,
"grad_norm": 5.954357624053955,
"learning_rate": 0.00013000000000000002,
"loss": 6.050826072692871,
"step": 14
},
{
"epoch": 0.00911854103343465,
"grad_norm": 5.7734551429748535,
"learning_rate": 0.00014000000000000001,
"loss": 6.147342681884766,
"step": 15
},
{
"epoch": 0.009726443768996961,
"grad_norm": 6.399932861328125,
"learning_rate": 0.00015,
"loss": 6.284224510192871,
"step": 16
},
{
"epoch": 0.01033434650455927,
"grad_norm": 4.2578558921813965,
"learning_rate": 0.00016,
"loss": 5.968033790588379,
"step": 17
},
{
"epoch": 0.01094224924012158,
"grad_norm": 3.9558868408203125,
"learning_rate": 0.00017,
"loss": 5.909118175506592,
"step": 18
},
{
"epoch": 0.011550151975683891,
"grad_norm": 3.4882659912109375,
"learning_rate": 0.00017999999999999998,
"loss": 6.045907974243164,
"step": 19
},
{
"epoch": 0.0121580547112462,
"grad_norm": 6.301029682159424,
"learning_rate": 0.00019,
"loss": 5.905165672302246,
"step": 20
},
{
"epoch": 0.01276595744680851,
"grad_norm": 3.891385078430176,
"learning_rate": 0.0002,
"loss": 5.9485931396484375,
"step": 21
},
{
"epoch": 0.013373860182370821,
"grad_norm": 4.277671813964844,
"learning_rate": 0.00021,
"loss": 5.995012283325195,
"step": 22
},
{
"epoch": 0.01398176291793313,
"grad_norm": 3.7930500507354736,
"learning_rate": 0.00022,
"loss": 6.081092834472656,
"step": 23
},
{
"epoch": 0.01458966565349544,
"grad_norm": 5.02017879486084,
"learning_rate": 0.00023,
"loss": 6.232627868652344,
"step": 24
},
{
"epoch": 0.015197568389057751,
"grad_norm": 3.485990285873413,
"learning_rate": 0.00024,
"loss": 6.189592361450195,
"step": 25
},
{
"epoch": 0.015805471124620062,
"grad_norm": 4.133285999298096,
"learning_rate": 0.00025,
"loss": 5.953710079193115,
"step": 26
},
{
"epoch": 0.01641337386018237,
"grad_norm": 4.140801429748535,
"learning_rate": 0.00026000000000000003,
"loss": 5.926338195800781,
"step": 27
},
{
"epoch": 0.01702127659574468,
"grad_norm": 3.4010164737701416,
"learning_rate": 0.00027,
"loss": 5.7254462242126465,
"step": 28
},
{
"epoch": 0.01762917933130699,
"grad_norm": 10.262829780578613,
"learning_rate": 0.00028000000000000003,
"loss": 6.183866500854492,
"step": 29
},
{
"epoch": 0.0182370820668693,
"grad_norm": 4.732674598693848,
"learning_rate": 0.00029,
"loss": 5.899426460266113,
"step": 30
},
{
"epoch": 0.01884498480243161,
"grad_norm": 4.868585109710693,
"learning_rate": 0.0003,
"loss": 5.8833699226379395,
"step": 31
},
{
"epoch": 0.019452887537993922,
"grad_norm": 4.654231071472168,
"learning_rate": 0.00031,
"loss": 5.967190265655518,
"step": 32
},
{
"epoch": 0.02006079027355623,
"grad_norm": 4.583294868469238,
"learning_rate": 0.00032,
"loss": 6.027661323547363,
"step": 33
},
{
"epoch": 0.02066869300911854,
"grad_norm": 4.038606643676758,
"learning_rate": 0.00033,
"loss": 6.06468391418457,
"step": 34
},
{
"epoch": 0.02127659574468085,
"grad_norm": 3.1677229404449463,
"learning_rate": 0.00034,
"loss": 5.97524881362915,
"step": 35
},
{
"epoch": 0.02188449848024316,
"grad_norm": 4.171515941619873,
"learning_rate": 0.00035,
"loss": 5.981804370880127,
"step": 36
},
{
"epoch": 0.022492401215805473,
"grad_norm": 5.382990837097168,
"learning_rate": 0.00035999999999999997,
"loss": 6.05380916595459,
"step": 37
},
{
"epoch": 0.023100303951367782,
"grad_norm": 4.436893463134766,
"learning_rate": 0.00037,
"loss": 6.156210899353027,
"step": 38
},
{
"epoch": 0.02370820668693009,
"grad_norm": 4.104293346405029,
"learning_rate": 0.00038,
"loss": 5.963473320007324,
"step": 39
},
{
"epoch": 0.0243161094224924,
"grad_norm": 7.8225202560424805,
"learning_rate": 0.00039000000000000005,
"loss": 5.945594310760498,
"step": 40
},
{
"epoch": 0.02492401215805471,
"grad_norm": 3.7115426063537598,
"learning_rate": 0.0004,
"loss": 5.866631984710693,
"step": 41
},
{
"epoch": 0.02553191489361702,
"grad_norm": 3.377136468887329,
"learning_rate": 0.00041,
"loss": 5.87300968170166,
"step": 42
},
{
"epoch": 0.026139817629179333,
"grad_norm": 3.0676238536834717,
"learning_rate": 0.00042,
"loss": 5.819428443908691,
"step": 43
},
{
"epoch": 0.026747720364741642,
"grad_norm": 3.4088737964630127,
"learning_rate": 0.00043,
"loss": 5.686548709869385,
"step": 44
},
{
"epoch": 0.02735562310030395,
"grad_norm": 4.599688529968262,
"learning_rate": 0.00044,
"loss": 6.143298149108887,
"step": 45
},
{
"epoch": 0.02796352583586626,
"grad_norm": 3.1253559589385986,
"learning_rate": 0.00045000000000000004,
"loss": 5.965961933135986,
"step": 46
},
{
"epoch": 0.02857142857142857,
"grad_norm": 3.3107733726501465,
"learning_rate": 0.00046,
"loss": 5.744629859924316,
"step": 47
},
{
"epoch": 0.02917933130699088,
"grad_norm": 3.4835944175720215,
"learning_rate": 0.00047,
"loss": 5.963787078857422,
"step": 48
},
{
"epoch": 0.029787234042553193,
"grad_norm": 4.766516208648682,
"learning_rate": 0.00048,
"loss": 5.903127670288086,
"step": 49
},
{
"epoch": 0.030395136778115502,
"grad_norm": 3.4444823265075684,
"learning_rate": 0.00049,
"loss": 5.898875713348389,
"step": 50
},
{
"epoch": 0.03100303951367781,
"grad_norm": 3.4199633598327637,
"learning_rate": 0.0005,
"loss": 5.995363235473633,
"step": 51
},
{
"epoch": 0.031610942249240125,
"grad_norm": 4.609949111938477,
"learning_rate": 0.0005,
"loss": 5.867133140563965,
"step": 52
},
{
"epoch": 0.03221884498480243,
"grad_norm": 2.445003032684326,
"learning_rate": 0.0005,
"loss": 5.596291542053223,
"step": 53
},
{
"epoch": 0.03282674772036474,
"grad_norm": 7.065042972564697,
"learning_rate": 0.0005,
"loss": 5.764184951782227,
"step": 54
},
{
"epoch": 0.03343465045592705,
"grad_norm": 3.3624749183654785,
"learning_rate": 0.0005,
"loss": 5.835771560668945,
"step": 55
},
{
"epoch": 0.03404255319148936,
"grad_norm": 2.667015790939331,
"learning_rate": 0.0005,
"loss": 5.9446611404418945,
"step": 56
},
{
"epoch": 0.034650455927051675,
"grad_norm": 3.2562549114227295,
"learning_rate": 0.0005,
"loss": 6.190652370452881,
"step": 57
},
{
"epoch": 0.03525835866261398,
"grad_norm": 3.5651185512542725,
"learning_rate": 0.0005,
"loss": 5.877089500427246,
"step": 58
},
{
"epoch": 0.035866261398176294,
"grad_norm": 2.6607139110565186,
"learning_rate": 0.0005,
"loss": 5.947436332702637,
"step": 59
},
{
"epoch": 0.0364741641337386,
"grad_norm": 2.5586416721343994,
"learning_rate": 0.0005,
"loss": 6.041194915771484,
"step": 60
},
{
"epoch": 0.03708206686930091,
"grad_norm": 3.5156543254852295,
"learning_rate": 0.0005,
"loss": 5.8784284591674805,
"step": 61
},
{
"epoch": 0.03768996960486322,
"grad_norm": 2.013105630874634,
"learning_rate": 0.0005,
"loss": 5.705929756164551,
"step": 62
},
{
"epoch": 0.03829787234042553,
"grad_norm": 2.2044196128845215,
"learning_rate": 0.0005,
"loss": 5.775040626525879,
"step": 63
},
{
"epoch": 0.038905775075987845,
"grad_norm": 3.8432488441467285,
"learning_rate": 0.0005,
"loss": 5.757482528686523,
"step": 64
},
{
"epoch": 0.03951367781155015,
"grad_norm": 2.794318437576294,
"learning_rate": 0.0005,
"loss": 5.4956865310668945,
"step": 65
},
{
"epoch": 0.04012158054711246,
"grad_norm": 5.635376930236816,
"learning_rate": 0.0005,
"loss": 5.950571060180664,
"step": 66
},
{
"epoch": 0.04072948328267477,
"grad_norm": 2.8366096019744873,
"learning_rate": 0.0005,
"loss": 5.937989711761475,
"step": 67
},
{
"epoch": 0.04133738601823708,
"grad_norm": 4.0585455894470215,
"learning_rate": 0.0005,
"loss": 6.175616264343262,
"step": 68
},
{
"epoch": 0.041945288753799395,
"grad_norm": 2.4633665084838867,
"learning_rate": 0.0005,
"loss": 5.856078147888184,
"step": 69
},
{
"epoch": 0.0425531914893617,
"grad_norm": 2.900541305541992,
"learning_rate": 0.0005,
"loss": 5.562302112579346,
"step": 70
},
{
"epoch": 0.043161094224924014,
"grad_norm": 2.1582231521606445,
"learning_rate": 0.0005,
"loss": 5.853466033935547,
"step": 71
},
{
"epoch": 0.04376899696048632,
"grad_norm": 2.823076009750366,
"learning_rate": 0.0005,
"loss": 5.676411151885986,
"step": 72
},
{
"epoch": 0.04437689969604863,
"grad_norm": 3.4227182865142822,
"learning_rate": 0.0005,
"loss": 5.687357425689697,
"step": 73
},
{
"epoch": 0.044984802431610946,
"grad_norm": 2.4039175510406494,
"learning_rate": 0.0005,
"loss": 5.892976760864258,
"step": 74
},
{
"epoch": 0.04559270516717325,
"grad_norm": 2.6830098628997803,
"learning_rate": 0.0005,
"loss": 5.66058349609375,
"step": 75
},
{
"epoch": 0.046200607902735565,
"grad_norm": 2.413268566131592,
"learning_rate": 0.0005,
"loss": 5.7166547775268555,
"step": 76
},
{
"epoch": 0.04680851063829787,
"grad_norm": 2.110560894012451,
"learning_rate": 0.0005,
"loss": 5.578657150268555,
"step": 77
},
{
"epoch": 0.04741641337386018,
"grad_norm": 2.293944835662842,
"learning_rate": 0.0005,
"loss": 5.830209732055664,
"step": 78
},
{
"epoch": 0.04802431610942249,
"grad_norm": 2.3141164779663086,
"learning_rate": 0.0005,
"loss": 5.730184555053711,
"step": 79
},
{
"epoch": 0.0486322188449848,
"grad_norm": 2.4202141761779785,
"learning_rate": 0.0005,
"loss": 5.657958030700684,
"step": 80
},
{
"epoch": 0.049240121580547115,
"grad_norm": 2.1450300216674805,
"learning_rate": 0.0005,
"loss": 5.734421253204346,
"step": 81
},
{
"epoch": 0.04984802431610942,
"grad_norm": 2.340426206588745,
"learning_rate": 0.0005,
"loss": 5.912275314331055,
"step": 82
},
{
"epoch": 0.050455927051671734,
"grad_norm": 2.2572286128997803,
"learning_rate": 0.0005,
"loss": 6.227065086364746,
"step": 83
},
{
"epoch": 0.05106382978723404,
"grad_norm": 1.9745402336120605,
"learning_rate": 0.0005,
"loss": 5.538962364196777,
"step": 84
},
{
"epoch": 0.05167173252279635,
"grad_norm": 1.8350422382354736,
"learning_rate": 0.0005,
"loss": 5.68572998046875,
"step": 85
},
{
"epoch": 0.052279635258358666,
"grad_norm": 1.4099390506744385,
"learning_rate": 0.0005,
"loss": 5.548061370849609,
"step": 86
},
{
"epoch": 0.05288753799392097,
"grad_norm": 1.7324459552764893,
"learning_rate": 0.0005,
"loss": 5.791088104248047,
"step": 87
},
{
"epoch": 0.053495440729483285,
"grad_norm": 2.2765917778015137,
"learning_rate": 0.0005,
"loss": 5.66319465637207,
"step": 88
},
{
"epoch": 0.05410334346504559,
"grad_norm": 1.8931759595870972,
"learning_rate": 0.0005,
"loss": 5.931559085845947,
"step": 89
},
{
"epoch": 0.0547112462006079,
"grad_norm": 3.1260805130004883,
"learning_rate": 0.0005,
"loss": 5.887214183807373,
"step": 90
},
{
"epoch": 0.05531914893617021,
"grad_norm": 2.076260805130005,
"learning_rate": 0.0005,
"loss": 5.837953567504883,
"step": 91
},
{
"epoch": 0.05592705167173252,
"grad_norm": 2.6507105827331543,
"learning_rate": 0.0005,
"loss": 5.720830917358398,
"step": 92
},
{
"epoch": 0.056534954407294835,
"grad_norm": 1.761267900466919,
"learning_rate": 0.0005,
"loss": 5.8046417236328125,
"step": 93
},
{
"epoch": 0.05714285714285714,
"grad_norm": 2.158432722091675,
"learning_rate": 0.0005,
"loss": 5.530825614929199,
"step": 94
},
{
"epoch": 0.057750759878419454,
"grad_norm": 1.8743107318878174,
"learning_rate": 0.0005,
"loss": 5.851261138916016,
"step": 95
},
{
"epoch": 0.05835866261398176,
"grad_norm": 2.2951159477233887,
"learning_rate": 0.0005,
"loss": 5.754410743713379,
"step": 96
},
{
"epoch": 0.05896656534954407,
"grad_norm": 1.6710808277130127,
"learning_rate": 0.0005,
"loss": 5.511685371398926,
"step": 97
},
{
"epoch": 0.059574468085106386,
"grad_norm": 2.4671308994293213,
"learning_rate": 0.0005,
"loss": 5.762502193450928,
"step": 98
},
{
"epoch": 0.06018237082066869,
"grad_norm": 1.7344735860824585,
"learning_rate": 0.0005,
"loss": 5.726058006286621,
"step": 99
},
{
"epoch": 0.060790273556231005,
"grad_norm": 1.9786497354507446,
"learning_rate": 0.0005,
"loss": 5.570637226104736,
"step": 100
},
{
"epoch": 0.06139817629179331,
"grad_norm": 1.672898769378662,
"learning_rate": 0.0005,
"loss": 5.4022722244262695,
"step": 101
},
{
"epoch": 0.06200607902735562,
"grad_norm": 1.975422978401184,
"learning_rate": 0.0005,
"loss": 5.58085823059082,
"step": 102
},
{
"epoch": 0.06261398176291794,
"grad_norm": 1.6185539960861206,
"learning_rate": 0.0005,
"loss": 5.551645755767822,
"step": 103
},
{
"epoch": 0.06322188449848025,
"grad_norm": 1.6963152885437012,
"learning_rate": 0.0005,
"loss": 5.634788990020752,
"step": 104
},
{
"epoch": 0.06382978723404255,
"grad_norm": 1.6010147333145142,
"learning_rate": 0.0005,
"loss": 5.439291954040527,
"step": 105
},
{
"epoch": 0.06443768996960486,
"grad_norm": 1.4918285608291626,
"learning_rate": 0.0005,
"loss": 5.595495700836182,
"step": 106
},
{
"epoch": 0.06504559270516717,
"grad_norm": 1.7921746969223022,
"learning_rate": 0.0005,
"loss": 5.7882080078125,
"step": 107
},
{
"epoch": 0.06565349544072949,
"grad_norm": 1.6905741691589355,
"learning_rate": 0.0005,
"loss": 5.6724653244018555,
"step": 108
},
{
"epoch": 0.0662613981762918,
"grad_norm": 1.5293573141098022,
"learning_rate": 0.0005,
"loss": 5.407555103302002,
"step": 109
},
{
"epoch": 0.0668693009118541,
"grad_norm": 1.3903565406799316,
"learning_rate": 0.0005,
"loss": 5.763338565826416,
"step": 110
},
{
"epoch": 0.06747720364741641,
"grad_norm": 1.6731656789779663,
"learning_rate": 0.0005,
"loss": 5.656299591064453,
"step": 111
},
{
"epoch": 0.06808510638297872,
"grad_norm": 1.6174890995025635,
"learning_rate": 0.0005,
"loss": 5.728058815002441,
"step": 112
},
{
"epoch": 0.06869300911854104,
"grad_norm": 1.9111192226409912,
"learning_rate": 0.0005,
"loss": 5.569175720214844,
"step": 113
},
{
"epoch": 0.06930091185410335,
"grad_norm": 1.397756576538086,
"learning_rate": 0.0005,
"loss": 5.692349433898926,
"step": 114
},
{
"epoch": 0.06990881458966565,
"grad_norm": 1.4280520677566528,
"learning_rate": 0.0005,
"loss": 5.366017818450928,
"step": 115
},
{
"epoch": 0.07051671732522796,
"grad_norm": 2.1756176948547363,
"learning_rate": 0.0005,
"loss": 5.529537677764893,
"step": 116
},
{
"epoch": 0.07112462006079028,
"grad_norm": 1.6855345964431763,
"learning_rate": 0.0005,
"loss": 5.3663010597229,
"step": 117
},
{
"epoch": 0.07173252279635259,
"grad_norm": 1.3849018812179565,
"learning_rate": 0.0005,
"loss": 5.661293983459473,
"step": 118
},
{
"epoch": 0.07234042553191489,
"grad_norm": 1.5399678945541382,
"learning_rate": 0.0005,
"loss": 5.681015968322754,
"step": 119
},
{
"epoch": 0.0729483282674772,
"grad_norm": 1.3474847078323364,
"learning_rate": 0.0005,
"loss": 5.404428482055664,
"step": 120
},
{
"epoch": 0.07355623100303951,
"grad_norm": 1.4353671073913574,
"learning_rate": 0.0005,
"loss": 5.621041297912598,
"step": 121
},
{
"epoch": 0.07416413373860183,
"grad_norm": 1.385099172592163,
"learning_rate": 0.0005,
"loss": 5.410789489746094,
"step": 122
},
{
"epoch": 0.07477203647416414,
"grad_norm": 1.5382664203643799,
"learning_rate": 0.0005,
"loss": 5.401933670043945,
"step": 123
},
{
"epoch": 0.07537993920972644,
"grad_norm": 1.48553466796875,
"learning_rate": 0.0005,
"loss": 5.547571182250977,
"step": 124
},
{
"epoch": 0.07598784194528875,
"grad_norm": 1.3798505067825317,
"learning_rate": 0.0005,
"loss": 5.5776872634887695,
"step": 125
},
{
"epoch": 0.07659574468085106,
"grad_norm": 1.863465428352356,
"learning_rate": 0.0005,
"loss": 5.570428371429443,
"step": 126
},
{
"epoch": 0.07720364741641338,
"grad_norm": 1.7337578535079956,
"learning_rate": 0.0005,
"loss": 5.60271692276001,
"step": 127
},
{
"epoch": 0.07781155015197569,
"grad_norm": 1.7129346132278442,
"learning_rate": 0.0005,
"loss": 5.655090808868408,
"step": 128
},
{
"epoch": 0.07841945288753799,
"grad_norm": 1.8253934383392334,
"learning_rate": 0.0005,
"loss": 5.726884841918945,
"step": 129
},
{
"epoch": 0.0790273556231003,
"grad_norm": 1.493262529373169,
"learning_rate": 0.0005,
"loss": 5.307271957397461,
"step": 130
},
{
"epoch": 0.07963525835866261,
"grad_norm": 1.9851430654525757,
"learning_rate": 0.0005,
"loss": 5.40402889251709,
"step": 131
},
{
"epoch": 0.08024316109422493,
"grad_norm": 1.4382926225662231,
"learning_rate": 0.0005,
"loss": 5.55129337310791,
"step": 132
},
{
"epoch": 0.08085106382978724,
"grad_norm": 2.1384055614471436,
"learning_rate": 0.0005,
"loss": 5.42939567565918,
"step": 133
},
{
"epoch": 0.08145896656534954,
"grad_norm": 1.5483143329620361,
"learning_rate": 0.0005,
"loss": 5.495145797729492,
"step": 134
},
{
"epoch": 0.08206686930091185,
"grad_norm": 1.6180500984191895,
"learning_rate": 0.0005,
"loss": 5.596287727355957,
"step": 135
},
{
"epoch": 0.08267477203647416,
"grad_norm": 1.6833781003952026,
"learning_rate": 0.0005,
"loss": 5.704960346221924,
"step": 136
},
{
"epoch": 0.08328267477203648,
"grad_norm": 1.731799602508545,
"learning_rate": 0.0005,
"loss": 5.343502998352051,
"step": 137
},
{
"epoch": 0.08389057750759879,
"grad_norm": 1.7854918241500854,
"learning_rate": 0.0005,
"loss": 5.647939205169678,
"step": 138
},
{
"epoch": 0.08449848024316109,
"grad_norm": 1.2474077939987183,
"learning_rate": 0.0005,
"loss": 5.360551834106445,
"step": 139
},
{
"epoch": 0.0851063829787234,
"grad_norm": 5.299109935760498,
"learning_rate": 0.0005,
"loss": 5.383178234100342,
"step": 140
},
{
"epoch": 0.08571428571428572,
"grad_norm": 2.591733694076538,
"learning_rate": 0.0005,
"loss": 5.623793601989746,
"step": 141
},
{
"epoch": 0.08632218844984803,
"grad_norm": 1.5868524312973022,
"learning_rate": 0.0005,
"loss": 5.522441864013672,
"step": 142
},
{
"epoch": 0.08693009118541034,
"grad_norm": 1.752677083015442,
"learning_rate": 0.0005,
"loss": 5.5086774826049805,
"step": 143
},
{
"epoch": 0.08753799392097264,
"grad_norm": 1.5863618850708008,
"learning_rate": 0.0005,
"loss": 5.492759704589844,
"step": 144
},
{
"epoch": 0.08814589665653495,
"grad_norm": 1.4941948652267456,
"learning_rate": 0.0005,
"loss": 5.475063323974609,
"step": 145
},
{
"epoch": 0.08875379939209727,
"grad_norm": 1.5351965427398682,
"learning_rate": 0.0005,
"loss": 5.511392593383789,
"step": 146
},
{
"epoch": 0.08936170212765958,
"grad_norm": 1.5566837787628174,
"learning_rate": 0.0005,
"loss": 5.4525909423828125,
"step": 147
},
{
"epoch": 0.08996960486322189,
"grad_norm": 1.5408483743667603,
"learning_rate": 0.0005,
"loss": 5.592557430267334,
"step": 148
},
{
"epoch": 0.09057750759878419,
"grad_norm": 1.3915044069290161,
"learning_rate": 0.0005,
"loss": 5.68109130859375,
"step": 149
},
{
"epoch": 0.0911854103343465,
"grad_norm": 1.4081814289093018,
"learning_rate": 0.0005,
"loss": 5.310542106628418,
"step": 150
},
{
"epoch": 0.09179331306990882,
"grad_norm": 1.368977427482605,
"learning_rate": 0.0005,
"loss": 5.590452194213867,
"step": 151
},
{
"epoch": 0.09240121580547113,
"grad_norm": 1.7604471445083618,
"learning_rate": 0.0005,
"loss": 5.2881550788879395,
"step": 152
},
{
"epoch": 0.09300911854103343,
"grad_norm": 1.2718323469161987,
"learning_rate": 0.0005,
"loss": 5.228243827819824,
"step": 153
},
{
"epoch": 0.09361702127659574,
"grad_norm": 1.853657841682434,
"learning_rate": 0.0005,
"loss": 5.344303131103516,
"step": 154
},
{
"epoch": 0.09422492401215805,
"grad_norm": 1.2742729187011719,
"learning_rate": 0.0005,
"loss": 5.602327346801758,
"step": 155
},
{
"epoch": 0.09483282674772037,
"grad_norm": 1.3428983688354492,
"learning_rate": 0.0005,
"loss": 5.564847469329834,
"step": 156
},
{
"epoch": 0.09544072948328268,
"grad_norm": 1.307673454284668,
"learning_rate": 0.0005,
"loss": 5.5293378829956055,
"step": 157
},
{
"epoch": 0.09604863221884498,
"grad_norm": 1.2413536310195923,
"learning_rate": 0.0005,
"loss": 5.751148223876953,
"step": 158
},
{
"epoch": 0.09665653495440729,
"grad_norm": 1.5207955837249756,
"learning_rate": 0.0005,
"loss": 5.464879989624023,
"step": 159
},
{
"epoch": 0.0972644376899696,
"grad_norm": 1.2123122215270996,
"learning_rate": 0.0005,
"loss": 5.438077926635742,
"step": 160
},
{
"epoch": 0.09787234042553192,
"grad_norm": 1.420456051826477,
"learning_rate": 0.0005,
"loss": 5.586366176605225,
"step": 161
},
{
"epoch": 0.09848024316109423,
"grad_norm": 1.2411231994628906,
"learning_rate": 0.0005,
"loss": 5.465837478637695,
"step": 162
},
{
"epoch": 0.09908814589665653,
"grad_norm": 1.4124112129211426,
"learning_rate": 0.0005,
"loss": 5.58890438079834,
"step": 163
},
{
"epoch": 0.09969604863221884,
"grad_norm": 1.421832799911499,
"learning_rate": 0.0005,
"loss": 5.211925029754639,
"step": 164
},
{
"epoch": 0.10030395136778116,
"grad_norm": 1.4735937118530273,
"learning_rate": 0.0005,
"loss": 5.542084693908691,
"step": 165
},
{
"epoch": 0.10091185410334347,
"grad_norm": 1.2726881504058838,
"learning_rate": 0.0005,
"loss": 5.566733360290527,
"step": 166
},
{
"epoch": 0.10151975683890578,
"grad_norm": 1.3275830745697021,
"learning_rate": 0.0005,
"loss": 5.730228424072266,
"step": 167
},
{
"epoch": 0.10212765957446808,
"grad_norm": 1.6597068309783936,
"learning_rate": 0.0005,
"loss": 5.339101791381836,
"step": 168
},
{
"epoch": 0.10273556231003039,
"grad_norm": 1.46490478515625,
"learning_rate": 0.0005,
"loss": 5.410638809204102,
"step": 169
},
{
"epoch": 0.1033434650455927,
"grad_norm": 1.3094699382781982,
"learning_rate": 0.0005,
"loss": 5.219968318939209,
"step": 170
},
{
"epoch": 0.10395136778115502,
"grad_norm": 1.4983205795288086,
"learning_rate": 0.0005,
"loss": 5.392378330230713,
"step": 171
},
{
"epoch": 0.10455927051671733,
"grad_norm": 1.517512559890747,
"learning_rate": 0.0005,
"loss": 5.38358736038208,
"step": 172
},
{
"epoch": 0.10516717325227963,
"grad_norm": 1.5345962047576904,
"learning_rate": 0.0005,
"loss": 5.368213653564453,
"step": 173
},
{
"epoch": 0.10577507598784194,
"grad_norm": 1.1318706274032593,
"learning_rate": 0.0005,
"loss": 5.639193534851074,
"step": 174
},
{
"epoch": 0.10638297872340426,
"grad_norm": 1.3089977502822876,
"learning_rate": 0.0005,
"loss": 5.508517265319824,
"step": 175
},
{
"epoch": 0.10699088145896657,
"grad_norm": 1.16405189037323,
"learning_rate": 0.0005,
"loss": 5.238767623901367,
"step": 176
},
{
"epoch": 0.10759878419452888,
"grad_norm": 1.318361759185791,
"learning_rate": 0.0005,
"loss": 5.591005325317383,
"step": 177
},
{
"epoch": 0.10820668693009118,
"grad_norm": 1.7068839073181152,
"learning_rate": 0.0005,
"loss": 5.138769149780273,
"step": 178
},
{
"epoch": 0.1088145896656535,
"grad_norm": 1.4426335096359253,
"learning_rate": 0.0005,
"loss": 5.406965255737305,
"step": 179
},
{
"epoch": 0.1094224924012158,
"grad_norm": 1.3298251628875732,
"learning_rate": 0.0005,
"loss": 5.486334323883057,
"step": 180
},
{
"epoch": 0.11003039513677812,
"grad_norm": 1.2703888416290283,
"learning_rate": 0.0005,
"loss": 5.543169021606445,
"step": 181
},
{
"epoch": 0.11063829787234042,
"grad_norm": 1.0853707790374756,
"learning_rate": 0.0005,
"loss": 5.2396135330200195,
"step": 182
},
{
"epoch": 0.11124620060790273,
"grad_norm": 1.283922553062439,
"learning_rate": 0.0005,
"loss": 5.168734550476074,
"step": 183
},
{
"epoch": 0.11185410334346504,
"grad_norm": 1.4008558988571167,
"learning_rate": 0.0005,
"loss": 5.464504241943359,
"step": 184
},
{
"epoch": 0.11246200607902736,
"grad_norm": 1.6104100942611694,
"learning_rate": 0.0005,
"loss": 5.350894927978516,
"step": 185
},
{
"epoch": 0.11306990881458967,
"grad_norm": 1.1095637083053589,
"learning_rate": 0.0005,
"loss": 5.330683708190918,
"step": 186
},
{
"epoch": 0.11367781155015197,
"grad_norm": 1.3298522233963013,
"learning_rate": 0.0005,
"loss": 5.376528739929199,
"step": 187
},
{
"epoch": 0.11428571428571428,
"grad_norm": 1.4511582851409912,
"learning_rate": 0.0005,
"loss": 5.49576473236084,
"step": 188
},
{
"epoch": 0.1148936170212766,
"grad_norm": 1.4968204498291016,
"learning_rate": 0.0005,
"loss": 5.232635021209717,
"step": 189
},
{
"epoch": 0.11550151975683891,
"grad_norm": 1.2423769235610962,
"learning_rate": 0.0005,
"loss": 5.456453323364258,
"step": 190
},
{
"epoch": 0.11610942249240122,
"grad_norm": 1.2642461061477661,
"learning_rate": 0.0005,
"loss": 5.673423767089844,
"step": 191
},
{
"epoch": 0.11671732522796352,
"grad_norm": 1.6604862213134766,
"learning_rate": 0.0005,
"loss": 5.230939865112305,
"step": 192
},
{
"epoch": 0.11732522796352583,
"grad_norm": 1.4601672887802124,
"learning_rate": 0.0005,
"loss": 5.308025360107422,
"step": 193
},
{
"epoch": 0.11793313069908815,
"grad_norm": 1.66468346118927,
"learning_rate": 0.0005,
"loss": 5.50089168548584,
"step": 194
},
{
"epoch": 0.11854103343465046,
"grad_norm": 1.4034700393676758,
"learning_rate": 0.0005,
"loss": 5.4229583740234375,
"step": 195
},
{
"epoch": 0.11914893617021277,
"grad_norm": 1.3911566734313965,
"learning_rate": 0.0005,
"loss": 5.266064643859863,
"step": 196
},
{
"epoch": 0.11975683890577507,
"grad_norm": 1.5582391023635864,
"learning_rate": 0.0005,
"loss": 5.215412616729736,
"step": 197
},
{
"epoch": 0.12036474164133738,
"grad_norm": 1.4908430576324463,
"learning_rate": 0.0005,
"loss": 5.305833339691162,
"step": 198
},
{
"epoch": 0.1209726443768997,
"grad_norm": 1.4207631349563599,
"learning_rate": 0.0005,
"loss": 5.2746734619140625,
"step": 199
},
{
"epoch": 0.12158054711246201,
"grad_norm": 1.5322375297546387,
"learning_rate": 0.0005,
"loss": 5.160092353820801,
"step": 200
},
{
"epoch": 0.12218844984802432,
"grad_norm": 1.538822889328003,
"learning_rate": 0.0005,
"loss": 5.2349467277526855,
"step": 201
},
{
"epoch": 0.12279635258358662,
"grad_norm": 1.487720251083374,
"learning_rate": 0.0005,
"loss": 5.305604934692383,
"step": 202
},
{
"epoch": 0.12340425531914893,
"grad_norm": 1.402201771736145,
"learning_rate": 0.0005,
"loss": 5.271785736083984,
"step": 203
},
{
"epoch": 0.12401215805471125,
"grad_norm": 1.4523091316223145,
"learning_rate": 0.0005,
"loss": 5.260416030883789,
"step": 204
},
{
"epoch": 0.12462006079027356,
"grad_norm": 1.3056803941726685,
"learning_rate": 0.0005,
"loss": 5.221076488494873,
"step": 205
},
{
"epoch": 0.12522796352583587,
"grad_norm": 1.4249091148376465,
"learning_rate": 0.0005,
"loss": 5.13364839553833,
"step": 206
},
{
"epoch": 0.12583586626139817,
"grad_norm": 1.417321801185608,
"learning_rate": 0.0005,
"loss": 5.294346332550049,
"step": 207
},
{
"epoch": 0.1264437689969605,
"grad_norm": 1.3512288331985474,
"learning_rate": 0.0005,
"loss": 5.273685455322266,
"step": 208
},
{
"epoch": 0.1270516717325228,
"grad_norm": 1.53708016872406,
"learning_rate": 0.0005,
"loss": 5.160931587219238,
"step": 209
},
{
"epoch": 0.1276595744680851,
"grad_norm": 1.3125845193862915,
"learning_rate": 0.0005,
"loss": 5.472460746765137,
"step": 210
},
{
"epoch": 0.12826747720364742,
"grad_norm": 1.6518676280975342,
"learning_rate": 0.0005,
"loss": 5.4825568199157715,
"step": 211
},
{
"epoch": 0.12887537993920972,
"grad_norm": 1.203003168106079,
"learning_rate": 0.0005,
"loss": 5.11652946472168,
"step": 212
},
{
"epoch": 0.12948328267477205,
"grad_norm": 1.3805352449417114,
"learning_rate": 0.0005,
"loss": 5.366741180419922,
"step": 213
},
{
"epoch": 0.13009118541033435,
"grad_norm": 1.8709197044372559,
"learning_rate": 0.0005,
"loss": 5.435246467590332,
"step": 214
},
{
"epoch": 0.13069908814589665,
"grad_norm": 1.7283586263656616,
"learning_rate": 0.0005,
"loss": 5.202251434326172,
"step": 215
},
{
"epoch": 0.13130699088145897,
"grad_norm": 1.2809170484542847,
"learning_rate": 0.0005,
"loss": 5.283895492553711,
"step": 216
},
{
"epoch": 0.13191489361702127,
"grad_norm": 1.249645709991455,
"learning_rate": 0.0005,
"loss": 5.123793601989746,
"step": 217
},
{
"epoch": 0.1325227963525836,
"grad_norm": 1.3356451988220215,
"learning_rate": 0.0005,
"loss": 5.174809455871582,
"step": 218
},
{
"epoch": 0.1331306990881459,
"grad_norm": 1.139381766319275,
"learning_rate": 0.0005,
"loss": 5.0811967849731445,
"step": 219
},
{
"epoch": 0.1337386018237082,
"grad_norm": 1.2006030082702637,
"learning_rate": 0.0005,
"loss": 5.268994331359863,
"step": 220
},
{
"epoch": 0.13434650455927052,
"grad_norm": 1.2994015216827393,
"learning_rate": 0.0005,
"loss": 5.426079750061035,
"step": 221
},
{
"epoch": 0.13495440729483282,
"grad_norm": 1.0793324708938599,
"learning_rate": 0.0005,
"loss": 5.424633979797363,
"step": 222
},
{
"epoch": 0.13556231003039515,
"grad_norm": 1.1271226406097412,
"learning_rate": 0.0005,
"loss": 5.310846328735352,
"step": 223
},
{
"epoch": 0.13617021276595745,
"grad_norm": 1.1775165796279907,
"learning_rate": 0.0005,
"loss": 5.071159839630127,
"step": 224
},
{
"epoch": 0.13677811550151975,
"grad_norm": 1.1077218055725098,
"learning_rate": 0.0005,
"loss": 5.208876609802246,
"step": 225
},
{
"epoch": 0.13738601823708207,
"grad_norm": 1.3281017541885376,
"learning_rate": 0.0005,
"loss": 5.371927261352539,
"step": 226
},
{
"epoch": 0.13799392097264437,
"grad_norm": 1.4999650716781616,
"learning_rate": 0.0005,
"loss": 5.17914342880249,
"step": 227
},
{
"epoch": 0.1386018237082067,
"grad_norm": 1.2213531732559204,
"learning_rate": 0.0005,
"loss": 5.079235076904297,
"step": 228
},
{
"epoch": 0.139209726443769,
"grad_norm": 1.409624695777893,
"learning_rate": 0.0005,
"loss": 5.218929767608643,
"step": 229
},
{
"epoch": 0.1398176291793313,
"grad_norm": 1.2914072275161743,
"learning_rate": 0.0005,
"loss": 5.254355430603027,
"step": 230
},
{
"epoch": 0.14042553191489363,
"grad_norm": 1.27825927734375,
"learning_rate": 0.0005,
"loss": 5.02869987487793,
"step": 231
},
{
"epoch": 0.14103343465045592,
"grad_norm": 1.367679238319397,
"learning_rate": 0.0005,
"loss": 5.032447814941406,
"step": 232
},
{
"epoch": 0.14164133738601822,
"grad_norm": 1.1813191175460815,
"learning_rate": 0.0005,
"loss": 5.181385040283203,
"step": 233
},
{
"epoch": 0.14224924012158055,
"grad_norm": 1.385109305381775,
"learning_rate": 0.0005,
"loss": 5.294610977172852,
"step": 234
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.2544500827789307,
"learning_rate": 0.0005,
"loss": 5.046303749084473,
"step": 235
},
{
"epoch": 0.14346504559270518,
"grad_norm": 1.487121820449829,
"learning_rate": 0.0005,
"loss": 5.523983001708984,
"step": 236
},
{
"epoch": 0.14407294832826747,
"grad_norm": 1.263445258140564,
"learning_rate": 0.0005,
"loss": 5.192383289337158,
"step": 237
},
{
"epoch": 0.14468085106382977,
"grad_norm": 1.0454970598220825,
"learning_rate": 0.0005,
"loss": 5.0029120445251465,
"step": 238
},
{
"epoch": 0.1452887537993921,
"grad_norm": 1.131041407585144,
"learning_rate": 0.0005,
"loss": 5.140591144561768,
"step": 239
},
{
"epoch": 0.1458966565349544,
"grad_norm": 1.3271952867507935,
"learning_rate": 0.0005,
"loss": 5.232538223266602,
"step": 240
},
{
"epoch": 0.14650455927051673,
"grad_norm": 1.2867931127548218,
"learning_rate": 0.0005,
"loss": 5.288295745849609,
"step": 241
},
{
"epoch": 0.14711246200607903,
"grad_norm": 1.2857162952423096,
"learning_rate": 0.0005,
"loss": 4.999725341796875,
"step": 242
},
{
"epoch": 0.14772036474164132,
"grad_norm": 1.308387279510498,
"learning_rate": 0.0005,
"loss": 5.332901477813721,
"step": 243
},
{
"epoch": 0.14832826747720365,
"grad_norm": 1.431774377822876,
"learning_rate": 0.0005,
"loss": 5.33701753616333,
"step": 244
},
{
"epoch": 0.14893617021276595,
"grad_norm": 1.2257990837097168,
"learning_rate": 0.0005,
"loss": 5.286837100982666,
"step": 245
},
{
"epoch": 0.14954407294832828,
"grad_norm": 1.2497832775115967,
"learning_rate": 0.0005,
"loss": 5.060267448425293,
"step": 246
},
{
"epoch": 0.15015197568389058,
"grad_norm": 1.3174192905426025,
"learning_rate": 0.0005,
"loss": 5.460453987121582,
"step": 247
},
{
"epoch": 0.15075987841945288,
"grad_norm": 1.2937954664230347,
"learning_rate": 0.0005,
"loss": 5.300616264343262,
"step": 248
},
{
"epoch": 0.1513677811550152,
"grad_norm": 1.1722848415374756,
"learning_rate": 0.0005,
"loss": 5.289948463439941,
"step": 249
},
{
"epoch": 0.1519756838905775,
"grad_norm": 1.365752100944519,
"learning_rate": 0.0005,
"loss": 5.077818870544434,
"step": 250
},
{
"epoch": 0.15258358662613983,
"grad_norm": 1.2099617719650269,
"learning_rate": 0.0005,
"loss": 5.033614635467529,
"step": 251
},
{
"epoch": 0.15319148936170213,
"grad_norm": 1.3854937553405762,
"learning_rate": 0.0005,
"loss": 5.019617080688477,
"step": 252
},
{
"epoch": 0.15379939209726443,
"grad_norm": 1.3792158365249634,
"learning_rate": 0.0005,
"loss": 5.079125881195068,
"step": 253
},
{
"epoch": 0.15440729483282675,
"grad_norm": 1.1149134635925293,
"learning_rate": 0.0005,
"loss": 5.06775426864624,
"step": 254
},
{
"epoch": 0.15501519756838905,
"grad_norm": 1.4162288904190063,
"learning_rate": 0.0005,
"loss": 5.29591178894043,
"step": 255
},
{
"epoch": 0.15562310030395138,
"grad_norm": 1.298060417175293,
"learning_rate": 0.0005,
"loss": 5.090610504150391,
"step": 256
},
{
"epoch": 0.15623100303951368,
"grad_norm": 1.1845481395721436,
"learning_rate": 0.0005,
"loss": 5.00084114074707,
"step": 257
},
{
"epoch": 0.15683890577507598,
"grad_norm": 1.1649361848831177,
"learning_rate": 0.0005,
"loss": 5.0191168785095215,
"step": 258
},
{
"epoch": 0.1574468085106383,
"grad_norm": 1.1649863719940186,
"learning_rate": 0.0005,
"loss": 4.924384117126465,
"step": 259
},
{
"epoch": 0.1580547112462006,
"grad_norm": 1.305981159210205,
"learning_rate": 0.0005,
"loss": 5.208071708679199,
"step": 260
},
{
"epoch": 0.15866261398176293,
"grad_norm": 1.1375975608825684,
"learning_rate": 0.0005,
"loss": 5.07304048538208,
"step": 261
},
{
"epoch": 0.15927051671732523,
"grad_norm": 1.570008635520935,
"learning_rate": 0.0005,
"loss": 5.2816667556762695,
"step": 262
},
{
"epoch": 0.15987841945288753,
"grad_norm": 1.168481469154358,
"learning_rate": 0.0005,
"loss": 5.156436920166016,
"step": 263
},
{
"epoch": 0.16048632218844985,
"grad_norm": 1.17093026638031,
"learning_rate": 0.0005,
"loss": 5.264464378356934,
"step": 264
},
{
"epoch": 0.16109422492401215,
"grad_norm": 1.1767195463180542,
"learning_rate": 0.0005,
"loss": 5.278616905212402,
"step": 265
},
{
"epoch": 0.16170212765957448,
"grad_norm": 1.2456096410751343,
"learning_rate": 0.0005,
"loss": 5.296989440917969,
"step": 266
},
{
"epoch": 0.16231003039513678,
"grad_norm": 1.260128140449524,
"learning_rate": 0.0005,
"loss": 5.161136150360107,
"step": 267
},
{
"epoch": 0.16291793313069908,
"grad_norm": 1.3702967166900635,
"learning_rate": 0.0005,
"loss": 5.2522077560424805,
"step": 268
},
{
"epoch": 0.1635258358662614,
"grad_norm": 1.1898664236068726,
"learning_rate": 0.0005,
"loss": 5.138284683227539,
"step": 269
},
{
"epoch": 0.1641337386018237,
"grad_norm": 1.586888074874878,
"learning_rate": 0.0005,
"loss": 4.960643291473389,
"step": 270
},
{
"epoch": 0.16474164133738603,
"grad_norm": 1.2508625984191895,
"learning_rate": 0.0005,
"loss": 5.2589569091796875,
"step": 271
},
{
"epoch": 0.16534954407294833,
"grad_norm": 1.1662089824676514,
"learning_rate": 0.0005,
"loss": 5.264585494995117,
"step": 272
},
{
"epoch": 0.16595744680851063,
"grad_norm": 1.2917591333389282,
"learning_rate": 0.0005,
"loss": 4.975507736206055,
"step": 273
},
{
"epoch": 0.16656534954407295,
"grad_norm": 1.0556538105010986,
"learning_rate": 0.0005,
"loss": 5.047136306762695,
"step": 274
},
{
"epoch": 0.16717325227963525,
"grad_norm": 1.0959351062774658,
"learning_rate": 0.0005,
"loss": 5.063904762268066,
"step": 275
},
{
"epoch": 0.16778115501519758,
"grad_norm": 1.0194965600967407,
"learning_rate": 0.0005,
"loss": 5.230169296264648,
"step": 276
},
{
"epoch": 0.16838905775075988,
"grad_norm": 1.326802372932434,
"learning_rate": 0.0005,
"loss": 5.127433776855469,
"step": 277
},
{
"epoch": 0.16899696048632218,
"grad_norm": 1.17707097530365,
"learning_rate": 0.0005,
"loss": 5.209277153015137,
"step": 278
},
{
"epoch": 0.1696048632218845,
"grad_norm": 0.9115813970565796,
"learning_rate": 0.0005,
"loss": 5.025136470794678,
"step": 279
},
{
"epoch": 0.1702127659574468,
"grad_norm": 1.1245434284210205,
"learning_rate": 0.0005,
"loss": 5.057619094848633,
"step": 280
},
{
"epoch": 0.17082066869300913,
"grad_norm": 1.3757452964782715,
"learning_rate": 0.0005,
"loss": 4.920927047729492,
"step": 281
},
{
"epoch": 0.17142857142857143,
"grad_norm": 1.4696053266525269,
"learning_rate": 0.0005,
"loss": 5.1536760330200195,
"step": 282
},
{
"epoch": 0.17203647416413373,
"grad_norm": 1.2874000072479248,
"learning_rate": 0.0005,
"loss": 5.050880432128906,
"step": 283
},
{
"epoch": 0.17264437689969606,
"grad_norm": 1.2090721130371094,
"learning_rate": 0.0005,
"loss": 5.024714469909668,
"step": 284
},
{
"epoch": 0.17325227963525835,
"grad_norm": 1.3489820957183838,
"learning_rate": 0.0005,
"loss": 5.124329090118408,
"step": 285
},
{
"epoch": 0.17386018237082068,
"grad_norm": 1.055483102798462,
"learning_rate": 0.0005,
"loss": 4.890225887298584,
"step": 286
},
{
"epoch": 0.17446808510638298,
"grad_norm": 1.2479093074798584,
"learning_rate": 0.0005,
"loss": 4.835631370544434,
"step": 287
},
{
"epoch": 0.17507598784194528,
"grad_norm": 1.1899778842926025,
"learning_rate": 0.0005,
"loss": 5.027457237243652,
"step": 288
},
{
"epoch": 0.1756838905775076,
"grad_norm": 1.1618897914886475,
"learning_rate": 0.0005,
"loss": 5.145232677459717,
"step": 289
},
{
"epoch": 0.1762917933130699,
"grad_norm": 1.2332507371902466,
"learning_rate": 0.0005,
"loss": 5.138116359710693,
"step": 290
},
{
"epoch": 0.17689969604863223,
"grad_norm": 1.1276404857635498,
"learning_rate": 0.0005,
"loss": 5.094466209411621,
"step": 291
},
{
"epoch": 0.17750759878419453,
"grad_norm": 1.4890656471252441,
"learning_rate": 0.0005,
"loss": 4.797001838684082,
"step": 292
},
{
"epoch": 0.17811550151975683,
"grad_norm": 1.0490905046463013,
"learning_rate": 0.0005,
"loss": 5.235766410827637,
"step": 293
},
{
"epoch": 0.17872340425531916,
"grad_norm": 1.1675019264221191,
"learning_rate": 0.0005,
"loss": 4.964472770690918,
"step": 294
},
{
"epoch": 0.17933130699088146,
"grad_norm": 0.9588620662689209,
"learning_rate": 0.0005,
"loss": 5.124715805053711,
"step": 295
},
{
"epoch": 0.17993920972644378,
"grad_norm": 1.3892091512680054,
"learning_rate": 0.0005,
"loss": 4.847377300262451,
"step": 296
},
{
"epoch": 0.18054711246200608,
"grad_norm": 1.1051721572875977,
"learning_rate": 0.0005,
"loss": 5.199601173400879,
"step": 297
},
{
"epoch": 0.18115501519756838,
"grad_norm": 1.0869505405426025,
"learning_rate": 0.0005,
"loss": 5.3870697021484375,
"step": 298
},
{
"epoch": 0.1817629179331307,
"grad_norm": 1.111187219619751,
"learning_rate": 0.0005,
"loss": 5.190181732177734,
"step": 299
},
{
"epoch": 0.182370820668693,
"grad_norm": 1.2440016269683838,
"learning_rate": 0.0005,
"loss": 5.041322231292725,
"step": 300
},
{
"epoch": 0.1829787234042553,
"grad_norm": 1.2418692111968994,
"learning_rate": 0.0005,
"loss": 5.212306022644043,
"step": 301
},
{
"epoch": 0.18358662613981763,
"grad_norm": 1.2612659931182861,
"learning_rate": 0.0005,
"loss": 4.961835861206055,
"step": 302
},
{
"epoch": 0.18419452887537993,
"grad_norm": 1.1162973642349243,
"learning_rate": 0.0005,
"loss": 4.950830936431885,
"step": 303
},
{
"epoch": 0.18480243161094226,
"grad_norm": 1.144067406654358,
"learning_rate": 0.0005,
"loss": 4.8998637199401855,
"step": 304
},
{
"epoch": 0.18541033434650456,
"grad_norm": 1.2814747095108032,
"learning_rate": 0.0005,
"loss": 5.224381446838379,
"step": 305
},
{
"epoch": 0.18601823708206686,
"grad_norm": 1.3770310878753662,
"learning_rate": 0.0005,
"loss": 5.05579137802124,
"step": 306
},
{
"epoch": 0.18662613981762918,
"grad_norm": 1.5116229057312012,
"learning_rate": 0.0005,
"loss": 5.082482814788818,
"step": 307
},
{
"epoch": 0.18723404255319148,
"grad_norm": 1.0909713506698608,
"learning_rate": 0.0005,
"loss": 4.967124938964844,
"step": 308
},
{
"epoch": 0.1878419452887538,
"grad_norm": 1.1027607917785645,
"learning_rate": 0.0005,
"loss": 5.00374698638916,
"step": 309
},
{
"epoch": 0.1884498480243161,
"grad_norm": 1.238652229309082,
"learning_rate": 0.0005,
"loss": 4.993183135986328,
"step": 310
},
{
"epoch": 0.1890577507598784,
"grad_norm": 1.0609782934188843,
"learning_rate": 0.0005,
"loss": 5.019218444824219,
"step": 311
},
{
"epoch": 0.18966565349544073,
"grad_norm": 1.1945058107376099,
"learning_rate": 0.0005,
"loss": 5.068751335144043,
"step": 312
},
{
"epoch": 0.19027355623100303,
"grad_norm": 1.2640782594680786,
"learning_rate": 0.0005,
"loss": 5.185402870178223,
"step": 313
},
{
"epoch": 0.19088145896656536,
"grad_norm": 1.0532907247543335,
"learning_rate": 0.0005,
"loss": 5.222114562988281,
"step": 314
},
{
"epoch": 0.19148936170212766,
"grad_norm": 1.0423952341079712,
"learning_rate": 0.0005,
"loss": 5.1693806648254395,
"step": 315
},
{
"epoch": 0.19209726443768996,
"grad_norm": 1.0700887441635132,
"learning_rate": 0.0005,
"loss": 5.0217485427856445,
"step": 316
},
{
"epoch": 0.19270516717325228,
"grad_norm": 1.2595866918563843,
"learning_rate": 0.0005,
"loss": 5.231429576873779,
"step": 317
},
{
"epoch": 0.19331306990881458,
"grad_norm": 1.1495158672332764,
"learning_rate": 0.0005,
"loss": 5.015372276306152,
"step": 318
},
{
"epoch": 0.1939209726443769,
"grad_norm": 1.3977763652801514,
"learning_rate": 0.0005,
"loss": 5.323009490966797,
"step": 319
},
{
"epoch": 0.1945288753799392,
"grad_norm": 1.4009697437286377,
"learning_rate": 0.0005,
"loss": 5.2833638191223145,
"step": 320
},
{
"epoch": 0.1951367781155015,
"grad_norm": 1.1618447303771973,
"learning_rate": 0.0005,
"loss": 5.064535140991211,
"step": 321
},
{
"epoch": 0.19574468085106383,
"grad_norm": 1.1447522640228271,
"learning_rate": 0.0005,
"loss": 4.99235725402832,
"step": 322
},
{
"epoch": 0.19635258358662613,
"grad_norm": 1.2342157363891602,
"learning_rate": 0.0005,
"loss": 5.036558151245117,
"step": 323
},
{
"epoch": 0.19696048632218846,
"grad_norm": 1.2487186193466187,
"learning_rate": 0.0005,
"loss": 5.207220077514648,
"step": 324
},
{
"epoch": 0.19756838905775076,
"grad_norm": 1.4693067073822021,
"learning_rate": 0.0005,
"loss": 5.096504211425781,
"step": 325
},
{
"epoch": 0.19817629179331306,
"grad_norm": 1.1707696914672852,
"learning_rate": 0.0005,
"loss": 5.003598213195801,
"step": 326
},
{
"epoch": 0.19878419452887539,
"grad_norm": 0.9728778600692749,
"learning_rate": 0.0005,
"loss": 4.8744659423828125,
"step": 327
},
{
"epoch": 0.19939209726443768,
"grad_norm": 1.383410096168518,
"learning_rate": 0.0005,
"loss": 5.1511383056640625,
"step": 328
},
{
"epoch": 0.2,
"grad_norm": 1.0482876300811768,
"learning_rate": 0.0005,
"loss": 5.014847755432129,
"step": 329
},
{
"epoch": 0.2006079027355623,
"grad_norm": 1.2320209741592407,
"learning_rate": 0.0005,
"loss": 4.923969745635986,
"step": 330
},
{
"epoch": 0.2012158054711246,
"grad_norm": 2.013617753982544,
"learning_rate": 0.0005,
"loss": 4.876163482666016,
"step": 331
},
{
"epoch": 0.20182370820668694,
"grad_norm": 1.4123047590255737,
"learning_rate": 0.0005,
"loss": 4.870320796966553,
"step": 332
},
{
"epoch": 0.20243161094224923,
"grad_norm": 0.9998598694801331,
"learning_rate": 0.0005,
"loss": 4.8142805099487305,
"step": 333
},
{
"epoch": 0.20303951367781156,
"grad_norm": 1.255579948425293,
"learning_rate": 0.0005,
"loss": 5.134385108947754,
"step": 334
},
{
"epoch": 0.20364741641337386,
"grad_norm": 1.1863816976547241,
"learning_rate": 0.0005,
"loss": 4.943517208099365,
"step": 335
},
{
"epoch": 0.20425531914893616,
"grad_norm": 1.3125497102737427,
"learning_rate": 0.0005,
"loss": 4.835733413696289,
"step": 336
},
{
"epoch": 0.2048632218844985,
"grad_norm": 1.330944538116455,
"learning_rate": 0.0005,
"loss": 4.996496200561523,
"step": 337
},
{
"epoch": 0.20547112462006079,
"grad_norm": 1.4103339910507202,
"learning_rate": 0.0005,
"loss": 5.215001106262207,
"step": 338
},
{
"epoch": 0.2060790273556231,
"grad_norm": 1.1276763677597046,
"learning_rate": 0.0005,
"loss": 5.080985069274902,
"step": 339
},
{
"epoch": 0.2066869300911854,
"grad_norm": 1.2522611618041992,
"learning_rate": 0.0005,
"loss": 5.1337480545043945,
"step": 340
},
{
"epoch": 0.2072948328267477,
"grad_norm": 1.0622775554656982,
"learning_rate": 0.0005,
"loss": 5.139281272888184,
"step": 341
},
{
"epoch": 0.20790273556231004,
"grad_norm": 1.2667897939682007,
"learning_rate": 0.0005,
"loss": 4.985269546508789,
"step": 342
},
{
"epoch": 0.20851063829787234,
"grad_norm": 1.2665342092514038,
"learning_rate": 0.0005,
"loss": 4.907642841339111,
"step": 343
},
{
"epoch": 0.20911854103343466,
"grad_norm": 1.2670104503631592,
"learning_rate": 0.0005,
"loss": 4.9238739013671875,
"step": 344
},
{
"epoch": 0.20972644376899696,
"grad_norm": 1.3876585960388184,
"learning_rate": 0.0005,
"loss": 5.280843734741211,
"step": 345
},
{
"epoch": 0.21033434650455926,
"grad_norm": 1.172425389289856,
"learning_rate": 0.0005,
"loss": 5.018771171569824,
"step": 346
},
{
"epoch": 0.2109422492401216,
"grad_norm": 1.057332158088684,
"learning_rate": 0.0005,
"loss": 4.957630157470703,
"step": 347
},
{
"epoch": 0.2115501519756839,
"grad_norm": 1.2106921672821045,
"learning_rate": 0.0005,
"loss": 5.079224109649658,
"step": 348
},
{
"epoch": 0.2121580547112462,
"grad_norm": 1.2184040546417236,
"learning_rate": 0.0005,
"loss": 4.923876762390137,
"step": 349
},
{
"epoch": 0.2127659574468085,
"grad_norm": 1.3889566659927368,
"learning_rate": 0.0005,
"loss": 5.0445098876953125,
"step": 350
},
{
"epoch": 0.2133738601823708,
"grad_norm": 1.1836071014404297,
"learning_rate": 0.0005,
"loss": 4.762534141540527,
"step": 351
},
{
"epoch": 0.21398176291793314,
"grad_norm": 1.2222967147827148,
"learning_rate": 0.0005,
"loss": 5.045120716094971,
"step": 352
},
{
"epoch": 0.21458966565349544,
"grad_norm": 1.203317403793335,
"learning_rate": 0.0005,
"loss": 5.027883052825928,
"step": 353
},
{
"epoch": 0.21519756838905776,
"grad_norm": 1.118275761604309,
"learning_rate": 0.0005,
"loss": 5.153387069702148,
"step": 354
},
{
"epoch": 0.21580547112462006,
"grad_norm": 1.1502918004989624,
"learning_rate": 0.0005,
"loss": 4.907447814941406,
"step": 355
},
{
"epoch": 0.21641337386018236,
"grad_norm": 0.916477620601654,
"learning_rate": 0.0005,
"loss": 4.913633346557617,
"step": 356
},
{
"epoch": 0.2170212765957447,
"grad_norm": 0.9976673722267151,
"learning_rate": 0.0005,
"loss": 4.855230331420898,
"step": 357
},
{
"epoch": 0.217629179331307,
"grad_norm": 1.2301874160766602,
"learning_rate": 0.0005,
"loss": 5.274983882904053,
"step": 358
},
{
"epoch": 0.21823708206686931,
"grad_norm": 1.268349051475525,
"learning_rate": 0.0005,
"loss": 4.990891933441162,
"step": 359
},
{
"epoch": 0.2188449848024316,
"grad_norm": 1.7098944187164307,
"learning_rate": 0.0005,
"loss": 5.0019989013671875,
"step": 360
},
{
"epoch": 0.2194528875379939,
"grad_norm": 1.3171290159225464,
"learning_rate": 0.0005,
"loss": 5.091225624084473,
"step": 361
},
{
"epoch": 0.22006079027355624,
"grad_norm": 1.1964459419250488,
"learning_rate": 0.0005,
"loss": 4.942023754119873,
"step": 362
},
{
"epoch": 0.22066869300911854,
"grad_norm": 1.212193250656128,
"learning_rate": 0.0005,
"loss": 4.842243194580078,
"step": 363
},
{
"epoch": 0.22127659574468084,
"grad_norm": 1.2447597980499268,
"learning_rate": 0.0005,
"loss": 4.891105651855469,
"step": 364
},
{
"epoch": 0.22188449848024316,
"grad_norm": 1.0322506427764893,
"learning_rate": 0.0005,
"loss": 5.083103179931641,
"step": 365
},
{
"epoch": 0.22249240121580546,
"grad_norm": 1.1431292295455933,
"learning_rate": 0.0005,
"loss": 5.104142189025879,
"step": 366
},
{
"epoch": 0.2231003039513678,
"grad_norm": 1.1028327941894531,
"learning_rate": 0.0005,
"loss": 4.933050632476807,
"step": 367
},
{
"epoch": 0.2237082066869301,
"grad_norm": 0.9712069630622864,
"learning_rate": 0.0005,
"loss": 4.821019172668457,
"step": 368
},
{
"epoch": 0.2243161094224924,
"grad_norm": 1.063249111175537,
"learning_rate": 0.0005,
"loss": 4.972682476043701,
"step": 369
},
{
"epoch": 0.22492401215805471,
"grad_norm": 1.1715357303619385,
"learning_rate": 0.0005,
"loss": 5.0836591720581055,
"step": 370
},
{
"epoch": 0.225531914893617,
"grad_norm": 1.128483772277832,
"learning_rate": 0.0005,
"loss": 5.094054698944092,
"step": 371
},
{
"epoch": 0.22613981762917934,
"grad_norm": 1.2616199254989624,
"learning_rate": 0.0005,
"loss": 4.991359710693359,
"step": 372
},
{
"epoch": 0.22674772036474164,
"grad_norm": 1.2140382528305054,
"learning_rate": 0.0005,
"loss": 4.7401838302612305,
"step": 373
},
{
"epoch": 0.22735562310030394,
"grad_norm": 1.1435750722885132,
"learning_rate": 0.0005,
"loss": 5.093307971954346,
"step": 374
},
{
"epoch": 0.22796352583586627,
"grad_norm": 1.0213854312896729,
"learning_rate": 0.0005,
"loss": 4.898110389709473,
"step": 375
},
{
"epoch": 0.22857142857142856,
"grad_norm": 1.6159358024597168,
"learning_rate": 0.0005,
"loss": 4.884780406951904,
"step": 376
},
{
"epoch": 0.2291793313069909,
"grad_norm": 1.0451385974884033,
"learning_rate": 0.0005,
"loss": 5.046623229980469,
"step": 377
},
{
"epoch": 0.2297872340425532,
"grad_norm": 1.0726312398910522,
"learning_rate": 0.0005,
"loss": 5.3511962890625,
"step": 378
},
{
"epoch": 0.2303951367781155,
"grad_norm": 1.1179200410842896,
"learning_rate": 0.0005,
"loss": 4.847324371337891,
"step": 379
},
{
"epoch": 0.23100303951367782,
"grad_norm": 1.1474509239196777,
"learning_rate": 0.0005,
"loss": 4.830921173095703,
"step": 380
},
{
"epoch": 0.23161094224924011,
"grad_norm": 1.0454329252243042,
"learning_rate": 0.0005,
"loss": 4.962401390075684,
"step": 381
},
{
"epoch": 0.23221884498480244,
"grad_norm": 1.214348316192627,
"learning_rate": 0.0005,
"loss": 4.800313472747803,
"step": 382
},
{
"epoch": 0.23282674772036474,
"grad_norm": 1.18563973903656,
"learning_rate": 0.0005,
"loss": 4.8629655838012695,
"step": 383
},
{
"epoch": 0.23343465045592704,
"grad_norm": 1.0595086812973022,
"learning_rate": 0.0005,
"loss": 4.9949750900268555,
"step": 384
},
{
"epoch": 0.23404255319148937,
"grad_norm": 1.0595086812973022,
"learning_rate": 0.0005,
"loss": 4.926072597503662,
"step": 385
},
{
"epoch": 0.23465045592705167,
"grad_norm": 1.1770035028457642,
"learning_rate": 0.0005,
"loss": 4.766304969787598,
"step": 386
},
{
"epoch": 0.235258358662614,
"grad_norm": 1.1117204427719116,
"learning_rate": 0.0005,
"loss": 4.896605968475342,
"step": 387
},
{
"epoch": 0.2358662613981763,
"grad_norm": 1.2087441682815552,
"learning_rate": 0.0005,
"loss": 4.892548084259033,
"step": 388
},
{
"epoch": 0.2364741641337386,
"grad_norm": 0.9041852355003357,
"learning_rate": 0.0005,
"loss": 4.948829650878906,
"step": 389
},
{
"epoch": 0.23708206686930092,
"grad_norm": 0.94862300157547,
"learning_rate": 0.0005,
"loss": 4.8753533363342285,
"step": 390
},
{
"epoch": 0.23768996960486322,
"grad_norm": 1.055679202079773,
"learning_rate": 0.0005,
"loss": 4.816287994384766,
"step": 391
},
{
"epoch": 0.23829787234042554,
"grad_norm": 1.413857340812683,
"learning_rate": 0.0005,
"loss": 4.809457778930664,
"step": 392
},
{
"epoch": 0.23890577507598784,
"grad_norm": 1.326051950454712,
"learning_rate": 0.0005,
"loss": 5.0313568115234375,
"step": 393
},
{
"epoch": 0.23951367781155014,
"grad_norm": 1.2621649503707886,
"learning_rate": 0.0005,
"loss": 4.906643867492676,
"step": 394
},
{
"epoch": 0.24012158054711247,
"grad_norm": 1.2217754125595093,
"learning_rate": 0.0005,
"loss": 4.929527759552002,
"step": 395
},
{
"epoch": 0.24072948328267477,
"grad_norm": 1.1450992822647095,
"learning_rate": 0.0005,
"loss": 4.908195495605469,
"step": 396
},
{
"epoch": 0.2413373860182371,
"grad_norm": 1.4507970809936523,
"learning_rate": 0.0005,
"loss": 5.079260349273682,
"step": 397
},
{
"epoch": 0.2419452887537994,
"grad_norm": 1.086036205291748,
"learning_rate": 0.0005,
"loss": 4.996855735778809,
"step": 398
},
{
"epoch": 0.2425531914893617,
"grad_norm": 1.0666170120239258,
"learning_rate": 0.0005,
"loss": 5.002256393432617,
"step": 399
},
{
"epoch": 0.24316109422492402,
"grad_norm": 1.199183702468872,
"learning_rate": 0.0005,
"loss": 5.217647552490234,
"step": 400
},
{
"epoch": 0.24376899696048632,
"grad_norm": 1.156293511390686,
"learning_rate": 0.0005,
"loss": 4.900952339172363,
"step": 401
},
{
"epoch": 0.24437689969604864,
"grad_norm": 1.3151594400405884,
"learning_rate": 0.0005,
"loss": 4.980197906494141,
"step": 402
},
{
"epoch": 0.24498480243161094,
"grad_norm": 1.0817885398864746,
"learning_rate": 0.0005,
"loss": 4.745031356811523,
"step": 403
},
{
"epoch": 0.24559270516717324,
"grad_norm": 1.0003957748413086,
"learning_rate": 0.0005,
"loss": 4.599782466888428,
"step": 404
},
{
"epoch": 0.24620060790273557,
"grad_norm": 0.95441734790802,
"learning_rate": 0.0005,
"loss": 4.928730010986328,
"step": 405
},
{
"epoch": 0.24680851063829787,
"grad_norm": 1.1539515256881714,
"learning_rate": 0.0005,
"loss": 5.01755428314209,
"step": 406
},
{
"epoch": 0.2474164133738602,
"grad_norm": 1.1274021863937378,
"learning_rate": 0.0005,
"loss": 4.92464542388916,
"step": 407
},
{
"epoch": 0.2480243161094225,
"grad_norm": 1.075126051902771,
"learning_rate": 0.0005,
"loss": 4.842813014984131,
"step": 408
},
{
"epoch": 0.2486322188449848,
"grad_norm": 1.1200828552246094,
"learning_rate": 0.0005,
"loss": 4.701647758483887,
"step": 409
},
{
"epoch": 0.24924012158054712,
"grad_norm": 1.349135398864746,
"learning_rate": 0.0005,
"loss": 5.124917030334473,
"step": 410
},
{
"epoch": 0.24984802431610942,
"grad_norm": 1.403590440750122,
"learning_rate": 0.0005,
"loss": 5.070537567138672,
"step": 411
},
{
"epoch": 0.25045592705167175,
"grad_norm": 0.9664301872253418,
"learning_rate": 0.0005,
"loss": 4.846314430236816,
"step": 412
},
{
"epoch": 0.251063829787234,
"grad_norm": 1.1642309427261353,
"learning_rate": 0.0005,
"loss": 4.933165550231934,
"step": 413
},
{
"epoch": 0.25167173252279634,
"grad_norm": 1.1649516820907593,
"learning_rate": 0.0005,
"loss": 4.789491653442383,
"step": 414
},
{
"epoch": 0.25227963525835867,
"grad_norm": 1.1041150093078613,
"learning_rate": 0.0005,
"loss": 4.580702781677246,
"step": 415
},
{
"epoch": 0.252887537993921,
"grad_norm": 1.0078331232070923,
"learning_rate": 0.0005,
"loss": 4.77386999130249,
"step": 416
},
{
"epoch": 0.25349544072948327,
"grad_norm": 1.0907591581344604,
"learning_rate": 0.0005,
"loss": 4.774503707885742,
"step": 417
},
{
"epoch": 0.2541033434650456,
"grad_norm": 1.3880425691604614,
"learning_rate": 0.0005,
"loss": 4.793880462646484,
"step": 418
},
{
"epoch": 0.2547112462006079,
"grad_norm": 1.2313039302825928,
"learning_rate": 0.0005,
"loss": 4.7932891845703125,
"step": 419
},
{
"epoch": 0.2553191489361702,
"grad_norm": 0.9940412044525146,
"learning_rate": 0.0005,
"loss": 5.119372367858887,
"step": 420
},
{
"epoch": 0.2559270516717325,
"grad_norm": 1.0474408864974976,
"learning_rate": 0.0005,
"loss": 4.940298080444336,
"step": 421
},
{
"epoch": 0.25653495440729485,
"grad_norm": 1.091572642326355,
"learning_rate": 0.0005,
"loss": 4.824063777923584,
"step": 422
},
{
"epoch": 0.2571428571428571,
"grad_norm": 0.9919223189353943,
"learning_rate": 0.0005,
"loss": 4.823666572570801,
"step": 423
},
{
"epoch": 0.25775075987841944,
"grad_norm": 0.9640527963638306,
"learning_rate": 0.0005,
"loss": 4.798361778259277,
"step": 424
},
{
"epoch": 0.25835866261398177,
"grad_norm": 1.0292719602584839,
"learning_rate": 0.0005,
"loss": 4.69101619720459,
"step": 425
},
{
"epoch": 0.2589665653495441,
"grad_norm": 1.2390789985656738,
"learning_rate": 0.0005,
"loss": 4.671029090881348,
"step": 426
},
{
"epoch": 0.25957446808510637,
"grad_norm": 1.2008142471313477,
"learning_rate": 0.0005,
"loss": 4.796487331390381,
"step": 427
},
{
"epoch": 0.2601823708206687,
"grad_norm": 1.0405327081680298,
"learning_rate": 0.0005,
"loss": 4.8557820320129395,
"step": 428
},
{
"epoch": 0.260790273556231,
"grad_norm": 1.042792558670044,
"learning_rate": 0.0005,
"loss": 4.805086135864258,
"step": 429
},
{
"epoch": 0.2613981762917933,
"grad_norm": 1.6039878129959106,
"learning_rate": 0.0005,
"loss": 4.892642974853516,
"step": 430
},
{
"epoch": 0.2620060790273556,
"grad_norm": 1.0221588611602783,
"learning_rate": 0.0005,
"loss": 4.868304252624512,
"step": 431
},
{
"epoch": 0.26261398176291795,
"grad_norm": 1.0673880577087402,
"learning_rate": 0.0005,
"loss": 4.52126932144165,
"step": 432
},
{
"epoch": 0.2632218844984802,
"grad_norm": 1.1782925128936768,
"learning_rate": 0.0005,
"loss": 4.9915618896484375,
"step": 433
},
{
"epoch": 0.26382978723404255,
"grad_norm": 0.9004169702529907,
"learning_rate": 0.0005,
"loss": 5.040285110473633,
"step": 434
},
{
"epoch": 0.26443768996960487,
"grad_norm": 1.1495839357376099,
"learning_rate": 0.0005,
"loss": 4.991700172424316,
"step": 435
},
{
"epoch": 0.2650455927051672,
"grad_norm": 1.4188427925109863,
"learning_rate": 0.0005,
"loss": 4.851819038391113,
"step": 436
},
{
"epoch": 0.26565349544072947,
"grad_norm": 1.1886249780654907,
"learning_rate": 0.0005,
"loss": 4.819738388061523,
"step": 437
},
{
"epoch": 0.2662613981762918,
"grad_norm": 1.0886558294296265,
"learning_rate": 0.0005,
"loss": 4.889862537384033,
"step": 438
},
{
"epoch": 0.2668693009118541,
"grad_norm": 1.215423822402954,
"learning_rate": 0.0005,
"loss": 4.66435432434082,
"step": 439
},
{
"epoch": 0.2674772036474164,
"grad_norm": 1.2564237117767334,
"learning_rate": 0.0005,
"loss": 4.840651512145996,
"step": 440
},
{
"epoch": 0.2680851063829787,
"grad_norm": 0.9406836628913879,
"learning_rate": 0.0005,
"loss": 4.836145401000977,
"step": 441
},
{
"epoch": 0.26869300911854105,
"grad_norm": 0.9963774085044861,
"learning_rate": 0.0005,
"loss": 4.879360675811768,
"step": 442
},
{
"epoch": 0.2693009118541033,
"grad_norm": 1.349959135055542,
"learning_rate": 0.0005,
"loss": 5.149614334106445,
"step": 443
},
{
"epoch": 0.26990881458966565,
"grad_norm": 1.0401732921600342,
"learning_rate": 0.0005,
"loss": 4.831120491027832,
"step": 444
},
{
"epoch": 0.270516717325228,
"grad_norm": 1.0176857709884644,
"learning_rate": 0.0005,
"loss": 4.795515060424805,
"step": 445
},
{
"epoch": 0.2711246200607903,
"grad_norm": 1.025748610496521,
"learning_rate": 0.0005,
"loss": 4.850000381469727,
"step": 446
},
{
"epoch": 0.27173252279635257,
"grad_norm": 1.179107904434204,
"learning_rate": 0.0005,
"loss": 4.714792728424072,
"step": 447
},
{
"epoch": 0.2723404255319149,
"grad_norm": 1.0913288593292236,
"learning_rate": 0.0005,
"loss": 4.713229656219482,
"step": 448
},
{
"epoch": 0.2729483282674772,
"grad_norm": 1.2143056392669678,
"learning_rate": 0.0005,
"loss": 4.776023864746094,
"step": 449
},
{
"epoch": 0.2735562310030395,
"grad_norm": 1.0799494981765747,
"learning_rate": 0.0005,
"loss": 4.930194854736328,
"step": 450
},
{
"epoch": 0.2741641337386018,
"grad_norm": 1.108874797821045,
"learning_rate": 0.0005,
"loss": 4.798364162445068,
"step": 451
},
{
"epoch": 0.27477203647416415,
"grad_norm": 1.023545742034912,
"learning_rate": 0.0005,
"loss": 4.951462745666504,
"step": 452
},
{
"epoch": 0.2753799392097264,
"grad_norm": 1.109633207321167,
"learning_rate": 0.0005,
"loss": 4.775464057922363,
"step": 453
},
{
"epoch": 0.27598784194528875,
"grad_norm": 1.3409186601638794,
"learning_rate": 0.0005,
"loss": 4.637991905212402,
"step": 454
},
{
"epoch": 0.2765957446808511,
"grad_norm": 1.3562052249908447,
"learning_rate": 0.0005,
"loss": 4.67308235168457,
"step": 455
},
{
"epoch": 0.2772036474164134,
"grad_norm": 1.0121145248413086,
"learning_rate": 0.0005,
"loss": 4.8010430335998535,
"step": 456
},
{
"epoch": 0.2778115501519757,
"grad_norm": 1.1394174098968506,
"learning_rate": 0.0005,
"loss": 4.878546237945557,
"step": 457
},
{
"epoch": 0.278419452887538,
"grad_norm": 1.2403444051742554,
"learning_rate": 0.0005,
"loss": 4.8740434646606445,
"step": 458
},
{
"epoch": 0.2790273556231003,
"grad_norm": 1.242672085762024,
"learning_rate": 0.0005,
"loss": 4.854490280151367,
"step": 459
},
{
"epoch": 0.2796352583586626,
"grad_norm": 1.1986356973648071,
"learning_rate": 0.0005,
"loss": 4.629700660705566,
"step": 460
},
{
"epoch": 0.2802431610942249,
"grad_norm": 1.0786645412445068,
"learning_rate": 0.0005,
"loss": 4.87874698638916,
"step": 461
},
{
"epoch": 0.28085106382978725,
"grad_norm": 1.1056885719299316,
"learning_rate": 0.0005,
"loss": 4.816555023193359,
"step": 462
},
{
"epoch": 0.2814589665653495,
"grad_norm": 1.2329976558685303,
"learning_rate": 0.0005,
"loss": 4.837638854980469,
"step": 463
},
{
"epoch": 0.28206686930091185,
"grad_norm": 1.0028218030929565,
"learning_rate": 0.0005,
"loss": 4.760637283325195,
"step": 464
},
{
"epoch": 0.2826747720364742,
"grad_norm": 2.1149895191192627,
"learning_rate": 0.0005,
"loss": 4.90034818649292,
"step": 465
},
{
"epoch": 0.28328267477203645,
"grad_norm": 1.1582082509994507,
"learning_rate": 0.0005,
"loss": 4.943870544433594,
"step": 466
},
{
"epoch": 0.2838905775075988,
"grad_norm": 1.069417119026184,
"learning_rate": 0.0005,
"loss": 4.872045993804932,
"step": 467
},
{
"epoch": 0.2844984802431611,
"grad_norm": 1.0112608671188354,
"learning_rate": 0.0005,
"loss": 4.7598490715026855,
"step": 468
},
{
"epoch": 0.2851063829787234,
"grad_norm": 1.2075181007385254,
"learning_rate": 0.0005,
"loss": 4.731328010559082,
"step": 469
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.2083991765975952,
"learning_rate": 0.0005,
"loss": 4.927289962768555,
"step": 470
},
{
"epoch": 0.286322188449848,
"grad_norm": 1.1168643236160278,
"learning_rate": 0.0005,
"loss": 4.864751815795898,
"step": 471
},
{
"epoch": 0.28693009118541035,
"grad_norm": 1.078041434288025,
"learning_rate": 0.0005,
"loss": 4.8492431640625,
"step": 472
},
{
"epoch": 0.2875379939209726,
"grad_norm": 1.1274940967559814,
"learning_rate": 0.0005,
"loss": 4.937112808227539,
"step": 473
},
{
"epoch": 0.28814589665653495,
"grad_norm": 1.0653259754180908,
"learning_rate": 0.0005,
"loss": 4.594569683074951,
"step": 474
},
{
"epoch": 0.2887537993920973,
"grad_norm": 1.1258432865142822,
"learning_rate": 0.0005,
"loss": 4.773998260498047,
"step": 475
},
{
"epoch": 0.28936170212765955,
"grad_norm": 1.0394357442855835,
"learning_rate": 0.0005,
"loss": 4.6821393966674805,
"step": 476
},
{
"epoch": 0.2899696048632219,
"grad_norm": 0.9899529218673706,
"learning_rate": 0.0005,
"loss": 4.887704849243164,
"step": 477
},
{
"epoch": 0.2905775075987842,
"grad_norm": 1.1077382564544678,
"learning_rate": 0.0005,
"loss": 4.747071266174316,
"step": 478
},
{
"epoch": 0.29118541033434653,
"grad_norm": 1.1913772821426392,
"learning_rate": 0.0005,
"loss": 4.718881607055664,
"step": 479
},
{
"epoch": 0.2917933130699088,
"grad_norm": 1.0459861755371094,
"learning_rate": 0.0005,
"loss": 4.841939926147461,
"step": 480
},
{
"epoch": 0.2924012158054711,
"grad_norm": 1.0120186805725098,
"learning_rate": 0.0005,
"loss": 4.599112510681152,
"step": 481
},
{
"epoch": 0.29300911854103345,
"grad_norm": 1.195823073387146,
"learning_rate": 0.0005,
"loss": 4.728496551513672,
"step": 482
},
{
"epoch": 0.2936170212765957,
"grad_norm": 1.3696142435073853,
"learning_rate": 0.0005,
"loss": 4.8885321617126465,
"step": 483
},
{
"epoch": 0.29422492401215805,
"grad_norm": 1.0792248249053955,
"learning_rate": 0.0005,
"loss": 4.971987724304199,
"step": 484
},
{
"epoch": 0.2948328267477204,
"grad_norm": 1.1619709730148315,
"learning_rate": 0.0005,
"loss": 4.571520805358887,
"step": 485
},
{
"epoch": 0.29544072948328265,
"grad_norm": 1.0330854654312134,
"learning_rate": 0.0005,
"loss": 4.9688520431518555,
"step": 486
},
{
"epoch": 0.296048632218845,
"grad_norm": 1.0170172452926636,
"learning_rate": 0.0005,
"loss": 4.837705135345459,
"step": 487
},
{
"epoch": 0.2966565349544073,
"grad_norm": 0.9504514932632446,
"learning_rate": 0.0005,
"loss": 4.930578231811523,
"step": 488
},
{
"epoch": 0.29726443768996963,
"grad_norm": 1.0397839546203613,
"learning_rate": 0.0005,
"loss": 4.835279941558838,
"step": 489
},
{
"epoch": 0.2978723404255319,
"grad_norm": 1.1507797241210938,
"learning_rate": 0.0005,
"loss": 4.659822463989258,
"step": 490
},
{
"epoch": 0.2984802431610942,
"grad_norm": 1.0850329399108887,
"learning_rate": 0.0005,
"loss": 4.845378875732422,
"step": 491
},
{
"epoch": 0.29908814589665655,
"grad_norm": 0.9977235794067383,
"learning_rate": 0.0005,
"loss": 4.6792449951171875,
"step": 492
},
{
"epoch": 0.2996960486322188,
"grad_norm": 1.1023447513580322,
"learning_rate": 0.0005,
"loss": 4.397878646850586,
"step": 493
},
{
"epoch": 0.30030395136778115,
"grad_norm": 1.151859998703003,
"learning_rate": 0.0005,
"loss": 4.909426689147949,
"step": 494
},
{
"epoch": 0.3009118541033435,
"grad_norm": 0.9461018443107605,
"learning_rate": 0.0005,
"loss": 4.778614044189453,
"step": 495
},
{
"epoch": 0.30151975683890575,
"grad_norm": 1.0753334760665894,
"learning_rate": 0.0005,
"loss": 4.747906684875488,
"step": 496
},
{
"epoch": 0.3021276595744681,
"grad_norm": 1.1790133714675903,
"learning_rate": 0.0005,
"loss": 4.932548522949219,
"step": 497
},
{
"epoch": 0.3027355623100304,
"grad_norm": 0.9537319540977478,
"learning_rate": 0.0005,
"loss": 4.962670803070068,
"step": 498
},
{
"epoch": 0.30334346504559273,
"grad_norm": 1.0915073156356812,
"learning_rate": 0.0005,
"loss": 4.60493278503418,
"step": 499
},
{
"epoch": 0.303951367781155,
"grad_norm": 1.1177006959915161,
"learning_rate": 0.0005,
"loss": 4.69853401184082,
"step": 500
},
{
"epoch": 0.30455927051671733,
"grad_norm": 1.297899842262268,
"learning_rate": 0.0005,
"loss": 4.779489517211914,
"step": 501
},
{
"epoch": 0.30516717325227966,
"grad_norm": 1.0834105014801025,
"learning_rate": 0.0005,
"loss": 4.795891761779785,
"step": 502
},
{
"epoch": 0.3057750759878419,
"grad_norm": 1.345795750617981,
"learning_rate": 0.0005,
"loss": 4.725937843322754,
"step": 503
},
{
"epoch": 0.30638297872340425,
"grad_norm": 1.0314546823501587,
"learning_rate": 0.0005,
"loss": 4.679283142089844,
"step": 504
},
{
"epoch": 0.3069908814589666,
"grad_norm": 1.0348689556121826,
"learning_rate": 0.0005,
"loss": 4.620650291442871,
"step": 505
},
{
"epoch": 0.30759878419452885,
"grad_norm": 1.266882061958313,
"learning_rate": 0.0005,
"loss": 4.773314476013184,
"step": 506
},
{
"epoch": 0.3082066869300912,
"grad_norm": 1.1243505477905273,
"learning_rate": 0.0005,
"loss": 4.748200416564941,
"step": 507
},
{
"epoch": 0.3088145896656535,
"grad_norm": 1.1018924713134766,
"learning_rate": 0.0005,
"loss": 4.68126106262207,
"step": 508
},
{
"epoch": 0.30942249240121583,
"grad_norm": 0.9563927054405212,
"learning_rate": 0.0005,
"loss": 4.857057094573975,
"step": 509
},
{
"epoch": 0.3100303951367781,
"grad_norm": 0.9670454263687134,
"learning_rate": 0.0005,
"loss": 4.659792900085449,
"step": 510
},
{
"epoch": 0.31063829787234043,
"grad_norm": 1.3360145092010498,
"learning_rate": 0.0005,
"loss": 4.829246520996094,
"step": 511
},
{
"epoch": 0.31124620060790276,
"grad_norm": 1.2123932838439941,
"learning_rate": 0.0005,
"loss": 4.866283416748047,
"step": 512
},
{
"epoch": 0.31185410334346503,
"grad_norm": 1.1718541383743286,
"learning_rate": 0.0005,
"loss": 4.582745552062988,
"step": 513
},
{
"epoch": 0.31246200607902735,
"grad_norm": 1.0925103425979614,
"learning_rate": 0.0005,
"loss": 4.792252540588379,
"step": 514
},
{
"epoch": 0.3130699088145897,
"grad_norm": 1.1929430961608887,
"learning_rate": 0.0005,
"loss": 5.072274208068848,
"step": 515
},
{
"epoch": 0.31367781155015195,
"grad_norm": 1.1033862829208374,
"learning_rate": 0.0005,
"loss": 5.100406646728516,
"step": 516
},
{
"epoch": 0.3142857142857143,
"grad_norm": 1.0984266996383667,
"learning_rate": 0.0005,
"loss": 4.652458190917969,
"step": 517
},
{
"epoch": 0.3148936170212766,
"grad_norm": 1.1322665214538574,
"learning_rate": 0.0005,
"loss": 4.757636070251465,
"step": 518
},
{
"epoch": 0.31550151975683893,
"grad_norm": 1.062367558479309,
"learning_rate": 0.0005,
"loss": 4.769024848937988,
"step": 519
},
{
"epoch": 0.3161094224924012,
"grad_norm": 1.2141786813735962,
"learning_rate": 0.0005,
"loss": 4.795253753662109,
"step": 520
},
{
"epoch": 0.31671732522796353,
"grad_norm": 1.0612986087799072,
"learning_rate": 0.0005,
"loss": 4.869831562042236,
"step": 521
},
{
"epoch": 0.31732522796352586,
"grad_norm": 1.0063875913619995,
"learning_rate": 0.0005,
"loss": 4.789008617401123,
"step": 522
},
{
"epoch": 0.31793313069908813,
"grad_norm": 1.1345361471176147,
"learning_rate": 0.0005,
"loss": 4.858623504638672,
"step": 523
},
{
"epoch": 0.31854103343465046,
"grad_norm": 1.0883427858352661,
"learning_rate": 0.0005,
"loss": 4.6939568519592285,
"step": 524
},
{
"epoch": 0.3191489361702128,
"grad_norm": 1.210877776145935,
"learning_rate": 0.0005,
"loss": 4.860000133514404,
"step": 525
},
{
"epoch": 0.31975683890577505,
"grad_norm": 0.9779753088951111,
"learning_rate": 0.0005,
"loss": 4.710822582244873,
"step": 526
},
{
"epoch": 0.3203647416413374,
"grad_norm": 1.130603313446045,
"learning_rate": 0.0005,
"loss": 4.8572678565979,
"step": 527
},
{
"epoch": 0.3209726443768997,
"grad_norm": 1.0674115419387817,
"learning_rate": 0.0005,
"loss": 4.597178936004639,
"step": 528
},
{
"epoch": 0.321580547112462,
"grad_norm": 1.2021600008010864,
"learning_rate": 0.0005,
"loss": 4.564465045928955,
"step": 529
},
{
"epoch": 0.3221884498480243,
"grad_norm": 1.018747329711914,
"learning_rate": 0.0005,
"loss": 4.791827201843262,
"step": 530
},
{
"epoch": 0.32279635258358663,
"grad_norm": 0.847745418548584,
"learning_rate": 0.0005,
"loss": 4.538583278656006,
"step": 531
},
{
"epoch": 0.32340425531914896,
"grad_norm": 1.0722301006317139,
"learning_rate": 0.0005,
"loss": 4.728479385375977,
"step": 532
},
{
"epoch": 0.32401215805471123,
"grad_norm": 1.0908275842666626,
"learning_rate": 0.0005,
"loss": 4.7406721115112305,
"step": 533
},
{
"epoch": 0.32462006079027356,
"grad_norm": 1.0944693088531494,
"learning_rate": 0.0005,
"loss": 4.56569242477417,
"step": 534
},
{
"epoch": 0.3252279635258359,
"grad_norm": 1.2364919185638428,
"learning_rate": 0.0005,
"loss": 4.977725028991699,
"step": 535
},
{
"epoch": 0.32583586626139815,
"grad_norm": 0.9999113082885742,
"learning_rate": 0.0005,
"loss": 4.493361473083496,
"step": 536
},
{
"epoch": 0.3264437689969605,
"grad_norm": 1.3366332054138184,
"learning_rate": 0.0005,
"loss": 4.634256362915039,
"step": 537
},
{
"epoch": 0.3270516717325228,
"grad_norm": 1.1342191696166992,
"learning_rate": 0.0005,
"loss": 4.737150192260742,
"step": 538
},
{
"epoch": 0.3276595744680851,
"grad_norm": 1.582653284072876,
"learning_rate": 0.0005,
"loss": 4.870404243469238,
"step": 539
},
{
"epoch": 0.3282674772036474,
"grad_norm": 1.1713464260101318,
"learning_rate": 0.0005,
"loss": 4.6230669021606445,
"step": 540
},
{
"epoch": 0.32887537993920973,
"grad_norm": 1.4178698062896729,
"learning_rate": 0.0005,
"loss": 4.764198303222656,
"step": 541
},
{
"epoch": 0.32948328267477206,
"grad_norm": 1.2060075998306274,
"learning_rate": 0.0005,
"loss": 4.675044059753418,
"step": 542
},
{
"epoch": 0.33009118541033433,
"grad_norm": 1.1698312759399414,
"learning_rate": 0.0005,
"loss": 4.706038475036621,
"step": 543
},
{
"epoch": 0.33069908814589666,
"grad_norm": 1.23035728931427,
"learning_rate": 0.0005,
"loss": 4.638150215148926,
"step": 544
},
{
"epoch": 0.331306990881459,
"grad_norm": 1.2109099626541138,
"learning_rate": 0.0005,
"loss": 4.521143436431885,
"step": 545
},
{
"epoch": 0.33191489361702126,
"grad_norm": 1.0906360149383545,
"learning_rate": 0.0005,
"loss": 4.71769380569458,
"step": 546
},
{
"epoch": 0.3325227963525836,
"grad_norm": 0.9782645106315613,
"learning_rate": 0.0005,
"loss": 4.610015869140625,
"step": 547
},
{
"epoch": 0.3331306990881459,
"grad_norm": 0.9349035620689392,
"learning_rate": 0.0005,
"loss": 4.59166955947876,
"step": 548
},
{
"epoch": 0.3337386018237082,
"grad_norm": 0.987219512462616,
"learning_rate": 0.0005,
"loss": 4.769125938415527,
"step": 549
},
{
"epoch": 0.3343465045592705,
"grad_norm": 1.1204229593276978,
"learning_rate": 0.0005,
"loss": 4.561359405517578,
"step": 550
},
{
"epoch": 0.33495440729483283,
"grad_norm": 0.9658718109130859,
"learning_rate": 0.0005,
"loss": 4.64151668548584,
"step": 551
},
{
"epoch": 0.33556231003039516,
"grad_norm": 0.9612642526626587,
"learning_rate": 0.0005,
"loss": 4.750694274902344,
"step": 552
},
{
"epoch": 0.33617021276595743,
"grad_norm": 1.215868592262268,
"learning_rate": 0.0005,
"loss": 4.788500785827637,
"step": 553
},
{
"epoch": 0.33677811550151976,
"grad_norm": 1.1488007307052612,
"learning_rate": 0.0005,
"loss": 4.708594799041748,
"step": 554
},
{
"epoch": 0.3373860182370821,
"grad_norm": 1.7407371997833252,
"learning_rate": 0.0005,
"loss": 4.751000881195068,
"step": 555
},
{
"epoch": 0.33799392097264436,
"grad_norm": 1.0364381074905396,
"learning_rate": 0.0005,
"loss": 4.5454301834106445,
"step": 556
},
{
"epoch": 0.3386018237082067,
"grad_norm": 1.0255850553512573,
"learning_rate": 0.0005,
"loss": 4.67049503326416,
"step": 557
},
{
"epoch": 0.339209726443769,
"grad_norm": 1.1722489595413208,
"learning_rate": 0.0005,
"loss": 4.762301445007324,
"step": 558
},
{
"epoch": 0.3398176291793313,
"grad_norm": 0.9487795829772949,
"learning_rate": 0.0005,
"loss": 4.537074089050293,
"step": 559
},
{
"epoch": 0.3404255319148936,
"grad_norm": 1.0322198867797852,
"learning_rate": 0.0005,
"loss": 4.325550079345703,
"step": 560
},
{
"epoch": 0.34103343465045594,
"grad_norm": 1.1969901323318481,
"learning_rate": 0.0005,
"loss": 4.897404670715332,
"step": 561
},
{
"epoch": 0.34164133738601826,
"grad_norm": 0.9366703629493713,
"learning_rate": 0.0005,
"loss": 4.552170753479004,
"step": 562
},
{
"epoch": 0.34224924012158053,
"grad_norm": 0.9916586875915527,
"learning_rate": 0.0005,
"loss": 4.596172332763672,
"step": 563
},
{
"epoch": 0.34285714285714286,
"grad_norm": 1.1367878913879395,
"learning_rate": 0.0005,
"loss": 4.745723724365234,
"step": 564
},
{
"epoch": 0.3434650455927052,
"grad_norm": 1.0490455627441406,
"learning_rate": 0.0005,
"loss": 4.605084419250488,
"step": 565
},
{
"epoch": 0.34407294832826746,
"grad_norm": 1.2300151586532593,
"learning_rate": 0.0005,
"loss": 4.680173397064209,
"step": 566
},
{
"epoch": 0.3446808510638298,
"grad_norm": 0.9747954607009888,
"learning_rate": 0.0005,
"loss": 4.755300521850586,
"step": 567
},
{
"epoch": 0.3452887537993921,
"grad_norm": 1.2195698022842407,
"learning_rate": 0.0005,
"loss": 4.678683280944824,
"step": 568
},
{
"epoch": 0.3458966565349544,
"grad_norm": 1.1122758388519287,
"learning_rate": 0.0005,
"loss": 4.55827522277832,
"step": 569
},
{
"epoch": 0.3465045592705167,
"grad_norm": 1.1671665906906128,
"learning_rate": 0.0005,
"loss": 4.6204071044921875,
"step": 570
},
{
"epoch": 0.34711246200607904,
"grad_norm": 0.912133514881134,
"learning_rate": 0.0005,
"loss": 4.619932174682617,
"step": 571
},
{
"epoch": 0.34772036474164136,
"grad_norm": 1.0673686265945435,
"learning_rate": 0.0005,
"loss": 4.7417120933532715,
"step": 572
},
{
"epoch": 0.34832826747720363,
"grad_norm": 1.0796691179275513,
"learning_rate": 0.0005,
"loss": 4.666133880615234,
"step": 573
},
{
"epoch": 0.34893617021276596,
"grad_norm": 1.177518367767334,
"learning_rate": 0.0005,
"loss": 4.443113803863525,
"step": 574
},
{
"epoch": 0.3495440729483283,
"grad_norm": 0.9157246351242065,
"learning_rate": 0.0005,
"loss": 4.578097343444824,
"step": 575
},
{
"epoch": 0.35015197568389056,
"grad_norm": 1.034294843673706,
"learning_rate": 0.0005,
"loss": 4.393146514892578,
"step": 576
},
{
"epoch": 0.3507598784194529,
"grad_norm": 0.9026995301246643,
"learning_rate": 0.0005,
"loss": 4.868537425994873,
"step": 577
},
{
"epoch": 0.3513677811550152,
"grad_norm": 1.1576241254806519,
"learning_rate": 0.0005,
"loss": 4.755158424377441,
"step": 578
},
{
"epoch": 0.3519756838905775,
"grad_norm": 1.061812400817871,
"learning_rate": 0.0005,
"loss": 4.48585319519043,
"step": 579
},
{
"epoch": 0.3525835866261398,
"grad_norm": 0.9842910170555115,
"learning_rate": 0.0005,
"loss": 4.865891456604004,
"step": 580
},
{
"epoch": 0.35319148936170214,
"grad_norm": 1.0243335962295532,
"learning_rate": 0.0005,
"loss": 4.523388862609863,
"step": 581
},
{
"epoch": 0.35379939209726446,
"grad_norm": 1.2581957578659058,
"learning_rate": 0.0005,
"loss": 4.821706771850586,
"step": 582
},
{
"epoch": 0.35440729483282674,
"grad_norm": 1.1777689456939697,
"learning_rate": 0.0005,
"loss": 4.600160121917725,
"step": 583
},
{
"epoch": 0.35501519756838906,
"grad_norm": 0.9623486995697021,
"learning_rate": 0.0005,
"loss": 4.775470733642578,
"step": 584
},
{
"epoch": 0.3556231003039514,
"grad_norm": 1.302804708480835,
"learning_rate": 0.0005,
"loss": 4.704485893249512,
"step": 585
},
{
"epoch": 0.35623100303951366,
"grad_norm": 1.15083646774292,
"learning_rate": 0.0005,
"loss": 4.685108184814453,
"step": 586
},
{
"epoch": 0.356838905775076,
"grad_norm": 1.0529240369796753,
"learning_rate": 0.0005,
"loss": 4.762598991394043,
"step": 587
},
{
"epoch": 0.3574468085106383,
"grad_norm": 1.008600115776062,
"learning_rate": 0.0005,
"loss": 4.711298942565918,
"step": 588
},
{
"epoch": 0.3580547112462006,
"grad_norm": 1.1591368913650513,
"learning_rate": 0.0005,
"loss": 4.836706638336182,
"step": 589
},
{
"epoch": 0.3586626139817629,
"grad_norm": 1.0372366905212402,
"learning_rate": 0.0005,
"loss": 4.753532409667969,
"step": 590
},
{
"epoch": 0.35927051671732524,
"grad_norm": 0.9533773064613342,
"learning_rate": 0.0005,
"loss": 4.787997245788574,
"step": 591
},
{
"epoch": 0.35987841945288757,
"grad_norm": 1.3395041227340698,
"learning_rate": 0.0005,
"loss": 4.700077533721924,
"step": 592
},
{
"epoch": 0.36048632218844984,
"grad_norm": 1.0645594596862793,
"learning_rate": 0.0005,
"loss": 4.607672691345215,
"step": 593
},
{
"epoch": 0.36109422492401216,
"grad_norm": 1.2142505645751953,
"learning_rate": 0.0005,
"loss": 4.6179375648498535,
"step": 594
},
{
"epoch": 0.3617021276595745,
"grad_norm": 1.2730581760406494,
"learning_rate": 0.0005,
"loss": 4.555119514465332,
"step": 595
},
{
"epoch": 0.36231003039513676,
"grad_norm": 1.0680732727050781,
"learning_rate": 0.0005,
"loss": 4.700529098510742,
"step": 596
},
{
"epoch": 0.3629179331306991,
"grad_norm": 1.055757761001587,
"learning_rate": 0.0005,
"loss": 4.544746398925781,
"step": 597
},
{
"epoch": 0.3635258358662614,
"grad_norm": 1.2012107372283936,
"learning_rate": 0.0005,
"loss": 4.614580154418945,
"step": 598
},
{
"epoch": 0.3641337386018237,
"grad_norm": 1.0662033557891846,
"learning_rate": 0.0005,
"loss": 4.880558967590332,
"step": 599
},
{
"epoch": 0.364741641337386,
"grad_norm": 1.0305242538452148,
"learning_rate": 0.0005,
"loss": 4.462358474731445,
"step": 600
},
{
"epoch": 0.36534954407294834,
"grad_norm": 1.0423706769943237,
"learning_rate": 0.0005,
"loss": 4.591382026672363,
"step": 601
},
{
"epoch": 0.3659574468085106,
"grad_norm": 1.2076576948165894,
"learning_rate": 0.0005,
"loss": 4.7383599281311035,
"step": 602
},
{
"epoch": 0.36656534954407294,
"grad_norm": 1.0415648221969604,
"learning_rate": 0.0005,
"loss": 4.586676597595215,
"step": 603
},
{
"epoch": 0.36717325227963526,
"grad_norm": 0.9548492431640625,
"learning_rate": 0.0005,
"loss": 4.836339950561523,
"step": 604
},
{
"epoch": 0.3677811550151976,
"grad_norm": 1.1116399765014648,
"learning_rate": 0.0005,
"loss": 4.634486198425293,
"step": 605
},
{
"epoch": 0.36838905775075986,
"grad_norm": 0.9329056739807129,
"learning_rate": 0.0005,
"loss": 4.806420803070068,
"step": 606
},
{
"epoch": 0.3689969604863222,
"grad_norm": 1.167823314666748,
"learning_rate": 0.0005,
"loss": 4.594254493713379,
"step": 607
},
{
"epoch": 0.3696048632218845,
"grad_norm": 1.0034370422363281,
"learning_rate": 0.0005,
"loss": 4.6151347160339355,
"step": 608
},
{
"epoch": 0.3702127659574468,
"grad_norm": 1.0906440019607544,
"learning_rate": 0.0005,
"loss": 4.540549278259277,
"step": 609
},
{
"epoch": 0.3708206686930091,
"grad_norm": 1.0491790771484375,
"learning_rate": 0.0005,
"loss": 4.600298881530762,
"step": 610
},
{
"epoch": 0.37142857142857144,
"grad_norm": 1.2935380935668945,
"learning_rate": 0.0005,
"loss": 4.646307945251465,
"step": 611
},
{
"epoch": 0.3720364741641337,
"grad_norm": 1.1572242975234985,
"learning_rate": 0.0005,
"loss": 4.820685863494873,
"step": 612
},
{
"epoch": 0.37264437689969604,
"grad_norm": 1.0526167154312134,
"learning_rate": 0.0005,
"loss": 4.463221549987793,
"step": 613
},
{
"epoch": 0.37325227963525837,
"grad_norm": 1.0142046213150024,
"learning_rate": 0.0005,
"loss": 4.979160308837891,
"step": 614
},
{
"epoch": 0.3738601823708207,
"grad_norm": 1.0886595249176025,
"learning_rate": 0.0005,
"loss": 4.659153461456299,
"step": 615
},
{
"epoch": 0.37446808510638296,
"grad_norm": 1.0294383764266968,
"learning_rate": 0.0005,
"loss": 4.511576175689697,
"step": 616
},
{
"epoch": 0.3750759878419453,
"grad_norm": 1.220738172531128,
"learning_rate": 0.0005,
"loss": 4.640242576599121,
"step": 617
},
{
"epoch": 0.3756838905775076,
"grad_norm": 0.976274311542511,
"learning_rate": 0.0005,
"loss": 4.557078838348389,
"step": 618
},
{
"epoch": 0.3762917933130699,
"grad_norm": 1.1121824979782104,
"learning_rate": 0.0005,
"loss": 4.412234306335449,
"step": 619
},
{
"epoch": 0.3768996960486322,
"grad_norm": 1.0940440893173218,
"learning_rate": 0.0005,
"loss": 4.597440242767334,
"step": 620
},
{
"epoch": 0.37750759878419454,
"grad_norm": 1.1758757829666138,
"learning_rate": 0.0005,
"loss": 4.729987144470215,
"step": 621
},
{
"epoch": 0.3781155015197568,
"grad_norm": 0.979016900062561,
"learning_rate": 0.0005,
"loss": 4.656641960144043,
"step": 622
},
{
"epoch": 0.37872340425531914,
"grad_norm": 1.1017565727233887,
"learning_rate": 0.0005,
"loss": 4.587738037109375,
"step": 623
},
{
"epoch": 0.37933130699088147,
"grad_norm": 1.0581464767456055,
"learning_rate": 0.0005,
"loss": 4.452451705932617,
"step": 624
},
{
"epoch": 0.3799392097264438,
"grad_norm": 1.0750993490219116,
"learning_rate": 0.0005,
"loss": 4.531889915466309,
"step": 625
},
{
"epoch": 0.38054711246200607,
"grad_norm": 0.9821625351905823,
"learning_rate": 0.0005,
"loss": 4.488890171051025,
"step": 626
},
{
"epoch": 0.3811550151975684,
"grad_norm": 1.0691367387771606,
"learning_rate": 0.0005,
"loss": 4.62428617477417,
"step": 627
},
{
"epoch": 0.3817629179331307,
"grad_norm": 1.0314120054244995,
"learning_rate": 0.0005,
"loss": 4.533023834228516,
"step": 628
},
{
"epoch": 0.382370820668693,
"grad_norm": 0.9268558025360107,
"learning_rate": 0.0005,
"loss": 4.565212249755859,
"step": 629
},
{
"epoch": 0.3829787234042553,
"grad_norm": 1.0632472038269043,
"learning_rate": 0.0005,
"loss": 4.5511980056762695,
"step": 630
},
{
"epoch": 0.38358662613981764,
"grad_norm": 0.9516937732696533,
"learning_rate": 0.0005,
"loss": 4.546860694885254,
"step": 631
},
{
"epoch": 0.3841945288753799,
"grad_norm": 0.8885926008224487,
"learning_rate": 0.0005,
"loss": 4.540233612060547,
"step": 632
},
{
"epoch": 0.38480243161094224,
"grad_norm": 0.9631567001342773,
"learning_rate": 0.0005,
"loss": 4.552545070648193,
"step": 633
},
{
"epoch": 0.38541033434650457,
"grad_norm": 1.0189249515533447,
"learning_rate": 0.0005,
"loss": 4.413745880126953,
"step": 634
},
{
"epoch": 0.3860182370820669,
"grad_norm": 1.0094175338745117,
"learning_rate": 0.0005,
"loss": 4.266282081604004,
"step": 635
},
{
"epoch": 0.38662613981762917,
"grad_norm": 1.1108192205429077,
"learning_rate": 0.0005,
"loss": 4.169710159301758,
"step": 636
},
{
"epoch": 0.3872340425531915,
"grad_norm": 1.1999133825302124,
"learning_rate": 0.0005,
"loss": 4.5471391677856445,
"step": 637
},
{
"epoch": 0.3878419452887538,
"grad_norm": 1.047059178352356,
"learning_rate": 0.0005,
"loss": 4.793215751647949,
"step": 638
},
{
"epoch": 0.3884498480243161,
"grad_norm": 1.1927613019943237,
"learning_rate": 0.0005,
"loss": 4.474370002746582,
"step": 639
},
{
"epoch": 0.3890577507598784,
"grad_norm": 1.0722092390060425,
"learning_rate": 0.0005,
"loss": 4.685356140136719,
"step": 640
},
{
"epoch": 0.38966565349544074,
"grad_norm": 1.0422673225402832,
"learning_rate": 0.0005,
"loss": 4.5289201736450195,
"step": 641
},
{
"epoch": 0.390273556231003,
"grad_norm": 0.9556507468223572,
"learning_rate": 0.0005,
"loss": 4.421667098999023,
"step": 642
},
{
"epoch": 0.39088145896656534,
"grad_norm": 1.0354868173599243,
"learning_rate": 0.0005,
"loss": 4.573639869689941,
"step": 643
},
{
"epoch": 0.39148936170212767,
"grad_norm": 1.0089163780212402,
"learning_rate": 0.0005,
"loss": 4.505742073059082,
"step": 644
},
{
"epoch": 0.39209726443769,
"grad_norm": 1.098516821861267,
"learning_rate": 0.0005,
"loss": 4.61726713180542,
"step": 645
},
{
"epoch": 0.39270516717325227,
"grad_norm": 1.0022438764572144,
"learning_rate": 0.0005,
"loss": 4.8146162033081055,
"step": 646
},
{
"epoch": 0.3933130699088146,
"grad_norm": 1.219514012336731,
"learning_rate": 0.0005,
"loss": 4.5992279052734375,
"step": 647
},
{
"epoch": 0.3939209726443769,
"grad_norm": 1.0511285066604614,
"learning_rate": 0.0005,
"loss": 4.65933895111084,
"step": 648
},
{
"epoch": 0.3945288753799392,
"grad_norm": 1.0481231212615967,
"learning_rate": 0.0005,
"loss": 4.405591011047363,
"step": 649
},
{
"epoch": 0.3951367781155015,
"grad_norm": 1.1169630289077759,
"learning_rate": 0.0005,
"loss": 4.621652603149414,
"step": 650
},
{
"epoch": 0.39574468085106385,
"grad_norm": 1.031966209411621,
"learning_rate": 0.0005,
"loss": 4.5710320472717285,
"step": 651
},
{
"epoch": 0.3963525835866261,
"grad_norm": 1.1107763051986694,
"learning_rate": 0.0005,
"loss": 4.537693023681641,
"step": 652
},
{
"epoch": 0.39696048632218844,
"grad_norm": 0.9889346957206726,
"learning_rate": 0.0005,
"loss": 4.518610000610352,
"step": 653
},
{
"epoch": 0.39756838905775077,
"grad_norm": 1.1640068292617798,
"learning_rate": 0.0005,
"loss": 4.595146179199219,
"step": 654
},
{
"epoch": 0.3981762917933131,
"grad_norm": 1.2929025888442993,
"learning_rate": 0.0005,
"loss": 4.559798240661621,
"step": 655
},
{
"epoch": 0.39878419452887537,
"grad_norm": 1.098781943321228,
"learning_rate": 0.0005,
"loss": 4.602121353149414,
"step": 656
},
{
"epoch": 0.3993920972644377,
"grad_norm": 1.0199748277664185,
"learning_rate": 0.0005,
"loss": 4.460375785827637,
"step": 657
},
{
"epoch": 0.4,
"grad_norm": 1.4516689777374268,
"learning_rate": 0.0005,
"loss": 4.583429336547852,
"step": 658
},
{
"epoch": 0.4006079027355623,
"grad_norm": 1.0523816347122192,
"learning_rate": 0.0005,
"loss": 4.602944374084473,
"step": 659
},
{
"epoch": 0.4012158054711246,
"grad_norm": 1.052711844444275,
"learning_rate": 0.0005,
"loss": 4.508934020996094,
"step": 660
},
{
"epoch": 0.40182370820668695,
"grad_norm": 1.0846177339553833,
"learning_rate": 0.0005,
"loss": 4.532805442810059,
"step": 661
},
{
"epoch": 0.4024316109422492,
"grad_norm": 0.9877490401268005,
"learning_rate": 0.0005,
"loss": 4.644316673278809,
"step": 662
},
{
"epoch": 0.40303951367781155,
"grad_norm": 1.04659104347229,
"learning_rate": 0.0005,
"loss": 4.376730918884277,
"step": 663
},
{
"epoch": 0.40364741641337387,
"grad_norm": 1.250658392906189,
"learning_rate": 0.0005,
"loss": 4.553335666656494,
"step": 664
},
{
"epoch": 0.40425531914893614,
"grad_norm": 1.1647439002990723,
"learning_rate": 0.0005,
"loss": 4.282361030578613,
"step": 665
},
{
"epoch": 0.40486322188449847,
"grad_norm": 1.086575984954834,
"learning_rate": 0.0005,
"loss": 4.545602798461914,
"step": 666
},
{
"epoch": 0.4054711246200608,
"grad_norm": 1.0094430446624756,
"learning_rate": 0.0005,
"loss": 4.514423370361328,
"step": 667
},
{
"epoch": 0.4060790273556231,
"grad_norm": 1.1341593265533447,
"learning_rate": 0.0005,
"loss": 4.359306812286377,
"step": 668
},
{
"epoch": 0.4066869300911854,
"grad_norm": 1.0556292533874512,
"learning_rate": 0.0005,
"loss": 4.663166046142578,
"step": 669
},
{
"epoch": 0.4072948328267477,
"grad_norm": 0.9918414950370789,
"learning_rate": 0.0005,
"loss": 4.348359107971191,
"step": 670
},
{
"epoch": 0.40790273556231005,
"grad_norm": 1.2771086692810059,
"learning_rate": 0.0005,
"loss": 4.380928993225098,
"step": 671
},
{
"epoch": 0.4085106382978723,
"grad_norm": 1.2792952060699463,
"learning_rate": 0.0005,
"loss": 4.493129253387451,
"step": 672
},
{
"epoch": 0.40911854103343465,
"grad_norm": 1.115451693534851,
"learning_rate": 0.0005,
"loss": 4.5493903160095215,
"step": 673
},
{
"epoch": 0.409726443768997,
"grad_norm": 1.02188241481781,
"learning_rate": 0.0005,
"loss": 4.540634632110596,
"step": 674
},
{
"epoch": 0.41033434650455924,
"grad_norm": 1.1881492137908936,
"learning_rate": 0.0005,
"loss": 4.6216325759887695,
"step": 675
},
{
"epoch": 0.41094224924012157,
"grad_norm": 1.1510716676712036,
"learning_rate": 0.0005,
"loss": 4.753006935119629,
"step": 676
},
{
"epoch": 0.4115501519756839,
"grad_norm": 0.9409204125404358,
"learning_rate": 0.0005,
"loss": 4.558671951293945,
"step": 677
},
{
"epoch": 0.4121580547112462,
"grad_norm": 0.9652894735336304,
"learning_rate": 0.0005,
"loss": 4.586430549621582,
"step": 678
},
{
"epoch": 0.4127659574468085,
"grad_norm": 1.0625907182693481,
"learning_rate": 0.0005,
"loss": 4.467252254486084,
"step": 679
},
{
"epoch": 0.4133738601823708,
"grad_norm": 1.078682780265808,
"learning_rate": 0.0005,
"loss": 4.66164493560791,
"step": 680
},
{
"epoch": 0.41398176291793315,
"grad_norm": 1.0304362773895264,
"learning_rate": 0.0005,
"loss": 4.765620231628418,
"step": 681
},
{
"epoch": 0.4145896656534954,
"grad_norm": 0.9225407242774963,
"learning_rate": 0.0005,
"loss": 4.550148010253906,
"step": 682
},
{
"epoch": 0.41519756838905775,
"grad_norm": 1.0196508169174194,
"learning_rate": 0.0005,
"loss": 4.9098100662231445,
"step": 683
},
{
"epoch": 0.4158054711246201,
"grad_norm": 0.9961191415786743,
"learning_rate": 0.0005,
"loss": 4.4087114334106445,
"step": 684
},
{
"epoch": 0.41641337386018235,
"grad_norm": 1.0987764596939087,
"learning_rate": 0.0005,
"loss": 4.60486364364624,
"step": 685
},
{
"epoch": 0.41702127659574467,
"grad_norm": 1.3485429286956787,
"learning_rate": 0.0005,
"loss": 4.509698390960693,
"step": 686
},
{
"epoch": 0.417629179331307,
"grad_norm": 1.0834795236587524,
"learning_rate": 0.0005,
"loss": 4.131223678588867,
"step": 687
},
{
"epoch": 0.4182370820668693,
"grad_norm": 1.2778581380844116,
"learning_rate": 0.0005,
"loss": 4.530914306640625,
"step": 688
},
{
"epoch": 0.4188449848024316,
"grad_norm": 0.9555144309997559,
"learning_rate": 0.0005,
"loss": 4.773101806640625,
"step": 689
},
{
"epoch": 0.4194528875379939,
"grad_norm": 1.0608127117156982,
"learning_rate": 0.0005,
"loss": 4.457843780517578,
"step": 690
},
{
"epoch": 0.42006079027355625,
"grad_norm": 1.2380342483520508,
"learning_rate": 0.0005,
"loss": 4.438450813293457,
"step": 691
},
{
"epoch": 0.4206686930091185,
"grad_norm": 1.0234472751617432,
"learning_rate": 0.0005,
"loss": 4.412363052368164,
"step": 692
},
{
"epoch": 0.42127659574468085,
"grad_norm": 1.0774229764938354,
"learning_rate": 0.0005,
"loss": 4.687466144561768,
"step": 693
},
{
"epoch": 0.4218844984802432,
"grad_norm": 0.9822944402694702,
"learning_rate": 0.0005,
"loss": 4.798013687133789,
"step": 694
},
{
"epoch": 0.42249240121580545,
"grad_norm": 1.1232951879501343,
"learning_rate": 0.0005,
"loss": 4.548072814941406,
"step": 695
},
{
"epoch": 0.4231003039513678,
"grad_norm": 1.5027856826782227,
"learning_rate": 0.0005,
"loss": 4.7048797607421875,
"step": 696
},
{
"epoch": 0.4237082066869301,
"grad_norm": 1.036541223526001,
"learning_rate": 0.0005,
"loss": 4.6969709396362305,
"step": 697
},
{
"epoch": 0.4243161094224924,
"grad_norm": 1.1823787689208984,
"learning_rate": 0.0005,
"loss": 4.457941055297852,
"step": 698
},
{
"epoch": 0.4249240121580547,
"grad_norm": 0.9230678081512451,
"learning_rate": 0.0005,
"loss": 4.421998500823975,
"step": 699
},
{
"epoch": 0.425531914893617,
"grad_norm": 1.7750741243362427,
"learning_rate": 0.0005,
"loss": 4.76076602935791,
"step": 700
},
{
"epoch": 0.42613981762917935,
"grad_norm": 1.0719808340072632,
"learning_rate": 0.0005,
"loss": 4.580799102783203,
"step": 701
},
{
"epoch": 0.4267477203647416,
"grad_norm": 1.0799646377563477,
"learning_rate": 0.0005,
"loss": 4.311610221862793,
"step": 702
},
{
"epoch": 0.42735562310030395,
"grad_norm": 0.8947767019271851,
"learning_rate": 0.0005,
"loss": 4.4494123458862305,
"step": 703
},
{
"epoch": 0.4279635258358663,
"grad_norm": 1.0298351049423218,
"learning_rate": 0.0005,
"loss": 4.393129348754883,
"step": 704
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.098189115524292,
"learning_rate": 0.0005,
"loss": 4.199446678161621,
"step": 705
},
{
"epoch": 0.4291793313069909,
"grad_norm": 1.112589955329895,
"learning_rate": 0.0005,
"loss": 4.471273422241211,
"step": 706
},
{
"epoch": 0.4297872340425532,
"grad_norm": 1.2152529954910278,
"learning_rate": 0.0005,
"loss": 4.727916240692139,
"step": 707
},
{
"epoch": 0.43039513677811553,
"grad_norm": 1.1162065267562866,
"learning_rate": 0.0005,
"loss": 4.282822132110596,
"step": 708
},
{
"epoch": 0.4310030395136778,
"grad_norm": 1.2259479761123657,
"learning_rate": 0.0005,
"loss": 4.1524977684021,
"step": 709
},
{
"epoch": 0.4316109422492401,
"grad_norm": 1.0089929103851318,
"learning_rate": 0.0005,
"loss": 4.150537490844727,
"step": 710
},
{
"epoch": 0.43221884498480245,
"grad_norm": 0.9101129770278931,
"learning_rate": 0.0005,
"loss": 4.379437446594238,
"step": 711
},
{
"epoch": 0.4328267477203647,
"grad_norm": 0.9849691390991211,
"learning_rate": 0.0005,
"loss": 4.299429893493652,
"step": 712
},
{
"epoch": 0.43343465045592705,
"grad_norm": 0.9956537485122681,
"learning_rate": 0.0005,
"loss": 4.439446926116943,
"step": 713
},
{
"epoch": 0.4340425531914894,
"grad_norm": 1.0646576881408691,
"learning_rate": 0.0005,
"loss": 4.680734634399414,
"step": 714
},
{
"epoch": 0.43465045592705165,
"grad_norm": 1.1268900632858276,
"learning_rate": 0.0005,
"loss": 4.390021324157715,
"step": 715
},
{
"epoch": 0.435258358662614,
"grad_norm": 1.1238709688186646,
"learning_rate": 0.0005,
"loss": 4.414492607116699,
"step": 716
},
{
"epoch": 0.4358662613981763,
"grad_norm": 1.0272475481033325,
"learning_rate": 0.0005,
"loss": 4.48759651184082,
"step": 717
},
{
"epoch": 0.43647416413373863,
"grad_norm": 0.9443128108978271,
"learning_rate": 0.0005,
"loss": 4.241964340209961,
"step": 718
},
{
"epoch": 0.4370820668693009,
"grad_norm": 0.8795979022979736,
"learning_rate": 0.0005,
"loss": 4.438322067260742,
"step": 719
},
{
"epoch": 0.4376899696048632,
"grad_norm": 1.0388433933258057,
"learning_rate": 0.0005,
"loss": 4.499500274658203,
"step": 720
},
{
"epoch": 0.43829787234042555,
"grad_norm": 1.0285965204238892,
"learning_rate": 0.0005,
"loss": 4.458085060119629,
"step": 721
},
{
"epoch": 0.4389057750759878,
"grad_norm": 1.0486245155334473,
"learning_rate": 0.0005,
"loss": 4.3121843338012695,
"step": 722
},
{
"epoch": 0.43951367781155015,
"grad_norm": 0.974229633808136,
"learning_rate": 0.0005,
"loss": 4.484938621520996,
"step": 723
},
{
"epoch": 0.4401215805471125,
"grad_norm": 1.028061032295227,
"learning_rate": 0.0005,
"loss": 4.343748092651367,
"step": 724
},
{
"epoch": 0.44072948328267475,
"grad_norm": 1.247310757637024,
"learning_rate": 0.0005,
"loss": 4.43183708190918,
"step": 725
},
{
"epoch": 0.4413373860182371,
"grad_norm": 1.07508385181427,
"learning_rate": 0.0005,
"loss": 4.473773956298828,
"step": 726
},
{
"epoch": 0.4419452887537994,
"grad_norm": 1.0861989259719849,
"learning_rate": 0.0005,
"loss": 4.50743293762207,
"step": 727
},
{
"epoch": 0.4425531914893617,
"grad_norm": 1.043446660041809,
"learning_rate": 0.0005,
"loss": 4.65224027633667,
"step": 728
},
{
"epoch": 0.443161094224924,
"grad_norm": 1.1153486967086792,
"learning_rate": 0.0005,
"loss": 4.275899887084961,
"step": 729
},
{
"epoch": 0.44376899696048633,
"grad_norm": 1.0387423038482666,
"learning_rate": 0.0005,
"loss": 4.571664333343506,
"step": 730
},
{
"epoch": 0.44437689969604866,
"grad_norm": 1.1121833324432373,
"learning_rate": 0.0005,
"loss": 4.472873687744141,
"step": 731
},
{
"epoch": 0.4449848024316109,
"grad_norm": 1.110357642173767,
"learning_rate": 0.0005,
"loss": 4.507586479187012,
"step": 732
},
{
"epoch": 0.44559270516717325,
"grad_norm": 1.0192921161651611,
"learning_rate": 0.0005,
"loss": 4.614180564880371,
"step": 733
},
{
"epoch": 0.4462006079027356,
"grad_norm": 1.2011562585830688,
"learning_rate": 0.0005,
"loss": 4.410806655883789,
"step": 734
},
{
"epoch": 0.44680851063829785,
"grad_norm": 1.045922040939331,
"learning_rate": 0.0005,
"loss": 4.522254943847656,
"step": 735
},
{
"epoch": 0.4474164133738602,
"grad_norm": 1.1084001064300537,
"learning_rate": 0.0005,
"loss": 4.473600387573242,
"step": 736
},
{
"epoch": 0.4480243161094225,
"grad_norm": 1.0580531358718872,
"learning_rate": 0.0005,
"loss": 4.495148658752441,
"step": 737
},
{
"epoch": 0.4486322188449848,
"grad_norm": 1.0791500806808472,
"learning_rate": 0.0005,
"loss": 4.559470176696777,
"step": 738
},
{
"epoch": 0.4492401215805471,
"grad_norm": 0.9919356107711792,
"learning_rate": 0.0005,
"loss": 4.445730209350586,
"step": 739
},
{
"epoch": 0.44984802431610943,
"grad_norm": 0.9215476512908936,
"learning_rate": 0.0005,
"loss": 4.360682487487793,
"step": 740
},
{
"epoch": 0.45045592705167176,
"grad_norm": 1.1767232418060303,
"learning_rate": 0.0005,
"loss": 4.51902437210083,
"step": 741
},
{
"epoch": 0.451063829787234,
"grad_norm": 1.1746350526809692,
"learning_rate": 0.0005,
"loss": 4.362285614013672,
"step": 742
},
{
"epoch": 0.45167173252279635,
"grad_norm": 1.0243946313858032,
"learning_rate": 0.0005,
"loss": 4.443662166595459,
"step": 743
},
{
"epoch": 0.4522796352583587,
"grad_norm": 1.034515619277954,
"learning_rate": 0.0005,
"loss": 4.329188346862793,
"step": 744
},
{
"epoch": 0.45288753799392095,
"grad_norm": 1.1209111213684082,
"learning_rate": 0.0005,
"loss": 4.6534223556518555,
"step": 745
},
{
"epoch": 0.4534954407294833,
"grad_norm": 1.0455032587051392,
"learning_rate": 0.0005,
"loss": 4.511608600616455,
"step": 746
},
{
"epoch": 0.4541033434650456,
"grad_norm": 1.002439022064209,
"learning_rate": 0.0005,
"loss": 4.4008378982543945,
"step": 747
},
{
"epoch": 0.4547112462006079,
"grad_norm": 0.9780976176261902,
"learning_rate": 0.0005,
"loss": 4.478031158447266,
"step": 748
},
{
"epoch": 0.4553191489361702,
"grad_norm": 1.0394052267074585,
"learning_rate": 0.0005,
"loss": 4.431166648864746,
"step": 749
},
{
"epoch": 0.45592705167173253,
"grad_norm": 1.0838037729263306,
"learning_rate": 0.0005,
"loss": 4.38276481628418,
"step": 750
},
{
"epoch": 0.45653495440729486,
"grad_norm": 1.2306514978408813,
"learning_rate": 0.0005,
"loss": 4.427013874053955,
"step": 751
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.8942012190818787,
"learning_rate": 0.0005,
"loss": 4.463613986968994,
"step": 752
},
{
"epoch": 0.45775075987841946,
"grad_norm": 1.0273581743240356,
"learning_rate": 0.0005,
"loss": 4.331677436828613,
"step": 753
},
{
"epoch": 0.4583586626139818,
"grad_norm": 1.061225414276123,
"learning_rate": 0.0005,
"loss": 4.360611438751221,
"step": 754
},
{
"epoch": 0.45896656534954405,
"grad_norm": 0.9954508543014526,
"learning_rate": 0.0005,
"loss": 4.37364387512207,
"step": 755
},
{
"epoch": 0.4595744680851064,
"grad_norm": 0.9806733131408691,
"learning_rate": 0.0005,
"loss": 4.469931602478027,
"step": 756
},
{
"epoch": 0.4601823708206687,
"grad_norm": 1.131806492805481,
"learning_rate": 0.0005,
"loss": 4.487429618835449,
"step": 757
},
{
"epoch": 0.460790273556231,
"grad_norm": 0.9451801776885986,
"learning_rate": 0.0005,
"loss": 4.476114749908447,
"step": 758
},
{
"epoch": 0.4613981762917933,
"grad_norm": 1.064634084701538,
"learning_rate": 0.0005,
"loss": 4.607744216918945,
"step": 759
},
{
"epoch": 0.46200607902735563,
"grad_norm": 1.0846835374832153,
"learning_rate": 0.0005,
"loss": 4.312438011169434,
"step": 760
},
{
"epoch": 0.46261398176291796,
"grad_norm": 0.9688083529472351,
"learning_rate": 0.0005,
"loss": 4.376931667327881,
"step": 761
},
{
"epoch": 0.46322188449848023,
"grad_norm": 1.1652911901474,
"learning_rate": 0.0005,
"loss": 4.416962623596191,
"step": 762
},
{
"epoch": 0.46382978723404256,
"grad_norm": 1.147851586341858,
"learning_rate": 0.0005,
"loss": 4.349986553192139,
"step": 763
},
{
"epoch": 0.4644376899696049,
"grad_norm": 0.9702697992324829,
"learning_rate": 0.0005,
"loss": 4.497674942016602,
"step": 764
},
{
"epoch": 0.46504559270516715,
"grad_norm": 1.0843515396118164,
"learning_rate": 0.0005,
"loss": 4.49877405166626,
"step": 765
},
{
"epoch": 0.4656534954407295,
"grad_norm": 0.9171056747436523,
"learning_rate": 0.0005,
"loss": 4.1539201736450195,
"step": 766
},
{
"epoch": 0.4662613981762918,
"grad_norm": 1.164944052696228,
"learning_rate": 0.0005,
"loss": 4.509303092956543,
"step": 767
},
{
"epoch": 0.4668693009118541,
"grad_norm": 1.0968433618545532,
"learning_rate": 0.0005,
"loss": 4.4588727951049805,
"step": 768
},
{
"epoch": 0.4674772036474164,
"grad_norm": 1.0154880285263062,
"learning_rate": 0.0005,
"loss": 4.611554145812988,
"step": 769
},
{
"epoch": 0.46808510638297873,
"grad_norm": 0.9653189778327942,
"learning_rate": 0.0005,
"loss": 4.324926376342773,
"step": 770
},
{
"epoch": 0.46869300911854106,
"grad_norm": 1.1051913499832153,
"learning_rate": 0.0005,
"loss": 4.4647111892700195,
"step": 771
},
{
"epoch": 0.46930091185410333,
"grad_norm": 0.9223854541778564,
"learning_rate": 0.0005,
"loss": 4.7103400230407715,
"step": 772
},
{
"epoch": 0.46990881458966566,
"grad_norm": 1.124935507774353,
"learning_rate": 0.0005,
"loss": 4.453402519226074,
"step": 773
},
{
"epoch": 0.470516717325228,
"grad_norm": 1.3314533233642578,
"learning_rate": 0.0005,
"loss": 4.297192573547363,
"step": 774
},
{
"epoch": 0.47112462006079026,
"grad_norm": 1.0218007564544678,
"learning_rate": 0.0005,
"loss": 4.496466159820557,
"step": 775
},
{
"epoch": 0.4717325227963526,
"grad_norm": 1.0308325290679932,
"learning_rate": 0.0005,
"loss": 4.3223772048950195,
"step": 776
},
{
"epoch": 0.4723404255319149,
"grad_norm": 1.1283831596374512,
"learning_rate": 0.0005,
"loss": 4.398843288421631,
"step": 777
},
{
"epoch": 0.4729483282674772,
"grad_norm": 1.1089282035827637,
"learning_rate": 0.0005,
"loss": 4.226986408233643,
"step": 778
},
{
"epoch": 0.4735562310030395,
"grad_norm": 0.9950074553489685,
"learning_rate": 0.0005,
"loss": 4.401683807373047,
"step": 779
},
{
"epoch": 0.47416413373860183,
"grad_norm": 1.1220934391021729,
"learning_rate": 0.0005,
"loss": 4.23845100402832,
"step": 780
},
{
"epoch": 0.47477203647416416,
"grad_norm": 1.1314822435379028,
"learning_rate": 0.0005,
"loss": 4.648829936981201,
"step": 781
},
{
"epoch": 0.47537993920972643,
"grad_norm": 1.0067565441131592,
"learning_rate": 0.0005,
"loss": 4.342182159423828,
"step": 782
},
{
"epoch": 0.47598784194528876,
"grad_norm": 1.4291990995407104,
"learning_rate": 0.0005,
"loss": 4.222455978393555,
"step": 783
},
{
"epoch": 0.4765957446808511,
"grad_norm": 1.0664339065551758,
"learning_rate": 0.0005,
"loss": 4.533761978149414,
"step": 784
},
{
"epoch": 0.47720364741641336,
"grad_norm": 0.837992787361145,
"learning_rate": 0.0005,
"loss": 4.583135604858398,
"step": 785
},
{
"epoch": 0.4778115501519757,
"grad_norm": 1.0775222778320312,
"learning_rate": 0.0005,
"loss": 4.407233238220215,
"step": 786
},
{
"epoch": 0.478419452887538,
"grad_norm": 1.1260716915130615,
"learning_rate": 0.0005,
"loss": 4.408687114715576,
"step": 787
},
{
"epoch": 0.4790273556231003,
"grad_norm": 1.1476800441741943,
"learning_rate": 0.0005,
"loss": 4.5264692306518555,
"step": 788
},
{
"epoch": 0.4796352583586626,
"grad_norm": 1.0624704360961914,
"learning_rate": 0.0005,
"loss": 4.47670316696167,
"step": 789
},
{
"epoch": 0.48024316109422494,
"grad_norm": 1.4008615016937256,
"learning_rate": 0.0005,
"loss": 4.542054653167725,
"step": 790
},
{
"epoch": 0.4808510638297872,
"grad_norm": 1.6348981857299805,
"learning_rate": 0.0005,
"loss": 4.272322654724121,
"step": 791
},
{
"epoch": 0.48145896656534953,
"grad_norm": 1.110823154449463,
"learning_rate": 0.0005,
"loss": 4.32360315322876,
"step": 792
},
{
"epoch": 0.48206686930091186,
"grad_norm": 0.9771617650985718,
"learning_rate": 0.0005,
"loss": 4.4510321617126465,
"step": 793
},
{
"epoch": 0.4826747720364742,
"grad_norm": 1.0948632955551147,
"learning_rate": 0.0005,
"loss": 4.335118293762207,
"step": 794
},
{
"epoch": 0.48328267477203646,
"grad_norm": 1.2692338228225708,
"learning_rate": 0.0005,
"loss": 4.3776655197143555,
"step": 795
},
{
"epoch": 0.4838905775075988,
"grad_norm": 0.8474439978599548,
"learning_rate": 0.0005,
"loss": 4.24397087097168,
"step": 796
},
{
"epoch": 0.4844984802431611,
"grad_norm": 0.9258842468261719,
"learning_rate": 0.0005,
"loss": 4.602321624755859,
"step": 797
},
{
"epoch": 0.4851063829787234,
"grad_norm": 1.1678420305252075,
"learning_rate": 0.0005,
"loss": 4.5578203201293945,
"step": 798
},
{
"epoch": 0.4857142857142857,
"grad_norm": 1.0839719772338867,
"learning_rate": 0.0005,
"loss": 4.4719109535217285,
"step": 799
},
{
"epoch": 0.48632218844984804,
"grad_norm": 1.0721313953399658,
"learning_rate": 0.0005,
"loss": 4.1971516609191895,
"step": 800
},
{
"epoch": 0.4869300911854103,
"grad_norm": 1.077587366104126,
"learning_rate": 0.0005,
"loss": 4.452859401702881,
"step": 801
},
{
"epoch": 0.48753799392097263,
"grad_norm": 0.9456436038017273,
"learning_rate": 0.0005,
"loss": 4.417455673217773,
"step": 802
},
{
"epoch": 0.48814589665653496,
"grad_norm": 0.9326696991920471,
"learning_rate": 0.0005,
"loss": 4.389290809631348,
"step": 803
},
{
"epoch": 0.4887537993920973,
"grad_norm": 1.0423635244369507,
"learning_rate": 0.0005,
"loss": 4.4252448081970215,
"step": 804
},
{
"epoch": 0.48936170212765956,
"grad_norm": 1.0106087923049927,
"learning_rate": 0.0005,
"loss": 4.29632043838501,
"step": 805
},
{
"epoch": 0.4899696048632219,
"grad_norm": 0.8635157942771912,
"learning_rate": 0.0005,
"loss": 4.45654296875,
"step": 806
},
{
"epoch": 0.4905775075987842,
"grad_norm": 0.9637815952301025,
"learning_rate": 0.0005,
"loss": 4.305363655090332,
"step": 807
},
{
"epoch": 0.4911854103343465,
"grad_norm": 0.9523938298225403,
"learning_rate": 0.0005,
"loss": 4.561666965484619,
"step": 808
},
{
"epoch": 0.4917933130699088,
"grad_norm": 1.1045883893966675,
"learning_rate": 0.0005,
"loss": 4.385721206665039,
"step": 809
},
{
"epoch": 0.49240121580547114,
"grad_norm": 0.951117992401123,
"learning_rate": 0.0005,
"loss": 4.302276611328125,
"step": 810
},
{
"epoch": 0.4930091185410334,
"grad_norm": 1.091933250427246,
"learning_rate": 0.0005,
"loss": 4.64669132232666,
"step": 811
},
{
"epoch": 0.49361702127659574,
"grad_norm": 1.0813966989517212,
"learning_rate": 0.0005,
"loss": 4.266849517822266,
"step": 812
},
{
"epoch": 0.49422492401215806,
"grad_norm": 0.9683962464332581,
"learning_rate": 0.0005,
"loss": 4.304372787475586,
"step": 813
},
{
"epoch": 0.4948328267477204,
"grad_norm": 0.960382342338562,
"learning_rate": 0.0005,
"loss": 4.221304416656494,
"step": 814
},
{
"epoch": 0.49544072948328266,
"grad_norm": 0.9746182560920715,
"learning_rate": 0.0005,
"loss": 4.392333030700684,
"step": 815
},
{
"epoch": 0.496048632218845,
"grad_norm": 0.9449917078018188,
"learning_rate": 0.0005,
"loss": 4.274685859680176,
"step": 816
},
{
"epoch": 0.4966565349544073,
"grad_norm": 0.8899694085121155,
"learning_rate": 0.0005,
"loss": 4.206332206726074,
"step": 817
},
{
"epoch": 0.4972644376899696,
"grad_norm": 0.9504559636116028,
"learning_rate": 0.0005,
"loss": 4.2690534591674805,
"step": 818
},
{
"epoch": 0.4978723404255319,
"grad_norm": 0.9823598265647888,
"learning_rate": 0.0005,
"loss": 4.379746437072754,
"step": 819
},
{
"epoch": 0.49848024316109424,
"grad_norm": 1.0227431058883667,
"learning_rate": 0.0005,
"loss": 4.233619213104248,
"step": 820
},
{
"epoch": 0.4990881458966565,
"grad_norm": 0.9714612364768982,
"learning_rate": 0.0005,
"loss": 4.607011795043945,
"step": 821
},
{
"epoch": 0.49969604863221884,
"grad_norm": 0.9920446276664734,
"learning_rate": 0.0005,
"loss": 4.5199127197265625,
"step": 822
},
{
"epoch": 0.5003039513677812,
"grad_norm": 1.0052610635757446,
"learning_rate": 0.0005,
"loss": 4.538883209228516,
"step": 823
},
{
"epoch": 0.5009118541033435,
"grad_norm": 0.961460292339325,
"learning_rate": 0.0005,
"loss": 4.37430477142334,
"step": 824
},
{
"epoch": 0.5015197568389058,
"grad_norm": 0.9705450534820557,
"learning_rate": 0.0005,
"loss": 4.36405611038208,
"step": 825
},
{
"epoch": 0.502127659574468,
"grad_norm": 1.0589666366577148,
"learning_rate": 0.0005,
"loss": 4.532018661499023,
"step": 826
},
{
"epoch": 0.5027355623100304,
"grad_norm": 1.0190895795822144,
"learning_rate": 0.0005,
"loss": 4.366916656494141,
"step": 827
},
{
"epoch": 0.5033434650455927,
"grad_norm": 1.2047783136367798,
"learning_rate": 0.0005,
"loss": 4.332704544067383,
"step": 828
},
{
"epoch": 0.503951367781155,
"grad_norm": 0.9100733995437622,
"learning_rate": 0.0005,
"loss": 4.1280975341796875,
"step": 829
},
{
"epoch": 0.5045592705167173,
"grad_norm": 1.0953924655914307,
"learning_rate": 0.0005,
"loss": 4.338841438293457,
"step": 830
},
{
"epoch": 0.5051671732522797,
"grad_norm": 1.2325948476791382,
"learning_rate": 0.0005,
"loss": 4.425684928894043,
"step": 831
},
{
"epoch": 0.505775075987842,
"grad_norm": 1.0776824951171875,
"learning_rate": 0.0005,
"loss": 4.238302230834961,
"step": 832
},
{
"epoch": 0.5063829787234042,
"grad_norm": 1.002465009689331,
"learning_rate": 0.0005,
"loss": 4.1673173904418945,
"step": 833
},
{
"epoch": 0.5069908814589665,
"grad_norm": 1.0070068836212158,
"learning_rate": 0.0005,
"loss": 4.502063751220703,
"step": 834
},
{
"epoch": 0.5075987841945289,
"grad_norm": 0.9460301995277405,
"learning_rate": 0.0005,
"loss": 4.266294479370117,
"step": 835
},
{
"epoch": 0.5082066869300912,
"grad_norm": 0.9609605669975281,
"learning_rate": 0.0005,
"loss": 4.49836540222168,
"step": 836
},
{
"epoch": 0.5088145896656535,
"grad_norm": 1.0298100709915161,
"learning_rate": 0.0005,
"loss": 4.342093467712402,
"step": 837
},
{
"epoch": 0.5094224924012158,
"grad_norm": 1.102327585220337,
"learning_rate": 0.0005,
"loss": 4.25087833404541,
"step": 838
},
{
"epoch": 0.5100303951367782,
"grad_norm": 1.2569550275802612,
"learning_rate": 0.0005,
"loss": 4.285090446472168,
"step": 839
},
{
"epoch": 0.5106382978723404,
"grad_norm": 1.0138150453567505,
"learning_rate": 0.0005,
"loss": 4.334506034851074,
"step": 840
},
{
"epoch": 0.5112462006079027,
"grad_norm": 1.0152983665466309,
"learning_rate": 0.0005,
"loss": 4.283235549926758,
"step": 841
},
{
"epoch": 0.511854103343465,
"grad_norm": 1.1372138261795044,
"learning_rate": 0.0005,
"loss": 4.07025146484375,
"step": 842
},
{
"epoch": 0.5124620060790274,
"grad_norm": 1.1843246221542358,
"learning_rate": 0.0005,
"loss": 4.353334426879883,
"step": 843
},
{
"epoch": 0.5130699088145897,
"grad_norm": 1.1458396911621094,
"learning_rate": 0.0005,
"loss": 4.34335994720459,
"step": 844
},
{
"epoch": 0.513677811550152,
"grad_norm": 1.0594899654388428,
"learning_rate": 0.0005,
"loss": 4.31781005859375,
"step": 845
},
{
"epoch": 0.5142857142857142,
"grad_norm": 0.844513475894928,
"learning_rate": 0.0005,
"loss": 4.4846577644348145,
"step": 846
},
{
"epoch": 0.5148936170212766,
"grad_norm": 2.6839306354522705,
"learning_rate": 0.0005,
"loss": 4.262670993804932,
"step": 847
},
{
"epoch": 0.5155015197568389,
"grad_norm": 1.0088754892349243,
"learning_rate": 0.0005,
"loss": 4.266050338745117,
"step": 848
},
{
"epoch": 0.5161094224924012,
"grad_norm": 1.0849522352218628,
"learning_rate": 0.0005,
"loss": 4.108889102935791,
"step": 849
},
{
"epoch": 0.5167173252279635,
"grad_norm": 1.0903068780899048,
"learning_rate": 0.0005,
"loss": 4.313821315765381,
"step": 850
},
{
"epoch": 0.5173252279635259,
"grad_norm": 1.1618335247039795,
"learning_rate": 0.0005,
"loss": 4.295135498046875,
"step": 851
},
{
"epoch": 0.5179331306990882,
"grad_norm": 0.9828124046325684,
"learning_rate": 0.0005,
"loss": 4.440587043762207,
"step": 852
},
{
"epoch": 0.5185410334346504,
"grad_norm": 1.131939172744751,
"learning_rate": 0.0005,
"loss": 4.306354522705078,
"step": 853
},
{
"epoch": 0.5191489361702127,
"grad_norm": 1.3951880931854248,
"learning_rate": 0.0005,
"loss": 4.395257949829102,
"step": 854
},
{
"epoch": 0.5197568389057751,
"grad_norm": 1.28059983253479,
"learning_rate": 0.0005,
"loss": 4.033473968505859,
"step": 855
},
{
"epoch": 0.5203647416413374,
"grad_norm": 0.9717862606048584,
"learning_rate": 0.0005,
"loss": 4.356355667114258,
"step": 856
},
{
"epoch": 0.5209726443768997,
"grad_norm": 1.043353796005249,
"learning_rate": 0.0005,
"loss": 4.250835418701172,
"step": 857
},
{
"epoch": 0.521580547112462,
"grad_norm": 1.016579508781433,
"learning_rate": 0.0005,
"loss": 4.286150932312012,
"step": 858
},
{
"epoch": 0.5221884498480243,
"grad_norm": 1.112782597541809,
"learning_rate": 0.0005,
"loss": 4.598012924194336,
"step": 859
},
{
"epoch": 0.5227963525835866,
"grad_norm": 1.1940479278564453,
"learning_rate": 0.0005,
"loss": 4.4383955001831055,
"step": 860
},
{
"epoch": 0.5234042553191489,
"grad_norm": 1.254970669746399,
"learning_rate": 0.0005,
"loss": 4.322863578796387,
"step": 861
},
{
"epoch": 0.5240121580547112,
"grad_norm": 1.0700422525405884,
"learning_rate": 0.0005,
"loss": 4.244253158569336,
"step": 862
},
{
"epoch": 0.5246200607902736,
"grad_norm": 1.0553544759750366,
"learning_rate": 0.0005,
"loss": 4.310792446136475,
"step": 863
},
{
"epoch": 0.5252279635258359,
"grad_norm": 1.0288846492767334,
"learning_rate": 0.0005,
"loss": 4.3274383544921875,
"step": 864
},
{
"epoch": 0.5258358662613982,
"grad_norm": 1.0445955991744995,
"learning_rate": 0.0005,
"loss": 4.45347261428833,
"step": 865
},
{
"epoch": 0.5264437689969604,
"grad_norm": 1.1357736587524414,
"learning_rate": 0.0005,
"loss": 4.4809064865112305,
"step": 866
},
{
"epoch": 0.5270516717325228,
"grad_norm": 1.109326720237732,
"learning_rate": 0.0005,
"loss": 4.253253936767578,
"step": 867
},
{
"epoch": 0.5276595744680851,
"grad_norm": 1.1890736818313599,
"learning_rate": 0.0005,
"loss": 4.426365852355957,
"step": 868
},
{
"epoch": 0.5282674772036474,
"grad_norm": 1.0840505361557007,
"learning_rate": 0.0005,
"loss": 4.321274280548096,
"step": 869
},
{
"epoch": 0.5288753799392097,
"grad_norm": 1.2200610637664795,
"learning_rate": 0.0005,
"loss": 4.557803153991699,
"step": 870
},
{
"epoch": 0.5294832826747721,
"grad_norm": 0.9972710609436035,
"learning_rate": 0.0005,
"loss": 4.23234748840332,
"step": 871
},
{
"epoch": 0.5300911854103344,
"grad_norm": 1.0316972732543945,
"learning_rate": 0.0005,
"loss": 4.139028549194336,
"step": 872
},
{
"epoch": 0.5306990881458966,
"grad_norm": 1.0380617380142212,
"learning_rate": 0.0005,
"loss": 4.348488807678223,
"step": 873
},
{
"epoch": 0.5313069908814589,
"grad_norm": 0.9867698550224304,
"learning_rate": 0.0005,
"loss": 4.302568435668945,
"step": 874
},
{
"epoch": 0.5319148936170213,
"grad_norm": 1.0779541730880737,
"learning_rate": 0.0005,
"loss": 4.425013542175293,
"step": 875
},
{
"epoch": 0.5325227963525836,
"grad_norm": 1.2543246746063232,
"learning_rate": 0.0005,
"loss": 4.724435806274414,
"step": 876
},
{
"epoch": 0.5331306990881459,
"grad_norm": 1.2280689477920532,
"learning_rate": 0.0005,
"loss": 4.2406415939331055,
"step": 877
},
{
"epoch": 0.5337386018237082,
"grad_norm": 1.3842073678970337,
"learning_rate": 0.0005,
"loss": 4.396044731140137,
"step": 878
},
{
"epoch": 0.5343465045592705,
"grad_norm": 1.0350067615509033,
"learning_rate": 0.0005,
"loss": 4.17176628112793,
"step": 879
},
{
"epoch": 0.5349544072948328,
"grad_norm": 0.9484389424324036,
"learning_rate": 0.0005,
"loss": 4.430863380432129,
"step": 880
},
{
"epoch": 0.5355623100303951,
"grad_norm": 1.1557071208953857,
"learning_rate": 0.0005,
"loss": 4.12956428527832,
"step": 881
},
{
"epoch": 0.5361702127659574,
"grad_norm": 0.9079960584640503,
"learning_rate": 0.0005,
"loss": 4.4100542068481445,
"step": 882
},
{
"epoch": 0.5367781155015198,
"grad_norm": 0.9755933880805969,
"learning_rate": 0.0005,
"loss": 4.136897563934326,
"step": 883
},
{
"epoch": 0.5373860182370821,
"grad_norm": 1.0319873094558716,
"learning_rate": 0.0005,
"loss": 4.440415859222412,
"step": 884
},
{
"epoch": 0.5379939209726444,
"grad_norm": 0.8542789220809937,
"learning_rate": 0.0005,
"loss": 4.413039207458496,
"step": 885
},
{
"epoch": 0.5386018237082066,
"grad_norm": 1.0158871412277222,
"learning_rate": 0.0005,
"loss": 4.379025459289551,
"step": 886
},
{
"epoch": 0.539209726443769,
"grad_norm": 0.8926265835762024,
"learning_rate": 0.0005,
"loss": 4.344198226928711,
"step": 887
},
{
"epoch": 0.5398176291793313,
"grad_norm": 0.8857081532478333,
"learning_rate": 0.0005,
"loss": 4.38722562789917,
"step": 888
},
{
"epoch": 0.5404255319148936,
"grad_norm": 0.9595281481742859,
"learning_rate": 0.0005,
"loss": 4.3452959060668945,
"step": 889
},
{
"epoch": 0.541033434650456,
"grad_norm": 0.9428173303604126,
"learning_rate": 0.0005,
"loss": 4.258479118347168,
"step": 890
},
{
"epoch": 0.5416413373860183,
"grad_norm": 1.5479097366333008,
"learning_rate": 0.0005,
"loss": 4.245420455932617,
"step": 891
},
{
"epoch": 0.5422492401215806,
"grad_norm": 1.1619681119918823,
"learning_rate": 0.0005,
"loss": 4.385200500488281,
"step": 892
},
{
"epoch": 0.5428571428571428,
"grad_norm": 0.9958190321922302,
"learning_rate": 0.0005,
"loss": 4.102227687835693,
"step": 893
},
{
"epoch": 0.5434650455927051,
"grad_norm": 1.0156055688858032,
"learning_rate": 0.0005,
"loss": 4.067695617675781,
"step": 894
},
{
"epoch": 0.5440729483282675,
"grad_norm": 1.1579831838607788,
"learning_rate": 0.0005,
"loss": 4.48448371887207,
"step": 895
},
{
"epoch": 0.5446808510638298,
"grad_norm": 1.23504638671875,
"learning_rate": 0.0005,
"loss": 4.5317583084106445,
"step": 896
},
{
"epoch": 0.5452887537993921,
"grad_norm": 1.167401909828186,
"learning_rate": 0.0005,
"loss": 4.3551435470581055,
"step": 897
},
{
"epoch": 0.5458966565349544,
"grad_norm": 1.4126181602478027,
"learning_rate": 0.0005,
"loss": 4.373387813568115,
"step": 898
},
{
"epoch": 0.5465045592705167,
"grad_norm": 1.152944564819336,
"learning_rate": 0.0005,
"loss": 3.9645819664001465,
"step": 899
},
{
"epoch": 0.547112462006079,
"grad_norm": 1.5390210151672363,
"learning_rate": 0.0005,
"loss": 4.454073429107666,
"step": 900
},
{
"epoch": 0.5477203647416413,
"grad_norm": 1.0349818468093872,
"learning_rate": 0.0005,
"loss": 4.398721694946289,
"step": 901
},
{
"epoch": 0.5483282674772036,
"grad_norm": 1.0963656902313232,
"learning_rate": 0.0005,
"loss": 4.0993242263793945,
"step": 902
},
{
"epoch": 0.548936170212766,
"grad_norm": 1.1737645864486694,
"learning_rate": 0.0005,
"loss": 4.228819370269775,
"step": 903
},
{
"epoch": 0.5495440729483283,
"grad_norm": 1.1499532461166382,
"learning_rate": 0.0005,
"loss": 4.329497337341309,
"step": 904
},
{
"epoch": 0.5501519756838906,
"grad_norm": 1.1188825368881226,
"learning_rate": 0.0005,
"loss": 4.631438255310059,
"step": 905
},
{
"epoch": 0.5507598784194528,
"grad_norm": 1.0337425470352173,
"learning_rate": 0.0005,
"loss": 4.373821258544922,
"step": 906
},
{
"epoch": 0.5513677811550152,
"grad_norm": 1.098497986793518,
"learning_rate": 0.0005,
"loss": 4.344779014587402,
"step": 907
},
{
"epoch": 0.5519756838905775,
"grad_norm": 1.0316400527954102,
"learning_rate": 0.0005,
"loss": 4.097405910491943,
"step": 908
},
{
"epoch": 0.5525835866261398,
"grad_norm": 1.0182708501815796,
"learning_rate": 0.0005,
"loss": 4.353028297424316,
"step": 909
},
{
"epoch": 0.5531914893617021,
"grad_norm": 1.2190346717834473,
"learning_rate": 0.0005,
"loss": 4.165225028991699,
"step": 910
},
{
"epoch": 0.5537993920972645,
"grad_norm": 1.017309546470642,
"learning_rate": 0.0005,
"loss": 4.318220138549805,
"step": 911
},
{
"epoch": 0.5544072948328268,
"grad_norm": 1.1314797401428223,
"learning_rate": 0.0005,
"loss": 4.121149063110352,
"step": 912
},
{
"epoch": 0.555015197568389,
"grad_norm": 1.0844316482543945,
"learning_rate": 0.0005,
"loss": 4.213968276977539,
"step": 913
},
{
"epoch": 0.5556231003039513,
"grad_norm": 0.9382945895195007,
"learning_rate": 0.0005,
"loss": 4.214629650115967,
"step": 914
},
{
"epoch": 0.5562310030395137,
"grad_norm": 1.245742678642273,
"learning_rate": 0.0005,
"loss": 4.122774124145508,
"step": 915
},
{
"epoch": 0.556838905775076,
"grad_norm": 1.095625877380371,
"learning_rate": 0.0005,
"loss": 4.296173095703125,
"step": 916
},
{
"epoch": 0.5574468085106383,
"grad_norm": 1.0720239877700806,
"learning_rate": 0.0005,
"loss": 4.165585994720459,
"step": 917
},
{
"epoch": 0.5580547112462007,
"grad_norm": 1.1082829236984253,
"learning_rate": 0.0005,
"loss": 4.3951921463012695,
"step": 918
},
{
"epoch": 0.5586626139817629,
"grad_norm": 1.1302635669708252,
"learning_rate": 0.0005,
"loss": 4.336912155151367,
"step": 919
},
{
"epoch": 0.5592705167173252,
"grad_norm": 0.9658374786376953,
"learning_rate": 0.0005,
"loss": 4.145001411437988,
"step": 920
},
{
"epoch": 0.5598784194528875,
"grad_norm": 1.2869893312454224,
"learning_rate": 0.0005,
"loss": 4.438281536102295,
"step": 921
},
{
"epoch": 0.5604863221884498,
"grad_norm": 0.9351769089698792,
"learning_rate": 0.0005,
"loss": 4.342588424682617,
"step": 922
},
{
"epoch": 0.5610942249240122,
"grad_norm": 1.075165867805481,
"learning_rate": 0.0005,
"loss": 4.3024187088012695,
"step": 923
},
{
"epoch": 0.5617021276595745,
"grad_norm": 1.0462286472320557,
"learning_rate": 0.0005,
"loss": 4.308043479919434,
"step": 924
},
{
"epoch": 0.5623100303951368,
"grad_norm": 1.1331902742385864,
"learning_rate": 0.0005,
"loss": 4.122361183166504,
"step": 925
},
{
"epoch": 0.562917933130699,
"grad_norm": 1.0483379364013672,
"learning_rate": 0.0005,
"loss": 4.140399932861328,
"step": 926
},
{
"epoch": 0.5635258358662614,
"grad_norm": 1.0775599479675293,
"learning_rate": 0.0005,
"loss": 4.258686065673828,
"step": 927
},
{
"epoch": 0.5641337386018237,
"grad_norm": 1.1621100902557373,
"learning_rate": 0.0005,
"loss": 4.177057266235352,
"step": 928
},
{
"epoch": 0.564741641337386,
"grad_norm": 1.144015908241272,
"learning_rate": 0.0005,
"loss": 3.854235887527466,
"step": 929
},
{
"epoch": 0.5653495440729484,
"grad_norm": 1.0188685655593872,
"learning_rate": 0.0005,
"loss": 4.226658821105957,
"step": 930
},
{
"epoch": 0.5659574468085107,
"grad_norm": 1.214069128036499,
"learning_rate": 0.0005,
"loss": 4.558093070983887,
"step": 931
},
{
"epoch": 0.5665653495440729,
"grad_norm": 1.0221775770187378,
"learning_rate": 0.0005,
"loss": 4.362860202789307,
"step": 932
},
{
"epoch": 0.5671732522796352,
"grad_norm": 1.1003692150115967,
"learning_rate": 0.0005,
"loss": 4.412820339202881,
"step": 933
},
{
"epoch": 0.5677811550151975,
"grad_norm": 1.0189692974090576,
"learning_rate": 0.0005,
"loss": 4.141862392425537,
"step": 934
},
{
"epoch": 0.5683890577507599,
"grad_norm": 1.1275514364242554,
"learning_rate": 0.0005,
"loss": 4.0759077072143555,
"step": 935
},
{
"epoch": 0.5689969604863222,
"grad_norm": 1.0595769882202148,
"learning_rate": 0.0005,
"loss": 4.234007835388184,
"step": 936
},
{
"epoch": 0.5696048632218845,
"grad_norm": 1.0620779991149902,
"learning_rate": 0.0005,
"loss": 4.242690086364746,
"step": 937
},
{
"epoch": 0.5702127659574469,
"grad_norm": 1.0344425439834595,
"learning_rate": 0.0005,
"loss": 4.393516540527344,
"step": 938
},
{
"epoch": 0.5708206686930091,
"grad_norm": 1.1058911085128784,
"learning_rate": 0.0005,
"loss": 4.163288116455078,
"step": 939
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.38120698928833,
"learning_rate": 0.0005,
"loss": 4.052524566650391,
"step": 940
},
{
"epoch": 0.5720364741641337,
"grad_norm": 1.0876317024230957,
"learning_rate": 0.0005,
"loss": 4.03524112701416,
"step": 941
},
{
"epoch": 0.572644376899696,
"grad_norm": 1.0367745161056519,
"learning_rate": 0.0005,
"loss": 4.183863639831543,
"step": 942
},
{
"epoch": 0.5732522796352584,
"grad_norm": 1.008543848991394,
"learning_rate": 0.0005,
"loss": 4.219581127166748,
"step": 943
},
{
"epoch": 0.5738601823708207,
"grad_norm": 1.0349946022033691,
"learning_rate": 0.0005,
"loss": 4.3019561767578125,
"step": 944
},
{
"epoch": 0.574468085106383,
"grad_norm": 1.5670639276504517,
"learning_rate": 0.0005,
"loss": 4.330730438232422,
"step": 945
},
{
"epoch": 0.5750759878419452,
"grad_norm": 1.0402114391326904,
"learning_rate": 0.0005,
"loss": 4.131731033325195,
"step": 946
},
{
"epoch": 0.5756838905775076,
"grad_norm": 1.092549204826355,
"learning_rate": 0.0005,
"loss": 4.311880111694336,
"step": 947
},
{
"epoch": 0.5762917933130699,
"grad_norm": 0.968338668346405,
"learning_rate": 0.0005,
"loss": 4.174734115600586,
"step": 948
},
{
"epoch": 0.5768996960486322,
"grad_norm": 1.0552120208740234,
"learning_rate": 0.0005,
"loss": 3.9222404956817627,
"step": 949
},
{
"epoch": 0.5775075987841946,
"grad_norm": 1.0390139818191528,
"learning_rate": 0.0005,
"loss": 4.248430252075195,
"step": 950
},
{
"epoch": 0.5781155015197569,
"grad_norm": 1.1151931285858154,
"learning_rate": 0.0005,
"loss": 4.177859306335449,
"step": 951
},
{
"epoch": 0.5787234042553191,
"grad_norm": 1.0557676553726196,
"learning_rate": 0.0005,
"loss": 4.173686981201172,
"step": 952
},
{
"epoch": 0.5793313069908814,
"grad_norm": 1.133589506149292,
"learning_rate": 0.0005,
"loss": 4.227142333984375,
"step": 953
},
{
"epoch": 0.5799392097264437,
"grad_norm": 1.12785804271698,
"learning_rate": 0.0005,
"loss": 4.077308654785156,
"step": 954
},
{
"epoch": 0.5805471124620061,
"grad_norm": 1.0380632877349854,
"learning_rate": 0.0005,
"loss": 4.485074996948242,
"step": 955
},
{
"epoch": 0.5811550151975684,
"grad_norm": 1.0573036670684814,
"learning_rate": 0.0005,
"loss": 4.045351028442383,
"step": 956
},
{
"epoch": 0.5817629179331307,
"grad_norm": 1.0433647632598877,
"learning_rate": 0.0005,
"loss": 3.9277734756469727,
"step": 957
},
{
"epoch": 0.5823708206686931,
"grad_norm": 1.077911376953125,
"learning_rate": 0.0005,
"loss": 4.329649448394775,
"step": 958
},
{
"epoch": 0.5829787234042553,
"grad_norm": 0.9521039128303528,
"learning_rate": 0.0005,
"loss": 4.175987720489502,
"step": 959
},
{
"epoch": 0.5835866261398176,
"grad_norm": 1.0778512954711914,
"learning_rate": 0.0005,
"loss": 4.32703971862793,
"step": 960
},
{
"epoch": 0.5841945288753799,
"grad_norm": 1.048074722290039,
"learning_rate": 0.0005,
"loss": 4.146064758300781,
"step": 961
},
{
"epoch": 0.5848024316109423,
"grad_norm": 1.0995032787322998,
"learning_rate": 0.0005,
"loss": 4.317961692810059,
"step": 962
},
{
"epoch": 0.5854103343465046,
"grad_norm": 1.1812586784362793,
"learning_rate": 0.0005,
"loss": 4.266629219055176,
"step": 963
},
{
"epoch": 0.5860182370820669,
"grad_norm": 1.2058099508285522,
"learning_rate": 0.0005,
"loss": 4.350966930389404,
"step": 964
},
{
"epoch": 0.5866261398176292,
"grad_norm": 1.1499630212783813,
"learning_rate": 0.0005,
"loss": 4.47420072555542,
"step": 965
},
{
"epoch": 0.5872340425531914,
"grad_norm": 1.212178111076355,
"learning_rate": 0.0005,
"loss": 4.201877593994141,
"step": 966
},
{
"epoch": 0.5878419452887538,
"grad_norm": 1.0750401020050049,
"learning_rate": 0.0005,
"loss": 4.032867431640625,
"step": 967
},
{
"epoch": 0.5884498480243161,
"grad_norm": 1.0766054391860962,
"learning_rate": 0.0005,
"loss": 4.070111274719238,
"step": 968
},
{
"epoch": 0.5890577507598784,
"grad_norm": 1.0466876029968262,
"learning_rate": 0.0005,
"loss": 4.164140224456787,
"step": 969
},
{
"epoch": 0.5896656534954408,
"grad_norm": 0.9755964875221252,
"learning_rate": 0.0005,
"loss": 4.061018943786621,
"step": 970
},
{
"epoch": 0.5902735562310031,
"grad_norm": 1.0774449110031128,
"learning_rate": 0.0005,
"loss": 4.103540420532227,
"step": 971
},
{
"epoch": 0.5908814589665653,
"grad_norm": 1.016599178314209,
"learning_rate": 0.0005,
"loss": 4.367238998413086,
"step": 972
},
{
"epoch": 0.5914893617021276,
"grad_norm": 1.273015022277832,
"learning_rate": 0.0005,
"loss": 4.130205154418945,
"step": 973
},
{
"epoch": 0.59209726443769,
"grad_norm": 1.121202826499939,
"learning_rate": 0.0005,
"loss": 4.058278560638428,
"step": 974
},
{
"epoch": 0.5927051671732523,
"grad_norm": 1.782248854637146,
"learning_rate": 0.0005,
"loss": 4.2193732261657715,
"step": 975
},
{
"epoch": 0.5933130699088146,
"grad_norm": 1.2525842189788818,
"learning_rate": 0.0005,
"loss": 4.324434757232666,
"step": 976
},
{
"epoch": 0.5939209726443769,
"grad_norm": 0.9859209656715393,
"learning_rate": 0.0005,
"loss": 4.235608100891113,
"step": 977
},
{
"epoch": 0.5945288753799393,
"grad_norm": 1.0421037673950195,
"learning_rate": 0.0005,
"loss": 4.312819480895996,
"step": 978
},
{
"epoch": 0.5951367781155015,
"grad_norm": 1.2486640214920044,
"learning_rate": 0.0005,
"loss": 4.172330856323242,
"step": 979
},
{
"epoch": 0.5957446808510638,
"grad_norm": 1.049641489982605,
"learning_rate": 0.0005,
"loss": 4.053893089294434,
"step": 980
},
{
"epoch": 0.5963525835866261,
"grad_norm": 1.0123006105422974,
"learning_rate": 0.0005,
"loss": 4.453596115112305,
"step": 981
},
{
"epoch": 0.5969604863221885,
"grad_norm": 0.9871963858604431,
"learning_rate": 0.0005,
"loss": 4.007091999053955,
"step": 982
},
{
"epoch": 0.5975683890577508,
"grad_norm": 0.9984953999519348,
"learning_rate": 0.0005,
"loss": 4.22979736328125,
"step": 983
},
{
"epoch": 0.5981762917933131,
"grad_norm": 1.281544804573059,
"learning_rate": 0.0005,
"loss": 4.074709892272949,
"step": 984
},
{
"epoch": 0.5987841945288754,
"grad_norm": 1.1482913494110107,
"learning_rate": 0.0005,
"loss": 4.320782661437988,
"step": 985
},
{
"epoch": 0.5993920972644377,
"grad_norm": 1.2105413675308228,
"learning_rate": 0.0005,
"loss": 4.15565299987793,
"step": 986
},
{
"epoch": 0.6,
"grad_norm": 1.0716112852096558,
"learning_rate": 0.0005,
"loss": 4.429147720336914,
"step": 987
},
{
"epoch": 0.6006079027355623,
"grad_norm": 1.1487056016921997,
"learning_rate": 0.0005,
"loss": 4.206772804260254,
"step": 988
},
{
"epoch": 0.6012158054711246,
"grad_norm": 0.9919009208679199,
"learning_rate": 0.0005,
"loss": 4.105408191680908,
"step": 989
},
{
"epoch": 0.601823708206687,
"grad_norm": 1.1244338750839233,
"learning_rate": 0.0005,
"loss": 4.034040451049805,
"step": 990
},
{
"epoch": 0.6024316109422493,
"grad_norm": 0.9693543910980225,
"learning_rate": 0.0005,
"loss": 3.8358006477355957,
"step": 991
},
{
"epoch": 0.6030395136778115,
"grad_norm": 1.147226333618164,
"learning_rate": 0.0005,
"loss": 4.114927291870117,
"step": 992
},
{
"epoch": 0.6036474164133738,
"grad_norm": 1.1658263206481934,
"learning_rate": 0.0005,
"loss": 4.3732099533081055,
"step": 993
},
{
"epoch": 0.6042553191489362,
"grad_norm": 1.1261506080627441,
"learning_rate": 0.0005,
"loss": 4.212404251098633,
"step": 994
},
{
"epoch": 0.6048632218844985,
"grad_norm": 1.214408040046692,
"learning_rate": 0.0005,
"loss": 4.113962173461914,
"step": 995
},
{
"epoch": 0.6054711246200608,
"grad_norm": 1.1703499555587769,
"learning_rate": 0.0005,
"loss": 3.9795780181884766,
"step": 996
},
{
"epoch": 0.6060790273556231,
"grad_norm": 1.2819421291351318,
"learning_rate": 0.0005,
"loss": 3.820543050765991,
"step": 997
},
{
"epoch": 0.6066869300911855,
"grad_norm": 1.1751822233200073,
"learning_rate": 0.0005,
"loss": 4.115008354187012,
"step": 998
},
{
"epoch": 0.6072948328267477,
"grad_norm": 1.133631944656372,
"learning_rate": 0.0005,
"loss": 4.320215225219727,
"step": 999
},
{
"epoch": 0.60790273556231,
"grad_norm": 1.2056914567947388,
"learning_rate": 0.0005,
"loss": 4.139728546142578,
"step": 1000
},
{
"epoch": 0.6085106382978723,
"grad_norm": 1.1610949039459229,
"learning_rate": 0.0005,
"loss": 4.215843200683594,
"step": 1001
},
{
"epoch": 0.6091185410334347,
"grad_norm": 1.2171114683151245,
"learning_rate": 0.0005,
"loss": 4.104484558105469,
"step": 1002
},
{
"epoch": 0.609726443768997,
"grad_norm": 1.0760419368743896,
"learning_rate": 0.0005,
"loss": 4.106335639953613,
"step": 1003
},
{
"epoch": 0.6103343465045593,
"grad_norm": 1.0737935304641724,
"learning_rate": 0.0005,
"loss": 4.18284797668457,
"step": 1004
},
{
"epoch": 0.6109422492401215,
"grad_norm": 1.0054482221603394,
"learning_rate": 0.0005,
"loss": 4.185699462890625,
"step": 1005
},
{
"epoch": 0.6115501519756839,
"grad_norm": 1.0817815065383911,
"learning_rate": 0.0005,
"loss": 3.9077231884002686,
"step": 1006
},
{
"epoch": 0.6121580547112462,
"grad_norm": 1.1520154476165771,
"learning_rate": 0.0005,
"loss": 4.094099044799805,
"step": 1007
},
{
"epoch": 0.6127659574468085,
"grad_norm": 0.9844207763671875,
"learning_rate": 0.0005,
"loss": 4.341885566711426,
"step": 1008
},
{
"epoch": 0.6133738601823708,
"grad_norm": 1.2627776861190796,
"learning_rate": 0.0005,
"loss": 4.28475284576416,
"step": 1009
},
{
"epoch": 0.6139817629179332,
"grad_norm": 0.9542902112007141,
"learning_rate": 0.0005,
"loss": 4.2372026443481445,
"step": 1010
},
{
"epoch": 0.6145896656534955,
"grad_norm": 3.645486831665039,
"learning_rate": 0.0005,
"loss": 4.06125545501709,
"step": 1011
},
{
"epoch": 0.6151975683890577,
"grad_norm": 1.4817546606063843,
"learning_rate": 0.0005,
"loss": 3.9809517860412598,
"step": 1012
},
{
"epoch": 0.61580547112462,
"grad_norm": 1.1932374238967896,
"learning_rate": 0.0005,
"loss": 4.242306232452393,
"step": 1013
},
{
"epoch": 0.6164133738601824,
"grad_norm": 0.9499757289886475,
"learning_rate": 0.0005,
"loss": 3.9819726943969727,
"step": 1014
},
{
"epoch": 0.6170212765957447,
"grad_norm": 1.1981247663497925,
"learning_rate": 0.0005,
"loss": 4.266401290893555,
"step": 1015
},
{
"epoch": 0.617629179331307,
"grad_norm": 1.2060346603393555,
"learning_rate": 0.0005,
"loss": 4.270205497741699,
"step": 1016
},
{
"epoch": 0.6182370820668693,
"grad_norm": 1.002508282661438,
"learning_rate": 0.0005,
"loss": 4.509585380554199,
"step": 1017
},
{
"epoch": 0.6188449848024317,
"grad_norm": 1.0094107389450073,
"learning_rate": 0.0005,
"loss": 4.162940979003906,
"step": 1018
},
{
"epoch": 0.6194528875379939,
"grad_norm": 1.180220365524292,
"learning_rate": 0.0005,
"loss": 4.317109107971191,
"step": 1019
},
{
"epoch": 0.6200607902735562,
"grad_norm": 0.980454683303833,
"learning_rate": 0.0005,
"loss": 4.042284965515137,
"step": 1020
},
{
"epoch": 0.6206686930091185,
"grad_norm": 1.0461052656173706,
"learning_rate": 0.0005,
"loss": 4.0409698486328125,
"step": 1021
},
{
"epoch": 0.6212765957446809,
"grad_norm": 1.0268027782440186,
"learning_rate": 0.0005,
"loss": 4.10588264465332,
"step": 1022
},
{
"epoch": 0.6218844984802432,
"grad_norm": 0.9659956693649292,
"learning_rate": 0.0005,
"loss": 4.511264801025391,
"step": 1023
},
{
"epoch": 0.6224924012158055,
"grad_norm": 1.0161465406417847,
"learning_rate": 0.0005,
"loss": 4.369597911834717,
"step": 1024
},
{
"epoch": 0.6231003039513677,
"grad_norm": 1.145430326461792,
"learning_rate": 0.0005,
"loss": 4.104186058044434,
"step": 1025
},
{
"epoch": 0.6237082066869301,
"grad_norm": 0.968573808670044,
"learning_rate": 0.0005,
"loss": 4.03414249420166,
"step": 1026
},
{
"epoch": 0.6243161094224924,
"grad_norm": 1.2972266674041748,
"learning_rate": 0.0005,
"loss": 4.18367862701416,
"step": 1027
},
{
"epoch": 0.6249240121580547,
"grad_norm": 0.9075741171836853,
"learning_rate": 0.0005,
"loss": 4.101839065551758,
"step": 1028
},
{
"epoch": 0.625531914893617,
"grad_norm": 1.2480190992355347,
"learning_rate": 0.0005,
"loss": 4.170825004577637,
"step": 1029
},
{
"epoch": 0.6261398176291794,
"grad_norm": 1.1662267446517944,
"learning_rate": 0.0005,
"loss": 4.132046222686768,
"step": 1030
},
{
"epoch": 0.6267477203647417,
"grad_norm": 0.9081514477729797,
"learning_rate": 0.0005,
"loss": 4.023431777954102,
"step": 1031
},
{
"epoch": 0.6273556231003039,
"grad_norm": 1.1570264101028442,
"learning_rate": 0.0005,
"loss": 4.246901512145996,
"step": 1032
},
{
"epoch": 0.6279635258358662,
"grad_norm": 1.0261447429656982,
"learning_rate": 0.0005,
"loss": 4.251025199890137,
"step": 1033
},
{
"epoch": 0.6285714285714286,
"grad_norm": 0.9957416653633118,
"learning_rate": 0.0005,
"loss": 4.112504482269287,
"step": 1034
},
{
"epoch": 0.6291793313069909,
"grad_norm": 1.2634888887405396,
"learning_rate": 0.0005,
"loss": 4.397002220153809,
"step": 1035
},
{
"epoch": 0.6297872340425532,
"grad_norm": 1.0848995447158813,
"learning_rate": 0.0005,
"loss": 4.163301467895508,
"step": 1036
},
{
"epoch": 0.6303951367781155,
"grad_norm": 1.0806390047073364,
"learning_rate": 0.0005,
"loss": 3.78402042388916,
"step": 1037
},
{
"epoch": 0.6310030395136779,
"grad_norm": 1.0640003681182861,
"learning_rate": 0.0005,
"loss": 4.0556440353393555,
"step": 1038
},
{
"epoch": 0.6316109422492401,
"grad_norm": 0.9620634317398071,
"learning_rate": 0.0005,
"loss": 4.19709587097168,
"step": 1039
},
{
"epoch": 0.6322188449848024,
"grad_norm": 1.4484918117523193,
"learning_rate": 0.0005,
"loss": 4.058507442474365,
"step": 1040
},
{
"epoch": 0.6328267477203647,
"grad_norm": 1.219489574432373,
"learning_rate": 0.0005,
"loss": 4.1419267654418945,
"step": 1041
},
{
"epoch": 0.6334346504559271,
"grad_norm": 1.127636194229126,
"learning_rate": 0.0005,
"loss": 3.892014980316162,
"step": 1042
},
{
"epoch": 0.6340425531914894,
"grad_norm": 1.326476812362671,
"learning_rate": 0.0005,
"loss": 4.128079414367676,
"step": 1043
},
{
"epoch": 0.6346504559270517,
"grad_norm": 1.1010375022888184,
"learning_rate": 0.0005,
"loss": 3.898940086364746,
"step": 1044
},
{
"epoch": 0.6352583586626139,
"grad_norm": 1.1064268350601196,
"learning_rate": 0.0005,
"loss": 4.141763687133789,
"step": 1045
},
{
"epoch": 0.6358662613981763,
"grad_norm": 1.24687659740448,
"learning_rate": 0.0005,
"loss": 4.210533618927002,
"step": 1046
},
{
"epoch": 0.6364741641337386,
"grad_norm": 1.0071916580200195,
"learning_rate": 0.0005,
"loss": 4.255558013916016,
"step": 1047
},
{
"epoch": 0.6370820668693009,
"grad_norm": 1.0620638132095337,
"learning_rate": 0.0005,
"loss": 4.008969306945801,
"step": 1048
},
{
"epoch": 0.6376899696048632,
"grad_norm": 1.0604190826416016,
"learning_rate": 0.0005,
"loss": 4.224608421325684,
"step": 1049
},
{
"epoch": 0.6382978723404256,
"grad_norm": 1.032774567604065,
"learning_rate": 0.0005,
"loss": 4.131565093994141,
"step": 1050
},
{
"epoch": 0.6389057750759879,
"grad_norm": 0.9236063361167908,
"learning_rate": 0.0005,
"loss": 4.309024333953857,
"step": 1051
},
{
"epoch": 0.6395136778115501,
"grad_norm": 1.059757947921753,
"learning_rate": 0.0005,
"loss": 4.041001319885254,
"step": 1052
},
{
"epoch": 0.6401215805471124,
"grad_norm": 1.1099759340286255,
"learning_rate": 0.0005,
"loss": 3.9661004543304443,
"step": 1053
},
{
"epoch": 0.6407294832826748,
"grad_norm": 1.0091055631637573,
"learning_rate": 0.0005,
"loss": 3.987016439437866,
"step": 1054
},
{
"epoch": 0.6413373860182371,
"grad_norm": 1.1090649366378784,
"learning_rate": 0.0005,
"loss": 4.068497657775879,
"step": 1055
},
{
"epoch": 0.6419452887537994,
"grad_norm": 1.0738252401351929,
"learning_rate": 0.0005,
"loss": 3.9846339225769043,
"step": 1056
},
{
"epoch": 0.6425531914893617,
"grad_norm": 1.1196277141571045,
"learning_rate": 0.0005,
"loss": 4.063312530517578,
"step": 1057
},
{
"epoch": 0.643161094224924,
"grad_norm": 1.2615549564361572,
"learning_rate": 0.0005,
"loss": 3.9986069202423096,
"step": 1058
},
{
"epoch": 0.6437689969604863,
"grad_norm": 1.49628746509552,
"learning_rate": 0.0005,
"loss": 4.1674394607543945,
"step": 1059
},
{
"epoch": 0.6443768996960486,
"grad_norm": 1.279189109802246,
"learning_rate": 0.0005,
"loss": 3.8027124404907227,
"step": 1060
},
{
"epoch": 0.6449848024316109,
"grad_norm": 1.1228110790252686,
"learning_rate": 0.0005,
"loss": 3.7935433387756348,
"step": 1061
},
{
"epoch": 0.6455927051671733,
"grad_norm": 1.082332730293274,
"learning_rate": 0.0005,
"loss": 4.039803981781006,
"step": 1062
},
{
"epoch": 0.6462006079027356,
"grad_norm": 0.9758466482162476,
"learning_rate": 0.0005,
"loss": 4.102064609527588,
"step": 1063
},
{
"epoch": 0.6468085106382979,
"grad_norm": 1.0097397565841675,
"learning_rate": 0.0005,
"loss": 4.058742523193359,
"step": 1064
},
{
"epoch": 0.6474164133738601,
"grad_norm": 1.0726414918899536,
"learning_rate": 0.0005,
"loss": 4.0242133140563965,
"step": 1065
},
{
"epoch": 0.6480243161094225,
"grad_norm": 1.107040524482727,
"learning_rate": 0.0005,
"loss": 3.9720733165740967,
"step": 1066
},
{
"epoch": 0.6486322188449848,
"grad_norm": 1.258399248123169,
"learning_rate": 0.0005,
"loss": 3.85103178024292,
"step": 1067
},
{
"epoch": 0.6492401215805471,
"grad_norm": 1.215524435043335,
"learning_rate": 0.0005,
"loss": 4.162143707275391,
"step": 1068
},
{
"epoch": 0.6498480243161094,
"grad_norm": 1.0505629777908325,
"learning_rate": 0.0005,
"loss": 4.23874568939209,
"step": 1069
},
{
"epoch": 0.6504559270516718,
"grad_norm": 1.2580337524414062,
"learning_rate": 0.0005,
"loss": 4.126619338989258,
"step": 1070
},
{
"epoch": 0.6510638297872341,
"grad_norm": 1.1980527639389038,
"learning_rate": 0.0005,
"loss": 4.011953353881836,
"step": 1071
},
{
"epoch": 0.6516717325227963,
"grad_norm": 1.020224690437317,
"learning_rate": 0.0005,
"loss": 4.2201948165893555,
"step": 1072
},
{
"epoch": 0.6522796352583586,
"grad_norm": 1.0695855617523193,
"learning_rate": 0.0005,
"loss": 4.277288436889648,
"step": 1073
},
{
"epoch": 0.652887537993921,
"grad_norm": 1.1862881183624268,
"learning_rate": 0.0005,
"loss": 4.1104841232299805,
"step": 1074
},
{
"epoch": 0.6534954407294833,
"grad_norm": 1.7002424001693726,
"learning_rate": 0.0005,
"loss": 4.274345874786377,
"step": 1075
},
{
"epoch": 0.6541033434650456,
"grad_norm": 1.3632254600524902,
"learning_rate": 0.0005,
"loss": 4.318878173828125,
"step": 1076
},
{
"epoch": 0.6547112462006079,
"grad_norm": 1.1510448455810547,
"learning_rate": 0.0005,
"loss": 4.182323455810547,
"step": 1077
},
{
"epoch": 0.6553191489361702,
"grad_norm": 1.143638014793396,
"learning_rate": 0.0005,
"loss": 4.174741744995117,
"step": 1078
},
{
"epoch": 0.6559270516717325,
"grad_norm": 1.1500475406646729,
"learning_rate": 0.0005,
"loss": 3.9260623455047607,
"step": 1079
},
{
"epoch": 0.6565349544072948,
"grad_norm": 1.293712854385376,
"learning_rate": 0.0005,
"loss": 4.087700843811035,
"step": 1080
},
{
"epoch": 0.6571428571428571,
"grad_norm": 1.3932772874832153,
"learning_rate": 0.0005,
"loss": 4.118124961853027,
"step": 1081
},
{
"epoch": 0.6577507598784195,
"grad_norm": 1.094328761100769,
"learning_rate": 0.0005,
"loss": 4.175318241119385,
"step": 1082
},
{
"epoch": 0.6583586626139818,
"grad_norm": 1.467499017715454,
"learning_rate": 0.0005,
"loss": 4.272140979766846,
"step": 1083
},
{
"epoch": 0.6589665653495441,
"grad_norm": 1.1503561735153198,
"learning_rate": 0.0005,
"loss": 4.183167934417725,
"step": 1084
},
{
"epoch": 0.6595744680851063,
"grad_norm": 1.1912407875061035,
"learning_rate": 0.0005,
"loss": 4.055290222167969,
"step": 1085
},
{
"epoch": 0.6601823708206687,
"grad_norm": 1.1428508758544922,
"learning_rate": 0.0005,
"loss": 4.183894157409668,
"step": 1086
},
{
"epoch": 0.660790273556231,
"grad_norm": 1.136474609375,
"learning_rate": 0.0005,
"loss": 3.86468768119812,
"step": 1087
},
{
"epoch": 0.6613981762917933,
"grad_norm": 1.0048547983169556,
"learning_rate": 0.0005,
"loss": 4.054813385009766,
"step": 1088
},
{
"epoch": 0.6620060790273556,
"grad_norm": 1.021672010421753,
"learning_rate": 0.0005,
"loss": 3.9937808513641357,
"step": 1089
},
{
"epoch": 0.662613981762918,
"grad_norm": 1.184766173362732,
"learning_rate": 0.0005,
"loss": 4.007226943969727,
"step": 1090
},
{
"epoch": 0.6632218844984803,
"grad_norm": 1.1701700687408447,
"learning_rate": 0.0005,
"loss": 3.880901336669922,
"step": 1091
},
{
"epoch": 0.6638297872340425,
"grad_norm": 1.0928300619125366,
"learning_rate": 0.0005,
"loss": 4.0920090675354,
"step": 1092
},
{
"epoch": 0.6644376899696048,
"grad_norm": 1.0498013496398926,
"learning_rate": 0.0005,
"loss": 4.117219924926758,
"step": 1093
},
{
"epoch": 0.6650455927051672,
"grad_norm": 1.034084439277649,
"learning_rate": 0.0005,
"loss": 4.0926313400268555,
"step": 1094
},
{
"epoch": 0.6656534954407295,
"grad_norm": 0.939494788646698,
"learning_rate": 0.0005,
"loss": 4.018050670623779,
"step": 1095
},
{
"epoch": 0.6662613981762918,
"grad_norm": 1.2339518070220947,
"learning_rate": 0.0005,
"loss": 3.9285688400268555,
"step": 1096
},
{
"epoch": 0.6668693009118541,
"grad_norm": 1.1236822605133057,
"learning_rate": 0.0005,
"loss": 3.9050168991088867,
"step": 1097
},
{
"epoch": 0.6674772036474164,
"grad_norm": 0.9875328540802002,
"learning_rate": 0.0005,
"loss": 4.033220291137695,
"step": 1098
},
{
"epoch": 0.6680851063829787,
"grad_norm": 0.9468657374382019,
"learning_rate": 0.0005,
"loss": 4.108023643493652,
"step": 1099
},
{
"epoch": 0.668693009118541,
"grad_norm": 1.0056613683700562,
"learning_rate": 0.0005,
"loss": 4.217726707458496,
"step": 1100
},
{
"epoch": 0.6693009118541033,
"grad_norm": 1.1911637783050537,
"learning_rate": 0.0005,
"loss": 3.9922735691070557,
"step": 1101
},
{
"epoch": 0.6699088145896657,
"grad_norm": 0.9524610638618469,
"learning_rate": 0.0005,
"loss": 3.843928337097168,
"step": 1102
},
{
"epoch": 0.670516717325228,
"grad_norm": 1.1759804487228394,
"learning_rate": 0.0005,
"loss": 3.8452773094177246,
"step": 1103
},
{
"epoch": 0.6711246200607903,
"grad_norm": 1.1534795761108398,
"learning_rate": 0.0005,
"loss": 4.147238731384277,
"step": 1104
},
{
"epoch": 0.6717325227963525,
"grad_norm": 1.0438340902328491,
"learning_rate": 0.0005,
"loss": 3.814009666442871,
"step": 1105
},
{
"epoch": 0.6723404255319149,
"grad_norm": 1.4943510293960571,
"learning_rate": 0.0005,
"loss": 4.062148571014404,
"step": 1106
},
{
"epoch": 0.6729483282674772,
"grad_norm": 0.9739040732383728,
"learning_rate": 0.0005,
"loss": 4.066575050354004,
"step": 1107
},
{
"epoch": 0.6735562310030395,
"grad_norm": 1.0727957487106323,
"learning_rate": 0.0005,
"loss": 3.935608386993408,
"step": 1108
},
{
"epoch": 0.6741641337386018,
"grad_norm": 1.480692744255066,
"learning_rate": 0.0005,
"loss": 4.12183952331543,
"step": 1109
},
{
"epoch": 0.6747720364741642,
"grad_norm": 1.1042070388793945,
"learning_rate": 0.0005,
"loss": 3.8309693336486816,
"step": 1110
},
{
"epoch": 0.6753799392097265,
"grad_norm": 1.5949453115463257,
"learning_rate": 0.0005,
"loss": 4.225711822509766,
"step": 1111
},
{
"epoch": 0.6759878419452887,
"grad_norm": 1.1404409408569336,
"learning_rate": 0.0005,
"loss": 3.870262384414673,
"step": 1112
},
{
"epoch": 0.676595744680851,
"grad_norm": 1.1272308826446533,
"learning_rate": 0.0005,
"loss": 4.375516891479492,
"step": 1113
},
{
"epoch": 0.6772036474164134,
"grad_norm": 1.3391433954238892,
"learning_rate": 0.0005,
"loss": 3.9125869274139404,
"step": 1114
},
{
"epoch": 0.6778115501519757,
"grad_norm": 0.9406550526618958,
"learning_rate": 0.0005,
"loss": 4.000041961669922,
"step": 1115
},
{
"epoch": 0.678419452887538,
"grad_norm": 1.211789846420288,
"learning_rate": 0.0005,
"loss": 4.146924018859863,
"step": 1116
},
{
"epoch": 0.6790273556231003,
"grad_norm": 1.0479586124420166,
"learning_rate": 0.0005,
"loss": 3.77646803855896,
"step": 1117
},
{
"epoch": 0.6796352583586626,
"grad_norm": 1.0069152116775513,
"learning_rate": 0.0005,
"loss": 4.110267162322998,
"step": 1118
},
{
"epoch": 0.6802431610942249,
"grad_norm": 1.2088702917099,
"learning_rate": 0.0005,
"loss": 4.083201885223389,
"step": 1119
},
{
"epoch": 0.6808510638297872,
"grad_norm": 1.3016067743301392,
"learning_rate": 0.0005,
"loss": 4.1130218505859375,
"step": 1120
},
{
"epoch": 0.6814589665653495,
"grad_norm": 1.0395400524139404,
"learning_rate": 0.0005,
"loss": 4.012112617492676,
"step": 1121
},
{
"epoch": 0.6820668693009119,
"grad_norm": 1.1534603834152222,
"learning_rate": 0.0005,
"loss": 3.8767285346984863,
"step": 1122
},
{
"epoch": 0.6826747720364742,
"grad_norm": 1.1331707239151,
"learning_rate": 0.0005,
"loss": 3.8466670513153076,
"step": 1123
},
{
"epoch": 0.6832826747720365,
"grad_norm": 1.0023419857025146,
"learning_rate": 0.0005,
"loss": 3.978550910949707,
"step": 1124
},
{
"epoch": 0.6838905775075987,
"grad_norm": 1.198326826095581,
"learning_rate": 0.0005,
"loss": 4.160974502563477,
"step": 1125
},
{
"epoch": 0.6844984802431611,
"grad_norm": 1.0249745845794678,
"learning_rate": 0.0005,
"loss": 3.961395740509033,
"step": 1126
},
{
"epoch": 0.6851063829787234,
"grad_norm": 1.2853235006332397,
"learning_rate": 0.0005,
"loss": 4.322844505310059,
"step": 1127
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.9774798154830933,
"learning_rate": 0.0005,
"loss": 4.034377098083496,
"step": 1128
},
{
"epoch": 0.686322188449848,
"grad_norm": 1.1903027296066284,
"learning_rate": 0.0005,
"loss": 3.896298408508301,
"step": 1129
},
{
"epoch": 0.6869300911854104,
"grad_norm": 0.9409128427505493,
"learning_rate": 0.0005,
"loss": 3.967303514480591,
"step": 1130
},
{
"epoch": 0.6875379939209726,
"grad_norm": 1.0214065313339233,
"learning_rate": 0.0005,
"loss": 3.916367769241333,
"step": 1131
},
{
"epoch": 0.6881458966565349,
"grad_norm": 1.3258956670761108,
"learning_rate": 0.0005,
"loss": 3.859543800354004,
"step": 1132
},
{
"epoch": 0.6887537993920972,
"grad_norm": 1.0668888092041016,
"learning_rate": 0.0005,
"loss": 3.929979085922241,
"step": 1133
},
{
"epoch": 0.6893617021276596,
"grad_norm": 1.0921815633773804,
"learning_rate": 0.0005,
"loss": 3.942767381668091,
"step": 1134
},
{
"epoch": 0.6899696048632219,
"grad_norm": 1.1683087348937988,
"learning_rate": 0.0005,
"loss": 4.096218109130859,
"step": 1135
},
{
"epoch": 0.6905775075987842,
"grad_norm": 1.150611162185669,
"learning_rate": 0.0005,
"loss": 4.065778732299805,
"step": 1136
},
{
"epoch": 0.6911854103343466,
"grad_norm": 0.9955292344093323,
"learning_rate": 0.0005,
"loss": 3.83855938911438,
"step": 1137
},
{
"epoch": 0.6917933130699088,
"grad_norm": 1.1191688776016235,
"learning_rate": 0.0005,
"loss": 4.189568519592285,
"step": 1138
},
{
"epoch": 0.6924012158054711,
"grad_norm": 1.1021112203598022,
"learning_rate": 0.0005,
"loss": 4.004612445831299,
"step": 1139
},
{
"epoch": 0.6930091185410334,
"grad_norm": 1.2468072175979614,
"learning_rate": 0.0005,
"loss": 3.867835283279419,
"step": 1140
},
{
"epoch": 0.6936170212765957,
"grad_norm": 0.9965139627456665,
"learning_rate": 0.0005,
"loss": 3.8393120765686035,
"step": 1141
},
{
"epoch": 0.6942249240121581,
"grad_norm": 1.2608331441879272,
"learning_rate": 0.0005,
"loss": 4.122796535491943,
"step": 1142
},
{
"epoch": 0.6948328267477204,
"grad_norm": 0.9645028710365295,
"learning_rate": 0.0005,
"loss": 4.193379878997803,
"step": 1143
},
{
"epoch": 0.6954407294832827,
"grad_norm": 1.103003978729248,
"learning_rate": 0.0005,
"loss": 3.80690860748291,
"step": 1144
},
{
"epoch": 0.6960486322188449,
"grad_norm": 0.9812702536582947,
"learning_rate": 0.0005,
"loss": 3.910191059112549,
"step": 1145
},
{
"epoch": 0.6966565349544073,
"grad_norm": 1.1629973649978638,
"learning_rate": 0.0005,
"loss": 3.912919282913208,
"step": 1146
},
{
"epoch": 0.6972644376899696,
"grad_norm": 0.9559318423271179,
"learning_rate": 0.0005,
"loss": 4.1403703689575195,
"step": 1147
},
{
"epoch": 0.6978723404255319,
"grad_norm": 1.187225103378296,
"learning_rate": 0.0005,
"loss": 4.227229595184326,
"step": 1148
},
{
"epoch": 0.6984802431610942,
"grad_norm": 1.0893582105636597,
"learning_rate": 0.0005,
"loss": 4.085037708282471,
"step": 1149
},
{
"epoch": 0.6990881458966566,
"grad_norm": 1.207614541053772,
"learning_rate": 0.0005,
"loss": 4.065666198730469,
"step": 1150
},
{
"epoch": 0.6996960486322188,
"grad_norm": 1.1726024150848389,
"learning_rate": 0.0005,
"loss": 4.011224269866943,
"step": 1151
},
{
"epoch": 0.7003039513677811,
"grad_norm": 1.0657603740692139,
"learning_rate": 0.0005,
"loss": 3.7515785694122314,
"step": 1152
},
{
"epoch": 0.7009118541033434,
"grad_norm": 1.069787859916687,
"learning_rate": 0.0005,
"loss": 4.024471759796143,
"step": 1153
},
{
"epoch": 0.7015197568389058,
"grad_norm": 1.0333293676376343,
"learning_rate": 0.0005,
"loss": 3.9765753746032715,
"step": 1154
},
{
"epoch": 0.7021276595744681,
"grad_norm": 1.3091932535171509,
"learning_rate": 0.0005,
"loss": 4.086296081542969,
"step": 1155
},
{
"epoch": 0.7027355623100304,
"grad_norm": 0.96831214427948,
"learning_rate": 0.0005,
"loss": 4.159407615661621,
"step": 1156
},
{
"epoch": 0.7033434650455928,
"grad_norm": 1.0307363271713257,
"learning_rate": 0.0005,
"loss": 4.085778713226318,
"step": 1157
},
{
"epoch": 0.703951367781155,
"grad_norm": 1.2046213150024414,
"learning_rate": 0.0005,
"loss": 4.149312973022461,
"step": 1158
},
{
"epoch": 0.7045592705167173,
"grad_norm": 1.027969241142273,
"learning_rate": 0.0005,
"loss": 4.021113872528076,
"step": 1159
},
{
"epoch": 0.7051671732522796,
"grad_norm": 0.886216938495636,
"learning_rate": 0.0005,
"loss": 4.1072492599487305,
"step": 1160
},
{
"epoch": 0.705775075987842,
"grad_norm": 1.2814362049102783,
"learning_rate": 0.0005,
"loss": 4.351136207580566,
"step": 1161
},
{
"epoch": 0.7063829787234043,
"grad_norm": 1.195614218711853,
"learning_rate": 0.0005,
"loss": 3.859333038330078,
"step": 1162
},
{
"epoch": 0.7069908814589666,
"grad_norm": 1.02545964717865,
"learning_rate": 0.0005,
"loss": 3.993720531463623,
"step": 1163
},
{
"epoch": 0.7075987841945289,
"grad_norm": 1.1973057985305786,
"learning_rate": 0.0005,
"loss": 4.022034168243408,
"step": 1164
},
{
"epoch": 0.7082066869300911,
"grad_norm": 1.047211766242981,
"learning_rate": 0.0005,
"loss": 4.110629081726074,
"step": 1165
},
{
"epoch": 0.7088145896656535,
"grad_norm": 0.9065303206443787,
"learning_rate": 0.0005,
"loss": 4.266797065734863,
"step": 1166
},
{
"epoch": 0.7094224924012158,
"grad_norm": 1.0121465921401978,
"learning_rate": 0.0005,
"loss": 3.7510571479797363,
"step": 1167
},
{
"epoch": 0.7100303951367781,
"grad_norm": 1.2128599882125854,
"learning_rate": 0.0005,
"loss": 4.190847396850586,
"step": 1168
},
{
"epoch": 0.7106382978723405,
"grad_norm": 1.4867533445358276,
"learning_rate": 0.0005,
"loss": 4.125823497772217,
"step": 1169
},
{
"epoch": 0.7112462006079028,
"grad_norm": 0.9088724851608276,
"learning_rate": 0.0005,
"loss": 4.092848777770996,
"step": 1170
},
{
"epoch": 0.711854103343465,
"grad_norm": 0.980387270450592,
"learning_rate": 0.0005,
"loss": 3.963526725769043,
"step": 1171
},
{
"epoch": 0.7124620060790273,
"grad_norm": 0.9671593308448792,
"learning_rate": 0.0005,
"loss": 3.886415958404541,
"step": 1172
},
{
"epoch": 0.7130699088145896,
"grad_norm": 0.8448948860168457,
"learning_rate": 0.0005,
"loss": 3.960893154144287,
"step": 1173
},
{
"epoch": 0.713677811550152,
"grad_norm": 1.0654000043869019,
"learning_rate": 0.0005,
"loss": 3.9057178497314453,
"step": 1174
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.255560040473938,
"learning_rate": 0.0005,
"loss": 4.085208892822266,
"step": 1175
},
{
"epoch": 0.7148936170212766,
"grad_norm": 1.172607183456421,
"learning_rate": 0.0005,
"loss": 3.8918302059173584,
"step": 1176
},
{
"epoch": 0.715501519756839,
"grad_norm": 1.1429939270019531,
"learning_rate": 0.0005,
"loss": 3.8840150833129883,
"step": 1177
},
{
"epoch": 0.7161094224924012,
"grad_norm": 1.0610404014587402,
"learning_rate": 0.0005,
"loss": 4.0701003074646,
"step": 1178
},
{
"epoch": 0.7167173252279635,
"grad_norm": 1.0055387020111084,
"learning_rate": 0.0005,
"loss": 3.773474931716919,
"step": 1179
},
{
"epoch": 0.7173252279635258,
"grad_norm": 1.0536381006240845,
"learning_rate": 0.0005,
"loss": 3.885234832763672,
"step": 1180
},
{
"epoch": 0.7179331306990882,
"grad_norm": 1.2304924726486206,
"learning_rate": 0.0005,
"loss": 4.134721755981445,
"step": 1181
},
{
"epoch": 0.7185410334346505,
"grad_norm": 1.1367759704589844,
"learning_rate": 0.0005,
"loss": 4.07640266418457,
"step": 1182
},
{
"epoch": 0.7191489361702128,
"grad_norm": 0.9987047910690308,
"learning_rate": 0.0005,
"loss": 3.941793918609619,
"step": 1183
},
{
"epoch": 0.7197568389057751,
"grad_norm": 0.9390357136726379,
"learning_rate": 0.0005,
"loss": 4.057319641113281,
"step": 1184
},
{
"epoch": 0.7203647416413373,
"grad_norm": 1.3009685277938843,
"learning_rate": 0.0005,
"loss": 3.7487921714782715,
"step": 1185
},
{
"epoch": 0.7209726443768997,
"grad_norm": 1.0107924938201904,
"learning_rate": 0.0005,
"loss": 3.9321677684783936,
"step": 1186
},
{
"epoch": 0.721580547112462,
"grad_norm": 1.003091812133789,
"learning_rate": 0.0005,
"loss": 3.855192184448242,
"step": 1187
},
{
"epoch": 0.7221884498480243,
"grad_norm": 1.1665643453598022,
"learning_rate": 0.0005,
"loss": 4.062481880187988,
"step": 1188
},
{
"epoch": 0.7227963525835867,
"grad_norm": 1.0481219291687012,
"learning_rate": 0.0005,
"loss": 3.7130908966064453,
"step": 1189
},
{
"epoch": 0.723404255319149,
"grad_norm": 1.4968420267105103,
"learning_rate": 0.0005,
"loss": 4.054961681365967,
"step": 1190
},
{
"epoch": 0.7240121580547112,
"grad_norm": 1.0543270111083984,
"learning_rate": 0.0005,
"loss": 4.080737113952637,
"step": 1191
},
{
"epoch": 0.7246200607902735,
"grad_norm": 1.3208811283111572,
"learning_rate": 0.0005,
"loss": 3.828869581222534,
"step": 1192
},
{
"epoch": 0.7252279635258359,
"grad_norm": 1.1503605842590332,
"learning_rate": 0.0005,
"loss": 3.897340774536133,
"step": 1193
},
{
"epoch": 0.7258358662613982,
"grad_norm": 0.9485260844230652,
"learning_rate": 0.0005,
"loss": 4.022025108337402,
"step": 1194
},
{
"epoch": 0.7264437689969605,
"grad_norm": 1.0768346786499023,
"learning_rate": 0.0005,
"loss": 4.132440567016602,
"step": 1195
},
{
"epoch": 0.7270516717325228,
"grad_norm": 1.0768530368804932,
"learning_rate": 0.0005,
"loss": 4.167514801025391,
"step": 1196
},
{
"epoch": 0.7276595744680852,
"grad_norm": 1.1659386157989502,
"learning_rate": 0.0005,
"loss": 3.9605331420898438,
"step": 1197
},
{
"epoch": 0.7282674772036474,
"grad_norm": 0.9825963377952576,
"learning_rate": 0.0005,
"loss": 3.9677042961120605,
"step": 1198
},
{
"epoch": 0.7288753799392097,
"grad_norm": 1.200975775718689,
"learning_rate": 0.0005,
"loss": 4.059627056121826,
"step": 1199
},
{
"epoch": 0.729483282674772,
"grad_norm": 1.0287483930587769,
"learning_rate": 0.0005,
"loss": 3.9571118354797363,
"step": 1200
},
{
"epoch": 0.7300911854103344,
"grad_norm": 1.171775221824646,
"learning_rate": 0.0005,
"loss": 4.2555084228515625,
"step": 1201
},
{
"epoch": 0.7306990881458967,
"grad_norm": 1.2075831890106201,
"learning_rate": 0.0005,
"loss": 3.9444262981414795,
"step": 1202
},
{
"epoch": 0.731306990881459,
"grad_norm": 1.1258975267410278,
"learning_rate": 0.0005,
"loss": 4.037412643432617,
"step": 1203
},
{
"epoch": 0.7319148936170212,
"grad_norm": 1.107055902481079,
"learning_rate": 0.0005,
"loss": 4.046412467956543,
"step": 1204
},
{
"epoch": 0.7325227963525835,
"grad_norm": 1.1721580028533936,
"learning_rate": 0.0005,
"loss": 4.039186477661133,
"step": 1205
},
{
"epoch": 0.7331306990881459,
"grad_norm": 1.8083940744400024,
"learning_rate": 0.0005,
"loss": 4.255344390869141,
"step": 1206
},
{
"epoch": 0.7337386018237082,
"grad_norm": 1.1505194902420044,
"learning_rate": 0.0005,
"loss": 3.849947452545166,
"step": 1207
},
{
"epoch": 0.7343465045592705,
"grad_norm": 1.0368176698684692,
"learning_rate": 0.0005,
"loss": 4.037250518798828,
"step": 1208
},
{
"epoch": 0.7349544072948329,
"grad_norm": 1.076282262802124,
"learning_rate": 0.0005,
"loss": 3.8192124366760254,
"step": 1209
},
{
"epoch": 0.7355623100303952,
"grad_norm": 0.9457529187202454,
"learning_rate": 0.0005,
"loss": 3.988126516342163,
"step": 1210
},
{
"epoch": 0.7361702127659574,
"grad_norm": 1.396436333656311,
"learning_rate": 0.0005,
"loss": 3.8735647201538086,
"step": 1211
},
{
"epoch": 0.7367781155015197,
"grad_norm": 1.1978737115859985,
"learning_rate": 0.0005,
"loss": 3.7308835983276367,
"step": 1212
},
{
"epoch": 0.737386018237082,
"grad_norm": 1.2270631790161133,
"learning_rate": 0.0005,
"loss": 3.7548632621765137,
"step": 1213
},
{
"epoch": 0.7379939209726444,
"grad_norm": 1.0319976806640625,
"learning_rate": 0.0005,
"loss": 4.095251083374023,
"step": 1214
},
{
"epoch": 0.7386018237082067,
"grad_norm": 1.2742465734481812,
"learning_rate": 0.0005,
"loss": 3.7844438552856445,
"step": 1215
},
{
"epoch": 0.739209726443769,
"grad_norm": 0.9936171174049377,
"learning_rate": 0.0005,
"loss": 3.6828408241271973,
"step": 1216
},
{
"epoch": 0.7398176291793314,
"grad_norm": 1.0827305316925049,
"learning_rate": 0.0005,
"loss": 4.2918291091918945,
"step": 1217
},
{
"epoch": 0.7404255319148936,
"grad_norm": 1.0626490116119385,
"learning_rate": 0.0005,
"loss": 3.8438615798950195,
"step": 1218
},
{
"epoch": 0.7410334346504559,
"grad_norm": 1.0187205076217651,
"learning_rate": 0.0005,
"loss": 4.007755279541016,
"step": 1219
},
{
"epoch": 0.7416413373860182,
"grad_norm": 0.9945427775382996,
"learning_rate": 0.0005,
"loss": 3.8854458332061768,
"step": 1220
},
{
"epoch": 0.7422492401215806,
"grad_norm": 0.9728744029998779,
"learning_rate": 0.0005,
"loss": 3.7727737426757812,
"step": 1221
},
{
"epoch": 0.7428571428571429,
"grad_norm": 1.0771368741989136,
"learning_rate": 0.0005,
"loss": 3.984614133834839,
"step": 1222
},
{
"epoch": 0.7434650455927052,
"grad_norm": 1.0673145055770874,
"learning_rate": 0.0005,
"loss": 4.186018943786621,
"step": 1223
},
{
"epoch": 0.7440729483282674,
"grad_norm": 1.0385884046554565,
"learning_rate": 0.0005,
"loss": 3.9700469970703125,
"step": 1224
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.9378101229667664,
"learning_rate": 0.0005,
"loss": 4.093457221984863,
"step": 1225
},
{
"epoch": 0.7452887537993921,
"grad_norm": 1.1992157697677612,
"learning_rate": 0.0005,
"loss": 3.8426108360290527,
"step": 1226
},
{
"epoch": 0.7458966565349544,
"grad_norm": 0.9516767263412476,
"learning_rate": 0.0005,
"loss": 3.540165901184082,
"step": 1227
},
{
"epoch": 0.7465045592705167,
"grad_norm": 0.9911203980445862,
"learning_rate": 0.0005,
"loss": 3.791531562805176,
"step": 1228
},
{
"epoch": 0.7471124620060791,
"grad_norm": 1.1304718255996704,
"learning_rate": 0.0005,
"loss": 3.8638508319854736,
"step": 1229
},
{
"epoch": 0.7477203647416414,
"grad_norm": 3.538874626159668,
"learning_rate": 0.0005,
"loss": 4.131631851196289,
"step": 1230
},
{
"epoch": 0.7483282674772036,
"grad_norm": 1.096618413925171,
"learning_rate": 0.0005,
"loss": 3.782884120941162,
"step": 1231
},
{
"epoch": 0.7489361702127659,
"grad_norm": 1.2701330184936523,
"learning_rate": 0.0005,
"loss": 3.992222785949707,
"step": 1232
},
{
"epoch": 0.7495440729483283,
"grad_norm": 1.0706497430801392,
"learning_rate": 0.0005,
"loss": 3.908442735671997,
"step": 1233
},
{
"epoch": 0.7501519756838906,
"grad_norm": 1.030834436416626,
"learning_rate": 0.0005,
"loss": 4.000621318817139,
"step": 1234
},
{
"epoch": 0.7507598784194529,
"grad_norm": 1.3895245790481567,
"learning_rate": 0.0005,
"loss": 3.80660343170166,
"step": 1235
},
{
"epoch": 0.7513677811550152,
"grad_norm": 0.9692356586456299,
"learning_rate": 0.0005,
"loss": 4.078845977783203,
"step": 1236
},
{
"epoch": 0.7519756838905776,
"grad_norm": 1.1271778345108032,
"learning_rate": 0.0005,
"loss": 3.9555394649505615,
"step": 1237
},
{
"epoch": 0.7525835866261398,
"grad_norm": 1.5441569089889526,
"learning_rate": 0.0005,
"loss": 3.963904857635498,
"step": 1238
},
{
"epoch": 0.7531914893617021,
"grad_norm": 1.7030054330825806,
"learning_rate": 0.0005,
"loss": 4.024696350097656,
"step": 1239
},
{
"epoch": 0.7537993920972644,
"grad_norm": 1.12552011013031,
"learning_rate": 0.0005,
"loss": 3.919168472290039,
"step": 1240
},
{
"epoch": 0.7544072948328268,
"grad_norm": 1.0487366914749146,
"learning_rate": 0.0005,
"loss": 4.095437526702881,
"step": 1241
},
{
"epoch": 0.7550151975683891,
"grad_norm": 1.0279390811920166,
"learning_rate": 0.0005,
"loss": 3.941718816757202,
"step": 1242
},
{
"epoch": 0.7556231003039514,
"grad_norm": 1.080350399017334,
"learning_rate": 0.0005,
"loss": 3.7040228843688965,
"step": 1243
},
{
"epoch": 0.7562310030395136,
"grad_norm": 1.0182151794433594,
"learning_rate": 0.0005,
"loss": 4.062251091003418,
"step": 1244
},
{
"epoch": 0.756838905775076,
"grad_norm": 1.078009843826294,
"learning_rate": 0.0005,
"loss": 3.8745062351226807,
"step": 1245
},
{
"epoch": 0.7574468085106383,
"grad_norm": 1.0222269296646118,
"learning_rate": 0.0005,
"loss": 3.7564854621887207,
"step": 1246
},
{
"epoch": 0.7580547112462006,
"grad_norm": 1.329654335975647,
"learning_rate": 0.0005,
"loss": 3.8875160217285156,
"step": 1247
},
{
"epoch": 0.7586626139817629,
"grad_norm": 1.0129868984222412,
"learning_rate": 0.0005,
"loss": 3.8748350143432617,
"step": 1248
},
{
"epoch": 0.7592705167173253,
"grad_norm": 1.030468225479126,
"learning_rate": 0.0005,
"loss": 3.8655738830566406,
"step": 1249
},
{
"epoch": 0.7598784194528876,
"grad_norm": 1.111459732055664,
"learning_rate": 0.0005,
"loss": 3.9891488552093506,
"step": 1250
},
{
"epoch": 0.7604863221884498,
"grad_norm": 1.4396013021469116,
"learning_rate": 0.0005,
"loss": 3.9904720783233643,
"step": 1251
},
{
"epoch": 0.7610942249240121,
"grad_norm": 1.2336925268173218,
"learning_rate": 0.0005,
"loss": 3.7369742393493652,
"step": 1252
},
{
"epoch": 0.7617021276595745,
"grad_norm": 0.8990273475646973,
"learning_rate": 0.0005,
"loss": 3.9168124198913574,
"step": 1253
},
{
"epoch": 0.7623100303951368,
"grad_norm": 1.2932227849960327,
"learning_rate": 0.0005,
"loss": 4.008082389831543,
"step": 1254
},
{
"epoch": 0.7629179331306991,
"grad_norm": 0.9154768586158752,
"learning_rate": 0.0005,
"loss": 4.019550323486328,
"step": 1255
},
{
"epoch": 0.7635258358662614,
"grad_norm": 0.9175946712493896,
"learning_rate": 0.0005,
"loss": 3.97037935256958,
"step": 1256
},
{
"epoch": 0.7641337386018237,
"grad_norm": 1.067017912864685,
"learning_rate": 0.0005,
"loss": 4.1387038230896,
"step": 1257
},
{
"epoch": 0.764741641337386,
"grad_norm": 1.1540616750717163,
"learning_rate": 0.0005,
"loss": 3.979078769683838,
"step": 1258
},
{
"epoch": 0.7653495440729483,
"grad_norm": 0.9942051768302917,
"learning_rate": 0.0005,
"loss": 4.157475471496582,
"step": 1259
},
{
"epoch": 0.7659574468085106,
"grad_norm": 1.0882611274719238,
"learning_rate": 0.0005,
"loss": 3.784665584564209,
"step": 1260
},
{
"epoch": 0.766565349544073,
"grad_norm": 1.0358823537826538,
"learning_rate": 0.0005,
"loss": 3.8665788173675537,
"step": 1261
},
{
"epoch": 0.7671732522796353,
"grad_norm": 0.9150176048278809,
"learning_rate": 0.0005,
"loss": 3.9708409309387207,
"step": 1262
},
{
"epoch": 0.7677811550151976,
"grad_norm": 1.2305281162261963,
"learning_rate": 0.0005,
"loss": 3.791486978530884,
"step": 1263
},
{
"epoch": 0.7683890577507598,
"grad_norm": 1.0246379375457764,
"learning_rate": 0.0005,
"loss": 3.931403875350952,
"step": 1264
},
{
"epoch": 0.7689969604863222,
"grad_norm": 1.342997431755066,
"learning_rate": 0.0005,
"loss": 3.800549030303955,
"step": 1265
},
{
"epoch": 0.7696048632218845,
"grad_norm": 1.0477383136749268,
"learning_rate": 0.0005,
"loss": 3.9642491340637207,
"step": 1266
},
{
"epoch": 0.7702127659574468,
"grad_norm": 1.5231037139892578,
"learning_rate": 0.0005,
"loss": 3.883274555206299,
"step": 1267
},
{
"epoch": 0.7708206686930091,
"grad_norm": 1.21817147731781,
"learning_rate": 0.0005,
"loss": 3.8319201469421387,
"step": 1268
},
{
"epoch": 0.7714285714285715,
"grad_norm": 1.3139930963516235,
"learning_rate": 0.0005,
"loss": 3.902513027191162,
"step": 1269
},
{
"epoch": 0.7720364741641338,
"grad_norm": 1.1108347177505493,
"learning_rate": 0.0005,
"loss": 4.040473937988281,
"step": 1270
},
{
"epoch": 0.772644376899696,
"grad_norm": 0.9352411031723022,
"learning_rate": 0.0005,
"loss": 3.8833415508270264,
"step": 1271
},
{
"epoch": 0.7732522796352583,
"grad_norm": 0.9234441518783569,
"learning_rate": 0.0005,
"loss": 4.101876258850098,
"step": 1272
},
{
"epoch": 0.7738601823708207,
"grad_norm": 1.0629017353057861,
"learning_rate": 0.0005,
"loss": 3.869561195373535,
"step": 1273
},
{
"epoch": 0.774468085106383,
"grad_norm": 1.0356484651565552,
"learning_rate": 0.0005,
"loss": 3.9723856449127197,
"step": 1274
},
{
"epoch": 0.7750759878419453,
"grad_norm": 0.9600344896316528,
"learning_rate": 0.0005,
"loss": 3.824707508087158,
"step": 1275
},
{
"epoch": 0.7756838905775076,
"grad_norm": 1.0315158367156982,
"learning_rate": 0.0005,
"loss": 4.001948356628418,
"step": 1276
},
{
"epoch": 0.7762917933130699,
"grad_norm": 1.1866099834442139,
"learning_rate": 0.0005,
"loss": 3.763075828552246,
"step": 1277
},
{
"epoch": 0.7768996960486322,
"grad_norm": 1.1227611303329468,
"learning_rate": 0.0005,
"loss": 3.846872329711914,
"step": 1278
},
{
"epoch": 0.7775075987841945,
"grad_norm": 1.1628526449203491,
"learning_rate": 0.0005,
"loss": 3.929243564605713,
"step": 1279
},
{
"epoch": 0.7781155015197568,
"grad_norm": 0.9936217069625854,
"learning_rate": 0.0005,
"loss": 3.736764907836914,
"step": 1280
},
{
"epoch": 0.7787234042553192,
"grad_norm": 1.0325050354003906,
"learning_rate": 0.0005,
"loss": 3.935077667236328,
"step": 1281
},
{
"epoch": 0.7793313069908815,
"grad_norm": 1.0567058324813843,
"learning_rate": 0.0005,
"loss": 3.801319122314453,
"step": 1282
},
{
"epoch": 0.7799392097264438,
"grad_norm": 1.313740611076355,
"learning_rate": 0.0005,
"loss": 4.132801055908203,
"step": 1283
},
{
"epoch": 0.780547112462006,
"grad_norm": 1.4536793231964111,
"learning_rate": 0.0005,
"loss": 3.88094425201416,
"step": 1284
},
{
"epoch": 0.7811550151975684,
"grad_norm": 1.1501535177230835,
"learning_rate": 0.0005,
"loss": 3.8404948711395264,
"step": 1285
},
{
"epoch": 0.7817629179331307,
"grad_norm": 1.3253229856491089,
"learning_rate": 0.0005,
"loss": 4.016423225402832,
"step": 1286
},
{
"epoch": 0.782370820668693,
"grad_norm": 1.2896214723587036,
"learning_rate": 0.0005,
"loss": 3.8204050064086914,
"step": 1287
},
{
"epoch": 0.7829787234042553,
"grad_norm": 1.347516655921936,
"learning_rate": 0.0005,
"loss": 3.849546432495117,
"step": 1288
},
{
"epoch": 0.7835866261398177,
"grad_norm": 1.5418754816055298,
"learning_rate": 0.0005,
"loss": 4.135068893432617,
"step": 1289
},
{
"epoch": 0.78419452887538,
"grad_norm": 1.0823962688446045,
"learning_rate": 0.0005,
"loss": 3.752423048019409,
"step": 1290
},
{
"epoch": 0.7848024316109422,
"grad_norm": 1.1146916151046753,
"learning_rate": 0.0005,
"loss": 3.810540199279785,
"step": 1291
},
{
"epoch": 0.7854103343465045,
"grad_norm": 1.0943037271499634,
"learning_rate": 0.0005,
"loss": 3.761442184448242,
"step": 1292
},
{
"epoch": 0.7860182370820669,
"grad_norm": 1.0425827503204346,
"learning_rate": 0.0005,
"loss": 3.7996015548706055,
"step": 1293
},
{
"epoch": 0.7866261398176292,
"grad_norm": 1.5982511043548584,
"learning_rate": 0.0005,
"loss": 3.8388147354125977,
"step": 1294
},
{
"epoch": 0.7872340425531915,
"grad_norm": 1.4619585275650024,
"learning_rate": 0.0005,
"loss": 4.016120910644531,
"step": 1295
},
{
"epoch": 0.7878419452887538,
"grad_norm": 1.3633700609207153,
"learning_rate": 0.0005,
"loss": 4.069618225097656,
"step": 1296
},
{
"epoch": 0.7884498480243161,
"grad_norm": 1.009056568145752,
"learning_rate": 0.0005,
"loss": 4.0978264808654785,
"step": 1297
},
{
"epoch": 0.7890577507598784,
"grad_norm": 1.1812894344329834,
"learning_rate": 0.0005,
"loss": 3.6178488731384277,
"step": 1298
},
{
"epoch": 0.7896656534954407,
"grad_norm": 1.0647777318954468,
"learning_rate": 0.0005,
"loss": 3.910210371017456,
"step": 1299
},
{
"epoch": 0.790273556231003,
"grad_norm": 1.4413726329803467,
"learning_rate": 0.0005,
"loss": 3.9200563430786133,
"step": 1300
},
{
"epoch": 0.7908814589665654,
"grad_norm": 1.1021374464035034,
"learning_rate": 0.0005,
"loss": 3.680574655532837,
"step": 1301
},
{
"epoch": 0.7914893617021277,
"grad_norm": 1.0827854871749878,
"learning_rate": 0.0005,
"loss": 3.842402458190918,
"step": 1302
},
{
"epoch": 0.79209726443769,
"grad_norm": 1.2615513801574707,
"learning_rate": 0.0005,
"loss": 4.0547590255737305,
"step": 1303
},
{
"epoch": 0.7927051671732522,
"grad_norm": 1.0599168539047241,
"learning_rate": 0.0005,
"loss": 3.8400776386260986,
"step": 1304
},
{
"epoch": 0.7933130699088146,
"grad_norm": 1.4258071184158325,
"learning_rate": 0.0005,
"loss": 3.945885181427002,
"step": 1305
},
{
"epoch": 0.7939209726443769,
"grad_norm": 1.107612133026123,
"learning_rate": 0.0005,
"loss": 3.6351089477539062,
"step": 1306
},
{
"epoch": 0.7945288753799392,
"grad_norm": 0.9725725650787354,
"learning_rate": 0.0005,
"loss": 3.5905802249908447,
"step": 1307
},
{
"epoch": 0.7951367781155015,
"grad_norm": 1.3178088665008545,
"learning_rate": 0.0005,
"loss": 4.063264846801758,
"step": 1308
},
{
"epoch": 0.7957446808510639,
"grad_norm": 1.111405611038208,
"learning_rate": 0.0005,
"loss": 3.70896053314209,
"step": 1309
},
{
"epoch": 0.7963525835866262,
"grad_norm": 1.0547385215759277,
"learning_rate": 0.0005,
"loss": 4.020359516143799,
"step": 1310
},
{
"epoch": 0.7969604863221884,
"grad_norm": 1.1632133722305298,
"learning_rate": 0.0005,
"loss": 3.8566200733184814,
"step": 1311
},
{
"epoch": 0.7975683890577507,
"grad_norm": 1.0662367343902588,
"learning_rate": 0.0005,
"loss": 3.7626304626464844,
"step": 1312
},
{
"epoch": 0.7981762917933131,
"grad_norm": 1.0058962106704712,
"learning_rate": 0.0005,
"loss": 3.667207956314087,
"step": 1313
},
{
"epoch": 0.7987841945288754,
"grad_norm": 1.21786367893219,
"learning_rate": 0.0005,
"loss": 3.7486650943756104,
"step": 1314
},
{
"epoch": 0.7993920972644377,
"grad_norm": 1.576144814491272,
"learning_rate": 0.0005,
"loss": 3.836618185043335,
"step": 1315
},
{
"epoch": 0.8,
"grad_norm": 1.0205941200256348,
"learning_rate": 0.0005,
"loss": 3.921718120574951,
"step": 1316
},
{
"epoch": 0.8006079027355623,
"grad_norm": 1.1202620267868042,
"learning_rate": 0.0005,
"loss": 3.979546308517456,
"step": 1317
},
{
"epoch": 0.8012158054711246,
"grad_norm": 1.266727089881897,
"learning_rate": 0.0004999886023671629,
"loss": 3.7467775344848633,
"step": 1318
},
{
"epoch": 0.8018237082066869,
"grad_norm": 1.1622782945632935,
"learning_rate": 0.0004999544105079001,
"loss": 4.046473503112793,
"step": 1319
},
{
"epoch": 0.8024316109422492,
"grad_norm": 1.1754651069641113,
"learning_rate": 0.0004998974275398614,
"loss": 3.6320791244506836,
"step": 1320
},
{
"epoch": 0.8030395136778116,
"grad_norm": 0.9786376953125,
"learning_rate": 0.0004998176586588145,
"loss": 3.6877191066741943,
"step": 1321
},
{
"epoch": 0.8036474164133739,
"grad_norm": 0.969366729259491,
"learning_rate": 0.0004997151111381707,
"loss": 3.766533374786377,
"step": 1322
},
{
"epoch": 0.8042553191489362,
"grad_norm": 0.9558953046798706,
"learning_rate": 0.0004995897943283221,
"loss": 4.06315803527832,
"step": 1323
},
{
"epoch": 0.8048632218844984,
"grad_norm": 0.8645924925804138,
"learning_rate": 0.0004994417196557883,
"loss": 3.838135004043579,
"step": 1324
},
{
"epoch": 0.8054711246200608,
"grad_norm": 0.8671835064888,
"learning_rate": 0.0004992709006221755,
"loss": 3.883330821990967,
"step": 1325
},
{
"epoch": 0.8060790273556231,
"grad_norm": 1.1144053936004639,
"learning_rate": 0.0004990773528029446,
"loss": 3.7989044189453125,
"step": 1326
},
{
"epoch": 0.8066869300911854,
"grad_norm": 1.0151537656784058,
"learning_rate": 0.0004988610938459917,
"loss": 4.007248878479004,
"step": 1327
},
{
"epoch": 0.8072948328267477,
"grad_norm": 1.2170069217681885,
"learning_rate": 0.0004986221434700379,
"loss": 3.8616843223571777,
"step": 1328
},
{
"epoch": 0.8079027355623101,
"grad_norm": 0.8724591135978699,
"learning_rate": 0.0004983605234628328,
"loss": 4.205953598022461,
"step": 1329
},
{
"epoch": 0.8085106382978723,
"grad_norm": 1.1466760635375977,
"learning_rate": 0.0004980762576791664,
"loss": 3.9655470848083496,
"step": 1330
},
{
"epoch": 0.8091185410334346,
"grad_norm": 1.1359692811965942,
"learning_rate": 0.000497769372038695,
"loss": 4.22013521194458,
"step": 1331
},
{
"epoch": 0.8097264437689969,
"grad_norm": 1.0394648313522339,
"learning_rate": 0.0004974398945235776,
"loss": 3.911543130874634,
"step": 1332
},
{
"epoch": 0.8103343465045593,
"grad_norm": 1.0383487939834595,
"learning_rate": 0.0004970878551759239,
"loss": 3.8219704627990723,
"step": 1333
},
{
"epoch": 0.8109422492401216,
"grad_norm": 1.0844473838806152,
"learning_rate": 0.000496713286095056,
"loss": 3.876410484313965,
"step": 1334
},
{
"epoch": 0.8115501519756839,
"grad_norm": 1.2770010232925415,
"learning_rate": 0.0004963162214345805,
"loss": 3.8071320056915283,
"step": 1335
},
{
"epoch": 0.8121580547112462,
"grad_norm": 1.0182770490646362,
"learning_rate": 0.0004958966973992754,
"loss": 3.6059393882751465,
"step": 1336
},
{
"epoch": 0.8127659574468085,
"grad_norm": 1.02802574634552,
"learning_rate": 0.0004954547522417877,
"loss": 3.669658660888672,
"step": 1337
},
{
"epoch": 0.8133738601823708,
"grad_norm": 1.1248687505722046,
"learning_rate": 0.0004949904262591467,
"loss": 3.9866435527801514,
"step": 1338
},
{
"epoch": 0.8139817629179331,
"grad_norm": 1.0492587089538574,
"learning_rate": 0.0004945037617890889,
"loss": 3.949676036834717,
"step": 1339
},
{
"epoch": 0.8145896656534954,
"grad_norm": 0.9690307974815369,
"learning_rate": 0.000493994803206198,
"loss": 3.748741626739502,
"step": 1340
},
{
"epoch": 0.8151975683890578,
"grad_norm": 1.465824842453003,
"learning_rate": 0.0004934635969178583,
"loss": 3.977262020111084,
"step": 1341
},
{
"epoch": 0.8158054711246201,
"grad_norm": 1.0349231958389282,
"learning_rate": 0.0004929101913600238,
"loss": 3.619255542755127,
"step": 1342
},
{
"epoch": 0.8164133738601824,
"grad_norm": 1.0467352867126465,
"learning_rate": 0.0004923346369928012,
"loss": 3.9860079288482666,
"step": 1343
},
{
"epoch": 0.8170212765957446,
"grad_norm": 1.0222679376602173,
"learning_rate": 0.0004917369862958494,
"loss": 3.830394744873047,
"step": 1344
},
{
"epoch": 0.817629179331307,
"grad_norm": 1.117563247680664,
"learning_rate": 0.0004911172937635942,
"loss": 3.7490053176879883,
"step": 1345
},
{
"epoch": 0.8182370820668693,
"grad_norm": 1.4361516237258911,
"learning_rate": 0.000490475615900259,
"loss": 3.8583335876464844,
"step": 1346
},
{
"epoch": 0.8188449848024316,
"grad_norm": 1.2900465726852417,
"learning_rate": 0.0004898120112147136,
"loss": 3.906479835510254,
"step": 1347
},
{
"epoch": 0.819452887537994,
"grad_norm": 1.1463675498962402,
"learning_rate": 0.0004891265402151381,
"loss": 3.9391555786132812,
"step": 1348
},
{
"epoch": 0.8200607902735563,
"grad_norm": 1.5694284439086914,
"learning_rate": 0.0004884192654035069,
"loss": 3.974485397338867,
"step": 1349
},
{
"epoch": 0.8206686930091185,
"grad_norm": 1.0200462341308594,
"learning_rate": 0.000487690251269889,
"loss": 3.6864073276519775,
"step": 1350
},
{
"epoch": 0.8212765957446808,
"grad_norm": 1.089603304862976,
"learning_rate": 0.0004869395642865676,
"loss": 3.7212605476379395,
"step": 1351
},
{
"epoch": 0.8218844984802431,
"grad_norm": 1.2351415157318115,
"learning_rate": 0.0004861672729019797,
"loss": 3.700591802597046,
"step": 1352
},
{
"epoch": 0.8224924012158055,
"grad_norm": 0.9957062602043152,
"learning_rate": 0.00048537344753447453,
"loss": 3.7319130897521973,
"step": 1353
},
{
"epoch": 0.8231003039513678,
"grad_norm": 1.056557059288025,
"learning_rate": 0.0004845581605658926,
"loss": 3.657074213027954,
"step": 1354
},
{
"epoch": 0.8237082066869301,
"grad_norm": 1.0980826616287231,
"learning_rate": 0.00048372148633496617,
"loss": 3.770319938659668,
"step": 1355
},
{
"epoch": 0.8243161094224924,
"grad_norm": 0.8664339780807495,
"learning_rate": 0.0004828635011305407,
"loss": 3.894157886505127,
"step": 1356
},
{
"epoch": 0.8249240121580547,
"grad_norm": 1.2869031429290771,
"learning_rate": 0.00048198428318461896,
"loss": 3.6396484375,
"step": 1357
},
{
"epoch": 0.825531914893617,
"grad_norm": 1.459326148033142,
"learning_rate": 0.0004810839126652275,
"loss": 4.004338264465332,
"step": 1358
},
{
"epoch": 0.8261398176291793,
"grad_norm": 1.1490086317062378,
"learning_rate": 0.0004801624716691072,
"loss": 4.074912071228027,
"step": 1359
},
{
"epoch": 0.8267477203647416,
"grad_norm": 1.0660607814788818,
"learning_rate": 0.00047922004421422726,
"loss": 3.8288257122039795,
"step": 1360
},
{
"epoch": 0.827355623100304,
"grad_norm": 1.1202282905578613,
"learning_rate": 0.00047825671623212454,
"loss": 3.728804111480713,
"step": 1361
},
{
"epoch": 0.8279635258358663,
"grad_norm": 1.07158625125885,
"learning_rate": 0.0004772725755600682,
"loss": 3.5751538276672363,
"step": 1362
},
{
"epoch": 0.8285714285714286,
"grad_norm": 1.008811354637146,
"learning_rate": 0.0004762677119330505,
"loss": 3.8057093620300293,
"step": 1363
},
{
"epoch": 0.8291793313069908,
"grad_norm": 1.4745137691497803,
"learning_rate": 0.00047524221697560476,
"loss": 3.8376100063323975,
"step": 1364
},
{
"epoch": 0.8297872340425532,
"grad_norm": 1.2719781398773193,
"learning_rate": 0.00047419618419345115,
"loss": 3.747580051422119,
"step": 1365
},
{
"epoch": 0.8303951367781155,
"grad_norm": 1.1576839685440063,
"learning_rate": 0.0004731297089649703,
"loss": 3.7823610305786133,
"step": 1366
},
{
"epoch": 0.8310030395136778,
"grad_norm": 1.0849125385284424,
"learning_rate": 0.0004720428885325069,
"loss": 3.9312424659729004,
"step": 1367
},
{
"epoch": 0.8316109422492401,
"grad_norm": 1.0173479318618774,
"learning_rate": 0.00047093582199350285,
"loss": 3.641855239868164,
"step": 1368
},
{
"epoch": 0.8322188449848025,
"grad_norm": 0.9390632510185242,
"learning_rate": 0.00046980861029146173,
"loss": 4.027669906616211,
"step": 1369
},
{
"epoch": 0.8328267477203647,
"grad_norm": 1.0367680788040161,
"learning_rate": 0.0004686613562067444,
"loss": 3.9053921699523926,
"step": 1370
},
{
"epoch": 0.833434650455927,
"grad_norm": 0.9983039498329163,
"learning_rate": 0.00046749416434719747,
"loss": 3.6601035594940186,
"step": 1371
},
{
"epoch": 0.8340425531914893,
"grad_norm": 1.2556368112564087,
"learning_rate": 0.00046630714113861507,
"loss": 3.643587350845337,
"step": 1372
},
{
"epoch": 0.8346504559270517,
"grad_norm": 1.0456199645996094,
"learning_rate": 0.00046510039481503486,
"loss": 3.689802646636963,
"step": 1373
},
{
"epoch": 0.835258358662614,
"grad_norm": 0.9730342626571655,
"learning_rate": 0.00046387403540886895,
"loss": 3.6004483699798584,
"step": 1374
},
{
"epoch": 0.8358662613981763,
"grad_norm": 1.2402491569519043,
"learning_rate": 0.00046262817474087127,
"loss": 3.6834664344787598,
"step": 1375
},
{
"epoch": 0.8364741641337387,
"grad_norm": 1.1597247123718262,
"learning_rate": 0.00046136292640994154,
"loss": 3.7525768280029297,
"step": 1376
},
{
"epoch": 0.8370820668693009,
"grad_norm": 1.2773432731628418,
"learning_rate": 0.0004600784057827671,
"loss": 3.862699508666992,
"step": 1377
},
{
"epoch": 0.8376899696048632,
"grad_norm": 1.2818998098373413,
"learning_rate": 0.00045877472998330385,
"loss": 4.099722385406494,
"step": 1378
},
{
"epoch": 0.8382978723404255,
"grad_norm": 1.0724464654922485,
"learning_rate": 0.0004574520178820965,
"loss": 3.8608179092407227,
"step": 1379
},
{
"epoch": 0.8389057750759878,
"grad_norm": 1.293906807899475,
"learning_rate": 0.0004561103900854401,
"loss": 3.723815441131592,
"step": 1380
},
{
"epoch": 0.8395136778115502,
"grad_norm": 1.0045194625854492,
"learning_rate": 0.0004547499689243829,
"loss": 3.7255592346191406,
"step": 1381
},
{
"epoch": 0.8401215805471125,
"grad_norm": 1.0186697244644165,
"learning_rate": 0.0004533708784435722,
"loss": 3.6717958450317383,
"step": 1382
},
{
"epoch": 0.8407294832826747,
"grad_norm": 1.0383477210998535,
"learning_rate": 0.0004519732443899435,
"loss": 3.681596279144287,
"step": 1383
},
{
"epoch": 0.841337386018237,
"grad_norm": 1.1144697666168213,
"learning_rate": 0.00045055719420125504,
"loss": 3.9934191703796387,
"step": 1384
},
{
"epoch": 0.8419452887537994,
"grad_norm": 1.056483268737793,
"learning_rate": 0.0004491228569944679,
"loss": 4.028287887573242,
"step": 1385
},
{
"epoch": 0.8425531914893617,
"grad_norm": 1.1046830415725708,
"learning_rate": 0.0004476703635539728,
"loss": 3.823612689971924,
"step": 1386
},
{
"epoch": 0.843161094224924,
"grad_norm": 1.1697293519973755,
"learning_rate": 0.00044619984631966527,
"loss": 3.7220816612243652,
"step": 1387
},
{
"epoch": 0.8437689969604864,
"grad_norm": 1.0626883506774902,
"learning_rate": 0.0004447114393748694,
"loss": 3.5306200981140137,
"step": 1388
},
{
"epoch": 0.8443768996960487,
"grad_norm": 1.153074026107788,
"learning_rate": 0.0004432052784341122,
"loss": 3.672762393951416,
"step": 1389
},
{
"epoch": 0.8449848024316109,
"grad_norm": 0.9894313812255859,
"learning_rate": 0.0004416815008307488,
"loss": 3.661726474761963,
"step": 1390
},
{
"epoch": 0.8455927051671732,
"grad_norm": 0.9667363166809082,
"learning_rate": 0.00044014024550444045,
"loss": 3.788522720336914,
"step": 1391
},
{
"epoch": 0.8462006079027355,
"grad_norm": 1.2645761966705322,
"learning_rate": 0.00043858165298848556,
"loss": 3.721158981323242,
"step": 1392
},
{
"epoch": 0.8468085106382979,
"grad_norm": 1.0775492191314697,
"learning_rate": 0.00043700586539700614,
"loss": 3.5772523880004883,
"step": 1393
},
{
"epoch": 0.8474164133738602,
"grad_norm": 1.0271198749542236,
"learning_rate": 0.00043541302641198946,
"loss": 3.820373058319092,
"step": 1394
},
{
"epoch": 0.8480243161094225,
"grad_norm": 0.8722153902053833,
"learning_rate": 0.00043380328127018663,
"loss": 3.610518455505371,
"step": 1395
},
{
"epoch": 0.8486322188449849,
"grad_norm": 1.0228782892227173,
"learning_rate": 0.00043217677674987047,
"loss": 3.7967772483825684,
"step": 1396
},
{
"epoch": 0.8492401215805471,
"grad_norm": 1.0525845289230347,
"learning_rate": 0.00043053366115745174,
"loss": 3.623091697692871,
"step": 1397
},
{
"epoch": 0.8498480243161094,
"grad_norm": 0.998408317565918,
"learning_rate": 0.00042887408431395614,
"loss": 3.685908317565918,
"step": 1398
},
{
"epoch": 0.8504559270516717,
"grad_norm": 1.026895523071289,
"learning_rate": 0.0004271981975413639,
"loss": 3.5633139610290527,
"step": 1399
},
{
"epoch": 0.851063829787234,
"grad_norm": 1.0553339719772339,
"learning_rate": 0.00042550615364881196,
"loss": 3.833423137664795,
"step": 1400
},
{
"epoch": 0.8516717325227964,
"grad_norm": 1.1019606590270996,
"learning_rate": 0.00042379810691866064,
"loss": 3.8337411880493164,
"step": 1401
},
{
"epoch": 0.8522796352583587,
"grad_norm": 1.7001227140426636,
"learning_rate": 0.0004220742130924257,
"loss": 3.495081663131714,
"step": 1402
},
{
"epoch": 0.8528875379939209,
"grad_norm": 1.0422172546386719,
"learning_rate": 0.0004203346293565784,
"loss": 3.7549071311950684,
"step": 1403
},
{
"epoch": 0.8534954407294832,
"grad_norm": 1.2587510347366333,
"learning_rate": 0.0004185795143282123,
"loss": 3.770139217376709,
"step": 1404
},
{
"epoch": 0.8541033434650456,
"grad_norm": 1.1424074172973633,
"learning_rate": 0.00041680902804058095,
"loss": 3.779757499694824,
"step": 1405
},
{
"epoch": 0.8547112462006079,
"grad_norm": 1.0849041938781738,
"learning_rate": 0.0004150233319285055,
"loss": 3.8310835361480713,
"step": 1406
},
{
"epoch": 0.8553191489361702,
"grad_norm": 1.1193660497665405,
"learning_rate": 0.00041322258881365515,
"loss": 3.7291765213012695,
"step": 1407
},
{
"epoch": 0.8559270516717326,
"grad_norm": 1.1108680963516235,
"learning_rate": 0.0004114069628897006,
"loss": 4.129992485046387,
"step": 1408
},
{
"epoch": 0.8565349544072949,
"grad_norm": 1.1723637580871582,
"learning_rate": 0.0004095766197073432,
"loss": 3.6475980281829834,
"step": 1409
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.0332688093185425,
"learning_rate": 0.0004077317261592194,
"loss": 3.8548192977905273,
"step": 1410
},
{
"epoch": 0.8577507598784194,
"grad_norm": 1.0339442491531372,
"learning_rate": 0.0004058724504646834,
"loss": 3.8385329246520996,
"step": 1411
},
{
"epoch": 0.8583586626139817,
"grad_norm": 1.0235612392425537,
"learning_rate": 0.000403998962154469,
"loss": 3.8993711471557617,
"step": 1412
},
{
"epoch": 0.8589665653495441,
"grad_norm": 0.8945487141609192,
"learning_rate": 0.0004021114320552311,
"loss": 3.681536912918091,
"step": 1413
},
{
"epoch": 0.8595744680851064,
"grad_norm": 0.907351016998291,
"learning_rate": 0.00040021003227397014,
"loss": 3.751767635345459,
"step": 1414
},
{
"epoch": 0.8601823708206687,
"grad_norm": 0.8751946091651917,
"learning_rate": 0.0003982949361823388,
"loss": 3.6982154846191406,
"step": 1415
},
{
"epoch": 0.8607902735562311,
"grad_norm": 0.9630452990531921,
"learning_rate": 0.0003963663184008338,
"loss": 3.7995591163635254,
"step": 1416
},
{
"epoch": 0.8613981762917933,
"grad_norm": 1.0385856628417969,
"learning_rate": 0.0003944243547828742,
"loss": 3.583292007446289,
"step": 1417
},
{
"epoch": 0.8620060790273556,
"grad_norm": 1.1438446044921875,
"learning_rate": 0.000392469222398766,
"loss": 3.97701096534729,
"step": 1418
},
{
"epoch": 0.8626139817629179,
"grad_norm": 1.0620638132095337,
"learning_rate": 0.00039050109951955814,
"loss": 3.5980987548828125,
"step": 1419
},
{
"epoch": 0.8632218844984803,
"grad_norm": 0.978015661239624,
"learning_rate": 0.000388520165600786,
"loss": 3.7316596508026123,
"step": 1420
},
{
"epoch": 0.8638297872340426,
"grad_norm": 1.0127967596054077,
"learning_rate": 0.0003865266012661095,
"loss": 3.9404823780059814,
"step": 1421
},
{
"epoch": 0.8644376899696049,
"grad_norm": 1.2003899812698364,
"learning_rate": 0.0003845205882908432,
"loss": 3.918931245803833,
"step": 1422
},
{
"epoch": 0.8650455927051671,
"grad_norm": 1.1857889890670776,
"learning_rate": 0.000382502309585382,
"loss": 3.6090548038482666,
"step": 1423
},
{
"epoch": 0.8656534954407294,
"grad_norm": 1.2434966564178467,
"learning_rate": 0.000380471949178523,
"loss": 3.467123031616211,
"step": 1424
},
{
"epoch": 0.8662613981762918,
"grad_norm": 1.2050342559814453,
"learning_rate": 0.0003784296922006859,
"loss": 3.696073055267334,
"step": 1425
},
{
"epoch": 0.8668693009118541,
"grad_norm": 1.1129356622695923,
"learning_rate": 0.0003763757248670321,
"loss": 3.715449810028076,
"step": 1426
},
{
"epoch": 0.8674772036474164,
"grad_norm": 1.1708143949508667,
"learning_rate": 0.00037431023446048595,
"loss": 3.860975980758667,
"step": 1427
},
{
"epoch": 0.8680851063829788,
"grad_norm": 1.1058366298675537,
"learning_rate": 0.0003722334093146576,
"loss": 3.7916457653045654,
"step": 1428
},
{
"epoch": 0.8686930091185411,
"grad_norm": 1.274646520614624,
"learning_rate": 0.00037014543879667093,
"loss": 3.8200652599334717,
"step": 1429
},
{
"epoch": 0.8693009118541033,
"grad_norm": 1.2253806591033936,
"learning_rate": 0.00036804651328989666,
"loss": 3.522810459136963,
"step": 1430
},
{
"epoch": 0.8699088145896656,
"grad_norm": 1.1936273574829102,
"learning_rate": 0.000365936824176593,
"loss": 3.9892830848693848,
"step": 1431
},
{
"epoch": 0.870516717325228,
"grad_norm": 1.1107733249664307,
"learning_rate": 0.00036381656382045526,
"loss": 3.6833291053771973,
"step": 1432
},
{
"epoch": 0.8711246200607903,
"grad_norm": 1.4528982639312744,
"learning_rate": 0.00036168592554907596,
"loss": 3.4317424297332764,
"step": 1433
},
{
"epoch": 0.8717325227963526,
"grad_norm": 1.539918303489685,
"learning_rate": 0.0003595451036363168,
"loss": 3.9146463871002197,
"step": 1434
},
{
"epoch": 0.8723404255319149,
"grad_norm": 1.0589654445648193,
"learning_rate": 0.00035739429328459493,
"loss": 3.64989972114563,
"step": 1435
},
{
"epoch": 0.8729483282674773,
"grad_norm": 0.9970619082450867,
"learning_rate": 0.0003552336906070838,
"loss": 3.7197318077087402,
"step": 1436
},
{
"epoch": 0.8735562310030395,
"grad_norm": 1.1559967994689941,
"learning_rate": 0.0003530634926098316,
"loss": 3.835594892501831,
"step": 1437
},
{
"epoch": 0.8741641337386018,
"grad_norm": 1.0069043636322021,
"learning_rate": 0.0003508838971737981,
"loss": 3.8029980659484863,
"step": 1438
},
{
"epoch": 0.8747720364741641,
"grad_norm": 1.3581100702285767,
"learning_rate": 0.0003486951030368113,
"loss": 3.6824827194213867,
"step": 1439
},
{
"epoch": 0.8753799392097265,
"grad_norm": 1.7533200979232788,
"learning_rate": 0.00034649730977544664,
"loss": 3.7235536575317383,
"step": 1440
},
{
"epoch": 0.8759878419452888,
"grad_norm": 1.0940066576004028,
"learning_rate": 0.0003442907177868293,
"loss": 3.6482458114624023,
"step": 1441
},
{
"epoch": 0.8765957446808511,
"grad_norm": 1.0252796411514282,
"learning_rate": 0.00034207552827036176,
"loss": 3.634884834289551,
"step": 1442
},
{
"epoch": 0.8772036474164133,
"grad_norm": 1.3038619756698608,
"learning_rate": 0.0003398519432093782,
"loss": 3.886862277984619,
"step": 1443
},
{
"epoch": 0.8778115501519757,
"grad_norm": 1.5358000993728638,
"learning_rate": 0.00033762016535272745,
"loss": 3.916736125946045,
"step": 1444
},
{
"epoch": 0.878419452887538,
"grad_norm": 1.0540707111358643,
"learning_rate": 0.00033538039819628625,
"loss": 3.914485454559326,
"step": 1445
},
{
"epoch": 0.8790273556231003,
"grad_norm": 1.0498977899551392,
"learning_rate": 0.000333132845964404,
"loss": 3.6423823833465576,
"step": 1446
},
{
"epoch": 0.8796352583586626,
"grad_norm": 2.2342031002044678,
"learning_rate": 0.00033087771359128175,
"loss": 3.7215816974639893,
"step": 1447
},
{
"epoch": 0.880243161094225,
"grad_norm": 1.4365023374557495,
"learning_rate": 0.00032861520670228586,
"loss": 3.7631328105926514,
"step": 1448
},
{
"epoch": 0.8808510638297873,
"grad_norm": 2.098018169403076,
"learning_rate": 0.00032634553159519865,
"loss": 3.4372754096984863,
"step": 1449
},
{
"epoch": 0.8814589665653495,
"grad_norm": 0.9924235939979553,
"learning_rate": 0.0003240688952214085,
"loss": 4.062948226928711,
"step": 1450
},
{
"epoch": 0.8820668693009118,
"grad_norm": 1.2176319360733032,
"learning_rate": 0.0003217855051670393,
"loss": 3.6439735889434814,
"step": 1451
},
{
"epoch": 0.8826747720364742,
"grad_norm": 1.2388694286346436,
"learning_rate": 0.00031949556963402283,
"loss": 3.8236451148986816,
"step": 1452
},
{
"epoch": 0.8832826747720365,
"grad_norm": 0.8196237683296204,
"learning_rate": 0.00031719929742111437,
"loss": 3.686429977416992,
"step": 1453
},
{
"epoch": 0.8838905775075988,
"grad_norm": 0.9667937755584717,
"learning_rate": 0.00031489689790485464,
"loss": 4.012905120849609,
"step": 1454
},
{
"epoch": 0.8844984802431611,
"grad_norm": 0.9525713920593262,
"learning_rate": 0.00031258858102047813,
"loss": 3.484525680541992,
"step": 1455
},
{
"epoch": 0.8851063829787233,
"grad_norm": 1.0953255891799927,
"learning_rate": 0.0003102745572427716,
"loss": 3.6367969512939453,
"step": 1456
},
{
"epoch": 0.8857142857142857,
"grad_norm": 0.8041018843650818,
"learning_rate": 0.0003079550375668821,
"loss": 3.627480983734131,
"step": 1457
},
{
"epoch": 0.886322188449848,
"grad_norm": 0.9474192261695862,
"learning_rate": 0.0003056302334890786,
"loss": 3.771761894226074,
"step": 1458
},
{
"epoch": 0.8869300911854103,
"grad_norm": 0.9393053650856018,
"learning_rate": 0.00030330035698746753,
"loss": 3.4475784301757812,
"step": 1459
},
{
"epoch": 0.8875379939209727,
"grad_norm": 1.0495154857635498,
"learning_rate": 0.00030096562050266427,
"loss": 3.7747950553894043,
"step": 1460
},
{
"epoch": 0.888145896656535,
"grad_norm": 1.0793986320495605,
"learning_rate": 0.0002986262369184226,
"loss": 3.5836963653564453,
"step": 1461
},
{
"epoch": 0.8887537993920973,
"grad_norm": 1.0350525379180908,
"learning_rate": 0.0002962824195422238,
"loss": 3.8103108406066895,
"step": 1462
},
{
"epoch": 0.8893617021276595,
"grad_norm": 0.923565149307251,
"learning_rate": 0.0002939343820858269,
"loss": 3.5080511569976807,
"step": 1463
},
{
"epoch": 0.8899696048632219,
"grad_norm": 0.9419893026351929,
"learning_rate": 0.00029158233864578256,
"loss": 3.5664780139923096,
"step": 1464
},
{
"epoch": 0.8905775075987842,
"grad_norm": 1.120071530342102,
"learning_rate": 0.000289226503683911,
"loss": 3.5272912979125977,
"step": 1465
},
{
"epoch": 0.8911854103343465,
"grad_norm": 0.9679391980171204,
"learning_rate": 0.0002868670920077478,
"loss": 3.7422268390655518,
"step": 1466
},
{
"epoch": 0.8917933130699088,
"grad_norm": 0.8348677754402161,
"learning_rate": 0.0002845043187509567,
"loss": 3.677544355392456,
"step": 1467
},
{
"epoch": 0.8924012158054712,
"grad_norm": 0.8530043959617615,
"learning_rate": 0.0002821383993537144,
"loss": 3.6701407432556152,
"step": 1468
},
{
"epoch": 0.8930091185410335,
"grad_norm": 0.9390717148780823,
"learning_rate": 0.00027976954954306554,
"loss": 3.7104759216308594,
"step": 1469
},
{
"epoch": 0.8936170212765957,
"grad_norm": 1.0507652759552002,
"learning_rate": 0.0002773979853132534,
"loss": 3.5879673957824707,
"step": 1470
},
{
"epoch": 0.894224924012158,
"grad_norm": 0.9291044473648071,
"learning_rate": 0.0002750239229060246,
"loss": 3.655197858810425,
"step": 1471
},
{
"epoch": 0.8948328267477204,
"grad_norm": 0.9448993802070618,
"learning_rate": 0.0002726475787909125,
"loss": 3.6126198768615723,
"step": 1472
},
{
"epoch": 0.8954407294832827,
"grad_norm": 0.9143878221511841,
"learning_rate": 0.0002702691696454986,
"loss": 3.7886955738067627,
"step": 1473
},
{
"epoch": 0.896048632218845,
"grad_norm": 0.8731086850166321,
"learning_rate": 0.00026788891233565655,
"loss": 3.4998018741607666,
"step": 1474
},
{
"epoch": 0.8966565349544073,
"grad_norm": 1.0264720916748047,
"learning_rate": 0.0002655070238957772,
"loss": 3.7816460132598877,
"step": 1475
},
{
"epoch": 0.8972644376899696,
"grad_norm": 0.9198083877563477,
"learning_rate": 0.0002631237215089798,
"loss": 3.2887322902679443,
"step": 1476
}
],
"logging_steps": 1,
"max_steps": 1645,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 164,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.8063788889999933e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}