lqd_augment / last-checkpoint /trainer_state.json
Ba2han's picture
Training in progress, step 787, checkpoint
4451ae6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21018895639981305,
"eval_steps": 500,
"global_step": 787,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002670761834813381,
"grad_norm": 17.75,
"learning_rate": 0.0,
"loss": 2.6790831089019775,
"step": 1
},
{
"epoch": 0.0005341523669626762,
"grad_norm": 19.375,
"learning_rate": 7.692307692307694e-07,
"loss": 2.857093095779419,
"step": 2
},
{
"epoch": 0.0008012285504440141,
"grad_norm": 17.75,
"learning_rate": 1.5384615384615387e-06,
"loss": 2.6315698623657227,
"step": 3
},
{
"epoch": 0.0010683047339253523,
"grad_norm": 18.75,
"learning_rate": 2.307692307692308e-06,
"loss": 2.714285135269165,
"step": 4
},
{
"epoch": 0.0013353809174066903,
"grad_norm": 16.25,
"learning_rate": 3.0769230769230774e-06,
"loss": 2.6139986515045166,
"step": 5
},
{
"epoch": 0.0016024571008880282,
"grad_norm": 22.75,
"learning_rate": 3.846153846153847e-06,
"loss": 2.76374888420105,
"step": 6
},
{
"epoch": 0.0018695332843693664,
"grad_norm": 18.0,
"learning_rate": 4.615384615384616e-06,
"loss": 2.660518169403076,
"step": 7
},
{
"epoch": 0.0021366094678507046,
"grad_norm": 24.25,
"learning_rate": 5.3846153846153855e-06,
"loss": 2.71116042137146,
"step": 8
},
{
"epoch": 0.0024036856513320426,
"grad_norm": 23.0,
"learning_rate": 6.153846153846155e-06,
"loss": 2.778017997741699,
"step": 9
},
{
"epoch": 0.0026707618348133805,
"grad_norm": 20.0,
"learning_rate": 6.923076923076923e-06,
"loss": 2.5963754653930664,
"step": 10
},
{
"epoch": 0.0029378380182947185,
"grad_norm": 20.875,
"learning_rate": 7.692307692307694e-06,
"loss": 2.766429901123047,
"step": 11
},
{
"epoch": 0.0032049142017760565,
"grad_norm": 23.125,
"learning_rate": 8.461538461538462e-06,
"loss": 2.9626660346984863,
"step": 12
},
{
"epoch": 0.003471990385257395,
"grad_norm": 26.0,
"learning_rate": 9.230769230769232e-06,
"loss": 2.910458564758301,
"step": 13
},
{
"epoch": 0.003739066568738733,
"grad_norm": 22.125,
"learning_rate": 1e-05,
"loss": 2.8882126808166504,
"step": 14
},
{
"epoch": 0.004006142752220071,
"grad_norm": 17.125,
"learning_rate": 1.0769230769230771e-05,
"loss": 2.5886685848236084,
"step": 15
},
{
"epoch": 0.004273218935701409,
"grad_norm": 17.25,
"learning_rate": 1.153846153846154e-05,
"loss": 2.7212939262390137,
"step": 16
},
{
"epoch": 0.004540295119182747,
"grad_norm": 22.125,
"learning_rate": 1.230769230769231e-05,
"loss": 2.689997911453247,
"step": 17
},
{
"epoch": 0.004807371302664085,
"grad_norm": 24.25,
"learning_rate": 1.3076923076923078e-05,
"loss": 2.7217793464660645,
"step": 18
},
{
"epoch": 0.005074447486145423,
"grad_norm": 17.0,
"learning_rate": 1.3846153846153847e-05,
"loss": 2.58693265914917,
"step": 19
},
{
"epoch": 0.005341523669626761,
"grad_norm": 20.625,
"learning_rate": 1.4615384615384617e-05,
"loss": 2.793051242828369,
"step": 20
},
{
"epoch": 0.0056085998531080995,
"grad_norm": 21.375,
"learning_rate": 1.5384615384615387e-05,
"loss": 2.7657687664031982,
"step": 21
},
{
"epoch": 0.005875676036589437,
"grad_norm": 18.125,
"learning_rate": 1.6153846153846154e-05,
"loss": 2.8189685344696045,
"step": 22
},
{
"epoch": 0.006142752220070775,
"grad_norm": 14.875,
"learning_rate": 1.6923076923076924e-05,
"loss": 2.580648183822632,
"step": 23
},
{
"epoch": 0.006409828403552113,
"grad_norm": 21.875,
"learning_rate": 1.7692307692307694e-05,
"loss": 2.74700927734375,
"step": 24
},
{
"epoch": 0.006676904587033451,
"grad_norm": 17.25,
"learning_rate": 1.8461538461538465e-05,
"loss": 2.4545698165893555,
"step": 25
},
{
"epoch": 0.00694398077051479,
"grad_norm": 20.5,
"learning_rate": 1.923076923076923e-05,
"loss": 2.6994550228118896,
"step": 26
},
{
"epoch": 0.007211056953996127,
"grad_norm": 17.0,
"learning_rate": 2e-05,
"loss": 2.6980443000793457,
"step": 27
},
{
"epoch": 0.007478133137477466,
"grad_norm": 17.375,
"learning_rate": 2.0769230769230772e-05,
"loss": 2.78818941116333,
"step": 28
},
{
"epoch": 0.007745209320958803,
"grad_norm": 20.25,
"learning_rate": 2.1538461538461542e-05,
"loss": 2.742327928543091,
"step": 29
},
{
"epoch": 0.008012285504440142,
"grad_norm": 15.9375,
"learning_rate": 2.230769230769231e-05,
"loss": 2.6761393547058105,
"step": 30
},
{
"epoch": 0.00827936168792148,
"grad_norm": 17.625,
"learning_rate": 2.307692307692308e-05,
"loss": 2.639137029647827,
"step": 31
},
{
"epoch": 0.008546437871402818,
"grad_norm": 16.25,
"learning_rate": 2.384615384615385e-05,
"loss": 2.640866279602051,
"step": 32
},
{
"epoch": 0.008813514054884156,
"grad_norm": 20.5,
"learning_rate": 2.461538461538462e-05,
"loss": 2.843217134475708,
"step": 33
},
{
"epoch": 0.009080590238365494,
"grad_norm": 17.625,
"learning_rate": 2.5384615384615383e-05,
"loss": 2.504178047180176,
"step": 34
},
{
"epoch": 0.009347666421846831,
"grad_norm": 16.0,
"learning_rate": 2.6153846153846157e-05,
"loss": 2.5768792629241943,
"step": 35
},
{
"epoch": 0.00961474260532817,
"grad_norm": 16.875,
"learning_rate": 2.6923076923076923e-05,
"loss": 2.5246806144714355,
"step": 36
},
{
"epoch": 0.009881818788809508,
"grad_norm": 18.875,
"learning_rate": 2.7692307692307694e-05,
"loss": 2.6676347255706787,
"step": 37
},
{
"epoch": 0.010148894972290845,
"grad_norm": 17.875,
"learning_rate": 2.846153846153846e-05,
"loss": 2.667938470840454,
"step": 38
},
{
"epoch": 0.010415971155772185,
"grad_norm": 16.5,
"learning_rate": 2.9230769230769234e-05,
"loss": 2.4972121715545654,
"step": 39
},
{
"epoch": 0.010683047339253522,
"grad_norm": 16.625,
"learning_rate": 3e-05,
"loss": 2.6042871475219727,
"step": 40
},
{
"epoch": 0.01095012352273486,
"grad_norm": 16.5,
"learning_rate": 3.0769230769230774e-05,
"loss": 2.8169543743133545,
"step": 41
},
{
"epoch": 0.011217199706216199,
"grad_norm": 16.0,
"learning_rate": 3.153846153846154e-05,
"loss": 2.542125701904297,
"step": 42
},
{
"epoch": 0.011484275889697537,
"grad_norm": 13.9375,
"learning_rate": 3.230769230769231e-05,
"loss": 2.404881000518799,
"step": 43
},
{
"epoch": 0.011751352073178874,
"grad_norm": 15.5625,
"learning_rate": 3.307692307692308e-05,
"loss": 2.658536672592163,
"step": 44
},
{
"epoch": 0.012018428256660212,
"grad_norm": 13.6875,
"learning_rate": 3.384615384615385e-05,
"loss": 2.5229556560516357,
"step": 45
},
{
"epoch": 0.01228550444014155,
"grad_norm": 13.9375,
"learning_rate": 3.461538461538462e-05,
"loss": 2.4668819904327393,
"step": 46
},
{
"epoch": 0.012552580623622888,
"grad_norm": 13.6875,
"learning_rate": 3.538461538461539e-05,
"loss": 2.3786585330963135,
"step": 47
},
{
"epoch": 0.012819656807104226,
"grad_norm": 14.625,
"learning_rate": 3.615384615384615e-05,
"loss": 2.486743688583374,
"step": 48
},
{
"epoch": 0.013086732990585565,
"grad_norm": 14.0625,
"learning_rate": 3.692307692307693e-05,
"loss": 2.581742763519287,
"step": 49
},
{
"epoch": 0.013353809174066903,
"grad_norm": 12.3125,
"learning_rate": 3.769230769230769e-05,
"loss": 2.4105544090270996,
"step": 50
},
{
"epoch": 0.01362088535754824,
"grad_norm": 14.875,
"learning_rate": 3.846153846153846e-05,
"loss": 2.5906612873077393,
"step": 51
},
{
"epoch": 0.01388796154102958,
"grad_norm": 14.75,
"learning_rate": 3.923076923076923e-05,
"loss": 2.617494821548462,
"step": 52
},
{
"epoch": 0.014155037724510917,
"grad_norm": 13.8125,
"learning_rate": 4e-05,
"loss": 2.562443494796753,
"step": 53
},
{
"epoch": 0.014422113907992255,
"grad_norm": 14.1875,
"learning_rate": 4.0769230769230773e-05,
"loss": 2.5650522708892822,
"step": 54
},
{
"epoch": 0.014689190091473592,
"grad_norm": 15.375,
"learning_rate": 4.1538461538461544e-05,
"loss": 2.4969890117645264,
"step": 55
},
{
"epoch": 0.014956266274954931,
"grad_norm": 13.0625,
"learning_rate": 4.230769230769231e-05,
"loss": 2.533348321914673,
"step": 56
},
{
"epoch": 0.015223342458436269,
"grad_norm": 13.6875,
"learning_rate": 4.3076923076923084e-05,
"loss": 2.639575719833374,
"step": 57
},
{
"epoch": 0.015490418641917606,
"grad_norm": 12.3125,
"learning_rate": 4.384615384615385e-05,
"loss": 2.369950771331787,
"step": 58
},
{
"epoch": 0.015757494825398944,
"grad_norm": 11.8125,
"learning_rate": 4.461538461538462e-05,
"loss": 2.5953032970428467,
"step": 59
},
{
"epoch": 0.016024571008880283,
"grad_norm": 11.75,
"learning_rate": 4.538461538461539e-05,
"loss": 2.4076132774353027,
"step": 60
},
{
"epoch": 0.016291647192361623,
"grad_norm": 14.875,
"learning_rate": 4.615384615384616e-05,
"loss": 2.5904717445373535,
"step": 61
},
{
"epoch": 0.01655872337584296,
"grad_norm": 11.8125,
"learning_rate": 4.692307692307693e-05,
"loss": 2.411831855773926,
"step": 62
},
{
"epoch": 0.016825799559324298,
"grad_norm": 11.6875,
"learning_rate": 4.76923076923077e-05,
"loss": 2.5101966857910156,
"step": 63
},
{
"epoch": 0.017092875742805637,
"grad_norm": 11.125,
"learning_rate": 4.846153846153846e-05,
"loss": 2.2876455783843994,
"step": 64
},
{
"epoch": 0.017359951926286973,
"grad_norm": 10.25,
"learning_rate": 4.923076923076924e-05,
"loss": 2.278735399246216,
"step": 65
},
{
"epoch": 0.017627028109768312,
"grad_norm": 10.5,
"learning_rate": 5e-05,
"loss": 2.300778865814209,
"step": 66
},
{
"epoch": 0.017894104293249648,
"grad_norm": 14.375,
"learning_rate": 5.0769230769230766e-05,
"loss": 2.7023446559906006,
"step": 67
},
{
"epoch": 0.018161180476730987,
"grad_norm": 10.5625,
"learning_rate": 5.1538461538461536e-05,
"loss": 2.2479007244110107,
"step": 68
},
{
"epoch": 0.018428256660212326,
"grad_norm": 10.1875,
"learning_rate": 5.230769230769231e-05,
"loss": 2.2641005516052246,
"step": 69
},
{
"epoch": 0.018695332843693662,
"grad_norm": 10.875,
"learning_rate": 5.3076923076923076e-05,
"loss": 2.417478561401367,
"step": 70
},
{
"epoch": 0.018962409027175,
"grad_norm": 11.5625,
"learning_rate": 5.384615384615385e-05,
"loss": 2.4386911392211914,
"step": 71
},
{
"epoch": 0.01922948521065634,
"grad_norm": 11.5625,
"learning_rate": 5.461538461538461e-05,
"loss": 2.562483787536621,
"step": 72
},
{
"epoch": 0.019496561394137676,
"grad_norm": 10.8125,
"learning_rate": 5.538461538461539e-05,
"loss": 2.3742763996124268,
"step": 73
},
{
"epoch": 0.019763637577619016,
"grad_norm": 9.0625,
"learning_rate": 5.615384615384616e-05,
"loss": 2.1751370429992676,
"step": 74
},
{
"epoch": 0.020030713761100355,
"grad_norm": 10.3125,
"learning_rate": 5.692307692307692e-05,
"loss": 2.398951292037964,
"step": 75
},
{
"epoch": 0.02029778994458169,
"grad_norm": 9.4375,
"learning_rate": 5.769230769230769e-05,
"loss": 2.19356632232666,
"step": 76
},
{
"epoch": 0.02056486612806303,
"grad_norm": 9.0625,
"learning_rate": 5.846153846153847e-05,
"loss": 2.200453519821167,
"step": 77
},
{
"epoch": 0.02083194231154437,
"grad_norm": 10.25,
"learning_rate": 5.923076923076923e-05,
"loss": 2.3186240196228027,
"step": 78
},
{
"epoch": 0.021099018495025705,
"grad_norm": 9.625,
"learning_rate": 6e-05,
"loss": 2.2442519664764404,
"step": 79
},
{
"epoch": 0.021366094678507044,
"grad_norm": 9.0625,
"learning_rate": 6.0769230769230765e-05,
"loss": 2.2705352306365967,
"step": 80
},
{
"epoch": 0.021633170861988384,
"grad_norm": 9.125,
"learning_rate": 6.153846153846155e-05,
"loss": 2.2238214015960693,
"step": 81
},
{
"epoch": 0.02190024704546972,
"grad_norm": 9.5625,
"learning_rate": 6.23076923076923e-05,
"loss": 2.2311673164367676,
"step": 82
},
{
"epoch": 0.02216732322895106,
"grad_norm": 9.5625,
"learning_rate": 6.307692307692308e-05,
"loss": 2.2941524982452393,
"step": 83
},
{
"epoch": 0.022434399412432398,
"grad_norm": 9.8125,
"learning_rate": 6.384615384615385e-05,
"loss": 2.2252962589263916,
"step": 84
},
{
"epoch": 0.022701475595913734,
"grad_norm": 9.3125,
"learning_rate": 6.461538461538462e-05,
"loss": 2.204983949661255,
"step": 85
},
{
"epoch": 0.022968551779395073,
"grad_norm": 8.6875,
"learning_rate": 6.538461538461539e-05,
"loss": 2.074981212615967,
"step": 86
},
{
"epoch": 0.02323562796287641,
"grad_norm": 9.0625,
"learning_rate": 6.615384615384616e-05,
"loss": 2.0775859355926514,
"step": 87
},
{
"epoch": 0.023502704146357748,
"grad_norm": 9.6875,
"learning_rate": 6.692307692307693e-05,
"loss": 2.144122362136841,
"step": 88
},
{
"epoch": 0.023769780329839087,
"grad_norm": 8.4375,
"learning_rate": 6.76923076923077e-05,
"loss": 2.1126554012298584,
"step": 89
},
{
"epoch": 0.024036856513320423,
"grad_norm": 7.6875,
"learning_rate": 6.846153846153847e-05,
"loss": 1.9261810779571533,
"step": 90
},
{
"epoch": 0.024303932696801762,
"grad_norm": 8.0625,
"learning_rate": 6.923076923076924e-05,
"loss": 2.0227725505828857,
"step": 91
},
{
"epoch": 0.0245710088802831,
"grad_norm": 6.59375,
"learning_rate": 7e-05,
"loss": 1.9648834466934204,
"step": 92
},
{
"epoch": 0.024838085063764438,
"grad_norm": 6.59375,
"learning_rate": 7.076923076923078e-05,
"loss": 1.920790672302246,
"step": 93
},
{
"epoch": 0.025105161247245777,
"grad_norm": 6.3125,
"learning_rate": 7.153846153846155e-05,
"loss": 1.898984432220459,
"step": 94
},
{
"epoch": 0.025372237430727116,
"grad_norm": 5.5,
"learning_rate": 7.23076923076923e-05,
"loss": 1.9621495008468628,
"step": 95
},
{
"epoch": 0.025639313614208452,
"grad_norm": 5.25,
"learning_rate": 7.307692307692307e-05,
"loss": 1.930998682975769,
"step": 96
},
{
"epoch": 0.02590638979768979,
"grad_norm": 5.125,
"learning_rate": 7.384615384615386e-05,
"loss": 1.8902232646942139,
"step": 97
},
{
"epoch": 0.02617346598117113,
"grad_norm": 5.15625,
"learning_rate": 7.461538461538462e-05,
"loss": 1.916045904159546,
"step": 98
},
{
"epoch": 0.026440542164652466,
"grad_norm": 6.0,
"learning_rate": 7.538461538461539e-05,
"loss": 2.0126044750213623,
"step": 99
},
{
"epoch": 0.026707618348133805,
"grad_norm": 4.75,
"learning_rate": 7.615384615384616e-05,
"loss": 1.8163182735443115,
"step": 100
},
{
"epoch": 0.026974694531615145,
"grad_norm": 4.8125,
"learning_rate": 7.692307692307693e-05,
"loss": 1.8071495294570923,
"step": 101
},
{
"epoch": 0.02724177071509648,
"grad_norm": 4.8125,
"learning_rate": 7.76923076923077e-05,
"loss": 2.030604362487793,
"step": 102
},
{
"epoch": 0.02750884689857782,
"grad_norm": 4.78125,
"learning_rate": 7.846153846153847e-05,
"loss": 1.9025654792785645,
"step": 103
},
{
"epoch": 0.02777592308205916,
"grad_norm": 4.5,
"learning_rate": 7.923076923076924e-05,
"loss": 1.9143315553665161,
"step": 104
},
{
"epoch": 0.028042999265540495,
"grad_norm": 3.828125,
"learning_rate": 8e-05,
"loss": 1.839133858680725,
"step": 105
},
{
"epoch": 0.028310075449021834,
"grad_norm": 4.0625,
"learning_rate": 8.076923076923078e-05,
"loss": 1.8787554502487183,
"step": 106
},
{
"epoch": 0.02857715163250317,
"grad_norm": 3.375,
"learning_rate": 8.153846153846155e-05,
"loss": 1.7920942306518555,
"step": 107
},
{
"epoch": 0.02884422781598451,
"grad_norm": 3.671875,
"learning_rate": 8.23076923076923e-05,
"loss": 1.8200159072875977,
"step": 108
},
{
"epoch": 0.02911130399946585,
"grad_norm": 3.234375,
"learning_rate": 8.307692307692309e-05,
"loss": 1.7615149021148682,
"step": 109
},
{
"epoch": 0.029378380182947184,
"grad_norm": 3.0,
"learning_rate": 8.384615384615386e-05,
"loss": 1.7836936712265015,
"step": 110
},
{
"epoch": 0.029645456366428524,
"grad_norm": 3.0625,
"learning_rate": 8.461538461538461e-05,
"loss": 1.7257531881332397,
"step": 111
},
{
"epoch": 0.029912532549909863,
"grad_norm": 2.96875,
"learning_rate": 8.538461538461538e-05,
"loss": 1.894051194190979,
"step": 112
},
{
"epoch": 0.0301796087333912,
"grad_norm": 3.078125,
"learning_rate": 8.615384615384617e-05,
"loss": 1.7538135051727295,
"step": 113
},
{
"epoch": 0.030446684916872538,
"grad_norm": 3.328125,
"learning_rate": 8.692307692307692e-05,
"loss": 1.7873612642288208,
"step": 114
},
{
"epoch": 0.030713761100353877,
"grad_norm": 2.671875,
"learning_rate": 8.76923076923077e-05,
"loss": 1.692636489868164,
"step": 115
},
{
"epoch": 0.030980837283835213,
"grad_norm": 2.765625,
"learning_rate": 8.846153846153847e-05,
"loss": 1.7532554864883423,
"step": 116
},
{
"epoch": 0.031247913467316552,
"grad_norm": 2.15625,
"learning_rate": 8.923076923076924e-05,
"loss": 1.5472298860549927,
"step": 117
},
{
"epoch": 0.03151498965079789,
"grad_norm": 2.296875,
"learning_rate": 9e-05,
"loss": 1.7773547172546387,
"step": 118
},
{
"epoch": 0.03178206583427923,
"grad_norm": 2.625,
"learning_rate": 9.076923076923078e-05,
"loss": 1.756500482559204,
"step": 119
},
{
"epoch": 0.03204914201776057,
"grad_norm": 2.359375,
"learning_rate": 9.153846153846155e-05,
"loss": 1.7489063739776611,
"step": 120
},
{
"epoch": 0.032316218201241906,
"grad_norm": 2.203125,
"learning_rate": 9.230769230769232e-05,
"loss": 1.734527349472046,
"step": 121
},
{
"epoch": 0.032583294384723245,
"grad_norm": 2.34375,
"learning_rate": 9.307692307692309e-05,
"loss": 1.7665072679519653,
"step": 122
},
{
"epoch": 0.03285037056820458,
"grad_norm": 2.125,
"learning_rate": 9.384615384615386e-05,
"loss": 1.7649085521697998,
"step": 123
},
{
"epoch": 0.03311744675168592,
"grad_norm": 2.03125,
"learning_rate": 9.461538461538461e-05,
"loss": 1.7251827716827393,
"step": 124
},
{
"epoch": 0.033384522935167256,
"grad_norm": 2.0625,
"learning_rate": 9.53846153846154e-05,
"loss": 1.6280146837234497,
"step": 125
},
{
"epoch": 0.033651599118648595,
"grad_norm": 1.9140625,
"learning_rate": 9.615384615384617e-05,
"loss": 1.700337290763855,
"step": 126
},
{
"epoch": 0.033918675302129934,
"grad_norm": 1.9765625,
"learning_rate": 9.692307692307692e-05,
"loss": 1.711737871170044,
"step": 127
},
{
"epoch": 0.034185751485611274,
"grad_norm": 1.875,
"learning_rate": 9.76923076923077e-05,
"loss": 1.6615790128707886,
"step": 128
},
{
"epoch": 0.034452827669092606,
"grad_norm": 1.78125,
"learning_rate": 9.846153846153848e-05,
"loss": 1.6334642171859741,
"step": 129
},
{
"epoch": 0.034719903852573945,
"grad_norm": 1.9140625,
"learning_rate": 9.923076923076923e-05,
"loss": 1.6756442785263062,
"step": 130
},
{
"epoch": 0.034986980036055285,
"grad_norm": 1.703125,
"learning_rate": 0.0001,
"loss": 1.623317003250122,
"step": 131
},
{
"epoch": 0.035254056219536624,
"grad_norm": 1.9296875,
"learning_rate": 0.0001,
"loss": 1.7026211023330688,
"step": 132
},
{
"epoch": 0.03552113240301796,
"grad_norm": 1.7265625,
"learning_rate": 0.0001,
"loss": 1.6647707223892212,
"step": 133
},
{
"epoch": 0.035788208586499295,
"grad_norm": 1.9375,
"learning_rate": 0.0001,
"loss": 1.75358247756958,
"step": 134
},
{
"epoch": 0.036055284769980635,
"grad_norm": 1.5625,
"learning_rate": 0.0001,
"loss": 1.631664752960205,
"step": 135
},
{
"epoch": 0.036322360953461974,
"grad_norm": 1.484375,
"learning_rate": 0.0001,
"loss": 1.6193870306015015,
"step": 136
},
{
"epoch": 0.03658943713694331,
"grad_norm": 1.5234375,
"learning_rate": 0.0001,
"loss": 1.6243921518325806,
"step": 137
},
{
"epoch": 0.03685651332042465,
"grad_norm": 1.4296875,
"learning_rate": 0.0001,
"loss": 1.6910765171051025,
"step": 138
},
{
"epoch": 0.03712358950390599,
"grad_norm": 1.3984375,
"learning_rate": 0.0001,
"loss": 1.617536187171936,
"step": 139
},
{
"epoch": 0.037390665687387324,
"grad_norm": 1.3125,
"learning_rate": 0.0001,
"loss": 1.4483500719070435,
"step": 140
},
{
"epoch": 0.03765774187086866,
"grad_norm": 1.421875,
"learning_rate": 0.0001,
"loss": 1.6344833374023438,
"step": 141
},
{
"epoch": 0.03792481805435,
"grad_norm": 1.328125,
"learning_rate": 0.0001,
"loss": 1.6913228034973145,
"step": 142
},
{
"epoch": 0.03819189423783134,
"grad_norm": 1.34375,
"learning_rate": 0.0001,
"loss": 1.598616123199463,
"step": 143
},
{
"epoch": 0.03845897042131268,
"grad_norm": 1.3359375,
"learning_rate": 0.0001,
"loss": 1.705214500427246,
"step": 144
},
{
"epoch": 0.03872604660479402,
"grad_norm": 1.1953125,
"learning_rate": 0.0001,
"loss": 1.574364423751831,
"step": 145
},
{
"epoch": 0.03899312278827535,
"grad_norm": 1.3671875,
"learning_rate": 0.0001,
"loss": 1.6656533479690552,
"step": 146
},
{
"epoch": 0.03926019897175669,
"grad_norm": 1.265625,
"learning_rate": 0.0001,
"loss": 1.5525474548339844,
"step": 147
},
{
"epoch": 0.03952727515523803,
"grad_norm": 1.234375,
"learning_rate": 0.0001,
"loss": 1.6093608140945435,
"step": 148
},
{
"epoch": 0.03979435133871937,
"grad_norm": 1.21875,
"learning_rate": 0.0001,
"loss": 1.5538296699523926,
"step": 149
},
{
"epoch": 0.04006142752220071,
"grad_norm": 1.1328125,
"learning_rate": 0.0001,
"loss": 1.566986083984375,
"step": 150
},
{
"epoch": 0.04032850370568205,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 1.4673757553100586,
"step": 151
},
{
"epoch": 0.04059557988916338,
"grad_norm": 1.125,
"learning_rate": 0.0001,
"loss": 1.5815367698669434,
"step": 152
},
{
"epoch": 0.04086265607264472,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 1.5937564373016357,
"step": 153
},
{
"epoch": 0.04112973225612606,
"grad_norm": 1.25,
"learning_rate": 0.0001,
"loss": 1.5485172271728516,
"step": 154
},
{
"epoch": 0.0413968084396074,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 1.5556213855743408,
"step": 155
},
{
"epoch": 0.04166388462308874,
"grad_norm": 1.1015625,
"learning_rate": 0.0001,
"loss": 1.5599007606506348,
"step": 156
},
{
"epoch": 0.04193096080657007,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 1.5992523431777954,
"step": 157
},
{
"epoch": 0.04219803699005141,
"grad_norm": 0.984375,
"learning_rate": 0.0001,
"loss": 1.5620003938674927,
"step": 158
},
{
"epoch": 0.04246511317353275,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 1.5492624044418335,
"step": 159
},
{
"epoch": 0.04273218935701409,
"grad_norm": 0.984375,
"learning_rate": 0.0001,
"loss": 1.6017942428588867,
"step": 160
},
{
"epoch": 0.04299926554049543,
"grad_norm": 1.09375,
"learning_rate": 0.0001,
"loss": 1.5750805139541626,
"step": 161
},
{
"epoch": 0.04326634172397677,
"grad_norm": 1.0546875,
"learning_rate": 0.0001,
"loss": 1.761667013168335,
"step": 162
},
{
"epoch": 0.0435334179074581,
"grad_norm": 0.98046875,
"learning_rate": 0.0001,
"loss": 1.6303234100341797,
"step": 163
},
{
"epoch": 0.04380049409093944,
"grad_norm": 0.95703125,
"learning_rate": 0.0001,
"loss": 1.5894020795822144,
"step": 164
},
{
"epoch": 0.04406757027442078,
"grad_norm": 0.91015625,
"learning_rate": 0.0001,
"loss": 1.569838047027588,
"step": 165
},
{
"epoch": 0.04433464645790212,
"grad_norm": 1.0234375,
"learning_rate": 0.0001,
"loss": 1.4858709573745728,
"step": 166
},
{
"epoch": 0.04460172264138346,
"grad_norm": 0.95703125,
"learning_rate": 0.0001,
"loss": 1.6576966047286987,
"step": 167
},
{
"epoch": 0.044868798824864796,
"grad_norm": 1.0390625,
"learning_rate": 0.0001,
"loss": 1.5174238681793213,
"step": 168
},
{
"epoch": 0.04513587500834613,
"grad_norm": 0.89453125,
"learning_rate": 0.0001,
"loss": 1.5932589769363403,
"step": 169
},
{
"epoch": 0.04540295119182747,
"grad_norm": 0.89453125,
"learning_rate": 0.0001,
"loss": 1.3785005807876587,
"step": 170
},
{
"epoch": 0.04567002737530881,
"grad_norm": 0.93359375,
"learning_rate": 0.0001,
"loss": 1.491324782371521,
"step": 171
},
{
"epoch": 0.045937103558790146,
"grad_norm": 0.921875,
"learning_rate": 0.0001,
"loss": 1.588956594467163,
"step": 172
},
{
"epoch": 0.046204179742271485,
"grad_norm": 0.87890625,
"learning_rate": 0.0001,
"loss": 1.4634578227996826,
"step": 173
},
{
"epoch": 0.04647125592575282,
"grad_norm": 1.0703125,
"learning_rate": 0.0001,
"loss": 1.520629644393921,
"step": 174
},
{
"epoch": 0.04673833210923416,
"grad_norm": 0.84765625,
"learning_rate": 0.0001,
"loss": 1.5000630617141724,
"step": 175
},
{
"epoch": 0.047005408292715496,
"grad_norm": 1.0390625,
"learning_rate": 0.0001,
"loss": 1.632713794708252,
"step": 176
},
{
"epoch": 0.047272484476196835,
"grad_norm": 0.9609375,
"learning_rate": 0.0001,
"loss": 1.5651711225509644,
"step": 177
},
{
"epoch": 0.047539560659678175,
"grad_norm": 0.87109375,
"learning_rate": 0.0001,
"loss": 1.5135159492492676,
"step": 178
},
{
"epoch": 0.047806636843159514,
"grad_norm": 0.875,
"learning_rate": 0.0001,
"loss": 1.4664947986602783,
"step": 179
},
{
"epoch": 0.048073713026640846,
"grad_norm": 0.8515625,
"learning_rate": 0.0001,
"loss": 1.5288572311401367,
"step": 180
},
{
"epoch": 0.048340789210122186,
"grad_norm": 0.90234375,
"learning_rate": 0.0001,
"loss": 1.4387882947921753,
"step": 181
},
{
"epoch": 0.048607865393603525,
"grad_norm": 0.8359375,
"learning_rate": 0.0001,
"loss": 1.5431747436523438,
"step": 182
},
{
"epoch": 0.048874941577084864,
"grad_norm": 0.84765625,
"learning_rate": 0.0001,
"loss": 1.58133065700531,
"step": 183
},
{
"epoch": 0.0491420177605662,
"grad_norm": 0.8203125,
"learning_rate": 0.0001,
"loss": 1.5671203136444092,
"step": 184
},
{
"epoch": 0.04940909394404754,
"grad_norm": 0.796875,
"learning_rate": 0.0001,
"loss": 1.5029916763305664,
"step": 185
},
{
"epoch": 0.049676170127528875,
"grad_norm": 0.8203125,
"learning_rate": 0.0001,
"loss": 1.5585753917694092,
"step": 186
},
{
"epoch": 0.049943246311010214,
"grad_norm": 0.859375,
"learning_rate": 0.0001,
"loss": 1.4746276140213013,
"step": 187
},
{
"epoch": 0.050210322494491554,
"grad_norm": 0.828125,
"learning_rate": 0.0001,
"loss": 1.4432883262634277,
"step": 188
},
{
"epoch": 0.05047739867797289,
"grad_norm": 0.9296875,
"learning_rate": 0.0001,
"loss": 1.472724437713623,
"step": 189
},
{
"epoch": 0.05074447486145423,
"grad_norm": 0.8203125,
"learning_rate": 0.0001,
"loss": 1.4479737281799316,
"step": 190
},
{
"epoch": 0.051011551044935564,
"grad_norm": 0.80859375,
"learning_rate": 0.0001,
"loss": 1.4101297855377197,
"step": 191
},
{
"epoch": 0.051278627228416904,
"grad_norm": 0.91796875,
"learning_rate": 0.0001,
"loss": 1.5154465436935425,
"step": 192
},
{
"epoch": 0.05154570341189824,
"grad_norm": 0.82421875,
"learning_rate": 0.0001,
"loss": 1.5523278713226318,
"step": 193
},
{
"epoch": 0.05181277959537958,
"grad_norm": 0.83203125,
"learning_rate": 0.0001,
"loss": 1.5290699005126953,
"step": 194
},
{
"epoch": 0.05207985577886092,
"grad_norm": 0.8515625,
"learning_rate": 0.0001,
"loss": 1.527392029762268,
"step": 195
},
{
"epoch": 0.05234693196234226,
"grad_norm": 0.7421875,
"learning_rate": 0.0001,
"loss": 1.5038968324661255,
"step": 196
},
{
"epoch": 0.05261400814582359,
"grad_norm": 0.79296875,
"learning_rate": 0.0001,
"loss": 1.4845222234725952,
"step": 197
},
{
"epoch": 0.05288108432930493,
"grad_norm": 0.77734375,
"learning_rate": 0.0001,
"loss": 1.513843059539795,
"step": 198
},
{
"epoch": 0.05314816051278627,
"grad_norm": 0.82421875,
"learning_rate": 0.0001,
"loss": 1.4493604898452759,
"step": 199
},
{
"epoch": 0.05341523669626761,
"grad_norm": 0.796875,
"learning_rate": 0.0001,
"loss": 1.3438012599945068,
"step": 200
},
{
"epoch": 0.05368231287974895,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 1.463066577911377,
"step": 201
},
{
"epoch": 0.05394938906323029,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 1.492901086807251,
"step": 202
},
{
"epoch": 0.05421646524671162,
"grad_norm": 0.734375,
"learning_rate": 0.0001,
"loss": 1.4638193845748901,
"step": 203
},
{
"epoch": 0.05448354143019296,
"grad_norm": 0.70703125,
"learning_rate": 0.0001,
"loss": 1.4464643001556396,
"step": 204
},
{
"epoch": 0.0547506176136743,
"grad_norm": 0.75390625,
"learning_rate": 0.0001,
"loss": 1.5037822723388672,
"step": 205
},
{
"epoch": 0.05501769379715564,
"grad_norm": 0.7421875,
"learning_rate": 0.0001,
"loss": 1.544986367225647,
"step": 206
},
{
"epoch": 0.05528476998063698,
"grad_norm": 0.796875,
"learning_rate": 0.0001,
"loss": 1.500586986541748,
"step": 207
},
{
"epoch": 0.05555184616411832,
"grad_norm": 0.68359375,
"learning_rate": 0.0001,
"loss": 1.463313341140747,
"step": 208
},
{
"epoch": 0.05581892234759965,
"grad_norm": 0.71484375,
"learning_rate": 0.0001,
"loss": 1.5014681816101074,
"step": 209
},
{
"epoch": 0.05608599853108099,
"grad_norm": 0.6875,
"learning_rate": 0.0001,
"loss": 1.4201849699020386,
"step": 210
},
{
"epoch": 0.05635307471456233,
"grad_norm": 0.7109375,
"learning_rate": 0.0001,
"loss": 1.4925004243850708,
"step": 211
},
{
"epoch": 0.05662015089804367,
"grad_norm": 0.73046875,
"learning_rate": 0.0001,
"loss": 1.509586215019226,
"step": 212
},
{
"epoch": 0.05688722708152501,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.332578420639038,
"step": 213
},
{
"epoch": 0.05715430326500634,
"grad_norm": 0.73046875,
"learning_rate": 0.0001,
"loss": 1.4592535495758057,
"step": 214
},
{
"epoch": 0.05742137944848768,
"grad_norm": 0.7421875,
"learning_rate": 0.0001,
"loss": 1.4789878129959106,
"step": 215
},
{
"epoch": 0.05768845563196902,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 1.5233962535858154,
"step": 216
},
{
"epoch": 0.05795553181545036,
"grad_norm": 0.703125,
"learning_rate": 0.0001,
"loss": 1.587152123451233,
"step": 217
},
{
"epoch": 0.0582226079989317,
"grad_norm": 0.75,
"learning_rate": 0.0001,
"loss": 1.3452866077423096,
"step": 218
},
{
"epoch": 0.058489684182413036,
"grad_norm": 0.703125,
"learning_rate": 0.0001,
"loss": 1.5141767263412476,
"step": 219
},
{
"epoch": 0.05875676036589437,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 1.39532470703125,
"step": 220
},
{
"epoch": 0.05902383654937571,
"grad_norm": 0.70703125,
"learning_rate": 0.0001,
"loss": 1.5537728071212769,
"step": 221
},
{
"epoch": 0.05929091273285705,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.3248519897460938,
"step": 222
},
{
"epoch": 0.059557988916338386,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.4280136823654175,
"step": 223
},
{
"epoch": 0.059825065099819726,
"grad_norm": 0.67578125,
"learning_rate": 0.0001,
"loss": 1.590162754058838,
"step": 224
},
{
"epoch": 0.060092141283301065,
"grad_norm": 0.68359375,
"learning_rate": 0.0001,
"loss": 1.5274425745010376,
"step": 225
},
{
"epoch": 0.0603592174667824,
"grad_norm": 0.69140625,
"learning_rate": 0.0001,
"loss": 1.538464903831482,
"step": 226
},
{
"epoch": 0.060626293650263736,
"grad_norm": 0.67578125,
"learning_rate": 0.0001,
"loss": 1.43494713306427,
"step": 227
},
{
"epoch": 0.060893369833745076,
"grad_norm": 0.83984375,
"learning_rate": 0.0001,
"loss": 1.3913284540176392,
"step": 228
},
{
"epoch": 0.061160446017226415,
"grad_norm": 0.6953125,
"learning_rate": 0.0001,
"loss": 1.4944510459899902,
"step": 229
},
{
"epoch": 0.061427522200707754,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.384867548942566,
"step": 230
},
{
"epoch": 0.06169459838418909,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.421984314918518,
"step": 231
},
{
"epoch": 0.061961674567670426,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 1.52085280418396,
"step": 232
},
{
"epoch": 0.062228750751151765,
"grad_norm": 0.7109375,
"learning_rate": 0.0001,
"loss": 1.5080655813217163,
"step": 233
},
{
"epoch": 0.062495826934633104,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 1.5477678775787354,
"step": 234
},
{
"epoch": 0.06276290311811444,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 1.577529788017273,
"step": 235
},
{
"epoch": 0.06302997930159578,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 1.4905290603637695,
"step": 236
},
{
"epoch": 0.06329705548507712,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 1.4339290857315063,
"step": 237
},
{
"epoch": 0.06356413166855845,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.4818267822265625,
"step": 238
},
{
"epoch": 0.0638312078520398,
"grad_norm": 0.69921875,
"learning_rate": 0.0001,
"loss": 1.484571099281311,
"step": 239
},
{
"epoch": 0.06409828403552113,
"grad_norm": 0.7265625,
"learning_rate": 0.0001,
"loss": 1.4844155311584473,
"step": 240
},
{
"epoch": 0.06436536021900247,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.3982433080673218,
"step": 241
},
{
"epoch": 0.06463243640248381,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 1.3840627670288086,
"step": 242
},
{
"epoch": 0.06489951258596514,
"grad_norm": 1.5,
"learning_rate": 0.0001,
"loss": 1.516506552696228,
"step": 243
},
{
"epoch": 0.06516658876944649,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.381393551826477,
"step": 244
},
{
"epoch": 0.06543366495292782,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 1.461839199066162,
"step": 245
},
{
"epoch": 0.06570074113640915,
"grad_norm": 0.828125,
"learning_rate": 0.0001,
"loss": 1.5335147380828857,
"step": 246
},
{
"epoch": 0.0659678173198905,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.5478235483169556,
"step": 247
},
{
"epoch": 0.06623489350337183,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.376564383506775,
"step": 248
},
{
"epoch": 0.06650196968685318,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3668392896652222,
"step": 249
},
{
"epoch": 0.06676904587033451,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.4300906658172607,
"step": 250
},
{
"epoch": 0.06703612205381584,
"grad_norm": 0.68359375,
"learning_rate": 0.0001,
"loss": 1.4604257345199585,
"step": 251
},
{
"epoch": 0.06730319823729719,
"grad_norm": 0.64453125,
"learning_rate": 0.0001,
"loss": 1.4916373491287231,
"step": 252
},
{
"epoch": 0.06757027442077852,
"grad_norm": 0.68359375,
"learning_rate": 0.0001,
"loss": 1.5384293794631958,
"step": 253
},
{
"epoch": 0.06783735060425987,
"grad_norm": 0.64453125,
"learning_rate": 0.0001,
"loss": 1.4481867551803589,
"step": 254
},
{
"epoch": 0.0681044267877412,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3729946613311768,
"step": 255
},
{
"epoch": 0.06837150297122255,
"grad_norm": 0.7265625,
"learning_rate": 0.0001,
"loss": 1.4905198812484741,
"step": 256
},
{
"epoch": 0.06863857915470388,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.3511143922805786,
"step": 257
},
{
"epoch": 0.06890565533818521,
"grad_norm": 0.6875,
"learning_rate": 0.0001,
"loss": 1.546562671661377,
"step": 258
},
{
"epoch": 0.06917273152166656,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.4856135845184326,
"step": 259
},
{
"epoch": 0.06943980770514789,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.5204493999481201,
"step": 260
},
{
"epoch": 0.06970688388862924,
"grad_norm": 0.70703125,
"learning_rate": 0.0001,
"loss": 1.496220350265503,
"step": 261
},
{
"epoch": 0.06997396007211057,
"grad_norm": 0.7109375,
"learning_rate": 0.0001,
"loss": 1.453678846359253,
"step": 262
},
{
"epoch": 0.0702410362555919,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4709477424621582,
"step": 263
},
{
"epoch": 0.07050811243907325,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.420995831489563,
"step": 264
},
{
"epoch": 0.07077518862255458,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.4407060146331787,
"step": 265
},
{
"epoch": 0.07104226480603593,
"grad_norm": 0.8203125,
"learning_rate": 0.0001,
"loss": 1.4003022909164429,
"step": 266
},
{
"epoch": 0.07130934098951726,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.4565346240997314,
"step": 267
},
{
"epoch": 0.07157641717299859,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4498852491378784,
"step": 268
},
{
"epoch": 0.07184349335647994,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 1.437258243560791,
"step": 269
},
{
"epoch": 0.07211056953996127,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4084056615829468,
"step": 270
},
{
"epoch": 0.07237764572344262,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 1.410775899887085,
"step": 271
},
{
"epoch": 0.07264472190692395,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.4345930814743042,
"step": 272
},
{
"epoch": 0.0729117980904053,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 1.431238055229187,
"step": 273
},
{
"epoch": 0.07317887427388663,
"grad_norm": 0.68359375,
"learning_rate": 0.0001,
"loss": 1.491136074066162,
"step": 274
},
{
"epoch": 0.07344595045736796,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.3734588623046875,
"step": 275
},
{
"epoch": 0.0737130266408493,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3641507625579834,
"step": 276
},
{
"epoch": 0.07398010282433064,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.499869704246521,
"step": 277
},
{
"epoch": 0.07424717900781198,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.4527770280838013,
"step": 278
},
{
"epoch": 0.07451425519129332,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 1.450440526008606,
"step": 279
},
{
"epoch": 0.07478133137477465,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3640730381011963,
"step": 280
},
{
"epoch": 0.075048407558256,
"grad_norm": 0.7265625,
"learning_rate": 0.0001,
"loss": 1.3600690364837646,
"step": 281
},
{
"epoch": 0.07531548374173733,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.3847788572311401,
"step": 282
},
{
"epoch": 0.07558255992521867,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.471301555633545,
"step": 283
},
{
"epoch": 0.0758496361087,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.3216960430145264,
"step": 284
},
{
"epoch": 0.07611671229218135,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.4935619831085205,
"step": 285
},
{
"epoch": 0.07638378847566268,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.4458503723144531,
"step": 286
},
{
"epoch": 0.07665086465914402,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3823765516281128,
"step": 287
},
{
"epoch": 0.07691794084262536,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.43092942237854,
"step": 288
},
{
"epoch": 0.0771850170261067,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.2901136875152588,
"step": 289
},
{
"epoch": 0.07745209320958804,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4777882099151611,
"step": 290
},
{
"epoch": 0.07771916939306937,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3023655414581299,
"step": 291
},
{
"epoch": 0.0779862455765507,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.4468395709991455,
"step": 292
},
{
"epoch": 0.07825332176003205,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.5395652055740356,
"step": 293
},
{
"epoch": 0.07852039794351338,
"grad_norm": 0.6484375,
"learning_rate": 0.0001,
"loss": 1.5038352012634277,
"step": 294
},
{
"epoch": 0.07878747412699473,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.4570817947387695,
"step": 295
},
{
"epoch": 0.07905455031047606,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 1.4961787462234497,
"step": 296
},
{
"epoch": 0.0793216264939574,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 1.413039207458496,
"step": 297
},
{
"epoch": 0.07958870267743874,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.4544329643249512,
"step": 298
},
{
"epoch": 0.07985577886092007,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.4016033411026,
"step": 299
},
{
"epoch": 0.08012285504440142,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 1.4141845703125,
"step": 300
},
{
"epoch": 0.08038993122788275,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.300679326057434,
"step": 301
},
{
"epoch": 0.0806570074113641,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.4802281856536865,
"step": 302
},
{
"epoch": 0.08092408359484543,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 1.4601387977600098,
"step": 303
},
{
"epoch": 0.08119115977832676,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3691799640655518,
"step": 304
},
{
"epoch": 0.08145823596180811,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.4916398525238037,
"step": 305
},
{
"epoch": 0.08172531214528944,
"grad_norm": 0.734375,
"learning_rate": 0.0001,
"loss": 1.3630765676498413,
"step": 306
},
{
"epoch": 0.08199238832877079,
"grad_norm": 0.6875,
"learning_rate": 0.0001,
"loss": 1.3123859167099,
"step": 307
},
{
"epoch": 0.08225946451225212,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.4827555418014526,
"step": 308
},
{
"epoch": 0.08252654069573345,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.3617500066757202,
"step": 309
},
{
"epoch": 0.0827936168792148,
"grad_norm": 0.69140625,
"learning_rate": 0.0001,
"loss": 1.5043036937713623,
"step": 310
},
{
"epoch": 0.08306069306269613,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.49618399143219,
"step": 311
},
{
"epoch": 0.08332776924617748,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.5405969619750977,
"step": 312
},
{
"epoch": 0.08359484542965881,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.458032488822937,
"step": 313
},
{
"epoch": 0.08386192161314014,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.3159329891204834,
"step": 314
},
{
"epoch": 0.08412899779662149,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.497544765472412,
"step": 315
},
{
"epoch": 0.08439607398010282,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.4141039848327637,
"step": 316
},
{
"epoch": 0.08466315016358417,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 1.4044368267059326,
"step": 317
},
{
"epoch": 0.0849302263470655,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.4094866514205933,
"step": 318
},
{
"epoch": 0.08519730253054685,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.4708569049835205,
"step": 319
},
{
"epoch": 0.08546437871402818,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.490856409072876,
"step": 320
},
{
"epoch": 0.08573145489750951,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 1.4941420555114746,
"step": 321
},
{
"epoch": 0.08599853108099086,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.4737300872802734,
"step": 322
},
{
"epoch": 0.08626560726447219,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 1.4835773706436157,
"step": 323
},
{
"epoch": 0.08653268344795353,
"grad_norm": 0.6484375,
"learning_rate": 0.0001,
"loss": 1.3013849258422852,
"step": 324
},
{
"epoch": 0.08679975963143487,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.456613540649414,
"step": 325
},
{
"epoch": 0.0870668358149162,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4825116395950317,
"step": 326
},
{
"epoch": 0.08733391199839755,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.5193110704421997,
"step": 327
},
{
"epoch": 0.08760098818187888,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.4921228885650635,
"step": 328
},
{
"epoch": 0.08786806436536022,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.5096917152404785,
"step": 329
},
{
"epoch": 0.08813514054884156,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 1.40569269657135,
"step": 330
},
{
"epoch": 0.08840221673232289,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.3891390562057495,
"step": 331
},
{
"epoch": 0.08866929291580423,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.431661605834961,
"step": 332
},
{
"epoch": 0.08893636909928557,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 1.3885363340377808,
"step": 333
},
{
"epoch": 0.08920344528276691,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.4512131214141846,
"step": 334
},
{
"epoch": 0.08947052146624825,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.404493808746338,
"step": 335
},
{
"epoch": 0.08973759764972959,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.3655177354812622,
"step": 336
},
{
"epoch": 0.09000467383321092,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.2801893949508667,
"step": 337
},
{
"epoch": 0.09027175001669226,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 1.470276117324829,
"step": 338
},
{
"epoch": 0.0905388262001736,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.4459853172302246,
"step": 339
},
{
"epoch": 0.09080590238365494,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.4520962238311768,
"step": 340
},
{
"epoch": 0.09107297856713628,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.3865824937820435,
"step": 341
},
{
"epoch": 0.09134005475061761,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3370836973190308,
"step": 342
},
{
"epoch": 0.09160713093409895,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4441577196121216,
"step": 343
},
{
"epoch": 0.09187420711758029,
"grad_norm": 0.64453125,
"learning_rate": 0.0001,
"loss": 1.4073468446731567,
"step": 344
},
{
"epoch": 0.09214128330106162,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.3208041191101074,
"step": 345
},
{
"epoch": 0.09240835948454297,
"grad_norm": 0.66796875,
"learning_rate": 0.0001,
"loss": 1.5300800800323486,
"step": 346
},
{
"epoch": 0.0926754356680243,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4317735433578491,
"step": 347
},
{
"epoch": 0.09294251185150564,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.1871222257614136,
"step": 348
},
{
"epoch": 0.09320958803498698,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3505630493164062,
"step": 349
},
{
"epoch": 0.09347666421846831,
"grad_norm": 0.64453125,
"learning_rate": 0.0001,
"loss": 1.4304125308990479,
"step": 350
},
{
"epoch": 0.09374374040194966,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4102681875228882,
"step": 351
},
{
"epoch": 0.09401081658543099,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.49142324924469,
"step": 352
},
{
"epoch": 0.09427789276891234,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3962408304214478,
"step": 353
},
{
"epoch": 0.09454496895239367,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.273288607597351,
"step": 354
},
{
"epoch": 0.094812045135875,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.4168131351470947,
"step": 355
},
{
"epoch": 0.09507912131935635,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.3748528957366943,
"step": 356
},
{
"epoch": 0.09534619750283768,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3892818689346313,
"step": 357
},
{
"epoch": 0.09561327368631903,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.453303575515747,
"step": 358
},
{
"epoch": 0.09588034986980036,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.4077439308166504,
"step": 359
},
{
"epoch": 0.09614742605328169,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.4063971042633057,
"step": 360
},
{
"epoch": 0.09641450223676304,
"grad_norm": 0.703125,
"learning_rate": 0.0001,
"loss": 1.4442360401153564,
"step": 361
},
{
"epoch": 0.09668157842024437,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4110056161880493,
"step": 362
},
{
"epoch": 0.09694865460372572,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.5454267263412476,
"step": 363
},
{
"epoch": 0.09721573078720705,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4862934350967407,
"step": 364
},
{
"epoch": 0.09748280697068838,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.4219492673873901,
"step": 365
},
{
"epoch": 0.09774988315416973,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.4686615467071533,
"step": 366
},
{
"epoch": 0.09801695933765106,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3849600553512573,
"step": 367
},
{
"epoch": 0.0982840355211324,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.442658543586731,
"step": 368
},
{
"epoch": 0.09855111170461374,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3929274082183838,
"step": 369
},
{
"epoch": 0.09881818788809509,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.511911392211914,
"step": 370
},
{
"epoch": 0.09908526407157642,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.3419064283370972,
"step": 371
},
{
"epoch": 0.09935234025505775,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.3798582553863525,
"step": 372
},
{
"epoch": 0.0996194164385391,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4674209356307983,
"step": 373
},
{
"epoch": 0.09988649262202043,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4218908548355103,
"step": 374
},
{
"epoch": 0.10015356880550177,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.341963529586792,
"step": 375
},
{
"epoch": 0.10042064498898311,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.4310853481292725,
"step": 376
},
{
"epoch": 0.10068772117246444,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.4752094745635986,
"step": 377
},
{
"epoch": 0.10095479735594579,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.3446142673492432,
"step": 378
},
{
"epoch": 0.10122187353942712,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4467658996582031,
"step": 379
},
{
"epoch": 0.10148894972290846,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4706642627716064,
"step": 380
},
{
"epoch": 0.1017560259063898,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4760010242462158,
"step": 381
},
{
"epoch": 0.10202310208987113,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4614394903182983,
"step": 382
},
{
"epoch": 0.10229017827335248,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.48732590675354,
"step": 383
},
{
"epoch": 0.10255725445683381,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 1.4869389533996582,
"step": 384
},
{
"epoch": 0.10282433064031515,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.455293893814087,
"step": 385
},
{
"epoch": 0.10309140682379649,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.412632703781128,
"step": 386
},
{
"epoch": 0.10335848300727783,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.4052395820617676,
"step": 387
},
{
"epoch": 0.10362555919075916,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3353341817855835,
"step": 388
},
{
"epoch": 0.1038926353742405,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3652608394622803,
"step": 389
},
{
"epoch": 0.10415971155772184,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3833329677581787,
"step": 390
},
{
"epoch": 0.10442678774120318,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3956871032714844,
"step": 391
},
{
"epoch": 0.10469386392468452,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.2527759075164795,
"step": 392
},
{
"epoch": 0.10496094010816585,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.3367494344711304,
"step": 393
},
{
"epoch": 0.10522801629164719,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4054964780807495,
"step": 394
},
{
"epoch": 0.10549509247512853,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 1.4781296253204346,
"step": 395
},
{
"epoch": 0.10576216865860986,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.158512830734253,
"step": 396
},
{
"epoch": 0.10602924484209121,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.4767026901245117,
"step": 397
},
{
"epoch": 0.10629632102557254,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3846932649612427,
"step": 398
},
{
"epoch": 0.10656339720905389,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.434056282043457,
"step": 399
},
{
"epoch": 0.10683047339253522,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.3218588829040527,
"step": 400
},
{
"epoch": 0.10709754957601655,
"grad_norm": 0.69140625,
"learning_rate": 0.0001,
"loss": 1.4285893440246582,
"step": 401
},
{
"epoch": 0.1073646257594979,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.4643278121948242,
"step": 402
},
{
"epoch": 0.10763170194297923,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3895577192306519,
"step": 403
},
{
"epoch": 0.10789877812646058,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.4688113927841187,
"step": 404
},
{
"epoch": 0.10816585430994191,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3960481882095337,
"step": 405
},
{
"epoch": 0.10843293049342324,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.3620350360870361,
"step": 406
},
{
"epoch": 0.10870000667690459,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.308117389678955,
"step": 407
},
{
"epoch": 0.10896708286038592,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.3918156623840332,
"step": 408
},
{
"epoch": 0.10923415904386727,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.2705599069595337,
"step": 409
},
{
"epoch": 0.1095012352273486,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.3722550868988037,
"step": 410
},
{
"epoch": 0.10976831141082993,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3725674152374268,
"step": 411
},
{
"epoch": 0.11003538759431128,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.4689661264419556,
"step": 412
},
{
"epoch": 0.11030246377779261,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 1.4475466012954712,
"step": 413
},
{
"epoch": 0.11056953996127396,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.310787558555603,
"step": 414
},
{
"epoch": 0.11083661614475529,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 1.3508751392364502,
"step": 415
},
{
"epoch": 0.11110369232823664,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.2748651504516602,
"step": 416
},
{
"epoch": 0.11137076851171797,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.429402232170105,
"step": 417
},
{
"epoch": 0.1116378446951993,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4464179277420044,
"step": 418
},
{
"epoch": 0.11190492087868065,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.5171209573745728,
"step": 419
},
{
"epoch": 0.11217199706216198,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 1.3573802709579468,
"step": 420
},
{
"epoch": 0.11243907324564333,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.268883228302002,
"step": 421
},
{
"epoch": 0.11270614942912466,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.4801915884017944,
"step": 422
},
{
"epoch": 0.11297322561260599,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.5770097970962524,
"step": 423
},
{
"epoch": 0.11324030179608734,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3894881010055542,
"step": 424
},
{
"epoch": 0.11350737797956867,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.4948337078094482,
"step": 425
},
{
"epoch": 0.11377445416305002,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.4342015981674194,
"step": 426
},
{
"epoch": 0.11404153034653135,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3664584159851074,
"step": 427
},
{
"epoch": 0.11430860653001268,
"grad_norm": 0.67578125,
"learning_rate": 0.0001,
"loss": 1.3029652833938599,
"step": 428
},
{
"epoch": 0.11457568271349403,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3096115589141846,
"step": 429
},
{
"epoch": 0.11484275889697536,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4837114810943604,
"step": 430
},
{
"epoch": 0.1151098350804567,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.4767651557922363,
"step": 431
},
{
"epoch": 0.11537691126393804,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3935829401016235,
"step": 432
},
{
"epoch": 0.11564398744741938,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4307037591934204,
"step": 433
},
{
"epoch": 0.11591106363090072,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.4892133474349976,
"step": 434
},
{
"epoch": 0.11617813981438205,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.427190899848938,
"step": 435
},
{
"epoch": 0.1164452159978634,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.2890833616256714,
"step": 436
},
{
"epoch": 0.11671229218134473,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4582980871200562,
"step": 437
},
{
"epoch": 0.11697936836482607,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3579460382461548,
"step": 438
},
{
"epoch": 0.1172464445483074,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.4087233543395996,
"step": 439
},
{
"epoch": 0.11751352073178874,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4538408517837524,
"step": 440
},
{
"epoch": 0.11778059691527008,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.4369275569915771,
"step": 441
},
{
"epoch": 0.11804767309875142,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.3921966552734375,
"step": 442
},
{
"epoch": 0.11831474928223276,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3368277549743652,
"step": 443
},
{
"epoch": 0.1185818254657141,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.34857976436615,
"step": 444
},
{
"epoch": 0.11884890164919543,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.4927914142608643,
"step": 445
},
{
"epoch": 0.11911597783267677,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.402164340019226,
"step": 446
},
{
"epoch": 0.1193830540161581,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3122864961624146,
"step": 447
},
{
"epoch": 0.11965013019963945,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.1882364749908447,
"step": 448
},
{
"epoch": 0.11991720638312078,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.371604323387146,
"step": 449
},
{
"epoch": 0.12018428256660213,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3432501554489136,
"step": 450
},
{
"epoch": 0.12045135875008346,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.43648362159729,
"step": 451
},
{
"epoch": 0.1207184349335648,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.3512425422668457,
"step": 452
},
{
"epoch": 0.12098551111704614,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.4198323488235474,
"step": 453
},
{
"epoch": 0.12125258730052747,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3973807096481323,
"step": 454
},
{
"epoch": 0.12151966348400882,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4904663562774658,
"step": 455
},
{
"epoch": 0.12178673966749015,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3683151006698608,
"step": 456
},
{
"epoch": 0.12205381585097148,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.47139310836792,
"step": 457
},
{
"epoch": 0.12232089203445283,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.4727649688720703,
"step": 458
},
{
"epoch": 0.12258796821793416,
"grad_norm": 0.68359375,
"learning_rate": 0.0001,
"loss": 1.4294495582580566,
"step": 459
},
{
"epoch": 0.12285504440141551,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4658524990081787,
"step": 460
},
{
"epoch": 0.12312212058489684,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.457288146018982,
"step": 461
},
{
"epoch": 0.12338919676837817,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3517018556594849,
"step": 462
},
{
"epoch": 0.12365627295185952,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.2725576162338257,
"step": 463
},
{
"epoch": 0.12392334913534085,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3586177825927734,
"step": 464
},
{
"epoch": 0.1241904253188222,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.2883931398391724,
"step": 465
},
{
"epoch": 0.12445750150230353,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4876344203948975,
"step": 466
},
{
"epoch": 0.12472457768578488,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.4705077409744263,
"step": 467
},
{
"epoch": 0.12499165386926621,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.412283182144165,
"step": 468
},
{
"epoch": 0.12525873005274754,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.335158348083496,
"step": 469
},
{
"epoch": 0.1255258062362289,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3694274425506592,
"step": 470
},
{
"epoch": 0.12579288241971023,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.372456669807434,
"step": 471
},
{
"epoch": 0.12605995860319155,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.4638047218322754,
"step": 472
},
{
"epoch": 0.1263270347866729,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.413342833518982,
"step": 473
},
{
"epoch": 0.12659411097015424,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.397768497467041,
"step": 474
},
{
"epoch": 0.12686118715363556,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.2481286525726318,
"step": 475
},
{
"epoch": 0.1271282633371169,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.4423224925994873,
"step": 476
},
{
"epoch": 0.12739533952059826,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.3875746726989746,
"step": 477
},
{
"epoch": 0.1276624157040796,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.5474607944488525,
"step": 478
},
{
"epoch": 0.12792949188756092,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.352391242980957,
"step": 479
},
{
"epoch": 0.12819656807104227,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.472024917602539,
"step": 480
},
{
"epoch": 0.1284636442545236,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.3799539804458618,
"step": 481
},
{
"epoch": 0.12873072043800493,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3873319625854492,
"step": 482
},
{
"epoch": 0.12899779662148628,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.342333197593689,
"step": 483
},
{
"epoch": 0.12926487280496762,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.3376230001449585,
"step": 484
},
{
"epoch": 0.12953194898844894,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3061014413833618,
"step": 485
},
{
"epoch": 0.1297990251719303,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4731895923614502,
"step": 486
},
{
"epoch": 0.13006610135541163,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3420498371124268,
"step": 487
},
{
"epoch": 0.13033317753889298,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4132871627807617,
"step": 488
},
{
"epoch": 0.1306002537223743,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.4671186208724976,
"step": 489
},
{
"epoch": 0.13086732990585564,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4150232076644897,
"step": 490
},
{
"epoch": 0.131134406089337,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3895783424377441,
"step": 491
},
{
"epoch": 0.1314014822728183,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3676741123199463,
"step": 492
},
{
"epoch": 0.13166855845629966,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.384874701499939,
"step": 493
},
{
"epoch": 0.131935634639781,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.358314037322998,
"step": 494
},
{
"epoch": 0.13220271082326235,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.5222220420837402,
"step": 495
},
{
"epoch": 0.13246978700674367,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.3388750553131104,
"step": 496
},
{
"epoch": 0.132736863190225,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.4615256786346436,
"step": 497
},
{
"epoch": 0.13300393937370636,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4470080137252808,
"step": 498
},
{
"epoch": 0.13327101555718768,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3600274324417114,
"step": 499
},
{
"epoch": 0.13353809174066902,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.32597017288208,
"step": 500
},
{
"epoch": 0.13380516792415037,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4446059465408325,
"step": 501
},
{
"epoch": 0.1340722441076317,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.3945196866989136,
"step": 502
},
{
"epoch": 0.13433932029111303,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.460927128791809,
"step": 503
},
{
"epoch": 0.13460639647459438,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.424298644065857,
"step": 504
},
{
"epoch": 0.13487347265807573,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.4830772876739502,
"step": 505
},
{
"epoch": 0.13514054884155705,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.3743841648101807,
"step": 506
},
{
"epoch": 0.1354076250250384,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.5224791765213013,
"step": 507
},
{
"epoch": 0.13567470120851974,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.341582179069519,
"step": 508
},
{
"epoch": 0.13594177739200106,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3958293199539185,
"step": 509
},
{
"epoch": 0.1362088535754824,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.2593824863433838,
"step": 510
},
{
"epoch": 0.13647592975896375,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.5547127723693848,
"step": 511
},
{
"epoch": 0.1367430059424451,
"grad_norm": 0.67578125,
"learning_rate": 0.0001,
"loss": 1.3834812641143799,
"step": 512
},
{
"epoch": 0.1370100821259264,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4330908060073853,
"step": 513
},
{
"epoch": 0.13727715830940776,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.3858208656311035,
"step": 514
},
{
"epoch": 0.1375442344928891,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.2102665901184082,
"step": 515
},
{
"epoch": 0.13781131067637042,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.3554112911224365,
"step": 516
},
{
"epoch": 0.13807838685985177,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.2493407726287842,
"step": 517
},
{
"epoch": 0.13834546304333312,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.2981582880020142,
"step": 518
},
{
"epoch": 0.13861253922681444,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.252562165260315,
"step": 519
},
{
"epoch": 0.13887961541029578,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4024418592453003,
"step": 520
},
{
"epoch": 0.13914669159377713,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3612288236618042,
"step": 521
},
{
"epoch": 0.13941376777725847,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.512868881225586,
"step": 522
},
{
"epoch": 0.1396808439607398,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.375083565711975,
"step": 523
},
{
"epoch": 0.13994792014422114,
"grad_norm": 0.640625,
"learning_rate": 0.0001,
"loss": 1.3480380773544312,
"step": 524
},
{
"epoch": 0.14021499632770248,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3594597578048706,
"step": 525
},
{
"epoch": 0.1404820725111838,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.3115019798278809,
"step": 526
},
{
"epoch": 0.14074914869466515,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.2540839910507202,
"step": 527
},
{
"epoch": 0.1410162248781465,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.3259623050689697,
"step": 528
},
{
"epoch": 0.14128330106162784,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.2489577531814575,
"step": 529
},
{
"epoch": 0.14155037724510916,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3643953800201416,
"step": 530
},
{
"epoch": 0.1418174534285905,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.4134252071380615,
"step": 531
},
{
"epoch": 0.14208452961207185,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3410513401031494,
"step": 532
},
{
"epoch": 0.14235160579555317,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.5260295867919922,
"step": 533
},
{
"epoch": 0.14261868197903452,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.39544677734375,
"step": 534
},
{
"epoch": 0.14288575816251586,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.5388062000274658,
"step": 535
},
{
"epoch": 0.14315283434599718,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3159434795379639,
"step": 536
},
{
"epoch": 0.14341991052947853,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3709022998809814,
"step": 537
},
{
"epoch": 0.14368698671295987,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3411214351654053,
"step": 538
},
{
"epoch": 0.14395406289644122,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3669507503509521,
"step": 539
},
{
"epoch": 0.14422113907992254,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.4368212223052979,
"step": 540
},
{
"epoch": 0.14448821526340389,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.392094373703003,
"step": 541
},
{
"epoch": 0.14475529144688523,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.4218195676803589,
"step": 542
},
{
"epoch": 0.14502236763036655,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3535877466201782,
"step": 543
},
{
"epoch": 0.1452894438138479,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.3613425493240356,
"step": 544
},
{
"epoch": 0.14555651999732924,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.4432311058044434,
"step": 545
},
{
"epoch": 0.1458235961808106,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.4000786542892456,
"step": 546
},
{
"epoch": 0.1460906723642919,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.4382905960083008,
"step": 547
},
{
"epoch": 0.14635774854777325,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3199044466018677,
"step": 548
},
{
"epoch": 0.1466248247312546,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4392772912979126,
"step": 549
},
{
"epoch": 0.14689190091473592,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.2513238191604614,
"step": 550
},
{
"epoch": 0.14715897709821726,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3273593187332153,
"step": 551
},
{
"epoch": 0.1474260532816986,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.4180893898010254,
"step": 552
},
{
"epoch": 0.14769312946517996,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.3058419227600098,
"step": 553
},
{
"epoch": 0.14796020564866127,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3267779350280762,
"step": 554
},
{
"epoch": 0.14822728183214262,
"grad_norm": 0.72265625,
"learning_rate": 0.0001,
"loss": 1.4524626731872559,
"step": 555
},
{
"epoch": 0.14849435801562397,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 1.3795995712280273,
"step": 556
},
{
"epoch": 0.14876143419910529,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3587958812713623,
"step": 557
},
{
"epoch": 0.14902851038258663,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.2744011878967285,
"step": 558
},
{
"epoch": 0.14929558656606798,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3527799844741821,
"step": 559
},
{
"epoch": 0.1495626627495493,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 1.5189881324768066,
"step": 560
},
{
"epoch": 0.14982973893303064,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.2625980377197266,
"step": 561
},
{
"epoch": 0.150096815116512,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.3527413606643677,
"step": 562
},
{
"epoch": 0.15036389129999334,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3367376327514648,
"step": 563
},
{
"epoch": 0.15063096748347465,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.5792129039764404,
"step": 564
},
{
"epoch": 0.150898043666956,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 1.3854796886444092,
"step": 565
},
{
"epoch": 0.15116511985043735,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3474575281143188,
"step": 566
},
{
"epoch": 0.15143219603391866,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.347915530204773,
"step": 567
},
{
"epoch": 0.1516992722174,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.433038353919983,
"step": 568
},
{
"epoch": 0.15196634840088136,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.2815639972686768,
"step": 569
},
{
"epoch": 0.1522334245843627,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.4771748781204224,
"step": 570
},
{
"epoch": 0.15250050076784402,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.4043114185333252,
"step": 571
},
{
"epoch": 0.15276757695132537,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.3523536920547485,
"step": 572
},
{
"epoch": 0.15303465313480671,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.347900629043579,
"step": 573
},
{
"epoch": 0.15330172931828803,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3196823596954346,
"step": 574
},
{
"epoch": 0.15356880550176938,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.472100853919983,
"step": 575
},
{
"epoch": 0.15383588168525072,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.3329824209213257,
"step": 576
},
{
"epoch": 0.15410295786873204,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.3529117107391357,
"step": 577
},
{
"epoch": 0.1543700340522134,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.507158637046814,
"step": 578
},
{
"epoch": 0.15463711023569474,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.2954974174499512,
"step": 579
},
{
"epoch": 0.15490418641917608,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.4197229146957397,
"step": 580
},
{
"epoch": 0.1551712626026574,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 1.3419297933578491,
"step": 581
},
{
"epoch": 0.15543833878613875,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.4200016260147095,
"step": 582
},
{
"epoch": 0.1557054149696201,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.434874176979065,
"step": 583
},
{
"epoch": 0.1559724911531014,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4581799507141113,
"step": 584
},
{
"epoch": 0.15623956733658276,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3460516929626465,
"step": 585
},
{
"epoch": 0.1565066435200641,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 1.2796165943145752,
"step": 586
},
{
"epoch": 0.15677371970354545,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.339218258857727,
"step": 587
},
{
"epoch": 0.15704079588702677,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.4210413694381714,
"step": 588
},
{
"epoch": 0.15730787207050811,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3302514553070068,
"step": 589
},
{
"epoch": 0.15757494825398946,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.2733567953109741,
"step": 590
},
{
"epoch": 0.15784202443747078,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.335167407989502,
"step": 591
},
{
"epoch": 0.15810910062095213,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.3808437585830688,
"step": 592
},
{
"epoch": 0.15837617680443347,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.3993290662765503,
"step": 593
},
{
"epoch": 0.1586432529879148,
"grad_norm": 0.64453125,
"learning_rate": 0.0001,
"loss": 1.4145331382751465,
"step": 594
},
{
"epoch": 0.15891032917139614,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3426066637039185,
"step": 595
},
{
"epoch": 0.15917740535487748,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.311093807220459,
"step": 596
},
{
"epoch": 0.15944448153835883,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.4404199123382568,
"step": 597
},
{
"epoch": 0.15971155772184015,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.2987630367279053,
"step": 598
},
{
"epoch": 0.1599786339053215,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.100266933441162,
"step": 599
},
{
"epoch": 0.16024571008880284,
"grad_norm": 0.53125,
"learning_rate": 0.0001,
"loss": 1.4733037948608398,
"step": 600
},
{
"epoch": 0.16051278627228416,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.4752408266067505,
"step": 601
},
{
"epoch": 0.1607798624557655,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.365689992904663,
"step": 602
},
{
"epoch": 0.16104693863924685,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4368810653686523,
"step": 603
},
{
"epoch": 0.1613140148227282,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3821358680725098,
"step": 604
},
{
"epoch": 0.16158109100620952,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3334194421768188,
"step": 605
},
{
"epoch": 0.16184816718969086,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3358609676361084,
"step": 606
},
{
"epoch": 0.1621152433731722,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3391547203063965,
"step": 607
},
{
"epoch": 0.16238231955665353,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.271005630493164,
"step": 608
},
{
"epoch": 0.16264939574013487,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.308809518814087,
"step": 609
},
{
"epoch": 0.16291647192361622,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.3306488990783691,
"step": 610
},
{
"epoch": 0.16318354810709754,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.3931329250335693,
"step": 611
},
{
"epoch": 0.16345062429057888,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.391507863998413,
"step": 612
},
{
"epoch": 0.16371770047406023,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 1.3093831539154053,
"step": 613
},
{
"epoch": 0.16398477665754158,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.3817846775054932,
"step": 614
},
{
"epoch": 0.1642518528410229,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.2933706045150757,
"step": 615
},
{
"epoch": 0.16451892902450424,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4987157583236694,
"step": 616
},
{
"epoch": 0.1647860052079856,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.4318113327026367,
"step": 617
},
{
"epoch": 0.1650530813914669,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.405381202697754,
"step": 618
},
{
"epoch": 0.16532015757494825,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3609851598739624,
"step": 619
},
{
"epoch": 0.1655872337584296,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.549910306930542,
"step": 620
},
{
"epoch": 0.16585430994191094,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4672483205795288,
"step": 621
},
{
"epoch": 0.16612138612539226,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.279062032699585,
"step": 622
},
{
"epoch": 0.1663884623088736,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.398356556892395,
"step": 623
},
{
"epoch": 0.16665553849235495,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3836430311203003,
"step": 624
},
{
"epoch": 0.16692261467583627,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4430739879608154,
"step": 625
},
{
"epoch": 0.16718969085931762,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.314014196395874,
"step": 626
},
{
"epoch": 0.16745676704279897,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3610138893127441,
"step": 627
},
{
"epoch": 0.16772384322628028,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4223636388778687,
"step": 628
},
{
"epoch": 0.16799091940976163,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3471919298171997,
"step": 629
},
{
"epoch": 0.16825799559324298,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.321734070777893,
"step": 630
},
{
"epoch": 0.16852507177672432,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.4103505611419678,
"step": 631
},
{
"epoch": 0.16879214796020564,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.366790771484375,
"step": 632
},
{
"epoch": 0.169059224143687,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3268681764602661,
"step": 633
},
{
"epoch": 0.16932630032716833,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4994505643844604,
"step": 634
},
{
"epoch": 0.16959337651064965,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.2760584354400635,
"step": 635
},
{
"epoch": 0.169860452694131,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.5046621561050415,
"step": 636
},
{
"epoch": 0.17012752887761234,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.4411461353302002,
"step": 637
},
{
"epoch": 0.1703946050610937,
"grad_norm": 0.53515625,
"learning_rate": 0.0001,
"loss": 1.2349016666412354,
"step": 638
},
{
"epoch": 0.170661681244575,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.327634334564209,
"step": 639
},
{
"epoch": 0.17092875742805635,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3230228424072266,
"step": 640
},
{
"epoch": 0.1711958336115377,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3155494928359985,
"step": 641
},
{
"epoch": 0.17146290979501902,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.2801501750946045,
"step": 642
},
{
"epoch": 0.17172998597850037,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.3254103660583496,
"step": 643
},
{
"epoch": 0.1719970621619817,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.4413820505142212,
"step": 644
},
{
"epoch": 0.17226413834546303,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3388326168060303,
"step": 645
},
{
"epoch": 0.17253121452894438,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.4060581922531128,
"step": 646
},
{
"epoch": 0.17279829071242572,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4181727170944214,
"step": 647
},
{
"epoch": 0.17306536689590707,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3995436429977417,
"step": 648
},
{
"epoch": 0.1733324430793884,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3278270959854126,
"step": 649
},
{
"epoch": 0.17359951926286973,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.396315574645996,
"step": 650
},
{
"epoch": 0.17386659544635108,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.224435806274414,
"step": 651
},
{
"epoch": 0.1741336716298324,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.4477213621139526,
"step": 652
},
{
"epoch": 0.17440074781331374,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3974782228469849,
"step": 653
},
{
"epoch": 0.1746678239967951,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.4719974994659424,
"step": 654
},
{
"epoch": 0.17493490018027644,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3227864503860474,
"step": 655
},
{
"epoch": 0.17520197636375776,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.4213438034057617,
"step": 656
},
{
"epoch": 0.1754690525472391,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3693983554840088,
"step": 657
},
{
"epoch": 0.17573612873072045,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.4880212545394897,
"step": 658
},
{
"epoch": 0.17600320491420177,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.315510630607605,
"step": 659
},
{
"epoch": 0.1762702810976831,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 1.3102097511291504,
"step": 660
},
{
"epoch": 0.17653735728116446,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.2761492729187012,
"step": 661
},
{
"epoch": 0.17680443346464578,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3073588609695435,
"step": 662
},
{
"epoch": 0.17707150964812712,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.2740814685821533,
"step": 663
},
{
"epoch": 0.17733858583160847,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3133610486984253,
"step": 664
},
{
"epoch": 0.17760566201508982,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.2949233055114746,
"step": 665
},
{
"epoch": 0.17787273819857113,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.2975229024887085,
"step": 666
},
{
"epoch": 0.17813981438205248,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.355084776878357,
"step": 667
},
{
"epoch": 0.17840689056553383,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3896042108535767,
"step": 668
},
{
"epoch": 0.17867396674901515,
"grad_norm": 0.53125,
"learning_rate": 0.0001,
"loss": 1.2886717319488525,
"step": 669
},
{
"epoch": 0.1789410429324965,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3847600221633911,
"step": 670
},
{
"epoch": 0.17920811911597784,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3929096460342407,
"step": 671
},
{
"epoch": 0.17947519529945918,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.3475611209869385,
"step": 672
},
{
"epoch": 0.1797422714829405,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.5096707344055176,
"step": 673
},
{
"epoch": 0.18000934766642185,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 1.4462485313415527,
"step": 674
},
{
"epoch": 0.1802764238499032,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.2290098667144775,
"step": 675
},
{
"epoch": 0.1805435000333845,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.3566489219665527,
"step": 676
},
{
"epoch": 0.18081057621686586,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.2263463735580444,
"step": 677
},
{
"epoch": 0.1810776524003472,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3689738512039185,
"step": 678
},
{
"epoch": 0.18134472858382852,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3112177848815918,
"step": 679
},
{
"epoch": 0.18161180476730987,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.4430503845214844,
"step": 680
},
{
"epoch": 0.18187888095079122,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.3968268632888794,
"step": 681
},
{
"epoch": 0.18214595713427256,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 1.2902494668960571,
"step": 682
},
{
"epoch": 0.18241303331775388,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.368390679359436,
"step": 683
},
{
"epoch": 0.18268010950123523,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.36076021194458,
"step": 684
},
{
"epoch": 0.18294718568471657,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.292517900466919,
"step": 685
},
{
"epoch": 0.1832142618681979,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.3457889556884766,
"step": 686
},
{
"epoch": 0.18348133805167924,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.44827139377594,
"step": 687
},
{
"epoch": 0.18374841423516058,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3091659545898438,
"step": 688
},
{
"epoch": 0.18401549041864193,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3264273405075073,
"step": 689
},
{
"epoch": 0.18428256660212325,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.329689621925354,
"step": 690
},
{
"epoch": 0.1845496427856046,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.419305443763733,
"step": 691
},
{
"epoch": 0.18481671896908594,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.3605815172195435,
"step": 692
},
{
"epoch": 0.18508379515256726,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.2165064811706543,
"step": 693
},
{
"epoch": 0.1853508713360486,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.407888412475586,
"step": 694
},
{
"epoch": 0.18561794751952995,
"grad_norm": 0.53515625,
"learning_rate": 0.0001,
"loss": 1.3541319370269775,
"step": 695
},
{
"epoch": 0.18588502370301127,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.364072561264038,
"step": 696
},
{
"epoch": 0.18615209988649262,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3818382024765015,
"step": 697
},
{
"epoch": 0.18641917606997396,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3012504577636719,
"step": 698
},
{
"epoch": 0.1866862522534553,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.2554136514663696,
"step": 699
},
{
"epoch": 0.18695332843693663,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.3880873918533325,
"step": 700
},
{
"epoch": 0.18722040462041797,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.3451950550079346,
"step": 701
},
{
"epoch": 0.18748748080389932,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.5112781524658203,
"step": 702
},
{
"epoch": 0.18775455698738064,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3787572383880615,
"step": 703
},
{
"epoch": 0.18802163317086198,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3001948595046997,
"step": 704
},
{
"epoch": 0.18828870935434333,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.376787543296814,
"step": 705
},
{
"epoch": 0.18855578553782468,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.4305808544158936,
"step": 706
},
{
"epoch": 0.188822861721306,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.4173235893249512,
"step": 707
},
{
"epoch": 0.18908993790478734,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 1.3962050676345825,
"step": 708
},
{
"epoch": 0.1893570140882687,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.3929705619812012,
"step": 709
},
{
"epoch": 0.18962409027175,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 1.3011528253555298,
"step": 710
},
{
"epoch": 0.18989116645523135,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.4486719369888306,
"step": 711
},
{
"epoch": 0.1901582426387127,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.4150975942611694,
"step": 712
},
{
"epoch": 0.19042531882219402,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3898661136627197,
"step": 713
},
{
"epoch": 0.19069239500567536,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.3499014377593994,
"step": 714
},
{
"epoch": 0.1909594711891567,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.4843147993087769,
"step": 715
},
{
"epoch": 0.19122654737263806,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.37148916721344,
"step": 716
},
{
"epoch": 0.19149362355611937,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.3226914405822754,
"step": 717
},
{
"epoch": 0.19176069973960072,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3623806238174438,
"step": 718
},
{
"epoch": 0.19202777592308207,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3770349025726318,
"step": 719
},
{
"epoch": 0.19229485210656339,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.447762370109558,
"step": 720
},
{
"epoch": 0.19256192829004473,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3057664632797241,
"step": 721
},
{
"epoch": 0.19282900447352608,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3898338079452515,
"step": 722
},
{
"epoch": 0.19309608065700742,
"grad_norm": 0.53125,
"learning_rate": 0.0001,
"loss": 1.429726243019104,
"step": 723
},
{
"epoch": 0.19336315684048874,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3420591354370117,
"step": 724
},
{
"epoch": 0.1936302330239701,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 1.445176362991333,
"step": 725
},
{
"epoch": 0.19389730920745143,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.3709323406219482,
"step": 726
},
{
"epoch": 0.19416438539093275,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3264213800430298,
"step": 727
},
{
"epoch": 0.1944314615744141,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.388136863708496,
"step": 728
},
{
"epoch": 0.19469853775789545,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3481216430664062,
"step": 729
},
{
"epoch": 0.19496561394137676,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 1.4106545448303223,
"step": 730
},
{
"epoch": 0.1952326901248581,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3216980695724487,
"step": 731
},
{
"epoch": 0.19549976630833946,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.3771264553070068,
"step": 732
},
{
"epoch": 0.1957668424918208,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.428194522857666,
"step": 733
},
{
"epoch": 0.19603391867530212,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 1.3380928039550781,
"step": 734
},
{
"epoch": 0.19630099485878347,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 1.40969979763031,
"step": 735
},
{
"epoch": 0.1965680710422648,
"grad_norm": 0.52734375,
"learning_rate": 0.0001,
"loss": 1.274341106414795,
"step": 736
},
{
"epoch": 0.19683514722574613,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.5402621030807495,
"step": 737
},
{
"epoch": 0.19710222340922748,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.297295331954956,
"step": 738
},
{
"epoch": 0.19736929959270882,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.4769492149353027,
"step": 739
},
{
"epoch": 0.19763637577619017,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3517801761627197,
"step": 740
},
{
"epoch": 0.1979034519596715,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.415041446685791,
"step": 741
},
{
"epoch": 0.19817052814315284,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.376120686531067,
"step": 742
},
{
"epoch": 0.19843760432663418,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.2844655513763428,
"step": 743
},
{
"epoch": 0.1987046805101155,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.3740079402923584,
"step": 744
},
{
"epoch": 0.19897175669359685,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 1.3367841243743896,
"step": 745
},
{
"epoch": 0.1992388328770782,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.3713178634643555,
"step": 746
},
{
"epoch": 0.1995059090605595,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.390384554862976,
"step": 747
},
{
"epoch": 0.19977298524404086,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4415326118469238,
"step": 748
},
{
"epoch": 0.2000400614275222,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.3195914030075073,
"step": 749
},
{
"epoch": 0.20030713761100355,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.329708456993103,
"step": 750
},
{
"epoch": 0.20057421379448487,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.363006830215454,
"step": 751
},
{
"epoch": 0.20084128997796621,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.2761677503585815,
"step": 752
},
{
"epoch": 0.20110836616144756,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.4017164707183838,
"step": 753
},
{
"epoch": 0.20137544234492888,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.3224306106567383,
"step": 754
},
{
"epoch": 0.20164251852841023,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.337402105331421,
"step": 755
},
{
"epoch": 0.20190959471189157,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.4196721315383911,
"step": 756
},
{
"epoch": 0.20217667089537292,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.314110279083252,
"step": 757
},
{
"epoch": 0.20244374707885424,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.4188446998596191,
"step": 758
},
{
"epoch": 0.20271082326233558,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3670579195022583,
"step": 759
},
{
"epoch": 0.20297789944581693,
"grad_norm": 0.58203125,
"learning_rate": 0.0001,
"loss": 1.358155608177185,
"step": 760
},
{
"epoch": 0.20324497562929825,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.2619855403900146,
"step": 761
},
{
"epoch": 0.2035120518127796,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.2777178287506104,
"step": 762
},
{
"epoch": 0.20377912799626094,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 1.4464168548583984,
"step": 763
},
{
"epoch": 0.20404620417974226,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.2626973390579224,
"step": 764
},
{
"epoch": 0.2043132803632236,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3790322542190552,
"step": 765
},
{
"epoch": 0.20458035654670495,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 1.368752360343933,
"step": 766
},
{
"epoch": 0.2048474327301863,
"grad_norm": 0.5234375,
"learning_rate": 0.0001,
"loss": 1.2792086601257324,
"step": 767
},
{
"epoch": 0.20511450891366761,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.3987016677856445,
"step": 768
},
{
"epoch": 0.20538158509714896,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.3528289794921875,
"step": 769
},
{
"epoch": 0.2056486612806303,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.3498940467834473,
"step": 770
},
{
"epoch": 0.20591573746411163,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.277837872505188,
"step": 771
},
{
"epoch": 0.20618281364759297,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 1.3961533308029175,
"step": 772
},
{
"epoch": 0.20644988983107432,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3514395952224731,
"step": 773
},
{
"epoch": 0.20671696601455566,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.3249895572662354,
"step": 774
},
{
"epoch": 0.20698404219803698,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 1.3612358570098877,
"step": 775
},
{
"epoch": 0.20725111838151833,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.4175899028778076,
"step": 776
},
{
"epoch": 0.20751819456499968,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 1.3095741271972656,
"step": 777
},
{
"epoch": 0.207785270748481,
"grad_norm": 0.57421875,
"learning_rate": 0.0001,
"loss": 1.4355252981185913,
"step": 778
},
{
"epoch": 0.20805234693196234,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.3619897365570068,
"step": 779
},
{
"epoch": 0.20831942311544369,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 1.2804675102233887,
"step": 780
},
{
"epoch": 0.20858649929892503,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.3464568853378296,
"step": 781
},
{
"epoch": 0.20885357548240635,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 1.3062856197357178,
"step": 782
},
{
"epoch": 0.2091206516658877,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.3532344102859497,
"step": 783
},
{
"epoch": 0.20938772784936904,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 1.3980780839920044,
"step": 784
},
{
"epoch": 0.20965480403285036,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 1.3094193935394287,
"step": 785
},
{
"epoch": 0.2099218802163317,
"grad_norm": 0.53125,
"learning_rate": 0.0001,
"loss": 1.436868667602539,
"step": 786
},
{
"epoch": 0.21018895639981305,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 1.4248528480529785,
"step": 787
}
],
"logging_steps": 1,
"max_steps": 3933,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 787,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.5555284789987e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}