chabot-supervisor-phi4rpKL / trainer_state.json
shareit's picture
Upload folder using huggingface_hub
818980a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 888,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004507042253521127,
"grad_norm": 0.5109427571296692,
"learning_rate": 0.0,
"loss": 2.8366,
"step": 1
},
{
"epoch": 0.009014084507042254,
"grad_norm": 0.5052362084388733,
"learning_rate": 7.4074074074074075e-06,
"loss": 2.9992,
"step": 2
},
{
"epoch": 0.013521126760563381,
"grad_norm": 0.4910641610622406,
"learning_rate": 1.4814814814814815e-05,
"loss": 2.8925,
"step": 3
},
{
"epoch": 0.018028169014084508,
"grad_norm": 0.5134980082511902,
"learning_rate": 2.2222222222222223e-05,
"loss": 2.8476,
"step": 4
},
{
"epoch": 0.022535211267605635,
"grad_norm": 0.5817605257034302,
"learning_rate": 2.962962962962963e-05,
"loss": 2.9803,
"step": 5
},
{
"epoch": 0.027042253521126762,
"grad_norm": 0.7044035792350769,
"learning_rate": 3.7037037037037037e-05,
"loss": 2.8277,
"step": 6
},
{
"epoch": 0.031549295774647886,
"grad_norm": 0.793286919593811,
"learning_rate": 4.4444444444444447e-05,
"loss": 2.9407,
"step": 7
},
{
"epoch": 0.036056338028169016,
"grad_norm": 0.9440262317657471,
"learning_rate": 5.185185185185185e-05,
"loss": 2.9527,
"step": 8
},
{
"epoch": 0.04056338028169014,
"grad_norm": 1.107783555984497,
"learning_rate": 5.925925925925926e-05,
"loss": 2.5235,
"step": 9
},
{
"epoch": 0.04507042253521127,
"grad_norm": 1.3675676584243774,
"learning_rate": 6.666666666666667e-05,
"loss": 2.377,
"step": 10
},
{
"epoch": 0.049577464788732394,
"grad_norm": 1.4395591020584106,
"learning_rate": 7.407407407407407e-05,
"loss": 2.1443,
"step": 11
},
{
"epoch": 0.054084507042253524,
"grad_norm": 0.5429375171661377,
"learning_rate": 8.148148148148148e-05,
"loss": 1.8369,
"step": 12
},
{
"epoch": 0.05859154929577465,
"grad_norm": 1.5771080255508423,
"learning_rate": 8.888888888888889e-05,
"loss": 1.9669,
"step": 13
},
{
"epoch": 0.06309859154929577,
"grad_norm": 1.6812283992767334,
"learning_rate": 9.62962962962963e-05,
"loss": 1.918,
"step": 14
},
{
"epoch": 0.0676056338028169,
"grad_norm": 1.698196530342102,
"learning_rate": 0.0001037037037037037,
"loss": 1.7981,
"step": 15
},
{
"epoch": 0.07211267605633803,
"grad_norm": 1.4648041725158691,
"learning_rate": 0.00011111111111111112,
"loss": 1.686,
"step": 16
},
{
"epoch": 0.07661971830985916,
"grad_norm": 0.7997398972511292,
"learning_rate": 0.00011851851851851852,
"loss": 1.6433,
"step": 17
},
{
"epoch": 0.08112676056338028,
"grad_norm": 0.6825237274169922,
"learning_rate": 0.00012592592592592592,
"loss": 1.461,
"step": 18
},
{
"epoch": 0.0856338028169014,
"grad_norm": 0.598156213760376,
"learning_rate": 0.00013333333333333334,
"loss": 1.3822,
"step": 19
},
{
"epoch": 0.09014084507042254,
"grad_norm": 0.5010932683944702,
"learning_rate": 0.00014074074074074076,
"loss": 1.3466,
"step": 20
},
{
"epoch": 0.09464788732394366,
"grad_norm": 0.4217166006565094,
"learning_rate": 0.00014814814814814815,
"loss": 1.2368,
"step": 21
},
{
"epoch": 0.09915492957746479,
"grad_norm": 0.3251509368419647,
"learning_rate": 0.00015555555555555556,
"loss": 1.0822,
"step": 22
},
{
"epoch": 0.10366197183098591,
"grad_norm": 0.28032082319259644,
"learning_rate": 0.00016296296296296295,
"loss": 1.0166,
"step": 23
},
{
"epoch": 0.10816901408450705,
"grad_norm": 0.43019387125968933,
"learning_rate": 0.00017037037037037037,
"loss": 1.1404,
"step": 24
},
{
"epoch": 0.11267605633802817,
"grad_norm": 0.371652215719223,
"learning_rate": 0.00017777777777777779,
"loss": 1.0273,
"step": 25
},
{
"epoch": 0.1171830985915493,
"grad_norm": 0.4319760799407959,
"learning_rate": 0.0001851851851851852,
"loss": 1.0482,
"step": 26
},
{
"epoch": 0.12169014084507042,
"grad_norm": 0.4139591157436371,
"learning_rate": 0.0001925925925925926,
"loss": 0.9763,
"step": 27
},
{
"epoch": 0.12619718309859154,
"grad_norm": 0.3923117220401764,
"learning_rate": 0.0002,
"loss": 0.9831,
"step": 28
},
{
"epoch": 0.13070422535211268,
"grad_norm": 0.3156369626522064,
"learning_rate": 0.00019999933432389942,
"loss": 0.9356,
"step": 29
},
{
"epoch": 0.1352112676056338,
"grad_norm": 0.23431973159313202,
"learning_rate": 0.00019999733730446018,
"loss": 0.912,
"step": 30
},
{
"epoch": 0.13971830985915493,
"grad_norm": 0.2367880940437317,
"learning_rate": 0.00019999400896826965,
"loss": 0.8699,
"step": 31
},
{
"epoch": 0.14422535211267606,
"grad_norm": 0.23735183477401733,
"learning_rate": 0.00019998934935963966,
"loss": 0.8918,
"step": 32
},
{
"epoch": 0.14873239436619717,
"grad_norm": 0.21039249002933502,
"learning_rate": 0.00019998335854060607,
"loss": 0.8716,
"step": 33
},
{
"epoch": 0.1532394366197183,
"grad_norm": 0.2169954627752304,
"learning_rate": 0.00019997603659092773,
"loss": 0.8872,
"step": 34
},
{
"epoch": 0.15774647887323945,
"grad_norm": 0.18319812417030334,
"learning_rate": 0.00019996738360808565,
"loss": 0.8339,
"step": 35
},
{
"epoch": 0.16225352112676056,
"grad_norm": 0.16776813566684723,
"learning_rate": 0.00019995739970728144,
"loss": 0.882,
"step": 36
},
{
"epoch": 0.1667605633802817,
"grad_norm": 0.1763199269771576,
"learning_rate": 0.00019994608502143602,
"loss": 0.8939,
"step": 37
},
{
"epoch": 0.1712676056338028,
"grad_norm": 0.14987778663635254,
"learning_rate": 0.00019993343970118766,
"loss": 0.7841,
"step": 38
},
{
"epoch": 0.17577464788732394,
"grad_norm": 0.14466699957847595,
"learning_rate": 0.00019991946391489018,
"loss": 0.7943,
"step": 39
},
{
"epoch": 0.18028169014084508,
"grad_norm": 0.13038891553878784,
"learning_rate": 0.00019990415784861047,
"loss": 0.781,
"step": 40
},
{
"epoch": 0.1847887323943662,
"grad_norm": 0.1293761432170868,
"learning_rate": 0.00019988752170612618,
"loss": 0.793,
"step": 41
},
{
"epoch": 0.18929577464788733,
"grad_norm": 0.12768785655498505,
"learning_rate": 0.000199869555708923,
"loss": 0.7708,
"step": 42
},
{
"epoch": 0.19380281690140846,
"grad_norm": 0.14372171461582184,
"learning_rate": 0.0001998502600961916,
"loss": 0.7788,
"step": 43
},
{
"epoch": 0.19830985915492957,
"grad_norm": 0.12837673723697662,
"learning_rate": 0.00019982963512482453,
"loss": 0.7968,
"step": 44
},
{
"epoch": 0.2028169014084507,
"grad_norm": 0.12973128259181976,
"learning_rate": 0.00019980768106941281,
"loss": 0.7833,
"step": 45
},
{
"epoch": 0.20732394366197182,
"grad_norm": 0.11364801228046417,
"learning_rate": 0.00019978439822224226,
"loss": 0.78,
"step": 46
},
{
"epoch": 0.21183098591549296,
"grad_norm": 0.09620922058820724,
"learning_rate": 0.00019975978689328958,
"loss": 0.7192,
"step": 47
},
{
"epoch": 0.2163380281690141,
"grad_norm": 0.09795072674751282,
"learning_rate": 0.0001997338474102182,
"loss": 0.7628,
"step": 48
},
{
"epoch": 0.2208450704225352,
"grad_norm": 0.18416811525821686,
"learning_rate": 0.00019970658011837404,
"loss": 0.7298,
"step": 49
},
{
"epoch": 0.22535211267605634,
"grad_norm": 0.09638620167970657,
"learning_rate": 0.00019967798538078076,
"loss": 0.7248,
"step": 50
},
{
"epoch": 0.22985915492957745,
"grad_norm": 0.10183525085449219,
"learning_rate": 0.00019964806357813508,
"loss": 0.7704,
"step": 51
},
{
"epoch": 0.2343661971830986,
"grad_norm": 0.09559155255556107,
"learning_rate": 0.0001996168151088015,
"loss": 0.7559,
"step": 52
},
{
"epoch": 0.23887323943661973,
"grad_norm": 0.11242512613534927,
"learning_rate": 0.00019958424038880727,
"loss": 0.7897,
"step": 53
},
{
"epoch": 0.24338028169014084,
"grad_norm": 0.09209013730287552,
"learning_rate": 0.0001995503398518366,
"loss": 0.7212,
"step": 54
},
{
"epoch": 0.24788732394366197,
"grad_norm": 0.08857212960720062,
"learning_rate": 0.00019951511394922507,
"loss": 0.7409,
"step": 55
},
{
"epoch": 0.2523943661971831,
"grad_norm": 0.11040879040956497,
"learning_rate": 0.00019947856314995349,
"loss": 0.7288,
"step": 56
},
{
"epoch": 0.25690140845070425,
"grad_norm": 0.09153151512145996,
"learning_rate": 0.00019944068794064174,
"loss": 0.7387,
"step": 57
},
{
"epoch": 0.26140845070422536,
"grad_norm": 0.08312317728996277,
"learning_rate": 0.0001994014888255422,
"loss": 0.7295,
"step": 58
},
{
"epoch": 0.26591549295774647,
"grad_norm": 0.08618035167455673,
"learning_rate": 0.00019936096632653324,
"loss": 0.7089,
"step": 59
},
{
"epoch": 0.2704225352112676,
"grad_norm": 0.09828522056341171,
"learning_rate": 0.00019931912098311195,
"loss": 0.7105,
"step": 60
},
{
"epoch": 0.27492957746478874,
"grad_norm": 0.0870802253484726,
"learning_rate": 0.00019927595335238733,
"loss": 0.6672,
"step": 61
},
{
"epoch": 0.27943661971830985,
"grad_norm": 0.09070249646902084,
"learning_rate": 0.00019923146400907253,
"loss": 0.6541,
"step": 62
},
{
"epoch": 0.28394366197183096,
"grad_norm": 0.07697242498397827,
"learning_rate": 0.0001991856535454774,
"loss": 0.6902,
"step": 63
},
{
"epoch": 0.28845070422535213,
"grad_norm": 0.08537200093269348,
"learning_rate": 0.00019913852257150052,
"loss": 0.7073,
"step": 64
},
{
"epoch": 0.29295774647887324,
"grad_norm": 0.08310703933238983,
"learning_rate": 0.0001990900717146212,
"loss": 0.6784,
"step": 65
},
{
"epoch": 0.29746478873239435,
"grad_norm": 0.10847494751214981,
"learning_rate": 0.00019904030161989103,
"loss": 0.7397,
"step": 66
},
{
"epoch": 0.3019718309859155,
"grad_norm": 0.08034008741378784,
"learning_rate": 0.00019898921294992516,
"loss": 0.7044,
"step": 67
},
{
"epoch": 0.3064788732394366,
"grad_norm": 0.08679196983575821,
"learning_rate": 0.00019893680638489382,
"loss": 0.6948,
"step": 68
},
{
"epoch": 0.31098591549295773,
"grad_norm": 0.08680360019207001,
"learning_rate": 0.00019888308262251285,
"loss": 0.7017,
"step": 69
},
{
"epoch": 0.3154929577464789,
"grad_norm": 0.08481983840465546,
"learning_rate": 0.00019882804237803488,
"loss": 0.7111,
"step": 70
},
{
"epoch": 0.32,
"grad_norm": 0.07690601050853729,
"learning_rate": 0.0001987716863842393,
"loss": 0.6722,
"step": 71
},
{
"epoch": 0.3245070422535211,
"grad_norm": 0.06967870891094208,
"learning_rate": 0.00019871401539142293,
"loss": 0.6485,
"step": 72
},
{
"epoch": 0.3290140845070422,
"grad_norm": 0.0827360600233078,
"learning_rate": 0.00019865503016738983,
"loss": 0.698,
"step": 73
},
{
"epoch": 0.3335211267605634,
"grad_norm": 0.0766606405377388,
"learning_rate": 0.000198594731497441,
"loss": 0.691,
"step": 74
},
{
"epoch": 0.3380281690140845,
"grad_norm": 0.08298925310373306,
"learning_rate": 0.00019853312018436417,
"loss": 0.7125,
"step": 75
},
{
"epoch": 0.3425352112676056,
"grad_norm": 0.07882972061634064,
"learning_rate": 0.0001984701970484229,
"loss": 0.6932,
"step": 76
},
{
"epoch": 0.3470422535211268,
"grad_norm": 0.08647109568119049,
"learning_rate": 0.0001984059629273457,
"loss": 0.6862,
"step": 77
},
{
"epoch": 0.3515492957746479,
"grad_norm": 0.0839175134897232,
"learning_rate": 0.00019834041867631505,
"loss": 0.6784,
"step": 78
},
{
"epoch": 0.356056338028169,
"grad_norm": 0.07932542264461517,
"learning_rate": 0.0001982735651679557,
"loss": 0.6707,
"step": 79
},
{
"epoch": 0.36056338028169016,
"grad_norm": 0.0704977959394455,
"learning_rate": 0.00019820540329232333,
"loss": 0.6374,
"step": 80
},
{
"epoch": 0.36507042253521127,
"grad_norm": 0.07499450445175171,
"learning_rate": 0.0001981359339568926,
"loss": 0.6452,
"step": 81
},
{
"epoch": 0.3695774647887324,
"grad_norm": 0.07560130953788757,
"learning_rate": 0.00019806515808654498,
"loss": 0.6599,
"step": 82
},
{
"epoch": 0.37408450704225354,
"grad_norm": 0.0893062874674797,
"learning_rate": 0.0001979930766235566,
"loss": 0.6745,
"step": 83
},
{
"epoch": 0.37859154929577465,
"grad_norm": 0.08140068501234055,
"learning_rate": 0.00019791969052758562,
"loss": 0.6559,
"step": 84
},
{
"epoch": 0.38309859154929576,
"grad_norm": 0.0829370766878128,
"learning_rate": 0.00019784500077565944,
"loss": 0.6857,
"step": 85
},
{
"epoch": 0.38760563380281693,
"grad_norm": 0.0947769358754158,
"learning_rate": 0.0001977690083621617,
"loss": 0.7109,
"step": 86
},
{
"epoch": 0.39211267605633804,
"grad_norm": 0.10486528277397156,
"learning_rate": 0.00019769171429881907,
"loss": 0.7033,
"step": 87
},
{
"epoch": 0.39661971830985915,
"grad_norm": 0.11483494937419891,
"learning_rate": 0.0001976131196146878,
"loss": 0.7183,
"step": 88
},
{
"epoch": 0.40112676056338026,
"grad_norm": 0.08664172142744064,
"learning_rate": 0.0001975332253561399,
"loss": 0.6734,
"step": 89
},
{
"epoch": 0.4056338028169014,
"grad_norm": 0.08497391641139984,
"learning_rate": 0.00019745203258684938,
"loss": 0.6627,
"step": 90
},
{
"epoch": 0.41014084507042253,
"grad_norm": 0.09175261110067368,
"learning_rate": 0.00019736954238777792,
"loss": 0.6678,
"step": 91
},
{
"epoch": 0.41464788732394364,
"grad_norm": 0.08519595116376877,
"learning_rate": 0.0001972857558571606,
"loss": 0.6716,
"step": 92
},
{
"epoch": 0.4191549295774648,
"grad_norm": 0.08249272406101227,
"learning_rate": 0.0001972006741104913,
"loss": 0.6607,
"step": 93
},
{
"epoch": 0.4236619718309859,
"grad_norm": 0.09815848618745804,
"learning_rate": 0.00019711429828050769,
"loss": 0.6737,
"step": 94
},
{
"epoch": 0.428169014084507,
"grad_norm": 0.08678440749645233,
"learning_rate": 0.00019702662951717628,
"loss": 0.6437,
"step": 95
},
{
"epoch": 0.4326760563380282,
"grad_norm": 0.07430567592382431,
"learning_rate": 0.0001969376689876771,
"loss": 0.6445,
"step": 96
},
{
"epoch": 0.4371830985915493,
"grad_norm": 0.10983766615390778,
"learning_rate": 0.00019684741787638808,
"loss": 0.6588,
"step": 97
},
{
"epoch": 0.4416901408450704,
"grad_norm": 0.07917274534702301,
"learning_rate": 0.00019675587738486936,
"loss": 0.6532,
"step": 98
},
{
"epoch": 0.4461971830985916,
"grad_norm": 0.08504049479961395,
"learning_rate": 0.00019666304873184739,
"loss": 0.674,
"step": 99
},
{
"epoch": 0.4507042253521127,
"grad_norm": 0.08401936292648315,
"learning_rate": 0.00019656893315319837,
"loss": 0.645,
"step": 100
},
{
"epoch": 0.4552112676056338,
"grad_norm": 0.0813940167427063,
"learning_rate": 0.00019647353190193224,
"loss": 0.6491,
"step": 101
},
{
"epoch": 0.4597183098591549,
"grad_norm": 0.08745774626731873,
"learning_rate": 0.00019637684624817554,
"loss": 0.6561,
"step": 102
},
{
"epoch": 0.46422535211267607,
"grad_norm": 0.07857084274291992,
"learning_rate": 0.00019627887747915494,
"loss": 0.6419,
"step": 103
},
{
"epoch": 0.4687323943661972,
"grad_norm": 0.08520088344812393,
"learning_rate": 0.00019617962689917975,
"loss": 0.6167,
"step": 104
},
{
"epoch": 0.4732394366197183,
"grad_norm": 0.0916878879070282,
"learning_rate": 0.00019607909582962477,
"loss": 0.6165,
"step": 105
},
{
"epoch": 0.47774647887323946,
"grad_norm": 0.07891346514225006,
"learning_rate": 0.00019597728560891264,
"loss": 0.6262,
"step": 106
},
{
"epoch": 0.48225352112676056,
"grad_norm": 0.11444642394781113,
"learning_rate": 0.00019587419759249593,
"loss": 0.6252,
"step": 107
},
{
"epoch": 0.4867605633802817,
"grad_norm": 0.08692018687725067,
"learning_rate": 0.00019576983315283922,
"loss": 0.6772,
"step": 108
},
{
"epoch": 0.49126760563380284,
"grad_norm": 0.08438707143068314,
"learning_rate": 0.0001956641936794008,
"loss": 0.6159,
"step": 109
},
{
"epoch": 0.49577464788732395,
"grad_norm": 0.08054433017969131,
"learning_rate": 0.0001955572805786141,
"loss": 0.6198,
"step": 110
},
{
"epoch": 0.5002816901408451,
"grad_norm": 0.10778526216745377,
"learning_rate": 0.00019544909527386903,
"loss": 0.653,
"step": 111
},
{
"epoch": 0.5047887323943662,
"grad_norm": 0.08686485141515732,
"learning_rate": 0.00019533963920549306,
"loss": 0.6349,
"step": 112
},
{
"epoch": 0.5092957746478873,
"grad_norm": 0.08285564184188843,
"learning_rate": 0.00019522891383073196,
"loss": 0.6178,
"step": 113
},
{
"epoch": 0.5138028169014085,
"grad_norm": 0.11247093975543976,
"learning_rate": 0.00019511692062373044,
"loss": 0.6482,
"step": 114
},
{
"epoch": 0.5183098591549296,
"grad_norm": 0.11526112258434296,
"learning_rate": 0.00019500366107551252,
"loss": 0.6692,
"step": 115
},
{
"epoch": 0.5228169014084507,
"grad_norm": 0.09112968295812607,
"learning_rate": 0.00019488913669396166,
"loss": 0.6262,
"step": 116
},
{
"epoch": 0.5273239436619719,
"grad_norm": 0.1070021390914917,
"learning_rate": 0.0001947733490038008,
"loss": 0.6699,
"step": 117
},
{
"epoch": 0.5318309859154929,
"grad_norm": 0.09726863354444504,
"learning_rate": 0.00019465629954657185,
"loss": 0.6468,
"step": 118
},
{
"epoch": 0.5363380281690141,
"grad_norm": 0.08580797910690308,
"learning_rate": 0.00019453798988061535,
"loss": 0.5734,
"step": 119
},
{
"epoch": 0.5408450704225352,
"grad_norm": 0.08846008032560349,
"learning_rate": 0.00019441842158104966,
"loss": 0.5958,
"step": 120
},
{
"epoch": 0.5453521126760563,
"grad_norm": 0.08376345038414001,
"learning_rate": 0.00019429759623974991,
"loss": 0.6236,
"step": 121
},
{
"epoch": 0.5498591549295775,
"grad_norm": 0.1092655211687088,
"learning_rate": 0.00019417551546532704,
"loss": 0.629,
"step": 122
},
{
"epoch": 0.5543661971830985,
"grad_norm": 0.09457230567932129,
"learning_rate": 0.00019405218088310605,
"loss": 0.5955,
"step": 123
},
{
"epoch": 0.5588732394366197,
"grad_norm": 0.09811452776193619,
"learning_rate": 0.0001939275941351046,
"loss": 0.6683,
"step": 124
},
{
"epoch": 0.5633802816901409,
"grad_norm": 0.11004705727100372,
"learning_rate": 0.00019380175688001118,
"loss": 0.6381,
"step": 125
},
{
"epoch": 0.5678873239436619,
"grad_norm": 0.0918348953127861,
"learning_rate": 0.00019367467079316279,
"loss": 0.6285,
"step": 126
},
{
"epoch": 0.5723943661971831,
"grad_norm": 0.09264164417982101,
"learning_rate": 0.00019354633756652286,
"loss": 0.6667,
"step": 127
},
{
"epoch": 0.5769014084507043,
"grad_norm": 0.08994423598051071,
"learning_rate": 0.00019341675890865867,
"loss": 0.5938,
"step": 128
},
{
"epoch": 0.5814084507042253,
"grad_norm": 0.08977948129177094,
"learning_rate": 0.00019328593654471848,
"loss": 0.5933,
"step": 129
},
{
"epoch": 0.5859154929577465,
"grad_norm": 0.10552453994750977,
"learning_rate": 0.00019315387221640874,
"loss": 0.6596,
"step": 130
},
{
"epoch": 0.5904225352112676,
"grad_norm": 0.09184475243091583,
"learning_rate": 0.00019302056768197076,
"loss": 0.6269,
"step": 131
},
{
"epoch": 0.5949295774647887,
"grad_norm": 0.08900140970945358,
"learning_rate": 0.00019288602471615742,
"loss": 0.6168,
"step": 132
},
{
"epoch": 0.5994366197183099,
"grad_norm": 0.11255169659852982,
"learning_rate": 0.0001927502451102095,
"loss": 0.6187,
"step": 133
},
{
"epoch": 0.603943661971831,
"grad_norm": 0.091275654733181,
"learning_rate": 0.00019261323067183166,
"loss": 0.5949,
"step": 134
},
{
"epoch": 0.6084507042253521,
"grad_norm": 0.09162840992212296,
"learning_rate": 0.00019247498322516875,
"loss": 0.6069,
"step": 135
},
{
"epoch": 0.6129577464788732,
"grad_norm": 0.09226758778095245,
"learning_rate": 0.00019233550461078113,
"loss": 0.6014,
"step": 136
},
{
"epoch": 0.6174647887323944,
"grad_norm": 0.11824911832809448,
"learning_rate": 0.00019219479668562044,
"loss": 0.6126,
"step": 137
},
{
"epoch": 0.6219718309859155,
"grad_norm": 0.09837235510349274,
"learning_rate": 0.00019205286132300468,
"loss": 0.6268,
"step": 138
},
{
"epoch": 0.6264788732394366,
"grad_norm": 0.11063525825738907,
"learning_rate": 0.00019190970041259352,
"loss": 0.6101,
"step": 139
},
{
"epoch": 0.6309859154929578,
"grad_norm": 0.0999184399843216,
"learning_rate": 0.0001917653158603628,
"loss": 0.6362,
"step": 140
},
{
"epoch": 0.6354929577464788,
"grad_norm": 0.1021515354514122,
"learning_rate": 0.0001916197095885795,
"loss": 0.6094,
"step": 141
},
{
"epoch": 0.64,
"grad_norm": 0.10452960431575775,
"learning_rate": 0.00019147288353577589,
"loss": 0.6178,
"step": 142
},
{
"epoch": 0.6445070422535212,
"grad_norm": 0.14412759244441986,
"learning_rate": 0.00019132483965672386,
"loss": 0.6276,
"step": 143
},
{
"epoch": 0.6490140845070422,
"grad_norm": 0.11471202969551086,
"learning_rate": 0.00019117557992240887,
"loss": 0.6176,
"step": 144
},
{
"epoch": 0.6535211267605634,
"grad_norm": 0.1019945964217186,
"learning_rate": 0.00019102510632000363,
"loss": 0.6034,
"step": 145
},
{
"epoch": 0.6580281690140845,
"grad_norm": 0.09331919997930527,
"learning_rate": 0.00019087342085284182,
"loss": 0.6037,
"step": 146
},
{
"epoch": 0.6625352112676056,
"grad_norm": 0.1276702880859375,
"learning_rate": 0.00019072052554039122,
"loss": 0.623,
"step": 147
},
{
"epoch": 0.6670422535211268,
"grad_norm": 0.10190749168395996,
"learning_rate": 0.00019056642241822692,
"loss": 0.6346,
"step": 148
},
{
"epoch": 0.6715492957746478,
"grad_norm": 0.1176435798406601,
"learning_rate": 0.00019041111353800425,
"loss": 0.5865,
"step": 149
},
{
"epoch": 0.676056338028169,
"grad_norm": 0.12664955854415894,
"learning_rate": 0.0001902546009674314,
"loss": 0.5882,
"step": 150
},
{
"epoch": 0.6805633802816902,
"grad_norm": 0.10095725208520889,
"learning_rate": 0.0001900968867902419,
"loss": 0.5944,
"step": 151
},
{
"epoch": 0.6850704225352112,
"grad_norm": 0.09524551033973694,
"learning_rate": 0.00018993797310616698,
"loss": 0.567,
"step": 152
},
{
"epoch": 0.6895774647887324,
"grad_norm": 0.11315032839775085,
"learning_rate": 0.00018977786203090743,
"loss": 0.6368,
"step": 153
},
{
"epoch": 0.6940845070422536,
"grad_norm": 0.1286381185054779,
"learning_rate": 0.00018961655569610557,
"loss": 0.6272,
"step": 154
},
{
"epoch": 0.6985915492957746,
"grad_norm": 0.1185537725687027,
"learning_rate": 0.00018945405624931684,
"loss": 0.6102,
"step": 155
},
{
"epoch": 0.7030985915492958,
"grad_norm": 0.12012823671102524,
"learning_rate": 0.00018929036585398122,
"loss": 0.5973,
"step": 156
},
{
"epoch": 0.7076056338028169,
"grad_norm": 0.10544425994157791,
"learning_rate": 0.00018912548668939438,
"loss": 0.6194,
"step": 157
},
{
"epoch": 0.712112676056338,
"grad_norm": 0.1360369324684143,
"learning_rate": 0.0001889594209506787,
"loss": 0.625,
"step": 158
},
{
"epoch": 0.7166197183098592,
"grad_norm": 0.10620026290416718,
"learning_rate": 0.00018879217084875408,
"loss": 0.6054,
"step": 159
},
{
"epoch": 0.7211267605633803,
"grad_norm": 0.10198299586772919,
"learning_rate": 0.00018862373861030837,
"loss": 0.6489,
"step": 160
},
{
"epoch": 0.7256338028169014,
"grad_norm": 0.12448371946811676,
"learning_rate": 0.00018845412647776794,
"loss": 0.5959,
"step": 161
},
{
"epoch": 0.7301408450704225,
"grad_norm": 0.10388363152742386,
"learning_rate": 0.00018828333670926764,
"loss": 0.5915,
"step": 162
},
{
"epoch": 0.7346478873239437,
"grad_norm": 0.10535863786935806,
"learning_rate": 0.00018811137157862082,
"loss": 0.5892,
"step": 163
},
{
"epoch": 0.7391549295774648,
"grad_norm": 0.12875902652740479,
"learning_rate": 0.000187938233375289,
"loss": 0.5904,
"step": 164
},
{
"epoch": 0.7436619718309859,
"grad_norm": 0.10576613992452621,
"learning_rate": 0.00018776392440435146,
"loss": 0.616,
"step": 165
},
{
"epoch": 0.7481690140845071,
"grad_norm": 0.11017853021621704,
"learning_rate": 0.00018758844698647456,
"loss": 0.5802,
"step": 166
},
{
"epoch": 0.7526760563380281,
"grad_norm": 0.1325456202030182,
"learning_rate": 0.00018741180345788072,
"loss": 0.5984,
"step": 167
},
{
"epoch": 0.7571830985915493,
"grad_norm": 0.09516138583421707,
"learning_rate": 0.00018723399617031751,
"loss": 0.6039,
"step": 168
},
{
"epoch": 0.7616901408450705,
"grad_norm": 0.12991869449615479,
"learning_rate": 0.0001870550274910261,
"loss": 0.5868,
"step": 169
},
{
"epoch": 0.7661971830985915,
"grad_norm": 0.12943920493125916,
"learning_rate": 0.00018687489980270998,
"loss": 0.5894,
"step": 170
},
{
"epoch": 0.7707042253521127,
"grad_norm": 0.10891859978437424,
"learning_rate": 0.00018669361550350307,
"loss": 0.6213,
"step": 171
},
{
"epoch": 0.7752112676056339,
"grad_norm": 0.18440082669258118,
"learning_rate": 0.00018651117700693793,
"loss": 0.5791,
"step": 172
},
{
"epoch": 0.7797183098591549,
"grad_norm": 0.10212352871894836,
"learning_rate": 0.00018632758674191343,
"loss": 0.5957,
"step": 173
},
{
"epoch": 0.7842253521126761,
"grad_norm": 0.12365531921386719,
"learning_rate": 0.00018614284715266264,
"loss": 0.6341,
"step": 174
},
{
"epoch": 0.7887323943661971,
"grad_norm": 0.16831544041633606,
"learning_rate": 0.00018595696069872013,
"loss": 0.5904,
"step": 175
},
{
"epoch": 0.7932394366197183,
"grad_norm": 0.10575060546398163,
"learning_rate": 0.0001857699298548893,
"loss": 0.574,
"step": 176
},
{
"epoch": 0.7977464788732395,
"grad_norm": 0.09861788153648376,
"learning_rate": 0.00018558175711120946,
"loss": 0.5694,
"step": 177
},
{
"epoch": 0.8022535211267605,
"grad_norm": 0.10814764350652695,
"learning_rate": 0.00018539244497292248,
"loss": 0.5881,
"step": 178
},
{
"epoch": 0.8067605633802817,
"grad_norm": 0.1235693171620369,
"learning_rate": 0.00018520199596043976,
"loss": 0.5941,
"step": 179
},
{
"epoch": 0.8112676056338028,
"grad_norm": 0.11930661648511887,
"learning_rate": 0.0001850104126093084,
"loss": 0.598,
"step": 180
},
{
"epoch": 0.8157746478873239,
"grad_norm": 0.11203116923570633,
"learning_rate": 0.00018481769747017752,
"loss": 0.5903,
"step": 181
},
{
"epoch": 0.8202816901408451,
"grad_norm": 0.32203975319862366,
"learning_rate": 0.00018462385310876443,
"loss": 0.5577,
"step": 182
},
{
"epoch": 0.8247887323943662,
"grad_norm": 0.11607527732849121,
"learning_rate": 0.00018442888210582026,
"loss": 0.5881,
"step": 183
},
{
"epoch": 0.8292957746478873,
"grad_norm": 0.1148713156580925,
"learning_rate": 0.00018423278705709573,
"loss": 0.5661,
"step": 184
},
{
"epoch": 0.8338028169014085,
"grad_norm": 0.11103823035955429,
"learning_rate": 0.00018403557057330666,
"loss": 0.5772,
"step": 185
},
{
"epoch": 0.8383098591549296,
"grad_norm": 0.11443036794662476,
"learning_rate": 0.000183837235280099,
"loss": 0.5958,
"step": 186
},
{
"epoch": 0.8428169014084507,
"grad_norm": 0.1051875427365303,
"learning_rate": 0.00018363778381801402,
"loss": 0.5774,
"step": 187
},
{
"epoch": 0.8473239436619718,
"grad_norm": 0.10837268084287643,
"learning_rate": 0.0001834372188424532,
"loss": 0.6333,
"step": 188
},
{
"epoch": 0.851830985915493,
"grad_norm": 0.10711036622524261,
"learning_rate": 0.00018323554302364272,
"loss": 0.5734,
"step": 189
},
{
"epoch": 0.856338028169014,
"grad_norm": 0.10732840746641159,
"learning_rate": 0.00018303275904659806,
"loss": 0.5941,
"step": 190
},
{
"epoch": 0.8608450704225352,
"grad_norm": 0.1174219474196434,
"learning_rate": 0.00018282886961108817,
"loss": 0.5983,
"step": 191
},
{
"epoch": 0.8653521126760564,
"grad_norm": 0.11654770374298096,
"learning_rate": 0.0001826238774315995,
"loss": 0.6075,
"step": 192
},
{
"epoch": 0.8698591549295774,
"grad_norm": 0.09939175844192505,
"learning_rate": 0.00018241778523729995,
"loss": 0.5731,
"step": 193
},
{
"epoch": 0.8743661971830986,
"grad_norm": 0.10806521773338318,
"learning_rate": 0.0001822105957720025,
"loss": 0.6029,
"step": 194
},
{
"epoch": 0.8788732394366198,
"grad_norm": 0.11704345047473907,
"learning_rate": 0.0001820023117941287,
"loss": 0.5376,
"step": 195
},
{
"epoch": 0.8833802816901408,
"grad_norm": 0.1095748320221901,
"learning_rate": 0.00018179293607667178,
"loss": 0.5714,
"step": 196
},
{
"epoch": 0.887887323943662,
"grad_norm": 0.130691796541214,
"learning_rate": 0.00018158247140716004,
"loss": 0.6168,
"step": 197
},
{
"epoch": 0.8923943661971832,
"grad_norm": 0.1149544045329094,
"learning_rate": 0.0001813709205876194,
"loss": 0.5928,
"step": 198
},
{
"epoch": 0.8969014084507042,
"grad_norm": 0.12969551980495453,
"learning_rate": 0.00018115828643453647,
"loss": 0.6044,
"step": 199
},
{
"epoch": 0.9014084507042254,
"grad_norm": 0.12071943283081055,
"learning_rate": 0.00018094457177882068,
"loss": 0.6057,
"step": 200
},
{
"epoch": 0.9059154929577464,
"grad_norm": 0.11709054559469223,
"learning_rate": 0.00018072977946576678,
"loss": 0.5769,
"step": 201
},
{
"epoch": 0.9104225352112676,
"grad_norm": 0.12902222573757172,
"learning_rate": 0.00018051391235501696,
"loss": 0.6006,
"step": 202
},
{
"epoch": 0.9149295774647888,
"grad_norm": 0.11009194701910019,
"learning_rate": 0.00018029697332052277,
"loss": 0.5779,
"step": 203
},
{
"epoch": 0.9194366197183098,
"grad_norm": 0.1268339455127716,
"learning_rate": 0.0001800789652505068,
"loss": 0.5761,
"step": 204
},
{
"epoch": 0.923943661971831,
"grad_norm": 0.13632765412330627,
"learning_rate": 0.00017985989104742434,
"loss": 0.6021,
"step": 205
},
{
"epoch": 0.9284507042253521,
"grad_norm": 0.11911677569150925,
"learning_rate": 0.00017963975362792454,
"loss": 0.5727,
"step": 206
},
{
"epoch": 0.9329577464788732,
"grad_norm": 0.143024280667305,
"learning_rate": 0.00017941855592281184,
"loss": 0.5952,
"step": 207
},
{
"epoch": 0.9374647887323944,
"grad_norm": 0.12256220728158951,
"learning_rate": 0.00017919630087700672,
"loss": 0.5981,
"step": 208
},
{
"epoch": 0.9419718309859155,
"grad_norm": 0.12445315718650818,
"learning_rate": 0.00017897299144950662,
"loss": 0.5765,
"step": 209
},
{
"epoch": 0.9464788732394366,
"grad_norm": 0.11824553459882736,
"learning_rate": 0.00017874863061334657,
"loss": 0.5752,
"step": 210
},
{
"epoch": 0.9509859154929577,
"grad_norm": 0.10992954671382904,
"learning_rate": 0.00017852322135555946,
"loss": 0.5859,
"step": 211
},
{
"epoch": 0.9554929577464789,
"grad_norm": 0.1199469268321991,
"learning_rate": 0.00017829676667713642,
"loss": 0.5991,
"step": 212
},
{
"epoch": 0.96,
"grad_norm": 0.11440328508615494,
"learning_rate": 0.0001780692695929868,
"loss": 0.6178,
"step": 213
},
{
"epoch": 0.9645070422535211,
"grad_norm": 0.1150740534067154,
"learning_rate": 0.00017784073313189795,
"loss": 0.5874,
"step": 214
},
{
"epoch": 0.9690140845070423,
"grad_norm": 0.1373901069164276,
"learning_rate": 0.0001776111603364952,
"loss": 0.5924,
"step": 215
},
{
"epoch": 0.9735211267605633,
"grad_norm": 0.11277955770492554,
"learning_rate": 0.000177380554263201,
"loss": 0.5903,
"step": 216
},
{
"epoch": 0.9780281690140845,
"grad_norm": 0.1162528321146965,
"learning_rate": 0.0001771489179821943,
"loss": 0.5654,
"step": 217
},
{
"epoch": 0.9825352112676057,
"grad_norm": 0.1154017448425293,
"learning_rate": 0.0001769162545773699,
"loss": 0.5799,
"step": 218
},
{
"epoch": 0.9870422535211267,
"grad_norm": 0.12194578349590302,
"learning_rate": 0.00017668256714629713,
"loss": 0.5735,
"step": 219
},
{
"epoch": 0.9915492957746479,
"grad_norm": 0.1068485751748085,
"learning_rate": 0.00017644785880017874,
"loss": 0.5548,
"step": 220
},
{
"epoch": 0.9960563380281691,
"grad_norm": 0.12018559128046036,
"learning_rate": 0.0001762121326638095,
"loss": 0.6097,
"step": 221
},
{
"epoch": 1.0,
"grad_norm": 0.11836925894021988,
"learning_rate": 0.00017597539187553447,
"loss": 0.5916,
"step": 222
},
{
"epoch": 1.0045070422535212,
"grad_norm": 0.10781855881214142,
"learning_rate": 0.00017573763958720736,
"loss": 0.5735,
"step": 223
},
{
"epoch": 1.0090140845070423,
"grad_norm": 0.11357908695936203,
"learning_rate": 0.00017549887896414851,
"loss": 0.5899,
"step": 224
},
{
"epoch": 1.0135211267605633,
"grad_norm": 0.10758072882890701,
"learning_rate": 0.00017525911318510274,
"loss": 0.5952,
"step": 225
},
{
"epoch": 1.0180281690140844,
"grad_norm": 0.10980618000030518,
"learning_rate": 0.00017501834544219697,
"loss": 0.5673,
"step": 226
},
{
"epoch": 1.0225352112676056,
"grad_norm": 0.10185301303863525,
"learning_rate": 0.0001747765789408979,
"loss": 0.5709,
"step": 227
},
{
"epoch": 1.0270422535211268,
"grad_norm": 0.10787362605333328,
"learning_rate": 0.00017453381689996916,
"loss": 0.5742,
"step": 228
},
{
"epoch": 1.031549295774648,
"grad_norm": 0.10553862154483795,
"learning_rate": 0.00017429006255142851,
"loss": 0.5666,
"step": 229
},
{
"epoch": 1.036056338028169,
"grad_norm": 0.11908821016550064,
"learning_rate": 0.00017404531914050483,
"loss": 0.6023,
"step": 230
},
{
"epoch": 1.04056338028169,
"grad_norm": 0.11501052975654602,
"learning_rate": 0.00017379958992559493,
"loss": 0.6038,
"step": 231
},
{
"epoch": 1.0450704225352112,
"grad_norm": 0.11365503072738647,
"learning_rate": 0.00017355287817822013,
"loss": 0.5774,
"step": 232
},
{
"epoch": 1.0495774647887324,
"grad_norm": 0.1214151382446289,
"learning_rate": 0.00017330518718298264,
"loss": 0.5707,
"step": 233
},
{
"epoch": 1.0540845070422535,
"grad_norm": 0.12372933328151703,
"learning_rate": 0.00017305652023752205,
"loss": 0.5665,
"step": 234
},
{
"epoch": 1.0585915492957747,
"grad_norm": 0.1101953536272049,
"learning_rate": 0.00017280688065247118,
"loss": 0.5547,
"step": 235
},
{
"epoch": 1.0630985915492959,
"grad_norm": 0.12429898232221603,
"learning_rate": 0.00017255627175141215,
"loss": 0.5855,
"step": 236
},
{
"epoch": 1.0676056338028168,
"grad_norm": 0.12096191197633743,
"learning_rate": 0.0001723046968708321,
"loss": 0.5754,
"step": 237
},
{
"epoch": 1.072112676056338,
"grad_norm": 0.11296476423740387,
"learning_rate": 0.0001720521593600787,
"loss": 0.6131,
"step": 238
},
{
"epoch": 1.0766197183098591,
"grad_norm": 0.16034449636936188,
"learning_rate": 0.00017179866258131568,
"loss": 0.583,
"step": 239
},
{
"epoch": 1.0811267605633803,
"grad_norm": 0.1200270801782608,
"learning_rate": 0.000171544209909478,
"loss": 0.5689,
"step": 240
},
{
"epoch": 1.0856338028169015,
"grad_norm": 0.11855066567659378,
"learning_rate": 0.0001712888047322269,
"loss": 0.6263,
"step": 241
},
{
"epoch": 1.0901408450704226,
"grad_norm": 0.11620208621025085,
"learning_rate": 0.00017103245044990475,
"loss": 0.55,
"step": 242
},
{
"epoch": 1.0946478873239436,
"grad_norm": 0.11482568085193634,
"learning_rate": 0.00017077515047549008,
"loss": 0.5802,
"step": 243
},
{
"epoch": 1.0991549295774647,
"grad_norm": 0.12208764255046844,
"learning_rate": 0.00017051690823455162,
"loss": 0.5672,
"step": 244
},
{
"epoch": 1.103661971830986,
"grad_norm": 0.14577889442443848,
"learning_rate": 0.00017025772716520323,
"loss": 0.5889,
"step": 245
},
{
"epoch": 1.108169014084507,
"grad_norm": 0.13474218547344208,
"learning_rate": 0.00016999761071805771,
"loss": 0.5709,
"step": 246
},
{
"epoch": 1.1126760563380282,
"grad_norm": 0.15604770183563232,
"learning_rate": 0.00016973656235618113,
"loss": 0.5923,
"step": 247
},
{
"epoch": 1.1171830985915494,
"grad_norm": 0.09855290502309799,
"learning_rate": 0.00016947458555504664,
"loss": 0.551,
"step": 248
},
{
"epoch": 1.1216901408450703,
"grad_norm": 0.14074093103408813,
"learning_rate": 0.0001692116838024881,
"loss": 0.5418,
"step": 249
},
{
"epoch": 1.1261971830985915,
"grad_norm": 0.13044053316116333,
"learning_rate": 0.00016894786059865383,
"loss": 0.5672,
"step": 250
},
{
"epoch": 1.1307042253521127,
"grad_norm": 0.11453069746494293,
"learning_rate": 0.00016868311945595978,
"loss": 0.5794,
"step": 251
},
{
"epoch": 1.1352112676056338,
"grad_norm": 0.13302022218704224,
"learning_rate": 0.00016841746389904304,
"loss": 0.5389,
"step": 252
},
{
"epoch": 1.139718309859155,
"grad_norm": 0.11696815490722656,
"learning_rate": 0.0001681508974647147,
"loss": 0.5419,
"step": 253
},
{
"epoch": 1.144225352112676,
"grad_norm": 0.13234978914260864,
"learning_rate": 0.0001678834237019129,
"loss": 0.5247,
"step": 254
},
{
"epoch": 1.1487323943661971,
"grad_norm": 0.12429715692996979,
"learning_rate": 0.00016761504617165537,
"loss": 0.5477,
"step": 255
},
{
"epoch": 1.1532394366197183,
"grad_norm": 0.145405575633049,
"learning_rate": 0.00016734576844699235,
"loss": 0.5931,
"step": 256
},
{
"epoch": 1.1577464788732394,
"grad_norm": 0.12818464636802673,
"learning_rate": 0.00016707559411295874,
"loss": 0.5467,
"step": 257
},
{
"epoch": 1.1622535211267606,
"grad_norm": 0.1135251522064209,
"learning_rate": 0.00016680452676652642,
"loss": 0.5456,
"step": 258
},
{
"epoch": 1.1667605633802818,
"grad_norm": 0.12029904127120972,
"learning_rate": 0.00016653257001655652,
"loss": 0.5854,
"step": 259
},
{
"epoch": 1.1712676056338027,
"grad_norm": 0.17769469320774078,
"learning_rate": 0.00016625972748375128,
"loss": 0.5633,
"step": 260
},
{
"epoch": 1.1757746478873239,
"grad_norm": 0.09973835945129395,
"learning_rate": 0.00016598600280060566,
"loss": 0.5254,
"step": 261
},
{
"epoch": 1.180281690140845,
"grad_norm": 0.11614567786455154,
"learning_rate": 0.00016571139961135927,
"loss": 0.5781,
"step": 262
},
{
"epoch": 1.1847887323943662,
"grad_norm": 0.12455403804779053,
"learning_rate": 0.0001654359215719478,
"loss": 0.5932,
"step": 263
},
{
"epoch": 1.1892957746478874,
"grad_norm": 0.10969316214323044,
"learning_rate": 0.0001651595723499541,
"loss": 0.5989,
"step": 264
},
{
"epoch": 1.1938028169014085,
"grad_norm": 0.23572060465812683,
"learning_rate": 0.00016488235562455965,
"loss": 0.5748,
"step": 265
},
{
"epoch": 1.1983098591549295,
"grad_norm": 0.1086173951625824,
"learning_rate": 0.00016460427508649546,
"loss": 0.5773,
"step": 266
},
{
"epoch": 1.2028169014084507,
"grad_norm": 0.1331978142261505,
"learning_rate": 0.00016432533443799284,
"loss": 0.5821,
"step": 267
},
{
"epoch": 1.2073239436619718,
"grad_norm": 0.1528872549533844,
"learning_rate": 0.00016404553739273427,
"loss": 0.5652,
"step": 268
},
{
"epoch": 1.211830985915493,
"grad_norm": 0.11898130923509598,
"learning_rate": 0.0001637648876758039,
"loss": 0.5689,
"step": 269
},
{
"epoch": 1.2163380281690142,
"grad_norm": 0.1391112506389618,
"learning_rate": 0.00016348338902363787,
"loss": 0.6048,
"step": 270
},
{
"epoch": 1.220845070422535,
"grad_norm": 0.11310067027807236,
"learning_rate": 0.00016320104518397472,
"loss": 0.5668,
"step": 271
},
{
"epoch": 1.2253521126760563,
"grad_norm": 0.12252317368984222,
"learning_rate": 0.00016291785991580534,
"loss": 0.5977,
"step": 272
},
{
"epoch": 1.2298591549295774,
"grad_norm": 0.13545405864715576,
"learning_rate": 0.00016263383698932306,
"loss": 0.5651,
"step": 273
},
{
"epoch": 1.2343661971830986,
"grad_norm": 0.14084038138389587,
"learning_rate": 0.00016234898018587337,
"loss": 0.5767,
"step": 274
},
{
"epoch": 1.2388732394366198,
"grad_norm": 0.13004128634929657,
"learning_rate": 0.00016206329329790354,
"loss": 0.5675,
"step": 275
},
{
"epoch": 1.243380281690141,
"grad_norm": 0.15050950646400452,
"learning_rate": 0.00016177678012891232,
"loss": 0.5874,
"step": 276
},
{
"epoch": 1.247887323943662,
"grad_norm": 0.12289810925722122,
"learning_rate": 0.00016148944449339902,
"loss": 0.5644,
"step": 277
},
{
"epoch": 1.252394366197183,
"grad_norm": 0.13767307996749878,
"learning_rate": 0.00016120129021681296,
"loss": 0.5602,
"step": 278
},
{
"epoch": 1.2569014084507042,
"grad_norm": 0.14790986478328705,
"learning_rate": 0.0001609123211355025,
"loss": 0.5526,
"step": 279
},
{
"epoch": 1.2614084507042254,
"grad_norm": 0.12315787374973297,
"learning_rate": 0.0001606225410966638,
"loss": 0.5788,
"step": 280
},
{
"epoch": 1.2659154929577465,
"grad_norm": 0.12627176940441132,
"learning_rate": 0.00016033195395828985,
"loss": 0.5431,
"step": 281
},
{
"epoch": 1.2704225352112677,
"grad_norm": 0.12655603885650635,
"learning_rate": 0.00016004056358911883,
"loss": 0.5849,
"step": 282
},
{
"epoch": 1.2749295774647886,
"grad_norm": 0.11540428549051285,
"learning_rate": 0.0001597483738685829,
"loss": 0.5538,
"step": 283
},
{
"epoch": 1.2794366197183098,
"grad_norm": 0.14923222362995148,
"learning_rate": 0.00015945538868675628,
"loss": 0.598,
"step": 284
},
{
"epoch": 1.283943661971831,
"grad_norm": 0.1102415919303894,
"learning_rate": 0.00015916161194430372,
"loss": 0.5503,
"step": 285
},
{
"epoch": 1.2884507042253521,
"grad_norm": 0.1324516087770462,
"learning_rate": 0.00015886704755242829,
"loss": 0.5817,
"step": 286
},
{
"epoch": 1.2929577464788733,
"grad_norm": 0.11458254605531693,
"learning_rate": 0.00015857169943281948,
"loss": 0.5566,
"step": 287
},
{
"epoch": 1.2974647887323942,
"grad_norm": 0.11885883659124374,
"learning_rate": 0.00015827557151760105,
"loss": 0.5518,
"step": 288
},
{
"epoch": 1.3019718309859156,
"grad_norm": 0.12516717612743378,
"learning_rate": 0.00015797866774927848,
"loss": 0.5898,
"step": 289
},
{
"epoch": 1.3064788732394366,
"grad_norm": 0.14177542924880981,
"learning_rate": 0.00015768099208068664,
"loss": 0.6099,
"step": 290
},
{
"epoch": 1.3109859154929577,
"grad_norm": 0.1278507560491562,
"learning_rate": 0.00015738254847493704,
"loss": 0.5552,
"step": 291
},
{
"epoch": 1.315492957746479,
"grad_norm": 0.12453269958496094,
"learning_rate": 0.00015708334090536527,
"loss": 0.5922,
"step": 292
},
{
"epoch": 1.32,
"grad_norm": 0.1163586899638176,
"learning_rate": 0.00015678337335547782,
"loss": 0.5345,
"step": 293
},
{
"epoch": 1.3245070422535212,
"grad_norm": 0.11830248683691025,
"learning_rate": 0.00015648264981889934,
"loss": 0.5801,
"step": 294
},
{
"epoch": 1.3290140845070422,
"grad_norm": 0.17942926287651062,
"learning_rate": 0.00015618117429931926,
"loss": 0.5367,
"step": 295
},
{
"epoch": 1.3335211267605633,
"grad_norm": 0.13405169546604156,
"learning_rate": 0.00015587895081043844,
"loss": 0.564,
"step": 296
},
{
"epoch": 1.3380281690140845,
"grad_norm": 0.11632625758647919,
"learning_rate": 0.00015557598337591607,
"loss": 0.5629,
"step": 297
},
{
"epoch": 1.3425352112676057,
"grad_norm": 0.15242446959018707,
"learning_rate": 0.00015527227602931572,
"loss": 0.6318,
"step": 298
},
{
"epoch": 1.3470422535211268,
"grad_norm": 0.13854624330997467,
"learning_rate": 0.00015496783281405177,
"loss": 0.5683,
"step": 299
},
{
"epoch": 1.3515492957746478,
"grad_norm": 0.1268724948167801,
"learning_rate": 0.00015466265778333574,
"loss": 0.5661,
"step": 300
},
{
"epoch": 1.356056338028169,
"grad_norm": 0.1456933319568634,
"learning_rate": 0.00015435675500012212,
"loss": 0.5801,
"step": 301
},
{
"epoch": 1.36056338028169,
"grad_norm": 0.1346408724784851,
"learning_rate": 0.00015405012853705432,
"loss": 0.5811,
"step": 302
},
{
"epoch": 1.3650704225352113,
"grad_norm": 0.1391272395849228,
"learning_rate": 0.00015374278247641052,
"loss": 0.5864,
"step": 303
},
{
"epoch": 1.3695774647887324,
"grad_norm": 0.11695600301027298,
"learning_rate": 0.00015343472091004925,
"loss": 0.5919,
"step": 304
},
{
"epoch": 1.3740845070422536,
"grad_norm": 0.13721643388271332,
"learning_rate": 0.00015312594793935494,
"loss": 0.5863,
"step": 305
},
{
"epoch": 1.3785915492957748,
"grad_norm": 0.1210237517952919,
"learning_rate": 0.00015281646767518337,
"loss": 0.5549,
"step": 306
},
{
"epoch": 1.3830985915492957,
"grad_norm": 0.14290763437747955,
"learning_rate": 0.00015250628423780683,
"loss": 0.5884,
"step": 307
},
{
"epoch": 1.3876056338028169,
"grad_norm": 0.14939945936203003,
"learning_rate": 0.00015219540175685938,
"loss": 0.5369,
"step": 308
},
{
"epoch": 1.392112676056338,
"grad_norm": 0.12331175804138184,
"learning_rate": 0.00015188382437128167,
"loss": 0.5702,
"step": 309
},
{
"epoch": 1.3966197183098592,
"grad_norm": 0.11888106167316437,
"learning_rate": 0.0001515715562292662,
"loss": 0.5274,
"step": 310
},
{
"epoch": 1.4011267605633804,
"grad_norm": 0.13149474561214447,
"learning_rate": 0.00015125860148820167,
"loss": 0.5785,
"step": 311
},
{
"epoch": 1.4056338028169013,
"grad_norm": 0.12267362326383591,
"learning_rate": 0.00015094496431461795,
"loss": 0.5455,
"step": 312
},
{
"epoch": 1.4101408450704225,
"grad_norm": 0.1183052584528923,
"learning_rate": 0.00015063064888413047,
"loss": 0.5453,
"step": 313
},
{
"epoch": 1.4146478873239436,
"grad_norm": 0.14539587497711182,
"learning_rate": 0.00015031565938138458,
"loss": 0.5424,
"step": 314
},
{
"epoch": 1.4191549295774648,
"grad_norm": 0.12015419453382492,
"learning_rate": 0.00015000000000000001,
"loss": 0.555,
"step": 315
},
{
"epoch": 1.423661971830986,
"grad_norm": 0.11795421689748764,
"learning_rate": 0.00014968367494251484,
"loss": 0.5337,
"step": 316
},
{
"epoch": 1.428169014084507,
"grad_norm": 0.2184419184923172,
"learning_rate": 0.00014936668842032973,
"loss": 0.5802,
"step": 317
},
{
"epoch": 1.4326760563380283,
"grad_norm": 0.12965649366378784,
"learning_rate": 0.00014904904465365168,
"loss": 0.5455,
"step": 318
},
{
"epoch": 1.4371830985915492,
"grad_norm": 0.13248752057552338,
"learning_rate": 0.000148730747871438,
"loss": 0.5809,
"step": 319
},
{
"epoch": 1.4416901408450704,
"grad_norm": 0.12989097833633423,
"learning_rate": 0.00014841180231133988,
"loss": 0.5702,
"step": 320
},
{
"epoch": 1.4461971830985916,
"grad_norm": 0.11466707289218903,
"learning_rate": 0.00014809221221964608,
"loss": 0.5853,
"step": 321
},
{
"epoch": 1.4507042253521127,
"grad_norm": 0.13086023926734924,
"learning_rate": 0.0001477719818512263,
"loss": 0.5637,
"step": 322
},
{
"epoch": 1.455211267605634,
"grad_norm": 0.12968841195106506,
"learning_rate": 0.0001474511154694746,
"loss": 0.528,
"step": 323
},
{
"epoch": 1.4597183098591549,
"grad_norm": 0.10484195500612259,
"learning_rate": 0.00014712961734625264,
"loss": 0.5357,
"step": 324
},
{
"epoch": 1.464225352112676,
"grad_norm": 0.12601444125175476,
"learning_rate": 0.00014680749176183274,
"loss": 0.5336,
"step": 325
},
{
"epoch": 1.4687323943661972,
"grad_norm": 0.1338932365179062,
"learning_rate": 0.00014648474300484095,
"loss": 0.5427,
"step": 326
},
{
"epoch": 1.4732394366197183,
"grad_norm": 0.11979039758443832,
"learning_rate": 0.00014616137537219997,
"loss": 0.5655,
"step": 327
},
{
"epoch": 1.4777464788732395,
"grad_norm": 0.1427401751279831,
"learning_rate": 0.00014583739316907188,
"loss": 0.5734,
"step": 328
},
{
"epoch": 1.4822535211267605,
"grad_norm": 0.13836687803268433,
"learning_rate": 0.0001455128007088009,
"loss": 0.5407,
"step": 329
},
{
"epoch": 1.4867605633802816,
"grad_norm": 0.10016699135303497,
"learning_rate": 0.00014518760231285583,
"loss": 0.5709,
"step": 330
},
{
"epoch": 1.4912676056338028,
"grad_norm": 0.13350802659988403,
"learning_rate": 0.00014486180231077278,
"loss": 0.5508,
"step": 331
},
{
"epoch": 1.495774647887324,
"grad_norm": 0.11883152276277542,
"learning_rate": 0.00014453540504009714,
"loss": 0.5258,
"step": 332
},
{
"epoch": 1.5002816901408451,
"grad_norm": 0.11694121360778809,
"learning_rate": 0.0001442084148463262,
"loss": 0.567,
"step": 333
},
{
"epoch": 1.504788732394366,
"grad_norm": 0.13168871402740479,
"learning_rate": 0.00014388083608285113,
"loss": 0.5504,
"step": 334
},
{
"epoch": 1.5092957746478874,
"grad_norm": 0.11850599944591522,
"learning_rate": 0.00014355267311089897,
"loss": 0.5587,
"step": 335
},
{
"epoch": 1.5138028169014084,
"grad_norm": 0.11114364862442017,
"learning_rate": 0.00014322393029947468,
"loss": 0.5398,
"step": 336
},
{
"epoch": 1.5183098591549296,
"grad_norm": 0.1272014081478119,
"learning_rate": 0.00014289461202530296,
"loss": 0.5487,
"step": 337
},
{
"epoch": 1.5228169014084507,
"grad_norm": 0.11661308258771896,
"learning_rate": 0.00014256472267276982,
"loss": 0.5422,
"step": 338
},
{
"epoch": 1.5273239436619719,
"grad_norm": 0.11852391809225082,
"learning_rate": 0.0001422342666338645,
"loss": 0.5594,
"step": 339
},
{
"epoch": 1.531830985915493,
"grad_norm": 0.15416082739830017,
"learning_rate": 0.00014190324830812067,
"loss": 0.5379,
"step": 340
},
{
"epoch": 1.536338028169014,
"grad_norm": 0.11813542246818542,
"learning_rate": 0.00014157167210255815,
"loss": 0.5642,
"step": 341
},
{
"epoch": 1.5408450704225352,
"grad_norm": 0.11259282380342484,
"learning_rate": 0.00014123954243162404,
"loss": 0.5681,
"step": 342
},
{
"epoch": 1.5453521126760563,
"grad_norm": 0.12045090645551682,
"learning_rate": 0.00014090686371713402,
"loss": 0.5393,
"step": 343
},
{
"epoch": 1.5498591549295775,
"grad_norm": 0.11575852334499359,
"learning_rate": 0.00014057364038821347,
"loss": 0.5578,
"step": 344
},
{
"epoch": 1.5543661971830987,
"grad_norm": 0.1217094361782074,
"learning_rate": 0.0001402398768812385,
"loss": 0.5191,
"step": 345
},
{
"epoch": 1.5588732394366196,
"grad_norm": 0.12148681282997131,
"learning_rate": 0.00013990557763977695,
"loss": 0.5386,
"step": 346
},
{
"epoch": 1.563380281690141,
"grad_norm": 0.120498426258564,
"learning_rate": 0.0001395707471145291,
"loss": 0.5481,
"step": 347
},
{
"epoch": 1.567887323943662,
"grad_norm": 0.12890687584877014,
"learning_rate": 0.0001392353897632685,
"loss": 0.5487,
"step": 348
},
{
"epoch": 1.572394366197183,
"grad_norm": 0.19078081846237183,
"learning_rate": 0.0001388995100507827,
"loss": 0.5396,
"step": 349
},
{
"epoch": 1.5769014084507043,
"grad_norm": 0.1250527799129486,
"learning_rate": 0.0001385631124488136,
"loss": 0.5489,
"step": 350
},
{
"epoch": 1.5814084507042252,
"grad_norm": 0.14552123844623566,
"learning_rate": 0.00013822620143599804,
"loss": 0.5672,
"step": 351
},
{
"epoch": 1.5859154929577466,
"grad_norm": 0.10857994854450226,
"learning_rate": 0.00013788878149780827,
"loss": 0.5457,
"step": 352
},
{
"epoch": 1.5904225352112675,
"grad_norm": 0.1588989943265915,
"learning_rate": 0.00013755085712649202,
"loss": 0.5691,
"step": 353
},
{
"epoch": 1.5949295774647887,
"grad_norm": 0.1268385797739029,
"learning_rate": 0.0001372124328210129,
"loss": 0.5691,
"step": 354
},
{
"epoch": 1.5994366197183099,
"grad_norm": 0.12925177812576294,
"learning_rate": 0.00013687351308699027,
"loss": 0.5465,
"step": 355
},
{
"epoch": 1.603943661971831,
"grad_norm": 0.118260458111763,
"learning_rate": 0.00013653410243663952,
"loss": 0.5307,
"step": 356
},
{
"epoch": 1.6084507042253522,
"grad_norm": 0.12612397968769073,
"learning_rate": 0.00013619420538871178,
"loss": 0.5586,
"step": 357
},
{
"epoch": 1.6129577464788731,
"grad_norm": 0.11145804077386856,
"learning_rate": 0.00013585382646843396,
"loss": 0.5785,
"step": 358
},
{
"epoch": 1.6174647887323945,
"grad_norm": 0.12373437732458115,
"learning_rate": 0.00013551297020744825,
"loss": 0.5458,
"step": 359
},
{
"epoch": 1.6219718309859155,
"grad_norm": 0.11881903558969498,
"learning_rate": 0.00013517164114375195,
"loss": 0.5599,
"step": 360
},
{
"epoch": 1.6264788732394366,
"grad_norm": 0.11561185866594315,
"learning_rate": 0.00013482984382163712,
"loss": 0.5342,
"step": 361
},
{
"epoch": 1.6309859154929578,
"grad_norm": 0.11933106929063797,
"learning_rate": 0.00013448758279162993,
"loss": 0.5126,
"step": 362
},
{
"epoch": 1.6354929577464787,
"grad_norm": 0.1258057802915573,
"learning_rate": 0.00013414486261043008,
"loss": 0.5531,
"step": 363
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.12655043601989746,
"learning_rate": 0.00013380168784085027,
"loss": 0.5629,
"step": 364
},
{
"epoch": 1.644507042253521,
"grad_norm": 0.12148042023181915,
"learning_rate": 0.00013345806305175542,
"loss": 0.5339,
"step": 365
},
{
"epoch": 1.6490140845070422,
"grad_norm": 0.17970451712608337,
"learning_rate": 0.0001331139928180016,
"loss": 0.5586,
"step": 366
},
{
"epoch": 1.6535211267605634,
"grad_norm": 0.13397206366062164,
"learning_rate": 0.00013276948172037556,
"loss": 0.5451,
"step": 367
},
{
"epoch": 1.6580281690140843,
"grad_norm": 0.12288239598274231,
"learning_rate": 0.0001324245343455333,
"loss": 0.5378,
"step": 368
},
{
"epoch": 1.6625352112676057,
"grad_norm": 0.1233164519071579,
"learning_rate": 0.00013207915528593933,
"loss": 0.5355,
"step": 369
},
{
"epoch": 1.6670422535211267,
"grad_norm": 0.12358216941356659,
"learning_rate": 0.00013173334913980534,
"loss": 0.5578,
"step": 370
},
{
"epoch": 1.6715492957746478,
"grad_norm": 0.12331364303827286,
"learning_rate": 0.0001313871205110291,
"loss": 0.544,
"step": 371
},
{
"epoch": 1.676056338028169,
"grad_norm": 0.12452603131532669,
"learning_rate": 0.00013104047400913302,
"loss": 0.5519,
"step": 372
},
{
"epoch": 1.6805633802816902,
"grad_norm": 0.11694569885730743,
"learning_rate": 0.000130693414249203,
"loss": 0.5513,
"step": 373
},
{
"epoch": 1.6850704225352113,
"grad_norm": 0.12917865812778473,
"learning_rate": 0.00013034594585182677,
"loss": 0.5513,
"step": 374
},
{
"epoch": 1.6895774647887323,
"grad_norm": 0.12343605607748032,
"learning_rate": 0.0001299980734430324,
"loss": 0.5366,
"step": 375
},
{
"epoch": 1.6940845070422537,
"grad_norm": 0.11165818572044373,
"learning_rate": 0.000129649801654227,
"loss": 0.5371,
"step": 376
},
{
"epoch": 1.6985915492957746,
"grad_norm": 0.1372881680727005,
"learning_rate": 0.00012930113512213463,
"loss": 0.5446,
"step": 377
},
{
"epoch": 1.7030985915492958,
"grad_norm": 0.13788315653800964,
"learning_rate": 0.00012895207848873487,
"loss": 0.5343,
"step": 378
},
{
"epoch": 1.707605633802817,
"grad_norm": 0.11924686282873154,
"learning_rate": 0.00012860263640120085,
"loss": 0.5255,
"step": 379
},
{
"epoch": 1.7121126760563379,
"grad_norm": 0.11935974657535553,
"learning_rate": 0.0001282528135118375,
"loss": 0.5962,
"step": 380
},
{
"epoch": 1.7166197183098593,
"grad_norm": 0.1333233267068863,
"learning_rate": 0.0001279026144780196,
"loss": 0.5603,
"step": 381
},
{
"epoch": 1.7211267605633802,
"grad_norm": 0.1334204375743866,
"learning_rate": 0.00012755204396212966,
"loss": 0.5505,
"step": 382
},
{
"epoch": 1.7256338028169014,
"grad_norm": 0.131240114569664,
"learning_rate": 0.00012720110663149594,
"loss": 0.5538,
"step": 383
},
{
"epoch": 1.7301408450704225,
"grad_norm": 0.13378827273845673,
"learning_rate": 0.00012684980715833039,
"loss": 0.5555,
"step": 384
},
{
"epoch": 1.7346478873239437,
"grad_norm": 0.12199459224939346,
"learning_rate": 0.0001264981502196662,
"loss": 0.5642,
"step": 385
},
{
"epoch": 1.7391549295774649,
"grad_norm": 0.14442510902881622,
"learning_rate": 0.00012614614049729576,
"loss": 0.5405,
"step": 386
},
{
"epoch": 1.7436619718309858,
"grad_norm": 0.12474284321069717,
"learning_rate": 0.00012579378267770834,
"loss": 0.5849,
"step": 387
},
{
"epoch": 1.7481690140845072,
"grad_norm": 0.11631778627634048,
"learning_rate": 0.00012544108145202748,
"loss": 0.5273,
"step": 388
},
{
"epoch": 1.7526760563380281,
"grad_norm": 0.11749529093503952,
"learning_rate": 0.00012508804151594867,
"loss": 0.5916,
"step": 389
},
{
"epoch": 1.7571830985915493,
"grad_norm": 0.13148164749145508,
"learning_rate": 0.00012473466756967696,
"loss": 0.539,
"step": 390
},
{
"epoch": 1.7616901408450705,
"grad_norm": 0.12642063200473785,
"learning_rate": 0.00012438096431786408,
"loss": 0.5421,
"step": 391
},
{
"epoch": 1.7661971830985914,
"grad_norm": 0.12551520764827728,
"learning_rate": 0.00012402693646954607,
"loss": 0.5356,
"step": 392
},
{
"epoch": 1.7707042253521128,
"grad_norm": 0.12111272662878036,
"learning_rate": 0.00012367258873808052,
"loss": 0.5356,
"step": 393
},
{
"epoch": 1.7752112676056337,
"grad_norm": 0.12639492750167847,
"learning_rate": 0.00012331792584108374,
"loss": 0.5582,
"step": 394
},
{
"epoch": 1.779718309859155,
"grad_norm": 0.21740061044692993,
"learning_rate": 0.00012296295250036803,
"loss": 0.5388,
"step": 395
},
{
"epoch": 1.784225352112676,
"grad_norm": 0.12856127321720123,
"learning_rate": 0.00012260767344187873,
"loss": 0.5457,
"step": 396
},
{
"epoch": 1.788732394366197,
"grad_norm": 0.11823161691427231,
"learning_rate": 0.00012225209339563145,
"loss": 0.5385,
"step": 397
},
{
"epoch": 1.7932394366197184,
"grad_norm": 0.11629433184862137,
"learning_rate": 0.00012189621709564894,
"loss": 0.5495,
"step": 398
},
{
"epoch": 1.7977464788732394,
"grad_norm": 0.13767673075199127,
"learning_rate": 0.00012154004927989815,
"loss": 0.5202,
"step": 399
},
{
"epoch": 1.8022535211267605,
"grad_norm": 0.11361772567033768,
"learning_rate": 0.00012118359469022712,
"loss": 0.5414,
"step": 400
},
{
"epoch": 1.8067605633802817,
"grad_norm": 0.1155274510383606,
"learning_rate": 0.00012082685807230194,
"loss": 0.5372,
"step": 401
},
{
"epoch": 1.8112676056338028,
"grad_norm": 0.13188374042510986,
"learning_rate": 0.00012046984417554337,
"loss": 0.557,
"step": 402
},
{
"epoch": 1.815774647887324,
"grad_norm": 0.11928690969944,
"learning_rate": 0.00012011255775306378,
"loss": 0.5272,
"step": 403
},
{
"epoch": 1.820281690140845,
"grad_norm": 0.11563651263713837,
"learning_rate": 0.00011975500356160383,
"loss": 0.5154,
"step": 404
},
{
"epoch": 1.8247887323943663,
"grad_norm": 0.12366284430027008,
"learning_rate": 0.00011939718636146913,
"loss": 0.557,
"step": 405
},
{
"epoch": 1.8292957746478873,
"grad_norm": 0.11570242792367935,
"learning_rate": 0.00011903911091646684,
"loss": 0.5297,
"step": 406
},
{
"epoch": 1.8338028169014085,
"grad_norm": 0.12199252098798752,
"learning_rate": 0.00011868078199384229,
"loss": 0.533,
"step": 407
},
{
"epoch": 1.8383098591549296,
"grad_norm": 0.13045908510684967,
"learning_rate": 0.00011832220436421549,
"loss": 0.548,
"step": 408
},
{
"epoch": 1.8428169014084506,
"grad_norm": 0.20884227752685547,
"learning_rate": 0.00011796338280151756,
"loss": 0.5449,
"step": 409
},
{
"epoch": 1.847323943661972,
"grad_norm": 0.11599334329366684,
"learning_rate": 0.00011760432208292729,
"loss": 0.5355,
"step": 410
},
{
"epoch": 1.8518309859154929,
"grad_norm": 0.11243471503257751,
"learning_rate": 0.0001172450269888075,
"loss": 0.5569,
"step": 411
},
{
"epoch": 1.856338028169014,
"grad_norm": 0.1418810337781906,
"learning_rate": 0.00011688550230264128,
"loss": 0.5416,
"step": 412
},
{
"epoch": 1.8608450704225352,
"grad_norm": 0.12925301492214203,
"learning_rate": 0.0001165257528109685,
"loss": 0.5654,
"step": 413
},
{
"epoch": 1.8653521126760564,
"grad_norm": 0.1256323903799057,
"learning_rate": 0.0001161657833033219,
"loss": 0.5732,
"step": 414
},
{
"epoch": 1.8698591549295775,
"grad_norm": 0.12409226596355438,
"learning_rate": 0.00011580559857216347,
"loss": 0.5258,
"step": 415
},
{
"epoch": 1.8743661971830985,
"grad_norm": 0.12138303369283676,
"learning_rate": 0.00011544520341282053,
"loss": 0.5309,
"step": 416
},
{
"epoch": 1.8788732394366199,
"grad_norm": 0.12081386148929596,
"learning_rate": 0.00011508460262342197,
"loss": 0.531,
"step": 417
},
{
"epoch": 1.8833802816901408,
"grad_norm": 0.10810014605522156,
"learning_rate": 0.00011472380100483438,
"loss": 0.532,
"step": 418
},
{
"epoch": 1.887887323943662,
"grad_norm": 0.1251344531774521,
"learning_rate": 0.00011436280336059799,
"loss": 0.5214,
"step": 419
},
{
"epoch": 1.8923943661971832,
"grad_norm": 0.12114979326725006,
"learning_rate": 0.00011400161449686293,
"loss": 0.5596,
"step": 420
},
{
"epoch": 1.896901408450704,
"grad_norm": 0.11715710908174515,
"learning_rate": 0.00011364023922232503,
"loss": 0.5259,
"step": 421
},
{
"epoch": 1.9014084507042255,
"grad_norm": 0.12593631446361542,
"learning_rate": 0.00011327868234816203,
"loss": 0.5336,
"step": 422
},
{
"epoch": 1.9059154929577464,
"grad_norm": 0.1148916706442833,
"learning_rate": 0.00011291694868796929,
"loss": 0.5693,
"step": 423
},
{
"epoch": 1.9104225352112676,
"grad_norm": 0.10621920973062515,
"learning_rate": 0.00011255504305769589,
"loss": 0.5326,
"step": 424
},
{
"epoch": 1.9149295774647888,
"grad_norm": 0.12841050326824188,
"learning_rate": 0.00011219297027558038,
"loss": 0.6288,
"step": 425
},
{
"epoch": 1.9194366197183097,
"grad_norm": 0.12406205385923386,
"learning_rate": 0.0001118307351620867,
"loss": 0.5467,
"step": 426
},
{
"epoch": 1.923943661971831,
"grad_norm": 0.12403929233551025,
"learning_rate": 0.00011146834253984006,
"loss": 0.542,
"step": 427
},
{
"epoch": 1.928450704225352,
"grad_norm": 0.128191277384758,
"learning_rate": 0.00011110579723356256,
"loss": 0.568,
"step": 428
},
{
"epoch": 1.9329577464788732,
"grad_norm": 0.11979478597640991,
"learning_rate": 0.00011074310407000914,
"loss": 0.5645,
"step": 429
},
{
"epoch": 1.9374647887323944,
"grad_norm": 0.12257257848978043,
"learning_rate": 0.0001103802678779032,
"loss": 0.5444,
"step": 430
},
{
"epoch": 1.9419718309859155,
"grad_norm": 0.11612683534622192,
"learning_rate": 0.00011001729348787239,
"loss": 0.5313,
"step": 431
},
{
"epoch": 1.9464788732394367,
"grad_norm": 0.11943483352661133,
"learning_rate": 0.00010965418573238424,
"loss": 0.5698,
"step": 432
},
{
"epoch": 1.9509859154929576,
"grad_norm": 0.11692250519990921,
"learning_rate": 0.00010929094944568182,
"loss": 0.5395,
"step": 433
},
{
"epoch": 1.955492957746479,
"grad_norm": 0.12217355519533157,
"learning_rate": 0.00010892758946371944,
"loss": 0.5332,
"step": 434
},
{
"epoch": 1.96,
"grad_norm": 0.13013868033885956,
"learning_rate": 0.00010856411062409823,
"loss": 0.5402,
"step": 435
},
{
"epoch": 1.9645070422535211,
"grad_norm": 0.11624142527580261,
"learning_rate": 0.00010820051776600175,
"loss": 0.5619,
"step": 436
},
{
"epoch": 1.9690140845070423,
"grad_norm": 0.1206846535205841,
"learning_rate": 0.00010783681573013145,
"loss": 0.5469,
"step": 437
},
{
"epoch": 1.9735211267605632,
"grad_norm": 0.11562936753034592,
"learning_rate": 0.00010747300935864243,
"loss": 0.5434,
"step": 438
},
{
"epoch": 1.9780281690140846,
"grad_norm": 0.10758832097053528,
"learning_rate": 0.0001071091034950788,
"loss": 0.5295,
"step": 439
},
{
"epoch": 1.9825352112676056,
"grad_norm": 0.11116923391819,
"learning_rate": 0.00010674510298430935,
"loss": 0.5516,
"step": 440
},
{
"epoch": 1.9870422535211267,
"grad_norm": 0.10766938328742981,
"learning_rate": 0.00010638101267246283,
"loss": 0.5319,
"step": 441
},
{
"epoch": 1.991549295774648,
"grad_norm": 0.13807927072048187,
"learning_rate": 0.00010601683740686366,
"loss": 0.5482,
"step": 442
},
{
"epoch": 1.996056338028169,
"grad_norm": 0.11675193160772324,
"learning_rate": 0.00010565258203596722,
"loss": 0.5371,
"step": 443
},
{
"epoch": 2.0,
"grad_norm": 0.12133872509002686,
"learning_rate": 0.00010528825140929541,
"loss": 0.5284,
"step": 444
},
{
"epoch": 2.004507042253521,
"grad_norm": 0.12044970691204071,
"learning_rate": 0.00010492385037737207,
"loss": 0.5278,
"step": 445
},
{
"epoch": 2.0090140845070423,
"grad_norm": 0.12002495676279068,
"learning_rate": 0.00010455938379165835,
"loss": 0.5774,
"step": 446
},
{
"epoch": 2.0135211267605633,
"grad_norm": 0.11336331069469452,
"learning_rate": 0.00010419485650448814,
"loss": 0.5003,
"step": 447
},
{
"epoch": 2.0180281690140847,
"grad_norm": 0.1343923956155777,
"learning_rate": 0.00010383027336900355,
"loss": 0.5172,
"step": 448
},
{
"epoch": 2.0225352112676056,
"grad_norm": 0.12244565784931183,
"learning_rate": 0.00010346563923909014,
"loss": 0.5267,
"step": 449
},
{
"epoch": 2.0270422535211265,
"grad_norm": 0.11235007643699646,
"learning_rate": 0.00010310095896931242,
"loss": 0.5211,
"step": 450
},
{
"epoch": 2.031549295774648,
"grad_norm": 0.18670588731765747,
"learning_rate": 0.00010273623741484923,
"loss": 0.5181,
"step": 451
},
{
"epoch": 2.036056338028169,
"grad_norm": 0.12979289889335632,
"learning_rate": 0.00010237147943142898,
"loss": 0.5333,
"step": 452
},
{
"epoch": 2.0405633802816903,
"grad_norm": 0.11047399044036865,
"learning_rate": 0.00010200668987526512,
"loss": 0.5264,
"step": 453
},
{
"epoch": 2.045070422535211,
"grad_norm": 0.1219114437699318,
"learning_rate": 0.00010164187360299142,
"loss": 0.518,
"step": 454
},
{
"epoch": 2.0495774647887326,
"grad_norm": 0.12627394497394562,
"learning_rate": 0.00010127703547159739,
"loss": 0.5094,
"step": 455
},
{
"epoch": 2.0540845070422535,
"grad_norm": 0.11944904178380966,
"learning_rate": 0.00010091218033836348,
"loss": 0.5272,
"step": 456
},
{
"epoch": 2.0585915492957745,
"grad_norm": 0.13259534537792206,
"learning_rate": 0.00010054731306079656,
"loss": 0.5347,
"step": 457
},
{
"epoch": 2.063098591549296,
"grad_norm": 0.1272563338279724,
"learning_rate": 0.00010018243849656517,
"loss": 0.5144,
"step": 458
},
{
"epoch": 2.067605633802817,
"grad_norm": 0.10852837562561035,
"learning_rate": 9.981756150343485e-05,
"loss": 0.5384,
"step": 459
},
{
"epoch": 2.072112676056338,
"grad_norm": 0.12837117910385132,
"learning_rate": 9.945268693920346e-05,
"loss": 0.5534,
"step": 460
},
{
"epoch": 2.076619718309859,
"grad_norm": 0.1136341392993927,
"learning_rate": 9.908781966163655e-05,
"loss": 0.5318,
"step": 461
},
{
"epoch": 2.08112676056338,
"grad_norm": 0.11348137259483337,
"learning_rate": 9.872296452840264e-05,
"loss": 0.5138,
"step": 462
},
{
"epoch": 2.0856338028169015,
"grad_norm": 0.12767234444618225,
"learning_rate": 9.83581263970086e-05,
"loss": 0.516,
"step": 463
},
{
"epoch": 2.0901408450704224,
"grad_norm": 0.12416424602270126,
"learning_rate": 9.799331012473493e-05,
"loss": 0.5428,
"step": 464
},
{
"epoch": 2.094647887323944,
"grad_norm": 0.12291053682565689,
"learning_rate": 9.762852056857102e-05,
"loss": 0.5209,
"step": 465
},
{
"epoch": 2.0991549295774647,
"grad_norm": 0.14804038405418396,
"learning_rate": 9.726376258515078e-05,
"loss": 0.5522,
"step": 466
},
{
"epoch": 2.103661971830986,
"grad_norm": 0.12152467668056488,
"learning_rate": 9.689904103068758e-05,
"loss": 0.5358,
"step": 467
},
{
"epoch": 2.108169014084507,
"grad_norm": 0.11182091385126114,
"learning_rate": 9.653436076090988e-05,
"loss": 0.5497,
"step": 468
},
{
"epoch": 2.112676056338028,
"grad_norm": 0.107152059674263,
"learning_rate": 9.616972663099647e-05,
"loss": 0.5153,
"step": 469
},
{
"epoch": 2.1171830985915494,
"grad_norm": 0.12143649160861969,
"learning_rate": 9.580514349551187e-05,
"loss": 0.5852,
"step": 470
},
{
"epoch": 2.1216901408450703,
"grad_norm": 0.11740640550851822,
"learning_rate": 9.544061620834167e-05,
"loss": 0.5597,
"step": 471
},
{
"epoch": 2.1261971830985917,
"grad_norm": 0.10704807937145233,
"learning_rate": 9.507614962262795e-05,
"loss": 0.5306,
"step": 472
},
{
"epoch": 2.1307042253521127,
"grad_norm": 0.11234696209430695,
"learning_rate": 9.471174859070461e-05,
"loss": 0.5185,
"step": 473
},
{
"epoch": 2.1352112676056336,
"grad_norm": 0.12021481990814209,
"learning_rate": 9.434741796403282e-05,
"loss": 0.5727,
"step": 474
},
{
"epoch": 2.139718309859155,
"grad_norm": 0.11688639968633652,
"learning_rate": 9.398316259313637e-05,
"loss": 0.5302,
"step": 475
},
{
"epoch": 2.144225352112676,
"grad_norm": 0.1164378821849823,
"learning_rate": 9.361898732753716e-05,
"loss": 0.5156,
"step": 476
},
{
"epoch": 2.1487323943661973,
"grad_norm": 0.11934376507997513,
"learning_rate": 9.325489701569065e-05,
"loss": 0.5397,
"step": 477
},
{
"epoch": 2.1532394366197183,
"grad_norm": 0.1212429478764534,
"learning_rate": 9.289089650492118e-05,
"loss": 0.5451,
"step": 478
},
{
"epoch": 2.1577464788732392,
"grad_norm": 0.1096879169344902,
"learning_rate": 9.252699064135758e-05,
"loss": 0.545,
"step": 479
},
{
"epoch": 2.1622535211267606,
"grad_norm": 0.11909584701061249,
"learning_rate": 9.216318426986856e-05,
"loss": 0.5305,
"step": 480
},
{
"epoch": 2.1667605633802816,
"grad_norm": 0.12082231044769287,
"learning_rate": 9.179948223399828e-05,
"loss": 0.5121,
"step": 481
},
{
"epoch": 2.171267605633803,
"grad_norm": 0.11738288402557373,
"learning_rate": 9.143588937590178e-05,
"loss": 0.5111,
"step": 482
},
{
"epoch": 2.175774647887324,
"grad_norm": 0.1155785471200943,
"learning_rate": 9.107241053628059e-05,
"loss": 0.5558,
"step": 483
},
{
"epoch": 2.1802816901408453,
"grad_norm": 0.11050604283809662,
"learning_rate": 9.070905055431822e-05,
"loss": 0.505,
"step": 484
},
{
"epoch": 2.184788732394366,
"grad_norm": 0.11163124442100525,
"learning_rate": 9.034581426761581e-05,
"loss": 0.5204,
"step": 485
},
{
"epoch": 2.189295774647887,
"grad_norm": 0.11405828595161438,
"learning_rate": 8.998270651212764e-05,
"loss": 0.5267,
"step": 486
},
{
"epoch": 2.1938028169014085,
"grad_norm": 0.10824751108884811,
"learning_rate": 8.961973212209684e-05,
"loss": 0.5363,
"step": 487
},
{
"epoch": 2.1983098591549295,
"grad_norm": 0.120577372610569,
"learning_rate": 8.925689592999088e-05,
"loss": 0.5438,
"step": 488
},
{
"epoch": 2.202816901408451,
"grad_norm": 0.12007007002830505,
"learning_rate": 8.889420276643746e-05,
"loss": 0.5179,
"step": 489
},
{
"epoch": 2.207323943661972,
"grad_norm": 0.115611732006073,
"learning_rate": 8.853165746015997e-05,
"loss": 0.544,
"step": 490
},
{
"epoch": 2.2118309859154928,
"grad_norm": 0.1210789903998375,
"learning_rate": 8.816926483791331e-05,
"loss": 0.528,
"step": 491
},
{
"epoch": 2.216338028169014,
"grad_norm": 0.13798987865447998,
"learning_rate": 8.780702972441964e-05,
"loss": 0.5168,
"step": 492
},
{
"epoch": 2.220845070422535,
"grad_norm": 0.11814180761575699,
"learning_rate": 8.744495694230412e-05,
"loss": 0.5351,
"step": 493
},
{
"epoch": 2.2253521126760565,
"grad_norm": 0.10845314711332321,
"learning_rate": 8.708305131203072e-05,
"loss": 0.5411,
"step": 494
},
{
"epoch": 2.2298591549295774,
"grad_norm": 0.1123770996928215,
"learning_rate": 8.672131765183799e-05,
"loss": 0.5128,
"step": 495
},
{
"epoch": 2.234366197183099,
"grad_norm": 0.1154399961233139,
"learning_rate": 8.635976077767499e-05,
"loss": 0.5264,
"step": 496
},
{
"epoch": 2.2388732394366198,
"grad_norm": 0.1050579845905304,
"learning_rate": 8.599838550313713e-05,
"loss": 0.5429,
"step": 497
},
{
"epoch": 2.2433802816901407,
"grad_norm": 0.11692402511835098,
"learning_rate": 8.563719663940205e-05,
"loss": 0.5044,
"step": 498
},
{
"epoch": 2.247887323943662,
"grad_norm": 0.11380699276924133,
"learning_rate": 8.527619899516567e-05,
"loss": 0.5159,
"step": 499
},
{
"epoch": 2.252394366197183,
"grad_norm": 0.1111583560705185,
"learning_rate": 8.491539737657802e-05,
"loss": 0.5434,
"step": 500
},
{
"epoch": 2.2569014084507044,
"grad_norm": 0.14941135048866272,
"learning_rate": 8.455479658717947e-05,
"loss": 0.5846,
"step": 501
},
{
"epoch": 2.2614084507042254,
"grad_norm": 0.11948571354150772,
"learning_rate": 8.419440142783653e-05,
"loss": 0.5286,
"step": 502
},
{
"epoch": 2.2659154929577463,
"grad_norm": 0.12659218907356262,
"learning_rate": 8.383421669667812e-05,
"loss": 0.509,
"step": 503
},
{
"epoch": 2.2704225352112677,
"grad_norm": 0.11504048109054565,
"learning_rate": 8.347424718903151e-05,
"loss": 0.5394,
"step": 504
},
{
"epoch": 2.2749295774647886,
"grad_norm": 0.11881807446479797,
"learning_rate": 8.311449769735873e-05,
"loss": 0.5307,
"step": 505
},
{
"epoch": 2.27943661971831,
"grad_norm": 0.13386914134025574,
"learning_rate": 8.275497301119253e-05,
"loss": 0.5274,
"step": 506
},
{
"epoch": 2.283943661971831,
"grad_norm": 0.15793730318546295,
"learning_rate": 8.239567791707272e-05,
"loss": 0.5575,
"step": 507
},
{
"epoch": 2.288450704225352,
"grad_norm": 0.11447100341320038,
"learning_rate": 8.203661719848248e-05,
"loss": 0.5266,
"step": 508
},
{
"epoch": 2.2929577464788733,
"grad_norm": 0.1401495337486267,
"learning_rate": 8.167779563578456e-05,
"loss": 0.5321,
"step": 509
},
{
"epoch": 2.2974647887323942,
"grad_norm": 0.11609887331724167,
"learning_rate": 8.131921800615773e-05,
"loss": 0.5176,
"step": 510
},
{
"epoch": 2.3019718309859156,
"grad_norm": 0.11702638119459152,
"learning_rate": 8.096088908353315e-05,
"loss": 0.5196,
"step": 511
},
{
"epoch": 2.3064788732394366,
"grad_norm": 0.1231730580329895,
"learning_rate": 8.060281363853087e-05,
"loss": 0.5367,
"step": 512
},
{
"epoch": 2.3109859154929575,
"grad_norm": 0.12909771502017975,
"learning_rate": 8.024499643839618e-05,
"loss": 0.543,
"step": 513
},
{
"epoch": 2.315492957746479,
"grad_norm": 0.12570002675056458,
"learning_rate": 7.988744224693625e-05,
"loss": 0.5349,
"step": 514
},
{
"epoch": 2.32,
"grad_norm": 0.11345330625772476,
"learning_rate": 7.953015582445667e-05,
"loss": 0.5566,
"step": 515
},
{
"epoch": 2.3245070422535212,
"grad_norm": 0.1203540787100792,
"learning_rate": 7.917314192769808e-05,
"loss": 0.5455,
"step": 516
},
{
"epoch": 2.329014084507042,
"grad_norm": 0.12087874859571457,
"learning_rate": 7.881640530977288e-05,
"loss": 0.5499,
"step": 517
},
{
"epoch": 2.3335211267605636,
"grad_norm": 0.11830297857522964,
"learning_rate": 7.845995072010188e-05,
"loss": 0.5653,
"step": 518
},
{
"epoch": 2.3380281690140845,
"grad_norm": 0.11518878489732742,
"learning_rate": 7.810378290435108e-05,
"loss": 0.5202,
"step": 519
},
{
"epoch": 2.3425352112676054,
"grad_norm": 0.1460411250591278,
"learning_rate": 7.774790660436858e-05,
"loss": 0.5443,
"step": 520
},
{
"epoch": 2.347042253521127,
"grad_norm": 0.13443294167518616,
"learning_rate": 7.739232655812128e-05,
"loss": 0.5673,
"step": 521
},
{
"epoch": 2.3515492957746478,
"grad_norm": 0.10706333070993423,
"learning_rate": 7.703704749963201e-05,
"loss": 0.5351,
"step": 522
},
{
"epoch": 2.356056338028169,
"grad_norm": 0.1250416338443756,
"learning_rate": 7.668207415891624e-05,
"loss": 0.5491,
"step": 523
},
{
"epoch": 2.36056338028169,
"grad_norm": 0.24670056998729706,
"learning_rate": 7.632741126191947e-05,
"loss": 0.5248,
"step": 524
},
{
"epoch": 2.3650704225352115,
"grad_norm": 0.1117803156375885,
"learning_rate": 7.597306353045393e-05,
"loss": 0.5701,
"step": 525
},
{
"epoch": 2.3695774647887324,
"grad_norm": 0.10440925508737564,
"learning_rate": 7.561903568213595e-05,
"loss": 0.5028,
"step": 526
},
{
"epoch": 2.3740845070422534,
"grad_norm": 0.11765775829553604,
"learning_rate": 7.526533243032307e-05,
"loss": 0.5229,
"step": 527
},
{
"epoch": 2.3785915492957748,
"grad_norm": 0.11378080397844315,
"learning_rate": 7.491195848405135e-05,
"loss": 0.528,
"step": 528
},
{
"epoch": 2.3830985915492957,
"grad_norm": 0.12597879767417908,
"learning_rate": 7.455891854797256e-05,
"loss": 0.5027,
"step": 529
},
{
"epoch": 2.387605633802817,
"grad_norm": 0.13118213415145874,
"learning_rate": 7.420621732229169e-05,
"loss": 0.5186,
"step": 530
},
{
"epoch": 2.392112676056338,
"grad_norm": 0.11117777973413467,
"learning_rate": 7.385385950270425e-05,
"loss": 0.5207,
"step": 531
},
{
"epoch": 2.396619718309859,
"grad_norm": 0.1138320192694664,
"learning_rate": 7.350184978033386e-05,
"loss": 0.5089,
"step": 532
},
{
"epoch": 2.4011267605633804,
"grad_norm": 0.11337253451347351,
"learning_rate": 7.315019284166966e-05,
"loss": 0.5165,
"step": 533
},
{
"epoch": 2.4056338028169013,
"grad_norm": 0.10323511809110641,
"learning_rate": 7.279889336850408e-05,
"loss": 0.5154,
"step": 534
},
{
"epoch": 2.4101408450704227,
"grad_norm": 0.11857807636260986,
"learning_rate": 7.244795603787036e-05,
"loss": 0.509,
"step": 535
},
{
"epoch": 2.4146478873239436,
"grad_norm": 0.1258813738822937,
"learning_rate": 7.209738552198043e-05,
"loss": 0.5246,
"step": 536
},
{
"epoch": 2.4191549295774646,
"grad_norm": 0.11479494720697403,
"learning_rate": 7.17471864881625e-05,
"loss": 0.5376,
"step": 537
},
{
"epoch": 2.423661971830986,
"grad_norm": 0.10941333323717117,
"learning_rate": 7.139736359879916e-05,
"loss": 0.5198,
"step": 538
},
{
"epoch": 2.428169014084507,
"grad_norm": 0.1188211664557457,
"learning_rate": 7.104792151126515e-05,
"loss": 0.5174,
"step": 539
},
{
"epoch": 2.4326760563380283,
"grad_norm": 0.12148640304803848,
"learning_rate": 7.069886487786536e-05,
"loss": 0.5372,
"step": 540
},
{
"epoch": 2.4371830985915492,
"grad_norm": 0.10996753722429276,
"learning_rate": 7.035019834577301e-05,
"loss": 0.5211,
"step": 541
},
{
"epoch": 2.44169014084507,
"grad_norm": 0.1371939480304718,
"learning_rate": 7.00019265569676e-05,
"loss": 0.5551,
"step": 542
},
{
"epoch": 2.4461971830985916,
"grad_norm": 0.1196320503950119,
"learning_rate": 6.96540541481733e-05,
"loss": 0.5357,
"step": 543
},
{
"epoch": 2.4507042253521125,
"grad_norm": 0.10747256129980087,
"learning_rate": 6.930658575079705e-05,
"loss": 0.5368,
"step": 544
},
{
"epoch": 2.455211267605634,
"grad_norm": 0.11347094923257828,
"learning_rate": 6.8959525990867e-05,
"loss": 0.5174,
"step": 545
},
{
"epoch": 2.459718309859155,
"grad_norm": 0.12358327209949493,
"learning_rate": 6.861287948897091e-05,
"loss": 0.5529,
"step": 546
},
{
"epoch": 2.4642253521126762,
"grad_norm": 0.12316222488880157,
"learning_rate": 6.826665086019466e-05,
"loss": 0.5625,
"step": 547
},
{
"epoch": 2.468732394366197,
"grad_norm": 0.11993323266506195,
"learning_rate": 6.792084471406069e-05,
"loss": 0.5229,
"step": 548
},
{
"epoch": 2.473239436619718,
"grad_norm": 0.11845649033784866,
"learning_rate": 6.75754656544667e-05,
"loss": 0.5136,
"step": 549
},
{
"epoch": 2.4777464788732395,
"grad_norm": 0.11402633786201477,
"learning_rate": 6.723051827962445e-05,
"loss": 0.5516,
"step": 550
},
{
"epoch": 2.4822535211267605,
"grad_norm": 0.10994803160429001,
"learning_rate": 6.68860071819984e-05,
"loss": 0.5077,
"step": 551
},
{
"epoch": 2.486760563380282,
"grad_norm": 0.12263881415128708,
"learning_rate": 6.654193694824462e-05,
"loss": 0.548,
"step": 552
},
{
"epoch": 2.491267605633803,
"grad_norm": 0.12687471508979797,
"learning_rate": 6.619831215914974e-05,
"loss": 0.5835,
"step": 553
},
{
"epoch": 2.495774647887324,
"grad_norm": 0.11021614074707031,
"learning_rate": 6.585513738956996e-05,
"loss": 0.5442,
"step": 554
},
{
"epoch": 2.500281690140845,
"grad_norm": 0.12376144528388977,
"learning_rate": 6.551241720837014e-05,
"loss": 0.5038,
"step": 555
},
{
"epoch": 2.504788732394366,
"grad_norm": 0.12698741257190704,
"learning_rate": 6.517015617836291e-05,
"loss": 0.5143,
"step": 556
},
{
"epoch": 2.5092957746478874,
"grad_norm": 0.1203697919845581,
"learning_rate": 6.48283588562481e-05,
"loss": 0.5443,
"step": 557
},
{
"epoch": 2.5138028169014084,
"grad_norm": 0.10425330698490143,
"learning_rate": 6.448702979255176e-05,
"loss": 0.4859,
"step": 558
},
{
"epoch": 2.5183098591549298,
"grad_norm": 0.11607982218265533,
"learning_rate": 6.414617353156605e-05,
"loss": 0.5198,
"step": 559
},
{
"epoch": 2.5228169014084507,
"grad_norm": 0.1305762082338333,
"learning_rate": 6.380579461128819e-05,
"loss": 0.5886,
"step": 560
},
{
"epoch": 2.5273239436619717,
"grad_norm": 0.11528196185827255,
"learning_rate": 6.34658975633605e-05,
"loss": 0.5168,
"step": 561
},
{
"epoch": 2.531830985915493,
"grad_norm": 0.12088954448699951,
"learning_rate": 6.312648691300975e-05,
"loss": 0.5301,
"step": 562
},
{
"epoch": 2.536338028169014,
"grad_norm": 0.11259355396032333,
"learning_rate": 6.278756717898713e-05,
"loss": 0.5079,
"step": 563
},
{
"epoch": 2.5408450704225354,
"grad_norm": 0.10837467759847641,
"learning_rate": 6.2449142873508e-05,
"loss": 0.5081,
"step": 564
},
{
"epoch": 2.5453521126760563,
"grad_norm": 0.11548671871423721,
"learning_rate": 6.211121850219175e-05,
"loss": 0.5236,
"step": 565
},
{
"epoch": 2.5498591549295773,
"grad_norm": 0.11041808128356934,
"learning_rate": 6.1773798564002e-05,
"loss": 0.5309,
"step": 566
},
{
"epoch": 2.5543661971830987,
"grad_norm": 0.13341392576694489,
"learning_rate": 6.143688755118646e-05,
"loss": 0.5239,
"step": 567
},
{
"epoch": 2.5588732394366196,
"grad_norm": 0.11479108780622482,
"learning_rate": 6.110048994921734e-05,
"loss": 0.5088,
"step": 568
},
{
"epoch": 2.563380281690141,
"grad_norm": 0.10170383006334305,
"learning_rate": 6.0764610236731524e-05,
"loss": 0.5027,
"step": 569
},
{
"epoch": 2.567887323943662,
"grad_norm": 0.1148676872253418,
"learning_rate": 6.042925288547092e-05,
"loss": 0.5454,
"step": 570
},
{
"epoch": 2.572394366197183,
"grad_norm": 0.11497487127780914,
"learning_rate": 6.009442236022307e-05,
"loss": 0.5149,
"step": 571
},
{
"epoch": 2.5769014084507043,
"grad_norm": 0.11571796983480453,
"learning_rate": 5.9760123118761514e-05,
"loss": 0.5188,
"step": 572
},
{
"epoch": 2.581408450704225,
"grad_norm": 0.10864008218050003,
"learning_rate": 5.9426359611786573e-05,
"loss": 0.5249,
"step": 573
},
{
"epoch": 2.5859154929577466,
"grad_norm": 0.11008573323488235,
"learning_rate": 5.909313628286601e-05,
"loss": 0.5418,
"step": 574
},
{
"epoch": 2.5904225352112675,
"grad_norm": 0.11387594044208527,
"learning_rate": 5.8760457568375984e-05,
"loss": 0.5342,
"step": 575
},
{
"epoch": 2.5949295774647885,
"grad_norm": 0.12866191565990448,
"learning_rate": 5.842832789744186e-05,
"loss": 0.5747,
"step": 576
},
{
"epoch": 2.59943661971831,
"grad_norm": 0.11581499129533768,
"learning_rate": 5.8096751691879356e-05,
"loss": 0.5323,
"step": 577
},
{
"epoch": 2.6039436619718312,
"grad_norm": 0.13145901262760162,
"learning_rate": 5.776573336613553e-05,
"loss": 0.5125,
"step": 578
},
{
"epoch": 2.608450704225352,
"grad_norm": 0.11746399849653244,
"learning_rate": 5.7435277327230206e-05,
"loss": 0.54,
"step": 579
},
{
"epoch": 2.612957746478873,
"grad_norm": 0.10810490697622299,
"learning_rate": 5.7105387974697063e-05,
"loss": 0.5299,
"step": 580
},
{
"epoch": 2.6174647887323945,
"grad_norm": 0.11249610036611557,
"learning_rate": 5.677606970052529e-05,
"loss": 0.5365,
"step": 581
},
{
"epoch": 2.6219718309859155,
"grad_norm": 0.10808350145816803,
"learning_rate": 5.6447326889101e-05,
"loss": 0.4997,
"step": 582
},
{
"epoch": 2.626478873239437,
"grad_norm": 0.11670640110969543,
"learning_rate": 5.6119163917148866e-05,
"loss": 0.5127,
"step": 583
},
{
"epoch": 2.630985915492958,
"grad_norm": 0.10403117537498474,
"learning_rate": 5.5791585153673774e-05,
"loss": 0.4984,
"step": 584
},
{
"epoch": 2.6354929577464787,
"grad_norm": 0.11643310636281967,
"learning_rate": 5.546459495990288e-05,
"loss": 0.5363,
"step": 585
},
{
"epoch": 2.64,
"grad_norm": 0.11666381359100342,
"learning_rate": 5.513819768922723e-05,
"loss": 0.5462,
"step": 586
},
{
"epoch": 2.644507042253521,
"grad_norm": 0.1106523647904396,
"learning_rate": 5.481239768714417e-05,
"loss": 0.5234,
"step": 587
},
{
"epoch": 2.6490140845070425,
"grad_norm": 0.11416450887918472,
"learning_rate": 5.448719929119915e-05,
"loss": 0.5158,
"step": 588
},
{
"epoch": 2.6535211267605634,
"grad_norm": 0.11326763778924942,
"learning_rate": 5.416260683092814e-05,
"loss": 0.5457,
"step": 589
},
{
"epoch": 2.6580281690140843,
"grad_norm": 0.10651212185621262,
"learning_rate": 5.3838624627800074e-05,
"loss": 0.5281,
"step": 590
},
{
"epoch": 2.6625352112676057,
"grad_norm": 0.11655013263225555,
"learning_rate": 5.351525699515908e-05,
"loss": 0.5355,
"step": 591
},
{
"epoch": 2.6670422535211267,
"grad_norm": 0.10108935087919235,
"learning_rate": 5.319250823816731e-05,
"loss": 0.509,
"step": 592
},
{
"epoch": 2.671549295774648,
"grad_norm": 0.10345329344272614,
"learning_rate": 5.287038265374735e-05,
"loss": 0.5093,
"step": 593
},
{
"epoch": 2.676056338028169,
"grad_norm": 0.11032966524362564,
"learning_rate": 5.25488845305254e-05,
"loss": 0.5083,
"step": 594
},
{
"epoch": 2.68056338028169,
"grad_norm": 0.11705298721790314,
"learning_rate": 5.222801814877369e-05,
"loss": 0.5108,
"step": 595
},
{
"epoch": 2.6850704225352113,
"grad_norm": 0.11696744710206985,
"learning_rate": 5.190778778035395e-05,
"loss": 0.5118,
"step": 596
},
{
"epoch": 2.6895774647887323,
"grad_norm": 0.10620059072971344,
"learning_rate": 5.158819768866012e-05,
"loss": 0.5279,
"step": 597
},
{
"epoch": 2.6940845070422537,
"grad_norm": 0.11230248957872391,
"learning_rate": 5.126925212856202e-05,
"loss": 0.5038,
"step": 598
},
{
"epoch": 2.6985915492957746,
"grad_norm": 0.11875911056995392,
"learning_rate": 5.0950955346348314e-05,
"loss": 0.5513,
"step": 599
},
{
"epoch": 2.7030985915492955,
"grad_norm": 0.11459613591432571,
"learning_rate": 5.0633311579670296e-05,
"loss": 0.5606,
"step": 600
},
{
"epoch": 2.707605633802817,
"grad_norm": 0.12176425009965897,
"learning_rate": 5.031632505748516e-05,
"loss": 0.5262,
"step": 601
},
{
"epoch": 2.712112676056338,
"grad_norm": 0.12691204249858856,
"learning_rate": 5.000000000000002e-05,
"loss": 0.5246,
"step": 602
},
{
"epoch": 2.7166197183098593,
"grad_norm": 0.09963858127593994,
"learning_rate": 4.968434061861543e-05,
"loss": 0.4977,
"step": 603
},
{
"epoch": 2.72112676056338,
"grad_norm": 0.10625036060810089,
"learning_rate": 4.9369351115869535e-05,
"loss": 0.512,
"step": 604
},
{
"epoch": 2.725633802816901,
"grad_norm": 0.11431818455457687,
"learning_rate": 4.9055035685382055e-05,
"loss": 0.5308,
"step": 605
},
{
"epoch": 2.7301408450704225,
"grad_norm": 0.10542725026607513,
"learning_rate": 4.874139851179833e-05,
"loss": 0.5301,
"step": 606
},
{
"epoch": 2.734647887323944,
"grad_norm": 0.10275546461343765,
"learning_rate": 4.8428443770733835e-05,
"loss": 0.5209,
"step": 607
},
{
"epoch": 2.739154929577465,
"grad_norm": 0.38522496819496155,
"learning_rate": 4.811617562871832e-05,
"loss": 0.5319,
"step": 608
},
{
"epoch": 2.743661971830986,
"grad_norm": 0.10014399141073227,
"learning_rate": 4.7804598243140666e-05,
"loss": 0.5151,
"step": 609
},
{
"epoch": 2.748169014084507,
"grad_norm": 0.11353477835655212,
"learning_rate": 4.749371576219317e-05,
"loss": 0.5246,
"step": 610
},
{
"epoch": 2.752676056338028,
"grad_norm": 0.10583645105361938,
"learning_rate": 4.718353232481665e-05,
"loss": 0.5132,
"step": 611
},
{
"epoch": 2.7571830985915495,
"grad_norm": 0.12215419858694077,
"learning_rate": 4.6874052060645066e-05,
"loss": 0.5331,
"step": 612
},
{
"epoch": 2.7616901408450705,
"grad_norm": 0.11868246644735336,
"learning_rate": 4.6565279089950795e-05,
"loss": 0.5023,
"step": 613
},
{
"epoch": 2.7661971830985914,
"grad_norm": 0.12116070836782455,
"learning_rate": 4.62572175235895e-05,
"loss": 0.5461,
"step": 614
},
{
"epoch": 2.770704225352113,
"grad_norm": 0.11347726732492447,
"learning_rate": 4.5949871462945714e-05,
"loss": 0.5095,
"step": 615
},
{
"epoch": 2.7752112676056337,
"grad_norm": 0.10805897414684296,
"learning_rate": 4.56432449998779e-05,
"loss": 0.5069,
"step": 616
},
{
"epoch": 2.779718309859155,
"grad_norm": 0.12143836170434952,
"learning_rate": 4.533734221666425e-05,
"loss": 0.5361,
"step": 617
},
{
"epoch": 2.784225352112676,
"grad_norm": 0.12267526239156723,
"learning_rate": 4.503216718594825e-05,
"loss": 0.5325,
"step": 618
},
{
"epoch": 2.788732394366197,
"grad_norm": 0.1219489648938179,
"learning_rate": 4.472772397068431e-05,
"loss": 0.5408,
"step": 619
},
{
"epoch": 2.7932394366197184,
"grad_norm": 0.11085795611143112,
"learning_rate": 4.442401662408395e-05,
"loss": 0.5221,
"step": 620
},
{
"epoch": 2.7977464788732394,
"grad_norm": 0.11736064404249191,
"learning_rate": 4.4121049189561556e-05,
"loss": 0.5367,
"step": 621
},
{
"epoch": 2.8022535211267607,
"grad_norm": 0.10407859832048416,
"learning_rate": 4.381882570068079e-05,
"loss": 0.4977,
"step": 622
},
{
"epoch": 2.8067605633802817,
"grad_norm": 0.10226607322692871,
"learning_rate": 4.351735018110066e-05,
"loss": 0.4931,
"step": 623
},
{
"epoch": 2.8112676056338026,
"grad_norm": 0.10844944417476654,
"learning_rate": 4.321662664452221e-05,
"loss": 0.5228,
"step": 624
},
{
"epoch": 2.815774647887324,
"grad_norm": 0.1089576929807663,
"learning_rate": 4.291665909463477e-05,
"loss": 0.5316,
"step": 625
},
{
"epoch": 2.820281690140845,
"grad_norm": 0.11149588227272034,
"learning_rate": 4.2617451525063014e-05,
"loss": 0.5147,
"step": 626
},
{
"epoch": 2.8247887323943663,
"grad_norm": 0.12150777131319046,
"learning_rate": 4.231900791931341e-05,
"loss": 0.5386,
"step": 627
},
{
"epoch": 2.8292957746478873,
"grad_norm": 0.11250042915344238,
"learning_rate": 4.202133225072153e-05,
"loss": 0.5494,
"step": 628
},
{
"epoch": 2.8338028169014082,
"grad_norm": 0.11534137278795242,
"learning_rate": 4.1724428482398945e-05,
"loss": 0.5483,
"step": 629
},
{
"epoch": 2.8383098591549296,
"grad_norm": 0.10241974890232086,
"learning_rate": 4.142830056718052e-05,
"loss": 0.5052,
"step": 630
},
{
"epoch": 2.8428169014084506,
"grad_norm": 0.11854840815067291,
"learning_rate": 4.113295244757172e-05,
"loss": 0.5385,
"step": 631
},
{
"epoch": 2.847323943661972,
"grad_norm": 0.11492074280977249,
"learning_rate": 4.08383880556963e-05,
"loss": 0.4985,
"step": 632
},
{
"epoch": 2.851830985915493,
"grad_norm": 0.10175682604312897,
"learning_rate": 4.054461131324373e-05,
"loss": 0.5064,
"step": 633
},
{
"epoch": 2.856338028169014,
"grad_norm": 0.11565281450748444,
"learning_rate": 4.025162613141713e-05,
"loss": 0.5084,
"step": 634
},
{
"epoch": 2.860845070422535,
"grad_norm": 0.10842733085155487,
"learning_rate": 3.995943641088121e-05,
"loss": 0.5128,
"step": 635
},
{
"epoch": 2.8653521126760566,
"grad_norm": 0.11573941260576248,
"learning_rate": 3.966804604171019e-05,
"loss": 0.5279,
"step": 636
},
{
"epoch": 2.8698591549295775,
"grad_norm": 0.11557252705097198,
"learning_rate": 3.937745890333623e-05,
"loss": 0.5174,
"step": 637
},
{
"epoch": 2.8743661971830985,
"grad_norm": 0.1108415350317955,
"learning_rate": 3.908767886449752e-05,
"loss": 0.5161,
"step": 638
},
{
"epoch": 2.87887323943662,
"grad_norm": 0.1033615991473198,
"learning_rate": 3.8798709783187036e-05,
"loss": 0.529,
"step": 639
},
{
"epoch": 2.883380281690141,
"grad_norm": 0.14297285676002502,
"learning_rate": 3.8510555506600974e-05,
"loss": 0.5096,
"step": 640
},
{
"epoch": 2.887887323943662,
"grad_norm": 0.1152140274643898,
"learning_rate": 3.82232198710877e-05,
"loss": 0.5361,
"step": 641
},
{
"epoch": 2.892394366197183,
"grad_norm": 0.11221909523010254,
"learning_rate": 3.793670670209645e-05,
"loss": 0.542,
"step": 642
},
{
"epoch": 2.896901408450704,
"grad_norm": 0.11155454814434052,
"learning_rate": 3.7651019814126654e-05,
"loss": 0.5383,
"step": 643
},
{
"epoch": 2.9014084507042255,
"grad_norm": 0.11201007664203644,
"learning_rate": 3.736616301067694e-05,
"loss": 0.5295,
"step": 644
},
{
"epoch": 2.9059154929577464,
"grad_norm": 0.09936228394508362,
"learning_rate": 3.708214008419467e-05,
"loss": 0.5093,
"step": 645
},
{
"epoch": 2.910422535211268,
"grad_norm": 0.11197184026241302,
"learning_rate": 3.6798954816025286e-05,
"loss": 0.5387,
"step": 646
},
{
"epoch": 2.9149295774647888,
"grad_norm": 0.11715072393417358,
"learning_rate": 3.6516610976362155e-05,
"loss": 0.5487,
"step": 647
},
{
"epoch": 2.9194366197183097,
"grad_norm": 0.10473347455263138,
"learning_rate": 3.623511232419612e-05,
"loss": 0.5017,
"step": 648
},
{
"epoch": 2.923943661971831,
"grad_norm": 0.10394609719514847,
"learning_rate": 3.595446260726576e-05,
"loss": 0.5091,
"step": 649
},
{
"epoch": 2.928450704225352,
"grad_norm": 0.10504930466413498,
"learning_rate": 3.5674665562007184e-05,
"loss": 0.5299,
"step": 650
},
{
"epoch": 2.9329577464788734,
"grad_norm": 0.11549410969018936,
"learning_rate": 3.5395724913504545e-05,
"loss": 0.5435,
"step": 651
},
{
"epoch": 2.9374647887323944,
"grad_norm": 0.12606476247310638,
"learning_rate": 3.511764437544036e-05,
"loss": 0.5115,
"step": 652
},
{
"epoch": 2.9419718309859153,
"grad_norm": 0.10190090537071228,
"learning_rate": 3.48404276500459e-05,
"loss": 0.499,
"step": 653
},
{
"epoch": 2.9464788732394367,
"grad_norm": 0.11330178380012512,
"learning_rate": 3.456407842805223e-05,
"loss": 0.5286,
"step": 654
},
{
"epoch": 2.9509859154929576,
"grad_norm": 0.10447689890861511,
"learning_rate": 3.4288600388640714e-05,
"loss": 0.4856,
"step": 655
},
{
"epoch": 2.955492957746479,
"grad_norm": 0.11650539189577103,
"learning_rate": 3.4013997199394376e-05,
"loss": 0.5344,
"step": 656
},
{
"epoch": 2.96,
"grad_norm": 0.1107359305024147,
"learning_rate": 3.3740272516248736e-05,
"loss": 0.5121,
"step": 657
},
{
"epoch": 2.964507042253521,
"grad_norm": 0.11095371842384338,
"learning_rate": 3.346742998344348e-05,
"loss": 0.5136,
"step": 658
},
{
"epoch": 2.9690140845070423,
"grad_norm": 0.10957096517086029,
"learning_rate": 3.3195473233473584e-05,
"loss": 0.5263,
"step": 659
},
{
"epoch": 2.9735211267605632,
"grad_norm": 0.11277777701616287,
"learning_rate": 3.292440588704131e-05,
"loss": 0.5472,
"step": 660
},
{
"epoch": 2.9780281690140846,
"grad_norm": 0.10114355385303497,
"learning_rate": 3.265423155300767e-05,
"loss": 0.5272,
"step": 661
},
{
"epoch": 2.9825352112676056,
"grad_norm": 0.11590156704187393,
"learning_rate": 3.238495382834464e-05,
"loss": 0.521,
"step": 662
},
{
"epoch": 2.9870422535211265,
"grad_norm": 0.12712036073207855,
"learning_rate": 3.211657629808712e-05,
"loss": 0.5632,
"step": 663
},
{
"epoch": 2.991549295774648,
"grad_norm": 0.11136014014482498,
"learning_rate": 3.184910253528528e-05,
"loss": 0.5139,
"step": 664
},
{
"epoch": 2.9960563380281693,
"grad_norm": 0.10489501059055328,
"learning_rate": 3.158253610095697e-05,
"loss": 0.5046,
"step": 665
},
{
"epoch": 3.0,
"grad_norm": 0.1281147003173828,
"learning_rate": 3.1316880544040226e-05,
"loss": 0.5102,
"step": 666
},
{
"epoch": 3.004507042253521,
"grad_norm": 0.1174585297703743,
"learning_rate": 3.105213940134621e-05,
"loss": 0.5348,
"step": 667
},
{
"epoch": 3.0090140845070423,
"grad_norm": 0.11154928803443909,
"learning_rate": 3.07883161975119e-05,
"loss": 0.5283,
"step": 668
},
{
"epoch": 3.0135211267605633,
"grad_norm": 0.10453861951828003,
"learning_rate": 3.0525414444953396e-05,
"loss": 0.4933,
"step": 669
},
{
"epoch": 3.0180281690140847,
"grad_norm": 0.11380238831043243,
"learning_rate": 3.026343764381887e-05,
"loss": 0.5073,
"step": 670
},
{
"epoch": 3.0225352112676056,
"grad_norm": 0.0978739783167839,
"learning_rate": 3.0002389281942332e-05,
"loss": 0.5098,
"step": 671
},
{
"epoch": 3.0270422535211265,
"grad_norm": 0.11205834150314331,
"learning_rate": 2.9742272834796813e-05,
"loss": 0.55,
"step": 672
},
{
"epoch": 3.031549295774648,
"grad_norm": 0.11361253261566162,
"learning_rate": 2.9483091765448422e-05,
"loss": 0.5519,
"step": 673
},
{
"epoch": 3.036056338028169,
"grad_norm": 0.1036975234746933,
"learning_rate": 2.9224849524509936e-05,
"loss": 0.5319,
"step": 674
},
{
"epoch": 3.0405633802816903,
"grad_norm": 0.10117993503808975,
"learning_rate": 2.896754955009524e-05,
"loss": 0.5152,
"step": 675
},
{
"epoch": 3.045070422535211,
"grad_norm": 0.114353246986866,
"learning_rate": 2.871119526777315e-05,
"loss": 0.5187,
"step": 676
},
{
"epoch": 3.0495774647887326,
"grad_norm": 0.10379718244075775,
"learning_rate": 2.8455790090522017e-05,
"loss": 0.5105,
"step": 677
},
{
"epoch": 3.0540845070422535,
"grad_norm": 0.10687281936407089,
"learning_rate": 2.820133741868434e-05,
"loss": 0.502,
"step": 678
},
{
"epoch": 3.0585915492957745,
"grad_norm": 0.12042738497257233,
"learning_rate": 2.794784063992131e-05,
"loss": 0.5081,
"step": 679
},
{
"epoch": 3.063098591549296,
"grad_norm": 0.11131054908037186,
"learning_rate": 2.7695303129167927e-05,
"loss": 0.5123,
"step": 680
},
{
"epoch": 3.067605633802817,
"grad_norm": 0.10670991986989975,
"learning_rate": 2.7443728248587852e-05,
"loss": 0.5246,
"step": 681
},
{
"epoch": 3.072112676056338,
"grad_norm": 0.11702380329370499,
"learning_rate": 2.719311934752884e-05,
"loss": 0.5109,
"step": 682
},
{
"epoch": 3.076619718309859,
"grad_norm": 0.10768495500087738,
"learning_rate": 2.6943479762477964e-05,
"loss": 0.5195,
"step": 683
},
{
"epoch": 3.08112676056338,
"grad_norm": 0.1184210330247879,
"learning_rate": 2.669481281701739e-05,
"loss": 0.5585,
"step": 684
},
{
"epoch": 3.0856338028169015,
"grad_norm": 0.10743115097284317,
"learning_rate": 2.6447121821779917e-05,
"loss": 0.5134,
"step": 685
},
{
"epoch": 3.0901408450704224,
"grad_norm": 0.10095122456550598,
"learning_rate": 2.620041007440508e-05,
"loss": 0.5154,
"step": 686
},
{
"epoch": 3.094647887323944,
"grad_norm": 0.11585785448551178,
"learning_rate": 2.5954680859495163e-05,
"loss": 0.4975,
"step": 687
},
{
"epoch": 3.0991549295774647,
"grad_norm": 0.11215182393789291,
"learning_rate": 2.570993744857151e-05,
"loss": 0.5347,
"step": 688
},
{
"epoch": 3.103661971830986,
"grad_norm": 0.10896939039230347,
"learning_rate": 2.5466183100030837e-05,
"loss": 0.5013,
"step": 689
},
{
"epoch": 3.108169014084507,
"grad_norm": 0.11886619031429291,
"learning_rate": 2.5223421059102104e-05,
"loss": 0.5244,
"step": 690
},
{
"epoch": 3.112676056338028,
"grad_norm": 0.11181308329105377,
"learning_rate": 2.4981654557803026e-05,
"loss": 0.5391,
"step": 691
},
{
"epoch": 3.1171830985915494,
"grad_norm": 0.09808338433504105,
"learning_rate": 2.474088681489729e-05,
"loss": 0.525,
"step": 692
},
{
"epoch": 3.1216901408450703,
"grad_norm": 0.11004786193370819,
"learning_rate": 2.4501121035851492e-05,
"loss": 0.526,
"step": 693
},
{
"epoch": 3.1261971830985917,
"grad_norm": 0.10769461840391159,
"learning_rate": 2.426236041279266e-05,
"loss": 0.5611,
"step": 694
},
{
"epoch": 3.1307042253521127,
"grad_norm": 0.10455281287431717,
"learning_rate": 2.4024608124465585e-05,
"loss": 0.5384,
"step": 695
},
{
"epoch": 3.1352112676056336,
"grad_norm": 0.10149269551038742,
"learning_rate": 2.378786733619054e-05,
"loss": 0.5255,
"step": 696
},
{
"epoch": 3.139718309859155,
"grad_norm": 0.10262631624937057,
"learning_rate": 2.35521411998213e-05,
"loss": 0.5248,
"step": 697
},
{
"epoch": 3.144225352112676,
"grad_norm": 0.0997721254825592,
"learning_rate": 2.331743285370288e-05,
"loss": 0.5047,
"step": 698
},
{
"epoch": 3.1487323943661973,
"grad_norm": 0.09472807496786118,
"learning_rate": 2.3083745422630122e-05,
"loss": 0.5056,
"step": 699
},
{
"epoch": 3.1532394366197183,
"grad_norm": 0.10167699307203293,
"learning_rate": 2.2851082017805703e-05,
"loss": 0.5039,
"step": 700
},
{
"epoch": 3.1577464788732392,
"grad_norm": 0.11040908098220825,
"learning_rate": 2.2619445736799028e-05,
"loss": 0.5308,
"step": 701
},
{
"epoch": 3.1622535211267606,
"grad_norm": 0.1062508374452591,
"learning_rate": 2.238883966350479e-05,
"loss": 0.5251,
"step": 702
},
{
"epoch": 3.1667605633802816,
"grad_norm": 0.11194084584712982,
"learning_rate": 2.215926686810206e-05,
"loss": 0.5202,
"step": 703
},
{
"epoch": 3.171267605633803,
"grad_norm": 0.1002085953950882,
"learning_rate": 2.1930730407013245e-05,
"loss": 0.5095,
"step": 704
},
{
"epoch": 3.175774647887324,
"grad_norm": 0.09527210891246796,
"learning_rate": 2.1703233322863616e-05,
"loss": 0.5217,
"step": 705
},
{
"epoch": 3.1802816901408453,
"grad_norm": 0.10605484992265701,
"learning_rate": 2.1476778644440553e-05,
"loss": 0.5075,
"step": 706
},
{
"epoch": 3.184788732394366,
"grad_norm": 0.10591242462396622,
"learning_rate": 2.1251369386653454e-05,
"loss": 0.5129,
"step": 707
},
{
"epoch": 3.189295774647887,
"grad_norm": 0.11138251423835754,
"learning_rate": 2.1027008550493376e-05,
"loss": 0.5366,
"step": 708
},
{
"epoch": 3.1938028169014085,
"grad_norm": 0.10201424360275269,
"learning_rate": 2.0803699122993293e-05,
"loss": 0.5375,
"step": 709
},
{
"epoch": 3.1983098591549295,
"grad_norm": 0.10267283767461777,
"learning_rate": 2.0581444077188194e-05,
"loss": 0.5262,
"step": 710
},
{
"epoch": 3.202816901408451,
"grad_norm": 0.09566653519868851,
"learning_rate": 2.0360246372075466e-05,
"loss": 0.4986,
"step": 711
},
{
"epoch": 3.207323943661972,
"grad_norm": 0.09336459636688232,
"learning_rate": 2.0140108952575698e-05,
"loss": 0.4931,
"step": 712
},
{
"epoch": 3.2118309859154928,
"grad_norm": 0.10596734285354614,
"learning_rate": 1.9921034749493205e-05,
"loss": 0.5326,
"step": 713
},
{
"epoch": 3.216338028169014,
"grad_norm": 0.10336334258317947,
"learning_rate": 1.9703026679477256e-05,
"loss": 0.5073,
"step": 714
},
{
"epoch": 3.220845070422535,
"grad_norm": 0.0994623526930809,
"learning_rate": 1.9486087644983054e-05,
"loss": 0.5281,
"step": 715
},
{
"epoch": 3.2253521126760565,
"grad_norm": 0.09632834047079086,
"learning_rate": 1.9270220534233263e-05,
"loss": 0.5067,
"step": 716
},
{
"epoch": 3.2298591549295774,
"grad_norm": 0.10012217611074448,
"learning_rate": 1.9055428221179338e-05,
"loss": 0.5009,
"step": 717
},
{
"epoch": 3.234366197183099,
"grad_norm": 0.11541806161403656,
"learning_rate": 1.8841713565463548e-05,
"loss": 0.5082,
"step": 718
},
{
"epoch": 3.2388732394366198,
"grad_norm": 0.11000224947929382,
"learning_rate": 1.862907941238059e-05,
"loss": 0.4974,
"step": 719
},
{
"epoch": 3.2433802816901407,
"grad_norm": 0.09478365629911423,
"learning_rate": 1.8417528592840018e-05,
"loss": 0.4858,
"step": 720
},
{
"epoch": 3.247887323943662,
"grad_norm": 0.11213672161102295,
"learning_rate": 1.8207063923328237e-05,
"loss": 0.5708,
"step": 721
},
{
"epoch": 3.252394366197183,
"grad_norm": 0.10433446615934372,
"learning_rate": 1.799768820587132e-05,
"loss": 0.507,
"step": 722
},
{
"epoch": 3.2569014084507044,
"grad_norm": 0.10733353346586227,
"learning_rate": 1.77894042279975e-05,
"loss": 0.5225,
"step": 723
},
{
"epoch": 3.2614084507042254,
"grad_norm": 0.10116215795278549,
"learning_rate": 1.7582214762700054e-05,
"loss": 0.5272,
"step": 724
},
{
"epoch": 3.2659154929577463,
"grad_norm": 0.1019587591290474,
"learning_rate": 1.7376122568400532e-05,
"loss": 0.5085,
"step": 725
},
{
"epoch": 3.2704225352112677,
"grad_norm": 0.0982939749956131,
"learning_rate": 1.7171130388911848e-05,
"loss": 0.5176,
"step": 726
},
{
"epoch": 3.2749295774647886,
"grad_norm": 0.10578010976314545,
"learning_rate": 1.6967240953401954e-05,
"loss": 0.4992,
"step": 727
},
{
"epoch": 3.27943661971831,
"grad_norm": 0.09891770035028458,
"learning_rate": 1.676445697635728e-05,
"loss": 0.483,
"step": 728
},
{
"epoch": 3.283943661971831,
"grad_norm": 0.10481581091880798,
"learning_rate": 1.6562781157546835e-05,
"loss": 0.518,
"step": 729
},
{
"epoch": 3.288450704225352,
"grad_norm": 0.09994784742593765,
"learning_rate": 1.6362216181986002e-05,
"loss": 0.4964,
"step": 730
},
{
"epoch": 3.2929577464788733,
"grad_norm": 0.10088899731636047,
"learning_rate": 1.6162764719901046e-05,
"loss": 0.5083,
"step": 731
},
{
"epoch": 3.2974647887323942,
"grad_norm": 0.103594109416008,
"learning_rate": 1.596442942669335e-05,
"loss": 0.5287,
"step": 732
},
{
"epoch": 3.3019718309859156,
"grad_norm": 0.1050785481929779,
"learning_rate": 1.5767212942904276e-05,
"loss": 0.5247,
"step": 733
},
{
"epoch": 3.3064788732394366,
"grad_norm": 0.10635862499475479,
"learning_rate": 1.5571117894179754e-05,
"loss": 0.5207,
"step": 734
},
{
"epoch": 3.3109859154929575,
"grad_norm": 0.10588835924863815,
"learning_rate": 1.5376146891235598e-05,
"loss": 0.501,
"step": 735
},
{
"epoch": 3.315492957746479,
"grad_norm": 0.11529035866260529,
"learning_rate": 1.5182302529822479e-05,
"loss": 0.5338,
"step": 736
},
{
"epoch": 3.32,
"grad_norm": 0.11125069856643677,
"learning_rate": 1.4989587390691628e-05,
"loss": 0.5433,
"step": 737
},
{
"epoch": 3.3245070422535212,
"grad_norm": 0.10297177731990814,
"learning_rate": 1.4798004039560242e-05,
"loss": 0.5167,
"step": 738
},
{
"epoch": 3.329014084507042,
"grad_norm": 0.09564676135778427,
"learning_rate": 1.4607555027077525e-05,
"loss": 0.4976,
"step": 739
},
{
"epoch": 3.3335211267605636,
"grad_norm": 0.09943831712007523,
"learning_rate": 1.4418242888790579e-05,
"loss": 0.5223,
"step": 740
},
{
"epoch": 3.3380281690140845,
"grad_norm": 0.09775034338235855,
"learning_rate": 1.4230070145110707e-05,
"loss": 0.5125,
"step": 741
},
{
"epoch": 3.3425352112676054,
"grad_norm": 0.10720875859260559,
"learning_rate": 1.4043039301279903e-05,
"loss": 0.5541,
"step": 742
},
{
"epoch": 3.347042253521127,
"grad_norm": 0.10151630640029907,
"learning_rate": 1.3857152847337395e-05,
"loss": 0.516,
"step": 743
},
{
"epoch": 3.3515492957746478,
"grad_norm": 0.11832702904939651,
"learning_rate": 1.3672413258086592e-05,
"loss": 0.5162,
"step": 744
},
{
"epoch": 3.356056338028169,
"grad_norm": 0.09787547588348389,
"learning_rate": 1.3488822993062089e-05,
"loss": 0.4999,
"step": 745
},
{
"epoch": 3.36056338028169,
"grad_norm": 0.10296602547168732,
"learning_rate": 1.3306384496496927e-05,
"loss": 0.5479,
"step": 746
},
{
"epoch": 3.3650704225352115,
"grad_norm": 0.10161863267421722,
"learning_rate": 1.3125100197290019e-05,
"loss": 0.5444,
"step": 747
},
{
"epoch": 3.3695774647887324,
"grad_norm": 0.11383792757987976,
"learning_rate": 1.2944972508973908e-05,
"loss": 0.5736,
"step": 748
},
{
"epoch": 3.3740845070422534,
"grad_norm": 0.09610579162836075,
"learning_rate": 1.2766003829682505e-05,
"loss": 0.4991,
"step": 749
},
{
"epoch": 3.3785915492957748,
"grad_norm": 0.10732931643724442,
"learning_rate": 1.258819654211929e-05,
"loss": 0.5349,
"step": 750
},
{
"epoch": 3.3830985915492957,
"grad_norm": 0.10681644827127457,
"learning_rate": 1.2411553013525457e-05,
"loss": 0.5184,
"step": 751
},
{
"epoch": 3.387605633802817,
"grad_norm": 0.11193811148405075,
"learning_rate": 1.2236075595648566e-05,
"loss": 0.5576,
"step": 752
},
{
"epoch": 3.392112676056338,
"grad_norm": 0.11142749339342117,
"learning_rate": 1.2061766624711035e-05,
"loss": 0.5347,
"step": 753
},
{
"epoch": 3.396619718309859,
"grad_norm": 0.09409522265195847,
"learning_rate": 1.1888628421379221e-05,
"loss": 0.5002,
"step": 754
},
{
"epoch": 3.4011267605633804,
"grad_norm": 0.09690172225236893,
"learning_rate": 1.1716663290732366e-05,
"loss": 0.4881,
"step": 755
},
{
"epoch": 3.4056338028169013,
"grad_norm": 0.10559435188770294,
"learning_rate": 1.1545873522232053e-05,
"loss": 0.5071,
"step": 756
},
{
"epoch": 3.4101408450704227,
"grad_norm": 0.11353932321071625,
"learning_rate": 1.1376261389691634e-05,
"loss": 0.5303,
"step": 757
},
{
"epoch": 3.4146478873239436,
"grad_norm": 0.10134030133485794,
"learning_rate": 1.1207829151245941e-05,
"loss": 0.5171,
"step": 758
},
{
"epoch": 3.4191549295774646,
"grad_norm": 0.09742739796638489,
"learning_rate": 1.1040579049321309e-05,
"loss": 0.5122,
"step": 759
},
{
"epoch": 3.423661971830986,
"grad_norm": 0.09522052109241486,
"learning_rate": 1.0874513310605628e-05,
"loss": 0.5315,
"step": 760
},
{
"epoch": 3.428169014084507,
"grad_norm": 0.10180686414241791,
"learning_rate": 1.0709634146018798e-05,
"loss": 0.492,
"step": 761
},
{
"epoch": 3.4326760563380283,
"grad_norm": 0.10120957344770432,
"learning_rate": 1.0545943750683162e-05,
"loss": 0.5219,
"step": 762
},
{
"epoch": 3.4371830985915492,
"grad_norm": 0.10289746522903442,
"learning_rate": 1.0383444303894452e-05,
"loss": 0.5022,
"step": 763
},
{
"epoch": 3.44169014084507,
"grad_norm": 0.10165810585021973,
"learning_rate": 1.0222137969092581e-05,
"loss": 0.5154,
"step": 764
},
{
"epoch": 3.4461971830985916,
"grad_norm": 0.10728448629379272,
"learning_rate": 1.0062026893833033e-05,
"loss": 0.5083,
"step": 765
},
{
"epoch": 3.4507042253521125,
"grad_norm": 0.11152730882167816,
"learning_rate": 9.903113209758096e-06,
"loss": 0.5656,
"step": 766
},
{
"epoch": 3.455211267605634,
"grad_norm": 0.11231085658073425,
"learning_rate": 9.745399032568604e-06,
"loss": 0.5132,
"step": 767
},
{
"epoch": 3.459718309859155,
"grad_norm": 0.09897750616073608,
"learning_rate": 9.588886461995772e-06,
"loss": 0.5097,
"step": 768
},
{
"epoch": 3.4642253521126762,
"grad_norm": 0.10121899843215942,
"learning_rate": 9.43357758177309e-06,
"loss": 0.4748,
"step": 769
},
{
"epoch": 3.468732394366197,
"grad_norm": 0.11369026452302933,
"learning_rate": 9.279474459608805e-06,
"loss": 0.5531,
"step": 770
},
{
"epoch": 3.473239436619718,
"grad_norm": 0.09937481582164764,
"learning_rate": 9.126579147158187e-06,
"loss": 0.5213,
"step": 771
},
{
"epoch": 3.4777464788732395,
"grad_norm": 0.09812068939208984,
"learning_rate": 8.974893679996388e-06,
"loss": 0.5024,
"step": 772
},
{
"epoch": 3.4822535211267605,
"grad_norm": 0.11508133262395859,
"learning_rate": 8.824420077591155e-06,
"loss": 0.5378,
"step": 773
},
{
"epoch": 3.486760563380282,
"grad_norm": 0.11102595925331116,
"learning_rate": 8.675160343276167e-06,
"loss": 0.5344,
"step": 774
},
{
"epoch": 3.491267605633803,
"grad_norm": 0.09482184052467346,
"learning_rate": 8.527116464224127e-06,
"loss": 0.5032,
"step": 775
},
{
"epoch": 3.495774647887324,
"grad_norm": 0.09604179859161377,
"learning_rate": 8.380290411420522e-06,
"loss": 0.5006,
"step": 776
},
{
"epoch": 3.500281690140845,
"grad_norm": 0.10570445656776428,
"learning_rate": 8.234684139637205e-06,
"loss": 0.5378,
"step": 777
},
{
"epoch": 3.504788732394366,
"grad_norm": 0.09560370445251465,
"learning_rate": 8.090299587406514e-06,
"loss": 0.508,
"step": 778
},
{
"epoch": 3.5092957746478874,
"grad_norm": 0.09780082106590271,
"learning_rate": 7.947138676995302e-06,
"loss": 0.4905,
"step": 779
},
{
"epoch": 3.5138028169014084,
"grad_norm": 0.10680654644966125,
"learning_rate": 7.805203314379583e-06,
"loss": 0.4999,
"step": 780
},
{
"epoch": 3.5183098591549298,
"grad_norm": 0.10805977880954742,
"learning_rate": 7.664495389218884e-06,
"loss": 0.5246,
"step": 781
},
{
"epoch": 3.5228169014084507,
"grad_norm": 0.1088169664144516,
"learning_rate": 7.525016774831273e-06,
"loss": 0.4862,
"step": 782
},
{
"epoch": 3.5273239436619717,
"grad_norm": 0.09178736060857773,
"learning_rate": 7.386769328168353e-06,
"loss": 0.5066,
"step": 783
},
{
"epoch": 3.531830985915493,
"grad_norm": 0.1102149561047554,
"learning_rate": 7.249754889790539e-06,
"loss": 0.5054,
"step": 784
},
{
"epoch": 3.536338028169014,
"grad_norm": 0.11161289364099503,
"learning_rate": 7.113975283842589e-06,
"loss": 0.534,
"step": 785
},
{
"epoch": 3.5408450704225354,
"grad_norm": 0.100556880235672,
"learning_rate": 6.979432318029244e-06,
"loss": 0.5078,
"step": 786
},
{
"epoch": 3.5453521126760563,
"grad_norm": 0.10358420014381409,
"learning_rate": 6.846127783591294e-06,
"loss": 0.5192,
"step": 787
},
{
"epoch": 3.5498591549295773,
"grad_norm": 0.10767854750156403,
"learning_rate": 6.714063455281538e-06,
"loss": 0.5081,
"step": 788
},
{
"epoch": 3.5543661971830987,
"grad_norm": 0.09717968106269836,
"learning_rate": 6.583241091341353e-06,
"loss": 0.4981,
"step": 789
},
{
"epoch": 3.5588732394366196,
"grad_norm": 0.0952880010008812,
"learning_rate": 6.453662433477136e-06,
"loss": 0.512,
"step": 790
},
{
"epoch": 3.563380281690141,
"grad_norm": 0.10167808830738068,
"learning_rate": 6.325329206837216e-06,
"loss": 0.5,
"step": 791
},
{
"epoch": 3.567887323943662,
"grad_norm": 0.09986628592014313,
"learning_rate": 6.1982431199888225e-06,
"loss": 0.5119,
"step": 792
},
{
"epoch": 3.572394366197183,
"grad_norm": 0.09896910190582275,
"learning_rate": 6.072405864895403e-06,
"loss": 0.4996,
"step": 793
},
{
"epoch": 3.5769014084507043,
"grad_norm": 0.0993683710694313,
"learning_rate": 5.947819116893971e-06,
"loss": 0.4988,
"step": 794
},
{
"epoch": 3.581408450704225,
"grad_norm": 0.10620611906051636,
"learning_rate": 5.82448453467298e-06,
"loss": 0.5162,
"step": 795
},
{
"epoch": 3.5859154929577466,
"grad_norm": 0.09343092143535614,
"learning_rate": 5.7024037602500855e-06,
"loss": 0.5073,
"step": 796
},
{
"epoch": 3.5904225352112675,
"grad_norm": 0.09894745796918869,
"learning_rate": 5.581578418950373e-06,
"loss": 0.5001,
"step": 797
},
{
"epoch": 3.5949295774647885,
"grad_norm": 0.1150493249297142,
"learning_rate": 5.462010119384664e-06,
"loss": 0.5332,
"step": 798
},
{
"epoch": 3.59943661971831,
"grad_norm": 0.09008999168872833,
"learning_rate": 5.343700453428168e-06,
"loss": 0.5021,
"step": 799
},
{
"epoch": 3.6039436619718312,
"grad_norm": 0.09946510940790176,
"learning_rate": 5.226650996199223e-06,
"loss": 0.5182,
"step": 800
},
{
"epoch": 3.608450704225352,
"grad_norm": 0.09479941427707672,
"learning_rate": 5.1108633060383606e-06,
"loss": 0.5194,
"step": 801
},
{
"epoch": 3.612957746478873,
"grad_norm": 0.10796267539262772,
"learning_rate": 4.996338924487509e-06,
"loss": 0.5223,
"step": 802
},
{
"epoch": 3.6174647887323945,
"grad_norm": 0.09543827176094055,
"learning_rate": 4.883079376269573e-06,
"loss": 0.4974,
"step": 803
},
{
"epoch": 3.6219718309859155,
"grad_norm": 0.09834381937980652,
"learning_rate": 4.771086169268057e-06,
"loss": 0.502,
"step": 804
},
{
"epoch": 3.626478873239437,
"grad_norm": 0.0954863652586937,
"learning_rate": 4.660360794506946e-06,
"loss": 0.5052,
"step": 805
},
{
"epoch": 3.630985915492958,
"grad_norm": 0.10441611707210541,
"learning_rate": 4.550904726130989e-06,
"loss": 0.5249,
"step": 806
},
{
"epoch": 3.6354929577464787,
"grad_norm": 0.1028057262301445,
"learning_rate": 4.442719421385922e-06,
"loss": 0.5309,
"step": 807
},
{
"epoch": 3.64,
"grad_norm": 0.10420232266187668,
"learning_rate": 4.3358063205992336e-06,
"loss": 0.5092,
"step": 808
},
{
"epoch": 3.644507042253521,
"grad_norm": 0.09221101552248001,
"learning_rate": 4.230166847160799e-06,
"loss": 0.5059,
"step": 809
},
{
"epoch": 3.6490140845070425,
"grad_norm": 0.10424289852380753,
"learning_rate": 4.125802407504098e-06,
"loss": 0.5007,
"step": 810
},
{
"epoch": 3.6535211267605634,
"grad_norm": 0.10054883360862732,
"learning_rate": 4.022714391087379e-06,
"loss": 0.5172,
"step": 811
},
{
"epoch": 3.6580281690140843,
"grad_norm": 0.11087238043546677,
"learning_rate": 3.9209041703752395e-06,
"loss": 0.5811,
"step": 812
},
{
"epoch": 3.6625352112676057,
"grad_norm": 0.10872288048267365,
"learning_rate": 3.820373100820263e-06,
"loss": 0.5131,
"step": 813
},
{
"epoch": 3.6670422535211267,
"grad_norm": 0.1310155689716339,
"learning_rate": 3.7211225208450774e-06,
"loss": 0.4928,
"step": 814
},
{
"epoch": 3.671549295774648,
"grad_norm": 0.10429683327674866,
"learning_rate": 3.623153751824482e-06,
"loss": 0.5158,
"step": 815
},
{
"epoch": 3.676056338028169,
"grad_norm": 0.09816399961709976,
"learning_rate": 3.5264680980677924e-06,
"loss": 0.511,
"step": 816
},
{
"epoch": 3.68056338028169,
"grad_norm": 0.09635947644710541,
"learning_rate": 3.431066846801634e-06,
"loss": 0.5009,
"step": 817
},
{
"epoch": 3.6850704225352113,
"grad_norm": 0.1052606850862503,
"learning_rate": 3.3369512681526326e-06,
"loss": 0.5164,
"step": 818
},
{
"epoch": 3.6895774647887323,
"grad_norm": 0.09707427024841309,
"learning_rate": 3.2441226151306404e-06,
"loss": 0.5412,
"step": 819
},
{
"epoch": 3.6940845070422537,
"grad_norm": 0.09661922603845596,
"learning_rate": 3.1525821236119577e-06,
"loss": 0.4996,
"step": 820
},
{
"epoch": 3.6985915492957746,
"grad_norm": 0.1082405224442482,
"learning_rate": 3.0623310123229387e-06,
"loss": 0.5218,
"step": 821
},
{
"epoch": 3.7030985915492955,
"grad_norm": 0.08868458867073059,
"learning_rate": 2.973370482823734e-06,
"loss": 0.4729,
"step": 822
},
{
"epoch": 3.707605633802817,
"grad_norm": 0.1018679216504097,
"learning_rate": 2.8857017194923173e-06,
"loss": 0.5318,
"step": 823
},
{
"epoch": 3.712112676056338,
"grad_norm": 0.10018418729305267,
"learning_rate": 2.7993258895086973e-06,
"loss": 0.5007,
"step": 824
},
{
"epoch": 3.7166197183098593,
"grad_norm": 0.08692584931850433,
"learning_rate": 2.714244142839395e-06,
"loss": 0.5122,
"step": 825
},
{
"epoch": 3.72112676056338,
"grad_norm": 0.09924297034740448,
"learning_rate": 2.6304576122221035e-06,
"loss": 0.5165,
"step": 826
},
{
"epoch": 3.725633802816901,
"grad_norm": 0.09864252805709839,
"learning_rate": 2.5479674131506425e-06,
"loss": 0.5397,
"step": 827
},
{
"epoch": 3.7301408450704225,
"grad_norm": 0.09954255074262619,
"learning_rate": 2.466774643860115e-06,
"loss": 0.5043,
"step": 828
},
{
"epoch": 3.734647887323944,
"grad_norm": 0.11289742588996887,
"learning_rate": 2.386880385312218e-06,
"loss": 0.528,
"step": 829
},
{
"epoch": 3.739154929577465,
"grad_norm": 0.09993050992488861,
"learning_rate": 2.3082857011809344e-06,
"loss": 0.4994,
"step": 830
},
{
"epoch": 3.743661971830986,
"grad_norm": 0.09765881299972534,
"learning_rate": 2.230991637838309e-06,
"loss": 0.4916,
"step": 831
},
{
"epoch": 3.748169014084507,
"grad_norm": 0.10377711802721024,
"learning_rate": 2.1549992243405816e-06,
"loss": 0.5386,
"step": 832
},
{
"epoch": 3.752676056338028,
"grad_norm": 0.09763538837432861,
"learning_rate": 2.080309472414388e-06,
"loss": 0.5002,
"step": 833
},
{
"epoch": 3.7571830985915495,
"grad_norm": 0.09959236532449722,
"learning_rate": 2.006923376443415e-06,
"loss": 0.4978,
"step": 834
},
{
"epoch": 3.7616901408450705,
"grad_norm": 0.09664242714643478,
"learning_rate": 1.934841913455032e-06,
"loss": 0.5301,
"step": 835
},
{
"epoch": 3.7661971830985914,
"grad_norm": 0.10548033565282822,
"learning_rate": 1.8640660431074265e-06,
"loss": 0.5179,
"step": 836
},
{
"epoch": 3.770704225352113,
"grad_norm": 0.10041049867868423,
"learning_rate": 1.7945967076766546e-06,
"loss": 0.5119,
"step": 837
},
{
"epoch": 3.7752112676056337,
"grad_norm": 0.09538040310144424,
"learning_rate": 1.7264348320442992e-06,
"loss": 0.5145,
"step": 838
},
{
"epoch": 3.779718309859155,
"grad_norm": 0.1140114888548851,
"learning_rate": 1.6595813236849556e-06,
"loss": 0.5112,
"step": 839
},
{
"epoch": 3.784225352112676,
"grad_norm": 0.09238018095493317,
"learning_rate": 1.5940370726542863e-06,
"loss": 0.496,
"step": 840
},
{
"epoch": 3.788732394366197,
"grad_norm": 0.10022945702075958,
"learning_rate": 1.5298029515771195e-06,
"loss": 0.5104,
"step": 841
},
{
"epoch": 3.7932394366197184,
"grad_norm": 0.1022907942533493,
"learning_rate": 1.4668798156358465e-06,
"loss": 0.515,
"step": 842
},
{
"epoch": 3.7977464788732394,
"grad_norm": 0.09121408313512802,
"learning_rate": 1.4052685025590096e-06,
"loss": 0.4859,
"step": 843
},
{
"epoch": 3.8022535211267607,
"grad_norm": 0.09021437168121338,
"learning_rate": 1.344969832610199e-06,
"loss": 0.493,
"step": 844
},
{
"epoch": 3.8067605633802817,
"grad_norm": 0.09189501404762268,
"learning_rate": 1.2859846085770733e-06,
"loss": 0.4916,
"step": 845
},
{
"epoch": 3.8112676056338026,
"grad_norm": 0.09620609134435654,
"learning_rate": 1.2283136157607121e-06,
"loss": 0.5147,
"step": 846
},
{
"epoch": 3.815774647887324,
"grad_norm": 0.09854090213775635,
"learning_rate": 1.1719576219651585e-06,
"loss": 0.4978,
"step": 847
},
{
"epoch": 3.820281690140845,
"grad_norm": 0.0998944416642189,
"learning_rate": 1.1169173774871478e-06,
"loss": 0.5132,
"step": 848
},
{
"epoch": 3.8247887323943663,
"grad_norm": 0.09280737489461899,
"learning_rate": 1.0631936151062172e-06,
"loss": 0.5251,
"step": 849
},
{
"epoch": 3.8292957746478873,
"grad_norm": 0.10368622094392776,
"learning_rate": 1.010787050074835e-06,
"loss": 0.511,
"step": 850
},
{
"epoch": 3.8338028169014082,
"grad_norm": 0.09526852518320084,
"learning_rate": 9.596983801089864e-07,
"loss": 0.562,
"step": 851
},
{
"epoch": 3.8383098591549296,
"grad_norm": 0.10724498331546783,
"learning_rate": 9.099282853787805e-07,
"loss": 0.5394,
"step": 852
},
{
"epoch": 3.8428169014084506,
"grad_norm": 0.08827169984579086,
"learning_rate": 8.614774284994798e-07,
"loss": 0.4826,
"step": 853
},
{
"epoch": 3.847323943661972,
"grad_norm": 0.0911865234375,
"learning_rate": 8.143464545226298e-07,
"loss": 0.5189,
"step": 854
},
{
"epoch": 3.851830985915493,
"grad_norm": 0.08919581770896912,
"learning_rate": 7.685359909274881e-07,
"loss": 0.51,
"step": 855
},
{
"epoch": 3.856338028169014,
"grad_norm": 0.1076306402683258,
"learning_rate": 7.24046647612675e-07,
"loss": 0.5176,
"step": 856
},
{
"epoch": 3.860845070422535,
"grad_norm": 0.10025174915790558,
"learning_rate": 6.808790168880364e-07,
"loss": 0.5135,
"step": 857
},
{
"epoch": 3.8653521126760566,
"grad_norm": 0.09523887187242508,
"learning_rate": 6.390336734667823e-07,
"loss": 0.4964,
"step": 858
},
{
"epoch": 3.8698591549295775,
"grad_norm": 0.09640923887491226,
"learning_rate": 5.985111744578165e-07,
"loss": 0.4899,
"step": 859
},
{
"epoch": 3.8743661971830985,
"grad_norm": 0.0929466262459755,
"learning_rate": 5.593120593582967e-07,
"loss": 0.5196,
"step": 860
},
{
"epoch": 3.87887323943662,
"grad_norm": 0.10044664144515991,
"learning_rate": 5.214368500465305e-07,
"loss": 0.5162,
"step": 861
},
{
"epoch": 3.883380281690141,
"grad_norm": 0.1058187335729599,
"learning_rate": 4.848860507749353e-07,
"loss": 0.5232,
"step": 862
},
{
"epoch": 3.887887323943662,
"grad_norm": 0.10016901046037674,
"learning_rate": 4.496601481634e-07,
"loss": 0.5218,
"step": 863
},
{
"epoch": 3.892394366197183,
"grad_norm": 0.10483549535274506,
"learning_rate": 4.157596111927342e-07,
"loss": 0.5097,
"step": 864
},
{
"epoch": 3.896901408450704,
"grad_norm": 0.10363733023405075,
"learning_rate": 3.831848911984959e-07,
"loss": 0.5297,
"step": 865
},
{
"epoch": 3.9014084507042255,
"grad_norm": 0.08613172918558121,
"learning_rate": 3.51936421864929e-07,
"loss": 0.4929,
"step": 866
},
{
"epoch": 3.9059154929577464,
"grad_norm": 0.09748660773038864,
"learning_rate": 3.220146192192242e-07,
"loss": 0.5301,
"step": 867
},
{
"epoch": 3.910422535211268,
"grad_norm": 0.09546560794115067,
"learning_rate": 2.934198816259559e-07,
"loss": 0.517,
"step": 868
},
{
"epoch": 3.9149295774647888,
"grad_norm": 0.10289441049098969,
"learning_rate": 2.661525897817874e-07,
"loss": 0.5074,
"step": 869
},
{
"epoch": 3.9194366197183097,
"grad_norm": 0.0952000617980957,
"learning_rate": 2.402131067104296e-07,
"loss": 0.4901,
"step": 870
},
{
"epoch": 3.923943661971831,
"grad_norm": 0.10645543783903122,
"learning_rate": 2.156017777577346e-07,
"loss": 0.5192,
"step": 871
},
{
"epoch": 3.928450704225352,
"grad_norm": 0.10012649744749069,
"learning_rate": 1.9231893058718754e-07,
"loss": 0.512,
"step": 872
},
{
"epoch": 3.9329577464788734,
"grad_norm": 0.11233918368816376,
"learning_rate": 1.7036487517547717e-07,
"loss": 0.5288,
"step": 873
},
{
"epoch": 3.9374647887323944,
"grad_norm": 0.0949663370847702,
"learning_rate": 1.4973990380841019e-07,
"loss": 0.5148,
"step": 874
},
{
"epoch": 3.9419718309859153,
"grad_norm": 0.08848931640386581,
"learning_rate": 1.3044429107700318e-07,
"loss": 0.5156,
"step": 875
},
{
"epoch": 3.9464788732394367,
"grad_norm": 0.11036274582147598,
"learning_rate": 1.1247829387381892e-07,
"loss": 0.5136,
"step": 876
},
{
"epoch": 3.9509859154929576,
"grad_norm": 0.09624003618955612,
"learning_rate": 9.584215138953579e-08,
"loss": 0.5001,
"step": 877
},
{
"epoch": 3.955492957746479,
"grad_norm": 0.10382691025733948,
"learning_rate": 8.053608510982802e-08,
"loss": 0.5184,
"step": 878
},
{
"epoch": 3.96,
"grad_norm": 0.18590374290943146,
"learning_rate": 6.656029881233483e-08,
"loss": 0.5439,
"step": 879
},
{
"epoch": 3.964507042253521,
"grad_norm": 0.09918248653411865,
"learning_rate": 5.391497856399585e-08,
"loss": 0.5139,
"step": 880
},
{
"epoch": 3.9690140845070423,
"grad_norm": 0.0977630540728569,
"learning_rate": 4.260029271856425e-08,
"loss": 0.5031,
"step": 881
},
{
"epoch": 3.9735211267605632,
"grad_norm": 0.10441361367702484,
"learning_rate": 3.2616391914364054e-08,
"loss": 0.5304,
"step": 882
},
{
"epoch": 3.9780281690140846,
"grad_norm": 0.09436926990747452,
"learning_rate": 2.396340907225847e-08,
"loss": 0.4986,
"step": 883
},
{
"epoch": 3.9825352112676056,
"grad_norm": 0.09458206593990326,
"learning_rate": 1.664145939394013e-08,
"loss": 0.5522,
"step": 884
},
{
"epoch": 3.9870422535211265,
"grad_norm": 0.10429766774177551,
"learning_rate": 1.0650640360343468e-08,
"loss": 0.5175,
"step": 885
},
{
"epoch": 3.991549295774648,
"grad_norm": 0.09607253223657608,
"learning_rate": 5.991031730367969e-09,
"loss": 0.5038,
"step": 886
},
{
"epoch": 3.9960563380281693,
"grad_norm": 0.13139435648918152,
"learning_rate": 2.6626955398234564e-09,
"loss": 0.5277,
"step": 887
},
{
"epoch": 4.0,
"grad_norm": 0.1130460873246193,
"learning_rate": 6.656761005752188e-10,
"loss": 0.51,
"step": 888
}
],
"logging_steps": 1,
"max_steps": 888,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}