{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.30042194092827,
"eval_steps": 500,
"global_step": 267,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.33328112959861755,
"learning_rate": 2e-05,
"loss": 0.9029,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.341234415769577,
"learning_rate": 4e-05,
"loss": 0.9276,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 0.3182106614112854,
"learning_rate": 6e-05,
"loss": 0.9148,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 0.22865141928195953,
"learning_rate": 8e-05,
"loss": 0.8421,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.29670122265815735,
"learning_rate": 0.0001,
"loss": 0.7984,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 0.28762516379356384,
"learning_rate": 0.00012,
"loss": 0.7625,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 0.23907965421676636,
"learning_rate": 0.00014,
"loss": 0.7226,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 0.19532263278961182,
"learning_rate": 0.00016,
"loss": 0.7043,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 0.1425202488899231,
"learning_rate": 0.00018,
"loss": 0.6784,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 0.10882167518138885,
"learning_rate": 0.0002,
"loss": 0.6579,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.11275648325681686,
"learning_rate": 0.00019999935985220405,
"loss": 0.6592,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 0.10155748575925827,
"learning_rate": 0.00019999743941701188,
"loss": 0.6554,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 0.0845816433429718,
"learning_rate": 0.0001999942387190108,
"loss": 0.6513,
"step": 13
},
{
"epoch": 0.02,
"grad_norm": 0.09046202898025513,
"learning_rate": 0.0001999897577991792,
"loss": 0.6267,
"step": 14
},
{
"epoch": 0.02,
"grad_norm": 0.08296829462051392,
"learning_rate": 0.00019998399671488612,
"loss": 0.6434,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 0.07594181597232819,
"learning_rate": 0.00019997695553989042,
"loss": 0.6096,
"step": 16
},
{
"epoch": 0.02,
"grad_norm": 0.0683172270655632,
"learning_rate": 0.00019996863436433997,
"loss": 0.6143,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 0.05627186596393585,
"learning_rate": 0.0001999590332947704,
"loss": 0.6024,
"step": 18
},
{
"epoch": 0.02,
"grad_norm": 0.05644279345870018,
"learning_rate": 0.00019994815245410384,
"loss": 0.595,
"step": 19
},
{
"epoch": 0.02,
"grad_norm": 0.05661479011178017,
"learning_rate": 0.00019993599198164715,
"loss": 0.5759,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.04077177122235298,
"learning_rate": 0.00019992255203309033,
"loss": 0.582,
"step": 21
},
{
"epoch": 0.02,
"grad_norm": 0.04510512948036194,
"learning_rate": 0.00019990783278050448,
"loss": 0.5751,
"step": 22
},
{
"epoch": 0.03,
"grad_norm": 0.0470162108540535,
"learning_rate": 0.00019989183441233952,
"loss": 0.5716,
"step": 23
},
{
"epoch": 0.03,
"grad_norm": 0.04402562975883484,
"learning_rate": 0.00019987455713342187,
"loss": 0.564,
"step": 24
},
{
"epoch": 0.03,
"grad_norm": 0.045594893395900726,
"learning_rate": 0.00019985600116495173,
"loss": 0.5657,
"step": 25
},
{
"epoch": 0.03,
"grad_norm": 0.037670280784368515,
"learning_rate": 0.0001998361667445004,
"loss": 0.5619,
"step": 26
},
{
"epoch": 0.03,
"grad_norm": 0.034366946667432785,
"learning_rate": 0.00019981505412600706,
"loss": 0.554,
"step": 27
},
{
"epoch": 0.03,
"grad_norm": 0.044084370136260986,
"learning_rate": 0.00019979266357977564,
"loss": 0.5527,
"step": 28
},
{
"epoch": 0.03,
"grad_norm": 0.04839107394218445,
"learning_rate": 0.0001997689953924713,
"loss": 0.5534,
"step": 29
},
{
"epoch": 0.03,
"grad_norm": 0.04943666234612465,
"learning_rate": 0.0001997440498671168,
"loss": 0.5357,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 0.05644814297556877,
"learning_rate": 0.00019971782732308867,
"loss": 0.5388,
"step": 31
},
{
"epoch": 0.04,
"grad_norm": 0.05796538665890694,
"learning_rate": 0.00019969032809611287,
"loss": 0.5327,
"step": 32
},
{
"epoch": 0.04,
"grad_norm": 0.05399211496114731,
"learning_rate": 0.0001996615525382609,
"loss": 0.5447,
"step": 33
},
{
"epoch": 0.04,
"grad_norm": 0.03574652597308159,
"learning_rate": 0.0001996315010179449,
"loss": 0.5213,
"step": 34
},
{
"epoch": 0.04,
"grad_norm": 0.04394914582371712,
"learning_rate": 0.00019960017391991314,
"loss": 0.5247,
"step": 35
},
{
"epoch": 0.04,
"grad_norm": 0.05197073519229889,
"learning_rate": 0.00019956757164524516,
"loss": 0.5253,
"step": 36
},
{
"epoch": 0.04,
"grad_norm": 0.05605654790997505,
"learning_rate": 0.00019953369461134634,
"loss": 0.5289,
"step": 37
},
{
"epoch": 0.04,
"grad_norm": 0.05167644843459129,
"learning_rate": 0.00019949854325194294,
"loss": 0.5223,
"step": 38
},
{
"epoch": 0.04,
"grad_norm": 0.04118728265166283,
"learning_rate": 0.0001994621180170762,
"loss": 0.52,
"step": 39
},
{
"epoch": 0.05,
"grad_norm": 0.0373263917863369,
"learning_rate": 0.00019942441937309684,
"loss": 0.5076,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 0.04870600998401642,
"learning_rate": 0.0001993854478026589,
"loss": 0.5287,
"step": 41
},
{
"epoch": 0.05,
"grad_norm": 0.05014103651046753,
"learning_rate": 0.00019934520380471372,
"loss": 0.5048,
"step": 42
},
{
"epoch": 0.05,
"grad_norm": 0.04807833209633827,
"learning_rate": 0.0001993036878945034,
"loss": 0.5075,
"step": 43
},
{
"epoch": 0.05,
"grad_norm": 0.040045421570539474,
"learning_rate": 0.0001992609006035543,
"loss": 0.4954,
"step": 44
},
{
"epoch": 0.05,
"grad_norm": 0.04202349856495857,
"learning_rate": 0.00019921684247967028,
"loss": 0.4953,
"step": 45
},
{
"epoch": 0.05,
"grad_norm": 0.041629109531641006,
"learning_rate": 0.0001991715140869255,
"loss": 0.501,
"step": 46
},
{
"epoch": 0.05,
"grad_norm": 0.04767894372344017,
"learning_rate": 0.0001991249160056574,
"loss": 0.4878,
"step": 47
},
{
"epoch": 0.05,
"grad_norm": 0.05395263060927391,
"learning_rate": 0.00019907704883245916,
"loss": 0.5014,
"step": 48
},
{
"epoch": 0.06,
"grad_norm": 0.06040235981345177,
"learning_rate": 0.00019902791318017205,
"loss": 0.5043,
"step": 49
},
{
"epoch": 0.06,
"grad_norm": 0.06749273091554642,
"learning_rate": 0.0001989775096778777,
"loss": 0.4931,
"step": 50
},
{
"epoch": 0.06,
"grad_norm": 0.06482889503240585,
"learning_rate": 0.00019892583897088994,
"loss": 0.4869,
"step": 51
},
{
"epoch": 0.06,
"grad_norm": 0.045358914881944656,
"learning_rate": 0.0001988729017207465,
"loss": 0.479,
"step": 52
},
{
"epoch": 0.06,
"grad_norm": 0.043360061943531036,
"learning_rate": 0.00019881869860520073,
"loss": 0.4953,
"step": 53
},
{
"epoch": 0.06,
"grad_norm": 0.060205183923244476,
"learning_rate": 0.00019876323031821266,
"loss": 0.4705,
"step": 54
},
{
"epoch": 0.06,
"grad_norm": 0.05729120969772339,
"learning_rate": 0.00019870649756994037,
"loss": 0.4887,
"step": 55
},
{
"epoch": 0.06,
"grad_norm": 0.03843148052692413,
"learning_rate": 0.00019864850108673073,
"loss": 0.4737,
"step": 56
},
{
"epoch": 0.06,
"grad_norm": 0.053673889487981796,
"learning_rate": 0.00019858924161111015,
"loss": 0.4817,
"step": 57
},
{
"epoch": 0.07,
"grad_norm": 0.05148368701338768,
"learning_rate": 0.00019852871990177503,
"loss": 0.4763,
"step": 58
},
{
"epoch": 0.07,
"grad_norm": 0.05371672287583351,
"learning_rate": 0.00019846693673358226,
"loss": 0.4751,
"step": 59
},
{
"epoch": 0.07,
"grad_norm": 0.05490916967391968,
"learning_rate": 0.00019840389289753896,
"loss": 0.457,
"step": 60
},
{
"epoch": 0.07,
"grad_norm": 0.04629400372505188,
"learning_rate": 0.00019833958920079255,
"loss": 0.4692,
"step": 61
},
{
"epoch": 0.07,
"grad_norm": 0.051137253642082214,
"learning_rate": 0.00019827402646662047,
"loss": 0.4614,
"step": 62
},
{
"epoch": 0.07,
"grad_norm": 0.051790811121463776,
"learning_rate": 0.0001982072055344195,
"loss": 0.4594,
"step": 63
},
{
"epoch": 0.07,
"grad_norm": 0.0445956289768219,
"learning_rate": 0.00019813912725969509,
"loss": 0.4601,
"step": 64
},
{
"epoch": 0.07,
"grad_norm": 0.04766576737165451,
"learning_rate": 0.0001980697925140504,
"loss": 0.4631,
"step": 65
},
{
"epoch": 0.07,
"grad_norm": 0.04839074984192848,
"learning_rate": 0.0001979992021851751,
"loss": 0.4605,
"step": 66
},
{
"epoch": 0.08,
"grad_norm": 0.04736727103590965,
"learning_rate": 0.0001979273571768341,
"loss": 0.4617,
"step": 67
},
{
"epoch": 0.08,
"grad_norm": 0.057293377816677094,
"learning_rate": 0.0001978542584088558,
"loss": 0.4621,
"step": 68
},
{
"epoch": 0.08,
"grad_norm": 0.05025665834546089,
"learning_rate": 0.0001977799068171206,
"loss": 0.4671,
"step": 69
},
{
"epoch": 0.08,
"grad_norm": 0.057366687804460526,
"learning_rate": 0.0001977043033535486,
"loss": 0.4521,
"step": 70
},
{
"epoch": 0.08,
"grad_norm": 0.07595837116241455,
"learning_rate": 0.00019762744898608762,
"loss": 0.4671,
"step": 71
},
{
"epoch": 0.08,
"grad_norm": 0.07574213296175003,
"learning_rate": 0.0001975493446987007,
"loss": 0.4664,
"step": 72
},
{
"epoch": 0.08,
"grad_norm": 0.06472938507795334,
"learning_rate": 0.00019746999149135362,
"loss": 0.456,
"step": 73
},
{
"epoch": 0.08,
"grad_norm": 0.05983012542128563,
"learning_rate": 0.00019738939038000205,
"loss": 0.4459,
"step": 74
},
{
"epoch": 0.08,
"grad_norm": 0.05136057734489441,
"learning_rate": 0.00019730754239657842,
"loss": 0.4486,
"step": 75
},
{
"epoch": 0.09,
"grad_norm": 0.06191498041152954,
"learning_rate": 0.00019722444858897878,
"loss": 0.4424,
"step": 76
},
{
"epoch": 0.09,
"grad_norm": 0.06742191314697266,
"learning_rate": 0.0001971401100210496,
"loss": 0.458,
"step": 77
},
{
"epoch": 0.09,
"grad_norm": 0.06019548326730728,
"learning_rate": 0.00019705452777257377,
"loss": 0.4423,
"step": 78
},
{
"epoch": 0.09,
"grad_norm": 0.05012982338666916,
"learning_rate": 0.0001969677029392571,
"loss": 0.4466,
"step": 79
},
{
"epoch": 0.09,
"grad_norm": 0.0552060566842556,
"learning_rate": 0.00019687963663271409,
"loss": 0.4534,
"step": 80
},
{
"epoch": 0.09,
"grad_norm": 0.05883748456835747,
"learning_rate": 0.00019679032998045376,
"loss": 0.4409,
"step": 81
},
{
"epoch": 0.09,
"grad_norm": 0.07146705687046051,
"learning_rate": 0.00019669978412586528,
"loss": 0.4582,
"step": 82
},
{
"epoch": 0.09,
"grad_norm": 0.054095230996608734,
"learning_rate": 0.00019660800022820317,
"loss": 0.4487,
"step": 83
},
{
"epoch": 0.09,
"grad_norm": 0.04927053675055504,
"learning_rate": 0.00019651497946257266,
"loss": 0.4429,
"step": 84
},
{
"epoch": 0.1,
"grad_norm": 0.06037526577711105,
"learning_rate": 0.00019642072301991455,
"loss": 0.4456,
"step": 85
},
{
"epoch": 0.1,
"grad_norm": 0.05555957555770874,
"learning_rate": 0.00019632523210698987,
"loss": 0.4382,
"step": 86
},
{
"epoch": 0.1,
"grad_norm": 0.04606284573674202,
"learning_rate": 0.00019622850794636455,
"loss": 0.4411,
"step": 87
},
{
"epoch": 0.1,
"grad_norm": 0.04605920985341072,
"learning_rate": 0.00019613055177639384,
"loss": 0.4326,
"step": 88
},
{
"epoch": 0.1,
"grad_norm": 0.050325632095336914,
"learning_rate": 0.0001960313648512062,
"loss": 0.4338,
"step": 89
},
{
"epoch": 0.1,
"grad_norm": 0.04921424016356468,
"learning_rate": 0.00019593094844068748,
"loss": 0.4316,
"step": 90
},
{
"epoch": 0.1,
"grad_norm": 0.04333706200122833,
"learning_rate": 0.00019582930383046457,
"loss": 0.4441,
"step": 91
},
{
"epoch": 0.1,
"grad_norm": 0.048454612493515015,
"learning_rate": 0.0001957264323218889,
"loss": 0.4382,
"step": 92
},
{
"epoch": 0.1,
"grad_norm": 0.0541059784591198,
"learning_rate": 0.00019562233523201986,
"loss": 0.4328,
"step": 93
},
{
"epoch": 0.11,
"grad_norm": 0.043696511536836624,
"learning_rate": 0.00019551701389360795,
"loss": 0.4335,
"step": 94
},
{
"epoch": 0.11,
"grad_norm": 0.04407835751771927,
"learning_rate": 0.00019541046965507758,
"loss": 0.4327,
"step": 95
},
{
"epoch": 0.11,
"grad_norm": 0.05477238819003105,
"learning_rate": 0.00019530270388050998,
"loss": 0.4294,
"step": 96
},
{
"epoch": 0.11,
"grad_norm": 0.05609311908483505,
"learning_rate": 0.00019519371794962556,
"loss": 0.4305,
"step": 97
},
{
"epoch": 0.11,
"grad_norm": 0.045145273208618164,
"learning_rate": 0.00019508351325776642,
"loss": 0.4395,
"step": 98
},
{
"epoch": 0.11,
"grad_norm": 0.04475285857915878,
"learning_rate": 0.00019497209121587837,
"loss": 0.4284,
"step": 99
},
{
"epoch": 0.11,
"grad_norm": 0.04405711591243744,
"learning_rate": 0.00019485945325049288,
"loss": 0.4214,
"step": 100
},
{
"epoch": 0.11,
"grad_norm": 0.04461454227566719,
"learning_rate": 0.0001947456008037089,
"loss": 0.4154,
"step": 101
},
{
"epoch": 0.11,
"grad_norm": 0.04791221395134926,
"learning_rate": 0.00019463053533317425,
"loss": 0.4248,
"step": 102
},
{
"epoch": 0.12,
"grad_norm": 0.05543987452983856,
"learning_rate": 0.00019451425831206706,
"loss": 0.4303,
"step": 103
},
{
"epoch": 0.12,
"grad_norm": 0.06330578774213791,
"learning_rate": 0.00019439677122907697,
"loss": 0.4274,
"step": 104
},
{
"epoch": 0.12,
"grad_norm": 0.05569112300872803,
"learning_rate": 0.00019427807558838588,
"loss": 0.4234,
"step": 105
},
{
"epoch": 0.12,
"grad_norm": 0.047680530697107315,
"learning_rate": 0.00019415817290964883,
"loss": 0.4155,
"step": 106
},
{
"epoch": 0.12,
"grad_norm": 0.05214262008666992,
"learning_rate": 0.0001940370647279746,
"loss": 0.4224,
"step": 107
},
{
"epoch": 0.12,
"grad_norm": 0.06332990527153015,
"learning_rate": 0.00019391475259390584,
"loss": 0.4233,
"step": 108
},
{
"epoch": 0.12,
"grad_norm": 0.05726313218474388,
"learning_rate": 0.00019379123807339942,
"loss": 0.4118,
"step": 109
},
{
"epoch": 0.12,
"grad_norm": 0.044936031103134155,
"learning_rate": 0.00019366652274780628,
"loss": 0.4296,
"step": 110
},
{
"epoch": 0.12,
"grad_norm": 0.05117325484752655,
"learning_rate": 0.0001935406082138513,
"loss": 0.4287,
"step": 111
},
{
"epoch": 0.13,
"grad_norm": 0.058542776852846146,
"learning_rate": 0.00019341349608361267,
"loss": 0.4213,
"step": 112
},
{
"epoch": 0.13,
"grad_norm": 0.056066304445266724,
"learning_rate": 0.00019328518798450138,
"loss": 0.4174,
"step": 113
},
{
"epoch": 0.13,
"grad_norm": 0.049762677401304245,
"learning_rate": 0.00019315568555924035,
"loss": 0.418,
"step": 114
},
{
"epoch": 0.13,
"grad_norm": 0.043821126222610474,
"learning_rate": 0.00019302499046584348,
"loss": 0.4012,
"step": 115
},
{
"epoch": 0.13,
"grad_norm": 0.05036221817135811,
"learning_rate": 0.00019289310437759427,
"loss": 0.4237,
"step": 116
},
{
"epoch": 0.13,
"grad_norm": 0.050889529287815094,
"learning_rate": 0.00019276002898302447,
"loss": 0.4144,
"step": 117
},
{
"epoch": 0.13,
"grad_norm": 0.04269757494330406,
"learning_rate": 0.0001926257659858925,
"loss": 0.4078,
"step": 118
},
{
"epoch": 0.13,
"grad_norm": 0.04927165433764458,
"learning_rate": 0.00019249031710516162,
"loss": 0.4155,
"step": 119
},
{
"epoch": 0.14,
"grad_norm": 0.05124311521649361,
"learning_rate": 0.00019235368407497788,
"loss": 0.3966,
"step": 120
},
{
"epoch": 0.14,
"grad_norm": 0.04073040187358856,
"learning_rate": 0.00019221586864464786,
"loss": 0.4064,
"step": 121
},
{
"epoch": 0.14,
"grad_norm": 0.04988453537225723,
"learning_rate": 0.00019207687257861655,
"loss": 0.4197,
"step": 122
},
{
"epoch": 0.14,
"grad_norm": 0.05227258801460266,
"learning_rate": 0.0001919366976564444,
"loss": 0.414,
"step": 123
},
{
"epoch": 0.14,
"grad_norm": 0.0466819666326046,
"learning_rate": 0.00019179534567278475,
"loss": 0.4173,
"step": 124
},
{
"epoch": 0.14,
"grad_norm": 0.047633491456508636,
"learning_rate": 0.00019165281843736085,
"loss": 0.4085,
"step": 125
},
{
"epoch": 0.14,
"grad_norm": 0.05280464142560959,
"learning_rate": 0.00019150911777494258,
"loss": 0.4051,
"step": 126
},
{
"epoch": 0.14,
"grad_norm": 0.052302148193120956,
"learning_rate": 0.00019136424552532318,
"loss": 0.42,
"step": 127
},
{
"epoch": 0.14,
"grad_norm": 0.04875241965055466,
"learning_rate": 0.00019121820354329577,
"loss": 0.4258,
"step": 128
},
{
"epoch": 0.15,
"grad_norm": 0.04654408246278763,
"learning_rate": 0.0001910709936986293,
"loss": 0.409,
"step": 129
},
{
"epoch": 0.15,
"grad_norm": 0.05745020881295204,
"learning_rate": 0.00019092261787604492,
"loss": 0.4059,
"step": 130
},
{
"epoch": 0.15,
"grad_norm": 0.06945241987705231,
"learning_rate": 0.00019077307797519183,
"loss": 0.4038,
"step": 131
},
{
"epoch": 0.15,
"grad_norm": 0.06346461176872253,
"learning_rate": 0.00019062237591062272,
"loss": 0.4031,
"step": 132
},
{
"epoch": 0.15,
"grad_norm": 0.058026187121868134,
"learning_rate": 0.00019047051361176953,
"loss": 0.4126,
"step": 133
},
{
"epoch": 0.15,
"grad_norm": 0.04755179584026337,
"learning_rate": 0.0001903174930229185,
"loss": 0.4209,
"step": 134
},
{
"epoch": 0.15,
"grad_norm": 0.05765068158507347,
"learning_rate": 0.0001901633161031856,
"loss": 0.4067,
"step": 135
},
{
"epoch": 0.15,
"grad_norm": 0.05687811225652695,
"learning_rate": 0.000190007984826491,
"loss": 0.3975,
"step": 136
},
{
"epoch": 0.15,
"grad_norm": 0.04930473491549492,
"learning_rate": 0.0001898515011815343,
"loss": 0.4146,
"step": 137
},
{
"epoch": 0.16,
"grad_norm": 0.05147051811218262,
"learning_rate": 0.0001896938671717687,
"loss": 0.4035,
"step": 138
},
{
"epoch": 0.16,
"grad_norm": 0.05680418014526367,
"learning_rate": 0.0001895350848153754,
"loss": 0.4049,
"step": 139
},
{
"epoch": 0.16,
"grad_norm": 0.0444297268986702,
"learning_rate": 0.00018937515614523797,
"loss": 0.4085,
"step": 140
},
{
"epoch": 0.16,
"grad_norm": 0.05083802342414856,
"learning_rate": 0.00018921408320891612,
"loss": 0.4036,
"step": 141
},
{
"epoch": 0.16,
"grad_norm": 0.04978756606578827,
"learning_rate": 0.00018905186806861957,
"loss": 0.4058,
"step": 142
},
{
"epoch": 0.16,
"grad_norm": 0.04963681101799011,
"learning_rate": 0.00018888851280118155,
"loss": 0.3977,
"step": 143
},
{
"epoch": 0.16,
"grad_norm": 0.0466095507144928,
"learning_rate": 0.00018872401949803237,
"loss": 0.3945,
"step": 144
},
{
"epoch": 0.16,
"grad_norm": 0.04972768574953079,
"learning_rate": 0.00018855839026517257,
"loss": 0.4151,
"step": 145
},
{
"epoch": 0.16,
"grad_norm": 0.054370637983083725,
"learning_rate": 0.0001883916272231459,
"loss": 0.3944,
"step": 146
},
{
"epoch": 0.17,
"grad_norm": 0.054699357599020004,
"learning_rate": 0.00018822373250701224,
"loss": 0.3989,
"step": 147
},
{
"epoch": 0.17,
"grad_norm": 0.054452769458293915,
"learning_rate": 0.00018805470826632024,
"loss": 0.3984,
"step": 148
},
{
"epoch": 0.17,
"grad_norm": 0.04596908017992973,
"learning_rate": 0.00018788455666507981,
"loss": 0.4018,
"step": 149
},
{
"epoch": 0.17,
"grad_norm": 0.054354868829250336,
"learning_rate": 0.00018771327988173435,
"loss": 0.3985,
"step": 150
},
{
"epoch": 0.17,
"grad_norm": 0.05570242181420326,
"learning_rate": 0.00018754088010913304,
"loss": 0.3818,
"step": 151
},
{
"epoch": 0.17,
"grad_norm": 0.054722048342227936,
"learning_rate": 0.00018736735955450251,
"loss": 0.4111,
"step": 152
},
{
"epoch": 0.17,
"grad_norm": 0.04620000347495079,
"learning_rate": 0.00018719272043941882,
"loss": 0.3949,
"step": 153
},
{
"epoch": 0.17,
"grad_norm": 0.048443205654621124,
"learning_rate": 0.00018701696499977884,
"loss": 0.3856,
"step": 154
},
{
"epoch": 0.17,
"grad_norm": 0.06628945469856262,
"learning_rate": 0.00018684009548577168,
"loss": 0.4048,
"step": 155
},
{
"epoch": 0.18,
"grad_norm": 0.05339967459440231,
"learning_rate": 0.00018666211416184999,
"loss": 0.3894,
"step": 156
},
{
"epoch": 0.18,
"grad_norm": 0.04650304839015007,
"learning_rate": 0.00018648302330670082,
"loss": 0.4004,
"step": 157
},
{
"epoch": 0.18,
"grad_norm": 0.05634591728448868,
"learning_rate": 0.00018630282521321645,
"loss": 0.4033,
"step": 158
},
{
"epoch": 0.18,
"grad_norm": 0.048666685819625854,
"learning_rate": 0.00018612152218846513,
"loss": 0.399,
"step": 159
},
{
"epoch": 0.18,
"grad_norm": 0.04597772657871246,
"learning_rate": 0.0001859391165536615,
"loss": 0.3931,
"step": 160
},
{
"epoch": 0.18,
"grad_norm": 0.0526028610765934,
"learning_rate": 0.00018575561064413689,
"loss": 0.3879,
"step": 161
},
{
"epoch": 0.18,
"grad_norm": 0.05867009237408638,
"learning_rate": 0.00018557100680930937,
"loss": 0.3905,
"step": 162
},
{
"epoch": 0.18,
"grad_norm": 0.05077454075217247,
"learning_rate": 0.00018538530741265364,
"loss": 0.395,
"step": 163
},
{
"epoch": 0.18,
"grad_norm": 0.0461389385163784,
"learning_rate": 0.00018519851483167097,
"loss": 0.4016,
"step": 164
},
{
"epoch": 0.19,
"grad_norm": 0.059010252356529236,
"learning_rate": 0.00018501063145785846,
"loss": 0.3823,
"step": 165
},
{
"epoch": 0.19,
"grad_norm": 0.06437338888645172,
"learning_rate": 0.00018482165969667874,
"loss": 0.3918,
"step": 166
},
{
"epoch": 0.19,
"grad_norm": 0.04585932940244675,
"learning_rate": 0.00018463160196752887,
"loss": 0.3808,
"step": 167
},
{
"epoch": 0.19,
"grad_norm": 0.05361521616578102,
"learning_rate": 0.00018444046070370963,
"loss": 0.3858,
"step": 168
},
{
"epoch": 0.19,
"grad_norm": 0.05653822794556618,
"learning_rate": 0.00018424823835239417,
"loss": 0.3785,
"step": 169
},
{
"epoch": 0.19,
"grad_norm": 0.04439689964056015,
"learning_rate": 0.0001840549373745968,
"loss": 0.3894,
"step": 170
},
{
"epoch": 0.19,
"grad_norm": 0.05564529448747635,
"learning_rate": 0.00018386056024514137,
"loss": 0.3883,
"step": 171
},
{
"epoch": 0.19,
"grad_norm": 0.06035888195037842,
"learning_rate": 0.00018366510945262972,
"loss": 0.3855,
"step": 172
},
{
"epoch": 0.19,
"grad_norm": 0.044238511472940445,
"learning_rate": 0.0001834685874994098,
"loss": 0.3934,
"step": 173
},
{
"epoch": 0.2,
"grad_norm": 0.050235260277986526,
"learning_rate": 0.00018327099690154344,
"loss": 0.3819,
"step": 174
},
{
"epoch": 0.2,
"grad_norm": 0.051336683332920074,
"learning_rate": 0.00018307234018877434,
"loss": 0.3897,
"step": 175
},
{
"epoch": 0.2,
"grad_norm": 0.045052576810121536,
"learning_rate": 0.0001828726199044957,
"loss": 0.3822,
"step": 176
},
{
"epoch": 0.2,
"grad_norm": 0.05162283405661583,
"learning_rate": 0.00018267183860571753,
"loss": 0.4047,
"step": 177
},
{
"epoch": 0.2,
"grad_norm": 0.0488157793879509,
"learning_rate": 0.00018246999886303383,
"loss": 0.3947,
"step": 178
},
{
"epoch": 0.2,
"grad_norm": 0.04454487934708595,
"learning_rate": 0.00018226710326059006,
"loss": 0.3942,
"step": 179
},
{
"epoch": 0.2,
"grad_norm": 0.0500001423060894,
"learning_rate": 0.0001820631543960496,
"loss": 0.3826,
"step": 180
},
{
"epoch": 0.2,
"grad_norm": 0.04919865354895592,
"learning_rate": 0.00018185815488056076,
"loss": 0.3791,
"step": 181
},
{
"epoch": 0.2,
"grad_norm": 0.04547140747308731,
"learning_rate": 0.00018165210733872336,
"loss": 0.3879,
"step": 182
},
{
"epoch": 0.21,
"grad_norm": 0.044622063636779785,
"learning_rate": 0.00018144501440855496,
"loss": 0.3778,
"step": 183
},
{
"epoch": 0.21,
"grad_norm": 0.04467932507395744,
"learning_rate": 0.00018123687874145721,
"loss": 0.3994,
"step": 184
},
{
"epoch": 0.21,
"grad_norm": 0.04281982406973839,
"learning_rate": 0.0001810277030021819,
"loss": 0.3817,
"step": 185
},
{
"epoch": 0.21,
"grad_norm": 0.05303504317998886,
"learning_rate": 0.00018081748986879679,
"loss": 0.3749,
"step": 186
},
{
"epoch": 0.21,
"grad_norm": 0.046573616564273834,
"learning_rate": 0.00018060624203265134,
"loss": 0.3866,
"step": 187
},
{
"epoch": 0.21,
"grad_norm": 0.044320229440927505,
"learning_rate": 0.00018039396219834237,
"loss": 0.3732,
"step": 188
},
{
"epoch": 0.21,
"grad_norm": 0.05708359181880951,
"learning_rate": 0.00018018065308367912,
"loss": 0.3863,
"step": 189
},
{
"epoch": 0.21,
"grad_norm": 0.045029208064079285,
"learning_rate": 0.00017996631741964888,
"loss": 0.3862,
"step": 190
},
{
"epoch": 0.21,
"grad_norm": 0.055195923894643784,
"learning_rate": 0.00017975095795038165,
"loss": 0.3835,
"step": 191
},
{
"epoch": 0.22,
"grad_norm": 0.048293352127075195,
"learning_rate": 0.00017953457743311523,
"loss": 0.374,
"step": 192
},
{
"epoch": 0.22,
"grad_norm": 0.04677055403590202,
"learning_rate": 0.00017931717863815987,
"loss": 0.377,
"step": 193
},
{
"epoch": 0.22,
"grad_norm": 0.04955766722559929,
"learning_rate": 0.00017909876434886273,
"loss": 0.3808,
"step": 194
},
{
"epoch": 0.22,
"grad_norm": 0.04526973515748978,
"learning_rate": 0.00017887933736157233,
"loss": 0.3796,
"step": 195
},
{
"epoch": 0.22,
"grad_norm": 0.043622203171253204,
"learning_rate": 0.00017865890048560277,
"loss": 0.376,
"step": 196
},
{
"epoch": 0.22,
"grad_norm": 0.046581387519836426,
"learning_rate": 0.0001784374565431976,
"loss": 0.3716,
"step": 197
},
{
"epoch": 0.22,
"grad_norm": 0.04433497413992882,
"learning_rate": 0.00017821500836949386,
"loss": 0.3715,
"step": 198
},
{
"epoch": 0.22,
"grad_norm": 0.04146367311477661,
"learning_rate": 0.00017799155881248572,
"loss": 0.3809,
"step": 199
},
{
"epoch": 0.23,
"grad_norm": 0.045288585126399994,
"learning_rate": 0.000177767110732988,
"loss": 0.3885,
"step": 200
},
{
"epoch": 0.23,
"grad_norm": 0.04070120304822922,
"learning_rate": 0.00017754166700459958,
"loss": 0.3713,
"step": 201
},
{
"epoch": 0.23,
"grad_norm": 0.042820919305086136,
"learning_rate": 0.00017731523051366658,
"loss": 0.3839,
"step": 202
},
{
"epoch": 0.23,
"grad_norm": 0.04416365176439285,
"learning_rate": 0.00017708780415924539,
"loss": 0.3728,
"step": 203
},
{
"epoch": 0.23,
"grad_norm": 0.04461952671408653,
"learning_rate": 0.00017685939085306562,
"loss": 0.373,
"step": 204
},
{
"epoch": 0.23,
"grad_norm": 0.04675828292965889,
"learning_rate": 0.00017662999351949278,
"loss": 0.3711,
"step": 205
},
{
"epoch": 0.23,
"grad_norm": 0.04258272796869278,
"learning_rate": 0.00017639961509549078,
"loss": 0.3782,
"step": 206
},
{
"epoch": 0.23,
"grad_norm": 0.04638506844639778,
"learning_rate": 0.00017616825853058443,
"loss": 0.3592,
"step": 207
},
{
"epoch": 0.23,
"grad_norm": 0.04781416058540344,
"learning_rate": 0.00017593592678682166,
"loss": 0.383,
"step": 208
},
{
"epoch": 0.24,
"grad_norm": 0.04813629388809204,
"learning_rate": 0.00017570262283873552,
"loss": 0.3775,
"step": 209
},
{
"epoch": 0.24,
"grad_norm": 0.046996332705020905,
"learning_rate": 0.00017546834967330617,
"loss": 0.3815,
"step": 210
},
{
"epoch": 0.24,
"grad_norm": 0.04889595881104469,
"learning_rate": 0.00017523311028992268,
"loss": 0.3636,
"step": 211
},
{
"epoch": 0.24,
"grad_norm": 0.04298345744609833,
"learning_rate": 0.00017499690770034443,
"loss": 0.3672,
"step": 212
},
{
"epoch": 0.24,
"grad_norm": 0.04219110682606697,
"learning_rate": 0.00017475974492866278,
"loss": 0.3801,
"step": 213
},
{
"epoch": 0.24,
"grad_norm": 0.051573265343904495,
"learning_rate": 0.00017452162501126227,
"loss": 0.3778,
"step": 214
},
{
"epoch": 0.24,
"grad_norm": 0.048954349011182785,
"learning_rate": 0.00017428255099678167,
"loss": 0.3849,
"step": 215
},
{
"epoch": 0.24,
"grad_norm": 0.042610183358192444,
"learning_rate": 0.0001740425259460751,
"loss": 0.3682,
"step": 216
},
{
"epoch": 0.24,
"grad_norm": 0.04517417773604393,
"learning_rate": 0.00017380155293217264,
"loss": 0.3827,
"step": 217
},
{
"epoch": 0.25,
"grad_norm": 0.04968888312578201,
"learning_rate": 0.00017355963504024123,
"loss": 0.3821,
"step": 218
},
{
"epoch": 0.25,
"grad_norm": 0.051313381642103195,
"learning_rate": 0.0001733167753675449,
"loss": 0.381,
"step": 219
},
{
"epoch": 0.25,
"grad_norm": 0.044351302087306976,
"learning_rate": 0.0001730729770234054,
"loss": 0.381,
"step": 220
},
{
"epoch": 0.25,
"grad_norm": 0.03970547392964363,
"learning_rate": 0.00017282824312916218,
"loss": 0.3698,
"step": 221
},
{
"epoch": 0.25,
"grad_norm": 0.04822370782494545,
"learning_rate": 0.00017258257681813244,
"loss": 0.3838,
"step": 222
},
{
"epoch": 0.25,
"grad_norm": 0.045927174389362335,
"learning_rate": 0.0001723359812355712,
"loss": 0.3662,
"step": 223
},
{
"epoch": 0.25,
"grad_norm": 0.042983219027519226,
"learning_rate": 0.00017208845953863076,
"loss": 0.3574,
"step": 224
},
{
"epoch": 0.25,
"grad_norm": 0.0422198586165905,
"learning_rate": 0.0001718400148963206,
"loss": 0.3559,
"step": 225
},
{
"epoch": 0.25,
"grad_norm": 0.042307570576667786,
"learning_rate": 0.00017159065048946644,
"loss": 0.3834,
"step": 226
},
{
"epoch": 0.26,
"grad_norm": 0.04701109230518341,
"learning_rate": 0.0001713403695106698,
"loss": 0.3718,
"step": 227
},
{
"epoch": 0.26,
"grad_norm": 0.04007503017783165,
"learning_rate": 0.00017108917516426704,
"loss": 0.3785,
"step": 228
},
{
"epoch": 0.26,
"grad_norm": 0.04560061916708946,
"learning_rate": 0.00017083707066628832,
"loss": 0.3713,
"step": 229
},
{
"epoch": 0.26,
"grad_norm": 0.04315731301903725,
"learning_rate": 0.00017058405924441636,
"loss": 0.3702,
"step": 230
},
{
"epoch": 0.26,
"grad_norm": 0.040260497480630875,
"learning_rate": 0.0001703301441379453,
"loss": 0.367,
"step": 231
},
{
"epoch": 0.26,
"grad_norm": 0.04882992431521416,
"learning_rate": 0.000170075328597739,
"loss": 0.3737,
"step": 232
},
{
"epoch": 0.26,
"grad_norm": 0.04410382732748985,
"learning_rate": 0.0001698196158861896,
"loss": 0.3625,
"step": 233
},
{
"epoch": 0.26,
"grad_norm": 0.04889338091015816,
"learning_rate": 0.00016956300927717575,
"loss": 0.3697,
"step": 234
},
{
"epoch": 0.26,
"grad_norm": 0.044603537768125534,
"learning_rate": 0.00016930551205602043,
"loss": 0.3729,
"step": 235
},
{
"epoch": 0.27,
"grad_norm": 0.0539550743997097,
"learning_rate": 0.00016904712751944931,
"loss": 0.3625,
"step": 236
},
{
"epoch": 0.27,
"grad_norm": 0.04753349721431732,
"learning_rate": 0.00016878785897554818,
"loss": 0.3662,
"step": 237
},
{
"epoch": 0.27,
"grad_norm": 0.04425463080406189,
"learning_rate": 0.0001685277097437208,
"loss": 0.3595,
"step": 238
},
{
"epoch": 0.27,
"grad_norm": 0.04160892590880394,
"learning_rate": 0.0001682666831546463,
"loss": 0.3679,
"step": 239
},
{
"epoch": 0.27,
"grad_norm": 0.04600003361701965,
"learning_rate": 0.0001680047825502366,
"loss": 0.3702,
"step": 240
},
{
"epoch": 0.27,
"grad_norm": 0.03887678310275078,
"learning_rate": 0.00016774201128359357,
"loss": 0.3633,
"step": 241
},
{
"epoch": 0.27,
"grad_norm": 0.0477156862616539,
"learning_rate": 0.00016747837271896622,
"loss": 0.3702,
"step": 242
},
{
"epoch": 0.27,
"grad_norm": 0.04244072362780571,
"learning_rate": 0.00016721387023170737,
"loss": 0.3668,
"step": 243
},
{
"epoch": 0.27,
"grad_norm": 0.04049496725201607,
"learning_rate": 0.0001669485072082308,
"loss": 0.3785,
"step": 244
},
{
"epoch": 0.28,
"grad_norm": 0.04432998597621918,
"learning_rate": 0.00016668228704596756,
"loss": 0.3703,
"step": 245
},
{
"epoch": 0.28,
"grad_norm": 0.0432085320353508,
"learning_rate": 0.00016641521315332265,
"loss": 0.3615,
"step": 246
},
{
"epoch": 0.28,
"grad_norm": 0.03820549696683884,
"learning_rate": 0.00016614728894963135,
"loss": 0.3483,
"step": 247
},
{
"epoch": 0.28,
"grad_norm": 0.04436295107007027,
"learning_rate": 0.00016587851786511543,
"loss": 0.3661,
"step": 248
},
{
"epoch": 0.28,
"grad_norm": 0.04371733218431473,
"learning_rate": 0.00016560890334083926,
"loss": 0.3503,
"step": 249
},
{
"epoch": 0.28,
"grad_norm": 0.039205193519592285,
"learning_rate": 0.00016533844882866568,
"loss": 0.3482,
"step": 250
},
{
"epoch": 0.28,
"grad_norm": 0.04308384284377098,
"learning_rate": 0.00016506715779121187,
"loss": 0.373,
"step": 251
},
{
"epoch": 0.28,
"grad_norm": 0.040143441408872604,
"learning_rate": 0.00016479503370180507,
"loss": 0.3609,
"step": 252
},
{
"epoch": 0.28,
"grad_norm": 0.03845199570059776,
"learning_rate": 0.000164522080044438,
"loss": 0.3644,
"step": 253
},
{
"epoch": 0.29,
"grad_norm": 0.039730221033096313,
"learning_rate": 0.00016424830031372425,
"loss": 0.3514,
"step": 254
},
{
"epoch": 0.29,
"grad_norm": 0.04021477699279785,
"learning_rate": 0.00016397369801485366,
"loss": 0.3566,
"step": 255
},
{
"epoch": 0.29,
"grad_norm": 0.03929613530635834,
"learning_rate": 0.00016369827666354745,
"loss": 0.3649,
"step": 256
},
{
"epoch": 0.29,
"grad_norm": 0.040966227650642395,
"learning_rate": 0.0001634220397860129,
"loss": 0.3661,
"step": 257
},
{
"epoch": 0.29,
"grad_norm": 0.036787159740924835,
"learning_rate": 0.0001631449909188987,
"loss": 0.3572,
"step": 258
},
{
"epoch": 0.29,
"grad_norm": 0.039864350110292435,
"learning_rate": 0.00016286713360924918,
"loss": 0.3593,
"step": 259
},
{
"epoch": 0.29,
"grad_norm": 0.039622630923986435,
"learning_rate": 0.00016258847141445928,
"loss": 0.3711,
"step": 260
},
{
"epoch": 0.29,
"grad_norm": 0.03840857744216919,
"learning_rate": 0.00016230900790222878,
"loss": 0.3537,
"step": 261
},
{
"epoch": 0.29,
"grad_norm": 0.03800921142101288,
"learning_rate": 0.00016202874665051674,
"loss": 0.3662,
"step": 262
},
{
"epoch": 0.3,
"grad_norm": 0.03894530236721039,
"learning_rate": 0.0001617476912474956,
"loss": 0.3633,
"step": 263
},
{
"epoch": 0.3,
"grad_norm": 0.04486812278628349,
"learning_rate": 0.00016146584529150526,
"loss": 0.3594,
"step": 264
},
{
"epoch": 0.3,
"grad_norm": 0.0394977331161499,
"learning_rate": 0.00016118321239100712,
"loss": 0.3473,
"step": 265
},
{
"epoch": 0.3,
"grad_norm": 0.03960977867245674,
"learning_rate": 0.0001608997961645377,
"loss": 0.363,
"step": 266
},
{
"epoch": 0.3,
"grad_norm": 0.04547852650284767,
"learning_rate": 0.00016061560024066248,
"loss": 0.3698,
"step": 267
}
],
"logging_steps": 1,
"max_steps": 888,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 89,
"total_flos": 5.32595180788777e+18,
"train_batch_size": 14,
"trial_name": null,
"trial_params": null
}