{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6954505940307157,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023181686467690526,
"grad_norm": 28.501070022583008,
"learning_rate": 2e-05,
"loss": 2.6862,
"step": 1
},
{
"epoch": 0.004636337293538105,
"grad_norm": 2.4246978759765625,
"learning_rate": 1.998453209590101e-05,
"loss": 1.2503,
"step": 2
},
{
"epoch": 0.006954505940307157,
"grad_norm": 8.621129989624023,
"learning_rate": 1.996906419180201e-05,
"loss": 1.9701,
"step": 3
},
{
"epoch": 0.00927267458707621,
"grad_norm": 2.628929853439331,
"learning_rate": 1.995359628770302e-05,
"loss": 1.4985,
"step": 4
},
{
"epoch": 0.011590843233845263,
"grad_norm": 5.25876522064209,
"learning_rate": 1.993812838360402e-05,
"loss": 1.3525,
"step": 5
},
{
"epoch": 0.013909011880614315,
"grad_norm": 1.959221363067627,
"learning_rate": 1.992266047950503e-05,
"loss": 1.2634,
"step": 6
},
{
"epoch": 0.016227180527383367,
"grad_norm": 4.436636447906494,
"learning_rate": 1.9907192575406035e-05,
"loss": 1.3113,
"step": 7
},
{
"epoch": 0.01854534917415242,
"grad_norm": 2.8270750045776367,
"learning_rate": 1.989172467130704e-05,
"loss": 1.2802,
"step": 8
},
{
"epoch": 0.02086351782092147,
"grad_norm": 1.8319437503814697,
"learning_rate": 1.9876256767208045e-05,
"loss": 1.2364,
"step": 9
},
{
"epoch": 0.023181686467690525,
"grad_norm": 1.2635818719863892,
"learning_rate": 1.986078886310905e-05,
"loss": 1.2281,
"step": 10
},
{
"epoch": 0.025499855114459576,
"grad_norm": 1.3797353506088257,
"learning_rate": 1.9845320959010055e-05,
"loss": 1.2345,
"step": 11
},
{
"epoch": 0.02781802376122863,
"grad_norm": 1.020139455795288,
"learning_rate": 1.982985305491106e-05,
"loss": 1.2106,
"step": 12
},
{
"epoch": 0.030136192407997683,
"grad_norm": 1.2911367416381836,
"learning_rate": 1.9814385150812065e-05,
"loss": 1.2277,
"step": 13
},
{
"epoch": 0.032454361054766734,
"grad_norm": 1.878492832183838,
"learning_rate": 1.979891724671307e-05,
"loss": 1.1971,
"step": 14
},
{
"epoch": 0.034772529701535784,
"grad_norm": 1.455005407333374,
"learning_rate": 1.978344934261408e-05,
"loss": 1.1909,
"step": 15
},
{
"epoch": 0.03709069834830484,
"grad_norm": 1.5538443326950073,
"learning_rate": 1.976798143851508e-05,
"loss": 1.227,
"step": 16
},
{
"epoch": 0.03940886699507389,
"grad_norm": 1.4034634828567505,
"learning_rate": 1.975251353441609e-05,
"loss": 1.1373,
"step": 17
},
{
"epoch": 0.04172703564184294,
"grad_norm": 1.3893576860427856,
"learning_rate": 1.973704563031709e-05,
"loss": 1.178,
"step": 18
},
{
"epoch": 0.044045204288612,
"grad_norm": 1.352433204650879,
"learning_rate": 1.97215777262181e-05,
"loss": 1.163,
"step": 19
},
{
"epoch": 0.04636337293538105,
"grad_norm": 1.179675579071045,
"learning_rate": 1.9706109822119105e-05,
"loss": 1.1223,
"step": 20
},
{
"epoch": 0.0486815415821501,
"grad_norm": 1.208991289138794,
"learning_rate": 1.9690641918020112e-05,
"loss": 1.1867,
"step": 21
},
{
"epoch": 0.05099971022891915,
"grad_norm": 0.8430147171020508,
"learning_rate": 1.9675174013921115e-05,
"loss": 1.1488,
"step": 22
},
{
"epoch": 0.05331787887568821,
"grad_norm": 0.7868801355361938,
"learning_rate": 1.9659706109822122e-05,
"loss": 1.1035,
"step": 23
},
{
"epoch": 0.05563604752245726,
"grad_norm": 0.9440180063247681,
"learning_rate": 1.9644238205723125e-05,
"loss": 1.1582,
"step": 24
},
{
"epoch": 0.05795421616922631,
"grad_norm": 1.0409374237060547,
"learning_rate": 1.9628770301624132e-05,
"loss": 1.1304,
"step": 25
},
{
"epoch": 0.06027238481599537,
"grad_norm": 1.4376304149627686,
"learning_rate": 1.961330239752514e-05,
"loss": 1.1535,
"step": 26
},
{
"epoch": 0.06259055346276442,
"grad_norm": 0.7775300741195679,
"learning_rate": 1.9597834493426142e-05,
"loss": 1.0941,
"step": 27
},
{
"epoch": 0.06490872210953347,
"grad_norm": 1.294155478477478,
"learning_rate": 1.958236658932715e-05,
"loss": 1.1859,
"step": 28
},
{
"epoch": 0.06722689075630252,
"grad_norm": 1.0435543060302734,
"learning_rate": 1.9566898685228152e-05,
"loss": 1.147,
"step": 29
},
{
"epoch": 0.06954505940307157,
"grad_norm": 1.1211307048797607,
"learning_rate": 1.955143078112916e-05,
"loss": 1.1148,
"step": 30
},
{
"epoch": 0.07186322804984063,
"grad_norm": 0.9777933955192566,
"learning_rate": 1.9535962877030165e-05,
"loss": 1.1266,
"step": 31
},
{
"epoch": 0.07418139669660968,
"grad_norm": 0.9920445680618286,
"learning_rate": 1.952049497293117e-05,
"loss": 1.1535,
"step": 32
},
{
"epoch": 0.07649956534337873,
"grad_norm": 0.7202315330505371,
"learning_rate": 1.9505027068832175e-05,
"loss": 1.139,
"step": 33
},
{
"epoch": 0.07881773399014778,
"grad_norm": 0.6207343339920044,
"learning_rate": 1.9489559164733182e-05,
"loss": 1.1105,
"step": 34
},
{
"epoch": 0.08113590263691683,
"grad_norm": 1.078873634338379,
"learning_rate": 1.9474091260634185e-05,
"loss": 1.1407,
"step": 35
},
{
"epoch": 0.08345407128368589,
"grad_norm": 0.9423937201499939,
"learning_rate": 1.9458623356535192e-05,
"loss": 1.145,
"step": 36
},
{
"epoch": 0.08577223993045494,
"grad_norm": 0.8272204399108887,
"learning_rate": 1.9443155452436195e-05,
"loss": 1.1141,
"step": 37
},
{
"epoch": 0.088090408577224,
"grad_norm": 0.9147409796714783,
"learning_rate": 1.9427687548337202e-05,
"loss": 1.1113,
"step": 38
},
{
"epoch": 0.09040857722399305,
"grad_norm": 1.4252681732177734,
"learning_rate": 1.941221964423821e-05,
"loss": 1.1633,
"step": 39
},
{
"epoch": 0.0927267458707621,
"grad_norm": 0.8701033592224121,
"learning_rate": 1.9396751740139212e-05,
"loss": 1.1243,
"step": 40
},
{
"epoch": 0.09504491451753115,
"grad_norm": 0.9681833386421204,
"learning_rate": 1.938128383604022e-05,
"loss": 1.1518,
"step": 41
},
{
"epoch": 0.0973630831643002,
"grad_norm": 1.1157395839691162,
"learning_rate": 1.9365815931941222e-05,
"loss": 1.1178,
"step": 42
},
{
"epoch": 0.09968125181106925,
"grad_norm": 0.7797672152519226,
"learning_rate": 1.935034802784223e-05,
"loss": 1.0887,
"step": 43
},
{
"epoch": 0.1019994204578383,
"grad_norm": 1.3890780210494995,
"learning_rate": 1.9334880123743235e-05,
"loss": 1.1307,
"step": 44
},
{
"epoch": 0.10431758910460737,
"grad_norm": 0.8613622784614563,
"learning_rate": 1.9319412219644242e-05,
"loss": 1.1662,
"step": 45
},
{
"epoch": 0.10663575775137642,
"grad_norm": 1.0830810070037842,
"learning_rate": 1.9303944315545245e-05,
"loss": 1.1796,
"step": 46
},
{
"epoch": 0.10895392639814547,
"grad_norm": 0.8859057426452637,
"learning_rate": 1.9288476411446252e-05,
"loss": 1.1584,
"step": 47
},
{
"epoch": 0.11127209504491452,
"grad_norm": 0.6870171427726746,
"learning_rate": 1.9273008507347255e-05,
"loss": 1.1035,
"step": 48
},
{
"epoch": 0.11359026369168357,
"grad_norm": 1.181731104850769,
"learning_rate": 1.9257540603248262e-05,
"loss": 1.1241,
"step": 49
},
{
"epoch": 0.11590843233845262,
"grad_norm": 1.8238871097564697,
"learning_rate": 1.924207269914927e-05,
"loss": 1.2359,
"step": 50
},
{
"epoch": 0.11822660098522167,
"grad_norm": 0.9966158270835876,
"learning_rate": 1.9226604795050272e-05,
"loss": 1.1387,
"step": 51
},
{
"epoch": 0.12054476963199073,
"grad_norm": 0.9722117185592651,
"learning_rate": 1.921113689095128e-05,
"loss": 1.1725,
"step": 52
},
{
"epoch": 0.12286293827875978,
"grad_norm": 1.1024895906448364,
"learning_rate": 1.9195668986852282e-05,
"loss": 1.1424,
"step": 53
},
{
"epoch": 0.12518110692552883,
"grad_norm": 1.157772421836853,
"learning_rate": 1.918020108275329e-05,
"loss": 1.1627,
"step": 54
},
{
"epoch": 0.12749927557229787,
"grad_norm": 1.0060945749282837,
"learning_rate": 1.9164733178654292e-05,
"loss": 1.1714,
"step": 55
},
{
"epoch": 0.12981744421906694,
"grad_norm": 1.3089208602905273,
"learning_rate": 1.91492652745553e-05,
"loss": 1.1394,
"step": 56
},
{
"epoch": 0.132135612865836,
"grad_norm": 1.0314549207687378,
"learning_rate": 1.9133797370456305e-05,
"loss": 1.1214,
"step": 57
},
{
"epoch": 0.13445378151260504,
"grad_norm": 0.848561704158783,
"learning_rate": 1.9118329466357312e-05,
"loss": 1.1364,
"step": 58
},
{
"epoch": 0.1367719501593741,
"grad_norm": 1.1316096782684326,
"learning_rate": 1.9102861562258315e-05,
"loss": 1.0673,
"step": 59
},
{
"epoch": 0.13909011880614314,
"grad_norm": 0.910849928855896,
"learning_rate": 1.9087393658159322e-05,
"loss": 1.0848,
"step": 60
},
{
"epoch": 0.1414082874529122,
"grad_norm": 1.2117191553115845,
"learning_rate": 1.9071925754060325e-05,
"loss": 1.2018,
"step": 61
},
{
"epoch": 0.14372645609968127,
"grad_norm": 1.3396297693252563,
"learning_rate": 1.9056457849961332e-05,
"loss": 1.1773,
"step": 62
},
{
"epoch": 0.1460446247464503,
"grad_norm": 1.0479182004928589,
"learning_rate": 1.904098994586234e-05,
"loss": 1.1557,
"step": 63
},
{
"epoch": 0.14836279339321937,
"grad_norm": 1.0438235998153687,
"learning_rate": 1.9025522041763342e-05,
"loss": 1.0985,
"step": 64
},
{
"epoch": 0.1506809620399884,
"grad_norm": 1.3567838668823242,
"learning_rate": 1.901005413766435e-05,
"loss": 1.0874,
"step": 65
},
{
"epoch": 0.15299913068675747,
"grad_norm": 1.0052990913391113,
"learning_rate": 1.8994586233565352e-05,
"loss": 1.1008,
"step": 66
},
{
"epoch": 0.1553172993335265,
"grad_norm": 1.06718111038208,
"learning_rate": 1.897911832946636e-05,
"loss": 1.1004,
"step": 67
},
{
"epoch": 0.15763546798029557,
"grad_norm": 1.326567530632019,
"learning_rate": 1.8963650425367365e-05,
"loss": 1.1033,
"step": 68
},
{
"epoch": 0.15995363662706463,
"grad_norm": 1.1070104837417603,
"learning_rate": 1.894818252126837e-05,
"loss": 1.1287,
"step": 69
},
{
"epoch": 0.16227180527383367,
"grad_norm": 1.0842565298080444,
"learning_rate": 1.8932714617169375e-05,
"loss": 1.1005,
"step": 70
},
{
"epoch": 0.16458997392060273,
"grad_norm": 0.9498984813690186,
"learning_rate": 1.8917246713070382e-05,
"loss": 1.0714,
"step": 71
},
{
"epoch": 0.16690814256737177,
"grad_norm": 1.0374786853790283,
"learning_rate": 1.8901778808971385e-05,
"loss": 1.1789,
"step": 72
},
{
"epoch": 0.16922631121414083,
"grad_norm": 0.8770291209220886,
"learning_rate": 1.8886310904872392e-05,
"loss": 1.087,
"step": 73
},
{
"epoch": 0.17154447986090987,
"grad_norm": 1.4603461027145386,
"learning_rate": 1.88708430007734e-05,
"loss": 1.1119,
"step": 74
},
{
"epoch": 0.17386264850767894,
"grad_norm": 1.0779688358306885,
"learning_rate": 1.8855375096674402e-05,
"loss": 1.1346,
"step": 75
},
{
"epoch": 0.176180817154448,
"grad_norm": 1.1575367450714111,
"learning_rate": 1.883990719257541e-05,
"loss": 1.043,
"step": 76
},
{
"epoch": 0.17849898580121704,
"grad_norm": 0.991324245929718,
"learning_rate": 1.8824439288476412e-05,
"loss": 1.0807,
"step": 77
},
{
"epoch": 0.1808171544479861,
"grad_norm": 1.2354373931884766,
"learning_rate": 1.880897138437742e-05,
"loss": 1.0485,
"step": 78
},
{
"epoch": 0.18313532309475514,
"grad_norm": 1.3966253995895386,
"learning_rate": 1.8793503480278422e-05,
"loss": 1.0349,
"step": 79
},
{
"epoch": 0.1854534917415242,
"grad_norm": 1.1162493228912354,
"learning_rate": 1.877803557617943e-05,
"loss": 1.1038,
"step": 80
},
{
"epoch": 0.18777166038829324,
"grad_norm": 1.117507815361023,
"learning_rate": 1.8762567672080435e-05,
"loss": 1.0844,
"step": 81
},
{
"epoch": 0.1900898290350623,
"grad_norm": 1.083954930305481,
"learning_rate": 1.8747099767981442e-05,
"loss": 1.1057,
"step": 82
},
{
"epoch": 0.19240799768183137,
"grad_norm": 1.0554256439208984,
"learning_rate": 1.8731631863882445e-05,
"loss": 1.0824,
"step": 83
},
{
"epoch": 0.1947261663286004,
"grad_norm": 1.1306427717208862,
"learning_rate": 1.8716163959783452e-05,
"loss": 1.0552,
"step": 84
},
{
"epoch": 0.19704433497536947,
"grad_norm": 1.1089762449264526,
"learning_rate": 1.8700696055684455e-05,
"loss": 1.0374,
"step": 85
},
{
"epoch": 0.1993625036221385,
"grad_norm": 1.0922352075576782,
"learning_rate": 1.8685228151585462e-05,
"loss": 1.2775,
"step": 86
},
{
"epoch": 0.20168067226890757,
"grad_norm": 4.459912300109863,
"learning_rate": 1.866976024748647e-05,
"loss": 1.061,
"step": 87
},
{
"epoch": 0.2039988409156766,
"grad_norm": 0.9974443316459656,
"learning_rate": 1.8654292343387472e-05,
"loss": 1.0759,
"step": 88
},
{
"epoch": 0.20631700956244567,
"grad_norm": 0.914336621761322,
"learning_rate": 1.863882443928848e-05,
"loss": 1.0549,
"step": 89
},
{
"epoch": 0.20863517820921473,
"grad_norm": 0.8654055595397949,
"learning_rate": 1.8623356535189482e-05,
"loss": 1.0634,
"step": 90
},
{
"epoch": 0.21095334685598377,
"grad_norm": 0.7419248819351196,
"learning_rate": 1.860788863109049e-05,
"loss": 1.0512,
"step": 91
},
{
"epoch": 0.21327151550275283,
"grad_norm": 1.4844622611999512,
"learning_rate": 1.8592420726991492e-05,
"loss": 1.0969,
"step": 92
},
{
"epoch": 0.21558968414952187,
"grad_norm": 1.29688560962677,
"learning_rate": 1.85769528228925e-05,
"loss": 1.0711,
"step": 93
},
{
"epoch": 0.21790785279629094,
"grad_norm": 1.421087622642517,
"learning_rate": 1.8561484918793505e-05,
"loss": 1.0772,
"step": 94
},
{
"epoch": 0.22022602144305997,
"grad_norm": 1.200110673904419,
"learning_rate": 1.8546017014694512e-05,
"loss": 1.1029,
"step": 95
},
{
"epoch": 0.22254419008982904,
"grad_norm": 1.025266408920288,
"learning_rate": 1.8530549110595515e-05,
"loss": 1.1009,
"step": 96
},
{
"epoch": 0.2248623587365981,
"grad_norm": 1.1353425979614258,
"learning_rate": 1.8515081206496522e-05,
"loss": 1.1109,
"step": 97
},
{
"epoch": 0.22718052738336714,
"grad_norm": 1.0217199325561523,
"learning_rate": 1.849961330239753e-05,
"loss": 1.1158,
"step": 98
},
{
"epoch": 0.2294986960301362,
"grad_norm": 0.9707551598548889,
"learning_rate": 1.8484145398298532e-05,
"loss": 1.0189,
"step": 99
},
{
"epoch": 0.23181686467690524,
"grad_norm": 0.8363978266716003,
"learning_rate": 1.846867749419954e-05,
"loss": 1.086,
"step": 100
},
{
"epoch": 0.2341350333236743,
"grad_norm": 1.1158548593521118,
"learning_rate": 1.8453209590100542e-05,
"loss": 1.0169,
"step": 101
},
{
"epoch": 0.23645320197044334,
"grad_norm": 0.8453333973884583,
"learning_rate": 1.843774168600155e-05,
"loss": 1.0584,
"step": 102
},
{
"epoch": 0.2387713706172124,
"grad_norm": 1.572540044784546,
"learning_rate": 1.8422273781902552e-05,
"loss": 1.105,
"step": 103
},
{
"epoch": 0.24108953926398147,
"grad_norm": 0.9463809132575989,
"learning_rate": 1.840680587780356e-05,
"loss": 1.0552,
"step": 104
},
{
"epoch": 0.2434077079107505,
"grad_norm": 0.8801397681236267,
"learning_rate": 1.8391337973704565e-05,
"loss": 1.0615,
"step": 105
},
{
"epoch": 0.24572587655751957,
"grad_norm": 1.251951813697815,
"learning_rate": 1.837587006960557e-05,
"loss": 1.1217,
"step": 106
},
{
"epoch": 0.2480440452042886,
"grad_norm": 1.297305703163147,
"learning_rate": 1.8360402165506575e-05,
"loss": 1.1102,
"step": 107
},
{
"epoch": 0.25036221385105767,
"grad_norm": 0.9023735523223877,
"learning_rate": 1.8344934261407582e-05,
"loss": 1.0786,
"step": 108
},
{
"epoch": 0.2526803824978267,
"grad_norm": 0.7729614973068237,
"learning_rate": 1.8329466357308585e-05,
"loss": 1.0636,
"step": 109
},
{
"epoch": 0.25499855114459574,
"grad_norm": 1.458217978477478,
"learning_rate": 1.8313998453209592e-05,
"loss": 1.0998,
"step": 110
},
{
"epoch": 0.25731671979136483,
"grad_norm": 1.295067548751831,
"learning_rate": 1.82985305491106e-05,
"loss": 1.0756,
"step": 111
},
{
"epoch": 0.25963488843813387,
"grad_norm": 0.7502389550209045,
"learning_rate": 1.8283062645011602e-05,
"loss": 1.048,
"step": 112
},
{
"epoch": 0.2619530570849029,
"grad_norm": 0.7939056158065796,
"learning_rate": 1.826759474091261e-05,
"loss": 1.1759,
"step": 113
},
{
"epoch": 0.264271225731672,
"grad_norm": 0.8996245861053467,
"learning_rate": 1.8252126836813612e-05,
"loss": 1.0799,
"step": 114
},
{
"epoch": 0.26658939437844104,
"grad_norm": 1.1998958587646484,
"learning_rate": 1.823665893271462e-05,
"loss": 1.0622,
"step": 115
},
{
"epoch": 0.2689075630252101,
"grad_norm": 0.7442781329154968,
"learning_rate": 1.8221191028615622e-05,
"loss": 1.1025,
"step": 116
},
{
"epoch": 0.2712257316719791,
"grad_norm": 1.0310958623886108,
"learning_rate": 1.820572312451663e-05,
"loss": 1.0557,
"step": 117
},
{
"epoch": 0.2735439003187482,
"grad_norm": 1.071614384651184,
"learning_rate": 1.8190255220417635e-05,
"loss": 1.1166,
"step": 118
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.8248165845870972,
"learning_rate": 1.8174787316318642e-05,
"loss": 1.1389,
"step": 119
},
{
"epoch": 0.2781802376122863,
"grad_norm": 0.9798437356948853,
"learning_rate": 1.8159319412219645e-05,
"loss": 1.0331,
"step": 120
},
{
"epoch": 0.28049840625905537,
"grad_norm": 0.7473148703575134,
"learning_rate": 1.8143851508120652e-05,
"loss": 1.0211,
"step": 121
},
{
"epoch": 0.2828165749058244,
"grad_norm": 1.3669893741607666,
"learning_rate": 1.812838360402166e-05,
"loss": 1.1495,
"step": 122
},
{
"epoch": 0.28513474355259344,
"grad_norm": 0.7082749009132385,
"learning_rate": 1.8112915699922662e-05,
"loss": 1.0244,
"step": 123
},
{
"epoch": 0.28745291219936253,
"grad_norm": 1.2198262214660645,
"learning_rate": 1.809744779582367e-05,
"loss": 1.1052,
"step": 124
},
{
"epoch": 0.28977108084613157,
"grad_norm": 2.2517826557159424,
"learning_rate": 1.8081979891724672e-05,
"loss": 1.0324,
"step": 125
},
{
"epoch": 0.2920892494929006,
"grad_norm": 0.9831060767173767,
"learning_rate": 1.806651198762568e-05,
"loss": 1.1176,
"step": 126
},
{
"epoch": 0.29440741813966964,
"grad_norm": 0.8122763633728027,
"learning_rate": 1.8051044083526682e-05,
"loss": 1.0238,
"step": 127
},
{
"epoch": 0.29672558678643873,
"grad_norm": 1.0002597570419312,
"learning_rate": 1.803557617942769e-05,
"loss": 1.0847,
"step": 128
},
{
"epoch": 0.29904375543320777,
"grad_norm": 0.8262125253677368,
"learning_rate": 1.8020108275328692e-05,
"loss": 1.1149,
"step": 129
},
{
"epoch": 0.3013619240799768,
"grad_norm": 1.2185602188110352,
"learning_rate": 1.80046403712297e-05,
"loss": 1.0688,
"step": 130
},
{
"epoch": 0.3036800927267459,
"grad_norm": 1.2163466215133667,
"learning_rate": 1.7989172467130705e-05,
"loss": 1.0366,
"step": 131
},
{
"epoch": 0.30599826137351493,
"grad_norm": 0.6977062225341797,
"learning_rate": 1.7973704563031712e-05,
"loss": 1.0853,
"step": 132
},
{
"epoch": 0.30831643002028397,
"grad_norm": 0.8096152544021606,
"learning_rate": 1.7958236658932715e-05,
"loss": 1.0104,
"step": 133
},
{
"epoch": 0.310634598667053,
"grad_norm": 0.9192125201225281,
"learning_rate": 1.7942768754833722e-05,
"loss": 1.0596,
"step": 134
},
{
"epoch": 0.3129527673138221,
"grad_norm": 0.7752702236175537,
"learning_rate": 1.792730085073473e-05,
"loss": 1.1281,
"step": 135
},
{
"epoch": 0.31527093596059114,
"grad_norm": 1.0385123491287231,
"learning_rate": 1.7911832946635732e-05,
"loss": 1.0857,
"step": 136
},
{
"epoch": 0.3175891046073602,
"grad_norm": 2.8603484630584717,
"learning_rate": 1.789636504253674e-05,
"loss": 1.1425,
"step": 137
},
{
"epoch": 0.31990727325412927,
"grad_norm": 1.104943037033081,
"learning_rate": 1.7880897138437742e-05,
"loss": 1.1464,
"step": 138
},
{
"epoch": 0.3222254419008983,
"grad_norm": 1.7259231805801392,
"learning_rate": 1.786542923433875e-05,
"loss": 1.0472,
"step": 139
},
{
"epoch": 0.32454361054766734,
"grad_norm": 0.9174676537513733,
"learning_rate": 1.7849961330239752e-05,
"loss": 1.0523,
"step": 140
},
{
"epoch": 0.3268617791944364,
"grad_norm": 0.9572336673736572,
"learning_rate": 1.783449342614076e-05,
"loss": 1.0327,
"step": 141
},
{
"epoch": 0.32917994784120547,
"grad_norm": 0.6567716598510742,
"learning_rate": 1.7819025522041766e-05,
"loss": 1.0979,
"step": 142
},
{
"epoch": 0.3314981164879745,
"grad_norm": 1.8695584535598755,
"learning_rate": 1.780355761794277e-05,
"loss": 1.0659,
"step": 143
},
{
"epoch": 0.33381628513474354,
"grad_norm": 0.8160743713378906,
"learning_rate": 1.7788089713843776e-05,
"loss": 1.0746,
"step": 144
},
{
"epoch": 0.33613445378151263,
"grad_norm": 0.7144508957862854,
"learning_rate": 1.7772621809744782e-05,
"loss": 1.002,
"step": 145
},
{
"epoch": 0.33845262242828167,
"grad_norm": 0.8914051055908203,
"learning_rate": 1.7757153905645786e-05,
"loss": 1.0334,
"step": 146
},
{
"epoch": 0.3407707910750507,
"grad_norm": 1.1182371377944946,
"learning_rate": 1.7741686001546792e-05,
"loss": 1.0535,
"step": 147
},
{
"epoch": 0.34308895972181974,
"grad_norm": 0.6911827325820923,
"learning_rate": 1.77262180974478e-05,
"loss": 1.0783,
"step": 148
},
{
"epoch": 0.34540712836858883,
"grad_norm": 1.141491413116455,
"learning_rate": 1.7710750193348802e-05,
"loss": 1.0522,
"step": 149
},
{
"epoch": 0.34772529701535787,
"grad_norm": 1.103798747062683,
"learning_rate": 1.769528228924981e-05,
"loss": 1.1178,
"step": 150
},
{
"epoch": 0.3500434656621269,
"grad_norm": 1.1297893524169922,
"learning_rate": 1.7679814385150812e-05,
"loss": 1.19,
"step": 151
},
{
"epoch": 0.352361634308896,
"grad_norm": 0.8850527405738831,
"learning_rate": 1.766434648105182e-05,
"loss": 0.9929,
"step": 152
},
{
"epoch": 0.35467980295566504,
"grad_norm": 1.096604585647583,
"learning_rate": 1.7648878576952822e-05,
"loss": 1.0992,
"step": 153
},
{
"epoch": 0.35699797160243407,
"grad_norm": 0.9644438624382019,
"learning_rate": 1.763341067285383e-05,
"loss": 1.0421,
"step": 154
},
{
"epoch": 0.3593161402492031,
"grad_norm": 1.0480139255523682,
"learning_rate": 1.7617942768754836e-05,
"loss": 1.0495,
"step": 155
},
{
"epoch": 0.3616343088959722,
"grad_norm": 3.11247181892395,
"learning_rate": 1.7602474864655842e-05,
"loss": 1.0407,
"step": 156
},
{
"epoch": 0.36395247754274124,
"grad_norm": 0.9178793430328369,
"learning_rate": 1.7587006960556846e-05,
"loss": 1.0208,
"step": 157
},
{
"epoch": 0.3662706461895103,
"grad_norm": 1.0008949041366577,
"learning_rate": 1.7571539056457852e-05,
"loss": 1.036,
"step": 158
},
{
"epoch": 0.36858881483627937,
"grad_norm": 0.9334746599197388,
"learning_rate": 1.755607115235886e-05,
"loss": 1.1097,
"step": 159
},
{
"epoch": 0.3709069834830484,
"grad_norm": 0.9296855330467224,
"learning_rate": 1.7540603248259862e-05,
"loss": 1.0597,
"step": 160
},
{
"epoch": 0.37322515212981744,
"grad_norm": 0.7528197765350342,
"learning_rate": 1.752513534416087e-05,
"loss": 1.0252,
"step": 161
},
{
"epoch": 0.3755433207765865,
"grad_norm": 0.7995203733444214,
"learning_rate": 1.7509667440061872e-05,
"loss": 1.0446,
"step": 162
},
{
"epoch": 0.37786148942335557,
"grad_norm": 0.7773709297180176,
"learning_rate": 1.749419953596288e-05,
"loss": 1.0527,
"step": 163
},
{
"epoch": 0.3801796580701246,
"grad_norm": 0.9108691811561584,
"learning_rate": 1.7478731631863882e-05,
"loss": 1.0571,
"step": 164
},
{
"epoch": 0.38249782671689364,
"grad_norm": 0.8565751910209656,
"learning_rate": 1.746326372776489e-05,
"loss": 1.1003,
"step": 165
},
{
"epoch": 0.38481599536366273,
"grad_norm": 1.3683419227600098,
"learning_rate": 1.7447795823665896e-05,
"loss": 1.0697,
"step": 166
},
{
"epoch": 0.38713416401043177,
"grad_norm": 0.6560070514678955,
"learning_rate": 1.74323279195669e-05,
"loss": 0.9556,
"step": 167
},
{
"epoch": 0.3894523326572008,
"grad_norm": 1.370263934135437,
"learning_rate": 1.7416860015467906e-05,
"loss": 1.0848,
"step": 168
},
{
"epoch": 0.39177050130396984,
"grad_norm": 1.013763427734375,
"learning_rate": 1.7401392111368912e-05,
"loss": 1.0643,
"step": 169
},
{
"epoch": 0.39408866995073893,
"grad_norm": 0.8194316029548645,
"learning_rate": 1.7385924207269916e-05,
"loss": 1.0708,
"step": 170
},
{
"epoch": 0.39640683859750797,
"grad_norm": 0.9241949319839478,
"learning_rate": 1.7370456303170922e-05,
"loss": 1.0502,
"step": 171
},
{
"epoch": 0.398725007244277,
"grad_norm": 0.9724448323249817,
"learning_rate": 1.735498839907193e-05,
"loss": 1.0152,
"step": 172
},
{
"epoch": 0.4010431758910461,
"grad_norm": 0.6559419631958008,
"learning_rate": 1.7339520494972932e-05,
"loss": 1.0148,
"step": 173
},
{
"epoch": 0.40336134453781514,
"grad_norm": 1.1617038249969482,
"learning_rate": 1.732405259087394e-05,
"loss": 1.0972,
"step": 174
},
{
"epoch": 0.4056795131845842,
"grad_norm": 0.7249406576156616,
"learning_rate": 1.7308584686774942e-05,
"loss": 0.9664,
"step": 175
},
{
"epoch": 0.4079976818313532,
"grad_norm": 1.2099742889404297,
"learning_rate": 1.729311678267595e-05,
"loss": 1.0314,
"step": 176
},
{
"epoch": 0.4103158504781223,
"grad_norm": 0.8690075278282166,
"learning_rate": 1.7277648878576952e-05,
"loss": 1.0789,
"step": 177
},
{
"epoch": 0.41263401912489134,
"grad_norm": 0.7662826180458069,
"learning_rate": 1.726218097447796e-05,
"loss": 1.014,
"step": 178
},
{
"epoch": 0.4149521877716604,
"grad_norm": 1.22348952293396,
"learning_rate": 1.7246713070378966e-05,
"loss": 0.9765,
"step": 179
},
{
"epoch": 0.41727035641842947,
"grad_norm": 1.0363351106643677,
"learning_rate": 1.7231245166279972e-05,
"loss": 1.0487,
"step": 180
},
{
"epoch": 0.4195885250651985,
"grad_norm": 0.8833026885986328,
"learning_rate": 1.7215777262180976e-05,
"loss": 1.0261,
"step": 181
},
{
"epoch": 0.42190669371196754,
"grad_norm": 0.8683452606201172,
"learning_rate": 1.7200309358081982e-05,
"loss": 1.0609,
"step": 182
},
{
"epoch": 0.4242248623587366,
"grad_norm": 0.8211922645568848,
"learning_rate": 1.718484145398299e-05,
"loss": 0.9942,
"step": 183
},
{
"epoch": 0.42654303100550567,
"grad_norm": 0.8936122059822083,
"learning_rate": 1.7169373549883992e-05,
"loss": 1.0591,
"step": 184
},
{
"epoch": 0.4288611996522747,
"grad_norm": 0.9455772042274475,
"learning_rate": 1.7153905645785e-05,
"loss": 1.0628,
"step": 185
},
{
"epoch": 0.43117936829904374,
"grad_norm": 1.0464543104171753,
"learning_rate": 1.7138437741686002e-05,
"loss": 0.9742,
"step": 186
},
{
"epoch": 0.43349753694581283,
"grad_norm": 1.4931954145431519,
"learning_rate": 1.712296983758701e-05,
"loss": 1.0534,
"step": 187
},
{
"epoch": 0.43581570559258187,
"grad_norm": 0.7873006463050842,
"learning_rate": 1.7107501933488012e-05,
"loss": 1.0499,
"step": 188
},
{
"epoch": 0.4381338742393509,
"grad_norm": 0.7451059222221375,
"learning_rate": 1.709203402938902e-05,
"loss": 1.0006,
"step": 189
},
{
"epoch": 0.44045204288611994,
"grad_norm": 0.8252111673355103,
"learning_rate": 1.7076566125290022e-05,
"loss": 1.057,
"step": 190
},
{
"epoch": 0.44277021153288904,
"grad_norm": 0.6444481015205383,
"learning_rate": 1.706109822119103e-05,
"loss": 1.0098,
"step": 191
},
{
"epoch": 0.44508838017965807,
"grad_norm": 0.8497568368911743,
"learning_rate": 1.7045630317092036e-05,
"loss": 1.0186,
"step": 192
},
{
"epoch": 0.4474065488264271,
"grad_norm": 1.0328199863433838,
"learning_rate": 1.7030162412993042e-05,
"loss": 1.0474,
"step": 193
},
{
"epoch": 0.4497247174731962,
"grad_norm": 0.7315878868103027,
"learning_rate": 1.7014694508894046e-05,
"loss": 0.9985,
"step": 194
},
{
"epoch": 0.45204288611996524,
"grad_norm": 1.0060752630233765,
"learning_rate": 1.6999226604795052e-05,
"loss": 1.0985,
"step": 195
},
{
"epoch": 0.4543610547667343,
"grad_norm": 0.8653793334960938,
"learning_rate": 1.698375870069606e-05,
"loss": 0.999,
"step": 196
},
{
"epoch": 0.4566792234135033,
"grad_norm": 1.0214215517044067,
"learning_rate": 1.6968290796597062e-05,
"loss": 1.0759,
"step": 197
},
{
"epoch": 0.4589973920602724,
"grad_norm": 0.7069177627563477,
"learning_rate": 1.695282289249807e-05,
"loss": 1.1569,
"step": 198
},
{
"epoch": 0.46131556070704144,
"grad_norm": 1.8065637350082397,
"learning_rate": 1.6937354988399072e-05,
"loss": 1.0398,
"step": 199
},
{
"epoch": 0.4636337293538105,
"grad_norm": 8.820870399475098,
"learning_rate": 1.692188708430008e-05,
"loss": 1.0801,
"step": 200
},
{
"epoch": 0.46595189800057957,
"grad_norm": 0.7957196235656738,
"learning_rate": 1.6906419180201082e-05,
"loss": 1.01,
"step": 201
},
{
"epoch": 0.4682700666473486,
"grad_norm": 0.9078807830810547,
"learning_rate": 1.689095127610209e-05,
"loss": 1.0068,
"step": 202
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.8792912364006042,
"learning_rate": 1.6875483372003096e-05,
"loss": 1.0169,
"step": 203
},
{
"epoch": 0.4729064039408867,
"grad_norm": 1.0289100408554077,
"learning_rate": 1.68600154679041e-05,
"loss": 0.9956,
"step": 204
},
{
"epoch": 0.47522457258765577,
"grad_norm": 2.788477897644043,
"learning_rate": 1.6844547563805106e-05,
"loss": 1.0839,
"step": 205
},
{
"epoch": 0.4775427412344248,
"grad_norm": 0.841396152973175,
"learning_rate": 1.6829079659706112e-05,
"loss": 1.0735,
"step": 206
},
{
"epoch": 0.47986090988119384,
"grad_norm": 1.1330881118774414,
"learning_rate": 1.681361175560712e-05,
"loss": 1.0486,
"step": 207
},
{
"epoch": 0.48217907852796293,
"grad_norm": 1.2185862064361572,
"learning_rate": 1.6798143851508122e-05,
"loss": 1.0962,
"step": 208
},
{
"epoch": 0.48449724717473197,
"grad_norm": 1.2216246128082275,
"learning_rate": 1.678267594740913e-05,
"loss": 1.0202,
"step": 209
},
{
"epoch": 0.486815415821501,
"grad_norm": 0.7822516560554504,
"learning_rate": 1.6767208043310132e-05,
"loss": 1.0278,
"step": 210
},
{
"epoch": 0.48913358446827004,
"grad_norm": 1.021760106086731,
"learning_rate": 1.675174013921114e-05,
"loss": 1.0626,
"step": 211
},
{
"epoch": 0.49145175311503914,
"grad_norm": 1.334328055381775,
"learning_rate": 1.6736272235112142e-05,
"loss": 1.1798,
"step": 212
},
{
"epoch": 0.4937699217618082,
"grad_norm": 0.7392475605010986,
"learning_rate": 1.672080433101315e-05,
"loss": 1.0713,
"step": 213
},
{
"epoch": 0.4960880904085772,
"grad_norm": 0.768805205821991,
"learning_rate": 1.6705336426914152e-05,
"loss": 0.9839,
"step": 214
},
{
"epoch": 0.4984062590553463,
"grad_norm": 0.7203591465950012,
"learning_rate": 1.668986852281516e-05,
"loss": 0.9926,
"step": 215
},
{
"epoch": 0.5007244277021153,
"grad_norm": 1.2835793495178223,
"learning_rate": 1.6674400618716166e-05,
"loss": 1.024,
"step": 216
},
{
"epoch": 0.5030425963488844,
"grad_norm": 0.8296486139297485,
"learning_rate": 1.6658932714617173e-05,
"loss": 1.0421,
"step": 217
},
{
"epoch": 0.5053607649956534,
"grad_norm": 0.8933680653572083,
"learning_rate": 1.6643464810518176e-05,
"loss": 0.9958,
"step": 218
},
{
"epoch": 0.5076789336424224,
"grad_norm": 0.6815921068191528,
"learning_rate": 1.6627996906419182e-05,
"loss": 1.0334,
"step": 219
},
{
"epoch": 0.5099971022891915,
"grad_norm": 0.795447051525116,
"learning_rate": 1.661252900232019e-05,
"loss": 1.0553,
"step": 220
},
{
"epoch": 0.5123152709359606,
"grad_norm": 1.1784237623214722,
"learning_rate": 1.6597061098221192e-05,
"loss": 0.9998,
"step": 221
},
{
"epoch": 0.5146334395827297,
"grad_norm": 0.9474261403083801,
"learning_rate": 1.65815931941222e-05,
"loss": 1.0704,
"step": 222
},
{
"epoch": 0.5169516082294987,
"grad_norm": 0.9175812602043152,
"learning_rate": 1.6566125290023202e-05,
"loss": 1.0779,
"step": 223
},
{
"epoch": 0.5192697768762677,
"grad_norm": 0.8006009459495544,
"learning_rate": 1.655065738592421e-05,
"loss": 1.024,
"step": 224
},
{
"epoch": 0.5215879455230368,
"grad_norm": 0.7539005875587463,
"learning_rate": 1.6535189481825212e-05,
"loss": 1.0301,
"step": 225
},
{
"epoch": 0.5239061141698058,
"grad_norm": 0.8373304009437561,
"learning_rate": 1.651972157772622e-05,
"loss": 1.0519,
"step": 226
},
{
"epoch": 0.5262242828165749,
"grad_norm": 0.7653727531433105,
"learning_rate": 1.6504253673627222e-05,
"loss": 0.9657,
"step": 227
},
{
"epoch": 0.528542451463344,
"grad_norm": 0.6552687287330627,
"learning_rate": 1.648878576952823e-05,
"loss": 1.0106,
"step": 228
},
{
"epoch": 0.530860620110113,
"grad_norm": 1.428830862045288,
"learning_rate": 1.6473317865429236e-05,
"loss": 1.0327,
"step": 229
},
{
"epoch": 0.5331787887568821,
"grad_norm": 1.0795942544937134,
"learning_rate": 1.6457849961330243e-05,
"loss": 1.0854,
"step": 230
},
{
"epoch": 0.5354969574036511,
"grad_norm": 0.5399507284164429,
"learning_rate": 1.644238205723125e-05,
"loss": 0.9932,
"step": 231
},
{
"epoch": 0.5378151260504201,
"grad_norm": 2.8251047134399414,
"learning_rate": 1.6426914153132253e-05,
"loss": 1.002,
"step": 232
},
{
"epoch": 0.5401332946971892,
"grad_norm": 0.7555001974105835,
"learning_rate": 1.641144624903326e-05,
"loss": 0.9898,
"step": 233
},
{
"epoch": 0.5424514633439582,
"grad_norm": 0.9090583324432373,
"learning_rate": 1.6395978344934263e-05,
"loss": 0.9762,
"step": 234
},
{
"epoch": 0.5447696319907274,
"grad_norm": 0.8169143199920654,
"learning_rate": 1.638051044083527e-05,
"loss": 1.0588,
"step": 235
},
{
"epoch": 0.5470878006374964,
"grad_norm": 0.7842413783073425,
"learning_rate": 1.6365042536736273e-05,
"loss": 0.9432,
"step": 236
},
{
"epoch": 0.5494059692842654,
"grad_norm": 2.24771785736084,
"learning_rate": 1.634957463263728e-05,
"loss": 1.0636,
"step": 237
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.9846341013908386,
"learning_rate": 1.6334106728538283e-05,
"loss": 0.9743,
"step": 238
},
{
"epoch": 0.5540423065778035,
"grad_norm": 0.7598584294319153,
"learning_rate": 1.631863882443929e-05,
"loss": 1.0192,
"step": 239
},
{
"epoch": 0.5563604752245725,
"grad_norm": 1.200215458869934,
"learning_rate": 1.6303170920340296e-05,
"loss": 1.1051,
"step": 240
},
{
"epoch": 0.5586786438713416,
"grad_norm": 0.8878689408302307,
"learning_rate": 1.62877030162413e-05,
"loss": 1.0622,
"step": 241
},
{
"epoch": 0.5609968125181107,
"grad_norm": 1.18966543674469,
"learning_rate": 1.6272235112142306e-05,
"loss": 1.0396,
"step": 242
},
{
"epoch": 0.5633149811648798,
"grad_norm": 0.7161230444908142,
"learning_rate": 1.6256767208043313e-05,
"loss": 1.0611,
"step": 243
},
{
"epoch": 0.5656331498116488,
"grad_norm": 0.806091845035553,
"learning_rate": 1.624129930394432e-05,
"loss": 1.0862,
"step": 244
},
{
"epoch": 0.5679513184584178,
"grad_norm": 0.9060055613517761,
"learning_rate": 1.6225831399845323e-05,
"loss": 0.9447,
"step": 245
},
{
"epoch": 0.5702694871051869,
"grad_norm": 0.6209578514099121,
"learning_rate": 1.621036349574633e-05,
"loss": 1.0291,
"step": 246
},
{
"epoch": 0.5725876557519559,
"grad_norm": 0.8875139951705933,
"learning_rate": 1.6194895591647333e-05,
"loss": 0.9596,
"step": 247
},
{
"epoch": 0.5749058243987251,
"grad_norm": 0.8894098997116089,
"learning_rate": 1.617942768754834e-05,
"loss": 1.0363,
"step": 248
},
{
"epoch": 0.5772239930454941,
"grad_norm": 0.8011065125465393,
"learning_rate": 1.6163959783449343e-05,
"loss": 0.9735,
"step": 249
},
{
"epoch": 0.5795421616922631,
"grad_norm": 1.0448037385940552,
"learning_rate": 1.614849187935035e-05,
"loss": 1.0154,
"step": 250
},
{
"epoch": 0.5818603303390322,
"grad_norm": 0.7367164492607117,
"learning_rate": 1.6133023975251353e-05,
"loss": 1.0509,
"step": 251
},
{
"epoch": 0.5841784989858012,
"grad_norm": 0.8820902705192566,
"learning_rate": 1.611755607115236e-05,
"loss": 1.074,
"step": 252
},
{
"epoch": 0.5864966676325702,
"grad_norm": 0.8512645363807678,
"learning_rate": 1.6102088167053366e-05,
"loss": 1.0478,
"step": 253
},
{
"epoch": 0.5888148362793393,
"grad_norm": 0.8832964897155762,
"learning_rate": 1.6086620262954373e-05,
"loss": 1.0502,
"step": 254
},
{
"epoch": 0.5911330049261084,
"grad_norm": 0.7311517596244812,
"learning_rate": 1.6071152358855376e-05,
"loss": 0.9904,
"step": 255
},
{
"epoch": 0.5934511735728775,
"grad_norm": 0.9509069919586182,
"learning_rate": 1.6055684454756383e-05,
"loss": 1.0628,
"step": 256
},
{
"epoch": 0.5957693422196465,
"grad_norm": 0.5056537389755249,
"learning_rate": 1.604021655065739e-05,
"loss": 0.9579,
"step": 257
},
{
"epoch": 0.5980875108664155,
"grad_norm": 0.6654573082923889,
"learning_rate": 1.6024748646558393e-05,
"loss": 0.9752,
"step": 258
},
{
"epoch": 0.6004056795131846,
"grad_norm": 0.7242197394371033,
"learning_rate": 1.60092807424594e-05,
"loss": 1.0986,
"step": 259
},
{
"epoch": 0.6027238481599536,
"grad_norm": 0.8016011118888855,
"learning_rate": 1.5993812838360403e-05,
"loss": 1.0193,
"step": 260
},
{
"epoch": 0.6050420168067226,
"grad_norm": 0.8806138038635254,
"learning_rate": 1.597834493426141e-05,
"loss": 1.0192,
"step": 261
},
{
"epoch": 0.6073601854534918,
"grad_norm": 1.1127492189407349,
"learning_rate": 1.5962877030162413e-05,
"loss": 1.1279,
"step": 262
},
{
"epoch": 0.6096783541002608,
"grad_norm": 0.8649683594703674,
"learning_rate": 1.594740912606342e-05,
"loss": 1.0059,
"step": 263
},
{
"epoch": 0.6119965227470299,
"grad_norm": 0.7879909873008728,
"learning_rate": 1.5931941221964423e-05,
"loss": 1.0077,
"step": 264
},
{
"epoch": 0.6143146913937989,
"grad_norm": 0.8802973031997681,
"learning_rate": 1.591647331786543e-05,
"loss": 1.0484,
"step": 265
},
{
"epoch": 0.6166328600405679,
"grad_norm": 1.8282607793807983,
"learning_rate": 1.5901005413766436e-05,
"loss": 0.9435,
"step": 266
},
{
"epoch": 0.618951028687337,
"grad_norm": 0.643280565738678,
"learning_rate": 1.5885537509667443e-05,
"loss": 0.948,
"step": 267
},
{
"epoch": 0.621269197334106,
"grad_norm": 0.727376401424408,
"learning_rate": 1.587006960556845e-05,
"loss": 0.9687,
"step": 268
},
{
"epoch": 0.6235873659808752,
"grad_norm": 0.9891621470451355,
"learning_rate": 1.5854601701469453e-05,
"loss": 0.995,
"step": 269
},
{
"epoch": 0.6259055346276442,
"grad_norm": 1.4208780527114868,
"learning_rate": 1.583913379737046e-05,
"loss": 1.107,
"step": 270
},
{
"epoch": 0.6282237032744132,
"grad_norm": 0.8574293851852417,
"learning_rate": 1.5823665893271463e-05,
"loss": 1.0016,
"step": 271
},
{
"epoch": 0.6305418719211823,
"grad_norm": 1.4257714748382568,
"learning_rate": 1.580819798917247e-05,
"loss": 1.3383,
"step": 272
},
{
"epoch": 0.6328600405679513,
"grad_norm": 0.7138167023658752,
"learning_rate": 1.5792730085073473e-05,
"loss": 1.0534,
"step": 273
},
{
"epoch": 0.6351782092147203,
"grad_norm": 1.5973683595657349,
"learning_rate": 1.577726218097448e-05,
"loss": 0.9784,
"step": 274
},
{
"epoch": 0.6374963778614894,
"grad_norm": 0.6794442534446716,
"learning_rate": 1.5761794276875483e-05,
"loss": 0.9809,
"step": 275
},
{
"epoch": 0.6398145465082585,
"grad_norm": 0.7616905570030212,
"learning_rate": 1.574632637277649e-05,
"loss": 0.9981,
"step": 276
},
{
"epoch": 0.6421327151550276,
"grad_norm": 1.709405541419983,
"learning_rate": 1.5730858468677496e-05,
"loss": 1.0105,
"step": 277
},
{
"epoch": 0.6444508838017966,
"grad_norm": 0.6796721816062927,
"learning_rate": 1.57153905645785e-05,
"loss": 0.9856,
"step": 278
},
{
"epoch": 0.6467690524485656,
"grad_norm": 0.7686854600906372,
"learning_rate": 1.5699922660479506e-05,
"loss": 1.0534,
"step": 279
},
{
"epoch": 0.6490872210953347,
"grad_norm": 1.0257889032363892,
"learning_rate": 1.5684454756380513e-05,
"loss": 0.9869,
"step": 280
},
{
"epoch": 0.6514053897421037,
"grad_norm": 0.7100695371627808,
"learning_rate": 1.566898685228152e-05,
"loss": 1.0433,
"step": 281
},
{
"epoch": 0.6537235583888727,
"grad_norm": 0.7201927900314331,
"learning_rate": 1.5653518948182523e-05,
"loss": 0.9854,
"step": 282
},
{
"epoch": 0.6560417270356419,
"grad_norm": 1.5743852853775024,
"learning_rate": 1.563805104408353e-05,
"loss": 1.0076,
"step": 283
},
{
"epoch": 0.6583598956824109,
"grad_norm": 0.7456634640693665,
"learning_rate": 1.5622583139984533e-05,
"loss": 1.0214,
"step": 284
},
{
"epoch": 0.66067806432918,
"grad_norm": 0.6395049691200256,
"learning_rate": 1.560711523588554e-05,
"loss": 1.0321,
"step": 285
},
{
"epoch": 0.662996232975949,
"grad_norm": 0.9406479001045227,
"learning_rate": 1.5591647331786543e-05,
"loss": 1.0387,
"step": 286
},
{
"epoch": 0.665314401622718,
"grad_norm": 0.668521523475647,
"learning_rate": 1.557617942768755e-05,
"loss": 0.9458,
"step": 287
},
{
"epoch": 0.6676325702694871,
"grad_norm": 0.8241714239120483,
"learning_rate": 1.5560711523588553e-05,
"loss": 0.994,
"step": 288
},
{
"epoch": 0.6699507389162561,
"grad_norm": 0.7906151413917542,
"learning_rate": 1.554524361948956e-05,
"loss": 1.0127,
"step": 289
},
{
"epoch": 0.6722689075630253,
"grad_norm": 3.7441999912261963,
"learning_rate": 1.5529775715390566e-05,
"loss": 1.1235,
"step": 290
},
{
"epoch": 0.6745870762097943,
"grad_norm": 0.7488934397697449,
"learning_rate": 1.5514307811291573e-05,
"loss": 1.0132,
"step": 291
},
{
"epoch": 0.6769052448565633,
"grad_norm": 0.6223219037055969,
"learning_rate": 1.5498839907192576e-05,
"loss": 0.9831,
"step": 292
},
{
"epoch": 0.6792234135033324,
"grad_norm": 0.9072495698928833,
"learning_rate": 1.5483372003093583e-05,
"loss": 0.9978,
"step": 293
},
{
"epoch": 0.6815415821501014,
"grad_norm": 1.0984013080596924,
"learning_rate": 1.546790409899459e-05,
"loss": 0.9942,
"step": 294
},
{
"epoch": 0.6838597507968704,
"grad_norm": 0.8855274319648743,
"learning_rate": 1.5452436194895593e-05,
"loss": 1.0135,
"step": 295
},
{
"epoch": 0.6861779194436395,
"grad_norm": 0.7710789442062378,
"learning_rate": 1.54369682907966e-05,
"loss": 1.034,
"step": 296
},
{
"epoch": 0.6884960880904086,
"grad_norm": 19.71253204345703,
"learning_rate": 1.5421500386697603e-05,
"loss": 0.9598,
"step": 297
},
{
"epoch": 0.6908142567371777,
"grad_norm": 0.930057942867279,
"learning_rate": 1.540603248259861e-05,
"loss": 0.9755,
"step": 298
},
{
"epoch": 0.6931324253839467,
"grad_norm": 1.0376302003860474,
"learning_rate": 1.5390564578499613e-05,
"loss": 1.0324,
"step": 299
},
{
"epoch": 0.6954505940307157,
"grad_norm": 0.8160769939422607,
"learning_rate": 1.537509667440062e-05,
"loss": 1.0805,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 1293,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0925139234599731e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}