mol-tiny-v2 / last-checkpoint / trainer_state.json
Training in progress, step 12000, checkpoint (commit 118ead0)
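The JSON below is the Trainer state saved at step 12000: a few header fields (best-checkpoint tracking, epoch, eval cadence), followed by log_history, a list of per-interval records each carrying epoch, grad_norm, learning_rate, loss, and step. A minimal sketch of how one might inspect such a file, assuming a local copy named trainer_state.json and matplotlib as an extra dependency (neither is part of the checkpoint itself):

    # Minimal sketch: load a local copy of trainer_state.json and plot the
    # training-loss curve. The filename and matplotlib are assumptions.
    import json

    import matplotlib.pyplot as plt

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Keep only training records; eval records (logged every eval_steps)
    # would lack the "loss" key.
    records = [r for r in state["log_history"] if "loss" in r]
    steps = [r["step"] for r in records]
    losses = [r["loss"] for r in records]

    plt.plot(steps, losses)
    plt.xlabel("step")
    plt.ylabel("training loss")
    plt.title("mol-tiny-v2 loss (steps 20..12000)")
    plt.show()

Filtering on the "loss" key is a precaution: with eval_steps set to 500, evaluation-only entries could be interleaved into log_history, and they would not carry a training loss.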
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.017775776357032393,
"eval_steps": 500,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.9626293928387323e-05,
"grad_norm": 664.0,
"learning_rate": 1.9e-05,
"loss": 203.5701,
"step": 20
},
{
"epoch": 5.9252587856774646e-05,
"grad_norm": 70.5,
"learning_rate": 3.9e-05,
"loss": 152.8923,
"step": 40
},
{
"epoch": 8.887888178516196e-05,
"grad_norm": 103.5,
"learning_rate": 5.9e-05,
"loss": 143.9512,
"step": 60
},
{
"epoch": 0.00011850517571354929,
"grad_norm": 74.5,
"learning_rate": 7.9e-05,
"loss": 141.3113,
"step": 80
},
{
"epoch": 0.0001481314696419366,
"grad_norm": 160.0,
"learning_rate": 9.900000000000001e-05,
"loss": 138.4991,
"step": 100
},
{
"epoch": 0.00017775776357032393,
"grad_norm": 227.0,
"learning_rate": 0.00011899999999999999,
"loss": 136.2222,
"step": 120
},
{
"epoch": 0.00020738405749871127,
"grad_norm": 186.0,
"learning_rate": 0.00013900000000000002,
"loss": 133.6774,
"step": 140
},
{
"epoch": 0.00023701035142709859,
"grad_norm": 65.5,
"learning_rate": 0.00015900000000000002,
"loss": 130.8263,
"step": 160
},
{
"epoch": 0.0002666366453554859,
"grad_norm": 170.0,
"learning_rate": 0.000179,
"loss": 127.1942,
"step": 180
},
{
"epoch": 0.0002962629392838732,
"grad_norm": 143.0,
"learning_rate": 0.000199,
"loss": 122.895,
"step": 200
},
{
"epoch": 0.00032588923321226053,
"grad_norm": 109.0,
"learning_rate": 0.000219,
"loss": 119.3099,
"step": 220
},
{
"epoch": 0.00035551552714064785,
"grad_norm": 207.0,
"learning_rate": 0.00023899999999999998,
"loss": 115.1136,
"step": 240
},
{
"epoch": 0.00038514182106903517,
"grad_norm": 192.0,
"learning_rate": 0.000259,
"loss": 110.9598,
"step": 260
},
{
"epoch": 0.00041476811499742254,
"grad_norm": 81.0,
"learning_rate": 0.000279,
"loss": 107.5745,
"step": 280
},
{
"epoch": 0.00044439440892580985,
"grad_norm": 66.0,
"learning_rate": 0.000299,
"loss": 103.9611,
"step": 300
},
{
"epoch": 0.00047402070285419717,
"grad_norm": 77.0,
"learning_rate": 0.000319,
"loss": 100.1696,
"step": 320
},
{
"epoch": 0.0005036469967825845,
"grad_norm": 76.5,
"learning_rate": 0.00033900000000000005,
"loss": 96.3467,
"step": 340
},
{
"epoch": 0.0005332732907109718,
"grad_norm": 70.0,
"learning_rate": 0.000359,
"loss": 93.3187,
"step": 360
},
{
"epoch": 0.0005628995846393591,
"grad_norm": 81.0,
"learning_rate": 0.000379,
"loss": 90.8416,
"step": 380
},
{
"epoch": 0.0005925258785677464,
"grad_norm": 146.0,
"learning_rate": 0.00039900000000000005,
"loss": 88.6177,
"step": 400
},
{
"epoch": 0.0006221521724961338,
"grad_norm": 83.5,
"learning_rate": 0.000419,
"loss": 86.5326,
"step": 420
},
{
"epoch": 0.0006517784664245211,
"grad_norm": 99.5,
"learning_rate": 0.000439,
"loss": 84.3083,
"step": 440
},
{
"epoch": 0.0006814047603529084,
"grad_norm": 109.5,
"learning_rate": 0.00045900000000000004,
"loss": 81.6188,
"step": 460
},
{
"epoch": 0.0007110310542812957,
"grad_norm": 82.5,
"learning_rate": 0.000479,
"loss": 79.7902,
"step": 480
},
{
"epoch": 0.000740657348209683,
"grad_norm": 67.0,
"learning_rate": 0.000499,
"loss": 77.6862,
"step": 500
},
{
"epoch": 0.0007702836421380703,
"grad_norm": 95.0,
"learning_rate": 0.0004999953080117428,
"loss": 75.2466,
"step": 520
},
{
"epoch": 0.0007999099360664577,
"grad_norm": 96.0,
"learning_rate": 0.0004999903690767353,
"loss": 73.0597,
"step": 540
},
{
"epoch": 0.0008295362299948451,
"grad_norm": 96.5,
"learning_rate": 0.0004999854301417277,
"loss": 71.2278,
"step": 560
},
{
"epoch": 0.0008591625239232324,
"grad_norm": 88.0,
"learning_rate": 0.0004999804912067202,
"loss": 69.239,
"step": 580
},
{
"epoch": 0.0008887888178516197,
"grad_norm": 82.0,
"learning_rate": 0.0004999755522717126,
"loss": 67.4827,
"step": 600
},
{
"epoch": 0.000918415111780007,
"grad_norm": 55.75,
"learning_rate": 0.000499970613336705,
"loss": 65.9883,
"step": 620
},
{
"epoch": 0.0009480414057083943,
"grad_norm": 64.5,
"learning_rate": 0.0004999656744016975,
"loss": 64.7352,
"step": 640
},
{
"epoch": 0.0009776676996367817,
"grad_norm": 66.5,
"learning_rate": 0.0004999607354666899,
"loss": 63.1578,
"step": 660
},
{
"epoch": 0.001007293993565169,
"grad_norm": 80.0,
"learning_rate": 0.0004999557965316823,
"loss": 62.002,
"step": 680
},
{
"epoch": 0.0010369202874935563,
"grad_norm": 79.0,
"learning_rate": 0.0004999508575966748,
"loss": 61.0053,
"step": 700
},
{
"epoch": 0.0010665465814219436,
"grad_norm": 63.75,
"learning_rate": 0.0004999459186616671,
"loss": 59.9261,
"step": 720
},
{
"epoch": 0.001096172875350331,
"grad_norm": 58.5,
"learning_rate": 0.0004999409797266595,
"loss": 59.0368,
"step": 740
},
{
"epoch": 0.0011257991692787182,
"grad_norm": 71.5,
"learning_rate": 0.000499936040791652,
"loss": 58.2665,
"step": 760
},
{
"epoch": 0.0011554254632071056,
"grad_norm": 43.75,
"learning_rate": 0.0004999311018566444,
"loss": 57.2501,
"step": 780
},
{
"epoch": 0.0011850517571354929,
"grad_norm": 68.0,
"learning_rate": 0.0004999261629216369,
"loss": 56.5027,
"step": 800
},
{
"epoch": 0.0012146780510638802,
"grad_norm": 54.0,
"learning_rate": 0.0004999212239866293,
"loss": 55.9271,
"step": 820
},
{
"epoch": 0.0012443043449922675,
"grad_norm": 60.0,
"learning_rate": 0.0004999162850516217,
"loss": 55.3761,
"step": 840
},
{
"epoch": 0.0012739306389206548,
"grad_norm": 55.5,
"learning_rate": 0.0004999113461166142,
"loss": 54.6099,
"step": 860
},
{
"epoch": 0.0013035569328490421,
"grad_norm": 89.0,
"learning_rate": 0.0004999064071816066,
"loss": 53.9408,
"step": 880
},
{
"epoch": 0.0013331832267774295,
"grad_norm": 45.25,
"learning_rate": 0.000499901468246599,
"loss": 53.4026,
"step": 900
},
{
"epoch": 0.0013628095207058168,
"grad_norm": 50.25,
"learning_rate": 0.0004998965293115915,
"loss": 52.993,
"step": 920
},
{
"epoch": 0.001392435814634204,
"grad_norm": 59.0,
"learning_rate": 0.0004998915903765839,
"loss": 52.3891,
"step": 940
},
{
"epoch": 0.0014220621085625914,
"grad_norm": 61.0,
"learning_rate": 0.0004998866514415763,
"loss": 51.9013,
"step": 960
},
{
"epoch": 0.0014516884024909787,
"grad_norm": 69.5,
"learning_rate": 0.0004998817125065688,
"loss": 51.5375,
"step": 980
},
{
"epoch": 0.001481314696419366,
"grad_norm": 61.5,
"learning_rate": 0.0004998767735715612,
"loss": 50.8708,
"step": 1000
},
{
"epoch": 0.0015109409903477534,
"grad_norm": 57.0,
"learning_rate": 0.0004998718346365537,
"loss": 50.515,
"step": 1020
},
{
"epoch": 0.0015405672842761407,
"grad_norm": 56.75,
"learning_rate": 0.0004998668957015461,
"loss": 50.2177,
"step": 1040
},
{
"epoch": 0.001570193578204528,
"grad_norm": 39.25,
"learning_rate": 0.0004998619567665385,
"loss": 49.6777,
"step": 1060
},
{
"epoch": 0.0015998198721329153,
"grad_norm": 54.0,
"learning_rate": 0.000499857017831531,
"loss": 49.4364,
"step": 1080
},
{
"epoch": 0.0016294461660613028,
"grad_norm": 53.75,
"learning_rate": 0.0004998520788965234,
"loss": 49.0852,
"step": 1100
},
{
"epoch": 0.0016590724599896902,
"grad_norm": 37.75,
"learning_rate": 0.0004998471399615158,
"loss": 48.7451,
"step": 1120
},
{
"epoch": 0.0016886987539180775,
"grad_norm": 44.0,
"learning_rate": 0.0004998422010265083,
"loss": 48.4963,
"step": 1140
},
{
"epoch": 0.0017183250478464648,
"grad_norm": 64.5,
"learning_rate": 0.0004998372620915007,
"loss": 47.9198,
"step": 1160
},
{
"epoch": 0.001747951341774852,
"grad_norm": 35.25,
"learning_rate": 0.0004998323231564932,
"loss": 47.7004,
"step": 1180
},
{
"epoch": 0.0017775776357032394,
"grad_norm": 45.0,
"learning_rate": 0.0004998273842214856,
"loss": 47.231,
"step": 1200
},
{
"epoch": 0.0018072039296316267,
"grad_norm": 65.0,
"learning_rate": 0.0004998224452864781,
"loss": 47.0502,
"step": 1220
},
{
"epoch": 0.001836830223560014,
"grad_norm": 43.25,
"learning_rate": 0.0004998175063514705,
"loss": 46.6445,
"step": 1240
},
{
"epoch": 0.0018664565174884014,
"grad_norm": 37.75,
"learning_rate": 0.0004998125674164629,
"loss": 46.3884,
"step": 1260
},
{
"epoch": 0.0018960828114167887,
"grad_norm": 85.0,
"learning_rate": 0.0004998076284814553,
"loss": 46.167,
"step": 1280
},
{
"epoch": 0.001925709105345176,
"grad_norm": 40.25,
"learning_rate": 0.0004998026895464478,
"loss": 45.9901,
"step": 1300
},
{
"epoch": 0.0019553353992735633,
"grad_norm": 42.5,
"learning_rate": 0.0004997977506114401,
"loss": 45.7832,
"step": 1320
},
{
"epoch": 0.0019849616932019506,
"grad_norm": 48.0,
"learning_rate": 0.0004997928116764325,
"loss": 45.4071,
"step": 1340
},
{
"epoch": 0.002014587987130338,
"grad_norm": 49.0,
"learning_rate": 0.000499787872741425,
"loss": 45.1021,
"step": 1360
},
{
"epoch": 0.0020442142810587253,
"grad_norm": 52.5,
"learning_rate": 0.0004997829338064174,
"loss": 44.8783,
"step": 1380
},
{
"epoch": 0.0020738405749871126,
"grad_norm": 43.0,
"learning_rate": 0.0004997779948714099,
"loss": 44.6055,
"step": 1400
},
{
"epoch": 0.0021034668689155,
"grad_norm": 44.25,
"learning_rate": 0.0004997730559364023,
"loss": 44.3711,
"step": 1420
},
{
"epoch": 0.002133093162843887,
"grad_norm": 50.75,
"learning_rate": 0.0004997681170013947,
"loss": 44.3173,
"step": 1440
},
{
"epoch": 0.0021627194567722745,
"grad_norm": 55.5,
"learning_rate": 0.0004997631780663872,
"loss": 43.861,
"step": 1460
},
{
"epoch": 0.002192345750700662,
"grad_norm": 43.5,
"learning_rate": 0.0004997582391313796,
"loss": 43.7341,
"step": 1480
},
{
"epoch": 0.002221972044629049,
"grad_norm": 44.75,
"learning_rate": 0.000499753300196372,
"loss": 43.6567,
"step": 1500
},
{
"epoch": 0.0022515983385574365,
"grad_norm": 53.25,
"learning_rate": 0.0004997483612613645,
"loss": 43.2165,
"step": 1520
},
{
"epoch": 0.002281224632485824,
"grad_norm": 58.5,
"learning_rate": 0.0004997434223263569,
"loss": 43.1527,
"step": 1540
},
{
"epoch": 0.002310850926414211,
"grad_norm": 41.0,
"learning_rate": 0.0004997384833913494,
"loss": 42.8711,
"step": 1560
},
{
"epoch": 0.0023404772203425984,
"grad_norm": 37.75,
"learning_rate": 0.0004997335444563418,
"loss": 42.6794,
"step": 1580
},
{
"epoch": 0.0023701035142709857,
"grad_norm": 39.0,
"learning_rate": 0.0004997286055213343,
"loss": 42.4436,
"step": 1600
},
{
"epoch": 0.002399729808199373,
"grad_norm": 42.25,
"learning_rate": 0.0004997236665863267,
"loss": 42.3047,
"step": 1620
},
{
"epoch": 0.0024293561021277604,
"grad_norm": 34.5,
"learning_rate": 0.0004997187276513191,
"loss": 42.0222,
"step": 1640
},
{
"epoch": 0.0024589823960561477,
"grad_norm": 43.75,
"learning_rate": 0.0004997137887163115,
"loss": 42.0059,
"step": 1660
},
{
"epoch": 0.002488608689984535,
"grad_norm": 42.75,
"learning_rate": 0.000499708849781304,
"loss": 41.7808,
"step": 1680
},
{
"epoch": 0.0025182349839129223,
"grad_norm": 46.75,
"learning_rate": 0.0004997039108462964,
"loss": 41.4766,
"step": 1700
},
{
"epoch": 0.0025478612778413096,
"grad_norm": 40.25,
"learning_rate": 0.0004996989719112888,
"loss": 41.4283,
"step": 1720
},
{
"epoch": 0.002577487571769697,
"grad_norm": 43.25,
"learning_rate": 0.0004996940329762813,
"loss": 41.2702,
"step": 1740
},
{
"epoch": 0.0026071138656980843,
"grad_norm": 36.25,
"learning_rate": 0.0004996890940412737,
"loss": 41.0024,
"step": 1760
},
{
"epoch": 0.0026367401596264716,
"grad_norm": 42.5,
"learning_rate": 0.0004996841551062662,
"loss": 40.9728,
"step": 1780
},
{
"epoch": 0.002666366453554859,
"grad_norm": 42.25,
"learning_rate": 0.0004996792161712586,
"loss": 40.778,
"step": 1800
},
{
"epoch": 0.0026959927474832462,
"grad_norm": 38.0,
"learning_rate": 0.0004996742772362511,
"loss": 40.5235,
"step": 1820
},
{
"epoch": 0.0027256190414116335,
"grad_norm": 51.5,
"learning_rate": 0.0004996693383012435,
"loss": 40.4312,
"step": 1840
},
{
"epoch": 0.002755245335340021,
"grad_norm": 65.5,
"learning_rate": 0.0004996643993662359,
"loss": 40.3023,
"step": 1860
},
{
"epoch": 0.002784871629268408,
"grad_norm": 38.5,
"learning_rate": 0.0004996594604312283,
"loss": 40.2053,
"step": 1880
},
{
"epoch": 0.0028144979231967955,
"grad_norm": 48.0,
"learning_rate": 0.0004996545214962208,
"loss": 39.8467,
"step": 1900
},
{
"epoch": 0.002844124217125183,
"grad_norm": 38.5,
"learning_rate": 0.0004996495825612131,
"loss": 39.7633,
"step": 1920
},
{
"epoch": 0.00287375051105357,
"grad_norm": 49.5,
"learning_rate": 0.0004996446436262055,
"loss": 39.7014,
"step": 1940
},
{
"epoch": 0.0029033768049819574,
"grad_norm": 41.75,
"learning_rate": 0.000499639704691198,
"loss": 39.505,
"step": 1960
},
{
"epoch": 0.0029330030989103448,
"grad_norm": 40.25,
"learning_rate": 0.0004996347657561904,
"loss": 39.3954,
"step": 1980
},
{
"epoch": 0.002962629392838732,
"grad_norm": 34.25,
"learning_rate": 0.0004996298268211829,
"loss": 39.2925,
"step": 2000
},
{
"epoch": 0.0029922556867671194,
"grad_norm": 61.25,
"learning_rate": 0.0004996248878861753,
"loss": 39.1232,
"step": 2020
},
{
"epoch": 0.0030218819806955067,
"grad_norm": 44.5,
"learning_rate": 0.0004996199489511677,
"loss": 39.0561,
"step": 2040
},
{
"epoch": 0.003051508274623894,
"grad_norm": 41.25,
"learning_rate": 0.0004996150100161602,
"loss": 38.8368,
"step": 2060
},
{
"epoch": 0.0030811345685522813,
"grad_norm": 36.75,
"learning_rate": 0.0004996100710811526,
"loss": 38.8122,
"step": 2080
},
{
"epoch": 0.0031107608624806687,
"grad_norm": 49.5,
"learning_rate": 0.000499605132146145,
"loss": 38.6779,
"step": 2100
},
{
"epoch": 0.003140387156409056,
"grad_norm": 32.0,
"learning_rate": 0.0004996001932111375,
"loss": 38.5136,
"step": 2120
},
{
"epoch": 0.0031700134503374433,
"grad_norm": 38.0,
"learning_rate": 0.0004995952542761299,
"loss": 38.3378,
"step": 2140
},
{
"epoch": 0.0031996397442658306,
"grad_norm": 61.5,
"learning_rate": 0.0004995903153411224,
"loss": 38.1704,
"step": 2160
},
{
"epoch": 0.003229266038194218,
"grad_norm": 31.875,
"learning_rate": 0.0004995853764061148,
"loss": 38.1518,
"step": 2180
},
{
"epoch": 0.0032588923321226057,
"grad_norm": 41.0,
"learning_rate": 0.0004995804374711073,
"loss": 38.0705,
"step": 2200
},
{
"epoch": 0.003288518626050993,
"grad_norm": 33.5,
"learning_rate": 0.0004995754985360997,
"loss": 37.9291,
"step": 2220
},
{
"epoch": 0.0033181449199793803,
"grad_norm": 48.25,
"learning_rate": 0.0004995705596010921,
"loss": 37.7261,
"step": 2240
},
{
"epoch": 0.0033477712139077676,
"grad_norm": 47.5,
"learning_rate": 0.0004995656206660845,
"loss": 37.64,
"step": 2260
},
{
"epoch": 0.003377397507836155,
"grad_norm": 30.75,
"learning_rate": 0.000499560681731077,
"loss": 37.554,
"step": 2280
},
{
"epoch": 0.0034070238017645423,
"grad_norm": 37.0,
"learning_rate": 0.0004995557427960694,
"loss": 37.4085,
"step": 2300
},
{
"epoch": 0.0034366500956929296,
"grad_norm": 40.0,
"learning_rate": 0.0004995508038610618,
"loss": 37.3319,
"step": 2320
},
{
"epoch": 0.003466276389621317,
"grad_norm": 26.0,
"learning_rate": 0.0004995458649260543,
"loss": 37.2325,
"step": 2340
},
{
"epoch": 0.003495902683549704,
"grad_norm": 44.0,
"learning_rate": 0.0004995409259910467,
"loss": 37.1101,
"step": 2360
},
{
"epoch": 0.0035255289774780915,
"grad_norm": 41.75,
"learning_rate": 0.0004995359870560392,
"loss": 36.9086,
"step": 2380
},
{
"epoch": 0.003555155271406479,
"grad_norm": 38.75,
"learning_rate": 0.0004995310481210316,
"loss": 36.8606,
"step": 2400
},
{
"epoch": 0.003584781565334866,
"grad_norm": 46.0,
"learning_rate": 0.0004995261091860241,
"loss": 36.8799,
"step": 2420
},
{
"epoch": 0.0036144078592632535,
"grad_norm": 49.0,
"learning_rate": 0.0004995211702510165,
"loss": 36.7591,
"step": 2440
},
{
"epoch": 0.003644034153191641,
"grad_norm": 28.625,
"learning_rate": 0.0004995162313160089,
"loss": 36.5001,
"step": 2460
},
{
"epoch": 0.003673660447120028,
"grad_norm": 32.75,
"learning_rate": 0.0004995112923810013,
"loss": 36.5076,
"step": 2480
},
{
"epoch": 0.0037032867410484154,
"grad_norm": 60.5,
"learning_rate": 0.0004995063534459938,
"loss": 36.4496,
"step": 2500
},
{
"epoch": 0.0037329130349768027,
"grad_norm": 51.5,
"learning_rate": 0.0004995014145109862,
"loss": 36.2772,
"step": 2520
},
{
"epoch": 0.00376253932890519,
"grad_norm": 37.75,
"learning_rate": 0.0004994964755759787,
"loss": 36.2552,
"step": 2540
},
{
"epoch": 0.0037921656228335774,
"grad_norm": 30.25,
"learning_rate": 0.000499491536640971,
"loss": 36.1079,
"step": 2560
},
{
"epoch": 0.0038217919167619647,
"grad_norm": 29.625,
"learning_rate": 0.0004994865977059635,
"loss": 36.0163,
"step": 2580
},
{
"epoch": 0.003851418210690352,
"grad_norm": 37.5,
"learning_rate": 0.0004994816587709559,
"loss": 35.8935,
"step": 2600
},
{
"epoch": 0.0038810445046187393,
"grad_norm": 41.25,
"learning_rate": 0.0004994767198359483,
"loss": 35.8832,
"step": 2620
},
{
"epoch": 0.003910670798547127,
"grad_norm": 35.25,
"learning_rate": 0.0004994717809009407,
"loss": 35.774,
"step": 2640
},
{
"epoch": 0.003940297092475514,
"grad_norm": 42.25,
"learning_rate": 0.0004994668419659332,
"loss": 35.6311,
"step": 2660
},
{
"epoch": 0.003969923386403901,
"grad_norm": 27.25,
"learning_rate": 0.0004994619030309256,
"loss": 35.6101,
"step": 2680
},
{
"epoch": 0.003999549680332289,
"grad_norm": 29.75,
"learning_rate": 0.000499456964095918,
"loss": 35.4244,
"step": 2700
},
{
"epoch": 0.004029175974260676,
"grad_norm": 33.75,
"learning_rate": 0.0004994520251609105,
"loss": 35.4439,
"step": 2720
},
{
"epoch": 0.004058802268189063,
"grad_norm": 34.5,
"learning_rate": 0.0004994470862259029,
"loss": 35.2244,
"step": 2740
},
{
"epoch": 0.0040884285621174505,
"grad_norm": 29.75,
"learning_rate": 0.0004994421472908954,
"loss": 35.2606,
"step": 2760
},
{
"epoch": 0.004118054856045838,
"grad_norm": 33.0,
"learning_rate": 0.0004994372083558878,
"loss": 35.1311,
"step": 2780
},
{
"epoch": 0.004147681149974225,
"grad_norm": 49.75,
"learning_rate": 0.0004994322694208803,
"loss": 35.135,
"step": 2800
},
{
"epoch": 0.0041773074439026125,
"grad_norm": 44.0,
"learning_rate": 0.0004994273304858727,
"loss": 34.8227,
"step": 2820
},
{
"epoch": 0.004206933737831,
"grad_norm": 34.0,
"learning_rate": 0.0004994223915508651,
"loss": 34.8308,
"step": 2840
},
{
"epoch": 0.004236560031759387,
"grad_norm": 34.0,
"learning_rate": 0.0004994174526158575,
"loss": 34.8619,
"step": 2860
},
{
"epoch": 0.004266186325687774,
"grad_norm": 34.25,
"learning_rate": 0.00049941251368085,
"loss": 34.7495,
"step": 2880
},
{
"epoch": 0.004295812619616162,
"grad_norm": 33.0,
"learning_rate": 0.0004994075747458424,
"loss": 34.665,
"step": 2900
},
{
"epoch": 0.004325438913544549,
"grad_norm": 31.25,
"learning_rate": 0.0004994026358108349,
"loss": 34.6436,
"step": 2920
},
{
"epoch": 0.004355065207472936,
"grad_norm": 44.25,
"learning_rate": 0.0004993976968758273,
"loss": 34.5006,
"step": 2940
},
{
"epoch": 0.004384691501401324,
"grad_norm": 37.25,
"learning_rate": 0.0004993927579408197,
"loss": 34.4433,
"step": 2960
},
{
"epoch": 0.004414317795329711,
"grad_norm": 33.0,
"learning_rate": 0.0004993878190058122,
"loss": 34.3807,
"step": 2980
},
{
"epoch": 0.004443944089258098,
"grad_norm": 36.5,
"learning_rate": 0.0004993828800708046,
"loss": 34.2393,
"step": 3000
},
{
"epoch": 0.004473570383186486,
"grad_norm": 58.75,
"learning_rate": 0.0004993779411357971,
"loss": 34.2823,
"step": 3020
},
{
"epoch": 0.004503196677114873,
"grad_norm": 31.0,
"learning_rate": 0.0004993730022007895,
"loss": 34.1576,
"step": 3040
},
{
"epoch": 0.00453282297104326,
"grad_norm": 26.625,
"learning_rate": 0.0004993680632657819,
"loss": 34.1282,
"step": 3060
},
{
"epoch": 0.004562449264971648,
"grad_norm": 35.25,
"learning_rate": 0.0004993631243307743,
"loss": 33.9517,
"step": 3080
},
{
"epoch": 0.004592075558900035,
"grad_norm": 30.875,
"learning_rate": 0.0004993581853957668,
"loss": 33.9367,
"step": 3100
},
{
"epoch": 0.004621701852828422,
"grad_norm": 30.375,
"learning_rate": 0.0004993532464607592,
"loss": 33.8674,
"step": 3120
},
{
"epoch": 0.0046513281467568095,
"grad_norm": 37.0,
"learning_rate": 0.0004993483075257517,
"loss": 33.7807,
"step": 3140
},
{
"epoch": 0.004680954440685197,
"grad_norm": 50.25,
"learning_rate": 0.000499343368590744,
"loss": 33.749,
"step": 3160
},
{
"epoch": 0.004710580734613584,
"grad_norm": 31.875,
"learning_rate": 0.0004993384296557365,
"loss": 33.6065,
"step": 3180
},
{
"epoch": 0.0047402070285419715,
"grad_norm": 31.375,
"learning_rate": 0.0004993334907207289,
"loss": 33.4564,
"step": 3200
},
{
"epoch": 0.004769833322470359,
"grad_norm": 34.0,
"learning_rate": 0.0004993285517857213,
"loss": 33.531,
"step": 3220
},
{
"epoch": 0.004799459616398746,
"grad_norm": 31.25,
"learning_rate": 0.0004993236128507137,
"loss": 33.4034,
"step": 3240
},
{
"epoch": 0.0048290859103271334,
"grad_norm": 31.0,
"learning_rate": 0.0004993186739157062,
"loss": 33.4539,
"step": 3260
},
{
"epoch": 0.004858712204255521,
"grad_norm": 31.75,
"learning_rate": 0.0004993137349806986,
"loss": 33.2341,
"step": 3280
},
{
"epoch": 0.004888338498183908,
"grad_norm": 34.5,
"learning_rate": 0.000499308796045691,
"loss": 33.2486,
"step": 3300
},
{
"epoch": 0.004917964792112295,
"grad_norm": 27.375,
"learning_rate": 0.0004993038571106835,
"loss": 33.2143,
"step": 3320
},
{
"epoch": 0.004947591086040683,
"grad_norm": 25.875,
"learning_rate": 0.0004992989181756759,
"loss": 33.0835,
"step": 3340
},
{
"epoch": 0.00497721737996907,
"grad_norm": 31.125,
"learning_rate": 0.0004992939792406684,
"loss": 33.1259,
"step": 3360
},
{
"epoch": 0.005006843673897457,
"grad_norm": 35.5,
"learning_rate": 0.0004992890403056608,
"loss": 32.995,
"step": 3380
},
{
"epoch": 0.005036469967825845,
"grad_norm": 26.0,
"learning_rate": 0.0004992841013706533,
"loss": 32.8331,
"step": 3400
},
{
"epoch": 0.005066096261754232,
"grad_norm": 24.875,
"learning_rate": 0.0004992791624356457,
"loss": 32.8863,
"step": 3420
},
{
"epoch": 0.005095722555682619,
"grad_norm": 38.5,
"learning_rate": 0.0004992742235006381,
"loss": 32.7908,
"step": 3440
},
{
"epoch": 0.005125348849611007,
"grad_norm": 35.5,
"learning_rate": 0.0004992692845656305,
"loss": 32.7276,
"step": 3460
},
{
"epoch": 0.005154975143539394,
"grad_norm": 34.0,
"learning_rate": 0.000499264345630623,
"loss": 32.7076,
"step": 3480
},
{
"epoch": 0.005184601437467781,
"grad_norm": 28.625,
"learning_rate": 0.0004992594066956154,
"loss": 32.6881,
"step": 3500
},
{
"epoch": 0.0052142277313961686,
"grad_norm": 31.5,
"learning_rate": 0.0004992544677606079,
"loss": 32.5494,
"step": 3520
},
{
"epoch": 0.005243854025324556,
"grad_norm": 28.25,
"learning_rate": 0.0004992495288256003,
"loss": 32.5348,
"step": 3540
},
{
"epoch": 0.005273480319252943,
"grad_norm": 38.75,
"learning_rate": 0.0004992445898905928,
"loss": 32.477,
"step": 3560
},
{
"epoch": 0.0053031066131813305,
"grad_norm": 44.25,
"learning_rate": 0.0004992396509555852,
"loss": 32.3766,
"step": 3580
},
{
"epoch": 0.005332732907109718,
"grad_norm": 31.25,
"learning_rate": 0.0004992347120205776,
"loss": 32.3675,
"step": 3600
},
{
"epoch": 0.005362359201038105,
"grad_norm": 32.5,
"learning_rate": 0.0004992297730855701,
"loss": 32.3194,
"step": 3620
},
{
"epoch": 0.0053919854949664925,
"grad_norm": 33.0,
"learning_rate": 0.0004992248341505625,
"loss": 32.2312,
"step": 3640
},
{
"epoch": 0.00542161178889488,
"grad_norm": 26.0,
"learning_rate": 0.0004992198952155549,
"loss": 32.2567,
"step": 3660
},
{
"epoch": 0.005451238082823267,
"grad_norm": 30.75,
"learning_rate": 0.0004992149562805473,
"loss": 32.1444,
"step": 3680
},
{
"epoch": 0.005480864376751654,
"grad_norm": 31.75,
"learning_rate": 0.0004992100173455398,
"loss": 32.1415,
"step": 3700
},
{
"epoch": 0.005510490670680042,
"grad_norm": 29.25,
"learning_rate": 0.0004992050784105322,
"loss": 32.0721,
"step": 3720
},
{
"epoch": 0.005540116964608429,
"grad_norm": 34.25,
"learning_rate": 0.0004992001394755247,
"loss": 31.9281,
"step": 3740
},
{
"epoch": 0.005569743258536816,
"grad_norm": 30.75,
"learning_rate": 0.000499195200540517,
"loss": 31.8502,
"step": 3760
},
{
"epoch": 0.005599369552465204,
"grad_norm": 37.25,
"learning_rate": 0.0004991902616055095,
"loss": 31.8956,
"step": 3780
},
{
"epoch": 0.005628995846393591,
"grad_norm": 25.375,
"learning_rate": 0.0004991853226705019,
"loss": 31.8152,
"step": 3800
},
{
"epoch": 0.005658622140321978,
"grad_norm": 32.5,
"learning_rate": 0.0004991803837354943,
"loss": 31.7686,
"step": 3820
},
{
"epoch": 0.005688248434250366,
"grad_norm": 27.0,
"learning_rate": 0.0004991754448004867,
"loss": 31.7624,
"step": 3840
},
{
"epoch": 0.005717874728178753,
"grad_norm": 31.125,
"learning_rate": 0.0004991705058654792,
"loss": 31.6764,
"step": 3860
},
{
"epoch": 0.00574750102210714,
"grad_norm": 27.0,
"learning_rate": 0.0004991655669304716,
"loss": 31.5634,
"step": 3880
},
{
"epoch": 0.0057771273160355276,
"grad_norm": 42.75,
"learning_rate": 0.0004991606279954641,
"loss": 31.531,
"step": 3900
},
{
"epoch": 0.005806753609963915,
"grad_norm": 20.25,
"learning_rate": 0.0004991556890604565,
"loss": 31.5578,
"step": 3920
},
{
"epoch": 0.005836379903892302,
"grad_norm": 28.75,
"learning_rate": 0.000499150750125449,
"loss": 31.3939,
"step": 3940
},
{
"epoch": 0.0058660061978206895,
"grad_norm": 37.75,
"learning_rate": 0.0004991458111904414,
"loss": 31.483,
"step": 3960
},
{
"epoch": 0.005895632491749077,
"grad_norm": 27.125,
"learning_rate": 0.0004991408722554338,
"loss": 31.3699,
"step": 3980
},
{
"epoch": 0.005925258785677464,
"grad_norm": 34.0,
"learning_rate": 0.0004991359333204263,
"loss": 31.326,
"step": 4000
},
{
"epoch": 0.0059548850796058515,
"grad_norm": 36.0,
"learning_rate": 0.0004991309943854187,
"loss": 31.2502,
"step": 4020
},
{
"epoch": 0.005984511373534239,
"grad_norm": 28.125,
"learning_rate": 0.0004991260554504111,
"loss": 31.206,
"step": 4040
},
{
"epoch": 0.006014137667462626,
"grad_norm": 27.875,
"learning_rate": 0.0004991211165154035,
"loss": 31.1183,
"step": 4060
},
{
"epoch": 0.006043763961391013,
"grad_norm": 26.625,
"learning_rate": 0.000499116177580396,
"loss": 31.0616,
"step": 4080
},
{
"epoch": 0.006073390255319401,
"grad_norm": 33.25,
"learning_rate": 0.0004991112386453884,
"loss": 31.0764,
"step": 4100
},
{
"epoch": 0.006103016549247788,
"grad_norm": 33.25,
"learning_rate": 0.0004991062997103809,
"loss": 31.1013,
"step": 4120
},
{
"epoch": 0.006132642843176175,
"grad_norm": 25.875,
"learning_rate": 0.0004991013607753733,
"loss": 30.9594,
"step": 4140
},
{
"epoch": 0.006162269137104563,
"grad_norm": 35.5,
"learning_rate": 0.0004990964218403658,
"loss": 30.9666,
"step": 4160
},
{
"epoch": 0.00619189543103295,
"grad_norm": 31.125,
"learning_rate": 0.0004990914829053582,
"loss": 30.9864,
"step": 4180
},
{
"epoch": 0.006221521724961337,
"grad_norm": 39.75,
"learning_rate": 0.0004990865439703506,
"loss": 30.8923,
"step": 4200
},
{
"epoch": 0.006251148018889725,
"grad_norm": 36.75,
"learning_rate": 0.0004990816050353431,
"loss": 30.8131,
"step": 4220
},
{
"epoch": 0.006280774312818112,
"grad_norm": 33.25,
"learning_rate": 0.0004990766661003355,
"loss": 30.7942,
"step": 4240
},
{
"epoch": 0.006310400606746499,
"grad_norm": 25.625,
"learning_rate": 0.0004990717271653279,
"loss": 30.6945,
"step": 4260
},
{
"epoch": 0.006340026900674887,
"grad_norm": 36.5,
"learning_rate": 0.0004990667882303204,
"loss": 30.5995,
"step": 4280
},
{
"epoch": 0.006369653194603274,
"grad_norm": 29.0,
"learning_rate": 0.0004990618492953128,
"loss": 30.6263,
"step": 4300
},
{
"epoch": 0.006399279488531661,
"grad_norm": 25.625,
"learning_rate": 0.0004990569103603052,
"loss": 30.5024,
"step": 4320
},
{
"epoch": 0.0064289057824600485,
"grad_norm": 27.375,
"learning_rate": 0.0004990519714252977,
"loss": 30.5216,
"step": 4340
},
{
"epoch": 0.006458532076388436,
"grad_norm": 29.625,
"learning_rate": 0.00049904703249029,
"loss": 30.4789,
"step": 4360
},
{
"epoch": 0.006488158370316823,
"grad_norm": 29.0,
"learning_rate": 0.0004990420935552825,
"loss": 30.3852,
"step": 4380
},
{
"epoch": 0.006517784664245211,
"grad_norm": 36.25,
"learning_rate": 0.0004990371546202749,
"loss": 30.4001,
"step": 4400
},
{
"epoch": 0.006547410958173599,
"grad_norm": 27.75,
"learning_rate": 0.0004990322156852673,
"loss": 30.4478,
"step": 4420
},
{
"epoch": 0.006577037252101986,
"grad_norm": 35.25,
"learning_rate": 0.0004990272767502597,
"loss": 30.3411,
"step": 4440
},
{
"epoch": 0.006606663546030373,
"grad_norm": 25.625,
"learning_rate": 0.0004990223378152522,
"loss": 30.2584,
"step": 4460
},
{
"epoch": 0.006636289839958761,
"grad_norm": 27.625,
"learning_rate": 0.0004990173988802446,
"loss": 30.2584,
"step": 4480
},
{
"epoch": 0.006665916133887148,
"grad_norm": 28.25,
"learning_rate": 0.0004990124599452371,
"loss": 30.2201,
"step": 4500
},
{
"epoch": 0.006695542427815535,
"grad_norm": 29.0,
"learning_rate": 0.0004990075210102295,
"loss": 30.1349,
"step": 4520
},
{
"epoch": 0.0067251687217439226,
"grad_norm": 21.375,
"learning_rate": 0.000499002582075222,
"loss": 30.1322,
"step": 4540
},
{
"epoch": 0.00675479501567231,
"grad_norm": 34.5,
"learning_rate": 0.0004989976431402144,
"loss": 30.1664,
"step": 4560
},
{
"epoch": 0.006784421309600697,
"grad_norm": 28.0,
"learning_rate": 0.0004989927042052068,
"loss": 30.1264,
"step": 4580
},
{
"epoch": 0.0068140476035290845,
"grad_norm": 29.875,
"learning_rate": 0.0004989877652701993,
"loss": 29.9802,
"step": 4600
},
{
"epoch": 0.006843673897457472,
"grad_norm": 31.625,
"learning_rate": 0.0004989828263351917,
"loss": 30.0028,
"step": 4620
},
{
"epoch": 0.006873300191385859,
"grad_norm": 33.5,
"learning_rate": 0.0004989778874001841,
"loss": 29.954,
"step": 4640
},
{
"epoch": 0.0069029264853142465,
"grad_norm": 28.125,
"learning_rate": 0.0004989729484651765,
"loss": 29.8774,
"step": 4660
},
{
"epoch": 0.006932552779242634,
"grad_norm": 31.25,
"learning_rate": 0.000498968009530169,
"loss": 29.8508,
"step": 4680
},
{
"epoch": 0.006962179073171021,
"grad_norm": 30.0,
"learning_rate": 0.0004989630705951614,
"loss": 29.8786,
"step": 4700
},
{
"epoch": 0.006991805367099408,
"grad_norm": 26.0,
"learning_rate": 0.0004989581316601539,
"loss": 29.789,
"step": 4720
},
{
"epoch": 0.007021431661027796,
"grad_norm": 29.625,
"learning_rate": 0.0004989531927251463,
"loss": 29.708,
"step": 4740
},
{
"epoch": 0.007051057954956183,
"grad_norm": 32.0,
"learning_rate": 0.0004989482537901388,
"loss": 29.7006,
"step": 4760
},
{
"epoch": 0.00708068424888457,
"grad_norm": 25.875,
"learning_rate": 0.0004989433148551312,
"loss": 29.546,
"step": 4780
},
{
"epoch": 0.007110310542812958,
"grad_norm": 30.125,
"learning_rate": 0.0004989383759201236,
"loss": 29.5634,
"step": 4800
},
{
"epoch": 0.007139936836741345,
"grad_norm": 36.5,
"learning_rate": 0.0004989334369851161,
"loss": 29.5994,
"step": 4820
},
{
"epoch": 0.007169563130669732,
"grad_norm": 24.375,
"learning_rate": 0.0004989284980501085,
"loss": 29.5168,
"step": 4840
},
{
"epoch": 0.00719918942459812,
"grad_norm": 32.25,
"learning_rate": 0.0004989235591151009,
"loss": 29.4858,
"step": 4860
},
{
"epoch": 0.007228815718526507,
"grad_norm": 22.0,
"learning_rate": 0.0004989186201800934,
"loss": 29.4208,
"step": 4880
},
{
"epoch": 0.007258442012454894,
"grad_norm": 24.625,
"learning_rate": 0.0004989136812450858,
"loss": 29.4053,
"step": 4900
},
{
"epoch": 0.007288068306383282,
"grad_norm": 26.625,
"learning_rate": 0.0004989087423100783,
"loss": 29.4063,
"step": 4920
},
{
"epoch": 0.007317694600311669,
"grad_norm": 27.25,
"learning_rate": 0.0004989038033750707,
"loss": 29.3743,
"step": 4940
},
{
"epoch": 0.007347320894240056,
"grad_norm": 22.625,
"learning_rate": 0.000498898864440063,
"loss": 29.3818,
"step": 4960
},
{
"epoch": 0.0073769471881684435,
"grad_norm": 24.5,
"learning_rate": 0.0004988939255050556,
"loss": 29.2662,
"step": 4980
},
{
"epoch": 0.007406573482096831,
"grad_norm": 30.5,
"learning_rate": 0.0004988889865700479,
"loss": 29.3027,
"step": 5000
},
{
"epoch": 0.007436199776025218,
"grad_norm": 24.5,
"learning_rate": 0.0004988840476350403,
"loss": 29.1511,
"step": 5020
},
{
"epoch": 0.0074658260699536055,
"grad_norm": 24.25,
"learning_rate": 0.0004988791087000327,
"loss": 29.2143,
"step": 5040
},
{
"epoch": 0.007495452363881993,
"grad_norm": 27.25,
"learning_rate": 0.0004988741697650252,
"loss": 29.1494,
"step": 5060
},
{
"epoch": 0.00752507865781038,
"grad_norm": 21.25,
"learning_rate": 0.0004988692308300176,
"loss": 29.1171,
"step": 5080
},
{
"epoch": 0.007554704951738767,
"grad_norm": 29.0,
"learning_rate": 0.0004988642918950101,
"loss": 29.1236,
"step": 5100
},
{
"epoch": 0.007584331245667155,
"grad_norm": 40.0,
"learning_rate": 0.0004988593529600025,
"loss": 29.0323,
"step": 5120
},
{
"epoch": 0.007613957539595542,
"grad_norm": 23.5,
"learning_rate": 0.000498854414024995,
"loss": 28.9919,
"step": 5140
},
{
"epoch": 0.007643583833523929,
"grad_norm": 22.375,
"learning_rate": 0.0004988494750899874,
"loss": 28.9232,
"step": 5160
},
{
"epoch": 0.007673210127452317,
"grad_norm": 35.5,
"learning_rate": 0.0004988445361549798,
"loss": 28.918,
"step": 5180
},
{
"epoch": 0.007702836421380704,
"grad_norm": 26.75,
"learning_rate": 0.0004988395972199723,
"loss": 28.9208,
"step": 5200
},
{
"epoch": 0.007732462715309091,
"grad_norm": 29.25,
"learning_rate": 0.0004988346582849647,
"loss": 28.9147,
"step": 5220
},
{
"epoch": 0.007762089009237479,
"grad_norm": 26.375,
"learning_rate": 0.0004988297193499571,
"loss": 28.8134,
"step": 5240
},
{
"epoch": 0.007791715303165866,
"grad_norm": 27.25,
"learning_rate": 0.0004988247804149496,
"loss": 28.8583,
"step": 5260
},
{
"epoch": 0.007821341597094253,
"grad_norm": 24.375,
"learning_rate": 0.000498819841479942,
"loss": 28.7596,
"step": 5280
},
{
"epoch": 0.00785096789102264,
"grad_norm": 27.375,
"learning_rate": 0.0004988149025449345,
"loss": 28.7184,
"step": 5300
},
{
"epoch": 0.007880594184951028,
"grad_norm": 24.5,
"learning_rate": 0.0004988099636099269,
"loss": 28.7668,
"step": 5320
},
{
"epoch": 0.007910220478879414,
"grad_norm": 23.125,
"learning_rate": 0.0004988050246749193,
"loss": 28.7422,
"step": 5340
},
{
"epoch": 0.007939846772807803,
"grad_norm": 27.875,
"learning_rate": 0.0004988000857399118,
"loss": 28.7021,
"step": 5360
},
{
"epoch": 0.007969473066736189,
"grad_norm": 25.875,
"learning_rate": 0.0004987951468049042,
"loss": 28.6588,
"step": 5380
},
{
"epoch": 0.007999099360664577,
"grad_norm": 25.375,
"learning_rate": 0.0004987902078698966,
"loss": 28.5846,
"step": 5400
},
{
"epoch": 0.008028725654592964,
"grad_norm": 30.125,
"learning_rate": 0.0004987852689348891,
"loss": 28.6113,
"step": 5420
},
{
"epoch": 0.008058351948521352,
"grad_norm": 22.5,
"learning_rate": 0.0004987803299998815,
"loss": 28.5202,
"step": 5440
},
{
"epoch": 0.008087978242449738,
"grad_norm": 31.0,
"learning_rate": 0.0004987753910648739,
"loss": 28.5026,
"step": 5460
},
{
"epoch": 0.008117604536378126,
"grad_norm": 33.25,
"learning_rate": 0.0004987704521298664,
"loss": 28.5127,
"step": 5480
},
{
"epoch": 0.008147230830306513,
"grad_norm": 29.0,
"learning_rate": 0.0004987655131948588,
"loss": 28.4538,
"step": 5500
},
{
"epoch": 0.008176857124234901,
"grad_norm": 22.125,
"learning_rate": 0.0004987605742598513,
"loss": 28.457,
"step": 5520
},
{
"epoch": 0.008206483418163288,
"grad_norm": 31.625,
"learning_rate": 0.0004987556353248437,
"loss": 28.4426,
"step": 5540
},
{
"epoch": 0.008236109712091676,
"grad_norm": 23.75,
"learning_rate": 0.000498750696389836,
"loss": 28.3837,
"step": 5560
},
{
"epoch": 0.008265736006020062,
"grad_norm": 23.25,
"learning_rate": 0.0004987457574548286,
"loss": 28.3155,
"step": 5580
},
{
"epoch": 0.00829536229994845,
"grad_norm": 22.75,
"learning_rate": 0.0004987408185198209,
"loss": 28.2806,
"step": 5600
},
{
"epoch": 0.008324988593876837,
"grad_norm": 23.875,
"learning_rate": 0.0004987358795848133,
"loss": 28.2933,
"step": 5620
},
{
"epoch": 0.008354614887805225,
"grad_norm": 25.875,
"learning_rate": 0.0004987309406498058,
"loss": 28.2405,
"step": 5640
},
{
"epoch": 0.008384241181733611,
"grad_norm": 29.0,
"learning_rate": 0.0004987260017147982,
"loss": 28.22,
"step": 5660
},
{
"epoch": 0.008413867475662,
"grad_norm": 23.5,
"learning_rate": 0.0004987210627797906,
"loss": 28.1765,
"step": 5680
},
{
"epoch": 0.008443493769590386,
"grad_norm": 23.25,
"learning_rate": 0.0004987161238447831,
"loss": 28.208,
"step": 5700
},
{
"epoch": 0.008473120063518774,
"grad_norm": 27.5,
"learning_rate": 0.0004987111849097755,
"loss": 28.1571,
"step": 5720
},
{
"epoch": 0.00850274635744716,
"grad_norm": 27.5,
"learning_rate": 0.000498706245974768,
"loss": 28.0758,
"step": 5740
},
{
"epoch": 0.008532372651375549,
"grad_norm": 25.125,
"learning_rate": 0.0004987013070397604,
"loss": 28.0146,
"step": 5760
},
{
"epoch": 0.008561998945303935,
"grad_norm": 22.75,
"learning_rate": 0.0004986963681047528,
"loss": 28.0971,
"step": 5780
},
{
"epoch": 0.008591625239232323,
"grad_norm": 27.0,
"learning_rate": 0.0004986914291697453,
"loss": 27.9061,
"step": 5800
},
{
"epoch": 0.00862125153316071,
"grad_norm": 28.75,
"learning_rate": 0.0004986864902347377,
"loss": 27.9707,
"step": 5820
},
{
"epoch": 0.008650877827089098,
"grad_norm": 25.625,
"learning_rate": 0.0004986815512997301,
"loss": 27.9883,
"step": 5840
},
{
"epoch": 0.008680504121017485,
"grad_norm": 21.375,
"learning_rate": 0.0004986766123647226,
"loss": 27.8901,
"step": 5860
},
{
"epoch": 0.008710130414945873,
"grad_norm": 27.75,
"learning_rate": 0.000498671673429715,
"loss": 27.8871,
"step": 5880
},
{
"epoch": 0.008739756708874261,
"grad_norm": 21.625,
"learning_rate": 0.0004986667344947075,
"loss": 27.9139,
"step": 5900
},
{
"epoch": 0.008769383002802647,
"grad_norm": 25.0,
"learning_rate": 0.0004986617955596999,
"loss": 27.8552,
"step": 5920
},
{
"epoch": 0.008799009296731036,
"grad_norm": 29.875,
"learning_rate": 0.0004986568566246923,
"loss": 27.7934,
"step": 5940
},
{
"epoch": 0.008828635590659422,
"grad_norm": 27.875,
"learning_rate": 0.0004986519176896848,
"loss": 27.8324,
"step": 5960
},
{
"epoch": 0.00885826188458781,
"grad_norm": 31.5,
"learning_rate": 0.0004986469787546772,
"loss": 27.7529,
"step": 5980
},
{
"epoch": 0.008887888178516197,
"grad_norm": 20.25,
"learning_rate": 0.0004986420398196696,
"loss": 27.7292,
"step": 6000
},
{
"epoch": 0.008917514472444585,
"grad_norm": 25.125,
"learning_rate": 0.0004986371008846621,
"loss": 27.7583,
"step": 6020
},
{
"epoch": 0.008947140766372971,
"grad_norm": 25.25,
"learning_rate": 0.0004986321619496545,
"loss": 27.7517,
"step": 6040
},
{
"epoch": 0.00897676706030136,
"grad_norm": 25.0,
"learning_rate": 0.0004986272230146469,
"loss": 27.6723,
"step": 6060
},
{
"epoch": 0.009006393354229746,
"grad_norm": 27.125,
"learning_rate": 0.0004986222840796394,
"loss": 27.6539,
"step": 6080
},
{
"epoch": 0.009036019648158134,
"grad_norm": 21.5,
"learning_rate": 0.0004986173451446318,
"loss": 27.5415,
"step": 6100
},
{
"epoch": 0.00906564594208652,
"grad_norm": 29.125,
"learning_rate": 0.0004986124062096243,
"loss": 27.6001,
"step": 6120
},
{
"epoch": 0.009095272236014909,
"grad_norm": 29.625,
"learning_rate": 0.0004986074672746167,
"loss": 27.531,
"step": 6140
},
{
"epoch": 0.009124898529943295,
"grad_norm": 27.125,
"learning_rate": 0.000498602528339609,
"loss": 27.6403,
"step": 6160
},
{
"epoch": 0.009154524823871683,
"grad_norm": 24.625,
"learning_rate": 0.0004985975894046016,
"loss": 27.5193,
"step": 6180
},
{
"epoch": 0.00918415111780007,
"grad_norm": 26.375,
"learning_rate": 0.0004985926504695939,
"loss": 27.5217,
"step": 6200
},
{
"epoch": 0.009213777411728458,
"grad_norm": 20.625,
"learning_rate": 0.0004985877115345863,
"loss": 27.4847,
"step": 6220
},
{
"epoch": 0.009243403705656844,
"grad_norm": 34.75,
"learning_rate": 0.0004985827725995788,
"loss": 27.4528,
"step": 6240
},
{
"epoch": 0.009273029999585233,
"grad_norm": 22.75,
"learning_rate": 0.0004985778336645712,
"loss": 27.4023,
"step": 6260
},
{
"epoch": 0.009302656293513619,
"grad_norm": 26.125,
"learning_rate": 0.0004985728947295637,
"loss": 27.3692,
"step": 6280
},
{
"epoch": 0.009332282587442007,
"grad_norm": 20.875,
"learning_rate": 0.0004985679557945561,
"loss": 27.4008,
"step": 6300
},
{
"epoch": 0.009361908881370394,
"grad_norm": 24.625,
"learning_rate": 0.0004985630168595486,
"loss": 27.4807,
"step": 6320
},
{
"epoch": 0.009391535175298782,
"grad_norm": 24.75,
"learning_rate": 0.000498558077924541,
"loss": 27.3572,
"step": 6340
},
{
"epoch": 0.009421161469227168,
"grad_norm": 24.5,
"learning_rate": 0.0004985531389895334,
"loss": 27.3003,
"step": 6360
},
{
"epoch": 0.009450787763155557,
"grad_norm": 23.0,
"learning_rate": 0.0004985482000545258,
"loss": 27.1815,
"step": 6380
},
{
"epoch": 0.009480414057083943,
"grad_norm": 23.125,
"learning_rate": 0.0004985432611195183,
"loss": 27.2178,
"step": 6400
},
{
"epoch": 0.009510040351012331,
"grad_norm": 29.25,
"learning_rate": 0.0004985383221845107,
"loss": 27.2077,
"step": 6420
},
{
"epoch": 0.009539666644940718,
"grad_norm": 19.625,
"learning_rate": 0.0004985333832495031,
"loss": 27.2465,
"step": 6440
},
{
"epoch": 0.009569292938869106,
"grad_norm": 25.125,
"learning_rate": 0.0004985284443144956,
"loss": 27.1899,
"step": 6460
},
{
"epoch": 0.009598919232797492,
"grad_norm": 27.375,
"learning_rate": 0.000498523505379488,
"loss": 27.2459,
"step": 6480
},
{
"epoch": 0.00962854552672588,
"grad_norm": 22.625,
"learning_rate": 0.0004985185664444805,
"loss": 27.1158,
"step": 6500
},
{
"epoch": 0.009658171820654267,
"grad_norm": 26.125,
"learning_rate": 0.0004985136275094729,
"loss": 27.2161,
"step": 6520
},
{
"epoch": 0.009687798114582655,
"grad_norm": 25.0,
"learning_rate": 0.0004985086885744653,
"loss": 27.0711,
"step": 6540
},
{
"epoch": 0.009717424408511042,
"grad_norm": 20.5,
"learning_rate": 0.0004985037496394578,
"loss": 27.0087,
"step": 6560
},
{
"epoch": 0.00974705070243943,
"grad_norm": 21.125,
"learning_rate": 0.0004984988107044502,
"loss": 27.0451,
"step": 6580
},
{
"epoch": 0.009776676996367816,
"grad_norm": 22.0,
"learning_rate": 0.0004984938717694426,
"loss": 27.0665,
"step": 6600
},
{
"epoch": 0.009806303290296204,
"grad_norm": 18.125,
"learning_rate": 0.0004984889328344351,
"loss": 27.0015,
"step": 6620
},
{
"epoch": 0.00983592958422459,
"grad_norm": 27.125,
"learning_rate": 0.0004984839938994275,
"loss": 26.9989,
"step": 6640
},
{
"epoch": 0.009865555878152979,
"grad_norm": 26.375,
"learning_rate": 0.00049847905496442,
"loss": 26.991,
"step": 6660
},
{
"epoch": 0.009895182172081365,
"grad_norm": 21.125,
"learning_rate": 0.0004984741160294124,
"loss": 26.9069,
"step": 6680
},
{
"epoch": 0.009924808466009754,
"grad_norm": 25.0,
"learning_rate": 0.0004984691770944048,
"loss": 26.9706,
"step": 6700
},
{
"epoch": 0.00995443475993814,
"grad_norm": 26.0,
"learning_rate": 0.0004984642381593973,
"loss": 26.9443,
"step": 6720
},
{
"epoch": 0.009984061053866528,
"grad_norm": 22.5,
"learning_rate": 0.0004984592992243897,
"loss": 26.9991,
"step": 6740
},
{
"epoch": 0.010013687347794915,
"grad_norm": 25.125,
"learning_rate": 0.000498454360289382,
"loss": 26.8303,
"step": 6760
},
{
"epoch": 0.010043313641723303,
"grad_norm": 21.625,
"learning_rate": 0.0004984494213543746,
"loss": 26.7858,
"step": 6780
},
{
"epoch": 0.01007293993565169,
"grad_norm": 24.25,
"learning_rate": 0.000498444482419367,
"loss": 26.7472,
"step": 6800
},
{
"epoch": 0.010102566229580077,
"grad_norm": 21.25,
"learning_rate": 0.0004984395434843593,
"loss": 26.7279,
"step": 6820
},
{
"epoch": 0.010132192523508464,
"grad_norm": 28.125,
"learning_rate": 0.0004984346045493518,
"loss": 26.7243,
"step": 6840
},
{
"epoch": 0.010161818817436852,
"grad_norm": 22.0,
"learning_rate": 0.0004984296656143442,
"loss": 26.7933,
"step": 6860
},
{
"epoch": 0.010191445111365239,
"grad_norm": 21.375,
"learning_rate": 0.0004984247266793367,
"loss": 26.7171,
"step": 6880
},
{
"epoch": 0.010221071405293627,
"grad_norm": 26.625,
"learning_rate": 0.0004984197877443291,
"loss": 26.6968,
"step": 6900
},
{
"epoch": 0.010250697699222013,
"grad_norm": 23.25,
"learning_rate": 0.0004984148488093216,
"loss": 26.6967,
"step": 6920
},
{
"epoch": 0.010280323993150401,
"grad_norm": 20.875,
"learning_rate": 0.000498409909874314,
"loss": 26.6935,
"step": 6940
},
{
"epoch": 0.010309950287078788,
"grad_norm": 19.0,
"learning_rate": 0.0004984049709393064,
"loss": 26.6697,
"step": 6960
},
{
"epoch": 0.010339576581007176,
"grad_norm": 21.25,
"learning_rate": 0.0004984000320042988,
"loss": 26.6018,
"step": 6980
},
{
"epoch": 0.010369202874935562,
"grad_norm": 19.125,
"learning_rate": 0.0004983950930692913,
"loss": 26.5833,
"step": 7000
},
{
"epoch": 0.01039882916886395,
"grad_norm": 21.0,
"learning_rate": 0.0004983901541342837,
"loss": 26.6342,
"step": 7020
},
{
"epoch": 0.010428455462792337,
"grad_norm": 26.875,
"learning_rate": 0.0004983852151992761,
"loss": 26.6716,
"step": 7040
},
{
"epoch": 0.010458081756720725,
"grad_norm": 25.25,
"learning_rate": 0.0004983802762642686,
"loss": 26.517,
"step": 7060
},
{
"epoch": 0.010487708050649112,
"grad_norm": 23.75,
"learning_rate": 0.000498375337329261,
"loss": 26.5071,
"step": 7080
},
{
"epoch": 0.0105173343445775,
"grad_norm": 28.75,
"learning_rate": 0.0004983703983942535,
"loss": 26.4937,
"step": 7100
},
{
"epoch": 0.010546960638505886,
"grad_norm": 24.625,
"learning_rate": 0.0004983654594592459,
"loss": 26.5264,
"step": 7120
},
{
"epoch": 0.010576586932434275,
"grad_norm": 24.75,
"learning_rate": 0.0004983605205242383,
"loss": 26.4541,
"step": 7140
},
{
"epoch": 0.010606213226362661,
"grad_norm": 19.75,
"learning_rate": 0.0004983555815892308,
"loss": 26.4,
"step": 7160
},
{
"epoch": 0.01063583952029105,
"grad_norm": 20.875,
"learning_rate": 0.0004983506426542232,
"loss": 26.3095,
"step": 7180
},
{
"epoch": 0.010665465814219436,
"grad_norm": 23.875,
"learning_rate": 0.0004983457037192156,
"loss": 26.4476,
"step": 7200
},
{
"epoch": 0.010695092108147824,
"grad_norm": 20.125,
"learning_rate": 0.0004983407647842081,
"loss": 26.4167,
"step": 7220
},
{
"epoch": 0.01072471840207621,
"grad_norm": 25.125,
"learning_rate": 0.0004983358258492005,
"loss": 26.3873,
"step": 7240
},
{
"epoch": 0.010754344696004598,
"grad_norm": 20.0,
"learning_rate": 0.000498330886914193,
"loss": 26.2724,
"step": 7260
},
{
"epoch": 0.010783970989932985,
"grad_norm": 20.5,
"learning_rate": 0.0004983259479791854,
"loss": 26.3225,
"step": 7280
},
{
"epoch": 0.010813597283861373,
"grad_norm": 31.625,
"learning_rate": 0.0004983210090441779,
"loss": 26.2387,
"step": 7300
},
{
"epoch": 0.01084322357778976,
"grad_norm": 25.25,
"learning_rate": 0.0004983160701091703,
"loss": 26.2738,
"step": 7320
},
{
"epoch": 0.010872849871718148,
"grad_norm": 22.125,
"learning_rate": 0.0004983111311741627,
"loss": 26.2851,
"step": 7340
},
{
"epoch": 0.010902476165646534,
"grad_norm": 23.875,
"learning_rate": 0.000498306192239155,
"loss": 26.1881,
"step": 7360
},
{
"epoch": 0.010932102459574922,
"grad_norm": 21.625,
"learning_rate": 0.0004983012533041476,
"loss": 26.1651,
"step": 7380
},
{
"epoch": 0.010961728753503309,
"grad_norm": 21.0,
"learning_rate": 0.00049829631436914,
"loss": 26.2036,
"step": 7400
},
{
"epoch": 0.010991355047431697,
"grad_norm": 20.25,
"learning_rate": 0.0004982913754341323,
"loss": 26.1508,
"step": 7420
},
{
"epoch": 0.011020981341360083,
"grad_norm": 21.375,
"learning_rate": 0.0004982864364991248,
"loss": 26.2561,
"step": 7440
},
{
"epoch": 0.011050607635288472,
"grad_norm": 20.75,
"learning_rate": 0.0004982814975641172,
"loss": 26.0361,
"step": 7460
},
{
"epoch": 0.011080233929216858,
"grad_norm": 18.875,
"learning_rate": 0.0004982765586291097,
"loss": 26.0209,
"step": 7480
},
{
"epoch": 0.011109860223145246,
"grad_norm": 21.5,
"learning_rate": 0.0004982716196941021,
"loss": 26.0895,
"step": 7500
},
{
"epoch": 0.011139486517073633,
"grad_norm": 18.5,
"learning_rate": 0.0004982666807590946,
"loss": 26.0248,
"step": 7520
},
{
"epoch": 0.011169112811002021,
"grad_norm": 21.125,
"learning_rate": 0.000498261741824087,
"loss": 25.9958,
"step": 7540
},
{
"epoch": 0.011198739104930407,
"grad_norm": 25.875,
"learning_rate": 0.0004982568028890794,
"loss": 26.1197,
"step": 7560
},
{
"epoch": 0.011228365398858796,
"grad_norm": 20.875,
"learning_rate": 0.0004982518639540718,
"loss": 26.0943,
"step": 7580
},
{
"epoch": 0.011257991692787182,
"grad_norm": 21.875,
"learning_rate": 0.0004982469250190643,
"loss": 25.9681,
"step": 7600
},
{
"epoch": 0.01128761798671557,
"grad_norm": 20.625,
"learning_rate": 0.0004982419860840567,
"loss": 25.9861,
"step": 7620
},
{
"epoch": 0.011317244280643957,
"grad_norm": 21.75,
"learning_rate": 0.0004982370471490492,
"loss": 26.004,
"step": 7640
},
{
"epoch": 0.011346870574572345,
"grad_norm": 20.0,
"learning_rate": 0.0004982321082140416,
"loss": 25.96,
"step": 7660
},
{
"epoch": 0.011376496868500731,
"grad_norm": 24.25,
"learning_rate": 0.0004982271692790341,
"loss": 26.0032,
"step": 7680
},
{
"epoch": 0.01140612316242912,
"grad_norm": 23.0,
"learning_rate": 0.0004982222303440265,
"loss": 25.9794,
"step": 7700
},
{
"epoch": 0.011435749456357506,
"grad_norm": 26.375,
"learning_rate": 0.0004982172914090189,
"loss": 25.8826,
"step": 7720
},
{
"epoch": 0.011465375750285894,
"grad_norm": 19.125,
"learning_rate": 0.0004982123524740113,
"loss": 25.8303,
"step": 7740
},
{
"epoch": 0.01149500204421428,
"grad_norm": 20.75,
"learning_rate": 0.0004982074135390038,
"loss": 25.8461,
"step": 7760
},
{
"epoch": 0.011524628338142669,
"grad_norm": 20.625,
"learning_rate": 0.0004982024746039962,
"loss": 25.8709,
"step": 7780
},
{
"epoch": 0.011554254632071055,
"grad_norm": 21.875,
"learning_rate": 0.0004981975356689886,
"loss": 25.7963,
"step": 7800
},
{
"epoch": 0.011583880925999443,
"grad_norm": 19.75,
"learning_rate": 0.0004981925967339811,
"loss": 25.7629,
"step": 7820
},
{
"epoch": 0.01161350721992783,
"grad_norm": 25.875,
"learning_rate": 0.0004981876577989735,
"loss": 25.7592,
"step": 7840
},
{
"epoch": 0.011643133513856218,
"grad_norm": 21.25,
"learning_rate": 0.000498182718863966,
"loss": 25.7715,
"step": 7860
},
{
"epoch": 0.011672759807784604,
"grad_norm": 19.375,
"learning_rate": 0.0004981777799289584,
"loss": 25.7106,
"step": 7880
},
{
"epoch": 0.011702386101712993,
"grad_norm": 24.875,
"learning_rate": 0.0004981728409939509,
"loss": 25.7222,
"step": 7900
},
{
"epoch": 0.011732012395641379,
"grad_norm": 22.25,
"learning_rate": 0.0004981679020589433,
"loss": 25.7513,
"step": 7920
},
{
"epoch": 0.011761638689569767,
"grad_norm": 20.625,
"learning_rate": 0.0004981629631239357,
"loss": 25.6742,
"step": 7940
},
{
"epoch": 0.011791264983498154,
"grad_norm": 25.375,
"learning_rate": 0.000498158024188928,
"loss": 25.719,
"step": 7960
},
{
"epoch": 0.011820891277426542,
"grad_norm": 19.625,
"learning_rate": 0.0004981530852539206,
"loss": 25.6421,
"step": 7980
},
{
"epoch": 0.011850517571354928,
"grad_norm": 22.75,
"learning_rate": 0.000498148146318913,
"loss": 25.7049,
"step": 8000
},
{
"epoch": 0.011880143865283316,
"grad_norm": 22.5,
"learning_rate": 0.0004981432073839053,
"loss": 25.6611,
"step": 8020
},
{
"epoch": 0.011909770159211703,
"grad_norm": 20.875,
"learning_rate": 0.0004981382684488978,
"loss": 25.6121,
"step": 8040
},
{
"epoch": 0.011939396453140091,
"grad_norm": 20.75,
"learning_rate": 0.0004981333295138902,
"loss": 25.6006,
"step": 8060
},
{
"epoch": 0.011969022747068478,
"grad_norm": 18.625,
"learning_rate": 0.0004981283905788827,
"loss": 25.5466,
"step": 8080
},
{
"epoch": 0.011998649040996866,
"grad_norm": 21.375,
"learning_rate": 0.0004981234516438751,
"loss": 25.5528,
"step": 8100
},
{
"epoch": 0.012028275334925252,
"grad_norm": 19.125,
"learning_rate": 0.0004981185127088676,
"loss": 25.5331,
"step": 8120
},
{
"epoch": 0.01205790162885364,
"grad_norm": 21.0,
"learning_rate": 0.00049811357377386,
"loss": 25.6046,
"step": 8140
},
{
"epoch": 0.012087527922782027,
"grad_norm": 32.0,
"learning_rate": 0.0004981086348388524,
"loss": 25.4727,
"step": 8160
},
{
"epoch": 0.012117154216710415,
"grad_norm": 19.125,
"learning_rate": 0.0004981036959038448,
"loss": 25.4483,
"step": 8180
},
{
"epoch": 0.012146780510638801,
"grad_norm": 19.75,
"learning_rate": 0.0004980987569688373,
"loss": 25.5121,
"step": 8200
},
{
"epoch": 0.01217640680456719,
"grad_norm": 20.0,
"learning_rate": 0.0004980938180338297,
"loss": 25.4124,
"step": 8220
},
{
"epoch": 0.012206033098495576,
"grad_norm": 19.625,
"learning_rate": 0.0004980888790988222,
"loss": 25.4381,
"step": 8240
},
{
"epoch": 0.012235659392423964,
"grad_norm": 19.0,
"learning_rate": 0.0004980839401638146,
"loss": 25.4493,
"step": 8260
},
{
"epoch": 0.01226528568635235,
"grad_norm": 27.5,
"learning_rate": 0.0004980790012288071,
"loss": 25.4176,
"step": 8280
},
{
"epoch": 0.012294911980280739,
"grad_norm": 22.125,
"learning_rate": 0.0004980740622937995,
"loss": 25.443,
"step": 8300
},
{
"epoch": 0.012324538274209125,
"grad_norm": 19.25,
"learning_rate": 0.0004980691233587919,
"loss": 25.4559,
"step": 8320
},
{
"epoch": 0.012354164568137514,
"grad_norm": 22.25,
"learning_rate": 0.0004980641844237843,
"loss": 25.4706,
"step": 8340
},
{
"epoch": 0.0123837908620659,
"grad_norm": 23.625,
"learning_rate": 0.0004980592454887768,
"loss": 25.3049,
"step": 8360
},
{
"epoch": 0.012413417155994288,
"grad_norm": 19.375,
"learning_rate": 0.0004980543065537692,
"loss": 25.3767,
"step": 8380
},
{
"epoch": 0.012443043449922675,
"grad_norm": 18.5,
"learning_rate": 0.0004980493676187616,
"loss": 25.2916,
"step": 8400
},
{
"epoch": 0.012472669743851063,
"grad_norm": 16.875,
"learning_rate": 0.0004980444286837541,
"loss": 25.3168,
"step": 8420
},
{
"epoch": 0.01250229603777945,
"grad_norm": 18.375,
"learning_rate": 0.0004980394897487465,
"loss": 25.3036,
"step": 8440
},
{
"epoch": 0.012531922331707837,
"grad_norm": 19.125,
"learning_rate": 0.000498034550813739,
"loss": 25.2267,
"step": 8460
},
{
"epoch": 0.012561548625636224,
"grad_norm": 21.5,
"learning_rate": 0.0004980296118787314,
"loss": 25.2108,
"step": 8480
},
{
"epoch": 0.012591174919564612,
"grad_norm": 19.875,
"learning_rate": 0.0004980246729437239,
"loss": 25.3049,
"step": 8500
},
{
"epoch": 0.012620801213492999,
"grad_norm": 18.75,
"learning_rate": 0.0004980197340087163,
"loss": 25.1796,
"step": 8520
},
{
"epoch": 0.012650427507421387,
"grad_norm": 19.0,
"learning_rate": 0.0004980147950737087,
"loss": 25.1925,
"step": 8540
},
{
"epoch": 0.012680053801349773,
"grad_norm": 19.625,
"learning_rate": 0.000498009856138701,
"loss": 25.1745,
"step": 8560
},
{
"epoch": 0.012709680095278161,
"grad_norm": 29.375,
"learning_rate": 0.0004980049172036936,
"loss": 25.1481,
"step": 8580
},
{
"epoch": 0.012739306389206548,
"grad_norm": 19.5,
"learning_rate": 0.000497999978268686,
"loss": 25.2033,
"step": 8600
},
{
"epoch": 0.012768932683134936,
"grad_norm": 23.875,
"learning_rate": 0.0004979950393336784,
"loss": 25.2193,
"step": 8620
},
{
"epoch": 0.012798558977063322,
"grad_norm": 17.125,
"learning_rate": 0.0004979901003986708,
"loss": 25.1275,
"step": 8640
},
{
"epoch": 0.01282818527099171,
"grad_norm": 18.375,
"learning_rate": 0.0004979851614636633,
"loss": 25.0801,
"step": 8660
},
{
"epoch": 0.012857811564920097,
"grad_norm": 21.625,
"learning_rate": 0.0004979802225286557,
"loss": 25.1755,
"step": 8680
},
{
"epoch": 0.012887437858848485,
"grad_norm": 19.75,
"learning_rate": 0.0004979752835936481,
"loss": 25.0867,
"step": 8700
},
{
"epoch": 0.012917064152776872,
"grad_norm": 21.875,
"learning_rate": 0.0004979703446586406,
"loss": 25.114,
"step": 8720
},
{
"epoch": 0.01294669044670526,
"grad_norm": 20.0,
"learning_rate": 0.000497965405723633,
"loss": 24.985,
"step": 8740
},
{
"epoch": 0.012976316740633646,
"grad_norm": 20.75,
"learning_rate": 0.0004979604667886254,
"loss": 25.0454,
"step": 8760
},
{
"epoch": 0.013005943034562035,
"grad_norm": 22.375,
"learning_rate": 0.0004979555278536178,
"loss": 25.0174,
"step": 8780
},
{
"epoch": 0.013035569328490423,
"grad_norm": 21.25,
"learning_rate": 0.0004979505889186103,
"loss": 25.0277,
"step": 8800
},
{
"epoch": 0.01306519562241881,
"grad_norm": 19.5,
"learning_rate": 0.0004979456499836027,
"loss": 25.0168,
"step": 8820
},
{
"epoch": 0.013094821916347197,
"grad_norm": 19.25,
"learning_rate": 0.0004979407110485952,
"loss": 24.9805,
"step": 8840
},
{
"epoch": 0.013124448210275584,
"grad_norm": 20.75,
"learning_rate": 0.0004979357721135876,
"loss": 25.0201,
"step": 8860
},
{
"epoch": 0.013154074504203972,
"grad_norm": 20.0,
"learning_rate": 0.0004979308331785801,
"loss": 24.975,
"step": 8880
},
{
"epoch": 0.013183700798132358,
"grad_norm": 20.5,
"learning_rate": 0.0004979258942435725,
"loss": 24.9923,
"step": 8900
},
{
"epoch": 0.013213327092060747,
"grad_norm": 19.0,
"learning_rate": 0.0004979209553085649,
"loss": 24.9658,
"step": 8920
},
{
"epoch": 0.013242953385989133,
"grad_norm": 19.25,
"learning_rate": 0.0004979160163735573,
"loss": 24.9277,
"step": 8940
},
{
"epoch": 0.013272579679917521,
"grad_norm": 24.0,
"learning_rate": 0.0004979110774385498,
"loss": 24.9739,
"step": 8960
},
{
"epoch": 0.013302205973845908,
"grad_norm": 28.125,
"learning_rate": 0.0004979061385035422,
"loss": 24.9075,
"step": 8980
},
{
"epoch": 0.013331832267774296,
"grad_norm": 19.75,
"learning_rate": 0.0004979011995685347,
"loss": 24.9149,
"step": 9000
},
{
"epoch": 0.013361458561702682,
"grad_norm": 24.25,
"learning_rate": 0.0004978962606335271,
"loss": 24.7799,
"step": 9020
},
{
"epoch": 0.01339108485563107,
"grad_norm": 21.25,
"learning_rate": 0.0004978913216985195,
"loss": 24.7674,
"step": 9040
},
{
"epoch": 0.013420711149559457,
"grad_norm": 18.0,
"learning_rate": 0.000497886382763512,
"loss": 24.8117,
"step": 9060
},
{
"epoch": 0.013450337443487845,
"grad_norm": 18.5,
"learning_rate": 0.0004978814438285044,
"loss": 24.8415,
"step": 9080
},
{
"epoch": 0.013479963737416232,
"grad_norm": 29.625,
"learning_rate": 0.0004978765048934969,
"loss": 24.7777,
"step": 9100
},
{
"epoch": 0.01350959003134462,
"grad_norm": 22.375,
"learning_rate": 0.0004978715659584893,
"loss": 24.7968,
"step": 9120
},
{
"epoch": 0.013539216325273006,
"grad_norm": 21.25,
"learning_rate": 0.0004978666270234817,
"loss": 24.8259,
"step": 9140
},
{
"epoch": 0.013568842619201394,
"grad_norm": 24.125,
"learning_rate": 0.000497861688088474,
"loss": 24.8418,
"step": 9160
},
{
"epoch": 0.01359846891312978,
"grad_norm": 20.0,
"learning_rate": 0.0004978567491534666,
"loss": 24.7194,
"step": 9180
},
{
"epoch": 0.013628095207058169,
"grad_norm": 15.625,
"learning_rate": 0.000497851810218459,
"loss": 24.7353,
"step": 9200
},
{
"epoch": 0.013657721500986555,
"grad_norm": 19.625,
"learning_rate": 0.0004978468712834515,
"loss": 24.6211,
"step": 9220
},
{
"epoch": 0.013687347794914944,
"grad_norm": 20.5,
"learning_rate": 0.0004978419323484438,
"loss": 24.713,
"step": 9240
},
{
"epoch": 0.01371697408884333,
"grad_norm": 20.0,
"learning_rate": 0.0004978369934134363,
"loss": 24.6394,
"step": 9260
},
{
"epoch": 0.013746600382771718,
"grad_norm": 17.375,
"learning_rate": 0.0004978320544784287,
"loss": 24.6203,
"step": 9280
},
{
"epoch": 0.013776226676700105,
"grad_norm": 20.75,
"learning_rate": 0.0004978271155434211,
"loss": 24.6364,
"step": 9300
},
{
"epoch": 0.013805852970628493,
"grad_norm": 19.125,
"learning_rate": 0.0004978221766084135,
"loss": 24.631,
"step": 9320
},
{
"epoch": 0.01383547926455688,
"grad_norm": 17.25,
"learning_rate": 0.000497817237673406,
"loss": 24.6057,
"step": 9340
},
{
"epoch": 0.013865105558485268,
"grad_norm": 26.5,
"learning_rate": 0.0004978122987383984,
"loss": 24.6704,
"step": 9360
},
{
"epoch": 0.013894731852413654,
"grad_norm": 20.875,
"learning_rate": 0.0004978073598033908,
"loss": 24.5586,
"step": 9380
},
{
"epoch": 0.013924358146342042,
"grad_norm": 19.25,
"learning_rate": 0.0004978024208683833,
"loss": 24.617,
"step": 9400
},
{
"epoch": 0.013953984440270429,
"grad_norm": 16.25,
"learning_rate": 0.0004977974819333757,
"loss": 24.6328,
"step": 9420
},
{
"epoch": 0.013983610734198817,
"grad_norm": 17.75,
"learning_rate": 0.0004977925429983682,
"loss": 24.5858,
"step": 9440
},
{
"epoch": 0.014013237028127203,
"grad_norm": 20.625,
"learning_rate": 0.0004977876040633606,
"loss": 24.5509,
"step": 9460
},
{
"epoch": 0.014042863322055591,
"grad_norm": 17.875,
"learning_rate": 0.0004977826651283531,
"loss": 24.5998,
"step": 9480
},
{
"epoch": 0.014072489615983978,
"grad_norm": 18.25,
"learning_rate": 0.0004977777261933455,
"loss": 24.4267,
"step": 9500
},
{
"epoch": 0.014102115909912366,
"grad_norm": 17.0,
"learning_rate": 0.0004977727872583379,
"loss": 24.548,
"step": 9520
},
{
"epoch": 0.014131742203840753,
"grad_norm": 17.625,
"learning_rate": 0.0004977678483233303,
"loss": 24.5241,
"step": 9540
},
{
"epoch": 0.01416136849776914,
"grad_norm": 22.5,
"learning_rate": 0.0004977629093883228,
"loss": 24.5497,
"step": 9560
},
{
"epoch": 0.014190994791697527,
"grad_norm": 20.875,
"learning_rate": 0.0004977579704533152,
"loss": 24.4922,
"step": 9580
},
{
"epoch": 0.014220621085625915,
"grad_norm": 20.125,
"learning_rate": 0.0004977530315183077,
"loss": 24.4758,
"step": 9600
},
{
"epoch": 0.014250247379554302,
"grad_norm": 21.5,
"learning_rate": 0.0004977480925833001,
"loss": 24.4183,
"step": 9620
},
{
"epoch": 0.01427987367348269,
"grad_norm": 15.8125,
"learning_rate": 0.0004977431536482926,
"loss": 24.4069,
"step": 9640
},
{
"epoch": 0.014309499967411076,
"grad_norm": 17.625,
"learning_rate": 0.000497738214713285,
"loss": 24.3877,
"step": 9660
},
{
"epoch": 0.014339126261339465,
"grad_norm": 17.5,
"learning_rate": 0.0004977332757782774,
"loss": 24.4781,
"step": 9680
},
{
"epoch": 0.014368752555267851,
"grad_norm": 16.25,
"learning_rate": 0.0004977283368432699,
"loss": 24.3834,
"step": 9700
},
{
"epoch": 0.01439837884919624,
"grad_norm": 19.0,
"learning_rate": 0.0004977233979082623,
"loss": 24.3847,
"step": 9720
},
{
"epoch": 0.014428005143124626,
"grad_norm": 17.75,
"learning_rate": 0.0004977184589732547,
"loss": 24.4303,
"step": 9740
},
{
"epoch": 0.014457631437053014,
"grad_norm": 18.125,
"learning_rate": 0.000497713520038247,
"loss": 24.3612,
"step": 9760
},
{
"epoch": 0.0144872577309814,
"grad_norm": 18.625,
"learning_rate": 0.0004977085811032396,
"loss": 24.3586,
"step": 9780
},
{
"epoch": 0.014516884024909789,
"grad_norm": 19.875,
"learning_rate": 0.000497703642168232,
"loss": 24.3231,
"step": 9800
},
{
"epoch": 0.014546510318838175,
"grad_norm": 18.0,
"learning_rate": 0.0004976987032332245,
"loss": 24.3263,
"step": 9820
},
{
"epoch": 0.014576136612766563,
"grad_norm": 23.125,
"learning_rate": 0.0004976937642982168,
"loss": 24.2935,
"step": 9840
},
{
"epoch": 0.01460576290669495,
"grad_norm": 18.25,
"learning_rate": 0.0004976888253632093,
"loss": 24.244,
"step": 9860
},
{
"epoch": 0.014635389200623338,
"grad_norm": 17.125,
"learning_rate": 0.0004976838864282017,
"loss": 24.2778,
"step": 9880
},
{
"epoch": 0.014665015494551724,
"grad_norm": 17.875,
"learning_rate": 0.0004976789474931941,
"loss": 24.3038,
"step": 9900
},
{
"epoch": 0.014694641788480112,
"grad_norm": 16.875,
"learning_rate": 0.0004976740085581865,
"loss": 24.1973,
"step": 9920
},
{
"epoch": 0.014724268082408499,
"grad_norm": 17.25,
"learning_rate": 0.000497669069623179,
"loss": 24.2236,
"step": 9940
},
{
"epoch": 0.014753894376336887,
"grad_norm": 17.875,
"learning_rate": 0.0004976641306881714,
"loss": 24.2274,
"step": 9960
},
{
"epoch": 0.014783520670265273,
"grad_norm": 20.875,
"learning_rate": 0.0004976591917531639,
"loss": 24.3033,
"step": 9980
},
{
"epoch": 0.014813146964193662,
"grad_norm": 19.5,
"learning_rate": 0.0004976542528181563,
"loss": 24.1171,
"step": 10000
},
{
"epoch": 0.014842773258122048,
"grad_norm": 19.625,
"learning_rate": 0.0004976493138831488,
"loss": 24.1736,
"step": 10020
},
{
"epoch": 0.014872399552050436,
"grad_norm": 18.625,
"learning_rate": 0.0004976443749481412,
"loss": 24.2905,
"step": 10040
},
{
"epoch": 0.014902025845978823,
"grad_norm": 18.375,
"learning_rate": 0.0004976394360131336,
"loss": 24.1324,
"step": 10060
},
{
"epoch": 0.014931652139907211,
"grad_norm": 20.375,
"learning_rate": 0.0004976344970781261,
"loss": 24.1785,
"step": 10080
},
{
"epoch": 0.014961278433835597,
"grad_norm": 17.625,
"learning_rate": 0.0004976295581431185,
"loss": 24.1872,
"step": 10100
},
{
"epoch": 0.014990904727763986,
"grad_norm": 18.0,
"learning_rate": 0.0004976246192081109,
"loss": 24.1813,
"step": 10120
},
{
"epoch": 0.015020531021692372,
"grad_norm": 21.75,
"learning_rate": 0.0004976196802731033,
"loss": 24.1438,
"step": 10140
},
{
"epoch": 0.01505015731562076,
"grad_norm": 22.375,
"learning_rate": 0.0004976147413380958,
"loss": 24.1436,
"step": 10160
},
{
"epoch": 0.015079783609549147,
"grad_norm": 19.5,
"learning_rate": 0.0004976098024030882,
"loss": 24.1394,
"step": 10180
},
{
"epoch": 0.015109409903477535,
"grad_norm": 18.25,
"learning_rate": 0.0004976048634680807,
"loss": 24.0992,
"step": 10200
},
{
"epoch": 0.015139036197405921,
"grad_norm": 15.6875,
"learning_rate": 0.0004975999245330731,
"loss": 24.0464,
"step": 10220
},
{
"epoch": 0.01516866249133431,
"grad_norm": 17.875,
"learning_rate": 0.0004975949855980656,
"loss": 24.0805,
"step": 10240
},
{
"epoch": 0.015198288785262696,
"grad_norm": 16.875,
"learning_rate": 0.000497590046663058,
"loss": 24.0985,
"step": 10260
},
{
"epoch": 0.015227915079191084,
"grad_norm": 23.25,
"learning_rate": 0.0004975851077280504,
"loss": 24.042,
"step": 10280
},
{
"epoch": 0.01525754137311947,
"grad_norm": 19.375,
"learning_rate": 0.0004975801687930429,
"loss": 24.0048,
"step": 10300
},
{
"epoch": 0.015287167667047859,
"grad_norm": 19.75,
"learning_rate": 0.0004975752298580353,
"loss": 24.0158,
"step": 10320
},
{
"epoch": 0.015316793960976245,
"grad_norm": 20.875,
"learning_rate": 0.0004975702909230277,
"loss": 23.9875,
"step": 10340
},
{
"epoch": 0.015346420254904633,
"grad_norm": 20.75,
"learning_rate": 0.0004975653519880202,
"loss": 24.055,
"step": 10360
},
{
"epoch": 0.01537604654883302,
"grad_norm": 16.125,
"learning_rate": 0.0004975604130530126,
"loss": 23.9887,
"step": 10380
},
{
"epoch": 0.015405672842761408,
"grad_norm": 17.25,
"learning_rate": 0.000497555474118005,
"loss": 24.0268,
"step": 10400
},
{
"epoch": 0.015435299136689794,
"grad_norm": 18.125,
"learning_rate": 0.0004975505351829975,
"loss": 24.0453,
"step": 10420
},
{
"epoch": 0.015464925430618183,
"grad_norm": 21.625,
"learning_rate": 0.0004975455962479898,
"loss": 24.0189,
"step": 10440
},
{
"epoch": 0.015494551724546569,
"grad_norm": 15.625,
"learning_rate": 0.0004975406573129823,
"loss": 23.9409,
"step": 10460
},
{
"epoch": 0.015524178018474957,
"grad_norm": 18.0,
"learning_rate": 0.0004975357183779747,
"loss": 23.8996,
"step": 10480
},
{
"epoch": 0.015553804312403344,
"grad_norm": 21.875,
"learning_rate": 0.0004975307794429671,
"loss": 24.0183,
"step": 10500
},
{
"epoch": 0.015583430606331732,
"grad_norm": 19.0,
"learning_rate": 0.0004975258405079595,
"loss": 23.9392,
"step": 10520
},
{
"epoch": 0.015613056900260118,
"grad_norm": 16.375,
"learning_rate": 0.000497520901572952,
"loss": 23.9131,
"step": 10540
},
{
"epoch": 0.015642683194188507,
"grad_norm": 17.5,
"learning_rate": 0.0004975159626379444,
"loss": 23.8963,
"step": 10560
},
{
"epoch": 0.015672309488116893,
"grad_norm": 16.625,
"learning_rate": 0.0004975110237029369,
"loss": 23.9361,
"step": 10580
},
{
"epoch": 0.01570193578204528,
"grad_norm": 18.25,
"learning_rate": 0.0004975060847679293,
"loss": 23.9129,
"step": 10600
},
{
"epoch": 0.01573156207597367,
"grad_norm": 19.75,
"learning_rate": 0.0004975011458329218,
"loss": 23.8795,
"step": 10620
},
{
"epoch": 0.015761188369902056,
"grad_norm": 14.8125,
"learning_rate": 0.0004974962068979142,
"loss": 23.8412,
"step": 10640
},
{
"epoch": 0.015790814663830442,
"grad_norm": 19.625,
"learning_rate": 0.0004974912679629066,
"loss": 23.8545,
"step": 10660
},
{
"epoch": 0.01582044095775883,
"grad_norm": 17.875,
"learning_rate": 0.0004974863290278991,
"loss": 23.8848,
"step": 10680
},
{
"epoch": 0.01585006725168722,
"grad_norm": 18.125,
"learning_rate": 0.0004974813900928915,
"loss": 23.7463,
"step": 10700
},
{
"epoch": 0.015879693545615605,
"grad_norm": 17.25,
"learning_rate": 0.0004974764511578839,
"loss": 23.8657,
"step": 10720
},
{
"epoch": 0.01590931983954399,
"grad_norm": 17.875,
"learning_rate": 0.0004974715122228763,
"loss": 23.7865,
"step": 10740
},
{
"epoch": 0.015938946133472378,
"grad_norm": 18.875,
"learning_rate": 0.0004974665732878688,
"loss": 23.7971,
"step": 10760
},
{
"epoch": 0.015968572427400768,
"grad_norm": 19.125,
"learning_rate": 0.0004974616343528612,
"loss": 23.8342,
"step": 10780
},
{
"epoch": 0.015998198721329154,
"grad_norm": 16.75,
"learning_rate": 0.0004974566954178537,
"loss": 23.7571,
"step": 10800
},
{
"epoch": 0.01602782501525754,
"grad_norm": 16.75,
"learning_rate": 0.0004974517564828461,
"loss": 23.8034,
"step": 10820
},
{
"epoch": 0.016057451309185927,
"grad_norm": 17.25,
"learning_rate": 0.0004974468175478386,
"loss": 23.7763,
"step": 10840
},
{
"epoch": 0.016087077603114317,
"grad_norm": 18.25,
"learning_rate": 0.000497441878612831,
"loss": 23.803,
"step": 10860
},
{
"epoch": 0.016116703897042704,
"grad_norm": 20.875,
"learning_rate": 0.0004974369396778234,
"loss": 23.7222,
"step": 10880
},
{
"epoch": 0.01614633019097109,
"grad_norm": 20.5,
"learning_rate": 0.0004974320007428159,
"loss": 23.6994,
"step": 10900
},
{
"epoch": 0.016175956484899476,
"grad_norm": 15.3125,
"learning_rate": 0.0004974270618078083,
"loss": 23.6471,
"step": 10920
},
{
"epoch": 0.016205582778827866,
"grad_norm": 15.5,
"learning_rate": 0.0004974221228728007,
"loss": 23.7271,
"step": 10940
},
{
"epoch": 0.016235209072756253,
"grad_norm": 17.5,
"learning_rate": 0.0004974171839377932,
"loss": 23.6869,
"step": 10960
},
{
"epoch": 0.01626483536668464,
"grad_norm": 16.75,
"learning_rate": 0.0004974122450027856,
"loss": 23.6976,
"step": 10980
},
{
"epoch": 0.016294461660613026,
"grad_norm": 19.0,
"learning_rate": 0.0004974073060677781,
"loss": 23.6657,
"step": 11000
},
{
"epoch": 0.016324087954541416,
"grad_norm": 18.875,
"learning_rate": 0.0004974023671327705,
"loss": 23.6059,
"step": 11020
},
{
"epoch": 0.016353714248469802,
"grad_norm": 16.875,
"learning_rate": 0.0004973974281977628,
"loss": 23.6203,
"step": 11040
},
{
"epoch": 0.01638334054239819,
"grad_norm": 26.0,
"learning_rate": 0.0004973924892627553,
"loss": 23.5207,
"step": 11060
},
{
"epoch": 0.016412966836326575,
"grad_norm": 18.25,
"learning_rate": 0.0004973875503277477,
"loss": 23.711,
"step": 11080
},
{
"epoch": 0.016442593130254965,
"grad_norm": 17.125,
"learning_rate": 0.0004973826113927401,
"loss": 23.5764,
"step": 11100
},
{
"epoch": 0.01647221942418335,
"grad_norm": 18.125,
"learning_rate": 0.0004973776724577325,
"loss": 23.6693,
"step": 11120
},
{
"epoch": 0.016501845718111738,
"grad_norm": 20.875,
"learning_rate": 0.000497372733522725,
"loss": 23.5375,
"step": 11140
},
{
"epoch": 0.016531472012040124,
"grad_norm": 14.75,
"learning_rate": 0.0004973677945877174,
"loss": 23.5473,
"step": 11160
},
{
"epoch": 0.016561098305968514,
"grad_norm": 15.625,
"learning_rate": 0.0004973628556527099,
"loss": 23.5889,
"step": 11180
},
{
"epoch": 0.0165907245998969,
"grad_norm": 16.875,
"learning_rate": 0.0004973579167177023,
"loss": 23.5879,
"step": 11200
},
{
"epoch": 0.016620350893825287,
"grad_norm": 18.375,
"learning_rate": 0.0004973529777826948,
"loss": 23.4974,
"step": 11220
},
{
"epoch": 0.016649977187753674,
"grad_norm": 15.625,
"learning_rate": 0.0004973480388476872,
"loss": 23.4771,
"step": 11240
},
{
"epoch": 0.016679603481682063,
"grad_norm": 17.5,
"learning_rate": 0.0004973430999126796,
"loss": 23.4806,
"step": 11260
},
{
"epoch": 0.01670922977561045,
"grad_norm": 19.75,
"learning_rate": 0.0004973381609776721,
"loss": 23.651,
"step": 11280
},
{
"epoch": 0.016738856069538836,
"grad_norm": 16.625,
"learning_rate": 0.0004973332220426645,
"loss": 23.5367,
"step": 11300
},
{
"epoch": 0.016768482363467223,
"grad_norm": 19.875,
"learning_rate": 0.0004973282831076569,
"loss": 23.5171,
"step": 11320
},
{
"epoch": 0.016798108657395613,
"grad_norm": 17.125,
"learning_rate": 0.0004973233441726494,
"loss": 23.4766,
"step": 11340
},
{
"epoch": 0.016827734951324,
"grad_norm": 15.3125,
"learning_rate": 0.0004973184052376418,
"loss": 23.4622,
"step": 11360
},
{
"epoch": 0.016857361245252386,
"grad_norm": 19.375,
"learning_rate": 0.0004973134663026343,
"loss": 23.5135,
"step": 11380
},
{
"epoch": 0.016886987539180772,
"grad_norm": 17.625,
"learning_rate": 0.0004973085273676267,
"loss": 23.485,
"step": 11400
},
{
"epoch": 0.016916613833109162,
"grad_norm": 17.5,
"learning_rate": 0.0004973035884326191,
"loss": 23.4218,
"step": 11420
},
{
"epoch": 0.01694624012703755,
"grad_norm": 16.375,
"learning_rate": 0.0004972986494976116,
"loss": 23.4405,
"step": 11440
},
{
"epoch": 0.016975866420965935,
"grad_norm": 21.125,
"learning_rate": 0.000497293710562604,
"loss": 23.4308,
"step": 11460
},
{
"epoch": 0.01700549271489432,
"grad_norm": 18.875,
"learning_rate": 0.0004972887716275964,
"loss": 23.4076,
"step": 11480
},
{
"epoch": 0.01703511900882271,
"grad_norm": 16.25,
"learning_rate": 0.0004972838326925889,
"loss": 23.4027,
"step": 11500
},
{
"epoch": 0.017064745302751098,
"grad_norm": 15.9375,
"learning_rate": 0.0004972788937575813,
"loss": 23.3797,
"step": 11520
},
{
"epoch": 0.017094371596679484,
"grad_norm": 17.875,
"learning_rate": 0.0004972739548225737,
"loss": 23.403,
"step": 11540
},
{
"epoch": 0.01712399789060787,
"grad_norm": 20.25,
"learning_rate": 0.0004972690158875662,
"loss": 23.435,
"step": 11560
},
{
"epoch": 0.01715362418453626,
"grad_norm": 17.875,
"learning_rate": 0.0004972640769525586,
"loss": 23.3429,
"step": 11580
},
{
"epoch": 0.017183250478464647,
"grad_norm": 19.0,
"learning_rate": 0.0004972591380175511,
"loss": 23.3458,
"step": 11600
},
{
"epoch": 0.017212876772393033,
"grad_norm": 14.3125,
"learning_rate": 0.0004972541990825435,
"loss": 23.3765,
"step": 11620
},
{
"epoch": 0.01724250306632142,
"grad_norm": 17.375,
"learning_rate": 0.0004972492601475358,
"loss": 23.3812,
"step": 11640
},
{
"epoch": 0.01727212936024981,
"grad_norm": 17.375,
"learning_rate": 0.0004972443212125283,
"loss": 23.3419,
"step": 11660
},
{
"epoch": 0.017301755654178196,
"grad_norm": 15.9375,
"learning_rate": 0.0004972393822775207,
"loss": 23.1804,
"step": 11680
},
{
"epoch": 0.017331381948106583,
"grad_norm": 15.125,
"learning_rate": 0.0004972344433425131,
"loss": 23.2947,
"step": 11700
},
{
"epoch": 0.01736100824203497,
"grad_norm": 17.0,
"learning_rate": 0.0004972295044075056,
"loss": 23.402,
"step": 11720
},
{
"epoch": 0.01739063453596336,
"grad_norm": 18.5,
"learning_rate": 0.000497224565472498,
"loss": 23.2933,
"step": 11740
},
{
"epoch": 0.017420260829891746,
"grad_norm": 17.125,
"learning_rate": 0.0004972196265374904,
"loss": 23.296,
"step": 11760
},
{
"epoch": 0.017449887123820132,
"grad_norm": 15.5625,
"learning_rate": 0.0004972146876024829,
"loss": 23.2247,
"step": 11780
},
{
"epoch": 0.017479513417748522,
"grad_norm": 15.375,
"learning_rate": 0.0004972097486674753,
"loss": 23.1945,
"step": 11800
},
{
"epoch": 0.01750913971167691,
"grad_norm": 17.25,
"learning_rate": 0.0004972048097324678,
"loss": 23.2879,
"step": 11820
},
{
"epoch": 0.017538766005605295,
"grad_norm": 16.5,
"learning_rate": 0.0004971998707974602,
"loss": 23.2503,
"step": 11840
},
{
"epoch": 0.01756839229953368,
"grad_norm": 16.875,
"learning_rate": 0.0004971949318624526,
"loss": 23.2298,
"step": 11860
},
{
"epoch": 0.01759801859346207,
"grad_norm": 16.5,
"learning_rate": 0.0004971899929274451,
"loss": 23.2478,
"step": 11880
},
{
"epoch": 0.017627644887390458,
"grad_norm": 16.875,
"learning_rate": 0.0004971850539924375,
"loss": 23.2439,
"step": 11900
},
{
"epoch": 0.017657271181318844,
"grad_norm": 16.75,
"learning_rate": 0.0004971801150574299,
"loss": 23.2426,
"step": 11920
},
{
"epoch": 0.01768689747524723,
"grad_norm": 15.5,
"learning_rate": 0.0004971751761224224,
"loss": 23.2587,
"step": 11940
},
{
"epoch": 0.01771652376917562,
"grad_norm": 15.6875,
"learning_rate": 0.0004971702371874148,
"loss": 23.2458,
"step": 11960
},
{
"epoch": 0.017746150063104007,
"grad_norm": 17.875,
"learning_rate": 0.0004971652982524073,
"loss": 23.1944,
"step": 11980
},
{
"epoch": 0.017775776357032393,
"grad_norm": 15.5,
"learning_rate": 0.0004971603593173997,
"loss": 23.2169,
"step": 12000
}
],
"logging_steps": 20,
"max_steps": 2025228,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.255909218322743e+18,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}