{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 850,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05917159763313609,
"grad_norm": 1.1964364051818848,
"learning_rate": 8e-05,
"loss": 2.1543,
"step": 5
},
{
"epoch": 0.11834319526627218,
"grad_norm": 1.1647157669067383,
"learning_rate": 0.00018,
"loss": 1.6346,
"step": 10
},
{
"epoch": 0.17751479289940827,
"grad_norm": 0.6181473135948181,
"learning_rate": 0.00019904761904761907,
"loss": 0.9978,
"step": 15
},
{
"epoch": 0.23668639053254437,
"grad_norm": 0.5694869160652161,
"learning_rate": 0.00019785714285714288,
"loss": 0.9807,
"step": 20
},
{
"epoch": 0.2958579881656805,
"grad_norm": 0.6708640456199646,
"learning_rate": 0.00019666666666666666,
"loss": 1.2485,
"step": 25
},
{
"epoch": 0.35502958579881655,
"grad_norm": 0.7021521925926208,
"learning_rate": 0.00019547619047619047,
"loss": 1.1107,
"step": 30
},
{
"epoch": 0.41420118343195267,
"grad_norm": 0.5067740082740784,
"learning_rate": 0.0001942857142857143,
"loss": 1.1293,
"step": 35
},
{
"epoch": 0.47337278106508873,
"grad_norm": 0.5455656051635742,
"learning_rate": 0.0001930952380952381,
"loss": 1.1371,
"step": 40
},
{
"epoch": 0.5325443786982249,
"grad_norm": 0.6190764307975769,
"learning_rate": 0.00019190476190476192,
"loss": 1.0131,
"step": 45
},
{
"epoch": 0.591715976331361,
"grad_norm": 0.544291615486145,
"learning_rate": 0.00019071428571428573,
"loss": 0.969,
"step": 50
},
{
"epoch": 0.650887573964497,
"grad_norm": 0.600204348564148,
"learning_rate": 0.0001895238095238095,
"loss": 1.0509,
"step": 55
},
{
"epoch": 0.7100591715976331,
"grad_norm": 0.5397897958755493,
"learning_rate": 0.00018833333333333335,
"loss": 0.9839,
"step": 60
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.571107804775238,
"learning_rate": 0.00018714285714285716,
"loss": 0.9281,
"step": 65
},
{
"epoch": 0.8284023668639053,
"grad_norm": 0.5789744257926941,
"learning_rate": 0.00018595238095238097,
"loss": 0.9543,
"step": 70
},
{
"epoch": 0.8875739644970414,
"grad_norm": 0.6334110498428345,
"learning_rate": 0.00018476190476190478,
"loss": 1.0625,
"step": 75
},
{
"epoch": 0.9467455621301775,
"grad_norm": 0.5225853323936462,
"learning_rate": 0.00018357142857142858,
"loss": 0.9734,
"step": 80
},
{
"epoch": 1.0,
"grad_norm": 0.7854479551315308,
"learning_rate": 0.0001823809523809524,
"loss": 0.9452,
"step": 85
},
{
"epoch": 1.0591715976331362,
"grad_norm": 0.47304806113243103,
"learning_rate": 0.0001811904761904762,
"loss": 0.8473,
"step": 90
},
{
"epoch": 1.1183431952662721,
"grad_norm": 0.6840282678604126,
"learning_rate": 0.00018,
"loss": 0.6189,
"step": 95
},
{
"epoch": 1.1775147928994083,
"grad_norm": 0.5173171162605286,
"learning_rate": 0.00017880952380952382,
"loss": 0.9301,
"step": 100
},
{
"epoch": 1.2366863905325443,
"grad_norm": 0.4892285466194153,
"learning_rate": 0.00017761904761904763,
"loss": 0.7659,
"step": 105
},
{
"epoch": 1.2958579881656804,
"grad_norm": 0.5754849910736084,
"learning_rate": 0.00017642857142857144,
"loss": 0.7756,
"step": 110
},
{
"epoch": 1.3550295857988166,
"grad_norm": 0.47277289628982544,
"learning_rate": 0.00017523809523809525,
"loss": 0.7844,
"step": 115
},
{
"epoch": 1.4142011834319526,
"grad_norm": 0.8198840022087097,
"learning_rate": 0.00017404761904761906,
"loss": 0.8969,
"step": 120
},
{
"epoch": 1.4733727810650887,
"grad_norm": 0.5040334463119507,
"learning_rate": 0.00017285714285714287,
"loss": 0.8727,
"step": 125
},
{
"epoch": 1.532544378698225,
"grad_norm": 0.5382494926452637,
"learning_rate": 0.00017166666666666667,
"loss": 0.7677,
"step": 130
},
{
"epoch": 1.5917159763313609,
"grad_norm": 0.631537914276123,
"learning_rate": 0.00017047619047619048,
"loss": 0.8364,
"step": 135
},
{
"epoch": 1.650887573964497,
"grad_norm": 0.5718739628791809,
"learning_rate": 0.0001692857142857143,
"loss": 0.7447,
"step": 140
},
{
"epoch": 1.7100591715976332,
"grad_norm": 0.557224452495575,
"learning_rate": 0.0001680952380952381,
"loss": 0.8828,
"step": 145
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.6206871271133423,
"learning_rate": 0.0001669047619047619,
"loss": 0.5674,
"step": 150
},
{
"epoch": 1.8284023668639053,
"grad_norm": 0.6297276616096497,
"learning_rate": 0.00016571428571428575,
"loss": 0.7956,
"step": 155
},
{
"epoch": 1.8875739644970415,
"grad_norm": 0.6178033351898193,
"learning_rate": 0.00016452380952380953,
"loss": 0.5527,
"step": 160
},
{
"epoch": 1.9467455621301775,
"grad_norm": 0.6269710063934326,
"learning_rate": 0.00016333333333333334,
"loss": 0.9635,
"step": 165
},
{
"epoch": 2.0,
"grad_norm": 0.7290076613426208,
"learning_rate": 0.00016214285714285715,
"loss": 0.8048,
"step": 170
},
{
"epoch": 2.059171597633136,
"grad_norm": 0.5376546382904053,
"learning_rate": 0.00016095238095238096,
"loss": 0.602,
"step": 175
},
{
"epoch": 2.1183431952662723,
"grad_norm": 0.720078706741333,
"learning_rate": 0.0001597619047619048,
"loss": 0.5738,
"step": 180
},
{
"epoch": 2.1775147928994083,
"grad_norm": 0.5647716522216797,
"learning_rate": 0.00015857142857142857,
"loss": 0.5445,
"step": 185
},
{
"epoch": 2.2366863905325443,
"grad_norm": 0.7397224307060242,
"learning_rate": 0.00015738095238095238,
"loss": 0.5383,
"step": 190
},
{
"epoch": 2.2958579881656807,
"grad_norm": 0.8834079504013062,
"learning_rate": 0.0001561904761904762,
"loss": 0.6575,
"step": 195
},
{
"epoch": 2.3550295857988166,
"grad_norm": 0.6497870683670044,
"learning_rate": 0.000155,
"loss": 0.6677,
"step": 200
},
{
"epoch": 2.4142011834319526,
"grad_norm": 0.686392605304718,
"learning_rate": 0.00015380952380952384,
"loss": 0.493,
"step": 205
},
{
"epoch": 2.4733727810650885,
"grad_norm": 0.719688892364502,
"learning_rate": 0.00015261904761904762,
"loss": 0.5356,
"step": 210
},
{
"epoch": 2.532544378698225,
"grad_norm": 0.6884217262268066,
"learning_rate": 0.00015142857142857143,
"loss": 0.7167,
"step": 215
},
{
"epoch": 2.591715976331361,
"grad_norm": 0.7767056822776794,
"learning_rate": 0.00015023809523809524,
"loss": 0.7346,
"step": 220
},
{
"epoch": 2.6508875739644973,
"grad_norm": 0.6508312225341797,
"learning_rate": 0.00014904761904761904,
"loss": 0.547,
"step": 225
},
{
"epoch": 2.710059171597633,
"grad_norm": 0.6159693598747253,
"learning_rate": 0.00014785714285714288,
"loss": 0.5539,
"step": 230
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.7028509378433228,
"learning_rate": 0.00014666666666666666,
"loss": 0.609,
"step": 235
},
{
"epoch": 2.828402366863905,
"grad_norm": 0.6096014976501465,
"learning_rate": 0.00014547619047619047,
"loss": 0.5913,
"step": 240
},
{
"epoch": 2.8875739644970415,
"grad_norm": 0.8518397212028503,
"learning_rate": 0.00014428571428571428,
"loss": 0.6356,
"step": 245
},
{
"epoch": 2.9467455621301775,
"grad_norm": 0.6462046504020691,
"learning_rate": 0.00014309523809523812,
"loss": 0.527,
"step": 250
},
{
"epoch": 3.0,
"grad_norm": 0.8931583762168884,
"learning_rate": 0.00014190476190476193,
"loss": 0.6505,
"step": 255
},
{
"epoch": 3.059171597633136,
"grad_norm": 0.6089041233062744,
"learning_rate": 0.00014071428571428573,
"loss": 0.4721,
"step": 260
},
{
"epoch": 3.1183431952662723,
"grad_norm": 0.9845924973487854,
"learning_rate": 0.00013952380952380952,
"loss": 0.4478,
"step": 265
},
{
"epoch": 3.1775147928994083,
"grad_norm": 0.6962316036224365,
"learning_rate": 0.00013833333333333333,
"loss": 0.3862,
"step": 270
},
{
"epoch": 3.2366863905325443,
"grad_norm": 0.6963745951652527,
"learning_rate": 0.00013714285714285716,
"loss": 0.5446,
"step": 275
},
{
"epoch": 3.2958579881656807,
"grad_norm": 0.9289587736129761,
"learning_rate": 0.00013595238095238097,
"loss": 0.4985,
"step": 280
},
{
"epoch": 3.3550295857988166,
"grad_norm": 0.7913327813148499,
"learning_rate": 0.00013476190476190478,
"loss": 0.4291,
"step": 285
},
{
"epoch": 3.4142011834319526,
"grad_norm": 0.7623841166496277,
"learning_rate": 0.00013357142857142856,
"loss": 0.4198,
"step": 290
},
{
"epoch": 3.4733727810650885,
"grad_norm": 1.1334826946258545,
"learning_rate": 0.00013238095238095237,
"loss": 0.4442,
"step": 295
},
{
"epoch": 3.532544378698225,
"grad_norm": 0.8162091374397278,
"learning_rate": 0.0001311904761904762,
"loss": 0.4249,
"step": 300
},
{
"epoch": 3.591715976331361,
"grad_norm": 0.7582007646560669,
"learning_rate": 0.00013000000000000002,
"loss": 0.4166,
"step": 305
},
{
"epoch": 3.6508875739644973,
"grad_norm": 0.8337474465370178,
"learning_rate": 0.00012880952380952382,
"loss": 0.3552,
"step": 310
},
{
"epoch": 3.710059171597633,
"grad_norm": 0.7497977018356323,
"learning_rate": 0.0001276190476190476,
"loss": 0.3778,
"step": 315
},
{
"epoch": 3.769230769230769,
"grad_norm": 0.9030293226242065,
"learning_rate": 0.00012642857142857144,
"loss": 0.4048,
"step": 320
},
{
"epoch": 3.828402366863905,
"grad_norm": 0.8548532128334045,
"learning_rate": 0.00012523809523809525,
"loss": 0.5433,
"step": 325
},
{
"epoch": 3.8875739644970415,
"grad_norm": 1.1865911483764648,
"learning_rate": 0.00012404761904761906,
"loss": 0.4465,
"step": 330
},
{
"epoch": 3.9467455621301775,
"grad_norm": 0.6329714059829712,
"learning_rate": 0.00012285714285714287,
"loss": 0.499,
"step": 335
},
{
"epoch": 4.0,
"grad_norm": 0.8335389494895935,
"learning_rate": 0.00012166666666666667,
"loss": 0.3774,
"step": 340
},
{
"epoch": 4.059171597633136,
"grad_norm": 0.7739379405975342,
"learning_rate": 0.00012047619047619047,
"loss": 0.2472,
"step": 345
},
{
"epoch": 4.118343195266272,
"grad_norm": 1.0731953382492065,
"learning_rate": 0.00011928571428571428,
"loss": 0.3058,
"step": 350
},
{
"epoch": 4.177514792899408,
"grad_norm": 1.051379680633545,
"learning_rate": 0.0001180952380952381,
"loss": 0.3446,
"step": 355
},
{
"epoch": 4.236686390532545,
"grad_norm": 0.6324198842048645,
"learning_rate": 0.00011690476190476191,
"loss": 0.2351,
"step": 360
},
{
"epoch": 4.295857988165681,
"grad_norm": 0.9921632409095764,
"learning_rate": 0.00011571428571428574,
"loss": 0.2795,
"step": 365
},
{
"epoch": 4.355029585798817,
"grad_norm": 0.9360544085502625,
"learning_rate": 0.00011452380952380952,
"loss": 0.4056,
"step": 370
},
{
"epoch": 4.414201183431953,
"grad_norm": 0.956781268119812,
"learning_rate": 0.00011333333333333334,
"loss": 0.3001,
"step": 375
},
{
"epoch": 4.4733727810650885,
"grad_norm": 1.0604465007781982,
"learning_rate": 0.00011214285714285715,
"loss": 0.3972,
"step": 380
},
{
"epoch": 4.5325443786982245,
"grad_norm": 0.8613020181655884,
"learning_rate": 0.00011095238095238096,
"loss": 0.2828,
"step": 385
},
{
"epoch": 4.591715976331361,
"grad_norm": 0.666599690914154,
"learning_rate": 0.00010976190476190478,
"loss": 0.3187,
"step": 390
},
{
"epoch": 4.650887573964497,
"grad_norm": 0.7497467398643494,
"learning_rate": 0.00010857142857142856,
"loss": 0.4278,
"step": 395
},
{
"epoch": 4.710059171597633,
"grad_norm": 0.733259916305542,
"learning_rate": 0.00010738095238095239,
"loss": 0.2384,
"step": 400
},
{
"epoch": 4.769230769230769,
"grad_norm": 0.7570552229881287,
"learning_rate": 0.0001061904761904762,
"loss": 0.3938,
"step": 405
},
{
"epoch": 4.828402366863905,
"grad_norm": 0.8109162449836731,
"learning_rate": 0.000105,
"loss": 0.2729,
"step": 410
},
{
"epoch": 4.887573964497041,
"grad_norm": 0.7985107898712158,
"learning_rate": 0.00010380952380952383,
"loss": 0.3805,
"step": 415
},
{
"epoch": 4.946745562130177,
"grad_norm": 0.704366147518158,
"learning_rate": 0.00010261904761904761,
"loss": 0.2468,
"step": 420
},
{
"epoch": 5.0,
"grad_norm": 1.054413080215454,
"learning_rate": 0.00010142857142857143,
"loss": 0.2892,
"step": 425
},
{
"epoch": 5.059171597633136,
"grad_norm": 0.7603329420089722,
"learning_rate": 0.00010023809523809524,
"loss": 0.2865,
"step": 430
},
{
"epoch": 5.118343195266272,
"grad_norm": 0.9612646102905273,
"learning_rate": 9.904761904761905e-05,
"loss": 0.2215,
"step": 435
},
{
"epoch": 5.177514792899408,
"grad_norm": 0.8669071793556213,
"learning_rate": 9.785714285714286e-05,
"loss": 0.2078,
"step": 440
},
{
"epoch": 5.236686390532545,
"grad_norm": 0.7441051006317139,
"learning_rate": 9.666666666666667e-05,
"loss": 0.233,
"step": 445
},
{
"epoch": 5.295857988165681,
"grad_norm": 0.5900620818138123,
"learning_rate": 9.547619047619049e-05,
"loss": 0.2581,
"step": 450
},
{
"epoch": 5.355029585798817,
"grad_norm": 0.990178644657135,
"learning_rate": 9.428571428571429e-05,
"loss": 0.2603,
"step": 455
},
{
"epoch": 5.414201183431953,
"grad_norm": 0.7644340991973877,
"learning_rate": 9.309523809523811e-05,
"loss": 0.2021,
"step": 460
},
{
"epoch": 5.4733727810650885,
"grad_norm": 0.5087964534759521,
"learning_rate": 9.19047619047619e-05,
"loss": 0.1126,
"step": 465
},
{
"epoch": 5.5325443786982245,
"grad_norm": 0.7896738052368164,
"learning_rate": 9.071428571428571e-05,
"loss": 0.2084,
"step": 470
},
{
"epoch": 5.591715976331361,
"grad_norm": 0.71749347448349,
"learning_rate": 8.952380952380953e-05,
"loss": 0.1618,
"step": 475
},
{
"epoch": 5.650887573964497,
"grad_norm": 0.8466284871101379,
"learning_rate": 8.833333333333333e-05,
"loss": 0.2574,
"step": 480
},
{
"epoch": 5.710059171597633,
"grad_norm": 1.0023925304412842,
"learning_rate": 8.714285714285715e-05,
"loss": 0.1985,
"step": 485
},
{
"epoch": 5.769230769230769,
"grad_norm": 0.8096638321876526,
"learning_rate": 8.595238095238096e-05,
"loss": 0.285,
"step": 490
},
{
"epoch": 5.828402366863905,
"grad_norm": 0.9154897332191467,
"learning_rate": 8.476190476190477e-05,
"loss": 0.2051,
"step": 495
},
{
"epoch": 5.887573964497041,
"grad_norm": 0.9506188035011292,
"learning_rate": 8.357142857142858e-05,
"loss": 0.2664,
"step": 500
},
{
"epoch": 5.946745562130177,
"grad_norm": 0.8935821056365967,
"learning_rate": 8.238095238095238e-05,
"loss": 0.1717,
"step": 505
},
{
"epoch": 6.0,
"grad_norm": 1.2274423837661743,
"learning_rate": 8.11904761904762e-05,
"loss": 0.2529,
"step": 510
},
{
"epoch": 6.059171597633136,
"grad_norm": 0.6341638565063477,
"learning_rate": 8e-05,
"loss": 0.1795,
"step": 515
},
{
"epoch": 6.118343195266272,
"grad_norm": 1.273710012435913,
"learning_rate": 7.880952380952382e-05,
"loss": 0.1612,
"step": 520
},
{
"epoch": 6.177514792899408,
"grad_norm": 1.065499186515808,
"learning_rate": 7.761904761904762e-05,
"loss": 0.1839,
"step": 525
},
{
"epoch": 6.236686390532545,
"grad_norm": 0.5382740497589111,
"learning_rate": 7.642857142857143e-05,
"loss": 0.1628,
"step": 530
},
{
"epoch": 6.295857988165681,
"grad_norm": 0.6181464791297913,
"learning_rate": 7.523809523809524e-05,
"loss": 0.141,
"step": 535
},
{
"epoch": 6.355029585798817,
"grad_norm": 0.7450206875801086,
"learning_rate": 7.404761904761905e-05,
"loss": 0.1453,
"step": 540
},
{
"epoch": 6.414201183431953,
"grad_norm": 0.9426142573356628,
"learning_rate": 7.285714285714286e-05,
"loss": 0.1403,
"step": 545
},
{
"epoch": 6.4733727810650885,
"grad_norm": 0.9675353169441223,
"learning_rate": 7.166666666666667e-05,
"loss": 0.1216,
"step": 550
},
{
"epoch": 6.5325443786982245,
"grad_norm": 0.5108327269554138,
"learning_rate": 7.047619047619048e-05,
"loss": 0.1484,
"step": 555
},
{
"epoch": 6.591715976331361,
"grad_norm": 0.6549590229988098,
"learning_rate": 6.928571428571429e-05,
"loss": 0.1338,
"step": 560
},
{
"epoch": 6.650887573964497,
"grad_norm": 0.843664288520813,
"learning_rate": 6.80952380952381e-05,
"loss": 0.1586,
"step": 565
},
{
"epoch": 6.710059171597633,
"grad_norm": 0.8650611639022827,
"learning_rate": 6.69047619047619e-05,
"loss": 0.1517,
"step": 570
},
{
"epoch": 6.769230769230769,
"grad_norm": 0.7471471428871155,
"learning_rate": 6.571428571428571e-05,
"loss": 0.1966,
"step": 575
},
{
"epoch": 6.828402366863905,
"grad_norm": 0.7219163775444031,
"learning_rate": 6.452380952380954e-05,
"loss": 0.1368,
"step": 580
},
{
"epoch": 6.887573964497041,
"grad_norm": 0.911191463470459,
"learning_rate": 6.333333333333333e-05,
"loss": 0.1681,
"step": 585
},
{
"epoch": 6.946745562130177,
"grad_norm": 0.6017201542854309,
"learning_rate": 6.214285714285714e-05,
"loss": 0.1675,
"step": 590
},
{
"epoch": 7.0,
"grad_norm": 1.0746937990188599,
"learning_rate": 6.0952380952380964e-05,
"loss": 0.1944,
"step": 595
},
{
"epoch": 7.059171597633136,
"grad_norm": 0.8227368593215942,
"learning_rate": 5.9761904761904766e-05,
"loss": 0.076,
"step": 600
},
{
"epoch": 7.118343195266272,
"grad_norm": 0.6826525330543518,
"learning_rate": 5.8571428571428575e-05,
"loss": 0.1048,
"step": 605
},
{
"epoch": 7.177514792899408,
"grad_norm": 1.0038989782333374,
"learning_rate": 5.738095238095238e-05,
"loss": 0.0998,
"step": 610
},
{
"epoch": 7.236686390532545,
"grad_norm": 0.8537135720252991,
"learning_rate": 5.619047619047619e-05,
"loss": 0.1035,
"step": 615
},
{
"epoch": 7.295857988165681,
"grad_norm": 0.7067388892173767,
"learning_rate": 5.500000000000001e-05,
"loss": 0.1186,
"step": 620
},
{
"epoch": 7.355029585798817,
"grad_norm": 0.7003813982009888,
"learning_rate": 5.380952380952381e-05,
"loss": 0.0803,
"step": 625
},
{
"epoch": 7.414201183431953,
"grad_norm": 0.9923582673072815,
"learning_rate": 5.261904761904763e-05,
"loss": 0.1121,
"step": 630
},
{
"epoch": 7.4733727810650885,
"grad_norm": 0.4991615116596222,
"learning_rate": 5.142857142857143e-05,
"loss": 0.1056,
"step": 635
},
{
"epoch": 7.5325443786982245,
"grad_norm": 0.7406235337257385,
"learning_rate": 5.023809523809524e-05,
"loss": 0.1135,
"step": 640
},
{
"epoch": 7.591715976331361,
"grad_norm": 0.5871267318725586,
"learning_rate": 4.904761904761905e-05,
"loss": 0.1141,
"step": 645
},
{
"epoch": 7.650887573964497,
"grad_norm": 0.5495700836181641,
"learning_rate": 4.785714285714286e-05,
"loss": 0.07,
"step": 650
},
{
"epoch": 7.710059171597633,
"grad_norm": 0.9295830130577087,
"learning_rate": 4.666666666666667e-05,
"loss": 0.1293,
"step": 655
},
{
"epoch": 7.769230769230769,
"grad_norm": 0.9041563272476196,
"learning_rate": 4.547619047619048e-05,
"loss": 0.1034,
"step": 660
},
{
"epoch": 7.828402366863905,
"grad_norm": 0.6490697264671326,
"learning_rate": 4.428571428571428e-05,
"loss": 0.0898,
"step": 665
},
{
"epoch": 7.887573964497041,
"grad_norm": 0.5583420991897583,
"learning_rate": 4.30952380952381e-05,
"loss": 0.0891,
"step": 670
},
{
"epoch": 7.946745562130177,
"grad_norm": 0.7829737663269043,
"learning_rate": 4.190476190476191e-05,
"loss": 0.112,
"step": 675
},
{
"epoch": 8.0,
"grad_norm": 0.48943546414375305,
"learning_rate": 4.0714285714285717e-05,
"loss": 0.115,
"step": 680
},
{
"epoch": 8.059171597633137,
"grad_norm": 0.4989611506462097,
"learning_rate": 3.9523809523809526e-05,
"loss": 0.0645,
"step": 685
},
{
"epoch": 8.118343195266272,
"grad_norm": 0.5723634362220764,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.0603,
"step": 690
},
{
"epoch": 8.177514792899409,
"grad_norm": 0.5361748933792114,
"learning_rate": 3.7142857142857143e-05,
"loss": 0.0626,
"step": 695
},
{
"epoch": 8.236686390532544,
"grad_norm": 0.8497764468193054,
"learning_rate": 3.595238095238095e-05,
"loss": 0.0856,
"step": 700
},
{
"epoch": 8.29585798816568,
"grad_norm": 0.48423126339912415,
"learning_rate": 3.476190476190476e-05,
"loss": 0.0568,
"step": 705
},
{
"epoch": 8.355029585798816,
"grad_norm": 0.30722182989120483,
"learning_rate": 3.357142857142857e-05,
"loss": 0.0564,
"step": 710
},
{
"epoch": 8.414201183431953,
"grad_norm": 0.637298047542572,
"learning_rate": 3.2380952380952386e-05,
"loss": 0.0792,
"step": 715
},
{
"epoch": 8.47337278106509,
"grad_norm": 1.1620301008224487,
"learning_rate": 3.1190476190476195e-05,
"loss": 0.0848,
"step": 720
},
{
"epoch": 8.532544378698224,
"grad_norm": 0.9842550158500671,
"learning_rate": 3e-05,
"loss": 0.0695,
"step": 725
},
{
"epoch": 8.591715976331361,
"grad_norm": 0.5429280996322632,
"learning_rate": 2.880952380952381e-05,
"loss": 0.0691,
"step": 730
},
{
"epoch": 8.650887573964496,
"grad_norm": 0.4466063380241394,
"learning_rate": 2.7619047619047622e-05,
"loss": 0.0427,
"step": 735
},
{
"epoch": 8.710059171597633,
"grad_norm": 0.7048435807228088,
"learning_rate": 2.642857142857143e-05,
"loss": 0.066,
"step": 740
},
{
"epoch": 8.76923076923077,
"grad_norm": 0.71544349193573,
"learning_rate": 2.523809523809524e-05,
"loss": 0.048,
"step": 745
},
{
"epoch": 8.828402366863905,
"grad_norm": 0.27939409017562866,
"learning_rate": 2.404761904761905e-05,
"loss": 0.047,
"step": 750
},
{
"epoch": 8.887573964497042,
"grad_norm": 0.30130356550216675,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.0715,
"step": 755
},
{
"epoch": 8.946745562130177,
"grad_norm": 0.6010681986808777,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.0478,
"step": 760
},
{
"epoch": 9.0,
"grad_norm": 0.5269991159439087,
"learning_rate": 2.0476190476190476e-05,
"loss": 0.048,
"step": 765
},
{
"epoch": 9.059171597633137,
"grad_norm": 0.30882084369659424,
"learning_rate": 1.928571428571429e-05,
"loss": 0.0508,
"step": 770
},
{
"epoch": 9.118343195266272,
"grad_norm": 0.3261071741580963,
"learning_rate": 1.8095238095238094e-05,
"loss": 0.0439,
"step": 775
},
{
"epoch": 9.177514792899409,
"grad_norm": 0.33494994044303894,
"learning_rate": 1.6904761904761906e-05,
"loss": 0.0394,
"step": 780
},
{
"epoch": 9.236686390532544,
"grad_norm": 0.470735639333725,
"learning_rate": 1.5714285714285715e-05,
"loss": 0.0465,
"step": 785
},
{
"epoch": 9.29585798816568,
"grad_norm": 0.4816909432411194,
"learning_rate": 1.4523809523809526e-05,
"loss": 0.0461,
"step": 790
},
{
"epoch": 9.355029585798816,
"grad_norm": 0.332380086183548,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0438,
"step": 795
},
{
"epoch": 9.414201183431953,
"grad_norm": 1.0364826917648315,
"learning_rate": 1.2142857142857144e-05,
"loss": 0.0575,
"step": 800
},
{
"epoch": 9.47337278106509,
"grad_norm": 0.31686267256736755,
"learning_rate": 1.0952380952380953e-05,
"loss": 0.0376,
"step": 805
},
{
"epoch": 9.532544378698224,
"grad_norm": 0.19316697120666504,
"learning_rate": 9.761904761904762e-06,
"loss": 0.0546,
"step": 810
},
{
"epoch": 9.591715976331361,
"grad_norm": 0.3316756784915924,
"learning_rate": 8.571428571428573e-06,
"loss": 0.0303,
"step": 815
},
{
"epoch": 9.650887573964496,
"grad_norm": 0.38312825560569763,
"learning_rate": 7.380952380952382e-06,
"loss": 0.0407,
"step": 820
},
{
"epoch": 9.710059171597633,
"grad_norm": 0.1672104001045227,
"learning_rate": 6.190476190476191e-06,
"loss": 0.0406,
"step": 825
},
{
"epoch": 9.76923076923077,
"grad_norm": 0.30679717659950256,
"learning_rate": 5e-06,
"loss": 0.0316,
"step": 830
},
{
"epoch": 9.828402366863905,
"grad_norm": 0.3245919346809387,
"learning_rate": 3.8095238095238102e-06,
"loss": 0.0361,
"step": 835
},
{
"epoch": 9.887573964497042,
"grad_norm": 0.25631895661354065,
"learning_rate": 2.6190476190476192e-06,
"loss": 0.0351,
"step": 840
},
{
"epoch": 9.946745562130177,
"grad_norm": 0.2532467842102051,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0397,
"step": 845
},
{
"epoch": 10.0,
"grad_norm": 0.7708420157432556,
"learning_rate": 2.3809523809523814e-07,
"loss": 0.0416,
"step": 850
},
{
"epoch": 10.0,
"step": 850,
"total_flos": 5.971841440128e+16,
"train_loss": 0.3865027742876726,
"train_runtime": 1732.5271,
"train_samples_per_second": 3.902,
"train_steps_per_second": 0.491
}
],
"logging_steps": 5,
"max_steps": 850,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.971841440128e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}