{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 100,
"global_step": 880,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05714285714285714,
"grad_norm": 0.17341195046901703,
"learning_rate": 2.272727272727273e-05,
"loss": 0.9238,
"step": 5
},
{
"epoch": 0.11428571428571428,
"grad_norm": 0.151872456073761,
"learning_rate": 5.113636363636364e-05,
"loss": 0.8658,
"step": 10
},
{
"epoch": 0.17142857142857143,
"grad_norm": 0.1270148754119873,
"learning_rate": 7.954545454545455e-05,
"loss": 0.8384,
"step": 15
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.13214267790317535,
"learning_rate": 0.00010795454545454545,
"loss": 0.7931,
"step": 20
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.125133216381073,
"learning_rate": 0.00013636363636363637,
"loss": 0.7621,
"step": 25
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.1484372466802597,
"learning_rate": 0.00016477272727272727,
"loss": 0.7396,
"step": 30
},
{
"epoch": 0.4,
"grad_norm": 0.14502882957458496,
"learning_rate": 0.00019318181818181817,
"loss": 0.7783,
"step": 35
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.1474684476852417,
"learning_rate": 0.0002215909090909091,
"loss": 0.7208,
"step": 40
},
{
"epoch": 0.5142857142857142,
"grad_norm": 0.14219504594802856,
"learning_rate": 0.00025,
"loss": 0.7467,
"step": 45
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.14778761565685272,
"learning_rate": 0.0002784090909090909,
"loss": 0.6922,
"step": 50
},
{
"epoch": 0.6285714285714286,
"grad_norm": 0.1440540999174118,
"learning_rate": 0.0003068181818181818,
"loss": 0.6895,
"step": 55
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.14411456882953644,
"learning_rate": 0.00033522727272727274,
"loss": 0.7601,
"step": 60
},
{
"epoch": 0.7428571428571429,
"grad_norm": 0.1502826064825058,
"learning_rate": 0.00036363636363636367,
"loss": 0.6845,
"step": 65
},
{
"epoch": 0.8,
"grad_norm": 0.15052765607833862,
"learning_rate": 0.00039204545454545454,
"loss": 0.6665,
"step": 70
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.1469675898551941,
"learning_rate": 0.0004204545454545455,
"loss": 0.7189,
"step": 75
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.136297807097435,
"learning_rate": 0.00044886363636363635,
"loss": 0.6771,
"step": 80
},
{
"epoch": 0.9714285714285714,
"grad_norm": 0.15446212887763977,
"learning_rate": 0.0004772727272727273,
"loss": 0.6908,
"step": 85
},
{
"epoch": 1.022857142857143,
"grad_norm": 0.15397129952907562,
"learning_rate": 0.0004999980332062218,
"loss": 0.7028,
"step": 90
},
{
"epoch": 1.08,
"grad_norm": 0.15528780221939087,
"learning_rate": 0.0004999291986732823,
"loss": 0.6689,
"step": 95
},
{
"epoch": 1.1371428571428572,
"grad_norm": 0.17444616556167603,
"learning_rate": 0.0004997620553954645,
"loss": 0.6345,
"step": 100
},
{
"epoch": 1.1371428571428572,
"eval_loss": 0.6875521540641785,
"eval_runtime": 29.6981,
"eval_samples_per_second": 2.727,
"eval_steps_per_second": 2.727,
"step": 100
},
{
"epoch": 1.1942857142857144,
"grad_norm": 0.16984045505523682,
"learning_rate": 0.0004994966691179711,
"loss": 0.6399,
"step": 105
},
{
"epoch": 1.2514285714285713,
"grad_norm": 0.18500125408172607,
"learning_rate": 0.0004991331442295331,
"loss": 0.6482,
"step": 110
},
{
"epoch": 1.3085714285714285,
"grad_norm": 0.19161580502986908,
"learning_rate": 0.0004986716237213483,
"loss": 0.6554,
"step": 115
},
{
"epoch": 1.3657142857142857,
"grad_norm": 0.17785383760929108,
"learning_rate": 0.0004981122891308368,
"loss": 0.5931,
"step": 120
},
{
"epoch": 1.4228571428571428,
"grad_norm": 0.1681031882762909,
"learning_rate": 0.0004974553604702333,
"loss": 0.5979,
"step": 125
},
{
"epoch": 1.48,
"grad_norm": 0.1733732372522354,
"learning_rate": 0.0004967010961400466,
"loss": 0.6648,
"step": 130
},
{
"epoch": 1.5371428571428571,
"grad_norm": 0.1732897013425827,
"learning_rate": 0.0004958497928274184,
"loss": 0.6383,
"step": 135
},
{
"epoch": 1.5942857142857143,
"grad_norm": 0.18160653114318848,
"learning_rate": 0.000494901785389423,
"loss": 0.6114,
"step": 140
},
{
"epoch": 1.6514285714285715,
"grad_norm": 0.16878579556941986,
"learning_rate": 0.0004938574467213517,
"loss": 0.636,
"step": 145
},
{
"epoch": 1.7085714285714286,
"grad_norm": 0.16055361926555634,
"learning_rate": 0.0004927171876100363,
"loss": 0.6337,
"step": 150
},
{
"epoch": 1.7657142857142856,
"grad_norm": 0.17928151786327362,
"learning_rate": 0.0004914814565722671,
"loss": 0.61,
"step": 155
},
{
"epoch": 1.822857142857143,
"grad_norm": 0.17349475622177124,
"learning_rate": 0.0004901507396783714,
"loss": 0.656,
"step": 160
},
{
"epoch": 1.88,
"grad_norm": 0.17489437758922577,
"learning_rate": 0.0004887255603610184,
"loss": 0.6506,
"step": 165
},
{
"epoch": 1.9371428571428573,
"grad_norm": 0.18316401541233063,
"learning_rate": 0.00048720647920932994,
"loss": 0.5985,
"step": 170
},
{
"epoch": 1.9942857142857142,
"grad_norm": 0.16122691333293915,
"learning_rate": 0.0004855940937483735,
"loss": 0.6132,
"step": 175
},
{
"epoch": 2.045714285714286,
"grad_norm": 0.1657329797744751,
"learning_rate": 0.0004838890382041291,
"loss": 0.5925,
"step": 180
},
{
"epoch": 2.1028571428571428,
"grad_norm": 0.2137223184108734,
"learning_rate": 0.00048209198325401817,
"loss": 0.4924,
"step": 185
},
{
"epoch": 2.16,
"grad_norm": 0.18751586973667145,
"learning_rate": 0.0004802036357630951,
"loss": 0.5103,
"step": 190
},
{
"epoch": 2.217142857142857,
"grad_norm": 0.22524061799049377,
"learning_rate": 0.00047822473850600447,
"loss": 0.4742,
"step": 195
},
{
"epoch": 2.2742857142857145,
"grad_norm": 0.23922263085842133,
"learning_rate": 0.0004761560698748135,
"loss": 0.472,
"step": 200
},
{
"epoch": 2.2742857142857145,
"eval_loss": 0.7193492650985718,
"eval_runtime": 29.6354,
"eval_samples_per_second": 2.733,
"eval_steps_per_second": 2.733,
"step": 200
},
{
"epoch": 2.3314285714285714,
"grad_norm": 0.25317704677581787,
"learning_rate": 0.00047399844357283395,
"loss": 0.4624,
"step": 205
},
{
"epoch": 2.388571428571429,
"grad_norm": 0.22575099766254425,
"learning_rate": 0.0004717527082945554,
"loss": 0.4803,
"step": 210
},
{
"epoch": 2.4457142857142857,
"grad_norm": 0.22181181609630585,
"learning_rate": 0.0004694197473918139,
"loss": 0.4805,
"step": 215
},
{
"epoch": 2.5028571428571427,
"grad_norm": 0.2090139091014862,
"learning_rate": 0.0004670004785263289,
"loss": 0.5365,
"step": 220
},
{
"epoch": 2.56,
"grad_norm": 0.24724529683589935,
"learning_rate": 0.0004644958533087443,
"loss": 0.5005,
"step": 225
},
{
"epoch": 2.617142857142857,
"grad_norm": 0.22190535068511963,
"learning_rate": 0.0004619068569243159,
"loss": 0.4974,
"step": 230
},
{
"epoch": 2.6742857142857144,
"grad_norm": 0.21910065412521362,
"learning_rate": 0.00045923450774539243,
"loss": 0.5322,
"step": 235
},
{
"epoch": 2.7314285714285713,
"grad_norm": 0.24561282992362976,
"learning_rate": 0.0004564798569308423,
"loss": 0.5154,
"step": 240
},
{
"epoch": 2.7885714285714287,
"grad_norm": 0.22541281580924988,
"learning_rate": 0.00045364398801258396,
"loss": 0.4968,
"step": 245
},
{
"epoch": 2.8457142857142856,
"grad_norm": 0.21167173981666565,
"learning_rate": 0.000450728016469383,
"loss": 0.5049,
"step": 250
},
{
"epoch": 2.902857142857143,
"grad_norm": 0.20972833037376404,
"learning_rate": 0.0004477330892880823,
"loss": 0.5303,
"step": 255
},
{
"epoch": 2.96,
"grad_norm": 0.21511210501194,
"learning_rate": 0.0004446603845124388,
"loss": 0.5203,
"step": 260
},
{
"epoch": 3.0114285714285716,
"grad_norm": 0.20611043274402618,
"learning_rate": 0.0004415111107797445,
"loss": 0.5064,
"step": 265
},
{
"epoch": 3.0685714285714285,
"grad_norm": 0.2586754262447357,
"learning_rate": 0.0004382865068454133,
"loss": 0.3602,
"step": 270
},
{
"epoch": 3.125714285714286,
"grad_norm": 0.25579819083213806,
"learning_rate": 0.00043498784109572097,
"loss": 0.3523,
"step": 275
},
{
"epoch": 3.182857142857143,
"grad_norm": 0.2642401456832886,
"learning_rate": 0.00043161641104889003,
"loss": 0.3604,
"step": 280
},
{
"epoch": 3.24,
"grad_norm": 0.2600599527359009,
"learning_rate": 0.00042817354284471575,
"loss": 0.3497,
"step": 285
},
{
"epoch": 3.297142857142857,
"grad_norm": 0.2751370966434479,
"learning_rate": 0.00042466059072293367,
"loss": 0.3525,
"step": 290
},
{
"epoch": 3.354285714285714,
"grad_norm": 0.2760413885116577,
"learning_rate": 0.00042107893649053456,
"loss": 0.3369,
"step": 295
},
{
"epoch": 3.4114285714285715,
"grad_norm": 0.28514841198921204,
"learning_rate": 0.0004174299889782355,
"loss": 0.3499,
"step": 300
},
{
"epoch": 3.4114285714285715,
"eval_loss": 0.7963736057281494,
"eval_runtime": 29.7356,
"eval_samples_per_second": 2.724,
"eval_steps_per_second": 2.724,
"step": 300
},
{
"epoch": 3.4685714285714284,
"grad_norm": 0.28093528747558594,
"learning_rate": 0.0004137151834863213,
"loss": 0.3601,
"step": 305
},
{
"epoch": 3.525714285714286,
"grad_norm": 0.2521819472312927,
"learning_rate": 0.0004099359812200746,
"loss": 0.3768,
"step": 310
},
{
"epoch": 3.5828571428571427,
"grad_norm": 0.2599101960659027,
"learning_rate": 0.00040609386871501583,
"loss": 0.3248,
"step": 315
},
{
"epoch": 3.64,
"grad_norm": 0.277692049741745,
"learning_rate": 0.0004021903572521802,
"loss": 0.363,
"step": 320
},
{
"epoch": 3.697142857142857,
"grad_norm": 0.2820269763469696,
"learning_rate": 0.00039822698226366017,
"loss": 0.3676,
"step": 325
},
{
"epoch": 3.7542857142857144,
"grad_norm": 0.28230783343315125,
"learning_rate": 0.00039420530272864934,
"loss": 0.3556,
"step": 330
},
{
"epoch": 3.8114285714285714,
"grad_norm": 0.3273780941963196,
"learning_rate": 0.0003901269005602235,
"loss": 0.3656,
"step": 335
},
{
"epoch": 3.8685714285714283,
"grad_norm": 0.28806638717651367,
"learning_rate": 0.0003859933799831008,
"loss": 0.3499,
"step": 340
},
{
"epoch": 3.9257142857142857,
"grad_norm": 0.30954381823539734,
"learning_rate": 0.00038180636690262563,
"loss": 0.392,
"step": 345
},
{
"epoch": 3.982857142857143,
"grad_norm": 0.29088252782821655,
"learning_rate": 0.000377567508265225,
"loss": 0.3736,
"step": 350
},
{
"epoch": 4.034285714285715,
"grad_norm": 0.22520305216312408,
"learning_rate": 0.0003732784714105876,
"loss": 0.2785,
"step": 355
},
{
"epoch": 4.091428571428572,
"grad_norm": 0.3518202602863312,
"learning_rate": 0.0003689409434158224,
"loss": 0.2139,
"step": 360
},
{
"epoch": 4.148571428571429,
"grad_norm": 0.2898755967617035,
"learning_rate": 0.0003645566304318526,
"loss": 0.2234,
"step": 365
},
{
"epoch": 4.2057142857142855,
"grad_norm": 0.3240414261817932,
"learning_rate": 0.00036012725701230734,
"loss": 0.2342,
"step": 370
},
{
"epoch": 4.2628571428571425,
"grad_norm": 0.28856348991394043,
"learning_rate": 0.00035565456543517487,
"loss": 0.2117,
"step": 375
},
{
"epoch": 4.32,
"grad_norm": 0.3389490246772766,
"learning_rate": 0.0003511403150174838,
"loss": 0.2435,
"step": 380
},
{
"epoch": 4.377142857142857,
"grad_norm": 0.31090596318244934,
"learning_rate": 0.00034658628142328216,
"loss": 0.2281,
"step": 385
},
{
"epoch": 4.434285714285714,
"grad_norm": 0.3269132077693939,
"learning_rate": 0.0003419942559651863,
"loss": 0.2637,
"step": 390
},
{
"epoch": 4.491428571428571,
"grad_norm": 0.30986836552619934,
"learning_rate": 0.0003373660448997746,
"loss": 0.228,
"step": 395
},
{
"epoch": 4.548571428571429,
"grad_norm": 0.28136685490608215,
"learning_rate": 0.000332703468717103,
"loss": 0.2457,
"step": 400
},
{
"epoch": 4.548571428571429,
"eval_loss": 0.9142104983329773,
"eval_runtime": 29.7632,
"eval_samples_per_second": 2.721,
"eval_steps_per_second": 2.721,
"step": 400
},
{
"epoch": 4.605714285714286,
"grad_norm": 0.3222131133079529,
"learning_rate": 0.00032800836142462175,
"loss": 0.2238,
"step": 405
},
{
"epoch": 4.662857142857143,
"grad_norm": 0.3103245496749878,
"learning_rate": 0.0003232825698257755,
"loss": 0.2444,
"step": 410
},
{
"epoch": 4.72,
"grad_norm": 0.319525808095932,
"learning_rate": 0.00031852795279356945,
"loss": 0.2498,
"step": 415
},
{
"epoch": 4.777142857142858,
"grad_norm": 0.2778545320034027,
"learning_rate": 0.0003137463805393885,
"loss": 0.231,
"step": 420
},
{
"epoch": 4.8342857142857145,
"grad_norm": 0.3173794448375702,
"learning_rate": 0.0003089397338773569,
"loss": 0.2471,
"step": 425
},
{
"epoch": 4.8914285714285715,
"grad_norm": 0.3207133710384369,
"learning_rate": 0.00030410990348452574,
"loss": 0.2302,
"step": 430
},
{
"epoch": 4.948571428571428,
"grad_norm": 0.26637086272239685,
"learning_rate": 0.0002992587891571833,
"loss": 0.2244,
"step": 435
},
{
"epoch": 5.0,
"grad_norm": 0.48891177773475647,
"learning_rate": 0.0002943882990635759,
"loss": 0.2451,
"step": 440
},
{
"epoch": 5.057142857142857,
"grad_norm": 0.23097443580627441,
"learning_rate": 0.0002895003489933375,
"loss": 0.135,
"step": 445
},
{
"epoch": 5.114285714285714,
"grad_norm": 0.30315983295440674,
"learning_rate": 0.0002845968616039207,
"loss": 0.1345,
"step": 450
},
{
"epoch": 5.171428571428572,
"grad_norm": 0.29658541083335876,
"learning_rate": 0.0002796797656643263,
"loss": 0.147,
"step": 455
},
{
"epoch": 5.228571428571429,
"grad_norm": 0.28584039211273193,
"learning_rate": 0.00027475099529642886,
"loss": 0.1333,
"step": 460
},
{
"epoch": 5.285714285714286,
"grad_norm": 0.3031592071056366,
"learning_rate": 0.0002698124892141971,
"loss": 0.147,
"step": 465
},
{
"epoch": 5.3428571428571425,
"grad_norm": 0.3030093312263489,
"learning_rate": 0.00026486618996110777,
"loss": 0.1298,
"step": 470
},
{
"epoch": 5.4,
"grad_norm": 0.27724677324295044,
"learning_rate": 0.0002599140431460531,
"loss": 0.1406,
"step": 475
},
{
"epoch": 5.457142857142857,
"grad_norm": 0.27622610330581665,
"learning_rate": 0.00025495799667804255,
"loss": 0.1225,
"step": 480
},
{
"epoch": 5.514285714285714,
"grad_norm": 0.2955563962459564,
"learning_rate": 0.00025,
"loss": 0.1263,
"step": 485
},
{
"epoch": 5.571428571428571,
"grad_norm": 0.3078777492046356,
"learning_rate": 0.00024504200332195757,
"loss": 0.1265,
"step": 490
},
{
"epoch": 5.628571428571428,
"grad_norm": 0.33732086420059204,
"learning_rate": 0.00024008595685394692,
"loss": 0.1611,
"step": 495
},
{
"epoch": 5.685714285714286,
"grad_norm": 0.29280802607536316,
"learning_rate": 0.00023513381003889227,
"loss": 0.1229,
"step": 500
},
{
"epoch": 5.685714285714286,
"eval_loss": 1.0489529371261597,
"eval_runtime": 29.6727,
"eval_samples_per_second": 2.73,
"eval_steps_per_second": 2.73,
"step": 500
},
{
"epoch": 5.742857142857143,
"grad_norm": 0.2972449064254761,
"learning_rate": 0.00023018751078580287,
"loss": 0.138,
"step": 505
},
{
"epoch": 5.8,
"grad_norm": 0.29159605503082275,
"learning_rate": 0.00022524900470357118,
"loss": 0.1351,
"step": 510
},
{
"epoch": 5.857142857142857,
"grad_norm": 0.3165677785873413,
"learning_rate": 0.00022032023433567378,
"loss": 0.1371,
"step": 515
},
{
"epoch": 5.914285714285715,
"grad_norm": 0.3170928657054901,
"learning_rate": 0.0002154031383960793,
"loss": 0.1494,
"step": 520
},
{
"epoch": 5.9714285714285715,
"grad_norm": 0.2793048024177551,
"learning_rate": 0.0002104996510066625,
"loss": 0.139,
"step": 525
},
{
"epoch": 6.022857142857143,
"grad_norm": 0.16289132833480835,
"learning_rate": 0.00020561170093642424,
"loss": 0.1127,
"step": 530
},
{
"epoch": 6.08,
"grad_norm": 0.22523629665374756,
"learning_rate": 0.00020074121084281678,
"loss": 0.0794,
"step": 535
},
{
"epoch": 6.137142857142857,
"grad_norm": 0.301932692527771,
"learning_rate": 0.0001958900965154743,
"loss": 0.0724,
"step": 540
},
{
"epoch": 6.194285714285714,
"grad_norm": 0.21225464344024658,
"learning_rate": 0.00019106026612264316,
"loss": 0.0747,
"step": 545
},
{
"epoch": 6.251428571428572,
"grad_norm": 0.17405834794044495,
"learning_rate": 0.0001862536194606115,
"loss": 0.0638,
"step": 550
},
{
"epoch": 6.308571428571429,
"grad_norm": 0.22534868121147156,
"learning_rate": 0.00018147204720643065,
"loss": 0.0722,
"step": 555
},
{
"epoch": 6.365714285714286,
"grad_norm": 0.2658616900444031,
"learning_rate": 0.00017671743017422448,
"loss": 0.0696,
"step": 560
},
{
"epoch": 6.422857142857143,
"grad_norm": 0.27225354313850403,
"learning_rate": 0.00017199163857537826,
"loss": 0.0764,
"step": 565
},
{
"epoch": 6.48,
"grad_norm": 0.2424718737602234,
"learning_rate": 0.000167296531282897,
"loss": 0.0725,
"step": 570
},
{
"epoch": 6.537142857142857,
"grad_norm": 0.21518002450466156,
"learning_rate": 0.00016263395510022544,
"loss": 0.0688,
"step": 575
},
{
"epoch": 6.594285714285714,
"grad_norm": 0.27070116996765137,
"learning_rate": 0.00015800574403481376,
"loss": 0.0814,
"step": 580
},
{
"epoch": 6.651428571428571,
"grad_norm": 0.28266236186027527,
"learning_rate": 0.00015341371857671783,
"loss": 0.0789,
"step": 585
},
{
"epoch": 6.708571428571428,
"grad_norm": 0.2508731782436371,
"learning_rate": 0.00014885968498251623,
"loss": 0.0677,
"step": 590
},
{
"epoch": 6.765714285714286,
"grad_norm": 0.2912222146987915,
"learning_rate": 0.0001443454345648252,
"loss": 0.0755,
"step": 595
},
{
"epoch": 6.822857142857143,
"grad_norm": 0.21923169493675232,
"learning_rate": 0.00013987274298769264,
"loss": 0.0728,
"step": 600
},
{
"epoch": 6.822857142857143,
"eval_loss": 1.1974577903747559,
"eval_runtime": 29.5963,
"eval_samples_per_second": 2.737,
"eval_steps_per_second": 2.737,
"step": 600
},
{
"epoch": 6.88,
"grad_norm": 0.2651500999927521,
"learning_rate": 0.0001354433695681474,
"loss": 0.0799,
"step": 605
},
{
"epoch": 6.937142857142857,
"grad_norm": 0.24103859066963196,
"learning_rate": 0.00013105905658417755,
"loss": 0.0779,
"step": 610
},
{
"epoch": 6.994285714285715,
"grad_norm": 0.26907840371131897,
"learning_rate": 0.00012672152858941244,
"loss": 0.077,
"step": 615
},
{
"epoch": 7.045714285714285,
"grad_norm": 0.12422758340835571,
"learning_rate": 0.00012243249173477514,
"loss": 0.0527,
"step": 620
},
{
"epoch": 7.102857142857143,
"grad_norm": 0.1850104182958603,
"learning_rate": 0.00011819363309737438,
"loss": 0.0449,
"step": 625
},
{
"epoch": 7.16,
"grad_norm": 0.2082287222146988,
"learning_rate": 0.00011400662001689926,
"loss": 0.04,
"step": 630
},
{
"epoch": 7.217142857142857,
"grad_norm": 0.18963204324245453,
"learning_rate": 0.00010987309943977646,
"loss": 0.0434,
"step": 635
},
{
"epoch": 7.274285714285714,
"grad_norm": 0.20188449323177338,
"learning_rate": 0.00010579469727135068,
"loss": 0.041,
"step": 640
},
{
"epoch": 7.331428571428571,
"grad_norm": 0.19002296030521393,
"learning_rate": 0.00010177301773633993,
"loss": 0.0415,
"step": 645
},
{
"epoch": 7.388571428571429,
"grad_norm": 0.16656257212162018,
"learning_rate": 9.780964274781984e-05,
"loss": 0.0384,
"step": 650
},
{
"epoch": 7.445714285714286,
"grad_norm": 0.2054123878479004,
"learning_rate": 9.390613128498418e-05,
"loss": 0.0414,
"step": 655
},
{
"epoch": 7.502857142857143,
"grad_norm": 0.19603969156742096,
"learning_rate": 9.006401877992549e-05,
"loss": 0.04,
"step": 660
},
{
"epoch": 7.5600000000000005,
"grad_norm": 0.22474470734596252,
"learning_rate": 8.628481651367875e-05,
"loss": 0.0448,
"step": 665
},
{
"epoch": 7.617142857142857,
"grad_norm": 0.1726624220609665,
"learning_rate": 8.257001102176459e-05,
"loss": 0.0368,
"step": 670
},
{
"epoch": 7.674285714285714,
"grad_norm": 0.21024195849895477,
"learning_rate": 7.892106350946543e-05,
"loss": 0.0456,
"step": 675
},
{
"epoch": 7.731428571428571,
"grad_norm": 0.1670810580253601,
"learning_rate": 7.533940927706637e-05,
"loss": 0.0436,
"step": 680
},
{
"epoch": 7.788571428571428,
"grad_norm": 0.18880096077919006,
"learning_rate": 7.182645715528436e-05,
"loss": 0.039,
"step": 685
},
{
"epoch": 7.845714285714286,
"grad_norm": 0.16823258996009827,
"learning_rate": 6.838358895111e-05,
"loss": 0.0418,
"step": 690
},
{
"epoch": 7.902857142857143,
"grad_norm": 0.1878485232591629,
"learning_rate": 6.501215890427908e-05,
"loss": 0.0462,
"step": 695
},
{
"epoch": 7.96,
"grad_norm": 0.17427195608615875,
"learning_rate": 6.171349315458669e-05,
"loss": 0.0398,
"step": 700
},
{
"epoch": 7.96,
"eval_loss": 1.317841649055481,
"eval_runtime": 29.5847,
"eval_samples_per_second": 2.738,
"eval_steps_per_second": 2.738,
"step": 700
},
{
"epoch": 8.01142857142857,
"grad_norm": 0.11310245841741562,
"learning_rate": 5.848888922025553e-05,
"loss": 0.0364,
"step": 705
},
{
"epoch": 8.06857142857143,
"grad_norm": 0.13256970047950745,
"learning_rate": 5.533961548756128e-05,
"loss": 0.0295,
"step": 710
},
{
"epoch": 8.125714285714286,
"grad_norm": 0.12489226460456848,
"learning_rate": 5.226691071191772e-05,
"loss": 0.0281,
"step": 715
},
{
"epoch": 8.182857142857143,
"grad_norm": 0.10919743031263351,
"learning_rate": 4.9271983530617046e-05,
"loss": 0.0258,
"step": 720
},
{
"epoch": 8.24,
"grad_norm": 0.13559609651565552,
"learning_rate": 4.6356011987416066e-05,
"loss": 0.0272,
"step": 725
},
{
"epoch": 8.297142857142857,
"grad_norm": 0.117083340883255,
"learning_rate": 4.35201430691578e-05,
"loss": 0.0285,
"step": 730
},
{
"epoch": 8.354285714285714,
"grad_norm": 0.16256172955036163,
"learning_rate": 4.076549225460757e-05,
"loss": 0.0257,
"step": 735
},
{
"epoch": 8.411428571428571,
"grad_norm": 0.1305255889892578,
"learning_rate": 3.809314307568412e-05,
"loss": 0.0255,
"step": 740
},
{
"epoch": 8.468571428571428,
"grad_norm": 0.11966060847043991,
"learning_rate": 3.550414669125573e-05,
"loss": 0.0263,
"step": 745
},
{
"epoch": 8.525714285714285,
"grad_norm": 0.17564554512500763,
"learning_rate": 3.2999521473671136e-05,
"loss": 0.0266,
"step": 750
},
{
"epoch": 8.582857142857144,
"grad_norm": 0.1525678038597107,
"learning_rate": 3.0580252608186086e-05,
"loss": 0.0292,
"step": 755
},
{
"epoch": 8.64,
"grad_norm": 0.20154449343681335,
"learning_rate": 2.824729170544457e-05,
"loss": 0.0304,
"step": 760
},
{
"epoch": 8.697142857142858,
"grad_norm": 0.14955027401447296,
"learning_rate": 2.6001556427166062e-05,
"loss": 0.0277,
"step": 765
},
{
"epoch": 8.754285714285714,
"grad_norm": 0.13878516852855682,
"learning_rate": 2.3843930125186542e-05,
"loss": 0.0255,
"step": 770
},
{
"epoch": 8.811428571428571,
"grad_norm": 0.12101715803146362,
"learning_rate": 2.177526149399556e-05,
"loss": 0.0263,
"step": 775
},
{
"epoch": 8.868571428571428,
"grad_norm": 0.11588730663061142,
"learning_rate": 1.9796364236904924e-05,
"loss": 0.0319,
"step": 780
},
{
"epoch": 8.925714285714285,
"grad_norm": 0.15549886226654053,
"learning_rate": 1.7908016745981858e-05,
"loss": 0.0275,
"step": 785
},
{
"epoch": 8.982857142857142,
"grad_norm": 0.11943016201257706,
"learning_rate": 1.6110961795870906e-05,
"loss": 0.0283,
"step": 790
},
{
"epoch": 9.034285714285714,
"grad_norm": 0.10950807482004166,
"learning_rate": 1.4405906251626494e-05,
"loss": 0.0273,
"step": 795
},
{
"epoch": 9.09142857142857,
"grad_norm": 0.12409886717796326,
"learning_rate": 1.2793520790670116e-05,
"loss": 0.0213,
"step": 800
},
{
"epoch": 9.09142857142857,
"eval_loss": 1.399065613746643,
"eval_runtime": 29.6981,
"eval_samples_per_second": 2.727,
"eval_steps_per_second": 2.727,
"step": 800
},
{
"epoch": 9.14857142857143,
"grad_norm": 0.10451866686344147,
"learning_rate": 1.1274439638981532e-05,
"loss": 0.0219,
"step": 805
},
{
"epoch": 9.205714285714286,
"grad_norm": 0.11612384021282196,
"learning_rate": 9.849260321628667e-06,
"loss": 0.0239,
"step": 810
},
{
"epoch": 9.262857142857143,
"grad_norm": 0.10945022851228714,
"learning_rate": 8.51854342773295e-06,
"loss": 0.0233,
"step": 815
},
{
"epoch": 9.32,
"grad_norm": 0.11814412474632263,
"learning_rate": 7.282812389963784e-06,
"loss": 0.0221,
"step": 820
},
{
"epoch": 9.377142857142857,
"grad_norm": 0.12562333047389984,
"learning_rate": 6.142553278648239e-06,
"loss": 0.0234,
"step": 825
},
{
"epoch": 9.434285714285714,
"grad_norm": 0.11704660952091217,
"learning_rate": 5.0982146105769125e-06,
"loss": 0.0277,
"step": 830
},
{
"epoch": 9.491428571428571,
"grad_norm": 0.1091628223657608,
"learning_rate": 4.150207172581522e-06,
"loss": 0.0237,
"step": 835
},
{
"epoch": 9.548571428571428,
"grad_norm": 0.11104241013526917,
"learning_rate": 3.298903859953517e-06,
"loss": 0.0239,
"step": 840
},
{
"epoch": 9.605714285714285,
"grad_norm": 0.11044926196336746,
"learning_rate": 2.544639529766829e-06,
"loss": 0.0234,
"step": 845
},
{
"epoch": 9.662857142857142,
"grad_norm": 0.12619182467460632,
"learning_rate": 1.887710869163284e-06,
"loss": 0.0233,
"step": 850
},
{
"epoch": 9.72,
"grad_norm": 0.12324284762144089,
"learning_rate": 1.328376278651705e-06,
"loss": 0.023,
"step": 855
},
{
"epoch": 9.777142857142858,
"grad_norm": 0.11614653468132019,
"learning_rate": 8.668557704669122e-07,
"loss": 0.0238,
"step": 860
},
{
"epoch": 9.834285714285715,
"grad_norm": 0.13400106132030487,
"learning_rate": 5.033308820289185e-07,
"loss": 0.0237,
"step": 865
},
{
"epoch": 9.891428571428571,
"grad_norm": 0.12201932817697525,
"learning_rate": 2.3794460453555045e-07,
"loss": 0.0216,
"step": 870
},
{
"epoch": 9.948571428571428,
"grad_norm": 0.12058448791503906,
"learning_rate": 7.080132671774542e-08,
"loss": 0.0238,
"step": 875
},
{
"epoch": 10.0,
"grad_norm": 0.15471021831035614,
"learning_rate": 1.966793778229725e-09,
"loss": 0.0221,
"step": 880
},
{
"epoch": 10.0,
"step": 880,
"total_flos": 5.7149261611008e+16,
"train_loss": 0.2784872981296344,
"train_runtime": 8588.2113,
"train_samples_per_second": 0.815,
"train_steps_per_second": 0.102
}
],
"logging_steps": 5,
"max_steps": 880,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 40,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.7149261611008e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}