PEFT
Safetensors
lion_8bit / trainer_state.json
Gege24's picture
Upload task output 8ca8a9ea-9ae3-4938-9713-015819984d61
ad7a2d6 verified
{
"best_global_step": 1044,
"best_metric": 0.5790691375732422,
"best_model_checkpoint": "/workspace/scripts/soutputs/8ca8a9ea-9ae3-4938-9713-015819984d61/checkpoint-1044",
"epoch": 2.9914040114613183,
"eval_steps": 500,
"global_step": 1044,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014326647564469915,
"grad_norm": 1.6428219079971313,
"learning_rate": 1.4026021586989397e-06,
"loss": 0.9723,
"step": 5
},
{
"epoch": 0.02865329512893983,
"grad_norm": 1.3679360151290894,
"learning_rate": 3.155854857072614e-06,
"loss": 0.9416,
"step": 10
},
{
"epoch": 0.04297994269340974,
"grad_norm": 1.0384185314178467,
"learning_rate": 4.9091075554462895e-06,
"loss": 0.8955,
"step": 15
},
{
"epoch": 0.05730659025787966,
"grad_norm": 0.6389966607093811,
"learning_rate": 6.662360253819964e-06,
"loss": 0.8219,
"step": 20
},
{
"epoch": 0.07163323782234957,
"grad_norm": 0.46849510073661804,
"learning_rate": 8.415612952193638e-06,
"loss": 0.7583,
"step": 25
},
{
"epoch": 0.08595988538681948,
"grad_norm": 0.5466313362121582,
"learning_rate": 1.0168865650567315e-05,
"loss": 0.7283,
"step": 30
},
{
"epoch": 0.10028653295128939,
"grad_norm": 0.46641281247138977,
"learning_rate": 1.1922118348940989e-05,
"loss": 0.708,
"step": 35
},
{
"epoch": 0.11461318051575932,
"grad_norm": 0.5155534744262695,
"learning_rate": 1.2272343115538091e-05,
"loss": 0.7074,
"step": 40
},
{
"epoch": 0.12893982808022922,
"grad_norm": 0.45078691840171814,
"learning_rate": 1.2270613524924088e-05,
"loss": 0.6699,
"step": 45
},
{
"epoch": 0.14326647564469913,
"grad_norm": 0.4526143968105316,
"learning_rate": 1.2267553922326047e-05,
"loss": 0.6663,
"step": 50
},
{
"epoch": 0.15759312320916904,
"grad_norm": 0.44320717453956604,
"learning_rate": 1.2263165044858593e-05,
"loss": 0.6612,
"step": 55
},
{
"epoch": 0.17191977077363896,
"grad_norm": 0.4532703757286072,
"learning_rate": 1.2257447949883163e-05,
"loss": 0.6705,
"step": 60
},
{
"epoch": 0.18624641833810887,
"grad_norm": 0.3655495345592499,
"learning_rate": 1.2250404014753254e-05,
"loss": 0.6574,
"step": 65
},
{
"epoch": 0.20057306590257878,
"grad_norm": 0.3733099102973938,
"learning_rate": 1.2242034936482603e-05,
"loss": 0.6834,
"step": 70
},
{
"epoch": 0.2148997134670487,
"grad_norm": 0.355129599571228,
"learning_rate": 1.2232342731336339e-05,
"loss": 0.6645,
"step": 75
},
{
"epoch": 0.22922636103151864,
"grad_norm": 0.358656108379364,
"learning_rate": 1.222132973434523e-05,
"loss": 0.653,
"step": 80
},
{
"epoch": 0.24355300859598855,
"grad_norm": 0.29975712299346924,
"learning_rate": 1.2208998598743134e-05,
"loss": 0.6719,
"step": 85
},
{
"epoch": 0.25787965616045844,
"grad_norm": 0.32437002658843994,
"learning_rate": 1.2195352295327777e-05,
"loss": 0.6661,
"step": 90
},
{
"epoch": 0.2722063037249284,
"grad_norm": 0.28565841913223267,
"learning_rate": 1.2180394111745045e-05,
"loss": 0.6515,
"step": 95
},
{
"epoch": 0.28653295128939826,
"grad_norm": 0.28558802604675293,
"learning_rate": 1.2164127651696922e-05,
"loss": 0.6448,
"step": 100
},
{
"epoch": 0.28653295128939826,
"eval_loss": 0.6602650880813599,
"eval_runtime": 2.8798,
"eval_samples_per_second": 14.584,
"eval_steps_per_second": 14.584,
"step": 100
},
{
"epoch": 0.3008595988538682,
"grad_norm": 0.3361125886440277,
"learning_rate": 1.214655683407329e-05,
"loss": 0.6516,
"step": 105
},
{
"epoch": 0.3151862464183381,
"grad_norm": 0.2776224613189697,
"learning_rate": 1.2127685892007806e-05,
"loss": 0.6592,
"step": 110
},
{
"epoch": 0.32951289398280803,
"grad_norm": 0.26801374554634094,
"learning_rate": 1.2107519371858048e-05,
"loss": 0.6565,
"step": 115
},
{
"epoch": 0.3438395415472779,
"grad_norm": 0.297080934047699,
"learning_rate": 1.2086062132110227e-05,
"loss": 0.642,
"step": 120
},
{
"epoch": 0.35816618911174786,
"grad_norm": 0.28340891003608704,
"learning_rate": 1.2063319342208684e-05,
"loss": 0.6478,
"step": 125
},
{
"epoch": 0.37249283667621774,
"grad_norm": 0.2782769203186035,
"learning_rate": 1.2039296481310471e-05,
"loss": 0.6368,
"step": 130
},
{
"epoch": 0.3868194842406877,
"grad_norm": 0.292530357837677,
"learning_rate": 1.2013999336965322e-05,
"loss": 0.6153,
"step": 135
},
{
"epoch": 0.40114613180515757,
"grad_norm": 0.24663622677326202,
"learning_rate": 1.1987434003721335e-05,
"loss": 0.6424,
"step": 140
},
{
"epoch": 0.4154727793696275,
"grad_norm": 0.2681853473186493,
"learning_rate": 1.195960688165667e-05,
"loss": 0.6348,
"step": 145
},
{
"epoch": 0.4297994269340974,
"grad_norm": 0.2627250850200653,
"learning_rate": 1.1930524674837664e-05,
"loss": 0.6249,
"step": 150
},
{
"epoch": 0.44412607449856734,
"grad_norm": 0.24072442948818207,
"learning_rate": 1.1900194389703684e-05,
"loss": 0.6391,
"step": 155
},
{
"epoch": 0.4584527220630373,
"grad_norm": 0.25336554646492004,
"learning_rate": 1.1868623333379166e-05,
"loss": 0.6298,
"step": 160
},
{
"epoch": 0.47277936962750716,
"grad_norm": 0.2672167718410492,
"learning_rate": 1.1835819111913174e-05,
"loss": 0.6368,
"step": 165
},
{
"epoch": 0.4871060171919771,
"grad_norm": 0.2560673952102661,
"learning_rate": 1.1801789628446977e-05,
"loss": 0.6318,
"step": 170
},
{
"epoch": 0.501432664756447,
"grad_norm": 0.27951574325561523,
"learning_rate": 1.1766543081310029e-05,
"loss": 0.6109,
"step": 175
},
{
"epoch": 0.5157593123209169,
"grad_norm": 0.25252604484558105,
"learning_rate": 1.1730087962044844e-05,
"loss": 0.6273,
"step": 180
},
{
"epoch": 0.5300859598853869,
"grad_norm": 0.25956350564956665,
"learning_rate": 1.1692433053361224e-05,
"loss": 0.6133,
"step": 185
},
{
"epoch": 0.5444126074498568,
"grad_norm": 0.2530823349952698,
"learning_rate": 1.165358742702035e-05,
"loss": 0.6214,
"step": 190
},
{
"epoch": 0.5587392550143266,
"grad_norm": 0.2583998143672943,
"learning_rate": 1.1613560441649214e-05,
"loss": 0.6105,
"step": 195
},
{
"epoch": 0.5730659025787965,
"grad_norm": 0.27742502093315125,
"learning_rate": 1.1572361740485967e-05,
"loss": 0.6349,
"step": 200
},
{
"epoch": 0.5730659025787965,
"eval_loss": 0.6322649717330933,
"eval_runtime": 2.88,
"eval_samples_per_second": 14.583,
"eval_steps_per_second": 14.583,
"step": 200
},
{
"epoch": 0.5873925501432665,
"grad_norm": 0.2662568688392639,
"learning_rate": 1.1530001249056676e-05,
"loss": 0.6299,
"step": 205
},
{
"epoch": 0.6017191977077364,
"grad_norm": 0.2614499032497406,
"learning_rate": 1.148648917278409e-05,
"loss": 0.6005,
"step": 210
},
{
"epoch": 0.6160458452722063,
"grad_norm": 0.26987332105636597,
"learning_rate": 1.1441835994528954e-05,
"loss": 0.6214,
"step": 215
},
{
"epoch": 0.6303724928366762,
"grad_norm": 0.24090726673603058,
"learning_rate": 1.1396052472064512e-05,
"loss": 0.6245,
"step": 220
},
{
"epoch": 0.6446991404011462,
"grad_norm": 0.2746104300022125,
"learning_rate": 1.1349149635484741e-05,
"loss": 0.6222,
"step": 225
},
{
"epoch": 0.6590257879656161,
"grad_norm": 0.26875993609428406,
"learning_rate": 1.1301138784547013e-05,
"loss": 0.6092,
"step": 230
},
{
"epoch": 0.673352435530086,
"grad_norm": 0.2399819940328598,
"learning_rate": 1.1252031485949773e-05,
"loss": 0.6177,
"step": 235
},
{
"epoch": 0.6876790830945558,
"grad_norm": 0.27088305354118347,
"learning_rate": 1.1201839570545898e-05,
"loss": 0.6024,
"step": 240
},
{
"epoch": 0.7020057306590258,
"grad_norm": 0.2598998248577118,
"learning_rate": 1.1150575130492442e-05,
"loss": 0.6068,
"step": 245
},
{
"epoch": 0.7163323782234957,
"grad_norm": 0.26509082317352295,
"learning_rate": 1.1098250516337403e-05,
"loss": 0.6128,
"step": 250
},
{
"epoch": 0.7306590257879656,
"grad_norm": 0.23148998618125916,
"learning_rate": 1.1044878334044251e-05,
"loss": 0.6225,
"step": 255
},
{
"epoch": 0.7449856733524355,
"grad_norm": 0.23298867046833038,
"learning_rate": 1.0990471441954915e-05,
"loss": 0.6176,
"step": 260
},
{
"epoch": 0.7593123209169055,
"grad_norm": 0.25643882155418396,
"learning_rate": 1.093504294769198e-05,
"loss": 0.6132,
"step": 265
},
{
"epoch": 0.7736389684813754,
"grad_norm": 0.2456223964691162,
"learning_rate": 1.087860620500081e-05,
"loss": 0.6083,
"step": 270
},
{
"epoch": 0.7879656160458453,
"grad_norm": 0.24799339473247528,
"learning_rate": 1.0821174810532391e-05,
"loss": 0.6064,
"step": 275
},
{
"epoch": 0.8022922636103151,
"grad_norm": 0.24989920854568481,
"learning_rate": 1.076276260056765e-05,
"loss": 0.6063,
"step": 280
},
{
"epoch": 0.8166189111747851,
"grad_norm": 0.253239244222641,
"learning_rate": 1.0703383647684028e-05,
"loss": 0.6071,
"step": 285
},
{
"epoch": 0.830945558739255,
"grad_norm": 0.24544061720371246,
"learning_rate": 1.064305225736515e-05,
"loss": 0.611,
"step": 290
},
{
"epoch": 0.8452722063037249,
"grad_norm": 0.24104644358158112,
"learning_rate": 1.0581782964554359e-05,
"loss": 0.5985,
"step": 295
},
{
"epoch": 0.8595988538681948,
"grad_norm": 0.23256933689117432,
"learning_rate": 1.0519590530152995e-05,
"loss": 0.5887,
"step": 300
},
{
"epoch": 0.8595988538681948,
"eval_loss": 0.6149212718009949,
"eval_runtime": 2.8878,
"eval_samples_per_second": 14.544,
"eval_steps_per_second": 14.544,
"step": 300
},
{
"epoch": 0.8739255014326648,
"grad_norm": 0.26569247245788574,
"learning_rate": 1.0456489937464206e-05,
"loss": 0.5988,
"step": 305
},
{
"epoch": 0.8882521489971347,
"grad_norm": 0.2356170415878296,
"learning_rate": 1.0392496388583203e-05,
"loss": 0.6133,
"step": 310
},
{
"epoch": 0.9025787965616046,
"grad_norm": 0.25165677070617676,
"learning_rate": 1.0327625300734795e-05,
"loss": 0.6022,
"step": 315
},
{
"epoch": 0.9169054441260746,
"grad_norm": 0.2422744333744049,
"learning_rate": 1.0261892302559097e-05,
"loss": 0.6209,
"step": 320
},
{
"epoch": 0.9312320916905444,
"grad_norm": 0.2504790723323822,
"learning_rate": 1.019531323034629e-05,
"loss": 0.5836,
"step": 325
},
{
"epoch": 0.9455587392550143,
"grad_norm": 0.23083172738552094,
"learning_rate": 1.0127904124221387e-05,
"loss": 0.6036,
"step": 330
},
{
"epoch": 0.9598853868194842,
"grad_norm": 0.23841316998004913,
"learning_rate": 1.0059681224279856e-05,
"loss": 0.6028,
"step": 335
},
{
"epoch": 0.9742120343839542,
"grad_norm": 0.2634727656841278,
"learning_rate": 9.990660966675092e-06,
"loss": 0.6074,
"step": 340
},
{
"epoch": 0.9885386819484241,
"grad_norm": 0.22332459688186646,
"learning_rate": 9.920859979658633e-06,
"loss": 0.6061,
"step": 345
},
{
"epoch": 0.997134670487106,
"eval_loss": 0.6086744070053101,
"eval_runtime": 2.8877,
"eval_samples_per_second": 14.544,
"eval_steps_per_second": 14.544,
"step": 348
},
{
"epoch": 1.0,
"eval_loss": 0.6092488169670105,
"eval_runtime": 2.8916,
"eval_samples_per_second": 14.525,
"eval_steps_per_second": 14.525,
"step": 349
},
{
"epoch": 1.002865329512894,
"grad_norm": 0.23956461250782013,
"learning_rate": 9.85029507957412e-06,
"loss": 0.5824,
"step": 350
},
{
"epoch": 1.0171919770773639,
"grad_norm": 0.2437165081501007,
"learning_rate": 9.77898326680592e-06,
"loss": 0.5803,
"step": 355
},
{
"epoch": 1.0315186246418337,
"grad_norm": 0.2500912845134735,
"learning_rate": 9.706941721683432e-06,
"loss": 0.5957,
"step": 360
},
{
"epoch": 1.0458452722063036,
"grad_norm": 0.2493949979543686,
"learning_rate": 9.634187800342016e-06,
"loss": 0.5911,
"step": 365
},
{
"epoch": 1.0601719197707737,
"grad_norm": 0.23148047924041748,
"learning_rate": 9.56073903054159e-06,
"loss": 0.5688,
"step": 370
},
{
"epoch": 1.0744985673352436,
"grad_norm": 0.23534221947193146,
"learning_rate": 9.486613107443863e-06,
"loss": 0.5938,
"step": 375
},
{
"epoch": 1.0888252148997135,
"grad_norm": 0.23032759130001068,
"learning_rate": 9.411827889349254e-06,
"loss": 0.5675,
"step": 380
},
{
"epoch": 1.1031518624641834,
"grad_norm": 0.23191657662391663,
"learning_rate": 9.336401393394483e-06,
"loss": 0.5899,
"step": 385
},
{
"epoch": 1.1174785100286533,
"grad_norm": 0.2217395305633545,
"learning_rate": 9.260351791211929e-06,
"loss": 0.5726,
"step": 390
},
{
"epoch": 1.1318051575931232,
"grad_norm": 0.2425890415906906,
"learning_rate": 9.183697404551733e-06,
"loss": 0.5762,
"step": 395
},
{
"epoch": 1.146131805157593,
"grad_norm": 0.2324853092432022,
"learning_rate": 9.106456700867764e-06,
"loss": 0.596,
"step": 400
},
{
"epoch": 1.146131805157593,
"eval_loss": 0.6035182476043701,
"eval_runtime": 2.8972,
"eval_samples_per_second": 14.497,
"eval_steps_per_second": 14.497,
"step": 400
},
{
"epoch": 1.1604584527220632,
"grad_norm": 0.23952153325080872,
"learning_rate": 9.028648288868459e-06,
"loss": 0.5904,
"step": 405
},
{
"epoch": 1.174785100286533,
"grad_norm": 0.23701021075248718,
"learning_rate": 8.950290914033645e-06,
"loss": 0.5785,
"step": 410
},
{
"epoch": 1.189111747851003,
"grad_norm": 0.2227863371372223,
"learning_rate": 8.871403454098416e-06,
"loss": 0.5724,
"step": 415
},
{
"epoch": 1.2034383954154728,
"grad_norm": 0.2232217639684677,
"learning_rate": 8.792004914505126e-06,
"loss": 0.5727,
"step": 420
},
{
"epoch": 1.2177650429799427,
"grad_norm": 0.24012598395347595,
"learning_rate": 8.712114423824633e-06,
"loss": 0.589,
"step": 425
},
{
"epoch": 1.2320916905444126,
"grad_norm": 0.2352171540260315,
"learning_rate": 8.631751229147881e-06,
"loss": 0.5667,
"step": 430
},
{
"epoch": 1.2464183381088825,
"grad_norm": 0.23246026039123535,
"learning_rate": 8.550934691448907e-06,
"loss": 0.5927,
"step": 435
},
{
"epoch": 1.2607449856733524,
"grad_norm": 0.24500536918640137,
"learning_rate": 8.469684280920438e-06,
"loss": 0.5831,
"step": 440
},
{
"epoch": 1.2750716332378222,
"grad_norm": 0.22870078682899475,
"learning_rate": 8.388019572283156e-06,
"loss": 0.5851,
"step": 445
},
{
"epoch": 1.2893982808022924,
"grad_norm": 0.22906720638275146,
"learning_rate": 8.305960240069795e-06,
"loss": 0.586,
"step": 450
},
{
"epoch": 1.3037249283667622,
"grad_norm": 0.22709061205387115,
"learning_rate": 8.223526053885171e-06,
"loss": 0.5719,
"step": 455
},
{
"epoch": 1.3180515759312321,
"grad_norm": 0.2257590889930725,
"learning_rate": 8.140736873643331e-06,
"loss": 0.5718,
"step": 460
},
{
"epoch": 1.332378223495702,
"grad_norm": 0.22583012282848358,
"learning_rate": 8.05761264478293e-06,
"loss": 0.5754,
"step": 465
},
{
"epoch": 1.346704871060172,
"grad_norm": 0.22651982307434082,
"learning_rate": 7.974173393462007e-06,
"loss": 0.5651,
"step": 470
},
{
"epoch": 1.3610315186246418,
"grad_norm": 0.24124553799629211,
"learning_rate": 7.890439221733317e-06,
"loss": 0.5826,
"step": 475
},
{
"epoch": 1.3753581661891117,
"grad_norm": 0.22888998687267303,
"learning_rate": 7.806430302701367e-06,
"loss": 0.5705,
"step": 480
},
{
"epoch": 1.3896848137535818,
"grad_norm": 0.21681609749794006,
"learning_rate": 7.722166875662358e-06,
"loss": 0.5814,
"step": 485
},
{
"epoch": 1.4040114613180517,
"grad_norm": 0.2206772118806839,
"learning_rate": 7.63766924122816e-06,
"loss": 0.5844,
"step": 490
},
{
"epoch": 1.4183381088825215,
"grad_norm": 0.22052349150180817,
"learning_rate": 7.552957756435512e-06,
"loss": 0.5674,
"step": 495
},
{
"epoch": 1.4326647564469914,
"grad_norm": 0.24319517612457275,
"learning_rate": 7.468052829841645e-06,
"loss": 0.5813,
"step": 500
},
{
"epoch": 1.4326647564469914,
"eval_loss": 0.5956406593322754,
"eval_runtime": 2.8806,
"eval_samples_per_second": 14.581,
"eval_steps_per_second": 14.581,
"step": 500
},
{
"epoch": 1.4469914040114613,
"grad_norm": 0.2275008261203766,
"learning_rate": 7.382974916607492e-06,
"loss": 0.5853,
"step": 505
},
{
"epoch": 1.4613180515759312,
"grad_norm": 0.23689113557338715,
"learning_rate": 7.297744513569644e-06,
"loss": 0.5796,
"step": 510
},
{
"epoch": 1.475644699140401,
"grad_norm": 0.23207077383995056,
"learning_rate": 7.2123821543023e-06,
"loss": 0.5832,
"step": 515
},
{
"epoch": 1.4899713467048712,
"grad_norm": 0.237880676984787,
"learning_rate": 7.126908404170343e-06,
"loss": 0.5783,
"step": 520
},
{
"epoch": 1.5042979942693409,
"grad_norm": 0.22841981053352356,
"learning_rate": 7.041343855374771e-06,
"loss": 0.5623,
"step": 525
},
{
"epoch": 1.518624641833811,
"grad_norm": 0.223537415266037,
"learning_rate": 6.955709121991649e-06,
"loss": 0.574,
"step": 530
},
{
"epoch": 1.5329512893982808,
"grad_norm": 0.22695119678974152,
"learning_rate": 6.870024835005807e-06,
"loss": 0.5592,
"step": 535
},
{
"epoch": 1.5472779369627507,
"grad_norm": 0.22849540412425995,
"learning_rate": 6.784311637340442e-06,
"loss": 0.5613,
"step": 540
},
{
"epoch": 1.5616045845272206,
"grad_norm": 0.2229369729757309,
"learning_rate": 6.6985901788838775e-06,
"loss": 0.566,
"step": 545
},
{
"epoch": 1.5759312320916905,
"grad_norm": 0.21880346536636353,
"learning_rate": 6.612881111514604e-06,
"loss": 0.5767,
"step": 550
},
{
"epoch": 1.5902578796561606,
"grad_norm": 0.21992699801921844,
"learning_rate": 6.527205084125875e-06,
"loss": 0.5711,
"step": 555
},
{
"epoch": 1.6045845272206303,
"grad_norm": 0.23056058585643768,
"learning_rate": 6.441582737651007e-06,
"loss": 0.5607,
"step": 560
},
{
"epoch": 1.6189111747851004,
"grad_norm": 0.22267192602157593,
"learning_rate": 6.356034700090591e-06,
"loss": 0.5549,
"step": 565
},
{
"epoch": 1.63323782234957,
"grad_norm": 0.22011469304561615,
"learning_rate": 6.270581581542831e-06,
"loss": 0.5821,
"step": 570
},
{
"epoch": 1.6475644699140402,
"grad_norm": 0.22847089171409607,
"learning_rate": 6.185243969238195e-06,
"loss": 0.5821,
"step": 575
},
{
"epoch": 1.66189111747851,
"grad_norm": 0.22488202154636383,
"learning_rate": 6.10004242257957e-06,
"loss": 0.5585,
"step": 580
},
{
"epoch": 1.67621776504298,
"grad_norm": 0.22973030805587769,
"learning_rate": 6.01499746818912e-06,
"loss": 0.5715,
"step": 585
},
{
"epoch": 1.6905444126074498,
"grad_norm": 0.22791410982608795,
"learning_rate": 5.930129594963047e-06,
"loss": 0.5709,
"step": 590
},
{
"epoch": 1.7048710601719197,
"grad_norm": 0.2369392067193985,
"learning_rate": 5.845459249135437e-06,
"loss": 0.5712,
"step": 595
},
{
"epoch": 1.7191977077363898,
"grad_norm": 0.22787928581237793,
"learning_rate": 5.7610068293523925e-06,
"loss": 0.5806,
"step": 600
},
{
"epoch": 1.7191977077363898,
"eval_loss": 0.589396595954895,
"eval_runtime": 2.8838,
"eval_samples_per_second": 14.564,
"eval_steps_per_second": 14.564,
"step": 600
},
{
"epoch": 1.7335243553008595,
"grad_norm": 0.2262052297592163,
"learning_rate": 5.676792681757612e-06,
"loss": 0.5653,
"step": 605
},
{
"epoch": 1.7478510028653296,
"grad_norm": 0.2277483344078064,
"learning_rate": 5.5928370950906355e-06,
"loss": 0.5634,
"step": 610
},
{
"epoch": 1.7621776504297995,
"grad_norm": 0.2228267937898636,
"learning_rate": 5.5091602957989115e-06,
"loss": 0.5472,
"step": 615
},
{
"epoch": 1.7765042979942693,
"grad_norm": 0.22168482840061188,
"learning_rate": 5.425782443164878e-06,
"loss": 0.5565,
"step": 620
},
{
"epoch": 1.7908309455587392,
"grad_norm": 0.22628583014011383,
"learning_rate": 5.342723624449211e-06,
"loss": 0.558,
"step": 625
},
{
"epoch": 1.8051575931232091,
"grad_norm": 0.22420856356620789,
"learning_rate": 5.260003850051442e-06,
"loss": 0.5721,
"step": 630
},
{
"epoch": 1.8194842406876792,
"grad_norm": 0.22148585319519043,
"learning_rate": 5.177643048689078e-06,
"loss": 0.5688,
"step": 635
},
{
"epoch": 1.8338108882521489,
"grad_norm": 0.21723760664463043,
"learning_rate": 5.095661062596411e-06,
"loss": 0.5719,
"step": 640
},
{
"epoch": 1.848137535816619,
"grad_norm": 0.22150275111198425,
"learning_rate": 5.014077642744153e-06,
"loss": 0.5486,
"step": 645
},
{
"epoch": 1.8624641833810889,
"grad_norm": 0.21508848667144775,
"learning_rate": 4.932912444081069e-06,
"loss": 0.555,
"step": 650
},
{
"epoch": 1.8767908309455588,
"grad_norm": 0.2276742309331894,
"learning_rate": 4.852185020798736e-06,
"loss": 0.5527,
"step": 655
},
{
"epoch": 1.8911174785100286,
"grad_norm": 0.22282367944717407,
"learning_rate": 4.771914821620574e-06,
"loss": 0.5513,
"step": 660
},
{
"epoch": 1.9054441260744985,
"grad_norm": 0.22503264248371124,
"learning_rate": 4.6921211851162955e-06,
"loss": 0.5656,
"step": 665
},
{
"epoch": 1.9197707736389686,
"grad_norm": 0.22671757638454437,
"learning_rate": 4.612823335042883e-06,
"loss": 0.5746,
"step": 670
},
{
"epoch": 1.9340974212034383,
"grad_norm": 0.2195613831281662,
"learning_rate": 4.534040375713239e-06,
"loss": 0.5481,
"step": 675
},
{
"epoch": 1.9484240687679084,
"grad_norm": 0.2245696634054184,
"learning_rate": 4.455791287393597e-06,
"loss": 0.558,
"step": 680
},
{
"epoch": 1.962750716332378,
"grad_norm": 0.21683502197265625,
"learning_rate": 4.37809492173083e-06,
"loss": 0.5523,
"step": 685
},
{
"epoch": 1.9770773638968482,
"grad_norm": 0.2247258424758911,
"learning_rate": 4.300969997210741e-06,
"loss": 0.5735,
"step": 690
},
{
"epoch": 1.991404011461318,
"grad_norm": 0.22837325930595398,
"learning_rate": 4.224435094648434e-06,
"loss": 0.5669,
"step": 695
},
{
"epoch": 1.994269340974212,
"eval_loss": 0.5852823853492737,
"eval_runtime": 2.8671,
"eval_samples_per_second": 14.649,
"eval_steps_per_second": 14.649,
"step": 696
},
{
"epoch": 2.0,
"eval_loss": 0.5849316716194153,
"eval_runtime": 2.8768,
"eval_samples_per_second": 14.6,
"eval_steps_per_second": 14.6,
"step": 698
},
{
"epoch": 2.005730659025788,
"grad_norm": 0.21968944370746613,
"learning_rate": 4.148508652711858e-06,
"loss": 0.5577,
"step": 700
},
{
"epoch": 2.005730659025788,
"eval_loss": 0.5852600932121277,
"eval_runtime": 2.8671,
"eval_samples_per_second": 14.649,
"eval_steps_per_second": 14.649,
"step": 700
},
{
"epoch": 2.020057306590258,
"grad_norm": 0.22937500476837158,
"learning_rate": 4.073208963479584e-06,
"loss": 0.5605,
"step": 705
},
{
"epoch": 2.0343839541547277,
"grad_norm": 0.23057711124420166,
"learning_rate": 3.998554168033906e-06,
"loss": 0.5525,
"step": 710
},
{
"epoch": 2.048710601719198,
"grad_norm": 0.2270784080028534,
"learning_rate": 3.924562252090337e-06,
"loss": 0.5562,
"step": 715
},
{
"epoch": 2.0630372492836675,
"grad_norm": 0.2220994234085083,
"learning_rate": 3.8512510416644995e-06,
"loss": 0.5447,
"step": 720
},
{
"epoch": 2.0773638968481376,
"grad_norm": 0.23204341530799866,
"learning_rate": 3.778638198777512e-06,
"loss": 0.549,
"step": 725
},
{
"epoch": 2.0916905444126073,
"grad_norm": 0.22262004017829895,
"learning_rate": 3.706741217200896e-06,
"loss": 0.5499,
"step": 730
},
{
"epoch": 2.1060171919770774,
"grad_norm": 0.22019214928150177,
"learning_rate": 3.6355774182419905e-06,
"loss": 0.55,
"step": 735
},
{
"epoch": 2.1203438395415475,
"grad_norm": 0.22234179079532623,
"learning_rate": 3.5651639465709426e-06,
"loss": 0.5524,
"step": 740
},
{
"epoch": 2.134670487106017,
"grad_norm": 0.22449831664562225,
"learning_rate": 3.495517766090224e-06,
"loss": 0.5459,
"step": 745
},
{
"epoch": 2.1489971346704873,
"grad_norm": 0.23554570972919464,
"learning_rate": 3.426655655847724e-06,
"loss": 0.5617,
"step": 750
},
{
"epoch": 2.163323782234957,
"grad_norm": 0.23134228587150574,
"learning_rate": 3.3585942059943785e-06,
"loss": 0.5523,
"step": 755
},
{
"epoch": 2.177650429799427,
"grad_norm": 0.2272178828716278,
"learning_rate": 3.291349813787276e-06,
"loss": 0.5506,
"step": 760
},
{
"epoch": 2.1919770773638967,
"grad_norm": 0.22482511401176453,
"learning_rate": 3.2249386796392656e-06,
"loss": 0.5451,
"step": 765
},
{
"epoch": 2.206303724928367,
"grad_norm": 0.2274748831987381,
"learning_rate": 3.159376803215985e-06,
"loss": 0.5531,
"step": 770
},
{
"epoch": 2.2206303724928365,
"grad_norm": 0.2227988839149475,
"learning_rate": 3.0946799795812396e-06,
"loss": 0.5489,
"step": 775
},
{
"epoch": 2.2349570200573066,
"grad_norm": 0.22400720417499542,
"learning_rate": 3.030863795391684e-06,
"loss": 0.5456,
"step": 780
},
{
"epoch": 2.2492836676217767,
"grad_norm": 0.2268913835287094,
"learning_rate": 2.9679436251417016e-06,
"loss": 0.5394,
"step": 785
},
{
"epoch": 2.2636103151862463,
"grad_norm": 0.22335706651210785,
"learning_rate": 2.9059346274594124e-06,
"loss": 0.5377,
"step": 790
},
{
"epoch": 2.2779369627507164,
"grad_norm": 0.22807373106479645,
"learning_rate": 2.8448517414546884e-06,
"loss": 0.5484,
"step": 795
},
{
"epoch": 2.292263610315186,
"grad_norm": 0.22118327021598816,
"learning_rate": 2.7847096831200282e-06,
"loss": 0.5419,
"step": 800
},
{
"epoch": 2.292263610315186,
"eval_loss": 0.5827357769012451,
"eval_runtime": 2.9066,
"eval_samples_per_second": 14.45,
"eval_steps_per_second": 14.45,
"step": 800
},
{
"epoch": 2.306590257879656,
"grad_norm": 0.22792136669158936,
"learning_rate": 2.7255229417852123e-06,
"loss": 0.5496,
"step": 805
},
{
"epoch": 2.3209169054441263,
"grad_norm": 0.22095544636249542,
"learning_rate": 2.667305776626566e-06,
"loss": 0.554,
"step": 810
},
{
"epoch": 2.335243553008596,
"grad_norm": 0.22290435433387756,
"learning_rate": 2.6100722132316454e-06,
"loss": 0.5492,
"step": 815
},
{
"epoch": 2.349570200573066,
"grad_norm": 0.23009058833122253,
"learning_rate": 2.553836040220221e-06,
"loss": 0.5473,
"step": 820
},
{
"epoch": 2.3638968481375358,
"grad_norm": 0.22500832378864288,
"learning_rate": 2.49861080592235e-06,
"loss": 0.5586,
"step": 825
},
{
"epoch": 2.378223495702006,
"grad_norm": 0.22200486063957214,
"learning_rate": 2.4444098151143295e-06,
"loss": 0.5358,
"step": 830
},
{
"epoch": 2.3925501432664755,
"grad_norm": 0.22904905676841736,
"learning_rate": 2.391246125813331e-06,
"loss": 0.5524,
"step": 835
},
{
"epoch": 2.4068767908309456,
"grad_norm": 0.23062781989574432,
"learning_rate": 2.339132546131483e-06,
"loss": 0.5404,
"step": 840
},
{
"epoch": 2.4212034383954153,
"grad_norm": 0.22324807941913605,
"learning_rate": 2.288081631190158e-06,
"loss": 0.5377,
"step": 845
},
{
"epoch": 2.4355300859598854,
"grad_norm": 0.22595882415771484,
"learning_rate": 2.2381056800952273e-06,
"loss": 0.5465,
"step": 850
},
{
"epoch": 2.4498567335243555,
"grad_norm": 0.23639383912086487,
"learning_rate": 2.189216732973958e-06,
"loss": 0.5518,
"step": 855
},
{
"epoch": 2.464183381088825,
"grad_norm": 0.23035073280334473,
"learning_rate": 2.1414265680743383e-06,
"loss": 0.5444,
"step": 860
},
{
"epoch": 2.4785100286532953,
"grad_norm": 0.22556614875793457,
"learning_rate": 2.0947466989274793e-06,
"loss": 0.5519,
"step": 865
},
{
"epoch": 2.492836676217765,
"grad_norm": 0.22614265978336334,
"learning_rate": 2.0491883715737904e-06,
"loss": 0.5526,
"step": 870
},
{
"epoch": 2.507163323782235,
"grad_norm": 0.22689661383628845,
"learning_rate": 2.0047625618536037e-06,
"loss": 0.5489,
"step": 875
},
{
"epoch": 2.5214899713467047,
"grad_norm": 0.22763052582740784,
"learning_rate": 1.961479972762888e-06,
"loss": 0.5397,
"step": 880
},
{
"epoch": 2.535816618911175,
"grad_norm": 0.22761483490467072,
"learning_rate": 1.919351031874699e-06,
"loss": 0.5452,
"step": 885
},
{
"epoch": 2.5501432664756445,
"grad_norm": 0.22768139839172363,
"learning_rate": 1.8783858888269978e-06,
"loss": 0.5522,
"step": 890
},
{
"epoch": 2.5644699140401146,
"grad_norm": 0.23226258158683777,
"learning_rate": 1.8385944128773981e-06,
"loss": 0.521,
"step": 895
},
{
"epoch": 2.5787965616045847,
"grad_norm": 0.2272603064775467,
"learning_rate": 1.7999861905254893e-06,
"loss": 0.5526,
"step": 900
},
{
"epoch": 2.5787965616045847,
"eval_loss": 0.5810644030570984,
"eval_runtime": 2.9211,
"eval_samples_per_second": 14.378,
"eval_steps_per_second": 14.378,
"step": 900
},
{
"epoch": 2.5931232091690544,
"grad_norm": 0.22808772325515747,
"learning_rate": 1.7625705232032741e-06,
"loss": 0.5573,
"step": 905
},
{
"epoch": 2.6074498567335245,
"grad_norm": 0.22595611214637756,
"learning_rate": 1.726356425034279e-06,
"loss": 0.5378,
"step": 910
},
{
"epoch": 2.621776504297994,
"grad_norm": 0.22707025706768036,
"learning_rate": 1.6913526206618854e-06,
"loss": 0.5243,
"step": 915
},
{
"epoch": 2.6361031518624642,
"grad_norm": 0.2284831553697586,
"learning_rate": 1.6575675431474023e-06,
"loss": 0.5411,
"step": 920
},
{
"epoch": 2.6504297994269344,
"grad_norm": 0.22921448945999146,
"learning_rate": 1.6250093319383871e-06,
"loss": 0.5411,
"step": 925
},
{
"epoch": 2.664756446991404,
"grad_norm": 0.2303130179643631,
"learning_rate": 1.5936858309077084e-06,
"loss": 0.546,
"step": 930
},
{
"epoch": 2.6790830945558737,
"grad_norm": 0.2226521223783493,
"learning_rate": 1.5636045864637997e-06,
"loss": 0.5378,
"step": 935
},
{
"epoch": 2.693409742120344,
"grad_norm": 0.22775433957576752,
"learning_rate": 1.5347728457326013e-06,
"loss": 0.5341,
"step": 940
},
{
"epoch": 2.707736389684814,
"grad_norm": 0.23151849210262299,
"learning_rate": 1.507197554811592e-06,
"loss": 0.5411,
"step": 945
},
{
"epoch": 2.7220630372492836,
"grad_norm": 0.22131632268428802,
"learning_rate": 1.480885357096343e-06,
"loss": 0.5322,
"step": 950
},
{
"epoch": 2.7363896848137537,
"grad_norm": 0.22514161467552185,
"learning_rate": 1.4558425916800066e-06,
"loss": 0.5287,
"step": 955
},
{
"epoch": 2.7507163323782233,
"grad_norm": 0.22741974890232086,
"learning_rate": 1.4320752918261058e-06,
"loss": 0.5467,
"step": 960
},
{
"epoch": 2.7650429799426934,
"grad_norm": 0.22180503606796265,
"learning_rate": 1.4095891835150126e-06,
"loss": 0.5398,
"step": 965
},
{
"epoch": 2.7793696275071635,
"grad_norm": 0.2328280508518219,
"learning_rate": 1.3883896840644583e-06,
"loss": 0.5347,
"step": 970
},
{
"epoch": 2.793696275071633,
"grad_norm": 0.22877122461795807,
"learning_rate": 1.3684819008243952e-06,
"loss": 0.5453,
"step": 975
},
{
"epoch": 2.8080229226361033,
"grad_norm": 0.22728435695171356,
"learning_rate": 1.3498706299465446e-06,
"loss": 0.5356,
"step": 980
},
{
"epoch": 2.822349570200573,
"grad_norm": 0.22559645771980286,
"learning_rate": 1.3325603552289166e-06,
"loss": 0.5432,
"step": 985
},
{
"epoch": 2.836676217765043,
"grad_norm": 0.2304041087627411,
"learning_rate": 1.3165552470355781e-06,
"loss": 0.5441,
"step": 990
},
{
"epoch": 2.8510028653295127,
"grad_norm": 0.22864393889904022,
"learning_rate": 1.301859161291938e-06,
"loss": 0.5417,
"step": 995
},
{
"epoch": 2.865329512893983,
"grad_norm": 0.22412388026714325,
"learning_rate": 1.2884756385557813e-06,
"loss": 0.5374,
"step": 1000
},
{
"epoch": 2.865329512893983,
"eval_loss": 0.5795248746871948,
"eval_runtime": 2.889,
"eval_samples_per_second": 14.538,
"eval_steps_per_second": 14.538,
"step": 1000
},
{
"epoch": 2.8796561604584525,
"grad_norm": 0.22551295161247253,
"learning_rate": 1.2764079031642852e-06,
"loss": 0.5425,
"step": 1005
},
{
"epoch": 2.8939828080229226,
"grad_norm": 0.22314225137233734,
"learning_rate": 1.265658862457217e-06,
"loss": 0.5405,
"step": 1010
},
{
"epoch": 2.9083094555873927,
"grad_norm": 0.22527816891670227,
"learning_rate": 1.2562311060765001e-06,
"loss": 0.5436,
"step": 1015
},
{
"epoch": 2.9226361031518624,
"grad_norm": 0.22648297250270844,
"learning_rate": 1.248126905342324e-06,
"loss": 0.5497,
"step": 1020
},
{
"epoch": 2.9369627507163325,
"grad_norm": 0.2278534471988678,
"learning_rate": 1.2413482127059402e-06,
"loss": 0.5391,
"step": 1025
},
{
"epoch": 2.951289398280802,
"grad_norm": 0.2279985249042511,
"learning_rate": 1.2358966612792807e-06,
"loss": 0.5398,
"step": 1030
},
{
"epoch": 2.9656160458452723,
"grad_norm": 0.23118627071380615,
"learning_rate": 1.2317735644415136e-06,
"loss": 0.5517,
"step": 1035
},
{
"epoch": 2.9799426934097424,
"grad_norm": 0.22241578996181488,
"learning_rate": 1.228979915522621e-06,
"loss": 0.5407,
"step": 1040
},
{
"epoch": 2.9914040114613183,
"eval_loss": 0.5790691375732422,
"eval_runtime": 2.8699,
"eval_samples_per_second": 14.635,
"eval_steps_per_second": 14.635,
"step": 1044
}
],
"logging_steps": 5,
"max_steps": 1047,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1950747837551084e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}