{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 894,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011198208286674132,
"grad_norm": 1.0978649854660034,
"learning_rate": 1.0714285714285714e-06,
"loss": 1.2808,
"step": 5
},
{
"epoch": 0.022396416573348264,
"grad_norm": 0.865485429763794,
"learning_rate": 2.410714285714286e-06,
"loss": 1.3219,
"step": 10
},
{
"epoch": 0.0335946248600224,
"grad_norm": 0.7268191576004028,
"learning_rate": 3.75e-06,
"loss": 1.2607,
"step": 15
},
{
"epoch": 0.04479283314669653,
"grad_norm": 0.6289479732513428,
"learning_rate": 5.0892857142857146e-06,
"loss": 1.2549,
"step": 20
},
{
"epoch": 0.055991041433370664,
"grad_norm": 0.5705351829528809,
"learning_rate": 6.428571428571429e-06,
"loss": 1.2545,
"step": 25
},
{
"epoch": 0.0671892497200448,
"grad_norm": 0.8970493078231812,
"learning_rate": 7.767857142857144e-06,
"loss": 1.2063,
"step": 30
},
{
"epoch": 0.07838745800671892,
"grad_norm": 0.5323396325111389,
"learning_rate": 9.107142857142856e-06,
"loss": 1.1831,
"step": 35
},
{
"epoch": 0.08958566629339305,
"grad_norm": 0.4824765622615814,
"learning_rate": 1.044642857142857e-05,
"loss": 1.1912,
"step": 40
},
{
"epoch": 0.10078387458006718,
"grad_norm": 0.5041835308074951,
"learning_rate": 1.1785714285714286e-05,
"loss": 1.1675,
"step": 45
},
{
"epoch": 0.11198208286674133,
"grad_norm": 0.4978722631931305,
"learning_rate": 1.3125e-05,
"loss": 1.1681,
"step": 50
},
{
"epoch": 0.12318029115341546,
"grad_norm": 0.428523451089859,
"learning_rate": 1.4464285714285715e-05,
"loss": 1.154,
"step": 55
},
{
"epoch": 0.1343784994400896,
"grad_norm": 0.4901936650276184,
"learning_rate": 1.580357142857143e-05,
"loss": 1.1488,
"step": 60
},
{
"epoch": 0.1455767077267637,
"grad_norm": 0.4443103075027466,
"learning_rate": 1.7142857142857142e-05,
"loss": 1.2096,
"step": 65
},
{
"epoch": 0.15677491601343785,
"grad_norm": 0.5249817967414856,
"learning_rate": 1.848214285714286e-05,
"loss": 1.1517,
"step": 70
},
{
"epoch": 0.167973124300112,
"grad_norm": 0.4530683755874634,
"learning_rate": 1.982142857142857e-05,
"loss": 1.1722,
"step": 75
},
{
"epoch": 0.1791713325867861,
"grad_norm": 0.47545140981674194,
"learning_rate": 2.1160714285714287e-05,
"loss": 1.1244,
"step": 80
},
{
"epoch": 0.19036954087346025,
"grad_norm": 0.4221179187297821,
"learning_rate": 2.25e-05,
"loss": 1.0931,
"step": 85
},
{
"epoch": 0.20156774916013437,
"grad_norm": 0.4603467285633087,
"learning_rate": 2.3839285714285713e-05,
"loss": 1.1179,
"step": 90
},
{
"epoch": 0.2127659574468085,
"grad_norm": 0.48785561323165894,
"learning_rate": 2.517857142857143e-05,
"loss": 1.1119,
"step": 95
},
{
"epoch": 0.22396416573348266,
"grad_norm": 0.46568208932876587,
"learning_rate": 2.6517857142857143e-05,
"loss": 1.1662,
"step": 100
},
{
"epoch": 0.23516237402015677,
"grad_norm": 0.43084895610809326,
"learning_rate": 2.7857142857142858e-05,
"loss": 1.1031,
"step": 105
},
{
"epoch": 0.24636058230683092,
"grad_norm": 0.46002936363220215,
"learning_rate": 2.9196428571428573e-05,
"loss": 1.1044,
"step": 110
},
{
"epoch": 0.25755879059350506,
"grad_norm": 0.5120033025741577,
"learning_rate": 2.9999934306758047e-05,
"loss": 1.0682,
"step": 115
},
{
"epoch": 0.2687569988801792,
"grad_norm": 0.5539029836654663,
"learning_rate": 2.9999195264394326e-05,
"loss": 1.0544,
"step": 120
},
{
"epoch": 0.2799552071668533,
"grad_norm": 0.5363937616348267,
"learning_rate": 2.9997635103707554e-05,
"loss": 1.0822,
"step": 125
},
{
"epoch": 0.2911534154535274,
"grad_norm": 0.49808958172798157,
"learning_rate": 2.999525391010742e-05,
"loss": 1.0351,
"step": 130
},
{
"epoch": 0.3023516237402016,
"grad_norm": 0.6337416768074036,
"learning_rate": 2.9992051813950364e-05,
"loss": 1.0445,
"step": 135
},
{
"epoch": 0.3135498320268757,
"grad_norm": 0.5711157321929932,
"learning_rate": 2.998802899053244e-05,
"loss": 1.0135,
"step": 140
},
{
"epoch": 0.3247480403135498,
"grad_norm": 0.6856613755226135,
"learning_rate": 2.998318566007973e-05,
"loss": 1.013,
"step": 145
},
{
"epoch": 0.335946248600224,
"grad_norm": 0.5463095307350159,
"learning_rate": 2.99775220877363e-05,
"loss": 1.0258,
"step": 150
},
{
"epoch": 0.3471444568868981,
"grad_norm": 0.6269132494926453,
"learning_rate": 2.9971038583549633e-05,
"loss": 1.0128,
"step": 155
},
{
"epoch": 0.3583426651735722,
"grad_norm": 0.5507866740226746,
"learning_rate": 2.9963735502453715e-05,
"loss": 0.9991,
"step": 160
},
{
"epoch": 0.36954087346024633,
"grad_norm": 0.6331682205200195,
"learning_rate": 2.995561324424958e-05,
"loss": 0.9721,
"step": 165
},
{
"epoch": 0.3807390817469205,
"grad_norm": 0.6370624303817749,
"learning_rate": 2.9946672253583415e-05,
"loss": 0.9839,
"step": 170
},
{
"epoch": 0.3919372900335946,
"grad_norm": 0.588683545589447,
"learning_rate": 2.9936913019922235e-05,
"loss": 0.9616,
"step": 175
},
{
"epoch": 0.40313549832026874,
"grad_norm": 0.6478800773620605,
"learning_rate": 2.9926336077527062e-05,
"loss": 0.9782,
"step": 180
},
{
"epoch": 0.4143337066069429,
"grad_norm": 0.6072382926940918,
"learning_rate": 2.9914942005423723e-05,
"loss": 0.9035,
"step": 185
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.830245852470398,
"learning_rate": 2.9902731427371096e-05,
"loss": 0.9616,
"step": 190
},
{
"epoch": 0.43673012318029114,
"grad_norm": 0.7037926912307739,
"learning_rate": 2.9889705011827006e-05,
"loss": 0.9044,
"step": 195
},
{
"epoch": 0.4479283314669653,
"grad_norm": 0.689367949962616,
"learning_rate": 2.9875863471911608e-05,
"loss": 0.9293,
"step": 200
},
{
"epoch": 0.45912653975363943,
"grad_norm": 0.7107216715812683,
"learning_rate": 2.9861207565368363e-05,
"loss": 0.908,
"step": 205
},
{
"epoch": 0.47032474804031354,
"grad_norm": 0.7383051514625549,
"learning_rate": 2.9845738094522533e-05,
"loss": 0.9435,
"step": 210
},
{
"epoch": 0.48152295632698766,
"grad_norm": 0.7229745388031006,
"learning_rate": 2.9829455906237287e-05,
"loss": 0.8984,
"step": 215
},
{
"epoch": 0.49272116461366183,
"grad_norm": 0.746848464012146,
"learning_rate": 2.9812361891867325e-05,
"loss": 0.8979,
"step": 220
},
{
"epoch": 0.503919372900336,
"grad_norm": 0.723301112651825,
"learning_rate": 2.979445698721007e-05,
"loss": 0.9337,
"step": 225
},
{
"epoch": 0.5151175811870101,
"grad_norm": 0.7699582576751709,
"learning_rate": 2.9775742172454473e-05,
"loss": 0.9443,
"step": 230
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.7237251400947571,
"learning_rate": 2.9756218472127302e-05,
"loss": 0.8489,
"step": 235
},
{
"epoch": 0.5375139977603584,
"grad_norm": 0.8070515990257263,
"learning_rate": 2.9735886955037118e-05,
"loss": 0.8639,
"step": 240
},
{
"epoch": 0.5487122060470325,
"grad_norm": 0.7536729574203491,
"learning_rate": 2.9714748734215714e-05,
"loss": 0.8083,
"step": 245
},
{
"epoch": 0.5599104143337066,
"grad_norm": 0.8057552576065063,
"learning_rate": 2.969280496685719e-05,
"loss": 0.8714,
"step": 250
},
{
"epoch": 0.5711086226203808,
"grad_norm": 0.8718234896659851,
"learning_rate": 2.9670056854254634e-05,
"loss": 0.8651,
"step": 255
},
{
"epoch": 0.5823068309070548,
"grad_norm": 0.8957020044326782,
"learning_rate": 2.9646505641734324e-05,
"loss": 0.8428,
"step": 260
},
{
"epoch": 0.593505039193729,
"grad_norm": 0.7812408804893494,
"learning_rate": 2.9622152618587576e-05,
"loss": 0.8646,
"step": 265
},
{
"epoch": 0.6047032474804032,
"grad_norm": 0.918906033039093,
"learning_rate": 2.9596999118000145e-05,
"loss": 0.8891,
"step": 270
},
{
"epoch": 0.6159014557670772,
"grad_norm": 0.9143264293670654,
"learning_rate": 2.9571046516979256e-05,
"loss": 0.814,
"step": 275
},
{
"epoch": 0.6270996640537514,
"grad_norm": 0.9321989417076111,
"learning_rate": 2.954429623627821e-05,
"loss": 0.7757,
"step": 280
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.7887775301933289,
"learning_rate": 2.9516749740318623e-05,
"loss": 0.8416,
"step": 285
},
{
"epoch": 0.6494960806270996,
"grad_norm": 0.8512288331985474,
"learning_rate": 2.948840853711022e-05,
"loss": 0.8131,
"step": 290
},
{
"epoch": 0.6606942889137738,
"grad_norm": 0.8105642199516296,
"learning_rate": 2.9459274178168335e-05,
"loss": 0.8115,
"step": 295
},
{
"epoch": 0.671892497200448,
"grad_norm": 0.7953616976737976,
"learning_rate": 2.9429348258428933e-05,
"loss": 0.8266,
"step": 300
},
{
"epoch": 0.683090705487122,
"grad_norm": 2.0142805576324463,
"learning_rate": 2.9398632416161298e-05,
"loss": 0.7923,
"step": 305
},
{
"epoch": 0.6942889137737962,
"grad_norm": 0.9255501627922058,
"learning_rate": 2.936712833287837e-05,
"loss": 0.7464,
"step": 310
},
{
"epoch": 0.7054871220604704,
"grad_norm": 0.8493944406509399,
"learning_rate": 2.9334837733244686e-05,
"loss": 0.7682,
"step": 315
},
{
"epoch": 0.7166853303471444,
"grad_norm": 1.1492375135421753,
"learning_rate": 2.9301762384981944e-05,
"loss": 0.7543,
"step": 320
},
{
"epoch": 0.7278835386338186,
"grad_norm": 0.9279423952102661,
"learning_rate": 2.926790409877225e-05,
"loss": 0.7954,
"step": 325
},
{
"epoch": 0.7390817469204927,
"grad_norm": 0.8645827770233154,
"learning_rate": 2.9233264728158997e-05,
"loss": 0.8014,
"step": 330
},
{
"epoch": 0.7502799552071668,
"grad_norm": 0.8726319670677185,
"learning_rate": 2.9197846169445376e-05,
"loss": 0.7396,
"step": 335
},
{
"epoch": 0.761478163493841,
"grad_norm": 0.9269015192985535,
"learning_rate": 2.916165036159058e-05,
"loss": 0.7109,
"step": 340
},
{
"epoch": 0.7726763717805151,
"grad_norm": 0.9171246290206909,
"learning_rate": 2.912467928610366e-05,
"loss": 0.7041,
"step": 345
},
{
"epoch": 0.7838745800671892,
"grad_norm": 0.8157719373703003,
"learning_rate": 2.9086934966935015e-05,
"loss": 0.7498,
"step": 350
},
{
"epoch": 0.7950727883538634,
"grad_norm": 1.0310955047607422,
"learning_rate": 2.9048419470365656e-05,
"loss": 0.7538,
"step": 355
},
{
"epoch": 0.8062709966405375,
"grad_norm": 0.9810894131660461,
"learning_rate": 2.9009134904894015e-05,
"loss": 0.698,
"step": 360
},
{
"epoch": 0.8174692049272116,
"grad_norm": 0.8990864157676697,
"learning_rate": 2.8969083421120587e-05,
"loss": 0.6898,
"step": 365
},
{
"epoch": 0.8286674132138858,
"grad_norm": 0.8737045526504517,
"learning_rate": 2.892826721163013e-05,
"loss": 0.7175,
"step": 370
},
{
"epoch": 0.8398656215005599,
"grad_norm": 0.8609009981155396,
"learning_rate": 2.8886688510871706e-05,
"loss": 0.7466,
"step": 375
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.9786498546600342,
"learning_rate": 2.884434959503628e-05,
"loss": 0.7221,
"step": 380
},
{
"epoch": 0.8622620380739082,
"grad_norm": 0.9019100069999695,
"learning_rate": 2.8801252781932194e-05,
"loss": 0.7047,
"step": 385
},
{
"epoch": 0.8734602463605823,
"grad_norm": 0.9129027128219604,
"learning_rate": 2.8757400430858193e-05,
"loss": 0.7235,
"step": 390
},
{
"epoch": 0.8846584546472565,
"grad_norm": 0.9829080700874329,
"learning_rate": 2.871279494247435e-05,
"loss": 0.6777,
"step": 395
},
{
"epoch": 0.8958566629339306,
"grad_norm": 0.9491150975227356,
"learning_rate": 2.8667438758670582e-05,
"loss": 0.7271,
"step": 400
},
{
"epoch": 0.9070548712206047,
"grad_norm": 0.9399238228797913,
"learning_rate": 2.8621334362433017e-05,
"loss": 0.6362,
"step": 405
},
{
"epoch": 0.9182530795072789,
"grad_norm": 0.9197304844856262,
"learning_rate": 2.857448427770802e-05,
"loss": 0.666,
"step": 410
},
{
"epoch": 0.9294512877939529,
"grad_norm": 0.9620344042778015,
"learning_rate": 2.8526891069264058e-05,
"loss": 0.677,
"step": 415
},
{
"epoch": 0.9406494960806271,
"grad_norm": 0.8626346588134766,
"learning_rate": 2.847855734255128e-05,
"loss": 0.6432,
"step": 420
},
{
"epoch": 0.9518477043673013,
"grad_norm": 0.9515690803527832,
"learning_rate": 2.8429485743558876e-05,
"loss": 0.6336,
"step": 425
},
{
"epoch": 0.9630459126539753,
"grad_norm": 1.0451574325561523,
"learning_rate": 2.8379678958670245e-05,
"loss": 0.6471,
"step": 430
},
{
"epoch": 0.9742441209406495,
"grad_norm": 0.9338580369949341,
"learning_rate": 2.8329139714515916e-05,
"loss": 0.6526,
"step": 435
},
{
"epoch": 0.9854423292273237,
"grad_norm": 1.034649133682251,
"learning_rate": 2.8277870777824278e-05,
"loss": 0.6459,
"step": 440
},
{
"epoch": 0.9966405375139977,
"grad_norm": 1.0309592485427856,
"learning_rate": 2.822587495527013e-05,
"loss": 0.6495,
"step": 445
},
{
"epoch": 1.0067189249720045,
"grad_norm": 0.9272093772888184,
"learning_rate": 2.817315509332102e-05,
"loss": 0.6473,
"step": 450
},
{
"epoch": 1.0179171332586787,
"grad_norm": 1.0917242765426636,
"learning_rate": 2.8119714078081428e-05,
"loss": 0.5854,
"step": 455
},
{
"epoch": 1.0291153415453527,
"grad_norm": 0.9767428040504456,
"learning_rate": 2.8065554835134766e-05,
"loss": 0.5468,
"step": 460
},
{
"epoch": 1.0403135498320268,
"grad_norm": 0.9543578028678894,
"learning_rate": 2.8010680329383213e-05,
"loss": 0.5365,
"step": 465
},
{
"epoch": 1.051511758118701,
"grad_norm": 0.9958118200302124,
"learning_rate": 2.7955093564885412e-05,
"loss": 0.5602,
"step": 470
},
{
"epoch": 1.0627099664053752,
"grad_norm": 0.9233402609825134,
"learning_rate": 2.7898797584692003e-05,
"loss": 0.5523,
"step": 475
},
{
"epoch": 1.0739081746920494,
"grad_norm": 1.1012506484985352,
"learning_rate": 2.784179547067906e-05,
"loss": 0.5821,
"step": 480
},
{
"epoch": 1.0851063829787233,
"grad_norm": 0.9179466962814331,
"learning_rate": 2.7784090343379337e-05,
"loss": 0.5227,
"step": 485
},
{
"epoch": 1.0963045912653975,
"grad_norm": 1.1877224445343018,
"learning_rate": 2.7725685361811472e-05,
"loss": 0.5462,
"step": 490
},
{
"epoch": 1.1075027995520716,
"grad_norm": 0.9985042810440063,
"learning_rate": 2.7666583723307035e-05,
"loss": 0.5437,
"step": 495
},
{
"epoch": 1.1187010078387458,
"grad_norm": 1.211738109588623,
"learning_rate": 2.7606788663335498e-05,
"loss": 0.5293,
"step": 500
},
{
"epoch": 1.12989921612542,
"grad_norm": 1.0043666362762451,
"learning_rate": 2.7546303455327093e-05,
"loss": 0.5199,
"step": 505
},
{
"epoch": 1.1410974244120942,
"grad_norm": 1.134454369544983,
"learning_rate": 2.7485131410493644e-05,
"loss": 0.5272,
"step": 510
},
{
"epoch": 1.1522956326987681,
"grad_norm": 1.006064534187317,
"learning_rate": 2.742327587764726e-05,
"loss": 0.5601,
"step": 515
},
{
"epoch": 1.1634938409854423,
"grad_norm": 1.0664838552474976,
"learning_rate": 2.7360740243017042e-05,
"loss": 0.5324,
"step": 520
},
{
"epoch": 1.1746920492721165,
"grad_norm": 0.9521324634552002,
"learning_rate": 2.729752793006368e-05,
"loss": 0.5258,
"step": 525
},
{
"epoch": 1.1858902575587906,
"grad_norm": 1.0168689489364624,
"learning_rate": 2.723364239929206e-05,
"loss": 0.5062,
"step": 530
},
{
"epoch": 1.1970884658454648,
"grad_norm": 1.09732985496521,
"learning_rate": 2.71690871480618e-05,
"loss": 0.5249,
"step": 535
},
{
"epoch": 1.2082866741321387,
"grad_norm": 1.062366008758545,
"learning_rate": 2.7103865710395803e-05,
"loss": 0.5003,
"step": 540
},
{
"epoch": 1.219484882418813,
"grad_norm": 1.0284899473190308,
"learning_rate": 2.7037981656786802e-05,
"loss": 0.4832,
"step": 545
},
{
"epoch": 1.230683090705487,
"grad_norm": 1.1590033769607544,
"learning_rate": 2.6971438594001862e-05,
"loss": 0.502,
"step": 550
},
{
"epoch": 1.2418812989921613,
"grad_norm": 1.2048217058181763,
"learning_rate": 2.690424016488496e-05,
"loss": 0.5068,
"step": 555
},
{
"epoch": 1.2530795072788354,
"grad_norm": 1.0057001113891602,
"learning_rate": 2.6836390048157555e-05,
"loss": 0.501,
"step": 560
},
{
"epoch": 1.2642777155655094,
"grad_norm": 1.0815175771713257,
"learning_rate": 2.676789195821719e-05,
"loss": 0.4754,
"step": 565
},
{
"epoch": 1.2754759238521838,
"grad_norm": 0.9652005434036255,
"learning_rate": 2.6698749644934155e-05,
"loss": 0.4928,
"step": 570
},
{
"epoch": 1.2866741321388577,
"grad_norm": 1.0275248289108276,
"learning_rate": 2.6628966893446215e-05,
"loss": 0.4819,
"step": 575
},
{
"epoch": 1.297872340425532,
"grad_norm": 1.0320004224777222,
"learning_rate": 2.655854752395137e-05,
"loss": 0.5305,
"step": 580
},
{
"epoch": 1.309070548712206,
"grad_norm": 1.03120756149292,
"learning_rate": 2.6487495391498757e-05,
"loss": 0.4958,
"step": 585
},
{
"epoch": 1.3202687569988802,
"grad_norm": 0.9786254167556763,
"learning_rate": 2.6415814385777565e-05,
"loss": 0.4638,
"step": 590
},
{
"epoch": 1.3314669652855544,
"grad_norm": 1.0948402881622314,
"learning_rate": 2.634350843090414e-05,
"loss": 0.4689,
"step": 595
},
{
"epoch": 1.3426651735722284,
"grad_norm": 1.3032407760620117,
"learning_rate": 2.6270581485207137e-05,
"loss": 0.4767,
"step": 600
},
{
"epoch": 1.3538633818589025,
"grad_norm": 1.0457333326339722,
"learning_rate": 2.6197037541010827e-05,
"loss": 0.4747,
"step": 605
},
{
"epoch": 1.3650615901455767,
"grad_norm": 0.9719673991203308,
"learning_rate": 2.6122880624416553e-05,
"loss": 0.4693,
"step": 610
},
{
"epoch": 1.3762597984322509,
"grad_norm": 1.0594805479049683,
"learning_rate": 2.604811479508231e-05,
"loss": 0.4199,
"step": 615
},
{
"epoch": 1.387458006718925,
"grad_norm": 1.0848315954208374,
"learning_rate": 2.5972744146000504e-05,
"loss": 0.4375,
"step": 620
},
{
"epoch": 1.398656215005599,
"grad_norm": 1.1041808128356934,
"learning_rate": 2.5896772803273903e-05,
"loss": 0.5127,
"step": 625
},
{
"epoch": 1.4098544232922732,
"grad_norm": 1.0456072092056274,
"learning_rate": 2.582020492588973e-05,
"loss": 0.484,
"step": 630
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.9886185526847839,
"learning_rate": 2.574304470549201e-05,
"loss": 0.4442,
"step": 635
},
{
"epoch": 1.4322508398656215,
"grad_norm": 1.148207426071167,
"learning_rate": 2.566529636615207e-05,
"loss": 0.4223,
"step": 640
},
{
"epoch": 1.4434490481522957,
"grad_norm": 0.9983200430870056,
"learning_rate": 2.5586964164137325e-05,
"loss": 0.4234,
"step": 645
},
{
"epoch": 1.4546472564389696,
"grad_norm": 1.0275447368621826,
"learning_rate": 2.5508052387678258e-05,
"loss": 0.4618,
"step": 650
},
{
"epoch": 1.465845464725644,
"grad_norm": 1.1754274368286133,
"learning_rate": 2.5428565356733672e-05,
"loss": 0.4258,
"step": 655
},
{
"epoch": 1.477043673012318,
"grad_norm": 1.3802127838134766,
"learning_rate": 2.534850742275418e-05,
"loss": 0.4388,
"step": 660
},
{
"epoch": 1.4882418812989922,
"grad_norm": 1.1148616075515747,
"learning_rate": 2.5267882968444017e-05,
"loss": 0.4366,
"step": 665
},
{
"epoch": 1.4994400895856663,
"grad_norm": 1.0787454843521118,
"learning_rate": 2.518669640752109e-05,
"loss": 0.4121,
"step": 670
},
{
"epoch": 1.5106382978723403,
"grad_norm": 1.1478720903396606,
"learning_rate": 2.5104952184475346e-05,
"loss": 0.4201,
"step": 675
},
{
"epoch": 1.5218365061590147,
"grad_norm": 0.9818703532218933,
"learning_rate": 2.5022654774325507e-05,
"loss": 0.4343,
"step": 680
},
{
"epoch": 1.5330347144456886,
"grad_norm": 1.0454514026641846,
"learning_rate": 2.4939808682374028e-05,
"loss": 0.4594,
"step": 685
},
{
"epoch": 1.5442329227323628,
"grad_norm": 0.996567964553833,
"learning_rate": 2.48564184439605e-05,
"loss": 0.4373,
"step": 690
},
{
"epoch": 1.555431131019037,
"grad_norm": 1.1099140644073486,
"learning_rate": 2.4772488624213352e-05,
"loss": 0.4076,
"step": 695
},
{
"epoch": 1.5666293393057111,
"grad_norm": 1.1161383390426636,
"learning_rate": 2.4688023817799944e-05,
"loss": 0.4096,
"step": 700
},
{
"epoch": 1.5778275475923853,
"grad_norm": 1.0627449750900269,
"learning_rate": 2.460302864867502e-05,
"loss": 0.4008,
"step": 705
},
{
"epoch": 1.5890257558790593,
"grad_norm": 1.0296977758407593,
"learning_rate": 2.4517507769827598e-05,
"loss": 0.469,
"step": 710
},
{
"epoch": 1.6002239641657336,
"grad_norm": 1.1758477687835693,
"learning_rate": 2.4431465863026223e-05,
"loss": 0.3854,
"step": 715
},
{
"epoch": 1.6114221724524076,
"grad_norm": 1.0715276002883911,
"learning_rate": 2.434490763856268e-05,
"loss": 0.4006,
"step": 720
},
{
"epoch": 1.6226203807390818,
"grad_norm": 1.1095898151397705,
"learning_rate": 2.4257837834994123e-05,
"loss": 0.3902,
"step": 725
},
{
"epoch": 1.633818589025756,
"grad_norm": 1.100608468055725,
"learning_rate": 2.4170261218883686e-05,
"loss": 0.4034,
"step": 730
},
{
"epoch": 1.64501679731243,
"grad_norm": 1.1237363815307617,
"learning_rate": 2.4082182584539526e-05,
"loss": 0.4103,
"step": 735
},
{
"epoch": 1.6562150055991043,
"grad_norm": 1.0138239860534668,
"learning_rate": 2.3993606753752356e-05,
"loss": 0.3801,
"step": 740
},
{
"epoch": 1.6674132138857782,
"grad_norm": 1.0473990440368652,
"learning_rate": 2.39045385755315e-05,
"loss": 0.3642,
"step": 745
},
{
"epoch": 1.6786114221724524,
"grad_norm": 0.9463981986045837,
"learning_rate": 2.381498292583942e-05,
"loss": 0.4,
"step": 750
},
{
"epoch": 1.6898096304591266,
"grad_norm": 1.0383639335632324,
"learning_rate": 2.37249447073248e-05,
"loss": 0.4388,
"step": 755
},
{
"epoch": 1.7010078387458005,
"grad_norm": 1.0742672681808472,
"learning_rate": 2.3634428849054156e-05,
"loss": 0.3784,
"step": 760
},
{
"epoch": 1.712206047032475,
"grad_norm": 0.9601068496704102,
"learning_rate": 2.3543440306241965e-05,
"loss": 0.4132,
"step": 765
},
{
"epoch": 1.7234042553191489,
"grad_norm": 1.1224182844161987,
"learning_rate": 2.3451984059979444e-05,
"loss": 0.3628,
"step": 770
},
{
"epoch": 1.734602463605823,
"grad_norm": 1.1569995880126953,
"learning_rate": 2.336006511696184e-05,
"loss": 0.3678,
"step": 775
},
{
"epoch": 1.7458006718924972,
"grad_norm": 1.1215687990188599,
"learning_rate": 2.3267688509214335e-05,
"loss": 0.3658,
"step": 780
},
{
"epoch": 1.7569988801791714,
"grad_norm": 1.0910794734954834,
"learning_rate": 2.317485929381658e-05,
"loss": 0.3702,
"step": 785
},
{
"epoch": 1.7681970884658456,
"grad_norm": 1.161943793296814,
"learning_rate": 2.3081582552625867e-05,
"loss": 0.3851,
"step": 790
},
{
"epoch": 1.7793952967525195,
"grad_norm": 1.1181591749191284,
"learning_rate": 2.29878633919989e-05,
"loss": 0.3623,
"step": 795
},
{
"epoch": 1.7905935050391937,
"grad_norm": 0.9931478500366211,
"learning_rate": 2.2893706942512257e-05,
"loss": 0.3783,
"step": 800
},
{
"epoch": 1.8017917133258678,
"grad_norm": 1.125067114830017,
"learning_rate": 2.2799118358681535e-05,
"loss": 0.3615,
"step": 805
},
{
"epoch": 1.812989921612542,
"grad_norm": 1.1069613695144653,
"learning_rate": 2.2704102818679164e-05,
"loss": 0.3572,
"step": 810
},
{
"epoch": 1.8241881298992162,
"grad_norm": 0.9140533208847046,
"learning_rate": 2.2608665524050923e-05,
"loss": 0.3553,
"step": 815
},
{
"epoch": 1.8353863381858901,
"grad_norm": 0.987232506275177,
"learning_rate": 2.25128116994312e-05,
"loss": 0.3563,
"step": 820
},
{
"epoch": 1.8465845464725645,
"grad_norm": 1.1312346458435059,
"learning_rate": 2.241654659225696e-05,
"loss": 0.3563,
"step": 825
},
{
"epoch": 1.8577827547592385,
"grad_norm": 1.0891382694244385,
"learning_rate": 2.231987547248049e-05,
"loss": 0.3586,
"step": 830
},
{
"epoch": 1.8689809630459127,
"grad_norm": 1.134059190750122,
"learning_rate": 2.2222803632280894e-05,
"loss": 0.3539,
"step": 835
},
{
"epoch": 1.8801791713325868,
"grad_norm": 1.0670528411865234,
"learning_rate": 2.2125336385774385e-05,
"loss": 0.3506,
"step": 840
},
{
"epoch": 1.8913773796192608,
"grad_norm": 1.1700224876403809,
"learning_rate": 2.2027479068723345e-05,
"loss": 0.344,
"step": 845
},
{
"epoch": 1.9025755879059352,
"grad_norm": 1.1153310537338257,
"learning_rate": 2.1929237038244254e-05,
"loss": 0.335,
"step": 850
},
{
"epoch": 1.9137737961926091,
"grad_norm": 3.077934741973877,
"learning_rate": 2.1830615672514404e-05,
"loss": 0.3383,
"step": 855
},
{
"epoch": 1.9249720044792833,
"grad_norm": 0.9678553342819214,
"learning_rate": 2.1731620370477468e-05,
"loss": 0.337,
"step": 860
},
{
"epoch": 1.9361702127659575,
"grad_norm": 0.9908361434936523,
"learning_rate": 2.1632256551547952e-05,
"loss": 0.3673,
"step": 865
},
{
"epoch": 1.9473684210526314,
"grad_norm": 1.0404701232910156,
"learning_rate": 2.1532529655314514e-05,
"loss": 0.3276,
"step": 870
},
{
"epoch": 1.9585666293393058,
"grad_norm": 0.9922332763671875,
"learning_rate": 2.1432445141242166e-05,
"loss": 0.3408,
"step": 875
},
{
"epoch": 1.9697648376259798,
"grad_norm": 1.036054253578186,
"learning_rate": 2.1332008488373417e-05,
"loss": 0.3758,
"step": 880
},
{
"epoch": 1.980963045912654,
"grad_norm": 1.0212777853012085,
"learning_rate": 2.12312251950283e-05,
"loss": 0.3191,
"step": 885
},
{
"epoch": 1.992161254199328,
"grad_norm": 1.0531727075576782,
"learning_rate": 2.1130100778503407e-05,
"loss": 0.3344,
"step": 890
}
],
"logging_steps": 5,
"max_steps": 2235,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.352177971225428e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}