sail-clip-hendrix-run / trainer_state.json
cringgaard's picture
Upload 7 files
5f3f42a verified
{
"best_metric": 2.3842289447784424,
"best_model_checkpoint": "./results/checkpoint-720",
"epoch": 3.977900552486188,
"eval_steps": 10,
"global_step": 720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.055248618784530384,
"grad_norm": 3.8338866233825684,
"learning_rate": 4.972375690607735e-05,
"loss": 5.1215,
"step": 10
},
{
"epoch": 0.055248618784530384,
"eval_loss": 4.150951862335205,
"eval_runtime": 107.472,
"eval_samples_per_second": 11.947,
"eval_steps_per_second": 0.195,
"step": 10
},
{
"epoch": 0.11049723756906077,
"grad_norm": 1.0968620777130127,
"learning_rate": 4.94475138121547e-05,
"loss": 4.172,
"step": 20
},
{
"epoch": 0.11049723756906077,
"eval_loss": 4.160157203674316,
"eval_runtime": 108.2249,
"eval_samples_per_second": 11.864,
"eval_steps_per_second": 0.194,
"step": 20
},
{
"epoch": 0.16574585635359115,
"grad_norm": 9.813711166381836,
"learning_rate": 4.9171270718232046e-05,
"loss": 4.1638,
"step": 30
},
{
"epoch": 0.16574585635359115,
"eval_loss": 4.136767387390137,
"eval_runtime": 106.48,
"eval_samples_per_second": 12.059,
"eval_steps_per_second": 0.197,
"step": 30
},
{
"epoch": 0.22099447513812154,
"grad_norm": 1.0513194799423218,
"learning_rate": 4.8895027624309394e-05,
"loss": 4.1615,
"step": 40
},
{
"epoch": 0.22099447513812154,
"eval_loss": 4.142675399780273,
"eval_runtime": 108.3224,
"eval_samples_per_second": 11.853,
"eval_steps_per_second": 0.194,
"step": 40
},
{
"epoch": 0.27624309392265195,
"grad_norm": 2.283129930496216,
"learning_rate": 4.861878453038674e-05,
"loss": 4.1346,
"step": 50
},
{
"epoch": 0.27624309392265195,
"eval_loss": 4.11836576461792,
"eval_runtime": 107.0752,
"eval_samples_per_second": 11.992,
"eval_steps_per_second": 0.196,
"step": 50
},
{
"epoch": 0.3314917127071823,
"grad_norm": 2.6535284519195557,
"learning_rate": 4.834254143646409e-05,
"loss": 4.1412,
"step": 60
},
{
"epoch": 0.3314917127071823,
"eval_loss": 4.119892597198486,
"eval_runtime": 108.68,
"eval_samples_per_second": 11.814,
"eval_steps_per_second": 0.193,
"step": 60
},
{
"epoch": 0.3867403314917127,
"grad_norm": 2.246750831604004,
"learning_rate": 4.806629834254144e-05,
"loss": 4.2136,
"step": 70
},
{
"epoch": 0.3867403314917127,
"eval_loss": 4.110352516174316,
"eval_runtime": 109.321,
"eval_samples_per_second": 11.745,
"eval_steps_per_second": 0.192,
"step": 70
},
{
"epoch": 0.4419889502762431,
"grad_norm": 5.01633358001709,
"learning_rate": 4.7790055248618785e-05,
"loss": 4.1121,
"step": 80
},
{
"epoch": 0.4419889502762431,
"eval_loss": 4.114729404449463,
"eval_runtime": 108.7219,
"eval_samples_per_second": 11.81,
"eval_steps_per_second": 0.193,
"step": 80
},
{
"epoch": 0.4972375690607735,
"grad_norm": 6.723274230957031,
"learning_rate": 4.751381215469613e-05,
"loss": 4.0936,
"step": 90
},
{
"epoch": 0.4972375690607735,
"eval_loss": 4.197766304016113,
"eval_runtime": 109.473,
"eval_samples_per_second": 11.729,
"eval_steps_per_second": 0.192,
"step": 90
},
{
"epoch": 0.5524861878453039,
"grad_norm": 2.4457666873931885,
"learning_rate": 4.723756906077349e-05,
"loss": 4.1228,
"step": 100
},
{
"epoch": 0.5524861878453039,
"eval_loss": 4.094923496246338,
"eval_runtime": 109.8167,
"eval_samples_per_second": 11.692,
"eval_steps_per_second": 0.191,
"step": 100
},
{
"epoch": 0.6077348066298343,
"grad_norm": 2.2743124961853027,
"learning_rate": 4.6961325966850835e-05,
"loss": 4.0925,
"step": 110
},
{
"epoch": 0.6077348066298343,
"eval_loss": 4.078197479248047,
"eval_runtime": 110.7223,
"eval_samples_per_second": 11.597,
"eval_steps_per_second": 0.19,
"step": 110
},
{
"epoch": 0.6629834254143646,
"grad_norm": 3.141187906265259,
"learning_rate": 4.6685082872928176e-05,
"loss": 4.0646,
"step": 120
},
{
"epoch": 0.6629834254143646,
"eval_loss": 4.101787090301514,
"eval_runtime": 108.3177,
"eval_samples_per_second": 11.854,
"eval_steps_per_second": 0.194,
"step": 120
},
{
"epoch": 0.7182320441988951,
"grad_norm": 7.580991744995117,
"learning_rate": 4.6408839779005524e-05,
"loss": 4.0936,
"step": 130
},
{
"epoch": 0.7182320441988951,
"eval_loss": 4.074957847595215,
"eval_runtime": 109.0627,
"eval_samples_per_second": 11.773,
"eval_steps_per_second": 0.193,
"step": 130
},
{
"epoch": 0.7734806629834254,
"grad_norm": 5.120913028717041,
"learning_rate": 4.613259668508287e-05,
"loss": 4.0563,
"step": 140
},
{
"epoch": 0.7734806629834254,
"eval_loss": 4.028809070587158,
"eval_runtime": 108.4672,
"eval_samples_per_second": 11.838,
"eval_steps_per_second": 0.194,
"step": 140
},
{
"epoch": 0.8287292817679558,
"grad_norm": 6.340137004852295,
"learning_rate": 4.585635359116022e-05,
"loss": 4.0016,
"step": 150
},
{
"epoch": 0.8287292817679558,
"eval_loss": 4.02580451965332,
"eval_runtime": 108.9318,
"eval_samples_per_second": 11.787,
"eval_steps_per_second": 0.193,
"step": 150
},
{
"epoch": 0.8839779005524862,
"grad_norm": 3.361588954925537,
"learning_rate": 4.5580110497237574e-05,
"loss": 4.0029,
"step": 160
},
{
"epoch": 0.8839779005524862,
"eval_loss": 4.001326560974121,
"eval_runtime": 110.4105,
"eval_samples_per_second": 11.629,
"eval_steps_per_second": 0.19,
"step": 160
},
{
"epoch": 0.9392265193370166,
"grad_norm": 10.81383991241455,
"learning_rate": 4.530386740331492e-05,
"loss": 4.0072,
"step": 170
},
{
"epoch": 0.9392265193370166,
"eval_loss": 4.006492614746094,
"eval_runtime": 110.8042,
"eval_samples_per_second": 11.588,
"eval_steps_per_second": 0.19,
"step": 170
},
{
"epoch": 0.994475138121547,
"grad_norm": 4.824268817901611,
"learning_rate": 4.502762430939227e-05,
"loss": 4.0382,
"step": 180
},
{
"epoch": 0.994475138121547,
"eval_loss": 4.0064873695373535,
"eval_runtime": 109.8857,
"eval_samples_per_second": 11.685,
"eval_steps_per_second": 0.191,
"step": 180
},
{
"epoch": 1.0497237569060773,
"grad_norm": 6.9584760665893555,
"learning_rate": 4.475138121546962e-05,
"loss": 3.8569,
"step": 190
},
{
"epoch": 1.0497237569060773,
"eval_loss": 3.9973037242889404,
"eval_runtime": 111.7743,
"eval_samples_per_second": 11.487,
"eval_steps_per_second": 0.188,
"step": 190
},
{
"epoch": 1.1049723756906078,
"grad_norm": 10.90897274017334,
"learning_rate": 4.447513812154696e-05,
"loss": 3.9025,
"step": 200
},
{
"epoch": 1.1049723756906078,
"eval_loss": 3.920397996902466,
"eval_runtime": 110.0718,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.191,
"step": 200
},
{
"epoch": 1.160220994475138,
"grad_norm": 6.581048011779785,
"learning_rate": 4.419889502762431e-05,
"loss": 3.9265,
"step": 210
},
{
"epoch": 1.160220994475138,
"eval_loss": 3.9369547367095947,
"eval_runtime": 108.7006,
"eval_samples_per_second": 11.812,
"eval_steps_per_second": 0.193,
"step": 210
},
{
"epoch": 1.2154696132596685,
"grad_norm": 7.869502544403076,
"learning_rate": 4.392265193370166e-05,
"loss": 3.8242,
"step": 220
},
{
"epoch": 1.2154696132596685,
"eval_loss": 3.932981491088867,
"eval_runtime": 110.8711,
"eval_samples_per_second": 11.581,
"eval_steps_per_second": 0.189,
"step": 220
},
{
"epoch": 1.270718232044199,
"grad_norm": 6.994544982910156,
"learning_rate": 4.364640883977901e-05,
"loss": 3.8785,
"step": 230
},
{
"epoch": 1.270718232044199,
"eval_loss": 3.914726495742798,
"eval_runtime": 108.3935,
"eval_samples_per_second": 11.846,
"eval_steps_per_second": 0.194,
"step": 230
},
{
"epoch": 1.3259668508287292,
"grad_norm": 9.24843978881836,
"learning_rate": 4.337016574585636e-05,
"loss": 3.8126,
"step": 240
},
{
"epoch": 1.3259668508287292,
"eval_loss": 3.878098487854004,
"eval_runtime": 108.7329,
"eval_samples_per_second": 11.809,
"eval_steps_per_second": 0.193,
"step": 240
},
{
"epoch": 1.3812154696132597,
"grad_norm": 7.285367965698242,
"learning_rate": 4.3093922651933705e-05,
"loss": 3.8305,
"step": 250
},
{
"epoch": 1.3812154696132597,
"eval_loss": 3.9157791137695312,
"eval_runtime": 109.2735,
"eval_samples_per_second": 11.75,
"eval_steps_per_second": 0.192,
"step": 250
},
{
"epoch": 1.43646408839779,
"grad_norm": 6.021206378936768,
"learning_rate": 4.281767955801105e-05,
"loss": 3.7631,
"step": 260
},
{
"epoch": 1.43646408839779,
"eval_loss": 3.818014144897461,
"eval_runtime": 109.4955,
"eval_samples_per_second": 11.727,
"eval_steps_per_second": 0.192,
"step": 260
},
{
"epoch": 1.4917127071823204,
"grad_norm": 5.70164680480957,
"learning_rate": 4.25414364640884e-05,
"loss": 3.7908,
"step": 270
},
{
"epoch": 1.4917127071823204,
"eval_loss": 3.7703230381011963,
"eval_runtime": 108.0073,
"eval_samples_per_second": 11.888,
"eval_steps_per_second": 0.194,
"step": 270
},
{
"epoch": 1.5469613259668509,
"grad_norm": 8.903672218322754,
"learning_rate": 4.226519337016575e-05,
"loss": 3.7319,
"step": 280
},
{
"epoch": 1.5469613259668509,
"eval_loss": 3.813979387283325,
"eval_runtime": 108.0907,
"eval_samples_per_second": 11.879,
"eval_steps_per_second": 0.194,
"step": 280
},
{
"epoch": 1.6022099447513813,
"grad_norm": 7.5272111892700195,
"learning_rate": 4.1988950276243096e-05,
"loss": 3.765,
"step": 290
},
{
"epoch": 1.6022099447513813,
"eval_loss": 3.822376012802124,
"eval_runtime": 107.6694,
"eval_samples_per_second": 11.925,
"eval_steps_per_second": 0.195,
"step": 290
},
{
"epoch": 1.6574585635359116,
"grad_norm": 8.640732765197754,
"learning_rate": 4.1712707182320444e-05,
"loss": 3.7465,
"step": 300
},
{
"epoch": 1.6574585635359116,
"eval_loss": 3.7796106338500977,
"eval_runtime": 107.2692,
"eval_samples_per_second": 11.97,
"eval_steps_per_second": 0.196,
"step": 300
},
{
"epoch": 1.7127071823204418,
"grad_norm": 8.788119316101074,
"learning_rate": 4.143646408839779e-05,
"loss": 3.6646,
"step": 310
},
{
"epoch": 1.7127071823204418,
"eval_loss": 3.711517810821533,
"eval_runtime": 108.3013,
"eval_samples_per_second": 11.856,
"eval_steps_per_second": 0.194,
"step": 310
},
{
"epoch": 1.7679558011049723,
"grad_norm": 12.237804412841797,
"learning_rate": 4.116022099447514e-05,
"loss": 3.67,
"step": 320
},
{
"epoch": 1.7679558011049723,
"eval_loss": 3.686821460723877,
"eval_runtime": 109.2103,
"eval_samples_per_second": 11.757,
"eval_steps_per_second": 0.192,
"step": 320
},
{
"epoch": 1.8232044198895028,
"grad_norm": 10.34768295288086,
"learning_rate": 4.088397790055249e-05,
"loss": 3.6374,
"step": 330
},
{
"epoch": 1.8232044198895028,
"eval_loss": 3.646404504776001,
"eval_runtime": 108.3039,
"eval_samples_per_second": 11.856,
"eval_steps_per_second": 0.194,
"step": 330
},
{
"epoch": 1.8784530386740332,
"grad_norm": 10.875980377197266,
"learning_rate": 4.0607734806629835e-05,
"loss": 3.5907,
"step": 340
},
{
"epoch": 1.8784530386740332,
"eval_loss": 3.631521463394165,
"eval_runtime": 108.2559,
"eval_samples_per_second": 11.861,
"eval_steps_per_second": 0.194,
"step": 340
},
{
"epoch": 1.9337016574585635,
"grad_norm": 9.671201705932617,
"learning_rate": 4.033149171270719e-05,
"loss": 3.6003,
"step": 350
},
{
"epoch": 1.9337016574585635,
"eval_loss": 3.551748037338257,
"eval_runtime": 108.4367,
"eval_samples_per_second": 11.841,
"eval_steps_per_second": 0.194,
"step": 350
},
{
"epoch": 1.988950276243094,
"grad_norm": 11.600411415100098,
"learning_rate": 4.005524861878453e-05,
"loss": 3.5967,
"step": 360
},
{
"epoch": 1.988950276243094,
"eval_loss": 3.5431878566741943,
"eval_runtime": 107.9821,
"eval_samples_per_second": 11.891,
"eval_steps_per_second": 0.194,
"step": 360
},
{
"epoch": 2.044198895027624,
"grad_norm": 17.58928680419922,
"learning_rate": 3.977900552486188e-05,
"loss": 3.3887,
"step": 370
},
{
"epoch": 2.044198895027624,
"eval_loss": 3.5802059173583984,
"eval_runtime": 107.9756,
"eval_samples_per_second": 11.892,
"eval_steps_per_second": 0.194,
"step": 370
},
{
"epoch": 2.0994475138121547,
"grad_norm": 12.583136558532715,
"learning_rate": 3.950276243093923e-05,
"loss": 3.2413,
"step": 380
},
{
"epoch": 2.0994475138121547,
"eval_loss": 3.5067298412323,
"eval_runtime": 108.2401,
"eval_samples_per_second": 11.863,
"eval_steps_per_second": 0.194,
"step": 380
},
{
"epoch": 2.154696132596685,
"grad_norm": 13.868547439575195,
"learning_rate": 3.9226519337016574e-05,
"loss": 3.0502,
"step": 390
},
{
"epoch": 2.154696132596685,
"eval_loss": 3.548964738845825,
"eval_runtime": 108.3243,
"eval_samples_per_second": 11.853,
"eval_steps_per_second": 0.194,
"step": 390
},
{
"epoch": 2.2099447513812156,
"grad_norm": 18.013263702392578,
"learning_rate": 3.895027624309392e-05,
"loss": 3.2647,
"step": 400
},
{
"epoch": 2.2099447513812156,
"eval_loss": 3.41357159614563,
"eval_runtime": 108.3964,
"eval_samples_per_second": 11.845,
"eval_steps_per_second": 0.194,
"step": 400
},
{
"epoch": 2.265193370165746,
"grad_norm": 12.967714309692383,
"learning_rate": 3.867403314917128e-05,
"loss": 3.1265,
"step": 410
},
{
"epoch": 2.265193370165746,
"eval_loss": 3.4157204627990723,
"eval_runtime": 108.3207,
"eval_samples_per_second": 11.854,
"eval_steps_per_second": 0.194,
"step": 410
},
{
"epoch": 2.320441988950276,
"grad_norm": 17.192251205444336,
"learning_rate": 3.8397790055248625e-05,
"loss": 3.0176,
"step": 420
},
{
"epoch": 2.320441988950276,
"eval_loss": 3.4587888717651367,
"eval_runtime": 108.7782,
"eval_samples_per_second": 11.804,
"eval_steps_per_second": 0.193,
"step": 420
},
{
"epoch": 2.3756906077348066,
"grad_norm": 17.41048240661621,
"learning_rate": 3.812154696132597e-05,
"loss": 3.0366,
"step": 430
},
{
"epoch": 2.3756906077348066,
"eval_loss": 3.359968900680542,
"eval_runtime": 108.3212,
"eval_samples_per_second": 11.854,
"eval_steps_per_second": 0.194,
"step": 430
},
{
"epoch": 2.430939226519337,
"grad_norm": 14.966797828674316,
"learning_rate": 3.7845303867403314e-05,
"loss": 3.0515,
"step": 440
},
{
"epoch": 2.430939226519337,
"eval_loss": 3.405341148376465,
"eval_runtime": 108.5034,
"eval_samples_per_second": 11.834,
"eval_steps_per_second": 0.194,
"step": 440
},
{
"epoch": 2.4861878453038675,
"grad_norm": 14.554710388183594,
"learning_rate": 3.756906077348066e-05,
"loss": 3.1383,
"step": 450
},
{
"epoch": 2.4861878453038675,
"eval_loss": 3.261054754257202,
"eval_runtime": 107.8836,
"eval_samples_per_second": 11.902,
"eval_steps_per_second": 0.195,
"step": 450
},
{
"epoch": 2.541436464088398,
"grad_norm": 22.434762954711914,
"learning_rate": 3.729281767955801e-05,
"loss": 2.9971,
"step": 460
},
{
"epoch": 2.541436464088398,
"eval_loss": 3.229337692260742,
"eval_runtime": 108.7988,
"eval_samples_per_second": 11.802,
"eval_steps_per_second": 0.193,
"step": 460
},
{
"epoch": 2.596685082872928,
"grad_norm": 15.667607307434082,
"learning_rate": 3.7016574585635364e-05,
"loss": 3.047,
"step": 470
},
{
"epoch": 2.596685082872928,
"eval_loss": 3.224137783050537,
"eval_runtime": 110.5326,
"eval_samples_per_second": 11.616,
"eval_steps_per_second": 0.19,
"step": 470
},
{
"epoch": 2.6519337016574585,
"grad_norm": 16.986766815185547,
"learning_rate": 3.674033149171271e-05,
"loss": 2.8851,
"step": 480
},
{
"epoch": 2.6519337016574585,
"eval_loss": 3.2184762954711914,
"eval_runtime": 108.027,
"eval_samples_per_second": 11.886,
"eval_steps_per_second": 0.194,
"step": 480
},
{
"epoch": 2.707182320441989,
"grad_norm": 13.545926094055176,
"learning_rate": 3.646408839779006e-05,
"loss": 2.8976,
"step": 490
},
{
"epoch": 2.707182320441989,
"eval_loss": 3.082709550857544,
"eval_runtime": 108.5833,
"eval_samples_per_second": 11.825,
"eval_steps_per_second": 0.193,
"step": 490
},
{
"epoch": 2.7624309392265194,
"grad_norm": 16.030040740966797,
"learning_rate": 3.618784530386741e-05,
"loss": 2.8307,
"step": 500
},
{
"epoch": 2.7624309392265194,
"eval_loss": 3.0571742057800293,
"eval_runtime": 107.7725,
"eval_samples_per_second": 11.914,
"eval_steps_per_second": 0.195,
"step": 500
},
{
"epoch": 2.81767955801105,
"grad_norm": 16.842382431030273,
"learning_rate": 3.5911602209944755e-05,
"loss": 2.8896,
"step": 510
},
{
"epoch": 2.81767955801105,
"eval_loss": 2.9949567317962646,
"eval_runtime": 108.4232,
"eval_samples_per_second": 11.842,
"eval_steps_per_second": 0.194,
"step": 510
},
{
"epoch": 2.87292817679558,
"grad_norm": 18.767789840698242,
"learning_rate": 3.5635359116022096e-05,
"loss": 2.7774,
"step": 520
},
{
"epoch": 2.87292817679558,
"eval_loss": 2.9752790927886963,
"eval_runtime": 108.043,
"eval_samples_per_second": 11.884,
"eval_steps_per_second": 0.194,
"step": 520
},
{
"epoch": 2.9281767955801103,
"grad_norm": 15.322210311889648,
"learning_rate": 3.535911602209945e-05,
"loss": 2.7361,
"step": 530
},
{
"epoch": 2.9281767955801103,
"eval_loss": 2.9297850131988525,
"eval_runtime": 109.8044,
"eval_samples_per_second": 11.694,
"eval_steps_per_second": 0.191,
"step": 530
},
{
"epoch": 2.983425414364641,
"grad_norm": 19.184162139892578,
"learning_rate": 3.50828729281768e-05,
"loss": 2.6885,
"step": 540
},
{
"epoch": 2.983425414364641,
"eval_loss": 2.9156270027160645,
"eval_runtime": 107.5015,
"eval_samples_per_second": 11.944,
"eval_steps_per_second": 0.195,
"step": 540
},
{
"epoch": 3.0386740331491713,
"grad_norm": 19.8149356842041,
"learning_rate": 3.4806629834254147e-05,
"loss": 2.2378,
"step": 550
},
{
"epoch": 3.0386740331491713,
"eval_loss": 3.0476300716400146,
"eval_runtime": 110.8258,
"eval_samples_per_second": 11.586,
"eval_steps_per_second": 0.189,
"step": 550
},
{
"epoch": 3.0939226519337018,
"grad_norm": 19.72810935974121,
"learning_rate": 3.4530386740331494e-05,
"loss": 2.0295,
"step": 560
},
{
"epoch": 3.0939226519337018,
"eval_loss": 2.9673562049865723,
"eval_runtime": 107.877,
"eval_samples_per_second": 11.902,
"eval_steps_per_second": 0.195,
"step": 560
},
{
"epoch": 3.149171270718232,
"grad_norm": 34.11488723754883,
"learning_rate": 3.425414364640884e-05,
"loss": 1.9957,
"step": 570
},
{
"epoch": 3.149171270718232,
"eval_loss": 3.2292628288269043,
"eval_runtime": 109.6888,
"eval_samples_per_second": 11.706,
"eval_steps_per_second": 0.191,
"step": 570
},
{
"epoch": 3.2044198895027622,
"grad_norm": 24.54149055480957,
"learning_rate": 3.397790055248619e-05,
"loss": 1.9727,
"step": 580
},
{
"epoch": 3.2044198895027622,
"eval_loss": 2.8876142501831055,
"eval_runtime": 107.7637,
"eval_samples_per_second": 11.915,
"eval_steps_per_second": 0.195,
"step": 580
},
{
"epoch": 3.2596685082872927,
"grad_norm": 24.705358505249023,
"learning_rate": 3.370165745856354e-05,
"loss": 1.9746,
"step": 590
},
{
"epoch": 3.2596685082872927,
"eval_loss": 2.84237003326416,
"eval_runtime": 107.9704,
"eval_samples_per_second": 11.892,
"eval_steps_per_second": 0.194,
"step": 590
},
{
"epoch": 3.314917127071823,
"grad_norm": 17.960529327392578,
"learning_rate": 3.3425414364640886e-05,
"loss": 1.9393,
"step": 600
},
{
"epoch": 3.314917127071823,
"eval_loss": 2.7950246334075928,
"eval_runtime": 107.4211,
"eval_samples_per_second": 11.953,
"eval_steps_per_second": 0.195,
"step": 600
},
{
"epoch": 3.3701657458563536,
"grad_norm": 21.799556732177734,
"learning_rate": 3.3149171270718233e-05,
"loss": 1.8191,
"step": 610
},
{
"epoch": 3.3701657458563536,
"eval_loss": 2.8050460815429688,
"eval_runtime": 109.142,
"eval_samples_per_second": 11.764,
"eval_steps_per_second": 0.192,
"step": 610
},
{
"epoch": 3.425414364640884,
"grad_norm": 22.035696029663086,
"learning_rate": 3.287292817679558e-05,
"loss": 1.7885,
"step": 620
},
{
"epoch": 3.425414364640884,
"eval_loss": 2.837017774581909,
"eval_runtime": 108.619,
"eval_samples_per_second": 11.821,
"eval_steps_per_second": 0.193,
"step": 620
},
{
"epoch": 3.4806629834254146,
"grad_norm": 20.61678695678711,
"learning_rate": 3.259668508287293e-05,
"loss": 1.8065,
"step": 630
},
{
"epoch": 3.4806629834254146,
"eval_loss": 2.6381585597991943,
"eval_runtime": 107.6552,
"eval_samples_per_second": 11.927,
"eval_steps_per_second": 0.195,
"step": 630
},
{
"epoch": 3.5359116022099446,
"grad_norm": 20.068431854248047,
"learning_rate": 3.232044198895028e-05,
"loss": 1.9027,
"step": 640
},
{
"epoch": 3.5359116022099446,
"eval_loss": 2.6253230571746826,
"eval_runtime": 108.6112,
"eval_samples_per_second": 11.822,
"eval_steps_per_second": 0.193,
"step": 640
},
{
"epoch": 3.591160220994475,
"grad_norm": 20.27581024169922,
"learning_rate": 3.2044198895027625e-05,
"loss": 1.7976,
"step": 650
},
{
"epoch": 3.591160220994475,
"eval_loss": 2.8042409420013428,
"eval_runtime": 109.5439,
"eval_samples_per_second": 11.721,
"eval_steps_per_second": 0.192,
"step": 650
},
{
"epoch": 3.6464088397790055,
"grad_norm": 22.138561248779297,
"learning_rate": 3.176795580110498e-05,
"loss": 1.8324,
"step": 660
},
{
"epoch": 3.6464088397790055,
"eval_loss": 2.6126017570495605,
"eval_runtime": 108.5998,
"eval_samples_per_second": 11.823,
"eval_steps_per_second": 0.193,
"step": 660
},
{
"epoch": 3.701657458563536,
"grad_norm": 18.944120407104492,
"learning_rate": 3.149171270718232e-05,
"loss": 1.7634,
"step": 670
},
{
"epoch": 3.701657458563536,
"eval_loss": 2.5312118530273438,
"eval_runtime": 107.8698,
"eval_samples_per_second": 11.903,
"eval_steps_per_second": 0.195,
"step": 670
},
{
"epoch": 3.7569060773480665,
"grad_norm": 21.863069534301758,
"learning_rate": 3.121546961325967e-05,
"loss": 1.8946,
"step": 680
},
{
"epoch": 3.7569060773480665,
"eval_loss": 2.480397939682007,
"eval_runtime": 108.6826,
"eval_samples_per_second": 11.814,
"eval_steps_per_second": 0.193,
"step": 680
},
{
"epoch": 3.8121546961325965,
"grad_norm": 23.169885635375977,
"learning_rate": 3.0939226519337016e-05,
"loss": 1.5957,
"step": 690
},
{
"epoch": 3.8121546961325965,
"eval_loss": 2.6412222385406494,
"eval_runtime": 107.9762,
"eval_samples_per_second": 11.892,
"eval_steps_per_second": 0.194,
"step": 690
},
{
"epoch": 3.867403314917127,
"grad_norm": 20.805410385131836,
"learning_rate": 3.0662983425414364e-05,
"loss": 1.6951,
"step": 700
},
{
"epoch": 3.867403314917127,
"eval_loss": 2.462557315826416,
"eval_runtime": 110.0483,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.191,
"step": 700
},
{
"epoch": 3.9226519337016574,
"grad_norm": 24.351552963256836,
"learning_rate": 3.0386740331491715e-05,
"loss": 1.789,
"step": 710
},
{
"epoch": 3.9226519337016574,
"eval_loss": 2.510899305343628,
"eval_runtime": 108.097,
"eval_samples_per_second": 11.878,
"eval_steps_per_second": 0.194,
"step": 710
},
{
"epoch": 3.977900552486188,
"grad_norm": 20.56439971923828,
"learning_rate": 3.0110497237569063e-05,
"loss": 1.7312,
"step": 720
},
{
"epoch": 3.977900552486188,
"eval_loss": 2.3842289447784424,
"eval_runtime": 110.1597,
"eval_samples_per_second": 11.656,
"eval_steps_per_second": 0.191,
"step": 720
}
],
"logging_steps": 10,
"max_steps": 1810,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3437308831020480.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}