{
"best_global_step": 2000,
"best_metric": 0.09745433926582336,
"best_model_checkpoint": "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/checkpoints/checkpoint-2000",
"epoch": 2.254948210439955,
"eval_steps": 250,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01128089426725464,
"grad_norm": 12.25,
"learning_rate": 2.25e-05,
"loss": 0.7492,
"step": 10
},
{
"epoch": 0.02256178853450928,
"grad_norm": 10.25,
"learning_rate": 4.75e-05,
"loss": 0.6591,
"step": 20
},
{
"epoch": 0.033842682801763925,
"grad_norm": 4.75,
"learning_rate": 7.25e-05,
"loss": 0.5451,
"step": 30
},
{
"epoch": 0.04512357706901856,
"grad_norm": 4.40625,
"learning_rate": 9.75e-05,
"loss": 0.4973,
"step": 40
},
{
"epoch": 0.0564044713362732,
"grad_norm": 5.90625,
"learning_rate": 0.00012250000000000002,
"loss": 0.4536,
"step": 50
},
{
"epoch": 0.06768536560352785,
"grad_norm": 3.90625,
"learning_rate": 0.0001475,
"loss": 0.4242,
"step": 60
},
{
"epoch": 0.07896625987078248,
"grad_norm": 1.734375,
"learning_rate": 0.00017250000000000002,
"loss": 0.409,
"step": 70
},
{
"epoch": 0.09024715413803712,
"grad_norm": 3.390625,
"learning_rate": 0.00019750000000000003,
"loss": 0.3968,
"step": 80
},
{
"epoch": 0.10152804840529177,
"grad_norm": 6.0,
"learning_rate": 0.0001993025958930647,
"loss": 0.3769,
"step": 90
},
{
"epoch": 0.1128089426725464,
"grad_norm": 4.875,
"learning_rate": 0.0001985277024409144,
"loss": 0.3902,
"step": 100
},
{
"epoch": 0.12408983693980105,
"grad_norm": 7.0,
"learning_rate": 0.00019775280898876404,
"loss": 0.3347,
"step": 110
},
{
"epoch": 0.1353707312070557,
"grad_norm": 5.53125,
"learning_rate": 0.00019697791553661373,
"loss": 0.3346,
"step": 120
},
{
"epoch": 0.14665162547431032,
"grad_norm": 6.34375,
"learning_rate": 0.0001962030220844634,
"loss": 0.3517,
"step": 130
},
{
"epoch": 0.15793251974156497,
"grad_norm": 4.96875,
"learning_rate": 0.00019542812863231307,
"loss": 0.3614,
"step": 140
},
{
"epoch": 0.16921341400881962,
"grad_norm": 9.25,
"learning_rate": 0.00019465323518016273,
"loss": 0.3342,
"step": 150
},
{
"epoch": 0.18049430827607424,
"grad_norm": 1.78125,
"learning_rate": 0.0001938783417280124,
"loss": 0.3328,
"step": 160
},
{
"epoch": 0.1917752025433289,
"grad_norm": 5.4375,
"learning_rate": 0.0001931034482758621,
"loss": 0.3238,
"step": 170
},
{
"epoch": 0.20305609681058354,
"grad_norm": 1.015625,
"learning_rate": 0.00019232855482371176,
"loss": 0.3256,
"step": 180
},
{
"epoch": 0.21433699107783816,
"grad_norm": 2.390625,
"learning_rate": 0.0001915536613715614,
"loss": 0.3487,
"step": 190
},
{
"epoch": 0.2256178853450928,
"grad_norm": 8.5625,
"learning_rate": 0.0001907787679194111,
"loss": 0.3266,
"step": 200
},
{
"epoch": 0.23689877961234745,
"grad_norm": 1.5703125,
"learning_rate": 0.00019000387446726076,
"loss": 0.316,
"step": 210
},
{
"epoch": 0.2481796738796021,
"grad_norm": 1.2890625,
"learning_rate": 0.00018922898101511045,
"loss": 0.3118,
"step": 220
},
{
"epoch": 0.25946056814685675,
"grad_norm": 2.3125,
"learning_rate": 0.0001884540875629601,
"loss": 0.3226,
"step": 230
},
{
"epoch": 0.2707414624141114,
"grad_norm": 1.125,
"learning_rate": 0.00018767919411080976,
"loss": 0.3407,
"step": 240
},
{
"epoch": 0.282022356681366,
"grad_norm": 12.6875,
"learning_rate": 0.00018690430065865945,
"loss": 0.3253,
"step": 250
},
{
"epoch": 0.282022356681366,
"eval_loss": 0.35227087140083313,
"eval_runtime": 3939.2496,
"eval_samples_per_second": 29.703,
"eval_steps_per_second": 0.928,
"step": 250
},
{
"epoch": 0.29330325094862064,
"grad_norm": 11.8125,
"learning_rate": 0.00018612940720650912,
"loss": 0.3383,
"step": 260
},
{
"epoch": 0.3045841452158753,
"grad_norm": 3.25,
"learning_rate": 0.00018535451375435879,
"loss": 0.3237,
"step": 270
},
{
"epoch": 0.31586503948312994,
"grad_norm": 3.65625,
"learning_rate": 0.00018457962030220845,
"loss": 0.3091,
"step": 280
},
{
"epoch": 0.3271459337503846,
"grad_norm": 7.9375,
"learning_rate": 0.00018380472685005812,
"loss": 0.3179,
"step": 290
},
{
"epoch": 0.33842682801763924,
"grad_norm": 5.21875,
"learning_rate": 0.0001830298333979078,
"loss": 0.334,
"step": 300
},
{
"epoch": 0.34970772228489383,
"grad_norm": 2.765625,
"learning_rate": 0.00018225493994575745,
"loss": 0.3292,
"step": 310
},
{
"epoch": 0.3609886165521485,
"grad_norm": 8.375,
"learning_rate": 0.00018148004649360715,
"loss": 0.3083,
"step": 320
},
{
"epoch": 0.3722695108194031,
"grad_norm": 2.046875,
"learning_rate": 0.0001807051530414568,
"loss": 0.2957,
"step": 330
},
{
"epoch": 0.3835504050866578,
"grad_norm": 2.34375,
"learning_rate": 0.00017993025958930648,
"loss": 0.3152,
"step": 340
},
{
"epoch": 0.3948312993539124,
"grad_norm": 4.375,
"learning_rate": 0.00017915536613715614,
"loss": 0.2846,
"step": 350
},
{
"epoch": 0.4061121936211671,
"grad_norm": 9.1875,
"learning_rate": 0.0001783804726850058,
"loss": 0.2714,
"step": 360
},
{
"epoch": 0.4173930878884217,
"grad_norm": 2.46875,
"learning_rate": 0.0001776055792328555,
"loss": 0.2834,
"step": 370
},
{
"epoch": 0.4286739821556763,
"grad_norm": 2.625,
"learning_rate": 0.00017683068578070517,
"loss": 0.2854,
"step": 380
},
{
"epoch": 0.43995487642293096,
"grad_norm": 1.5546875,
"learning_rate": 0.00017605579232855484,
"loss": 0.276,
"step": 390
},
{
"epoch": 0.4512357706901856,
"grad_norm": 10.375,
"learning_rate": 0.0001752808988764045,
"loss": 0.2842,
"step": 400
},
{
"epoch": 0.46251666495744026,
"grad_norm": 9.75,
"learning_rate": 0.00017450600542425417,
"loss": 0.3073,
"step": 410
},
{
"epoch": 0.4737975592246949,
"grad_norm": 8.875,
"learning_rate": 0.00017373111197210386,
"loss": 0.2636,
"step": 420
},
{
"epoch": 0.48507845349194956,
"grad_norm": 3.234375,
"learning_rate": 0.0001729562185199535,
"loss": 0.2467,
"step": 430
},
{
"epoch": 0.4963593477592042,
"grad_norm": 1.3671875,
"learning_rate": 0.00017218132506780317,
"loss": 0.277,
"step": 440
},
{
"epoch": 0.5076402420264589,
"grad_norm": 5.9375,
"learning_rate": 0.00017140643161565286,
"loss": 0.2831,
"step": 450
},
{
"epoch": 0.5189211362937135,
"grad_norm": 1.7421875,
"learning_rate": 0.00017063153816350253,
"loss": 0.2432,
"step": 460
},
{
"epoch": 0.5302020305609682,
"grad_norm": 2.6875,
"learning_rate": 0.0001698566447113522,
"loss": 0.2442,
"step": 470
},
{
"epoch": 0.5414829248282228,
"grad_norm": 2.921875,
"learning_rate": 0.00016908175125920186,
"loss": 0.2655,
"step": 480
},
{
"epoch": 0.5527638190954773,
"grad_norm": 1.6796875,
"learning_rate": 0.00016830685780705153,
"loss": 0.2463,
"step": 490
},
{
"epoch": 0.564044713362732,
"grad_norm": 10.75,
"learning_rate": 0.00016753196435490122,
"loss": 0.2679,
"step": 500
},
{
"epoch": 0.564044713362732,
"eval_loss": 0.24632702767848969,
"eval_runtime": 3940.4364,
"eval_samples_per_second": 29.694,
"eval_steps_per_second": 0.928,
"step": 500
},
{
"epoch": 0.5753256076299866,
"grad_norm": 4.53125,
"learning_rate": 0.0001667570709027509,
"loss": 0.27,
"step": 510
},
{
"epoch": 0.5866065018972413,
"grad_norm": 3.203125,
"learning_rate": 0.00016598217745060053,
"loss": 0.2807,
"step": 520
},
{
"epoch": 0.5978873961644959,
"grad_norm": 7.75,
"learning_rate": 0.00016520728399845022,
"loss": 0.2615,
"step": 530
},
{
"epoch": 0.6091682904317506,
"grad_norm": 4.78125,
"learning_rate": 0.0001644323905462999,
"loss": 0.2567,
"step": 540
},
{
"epoch": 0.6204491846990052,
"grad_norm": 3.375,
"learning_rate": 0.00016365749709414956,
"loss": 0.2562,
"step": 550
},
{
"epoch": 0.6317300789662599,
"grad_norm": 1.8828125,
"learning_rate": 0.00016288260364199922,
"loss": 0.2389,
"step": 560
},
{
"epoch": 0.6430109732335145,
"grad_norm": 2.46875,
"learning_rate": 0.0001621077101898489,
"loss": 0.2331,
"step": 570
},
{
"epoch": 0.6542918675007692,
"grad_norm": 3.03125,
"learning_rate": 0.00016133281673769858,
"loss": 0.2522,
"step": 580
},
{
"epoch": 0.6655727617680238,
"grad_norm": 1.28125,
"learning_rate": 0.00016055792328554825,
"loss": 0.2196,
"step": 590
},
{
"epoch": 0.6768536560352785,
"grad_norm": 3.1875,
"learning_rate": 0.00015978302983339792,
"loss": 0.2523,
"step": 600
},
{
"epoch": 0.6881345503025331,
"grad_norm": 3.96875,
"learning_rate": 0.00015900813638124758,
"loss": 0.263,
"step": 610
},
{
"epoch": 0.6994154445697877,
"grad_norm": 3.21875,
"learning_rate": 0.00015823324292909725,
"loss": 0.2148,
"step": 620
},
{
"epoch": 0.7106963388370423,
"grad_norm": 1.6796875,
"learning_rate": 0.00015745834947694694,
"loss": 0.22,
"step": 630
},
{
"epoch": 0.721977233104297,
"grad_norm": 4.34375,
"learning_rate": 0.00015668345602479658,
"loss": 0.2439,
"step": 640
},
{
"epoch": 0.7332581273715516,
"grad_norm": 1.484375,
"learning_rate": 0.00015590856257264628,
"loss": 0.2244,
"step": 650
},
{
"epoch": 0.7445390216388063,
"grad_norm": 1.6015625,
"learning_rate": 0.00015513366912049594,
"loss": 0.1983,
"step": 660
},
{
"epoch": 0.7558199159060609,
"grad_norm": 6.65625,
"learning_rate": 0.0001543587756683456,
"loss": 0.2232,
"step": 670
},
{
"epoch": 0.7671008101733156,
"grad_norm": 1.03125,
"learning_rate": 0.00015358388221619528,
"loss": 0.2558,
"step": 680
},
{
"epoch": 0.7783817044405702,
"grad_norm": 4.40625,
"learning_rate": 0.00015280898876404494,
"loss": 0.249,
"step": 690
},
{
"epoch": 0.7896625987078248,
"grad_norm": 3.4375,
"learning_rate": 0.00015203409531189464,
"loss": 0.2103,
"step": 700
},
{
"epoch": 0.8009434929750795,
"grad_norm": 1.640625,
"learning_rate": 0.0001512592018597443,
"loss": 0.2328,
"step": 710
},
{
"epoch": 0.8122243872423341,
"grad_norm": 2.421875,
"learning_rate": 0.00015048430840759394,
"loss": 0.2327,
"step": 720
},
{
"epoch": 0.8235052815095888,
"grad_norm": 7.1875,
"learning_rate": 0.00014970941495544364,
"loss": 0.2027,
"step": 730
},
{
"epoch": 0.8347861757768434,
"grad_norm": 4.5,
"learning_rate": 0.0001489345215032933,
"loss": 0.2416,
"step": 740
},
{
"epoch": 0.8460670700440981,
"grad_norm": 2.078125,
"learning_rate": 0.000148159628051143,
"loss": 0.2004,
"step": 750
},
{
"epoch": 0.8460670700440981,
"eval_loss": 0.1825282871723175,
"eval_runtime": 3937.2667,
"eval_samples_per_second": 29.718,
"eval_steps_per_second": 0.929,
"step": 750
},
{
"epoch": 0.8573479643113526,
"grad_norm": 1.734375,
"learning_rate": 0.00014738473459899264,
"loss": 0.2186,
"step": 760
},
{
"epoch": 0.8686288585786073,
"grad_norm": 3.25,
"learning_rate": 0.0001466098411468423,
"loss": 0.2004,
"step": 770
},
{
"epoch": 0.8799097528458619,
"grad_norm": 1.9140625,
"learning_rate": 0.000145834947694692,
"loss": 0.2121,
"step": 780
},
{
"epoch": 0.8911906471131166,
"grad_norm": 3.125,
"learning_rate": 0.00014506005424254166,
"loss": 0.2116,
"step": 790
},
{
"epoch": 0.9024715413803712,
"grad_norm": 9.75,
"learning_rate": 0.00014428516079039133,
"loss": 0.2407,
"step": 800
},
{
"epoch": 0.9137524356476259,
"grad_norm": 3.5625,
"learning_rate": 0.000143510267338241,
"loss": 0.2077,
"step": 810
},
{
"epoch": 0.9250333299148805,
"grad_norm": 5.25,
"learning_rate": 0.00014273537388609066,
"loss": 0.2267,
"step": 820
},
{
"epoch": 0.9363142241821352,
"grad_norm": 2.5625,
"learning_rate": 0.00014196048043394036,
"loss": 0.1969,
"step": 830
},
{
"epoch": 0.9475951184493898,
"grad_norm": 1.53125,
"learning_rate": 0.00014118558698179,
"loss": 0.1998,
"step": 840
},
{
"epoch": 0.9588760127166445,
"grad_norm": 1.0390625,
"learning_rate": 0.0001404106935296397,
"loss": 0.2008,
"step": 850
},
{
"epoch": 0.9701569069838991,
"grad_norm": 6.0,
"learning_rate": 0.00013963580007748935,
"loss": 0.1914,
"step": 860
},
{
"epoch": 0.9814378012511538,
"grad_norm": 1.4609375,
"learning_rate": 0.00013886090662533902,
"loss": 0.191,
"step": 870
},
{
"epoch": 0.9927186955184084,
"grad_norm": 2.390625,
"learning_rate": 0.0001380860131731887,
"loss": 0.1816,
"step": 880
},
{
"epoch": 1.0033842682801764,
"grad_norm": 3.859375,
"learning_rate": 0.00013731111972103835,
"loss": 0.2074,
"step": 890
},
{
"epoch": 1.014665162547431,
"grad_norm": 4.28125,
"learning_rate": 0.00013653622626888802,
"loss": 0.1984,
"step": 900
},
{
"epoch": 1.0259460568146856,
"grad_norm": 1.3203125,
"learning_rate": 0.00013576133281673771,
"loss": 0.1648,
"step": 910
},
{
"epoch": 1.0372269510819403,
"grad_norm": 3.5,
"learning_rate": 0.00013498643936458738,
"loss": 0.192,
"step": 920
},
{
"epoch": 1.048507845349195,
"grad_norm": 6.53125,
"learning_rate": 0.00013421154591243705,
"loss": 0.1522,
"step": 930
},
{
"epoch": 1.0597887396164496,
"grad_norm": 4.625,
"learning_rate": 0.00013343665246028671,
"loss": 0.1715,
"step": 940
},
{
"epoch": 1.0710696338837042,
"grad_norm": 3.171875,
"learning_rate": 0.00013266175900813638,
"loss": 0.1593,
"step": 950
},
{
"epoch": 1.082350528150959,
"grad_norm": 3.65625,
"learning_rate": 0.00013188686555598607,
"loss": 0.1439,
"step": 960
},
{
"epoch": 1.0936314224182135,
"grad_norm": 2.140625,
"learning_rate": 0.0001311119721038357,
"loss": 0.1328,
"step": 970
},
{
"epoch": 1.1049123166854682,
"grad_norm": 2.359375,
"learning_rate": 0.0001303370786516854,
"loss": 0.1763,
"step": 980
},
{
"epoch": 1.1161932109527228,
"grad_norm": 1.484375,
"learning_rate": 0.00012956218519953507,
"loss": 0.1689,
"step": 990
},
{
"epoch": 1.1274741052199775,
"grad_norm": 1.59375,
"learning_rate": 0.00012878729174738474,
"loss": 0.1391,
"step": 1000
},
{
"epoch": 1.1274741052199775,
"eval_loss": 0.16099952161312103,
"eval_runtime": 3944.2805,
"eval_samples_per_second": 29.665,
"eval_steps_per_second": 0.927,
"step": 1000
},
{
"epoch": 1.1387549994872321,
"grad_norm": 2.375,
"learning_rate": 0.0001280123982952344,
"loss": 0.1669,
"step": 1010
},
{
"epoch": 1.1500358937544868,
"grad_norm": 0.89453125,
"learning_rate": 0.00012723750484308407,
"loss": 0.1542,
"step": 1020
},
{
"epoch": 1.1613167880217414,
"grad_norm": 2.90625,
"learning_rate": 0.00012646261139093377,
"loss": 0.1566,
"step": 1030
},
{
"epoch": 1.172597682288996,
"grad_norm": 2.359375,
"learning_rate": 0.00012568771793878343,
"loss": 0.1865,
"step": 1040
},
{
"epoch": 1.1838785765562507,
"grad_norm": 1.6484375,
"learning_rate": 0.00012491282448663307,
"loss": 0.1729,
"step": 1050
},
{
"epoch": 1.1951594708235054,
"grad_norm": 3.546875,
"learning_rate": 0.00012413793103448277,
"loss": 0.1755,
"step": 1060
},
{
"epoch": 1.20644036509076,
"grad_norm": 3.390625,
"learning_rate": 0.00012336303758233243,
"loss": 0.1397,
"step": 1070
},
{
"epoch": 1.2177212593580147,
"grad_norm": 2.125,
"learning_rate": 0.00012258814413018213,
"loss": 0.1583,
"step": 1080
},
{
"epoch": 1.2290021536252693,
"grad_norm": 3.5,
"learning_rate": 0.00012181325067803178,
"loss": 0.1782,
"step": 1090
},
{
"epoch": 1.2402830478925237,
"grad_norm": 0.99609375,
"learning_rate": 0.00012103835722588143,
"loss": 0.1693,
"step": 1100
},
{
"epoch": 1.2515639421597786,
"grad_norm": 3.296875,
"learning_rate": 0.00012026346377373113,
"loss": 0.1591,
"step": 1110
},
{
"epoch": 1.262844836427033,
"grad_norm": 2.375,
"learning_rate": 0.00011948857032158078,
"loss": 0.1741,
"step": 1120
},
{
"epoch": 1.2741257306942877,
"grad_norm": 1.2421875,
"learning_rate": 0.00011871367686943047,
"loss": 0.1446,
"step": 1130
},
{
"epoch": 1.2854066249615423,
"grad_norm": 1.875,
"learning_rate": 0.00011793878341728013,
"loss": 0.1627,
"step": 1140
},
{
"epoch": 1.296687519228797,
"grad_norm": 2.484375,
"learning_rate": 0.00011716388996512979,
"loss": 0.1509,
"step": 1150
},
{
"epoch": 1.3079684134960516,
"grad_norm": 1.6875,
"learning_rate": 0.00011638899651297947,
"loss": 0.1545,
"step": 1160
},
{
"epoch": 1.3192493077633063,
"grad_norm": 4.9375,
"learning_rate": 0.00011561410306082914,
"loss": 0.1503,
"step": 1170
},
{
"epoch": 1.330530202030561,
"grad_norm": 1.9375,
"learning_rate": 0.00011483920960867882,
"loss": 0.1597,
"step": 1180
},
{
"epoch": 1.3418110962978156,
"grad_norm": 1.171875,
"learning_rate": 0.00011406431615652849,
"loss": 0.1547,
"step": 1190
},
{
"epoch": 1.3530919905650702,
"grad_norm": 2.34375,
"learning_rate": 0.00011328942270437815,
"loss": 0.1701,
"step": 1200
},
{
"epoch": 1.3643728848323249,
"grad_norm": 1.0625,
"learning_rate": 0.00011251452925222783,
"loss": 0.1655,
"step": 1210
},
{
"epoch": 1.3756537790995795,
"grad_norm": 3.234375,
"learning_rate": 0.00011173963580007749,
"loss": 0.1526,
"step": 1220
},
{
"epoch": 1.3869346733668342,
"grad_norm": 1.2890625,
"learning_rate": 0.00011096474234792718,
"loss": 0.1539,
"step": 1230
},
{
"epoch": 1.3982155676340888,
"grad_norm": 3.640625,
"learning_rate": 0.00011018984889577683,
"loss": 0.1421,
"step": 1240
},
{
"epoch": 1.4094964619013435,
"grad_norm": 3.046875,
"learning_rate": 0.0001094149554436265,
"loss": 0.1549,
"step": 1250
},
{
"epoch": 1.4094964619013435,
"eval_loss": 0.13532690703868866,
"eval_runtime": 3940.6219,
"eval_samples_per_second": 29.693,
"eval_steps_per_second": 0.928,
"step": 1250
},
{
"epoch": 1.4207773561685981,
"grad_norm": 1.765625,
"learning_rate": 0.00010864006199147618,
"loss": 0.1361,
"step": 1260
},
{
"epoch": 1.4320582504358528,
"grad_norm": 2.171875,
"learning_rate": 0.00010786516853932584,
"loss": 0.1735,
"step": 1270
},
{
"epoch": 1.4433391447031074,
"grad_norm": 1.40625,
"learning_rate": 0.00010709027508717552,
"loss": 0.1747,
"step": 1280
},
{
"epoch": 1.454620038970362,
"grad_norm": 1.5078125,
"learning_rate": 0.00010631538163502519,
"loss": 0.1461,
"step": 1290
},
{
"epoch": 1.4659009332376167,
"grad_norm": 2.453125,
"learning_rate": 0.00010554048818287486,
"loss": 0.1701,
"step": 1300
},
{
"epoch": 1.4771818275048714,
"grad_norm": 3.046875,
"learning_rate": 0.00010476559473072454,
"loss": 0.1473,
"step": 1310
},
{
"epoch": 1.488462721772126,
"grad_norm": 2.140625,
"learning_rate": 0.0001039907012785742,
"loss": 0.1292,
"step": 1320
},
{
"epoch": 1.4997436160393804,
"grad_norm": 2.90625,
"learning_rate": 0.00010321580782642388,
"loss": 0.1572,
"step": 1330
},
{
"epoch": 1.511024510306635,
"grad_norm": 2.15625,
"learning_rate": 0.00010244091437427355,
"loss": 0.1446,
"step": 1340
},
{
"epoch": 1.5223054045738897,
"grad_norm": 3.40625,
"learning_rate": 0.0001016660209221232,
"loss": 0.1469,
"step": 1350
},
{
"epoch": 1.5335862988411444,
"grad_norm": 1.109375,
"learning_rate": 0.00010089112746997288,
"loss": 0.1774,
"step": 1360
},
{
"epoch": 1.544867193108399,
"grad_norm": 1.984375,
"learning_rate": 0.00010011623401782255,
"loss": 0.1371,
"step": 1370
},
{
"epoch": 1.5561480873756537,
"grad_norm": 2.15625,
"learning_rate": 9.934134056567223e-05,
"loss": 0.1459,
"step": 1380
},
{
"epoch": 1.5674289816429083,
"grad_norm": 1.6953125,
"learning_rate": 9.85664471135219e-05,
"loss": 0.138,
"step": 1390
},
{
"epoch": 1.578709875910163,
"grad_norm": 1.421875,
"learning_rate": 9.779155366137158e-05,
"loss": 0.1395,
"step": 1400
},
{
"epoch": 1.5899907701774176,
"grad_norm": 3.0,
"learning_rate": 9.701666020922123e-05,
"loss": 0.1526,
"step": 1410
},
{
"epoch": 1.6012716644446723,
"grad_norm": 1.1953125,
"learning_rate": 9.624176675707091e-05,
"loss": 0.1612,
"step": 1420
},
{
"epoch": 1.612552558711927,
"grad_norm": 1.1171875,
"learning_rate": 9.546687330492058e-05,
"loss": 0.1355,
"step": 1430
},
{
"epoch": 1.6238334529791816,
"grad_norm": 1.171875,
"learning_rate": 9.469197985277026e-05,
"loss": 0.1607,
"step": 1440
},
{
"epoch": 1.6351143472464362,
"grad_norm": 3.078125,
"learning_rate": 9.391708640061992e-05,
"loss": 0.1562,
"step": 1450
},
{
"epoch": 1.6463952415136909,
"grad_norm": 1.3359375,
"learning_rate": 9.314219294846959e-05,
"loss": 0.146,
"step": 1460
},
{
"epoch": 1.6576761357809455,
"grad_norm": 2.125,
"learning_rate": 9.236729949631926e-05,
"loss": 0.1405,
"step": 1470
},
{
"epoch": 1.6689570300482002,
"grad_norm": 1.5859375,
"learning_rate": 9.159240604416894e-05,
"loss": 0.137,
"step": 1480
},
{
"epoch": 1.6802379243154548,
"grad_norm": 2.984375,
"learning_rate": 9.08175125920186e-05,
"loss": 0.1674,
"step": 1490
},
{
"epoch": 1.6915188185827095,
"grad_norm": 2.296875,
"learning_rate": 9.004261913986827e-05,
"loss": 0.1452,
"step": 1500
},
{
"epoch": 1.6915188185827095,
"eval_loss": 0.1216062381863594,
"eval_runtime": 3942.5719,
"eval_samples_per_second": 29.678,
"eval_steps_per_second": 0.928,
"step": 1500
},
{
"epoch": 1.7027997128499641,
"grad_norm": 2.625,
"learning_rate": 8.926772568771794e-05,
"loss": 0.1491,
"step": 1510
},
{
"epoch": 1.7140806071172188,
"grad_norm": 2.75,
"learning_rate": 8.849283223556762e-05,
"loss": 0.1304,
"step": 1520
},
{
"epoch": 1.7253615013844734,
"grad_norm": 1.8359375,
"learning_rate": 8.771793878341728e-05,
"loss": 0.1367,
"step": 1530
},
{
"epoch": 1.736642395651728,
"grad_norm": 2.453125,
"learning_rate": 8.694304533126696e-05,
"loss": 0.1321,
"step": 1540
},
{
"epoch": 1.7479232899189827,
"grad_norm": 2.53125,
"learning_rate": 8.616815187911662e-05,
"loss": 0.1413,
"step": 1550
},
{
"epoch": 1.7592041841862374,
"grad_norm": 2.296875,
"learning_rate": 8.53932584269663e-05,
"loss": 0.1593,
"step": 1560
},
{
"epoch": 1.770485078453492,
"grad_norm": 3.90625,
"learning_rate": 8.461836497481596e-05,
"loss": 0.1472,
"step": 1570
},
{
"epoch": 1.7817659727207467,
"grad_norm": 1.703125,
"learning_rate": 8.384347152266564e-05,
"loss": 0.144,
"step": 1580
},
{
"epoch": 1.7930468669880013,
"grad_norm": 1.34375,
"learning_rate": 8.306857807051531e-05,
"loss": 0.1431,
"step": 1590
},
{
"epoch": 1.804327761255256,
"grad_norm": 4.5,
"learning_rate": 8.229368461836498e-05,
"loss": 0.1455,
"step": 1600
},
{
"epoch": 1.8156086555225106,
"grad_norm": 2.015625,
"learning_rate": 8.151879116621464e-05,
"loss": 0.1142,
"step": 1610
},
{
"epoch": 1.8268895497897653,
"grad_norm": 2.25,
"learning_rate": 8.074389771406432e-05,
"loss": 0.1554,
"step": 1620
},
{
"epoch": 1.83817044405702,
"grad_norm": 1.6328125,
"learning_rate": 7.996900426191399e-05,
"loss": 0.1376,
"step": 1630
},
{
"epoch": 1.8494513383242746,
"grad_norm": 1.5234375,
"learning_rate": 7.919411080976367e-05,
"loss": 0.1308,
"step": 1640
},
{
"epoch": 1.8607322325915292,
"grad_norm": 1.78125,
"learning_rate": 7.841921735761332e-05,
"loss": 0.1599,
"step": 1650
},
{
"epoch": 1.8720131268587838,
"grad_norm": 1.5625,
"learning_rate": 7.7644323905463e-05,
"loss": 0.1291,
"step": 1660
},
{
"epoch": 1.8832940211260385,
"grad_norm": 2.203125,
"learning_rate": 7.686943045331267e-05,
"loss": 0.1368,
"step": 1670
},
{
"epoch": 1.8945749153932931,
"grad_norm": 2.828125,
"learning_rate": 7.609453700116235e-05,
"loss": 0.1585,
"step": 1680
},
{
"epoch": 1.9058558096605478,
"grad_norm": 4.4375,
"learning_rate": 7.531964354901202e-05,
"loss": 0.1426,
"step": 1690
},
{
"epoch": 1.9171367039278022,
"grad_norm": 1.890625,
"learning_rate": 7.454475009686168e-05,
"loss": 0.1578,
"step": 1700
},
{
"epoch": 1.9284175981950569,
"grad_norm": 1.8984375,
"learning_rate": 7.376985664471135e-05,
"loss": 0.1251,
"step": 1710
},
{
"epoch": 1.9396984924623115,
"grad_norm": 1.953125,
"learning_rate": 7.299496319256103e-05,
"loss": 0.1229,
"step": 1720
},
{
"epoch": 1.9509793867295662,
"grad_norm": 1.640625,
"learning_rate": 7.22200697404107e-05,
"loss": 0.1122,
"step": 1730
},
{
"epoch": 1.9622602809968208,
"grad_norm": 1.2734375,
"learning_rate": 7.144517628826036e-05,
"loss": 0.144,
"step": 1740
},
{
"epoch": 1.9735411752640755,
"grad_norm": 0.921875,
"learning_rate": 7.067028283611004e-05,
"loss": 0.1526,
"step": 1750
},
{
"epoch": 1.9735411752640755,
"eval_loss": 0.10789535939693451,
"eval_runtime": 3942.9784,
"eval_samples_per_second": 29.675,
"eval_steps_per_second": 0.927,
"step": 1750
},
{
"epoch": 1.9848220695313301,
"grad_norm": 2.125,
"learning_rate": 6.989538938395971e-05,
"loss": 0.1409,
"step": 1760
},
{
"epoch": 1.9961029637985848,
"grad_norm": 0.75,
"learning_rate": 6.912049593180939e-05,
"loss": 0.1317,
"step": 1770
},
{
"epoch": 2.0067685365603527,
"grad_norm": 4.03125,
"learning_rate": 6.834560247965905e-05,
"loss": 0.1106,
"step": 1780
},
{
"epoch": 2.0180494308276073,
"grad_norm": 2.9375,
"learning_rate": 6.757070902750872e-05,
"loss": 0.0853,
"step": 1790
},
{
"epoch": 2.029330325094862,
"grad_norm": 1.078125,
"learning_rate": 6.679581557535839e-05,
"loss": 0.0947,
"step": 1800
},
{
"epoch": 2.0406112193621166,
"grad_norm": 1.734375,
"learning_rate": 6.602092212320807e-05,
"loss": 0.0886,
"step": 1810
},
{
"epoch": 2.0518921136293713,
"grad_norm": 1.21875,
"learning_rate": 6.524602867105773e-05,
"loss": 0.1107,
"step": 1820
},
{
"epoch": 2.063173007896626,
"grad_norm": 0.93359375,
"learning_rate": 6.447113521890741e-05,
"loss": 0.085,
"step": 1830
},
{
"epoch": 2.0744539021638806,
"grad_norm": 1.5078125,
"learning_rate": 6.369624176675707e-05,
"loss": 0.087,
"step": 1840
},
{
"epoch": 2.0857347964311352,
"grad_norm": 2.21875,
"learning_rate": 6.292134831460675e-05,
"loss": 0.1305,
"step": 1850
},
{
"epoch": 2.09701569069839,
"grad_norm": 1.6953125,
"learning_rate": 6.214645486245641e-05,
"loss": 0.1178,
"step": 1860
},
{
"epoch": 2.1082965849656445,
"grad_norm": 2.078125,
"learning_rate": 6.13715614103061e-05,
"loss": 0.1124,
"step": 1870
},
{
"epoch": 2.119577479232899,
"grad_norm": 0.81640625,
"learning_rate": 6.059666795815576e-05,
"loss": 0.1044,
"step": 1880
},
{
"epoch": 2.130858373500154,
"grad_norm": 1.8828125,
"learning_rate": 5.982177450600542e-05,
"loss": 0.0873,
"step": 1890
},
{
"epoch": 2.1421392677674085,
"grad_norm": 1.65625,
"learning_rate": 5.9046881053855094e-05,
"loss": 0.1124,
"step": 1900
},
{
"epoch": 2.153420162034663,
"grad_norm": 1.4921875,
"learning_rate": 5.827198760170477e-05,
"loss": 0.0943,
"step": 1910
},
{
"epoch": 2.164701056301918,
"grad_norm": 1.15625,
"learning_rate": 5.749709414955444e-05,
"loss": 0.0892,
"step": 1920
},
{
"epoch": 2.1759819505691724,
"grad_norm": 2.328125,
"learning_rate": 5.672220069740411e-05,
"loss": 0.1028,
"step": 1930
},
{
"epoch": 2.187262844836427,
"grad_norm": 1.359375,
"learning_rate": 5.594730724525378e-05,
"loss": 0.1033,
"step": 1940
},
{
"epoch": 2.1985437391036817,
"grad_norm": 1.1171875,
"learning_rate": 5.517241379310345e-05,
"loss": 0.1093,
"step": 1950
},
{
"epoch": 2.2098246333709364,
"grad_norm": 1.515625,
"learning_rate": 5.439752034095312e-05,
"loss": 0.1123,
"step": 1960
},
{
"epoch": 2.221105527638191,
"grad_norm": 1.765625,
"learning_rate": 5.362262688880279e-05,
"loss": 0.1139,
"step": 1970
},
{
"epoch": 2.2323864219054457,
"grad_norm": 1.7890625,
"learning_rate": 5.284773343665246e-05,
"loss": 0.0987,
"step": 1980
},
{
"epoch": 2.2436673161727003,
"grad_norm": 2.140625,
"learning_rate": 5.207283998450213e-05,
"loss": 0.1008,
"step": 1990
},
{
"epoch": 2.254948210439955,
"grad_norm": 2.3125,
"learning_rate": 5.1297946532351806e-05,
"loss": 0.0925,
"step": 2000
},
{
"epoch": 2.254948210439955,
"eval_loss": 0.09745433926582336,
"eval_runtime": 3942.1232,
"eval_samples_per_second": 29.682,
"eval_steps_per_second": 0.928,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2661,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}