gemma-finetuned-s0d / trainer_state.json
sha000's picture
Upload folder using huggingface_hub
8274112 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 732,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.3894134521484376,
"epoch": 0.040983606557377046,
"grad_norm": 13.141048431396484,
"learning_rate": 1.975409836065574e-05,
"loss": 1.25,
"mean_token_accuracy": 0.67734375,
"num_tokens": 12480.0,
"step": 10
},
{
"entropy": 0.900828194618225,
"epoch": 0.08196721311475409,
"grad_norm": 52.69793701171875,
"learning_rate": 1.9480874316939892e-05,
"loss": 0.4053,
"mean_token_accuracy": 0.834375,
"num_tokens": 24960.0,
"step": 20
},
{
"entropy": 0.874811452627182,
"epoch": 0.12295081967213115,
"grad_norm": 34.038414001464844,
"learning_rate": 1.9207650273224046e-05,
"loss": 0.2804,
"mean_token_accuracy": 0.88125,
"num_tokens": 37440.0,
"step": 30
},
{
"entropy": 0.9816662311553955,
"epoch": 0.16393442622950818,
"grad_norm": 14.941110610961914,
"learning_rate": 1.89344262295082e-05,
"loss": 0.2195,
"mean_token_accuracy": 0.90625,
"num_tokens": 49920.0,
"step": 40
},
{
"entropy": 0.8901944577693939,
"epoch": 0.20491803278688525,
"grad_norm": 23.074167251586914,
"learning_rate": 1.866120218579235e-05,
"loss": 0.1932,
"mean_token_accuracy": 0.9109375,
"num_tokens": 62400.0,
"step": 50
},
{
"entropy": 0.8397292912006378,
"epoch": 0.2459016393442623,
"grad_norm": 17.52640724182129,
"learning_rate": 1.8387978142076503e-05,
"loss": 0.2027,
"mean_token_accuracy": 0.909375,
"num_tokens": 74880.0,
"step": 60
},
{
"entropy": 0.8746075868606568,
"epoch": 0.28688524590163933,
"grad_norm": 12.406326293945312,
"learning_rate": 1.8114754098360656e-05,
"loss": 0.1185,
"mean_token_accuracy": 0.95390625,
"num_tokens": 87360.0,
"step": 70
},
{
"entropy": 0.8339551448822021,
"epoch": 0.32786885245901637,
"grad_norm": 8.674996376037598,
"learning_rate": 1.784153005464481e-05,
"loss": 0.1133,
"mean_token_accuracy": 0.95390625,
"num_tokens": 99840.0,
"step": 80
},
{
"entropy": 0.8026882886886597,
"epoch": 0.36885245901639346,
"grad_norm": 27.134490966796875,
"learning_rate": 1.7568306010928963e-05,
"loss": 0.1526,
"mean_token_accuracy": 0.94296875,
"num_tokens": 112320.0,
"step": 90
},
{
"entropy": 0.8337083220481872,
"epoch": 0.4098360655737705,
"grad_norm": 12.572772026062012,
"learning_rate": 1.7295081967213117e-05,
"loss": 0.069,
"mean_token_accuracy": 0.96796875,
"num_tokens": 124800.0,
"step": 100
},
{
"entropy": 0.789594167470932,
"epoch": 0.45081967213114754,
"grad_norm": 11.466198921203613,
"learning_rate": 1.702185792349727e-05,
"loss": 0.0618,
"mean_token_accuracy": 0.978125,
"num_tokens": 137280.0,
"step": 110
},
{
"entropy": 0.7361592233181,
"epoch": 0.4918032786885246,
"grad_norm": 31.19264793395996,
"learning_rate": 1.674863387978142e-05,
"loss": 0.1167,
"mean_token_accuracy": 0.95078125,
"num_tokens": 149760.0,
"step": 120
},
{
"entropy": 0.722903597354889,
"epoch": 0.5327868852459017,
"grad_norm": 13.952592849731445,
"learning_rate": 1.6475409836065574e-05,
"loss": 0.0195,
"mean_token_accuracy": 0.99375,
"num_tokens": 162240.0,
"step": 130
},
{
"entropy": 0.6958432495594025,
"epoch": 0.5737704918032787,
"grad_norm": 1.7772663831710815,
"learning_rate": 1.6202185792349728e-05,
"loss": 0.0258,
"mean_token_accuracy": 0.98984375,
"num_tokens": 174720.0,
"step": 140
},
{
"entropy": 0.7353324055671692,
"epoch": 0.6147540983606558,
"grad_norm": 47.304779052734375,
"learning_rate": 1.592896174863388e-05,
"loss": 0.0353,
"mean_token_accuracy": 0.98828125,
"num_tokens": 187200.0,
"step": 150
},
{
"entropy": 0.6916000425815583,
"epoch": 0.6557377049180327,
"grad_norm": 30.046722412109375,
"learning_rate": 1.5655737704918035e-05,
"loss": 0.0376,
"mean_token_accuracy": 0.9875,
"num_tokens": 199680.0,
"step": 160
},
{
"entropy": 0.6607187688350677,
"epoch": 0.6967213114754098,
"grad_norm": 18.224321365356445,
"learning_rate": 1.538251366120219e-05,
"loss": 0.0174,
"mean_token_accuracy": 0.99609375,
"num_tokens": 212160.0,
"step": 170
},
{
"entropy": 0.641388189792633,
"epoch": 0.7377049180327869,
"grad_norm": 0.8001788854598999,
"learning_rate": 1.510928961748634e-05,
"loss": 0.0226,
"mean_token_accuracy": 0.99296875,
"num_tokens": 224640.0,
"step": 180
},
{
"entropy": 0.5654755473136902,
"epoch": 0.7786885245901639,
"grad_norm": 6.179544448852539,
"learning_rate": 1.4836065573770492e-05,
"loss": 0.0184,
"mean_token_accuracy": 0.990625,
"num_tokens": 237120.0,
"step": 190
},
{
"entropy": 0.5690807163715362,
"epoch": 0.819672131147541,
"grad_norm": 17.677095413208008,
"learning_rate": 1.4562841530054646e-05,
"loss": 0.0554,
"mean_token_accuracy": 0.9828125,
"num_tokens": 249600.0,
"step": 200
},
{
"entropy": 0.6085642755031586,
"epoch": 0.860655737704918,
"grad_norm": 15.046479225158691,
"learning_rate": 1.4289617486338798e-05,
"loss": 0.0131,
"mean_token_accuracy": 0.99609375,
"num_tokens": 262080.0,
"step": 210
},
{
"entropy": 0.5867221057415009,
"epoch": 0.9016393442622951,
"grad_norm": 9.699835777282715,
"learning_rate": 1.4016393442622951e-05,
"loss": 0.0116,
"mean_token_accuracy": 0.996875,
"num_tokens": 274560.0,
"step": 220
},
{
"entropy": 0.5684578359127045,
"epoch": 0.9426229508196722,
"grad_norm": 0.10982056707143784,
"learning_rate": 1.3743169398907106e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.99765625,
"num_tokens": 287040.0,
"step": 230
},
{
"entropy": 0.5817050397396087,
"epoch": 0.9836065573770492,
"grad_norm": 0.11507736891508102,
"learning_rate": 1.3469945355191258e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9984375,
"num_tokens": 299520.0,
"step": 240
},
{
"epoch": 1.0,
"eval_entropy": 0.5465924368964301,
"eval_loss": 0.004227596800774336,
"eval_mean_token_accuracy": 0.9989583333333333,
"eval_num_tokens": 304200.0,
"eval_runtime": 4.2641,
"eval_samples_per_second": 333.484,
"eval_steps_per_second": 10.553,
"step": 244
},
{
"entropy": 0.5748404681682586,
"epoch": 1.0245901639344261,
"grad_norm": 0.04080405831336975,
"learning_rate": 1.3196721311475412e-05,
"loss": 0.0116,
"mean_token_accuracy": 0.996875,
"num_tokens": 311688.0,
"step": 250
},
{
"entropy": 0.5690902054309845,
"epoch": 1.0655737704918034,
"grad_norm": 25.26702880859375,
"learning_rate": 1.2923497267759564e-05,
"loss": 0.0152,
"mean_token_accuracy": 0.99453125,
"num_tokens": 324168.0,
"step": 260
},
{
"entropy": 0.5449742019176483,
"epoch": 1.1065573770491803,
"grad_norm": 0.015380386263132095,
"learning_rate": 1.2650273224043717e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9984375,
"num_tokens": 336648.0,
"step": 270
},
{
"entropy": 0.5294660866260529,
"epoch": 1.1475409836065573,
"grad_norm": 16.71912384033203,
"learning_rate": 1.2377049180327869e-05,
"loss": 0.022,
"mean_token_accuracy": 0.98984375,
"num_tokens": 349128.0,
"step": 280
},
{
"entropy": 0.5246538281440735,
"epoch": 1.1885245901639343,
"grad_norm": 0.05239957571029663,
"learning_rate": 1.2103825136612023e-05,
"loss": 0.0229,
"mean_token_accuracy": 0.99296875,
"num_tokens": 361608.0,
"step": 290
},
{
"entropy": 0.557120931148529,
"epoch": 1.2295081967213115,
"grad_norm": 6.907280445098877,
"learning_rate": 1.1830601092896176e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9984375,
"num_tokens": 374088.0,
"step": 300
},
{
"entropy": 0.5478886723518371,
"epoch": 1.2704918032786885,
"grad_norm": 0.22664949297904968,
"learning_rate": 1.155737704918033e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.99921875,
"num_tokens": 386568.0,
"step": 310
},
{
"entropy": 0.5443004906177521,
"epoch": 1.3114754098360657,
"grad_norm": 0.20651134848594666,
"learning_rate": 1.1284153005464482e-05,
"loss": 0.0048,
"mean_token_accuracy": 0.9984375,
"num_tokens": 399048.0,
"step": 320
},
{
"entropy": 0.5586306273937225,
"epoch": 1.3524590163934427,
"grad_norm": 0.1290273219347,
"learning_rate": 1.1010928961748635e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.99921875,
"num_tokens": 411528.0,
"step": 330
},
{
"entropy": 0.563942140340805,
"epoch": 1.3934426229508197,
"grad_norm": 5.307101249694824,
"learning_rate": 1.0737704918032787e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.99921875,
"num_tokens": 424008.0,
"step": 340
},
{
"entropy": 0.5790425062179565,
"epoch": 1.4344262295081966,
"grad_norm": 0.037372056394815445,
"learning_rate": 1.046448087431694e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 436488.0,
"step": 350
},
{
"entropy": 0.5586484789848327,
"epoch": 1.4754098360655736,
"grad_norm": 22.3153133392334,
"learning_rate": 1.0191256830601092e-05,
"loss": 0.0068,
"mean_token_accuracy": 0.996875,
"num_tokens": 448968.0,
"step": 360
},
{
"entropy": 0.5542426824569702,
"epoch": 1.5163934426229508,
"grad_norm": 0.38335874676704407,
"learning_rate": 9.918032786885246e-06,
"loss": 0.0045,
"mean_token_accuracy": 0.99921875,
"num_tokens": 461448.0,
"step": 370
},
{
"entropy": 0.5510773122310638,
"epoch": 1.5573770491803278,
"grad_norm": 5.615074157714844,
"learning_rate": 9.6448087431694e-06,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 473928.0,
"step": 380
},
{
"entropy": 0.5534482300281525,
"epoch": 1.598360655737705,
"grad_norm": 1.5892225503921509,
"learning_rate": 9.371584699453553e-06,
"loss": 0.0046,
"mean_token_accuracy": 0.99765625,
"num_tokens": 486408.0,
"step": 390
},
{
"entropy": 0.5506496012210846,
"epoch": 1.639344262295082,
"grad_norm": 0.008472824469208717,
"learning_rate": 9.098360655737707e-06,
"loss": 0.0044,
"mean_token_accuracy": 0.9984375,
"num_tokens": 498888.0,
"step": 400
},
{
"entropy": 0.5349482536315918,
"epoch": 1.680327868852459,
"grad_norm": 1.03555166721344,
"learning_rate": 8.825136612021858e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 511368.0,
"step": 410
},
{
"entropy": 0.5442528307437897,
"epoch": 1.721311475409836,
"grad_norm": 0.12735722959041595,
"learning_rate": 8.551912568306012e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 523848.0,
"step": 420
},
{
"entropy": 0.5300421416759491,
"epoch": 1.762295081967213,
"grad_norm": 0.1337188482284546,
"learning_rate": 8.278688524590165e-06,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 536328.0,
"step": 430
},
{
"entropy": 0.5553384840488433,
"epoch": 1.8032786885245902,
"grad_norm": 0.01123378612101078,
"learning_rate": 8.005464480874317e-06,
"loss": 0.0019,
"mean_token_accuracy": 0.99921875,
"num_tokens": 548808.0,
"step": 440
},
{
"entropy": 0.5755411803722381,
"epoch": 1.8442622950819674,
"grad_norm": 0.002041811356320977,
"learning_rate": 7.732240437158471e-06,
"loss": 0.0001,
"mean_token_accuracy": 1.0,
"num_tokens": 561288.0,
"step": 450
},
{
"entropy": 0.564773005247116,
"epoch": 1.8852459016393444,
"grad_norm": 0.004369141533970833,
"learning_rate": 7.459016393442624e-06,
"loss": 0.0001,
"mean_token_accuracy": 1.0,
"num_tokens": 573768.0,
"step": 460
},
{
"entropy": 0.5664382457733155,
"epoch": 1.9262295081967213,
"grad_norm": 0.1301860511302948,
"learning_rate": 7.185792349726777e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 586248.0,
"step": 470
},
{
"entropy": 0.5698906660079956,
"epoch": 1.9672131147540983,
"grad_norm": 0.011921355500817299,
"learning_rate": 6.91256830601093e-06,
"loss": 0.0001,
"mean_token_accuracy": 1.0,
"num_tokens": 598728.0,
"step": 480
},
{
"epoch": 2.0,
"eval_entropy": 0.5514066629939609,
"eval_loss": 0.0009000992868095636,
"eval_mean_token_accuracy": 0.9998263888888889,
"eval_num_tokens": 608400.0,
"eval_runtime": 4.2684,
"eval_samples_per_second": 333.145,
"eval_steps_per_second": 10.543,
"step": 488
},
{
"entropy": 0.5585790574550629,
"epoch": 2.0081967213114753,
"grad_norm": 0.16178520023822784,
"learning_rate": 6.6393442622950825e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 610896.0,
"step": 490
},
{
"entropy": 0.5657597005367279,
"epoch": 2.0491803278688523,
"grad_norm": 0.0013239571126177907,
"learning_rate": 6.366120218579236e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 623376.0,
"step": 500
},
{
"entropy": 0.555700272321701,
"epoch": 2.0901639344262297,
"grad_norm": 0.0003598702314775437,
"learning_rate": 6.092896174863389e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 635856.0,
"step": 510
},
{
"entropy": 0.5675460159778595,
"epoch": 2.1311475409836067,
"grad_norm": 0.0006554612773470581,
"learning_rate": 5.8196721311475415e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 648336.0,
"step": 520
},
{
"entropy": 0.5628720939159393,
"epoch": 2.1721311475409837,
"grad_norm": 0.0010637306841090322,
"learning_rate": 5.546448087431694e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 660816.0,
"step": 530
},
{
"entropy": 0.5665178418159484,
"epoch": 2.2131147540983607,
"grad_norm": 0.0005588372005149722,
"learning_rate": 5.273224043715848e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 673296.0,
"step": 540
},
{
"entropy": 0.572381979227066,
"epoch": 2.2540983606557377,
"grad_norm": 0.0007820471655577421,
"learning_rate": 5e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 685776.0,
"step": 550
},
{
"entropy": 0.5639007449150085,
"epoch": 2.2950819672131146,
"grad_norm": 0.0003962396876886487,
"learning_rate": 4.726775956284154e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 698256.0,
"step": 560
},
{
"entropy": 0.5546983301639556,
"epoch": 2.3360655737704916,
"grad_norm": 0.0002764791715890169,
"learning_rate": 4.453551912568307e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 710736.0,
"step": 570
},
{
"entropy": 0.5563873648643494,
"epoch": 2.3770491803278686,
"grad_norm": 0.0006734775961376727,
"learning_rate": 4.180327868852459e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 723216.0,
"step": 580
},
{
"entropy": 0.5603203475475311,
"epoch": 2.418032786885246,
"grad_norm": 0.001175143290311098,
"learning_rate": 3.907103825136612e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 735696.0,
"step": 590
},
{
"entropy": 0.5609167218208313,
"epoch": 2.459016393442623,
"grad_norm": 0.0010104449465870857,
"learning_rate": 3.6338797814207656e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 748176.0,
"step": 600
},
{
"entropy": 0.5623033583164215,
"epoch": 2.5,
"grad_norm": 0.0007569916197098792,
"learning_rate": 3.3606557377049183e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 760656.0,
"step": 610
},
{
"entropy": 0.5588847517967224,
"epoch": 2.540983606557377,
"grad_norm": 0.00043811326031573117,
"learning_rate": 3.0874316939890714e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 773136.0,
"step": 620
},
{
"entropy": 0.5473314583301544,
"epoch": 2.581967213114754,
"grad_norm": 0.00015868025366216898,
"learning_rate": 2.814207650273224e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 785616.0,
"step": 630
},
{
"entropy": 0.5686765968799591,
"epoch": 2.6229508196721314,
"grad_norm": 0.000258870713878423,
"learning_rate": 2.5409836065573773e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 798096.0,
"step": 640
},
{
"entropy": 0.5574231624603272,
"epoch": 2.663934426229508,
"grad_norm": 0.0005557859549298882,
"learning_rate": 2.2677595628415304e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 810576.0,
"step": 650
},
{
"entropy": 0.5558390915393829,
"epoch": 2.7049180327868854,
"grad_norm": 0.002176334150135517,
"learning_rate": 1.994535519125683e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 823056.0,
"step": 660
},
{
"entropy": 0.5470618724822998,
"epoch": 2.7459016393442623,
"grad_norm": 0.0007859561592340469,
"learning_rate": 1.7213114754098362e-06,
"loss": 0.0001,
"mean_token_accuracy": 1.0,
"num_tokens": 835536.0,
"step": 670
},
{
"entropy": 0.555149644613266,
"epoch": 2.7868852459016393,
"grad_norm": 0.0003778956306632608,
"learning_rate": 1.4480874316939891e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 848016.0,
"step": 680
},
{
"entropy": 0.5577639102935791,
"epoch": 2.8278688524590163,
"grad_norm": 0.00018476726836524904,
"learning_rate": 1.1748633879781422e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 860496.0,
"step": 690
},
{
"entropy": 0.5558693587779999,
"epoch": 2.8688524590163933,
"grad_norm": 0.00044889526907354593,
"learning_rate": 9.016393442622952e-07,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 872976.0,
"step": 700
},
{
"entropy": 0.5639389038085938,
"epoch": 2.9098360655737707,
"grad_norm": 0.0005407112766988575,
"learning_rate": 6.284153005464482e-07,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 885456.0,
"step": 710
},
{
"entropy": 0.5556601345539093,
"epoch": 2.9508196721311473,
"grad_norm": 0.0008689384558238089,
"learning_rate": 3.551912568306011e-07,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 897936.0,
"step": 720
},
{
"entropy": 0.5493379056453704,
"epoch": 2.9918032786885247,
"grad_norm": 0.0004895636229775846,
"learning_rate": 8.19672131147541e-08,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 910416.0,
"step": 730
},
{
"epoch": 3.0,
"eval_entropy": 0.5459074311786227,
"eval_loss": 2.739655656114337e-06,
"eval_mean_token_accuracy": 1.0,
"eval_num_tokens": 912600.0,
"eval_runtime": 4.2464,
"eval_samples_per_second": 334.874,
"eval_steps_per_second": 10.597,
"step": 732
}
],
"logging_steps": 10,
"max_steps": 732,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 549345133209600.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}