agrofinetune / checkpoint-3474 /trainer_state.json
RetrO21's picture
Upload folder using huggingface_hub
8496b6d verified
{
"best_global_step": 3474,
"best_metric": 5.55628776550293,
"best_model_checkpoint": "./output/checkpoint-3474",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 3474,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 3.606692385673523,
"epoch": 0.028785261945883708,
"grad_norm": 3.2999913692474365,
"learning_rate": 4.9e-07,
"loss": 13.6598,
"mean_token_accuracy": 0.16028020828962325,
"num_tokens": 53993.0,
"step": 50
},
{
"entropy": 3.618675880432129,
"epoch": 0.057570523891767415,
"grad_norm": 3.101252555847168,
"learning_rate": 9.9e-07,
"loss": 14.0188,
"mean_token_accuracy": 0.1508466500043869,
"num_tokens": 110134.0,
"step": 100
},
{
"entropy": 3.5215235900878907,
"epoch": 0.08635578583765112,
"grad_norm": 3.513662815093994,
"learning_rate": 1.49e-06,
"loss": 12.8555,
"mean_token_accuracy": 0.18527640983462335,
"num_tokens": 160191.0,
"step": 150
},
{
"entropy": 3.667909698486328,
"epoch": 0.11514104778353483,
"grad_norm": 4.327610492706299,
"learning_rate": 1.99e-06,
"loss": 13.5394,
"mean_token_accuracy": 0.157139780074358,
"num_tokens": 214993.0,
"step": 200
},
{
"entropy": 3.768263258934021,
"epoch": 0.14392630972941853,
"grad_norm": 4.290107250213623,
"learning_rate": 1.988450206246317e-06,
"loss": 12.8912,
"mean_token_accuracy": 0.17374794125556947,
"num_tokens": 268184.0,
"step": 250
},
{
"entropy": 3.990619196891785,
"epoch": 0.17271157167530224,
"grad_norm": 4.444278717041016,
"learning_rate": 1.976664702416028e-06,
"loss": 12.455,
"mean_token_accuracy": 0.17780130118131637,
"num_tokens": 319458.0,
"step": 300
},
{
"entropy": 4.162646284103394,
"epoch": 0.20149683362118595,
"grad_norm": 5.615262508392334,
"learning_rate": 1.9648791985857395e-06,
"loss": 12.0893,
"mean_token_accuracy": 0.18191319867968558,
"num_tokens": 373337.0,
"step": 350
},
{
"entropy": 4.532100868225098,
"epoch": 0.23028209556706966,
"grad_norm": 10.074016571044922,
"learning_rate": 1.9530936947554507e-06,
"loss": 11.9261,
"mean_token_accuracy": 0.169477596282959,
"num_tokens": 427526.0,
"step": 400
},
{
"entropy": 4.923871030807495,
"epoch": 0.25906735751295334,
"grad_norm": 16.220163345336914,
"learning_rate": 1.9413081909251622e-06,
"loss": 11.0048,
"mean_token_accuracy": 0.1704501649737358,
"num_tokens": 480528.0,
"step": 450
},
{
"entropy": 5.521005854606629,
"epoch": 0.28785261945883706,
"grad_norm": 29.904008865356445,
"learning_rate": 1.9295226870948733e-06,
"loss": 9.6524,
"mean_token_accuracy": 0.16450899541378022,
"num_tokens": 535314.0,
"step": 500
},
{
"entropy": 6.092623329162597,
"epoch": 0.31663788140472077,
"grad_norm": 17.821575164794922,
"learning_rate": 1.9177371832645845e-06,
"loss": 8.1054,
"mean_token_accuracy": 0.17205011785030366,
"num_tokens": 588410.0,
"step": 550
},
{
"entropy": 6.385262680053711,
"epoch": 0.3454231433506045,
"grad_norm": 5.502202987670898,
"learning_rate": 1.9059516794342958e-06,
"loss": 7.4313,
"mean_token_accuracy": 0.1734227080643177,
"num_tokens": 641736.0,
"step": 600
},
{
"entropy": 6.278562617301941,
"epoch": 0.3742084052964882,
"grad_norm": 5.4657697677612305,
"learning_rate": 1.8941661756040071e-06,
"loss": 6.9266,
"mean_token_accuracy": 0.18680249139666558,
"num_tokens": 692200.0,
"step": 650
},
{
"entropy": 6.553266277313233,
"epoch": 0.4029936672423719,
"grad_norm": 4.955812931060791,
"learning_rate": 1.8823806717737183e-06,
"loss": 6.9847,
"mean_token_accuracy": 0.16679802387952805,
"num_tokens": 745830.0,
"step": 700
},
{
"entropy": 6.470935583114624,
"epoch": 0.4317789291882556,
"grad_norm": 4.198381423950195,
"learning_rate": 1.8705951679434296e-06,
"loss": 6.7277,
"mean_token_accuracy": 0.17847734570503235,
"num_tokens": 798872.0,
"step": 750
},
{
"entropy": 6.5620588779449465,
"epoch": 0.4605641911341393,
"grad_norm": 3.1793746948242188,
"learning_rate": 1.8588096641131407e-06,
"loss": 6.7032,
"mean_token_accuracy": 0.17336134731769562,
"num_tokens": 853045.0,
"step": 800
},
{
"entropy": 6.532204885482788,
"epoch": 0.48934945308002303,
"grad_norm": 3.824537515640259,
"learning_rate": 1.847024160282852e-06,
"loss": 6.5762,
"mean_token_accuracy": 0.1805124071240425,
"num_tokens": 907679.0,
"step": 850
},
{
"entropy": 6.535988225936889,
"epoch": 0.5181347150259067,
"grad_norm": 4.350001811981201,
"learning_rate": 1.8352386564525632e-06,
"loss": 6.505,
"mean_token_accuracy": 0.1842605724930763,
"num_tokens": 964170.0,
"step": 900
},
{
"entropy": 6.204533562660218,
"epoch": 0.5469199769717904,
"grad_norm": 2.193660020828247,
"learning_rate": 1.8234531526222745e-06,
"loss": 6.1211,
"mean_token_accuracy": 0.21968430042266845,
"num_tokens": 1015909.0,
"step": 950
},
{
"entropy": 6.308737449645996,
"epoch": 0.5757052389176741,
"grad_norm": 2.325622320175171,
"learning_rate": 1.8116676487919857e-06,
"loss": 6.1653,
"mean_token_accuracy": 0.21636426240205764,
"num_tokens": 1068859.0,
"step": 1000
},
{
"entropy": 6.332560749053955,
"epoch": 0.6044905008635578,
"grad_norm": 2.0439090728759766,
"learning_rate": 1.799882144961697e-06,
"loss": 6.1559,
"mean_token_accuracy": 0.21859725564718246,
"num_tokens": 1123202.0,
"step": 1050
},
{
"entropy": 6.042124252319336,
"epoch": 0.6332757628094415,
"grad_norm": 3.621903657913208,
"learning_rate": 1.7880966411314081e-06,
"loss": 5.8441,
"mean_token_accuracy": 0.24906315237283708,
"num_tokens": 1173403.0,
"step": 1100
},
{
"entropy": 5.921343173980713,
"epoch": 0.6620610247553252,
"grad_norm": 5.658033847808838,
"learning_rate": 1.7763111373011195e-06,
"loss": 5.7104,
"mean_token_accuracy": 0.2625067520141602,
"num_tokens": 1225026.0,
"step": 1150
},
{
"entropy": 6.093586492538452,
"epoch": 0.690846286701209,
"grad_norm": 2.4292995929718018,
"learning_rate": 1.7645256334708308e-06,
"loss": 5.8658,
"mean_token_accuracy": 0.24842385441064835,
"num_tokens": 1279013.0,
"step": 1200
},
{
"entropy": 6.119112596511841,
"epoch": 0.7196315486470927,
"grad_norm": 3.369384288787842,
"learning_rate": 1.752740129640542e-06,
"loss": 5.8784,
"mean_token_accuracy": 0.24857850253582,
"num_tokens": 1332547.0,
"step": 1250
},
{
"entropy": 6.025163550376892,
"epoch": 0.7484168105929764,
"grad_norm": 2.5110116004943848,
"learning_rate": 1.7409546258102533e-06,
"loss": 5.7769,
"mean_token_accuracy": 0.25835376888513567,
"num_tokens": 1385192.0,
"step": 1300
},
{
"entropy": 5.877259612083435,
"epoch": 0.7772020725388601,
"grad_norm": 2.4179303646087646,
"learning_rate": 1.7291691219799646e-06,
"loss": 5.6284,
"mean_token_accuracy": 0.2756252554059029,
"num_tokens": 1437071.0,
"step": 1350
},
{
"entropy": 6.002246947288513,
"epoch": 0.8059873344847438,
"grad_norm": 3.494359016418457,
"learning_rate": 1.717383618149676e-06,
"loss": 5.747,
"mean_token_accuracy": 0.26462210685014725,
"num_tokens": 1490818.0,
"step": 1400
},
{
"entropy": 5.991955623626709,
"epoch": 0.8347725964306275,
"grad_norm": 2.340975761413574,
"learning_rate": 1.705598114319387e-06,
"loss": 5.7379,
"mean_token_accuracy": 0.26444981098175047,
"num_tokens": 1544997.0,
"step": 1450
},
{
"entropy": 5.91768889427185,
"epoch": 0.8635578583765112,
"grad_norm": 2.2394514083862305,
"learning_rate": 1.6938126104890984e-06,
"loss": 5.6564,
"mean_token_accuracy": 0.2730415526032448,
"num_tokens": 1598302.0,
"step": 1500
},
{
"entropy": 5.982716989517212,
"epoch": 0.8923431203223949,
"grad_norm": 1.876839518547058,
"learning_rate": 1.6820271066588098e-06,
"loss": 5.7215,
"mean_token_accuracy": 0.26642445534467696,
"num_tokens": 1655267.0,
"step": 1550
},
{
"entropy": 5.820467872619629,
"epoch": 0.9211283822682786,
"grad_norm": 2.219966173171997,
"learning_rate": 1.6702416028285209e-06,
"loss": 5.5555,
"mean_token_accuracy": 0.2856418335437775,
"num_tokens": 1709199.0,
"step": 1600
},
{
"entropy": 5.996349005699158,
"epoch": 0.9499136442141624,
"grad_norm": 2.247213840484619,
"learning_rate": 1.6584560989982322e-06,
"loss": 5.7283,
"mean_token_accuracy": 0.2696125540137291,
"num_tokens": 1765443.0,
"step": 1650
},
{
"entropy": 5.696683068275451,
"epoch": 0.9786989061600461,
"grad_norm": 2.8499979972839355,
"learning_rate": 1.6466705951679433e-06,
"loss": 5.4335,
"mean_token_accuracy": 0.29918427973985673,
"num_tokens": 1817494.0,
"step": 1700
},
{
"epoch": 1.0,
"eval_entropy": 5.993559589034401,
"eval_loss": 5.737204551696777,
"eval_mean_token_accuracy": 0.2618687468739699,
"eval_model_preparation_time": 0.0045,
"eval_num_tokens": 1856362.0,
"eval_runtime": 50.5332,
"eval_samples_per_second": 8.588,
"eval_steps_per_second": 4.294,
"step": 1737
},
{
"entropy": 5.746842083930969,
"epoch": 1.0074841681059297,
"grad_norm": 2.33052921295166,
"learning_rate": 1.6348850913376547e-06,
"loss": 5.4796,
"mean_token_accuracy": 0.2966849410533905,
"num_tokens": 1870353.0,
"step": 1750
},
{
"entropy": 5.859029049873352,
"epoch": 1.0362694300518134,
"grad_norm": 1.6248886585235596,
"learning_rate": 1.6230995875073658e-06,
"loss": 5.5975,
"mean_token_accuracy": 0.2838129925727844,
"num_tokens": 1926205.0,
"step": 1800
},
{
"entropy": 5.731445336341858,
"epoch": 1.065054691997697,
"grad_norm": 1.6941566467285156,
"learning_rate": 1.6113140836770771e-06,
"loss": 5.476,
"mean_token_accuracy": 0.2992346465587616,
"num_tokens": 1979821.0,
"step": 1850
},
{
"entropy": 5.6993954515457155,
"epoch": 1.0938399539435808,
"grad_norm": 1.1746597290039062,
"learning_rate": 1.5995285798467883e-06,
"loss": 5.4608,
"mean_token_accuracy": 0.3000726142525673,
"num_tokens": 2034373.0,
"step": 1900
},
{
"entropy": 5.668873124122619,
"epoch": 1.1226252158894645,
"grad_norm": 1.728211760520935,
"learning_rate": 1.5877430760164996e-06,
"loss": 5.4347,
"mean_token_accuracy": 0.3033922725915909,
"num_tokens": 2087339.0,
"step": 1950
},
{
"entropy": 5.624621086120605,
"epoch": 1.1514104778353482,
"grad_norm": 1.4078539609909058,
"learning_rate": 1.5759575721862107e-06,
"loss": 5.3954,
"mean_token_accuracy": 0.30784171640872954,
"num_tokens": 2139520.0,
"step": 2000
},
{
"entropy": 5.7141213130950925,
"epoch": 1.180195739781232,
"grad_norm": 2.186459541320801,
"learning_rate": 1.564172068355922e-06,
"loss": 5.4847,
"mean_token_accuracy": 0.29594049394130706,
"num_tokens": 2193987.0,
"step": 2050
},
{
"entropy": 5.632415266036987,
"epoch": 1.2089810017271156,
"grad_norm": 1.3601349592208862,
"learning_rate": 1.5523865645256334e-06,
"loss": 5.4135,
"mean_token_accuracy": 0.30366597563028336,
"num_tokens": 2249616.0,
"step": 2100
},
{
"entropy": 5.510904269218445,
"epoch": 1.2377662636729994,
"grad_norm": 2.065760612487793,
"learning_rate": 1.5406010606953445e-06,
"loss": 5.2904,
"mean_token_accuracy": 0.3211754837632179,
"num_tokens": 2300863.0,
"step": 2150
},
{
"entropy": 5.703383626937867,
"epoch": 1.266551525618883,
"grad_norm": 1.1172698736190796,
"learning_rate": 1.5288155568650559e-06,
"loss": 5.4802,
"mean_token_accuracy": 0.29713701367378237,
"num_tokens": 2356029.0,
"step": 2200
},
{
"entropy": 5.565930342674255,
"epoch": 1.2953367875647668,
"grad_norm": 1.7528513669967651,
"learning_rate": 1.5170300530347672e-06,
"loss": 5.3518,
"mean_token_accuracy": 0.31301232606172563,
"num_tokens": 2408957.0,
"step": 2250
},
{
"entropy": 5.496430187225342,
"epoch": 1.3241220495106505,
"grad_norm": 1.892640233039856,
"learning_rate": 1.5052445492044786e-06,
"loss": 5.2967,
"mean_token_accuracy": 0.3181899458169937,
"num_tokens": 2462569.0,
"step": 2300
},
{
"entropy": 5.725150098800659,
"epoch": 1.3529073114565342,
"grad_norm": 1.774940848350525,
"learning_rate": 1.4934590453741897e-06,
"loss": 5.5215,
"mean_token_accuracy": 0.29055028676986694,
"num_tokens": 2518544.0,
"step": 2350
},
{
"entropy": 5.4884827613830565,
"epoch": 1.381692573402418,
"grad_norm": 2.2167599201202393,
"learning_rate": 1.481673541543901e-06,
"loss": 5.2917,
"mean_token_accuracy": 0.31803421139717103,
"num_tokens": 2570863.0,
"step": 2400
},
{
"entropy": 5.697079472541809,
"epoch": 1.4104778353483016,
"grad_norm": 1.6489030122756958,
"learning_rate": 1.4698880377136124e-06,
"loss": 5.4982,
"mean_token_accuracy": 0.2925163987278938,
"num_tokens": 2626998.0,
"step": 2450
},
{
"entropy": 5.46209939956665,
"epoch": 1.4392630972941853,
"grad_norm": 1.153914451599121,
"learning_rate": 1.4581025338833235e-06,
"loss": 5.2736,
"mean_token_accuracy": 0.3182168474793434,
"num_tokens": 2681568.0,
"step": 2500
},
{
"entropy": 5.4405768728256225,
"epoch": 1.468048359240069,
"grad_norm": 3.6614978313446045,
"learning_rate": 1.4463170300530348e-06,
"loss": 5.2515,
"mean_token_accuracy": 0.3218736210465431,
"num_tokens": 2733587.0,
"step": 2550
},
{
"entropy": 5.528175053596496,
"epoch": 1.4968336211859528,
"grad_norm": 1.0849746465682983,
"learning_rate": 1.434531526222746e-06,
"loss": 5.3378,
"mean_token_accuracy": 0.31061659604310987,
"num_tokens": 2787003.0,
"step": 2600
},
{
"entropy": 5.46110897064209,
"epoch": 1.5256188831318365,
"grad_norm": 1.8315683603286743,
"learning_rate": 1.4227460223924573e-06,
"loss": 5.2782,
"mean_token_accuracy": 0.31781029611825945,
"num_tokens": 2840263.0,
"step": 2650
},
{
"entropy": 5.455560960769653,
"epoch": 1.5544041450777202,
"grad_norm": 1.1859091520309448,
"learning_rate": 1.4109605185621684e-06,
"loss": 5.2735,
"mean_token_accuracy": 0.3194814011454582,
"num_tokens": 2894186.0,
"step": 2700
},
{
"entropy": 5.430496115684509,
"epoch": 1.583189407023604,
"grad_norm": 2.3500001430511475,
"learning_rate": 1.3991750147318797e-06,
"loss": 5.2464,
"mean_token_accuracy": 0.32140792965888976,
"num_tokens": 2948171.0,
"step": 2750
},
{
"entropy": 5.588023023605347,
"epoch": 1.6119746689694876,
"grad_norm": 1.727825403213501,
"learning_rate": 1.3873895109015909e-06,
"loss": 5.4028,
"mean_token_accuracy": 0.3039530631899834,
"num_tokens": 3002678.0,
"step": 2800
},
{
"entropy": 5.410525422096253,
"epoch": 1.6407599309153713,
"grad_norm": 1.3401474952697754,
"learning_rate": 1.3756040070713022e-06,
"loss": 5.2298,
"mean_token_accuracy": 0.324065263569355,
"num_tokens": 3055844.0,
"step": 2850
},
{
"entropy": 5.36959942817688,
"epoch": 1.669545192861255,
"grad_norm": 1.1892589330673218,
"learning_rate": 1.3638185032410133e-06,
"loss": 5.1956,
"mean_token_accuracy": 0.32639502108097074,
"num_tokens": 3108636.0,
"step": 2900
},
{
"entropy": 5.53826907157898,
"epoch": 1.6983304548071387,
"grad_norm": 1.2652360200881958,
"learning_rate": 1.3520329994107247e-06,
"loss": 5.3583,
"mean_token_accuracy": 0.3074926760792732,
"num_tokens": 3162627.0,
"step": 2950
},
{
"entropy": 5.417449145317078,
"epoch": 1.7271157167530224,
"grad_norm": 1.584312915802002,
"learning_rate": 1.340247495580436e-06,
"loss": 5.2388,
"mean_token_accuracy": 0.32019727885723115,
"num_tokens": 3216409.0,
"step": 3000
},
{
"entropy": 5.241390740871429,
"epoch": 1.7559009786989062,
"grad_norm": 1.5219439268112183,
"learning_rate": 1.3284619917501471e-06,
"loss": 5.0645,
"mean_token_accuracy": 0.3445430138707161,
"num_tokens": 3266967.0,
"step": 3050
},
{
"entropy": 5.405424036979675,
"epoch": 1.7846862406447899,
"grad_norm": 2.1165153980255127,
"learning_rate": 1.3166764879198585e-06,
"loss": 5.232,
"mean_token_accuracy": 0.32085000157356264,
"num_tokens": 3319877.0,
"step": 3100
},
{
"entropy": 5.123006024360657,
"epoch": 1.8134715025906736,
"grad_norm": 1.2189785242080688,
"learning_rate": 1.3048909840895698e-06,
"loss": 4.9582,
"mean_token_accuracy": 0.356108532845974,
"num_tokens": 3368569.0,
"step": 3150
},
{
"entropy": 5.417610831260681,
"epoch": 1.8422567645365573,
"grad_norm": 1.5157604217529297,
"learning_rate": 1.2931054802592812e-06,
"loss": 5.2454,
"mean_token_accuracy": 0.31976755023002623,
"num_tokens": 3422449.0,
"step": 3200
},
{
"entropy": 5.409690895080566,
"epoch": 1.871042026482441,
"grad_norm": 1.3088161945343018,
"learning_rate": 1.2813199764289923e-06,
"loss": 5.2348,
"mean_token_accuracy": 0.32325415283441544,
"num_tokens": 3474399.0,
"step": 3250
},
{
"entropy": 5.44662567615509,
"epoch": 1.8998272884283247,
"grad_norm": 2.178372621536255,
"learning_rate": 1.2695344725987036e-06,
"loss": 5.2661,
"mean_token_accuracy": 0.3182847076654434,
"num_tokens": 3527726.0,
"step": 3300
},
{
"entropy": 5.512614865303039,
"epoch": 1.9286125503742084,
"grad_norm": 1.3050425052642822,
"learning_rate": 1.2577489687684147e-06,
"loss": 5.3416,
"mean_token_accuracy": 0.3084403133392334,
"num_tokens": 3581980.0,
"step": 3350
},
{
"entropy": 5.379772834777832,
"epoch": 1.9573978123200921,
"grad_norm": 1.4584404230117798,
"learning_rate": 1.245963464938126e-06,
"loss": 5.2087,
"mean_token_accuracy": 0.32388432770967485,
"num_tokens": 3635393.0,
"step": 3400
},
{
"entropy": 5.483665924072266,
"epoch": 1.9861830742659758,
"grad_norm": 1.2157734632492065,
"learning_rate": 1.2341779611078374e-06,
"loss": 5.3101,
"mean_token_accuracy": 0.3121953472495079,
"num_tokens": 3689894.0,
"step": 3450
},
{
"epoch": 2.0,
"eval_entropy": 5.711394641805904,
"eval_loss": 5.55628776550293,
"eval_mean_token_accuracy": 0.2764948787777105,
"eval_model_preparation_time": 0.0045,
"eval_num_tokens": 3712724.0,
"eval_runtime": 50.187,
"eval_samples_per_second": 8.648,
"eval_steps_per_second": 4.324,
"step": 3474
}
],
"logging_steps": 50,
"max_steps": 8685,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.088598592372736e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}