Salma204's picture
M2: GK SFT LoRA v2 — MMLU + TriviaQA + NQ
d7e7165 verified
{
"best_global_step": 3000,
"best_metric": 0.43371766805648804,
"best_model_checkpoint": "/scratch/gk_checkpoint_lora_v2/checkpoint-3000",
"epoch": 0.9051821679112921,
"eval_steps": 200,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.0372698324173688,
"epoch": 0.015086369465188203,
"grad_norm": 0.4944687783718109,
"learning_rate": 3.266666666666667e-05,
"loss": 2.1209957885742186,
"mean_token_accuracy": 0.6639667785167694,
"num_tokens": 213967.0,
"step": 50
},
{
"entropy": 0.4767542722821236,
"epoch": 0.030172738930376405,
"grad_norm": 0.29953643679618835,
"learning_rate": 6.6e-05,
"loss": 0.5209395980834961,
"mean_token_accuracy": 0.9028549310564995,
"num_tokens": 426365.0,
"step": 100
},
{
"entropy": 0.46151453502476214,
"epoch": 0.04525910839556461,
"grad_norm": 0.3118787109851837,
"learning_rate": 9.933333333333334e-05,
"loss": 0.508198356628418,
"mean_token_accuracy": 0.9032981966435909,
"num_tokens": 642837.0,
"step": 150
},
{
"entropy": 0.4455497920885682,
"epoch": 0.06034547786075281,
"grad_norm": 0.3013271391391754,
"learning_rate": 9.999382532513122e-05,
"loss": 0.48826019287109373,
"mean_token_accuracy": 0.907553653717041,
"num_tokens": 856475.0,
"step": 200
},
{
"epoch": 0.06034547786075281,
"eval_entropy": 0.4689600637588114,
"eval_loss": 0.4816349446773529,
"eval_mean_token_accuracy": 0.8987587762934696,
"eval_num_tokens": 856475.0,
"eval_runtime": 37.5206,
"eval_samples_per_second": 57.728,
"eval_steps_per_second": 7.223,
"step": 200
},
{
"entropy": 0.44372758489102127,
"epoch": 0.075431847325941,
"grad_norm": 0.31373220682144165,
"learning_rate": 9.997479627263544e-05,
"loss": 0.4843710327148438,
"mean_token_accuracy": 0.9082412907481193,
"num_tokens": 1069370.0,
"step": 250
},
{
"entropy": 0.4475720078870654,
"epoch": 0.09051821679112922,
"grad_norm": 0.27380964159965515,
"learning_rate": 9.994291516446573e-05,
"loss": 0.491109733581543,
"mean_token_accuracy": 0.9055162121355533,
"num_tokens": 1286132.0,
"step": 300
},
{
"entropy": 0.4463552813604474,
"epoch": 0.10560458625631741,
"grad_norm": 0.2614763677120209,
"learning_rate": 9.989819019951048e-05,
"loss": 0.4837772369384766,
"mean_token_accuracy": 0.9081570096313953,
"num_tokens": 1500851.0,
"step": 350
},
{
"entropy": 0.41013720393180847,
"epoch": 0.12069095572150562,
"grad_norm": 0.4836612045764923,
"learning_rate": 9.984063287972232e-05,
"loss": 0.44385364532470706,
"mean_token_accuracy": 0.9148843766748905,
"num_tokens": 1708807.0,
"step": 400
},
{
"epoch": 0.12069095572150562,
"eval_entropy": 0.4811463643044123,
"eval_loss": 0.4747391641139984,
"eval_mean_token_accuracy": 0.8997697566268189,
"eval_num_tokens": 1708807.0,
"eval_runtime": 37.1123,
"eval_samples_per_second": 58.363,
"eval_steps_per_second": 7.302,
"step": 400
},
{
"entropy": 0.42200190499424933,
"epoch": 0.13577732518669383,
"grad_norm": 0.19339531660079956,
"learning_rate": 9.977025800716017e-05,
"loss": 0.45712459564208985,
"mean_token_accuracy": 0.9106362241506577,
"num_tokens": 1923184.0,
"step": 450
},
{
"entropy": 0.43271509755402804,
"epoch": 0.150863694651882,
"grad_norm": 0.299110472202301,
"learning_rate": 9.968708368018253e-05,
"loss": 0.4724855422973633,
"mean_token_accuracy": 0.9078708891570568,
"num_tokens": 2139609.0,
"step": 500
},
{
"entropy": 0.43638833791017534,
"epoch": 0.16595006411707022,
"grad_norm": 0.2539425492286682,
"learning_rate": 9.959113128879322e-05,
"loss": 0.4785395050048828,
"mean_token_accuracy": 0.907249256670475,
"num_tokens": 2358080.0,
"step": 550
},
{
"entropy": 0.43115664307028057,
"epoch": 0.18103643358225843,
"grad_norm": 0.2542003393173218,
"learning_rate": 9.948242550914035e-05,
"loss": 0.4740608215332031,
"mean_token_accuracy": 0.9089943794906139,
"num_tokens": 2574667.0,
"step": 600
},
{
"epoch": 0.18103643358225843,
"eval_entropy": 0.45346878287537074,
"eval_loss": 0.46328845620155334,
"eval_mean_token_accuracy": 0.9025413253210568,
"eval_num_tokens": 2574667.0,
"eval_runtime": 37.1254,
"eval_samples_per_second": 58.343,
"eval_steps_per_second": 7.3,
"step": 600
},
{
"entropy": 0.4420048241317272,
"epoch": 0.19612280304744664,
"grad_norm": 0.26832085847854614,
"learning_rate": 9.936099429717045e-05,
"loss": 0.486652717590332,
"mean_token_accuracy": 0.9074076810479164,
"num_tokens": 2790489.0,
"step": 650
},
{
"entropy": 0.4269510039314628,
"epoch": 0.21120917251263482,
"grad_norm": 0.20635050535202026,
"learning_rate": 9.922686888143897e-05,
"loss": 0.4619187927246094,
"mean_token_accuracy": 0.910810690075159,
"num_tokens": 3003881.0,
"step": 700
},
{
"entropy": 0.42016164746135476,
"epoch": 0.22629554197782303,
"grad_norm": 0.2643264830112457,
"learning_rate": 9.908008375507924e-05,
"loss": 0.46344844818115233,
"mean_token_accuracy": 0.9113752076029777,
"num_tokens": 3218446.0,
"step": 750
},
{
"entropy": 0.3974369211867452,
"epoch": 0.24138191144301124,
"grad_norm": 0.19392798840999603,
"learning_rate": 9.89206766669318e-05,
"loss": 0.42646697998046873,
"mean_token_accuracy": 0.9165593402087688,
"num_tokens": 3428256.0,
"step": 800
},
{
"epoch": 0.24138191144301124,
"eval_entropy": 0.43642714537157784,
"eval_loss": 0.46031010150909424,
"eval_mean_token_accuracy": 0.9025695645061366,
"eval_num_tokens": 3428256.0,
"eval_runtime": 37.1078,
"eval_samples_per_second": 58.37,
"eval_steps_per_second": 7.303,
"step": 800
},
{
"entropy": 0.4217161551490426,
"epoch": 0.25646828090819945,
"grad_norm": 0.21452021598815918,
"learning_rate": 9.874868861183658e-05,
"loss": 0.4612973022460938,
"mean_token_accuracy": 0.9114794608950615,
"num_tokens": 3642529.0,
"step": 850
},
{
"entropy": 0.41439461953938006,
"epoch": 0.27155465037338766,
"grad_norm": 0.25079163908958435,
"learning_rate": 9.856416382009006e-05,
"loss": 0.4494070053100586,
"mean_token_accuracy": 0.9127840812504292,
"num_tokens": 3855962.0,
"step": 900
},
{
"entropy": 0.4270974922552705,
"epoch": 0.2866410198385759,
"grad_norm": 0.20817860960960388,
"learning_rate": 9.836714974607077e-05,
"loss": 0.46105358123779294,
"mean_token_accuracy": 0.9099104046821594,
"num_tokens": 4069157.0,
"step": 950
},
{
"entropy": 0.400859787017107,
"epoch": 0.301727389303764,
"grad_norm": 0.20943191647529602,
"learning_rate": 9.815769705603521e-05,
"loss": 0.4289055633544922,
"mean_token_accuracy": 0.9167061321437359,
"num_tokens": 4278743.0,
"step": 1000
},
{
"epoch": 0.301727389303764,
"eval_entropy": 0.44312036031946483,
"eval_loss": 0.4571220278739929,
"eval_mean_token_accuracy": 0.9029428505809545,
"eval_num_tokens": 4278743.0,
"eval_runtime": 37.0803,
"eval_samples_per_second": 58.414,
"eval_steps_per_second": 7.308,
"step": 1000
},
{
"entropy": 0.4052092955261469,
"epoch": 0.31681375876895224,
"grad_norm": 0.21670734882354736,
"learning_rate": 9.793585961508811e-05,
"loss": 0.44187084197998044,
"mean_token_accuracy": 0.9138142390549183,
"num_tokens": 4495120.0,
"step": 1050
},
{
"entropy": 0.3885428298264742,
"epoch": 0.33190012823414045,
"grad_norm": 0.2656344771385193,
"learning_rate": 9.770169447332977e-05,
"loss": 0.42026878356933595,
"mean_token_accuracy": 0.9171342994272709,
"num_tokens": 4707664.0,
"step": 1100
},
{
"entropy": 0.41116086438298227,
"epoch": 0.34698649769932866,
"grad_norm": 0.25747165083885193,
"learning_rate": 9.745526185118458e-05,
"loss": 0.44418087005615237,
"mean_token_accuracy": 0.9133055797219276,
"num_tokens": 4921515.0,
"step": 1150
},
{
"entropy": 0.40833847373723986,
"epoch": 0.36207286716451687,
"grad_norm": 0.22326916456222534,
"learning_rate": 9.719662512391396e-05,
"loss": 0.4394912338256836,
"mean_token_accuracy": 0.9140990000963211,
"num_tokens": 5133998.0,
"step": 1200
},
{
"epoch": 0.36207286716451687,
"eval_entropy": 0.44459747828240764,
"eval_loss": 0.4540960192680359,
"eval_mean_token_accuracy": 0.9033675605080664,
"eval_num_tokens": 5133998.0,
"eval_runtime": 37.1014,
"eval_samples_per_second": 58.381,
"eval_steps_per_second": 7.304,
"step": 1200
},
{
"entropy": 0.39192866910248997,
"epoch": 0.3771592366297051,
"grad_norm": 0.20642907917499542,
"learning_rate": 9.692585080531822e-05,
"loss": 0.42892616271972656,
"mean_token_accuracy": 0.9165673214197159,
"num_tokens": 5348047.0,
"step": 1250
},
{
"entropy": 0.40107650008052587,
"epoch": 0.3922456060948933,
"grad_norm": 0.2515887916088104,
"learning_rate": 9.664300853063104e-05,
"loss": 0.4329941558837891,
"mean_token_accuracy": 0.9152751086652279,
"num_tokens": 5562125.0,
"step": 1300
},
{
"entropy": 0.3964537301659584,
"epoch": 0.4073319755600815,
"grad_norm": 0.20992055535316467,
"learning_rate": 9.63481710386114e-05,
"loss": 0.4275414276123047,
"mean_token_accuracy": 0.9168652257323265,
"num_tokens": 5773685.0,
"step": 1350
},
{
"entropy": 0.39044841077178716,
"epoch": 0.42241834502526965,
"grad_norm": 0.30388110876083374,
"learning_rate": 9.604141415283728e-05,
"loss": 0.42324817657470704,
"mean_token_accuracy": 0.9169075645506382,
"num_tokens": 5986601.0,
"step": 1400
},
{
"epoch": 0.42241834502526965,
"eval_entropy": 0.4361799991658693,
"eval_loss": 0.45172417163848877,
"eval_mean_token_accuracy": 0.9033232137725802,
"eval_num_tokens": 5986601.0,
"eval_runtime": 37.1897,
"eval_samples_per_second": 58.242,
"eval_steps_per_second": 7.287,
"step": 1400
},
{
"entropy": 0.4137679870799184,
"epoch": 0.43750471449045786,
"grad_norm": 0.23755542933940887,
"learning_rate": 9.572281676220608e-05,
"loss": 0.4478377532958984,
"mean_token_accuracy": 0.911891212016344,
"num_tokens": 6203048.0,
"step": 1450
},
{
"entropy": 0.41279098089784383,
"epoch": 0.45259108395564607,
"grad_norm": 0.21780936419963837,
"learning_rate": 9.539246080064659e-05,
"loss": 0.45262195587158205,
"mean_token_accuracy": 0.9123758906126023,
"num_tokens": 6419624.0,
"step": 1500
},
{
"entropy": 0.41412957072257994,
"epoch": 0.4676774534208343,
"grad_norm": 0.2430579662322998,
"learning_rate": 9.505043122604818e-05,
"loss": 0.45246307373046873,
"mean_token_accuracy": 0.9122441673278808,
"num_tokens": 6633965.0,
"step": 1550
},
{
"entropy": 0.3798889485746622,
"epoch": 0.4827638228860225,
"grad_norm": 0.28653672337532043,
"learning_rate": 9.469681599841192e-05,
"loss": 0.41427810668945314,
"mean_token_accuracy": 0.9184439463913441,
"num_tokens": 6847358.0,
"step": 1600
},
{
"epoch": 0.4827638228860225,
"eval_entropy": 0.43242653594025826,
"eval_loss": 0.4488651752471924,
"eval_mean_token_accuracy": 0.9040581949082688,
"eval_num_tokens": 6847358.0,
"eval_runtime": 37.1595,
"eval_samples_per_second": 58.289,
"eval_steps_per_second": 7.293,
"step": 1600
},
{
"entropy": 0.38126184083521364,
"epoch": 0.4978501923512107,
"grad_norm": 0.2068091183900833,
"learning_rate": 9.433170605722996e-05,
"loss": 0.40500320434570314,
"mean_token_accuracy": 0.9181749866902829,
"num_tokens": 7062005.0,
"step": 1650
},
{
"entropy": 0.38879580337554215,
"epoch": 0.5129365618163989,
"grad_norm": 0.19908899068832397,
"learning_rate": 9.395519529809848e-05,
"loss": 0.41957916259765626,
"mean_token_accuracy": 0.9183010324835778,
"num_tokens": 7272082.0,
"step": 1700
},
{
"entropy": 0.3867104376107454,
"epoch": 0.5280229312815871,
"grad_norm": 0.21681655943393707,
"learning_rate": 9.356738054857057e-05,
"loss": 0.41496986389160156,
"mean_token_accuracy": 0.9176018598675728,
"num_tokens": 7484751.0,
"step": 1750
},
{
"entropy": 0.3900348538905382,
"epoch": 0.5431093007467753,
"grad_norm": 0.23264895379543304,
"learning_rate": 9.316836154325494e-05,
"loss": 0.4201799774169922,
"mean_token_accuracy": 0.9161376728117466,
"num_tokens": 7699385.0,
"step": 1800
},
{
"epoch": 0.5431093007467753,
"eval_entropy": 0.4371595958941977,
"eval_loss": 0.44555598497390747,
"eval_mean_token_accuracy": 0.9047388078101887,
"eval_num_tokens": 7699385.0,
"eval_runtime": 37.2391,
"eval_samples_per_second": 58.165,
"eval_steps_per_second": 7.277,
"step": 1800
},
{
"entropy": 0.38737339399755,
"epoch": 0.5581956702119635,
"grad_norm": 0.21236486732959747,
"learning_rate": 9.275824089816716e-05,
"loss": 0.4186508941650391,
"mean_token_accuracy": 0.9184837466478348,
"num_tokens": 7912846.0,
"step": 1850
},
{
"entropy": 0.38791515786200764,
"epoch": 0.5732820396771517,
"grad_norm": 0.22874821722507477,
"learning_rate": 9.233712408433972e-05,
"loss": 0.42144878387451173,
"mean_token_accuracy": 0.9170675221085548,
"num_tokens": 8126645.0,
"step": 1900
},
{
"entropy": 0.3831383780390024,
"epoch": 0.5883684091423399,
"grad_norm": 0.3072109818458557,
"learning_rate": 9.190511940069813e-05,
"loss": 0.407428092956543,
"mean_token_accuracy": 0.9184182004630566,
"num_tokens": 8341447.0,
"step": 1950
},
{
"entropy": 0.38130090072751044,
"epoch": 0.603454778607528,
"grad_norm": 0.2783527374267578,
"learning_rate": 9.146233794620944e-05,
"loss": 0.41518512725830076,
"mean_token_accuracy": 0.9192077203094959,
"num_tokens": 8553915.0,
"step": 2000
},
{
"epoch": 0.603454778607528,
"eval_entropy": 0.41359548831557874,
"eval_loss": 0.4426310062408447,
"eval_mean_token_accuracy": 0.9062746316744392,
"eval_num_tokens": 8553915.0,
"eval_runtime": 37.0772,
"eval_samples_per_second": 58.419,
"eval_steps_per_second": 7.309,
"step": 2000
},
{
"entropy": 0.40945424281060694,
"epoch": 0.6185411480727163,
"grad_norm": 0.2751815915107727,
"learning_rate": 9.100889359131093e-05,
"loss": 0.44279281616210936,
"mean_token_accuracy": 0.9126340833306312,
"num_tokens": 8773030.0,
"step": 2050
},
{
"entropy": 0.37910934548825026,
"epoch": 0.6336275175379045,
"grad_norm": 0.24518635869026184,
"learning_rate": 9.054490294862594e-05,
"loss": 0.41019065856933595,
"mean_token_accuracy": 0.9180504800379277,
"num_tokens": 8987621.0,
"step": 2100
},
{
"entropy": 0.38141273133456705,
"epoch": 0.6487138870030927,
"grad_norm": 0.2637041211128235,
"learning_rate": 9.00704853429745e-05,
"loss": 0.41344562530517576,
"mean_token_accuracy": 0.9188940741121769,
"num_tokens": 9202085.0,
"step": 2150
},
{
"entropy": 0.37150444712489844,
"epoch": 0.6638002564682809,
"grad_norm": 0.23061881959438324,
"learning_rate": 8.958576278068655e-05,
"loss": 0.4002714157104492,
"mean_token_accuracy": 0.9211013509333134,
"num_tokens": 9414528.0,
"step": 2200
},
{
"epoch": 0.6638002564682809,
"eval_entropy": 0.4198416776652706,
"eval_loss": 0.44056928157806396,
"eval_mean_token_accuracy": 0.9060557997094749,
"eval_num_tokens": 9414528.0,
"eval_runtime": 37.0429,
"eval_samples_per_second": 58.473,
"eval_steps_per_second": 7.316,
"step": 2200
},
{
"entropy": 0.3824780482426286,
"epoch": 0.6788866259334692,
"grad_norm": 0.29528528451919556,
"learning_rate": 8.909085991822532e-05,
"loss": 0.4100413513183594,
"mean_token_accuracy": 0.9181285245716572,
"num_tokens": 9631160.0,
"step": 2250
},
{
"entropy": 0.37036353170871733,
"epoch": 0.6939729953986573,
"grad_norm": 0.2833597958087921,
"learning_rate": 8.858590403012954e-05,
"loss": 0.39582439422607424,
"mean_token_accuracy": 0.9203065976500511,
"num_tokens": 9844323.0,
"step": 2300
},
{
"entropy": 0.377471005320549,
"epoch": 0.7090593648638455,
"grad_norm": 0.2559050917625427,
"learning_rate": 8.807102497628199e-05,
"loss": 0.4039160919189453,
"mean_token_accuracy": 0.9185835334658623,
"num_tokens": 10060066.0,
"step": 2350
},
{
"entropy": 0.38689912386238573,
"epoch": 0.7241457343290337,
"grad_norm": 0.3571145236492157,
"learning_rate": 8.754635516851342e-05,
"loss": 0.41998291015625,
"mean_token_accuracy": 0.9171991994976998,
"num_tokens": 10275374.0,
"step": 2400
},
{
"epoch": 0.7241457343290337,
"eval_entropy": 0.4061841280148038,
"eval_loss": 0.4392658472061157,
"eval_mean_token_accuracy": 0.9060493254573583,
"eval_num_tokens": 10275374.0,
"eval_runtime": 37.185,
"eval_samples_per_second": 58.249,
"eval_steps_per_second": 7.288,
"step": 2400
},
{
"entropy": 0.3773344187065959,
"epoch": 0.7392321037942219,
"grad_norm": 0.23827174305915833,
"learning_rate": 8.701202953655006e-05,
"loss": 0.4055968475341797,
"mean_token_accuracy": 0.9189482787251473,
"num_tokens": 10495301.0,
"step": 2450
},
{
"entropy": 0.3638977843523026,
"epoch": 0.7543184732594101,
"grad_norm": 0.247745543718338,
"learning_rate": 8.646818549331366e-05,
"loss": 0.38891139984130857,
"mean_token_accuracy": 0.9226090031862259,
"num_tokens": 10706938.0,
"step": 2500
},
{
"entropy": 0.358336652033031,
"epoch": 0.7694048427245983,
"grad_norm": 0.24292156100273132,
"learning_rate": 8.591496289958292e-05,
"loss": 0.3846548461914063,
"mean_token_accuracy": 0.923456951379776,
"num_tokens": 10918302.0,
"step": 2550
},
{
"entropy": 0.37086400829255584,
"epoch": 0.7844912121897866,
"grad_norm": 0.2979118525981903,
"learning_rate": 8.535250402802536e-05,
"loss": 0.39662261962890627,
"mean_token_accuracy": 0.9212297305464745,
"num_tokens": 11131056.0,
"step": 2600
},
{
"epoch": 0.7844912121897866,
"eval_entropy": 0.4161290250361186,
"eval_loss": 0.43674495816230774,
"eval_mean_token_accuracy": 0.9060781219788583,
"eval_num_tokens": 11131056.0,
"eval_runtime": 37.0488,
"eval_samples_per_second": 58.463,
"eval_steps_per_second": 7.315,
"step": 2600
},
{
"entropy": 0.36887906536459925,
"epoch": 0.7995775816549747,
"grad_norm": 0.25673073530197144,
"learning_rate": 8.478095352660897e-05,
"loss": 0.3948686218261719,
"mean_token_accuracy": 0.9204315200448037,
"num_tokens": 11345648.0,
"step": 2650
},
{
"entropy": 0.36981521353125574,
"epoch": 0.814663951120163,
"grad_norm": 0.2649747133255005,
"learning_rate": 8.4200458381403e-05,
"loss": 0.3937848663330078,
"mean_token_accuracy": 0.9218536545336247,
"num_tokens": 11559009.0,
"step": 2700
},
{
"entropy": 0.37904939975589513,
"epoch": 0.8297503205853511,
"grad_norm": 0.20989011228084564,
"learning_rate": 8.361116787877736e-05,
"loss": 0.4084677505493164,
"mean_token_accuracy": 0.9188165719807148,
"num_tokens": 11776255.0,
"step": 2750
},
{
"entropy": 0.3781035339459777,
"epoch": 0.8448366900505393,
"grad_norm": 0.2979874908924103,
"learning_rate": 8.301323356701069e-05,
"loss": 0.40767410278320315,
"mean_token_accuracy": 0.9183482979238033,
"num_tokens": 11994830.0,
"step": 2800
},
{
"epoch": 0.8448366900505393,
"eval_entropy": 0.3918299580962016,
"eval_loss": 0.43606311082839966,
"eval_mean_token_accuracy": 0.9074829088805786,
"eval_num_tokens": 11994830.0,
"eval_runtime": 37.1502,
"eval_samples_per_second": 58.304,
"eval_steps_per_second": 7.295,
"step": 2800
},
{
"entropy": 0.3669764836877584,
"epoch": 0.8599230595157276,
"grad_norm": 0.3718933165073395,
"learning_rate": 8.240680921731639e-05,
"loss": 0.39511192321777344,
"mean_token_accuracy": 0.9215331043303013,
"num_tokens": 12210990.0,
"step": 2850
},
{
"entropy": 0.36516126081347466,
"epoch": 0.8750094289809157,
"grad_norm": 0.2584734559059143,
"learning_rate": 8.179205078429728e-05,
"loss": 0.3858111572265625,
"mean_token_accuracy": 0.9223315984010696,
"num_tokens": 12425768.0,
"step": 2900
},
{
"entropy": 0.36489626977592704,
"epoch": 0.890095798446104,
"grad_norm": 0.260593980550766,
"learning_rate": 8.116911636583866e-05,
"loss": 0.3904818344116211,
"mean_token_accuracy": 0.921723841279745,
"num_tokens": 12644047.0,
"step": 2950
},
{
"entropy": 0.35986222576349974,
"epoch": 0.9051821679112921,
"grad_norm": 0.2872949540615082,
"learning_rate": 8.053816616245007e-05,
"loss": 0.3802699661254883,
"mean_token_accuracy": 0.922919643521309,
"num_tokens": 12858612.0,
"step": 3000
},
{
"epoch": 0.9051821679112921,
"eval_entropy": 0.39114147694128465,
"eval_loss": 0.43371766805648804,
"eval_mean_token_accuracy": 0.9085190791045608,
"eval_num_tokens": 12858612.0,
"eval_runtime": 37.1981,
"eval_samples_per_second": 58.229,
"eval_steps_per_second": 7.285,
"step": 3000
}
],
"logging_steps": 50,
"max_steps": 9945,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3087949758650778e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}