{ "best_global_step": 3000, "best_metric": 0.43371766805648804, "best_model_checkpoint": "/scratch/gk_checkpoint_lora_v2/checkpoint-3000", "epoch": 0.9051821679112921, "eval_steps": 200, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.0372698324173688, "epoch": 0.015086369465188203, "grad_norm": 0.4944687783718109, "learning_rate": 3.266666666666667e-05, "loss": 2.1209957885742186, "mean_token_accuracy": 0.6639667785167694, "num_tokens": 213967.0, "step": 50 }, { "entropy": 0.4767542722821236, "epoch": 0.030172738930376405, "grad_norm": 0.29953643679618835, "learning_rate": 6.6e-05, "loss": 0.5209395980834961, "mean_token_accuracy": 0.9028549310564995, "num_tokens": 426365.0, "step": 100 }, { "entropy": 0.46151453502476214, "epoch": 0.04525910839556461, "grad_norm": 0.3118787109851837, "learning_rate": 9.933333333333334e-05, "loss": 0.508198356628418, "mean_token_accuracy": 0.9032981966435909, "num_tokens": 642837.0, "step": 150 }, { "entropy": 0.4455497920885682, "epoch": 0.06034547786075281, "grad_norm": 0.3013271391391754, "learning_rate": 9.999382532513122e-05, "loss": 0.48826019287109373, "mean_token_accuracy": 0.907553653717041, "num_tokens": 856475.0, "step": 200 }, { "epoch": 0.06034547786075281, "eval_entropy": 0.4689600637588114, "eval_loss": 0.4816349446773529, "eval_mean_token_accuracy": 0.8987587762934696, "eval_num_tokens": 856475.0, "eval_runtime": 37.5206, "eval_samples_per_second": 57.728, "eval_steps_per_second": 7.223, "step": 200 }, { "entropy": 0.44372758489102127, "epoch": 0.075431847325941, "grad_norm": 0.31373220682144165, "learning_rate": 9.997479627263544e-05, "loss": 0.4843710327148438, "mean_token_accuracy": 0.9082412907481193, "num_tokens": 1069370.0, "step": 250 }, { "entropy": 0.4475720078870654, "epoch": 0.09051821679112922, "grad_norm": 0.27380964159965515, "learning_rate": 9.994291516446573e-05, "loss": 0.491109733581543, "mean_token_accuracy": 0.9055162121355533, "num_tokens": 1286132.0, "step": 300 }, { "entropy": 0.4463552813604474, "epoch": 0.10560458625631741, "grad_norm": 0.2614763677120209, "learning_rate": 9.989819019951048e-05, "loss": 0.4837772369384766, "mean_token_accuracy": 0.9081570096313953, "num_tokens": 1500851.0, "step": 350 }, { "entropy": 0.41013720393180847, "epoch": 0.12069095572150562, "grad_norm": 0.4836612045764923, "learning_rate": 9.984063287972232e-05, "loss": 0.44385364532470706, "mean_token_accuracy": 0.9148843766748905, "num_tokens": 1708807.0, "step": 400 }, { "epoch": 0.12069095572150562, "eval_entropy": 0.4811463643044123, "eval_loss": 0.4747391641139984, "eval_mean_token_accuracy": 0.8997697566268189, "eval_num_tokens": 1708807.0, "eval_runtime": 37.1123, "eval_samples_per_second": 58.363, "eval_steps_per_second": 7.302, "step": 400 }, { "entropy": 0.42200190499424933, "epoch": 0.13577732518669383, "grad_norm": 0.19339531660079956, "learning_rate": 9.977025800716017e-05, "loss": 0.45712459564208985, "mean_token_accuracy": 0.9106362241506577, "num_tokens": 1923184.0, "step": 450 }, { "entropy": 0.43271509755402804, "epoch": 0.150863694651882, "grad_norm": 0.299110472202301, "learning_rate": 9.968708368018253e-05, "loss": 0.4724855422973633, "mean_token_accuracy": 0.9078708891570568, "num_tokens": 2139609.0, "step": 500 }, { "entropy": 0.43638833791017534, "epoch": 0.16595006411707022, "grad_norm": 0.2539425492286682, "learning_rate": 9.959113128879322e-05, "loss": 0.4785395050048828, "mean_token_accuracy": 0.907249256670475, "num_tokens": 2358080.0, "step": 550 }, { "entropy": 0.43115664307028057, "epoch": 0.18103643358225843, "grad_norm": 0.2542003393173218, "learning_rate": 9.948242550914035e-05, "loss": 0.4740608215332031, "mean_token_accuracy": 0.9089943794906139, "num_tokens": 2574667.0, "step": 600 }, { "epoch": 0.18103643358225843, "eval_entropy": 0.45346878287537074, "eval_loss": 0.46328845620155334, "eval_mean_token_accuracy": 0.9025413253210568, "eval_num_tokens": 2574667.0, "eval_runtime": 37.1254, "eval_samples_per_second": 58.343, "eval_steps_per_second": 7.3, "step": 600 }, { "entropy": 0.4420048241317272, "epoch": 0.19612280304744664, "grad_norm": 0.26832085847854614, "learning_rate": 9.936099429717045e-05, "loss": 0.486652717590332, "mean_token_accuracy": 0.9074076810479164, "num_tokens": 2790489.0, "step": 650 }, { "entropy": 0.4269510039314628, "epoch": 0.21120917251263482, "grad_norm": 0.20635050535202026, "learning_rate": 9.922686888143897e-05, "loss": 0.4619187927246094, "mean_token_accuracy": 0.910810690075159, "num_tokens": 3003881.0, "step": 700 }, { "entropy": 0.42016164746135476, "epoch": 0.22629554197782303, "grad_norm": 0.2643264830112457, "learning_rate": 9.908008375507924e-05, "loss": 0.46344844818115233, "mean_token_accuracy": 0.9113752076029777, "num_tokens": 3218446.0, "step": 750 }, { "entropy": 0.3974369211867452, "epoch": 0.24138191144301124, "grad_norm": 0.19392798840999603, "learning_rate": 9.89206766669318e-05, "loss": 0.42646697998046873, "mean_token_accuracy": 0.9165593402087688, "num_tokens": 3428256.0, "step": 800 }, { "epoch": 0.24138191144301124, "eval_entropy": 0.43642714537157784, "eval_loss": 0.46031010150909424, "eval_mean_token_accuracy": 0.9025695645061366, "eval_num_tokens": 3428256.0, "eval_runtime": 37.1078, "eval_samples_per_second": 58.37, "eval_steps_per_second": 7.303, "step": 800 }, { "entropy": 0.4217161551490426, "epoch": 0.25646828090819945, "grad_norm": 0.21452021598815918, "learning_rate": 9.874868861183658e-05, "loss": 0.4612973022460938, "mean_token_accuracy": 0.9114794608950615, "num_tokens": 3642529.0, "step": 850 }, { "entropy": 0.41439461953938006, "epoch": 0.27155465037338766, "grad_norm": 0.25079163908958435, "learning_rate": 9.856416382009006e-05, "loss": 0.4494070053100586, "mean_token_accuracy": 0.9127840812504292, "num_tokens": 3855962.0, "step": 900 }, { "entropy": 0.4270974922552705, "epoch": 0.2866410198385759, "grad_norm": 0.20817860960960388, "learning_rate": 9.836714974607077e-05, "loss": 0.46105358123779294, "mean_token_accuracy": 0.9099104046821594, "num_tokens": 4069157.0, "step": 950 }, { "entropy": 0.400859787017107, "epoch": 0.301727389303764, "grad_norm": 0.20943191647529602, "learning_rate": 9.815769705603521e-05, "loss": 0.4289055633544922, "mean_token_accuracy": 0.9167061321437359, "num_tokens": 4278743.0, "step": 1000 }, { "epoch": 0.301727389303764, "eval_entropy": 0.44312036031946483, "eval_loss": 0.4571220278739929, "eval_mean_token_accuracy": 0.9029428505809545, "eval_num_tokens": 4278743.0, "eval_runtime": 37.0803, "eval_samples_per_second": 58.414, "eval_steps_per_second": 7.308, "step": 1000 }, { "entropy": 0.4052092955261469, "epoch": 0.31681375876895224, "grad_norm": 0.21670734882354736, "learning_rate": 9.793585961508811e-05, "loss": 0.44187084197998044, "mean_token_accuracy": 0.9138142390549183, "num_tokens": 4495120.0, "step": 1050 }, { "entropy": 0.3885428298264742, "epoch": 0.33190012823414045, "grad_norm": 0.2656344771385193, "learning_rate": 9.770169447332977e-05, "loss": 0.42026878356933595, "mean_token_accuracy": 0.9171342994272709, "num_tokens": 4707664.0, "step": 1100 }, { "entropy": 0.41116086438298227, "epoch": 0.34698649769932866, "grad_norm": 0.25747165083885193, "learning_rate": 9.745526185118458e-05, "loss": 0.44418087005615237, "mean_token_accuracy": 0.9133055797219276, "num_tokens": 4921515.0, "step": 1150 }, { "entropy": 0.40833847373723986, "epoch": 0.36207286716451687, "grad_norm": 0.22326916456222534, "learning_rate": 9.719662512391396e-05, "loss": 0.4394912338256836, "mean_token_accuracy": 0.9140990000963211, "num_tokens": 5133998.0, "step": 1200 }, { "epoch": 0.36207286716451687, "eval_entropy": 0.44459747828240764, "eval_loss": 0.4540960192680359, "eval_mean_token_accuracy": 0.9033675605080664, "eval_num_tokens": 5133998.0, "eval_runtime": 37.1014, "eval_samples_per_second": 58.381, "eval_steps_per_second": 7.304, "step": 1200 }, { "entropy": 0.39192866910248997, "epoch": 0.3771592366297051, "grad_norm": 0.20642907917499542, "learning_rate": 9.692585080531822e-05, "loss": 0.42892616271972656, "mean_token_accuracy": 0.9165673214197159, "num_tokens": 5348047.0, "step": 1250 }, { "entropy": 0.40107650008052587, "epoch": 0.3922456060948933, "grad_norm": 0.2515887916088104, "learning_rate": 9.664300853063104e-05, "loss": 0.4329941558837891, "mean_token_accuracy": 0.9152751086652279, "num_tokens": 5562125.0, "step": 1300 }, { "entropy": 0.3964537301659584, "epoch": 0.4073319755600815, "grad_norm": 0.20992055535316467, "learning_rate": 9.63481710386114e-05, "loss": 0.4275414276123047, "mean_token_accuracy": 0.9168652257323265, "num_tokens": 5773685.0, "step": 1350 }, { "entropy": 0.39044841077178716, "epoch": 0.42241834502526965, "grad_norm": 0.30388110876083374, "learning_rate": 9.604141415283728e-05, "loss": 0.42324817657470704, "mean_token_accuracy": 0.9169075645506382, "num_tokens": 5986601.0, "step": 1400 }, { "epoch": 0.42241834502526965, "eval_entropy": 0.4361799991658693, "eval_loss": 0.45172417163848877, "eval_mean_token_accuracy": 0.9033232137725802, "eval_num_tokens": 5986601.0, "eval_runtime": 37.1897, "eval_samples_per_second": 58.242, "eval_steps_per_second": 7.287, "step": 1400 }, { "entropy": 0.4137679870799184, "epoch": 0.43750471449045786, "grad_norm": 0.23755542933940887, "learning_rate": 9.572281676220608e-05, "loss": 0.4478377532958984, "mean_token_accuracy": 0.911891212016344, "num_tokens": 6203048.0, "step": 1450 }, { "entropy": 0.41279098089784383, "epoch": 0.45259108395564607, "grad_norm": 0.21780936419963837, "learning_rate": 9.539246080064659e-05, "loss": 0.45262195587158205, "mean_token_accuracy": 0.9123758906126023, "num_tokens": 6419624.0, "step": 1500 }, { "entropy": 0.41412957072257994, "epoch": 0.4676774534208343, "grad_norm": 0.2430579662322998, "learning_rate": 9.505043122604818e-05, "loss": 0.45246307373046873, "mean_token_accuracy": 0.9122441673278808, "num_tokens": 6633965.0, "step": 1550 }, { "entropy": 0.3798889485746622, "epoch": 0.4827638228860225, "grad_norm": 0.28653672337532043, "learning_rate": 9.469681599841192e-05, "loss": 0.41427810668945314, "mean_token_accuracy": 0.9184439463913441, "num_tokens": 6847358.0, "step": 1600 }, { "epoch": 0.4827638228860225, "eval_entropy": 0.43242653594025826, "eval_loss": 0.4488651752471924, "eval_mean_token_accuracy": 0.9040581949082688, "eval_num_tokens": 6847358.0, "eval_runtime": 37.1595, "eval_samples_per_second": 58.289, "eval_steps_per_second": 7.293, "step": 1600 }, { "entropy": 0.38126184083521364, "epoch": 0.4978501923512107, "grad_norm": 0.2068091183900833, "learning_rate": 9.433170605722996e-05, "loss": 0.40500320434570314, "mean_token_accuracy": 0.9181749866902829, "num_tokens": 7062005.0, "step": 1650 }, { "entropy": 0.38879580337554215, "epoch": 0.5129365618163989, "grad_norm": 0.19908899068832397, "learning_rate": 9.395519529809848e-05, "loss": 0.41957916259765626, "mean_token_accuracy": 0.9183010324835778, "num_tokens": 7272082.0, "step": 1700 }, { "entropy": 0.3867104376107454, "epoch": 0.5280229312815871, "grad_norm": 0.21681655943393707, "learning_rate": 9.356738054857057e-05, "loss": 0.41496986389160156, "mean_token_accuracy": 0.9176018598675728, "num_tokens": 7484751.0, "step": 1750 }, { "entropy": 0.3900348538905382, "epoch": 0.5431093007467753, "grad_norm": 0.23264895379543304, "learning_rate": 9.316836154325494e-05, "loss": 0.4201799774169922, "mean_token_accuracy": 0.9161376728117466, "num_tokens": 7699385.0, "step": 1800 }, { "epoch": 0.5431093007467753, "eval_entropy": 0.4371595958941977, "eval_loss": 0.44555598497390747, "eval_mean_token_accuracy": 0.9047388078101887, "eval_num_tokens": 7699385.0, "eval_runtime": 37.2391, "eval_samples_per_second": 58.165, "eval_steps_per_second": 7.277, "step": 1800 }, { "entropy": 0.38737339399755, "epoch": 0.5581956702119635, "grad_norm": 0.21236486732959747, "learning_rate": 9.275824089816716e-05, "loss": 0.4186508941650391, "mean_token_accuracy": 0.9184837466478348, "num_tokens": 7912846.0, "step": 1850 }, { "entropy": 0.38791515786200764, "epoch": 0.5732820396771517, "grad_norm": 0.22874821722507477, "learning_rate": 9.233712408433972e-05, "loss": 0.42144878387451173, "mean_token_accuracy": 0.9170675221085548, "num_tokens": 8126645.0, "step": 1900 }, { "entropy": 0.3831383780390024, "epoch": 0.5883684091423399, "grad_norm": 0.3072109818458557, "learning_rate": 9.190511940069813e-05, "loss": 0.407428092956543, "mean_token_accuracy": 0.9184182004630566, "num_tokens": 8341447.0, "step": 1950 }, { "entropy": 0.38130090072751044, "epoch": 0.603454778607528, "grad_norm": 0.2783527374267578, "learning_rate": 9.146233794620944e-05, "loss": 0.41518512725830076, "mean_token_accuracy": 0.9192077203094959, "num_tokens": 8553915.0, "step": 2000 }, { "epoch": 0.603454778607528, "eval_entropy": 0.41359548831557874, "eval_loss": 0.4426310062408447, "eval_mean_token_accuracy": 0.9062746316744392, "eval_num_tokens": 8553915.0, "eval_runtime": 37.0772, "eval_samples_per_second": 58.419, "eval_steps_per_second": 7.309, "step": 2000 }, { "entropy": 0.40945424281060694, "epoch": 0.6185411480727163, "grad_norm": 0.2751815915107727, "learning_rate": 9.100889359131093e-05, "loss": 0.44279281616210936, "mean_token_accuracy": 0.9126340833306312, "num_tokens": 8773030.0, "step": 2050 }, { "entropy": 0.37910934548825026, "epoch": 0.6336275175379045, "grad_norm": 0.24518635869026184, "learning_rate": 9.054490294862594e-05, "loss": 0.41019065856933595, "mean_token_accuracy": 0.9180504800379277, "num_tokens": 8987621.0, "step": 2100 }, { "entropy": 0.38141273133456705, "epoch": 0.6487138870030927, "grad_norm": 0.2637041211128235, "learning_rate": 9.00704853429745e-05, "loss": 0.41344562530517576, "mean_token_accuracy": 0.9188940741121769, "num_tokens": 9202085.0, "step": 2150 }, { "entropy": 0.37150444712489844, "epoch": 0.6638002564682809, "grad_norm": 0.23061881959438324, "learning_rate": 8.958576278068655e-05, "loss": 0.4002714157104492, "mean_token_accuracy": 0.9211013509333134, "num_tokens": 9414528.0, "step": 2200 }, { "epoch": 0.6638002564682809, "eval_entropy": 0.4198416776652706, "eval_loss": 0.44056928157806396, "eval_mean_token_accuracy": 0.9060557997094749, "eval_num_tokens": 9414528.0, "eval_runtime": 37.0429, "eval_samples_per_second": 58.473, "eval_steps_per_second": 7.316, "step": 2200 }, { "entropy": 0.3824780482426286, "epoch": 0.6788866259334692, "grad_norm": 0.29528528451919556, "learning_rate": 8.909085991822532e-05, "loss": 0.4100413513183594, "mean_token_accuracy": 0.9181285245716572, "num_tokens": 9631160.0, "step": 2250 }, { "entropy": 0.37036353170871733, "epoch": 0.6939729953986573, "grad_norm": 0.2833597958087921, "learning_rate": 8.858590403012954e-05, "loss": 0.39582439422607424, "mean_token_accuracy": 0.9203065976500511, "num_tokens": 9844323.0, "step": 2300 }, { "entropy": 0.377471005320549, "epoch": 0.7090593648638455, "grad_norm": 0.2559050917625427, "learning_rate": 8.807102497628199e-05, "loss": 0.4039160919189453, "mean_token_accuracy": 0.9185835334658623, "num_tokens": 10060066.0, "step": 2350 }, { "entropy": 0.38689912386238573, "epoch": 0.7241457343290337, "grad_norm": 0.3571145236492157, "learning_rate": 8.754635516851342e-05, "loss": 0.41998291015625, "mean_token_accuracy": 0.9171991994976998, "num_tokens": 10275374.0, "step": 2400 }, { "epoch": 0.7241457343290337, "eval_entropy": 0.4061841280148038, "eval_loss": 0.4392658472061157, "eval_mean_token_accuracy": 0.9060493254573583, "eval_num_tokens": 10275374.0, "eval_runtime": 37.185, "eval_samples_per_second": 58.249, "eval_steps_per_second": 7.288, "step": 2400 }, { "entropy": 0.3773344187065959, "epoch": 0.7392321037942219, "grad_norm": 0.23827174305915833, "learning_rate": 8.701202953655006e-05, "loss": 0.4055968475341797, "mean_token_accuracy": 0.9189482787251473, "num_tokens": 10495301.0, "step": 2450 }, { "entropy": 0.3638977843523026, "epoch": 0.7543184732594101, "grad_norm": 0.247745543718338, "learning_rate": 8.646818549331366e-05, "loss": 0.38891139984130857, "mean_token_accuracy": 0.9226090031862259, "num_tokens": 10706938.0, "step": 2500 }, { "entropy": 0.358336652033031, "epoch": 0.7694048427245983, "grad_norm": 0.24292156100273132, "learning_rate": 8.591496289958292e-05, "loss": 0.3846548461914063, "mean_token_accuracy": 0.923456951379776, "num_tokens": 10918302.0, "step": 2550 }, { "entropy": 0.37086400829255584, "epoch": 0.7844912121897866, "grad_norm": 0.2979118525981903, "learning_rate": 8.535250402802536e-05, "loss": 0.39662261962890627, "mean_token_accuracy": 0.9212297305464745, "num_tokens": 11131056.0, "step": 2600 }, { "epoch": 0.7844912121897866, "eval_entropy": 0.4161290250361186, "eval_loss": 0.43674495816230774, "eval_mean_token_accuracy": 0.9060781219788583, "eval_num_tokens": 11131056.0, "eval_runtime": 37.0488, "eval_samples_per_second": 58.463, "eval_steps_per_second": 7.315, "step": 2600 }, { "entropy": 0.36887906536459925, "epoch": 0.7995775816549747, "grad_norm": 0.25673073530197144, "learning_rate": 8.478095352660897e-05, "loss": 0.3948686218261719, "mean_token_accuracy": 0.9204315200448037, "num_tokens": 11345648.0, "step": 2650 }, { "entropy": 0.36981521353125574, "epoch": 0.814663951120163, "grad_norm": 0.2649747133255005, "learning_rate": 8.4200458381403e-05, "loss": 0.3937848663330078, "mean_token_accuracy": 0.9218536545336247, "num_tokens": 11559009.0, "step": 2700 }, { "entropy": 0.37904939975589513, "epoch": 0.8297503205853511, "grad_norm": 0.20989011228084564, "learning_rate": 8.361116787877736e-05, "loss": 0.4084677505493164, "mean_token_accuracy": 0.9188165719807148, "num_tokens": 11776255.0, "step": 2750 }, { "entropy": 0.3781035339459777, "epoch": 0.8448366900505393, "grad_norm": 0.2979874908924103, "learning_rate": 8.301323356701069e-05, "loss": 0.40767410278320315, "mean_token_accuracy": 0.9183482979238033, "num_tokens": 11994830.0, "step": 2800 }, { "epoch": 0.8448366900505393, "eval_entropy": 0.3918299580962016, "eval_loss": 0.43606311082839966, "eval_mean_token_accuracy": 0.9074829088805786, "eval_num_tokens": 11994830.0, "eval_runtime": 37.1502, "eval_samples_per_second": 58.304, "eval_steps_per_second": 7.295, "step": 2800 }, { "entropy": 0.3669764836877584, "epoch": 0.8599230595157276, "grad_norm": 0.3718933165073395, "learning_rate": 8.240680921731639e-05, "loss": 0.39511192321777344, "mean_token_accuracy": 0.9215331043303013, "num_tokens": 12210990.0, "step": 2850 }, { "entropy": 0.36516126081347466, "epoch": 0.8750094289809157, "grad_norm": 0.2584734559059143, "learning_rate": 8.179205078429728e-05, "loss": 0.3858111572265625, "mean_token_accuracy": 0.9223315984010696, "num_tokens": 12425768.0, "step": 2900 }, { "entropy": 0.36489626977592704, "epoch": 0.890095798446104, "grad_norm": 0.260593980550766, "learning_rate": 8.116911636583866e-05, "loss": 0.3904818344116211, "mean_token_accuracy": 0.921723841279745, "num_tokens": 12644047.0, "step": 2950 }, { "entropy": 0.35986222576349974, "epoch": 0.9051821679112921, "grad_norm": 0.2872949540615082, "learning_rate": 8.053816616245007e-05, "loss": 0.3802699661254883, "mean_token_accuracy": 0.922919643521309, "num_tokens": 12858612.0, "step": 3000 }, { "epoch": 0.9051821679112921, "eval_entropy": 0.39114147694128465, "eval_loss": 0.43371766805648804, "eval_mean_token_accuracy": 0.9085190791045608, "eval_num_tokens": 12858612.0, "eval_runtime": 37.1981, "eval_samples_per_second": 58.229, "eval_steps_per_second": 7.285, "step": 3000 } ], "logging_steps": 50, "max_steps": 9945, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3087949758650778e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }