{ "best_global_step": 3000, "best_metric": 0.5210983753204346, "best_model_checkpoint": "/scratch/gk_checkpoint_lora/checkpoint-3000", "epoch": 1.959792477302205, "eval_steps": 200, "global_step": 3400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8664254155755043, "epoch": 0.02882259691598213, "grad_norm": 2.775977611541748, "learning_rate": 9.800000000000001e-06, "loss": 3.2154922485351562, "mean_token_accuracy": 0.5048915630578995, "num_tokens": 270598.0, "step": 50 }, { "entropy": 1.210615372657776, "epoch": 0.05764519383196426, "grad_norm": 1.1605831384658813, "learning_rate": 1.98e-05, "loss": 2.068811798095703, "mean_token_accuracy": 0.5692685562372207, "num_tokens": 543717.0, "step": 100 }, { "entropy": 0.7421895080804825, "epoch": 0.08646779074794639, "grad_norm": 0.25959399342536926, "learning_rate": 1.9989568984484556e-05, "loss": 0.8849951171875, "mean_token_accuracy": 0.8145365649461747, "num_tokens": 813682.0, "step": 150 }, { "entropy": 0.6125312650203705, "epoch": 0.11529038766392852, "grad_norm": 0.24997037649154663, "learning_rate": 1.9957442896851584e-05, "loss": 0.691104507446289, "mean_token_accuracy": 0.862597424685955, "num_tokens": 1085263.0, "step": 200 }, { "epoch": 0.11529038766392852, "eval_entropy": 0.5956721862680033, "eval_loss": 0.6279548406600952, "eval_mean_token_accuracy": 0.8725619774115713, "eval_num_tokens": 1085263.0, "eval_runtime": 25.3455, "eval_samples_per_second": 59.695, "eval_steps_per_second": 7.496, "step": 200 }, { "entropy": 0.6464120636880398, "epoch": 0.14411298457991065, "grad_norm": 0.2776671051979065, "learning_rate": 1.9903687176430222e-05, "loss": 0.6937020874023437, "mean_token_accuracy": 0.8613211107254028, "num_tokens": 1359193.0, "step": 250 }, { "entropy": 0.6303374738991261, "epoch": 0.17293558149589278, "grad_norm": 0.23006124794483185, "learning_rate": 1.9828418591803025e-05, "loss": 0.674161376953125, "mean_token_accuracy": 0.865567267537117, "num_tokens": 1633225.0, "step": 300 }, { "entropy": 0.6267825645208359, "epoch": 0.2017581784118749, "grad_norm": 0.264813095331192, "learning_rate": 1.973180064195894e-05, "loss": 0.6597396850585937, "mean_token_accuracy": 0.8673095554113388, "num_tokens": 1905535.0, "step": 350 }, { "entropy": 0.6219879929721356, "epoch": 0.23058077532785703, "grad_norm": 0.3037996292114258, "learning_rate": 1.9614043201139513e-05, "loss": 0.6504788208007812, "mean_token_accuracy": 0.868158842921257, "num_tokens": 2177042.0, "step": 400 }, { "epoch": 0.23058077532785703, "eval_entropy": 0.5888996227791435, "eval_loss": 0.5949175357818604, "eval_mean_token_accuracy": 0.8765835507919914, "eval_num_tokens": 2177042.0, "eval_runtime": 25.3953, "eval_samples_per_second": 59.578, "eval_steps_per_second": 7.482, "step": 400 }, { "entropy": 0.6223398548364639, "epoch": 0.2594033722438392, "grad_norm": 0.27440914511680603, "learning_rate": 1.9475402062948533e-05, "loss": 0.6479128265380859, "mean_token_accuracy": 0.8684594085812569, "num_tokens": 2452073.0, "step": 450 }, { "entropy": 0.610146201401949, "epoch": 0.2882259691598213, "grad_norm": 0.3176165521144867, "learning_rate": 1.9316178384715195e-05, "loss": 0.6257384109497071, "mean_token_accuracy": 0.8711106261610985, "num_tokens": 2724641.0, "step": 500 }, { "entropy": 0.5934953857213259, "epoch": 0.31704856607580345, "grad_norm": 0.2879047095775604, "learning_rate": 1.9136718033317887e-05, "loss": 0.6003322601318359, "mean_token_accuracy": 0.8751304519176483, "num_tokens": 2994313.0, "step": 550 }, { "entropy": 0.5552539291977883, "epoch": 0.34587116299178555, "grad_norm": 0.31590884923934937, "learning_rate": 1.8937410833889517e-05, "loss": 0.5751915740966796, "mean_token_accuracy": 0.8799619308114052, "num_tokens": 3262166.0, "step": 600 }, { "epoch": 0.34587116299178555, "eval_entropy": 0.5373091018513629, "eval_loss": 0.5472979545593262, "eval_mean_token_accuracy": 0.882329721827256, "eval_num_tokens": 3262166.0, "eval_runtime": 25.4147, "eval_samples_per_second": 59.533, "eval_steps_per_second": 7.476, "step": 600 }, { "entropy": 0.6037677505612373, "epoch": 0.3746937599077677, "grad_norm": 0.24785549938678741, "learning_rate": 1.871868972303645e-05, "loss": 0.6383477020263671, "mean_token_accuracy": 0.868063251376152, "num_tokens": 3540715.0, "step": 650 }, { "entropy": 0.5363423094898462, "epoch": 0.4035163568237498, "grad_norm": 0.2744317650794983, "learning_rate": 1.848102980841029e-05, "loss": 0.5591300964355469, "mean_token_accuracy": 0.8841247257590293, "num_tokens": 3809747.0, "step": 700 }, { "entropy": 0.5646945191174746, "epoch": 0.43233895373973197, "grad_norm": 0.22471845149993896, "learning_rate": 1.8224947336675485e-05, "loss": 0.5917744064331054, "mean_token_accuracy": 0.8773607212305069, "num_tokens": 4084262.0, "step": 750 }, { "entropy": 0.5565017917752266, "epoch": 0.46116155065571407, "grad_norm": 0.24923403561115265, "learning_rate": 1.79509985721144e-05, "loss": 0.581727409362793, "mean_token_accuracy": 0.8778996297717094, "num_tokens": 4358564.0, "step": 800 }, { "epoch": 0.46116155065571407, "eval_entropy": 0.5298120318274749, "eval_loss": 0.5324572324752808, "eval_mean_token_accuracy": 0.8855449488288478, "eval_num_tokens": 4358564.0, "eval_runtime": 25.3391, "eval_samples_per_second": 59.71, "eval_steps_per_second": 7.498, "step": 800 }, { "entropy": 0.5551759076118469, "epoch": 0.4899841475716962, "grad_norm": 0.2890514135360718, "learning_rate": 1.765977858830583e-05, "loss": 0.5757025909423829, "mean_token_accuracy": 0.8784776413440705, "num_tokens": 4631480.0, "step": 850 }, { "entropy": 0.5454864390939475, "epoch": 0.5188067444876784, "grad_norm": 0.23076863586902618, "learning_rate": 1.735191997550167e-05, "loss": 0.5680808258056641, "mean_token_accuracy": 0.88047192633152, "num_tokens": 4904344.0, "step": 900 }, { "entropy": 0.5318841424584388, "epoch": 0.5476293414036605, "grad_norm": 0.23075300455093384, "learning_rate": 1.7028091466509602e-05, "loss": 0.555275650024414, "mean_token_accuracy": 0.884403744339943, "num_tokens": 5172046.0, "step": 950 }, { "entropy": 0.5584682691097259, "epoch": 0.5764519383196426, "grad_norm": 0.22157496213912964, "learning_rate": 1.668899648406662e-05, "loss": 0.5754995346069336, "mean_token_accuracy": 0.877709536254406, "num_tokens": 5445539.0, "step": 1000 }, { "epoch": 0.5764519383196426, "eval_entropy": 0.5218244160476484, "eval_loss": 0.530579686164856, "eval_mean_token_accuracy": 0.8847430721709603, "eval_num_tokens": 5445539.0, "eval_runtime": 25.2955, "eval_samples_per_second": 59.813, "eval_steps_per_second": 7.511, "step": 1000 }, { "entropy": 0.5296673697978258, "epoch": 0.6052745352356247, "grad_norm": 0.2421996146440506, "learning_rate": 1.6335371612858827e-05, "loss": 0.552278709411621, "mean_token_accuracy": 0.884430148601532, "num_tokens": 5713428.0, "step": 1050 }, { "entropy": 0.5427714378386735, "epoch": 0.6340971321516069, "grad_norm": 0.22103162109851837, "learning_rate": 1.5967984999506623e-05, "loss": 0.5644734954833984, "mean_token_accuracy": 0.8810071355104446, "num_tokens": 5985117.0, "step": 1100 }, { "entropy": 0.5319583508372306, "epoch": 0.662919729067589, "grad_norm": 0.25406044721603394, "learning_rate": 1.558763468399081e-05, "loss": 0.551100959777832, "mean_token_accuracy": 0.8851462480425835, "num_tokens": 6254015.0, "step": 1150 }, { "entropy": 0.5286855664849281, "epoch": 0.6917423259835711, "grad_norm": 0.24316425621509552, "learning_rate": 1.5195146866144093e-05, "loss": 0.5527534484863281, "mean_token_accuracy": 0.8843164274096489, "num_tokens": 6524236.0, "step": 1200 }, { "epoch": 0.6917423259835711, "eval_entropy": 0.5235515671341042, "eval_loss": 0.5270145535469055, "eval_mean_token_accuracy": 0.8862221178255583, "eval_num_tokens": 6524236.0, "eval_runtime": 25.2946, "eval_samples_per_second": 59.815, "eval_steps_per_second": 7.511, "step": 1200 }, { "entropy": 0.5438067949563264, "epoch": 0.7205649228995532, "grad_norm": 0.2850089967250824, "learning_rate": 1.4791374110973555e-05, "loss": 0.5662718963623047, "mean_token_accuracy": 0.8811582899093628, "num_tokens": 6798500.0, "step": 1250 }, { "entropy": 0.5394456747919321, "epoch": 0.7493875198155354, "grad_norm": 0.2898014783859253, "learning_rate": 1.4377193496712517e-05, "loss": 0.5550812149047851, "mean_token_accuracy": 0.8827075427770614, "num_tokens": 7068572.0, "step": 1300 }, { "entropy": 0.5455511239916087, "epoch": 0.7782101167315175, "grad_norm": 0.22777394950389862, "learning_rate": 1.395350470962454e-05, "loss": 0.5698577117919922, "mean_token_accuracy": 0.8805486962199212, "num_tokens": 7338828.0, "step": 1350 }, { "entropy": 0.5357480451464653, "epoch": 0.8070327136474996, "grad_norm": 0.22745366394519806, "learning_rate": 1.3521228089698138e-05, "loss": 0.555338134765625, "mean_token_accuracy": 0.8834168764948845, "num_tokens": 7608634.0, "step": 1400 }, { "epoch": 0.8070327136474996, "eval_entropy": 0.5221125764282126, "eval_loss": 0.5261030197143555, "eval_mean_token_accuracy": 0.8857603835432153, "eval_num_tokens": 7608634.0, "eval_runtime": 25.3733, "eval_samples_per_second": 59.63, "eval_steps_per_second": 7.488, "step": 1400 }, { "entropy": 0.5366402574628591, "epoch": 0.8358553105634817, "grad_norm": 0.3329956531524658, "learning_rate": 1.3081302631477272e-05, "loss": 0.5528204345703125, "mean_token_accuracy": 0.883393512070179, "num_tokens": 7876795.0, "step": 1450 }, { "entropy": 0.5442750995606184, "epoch": 0.8646779074794639, "grad_norm": 0.22092807292938232, "learning_rate": 1.263468394437032e-05, "loss": 0.5658491897583008, "mean_token_accuracy": 0.8810335186123848, "num_tokens": 8149031.0, "step": 1500 }, { "entropy": 0.5575385902076959, "epoch": 0.893500504395446, "grad_norm": 0.22535692155361176, "learning_rate": 1.218234217686808e-05, "loss": 0.577353744506836, "mean_token_accuracy": 0.878656555712223, "num_tokens": 8420950.0, "step": 1550 }, { "entropy": 0.5398243299871683, "epoch": 0.9223231013114281, "grad_norm": 0.2995174527168274, "learning_rate": 1.1725259909179875e-05, "loss": 0.5575567626953125, "mean_token_accuracy": 0.882205625474453, "num_tokens": 8690831.0, "step": 1600 }, { "epoch": 0.9223231013114281, "eval_entropy": 0.5169236369823155, "eval_loss": 0.5248522162437439, "eval_mean_token_accuracy": 0.8859441503098137, "eval_num_tokens": 8690831.0, "eval_runtime": 25.3236, "eval_samples_per_second": 59.747, "eval_steps_per_second": 7.503, "step": 1600 }, { "entropy": 0.5338192899525166, "epoch": 0.9511456982274102, "grad_norm": 0.3139007091522217, "learning_rate": 1.1264430018865391e-05, "loss": 0.5552957916259765, "mean_token_accuracy": 0.8834528475999832, "num_tokens": 8961415.0, "step": 1650 }, { "entropy": 0.5275231996178626, "epoch": 0.9799682951433925, "grad_norm": 0.2292962670326233, "learning_rate": 1.0800853524098543e-05, "loss": 0.5416835021972656, "mean_token_accuracy": 0.8851530715823174, "num_tokens": 9229637.0, "step": 1700 }, { "entropy": 0.5638248626161461, "epoch": 1.0086467790747946, "grad_norm": 0.23777392506599426, "learning_rate": 1.0335537409248204e-05, "loss": 0.5851130676269531, "mean_token_accuracy": 0.8776284435286594, "num_tokens": 9499446.0, "step": 1750 }, { "entropy": 0.5698649657517671, "epoch": 1.0374693759907767, "grad_norm": 0.2397402822971344, "learning_rate": 9.869492437499167e-06, "loss": 0.5934230804443359, "mean_token_accuracy": 0.8746954175829887, "num_tokens": 9775732.0, "step": 1800 }, { "epoch": 1.0374693759907767, "eval_entropy": 0.5189189992452923, "eval_loss": 0.5238012671470642, "eval_mean_token_accuracy": 0.8859998445761831, "eval_num_tokens": 9775732.0, "eval_runtime": 25.2908, "eval_samples_per_second": 59.824, "eval_steps_per_second": 7.513, "step": 1800 }, { "entropy": 0.5613676090538502, "epoch": 1.0662919729067588, "grad_norm": 0.2651011645793915, "learning_rate": 9.403730955264677e-06, "loss": 0.5792824935913086, "mean_token_accuracy": 0.8779774031043053, "num_tokens": 10051416.0, "step": 1850 }, { "entropy": 0.564930793941021, "epoch": 1.0951145698227411, "grad_norm": 0.2934422194957733, "learning_rate": 8.939264693159926e-06, "loss": 0.5857321166992188, "mean_token_accuracy": 0.8772886765003204, "num_tokens": 10325112.0, "step": 1900 }, { "entropy": 0.5554238288849592, "epoch": 1.1239371667387232, "grad_norm": 0.2599495053291321, "learning_rate": 8.477102568313138e-06, "loss": 0.5767181015014649, "mean_token_accuracy": 0.8785749426484109, "num_tokens": 10599513.0, "step": 1950 }, { "entropy": 0.5364625386148691, "epoch": 1.1527597636547053, "grad_norm": 0.2258245348930359, "learning_rate": 8.01824849278814e-06, "loss": 0.555483169555664, "mean_token_accuracy": 0.883213449716568, "num_tokens": 10869586.0, "step": 2000 }, { "epoch": 1.1527597636547053, "eval_entropy": 0.519798114582112, "eval_loss": 0.5227712988853455, "eval_mean_token_accuracy": 0.8866353715720929, "eval_num_tokens": 10869586.0, "eval_runtime": 25.2263, "eval_samples_per_second": 59.977, "eval_steps_per_second": 7.532, "step": 2000 }, { "entropy": 0.5413903272151948, "epoch": 1.1815823605706874, "grad_norm": 0.2601049542427063, "learning_rate": 7.5636991928790226e-06, "loss": 0.5650748443603516, "mean_token_accuracy": 0.8822926163673401, "num_tokens": 11142115.0, "step": 2050 }, { "entropy": 0.5441782039403915, "epoch": 1.2104049574866695, "grad_norm": 0.23305267095565796, "learning_rate": 7.1144420440136945e-06, "loss": 0.5608541870117187, "mean_token_accuracy": 0.8806314519047738, "num_tokens": 11414459.0, "step": 2100 }, { "entropy": 0.5373694878071547, "epoch": 1.2392275544026516, "grad_norm": 0.2710422873497009, "learning_rate": 6.671452925969549e-06, "loss": 0.5601076126098633, "mean_token_accuracy": 0.8817597103118896, "num_tokens": 11686439.0, "step": 2150 }, { "entropy": 0.5512064357846975, "epoch": 1.2680501513186337, "grad_norm": 0.24875891208648682, "learning_rate": 6.2356941030600036e-06, "loss": 0.572663803100586, "mean_token_accuracy": 0.8795112228393555, "num_tokens": 11959949.0, "step": 2200 }, { "epoch": 1.2680501513186337, "eval_entropy": 0.5157758243774113, "eval_loss": 0.5222153663635254, "eval_mean_token_accuracy": 0.8866087377071381, "eval_num_tokens": 11959949.0, "eval_runtime": 25.3435, "eval_samples_per_second": 59.7, "eval_steps_per_second": 7.497, "step": 2200 }, { "entropy": 0.5333986005187035, "epoch": 1.296872748234616, "grad_norm": 0.2774944603443146, "learning_rate": 5.808112133896682e-06, "loss": 0.5507477569580078, "mean_token_accuracy": 0.8833928933739662, "num_tokens": 12230467.0, "step": 2250 }, { "entropy": 0.5385575620830059, "epoch": 1.325695345150598, "grad_norm": 0.24219754338264465, "learning_rate": 5.38963581526766e-06, "loss": 0.5618357849121094, "mean_token_accuracy": 0.8824037438631058, "num_tokens": 12500631.0, "step": 2300 }, { "entropy": 0.5562852944433689, "epoch": 1.3545179420665803, "grad_norm": 0.23605461418628693, "learning_rate": 4.981174164598023e-06, "loss": 0.5740032577514649, "mean_token_accuracy": 0.8787400788068771, "num_tokens": 12773793.0, "step": 2350 }, { "entropy": 0.5452473207563162, "epoch": 1.3833405389825624, "grad_norm": 0.23601187765598297, "learning_rate": 4.5836144453753595e-06, "loss": 0.5657626724243164, "mean_token_accuracy": 0.8807224997878075, "num_tokens": 13046257.0, "step": 2400 }, { "epoch": 1.3833405389825624, "eval_entropy": 0.5180542894884159, "eval_loss": 0.5217919945716858, "eval_mean_token_accuracy": 0.8867926757586629, "eval_num_tokens": 13046257.0, "eval_runtime": 25.2941, "eval_samples_per_second": 59.816, "eval_steps_per_second": 7.512, "step": 2400 }, { "entropy": 0.5406669420003891, "epoch": 1.4121631358985445, "grad_norm": 0.27414751052856445, "learning_rate": 4.197820239829295e-06, "loss": 0.5581526184082031, "mean_token_accuracy": 0.8821376091241837, "num_tokens": 13318117.0, "step": 2450 }, { "entropy": 0.5123971965163946, "epoch": 1.4409857328145266, "grad_norm": 0.27002713084220886, "learning_rate": 3.8246295730516455e-06, "loss": 0.5292396545410156, "mean_token_accuracy": 0.8868949916958809, "num_tokens": 13585112.0, "step": 2500 }, { "entropy": 0.5250877778977155, "epoch": 1.4698083297305087, "grad_norm": 0.2513103187084198, "learning_rate": 3.4648530926319634e-06, "loss": 0.5443946075439453, "mean_token_accuracy": 0.8851349979639054, "num_tokens": 13854892.0, "step": 2550 }, { "entropy": 0.5314137779176236, "epoch": 1.4986309266464908, "grad_norm": 0.21981389820575714, "learning_rate": 3.1192723077627163e-06, "loss": 0.5500518798828125, "mean_token_accuracy": 0.8838223886489868, "num_tokens": 14126513.0, "step": 2600 }, { "epoch": 1.4986309266464908, "eval_entropy": 0.5145505380473638, "eval_loss": 0.5215311050415039, "eval_mean_token_accuracy": 0.8864174689117231, "eval_num_tokens": 14126513.0, "eval_runtime": 25.2961, "eval_samples_per_second": 59.812, "eval_steps_per_second": 7.511, "step": 2600 }, { "entropy": 0.5299251443892717, "epoch": 1.5274535235624729, "grad_norm": 0.2680607736110687, "learning_rate": 2.78863789163911e-06, "loss": 0.5511317825317383, "mean_token_accuracy": 0.8838860777020454, "num_tokens": 14396535.0, "step": 2650 }, { "entropy": 0.5394382092356682, "epoch": 1.5562761204784552, "grad_norm": 0.25587576627731323, "learning_rate": 2.4736680508410902e-06, "loss": 0.5560498809814454, "mean_token_accuracy": 0.8820542943477631, "num_tokens": 14666012.0, "step": 2700 }, { "entropy": 0.5438171474635601, "epoch": 1.585098717394437, "grad_norm": 0.22950419783592224, "learning_rate": 2.1750469652395777e-06, "loss": 0.5646057891845703, "mean_token_accuracy": 0.8806953200697899, "num_tokens": 14938072.0, "step": 2750 }, { "entropy": 0.5157144083827734, "epoch": 1.6139213143104194, "grad_norm": 0.23236404359340668, "learning_rate": 1.8934233018157822e-06, "loss": 0.5314432525634766, "mean_token_accuracy": 0.8868013408780098, "num_tokens": 15206239.0, "step": 2800 }, { "epoch": 1.6139213143104194, "eval_entropy": 0.514612096783362, "eval_loss": 0.5212787985801697, "eval_mean_token_accuracy": 0.8865814996393103, "eval_num_tokens": 15206239.0, "eval_runtime": 25.284, "eval_samples_per_second": 59.84, "eval_steps_per_second": 7.515, "step": 2800 }, { "entropy": 0.5607952538132668, "epoch": 1.6427439112264015, "grad_norm": 0.27139943838119507, "learning_rate": 1.6294088056218705e-06, "loss": 0.5802758026123047, "mean_token_accuracy": 0.8771729645133018, "num_tokens": 15479383.0, "step": 2850 }, { "entropy": 0.5458529234677553, "epoch": 1.6715665081423836, "grad_norm": 0.22497807443141937, "learning_rate": 1.3835769709437307e-06, "loss": 0.5699198913574218, "mean_token_accuracy": 0.8804216027259827, "num_tokens": 15754571.0, "step": 2900 }, { "entropy": 0.5275375150889158, "epoch": 1.7003891050583657, "grad_norm": 0.25096791982650757, "learning_rate": 1.1564617955523716e-06, "loss": 0.5493584060668946, "mean_token_accuracy": 0.8840730246901513, "num_tokens": 16024591.0, "step": 2950 }, { "entropy": 0.529434435814619, "epoch": 1.7292117019743478, "grad_norm": 0.20803657174110413, "learning_rate": 9.485566207498986e-07, "loss": 0.552624740600586, "mean_token_accuracy": 0.8842183569073677, "num_tokens": 16291960.0, "step": 3000 }, { "epoch": 1.7292117019743478, "eval_entropy": 0.5173903987595909, "eval_loss": 0.5210983753204346, "eval_mean_token_accuracy": 0.8867466380721645, "eval_num_tokens": 16291960.0, "eval_runtime": 25.2973, "eval_samples_per_second": 59.809, "eval_steps_per_second": 7.511, "step": 3000 }, { "entropy": 0.5379937102645636, "epoch": 1.7580342988903301, "grad_norm": 0.279082715511322, "learning_rate": 7.603130597298147e-07, "loss": 0.5518331146240234, "mean_token_accuracy": 0.8835244515538215, "num_tokens": 16562192.0, "step": 3050 }, { "entropy": 0.5540396096557378, "epoch": 1.786856895806312, "grad_norm": 0.23533675074577332, "learning_rate": 5.921400165794255e-07, "loss": 0.5790340423583984, "mean_token_accuracy": 0.8788801202178002, "num_tokens": 16834646.0, "step": 3100 }, { "entropy": 0.5152763035148382, "epoch": 1.8156794927222943, "grad_norm": 0.2803124487400055, "learning_rate": 4.444027980552901e-07, "loss": 0.5328571701049805, "mean_token_accuracy": 0.8869862693548203, "num_tokens": 17104758.0, "step": 3150 }, { "entropy": 0.5612848294526338, "epoch": 1.8445020896382764, "grad_norm": 0.26171237230300903, "learning_rate": 3.1742232006111374e-07, "loss": 0.5824931716918945, "mean_token_accuracy": 0.8772889456152916, "num_tokens": 17379954.0, "step": 3200 }, { "epoch": 1.8445020896382764, "eval_entropy": 0.516960418381189, "eval_loss": 0.5211681723594666, "eval_mean_token_accuracy": 0.8866549037004772, "eval_num_tokens": 17379954.0, "eval_runtime": 25.3046, "eval_samples_per_second": 59.791, "eval_steps_per_second": 7.509, "step": 3200 }, { "entropy": 0.5196366369724273, "epoch": 1.8733246865542585, "grad_norm": 0.6400351524353027, "learning_rate": 2.1147441055180074e-07, "loss": 0.5364640808105469, "mean_token_accuracy": 0.8855624732375145, "num_tokens": 17647943.0, "step": 3250 }, { "entropy": 0.5125804611295461, "epoch": 1.9021472834702406, "grad_norm": 0.24278956651687622, "learning_rate": 1.2678921037788118e-07, "loss": 0.5320493316650391, "mean_token_accuracy": 0.8888507178425789, "num_tokens": 17914413.0, "step": 3300 }, { "entropy": 0.5364588350057602, "epoch": 1.9309698803862227, "grad_norm": 0.2414465695619583, "learning_rate": 6.355067337181497e-08, "loss": 0.5448334121704101, "mean_token_accuracy": 0.8835894984006881, "num_tokens": 18184605.0, "step": 3350 }, { "entropy": 0.5623560689389706, "epoch": 1.959792477302205, "grad_norm": 0.31695693731307983, "learning_rate": 2.189616676208428e-08, "loss": 0.5812443923950196, "mean_token_accuracy": 0.877460196018219, "num_tokens": 18459900.0, "step": 3400 }, { "epoch": 1.959792477302205, "eval_entropy": 0.5174976312016186, "eval_loss": 0.5211665630340576, "eval_mean_token_accuracy": 0.8866490850323125, "eval_num_tokens": 18459900.0, "eval_runtime": 25.3065, "eval_samples_per_second": 59.787, "eval_steps_per_second": 7.508, "step": 3400 } ], "logging_steps": 50, "max_steps": 3470, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.001481583701156e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }