{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2229, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3228859663009644, "epoch": 0.013458950201884253, "grad_norm": 1.4003818035125732, "learning_rate": 2.4193548387096776e-06, "loss": 0.3947859287261963, "mean_token_accuracy": 0.8766823470592499, "num_tokens": 179351.0, "step": 10 }, { "entropy": 1.3223209857940674, "epoch": 0.026917900403768506, "grad_norm": 1.4293484687805176, "learning_rate": 5.1075268817204305e-06, "loss": 0.36992483139038085, "mean_token_accuracy": 0.8841759026050567, "num_tokens": 358681.0, "step": 20 }, { "entropy": 1.3241522073745728, "epoch": 0.040376850605652756, "grad_norm": 1.3410050868988037, "learning_rate": 7.795698924731183e-06, "loss": 0.3491047859191895, "mean_token_accuracy": 0.8843624591827393, "num_tokens": 538488.0, "step": 30 }, { "entropy": 1.3318131804466247, "epoch": 0.05383580080753701, "grad_norm": 1.08042573928833, "learning_rate": 1.0483870967741936e-05, "loss": 0.30254349708557127, "mean_token_accuracy": 0.8926964640617371, "num_tokens": 718118.0, "step": 40 }, { "entropy": 1.3337590098381042, "epoch": 0.06729475100942127, "grad_norm": 0.8615964651107788, "learning_rate": 1.3172043010752688e-05, "loss": 0.26572132110595703, "mean_token_accuracy": 0.9041078448295593, "num_tokens": 897209.0, "step": 50 }, { "entropy": 1.3126742005348206, "epoch": 0.08075370121130551, "grad_norm": 0.9317428469657898, "learning_rate": 1.586021505376344e-05, "loss": 0.23249435424804688, "mean_token_accuracy": 0.9135358154773712, "num_tokens": 1076448.0, "step": 60 }, { "entropy": 1.2961446046829224, "epoch": 0.09421265141318977, "grad_norm": 1.1822317838668823, "learning_rate": 1.8548387096774193e-05, "loss": 0.21359801292419434, "mean_token_accuracy": 0.9192376554012298, "num_tokens": 1255796.0, "step": 70 }, { "entropy": 1.2832220673561097, "epoch": 0.10767160161507403, "grad_norm": 1.1605602502822876, "learning_rate": 2.1236559139784946e-05, "loss": 0.20212924480438232, "mean_token_accuracy": 0.9217188417911529, "num_tokens": 1435611.0, "step": 80 }, { "entropy": 1.2762320637702942, "epoch": 0.12113055181695828, "grad_norm": 1.2006449699401855, "learning_rate": 2.39247311827957e-05, "loss": 0.18676402568817138, "mean_token_accuracy": 0.9310197174549103, "num_tokens": 1614872.0, "step": 90 }, { "entropy": 1.274475383758545, "epoch": 0.13458950201884254, "grad_norm": 1.2656220197677612, "learning_rate": 2.661290322580645e-05, "loss": 0.17775335311889648, "mean_token_accuracy": 0.9305342018604279, "num_tokens": 1794050.0, "step": 100 }, { "entropy": 1.2607086300849915, "epoch": 0.1480484522207268, "grad_norm": 1.2415285110473633, "learning_rate": 2.9301075268817207e-05, "loss": 0.17471678256988527, "mean_token_accuracy": 0.9317916095256805, "num_tokens": 1973292.0, "step": 110 }, { "entropy": 1.2618489027023316, "epoch": 0.16150740242261102, "grad_norm": 1.268917202949524, "learning_rate": 3.198924731182796e-05, "loss": 0.17752587795257568, "mean_token_accuracy": 0.9324639737606049, "num_tokens": 2152526.0, "step": 120 }, { "entropy": 1.2617292761802674, "epoch": 0.17496635262449528, "grad_norm": 1.1972988843917847, "learning_rate": 3.467741935483872e-05, "loss": 0.1649715781211853, "mean_token_accuracy": 0.9361909210681916, "num_tokens": 2331952.0, "step": 130 }, { "entropy": 1.2492274522781373, "epoch": 0.18842530282637954, "grad_norm": 1.2891550064086914, "learning_rate": 3.736559139784947e-05, "loss": 0.15520352125167847, "mean_token_accuracy": 0.939128577709198, "num_tokens": 2510125.0, "step": 140 }, { "entropy": 1.2332049250602721, "epoch": 0.2018842530282638, "grad_norm": 1.4108549356460571, "learning_rate": 4.005376344086022e-05, "loss": 0.16094473600387574, "mean_token_accuracy": 0.936911940574646, "num_tokens": 2690100.0, "step": 150 }, { "entropy": 1.2368563890457154, "epoch": 0.21534320323014805, "grad_norm": 1.2720197439193726, "learning_rate": 4.2741935483870973e-05, "loss": 0.15074102878570556, "mean_token_accuracy": 0.9419298887252807, "num_tokens": 2869072.0, "step": 160 }, { "entropy": 1.2400941371917724, "epoch": 0.2288021534320323, "grad_norm": 1.1434999704360962, "learning_rate": 4.543010752688172e-05, "loss": 0.15364220142364501, "mean_token_accuracy": 0.9395892560482025, "num_tokens": 3048179.0, "step": 170 }, { "entropy": 1.2370286583900452, "epoch": 0.24226110363391656, "grad_norm": 1.1543998718261719, "learning_rate": 4.811827956989248e-05, "loss": 0.1518303394317627, "mean_token_accuracy": 0.9403793513774872, "num_tokens": 3227648.0, "step": 180 }, { "entropy": 1.2381223678588866, "epoch": 0.2557200538358008, "grad_norm": 1.2674320936203003, "learning_rate": 5.080645161290323e-05, "loss": 0.14720194339752196, "mean_token_accuracy": 0.9413308918476104, "num_tokens": 3406443.0, "step": 190 }, { "entropy": 1.2209740161895752, "epoch": 0.2691790040376851, "grad_norm": 1.3895965814590454, "learning_rate": 5.349462365591398e-05, "loss": 0.14467229843139648, "mean_token_accuracy": 0.9425995826721192, "num_tokens": 3585139.0, "step": 200 }, { "entropy": 1.202421200275421, "epoch": 0.28263795423956933, "grad_norm": 1.1748970746994019, "learning_rate": 5.618279569892473e-05, "loss": 0.14418480396270753, "mean_token_accuracy": 0.944035142660141, "num_tokens": 3764815.0, "step": 210 }, { "entropy": 1.2019111514091492, "epoch": 0.2960969044414536, "grad_norm": 1.498073935508728, "learning_rate": 5.887096774193549e-05, "loss": 0.1409894585609436, "mean_token_accuracy": 0.9451915919780731, "num_tokens": 3944066.0, "step": 220 }, { "entropy": 1.208519995212555, "epoch": 0.30955585464333785, "grad_norm": 1.3500250577926636, "learning_rate": 6.155913978494624e-05, "loss": 0.14032076597213744, "mean_token_accuracy": 0.9437436699867249, "num_tokens": 4122561.0, "step": 230 }, { "entropy": 1.2071591854095458, "epoch": 0.32301480484522205, "grad_norm": 1.2740904092788696, "learning_rate": 6.4247311827957e-05, "loss": 0.13652472496032714, "mean_token_accuracy": 0.9458230376243592, "num_tokens": 4301558.0, "step": 240 }, { "entropy": 1.2087280392646789, "epoch": 0.3364737550471063, "grad_norm": 1.2498955726623535, "learning_rate": 6.693548387096774e-05, "loss": 0.137939453125, "mean_token_accuracy": 0.9447133004665375, "num_tokens": 4480708.0, "step": 250 }, { "entropy": 1.1996983647346497, "epoch": 0.34993270524899056, "grad_norm": 1.4520384073257446, "learning_rate": 6.962365591397851e-05, "loss": 0.14351414442062377, "mean_token_accuracy": 0.9436001539230346, "num_tokens": 4659616.0, "step": 260 }, { "entropy": 1.195546042919159, "epoch": 0.3633916554508748, "grad_norm": 1.1850366592407227, "learning_rate": 7.231182795698926e-05, "loss": 0.13713514804840088, "mean_token_accuracy": 0.946206146478653, "num_tokens": 4838619.0, "step": 270 }, { "entropy": 1.1904785871505736, "epoch": 0.3768506056527591, "grad_norm": 1.4309216737747192, "learning_rate": 7.500000000000001e-05, "loss": 0.13457468748092652, "mean_token_accuracy": 0.9473767280578613, "num_tokens": 5017668.0, "step": 280 }, { "entropy": 1.1928183078765868, "epoch": 0.39030955585464333, "grad_norm": 1.1399856805801392, "learning_rate": 7.768817204301076e-05, "loss": 0.13286283016204833, "mean_token_accuracy": 0.9468790471553803, "num_tokens": 5197412.0, "step": 290 }, { "entropy": 1.1921735286712647, "epoch": 0.4037685060565276, "grad_norm": 1.3263301849365234, "learning_rate": 8.037634408602151e-05, "loss": 0.1312130570411682, "mean_token_accuracy": 0.9481729447841645, "num_tokens": 5377026.0, "step": 300 }, { "entropy": 1.1928237080574036, "epoch": 0.41722745625841184, "grad_norm": 1.6233983039855957, "learning_rate": 8.306451612903227e-05, "loss": 0.1323886036872864, "mean_token_accuracy": 0.9465976357460022, "num_tokens": 5556708.0, "step": 310 }, { "entropy": 1.1836108446121216, "epoch": 0.4306864064602961, "grad_norm": 1.3058593273162842, "learning_rate": 8.575268817204302e-05, "loss": 0.13815442323684693, "mean_token_accuracy": 0.9450344681739807, "num_tokens": 5735495.0, "step": 320 }, { "entropy": 1.179207694530487, "epoch": 0.44414535666218036, "grad_norm": 1.43180513381958, "learning_rate": 8.844086021505377e-05, "loss": 0.13185629844665528, "mean_token_accuracy": 0.9473332643508912, "num_tokens": 5914707.0, "step": 330 }, { "entropy": 1.1725443840026855, "epoch": 0.4576043068640646, "grad_norm": 1.391575813293457, "learning_rate": 9.112903225806452e-05, "loss": 0.13046096563339232, "mean_token_accuracy": 0.9486047685146332, "num_tokens": 6093788.0, "step": 340 }, { "entropy": 1.17471581697464, "epoch": 0.47106325706594887, "grad_norm": 1.5146636962890625, "learning_rate": 9.381720430107528e-05, "loss": 0.13146305084228516, "mean_token_accuracy": 0.9477684259414673, "num_tokens": 6272916.0, "step": 350 }, { "entropy": 1.1633504509925843, "epoch": 0.4845222072678331, "grad_norm": 1.1611223220825195, "learning_rate": 9.650537634408603e-05, "loss": 0.12906880378723146, "mean_token_accuracy": 0.9491180777549744, "num_tokens": 6452766.0, "step": 360 }, { "entropy": 1.1688304781913756, "epoch": 0.4979811574697174, "grad_norm": 1.5618613958358765, "learning_rate": 9.919354838709678e-05, "loss": 0.1296112060546875, "mean_token_accuracy": 0.9494616508483886, "num_tokens": 6632087.0, "step": 370 }, { "entropy": 1.1618135571479797, "epoch": 0.5114401076716016, "grad_norm": 1.259940266609192, "learning_rate": 9.999975729865971e-05, "loss": 0.12792425155639647, "mean_token_accuracy": 0.9487492978572846, "num_tokens": 6811480.0, "step": 380 }, { "entropy": 1.1698922395706177, "epoch": 0.5248990578734859, "grad_norm": 1.4075767993927002, "learning_rate": 9.999856856307314e-05, "loss": 0.12948644161224365, "mean_token_accuracy": 0.9471172571182251, "num_tokens": 6990914.0, "step": 390 }, { "entropy": 1.1714325428009034, "epoch": 0.5383580080753702, "grad_norm": 1.2077672481536865, "learning_rate": 9.999638923896533e-05, "loss": 0.12964634895324706, "mean_token_accuracy": 0.9472773969173431, "num_tokens": 7170193.0, "step": 400 }, { "entropy": 1.172813320159912, "epoch": 0.5518169582772544, "grad_norm": 1.506191611289978, "learning_rate": 9.999321936951374e-05, "loss": 0.12280762195587158, "mean_token_accuracy": 0.9517758071422577, "num_tokens": 7349496.0, "step": 410 }, { "entropy": 1.179931104183197, "epoch": 0.5652759084791387, "grad_norm": 1.2185982465744019, "learning_rate": 9.998905901752091e-05, "loss": 0.12760610580444337, "mean_token_accuracy": 0.948979276418686, "num_tokens": 7528648.0, "step": 420 }, { "entropy": 1.1817825198173524, "epoch": 0.5787348586810229, "grad_norm": 1.285261631011963, "learning_rate": 9.998390826541315e-05, "loss": 0.13066411018371582, "mean_token_accuracy": 0.947555410861969, "num_tokens": 7707487.0, "step": 430 }, { "entropy": 1.1945344924926757, "epoch": 0.5921938088829072, "grad_norm": 1.2778966426849365, "learning_rate": 9.997776721523888e-05, "loss": 0.13003890514373778, "mean_token_accuracy": 0.9476695477962493, "num_tokens": 7886452.0, "step": 440 }, { "entropy": 1.192435622215271, "epoch": 0.6056527590847914, "grad_norm": 1.2676047086715698, "learning_rate": 9.99706359886667e-05, "loss": 0.13059219121932983, "mean_token_accuracy": 0.9467391848564148, "num_tokens": 8065093.0, "step": 450 }, { "entropy": 1.1952194094657898, "epoch": 0.6191117092866757, "grad_norm": 1.1667490005493164, "learning_rate": 9.996251472698281e-05, "loss": 0.1308892250061035, "mean_token_accuracy": 0.9474103152751923, "num_tokens": 8245294.0, "step": 460 }, { "entropy": 1.1830523014068604, "epoch": 0.6325706594885598, "grad_norm": 1.4168891906738281, "learning_rate": 9.995340359108844e-05, "loss": 0.1230043888092041, "mean_token_accuracy": 0.9503998339176178, "num_tokens": 8424334.0, "step": 470 }, { "entropy": 1.1928761839866637, "epoch": 0.6460296096904441, "grad_norm": 1.4041002988815308, "learning_rate": 9.994330276149649e-05, "loss": 0.12544957399368287, "mean_token_accuracy": 0.9496485233306885, "num_tokens": 8603758.0, "step": 480 }, { "entropy": 1.180042278766632, "epoch": 0.6594885598923284, "grad_norm": 1.2816451787948608, "learning_rate": 9.993221243832797e-05, "loss": 0.1197009801864624, "mean_token_accuracy": 0.9527871966361999, "num_tokens": 8782936.0, "step": 490 }, { "entropy": 1.1822824954986573, "epoch": 0.6729475100942126, "grad_norm": 1.0554879903793335, "learning_rate": 9.992013284130816e-05, "loss": 0.12416183948516846, "mean_token_accuracy": 0.9488660097122192, "num_tokens": 8962286.0, "step": 500 }, { "entropy": 1.1877296924591065, "epoch": 0.6864064602960969, "grad_norm": 2.272390842437744, "learning_rate": 9.990706420976206e-05, "loss": 0.12660024166107178, "mean_token_accuracy": 0.9495710134506226, "num_tokens": 9141244.0, "step": 510 }, { "entropy": 1.195984995365143, "epoch": 0.6998654104979811, "grad_norm": 1.106213927268982, "learning_rate": 9.989300680260985e-05, "loss": 0.12362114191055298, "mean_token_accuracy": 0.9512333691120147, "num_tokens": 9319796.0, "step": 520 }, { "entropy": 1.1847575902938843, "epoch": 0.7133243606998654, "grad_norm": 1.2385672330856323, "learning_rate": 9.98779608983616e-05, "loss": 0.12053300142288208, "mean_token_accuracy": 0.9517379641532898, "num_tokens": 9498637.0, "step": 530 }, { "entropy": 1.1887083888053893, "epoch": 0.7267833109017496, "grad_norm": 1.1992591619491577, "learning_rate": 9.986192679511189e-05, "loss": 0.12146525382995606, "mean_token_accuracy": 0.9509431838989257, "num_tokens": 9678136.0, "step": 540 }, { "entropy": 1.1956284999847413, "epoch": 0.7402422611036339, "grad_norm": 1.4346206188201904, "learning_rate": 9.984490481053372e-05, "loss": 0.12582865953445435, "mean_token_accuracy": 0.9495353937149048, "num_tokens": 9856943.0, "step": 550 }, { "entropy": 1.183466374874115, "epoch": 0.7537012113055181, "grad_norm": 1.1207399368286133, "learning_rate": 9.982689528187244e-05, "loss": 0.11938930749893188, "mean_token_accuracy": 0.9524446070194245, "num_tokens": 10036305.0, "step": 560 }, { "entropy": 1.1996586084365846, "epoch": 0.7671601615074024, "grad_norm": 1.0004290342330933, "learning_rate": 9.98078985659389e-05, "loss": 0.1256342649459839, "mean_token_accuracy": 0.9488184452056885, "num_tokens": 10215049.0, "step": 570 }, { "entropy": 1.2113700032234191, "epoch": 0.7806191117092867, "grad_norm": 1.243759036064148, "learning_rate": 9.978791503910246e-05, "loss": 0.11844713687896728, "mean_token_accuracy": 0.9520347356796265, "num_tokens": 10393513.0, "step": 580 }, { "entropy": 1.2171649575233459, "epoch": 0.7940780619111709, "grad_norm": 1.2175841331481934, "learning_rate": 9.97669450972835e-05, "loss": 0.1155052900314331, "mean_token_accuracy": 0.954187548160553, "num_tokens": 10572502.0, "step": 590 }, { "entropy": 1.2295325994491577, "epoch": 0.8075370121130552, "grad_norm": 1.1670854091644287, "learning_rate": 9.974498915594557e-05, "loss": 0.12255362272262574, "mean_token_accuracy": 0.9510588347911835, "num_tokens": 10751857.0, "step": 600 }, { "entropy": 1.2220084905624389, "epoch": 0.8209959623149394, "grad_norm": 1.3236212730407715, "learning_rate": 9.97220476500872e-05, "loss": 0.1217005968093872, "mean_token_accuracy": 0.9508337616920471, "num_tokens": 10931362.0, "step": 610 }, { "entropy": 1.20922110080719, "epoch": 0.8344549125168237, "grad_norm": 1.2529112100601196, "learning_rate": 9.969812103423325e-05, "loss": 0.11833038330078124, "mean_token_accuracy": 0.9529603838920593, "num_tokens": 11111075.0, "step": 620 }, { "entropy": 1.2208962082862853, "epoch": 0.847913862718708, "grad_norm": 1.2380986213684082, "learning_rate": 9.967320978242592e-05, "loss": 0.12019131183624268, "mean_token_accuracy": 0.9517916083335877, "num_tokens": 11289952.0, "step": 630 }, { "entropy": 1.206966769695282, "epoch": 0.8613728129205922, "grad_norm": 1.2476933002471924, "learning_rate": 9.964731438821533e-05, "loss": 0.11783044338226319, "mean_token_accuracy": 0.9523464858531951, "num_tokens": 11469661.0, "step": 640 }, { "entropy": 1.2062023997306823, "epoch": 0.8748317631224765, "grad_norm": 1.4155808687210083, "learning_rate": 9.962043536464978e-05, "loss": 0.12099127769470215, "mean_token_accuracy": 0.9519050180912018, "num_tokens": 11648570.0, "step": 650 }, { "entropy": 1.2050026655197144, "epoch": 0.8882907133243607, "grad_norm": 1.309507966041565, "learning_rate": 9.959257324426556e-05, "loss": 0.11565302610397339, "mean_token_accuracy": 0.9535071849822998, "num_tokens": 11827640.0, "step": 660 }, { "entropy": 1.2138132452964783, "epoch": 0.901749663526245, "grad_norm": 1.150227427482605, "learning_rate": 9.95637285790764e-05, "loss": 0.11565654277801514, "mean_token_accuracy": 0.9536015927791596, "num_tokens": 12006419.0, "step": 670 }, { "entropy": 1.2211383819580077, "epoch": 0.9152086137281292, "grad_norm": 1.3185595273971558, "learning_rate": 9.953390194056258e-05, "loss": 0.11686277389526367, "mean_token_accuracy": 0.9518564403057098, "num_tokens": 12184806.0, "step": 680 }, { "entropy": 1.233402180671692, "epoch": 0.9286675639300135, "grad_norm": 1.160781979560852, "learning_rate": 9.950309391965947e-05, "loss": 0.11723113059997559, "mean_token_accuracy": 0.9525671184062958, "num_tokens": 12363767.0, "step": 690 }, { "entropy": 1.2254271149635314, "epoch": 0.9421265141318977, "grad_norm": 1.0756208896636963, "learning_rate": 9.947130512674602e-05, "loss": 0.11969656944274902, "mean_token_accuracy": 0.9499428868293762, "num_tokens": 12542727.0, "step": 700 }, { "entropy": 1.2217535138130189, "epoch": 0.955585464333782, "grad_norm": 1.131346344947815, "learning_rate": 9.943853619163255e-05, "loss": 0.11605353355407715, "mean_token_accuracy": 0.9536243081092834, "num_tokens": 12721825.0, "step": 710 }, { "entropy": 1.2145210385322571, "epoch": 0.9690444145356663, "grad_norm": 1.0480105876922607, "learning_rate": 9.94047877635482e-05, "loss": 0.11278635263442993, "mean_token_accuracy": 0.9553675949573517, "num_tokens": 12902291.0, "step": 720 }, { "entropy": 1.2308586597442628, "epoch": 0.9825033647375505, "grad_norm": 1.1793105602264404, "learning_rate": 9.93700605111283e-05, "loss": 0.11050724983215332, "mean_token_accuracy": 0.9547911584377289, "num_tokens": 13082065.0, "step": 730 }, { "entropy": 1.2493423819541931, "epoch": 0.9959623149394348, "grad_norm": 1.289297103881836, "learning_rate": 9.933435512240084e-05, "loss": 0.11567041873931885, "mean_token_accuracy": 0.9526383280754089, "num_tokens": 13261041.0, "step": 740 }, { "epoch": 1.0, "eval_entropy": 1.229653848204643, "eval_loss": 0.11524093896150589, "eval_mean_token_accuracy": 0.9533016294430775, "eval_num_tokens": 13314766.0, "eval_runtime": 13.2833, "eval_samples_per_second": 376.412, "eval_steps_per_second": 11.819, "step": 743 }, { "entropy": 1.233021354675293, "epoch": 1.009421265141319, "grad_norm": 1.3314229249954224, "learning_rate": 9.929767230477305e-05, "loss": 0.10805211067199708, "mean_token_accuracy": 0.9576423108577728, "num_tokens": 13440438.0, "step": 750 }, { "entropy": 1.2277064085006715, "epoch": 1.0228802153432033, "grad_norm": 1.194751262664795, "learning_rate": 9.92600127850173e-05, "loss": 0.09916897416114807, "mean_token_accuracy": 0.9617054045200348, "num_tokens": 13619055.0, "step": 760 }, { "entropy": 1.2193793654441833, "epoch": 1.0363391655450875, "grad_norm": 1.3674660921096802, "learning_rate": 9.922137730925673e-05, "loss": 0.09446401596069336, "mean_token_accuracy": 0.9620481312274933, "num_tokens": 13798753.0, "step": 770 }, { "entropy": 1.2068825483322143, "epoch": 1.0497981157469718, "grad_norm": 1.175338625907898, "learning_rate": 9.918176664295041e-05, "loss": 0.09437270164489746, "mean_token_accuracy": 0.9626644790172577, "num_tokens": 13978445.0, "step": 780 }, { "entropy": 1.2036906003952026, "epoch": 1.063257065948856, "grad_norm": 1.3463256359100342, "learning_rate": 9.914118157087824e-05, "loss": 0.09322788715362548, "mean_token_accuracy": 0.9640586376190186, "num_tokens": 14157400.0, "step": 790 }, { "entropy": 1.195313036441803, "epoch": 1.0767160161507403, "grad_norm": 1.2407623529434204, "learning_rate": 9.909962289712538e-05, "loss": 0.10000712871551513, "mean_token_accuracy": 0.9604595184326172, "num_tokens": 14337209.0, "step": 800 }, { "entropy": 1.1997171878814696, "epoch": 1.0901749663526246, "grad_norm": 1.4212044477462769, "learning_rate": 9.905709144506629e-05, "loss": 0.09967402815818786, "mean_token_accuracy": 0.9616046726703644, "num_tokens": 14516327.0, "step": 810 }, { "entropy": 1.1916023015975952, "epoch": 1.1036339165545088, "grad_norm": 1.296561598777771, "learning_rate": 9.901358805734846e-05, "loss": 0.09139133095741273, "mean_token_accuracy": 0.9634343802928924, "num_tokens": 14695257.0, "step": 820 }, { "entropy": 1.1924808621406555, "epoch": 1.117092866756393, "grad_norm": 1.3003162145614624, "learning_rate": 9.89691135958757e-05, "loss": 0.0935364007949829, "mean_token_accuracy": 0.9624580383300781, "num_tokens": 14874609.0, "step": 830 }, { "entropy": 1.2110714197158814, "epoch": 1.1305518169582773, "grad_norm": 1.341585636138916, "learning_rate": 9.892366894179105e-05, "loss": 0.09882450699806214, "mean_token_accuracy": 0.961030250787735, "num_tokens": 15053971.0, "step": 840 }, { "entropy": 1.2037933468818665, "epoch": 1.1440107671601616, "grad_norm": 1.4529608488082886, "learning_rate": 9.887725499545937e-05, "loss": 0.09266124367713928, "mean_token_accuracy": 0.9641264617443085, "num_tokens": 15233217.0, "step": 850 }, { "entropy": 1.210195577144623, "epoch": 1.1574697173620458, "grad_norm": 0.9631540179252625, "learning_rate": 9.882987267644939e-05, "loss": 0.09560335874557495, "mean_token_accuracy": 0.9616110920906067, "num_tokens": 15412460.0, "step": 860 }, { "entropy": 1.210800564289093, "epoch": 1.17092866756393, "grad_norm": 1.078429937362671, "learning_rate": 9.878152292351563e-05, "loss": 0.0967819094657898, "mean_token_accuracy": 0.960888934135437, "num_tokens": 15590882.0, "step": 870 }, { "entropy": 1.204759907722473, "epoch": 1.1843876177658144, "grad_norm": 1.132325530052185, "learning_rate": 9.873220669457975e-05, "loss": 0.09479628801345825, "mean_token_accuracy": 0.9629071593284607, "num_tokens": 15770658.0, "step": 880 }, { "entropy": 1.2071414232254027, "epoch": 1.1978465679676986, "grad_norm": 1.0902807712554932, "learning_rate": 9.868192496671147e-05, "loss": 0.09629296064376831, "mean_token_accuracy": 0.9622378885746002, "num_tokens": 15950126.0, "step": 890 }, { "entropy": 1.2057334661483765, "epoch": 1.2113055181695827, "grad_norm": 1.2059770822525024, "learning_rate": 9.86306787361094e-05, "loss": 0.09764755368232728, "mean_token_accuracy": 0.962404465675354, "num_tokens": 16129315.0, "step": 900 }, { "entropy": 1.2059934735298157, "epoch": 1.224764468371467, "grad_norm": 1.5481969118118286, "learning_rate": 9.857846901808117e-05, "loss": 0.09670655727386475, "mean_token_accuracy": 0.9619201004505158, "num_tokens": 16307839.0, "step": 910 }, { "entropy": 1.2136071562767028, "epoch": 1.2382234185733512, "grad_norm": 1.1171293258666992, "learning_rate": 9.852529684702329e-05, "loss": 0.09502402544021607, "mean_token_accuracy": 0.9619876623153687, "num_tokens": 16487021.0, "step": 920 }, { "entropy": 1.232442605495453, "epoch": 1.2516823687752354, "grad_norm": 1.2836637496948242, "learning_rate": 9.847116327640082e-05, "loss": 0.09930729866027832, "mean_token_accuracy": 0.9604007601737976, "num_tokens": 16665995.0, "step": 930 }, { "entropy": 1.2321829080581665, "epoch": 1.2651413189771197, "grad_norm": 1.1843444108963013, "learning_rate": 9.841606937872632e-05, "loss": 0.10086537599563598, "mean_token_accuracy": 0.9602800250053406, "num_tokens": 16845090.0, "step": 940 }, { "entropy": 1.233613657951355, "epoch": 1.278600269179004, "grad_norm": 1.3496166467666626, "learning_rate": 9.836001624553869e-05, "loss": 0.09795907735824586, "mean_token_accuracy": 0.9610718429088593, "num_tokens": 17024295.0, "step": 950 }, { "entropy": 1.2220023155212403, "epoch": 1.2920592193808882, "grad_norm": 1.238175392150879, "learning_rate": 9.830300498738152e-05, "loss": 0.09709340333938599, "mean_token_accuracy": 0.9621975898742676, "num_tokens": 17203525.0, "step": 960 }, { "entropy": 1.2186771392822267, "epoch": 1.3055181695827724, "grad_norm": 1.0820763111114502, "learning_rate": 9.824503673378112e-05, "loss": 0.09260507822036743, "mean_token_accuracy": 0.9632427036762238, "num_tokens": 17382269.0, "step": 970 }, { "entropy": 1.212946391105652, "epoch": 1.3189771197846567, "grad_norm": 1.253194808959961, "learning_rate": 9.81861126332241e-05, "loss": 0.10012803077697754, "mean_token_accuracy": 0.9596389472484589, "num_tokens": 17561951.0, "step": 980 }, { "entropy": 1.2092903971672058, "epoch": 1.332436069986541, "grad_norm": 1.6471713781356812, "learning_rate": 9.812623385313461e-05, "loss": 0.1032632827758789, "mean_token_accuracy": 0.9594815850257874, "num_tokens": 17741116.0, "step": 990 }, { "entropy": 1.2158336997032166, "epoch": 1.3458950201884252, "grad_norm": 1.076393723487854, "learning_rate": 9.806540157985131e-05, "loss": 0.09857285022735596, "mean_token_accuracy": 0.9608540177345276, "num_tokens": 17920249.0, "step": 1000 }, { "entropy": 1.2093246698379516, "epoch": 1.3593539703903095, "grad_norm": 1.1203004121780396, "learning_rate": 9.800361701860368e-05, "loss": 0.09807900190353394, "mean_token_accuracy": 0.9611685931682586, "num_tokens": 18099006.0, "step": 1010 }, { "entropy": 1.2070690989494324, "epoch": 1.3728129205921937, "grad_norm": 1.3285764455795288, "learning_rate": 9.794088139348835e-05, "loss": 0.10283086299896241, "mean_token_accuracy": 0.9585156977176666, "num_tokens": 18277971.0, "step": 1020 }, { "entropy": 1.2022451281547546, "epoch": 1.386271870794078, "grad_norm": 1.0949617624282837, "learning_rate": 9.787719594744468e-05, "loss": 0.10161725282669068, "mean_token_accuracy": 0.9598902583122253, "num_tokens": 18457464.0, "step": 1030 }, { "entropy": 1.2045769929885863, "epoch": 1.3997308209959622, "grad_norm": 1.008150577545166, "learning_rate": 9.781256194223023e-05, "loss": 0.10038440227508545, "mean_token_accuracy": 0.960367614030838, "num_tokens": 18636876.0, "step": 1040 }, { "entropy": 1.204549217224121, "epoch": 1.4131897711978465, "grad_norm": 1.0495935678482056, "learning_rate": 9.774698065839577e-05, "loss": 0.09564157128334046, "mean_token_accuracy": 0.9625212967395782, "num_tokens": 18816243.0, "step": 1050 }, { "entropy": 1.2045063614845275, "epoch": 1.4266487213997308, "grad_norm": 1.2372835874557495, "learning_rate": 9.768045339525979e-05, "loss": 0.09781360626220703, "mean_token_accuracy": 0.9605839848518372, "num_tokens": 18995594.0, "step": 1060 }, { "entropy": 1.2258678078651428, "epoch": 1.440107671601615, "grad_norm": 1.0772687196731567, "learning_rate": 9.76129814708829e-05, "loss": 0.09291026592254639, "mean_token_accuracy": 0.9634248733520507, "num_tokens": 19173887.0, "step": 1070 }, { "entropy": 1.2230794191360475, "epoch": 1.4535666218034993, "grad_norm": 1.2008293867111206, "learning_rate": 9.754456622204167e-05, "loss": 0.09285001754760742, "mean_token_accuracy": 0.9633622407913208, "num_tokens": 19352678.0, "step": 1080 }, { "entropy": 1.2313218355178832, "epoch": 1.4670255720053835, "grad_norm": 1.5826188325881958, "learning_rate": 9.747520900420209e-05, "loss": 0.1002782940864563, "mean_token_accuracy": 0.9600823521614075, "num_tokens": 19532077.0, "step": 1090 }, { "entropy": 1.2246542692184448, "epoch": 1.4804845222072678, "grad_norm": 1.3970143795013428, "learning_rate": 9.740491119149277e-05, "loss": 0.1005969524383545, "mean_token_accuracy": 0.9596368432044983, "num_tokens": 19710609.0, "step": 1100 }, { "entropy": 1.207058048248291, "epoch": 1.493943472409152, "grad_norm": 1.3544780015945435, "learning_rate": 9.733367417667773e-05, "loss": 0.09367164373397827, "mean_token_accuracy": 0.9632523238658905, "num_tokens": 19889820.0, "step": 1110 }, { "entropy": 1.2027259588241577, "epoch": 1.5074024226110363, "grad_norm": 1.2393465042114258, "learning_rate": 9.726149937112873e-05, "loss": 0.09854428172111511, "mean_token_accuracy": 0.9612930059432984, "num_tokens": 20069561.0, "step": 1120 }, { "entropy": 1.2199820518493651, "epoch": 1.5208613728129206, "grad_norm": 1.4061861038208008, "learning_rate": 9.718838820479743e-05, "loss": 0.09687533378601074, "mean_token_accuracy": 0.9612306416034698, "num_tokens": 20249088.0, "step": 1130 }, { "entropy": 1.2114709615707397, "epoch": 1.5343203230148048, "grad_norm": 1.2970331907272339, "learning_rate": 9.711434212618691e-05, "loss": 0.09762253165245056, "mean_token_accuracy": 0.9609376013278961, "num_tokens": 20428600.0, "step": 1140 }, { "entropy": 1.1982413172721862, "epoch": 1.547779273216689, "grad_norm": 1.621308445930481, "learning_rate": 9.703936260232308e-05, "loss": 0.09679374098777771, "mean_token_accuracy": 0.9625207364559174, "num_tokens": 20608047.0, "step": 1150 }, { "entropy": 1.195889377593994, "epoch": 1.5612382234185733, "grad_norm": 1.2940045595169067, "learning_rate": 9.696345111872557e-05, "loss": 0.09699609279632568, "mean_token_accuracy": 0.96190345287323, "num_tokens": 20787142.0, "step": 1160 }, { "entropy": 1.1944554448127747, "epoch": 1.5746971736204576, "grad_norm": 1.3155335187911987, "learning_rate": 9.688660917937838e-05, "loss": 0.09831242561340332, "mean_token_accuracy": 0.9606768429279328, "num_tokens": 20966230.0, "step": 1170 }, { "entropy": 1.1957345604896545, "epoch": 1.5881561238223418, "grad_norm": 1.2948030233383179, "learning_rate": 9.68088383066999e-05, "loss": 0.09834452867507934, "mean_token_accuracy": 0.9612827241420746, "num_tokens": 21145768.0, "step": 1180 }, { "entropy": 1.2020023703575133, "epoch": 1.601615074024226, "grad_norm": 1.0523329973220825, "learning_rate": 9.673014004151292e-05, "loss": 0.09663949012756348, "mean_token_accuracy": 0.9620199799537659, "num_tokens": 21324592.0, "step": 1190 }, { "entropy": 1.1892358779907226, "epoch": 1.6150740242261103, "grad_norm": 1.1584330797195435, "learning_rate": 9.665051594301407e-05, "loss": 0.09669581055641174, "mean_token_accuracy": 0.961614978313446, "num_tokens": 21504539.0, "step": 1200 }, { "entropy": 1.190696406364441, "epoch": 1.6285329744279946, "grad_norm": 1.1194695234298706, "learning_rate": 9.656996758874284e-05, "loss": 0.09648081660270691, "mean_token_accuracy": 0.9612169206142426, "num_tokens": 21683905.0, "step": 1210 }, { "entropy": 1.2076977849006654, "epoch": 1.6419919246298789, "grad_norm": 1.1297376155853271, "learning_rate": 9.648849657455044e-05, "loss": 0.09605686664581299, "mean_token_accuracy": 0.961658376455307, "num_tokens": 21862162.0, "step": 1220 }, { "entropy": 1.2096962213516236, "epoch": 1.6554508748317631, "grad_norm": 1.2401906251907349, "learning_rate": 9.640610451456811e-05, "loss": 0.09206328392028809, "mean_token_accuracy": 0.962989890575409, "num_tokens": 22041015.0, "step": 1230 }, { "entropy": 1.216509222984314, "epoch": 1.6689098250336474, "grad_norm": 1.2637176513671875, "learning_rate": 9.632279304117517e-05, "loss": 0.09614999294281006, "mean_token_accuracy": 0.9613571405410767, "num_tokens": 22220655.0, "step": 1240 }, { "entropy": 1.2126171827316283, "epoch": 1.6823687752355316, "grad_norm": 1.2879180908203125, "learning_rate": 9.623856380496664e-05, "loss": 0.09818092584609986, "mean_token_accuracy": 0.9603166699409484, "num_tokens": 22399957.0, "step": 1250 }, { "entropy": 1.182448434829712, "epoch": 1.695827725437416, "grad_norm": 1.0547544956207275, "learning_rate": 9.615341847472059e-05, "loss": 0.0945388674736023, "mean_token_accuracy": 0.9623521089553833, "num_tokens": 22579222.0, "step": 1260 }, { "entropy": 1.1886864185333252, "epoch": 1.7092866756393001, "grad_norm": 1.4119364023208618, "learning_rate": 9.606735873736505e-05, "loss": 0.0979494333267212, "mean_token_accuracy": 0.9607987105846405, "num_tokens": 22758487.0, "step": 1270 }, { "entropy": 1.1918489813804627, "epoch": 1.7227456258411844, "grad_norm": 1.2551711797714233, "learning_rate": 9.598038629794461e-05, "loss": 0.09586712718009949, "mean_token_accuracy": 0.9615644454956055, "num_tokens": 22936708.0, "step": 1280 }, { "entropy": 1.1913212060928344, "epoch": 1.7362045760430687, "grad_norm": 1.0276069641113281, "learning_rate": 9.589250287958657e-05, "loss": 0.09535220861434937, "mean_token_accuracy": 0.9606883823871613, "num_tokens": 23116329.0, "step": 1290 }, { "entropy": 1.203085219860077, "epoch": 1.749663526244953, "grad_norm": 1.2456278800964355, "learning_rate": 9.580371022346693e-05, "loss": 0.09598281383514404, "mean_token_accuracy": 0.9608144044876099, "num_tokens": 23295164.0, "step": 1300 }, { "entropy": 1.1892922878265382, "epoch": 1.7631224764468372, "grad_norm": 1.1159876585006714, "learning_rate": 9.571401008877572e-05, "loss": 0.09096106886863708, "mean_token_accuracy": 0.9636982321739197, "num_tokens": 23474377.0, "step": 1310 }, { "entropy": 1.2096730828285218, "epoch": 1.7765814266487214, "grad_norm": 1.420886516571045, "learning_rate": 9.562340425268233e-05, "loss": 0.0925740659236908, "mean_token_accuracy": 0.9629011929035187, "num_tokens": 23653389.0, "step": 1320 }, { "entropy": 1.2122852802276611, "epoch": 1.7900403768506057, "grad_norm": 1.1587319374084473, "learning_rate": 9.553189451030019e-05, "loss": 0.09554123878479004, "mean_token_accuracy": 0.9622859060764313, "num_tokens": 23832469.0, "step": 1330 }, { "entropy": 1.2176487922668457, "epoch": 1.80349932705249, "grad_norm": 1.147444248199463, "learning_rate": 9.543948267465115e-05, "loss": 0.09707238674163818, "mean_token_accuracy": 0.9612141191959381, "num_tokens": 24011518.0, "step": 1340 }, { "entropy": 1.2231361389160156, "epoch": 1.8169582772543742, "grad_norm": 1.1775709390640259, "learning_rate": 9.534617057662977e-05, "loss": 0.09654755592346191, "mean_token_accuracy": 0.9617958247661591, "num_tokens": 24190267.0, "step": 1350 }, { "entropy": 1.2070120811462401, "epoch": 1.8304172274562585, "grad_norm": 1.1315947771072388, "learning_rate": 9.525196006496679e-05, "loss": 0.09382581114768981, "mean_token_accuracy": 0.9625270128250122, "num_tokens": 24369982.0, "step": 1360 }, { "entropy": 1.2057390093803406, "epoch": 1.8438761776581427, "grad_norm": 1.1973934173583984, "learning_rate": 9.515685300619271e-05, "loss": 0.09683746099472046, "mean_token_accuracy": 0.9607476830482483, "num_tokens": 24549256.0, "step": 1370 }, { "entropy": 1.207427191734314, "epoch": 1.857335127860027, "grad_norm": 1.3193334341049194, "learning_rate": 9.506085128460065e-05, "loss": 0.09461041688919067, "mean_token_accuracy": 0.9628551185131073, "num_tokens": 24727544.0, "step": 1380 }, { "entropy": 1.2011541604995728, "epoch": 1.8707940780619112, "grad_norm": 1.0681352615356445, "learning_rate": 9.496395680220918e-05, "loss": 0.0960330069065094, "mean_token_accuracy": 0.9622460305690765, "num_tokens": 24907721.0, "step": 1390 }, { "entropy": 1.2034499764442443, "epoch": 1.8842530282637955, "grad_norm": 1.2765101194381714, "learning_rate": 9.486617147872446e-05, "loss": 0.09376740455627441, "mean_token_accuracy": 0.9624415040016174, "num_tokens": 25086496.0, "step": 1400 }, { "entropy": 1.1899038195610045, "epoch": 1.8977119784656797, "grad_norm": 1.1333132982254028, "learning_rate": 9.476749725150235e-05, "loss": 0.09668049812316895, "mean_token_accuracy": 0.9621514558792115, "num_tokens": 25266204.0, "step": 1410 }, { "entropy": 1.1895189881324768, "epoch": 1.911170928667564, "grad_norm": 1.310587763786316, "learning_rate": 9.466793607550995e-05, "loss": 0.0920013129711151, "mean_token_accuracy": 0.963368022441864, "num_tokens": 25445604.0, "step": 1420 }, { "entropy": 1.1946001529693604, "epoch": 1.9246298788694483, "grad_norm": 1.3619959354400635, "learning_rate": 9.45674899232869e-05, "loss": 0.09906838536262512, "mean_token_accuracy": 0.9604476511478424, "num_tokens": 25624947.0, "step": 1430 }, { "entropy": 1.2078219771385192, "epoch": 1.9380888290713325, "grad_norm": 1.152220606803894, "learning_rate": 9.446616078490626e-05, "loss": 0.09479650259017944, "mean_token_accuracy": 0.9627270400524139, "num_tokens": 25804643.0, "step": 1440 }, { "entropy": 1.2207356214523315, "epoch": 1.9515477792732168, "grad_norm": 1.186161994934082, "learning_rate": 9.436395066793518e-05, "loss": 0.09636704921722412, "mean_token_accuracy": 0.9604843854904175, "num_tokens": 25984119.0, "step": 1450 }, { "entropy": 1.205849301815033, "epoch": 1.965006729475101, "grad_norm": 1.409846544265747, "learning_rate": 9.426086159739496e-05, "loss": 0.09743249416351318, "mean_token_accuracy": 0.9608718931674958, "num_tokens": 26163483.0, "step": 1460 }, { "entropy": 1.2089114785194397, "epoch": 1.9784656796769853, "grad_norm": 1.2226805686950684, "learning_rate": 9.415689561572107e-05, "loss": 0.09131012558937072, "mean_token_accuracy": 0.9631811439990997, "num_tokens": 26342666.0, "step": 1470 }, { "entropy": 1.1953012466430664, "epoch": 1.9919246298788695, "grad_norm": 1.0700947046279907, "learning_rate": 9.405205478272267e-05, "loss": 0.09140577316284179, "mean_token_accuracy": 0.9642649590969086, "num_tokens": 26521895.0, "step": 1480 }, { "epoch": 2.0, "eval_entropy": 1.1869331590688912, "eval_loss": 0.10991495102643967, "eval_mean_token_accuracy": 0.9554494638351878, "eval_num_tokens": 26629596.0, "eval_runtime": 12.7631, "eval_samples_per_second": 391.753, "eval_steps_per_second": 12.301, "step": 1486 }, { "entropy": 1.1832876205444336, "epoch": 2.005383580080754, "grad_norm": 1.0044941902160645, "learning_rate": 9.394634117554173e-05, "loss": 0.0840892255306244, "mean_token_accuracy": 0.967725521326065, "num_tokens": 26701394.0, "step": 1490 }, { "entropy": 1.159238350391388, "epoch": 2.018842530282638, "grad_norm": 1.4198471307754517, "learning_rate": 9.38397568886119e-05, "loss": 0.07137876152992248, "mean_token_accuracy": 0.9723187386989594, "num_tokens": 26880890.0, "step": 1500 }, { "entropy": 1.165151631832123, "epoch": 2.0323014804845223, "grad_norm": 1.1602118015289307, "learning_rate": 9.373230403361712e-05, "loss": 0.06463043689727783, "mean_token_accuracy": 0.9757274091243744, "num_tokens": 27059741.0, "step": 1510 }, { "entropy": 1.1644548654556275, "epoch": 2.0457604306864066, "grad_norm": 1.3322592973709106, "learning_rate": 9.362398473944958e-05, "loss": 0.07388677597045898, "mean_token_accuracy": 0.971617478132248, "num_tokens": 27238125.0, "step": 1520 }, { "entropy": 1.1564043641090394, "epoch": 2.059219380888291, "grad_norm": 1.1690629720687866, "learning_rate": 9.35148011521677e-05, "loss": 0.06990204453468322, "mean_token_accuracy": 0.9719981133937836, "num_tokens": 27417172.0, "step": 1530 }, { "entropy": 1.1576861262321472, "epoch": 2.072678331090175, "grad_norm": 1.7016727924346924, "learning_rate": 9.340475543495364e-05, "loss": 0.06850625276565551, "mean_token_accuracy": 0.9732699453830719, "num_tokens": 27596848.0, "step": 1540 }, { "entropy": 1.1597527265548706, "epoch": 2.0861372812920593, "grad_norm": 1.1524600982666016, "learning_rate": 9.329384976807023e-05, "loss": 0.06980778574943543, "mean_token_accuracy": 0.9729204118251801, "num_tokens": 27775617.0, "step": 1550 }, { "entropy": 1.1575467109680175, "epoch": 2.0995962314939436, "grad_norm": 1.4498176574707031, "learning_rate": 9.318208634881802e-05, "loss": 0.07390267252922059, "mean_token_accuracy": 0.9713942348957062, "num_tokens": 27954133.0, "step": 1560 }, { "entropy": 1.1575489521026612, "epoch": 2.113055181695828, "grad_norm": 1.243706464767456, "learning_rate": 9.306946739149161e-05, "loss": 0.06798491477966309, "mean_token_accuracy": 0.973270720243454, "num_tokens": 28133292.0, "step": 1570 }, { "entropy": 1.1514037609100343, "epoch": 2.126514131897712, "grad_norm": 1.256933331489563, "learning_rate": 9.29559951273358e-05, "loss": 0.0749699592590332, "mean_token_accuracy": 0.9699350416660308, "num_tokens": 28312726.0, "step": 1580 }, { "entropy": 1.1511113524436951, "epoch": 2.1399730820995964, "grad_norm": 1.1914122104644775, "learning_rate": 9.284167180450141e-05, "loss": 0.06752681732177734, "mean_token_accuracy": 0.9743177771568299, "num_tokens": 28492614.0, "step": 1590 }, { "entropy": 1.1382261991500855, "epoch": 2.1534320323014806, "grad_norm": 1.109575867652893, "learning_rate": 9.272649968800069e-05, "loss": 0.06449686884880065, "mean_token_accuracy": 0.9755833566188812, "num_tokens": 28671719.0, "step": 1600 }, { "entropy": 1.1397038459777833, "epoch": 2.166890982503365, "grad_norm": 1.3151781558990479, "learning_rate": 9.26104810596625e-05, "loss": 0.07052424550056458, "mean_token_accuracy": 0.972499680519104, "num_tokens": 28851056.0, "step": 1610 }, { "entropy": 1.136834406852722, "epoch": 2.180349932705249, "grad_norm": 1.4410197734832764, "learning_rate": 9.249361821808708e-05, "loss": 0.06850321292877197, "mean_token_accuracy": 0.9728572845458985, "num_tokens": 29030750.0, "step": 1620 }, { "entropy": 1.140534520149231, "epoch": 2.1938088829071334, "grad_norm": 1.1493765115737915, "learning_rate": 9.237591347860052e-05, "loss": 0.06934296488761901, "mean_token_accuracy": 0.972960364818573, "num_tokens": 29210336.0, "step": 1630 }, { "entropy": 1.138876986503601, "epoch": 2.2072678331090176, "grad_norm": 1.0383925437927246, "learning_rate": 9.225736917320886e-05, "loss": 0.06877213716506958, "mean_token_accuracy": 0.9730811774730682, "num_tokens": 29389788.0, "step": 1640 }, { "entropy": 1.1449824213981628, "epoch": 2.220726783310902, "grad_norm": 1.374165654182434, "learning_rate": 9.213798765055187e-05, "loss": 0.07060860991477966, "mean_token_accuracy": 0.9721363008022308, "num_tokens": 29569111.0, "step": 1650 }, { "entropy": 1.143638014793396, "epoch": 2.234185733512786, "grad_norm": 1.0196412801742554, "learning_rate": 9.20177712758566e-05, "loss": 0.07119340896606445, "mean_token_accuracy": 0.9731849789619446, "num_tokens": 29747986.0, "step": 1660 }, { "entropy": 1.1349515676498414, "epoch": 2.2476446837146704, "grad_norm": 1.1247919797897339, "learning_rate": 9.189672243089046e-05, "loss": 0.07071832418441773, "mean_token_accuracy": 0.9731756567955017, "num_tokens": 29927276.0, "step": 1670 }, { "entropy": 1.1376260280609132, "epoch": 2.2611036339165547, "grad_norm": 1.4197320938110352, "learning_rate": 9.177484351391402e-05, "loss": 0.07115572690963745, "mean_token_accuracy": 0.9723047018051147, "num_tokens": 30106267.0, "step": 1680 }, { "entropy": 1.1282797813415528, "epoch": 2.274562584118439, "grad_norm": 1.0774035453796387, "learning_rate": 9.165213693963355e-05, "loss": 0.068689626455307, "mean_token_accuracy": 0.9729084491729736, "num_tokens": 30285658.0, "step": 1690 }, { "entropy": 1.140774166584015, "epoch": 2.288021534320323, "grad_norm": 1.5625728368759155, "learning_rate": 9.152860513915314e-05, "loss": 0.07172787189483643, "mean_token_accuracy": 0.9718713641166687, "num_tokens": 30464579.0, "step": 1700 }, { "entropy": 1.1416746616363525, "epoch": 2.3014804845222074, "grad_norm": 1.2159788608551025, "learning_rate": 9.140425055992648e-05, "loss": 0.07109695672988892, "mean_token_accuracy": 0.9723007261753083, "num_tokens": 30643566.0, "step": 1710 }, { "entropy": 1.137896478176117, "epoch": 2.3149394347240917, "grad_norm": 1.1671864986419678, "learning_rate": 9.127907566570853e-05, "loss": 0.07048168182373046, "mean_token_accuracy": 0.9725371599197388, "num_tokens": 30822593.0, "step": 1720 }, { "entropy": 1.121722447872162, "epoch": 2.328398384925976, "grad_norm": 1.3899428844451904, "learning_rate": 9.115308293650653e-05, "loss": 0.07030471563339233, "mean_token_accuracy": 0.972334086894989, "num_tokens": 31001872.0, "step": 1730 }, { "entropy": 1.1253960013389588, "epoch": 2.34185733512786, "grad_norm": 1.3878816366195679, "learning_rate": 9.102627486853099e-05, "loss": 0.06956568956375123, "mean_token_accuracy": 0.9728380262851715, "num_tokens": 31181325.0, "step": 1740 }, { "entropy": 1.1276877760887145, "epoch": 2.3553162853297445, "grad_norm": 1.057102084159851, "learning_rate": 9.089865397414614e-05, "loss": 0.07267707586288452, "mean_token_accuracy": 0.9716822624206543, "num_tokens": 31360101.0, "step": 1750 }, { "entropy": 1.1322511553764343, "epoch": 2.3687752355316287, "grad_norm": 0.9549157023429871, "learning_rate": 9.077022278182024e-05, "loss": 0.0700565218925476, "mean_token_accuracy": 0.9737613677978516, "num_tokens": 31539212.0, "step": 1760 }, { "entropy": 1.1198163986206056, "epoch": 2.382234185733513, "grad_norm": 1.2134599685668945, "learning_rate": 9.064098383607545e-05, "loss": 0.07131816148757934, "mean_token_accuracy": 0.9710030317306518, "num_tokens": 31718375.0, "step": 1770 }, { "entropy": 1.1219655275344849, "epoch": 2.3956931359353972, "grad_norm": 1.3444453477859497, "learning_rate": 9.051093969743738e-05, "loss": 0.06774230003356933, "mean_token_accuracy": 0.9737627685070038, "num_tokens": 31897547.0, "step": 1780 }, { "entropy": 1.112294065952301, "epoch": 2.409152086137281, "grad_norm": 1.5304001569747925, "learning_rate": 9.03800929423844e-05, "loss": 0.0724343478679657, "mean_token_accuracy": 0.9721882760524749, "num_tokens": 32076717.0, "step": 1790 }, { "entropy": 1.1336388826370238, "epoch": 2.4226110363391653, "grad_norm": 1.2315043210983276, "learning_rate": 9.024844616329662e-05, "loss": 0.07212550640106201, "mean_token_accuracy": 0.9726522386074066, "num_tokens": 32255927.0, "step": 1800 }, { "entropy": 1.1251046061515808, "epoch": 2.4360699865410496, "grad_norm": 1.3251651525497437, "learning_rate": 9.011600196840447e-05, "loss": 0.07009173035621644, "mean_token_accuracy": 0.9724473178386688, "num_tokens": 32434929.0, "step": 1810 }, { "entropy": 1.1328514456748962, "epoch": 2.449528936742934, "grad_norm": 1.2601144313812256, "learning_rate": 8.998276298173707e-05, "loss": 0.0719257116317749, "mean_token_accuracy": 0.9720721006393432, "num_tokens": 32614063.0, "step": 1820 }, { "entropy": 1.136410367488861, "epoch": 2.462987886944818, "grad_norm": 1.2086918354034424, "learning_rate": 8.984873184307017e-05, "loss": 0.07017306089401246, "mean_token_accuracy": 0.9722030460834503, "num_tokens": 32793068.0, "step": 1830 }, { "entropy": 1.13520849943161, "epoch": 2.4764468371467023, "grad_norm": 1.453801155090332, "learning_rate": 8.971391120787397e-05, "loss": 0.07180649638175965, "mean_token_accuracy": 0.9726110398769379, "num_tokens": 32972445.0, "step": 1840 }, { "entropy": 1.1437011241912842, "epoch": 2.4899057873485866, "grad_norm": 1.1886014938354492, "learning_rate": 8.957830374726042e-05, "loss": 0.07153818607330323, "mean_token_accuracy": 0.9720338463783265, "num_tokens": 33151976.0, "step": 1850 }, { "entropy": 1.1325899600982665, "epoch": 2.503364737550471, "grad_norm": 1.1960384845733643, "learning_rate": 8.944191214793028e-05, "loss": 0.06935594081878663, "mean_token_accuracy": 0.9729972183704376, "num_tokens": 33330611.0, "step": 1860 }, { "entropy": 1.127972149848938, "epoch": 2.516823687752355, "grad_norm": 1.1048696041107178, "learning_rate": 8.930473911212e-05, "loss": 0.07217252850532532, "mean_token_accuracy": 0.9718475580215454, "num_tokens": 33509614.0, "step": 1870 }, { "entropy": 1.1299184799194335, "epoch": 2.5302826379542394, "grad_norm": 1.6520979404449463, "learning_rate": 8.916678735754809e-05, "loss": 0.07317680716514588, "mean_token_accuracy": 0.971524566411972, "num_tokens": 33688724.0, "step": 1880 }, { "entropy": 1.1270474672317505, "epoch": 2.5437415881561236, "grad_norm": 1.1285676956176758, "learning_rate": 8.902805961736123e-05, "loss": 0.07085765600204467, "mean_token_accuracy": 0.9733552634716034, "num_tokens": 33868061.0, "step": 1890 }, { "entropy": 1.1217491984367371, "epoch": 2.557200538358008, "grad_norm": 1.2642406225204468, "learning_rate": 8.88885586400803e-05, "loss": 0.06978695392608643, "mean_token_accuracy": 0.9726768732070923, "num_tokens": 34047387.0, "step": 1900 }, { "entropy": 1.1316059112548829, "epoch": 2.570659488559892, "grad_norm": 1.3016549348831177, "learning_rate": 8.874828718954576e-05, "loss": 0.07102057337760925, "mean_token_accuracy": 0.9723258554935456, "num_tokens": 34227141.0, "step": 1910 }, { "entropy": 1.137197768688202, "epoch": 2.5841184387617764, "grad_norm": 1.1534605026245117, "learning_rate": 8.86072480448629e-05, "loss": 0.07511197328567505, "mean_token_accuracy": 0.9702324509620667, "num_tokens": 34406526.0, "step": 1920 }, { "entropy": 1.1349146008491515, "epoch": 2.5975773889636606, "grad_norm": 1.3732489347457886, "learning_rate": 8.84654440003469e-05, "loss": 0.07147140502929687, "mean_token_accuracy": 0.9726706743240356, "num_tokens": 34586116.0, "step": 1930 }, { "entropy": 1.1294147491455078, "epoch": 2.611036339165545, "grad_norm": 0.9715967178344727, "learning_rate": 8.83228778654674e-05, "loss": 0.07225455045700073, "mean_token_accuracy": 0.9726594388484955, "num_tokens": 34765289.0, "step": 1940 }, { "entropy": 1.130816388130188, "epoch": 2.624495289367429, "grad_norm": 1.293736219406128, "learning_rate": 8.817955246479276e-05, "loss": 0.06845389604568482, "mean_token_accuracy": 0.9736943006515503, "num_tokens": 34944224.0, "step": 1950 }, { "entropy": 1.1200148224830628, "epoch": 2.6379542395693134, "grad_norm": 1.2962090969085693, "learning_rate": 8.803547063793422e-05, "loss": 0.07189736366271973, "mean_token_accuracy": 0.9717683315277099, "num_tokens": 35123825.0, "step": 1960 }, { "entropy": 1.1301296949386597, "epoch": 2.6514131897711977, "grad_norm": 1.4734028577804565, "learning_rate": 8.789063523948958e-05, "loss": 0.0702283263206482, "mean_token_accuracy": 0.9727118015289307, "num_tokens": 35302914.0, "step": 1970 }, { "entropy": 1.1409568905830383, "epoch": 2.664872139973082, "grad_norm": 1.4132834672927856, "learning_rate": 8.774504913898663e-05, "loss": 0.07676968574523926, "mean_token_accuracy": 0.9695852339267731, "num_tokens": 35481783.0, "step": 1980 }, { "entropy": 1.1532965421676635, "epoch": 2.678331090174966, "grad_norm": 1.1802046298980713, "learning_rate": 8.75987152208264e-05, "loss": 0.06539074182510377, "mean_token_accuracy": 0.9745090186595917, "num_tokens": 35660431.0, "step": 1990 }, { "entropy": 1.1450922250747682, "epoch": 2.6917900403768504, "grad_norm": 1.1922194957733154, "learning_rate": 8.745163638422583e-05, "loss": 0.07205181121826172, "mean_token_accuracy": 0.9712056815624237, "num_tokens": 35839308.0, "step": 2000 }, { "entropy": 1.1444142818450929, "epoch": 2.7052489905787347, "grad_norm": 1.1868208646774292, "learning_rate": 8.730381554316051e-05, "loss": 0.07235864400863648, "mean_token_accuracy": 0.9725943446159363, "num_tokens": 36018734.0, "step": 2010 }, { "entropy": 1.133234965801239, "epoch": 2.718707940780619, "grad_norm": 1.4924181699752808, "learning_rate": 8.715525562630687e-05, "loss": 0.07137352228164673, "mean_token_accuracy": 0.9720249474048615, "num_tokens": 36197607.0, "step": 2020 }, { "entropy": 1.150734007358551, "epoch": 2.732166890982503, "grad_norm": 1.0833524465560913, "learning_rate": 8.700595957698411e-05, "loss": 0.07287259101867676, "mean_token_accuracy": 0.9720607042312622, "num_tokens": 36377137.0, "step": 2030 }, { "entropy": 1.15074782371521, "epoch": 2.7456258411843875, "grad_norm": 1.1532199382781982, "learning_rate": 8.685593035309598e-05, "loss": 0.07189793586730957, "mean_token_accuracy": 0.971609354019165, "num_tokens": 36556438.0, "step": 2040 }, { "entropy": 1.1425267219543458, "epoch": 2.7590847913862717, "grad_norm": 1.65394926071167, "learning_rate": 8.670517092707213e-05, "loss": 0.07228031158447265, "mean_token_accuracy": 0.972437036037445, "num_tokens": 36734936.0, "step": 2050 }, { "entropy": 1.1443095088005066, "epoch": 2.772543741588156, "grad_norm": 1.1756309270858765, "learning_rate": 8.655368428580919e-05, "loss": 0.07032725811004639, "mean_token_accuracy": 0.9716470181941986, "num_tokens": 36913871.0, "step": 2060 }, { "entropy": 1.1369405388832092, "epoch": 2.7860026917900402, "grad_norm": 1.2845005989074707, "learning_rate": 8.640147343061165e-05, "loss": 0.07300193309783935, "mean_token_accuracy": 0.971499103307724, "num_tokens": 37093380.0, "step": 2070 }, { "entropy": 1.1441598296165467, "epoch": 2.7994616419919245, "grad_norm": 1.0858702659606934, "learning_rate": 8.624854137713234e-05, "loss": 0.07180417776107788, "mean_token_accuracy": 0.9721956551074982, "num_tokens": 37272902.0, "step": 2080 }, { "entropy": 1.1500964164733887, "epoch": 2.8129205921938087, "grad_norm": 1.1274408102035522, "learning_rate": 8.609489115531278e-05, "loss": 0.07155272960662842, "mean_token_accuracy": 0.971377295255661, "num_tokens": 37451897.0, "step": 2090 }, { "entropy": 1.1386480689048768, "epoch": 2.826379542395693, "grad_norm": 1.1894359588623047, "learning_rate": 8.594052580932301e-05, "loss": 0.06719542145729065, "mean_token_accuracy": 0.9733208954334259, "num_tokens": 37631343.0, "step": 2100 }, { "entropy": 1.1420456409454345, "epoch": 2.8398384925975773, "grad_norm": 1.266627550125122, "learning_rate": 8.578544839750141e-05, "loss": 0.06839650273323059, "mean_token_accuracy": 0.9735166966915131, "num_tokens": 37811111.0, "step": 2110 }, { "entropy": 1.136332881450653, "epoch": 2.8532974427994615, "grad_norm": 1.6329811811447144, "learning_rate": 8.562966199229399e-05, "loss": 0.0761029601097107, "mean_token_accuracy": 0.9703849673271179, "num_tokens": 37991040.0, "step": 2120 }, { "entropy": 1.1302747488021851, "epoch": 2.8667563930013458, "grad_norm": 1.355210304260254, "learning_rate": 8.547316968019363e-05, "loss": 0.07443415522575378, "mean_token_accuracy": 0.971444720029831, "num_tokens": 38169539.0, "step": 2130 }, { "entropy": 1.1407261610031127, "epoch": 2.88021534320323, "grad_norm": 1.0608882904052734, "learning_rate": 8.531597456167885e-05, "loss": 0.07463377118110656, "mean_token_accuracy": 0.9705821752548218, "num_tokens": 38348423.0, "step": 2140 }, { "entropy": 1.121661639213562, "epoch": 2.8936742934051143, "grad_norm": 1.4264295101165771, "learning_rate": 8.515807975115239e-05, "loss": 0.06971895098686218, "mean_token_accuracy": 0.9723855495452881, "num_tokens": 38527736.0, "step": 2150 }, { "entropy": 1.117723774909973, "epoch": 2.9071332436069985, "grad_norm": 1.114941120147705, "learning_rate": 8.499948837687959e-05, "loss": 0.07229661345481872, "mean_token_accuracy": 0.9710954666137696, "num_tokens": 38706893.0, "step": 2160 }, { "entropy": 1.12820805311203, "epoch": 2.920592193808883, "grad_norm": 1.219247817993164, "learning_rate": 8.484020358092625e-05, "loss": 0.07078794836997986, "mean_token_accuracy": 0.9723483324050903, "num_tokens": 38886237.0, "step": 2170 }, { "entropy": 1.1263705372810364, "epoch": 2.934051144010767, "grad_norm": 1.9336564540863037, "learning_rate": 8.468022851909657e-05, "loss": 0.07343355417251587, "mean_token_accuracy": 0.9712878108024597, "num_tokens": 39065800.0, "step": 2180 }, { "entropy": 1.1296117424964904, "epoch": 2.9475100942126513, "grad_norm": 1.1307294368743896, "learning_rate": 8.451956636087046e-05, "loss": 0.07211248874664307, "mean_token_accuracy": 0.9710810720920563, "num_tokens": 39245521.0, "step": 2190 }, { "entropy": 1.1172988891601563, "epoch": 2.9609690444145356, "grad_norm": 1.1942963600158691, "learning_rate": 8.435822028934087e-05, "loss": 0.07098879814147949, "mean_token_accuracy": 0.9725285410881043, "num_tokens": 39424709.0, "step": 2200 }, { "entropy": 1.1155216932296752, "epoch": 2.97442799461642, "grad_norm": 1.241036057472229, "learning_rate": 8.41961935011506e-05, "loss": 0.06951723098754883, "mean_token_accuracy": 0.9731462299823761, "num_tokens": 39603326.0, "step": 2210 }, { "entropy": 1.1296806812286377, "epoch": 2.987886944818304, "grad_norm": 1.249014139175415, "learning_rate": 8.403348920642911e-05, "loss": 0.07304394245147705, "mean_token_accuracy": 0.9720990836620331, "num_tokens": 39782396.0, "step": 2220 }, { "epoch": 3.0, "eval_entropy": 1.1091555235492196, "eval_loss": 0.10757029801607132, "eval_mean_token_accuracy": 0.9568785178433558, "eval_num_tokens": 39944284.0, "eval_runtime": 12.7473, "eval_samples_per_second": 392.239, "eval_steps_per_second": 12.316, "step": 2229 } ], "logging_steps": 10, "max_steps": 7430, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9036814964267418e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }