{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 939, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.771598082780838, "epoch": 0.032, "grad_norm": 0.2451171875, "learning_rate": 6.206896551724138e-05, "loss": 2.916146087646484, "mean_token_accuracy": 0.45416649207472803, "num_tokens": 55650.0, "step": 10 }, { "entropy": 2.3006236433982847, "epoch": 0.064, "grad_norm": 0.23828125, "learning_rate": 0.00013103448275862068, "loss": 2.4864336013793946, "mean_token_accuracy": 0.5021730229258538, "num_tokens": 110943.0, "step": 20 }, { "entropy": 1.8387477725744248, "epoch": 0.096, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.766120147705078, "mean_token_accuracy": 0.6066308304667473, "num_tokens": 166476.0, "step": 30 }, { "entropy": 1.2004003927111626, "epoch": 0.128, "grad_norm": 0.3203125, "learning_rate": 0.0001978021978021978, "loss": 1.1438531875610352, "mean_token_accuracy": 0.7330661401152611, "num_tokens": 221809.0, "step": 40 }, { "entropy": 0.8197565108537674, "epoch": 0.16, "grad_norm": 0.34765625, "learning_rate": 0.00019560439560439562, "loss": 0.7494671821594239, "mean_token_accuracy": 0.8239153817296028, "num_tokens": 277841.0, "step": 50 }, { "entropy": 0.5685673624277114, "epoch": 0.192, "grad_norm": 0.369140625, "learning_rate": 0.00019340659340659342, "loss": 0.49509191513061523, "mean_token_accuracy": 0.8886354997754097, "num_tokens": 333258.0, "step": 60 }, { "entropy": 0.36241610124707224, "epoch": 0.224, "grad_norm": 0.306640625, "learning_rate": 0.00019120879120879122, "loss": 0.29932661056518556, "mean_token_accuracy": 0.9331418961286545, "num_tokens": 388437.0, "step": 70 }, { "entropy": 0.2608797810971737, "epoch": 0.256, "grad_norm": 0.322265625, "learning_rate": 0.00018901098901098903, "loss": 0.20479197502136232, "mean_token_accuracy": 0.9527293920516968, "num_tokens": 444483.0, "step": 80 }, { "entropy": 0.20162589177489282, "epoch": 0.288, "grad_norm": 0.1904296875, "learning_rate": 0.00018681318681318683, "loss": 0.16634706258773804, "mean_token_accuracy": 0.9603863671422005, "num_tokens": 500408.0, "step": 90 }, { "entropy": 0.1615206029266119, "epoch": 0.32, "grad_norm": 0.1904296875, "learning_rate": 0.00018461538461538463, "loss": 0.14311870336532592, "mean_token_accuracy": 0.9628557220101357, "num_tokens": 556058.0, "step": 100 }, { "entropy": 0.1455086786299944, "epoch": 0.352, "grad_norm": 0.1796875, "learning_rate": 0.0001824175824175824, "loss": 0.1271807074546814, "mean_token_accuracy": 0.9651171401143074, "num_tokens": 611407.0, "step": 110 }, { "entropy": 0.13460372295230627, "epoch": 0.384, "grad_norm": 0.1572265625, "learning_rate": 0.00018021978021978024, "loss": 0.11452269554138184, "mean_token_accuracy": 0.9668730065226555, "num_tokens": 667323.0, "step": 120 }, { "entropy": 0.126934945769608, "epoch": 0.416, "grad_norm": 0.10986328125, "learning_rate": 0.00017802197802197802, "loss": 0.10631006956100464, "mean_token_accuracy": 0.9682586997747421, "num_tokens": 723233.0, "step": 130 }, { "entropy": 0.12308279145509005, "epoch": 0.448, "grad_norm": 0.1806640625, "learning_rate": 0.00017582417582417582, "loss": 0.09787563681602478, "mean_token_accuracy": 0.9693531423807145, "num_tokens": 779623.0, "step": 140 }, { "entropy": 0.11310338769108057, "epoch": 0.48, "grad_norm": 0.1552734375, "learning_rate": 0.00017362637362637365, "loss": 0.09501280188560486, "mean_token_accuracy": 0.9697160139679909, "num_tokens": 835343.0, "step": 150 }, { "entropy": 0.10735327322036028, "epoch": 0.512, "grad_norm": 0.12109375, "learning_rate": 0.00017142857142857143, "loss": 0.0945986807346344, "mean_token_accuracy": 0.9699150815606117, "num_tokens": 890526.0, "step": 160 }, { "entropy": 0.10488443765789271, "epoch": 0.544, "grad_norm": 0.08935546875, "learning_rate": 0.00016923076923076923, "loss": 0.09071503281593322, "mean_token_accuracy": 0.9702877476811409, "num_tokens": 946551.0, "step": 170 }, { "entropy": 0.09990130253136158, "epoch": 0.576, "grad_norm": 0.10986328125, "learning_rate": 0.00016703296703296706, "loss": 0.08788512349128723, "mean_token_accuracy": 0.9700309321284294, "num_tokens": 1002250.0, "step": 180 }, { "entropy": 0.1006152719259262, "epoch": 0.608, "grad_norm": 0.138671875, "learning_rate": 0.00016483516483516484, "loss": 0.08786565065383911, "mean_token_accuracy": 0.9703836083412171, "num_tokens": 1057942.0, "step": 190 }, { "entropy": 0.09826808385550975, "epoch": 0.64, "grad_norm": 0.10693359375, "learning_rate": 0.00016263736263736264, "loss": 0.08827171325683594, "mean_token_accuracy": 0.9702799677848816, "num_tokens": 1113313.0, "step": 200 }, { "entropy": 0.09519640635699034, "epoch": 0.672, "grad_norm": 0.109375, "learning_rate": 0.00016043956043956044, "loss": 0.08511611819267273, "mean_token_accuracy": 0.9707273244857788, "num_tokens": 1168866.0, "step": 210 }, { "entropy": 0.09626698959618807, "epoch": 0.704, "grad_norm": 0.07958984375, "learning_rate": 0.00015824175824175824, "loss": 0.08481809496879578, "mean_token_accuracy": 0.971166367828846, "num_tokens": 1224112.0, "step": 220 }, { "entropy": 0.08999720010906458, "epoch": 0.736, "grad_norm": 0.11962890625, "learning_rate": 0.00015604395604395605, "loss": 0.08373072743415833, "mean_token_accuracy": 0.9710352584719658, "num_tokens": 1279497.0, "step": 230 }, { "entropy": 0.0909602127969265, "epoch": 0.768, "grad_norm": 0.08251953125, "learning_rate": 0.00015384615384615385, "loss": 0.0811634123325348, "mean_token_accuracy": 0.9713989913463592, "num_tokens": 1335147.0, "step": 240 }, { "entropy": 0.08677519466727972, "epoch": 0.8, "grad_norm": 0.0791015625, "learning_rate": 0.00015164835164835165, "loss": 0.08006779551506042, "mean_token_accuracy": 0.9713309288024903, "num_tokens": 1390441.0, "step": 250 }, { "entropy": 0.08524699918925763, "epoch": 0.832, "grad_norm": 0.1142578125, "learning_rate": 0.00014945054945054946, "loss": 0.07839072942733764, "mean_token_accuracy": 0.9715656459331512, "num_tokens": 1446334.0, "step": 260 }, { "entropy": 0.08736930266022683, "epoch": 0.864, "grad_norm": 0.10791015625, "learning_rate": 0.00014725274725274726, "loss": 0.07965806126594543, "mean_token_accuracy": 0.971031291782856, "num_tokens": 1502214.0, "step": 270 }, { "entropy": 0.08625259138643741, "epoch": 0.896, "grad_norm": 0.08447265625, "learning_rate": 0.00014505494505494506, "loss": 0.0801069438457489, "mean_token_accuracy": 0.9713141709566117, "num_tokens": 1557731.0, "step": 280 }, { "entropy": 0.08634743597358466, "epoch": 0.928, "grad_norm": 0.06298828125, "learning_rate": 0.00014285714285714287, "loss": 0.07907066345214844, "mean_token_accuracy": 0.9716951295733451, "num_tokens": 1613126.0, "step": 290 }, { "entropy": 0.08533936887979507, "epoch": 0.96, "grad_norm": 0.06103515625, "learning_rate": 0.00014065934065934067, "loss": 0.07907315492630004, "mean_token_accuracy": 0.9721988439559937, "num_tokens": 1668452.0, "step": 300 }, { "entropy": 0.08410668671131134, "epoch": 0.992, "grad_norm": 0.0791015625, "learning_rate": 0.00013846153846153847, "loss": 0.07861064672470093, "mean_token_accuracy": 0.971791522204876, "num_tokens": 1724177.0, "step": 310 }, { "entropy": 0.08109197275418985, "epoch": 1.0224, "grad_norm": 0.07177734375, "learning_rate": 0.00013626373626373628, "loss": 0.07589302062988282, "mean_token_accuracy": 0.9723429350476516, "num_tokens": 1777420.0, "step": 320 }, { "entropy": 0.0814886923879385, "epoch": 1.0544, "grad_norm": 0.10693359375, "learning_rate": 0.00013406593406593405, "loss": 0.07642998099327088, "mean_token_accuracy": 0.97195183634758, "num_tokens": 1833060.0, "step": 330 }, { "entropy": 0.08034903313964606, "epoch": 1.0864, "grad_norm": 0.07373046875, "learning_rate": 0.00013186813186813188, "loss": 0.07447389960289001, "mean_token_accuracy": 0.9732914686203002, "num_tokens": 1889075.0, "step": 340 }, { "entropy": 0.07864065244793891, "epoch": 1.1184, "grad_norm": 0.08056640625, "learning_rate": 0.0001296703296703297, "loss": 0.07513262033462524, "mean_token_accuracy": 0.972836098074913, "num_tokens": 1944905.0, "step": 350 }, { "entropy": 0.08301715180277824, "epoch": 1.1504, "grad_norm": 0.09716796875, "learning_rate": 0.00012747252747252746, "loss": 0.07624064683914185, "mean_token_accuracy": 0.9722976922988892, "num_tokens": 2000057.0, "step": 360 }, { "entropy": 0.08098908923566342, "epoch": 1.1824, "grad_norm": 0.059814453125, "learning_rate": 0.00012527472527472527, "loss": 0.07458102107048034, "mean_token_accuracy": 0.9721322387456894, "num_tokens": 2055866.0, "step": 370 }, { "entropy": 0.07686591371893883, "epoch": 1.2144, "grad_norm": 0.06396484375, "learning_rate": 0.0001230769230769231, "loss": 0.07280548810958862, "mean_token_accuracy": 0.972561864554882, "num_tokens": 2111077.0, "step": 380 }, { "entropy": 0.07761757280677557, "epoch": 1.2464, "grad_norm": 0.07470703125, "learning_rate": 0.00012087912087912087, "loss": 0.07433983087539672, "mean_token_accuracy": 0.9725529655814171, "num_tokens": 2166081.0, "step": 390 }, { "entropy": 0.08011266030371189, "epoch": 1.2784, "grad_norm": 0.052001953125, "learning_rate": 0.00011868131868131869, "loss": 0.0738287627696991, "mean_token_accuracy": 0.9728635787963867, "num_tokens": 2221310.0, "step": 400 }, { "entropy": 0.0769817665219307, "epoch": 1.3104, "grad_norm": 0.054931640625, "learning_rate": 0.0001164835164835165, "loss": 0.07387230396270753, "mean_token_accuracy": 0.9729015439748764, "num_tokens": 2277107.0, "step": 410 }, { "entropy": 0.07817615140229464, "epoch": 1.3424, "grad_norm": 0.06787109375, "learning_rate": 0.00011428571428571428, "loss": 0.07262731790542602, "mean_token_accuracy": 0.9729507148265839, "num_tokens": 2332758.0, "step": 420 }, { "entropy": 0.07688614577054978, "epoch": 1.3744, "grad_norm": 0.051025390625, "learning_rate": 0.0001120879120879121, "loss": 0.07327454686164855, "mean_token_accuracy": 0.9719040498137475, "num_tokens": 2388461.0, "step": 430 }, { "entropy": 0.07903551124036312, "epoch": 1.4064, "grad_norm": 0.05126953125, "learning_rate": 0.0001098901098901099, "loss": 0.07202324867248536, "mean_token_accuracy": 0.9729711979627609, "num_tokens": 2443802.0, "step": 440 }, { "entropy": 0.07504601553082466, "epoch": 1.4384000000000001, "grad_norm": 0.0966796875, "learning_rate": 0.0001076923076923077, "loss": 0.07251456379890442, "mean_token_accuracy": 0.9728567853569985, "num_tokens": 2498886.0, "step": 450 }, { "entropy": 0.07635734435170889, "epoch": 1.4704, "grad_norm": 0.07861328125, "learning_rate": 0.0001054945054945055, "loss": 0.07308706045150756, "mean_token_accuracy": 0.9728769212961197, "num_tokens": 2554546.0, "step": 460 }, { "entropy": 0.07706241644918918, "epoch": 1.5024, "grad_norm": 0.052734375, "learning_rate": 0.00010329670329670331, "loss": 0.0728609800338745, "mean_token_accuracy": 0.9726115748286247, "num_tokens": 2609939.0, "step": 470 }, { "entropy": 0.07556705921888351, "epoch": 1.5344, "grad_norm": 0.087890625, "learning_rate": 0.0001010989010989011, "loss": 0.07160326838493347, "mean_token_accuracy": 0.973236757516861, "num_tokens": 2665583.0, "step": 480 }, { "entropy": 0.07504178639501333, "epoch": 1.5664, "grad_norm": 0.08349609375, "learning_rate": 9.89010989010989e-05, "loss": 0.0718912661075592, "mean_token_accuracy": 0.9726392358541489, "num_tokens": 2721224.0, "step": 490 }, { "entropy": 0.07667357344180345, "epoch": 1.5984, "grad_norm": 0.044921875, "learning_rate": 9.670329670329671e-05, "loss": 0.07309556603431702, "mean_token_accuracy": 0.9725197270512581, "num_tokens": 2776793.0, "step": 500 }, { "entropy": 0.07603078782558441, "epoch": 1.6303999999999998, "grad_norm": 0.0673828125, "learning_rate": 9.450549450549451e-05, "loss": 0.07351203560829163, "mean_token_accuracy": 0.9724631071090698, "num_tokens": 2832471.0, "step": 510 }, { "entropy": 0.07741717118769884, "epoch": 1.6623999999999999, "grad_norm": 0.052001953125, "learning_rate": 9.230769230769232e-05, "loss": 0.07223351001739502, "mean_token_accuracy": 0.9723551839590072, "num_tokens": 2888234.0, "step": 520 }, { "entropy": 0.07598806507885456, "epoch": 1.6944, "grad_norm": 0.06689453125, "learning_rate": 9.010989010989012e-05, "loss": 0.07230474948883056, "mean_token_accuracy": 0.9722696229815483, "num_tokens": 2943732.0, "step": 530 }, { "entropy": 0.0750182744115591, "epoch": 1.7264, "grad_norm": 0.04931640625, "learning_rate": 8.791208791208791e-05, "loss": 0.07132035493850708, "mean_token_accuracy": 0.9732247874140739, "num_tokens": 2999654.0, "step": 540 }, { "entropy": 0.07485976945608855, "epoch": 1.7584, "grad_norm": 0.0478515625, "learning_rate": 8.571428571428571e-05, "loss": 0.07085888981819152, "mean_token_accuracy": 0.9734048008918762, "num_tokens": 3055591.0, "step": 550 }, { "entropy": 0.07469552531838416, "epoch": 1.7904, "grad_norm": 0.04638671875, "learning_rate": 8.351648351648353e-05, "loss": 0.07132892608642578, "mean_token_accuracy": 0.9727317884564399, "num_tokens": 3111372.0, "step": 560 }, { "entropy": 0.0737810717895627, "epoch": 1.8224, "grad_norm": 0.052001953125, "learning_rate": 8.131868131868132e-05, "loss": 0.07149158120155334, "mean_token_accuracy": 0.9739102691411972, "num_tokens": 3167376.0, "step": 570 }, { "entropy": 0.0747382478788495, "epoch": 1.8544, "grad_norm": 0.055908203125, "learning_rate": 7.912087912087912e-05, "loss": 0.07183558940887451, "mean_token_accuracy": 0.972593954205513, "num_tokens": 3222797.0, "step": 580 }, { "entropy": 0.07589616179466248, "epoch": 1.8864, "grad_norm": 0.04833984375, "learning_rate": 7.692307692307693e-05, "loss": 0.07035009264945984, "mean_token_accuracy": 0.9727584093809127, "num_tokens": 3278083.0, "step": 590 }, { "entropy": 0.07409894913434982, "epoch": 1.9184, "grad_norm": 0.041015625, "learning_rate": 7.472527472527473e-05, "loss": 0.06983839273452759, "mean_token_accuracy": 0.9737206190824509, "num_tokens": 3334115.0, "step": 600 }, { "entropy": 0.07298169508576394, "epoch": 1.9504000000000001, "grad_norm": 0.055419921875, "learning_rate": 7.252747252747253e-05, "loss": 0.07096859216690063, "mean_token_accuracy": 0.9732470810413361, "num_tokens": 3389990.0, "step": 610 }, { "entropy": 0.07362735010683537, "epoch": 1.9824000000000002, "grad_norm": 0.05078125, "learning_rate": 7.032967032967034e-05, "loss": 0.0709508240222931, "mean_token_accuracy": 0.9730613023042679, "num_tokens": 3445703.0, "step": 620 }, { "entropy": 0.07376520100392793, "epoch": 2.0128, "grad_norm": 0.050048828125, "learning_rate": 6.813186813186814e-05, "loss": 0.06944339275360108, "mean_token_accuracy": 0.973434633330295, "num_tokens": 3498933.0, "step": 630 }, { "entropy": 0.07382834255695343, "epoch": 2.0448, "grad_norm": 0.048583984375, "learning_rate": 6.593406593406594e-05, "loss": 0.07052375078201294, "mean_token_accuracy": 0.9732005745172501, "num_tokens": 3553934.0, "step": 640 }, { "entropy": 0.0728354575112462, "epoch": 2.0768, "grad_norm": 0.09619140625, "learning_rate": 6.373626373626373e-05, "loss": 0.07018245458602905, "mean_token_accuracy": 0.9729705214500427, "num_tokens": 3609458.0, "step": 650 }, { "entropy": 0.07463801130652428, "epoch": 2.1088, "grad_norm": 0.04736328125, "learning_rate": 6.153846153846155e-05, "loss": 0.07015591859817505, "mean_token_accuracy": 0.9737546548247338, "num_tokens": 3664572.0, "step": 660 }, { "entropy": 0.07311667818576098, "epoch": 2.1408, "grad_norm": 0.051513671875, "learning_rate": 5.9340659340659345e-05, "loss": 0.06875128149986268, "mean_token_accuracy": 0.9733711332082748, "num_tokens": 3720225.0, "step": 670 }, { "entropy": 0.0712715208530426, "epoch": 2.1728, "grad_norm": 0.0537109375, "learning_rate": 5.714285714285714e-05, "loss": 0.06806424856185914, "mean_token_accuracy": 0.9737527936697006, "num_tokens": 3776424.0, "step": 680 }, { "entropy": 0.07198722306638956, "epoch": 2.2048, "grad_norm": 0.046630859375, "learning_rate": 5.494505494505495e-05, "loss": 0.06764371991157532, "mean_token_accuracy": 0.9738867044448852, "num_tokens": 3832496.0, "step": 690 }, { "entropy": 0.07190853431820869, "epoch": 2.2368, "grad_norm": 0.0498046875, "learning_rate": 5.274725274725275e-05, "loss": 0.06873984336853027, "mean_token_accuracy": 0.9736966788768768, "num_tokens": 3888101.0, "step": 700 }, { "entropy": 0.07232791539281606, "epoch": 2.2688, "grad_norm": 0.048828125, "learning_rate": 5.054945054945055e-05, "loss": 0.0678622543811798, "mean_token_accuracy": 0.9740407422184945, "num_tokens": 3944308.0, "step": 710 }, { "entropy": 0.07142861131578684, "epoch": 2.3008, "grad_norm": 0.04931640625, "learning_rate": 4.8351648351648355e-05, "loss": 0.06784402132034302, "mean_token_accuracy": 0.9742121011018753, "num_tokens": 4000402.0, "step": 720 }, { "entropy": 0.07213470414280891, "epoch": 2.3327999999999998, "grad_norm": 0.053466796875, "learning_rate": 4.615384615384616e-05, "loss": 0.0697720229625702, "mean_token_accuracy": 0.9730553776025772, "num_tokens": 4055460.0, "step": 730 }, { "entropy": 0.07274228539317847, "epoch": 2.3648, "grad_norm": 0.05126953125, "learning_rate": 4.3956043956043955e-05, "loss": 0.06925151348114014, "mean_token_accuracy": 0.9731604158878326, "num_tokens": 4110709.0, "step": 740 }, { "entropy": 0.0725497305393219, "epoch": 2.3968, "grad_norm": 0.05029296875, "learning_rate": 4.1758241758241765e-05, "loss": 0.0686568260192871, "mean_token_accuracy": 0.9741124615073204, "num_tokens": 4166373.0, "step": 750 }, { "entropy": 0.07181228250265122, "epoch": 2.4288, "grad_norm": 0.05029296875, "learning_rate": 3.956043956043956e-05, "loss": 0.06826226711273194, "mean_token_accuracy": 0.9736026957631111, "num_tokens": 4222123.0, "step": 760 }, { "entropy": 0.0726727832108736, "epoch": 2.4608, "grad_norm": 0.046142578125, "learning_rate": 3.7362637362637365e-05, "loss": 0.06845790147781372, "mean_token_accuracy": 0.9734082207083702, "num_tokens": 4277783.0, "step": 770 }, { "entropy": 0.0728993572294712, "epoch": 2.4928, "grad_norm": 0.050537109375, "learning_rate": 3.516483516483517e-05, "loss": 0.06819941997528076, "mean_token_accuracy": 0.9740725710988045, "num_tokens": 4333255.0, "step": 780 }, { "entropy": 0.07271347604691983, "epoch": 2.5248, "grad_norm": 0.0546875, "learning_rate": 3.296703296703297e-05, "loss": 0.06898128986358643, "mean_token_accuracy": 0.9741450414061547, "num_tokens": 4388480.0, "step": 790 }, { "entropy": 0.07090196693316102, "epoch": 2.5568, "grad_norm": 0.04638671875, "learning_rate": 3.0769230769230774e-05, "loss": 0.06766563653945923, "mean_token_accuracy": 0.9739843040704728, "num_tokens": 4444676.0, "step": 800 }, { "entropy": 0.0716133133508265, "epoch": 2.5888, "grad_norm": 0.050537109375, "learning_rate": 2.857142857142857e-05, "loss": 0.06745712161064148, "mean_token_accuracy": 0.9742712348699569, "num_tokens": 4500509.0, "step": 810 }, { "entropy": 0.07198168560862542, "epoch": 2.6208, "grad_norm": 0.052734375, "learning_rate": 2.6373626373626374e-05, "loss": 0.06861351728439331, "mean_token_accuracy": 0.9730511695146561, "num_tokens": 4555738.0, "step": 820 }, { "entropy": 0.07194693582132458, "epoch": 2.6528, "grad_norm": 0.055419921875, "learning_rate": 2.4175824175824177e-05, "loss": 0.06737480759620666, "mean_token_accuracy": 0.9742139622569084, "num_tokens": 4611454.0, "step": 830 }, { "entropy": 0.07176698800176382, "epoch": 2.6848, "grad_norm": 0.0537109375, "learning_rate": 2.1978021978021977e-05, "loss": 0.06750304102897645, "mean_token_accuracy": 0.9739004611968994, "num_tokens": 4667407.0, "step": 840 }, { "entropy": 0.07237117197364569, "epoch": 2.7168, "grad_norm": 0.04931640625, "learning_rate": 1.978021978021978e-05, "loss": 0.06796355247497558, "mean_token_accuracy": 0.9741956070065498, "num_tokens": 4723097.0, "step": 850 }, { "entropy": 0.07233156580477954, "epoch": 2.7488, "grad_norm": 0.07666015625, "learning_rate": 1.7582417582417584e-05, "loss": 0.06823940873146057, "mean_token_accuracy": 0.9739502936601638, "num_tokens": 4778509.0, "step": 860 }, { "entropy": 0.07178980130702257, "epoch": 2.7808, "grad_norm": 0.056396484375, "learning_rate": 1.5384615384615387e-05, "loss": 0.06750970482826232, "mean_token_accuracy": 0.9740586042404175, "num_tokens": 4834011.0, "step": 870 }, { "entropy": 0.07171082906425, "epoch": 2.8128, "grad_norm": 0.053955078125, "learning_rate": 1.3186813186813187e-05, "loss": 0.06713088154792786, "mean_token_accuracy": 0.9745032519102097, "num_tokens": 4889429.0, "step": 880 }, { "entropy": 0.07134337816387415, "epoch": 2.8448, "grad_norm": 0.056884765625, "learning_rate": 1.0989010989010989e-05, "loss": 0.06686720848083497, "mean_token_accuracy": 0.9745988816022872, "num_tokens": 4944966.0, "step": 890 }, { "entropy": 0.07198897190392017, "epoch": 2.8768000000000002, "grad_norm": 0.051513671875, "learning_rate": 8.791208791208792e-06, "loss": 0.06727443933486939, "mean_token_accuracy": 0.9737225085496902, "num_tokens": 5000546.0, "step": 900 }, { "entropy": 0.07114618215709925, "epoch": 2.9088000000000003, "grad_norm": 0.04833984375, "learning_rate": 6.5934065934065935e-06, "loss": 0.0675000011920929, "mean_token_accuracy": 0.9743592411279678, "num_tokens": 5056400.0, "step": 910 }, { "entropy": 0.07034891471266747, "epoch": 2.9408, "grad_norm": 0.05224609375, "learning_rate": 4.395604395604396e-06, "loss": 0.06710875034332275, "mean_token_accuracy": 0.9740387976169587, "num_tokens": 5111854.0, "step": 920 }, { "entropy": 0.07230036649852992, "epoch": 2.9728, "grad_norm": 0.05419921875, "learning_rate": 2.197802197802198e-06, "loss": 0.06789053678512573, "mean_token_accuracy": 0.9737495318055153, "num_tokens": 5167285.0, "step": 930 } ], "logging_steps": 10, "max_steps": 939, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4252630664691712e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }