{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 13947, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0035850646207897896, "grad_norm": 28.958446502685547, "learning_rate": 3.512544802867384e-06, "loss": 14.3981, "mean_token_accuracy": 0.4658013021945953, "num_tokens": 631305.0, "step": 50 }, { "epoch": 0.0035850646207897896, "eval_loss": 3.598968505859375, "eval_mean_token_accuracy": 0.4642415362596512, "eval_num_tokens": 631305.0, "eval_runtime": 55.3723, "eval_samples_per_second": 7.224, "eval_steps_per_second": 0.903, "step": 50 }, { "epoch": 0.007170129241579579, "grad_norm": 46.98331832885742, "learning_rate": 7.096774193548387e-06, "loss": 13.6155, "mean_token_accuracy": 0.47677032694220545, "num_tokens": 1263143.0, "step": 100 }, { "epoch": 0.007170129241579579, "eval_loss": 3.242854595184326, "eval_mean_token_accuracy": 0.4895547354221344, "eval_num_tokens": 1263143.0, "eval_runtime": 56.3676, "eval_samples_per_second": 7.096, "eval_steps_per_second": 0.887, "step": 100 }, { "epoch": 0.01075519386236937, "grad_norm": 23.78474235534668, "learning_rate": 1.0681003584229391e-05, "loss": 11.8849, "mean_token_accuracy": 0.5030167695879936, "num_tokens": 1896120.0, "step": 150 }, { "epoch": 0.01075519386236937, "eval_loss": 2.791355609893799, "eval_mean_token_accuracy": 0.5165965485572815, "eval_num_tokens": 1896120.0, "eval_runtime": 55.265, "eval_samples_per_second": 7.238, "eval_steps_per_second": 0.905, "step": 150 }, { "epoch": 0.014340258483159158, "grad_norm": 11.690914154052734, "learning_rate": 1.4265232974910395e-05, "loss": 9.8852, "mean_token_accuracy": 0.5450691656768322, "num_tokens": 2527332.0, "step": 200 }, { "epoch": 0.014340258483159158, "eval_loss": 2.1748604774475098, "eval_mean_token_accuracy": 0.5758289074897767, "eval_num_tokens": 2527332.0, "eval_runtime": 55.335, "eval_samples_per_second": 7.229, "eval_steps_per_second": 0.904, "step": 200 }, { "epoch": 0.01792532310394895, "grad_norm": 9.011432647705078, "learning_rate": 1.78494623655914e-05, "loss": 7.7315, "mean_token_accuracy": 0.5919728323817253, "num_tokens": 3158451.0, "step": 250 }, { "epoch": 0.01792532310394895, "eval_loss": 1.7914152145385742, "eval_mean_token_accuracy": 0.6036396706104279, "eval_num_tokens": 3158451.0, "eval_runtime": 55.3537, "eval_samples_per_second": 7.226, "eval_steps_per_second": 0.903, "step": 250 }, { "epoch": 0.02151038772473874, "grad_norm": 9.172738075256348, "learning_rate": 2.1433691756272405e-05, "loss": 6.6634, "mean_token_accuracy": 0.6193091833591461, "num_tokens": 3790537.0, "step": 300 }, { "epoch": 0.02151038772473874, "eval_loss": 1.5858986377716064, "eval_mean_token_accuracy": 0.6325684702396392, "eval_num_tokens": 3790537.0, "eval_runtime": 55.5221, "eval_samples_per_second": 7.204, "eval_steps_per_second": 0.901, "step": 300 }, { "epoch": 0.025095452345528527, "grad_norm": 6.380577087402344, "learning_rate": 2.5017921146953403e-05, "loss": 5.9955, "mean_token_accuracy": 0.6299453395605087, "num_tokens": 4416803.0, "step": 350 }, { "epoch": 0.025095452345528527, "eval_loss": 1.4283970594406128, "eval_mean_token_accuracy": 0.637500970363617, "eval_num_tokens": 4416803.0, "eval_runtime": 55.6098, "eval_samples_per_second": 7.193, "eval_steps_per_second": 0.899, "step": 350 }, { "epoch": 0.028680516966318317, "grad_norm": 8.31059455871582, "learning_rate": 2.860215053763441e-05, "loss": 5.6524, "mean_token_accuracy": 0.6386667934060096, "num_tokens": 5049525.0, "step": 400 }, { "epoch": 0.028680516966318317, "eval_loss": 1.4044820070266724, "eval_mean_token_accuracy": 0.6408572208881378, "eval_num_tokens": 5049525.0, "eval_runtime": 55.375, "eval_samples_per_second": 7.223, "eval_steps_per_second": 0.903, "step": 400 }, { "epoch": 0.03226558158710811, "grad_norm": 8.33178997039795, "learning_rate": 3.218637992831541e-05, "loss": 5.5798, "mean_token_accuracy": 0.6421743601560592, "num_tokens": 5681852.0, "step": 450 }, { "epoch": 0.03226558158710811, "eval_loss": 1.390726923942566, "eval_mean_token_accuracy": 0.642348815202713, "eval_num_tokens": 5681852.0, "eval_runtime": 55.3167, "eval_samples_per_second": 7.231, "eval_steps_per_second": 0.904, "step": 450 }, { "epoch": 0.0358506462078979, "grad_norm": 6.159327507019043, "learning_rate": 3.577060931899642e-05, "loss": 5.5753, "mean_token_accuracy": 0.6416634133458138, "num_tokens": 6314159.0, "step": 500 }, { "epoch": 0.0358506462078979, "eval_loss": 1.3759286403656006, "eval_mean_token_accuracy": 0.6448968076705932, "eval_num_tokens": 6314159.0, "eval_runtime": 55.4132, "eval_samples_per_second": 7.219, "eval_steps_per_second": 0.902, "step": 500 }, { "epoch": 0.03943571082868769, "grad_norm": 7.295239448547363, "learning_rate": 3.935483870967742e-05, "loss": 5.4486, "mean_token_accuracy": 0.6444561332464218, "num_tokens": 6948430.0, "step": 550 }, { "epoch": 0.03943571082868769, "eval_loss": 1.3677067756652832, "eval_mean_token_accuracy": 0.6456243467330932, "eval_num_tokens": 6948430.0, "eval_runtime": 55.4348, "eval_samples_per_second": 7.216, "eval_steps_per_second": 0.902, "step": 550 }, { "epoch": 0.04302077544947748, "grad_norm": 8.140225410461426, "learning_rate": 4.2939068100358425e-05, "loss": 5.491, "mean_token_accuracy": 0.6452211833000183, "num_tokens": 7574739.0, "step": 600 }, { "epoch": 0.04302077544947748, "eval_loss": 1.3580710887908936, "eval_mean_token_accuracy": 0.6465290606021881, "eval_num_tokens": 7574739.0, "eval_runtime": 55.3881, "eval_samples_per_second": 7.222, "eval_steps_per_second": 0.903, "step": 600 }, { "epoch": 0.04660584007026727, "grad_norm": 7.000651836395264, "learning_rate": 4.6523297491039434e-05, "loss": 5.4196, "mean_token_accuracy": 0.6482405418157577, "num_tokens": 8203512.0, "step": 650 }, { "epoch": 0.04660584007026727, "eval_loss": 1.3494269847869873, "eval_mean_token_accuracy": 0.6481932699680328, "eval_num_tokens": 8203512.0, "eval_runtime": 55.3883, "eval_samples_per_second": 7.222, "eval_steps_per_second": 0.903, "step": 650 }, { "epoch": 0.05019090469105705, "grad_norm": 8.582626342773438, "learning_rate": 5.0107526881720436e-05, "loss": 5.3867, "mean_token_accuracy": 0.650465478003025, "num_tokens": 8831306.0, "step": 700 }, { "epoch": 0.05019090469105705, "eval_loss": 1.3439626693725586, "eval_mean_token_accuracy": 0.6484399271011353, "eval_num_tokens": 8831306.0, "eval_runtime": 55.4414, "eval_samples_per_second": 7.215, "eval_steps_per_second": 0.902, "step": 700 }, { "epoch": 0.05377596931184685, "grad_norm": 8.785462379455566, "learning_rate": 5.369175627240144e-05, "loss": 5.3822, "mean_token_accuracy": 0.6480473777651787, "num_tokens": 9462041.0, "step": 750 }, { "epoch": 0.05377596931184685, "eval_loss": 1.3385406732559204, "eval_mean_token_accuracy": 0.649928457736969, "eval_num_tokens": 9462041.0, "eval_runtime": 55.3213, "eval_samples_per_second": 7.23, "eval_steps_per_second": 0.904, "step": 750 }, { "epoch": 0.05736103393263663, "grad_norm": 6.1994547843933105, "learning_rate": 5.727598566308244e-05, "loss": 5.305, "mean_token_accuracy": 0.6530899196863175, "num_tokens": 10095648.0, "step": 800 }, { "epoch": 0.05736103393263663, "eval_loss": 1.333341360092163, "eval_mean_token_accuracy": 0.6506243336200714, "eval_num_tokens": 10095648.0, "eval_runtime": 55.4523, "eval_samples_per_second": 7.213, "eval_steps_per_second": 0.902, "step": 800 }, { "epoch": 0.06094609855342643, "grad_norm": 5.850490570068359, "learning_rate": 6.086021505376345e-05, "loss": 5.3301, "mean_token_accuracy": 0.6499336344003678, "num_tokens": 10730377.0, "step": 850 }, { "epoch": 0.06094609855342643, "eval_loss": 1.3294757604599, "eval_mean_token_accuracy": 0.6494286286830903, "eval_num_tokens": 10730377.0, "eval_runtime": 55.628, "eval_samples_per_second": 7.191, "eval_steps_per_second": 0.899, "step": 850 }, { "epoch": 0.06453116317421621, "grad_norm": 5.629384517669678, "learning_rate": 6.444444444444446e-05, "loss": 5.2911, "mean_token_accuracy": 0.6521104833483696, "num_tokens": 11363798.0, "step": 900 }, { "epoch": 0.06453116317421621, "eval_loss": 1.3235622644424438, "eval_mean_token_accuracy": 0.6512654149532318, "eval_num_tokens": 11363798.0, "eval_runtime": 55.5848, "eval_samples_per_second": 7.196, "eval_steps_per_second": 0.9, "step": 900 }, { "epoch": 0.068116227795006, "grad_norm": 6.046393871307373, "learning_rate": 6.802867383512545e-05, "loss": 5.2478, "mean_token_accuracy": 0.6536632561683655, "num_tokens": 11993502.0, "step": 950 }, { "epoch": 0.068116227795006, "eval_loss": 1.3197156190872192, "eval_mean_token_accuracy": 0.6517732429504395, "eval_num_tokens": 11993502.0, "eval_runtime": 55.3206, "eval_samples_per_second": 7.231, "eval_steps_per_second": 0.904, "step": 950 }, { "epoch": 0.0717012924157958, "grad_norm": 6.950500011444092, "learning_rate": 7.161290322580646e-05, "loss": 5.2368, "mean_token_accuracy": 0.6554682296514511, "num_tokens": 12628081.0, "step": 1000 }, { "epoch": 0.0717012924157958, "eval_loss": 1.3148993253707886, "eval_mean_token_accuracy": 0.652986958026886, "eval_num_tokens": 12628081.0, "eval_runtime": 55.4045, "eval_samples_per_second": 7.22, "eval_steps_per_second": 0.902, "step": 1000 }, { "epoch": 0.07528635703658558, "grad_norm": 5.844649791717529, "learning_rate": 7.519713261648746e-05, "loss": 5.2604, "mean_token_accuracy": 0.6538248571753502, "num_tokens": 13254893.0, "step": 1050 }, { "epoch": 0.07528635703658558, "eval_loss": 1.3124916553497314, "eval_mean_token_accuracy": 0.6538188600540161, "eval_num_tokens": 13254893.0, "eval_runtime": 56.9195, "eval_samples_per_second": 7.027, "eval_steps_per_second": 0.878, "step": 1050 }, { "epoch": 0.07887142165737537, "grad_norm": 5.3114094734191895, "learning_rate": 7.878136200716845e-05, "loss": 5.235, "mean_token_accuracy": 0.6541680765151977, "num_tokens": 13893524.0, "step": 1100 }, { "epoch": 0.07887142165737537, "eval_loss": 1.309714674949646, "eval_mean_token_accuracy": 0.6538467502593994, "eval_num_tokens": 13893524.0, "eval_runtime": 56.3705, "eval_samples_per_second": 7.096, "eval_steps_per_second": 0.887, "step": 1100 }, { "epoch": 0.08245648627816517, "grad_norm": 5.666459083557129, "learning_rate": 8.236559139784946e-05, "loss": 5.1803, "mean_token_accuracy": 0.6568083089590072, "num_tokens": 14522906.0, "step": 1150 }, { "epoch": 0.08245648627816517, "eval_loss": 1.305640459060669, "eval_mean_token_accuracy": 0.655296059846878, "eval_num_tokens": 14522906.0, "eval_runtime": 57.2106, "eval_samples_per_second": 6.992, "eval_steps_per_second": 0.874, "step": 1150 }, { "epoch": 0.08604155089895496, "grad_norm": 6.020337104797363, "learning_rate": 8.594982078853047e-05, "loss": 5.2056, "mean_token_accuracy": 0.653913055062294, "num_tokens": 15156803.0, "step": 1200 }, { "epoch": 0.08604155089895496, "eval_loss": 1.3023688793182373, "eval_mean_token_accuracy": 0.6569918835163117, "eval_num_tokens": 15156803.0, "eval_runtime": 56.4134, "eval_samples_per_second": 7.091, "eval_steps_per_second": 0.886, "step": 1200 }, { "epoch": 0.08962661551974474, "grad_norm": 5.757259368896484, "learning_rate": 8.953405017921147e-05, "loss": 5.2154, "mean_token_accuracy": 0.6549820226430892, "num_tokens": 15788828.0, "step": 1250 }, { "epoch": 0.08962661551974474, "eval_loss": 1.3033726215362549, "eval_mean_token_accuracy": 0.6548340058326722, "eval_num_tokens": 15788828.0, "eval_runtime": 56.3999, "eval_samples_per_second": 7.092, "eval_steps_per_second": 0.887, "step": 1250 }, { "epoch": 0.09321168014053453, "grad_norm": 6.876058101654053, "learning_rate": 9.311827956989248e-05, "loss": 5.2374, "mean_token_accuracy": 0.6526922315359116, "num_tokens": 16423385.0, "step": 1300 }, { "epoch": 0.09321168014053453, "eval_loss": 1.2987463474273682, "eval_mean_token_accuracy": 0.6556110656261445, "eval_num_tokens": 16423385.0, "eval_runtime": 56.3308, "eval_samples_per_second": 7.101, "eval_steps_per_second": 0.888, "step": 1300 }, { "epoch": 0.09679674476132433, "grad_norm": 5.170133590698242, "learning_rate": 9.670250896057349e-05, "loss": 5.2584, "mean_token_accuracy": 0.6529216593503953, "num_tokens": 17058608.0, "step": 1350 }, { "epoch": 0.09679674476132433, "eval_loss": 1.2979986667633057, "eval_mean_token_accuracy": 0.6553151261806488, "eval_num_tokens": 17058608.0, "eval_runtime": 56.3719, "eval_samples_per_second": 7.096, "eval_steps_per_second": 0.887, "step": 1350 }, { "epoch": 0.1003818093821141, "grad_norm": 5.678673267364502, "learning_rate": 9.996813256851498e-05, "loss": 5.1909, "mean_token_accuracy": 0.6570302325487137, "num_tokens": 17689690.0, "step": 1400 }, { "epoch": 0.1003818093821141, "eval_loss": 1.2949328422546387, "eval_mean_token_accuracy": 0.6552190041542053, "eval_num_tokens": 17689690.0, "eval_runtime": 56.3054, "eval_samples_per_second": 7.104, "eval_steps_per_second": 0.888, "step": 1400 }, { "epoch": 0.1039668740029039, "grad_norm": 4.694892406463623, "learning_rate": 9.956978967495221e-05, "loss": 5.1132, "mean_token_accuracy": 0.6600784501433372, "num_tokens": 18321232.0, "step": 1450 }, { "epoch": 0.1039668740029039, "eval_loss": 1.2946751117706299, "eval_mean_token_accuracy": 0.6560806667804718, "eval_num_tokens": 18321232.0, "eval_runtime": 56.7345, "eval_samples_per_second": 7.05, "eval_steps_per_second": 0.881, "step": 1450 }, { "epoch": 0.1075519386236937, "grad_norm": 5.286959171295166, "learning_rate": 9.917144678138942e-05, "loss": 5.2297, "mean_token_accuracy": 0.6539956346154213, "num_tokens": 18952518.0, "step": 1500 }, { "epoch": 0.1075519386236937, "eval_loss": 1.2900216579437256, "eval_mean_token_accuracy": 0.6562173092365264, "eval_num_tokens": 18952518.0, "eval_runtime": 56.3853, "eval_samples_per_second": 7.094, "eval_steps_per_second": 0.887, "step": 1500 }, { "epoch": 0.11113700324448349, "grad_norm": 5.229610443115234, "learning_rate": 9.877310388782664e-05, "loss": 5.1376, "mean_token_accuracy": 0.6599933451414108, "num_tokens": 19580453.0, "step": 1550 }, { "epoch": 0.11113700324448349, "eval_loss": 1.2871261835098267, "eval_mean_token_accuracy": 0.6577617633342743, "eval_num_tokens": 19580453.0, "eval_runtime": 56.3376, "eval_samples_per_second": 7.1, "eval_steps_per_second": 0.888, "step": 1550 }, { "epoch": 0.11472206786527327, "grad_norm": 4.540684223175049, "learning_rate": 9.837476099426386e-05, "loss": 5.1124, "mean_token_accuracy": 0.659881052672863, "num_tokens": 20220713.0, "step": 1600 }, { "epoch": 0.11472206786527327, "eval_loss": 1.2855585813522339, "eval_mean_token_accuracy": 0.657675279378891, "eval_num_tokens": 20220713.0, "eval_runtime": 56.4741, "eval_samples_per_second": 7.083, "eval_steps_per_second": 0.885, "step": 1600 }, { "epoch": 0.11830713248606306, "grad_norm": 5.147482872009277, "learning_rate": 9.797641810070109e-05, "loss": 5.1251, "mean_token_accuracy": 0.658254965543747, "num_tokens": 20853860.0, "step": 1650 }, { "epoch": 0.11830713248606306, "eval_loss": 1.283848762512207, "eval_mean_token_accuracy": 0.6582966887950897, "eval_num_tokens": 20853860.0, "eval_runtime": 57.6401, "eval_samples_per_second": 6.94, "eval_steps_per_second": 0.867, "step": 1650 }, { "epoch": 0.12189219710685285, "grad_norm": 4.544667720794678, "learning_rate": 9.757807520713831e-05, "loss": 5.0706, "mean_token_accuracy": 0.6628770676255226, "num_tokens": 21487498.0, "step": 1700 }, { "epoch": 0.12189219710685285, "eval_loss": 1.2798463106155396, "eval_mean_token_accuracy": 0.6587248671054841, "eval_num_tokens": 21487498.0, "eval_runtime": 56.4142, "eval_samples_per_second": 7.09, "eval_steps_per_second": 0.886, "step": 1700 }, { "epoch": 0.12547726172764265, "grad_norm": 4.541973114013672, "learning_rate": 9.717973231357553e-05, "loss": 5.054, "mean_token_accuracy": 0.6628148990869522, "num_tokens": 22120725.0, "step": 1750 }, { "epoch": 0.12547726172764265, "eval_loss": 1.278252124786377, "eval_mean_token_accuracy": 0.6595626533031463, "eval_num_tokens": 22120725.0, "eval_runtime": 56.5087, "eval_samples_per_second": 7.079, "eval_steps_per_second": 0.885, "step": 1750 }, { "epoch": 0.12906232634843243, "grad_norm": 5.01814603805542, "learning_rate": 9.678138942001275e-05, "loss": 5.1334, "mean_token_accuracy": 0.6570143532752991, "num_tokens": 22751630.0, "step": 1800 }, { "epoch": 0.12906232634843243, "eval_loss": 1.2745345830917358, "eval_mean_token_accuracy": 0.6588572013378143, "eval_num_tokens": 22751630.0, "eval_runtime": 56.4708, "eval_samples_per_second": 7.083, "eval_steps_per_second": 0.885, "step": 1800 }, { "epoch": 0.1326473909692222, "grad_norm": 5.249142646789551, "learning_rate": 9.638304652644997e-05, "loss": 5.0772, "mean_token_accuracy": 0.6610330584645271, "num_tokens": 23380871.0, "step": 1850 }, { "epoch": 0.1326473909692222, "eval_loss": 1.271730661392212, "eval_mean_token_accuracy": 0.6606413364410401, "eval_num_tokens": 23380871.0, "eval_runtime": 56.3956, "eval_samples_per_second": 7.093, "eval_steps_per_second": 0.887, "step": 1850 }, { "epoch": 0.136232455590012, "grad_norm": 4.495316505432129, "learning_rate": 9.598470363288719e-05, "loss": 5.1115, "mean_token_accuracy": 0.6598174887895584, "num_tokens": 24016153.0, "step": 1900 }, { "epoch": 0.136232455590012, "eval_loss": 1.2678121328353882, "eval_mean_token_accuracy": 0.659992311000824, "eval_num_tokens": 24016153.0, "eval_runtime": 56.3408, "eval_samples_per_second": 7.1, "eval_steps_per_second": 0.887, "step": 1900 }, { "epoch": 0.1398175202108018, "grad_norm": 4.675460338592529, "learning_rate": 9.558636073932441e-05, "loss": 5.1039, "mean_token_accuracy": 0.6611927005648613, "num_tokens": 24651410.0, "step": 1950 }, { "epoch": 0.1398175202108018, "eval_loss": 1.2681583166122437, "eval_mean_token_accuracy": 0.6607218337059021, "eval_num_tokens": 24651410.0, "eval_runtime": 56.0826, "eval_samples_per_second": 7.132, "eval_steps_per_second": 0.892, "step": 1950 }, { "epoch": 0.1434025848315916, "grad_norm": 4.928748607635498, "learning_rate": 9.518801784576164e-05, "loss": 5.1005, "mean_token_accuracy": 0.6606691733002663, "num_tokens": 25282438.0, "step": 2000 }, { "epoch": 0.1434025848315916, "eval_loss": 1.2662436962127686, "eval_mean_token_accuracy": 0.6602835392951966, "eval_num_tokens": 25282438.0, "eval_runtime": 56.1345, "eval_samples_per_second": 7.126, "eval_steps_per_second": 0.891, "step": 2000 }, { "epoch": 0.14698764945238138, "grad_norm": 4.237011432647705, "learning_rate": 9.478967495219886e-05, "loss": 5.0865, "mean_token_accuracy": 0.6610729214549065, "num_tokens": 25914467.0, "step": 2050 }, { "epoch": 0.14698764945238138, "eval_loss": 1.2659285068511963, "eval_mean_token_accuracy": 0.6619378459453583, "eval_num_tokens": 25914467.0, "eval_runtime": 56.41, "eval_samples_per_second": 7.091, "eval_steps_per_second": 0.886, "step": 2050 }, { "epoch": 0.15057271407317116, "grad_norm": 4.498386383056641, "learning_rate": 9.439133205863608e-05, "loss": 5.0536, "mean_token_accuracy": 0.662959768474102, "num_tokens": 26547088.0, "step": 2100 }, { "epoch": 0.15057271407317116, "eval_loss": 1.2620855569839478, "eval_mean_token_accuracy": 0.6622272551059722, "eval_num_tokens": 26547088.0, "eval_runtime": 56.4966, "eval_samples_per_second": 7.08, "eval_steps_per_second": 0.885, "step": 2100 }, { "epoch": 0.15415777869396097, "grad_norm": 4.547789573669434, "learning_rate": 9.39929891650733e-05, "loss": 5.0074, "mean_token_accuracy": 0.6667503699660301, "num_tokens": 27181781.0, "step": 2150 }, { "epoch": 0.15415777869396097, "eval_loss": 1.2630900144577026, "eval_mean_token_accuracy": 0.6616924941539765, "eval_num_tokens": 27181781.0, "eval_runtime": 56.423, "eval_samples_per_second": 7.089, "eval_steps_per_second": 0.886, "step": 2150 }, { "epoch": 0.15774284331475075, "grad_norm": 4.9150896072387695, "learning_rate": 9.359464627151052e-05, "loss": 5.0802, "mean_token_accuracy": 0.6615395992994308, "num_tokens": 27816217.0, "step": 2200 }, { "epoch": 0.15774284331475075, "eval_loss": 1.2614257335662842, "eval_mean_token_accuracy": 0.6630010890960694, "eval_num_tokens": 27816217.0, "eval_runtime": 56.1652, "eval_samples_per_second": 7.122, "eval_steps_per_second": 0.89, "step": 2200 }, { "epoch": 0.16132790793554053, "grad_norm": 4.487524032592773, "learning_rate": 9.319630337794774e-05, "loss": 5.0135, "mean_token_accuracy": 0.6651386457681656, "num_tokens": 28449167.0, "step": 2250 }, { "epoch": 0.16132790793554053, "eval_loss": 1.2573643922805786, "eval_mean_token_accuracy": 0.6616563200950623, "eval_num_tokens": 28449167.0, "eval_runtime": 56.2616, "eval_samples_per_second": 7.11, "eval_steps_per_second": 0.889, "step": 2250 }, { "epoch": 0.16491297255633033, "grad_norm": 4.237537860870361, "learning_rate": 9.279796048438496e-05, "loss": 5.0198, "mean_token_accuracy": 0.6647991991043091, "num_tokens": 29080404.0, "step": 2300 }, { "epoch": 0.16491297255633033, "eval_loss": 1.2570703029632568, "eval_mean_token_accuracy": 0.6629821956157684, "eval_num_tokens": 29080404.0, "eval_runtime": 56.2199, "eval_samples_per_second": 7.115, "eval_steps_per_second": 0.889, "step": 2300 }, { "epoch": 0.1684980371771201, "grad_norm": 4.835733890533447, "learning_rate": 9.239961759082219e-05, "loss": 5.0592, "mean_token_accuracy": 0.6617502626776696, "num_tokens": 29711200.0, "step": 2350 }, { "epoch": 0.1684980371771201, "eval_loss": 1.2566256523132324, "eval_mean_token_accuracy": 0.6639114606380463, "eval_num_tokens": 29711200.0, "eval_runtime": 56.2408, "eval_samples_per_second": 7.112, "eval_steps_per_second": 0.889, "step": 2350 }, { "epoch": 0.17208310179790992, "grad_norm": 4.832096099853516, "learning_rate": 9.200127469725941e-05, "loss": 5.0603, "mean_token_accuracy": 0.6628168100118637, "num_tokens": 30343251.0, "step": 2400 }, { "epoch": 0.17208310179790992, "eval_loss": 1.2529999017715454, "eval_mean_token_accuracy": 0.6634249198436737, "eval_num_tokens": 30343251.0, "eval_runtime": 56.4282, "eval_samples_per_second": 7.089, "eval_steps_per_second": 0.886, "step": 2400 }, { "epoch": 0.1756681664186997, "grad_norm": 4.870041370391846, "learning_rate": 9.160293180369663e-05, "loss": 5.0036, "mean_token_accuracy": 0.6657980665564537, "num_tokens": 30983003.0, "step": 2450 }, { "epoch": 0.1756681664186997, "eval_loss": 1.2508057355880737, "eval_mean_token_accuracy": 0.664600031375885, "eval_num_tokens": 30983003.0, "eval_runtime": 56.1477, "eval_samples_per_second": 7.124, "eval_steps_per_second": 0.891, "step": 2450 }, { "epoch": 0.17925323103948948, "grad_norm": 4.8386993408203125, "learning_rate": 9.120458891013385e-05, "loss": 4.9378, "mean_token_accuracy": 0.6697717472910881, "num_tokens": 31612440.0, "step": 2500 }, { "epoch": 0.17925323103948948, "eval_loss": 1.2504231929779053, "eval_mean_token_accuracy": 0.6656661999225616, "eval_num_tokens": 31612440.0, "eval_runtime": 56.432, "eval_samples_per_second": 7.088, "eval_steps_per_second": 0.886, "step": 2500 }, { "epoch": 0.1828382956602793, "grad_norm": 4.897119045257568, "learning_rate": 9.080624601657107e-05, "loss": 5.0576, "mean_token_accuracy": 0.6637110111117362, "num_tokens": 32246052.0, "step": 2550 }, { "epoch": 0.1828382956602793, "eval_loss": 1.2486332654953003, "eval_mean_token_accuracy": 0.6662761294841766, "eval_num_tokens": 32246052.0, "eval_runtime": 56.3188, "eval_samples_per_second": 7.102, "eval_steps_per_second": 0.888, "step": 2550 }, { "epoch": 0.18642336028106907, "grad_norm": 4.67065954208374, "learning_rate": 9.040790312300828e-05, "loss": 5.0137, "mean_token_accuracy": 0.6648938983678818, "num_tokens": 32876620.0, "step": 2600 }, { "epoch": 0.18642336028106907, "eval_loss": 1.2486134767532349, "eval_mean_token_accuracy": 0.6653382694721222, "eval_num_tokens": 32876620.0, "eval_runtime": 56.3432, "eval_samples_per_second": 7.099, "eval_steps_per_second": 0.887, "step": 2600 }, { "epoch": 0.19000842490185885, "grad_norm": 4.4095563888549805, "learning_rate": 9.000956022944551e-05, "loss": 5.0065, "mean_token_accuracy": 0.6646731504797936, "num_tokens": 33511123.0, "step": 2650 }, { "epoch": 0.19000842490185885, "eval_loss": 1.2494382858276367, "eval_mean_token_accuracy": 0.6639154195785523, "eval_num_tokens": 33511123.0, "eval_runtime": 56.3106, "eval_samples_per_second": 7.103, "eval_steps_per_second": 0.888, "step": 2650 }, { "epoch": 0.19359348952264865, "grad_norm": 4.82889461517334, "learning_rate": 8.961121733588274e-05, "loss": 5.0066, "mean_token_accuracy": 0.666558310687542, "num_tokens": 34141280.0, "step": 2700 }, { "epoch": 0.19359348952264865, "eval_loss": 1.2460081577301025, "eval_mean_token_accuracy": 0.6646526777744293, "eval_num_tokens": 34141280.0, "eval_runtime": 56.6246, "eval_samples_per_second": 7.064, "eval_steps_per_second": 0.883, "step": 2700 }, { "epoch": 0.19717855414343843, "grad_norm": 4.663321018218994, "learning_rate": 8.921287444231994e-05, "loss": 4.9428, "mean_token_accuracy": 0.6681969156861305, "num_tokens": 34774698.0, "step": 2750 }, { "epoch": 0.19717855414343843, "eval_loss": 1.244221806526184, "eval_mean_token_accuracy": 0.6656762886047364, "eval_num_tokens": 34774698.0, "eval_runtime": 56.2438, "eval_samples_per_second": 7.112, "eval_steps_per_second": 0.889, "step": 2750 }, { "epoch": 0.2007636187642282, "grad_norm": 4.264768600463867, "learning_rate": 8.881453154875718e-05, "loss": 5.0028, "mean_token_accuracy": 0.6653160175681114, "num_tokens": 35406754.0, "step": 2800 }, { "epoch": 0.2007636187642282, "eval_loss": 1.2440837621688843, "eval_mean_token_accuracy": 0.6653203201293946, "eval_num_tokens": 35406754.0, "eval_runtime": 56.2314, "eval_samples_per_second": 7.113, "eval_steps_per_second": 0.889, "step": 2800 }, { "epoch": 0.20434868338501802, "grad_norm": 4.938720226287842, "learning_rate": 8.84161886551944e-05, "loss": 4.9905, "mean_token_accuracy": 0.666375992000103, "num_tokens": 36037785.0, "step": 2850 }, { "epoch": 0.20434868338501802, "eval_loss": 1.2425023317337036, "eval_mean_token_accuracy": 0.664773497581482, "eval_num_tokens": 36037785.0, "eval_runtime": 56.4922, "eval_samples_per_second": 7.081, "eval_steps_per_second": 0.885, "step": 2850 }, { "epoch": 0.2079337480058078, "grad_norm": 4.350741386413574, "learning_rate": 8.801784576163161e-05, "loss": 4.858, "mean_token_accuracy": 0.6740978673100472, "num_tokens": 36672636.0, "step": 2900 }, { "epoch": 0.2079337480058078, "eval_loss": 1.2399791479110718, "eval_mean_token_accuracy": 0.6653912532329559, "eval_num_tokens": 36672636.0, "eval_runtime": 57.5992, "eval_samples_per_second": 6.945, "eval_steps_per_second": 0.868, "step": 2900 }, { "epoch": 0.2115188126265976, "grad_norm": 4.187928676605225, "learning_rate": 8.761950286806884e-05, "loss": 4.973, "mean_token_accuracy": 0.6659010905027389, "num_tokens": 37304390.0, "step": 2950 }, { "epoch": 0.2115188126265976, "eval_loss": 1.239449143409729, "eval_mean_token_accuracy": 0.6661972737312317, "eval_num_tokens": 37304390.0, "eval_runtime": 55.9136, "eval_samples_per_second": 7.154, "eval_steps_per_second": 0.894, "step": 2950 }, { "epoch": 0.2151038772473874, "grad_norm": 4.3214802742004395, "learning_rate": 8.722115997450606e-05, "loss": 4.9911, "mean_token_accuracy": 0.6659192404150963, "num_tokens": 37937712.0, "step": 3000 }, { "epoch": 0.2151038772473874, "eval_loss": 1.2380547523498535, "eval_mean_token_accuracy": 0.6668792748451233, "eval_num_tokens": 37937712.0, "eval_runtime": 56.7402, "eval_samples_per_second": 7.05, "eval_steps_per_second": 0.881, "step": 3000 }, { "epoch": 0.21868894186817717, "grad_norm": 5.154741287231445, "learning_rate": 8.682281708094327e-05, "loss": 4.9341, "mean_token_accuracy": 0.6695549800992012, "num_tokens": 38567208.0, "step": 3050 }, { "epoch": 0.21868894186817717, "eval_loss": 1.2387843132019043, "eval_mean_token_accuracy": 0.6668635201454163, "eval_num_tokens": 38567208.0, "eval_runtime": 56.2471, "eval_samples_per_second": 7.111, "eval_steps_per_second": 0.889, "step": 3050 }, { "epoch": 0.22227400648896697, "grad_norm": 5.014278888702393, "learning_rate": 8.64244741873805e-05, "loss": 4.8853, "mean_token_accuracy": 0.6707546302676201, "num_tokens": 39198318.0, "step": 3100 }, { "epoch": 0.22227400648896697, "eval_loss": 1.2394779920578003, "eval_mean_token_accuracy": 0.6670789694786072, "eval_num_tokens": 39198318.0, "eval_runtime": 56.2239, "eval_samples_per_second": 7.114, "eval_steps_per_second": 0.889, "step": 3100 }, { "epoch": 0.22585907110975675, "grad_norm": 4.228548049926758, "learning_rate": 8.602613129381773e-05, "loss": 4.9269, "mean_token_accuracy": 0.6687791690230369, "num_tokens": 39828524.0, "step": 3150 }, { "epoch": 0.22585907110975675, "eval_loss": 1.2372474670410156, "eval_mean_token_accuracy": 0.666018306016922, "eval_num_tokens": 39828524.0, "eval_runtime": 56.2199, "eval_samples_per_second": 7.115, "eval_steps_per_second": 0.889, "step": 3150 }, { "epoch": 0.22944413573054653, "grad_norm": 4.169594764709473, "learning_rate": 8.562778840025495e-05, "loss": 4.9485, "mean_token_accuracy": 0.6667472127079964, "num_tokens": 40459992.0, "step": 3200 }, { "epoch": 0.22944413573054653, "eval_loss": 1.2357257604599, "eval_mean_token_accuracy": 0.6657440733909606, "eval_num_tokens": 40459992.0, "eval_runtime": 56.3901, "eval_samples_per_second": 7.093, "eval_steps_per_second": 0.887, "step": 3200 }, { "epoch": 0.23302920035133634, "grad_norm": 4.309950828552246, "learning_rate": 8.522944550669216e-05, "loss": 4.9128, "mean_token_accuracy": 0.671622729897499, "num_tokens": 41094373.0, "step": 3250 }, { "epoch": 0.23302920035133634, "eval_loss": 1.2348511219024658, "eval_mean_token_accuracy": 0.6659791529178619, "eval_num_tokens": 41094373.0, "eval_runtime": 56.3939, "eval_samples_per_second": 7.093, "eval_steps_per_second": 0.887, "step": 3250 }, { "epoch": 0.23661426497212612, "grad_norm": 4.153282642364502, "learning_rate": 8.48311026131294e-05, "loss": 4.9831, "mean_token_accuracy": 0.66548932492733, "num_tokens": 41725155.0, "step": 3300 }, { "epoch": 0.23661426497212612, "eval_loss": 1.2328479290008545, "eval_mean_token_accuracy": 0.6659755408763885, "eval_num_tokens": 41725155.0, "eval_runtime": 56.4734, "eval_samples_per_second": 7.083, "eval_steps_per_second": 0.885, "step": 3300 }, { "epoch": 0.2401993295929159, "grad_norm": 4.901464462280273, "learning_rate": 8.443275971956662e-05, "loss": 4.9905, "mean_token_accuracy": 0.6660814517736435, "num_tokens": 42361406.0, "step": 3350 }, { "epoch": 0.2401993295929159, "eval_loss": 1.2326833009719849, "eval_mean_token_accuracy": 0.6672022414207458, "eval_num_tokens": 42361406.0, "eval_runtime": 56.2896, "eval_samples_per_second": 7.106, "eval_steps_per_second": 0.888, "step": 3350 }, { "epoch": 0.2437843942137057, "grad_norm": 4.4632415771484375, "learning_rate": 8.403441682600382e-05, "loss": 4.8952, "mean_token_accuracy": 0.6706425687670707, "num_tokens": 42993909.0, "step": 3400 }, { "epoch": 0.2437843942137057, "eval_loss": 1.2327020168304443, "eval_mean_token_accuracy": 0.6668058276176453, "eval_num_tokens": 42993909.0, "eval_runtime": 56.2077, "eval_samples_per_second": 7.116, "eval_steps_per_second": 0.89, "step": 3400 }, { "epoch": 0.2473694588344955, "grad_norm": 4.537699222564697, "learning_rate": 8.363607393244104e-05, "loss": 4.921, "mean_token_accuracy": 0.6698031505942345, "num_tokens": 43628010.0, "step": 3450 }, { "epoch": 0.2473694588344955, "eval_loss": 1.2328044176101685, "eval_mean_token_accuracy": 0.667005888223648, "eval_num_tokens": 43628010.0, "eval_runtime": 56.142, "eval_samples_per_second": 7.125, "eval_steps_per_second": 0.891, "step": 3450 }, { "epoch": 0.2509545234552853, "grad_norm": 4.68520450592041, "learning_rate": 8.323773103887828e-05, "loss": 4.9443, "mean_token_accuracy": 0.667299503982067, "num_tokens": 44263542.0, "step": 3500 }, { "epoch": 0.2509545234552853, "eval_loss": 1.2303454875946045, "eval_mean_token_accuracy": 0.6684185063838959, "eval_num_tokens": 44263542.0, "eval_runtime": 56.0547, "eval_samples_per_second": 7.136, "eval_steps_per_second": 0.892, "step": 3500 }, { "epoch": 0.2545395880760751, "grad_norm": 4.269311428070068, "learning_rate": 8.283938814531549e-05, "loss": 4.9117, "mean_token_accuracy": 0.6701091477274894, "num_tokens": 44896448.0, "step": 3550 }, { "epoch": 0.2545395880760751, "eval_loss": 1.2301256656646729, "eval_mean_token_accuracy": 0.6679345464706421, "eval_num_tokens": 44896448.0, "eval_runtime": 56.8192, "eval_samples_per_second": 7.04, "eval_steps_per_second": 0.88, "step": 3550 }, { "epoch": 0.25812465269686485, "grad_norm": 4.6586198806762695, "learning_rate": 8.244104525175271e-05, "loss": 4.9361, "mean_token_accuracy": 0.6700941568613052, "num_tokens": 45535736.0, "step": 3600 }, { "epoch": 0.25812465269686485, "eval_loss": 1.2280727624893188, "eval_mean_token_accuracy": 0.668077005147934, "eval_num_tokens": 45535736.0, "eval_runtime": 56.7678, "eval_samples_per_second": 7.046, "eval_steps_per_second": 0.881, "step": 3600 }, { "epoch": 0.26170971731765463, "grad_norm": 4.350837230682373, "learning_rate": 8.204270235818994e-05, "loss": 4.8535, "mean_token_accuracy": 0.6710763236880303, "num_tokens": 46168014.0, "step": 3650 }, { "epoch": 0.26170971731765463, "eval_loss": 1.2273330688476562, "eval_mean_token_accuracy": 0.6685185146331787, "eval_num_tokens": 46168014.0, "eval_runtime": 56.2347, "eval_samples_per_second": 7.113, "eval_steps_per_second": 0.889, "step": 3650 }, { "epoch": 0.2652947819384444, "grad_norm": 4.489384174346924, "learning_rate": 8.164435946462715e-05, "loss": 4.9884, "mean_token_accuracy": 0.6643109431862831, "num_tokens": 46799865.0, "step": 3700 }, { "epoch": 0.2652947819384444, "eval_loss": 1.228873610496521, "eval_mean_token_accuracy": 0.6676431381702423, "eval_num_tokens": 46799865.0, "eval_runtime": 56.2386, "eval_samples_per_second": 7.113, "eval_steps_per_second": 0.889, "step": 3700 }, { "epoch": 0.26887984655923425, "grad_norm": 4.438107967376709, "learning_rate": 8.124601657106437e-05, "loss": 4.8433, "mean_token_accuracy": 0.6722122520208359, "num_tokens": 47431886.0, "step": 3750 }, { "epoch": 0.26887984655923425, "eval_loss": 1.2276620864868164, "eval_mean_token_accuracy": 0.6684055602550507, "eval_num_tokens": 47431886.0, "eval_runtime": 56.3388, "eval_samples_per_second": 7.1, "eval_steps_per_second": 0.887, "step": 3750 }, { "epoch": 0.272464911180024, "grad_norm": 4.851945877075195, "learning_rate": 8.08476736775016e-05, "loss": 4.9293, "mean_token_accuracy": 0.668489234149456, "num_tokens": 48065104.0, "step": 3800 }, { "epoch": 0.272464911180024, "eval_loss": 1.2269046306610107, "eval_mean_token_accuracy": 0.6687084710597992, "eval_num_tokens": 48065104.0, "eval_runtime": 56.1744, "eval_samples_per_second": 7.121, "eval_steps_per_second": 0.89, "step": 3800 }, { "epoch": 0.2760499758008138, "grad_norm": 4.730586528778076, "learning_rate": 8.044933078393882e-05, "loss": 4.8258, "mean_token_accuracy": 0.6736181953549385, "num_tokens": 48699465.0, "step": 3850 }, { "epoch": 0.2760499758008138, "eval_loss": 1.2258822917938232, "eval_mean_token_accuracy": 0.6694431722164154, "eval_num_tokens": 48699465.0, "eval_runtime": 56.2799, "eval_samples_per_second": 7.107, "eval_steps_per_second": 0.888, "step": 3850 }, { "epoch": 0.2796350404216036, "grad_norm": 4.539992809295654, "learning_rate": 8.005098789037604e-05, "loss": 4.9014, "mean_token_accuracy": 0.6706485760211944, "num_tokens": 49328518.0, "step": 3900 }, { "epoch": 0.2796350404216036, "eval_loss": 1.2248101234436035, "eval_mean_token_accuracy": 0.6695001828670502, "eval_num_tokens": 49328518.0, "eval_runtime": 56.4684, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.885, "step": 3900 }, { "epoch": 0.28322010504239337, "grad_norm": 5.000583648681641, "learning_rate": 7.965264499681326e-05, "loss": 4.7606, "mean_token_accuracy": 0.6768921792507172, "num_tokens": 49957049.0, "step": 3950 }, { "epoch": 0.28322010504239337, "eval_loss": 1.2227978706359863, "eval_mean_token_accuracy": 0.6691736376285553, "eval_num_tokens": 49957049.0, "eval_runtime": 56.328, "eval_samples_per_second": 7.101, "eval_steps_per_second": 0.888, "step": 3950 }, { "epoch": 0.2868051696631832, "grad_norm": 4.855432510375977, "learning_rate": 7.925430210325048e-05, "loss": 4.9544, "mean_token_accuracy": 0.6679512014985085, "num_tokens": 50593342.0, "step": 4000 }, { "epoch": 0.2868051696631832, "eval_loss": 1.222544550895691, "eval_mean_token_accuracy": 0.6694585859775544, "eval_num_tokens": 50593342.0, "eval_runtime": 56.2378, "eval_samples_per_second": 7.113, "eval_steps_per_second": 0.889, "step": 4000 }, { "epoch": 0.290390234283973, "grad_norm": 4.258941173553467, "learning_rate": 7.88559592096877e-05, "loss": 4.8759, "mean_token_accuracy": 0.6715140387415885, "num_tokens": 51220159.0, "step": 4050 }, { "epoch": 0.290390234283973, "eval_loss": 1.2212531566619873, "eval_mean_token_accuracy": 0.6692110347747803, "eval_num_tokens": 51220159.0, "eval_runtime": 56.3598, "eval_samples_per_second": 7.097, "eval_steps_per_second": 0.887, "step": 4050 }, { "epoch": 0.29397529890476276, "grad_norm": 4.41649055480957, "learning_rate": 7.845761631612492e-05, "loss": 4.8512, "mean_token_accuracy": 0.6730743369460106, "num_tokens": 51852052.0, "step": 4100 }, { "epoch": 0.29397529890476276, "eval_loss": 1.2219711542129517, "eval_mean_token_accuracy": 0.668809084892273, "eval_num_tokens": 51852052.0, "eval_runtime": 56.3222, "eval_samples_per_second": 7.102, "eval_steps_per_second": 0.888, "step": 4100 }, { "epoch": 0.29756036352555254, "grad_norm": 5.041947841644287, "learning_rate": 7.805927342256214e-05, "loss": 4.8012, "mean_token_accuracy": 0.6752137768268586, "num_tokens": 52488117.0, "step": 4150 }, { "epoch": 0.29756036352555254, "eval_loss": 1.2200063467025757, "eval_mean_token_accuracy": 0.6689476525783539, "eval_num_tokens": 52488117.0, "eval_runtime": 56.3663, "eval_samples_per_second": 7.096, "eval_steps_per_second": 0.887, "step": 4150 }, { "epoch": 0.3011454281463423, "grad_norm": 4.421684741973877, "learning_rate": 7.766093052899937e-05, "loss": 4.9011, "mean_token_accuracy": 0.6694827458262443, "num_tokens": 53124002.0, "step": 4200 }, { "epoch": 0.3011454281463423, "eval_loss": 1.2199760675430298, "eval_mean_token_accuracy": 0.6698788702487946, "eval_num_tokens": 53124002.0, "eval_runtime": 57.1381, "eval_samples_per_second": 7.001, "eval_steps_per_second": 0.875, "step": 4200 }, { "epoch": 0.30473049276713216, "grad_norm": 4.482224941253662, "learning_rate": 7.726258763543659e-05, "loss": 4.8888, "mean_token_accuracy": 0.6703590288758278, "num_tokens": 53754491.0, "step": 4250 }, { "epoch": 0.30473049276713216, "eval_loss": 1.2201299667358398, "eval_mean_token_accuracy": 0.6685253477096558, "eval_num_tokens": 53754491.0, "eval_runtime": 56.1814, "eval_samples_per_second": 7.12, "eval_steps_per_second": 0.89, "step": 4250 }, { "epoch": 0.30831555738792193, "grad_norm": 5.163293838500977, "learning_rate": 7.686424474187381e-05, "loss": 4.8934, "mean_token_accuracy": 0.6694687473773956, "num_tokens": 54384292.0, "step": 4300 }, { "epoch": 0.30831555738792193, "eval_loss": 1.2192912101745605, "eval_mean_token_accuracy": 0.670127317905426, "eval_num_tokens": 54384292.0, "eval_runtime": 56.2288, "eval_samples_per_second": 7.114, "eval_steps_per_second": 0.889, "step": 4300 }, { "epoch": 0.3119006220087117, "grad_norm": 4.469936847686768, "learning_rate": 7.646590184831103e-05, "loss": 4.8808, "mean_token_accuracy": 0.671641985476017, "num_tokens": 55015409.0, "step": 4350 }, { "epoch": 0.3119006220087117, "eval_loss": 1.2187691926956177, "eval_mean_token_accuracy": 0.6700682175159455, "eval_num_tokens": 55015409.0, "eval_runtime": 56.275, "eval_samples_per_second": 7.108, "eval_steps_per_second": 0.888, "step": 4350 }, { "epoch": 0.3154856866295015, "grad_norm": 4.397490501403809, "learning_rate": 7.606755895474825e-05, "loss": 4.8593, "mean_token_accuracy": 0.6729298800230026, "num_tokens": 55645076.0, "step": 4400 }, { "epoch": 0.3154856866295015, "eval_loss": 1.21873140335083, "eval_mean_token_accuracy": 0.6701312291622162, "eval_num_tokens": 55645076.0, "eval_runtime": 56.3255, "eval_samples_per_second": 7.102, "eval_steps_per_second": 0.888, "step": 4400 }, { "epoch": 0.3190707512502913, "grad_norm": 4.565478801727295, "learning_rate": 7.566921606118547e-05, "loss": 4.9259, "mean_token_accuracy": 0.6678701865673066, "num_tokens": 56277591.0, "step": 4450 }, { "epoch": 0.3190707512502913, "eval_loss": 1.2173478603363037, "eval_mean_token_accuracy": 0.6713691699504852, "eval_num_tokens": 56277591.0, "eval_runtime": 56.1846, "eval_samples_per_second": 7.119, "eval_steps_per_second": 0.89, "step": 4450 }, { "epoch": 0.32265581587108105, "grad_norm": 4.387983798980713, "learning_rate": 7.52708731676227e-05, "loss": 4.8506, "mean_token_accuracy": 0.6727069270610809, "num_tokens": 56909553.0, "step": 4500 }, { "epoch": 0.32265581587108105, "eval_loss": 1.2162431478500366, "eval_mean_token_accuracy": 0.6704595732688904, "eval_num_tokens": 56909553.0, "eval_runtime": 56.2594, "eval_samples_per_second": 7.11, "eval_steps_per_second": 0.889, "step": 4500 }, { "epoch": 0.3262408804918709, "grad_norm": 4.406232833862305, "learning_rate": 7.487253027405992e-05, "loss": 4.8975, "mean_token_accuracy": 0.6693887722492218, "num_tokens": 57541706.0, "step": 4550 }, { "epoch": 0.3262408804918709, "eval_loss": 1.216928243637085, "eval_mean_token_accuracy": 0.6708524739742279, "eval_num_tokens": 57541706.0, "eval_runtime": 56.2862, "eval_samples_per_second": 7.107, "eval_steps_per_second": 0.888, "step": 4550 }, { "epoch": 0.32982594511266067, "grad_norm": 4.329367637634277, "learning_rate": 7.447418738049714e-05, "loss": 4.8734, "mean_token_accuracy": 0.6720337501168251, "num_tokens": 58175251.0, "step": 4600 }, { "epoch": 0.32982594511266067, "eval_loss": 1.2153425216674805, "eval_mean_token_accuracy": 0.6709867632389068, "eval_num_tokens": 58175251.0, "eval_runtime": 56.449, "eval_samples_per_second": 7.086, "eval_steps_per_second": 0.886, "step": 4600 }, { "epoch": 0.33341100973345045, "grad_norm": 4.24669075012207, "learning_rate": 7.407584448693436e-05, "loss": 4.8742, "mean_token_accuracy": 0.6718363285064697, "num_tokens": 58807276.0, "step": 4650 }, { "epoch": 0.33341100973345045, "eval_loss": 1.2146964073181152, "eval_mean_token_accuracy": 0.6710757482051849, "eval_num_tokens": 58807276.0, "eval_runtime": 56.3596, "eval_samples_per_second": 7.097, "eval_steps_per_second": 0.887, "step": 4650 }, { "epoch": 0.3369960743542402, "grad_norm": 4.037027835845947, "learning_rate": 7.367750159337158e-05, "loss": 4.8869, "mean_token_accuracy": 0.6710193574428558, "num_tokens": 59434602.0, "step": 4700 }, { "epoch": 0.3369960743542402, "eval_loss": 1.2160181999206543, "eval_mean_token_accuracy": 0.6700944793224335, "eval_num_tokens": 59434602.0, "eval_runtime": 56.8171, "eval_samples_per_second": 7.04, "eval_steps_per_second": 0.88, "step": 4700 }, { "epoch": 0.34058113897503, "grad_norm": 4.7925262451171875, "learning_rate": 7.32791586998088e-05, "loss": 4.8639, "mean_token_accuracy": 0.6717003020644188, "num_tokens": 60067934.0, "step": 4750 }, { "epoch": 0.34058113897503, "eval_loss": 1.2146656513214111, "eval_mean_token_accuracy": 0.6714940690994262, "eval_num_tokens": 60067934.0, "eval_runtime": 56.6455, "eval_samples_per_second": 7.061, "eval_steps_per_second": 0.883, "step": 4750 }, { "epoch": 0.34416620359581984, "grad_norm": 4.179026126861572, "learning_rate": 7.288081580624602e-05, "loss": 4.7815, "mean_token_accuracy": 0.6770669308304786, "num_tokens": 60700674.0, "step": 4800 }, { "epoch": 0.34416620359581984, "eval_loss": 1.2132787704467773, "eval_mean_token_accuracy": 0.6709755408763886, "eval_num_tokens": 60700674.0, "eval_runtime": 56.6884, "eval_samples_per_second": 7.056, "eval_steps_per_second": 0.882, "step": 4800 }, { "epoch": 0.3477512682166096, "grad_norm": 4.608165740966797, "learning_rate": 7.248247291268324e-05, "loss": 4.8593, "mean_token_accuracy": 0.6736995288729668, "num_tokens": 61331555.0, "step": 4850 }, { "epoch": 0.3477512682166096, "eval_loss": 1.2121059894561768, "eval_mean_token_accuracy": 0.672556334733963, "eval_num_tokens": 61331555.0, "eval_runtime": 56.1182, "eval_samples_per_second": 7.128, "eval_steps_per_second": 0.891, "step": 4850 }, { "epoch": 0.3513363328373994, "grad_norm": 4.966649055480957, "learning_rate": 7.208413001912047e-05, "loss": 4.8738, "mean_token_accuracy": 0.671499859392643, "num_tokens": 61965343.0, "step": 4900 }, { "epoch": 0.3513363328373994, "eval_loss": 1.2120461463928223, "eval_mean_token_accuracy": 0.6718708264827729, "eval_num_tokens": 61965343.0, "eval_runtime": 56.145, "eval_samples_per_second": 7.124, "eval_steps_per_second": 0.891, "step": 4900 }, { "epoch": 0.3549213974581892, "grad_norm": 5.021463871002197, "learning_rate": 7.168578712555767e-05, "loss": 4.8567, "mean_token_accuracy": 0.6712884229421615, "num_tokens": 62596263.0, "step": 4950 }, { "epoch": 0.3549213974581892, "eval_loss": 1.2121599912643433, "eval_mean_token_accuracy": 0.671816600561142, "eval_num_tokens": 62596263.0, "eval_runtime": 56.6214, "eval_samples_per_second": 7.064, "eval_steps_per_second": 0.883, "step": 4950 }, { "epoch": 0.35850646207897896, "grad_norm": 4.346203804016113, "learning_rate": 7.128744423199491e-05, "loss": 4.8762, "mean_token_accuracy": 0.6713952556252479, "num_tokens": 63232395.0, "step": 5000 }, { "epoch": 0.35850646207897896, "eval_loss": 1.2105367183685303, "eval_mean_token_accuracy": 0.6715540933609009, "eval_num_tokens": 63232395.0, "eval_runtime": 56.1985, "eval_samples_per_second": 7.118, "eval_steps_per_second": 0.89, "step": 5000 }, { "epoch": 0.36209152669976874, "grad_norm": 4.725315570831299, "learning_rate": 7.088910133843213e-05, "loss": 4.8547, "mean_token_accuracy": 0.6712683519721031, "num_tokens": 63865141.0, "step": 5050 }, { "epoch": 0.36209152669976874, "eval_loss": 1.2104063034057617, "eval_mean_token_accuracy": 0.6721407020092011, "eval_num_tokens": 63865141.0, "eval_runtime": 56.4947, "eval_samples_per_second": 7.08, "eval_steps_per_second": 0.885, "step": 5050 }, { "epoch": 0.3656765913205586, "grad_norm": 4.475533962249756, "learning_rate": 7.049075844486934e-05, "loss": 4.8331, "mean_token_accuracy": 0.6741529366374016, "num_tokens": 64498005.0, "step": 5100 }, { "epoch": 0.3656765913205586, "eval_loss": 1.211449146270752, "eval_mean_token_accuracy": 0.6718828630447388, "eval_num_tokens": 64498005.0, "eval_runtime": 56.1253, "eval_samples_per_second": 7.127, "eval_steps_per_second": 0.891, "step": 5100 }, { "epoch": 0.36926165594134835, "grad_norm": 4.43773078918457, "learning_rate": 7.009241555130657e-05, "loss": 4.8576, "mean_token_accuracy": 0.6735836458206177, "num_tokens": 65132925.0, "step": 5150 }, { "epoch": 0.36926165594134835, "eval_loss": 1.2107973098754883, "eval_mean_token_accuracy": 0.67238405585289, "eval_num_tokens": 65132925.0, "eval_runtime": 56.3575, "eval_samples_per_second": 7.098, "eval_steps_per_second": 0.887, "step": 5150 }, { "epoch": 0.37284672056213813, "grad_norm": 4.340308666229248, "learning_rate": 6.96940726577438e-05, "loss": 4.8199, "mean_token_accuracy": 0.6730007353425026, "num_tokens": 65763710.0, "step": 5200 }, { "epoch": 0.37284672056213813, "eval_loss": 1.2099605798721313, "eval_mean_token_accuracy": 0.6728368639945984, "eval_num_tokens": 65763710.0, "eval_runtime": 56.2959, "eval_samples_per_second": 7.105, "eval_steps_per_second": 0.888, "step": 5200 }, { "epoch": 0.3764317851829279, "grad_norm": 4.555109024047852, "learning_rate": 6.9295729764181e-05, "loss": 4.8905, "mean_token_accuracy": 0.6707694306969643, "num_tokens": 66395012.0, "step": 5250 }, { "epoch": 0.3764317851829279, "eval_loss": 1.2099387645721436, "eval_mean_token_accuracy": 0.6722373139858245, "eval_num_tokens": 66395012.0, "eval_runtime": 56.4644, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.886, "step": 5250 }, { "epoch": 0.3800168498037177, "grad_norm": 4.202060699462891, "learning_rate": 6.889738687061822e-05, "loss": 4.8292, "mean_token_accuracy": 0.673227034509182, "num_tokens": 67031872.0, "step": 5300 }, { "epoch": 0.3800168498037177, "eval_loss": 1.210257887840271, "eval_mean_token_accuracy": 0.6714678919315338, "eval_num_tokens": 67031872.0, "eval_runtime": 56.3175, "eval_samples_per_second": 7.103, "eval_steps_per_second": 0.888, "step": 5300 }, { "epoch": 0.38360191442450753, "grad_norm": 4.315623760223389, "learning_rate": 6.849904397705546e-05, "loss": 4.8465, "mean_token_accuracy": 0.672520759999752, "num_tokens": 67663971.0, "step": 5350 }, { "epoch": 0.38360191442450753, "eval_loss": 1.20899498462677, "eval_mean_token_accuracy": 0.6721968007087707, "eval_num_tokens": 67663971.0, "eval_runtime": 56.4163, "eval_samples_per_second": 7.09, "eval_steps_per_second": 0.886, "step": 5350 }, { "epoch": 0.3871869790452973, "grad_norm": 4.103718280792236, "learning_rate": 6.810070108349267e-05, "loss": 4.8568, "mean_token_accuracy": 0.6714790239930153, "num_tokens": 68298646.0, "step": 5400 }, { "epoch": 0.3871869790452973, "eval_loss": 1.2082393169403076, "eval_mean_token_accuracy": 0.6712930297851563, "eval_num_tokens": 68298646.0, "eval_runtime": 56.3666, "eval_samples_per_second": 7.096, "eval_steps_per_second": 0.887, "step": 5400 }, { "epoch": 0.3907720436660871, "grad_norm": 4.669826030731201, "learning_rate": 6.770235818992989e-05, "loss": 4.8767, "mean_token_accuracy": 0.6715716090798378, "num_tokens": 68934666.0, "step": 5450 }, { "epoch": 0.3907720436660871, "eval_loss": 1.207878589630127, "eval_mean_token_accuracy": 0.672472620010376, "eval_num_tokens": 68934666.0, "eval_runtime": 56.6965, "eval_samples_per_second": 7.055, "eval_steps_per_second": 0.882, "step": 5450 }, { "epoch": 0.39435710828687687, "grad_norm": 4.467480659484863, "learning_rate": 6.730401529636712e-05, "loss": 4.8558, "mean_token_accuracy": 0.6721174070239067, "num_tokens": 69569216.0, "step": 5500 }, { "epoch": 0.39435710828687687, "eval_loss": 1.207086205482483, "eval_mean_token_accuracy": 0.6721065282821655, "eval_num_tokens": 69569216.0, "eval_runtime": 56.3201, "eval_samples_per_second": 7.102, "eval_steps_per_second": 0.888, "step": 5500 }, { "epoch": 0.39794217290766665, "grad_norm": 4.608986854553223, "learning_rate": 6.690567240280435e-05, "loss": 4.8658, "mean_token_accuracy": 0.6705604410171508, "num_tokens": 70207379.0, "step": 5550 }, { "epoch": 0.39794217290766665, "eval_loss": 1.2067745923995972, "eval_mean_token_accuracy": 0.6716193425655365, "eval_num_tokens": 70207379.0, "eval_runtime": 57.4805, "eval_samples_per_second": 6.959, "eval_steps_per_second": 0.87, "step": 5550 }, { "epoch": 0.4015272375284564, "grad_norm": 4.4026780128479, "learning_rate": 6.650732950924155e-05, "loss": 4.7928, "mean_token_accuracy": 0.6765011212229729, "num_tokens": 70836399.0, "step": 5600 }, { "epoch": 0.4015272375284564, "eval_loss": 1.2075951099395752, "eval_mean_token_accuracy": 0.6730300402641296, "eval_num_tokens": 70836399.0, "eval_runtime": 58.1991, "eval_samples_per_second": 6.873, "eval_steps_per_second": 0.859, "step": 5600 }, { "epoch": 0.40511230214924626, "grad_norm": 4.3206048011779785, "learning_rate": 6.610898661567877e-05, "loss": 4.7828, "mean_token_accuracy": 0.6755007293820381, "num_tokens": 71465978.0, "step": 5650 }, { "epoch": 0.40511230214924626, "eval_loss": 1.2060260772705078, "eval_mean_token_accuracy": 0.6736092364788056, "eval_num_tokens": 71465978.0, "eval_runtime": 55.9656, "eval_samples_per_second": 7.147, "eval_steps_per_second": 0.893, "step": 5650 }, { "epoch": 0.40869736677003604, "grad_norm": 4.6384196281433105, "learning_rate": 6.571064372211601e-05, "loss": 4.8045, "mean_token_accuracy": 0.6742757317423821, "num_tokens": 72094960.0, "step": 5700 }, { "epoch": 0.40869736677003604, "eval_loss": 1.2062655687332153, "eval_mean_token_accuracy": 0.6727207219600677, "eval_num_tokens": 72094960.0, "eval_runtime": 56.5884, "eval_samples_per_second": 7.069, "eval_steps_per_second": 0.884, "step": 5700 }, { "epoch": 0.4122824313908258, "grad_norm": 4.51801872253418, "learning_rate": 6.531230082855322e-05, "loss": 4.8502, "mean_token_accuracy": 0.6714598840475082, "num_tokens": 72728939.0, "step": 5750 }, { "epoch": 0.4122824313908258, "eval_loss": 1.2066096067428589, "eval_mean_token_accuracy": 0.6731921648979187, "eval_num_tokens": 72728939.0, "eval_runtime": 56.4183, "eval_samples_per_second": 7.09, "eval_steps_per_second": 0.886, "step": 5750 }, { "epoch": 0.4158674960116156, "grad_norm": 4.803595066070557, "learning_rate": 6.491395793499044e-05, "loss": 4.863, "mean_token_accuracy": 0.6716452211141586, "num_tokens": 73363737.0, "step": 5800 }, { "epoch": 0.4158674960116156, "eval_loss": 1.2050178050994873, "eval_mean_token_accuracy": 0.6734542024135589, "eval_num_tokens": 73363737.0, "eval_runtime": 56.4329, "eval_samples_per_second": 7.088, "eval_steps_per_second": 0.886, "step": 5800 }, { "epoch": 0.4194525606324054, "grad_norm": 4.864405155181885, "learning_rate": 6.451561504142767e-05, "loss": 4.82, "mean_token_accuracy": 0.6745044487714768, "num_tokens": 73997522.0, "step": 5850 }, { "epoch": 0.4194525606324054, "eval_loss": 1.2039889097213745, "eval_mean_token_accuracy": 0.6733579516410828, "eval_num_tokens": 73997522.0, "eval_runtime": 56.3966, "eval_samples_per_second": 7.093, "eval_steps_per_second": 0.887, "step": 5850 }, { "epoch": 0.4230376252531952, "grad_norm": 4.559540271759033, "learning_rate": 6.411727214786488e-05, "loss": 4.7735, "mean_token_accuracy": 0.6764472410082817, "num_tokens": 74632565.0, "step": 5900 }, { "epoch": 0.4230376252531952, "eval_loss": 1.2041822671890259, "eval_mean_token_accuracy": 0.6725377225875855, "eval_num_tokens": 74632565.0, "eval_runtime": 56.2069, "eval_samples_per_second": 7.117, "eval_steps_per_second": 0.89, "step": 5900 }, { "epoch": 0.426622689873985, "grad_norm": 4.625767230987549, "learning_rate": 6.37189292543021e-05, "loss": 4.8474, "mean_token_accuracy": 0.6734576171636582, "num_tokens": 75264457.0, "step": 5950 }, { "epoch": 0.426622689873985, "eval_loss": 1.2035109996795654, "eval_mean_token_accuracy": 0.6731998026371002, "eval_num_tokens": 75264457.0, "eval_runtime": 56.7209, "eval_samples_per_second": 7.052, "eval_steps_per_second": 0.882, "step": 5950 }, { "epoch": 0.4302077544947748, "grad_norm": 4.185346603393555, "learning_rate": 6.332058636073932e-05, "loss": 4.8327, "mean_token_accuracy": 0.6721743106842041, "num_tokens": 75893311.0, "step": 6000 }, { "epoch": 0.4302077544947748, "eval_loss": 1.203436255455017, "eval_mean_token_accuracy": 0.6730928170681, "eval_num_tokens": 75893311.0, "eval_runtime": 56.3504, "eval_samples_per_second": 7.098, "eval_steps_per_second": 0.887, "step": 6000 }, { "epoch": 0.43379281911556455, "grad_norm": 4.341583251953125, "learning_rate": 6.292224346717655e-05, "loss": 4.8072, "mean_token_accuracy": 0.6749084493517876, "num_tokens": 76529617.0, "step": 6050 }, { "epoch": 0.43379281911556455, "eval_loss": 1.2034169435501099, "eval_mean_token_accuracy": 0.6731595695018768, "eval_num_tokens": 76529617.0, "eval_runtime": 56.4332, "eval_samples_per_second": 7.088, "eval_steps_per_second": 0.886, "step": 6050 }, { "epoch": 0.43737788373635433, "grad_norm": 4.502080917358398, "learning_rate": 6.252390057361377e-05, "loss": 4.7284, "mean_token_accuracy": 0.6789399805665016, "num_tokens": 77158901.0, "step": 6100 }, { "epoch": 0.43737788373635433, "eval_loss": 1.2037384510040283, "eval_mean_token_accuracy": 0.6724783575534821, "eval_num_tokens": 77158901.0, "eval_runtime": 56.1762, "eval_samples_per_second": 7.12, "eval_steps_per_second": 0.89, "step": 6100 }, { "epoch": 0.4409629483571441, "grad_norm": 4.407749652862549, "learning_rate": 6.212555768005099e-05, "loss": 4.8102, "mean_token_accuracy": 0.6749192690849304, "num_tokens": 77792929.0, "step": 6150 }, { "epoch": 0.4409629483571441, "eval_loss": 1.2034553289413452, "eval_mean_token_accuracy": 0.6724519121646881, "eval_num_tokens": 77792929.0, "eval_runtime": 56.3346, "eval_samples_per_second": 7.1, "eval_steps_per_second": 0.888, "step": 6150 }, { "epoch": 0.44454801297793395, "grad_norm": 4.5488362312316895, "learning_rate": 6.172721478648821e-05, "loss": 4.8424, "mean_token_accuracy": 0.6731199064850807, "num_tokens": 78426748.0, "step": 6200 }, { "epoch": 0.44454801297793395, "eval_loss": 1.2025480270385742, "eval_mean_token_accuracy": 0.673497976064682, "eval_num_tokens": 78426748.0, "eval_runtime": 56.1462, "eval_samples_per_second": 7.124, "eval_steps_per_second": 0.891, "step": 6200 }, { "epoch": 0.44813307759872373, "grad_norm": 4.52962589263916, "learning_rate": 6.132887189292543e-05, "loss": 4.7814, "mean_token_accuracy": 0.6756982815265655, "num_tokens": 79054859.0, "step": 6250 }, { "epoch": 0.44813307759872373, "eval_loss": 1.2015492916107178, "eval_mean_token_accuracy": 0.6740836083889008, "eval_num_tokens": 79054859.0, "eval_runtime": 56.3541, "eval_samples_per_second": 7.098, "eval_steps_per_second": 0.887, "step": 6250 }, { "epoch": 0.4517181422195135, "grad_norm": 4.603536128997803, "learning_rate": 6.093052899936266e-05, "loss": 4.9195, "mean_token_accuracy": 0.6704733854532242, "num_tokens": 79688857.0, "step": 6300 }, { "epoch": 0.4517181422195135, "eval_loss": 1.2020344734191895, "eval_mean_token_accuracy": 0.6741014468669891, "eval_num_tokens": 79688857.0, "eval_runtime": 56.114, "eval_samples_per_second": 7.128, "eval_steps_per_second": 0.891, "step": 6300 }, { "epoch": 0.4553032068403033, "grad_norm": 5.02667236328125, "learning_rate": 6.053218610579987e-05, "loss": 4.7774, "mean_token_accuracy": 0.6772465297579765, "num_tokens": 80324585.0, "step": 6350 }, { "epoch": 0.4553032068403033, "eval_loss": 1.201953411102295, "eval_mean_token_accuracy": 0.6730434691905975, "eval_num_tokens": 80324585.0, "eval_runtime": 56.1284, "eval_samples_per_second": 7.127, "eval_steps_per_second": 0.891, "step": 6350 }, { "epoch": 0.45888827146109307, "grad_norm": 4.330198764801025, "learning_rate": 6.0133843212237096e-05, "loss": 4.8002, "mean_token_accuracy": 0.6744606778025627, "num_tokens": 80956366.0, "step": 6400 }, { "epoch": 0.45888827146109307, "eval_loss": 1.2010780572891235, "eval_mean_token_accuracy": 0.6726482355594635, "eval_num_tokens": 80956366.0, "eval_runtime": 56.1261, "eval_samples_per_second": 7.127, "eval_steps_per_second": 0.891, "step": 6400 }, { "epoch": 0.4624733360818829, "grad_norm": 4.510508060455322, "learning_rate": 5.973550031867432e-05, "loss": 4.8791, "mean_token_accuracy": 0.6714214497804641, "num_tokens": 81587826.0, "step": 6450 }, { "epoch": 0.4624733360818829, "eval_loss": 1.2023468017578125, "eval_mean_token_accuracy": 0.672912814617157, "eval_num_tokens": 81587826.0, "eval_runtime": 56.3171, "eval_samples_per_second": 7.103, "eval_steps_per_second": 0.888, "step": 6450 }, { "epoch": 0.4660584007026727, "grad_norm": 4.60286283493042, "learning_rate": 5.933715742511153e-05, "loss": 4.7999, "mean_token_accuracy": 0.6754237455129624, "num_tokens": 82222966.0, "step": 6500 }, { "epoch": 0.4660584007026727, "eval_loss": 1.2008494138717651, "eval_mean_token_accuracy": 0.6732868099212647, "eval_num_tokens": 82222966.0, "eval_runtime": 56.2084, "eval_samples_per_second": 7.116, "eval_steps_per_second": 0.89, "step": 6500 }, { "epoch": 0.46964346532346246, "grad_norm": 4.842785835266113, "learning_rate": 5.893881453154876e-05, "loss": 4.8096, "mean_token_accuracy": 0.6758663612604141, "num_tokens": 82855504.0, "step": 6550 }, { "epoch": 0.46964346532346246, "eval_loss": 1.200462818145752, "eval_mean_token_accuracy": 0.6739160513877869, "eval_num_tokens": 82855504.0, "eval_runtime": 56.2921, "eval_samples_per_second": 7.106, "eval_steps_per_second": 0.888, "step": 6550 }, { "epoch": 0.47322852994425224, "grad_norm": 4.244312763214111, "learning_rate": 5.854047163798598e-05, "loss": 4.773, "mean_token_accuracy": 0.6780867150425911, "num_tokens": 83488202.0, "step": 6600 }, { "epoch": 0.47322852994425224, "eval_loss": 1.200437068939209, "eval_mean_token_accuracy": 0.6741013741493225, "eval_num_tokens": 83488202.0, "eval_runtime": 56.4321, "eval_samples_per_second": 7.088, "eval_steps_per_second": 0.886, "step": 6600 }, { "epoch": 0.476813594565042, "grad_norm": 4.384121894836426, "learning_rate": 5.814212874442321e-05, "loss": 4.8256, "mean_token_accuracy": 0.6735525381565094, "num_tokens": 84123092.0, "step": 6650 }, { "epoch": 0.476813594565042, "eval_loss": 1.2007168531417847, "eval_mean_token_accuracy": 0.6743572854995727, "eval_num_tokens": 84123092.0, "eval_runtime": 56.5132, "eval_samples_per_second": 7.078, "eval_steps_per_second": 0.885, "step": 6650 }, { "epoch": 0.4803986591858318, "grad_norm": 5.510925769805908, "learning_rate": 5.774378585086042e-05, "loss": 4.7806, "mean_token_accuracy": 0.6750581926107406, "num_tokens": 84756126.0, "step": 6700 }, { "epoch": 0.4803986591858318, "eval_loss": 1.2009855508804321, "eval_mean_token_accuracy": 0.6739739573001862, "eval_num_tokens": 84756126.0, "eval_runtime": 56.6366, "eval_samples_per_second": 7.063, "eval_steps_per_second": 0.883, "step": 6700 }, { "epoch": 0.48398372380662164, "grad_norm": 4.581708908081055, "learning_rate": 5.7345442957297646e-05, "loss": 4.722, "mean_token_accuracy": 0.679350274503231, "num_tokens": 85386870.0, "step": 6750 }, { "epoch": 0.48398372380662164, "eval_loss": 1.2008088827133179, "eval_mean_token_accuracy": 0.6728281593322754, "eval_num_tokens": 85386870.0, "eval_runtime": 56.4337, "eval_samples_per_second": 7.088, "eval_steps_per_second": 0.886, "step": 6750 }, { "epoch": 0.4875687884274114, "grad_norm": 5.783533573150635, "learning_rate": 5.694710006373487e-05, "loss": 4.7616, "mean_token_accuracy": 0.6766093501448631, "num_tokens": 86015238.0, "step": 6800 }, { "epoch": 0.4875687884274114, "eval_loss": 1.2004883289337158, "eval_mean_token_accuracy": 0.6737746036052704, "eval_num_tokens": 86015238.0, "eval_runtime": 56.202, "eval_samples_per_second": 7.117, "eval_steps_per_second": 0.89, "step": 6800 }, { "epoch": 0.4911538530482012, "grad_norm": 4.4624714851379395, "learning_rate": 5.654875717017208e-05, "loss": 4.8036, "mean_token_accuracy": 0.6753204807639122, "num_tokens": 86646920.0, "step": 6850 }, { "epoch": 0.4911538530482012, "eval_loss": 1.1992673873901367, "eval_mean_token_accuracy": 0.6739728832244873, "eval_num_tokens": 86646920.0, "eval_runtime": 56.3145, "eval_samples_per_second": 7.103, "eval_steps_per_second": 0.888, "step": 6850 }, { "epoch": 0.494738917668991, "grad_norm": 4.528706073760986, "learning_rate": 5.615041427660931e-05, "loss": 4.8219, "mean_token_accuracy": 0.673625990152359, "num_tokens": 87279245.0, "step": 6900 }, { "epoch": 0.494738917668991, "eval_loss": 1.1988134384155273, "eval_mean_token_accuracy": 0.673334904909134, "eval_num_tokens": 87279245.0, "eval_runtime": 56.4107, "eval_samples_per_second": 7.091, "eval_steps_per_second": 0.886, "step": 6900 }, { "epoch": 0.49832398228978075, "grad_norm": 4.4395527839660645, "learning_rate": 5.575207138304653e-05, "loss": 4.8404, "mean_token_accuracy": 0.6731960904598236, "num_tokens": 87908715.0, "step": 6950 }, { "epoch": 0.49832398228978075, "eval_loss": 1.198786973953247, "eval_mean_token_accuracy": 0.6738696718215942, "eval_num_tokens": 87908715.0, "eval_runtime": 56.4217, "eval_samples_per_second": 7.089, "eval_steps_per_second": 0.886, "step": 6950 }, { "epoch": 0.5019090469105706, "grad_norm": 4.999813079833984, "learning_rate": 5.535372848948375e-05, "loss": 4.8014, "mean_token_accuracy": 0.6743469536304474, "num_tokens": 88541353.0, "step": 7000 }, { "epoch": 0.5019090469105706, "eval_loss": 1.1984182596206665, "eval_mean_token_accuracy": 0.6738111090660095, "eval_num_tokens": 88541353.0, "eval_runtime": 56.269, "eval_samples_per_second": 7.109, "eval_steps_per_second": 0.889, "step": 7000 }, { "epoch": 0.5054941115313604, "grad_norm": 5.244815826416016, "learning_rate": 5.4955385595920975e-05, "loss": 4.7345, "mean_token_accuracy": 0.6794330298900604, "num_tokens": 89171565.0, "step": 7050 }, { "epoch": 0.5054941115313604, "eval_loss": 1.1977105140686035, "eval_mean_token_accuracy": 0.6732430410385132, "eval_num_tokens": 89171565.0, "eval_runtime": 56.2208, "eval_samples_per_second": 7.115, "eval_steps_per_second": 0.889, "step": 7050 }, { "epoch": 0.5090791761521501, "grad_norm": 4.170567512512207, "learning_rate": 5.4557042702358196e-05, "loss": 4.7721, "mean_token_accuracy": 0.6758232372999191, "num_tokens": 89803955.0, "step": 7100 }, { "epoch": 0.5090791761521501, "eval_loss": 1.1980831623077393, "eval_mean_token_accuracy": 0.6739831912517548, "eval_num_tokens": 89803955.0, "eval_runtime": 56.1966, "eval_samples_per_second": 7.118, "eval_steps_per_second": 0.89, "step": 7100 }, { "epoch": 0.5126642407729399, "grad_norm": 4.576419830322266, "learning_rate": 5.415869980879541e-05, "loss": 4.7588, "mean_token_accuracy": 0.6762737995386123, "num_tokens": 90440293.0, "step": 7150 }, { "epoch": 0.5126642407729399, "eval_loss": 1.1975429058074951, "eval_mean_token_accuracy": 0.673925119638443, "eval_num_tokens": 90440293.0, "eval_runtime": 56.1988, "eval_samples_per_second": 7.118, "eval_steps_per_second": 0.89, "step": 7150 }, { "epoch": 0.5162493053937297, "grad_norm": 4.430201530456543, "learning_rate": 5.376035691523263e-05, "loss": 4.7482, "mean_token_accuracy": 0.6770784831047059, "num_tokens": 91071574.0, "step": 7200 }, { "epoch": 0.5162493053937297, "eval_loss": 1.1972256898880005, "eval_mean_token_accuracy": 0.6727135396003723, "eval_num_tokens": 91071574.0, "eval_runtime": 56.2879, "eval_samples_per_second": 7.106, "eval_steps_per_second": 0.888, "step": 7200 }, { "epoch": 0.5198343700145195, "grad_norm": 4.440696716308594, "learning_rate": 5.336201402166986e-05, "loss": 4.8496, "mean_token_accuracy": 0.6734337306022644, "num_tokens": 91704308.0, "step": 7250 }, { "epoch": 0.5198343700145195, "eval_loss": 1.196380853652954, "eval_mean_token_accuracy": 0.674187605381012, "eval_num_tokens": 91704308.0, "eval_runtime": 56.1737, "eval_samples_per_second": 7.121, "eval_steps_per_second": 0.89, "step": 7250 }, { "epoch": 0.5234194346353093, "grad_norm": 4.427169322967529, "learning_rate": 5.2963671128107075e-05, "loss": 4.7884, "mean_token_accuracy": 0.6760821756720543, "num_tokens": 92338569.0, "step": 7300 }, { "epoch": 0.5234194346353093, "eval_loss": 1.1962813138961792, "eval_mean_token_accuracy": 0.6740961968898773, "eval_num_tokens": 92338569.0, "eval_runtime": 56.279, "eval_samples_per_second": 7.107, "eval_steps_per_second": 0.888, "step": 7300 }, { "epoch": 0.527004499256099, "grad_norm": 4.586068630218506, "learning_rate": 5.25653282345443e-05, "loss": 4.8462, "mean_token_accuracy": 0.6725165358185768, "num_tokens": 92970201.0, "step": 7350 }, { "epoch": 0.527004499256099, "eval_loss": 1.1966549158096313, "eval_mean_token_accuracy": 0.6744921112060547, "eval_num_tokens": 92970201.0, "eval_runtime": 56.2579, "eval_samples_per_second": 7.11, "eval_steps_per_second": 0.889, "step": 7350 }, { "epoch": 0.5305895638768888, "grad_norm": 4.275878429412842, "learning_rate": 5.2166985340981525e-05, "loss": 4.7573, "mean_token_accuracy": 0.6764388364553452, "num_tokens": 93604624.0, "step": 7400 }, { "epoch": 0.5305895638768888, "eval_loss": 1.1963127851486206, "eval_mean_token_accuracy": 0.6738464891910553, "eval_num_tokens": 93604624.0, "eval_runtime": 56.2885, "eval_samples_per_second": 7.106, "eval_steps_per_second": 0.888, "step": 7400 }, { "epoch": 0.5341746284976787, "grad_norm": 4.383382797241211, "learning_rate": 5.176864244741873e-05, "loss": 4.8151, "mean_token_accuracy": 0.6745539313554764, "num_tokens": 94237768.0, "step": 7450 }, { "epoch": 0.5341746284976787, "eval_loss": 1.196314811706543, "eval_mean_token_accuracy": 0.6741366982460022, "eval_num_tokens": 94237768.0, "eval_runtime": 56.1682, "eval_samples_per_second": 7.121, "eval_steps_per_second": 0.89, "step": 7450 }, { "epoch": 0.5377596931184685, "grad_norm": 4.777865409851074, "learning_rate": 5.137029955385596e-05, "loss": 4.7179, "mean_token_accuracy": 0.6791237652301788, "num_tokens": 94868917.0, "step": 7500 }, { "epoch": 0.5377596931184685, "eval_loss": 1.1951854228973389, "eval_mean_token_accuracy": 0.6737760400772095, "eval_num_tokens": 94868917.0, "eval_runtime": 57.4744, "eval_samples_per_second": 6.96, "eval_steps_per_second": 0.87, "step": 7500 }, { "epoch": 0.5413447577392583, "grad_norm": 5.250718116760254, "learning_rate": 5.097195666029318e-05, "loss": 4.853, "mean_token_accuracy": 0.671261510848999, "num_tokens": 95504086.0, "step": 7550 }, { "epoch": 0.5413447577392583, "eval_loss": 1.1954213380813599, "eval_mean_token_accuracy": 0.6752165842056275, "eval_num_tokens": 95504086.0, "eval_runtime": 56.6697, "eval_samples_per_second": 7.058, "eval_steps_per_second": 0.882, "step": 7550 }, { "epoch": 0.544929822360048, "grad_norm": 4.873703479766846, "learning_rate": 5.05736137667304e-05, "loss": 4.7717, "mean_token_accuracy": 0.6773542383313179, "num_tokens": 96139635.0, "step": 7600 }, { "epoch": 0.544929822360048, "eval_loss": 1.1948680877685547, "eval_mean_token_accuracy": 0.6749390983581542, "eval_num_tokens": 96139635.0, "eval_runtime": 55.7291, "eval_samples_per_second": 7.178, "eval_steps_per_second": 0.897, "step": 7600 }, { "epoch": 0.5485148869808378, "grad_norm": 4.877697467803955, "learning_rate": 5.0175270873167626e-05, "loss": 4.7886, "mean_token_accuracy": 0.6762890338897705, "num_tokens": 96770646.0, "step": 7650 }, { "epoch": 0.5485148869808378, "eval_loss": 1.1947038173675537, "eval_mean_token_accuracy": 0.6754334461688996, "eval_num_tokens": 96770646.0, "eval_runtime": 55.5575, "eval_samples_per_second": 7.2, "eval_steps_per_second": 0.9, "step": 7650 }, { "epoch": 0.5520999516016276, "grad_norm": 4.1052117347717285, "learning_rate": 4.977692797960485e-05, "loss": 4.8169, "mean_token_accuracy": 0.6731060117483139, "num_tokens": 97404492.0, "step": 7700 }, { "epoch": 0.5520999516016276, "eval_loss": 1.1950753927230835, "eval_mean_token_accuracy": 0.6745660018920898, "eval_num_tokens": 97404492.0, "eval_runtime": 55.0653, "eval_samples_per_second": 7.264, "eval_steps_per_second": 0.908, "step": 7700 }, { "epoch": 0.5556850162224174, "grad_norm": 4.796311855316162, "learning_rate": 4.937858508604207e-05, "loss": 4.7797, "mean_token_accuracy": 0.6754096934199333, "num_tokens": 98040499.0, "step": 7750 }, { "epoch": 0.5556850162224174, "eval_loss": 1.1943681240081787, "eval_mean_token_accuracy": 0.6758341288566589, "eval_num_tokens": 98040499.0, "eval_runtime": 55.0042, "eval_samples_per_second": 7.272, "eval_steps_per_second": 0.909, "step": 7750 }, { "epoch": 0.5592700808432072, "grad_norm": 4.996248722076416, "learning_rate": 4.898024219247929e-05, "loss": 4.7915, "mean_token_accuracy": 0.6766231226921081, "num_tokens": 98666526.0, "step": 7800 }, { "epoch": 0.5592700808432072, "eval_loss": 1.1935983896255493, "eval_mean_token_accuracy": 0.6747439002990723, "eval_num_tokens": 98666526.0, "eval_runtime": 55.0204, "eval_samples_per_second": 7.27, "eval_steps_per_second": 0.909, "step": 7800 }, { "epoch": 0.562855145463997, "grad_norm": 4.6099534034729, "learning_rate": 4.858189929891651e-05, "loss": 4.7901, "mean_token_accuracy": 0.6749289181828498, "num_tokens": 99297640.0, "step": 7850 }, { "epoch": 0.562855145463997, "eval_loss": 1.1944231986999512, "eval_mean_token_accuracy": 0.6743023383617401, "eval_num_tokens": 99297640.0, "eval_runtime": 55.1784, "eval_samples_per_second": 7.249, "eval_steps_per_second": 0.906, "step": 7850 }, { "epoch": 0.5664402100847867, "grad_norm": 4.517291069030762, "learning_rate": 4.818355640535373e-05, "loss": 4.8626, "mean_token_accuracy": 0.6725256371498108, "num_tokens": 99928477.0, "step": 7900 }, { "epoch": 0.5664402100847867, "eval_loss": 1.1935796737670898, "eval_mean_token_accuracy": 0.6745834064483642, "eval_num_tokens": 99928477.0, "eval_runtime": 55.0686, "eval_samples_per_second": 7.264, "eval_steps_per_second": 0.908, "step": 7900 }, { "epoch": 0.5700252747055766, "grad_norm": 4.5024003982543945, "learning_rate": 4.778521351179095e-05, "loss": 4.712, "mean_token_accuracy": 0.6803113195300102, "num_tokens": 100557231.0, "step": 7950 }, { "epoch": 0.5700252747055766, "eval_loss": 1.1945058107376099, "eval_mean_token_accuracy": 0.6740881907939911, "eval_num_tokens": 100557231.0, "eval_runtime": 54.9762, "eval_samples_per_second": 7.276, "eval_steps_per_second": 0.909, "step": 7950 }, { "epoch": 0.5736103393263664, "grad_norm": 4.741750717163086, "learning_rate": 4.7386870618228176e-05, "loss": 4.7828, "mean_token_accuracy": 0.6768678402900696, "num_tokens": 101189608.0, "step": 8000 }, { "epoch": 0.5736103393263664, "eval_loss": 1.1939120292663574, "eval_mean_token_accuracy": 0.6754596734046936, "eval_num_tokens": 101189608.0, "eval_runtime": 55.073, "eval_samples_per_second": 7.263, "eval_steps_per_second": 0.908, "step": 8000 }, { "epoch": 0.5771954039471562, "grad_norm": 4.49591064453125, "learning_rate": 4.698852772466539e-05, "loss": 4.8169, "mean_token_accuracy": 0.6742269179224968, "num_tokens": 101822322.0, "step": 8050 }, { "epoch": 0.5771954039471562, "eval_loss": 1.1937360763549805, "eval_mean_token_accuracy": 0.6749664378166199, "eval_num_tokens": 101822322.0, "eval_runtime": 55.0891, "eval_samples_per_second": 7.261, "eval_steps_per_second": 0.908, "step": 8050 }, { "epoch": 0.580780468567946, "grad_norm": 5.148952007293701, "learning_rate": 4.659018483110262e-05, "loss": 4.7721, "mean_token_accuracy": 0.6771083778142929, "num_tokens": 102453524.0, "step": 8100 }, { "epoch": 0.580780468567946, "eval_loss": 1.194778323173523, "eval_mean_token_accuracy": 0.6745067381858826, "eval_num_tokens": 102453524.0, "eval_runtime": 55.1207, "eval_samples_per_second": 7.257, "eval_steps_per_second": 0.907, "step": 8100 }, { "epoch": 0.5843655331887357, "grad_norm": 5.115074634552002, "learning_rate": 4.619184193753984e-05, "loss": 4.7134, "mean_token_accuracy": 0.6806976914405822, "num_tokens": 103083835.0, "step": 8150 }, { "epoch": 0.5843655331887357, "eval_loss": 1.192982792854309, "eval_mean_token_accuracy": 0.6747034168243409, "eval_num_tokens": 103083835.0, "eval_runtime": 55.0852, "eval_samples_per_second": 7.261, "eval_steps_per_second": 0.908, "step": 8150 }, { "epoch": 0.5879505978095255, "grad_norm": 4.619081497192383, "learning_rate": 4.5793499043977055e-05, "loss": 4.796, "mean_token_accuracy": 0.6744829830527306, "num_tokens": 103715932.0, "step": 8200 }, { "epoch": 0.5879505978095255, "eval_loss": 1.192581057548523, "eval_mean_token_accuracy": 0.6748893690109253, "eval_num_tokens": 103715932.0, "eval_runtime": 55.0423, "eval_samples_per_second": 7.267, "eval_steps_per_second": 0.908, "step": 8200 }, { "epoch": 0.5915356624303153, "grad_norm": 4.433931350708008, "learning_rate": 4.539515615041428e-05, "loss": 4.7435, "mean_token_accuracy": 0.678233249783516, "num_tokens": 104345485.0, "step": 8250 }, { "epoch": 0.5915356624303153, "eval_loss": 1.1933448314666748, "eval_mean_token_accuracy": 0.6751340889930725, "eval_num_tokens": 104345485.0, "eval_runtime": 55.0423, "eval_samples_per_second": 7.267, "eval_steps_per_second": 0.908, "step": 8250 }, { "epoch": 0.5951207270511051, "grad_norm": 4.362198829650879, "learning_rate": 4.49968132568515e-05, "loss": 4.7737, "mean_token_accuracy": 0.6764271047711372, "num_tokens": 104978991.0, "step": 8300 }, { "epoch": 0.5951207270511051, "eval_loss": 1.1930105686187744, "eval_mean_token_accuracy": 0.6747807443141938, "eval_num_tokens": 104978991.0, "eval_runtime": 55.3077, "eval_samples_per_second": 7.232, "eval_steps_per_second": 0.904, "step": 8300 }, { "epoch": 0.5987057916718949, "grad_norm": 4.534180641174316, "learning_rate": 4.459847036328872e-05, "loss": 4.7728, "mean_token_accuracy": 0.677936093211174, "num_tokens": 105610427.0, "step": 8350 }, { "epoch": 0.5987057916718949, "eval_loss": 1.1923025846481323, "eval_mean_token_accuracy": 0.6750785481929779, "eval_num_tokens": 105610427.0, "eval_runtime": 55.0617, "eval_samples_per_second": 7.265, "eval_steps_per_second": 0.908, "step": 8350 }, { "epoch": 0.6022908562926846, "grad_norm": 5.027590274810791, "learning_rate": 4.420012746972594e-05, "loss": 4.7192, "mean_token_accuracy": 0.6799935781955719, "num_tokens": 106243018.0, "step": 8400 }, { "epoch": 0.6022908562926846, "eval_loss": 1.1910535097122192, "eval_mean_token_accuracy": 0.6752017951011657, "eval_num_tokens": 106243018.0, "eval_runtime": 55.087, "eval_samples_per_second": 7.261, "eval_steps_per_second": 0.908, "step": 8400 }, { "epoch": 0.6058759209134744, "grad_norm": 4.658295154571533, "learning_rate": 4.380178457616316e-05, "loss": 4.806, "mean_token_accuracy": 0.6742839315533637, "num_tokens": 106877488.0, "step": 8450 }, { "epoch": 0.6058759209134744, "eval_loss": 1.1911369562149048, "eval_mean_token_accuracy": 0.6754417788982391, "eval_num_tokens": 106877488.0, "eval_runtime": 55.1243, "eval_samples_per_second": 7.256, "eval_steps_per_second": 0.907, "step": 8450 }, { "epoch": 0.6094609855342643, "grad_norm": 4.897305488586426, "learning_rate": 4.340344168260038e-05, "loss": 4.7673, "mean_token_accuracy": 0.6776413953304291, "num_tokens": 107510169.0, "step": 8500 }, { "epoch": 0.6094609855342643, "eval_loss": 1.1910532712936401, "eval_mean_token_accuracy": 0.6744759595394134, "eval_num_tokens": 107510169.0, "eval_runtime": 55.0914, "eval_samples_per_second": 7.261, "eval_steps_per_second": 0.908, "step": 8500 }, { "epoch": 0.6130460501550541, "grad_norm": 4.881381034851074, "learning_rate": 4.3005098789037605e-05, "loss": 4.8061, "mean_token_accuracy": 0.674516750574112, "num_tokens": 108145421.0, "step": 8550 }, { "epoch": 0.6130460501550541, "eval_loss": 1.190964937210083, "eval_mean_token_accuracy": 0.6752122223377228, "eval_num_tokens": 108145421.0, "eval_runtime": 54.985, "eval_samples_per_second": 7.275, "eval_steps_per_second": 0.909, "step": 8550 }, { "epoch": 0.6166311147758439, "grad_norm": 5.073390483856201, "learning_rate": 4.2606755895474826e-05, "loss": 4.7739, "mean_token_accuracy": 0.6762193894386291, "num_tokens": 108780841.0, "step": 8600 }, { "epoch": 0.6166311147758439, "eval_loss": 1.1907490491867065, "eval_mean_token_accuracy": 0.6748946511745453, "eval_num_tokens": 108780841.0, "eval_runtime": 54.9525, "eval_samples_per_second": 7.279, "eval_steps_per_second": 0.91, "step": 8600 }, { "epoch": 0.6202161793966336, "grad_norm": 4.459120750427246, "learning_rate": 4.220841300191205e-05, "loss": 4.7864, "mean_token_accuracy": 0.6758550813794136, "num_tokens": 109418807.0, "step": 8650 }, { "epoch": 0.6202161793966336, "eval_loss": 1.1910618543624878, "eval_mean_token_accuracy": 0.6749819540977477, "eval_num_tokens": 109418807.0, "eval_runtime": 54.9049, "eval_samples_per_second": 7.285, "eval_steps_per_second": 0.911, "step": 8650 }, { "epoch": 0.6238012440174234, "grad_norm": 4.40315055847168, "learning_rate": 4.181007010834927e-05, "loss": 4.7917, "mean_token_accuracy": 0.6742719665169716, "num_tokens": 110057516.0, "step": 8700 }, { "epoch": 0.6238012440174234, "eval_loss": 1.1901732683181763, "eval_mean_token_accuracy": 0.674855477809906, "eval_num_tokens": 110057516.0, "eval_runtime": 54.9882, "eval_samples_per_second": 7.274, "eval_steps_per_second": 0.909, "step": 8700 }, { "epoch": 0.6273863086382132, "grad_norm": 4.657465934753418, "learning_rate": 4.141172721478649e-05, "loss": 4.7503, "mean_token_accuracy": 0.6786279901862144, "num_tokens": 110689903.0, "step": 8750 }, { "epoch": 0.6273863086382132, "eval_loss": 1.190616488456726, "eval_mean_token_accuracy": 0.675126885175705, "eval_num_tokens": 110689903.0, "eval_runtime": 55.2291, "eval_samples_per_second": 7.243, "eval_steps_per_second": 0.905, "step": 8750 }, { "epoch": 0.630971373259003, "grad_norm": 4.282220840454102, "learning_rate": 4.101338432122371e-05, "loss": 4.7029, "mean_token_accuracy": 0.6801710060238838, "num_tokens": 111323016.0, "step": 8800 }, { "epoch": 0.630971373259003, "eval_loss": 1.1904511451721191, "eval_mean_token_accuracy": 0.6755478191375732, "eval_num_tokens": 111323016.0, "eval_runtime": 54.9632, "eval_samples_per_second": 7.278, "eval_steps_per_second": 0.91, "step": 8800 }, { "epoch": 0.6345564378797928, "grad_norm": 4.598837852478027, "learning_rate": 4.0615041427660933e-05, "loss": 4.7755, "mean_token_accuracy": 0.6768260210752487, "num_tokens": 111959283.0, "step": 8850 }, { "epoch": 0.6345564378797928, "eval_loss": 1.1904475688934326, "eval_mean_token_accuracy": 0.6753637742996216, "eval_num_tokens": 111959283.0, "eval_runtime": 55.0394, "eval_samples_per_second": 7.268, "eval_steps_per_second": 0.908, "step": 8850 }, { "epoch": 0.6381415025005825, "grad_norm": 4.1816558837890625, "learning_rate": 4.0216698534098155e-05, "loss": 4.7758, "mean_token_accuracy": 0.6754831087589264, "num_tokens": 112596409.0, "step": 8900 }, { "epoch": 0.6381415025005825, "eval_loss": 1.1898977756500244, "eval_mean_token_accuracy": 0.6755084788799286, "eval_num_tokens": 112596409.0, "eval_runtime": 55.0202, "eval_samples_per_second": 7.27, "eval_steps_per_second": 0.909, "step": 8900 }, { "epoch": 0.6417265671213723, "grad_norm": 4.973260402679443, "learning_rate": 3.9818355640535376e-05, "loss": 4.7116, "mean_token_accuracy": 0.6818169742822647, "num_tokens": 113224902.0, "step": 8950 }, { "epoch": 0.6417265671213723, "eval_loss": 1.1896042823791504, "eval_mean_token_accuracy": 0.6757630515098572, "eval_num_tokens": 113224902.0, "eval_runtime": 54.9621, "eval_samples_per_second": 7.278, "eval_steps_per_second": 0.91, "step": 8950 }, { "epoch": 0.6453116317421621, "grad_norm": 4.470012664794922, "learning_rate": 3.94200127469726e-05, "loss": 4.713, "mean_token_accuracy": 0.6787376815080642, "num_tokens": 113858275.0, "step": 9000 }, { "epoch": 0.6453116317421621, "eval_loss": 1.1898657083511353, "eval_mean_token_accuracy": 0.6760083436965942, "eval_num_tokens": 113858275.0, "eval_runtime": 54.9184, "eval_samples_per_second": 7.284, "eval_steps_per_second": 0.91, "step": 9000 }, { "epoch": 0.648896696362952, "grad_norm": 4.098659992218018, "learning_rate": 3.902166985340981e-05, "loss": 4.7423, "mean_token_accuracy": 0.6774056190252304, "num_tokens": 114495903.0, "step": 9050 }, { "epoch": 0.648896696362952, "eval_loss": 1.1897211074829102, "eval_mean_token_accuracy": 0.6754970908164978, "eval_num_tokens": 114495903.0, "eval_runtime": 55.043, "eval_samples_per_second": 7.267, "eval_steps_per_second": 0.908, "step": 9050 }, { "epoch": 0.6524817609837418, "grad_norm": 4.9181976318359375, "learning_rate": 3.862332695984704e-05, "loss": 4.7403, "mean_token_accuracy": 0.6782529127597808, "num_tokens": 115123519.0, "step": 9100 }, { "epoch": 0.6524817609837418, "eval_loss": 1.1898068189620972, "eval_mean_token_accuracy": 0.6745435571670533, "eval_num_tokens": 115123519.0, "eval_runtime": 54.9272, "eval_samples_per_second": 7.282, "eval_steps_per_second": 0.91, "step": 9100 }, { "epoch": 0.6560668256045316, "grad_norm": 4.978320121765137, "learning_rate": 3.8224984066284255e-05, "loss": 4.8028, "mean_token_accuracy": 0.675481299161911, "num_tokens": 115755644.0, "step": 9150 }, { "epoch": 0.6560668256045316, "eval_loss": 1.1891556978225708, "eval_mean_token_accuracy": 0.6756797277927399, "eval_num_tokens": 115755644.0, "eval_runtime": 55.005, "eval_samples_per_second": 7.272, "eval_steps_per_second": 0.909, "step": 9150 }, { "epoch": 0.6596518902253213, "grad_norm": 4.682608604431152, "learning_rate": 3.7826641172721484e-05, "loss": 4.8228, "mean_token_accuracy": 0.6748796856403351, "num_tokens": 116391334.0, "step": 9200 }, { "epoch": 0.6596518902253213, "eval_loss": 1.1887702941894531, "eval_mean_token_accuracy": 0.6753370201587677, "eval_num_tokens": 116391334.0, "eval_runtime": 54.9714, "eval_samples_per_second": 7.277, "eval_steps_per_second": 0.91, "step": 9200 }, { "epoch": 0.6632369548461111, "grad_norm": 4.45632791519165, "learning_rate": 3.7428298279158705e-05, "loss": 4.7473, "mean_token_accuracy": 0.6801807761192322, "num_tokens": 117026978.0, "step": 9250 }, { "epoch": 0.6632369548461111, "eval_loss": 1.1893665790557861, "eval_mean_token_accuracy": 0.6753548145294189, "eval_num_tokens": 117026978.0, "eval_runtime": 55.0208, "eval_samples_per_second": 7.27, "eval_steps_per_second": 0.909, "step": 9250 }, { "epoch": 0.6668220194669009, "grad_norm": 4.2916951179504395, "learning_rate": 3.702995538559592e-05, "loss": 4.7187, "mean_token_accuracy": 0.6798599645495415, "num_tokens": 117660612.0, "step": 9300 }, { "epoch": 0.6668220194669009, "eval_loss": 1.189585566520691, "eval_mean_token_accuracy": 0.6756225192546844, "eval_num_tokens": 117660612.0, "eval_runtime": 55.0296, "eval_samples_per_second": 7.269, "eval_steps_per_second": 0.909, "step": 9300 }, { "epoch": 0.6704070840876907, "grad_norm": 4.559842109680176, "learning_rate": 3.663161249203315e-05, "loss": 4.7029, "mean_token_accuracy": 0.6812646022439003, "num_tokens": 118290778.0, "step": 9350 }, { "epoch": 0.6704070840876907, "eval_loss": 1.1883878707885742, "eval_mean_token_accuracy": 0.6754980099201202, "eval_num_tokens": 118290778.0, "eval_runtime": 55.0845, "eval_samples_per_second": 7.262, "eval_steps_per_second": 0.908, "step": 9350 }, { "epoch": 0.6739921487084805, "grad_norm": 4.957666873931885, "learning_rate": 3.623326959847036e-05, "loss": 4.716, "mean_token_accuracy": 0.6789155259728432, "num_tokens": 118921441.0, "step": 9400 }, { "epoch": 0.6739921487084805, "eval_loss": 1.1882615089416504, "eval_mean_token_accuracy": 0.6757899785041809, "eval_num_tokens": 118921441.0, "eval_runtime": 55.0393, "eval_samples_per_second": 7.268, "eval_steps_per_second": 0.908, "step": 9400 }, { "epoch": 0.6775772133292702, "grad_norm": 4.460175037384033, "learning_rate": 3.5834926704907584e-05, "loss": 4.7439, "mean_token_accuracy": 0.6776839691400528, "num_tokens": 119549331.0, "step": 9450 }, { "epoch": 0.6775772133292702, "eval_loss": 1.1887913942337036, "eval_mean_token_accuracy": 0.6761759769916534, "eval_num_tokens": 119549331.0, "eval_runtime": 54.9213, "eval_samples_per_second": 7.283, "eval_steps_per_second": 0.91, "step": 9450 }, { "epoch": 0.68116227795006, "grad_norm": 4.3697638511657715, "learning_rate": 3.543658381134481e-05, "loss": 4.7796, "mean_token_accuracy": 0.6753658777475358, "num_tokens": 120180706.0, "step": 9500 }, { "epoch": 0.68116227795006, "eval_loss": 1.187656044960022, "eval_mean_token_accuracy": 0.6755701994895935, "eval_num_tokens": 120180706.0, "eval_runtime": 54.925, "eval_samples_per_second": 7.283, "eval_steps_per_second": 0.91, "step": 9500 }, { "epoch": 0.6847473425708498, "grad_norm": 4.676335334777832, "learning_rate": 3.503824091778203e-05, "loss": 4.8121, "mean_token_accuracy": 0.6740303432941437, "num_tokens": 120813928.0, "step": 9550 }, { "epoch": 0.6847473425708498, "eval_loss": 1.1875134706497192, "eval_mean_token_accuracy": 0.6751706182956696, "eval_num_tokens": 120813928.0, "eval_runtime": 55.1212, "eval_samples_per_second": 7.257, "eval_steps_per_second": 0.907, "step": 9550 }, { "epoch": 0.6883324071916397, "grad_norm": 5.17042875289917, "learning_rate": 3.463989802421925e-05, "loss": 4.7668, "mean_token_accuracy": 0.6758210748434067, "num_tokens": 121446792.0, "step": 9600 }, { "epoch": 0.6883324071916397, "eval_loss": 1.1872638463974, "eval_mean_token_accuracy": 0.6760274660587311, "eval_num_tokens": 121446792.0, "eval_runtime": 55.1534, "eval_samples_per_second": 7.252, "eval_steps_per_second": 0.907, "step": 9600 }, { "epoch": 0.6919174718124295, "grad_norm": 4.4179840087890625, "learning_rate": 3.424155513065647e-05, "loss": 4.7633, "mean_token_accuracy": 0.6759152534604073, "num_tokens": 122072069.0, "step": 9650 }, { "epoch": 0.6919174718124295, "eval_loss": 1.1880455017089844, "eval_mean_token_accuracy": 0.6756154441833496, "eval_num_tokens": 122072069.0, "eval_runtime": 55.0701, "eval_samples_per_second": 7.263, "eval_steps_per_second": 0.908, "step": 9650 }, { "epoch": 0.6955025364332192, "grad_norm": 4.7966437339782715, "learning_rate": 3.384321223709369e-05, "loss": 4.7314, "mean_token_accuracy": 0.6790701761841774, "num_tokens": 122703011.0, "step": 9700 }, { "epoch": 0.6955025364332192, "eval_loss": 1.187593936920166, "eval_mean_token_accuracy": 0.6753950679302215, "eval_num_tokens": 122703011.0, "eval_runtime": 55.1146, "eval_samples_per_second": 7.258, "eval_steps_per_second": 0.907, "step": 9700 }, { "epoch": 0.699087601054009, "grad_norm": 4.6988630294799805, "learning_rate": 3.344486934353091e-05, "loss": 4.7162, "mean_token_accuracy": 0.680111817419529, "num_tokens": 123334937.0, "step": 9750 }, { "epoch": 0.699087601054009, "eval_loss": 1.1878883838653564, "eval_mean_token_accuracy": 0.6757830834388733, "eval_num_tokens": 123334937.0, "eval_runtime": 55.0822, "eval_samples_per_second": 7.262, "eval_steps_per_second": 0.908, "step": 9750 }, { "epoch": 0.7026726656747988, "grad_norm": 5.11058235168457, "learning_rate": 3.3046526449968134e-05, "loss": 4.7778, "mean_token_accuracy": 0.6746508419513703, "num_tokens": 123969291.0, "step": 9800 }, { "epoch": 0.7026726656747988, "eval_loss": 1.1869330406188965, "eval_mean_token_accuracy": 0.6755635273456574, "eval_num_tokens": 123969291.0, "eval_runtime": 55.0482, "eval_samples_per_second": 7.266, "eval_steps_per_second": 0.908, "step": 9800 }, { "epoch": 0.7062577302955886, "grad_norm": 4.539863109588623, "learning_rate": 3.2648183556405356e-05, "loss": 4.7265, "mean_token_accuracy": 0.6780777916312217, "num_tokens": 124597342.0, "step": 9850 }, { "epoch": 0.7062577302955886, "eval_loss": 1.1864935159683228, "eval_mean_token_accuracy": 0.6759455275535583, "eval_num_tokens": 124597342.0, "eval_runtime": 54.9859, "eval_samples_per_second": 7.275, "eval_steps_per_second": 0.909, "step": 9850 }, { "epoch": 0.7098427949163784, "grad_norm": 4.2660112380981445, "learning_rate": 3.224984066284258e-05, "loss": 4.7727, "mean_token_accuracy": 0.6779581853747367, "num_tokens": 125230255.0, "step": 9900 }, { "epoch": 0.7098427949163784, "eval_loss": 1.1863397359848022, "eval_mean_token_accuracy": 0.6765384769439697, "eval_num_tokens": 125230255.0, "eval_runtime": 55.0326, "eval_samples_per_second": 7.268, "eval_steps_per_second": 0.909, "step": 9900 }, { "epoch": 0.7134278595371681, "grad_norm": 4.660075664520264, "learning_rate": 3.18514977692798e-05, "loss": 4.6841, "mean_token_accuracy": 0.6811311572790146, "num_tokens": 125863729.0, "step": 9950 }, { "epoch": 0.7134278595371681, "eval_loss": 1.1863139867782593, "eval_mean_token_accuracy": 0.676126846075058, "eval_num_tokens": 125863729.0, "eval_runtime": 55.1646, "eval_samples_per_second": 7.251, "eval_steps_per_second": 0.906, "step": 9950 }, { "epoch": 0.7170129241579579, "grad_norm": 4.517760753631592, "learning_rate": 3.145315487571702e-05, "loss": 4.7781, "mean_token_accuracy": 0.6761695435643196, "num_tokens": 126496619.0, "step": 10000 }, { "epoch": 0.7170129241579579, "eval_loss": 1.1863616704940796, "eval_mean_token_accuracy": 0.6760414135456085, "eval_num_tokens": 126496619.0, "eval_runtime": 55.0497, "eval_samples_per_second": 7.266, "eval_steps_per_second": 0.908, "step": 10000 }, { "epoch": 0.7205979887787477, "grad_norm": 4.821887493133545, "learning_rate": 3.105481198215424e-05, "loss": 4.7506, "mean_token_accuracy": 0.677640742957592, "num_tokens": 127129025.0, "step": 10050 }, { "epoch": 0.7205979887787477, "eval_loss": 1.1864928007125854, "eval_mean_token_accuracy": 0.6763237309455872, "eval_num_tokens": 127129025.0, "eval_runtime": 55.0217, "eval_samples_per_second": 7.27, "eval_steps_per_second": 0.909, "step": 10050 }, { "epoch": 0.7241830533995375, "grad_norm": 5.278724670410156, "learning_rate": 3.065646908859146e-05, "loss": 4.7726, "mean_token_accuracy": 0.6762316790223122, "num_tokens": 127757800.0, "step": 10100 }, { "epoch": 0.7241830533995375, "eval_loss": 1.1857789754867554, "eval_mean_token_accuracy": 0.6754815447330474, "eval_num_tokens": 127757800.0, "eval_runtime": 55.2673, "eval_samples_per_second": 7.238, "eval_steps_per_second": 0.905, "step": 10100 }, { "epoch": 0.7277681180203274, "grad_norm": 4.649436950683594, "learning_rate": 3.025812619502868e-05, "loss": 4.803, "mean_token_accuracy": 0.6757835251092911, "num_tokens": 128391102.0, "step": 10150 }, { "epoch": 0.7277681180203274, "eval_loss": 1.1859068870544434, "eval_mean_token_accuracy": 0.6756039881706237, "eval_num_tokens": 128391102.0, "eval_runtime": 55.0647, "eval_samples_per_second": 7.264, "eval_steps_per_second": 0.908, "step": 10150 }, { "epoch": 0.7313531826411171, "grad_norm": 4.287916660308838, "learning_rate": 2.9859783301465906e-05, "loss": 4.7373, "mean_token_accuracy": 0.677640765607357, "num_tokens": 129023759.0, "step": 10200 }, { "epoch": 0.7313531826411171, "eval_loss": 1.1861519813537598, "eval_mean_token_accuracy": 0.6761102056503296, "eval_num_tokens": 129023759.0, "eval_runtime": 54.9417, "eval_samples_per_second": 7.28, "eval_steps_per_second": 0.91, "step": 10200 }, { "epoch": 0.7349382472619069, "grad_norm": 4.765435695648193, "learning_rate": 2.9461440407903124e-05, "loss": 4.7741, "mean_token_accuracy": 0.6759647503495216, "num_tokens": 129655248.0, "step": 10250 }, { "epoch": 0.7349382472619069, "eval_loss": 1.1853660345077515, "eval_mean_token_accuracy": 0.6756195032596588, "eval_num_tokens": 129655248.0, "eval_runtime": 55.3223, "eval_samples_per_second": 7.23, "eval_steps_per_second": 0.904, "step": 10250 }, { "epoch": 0.7385233118826967, "grad_norm": 4.814145088195801, "learning_rate": 2.906309751434035e-05, "loss": 4.783, "mean_token_accuracy": 0.6754540035128593, "num_tokens": 130291512.0, "step": 10300 }, { "epoch": 0.7385233118826967, "eval_loss": 1.1858062744140625, "eval_mean_token_accuracy": 0.6755566847324371, "eval_num_tokens": 130291512.0, "eval_runtime": 55.3243, "eval_samples_per_second": 7.23, "eval_steps_per_second": 0.904, "step": 10300 }, { "epoch": 0.7421083765034865, "grad_norm": 4.518885135650635, "learning_rate": 2.8664754620777567e-05, "loss": 4.7083, "mean_token_accuracy": 0.6788066929578781, "num_tokens": 130922542.0, "step": 10350 }, { "epoch": 0.7421083765034865, "eval_loss": 1.185410499572754, "eval_mean_token_accuracy": 0.6754859507083892, "eval_num_tokens": 130922542.0, "eval_runtime": 55.3328, "eval_samples_per_second": 7.229, "eval_steps_per_second": 0.904, "step": 10350 }, { "epoch": 0.7456934411242763, "grad_norm": 4.623891830444336, "learning_rate": 2.8266411727214788e-05, "loss": 4.8077, "mean_token_accuracy": 0.6753540116548539, "num_tokens": 131553716.0, "step": 10400 }, { "epoch": 0.7456934411242763, "eval_loss": 1.18582022190094, "eval_mean_token_accuracy": 0.6761428475379944, "eval_num_tokens": 131553716.0, "eval_runtime": 55.033, "eval_samples_per_second": 7.268, "eval_steps_per_second": 0.909, "step": 10400 }, { "epoch": 0.749278505745066, "grad_norm": 4.524717807769775, "learning_rate": 2.7868068833652013e-05, "loss": 4.7216, "mean_token_accuracy": 0.6801271498203277, "num_tokens": 132187857.0, "step": 10450 }, { "epoch": 0.749278505745066, "eval_loss": 1.1854428052902222, "eval_mean_token_accuracy": 0.6755358970165253, "eval_num_tokens": 132187857.0, "eval_runtime": 55.0174, "eval_samples_per_second": 7.27, "eval_steps_per_second": 0.909, "step": 10450 }, { "epoch": 0.7528635703658558, "grad_norm": 4.81862211227417, "learning_rate": 2.746972594008923e-05, "loss": 4.8024, "mean_token_accuracy": 0.6740377223491669, "num_tokens": 132818348.0, "step": 10500 }, { "epoch": 0.7528635703658558, "eval_loss": 1.184729814529419, "eval_mean_token_accuracy": 0.6763768219947814, "eval_num_tokens": 132818348.0, "eval_runtime": 54.9843, "eval_samples_per_second": 7.275, "eval_steps_per_second": 0.909, "step": 10500 }, { "epoch": 0.7564486349866456, "grad_norm": 4.876639366149902, "learning_rate": 2.707138304652645e-05, "loss": 4.691, "mean_token_accuracy": 0.682259525358677, "num_tokens": 133450534.0, "step": 10550 }, { "epoch": 0.7564486349866456, "eval_loss": 1.1844180822372437, "eval_mean_token_accuracy": 0.6765269267559052, "eval_num_tokens": 133450534.0, "eval_runtime": 55.1025, "eval_samples_per_second": 7.259, "eval_steps_per_second": 0.907, "step": 10550 }, { "epoch": 0.7600336996074354, "grad_norm": 5.20668363571167, "learning_rate": 2.6673040152963674e-05, "loss": 4.7633, "mean_token_accuracy": 0.6769976457953453, "num_tokens": 134081427.0, "step": 10600 }, { "epoch": 0.7600336996074354, "eval_loss": 1.1848989725112915, "eval_mean_token_accuracy": 0.6765480875968933, "eval_num_tokens": 134081427.0, "eval_runtime": 55.16, "eval_samples_per_second": 7.252, "eval_steps_per_second": 0.906, "step": 10600 }, { "epoch": 0.7636187642282252, "grad_norm": 4.415744304656982, "learning_rate": 2.6274697259400892e-05, "loss": 4.7572, "mean_token_accuracy": 0.6762901389598847, "num_tokens": 134717648.0, "step": 10650 }, { "epoch": 0.7636187642282252, "eval_loss": 1.1854746341705322, "eval_mean_token_accuracy": 0.6762021934986114, "eval_num_tokens": 134717648.0, "eval_runtime": 55.1597, "eval_samples_per_second": 7.252, "eval_steps_per_second": 0.906, "step": 10650 }, { "epoch": 0.7672038288490151, "grad_norm": 4.984974384307861, "learning_rate": 2.5876354365838113e-05, "loss": 4.6918, "mean_token_accuracy": 0.6813731342554092, "num_tokens": 135352362.0, "step": 10700 }, { "epoch": 0.7672038288490151, "eval_loss": 1.1856120824813843, "eval_mean_token_accuracy": 0.6763051617145538, "eval_num_tokens": 135352362.0, "eval_runtime": 54.9725, "eval_samples_per_second": 7.276, "eval_steps_per_second": 0.91, "step": 10700 }, { "epoch": 0.7707888934698048, "grad_norm": 4.5358781814575195, "learning_rate": 2.5478011472275338e-05, "loss": 4.7242, "mean_token_accuracy": 0.6771323186159134, "num_tokens": 135991559.0, "step": 10750 }, { "epoch": 0.7707888934698048, "eval_loss": 1.185410737991333, "eval_mean_token_accuracy": 0.676514265537262, "eval_num_tokens": 135991559.0, "eval_runtime": 55.0995, "eval_samples_per_second": 7.26, "eval_steps_per_second": 0.907, "step": 10750 }, { "epoch": 0.7743739580905946, "grad_norm": 4.364614009857178, "learning_rate": 2.5079668578712556e-05, "loss": 4.6947, "mean_token_accuracy": 0.6806002199649811, "num_tokens": 136624024.0, "step": 10800 }, { "epoch": 0.7743739580905946, "eval_loss": 1.184848666191101, "eval_mean_token_accuracy": 0.676878696680069, "eval_num_tokens": 136624024.0, "eval_runtime": 55.7118, "eval_samples_per_second": 7.18, "eval_steps_per_second": 0.897, "step": 10800 }, { "epoch": 0.7779590227113844, "grad_norm": 4.293883323669434, "learning_rate": 2.4681325685149778e-05, "loss": 4.7454, "mean_token_accuracy": 0.6780085292458534, "num_tokens": 137259400.0, "step": 10850 }, { "epoch": 0.7779590227113844, "eval_loss": 1.185441493988037, "eval_mean_token_accuracy": 0.6766848075389862, "eval_num_tokens": 137259400.0, "eval_runtime": 56.8526, "eval_samples_per_second": 7.036, "eval_steps_per_second": 0.879, "step": 10850 }, { "epoch": 0.7815440873321742, "grad_norm": 4.8266143798828125, "learning_rate": 2.4282982791587e-05, "loss": 4.7078, "mean_token_accuracy": 0.6800830870866775, "num_tokens": 137888937.0, "step": 10900 }, { "epoch": 0.7815440873321742, "eval_loss": 1.18449068069458, "eval_mean_token_accuracy": 0.6764636623859406, "eval_num_tokens": 137888937.0, "eval_runtime": 56.5392, "eval_samples_per_second": 7.075, "eval_steps_per_second": 0.884, "step": 10900 }, { "epoch": 0.785129151952964, "grad_norm": 4.471580982208252, "learning_rate": 2.388463989802422e-05, "loss": 4.6928, "mean_token_accuracy": 0.6799645683169365, "num_tokens": 138523132.0, "step": 10950 }, { "epoch": 0.785129151952964, "eval_loss": 1.1840896606445312, "eval_mean_token_accuracy": 0.6770457863807678, "eval_num_tokens": 138523132.0, "eval_runtime": 56.9603, "eval_samples_per_second": 7.022, "eval_steps_per_second": 0.878, "step": 10950 }, { "epoch": 0.7887142165737537, "grad_norm": 4.892276763916016, "learning_rate": 2.3486297004461442e-05, "loss": 4.7822, "mean_token_accuracy": 0.6743768805265427, "num_tokens": 139156280.0, "step": 11000 }, { "epoch": 0.7887142165737537, "eval_loss": 1.1841989755630493, "eval_mean_token_accuracy": 0.6765543162822724, "eval_num_tokens": 139156280.0, "eval_runtime": 56.1325, "eval_samples_per_second": 7.126, "eval_steps_per_second": 0.891, "step": 11000 }, { "epoch": 0.7922992811945435, "grad_norm": 5.218216896057129, "learning_rate": 2.3087954110898663e-05, "loss": 4.753, "mean_token_accuracy": 0.6765683805942535, "num_tokens": 139786084.0, "step": 11050 }, { "epoch": 0.7922992811945435, "eval_loss": 1.1838265657424927, "eval_mean_token_accuracy": 0.6768669807910919, "eval_num_tokens": 139786084.0, "eval_runtime": 57.9142, "eval_samples_per_second": 6.907, "eval_steps_per_second": 0.863, "step": 11050 }, { "epoch": 0.7958843458153333, "grad_norm": 4.109825134277344, "learning_rate": 2.2689611217335885e-05, "loss": 4.766, "mean_token_accuracy": 0.6766787865757942, "num_tokens": 140420913.0, "step": 11100 }, { "epoch": 0.7958843458153333, "eval_loss": 1.183830976486206, "eval_mean_token_accuracy": 0.6771005463600158, "eval_num_tokens": 140420913.0, "eval_runtime": 57.8895, "eval_samples_per_second": 6.91, "eval_steps_per_second": 0.864, "step": 11100 }, { "epoch": 0.7994694104361231, "grad_norm": 4.745416641235352, "learning_rate": 2.2291268323773103e-05, "loss": 4.7743, "mean_token_accuracy": 0.6760394325852395, "num_tokens": 141048743.0, "step": 11150 }, { "epoch": 0.7994694104361231, "eval_loss": 1.184319257736206, "eval_mean_token_accuracy": 0.6767275559902192, "eval_num_tokens": 141048743.0, "eval_runtime": 56.6971, "eval_samples_per_second": 7.055, "eval_steps_per_second": 0.882, "step": 11150 }, { "epoch": 0.8030544750569129, "grad_norm": 5.053956985473633, "learning_rate": 2.1892925430210324e-05, "loss": 4.7635, "mean_token_accuracy": 0.6785035586357117, "num_tokens": 141678181.0, "step": 11200 }, { "epoch": 0.8030544750569129, "eval_loss": 1.184045672416687, "eval_mean_token_accuracy": 0.6763345134258271, "eval_num_tokens": 141678181.0, "eval_runtime": 57.5357, "eval_samples_per_second": 6.952, "eval_steps_per_second": 0.869, "step": 11200 }, { "epoch": 0.8066395396777027, "grad_norm": 4.613523006439209, "learning_rate": 2.149458253664755e-05, "loss": 4.7037, "mean_token_accuracy": 0.6807671126723289, "num_tokens": 142312431.0, "step": 11250 }, { "epoch": 0.8066395396777027, "eval_loss": 1.1838032007217407, "eval_mean_token_accuracy": 0.6768004512786865, "eval_num_tokens": 142312431.0, "eval_runtime": 57.4032, "eval_samples_per_second": 6.968, "eval_steps_per_second": 0.871, "step": 11250 }, { "epoch": 0.8102246042984925, "grad_norm": 4.577108860015869, "learning_rate": 2.109623964308477e-05, "loss": 4.7398, "mean_token_accuracy": 0.6783872780203819, "num_tokens": 142944119.0, "step": 11300 }, { "epoch": 0.8102246042984925, "eval_loss": 1.1838161945343018, "eval_mean_token_accuracy": 0.676618036031723, "eval_num_tokens": 142944119.0, "eval_runtime": 57.4773, "eval_samples_per_second": 6.959, "eval_steps_per_second": 0.87, "step": 11300 }, { "epoch": 0.8138096689192823, "grad_norm": 4.523055553436279, "learning_rate": 2.069789674952199e-05, "loss": 4.7537, "mean_token_accuracy": 0.6783151313662529, "num_tokens": 143575945.0, "step": 11350 }, { "epoch": 0.8138096689192823, "eval_loss": 1.183152437210083, "eval_mean_token_accuracy": 0.6768651962280273, "eval_num_tokens": 143575945.0, "eval_runtime": 57.8648, "eval_samples_per_second": 6.913, "eval_steps_per_second": 0.864, "step": 11350 }, { "epoch": 0.8173947335400721, "grad_norm": 4.623985290527344, "learning_rate": 2.029955385595921e-05, "loss": 4.6946, "mean_token_accuracy": 0.6809823432564736, "num_tokens": 144207834.0, "step": 11400 }, { "epoch": 0.8173947335400721, "eval_loss": 1.1837332248687744, "eval_mean_token_accuracy": 0.6768518340587616, "eval_num_tokens": 144207834.0, "eval_runtime": 57.9156, "eval_samples_per_second": 6.907, "eval_steps_per_second": 0.863, "step": 11400 }, { "epoch": 0.8209797981608619, "grad_norm": 4.573394298553467, "learning_rate": 1.990121096239643e-05, "loss": 4.733, "mean_token_accuracy": 0.6784341213107109, "num_tokens": 144840226.0, "step": 11450 }, { "epoch": 0.8209797981608619, "eval_loss": 1.1838306188583374, "eval_mean_token_accuracy": 0.6770703101158142, "eval_num_tokens": 144840226.0, "eval_runtime": 57.6139, "eval_samples_per_second": 6.943, "eval_steps_per_second": 0.868, "step": 11450 }, { "epoch": 0.8245648627816516, "grad_norm": 4.6763458251953125, "learning_rate": 1.9502868068833653e-05, "loss": 4.7745, "mean_token_accuracy": 0.6781399786472321, "num_tokens": 145475463.0, "step": 11500 }, { "epoch": 0.8245648627816516, "eval_loss": 1.1832276582717896, "eval_mean_token_accuracy": 0.6768820834159851, "eval_num_tokens": 145475463.0, "eval_runtime": 56.936, "eval_samples_per_second": 7.025, "eval_steps_per_second": 0.878, "step": 11500 }, { "epoch": 0.8281499274024414, "grad_norm": 4.544640064239502, "learning_rate": 1.9104525175270875e-05, "loss": 4.6995, "mean_token_accuracy": 0.6816430819034577, "num_tokens": 146110411.0, "step": 11550 }, { "epoch": 0.8281499274024414, "eval_loss": 1.1833053827285767, "eval_mean_token_accuracy": 0.6773410534858704, "eval_num_tokens": 146110411.0, "eval_runtime": 57.0468, "eval_samples_per_second": 7.012, "eval_steps_per_second": 0.876, "step": 11550 }, { "epoch": 0.8317349920232312, "grad_norm": 4.831193923950195, "learning_rate": 1.8706182281708096e-05, "loss": 4.7265, "mean_token_accuracy": 0.6784323596954346, "num_tokens": 146744949.0, "step": 11600 }, { "epoch": 0.8317349920232312, "eval_loss": 1.1832283735275269, "eval_mean_token_accuracy": 0.6767588186264039, "eval_num_tokens": 146744949.0, "eval_runtime": 57.1574, "eval_samples_per_second": 6.998, "eval_steps_per_second": 0.875, "step": 11600 }, { "epoch": 0.835320056644021, "grad_norm": 4.2518086433410645, "learning_rate": 1.8307839388145317e-05, "loss": 4.7231, "mean_token_accuracy": 0.6789010632038116, "num_tokens": 147377879.0, "step": 11650 }, { "epoch": 0.835320056644021, "eval_loss": 1.1831552982330322, "eval_mean_token_accuracy": 0.6766877925395965, "eval_num_tokens": 147377879.0, "eval_runtime": 57.2174, "eval_samples_per_second": 6.991, "eval_steps_per_second": 0.874, "step": 11650 }, { "epoch": 0.8389051212648108, "grad_norm": 4.656574726104736, "learning_rate": 1.7909496494582535e-05, "loss": 4.7387, "mean_token_accuracy": 0.6785845035314559, "num_tokens": 148008340.0, "step": 11700 }, { "epoch": 0.8389051212648108, "eval_loss": 1.1829930543899536, "eval_mean_token_accuracy": 0.6767005050182342, "eval_num_tokens": 148008340.0, "eval_runtime": 56.5488, "eval_samples_per_second": 7.074, "eval_steps_per_second": 0.884, "step": 11700 }, { "epoch": 0.8424901858856005, "grad_norm": 5.07755184173584, "learning_rate": 1.7511153601019757e-05, "loss": 4.6949, "mean_token_accuracy": 0.6815642186999321, "num_tokens": 148637370.0, "step": 11750 }, { "epoch": 0.8424901858856005, "eval_loss": 1.1835424900054932, "eval_mean_token_accuracy": 0.6766264629364014, "eval_num_tokens": 148637370.0, "eval_runtime": 56.844, "eval_samples_per_second": 7.037, "eval_steps_per_second": 0.88, "step": 11750 }, { "epoch": 0.8460752505063904, "grad_norm": 4.937259674072266, "learning_rate": 1.7112810707456982e-05, "loss": 4.7547, "mean_token_accuracy": 0.6751632392406464, "num_tokens": 149266269.0, "step": 11800 }, { "epoch": 0.8460752505063904, "eval_loss": 1.1830496788024902, "eval_mean_token_accuracy": 0.6767467558383942, "eval_num_tokens": 149266269.0, "eval_runtime": 56.5429, "eval_samples_per_second": 7.074, "eval_steps_per_second": 0.884, "step": 11800 }, { "epoch": 0.8496603151271802, "grad_norm": 4.5733795166015625, "learning_rate": 1.67144678138942e-05, "loss": 4.7398, "mean_token_accuracy": 0.6781432759761811, "num_tokens": 149898007.0, "step": 11850 }, { "epoch": 0.8496603151271802, "eval_loss": 1.1832393407821655, "eval_mean_token_accuracy": 0.6769970893859864, "eval_num_tokens": 149898007.0, "eval_runtime": 55.172, "eval_samples_per_second": 7.25, "eval_steps_per_second": 0.906, "step": 11850 }, { "epoch": 0.85324537974797, "grad_norm": 4.531384468078613, "learning_rate": 1.631612492033142e-05, "loss": 4.7097, "mean_token_accuracy": 0.6787867891788483, "num_tokens": 150529318.0, "step": 11900 }, { "epoch": 0.85324537974797, "eval_loss": 1.1830426454544067, "eval_mean_token_accuracy": 0.6767821443080902, "eval_num_tokens": 150529318.0, "eval_runtime": 55.9298, "eval_samples_per_second": 7.152, "eval_steps_per_second": 0.894, "step": 11900 }, { "epoch": 0.8568304443687598, "grad_norm": 4.669693946838379, "learning_rate": 1.5917782026768643e-05, "loss": 4.7679, "mean_token_accuracy": 0.6752150565385818, "num_tokens": 151163202.0, "step": 11950 }, { "epoch": 0.8568304443687598, "eval_loss": 1.1831849813461304, "eval_mean_token_accuracy": 0.6765953767299652, "eval_num_tokens": 151163202.0, "eval_runtime": 56.6538, "eval_samples_per_second": 7.06, "eval_steps_per_second": 0.883, "step": 11950 }, { "epoch": 0.8604155089895495, "grad_norm": 4.184320449829102, "learning_rate": 1.5519439133205864e-05, "loss": 4.713, "mean_token_accuracy": 0.6797751143574715, "num_tokens": 151798569.0, "step": 12000 }, { "epoch": 0.8604155089895495, "eval_loss": 1.182477593421936, "eval_mean_token_accuracy": 0.6768123960494995, "eval_num_tokens": 151798569.0, "eval_runtime": 56.3382, "eval_samples_per_second": 7.1, "eval_steps_per_second": 0.887, "step": 12000 }, { "epoch": 0.8640005736103393, "grad_norm": 4.763125896453857, "learning_rate": 1.5121096239643084e-05, "loss": 4.7338, "mean_token_accuracy": 0.6781762626767158, "num_tokens": 152431951.0, "step": 12050 }, { "epoch": 0.8640005736103393, "eval_loss": 1.182124137878418, "eval_mean_token_accuracy": 0.6777550578117371, "eval_num_tokens": 152431951.0, "eval_runtime": 56.6371, "eval_samples_per_second": 7.063, "eval_steps_per_second": 0.883, "step": 12050 }, { "epoch": 0.8675856382311291, "grad_norm": 4.805209159851074, "learning_rate": 1.4722753346080307e-05, "loss": 4.7565, "mean_token_accuracy": 0.6775122970342636, "num_tokens": 153063882.0, "step": 12100 }, { "epoch": 0.8675856382311291, "eval_loss": 1.1821281909942627, "eval_mean_token_accuracy": 0.6782806706428528, "eval_num_tokens": 153063882.0, "eval_runtime": 56.6386, "eval_samples_per_second": 7.062, "eval_steps_per_second": 0.883, "step": 12100 }, { "epoch": 0.8711707028519189, "grad_norm": 4.224789142608643, "learning_rate": 1.4324410452517528e-05, "loss": 4.7722, "mean_token_accuracy": 0.6754378816485405, "num_tokens": 153698583.0, "step": 12150 }, { "epoch": 0.8711707028519189, "eval_loss": 1.182055950164795, "eval_mean_token_accuracy": 0.6773766386508941, "eval_num_tokens": 153698583.0, "eval_runtime": 56.2278, "eval_samples_per_second": 7.114, "eval_steps_per_second": 0.889, "step": 12150 }, { "epoch": 0.8747557674727087, "grad_norm": 4.622290134429932, "learning_rate": 1.392606755895475e-05, "loss": 4.7395, "mean_token_accuracy": 0.678723790049553, "num_tokens": 154330119.0, "step": 12200 }, { "epoch": 0.8747557674727087, "eval_loss": 1.1820727586746216, "eval_mean_token_accuracy": 0.6769333493709564, "eval_num_tokens": 154330119.0, "eval_runtime": 56.605, "eval_samples_per_second": 7.067, "eval_steps_per_second": 0.883, "step": 12200 }, { "epoch": 0.8783408320934984, "grad_norm": 4.508255481719971, "learning_rate": 1.352772466539197e-05, "loss": 4.6912, "mean_token_accuracy": 0.6804595556855202, "num_tokens": 154963205.0, "step": 12250 }, { "epoch": 0.8783408320934984, "eval_loss": 1.1818066835403442, "eval_mean_token_accuracy": 0.6772564661502838, "eval_num_tokens": 154963205.0, "eval_runtime": 55.5536, "eval_samples_per_second": 7.2, "eval_steps_per_second": 0.9, "step": 12250 }, { "epoch": 0.8819258967142882, "grad_norm": 4.340250492095947, "learning_rate": 1.3129381771829191e-05, "loss": 4.7228, "mean_token_accuracy": 0.6781974649429321, "num_tokens": 155595688.0, "step": 12300 }, { "epoch": 0.8819258967142882, "eval_loss": 1.1819037199020386, "eval_mean_token_accuracy": 0.677418692111969, "eval_num_tokens": 155595688.0, "eval_runtime": 55.5311, "eval_samples_per_second": 7.203, "eval_steps_per_second": 0.9, "step": 12300 }, { "epoch": 0.8855109613350781, "grad_norm": 4.752552032470703, "learning_rate": 1.2731038878266413e-05, "loss": 4.7694, "mean_token_accuracy": 0.6762826785445213, "num_tokens": 156228825.0, "step": 12350 }, { "epoch": 0.8855109613350781, "eval_loss": 1.1815353631973267, "eval_mean_token_accuracy": 0.6776353216171265, "eval_num_tokens": 156228825.0, "eval_runtime": 55.7037, "eval_samples_per_second": 7.181, "eval_steps_per_second": 0.898, "step": 12350 }, { "epoch": 0.8890960259558679, "grad_norm": 4.746983528137207, "learning_rate": 1.2332695984703634e-05, "loss": 4.7539, "mean_token_accuracy": 0.6774281883239746, "num_tokens": 156864867.0, "step": 12400 }, { "epoch": 0.8890960259558679, "eval_loss": 1.1814591884613037, "eval_mean_token_accuracy": 0.6772267067432404, "eval_num_tokens": 156864867.0, "eval_runtime": 55.2801, "eval_samples_per_second": 7.236, "eval_steps_per_second": 0.904, "step": 12400 }, { "epoch": 0.8926810905766577, "grad_norm": 4.964954376220703, "learning_rate": 1.1934353091140854e-05, "loss": 4.652, "mean_token_accuracy": 0.6832978922128677, "num_tokens": 157497835.0, "step": 12450 }, { "epoch": 0.8926810905766577, "eval_loss": 1.1815037727355957, "eval_mean_token_accuracy": 0.6775709521770478, "eval_num_tokens": 157497835.0, "eval_runtime": 55.4264, "eval_samples_per_second": 7.217, "eval_steps_per_second": 0.902, "step": 12450 }, { "epoch": 0.8962661551974475, "grad_norm": 4.32532262802124, "learning_rate": 1.1536010197578075e-05, "loss": 4.6811, "mean_token_accuracy": 0.6813494926691055, "num_tokens": 158131957.0, "step": 12500 }, { "epoch": 0.8962661551974475, "eval_loss": 1.181230902671814, "eval_mean_token_accuracy": 0.6782212293148041, "eval_num_tokens": 158131957.0, "eval_runtime": 55.638, "eval_samples_per_second": 7.189, "eval_steps_per_second": 0.899, "step": 12500 }, { "epoch": 0.8998512198182372, "grad_norm": 4.772362232208252, "learning_rate": 1.1137667304015297e-05, "loss": 4.6799, "mean_token_accuracy": 0.6808639001846314, "num_tokens": 158765520.0, "step": 12550 }, { "epoch": 0.8998512198182372, "eval_loss": 1.18119215965271, "eval_mean_token_accuracy": 0.6776848089694977, "eval_num_tokens": 158765520.0, "eval_runtime": 56.5407, "eval_samples_per_second": 7.075, "eval_steps_per_second": 0.884, "step": 12550 }, { "epoch": 0.903436284439027, "grad_norm": 4.406890869140625, "learning_rate": 1.0739324410452518e-05, "loss": 4.6572, "mean_token_accuracy": 0.6824618262052536, "num_tokens": 159396320.0, "step": 12600 }, { "epoch": 0.903436284439027, "eval_loss": 1.1813263893127441, "eval_mean_token_accuracy": 0.677431755065918, "eval_num_tokens": 159396320.0, "eval_runtime": 56.1014, "eval_samples_per_second": 7.13, "eval_steps_per_second": 0.891, "step": 12600 }, { "epoch": 0.9070213490598168, "grad_norm": 4.6225786209106445, "learning_rate": 1.034098151688974e-05, "loss": 4.7346, "mean_token_accuracy": 0.6780241671204567, "num_tokens": 160028813.0, "step": 12650 }, { "epoch": 0.9070213490598168, "eval_loss": 1.18108332157135, "eval_mean_token_accuracy": 0.6777166557312012, "eval_num_tokens": 160028813.0, "eval_runtime": 55.9817, "eval_samples_per_second": 7.145, "eval_steps_per_second": 0.893, "step": 12650 }, { "epoch": 0.9106064136806066, "grad_norm": 5.096744537353516, "learning_rate": 9.942638623326961e-06, "loss": 4.7072, "mean_token_accuracy": 0.6779900795221329, "num_tokens": 160660115.0, "step": 12700 }, { "epoch": 0.9106064136806066, "eval_loss": 1.1813737154006958, "eval_mean_token_accuracy": 0.6772573125362397, "eval_num_tokens": 160660115.0, "eval_runtime": 56.7004, "eval_samples_per_second": 7.055, "eval_steps_per_second": 0.882, "step": 12700 }, { "epoch": 0.9141914783013964, "grad_norm": 4.954991817474365, "learning_rate": 9.54429572976418e-06, "loss": 4.7609, "mean_token_accuracy": 0.6763640037178993, "num_tokens": 161295929.0, "step": 12750 }, { "epoch": 0.9141914783013964, "eval_loss": 1.1813360452651978, "eval_mean_token_accuracy": 0.6774272322654724, "eval_num_tokens": 161295929.0, "eval_runtime": 55.8406, "eval_samples_per_second": 7.163, "eval_steps_per_second": 0.895, "step": 12750 }, { "epoch": 0.9177765429221861, "grad_norm": 4.923917770385742, "learning_rate": 9.145952836201404e-06, "loss": 4.8154, "mean_token_accuracy": 0.6726123803853988, "num_tokens": 161927138.0, "step": 12800 }, { "epoch": 0.9177765429221861, "eval_loss": 1.1811388731002808, "eval_mean_token_accuracy": 0.6773995268344879, "eval_num_tokens": 161927138.0, "eval_runtime": 55.8065, "eval_samples_per_second": 7.168, "eval_steps_per_second": 0.896, "step": 12800 }, { "epoch": 0.9213616075429759, "grad_norm": 4.706872463226318, "learning_rate": 8.747609942638624e-06, "loss": 4.7658, "mean_token_accuracy": 0.6769201335310936, "num_tokens": 162553898.0, "step": 12850 }, { "epoch": 0.9213616075429759, "eval_loss": 1.1811527013778687, "eval_mean_token_accuracy": 0.6772512257099151, "eval_num_tokens": 162553898.0, "eval_runtime": 55.933, "eval_samples_per_second": 7.151, "eval_steps_per_second": 0.894, "step": 12850 }, { "epoch": 0.9249466721637658, "grad_norm": 4.29292106628418, "learning_rate": 8.349267049075845e-06, "loss": 4.7529, "mean_token_accuracy": 0.678237376511097, "num_tokens": 163187273.0, "step": 12900 }, { "epoch": 0.9249466721637658, "eval_loss": 1.1813446283340454, "eval_mean_token_accuracy": 0.6770773160457612, "eval_num_tokens": 163187273.0, "eval_runtime": 55.9165, "eval_samples_per_second": 7.154, "eval_steps_per_second": 0.894, "step": 12900 }, { "epoch": 0.9285317367845556, "grad_norm": 4.577188968658447, "learning_rate": 7.950924155513067e-06, "loss": 4.7151, "mean_token_accuracy": 0.6785858425498009, "num_tokens": 163816024.0, "step": 12950 }, { "epoch": 0.9285317367845556, "eval_loss": 1.181489109992981, "eval_mean_token_accuracy": 0.6775988221168519, "eval_num_tokens": 163816024.0, "eval_runtime": 55.8424, "eval_samples_per_second": 7.163, "eval_steps_per_second": 0.895, "step": 12950 }, { "epoch": 0.9321168014053454, "grad_norm": 4.593563556671143, "learning_rate": 7.552581261950287e-06, "loss": 4.7085, "mean_token_accuracy": 0.6803634178638458, "num_tokens": 164445021.0, "step": 13000 }, { "epoch": 0.9321168014053454, "eval_loss": 1.1816061735153198, "eval_mean_token_accuracy": 0.6776184713840485, "eval_num_tokens": 164445021.0, "eval_runtime": 55.8489, "eval_samples_per_second": 7.162, "eval_steps_per_second": 0.895, "step": 13000 }, { "epoch": 0.9357018660261351, "grad_norm": 4.309682846069336, "learning_rate": 7.1542383683875086e-06, "loss": 4.706, "mean_token_accuracy": 0.6809570705890655, "num_tokens": 165073944.0, "step": 13050 }, { "epoch": 0.9357018660261351, "eval_loss": 1.181320071220398, "eval_mean_token_accuracy": 0.6780745506286621, "eval_num_tokens": 165073944.0, "eval_runtime": 55.6478, "eval_samples_per_second": 7.188, "eval_steps_per_second": 0.899, "step": 13050 }, { "epoch": 0.9392869306469249, "grad_norm": 4.485646724700928, "learning_rate": 6.755895474824729e-06, "loss": 4.7568, "mean_token_accuracy": 0.677568726837635, "num_tokens": 165708351.0, "step": 13100 }, { "epoch": 0.9392869306469249, "eval_loss": 1.1809498071670532, "eval_mean_token_accuracy": 0.677814108133316, "eval_num_tokens": 165708351.0, "eval_runtime": 55.8268, "eval_samples_per_second": 7.165, "eval_steps_per_second": 0.896, "step": 13100 }, { "epoch": 0.9428719952677147, "grad_norm": 4.374978065490723, "learning_rate": 6.357552581261951e-06, "loss": 4.7056, "mean_token_accuracy": 0.6803146860003472, "num_tokens": 166342580.0, "step": 13150 }, { "epoch": 0.9428719952677147, "eval_loss": 1.1812047958374023, "eval_mean_token_accuracy": 0.6775732636451721, "eval_num_tokens": 166342580.0, "eval_runtime": 56.8808, "eval_samples_per_second": 7.032, "eval_steps_per_second": 0.879, "step": 13150 }, { "epoch": 0.9464570598885045, "grad_norm": 4.719696044921875, "learning_rate": 5.959209687699171e-06, "loss": 4.6996, "mean_token_accuracy": 0.6807303726673126, "num_tokens": 166972337.0, "step": 13200 }, { "epoch": 0.9464570598885045, "eval_loss": 1.1809191703796387, "eval_mean_token_accuracy": 0.6775369119644165, "eval_num_tokens": 166972337.0, "eval_runtime": 55.3898, "eval_samples_per_second": 7.222, "eval_steps_per_second": 0.903, "step": 13200 }, { "epoch": 0.9500421245092943, "grad_norm": 4.4557905197143555, "learning_rate": 5.560866794136393e-06, "loss": 4.6934, "mean_token_accuracy": 0.6803115239739418, "num_tokens": 167608984.0, "step": 13250 }, { "epoch": 0.9500421245092943, "eval_loss": 1.1811048984527588, "eval_mean_token_accuracy": 0.6776836955547333, "eval_num_tokens": 167608984.0, "eval_runtime": 56.7029, "eval_samples_per_second": 7.054, "eval_steps_per_second": 0.882, "step": 13250 }, { "epoch": 0.953627189130084, "grad_norm": 4.890408515930176, "learning_rate": 5.162523900573614e-06, "loss": 4.7407, "mean_token_accuracy": 0.6763252380490303, "num_tokens": 168240445.0, "step": 13300 }, { "epoch": 0.953627189130084, "eval_loss": 1.1808910369873047, "eval_mean_token_accuracy": 0.6770796060562134, "eval_num_tokens": 168240445.0, "eval_runtime": 56.5546, "eval_samples_per_second": 7.073, "eval_steps_per_second": 0.884, "step": 13300 }, { "epoch": 0.9572122537508738, "grad_norm": 5.145451545715332, "learning_rate": 4.7641810070108355e-06, "loss": 4.7417, "mean_token_accuracy": 0.6776255601644516, "num_tokens": 168871775.0, "step": 13350 }, { "epoch": 0.9572122537508738, "eval_loss": 1.1808017492294312, "eval_mean_token_accuracy": 0.6777704417705536, "eval_num_tokens": 168871775.0, "eval_runtime": 56.3719, "eval_samples_per_second": 7.096, "eval_steps_per_second": 0.887, "step": 13350 }, { "epoch": 0.9607973183716636, "grad_norm": 4.5986104011535645, "learning_rate": 4.365838113448056e-06, "loss": 4.742, "mean_token_accuracy": 0.678845791220665, "num_tokens": 169504277.0, "step": 13400 }, { "epoch": 0.9607973183716636, "eval_loss": 1.1809656620025635, "eval_mean_token_accuracy": 0.6775072228908539, "eval_num_tokens": 169504277.0, "eval_runtime": 55.886, "eval_samples_per_second": 7.157, "eval_steps_per_second": 0.895, "step": 13400 }, { "epoch": 0.9643823829924535, "grad_norm": 4.603204250335693, "learning_rate": 3.967495219885278e-06, "loss": 4.7013, "mean_token_accuracy": 0.6800284919142723, "num_tokens": 170139597.0, "step": 13450 }, { "epoch": 0.9643823829924535, "eval_loss": 1.1810020208358765, "eval_mean_token_accuracy": 0.6775026059150696, "eval_num_tokens": 170139597.0, "eval_runtime": 55.5754, "eval_samples_per_second": 7.197, "eval_steps_per_second": 0.9, "step": 13450 }, { "epoch": 0.9679674476132433, "grad_norm": 4.541078567504883, "learning_rate": 3.5691523263224986e-06, "loss": 4.6893, "mean_token_accuracy": 0.6810142487287522, "num_tokens": 170769337.0, "step": 13500 }, { "epoch": 0.9679674476132433, "eval_loss": 1.1807525157928467, "eval_mean_token_accuracy": 0.6778879475593567, "eval_num_tokens": 170769337.0, "eval_runtime": 55.7448, "eval_samples_per_second": 7.176, "eval_steps_per_second": 0.897, "step": 13500 }, { "epoch": 0.971552512234033, "grad_norm": 4.519087314605713, "learning_rate": 3.17080943275972e-06, "loss": 4.772, "mean_token_accuracy": 0.6758325353264809, "num_tokens": 171402451.0, "step": 13550 }, { "epoch": 0.971552512234033, "eval_loss": 1.180974006652832, "eval_mean_token_accuracy": 0.6771935153007508, "eval_num_tokens": 171402451.0, "eval_runtime": 55.9385, "eval_samples_per_second": 7.151, "eval_steps_per_second": 0.894, "step": 13550 }, { "epoch": 0.9751375768548228, "grad_norm": 4.388876914978027, "learning_rate": 2.772466539196941e-06, "loss": 4.7878, "mean_token_accuracy": 0.6759749925136567, "num_tokens": 172037593.0, "step": 13600 }, { "epoch": 0.9751375768548228, "eval_loss": 1.180649995803833, "eval_mean_token_accuracy": 0.6776048111915588, "eval_num_tokens": 172037593.0, "eval_runtime": 55.7077, "eval_samples_per_second": 7.18, "eval_steps_per_second": 0.898, "step": 13600 }, { "epoch": 0.9787226414756126, "grad_norm": 4.676353931427002, "learning_rate": 2.374123645634162e-06, "loss": 4.7004, "mean_token_accuracy": 0.6811859339475632, "num_tokens": 172671223.0, "step": 13650 }, { "epoch": 0.9787226414756126, "eval_loss": 1.1809673309326172, "eval_mean_token_accuracy": 0.6772890436649323, "eval_num_tokens": 172671223.0, "eval_runtime": 55.9611, "eval_samples_per_second": 7.148, "eval_steps_per_second": 0.893, "step": 13650 }, { "epoch": 0.9823077060964024, "grad_norm": 4.678284645080566, "learning_rate": 1.975780752071383e-06, "loss": 4.7903, "mean_token_accuracy": 0.676692801117897, "num_tokens": 173302207.0, "step": 13700 }, { "epoch": 0.9823077060964024, "eval_loss": 1.1809345483779907, "eval_mean_token_accuracy": 0.6775369548797607, "eval_num_tokens": 173302207.0, "eval_runtime": 55.9406, "eval_samples_per_second": 7.15, "eval_steps_per_second": 0.894, "step": 13700 }, { "epoch": 0.9858927707171922, "grad_norm": 4.409168243408203, "learning_rate": 1.5774378585086041e-06, "loss": 4.6684, "mean_token_accuracy": 0.6816425076127053, "num_tokens": 173935786.0, "step": 13750 }, { "epoch": 0.9858927707171922, "eval_loss": 1.1808542013168335, "eval_mean_token_accuracy": 0.6773917138576507, "eval_num_tokens": 173935786.0, "eval_runtime": 55.8468, "eval_samples_per_second": 7.162, "eval_steps_per_second": 0.895, "step": 13750 }, { "epoch": 0.989477835337982, "grad_norm": 4.55557107925415, "learning_rate": 1.1790949649458254e-06, "loss": 4.7765, "mean_token_accuracy": 0.6755864906311035, "num_tokens": 174569465.0, "step": 13800 }, { "epoch": 0.989477835337982, "eval_loss": 1.18069589138031, "eval_mean_token_accuracy": 0.6778388035297394, "eval_num_tokens": 174569465.0, "eval_runtime": 55.6007, "eval_samples_per_second": 7.194, "eval_steps_per_second": 0.899, "step": 13800 }, { "epoch": 0.9930628999587717, "grad_norm": 4.513246536254883, "learning_rate": 7.807520713830466e-07, "loss": 4.7511, "mean_token_accuracy": 0.6769631016254425, "num_tokens": 175203827.0, "step": 13850 }, { "epoch": 0.9930628999587717, "eval_loss": 1.1805609464645386, "eval_mean_token_accuracy": 0.6776606893539429, "eval_num_tokens": 175203827.0, "eval_runtime": 55.7815, "eval_samples_per_second": 7.171, "eval_steps_per_second": 0.896, "step": 13850 }, { "epoch": 0.9966479645795615, "grad_norm": 4.5654497146606445, "learning_rate": 3.824091778202677e-07, "loss": 4.7423, "mean_token_accuracy": 0.6774308422207832, "num_tokens": 175836938.0, "step": 13900 }, { "epoch": 0.9966479645795615, "eval_loss": 1.1805065870285034, "eval_mean_token_accuracy": 0.6777116668224334, "eval_num_tokens": 175836938.0, "eval_runtime": 55.8266, "eval_samples_per_second": 7.165, "eval_steps_per_second": 0.896, "step": 13900 }, { "epoch": 1.0, "mean_token_accuracy": 0.677863019991686, "num_tokens": 176425692.0, "step": 13947, "total_flos": 5.688174665250246e+18, "train_loss": 4.990768942446951, "train_runtime": 203046.5495, "train_samples_per_second": 2.198, "train_steps_per_second": 0.069 } ], "logging_steps": 50, "max_steps": 13947, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.688174665250246e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }