| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 939, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.471254324913025, |
| "epoch": 0.03194888178913738, |
| "grad_norm": 8.043270111083984, |
| "learning_rate": 1.9808306709265177e-05, |
| "loss": 1.1636, |
| "mean_token_accuracy": 0.6890625, |
| "num_tokens": 12480.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.90457843542099, |
| "epoch": 0.06389776357827476, |
| "grad_norm": 10.676708221435547, |
| "learning_rate": 1.959531416400426e-05, |
| "loss": 0.3864, |
| "mean_token_accuracy": 0.8375, |
| "num_tokens": 24960.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.027526319026947, |
| "epoch": 0.09584664536741214, |
| "grad_norm": 5.594727039337158, |
| "learning_rate": 1.9382321618743344e-05, |
| "loss": 0.3759, |
| "mean_token_accuracy": 0.83671875, |
| "num_tokens": 37440.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.192073893547058, |
| "epoch": 0.12779552715654952, |
| "grad_norm": 3.208804130554199, |
| "learning_rate": 1.916932907348243e-05, |
| "loss": 0.3341, |
| "mean_token_accuracy": 0.85703125, |
| "num_tokens": 49920.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.1430011987686157, |
| "epoch": 0.1597444089456869, |
| "grad_norm": 19.184051513671875, |
| "learning_rate": 1.895633652822151e-05, |
| "loss": 0.3429, |
| "mean_token_accuracy": 0.85078125, |
| "num_tokens": 62400.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.1257157444953918, |
| "epoch": 0.19169329073482427, |
| "grad_norm": 5.893524646759033, |
| "learning_rate": 1.87433439829606e-05, |
| "loss": 0.2334, |
| "mean_token_accuracy": 0.8875, |
| "num_tokens": 74880.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.992573595046997, |
| "epoch": 0.22364217252396165, |
| "grad_norm": 15.351304054260254, |
| "learning_rate": 1.8530351437699682e-05, |
| "loss": 0.1187, |
| "mean_token_accuracy": 0.96328125, |
| "num_tokens": 87360.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.805773138999939, |
| "epoch": 0.25559105431309903, |
| "grad_norm": 74.02106475830078, |
| "learning_rate": 1.8317358892438765e-05, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.93125, |
| "num_tokens": 99840.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8376959323883056, |
| "epoch": 0.28753993610223644, |
| "grad_norm": 9.446106910705566, |
| "learning_rate": 1.8104366347177852e-05, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.96796875, |
| "num_tokens": 112320.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.6883749544620514, |
| "epoch": 0.3194888178913738, |
| "grad_norm": 29.95865249633789, |
| "learning_rate": 1.7891373801916932e-05, |
| "loss": 0.0712, |
| "mean_token_accuracy": 0.9671875, |
| "num_tokens": 124800.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.5861309468746185, |
| "epoch": 0.3514376996805112, |
| "grad_norm": 0.981063723564148, |
| "learning_rate": 1.767838125665602e-05, |
| "loss": 0.0339, |
| "mean_token_accuracy": 0.9890625, |
| "num_tokens": 137280.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.6167496562004089, |
| "epoch": 0.38338658146964855, |
| "grad_norm": 0.3446030020713806, |
| "learning_rate": 1.7465388711395103e-05, |
| "loss": 0.019, |
| "mean_token_accuracy": 0.9953125, |
| "num_tokens": 149760.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.6116879105567932, |
| "epoch": 0.41533546325878595, |
| "grad_norm": 7.9384846687316895, |
| "learning_rate": 1.7252396166134186e-05, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.99453125, |
| "num_tokens": 162240.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.5835295200347901, |
| "epoch": 0.4472843450479233, |
| "grad_norm": 15.288229942321777, |
| "learning_rate": 1.7039403620873273e-05, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.99375, |
| "num_tokens": 174720.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.5895743370056152, |
| "epoch": 0.4792332268370607, |
| "grad_norm": 8.906089782714844, |
| "learning_rate": 1.6826411075612353e-05, |
| "loss": 0.0277, |
| "mean_token_accuracy": 0.9953125, |
| "num_tokens": 187200.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.6350247144699097, |
| "epoch": 0.5111821086261981, |
| "grad_norm": 12.155186653137207, |
| "learning_rate": 1.661341853035144e-05, |
| "loss": 0.009, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 199680.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.6250557661056518, |
| "epoch": 0.5431309904153354, |
| "grad_norm": 1.7694993019104004, |
| "learning_rate": 1.6400425985090524e-05, |
| "loss": 0.0297, |
| "mean_token_accuracy": 0.9890625, |
| "num_tokens": 212160.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.6015866935253144, |
| "epoch": 0.5750798722044729, |
| "grad_norm": 24.392311096191406, |
| "learning_rate": 1.6187433439829607e-05, |
| "loss": 0.0199, |
| "mean_token_accuracy": 0.9921875, |
| "num_tokens": 224640.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.6133609235286712, |
| "epoch": 0.6070287539936102, |
| "grad_norm": 0.03015461377799511, |
| "learning_rate": 1.5974440894568694e-05, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.996875, |
| "num_tokens": 237120.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.6456725597381592, |
| "epoch": 0.6389776357827476, |
| "grad_norm": 18.586185455322266, |
| "learning_rate": 1.5761448349307774e-05, |
| "loss": 0.0133, |
| "mean_token_accuracy": 0.99609375, |
| "num_tokens": 249600.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.6170299232006073, |
| "epoch": 0.670926517571885, |
| "grad_norm": 49.949588775634766, |
| "learning_rate": 1.554845580404686e-05, |
| "loss": 0.0231, |
| "mean_token_accuracy": 0.9921875, |
| "num_tokens": 262080.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.5970291554927826, |
| "epoch": 0.7028753993610224, |
| "grad_norm": 0.9214933514595032, |
| "learning_rate": 1.5335463258785944e-05, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.990625, |
| "num_tokens": 274560.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.580757600069046, |
| "epoch": 0.7348242811501597, |
| "grad_norm": 8.092296600341797, |
| "learning_rate": 1.5122470713525028e-05, |
| "loss": 0.0349, |
| "mean_token_accuracy": 0.98984375, |
| "num_tokens": 287040.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.5787507772445679, |
| "epoch": 0.7667731629392971, |
| "grad_norm": 10.055787086486816, |
| "learning_rate": 1.4909478168264111e-05, |
| "loss": 0.0065, |
| "mean_token_accuracy": 0.99609375, |
| "num_tokens": 299520.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.5758024156093597, |
| "epoch": 0.7987220447284346, |
| "grad_norm": 61.38268280029297, |
| "learning_rate": 1.4696485623003197e-05, |
| "loss": 0.0424, |
| "mean_token_accuracy": 0.9875, |
| "num_tokens": 312000.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.6395232379436493, |
| "epoch": 0.8306709265175719, |
| "grad_norm": 2.0960898399353027, |
| "learning_rate": 1.4483493077742282e-05, |
| "loss": 0.1762, |
| "mean_token_accuracy": 0.95, |
| "num_tokens": 324480.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.6194823384284973, |
| "epoch": 0.8626198083067093, |
| "grad_norm": 2.2915937900543213, |
| "learning_rate": 1.4270500532481364e-05, |
| "loss": 0.0054, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 336960.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.553073239326477, |
| "epoch": 0.8945686900958466, |
| "grad_norm": 0.6241616606712341, |
| "learning_rate": 1.4057507987220449e-05, |
| "loss": 0.0121, |
| "mean_token_accuracy": 0.99765625, |
| "num_tokens": 349440.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.588155323266983, |
| "epoch": 0.9265175718849841, |
| "grad_norm": 0.8865500688552856, |
| "learning_rate": 1.3844515441959532e-05, |
| "loss": 0.0044, |
| "mean_token_accuracy": 0.99765625, |
| "num_tokens": 361920.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.6138588011264801, |
| "epoch": 0.9584664536741214, |
| "grad_norm": 0.16805018484592438, |
| "learning_rate": 1.3631522896698617e-05, |
| "loss": 0.001, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 374400.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.6157549917697906, |
| "epoch": 0.9904153354632588, |
| "grad_norm": 11.855587005615234, |
| "learning_rate": 1.3418530351437703e-05, |
| "loss": 0.0051, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 386880.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 0.5923710940759394, |
| "eval_loss": 0.01570574752986431, |
| "eval_mean_token_accuracy": 0.995253164556962, |
| "eval_num_tokens": 389844.0, |
| "eval_runtime": 13.2845, |
| "eval_samples_per_second": 188.189, |
| "eval_steps_per_second": 5.947, |
| "step": 313 |
| }, |
| { |
| "entropy": 0.5902234852313996, |
| "epoch": 1.0223642172523961, |
| "grad_norm": 0.026217741891741753, |
| "learning_rate": 1.3205537806176784e-05, |
| "loss": 0.0056, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 398580.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.562452882528305, |
| "epoch": 1.0543130990415335, |
| "grad_norm": 22.588623046875, |
| "learning_rate": 1.299254526091587e-05, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.9921875, |
| "num_tokens": 411060.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.553607851266861, |
| "epoch": 1.0862619808306708, |
| "grad_norm": 2.0158348083496094, |
| "learning_rate": 1.2779552715654953e-05, |
| "loss": 0.0202, |
| "mean_token_accuracy": 0.99453125, |
| "num_tokens": 423540.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.602299690246582, |
| "epoch": 1.1182108626198084, |
| "grad_norm": 0.03288736939430237, |
| "learning_rate": 1.2566560170394038e-05, |
| "loss": 0.0265, |
| "mean_token_accuracy": 0.9921875, |
| "num_tokens": 436020.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.5916118025779724, |
| "epoch": 1.1501597444089458, |
| "grad_norm": 0.2714002728462219, |
| "learning_rate": 1.235356762513312e-05, |
| "loss": 0.0049, |
| "mean_token_accuracy": 0.996875, |
| "num_tokens": 448500.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.6014433860778808, |
| "epoch": 1.182108626198083, |
| "grad_norm": 0.8565823435783386, |
| "learning_rate": 1.2140575079872205e-05, |
| "loss": 0.0029, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 460980.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.558870005607605, |
| "epoch": 1.2140575079872205, |
| "grad_norm": 0.4954104721546173, |
| "learning_rate": 1.192758253461129e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 473460.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.5449843347072602, |
| "epoch": 1.2460063897763578, |
| "grad_norm": 0.0184471495449543, |
| "learning_rate": 1.1714589989350374e-05, |
| "loss": 0.0012, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 485940.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.5302130222320557, |
| "epoch": 1.2779552715654952, |
| "grad_norm": 0.5405293107032776, |
| "learning_rate": 1.1501597444089459e-05, |
| "loss": 0.0021, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 498420.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.5257469773292541, |
| "epoch": 1.3099041533546325, |
| "grad_norm": 14.052752494812012, |
| "learning_rate": 1.1288604898828541e-05, |
| "loss": 0.0054, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 510900.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.5204551070928574, |
| "epoch": 1.34185303514377, |
| "grad_norm": 0.041872043162584305, |
| "learning_rate": 1.1075612353567626e-05, |
| "loss": 0.0028, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 523380.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.5394512295722962, |
| "epoch": 1.3738019169329074, |
| "grad_norm": 0.06288646906614304, |
| "learning_rate": 1.086261980830671e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 535860.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.5291238784790039, |
| "epoch": 1.4057507987220448, |
| "grad_norm": 0.0030913001392036676, |
| "learning_rate": 1.0649627263045795e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 548340.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.5285849571228027, |
| "epoch": 1.4376996805111821, |
| "grad_norm": 1.8844810724258423, |
| "learning_rate": 1.043663471778488e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 560820.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.5385882794857025, |
| "epoch": 1.4696485623003195, |
| "grad_norm": 0.11690080910921097, |
| "learning_rate": 1.0223642172523962e-05, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 573300.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.5441839516162872, |
| "epoch": 1.5015974440894568, |
| "grad_norm": 0.0011391988955438137, |
| "learning_rate": 1.0010649627263047e-05, |
| "loss": 0.0007, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 585780.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.5367125928401947, |
| "epoch": 1.5335463258785942, |
| "grad_norm": 0.595458984375, |
| "learning_rate": 9.79765708200213e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 598260.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.5380379557609558, |
| "epoch": 1.5654952076677318, |
| "grad_norm": 0.0110127292573452, |
| "learning_rate": 9.584664536741216e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 610740.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.5656424820423126, |
| "epoch": 1.5974440894568689, |
| "grad_norm": 0.018918879330158234, |
| "learning_rate": 9.3716719914803e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 623220.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.5534205734729767, |
| "epoch": 1.6293929712460065, |
| "grad_norm": 0.0005970252677798271, |
| "learning_rate": 9.158679446219383e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 635700.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.5591952800750732, |
| "epoch": 1.6613418530351438, |
| "grad_norm": 0.23496565222740173, |
| "learning_rate": 8.945686900958466e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 648180.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.5553164839744568, |
| "epoch": 1.6932907348242812, |
| "grad_norm": 0.015620424412190914, |
| "learning_rate": 8.732694355697551e-06, |
| "loss": 0.0006, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 660660.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.5558278143405915, |
| "epoch": 1.7252396166134185, |
| "grad_norm": 0.013437892310321331, |
| "learning_rate": 8.519701810436637e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 673140.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.5494430124759674, |
| "epoch": 1.7571884984025559, |
| "grad_norm": 0.05179116502404213, |
| "learning_rate": 8.30670926517572e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 685620.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.5591476142406464, |
| "epoch": 1.7891373801916934, |
| "grad_norm": 0.001572166453115642, |
| "learning_rate": 8.093716719914804e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 698100.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.5543432533740997, |
| "epoch": 1.8210862619808306, |
| "grad_norm": 0.0029468077700585127, |
| "learning_rate": 7.880724174653887e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 710580.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.5561375498771668, |
| "epoch": 1.8530351437699681, |
| "grad_norm": 7.772324897814542e-05, |
| "learning_rate": 7.667731629392972e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 723060.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.554932814836502, |
| "epoch": 1.8849840255591053, |
| "grad_norm": 0.023860394954681396, |
| "learning_rate": 7.454739084132056e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 735540.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.5592103660106659, |
| "epoch": 1.9169329073482428, |
| "grad_norm": 6.846313772257417e-05, |
| "learning_rate": 7.241746538871141e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 748020.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.5566479444503785, |
| "epoch": 1.9488817891373802, |
| "grad_norm": 0.00017782168288249522, |
| "learning_rate": 7.028753993610224e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 760500.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.560238641500473, |
| "epoch": 1.9808306709265175, |
| "grad_norm": 0.0009979789610952139, |
| "learning_rate": 6.815761448349309e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 772980.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_entropy": 0.5581134014491793, |
| "eval_loss": 6.910775482538156e-06, |
| "eval_mean_token_accuracy": 1.0, |
| "eval_num_tokens": 779688.0, |
| "eval_runtime": 13.4056, |
| "eval_samples_per_second": 186.489, |
| "eval_steps_per_second": 5.893, |
| "step": 626 |
| }, |
| { |
| "entropy": 0.5542679131031036, |
| "epoch": 2.012779552715655, |
| "grad_norm": 0.0001906445249915123, |
| "learning_rate": 6.602768903088392e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 784680.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.5601355612277985, |
| "epoch": 2.0447284345047922, |
| "grad_norm": 8.866995631251484e-05, |
| "learning_rate": 6.3897763578274765e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 797160.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.5562943339347839, |
| "epoch": 2.07667731629393, |
| "grad_norm": 4.927597183268517e-05, |
| "learning_rate": 6.17678381256656e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 809640.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.5592762529850006, |
| "epoch": 2.108626198083067, |
| "grad_norm": 0.0003652777522802353, |
| "learning_rate": 5.963791267305645e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 822120.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.560578465461731, |
| "epoch": 2.1405750798722045, |
| "grad_norm": 0.0005100357229821384, |
| "learning_rate": 5.7507987220447296e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 834600.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.5573894202709198, |
| "epoch": 2.1725239616613417, |
| "grad_norm": 0.0007649549515917897, |
| "learning_rate": 5.537806176783813e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 847080.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.5623638391494751, |
| "epoch": 2.2044728434504792, |
| "grad_norm": 0.007040001451969147, |
| "learning_rate": 5.324813631522897e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 859560.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.5608678042888642, |
| "epoch": 2.236421725239617, |
| "grad_norm": 0.0008389271097257733, |
| "learning_rate": 5.111821086261981e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 872040.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.562554806470871, |
| "epoch": 2.268370607028754, |
| "grad_norm": 0.0008370543946512043, |
| "learning_rate": 4.898828541001065e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 884520.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.5613407075405121, |
| "epoch": 2.3003194888178915, |
| "grad_norm": 3.100551475654356e-05, |
| "learning_rate": 4.68583599574015e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 897000.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.5588717699050904, |
| "epoch": 2.3322683706070286, |
| "grad_norm": 0.0035649905912578106, |
| "learning_rate": 4.472843450479233e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 909480.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.5609096884727478, |
| "epoch": 2.364217252396166, |
| "grad_norm": 0.0003579799085855484, |
| "learning_rate": 4.259850905218318e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 921960.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.5581632852554321, |
| "epoch": 2.3961661341853033, |
| "grad_norm": 0.00018412918143440038, |
| "learning_rate": 4.046858359957402e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 934440.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.5592087864875793, |
| "epoch": 2.428115015974441, |
| "grad_norm": 0.001302594318985939, |
| "learning_rate": 3.833865814696486e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 946920.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.5602552175521851, |
| "epoch": 2.460063897763578, |
| "grad_norm": 0.0001967909629456699, |
| "learning_rate": 3.6208732694355704e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 959400.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.5587169051170349, |
| "epoch": 2.4920127795527156, |
| "grad_norm": 3.6476201785262674e-05, |
| "learning_rate": 3.4078807241746544e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 971880.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.5619259059429169, |
| "epoch": 2.523961661341853, |
| "grad_norm": 0.00010852525883819908, |
| "learning_rate": 3.1948881789137383e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 984360.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.5560723125934601, |
| "epoch": 2.5559105431309903, |
| "grad_norm": 7.974612526595592e-05, |
| "learning_rate": 2.9818956336528226e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 996840.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.5587869763374329, |
| "epoch": 2.587859424920128, |
| "grad_norm": 0.0005656637367792428, |
| "learning_rate": 2.7689030883919065e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1009320.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.5632799625396728, |
| "epoch": 2.619808306709265, |
| "grad_norm": 6.125601794337854e-05, |
| "learning_rate": 2.5559105431309904e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1021800.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.558579832315445, |
| "epoch": 2.6517571884984026, |
| "grad_norm": 0.0008585830801166594, |
| "learning_rate": 2.342917997870075e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1034280.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.5579914152622223, |
| "epoch": 2.68370607028754, |
| "grad_norm": 5.5771433835616335e-05, |
| "learning_rate": 2.129925452609159e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1046760.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.5599809646606445, |
| "epoch": 2.7156549520766773, |
| "grad_norm": 0.00012791369226761162, |
| "learning_rate": 1.916932907348243e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1059240.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.5608228087425232, |
| "epoch": 2.747603833865815, |
| "grad_norm": 8.307035022880882e-05, |
| "learning_rate": 1.7039403620873272e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1071720.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.5607754468917847, |
| "epoch": 2.779552715654952, |
| "grad_norm": 5.250581671134569e-05, |
| "learning_rate": 1.4909478168264113e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1084200.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.5697051167488099, |
| "epoch": 2.8115015974440896, |
| "grad_norm": 0.0002477150410413742, |
| "learning_rate": 1.2779552715654952e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1096680.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.5599372982978821, |
| "epoch": 2.8434504792332267, |
| "grad_norm": 9.851283539319411e-05, |
| "learning_rate": 1.0649627263045796e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1109160.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.5631425619125366, |
| "epoch": 2.8753993610223643, |
| "grad_norm": 5.103146395413205e-05, |
| "learning_rate": 8.519701810436636e-07, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1121640.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.5630890011787415, |
| "epoch": 2.9073482428115014, |
| "grad_norm": 0.0011606919579207897, |
| "learning_rate": 6.389776357827476e-07, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1134120.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.5582900941371918, |
| "epoch": 2.939297124600639, |
| "grad_norm": 0.0002894483332056552, |
| "learning_rate": 4.259850905218318e-07, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 1146600.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.5616473734378815, |
| "epoch": 2.9712460063897765, |
| "grad_norm": 4.966451888321899e-05, |
| "learning_rate": 2.129925452609159e-07, |
| "loss": 0.0008, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 1159080.0, |
| "step": 930 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 939, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 704006916867072.0, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|