| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 50, | |
| "global_step": 13947, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0035850646207897896, | |
| "grad_norm": 28.958446502685547, | |
| "learning_rate": 3.512544802867384e-06, | |
| "loss": 14.3981, | |
| "mean_token_accuracy": 0.4658013021945953, | |
| "num_tokens": 631305.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0035850646207897896, | |
| "eval_loss": 3.598968505859375, | |
| "eval_mean_token_accuracy": 0.4642415362596512, | |
| "eval_num_tokens": 631305.0, | |
| "eval_runtime": 55.3723, | |
| "eval_samples_per_second": 7.224, | |
| "eval_steps_per_second": 0.903, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.007170129241579579, | |
| "grad_norm": 46.98331832885742, | |
| "learning_rate": 7.096774193548387e-06, | |
| "loss": 13.6155, | |
| "mean_token_accuracy": 0.47677032694220545, | |
| "num_tokens": 1263143.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.007170129241579579, | |
| "eval_loss": 3.242854595184326, | |
| "eval_mean_token_accuracy": 0.4895547354221344, | |
| "eval_num_tokens": 1263143.0, | |
| "eval_runtime": 56.3676, | |
| "eval_samples_per_second": 7.096, | |
| "eval_steps_per_second": 0.887, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01075519386236937, | |
| "grad_norm": 23.78474235534668, | |
| "learning_rate": 1.0681003584229391e-05, | |
| "loss": 11.8849, | |
| "mean_token_accuracy": 0.5030167695879936, | |
| "num_tokens": 1896120.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.01075519386236937, | |
| "eval_loss": 2.791355609893799, | |
| "eval_mean_token_accuracy": 0.5165965485572815, | |
| "eval_num_tokens": 1896120.0, | |
| "eval_runtime": 55.265, | |
| "eval_samples_per_second": 7.238, | |
| "eval_steps_per_second": 0.905, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.014340258483159158, | |
| "grad_norm": 11.690914154052734, | |
| "learning_rate": 1.4265232974910395e-05, | |
| "loss": 9.8852, | |
| "mean_token_accuracy": 0.5450691656768322, | |
| "num_tokens": 2527332.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.014340258483159158, | |
| "eval_loss": 2.1748604774475098, | |
| "eval_mean_token_accuracy": 0.5758289074897767, | |
| "eval_num_tokens": 2527332.0, | |
| "eval_runtime": 55.335, | |
| "eval_samples_per_second": 7.229, | |
| "eval_steps_per_second": 0.904, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01792532310394895, | |
| "grad_norm": 9.011432647705078, | |
| "learning_rate": 1.78494623655914e-05, | |
| "loss": 7.7315, | |
| "mean_token_accuracy": 0.5919728323817253, | |
| "num_tokens": 3158451.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.01792532310394895, | |
| "eval_loss": 1.7914152145385742, | |
| "eval_mean_token_accuracy": 0.6036396706104279, | |
| "eval_num_tokens": 3158451.0, | |
| "eval_runtime": 55.3537, | |
| "eval_samples_per_second": 7.226, | |
| "eval_steps_per_second": 0.903, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.02151038772473874, | |
| "grad_norm": 9.172738075256348, | |
| "learning_rate": 2.1433691756272405e-05, | |
| "loss": 6.6634, | |
| "mean_token_accuracy": 0.6193091833591461, | |
| "num_tokens": 3790537.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.02151038772473874, | |
| "eval_loss": 1.5858986377716064, | |
| "eval_mean_token_accuracy": 0.6325684702396392, | |
| "eval_num_tokens": 3790537.0, | |
| "eval_runtime": 55.5221, | |
| "eval_samples_per_second": 7.204, | |
| "eval_steps_per_second": 0.901, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.025095452345528527, | |
| "grad_norm": 6.380577087402344, | |
| "learning_rate": 2.5017921146953403e-05, | |
| "loss": 5.9955, | |
| "mean_token_accuracy": 0.6299453395605087, | |
| "num_tokens": 4416803.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.025095452345528527, | |
| "eval_loss": 1.4283970594406128, | |
| "eval_mean_token_accuracy": 0.637500970363617, | |
| "eval_num_tokens": 4416803.0, | |
| "eval_runtime": 55.6098, | |
| "eval_samples_per_second": 7.193, | |
| "eval_steps_per_second": 0.899, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.028680516966318317, | |
| "grad_norm": 8.31059455871582, | |
| "learning_rate": 2.860215053763441e-05, | |
| "loss": 5.6524, | |
| "mean_token_accuracy": 0.6386667934060096, | |
| "num_tokens": 5049525.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.028680516966318317, | |
| "eval_loss": 1.4044820070266724, | |
| "eval_mean_token_accuracy": 0.6408572208881378, | |
| "eval_num_tokens": 5049525.0, | |
| "eval_runtime": 55.375, | |
| "eval_samples_per_second": 7.223, | |
| "eval_steps_per_second": 0.903, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.03226558158710811, | |
| "grad_norm": 8.33178997039795, | |
| "learning_rate": 3.218637992831541e-05, | |
| "loss": 5.5798, | |
| "mean_token_accuracy": 0.6421743601560592, | |
| "num_tokens": 5681852.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.03226558158710811, | |
| "eval_loss": 1.390726923942566, | |
| "eval_mean_token_accuracy": 0.642348815202713, | |
| "eval_num_tokens": 5681852.0, | |
| "eval_runtime": 55.3167, | |
| "eval_samples_per_second": 7.231, | |
| "eval_steps_per_second": 0.904, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0358506462078979, | |
| "grad_norm": 6.159327507019043, | |
| "learning_rate": 3.577060931899642e-05, | |
| "loss": 5.5753, | |
| "mean_token_accuracy": 0.6416634133458138, | |
| "num_tokens": 6314159.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0358506462078979, | |
| "eval_loss": 1.3759286403656006, | |
| "eval_mean_token_accuracy": 0.6448968076705932, | |
| "eval_num_tokens": 6314159.0, | |
| "eval_runtime": 55.4132, | |
| "eval_samples_per_second": 7.219, | |
| "eval_steps_per_second": 0.902, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03943571082868769, | |
| "grad_norm": 7.295239448547363, | |
| "learning_rate": 3.935483870967742e-05, | |
| "loss": 5.4486, | |
| "mean_token_accuracy": 0.6444561332464218, | |
| "num_tokens": 6948430.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.03943571082868769, | |
| "eval_loss": 1.3677067756652832, | |
| "eval_mean_token_accuracy": 0.6456243467330932, | |
| "eval_num_tokens": 6948430.0, | |
| "eval_runtime": 55.4348, | |
| "eval_samples_per_second": 7.216, | |
| "eval_steps_per_second": 0.902, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.04302077544947748, | |
| "grad_norm": 8.140225410461426, | |
| "learning_rate": 4.2939068100358425e-05, | |
| "loss": 5.491, | |
| "mean_token_accuracy": 0.6452211833000183, | |
| "num_tokens": 7574739.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04302077544947748, | |
| "eval_loss": 1.3580710887908936, | |
| "eval_mean_token_accuracy": 0.6465290606021881, | |
| "eval_num_tokens": 7574739.0, | |
| "eval_runtime": 55.3881, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 0.903, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04660584007026727, | |
| "grad_norm": 7.000651836395264, | |
| "learning_rate": 4.6523297491039434e-05, | |
| "loss": 5.4196, | |
| "mean_token_accuracy": 0.6482405418157577, | |
| "num_tokens": 8203512.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.04660584007026727, | |
| "eval_loss": 1.3494269847869873, | |
| "eval_mean_token_accuracy": 0.6481932699680328, | |
| "eval_num_tokens": 8203512.0, | |
| "eval_runtime": 55.3883, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 0.903, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.05019090469105705, | |
| "grad_norm": 8.582626342773438, | |
| "learning_rate": 5.0107526881720436e-05, | |
| "loss": 5.3867, | |
| "mean_token_accuracy": 0.650465478003025, | |
| "num_tokens": 8831306.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.05019090469105705, | |
| "eval_loss": 1.3439626693725586, | |
| "eval_mean_token_accuracy": 0.6484399271011353, | |
| "eval_num_tokens": 8831306.0, | |
| "eval_runtime": 55.4414, | |
| "eval_samples_per_second": 7.215, | |
| "eval_steps_per_second": 0.902, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.05377596931184685, | |
| "grad_norm": 8.785462379455566, | |
| "learning_rate": 5.369175627240144e-05, | |
| "loss": 5.3822, | |
| "mean_token_accuracy": 0.6480473777651787, | |
| "num_tokens": 9462041.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.05377596931184685, | |
| "eval_loss": 1.3385406732559204, | |
| "eval_mean_token_accuracy": 0.649928457736969, | |
| "eval_num_tokens": 9462041.0, | |
| "eval_runtime": 55.3213, | |
| "eval_samples_per_second": 7.23, | |
| "eval_steps_per_second": 0.904, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.05736103393263663, | |
| "grad_norm": 6.1994547843933105, | |
| "learning_rate": 5.727598566308244e-05, | |
| "loss": 5.305, | |
| "mean_token_accuracy": 0.6530899196863175, | |
| "num_tokens": 10095648.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.05736103393263663, | |
| "eval_loss": 1.333341360092163, | |
| "eval_mean_token_accuracy": 0.6506243336200714, | |
| "eval_num_tokens": 10095648.0, | |
| "eval_runtime": 55.4523, | |
| "eval_samples_per_second": 7.213, | |
| "eval_steps_per_second": 0.902, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.06094609855342643, | |
| "grad_norm": 5.850490570068359, | |
| "learning_rate": 6.086021505376345e-05, | |
| "loss": 5.3301, | |
| "mean_token_accuracy": 0.6499336344003678, | |
| "num_tokens": 10730377.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.06094609855342643, | |
| "eval_loss": 1.3294757604599, | |
| "eval_mean_token_accuracy": 0.6494286286830903, | |
| "eval_num_tokens": 10730377.0, | |
| "eval_runtime": 55.628, | |
| "eval_samples_per_second": 7.191, | |
| "eval_steps_per_second": 0.899, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.06453116317421621, | |
| "grad_norm": 5.629384517669678, | |
| "learning_rate": 6.444444444444446e-05, | |
| "loss": 5.2911, | |
| "mean_token_accuracy": 0.6521104833483696, | |
| "num_tokens": 11363798.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.06453116317421621, | |
| "eval_loss": 1.3235622644424438, | |
| "eval_mean_token_accuracy": 0.6512654149532318, | |
| "eval_num_tokens": 11363798.0, | |
| "eval_runtime": 55.5848, | |
| "eval_samples_per_second": 7.196, | |
| "eval_steps_per_second": 0.9, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.068116227795006, | |
| "grad_norm": 6.046393871307373, | |
| "learning_rate": 6.802867383512545e-05, | |
| "loss": 5.2478, | |
| "mean_token_accuracy": 0.6536632561683655, | |
| "num_tokens": 11993502.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.068116227795006, | |
| "eval_loss": 1.3197156190872192, | |
| "eval_mean_token_accuracy": 0.6517732429504395, | |
| "eval_num_tokens": 11993502.0, | |
| "eval_runtime": 55.3206, | |
| "eval_samples_per_second": 7.231, | |
| "eval_steps_per_second": 0.904, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.0717012924157958, | |
| "grad_norm": 6.950500011444092, | |
| "learning_rate": 7.161290322580646e-05, | |
| "loss": 5.2368, | |
| "mean_token_accuracy": 0.6554682296514511, | |
| "num_tokens": 12628081.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0717012924157958, | |
| "eval_loss": 1.3148993253707886, | |
| "eval_mean_token_accuracy": 0.652986958026886, | |
| "eval_num_tokens": 12628081.0, | |
| "eval_runtime": 55.4045, | |
| "eval_samples_per_second": 7.22, | |
| "eval_steps_per_second": 0.902, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07528635703658558, | |
| "grad_norm": 5.844649791717529, | |
| "learning_rate": 7.519713261648746e-05, | |
| "loss": 5.2604, | |
| "mean_token_accuracy": 0.6538248571753502, | |
| "num_tokens": 13254893.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.07528635703658558, | |
| "eval_loss": 1.3124916553497314, | |
| "eval_mean_token_accuracy": 0.6538188600540161, | |
| "eval_num_tokens": 13254893.0, | |
| "eval_runtime": 56.9195, | |
| "eval_samples_per_second": 7.027, | |
| "eval_steps_per_second": 0.878, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.07887142165737537, | |
| "grad_norm": 5.3114094734191895, | |
| "learning_rate": 7.878136200716845e-05, | |
| "loss": 5.235, | |
| "mean_token_accuracy": 0.6541680765151977, | |
| "num_tokens": 13893524.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.07887142165737537, | |
| "eval_loss": 1.309714674949646, | |
| "eval_mean_token_accuracy": 0.6538467502593994, | |
| "eval_num_tokens": 13893524.0, | |
| "eval_runtime": 56.3705, | |
| "eval_samples_per_second": 7.096, | |
| "eval_steps_per_second": 0.887, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.08245648627816517, | |
| "grad_norm": 5.666459083557129, | |
| "learning_rate": 8.236559139784946e-05, | |
| "loss": 5.1803, | |
| "mean_token_accuracy": 0.6568083089590072, | |
| "num_tokens": 14522906.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.08245648627816517, | |
| "eval_loss": 1.305640459060669, | |
| "eval_mean_token_accuracy": 0.655296059846878, | |
| "eval_num_tokens": 14522906.0, | |
| "eval_runtime": 57.2106, | |
| "eval_samples_per_second": 6.992, | |
| "eval_steps_per_second": 0.874, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.08604155089895496, | |
| "grad_norm": 6.020337104797363, | |
| "learning_rate": 8.594982078853047e-05, | |
| "loss": 5.2056, | |
| "mean_token_accuracy": 0.653913055062294, | |
| "num_tokens": 15156803.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.08604155089895496, | |
| "eval_loss": 1.3023688793182373, | |
| "eval_mean_token_accuracy": 0.6569918835163117, | |
| "eval_num_tokens": 15156803.0, | |
| "eval_runtime": 56.4134, | |
| "eval_samples_per_second": 7.091, | |
| "eval_steps_per_second": 0.886, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.08962661551974474, | |
| "grad_norm": 5.757259368896484, | |
| "learning_rate": 8.953405017921147e-05, | |
| "loss": 5.2154, | |
| "mean_token_accuracy": 0.6549820226430892, | |
| "num_tokens": 15788828.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.08962661551974474, | |
| "eval_loss": 1.3033726215362549, | |
| "eval_mean_token_accuracy": 0.6548340058326722, | |
| "eval_num_tokens": 15788828.0, | |
| "eval_runtime": 56.3999, | |
| "eval_samples_per_second": 7.092, | |
| "eval_steps_per_second": 0.887, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.09321168014053453, | |
| "grad_norm": 6.876058101654053, | |
| "learning_rate": 9.311827956989248e-05, | |
| "loss": 5.2374, | |
| "mean_token_accuracy": 0.6526922315359116, | |
| "num_tokens": 16423385.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.09321168014053453, | |
| "eval_loss": 1.2987463474273682, | |
| "eval_mean_token_accuracy": 0.6556110656261445, | |
| "eval_num_tokens": 16423385.0, | |
| "eval_runtime": 56.3308, | |
| "eval_samples_per_second": 7.101, | |
| "eval_steps_per_second": 0.888, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.09679674476132433, | |
| "grad_norm": 5.170133590698242, | |
| "learning_rate": 9.670250896057349e-05, | |
| "loss": 5.2584, | |
| "mean_token_accuracy": 0.6529216593503953, | |
| "num_tokens": 17058608.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.09679674476132433, | |
| "eval_loss": 1.2979986667633057, | |
| "eval_mean_token_accuracy": 0.6553151261806488, | |
| "eval_num_tokens": 17058608.0, | |
| "eval_runtime": 56.3719, | |
| "eval_samples_per_second": 7.096, | |
| "eval_steps_per_second": 0.887, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.1003818093821141, | |
| "grad_norm": 5.678673267364502, | |
| "learning_rate": 9.996813256851498e-05, | |
| "loss": 5.1909, | |
| "mean_token_accuracy": 0.6570302325487137, | |
| "num_tokens": 17689690.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.1003818093821141, | |
| "eval_loss": 1.2949328422546387, | |
| "eval_mean_token_accuracy": 0.6552190041542053, | |
| "eval_num_tokens": 17689690.0, | |
| "eval_runtime": 56.3054, | |
| "eval_samples_per_second": 7.104, | |
| "eval_steps_per_second": 0.888, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.1039668740029039, | |
| "grad_norm": 4.694892406463623, | |
| "learning_rate": 9.956978967495221e-05, | |
| "loss": 5.1132, | |
| "mean_token_accuracy": 0.6600784501433372, | |
| "num_tokens": 18321232.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1039668740029039, | |
| "eval_loss": 1.2946751117706299, | |
| "eval_mean_token_accuracy": 0.6560806667804718, | |
| "eval_num_tokens": 18321232.0, | |
| "eval_runtime": 56.7345, | |
| "eval_samples_per_second": 7.05, | |
| "eval_steps_per_second": 0.881, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1075519386236937, | |
| "grad_norm": 5.286959171295166, | |
| "learning_rate": 9.917144678138942e-05, | |
| "loss": 5.2297, | |
| "mean_token_accuracy": 0.6539956346154213, | |
| "num_tokens": 18952518.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.1075519386236937, | |
| "eval_loss": 1.2900216579437256, | |
| "eval_mean_token_accuracy": 0.6562173092365264, | |
| "eval_num_tokens": 18952518.0, | |
| "eval_runtime": 56.3853, | |
| "eval_samples_per_second": 7.094, | |
| "eval_steps_per_second": 0.887, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.11113700324448349, | |
| "grad_norm": 5.229610443115234, | |
| "learning_rate": 9.877310388782664e-05, | |
| "loss": 5.1376, | |
| "mean_token_accuracy": 0.6599933451414108, | |
| "num_tokens": 19580453.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.11113700324448349, | |
| "eval_loss": 1.2871261835098267, | |
| "eval_mean_token_accuracy": 0.6577617633342743, | |
| "eval_num_tokens": 19580453.0, | |
| "eval_runtime": 56.3376, | |
| "eval_samples_per_second": 7.1, | |
| "eval_steps_per_second": 0.888, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.11472206786527327, | |
| "grad_norm": 4.540684223175049, | |
| "learning_rate": 9.837476099426386e-05, | |
| "loss": 5.1124, | |
| "mean_token_accuracy": 0.659881052672863, | |
| "num_tokens": 20220713.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.11472206786527327, | |
| "eval_loss": 1.2855585813522339, | |
| "eval_mean_token_accuracy": 0.657675279378891, | |
| "eval_num_tokens": 20220713.0, | |
| "eval_runtime": 56.4741, | |
| "eval_samples_per_second": 7.083, | |
| "eval_steps_per_second": 0.885, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.11830713248606306, | |
| "grad_norm": 5.147482872009277, | |
| "learning_rate": 9.797641810070109e-05, | |
| "loss": 5.1251, | |
| "mean_token_accuracy": 0.658254965543747, | |
| "num_tokens": 20853860.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.11830713248606306, | |
| "eval_loss": 1.283848762512207, | |
| "eval_mean_token_accuracy": 0.6582966887950897, | |
| "eval_num_tokens": 20853860.0, | |
| "eval_runtime": 57.6401, | |
| "eval_samples_per_second": 6.94, | |
| "eval_steps_per_second": 0.867, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.12189219710685285, | |
| "grad_norm": 4.544667720794678, | |
| "learning_rate": 9.757807520713831e-05, | |
| "loss": 5.0706, | |
| "mean_token_accuracy": 0.6628770676255226, | |
| "num_tokens": 21487498.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.12189219710685285, | |
| "eval_loss": 1.2798463106155396, | |
| "eval_mean_token_accuracy": 0.6587248671054841, | |
| "eval_num_tokens": 21487498.0, | |
| "eval_runtime": 56.4142, | |
| "eval_samples_per_second": 7.09, | |
| "eval_steps_per_second": 0.886, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.12547726172764265, | |
| "grad_norm": 4.541973114013672, | |
| "learning_rate": 9.717973231357553e-05, | |
| "loss": 5.054, | |
| "mean_token_accuracy": 0.6628148990869522, | |
| "num_tokens": 22120725.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.12547726172764265, | |
| "eval_loss": 1.278252124786377, | |
| "eval_mean_token_accuracy": 0.6595626533031463, | |
| "eval_num_tokens": 22120725.0, | |
| "eval_runtime": 56.5087, | |
| "eval_samples_per_second": 7.079, | |
| "eval_steps_per_second": 0.885, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.12906232634843243, | |
| "grad_norm": 5.01814603805542, | |
| "learning_rate": 9.678138942001275e-05, | |
| "loss": 5.1334, | |
| "mean_token_accuracy": 0.6570143532752991, | |
| "num_tokens": 22751630.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.12906232634843243, | |
| "eval_loss": 1.2745345830917358, | |
| "eval_mean_token_accuracy": 0.6588572013378143, | |
| "eval_num_tokens": 22751630.0, | |
| "eval_runtime": 56.4708, | |
| "eval_samples_per_second": 7.083, | |
| "eval_steps_per_second": 0.885, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.1326473909692222, | |
| "grad_norm": 5.249142646789551, | |
| "learning_rate": 9.638304652644997e-05, | |
| "loss": 5.0772, | |
| "mean_token_accuracy": 0.6610330584645271, | |
| "num_tokens": 23380871.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.1326473909692222, | |
| "eval_loss": 1.271730661392212, | |
| "eval_mean_token_accuracy": 0.6606413364410401, | |
| "eval_num_tokens": 23380871.0, | |
| "eval_runtime": 56.3956, | |
| "eval_samples_per_second": 7.093, | |
| "eval_steps_per_second": 0.887, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.136232455590012, | |
| "grad_norm": 4.495316505432129, | |
| "learning_rate": 9.598470363288719e-05, | |
| "loss": 5.1115, | |
| "mean_token_accuracy": 0.6598174887895584, | |
| "num_tokens": 24016153.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.136232455590012, | |
| "eval_loss": 1.2678121328353882, | |
| "eval_mean_token_accuracy": 0.659992311000824, | |
| "eval_num_tokens": 24016153.0, | |
| "eval_runtime": 56.3408, | |
| "eval_samples_per_second": 7.1, | |
| "eval_steps_per_second": 0.887, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.1398175202108018, | |
| "grad_norm": 4.675460338592529, | |
| "learning_rate": 9.558636073932441e-05, | |
| "loss": 5.1039, | |
| "mean_token_accuracy": 0.6611927005648613, | |
| "num_tokens": 24651410.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.1398175202108018, | |
| "eval_loss": 1.2681583166122437, | |
| "eval_mean_token_accuracy": 0.6607218337059021, | |
| "eval_num_tokens": 24651410.0, | |
| "eval_runtime": 56.0826, | |
| "eval_samples_per_second": 7.132, | |
| "eval_steps_per_second": 0.892, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.1434025848315916, | |
| "grad_norm": 4.928748607635498, | |
| "learning_rate": 9.518801784576164e-05, | |
| "loss": 5.1005, | |
| "mean_token_accuracy": 0.6606691733002663, | |
| "num_tokens": 25282438.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1434025848315916, | |
| "eval_loss": 1.2662436962127686, | |
| "eval_mean_token_accuracy": 0.6602835392951966, | |
| "eval_num_tokens": 25282438.0, | |
| "eval_runtime": 56.1345, | |
| "eval_samples_per_second": 7.126, | |
| "eval_steps_per_second": 0.891, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.14698764945238138, | |
| "grad_norm": 4.237011432647705, | |
| "learning_rate": 9.478967495219886e-05, | |
| "loss": 5.0865, | |
| "mean_token_accuracy": 0.6610729214549065, | |
| "num_tokens": 25914467.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.14698764945238138, | |
| "eval_loss": 1.2659285068511963, | |
| "eval_mean_token_accuracy": 0.6619378459453583, | |
| "eval_num_tokens": 25914467.0, | |
| "eval_runtime": 56.41, | |
| "eval_samples_per_second": 7.091, | |
| "eval_steps_per_second": 0.886, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.15057271407317116, | |
| "grad_norm": 4.498386383056641, | |
| "learning_rate": 9.439133205863608e-05, | |
| "loss": 5.0536, | |
| "mean_token_accuracy": 0.662959768474102, | |
| "num_tokens": 26547088.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.15057271407317116, | |
| "eval_loss": 1.2620855569839478, | |
| "eval_mean_token_accuracy": 0.6622272551059722, | |
| "eval_num_tokens": 26547088.0, | |
| "eval_runtime": 56.4966, | |
| "eval_samples_per_second": 7.08, | |
| "eval_steps_per_second": 0.885, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.15415777869396097, | |
| "grad_norm": 4.547789573669434, | |
| "learning_rate": 9.39929891650733e-05, | |
| "loss": 5.0074, | |
| "mean_token_accuracy": 0.6667503699660301, | |
| "num_tokens": 27181781.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.15415777869396097, | |
| "eval_loss": 1.2630900144577026, | |
| "eval_mean_token_accuracy": 0.6616924941539765, | |
| "eval_num_tokens": 27181781.0, | |
| "eval_runtime": 56.423, | |
| "eval_samples_per_second": 7.089, | |
| "eval_steps_per_second": 0.886, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.15774284331475075, | |
| "grad_norm": 4.9150896072387695, | |
| "learning_rate": 9.359464627151052e-05, | |
| "loss": 5.0802, | |
| "mean_token_accuracy": 0.6615395992994308, | |
| "num_tokens": 27816217.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.15774284331475075, | |
| "eval_loss": 1.2614257335662842, | |
| "eval_mean_token_accuracy": 0.6630010890960694, | |
| "eval_num_tokens": 27816217.0, | |
| "eval_runtime": 56.1652, | |
| "eval_samples_per_second": 7.122, | |
| "eval_steps_per_second": 0.89, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.16132790793554053, | |
| "grad_norm": 4.487524032592773, | |
| "learning_rate": 9.319630337794774e-05, | |
| "loss": 5.0135, | |
| "mean_token_accuracy": 0.6651386457681656, | |
| "num_tokens": 28449167.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.16132790793554053, | |
| "eval_loss": 1.2573643922805786, | |
| "eval_mean_token_accuracy": 0.6616563200950623, | |
| "eval_num_tokens": 28449167.0, | |
| "eval_runtime": 56.2616, | |
| "eval_samples_per_second": 7.11, | |
| "eval_steps_per_second": 0.889, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.16491297255633033, | |
| "grad_norm": 4.237537860870361, | |
| "learning_rate": 9.279796048438496e-05, | |
| "loss": 5.0198, | |
| "mean_token_accuracy": 0.6647991991043091, | |
| "num_tokens": 29080404.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.16491297255633033, | |
| "eval_loss": 1.2570703029632568, | |
| "eval_mean_token_accuracy": 0.6629821956157684, | |
| "eval_num_tokens": 29080404.0, | |
| "eval_runtime": 56.2199, | |
| "eval_samples_per_second": 7.115, | |
| "eval_steps_per_second": 0.889, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.1684980371771201, | |
| "grad_norm": 4.835733890533447, | |
| "learning_rate": 9.239961759082219e-05, | |
| "loss": 5.0592, | |
| "mean_token_accuracy": 0.6617502626776696, | |
| "num_tokens": 29711200.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.1684980371771201, | |
| "eval_loss": 1.2566256523132324, | |
| "eval_mean_token_accuracy": 0.6639114606380463, | |
| "eval_num_tokens": 29711200.0, | |
| "eval_runtime": 56.2408, | |
| "eval_samples_per_second": 7.112, | |
| "eval_steps_per_second": 0.889, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.17208310179790992, | |
| "grad_norm": 4.832096099853516, | |
| "learning_rate": 9.200127469725941e-05, | |
| "loss": 5.0603, | |
| "mean_token_accuracy": 0.6628168100118637, | |
| "num_tokens": 30343251.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.17208310179790992, | |
| "eval_loss": 1.2529999017715454, | |
| "eval_mean_token_accuracy": 0.6634249198436737, | |
| "eval_num_tokens": 30343251.0, | |
| "eval_runtime": 56.4282, | |
| "eval_samples_per_second": 7.089, | |
| "eval_steps_per_second": 0.886, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.1756681664186997, | |
| "grad_norm": 4.870041370391846, | |
| "learning_rate": 9.160293180369663e-05, | |
| "loss": 5.0036, | |
| "mean_token_accuracy": 0.6657980665564537, | |
| "num_tokens": 30983003.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.1756681664186997, | |
| "eval_loss": 1.2508057355880737, | |
| "eval_mean_token_accuracy": 0.664600031375885, | |
| "eval_num_tokens": 30983003.0, | |
| "eval_runtime": 56.1477, | |
| "eval_samples_per_second": 7.124, | |
| "eval_steps_per_second": 0.891, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.17925323103948948, | |
| "grad_norm": 4.8386993408203125, | |
| "learning_rate": 9.120458891013385e-05, | |
| "loss": 4.9378, | |
| "mean_token_accuracy": 0.6697717472910881, | |
| "num_tokens": 31612440.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.17925323103948948, | |
| "eval_loss": 1.2504231929779053, | |
| "eval_mean_token_accuracy": 0.6656661999225616, | |
| "eval_num_tokens": 31612440.0, | |
| "eval_runtime": 56.432, | |
| "eval_samples_per_second": 7.088, | |
| "eval_steps_per_second": 0.886, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1828382956602793, | |
| "grad_norm": 4.897119045257568, | |
| "learning_rate": 9.080624601657107e-05, | |
| "loss": 5.0576, | |
| "mean_token_accuracy": 0.6637110111117362, | |
| "num_tokens": 32246052.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.1828382956602793, | |
| "eval_loss": 1.2486332654953003, | |
| "eval_mean_token_accuracy": 0.6662761294841766, | |
| "eval_num_tokens": 32246052.0, | |
| "eval_runtime": 56.3188, | |
| "eval_samples_per_second": 7.102, | |
| "eval_steps_per_second": 0.888, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.18642336028106907, | |
| "grad_norm": 4.67065954208374, | |
| "learning_rate": 9.040790312300828e-05, | |
| "loss": 5.0137, | |
| "mean_token_accuracy": 0.6648938983678818, | |
| "num_tokens": 32876620.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.18642336028106907, | |
| "eval_loss": 1.2486134767532349, | |
| "eval_mean_token_accuracy": 0.6653382694721222, | |
| "eval_num_tokens": 32876620.0, | |
| "eval_runtime": 56.3432, | |
| "eval_samples_per_second": 7.099, | |
| "eval_steps_per_second": 0.887, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.19000842490185885, | |
| "grad_norm": 4.4095563888549805, | |
| "learning_rate": 9.000956022944551e-05, | |
| "loss": 5.0065, | |
| "mean_token_accuracy": 0.6646731504797936, | |
| "num_tokens": 33511123.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.19000842490185885, | |
| "eval_loss": 1.2494382858276367, | |
| "eval_mean_token_accuracy": 0.6639154195785523, | |
| "eval_num_tokens": 33511123.0, | |
| "eval_runtime": 56.3106, | |
| "eval_samples_per_second": 7.103, | |
| "eval_steps_per_second": 0.888, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.19359348952264865, | |
| "grad_norm": 4.82889461517334, | |
| "learning_rate": 8.961121733588274e-05, | |
| "loss": 5.0066, | |
| "mean_token_accuracy": 0.666558310687542, | |
| "num_tokens": 34141280.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.19359348952264865, | |
| "eval_loss": 1.2460081577301025, | |
| "eval_mean_token_accuracy": 0.6646526777744293, | |
| "eval_num_tokens": 34141280.0, | |
| "eval_runtime": 56.6246, | |
| "eval_samples_per_second": 7.064, | |
| "eval_steps_per_second": 0.883, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.19717855414343843, | |
| "grad_norm": 4.663321018218994, | |
| "learning_rate": 8.921287444231994e-05, | |
| "loss": 4.9428, | |
| "mean_token_accuracy": 0.6681969156861305, | |
| "num_tokens": 34774698.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.19717855414343843, | |
| "eval_loss": 1.244221806526184, | |
| "eval_mean_token_accuracy": 0.6656762886047364, | |
| "eval_num_tokens": 34774698.0, | |
| "eval_runtime": 56.2438, | |
| "eval_samples_per_second": 7.112, | |
| "eval_steps_per_second": 0.889, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.2007636187642282, | |
| "grad_norm": 4.264768600463867, | |
| "learning_rate": 8.881453154875718e-05, | |
| "loss": 5.0028, | |
| "mean_token_accuracy": 0.6653160175681114, | |
| "num_tokens": 35406754.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.2007636187642282, | |
| "eval_loss": 1.2440837621688843, | |
| "eval_mean_token_accuracy": 0.6653203201293946, | |
| "eval_num_tokens": 35406754.0, | |
| "eval_runtime": 56.2314, | |
| "eval_samples_per_second": 7.113, | |
| "eval_steps_per_second": 0.889, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.20434868338501802, | |
| "grad_norm": 4.938720226287842, | |
| "learning_rate": 8.84161886551944e-05, | |
| "loss": 4.9905, | |
| "mean_token_accuracy": 0.666375992000103, | |
| "num_tokens": 36037785.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.20434868338501802, | |
| "eval_loss": 1.2425023317337036, | |
| "eval_mean_token_accuracy": 0.664773497581482, | |
| "eval_num_tokens": 36037785.0, | |
| "eval_runtime": 56.4922, | |
| "eval_samples_per_second": 7.081, | |
| "eval_steps_per_second": 0.885, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.2079337480058078, | |
| "grad_norm": 4.350741386413574, | |
| "learning_rate": 8.801784576163161e-05, | |
| "loss": 4.858, | |
| "mean_token_accuracy": 0.6740978673100472, | |
| "num_tokens": 36672636.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.2079337480058078, | |
| "eval_loss": 1.2399791479110718, | |
| "eval_mean_token_accuracy": 0.6653912532329559, | |
| "eval_num_tokens": 36672636.0, | |
| "eval_runtime": 57.5992, | |
| "eval_samples_per_second": 6.945, | |
| "eval_steps_per_second": 0.868, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.2115188126265976, | |
| "grad_norm": 4.187928676605225, | |
| "learning_rate": 8.761950286806884e-05, | |
| "loss": 4.973, | |
| "mean_token_accuracy": 0.6659010905027389, | |
| "num_tokens": 37304390.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.2115188126265976, | |
| "eval_loss": 1.239449143409729, | |
| "eval_mean_token_accuracy": 0.6661972737312317, | |
| "eval_num_tokens": 37304390.0, | |
| "eval_runtime": 55.9136, | |
| "eval_samples_per_second": 7.154, | |
| "eval_steps_per_second": 0.894, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.2151038772473874, | |
| "grad_norm": 4.3214802742004395, | |
| "learning_rate": 8.722115997450606e-05, | |
| "loss": 4.9911, | |
| "mean_token_accuracy": 0.6659192404150963, | |
| "num_tokens": 37937712.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.2151038772473874, | |
| "eval_loss": 1.2380547523498535, | |
| "eval_mean_token_accuracy": 0.6668792748451233, | |
| "eval_num_tokens": 37937712.0, | |
| "eval_runtime": 56.7402, | |
| "eval_samples_per_second": 7.05, | |
| "eval_steps_per_second": 0.881, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.21868894186817717, | |
| "grad_norm": 5.154741287231445, | |
| "learning_rate": 8.682281708094327e-05, | |
| "loss": 4.9341, | |
| "mean_token_accuracy": 0.6695549800992012, | |
| "num_tokens": 38567208.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.21868894186817717, | |
| "eval_loss": 1.2387843132019043, | |
| "eval_mean_token_accuracy": 0.6668635201454163, | |
| "eval_num_tokens": 38567208.0, | |
| "eval_runtime": 56.2471, | |
| "eval_samples_per_second": 7.111, | |
| "eval_steps_per_second": 0.889, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.22227400648896697, | |
| "grad_norm": 5.014278888702393, | |
| "learning_rate": 8.64244741873805e-05, | |
| "loss": 4.8853, | |
| "mean_token_accuracy": 0.6707546302676201, | |
| "num_tokens": 39198318.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.22227400648896697, | |
| "eval_loss": 1.2394779920578003, | |
| "eval_mean_token_accuracy": 0.6670789694786072, | |
| "eval_num_tokens": 39198318.0, | |
| "eval_runtime": 56.2239, | |
| "eval_samples_per_second": 7.114, | |
| "eval_steps_per_second": 0.889, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.22585907110975675, | |
| "grad_norm": 4.228548049926758, | |
| "learning_rate": 8.602613129381773e-05, | |
| "loss": 4.9269, | |
| "mean_token_accuracy": 0.6687791690230369, | |
| "num_tokens": 39828524.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.22585907110975675, | |
| "eval_loss": 1.2372474670410156, | |
| "eval_mean_token_accuracy": 0.666018306016922, | |
| "eval_num_tokens": 39828524.0, | |
| "eval_runtime": 56.2199, | |
| "eval_samples_per_second": 7.115, | |
| "eval_steps_per_second": 0.889, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.22944413573054653, | |
| "grad_norm": 4.169594764709473, | |
| "learning_rate": 8.562778840025495e-05, | |
| "loss": 4.9485, | |
| "mean_token_accuracy": 0.6667472127079964, | |
| "num_tokens": 40459992.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.22944413573054653, | |
| "eval_loss": 1.2357257604599, | |
| "eval_mean_token_accuracy": 0.6657440733909606, | |
| "eval_num_tokens": 40459992.0, | |
| "eval_runtime": 56.3901, | |
| "eval_samples_per_second": 7.093, | |
| "eval_steps_per_second": 0.887, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.23302920035133634, | |
| "grad_norm": 4.309950828552246, | |
| "learning_rate": 8.522944550669216e-05, | |
| "loss": 4.9128, | |
| "mean_token_accuracy": 0.671622729897499, | |
| "num_tokens": 41094373.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.23302920035133634, | |
| "eval_loss": 1.2348511219024658, | |
| "eval_mean_token_accuracy": 0.6659791529178619, | |
| "eval_num_tokens": 41094373.0, | |
| "eval_runtime": 56.3939, | |
| "eval_samples_per_second": 7.093, | |
| "eval_steps_per_second": 0.887, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.23661426497212612, | |
| "grad_norm": 4.153282642364502, | |
| "learning_rate": 8.48311026131294e-05, | |
| "loss": 4.9831, | |
| "mean_token_accuracy": 0.66548932492733, | |
| "num_tokens": 41725155.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.23661426497212612, | |
| "eval_loss": 1.2328479290008545, | |
| "eval_mean_token_accuracy": 0.6659755408763885, | |
| "eval_num_tokens": 41725155.0, | |
| "eval_runtime": 56.4734, | |
| "eval_samples_per_second": 7.083, | |
| "eval_steps_per_second": 0.885, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.2401993295929159, | |
| "grad_norm": 4.901464462280273, | |
| "learning_rate": 8.443275971956662e-05, | |
| "loss": 4.9905, | |
| "mean_token_accuracy": 0.6660814517736435, | |
| "num_tokens": 42361406.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.2401993295929159, | |
| "eval_loss": 1.2326833009719849, | |
| "eval_mean_token_accuracy": 0.6672022414207458, | |
| "eval_num_tokens": 42361406.0, | |
| "eval_runtime": 56.2896, | |
| "eval_samples_per_second": 7.106, | |
| "eval_steps_per_second": 0.888, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.2437843942137057, | |
| "grad_norm": 4.4632415771484375, | |
| "learning_rate": 8.403441682600382e-05, | |
| "loss": 4.8952, | |
| "mean_token_accuracy": 0.6706425687670707, | |
| "num_tokens": 42993909.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.2437843942137057, | |
| "eval_loss": 1.2327020168304443, | |
| "eval_mean_token_accuracy": 0.6668058276176453, | |
| "eval_num_tokens": 42993909.0, | |
| "eval_runtime": 56.2077, | |
| "eval_samples_per_second": 7.116, | |
| "eval_steps_per_second": 0.89, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.2473694588344955, | |
| "grad_norm": 4.537699222564697, | |
| "learning_rate": 8.363607393244104e-05, | |
| "loss": 4.921, | |
| "mean_token_accuracy": 0.6698031505942345, | |
| "num_tokens": 43628010.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.2473694588344955, | |
| "eval_loss": 1.2328044176101685, | |
| "eval_mean_token_accuracy": 0.667005888223648, | |
| "eval_num_tokens": 43628010.0, | |
| "eval_runtime": 56.142, | |
| "eval_samples_per_second": 7.125, | |
| "eval_steps_per_second": 0.891, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.2509545234552853, | |
| "grad_norm": 4.68520450592041, | |
| "learning_rate": 8.323773103887828e-05, | |
| "loss": 4.9443, | |
| "mean_token_accuracy": 0.667299503982067, | |
| "num_tokens": 44263542.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.2509545234552853, | |
| "eval_loss": 1.2303454875946045, | |
| "eval_mean_token_accuracy": 0.6684185063838959, | |
| "eval_num_tokens": 44263542.0, | |
| "eval_runtime": 56.0547, | |
| "eval_samples_per_second": 7.136, | |
| "eval_steps_per_second": 0.892, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.2545395880760751, | |
| "grad_norm": 4.269311428070068, | |
| "learning_rate": 8.283938814531549e-05, | |
| "loss": 4.9117, | |
| "mean_token_accuracy": 0.6701091477274894, | |
| "num_tokens": 44896448.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.2545395880760751, | |
| "eval_loss": 1.2301256656646729, | |
| "eval_mean_token_accuracy": 0.6679345464706421, | |
| "eval_num_tokens": 44896448.0, | |
| "eval_runtime": 56.8192, | |
| "eval_samples_per_second": 7.04, | |
| "eval_steps_per_second": 0.88, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.25812465269686485, | |
| "grad_norm": 4.6586198806762695, | |
| "learning_rate": 8.244104525175271e-05, | |
| "loss": 4.9361, | |
| "mean_token_accuracy": 0.6700941568613052, | |
| "num_tokens": 45535736.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.25812465269686485, | |
| "eval_loss": 1.2280727624893188, | |
| "eval_mean_token_accuracy": 0.668077005147934, | |
| "eval_num_tokens": 45535736.0, | |
| "eval_runtime": 56.7678, | |
| "eval_samples_per_second": 7.046, | |
| "eval_steps_per_second": 0.881, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.26170971731765463, | |
| "grad_norm": 4.350837230682373, | |
| "learning_rate": 8.204270235818994e-05, | |
| "loss": 4.8535, | |
| "mean_token_accuracy": 0.6710763236880303, | |
| "num_tokens": 46168014.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.26170971731765463, | |
| "eval_loss": 1.2273330688476562, | |
| "eval_mean_token_accuracy": 0.6685185146331787, | |
| "eval_num_tokens": 46168014.0, | |
| "eval_runtime": 56.2347, | |
| "eval_samples_per_second": 7.113, | |
| "eval_steps_per_second": 0.889, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.2652947819384444, | |
| "grad_norm": 4.489384174346924, | |
| "learning_rate": 8.164435946462715e-05, | |
| "loss": 4.9884, | |
| "mean_token_accuracy": 0.6643109431862831, | |
| "num_tokens": 46799865.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.2652947819384444, | |
| "eval_loss": 1.228873610496521, | |
| "eval_mean_token_accuracy": 0.6676431381702423, | |
| "eval_num_tokens": 46799865.0, | |
| "eval_runtime": 56.2386, | |
| "eval_samples_per_second": 7.113, | |
| "eval_steps_per_second": 0.889, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.26887984655923425, | |
| "grad_norm": 4.438107967376709, | |
| "learning_rate": 8.124601657106437e-05, | |
| "loss": 4.8433, | |
| "mean_token_accuracy": 0.6722122520208359, | |
| "num_tokens": 47431886.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.26887984655923425, | |
| "eval_loss": 1.2276620864868164, | |
| "eval_mean_token_accuracy": 0.6684055602550507, | |
| "eval_num_tokens": 47431886.0, | |
| "eval_runtime": 56.3388, | |
| "eval_samples_per_second": 7.1, | |
| "eval_steps_per_second": 0.887, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.272464911180024, | |
| "grad_norm": 4.851945877075195, | |
| "learning_rate": 8.08476736775016e-05, | |
| "loss": 4.9293, | |
| "mean_token_accuracy": 0.668489234149456, | |
| "num_tokens": 48065104.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.272464911180024, | |
| "eval_loss": 1.2269046306610107, | |
| "eval_mean_token_accuracy": 0.6687084710597992, | |
| "eval_num_tokens": 48065104.0, | |
| "eval_runtime": 56.1744, | |
| "eval_samples_per_second": 7.121, | |
| "eval_steps_per_second": 0.89, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.2760499758008138, | |
| "grad_norm": 4.730586528778076, | |
| "learning_rate": 8.044933078393882e-05, | |
| "loss": 4.8258, | |
| "mean_token_accuracy": 0.6736181953549385, | |
| "num_tokens": 48699465.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.2760499758008138, | |
| "eval_loss": 1.2258822917938232, | |
| "eval_mean_token_accuracy": 0.6694431722164154, | |
| "eval_num_tokens": 48699465.0, | |
| "eval_runtime": 56.2799, | |
| "eval_samples_per_second": 7.107, | |
| "eval_steps_per_second": 0.888, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.2796350404216036, | |
| "grad_norm": 4.539992809295654, | |
| "learning_rate": 8.005098789037604e-05, | |
| "loss": 4.9014, | |
| "mean_token_accuracy": 0.6706485760211944, | |
| "num_tokens": 49328518.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.2796350404216036, | |
| "eval_loss": 1.2248101234436035, | |
| "eval_mean_token_accuracy": 0.6695001828670502, | |
| "eval_num_tokens": 49328518.0, | |
| "eval_runtime": 56.4684, | |
| "eval_samples_per_second": 7.084, | |
| "eval_steps_per_second": 0.885, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.28322010504239337, | |
| "grad_norm": 5.000583648681641, | |
| "learning_rate": 7.965264499681326e-05, | |
| "loss": 4.7606, | |
| "mean_token_accuracy": 0.6768921792507172, | |
| "num_tokens": 49957049.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.28322010504239337, | |
| "eval_loss": 1.2227978706359863, | |
| "eval_mean_token_accuracy": 0.6691736376285553, | |
| "eval_num_tokens": 49957049.0, | |
| "eval_runtime": 56.328, | |
| "eval_samples_per_second": 7.101, | |
| "eval_steps_per_second": 0.888, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.2868051696631832, | |
| "grad_norm": 4.855432510375977, | |
| "learning_rate": 7.925430210325048e-05, | |
| "loss": 4.9544, | |
| "mean_token_accuracy": 0.6679512014985085, | |
| "num_tokens": 50593342.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2868051696631832, | |
| "eval_loss": 1.222544550895691, | |
| "eval_mean_token_accuracy": 0.6694585859775544, | |
| "eval_num_tokens": 50593342.0, | |
| "eval_runtime": 56.2378, | |
| "eval_samples_per_second": 7.113, | |
| "eval_steps_per_second": 0.889, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.290390234283973, | |
| "grad_norm": 4.258941173553467, | |
| "learning_rate": 7.88559592096877e-05, | |
| "loss": 4.8759, | |
| "mean_token_accuracy": 0.6715140387415885, | |
| "num_tokens": 51220159.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.290390234283973, | |
| "eval_loss": 1.2212531566619873, | |
| "eval_mean_token_accuracy": 0.6692110347747803, | |
| "eval_num_tokens": 51220159.0, | |
| "eval_runtime": 56.3598, | |
| "eval_samples_per_second": 7.097, | |
| "eval_steps_per_second": 0.887, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.29397529890476276, | |
| "grad_norm": 4.41649055480957, | |
| "learning_rate": 7.845761631612492e-05, | |
| "loss": 4.8512, | |
| "mean_token_accuracy": 0.6730743369460106, | |
| "num_tokens": 51852052.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.29397529890476276, | |
| "eval_loss": 1.2219711542129517, | |
| "eval_mean_token_accuracy": 0.668809084892273, | |
| "eval_num_tokens": 51852052.0, | |
| "eval_runtime": 56.3222, | |
| "eval_samples_per_second": 7.102, | |
| "eval_steps_per_second": 0.888, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.29756036352555254, | |
| "grad_norm": 5.041947841644287, | |
| "learning_rate": 7.805927342256214e-05, | |
| "loss": 4.8012, | |
| "mean_token_accuracy": 0.6752137768268586, | |
| "num_tokens": 52488117.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.29756036352555254, | |
| "eval_loss": 1.2200063467025757, | |
| "eval_mean_token_accuracy": 0.6689476525783539, | |
| "eval_num_tokens": 52488117.0, | |
| "eval_runtime": 56.3663, | |
| "eval_samples_per_second": 7.096, | |
| "eval_steps_per_second": 0.887, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.3011454281463423, | |
| "grad_norm": 4.421684741973877, | |
| "learning_rate": 7.766093052899937e-05, | |
| "loss": 4.9011, | |
| "mean_token_accuracy": 0.6694827458262443, | |
| "num_tokens": 53124002.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.3011454281463423, | |
| "eval_loss": 1.2199760675430298, | |
| "eval_mean_token_accuracy": 0.6698788702487946, | |
| "eval_num_tokens": 53124002.0, | |
| "eval_runtime": 57.1381, | |
| "eval_samples_per_second": 7.001, | |
| "eval_steps_per_second": 0.875, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.30473049276713216, | |
| "grad_norm": 4.482224941253662, | |
| "learning_rate": 7.726258763543659e-05, | |
| "loss": 4.8888, | |
| "mean_token_accuracy": 0.6703590288758278, | |
| "num_tokens": 53754491.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.30473049276713216, | |
| "eval_loss": 1.2201299667358398, | |
| "eval_mean_token_accuracy": 0.6685253477096558, | |
| "eval_num_tokens": 53754491.0, | |
| "eval_runtime": 56.1814, | |
| "eval_samples_per_second": 7.12, | |
| "eval_steps_per_second": 0.89, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.30831555738792193, | |
| "grad_norm": 5.163293838500977, | |
| "learning_rate": 7.686424474187381e-05, | |
| "loss": 4.8934, | |
| "mean_token_accuracy": 0.6694687473773956, | |
| "num_tokens": 54384292.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.30831555738792193, | |
| "eval_loss": 1.2192912101745605, | |
| "eval_mean_token_accuracy": 0.670127317905426, | |
| "eval_num_tokens": 54384292.0, | |
| "eval_runtime": 56.2288, | |
| "eval_samples_per_second": 7.114, | |
| "eval_steps_per_second": 0.889, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.3119006220087117, | |
| "grad_norm": 4.469936847686768, | |
| "learning_rate": 7.646590184831103e-05, | |
| "loss": 4.8808, | |
| "mean_token_accuracy": 0.671641985476017, | |
| "num_tokens": 55015409.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.3119006220087117, | |
| "eval_loss": 1.2187691926956177, | |
| "eval_mean_token_accuracy": 0.6700682175159455, | |
| "eval_num_tokens": 55015409.0, | |
| "eval_runtime": 56.275, | |
| "eval_samples_per_second": 7.108, | |
| "eval_steps_per_second": 0.888, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.3154856866295015, | |
| "grad_norm": 4.397490501403809, | |
| "learning_rate": 7.606755895474825e-05, | |
| "loss": 4.8593, | |
| "mean_token_accuracy": 0.6729298800230026, | |
| "num_tokens": 55645076.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.3154856866295015, | |
| "eval_loss": 1.21873140335083, | |
| "eval_mean_token_accuracy": 0.6701312291622162, | |
| "eval_num_tokens": 55645076.0, | |
| "eval_runtime": 56.3255, | |
| "eval_samples_per_second": 7.102, | |
| "eval_steps_per_second": 0.888, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.3190707512502913, | |
| "grad_norm": 4.565478801727295, | |
| "learning_rate": 7.566921606118547e-05, | |
| "loss": 4.9259, | |
| "mean_token_accuracy": 0.6678701865673066, | |
| "num_tokens": 56277591.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.3190707512502913, | |
| "eval_loss": 1.2173478603363037, | |
| "eval_mean_token_accuracy": 0.6713691699504852, | |
| "eval_num_tokens": 56277591.0, | |
| "eval_runtime": 56.1846, | |
| "eval_samples_per_second": 7.119, | |
| "eval_steps_per_second": 0.89, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.32265581587108105, | |
| "grad_norm": 4.387983798980713, | |
| "learning_rate": 7.52708731676227e-05, | |
| "loss": 4.8506, | |
| "mean_token_accuracy": 0.6727069270610809, | |
| "num_tokens": 56909553.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.32265581587108105, | |
| "eval_loss": 1.2162431478500366, | |
| "eval_mean_token_accuracy": 0.6704595732688904, | |
| "eval_num_tokens": 56909553.0, | |
| "eval_runtime": 56.2594, | |
| "eval_samples_per_second": 7.11, | |
| "eval_steps_per_second": 0.889, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.3262408804918709, | |
| "grad_norm": 4.406232833862305, | |
| "learning_rate": 7.487253027405992e-05, | |
| "loss": 4.8975, | |
| "mean_token_accuracy": 0.6693887722492218, | |
| "num_tokens": 57541706.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.3262408804918709, | |
| "eval_loss": 1.216928243637085, | |
| "eval_mean_token_accuracy": 0.6708524739742279, | |
| "eval_num_tokens": 57541706.0, | |
| "eval_runtime": 56.2862, | |
| "eval_samples_per_second": 7.107, | |
| "eval_steps_per_second": 0.888, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.32982594511266067, | |
| "grad_norm": 4.329367637634277, | |
| "learning_rate": 7.447418738049714e-05, | |
| "loss": 4.8734, | |
| "mean_token_accuracy": 0.6720337501168251, | |
| "num_tokens": 58175251.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.32982594511266067, | |
| "eval_loss": 1.2153425216674805, | |
| "eval_mean_token_accuracy": 0.6709867632389068, | |
| "eval_num_tokens": 58175251.0, | |
| "eval_runtime": 56.449, | |
| "eval_samples_per_second": 7.086, | |
| "eval_steps_per_second": 0.886, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.33341100973345045, | |
| "grad_norm": 4.24669075012207, | |
| "learning_rate": 7.407584448693436e-05, | |
| "loss": 4.8742, | |
| "mean_token_accuracy": 0.6718363285064697, | |
| "num_tokens": 58807276.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.33341100973345045, | |
| "eval_loss": 1.2146964073181152, | |
| "eval_mean_token_accuracy": 0.6710757482051849, | |
| "eval_num_tokens": 58807276.0, | |
| "eval_runtime": 56.3596, | |
| "eval_samples_per_second": 7.097, | |
| "eval_steps_per_second": 0.887, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.3369960743542402, | |
| "grad_norm": 4.037027835845947, | |
| "learning_rate": 7.367750159337158e-05, | |
| "loss": 4.8869, | |
| "mean_token_accuracy": 0.6710193574428558, | |
| "num_tokens": 59434602.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.3369960743542402, | |
| "eval_loss": 1.2160181999206543, | |
| "eval_mean_token_accuracy": 0.6700944793224335, | |
| "eval_num_tokens": 59434602.0, | |
| "eval_runtime": 56.8171, | |
| "eval_samples_per_second": 7.04, | |
| "eval_steps_per_second": 0.88, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.34058113897503, | |
| "grad_norm": 4.7925262451171875, | |
| "learning_rate": 7.32791586998088e-05, | |
| "loss": 4.8639, | |
| "mean_token_accuracy": 0.6717003020644188, | |
| "num_tokens": 60067934.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.34058113897503, | |
| "eval_loss": 1.2146656513214111, | |
| "eval_mean_token_accuracy": 0.6714940690994262, | |
| "eval_num_tokens": 60067934.0, | |
| "eval_runtime": 56.6455, | |
| "eval_samples_per_second": 7.061, | |
| "eval_steps_per_second": 0.883, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.34416620359581984, | |
| "grad_norm": 4.179026126861572, | |
| "learning_rate": 7.288081580624602e-05, | |
| "loss": 4.7815, | |
| "mean_token_accuracy": 0.6770669308304786, | |
| "num_tokens": 60700674.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.34416620359581984, | |
| "eval_loss": 1.2132787704467773, | |
| "eval_mean_token_accuracy": 0.6709755408763886, | |
| "eval_num_tokens": 60700674.0, | |
| "eval_runtime": 56.6884, | |
| "eval_samples_per_second": 7.056, | |
| "eval_steps_per_second": 0.882, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.3477512682166096, | |
| "grad_norm": 4.608165740966797, | |
| "learning_rate": 7.248247291268324e-05, | |
| "loss": 4.8593, | |
| "mean_token_accuracy": 0.6736995288729668, | |
| "num_tokens": 61331555.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.3477512682166096, | |
| "eval_loss": 1.2121059894561768, | |
| "eval_mean_token_accuracy": 0.672556334733963, | |
| "eval_num_tokens": 61331555.0, | |
| "eval_runtime": 56.1182, | |
| "eval_samples_per_second": 7.128, | |
| "eval_steps_per_second": 0.891, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.3513363328373994, | |
| "grad_norm": 4.966649055480957, | |
| "learning_rate": 7.208413001912047e-05, | |
| "loss": 4.8738, | |
| "mean_token_accuracy": 0.671499859392643, | |
| "num_tokens": 61965343.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.3513363328373994, | |
| "eval_loss": 1.2120461463928223, | |
| "eval_mean_token_accuracy": 0.6718708264827729, | |
| "eval_num_tokens": 61965343.0, | |
| "eval_runtime": 56.145, | |
| "eval_samples_per_second": 7.124, | |
| "eval_steps_per_second": 0.891, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.3549213974581892, | |
| "grad_norm": 5.021463871002197, | |
| "learning_rate": 7.168578712555767e-05, | |
| "loss": 4.8567, | |
| "mean_token_accuracy": 0.6712884229421615, | |
| "num_tokens": 62596263.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.3549213974581892, | |
| "eval_loss": 1.2121599912643433, | |
| "eval_mean_token_accuracy": 0.671816600561142, | |
| "eval_num_tokens": 62596263.0, | |
| "eval_runtime": 56.6214, | |
| "eval_samples_per_second": 7.064, | |
| "eval_steps_per_second": 0.883, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.35850646207897896, | |
| "grad_norm": 4.346203804016113, | |
| "learning_rate": 7.128744423199491e-05, | |
| "loss": 4.8762, | |
| "mean_token_accuracy": 0.6713952556252479, | |
| "num_tokens": 63232395.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.35850646207897896, | |
| "eval_loss": 1.2105367183685303, | |
| "eval_mean_token_accuracy": 0.6715540933609009, | |
| "eval_num_tokens": 63232395.0, | |
| "eval_runtime": 56.1985, | |
| "eval_samples_per_second": 7.118, | |
| "eval_steps_per_second": 0.89, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.36209152669976874, | |
| "grad_norm": 4.725315570831299, | |
| "learning_rate": 7.088910133843213e-05, | |
| "loss": 4.8547, | |
| "mean_token_accuracy": 0.6712683519721031, | |
| "num_tokens": 63865141.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.36209152669976874, | |
| "eval_loss": 1.2104063034057617, | |
| "eval_mean_token_accuracy": 0.6721407020092011, | |
| "eval_num_tokens": 63865141.0, | |
| "eval_runtime": 56.4947, | |
| "eval_samples_per_second": 7.08, | |
| "eval_steps_per_second": 0.885, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.3656765913205586, | |
| "grad_norm": 4.475533962249756, | |
| "learning_rate": 7.049075844486934e-05, | |
| "loss": 4.8331, | |
| "mean_token_accuracy": 0.6741529366374016, | |
| "num_tokens": 64498005.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.3656765913205586, | |
| "eval_loss": 1.211449146270752, | |
| "eval_mean_token_accuracy": 0.6718828630447388, | |
| "eval_num_tokens": 64498005.0, | |
| "eval_runtime": 56.1253, | |
| "eval_samples_per_second": 7.127, | |
| "eval_steps_per_second": 0.891, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.36926165594134835, | |
| "grad_norm": 4.43773078918457, | |
| "learning_rate": 7.009241555130657e-05, | |
| "loss": 4.8576, | |
| "mean_token_accuracy": 0.6735836458206177, | |
| "num_tokens": 65132925.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.36926165594134835, | |
| "eval_loss": 1.2107973098754883, | |
| "eval_mean_token_accuracy": 0.67238405585289, | |
| "eval_num_tokens": 65132925.0, | |
| "eval_runtime": 56.3575, | |
| "eval_samples_per_second": 7.098, | |
| "eval_steps_per_second": 0.887, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.37284672056213813, | |
| "grad_norm": 4.340308666229248, | |
| "learning_rate": 6.96940726577438e-05, | |
| "loss": 4.8199, | |
| "mean_token_accuracy": 0.6730007353425026, | |
| "num_tokens": 65763710.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.37284672056213813, | |
| "eval_loss": 1.2099605798721313, | |
| "eval_mean_token_accuracy": 0.6728368639945984, | |
| "eval_num_tokens": 65763710.0, | |
| "eval_runtime": 56.2959, | |
| "eval_samples_per_second": 7.105, | |
| "eval_steps_per_second": 0.888, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.3764317851829279, | |
| "grad_norm": 4.555109024047852, | |
| "learning_rate": 6.9295729764181e-05, | |
| "loss": 4.8905, | |
| "mean_token_accuracy": 0.6707694306969643, | |
| "num_tokens": 66395012.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.3764317851829279, | |
| "eval_loss": 1.2099387645721436, | |
| "eval_mean_token_accuracy": 0.6722373139858245, | |
| "eval_num_tokens": 66395012.0, | |
| "eval_runtime": 56.4644, | |
| "eval_samples_per_second": 7.084, | |
| "eval_steps_per_second": 0.886, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.3800168498037177, | |
| "grad_norm": 4.202060699462891, | |
| "learning_rate": 6.889738687061822e-05, | |
| "loss": 4.8292, | |
| "mean_token_accuracy": 0.673227034509182, | |
| "num_tokens": 67031872.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.3800168498037177, | |
| "eval_loss": 1.210257887840271, | |
| "eval_mean_token_accuracy": 0.6714678919315338, | |
| "eval_num_tokens": 67031872.0, | |
| "eval_runtime": 56.3175, | |
| "eval_samples_per_second": 7.103, | |
| "eval_steps_per_second": 0.888, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.38360191442450753, | |
| "grad_norm": 4.315623760223389, | |
| "learning_rate": 6.849904397705546e-05, | |
| "loss": 4.8465, | |
| "mean_token_accuracy": 0.672520759999752, | |
| "num_tokens": 67663971.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.38360191442450753, | |
| "eval_loss": 1.20899498462677, | |
| "eval_mean_token_accuracy": 0.6721968007087707, | |
| "eval_num_tokens": 67663971.0, | |
| "eval_runtime": 56.4163, | |
| "eval_samples_per_second": 7.09, | |
| "eval_steps_per_second": 0.886, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.3871869790452973, | |
| "grad_norm": 4.103718280792236, | |
| "learning_rate": 6.810070108349267e-05, | |
| "loss": 4.8568, | |
| "mean_token_accuracy": 0.6714790239930153, | |
| "num_tokens": 68298646.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.3871869790452973, | |
| "eval_loss": 1.2082393169403076, | |
| "eval_mean_token_accuracy": 0.6712930297851563, | |
| "eval_num_tokens": 68298646.0, | |
| "eval_runtime": 56.3666, | |
| "eval_samples_per_second": 7.096, | |
| "eval_steps_per_second": 0.887, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.3907720436660871, | |
| "grad_norm": 4.669826030731201, | |
| "learning_rate": 6.770235818992989e-05, | |
| "loss": 4.8767, | |
| "mean_token_accuracy": 0.6715716090798378, | |
| "num_tokens": 68934666.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.3907720436660871, | |
| "eval_loss": 1.207878589630127, | |
| "eval_mean_token_accuracy": 0.672472620010376, | |
| "eval_num_tokens": 68934666.0, | |
| "eval_runtime": 56.6965, | |
| "eval_samples_per_second": 7.055, | |
| "eval_steps_per_second": 0.882, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.39435710828687687, | |
| "grad_norm": 4.467480659484863, | |
| "learning_rate": 6.730401529636712e-05, | |
| "loss": 4.8558, | |
| "mean_token_accuracy": 0.6721174070239067, | |
| "num_tokens": 69569216.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.39435710828687687, | |
| "eval_loss": 1.207086205482483, | |
| "eval_mean_token_accuracy": 0.6721065282821655, | |
| "eval_num_tokens": 69569216.0, | |
| "eval_runtime": 56.3201, | |
| "eval_samples_per_second": 7.102, | |
| "eval_steps_per_second": 0.888, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.39794217290766665, | |
| "grad_norm": 4.608986854553223, | |
| "learning_rate": 6.690567240280435e-05, | |
| "loss": 4.8658, | |
| "mean_token_accuracy": 0.6705604410171508, | |
| "num_tokens": 70207379.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.39794217290766665, | |
| "eval_loss": 1.2067745923995972, | |
| "eval_mean_token_accuracy": 0.6716193425655365, | |
| "eval_num_tokens": 70207379.0, | |
| "eval_runtime": 57.4805, | |
| "eval_samples_per_second": 6.959, | |
| "eval_steps_per_second": 0.87, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.4015272375284564, | |
| "grad_norm": 4.4026780128479, | |
| "learning_rate": 6.650732950924155e-05, | |
| "loss": 4.7928, | |
| "mean_token_accuracy": 0.6765011212229729, | |
| "num_tokens": 70836399.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.4015272375284564, | |
| "eval_loss": 1.2075951099395752, | |
| "eval_mean_token_accuracy": 0.6730300402641296, | |
| "eval_num_tokens": 70836399.0, | |
| "eval_runtime": 58.1991, | |
| "eval_samples_per_second": 6.873, | |
| "eval_steps_per_second": 0.859, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.40511230214924626, | |
| "grad_norm": 4.3206048011779785, | |
| "learning_rate": 6.610898661567877e-05, | |
| "loss": 4.7828, | |
| "mean_token_accuracy": 0.6755007293820381, | |
| "num_tokens": 71465978.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.40511230214924626, | |
| "eval_loss": 1.2060260772705078, | |
| "eval_mean_token_accuracy": 0.6736092364788056, | |
| "eval_num_tokens": 71465978.0, | |
| "eval_runtime": 55.9656, | |
| "eval_samples_per_second": 7.147, | |
| "eval_steps_per_second": 0.893, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.40869736677003604, | |
| "grad_norm": 4.6384196281433105, | |
| "learning_rate": 6.571064372211601e-05, | |
| "loss": 4.8045, | |
| "mean_token_accuracy": 0.6742757317423821, | |
| "num_tokens": 72094960.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.40869736677003604, | |
| "eval_loss": 1.2062655687332153, | |
| "eval_mean_token_accuracy": 0.6727207219600677, | |
| "eval_num_tokens": 72094960.0, | |
| "eval_runtime": 56.5884, | |
| "eval_samples_per_second": 7.069, | |
| "eval_steps_per_second": 0.884, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.4122824313908258, | |
| "grad_norm": 4.51801872253418, | |
| "learning_rate": 6.531230082855322e-05, | |
| "loss": 4.8502, | |
| "mean_token_accuracy": 0.6714598840475082, | |
| "num_tokens": 72728939.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.4122824313908258, | |
| "eval_loss": 1.2066096067428589, | |
| "eval_mean_token_accuracy": 0.6731921648979187, | |
| "eval_num_tokens": 72728939.0, | |
| "eval_runtime": 56.4183, | |
| "eval_samples_per_second": 7.09, | |
| "eval_steps_per_second": 0.886, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.4158674960116156, | |
| "grad_norm": 4.803595066070557, | |
| "learning_rate": 6.491395793499044e-05, | |
| "loss": 4.863, | |
| "mean_token_accuracy": 0.6716452211141586, | |
| "num_tokens": 73363737.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.4158674960116156, | |
| "eval_loss": 1.2050178050994873, | |
| "eval_mean_token_accuracy": 0.6734542024135589, | |
| "eval_num_tokens": 73363737.0, | |
| "eval_runtime": 56.4329, | |
| "eval_samples_per_second": 7.088, | |
| "eval_steps_per_second": 0.886, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.4194525606324054, | |
| "grad_norm": 4.864405155181885, | |
| "learning_rate": 6.451561504142767e-05, | |
| "loss": 4.82, | |
| "mean_token_accuracy": 0.6745044487714768, | |
| "num_tokens": 73997522.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.4194525606324054, | |
| "eval_loss": 1.2039889097213745, | |
| "eval_mean_token_accuracy": 0.6733579516410828, | |
| "eval_num_tokens": 73997522.0, | |
| "eval_runtime": 56.3966, | |
| "eval_samples_per_second": 7.093, | |
| "eval_steps_per_second": 0.887, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.4230376252531952, | |
| "grad_norm": 4.559540271759033, | |
| "learning_rate": 6.411727214786488e-05, | |
| "loss": 4.7735, | |
| "mean_token_accuracy": 0.6764472410082817, | |
| "num_tokens": 74632565.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.4230376252531952, | |
| "eval_loss": 1.2041822671890259, | |
| "eval_mean_token_accuracy": 0.6725377225875855, | |
| "eval_num_tokens": 74632565.0, | |
| "eval_runtime": 56.2069, | |
| "eval_samples_per_second": 7.117, | |
| "eval_steps_per_second": 0.89, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.426622689873985, | |
| "grad_norm": 4.625767230987549, | |
| "learning_rate": 6.37189292543021e-05, | |
| "loss": 4.8474, | |
| "mean_token_accuracy": 0.6734576171636582, | |
| "num_tokens": 75264457.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.426622689873985, | |
| "eval_loss": 1.2035109996795654, | |
| "eval_mean_token_accuracy": 0.6731998026371002, | |
| "eval_num_tokens": 75264457.0, | |
| "eval_runtime": 56.7209, | |
| "eval_samples_per_second": 7.052, | |
| "eval_steps_per_second": 0.882, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.4302077544947748, | |
| "grad_norm": 4.185346603393555, | |
| "learning_rate": 6.332058636073932e-05, | |
| "loss": 4.8327, | |
| "mean_token_accuracy": 0.6721743106842041, | |
| "num_tokens": 75893311.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.4302077544947748, | |
| "eval_loss": 1.203436255455017, | |
| "eval_mean_token_accuracy": 0.6730928170681, | |
| "eval_num_tokens": 75893311.0, | |
| "eval_runtime": 56.3504, | |
| "eval_samples_per_second": 7.098, | |
| "eval_steps_per_second": 0.887, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.43379281911556455, | |
| "grad_norm": 4.341583251953125, | |
| "learning_rate": 6.292224346717655e-05, | |
| "loss": 4.8072, | |
| "mean_token_accuracy": 0.6749084493517876, | |
| "num_tokens": 76529617.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.43379281911556455, | |
| "eval_loss": 1.2034169435501099, | |
| "eval_mean_token_accuracy": 0.6731595695018768, | |
| "eval_num_tokens": 76529617.0, | |
| "eval_runtime": 56.4332, | |
| "eval_samples_per_second": 7.088, | |
| "eval_steps_per_second": 0.886, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.43737788373635433, | |
| "grad_norm": 4.502080917358398, | |
| "learning_rate": 6.252390057361377e-05, | |
| "loss": 4.7284, | |
| "mean_token_accuracy": 0.6789399805665016, | |
| "num_tokens": 77158901.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.43737788373635433, | |
| "eval_loss": 1.2037384510040283, | |
| "eval_mean_token_accuracy": 0.6724783575534821, | |
| "eval_num_tokens": 77158901.0, | |
| "eval_runtime": 56.1762, | |
| "eval_samples_per_second": 7.12, | |
| "eval_steps_per_second": 0.89, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.4409629483571441, | |
| "grad_norm": 4.407749652862549, | |
| "learning_rate": 6.212555768005099e-05, | |
| "loss": 4.8102, | |
| "mean_token_accuracy": 0.6749192690849304, | |
| "num_tokens": 77792929.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.4409629483571441, | |
| "eval_loss": 1.2034553289413452, | |
| "eval_mean_token_accuracy": 0.6724519121646881, | |
| "eval_num_tokens": 77792929.0, | |
| "eval_runtime": 56.3346, | |
| "eval_samples_per_second": 7.1, | |
| "eval_steps_per_second": 0.888, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.44454801297793395, | |
| "grad_norm": 4.5488362312316895, | |
| "learning_rate": 6.172721478648821e-05, | |
| "loss": 4.8424, | |
| "mean_token_accuracy": 0.6731199064850807, | |
| "num_tokens": 78426748.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.44454801297793395, | |
| "eval_loss": 1.2025480270385742, | |
| "eval_mean_token_accuracy": 0.673497976064682, | |
| "eval_num_tokens": 78426748.0, | |
| "eval_runtime": 56.1462, | |
| "eval_samples_per_second": 7.124, | |
| "eval_steps_per_second": 0.891, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.44813307759872373, | |
| "grad_norm": 4.52962589263916, | |
| "learning_rate": 6.132887189292543e-05, | |
| "loss": 4.7814, | |
| "mean_token_accuracy": 0.6756982815265655, | |
| "num_tokens": 79054859.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.44813307759872373, | |
| "eval_loss": 1.2015492916107178, | |
| "eval_mean_token_accuracy": 0.6740836083889008, | |
| "eval_num_tokens": 79054859.0, | |
| "eval_runtime": 56.3541, | |
| "eval_samples_per_second": 7.098, | |
| "eval_steps_per_second": 0.887, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.4517181422195135, | |
| "grad_norm": 4.603536128997803, | |
| "learning_rate": 6.093052899936266e-05, | |
| "loss": 4.9195, | |
| "mean_token_accuracy": 0.6704733854532242, | |
| "num_tokens": 79688857.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.4517181422195135, | |
| "eval_loss": 1.2020344734191895, | |
| "eval_mean_token_accuracy": 0.6741014468669891, | |
| "eval_num_tokens": 79688857.0, | |
| "eval_runtime": 56.114, | |
| "eval_samples_per_second": 7.128, | |
| "eval_steps_per_second": 0.891, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.4553032068403033, | |
| "grad_norm": 5.02667236328125, | |
| "learning_rate": 6.053218610579987e-05, | |
| "loss": 4.7774, | |
| "mean_token_accuracy": 0.6772465297579765, | |
| "num_tokens": 80324585.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.4553032068403033, | |
| "eval_loss": 1.201953411102295, | |
| "eval_mean_token_accuracy": 0.6730434691905975, | |
| "eval_num_tokens": 80324585.0, | |
| "eval_runtime": 56.1284, | |
| "eval_samples_per_second": 7.127, | |
| "eval_steps_per_second": 0.891, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.45888827146109307, | |
| "grad_norm": 4.330198764801025, | |
| "learning_rate": 6.0133843212237096e-05, | |
| "loss": 4.8002, | |
| "mean_token_accuracy": 0.6744606778025627, | |
| "num_tokens": 80956366.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.45888827146109307, | |
| "eval_loss": 1.2010780572891235, | |
| "eval_mean_token_accuracy": 0.6726482355594635, | |
| "eval_num_tokens": 80956366.0, | |
| "eval_runtime": 56.1261, | |
| "eval_samples_per_second": 7.127, | |
| "eval_steps_per_second": 0.891, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.4624733360818829, | |
| "grad_norm": 4.510508060455322, | |
| "learning_rate": 5.973550031867432e-05, | |
| "loss": 4.8791, | |
| "mean_token_accuracy": 0.6714214497804641, | |
| "num_tokens": 81587826.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.4624733360818829, | |
| "eval_loss": 1.2023468017578125, | |
| "eval_mean_token_accuracy": 0.672912814617157, | |
| "eval_num_tokens": 81587826.0, | |
| "eval_runtime": 56.3171, | |
| "eval_samples_per_second": 7.103, | |
| "eval_steps_per_second": 0.888, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.4660584007026727, | |
| "grad_norm": 4.60286283493042, | |
| "learning_rate": 5.933715742511153e-05, | |
| "loss": 4.7999, | |
| "mean_token_accuracy": 0.6754237455129624, | |
| "num_tokens": 82222966.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.4660584007026727, | |
| "eval_loss": 1.2008494138717651, | |
| "eval_mean_token_accuracy": 0.6732868099212647, | |
| "eval_num_tokens": 82222966.0, | |
| "eval_runtime": 56.2084, | |
| "eval_samples_per_second": 7.116, | |
| "eval_steps_per_second": 0.89, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.46964346532346246, | |
| "grad_norm": 4.842785835266113, | |
| "learning_rate": 5.893881453154876e-05, | |
| "loss": 4.8096, | |
| "mean_token_accuracy": 0.6758663612604141, | |
| "num_tokens": 82855504.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.46964346532346246, | |
| "eval_loss": 1.200462818145752, | |
| "eval_mean_token_accuracy": 0.6739160513877869, | |
| "eval_num_tokens": 82855504.0, | |
| "eval_runtime": 56.2921, | |
| "eval_samples_per_second": 7.106, | |
| "eval_steps_per_second": 0.888, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.47322852994425224, | |
| "grad_norm": 4.244312763214111, | |
| "learning_rate": 5.854047163798598e-05, | |
| "loss": 4.773, | |
| "mean_token_accuracy": 0.6780867150425911, | |
| "num_tokens": 83488202.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.47322852994425224, | |
| "eval_loss": 1.200437068939209, | |
| "eval_mean_token_accuracy": 0.6741013741493225, | |
| "eval_num_tokens": 83488202.0, | |
| "eval_runtime": 56.4321, | |
| "eval_samples_per_second": 7.088, | |
| "eval_steps_per_second": 0.886, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.476813594565042, | |
| "grad_norm": 4.384121894836426, | |
| "learning_rate": 5.814212874442321e-05, | |
| "loss": 4.8256, | |
| "mean_token_accuracy": 0.6735525381565094, | |
| "num_tokens": 84123092.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.476813594565042, | |
| "eval_loss": 1.2007168531417847, | |
| "eval_mean_token_accuracy": 0.6743572854995727, | |
| "eval_num_tokens": 84123092.0, | |
| "eval_runtime": 56.5132, | |
| "eval_samples_per_second": 7.078, | |
| "eval_steps_per_second": 0.885, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.4803986591858318, | |
| "grad_norm": 5.510925769805908, | |
| "learning_rate": 5.774378585086042e-05, | |
| "loss": 4.7806, | |
| "mean_token_accuracy": 0.6750581926107406, | |
| "num_tokens": 84756126.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.4803986591858318, | |
| "eval_loss": 1.2009855508804321, | |
| "eval_mean_token_accuracy": 0.6739739573001862, | |
| "eval_num_tokens": 84756126.0, | |
| "eval_runtime": 56.6366, | |
| "eval_samples_per_second": 7.063, | |
| "eval_steps_per_second": 0.883, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.48398372380662164, | |
| "grad_norm": 4.581708908081055, | |
| "learning_rate": 5.7345442957297646e-05, | |
| "loss": 4.722, | |
| "mean_token_accuracy": 0.679350274503231, | |
| "num_tokens": 85386870.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.48398372380662164, | |
| "eval_loss": 1.2008088827133179, | |
| "eval_mean_token_accuracy": 0.6728281593322754, | |
| "eval_num_tokens": 85386870.0, | |
| "eval_runtime": 56.4337, | |
| "eval_samples_per_second": 7.088, | |
| "eval_steps_per_second": 0.886, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.4875687884274114, | |
| "grad_norm": 5.783533573150635, | |
| "learning_rate": 5.694710006373487e-05, | |
| "loss": 4.7616, | |
| "mean_token_accuracy": 0.6766093501448631, | |
| "num_tokens": 86015238.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.4875687884274114, | |
| "eval_loss": 1.2004883289337158, | |
| "eval_mean_token_accuracy": 0.6737746036052704, | |
| "eval_num_tokens": 86015238.0, | |
| "eval_runtime": 56.202, | |
| "eval_samples_per_second": 7.117, | |
| "eval_steps_per_second": 0.89, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.4911538530482012, | |
| "grad_norm": 4.4624714851379395, | |
| "learning_rate": 5.654875717017208e-05, | |
| "loss": 4.8036, | |
| "mean_token_accuracy": 0.6753204807639122, | |
| "num_tokens": 86646920.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.4911538530482012, | |
| "eval_loss": 1.1992673873901367, | |
| "eval_mean_token_accuracy": 0.6739728832244873, | |
| "eval_num_tokens": 86646920.0, | |
| "eval_runtime": 56.3145, | |
| "eval_samples_per_second": 7.103, | |
| "eval_steps_per_second": 0.888, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.494738917668991, | |
| "grad_norm": 4.528706073760986, | |
| "learning_rate": 5.615041427660931e-05, | |
| "loss": 4.8219, | |
| "mean_token_accuracy": 0.673625990152359, | |
| "num_tokens": 87279245.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.494738917668991, | |
| "eval_loss": 1.1988134384155273, | |
| "eval_mean_token_accuracy": 0.673334904909134, | |
| "eval_num_tokens": 87279245.0, | |
| "eval_runtime": 56.4107, | |
| "eval_samples_per_second": 7.091, | |
| "eval_steps_per_second": 0.886, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.49832398228978075, | |
| "grad_norm": 4.4395527839660645, | |
| "learning_rate": 5.575207138304653e-05, | |
| "loss": 4.8404, | |
| "mean_token_accuracy": 0.6731960904598236, | |
| "num_tokens": 87908715.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.49832398228978075, | |
| "eval_loss": 1.198786973953247, | |
| "eval_mean_token_accuracy": 0.6738696718215942, | |
| "eval_num_tokens": 87908715.0, | |
| "eval_runtime": 56.4217, | |
| "eval_samples_per_second": 7.089, | |
| "eval_steps_per_second": 0.886, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.5019090469105706, | |
| "grad_norm": 4.999813079833984, | |
| "learning_rate": 5.535372848948375e-05, | |
| "loss": 4.8014, | |
| "mean_token_accuracy": 0.6743469536304474, | |
| "num_tokens": 88541353.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.5019090469105706, | |
| "eval_loss": 1.1984182596206665, | |
| "eval_mean_token_accuracy": 0.6738111090660095, | |
| "eval_num_tokens": 88541353.0, | |
| "eval_runtime": 56.269, | |
| "eval_samples_per_second": 7.109, | |
| "eval_steps_per_second": 0.889, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.5054941115313604, | |
| "grad_norm": 5.244815826416016, | |
| "learning_rate": 5.4955385595920975e-05, | |
| "loss": 4.7345, | |
| "mean_token_accuracy": 0.6794330298900604, | |
| "num_tokens": 89171565.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.5054941115313604, | |
| "eval_loss": 1.1977105140686035, | |
| "eval_mean_token_accuracy": 0.6732430410385132, | |
| "eval_num_tokens": 89171565.0, | |
| "eval_runtime": 56.2208, | |
| "eval_samples_per_second": 7.115, | |
| "eval_steps_per_second": 0.889, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.5090791761521501, | |
| "grad_norm": 4.170567512512207, | |
| "learning_rate": 5.4557042702358196e-05, | |
| "loss": 4.7721, | |
| "mean_token_accuracy": 0.6758232372999191, | |
| "num_tokens": 89803955.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.5090791761521501, | |
| "eval_loss": 1.1980831623077393, | |
| "eval_mean_token_accuracy": 0.6739831912517548, | |
| "eval_num_tokens": 89803955.0, | |
| "eval_runtime": 56.1966, | |
| "eval_samples_per_second": 7.118, | |
| "eval_steps_per_second": 0.89, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.5126642407729399, | |
| "grad_norm": 4.576419830322266, | |
| "learning_rate": 5.415869980879541e-05, | |
| "loss": 4.7588, | |
| "mean_token_accuracy": 0.6762737995386123, | |
| "num_tokens": 90440293.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.5126642407729399, | |
| "eval_loss": 1.1975429058074951, | |
| "eval_mean_token_accuracy": 0.673925119638443, | |
| "eval_num_tokens": 90440293.0, | |
| "eval_runtime": 56.1988, | |
| "eval_samples_per_second": 7.118, | |
| "eval_steps_per_second": 0.89, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.5162493053937297, | |
| "grad_norm": 4.430201530456543, | |
| "learning_rate": 5.376035691523263e-05, | |
| "loss": 4.7482, | |
| "mean_token_accuracy": 0.6770784831047059, | |
| "num_tokens": 91071574.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.5162493053937297, | |
| "eval_loss": 1.1972256898880005, | |
| "eval_mean_token_accuracy": 0.6727135396003723, | |
| "eval_num_tokens": 91071574.0, | |
| "eval_runtime": 56.2879, | |
| "eval_samples_per_second": 7.106, | |
| "eval_steps_per_second": 0.888, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.5198343700145195, | |
| "grad_norm": 4.440696716308594, | |
| "learning_rate": 5.336201402166986e-05, | |
| "loss": 4.8496, | |
| "mean_token_accuracy": 0.6734337306022644, | |
| "num_tokens": 91704308.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.5198343700145195, | |
| "eval_loss": 1.196380853652954, | |
| "eval_mean_token_accuracy": 0.674187605381012, | |
| "eval_num_tokens": 91704308.0, | |
| "eval_runtime": 56.1737, | |
| "eval_samples_per_second": 7.121, | |
| "eval_steps_per_second": 0.89, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.5234194346353093, | |
| "grad_norm": 4.427169322967529, | |
| "learning_rate": 5.2963671128107075e-05, | |
| "loss": 4.7884, | |
| "mean_token_accuracy": 0.6760821756720543, | |
| "num_tokens": 92338569.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.5234194346353093, | |
| "eval_loss": 1.1962813138961792, | |
| "eval_mean_token_accuracy": 0.6740961968898773, | |
| "eval_num_tokens": 92338569.0, | |
| "eval_runtime": 56.279, | |
| "eval_samples_per_second": 7.107, | |
| "eval_steps_per_second": 0.888, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.527004499256099, | |
| "grad_norm": 4.586068630218506, | |
| "learning_rate": 5.25653282345443e-05, | |
| "loss": 4.8462, | |
| "mean_token_accuracy": 0.6725165358185768, | |
| "num_tokens": 92970201.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.527004499256099, | |
| "eval_loss": 1.1966549158096313, | |
| "eval_mean_token_accuracy": 0.6744921112060547, | |
| "eval_num_tokens": 92970201.0, | |
| "eval_runtime": 56.2579, | |
| "eval_samples_per_second": 7.11, | |
| "eval_steps_per_second": 0.889, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.5305895638768888, | |
| "grad_norm": 4.275878429412842, | |
| "learning_rate": 5.2166985340981525e-05, | |
| "loss": 4.7573, | |
| "mean_token_accuracy": 0.6764388364553452, | |
| "num_tokens": 93604624.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.5305895638768888, | |
| "eval_loss": 1.1963127851486206, | |
| "eval_mean_token_accuracy": 0.6738464891910553, | |
| "eval_num_tokens": 93604624.0, | |
| "eval_runtime": 56.2885, | |
| "eval_samples_per_second": 7.106, | |
| "eval_steps_per_second": 0.888, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.5341746284976787, | |
| "grad_norm": 4.383382797241211, | |
| "learning_rate": 5.176864244741873e-05, | |
| "loss": 4.8151, | |
| "mean_token_accuracy": 0.6745539313554764, | |
| "num_tokens": 94237768.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.5341746284976787, | |
| "eval_loss": 1.196314811706543, | |
| "eval_mean_token_accuracy": 0.6741366982460022, | |
| "eval_num_tokens": 94237768.0, | |
| "eval_runtime": 56.1682, | |
| "eval_samples_per_second": 7.121, | |
| "eval_steps_per_second": 0.89, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.5377596931184685, | |
| "grad_norm": 4.777865409851074, | |
| "learning_rate": 5.137029955385596e-05, | |
| "loss": 4.7179, | |
| "mean_token_accuracy": 0.6791237652301788, | |
| "num_tokens": 94868917.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5377596931184685, | |
| "eval_loss": 1.1951854228973389, | |
| "eval_mean_token_accuracy": 0.6737760400772095, | |
| "eval_num_tokens": 94868917.0, | |
| "eval_runtime": 57.4744, | |
| "eval_samples_per_second": 6.96, | |
| "eval_steps_per_second": 0.87, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5413447577392583, | |
| "grad_norm": 5.250718116760254, | |
| "learning_rate": 5.097195666029318e-05, | |
| "loss": 4.853, | |
| "mean_token_accuracy": 0.671261510848999, | |
| "num_tokens": 95504086.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.5413447577392583, | |
| "eval_loss": 1.1954213380813599, | |
| "eval_mean_token_accuracy": 0.6752165842056275, | |
| "eval_num_tokens": 95504086.0, | |
| "eval_runtime": 56.6697, | |
| "eval_samples_per_second": 7.058, | |
| "eval_steps_per_second": 0.882, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.544929822360048, | |
| "grad_norm": 4.873703479766846, | |
| "learning_rate": 5.05736137667304e-05, | |
| "loss": 4.7717, | |
| "mean_token_accuracy": 0.6773542383313179, | |
| "num_tokens": 96139635.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.544929822360048, | |
| "eval_loss": 1.1948680877685547, | |
| "eval_mean_token_accuracy": 0.6749390983581542, | |
| "eval_num_tokens": 96139635.0, | |
| "eval_runtime": 55.7291, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.5485148869808378, | |
| "grad_norm": 4.877697467803955, | |
| "learning_rate": 5.0175270873167626e-05, | |
| "loss": 4.7886, | |
| "mean_token_accuracy": 0.6762890338897705, | |
| "num_tokens": 96770646.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.5485148869808378, | |
| "eval_loss": 1.1947038173675537, | |
| "eval_mean_token_accuracy": 0.6754334461688996, | |
| "eval_num_tokens": 96770646.0, | |
| "eval_runtime": 55.5575, | |
| "eval_samples_per_second": 7.2, | |
| "eval_steps_per_second": 0.9, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.5520999516016276, | |
| "grad_norm": 4.1052117347717285, | |
| "learning_rate": 4.977692797960485e-05, | |
| "loss": 4.8169, | |
| "mean_token_accuracy": 0.6731060117483139, | |
| "num_tokens": 97404492.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.5520999516016276, | |
| "eval_loss": 1.1950753927230835, | |
| "eval_mean_token_accuracy": 0.6745660018920898, | |
| "eval_num_tokens": 97404492.0, | |
| "eval_runtime": 55.0653, | |
| "eval_samples_per_second": 7.264, | |
| "eval_steps_per_second": 0.908, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.5556850162224174, | |
| "grad_norm": 4.796311855316162, | |
| "learning_rate": 4.937858508604207e-05, | |
| "loss": 4.7797, | |
| "mean_token_accuracy": 0.6754096934199333, | |
| "num_tokens": 98040499.0, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.5556850162224174, | |
| "eval_loss": 1.1943681240081787, | |
| "eval_mean_token_accuracy": 0.6758341288566589, | |
| "eval_num_tokens": 98040499.0, | |
| "eval_runtime": 55.0042, | |
| "eval_samples_per_second": 7.272, | |
| "eval_steps_per_second": 0.909, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.5592700808432072, | |
| "grad_norm": 4.996248722076416, | |
| "learning_rate": 4.898024219247929e-05, | |
| "loss": 4.7915, | |
| "mean_token_accuracy": 0.6766231226921081, | |
| "num_tokens": 98666526.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.5592700808432072, | |
| "eval_loss": 1.1935983896255493, | |
| "eval_mean_token_accuracy": 0.6747439002990723, | |
| "eval_num_tokens": 98666526.0, | |
| "eval_runtime": 55.0204, | |
| "eval_samples_per_second": 7.27, | |
| "eval_steps_per_second": 0.909, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.562855145463997, | |
| "grad_norm": 4.6099534034729, | |
| "learning_rate": 4.858189929891651e-05, | |
| "loss": 4.7901, | |
| "mean_token_accuracy": 0.6749289181828498, | |
| "num_tokens": 99297640.0, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.562855145463997, | |
| "eval_loss": 1.1944231986999512, | |
| "eval_mean_token_accuracy": 0.6743023383617401, | |
| "eval_num_tokens": 99297640.0, | |
| "eval_runtime": 55.1784, | |
| "eval_samples_per_second": 7.249, | |
| "eval_steps_per_second": 0.906, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.5664402100847867, | |
| "grad_norm": 4.517291069030762, | |
| "learning_rate": 4.818355640535373e-05, | |
| "loss": 4.8626, | |
| "mean_token_accuracy": 0.6725256371498108, | |
| "num_tokens": 99928477.0, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.5664402100847867, | |
| "eval_loss": 1.1935796737670898, | |
| "eval_mean_token_accuracy": 0.6745834064483642, | |
| "eval_num_tokens": 99928477.0, | |
| "eval_runtime": 55.0686, | |
| "eval_samples_per_second": 7.264, | |
| "eval_steps_per_second": 0.908, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.5700252747055766, | |
| "grad_norm": 4.5024003982543945, | |
| "learning_rate": 4.778521351179095e-05, | |
| "loss": 4.712, | |
| "mean_token_accuracy": 0.6803113195300102, | |
| "num_tokens": 100557231.0, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.5700252747055766, | |
| "eval_loss": 1.1945058107376099, | |
| "eval_mean_token_accuracy": 0.6740881907939911, | |
| "eval_num_tokens": 100557231.0, | |
| "eval_runtime": 54.9762, | |
| "eval_samples_per_second": 7.276, | |
| "eval_steps_per_second": 0.909, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.5736103393263664, | |
| "grad_norm": 4.741750717163086, | |
| "learning_rate": 4.7386870618228176e-05, | |
| "loss": 4.7828, | |
| "mean_token_accuracy": 0.6768678402900696, | |
| "num_tokens": 101189608.0, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5736103393263664, | |
| "eval_loss": 1.1939120292663574, | |
| "eval_mean_token_accuracy": 0.6754596734046936, | |
| "eval_num_tokens": 101189608.0, | |
| "eval_runtime": 55.073, | |
| "eval_samples_per_second": 7.263, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5771954039471562, | |
| "grad_norm": 4.49591064453125, | |
| "learning_rate": 4.698852772466539e-05, | |
| "loss": 4.8169, | |
| "mean_token_accuracy": 0.6742269179224968, | |
| "num_tokens": 101822322.0, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.5771954039471562, | |
| "eval_loss": 1.1937360763549805, | |
| "eval_mean_token_accuracy": 0.6749664378166199, | |
| "eval_num_tokens": 101822322.0, | |
| "eval_runtime": 55.0891, | |
| "eval_samples_per_second": 7.261, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.580780468567946, | |
| "grad_norm": 5.148952007293701, | |
| "learning_rate": 4.659018483110262e-05, | |
| "loss": 4.7721, | |
| "mean_token_accuracy": 0.6771083778142929, | |
| "num_tokens": 102453524.0, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.580780468567946, | |
| "eval_loss": 1.194778323173523, | |
| "eval_mean_token_accuracy": 0.6745067381858826, | |
| "eval_num_tokens": 102453524.0, | |
| "eval_runtime": 55.1207, | |
| "eval_samples_per_second": 7.257, | |
| "eval_steps_per_second": 0.907, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.5843655331887357, | |
| "grad_norm": 5.115074634552002, | |
| "learning_rate": 4.619184193753984e-05, | |
| "loss": 4.7134, | |
| "mean_token_accuracy": 0.6806976914405822, | |
| "num_tokens": 103083835.0, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.5843655331887357, | |
| "eval_loss": 1.192982792854309, | |
| "eval_mean_token_accuracy": 0.6747034168243409, | |
| "eval_num_tokens": 103083835.0, | |
| "eval_runtime": 55.0852, | |
| "eval_samples_per_second": 7.261, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.5879505978095255, | |
| "grad_norm": 4.619081497192383, | |
| "learning_rate": 4.5793499043977055e-05, | |
| "loss": 4.796, | |
| "mean_token_accuracy": 0.6744829830527306, | |
| "num_tokens": 103715932.0, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.5879505978095255, | |
| "eval_loss": 1.192581057548523, | |
| "eval_mean_token_accuracy": 0.6748893690109253, | |
| "eval_num_tokens": 103715932.0, | |
| "eval_runtime": 55.0423, | |
| "eval_samples_per_second": 7.267, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.5915356624303153, | |
| "grad_norm": 4.433931350708008, | |
| "learning_rate": 4.539515615041428e-05, | |
| "loss": 4.7435, | |
| "mean_token_accuracy": 0.678233249783516, | |
| "num_tokens": 104345485.0, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.5915356624303153, | |
| "eval_loss": 1.1933448314666748, | |
| "eval_mean_token_accuracy": 0.6751340889930725, | |
| "eval_num_tokens": 104345485.0, | |
| "eval_runtime": 55.0423, | |
| "eval_samples_per_second": 7.267, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.5951207270511051, | |
| "grad_norm": 4.362198829650879, | |
| "learning_rate": 4.49968132568515e-05, | |
| "loss": 4.7737, | |
| "mean_token_accuracy": 0.6764271047711372, | |
| "num_tokens": 104978991.0, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.5951207270511051, | |
| "eval_loss": 1.1930105686187744, | |
| "eval_mean_token_accuracy": 0.6747807443141938, | |
| "eval_num_tokens": 104978991.0, | |
| "eval_runtime": 55.3077, | |
| "eval_samples_per_second": 7.232, | |
| "eval_steps_per_second": 0.904, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.5987057916718949, | |
| "grad_norm": 4.534180641174316, | |
| "learning_rate": 4.459847036328872e-05, | |
| "loss": 4.7728, | |
| "mean_token_accuracy": 0.677936093211174, | |
| "num_tokens": 105610427.0, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.5987057916718949, | |
| "eval_loss": 1.1923025846481323, | |
| "eval_mean_token_accuracy": 0.6750785481929779, | |
| "eval_num_tokens": 105610427.0, | |
| "eval_runtime": 55.0617, | |
| "eval_samples_per_second": 7.265, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.6022908562926846, | |
| "grad_norm": 5.027590274810791, | |
| "learning_rate": 4.420012746972594e-05, | |
| "loss": 4.7192, | |
| "mean_token_accuracy": 0.6799935781955719, | |
| "num_tokens": 106243018.0, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.6022908562926846, | |
| "eval_loss": 1.1910535097122192, | |
| "eval_mean_token_accuracy": 0.6752017951011657, | |
| "eval_num_tokens": 106243018.0, | |
| "eval_runtime": 55.087, | |
| "eval_samples_per_second": 7.261, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.6058759209134744, | |
| "grad_norm": 4.658295154571533, | |
| "learning_rate": 4.380178457616316e-05, | |
| "loss": 4.806, | |
| "mean_token_accuracy": 0.6742839315533637, | |
| "num_tokens": 106877488.0, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.6058759209134744, | |
| "eval_loss": 1.1911369562149048, | |
| "eval_mean_token_accuracy": 0.6754417788982391, | |
| "eval_num_tokens": 106877488.0, | |
| "eval_runtime": 55.1243, | |
| "eval_samples_per_second": 7.256, | |
| "eval_steps_per_second": 0.907, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.6094609855342643, | |
| "grad_norm": 4.897305488586426, | |
| "learning_rate": 4.340344168260038e-05, | |
| "loss": 4.7673, | |
| "mean_token_accuracy": 0.6776413953304291, | |
| "num_tokens": 107510169.0, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6094609855342643, | |
| "eval_loss": 1.1910532712936401, | |
| "eval_mean_token_accuracy": 0.6744759595394134, | |
| "eval_num_tokens": 107510169.0, | |
| "eval_runtime": 55.0914, | |
| "eval_samples_per_second": 7.261, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6130460501550541, | |
| "grad_norm": 4.881381034851074, | |
| "learning_rate": 4.3005098789037605e-05, | |
| "loss": 4.8061, | |
| "mean_token_accuracy": 0.674516750574112, | |
| "num_tokens": 108145421.0, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.6130460501550541, | |
| "eval_loss": 1.190964937210083, | |
| "eval_mean_token_accuracy": 0.6752122223377228, | |
| "eval_num_tokens": 108145421.0, | |
| "eval_runtime": 54.985, | |
| "eval_samples_per_second": 7.275, | |
| "eval_steps_per_second": 0.909, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.6166311147758439, | |
| "grad_norm": 5.073390483856201, | |
| "learning_rate": 4.2606755895474826e-05, | |
| "loss": 4.7739, | |
| "mean_token_accuracy": 0.6762193894386291, | |
| "num_tokens": 108780841.0, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.6166311147758439, | |
| "eval_loss": 1.1907490491867065, | |
| "eval_mean_token_accuracy": 0.6748946511745453, | |
| "eval_num_tokens": 108780841.0, | |
| "eval_runtime": 54.9525, | |
| "eval_samples_per_second": 7.279, | |
| "eval_steps_per_second": 0.91, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.6202161793966336, | |
| "grad_norm": 4.459120750427246, | |
| "learning_rate": 4.220841300191205e-05, | |
| "loss": 4.7864, | |
| "mean_token_accuracy": 0.6758550813794136, | |
| "num_tokens": 109418807.0, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.6202161793966336, | |
| "eval_loss": 1.1910618543624878, | |
| "eval_mean_token_accuracy": 0.6749819540977477, | |
| "eval_num_tokens": 109418807.0, | |
| "eval_runtime": 54.9049, | |
| "eval_samples_per_second": 7.285, | |
| "eval_steps_per_second": 0.911, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.6238012440174234, | |
| "grad_norm": 4.40315055847168, | |
| "learning_rate": 4.181007010834927e-05, | |
| "loss": 4.7917, | |
| "mean_token_accuracy": 0.6742719665169716, | |
| "num_tokens": 110057516.0, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.6238012440174234, | |
| "eval_loss": 1.1901732683181763, | |
| "eval_mean_token_accuracy": 0.674855477809906, | |
| "eval_num_tokens": 110057516.0, | |
| "eval_runtime": 54.9882, | |
| "eval_samples_per_second": 7.274, | |
| "eval_steps_per_second": 0.909, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.6273863086382132, | |
| "grad_norm": 4.657465934753418, | |
| "learning_rate": 4.141172721478649e-05, | |
| "loss": 4.7503, | |
| "mean_token_accuracy": 0.6786279901862144, | |
| "num_tokens": 110689903.0, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.6273863086382132, | |
| "eval_loss": 1.190616488456726, | |
| "eval_mean_token_accuracy": 0.675126885175705, | |
| "eval_num_tokens": 110689903.0, | |
| "eval_runtime": 55.2291, | |
| "eval_samples_per_second": 7.243, | |
| "eval_steps_per_second": 0.905, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.630971373259003, | |
| "grad_norm": 4.282220840454102, | |
| "learning_rate": 4.101338432122371e-05, | |
| "loss": 4.7029, | |
| "mean_token_accuracy": 0.6801710060238838, | |
| "num_tokens": 111323016.0, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.630971373259003, | |
| "eval_loss": 1.1904511451721191, | |
| "eval_mean_token_accuracy": 0.6755478191375732, | |
| "eval_num_tokens": 111323016.0, | |
| "eval_runtime": 54.9632, | |
| "eval_samples_per_second": 7.278, | |
| "eval_steps_per_second": 0.91, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.6345564378797928, | |
| "grad_norm": 4.598837852478027, | |
| "learning_rate": 4.0615041427660933e-05, | |
| "loss": 4.7755, | |
| "mean_token_accuracy": 0.6768260210752487, | |
| "num_tokens": 111959283.0, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.6345564378797928, | |
| "eval_loss": 1.1904475688934326, | |
| "eval_mean_token_accuracy": 0.6753637742996216, | |
| "eval_num_tokens": 111959283.0, | |
| "eval_runtime": 55.0394, | |
| "eval_samples_per_second": 7.268, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.6381415025005825, | |
| "grad_norm": 4.1816558837890625, | |
| "learning_rate": 4.0216698534098155e-05, | |
| "loss": 4.7758, | |
| "mean_token_accuracy": 0.6754831087589264, | |
| "num_tokens": 112596409.0, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.6381415025005825, | |
| "eval_loss": 1.1898977756500244, | |
| "eval_mean_token_accuracy": 0.6755084788799286, | |
| "eval_num_tokens": 112596409.0, | |
| "eval_runtime": 55.0202, | |
| "eval_samples_per_second": 7.27, | |
| "eval_steps_per_second": 0.909, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.6417265671213723, | |
| "grad_norm": 4.973260402679443, | |
| "learning_rate": 3.9818355640535376e-05, | |
| "loss": 4.7116, | |
| "mean_token_accuracy": 0.6818169742822647, | |
| "num_tokens": 113224902.0, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.6417265671213723, | |
| "eval_loss": 1.1896042823791504, | |
| "eval_mean_token_accuracy": 0.6757630515098572, | |
| "eval_num_tokens": 113224902.0, | |
| "eval_runtime": 54.9621, | |
| "eval_samples_per_second": 7.278, | |
| "eval_steps_per_second": 0.91, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.6453116317421621, | |
| "grad_norm": 4.470012664794922, | |
| "learning_rate": 3.94200127469726e-05, | |
| "loss": 4.713, | |
| "mean_token_accuracy": 0.6787376815080642, | |
| "num_tokens": 113858275.0, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.6453116317421621, | |
| "eval_loss": 1.1898657083511353, | |
| "eval_mean_token_accuracy": 0.6760083436965942, | |
| "eval_num_tokens": 113858275.0, | |
| "eval_runtime": 54.9184, | |
| "eval_samples_per_second": 7.284, | |
| "eval_steps_per_second": 0.91, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.648896696362952, | |
| "grad_norm": 4.098659992218018, | |
| "learning_rate": 3.902166985340981e-05, | |
| "loss": 4.7423, | |
| "mean_token_accuracy": 0.6774056190252304, | |
| "num_tokens": 114495903.0, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.648896696362952, | |
| "eval_loss": 1.1897211074829102, | |
| "eval_mean_token_accuracy": 0.6754970908164978, | |
| "eval_num_tokens": 114495903.0, | |
| "eval_runtime": 55.043, | |
| "eval_samples_per_second": 7.267, | |
| "eval_steps_per_second": 0.908, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.6524817609837418, | |
| "grad_norm": 4.9181976318359375, | |
| "learning_rate": 3.862332695984704e-05, | |
| "loss": 4.7403, | |
| "mean_token_accuracy": 0.6782529127597808, | |
| "num_tokens": 115123519.0, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.6524817609837418, | |
| "eval_loss": 1.1898068189620972, | |
| "eval_mean_token_accuracy": 0.6745435571670533, | |
| "eval_num_tokens": 115123519.0, | |
| "eval_runtime": 54.9272, | |
| "eval_samples_per_second": 7.282, | |
| "eval_steps_per_second": 0.91, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.6560668256045316, | |
| "grad_norm": 4.978320121765137, | |
| "learning_rate": 3.8224984066284255e-05, | |
| "loss": 4.8028, | |
| "mean_token_accuracy": 0.675481299161911, | |
| "num_tokens": 115755644.0, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.6560668256045316, | |
| "eval_loss": 1.1891556978225708, | |
| "eval_mean_token_accuracy": 0.6756797277927399, | |
| "eval_num_tokens": 115755644.0, | |
| "eval_runtime": 55.005, | |
| "eval_samples_per_second": 7.272, | |
| "eval_steps_per_second": 0.909, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.6596518902253213, | |
| "grad_norm": 4.682608604431152, | |
| "learning_rate": 3.7826641172721484e-05, | |
| "loss": 4.8228, | |
| "mean_token_accuracy": 0.6748796856403351, | |
| "num_tokens": 116391334.0, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.6596518902253213, | |
| "eval_loss": 1.1887702941894531, | |
| "eval_mean_token_accuracy": 0.6753370201587677, | |
| "eval_num_tokens": 116391334.0, | |
| "eval_runtime": 54.9714, | |
| "eval_samples_per_second": 7.277, | |
| "eval_steps_per_second": 0.91, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.6632369548461111, | |
| "grad_norm": 4.45632791519165, | |
| "learning_rate": 3.7428298279158705e-05, | |
| "loss": 4.7473, | |
| "mean_token_accuracy": 0.6801807761192322, | |
| "num_tokens": 117026978.0, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.6632369548461111, | |
| "eval_loss": 1.1893665790557861, | |
| "eval_mean_token_accuracy": 0.6753548145294189, | |
| "eval_num_tokens": 117026978.0, | |
| "eval_runtime": 55.0208, | |
| "eval_samples_per_second": 7.27, | |
| "eval_steps_per_second": 0.909, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.6668220194669009, | |
| "grad_norm": 4.2916951179504395, | |
| "learning_rate": 3.702995538559592e-05, | |
| "loss": 4.7187, | |
| "mean_token_accuracy": 0.6798599645495415, | |
| "num_tokens": 117660612.0, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.6668220194669009, | |
| "eval_loss": 1.189585566520691, | |
| "eval_mean_token_accuracy": 0.6756225192546844, | |
| "eval_num_tokens": 117660612.0, | |
| "eval_runtime": 55.0296, | |
| "eval_samples_per_second": 7.269, | |
| "eval_steps_per_second": 0.909, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.6704070840876907, | |
| "grad_norm": 4.559842109680176, | |
| "learning_rate": 3.663161249203315e-05, | |
| "loss": 4.7029, | |
| "mean_token_accuracy": 0.6812646022439003, | |
| "num_tokens": 118290778.0, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.6704070840876907, | |
| "eval_loss": 1.1883878707885742, | |
| "eval_mean_token_accuracy": 0.6754980099201202, | |
| "eval_num_tokens": 118290778.0, | |
| "eval_runtime": 55.0845, | |
| "eval_samples_per_second": 7.262, | |
| "eval_steps_per_second": 0.908, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.6739921487084805, | |
| "grad_norm": 4.957666873931885, | |
| "learning_rate": 3.623326959847036e-05, | |
| "loss": 4.716, | |
| "mean_token_accuracy": 0.6789155259728432, | |
| "num_tokens": 118921441.0, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.6739921487084805, | |
| "eval_loss": 1.1882615089416504, | |
| "eval_mean_token_accuracy": 0.6757899785041809, | |
| "eval_num_tokens": 118921441.0, | |
| "eval_runtime": 55.0393, | |
| "eval_samples_per_second": 7.268, | |
| "eval_steps_per_second": 0.908, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.6775772133292702, | |
| "grad_norm": 4.460175037384033, | |
| "learning_rate": 3.5834926704907584e-05, | |
| "loss": 4.7439, | |
| "mean_token_accuracy": 0.6776839691400528, | |
| "num_tokens": 119549331.0, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.6775772133292702, | |
| "eval_loss": 1.1887913942337036, | |
| "eval_mean_token_accuracy": 0.6761759769916534, | |
| "eval_num_tokens": 119549331.0, | |
| "eval_runtime": 54.9213, | |
| "eval_samples_per_second": 7.283, | |
| "eval_steps_per_second": 0.91, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.68116227795006, | |
| "grad_norm": 4.3697638511657715, | |
| "learning_rate": 3.543658381134481e-05, | |
| "loss": 4.7796, | |
| "mean_token_accuracy": 0.6753658777475358, | |
| "num_tokens": 120180706.0, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.68116227795006, | |
| "eval_loss": 1.187656044960022, | |
| "eval_mean_token_accuracy": 0.6755701994895935, | |
| "eval_num_tokens": 120180706.0, | |
| "eval_runtime": 54.925, | |
| "eval_samples_per_second": 7.283, | |
| "eval_steps_per_second": 0.91, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.6847473425708498, | |
| "grad_norm": 4.676335334777832, | |
| "learning_rate": 3.503824091778203e-05, | |
| "loss": 4.8121, | |
| "mean_token_accuracy": 0.6740303432941437, | |
| "num_tokens": 120813928.0, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.6847473425708498, | |
| "eval_loss": 1.1875134706497192, | |
| "eval_mean_token_accuracy": 0.6751706182956696, | |
| "eval_num_tokens": 120813928.0, | |
| "eval_runtime": 55.1212, | |
| "eval_samples_per_second": 7.257, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.6883324071916397, | |
| "grad_norm": 5.17042875289917, | |
| "learning_rate": 3.463989802421925e-05, | |
| "loss": 4.7668, | |
| "mean_token_accuracy": 0.6758210748434067, | |
| "num_tokens": 121446792.0, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.6883324071916397, | |
| "eval_loss": 1.1872638463974, | |
| "eval_mean_token_accuracy": 0.6760274660587311, | |
| "eval_num_tokens": 121446792.0, | |
| "eval_runtime": 55.1534, | |
| "eval_samples_per_second": 7.252, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.6919174718124295, | |
| "grad_norm": 4.4179840087890625, | |
| "learning_rate": 3.424155513065647e-05, | |
| "loss": 4.7633, | |
| "mean_token_accuracy": 0.6759152534604073, | |
| "num_tokens": 122072069.0, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.6919174718124295, | |
| "eval_loss": 1.1880455017089844, | |
| "eval_mean_token_accuracy": 0.6756154441833496, | |
| "eval_num_tokens": 122072069.0, | |
| "eval_runtime": 55.0701, | |
| "eval_samples_per_second": 7.263, | |
| "eval_steps_per_second": 0.908, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.6955025364332192, | |
| "grad_norm": 4.7966437339782715, | |
| "learning_rate": 3.384321223709369e-05, | |
| "loss": 4.7314, | |
| "mean_token_accuracy": 0.6790701761841774, | |
| "num_tokens": 122703011.0, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.6955025364332192, | |
| "eval_loss": 1.187593936920166, | |
| "eval_mean_token_accuracy": 0.6753950679302215, | |
| "eval_num_tokens": 122703011.0, | |
| "eval_runtime": 55.1146, | |
| "eval_samples_per_second": 7.258, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.699087601054009, | |
| "grad_norm": 4.6988630294799805, | |
| "learning_rate": 3.344486934353091e-05, | |
| "loss": 4.7162, | |
| "mean_token_accuracy": 0.680111817419529, | |
| "num_tokens": 123334937.0, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.699087601054009, | |
| "eval_loss": 1.1878883838653564, | |
| "eval_mean_token_accuracy": 0.6757830834388733, | |
| "eval_num_tokens": 123334937.0, | |
| "eval_runtime": 55.0822, | |
| "eval_samples_per_second": 7.262, | |
| "eval_steps_per_second": 0.908, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.7026726656747988, | |
| "grad_norm": 5.11058235168457, | |
| "learning_rate": 3.3046526449968134e-05, | |
| "loss": 4.7778, | |
| "mean_token_accuracy": 0.6746508419513703, | |
| "num_tokens": 123969291.0, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.7026726656747988, | |
| "eval_loss": 1.1869330406188965, | |
| "eval_mean_token_accuracy": 0.6755635273456574, | |
| "eval_num_tokens": 123969291.0, | |
| "eval_runtime": 55.0482, | |
| "eval_samples_per_second": 7.266, | |
| "eval_steps_per_second": 0.908, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.7062577302955886, | |
| "grad_norm": 4.539863109588623, | |
| "learning_rate": 3.2648183556405356e-05, | |
| "loss": 4.7265, | |
| "mean_token_accuracy": 0.6780777916312217, | |
| "num_tokens": 124597342.0, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.7062577302955886, | |
| "eval_loss": 1.1864935159683228, | |
| "eval_mean_token_accuracy": 0.6759455275535583, | |
| "eval_num_tokens": 124597342.0, | |
| "eval_runtime": 54.9859, | |
| "eval_samples_per_second": 7.275, | |
| "eval_steps_per_second": 0.909, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.7098427949163784, | |
| "grad_norm": 4.2660112380981445, | |
| "learning_rate": 3.224984066284258e-05, | |
| "loss": 4.7727, | |
| "mean_token_accuracy": 0.6779581853747367, | |
| "num_tokens": 125230255.0, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.7098427949163784, | |
| "eval_loss": 1.1863397359848022, | |
| "eval_mean_token_accuracy": 0.6765384769439697, | |
| "eval_num_tokens": 125230255.0, | |
| "eval_runtime": 55.0326, | |
| "eval_samples_per_second": 7.268, | |
| "eval_steps_per_second": 0.909, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.7134278595371681, | |
| "grad_norm": 4.660075664520264, | |
| "learning_rate": 3.18514977692798e-05, | |
| "loss": 4.6841, | |
| "mean_token_accuracy": 0.6811311572790146, | |
| "num_tokens": 125863729.0, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.7134278595371681, | |
| "eval_loss": 1.1863139867782593, | |
| "eval_mean_token_accuracy": 0.676126846075058, | |
| "eval_num_tokens": 125863729.0, | |
| "eval_runtime": 55.1646, | |
| "eval_samples_per_second": 7.251, | |
| "eval_steps_per_second": 0.906, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.7170129241579579, | |
| "grad_norm": 4.517760753631592, | |
| "learning_rate": 3.145315487571702e-05, | |
| "loss": 4.7781, | |
| "mean_token_accuracy": 0.6761695435643196, | |
| "num_tokens": 126496619.0, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7170129241579579, | |
| "eval_loss": 1.1863616704940796, | |
| "eval_mean_token_accuracy": 0.6760414135456085, | |
| "eval_num_tokens": 126496619.0, | |
| "eval_runtime": 55.0497, | |
| "eval_samples_per_second": 7.266, | |
| "eval_steps_per_second": 0.908, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7205979887787477, | |
| "grad_norm": 4.821887493133545, | |
| "learning_rate": 3.105481198215424e-05, | |
| "loss": 4.7506, | |
| "mean_token_accuracy": 0.677640742957592, | |
| "num_tokens": 127129025.0, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.7205979887787477, | |
| "eval_loss": 1.1864928007125854, | |
| "eval_mean_token_accuracy": 0.6763237309455872, | |
| "eval_num_tokens": 127129025.0, | |
| "eval_runtime": 55.0217, | |
| "eval_samples_per_second": 7.27, | |
| "eval_steps_per_second": 0.909, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.7241830533995375, | |
| "grad_norm": 5.278724670410156, | |
| "learning_rate": 3.065646908859146e-05, | |
| "loss": 4.7726, | |
| "mean_token_accuracy": 0.6762316790223122, | |
| "num_tokens": 127757800.0, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.7241830533995375, | |
| "eval_loss": 1.1857789754867554, | |
| "eval_mean_token_accuracy": 0.6754815447330474, | |
| "eval_num_tokens": 127757800.0, | |
| "eval_runtime": 55.2673, | |
| "eval_samples_per_second": 7.238, | |
| "eval_steps_per_second": 0.905, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.7277681180203274, | |
| "grad_norm": 4.649436950683594, | |
| "learning_rate": 3.025812619502868e-05, | |
| "loss": 4.803, | |
| "mean_token_accuracy": 0.6757835251092911, | |
| "num_tokens": 128391102.0, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.7277681180203274, | |
| "eval_loss": 1.1859068870544434, | |
| "eval_mean_token_accuracy": 0.6756039881706237, | |
| "eval_num_tokens": 128391102.0, | |
| "eval_runtime": 55.0647, | |
| "eval_samples_per_second": 7.264, | |
| "eval_steps_per_second": 0.908, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.7313531826411171, | |
| "grad_norm": 4.287916660308838, | |
| "learning_rate": 2.9859783301465906e-05, | |
| "loss": 4.7373, | |
| "mean_token_accuracy": 0.677640765607357, | |
| "num_tokens": 129023759.0, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.7313531826411171, | |
| "eval_loss": 1.1861519813537598, | |
| "eval_mean_token_accuracy": 0.6761102056503296, | |
| "eval_num_tokens": 129023759.0, | |
| "eval_runtime": 54.9417, | |
| "eval_samples_per_second": 7.28, | |
| "eval_steps_per_second": 0.91, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.7349382472619069, | |
| "grad_norm": 4.765435695648193, | |
| "learning_rate": 2.9461440407903124e-05, | |
| "loss": 4.7741, | |
| "mean_token_accuracy": 0.6759647503495216, | |
| "num_tokens": 129655248.0, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.7349382472619069, | |
| "eval_loss": 1.1853660345077515, | |
| "eval_mean_token_accuracy": 0.6756195032596588, | |
| "eval_num_tokens": 129655248.0, | |
| "eval_runtime": 55.3223, | |
| "eval_samples_per_second": 7.23, | |
| "eval_steps_per_second": 0.904, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.7385233118826967, | |
| "grad_norm": 4.814145088195801, | |
| "learning_rate": 2.906309751434035e-05, | |
| "loss": 4.783, | |
| "mean_token_accuracy": 0.6754540035128593, | |
| "num_tokens": 130291512.0, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.7385233118826967, | |
| "eval_loss": 1.1858062744140625, | |
| "eval_mean_token_accuracy": 0.6755566847324371, | |
| "eval_num_tokens": 130291512.0, | |
| "eval_runtime": 55.3243, | |
| "eval_samples_per_second": 7.23, | |
| "eval_steps_per_second": 0.904, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.7421083765034865, | |
| "grad_norm": 4.518885135650635, | |
| "learning_rate": 2.8664754620777567e-05, | |
| "loss": 4.7083, | |
| "mean_token_accuracy": 0.6788066929578781, | |
| "num_tokens": 130922542.0, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.7421083765034865, | |
| "eval_loss": 1.185410499572754, | |
| "eval_mean_token_accuracy": 0.6754859507083892, | |
| "eval_num_tokens": 130922542.0, | |
| "eval_runtime": 55.3328, | |
| "eval_samples_per_second": 7.229, | |
| "eval_steps_per_second": 0.904, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.7456934411242763, | |
| "grad_norm": 4.623891830444336, | |
| "learning_rate": 2.8266411727214788e-05, | |
| "loss": 4.8077, | |
| "mean_token_accuracy": 0.6753540116548539, | |
| "num_tokens": 131553716.0, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.7456934411242763, | |
| "eval_loss": 1.18582022190094, | |
| "eval_mean_token_accuracy": 0.6761428475379944, | |
| "eval_num_tokens": 131553716.0, | |
| "eval_runtime": 55.033, | |
| "eval_samples_per_second": 7.268, | |
| "eval_steps_per_second": 0.909, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.749278505745066, | |
| "grad_norm": 4.524717807769775, | |
| "learning_rate": 2.7868068833652013e-05, | |
| "loss": 4.7216, | |
| "mean_token_accuracy": 0.6801271498203277, | |
| "num_tokens": 132187857.0, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.749278505745066, | |
| "eval_loss": 1.1854428052902222, | |
| "eval_mean_token_accuracy": 0.6755358970165253, | |
| "eval_num_tokens": 132187857.0, | |
| "eval_runtime": 55.0174, | |
| "eval_samples_per_second": 7.27, | |
| "eval_steps_per_second": 0.909, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.7528635703658558, | |
| "grad_norm": 4.81862211227417, | |
| "learning_rate": 2.746972594008923e-05, | |
| "loss": 4.8024, | |
| "mean_token_accuracy": 0.6740377223491669, | |
| "num_tokens": 132818348.0, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.7528635703658558, | |
| "eval_loss": 1.184729814529419, | |
| "eval_mean_token_accuracy": 0.6763768219947814, | |
| "eval_num_tokens": 132818348.0, | |
| "eval_runtime": 54.9843, | |
| "eval_samples_per_second": 7.275, | |
| "eval_steps_per_second": 0.909, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.7564486349866456, | |
| "grad_norm": 4.876639366149902, | |
| "learning_rate": 2.707138304652645e-05, | |
| "loss": 4.691, | |
| "mean_token_accuracy": 0.682259525358677, | |
| "num_tokens": 133450534.0, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.7564486349866456, | |
| "eval_loss": 1.1844180822372437, | |
| "eval_mean_token_accuracy": 0.6765269267559052, | |
| "eval_num_tokens": 133450534.0, | |
| "eval_runtime": 55.1025, | |
| "eval_samples_per_second": 7.259, | |
| "eval_steps_per_second": 0.907, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.7600336996074354, | |
| "grad_norm": 5.20668363571167, | |
| "learning_rate": 2.6673040152963674e-05, | |
| "loss": 4.7633, | |
| "mean_token_accuracy": 0.6769976457953453, | |
| "num_tokens": 134081427.0, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.7600336996074354, | |
| "eval_loss": 1.1848989725112915, | |
| "eval_mean_token_accuracy": 0.6765480875968933, | |
| "eval_num_tokens": 134081427.0, | |
| "eval_runtime": 55.16, | |
| "eval_samples_per_second": 7.252, | |
| "eval_steps_per_second": 0.906, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.7636187642282252, | |
| "grad_norm": 4.415744304656982, | |
| "learning_rate": 2.6274697259400892e-05, | |
| "loss": 4.7572, | |
| "mean_token_accuracy": 0.6762901389598847, | |
| "num_tokens": 134717648.0, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.7636187642282252, | |
| "eval_loss": 1.1854746341705322, | |
| "eval_mean_token_accuracy": 0.6762021934986114, | |
| "eval_num_tokens": 134717648.0, | |
| "eval_runtime": 55.1597, | |
| "eval_samples_per_second": 7.252, | |
| "eval_steps_per_second": 0.906, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.7672038288490151, | |
| "grad_norm": 4.984974384307861, | |
| "learning_rate": 2.5876354365838113e-05, | |
| "loss": 4.6918, | |
| "mean_token_accuracy": 0.6813731342554092, | |
| "num_tokens": 135352362.0, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.7672038288490151, | |
| "eval_loss": 1.1856120824813843, | |
| "eval_mean_token_accuracy": 0.6763051617145538, | |
| "eval_num_tokens": 135352362.0, | |
| "eval_runtime": 54.9725, | |
| "eval_samples_per_second": 7.276, | |
| "eval_steps_per_second": 0.91, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.7707888934698048, | |
| "grad_norm": 4.5358781814575195, | |
| "learning_rate": 2.5478011472275338e-05, | |
| "loss": 4.7242, | |
| "mean_token_accuracy": 0.6771323186159134, | |
| "num_tokens": 135991559.0, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.7707888934698048, | |
| "eval_loss": 1.185410737991333, | |
| "eval_mean_token_accuracy": 0.676514265537262, | |
| "eval_num_tokens": 135991559.0, | |
| "eval_runtime": 55.0995, | |
| "eval_samples_per_second": 7.26, | |
| "eval_steps_per_second": 0.907, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.7743739580905946, | |
| "grad_norm": 4.364614009857178, | |
| "learning_rate": 2.5079668578712556e-05, | |
| "loss": 4.6947, | |
| "mean_token_accuracy": 0.6806002199649811, | |
| "num_tokens": 136624024.0, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.7743739580905946, | |
| "eval_loss": 1.184848666191101, | |
| "eval_mean_token_accuracy": 0.676878696680069, | |
| "eval_num_tokens": 136624024.0, | |
| "eval_runtime": 55.7118, | |
| "eval_samples_per_second": 7.18, | |
| "eval_steps_per_second": 0.897, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.7779590227113844, | |
| "grad_norm": 4.293883323669434, | |
| "learning_rate": 2.4681325685149778e-05, | |
| "loss": 4.7454, | |
| "mean_token_accuracy": 0.6780085292458534, | |
| "num_tokens": 137259400.0, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.7779590227113844, | |
| "eval_loss": 1.185441493988037, | |
| "eval_mean_token_accuracy": 0.6766848075389862, | |
| "eval_num_tokens": 137259400.0, | |
| "eval_runtime": 56.8526, | |
| "eval_samples_per_second": 7.036, | |
| "eval_steps_per_second": 0.879, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.7815440873321742, | |
| "grad_norm": 4.8266143798828125, | |
| "learning_rate": 2.4282982791587e-05, | |
| "loss": 4.7078, | |
| "mean_token_accuracy": 0.6800830870866775, | |
| "num_tokens": 137888937.0, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.7815440873321742, | |
| "eval_loss": 1.18449068069458, | |
| "eval_mean_token_accuracy": 0.6764636623859406, | |
| "eval_num_tokens": 137888937.0, | |
| "eval_runtime": 56.5392, | |
| "eval_samples_per_second": 7.075, | |
| "eval_steps_per_second": 0.884, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.785129151952964, | |
| "grad_norm": 4.471580982208252, | |
| "learning_rate": 2.388463989802422e-05, | |
| "loss": 4.6928, | |
| "mean_token_accuracy": 0.6799645683169365, | |
| "num_tokens": 138523132.0, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.785129151952964, | |
| "eval_loss": 1.1840896606445312, | |
| "eval_mean_token_accuracy": 0.6770457863807678, | |
| "eval_num_tokens": 138523132.0, | |
| "eval_runtime": 56.9603, | |
| "eval_samples_per_second": 7.022, | |
| "eval_steps_per_second": 0.878, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.7887142165737537, | |
| "grad_norm": 4.892276763916016, | |
| "learning_rate": 2.3486297004461442e-05, | |
| "loss": 4.7822, | |
| "mean_token_accuracy": 0.6743768805265427, | |
| "num_tokens": 139156280.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.7887142165737537, | |
| "eval_loss": 1.1841989755630493, | |
| "eval_mean_token_accuracy": 0.6765543162822724, | |
| "eval_num_tokens": 139156280.0, | |
| "eval_runtime": 56.1325, | |
| "eval_samples_per_second": 7.126, | |
| "eval_steps_per_second": 0.891, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.7922992811945435, | |
| "grad_norm": 5.218216896057129, | |
| "learning_rate": 2.3087954110898663e-05, | |
| "loss": 4.753, | |
| "mean_token_accuracy": 0.6765683805942535, | |
| "num_tokens": 139786084.0, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.7922992811945435, | |
| "eval_loss": 1.1838265657424927, | |
| "eval_mean_token_accuracy": 0.6768669807910919, | |
| "eval_num_tokens": 139786084.0, | |
| "eval_runtime": 57.9142, | |
| "eval_samples_per_second": 6.907, | |
| "eval_steps_per_second": 0.863, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.7958843458153333, | |
| "grad_norm": 4.109825134277344, | |
| "learning_rate": 2.2689611217335885e-05, | |
| "loss": 4.766, | |
| "mean_token_accuracy": 0.6766787865757942, | |
| "num_tokens": 140420913.0, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.7958843458153333, | |
| "eval_loss": 1.183830976486206, | |
| "eval_mean_token_accuracy": 0.6771005463600158, | |
| "eval_num_tokens": 140420913.0, | |
| "eval_runtime": 57.8895, | |
| "eval_samples_per_second": 6.91, | |
| "eval_steps_per_second": 0.864, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.7994694104361231, | |
| "grad_norm": 4.745416641235352, | |
| "learning_rate": 2.2291268323773103e-05, | |
| "loss": 4.7743, | |
| "mean_token_accuracy": 0.6760394325852395, | |
| "num_tokens": 141048743.0, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.7994694104361231, | |
| "eval_loss": 1.184319257736206, | |
| "eval_mean_token_accuracy": 0.6767275559902192, | |
| "eval_num_tokens": 141048743.0, | |
| "eval_runtime": 56.6971, | |
| "eval_samples_per_second": 7.055, | |
| "eval_steps_per_second": 0.882, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.8030544750569129, | |
| "grad_norm": 5.053956985473633, | |
| "learning_rate": 2.1892925430210324e-05, | |
| "loss": 4.7635, | |
| "mean_token_accuracy": 0.6785035586357117, | |
| "num_tokens": 141678181.0, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.8030544750569129, | |
| "eval_loss": 1.184045672416687, | |
| "eval_mean_token_accuracy": 0.6763345134258271, | |
| "eval_num_tokens": 141678181.0, | |
| "eval_runtime": 57.5357, | |
| "eval_samples_per_second": 6.952, | |
| "eval_steps_per_second": 0.869, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.8066395396777027, | |
| "grad_norm": 4.613523006439209, | |
| "learning_rate": 2.149458253664755e-05, | |
| "loss": 4.7037, | |
| "mean_token_accuracy": 0.6807671126723289, | |
| "num_tokens": 142312431.0, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.8066395396777027, | |
| "eval_loss": 1.1838032007217407, | |
| "eval_mean_token_accuracy": 0.6768004512786865, | |
| "eval_num_tokens": 142312431.0, | |
| "eval_runtime": 57.4032, | |
| "eval_samples_per_second": 6.968, | |
| "eval_steps_per_second": 0.871, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.8102246042984925, | |
| "grad_norm": 4.577108860015869, | |
| "learning_rate": 2.109623964308477e-05, | |
| "loss": 4.7398, | |
| "mean_token_accuracy": 0.6783872780203819, | |
| "num_tokens": 142944119.0, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.8102246042984925, | |
| "eval_loss": 1.1838161945343018, | |
| "eval_mean_token_accuracy": 0.676618036031723, | |
| "eval_num_tokens": 142944119.0, | |
| "eval_runtime": 57.4773, | |
| "eval_samples_per_second": 6.959, | |
| "eval_steps_per_second": 0.87, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.8138096689192823, | |
| "grad_norm": 4.523055553436279, | |
| "learning_rate": 2.069789674952199e-05, | |
| "loss": 4.7537, | |
| "mean_token_accuracy": 0.6783151313662529, | |
| "num_tokens": 143575945.0, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.8138096689192823, | |
| "eval_loss": 1.183152437210083, | |
| "eval_mean_token_accuracy": 0.6768651962280273, | |
| "eval_num_tokens": 143575945.0, | |
| "eval_runtime": 57.8648, | |
| "eval_samples_per_second": 6.913, | |
| "eval_steps_per_second": 0.864, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.8173947335400721, | |
| "grad_norm": 4.623985290527344, | |
| "learning_rate": 2.029955385595921e-05, | |
| "loss": 4.6946, | |
| "mean_token_accuracy": 0.6809823432564736, | |
| "num_tokens": 144207834.0, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.8173947335400721, | |
| "eval_loss": 1.1837332248687744, | |
| "eval_mean_token_accuracy": 0.6768518340587616, | |
| "eval_num_tokens": 144207834.0, | |
| "eval_runtime": 57.9156, | |
| "eval_samples_per_second": 6.907, | |
| "eval_steps_per_second": 0.863, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.8209797981608619, | |
| "grad_norm": 4.573394298553467, | |
| "learning_rate": 1.990121096239643e-05, | |
| "loss": 4.733, | |
| "mean_token_accuracy": 0.6784341213107109, | |
| "num_tokens": 144840226.0, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.8209797981608619, | |
| "eval_loss": 1.1838306188583374, | |
| "eval_mean_token_accuracy": 0.6770703101158142, | |
| "eval_num_tokens": 144840226.0, | |
| "eval_runtime": 57.6139, | |
| "eval_samples_per_second": 6.943, | |
| "eval_steps_per_second": 0.868, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.8245648627816516, | |
| "grad_norm": 4.6763458251953125, | |
| "learning_rate": 1.9502868068833653e-05, | |
| "loss": 4.7745, | |
| "mean_token_accuracy": 0.6781399786472321, | |
| "num_tokens": 145475463.0, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.8245648627816516, | |
| "eval_loss": 1.1832276582717896, | |
| "eval_mean_token_accuracy": 0.6768820834159851, | |
| "eval_num_tokens": 145475463.0, | |
| "eval_runtime": 56.936, | |
| "eval_samples_per_second": 7.025, | |
| "eval_steps_per_second": 0.878, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.8281499274024414, | |
| "grad_norm": 4.544640064239502, | |
| "learning_rate": 1.9104525175270875e-05, | |
| "loss": 4.6995, | |
| "mean_token_accuracy": 0.6816430819034577, | |
| "num_tokens": 146110411.0, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.8281499274024414, | |
| "eval_loss": 1.1833053827285767, | |
| "eval_mean_token_accuracy": 0.6773410534858704, | |
| "eval_num_tokens": 146110411.0, | |
| "eval_runtime": 57.0468, | |
| "eval_samples_per_second": 7.012, | |
| "eval_steps_per_second": 0.876, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.8317349920232312, | |
| "grad_norm": 4.831193923950195, | |
| "learning_rate": 1.8706182281708096e-05, | |
| "loss": 4.7265, | |
| "mean_token_accuracy": 0.6784323596954346, | |
| "num_tokens": 146744949.0, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.8317349920232312, | |
| "eval_loss": 1.1832283735275269, | |
| "eval_mean_token_accuracy": 0.6767588186264039, | |
| "eval_num_tokens": 146744949.0, | |
| "eval_runtime": 57.1574, | |
| "eval_samples_per_second": 6.998, | |
| "eval_steps_per_second": 0.875, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.835320056644021, | |
| "grad_norm": 4.2518086433410645, | |
| "learning_rate": 1.8307839388145317e-05, | |
| "loss": 4.7231, | |
| "mean_token_accuracy": 0.6789010632038116, | |
| "num_tokens": 147377879.0, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.835320056644021, | |
| "eval_loss": 1.1831552982330322, | |
| "eval_mean_token_accuracy": 0.6766877925395965, | |
| "eval_num_tokens": 147377879.0, | |
| "eval_runtime": 57.2174, | |
| "eval_samples_per_second": 6.991, | |
| "eval_steps_per_second": 0.874, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.8389051212648108, | |
| "grad_norm": 4.656574726104736, | |
| "learning_rate": 1.7909496494582535e-05, | |
| "loss": 4.7387, | |
| "mean_token_accuracy": 0.6785845035314559, | |
| "num_tokens": 148008340.0, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.8389051212648108, | |
| "eval_loss": 1.1829930543899536, | |
| "eval_mean_token_accuracy": 0.6767005050182342, | |
| "eval_num_tokens": 148008340.0, | |
| "eval_runtime": 56.5488, | |
| "eval_samples_per_second": 7.074, | |
| "eval_steps_per_second": 0.884, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.8424901858856005, | |
| "grad_norm": 5.07755184173584, | |
| "learning_rate": 1.7511153601019757e-05, | |
| "loss": 4.6949, | |
| "mean_token_accuracy": 0.6815642186999321, | |
| "num_tokens": 148637370.0, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.8424901858856005, | |
| "eval_loss": 1.1835424900054932, | |
| "eval_mean_token_accuracy": 0.6766264629364014, | |
| "eval_num_tokens": 148637370.0, | |
| "eval_runtime": 56.844, | |
| "eval_samples_per_second": 7.037, | |
| "eval_steps_per_second": 0.88, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.8460752505063904, | |
| "grad_norm": 4.937259674072266, | |
| "learning_rate": 1.7112810707456982e-05, | |
| "loss": 4.7547, | |
| "mean_token_accuracy": 0.6751632392406464, | |
| "num_tokens": 149266269.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.8460752505063904, | |
| "eval_loss": 1.1830496788024902, | |
| "eval_mean_token_accuracy": 0.6767467558383942, | |
| "eval_num_tokens": 149266269.0, | |
| "eval_runtime": 56.5429, | |
| "eval_samples_per_second": 7.074, | |
| "eval_steps_per_second": 0.884, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.8496603151271802, | |
| "grad_norm": 4.5733795166015625, | |
| "learning_rate": 1.67144678138942e-05, | |
| "loss": 4.7398, | |
| "mean_token_accuracy": 0.6781432759761811, | |
| "num_tokens": 149898007.0, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.8496603151271802, | |
| "eval_loss": 1.1832393407821655, | |
| "eval_mean_token_accuracy": 0.6769970893859864, | |
| "eval_num_tokens": 149898007.0, | |
| "eval_runtime": 55.172, | |
| "eval_samples_per_second": 7.25, | |
| "eval_steps_per_second": 0.906, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.85324537974797, | |
| "grad_norm": 4.531384468078613, | |
| "learning_rate": 1.631612492033142e-05, | |
| "loss": 4.7097, | |
| "mean_token_accuracy": 0.6787867891788483, | |
| "num_tokens": 150529318.0, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.85324537974797, | |
| "eval_loss": 1.1830426454544067, | |
| "eval_mean_token_accuracy": 0.6767821443080902, | |
| "eval_num_tokens": 150529318.0, | |
| "eval_runtime": 55.9298, | |
| "eval_samples_per_second": 7.152, | |
| "eval_steps_per_second": 0.894, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.8568304443687598, | |
| "grad_norm": 4.669693946838379, | |
| "learning_rate": 1.5917782026768643e-05, | |
| "loss": 4.7679, | |
| "mean_token_accuracy": 0.6752150565385818, | |
| "num_tokens": 151163202.0, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.8568304443687598, | |
| "eval_loss": 1.1831849813461304, | |
| "eval_mean_token_accuracy": 0.6765953767299652, | |
| "eval_num_tokens": 151163202.0, | |
| "eval_runtime": 56.6538, | |
| "eval_samples_per_second": 7.06, | |
| "eval_steps_per_second": 0.883, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.8604155089895495, | |
| "grad_norm": 4.184320449829102, | |
| "learning_rate": 1.5519439133205864e-05, | |
| "loss": 4.713, | |
| "mean_token_accuracy": 0.6797751143574715, | |
| "num_tokens": 151798569.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.8604155089895495, | |
| "eval_loss": 1.182477593421936, | |
| "eval_mean_token_accuracy": 0.6768123960494995, | |
| "eval_num_tokens": 151798569.0, | |
| "eval_runtime": 56.3382, | |
| "eval_samples_per_second": 7.1, | |
| "eval_steps_per_second": 0.887, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.8640005736103393, | |
| "grad_norm": 4.763125896453857, | |
| "learning_rate": 1.5121096239643084e-05, | |
| "loss": 4.7338, | |
| "mean_token_accuracy": 0.6781762626767158, | |
| "num_tokens": 152431951.0, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.8640005736103393, | |
| "eval_loss": 1.182124137878418, | |
| "eval_mean_token_accuracy": 0.6777550578117371, | |
| "eval_num_tokens": 152431951.0, | |
| "eval_runtime": 56.6371, | |
| "eval_samples_per_second": 7.063, | |
| "eval_steps_per_second": 0.883, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.8675856382311291, | |
| "grad_norm": 4.805209159851074, | |
| "learning_rate": 1.4722753346080307e-05, | |
| "loss": 4.7565, | |
| "mean_token_accuracy": 0.6775122970342636, | |
| "num_tokens": 153063882.0, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.8675856382311291, | |
| "eval_loss": 1.1821281909942627, | |
| "eval_mean_token_accuracy": 0.6782806706428528, | |
| "eval_num_tokens": 153063882.0, | |
| "eval_runtime": 56.6386, | |
| "eval_samples_per_second": 7.062, | |
| "eval_steps_per_second": 0.883, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.8711707028519189, | |
| "grad_norm": 4.224789142608643, | |
| "learning_rate": 1.4324410452517528e-05, | |
| "loss": 4.7722, | |
| "mean_token_accuracy": 0.6754378816485405, | |
| "num_tokens": 153698583.0, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.8711707028519189, | |
| "eval_loss": 1.182055950164795, | |
| "eval_mean_token_accuracy": 0.6773766386508941, | |
| "eval_num_tokens": 153698583.0, | |
| "eval_runtime": 56.2278, | |
| "eval_samples_per_second": 7.114, | |
| "eval_steps_per_second": 0.889, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.8747557674727087, | |
| "grad_norm": 4.622290134429932, | |
| "learning_rate": 1.392606755895475e-05, | |
| "loss": 4.7395, | |
| "mean_token_accuracy": 0.678723790049553, | |
| "num_tokens": 154330119.0, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.8747557674727087, | |
| "eval_loss": 1.1820727586746216, | |
| "eval_mean_token_accuracy": 0.6769333493709564, | |
| "eval_num_tokens": 154330119.0, | |
| "eval_runtime": 56.605, | |
| "eval_samples_per_second": 7.067, | |
| "eval_steps_per_second": 0.883, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.8783408320934984, | |
| "grad_norm": 4.508255481719971, | |
| "learning_rate": 1.352772466539197e-05, | |
| "loss": 4.6912, | |
| "mean_token_accuracy": 0.6804595556855202, | |
| "num_tokens": 154963205.0, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.8783408320934984, | |
| "eval_loss": 1.1818066835403442, | |
| "eval_mean_token_accuracy": 0.6772564661502838, | |
| "eval_num_tokens": 154963205.0, | |
| "eval_runtime": 55.5536, | |
| "eval_samples_per_second": 7.2, | |
| "eval_steps_per_second": 0.9, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.8819258967142882, | |
| "grad_norm": 4.340250492095947, | |
| "learning_rate": 1.3129381771829191e-05, | |
| "loss": 4.7228, | |
| "mean_token_accuracy": 0.6781974649429321, | |
| "num_tokens": 155595688.0, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.8819258967142882, | |
| "eval_loss": 1.1819037199020386, | |
| "eval_mean_token_accuracy": 0.677418692111969, | |
| "eval_num_tokens": 155595688.0, | |
| "eval_runtime": 55.5311, | |
| "eval_samples_per_second": 7.203, | |
| "eval_steps_per_second": 0.9, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.8855109613350781, | |
| "grad_norm": 4.752552032470703, | |
| "learning_rate": 1.2731038878266413e-05, | |
| "loss": 4.7694, | |
| "mean_token_accuracy": 0.6762826785445213, | |
| "num_tokens": 156228825.0, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.8855109613350781, | |
| "eval_loss": 1.1815353631973267, | |
| "eval_mean_token_accuracy": 0.6776353216171265, | |
| "eval_num_tokens": 156228825.0, | |
| "eval_runtime": 55.7037, | |
| "eval_samples_per_second": 7.181, | |
| "eval_steps_per_second": 0.898, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.8890960259558679, | |
| "grad_norm": 4.746983528137207, | |
| "learning_rate": 1.2332695984703634e-05, | |
| "loss": 4.7539, | |
| "mean_token_accuracy": 0.6774281883239746, | |
| "num_tokens": 156864867.0, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.8890960259558679, | |
| "eval_loss": 1.1814591884613037, | |
| "eval_mean_token_accuracy": 0.6772267067432404, | |
| "eval_num_tokens": 156864867.0, | |
| "eval_runtime": 55.2801, | |
| "eval_samples_per_second": 7.236, | |
| "eval_steps_per_second": 0.904, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.8926810905766577, | |
| "grad_norm": 4.964954376220703, | |
| "learning_rate": 1.1934353091140854e-05, | |
| "loss": 4.652, | |
| "mean_token_accuracy": 0.6832978922128677, | |
| "num_tokens": 157497835.0, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.8926810905766577, | |
| "eval_loss": 1.1815037727355957, | |
| "eval_mean_token_accuracy": 0.6775709521770478, | |
| "eval_num_tokens": 157497835.0, | |
| "eval_runtime": 55.4264, | |
| "eval_samples_per_second": 7.217, | |
| "eval_steps_per_second": 0.902, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.8962661551974475, | |
| "grad_norm": 4.32532262802124, | |
| "learning_rate": 1.1536010197578075e-05, | |
| "loss": 4.6811, | |
| "mean_token_accuracy": 0.6813494926691055, | |
| "num_tokens": 158131957.0, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.8962661551974475, | |
| "eval_loss": 1.181230902671814, | |
| "eval_mean_token_accuracy": 0.6782212293148041, | |
| "eval_num_tokens": 158131957.0, | |
| "eval_runtime": 55.638, | |
| "eval_samples_per_second": 7.189, | |
| "eval_steps_per_second": 0.899, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.8998512198182372, | |
| "grad_norm": 4.772362232208252, | |
| "learning_rate": 1.1137667304015297e-05, | |
| "loss": 4.6799, | |
| "mean_token_accuracy": 0.6808639001846314, | |
| "num_tokens": 158765520.0, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.8998512198182372, | |
| "eval_loss": 1.18119215965271, | |
| "eval_mean_token_accuracy": 0.6776848089694977, | |
| "eval_num_tokens": 158765520.0, | |
| "eval_runtime": 56.5407, | |
| "eval_samples_per_second": 7.075, | |
| "eval_steps_per_second": 0.884, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.903436284439027, | |
| "grad_norm": 4.406890869140625, | |
| "learning_rate": 1.0739324410452518e-05, | |
| "loss": 4.6572, | |
| "mean_token_accuracy": 0.6824618262052536, | |
| "num_tokens": 159396320.0, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.903436284439027, | |
| "eval_loss": 1.1813263893127441, | |
| "eval_mean_token_accuracy": 0.677431755065918, | |
| "eval_num_tokens": 159396320.0, | |
| "eval_runtime": 56.1014, | |
| "eval_samples_per_second": 7.13, | |
| "eval_steps_per_second": 0.891, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.9070213490598168, | |
| "grad_norm": 4.6225786209106445, | |
| "learning_rate": 1.034098151688974e-05, | |
| "loss": 4.7346, | |
| "mean_token_accuracy": 0.6780241671204567, | |
| "num_tokens": 160028813.0, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.9070213490598168, | |
| "eval_loss": 1.18108332157135, | |
| "eval_mean_token_accuracy": 0.6777166557312012, | |
| "eval_num_tokens": 160028813.0, | |
| "eval_runtime": 55.9817, | |
| "eval_samples_per_second": 7.145, | |
| "eval_steps_per_second": 0.893, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.9106064136806066, | |
| "grad_norm": 5.096744537353516, | |
| "learning_rate": 9.942638623326961e-06, | |
| "loss": 4.7072, | |
| "mean_token_accuracy": 0.6779900795221329, | |
| "num_tokens": 160660115.0, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.9106064136806066, | |
| "eval_loss": 1.1813737154006958, | |
| "eval_mean_token_accuracy": 0.6772573125362397, | |
| "eval_num_tokens": 160660115.0, | |
| "eval_runtime": 56.7004, | |
| "eval_samples_per_second": 7.055, | |
| "eval_steps_per_second": 0.882, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.9141914783013964, | |
| "grad_norm": 4.954991817474365, | |
| "learning_rate": 9.54429572976418e-06, | |
| "loss": 4.7609, | |
| "mean_token_accuracy": 0.6763640037178993, | |
| "num_tokens": 161295929.0, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.9141914783013964, | |
| "eval_loss": 1.1813360452651978, | |
| "eval_mean_token_accuracy": 0.6774272322654724, | |
| "eval_num_tokens": 161295929.0, | |
| "eval_runtime": 55.8406, | |
| "eval_samples_per_second": 7.163, | |
| "eval_steps_per_second": 0.895, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.9177765429221861, | |
| "grad_norm": 4.923917770385742, | |
| "learning_rate": 9.145952836201404e-06, | |
| "loss": 4.8154, | |
| "mean_token_accuracy": 0.6726123803853988, | |
| "num_tokens": 161927138.0, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.9177765429221861, | |
| "eval_loss": 1.1811388731002808, | |
| "eval_mean_token_accuracy": 0.6773995268344879, | |
| "eval_num_tokens": 161927138.0, | |
| "eval_runtime": 55.8065, | |
| "eval_samples_per_second": 7.168, | |
| "eval_steps_per_second": 0.896, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.9213616075429759, | |
| "grad_norm": 4.706872463226318, | |
| "learning_rate": 8.747609942638624e-06, | |
| "loss": 4.7658, | |
| "mean_token_accuracy": 0.6769201335310936, | |
| "num_tokens": 162553898.0, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.9213616075429759, | |
| "eval_loss": 1.1811527013778687, | |
| "eval_mean_token_accuracy": 0.6772512257099151, | |
| "eval_num_tokens": 162553898.0, | |
| "eval_runtime": 55.933, | |
| "eval_samples_per_second": 7.151, | |
| "eval_steps_per_second": 0.894, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.9249466721637658, | |
| "grad_norm": 4.29292106628418, | |
| "learning_rate": 8.349267049075845e-06, | |
| "loss": 4.7529, | |
| "mean_token_accuracy": 0.678237376511097, | |
| "num_tokens": 163187273.0, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.9249466721637658, | |
| "eval_loss": 1.1813446283340454, | |
| "eval_mean_token_accuracy": 0.6770773160457612, | |
| "eval_num_tokens": 163187273.0, | |
| "eval_runtime": 55.9165, | |
| "eval_samples_per_second": 7.154, | |
| "eval_steps_per_second": 0.894, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.9285317367845556, | |
| "grad_norm": 4.577188968658447, | |
| "learning_rate": 7.950924155513067e-06, | |
| "loss": 4.7151, | |
| "mean_token_accuracy": 0.6785858425498009, | |
| "num_tokens": 163816024.0, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.9285317367845556, | |
| "eval_loss": 1.181489109992981, | |
| "eval_mean_token_accuracy": 0.6775988221168519, | |
| "eval_num_tokens": 163816024.0, | |
| "eval_runtime": 55.8424, | |
| "eval_samples_per_second": 7.163, | |
| "eval_steps_per_second": 0.895, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.9321168014053454, | |
| "grad_norm": 4.593563556671143, | |
| "learning_rate": 7.552581261950287e-06, | |
| "loss": 4.7085, | |
| "mean_token_accuracy": 0.6803634178638458, | |
| "num_tokens": 164445021.0, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.9321168014053454, | |
| "eval_loss": 1.1816061735153198, | |
| "eval_mean_token_accuracy": 0.6776184713840485, | |
| "eval_num_tokens": 164445021.0, | |
| "eval_runtime": 55.8489, | |
| "eval_samples_per_second": 7.162, | |
| "eval_steps_per_second": 0.895, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.9357018660261351, | |
| "grad_norm": 4.309682846069336, | |
| "learning_rate": 7.1542383683875086e-06, | |
| "loss": 4.706, | |
| "mean_token_accuracy": 0.6809570705890655, | |
| "num_tokens": 165073944.0, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.9357018660261351, | |
| "eval_loss": 1.181320071220398, | |
| "eval_mean_token_accuracy": 0.6780745506286621, | |
| "eval_num_tokens": 165073944.0, | |
| "eval_runtime": 55.6478, | |
| "eval_samples_per_second": 7.188, | |
| "eval_steps_per_second": 0.899, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.9392869306469249, | |
| "grad_norm": 4.485646724700928, | |
| "learning_rate": 6.755895474824729e-06, | |
| "loss": 4.7568, | |
| "mean_token_accuracy": 0.677568726837635, | |
| "num_tokens": 165708351.0, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.9392869306469249, | |
| "eval_loss": 1.1809498071670532, | |
| "eval_mean_token_accuracy": 0.677814108133316, | |
| "eval_num_tokens": 165708351.0, | |
| "eval_runtime": 55.8268, | |
| "eval_samples_per_second": 7.165, | |
| "eval_steps_per_second": 0.896, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.9428719952677147, | |
| "grad_norm": 4.374978065490723, | |
| "learning_rate": 6.357552581261951e-06, | |
| "loss": 4.7056, | |
| "mean_token_accuracy": 0.6803146860003472, | |
| "num_tokens": 166342580.0, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.9428719952677147, | |
| "eval_loss": 1.1812047958374023, | |
| "eval_mean_token_accuracy": 0.6775732636451721, | |
| "eval_num_tokens": 166342580.0, | |
| "eval_runtime": 56.8808, | |
| "eval_samples_per_second": 7.032, | |
| "eval_steps_per_second": 0.879, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.9464570598885045, | |
| "grad_norm": 4.719696044921875, | |
| "learning_rate": 5.959209687699171e-06, | |
| "loss": 4.6996, | |
| "mean_token_accuracy": 0.6807303726673126, | |
| "num_tokens": 166972337.0, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.9464570598885045, | |
| "eval_loss": 1.1809191703796387, | |
| "eval_mean_token_accuracy": 0.6775369119644165, | |
| "eval_num_tokens": 166972337.0, | |
| "eval_runtime": 55.3898, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 0.903, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.9500421245092943, | |
| "grad_norm": 4.4557905197143555, | |
| "learning_rate": 5.560866794136393e-06, | |
| "loss": 4.6934, | |
| "mean_token_accuracy": 0.6803115239739418, | |
| "num_tokens": 167608984.0, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.9500421245092943, | |
| "eval_loss": 1.1811048984527588, | |
| "eval_mean_token_accuracy": 0.6776836955547333, | |
| "eval_num_tokens": 167608984.0, | |
| "eval_runtime": 56.7029, | |
| "eval_samples_per_second": 7.054, | |
| "eval_steps_per_second": 0.882, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.953627189130084, | |
| "grad_norm": 4.890408515930176, | |
| "learning_rate": 5.162523900573614e-06, | |
| "loss": 4.7407, | |
| "mean_token_accuracy": 0.6763252380490303, | |
| "num_tokens": 168240445.0, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.953627189130084, | |
| "eval_loss": 1.1808910369873047, | |
| "eval_mean_token_accuracy": 0.6770796060562134, | |
| "eval_num_tokens": 168240445.0, | |
| "eval_runtime": 56.5546, | |
| "eval_samples_per_second": 7.073, | |
| "eval_steps_per_second": 0.884, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.9572122537508738, | |
| "grad_norm": 5.145451545715332, | |
| "learning_rate": 4.7641810070108355e-06, | |
| "loss": 4.7417, | |
| "mean_token_accuracy": 0.6776255601644516, | |
| "num_tokens": 168871775.0, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.9572122537508738, | |
| "eval_loss": 1.1808017492294312, | |
| "eval_mean_token_accuracy": 0.6777704417705536, | |
| "eval_num_tokens": 168871775.0, | |
| "eval_runtime": 56.3719, | |
| "eval_samples_per_second": 7.096, | |
| "eval_steps_per_second": 0.887, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.9607973183716636, | |
| "grad_norm": 4.5986104011535645, | |
| "learning_rate": 4.365838113448056e-06, | |
| "loss": 4.742, | |
| "mean_token_accuracy": 0.678845791220665, | |
| "num_tokens": 169504277.0, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.9607973183716636, | |
| "eval_loss": 1.1809656620025635, | |
| "eval_mean_token_accuracy": 0.6775072228908539, | |
| "eval_num_tokens": 169504277.0, | |
| "eval_runtime": 55.886, | |
| "eval_samples_per_second": 7.157, | |
| "eval_steps_per_second": 0.895, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.9643823829924535, | |
| "grad_norm": 4.603204250335693, | |
| "learning_rate": 3.967495219885278e-06, | |
| "loss": 4.7013, | |
| "mean_token_accuracy": 0.6800284919142723, | |
| "num_tokens": 170139597.0, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.9643823829924535, | |
| "eval_loss": 1.1810020208358765, | |
| "eval_mean_token_accuracy": 0.6775026059150696, | |
| "eval_num_tokens": 170139597.0, | |
| "eval_runtime": 55.5754, | |
| "eval_samples_per_second": 7.197, | |
| "eval_steps_per_second": 0.9, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.9679674476132433, | |
| "grad_norm": 4.541078567504883, | |
| "learning_rate": 3.5691523263224986e-06, | |
| "loss": 4.6893, | |
| "mean_token_accuracy": 0.6810142487287522, | |
| "num_tokens": 170769337.0, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.9679674476132433, | |
| "eval_loss": 1.1807525157928467, | |
| "eval_mean_token_accuracy": 0.6778879475593567, | |
| "eval_num_tokens": 170769337.0, | |
| "eval_runtime": 55.7448, | |
| "eval_samples_per_second": 7.176, | |
| "eval_steps_per_second": 0.897, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.971552512234033, | |
| "grad_norm": 4.519087314605713, | |
| "learning_rate": 3.17080943275972e-06, | |
| "loss": 4.772, | |
| "mean_token_accuracy": 0.6758325353264809, | |
| "num_tokens": 171402451.0, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.971552512234033, | |
| "eval_loss": 1.180974006652832, | |
| "eval_mean_token_accuracy": 0.6771935153007508, | |
| "eval_num_tokens": 171402451.0, | |
| "eval_runtime": 55.9385, | |
| "eval_samples_per_second": 7.151, | |
| "eval_steps_per_second": 0.894, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.9751375768548228, | |
| "grad_norm": 4.388876914978027, | |
| "learning_rate": 2.772466539196941e-06, | |
| "loss": 4.7878, | |
| "mean_token_accuracy": 0.6759749925136567, | |
| "num_tokens": 172037593.0, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.9751375768548228, | |
| "eval_loss": 1.180649995803833, | |
| "eval_mean_token_accuracy": 0.6776048111915588, | |
| "eval_num_tokens": 172037593.0, | |
| "eval_runtime": 55.7077, | |
| "eval_samples_per_second": 7.18, | |
| "eval_steps_per_second": 0.898, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.9787226414756126, | |
| "grad_norm": 4.676353931427002, | |
| "learning_rate": 2.374123645634162e-06, | |
| "loss": 4.7004, | |
| "mean_token_accuracy": 0.6811859339475632, | |
| "num_tokens": 172671223.0, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.9787226414756126, | |
| "eval_loss": 1.1809673309326172, | |
| "eval_mean_token_accuracy": 0.6772890436649323, | |
| "eval_num_tokens": 172671223.0, | |
| "eval_runtime": 55.9611, | |
| "eval_samples_per_second": 7.148, | |
| "eval_steps_per_second": 0.893, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.9823077060964024, | |
| "grad_norm": 4.678284645080566, | |
| "learning_rate": 1.975780752071383e-06, | |
| "loss": 4.7903, | |
| "mean_token_accuracy": 0.676692801117897, | |
| "num_tokens": 173302207.0, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.9823077060964024, | |
| "eval_loss": 1.1809345483779907, | |
| "eval_mean_token_accuracy": 0.6775369548797607, | |
| "eval_num_tokens": 173302207.0, | |
| "eval_runtime": 55.9406, | |
| "eval_samples_per_second": 7.15, | |
| "eval_steps_per_second": 0.894, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.9858927707171922, | |
| "grad_norm": 4.409168243408203, | |
| "learning_rate": 1.5774378585086041e-06, | |
| "loss": 4.6684, | |
| "mean_token_accuracy": 0.6816425076127053, | |
| "num_tokens": 173935786.0, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.9858927707171922, | |
| "eval_loss": 1.1808542013168335, | |
| "eval_mean_token_accuracy": 0.6773917138576507, | |
| "eval_num_tokens": 173935786.0, | |
| "eval_runtime": 55.8468, | |
| "eval_samples_per_second": 7.162, | |
| "eval_steps_per_second": 0.895, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.989477835337982, | |
| "grad_norm": 4.55557107925415, | |
| "learning_rate": 1.1790949649458254e-06, | |
| "loss": 4.7765, | |
| "mean_token_accuracy": 0.6755864906311035, | |
| "num_tokens": 174569465.0, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.989477835337982, | |
| "eval_loss": 1.18069589138031, | |
| "eval_mean_token_accuracy": 0.6778388035297394, | |
| "eval_num_tokens": 174569465.0, | |
| "eval_runtime": 55.6007, | |
| "eval_samples_per_second": 7.194, | |
| "eval_steps_per_second": 0.899, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.9930628999587717, | |
| "grad_norm": 4.513246536254883, | |
| "learning_rate": 7.807520713830466e-07, | |
| "loss": 4.7511, | |
| "mean_token_accuracy": 0.6769631016254425, | |
| "num_tokens": 175203827.0, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.9930628999587717, | |
| "eval_loss": 1.1805609464645386, | |
| "eval_mean_token_accuracy": 0.6776606893539429, | |
| "eval_num_tokens": 175203827.0, | |
| "eval_runtime": 55.7815, | |
| "eval_samples_per_second": 7.171, | |
| "eval_steps_per_second": 0.896, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.9966479645795615, | |
| "grad_norm": 4.5654497146606445, | |
| "learning_rate": 3.824091778202677e-07, | |
| "loss": 4.7423, | |
| "mean_token_accuracy": 0.6774308422207832, | |
| "num_tokens": 175836938.0, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.9966479645795615, | |
| "eval_loss": 1.1805065870285034, | |
| "eval_mean_token_accuracy": 0.6777116668224334, | |
| "eval_num_tokens": 175836938.0, | |
| "eval_runtime": 55.8266, | |
| "eval_samples_per_second": 7.165, | |
| "eval_steps_per_second": 0.896, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "mean_token_accuracy": 0.677863019991686, | |
| "num_tokens": 176425692.0, | |
| "step": 13947, | |
| "total_flos": 5.688174665250246e+18, | |
| "train_loss": 4.990768942446951, | |
| "train_runtime": 203046.5495, | |
| "train_samples_per_second": 2.198, | |
| "train_steps_per_second": 0.069 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 13947, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.688174665250246e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |