| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 2562, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00117096018735363, | |
| "grad_norm": 3.2623555660247803, | |
| "learning_rate": 1.5503875968992249e-07, | |
| "loss": 0.591235339641571, | |
| "memory(GiB)": 130.94, | |
| "step": 1, | |
| "token_acc": 0.8447411307509088, | |
| "train_speed(iter/s)": 0.019417 | |
| }, | |
| { | |
| "epoch": 0.00585480093676815, | |
| "grad_norm": 3.0421900749206543, | |
| "learning_rate": 7.751937984496125e-07, | |
| "loss": 0.5568965077400208, | |
| "memory(GiB)": 131.94, | |
| "step": 5, | |
| "token_acc": 0.8460283319735964, | |
| "train_speed(iter/s)": 0.043842 | |
| }, | |
| { | |
| "epoch": 0.0117096018735363, | |
| "grad_norm": 2.8613500595092773, | |
| "learning_rate": 1.550387596899225e-06, | |
| "loss": 0.5551010131835937, | |
| "memory(GiB)": 131.94, | |
| "step": 10, | |
| "token_acc": 0.8356083810191254, | |
| "train_speed(iter/s)": 0.052588 | |
| }, | |
| { | |
| "epoch": 0.01756440281030445, | |
| "grad_norm": 2.1271445751190186, | |
| "learning_rate": 2.3255813953488376e-06, | |
| "loss": 0.5327572345733642, | |
| "memory(GiB)": 132.78, | |
| "step": 15, | |
| "token_acc": 0.8394243615921753, | |
| "train_speed(iter/s)": 0.057201 | |
| }, | |
| { | |
| "epoch": 0.0234192037470726, | |
| "grad_norm": 1.0847800970077515, | |
| "learning_rate": 3.10077519379845e-06, | |
| "loss": 0.46837658882141114, | |
| "memory(GiB)": 132.78, | |
| "step": 20, | |
| "token_acc": 0.8431611509225723, | |
| "train_speed(iter/s)": 0.059425 | |
| }, | |
| { | |
| "epoch": 0.02927400468384075, | |
| "grad_norm": 0.8750381469726562, | |
| "learning_rate": 3.875968992248063e-06, | |
| "loss": 0.44452896118164065, | |
| "memory(GiB)": 132.78, | |
| "step": 25, | |
| "token_acc": 0.8543377731613794, | |
| "train_speed(iter/s)": 0.060974 | |
| }, | |
| { | |
| "epoch": 0.0351288056206089, | |
| "grad_norm": 0.5234003663063049, | |
| "learning_rate": 4.651162790697675e-06, | |
| "loss": 0.42905311584472655, | |
| "memory(GiB)": 132.78, | |
| "step": 30, | |
| "token_acc": 0.8542961149814849, | |
| "train_speed(iter/s)": 0.062206 | |
| }, | |
| { | |
| "epoch": 0.040983606557377046, | |
| "grad_norm": 0.5018875598907471, | |
| "learning_rate": 5.4263565891472865e-06, | |
| "loss": 0.4269443988800049, | |
| "memory(GiB)": 132.78, | |
| "step": 35, | |
| "token_acc": 0.8522571433054558, | |
| "train_speed(iter/s)": 0.063069 | |
| }, | |
| { | |
| "epoch": 0.0468384074941452, | |
| "grad_norm": 0.38381680846214294, | |
| "learning_rate": 6.2015503875969e-06, | |
| "loss": 0.40071582794189453, | |
| "memory(GiB)": 132.78, | |
| "step": 40, | |
| "token_acc": 0.8555349764923779, | |
| "train_speed(iter/s)": 0.063736 | |
| }, | |
| { | |
| "epoch": 0.05269320843091335, | |
| "grad_norm": 0.3016009032726288, | |
| "learning_rate": 6.976744186046513e-06, | |
| "loss": 0.4113297462463379, | |
| "memory(GiB)": 132.78, | |
| "step": 45, | |
| "token_acc": 0.8491327275191562, | |
| "train_speed(iter/s)": 0.064231 | |
| }, | |
| { | |
| "epoch": 0.0585480093676815, | |
| "grad_norm": 0.2976464331150055, | |
| "learning_rate": 7.751937984496126e-06, | |
| "loss": 0.4019885540008545, | |
| "memory(GiB)": 132.78, | |
| "step": 50, | |
| "token_acc": 0.8631093056438779, | |
| "train_speed(iter/s)": 0.064655 | |
| }, | |
| { | |
| "epoch": 0.06440281030444965, | |
| "grad_norm": 0.24487970769405365, | |
| "learning_rate": 8.527131782945736e-06, | |
| "loss": 0.3938943386077881, | |
| "memory(GiB)": 132.78, | |
| "step": 55, | |
| "token_acc": 0.8631337386589192, | |
| "train_speed(iter/s)": 0.065047 | |
| }, | |
| { | |
| "epoch": 0.0702576112412178, | |
| "grad_norm": 0.23692984879016876, | |
| "learning_rate": 9.30232558139535e-06, | |
| "loss": 0.41377553939819334, | |
| "memory(GiB)": 132.97, | |
| "step": 60, | |
| "token_acc": 0.8609983103219724, | |
| "train_speed(iter/s)": 0.065302 | |
| }, | |
| { | |
| "epoch": 0.07611241217798595, | |
| "grad_norm": 0.23579329252243042, | |
| "learning_rate": 1.0077519379844963e-05, | |
| "loss": 0.3947890758514404, | |
| "memory(GiB)": 132.97, | |
| "step": 65, | |
| "token_acc": 0.8729405159237655, | |
| "train_speed(iter/s)": 0.065577 | |
| }, | |
| { | |
| "epoch": 0.08196721311475409, | |
| "grad_norm": 0.2210317999124527, | |
| "learning_rate": 1.0852713178294573e-05, | |
| "loss": 0.3936769962310791, | |
| "memory(GiB)": 132.97, | |
| "step": 70, | |
| "token_acc": 0.860379465686213, | |
| "train_speed(iter/s)": 0.065756 | |
| }, | |
| { | |
| "epoch": 0.08782201405152225, | |
| "grad_norm": 0.23814593255519867, | |
| "learning_rate": 1.1627906976744187e-05, | |
| "loss": 0.39299349784851073, | |
| "memory(GiB)": 132.97, | |
| "step": 75, | |
| "token_acc": 0.8482827629927034, | |
| "train_speed(iter/s)": 0.06599 | |
| }, | |
| { | |
| "epoch": 0.0936768149882904, | |
| "grad_norm": 0.24474237859249115, | |
| "learning_rate": 1.24031007751938e-05, | |
| "loss": 0.39170591831207274, | |
| "memory(GiB)": 132.97, | |
| "step": 80, | |
| "token_acc": 0.8571129295007489, | |
| "train_speed(iter/s)": 0.066134 | |
| }, | |
| { | |
| "epoch": 0.09953161592505855, | |
| "grad_norm": 0.232538640499115, | |
| "learning_rate": 1.3178294573643412e-05, | |
| "loss": 0.3822017669677734, | |
| "memory(GiB)": 132.97, | |
| "step": 85, | |
| "token_acc": 0.8780878727095818, | |
| "train_speed(iter/s)": 0.066366 | |
| }, | |
| { | |
| "epoch": 0.1053864168618267, | |
| "grad_norm": 0.22437641024589539, | |
| "learning_rate": 1.3953488372093025e-05, | |
| "loss": 0.38762218952178956, | |
| "memory(GiB)": 132.97, | |
| "step": 90, | |
| "token_acc": 0.8614531845562612, | |
| "train_speed(iter/s)": 0.066559 | |
| }, | |
| { | |
| "epoch": 0.11124121779859485, | |
| "grad_norm": 0.22185830771923065, | |
| "learning_rate": 1.4728682170542636e-05, | |
| "loss": 0.38779487609863283, | |
| "memory(GiB)": 132.97, | |
| "step": 95, | |
| "token_acc": 0.8664786644726099, | |
| "train_speed(iter/s)": 0.066708 | |
| }, | |
| { | |
| "epoch": 0.117096018735363, | |
| "grad_norm": 0.27393871545791626, | |
| "learning_rate": 1.550387596899225e-05, | |
| "loss": 0.3883920192718506, | |
| "memory(GiB)": 132.97, | |
| "step": 100, | |
| "token_acc": 0.8665084805343176, | |
| "train_speed(iter/s)": 0.066847 | |
| }, | |
| { | |
| "epoch": 0.12295081967213115, | |
| "grad_norm": 0.22931204736232758, | |
| "learning_rate": 1.6279069767441862e-05, | |
| "loss": 0.38483271598815916, | |
| "memory(GiB)": 132.97, | |
| "step": 105, | |
| "token_acc": 0.86842660702191, | |
| "train_speed(iter/s)": 0.066999 | |
| }, | |
| { | |
| "epoch": 0.1288056206088993, | |
| "grad_norm": 0.24479679763317108, | |
| "learning_rate": 1.7054263565891473e-05, | |
| "loss": 0.37220172882080077, | |
| "memory(GiB)": 132.97, | |
| "step": 110, | |
| "token_acc": 0.867410052595701, | |
| "train_speed(iter/s)": 0.067112 | |
| }, | |
| { | |
| "epoch": 0.13466042154566746, | |
| "grad_norm": 0.2648003101348877, | |
| "learning_rate": 1.7829457364341087e-05, | |
| "loss": 0.39059298038482665, | |
| "memory(GiB)": 132.97, | |
| "step": 115, | |
| "token_acc": 0.8748554193704952, | |
| "train_speed(iter/s)": 0.067218 | |
| }, | |
| { | |
| "epoch": 0.1405152224824356, | |
| "grad_norm": 0.26005980372428894, | |
| "learning_rate": 1.86046511627907e-05, | |
| "loss": 0.3818374156951904, | |
| "memory(GiB)": 132.97, | |
| "step": 120, | |
| "token_acc": 0.8672951527027911, | |
| "train_speed(iter/s)": 0.06731 | |
| }, | |
| { | |
| "epoch": 0.14637002341920374, | |
| "grad_norm": 0.25006258487701416, | |
| "learning_rate": 1.937984496124031e-05, | |
| "loss": 0.3956636428833008, | |
| "memory(GiB)": 132.97, | |
| "step": 125, | |
| "token_acc": 0.8609716918038115, | |
| "train_speed(iter/s)": 0.067383 | |
| }, | |
| { | |
| "epoch": 0.1522248243559719, | |
| "grad_norm": 0.2747514545917511, | |
| "learning_rate": 1.9999991663467044e-05, | |
| "loss": 0.3932375907897949, | |
| "memory(GiB)": 132.97, | |
| "step": 130, | |
| "token_acc": 0.8660186100028765, | |
| "train_speed(iter/s)": 0.06745 | |
| }, | |
| { | |
| "epoch": 0.15807962529274006, | |
| "grad_norm": 0.2641543745994568, | |
| "learning_rate": 1.9999699886272926e-05, | |
| "loss": 0.39503839015960696, | |
| "memory(GiB)": 132.97, | |
| "step": 135, | |
| "token_acc": 0.8533355723899442, | |
| "train_speed(iter/s)": 0.067497 | |
| }, | |
| { | |
| "epoch": 0.16393442622950818, | |
| "grad_norm": 0.2637743353843689, | |
| "learning_rate": 1.9998991296330317e-05, | |
| "loss": 0.39163637161254883, | |
| "memory(GiB)": 132.97, | |
| "step": 140, | |
| "token_acc": 0.8673312165879645, | |
| "train_speed(iter/s)": 0.067529 | |
| }, | |
| { | |
| "epoch": 0.16978922716627634, | |
| "grad_norm": 0.2526402175426483, | |
| "learning_rate": 1.9997865923175027e-05, | |
| "loss": 0.3822649002075195, | |
| "memory(GiB)": 132.97, | |
| "step": 145, | |
| "token_acc": 0.8725527891092668, | |
| "train_speed(iter/s)": 0.067555 | |
| }, | |
| { | |
| "epoch": 0.1756440281030445, | |
| "grad_norm": 0.2798239588737488, | |
| "learning_rate": 1.999632381371545e-05, | |
| "loss": 0.388509464263916, | |
| "memory(GiB)": 133.05, | |
| "step": 150, | |
| "token_acc": 0.8570005695948406, | |
| "train_speed(iter/s)": 0.067614 | |
| }, | |
| { | |
| "epoch": 0.18149882903981265, | |
| "grad_norm": 0.24978382885456085, | |
| "learning_rate": 1.999436503223061e-05, | |
| "loss": 0.38669638633728026, | |
| "memory(GiB)": 133.05, | |
| "step": 155, | |
| "token_acc": 0.8657276078873382, | |
| "train_speed(iter/s)": 0.067633 | |
| }, | |
| { | |
| "epoch": 0.1873536299765808, | |
| "grad_norm": 0.2820796072483063, | |
| "learning_rate": 1.9991989660367463e-05, | |
| "loss": 0.39322915077209475, | |
| "memory(GiB)": 133.05, | |
| "step": 160, | |
| "token_acc": 0.8509120957934454, | |
| "train_speed(iter/s)": 0.06766 | |
| }, | |
| { | |
| "epoch": 0.19320843091334894, | |
| "grad_norm": 0.25325024127960205, | |
| "learning_rate": 1.998919779713751e-05, | |
| "loss": 0.3963874578475952, | |
| "memory(GiB)": 133.05, | |
| "step": 165, | |
| "token_acc": 0.8568015157690381, | |
| "train_speed(iter/s)": 0.067694 | |
| }, | |
| { | |
| "epoch": 0.1990632318501171, | |
| "grad_norm": 0.23693059384822845, | |
| "learning_rate": 1.998598955891266e-05, | |
| "loss": 0.3861080169677734, | |
| "memory(GiB)": 133.05, | |
| "step": 170, | |
| "token_acc": 0.8704777077082435, | |
| "train_speed(iter/s)": 0.067738 | |
| }, | |
| { | |
| "epoch": 0.20491803278688525, | |
| "grad_norm": 0.24995002150535583, | |
| "learning_rate": 1.9982365079420382e-05, | |
| "loss": 0.3748037338256836, | |
| "memory(GiB)": 133.05, | |
| "step": 175, | |
| "token_acc": 0.8639677636839712, | |
| "train_speed(iter/s)": 0.06777 | |
| }, | |
| { | |
| "epoch": 0.2107728337236534, | |
| "grad_norm": 0.2528163492679596, | |
| "learning_rate": 1.9978324509738147e-05, | |
| "loss": 0.37778520584106445, | |
| "memory(GiB)": 133.05, | |
| "step": 180, | |
| "token_acc": 0.8692558237224801, | |
| "train_speed(iter/s)": 0.067808 | |
| }, | |
| { | |
| "epoch": 0.21662763466042154, | |
| "grad_norm": 0.26185593008995056, | |
| "learning_rate": 1.9973868018287093e-05, | |
| "loss": 0.37712826728820803, | |
| "memory(GiB)": 133.05, | |
| "step": 185, | |
| "token_acc": 0.8629621624330818, | |
| "train_speed(iter/s)": 0.067862 | |
| }, | |
| { | |
| "epoch": 0.2224824355971897, | |
| "grad_norm": 0.2565723955631256, | |
| "learning_rate": 1.9968995790825048e-05, | |
| "loss": 0.38217387199401853, | |
| "memory(GiB)": 133.05, | |
| "step": 190, | |
| "token_acc": 0.8526548122357622, | |
| "train_speed(iter/s)": 0.06787 | |
| }, | |
| { | |
| "epoch": 0.22833723653395785, | |
| "grad_norm": 0.24071918427944183, | |
| "learning_rate": 1.9963708030438754e-05, | |
| "loss": 0.38128018379211426, | |
| "memory(GiB)": 133.05, | |
| "step": 195, | |
| "token_acc": 0.86564623713995, | |
| "train_speed(iter/s)": 0.067888 | |
| }, | |
| { | |
| "epoch": 0.234192037470726, | |
| "grad_norm": 0.2468400001525879, | |
| "learning_rate": 1.995800495753542e-05, | |
| "loss": 0.38081438541412355, | |
| "memory(GiB)": 133.05, | |
| "step": 200, | |
| "token_acc": 0.8573196660493942, | |
| "train_speed(iter/s)": 0.0679 | |
| }, | |
| { | |
| "epoch": 0.24004683840749413, | |
| "grad_norm": 0.24025513231754303, | |
| "learning_rate": 1.9951886809833537e-05, | |
| "loss": 0.39122610092163085, | |
| "memory(GiB)": 133.05, | |
| "step": 205, | |
| "token_acc": 0.8568699202170693, | |
| "train_speed(iter/s)": 0.067926 | |
| }, | |
| { | |
| "epoch": 0.2459016393442623, | |
| "grad_norm": 0.262650728225708, | |
| "learning_rate": 1.9945353842352943e-05, | |
| "loss": 0.38733615875244143, | |
| "memory(GiB)": 133.05, | |
| "step": 210, | |
| "token_acc": 0.8605185069498672, | |
| "train_speed(iter/s)": 0.067945 | |
| }, | |
| { | |
| "epoch": 0.25175644028103045, | |
| "grad_norm": 0.2334696501493454, | |
| "learning_rate": 1.9938406327404233e-05, | |
| "loss": 0.38346500396728517, | |
| "memory(GiB)": 133.05, | |
| "step": 215, | |
| "token_acc": 0.8689892435384466, | |
| "train_speed(iter/s)": 0.067989 | |
| }, | |
| { | |
| "epoch": 0.2576112412177986, | |
| "grad_norm": 0.2296629697084427, | |
| "learning_rate": 1.9931044554577373e-05, | |
| "loss": 0.3805164575576782, | |
| "memory(GiB)": 133.05, | |
| "step": 220, | |
| "token_acc": 0.862054141615526, | |
| "train_speed(iter/s)": 0.068004 | |
| }, | |
| { | |
| "epoch": 0.26346604215456676, | |
| "grad_norm": 0.2337953746318817, | |
| "learning_rate": 1.992326883072965e-05, | |
| "loss": 0.38329010009765624, | |
| "memory(GiB)": 133.05, | |
| "step": 225, | |
| "token_acc": 0.8576421234268423, | |
| "train_speed(iter/s)": 0.068016 | |
| }, | |
| { | |
| "epoch": 0.2693208430913349, | |
| "grad_norm": 0.22751180827617645, | |
| "learning_rate": 1.991507947997287e-05, | |
| "loss": 0.3914541244506836, | |
| "memory(GiB)": 133.05, | |
| "step": 230, | |
| "token_acc": 0.8647305257189656, | |
| "train_speed(iter/s)": 0.068037 | |
| }, | |
| { | |
| "epoch": 0.275175644028103, | |
| "grad_norm": 0.23834733664989471, | |
| "learning_rate": 1.9906476843659866e-05, | |
| "loss": 0.3868813753128052, | |
| "memory(GiB)": 133.05, | |
| "step": 235, | |
| "token_acc": 0.8718037707532127, | |
| "train_speed(iter/s)": 0.068062 | |
| }, | |
| { | |
| "epoch": 0.2810304449648712, | |
| "grad_norm": 0.2157682329416275, | |
| "learning_rate": 1.989746128037024e-05, | |
| "loss": 0.3725996971130371, | |
| "memory(GiB)": 133.05, | |
| "step": 240, | |
| "token_acc": 0.8637500196081507, | |
| "train_speed(iter/s)": 0.068079 | |
| }, | |
| { | |
| "epoch": 0.28688524590163933, | |
| "grad_norm": 0.24432708323001862, | |
| "learning_rate": 1.988803316589545e-05, | |
| "loss": 0.38200843334198, | |
| "memory(GiB)": 133.05, | |
| "step": 245, | |
| "token_acc": 0.863402893772779, | |
| "train_speed(iter/s)": 0.068119 | |
| }, | |
| { | |
| "epoch": 0.2927400468384075, | |
| "grad_norm": 0.22754515707492828, | |
| "learning_rate": 1.987819289322311e-05, | |
| "loss": 0.38454749584198, | |
| "memory(GiB)": 133.05, | |
| "step": 250, | |
| "token_acc": 0.8616220657129776, | |
| "train_speed(iter/s)": 0.068158 | |
| }, | |
| { | |
| "epoch": 0.29859484777517564, | |
| "grad_norm": 0.22906067967414856, | |
| "learning_rate": 1.9867940872520646e-05, | |
| "loss": 0.38929970264434816, | |
| "memory(GiB)": 133.05, | |
| "step": 255, | |
| "token_acc": 0.862697854653979, | |
| "train_speed(iter/s)": 0.068143 | |
| }, | |
| { | |
| "epoch": 0.3044496487119438, | |
| "grad_norm": 0.2391372174024582, | |
| "learning_rate": 1.9857277531118173e-05, | |
| "loss": 0.38328697681427004, | |
| "memory(GiB)": 133.05, | |
| "step": 260, | |
| "token_acc": 0.875577325482754, | |
| "train_speed(iter/s)": 0.068151 | |
| }, | |
| { | |
| "epoch": 0.31030444964871196, | |
| "grad_norm": 0.23862990736961365, | |
| "learning_rate": 1.9846203313490697e-05, | |
| "loss": 0.3745781660079956, | |
| "memory(GiB)": 133.05, | |
| "step": 265, | |
| "token_acc": 0.8789255692291267, | |
| "train_speed(iter/s)": 0.068172 | |
| }, | |
| { | |
| "epoch": 0.3161592505854801, | |
| "grad_norm": 0.2886284291744232, | |
| "learning_rate": 1.983471868123958e-05, | |
| "loss": 0.37299673557281493, | |
| "memory(GiB)": 133.05, | |
| "step": 270, | |
| "token_acc": 0.8619748050993121, | |
| "train_speed(iter/s)": 0.068214 | |
| }, | |
| { | |
| "epoch": 0.32201405152224827, | |
| "grad_norm": 0.25015807151794434, | |
| "learning_rate": 1.98228241130733e-05, | |
| "loss": 0.39740839004516604, | |
| "memory(GiB)": 133.05, | |
| "step": 275, | |
| "token_acc": 0.8667058589327261, | |
| "train_speed(iter/s)": 0.068226 | |
| }, | |
| { | |
| "epoch": 0.32786885245901637, | |
| "grad_norm": 0.22695152461528778, | |
| "learning_rate": 1.98105201047875e-05, | |
| "loss": 0.3711256980895996, | |
| "memory(GiB)": 133.05, | |
| "step": 280, | |
| "token_acc": 0.8709827404894823, | |
| "train_speed(iter/s)": 0.068264 | |
| }, | |
| { | |
| "epoch": 0.3337236533957845, | |
| "grad_norm": 0.25948262214660645, | |
| "learning_rate": 1.9797807169244326e-05, | |
| "loss": 0.376755690574646, | |
| "memory(GiB)": 133.05, | |
| "step": 285, | |
| "token_acc": 0.8627933786950365, | |
| "train_speed(iter/s)": 0.068275 | |
| }, | |
| { | |
| "epoch": 0.3395784543325527, | |
| "grad_norm": 0.2252376824617386, | |
| "learning_rate": 1.9784685836351045e-05, | |
| "loss": 0.3907461166381836, | |
| "memory(GiB)": 133.05, | |
| "step": 290, | |
| "token_acc": 0.8594050471419237, | |
| "train_speed(iter/s)": 0.068273 | |
| }, | |
| { | |
| "epoch": 0.34543325526932084, | |
| "grad_norm": 0.2580513656139374, | |
| "learning_rate": 1.9771156653037944e-05, | |
| "loss": 0.38218297958374026, | |
| "memory(GiB)": 133.05, | |
| "step": 295, | |
| "token_acc": 0.8619312594063512, | |
| "train_speed(iter/s)": 0.068289 | |
| }, | |
| { | |
| "epoch": 0.351288056206089, | |
| "grad_norm": 0.21899765729904175, | |
| "learning_rate": 1.975722018323556e-05, | |
| "loss": 0.3749994277954102, | |
| "memory(GiB)": 133.05, | |
| "step": 300, | |
| "token_acc": 0.8698979752198593, | |
| "train_speed(iter/s)": 0.068281 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.2238709181547165, | |
| "learning_rate": 1.974287700785116e-05, | |
| "loss": 0.37110333442687987, | |
| "memory(GiB)": 133.05, | |
| "step": 305, | |
| "token_acc": 0.8662300629837371, | |
| "train_speed(iter/s)": 0.0683 | |
| }, | |
| { | |
| "epoch": 0.3629976580796253, | |
| "grad_norm": 0.24307382106781006, | |
| "learning_rate": 1.9728127724744516e-05, | |
| "loss": 0.36276865005493164, | |
| "memory(GiB)": 133.05, | |
| "step": 310, | |
| "token_acc": 0.8663095601853296, | |
| "train_speed(iter/s)": 0.068292 | |
| }, | |
| { | |
| "epoch": 0.36885245901639346, | |
| "grad_norm": 0.2318965196609497, | |
| "learning_rate": 1.9712972948703006e-05, | |
| "loss": 0.38519649505615233, | |
| "memory(GiB)": 133.05, | |
| "step": 315, | |
| "token_acc": 0.8719233901258103, | |
| "train_speed(iter/s)": 0.068305 | |
| }, | |
| { | |
| "epoch": 0.3747072599531616, | |
| "grad_norm": 0.22240430116653442, | |
| "learning_rate": 1.9697413311415967e-05, | |
| "loss": 0.3795146465301514, | |
| "memory(GiB)": 133.05, | |
| "step": 320, | |
| "token_acc": 0.8447559871358541, | |
| "train_speed(iter/s)": 0.06832 | |
| }, | |
| { | |
| "epoch": 0.3805620608899297, | |
| "grad_norm": 0.21727585792541504, | |
| "learning_rate": 1.9681449461448386e-05, | |
| "loss": 0.37623322010040283, | |
| "memory(GiB)": 133.05, | |
| "step": 325, | |
| "token_acc": 0.868092485549133, | |
| "train_speed(iter/s)": 0.068321 | |
| }, | |
| { | |
| "epoch": 0.3864168618266979, | |
| "grad_norm": 0.24871428310871124, | |
| "learning_rate": 1.9665082064213856e-05, | |
| "loss": 0.3804615497589111, | |
| "memory(GiB)": 133.05, | |
| "step": 330, | |
| "token_acc": 0.8632469719807496, | |
| "train_speed(iter/s)": 0.06834 | |
| }, | |
| { | |
| "epoch": 0.39227166276346603, | |
| "grad_norm": 0.2242128700017929, | |
| "learning_rate": 1.9648311801946823e-05, | |
| "loss": 0.37839736938476565, | |
| "memory(GiB)": 133.05, | |
| "step": 335, | |
| "token_acc": 0.8620339267458229, | |
| "train_speed(iter/s)": 0.068356 | |
| }, | |
| { | |
| "epoch": 0.3981264637002342, | |
| "grad_norm": 0.23243097960948944, | |
| "learning_rate": 1.9631139373674188e-05, | |
| "loss": 0.3759917736053467, | |
| "memory(GiB)": 133.05, | |
| "step": 340, | |
| "token_acc": 0.8630340491154014, | |
| "train_speed(iter/s)": 0.068364 | |
| }, | |
| { | |
| "epoch": 0.40398126463700235, | |
| "grad_norm": 0.2167743444442749, | |
| "learning_rate": 1.9613565495186126e-05, | |
| "loss": 0.36579113006591796, | |
| "memory(GiB)": 133.05, | |
| "step": 345, | |
| "token_acc": 0.8630803983851985, | |
| "train_speed(iter/s)": 0.068389 | |
| }, | |
| { | |
| "epoch": 0.4098360655737705, | |
| "grad_norm": 0.2554558515548706, | |
| "learning_rate": 1.9595590899006288e-05, | |
| "loss": 0.3840445280075073, | |
| "memory(GiB)": 133.05, | |
| "step": 350, | |
| "token_acc": 0.8682752142033024, | |
| "train_speed(iter/s)": 0.06839 | |
| }, | |
| { | |
| "epoch": 0.41569086651053866, | |
| "grad_norm": 0.23864524066448212, | |
| "learning_rate": 1.957721633436124e-05, | |
| "loss": 0.3817277908325195, | |
| "memory(GiB)": 133.05, | |
| "step": 355, | |
| "token_acc": 0.8645090065366, | |
| "train_speed(iter/s)": 0.068384 | |
| }, | |
| { | |
| "epoch": 0.4215456674473068, | |
| "grad_norm": 0.25255629420280457, | |
| "learning_rate": 1.9558442567149244e-05, | |
| "loss": 0.3791682720184326, | |
| "memory(GiB)": 133.05, | |
| "step": 360, | |
| "token_acc": 0.8775300258130478, | |
| "train_speed(iter/s)": 0.068404 | |
| }, | |
| { | |
| "epoch": 0.4274004683840749, | |
| "grad_norm": 0.2247135043144226, | |
| "learning_rate": 1.953927037990834e-05, | |
| "loss": 0.3860400915145874, | |
| "memory(GiB)": 133.05, | |
| "step": 365, | |
| "token_acc": 0.8536377662766984, | |
| "train_speed(iter/s)": 0.068414 | |
| }, | |
| { | |
| "epoch": 0.4332552693208431, | |
| "grad_norm": 0.29746949672698975, | |
| "learning_rate": 1.9519700571783718e-05, | |
| "loss": 0.3866363763809204, | |
| "memory(GiB)": 133.05, | |
| "step": 370, | |
| "token_acc": 0.8695576843716825, | |
| "train_speed(iter/s)": 0.0684 | |
| }, | |
| { | |
| "epoch": 0.43911007025761123, | |
| "grad_norm": 0.23039910197257996, | |
| "learning_rate": 1.9499733958494405e-05, | |
| "loss": 0.38268446922302246, | |
| "memory(GiB)": 133.05, | |
| "step": 375, | |
| "token_acc": 0.8581758827531537, | |
| "train_speed(iter/s)": 0.068407 | |
| }, | |
| { | |
| "epoch": 0.4449648711943794, | |
| "grad_norm": 0.23166924715042114, | |
| "learning_rate": 1.947937137229928e-05, | |
| "loss": 0.37559897899627687, | |
| "memory(GiB)": 133.05, | |
| "step": 380, | |
| "token_acc": 0.8744556465509139, | |
| "train_speed(iter/s)": 0.068418 | |
| }, | |
| { | |
| "epoch": 0.45081967213114754, | |
| "grad_norm": 0.22437815368175507, | |
| "learning_rate": 1.9458613661962366e-05, | |
| "loss": 0.37695770263671874, | |
| "memory(GiB)": 133.05, | |
| "step": 385, | |
| "token_acc": 0.8771398753952836, | |
| "train_speed(iter/s)": 0.068428 | |
| }, | |
| { | |
| "epoch": 0.4566744730679157, | |
| "grad_norm": 0.23045028746128082, | |
| "learning_rate": 1.943746169271746e-05, | |
| "loss": 0.37760295867919924, | |
| "memory(GiB)": 133.05, | |
| "step": 390, | |
| "token_acc": 0.8759578109502548, | |
| "train_speed(iter/s)": 0.068419 | |
| }, | |
| { | |
| "epoch": 0.46252927400468385, | |
| "grad_norm": 0.21340611577033997, | |
| "learning_rate": 1.941591634623206e-05, | |
| "loss": 0.38206305503845217, | |
| "memory(GiB)": 133.05, | |
| "step": 395, | |
| "token_acc": 0.8683378180616532, | |
| "train_speed(iter/s)": 0.068433 | |
| }, | |
| { | |
| "epoch": 0.468384074941452, | |
| "grad_norm": 0.2345254123210907, | |
| "learning_rate": 1.9393978520570638e-05, | |
| "loss": 0.3681832790374756, | |
| "memory(GiB)": 133.05, | |
| "step": 400, | |
| "token_acc": 0.8685244618395304, | |
| "train_speed(iter/s)": 0.068457 | |
| }, | |
| { | |
| "epoch": 0.47423887587822017, | |
| "grad_norm": 0.23758217692375183, | |
| "learning_rate": 1.9371649130157166e-05, | |
| "loss": 0.36426939964294436, | |
| "memory(GiB)": 133.05, | |
| "step": 405, | |
| "token_acc": 0.8676219452965636, | |
| "train_speed(iter/s)": 0.068464 | |
| }, | |
| { | |
| "epoch": 0.48009367681498827, | |
| "grad_norm": 0.2363872230052948, | |
| "learning_rate": 1.9348929105737044e-05, | |
| "loss": 0.37017192840576174, | |
| "memory(GiB)": 133.05, | |
| "step": 410, | |
| "token_acc": 0.8679473812363037, | |
| "train_speed(iter/s)": 0.068468 | |
| }, | |
| { | |
| "epoch": 0.4859484777517564, | |
| "grad_norm": 0.24642601609230042, | |
| "learning_rate": 1.932581939433827e-05, | |
| "loss": 0.38428258895874023, | |
| "memory(GiB)": 133.05, | |
| "step": 415, | |
| "token_acc": 0.8687720441289789, | |
| "train_speed(iter/s)": 0.06847 | |
| }, | |
| { | |
| "epoch": 0.4918032786885246, | |
| "grad_norm": 0.2268989235162735, | |
| "learning_rate": 1.9302320959231997e-05, | |
| "loss": 0.37460167407989503, | |
| "memory(GiB)": 133.05, | |
| "step": 420, | |
| "token_acc": 0.8672426525809843, | |
| "train_speed(iter/s)": 0.068479 | |
| }, | |
| { | |
| "epoch": 0.49765807962529274, | |
| "grad_norm": 0.21185266971588135, | |
| "learning_rate": 1.927843477989234e-05, | |
| "loss": 0.37124834060668943, | |
| "memory(GiB)": 133.05, | |
| "step": 425, | |
| "token_acc": 0.8814642777451279, | |
| "train_speed(iter/s)": 0.068488 | |
| }, | |
| { | |
| "epoch": 0.5035128805620609, | |
| "grad_norm": 0.21971659362316132, | |
| "learning_rate": 1.9254161851955587e-05, | |
| "loss": 0.3843217849731445, | |
| "memory(GiB)": 133.05, | |
| "step": 430, | |
| "token_acc": 0.8714790057188723, | |
| "train_speed(iter/s)": 0.068499 | |
| }, | |
| { | |
| "epoch": 0.509367681498829, | |
| "grad_norm": 0.26225098967552185, | |
| "learning_rate": 1.9229503187178694e-05, | |
| "loss": 0.3771937370300293, | |
| "memory(GiB)": 133.05, | |
| "step": 435, | |
| "token_acc": 0.8658792102647854, | |
| "train_speed(iter/s)": 0.068506 | |
| }, | |
| { | |
| "epoch": 0.5152224824355972, | |
| "grad_norm": 0.23551629483699799, | |
| "learning_rate": 1.920445981339708e-05, | |
| "loss": 0.37624967098236084, | |
| "memory(GiB)": 133.05, | |
| "step": 440, | |
| "token_acc": 0.8641905035935222, | |
| "train_speed(iter/s)": 0.068518 | |
| }, | |
| { | |
| "epoch": 0.5210772833723654, | |
| "grad_norm": 0.25343942642211914, | |
| "learning_rate": 1.9179032774481822e-05, | |
| "loss": 0.37384233474731443, | |
| "memory(GiB)": 133.05, | |
| "step": 445, | |
| "token_acc": 0.8723531724486548, | |
| "train_speed(iter/s)": 0.068533 | |
| }, | |
| { | |
| "epoch": 0.5269320843091335, | |
| "grad_norm": 0.22508122026920319, | |
| "learning_rate": 1.9153223130296125e-05, | |
| "loss": 0.3715523719787598, | |
| "memory(GiB)": 133.05, | |
| "step": 450, | |
| "token_acc": 0.8742618455654583, | |
| "train_speed(iter/s)": 0.068547 | |
| }, | |
| { | |
| "epoch": 0.5327868852459017, | |
| "grad_norm": 0.2273603081703186, | |
| "learning_rate": 1.9127031956651153e-05, | |
| "loss": 0.3753758192062378, | |
| "memory(GiB)": 133.05, | |
| "step": 455, | |
| "token_acc": 0.8717887326571352, | |
| "train_speed(iter/s)": 0.068556 | |
| }, | |
| { | |
| "epoch": 0.5386416861826698, | |
| "grad_norm": 0.24021831154823303, | |
| "learning_rate": 1.9100460345261175e-05, | |
| "loss": 0.3885939598083496, | |
| "memory(GiB)": 133.05, | |
| "step": 460, | |
| "token_acc": 0.8648985264452413, | |
| "train_speed(iter/s)": 0.068545 | |
| }, | |
| { | |
| "epoch": 0.544496487119438, | |
| "grad_norm": 0.25094419717788696, | |
| "learning_rate": 1.9073509403698062e-05, | |
| "loss": 0.3836202621459961, | |
| "memory(GiB)": 133.05, | |
| "step": 465, | |
| "token_acc": 0.8716216427648316, | |
| "train_speed(iter/s)": 0.068548 | |
| }, | |
| { | |
| "epoch": 0.550351288056206, | |
| "grad_norm": 0.2209528684616089, | |
| "learning_rate": 1.9046180255345142e-05, | |
| "loss": 0.3783407688140869, | |
| "memory(GiB)": 133.05, | |
| "step": 470, | |
| "token_acc": 0.871262499689834, | |
| "train_speed(iter/s)": 0.068558 | |
| }, | |
| { | |
| "epoch": 0.5562060889929742, | |
| "grad_norm": 0.2333252876996994, | |
| "learning_rate": 1.9018474039350342e-05, | |
| "loss": 0.37140965461730957, | |
| "memory(GiB)": 133.05, | |
| "step": 475, | |
| "token_acc": 0.870434477460474, | |
| "train_speed(iter/s)": 0.068559 | |
| }, | |
| { | |
| "epoch": 0.5620608899297423, | |
| "grad_norm": 0.22321061789989471, | |
| "learning_rate": 1.899039191057872e-05, | |
| "loss": 0.3732731819152832, | |
| "memory(GiB)": 133.05, | |
| "step": 480, | |
| "token_acc": 0.8651847926051782, | |
| "train_speed(iter/s)": 0.06856 | |
| }, | |
| { | |
| "epoch": 0.5679156908665105, | |
| "grad_norm": 0.24292093515396118, | |
| "learning_rate": 1.8961935039564338e-05, | |
| "loss": 0.3720050096511841, | |
| "memory(GiB)": 133.05, | |
| "step": 485, | |
| "token_acc": 0.8644098695583844, | |
| "train_speed(iter/s)": 0.06857 | |
| }, | |
| { | |
| "epoch": 0.5737704918032787, | |
| "grad_norm": 0.25076785683631897, | |
| "learning_rate": 1.8933104612461454e-05, | |
| "loss": 0.37432427406311036, | |
| "memory(GiB)": 133.05, | |
| "step": 490, | |
| "token_acc": 0.865598108538928, | |
| "train_speed(iter/s)": 0.068571 | |
| }, | |
| { | |
| "epoch": 0.5796252927400468, | |
| "grad_norm": 0.2353287786245346, | |
| "learning_rate": 1.8903901830995093e-05, | |
| "loss": 0.37787389755249023, | |
| "memory(GiB)": 133.05, | |
| "step": 495, | |
| "token_acc": 0.8628752281343229, | |
| "train_speed(iter/s)": 0.068571 | |
| }, | |
| { | |
| "epoch": 0.585480093676815, | |
| "grad_norm": 0.23301288485527039, | |
| "learning_rate": 1.8874327912410945e-05, | |
| "loss": 0.3894960880279541, | |
| "memory(GiB)": 133.05, | |
| "step": 500, | |
| "token_acc": 0.8649986209317486, | |
| "train_speed(iter/s)": 0.068584 | |
| }, | |
| { | |
| "epoch": 0.5913348946135831, | |
| "grad_norm": 0.23387756943702698, | |
| "learning_rate": 1.884438408942463e-05, | |
| "loss": 0.37682523727416994, | |
| "memory(GiB)": 133.05, | |
| "step": 505, | |
| "token_acc": 0.8542796019209774, | |
| "train_speed(iter/s)": 0.068582 | |
| }, | |
| { | |
| "epoch": 0.5971896955503513, | |
| "grad_norm": 0.2101481854915619, | |
| "learning_rate": 1.881407161017033e-05, | |
| "loss": 0.3712585210800171, | |
| "memory(GiB)": 133.05, | |
| "step": 510, | |
| "token_acc": 0.8757052407221665, | |
| "train_speed(iter/s)": 0.068594 | |
| }, | |
| { | |
| "epoch": 0.6030444964871194, | |
| "grad_norm": 0.2197055220603943, | |
| "learning_rate": 1.8783391738148738e-05, | |
| "loss": 0.3659008026123047, | |
| "memory(GiB)": 133.05, | |
| "step": 515, | |
| "token_acc": 0.8690927312016535, | |
| "train_speed(iter/s)": 0.068604 | |
| }, | |
| { | |
| "epoch": 0.6088992974238876, | |
| "grad_norm": 0.2129889726638794, | |
| "learning_rate": 1.875234575217441e-05, | |
| "loss": 0.36564500331878663, | |
| "memory(GiB)": 133.05, | |
| "step": 520, | |
| "token_acc": 0.8682967700230018, | |
| "train_speed(iter/s)": 0.068614 | |
| }, | |
| { | |
| "epoch": 0.6147540983606558, | |
| "grad_norm": 0.20078937709331512, | |
| "learning_rate": 1.8720934946322466e-05, | |
| "loss": 0.3801888465881348, | |
| "memory(GiB)": 133.05, | |
| "step": 525, | |
| "token_acc": 0.8619188686453682, | |
| "train_speed(iter/s)": 0.068624 | |
| }, | |
| { | |
| "epoch": 0.6206088992974239, | |
| "grad_norm": 0.20143865048885345, | |
| "learning_rate": 1.8689160629874622e-05, | |
| "loss": 0.3495650768280029, | |
| "memory(GiB)": 133.05, | |
| "step": 530, | |
| "token_acc": 0.8823268736367693, | |
| "train_speed(iter/s)": 0.068621 | |
| }, | |
| { | |
| "epoch": 0.6264637002341921, | |
| "grad_norm": 0.20651988685131073, | |
| "learning_rate": 1.865702412726465e-05, | |
| "loss": 0.36185364723205565, | |
| "memory(GiB)": 133.05, | |
| "step": 535, | |
| "token_acc": 0.879171148410336, | |
| "train_speed(iter/s)": 0.068634 | |
| }, | |
| { | |
| "epoch": 0.6323185011709602, | |
| "grad_norm": 0.2135830670595169, | |
| "learning_rate": 1.8624526778023142e-05, | |
| "loss": 0.36333141326904295, | |
| "memory(GiB)": 133.05, | |
| "step": 540, | |
| "token_acc": 0.8760890123251218, | |
| "train_speed(iter/s)": 0.068639 | |
| }, | |
| { | |
| "epoch": 0.6381733021077284, | |
| "grad_norm": 0.21670690178871155, | |
| "learning_rate": 1.85916699367217e-05, | |
| "loss": 0.36627764701843263, | |
| "memory(GiB)": 133.05, | |
| "step": 545, | |
| "token_acc": 0.8693160130902993, | |
| "train_speed(iter/s)": 0.068638 | |
| }, | |
| { | |
| "epoch": 0.6440281030444965, | |
| "grad_norm": 0.2082773894071579, | |
| "learning_rate": 1.855845497291646e-05, | |
| "loss": 0.3783770799636841, | |
| "memory(GiB)": 133.05, | |
| "step": 550, | |
| "token_acc": 0.8656727592628988, | |
| "train_speed(iter/s)": 0.068642 | |
| }, | |
| { | |
| "epoch": 0.6498829039812647, | |
| "grad_norm": 0.2064507156610489, | |
| "learning_rate": 1.8524883271091004e-05, | |
| "loss": 0.36701202392578125, | |
| "memory(GiB)": 133.05, | |
| "step": 555, | |
| "token_acc": 0.874370974788701, | |
| "train_speed(iter/s)": 0.068639 | |
| }, | |
| { | |
| "epoch": 0.6557377049180327, | |
| "grad_norm": 0.19167180359363556, | |
| "learning_rate": 1.8490956230598668e-05, | |
| "loss": 0.3856034755706787, | |
| "memory(GiB)": 133.05, | |
| "step": 560, | |
| "token_acc": 0.8677655700574375, | |
| "train_speed(iter/s)": 0.068642 | |
| }, | |
| { | |
| "epoch": 0.6615925058548009, | |
| "grad_norm": 0.22284165024757385, | |
| "learning_rate": 1.8456675265604183e-05, | |
| "loss": 0.36545207500457766, | |
| "memory(GiB)": 133.05, | |
| "step": 565, | |
| "token_acc": 0.8674683330306996, | |
| "train_speed(iter/s)": 0.068653 | |
| }, | |
| { | |
| "epoch": 0.667447306791569, | |
| "grad_norm": 0.2335020750761032, | |
| "learning_rate": 1.842204180502476e-05, | |
| "loss": 0.36900959014892576, | |
| "memory(GiB)": 133.05, | |
| "step": 570, | |
| "token_acc": 0.8763767159865549, | |
| "train_speed(iter/s)": 0.068659 | |
| }, | |
| { | |
| "epoch": 0.6733021077283372, | |
| "grad_norm": 0.2406488060951233, | |
| "learning_rate": 1.8387057292470517e-05, | |
| "loss": 0.3836709499359131, | |
| "memory(GiB)": 133.05, | |
| "step": 575, | |
| "token_acc": 0.8667227047725787, | |
| "train_speed(iter/s)": 0.068662 | |
| }, | |
| { | |
| "epoch": 0.6791569086651054, | |
| "grad_norm": 0.21748137474060059, | |
| "learning_rate": 1.8351723186184295e-05, | |
| "loss": 0.3724257707595825, | |
| "memory(GiB)": 133.05, | |
| "step": 580, | |
| "token_acc": 0.8577895654245747, | |
| "train_speed(iter/s)": 0.068669 | |
| }, | |
| { | |
| "epoch": 0.6850117096018735, | |
| "grad_norm": 0.2269269824028015, | |
| "learning_rate": 1.8316040958980896e-05, | |
| "loss": 0.3713605165481567, | |
| "memory(GiB)": 133.05, | |
| "step": 585, | |
| "token_acc": 0.8802838494896842, | |
| "train_speed(iter/s)": 0.068677 | |
| }, | |
| { | |
| "epoch": 0.6908665105386417, | |
| "grad_norm": 0.24186237156391144, | |
| "learning_rate": 1.828001209818567e-05, | |
| "loss": 0.3882193088531494, | |
| "memory(GiB)": 133.05, | |
| "step": 590, | |
| "token_acc": 0.8647495837870993, | |
| "train_speed(iter/s)": 0.068687 | |
| }, | |
| { | |
| "epoch": 0.6967213114754098, | |
| "grad_norm": 0.24182303249835968, | |
| "learning_rate": 1.8243638105572547e-05, | |
| "loss": 0.37105526924133303, | |
| "memory(GiB)": 133.05, | |
| "step": 595, | |
| "token_acc": 0.8747415704995677, | |
| "train_speed(iter/s)": 0.068693 | |
| }, | |
| { | |
| "epoch": 0.702576112412178, | |
| "grad_norm": 0.2169107049703598, | |
| "learning_rate": 1.82069204973014e-05, | |
| "loss": 0.3660942554473877, | |
| "memory(GiB)": 133.05, | |
| "step": 600, | |
| "token_acc": 0.8819129326127438, | |
| "train_speed(iter/s)": 0.068705 | |
| }, | |
| { | |
| "epoch": 0.7084309133489461, | |
| "grad_norm": 0.22826465964317322, | |
| "learning_rate": 1.816986080385489e-05, | |
| "loss": 0.38544516563415526, | |
| "memory(GiB)": 133.05, | |
| "step": 605, | |
| "token_acc": 0.850805587726625, | |
| "train_speed(iter/s)": 0.068706 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.22054514288902283, | |
| "learning_rate": 1.813246056997465e-05, | |
| "loss": 0.36968698501586916, | |
| "memory(GiB)": 133.05, | |
| "step": 610, | |
| "token_acc": 0.8651287265831155, | |
| "train_speed(iter/s)": 0.068707 | |
| }, | |
| { | |
| "epoch": 0.7201405152224825, | |
| "grad_norm": 0.2099841833114624, | |
| "learning_rate": 1.809472135459688e-05, | |
| "loss": 0.3711225509643555, | |
| "memory(GiB)": 133.05, | |
| "step": 615, | |
| "token_acc": 0.8681505343933286, | |
| "train_speed(iter/s)": 0.068708 | |
| }, | |
| { | |
| "epoch": 0.7259953161592506, | |
| "grad_norm": 0.21193836629390717, | |
| "learning_rate": 1.8056644730787412e-05, | |
| "loss": 0.3799697160720825, | |
| "memory(GiB)": 133.05, | |
| "step": 620, | |
| "token_acc": 0.8738008866124044, | |
| "train_speed(iter/s)": 0.068719 | |
| }, | |
| { | |
| "epoch": 0.7318501170960188, | |
| "grad_norm": 0.21255411207675934, | |
| "learning_rate": 1.8018232285676092e-05, | |
| "loss": 0.3608224391937256, | |
| "memory(GiB)": 133.05, | |
| "step": 625, | |
| "token_acc": 0.8694407077081082, | |
| "train_speed(iter/s)": 0.068731 | |
| }, | |
| { | |
| "epoch": 0.7377049180327869, | |
| "grad_norm": 0.21150043606758118, | |
| "learning_rate": 1.797948562039066e-05, | |
| "loss": 0.3775743246078491, | |
| "memory(GiB)": 133.05, | |
| "step": 630, | |
| "token_acc": 0.8636316861199378, | |
| "train_speed(iter/s)": 0.068723 | |
| }, | |
| { | |
| "epoch": 0.7435597189695551, | |
| "grad_norm": 0.21777065098285675, | |
| "learning_rate": 1.7940406349989987e-05, | |
| "loss": 0.3736081838607788, | |
| "memory(GiB)": 133.05, | |
| "step": 635, | |
| "token_acc": 0.8663054996457302, | |
| "train_speed(iter/s)": 0.068733 | |
| }, | |
| { | |
| "epoch": 0.7494145199063232, | |
| "grad_norm": 0.20919020473957062, | |
| "learning_rate": 1.7900996103396772e-05, | |
| "loss": 0.36686708927154543, | |
| "memory(GiB)": 133.05, | |
| "step": 640, | |
| "token_acc": 0.8738849498577591, | |
| "train_speed(iter/s)": 0.068733 | |
| }, | |
| { | |
| "epoch": 0.7552693208430913, | |
| "grad_norm": 0.2190757542848587, | |
| "learning_rate": 1.7861256523329634e-05, | |
| "loss": 0.3648522853851318, | |
| "memory(GiB)": 133.05, | |
| "step": 645, | |
| "token_acc": 0.8633769063180828, | |
| "train_speed(iter/s)": 0.068726 | |
| }, | |
| { | |
| "epoch": 0.7611241217798594, | |
| "grad_norm": 0.2133089303970337, | |
| "learning_rate": 1.7821189266234647e-05, | |
| "loss": 0.3695883274078369, | |
| "memory(GiB)": 133.05, | |
| "step": 650, | |
| "token_acc": 0.86513161996683, | |
| "train_speed(iter/s)": 0.068724 | |
| }, | |
| { | |
| "epoch": 0.7669789227166276, | |
| "grad_norm": 0.21529735624790192, | |
| "learning_rate": 1.7780796002216285e-05, | |
| "loss": 0.36347646713256837, | |
| "memory(GiB)": 133.05, | |
| "step": 655, | |
| "token_acc": 0.8618205939317708, | |
| "train_speed(iter/s)": 0.068726 | |
| }, | |
| { | |
| "epoch": 0.7728337236533958, | |
| "grad_norm": 0.2055824100971222, | |
| "learning_rate": 1.7740078414967817e-05, | |
| "loss": 0.3710654258728027, | |
| "memory(GiB)": 133.05, | |
| "step": 660, | |
| "token_acc": 0.876207411310151, | |
| "train_speed(iter/s)": 0.06873 | |
| }, | |
| { | |
| "epoch": 0.7786885245901639, | |
| "grad_norm": 0.20337671041488647, | |
| "learning_rate": 1.7699038201701132e-05, | |
| "loss": 0.363714861869812, | |
| "memory(GiB)": 133.05, | |
| "step": 665, | |
| "token_acc": 0.8631361610960301, | |
| "train_speed(iter/s)": 0.068736 | |
| }, | |
| { | |
| "epoch": 0.7845433255269321, | |
| "grad_norm": 0.2067345827817917, | |
| "learning_rate": 1.7657677073075968e-05, | |
| "loss": 0.3705836296081543, | |
| "memory(GiB)": 133.05, | |
| "step": 670, | |
| "token_acc": 0.8667006816477769, | |
| "train_speed(iter/s)": 0.068738 | |
| }, | |
| { | |
| "epoch": 0.7903981264637002, | |
| "grad_norm": 0.20614713430404663, | |
| "learning_rate": 1.761599675312864e-05, | |
| "loss": 0.37332298755645754, | |
| "memory(GiB)": 133.05, | |
| "step": 675, | |
| "token_acc": 0.8799103822873227, | |
| "train_speed(iter/s)": 0.068737 | |
| }, | |
| { | |
| "epoch": 0.7962529274004684, | |
| "grad_norm": 0.21380652487277985, | |
| "learning_rate": 1.7573998979200163e-05, | |
| "loss": 0.36742873191833497, | |
| "memory(GiB)": 133.05, | |
| "step": 680, | |
| "token_acc": 0.8703528431892178, | |
| "train_speed(iter/s)": 0.068746 | |
| }, | |
| { | |
| "epoch": 0.8021077283372365, | |
| "grad_norm": 0.19453544914722443, | |
| "learning_rate": 1.753168550186383e-05, | |
| "loss": 0.37564864158630373, | |
| "memory(GiB)": 133.05, | |
| "step": 685, | |
| "token_acc": 0.8728679647922801, | |
| "train_speed(iter/s)": 0.068749 | |
| }, | |
| { | |
| "epoch": 0.8079625292740047, | |
| "grad_norm": 0.19713324308395386, | |
| "learning_rate": 1.7489058084852247e-05, | |
| "loss": 0.37057785987854003, | |
| "memory(GiB)": 133.05, | |
| "step": 690, | |
| "token_acc": 0.8620633488698441, | |
| "train_speed(iter/s)": 0.068747 | |
| }, | |
| { | |
| "epoch": 0.8138173302107728, | |
| "grad_norm": 0.20321306586265564, | |
| "learning_rate": 1.744611850498383e-05, | |
| "loss": 0.3668221950531006, | |
| "memory(GiB)": 133.05, | |
| "step": 695, | |
| "token_acc": 0.8725824053835161, | |
| "train_speed(iter/s)": 0.068741 | |
| }, | |
| { | |
| "epoch": 0.819672131147541, | |
| "grad_norm": 0.22502325475215912, | |
| "learning_rate": 1.7402868552088724e-05, | |
| "loss": 0.3616886854171753, | |
| "memory(GiB)": 133.05, | |
| "step": 700, | |
| "token_acc": 0.8672900381533646, | |
| "train_speed(iter/s)": 0.068742 | |
| }, | |
| { | |
| "epoch": 0.8255269320843092, | |
| "grad_norm": 0.206443652510643, | |
| "learning_rate": 1.73593100289342e-05, | |
| "loss": 0.36960477828979493, | |
| "memory(GiB)": 133.05, | |
| "step": 705, | |
| "token_acc": 0.8645310315863375, | |
| "train_speed(iter/s)": 0.068749 | |
| }, | |
| { | |
| "epoch": 0.8313817330210773, | |
| "grad_norm": 0.2609001696109772, | |
| "learning_rate": 1.7315444751149533e-05, | |
| "loss": 0.3676512956619263, | |
| "memory(GiB)": 133.05, | |
| "step": 710, | |
| "token_acc": 0.8703732566911265, | |
| "train_speed(iter/s)": 0.068756 | |
| }, | |
| { | |
| "epoch": 0.8372365339578455, | |
| "grad_norm": 0.20213671028614044, | |
| "learning_rate": 1.727127454715029e-05, | |
| "loss": 0.36738247871398927, | |
| "memory(GiB)": 133.05, | |
| "step": 715, | |
| "token_acc": 0.8776044347530407, | |
| "train_speed(iter/s)": 0.068761 | |
| }, | |
| { | |
| "epoch": 0.8430913348946136, | |
| "grad_norm": 0.2078767567873001, | |
| "learning_rate": 1.722680125806214e-05, | |
| "loss": 0.3677778720855713, | |
| "memory(GiB)": 133.05, | |
| "step": 720, | |
| "token_acc": 0.8627296514081535, | |
| "train_speed(iter/s)": 0.068763 | |
| }, | |
| { | |
| "epoch": 0.8489461358313818, | |
| "grad_norm": 0.22138644754886627, | |
| "learning_rate": 1.71820267376441e-05, | |
| "loss": 0.37197351455688477, | |
| "memory(GiB)": 133.05, | |
| "step": 725, | |
| "token_acc": 0.8676777818660314, | |
| "train_speed(iter/s)": 0.068766 | |
| }, | |
| { | |
| "epoch": 0.8548009367681498, | |
| "grad_norm": 0.21397338807582855, | |
| "learning_rate": 1.7136952852211274e-05, | |
| "loss": 0.37579007148742677, | |
| "memory(GiB)": 133.05, | |
| "step": 730, | |
| "token_acc": 0.8572162173097093, | |
| "train_speed(iter/s)": 0.068772 | |
| }, | |
| { | |
| "epoch": 0.860655737704918, | |
| "grad_norm": 0.20828036963939667, | |
| "learning_rate": 1.7091581480557057e-05, | |
| "loss": 0.3636088132858276, | |
| "memory(GiB)": 133.05, | |
| "step": 735, | |
| "token_acc": 0.8666745722408246, | |
| "train_speed(iter/s)": 0.068774 | |
| }, | |
| { | |
| "epoch": 0.8665105386416861, | |
| "grad_norm": 0.21285265684127808, | |
| "learning_rate": 1.7045914513874815e-05, | |
| "loss": 0.37646629810333254, | |
| "memory(GiB)": 133.05, | |
| "step": 740, | |
| "token_acc": 0.8666506652036757, | |
| "train_speed(iter/s)": 0.068785 | |
| }, | |
| { | |
| "epoch": 0.8723653395784543, | |
| "grad_norm": 0.19855837523937225, | |
| "learning_rate": 1.699995385567907e-05, | |
| "loss": 0.37862300872802734, | |
| "memory(GiB)": 133.05, | |
| "step": 745, | |
| "token_acc": 0.8584255151366506, | |
| "train_speed(iter/s)": 0.068799 | |
| }, | |
| { | |
| "epoch": 0.8782201405152225, | |
| "grad_norm": 0.21356073021888733, | |
| "learning_rate": 1.695370142172614e-05, | |
| "loss": 0.370495080947876, | |
| "memory(GiB)": 133.05, | |
| "step": 750, | |
| "token_acc": 0.8650399529081709, | |
| "train_speed(iter/s)": 0.068798 | |
| }, | |
| { | |
| "epoch": 0.8840749414519906, | |
| "grad_norm": 0.21858234703540802, | |
| "learning_rate": 1.690715913993429e-05, | |
| "loss": 0.3731105089187622, | |
| "memory(GiB)": 133.05, | |
| "step": 755, | |
| "token_acc": 0.8690419204765525, | |
| "train_speed(iter/s)": 0.068799 | |
| }, | |
| { | |
| "epoch": 0.8899297423887588, | |
| "grad_norm": 0.21877680718898773, | |
| "learning_rate": 1.6860328950303392e-05, | |
| "loss": 0.3532438039779663, | |
| "memory(GiB)": 133.05, | |
| "step": 760, | |
| "token_acc": 0.8752962281074447, | |
| "train_speed(iter/s)": 0.068803 | |
| }, | |
| { | |
| "epoch": 0.8957845433255269, | |
| "grad_norm": 0.2116468995809555, | |
| "learning_rate": 1.6813212804834033e-05, | |
| "loss": 0.3690504550933838, | |
| "memory(GiB)": 133.05, | |
| "step": 765, | |
| "token_acc": 0.861989263346257, | |
| "train_speed(iter/s)": 0.068807 | |
| }, | |
| { | |
| "epoch": 0.9016393442622951, | |
| "grad_norm": 0.20343121886253357, | |
| "learning_rate": 1.676581266744615e-05, | |
| "loss": 0.3611701488494873, | |
| "memory(GiB)": 133.05, | |
| "step": 770, | |
| "token_acc": 0.8671105242834544, | |
| "train_speed(iter/s)": 0.06881 | |
| }, | |
| { | |
| "epoch": 0.9074941451990632, | |
| "grad_norm": 0.19857962429523468, | |
| "learning_rate": 1.6718130513897207e-05, | |
| "loss": 0.3600625038146973, | |
| "memory(GiB)": 133.05, | |
| "step": 775, | |
| "token_acc": 0.8728194751658959, | |
| "train_speed(iter/s)": 0.068813 | |
| }, | |
| { | |
| "epoch": 0.9133489461358314, | |
| "grad_norm": 0.23387958109378815, | |
| "learning_rate": 1.667016833169979e-05, | |
| "loss": 0.3759610176086426, | |
| "memory(GiB)": 133.05, | |
| "step": 780, | |
| "token_acc": 0.8710922399514741, | |
| "train_speed(iter/s)": 0.068813 | |
| }, | |
| { | |
| "epoch": 0.9192037470725996, | |
| "grad_norm": 0.2053619623184204, | |
| "learning_rate": 1.6621928120038806e-05, | |
| "loss": 0.36916725635528563, | |
| "memory(GiB)": 133.05, | |
| "step": 785, | |
| "token_acc": 0.8602640020509871, | |
| "train_speed(iter/s)": 0.068813 | |
| }, | |
| { | |
| "epoch": 0.9250585480093677, | |
| "grad_norm": 0.20847375690937042, | |
| "learning_rate": 1.657341188968811e-05, | |
| "loss": 0.36096744537353515, | |
| "memory(GiB)": 133.05, | |
| "step": 790, | |
| "token_acc": 0.8631381808792282, | |
| "train_speed(iter/s)": 0.068819 | |
| }, | |
| { | |
| "epoch": 0.9309133489461359, | |
| "grad_norm": 0.20935416221618652, | |
| "learning_rate": 1.6524621662926733e-05, | |
| "loss": 0.3602827310562134, | |
| "memory(GiB)": 133.05, | |
| "step": 795, | |
| "token_acc": 0.8806607875578047, | |
| "train_speed(iter/s)": 0.068825 | |
| }, | |
| { | |
| "epoch": 0.936768149882904, | |
| "grad_norm": 0.214552640914917, | |
| "learning_rate": 1.6475559473454558e-05, | |
| "loss": 0.369510293006897, | |
| "memory(GiB)": 133.05, | |
| "step": 800, | |
| "token_acc": 0.8770849556632923, | |
| "train_speed(iter/s)": 0.068828 | |
| }, | |
| { | |
| "epoch": 0.9426229508196722, | |
| "grad_norm": 0.21994450688362122, | |
| "learning_rate": 1.6426227366307563e-05, | |
| "loss": 0.37307014465332033, | |
| "memory(GiB)": 133.05, | |
| "step": 805, | |
| "token_acc": 0.876770090527487, | |
| "train_speed(iter/s)": 0.068823 | |
| }, | |
| { | |
| "epoch": 0.9484777517564403, | |
| "grad_norm": 0.20645499229431152, | |
| "learning_rate": 1.6376627397772576e-05, | |
| "loss": 0.37114017009735106, | |
| "memory(GiB)": 133.05, | |
| "step": 810, | |
| "token_acc": 0.8619496040676315, | |
| "train_speed(iter/s)": 0.068823 | |
| }, | |
| { | |
| "epoch": 0.9543325526932084, | |
| "grad_norm": 0.2126459777355194, | |
| "learning_rate": 1.6326761635301572e-05, | |
| "loss": 0.3650930166244507, | |
| "memory(GiB)": 133.05, | |
| "step": 815, | |
| "token_acc": 0.870646124823141, | |
| "train_speed(iter/s)": 0.068826 | |
| }, | |
| { | |
| "epoch": 0.9601873536299765, | |
| "grad_norm": 0.20105397701263428, | |
| "learning_rate": 1.6276632157425475e-05, | |
| "loss": 0.37223210334777834, | |
| "memory(GiB)": 133.05, | |
| "step": 820, | |
| "token_acc": 0.8648889553764547, | |
| "train_speed(iter/s)": 0.068826 | |
| }, | |
| { | |
| "epoch": 0.9660421545667447, | |
| "grad_norm": 0.2080501765012741, | |
| "learning_rate": 1.6226241053667536e-05, | |
| "loss": 0.37712783813476564, | |
| "memory(GiB)": 133.05, | |
| "step": 825, | |
| "token_acc": 0.8605132566814988, | |
| "train_speed(iter/s)": 0.06883 | |
| }, | |
| { | |
| "epoch": 0.9718969555035128, | |
| "grad_norm": 0.2141636610031128, | |
| "learning_rate": 1.617559042445625e-05, | |
| "loss": 0.37673077583312986, | |
| "memory(GiB)": 133.05, | |
| "step": 830, | |
| "token_acc": 0.8719900238096734, | |
| "train_speed(iter/s)": 0.06883 | |
| }, | |
| { | |
| "epoch": 0.977751756440281, | |
| "grad_norm": 0.21488763391971588, | |
| "learning_rate": 1.6124682381037767e-05, | |
| "loss": 0.3640845537185669, | |
| "memory(GiB)": 133.05, | |
| "step": 835, | |
| "token_acc": 0.8693016352169747, | |
| "train_speed(iter/s)": 0.068834 | |
| }, | |
| { | |
| "epoch": 0.9836065573770492, | |
| "grad_norm": 0.22521890699863434, | |
| "learning_rate": 1.607351904538792e-05, | |
| "loss": 0.3786426782608032, | |
| "memory(GiB)": 133.05, | |
| "step": 840, | |
| "token_acc": 0.86982781737791, | |
| "train_speed(iter/s)": 0.068827 | |
| }, | |
| { | |
| "epoch": 0.9894613583138173, | |
| "grad_norm": 0.2129945307970047, | |
| "learning_rate": 1.6022102550123775e-05, | |
| "loss": 0.365330171585083, | |
| "memory(GiB)": 133.05, | |
| "step": 845, | |
| "token_acc": 0.864430874708757, | |
| "train_speed(iter/s)": 0.06883 | |
| }, | |
| { | |
| "epoch": 0.9953161592505855, | |
| "grad_norm": 0.216830313205719, | |
| "learning_rate": 1.597043503841471e-05, | |
| "loss": 0.3653510093688965, | |
| "memory(GiB)": 133.05, | |
| "step": 850, | |
| "token_acc": 0.878798859209881, | |
| "train_speed(iter/s)": 0.068829 | |
| }, | |
| { | |
| "epoch": 1.0011709601873535, | |
| "grad_norm": 0.2833782732486725, | |
| "learning_rate": 1.5918518663893124e-05, | |
| "loss": 0.35915145874023435, | |
| "memory(GiB)": 133.05, | |
| "step": 855, | |
| "token_acc": 0.873855282676776, | |
| "train_speed(iter/s)": 0.068529 | |
| }, | |
| { | |
| "epoch": 1.0070257611241218, | |
| "grad_norm": 0.24765369296073914, | |
| "learning_rate": 1.5866355590564637e-05, | |
| "loss": 0.3397256851196289, | |
| "memory(GiB)": 133.05, | |
| "step": 860, | |
| "token_acc": 0.8892689705247213, | |
| "train_speed(iter/s)": 0.068517 | |
| }, | |
| { | |
| "epoch": 1.0128805620608898, | |
| "grad_norm": 0.2325168401002884, | |
| "learning_rate": 1.5813947992717894e-05, | |
| "loss": 0.327287483215332, | |
| "memory(GiB)": 133.05, | |
| "step": 865, | |
| "token_acc": 0.8796502265193716, | |
| "train_speed(iter/s)": 0.068508 | |
| }, | |
| { | |
| "epoch": 1.018735362997658, | |
| "grad_norm": 0.2461637407541275, | |
| "learning_rate": 1.5761298054833947e-05, | |
| "loss": 0.3370250701904297, | |
| "memory(GiB)": 133.05, | |
| "step": 870, | |
| "token_acc": 0.8818223536926445, | |
| "train_speed(iter/s)": 0.068498 | |
| }, | |
| { | |
| "epoch": 1.0245901639344261, | |
| "grad_norm": 0.22223389148712158, | |
| "learning_rate": 1.5708407971495195e-05, | |
| "loss": 0.3431839942932129, | |
| "memory(GiB)": 133.05, | |
| "step": 875, | |
| "token_acc": 0.8771204606261637, | |
| "train_speed(iter/s)": 0.068498 | |
| }, | |
| { | |
| "epoch": 1.0304449648711944, | |
| "grad_norm": 0.22983962297439575, | |
| "learning_rate": 1.565527994729389e-05, | |
| "loss": 0.333197808265686, | |
| "memory(GiB)": 133.05, | |
| "step": 880, | |
| "token_acc": 0.8869119581976505, | |
| "train_speed(iter/s)": 0.068497 | |
| }, | |
| { | |
| "epoch": 1.0362997658079625, | |
| "grad_norm": 0.21161960065364838, | |
| "learning_rate": 1.5601916196740283e-05, | |
| "loss": 0.32940354347229006, | |
| "memory(GiB)": 133.05, | |
| "step": 885, | |
| "token_acc": 0.8834938944853924, | |
| "train_speed(iter/s)": 0.068496 | |
| }, | |
| { | |
| "epoch": 1.0421545667447307, | |
| "grad_norm": 0.22903162240982056, | |
| "learning_rate": 1.5548318944170276e-05, | |
| "loss": 0.3256603956222534, | |
| "memory(GiB)": 133.05, | |
| "step": 890, | |
| "token_acc": 0.8883952211008513, | |
| "train_speed(iter/s)": 0.068494 | |
| }, | |
| { | |
| "epoch": 1.0480093676814988, | |
| "grad_norm": 0.21301260590553284, | |
| "learning_rate": 1.5494490423652732e-05, | |
| "loss": 0.3253190040588379, | |
| "memory(GiB)": 133.05, | |
| "step": 895, | |
| "token_acc": 0.8813899275623074, | |
| "train_speed(iter/s)": 0.068483 | |
| }, | |
| { | |
| "epoch": 1.053864168618267, | |
| "grad_norm": 0.2047208845615387, | |
| "learning_rate": 1.544043287889635e-05, | |
| "loss": 0.31666491031646726, | |
| "memory(GiB)": 133.05, | |
| "step": 900, | |
| "token_acc": 0.8909019236833806, | |
| "train_speed(iter/s)": 0.068481 | |
| }, | |
| { | |
| "epoch": 1.059718969555035, | |
| "grad_norm": 0.23390096426010132, | |
| "learning_rate": 1.538614856315614e-05, | |
| "loss": 0.330989408493042, | |
| "memory(GiB)": 133.05, | |
| "step": 905, | |
| "token_acc": 0.8884555161039297, | |
| "train_speed(iter/s)": 0.068482 | |
| }, | |
| { | |
| "epoch": 1.0655737704918034, | |
| "grad_norm": 0.20488137006759644, | |
| "learning_rate": 1.5331639739139477e-05, | |
| "loss": 0.3256430149078369, | |
| "memory(GiB)": 133.05, | |
| "step": 910, | |
| "token_acc": 0.8721090848001792, | |
| "train_speed(iter/s)": 0.068473 | |
| }, | |
| { | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 0.21736453473567963, | |
| "learning_rate": 1.5276908678911837e-05, | |
| "loss": 0.3228193521499634, | |
| "memory(GiB)": 133.05, | |
| "step": 915, | |
| "token_acc": 0.8874282476871164, | |
| "train_speed(iter/s)": 0.068469 | |
| }, | |
| { | |
| "epoch": 1.0772833723653397, | |
| "grad_norm": 0.206723153591156, | |
| "learning_rate": 1.5221957663802043e-05, | |
| "loss": 0.3333425521850586, | |
| "memory(GiB)": 133.17, | |
| "step": 920, | |
| "token_acc": 0.886105330059943, | |
| "train_speed(iter/s)": 0.068454 | |
| }, | |
| { | |
| "epoch": 1.0831381733021077, | |
| "grad_norm": 0.203144371509552, | |
| "learning_rate": 1.5166788984307204e-05, | |
| "loss": 0.33838639259338377, | |
| "memory(GiB)": 133.17, | |
| "step": 925, | |
| "token_acc": 0.8802329092899476, | |
| "train_speed(iter/s)": 0.068444 | |
| }, | |
| { | |
| "epoch": 1.088992974238876, | |
| "grad_norm": 0.24915394186973572, | |
| "learning_rate": 1.5111404939997227e-05, | |
| "loss": 0.33564419746398927, | |
| "memory(GiB)": 133.17, | |
| "step": 930, | |
| "token_acc": 0.8793440099130728, | |
| "train_speed(iter/s)": 0.068442 | |
| }, | |
| { | |
| "epoch": 1.094847775175644, | |
| "grad_norm": 0.2503604292869568, | |
| "learning_rate": 1.5055807839418966e-05, | |
| "loss": 0.3157151460647583, | |
| "memory(GiB)": 133.17, | |
| "step": 935, | |
| "token_acc": 0.8862683405108546, | |
| "train_speed(iter/s)": 0.068436 | |
| }, | |
| { | |
| "epoch": 1.100702576112412, | |
| "grad_norm": 0.20239044725894928, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.3377982139587402, | |
| "memory(GiB)": 133.17, | |
| "step": 940, | |
| "token_acc": 0.8886980901742478, | |
| "train_speed(iter/s)": 0.068436 | |
| }, | |
| { | |
| "epoch": 1.1065573770491803, | |
| "grad_norm": 0.20267418026924133, | |
| "learning_rate": 1.494398374795204e-05, | |
| "loss": 0.3253162145614624, | |
| "memory(GiB)": 133.17, | |
| "step": 945, | |
| "token_acc": 0.8780125495417973, | |
| "train_speed(iter/s)": 0.068437 | |
| }, | |
| { | |
| "epoch": 1.1124121779859484, | |
| "grad_norm": 0.2210346758365631, | |
| "learning_rate": 1.4887761418173947e-05, | |
| "loss": 0.3438437461853027, | |
| "memory(GiB)": 133.17, | |
| "step": 950, | |
| "token_acc": 0.8874266802316089, | |
| "train_speed(iter/s)": 0.068436 | |
| }, | |
| { | |
| "epoch": 1.1182669789227166, | |
| "grad_norm": 0.206399604678154, | |
| "learning_rate": 1.4831335354154444e-05, | |
| "loss": 0.3289347648620605, | |
| "memory(GiB)": 133.17, | |
| "step": 955, | |
| "token_acc": 0.8831363419858116, | |
| "train_speed(iter/s)": 0.068436 | |
| }, | |
| { | |
| "epoch": 1.1241217798594847, | |
| "grad_norm": 0.21163643896579742, | |
| "learning_rate": 1.4774707907874392e-05, | |
| "loss": 0.32750353813171384, | |
| "memory(GiB)": 133.17, | |
| "step": 960, | |
| "token_acc": 0.8880904228882937, | |
| "train_speed(iter/s)": 0.068427 | |
| }, | |
| { | |
| "epoch": 1.129976580796253, | |
| "grad_norm": 0.20707455277442932, | |
| "learning_rate": 1.4717881439708786e-05, | |
| "loss": 0.3284764289855957, | |
| "memory(GiB)": 133.17, | |
| "step": 965, | |
| "token_acc": 0.8722379691636817, | |
| "train_speed(iter/s)": 0.068425 | |
| }, | |
| { | |
| "epoch": 1.135831381733021, | |
| "grad_norm": 0.2046642154455185, | |
| "learning_rate": 1.4660858318328348e-05, | |
| "loss": 0.3317260265350342, | |
| "memory(GiB)": 133.17, | |
| "step": 970, | |
| "token_acc": 0.8710549063749603, | |
| "train_speed(iter/s)": 0.068419 | |
| }, | |
| { | |
| "epoch": 1.1416861826697893, | |
| "grad_norm": 0.20032472908496857, | |
| "learning_rate": 1.4603640920600813e-05, | |
| "loss": 0.33744547367095945, | |
| "memory(GiB)": 133.17, | |
| "step": 975, | |
| "token_acc": 0.8676646558084457, | |
| "train_speed(iter/s)": 0.068416 | |
| }, | |
| { | |
| "epoch": 1.1475409836065573, | |
| "grad_norm": 0.20992988348007202, | |
| "learning_rate": 1.4546231631491827e-05, | |
| "loss": 0.3331944704055786, | |
| "memory(GiB)": 133.17, | |
| "step": 980, | |
| "token_acc": 0.8770167266237555, | |
| "train_speed(iter/s)": 0.068406 | |
| }, | |
| { | |
| "epoch": 1.1533957845433256, | |
| "grad_norm": 0.2045455127954483, | |
| "learning_rate": 1.4488632843965573e-05, | |
| "loss": 0.32609896659851073, | |
| "memory(GiB)": 133.17, | |
| "step": 985, | |
| "token_acc": 0.8671518193224592, | |
| "train_speed(iter/s)": 0.068401 | |
| }, | |
| { | |
| "epoch": 1.1592505854800936, | |
| "grad_norm": 0.21106521785259247, | |
| "learning_rate": 1.4430846958884995e-05, | |
| "loss": 0.3347620010375977, | |
| "memory(GiB)": 133.17, | |
| "step": 990, | |
| "token_acc": 0.8760981150071534, | |
| "train_speed(iter/s)": 0.068396 | |
| }, | |
| { | |
| "epoch": 1.165105386416862, | |
| "grad_norm": 0.2021251767873764, | |
| "learning_rate": 1.4372876384911741e-05, | |
| "loss": 0.33538064956665037, | |
| "memory(GiB)": 133.17, | |
| "step": 995, | |
| "token_acc": 0.8768674285536101, | |
| "train_speed(iter/s)": 0.068392 | |
| }, | |
| { | |
| "epoch": 1.17096018735363, | |
| "grad_norm": 0.22672772407531738, | |
| "learning_rate": 1.4314723538405752e-05, | |
| "loss": 0.3422734260559082, | |
| "memory(GiB)": 133.17, | |
| "step": 1000, | |
| "token_acc": 0.8671538988967151, | |
| "train_speed(iter/s)": 0.068389 | |
| }, | |
| { | |
| "epoch": 1.1768149882903982, | |
| "grad_norm": 0.2139746993780136, | |
| "learning_rate": 1.4256390843324556e-05, | |
| "loss": 0.3371597766876221, | |
| "memory(GiB)": 133.17, | |
| "step": 1005, | |
| "token_acc": 0.8732182530767119, | |
| "train_speed(iter/s)": 0.068388 | |
| }, | |
| { | |
| "epoch": 1.1826697892271663, | |
| "grad_norm": 0.21347731351852417, | |
| "learning_rate": 1.4197880731122221e-05, | |
| "loss": 0.3339057922363281, | |
| "memory(GiB)": 133.17, | |
| "step": 1010, | |
| "token_acc": 0.8729292778317514, | |
| "train_speed(iter/s)": 0.06839 | |
| }, | |
| { | |
| "epoch": 1.1885245901639343, | |
| "grad_norm": 0.21436652541160583, | |
| "learning_rate": 1.4139195640648008e-05, | |
| "loss": 0.3371711730957031, | |
| "memory(GiB)": 133.17, | |
| "step": 1015, | |
| "token_acc": 0.8857815368682034, | |
| "train_speed(iter/s)": 0.068385 | |
| }, | |
| { | |
| "epoch": 1.1943793911007026, | |
| "grad_norm": 0.21145156025886536, | |
| "learning_rate": 1.4080338018044712e-05, | |
| "loss": 0.3415823459625244, | |
| "memory(GiB)": 133.17, | |
| "step": 1020, | |
| "token_acc": 0.8745781005321704, | |
| "train_speed(iter/s)": 0.068382 | |
| }, | |
| { | |
| "epoch": 1.2002341920374708, | |
| "grad_norm": 0.2704923748970032, | |
| "learning_rate": 1.4021310316646708e-05, | |
| "loss": 0.33098018169403076, | |
| "memory(GiB)": 133.17, | |
| "step": 1025, | |
| "token_acc": 0.8810291608110821, | |
| "train_speed(iter/s)": 0.06838 | |
| }, | |
| { | |
| "epoch": 1.2060889929742389, | |
| "grad_norm": 0.20703041553497314, | |
| "learning_rate": 1.3962114996877685e-05, | |
| "loss": 0.3177175045013428, | |
| "memory(GiB)": 133.17, | |
| "step": 1030, | |
| "token_acc": 0.8884392410781509, | |
| "train_speed(iter/s)": 0.068376 | |
| }, | |
| { | |
| "epoch": 1.211943793911007, | |
| "grad_norm": 0.20425967872142792, | |
| "learning_rate": 1.390275452614808e-05, | |
| "loss": 0.3208155155181885, | |
| "memory(GiB)": 133.17, | |
| "step": 1035, | |
| "token_acc": 0.8798795706976164, | |
| "train_speed(iter/s)": 0.068375 | |
| }, | |
| { | |
| "epoch": 1.2177985948477752, | |
| "grad_norm": 0.2199791669845581, | |
| "learning_rate": 1.3843231378752252e-05, | |
| "loss": 0.32726430892944336, | |
| "memory(GiB)": 133.17, | |
| "step": 1040, | |
| "token_acc": 0.8785451315143307, | |
| "train_speed(iter/s)": 0.068367 | |
| }, | |
| { | |
| "epoch": 1.2236533957845432, | |
| "grad_norm": 0.22237712144851685, | |
| "learning_rate": 1.3783548035765327e-05, | |
| "loss": 0.33181195259094237, | |
| "memory(GiB)": 133.17, | |
| "step": 1045, | |
| "token_acc": 0.8834801207851032, | |
| "train_speed(iter/s)": 0.068368 | |
| }, | |
| { | |
| "epoch": 1.2295081967213115, | |
| "grad_norm": 0.20910513401031494, | |
| "learning_rate": 1.3723706984939783e-05, | |
| "loss": 0.3189753532409668, | |
| "memory(GiB)": 133.17, | |
| "step": 1050, | |
| "token_acc": 0.8769508605389209, | |
| "train_speed(iter/s)": 0.068364 | |
| }, | |
| { | |
| "epoch": 1.2353629976580796, | |
| "grad_norm": 0.20491260290145874, | |
| "learning_rate": 1.366371072060177e-05, | |
| "loss": 0.33074491024017333, | |
| "memory(GiB)": 133.17, | |
| "step": 1055, | |
| "token_acc": 0.8681569771445384, | |
| "train_speed(iter/s)": 0.068361 | |
| }, | |
| { | |
| "epoch": 1.2412177985948478, | |
| "grad_norm": 0.1918231099843979, | |
| "learning_rate": 1.3603561743547125e-05, | |
| "loss": 0.3256643772125244, | |
| "memory(GiB)": 133.17, | |
| "step": 1060, | |
| "token_acc": 0.8732954670333983, | |
| "train_speed(iter/s)": 0.068363 | |
| }, | |
| { | |
| "epoch": 1.2470725995316159, | |
| "grad_norm": 0.21773004531860352, | |
| "learning_rate": 1.3543262560937135e-05, | |
| "loss": 0.33045885562896726, | |
| "memory(GiB)": 133.17, | |
| "step": 1065, | |
| "token_acc": 0.8785313558157261, | |
| "train_speed(iter/s)": 0.068363 | |
| }, | |
| { | |
| "epoch": 1.2529274004683841, | |
| "grad_norm": 0.21782302856445312, | |
| "learning_rate": 1.3482815686194033e-05, | |
| "loss": 0.3164831161499023, | |
| "memory(GiB)": 133.17, | |
| "step": 1070, | |
| "token_acc": 0.8841838807462733, | |
| "train_speed(iter/s)": 0.068363 | |
| }, | |
| { | |
| "epoch": 1.2587822014051522, | |
| "grad_norm": 0.21324488520622253, | |
| "learning_rate": 1.3422223638896235e-05, | |
| "loss": 0.32593531608581544, | |
| "memory(GiB)": 133.17, | |
| "step": 1075, | |
| "token_acc": 0.8798167525312546, | |
| "train_speed(iter/s)": 0.068363 | |
| }, | |
| { | |
| "epoch": 1.2646370023419204, | |
| "grad_norm": 0.22865289449691772, | |
| "learning_rate": 1.3361488944673315e-05, | |
| "loss": 0.3352835178375244, | |
| "memory(GiB)": 133.17, | |
| "step": 1080, | |
| "token_acc": 0.8729886330661392, | |
| "train_speed(iter/s)": 0.068362 | |
| }, | |
| { | |
| "epoch": 1.2704918032786885, | |
| "grad_norm": 0.20328956842422485, | |
| "learning_rate": 1.3300614135100736e-05, | |
| "loss": 0.332173490524292, | |
| "memory(GiB)": 133.17, | |
| "step": 1085, | |
| "token_acc": 0.8806762689525037, | |
| "train_speed(iter/s)": 0.068357 | |
| }, | |
| { | |
| "epoch": 1.2763466042154565, | |
| "grad_norm": 0.19926570355892181, | |
| "learning_rate": 1.3239601747594319e-05, | |
| "loss": 0.331054162979126, | |
| "memory(GiB)": 133.17, | |
| "step": 1090, | |
| "token_acc": 0.8812650906933006, | |
| "train_speed(iter/s)": 0.068351 | |
| }, | |
| { | |
| "epoch": 1.2822014051522248, | |
| "grad_norm": 0.19676311314105988, | |
| "learning_rate": 1.3178454325304472e-05, | |
| "loss": 0.33361315727233887, | |
| "memory(GiB)": 133.17, | |
| "step": 1095, | |
| "token_acc": 0.8700881415265362, | |
| "train_speed(iter/s)": 0.068351 | |
| }, | |
| { | |
| "epoch": 1.288056206088993, | |
| "grad_norm": 0.20788326859474182, | |
| "learning_rate": 1.3117174417010213e-05, | |
| "loss": 0.31841249465942384, | |
| "memory(GiB)": 133.17, | |
| "step": 1100, | |
| "token_acc": 0.8749374970517477, | |
| "train_speed(iter/s)": 0.06835 | |
| }, | |
| { | |
| "epoch": 1.2939110070257611, | |
| "grad_norm": 0.21633991599082947, | |
| "learning_rate": 1.3055764577012892e-05, | |
| "loss": 0.34844322204589845, | |
| "memory(GiB)": 133.17, | |
| "step": 1105, | |
| "token_acc": 0.8857762459338606, | |
| "train_speed(iter/s)": 0.068351 | |
| }, | |
| { | |
| "epoch": 1.2997658079625292, | |
| "grad_norm": 0.2159479707479477, | |
| "learning_rate": 1.2994227365029752e-05, | |
| "loss": 0.32929096221923826, | |
| "memory(GiB)": 133.17, | |
| "step": 1110, | |
| "token_acc": 0.8831624401350396, | |
| "train_speed(iter/s)": 0.06835 | |
| }, | |
| { | |
| "epoch": 1.3056206088992974, | |
| "grad_norm": 0.21510519087314606, | |
| "learning_rate": 1.2932565346087218e-05, | |
| "loss": 0.33609514236450194, | |
| "memory(GiB)": 133.17, | |
| "step": 1115, | |
| "token_acc": 0.8789613142554319, | |
| "train_speed(iter/s)": 0.068346 | |
| }, | |
| { | |
| "epoch": 1.3114754098360657, | |
| "grad_norm": 0.19823956489562988, | |
| "learning_rate": 1.2870781090413991e-05, | |
| "loss": 0.3340220212936401, | |
| "memory(GiB)": 133.17, | |
| "step": 1120, | |
| "token_acc": 0.8802133820301311, | |
| "train_speed(iter/s)": 0.068343 | |
| }, | |
| { | |
| "epoch": 1.3173302107728337, | |
| "grad_norm": 0.19969677925109863, | |
| "learning_rate": 1.2808877173333896e-05, | |
| "loss": 0.32896521091461184, | |
| "memory(GiB)": 133.17, | |
| "step": 1125, | |
| "token_acc": 0.8884312591176619, | |
| "train_speed(iter/s)": 0.068342 | |
| }, | |
| { | |
| "epoch": 1.3231850117096018, | |
| "grad_norm": 0.19414611160755157, | |
| "learning_rate": 1.2746856175158556e-05, | |
| "loss": 0.33699817657470704, | |
| "memory(GiB)": 133.17, | |
| "step": 1130, | |
| "token_acc": 0.8808933080116763, | |
| "train_speed(iter/s)": 0.068346 | |
| }, | |
| { | |
| "epoch": 1.32903981264637, | |
| "grad_norm": 0.20659878849983215, | |
| "learning_rate": 1.2684720681079825e-05, | |
| "loss": 0.33256163597106936, | |
| "memory(GiB)": 133.17, | |
| "step": 1135, | |
| "token_acc": 0.8659905808672699, | |
| "train_speed(iter/s)": 0.068345 | |
| }, | |
| { | |
| "epoch": 1.334894613583138, | |
| "grad_norm": 0.21766500174999237, | |
| "learning_rate": 1.2622473281062042e-05, | |
| "loss": 0.3360875129699707, | |
| "memory(GiB)": 133.17, | |
| "step": 1140, | |
| "token_acc": 0.8805351128851191, | |
| "train_speed(iter/s)": 0.068346 | |
| }, | |
| { | |
| "epoch": 1.3407494145199064, | |
| "grad_norm": 0.21836382150650024, | |
| "learning_rate": 1.256011656973406e-05, | |
| "loss": 0.3428370952606201, | |
| "memory(GiB)": 133.17, | |
| "step": 1145, | |
| "token_acc": 0.882268280446507, | |
| "train_speed(iter/s)": 0.068346 | |
| }, | |
| { | |
| "epoch": 1.3466042154566744, | |
| "grad_norm": 0.21305552124977112, | |
| "learning_rate": 1.2497653146281113e-05, | |
| "loss": 0.3323945999145508, | |
| "memory(GiB)": 133.17, | |
| "step": 1150, | |
| "token_acc": 0.8799263041729795, | |
| "train_speed(iter/s)": 0.06834 | |
| }, | |
| { | |
| "epoch": 1.3524590163934427, | |
| "grad_norm": 0.2115429788827896, | |
| "learning_rate": 1.2435085614336459e-05, | |
| "loss": 0.33839111328125, | |
| "memory(GiB)": 133.17, | |
| "step": 1155, | |
| "token_acc": 0.8877846609149278, | |
| "train_speed(iter/s)": 0.068336 | |
| }, | |
| { | |
| "epoch": 1.3583138173302107, | |
| "grad_norm": 0.20214448869228363, | |
| "learning_rate": 1.2372416581872857e-05, | |
| "loss": 0.3267178773880005, | |
| "memory(GiB)": 133.17, | |
| "step": 1160, | |
| "token_acc": 0.8858182364221651, | |
| "train_speed(iter/s)": 0.068334 | |
| }, | |
| { | |
| "epoch": 1.364168618266979, | |
| "grad_norm": 0.19922491908073425, | |
| "learning_rate": 1.2309648661093878e-05, | |
| "loss": 0.33157687187194823, | |
| "memory(GiB)": 133.17, | |
| "step": 1165, | |
| "token_acc": 0.8862414604099004, | |
| "train_speed(iter/s)": 0.068333 | |
| }, | |
| { | |
| "epoch": 1.370023419203747, | |
| "grad_norm": 0.20893344283103943, | |
| "learning_rate": 1.2246784468324993e-05, | |
| "loss": 0.3382421016693115, | |
| "memory(GiB)": 133.17, | |
| "step": 1170, | |
| "token_acc": 0.8688796266876001, | |
| "train_speed(iter/s)": 0.068327 | |
| }, | |
| { | |
| "epoch": 1.3758782201405153, | |
| "grad_norm": 0.219789519906044, | |
| "learning_rate": 1.218382662390454e-05, | |
| "loss": 0.3261989116668701, | |
| "memory(GiB)": 133.17, | |
| "step": 1175, | |
| "token_acc": 0.8682563507122426, | |
| "train_speed(iter/s)": 0.068328 | |
| }, | |
| { | |
| "epoch": 1.3817330210772834, | |
| "grad_norm": 0.2007785141468048, | |
| "learning_rate": 1.2120777752074492e-05, | |
| "loss": 0.33451414108276367, | |
| "memory(GiB)": 133.17, | |
| "step": 1180, | |
| "token_acc": 0.8779171167786075, | |
| "train_speed(iter/s)": 0.068325 | |
| }, | |
| { | |
| "epoch": 1.3875878220140514, | |
| "grad_norm": 0.20650921761989594, | |
| "learning_rate": 1.2057640480871084e-05, | |
| "loss": 0.33679168224334716, | |
| "memory(GiB)": 133.17, | |
| "step": 1185, | |
| "token_acc": 0.8786453140578265, | |
| "train_speed(iter/s)": 0.068325 | |
| }, | |
| { | |
| "epoch": 1.3934426229508197, | |
| "grad_norm": 0.20114493370056152, | |
| "learning_rate": 1.1994417442015243e-05, | |
| "loss": 0.33562412261962893, | |
| "memory(GiB)": 133.17, | |
| "step": 1190, | |
| "token_acc": 0.8844727744979327, | |
| "train_speed(iter/s)": 0.068325 | |
| }, | |
| { | |
| "epoch": 1.399297423887588, | |
| "grad_norm": 0.19498831033706665, | |
| "learning_rate": 1.193111127080292e-05, | |
| "loss": 0.3253043174743652, | |
| "memory(GiB)": 133.17, | |
| "step": 1195, | |
| "token_acc": 0.8870853046866852, | |
| "train_speed(iter/s)": 0.068326 | |
| }, | |
| { | |
| "epoch": 1.405152224824356, | |
| "grad_norm": 0.1827043890953064, | |
| "learning_rate": 1.186772460599523e-05, | |
| "loss": 0.3244746685028076, | |
| "memory(GiB)": 133.17, | |
| "step": 1200, | |
| "token_acc": 0.8863151296717072, | |
| "train_speed(iter/s)": 0.068323 | |
| }, | |
| { | |
| "epoch": 1.411007025761124, | |
| "grad_norm": 0.21396119892597198, | |
| "learning_rate": 1.1804260089708464e-05, | |
| "loss": 0.3355713367462158, | |
| "memory(GiB)": 133.17, | |
| "step": 1205, | |
| "token_acc": 0.8714279485774079, | |
| "train_speed(iter/s)": 0.068317 | |
| }, | |
| { | |
| "epoch": 1.4168618266978923, | |
| "grad_norm": 0.20849740505218506, | |
| "learning_rate": 1.1740720367303958e-05, | |
| "loss": 0.3293231725692749, | |
| "memory(GiB)": 133.17, | |
| "step": 1210, | |
| "token_acc": 0.8799478293040041, | |
| "train_speed(iter/s)": 0.068316 | |
| }, | |
| { | |
| "epoch": 1.4227166276346606, | |
| "grad_norm": 0.19985808432102203, | |
| "learning_rate": 1.1677108087277835e-05, | |
| "loss": 0.33586926460266114, | |
| "memory(GiB)": 133.17, | |
| "step": 1215, | |
| "token_acc": 0.8803578911815663, | |
| "train_speed(iter/s)": 0.068314 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.2120925784111023, | |
| "learning_rate": 1.1613425901150595e-05, | |
| "loss": 0.335320782661438, | |
| "memory(GiB)": 133.17, | |
| "step": 1220, | |
| "token_acc": 0.8822237863291518, | |
| "train_speed(iter/s)": 0.068311 | |
| }, | |
| { | |
| "epoch": 1.4344262295081966, | |
| "grad_norm": 0.20144475996494293, | |
| "learning_rate": 1.15496764633566e-05, | |
| "loss": 0.34459710121154785, | |
| "memory(GiB)": 133.17, | |
| "step": 1225, | |
| "token_acc": 0.8714527101578114, | |
| "train_speed(iter/s)": 0.068308 | |
| }, | |
| { | |
| "epoch": 1.440281030444965, | |
| "grad_norm": 0.1978883147239685, | |
| "learning_rate": 1.1485862431133445e-05, | |
| "loss": 0.334246826171875, | |
| "memory(GiB)": 133.17, | |
| "step": 1230, | |
| "token_acc": 0.8850997230525071, | |
| "train_speed(iter/s)": 0.068306 | |
| }, | |
| { | |
| "epoch": 1.446135831381733, | |
| "grad_norm": 0.20052959024906158, | |
| "learning_rate": 1.1421986464411169e-05, | |
| "loss": 0.33509197235107424, | |
| "memory(GiB)": 133.17, | |
| "step": 1235, | |
| "token_acc": 0.8704968021392047, | |
| "train_speed(iter/s)": 0.068299 | |
| }, | |
| { | |
| "epoch": 1.4519906323185012, | |
| "grad_norm": 0.19154983758926392, | |
| "learning_rate": 1.1358051225701404e-05, | |
| "loss": 0.32514162063598634, | |
| "memory(GiB)": 133.17, | |
| "step": 1240, | |
| "token_acc": 0.8735484752584716, | |
| "train_speed(iter/s)": 0.068296 | |
| }, | |
| { | |
| "epoch": 1.4578454332552693, | |
| "grad_norm": 0.20475593209266663, | |
| "learning_rate": 1.1294059379986384e-05, | |
| "loss": 0.33394522666931153, | |
| "memory(GiB)": 133.17, | |
| "step": 1245, | |
| "token_acc": 0.8737611977698427, | |
| "train_speed(iter/s)": 0.068289 | |
| }, | |
| { | |
| "epoch": 1.4637002341920375, | |
| "grad_norm": 0.20034635066986084, | |
| "learning_rate": 1.1230013594607874e-05, | |
| "loss": 0.33555524349212645, | |
| "memory(GiB)": 133.17, | |
| "step": 1250, | |
| "token_acc": 0.8783339011605555, | |
| "train_speed(iter/s)": 0.068288 | |
| }, | |
| { | |
| "epoch": 1.4695550351288056, | |
| "grad_norm": 0.206059530377388, | |
| "learning_rate": 1.1165916539155968e-05, | |
| "loss": 0.33289051055908203, | |
| "memory(GiB)": 133.17, | |
| "step": 1255, | |
| "token_acc": 0.8821623108149916, | |
| "train_speed(iter/s)": 0.068291 | |
| }, | |
| { | |
| "epoch": 1.4754098360655736, | |
| "grad_norm": 0.1955031454563141, | |
| "learning_rate": 1.1101770885357843e-05, | |
| "loss": 0.3284996509552002, | |
| "memory(GiB)": 133.17, | |
| "step": 1260, | |
| "token_acc": 0.8824508468283658, | |
| "train_speed(iter/s)": 0.068292 | |
| }, | |
| { | |
| "epoch": 1.481264637002342, | |
| "grad_norm": 0.18819548189640045, | |
| "learning_rate": 1.1037579306966365e-05, | |
| "loss": 0.32820711135864256, | |
| "memory(GiB)": 133.17, | |
| "step": 1265, | |
| "token_acc": 0.8902697768320305, | |
| "train_speed(iter/s)": 0.068288 | |
| }, | |
| { | |
| "epoch": 1.4871194379391102, | |
| "grad_norm": 0.20186524093151093, | |
| "learning_rate": 1.0973344479648652e-05, | |
| "loss": 0.3230982065200806, | |
| "memory(GiB)": 133.17, | |
| "step": 1270, | |
| "token_acc": 0.8823814255348585, | |
| "train_speed(iter/s)": 0.068286 | |
| }, | |
| { | |
| "epoch": 1.4929742388758782, | |
| "grad_norm": 0.19547297060489655, | |
| "learning_rate": 1.0909069080874556e-05, | |
| "loss": 0.3249845027923584, | |
| "memory(GiB)": 133.17, | |
| "step": 1275, | |
| "token_acc": 0.8751095158692027, | |
| "train_speed(iter/s)": 0.068285 | |
| }, | |
| { | |
| "epoch": 1.4988290398126463, | |
| "grad_norm": 0.21490275859832764, | |
| "learning_rate": 1.0844755789805042e-05, | |
| "loss": 0.3330803394317627, | |
| "memory(GiB)": 133.17, | |
| "step": 1280, | |
| "token_acc": 0.8725501507719461, | |
| "train_speed(iter/s)": 0.068283 | |
| }, | |
| { | |
| "epoch": 1.5046838407494145, | |
| "grad_norm": 0.21036967635154724, | |
| "learning_rate": 1.0780407287180526e-05, | |
| "loss": 0.33710570335388185, | |
| "memory(GiB)": 133.17, | |
| "step": 1285, | |
| "token_acc": 0.8735995618184534, | |
| "train_speed(iter/s)": 0.068276 | |
| }, | |
| { | |
| "epoch": 1.5105386416861828, | |
| "grad_norm": 0.21496160328388214, | |
| "learning_rate": 1.0716026255209124e-05, | |
| "loss": 0.3322149276733398, | |
| "memory(GiB)": 133.17, | |
| "step": 1290, | |
| "token_acc": 0.8727818581461427, | |
| "train_speed(iter/s)": 0.068276 | |
| }, | |
| { | |
| "epoch": 1.5163934426229508, | |
| "grad_norm": 0.19405636191368103, | |
| "learning_rate": 1.0651615377454872e-05, | |
| "loss": 0.33303227424621584, | |
| "memory(GiB)": 133.17, | |
| "step": 1295, | |
| "token_acc": 0.8809517074473936, | |
| "train_speed(iter/s)": 0.068274 | |
| }, | |
| { | |
| "epoch": 1.5222482435597189, | |
| "grad_norm": 0.20200887322425842, | |
| "learning_rate": 1.0587177338725834e-05, | |
| "loss": 0.3389185905456543, | |
| "memory(GiB)": 133.17, | |
| "step": 1300, | |
| "token_acc": 0.8810081420102018, | |
| "train_speed(iter/s)": 0.068274 | |
| }, | |
| { | |
| "epoch": 1.5281030444964872, | |
| "grad_norm": 0.19218453764915466, | |
| "learning_rate": 1.0522714824962228e-05, | |
| "loss": 0.32448182106018064, | |
| "memory(GiB)": 133.17, | |
| "step": 1305, | |
| "token_acc": 0.8922085069580942, | |
| "train_speed(iter/s)": 0.068274 | |
| }, | |
| { | |
| "epoch": 1.5339578454332554, | |
| "grad_norm": 0.2063508927822113, | |
| "learning_rate": 1.0458230523124443e-05, | |
| "loss": 0.3380331039428711, | |
| "memory(GiB)": 133.17, | |
| "step": 1310, | |
| "token_acc": 0.8834363870742206, | |
| "train_speed(iter/s)": 0.06827 | |
| }, | |
| { | |
| "epoch": 1.5398126463700235, | |
| "grad_norm": 0.20604784786701202, | |
| "learning_rate": 1.0393727121081057e-05, | |
| "loss": 0.33421056270599364, | |
| "memory(GiB)": 133.17, | |
| "step": 1315, | |
| "token_acc": 0.8805816011032537, | |
| "train_speed(iter/s)": 0.068273 | |
| }, | |
| { | |
| "epoch": 1.5456674473067915, | |
| "grad_norm": 0.1895345002412796, | |
| "learning_rate": 1.0329207307496785e-05, | |
| "loss": 0.3230136394500732, | |
| "memory(GiB)": 133.17, | |
| "step": 1320, | |
| "token_acc": 0.8821661202321777, | |
| "train_speed(iter/s)": 0.068273 | |
| }, | |
| { | |
| "epoch": 1.5515222482435598, | |
| "grad_norm": 0.20009098947048187, | |
| "learning_rate": 1.0264673771720429e-05, | |
| "loss": 0.331970739364624, | |
| "memory(GiB)": 133.17, | |
| "step": 1325, | |
| "token_acc": 0.8856471632036539, | |
| "train_speed(iter/s)": 0.06827 | |
| }, | |
| { | |
| "epoch": 1.5573770491803278, | |
| "grad_norm": 0.19756639003753662, | |
| "learning_rate": 1.0200129203672754e-05, | |
| "loss": 0.33203625679016113, | |
| "memory(GiB)": 133.17, | |
| "step": 1330, | |
| "token_acc": 0.8719384623094173, | |
| "train_speed(iter/s)": 0.068266 | |
| }, | |
| { | |
| "epoch": 1.5632318501170959, | |
| "grad_norm": 0.20041348040103912, | |
| "learning_rate": 1.0135576293734381e-05, | |
| "loss": 0.3236687660217285, | |
| "memory(GiB)": 133.17, | |
| "step": 1335, | |
| "token_acc": 0.8890052192879956, | |
| "train_speed(iter/s)": 0.068267 | |
| }, | |
| { | |
| "epoch": 1.5690866510538641, | |
| "grad_norm": 0.2091531604528427, | |
| "learning_rate": 1.007101773263365e-05, | |
| "loss": 0.3356754302978516, | |
| "memory(GiB)": 133.17, | |
| "step": 1340, | |
| "token_acc": 0.881420303456906, | |
| "train_speed(iter/s)": 0.068267 | |
| }, | |
| { | |
| "epoch": 1.5749414519906324, | |
| "grad_norm": 0.18961018323898315, | |
| "learning_rate": 1.0006456211334445e-05, | |
| "loss": 0.32959842681884766, | |
| "memory(GiB)": 133.17, | |
| "step": 1345, | |
| "token_acc": 0.881056978636539, | |
| "train_speed(iter/s)": 0.068269 | |
| }, | |
| { | |
| "epoch": 1.5807962529274004, | |
| "grad_norm": 0.18674606084823608, | |
| "learning_rate": 9.941894420924044e-06, | |
| "loss": 0.3274309396743774, | |
| "memory(GiB)": 133.17, | |
| "step": 1350, | |
| "token_acc": 0.8911319303466276, | |
| "train_speed(iter/s)": 0.068266 | |
| }, | |
| { | |
| "epoch": 1.5866510538641685, | |
| "grad_norm": 0.19703362882137299, | |
| "learning_rate": 9.87733505250094e-06, | |
| "loss": 0.33193011283874513, | |
| "memory(GiB)": 133.17, | |
| "step": 1355, | |
| "token_acc": 0.880100249375002, | |
| "train_speed(iter/s)": 0.068262 | |
| }, | |
| { | |
| "epoch": 1.5925058548009368, | |
| "grad_norm": 0.1925787329673767, | |
| "learning_rate": 9.812780797062678e-06, | |
| "loss": 0.328415060043335, | |
| "memory(GiB)": 133.17, | |
| "step": 1360, | |
| "token_acc": 0.8847896196463753, | |
| "train_speed(iter/s)": 0.068258 | |
| }, | |
| { | |
| "epoch": 1.598360655737705, | |
| "grad_norm": 0.19211165606975555, | |
| "learning_rate": 9.748234345393672e-06, | |
| "loss": 0.32412943840026853, | |
| "memory(GiB)": 133.17, | |
| "step": 1365, | |
| "token_acc": 0.8819075272921836, | |
| "train_speed(iter/s)": 0.068253 | |
| }, | |
| { | |
| "epoch": 1.604215456674473, | |
| "grad_norm": 0.19750450551509857, | |
| "learning_rate": 9.68369838795306e-06, | |
| "loss": 0.33218812942504883, | |
| "memory(GiB)": 133.17, | |
| "step": 1370, | |
| "token_acc": 0.8781786390424615, | |
| "train_speed(iter/s)": 0.068246 | |
| }, | |
| { | |
| "epoch": 1.6100702576112411, | |
| "grad_norm": 0.19090089201927185, | |
| "learning_rate": 9.61917561476255e-06, | |
| "loss": 0.3252577781677246, | |
| "memory(GiB)": 133.17, | |
| "step": 1375, | |
| "token_acc": 0.8718890721275258, | |
| "train_speed(iter/s)": 0.068245 | |
| }, | |
| { | |
| "epoch": 1.6159250585480094, | |
| "grad_norm": 0.2007261961698532, | |
| "learning_rate": 9.554668715294305e-06, | |
| "loss": 0.3365320205688477, | |
| "memory(GiB)": 133.17, | |
| "step": 1380, | |
| "token_acc": 0.8808937423036773, | |
| "train_speed(iter/s)": 0.068246 | |
| }, | |
| { | |
| "epoch": 1.6217798594847777, | |
| "grad_norm": 0.20129120349884033, | |
| "learning_rate": 9.490180378358826e-06, | |
| "loss": 0.33901381492614746, | |
| "memory(GiB)": 133.17, | |
| "step": 1385, | |
| "token_acc": 0.8765135837259478, | |
| "train_speed(iter/s)": 0.068245 | |
| }, | |
| { | |
| "epoch": 1.6276346604215457, | |
| "grad_norm": 0.18519413471221924, | |
| "learning_rate": 9.425713291992878e-06, | |
| "loss": 0.32805542945861815, | |
| "memory(GiB)": 133.17, | |
| "step": 1390, | |
| "token_acc": 0.8837560234916173, | |
| "train_speed(iter/s)": 0.068243 | |
| }, | |
| { | |
| "epoch": 1.6334894613583137, | |
| "grad_norm": 0.19597233831882477, | |
| "learning_rate": 9.361270143347452e-06, | |
| "loss": 0.3414484977722168, | |
| "memory(GiB)": 133.17, | |
| "step": 1395, | |
| "token_acc": 0.8769078651119291, | |
| "train_speed(iter/s)": 0.068243 | |
| }, | |
| { | |
| "epoch": 1.639344262295082, | |
| "grad_norm": 0.17986047267913818, | |
| "learning_rate": 9.296853618575753e-06, | |
| "loss": 0.32855379581451416, | |
| "memory(GiB)": 133.17, | |
| "step": 1400, | |
| "token_acc": 0.8869200388717233, | |
| "train_speed(iter/s)": 0.068244 | |
| }, | |
| { | |
| "epoch": 1.6451990632318503, | |
| "grad_norm": 0.2232111245393753, | |
| "learning_rate": 9.232466402721241e-06, | |
| "loss": 0.33907437324523926, | |
| "memory(GiB)": 133.17, | |
| "step": 1405, | |
| "token_acc": 0.8805843000676505, | |
| "train_speed(iter/s)": 0.068245 | |
| }, | |
| { | |
| "epoch": 1.651053864168618, | |
| "grad_norm": 0.19428326189517975, | |
| "learning_rate": 9.1681111796057e-06, | |
| "loss": 0.3294277906417847, | |
| "memory(GiB)": 133.17, | |
| "step": 1410, | |
| "token_acc": 0.8820219796725579, | |
| "train_speed(iter/s)": 0.068247 | |
| }, | |
| { | |
| "epoch": 1.6569086651053864, | |
| "grad_norm": 0.205523282289505, | |
| "learning_rate": 9.103790631717375e-06, | |
| "loss": 0.34450831413269045, | |
| "memory(GiB)": 133.17, | |
| "step": 1415, | |
| "token_acc": 0.8722953184421034, | |
| "train_speed(iter/s)": 0.068245 | |
| }, | |
| { | |
| "epoch": 1.6627634660421546, | |
| "grad_norm": 0.1955317109823227, | |
| "learning_rate": 9.039507440099164e-06, | |
| "loss": 0.32976531982421875, | |
| "memory(GiB)": 133.17, | |
| "step": 1420, | |
| "token_acc": 0.887285426963314, | |
| "train_speed(iter/s)": 0.068244 | |
| }, | |
| { | |
| "epoch": 1.6686182669789227, | |
| "grad_norm": 0.1974899172782898, | |
| "learning_rate": 8.975264284236866e-06, | |
| "loss": 0.33209028244018557, | |
| "memory(GiB)": 133.17, | |
| "step": 1425, | |
| "token_acc": 0.8826060927102499, | |
| "train_speed(iter/s)": 0.068246 | |
| }, | |
| { | |
| "epoch": 1.6744730679156907, | |
| "grad_norm": 0.20223510265350342, | |
| "learning_rate": 8.911063841947476e-06, | |
| "loss": 0.33354964256286623, | |
| "memory(GiB)": 133.17, | |
| "step": 1430, | |
| "token_acc": 0.8795497702238948, | |
| "train_speed(iter/s)": 0.068244 | |
| }, | |
| { | |
| "epoch": 1.680327868852459, | |
| "grad_norm": 0.19802114367485046, | |
| "learning_rate": 8.846908789267589e-06, | |
| "loss": 0.33350410461425783, | |
| "memory(GiB)": 133.17, | |
| "step": 1435, | |
| "token_acc": 0.8820895522388059, | |
| "train_speed(iter/s)": 0.068246 | |
| }, | |
| { | |
| "epoch": 1.6861826697892273, | |
| "grad_norm": 0.19948238134384155, | |
| "learning_rate": 8.78280180034184e-06, | |
| "loss": 0.3242588758468628, | |
| "memory(GiB)": 133.17, | |
| "step": 1440, | |
| "token_acc": 0.8763353704232109, | |
| "train_speed(iter/s)": 0.068246 | |
| }, | |
| { | |
| "epoch": 1.6920374707259953, | |
| "grad_norm": 0.19532591104507446, | |
| "learning_rate": 8.718745547311458e-06, | |
| "loss": 0.3360363721847534, | |
| "memory(GiB)": 133.17, | |
| "step": 1445, | |
| "token_acc": 0.8764055183683731, | |
| "train_speed(iter/s)": 0.068241 | |
| }, | |
| { | |
| "epoch": 1.6978922716627634, | |
| "grad_norm": 0.20000973343849182, | |
| "learning_rate": 8.654742700202849e-06, | |
| "loss": 0.33543264865875244, | |
| "memory(GiB)": 133.17, | |
| "step": 1450, | |
| "token_acc": 0.8791397393130521, | |
| "train_speed(iter/s)": 0.06824 | |
| }, | |
| { | |
| "epoch": 1.7037470725995316, | |
| "grad_norm": 0.193691685795784, | |
| "learning_rate": 8.590795926816348e-06, | |
| "loss": 0.32405283451080324, | |
| "memory(GiB)": 133.17, | |
| "step": 1455, | |
| "token_acc": 0.8792053838888559, | |
| "train_speed(iter/s)": 0.068239 | |
| }, | |
| { | |
| "epoch": 1.7096018735362999, | |
| "grad_norm": 0.18100841343402863, | |
| "learning_rate": 8.526907892614986e-06, | |
| "loss": 0.32940475940704345, | |
| "memory(GiB)": 133.17, | |
| "step": 1460, | |
| "token_acc": 0.8829538372890485, | |
| "train_speed(iter/s)": 0.068234 | |
| }, | |
| { | |
| "epoch": 1.715456674473068, | |
| "grad_norm": 0.2313033789396286, | |
| "learning_rate": 8.463081260613391e-06, | |
| "loss": 0.3310007810592651, | |
| "memory(GiB)": 133.17, | |
| "step": 1465, | |
| "token_acc": 0.8884524843192141, | |
| "train_speed(iter/s)": 0.068231 | |
| }, | |
| { | |
| "epoch": 1.721311475409836, | |
| "grad_norm": 0.19678162038326263, | |
| "learning_rate": 8.399318691266806e-06, | |
| "loss": 0.3346008062362671, | |
| "memory(GiB)": 133.17, | |
| "step": 1470, | |
| "token_acc": 0.8785229138209752, | |
| "train_speed(iter/s)": 0.068229 | |
| }, | |
| { | |
| "epoch": 1.7271662763466042, | |
| "grad_norm": 0.20874732732772827, | |
| "learning_rate": 8.335622842360168e-06, | |
| "loss": 0.3276866674423218, | |
| "memory(GiB)": 133.17, | |
| "step": 1475, | |
| "token_acc": 0.8830160906179125, | |
| "train_speed(iter/s)": 0.068224 | |
| }, | |
| { | |
| "epoch": 1.7330210772833725, | |
| "grad_norm": 0.20175132155418396, | |
| "learning_rate": 8.271996368897345e-06, | |
| "loss": 0.33496603965759275, | |
| "memory(GiB)": 133.17, | |
| "step": 1480, | |
| "token_acc": 0.8852224356801145, | |
| "train_speed(iter/s)": 0.06822 | |
| }, | |
| { | |
| "epoch": 1.7388758782201406, | |
| "grad_norm": 0.19031141698360443, | |
| "learning_rate": 8.208441922990454e-06, | |
| "loss": 0.32518749237060546, | |
| "memory(GiB)": 133.17, | |
| "step": 1485, | |
| "token_acc": 0.8788670711802744, | |
| "train_speed(iter/s)": 0.068219 | |
| }, | |
| { | |
| "epoch": 1.7447306791569086, | |
| "grad_norm": 0.19358490407466888, | |
| "learning_rate": 8.144962153749331e-06, | |
| "loss": 0.32768878936767576, | |
| "memory(GiB)": 133.17, | |
| "step": 1490, | |
| "token_acc": 0.8753486456636903, | |
| "train_speed(iter/s)": 0.068217 | |
| }, | |
| { | |
| "epoch": 1.7505854800936769, | |
| "grad_norm": 0.21087020635604858, | |
| "learning_rate": 8.081559707171094e-06, | |
| "loss": 0.3388930559158325, | |
| "memory(GiB)": 133.17, | |
| "step": 1495, | |
| "token_acc": 0.8764171874364358, | |
| "train_speed(iter/s)": 0.06822 | |
| }, | |
| { | |
| "epoch": 1.756440281030445, | |
| "grad_norm": 0.1951858252286911, | |
| "learning_rate": 8.01823722602986e-06, | |
| "loss": 0.3247065544128418, | |
| "memory(GiB)": 133.17, | |
| "step": 1500, | |
| "token_acc": 0.8884904457005652, | |
| "train_speed(iter/s)": 0.068221 | |
| }, | |
| { | |
| "epoch": 1.762295081967213, | |
| "grad_norm": 0.20260894298553467, | |
| "learning_rate": 7.954997349766576e-06, | |
| "loss": 0.33308422565460205, | |
| "memory(GiB)": 133.17, | |
| "step": 1505, | |
| "token_acc": 0.8817160406212514, | |
| "train_speed(iter/s)": 0.068221 | |
| }, | |
| { | |
| "epoch": 1.7681498829039812, | |
| "grad_norm": 0.19411516189575195, | |
| "learning_rate": 7.891842714379027e-06, | |
| "loss": 0.3207800626754761, | |
| "memory(GiB)": 133.17, | |
| "step": 1510, | |
| "token_acc": 0.8866104646064812, | |
| "train_speed(iter/s)": 0.068218 | |
| }, | |
| { | |
| "epoch": 1.7740046838407495, | |
| "grad_norm": 0.2132834941148758, | |
| "learning_rate": 7.828775952311921e-06, | |
| "loss": 0.32387499809265136, | |
| "memory(GiB)": 133.17, | |
| "step": 1515, | |
| "token_acc": 0.8781614519597012, | |
| "train_speed(iter/s)": 0.068214 | |
| }, | |
| { | |
| "epoch": 1.7798594847775175, | |
| "grad_norm": 0.2175895869731903, | |
| "learning_rate": 7.765799692347201e-06, | |
| "loss": 0.32644095420837405, | |
| "memory(GiB)": 133.17, | |
| "step": 1520, | |
| "token_acc": 0.878244971440831, | |
| "train_speed(iter/s)": 0.068212 | |
| }, | |
| { | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 0.20511025190353394, | |
| "learning_rate": 7.702916559494444e-06, | |
| "loss": 0.3338191032409668, | |
| "memory(GiB)": 133.17, | |
| "step": 1525, | |
| "token_acc": 0.8815095165856024, | |
| "train_speed(iter/s)": 0.068213 | |
| }, | |
| { | |
| "epoch": 1.7915690866510539, | |
| "grad_norm": 0.19504858553409576, | |
| "learning_rate": 7.64012917488146e-06, | |
| "loss": 0.31484146118164064, | |
| "memory(GiB)": 133.17, | |
| "step": 1530, | |
| "token_acc": 0.8943310386864273, | |
| "train_speed(iter/s)": 0.068215 | |
| }, | |
| { | |
| "epoch": 1.7974238875878221, | |
| "grad_norm": 0.2018832564353943, | |
| "learning_rate": 7.577440155645028e-06, | |
| "loss": 0.3253478050231934, | |
| "memory(GiB)": 133.17, | |
| "step": 1535, | |
| "token_acc": 0.883270074462929, | |
| "train_speed(iter/s)": 0.068211 | |
| }, | |
| { | |
| "epoch": 1.8032786885245902, | |
| "grad_norm": 0.18957826495170593, | |
| "learning_rate": 7.514852114821811e-06, | |
| "loss": 0.3356925010681152, | |
| "memory(GiB)": 133.17, | |
| "step": 1540, | |
| "token_acc": 0.8806853758108548, | |
| "train_speed(iter/s)": 0.06821 | |
| }, | |
| { | |
| "epoch": 1.8091334894613582, | |
| "grad_norm": 0.18248967826366425, | |
| "learning_rate": 7.452367661239433e-06, | |
| "loss": 0.3128045558929443, | |
| "memory(GiB)": 133.17, | |
| "step": 1545, | |
| "token_acc": 0.8822570031516938, | |
| "train_speed(iter/s)": 0.068208 | |
| }, | |
| { | |
| "epoch": 1.8149882903981265, | |
| "grad_norm": 0.21197733283042908, | |
| "learning_rate": 7.389989399407741e-06, | |
| "loss": 0.3383420467376709, | |
| "memory(GiB)": 133.17, | |
| "step": 1550, | |
| "token_acc": 0.8810136098103397, | |
| "train_speed(iter/s)": 0.068204 | |
| }, | |
| { | |
| "epoch": 1.8208430913348947, | |
| "grad_norm": 0.1846388280391693, | |
| "learning_rate": 7.3277199294102485e-06, | |
| "loss": 0.3210147857666016, | |
| "memory(GiB)": 133.17, | |
| "step": 1555, | |
| "token_acc": 0.8783838996638541, | |
| "train_speed(iter/s)": 0.068204 | |
| }, | |
| { | |
| "epoch": 1.8266978922716628, | |
| "grad_norm": 0.21333329379558563, | |
| "learning_rate": 7.265561846795741e-06, | |
| "loss": 0.33364644050598147, | |
| "memory(GiB)": 133.17, | |
| "step": 1560, | |
| "token_acc": 0.8799311976453201, | |
| "train_speed(iter/s)": 0.068197 | |
| }, | |
| { | |
| "epoch": 1.8325526932084308, | |
| "grad_norm": 0.1916390359401703, | |
| "learning_rate": 7.203517742470101e-06, | |
| "loss": 0.3300149440765381, | |
| "memory(GiB)": 133.17, | |
| "step": 1565, | |
| "token_acc": 0.8891839280314484, | |
| "train_speed(iter/s)": 0.068196 | |
| }, | |
| { | |
| "epoch": 1.838407494145199, | |
| "grad_norm": 0.1898123174905777, | |
| "learning_rate": 7.141590202588312e-06, | |
| "loss": 0.3347996711730957, | |
| "memory(GiB)": 133.17, | |
| "step": 1570, | |
| "token_acc": 0.8836206356563897, | |
| "train_speed(iter/s)": 0.068196 | |
| }, | |
| { | |
| "epoch": 1.8442622950819674, | |
| "grad_norm": 0.25897353887557983, | |
| "learning_rate": 7.079781808446648e-06, | |
| "loss": 0.33739614486694336, | |
| "memory(GiB)": 133.17, | |
| "step": 1575, | |
| "token_acc": 0.8688480209111277, | |
| "train_speed(iter/s)": 0.068193 | |
| }, | |
| { | |
| "epoch": 1.8501170960187352, | |
| "grad_norm": 0.18949347734451294, | |
| "learning_rate": 7.018095136375089e-06, | |
| "loss": 0.3224343299865723, | |
| "memory(GiB)": 133.17, | |
| "step": 1580, | |
| "token_acc": 0.8803287043737061, | |
| "train_speed(iter/s)": 0.068192 | |
| }, | |
| { | |
| "epoch": 1.8559718969555035, | |
| "grad_norm": 0.19546827673912048, | |
| "learning_rate": 6.956532757629945e-06, | |
| "loss": 0.3295243740081787, | |
| "memory(GiB)": 133.17, | |
| "step": 1585, | |
| "token_acc": 0.8802496310563046, | |
| "train_speed(iter/s)": 0.068189 | |
| }, | |
| { | |
| "epoch": 1.8618266978922717, | |
| "grad_norm": 0.1977819800376892, | |
| "learning_rate": 6.89509723828665e-06, | |
| "loss": 0.3339688777923584, | |
| "memory(GiB)": 133.17, | |
| "step": 1590, | |
| "token_acc": 0.8794367319992775, | |
| "train_speed(iter/s)": 0.068188 | |
| }, | |
| { | |
| "epoch": 1.8676814988290398, | |
| "grad_norm": 0.2035733312368393, | |
| "learning_rate": 6.833791139132824e-06, | |
| "loss": 0.3196906089782715, | |
| "memory(GiB)": 133.17, | |
| "step": 1595, | |
| "token_acc": 0.8819461276705585, | |
| "train_speed(iter/s)": 0.068183 | |
| }, | |
| { | |
| "epoch": 1.8735362997658078, | |
| "grad_norm": 0.18036054074764252, | |
| "learning_rate": 6.772617015561529e-06, | |
| "loss": 0.3284833192825317, | |
| "memory(GiB)": 133.17, | |
| "step": 1600, | |
| "token_acc": 0.8721648839682242, | |
| "train_speed(iter/s)": 0.068185 | |
| }, | |
| { | |
| "epoch": 1.879391100702576, | |
| "grad_norm": 0.19073913991451263, | |
| "learning_rate": 6.7115774174647475e-06, | |
| "loss": 0.3214848518371582, | |
| "memory(GiB)": 133.17, | |
| "step": 1605, | |
| "token_acc": 0.8888246134782375, | |
| "train_speed(iter/s)": 0.068184 | |
| }, | |
| { | |
| "epoch": 1.8852459016393444, | |
| "grad_norm": 0.22237442433834076, | |
| "learning_rate": 6.6506748891271045e-06, | |
| "loss": 0.3328333854675293, | |
| "memory(GiB)": 133.17, | |
| "step": 1610, | |
| "token_acc": 0.8864136225147821, | |
| "train_speed(iter/s)": 0.068183 | |
| }, | |
| { | |
| "epoch": 1.8911007025761124, | |
| "grad_norm": 0.18580298125743866, | |
| "learning_rate": 6.5899119691198025e-06, | |
| "loss": 0.3259113073348999, | |
| "memory(GiB)": 133.17, | |
| "step": 1615, | |
| "token_acc": 0.8816001292832858, | |
| "train_speed(iter/s)": 0.068182 | |
| }, | |
| { | |
| "epoch": 1.8969555035128804, | |
| "grad_norm": 0.19562335312366486, | |
| "learning_rate": 6.529291190194829e-06, | |
| "loss": 0.3301589012145996, | |
| "memory(GiB)": 133.17, | |
| "step": 1620, | |
| "token_acc": 0.8816063260815503, | |
| "train_speed(iter/s)": 0.068183 | |
| }, | |
| { | |
| "epoch": 1.9028103044496487, | |
| "grad_norm": 0.19002656638622284, | |
| "learning_rate": 6.468815079179364e-06, | |
| "loss": 0.32632834911346437, | |
| "memory(GiB)": 133.17, | |
| "step": 1625, | |
| "token_acc": 0.8859821923514176, | |
| "train_speed(iter/s)": 0.06818 | |
| }, | |
| { | |
| "epoch": 1.908665105386417, | |
| "grad_norm": 0.19892436265945435, | |
| "learning_rate": 6.408486156870466e-06, | |
| "loss": 0.33937792778015136, | |
| "memory(GiB)": 133.17, | |
| "step": 1630, | |
| "token_acc": 0.862874582417446, | |
| "train_speed(iter/s)": 0.068181 | |
| }, | |
| { | |
| "epoch": 1.914519906323185, | |
| "grad_norm": 0.19243668019771576, | |
| "learning_rate": 6.348306937929991e-06, | |
| "loss": 0.3362755537033081, | |
| "memory(GiB)": 133.17, | |
| "step": 1635, | |
| "token_acc": 0.8769627409259633, | |
| "train_speed(iter/s)": 0.068182 | |
| }, | |
| { | |
| "epoch": 1.920374707259953, | |
| "grad_norm": 0.18101197481155396, | |
| "learning_rate": 6.288279930779789e-06, | |
| "loss": 0.31793382167816164, | |
| "memory(GiB)": 133.17, | |
| "step": 1640, | |
| "token_acc": 0.890389030411674, | |
| "train_speed(iter/s)": 0.068181 | |
| }, | |
| { | |
| "epoch": 1.9262295081967213, | |
| "grad_norm": 0.2016856074333191, | |
| "learning_rate": 6.228407637497131e-06, | |
| "loss": 0.3286017417907715, | |
| "memory(GiB)": 133.17, | |
| "step": 1645, | |
| "token_acc": 0.8691879609602018, | |
| "train_speed(iter/s)": 0.068179 | |
| }, | |
| { | |
| "epoch": 1.9320843091334896, | |
| "grad_norm": 0.18602800369262695, | |
| "learning_rate": 6.1686925537104306e-06, | |
| "loss": 0.3186060905456543, | |
| "memory(GiB)": 133.17, | |
| "step": 1650, | |
| "token_acc": 0.8740382186265122, | |
| "train_speed(iter/s)": 0.068178 | |
| }, | |
| { | |
| "epoch": 1.9379391100702577, | |
| "grad_norm": 0.19921670854091644, | |
| "learning_rate": 6.109137168495205e-06, | |
| "loss": 0.325826621055603, | |
| "memory(GiB)": 133.17, | |
| "step": 1655, | |
| "token_acc": 0.8942359105977971, | |
| "train_speed(iter/s)": 0.068178 | |
| }, | |
| { | |
| "epoch": 1.9437939110070257, | |
| "grad_norm": 0.1804487407207489, | |
| "learning_rate": 6.049743964270336e-06, | |
| "loss": 0.33586409091949465, | |
| "memory(GiB)": 133.17, | |
| "step": 1660, | |
| "token_acc": 0.8788306137094006, | |
| "train_speed(iter/s)": 0.068176 | |
| }, | |
| { | |
| "epoch": 1.949648711943794, | |
| "grad_norm": 0.20771907269954681, | |
| "learning_rate": 5.990515416694591e-06, | |
| "loss": 0.3336956024169922, | |
| "memory(GiB)": 133.17, | |
| "step": 1665, | |
| "token_acc": 0.8826585274697895, | |
| "train_speed(iter/s)": 0.068173 | |
| }, | |
| { | |
| "epoch": 1.955503512880562, | |
| "grad_norm": 0.19965799152851105, | |
| "learning_rate": 5.931453994563434e-06, | |
| "loss": 0.3285707473754883, | |
| "memory(GiB)": 133.17, | |
| "step": 1670, | |
| "token_acc": 0.8875544099179484, | |
| "train_speed(iter/s)": 0.068174 | |
| }, | |
| { | |
| "epoch": 1.96135831381733, | |
| "grad_norm": 0.20612315833568573, | |
| "learning_rate": 5.872562159706116e-06, | |
| "loss": 0.3315183877944946, | |
| "memory(GiB)": 133.17, | |
| "step": 1675, | |
| "token_acc": 0.8774614658697704, | |
| "train_speed(iter/s)": 0.068172 | |
| }, | |
| { | |
| "epoch": 1.9672131147540983, | |
| "grad_norm": 0.18963313102722168, | |
| "learning_rate": 5.8138423668830605e-06, | |
| "loss": 0.324364972114563, | |
| "memory(GiB)": 133.17, | |
| "step": 1680, | |
| "token_acc": 0.8801062072294897, | |
| "train_speed(iter/s)": 0.068174 | |
| }, | |
| { | |
| "epoch": 1.9730679156908666, | |
| "grad_norm": 0.19694305956363678, | |
| "learning_rate": 5.755297063683551e-06, | |
| "loss": 0.3285407066345215, | |
| "memory(GiB)": 133.17, | |
| "step": 1685, | |
| "token_acc": 0.885107199114613, | |
| "train_speed(iter/s)": 0.068174 | |
| }, | |
| { | |
| "epoch": 1.9789227166276346, | |
| "grad_norm": 0.18662695586681366, | |
| "learning_rate": 5.696928690423693e-06, | |
| "loss": 0.32373480796813964, | |
| "memory(GiB)": 133.17, | |
| "step": 1690, | |
| "token_acc": 0.8790801928023776, | |
| "train_speed(iter/s)": 0.068176 | |
| }, | |
| { | |
| "epoch": 1.9847775175644027, | |
| "grad_norm": 0.19431762397289276, | |
| "learning_rate": 5.638739680044718e-06, | |
| "loss": 0.3377500057220459, | |
| "memory(GiB)": 133.17, | |
| "step": 1695, | |
| "token_acc": 0.8722363298833375, | |
| "train_speed(iter/s)": 0.068175 | |
| }, | |
| { | |
| "epoch": 1.990632318501171, | |
| "grad_norm": 0.2024122029542923, | |
| "learning_rate": 5.580732458011544e-06, | |
| "loss": 0.3272620439529419, | |
| "memory(GiB)": 133.17, | |
| "step": 1700, | |
| "token_acc": 0.8856717266189297, | |
| "train_speed(iter/s)": 0.068173 | |
| }, | |
| { | |
| "epoch": 1.9964871194379392, | |
| "grad_norm": 0.18394924700260162, | |
| "learning_rate": 5.522909442211708e-06, | |
| "loss": 0.32718348503112793, | |
| "memory(GiB)": 133.17, | |
| "step": 1705, | |
| "token_acc": 0.876887289049153, | |
| "train_speed(iter/s)": 0.06817 | |
| }, | |
| { | |
| "epoch": 2.002341920374707, | |
| "grad_norm": 0.2651495337486267, | |
| "learning_rate": 5.465273042854551e-06, | |
| "loss": 0.31393914222717284, | |
| "memory(GiB)": 133.17, | |
| "step": 1710, | |
| "token_acc": 0.8893703023658244, | |
| "train_speed(iter/s)": 0.06798 | |
| }, | |
| { | |
| "epoch": 2.0081967213114753, | |
| "grad_norm": 0.21041427552700043, | |
| "learning_rate": 5.407825662370778e-06, | |
| "loss": 0.299090313911438, | |
| "memory(GiB)": 133.17, | |
| "step": 1715, | |
| "token_acc": 0.8915390401403241, | |
| "train_speed(iter/s)": 0.067979 | |
| }, | |
| { | |
| "epoch": 2.0140515222482436, | |
| "grad_norm": 0.21380308270454407, | |
| "learning_rate": 5.350569695312313e-06, | |
| "loss": 0.3101144790649414, | |
| "memory(GiB)": 133.17, | |
| "step": 1720, | |
| "token_acc": 0.8875269739992413, | |
| "train_speed(iter/s)": 0.067976 | |
| }, | |
| { | |
| "epoch": 2.019906323185012, | |
| "grad_norm": 0.1987718939781189, | |
| "learning_rate": 5.293507528252474e-06, | |
| "loss": 0.3136857509613037, | |
| "memory(GiB)": 133.17, | |
| "step": 1725, | |
| "token_acc": 0.8871349620144686, | |
| "train_speed(iter/s)": 0.067974 | |
| }, | |
| { | |
| "epoch": 2.0257611241217797, | |
| "grad_norm": 0.3591626286506653, | |
| "learning_rate": 5.236641539686518e-06, | |
| "loss": 0.30123333930969237, | |
| "memory(GiB)": 133.17, | |
| "step": 1730, | |
| "token_acc": 0.8860105084502068, | |
| "train_speed(iter/s)": 0.067977 | |
| }, | |
| { | |
| "epoch": 2.031615925058548, | |
| "grad_norm": 0.19819702208042145, | |
| "learning_rate": 5.179974099932472e-06, | |
| "loss": 0.29487655162811277, | |
| "memory(GiB)": 133.17, | |
| "step": 1735, | |
| "token_acc": 0.8855569615495446, | |
| "train_speed(iter/s)": 0.067974 | |
| }, | |
| { | |
| "epoch": 2.037470725995316, | |
| "grad_norm": 0.2023162841796875, | |
| "learning_rate": 5.12350757103236e-06, | |
| "loss": 0.29470908641815186, | |
| "memory(GiB)": 133.17, | |
| "step": 1740, | |
| "token_acc": 0.8894021747623796, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.0433255269320845, | |
| "grad_norm": 0.19459553062915802, | |
| "learning_rate": 5.067244306653736e-06, | |
| "loss": 0.30195889472961424, | |
| "memory(GiB)": 133.17, | |
| "step": 1745, | |
| "token_acc": 0.8966922700402876, | |
| "train_speed(iter/s)": 0.067976 | |
| }, | |
| { | |
| "epoch": 2.0491803278688523, | |
| "grad_norm": 0.20582208037376404, | |
| "learning_rate": 5.0111866519915575e-06, | |
| "loss": 0.2972427845001221, | |
| "memory(GiB)": 133.17, | |
| "step": 1750, | |
| "token_acc": 0.8860103790300714, | |
| "train_speed(iter/s)": 0.067974 | |
| }, | |
| { | |
| "epoch": 2.0550351288056206, | |
| "grad_norm": 0.21163956820964813, | |
| "learning_rate": 4.95533694367047e-06, | |
| "loss": 0.2951073408126831, | |
| "memory(GiB)": 133.17, | |
| "step": 1755, | |
| "token_acc": 0.8911898143660713, | |
| "train_speed(iter/s)": 0.067976 | |
| }, | |
| { | |
| "epoch": 2.060889929742389, | |
| "grad_norm": 0.271316796541214, | |
| "learning_rate": 4.899697509647379e-06, | |
| "loss": 0.3005206108093262, | |
| "memory(GiB)": 133.17, | |
| "step": 1760, | |
| "token_acc": 0.8827217211398426, | |
| "train_speed(iter/s)": 0.067977 | |
| }, | |
| { | |
| "epoch": 2.066744730679157, | |
| "grad_norm": 0.1982126086950302, | |
| "learning_rate": 4.844270669114424e-06, | |
| "loss": 0.30247581005096436, | |
| "memory(GiB)": 133.17, | |
| "step": 1765, | |
| "token_acc": 0.8939350325087765, | |
| "train_speed(iter/s)": 0.067979 | |
| }, | |
| { | |
| "epoch": 2.072599531615925, | |
| "grad_norm": 0.20624509453773499, | |
| "learning_rate": 4.789058732402319e-06, | |
| "loss": 0.2944344520568848, | |
| "memory(GiB)": 133.17, | |
| "step": 1770, | |
| "token_acc": 0.8877186400937866, | |
| "train_speed(iter/s)": 0.067979 | |
| }, | |
| { | |
| "epoch": 2.078454332552693, | |
| "grad_norm": 0.18864554166793823, | |
| "learning_rate": 4.734064000884044e-06, | |
| "loss": 0.31334614753723145, | |
| "memory(GiB)": 133.17, | |
| "step": 1775, | |
| "token_acc": 0.8753750599625646, | |
| "train_speed(iter/s)": 0.067981 | |
| }, | |
| { | |
| "epoch": 2.0843091334894615, | |
| "grad_norm": 0.19976413249969482, | |
| "learning_rate": 4.679288766878908e-06, | |
| "loss": 0.3065293073654175, | |
| "memory(GiB)": 133.17, | |
| "step": 1780, | |
| "token_acc": 0.8893787799945783, | |
| "train_speed(iter/s)": 0.067979 | |
| }, | |
| { | |
| "epoch": 2.0901639344262297, | |
| "grad_norm": 0.20083464682102203, | |
| "learning_rate": 4.624735313557019e-06, | |
| "loss": 0.30294094085693357, | |
| "memory(GiB)": 133.17, | |
| "step": 1785, | |
| "token_acc": 0.894106624191886, | |
| "train_speed(iter/s)": 0.06798 | |
| }, | |
| { | |
| "epoch": 2.0960187353629975, | |
| "grad_norm": 0.19687768816947937, | |
| "learning_rate": 4.570405914844105e-06, | |
| "loss": 0.29626712799072263, | |
| "memory(GiB)": 133.17, | |
| "step": 1790, | |
| "token_acc": 0.8918388887847958, | |
| "train_speed(iter/s)": 0.067977 | |
| }, | |
| { | |
| "epoch": 2.101873536299766, | |
| "grad_norm": 0.21042723953723907, | |
| "learning_rate": 4.516302835326723e-06, | |
| "loss": 0.30143260955810547, | |
| "memory(GiB)": 133.17, | |
| "step": 1795, | |
| "token_acc": 0.8979354142270508, | |
| "train_speed(iter/s)": 0.067977 | |
| }, | |
| { | |
| "epoch": 2.107728337236534, | |
| "grad_norm": 0.20909157395362854, | |
| "learning_rate": 4.462428330157886e-06, | |
| "loss": 0.29250779151916506, | |
| "memory(GiB)": 133.17, | |
| "step": 1800, | |
| "token_acc": 0.8972882018187891, | |
| "train_speed(iter/s)": 0.067976 | |
| }, | |
| { | |
| "epoch": 2.113583138173302, | |
| "grad_norm": 0.18871068954467773, | |
| "learning_rate": 4.4087846449630475e-06, | |
| "loss": 0.296770715713501, | |
| "memory(GiB)": 133.17, | |
| "step": 1805, | |
| "token_acc": 0.8939121347421645, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.11943793911007, | |
| "grad_norm": 0.19644689559936523, | |
| "learning_rate": 4.355374015746493e-06, | |
| "loss": 0.29331092834472655, | |
| "memory(GiB)": 133.17, | |
| "step": 1810, | |
| "token_acc": 0.8898344723236344, | |
| "train_speed(iter/s)": 0.067974 | |
| }, | |
| { | |
| "epoch": 2.1252927400468384, | |
| "grad_norm": 0.2067333608865738, | |
| "learning_rate": 4.302198668798159e-06, | |
| "loss": 0.298096752166748, | |
| "memory(GiB)": 133.17, | |
| "step": 1815, | |
| "token_acc": 0.8860096940702505, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.1311475409836067, | |
| "grad_norm": 0.19337214529514313, | |
| "learning_rate": 4.249260820600813e-06, | |
| "loss": 0.28569879531860354, | |
| "memory(GiB)": 133.17, | |
| "step": 1820, | |
| "token_acc": 0.8937030726309285, | |
| "train_speed(iter/s)": 0.067973 | |
| }, | |
| { | |
| "epoch": 2.1370023419203745, | |
| "grad_norm": 0.21502645313739777, | |
| "learning_rate": 4.1965626777376766e-06, | |
| "loss": 0.29423298835754397, | |
| "memory(GiB)": 133.17, | |
| "step": 1825, | |
| "token_acc": 0.8971041975679516, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 0.18807381391525269, | |
| "learning_rate": 4.144106436800453e-06, | |
| "loss": 0.30044715404510497, | |
| "memory(GiB)": 133.17, | |
| "step": 1830, | |
| "token_acc": 0.8974527790728444, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.148711943793911, | |
| "grad_norm": 0.18506018817424774, | |
| "learning_rate": 4.091894284297758e-06, | |
| "loss": 0.2915837526321411, | |
| "memory(GiB)": 133.17, | |
| "step": 1835, | |
| "token_acc": 0.8848257422956048, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.1545667447306793, | |
| "grad_norm": 0.22477097809314728, | |
| "learning_rate": 4.039928396563983e-06, | |
| "loss": 0.3101827621459961, | |
| "memory(GiB)": 133.17, | |
| "step": 1840, | |
| "token_acc": 0.8897657467466561, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.160421545667447, | |
| "grad_norm": 0.20848192274570465, | |
| "learning_rate": 3.9882109396685845e-06, | |
| "loss": 0.28560404777526854, | |
| "memory(GiB)": 133.17, | |
| "step": 1845, | |
| "token_acc": 0.8866163430466006, | |
| "train_speed(iter/s)": 0.067973 | |
| }, | |
| { | |
| "epoch": 2.1662763466042154, | |
| "grad_norm": 0.1790919452905655, | |
| "learning_rate": 3.936744069325797e-06, | |
| "loss": 0.28580513000488283, | |
| "memory(GiB)": 133.17, | |
| "step": 1850, | |
| "token_acc": 0.8959953003524735, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.1721311475409837, | |
| "grad_norm": 0.1946616917848587, | |
| "learning_rate": 3.885529930804768e-06, | |
| "loss": 0.28203678131103516, | |
| "memory(GiB)": 133.17, | |
| "step": 1855, | |
| "token_acc": 0.8865687290155894, | |
| "train_speed(iter/s)": 0.06797 | |
| }, | |
| { | |
| "epoch": 2.177985948477752, | |
| "grad_norm": 0.2024662047624588, | |
| "learning_rate": 3.834570658840152e-06, | |
| "loss": 0.3013646602630615, | |
| "memory(GiB)": 133.17, | |
| "step": 1860, | |
| "token_acc": 0.8853980676749265, | |
| "train_speed(iter/s)": 0.06797 | |
| }, | |
| { | |
| "epoch": 2.1838407494145198, | |
| "grad_norm": 0.1884947568178177, | |
| "learning_rate": 3.7838683775431106e-06, | |
| "loss": 0.2940408706665039, | |
| "memory(GiB)": 133.17, | |
| "step": 1865, | |
| "token_acc": 0.8914837094453064, | |
| "train_speed(iter/s)": 0.067971 | |
| }, | |
| { | |
| "epoch": 2.189695550351288, | |
| "grad_norm": 0.19168955087661743, | |
| "learning_rate": 3.733425200312797e-06, | |
| "loss": 0.2958191156387329, | |
| "memory(GiB)": 133.17, | |
| "step": 1870, | |
| "token_acc": 0.8822518250452361, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.1955503512880563, | |
| "grad_norm": 0.194383904337883, | |
| "learning_rate": 3.683243229748249e-06, | |
| "loss": 0.28948154449462893, | |
| "memory(GiB)": 133.17, | |
| "step": 1875, | |
| "token_acc": 0.8876668322153558, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.201405152224824, | |
| "grad_norm": 0.19730046391487122, | |
| "learning_rate": 3.633324557560747e-06, | |
| "loss": 0.29555392265319824, | |
| "memory(GiB)": 133.17, | |
| "step": 1880, | |
| "token_acc": 0.8939645340207796, | |
| "train_speed(iter/s)": 0.067973 | |
| }, | |
| { | |
| "epoch": 2.2072599531615924, | |
| "grad_norm": 0.18545053899288177, | |
| "learning_rate": 3.5836712644866277e-06, | |
| "loss": 0.28943870067596433, | |
| "memory(GiB)": 133.17, | |
| "step": 1885, | |
| "token_acc": 0.8883624593035462, | |
| "train_speed(iter/s)": 0.067973 | |
| }, | |
| { | |
| "epoch": 2.2131147540983607, | |
| "grad_norm": 0.20143678784370422, | |
| "learning_rate": 3.5342854202005696e-06, | |
| "loss": 0.29045825004577636, | |
| "memory(GiB)": 133.17, | |
| "step": 1890, | |
| "token_acc": 0.8931490778817771, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.218969555035129, | |
| "grad_norm": 0.1933010071516037, | |
| "learning_rate": 3.485169083229293e-06, | |
| "loss": 0.2985133409500122, | |
| "memory(GiB)": 133.17, | |
| "step": 1895, | |
| "token_acc": 0.8983116114671417, | |
| "train_speed(iter/s)": 0.067977 | |
| }, | |
| { | |
| "epoch": 2.2248243559718968, | |
| "grad_norm": 0.2029975950717926, | |
| "learning_rate": 3.4363243008657842e-06, | |
| "loss": 0.29316296577453616, | |
| "memory(GiB)": 133.17, | |
| "step": 1900, | |
| "token_acc": 0.8817185537873807, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.230679156908665, | |
| "grad_norm": 0.1947357952594757, | |
| "learning_rate": 3.3877531090839478e-06, | |
| "loss": 0.2983538627624512, | |
| "memory(GiB)": 133.17, | |
| "step": 1905, | |
| "token_acc": 0.8836724096943308, | |
| "train_speed(iter/s)": 0.067976 | |
| }, | |
| { | |
| "epoch": 2.2365339578454333, | |
| "grad_norm": 0.19401586055755615, | |
| "learning_rate": 3.3394575324537327e-06, | |
| "loss": 0.3019071578979492, | |
| "memory(GiB)": 133.17, | |
| "step": 1910, | |
| "token_acc": 0.8830038763307387, | |
| "train_speed(iter/s)": 0.067973 | |
| }, | |
| { | |
| "epoch": 2.2423887587822016, | |
| "grad_norm": 0.18747617304325104, | |
| "learning_rate": 3.2914395840567605e-06, | |
| "loss": 0.2899949312210083, | |
| "memory(GiB)": 133.17, | |
| "step": 1915, | |
| "token_acc": 0.8937491349698655, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.2482435597189694, | |
| "grad_norm": 0.18497265875339508, | |
| "learning_rate": 3.2437012654024057e-06, | |
| "loss": 0.29514849185943604, | |
| "memory(GiB)": 133.17, | |
| "step": 1920, | |
| "token_acc": 0.8952329266162637, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.2540983606557377, | |
| "grad_norm": 0.19390814006328583, | |
| "learning_rate": 3.1962445663443643e-06, | |
| "loss": 0.29795031547546386, | |
| "memory(GiB)": 133.17, | |
| "step": 1925, | |
| "token_acc": 0.8883286157922032, | |
| "train_speed(iter/s)": 0.067969 | |
| }, | |
| { | |
| "epoch": 2.259953161592506, | |
| "grad_norm": 0.19152696430683136, | |
| "learning_rate": 3.1490714649977196e-06, | |
| "loss": 0.3013578414916992, | |
| "memory(GiB)": 133.17, | |
| "step": 1930, | |
| "token_acc": 0.8966371415703348, | |
| "train_speed(iter/s)": 0.067968 | |
| }, | |
| { | |
| "epoch": 2.265807962529274, | |
| "grad_norm": 0.19523312151432037, | |
| "learning_rate": 3.102183927656488e-06, | |
| "loss": 0.29044888019561765, | |
| "memory(GiB)": 133.17, | |
| "step": 1935, | |
| "token_acc": 0.8816311924321633, | |
| "train_speed(iter/s)": 0.067969 | |
| }, | |
| { | |
| "epoch": 2.271662763466042, | |
| "grad_norm": 0.19032931327819824, | |
| "learning_rate": 3.0555839087116547e-06, | |
| "loss": 0.30231542587280275, | |
| "memory(GiB)": 133.17, | |
| "step": 1940, | |
| "token_acc": 0.89288125, | |
| "train_speed(iter/s)": 0.06797 | |
| }, | |
| { | |
| "epoch": 2.2775175644028103, | |
| "grad_norm": 0.19542452692985535, | |
| "learning_rate": 3.009273350569705e-06, | |
| "loss": 0.3001267433166504, | |
| "memory(GiB)": 133.17, | |
| "step": 1945, | |
| "token_acc": 0.8971306271312823, | |
| "train_speed(iter/s)": 0.067973 | |
| }, | |
| { | |
| "epoch": 2.2833723653395785, | |
| "grad_norm": 0.1856907606124878, | |
| "learning_rate": 2.963254183571682e-06, | |
| "loss": 0.29535422325134275, | |
| "memory(GiB)": 133.17, | |
| "step": 1950, | |
| "token_acc": 0.8930906317907196, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.289227166276347, | |
| "grad_norm": 0.1937672644853592, | |
| "learning_rate": 2.9175283259126943e-06, | |
| "loss": 0.2962016582489014, | |
| "memory(GiB)": 133.17, | |
| "step": 1955, | |
| "token_acc": 0.898554810095657, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.2950819672131146, | |
| "grad_norm": 0.18747203052043915, | |
| "learning_rate": 2.872097683561986e-06, | |
| "loss": 0.2947913885116577, | |
| "memory(GiB)": 133.17, | |
| "step": 1960, | |
| "token_acc": 0.9006477145474906, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.300936768149883, | |
| "grad_norm": 0.188889279961586, | |
| "learning_rate": 2.8269641501834834e-06, | |
| "loss": 0.3037715911865234, | |
| "memory(GiB)": 133.17, | |
| "step": 1965, | |
| "token_acc": 0.8837665048634434, | |
| "train_speed(iter/s)": 0.067974 | |
| }, | |
| { | |
| "epoch": 2.306791569086651, | |
| "grad_norm": 0.18386943638324738, | |
| "learning_rate": 2.782129607056848e-06, | |
| "loss": 0.29630954265594484, | |
| "memory(GiB)": 133.17, | |
| "step": 1970, | |
| "token_acc": 0.8854790349100962, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.312646370023419, | |
| "grad_norm": 0.1860065758228302, | |
| "learning_rate": 2.7375959229990856e-06, | |
| "loss": 0.2871407508850098, | |
| "memory(GiB)": 133.17, | |
| "step": 1975, | |
| "token_acc": 0.8921277606269294, | |
| "train_speed(iter/s)": 0.067973 | |
| }, | |
| { | |
| "epoch": 2.3185011709601873, | |
| "grad_norm": 0.18686528503894806, | |
| "learning_rate": 2.6933649542866326e-06, | |
| "loss": 0.29081072807312014, | |
| "memory(GiB)": 133.17, | |
| "step": 1980, | |
| "token_acc": 0.8890196371424658, | |
| "train_speed(iter/s)": 0.067976 | |
| }, | |
| { | |
| "epoch": 2.3243559718969555, | |
| "grad_norm": 0.17976053059101105, | |
| "learning_rate": 2.649438544577977e-06, | |
| "loss": 0.2809652090072632, | |
| "memory(GiB)": 133.17, | |
| "step": 1985, | |
| "token_acc": 0.8903470664805608, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.330210772833724, | |
| "grad_norm": 0.18407879769802094, | |
| "learning_rate": 2.6058185248368317e-06, | |
| "loss": 0.2934088706970215, | |
| "memory(GiB)": 133.17, | |
| "step": 1990, | |
| "token_acc": 0.896975139427167, | |
| "train_speed(iter/s)": 0.067974 | |
| }, | |
| { | |
| "epoch": 2.3360655737704916, | |
| "grad_norm": 0.1934359222650528, | |
| "learning_rate": 2.562506713255789e-06, | |
| "loss": 0.2888351917266846, | |
| "memory(GiB)": 133.17, | |
| "step": 1995, | |
| "token_acc": 0.8959334542575936, | |
| "train_speed(iter/s)": 0.067976 | |
| }, | |
| { | |
| "epoch": 2.34192037470726, | |
| "grad_norm": 0.19506384432315826, | |
| "learning_rate": 2.519504915180555e-06, | |
| "loss": 0.29209365844726565, | |
| "memory(GiB)": 133.17, | |
| "step": 2000, | |
| "token_acc": 0.8933135020860227, | |
| "train_speed(iter/s)": 0.067975 | |
| }, | |
| { | |
| "epoch": 2.347775175644028, | |
| "grad_norm": 0.2316175252199173, | |
| "learning_rate": 2.4768149230346917e-06, | |
| "loss": 0.30724682807922366, | |
| "memory(GiB)": 133.17, | |
| "step": 2005, | |
| "token_acc": 0.8923097611088004, | |
| "train_speed(iter/s)": 0.067972 | |
| }, | |
| { | |
| "epoch": 2.3536299765807964, | |
| "grad_norm": 0.19259780645370483, | |
| "learning_rate": 2.4344385162448924e-06, | |
| "loss": 0.29259405136108396, | |
| "memory(GiB)": 133.17, | |
| "step": 2010, | |
| "token_acc": 0.8962570117930629, | |
| "train_speed(iter/s)": 0.067971 | |
| }, | |
| { | |
| "epoch": 2.3594847775175642, | |
| "grad_norm": 0.18455654382705688, | |
| "learning_rate": 2.392377461166826e-06, | |
| "loss": 0.2909110069274902, | |
| "memory(GiB)": 133.17, | |
| "step": 2015, | |
| "token_acc": 0.8950562827510747, | |
| "train_speed(iter/s)": 0.067971 | |
| }, | |
| { | |
| "epoch": 2.3653395784543325, | |
| "grad_norm": 0.19333600997924805, | |
| "learning_rate": 2.350633511011511e-06, | |
| "loss": 0.2987373352050781, | |
| "memory(GiB)": 133.17, | |
| "step": 2020, | |
| "token_acc": 0.890084898990847, | |
| "train_speed(iter/s)": 0.067969 | |
| }, | |
| { | |
| "epoch": 2.371194379391101, | |
| "grad_norm": 0.18590733408927917, | |
| "learning_rate": 2.309208405772221e-06, | |
| "loss": 0.3060739278793335, | |
| "memory(GiB)": 133.17, | |
| "step": 2025, | |
| "token_acc": 0.894386606817296, | |
| "train_speed(iter/s)": 0.06797 | |
| }, | |
| { | |
| "epoch": 2.3770491803278686, | |
| "grad_norm": 0.19246318936347961, | |
| "learning_rate": 2.2681038721519768e-06, | |
| "loss": 0.3093658208847046, | |
| "memory(GiB)": 133.17, | |
| "step": 2030, | |
| "token_acc": 0.8894458411573517, | |
| "train_speed(iter/s)": 0.067969 | |
| }, | |
| { | |
| "epoch": 2.382903981264637, | |
| "grad_norm": 0.19371892511844635, | |
| "learning_rate": 2.227321623491563e-06, | |
| "loss": 0.2991630077362061, | |
| "memory(GiB)": 133.17, | |
| "step": 2035, | |
| "token_acc": 0.8873450543890716, | |
| "train_speed(iter/s)": 0.067968 | |
| }, | |
| { | |
| "epoch": 2.388758782201405, | |
| "grad_norm": 0.17911982536315918, | |
| "learning_rate": 2.186863359698108e-06, | |
| "loss": 0.29452369213104246, | |
| "memory(GiB)": 133.17, | |
| "step": 2040, | |
| "token_acc": 0.9101415057216162, | |
| "train_speed(iter/s)": 0.067967 | |
| }, | |
| { | |
| "epoch": 2.3946135831381734, | |
| "grad_norm": 0.19897328317165375, | |
| "learning_rate": 2.1467307671742377e-06, | |
| "loss": 0.2978281736373901, | |
| "memory(GiB)": 133.17, | |
| "step": 2045, | |
| "token_acc": 0.8880359089210048, | |
| "train_speed(iter/s)": 0.067968 | |
| }, | |
| { | |
| "epoch": 2.4004683840749417, | |
| "grad_norm": 0.19614428281784058, | |
| "learning_rate": 2.106925518747779e-06, | |
| "loss": 0.2917934417724609, | |
| "memory(GiB)": 133.17, | |
| "step": 2050, | |
| "token_acc": 0.892301005603362, | |
| "train_speed(iter/s)": 0.067968 | |
| }, | |
| { | |
| "epoch": 2.4063231850117095, | |
| "grad_norm": 0.18466618657112122, | |
| "learning_rate": 2.06744927360202e-06, | |
| "loss": 0.2950620651245117, | |
| "memory(GiB)": 133.17, | |
| "step": 2055, | |
| "token_acc": 0.8911625268446858, | |
| "train_speed(iter/s)": 0.067964 | |
| }, | |
| { | |
| "epoch": 2.4121779859484778, | |
| "grad_norm": 0.19196145236492157, | |
| "learning_rate": 2.0283036772065712e-06, | |
| "loss": 0.29646165370941163, | |
| "memory(GiB)": 133.17, | |
| "step": 2060, | |
| "token_acc": 0.8943602932370165, | |
| "train_speed(iter/s)": 0.067964 | |
| }, | |
| { | |
| "epoch": 2.418032786885246, | |
| "grad_norm": 0.20200960338115692, | |
| "learning_rate": 1.9894903612487683e-06, | |
| "loss": 0.30394654273986815, | |
| "memory(GiB)": 133.17, | |
| "step": 2065, | |
| "token_acc": 0.882068843029542, | |
| "train_speed(iter/s)": 0.067964 | |
| }, | |
| { | |
| "epoch": 2.423887587822014, | |
| "grad_norm": 0.19281496107578278, | |
| "learning_rate": 1.9510109435656457e-06, | |
| "loss": 0.30283074378967284, | |
| "memory(GiB)": 133.17, | |
| "step": 2070, | |
| "token_acc": 0.8936107928433829, | |
| "train_speed(iter/s)": 0.067965 | |
| }, | |
| { | |
| "epoch": 2.429742388758782, | |
| "grad_norm": 0.1977166384458542, | |
| "learning_rate": 1.9128670280765283e-06, | |
| "loss": 0.30489649772644045, | |
| "memory(GiB)": 133.17, | |
| "step": 2075, | |
| "token_acc": 0.8869975460007921, | |
| "train_speed(iter/s)": 0.067962 | |
| }, | |
| { | |
| "epoch": 2.4355971896955504, | |
| "grad_norm": 0.185228630900383, | |
| "learning_rate": 1.8750602047161603e-06, | |
| "loss": 0.29401373863220215, | |
| "memory(GiB)": 133.17, | |
| "step": 2080, | |
| "token_acc": 0.8979298187696017, | |
| "train_speed(iter/s)": 0.067961 | |
| }, | |
| { | |
| "epoch": 2.4414519906323187, | |
| "grad_norm": 0.19245509803295135, | |
| "learning_rate": 1.8375920493684264e-06, | |
| "loss": 0.3006903171539307, | |
| "memory(GiB)": 133.17, | |
| "step": 2085, | |
| "token_acc": 0.8867607400439009, | |
| "train_speed(iter/s)": 0.067964 | |
| }, | |
| { | |
| "epoch": 2.4473067915690865, | |
| "grad_norm": 0.19419154524803162, | |
| "learning_rate": 1.8004641238006815e-06, | |
| "loss": 0.29811155796051025, | |
| "memory(GiB)": 133.17, | |
| "step": 2090, | |
| "token_acc": 0.8943640794642241, | |
| "train_speed(iter/s)": 0.067962 | |
| }, | |
| { | |
| "epoch": 2.4531615925058547, | |
| "grad_norm": 0.1823989599943161, | |
| "learning_rate": 1.7636779755986443e-06, | |
| "loss": 0.3039386749267578, | |
| "memory(GiB)": 133.17, | |
| "step": 2095, | |
| "token_acc": 0.8863327040435527, | |
| "train_speed(iter/s)": 0.067959 | |
| }, | |
| { | |
| "epoch": 2.459016393442623, | |
| "grad_norm": 0.1855112761259079, | |
| "learning_rate": 1.7272351381018792e-06, | |
| "loss": 0.3009587287902832, | |
| "memory(GiB)": 133.17, | |
| "step": 2100, | |
| "token_acc": 0.8790456780659275, | |
| "train_speed(iter/s)": 0.067959 | |
| }, | |
| { | |
| "epoch": 2.4648711943793913, | |
| "grad_norm": 0.18876492977142334, | |
| "learning_rate": 1.6911371303399048e-06, | |
| "loss": 0.28830153942108155, | |
| "memory(GiB)": 133.17, | |
| "step": 2105, | |
| "token_acc": 0.8928979046201769, | |
| "train_speed(iter/s)": 0.067956 | |
| }, | |
| { | |
| "epoch": 2.470725995316159, | |
| "grad_norm": 0.19157980382442474, | |
| "learning_rate": 1.6553854569688632e-06, | |
| "loss": 0.30360212326049807, | |
| "memory(GiB)": 133.17, | |
| "step": 2110, | |
| "token_acc": 0.8886916557875393, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.4765807962529274, | |
| "grad_norm": 0.19697441160678864, | |
| "learning_rate": 1.619981608208796e-06, | |
| "loss": 0.30350236892700194, | |
| "memory(GiB)": 133.17, | |
| "step": 2115, | |
| "token_acc": 0.8755962030416897, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.4824355971896956, | |
| "grad_norm": 0.19516149163246155, | |
| "learning_rate": 1.584927059781548e-06, | |
| "loss": 0.3021031379699707, | |
| "memory(GiB)": 133.17, | |
| "step": 2120, | |
| "token_acc": 0.8850454875188026, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.4882903981264635, | |
| "grad_norm": 0.19685259461402893, | |
| "learning_rate": 1.5502232728492362e-06, | |
| "loss": 0.29403057098388674, | |
| "memory(GiB)": 133.17, | |
| "step": 2125, | |
| "token_acc": 0.8935650598835121, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.4941451990632317, | |
| "grad_norm": 0.1944494992494583, | |
| "learning_rate": 1.5158716939533524e-06, | |
| "loss": 0.303509259223938, | |
| "memory(GiB)": 133.17, | |
| "step": 2130, | |
| "token_acc": 0.8899391835374175, | |
| "train_speed(iter/s)": 0.067955 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.18963733315467834, | |
| "learning_rate": 1.4818737549544725e-06, | |
| "loss": 0.3023875951766968, | |
| "memory(GiB)": 133.17, | |
| "step": 2135, | |
| "token_acc": 0.8820025957494603, | |
| "train_speed(iter/s)": 0.067956 | |
| }, | |
| { | |
| "epoch": 2.5058548009367683, | |
| "grad_norm": 0.2587365210056305, | |
| "learning_rate": 1.448230872972568e-06, | |
| "loss": 0.29965691566467284, | |
| "memory(GiB)": 133.17, | |
| "step": 2140, | |
| "token_acc": 0.895712561145832, | |
| "train_speed(iter/s)": 0.067953 | |
| }, | |
| { | |
| "epoch": 2.5117096018735365, | |
| "grad_norm": 0.1916307806968689, | |
| "learning_rate": 1.4149444503279297e-06, | |
| "loss": 0.3064573764801025, | |
| "memory(GiB)": 133.17, | |
| "step": 2145, | |
| "token_acc": 0.8827446402570668, | |
| "train_speed(iter/s)": 0.067953 | |
| }, | |
| { | |
| "epoch": 2.5175644028103044, | |
| "grad_norm": 0.18983621895313263, | |
| "learning_rate": 1.382015874482735e-06, | |
| "loss": 0.2994706630706787, | |
| "memory(GiB)": 133.17, | |
| "step": 2150, | |
| "token_acc": 0.889184252992907, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.5234192037470726, | |
| "grad_norm": 0.19152384996414185, | |
| "learning_rate": 1.3494465179831895e-06, | |
| "loss": 0.29698777198791504, | |
| "memory(GiB)": 133.17, | |
| "step": 2155, | |
| "token_acc": 0.8862135400891181, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.529274004683841, | |
| "grad_norm": 0.17604193091392517, | |
| "learning_rate": 1.3172377384023393e-06, | |
| "loss": 0.2926321029663086, | |
| "memory(GiB)": 133.17, | |
| "step": 2160, | |
| "token_acc": 0.891226010077476, | |
| "train_speed(iter/s)": 0.067956 | |
| }, | |
| { | |
| "epoch": 2.5351288056206087, | |
| "grad_norm": 0.18896515667438507, | |
| "learning_rate": 1.2853908782834722e-06, | |
| "loss": 0.29559669494628904, | |
| "memory(GiB)": 133.17, | |
| "step": 2165, | |
| "token_acc": 0.8984888499945305, | |
| "train_speed(iter/s)": 0.067953 | |
| }, | |
| { | |
| "epoch": 2.540983606557377, | |
| "grad_norm": 0.18624024093151093, | |
| "learning_rate": 1.2539072650841523e-06, | |
| "loss": 0.30248537063598635, | |
| "memory(GiB)": 133.17, | |
| "step": 2170, | |
| "token_acc": 0.8883391871864846, | |
| "train_speed(iter/s)": 0.067955 | |
| }, | |
| { | |
| "epoch": 2.5468384074941453, | |
| "grad_norm": 0.18639948964118958, | |
| "learning_rate": 1.2227882111209011e-06, | |
| "loss": 0.3061221599578857, | |
| "memory(GiB)": 133.17, | |
| "step": 2175, | |
| "token_acc": 0.8831800956700007, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.552693208430913, | |
| "grad_norm": 0.4918629229068756, | |
| "learning_rate": 1.1920350135144898e-06, | |
| "loss": 0.29971723556518554, | |
| "memory(GiB)": 133.17, | |
| "step": 2180, | |
| "token_acc": 0.894886042214037, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.5585480093676813, | |
| "grad_norm": 0.18684136867523193, | |
| "learning_rate": 1.1616489541358678e-06, | |
| "loss": 0.29734086990356445, | |
| "memory(GiB)": 133.17, | |
| "step": 2185, | |
| "token_acc": 0.8841362452439526, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.5644028103044496, | |
| "grad_norm": 0.18606062233448029, | |
| "learning_rate": 1.1316312995527424e-06, | |
| "loss": 0.3008298873901367, | |
| "memory(GiB)": 133.17, | |
| "step": 2190, | |
| "token_acc": 0.8897950269865535, | |
| "train_speed(iter/s)": 0.067955 | |
| }, | |
| { | |
| "epoch": 2.570257611241218, | |
| "grad_norm": 0.1816904991865158, | |
| "learning_rate": 1.1019833009767744e-06, | |
| "loss": 0.29885680675506593, | |
| "memory(GiB)": 133.17, | |
| "step": 2195, | |
| "token_acc": 0.8878981843540634, | |
| "train_speed(iter/s)": 0.067954 | |
| }, | |
| { | |
| "epoch": 2.576112412177986, | |
| "grad_norm": 0.19094757735729218, | |
| "learning_rate": 1.072706194211426e-06, | |
| "loss": 0.30043601989746094, | |
| "memory(GiB)": 133.17, | |
| "step": 2200, | |
| "token_acc": 0.8938432293837546, | |
| "train_speed(iter/s)": 0.067955 | |
| }, | |
| { | |
| "epoch": 2.581967213114754, | |
| "grad_norm": 0.1899169534444809, | |
| "learning_rate": 1.0438011996004581e-06, | |
| "loss": 0.2995189905166626, | |
| "memory(GiB)": 133.17, | |
| "step": 2205, | |
| "token_acc": 0.8880722202892788, | |
| "train_speed(iter/s)": 0.067955 | |
| }, | |
| { | |
| "epoch": 2.5878220140515222, | |
| "grad_norm": 0.19649627804756165, | |
| "learning_rate": 1.0152695219770558e-06, | |
| "loss": 0.2872016429901123, | |
| "memory(GiB)": 133.17, | |
| "step": 2210, | |
| "token_acc": 0.8980766878765166, | |
| "train_speed(iter/s)": 0.067953 | |
| }, | |
| { | |
| "epoch": 2.5936768149882905, | |
| "grad_norm": 0.19790223240852356, | |
| "learning_rate": 9.871123506136037e-07, | |
| "loss": 0.29386420249938966, | |
| "memory(GiB)": 133.17, | |
| "step": 2215, | |
| "token_acc": 0.9006561928197363, | |
| "train_speed(iter/s)": 0.067953 | |
| }, | |
| { | |
| "epoch": 2.5995316159250583, | |
| "grad_norm": 0.18190743029117584, | |
| "learning_rate": 9.593308591721274e-07, | |
| "loss": 0.2908626079559326, | |
| "memory(GiB)": 133.17, | |
| "step": 2220, | |
| "token_acc": 0.8927072444113778, | |
| "train_speed(iter/s)": 0.067953 | |
| }, | |
| { | |
| "epoch": 2.6053864168618266, | |
| "grad_norm": 0.1853610724210739, | |
| "learning_rate": 9.319262056553602e-07, | |
| "loss": 0.300918436050415, | |
| "memory(GiB)": 133.17, | |
| "step": 2225, | |
| "token_acc": 0.8953082310083849, | |
| "train_speed(iter/s)": 0.067951 | |
| }, | |
| { | |
| "epoch": 2.611241217798595, | |
| "grad_norm": 0.19064903259277344, | |
| "learning_rate": 9.048995323584764e-07, | |
| "loss": 0.3040909767150879, | |
| "memory(GiB)": 133.17, | |
| "step": 2230, | |
| "token_acc": 0.8990558015887316, | |
| "train_speed(iter/s)": 0.06795 | |
| }, | |
| { | |
| "epoch": 2.617096018735363, | |
| "grad_norm": 0.18238228559494019, | |
| "learning_rate": 8.78251965821485e-07, | |
| "loss": 0.2880122184753418, | |
| "memory(GiB)": 133.17, | |
| "step": 2235, | |
| "token_acc": 0.8914627457335544, | |
| "train_speed(iter/s)": 0.067949 | |
| }, | |
| { | |
| "epoch": 2.6229508196721314, | |
| "grad_norm": 0.18738383054733276, | |
| "learning_rate": 8.519846167822665e-07, | |
| "loss": 0.2943183422088623, | |
| "memory(GiB)": 133.17, | |
| "step": 2240, | |
| "token_acc": 0.9038425869666715, | |
| "train_speed(iter/s)": 0.067949 | |
| }, | |
| { | |
| "epoch": 2.628805620608899, | |
| "grad_norm": 0.1841094046831131, | |
| "learning_rate": 8.260985801302734e-07, | |
| "loss": 0.2850812911987305, | |
| "memory(GiB)": 133.17, | |
| "step": 2245, | |
| "token_acc": 0.8894028305143251, | |
| "train_speed(iter/s)": 0.067948 | |
| }, | |
| { | |
| "epoch": 2.6346604215456675, | |
| "grad_norm": 0.18611599504947662, | |
| "learning_rate": 8.005949348608977e-07, | |
| "loss": 0.2972105979919434, | |
| "memory(GiB)": 133.17, | |
| "step": 2250, | |
| "token_acc": 0.8871099881800386, | |
| "train_speed(iter/s)": 0.067948 | |
| }, | |
| { | |
| "epoch": 2.6405152224824358, | |
| "grad_norm": 0.1857517957687378, | |
| "learning_rate": 7.754747440304911e-07, | |
| "loss": 0.30115318298339844, | |
| "memory(GiB)": 133.17, | |
| "step": 2255, | |
| "token_acc": 0.8863566925844406, | |
| "train_speed(iter/s)": 0.067949 | |
| }, | |
| { | |
| "epoch": 2.6463700234192036, | |
| "grad_norm": 0.19214338064193726, | |
| "learning_rate": 7.507390547120541e-07, | |
| "loss": 0.29389874935150145, | |
| "memory(GiB)": 133.17, | |
| "step": 2260, | |
| "token_acc": 0.8897345572130235, | |
| "train_speed(iter/s)": 0.067951 | |
| }, | |
| { | |
| "epoch": 2.652224824355972, | |
| "grad_norm": 0.18817630410194397, | |
| "learning_rate": 7.263888979515954e-07, | |
| "loss": 0.3036650657653809, | |
| "memory(GiB)": 133.17, | |
| "step": 2265, | |
| "token_acc": 0.8849702240287362, | |
| "train_speed(iter/s)": 0.067952 | |
| }, | |
| { | |
| "epoch": 2.65807962529274, | |
| "grad_norm": 0.18061281740665436, | |
| "learning_rate": 7.024252887251548e-07, | |
| "loss": 0.29589831829071045, | |
| "memory(GiB)": 133.17, | |
| "step": 2270, | |
| "token_acc": 0.8896930575764528, | |
| "train_speed(iter/s)": 0.067951 | |
| }, | |
| { | |
| "epoch": 2.663934426229508, | |
| "grad_norm": 0.18534523248672485, | |
| "learning_rate": 6.788492258964896e-07, | |
| "loss": 0.29939701557159426, | |
| "memory(GiB)": 133.17, | |
| "step": 2275, | |
| "token_acc": 0.8869800488330657, | |
| "train_speed(iter/s)": 0.067951 | |
| }, | |
| { | |
| "epoch": 2.669789227166276, | |
| "grad_norm": 0.19118830561637878, | |
| "learning_rate": 6.556616921754489e-07, | |
| "loss": 0.29693875312805174, | |
| "memory(GiB)": 133.17, | |
| "step": 2280, | |
| "token_acc": 0.8911168593654094, | |
| "train_speed(iter/s)": 0.067951 | |
| }, | |
| { | |
| "epoch": 2.6756440281030445, | |
| "grad_norm": 0.18963268399238586, | |
| "learning_rate": 6.328636540770028e-07, | |
| "loss": 0.3002347707748413, | |
| "memory(GiB)": 133.17, | |
| "step": 2285, | |
| "token_acc": 0.889527246797438, | |
| "train_speed(iter/s)": 0.06795 | |
| }, | |
| { | |
| "epoch": 2.6814988290398127, | |
| "grad_norm": 0.18357062339782715, | |
| "learning_rate": 6.10456061880963e-07, | |
| "loss": 0.304398250579834, | |
| "memory(GiB)": 133.17, | |
| "step": 2290, | |
| "token_acc": 0.8937302240569359, | |
| "train_speed(iter/s)": 0.067949 | |
| }, | |
| { | |
| "epoch": 2.687353629976581, | |
| "grad_norm": 0.18504291772842407, | |
| "learning_rate": 5.884398495923727e-07, | |
| "loss": 0.29355425834655763, | |
| "memory(GiB)": 133.17, | |
| "step": 2295, | |
| "token_acc": 0.8842415418528523, | |
| "train_speed(iter/s)": 0.06795 | |
| }, | |
| { | |
| "epoch": 2.693208430913349, | |
| "grad_norm": 0.1993853747844696, | |
| "learning_rate": 5.668159349025649e-07, | |
| "loss": 0.3113893985748291, | |
| "memory(GiB)": 133.17, | |
| "step": 2300, | |
| "token_acc": 0.8775886656746031, | |
| "train_speed(iter/s)": 0.067949 | |
| }, | |
| { | |
| "epoch": 2.699063231850117, | |
| "grad_norm": 0.18858520686626434, | |
| "learning_rate": 5.455852191509214e-07, | |
| "loss": 0.3054765224456787, | |
| "memory(GiB)": 133.17, | |
| "step": 2305, | |
| "token_acc": 0.8816947533601692, | |
| "train_speed(iter/s)": 0.067949 | |
| }, | |
| { | |
| "epoch": 2.7049180327868854, | |
| "grad_norm": 0.19768975675106049, | |
| "learning_rate": 5.247485872873026e-07, | |
| "loss": 0.29274706840515136, | |
| "memory(GiB)": 133.17, | |
| "step": 2310, | |
| "token_acc": 0.8946102350213514, | |
| "train_speed(iter/s)": 0.06795 | |
| }, | |
| { | |
| "epoch": 2.710772833723653, | |
| "grad_norm": 0.17342238128185272, | |
| "learning_rate": 5.043069078351526e-07, | |
| "loss": 0.2879345893859863, | |
| "memory(GiB)": 133.17, | |
| "step": 2315, | |
| "token_acc": 0.8956176935229068, | |
| "train_speed(iter/s)": 0.067951 | |
| }, | |
| { | |
| "epoch": 2.7166276346604215, | |
| "grad_norm": 0.18019071221351624, | |
| "learning_rate": 4.842610328552999e-07, | |
| "loss": 0.29531962871551515, | |
| "memory(GiB)": 133.17, | |
| "step": 2320, | |
| "token_acc": 0.8930268304142333, | |
| "train_speed(iter/s)": 0.06795 | |
| }, | |
| { | |
| "epoch": 2.7224824355971897, | |
| "grad_norm": 0.1835058629512787, | |
| "learning_rate": 4.6461179791044806e-07, | |
| "loss": 0.2953210353851318, | |
| "memory(GiB)": 133.21, | |
| "step": 2325, | |
| "token_acc": 0.8869091207514772, | |
| "train_speed(iter/s)": 0.067952 | |
| }, | |
| { | |
| "epoch": 2.728337236533958, | |
| "grad_norm": 0.1827324628829956, | |
| "learning_rate": 4.453600220303378e-07, | |
| "loss": 0.2804730415344238, | |
| "memory(GiB)": 133.21, | |
| "step": 2330, | |
| "token_acc": 0.8874964005358507, | |
| "train_speed(iter/s)": 0.067953 | |
| }, | |
| { | |
| "epoch": 2.7341920374707263, | |
| "grad_norm": 0.18949875235557556, | |
| "learning_rate": 4.2650650767761535e-07, | |
| "loss": 0.2842918872833252, | |
| "memory(GiB)": 133.21, | |
| "step": 2335, | |
| "token_acc": 0.8981613154267605, | |
| "train_speed(iter/s)": 0.067952 | |
| }, | |
| { | |
| "epoch": 2.740046838407494, | |
| "grad_norm": 0.2092583179473877, | |
| "learning_rate": 4.0805204071437953e-07, | |
| "loss": 0.3071431636810303, | |
| "memory(GiB)": 133.21, | |
| "step": 2340, | |
| "token_acc": 0.886714704322126, | |
| "train_speed(iter/s)": 0.06795 | |
| }, | |
| { | |
| "epoch": 2.7459016393442623, | |
| "grad_norm": 0.18031486868858337, | |
| "learning_rate": 3.899973903694243e-07, | |
| "loss": 0.30032360553741455, | |
| "memory(GiB)": 133.21, | |
| "step": 2345, | |
| "token_acc": 0.8907299680407984, | |
| "train_speed(iter/s)": 0.06795 | |
| }, | |
| { | |
| "epoch": 2.7517564402810306, | |
| "grad_norm": 0.18996600806713104, | |
| "learning_rate": 3.72343309206179e-07, | |
| "loss": 0.2920222759246826, | |
| "memory(GiB)": 133.21, | |
| "step": 2350, | |
| "token_acc": 0.8913125942460162, | |
| "train_speed(iter/s)": 0.067948 | |
| }, | |
| { | |
| "epoch": 2.7576112412177984, | |
| "grad_norm": 0.18651233613491058, | |
| "learning_rate": 3.55090533091339e-07, | |
| "loss": 0.2933474063873291, | |
| "memory(GiB)": 133.21, | |
| "step": 2355, | |
| "token_acc": 0.9025793311463176, | |
| "train_speed(iter/s)": 0.067946 | |
| }, | |
| { | |
| "epoch": 2.7634660421545667, | |
| "grad_norm": 0.17747479677200317, | |
| "learning_rate": 3.382397811641858e-07, | |
| "loss": 0.2873265266418457, | |
| "memory(GiB)": 133.21, | |
| "step": 2360, | |
| "token_acc": 0.8948029740479362, | |
| "train_speed(iter/s)": 0.067947 | |
| }, | |
| { | |
| "epoch": 2.769320843091335, | |
| "grad_norm": 0.17965154349803925, | |
| "learning_rate": 3.217917558066241e-07, | |
| "loss": 0.2922650337219238, | |
| "memory(GiB)": 133.21, | |
| "step": 2365, | |
| "token_acc": 0.886650070990299, | |
| "train_speed(iter/s)": 0.067947 | |
| }, | |
| { | |
| "epoch": 2.775175644028103, | |
| "grad_norm": 0.17902247607707977, | |
| "learning_rate": 3.057471426138958e-07, | |
| "loss": 0.3062438488006592, | |
| "memory(GiB)": 133.21, | |
| "step": 2370, | |
| "token_acc": 0.8800339720197485, | |
| "train_speed(iter/s)": 0.067946 | |
| }, | |
| { | |
| "epoch": 2.781030444964871, | |
| "grad_norm": 0.17590953409671783, | |
| "learning_rate": 2.901066103660033e-07, | |
| "loss": 0.29376084804534913, | |
| "memory(GiB)": 133.21, | |
| "step": 2375, | |
| "token_acc": 0.8933526766467255, | |
| "train_speed(iter/s)": 0.067946 | |
| }, | |
| { | |
| "epoch": 2.7868852459016393, | |
| "grad_norm": 0.19265642762184143, | |
| "learning_rate": 2.7487081099983435e-07, | |
| "loss": 0.3061210155487061, | |
| "memory(GiB)": 133.21, | |
| "step": 2380, | |
| "token_acc": 0.8897680154530525, | |
| "train_speed(iter/s)": 0.067947 | |
| }, | |
| { | |
| "epoch": 2.7927400468384076, | |
| "grad_norm": 0.18283043801784515, | |
| "learning_rate": 2.6004037958199167e-07, | |
| "loss": 0.2898393154144287, | |
| "memory(GiB)": 133.21, | |
| "step": 2385, | |
| "token_acc": 0.9019573328471696, | |
| "train_speed(iter/s)": 0.067947 | |
| }, | |
| { | |
| "epoch": 2.798594847775176, | |
| "grad_norm": 0.18570415675640106, | |
| "learning_rate": 2.4561593428231165e-07, | |
| "loss": 0.29611454010009763, | |
| "memory(GiB)": 133.21, | |
| "step": 2390, | |
| "token_acc": 0.9104894052586534, | |
| "train_speed(iter/s)": 0.067945 | |
| }, | |
| { | |
| "epoch": 2.8044496487119437, | |
| "grad_norm": 0.18174812197685242, | |
| "learning_rate": 2.3159807634811182e-07, | |
| "loss": 0.28598248958587646, | |
| "memory(GiB)": 133.21, | |
| "step": 2395, | |
| "token_acc": 0.8965343061596744, | |
| "train_speed(iter/s)": 0.067945 | |
| }, | |
| { | |
| "epoch": 2.810304449648712, | |
| "grad_norm": 0.18527300655841827, | |
| "learning_rate": 2.1798739007911517e-07, | |
| "loss": 0.3005537986755371, | |
| "memory(GiB)": 133.21, | |
| "step": 2400, | |
| "token_acc": 0.887049760238975, | |
| "train_speed(iter/s)": 0.067943 | |
| }, | |
| { | |
| "epoch": 2.8161592505854802, | |
| "grad_norm": 0.17769944667816162, | |
| "learning_rate": 2.0478444280310206e-07, | |
| "loss": 0.2945347785949707, | |
| "memory(GiB)": 133.21, | |
| "step": 2405, | |
| "token_acc": 0.8905158466381549, | |
| "train_speed(iter/s)": 0.067944 | |
| }, | |
| { | |
| "epoch": 2.822014051522248, | |
| "grad_norm": 0.22059805691242218, | |
| "learning_rate": 1.919897848522656e-07, | |
| "loss": 0.2934718132019043, | |
| "memory(GiB)": 133.21, | |
| "step": 2410, | |
| "token_acc": 0.8902092807074844, | |
| "train_speed(iter/s)": 0.067943 | |
| }, | |
| { | |
| "epoch": 2.8278688524590163, | |
| "grad_norm": 0.18694834411144257, | |
| "learning_rate": 1.796039495402646e-07, | |
| "loss": 0.2984294414520264, | |
| "memory(GiB)": 133.21, | |
| "step": 2415, | |
| "token_acc": 0.8962242022599117, | |
| "train_speed(iter/s)": 0.067942 | |
| }, | |
| { | |
| "epoch": 2.8337236533957846, | |
| "grad_norm": 0.18271034955978394, | |
| "learning_rate": 1.6762745313999795e-07, | |
| "loss": 0.3036228895187378, | |
| "memory(GiB)": 133.21, | |
| "step": 2420, | |
| "token_acc": 0.876532044285046, | |
| "train_speed(iter/s)": 0.06794 | |
| }, | |
| { | |
| "epoch": 2.839578454332553, | |
| "grad_norm": 0.17917729914188385, | |
| "learning_rate": 1.5606079486208846e-07, | |
| "loss": 0.29344632625579836, | |
| "memory(GiB)": 133.21, | |
| "step": 2425, | |
| "token_acc": 0.8902957800547429, | |
| "train_speed(iter/s)": 0.06794 | |
| }, | |
| { | |
| "epoch": 2.845433255269321, | |
| "grad_norm": 0.1874186247587204, | |
| "learning_rate": 1.449044568340663e-07, | |
| "loss": 0.3013723373413086, | |
| "memory(GiB)": 133.21, | |
| "step": 2430, | |
| "token_acc": 0.8944912877684091, | |
| "train_speed(iter/s)": 0.06794 | |
| }, | |
| { | |
| "epoch": 2.851288056206089, | |
| "grad_norm": 0.18850503861904144, | |
| "learning_rate": 1.3415890408027932e-07, | |
| "loss": 0.29042725563049315, | |
| "memory(GiB)": 133.21, | |
| "step": 2435, | |
| "token_acc": 0.8900198911125016, | |
| "train_speed(iter/s)": 0.067939 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.1918351948261261, | |
| "learning_rate": 1.2382458450250657e-07, | |
| "loss": 0.30360941886901854, | |
| "memory(GiB)": 133.21, | |
| "step": 2440, | |
| "token_acc": 0.8830958974326918, | |
| "train_speed(iter/s)": 0.06794 | |
| }, | |
| { | |
| "epoch": 2.8629976580796255, | |
| "grad_norm": 0.17968802154064178, | |
| "learning_rate": 1.1390192886129304e-07, | |
| "loss": 0.29129633903503416, | |
| "memory(GiB)": 133.21, | |
| "step": 2445, | |
| "token_acc": 0.9028775730901043, | |
| "train_speed(iter/s)": 0.06794 | |
| }, | |
| { | |
| "epoch": 2.8688524590163933, | |
| "grad_norm": 0.19108013808727264, | |
| "learning_rate": 1.0439135075798634e-07, | |
| "loss": 0.30793027877807616, | |
| "memory(GiB)": 133.21, | |
| "step": 2450, | |
| "token_acc": 0.8880401770150538, | |
| "train_speed(iter/s)": 0.067942 | |
| }, | |
| { | |
| "epoch": 2.8747072599531616, | |
| "grad_norm": 0.18335837125778198, | |
| "learning_rate": 9.529324661750494e-08, | |
| "loss": 0.301357364654541, | |
| "memory(GiB)": 133.21, | |
| "step": 2455, | |
| "token_acc": 0.8833441771706613, | |
| "train_speed(iter/s)": 0.067941 | |
| }, | |
| { | |
| "epoch": 2.88056206088993, | |
| "grad_norm": 0.18961112201213837, | |
| "learning_rate": 8.6607995671808e-08, | |
| "loss": 0.29690849781036377, | |
| "memory(GiB)": 133.21, | |
| "step": 2460, | |
| "token_acc": 0.8882351107925328, | |
| "train_speed(iter/s)": 0.067941 | |
| }, | |
| { | |
| "epoch": 2.8864168618266977, | |
| "grad_norm": 0.22844481468200684, | |
| "learning_rate": 7.833595994409248e-08, | |
| "loss": 0.2876168727874756, | |
| "memory(GiB)": 133.21, | |
| "step": 2465, | |
| "token_acc": 0.888989677822959, | |
| "train_speed(iter/s)": 0.06794 | |
| }, | |
| { | |
| "epoch": 2.892271662763466, | |
| "grad_norm": 0.19000564515590668, | |
| "learning_rate": 7.047748423370193e-08, | |
| "loss": 0.3021047353744507, | |
| "memory(GiB)": 133.21, | |
| "step": 2470, | |
| "token_acc": 0.8849937208945455, | |
| "train_speed(iter/s)": 0.067942 | |
| }, | |
| { | |
| "epoch": 2.898126463700234, | |
| "grad_norm": 0.18740171194076538, | |
| "learning_rate": 6.303289610175233e-08, | |
| "loss": 0.29048540592193606, | |
| "memory(GiB)": 133.21, | |
| "step": 2475, | |
| "token_acc": 0.8921316614420063, | |
| "train_speed(iter/s)": 0.067942 | |
| }, | |
| { | |
| "epoch": 2.9039812646370025, | |
| "grad_norm": 0.18983155488967896, | |
| "learning_rate": 5.6002505857480906e-08, | |
| "loss": 0.2961090326309204, | |
| "memory(GiB)": 133.21, | |
| "step": 2480, | |
| "token_acc": 0.8925172239013309, | |
| "train_speed(iter/s)": 0.067943 | |
| }, | |
| { | |
| "epoch": 2.9098360655737707, | |
| "grad_norm": 0.1831265538930893, | |
| "learning_rate": 4.938660654530969e-08, | |
| "loss": 0.3080202579498291, | |
| "memory(GiB)": 133.21, | |
| "step": 2485, | |
| "token_acc": 0.8858930624281501, | |
| "train_speed(iter/s)": 0.067943 | |
| }, | |
| { | |
| "epoch": 2.9156908665105385, | |
| "grad_norm": 0.19748179614543915, | |
| "learning_rate": 4.318547393263317e-08, | |
| "loss": 0.30983719825744627, | |
| "memory(GiB)": 133.21, | |
| "step": 2490, | |
| "token_acc": 0.8897056301087475, | |
| "train_speed(iter/s)": 0.067942 | |
| }, | |
| { | |
| "epoch": 2.921545667447307, | |
| "grad_norm": 0.18569178879261017, | |
| "learning_rate": 3.739936649832188e-08, | |
| "loss": 0.29312853813171386, | |
| "memory(GiB)": 133.21, | |
| "step": 2495, | |
| "token_acc": 0.8924126241525105, | |
| "train_speed(iter/s)": 0.067945 | |
| }, | |
| { | |
| "epoch": 2.927400468384075, | |
| "grad_norm": 0.1950037181377411, | |
| "learning_rate": 3.2028525421946563e-08, | |
| "loss": 0.2936956167221069, | |
| "memory(GiB)": 133.21, | |
| "step": 2500, | |
| "token_acc": 0.9020274516704794, | |
| "train_speed(iter/s)": 0.067945 | |
| }, | |
| { | |
| "epoch": 2.933255269320843, | |
| "grad_norm": 0.18167735636234283, | |
| "learning_rate": 2.70731745737296e-08, | |
| "loss": 0.2973939418792725, | |
| "memory(GiB)": 133.21, | |
| "step": 2505, | |
| "token_acc": 0.8934657981473672, | |
| "train_speed(iter/s)": 0.067946 | |
| }, | |
| { | |
| "epoch": 2.939110070257611, | |
| "grad_norm": 0.18503886461257935, | |
| "learning_rate": 2.2533520505211294e-08, | |
| "loss": 0.29192218780517576, | |
| "memory(GiB)": 133.21, | |
| "step": 2510, | |
| "token_acc": 0.8951111388611389, | |
| "train_speed(iter/s)": 0.067947 | |
| }, | |
| { | |
| "epoch": 2.9449648711943794, | |
| "grad_norm": 0.17936980724334717, | |
| "learning_rate": 1.8409752440639027e-08, | |
| "loss": 0.28421769142150877, | |
| "memory(GiB)": 133.21, | |
| "step": 2515, | |
| "token_acc": 0.8924425595173032, | |
| "train_speed(iter/s)": 0.067947 | |
| }, | |
| { | |
| "epoch": 2.9508196721311473, | |
| "grad_norm": 0.18841403722763062, | |
| "learning_rate": 1.470204226908134e-08, | |
| "loss": 0.30081515312194823, | |
| "memory(GiB)": 133.21, | |
| "step": 2520, | |
| "token_acc": 0.894779086363537, | |
| "train_speed(iter/s)": 0.067944 | |
| }, | |
| { | |
| "epoch": 2.9566744730679155, | |
| "grad_norm": 0.19020894169807434, | |
| "learning_rate": 1.1410544537263645e-08, | |
| "loss": 0.3081362247467041, | |
| "memory(GiB)": 133.21, | |
| "step": 2525, | |
| "token_acc": 0.8934638595786859, | |
| "train_speed(iter/s)": 0.067943 | |
| }, | |
| { | |
| "epoch": 2.962529274004684, | |
| "grad_norm": 0.24749897420406342, | |
| "learning_rate": 8.535396443124511e-09, | |
| "loss": 0.2878671884536743, | |
| "memory(GiB)": 133.21, | |
| "step": 2530, | |
| "token_acc": 0.8913681995528473, | |
| "train_speed(iter/s)": 0.067944 | |
| }, | |
| { | |
| "epoch": 2.968384074941452, | |
| "grad_norm": 0.17989581823349, | |
| "learning_rate": 6.076717830098e-09, | |
| "loss": 0.2899226903915405, | |
| "memory(GiB)": 133.21, | |
| "step": 2535, | |
| "token_acc": 0.8996739041991876, | |
| "train_speed(iter/s)": 0.067943 | |
| }, | |
| { | |
| "epoch": 2.9742388758782203, | |
| "grad_norm": 0.18506699800491333, | |
| "learning_rate": 4.034611182121007e-09, | |
| "loss": 0.2908132553100586, | |
| "memory(GiB)": 133.21, | |
| "step": 2540, | |
| "token_acc": 0.8988520352276212, | |
| "train_speed(iter/s)": 0.067941 | |
| }, | |
| { | |
| "epoch": 2.980093676814988, | |
| "grad_norm": 0.18510298430919647, | |
| "learning_rate": 2.40916161935445e-09, | |
| "loss": 0.29580187797546387, | |
| "memory(GiB)": 133.21, | |
| "step": 2545, | |
| "token_acc": 0.8895340031302065, | |
| "train_speed(iter/s)": 0.067941 | |
| }, | |
| { | |
| "epoch": 2.9859484777517564, | |
| "grad_norm": 0.18303260207176208, | |
| "learning_rate": 1.2004368946427758e-09, | |
| "loss": 0.2922369956970215, | |
| "memory(GiB)": 133.21, | |
| "step": 2550, | |
| "token_acc": 0.8920757330143692, | |
| "train_speed(iter/s)": 0.067941 | |
| }, | |
| { | |
| "epoch": 2.9918032786885247, | |
| "grad_norm": 0.1823214441537857, | |
| "learning_rate": 4.084873906851083e-10, | |
| "loss": 0.29749574661254885, | |
| "memory(GiB)": 133.21, | |
| "step": 2555, | |
| "token_acc": 0.8995572920769461, | |
| "train_speed(iter/s)": 0.067939 | |
| }, | |
| { | |
| "epoch": 2.9976580796252925, | |
| "grad_norm": 0.17787551879882812, | |
| "learning_rate": 3.334611793692766e-11, | |
| "loss": 0.29738173484802244, | |
| "memory(GiB)": 133.21, | |
| "step": 2560, | |
| "token_acc": 0.903360959533883, | |
| "train_speed(iter/s)": 0.067939 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2562, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1.0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2476392970944512.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |