| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 2562, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00117096018735363, | |
| "grad_norm": 4.268844127655029, | |
| "learning_rate": 1.5503875968992249e-07, | |
| "loss": 0.6057141423225403, | |
| "memory(GiB)": 127.52, | |
| "step": 1, | |
| "token_acc": 0.8403535163595337, | |
| "train_speed(iter/s)": 0.023362 | |
| }, | |
| { | |
| "epoch": 0.00585480093676815, | |
| "grad_norm": 4.00634765625, | |
| "learning_rate": 7.751937984496125e-07, | |
| "loss": 0.5714304447174072, | |
| "memory(GiB)": 127.52, | |
| "step": 5, | |
| "token_acc": 0.8414377559265664, | |
| "train_speed(iter/s)": 0.058165 | |
| }, | |
| { | |
| "epoch": 0.0117096018735363, | |
| "grad_norm": 3.714452028274536, | |
| "learning_rate": 1.550387596899225e-06, | |
| "loss": 0.5679570198059082, | |
| "memory(GiB)": 127.52, | |
| "step": 10, | |
| "token_acc": 0.8314816958517272, | |
| "train_speed(iter/s)": 0.072408 | |
| }, | |
| { | |
| "epoch": 0.01756440281030445, | |
| "grad_norm": 2.5877742767333984, | |
| "learning_rate": 2.3255813953488376e-06, | |
| "loss": 0.5387242317199707, | |
| "memory(GiB)": 127.52, | |
| "step": 15, | |
| "token_acc": 0.8362197181678389, | |
| "train_speed(iter/s)": 0.080114 | |
| }, | |
| { | |
| "epoch": 0.0234192037470726, | |
| "grad_norm": 1.0245263576507568, | |
| "learning_rate": 3.10077519379845e-06, | |
| "loss": 0.4778164863586426, | |
| "memory(GiB)": 127.52, | |
| "step": 20, | |
| "token_acc": 0.8409289456094262, | |
| "train_speed(iter/s)": 0.083991 | |
| }, | |
| { | |
| "epoch": 0.02927400468384075, | |
| "grad_norm": 1.0654064416885376, | |
| "learning_rate": 3.875968992248063e-06, | |
| "loss": 0.45667400360107424, | |
| "memory(GiB)": 127.52, | |
| "step": 25, | |
| "token_acc": 0.8518281248542258, | |
| "train_speed(iter/s)": 0.086554 | |
| }, | |
| { | |
| "epoch": 0.0351288056206089, | |
| "grad_norm": 0.5691505670547485, | |
| "learning_rate": 4.651162790697675e-06, | |
| "loss": 0.44004316329956056, | |
| "memory(GiB)": 127.52, | |
| "step": 30, | |
| "token_acc": 0.8520554823322664, | |
| "train_speed(iter/s)": 0.088726 | |
| }, | |
| { | |
| "epoch": 0.040983606557377046, | |
| "grad_norm": 0.5251653790473938, | |
| "learning_rate": 5.4263565891472865e-06, | |
| "loss": 0.43890109062194826, | |
| "memory(GiB)": 127.52, | |
| "step": 35, | |
| "token_acc": 0.8498815333197345, | |
| "train_speed(iter/s)": 0.090273 | |
| }, | |
| { | |
| "epoch": 0.0468384074941452, | |
| "grad_norm": 0.4052143096923828, | |
| "learning_rate": 6.2015503875969e-06, | |
| "loss": 0.41214742660522463, | |
| "memory(GiB)": 127.52, | |
| "step": 40, | |
| "token_acc": 0.8529612170691973, | |
| "train_speed(iter/s)": 0.091488 | |
| }, | |
| { | |
| "epoch": 0.05269320843091335, | |
| "grad_norm": 0.3396666944026947, | |
| "learning_rate": 6.976744186046513e-06, | |
| "loss": 0.423629093170166, | |
| "memory(GiB)": 127.52, | |
| "step": 45, | |
| "token_acc": 0.845838888731289, | |
| "train_speed(iter/s)": 0.092386 | |
| }, | |
| { | |
| "epoch": 0.0585480093676815, | |
| "grad_norm": 0.3074694573879242, | |
| "learning_rate": 7.751937984496126e-06, | |
| "loss": 0.41414508819580076, | |
| "memory(GiB)": 127.52, | |
| "step": 50, | |
| "token_acc": 0.8609199657045593, | |
| "train_speed(iter/s)": 0.0932 | |
| }, | |
| { | |
| "epoch": 0.06440281030444965, | |
| "grad_norm": 0.31701743602752686, | |
| "learning_rate": 8.527131782945736e-06, | |
| "loss": 0.4058098793029785, | |
| "memory(GiB)": 127.52, | |
| "step": 55, | |
| "token_acc": 0.8598621225118498, | |
| "train_speed(iter/s)": 0.093922 | |
| }, | |
| { | |
| "epoch": 0.0702576112412178, | |
| "grad_norm": 0.29167020320892334, | |
| "learning_rate": 9.30232558139535e-06, | |
| "loss": 0.42685737609863283, | |
| "memory(GiB)": 127.52, | |
| "step": 60, | |
| "token_acc": 0.8583035383662712, | |
| "train_speed(iter/s)": 0.09448 | |
| }, | |
| { | |
| "epoch": 0.07611241217798595, | |
| "grad_norm": 0.2796083092689514, | |
| "learning_rate": 1.0077519379844963e-05, | |
| "loss": 0.4080663681030273, | |
| "memory(GiB)": 127.52, | |
| "step": 65, | |
| "token_acc": 0.86975264356343, | |
| "train_speed(iter/s)": 0.095012 | |
| }, | |
| { | |
| "epoch": 0.08196721311475409, | |
| "grad_norm": 0.32925185561180115, | |
| "learning_rate": 1.0852713178294573e-05, | |
| "loss": 0.4072235584259033, | |
| "memory(GiB)": 127.52, | |
| "step": 70, | |
| "token_acc": 0.8576062541566801, | |
| "train_speed(iter/s)": 0.095364 | |
| }, | |
| { | |
| "epoch": 0.08782201405152225, | |
| "grad_norm": 0.29692476987838745, | |
| "learning_rate": 1.1627906976744187e-05, | |
| "loss": 0.4062563419342041, | |
| "memory(GiB)": 127.52, | |
| "step": 75, | |
| "token_acc": 0.8451938495195714, | |
| "train_speed(iter/s)": 0.0958 | |
| }, | |
| { | |
| "epoch": 0.0936768149882904, | |
| "grad_norm": 0.32430365681648254, | |
| "learning_rate": 1.24031007751938e-05, | |
| "loss": 0.4052871227264404, | |
| "memory(GiB)": 127.52, | |
| "step": 80, | |
| "token_acc": 0.8536078219242759, | |
| "train_speed(iter/s)": 0.096071 | |
| }, | |
| { | |
| "epoch": 0.09953161592505855, | |
| "grad_norm": 0.2918962240219116, | |
| "learning_rate": 1.3178294573643412e-05, | |
| "loss": 0.39542815685272215, | |
| "memory(GiB)": 127.52, | |
| "step": 85, | |
| "token_acc": 0.8758206774505389, | |
| "train_speed(iter/s)": 0.096477 | |
| }, | |
| { | |
| "epoch": 0.1053864168618267, | |
| "grad_norm": 0.30198103189468384, | |
| "learning_rate": 1.3953488372093025e-05, | |
| "loss": 0.4015383243560791, | |
| "memory(GiB)": 127.52, | |
| "step": 90, | |
| "token_acc": 0.8578144099246164, | |
| "train_speed(iter/s)": 0.096833 | |
| }, | |
| { | |
| "epoch": 0.11124121779859485, | |
| "grad_norm": 0.32643797993659973, | |
| "learning_rate": 1.4728682170542636e-05, | |
| "loss": 0.401915454864502, | |
| "memory(GiB)": 127.52, | |
| "step": 95, | |
| "token_acc": 0.8631059302340187, | |
| "train_speed(iter/s)": 0.097122 | |
| }, | |
| { | |
| "epoch": 0.117096018735363, | |
| "grad_norm": 0.3097076416015625, | |
| "learning_rate": 1.550387596899225e-05, | |
| "loss": 0.4027417182922363, | |
| "memory(GiB)": 127.52, | |
| "step": 100, | |
| "token_acc": 0.8636492034198335, | |
| "train_speed(iter/s)": 0.097418 | |
| }, | |
| { | |
| "epoch": 0.12295081967213115, | |
| "grad_norm": 0.28134772181510925, | |
| "learning_rate": 1.6279069767441862e-05, | |
| "loss": 0.39868090152740476, | |
| "memory(GiB)": 127.52, | |
| "step": 105, | |
| "token_acc": 0.8655913809126278, | |
| "train_speed(iter/s)": 0.0977 | |
| }, | |
| { | |
| "epoch": 0.1288056206088993, | |
| "grad_norm": 0.2668236196041107, | |
| "learning_rate": 1.7054263565891473e-05, | |
| "loss": 0.38587536811828616, | |
| "memory(GiB)": 127.52, | |
| "step": 110, | |
| "token_acc": 0.864132983946116, | |
| "train_speed(iter/s)": 0.09794 | |
| }, | |
| { | |
| "epoch": 0.13466042154566746, | |
| "grad_norm": 0.3235706686973572, | |
| "learning_rate": 1.7829457364341087e-05, | |
| "loss": 0.40470218658447266, | |
| "memory(GiB)": 127.52, | |
| "step": 115, | |
| "token_acc": 0.8720630828529737, | |
| "train_speed(iter/s)": 0.098141 | |
| }, | |
| { | |
| "epoch": 0.1405152224824356, | |
| "grad_norm": 0.2895485460758209, | |
| "learning_rate": 1.86046511627907e-05, | |
| "loss": 0.39603259563446047, | |
| "memory(GiB)": 127.52, | |
| "step": 120, | |
| "token_acc": 0.8639678736880146, | |
| "train_speed(iter/s)": 0.09831 | |
| }, | |
| { | |
| "epoch": 0.14637002341920374, | |
| "grad_norm": 0.3098626434803009, | |
| "learning_rate": 1.937984496124031e-05, | |
| "loss": 0.4097726821899414, | |
| "memory(GiB)": 127.52, | |
| "step": 125, | |
| "token_acc": 0.8581558732162836, | |
| "train_speed(iter/s)": 0.098474 | |
| }, | |
| { | |
| "epoch": 0.1522248243559719, | |
| "grad_norm": 0.35938969254493713, | |
| "learning_rate": 1.9999991663467044e-05, | |
| "loss": 0.4081538200378418, | |
| "memory(GiB)": 127.52, | |
| "step": 130, | |
| "token_acc": 0.8630013632327376, | |
| "train_speed(iter/s)": 0.098587 | |
| }, | |
| { | |
| "epoch": 0.15807962529274006, | |
| "grad_norm": 0.3397412896156311, | |
| "learning_rate": 1.9999699886272926e-05, | |
| "loss": 0.40991506576538084, | |
| "memory(GiB)": 127.52, | |
| "step": 135, | |
| "token_acc": 0.8502879675585575, | |
| "train_speed(iter/s)": 0.098664 | |
| }, | |
| { | |
| "epoch": 0.16393442622950818, | |
| "grad_norm": 0.32449835538864136, | |
| "learning_rate": 1.9998991296330317e-05, | |
| "loss": 0.40630359649658204, | |
| "memory(GiB)": 127.52, | |
| "step": 140, | |
| "token_acc": 0.8630894085796805, | |
| "train_speed(iter/s)": 0.098721 | |
| }, | |
| { | |
| "epoch": 0.16978922716627634, | |
| "grad_norm": 0.32687216997146606, | |
| "learning_rate": 1.9997865923175027e-05, | |
| "loss": 0.396761417388916, | |
| "memory(GiB)": 127.52, | |
| "step": 145, | |
| "token_acc": 0.8691767868585987, | |
| "train_speed(iter/s)": 0.09874 | |
| }, | |
| { | |
| "epoch": 0.1756440281030445, | |
| "grad_norm": 0.32365313172340393, | |
| "learning_rate": 1.999632381371545e-05, | |
| "loss": 0.40283679962158203, | |
| "memory(GiB)": 127.52, | |
| "step": 150, | |
| "token_acc": 0.8533993606842608, | |
| "train_speed(iter/s)": 0.09881 | |
| }, | |
| { | |
| "epoch": 0.18149882903981265, | |
| "grad_norm": 0.3086594343185425, | |
| "learning_rate": 1.999436503223061e-05, | |
| "loss": 0.4014937400817871, | |
| "memory(GiB)": 127.52, | |
| "step": 155, | |
| "token_acc": 0.8624249503342012, | |
| "train_speed(iter/s)": 0.098838 | |
| }, | |
| { | |
| "epoch": 0.1873536299765808, | |
| "grad_norm": 0.32935866713523865, | |
| "learning_rate": 1.9991989660367463e-05, | |
| "loss": 0.4079470634460449, | |
| "memory(GiB)": 127.52, | |
| "step": 160, | |
| "token_acc": 0.8470391967320465, | |
| "train_speed(iter/s)": 0.098876 | |
| }, | |
| { | |
| "epoch": 0.19320843091334894, | |
| "grad_norm": 0.27776622772216797, | |
| "learning_rate": 1.998919779713751e-05, | |
| "loss": 0.4115422248840332, | |
| "memory(GiB)": 127.52, | |
| "step": 165, | |
| "token_acc": 0.8531520964716057, | |
| "train_speed(iter/s)": 0.098931 | |
| }, | |
| { | |
| "epoch": 0.1990632318501171, | |
| "grad_norm": 0.28459489345550537, | |
| "learning_rate": 1.998598955891266e-05, | |
| "loss": 0.4005699634552002, | |
| "memory(GiB)": 127.52, | |
| "step": 170, | |
| "token_acc": 0.867363933744935, | |
| "train_speed(iter/s)": 0.099005 | |
| }, | |
| { | |
| "epoch": 0.20491803278688525, | |
| "grad_norm": 0.3174498379230499, | |
| "learning_rate": 1.9982365079420382e-05, | |
| "loss": 0.38856048583984376, | |
| "memory(GiB)": 127.52, | |
| "step": 175, | |
| "token_acc": 0.8610733940638768, | |
| "train_speed(iter/s)": 0.099065 | |
| }, | |
| { | |
| "epoch": 0.2107728337236534, | |
| "grad_norm": 0.30468112230300903, | |
| "learning_rate": 1.9978324509738147e-05, | |
| "loss": 0.392287540435791, | |
| "memory(GiB)": 127.52, | |
| "step": 180, | |
| "token_acc": 0.8659642567171478, | |
| "train_speed(iter/s)": 0.099132 | |
| }, | |
| { | |
| "epoch": 0.21662763466042154, | |
| "grad_norm": 0.31203576922416687, | |
| "learning_rate": 1.9973868018287093e-05, | |
| "loss": 0.3912659168243408, | |
| "memory(GiB)": 127.52, | |
| "step": 185, | |
| "token_acc": 0.8592000200480526, | |
| "train_speed(iter/s)": 0.099222 | |
| }, | |
| { | |
| "epoch": 0.2224824355971897, | |
| "grad_norm": 0.2872975766658783, | |
| "learning_rate": 1.9968995790825048e-05, | |
| "loss": 0.3968376159667969, | |
| "memory(GiB)": 127.52, | |
| "step": 190, | |
| "token_acc": 0.8492010693857249, | |
| "train_speed(iter/s)": 0.099228 | |
| }, | |
| { | |
| "epoch": 0.22833723653395785, | |
| "grad_norm": 0.3107975721359253, | |
| "learning_rate": 1.9963708030438754e-05, | |
| "loss": 0.39564805030822753, | |
| "memory(GiB)": 127.52, | |
| "step": 195, | |
| "token_acc": 0.8623048224402377, | |
| "train_speed(iter/s)": 0.099285 | |
| }, | |
| { | |
| "epoch": 0.234192037470726, | |
| "grad_norm": 0.33172452449798584, | |
| "learning_rate": 1.995800495753542e-05, | |
| "loss": 0.3955163240432739, | |
| "memory(GiB)": 127.52, | |
| "step": 200, | |
| "token_acc": 0.8543361827625122, | |
| "train_speed(iter/s)": 0.099318 | |
| }, | |
| { | |
| "epoch": 0.24004683840749413, | |
| "grad_norm": 0.4809193015098572, | |
| "learning_rate": 1.9951886809833537e-05, | |
| "loss": 0.40662593841552735, | |
| "memory(GiB)": 127.52, | |
| "step": 205, | |
| "token_acc": 0.8530674732086181, | |
| "train_speed(iter/s)": 0.099376 | |
| }, | |
| { | |
| "epoch": 0.2459016393442623, | |
| "grad_norm": 0.3544229567050934, | |
| "learning_rate": 1.9945353842352943e-05, | |
| "loss": 0.4021385669708252, | |
| "memory(GiB)": 127.52, | |
| "step": 210, | |
| "token_acc": 0.8561705450570045, | |
| "train_speed(iter/s)": 0.099425 | |
| }, | |
| { | |
| "epoch": 0.25175644028103045, | |
| "grad_norm": 0.336126446723938, | |
| "learning_rate": 1.9938406327404233e-05, | |
| "loss": 0.3979261159896851, | |
| "memory(GiB)": 127.52, | |
| "step": 215, | |
| "token_acc": 0.8645368893679286, | |
| "train_speed(iter/s)": 0.099503 | |
| }, | |
| { | |
| "epoch": 0.2576112412177986, | |
| "grad_norm": 0.33789604902267456, | |
| "learning_rate": 1.9931044554577373e-05, | |
| "loss": 0.3947408676147461, | |
| "memory(GiB)": 127.52, | |
| "step": 220, | |
| "token_acc": 0.8581383757515342, | |
| "train_speed(iter/s)": 0.099556 | |
| }, | |
| { | |
| "epoch": 0.26346604215456676, | |
| "grad_norm": 0.3256719708442688, | |
| "learning_rate": 1.992326883072965e-05, | |
| "loss": 0.39812633991241453, | |
| "memory(GiB)": 127.52, | |
| "step": 225, | |
| "token_acc": 0.8538002738372856, | |
| "train_speed(iter/s)": 0.099561 | |
| }, | |
| { | |
| "epoch": 0.2693208430913349, | |
| "grad_norm": 0.29769811034202576, | |
| "learning_rate": 1.991507947997287e-05, | |
| "loss": 0.40686187744140623, | |
| "memory(GiB)": 127.52, | |
| "step": 230, | |
| "token_acc": 0.8601537153116829, | |
| "train_speed(iter/s)": 0.099609 | |
| }, | |
| { | |
| "epoch": 0.275175644028103, | |
| "grad_norm": 0.30855706334114075, | |
| "learning_rate": 1.9906476843659866e-05, | |
| "loss": 0.40198640823364257, | |
| "memory(GiB)": 127.52, | |
| "step": 235, | |
| "token_acc": 0.8681018040834193, | |
| "train_speed(iter/s)": 0.099643 | |
| }, | |
| { | |
| "epoch": 0.2810304449648712, | |
| "grad_norm": 0.38956841826438904, | |
| "learning_rate": 1.989746128037024e-05, | |
| "loss": 0.3874382972717285, | |
| "memory(GiB)": 127.52, | |
| "step": 240, | |
| "token_acc": 0.8601923167422234, | |
| "train_speed(iter/s)": 0.099684 | |
| }, | |
| { | |
| "epoch": 0.28688524590163933, | |
| "grad_norm": 0.317061185836792, | |
| "learning_rate": 1.988803316589545e-05, | |
| "loss": 0.396057653427124, | |
| "memory(GiB)": 127.52, | |
| "step": 245, | |
| "token_acc": 0.8594824803587602, | |
| "train_speed(iter/s)": 0.099766 | |
| }, | |
| { | |
| "epoch": 0.2927400468384075, | |
| "grad_norm": 0.31615447998046875, | |
| "learning_rate": 1.987819289322311e-05, | |
| "loss": 0.39992465972900393, | |
| "memory(GiB)": 127.52, | |
| "step": 250, | |
| "token_acc": 0.858279346005983, | |
| "train_speed(iter/s)": 0.099854 | |
| }, | |
| { | |
| "epoch": 0.29859484777517564, | |
| "grad_norm": 0.32358142733573914, | |
| "learning_rate": 1.9867940872520646e-05, | |
| "loss": 0.40424213409423826, | |
| "memory(GiB)": 127.52, | |
| "step": 255, | |
| "token_acc": 0.8581384084126314, | |
| "train_speed(iter/s)": 0.09983 | |
| }, | |
| { | |
| "epoch": 0.3044496487119438, | |
| "grad_norm": 0.289928138256073, | |
| "learning_rate": 1.9857277531118173e-05, | |
| "loss": 0.3975801706314087, | |
| "memory(GiB)": 127.52, | |
| "step": 260, | |
| "token_acc": 0.8720583892069197, | |
| "train_speed(iter/s)": 0.099856 | |
| }, | |
| { | |
| "epoch": 0.31030444964871196, | |
| "grad_norm": 0.2990163266658783, | |
| "learning_rate": 1.9846203313490697e-05, | |
| "loss": 0.38855001926422117, | |
| "memory(GiB)": 127.52, | |
| "step": 265, | |
| "token_acc": 0.8751479791620219, | |
| "train_speed(iter/s)": 0.099904 | |
| }, | |
| { | |
| "epoch": 0.3161592505854801, | |
| "grad_norm": 0.3375948369503021, | |
| "learning_rate": 1.983471868123958e-05, | |
| "loss": 0.3869392156600952, | |
| "memory(GiB)": 127.52, | |
| "step": 270, | |
| "token_acc": 0.8583391727600954, | |
| "train_speed(iter/s)": 0.099986 | |
| }, | |
| { | |
| "epoch": 0.32201405152224827, | |
| "grad_norm": 0.31450051069259644, | |
| "learning_rate": 1.98228241130733e-05, | |
| "loss": 0.4127011775970459, | |
| "memory(GiB)": 127.52, | |
| "step": 275, | |
| "token_acc": 0.8624973560772896, | |
| "train_speed(iter/s)": 0.100024 | |
| }, | |
| { | |
| "epoch": 0.32786885245901637, | |
| "grad_norm": 0.30610159039497375, | |
| "learning_rate": 1.98105201047875e-05, | |
| "loss": 0.38500449657440183, | |
| "memory(GiB)": 127.52, | |
| "step": 280, | |
| "token_acc": 0.8676562826677817, | |
| "train_speed(iter/s)": 0.1001 | |
| }, | |
| { | |
| "epoch": 0.3337236533957845, | |
| "grad_norm": 0.29564493894577026, | |
| "learning_rate": 1.9797807169244326e-05, | |
| "loss": 0.39098482131958007, | |
| "memory(GiB)": 127.52, | |
| "step": 285, | |
| "token_acc": 0.8600835808177637, | |
| "train_speed(iter/s)": 0.100123 | |
| }, | |
| { | |
| "epoch": 0.3395784543325527, | |
| "grad_norm": 0.2966287136077881, | |
| "learning_rate": 1.9784685836351045e-05, | |
| "loss": 0.40611705780029295, | |
| "memory(GiB)": 127.52, | |
| "step": 290, | |
| "token_acc": 0.85560257646949, | |
| "train_speed(iter/s)": 0.100118 | |
| }, | |
| { | |
| "epoch": 0.34543325526932084, | |
| "grad_norm": 0.3238191604614258, | |
| "learning_rate": 1.9771156653037944e-05, | |
| "loss": 0.3969024419784546, | |
| "memory(GiB)": 127.52, | |
| "step": 295, | |
| "token_acc": 0.8581954258818798, | |
| "train_speed(iter/s)": 0.100158 | |
| }, | |
| { | |
| "epoch": 0.351288056206089, | |
| "grad_norm": 0.27766069769859314, | |
| "learning_rate": 1.975722018323556e-05, | |
| "loss": 0.38973977565765383, | |
| "memory(GiB)": 127.52, | |
| "step": 300, | |
| "token_acc": 0.8660634024604128, | |
| "train_speed(iter/s)": 0.100143 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.30145326256752014, | |
| "learning_rate": 1.974287700785116e-05, | |
| "loss": 0.3852071285247803, | |
| "memory(GiB)": 127.52, | |
| "step": 305, | |
| "token_acc": 0.8624855074734434, | |
| "train_speed(iter/s)": 0.100192 | |
| }, | |
| { | |
| "epoch": 0.3629976580796253, | |
| "grad_norm": 0.3129558563232422, | |
| "learning_rate": 1.9728127724744516e-05, | |
| "loss": 0.3764306306838989, | |
| "memory(GiB)": 127.52, | |
| "step": 310, | |
| "token_acc": 0.8621159494397087, | |
| "train_speed(iter/s)": 0.100188 | |
| }, | |
| { | |
| "epoch": 0.36885245901639346, | |
| "grad_norm": 0.28354689478874207, | |
| "learning_rate": 1.9712972948703006e-05, | |
| "loss": 0.4006787300109863, | |
| "memory(GiB)": 127.52, | |
| "step": 315, | |
| "token_acc": 0.8685145789802604, | |
| "train_speed(iter/s)": 0.100213 | |
| }, | |
| { | |
| "epoch": 0.3747072599531616, | |
| "grad_norm": 0.32204070687294006, | |
| "learning_rate": 1.9697413311415967e-05, | |
| "loss": 0.3947436332702637, | |
| "memory(GiB)": 127.52, | |
| "step": 320, | |
| "token_acc": 0.840712523808037, | |
| "train_speed(iter/s)": 0.100233 | |
| }, | |
| { | |
| "epoch": 0.3805620608899297, | |
| "grad_norm": 0.2838529944419861, | |
| "learning_rate": 1.9681449461448386e-05, | |
| "loss": 0.3909641981124878, | |
| "memory(GiB)": 127.52, | |
| "step": 325, | |
| "token_acc": 0.8644274332135604, | |
| "train_speed(iter/s)": 0.10024 | |
| }, | |
| { | |
| "epoch": 0.3864168618266979, | |
| "grad_norm": 0.2927788197994232, | |
| "learning_rate": 1.9665082064213856e-05, | |
| "loss": 0.3943678140640259, | |
| "memory(GiB)": 127.52, | |
| "step": 330, | |
| "token_acc": 0.8593159978638758, | |
| "train_speed(iter/s)": 0.100274 | |
| }, | |
| { | |
| "epoch": 0.39227166276346603, | |
| "grad_norm": 0.28758853673934937, | |
| "learning_rate": 1.9648311801946823e-05, | |
| "loss": 0.39302983283996584, | |
| "memory(GiB)": 127.52, | |
| "step": 335, | |
| "token_acc": 0.8576617952773522, | |
| "train_speed(iter/s)": 0.10031 | |
| }, | |
| { | |
| "epoch": 0.3981264637002342, | |
| "grad_norm": 0.32002732157707214, | |
| "learning_rate": 1.9631139373674188e-05, | |
| "loss": 0.3899127721786499, | |
| "memory(GiB)": 127.52, | |
| "step": 340, | |
| "token_acc": 0.859130068814327, | |
| "train_speed(iter/s)": 0.100326 | |
| }, | |
| { | |
| "epoch": 0.40398126463700235, | |
| "grad_norm": 0.29767319560050964, | |
| "learning_rate": 1.9613565495186126e-05, | |
| "loss": 0.38013973236083987, | |
| "memory(GiB)": 127.52, | |
| "step": 345, | |
| "token_acc": 0.8582271352459535, | |
| "train_speed(iter/s)": 0.100383 | |
| }, | |
| { | |
| "epoch": 0.4098360655737705, | |
| "grad_norm": 0.30334916710853577, | |
| "learning_rate": 1.9595590899006288e-05, | |
| "loss": 0.3990506649017334, | |
| "memory(GiB)": 127.52, | |
| "step": 350, | |
| "token_acc": 0.8646594498490017, | |
| "train_speed(iter/s)": 0.100383 | |
| }, | |
| { | |
| "epoch": 0.41569086651053866, | |
| "grad_norm": 0.27606984972953796, | |
| "learning_rate": 1.957721633436124e-05, | |
| "loss": 0.39636931419372556, | |
| "memory(GiB)": 127.52, | |
| "step": 355, | |
| "token_acc": 0.8610379971059329, | |
| "train_speed(iter/s)": 0.100374 | |
| }, | |
| { | |
| "epoch": 0.4215456674473068, | |
| "grad_norm": 0.2963041067123413, | |
| "learning_rate": 1.9558442567149244e-05, | |
| "loss": 0.3938555955886841, | |
| "memory(GiB)": 127.52, | |
| "step": 360, | |
| "token_acc": 0.8734277076877441, | |
| "train_speed(iter/s)": 0.100411 | |
| }, | |
| { | |
| "epoch": 0.4274004683840749, | |
| "grad_norm": 0.3044081926345825, | |
| "learning_rate": 1.953927037990834e-05, | |
| "loss": 0.4011641502380371, | |
| "memory(GiB)": 127.52, | |
| "step": 365, | |
| "token_acc": 0.8496909477706446, | |
| "train_speed(iter/s)": 0.100429 | |
| }, | |
| { | |
| "epoch": 0.4332552693208431, | |
| "grad_norm": 0.3151879906654358, | |
| "learning_rate": 1.9519700571783718e-05, | |
| "loss": 0.40146493911743164, | |
| "memory(GiB)": 127.52, | |
| "step": 370, | |
| "token_acc": 0.8655695668198701, | |
| "train_speed(iter/s)": 0.100402 | |
| }, | |
| { | |
| "epoch": 0.43911007025761123, | |
| "grad_norm": 0.30802202224731445, | |
| "learning_rate": 1.9499733958494405e-05, | |
| "loss": 0.3972816467285156, | |
| "memory(GiB)": 127.52, | |
| "step": 375, | |
| "token_acc": 0.8541561335505496, | |
| "train_speed(iter/s)": 0.100424 | |
| }, | |
| { | |
| "epoch": 0.4449648711943794, | |
| "grad_norm": 0.2896055579185486, | |
| "learning_rate": 1.947937137229928e-05, | |
| "loss": 0.39000208377838136, | |
| "memory(GiB)": 127.52, | |
| "step": 380, | |
| "token_acc": 0.8715701816495711, | |
| "train_speed(iter/s)": 0.10044 | |
| }, | |
| { | |
| "epoch": 0.45081967213114754, | |
| "grad_norm": 0.3016491234302521, | |
| "learning_rate": 1.9458613661962366e-05, | |
| "loss": 0.3910162687301636, | |
| "memory(GiB)": 127.52, | |
| "step": 385, | |
| "token_acc": 0.8739838931744026, | |
| "train_speed(iter/s)": 0.100469 | |
| }, | |
| { | |
| "epoch": 0.4566744730679157, | |
| "grad_norm": 0.29643046855926514, | |
| "learning_rate": 1.943746169271746e-05, | |
| "loss": 0.39229693412780764, | |
| "memory(GiB)": 127.52, | |
| "step": 390, | |
| "token_acc": 0.8722126097825781, | |
| "train_speed(iter/s)": 0.100449 | |
| }, | |
| { | |
| "epoch": 0.46252927400468385, | |
| "grad_norm": 0.27366167306900024, | |
| "learning_rate": 1.941591634623206e-05, | |
| "loss": 0.39676542282104493, | |
| "memory(GiB)": 127.52, | |
| "step": 395, | |
| "token_acc": 0.8644101402067695, | |
| "train_speed(iter/s)": 0.100471 | |
| }, | |
| { | |
| "epoch": 0.468384074941452, | |
| "grad_norm": 0.2772040069103241, | |
| "learning_rate": 1.9393978520570638e-05, | |
| "loss": 0.38228650093078614, | |
| "memory(GiB)": 127.52, | |
| "step": 400, | |
| "token_acc": 0.8660634050880627, | |
| "train_speed(iter/s)": 0.100525 | |
| }, | |
| { | |
| "epoch": 0.47423887587822017, | |
| "grad_norm": 0.27195385098457336, | |
| "learning_rate": 1.9371649130157166e-05, | |
| "loss": 0.3779789209365845, | |
| "memory(GiB)": 127.52, | |
| "step": 405, | |
| "token_acc": 0.8644070452060074, | |
| "train_speed(iter/s)": 0.100537 | |
| }, | |
| { | |
| "epoch": 0.48009367681498827, | |
| "grad_norm": 0.3120705783367157, | |
| "learning_rate": 1.9348929105737044e-05, | |
| "loss": 0.3843944549560547, | |
| "memory(GiB)": 127.52, | |
| "step": 410, | |
| "token_acc": 0.8640640315662635, | |
| "train_speed(iter/s)": 0.100541 | |
| }, | |
| { | |
| "epoch": 0.4859484777517564, | |
| "grad_norm": 0.30002740025520325, | |
| "learning_rate": 1.932581939433827e-05, | |
| "loss": 0.3987558841705322, | |
| "memory(GiB)": 127.52, | |
| "step": 415, | |
| "token_acc": 0.8650914968394279, | |
| "train_speed(iter/s)": 0.100546 | |
| }, | |
| { | |
| "epoch": 0.4918032786885246, | |
| "grad_norm": 0.2787948250770569, | |
| "learning_rate": 1.9302320959231997e-05, | |
| "loss": 0.3887160778045654, | |
| "memory(GiB)": 127.52, | |
| "step": 420, | |
| "token_acc": 0.8633874480548741, | |
| "train_speed(iter/s)": 0.100566 | |
| }, | |
| { | |
| "epoch": 0.49765807962529274, | |
| "grad_norm": 0.30231156945228577, | |
| "learning_rate": 1.927843477989234e-05, | |
| "loss": 0.38535680770874026, | |
| "memory(GiB)": 127.52, | |
| "step": 425, | |
| "token_acc": 0.8781958006354674, | |
| "train_speed(iter/s)": 0.100582 | |
| }, | |
| { | |
| "epoch": 0.5035128805620609, | |
| "grad_norm": 0.43067944049835205, | |
| "learning_rate": 1.9254161851955587e-05, | |
| "loss": 0.3992464065551758, | |
| "memory(GiB)": 127.52, | |
| "step": 430, | |
| "token_acc": 0.8681868917427511, | |
| "train_speed(iter/s)": 0.100614 | |
| }, | |
| { | |
| "epoch": 0.509367681498829, | |
| "grad_norm": 0.31797730922698975, | |
| "learning_rate": 1.9229503187178694e-05, | |
| "loss": 0.3914906978607178, | |
| "memory(GiB)": 127.52, | |
| "step": 435, | |
| "token_acc": 0.8623976908030916, | |
| "train_speed(iter/s)": 0.100629 | |
| }, | |
| { | |
| "epoch": 0.5152224824355972, | |
| "grad_norm": 0.3029649555683136, | |
| "learning_rate": 1.920445981339708e-05, | |
| "loss": 0.3909397840499878, | |
| "memory(GiB)": 127.52, | |
| "step": 440, | |
| "token_acc": 0.8603624171988666, | |
| "train_speed(iter/s)": 0.10065 | |
| }, | |
| { | |
| "epoch": 0.5210772833723654, | |
| "grad_norm": 0.30808401107788086, | |
| "learning_rate": 1.9179032774481822e-05, | |
| "loss": 0.38848447799682617, | |
| "memory(GiB)": 127.52, | |
| "step": 445, | |
| "token_acc": 0.8688334300638422, | |
| "train_speed(iter/s)": 0.10068 | |
| }, | |
| { | |
| "epoch": 0.5269320843091335, | |
| "grad_norm": 0.30352672934532166, | |
| "learning_rate": 1.9153223130296125e-05, | |
| "loss": 0.38553576469421386, | |
| "memory(GiB)": 127.52, | |
| "step": 450, | |
| "token_acc": 0.871061226654355, | |
| "train_speed(iter/s)": 0.100707 | |
| }, | |
| { | |
| "epoch": 0.5327868852459017, | |
| "grad_norm": 0.30111393332481384, | |
| "learning_rate": 1.9127031956651153e-05, | |
| "loss": 0.38896827697753905, | |
| "memory(GiB)": 127.52, | |
| "step": 455, | |
| "token_acc": 0.868666861524493, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 0.5386416861826698, | |
| "grad_norm": 0.3043946325778961, | |
| "learning_rate": 1.9100460345261175e-05, | |
| "loss": 0.4031389236450195, | |
| "memory(GiB)": 127.52, | |
| "step": 460, | |
| "token_acc": 0.8602805306930444, | |
| "train_speed(iter/s)": 0.10069 | |
| }, | |
| { | |
| "epoch": 0.544496487119438, | |
| "grad_norm": 0.3046748638153076, | |
| "learning_rate": 1.9073509403698062e-05, | |
| "loss": 0.3981820821762085, | |
| "memory(GiB)": 127.52, | |
| "step": 465, | |
| "token_acc": 0.8679260633787171, | |
| "train_speed(iter/s)": 0.100702 | |
| }, | |
| { | |
| "epoch": 0.550351288056206, | |
| "grad_norm": 0.31403180956840515, | |
| "learning_rate": 1.9046180255345142e-05, | |
| "loss": 0.3932758569717407, | |
| "memory(GiB)": 127.52, | |
| "step": 470, | |
| "token_acc": 0.8679127068807225, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 0.5562060889929742, | |
| "grad_norm": 0.29715070128440857, | |
| "learning_rate": 1.9018474039350342e-05, | |
| "loss": 0.3857383966445923, | |
| "memory(GiB)": 127.52, | |
| "step": 475, | |
| "token_acc": 0.8670612150699786, | |
| "train_speed(iter/s)": 0.100729 | |
| }, | |
| { | |
| "epoch": 0.5620608899297423, | |
| "grad_norm": 0.3304217755794525, | |
| "learning_rate": 1.899039191057872e-05, | |
| "loss": 0.3876671075820923, | |
| "memory(GiB)": 127.52, | |
| "step": 480, | |
| "token_acc": 0.8610883356974732, | |
| "train_speed(iter/s)": 0.100734 | |
| }, | |
| { | |
| "epoch": 0.5679156908665105, | |
| "grad_norm": 0.28700098395347595, | |
| "learning_rate": 1.8961935039564338e-05, | |
| "loss": 0.3859807252883911, | |
| "memory(GiB)": 127.52, | |
| "step": 485, | |
| "token_acc": 0.861040389753261, | |
| "train_speed(iter/s)": 0.100756 | |
| }, | |
| { | |
| "epoch": 0.5737704918032787, | |
| "grad_norm": 0.30889761447906494, | |
| "learning_rate": 1.8933104612461454e-05, | |
| "loss": 0.3886594772338867, | |
| "memory(GiB)": 127.52, | |
| "step": 490, | |
| "token_acc": 0.862199389425299, | |
| "train_speed(iter/s)": 0.100764 | |
| }, | |
| { | |
| "epoch": 0.5796252927400468, | |
| "grad_norm": 0.30002301931381226, | |
| "learning_rate": 1.8903901830995093e-05, | |
| "loss": 0.3925405740737915, | |
| "memory(GiB)": 127.52, | |
| "step": 495, | |
| "token_acc": 0.8591249033461787, | |
| "train_speed(iter/s)": 0.10076 | |
| }, | |
| { | |
| "epoch": 0.585480093676815, | |
| "grad_norm": 0.28031232953071594, | |
| "learning_rate": 1.8874327912410945e-05, | |
| "loss": 0.40421361923217775, | |
| "memory(GiB)": 127.52, | |
| "step": 500, | |
| "token_acc": 0.8617515420490447, | |
| "train_speed(iter/s)": 0.100788 | |
| }, | |
| { | |
| "epoch": 0.5913348946135831, | |
| "grad_norm": 0.27785587310791016, | |
| "learning_rate": 1.884438408942463e-05, | |
| "loss": 0.39117045402526857, | |
| "memory(GiB)": 127.52, | |
| "step": 505, | |
| "token_acc": 0.8509159982582465, | |
| "train_speed(iter/s)": 0.100792 | |
| }, | |
| { | |
| "epoch": 0.5971896955503513, | |
| "grad_norm": 0.26203179359436035, | |
| "learning_rate": 1.881407161017033e-05, | |
| "loss": 0.3850869655609131, | |
| "memory(GiB)": 127.52, | |
| "step": 510, | |
| "token_acc": 0.871426780341023, | |
| "train_speed(iter/s)": 0.100813 | |
| }, | |
| { | |
| "epoch": 0.6030444964871194, | |
| "grad_norm": 0.2775160074234009, | |
| "learning_rate": 1.8783391738148738e-05, | |
| "loss": 0.38030352592468264, | |
| "memory(GiB)": 127.52, | |
| "step": 515, | |
| "token_acc": 0.865779336694748, | |
| "train_speed(iter/s)": 0.100836 | |
| }, | |
| { | |
| "epoch": 0.6088992974238876, | |
| "grad_norm": 0.283777117729187, | |
| "learning_rate": 1.875234575217441e-05, | |
| "loss": 0.38051447868347166, | |
| "memory(GiB)": 127.52, | |
| "step": 520, | |
| "token_acc": 0.8643710911880905, | |
| "train_speed(iter/s)": 0.100855 | |
| }, | |
| { | |
| "epoch": 0.6147540983606558, | |
| "grad_norm": 0.2693696618080139, | |
| "learning_rate": 1.8720934946322466e-05, | |
| "loss": 0.3941120862960815, | |
| "memory(GiB)": 127.52, | |
| "step": 525, | |
| "token_acc": 0.8575597963261037, | |
| "train_speed(iter/s)": 0.10087 | |
| }, | |
| { | |
| "epoch": 0.6206088992974239, | |
| "grad_norm": 0.2502153515815735, | |
| "learning_rate": 1.8689160629874622e-05, | |
| "loss": 0.36350240707397463, | |
| "memory(GiB)": 127.52, | |
| "step": 530, | |
| "token_acc": 0.8788319745551232, | |
| "train_speed(iter/s)": 0.100858 | |
| }, | |
| { | |
| "epoch": 0.6264637002341921, | |
| "grad_norm": 0.2630903124809265, | |
| "learning_rate": 1.865702412726465e-05, | |
| "loss": 0.3757188081741333, | |
| "memory(GiB)": 127.52, | |
| "step": 535, | |
| "token_acc": 0.8759880681391801, | |
| "train_speed(iter/s)": 0.100883 | |
| }, | |
| { | |
| "epoch": 0.6323185011709602, | |
| "grad_norm": 0.2726694941520691, | |
| "learning_rate": 1.8624526778023142e-05, | |
| "loss": 0.3769080638885498, | |
| "memory(GiB)": 127.52, | |
| "step": 540, | |
| "token_acc": 0.8733085553248108, | |
| "train_speed(iter/s)": 0.100896 | |
| }, | |
| { | |
| "epoch": 0.6381733021077284, | |
| "grad_norm": 0.2886805832386017, | |
| "learning_rate": 1.85916699367217e-05, | |
| "loss": 0.3801791429519653, | |
| "memory(GiB)": 127.52, | |
| "step": 545, | |
| "token_acc": 0.8658838767809878, | |
| "train_speed(iter/s)": 0.100897 | |
| }, | |
| { | |
| "epoch": 0.6440281030444965, | |
| "grad_norm": 0.28697773814201355, | |
| "learning_rate": 1.855845497291646e-05, | |
| "loss": 0.3925698041915894, | |
| "memory(GiB)": 127.52, | |
| "step": 550, | |
| "token_acc": 0.8631926701668678, | |
| "train_speed(iter/s)": 0.100906 | |
| }, | |
| { | |
| "epoch": 0.6498829039812647, | |
| "grad_norm": 0.26602187752723694, | |
| "learning_rate": 1.8524883271091004e-05, | |
| "loss": 0.38099260330200196, | |
| "memory(GiB)": 127.52, | |
| "step": 555, | |
| "token_acc": 0.8710958004218123, | |
| "train_speed(iter/s)": 0.100905 | |
| }, | |
| { | |
| "epoch": 0.6557377049180327, | |
| "grad_norm": 0.2533867359161377, | |
| "learning_rate": 1.8490956230598668e-05, | |
| "loss": 0.3997593879699707, | |
| "memory(GiB)": 127.52, | |
| "step": 560, | |
| "token_acc": 0.8649844205573561, | |
| "train_speed(iter/s)": 0.100903 | |
| }, | |
| { | |
| "epoch": 0.6615925058548009, | |
| "grad_norm": 0.287895530462265, | |
| "learning_rate": 1.8456675265604183e-05, | |
| "loss": 0.3792722702026367, | |
| "memory(GiB)": 127.52, | |
| "step": 565, | |
| "token_acc": 0.8638586429067867, | |
| "train_speed(iter/s)": 0.100923 | |
| }, | |
| { | |
| "epoch": 0.667447306791569, | |
| "grad_norm": 0.30773329734802246, | |
| "learning_rate": 1.842204180502476e-05, | |
| "loss": 0.3829328536987305, | |
| "memory(GiB)": 127.52, | |
| "step": 570, | |
| "token_acc": 0.8727389815600163, | |
| "train_speed(iter/s)": 0.100938 | |
| }, | |
| { | |
| "epoch": 0.6733021077283372, | |
| "grad_norm": 0.30301594734191895, | |
| "learning_rate": 1.8387057292470517e-05, | |
| "loss": 0.39844498634338377, | |
| "memory(GiB)": 127.52, | |
| "step": 575, | |
| "token_acc": 0.8632732480308832, | |
| "train_speed(iter/s)": 0.100939 | |
| }, | |
| { | |
| "epoch": 0.6791569086651054, | |
| "grad_norm": 0.27384889125823975, | |
| "learning_rate": 1.8351723186184295e-05, | |
| "loss": 0.3866116523742676, | |
| "memory(GiB)": 127.52, | |
| "step": 580, | |
| "token_acc": 0.8537265892945595, | |
| "train_speed(iter/s)": 0.100945 | |
| }, | |
| { | |
| "epoch": 0.6850117096018735, | |
| "grad_norm": 0.300459086894989, | |
| "learning_rate": 1.8316040958980896e-05, | |
| "loss": 0.3856982707977295, | |
| "memory(GiB)": 127.52, | |
| "step": 585, | |
| "token_acc": 0.8774584957729205, | |
| "train_speed(iter/s)": 0.100955 | |
| }, | |
| { | |
| "epoch": 0.6908665105386417, | |
| "grad_norm": 0.32351046800613403, | |
| "learning_rate": 1.828001209818567e-05, | |
| "loss": 0.403375244140625, | |
| "memory(GiB)": 127.52, | |
| "step": 590, | |
| "token_acc": 0.8606907256499806, | |
| "train_speed(iter/s)": 0.100969 | |
| }, | |
| { | |
| "epoch": 0.6967213114754098, | |
| "grad_norm": 0.3171491324901581, | |
| "learning_rate": 1.8243638105572547e-05, | |
| "loss": 0.3851677656173706, | |
| "memory(GiB)": 127.52, | |
| "step": 595, | |
| "token_acc": 0.8713710233181722, | |
| "train_speed(iter/s)": 0.100978 | |
| }, | |
| { | |
| "epoch": 0.702576112412178, | |
| "grad_norm": 0.3137357532978058, | |
| "learning_rate": 1.82069204973014e-05, | |
| "loss": 0.3799635648727417, | |
| "memory(GiB)": 127.52, | |
| "step": 600, | |
| "token_acc": 0.8784900280426953, | |
| "train_speed(iter/s)": 0.101006 | |
| }, | |
| { | |
| "epoch": 0.7084309133489461, | |
| "grad_norm": 0.28434112668037415, | |
| "learning_rate": 1.816986080385489e-05, | |
| "loss": 0.40052270889282227, | |
| "memory(GiB)": 127.52, | |
| "step": 605, | |
| "token_acc": 0.8462195284773476, | |
| "train_speed(iter/s)": 0.101006 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.30604925751686096, | |
| "learning_rate": 1.813246056997465e-05, | |
| "loss": 0.3835596084594727, | |
| "memory(GiB)": 127.52, | |
| "step": 610, | |
| "token_acc": 0.8614169593452318, | |
| "train_speed(iter/s)": 0.101011 | |
| }, | |
| { | |
| "epoch": 0.7201405152224825, | |
| "grad_norm": 0.3114904463291168, | |
| "learning_rate": 1.809472135459688e-05, | |
| "loss": 0.38530282974243163, | |
| "memory(GiB)": 127.52, | |
| "step": 615, | |
| "token_acc": 0.8642289288270977, | |
| "train_speed(iter/s)": 0.101016 | |
| }, | |
| { | |
| "epoch": 0.7259953161592506, | |
| "grad_norm": 0.29733744263648987, | |
| "learning_rate": 1.8056644730787412e-05, | |
| "loss": 0.39410853385925293, | |
| "memory(GiB)": 127.52, | |
| "step": 620, | |
| "token_acc": 0.8700788764122717, | |
| "train_speed(iter/s)": 0.101043 | |
| }, | |
| { | |
| "epoch": 0.7318501170960188, | |
| "grad_norm": 0.28432950377464294, | |
| "learning_rate": 1.8018232285676092e-05, | |
| "loss": 0.3745533227920532, | |
| "memory(GiB)": 127.52, | |
| "step": 625, | |
| "token_acc": 0.8656255611667859, | |
| "train_speed(iter/s)": 0.101068 | |
| }, | |
| { | |
| "epoch": 0.7377049180327869, | |
| "grad_norm": 0.2615796625614166, | |
| "learning_rate": 1.797948562039066e-05, | |
| "loss": 0.3919194459915161, | |
| "memory(GiB)": 127.52, | |
| "step": 630, | |
| "token_acc": 0.8600643002591344, | |
| "train_speed(iter/s)": 0.101046 | |
| }, | |
| { | |
| "epoch": 0.7435597189695551, | |
| "grad_norm": 0.27267464995384216, | |
| "learning_rate": 1.7940406349989987e-05, | |
| "loss": 0.388127875328064, | |
| "memory(GiB)": 127.52, | |
| "step": 635, | |
| "token_acc": 0.8630637748223948, | |
| "train_speed(iter/s)": 0.10107 | |
| }, | |
| { | |
| "epoch": 0.7494145199063232, | |
| "grad_norm": 0.274472177028656, | |
| "learning_rate": 1.7900996103396772e-05, | |
| "loss": 0.38143386840820315, | |
| "memory(GiB)": 127.52, | |
| "step": 640, | |
| "token_acc": 0.8701312848988129, | |
| "train_speed(iter/s)": 0.101069 | |
| }, | |
| { | |
| "epoch": 0.7552693208430913, | |
| "grad_norm": 0.27030906081199646, | |
| "learning_rate": 1.7861256523329634e-05, | |
| "loss": 0.3786378145217896, | |
| "memory(GiB)": 127.52, | |
| "step": 645, | |
| "token_acc": 0.8602489884842826, | |
| "train_speed(iter/s)": 0.101063 | |
| }, | |
| { | |
| "epoch": 0.7611241217798594, | |
| "grad_norm": 0.2663189172744751, | |
| "learning_rate": 1.7821189266234647e-05, | |
| "loss": 0.38404848575592043, | |
| "memory(GiB)": 127.52, | |
| "step": 650, | |
| "token_acc": 0.8616431608743905, | |
| "train_speed(iter/s)": 0.10106 | |
| }, | |
| { | |
| "epoch": 0.7669789227166276, | |
| "grad_norm": 0.26061564683914185, | |
| "learning_rate": 1.7780796002216285e-05, | |
| "loss": 0.3781083822250366, | |
| "memory(GiB)": 127.52, | |
| "step": 655, | |
| "token_acc": 0.8578937981658266, | |
| "train_speed(iter/s)": 0.101068 | |
| }, | |
| { | |
| "epoch": 0.7728337236533958, | |
| "grad_norm": 0.2600330412387848, | |
| "learning_rate": 1.7740078414967817e-05, | |
| "loss": 0.3852128505706787, | |
| "memory(GiB)": 127.52, | |
| "step": 660, | |
| "token_acc": 0.872952104972653, | |
| "train_speed(iter/s)": 0.101073 | |
| }, | |
| { | |
| "epoch": 0.7786885245901639, | |
| "grad_norm": 0.27133384346961975, | |
| "learning_rate": 1.7699038201701132e-05, | |
| "loss": 0.37737174034118653, | |
| "memory(GiB)": 127.52, | |
| "step": 665, | |
| "token_acc": 0.8593767976691324, | |
| "train_speed(iter/s)": 0.101088 | |
| }, | |
| { | |
| "epoch": 0.7845433255269321, | |
| "grad_norm": 0.270047664642334, | |
| "learning_rate": 1.7657677073075968e-05, | |
| "loss": 0.38488593101501467, | |
| "memory(GiB)": 127.52, | |
| "step": 670, | |
| "token_acc": 0.8627122177041754, | |
| "train_speed(iter/s)": 0.101091 | |
| }, | |
| { | |
| "epoch": 0.7903981264637002, | |
| "grad_norm": 0.29772108793258667, | |
| "learning_rate": 1.761599675312864e-05, | |
| "loss": 0.3877399444580078, | |
| "memory(GiB)": 127.52, | |
| "step": 675, | |
| "token_acc": 0.8765810968128602, | |
| "train_speed(iter/s)": 0.101091 | |
| }, | |
| { | |
| "epoch": 0.7962529274004684, | |
| "grad_norm": 0.30914777517318726, | |
| "learning_rate": 1.7573998979200163e-05, | |
| "loss": 0.38101863861083984, | |
| "memory(GiB)": 127.52, | |
| "step": 680, | |
| "token_acc": 0.8670370510587819, | |
| "train_speed(iter/s)": 0.101106 | |
| }, | |
| { | |
| "epoch": 0.8021077283372365, | |
| "grad_norm": 0.24654199182987213, | |
| "learning_rate": 1.753168550186383e-05, | |
| "loss": 0.3897979259490967, | |
| "memory(GiB)": 127.52, | |
| "step": 685, | |
| "token_acc": 0.8695668499228697, | |
| "train_speed(iter/s)": 0.101113 | |
| }, | |
| { | |
| "epoch": 0.8079625292740047, | |
| "grad_norm": 0.268245667219162, | |
| "learning_rate": 1.7489058084852247e-05, | |
| "loss": 0.3852191686630249, | |
| "memory(GiB)": 127.52, | |
| "step": 690, | |
| "token_acc": 0.8590092968475919, | |
| "train_speed(iter/s)": 0.101108 | |
| }, | |
| { | |
| "epoch": 0.8138173302107728, | |
| "grad_norm": 0.2539999186992645, | |
| "learning_rate": 1.744611850498383e-05, | |
| "loss": 0.38076086044311525, | |
| "memory(GiB)": 127.52, | |
| "step": 695, | |
| "token_acc": 0.8692958838741554, | |
| "train_speed(iter/s)": 0.101093 | |
| }, | |
| { | |
| "epoch": 0.819672131147541, | |
| "grad_norm": 0.30060875415802, | |
| "learning_rate": 1.7402868552088724e-05, | |
| "loss": 0.37528285980224607, | |
| "memory(GiB)": 127.52, | |
| "step": 700, | |
| "token_acc": 0.863746098668577, | |
| "train_speed(iter/s)": 0.101099 | |
| }, | |
| { | |
| "epoch": 0.8255269320843092, | |
| "grad_norm": 0.2880835235118866, | |
| "learning_rate": 1.73593100289342e-05, | |
| "loss": 0.3839045286178589, | |
| "memory(GiB)": 127.52, | |
| "step": 705, | |
| "token_acc": 0.8606477737869129, | |
| "train_speed(iter/s)": 0.101117 | |
| }, | |
| { | |
| "epoch": 0.8313817330210773, | |
| "grad_norm": 0.27465176582336426, | |
| "learning_rate": 1.7315444751149533e-05, | |
| "loss": 0.38219666481018066, | |
| "memory(GiB)": 127.52, | |
| "step": 710, | |
| "token_acc": 0.866171235481518, | |
| "train_speed(iter/s)": 0.101137 | |
| }, | |
| { | |
| "epoch": 0.8372365339578455, | |
| "grad_norm": 0.2839786410331726, | |
| "learning_rate": 1.727127454715029e-05, | |
| "loss": 0.3815479755401611, | |
| "memory(GiB)": 127.52, | |
| "step": 715, | |
| "token_acc": 0.8742821134330966, | |
| "train_speed(iter/s)": 0.101149 | |
| }, | |
| { | |
| "epoch": 0.8430913348946136, | |
| "grad_norm": 0.31399768590927124, | |
| "learning_rate": 1.722680125806214e-05, | |
| "loss": 0.38201520442962644, | |
| "memory(GiB)": 127.52, | |
| "step": 720, | |
| "token_acc": 0.8587188600974719, | |
| "train_speed(iter/s)": 0.101155 | |
| }, | |
| { | |
| "epoch": 0.8489461358313818, | |
| "grad_norm": 0.3099398910999298, | |
| "learning_rate": 1.71820267376441e-05, | |
| "loss": 0.386704421043396, | |
| "memory(GiB)": 127.52, | |
| "step": 725, | |
| "token_acc": 0.8638798635493387, | |
| "train_speed(iter/s)": 0.101166 | |
| }, | |
| { | |
| "epoch": 0.8548009367681498, | |
| "grad_norm": 0.2707797884941101, | |
| "learning_rate": 1.7136952852211274e-05, | |
| "loss": 0.3908542156219482, | |
| "memory(GiB)": 127.52, | |
| "step": 730, | |
| "token_acc": 0.8531080479659894, | |
| "train_speed(iter/s)": 0.10118 | |
| }, | |
| { | |
| "epoch": 0.860655737704918, | |
| "grad_norm": 0.24912209808826447, | |
| "learning_rate": 1.7091581480557057e-05, | |
| "loss": 0.3775820732116699, | |
| "memory(GiB)": 127.52, | |
| "step": 735, | |
| "token_acc": 0.8631545113262953, | |
| "train_speed(iter/s)": 0.101187 | |
| }, | |
| { | |
| "epoch": 0.8665105386416861, | |
| "grad_norm": 0.2668187916278839, | |
| "learning_rate": 1.7045914513874815e-05, | |
| "loss": 0.39071335792541506, | |
| "memory(GiB)": 127.52, | |
| "step": 740, | |
| "token_acc": 0.863421279036421, | |
| "train_speed(iter/s)": 0.101213 | |
| }, | |
| { | |
| "epoch": 0.8723653395784543, | |
| "grad_norm": 0.24733468890190125, | |
| "learning_rate": 1.699995385567907e-05, | |
| "loss": 0.39272005558013917, | |
| "memory(GiB)": 127.52, | |
| "step": 745, | |
| "token_acc": 0.8545664531712299, | |
| "train_speed(iter/s)": 0.101244 | |
| }, | |
| { | |
| "epoch": 0.8782201405152225, | |
| "grad_norm": 0.2632930278778076, | |
| "learning_rate": 1.695370142172614e-05, | |
| "loss": 0.3845970630645752, | |
| "memory(GiB)": 127.52, | |
| "step": 750, | |
| "token_acc": 0.8612419217474074, | |
| "train_speed(iter/s)": 0.101242 | |
| }, | |
| { | |
| "epoch": 0.8840749414519906, | |
| "grad_norm": 0.26514074206352234, | |
| "learning_rate": 1.690715913993429e-05, | |
| "loss": 0.38790068626403806, | |
| "memory(GiB)": 127.52, | |
| "step": 755, | |
| "token_acc": 0.8648871034856036, | |
| "train_speed(iter/s)": 0.101244 | |
| }, | |
| { | |
| "epoch": 0.8899297423887588, | |
| "grad_norm": 0.26957836747169495, | |
| "learning_rate": 1.6860328950303392e-05, | |
| "loss": 0.36716523170471194, | |
| "memory(GiB)": 127.52, | |
| "step": 760, | |
| "token_acc": 0.8711639836976192, | |
| "train_speed(iter/s)": 0.101257 | |
| }, | |
| { | |
| "epoch": 0.8957845433255269, | |
| "grad_norm": 0.2675636410713196, | |
| "learning_rate": 1.6813212804834033e-05, | |
| "loss": 0.38340959548950193, | |
| "memory(GiB)": 127.52, | |
| "step": 765, | |
| "token_acc": 0.8579816582165225, | |
| "train_speed(iter/s)": 0.101264 | |
| }, | |
| { | |
| "epoch": 0.9016393442622951, | |
| "grad_norm": 0.26134225726127625, | |
| "learning_rate": 1.676581266744615e-05, | |
| "loss": 0.3752238988876343, | |
| "memory(GiB)": 127.52, | |
| "step": 770, | |
| "token_acc": 0.8638096187142661, | |
| "train_speed(iter/s)": 0.101274 | |
| }, | |
| { | |
| "epoch": 0.9074941451990632, | |
| "grad_norm": 0.2766994535923004, | |
| "learning_rate": 1.6718130513897207e-05, | |
| "loss": 0.37386231422424315, | |
| "memory(GiB)": 127.52, | |
| "step": 775, | |
| "token_acc": 0.8692816207520612, | |
| "train_speed(iter/s)": 0.10128 | |
| }, | |
| { | |
| "epoch": 0.9133489461358314, | |
| "grad_norm": 0.2736496329307556, | |
| "learning_rate": 1.667016833169979e-05, | |
| "loss": 0.3910179138183594, | |
| "memory(GiB)": 127.52, | |
| "step": 780, | |
| "token_acc": 0.8679116603442695, | |
| "train_speed(iter/s)": 0.101285 | |
| }, | |
| { | |
| "epoch": 0.9192037470725996, | |
| "grad_norm": 0.25334644317626953, | |
| "learning_rate": 1.6621928120038806e-05, | |
| "loss": 0.3837088346481323, | |
| "memory(GiB)": 127.52, | |
| "step": 785, | |
| "token_acc": 0.8568342264714894, | |
| "train_speed(iter/s)": 0.101285 | |
| }, | |
| { | |
| "epoch": 0.9250585480093677, | |
| "grad_norm": 0.2526282072067261, | |
| "learning_rate": 1.657341188968811e-05, | |
| "loss": 0.3741894721984863, | |
| "memory(GiB)": 127.52, | |
| "step": 790, | |
| "token_acc": 0.8600209680781232, | |
| "train_speed(iter/s)": 0.101298 | |
| }, | |
| { | |
| "epoch": 0.9309133489461359, | |
| "grad_norm": 0.2629476487636566, | |
| "learning_rate": 1.6524621662926733e-05, | |
| "loss": 0.3736875057220459, | |
| "memory(GiB)": 127.52, | |
| "step": 795, | |
| "token_acc": 0.8765449927636102, | |
| "train_speed(iter/s)": 0.101311 | |
| }, | |
| { | |
| "epoch": 0.936768149882904, | |
| "grad_norm": 0.26536864042282104, | |
| "learning_rate": 1.6475559473454558e-05, | |
| "loss": 0.3841824769973755, | |
| "memory(GiB)": 127.52, | |
| "step": 800, | |
| "token_acc": 0.8732290436835891, | |
| "train_speed(iter/s)": 0.101317 | |
| }, | |
| { | |
| "epoch": 0.9426229508196722, | |
| "grad_norm": 0.9267993569374084, | |
| "learning_rate": 1.6426227366307563e-05, | |
| "loss": 0.3876027584075928, | |
| "memory(GiB)": 127.52, | |
| "step": 805, | |
| "token_acc": 0.873662949476559, | |
| "train_speed(iter/s)": 0.10131 | |
| }, | |
| { | |
| "epoch": 0.9484777517564403, | |
| "grad_norm": 0.31513214111328125, | |
| "learning_rate": 1.6376627397772576e-05, | |
| "loss": 0.38577656745910643, | |
| "memory(GiB)": 127.52, | |
| "step": 810, | |
| "token_acc": 0.8582883611177872, | |
| "train_speed(iter/s)": 0.101308 | |
| }, | |
| { | |
| "epoch": 0.9543325526932084, | |
| "grad_norm": 0.43881845474243164, | |
| "learning_rate": 1.6326761635301572e-05, | |
| "loss": 0.3793084383010864, | |
| "memory(GiB)": 127.52, | |
| "step": 815, | |
| "token_acc": 0.8658072630089608, | |
| "train_speed(iter/s)": 0.101317 | |
| }, | |
| { | |
| "epoch": 0.9601873536299765, | |
| "grad_norm": 0.2627616822719574, | |
| "learning_rate": 1.6276632157425475e-05, | |
| "loss": 0.3868673801422119, | |
| "memory(GiB)": 127.52, | |
| "step": 820, | |
| "token_acc": 0.8609059346385673, | |
| "train_speed(iter/s)": 0.101319 | |
| }, | |
| { | |
| "epoch": 0.9660421545667447, | |
| "grad_norm": 0.28743499517440796, | |
| "learning_rate": 1.6226241053667536e-05, | |
| "loss": 0.39165661334991453, | |
| "memory(GiB)": 127.52, | |
| "step": 825, | |
| "token_acc": 0.8566733687511922, | |
| "train_speed(iter/s)": 0.101328 | |
| }, | |
| { | |
| "epoch": 0.9718969555035128, | |
| "grad_norm": 0.2647303640842438, | |
| "learning_rate": 1.617559042445625e-05, | |
| "loss": 0.3914238929748535, | |
| "memory(GiB)": 127.52, | |
| "step": 830, | |
| "token_acc": 0.8686447332876824, | |
| "train_speed(iter/s)": 0.101331 | |
| }, | |
| { | |
| "epoch": 0.977751756440281, | |
| "grad_norm": 0.28214219212532043, | |
| "learning_rate": 1.6124682381037767e-05, | |
| "loss": 0.3775761127471924, | |
| "memory(GiB)": 127.52, | |
| "step": 835, | |
| "token_acc": 0.8658163872414246, | |
| "train_speed(iter/s)": 0.101335 | |
| }, | |
| { | |
| "epoch": 0.9836065573770492, | |
| "grad_norm": 0.2978610694408417, | |
| "learning_rate": 1.607351904538792e-05, | |
| "loss": 0.39282917976379395, | |
| "memory(GiB)": 127.52, | |
| "step": 840, | |
| "token_acc": 0.866700342369647, | |
| "train_speed(iter/s)": 0.101325 | |
| }, | |
| { | |
| "epoch": 0.9894613583138173, | |
| "grad_norm": 0.2674395740032196, | |
| "learning_rate": 1.6022102550123775e-05, | |
| "loss": 0.3796736240386963, | |
| "memory(GiB)": 127.52, | |
| "step": 845, | |
| "token_acc": 0.8609281823639752, | |
| "train_speed(iter/s)": 0.101326 | |
| }, | |
| { | |
| "epoch": 0.9953161592505855, | |
| "grad_norm": 0.2766255736351013, | |
| "learning_rate": 1.597043503841471e-05, | |
| "loss": 0.3800792217254639, | |
| "memory(GiB)": 127.52, | |
| "step": 850, | |
| "token_acc": 0.8745568192822268, | |
| "train_speed(iter/s)": 0.101325 | |
| }, | |
| { | |
| "epoch": 1.0011709601873535, | |
| "grad_norm": 0.36053553223609924, | |
| "learning_rate": 1.5918518663893124e-05, | |
| "loss": 0.3734774589538574, | |
| "memory(GiB)": 127.52, | |
| "step": 855, | |
| "token_acc": 0.8709147849019284, | |
| "train_speed(iter/s)": 0.100996 | |
| }, | |
| { | |
| "epoch": 1.0070257611241218, | |
| "grad_norm": 0.29508745670318604, | |
| "learning_rate": 1.5866355590564637e-05, | |
| "loss": 0.3578346252441406, | |
| "memory(GiB)": 127.52, | |
| "step": 860, | |
| "token_acc": 0.8851065028386151, | |
| "train_speed(iter/s)": 0.100988 | |
| }, | |
| { | |
| "epoch": 1.0128805620608898, | |
| "grad_norm": 0.30008167028427124, | |
| "learning_rate": 1.5813947992717894e-05, | |
| "loss": 0.34525480270385744, | |
| "memory(GiB)": 127.52, | |
| "step": 865, | |
| "token_acc": 0.8753548176879359, | |
| "train_speed(iter/s)": 0.10098 | |
| }, | |
| { | |
| "epoch": 1.018735362997658, | |
| "grad_norm": 0.2938152253627777, | |
| "learning_rate": 1.5761298054833947e-05, | |
| "loss": 0.3546164035797119, | |
| "memory(GiB)": 127.52, | |
| "step": 870, | |
| "token_acc": 0.8762193571592467, | |
| "train_speed(iter/s)": 0.100965 | |
| }, | |
| { | |
| "epoch": 1.0245901639344261, | |
| "grad_norm": 0.27178069949150085, | |
| "learning_rate": 1.5708407971495195e-05, | |
| "loss": 0.3612537384033203, | |
| "memory(GiB)": 127.52, | |
| "step": 875, | |
| "token_acc": 0.8722169198754557, | |
| "train_speed(iter/s)": 0.100976 | |
| }, | |
| { | |
| "epoch": 1.0304449648711944, | |
| "grad_norm": 0.2759335935115814, | |
| "learning_rate": 1.565527994729389e-05, | |
| "loss": 0.3513669967651367, | |
| "memory(GiB)": 127.52, | |
| "step": 880, | |
| "token_acc": 0.8818436745370559, | |
| "train_speed(iter/s)": 0.100984 | |
| }, | |
| { | |
| "epoch": 1.0362997658079625, | |
| "grad_norm": 0.2735261917114258, | |
| "learning_rate": 1.5601916196740283e-05, | |
| "loss": 0.3473806858062744, | |
| "memory(GiB)": 127.52, | |
| "step": 885, | |
| "token_acc": 0.8784491835740441, | |
| "train_speed(iter/s)": 0.100979 | |
| }, | |
| { | |
| "epoch": 1.0421545667447307, | |
| "grad_norm": 0.28892189264297485, | |
| "learning_rate": 1.5548318944170276e-05, | |
| "loss": 0.3433929443359375, | |
| "memory(GiB)": 127.52, | |
| "step": 890, | |
| "token_acc": 0.8839334112478968, | |
| "train_speed(iter/s)": 0.100971 | |
| }, | |
| { | |
| "epoch": 1.0480093676814988, | |
| "grad_norm": 0.2602222263813019, | |
| "learning_rate": 1.5494490423652732e-05, | |
| "loss": 0.3427423000335693, | |
| "memory(GiB)": 127.52, | |
| "step": 895, | |
| "token_acc": 0.876471048390882, | |
| "train_speed(iter/s)": 0.100951 | |
| }, | |
| { | |
| "epoch": 1.053864168618267, | |
| "grad_norm": 0.2913144528865814, | |
| "learning_rate": 1.544043287889635e-05, | |
| "loss": 0.3336780071258545, | |
| "memory(GiB)": 127.52, | |
| "step": 900, | |
| "token_acc": 0.8869567959634185, | |
| "train_speed(iter/s)": 0.10095 | |
| }, | |
| { | |
| "epoch": 1.059718969555035, | |
| "grad_norm": 0.2634846270084381, | |
| "learning_rate": 1.538614856315614e-05, | |
| "loss": 0.3489675998687744, | |
| "memory(GiB)": 127.52, | |
| "step": 905, | |
| "token_acc": 0.8832413903915163, | |
| "train_speed(iter/s)": 0.100958 | |
| }, | |
| { | |
| "epoch": 1.0655737704918034, | |
| "grad_norm": 0.2699672281742096, | |
| "learning_rate": 1.5331639739139477e-05, | |
| "loss": 0.3432894229888916, | |
| "memory(GiB)": 127.52, | |
| "step": 910, | |
| "token_acc": 0.8669136816431162, | |
| "train_speed(iter/s)": 0.100951 | |
| }, | |
| { | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 0.2946908175945282, | |
| "learning_rate": 1.5276908678911837e-05, | |
| "loss": 0.3399630546569824, | |
| "memory(GiB)": 127.52, | |
| "step": 915, | |
| "token_acc": 0.8821736748390632, | |
| "train_speed(iter/s)": 0.100953 | |
| }, | |
| { | |
| "epoch": 1.0772833723653397, | |
| "grad_norm": 0.31119436025619507, | |
| "learning_rate": 1.5221957663802043e-05, | |
| "loss": 0.3506146430969238, | |
| "memory(GiB)": 127.52, | |
| "step": 920, | |
| "token_acc": 0.8818868935608091, | |
| "train_speed(iter/s)": 0.100935 | |
| }, | |
| { | |
| "epoch": 1.0831381733021077, | |
| "grad_norm": 0.27400681376457214, | |
| "learning_rate": 1.5166788984307204e-05, | |
| "loss": 0.35775036811828614, | |
| "memory(GiB)": 127.52, | |
| "step": 925, | |
| "token_acc": 0.8750959445346218, | |
| "train_speed(iter/s)": 0.100931 | |
| }, | |
| { | |
| "epoch": 1.088992974238876, | |
| "grad_norm": 0.3916493058204651, | |
| "learning_rate": 1.5111404939997227e-05, | |
| "loss": 0.3546015739440918, | |
| "memory(GiB)": 127.52, | |
| "step": 930, | |
| "token_acc": 0.8738711676022755, | |
| "train_speed(iter/s)": 0.100933 | |
| }, | |
| { | |
| "epoch": 1.094847775175644, | |
| "grad_norm": 0.3681865930557251, | |
| "learning_rate": 1.5055807839418966e-05, | |
| "loss": 0.33371834754943847, | |
| "memory(GiB)": 127.52, | |
| "step": 935, | |
| "token_acc": 0.8814006570111667, | |
| "train_speed(iter/s)": 0.100931 | |
| }, | |
| { | |
| "epoch": 1.100702576112412, | |
| "grad_norm": 0.27416518330574036, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.3561122417449951, | |
| "memory(GiB)": 127.52, | |
| "step": 940, | |
| "token_acc": 0.8838524966358717, | |
| "train_speed(iter/s)": 0.100932 | |
| }, | |
| { | |
| "epoch": 1.1065573770491803, | |
| "grad_norm": 0.2653830349445343, | |
| "learning_rate": 1.494398374795204e-05, | |
| "loss": 0.3430471897125244, | |
| "memory(GiB)": 127.52, | |
| "step": 945, | |
| "token_acc": 0.8739330062998951, | |
| "train_speed(iter/s)": 0.100924 | |
| }, | |
| { | |
| "epoch": 1.1124121779859484, | |
| "grad_norm": 0.29074740409851074, | |
| "learning_rate": 1.4887761418173947e-05, | |
| "loss": 0.36190090179443357, | |
| "memory(GiB)": 127.52, | |
| "step": 950, | |
| "token_acc": 0.8833006769910948, | |
| "train_speed(iter/s)": 0.100919 | |
| }, | |
| { | |
| "epoch": 1.1182669789227166, | |
| "grad_norm": 0.2751435339450836, | |
| "learning_rate": 1.4831335354154444e-05, | |
| "loss": 0.34648761749267576, | |
| "memory(GiB)": 127.52, | |
| "step": 955, | |
| "token_acc": 0.8776634838921327, | |
| "train_speed(iter/s)": 0.100926 | |
| }, | |
| { | |
| "epoch": 1.1241217798594847, | |
| "grad_norm": 0.2628922164440155, | |
| "learning_rate": 1.4774707907874392e-05, | |
| "loss": 0.34562859535217283, | |
| "memory(GiB)": 127.52, | |
| "step": 960, | |
| "token_acc": 0.8836736799002247, | |
| "train_speed(iter/s)": 0.100911 | |
| }, | |
| { | |
| "epoch": 1.129976580796253, | |
| "grad_norm": 0.2639271020889282, | |
| "learning_rate": 1.4717881439708786e-05, | |
| "loss": 0.34596388339996337, | |
| "memory(GiB)": 127.52, | |
| "step": 965, | |
| "token_acc": 0.8673695686030214, | |
| "train_speed(iter/s)": 0.100909 | |
| }, | |
| { | |
| "epoch": 1.135831381733021, | |
| "grad_norm": 0.28422874212265015, | |
| "learning_rate": 1.4660858318328348e-05, | |
| "loss": 0.3498117446899414, | |
| "memory(GiB)": 127.52, | |
| "step": 970, | |
| "token_acc": 0.866499586445358, | |
| "train_speed(iter/s)": 0.100888 | |
| }, | |
| { | |
| "epoch": 1.1416861826697893, | |
| "grad_norm": 0.2625197470188141, | |
| "learning_rate": 1.4603640920600813e-05, | |
| "loss": 0.35533895492553713, | |
| "memory(GiB)": 127.52, | |
| "step": 975, | |
| "token_acc": 0.8624783775908141, | |
| "train_speed(iter/s)": 0.100863 | |
| }, | |
| { | |
| "epoch": 1.1475409836065573, | |
| "grad_norm": 0.2902534008026123, | |
| "learning_rate": 1.4546231631491827e-05, | |
| "loss": 0.35151519775390627, | |
| "memory(GiB)": 127.52, | |
| "step": 980, | |
| "token_acc": 0.871260222085633, | |
| "train_speed(iter/s)": 0.100833 | |
| }, | |
| { | |
| "epoch": 1.1533957845433256, | |
| "grad_norm": 0.2525332570075989, | |
| "learning_rate": 1.4488632843965573e-05, | |
| "loss": 0.3441092729568481, | |
| "memory(GiB)": 127.52, | |
| "step": 985, | |
| "token_acc": 0.8626160602258469, | |
| "train_speed(iter/s)": 0.100824 | |
| }, | |
| { | |
| "epoch": 1.1592505854800936, | |
| "grad_norm": 0.26731306314468384, | |
| "learning_rate": 1.4430846958884995e-05, | |
| "loss": 0.3539264678955078, | |
| "memory(GiB)": 127.52, | |
| "step": 990, | |
| "token_acc": 0.8706765643432645, | |
| "train_speed(iter/s)": 0.100815 | |
| }, | |
| { | |
| "epoch": 1.165105386416862, | |
| "grad_norm": 0.2605798542499542, | |
| "learning_rate": 1.4372876384911741e-05, | |
| "loss": 0.35328848361968995, | |
| "memory(GiB)": 127.52, | |
| "step": 995, | |
| "token_acc": 0.8729384617783252, | |
| "train_speed(iter/s)": 0.100809 | |
| }, | |
| { | |
| "epoch": 1.17096018735363, | |
| "grad_norm": 0.2707096338272095, | |
| "learning_rate": 1.4314723538405752e-05, | |
| "loss": 0.36124861240386963, | |
| "memory(GiB)": 127.52, | |
| "step": 1000, | |
| "token_acc": 0.8623729975690332, | |
| "train_speed(iter/s)": 0.100795 | |
| }, | |
| { | |
| "epoch": 1.1768149882903982, | |
| "grad_norm": 0.26851606369018555, | |
| "learning_rate": 1.4256390843324556e-05, | |
| "loss": 0.35548346042633056, | |
| "memory(GiB)": 127.52, | |
| "step": 1005, | |
| "token_acc": 0.868687436031853, | |
| "train_speed(iter/s)": 0.100786 | |
| }, | |
| { | |
| "epoch": 1.1826697892271663, | |
| "grad_norm": 0.27084365487098694, | |
| "learning_rate": 1.4197880731122221e-05, | |
| "loss": 0.351657772064209, | |
| "memory(GiB)": 127.52, | |
| "step": 1010, | |
| "token_acc": 0.8682709314201729, | |
| "train_speed(iter/s)": 0.100787 | |
| }, | |
| { | |
| "epoch": 1.1885245901639343, | |
| "grad_norm": 0.27497202157974243, | |
| "learning_rate": 1.4139195640648008e-05, | |
| "loss": 0.355600380897522, | |
| "memory(GiB)": 127.52, | |
| "step": 1015, | |
| "token_acc": 0.8803992028496556, | |
| "train_speed(iter/s)": 0.10078 | |
| }, | |
| { | |
| "epoch": 1.1943793911007026, | |
| "grad_norm": 0.2708893418312073, | |
| "learning_rate": 1.4080338018044712e-05, | |
| "loss": 0.3596624851226807, | |
| "memory(GiB)": 127.52, | |
| "step": 1020, | |
| "token_acc": 0.8694279635903098, | |
| "train_speed(iter/s)": 0.100784 | |
| }, | |
| { | |
| "epoch": 1.2002341920374708, | |
| "grad_norm": 0.32129156589508057, | |
| "learning_rate": 1.4021310316646708e-05, | |
| "loss": 0.3490485668182373, | |
| "memory(GiB)": 127.52, | |
| "step": 1025, | |
| "token_acc": 0.8754893595663521, | |
| "train_speed(iter/s)": 0.100766 | |
| }, | |
| { | |
| "epoch": 1.2060889929742389, | |
| "grad_norm": 0.25467485189437866, | |
| "learning_rate": 1.3962114996877685e-05, | |
| "loss": 0.3347738981246948, | |
| "memory(GiB)": 127.52, | |
| "step": 1030, | |
| "token_acc": 0.8824631079656678, | |
| "train_speed(iter/s)": 0.100756 | |
| }, | |
| { | |
| "epoch": 1.211943793911007, | |
| "grad_norm": 0.2674933671951294, | |
| "learning_rate": 1.390275452614808e-05, | |
| "loss": 0.338185977935791, | |
| "memory(GiB)": 127.52, | |
| "step": 1035, | |
| "token_acc": 0.8744415325096718, | |
| "train_speed(iter/s)": 0.100755 | |
| }, | |
| { | |
| "epoch": 1.2177985948477752, | |
| "grad_norm": 0.2707443833351135, | |
| "learning_rate": 1.3843231378752252e-05, | |
| "loss": 0.3448366165161133, | |
| "memory(GiB)": 127.52, | |
| "step": 1040, | |
| "token_acc": 0.8736029828057016, | |
| "train_speed(iter/s)": 0.100747 | |
| }, | |
| { | |
| "epoch": 1.2236533957845432, | |
| "grad_norm": 0.24581725895404816, | |
| "learning_rate": 1.3783548035765327e-05, | |
| "loss": 0.34962687492370603, | |
| "memory(GiB)": 127.52, | |
| "step": 1045, | |
| "token_acc": 0.8796080775037746, | |
| "train_speed(iter/s)": 0.100757 | |
| }, | |
| { | |
| "epoch": 1.2295081967213115, | |
| "grad_norm": 0.2379993051290512, | |
| "learning_rate": 1.3723706984939783e-05, | |
| "loss": 0.33640050888061523, | |
| "memory(GiB)": 127.52, | |
| "step": 1050, | |
| "token_acc": 0.8721236366123021, | |
| "train_speed(iter/s)": 0.100739 | |
| }, | |
| { | |
| "epoch": 1.2353629976580796, | |
| "grad_norm": 0.26605796813964844, | |
| "learning_rate": 1.366371072060177e-05, | |
| "loss": 0.3490384340286255, | |
| "memory(GiB)": 127.52, | |
| "step": 1055, | |
| "token_acc": 0.862867230488973, | |
| "train_speed(iter/s)": 0.100731 | |
| }, | |
| { | |
| "epoch": 1.2412177985948478, | |
| "grad_norm": 0.25522705912590027, | |
| "learning_rate": 1.3603561743547125e-05, | |
| "loss": 0.34296507835388185, | |
| "memory(GiB)": 127.52, | |
| "step": 1060, | |
| "token_acc": 0.8687898169167538, | |
| "train_speed(iter/s)": 0.100739 | |
| }, | |
| { | |
| "epoch": 1.2470725995316159, | |
| "grad_norm": 0.2729935348033905, | |
| "learning_rate": 1.3543262560937135e-05, | |
| "loss": 0.34846017360687254, | |
| "memory(GiB)": 127.52, | |
| "step": 1065, | |
| "token_acc": 0.8741769020279135, | |
| "train_speed(iter/s)": 0.100744 | |
| }, | |
| { | |
| "epoch": 1.2529274004683841, | |
| "grad_norm": 0.2534308433532715, | |
| "learning_rate": 1.3482815686194033e-05, | |
| "loss": 0.33311474323272705, | |
| "memory(GiB)": 127.52, | |
| "step": 1070, | |
| "token_acc": 0.8795399856245507, | |
| "train_speed(iter/s)": 0.100751 | |
| }, | |
| { | |
| "epoch": 1.2587822014051522, | |
| "grad_norm": 0.2755572497844696, | |
| "learning_rate": 1.3422223638896235e-05, | |
| "loss": 0.3432854413986206, | |
| "memory(GiB)": 127.52, | |
| "step": 1075, | |
| "token_acc": 0.8758250682788096, | |
| "train_speed(iter/s)": 0.100756 | |
| }, | |
| { | |
| "epoch": 1.2646370023419204, | |
| "grad_norm": 0.2861506938934326, | |
| "learning_rate": 1.3361488944673315e-05, | |
| "loss": 0.3542114496231079, | |
| "memory(GiB)": 127.52, | |
| "step": 1080, | |
| "token_acc": 0.8687981877806241, | |
| "train_speed(iter/s)": 0.100759 | |
| }, | |
| { | |
| "epoch": 1.2704918032786885, | |
| "grad_norm": 0.3049258589744568, | |
| "learning_rate": 1.3300614135100736e-05, | |
| "loss": 0.3503614664077759, | |
| "memory(GiB)": 127.52, | |
| "step": 1085, | |
| "token_acc": 0.875489517451949, | |
| "train_speed(iter/s)": 0.100754 | |
| }, | |
| { | |
| "epoch": 1.2763466042154565, | |
| "grad_norm": 0.25668370723724365, | |
| "learning_rate": 1.3239601747594319e-05, | |
| "loss": 0.3487658739089966, | |
| "memory(GiB)": 127.52, | |
| "step": 1090, | |
| "token_acc": 0.8770075135561131, | |
| "train_speed(iter/s)": 0.100751 | |
| }, | |
| { | |
| "epoch": 1.2822014051522248, | |
| "grad_norm": 0.2401314228773117, | |
| "learning_rate": 1.3178454325304472e-05, | |
| "loss": 0.3507190465927124, | |
| "memory(GiB)": 127.52, | |
| "step": 1095, | |
| "token_acc": 0.8644839657435769, | |
| "train_speed(iter/s)": 0.100741 | |
| }, | |
| { | |
| "epoch": 1.288056206088993, | |
| "grad_norm": 0.2501038908958435, | |
| "learning_rate": 1.3117174417010213e-05, | |
| "loss": 0.3356021404266357, | |
| "memory(GiB)": 127.52, | |
| "step": 1100, | |
| "token_acc": 0.8694938440492476, | |
| "train_speed(iter/s)": 0.100738 | |
| }, | |
| { | |
| "epoch": 1.2939110070257611, | |
| "grad_norm": 0.25629186630249023, | |
| "learning_rate": 1.3055764577012892e-05, | |
| "loss": 0.3668931007385254, | |
| "memory(GiB)": 127.52, | |
| "step": 1105, | |
| "token_acc": 0.8810234328372201, | |
| "train_speed(iter/s)": 0.100745 | |
| }, | |
| { | |
| "epoch": 1.2997658079625292, | |
| "grad_norm": 0.2689758539199829, | |
| "learning_rate": 1.2994227365029752e-05, | |
| "loss": 0.34679102897644043, | |
| "memory(GiB)": 127.52, | |
| "step": 1110, | |
| "token_acc": 0.8783292769097903, | |
| "train_speed(iter/s)": 0.100746 | |
| }, | |
| { | |
| "epoch": 1.3056206088992974, | |
| "grad_norm": 0.2619406282901764, | |
| "learning_rate": 1.2932565346087218e-05, | |
| "loss": 0.35414924621582033, | |
| "memory(GiB)": 127.52, | |
| "step": 1115, | |
| "token_acc": 0.8748901150285233, | |
| "train_speed(iter/s)": 0.100729 | |
| }, | |
| { | |
| "epoch": 1.3114754098360657, | |
| "grad_norm": 0.3210083842277527, | |
| "learning_rate": 1.2870781090413991e-05, | |
| "loss": 0.35202646255493164, | |
| "memory(GiB)": 127.52, | |
| "step": 1120, | |
| "token_acc": 0.8757856423662141, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 1.3173302107728337, | |
| "grad_norm": 0.27284613251686096, | |
| "learning_rate": 1.2808877173333896e-05, | |
| "loss": 0.3467656850814819, | |
| "memory(GiB)": 127.52, | |
| "step": 1125, | |
| "token_acc": 0.883265632074048, | |
| "train_speed(iter/s)": 0.100724 | |
| }, | |
| { | |
| "epoch": 1.3231850117096018, | |
| "grad_norm": 0.2710505425930023, | |
| "learning_rate": 1.2746856175158556e-05, | |
| "loss": 0.35611112117767335, | |
| "memory(GiB)": 127.52, | |
| "step": 1130, | |
| "token_acc": 0.8756308252586658, | |
| "train_speed(iter/s)": 0.100737 | |
| }, | |
| { | |
| "epoch": 1.32903981264637, | |
| "grad_norm": 0.26133865118026733, | |
| "learning_rate": 1.2684720681079825e-05, | |
| "loss": 0.3506006240844727, | |
| "memory(GiB)": 127.52, | |
| "step": 1135, | |
| "token_acc": 0.8604187872166245, | |
| "train_speed(iter/s)": 0.100742 | |
| }, | |
| { | |
| "epoch": 1.334894613583138, | |
| "grad_norm": 0.27019548416137695, | |
| "learning_rate": 1.2622473281062042e-05, | |
| "loss": 0.35390684604644773, | |
| "memory(GiB)": 127.52, | |
| "step": 1140, | |
| "token_acc": 0.8757172258949731, | |
| "train_speed(iter/s)": 0.100736 | |
| }, | |
| { | |
| "epoch": 1.3407494145199064, | |
| "grad_norm": 0.26330387592315674, | |
| "learning_rate": 1.256011656973406e-05, | |
| "loss": 0.36088995933532714, | |
| "memory(GiB)": 127.52, | |
| "step": 1145, | |
| "token_acc": 0.8777154145240186, | |
| "train_speed(iter/s)": 0.100733 | |
| }, | |
| { | |
| "epoch": 1.3466042154566744, | |
| "grad_norm": 0.24824829399585724, | |
| "learning_rate": 1.2497653146281113e-05, | |
| "loss": 0.3501885175704956, | |
| "memory(GiB)": 127.52, | |
| "step": 1150, | |
| "token_acc": 0.8752751123830188, | |
| "train_speed(iter/s)": 0.100712 | |
| }, | |
| { | |
| "epoch": 1.3524590163934427, | |
| "grad_norm": 0.2536720037460327, | |
| "learning_rate": 1.2435085614336459e-05, | |
| "loss": 0.3565546989440918, | |
| "memory(GiB)": 127.52, | |
| "step": 1155, | |
| "token_acc": 0.8831354083065811, | |
| "train_speed(iter/s)": 0.100705 | |
| }, | |
| { | |
| "epoch": 1.3583138173302107, | |
| "grad_norm": 0.24884596467018127, | |
| "learning_rate": 1.2372416581872857e-05, | |
| "loss": 0.34425859451293944, | |
| "memory(GiB)": 127.52, | |
| "step": 1160, | |
| "token_acc": 0.8804687524440259, | |
| "train_speed(iter/s)": 0.100705 | |
| }, | |
| { | |
| "epoch": 1.364168618266979, | |
| "grad_norm": 0.2567351162433624, | |
| "learning_rate": 1.2309648661093878e-05, | |
| "loss": 0.3500640630722046, | |
| "memory(GiB)": 127.52, | |
| "step": 1165, | |
| "token_acc": 0.8808626074837297, | |
| "train_speed(iter/s)": 0.100708 | |
| }, | |
| { | |
| "epoch": 1.370023419203747, | |
| "grad_norm": 0.27127236127853394, | |
| "learning_rate": 1.2246784468324993e-05, | |
| "loss": 0.35610170364379884, | |
| "memory(GiB)": 127.52, | |
| "step": 1170, | |
| "token_acc": 0.8642630631304163, | |
| "train_speed(iter/s)": 0.100707 | |
| }, | |
| { | |
| "epoch": 1.3758782201405153, | |
| "grad_norm": 0.25630801916122437, | |
| "learning_rate": 1.218382662390454e-05, | |
| "loss": 0.3440692901611328, | |
| "memory(GiB)": 127.52, | |
| "step": 1175, | |
| "token_acc": 0.863847903863763, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.3817330210772834, | |
| "grad_norm": 0.2579875886440277, | |
| "learning_rate": 1.2120777752074492e-05, | |
| "loss": 0.35255093574523927, | |
| "memory(GiB)": 127.52, | |
| "step": 1180, | |
| "token_acc": 0.8730105052212985, | |
| "train_speed(iter/s)": 0.100715 | |
| }, | |
| { | |
| "epoch": 1.3875878220140514, | |
| "grad_norm": 0.2638234496116638, | |
| "learning_rate": 1.2057640480871084e-05, | |
| "loss": 0.3546736717224121, | |
| "memory(GiB)": 127.52, | |
| "step": 1185, | |
| "token_acc": 0.8738721335992023, | |
| "train_speed(iter/s)": 0.100725 | |
| }, | |
| { | |
| "epoch": 1.3934426229508197, | |
| "grad_norm": 0.25871458649635315, | |
| "learning_rate": 1.1994417442015243e-05, | |
| "loss": 0.35408906936645507, | |
| "memory(GiB)": 127.52, | |
| "step": 1190, | |
| "token_acc": 0.8796952149117578, | |
| "train_speed(iter/s)": 0.100732 | |
| }, | |
| { | |
| "epoch": 1.399297423887588, | |
| "grad_norm": 0.2632989287376404, | |
| "learning_rate": 1.193111127080292e-05, | |
| "loss": 0.3432591676712036, | |
| "memory(GiB)": 127.52, | |
| "step": 1195, | |
| "token_acc": 0.8828218086199104, | |
| "train_speed(iter/s)": 0.10074 | |
| }, | |
| { | |
| "epoch": 1.405152224824356, | |
| "grad_norm": 0.24726183712482452, | |
| "learning_rate": 1.186772460599523e-05, | |
| "loss": 0.34243590831756593, | |
| "memory(GiB)": 127.52, | |
| "step": 1200, | |
| "token_acc": 0.8815012144480138, | |
| "train_speed(iter/s)": 0.100741 | |
| }, | |
| { | |
| "epoch": 1.411007025761124, | |
| "grad_norm": 0.3329097032546997, | |
| "learning_rate": 1.1804260089708464e-05, | |
| "loss": 0.3537503480911255, | |
| "memory(GiB)": 127.52, | |
| "step": 1205, | |
| "token_acc": 0.8658939159898351, | |
| "train_speed(iter/s)": 0.100735 | |
| }, | |
| { | |
| "epoch": 1.4168618266978923, | |
| "grad_norm": 0.25181666016578674, | |
| "learning_rate": 1.1740720367303958e-05, | |
| "loss": 0.347446870803833, | |
| "memory(GiB)": 127.52, | |
| "step": 1210, | |
| "token_acc": 0.8740943022953225, | |
| "train_speed(iter/s)": 0.10074 | |
| }, | |
| { | |
| "epoch": 1.4227166276346606, | |
| "grad_norm": 0.2532757818698883, | |
| "learning_rate": 1.1677108087277835e-05, | |
| "loss": 0.3539264678955078, | |
| "memory(GiB)": 127.52, | |
| "step": 1215, | |
| "token_acc": 0.8749382353125137, | |
| "train_speed(iter/s)": 0.100743 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.2551215887069702, | |
| "learning_rate": 1.1613425901150595e-05, | |
| "loss": 0.35313239097595217, | |
| "memory(GiB)": 127.52, | |
| "step": 1220, | |
| "token_acc": 0.8776082867215627, | |
| "train_speed(iter/s)": 0.100745 | |
| }, | |
| { | |
| "epoch": 1.4344262295081966, | |
| "grad_norm": 0.2713333070278168, | |
| "learning_rate": 1.15496764633566e-05, | |
| "loss": 0.3634988307952881, | |
| "memory(GiB)": 127.52, | |
| "step": 1225, | |
| "token_acc": 0.8660714848651069, | |
| "train_speed(iter/s)": 0.10073 | |
| }, | |
| { | |
| "epoch": 1.440281030444965, | |
| "grad_norm": 0.26022830605506897, | |
| "learning_rate": 1.1485862431133445e-05, | |
| "loss": 0.3524580478668213, | |
| "memory(GiB)": 127.52, | |
| "step": 1230, | |
| "token_acc": 0.8803166548004755, | |
| "train_speed(iter/s)": 0.100717 | |
| }, | |
| { | |
| "epoch": 1.446135831381733, | |
| "grad_norm": 0.25171470642089844, | |
| "learning_rate": 1.1421986464411169e-05, | |
| "loss": 0.3533075571060181, | |
| "memory(GiB)": 127.52, | |
| "step": 1235, | |
| "token_acc": 0.8648047662981438, | |
| "train_speed(iter/s)": 0.100703 | |
| }, | |
| { | |
| "epoch": 1.4519906323185012, | |
| "grad_norm": 0.2464302033185959, | |
| "learning_rate": 1.1358051225701404e-05, | |
| "loss": 0.3423281192779541, | |
| "memory(GiB)": 127.52, | |
| "step": 1240, | |
| "token_acc": 0.8691011183611862, | |
| "train_speed(iter/s)": 0.100701 | |
| }, | |
| { | |
| "epoch": 1.4578454332552693, | |
| "grad_norm": 0.25466638803482056, | |
| "learning_rate": 1.1294059379986384e-05, | |
| "loss": 0.35201549530029297, | |
| "memory(GiB)": 127.52, | |
| "step": 1245, | |
| "token_acc": 0.8681012341038652, | |
| "train_speed(iter/s)": 0.100689 | |
| }, | |
| { | |
| "epoch": 1.4637002341920375, | |
| "grad_norm": 0.2576982080936432, | |
| "learning_rate": 1.1230013594607874e-05, | |
| "loss": 0.3531355857849121, | |
| "memory(GiB)": 127.52, | |
| "step": 1250, | |
| "token_acc": 0.873457880243676, | |
| "train_speed(iter/s)": 0.100693 | |
| }, | |
| { | |
| "epoch": 1.4695550351288056, | |
| "grad_norm": 0.25660985708236694, | |
| "learning_rate": 1.1165916539155968e-05, | |
| "loss": 0.35094761848449707, | |
| "memory(GiB)": 127.52, | |
| "step": 1255, | |
| "token_acc": 0.8773934266901257, | |
| "train_speed(iter/s)": 0.100701 | |
| }, | |
| { | |
| "epoch": 1.4754098360655736, | |
| "grad_norm": 0.24054618179798126, | |
| "learning_rate": 1.1101770885357843e-05, | |
| "loss": 0.34633212089538573, | |
| "memory(GiB)": 127.52, | |
| "step": 1260, | |
| "token_acc": 0.8775079994840057, | |
| "train_speed(iter/s)": 0.100711 | |
| }, | |
| { | |
| "epoch": 1.481264637002342, | |
| "grad_norm": 0.2445182204246521, | |
| "learning_rate": 1.1037579306966365e-05, | |
| "loss": 0.34541456699371337, | |
| "memory(GiB)": 127.52, | |
| "step": 1265, | |
| "token_acc": 0.8862320037137543, | |
| "train_speed(iter/s)": 0.100709 | |
| }, | |
| { | |
| "epoch": 1.4871194379391102, | |
| "grad_norm": 0.2729050815105438, | |
| "learning_rate": 1.0973344479648652e-05, | |
| "loss": 0.3409654855728149, | |
| "memory(GiB)": 127.52, | |
| "step": 1270, | |
| "token_acc": 0.8771963474914158, | |
| "train_speed(iter/s)": 0.100709 | |
| }, | |
| { | |
| "epoch": 1.4929742388758782, | |
| "grad_norm": 0.24874503910541534, | |
| "learning_rate": 1.0909069080874556e-05, | |
| "loss": 0.3430008411407471, | |
| "memory(GiB)": 127.52, | |
| "step": 1275, | |
| "token_acc": 0.8704117168634027, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.4988290398126463, | |
| "grad_norm": 0.2715279459953308, | |
| "learning_rate": 1.0844755789805042e-05, | |
| "loss": 0.35068159103393554, | |
| "memory(GiB)": 127.52, | |
| "step": 1280, | |
| "token_acc": 0.8675264981305526, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.5046838407494145, | |
| "grad_norm": 0.23995369672775269, | |
| "learning_rate": 1.0780407287180526e-05, | |
| "loss": 0.35523912906646726, | |
| "memory(GiB)": 127.52, | |
| "step": 1285, | |
| "token_acc": 0.8685361997709505, | |
| "train_speed(iter/s)": 0.100706 | |
| }, | |
| { | |
| "epoch": 1.5105386416861828, | |
| "grad_norm": 0.26195716857910156, | |
| "learning_rate": 1.0716026255209124e-05, | |
| "loss": 0.349694561958313, | |
| "memory(GiB)": 127.52, | |
| "step": 1290, | |
| "token_acc": 0.8676919971870162, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.5163934426229508, | |
| "grad_norm": 0.24379870295524597, | |
| "learning_rate": 1.0651615377454872e-05, | |
| "loss": 0.3513511657714844, | |
| "memory(GiB)": 127.52, | |
| "step": 1295, | |
| "token_acc": 0.8762717457922776, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.5222482435597189, | |
| "grad_norm": 0.2554638683795929, | |
| "learning_rate": 1.0587177338725834e-05, | |
| "loss": 0.3569997787475586, | |
| "memory(GiB)": 127.52, | |
| "step": 1300, | |
| "token_acc": 0.8766220533416101, | |
| "train_speed(iter/s)": 0.100716 | |
| }, | |
| { | |
| "epoch": 1.5281030444964872, | |
| "grad_norm": 0.2548043727874756, | |
| "learning_rate": 1.0522714824962228e-05, | |
| "loss": 0.3422648191452026, | |
| "memory(GiB)": 127.52, | |
| "step": 1305, | |
| "token_acc": 0.8870603034829783, | |
| "train_speed(iter/s)": 0.100709 | |
| }, | |
| { | |
| "epoch": 1.5339578454332554, | |
| "grad_norm": 0.24967636168003082, | |
| "learning_rate": 1.0458230523124443e-05, | |
| "loss": 0.3560429573059082, | |
| "memory(GiB)": 127.52, | |
| "step": 1310, | |
| "token_acc": 0.8787232780765522, | |
| "train_speed(iter/s)": 0.100701 | |
| }, | |
| { | |
| "epoch": 1.5398126463700235, | |
| "grad_norm": 0.2598780691623688, | |
| "learning_rate": 1.0393727121081057e-05, | |
| "loss": 0.3518627166748047, | |
| "memory(GiB)": 127.52, | |
| "step": 1315, | |
| "token_acc": 0.8750810752945474, | |
| "train_speed(iter/s)": 0.10071 | |
| }, | |
| { | |
| "epoch": 1.5456674473067915, | |
| "grad_norm": 0.23825575411319733, | |
| "learning_rate": 1.0329207307496785e-05, | |
| "loss": 0.3401672840118408, | |
| "memory(GiB)": 127.52, | |
| "step": 1320, | |
| "token_acc": 0.8770460187011242, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.5515222482435598, | |
| "grad_norm": 0.2550235986709595, | |
| "learning_rate": 1.0264673771720429e-05, | |
| "loss": 0.350058913230896, | |
| "memory(GiB)": 127.52, | |
| "step": 1325, | |
| "token_acc": 0.881267240867612, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.5573770491803278, | |
| "grad_norm": 0.269613653421402, | |
| "learning_rate": 1.0200129203672754e-05, | |
| "loss": 0.3502191543579102, | |
| "memory(GiB)": 127.52, | |
| "step": 1330, | |
| "token_acc": 0.8661129276756743, | |
| "train_speed(iter/s)": 0.10071 | |
| }, | |
| { | |
| "epoch": 1.5632318501170959, | |
| "grad_norm": 0.24150115251541138, | |
| "learning_rate": 1.0135576293734381e-05, | |
| "loss": 0.34059958457946776, | |
| "memory(GiB)": 127.52, | |
| "step": 1335, | |
| "token_acc": 0.8847581210563216, | |
| "train_speed(iter/s)": 0.100716 | |
| }, | |
| { | |
| "epoch": 1.5690866510538641, | |
| "grad_norm": 0.2703973650932312, | |
| "learning_rate": 1.007101773263365e-05, | |
| "loss": 0.35358033180236814, | |
| "memory(GiB)": 127.52, | |
| "step": 1340, | |
| "token_acc": 0.8770655404348506, | |
| "train_speed(iter/s)": 0.100721 | |
| }, | |
| { | |
| "epoch": 1.5749414519906324, | |
| "grad_norm": 0.23871327936649323, | |
| "learning_rate": 1.0006456211334445e-05, | |
| "loss": 0.3467454671859741, | |
| "memory(GiB)": 127.52, | |
| "step": 1345, | |
| "token_acc": 0.8759395313396612, | |
| "train_speed(iter/s)": 0.100731 | |
| }, | |
| { | |
| "epoch": 1.5807962529274004, | |
| "grad_norm": 0.25692564249038696, | |
| "learning_rate": 9.941894420924044e-06, | |
| "loss": 0.3450988054275513, | |
| "memory(GiB)": 127.52, | |
| "step": 1350, | |
| "token_acc": 0.8868195745646664, | |
| "train_speed(iter/s)": 0.100727 | |
| }, | |
| { | |
| "epoch": 1.5866510538641685, | |
| "grad_norm": 0.2428205907344818, | |
| "learning_rate": 9.87733505250094e-06, | |
| "loss": 0.3494907855987549, | |
| "memory(GiB)": 127.52, | |
| "step": 1355, | |
| "token_acc": 0.8756121235576668, | |
| "train_speed(iter/s)": 0.100725 | |
| }, | |
| { | |
| "epoch": 1.5925058548009368, | |
| "grad_norm": 0.24155238270759583, | |
| "learning_rate": 9.812780797062678e-06, | |
| "loss": 0.3456254005432129, | |
| "memory(GiB)": 127.52, | |
| "step": 1360, | |
| "token_acc": 0.8809245943605768, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 1.598360655737705, | |
| "grad_norm": 0.464139848947525, | |
| "learning_rate": 9.748234345393672e-06, | |
| "loss": 0.34203310012817384, | |
| "memory(GiB)": 127.52, | |
| "step": 1365, | |
| "token_acc": 0.8774237555421359, | |
| "train_speed(iter/s)": 0.100715 | |
| }, | |
| { | |
| "epoch": 1.604215456674473, | |
| "grad_norm": 0.2672084867954254, | |
| "learning_rate": 9.68369838795306e-06, | |
| "loss": 0.350542688369751, | |
| "memory(GiB)": 127.52, | |
| "step": 1370, | |
| "token_acc": 0.8734205080790737, | |
| "train_speed(iter/s)": 0.100705 | |
| }, | |
| { | |
| "epoch": 1.6100702576112411, | |
| "grad_norm": 0.2600000500679016, | |
| "learning_rate": 9.61917561476255e-06, | |
| "loss": 0.3421807050704956, | |
| "memory(GiB)": 127.52, | |
| "step": 1375, | |
| "token_acc": 0.8668853013058622, | |
| "train_speed(iter/s)": 0.100709 | |
| }, | |
| { | |
| "epoch": 1.6159250585480094, | |
| "grad_norm": 0.2540619373321533, | |
| "learning_rate": 9.554668715294305e-06, | |
| "loss": 0.3543410778045654, | |
| "memory(GiB)": 127.52, | |
| "step": 1380, | |
| "token_acc": 0.8761743728864414, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 1.6217798594847777, | |
| "grad_norm": 0.2585217356681824, | |
| "learning_rate": 9.490180378358826e-06, | |
| "loss": 0.35744295120239256, | |
| "memory(GiB)": 127.52, | |
| "step": 1385, | |
| "token_acc": 0.8715506016593595, | |
| "train_speed(iter/s)": 0.100718 | |
| }, | |
| { | |
| "epoch": 1.6276346604215457, | |
| "grad_norm": 0.26017606258392334, | |
| "learning_rate": 9.425713291992878e-06, | |
| "loss": 0.34558424949645994, | |
| "memory(GiB)": 127.52, | |
| "step": 1390, | |
| "token_acc": 0.8794015410099387, | |
| "train_speed(iter/s)": 0.100719 | |
| }, | |
| { | |
| "epoch": 1.6334894613583137, | |
| "grad_norm": 0.25051021575927734, | |
| "learning_rate": 9.361270143347452e-06, | |
| "loss": 0.35907368659973143, | |
| "memory(GiB)": 127.52, | |
| "step": 1395, | |
| "token_acc": 0.8715231746371632, | |
| "train_speed(iter/s)": 0.100723 | |
| }, | |
| { | |
| "epoch": 1.639344262295082, | |
| "grad_norm": 0.24877934157848358, | |
| "learning_rate": 9.296853618575753e-06, | |
| "loss": 0.34605088233947756, | |
| "memory(GiB)": 127.52, | |
| "step": 1400, | |
| "token_acc": 0.8828522126980963, | |
| "train_speed(iter/s)": 0.100731 | |
| }, | |
| { | |
| "epoch": 1.6451990632318503, | |
| "grad_norm": 0.23893095552921295, | |
| "learning_rate": 9.232466402721241e-06, | |
| "loss": 0.3570685625076294, | |
| "memory(GiB)": 127.52, | |
| "step": 1405, | |
| "token_acc": 0.8760022299616647, | |
| "train_speed(iter/s)": 0.10074 | |
| }, | |
| { | |
| "epoch": 1.651053864168618, | |
| "grad_norm": 0.24638938903808594, | |
| "learning_rate": 9.1681111796057e-06, | |
| "loss": 0.3466794967651367, | |
| "memory(GiB)": 127.52, | |
| "step": 1410, | |
| "token_acc": 0.8773031091974165, | |
| "train_speed(iter/s)": 0.100749 | |
| }, | |
| { | |
| "epoch": 1.6569086651053864, | |
| "grad_norm": 0.256526380777359, | |
| "learning_rate": 9.103790631717375e-06, | |
| "loss": 0.3623323917388916, | |
| "memory(GiB)": 127.52, | |
| "step": 1415, | |
| "token_acc": 0.8679865616745452, | |
| "train_speed(iter/s)": 0.100748 | |
| }, | |
| { | |
| "epoch": 1.6627634660421546, | |
| "grad_norm": 0.25238198041915894, | |
| "learning_rate": 9.039507440099164e-06, | |
| "loss": 0.3467939138412476, | |
| "memory(GiB)": 127.52, | |
| "step": 1420, | |
| "token_acc": 0.8828419526341228, | |
| "train_speed(iter/s)": 0.100746 | |
| }, | |
| { | |
| "epoch": 1.6686182669789227, | |
| "grad_norm": 0.23841890692710876, | |
| "learning_rate": 8.975264284236866e-06, | |
| "loss": 0.34966843128204345, | |
| "memory(GiB)": 127.52, | |
| "step": 1425, | |
| "token_acc": 0.8775815971188294, | |
| "train_speed(iter/s)": 0.100755 | |
| }, | |
| { | |
| "epoch": 1.6744730679156907, | |
| "grad_norm": 0.26001548767089844, | |
| "learning_rate": 8.911063841947476e-06, | |
| "loss": 0.35109724998474123, | |
| "memory(GiB)": 127.52, | |
| "step": 1430, | |
| "token_acc": 0.8745225380796411, | |
| "train_speed(iter/s)": 0.100754 | |
| }, | |
| { | |
| "epoch": 1.680327868852459, | |
| "grad_norm": 0.2468952238559723, | |
| "learning_rate": 8.846908789267589e-06, | |
| "loss": 0.35158143043518064, | |
| "memory(GiB)": 127.52, | |
| "step": 1435, | |
| "token_acc": 0.8772585276576946, | |
| "train_speed(iter/s)": 0.100766 | |
| }, | |
| { | |
| "epoch": 1.6861826697892273, | |
| "grad_norm": 0.24095061421394348, | |
| "learning_rate": 8.78280180034184e-06, | |
| "loss": 0.3411277770996094, | |
| "memory(GiB)": 127.52, | |
| "step": 1440, | |
| "token_acc": 0.8712463039204312, | |
| "train_speed(iter/s)": 0.10077 | |
| }, | |
| { | |
| "epoch": 1.6920374707259953, | |
| "grad_norm": 0.25439053773880005, | |
| "learning_rate": 8.718745547311458e-06, | |
| "loss": 0.3543074131011963, | |
| "memory(GiB)": 127.52, | |
| "step": 1445, | |
| "token_acc": 0.871980767417743, | |
| "train_speed(iter/s)": 0.100763 | |
| }, | |
| { | |
| "epoch": 1.6978922716627634, | |
| "grad_norm": 1.5297069549560547, | |
| "learning_rate": 8.654742700202849e-06, | |
| "loss": 0.3533529043197632, | |
| "memory(GiB)": 127.52, | |
| "step": 1450, | |
| "token_acc": 0.8742467882207196, | |
| "train_speed(iter/s)": 0.100766 | |
| }, | |
| { | |
| "epoch": 1.7037470725995316, | |
| "grad_norm": 0.25103631615638733, | |
| "learning_rate": 8.590795926816348e-06, | |
| "loss": 0.3418538570404053, | |
| "memory(GiB)": 127.52, | |
| "step": 1455, | |
| "token_acc": 0.8745452901882429, | |
| "train_speed(iter/s)": 0.100768 | |
| }, | |
| { | |
| "epoch": 1.7096018735362999, | |
| "grad_norm": 0.3538268208503723, | |
| "learning_rate": 8.526907892614986e-06, | |
| "loss": 0.34701027870178225, | |
| "memory(GiB)": 127.52, | |
| "step": 1460, | |
| "token_acc": 0.8781468525993731, | |
| "train_speed(iter/s)": 0.100762 | |
| }, | |
| { | |
| "epoch": 1.715456674473068, | |
| "grad_norm": 0.2575690448284149, | |
| "learning_rate": 8.463081260613391e-06, | |
| "loss": 0.3492567539215088, | |
| "memory(GiB)": 127.52, | |
| "step": 1465, | |
| "token_acc": 0.8833869870635476, | |
| "train_speed(iter/s)": 0.10076 | |
| }, | |
| { | |
| "epoch": 1.721311475409836, | |
| "grad_norm": 0.25249573588371277, | |
| "learning_rate": 8.399318691266806e-06, | |
| "loss": 0.35265603065490725, | |
| "memory(GiB)": 127.52, | |
| "step": 1470, | |
| "token_acc": 0.8733317460118548, | |
| "train_speed(iter/s)": 0.10076 | |
| }, | |
| { | |
| "epoch": 1.7271662763466042, | |
| "grad_norm": 0.26620882749557495, | |
| "learning_rate": 8.335622842360168e-06, | |
| "loss": 0.3444960594177246, | |
| "memory(GiB)": 127.52, | |
| "step": 1475, | |
| "token_acc": 0.8786412367096045, | |
| "train_speed(iter/s)": 0.100754 | |
| }, | |
| { | |
| "epoch": 1.7330210772833725, | |
| "grad_norm": 0.25925421714782715, | |
| "learning_rate": 8.271996368897345e-06, | |
| "loss": 0.35317885875701904, | |
| "memory(GiB)": 127.52, | |
| "step": 1480, | |
| "token_acc": 0.8806173955625871, | |
| "train_speed(iter/s)": 0.10074 | |
| }, | |
| { | |
| "epoch": 1.7388758782201406, | |
| "grad_norm": 0.24599948525428772, | |
| "learning_rate": 8.208441922990454e-06, | |
| "loss": 0.34299373626708984, | |
| "memory(GiB)": 127.52, | |
| "step": 1485, | |
| "token_acc": 0.8748146671484283, | |
| "train_speed(iter/s)": 0.100736 | |
| }, | |
| { | |
| "epoch": 1.7447306791569086, | |
| "grad_norm": 0.2374086081981659, | |
| "learning_rate": 8.144962153749331e-06, | |
| "loss": 0.3454796314239502, | |
| "memory(GiB)": 127.52, | |
| "step": 1490, | |
| "token_acc": 0.8697578355578018, | |
| "train_speed(iter/s)": 0.100735 | |
| }, | |
| { | |
| "epoch": 1.7505854800936769, | |
| "grad_norm": 0.2567986845970154, | |
| "learning_rate": 8.081559707171094e-06, | |
| "loss": 0.35629470348358155, | |
| "memory(GiB)": 127.52, | |
| "step": 1495, | |
| "token_acc": 0.8722708482627621, | |
| "train_speed(iter/s)": 0.100742 | |
| }, | |
| { | |
| "epoch": 1.756440281030445, | |
| "grad_norm": 0.2612420320510864, | |
| "learning_rate": 8.01823722602986e-06, | |
| "loss": 0.34243695735931395, | |
| "memory(GiB)": 127.52, | |
| "step": 1500, | |
| "token_acc": 0.8835913661147516, | |
| "train_speed(iter/s)": 0.100749 | |
| }, | |
| { | |
| "epoch": 1.762295081967213, | |
| "grad_norm": 0.25001969933509827, | |
| "learning_rate": 7.954997349766576e-06, | |
| "loss": 0.3504654407501221, | |
| "memory(GiB)": 127.52, | |
| "step": 1505, | |
| "token_acc": 0.8767294491512118, | |
| "train_speed(iter/s)": 0.100752 | |
| }, | |
| { | |
| "epoch": 1.7681498829039812, | |
| "grad_norm": 0.24179641902446747, | |
| "learning_rate": 7.891842714379027e-06, | |
| "loss": 0.3378228425979614, | |
| "memory(GiB)": 127.52, | |
| "step": 1510, | |
| "token_acc": 0.8821447808495446, | |
| "train_speed(iter/s)": 0.10075 | |
| }, | |
| { | |
| "epoch": 1.7740046838407495, | |
| "grad_norm": 0.2632296085357666, | |
| "learning_rate": 7.828775952311921e-06, | |
| "loss": 0.34106738567352296, | |
| "memory(GiB)": 127.52, | |
| "step": 1515, | |
| "token_acc": 0.872465283102722, | |
| "train_speed(iter/s)": 0.100743 | |
| }, | |
| { | |
| "epoch": 1.7798594847775175, | |
| "grad_norm": 0.2476883977651596, | |
| "learning_rate": 7.765799692347201e-06, | |
| "loss": 0.34442992210388185, | |
| "memory(GiB)": 127.52, | |
| "step": 1520, | |
| "token_acc": 0.8729373501693029, | |
| "train_speed(iter/s)": 0.100743 | |
| }, | |
| { | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 0.2630121111869812, | |
| "learning_rate": 7.702916559494444e-06, | |
| "loss": 0.3511634588241577, | |
| "memory(GiB)": 127.52, | |
| "step": 1525, | |
| "token_acc": 0.8770366431554089, | |
| "train_speed(iter/s)": 0.100748 | |
| }, | |
| { | |
| "epoch": 1.7915690866510539, | |
| "grad_norm": 0.24981631338596344, | |
| "learning_rate": 7.64012917488146e-06, | |
| "loss": 0.33224847316741946, | |
| "memory(GiB)": 127.52, | |
| "step": 1530, | |
| "token_acc": 0.8896432981333869, | |
| "train_speed(iter/s)": 0.100745 | |
| }, | |
| { | |
| "epoch": 1.7974238875878221, | |
| "grad_norm": 0.25589603185653687, | |
| "learning_rate": 7.577440155645028e-06, | |
| "loss": 0.3430049896240234, | |
| "memory(GiB)": 127.52, | |
| "step": 1535, | |
| "token_acc": 0.8784328165618647, | |
| "train_speed(iter/s)": 0.100733 | |
| }, | |
| { | |
| "epoch": 1.8032786885245902, | |
| "grad_norm": 0.24135656654834747, | |
| "learning_rate": 7.514852114821811e-06, | |
| "loss": 0.35404491424560547, | |
| "memory(GiB)": 127.52, | |
| "step": 1540, | |
| "token_acc": 0.8758359005184462, | |
| "train_speed(iter/s)": 0.100732 | |
| }, | |
| { | |
| "epoch": 1.8091334894613582, | |
| "grad_norm": 0.3086133599281311, | |
| "learning_rate": 7.452367661239433e-06, | |
| "loss": 0.3292539596557617, | |
| "memory(GiB)": 127.52, | |
| "step": 1545, | |
| "token_acc": 0.8771563599039064, | |
| "train_speed(iter/s)": 0.10073 | |
| }, | |
| { | |
| "epoch": 1.8149882903981265, | |
| "grad_norm": 0.26186585426330566, | |
| "learning_rate": 7.389989399407741e-06, | |
| "loss": 0.3564730644226074, | |
| "memory(GiB)": 127.52, | |
| "step": 1550, | |
| "token_acc": 0.8767442953125245, | |
| "train_speed(iter/s)": 0.100726 | |
| }, | |
| { | |
| "epoch": 1.8208430913348947, | |
| "grad_norm": 0.2449086308479309, | |
| "learning_rate": 7.3277199294102485e-06, | |
| "loss": 0.3377220630645752, | |
| "memory(GiB)": 127.52, | |
| "step": 1555, | |
| "token_acc": 0.8731188520277088, | |
| "train_speed(iter/s)": 0.100729 | |
| }, | |
| { | |
| "epoch": 1.8266978922716628, | |
| "grad_norm": 0.2617018222808838, | |
| "learning_rate": 7.265561846795741e-06, | |
| "loss": 0.35269980430603026, | |
| "memory(GiB)": 127.52, | |
| "step": 1560, | |
| "token_acc": 0.8755254872982656, | |
| "train_speed(iter/s)": 0.100718 | |
| }, | |
| { | |
| "epoch": 1.8325526932084308, | |
| "grad_norm": 0.2533339262008667, | |
| "learning_rate": 7.203517742470101e-06, | |
| "loss": 0.3477527856826782, | |
| "memory(GiB)": 127.52, | |
| "step": 1565, | |
| "token_acc": 0.8841913617578873, | |
| "train_speed(iter/s)": 0.100718 | |
| }, | |
| { | |
| "epoch": 1.838407494145199, | |
| "grad_norm": 0.24031810462474823, | |
| "learning_rate": 7.141590202588312e-06, | |
| "loss": 0.35293850898742674, | |
| "memory(GiB)": 127.52, | |
| "step": 1570, | |
| "token_acc": 0.8790135675181339, | |
| "train_speed(iter/s)": 0.100724 | |
| }, | |
| { | |
| "epoch": 1.8442622950819674, | |
| "grad_norm": 0.2540515661239624, | |
| "learning_rate": 7.079781808446648e-06, | |
| "loss": 0.35478663444519043, | |
| "memory(GiB)": 127.52, | |
| "step": 1575, | |
| "token_acc": 0.8638225043564849, | |
| "train_speed(iter/s)": 0.100721 | |
| }, | |
| { | |
| "epoch": 1.8501170960187352, | |
| "grad_norm": 0.24163876473903656, | |
| "learning_rate": 7.018095136375089e-06, | |
| "loss": 0.33953070640563965, | |
| "memory(GiB)": 127.52, | |
| "step": 1580, | |
| "token_acc": 0.8760248415939393, | |
| "train_speed(iter/s)": 0.100721 | |
| }, | |
| { | |
| "epoch": 1.8559718969555035, | |
| "grad_norm": 0.24985362589359283, | |
| "learning_rate": 6.956532757629945e-06, | |
| "loss": 0.34739911556243896, | |
| "memory(GiB)": 127.52, | |
| "step": 1585, | |
| "token_acc": 0.8751094324520373, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 1.8618266978922717, | |
| "grad_norm": 0.24738718569278717, | |
| "learning_rate": 6.89509723828665e-06, | |
| "loss": 0.35140252113342285, | |
| "memory(GiB)": 127.52, | |
| "step": 1590, | |
| "token_acc": 0.8747874666018945, | |
| "train_speed(iter/s)": 0.100721 | |
| }, | |
| { | |
| "epoch": 1.8676814988290398, | |
| "grad_norm": 0.2528833746910095, | |
| "learning_rate": 6.833791139132824e-06, | |
| "loss": 0.3366274356842041, | |
| "memory(GiB)": 127.52, | |
| "step": 1595, | |
| "token_acc": 0.877359708131215, | |
| "train_speed(iter/s)": 0.100705 | |
| }, | |
| { | |
| "epoch": 1.8735362997658078, | |
| "grad_norm": 0.22930973768234253, | |
| "learning_rate": 6.772617015561529e-06, | |
| "loss": 0.34548795223236084, | |
| "memory(GiB)": 127.52, | |
| "step": 1600, | |
| "token_acc": 0.8674766998186026, | |
| "train_speed(iter/s)": 0.100705 | |
| }, | |
| { | |
| "epoch": 1.879391100702576, | |
| "grad_norm": 0.23658259212970734, | |
| "learning_rate": 6.7115774174647475e-06, | |
| "loss": 0.3390948295593262, | |
| "memory(GiB)": 127.52, | |
| "step": 1605, | |
| "token_acc": 0.883574050014699, | |
| "train_speed(iter/s)": 0.100706 | |
| }, | |
| { | |
| "epoch": 1.8852459016393444, | |
| "grad_norm": 0.25393053889274597, | |
| "learning_rate": 6.6506748891271045e-06, | |
| "loss": 0.3500185012817383, | |
| "memory(GiB)": 127.52, | |
| "step": 1610, | |
| "token_acc": 0.8819961495087196, | |
| "train_speed(iter/s)": 0.100708 | |
| }, | |
| { | |
| "epoch": 1.8911007025761124, | |
| "grad_norm": 0.23870056867599487, | |
| "learning_rate": 6.5899119691198025e-06, | |
| "loss": 0.343201732635498, | |
| "memory(GiB)": 127.52, | |
| "step": 1615, | |
| "token_acc": 0.8769540112004077, | |
| "train_speed(iter/s)": 0.100712 | |
| }, | |
| { | |
| "epoch": 1.8969555035128804, | |
| "grad_norm": 0.23795676231384277, | |
| "learning_rate": 6.529291190194829e-06, | |
| "loss": 0.3476824998855591, | |
| "memory(GiB)": 127.52, | |
| "step": 1620, | |
| "token_acc": 0.8771016372387611, | |
| "train_speed(iter/s)": 0.100717 | |
| }, | |
| { | |
| "epoch": 1.9028103044496487, | |
| "grad_norm": 0.23620595037937164, | |
| "learning_rate": 6.468815079179364e-06, | |
| "loss": 0.3438570022583008, | |
| "memory(GiB)": 127.52, | |
| "step": 1625, | |
| "token_acc": 0.8808678958099098, | |
| "train_speed(iter/s)": 0.100717 | |
| }, | |
| { | |
| "epoch": 1.908665105386417, | |
| "grad_norm": 0.27084144949913025, | |
| "learning_rate": 6.408486156870466e-06, | |
| "loss": 0.3575857162475586, | |
| "memory(GiB)": 127.52, | |
| "step": 1630, | |
| "token_acc": 0.8567800504203767, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 1.914519906323185, | |
| "grad_norm": 0.24774354696273804, | |
| "learning_rate": 6.348306937929991e-06, | |
| "loss": 0.3539011001586914, | |
| "memory(GiB)": 127.52, | |
| "step": 1635, | |
| "token_acc": 0.8722537158121981, | |
| "train_speed(iter/s)": 0.100726 | |
| }, | |
| { | |
| "epoch": 1.920374707259953, | |
| "grad_norm": 0.23919358849525452, | |
| "learning_rate": 6.288279930779789e-06, | |
| "loss": 0.33454456329345705, | |
| "memory(GiB)": 127.52, | |
| "step": 1640, | |
| "token_acc": 0.8859452149573859, | |
| "train_speed(iter/s)": 0.100729 | |
| }, | |
| { | |
| "epoch": 1.9262295081967213, | |
| "grad_norm": 0.2600441575050354, | |
| "learning_rate": 6.228407637497131e-06, | |
| "loss": 0.34556894302368163, | |
| "memory(GiB)": 127.52, | |
| "step": 1645, | |
| "token_acc": 0.8641004272904045, | |
| "train_speed(iter/s)": 0.100727 | |
| }, | |
| { | |
| "epoch": 1.9320843091334896, | |
| "grad_norm": 0.2533404231071472, | |
| "learning_rate": 6.1686925537104306e-06, | |
| "loss": 0.3354111433029175, | |
| "memory(GiB)": 127.52, | |
| "step": 1650, | |
| "token_acc": 0.8690573840794189, | |
| "train_speed(iter/s)": 0.100726 | |
| }, | |
| { | |
| "epoch": 1.9379391100702577, | |
| "grad_norm": 0.24305778741836548, | |
| "learning_rate": 6.109137168495205e-06, | |
| "loss": 0.342392110824585, | |
| "memory(GiB)": 127.52, | |
| "step": 1655, | |
| "token_acc": 0.8907634917938944, | |
| "train_speed(iter/s)": 0.100732 | |
| }, | |
| { | |
| "epoch": 1.9437939110070257, | |
| "grad_norm": 0.23065665364265442, | |
| "learning_rate": 6.049743964270336e-06, | |
| "loss": 0.35349397659301757, | |
| "memory(GiB)": 127.52, | |
| "step": 1660, | |
| "token_acc": 0.8749648996911172, | |
| "train_speed(iter/s)": 0.100731 | |
| }, | |
| { | |
| "epoch": 1.949648711943794, | |
| "grad_norm": 0.26187312602996826, | |
| "learning_rate": 5.990515416694591e-06, | |
| "loss": 0.3514526844024658, | |
| "memory(GiB)": 127.52, | |
| "step": 1665, | |
| "token_acc": 0.8773919272455463, | |
| "train_speed(iter/s)": 0.100729 | |
| }, | |
| { | |
| "epoch": 1.955503512880562, | |
| "grad_norm": 0.2436314970254898, | |
| "learning_rate": 5.931453994563434e-06, | |
| "loss": 0.34615340232849123, | |
| "memory(GiB)": 127.52, | |
| "step": 1670, | |
| "token_acc": 0.8825784399814935, | |
| "train_speed(iter/s)": 0.100722 | |
| }, | |
| { | |
| "epoch": 1.96135831381733, | |
| "grad_norm": 1.0637788772583008, | |
| "learning_rate": 5.872562159706116e-06, | |
| "loss": 0.34925112724304197, | |
| "memory(GiB)": 127.52, | |
| "step": 1675, | |
| "token_acc": 0.8725762818496382, | |
| "train_speed(iter/s)": 0.100718 | |
| }, | |
| { | |
| "epoch": 1.9672131147540983, | |
| "grad_norm": 0.2608899176120758, | |
| "learning_rate": 5.8138423668830605e-06, | |
| "loss": 0.34130330085754396, | |
| "memory(GiB)": 127.52, | |
| "step": 1680, | |
| "token_acc": 0.876563876375788, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 1.9730679156908666, | |
| "grad_norm": 0.24455122649669647, | |
| "learning_rate": 5.755297063683551e-06, | |
| "loss": 0.3456611633300781, | |
| "memory(GiB)": 127.52, | |
| "step": 1685, | |
| "token_acc": 0.8803155448934612, | |
| "train_speed(iter/s)": 0.100717 | |
| }, | |
| { | |
| "epoch": 1.9789227166276346, | |
| "grad_norm": 0.23744545876979828, | |
| "learning_rate": 5.696928690423693e-06, | |
| "loss": 0.3404732942581177, | |
| "memory(GiB)": 127.52, | |
| "step": 1690, | |
| "token_acc": 0.873919857146425, | |
| "train_speed(iter/s)": 0.100721 | |
| }, | |
| { | |
| "epoch": 1.9847775175644027, | |
| "grad_norm": 0.2499692440032959, | |
| "learning_rate": 5.638739680044718e-06, | |
| "loss": 0.3554127931594849, | |
| "memory(GiB)": 127.52, | |
| "step": 1695, | |
| "token_acc": 0.8678405344492528, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 1.990632318501171, | |
| "grad_norm": 0.23933644592761993, | |
| "learning_rate": 5.580732458011544e-06, | |
| "loss": 0.34451732635498045, | |
| "memory(GiB)": 127.52, | |
| "step": 1700, | |
| "token_acc": 0.8813060735041081, | |
| "train_speed(iter/s)": 0.100721 | |
| }, | |
| { | |
| "epoch": 1.9964871194379392, | |
| "grad_norm": 0.2454347014427185, | |
| "learning_rate": 5.522909442211708e-06, | |
| "loss": 0.3448106527328491, | |
| "memory(GiB)": 127.52, | |
| "step": 1705, | |
| "token_acc": 0.8718723798596708, | |
| "train_speed(iter/s)": 0.100717 | |
| }, | |
| { | |
| "epoch": 2.002341920374707, | |
| "grad_norm": 0.30603164434432983, | |
| "learning_rate": 5.465273042854551e-06, | |
| "loss": 0.3320322036743164, | |
| "memory(GiB)": 127.52, | |
| "step": 1710, | |
| "token_acc": 0.8845191075650899, | |
| "train_speed(iter/s)": 0.10054 | |
| }, | |
| { | |
| "epoch": 2.0081967213114753, | |
| "grad_norm": 0.26624929904937744, | |
| "learning_rate": 5.407825662370778e-06, | |
| "loss": 0.3192149639129639, | |
| "memory(GiB)": 127.52, | |
| "step": 1715, | |
| "token_acc": 0.8862581577460744, | |
| "train_speed(iter/s)": 0.100533 | |
| }, | |
| { | |
| "epoch": 2.0140515222482436, | |
| "grad_norm": 0.28559088706970215, | |
| "learning_rate": 5.350569695312313e-06, | |
| "loss": 0.3315494775772095, | |
| "memory(GiB)": 127.52, | |
| "step": 1720, | |
| "token_acc": 0.8817901407312053, | |
| "train_speed(iter/s)": 0.100527 | |
| }, | |
| { | |
| "epoch": 2.019906323185012, | |
| "grad_norm": 0.24132603406906128, | |
| "learning_rate": 5.293507528252474e-06, | |
| "loss": 0.3354511737823486, | |
| "memory(GiB)": 127.52, | |
| "step": 1725, | |
| "token_acc": 0.8808201997328972, | |
| "train_speed(iter/s)": 0.100523 | |
| }, | |
| { | |
| "epoch": 2.0257611241217797, | |
| "grad_norm": 0.25403663516044617, | |
| "learning_rate": 5.236641539686518e-06, | |
| "loss": 0.3226620197296143, | |
| "memory(GiB)": 127.52, | |
| "step": 1730, | |
| "token_acc": 0.8806968959125817, | |
| "train_speed(iter/s)": 0.10053 | |
| }, | |
| { | |
| "epoch": 2.031615925058548, | |
| "grad_norm": 0.24015206098556519, | |
| "learning_rate": 5.179974099932472e-06, | |
| "loss": 0.3161166667938232, | |
| "memory(GiB)": 127.52, | |
| "step": 1735, | |
| "token_acc": 0.8794680331257753, | |
| "train_speed(iter/s)": 0.100526 | |
| }, | |
| { | |
| "epoch": 2.037470725995316, | |
| "grad_norm": 0.2842601537704468, | |
| "learning_rate": 5.12350757103236e-06, | |
| "loss": 0.31528186798095703, | |
| "memory(GiB)": 127.52, | |
| "step": 1740, | |
| "token_acc": 0.8833886035950154, | |
| "train_speed(iter/s)": 0.10053 | |
| }, | |
| { | |
| "epoch": 2.0433255269320845, | |
| "grad_norm": 0.23931631445884705, | |
| "learning_rate": 5.067244306653736e-06, | |
| "loss": 0.32300970554351804, | |
| "memory(GiB)": 127.52, | |
| "step": 1745, | |
| "token_acc": 0.8907401132070736, | |
| "train_speed(iter/s)": 0.100533 | |
| }, | |
| { | |
| "epoch": 2.0491803278688523, | |
| "grad_norm": 0.25491324067115784, | |
| "learning_rate": 5.0111866519915575e-06, | |
| "loss": 0.31856546401977537, | |
| "memory(GiB)": 127.52, | |
| "step": 1750, | |
| "token_acc": 0.8788062223735568, | |
| "train_speed(iter/s)": 0.100534 | |
| }, | |
| { | |
| "epoch": 2.0550351288056206, | |
| "grad_norm": 0.2541966140270233, | |
| "learning_rate": 4.95533694367047e-06, | |
| "loss": 0.31543042659759524, | |
| "memory(GiB)": 127.52, | |
| "step": 1755, | |
| "token_acc": 0.8854616459729288, | |
| "train_speed(iter/s)": 0.100541 | |
| }, | |
| { | |
| "epoch": 2.060889929742389, | |
| "grad_norm": 0.250337690114975, | |
| "learning_rate": 4.899697509647379e-06, | |
| "loss": 0.32208833694458006, | |
| "memory(GiB)": 127.52, | |
| "step": 1760, | |
| "token_acc": 0.8763743304143462, | |
| "train_speed(iter/s)": 0.100545 | |
| }, | |
| { | |
| "epoch": 2.066744730679157, | |
| "grad_norm": 0.23674513399600983, | |
| "learning_rate": 4.844270669114424e-06, | |
| "loss": 0.32359483242034914, | |
| "memory(GiB)": 127.52, | |
| "step": 1765, | |
| "token_acc": 0.8885440198244088, | |
| "train_speed(iter/s)": 0.100551 | |
| }, | |
| { | |
| "epoch": 2.072599531615925, | |
| "grad_norm": 0.2509515881538391, | |
| "learning_rate": 4.789058732402319e-06, | |
| "loss": 0.3145972728729248, | |
| "memory(GiB)": 127.52, | |
| "step": 1770, | |
| "token_acc": 0.8812067213755373, | |
| "train_speed(iter/s)": 0.100554 | |
| }, | |
| { | |
| "epoch": 2.078454332552693, | |
| "grad_norm": 0.27846959233283997, | |
| "learning_rate": 4.734064000884044e-06, | |
| "loss": 0.3361539840698242, | |
| "memory(GiB)": 127.52, | |
| "step": 1775, | |
| "token_acc": 0.8687031468980935, | |
| "train_speed(iter/s)": 0.100561 | |
| }, | |
| { | |
| "epoch": 2.0843091334894615, | |
| "grad_norm": 0.2520703971385956, | |
| "learning_rate": 4.679288766878908e-06, | |
| "loss": 0.3277717590332031, | |
| "memory(GiB)": 127.52, | |
| "step": 1780, | |
| "token_acc": 0.8835239754091976, | |
| "train_speed(iter/s)": 0.100561 | |
| }, | |
| { | |
| "epoch": 2.0901639344262297, | |
| "grad_norm": 0.26310279965400696, | |
| "learning_rate": 4.624735313557019e-06, | |
| "loss": 0.32394185066223147, | |
| "memory(GiB)": 127.52, | |
| "step": 1785, | |
| "token_acc": 0.8875730035291546, | |
| "train_speed(iter/s)": 0.100566 | |
| }, | |
| { | |
| "epoch": 2.0960187353629975, | |
| "grad_norm": 0.2666696310043335, | |
| "learning_rate": 4.570405914844105e-06, | |
| "loss": 0.31819107532501223, | |
| "memory(GiB)": 127.52, | |
| "step": 1790, | |
| "token_acc": 0.8859368071299645, | |
| "train_speed(iter/s)": 0.100562 | |
| }, | |
| { | |
| "epoch": 2.101873536299766, | |
| "grad_norm": 0.25196680426597595, | |
| "learning_rate": 4.516302835326723e-06, | |
| "loss": 0.322560453414917, | |
| "memory(GiB)": 127.52, | |
| "step": 1795, | |
| "token_acc": 0.8921213689835521, | |
| "train_speed(iter/s)": 0.100564 | |
| }, | |
| { | |
| "epoch": 2.107728337236534, | |
| "grad_norm": 0.24787664413452148, | |
| "learning_rate": 4.462428330157886e-06, | |
| "loss": 0.3134110927581787, | |
| "memory(GiB)": 127.52, | |
| "step": 1800, | |
| "token_acc": 0.8915973959679097, | |
| "train_speed(iter/s)": 0.100565 | |
| }, | |
| { | |
| "epoch": 2.113583138173302, | |
| "grad_norm": 0.23812943696975708, | |
| "learning_rate": 4.4087846449630475e-06, | |
| "loss": 0.31724915504455564, | |
| "memory(GiB)": 127.52, | |
| "step": 1805, | |
| "token_acc": 0.8883239519028294, | |
| "train_speed(iter/s)": 0.100568 | |
| }, | |
| { | |
| "epoch": 2.11943793911007, | |
| "grad_norm": 0.2460552453994751, | |
| "learning_rate": 4.355374015746493e-06, | |
| "loss": 0.31520168781280516, | |
| "memory(GiB)": 127.52, | |
| "step": 1810, | |
| "token_acc": 0.8825987185966718, | |
| "train_speed(iter/s)": 0.100568 | |
| }, | |
| { | |
| "epoch": 2.1252927400468384, | |
| "grad_norm": 0.2627100646495819, | |
| "learning_rate": 4.302198668798159e-06, | |
| "loss": 0.3187079906463623, | |
| "memory(GiB)": 127.52, | |
| "step": 1815, | |
| "token_acc": 0.8795669142641319, | |
| "train_speed(iter/s)": 0.100574 | |
| }, | |
| { | |
| "epoch": 2.1311475409836067, | |
| "grad_norm": 0.23737181723117828, | |
| "learning_rate": 4.249260820600813e-06, | |
| "loss": 0.30634393692016604, | |
| "memory(GiB)": 127.52, | |
| "step": 1820, | |
| "token_acc": 0.8882761935077175, | |
| "train_speed(iter/s)": 0.100574 | |
| }, | |
| { | |
| "epoch": 2.1370023419203745, | |
| "grad_norm": 0.44100987911224365, | |
| "learning_rate": 4.1965626777376766e-06, | |
| "loss": 0.3143752574920654, | |
| "memory(GiB)": 127.52, | |
| "step": 1825, | |
| "token_acc": 0.8907455736843094, | |
| "train_speed(iter/s)": 0.100576 | |
| }, | |
| { | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 0.243091881275177, | |
| "learning_rate": 4.144106436800453e-06, | |
| "loss": 0.32144436836242674, | |
| "memory(GiB)": 127.52, | |
| "step": 1830, | |
| "token_acc": 0.8904153173473116, | |
| "train_speed(iter/s)": 0.100586 | |
| }, | |
| { | |
| "epoch": 2.148711943793911, | |
| "grad_norm": 0.22646024823188782, | |
| "learning_rate": 4.091894284297758e-06, | |
| "loss": 0.3123732089996338, | |
| "memory(GiB)": 127.52, | |
| "step": 1835, | |
| "token_acc": 0.8785402692433979, | |
| "train_speed(iter/s)": 0.100589 | |
| }, | |
| { | |
| "epoch": 2.1545667447306793, | |
| "grad_norm": 0.2700958549976349, | |
| "learning_rate": 4.039928396563983e-06, | |
| "loss": 0.33238074779510496, | |
| "memory(GiB)": 127.52, | |
| "step": 1840, | |
| "token_acc": 0.8842443529070076, | |
| "train_speed(iter/s)": 0.10059 | |
| }, | |
| { | |
| "epoch": 2.160421545667447, | |
| "grad_norm": 0.2499818056821823, | |
| "learning_rate": 3.9882109396685845e-06, | |
| "loss": 0.30622167587280275, | |
| "memory(GiB)": 127.52, | |
| "step": 1845, | |
| "token_acc": 0.8795685480484824, | |
| "train_speed(iter/s)": 0.100591 | |
| }, | |
| { | |
| "epoch": 2.1662763466042154, | |
| "grad_norm": 0.22730578482151031, | |
| "learning_rate": 3.936744069325797e-06, | |
| "loss": 0.3057937860488892, | |
| "memory(GiB)": 127.52, | |
| "step": 1850, | |
| "token_acc": 0.8902019848511362, | |
| "train_speed(iter/s)": 0.100589 | |
| }, | |
| { | |
| "epoch": 2.1721311475409837, | |
| "grad_norm": 0.23967498540878296, | |
| "learning_rate": 3.885529930804768e-06, | |
| "loss": 0.3023227214813232, | |
| "memory(GiB)": 127.52, | |
| "step": 1855, | |
| "token_acc": 0.8807274179657759, | |
| "train_speed(iter/s)": 0.100589 | |
| }, | |
| { | |
| "epoch": 2.177985948477752, | |
| "grad_norm": 0.2622321844100952, | |
| "learning_rate": 3.834570658840152e-06, | |
| "loss": 0.32261273860931394, | |
| "memory(GiB)": 127.52, | |
| "step": 1860, | |
| "token_acc": 0.8792452360659205, | |
| "train_speed(iter/s)": 0.100591 | |
| }, | |
| { | |
| "epoch": 2.1838407494145198, | |
| "grad_norm": 0.23954476416110992, | |
| "learning_rate": 3.7838683775431106e-06, | |
| "loss": 0.31424174308776853, | |
| "memory(GiB)": 127.52, | |
| "step": 1865, | |
| "token_acc": 0.8843662495044312, | |
| "train_speed(iter/s)": 0.100597 | |
| }, | |
| { | |
| "epoch": 2.189695550351288, | |
| "grad_norm": 0.23363274335861206, | |
| "learning_rate": 3.733425200312797e-06, | |
| "loss": 0.316208815574646, | |
| "memory(GiB)": 127.52, | |
| "step": 1870, | |
| "token_acc": 0.876293130342547, | |
| "train_speed(iter/s)": 0.100602 | |
| }, | |
| { | |
| "epoch": 2.1955503512880563, | |
| "grad_norm": 0.24841627478599548, | |
| "learning_rate": 3.683243229748249e-06, | |
| "loss": 0.3097521781921387, | |
| "memory(GiB)": 127.52, | |
| "step": 1875, | |
| "token_acc": 0.8804246009543149, | |
| "train_speed(iter/s)": 0.100606 | |
| }, | |
| { | |
| "epoch": 2.201405152224824, | |
| "grad_norm": 0.25356635451316833, | |
| "learning_rate": 3.633324557560747e-06, | |
| "loss": 0.31675851345062256, | |
| "memory(GiB)": 127.52, | |
| "step": 1880, | |
| "token_acc": 0.8871838137645497, | |
| "train_speed(iter/s)": 0.10061 | |
| }, | |
| { | |
| "epoch": 2.2072599531615924, | |
| "grad_norm": 0.2366763949394226, | |
| "learning_rate": 3.5836712644866277e-06, | |
| "loss": 0.30890917778015137, | |
| "memory(GiB)": 127.52, | |
| "step": 1885, | |
| "token_acc": 0.8819356314491541, | |
| "train_speed(iter/s)": 0.100613 | |
| }, | |
| { | |
| "epoch": 2.2131147540983607, | |
| "grad_norm": 0.24897019565105438, | |
| "learning_rate": 3.5342854202005696e-06, | |
| "loss": 0.31049222946166993, | |
| "memory(GiB)": 127.52, | |
| "step": 1890, | |
| "token_acc": 0.8878919948532936, | |
| "train_speed(iter/s)": 0.100619 | |
| }, | |
| { | |
| "epoch": 2.218969555035129, | |
| "grad_norm": 0.239404559135437, | |
| "learning_rate": 3.485169083229293e-06, | |
| "loss": 0.31925191879272463, | |
| "memory(GiB)": 127.52, | |
| "step": 1895, | |
| "token_acc": 0.8928798404593369, | |
| "train_speed(iter/s)": 0.100627 | |
| }, | |
| { | |
| "epoch": 2.2248243559718968, | |
| "grad_norm": 0.2341826856136322, | |
| "learning_rate": 3.4363243008657842e-06, | |
| "loss": 0.31410508155822753, | |
| "memory(GiB)": 127.52, | |
| "step": 1900, | |
| "token_acc": 0.8741590609526956, | |
| "train_speed(iter/s)": 0.100624 | |
| }, | |
| { | |
| "epoch": 2.230679156908665, | |
| "grad_norm": 0.24927052855491638, | |
| "learning_rate": 3.3877531090839478e-06, | |
| "loss": 0.3199175834655762, | |
| "memory(GiB)": 127.52, | |
| "step": 1905, | |
| "token_acc": 0.8767657620459692, | |
| "train_speed(iter/s)": 0.100628 | |
| }, | |
| { | |
| "epoch": 2.2365339578454333, | |
| "grad_norm": 0.2401537299156189, | |
| "learning_rate": 3.3394575324537327e-06, | |
| "loss": 0.3235038757324219, | |
| "memory(GiB)": 127.52, | |
| "step": 1910, | |
| "token_acc": 0.8763058505839384, | |
| "train_speed(iter/s)": 0.100623 | |
| }, | |
| { | |
| "epoch": 2.2423887587822016, | |
| "grad_norm": 0.23076413571834564, | |
| "learning_rate": 3.2914395840567605e-06, | |
| "loss": 0.31050064563751223, | |
| "memory(GiB)": 127.52, | |
| "step": 1915, | |
| "token_acc": 0.8874926079243052, | |
| "train_speed(iter/s)": 0.100622 | |
| }, | |
| { | |
| "epoch": 2.2482435597189694, | |
| "grad_norm": 0.2379971295595169, | |
| "learning_rate": 3.2437012654024057e-06, | |
| "loss": 0.3159012317657471, | |
| "memory(GiB)": 127.52, | |
| "step": 1920, | |
| "token_acc": 0.8895969009656411, | |
| "train_speed(iter/s)": 0.100622 | |
| }, | |
| { | |
| "epoch": 2.2540983606557377, | |
| "grad_norm": 0.23007337749004364, | |
| "learning_rate": 3.1962445663443643e-06, | |
| "loss": 0.31895716190338136, | |
| "memory(GiB)": 127.52, | |
| "step": 1925, | |
| "token_acc": 0.8823520222942871, | |
| "train_speed(iter/s)": 0.100616 | |
| }, | |
| { | |
| "epoch": 2.259953161592506, | |
| "grad_norm": 0.2437550276517868, | |
| "learning_rate": 3.1490714649977196e-06, | |
| "loss": 0.3226035118103027, | |
| "memory(GiB)": 127.52, | |
| "step": 1930, | |
| "token_acc": 0.8907227393284292, | |
| "train_speed(iter/s)": 0.100614 | |
| }, | |
| { | |
| "epoch": 2.265807962529274, | |
| "grad_norm": 0.2513379454612732, | |
| "learning_rate": 3.102183927656488e-06, | |
| "loss": 0.31055560111999514, | |
| "memory(GiB)": 127.52, | |
| "step": 1935, | |
| "token_acc": 0.8758090614886731, | |
| "train_speed(iter/s)": 0.100617 | |
| }, | |
| { | |
| "epoch": 2.271662763466042, | |
| "grad_norm": 0.23778940737247467, | |
| "learning_rate": 3.0555839087116547e-06, | |
| "loss": 0.32387375831604004, | |
| "memory(GiB)": 127.52, | |
| "step": 1940, | |
| "token_acc": 0.887034375, | |
| "train_speed(iter/s)": 0.10062 | |
| }, | |
| { | |
| "epoch": 2.2775175644028103, | |
| "grad_norm": 0.26385143399238586, | |
| "learning_rate": 3.009273350569705e-06, | |
| "loss": 0.32143163681030273, | |
| "memory(GiB)": 127.52, | |
| "step": 1945, | |
| "token_acc": 0.8916146423189599, | |
| "train_speed(iter/s)": 0.100632 | |
| }, | |
| { | |
| "epoch": 2.2833723653395785, | |
| "grad_norm": 0.23078720271587372, | |
| "learning_rate": 2.963254183571682e-06, | |
| "loss": 0.31597721576690674, | |
| "memory(GiB)": 127.52, | |
| "step": 1950, | |
| "token_acc": 0.8873806150822559, | |
| "train_speed(iter/s)": 0.10063 | |
| }, | |
| { | |
| "epoch": 2.289227166276347, | |
| "grad_norm": 0.23988991975784302, | |
| "learning_rate": 2.9175283259126943e-06, | |
| "loss": 0.31755337715148924, | |
| "memory(GiB)": 127.52, | |
| "step": 1955, | |
| "token_acc": 0.8924940331886264, | |
| "train_speed(iter/s)": 0.100631 | |
| }, | |
| { | |
| "epoch": 2.2950819672131146, | |
| "grad_norm": 0.23374050855636597, | |
| "learning_rate": 2.872097683561986e-06, | |
| "loss": 0.3156282424926758, | |
| "memory(GiB)": 127.52, | |
| "step": 1960, | |
| "token_acc": 0.8946095897383691, | |
| "train_speed(iter/s)": 0.100632 | |
| }, | |
| { | |
| "epoch": 2.300936768149883, | |
| "grad_norm": 0.22969146072864532, | |
| "learning_rate": 2.8269641501834834e-06, | |
| "loss": 0.32587299346923826, | |
| "memory(GiB)": 127.52, | |
| "step": 1965, | |
| "token_acc": 0.8774885813450646, | |
| "train_speed(iter/s)": 0.100637 | |
| }, | |
| { | |
| "epoch": 2.306791569086651, | |
| "grad_norm": 0.23242172598838806, | |
| "learning_rate": 2.782129607056848e-06, | |
| "loss": 0.31759541034698485, | |
| "memory(GiB)": 127.52, | |
| "step": 1970, | |
| "token_acc": 0.8783747102265459, | |
| "train_speed(iter/s)": 0.10064 | |
| }, | |
| { | |
| "epoch": 2.312646370023419, | |
| "grad_norm": 0.22935490310192108, | |
| "learning_rate": 2.7375959229990856e-06, | |
| "loss": 0.307840371131897, | |
| "memory(GiB)": 127.52, | |
| "step": 1975, | |
| "token_acc": 0.8862128010598808, | |
| "train_speed(iter/s)": 0.100639 | |
| }, | |
| { | |
| "epoch": 2.3185011709601873, | |
| "grad_norm": 0.2637212574481964, | |
| "learning_rate": 2.6933649542866326e-06, | |
| "loss": 0.3114126682281494, | |
| "memory(GiB)": 127.52, | |
| "step": 1980, | |
| "token_acc": 0.8820059272541622, | |
| "train_speed(iter/s)": 0.100646 | |
| }, | |
| { | |
| "epoch": 2.3243559718969555, | |
| "grad_norm": 0.22703419625759125, | |
| "learning_rate": 2.649438544577977e-06, | |
| "loss": 0.30065155029296875, | |
| "memory(GiB)": 127.52, | |
| "step": 1985, | |
| "token_acc": 0.8849238586641156, | |
| "train_speed(iter/s)": 0.100647 | |
| }, | |
| { | |
| "epoch": 2.330210772833724, | |
| "grad_norm": 0.22714027762413025, | |
| "learning_rate": 2.6058185248368317e-06, | |
| "loss": 0.3135934352874756, | |
| "memory(GiB)": 127.52, | |
| "step": 1990, | |
| "token_acc": 0.8923622270535968, | |
| "train_speed(iter/s)": 0.100647 | |
| }, | |
| { | |
| "epoch": 2.3360655737704916, | |
| "grad_norm": 0.23052531480789185, | |
| "learning_rate": 2.562506713255789e-06, | |
| "loss": 0.3088988304138184, | |
| "memory(GiB)": 127.52, | |
| "step": 1995, | |
| "token_acc": 0.8901272198016593, | |
| "train_speed(iter/s)": 0.100652 | |
| }, | |
| { | |
| "epoch": 2.34192037470726, | |
| "grad_norm": 0.2511214017868042, | |
| "learning_rate": 2.519504915180555e-06, | |
| "loss": 0.3128695487976074, | |
| "memory(GiB)": 127.52, | |
| "step": 2000, | |
| "token_acc": 0.8865565346454385, | |
| "train_speed(iter/s)": 0.100653 | |
| }, | |
| { | |
| "epoch": 2.347775175644028, | |
| "grad_norm": 0.23098479211330414, | |
| "learning_rate": 2.4768149230346917e-06, | |
| "loss": 0.3291048526763916, | |
| "memory(GiB)": 127.52, | |
| "step": 2005, | |
| "token_acc": 0.8865806253889527, | |
| "train_speed(iter/s)": 0.100648 | |
| }, | |
| { | |
| "epoch": 2.3536299765807964, | |
| "grad_norm": 0.2332172840833664, | |
| "learning_rate": 2.4344385162448924e-06, | |
| "loss": 0.31312854290008546, | |
| "memory(GiB)": 127.52, | |
| "step": 2010, | |
| "token_acc": 0.8905434652297092, | |
| "train_speed(iter/s)": 0.100649 | |
| }, | |
| { | |
| "epoch": 2.3594847775175642, | |
| "grad_norm": 0.229131281375885, | |
| "learning_rate": 2.392377461166826e-06, | |
| "loss": 0.3113706588745117, | |
| "memory(GiB)": 127.52, | |
| "step": 2015, | |
| "token_acc": 0.889476325707392, | |
| "train_speed(iter/s)": 0.100651 | |
| }, | |
| { | |
| "epoch": 2.3653395784543325, | |
| "grad_norm": 0.24932575225830078, | |
| "learning_rate": 2.350633511011511e-06, | |
| "loss": 0.3204165458679199, | |
| "memory(GiB)": 127.52, | |
| "step": 2020, | |
| "token_acc": 0.8841538567415554, | |
| "train_speed(iter/s)": 0.100647 | |
| }, | |
| { | |
| "epoch": 2.371194379391101, | |
| "grad_norm": 0.23387765884399414, | |
| "learning_rate": 2.309208405772221e-06, | |
| "loss": 0.32724220752716066, | |
| "memory(GiB)": 127.52, | |
| "step": 2025, | |
| "token_acc": 0.8882853658229917, | |
| "train_speed(iter/s)": 0.100652 | |
| }, | |
| { | |
| "epoch": 2.3770491803278686, | |
| "grad_norm": 0.24220742285251617, | |
| "learning_rate": 2.2681038721519768e-06, | |
| "loss": 0.33083477020263674, | |
| "memory(GiB)": 127.52, | |
| "step": 2030, | |
| "token_acc": 0.8838624553173172, | |
| "train_speed(iter/s)": 0.100651 | |
| }, | |
| { | |
| "epoch": 2.382903981264637, | |
| "grad_norm": 0.2579573690891266, | |
| "learning_rate": 2.227321623491563e-06, | |
| "loss": 0.3199321746826172, | |
| "memory(GiB)": 127.52, | |
| "step": 2035, | |
| "token_acc": 0.8799424487730837, | |
| "train_speed(iter/s)": 0.100653 | |
| }, | |
| { | |
| "epoch": 2.388758782201405, | |
| "grad_norm": 0.22851942479610443, | |
| "learning_rate": 2.186863359698108e-06, | |
| "loss": 0.3142981052398682, | |
| "memory(GiB)": 127.52, | |
| "step": 2040, | |
| "token_acc": 0.9041223969400765, | |
| "train_speed(iter/s)": 0.100653 | |
| }, | |
| { | |
| "epoch": 2.3946135831381734, | |
| "grad_norm": 0.24671818315982819, | |
| "learning_rate": 2.1467307671742377e-06, | |
| "loss": 0.31820495128631593, | |
| "memory(GiB)": 127.52, | |
| "step": 2045, | |
| "token_acc": 0.8822625886964798, | |
| "train_speed(iter/s)": 0.100657 | |
| }, | |
| { | |
| "epoch": 2.4004683840749417, | |
| "grad_norm": 0.2494201809167862, | |
| "learning_rate": 2.106925518747779e-06, | |
| "loss": 0.31292271614074707, | |
| "memory(GiB)": 127.52, | |
| "step": 2050, | |
| "token_acc": 0.8868852561536922, | |
| "train_speed(iter/s)": 0.100659 | |
| }, | |
| { | |
| "epoch": 2.4063231850117095, | |
| "grad_norm": 0.25766271352767944, | |
| "learning_rate": 2.06744927360202e-06, | |
| "loss": 0.315954852104187, | |
| "memory(GiB)": 127.52, | |
| "step": 2055, | |
| "token_acc": 0.8844018739071213, | |
| "train_speed(iter/s)": 0.100653 | |
| }, | |
| { | |
| "epoch": 2.4121779859484778, | |
| "grad_norm": 0.23304541409015656, | |
| "learning_rate": 2.0283036772065712e-06, | |
| "loss": 0.31738996505737305, | |
| "memory(GiB)": 127.52, | |
| "step": 2060, | |
| "token_acc": 0.8888605233133514, | |
| "train_speed(iter/s)": 0.100656 | |
| }, | |
| { | |
| "epoch": 2.418032786885246, | |
| "grad_norm": 0.23033016920089722, | |
| "learning_rate": 1.9894903612487683e-06, | |
| "loss": 0.32506499290466306, | |
| "memory(GiB)": 127.52, | |
| "step": 2065, | |
| "token_acc": 0.8765848323481849, | |
| "train_speed(iter/s)": 0.100657 | |
| }, | |
| { | |
| "epoch": 2.423887587822014, | |
| "grad_norm": 0.2522413730621338, | |
| "learning_rate": 1.9510109435656457e-06, | |
| "loss": 0.3240881681442261, | |
| "memory(GiB)": 127.52, | |
| "step": 2070, | |
| "token_acc": 0.8874444430454654, | |
| "train_speed(iter/s)": 0.10066 | |
| }, | |
| { | |
| "epoch": 2.429742388758782, | |
| "grad_norm": 0.23793016374111176, | |
| "learning_rate": 1.9128670280765283e-06, | |
| "loss": 0.326206374168396, | |
| "memory(GiB)": 127.52, | |
| "step": 2075, | |
| "token_acc": 0.8811696876529852, | |
| "train_speed(iter/s)": 0.100656 | |
| }, | |
| { | |
| "epoch": 2.4355971896955504, | |
| "grad_norm": 0.2260826826095581, | |
| "learning_rate": 1.8750602047161603e-06, | |
| "loss": 0.3155853748321533, | |
| "memory(GiB)": 127.52, | |
| "step": 2080, | |
| "token_acc": 0.8918628516614084, | |
| "train_speed(iter/s)": 0.100657 | |
| }, | |
| { | |
| "epoch": 2.4414519906323187, | |
| "grad_norm": 0.22915047407150269, | |
| "learning_rate": 1.8375920493684264e-06, | |
| "loss": 0.32075018882751466, | |
| "memory(GiB)": 127.52, | |
| "step": 2085, | |
| "token_acc": 0.8806146127312637, | |
| "train_speed(iter/s)": 0.100664 | |
| }, | |
| { | |
| "epoch": 2.4473067915690865, | |
| "grad_norm": 0.23555633425712585, | |
| "learning_rate": 1.8004641238006815e-06, | |
| "loss": 0.3198583126068115, | |
| "memory(GiB)": 127.52, | |
| "step": 2090, | |
| "token_acc": 0.8878798889856471, | |
| "train_speed(iter/s)": 0.100663 | |
| }, | |
| { | |
| "epoch": 2.4531615925058547, | |
| "grad_norm": 0.23224787414073944, | |
| "learning_rate": 1.7636779755986443e-06, | |
| "loss": 0.32527942657470704, | |
| "memory(GiB)": 127.52, | |
| "step": 2095, | |
| "token_acc": 0.8808102158192161, | |
| "train_speed(iter/s)": 0.100659 | |
| }, | |
| { | |
| "epoch": 2.459016393442623, | |
| "grad_norm": 0.2313682585954666, | |
| "learning_rate": 1.7272351381018792e-06, | |
| "loss": 0.3221132278442383, | |
| "memory(GiB)": 127.52, | |
| "step": 2100, | |
| "token_acc": 0.8723955898759107, | |
| "train_speed(iter/s)": 0.10066 | |
| }, | |
| { | |
| "epoch": 2.4648711943793913, | |
| "grad_norm": 0.23031777143478394, | |
| "learning_rate": 1.6911371303399048e-06, | |
| "loss": 0.3093102931976318, | |
| "memory(GiB)": 127.52, | |
| "step": 2105, | |
| "token_acc": 0.887525459211663, | |
| "train_speed(iter/s)": 0.100655 | |
| }, | |
| { | |
| "epoch": 2.470725995316159, | |
| "grad_norm": 0.23843398690223694, | |
| "learning_rate": 1.6553854569688632e-06, | |
| "loss": 0.3248276710510254, | |
| "memory(GiB)": 127.52, | |
| "step": 2110, | |
| "token_acc": 0.882843537798315, | |
| "train_speed(iter/s)": 0.100654 | |
| }, | |
| { | |
| "epoch": 2.4765807962529274, | |
| "grad_norm": 0.23203721642494202, | |
| "learning_rate": 1.619981608208796e-06, | |
| "loss": 0.32454729080200195, | |
| "memory(GiB)": 127.52, | |
| "step": 2115, | |
| "token_acc": 0.869970732560573, | |
| "train_speed(iter/s)": 0.100657 | |
| }, | |
| { | |
| "epoch": 2.4824355971896956, | |
| "grad_norm": 0.23711416125297546, | |
| "learning_rate": 1.584927059781548e-06, | |
| "loss": 0.3233715295791626, | |
| "memory(GiB)": 127.52, | |
| "step": 2120, | |
| "token_acc": 0.8797791727772037, | |
| "train_speed(iter/s)": 0.100658 | |
| }, | |
| { | |
| "epoch": 2.4882903981264635, | |
| "grad_norm": 0.23975679278373718, | |
| "learning_rate": 1.5502232728492362e-06, | |
| "loss": 0.31569533348083495, | |
| "memory(GiB)": 127.52, | |
| "step": 2125, | |
| "token_acc": 0.8874189972049156, | |
| "train_speed(iter/s)": 0.100661 | |
| }, | |
| { | |
| "epoch": 2.4941451990632317, | |
| "grad_norm": 0.23424658179283142, | |
| "learning_rate": 1.5158716939533524e-06, | |
| "loss": 0.32528119087219237, | |
| "memory(GiB)": 127.52, | |
| "step": 2130, | |
| "token_acc": 0.8848355062483098, | |
| "train_speed(iter/s)": 0.100663 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.2467930018901825, | |
| "learning_rate": 1.4818737549544725e-06, | |
| "loss": 0.3232418060302734, | |
| "memory(GiB)": 127.52, | |
| "step": 2135, | |
| "token_acc": 0.8760404837079283, | |
| "train_speed(iter/s)": 0.100669 | |
| }, | |
| { | |
| "epoch": 2.5058548009367683, | |
| "grad_norm": 0.23344840109348297, | |
| "learning_rate": 1.448230872972568e-06, | |
| "loss": 0.3205883979797363, | |
| "memory(GiB)": 127.52, | |
| "step": 2140, | |
| "token_acc": 0.8896608528350288, | |
| "train_speed(iter/s)": 0.100665 | |
| }, | |
| { | |
| "epoch": 2.5117096018735365, | |
| "grad_norm": 0.2276953160762787, | |
| "learning_rate": 1.4149444503279297e-06, | |
| "loss": 0.32780184745788576, | |
| "memory(GiB)": 127.52, | |
| "step": 2145, | |
| "token_acc": 0.8763619018928553, | |
| "train_speed(iter/s)": 0.100666 | |
| }, | |
| { | |
| "epoch": 2.5175644028103044, | |
| "grad_norm": 0.23720286786556244, | |
| "learning_rate": 1.382015874482735e-06, | |
| "loss": 0.3210983037948608, | |
| "memory(GiB)": 127.52, | |
| "step": 2150, | |
| "token_acc": 0.8830952351167766, | |
| "train_speed(iter/s)": 0.100669 | |
| }, | |
| { | |
| "epoch": 2.5234192037470726, | |
| "grad_norm": 0.2429177612066269, | |
| "learning_rate": 1.3494465179831895e-06, | |
| "loss": 0.31808924674987793, | |
| "memory(GiB)": 127.52, | |
| "step": 2155, | |
| "token_acc": 0.8801182829610709, | |
| "train_speed(iter/s)": 0.100671 | |
| }, | |
| { | |
| "epoch": 2.529274004683841, | |
| "grad_norm": 0.2192358821630478, | |
| "learning_rate": 1.3172377384023393e-06, | |
| "loss": 0.3137265682220459, | |
| "memory(GiB)": 127.52, | |
| "step": 2160, | |
| "token_acc": 0.8851310631053786, | |
| "train_speed(iter/s)": 0.100675 | |
| }, | |
| { | |
| "epoch": 2.5351288056206087, | |
| "grad_norm": 0.22843384742736816, | |
| "learning_rate": 1.2853908782834722e-06, | |
| "loss": 0.31639652252197265, | |
| "memory(GiB)": 127.52, | |
| "step": 2165, | |
| "token_acc": 0.8930099545248551, | |
| "train_speed(iter/s)": 0.100673 | |
| }, | |
| { | |
| "epoch": 2.540983606557377, | |
| "grad_norm": 0.23414385318756104, | |
| "learning_rate": 1.2539072650841523e-06, | |
| "loss": 0.32384276390075684, | |
| "memory(GiB)": 127.52, | |
| "step": 2170, | |
| "token_acc": 0.8826712369541582, | |
| "train_speed(iter/s)": 0.100679 | |
| }, | |
| { | |
| "epoch": 2.5468384074941453, | |
| "grad_norm": 0.2386016696691513, | |
| "learning_rate": 1.2227882111209011e-06, | |
| "loss": 0.3276023864746094, | |
| "memory(GiB)": 127.52, | |
| "step": 2175, | |
| "token_acc": 0.876178791079083, | |
| "train_speed(iter/s)": 0.10068 | |
| }, | |
| { | |
| "epoch": 2.552693208430913, | |
| "grad_norm": 0.23498761653900146, | |
| "learning_rate": 1.1920350135144898e-06, | |
| "loss": 0.3207254409790039, | |
| "memory(GiB)": 127.52, | |
| "step": 2180, | |
| "token_acc": 0.8885690220875708, | |
| "train_speed(iter/s)": 0.100681 | |
| }, | |
| { | |
| "epoch": 2.5585480093676813, | |
| "grad_norm": 0.23011547327041626, | |
| "learning_rate": 1.1616489541358678e-06, | |
| "loss": 0.3184302806854248, | |
| "memory(GiB)": 127.52, | |
| "step": 2185, | |
| "token_acc": 0.8778273150286384, | |
| "train_speed(iter/s)": 0.100682 | |
| }, | |
| { | |
| "epoch": 2.5644028103044496, | |
| "grad_norm": 0.22844338417053223, | |
| "learning_rate": 1.1316312995527424e-06, | |
| "loss": 0.3216708183288574, | |
| "memory(GiB)": 127.52, | |
| "step": 2190, | |
| "token_acc": 0.8842230056468974, | |
| "train_speed(iter/s)": 0.100685 | |
| }, | |
| { | |
| "epoch": 2.570257611241218, | |
| "grad_norm": 0.23386669158935547, | |
| "learning_rate": 1.1019833009767744e-06, | |
| "loss": 0.3198892831802368, | |
| "memory(GiB)": 127.52, | |
| "step": 2195, | |
| "token_acc": 0.881730841074942, | |
| "train_speed(iter/s)": 0.100684 | |
| }, | |
| { | |
| "epoch": 2.576112412177986, | |
| "grad_norm": 0.23416638374328613, | |
| "learning_rate": 1.072706194211426e-06, | |
| "loss": 0.32181246280670167, | |
| "memory(GiB)": 127.52, | |
| "step": 2200, | |
| "token_acc": 0.8872248114887651, | |
| "train_speed(iter/s)": 0.100687 | |
| }, | |
| { | |
| "epoch": 2.581967213114754, | |
| "grad_norm": 0.232351616024971, | |
| "learning_rate": 1.0438011996004581e-06, | |
| "loss": 0.32013840675354005, | |
| "memory(GiB)": 127.52, | |
| "step": 2205, | |
| "token_acc": 0.8815920274367514, | |
| "train_speed(iter/s)": 0.100688 | |
| }, | |
| { | |
| "epoch": 2.5878220140515222, | |
| "grad_norm": 0.24018974602222443, | |
| "learning_rate": 1.0152695219770558e-06, | |
| "loss": 0.3074916124343872, | |
| "memory(GiB)": 127.52, | |
| "step": 2210, | |
| "token_acc": 0.8911461159004883, | |
| "train_speed(iter/s)": 0.100686 | |
| }, | |
| { | |
| "epoch": 2.5936768149882905, | |
| "grad_norm": 0.2339586764574051, | |
| "learning_rate": 9.871123506136037e-07, | |
| "loss": 0.3152151107788086, | |
| "memory(GiB)": 127.52, | |
| "step": 2215, | |
| "token_acc": 0.8945800996908322, | |
| "train_speed(iter/s)": 0.100689 | |
| }, | |
| { | |
| "epoch": 2.5995316159250583, | |
| "grad_norm": 0.23918944597244263, | |
| "learning_rate": 9.593308591721274e-07, | |
| "loss": 0.3115771532058716, | |
| "memory(GiB)": 127.52, | |
| "step": 2220, | |
| "token_acc": 0.8863534338516209, | |
| "train_speed(iter/s)": 0.100692 | |
| }, | |
| { | |
| "epoch": 2.6053864168618266, | |
| "grad_norm": 0.228268101811409, | |
| "learning_rate": 9.319262056553602e-07, | |
| "loss": 0.3226304531097412, | |
| "memory(GiB)": 127.52, | |
| "step": 2225, | |
| "token_acc": 0.8902835788085294, | |
| "train_speed(iter/s)": 0.10069 | |
| }, | |
| { | |
| "epoch": 2.611241217798595, | |
| "grad_norm": 0.23581595718860626, | |
| "learning_rate": 9.048995323584764e-07, | |
| "loss": 0.3258847713470459, | |
| "memory(GiB)": 127.52, | |
| "step": 2230, | |
| "token_acc": 0.8929581827894788, | |
| "train_speed(iter/s)": 0.10069 | |
| }, | |
| { | |
| "epoch": 2.617096018735363, | |
| "grad_norm": 0.4460615813732147, | |
| "learning_rate": 8.78251965821485e-07, | |
| "loss": 0.3083215236663818, | |
| "memory(GiB)": 127.52, | |
| "step": 2235, | |
| "token_acc": 0.8851051496528254, | |
| "train_speed(iter/s)": 0.10069 | |
| }, | |
| { | |
| "epoch": 2.6229508196721314, | |
| "grad_norm": 0.23269429802894592, | |
| "learning_rate": 8.519846167822665e-07, | |
| "loss": 0.31586997509002684, | |
| "memory(GiB)": 127.52, | |
| "step": 2240, | |
| "token_acc": 0.8981023709170914, | |
| "train_speed(iter/s)": 0.100691 | |
| }, | |
| { | |
| "epoch": 2.628805620608899, | |
| "grad_norm": 0.608095645904541, | |
| "learning_rate": 8.260985801302734e-07, | |
| "loss": 0.30504627227783204, | |
| "memory(GiB)": 127.52, | |
| "step": 2245, | |
| "token_acc": 0.8836382464618571, | |
| "train_speed(iter/s)": 0.100692 | |
| }, | |
| { | |
| "epoch": 2.6346604215456675, | |
| "grad_norm": 0.22992344200611115, | |
| "learning_rate": 8.005949348608977e-07, | |
| "loss": 0.31817898750305174, | |
| "memory(GiB)": 127.52, | |
| "step": 2250, | |
| "token_acc": 0.8803807403423412, | |
| "train_speed(iter/s)": 0.100694 | |
| }, | |
| { | |
| "epoch": 2.6405152224824358, | |
| "grad_norm": 0.2216484099626541, | |
| "learning_rate": 7.754747440304911e-07, | |
| "loss": 0.3218961000442505, | |
| "memory(GiB)": 127.52, | |
| "step": 2255, | |
| "token_acc": 0.8802025202800865, | |
| "train_speed(iter/s)": 0.1007 | |
| }, | |
| { | |
| "epoch": 2.6463700234192036, | |
| "grad_norm": 0.22643844783306122, | |
| "learning_rate": 7.507390547120541e-07, | |
| "loss": 0.31406736373901367, | |
| "memory(GiB)": 127.52, | |
| "step": 2260, | |
| "token_acc": 0.8841787048704839, | |
| "train_speed(iter/s)": 0.100704 | |
| }, | |
| { | |
| "epoch": 2.652224824355972, | |
| "grad_norm": 0.22945396602153778, | |
| "learning_rate": 7.263888979515954e-07, | |
| "loss": 0.32517061233520506, | |
| "memory(GiB)": 127.52, | |
| "step": 2265, | |
| "token_acc": 0.8788511831616095, | |
| "train_speed(iter/s)": 0.10071 | |
| }, | |
| { | |
| "epoch": 2.65807962529274, | |
| "grad_norm": 0.22719787061214447, | |
| "learning_rate": 7.024252887251548e-07, | |
| "loss": 0.31670680046081545, | |
| "memory(GiB)": 127.52, | |
| "step": 2270, | |
| "token_acc": 0.8838603030141137, | |
| "train_speed(iter/s)": 0.100707 | |
| }, | |
| { | |
| "epoch": 2.663934426229508, | |
| "grad_norm": 0.2364586889743805, | |
| "learning_rate": 6.788492258964896e-07, | |
| "loss": 0.3206209659576416, | |
| "memory(GiB)": 127.52, | |
| "step": 2275, | |
| "token_acc": 0.8808837716472833, | |
| "train_speed(iter/s)": 0.100707 | |
| }, | |
| { | |
| "epoch": 2.669789227166276, | |
| "grad_norm": 0.23205353319644928, | |
| "learning_rate": 6.556616921754489e-07, | |
| "loss": 0.3177974224090576, | |
| "memory(GiB)": 127.52, | |
| "step": 2280, | |
| "token_acc": 0.8846845210507196, | |
| "train_speed(iter/s)": 0.100709 | |
| }, | |
| { | |
| "epoch": 2.6756440281030445, | |
| "grad_norm": 0.23928001523017883, | |
| "learning_rate": 6.328636540770028e-07, | |
| "loss": 0.3218786001205444, | |
| "memory(GiB)": 127.52, | |
| "step": 2285, | |
| "token_acc": 0.8839321457165733, | |
| "train_speed(iter/s)": 0.10071 | |
| }, | |
| { | |
| "epoch": 2.6814988290398127, | |
| "grad_norm": 0.22948609292507172, | |
| "learning_rate": 6.10456061880963e-07, | |
| "loss": 0.32559771537780763, | |
| "memory(GiB)": 127.52, | |
| "step": 2290, | |
| "token_acc": 0.888954265344254, | |
| "train_speed(iter/s)": 0.10071 | |
| }, | |
| { | |
| "epoch": 2.687353629976581, | |
| "grad_norm": 0.22480416297912598, | |
| "learning_rate": 5.884398495923727e-07, | |
| "loss": 0.31432313919067384, | |
| "memory(GiB)": 127.52, | |
| "step": 2295, | |
| "token_acc": 0.8786473253733409, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 2.693208430913349, | |
| "grad_norm": 0.49891427159309387, | |
| "learning_rate": 5.668159349025649e-07, | |
| "loss": 0.33366761207580564, | |
| "memory(GiB)": 127.52, | |
| "step": 2300, | |
| "token_acc": 0.8706380208333333, | |
| "train_speed(iter/s)": 0.100713 | |
| }, | |
| { | |
| "epoch": 2.699063231850117, | |
| "grad_norm": 0.23788191378116608, | |
| "learning_rate": 5.455852191509214e-07, | |
| "loss": 0.326168417930603, | |
| "memory(GiB)": 127.52, | |
| "step": 2305, | |
| "token_acc": 0.8757156059468948, | |
| "train_speed(iter/s)": 0.100714 | |
| }, | |
| { | |
| "epoch": 2.7049180327868854, | |
| "grad_norm": 0.23934431374073029, | |
| "learning_rate": 5.247485872873026e-07, | |
| "loss": 0.3131624460220337, | |
| "memory(GiB)": 127.52, | |
| "step": 2310, | |
| "token_acc": 0.8873159330925727, | |
| "train_speed(iter/s)": 0.100715 | |
| }, | |
| { | |
| "epoch": 2.710772833723653, | |
| "grad_norm": 0.22434021532535553, | |
| "learning_rate": 5.043069078351526e-07, | |
| "loss": 0.3083023548126221, | |
| "memory(GiB)": 127.52, | |
| "step": 2315, | |
| "token_acc": 0.8900379146919432, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 2.7166276346604215, | |
| "grad_norm": 0.2241913378238678, | |
| "learning_rate": 4.842610328552999e-07, | |
| "loss": 0.31645286083221436, | |
| "memory(GiB)": 127.52, | |
| "step": 2320, | |
| "token_acc": 0.8860757524370778, | |
| "train_speed(iter/s)": 0.100719 | |
| }, | |
| { | |
| "epoch": 2.7224824355971897, | |
| "grad_norm": 0.22683191299438477, | |
| "learning_rate": 4.6461179791044806e-07, | |
| "loss": 0.3162517547607422, | |
| "memory(GiB)": 127.52, | |
| "step": 2325, | |
| "token_acc": 0.8806341851421645, | |
| "train_speed(iter/s)": 0.100722 | |
| }, | |
| { | |
| "epoch": 2.728337236533958, | |
| "grad_norm": 0.22332416474819183, | |
| "learning_rate": 4.453600220303378e-07, | |
| "loss": 0.3006160736083984, | |
| "memory(GiB)": 127.52, | |
| "step": 2330, | |
| "token_acc": 0.8811269139759368, | |
| "train_speed(iter/s)": 0.100726 | |
| }, | |
| { | |
| "epoch": 2.7341920374707263, | |
| "grad_norm": 0.2320730835199356, | |
| "learning_rate": 4.2650650767761535e-07, | |
| "loss": 0.3053130149841309, | |
| "memory(GiB)": 127.52, | |
| "step": 2335, | |
| "token_acc": 0.8909103410770822, | |
| "train_speed(iter/s)": 0.100726 | |
| }, | |
| { | |
| "epoch": 2.740046838407494, | |
| "grad_norm": 0.2575525939464569, | |
| "learning_rate": 4.0805204071437953e-07, | |
| "loss": 0.32894713878631593, | |
| "memory(GiB)": 127.52, | |
| "step": 2340, | |
| "token_acc": 0.880288983757294, | |
| "train_speed(iter/s)": 0.100724 | |
| }, | |
| { | |
| "epoch": 2.7459016393442623, | |
| "grad_norm": 0.2190413624048233, | |
| "learning_rate": 3.899973903694243e-07, | |
| "loss": 0.32172608375549316, | |
| "memory(GiB)": 127.52, | |
| "step": 2345, | |
| "token_acc": 0.8842697990204148, | |
| "train_speed(iter/s)": 0.100724 | |
| }, | |
| { | |
| "epoch": 2.7517564402810306, | |
| "grad_norm": 0.22509151697158813, | |
| "learning_rate": 3.72343309206179e-07, | |
| "loss": 0.31258511543273926, | |
| "memory(GiB)": 127.52, | |
| "step": 2350, | |
| "token_acc": 0.8854250593299245, | |
| "train_speed(iter/s)": 0.100723 | |
| }, | |
| { | |
| "epoch": 2.7576112412177984, | |
| "grad_norm": 0.22671233117580414, | |
| "learning_rate": 3.55090533091339e-07, | |
| "loss": 0.3143455028533936, | |
| "memory(GiB)": 127.52, | |
| "step": 2355, | |
| "token_acc": 0.896848520654861, | |
| "train_speed(iter/s)": 0.10072 | |
| }, | |
| { | |
| "epoch": 2.7634660421545667, | |
| "grad_norm": 0.21764405071735382, | |
| "learning_rate": 3.382397811641858e-07, | |
| "loss": 0.3072871208190918, | |
| "memory(GiB)": 127.52, | |
| "step": 2360, | |
| "token_acc": 0.8893455142073456, | |
| "train_speed(iter/s)": 0.100725 | |
| }, | |
| { | |
| "epoch": 2.769320843091335, | |
| "grad_norm": 0.22008980810642242, | |
| "learning_rate": 3.217917558066241e-07, | |
| "loss": 0.31331815719604494, | |
| "memory(GiB)": 127.52, | |
| "step": 2365, | |
| "token_acc": 0.8801702516246458, | |
| "train_speed(iter/s)": 0.100727 | |
| }, | |
| { | |
| "epoch": 2.775175644028103, | |
| "grad_norm": 0.2225882112979889, | |
| "learning_rate": 3.057471426138958e-07, | |
| "loss": 0.3275087833404541, | |
| "memory(GiB)": 127.52, | |
| "step": 2370, | |
| "token_acc": 0.8743533027834035, | |
| "train_speed(iter/s)": 0.100726 | |
| }, | |
| { | |
| "epoch": 2.781030444964871, | |
| "grad_norm": 0.22171831130981445, | |
| "learning_rate": 2.901066103660033e-07, | |
| "loss": 0.3129570484161377, | |
| "memory(GiB)": 127.52, | |
| "step": 2375, | |
| "token_acc": 0.8872727501597082, | |
| "train_speed(iter/s)": 0.100728 | |
| }, | |
| { | |
| "epoch": 2.7868852459016393, | |
| "grad_norm": 0.2355940192937851, | |
| "learning_rate": 2.7487081099983435e-07, | |
| "loss": 0.32728214263916017, | |
| "memory(GiB)": 127.52, | |
| "step": 2380, | |
| "token_acc": 0.882063511039243, | |
| "train_speed(iter/s)": 0.100731 | |
| }, | |
| { | |
| "epoch": 2.7927400468384076, | |
| "grad_norm": 0.21898697316646576, | |
| "learning_rate": 2.6004037958199167e-07, | |
| "loss": 0.31028578281402586, | |
| "memory(GiB)": 127.52, | |
| "step": 2385, | |
| "token_acc": 0.8959504867399893, | |
| "train_speed(iter/s)": 0.100732 | |
| }, | |
| { | |
| "epoch": 2.798594847775176, | |
| "grad_norm": 0.22940264642238617, | |
| "learning_rate": 2.4561593428231165e-07, | |
| "loss": 0.3168987274169922, | |
| "memory(GiB)": 127.52, | |
| "step": 2390, | |
| "token_acc": 0.9043824201593208, | |
| "train_speed(iter/s)": 0.100729 | |
| }, | |
| { | |
| "epoch": 2.8044496487119437, | |
| "grad_norm": 0.22128568589687347, | |
| "learning_rate": 2.3159807634811182e-07, | |
| "loss": 0.30646657943725586, | |
| "memory(GiB)": 127.52, | |
| "step": 2395, | |
| "token_acc": 0.890519620223563, | |
| "train_speed(iter/s)": 0.10073 | |
| }, | |
| { | |
| "epoch": 2.810304449648712, | |
| "grad_norm": 0.23035509884357452, | |
| "learning_rate": 2.1798739007911517e-07, | |
| "loss": 0.321412467956543, | |
| "memory(GiB)": 127.52, | |
| "step": 2400, | |
| "token_acc": 0.8813866834368367, | |
| "train_speed(iter/s)": 0.100729 | |
| }, | |
| { | |
| "epoch": 2.8161592505854802, | |
| "grad_norm": 0.22361230850219727, | |
| "learning_rate": 2.0478444280310206e-07, | |
| "loss": 0.314456582069397, | |
| "memory(GiB)": 127.52, | |
| "step": 2405, | |
| "token_acc": 0.8847936237191627, | |
| "train_speed(iter/s)": 0.100733 | |
| }, | |
| { | |
| "epoch": 2.822014051522248, | |
| "grad_norm": 0.248680979013443, | |
| "learning_rate": 1.919897848522656e-07, | |
| "loss": 0.31545486450195315, | |
| "memory(GiB)": 127.52, | |
| "step": 2410, | |
| "token_acc": 0.8842675175238047, | |
| "train_speed(iter/s)": 0.100732 | |
| }, | |
| { | |
| "epoch": 2.8278688524590163, | |
| "grad_norm": 0.2220403105020523, | |
| "learning_rate": 1.796039495402646e-07, | |
| "loss": 0.3194711923599243, | |
| "memory(GiB)": 127.52, | |
| "step": 2415, | |
| "token_acc": 0.889650254732648, | |
| "train_speed(iter/s)": 0.100731 | |
| }, | |
| { | |
| "epoch": 2.8337236533957846, | |
| "grad_norm": 0.23251083493232727, | |
| "learning_rate": 1.6762745313999795e-07, | |
| "loss": 0.32554826736450193, | |
| "memory(GiB)": 127.52, | |
| "step": 2420, | |
| "token_acc": 0.8688351785435834, | |
| "train_speed(iter/s)": 0.100728 | |
| }, | |
| { | |
| "epoch": 2.839578454332553, | |
| "grad_norm": 0.2339450716972351, | |
| "learning_rate": 1.5606079486208846e-07, | |
| "loss": 0.3137704372406006, | |
| "memory(GiB)": 127.52, | |
| "step": 2425, | |
| "token_acc": 0.8856111133651886, | |
| "train_speed(iter/s)": 0.100732 | |
| }, | |
| { | |
| "epoch": 2.845433255269321, | |
| "grad_norm": 0.22966544330120087, | |
| "learning_rate": 1.449044568340663e-07, | |
| "loss": 0.32210094928741456, | |
| "memory(GiB)": 127.52, | |
| "step": 2430, | |
| "token_acc": 0.8884470889772489, | |
| "train_speed(iter/s)": 0.100732 | |
| }, | |
| { | |
| "epoch": 2.851288056206089, | |
| "grad_norm": 0.24191494286060333, | |
| "learning_rate": 1.3415890408027932e-07, | |
| "loss": 0.31206402778625486, | |
| "memory(GiB)": 127.52, | |
| "step": 2435, | |
| "token_acc": 0.8830502196115786, | |
| "train_speed(iter/s)": 0.100731 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.23956511914730072, | |
| "learning_rate": 1.2382458450250657e-07, | |
| "loss": 0.32455346584320066, | |
| "memory(GiB)": 127.52, | |
| "step": 2440, | |
| "token_acc": 0.8758227950966726, | |
| "train_speed(iter/s)": 0.100735 | |
| }, | |
| { | |
| "epoch": 2.8629976580796255, | |
| "grad_norm": 0.22552776336669922, | |
| "learning_rate": 1.1390192886129304e-07, | |
| "loss": 0.3120935678482056, | |
| "memory(GiB)": 127.52, | |
| "step": 2445, | |
| "token_acc": 0.897060631760815, | |
| "train_speed(iter/s)": 0.100735 | |
| }, | |
| { | |
| "epoch": 2.8688524590163933, | |
| "grad_norm": 0.2666381001472473, | |
| "learning_rate": 1.0439135075798634e-07, | |
| "loss": 0.3291801452636719, | |
| "memory(GiB)": 127.52, | |
| "step": 2450, | |
| "token_acc": 0.8820067150139295, | |
| "train_speed(iter/s)": 0.100741 | |
| }, | |
| { | |
| "epoch": 2.8747072599531616, | |
| "grad_norm": 0.22115741670131683, | |
| "learning_rate": 9.529324661750494e-08, | |
| "loss": 0.32175321578979493, | |
| "memory(GiB)": 127.52, | |
| "step": 2455, | |
| "token_acc": 0.8775227487104135, | |
| "train_speed(iter/s)": 0.100739 | |
| }, | |
| { | |
| "epoch": 2.88056206088993, | |
| "grad_norm": 0.22983959317207336, | |
| "learning_rate": 8.6607995671808e-08, | |
| "loss": 0.31844320297241213, | |
| "memory(GiB)": 127.52, | |
| "step": 2460, | |
| "token_acc": 0.8813101879265747, | |
| "train_speed(iter/s)": 0.10074 | |
| }, | |
| { | |
| "epoch": 2.8864168618266977, | |
| "grad_norm": 0.23733210563659668, | |
| "learning_rate": 7.833595994409248e-08, | |
| "loss": 0.3080190658569336, | |
| "memory(GiB)": 127.52, | |
| "step": 2465, | |
| "token_acc": 0.88289333750391, | |
| "train_speed(iter/s)": 0.100738 | |
| }, | |
| { | |
| "epoch": 2.892271662763466, | |
| "grad_norm": 0.24082650244235992, | |
| "learning_rate": 7.047748423370193e-08, | |
| "loss": 0.3234051465988159, | |
| "memory(GiB)": 127.52, | |
| "step": 2470, | |
| "token_acc": 0.8791906373996674, | |
| "train_speed(iter/s)": 0.100744 | |
| }, | |
| { | |
| "epoch": 2.898126463700234, | |
| "grad_norm": 0.24151204526424408, | |
| "learning_rate": 6.303289610175233e-08, | |
| "loss": 0.31094648838043215, | |
| "memory(GiB)": 127.52, | |
| "step": 2475, | |
| "token_acc": 0.8864608150470219, | |
| "train_speed(iter/s)": 0.100743 | |
| }, | |
| { | |
| "epoch": 2.9039812646370025, | |
| "grad_norm": 0.23166167736053467, | |
| "learning_rate": 5.6002505857480906e-08, | |
| "loss": 0.3175530910491943, | |
| "memory(GiB)": 127.52, | |
| "step": 2480, | |
| "token_acc": 0.8859342832291451, | |
| "train_speed(iter/s)": 0.100739 | |
| }, | |
| { | |
| "epoch": 2.9098360655737707, | |
| "grad_norm": 0.22753314673900604, | |
| "learning_rate": 4.938660654530969e-08, | |
| "loss": 0.3289816379547119, | |
| "memory(GiB)": 127.52, | |
| "step": 2485, | |
| "token_acc": 0.8799638876393262, | |
| "train_speed(iter/s)": 0.100739 | |
| }, | |
| { | |
| "epoch": 2.9156908665105385, | |
| "grad_norm": 0.22824768722057343, | |
| "learning_rate": 4.318547393263317e-08, | |
| "loss": 0.33161611557006837, | |
| "memory(GiB)": 127.52, | |
| "step": 2490, | |
| "token_acc": 0.8840203211591419, | |
| "train_speed(iter/s)": 0.100737 | |
| }, | |
| { | |
| "epoch": 2.921545667447307, | |
| "grad_norm": 0.2232208400964737, | |
| "learning_rate": 3.739936649832188e-08, | |
| "loss": 0.31346931457519533, | |
| "memory(GiB)": 127.52, | |
| "step": 2495, | |
| "token_acc": 0.8866209251707488, | |
| "train_speed(iter/s)": 0.100742 | |
| }, | |
| { | |
| "epoch": 2.927400468384075, | |
| "grad_norm": 0.22846031188964844, | |
| "learning_rate": 3.2028525421946563e-08, | |
| "loss": 0.31502933502197267, | |
| "memory(GiB)": 127.52, | |
| "step": 2500, | |
| "token_acc": 0.8958872772065662, | |
| "train_speed(iter/s)": 0.100746 | |
| }, | |
| { | |
| "epoch": 2.933255269320843, | |
| "grad_norm": 0.22012905776500702, | |
| "learning_rate": 2.70731745737296e-08, | |
| "loss": 0.317963695526123, | |
| "memory(GiB)": 127.52, | |
| "step": 2505, | |
| "token_acc": 0.8870393801646438, | |
| "train_speed(iter/s)": 0.100749 | |
| }, | |
| { | |
| "epoch": 2.939110070257611, | |
| "grad_norm": 0.22778548300266266, | |
| "learning_rate": 2.2533520505211294e-08, | |
| "loss": 0.3122371196746826, | |
| "memory(GiB)": 127.52, | |
| "step": 2510, | |
| "token_acc": 0.888907967032967, | |
| "train_speed(iter/s)": 0.100751 | |
| }, | |
| { | |
| "epoch": 2.9449648711943794, | |
| "grad_norm": 0.22804217040538788, | |
| "learning_rate": 1.8409752440639027e-08, | |
| "loss": 0.3041959524154663, | |
| "memory(GiB)": 127.52, | |
| "step": 2515, | |
| "token_acc": 0.8861121607989981, | |
| "train_speed(iter/s)": 0.100754 | |
| }, | |
| { | |
| "epoch": 2.9508196721311473, | |
| "grad_norm": 0.2233329713344574, | |
| "learning_rate": 1.470204226908134e-08, | |
| "loss": 0.32151806354522705, | |
| "memory(GiB)": 127.52, | |
| "step": 2520, | |
| "token_acc": 0.8879425846286458, | |
| "train_speed(iter/s)": 0.100749 | |
| }, | |
| { | |
| "epoch": 2.9566744730679155, | |
| "grad_norm": 0.24781863391399384, | |
| "learning_rate": 1.1410544537263645e-08, | |
| "loss": 0.32978765964508056, | |
| "memory(GiB)": 127.52, | |
| "step": 2525, | |
| "token_acc": 0.8869459116971757, | |
| "train_speed(iter/s)": 0.100749 | |
| }, | |
| { | |
| "epoch": 2.962529274004684, | |
| "grad_norm": 0.22210603952407837, | |
| "learning_rate": 8.535396443124511e-09, | |
| "loss": 0.30834412574768066, | |
| "memory(GiB)": 127.52, | |
| "step": 2530, | |
| "token_acc": 0.8843790902885199, | |
| "train_speed(iter/s)": 0.100751 | |
| }, | |
| { | |
| "epoch": 2.968384074941452, | |
| "grad_norm": 0.22260542213916779, | |
| "learning_rate": 6.076717830098e-09, | |
| "loss": 0.31018791198730467, | |
| "memory(GiB)": 127.52, | |
| "step": 2535, | |
| "token_acc": 0.8947010997127103, | |
| "train_speed(iter/s)": 0.10075 | |
| }, | |
| { | |
| "epoch": 2.9742388758782203, | |
| "grad_norm": 0.24026013910770416, | |
| "learning_rate": 4.034611182121007e-09, | |
| "loss": 0.3117814064025879, | |
| "memory(GiB)": 127.52, | |
| "step": 2540, | |
| "token_acc": 0.8939134081534292, | |
| "train_speed(iter/s)": 0.100749 | |
| }, | |
| { | |
| "epoch": 2.980093676814988, | |
| "grad_norm": 0.22812722623348236, | |
| "learning_rate": 2.40916161935445e-09, | |
| "loss": 0.31728358268737794, | |
| "memory(GiB)": 127.52, | |
| "step": 2545, | |
| "token_acc": 0.883892058363205, | |
| "train_speed(iter/s)": 0.10075 | |
| }, | |
| { | |
| "epoch": 2.9859484777517564, | |
| "grad_norm": 0.2219596952199936, | |
| "learning_rate": 1.2004368946427758e-09, | |
| "loss": 0.31175081729888915, | |
| "memory(GiB)": 127.52, | |
| "step": 2550, | |
| "token_acc": 0.8867498701584854, | |
| "train_speed(iter/s)": 0.100752 | |
| }, | |
| { | |
| "epoch": 2.9918032786885247, | |
| "grad_norm": 0.22541016340255737, | |
| "learning_rate": 4.084873906851083e-10, | |
| "loss": 0.31843390464782717, | |
| "memory(GiB)": 127.52, | |
| "step": 2555, | |
| "token_acc": 0.893655570084918, | |
| "train_speed(iter/s)": 0.10075 | |
| }, | |
| { | |
| "epoch": 2.9976580796252925, | |
| "grad_norm": 0.22078001499176025, | |
| "learning_rate": 3.334611793692766e-11, | |
| "loss": 0.31821532249450685, | |
| "memory(GiB)": 127.52, | |
| "step": 2560, | |
| "token_acc": 0.8979642133800124, | |
| "train_speed(iter/s)": 0.100751 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2562, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1.0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1575512474484736.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |